path: root/fs
Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/v9fs_vfs.h | 2
-rw-r--r--  fs/9p/vfs_dir.c | 8
-rw-r--r--  fs/9p/vfs_file.c | 11
-rw-r--r--  fs/9p/vfs_inode.c | 111
-rw-r--r--  fs/9p/vfs_super.c | 55
-rw-r--r--  fs/Makefile | 2
-rw-r--r--  fs/afs/dir.c | 6
-rw-r--r--  fs/afs/file.c | 64
-rw-r--r--  fs/afs/internal.h | 1
-rw-r--r--  fs/afs/mntpt.c | 6
-rw-r--r--  fs/anon_inodes.c | 2
-rw-r--r--  fs/autofs4/root.c | 22
-rw-r--r--  fs/bfs/dir.c | 4
-rw-r--r--  fs/block_dev.c | 330
-rw-r--r--  fs/btrfs/acl.c | 4
-rw-r--r--  fs/btrfs/extent-tree.c | 2
-rw-r--r--  fs/btrfs/inode.c | 11
-rw-r--r--  fs/btrfs/xattr.c | 2
-rw-r--r--  fs/btrfs/xattr.h | 6
-rw-r--r--  fs/buffer.c | 26
-rw-r--r--  fs/ceph/addr.c | 11
-rw-r--r--  fs/ceph/auth.c | 9
-rw-r--r--  fs/ceph/auth.h | 2
-rw-r--r--  fs/ceph/auth_none.c | 1
-rw-r--r--  fs/ceph/auth_x.c | 19
-rw-r--r--  fs/ceph/caps.c | 24
-rw-r--r--  fs/ceph/ceph_fs.h | 62
-rw-r--r--  fs/ceph/ceph_strings.c | 16
-rw-r--r--  fs/ceph/debugfs.c | 13
-rw-r--r--  fs/ceph/dir.c | 45
-rw-r--r--  fs/ceph/export.c | 14
-rw-r--r--  fs/ceph/file.c | 19
-rw-r--r--  fs/ceph/inode.c | 97
-rw-r--r--  fs/ceph/ioctl.c | 2
-rw-r--r--  fs/ceph/mds_client.c | 385
-rw-r--r--  fs/ceph/mds_client.h | 6
-rw-r--r--  fs/ceph/messenger.c | 91
-rw-r--r--  fs/ceph/messenger.h | 10
-rw-r--r--  fs/ceph/mon_client.c | 257
-rw-r--r--  fs/ceph/mon_client.h | 27
-rw-r--r--  fs/ceph/msgpool.c | 180
-rw-r--r--  fs/ceph/msgpool.h | 12
-rw-r--r--  fs/ceph/msgr.h | 21
-rw-r--r--  fs/ceph/osd_client.c | 98
-rw-r--r--  fs/ceph/pagelist.c | 2
-rw-r--r--  fs/ceph/rados.h | 23
-rw-r--r--  fs/ceph/snap.c | 2
-rw-r--r--  fs/ceph/super.c | 128
-rw-r--r--  fs/ceph/super.h | 30
-rw-r--r--  fs/ceph/xattr.c | 35
-rw-r--r--  fs/coda/file.c | 2
-rw-r--r--  fs/coda/pioctl.c | 76
-rw-r--r--  fs/coda/psdev.c | 5
-rw-r--r--  fs/dcache.c | 20
-rw-r--r--  fs/devpts/inode.c | 9
-rw-r--r--  fs/dlm/lock.c | 5
-rw-r--r--  fs/dlm/user.c | 88
-rw-r--r--  fs/drop_caches.c | 24
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h | 5
-rw-r--r--  fs/ecryptfs/file.c | 4
-rw-r--r--  fs/ecryptfs/inode.c | 48
-rw-r--r--  fs/ecryptfs/main.c | 166
-rw-r--r--  fs/ecryptfs/mmap.c | 19
-rw-r--r--  fs/ecryptfs/read_write.c | 13
-rw-r--r--  fs/ecryptfs/super.c | 22
-rw-r--r--  fs/exec.c | 7
-rw-r--r--  fs/exofs/dir.c | 2
-rw-r--r--  fs/exofs/inode.c | 41
-rw-r--r--  fs/ext2/acl.c | 4
-rw-r--r--  fs/ext2/balloc.c | 6
-rw-r--r--  fs/ext2/ialloc.c | 21
-rw-r--r--  fs/ext2/inode.c | 7
-rw-r--r--  fs/ext2/super.c | 99
-rw-r--r--  fs/ext2/xattr.c | 12
-rw-r--r--  fs/ext2/xattr.h | 12
-rw-r--r--  fs/ext2/xattr_security.c | 2
-rw-r--r--  fs/ext2/xattr_trusted.c | 2
-rw-r--r--  fs/ext2/xattr_user.c | 2
-rw-r--r--  fs/ext3/acl.c | 4
-rw-r--r--  fs/ext3/balloc.c | 6
-rw-r--r--  fs/ext3/fsync.c | 23
-rw-r--r--  fs/ext3/ialloc.c | 13
-rw-r--r--  fs/ext3/inode.c | 2
-rw-r--r--  fs/ext3/super.c | 77
-rw-r--r--  fs/ext3/xattr.c | 10
-rw-r--r--  fs/ext3/xattr.h | 12
-rw-r--r--  fs/ext3/xattr_security.c | 2
-rw-r--r--  fs/ext3/xattr_trusted.c | 2
-rw-r--r--  fs/ext3/xattr_user.c | 2
-rw-r--r--  fs/ext4/acl.c | 4
-rw-r--r--  fs/ext4/fsync.c | 6
-rw-r--r--  fs/ext4/ialloc.c | 12
-rw-r--r--  fs/ext4/inode.c | 2
-rw-r--r--  fs/ext4/xattr.c | 10
-rw-r--r--  fs/ext4/xattr.h | 12
-rw-r--r--  fs/ext4/xattr_security.c | 2
-rw-r--r--  fs/ext4/xattr_trusted.c | 2
-rw-r--r--  fs/ext4/xattr_user.c | 2
-rw-r--r--  fs/fat/cache.c | 13
-rw-r--r--  fs/fat/dir.c | 28
-rw-r--r--  fs/fat/fat.h | 16
-rw-r--r--  fs/fat/file.c | 19
-rw-r--r--  fs/fat/inode.c | 8
-rw-r--r--  fs/fat/misc.c | 22
-rw-r--r--  fs/fcntl.c | 71
-rw-r--r--  fs/fs-writeback.c | 108
-rw-r--r--  fs/generic_acl.c | 4
-rw-r--r--  fs/gfs2/acl.c | 6
-rw-r--r--  fs/gfs2/acl.h | 2
-rw-r--r--  fs/gfs2/aops.c | 8
-rw-r--r--  fs/gfs2/bmap.c | 17
-rw-r--r--  fs/gfs2/dir.c | 2
-rw-r--r--  fs/gfs2/export.c | 2
-rw-r--r--  fs/gfs2/file.c | 7
-rw-r--r--  fs/gfs2/glock.c | 3
-rw-r--r--  fs/gfs2/incore.h | 11
-rw-r--r--  fs/gfs2/inode.c | 103
-rw-r--r--  fs/gfs2/inode.h | 4
-rw-r--r--  fs/gfs2/log.c | 160
-rw-r--r--  fs/gfs2/log.h | 30
-rw-r--r--  fs/gfs2/lops.c | 2
-rw-r--r--  fs/gfs2/main.c | 2
-rw-r--r--  fs/gfs2/meta_io.c | 5
-rw-r--r--  fs/gfs2/ops_fstype.c | 19
-rw-r--r--  fs/gfs2/quota.c | 114
-rw-r--r--  fs/gfs2/rgrp.c | 81
-rw-r--r--  fs/gfs2/super.c | 11
-rw-r--r--  fs/gfs2/super.h | 2
-rw-r--r--  fs/gfs2/sys.c | 6
-rw-r--r--  fs/gfs2/trans.c | 18
-rw-r--r--  fs/gfs2/xattr.c | 6
-rw-r--r--  fs/hfsplus/dir.c | 2
-rw-r--r--  fs/hfsplus/hfsplus_fs.h | 3
-rw-r--r--  fs/hfsplus/inode.c | 2
-rw-r--r--  fs/hfsplus/ioctl.c | 12
-rw-r--r--  fs/inode.c | 26
-rw-r--r--  fs/internal.h | 2
-rw-r--r--  fs/ioctl.c | 15
-rw-r--r--  fs/jbd/commit.c | 8
-rw-r--r--  fs/jbd/journal.c | 33
-rw-r--r--  fs/jbd2/checkpoint.c | 3
-rw-r--r--  fs/jbd2/commit.c | 6
-rw-r--r--  fs/jffs2/acl.c | 4
-rw-r--r--  fs/jffs2/acl.h | 4
-rw-r--r--  fs/jffs2/background.c | 3
-rw-r--r--  fs/jffs2/erase.c | 12
-rw-r--r--  fs/jffs2/fs.c | 10
-rw-r--r--  fs/jffs2/gc.c | 17
-rw-r--r--  fs/jffs2/nodelist.h | 10
-rw-r--r--  fs/jffs2/nodemgmt.c | 28
-rw-r--r--  fs/jffs2/os-linux.h | 3
-rw-r--r--  fs/jffs2/scan.c | 4
-rw-r--r--  fs/jffs2/security.c | 2
-rw-r--r--  fs/jffs2/super.c | 2
-rw-r--r--  fs/jffs2/wbuf.c | 8
-rw-r--r--  fs/jffs2/xattr.c | 8
-rw-r--r--  fs/jffs2/xattr.h | 8
-rw-r--r--  fs/jffs2/xattr_trusted.c | 2
-rw-r--r--  fs/jffs2/xattr_user.c | 2
-rw-r--r--  fs/jfs/file.c | 2
-rw-r--r--  fs/jfs/jfs_inode.c | 12
-rw-r--r--  fs/logfs/inode.c | 9
-rw-r--r--  fs/minix/bitmap.c | 5
-rw-r--r--  fs/minix/minix.h | 2
-rw-r--r--  fs/minix/namei.c | 11
-rw-r--r--  fs/namei.c | 5
-rw-r--r--  fs/ncpfs/dir.c | 2
-rw-r--r--  fs/ncpfs/file.c | 2
-rw-r--r--  fs/ncpfs/ioctl.c | 27
-rw-r--r--  fs/nfs/super.c | 4
-rw-r--r--  fs/nfsd/nfs4recover.c | 87
-rw-r--r--  fs/nfsd/nfsctl.c | 4
-rw-r--r--  fs/nfsd/vfs.c | 5
-rw-r--r--  fs/nilfs2/alloc.c | 154
-rw-r--r--  fs/nilfs2/alloc.h | 7
-rw-r--r--  fs/nilfs2/btree.c | 91
-rw-r--r--  fs/nilfs2/btree.h | 23
-rw-r--r--  fs/nilfs2/inode.c | 15
-rw-r--r--  fs/nilfs2/recovery.c | 2
-rw-r--r--  fs/nilfs2/segbuf.c | 70
-rw-r--r--  fs/nilfs2/segbuf.h | 10
-rw-r--r--  fs/nilfs2/segment.c | 157
-rw-r--r--  fs/nilfs2/segment.h | 6
-rw-r--r--  fs/nilfs2/super.c | 218
-rw-r--r--  fs/nilfs2/the_nilfs.c | 14
-rw-r--r--  fs/notify/inotify/inotify.c | 88
-rw-r--r--  fs/ntfs/file.c | 28
-rw-r--r--  fs/ocfs2/Makefile | 1
-rw-r--r--  fs/ocfs2/acl.c | 4
-rw-r--r--  fs/ocfs2/alloc.c | 908
-rw-r--r--  fs/ocfs2/alloc.h | 12
-rw-r--r--  fs/ocfs2/aops.c | 3
-rw-r--r--  fs/ocfs2/blockcheck.c | 4
-rw-r--r--  fs/ocfs2/cluster/masklog.c | 1
-rw-r--r--  fs/ocfs2/cluster/masklog.h | 1
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 3
-rw-r--r--  fs/ocfs2/dir.c | 75
-rw-r--r--  fs/ocfs2/dlm/dlmast.c | 8
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h | 4
-rw-r--r--  fs/ocfs2/dlm/dlmconvert.c | 4
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 28
-rw-r--r--  fs/ocfs2/dlm/dlmlock.c | 6
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 30
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 27
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 16
-rw-r--r--  fs/ocfs2/dlm/dlmunlock.c | 3
-rw-r--r--  fs/ocfs2/dlmglue.c | 3
-rw-r--r--  fs/ocfs2/file.c | 236
-rw-r--r--  fs/ocfs2/inode.c | 45
-rw-r--r--  fs/ocfs2/inode.h | 2
-rw-r--r--  fs/ocfs2/journal.c | 26
-rw-r--r--  fs/ocfs2/journal.h | 15
-rw-r--r--  fs/ocfs2/localalloc.c | 275
-rw-r--r--  fs/ocfs2/localalloc.h | 3
-rw-r--r--  fs/ocfs2/mmap.c | 48
-rw-r--r--  fs/ocfs2/namei.c | 100
-rw-r--r--  fs/ocfs2/ocfs2.h | 22
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 144
-rw-r--r--  fs/ocfs2/quota.h | 12
-rw-r--r--  fs/ocfs2/quota_global.c | 351
-rw-r--r--  fs/ocfs2/quota_local.c | 183
-rw-r--r--  fs/ocfs2/refcounttree.c | 74
-rw-r--r--  fs/ocfs2/refcounttree.h | 4
-rw-r--r--  fs/ocfs2/reservations.c | 847
-rw-r--r--  fs/ocfs2/reservations.h | 159
-rw-r--r--  fs/ocfs2/resize.c | 19
-rw-r--r--  fs/ocfs2/suballoc.c | 688
-rw-r--r--  fs/ocfs2/suballoc.h | 21
-rw-r--r--  fs/ocfs2/super.c | 92
-rw-r--r--  fs/ocfs2/super.h | 7
-rw-r--r--  fs/ocfs2/xattr.c | 115
-rw-r--r--  fs/ocfs2/xattr.h | 12
-rw-r--r--  fs/omfs/inode.c | 4
-rw-r--r--  fs/open.c | 166
-rw-r--r--  fs/partitions/acorn.c | 68
-rw-r--r--  fs/partitions/acorn.h | 10
-rw-r--r--  fs/partitions/amiga.c | 13
-rw-r--r--  fs/partitions/amiga.h | 2
-rw-r--r--  fs/partitions/atari.c | 8
-rw-r--r--  fs/partitions/atari.h | 2
-rw-r--r--  fs/partitions/check.c | 84
-rw-r--r--  fs/partitions/check.h | 12
-rw-r--r--  fs/partitions/efi.c | 93
-rw-r--r--  fs/partitions/efi.h | 2
-rw-r--r--  fs/partitions/ibm.c | 21
-rw-r--r--  fs/partitions/ibm.h | 2
-rw-r--r--  fs/partitions/karma.c | 4
-rw-r--r--  fs/partitions/karma.h | 2
-rw-r--r--  fs/partitions/ldm.c | 107
-rw-r--r--  fs/partitions/ldm.h | 2
-rw-r--r--  fs/partitions/mac.c | 13
-rw-r--r--  fs/partitions/mac.h | 2
-rw-r--r--  fs/partitions/msdos.c | 87
-rw-r--r--  fs/partitions/msdos.h | 2
-rw-r--r--  fs/partitions/osf.c | 4
-rw-r--r--  fs/partitions/osf.h | 2
-rw-r--r--  fs/partitions/sgi.c | 6
-rw-r--r--  fs/partitions/sgi.h | 2
-rw-r--r--  fs/partitions/sun.c | 6
-rw-r--r--  fs/partitions/sun.h | 2
-rw-r--r--  fs/partitions/sysv68.c | 6
-rw-r--r--  fs/partitions/sysv68.h | 2
-rw-r--r--  fs/partitions/ultrix.c | 4
-rw-r--r--  fs/partitions/ultrix.h | 2
-rw-r--r--  fs/pipe.c | 122
-rw-r--r--  fs/proc/task_mmu.c | 4
-rw-r--r--  fs/quota/dquot.c | 275
-rw-r--r--  fs/quota/quota.c | 95
-rw-r--r--  fs/quota/quota_tree.c | 50
-rw-r--r--  fs/quota/quota_tree.h | 6
-rw-r--r--  fs/quota/quota_v1.c | 4
-rw-r--r--  fs/quota/quota_v2.c | 6
-rw-r--r--  fs/ramfs/inode.c | 22
-rw-r--r--  fs/reiserfs/file.c | 3
-rw-r--r--  fs/reiserfs/inode.c | 3
-rw-r--r--  fs/reiserfs/namei.c | 18
-rw-r--r--  fs/reiserfs/xattr.c | 16
-rw-r--r--  fs/reiserfs/xattr_acl.c | 4
-rw-r--r--  fs/reiserfs/xattr_security.c | 2
-rw-r--r--  fs/reiserfs/xattr_trusted.c | 2
-rw-r--r--  fs/reiserfs/xattr_user.c | 2
-rw-r--r--  fs/smbfs/dir.c | 2
-rw-r--r--  fs/smbfs/file.c | 2
-rw-r--r--  fs/smbfs/ioctl.c | 10
-rw-r--r--  fs/smbfs/proto.h | 2
-rw-r--r--  fs/smbfs/symlink.c | 1
-rw-r--r--  fs/splice.c | 151
-rw-r--r--  fs/statfs.c | 196
-rw-r--r--  fs/super.c | 321
-rw-r--r--  fs/sync.c | 88
-rw-r--r--  fs/sysfs/bin.c | 26
-rw-r--r--  fs/sysfs/dir.c | 114
-rw-r--r--  fs/sysfs/file.c | 17
-rw-r--r--  fs/sysfs/group.c | 6
-rw-r--r--  fs/sysfs/inode.c | 6
-rw-r--r--  fs/sysfs/mount.c | 95
-rw-r--r--  fs/sysfs/symlink.c | 36
-rw-r--r--  fs/sysfs/sysfs.h | 34
-rw-r--r--  fs/sysv/ialloc.c | 11
-rw-r--r--  fs/timerfd.c | 25
-rw-r--r--  fs/ubifs/dir.c | 9
-rw-r--r--  fs/ubifs/io.c | 1
-rw-r--r--  fs/udf/dir.c | 2
-rw-r--r--  fs/udf/file.c | 45
-rw-r--r--  fs/udf/ialloc.c | 11
-rw-r--r--  fs/udf/namei.c | 10
-rw-r--r--  fs/udf/udfdecl.h | 3
-rw-r--r--  fs/ufs/ialloc.c | 10
-rw-r--r--  fs/ufs/inode.c | 2
-rw-r--r--  fs/ufs/namei.c | 2
-rw-r--r--  fs/ufs/symlink.c | 8
-rw-r--r--  fs/ufs/truncate.c | 10
-rw-r--r--  fs/ufs/ufs.h | 2
-rw-r--r--  fs/xattr.c | 14
-rw-r--r--  fs/xfs/Makefile | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_acl.c | 4
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 231
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 36
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.h | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c | 4
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl32.c | 4
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c | 5
-rw-r--r--  fs/xfs/linux-2.6/xfs_quotaops.c | 9
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 25
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.h | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c | 91
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.c | 4
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.h | 233
-rw-r--r--  fs/xfs/linux-2.6/xfs_xattr.c | 8
-rw-r--r--  fs/xfs/quota/xfs_dquot.c | 199
-rw-r--r--  fs/xfs/quota/xfs_dquot.h | 35
-rw-r--r--  fs/xfs/quota/xfs_dquot_item.c | 30
-rw-r--r--  fs/xfs/quota/xfs_qm.c | 609
-rw-r--r--  fs/xfs/quota/xfs_qm.h | 23
-rw-r--r--  fs/xfs/quota/xfs_qm_stats.c | 2
-rw-r--r--  fs/xfs/quota/xfs_qm_syscalls.c | 162
-rw-r--r--  fs/xfs/quota/xfs_quota_priv.h | 102
-rw-r--r--  fs/xfs/quota/xfs_trans_dquot.c | 29
-rw-r--r--  fs/xfs/xfs_acl.h | 4
-rw-r--r--  fs/xfs/xfs_ag.h | 24
-rw-r--r--  fs/xfs/xfs_alloc.c | 357
-rw-r--r--  fs/xfs/xfs_alloc.h | 7
-rw-r--r--  fs/xfs/xfs_alloc_btree.c | 2
-rw-r--r--  fs/xfs/xfs_bmap.c | 2
-rw-r--r--  fs/xfs/xfs_buf_item.c | 221
-rw-r--r--  fs/xfs/xfs_buf_item.h | 20
-rw-r--r--  fs/xfs/xfs_error.c | 32
-rw-r--r--  fs/xfs/xfs_error.h | 9
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 18
-rw-r--r--  fs/xfs/xfs_inode.c | 2
-rw-r--r--  fs/xfs/xfs_inode_item.c | 21
-rw-r--r--  fs/xfs/xfs_iomap.c | 123
-rw-r--r--  fs/xfs/xfs_iomap.h | 47
-rw-r--r--  fs/xfs/xfs_log.c | 796
-rw-r--r--  fs/xfs/xfs_log.h | 27
-rw-r--r--  fs/xfs/xfs_log_cil.c | 725
-rw-r--r--  fs/xfs/xfs_log_priv.h | 130
-rw-r--r--  fs/xfs/xfs_log_recover.c | 355
-rw-r--r--  fs/xfs/xfs_log_recover.h | 2
-rw-r--r--  fs/xfs/xfs_mount.c | 7
-rw-r--r--  fs/xfs/xfs_mount.h | 1
-rw-r--r--  fs/xfs/xfs_quota.h | 3
-rw-r--r--  fs/xfs/xfs_trans.c | 810
-rw-r--r--  fs/xfs/xfs_trans.h | 58
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 233
-rw-r--r--  fs/xfs/xfs_trans_item.c | 114
-rw-r--r--  fs/xfs/xfs_trans_priv.h | 15
-rw-r--r--  fs/xfs/xfs_types.h | 2
369 files changed, 10972 insertions, 7789 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index ed835836e0dc..32ef4009d030 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -40,7 +40,9 @@
 extern struct file_system_type v9fs_fs_type;
 extern const struct address_space_operations v9fs_addr_operations;
 extern const struct file_operations v9fs_file_operations;
+extern const struct file_operations v9fs_file_operations_dotl;
 extern const struct file_operations v9fs_dir_operations;
+extern const struct file_operations v9fs_dir_operations_dotl;
 extern const struct dentry_operations v9fs_dentry_operations;
 extern const struct dentry_operations v9fs_cached_dentry_operations;
 
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 0adfd64dfcee..d61e3b28ce37 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -203,3 +203,11 @@ const struct file_operations v9fs_dir_operations = {
 	.open = v9fs_file_open,
 	.release = v9fs_dir_release,
 };
+
+const struct file_operations v9fs_dir_operations_dotl = {
+	.read = generic_read_dir,
+	.llseek = generic_file_llseek,
+	.readdir = v9fs_dir_readdir,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+};
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index df52d488d2a6..25b300e1c9d7 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -296,3 +296,14 @@ const struct file_operations v9fs_file_operations = {
 	.mmap = generic_file_readonly_mmap,
 	.fsync = v9fs_file_fsync,
 };
+
+const struct file_operations v9fs_file_operations_dotl = {
+	.llseek = generic_file_llseek,
+	.read = v9fs_file_read,
+	.write = v9fs_file_write,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+	.lock = v9fs_file_lock,
+	.mmap = generic_file_readonly_mmap,
+	.fsync = v9fs_file_fsync,
+};
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index f2434fc9d2c4..4331b3b5ee1c 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -44,9 +44,12 @@
 #include "cache.h"
 
 static const struct inode_operations v9fs_dir_inode_operations;
-static const struct inode_operations v9fs_dir_inode_operations_ext;
+static const struct inode_operations v9fs_dir_inode_operations_dotu;
+static const struct inode_operations v9fs_dir_inode_operations_dotl;
 static const struct inode_operations v9fs_file_inode_operations;
+static const struct inode_operations v9fs_file_inode_operations_dotl;
 static const struct inode_operations v9fs_symlink_inode_operations;
+static const struct inode_operations v9fs_symlink_inode_operations_dotl;
 
 /**
  * unixmode2p9mode - convert unix mode bits to plan 9
@@ -253,9 +256,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 		return ERR_PTR(-ENOMEM);
 	}
 
-	inode->i_mode = mode;
-	inode->i_uid = current_fsuid();
-	inode->i_gid = current_fsgid();
+	inode_init_owner(inode, NULL, mode);
 	inode->i_blocks = 0;
 	inode->i_rdev = 0;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -275,25 +276,44 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 		init_special_inode(inode, inode->i_mode, inode->i_rdev);
 		break;
 	case S_IFREG:
-		inode->i_op = &v9fs_file_inode_operations;
-		inode->i_fop = &v9fs_file_operations;
+		if (v9fs_proto_dotl(v9ses)) {
+			inode->i_op = &v9fs_file_inode_operations_dotl;
+			inode->i_fop = &v9fs_file_operations_dotl;
+		} else {
+			inode->i_op = &v9fs_file_inode_operations;
+			inode->i_fop = &v9fs_file_operations;
+		}
+
 		break;
+
 	case S_IFLNK:
-		if (!v9fs_proto_dotu(v9ses)) {
-			P9_DPRINTK(P9_DEBUG_ERROR,
-				   "extended modes used w/o 9P2000.u\n");
+		if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) {
+			P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with "
+						"legacy protocol.\n");
 			err = -EINVAL;
 			goto error;
 		}
-		inode->i_op = &v9fs_symlink_inode_operations;
+
+		if (v9fs_proto_dotl(v9ses))
+			inode->i_op = &v9fs_symlink_inode_operations_dotl;
+		else
+			inode->i_op = &v9fs_symlink_inode_operations;
+
 		break;
 	case S_IFDIR:
 		inc_nlink(inode);
-		if (v9fs_proto_dotu(v9ses))
-			inode->i_op = &v9fs_dir_inode_operations_ext;
+		if (v9fs_proto_dotl(v9ses))
+			inode->i_op = &v9fs_dir_inode_operations_dotl;
+		else if (v9fs_proto_dotu(v9ses))
+			inode->i_op = &v9fs_dir_inode_operations_dotu;
 		else
 			inode->i_op = &v9fs_dir_inode_operations;
-		inode->i_fop = &v9fs_dir_operations;
+
+		if (v9fs_proto_dotl(v9ses))
+			inode->i_fop = &v9fs_dir_operations_dotl;
+		else
+			inode->i_fop = &v9fs_dir_operations;
+
 		break;
 	default:
 		P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n",
@@ -434,14 +454,12 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
 {
 	int retval;
 	struct inode *file_inode;
-	struct v9fs_session_info *v9ses;
 	struct p9_fid *v9fid;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file,
 		  rmdir);
 
 	file_inode = file->d_inode;
-	v9ses = v9fs_inode2v9ses(file_inode);
 	v9fid = v9fs_fid_clone(file);
 	if (IS_ERR(v9fid))
 		return PTR_ERR(v9fid);
@@ -484,12 +502,11 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	ofid = NULL;
 	fid = NULL;
 	name = (char *) dentry->d_name.name;
-	dfid = v9fs_fid_clone(dentry->d_parent);
+	dfid = v9fs_fid_lookup(dentry->d_parent);
 	if (IS_ERR(dfid)) {
 		err = PTR_ERR(dfid);
-		P9_DPRINTK(P9_DEBUG_VFS, "fid clone failed %d\n", err);
-		dfid = NULL;
-		goto error;
+		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+		return ERR_PTR(err);
 	}
 
 	/* clone a fid to use for creation */
@@ -497,8 +514,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	if (IS_ERR(ofid)) {
 		err = PTR_ERR(ofid);
 		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
-		ofid = NULL;
-		goto error;
+		return ERR_PTR(err);
 	}
 
 	err = p9_client_fcreate(ofid, name, perm, mode, extension);
@@ -508,14 +524,13 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	}
 
 	/* now walk from the parent so we can get unopened fid */
-	fid = p9_client_walk(dfid, 1, &name, 0);
+	fid = p9_client_walk(dfid, 1, &name, 1);
 	if (IS_ERR(fid)) {
 		err = PTR_ERR(fid);
 		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
 		fid = NULL;
 		goto error;
-	} else
-		dfid = NULL;
+	}
 
 	/* instantiate inode and assign the unopened fid to the dentry */
 	inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
@@ -538,9 +553,6 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	return ofid;
 
 error:
-	if (dfid)
-		p9_client_clunk(dfid);
-
 	if (ofid)
 		p9_client_clunk(ofid);
 
@@ -675,8 +687,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(fid)) {
 		result = PTR_ERR(fid);
 		if (result == -ENOENT) {
-			d_add(dentry, NULL);
-			return NULL;
+			inode = NULL;
+			goto inst_out;
 		}
 
 		return ERR_PTR(result);
@@ -693,7 +705,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	if (result < 0)
 		goto error;
 
-	if ((fid->qid.version) && (v9ses->cache))
+inst_out:
+	if (v9ses->cache)
 		dentry->d_op = &v9fs_cached_dentry_operations;
 	else
 		dentry->d_op = &v9fs_dentry_operations;
@@ -772,6 +785,13 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		goto clunk_olddir;
 	}
 
+	if (v9fs_proto_dotl(v9ses)) {
+		retval = p9_client_rename(oldfid, newdirfid,
+					(char *) new_dentry->d_name.name);
+		if (retval != -ENOSYS)
+			goto clunk_newdir;
+	}
+
 	/* 9P can only handle file rename in the same directory */
 	if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) {
 		P9_DPRINTK(P9_DEBUG_ERROR,
@@ -1197,6 +1217,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 		sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev));
 	else if (S_ISFIFO(mode))
 		*name = 0;
+	else if (S_ISSOCK(mode))
+		*name = 0;
 	else {
 		__putname(name);
 		return -EINVAL;
@@ -1208,7 +1230,21 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 	return retval;
 }
 
-static const struct inode_operations v9fs_dir_inode_operations_ext = {
+static const struct inode_operations v9fs_dir_inode_operations_dotu = {
+	.create = v9fs_vfs_create,
+	.lookup = v9fs_vfs_lookup,
+	.symlink = v9fs_vfs_symlink,
+	.link = v9fs_vfs_link,
+	.unlink = v9fs_vfs_unlink,
+	.mkdir = v9fs_vfs_mkdir,
+	.rmdir = v9fs_vfs_rmdir,
+	.mknod = v9fs_vfs_mknod,
+	.rename = v9fs_vfs_rename,
+	.getattr = v9fs_vfs_getattr,
+	.setattr = v9fs_vfs_setattr,
+};
+
+static const struct inode_operations v9fs_dir_inode_operations_dotl = {
 	.create = v9fs_vfs_create,
 	.lookup = v9fs_vfs_lookup,
 	.symlink = v9fs_vfs_symlink,
@@ -1239,6 +1275,11 @@ static const struct inode_operations v9fs_file_inode_operations = {
 	.setattr = v9fs_vfs_setattr,
 };
 
+static const struct inode_operations v9fs_file_inode_operations_dotl = {
+	.getattr = v9fs_vfs_getattr,
+	.setattr = v9fs_vfs_setattr,
+};
+
 static const struct inode_operations v9fs_symlink_inode_operations = {
 	.readlink = generic_readlink,
 	.follow_link = v9fs_vfs_follow_link,
@@ -1246,3 +1287,11 @@ static const struct inode_operations v9fs_symlink_inode_operations = {
 	.getattr = v9fs_vfs_getattr,
 	.setattr = v9fs_vfs_setattr,
 };
+
+static const struct inode_operations v9fs_symlink_inode_operations_dotl = {
+	.readlink = generic_readlink,
+	.follow_link = v9fs_vfs_follow_link,
+	.put_link = v9fs_vfs_put_link,
+	.getattr = v9fs_vfs_getattr,
+	.setattr = v9fs_vfs_setattr,
+};
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 806da5d3b3a0..be74d020436e 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -38,6 +38,7 @@
 #include <linux/idr.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/statfs.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -45,7 +46,7 @@
 #include "v9fs_vfs.h"
 #include "fid.h"
 
-static const struct super_operations v9fs_super_ops;
+static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl;
 
 /**
  * v9fs_set_super - set the superblock
@@ -76,7 +77,10 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 	sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
 	sb->s_blocksize = 1 << sb->s_blocksize_bits;
 	sb->s_magic = V9FS_MAGIC;
-	sb->s_op = &v9fs_super_ops;
+	if (v9fs_proto_dotl(v9ses))
+		sb->s_op = &v9fs_super_ops_dotl;
+	else
+		sb->s_op = &v9fs_super_ops;
 	sb->s_bdi = &v9ses->bdi;
 
 	sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
@@ -211,6 +215,42 @@ v9fs_umount_begin(struct super_block *sb)
 	v9fs_session_begin_cancel(v9ses);
 }
 
+static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct v9fs_session_info *v9ses;
+	struct p9_fid *fid;
+	struct p9_rstatfs rs;
+	int res;
+
+	fid = v9fs_fid_lookup(dentry);
+	if (IS_ERR(fid)) {
+		res = PTR_ERR(fid);
+		goto done;
+	}
+
+	v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	if (v9fs_proto_dotl(v9ses)) {
+		res = p9_client_statfs(fid, &rs);
+		if (res == 0) {
+			buf->f_type = rs.type;
+			buf->f_bsize = rs.bsize;
+			buf->f_blocks = rs.blocks;
+			buf->f_bfree = rs.bfree;
+			buf->f_bavail = rs.bavail;
+			buf->f_files = rs.files;
+			buf->f_ffree = rs.ffree;
+			buf->f_fsid.val[0] = rs.fsid & 0xFFFFFFFFUL;
+			buf->f_fsid.val[1] = (rs.fsid >> 32) & 0xFFFFFFFFUL;
+			buf->f_namelen = rs.namelen;
+		}
+		if (res != -ENOSYS)
+			goto done;
+	}
+	res = simple_statfs(dentry, buf);
+done:
+	return res;
+}
+
 static const struct super_operations v9fs_super_ops = {
 #ifdef CONFIG_9P_FSCACHE
 	.alloc_inode = v9fs_alloc_inode,
@@ -222,6 +262,17 @@ static const struct super_operations v9fs_super_ops = {
 	.umount_begin = v9fs_umount_begin,
 };
 
+static const struct super_operations v9fs_super_ops_dotl = {
+#ifdef CONFIG_9P_FSCACHE
+	.alloc_inode = v9fs_alloc_inode,
+	.destroy_inode = v9fs_destroy_inode,
+#endif
+	.statfs = v9fs_statfs,
+	.clear_inode = v9fs_clear_inode,
+	.show_options = generic_show_options,
+	.umount_begin = v9fs_umount_begin,
+};
+
 struct file_system_type v9fs_fs_type = {
 	.name = "9p",
 	.get_sb = v9fs_get_sb,
diff --git a/fs/Makefile b/fs/Makefile
index 97f340f14ba2..e6ec1d309b1d 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
 		pnode.o drop_caches.o splice.o sync.o utimes.o \
-		stack.o fs_struct.o
+		stack.o fs_struct.o statfs.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y	+= buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index adc1cb771b57..b42d5cc1d6d2 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -189,13 +189,9 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index,
 				     struct key *key)
 {
 	struct page *page;
-	struct file file = {
-		.private_data = key,
-	};
-
 	_enter("{%lu},%lu", dir->i_ino, index);
 
-	page = read_mapping_page(dir->i_mapping, index, &file);
+	page = read_cache_page(dir->i_mapping, index, afs_page_filler, key);
 	if (!IS_ERR(page)) {
 		kmap(page);
 		if (!PageChecked(page))
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 0df9bc2b724d..14d89fa58fee 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -121,34 +121,19 @@ static void afs_file_readpage_read_complete(struct page *page,
 #endif
 
 /*
- * AFS read page from file, directory or symlink
+ * read page from file, directory or symlink, given a key to use
  */
-static int afs_readpage(struct file *file, struct page *page)
+int afs_page_filler(void *data, struct page *page)
 {
-	struct afs_vnode *vnode;
-	struct inode *inode;
-	struct key *key;
+	struct inode *inode = page->mapping->host;
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+	struct key *key = data;
 	size_t len;
 	off_t offset;
 	int ret;
 
-	inode = page->mapping->host;
-
-	if (file) {
-		key = file->private_data;
-		ASSERT(key != NULL);
-	} else {
-		key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
-		if (IS_ERR(key)) {
-			ret = PTR_ERR(key);
-			goto error_nokey;
-		}
-	}
-
 	_enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index);
 
-	vnode = AFS_FS_I(inode);
-
 	BUG_ON(!PageLocked(page));
 
 	ret = -ESTALE;
@@ -214,31 +199,56 @@ static int afs_readpage(struct file *file, struct page *page)
 		unlock_page(page);
 	}
 
-	if (!file)
-		key_put(key);
 	_leave(" = 0");
 	return 0;
 
 error:
 	SetPageError(page);
 	unlock_page(page);
-	if (!file)
-		key_put(key);
-error_nokey:
 	_leave(" = %d", ret);
 	return ret;
 }
 
 /*
+ * read page from file, directory or symlink, given a file to nominate the key
+ * to be used
+ */
+static int afs_readpage(struct file *file, struct page *page)
+{
+	struct key *key;
+	int ret;
+
+	if (file) {
+		key = file->private_data;
+		ASSERT(key != NULL);
+		ret = afs_page_filler(key, page);
+	} else {
+		struct inode *inode = page->mapping->host;
+		key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
+		if (IS_ERR(key)) {
+			ret = PTR_ERR(key);
+		} else {
+			ret = afs_page_filler(key, page);
+			key_put(key);
+		}
+	}
+	return ret;
+}
+
+/*
  * read a set of pages
  */
 static int afs_readpages(struct file *file, struct address_space *mapping,
 			 struct list_head *pages, unsigned nr_pages)
 {
+	struct key *key = file->private_data;
 	struct afs_vnode *vnode;
 	int ret = 0;
 
-	_enter(",{%lu},,%d", mapping->host->i_ino, nr_pages);
+	_enter("{%d},{%lu},,%d",
+	       key_serial(key), mapping->host->i_ino, nr_pages);
+
+	ASSERT(key != NULL);
 
 	vnode = AFS_FS_I(mapping->host);
 	if (vnode->flags & AFS_VNODE_DELETED) {
@@ -279,7 +289,7 @@ static int afs_readpages(struct file *file, struct address_space *mapping,
 	}
 
 	/* load the missing pages from the network */
-	ret = read_cache_pages(mapping, pages, (void *) afs_readpage, file);
+	ret = read_cache_pages(mapping, pages, afs_page_filler, key);
 
 	_leave(" = %d [netting]", ret);
 	return ret;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index a10f2582844f..807f284cc75e 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -494,6 +494,7 @@ extern const struct file_operations afs_file_operations;
 
 extern int afs_open(struct inode *, struct file *);
 extern int afs_release(struct inode *, struct file *);
+extern int afs_page_filler(void *, struct page *);
 
 /*
  * flock.c
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index b3feddc4f7d6..a9e23039ea34 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -49,9 +49,6 @@ static unsigned long afs_mntpt_expiry_timeout = 10 * 60;
  */
 int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
 {
-	struct file file = {
-		.private_data = key,
-	};
 	struct page *page;
 	size_t size;
 	char *buf;
@@ -61,7 +58,8 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
 	       vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
 
 	/* read the contents of the symlink into the pagecache */
-	page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, &file);
+	page = read_cache_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0,
+			       afs_page_filler, key);
 	if (IS_ERR(page)) {
 		ret = PTR_ERR(page);
 		goto out;
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index e4b75d6eda83..9bd4b3876c99 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -205,7 +205,7 @@ static struct inode *anon_inode_mkinode(void)
 	 * that it already _is_ on the dirty list.
 	 */
 	inode->i_state = I_DIRTY;
-	inode->i_mode = S_IRUSR | S_IWUSR;
+	inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
 	inode->i_uid = current_fsuid();
 	inode->i_gid = current_fsgid();
 	inode->i_flags |= S_PRIVATE;
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index e8e5e63ac950..db4117ed7803 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -18,13 +18,14 @@
 #include <linux/slab.h>
 #include <linux/param.h>
 #include <linux/time.h>
+#include <linux/smp_lock.h>
 #include "autofs_i.h"
 
 static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
 static int autofs4_dir_unlink(struct inode *,struct dentry *);
 static int autofs4_dir_rmdir(struct inode *,struct dentry *);
 static int autofs4_dir_mkdir(struct inode *,struct dentry *,int);
-static int autofs4_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long);
+static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
 static int autofs4_dir_open(struct inode *inode, struct file *file);
 static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
 static void *autofs4_follow_link(struct dentry *, struct nameidata *);
@@ -38,7 +39,7 @@ const struct file_operations autofs4_root_operations = {
 	.read		= generic_read_dir,
 	.readdir	= dcache_readdir,
 	.llseek		= dcache_dir_lseek,
-	.ioctl		= autofs4_root_ioctl,
+	.unlocked_ioctl	= autofs4_root_ioctl,
 };
 
 const struct file_operations autofs4_dir_operations = {
@@ -902,8 +903,8 @@ int is_autofs4_dentry(struct dentry *dentry)
  * ioctl()'s on the root directory is the chief method for the daemon to
  * generate kernel reactions
  */
-static int autofs4_root_ioctl(struct inode *inode, struct file *filp,
-			     unsigned int cmd, unsigned long arg)
+static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
+				       unsigned int cmd, unsigned long arg)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb);
 	void __user *p = (void __user *)arg;
@@ -947,3 +948,16 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp,
 		return -ENOSYS;
 	}
 }
+
+static long autofs4_root_ioctl(struct file *filp,
+			       unsigned int cmd, unsigned long arg)
+{
+	long ret;
+	struct inode *inode = filp->f_dentry->d_inode;
+
+	lock_kernel();
+	ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
+	unlock_kernel();
+
+	return ret;
+}
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 1e41aadb1068..8f73841fc974 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -105,14 +105,12 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	}
 	set_bit(ino, info->si_imap);
 	info->si_freei--;
-	inode->i_uid = current_fsuid();
-	inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current_fsgid();
+	inode_init_owner(inode, dir, mode);
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
 	inode->i_blocks = 0;
 	inode->i_op = &bfs_file_inops;
 	inode->i_fop = &bfs_file_operations;
 	inode->i_mapping->a_ops = &bfs_aops;
-	inode->i_mode = mode;
 	inode->i_ino = ino;
 	BFS_I(inode)->i_dsk_ino = ino;
 	BFS_I(inode)->i_sblock = 0;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 6dcee88c2e5d..26e5f5026620 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -245,37 +245,14 @@ struct super_block *freeze_bdev(struct block_device *bdev)
 	sb = get_active_super(bdev);
 	if (!sb)
 		goto out;
-	if (sb->s_flags & MS_RDONLY) {
-		sb->s_frozen = SB_FREEZE_TRANS;
-		up_write(&sb->s_umount);
+	error = freeze_super(sb);
+	if (error) {
+		deactivate_super(sb);
+		bdev->bd_fsfreeze_count--;
 		mutex_unlock(&bdev->bd_fsfreeze_mutex);
-		return sb;
-	}
-
-	sb->s_frozen = SB_FREEZE_WRITE;
-	smp_wmb();
-
-	sync_filesystem(sb);
-
-	sb->s_frozen = SB_FREEZE_TRANS;
-	smp_wmb();
-
-	sync_blockdev(sb->s_bdev);
-
-	if (sb->s_op->freeze_fs) {
-		error = sb->s_op->freeze_fs(sb);
-		if (error) {
-			printk(KERN_ERR
-				"VFS:Filesystem freeze failed\n");
-			sb->s_frozen = SB_UNFROZEN;
-			deactivate_locked_super(sb);
-			bdev->bd_fsfreeze_count--;
-			mutex_unlock(&bdev->bd_fsfreeze_mutex);
-			return ERR_PTR(error);
-		}
+		return ERR_PTR(error);
 	}
-	up_write(&sb->s_umount);
-
+	deactivate_super(sb);
  out:
 	sync_blockdev(bdev);
 	mutex_unlock(&bdev->bd_fsfreeze_mutex);
@@ -296,40 +273,22 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb)
 
 	mutex_lock(&bdev->bd_fsfreeze_mutex);
 	if (!bdev->bd_fsfreeze_count)
-		goto out_unlock;
+		goto out;
 
 	error = 0;
 	if (--bdev->bd_fsfreeze_count > 0)
-		goto out_unlock;
+		goto out;
 
 	if (!sb)
-		goto out_unlock;
-
-	BUG_ON(sb->s_bdev != bdev);
-	down_write(&sb->s_umount);
-	if (sb->s_flags & MS_RDONLY)
-		goto out_unfrozen;
-
-	if (sb->s_op->unfreeze_fs) {
-		error = sb->s_op->unfreeze_fs(sb);
-		if (error) {
-			printk(KERN_ERR
-				"VFS:Filesystem thaw failed\n");
-			sb->s_frozen = SB_FREEZE_TRANS;
-			bdev->bd_fsfreeze_count++;
-			mutex_unlock(&bdev->bd_fsfreeze_mutex);
-			return error;
-		}
-	}
-
-out_unfrozen:
-	sb->s_frozen = SB_UNFROZEN;
-	smp_wmb();
-	wake_up(&sb->s_wait_unfrozen);
+		goto out;
 
-	if (sb)
-		deactivate_locked_super(sb);
-out_unlock:
+	error = thaw_super(sb);
+	if (error) {
+		bdev->bd_fsfreeze_count++;
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		return error;
+	}
+out:
 	mutex_unlock(&bdev->bd_fsfreeze_mutex);
 	return 0;
 }
@@ -417,7 +376,7 @@ int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync)
 	 */
 	mutex_unlock(&bd_inode->i_mutex);
 
-	error = blkdev_issue_flush(bdev, NULL);
+	error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
 	if (error == -EOPNOTSUPP)
 		error = 0;
 
@@ -668,41 +627,209 @@ void bd_forget(struct inode *inode)
 	iput(bdev->bd_inode);
 }
 
-int bd_claim(struct block_device *bdev, void *holder)
+/**
+ * bd_may_claim - test whether a block device can be claimed
+ * @bdev: block device of interest
+ * @whole: whole block device containing @bdev, may equal @bdev
+ * @holder: holder trying to claim @bdev
+ *
+ * Test whther @bdev can be claimed by @holder.
+ *
+ * CONTEXT:
+ * spin_lock(&bdev_lock).
+ *
+ * RETURNS:
+ * %true if @bdev can be claimed, %false otherwise.
+ */
+static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
+			 void *holder)
 {
-	int res;
-	spin_lock(&bdev_lock);
-
-	/* first decide result */
 	if (bdev->bd_holder == holder)
-		res = 0;	 /* already a holder */
+		return true;	 /* already a holder */
 	else if (bdev->bd_holder != NULL)
-		res = -EBUSY;	 /* held by someone else */
+		return false;	 /* held by someone else */
 	else if (bdev->bd_contains == bdev)
-		res = 0;	 /* is a whole device which isn't held */
+		return true;	 /* is a whole device which isn't held */
 
-	else if (bdev->bd_contains->bd_holder == bd_claim)
-		res = 0; 	 /* is a partition of a device that is being partitioned */
-	else if (bdev->bd_contains->bd_holder != NULL)
-		res = -EBUSY;	 /* is a partition of a held device */
+	else if (whole->bd_holder == bd_claim)
+		return true; 	 /* is a partition of a device that is being partitioned */
+	else if (whole->bd_holder != NULL)
+		return false;	 /* is a partition of a held device */
 	else
-		res = 0;	 /* is a partition of an un-held device */
+		return true;	 /* is a partition of an un-held device */
+}
+
+/**
+ * bd_prepare_to_claim - prepare to claim a block device
+ * @bdev: block device of interest
+ * @whole: the whole device containing @bdev, may equal @bdev
+ * @holder: holder trying to claim @bdev
+ *
+ * Prepare to claim @bdev.  This function fails if @bdev is already
+ * claimed by another holder and waits if another claiming is in
+ * progress.  This function doesn't actually claim.  On successful
+ * return, the caller has ownership of bd_claiming and bd_holder[s].
+ *
+ * CONTEXT:
+ * spin_lock(&bdev_lock).  Might release bdev_lock, sleep and regrab
+ * it multiple times.
+ *
+ * RETURNS:
+ * 0 if @bdev can be claimed, -EBUSY otherwise.
+ */
+static int bd_prepare_to_claim(struct block_device *bdev,
+			       struct block_device *whole, void *holder)
+{
+retry:
+	/* if someone else claimed, fail */
+	if (!bd_may_claim(bdev, whole, holder))
+		return -EBUSY;
+
+	/* if someone else is claiming, wait for it to finish */
+	if (whole->bd_claiming && whole->bd_claiming != holder) {
+		wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
+		DEFINE_WAIT(wait);
+
+		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
+		spin_unlock(&bdev_lock);
+		schedule();
+		finish_wait(wq, &wait);
+		spin_lock(&bdev_lock);
+		goto retry;
+	}
+
+	/* yay, all mine */
+	return 0;
+}
 
-	/* now impose change */
-	if (res==0) {
+/**
+ * bd_start_claiming - start claiming a block device
+ * @bdev: block device of interest
+ * @holder: holder trying to claim @bdev
+ *
+ * @bdev is about to be opened exclusively.  Check @bdev can be opened
+ * exclusively and mark that an exclusive open is in progress.  Each
+ * successful call to this function must be matched with a call to
+ * either bd_claim() or bd_abort_claiming().  If this function
+ * succeeds, the matching bd_claim() is guaranteed to succeed.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * Pointer to the block device containing @bdev on success, ERR_PTR()
+ * value on failure.
+ */
+static struct block_device *bd_start_claiming(struct block_device *bdev,
+					      void *holder)
+{
+	struct gendisk *disk;
+	struct block_device *whole;
+	int partno, err;
+
+	might_sleep();
+
+	/*
+	 * @bdev might not have been initialized properly yet, look up
+	 * and grab the outer block device the hard way.
+	 */
+	disk = get_gendisk(bdev->bd_dev, &partno);
+	if (!disk)
+		return ERR_PTR(-ENXIO);
+
+	whole = bdget_disk(disk, 0);
+	put_disk(disk);
+	if (!whole)
+		return ERR_PTR(-ENOMEM);
+
+	/* prepare to claim, if successful, mark claiming in progress */
+	spin_lock(&bdev_lock);
+
+	err = bd_prepare_to_claim(bdev, whole, holder);
+	if (err == 0) {
+		whole->bd_claiming = holder;
+		spin_unlock(&bdev_lock);
+		return whole;
+	} else {
+		spin_unlock(&bdev_lock);
+		bdput(whole);
+		return ERR_PTR(err);
+	}
+}
+
+/* releases bdev_lock */
+static void __bd_abort_claiming(struct block_device *whole, void *holder)
+{
+	BUG_ON(whole->bd_claiming != holder);
+	whole->bd_claiming = NULL;
+	wake_up_bit(&whole->bd_claiming, 0);
+
+	spin_unlock(&bdev_lock);
+	bdput(whole);
+}
+
+/**
+ * bd_abort_claiming - abort claiming a block device
+ * @whole: whole block device returned by bd_start_claiming()
+ * @holder: holder trying to claim @bdev
+ *
+ * Abort a claiming block started by bd_start_claiming().  Note that
+ * @whole is not the block device to be claimed but the whole device
+ * returned by bd_start_claiming().
+ *
+ * CONTEXT:
+ * Grabs and releases bdev_lock.
+ */
+static void bd_abort_claiming(struct block_device *whole, void *holder)
+{
+	spin_lock(&bdev_lock);
+	__bd_abort_claiming(whole, holder);	/* releases bdev_lock */
+}
+
+/**
+ * bd_claim - claim a block device
+ * @bdev: block device to claim
+ * @holder: holder trying to claim @bdev
+ *
+ * Try to claim @bdev which must have been opened successfully.  This
+ * function may be called with or without preceding
+ * blk_start_claiming().  In the former case, this function is always
+ * successful and terminates the claiming block.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * 0 if successful, -EBUSY if @bdev is already claimed.
+ */
+int bd_claim(struct block_device *bdev, void *holder)
+{
+	struct block_device *whole = bdev->bd_contains;
+	int res;
+
+	might_sleep();
+
+	spin_lock(&bdev_lock);
+
+	res = bd_prepare_to_claim(bdev, whole, holder);
+	if (res == 0) {
 		/* note that for a whole device bd_holders
 		 * will be incremented twice, and bd_holder will
 		 * be set to bd_claim before being set to holder
 		 */
-		bdev->bd_contains->bd_holders ++;
-		bdev->bd_contains->bd_holder = bd_claim;
+		whole->bd_holders++;
+		whole->bd_holder = bd_claim;
 		bdev->bd_holders++;
 		bdev->bd_holder = holder;
 	}
-	spin_unlock(&bdev_lock);
+
+	if (whole->bd_claiming)
+		__bd_abort_claiming(whole, holder);	/* releases bdev_lock */
+	else
+		spin_unlock(&bdev_lock);
+
 	return res;
 }
-
 EXPORT_SYMBOL(bd_claim);
 
 void bd_release(struct block_device *bdev)
@@ -1316,6 +1443,7 @@ EXPORT_SYMBOL(blkdev_get);
 
 static int blkdev_open(struct inode * inode, struct file * filp)
 {
+	struct block_device *whole = NULL;
 	struct block_device *bdev;
 	int res;
 
@@ -1338,22 +1466,25 @@ static int blkdev_open(struct inode * inode, struct file * filp)
 	if (bdev == NULL)
 		return -ENOMEM;
 
+	if (filp->f_mode & FMODE_EXCL) {
+		whole = bd_start_claiming(bdev, filp);
+		if (IS_ERR(whole)) {
+			bdput(bdev);
+			return PTR_ERR(whole);
+		}
+	}
+
 	filp->f_mapping = bdev->bd_inode->i_mapping;
 
 	res = blkdev_get(bdev, filp->f_mode);
-	if (res)
-		return res;
 
-	if (filp->f_mode & FMODE_EXCL) {
-		res = bd_claim(bdev, filp);
-		if (res)
-			goto out_blkdev_put;
+	if (whole) {
+		if (res == 0)
+			BUG_ON(bd_claim(bdev, filp) != 0);
+		else
+			bd_abort_claiming(whole, filp);
 	}
 
-	return 0;
-
- out_blkdev_put:
-	blkdev_put(bdev, filp->f_mode);
 	return res;
 }
 
@@ -1564,27 +1695,34 @@ EXPORT_SYMBOL(lookup_bdev);
  */
 struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
 {
-	struct block_device *bdev;
-	int error = 0;
+	struct block_device *bdev, *whole;
+	int error;
 
 	bdev = lookup_bdev(path);
 	if (IS_ERR(bdev))
 		return bdev;
 
+	whole = bd_start_claiming(bdev, holder);
+	if (IS_ERR(whole)) {
+		bdput(bdev);
+		return whole;
+	}
+
 	error = blkdev_get(bdev, mode);
 	if (error)
-		return ERR_PTR(error);
+		goto out_abort_claiming;
+
 	error = -EACCES;
 	if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
-		goto blkdev_put;
-	error = bd_claim(bdev, holder);
-	if (error)
-		goto blkdev_put;
+		goto out_blkdev_put;
 
+	BUG_ON(bd_claim(bdev, holder) != 0);
 	return bdev;
 
-blkdev_put:
+out_blkdev_put:
 	blkdev_put(bdev, mode);
+out_abort_claiming:
+	bd_abort_claiming(whole, holder);
 	return ERR_PTR(error);
 }
 
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 6ef7b26724ec..8d432cd9d580 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -282,14 +282,14 @@ int btrfs_acl_chmod(struct inode *inode)
 	return ret;
 }
 
-struct xattr_handler btrfs_xattr_acl_default_handler = {
+const struct xattr_handler btrfs_xattr_acl_default_handler = {
 	.prefix = POSIX_ACL_XATTR_DEFAULT,
 	.flags	= ACL_TYPE_DEFAULT,
 	.get	= btrfs_xattr_acl_get,
 	.set	= btrfs_xattr_acl_set,
 };
 
-struct xattr_handler btrfs_xattr_acl_access_handler = {
+const struct xattr_handler btrfs_xattr_acl_access_handler = {
 	.prefix = POSIX_ACL_XATTR_ACCESS,
 	.flags	= ACL_TYPE_ACCESS,
 	.get	= btrfs_xattr_acl_get,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b34d32fdaaec..c6a4f459ad76 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1589,7 +1589,7 @@ static void btrfs_issue_discard(struct block_device *bdev,
 				u64 start, u64 len)
 {
 	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
-			DISCARD_FL_BARRIER);
+			BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 }
 
 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2bfdc641d4e3..d601629b85d1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4121,16 +4121,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	if (ret != 0)
 		goto fail;
 
-	inode->i_uid = current_fsuid();
-
-	if (dir && (dir->i_mode & S_ISGID)) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
-	} else
-		inode->i_gid = current_fsgid();
-
-	inode->i_mode = mode;
+	inode_init_owner(inode, dir, mode);
 	inode->i_ino = objectid;
 	inode_set_bytes(inode, 0);
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 193b58f7d3f3..59acd3eb288a 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -282,7 +282,7 @@ err:
  * List of handlers for synthetic system.* attributes.  All real ondisk
  * attributes are handled directly.
  */
-struct xattr_handler *btrfs_xattr_handlers[] = {
+const struct xattr_handler *btrfs_xattr_handlers[] = {
 #ifdef CONFIG_BTRFS_FS_POSIX_ACL
 	&btrfs_xattr_acl_access_handler,
 	&btrfs_xattr_acl_default_handler,
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 721efa0346e0..7a43fd640bbb 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -21,9 +21,9 @@
21 21
22#include <linux/xattr.h> 22#include <linux/xattr.h>
23 23
24extern struct xattr_handler btrfs_xattr_acl_access_handler; 24extern const struct xattr_handler btrfs_xattr_acl_access_handler;
25extern struct xattr_handler btrfs_xattr_acl_default_handler; 25extern const struct xattr_handler btrfs_xattr_acl_default_handler;
26extern struct xattr_handler *btrfs_xattr_handlers[]; 26extern const struct xattr_handler *btrfs_xattr_handlers[];
27 27
28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, 28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
29 void *buffer, size_t size); 29 void *buffer, size_t size);
diff --git a/fs/buffer.c b/fs/buffer.c
index c9c266db0624..e8aa7081d25c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -275,6 +275,7 @@ void invalidate_bdev(struct block_device *bdev)
275 return; 275 return;
276 276
277 invalidate_bh_lrus(); 277 invalidate_bh_lrus();
278 lru_add_drain_all(); /* make sure all lru add caches are flushed */
278 invalidate_mapping_pages(mapping, 0, -1); 279 invalidate_mapping_pages(mapping, 0, -1);
279} 280}
280EXPORT_SYMBOL(invalidate_bdev); 281EXPORT_SYMBOL(invalidate_bdev);
@@ -560,26 +561,17 @@ repeat:
560 return err; 561 return err;
561} 562}
562 563
563static void do_thaw_all(struct work_struct *work) 564static void do_thaw_one(struct super_block *sb, void *unused)
564{ 565{
565 struct super_block *sb;
566 char b[BDEVNAME_SIZE]; 566 char b[BDEVNAME_SIZE];
567 while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
568 printk(KERN_WARNING "Emergency Thaw on %s\n",
569 bdevname(sb->s_bdev, b));
570}
567 571
568 spin_lock(&sb_lock); 572static void do_thaw_all(struct work_struct *work)
569restart: 573{
570 list_for_each_entry(sb, &super_blocks, s_list) { 574 iterate_supers(do_thaw_one, NULL);
571 sb->s_count++;
572 spin_unlock(&sb_lock);
573 down_read(&sb->s_umount);
574 while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
575 printk(KERN_WARNING "Emergency Thaw on %s\n",
576 bdevname(sb->s_bdev, b));
577 up_read(&sb->s_umount);
578 spin_lock(&sb_lock);
579 if (__put_super_and_need_restart(sb))
580 goto restart;
581 }
582 spin_unlock(&sb_lock);
583 kfree(work); 575 kfree(work);
584 printk(KERN_WARNING "Emergency Thaw complete\n"); 576 printk(KERN_WARNING "Emergency Thaw complete\n");
585} 577}
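
The buffer.c hunk retires an open-coded walk of the super_blocks list, with its manual s_count bumping, sb_lock juggling, and restart logic, in favor of iterate_supers(), which invokes a callback for every mounted superblock with s_umount held for read. A hedged usage sketch (the callback and counter here are hypothetical):

    /* Sketch: iterate_supers() handles refcounting and s_umount locking. */
    static void count_one(struct super_block *sb, void *arg)
    {
            unsigned long *n = arg;
            (*n)++;
    }

    static unsigned long count_supers(void)
    {
            unsigned long n = 0;
            iterate_supers(count_one, &n);
            return n;
    }
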
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index a9005d862ed4..d9c60b84949a 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -274,7 +274,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
274 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; 274 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
275 int rc = 0; 275 int rc = 0;
276 struct page **pages; 276 struct page **pages;
277 struct pagevec pvec;
278 loff_t offset; 277 loff_t offset;
279 u64 len; 278 u64 len;
280 279
@@ -297,8 +296,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
297 if (rc < 0) 296 if (rc < 0)
298 goto out; 297 goto out;
299 298
300 /* set uptodate and add to lru in pagevec-sized chunks */
301 pagevec_init(&pvec, 0);
302 for (; !list_empty(page_list) && len > 0; 299 for (; !list_empty(page_list) && len > 0;
303 rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) { 300 rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
304 struct page *page = 301 struct page *page =
@@ -312,7 +309,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
312 zero_user_segment(page, s, PAGE_CACHE_SIZE); 309 zero_user_segment(page, s, PAGE_CACHE_SIZE);
313 } 310 }
314 311
315 if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) { 312 if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) {
316 page_cache_release(page); 313 page_cache_release(page);
317 dout("readpages %p add_to_page_cache failed %p\n", 314 dout("readpages %p add_to_page_cache failed %p\n",
318 inode, page); 315 inode, page);
@@ -323,10 +320,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
323 flush_dcache_page(page); 320 flush_dcache_page(page);
324 SetPageUptodate(page); 321 SetPageUptodate(page);
325 unlock_page(page); 322 unlock_page(page);
326 if (pagevec_add(&pvec, page) == 0) 323 page_cache_release(page);
327 pagevec_lru_add_file(&pvec); /* add to lru */
328 } 324 }
329 pagevec_lru_add_file(&pvec);
330 rc = 0; 325 rc = 0;
331 326
332out: 327out:
@@ -568,7 +563,7 @@ static void writepages_finish(struct ceph_osd_request *req,
568 ceph_release_pages(req->r_pages, req->r_num_pages); 563 ceph_release_pages(req->r_pages, req->r_num_pages);
569 if (req->r_pages_from_pool) 564 if (req->r_pages_from_pool)
570 mempool_free(req->r_pages, 565 mempool_free(req->r_pages,
571 ceph_client(inode->i_sb)->wb_pagevec_pool); 566 ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
572 else 567 else
573 kfree(req->r_pages); 568 kfree(req->r_pages);
574 ceph_osdc_put_request(req); 569 ceph_osdc_put_request(req);
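
The ceph_readpages() hunk drops the manual pagevec batching because add_to_page_cache_lru() inserts the page into the page cache and onto the LRU in a single call; the caller then releases only its own reference. A sketch of the resulting per-page shape (fragment, assuming page and mapping from the surrounding loop):

    /* Sketch: cache + LRU insertion in one step, per the hunk above. */
    if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) {
            page_cache_release(page);       /* insertion failed; skip page */
    } else {
            /* ... fill the page with data ... */
            flush_dcache_page(page);
            SetPageUptodate(page);
            unlock_page(page);
            page_cache_release(page);       /* cache/LRU hold their own refs */
    }
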
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
index 818afe72e6c7..9f46de2ba7a7 100644
--- a/fs/ceph/auth.c
+++ b/fs/ceph/auth.c
@@ -150,7 +150,8 @@ int ceph_build_auth_request(struct ceph_auth_client *ac,
150 150
151 ret = ac->ops->build_request(ac, p + sizeof(u32), end); 151 ret = ac->ops->build_request(ac, p + sizeof(u32), end);
152 if (ret < 0) { 152 if (ret < 0) {
153 pr_err("error %d building request\n", ret); 153 pr_err("error %d building auth method %s request\n", ret,
154 ac->ops->name);
154 return ret; 155 return ret;
155 } 156 }
156 dout(" built request %d bytes\n", ret); 157 dout(" built request %d bytes\n", ret);
@@ -216,8 +217,8 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
216 if (ac->protocol != protocol) { 217 if (ac->protocol != protocol) {
217 ret = ceph_auth_init_protocol(ac, protocol); 218 ret = ceph_auth_init_protocol(ac, protocol);
218 if (ret) { 219 if (ret) {
219 pr_err("error %d on auth protocol %d init\n", 220 pr_err("error %d on auth method %s init\n",
220 ret, protocol); 221 ret, ac->ops->name);
221 goto out; 222 goto out;
222 } 223 }
223 } 224 }
@@ -229,7 +230,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
229 if (ret == -EAGAIN) { 230 if (ret == -EAGAIN) {
230 return ceph_build_auth_request(ac, reply_buf, reply_len); 231 return ceph_build_auth_request(ac, reply_buf, reply_len);
231 } else if (ret) { 232 } else if (ret) {
232 pr_err("authentication error %d\n", ret); 233 pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
233 return ret; 234 return ret;
234 } 235 }
235 return 0; 236 return 0;
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
index ca4f57cfb267..4429a707c021 100644
--- a/fs/ceph/auth.h
+++ b/fs/ceph/auth.h
@@ -15,6 +15,8 @@ struct ceph_auth_client;
15struct ceph_authorizer; 15struct ceph_authorizer;
16 16
17struct ceph_auth_client_ops { 17struct ceph_auth_client_ops {
18 const char *name;
19
18 /* 20 /*
19 * true if we are authenticated and can connect to 21 * true if we are authenticated and can connect to
20 * services. 22 * services.
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
index 8cd9e3af07f7..24407c119291 100644
--- a/fs/ceph/auth_none.c
+++ b/fs/ceph/auth_none.c
@@ -94,6 +94,7 @@ static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
94} 94}
95 95
96static const struct ceph_auth_client_ops ceph_auth_none_ops = { 96static const struct ceph_auth_client_ops ceph_auth_none_ops = {
97 .name = "none",
97 .reset = reset, 98 .reset = reset,
98 .destroy = destroy, 99 .destroy = destroy,
99 .is_authenticated = is_authenticated, 100 .is_authenticated = is_authenticated,
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index fee5a08da881..7b206231566d 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -127,7 +127,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
127 int ret; 127 int ret;
128 char *dbuf; 128 char *dbuf;
129 char *ticket_buf; 129 char *ticket_buf;
130 u8 struct_v; 130 u8 reply_struct_v;
131 131
132 dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS); 132 dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
133 if (!dbuf) 133 if (!dbuf)
@@ -139,14 +139,14 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
139 goto out_dbuf; 139 goto out_dbuf;
140 140
141 ceph_decode_need(&p, end, 1 + sizeof(u32), bad); 141 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
142 struct_v = ceph_decode_8(&p); 142 reply_struct_v = ceph_decode_8(&p);
143 if (struct_v != 1) 143 if (reply_struct_v != 1)
144 goto bad; 144 goto bad;
145 num = ceph_decode_32(&p); 145 num = ceph_decode_32(&p);
146 dout("%d tickets\n", num); 146 dout("%d tickets\n", num);
147 while (num--) { 147 while (num--) {
148 int type; 148 int type;
149 u8 struct_v; 149 u8 tkt_struct_v, blob_struct_v;
150 struct ceph_x_ticket_handler *th; 150 struct ceph_x_ticket_handler *th;
151 void *dp, *dend; 151 void *dp, *dend;
152 int dlen; 152 int dlen;
@@ -165,8 +165,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
165 type = ceph_decode_32(&p); 165 type = ceph_decode_32(&p);
166 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type)); 166 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
167 167
168 struct_v = ceph_decode_8(&p); 168 tkt_struct_v = ceph_decode_8(&p);
169 if (struct_v != 1) 169 if (tkt_struct_v != 1)
170 goto bad; 170 goto bad;
171 171
172 th = get_ticket_handler(ac, type); 172 th = get_ticket_handler(ac, type);
@@ -186,8 +186,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
186 dend = dbuf + dlen; 186 dend = dbuf + dlen;
187 dp = dbuf; 187 dp = dbuf;
188 188
189 struct_v = ceph_decode_8(&dp); 189 tkt_struct_v = ceph_decode_8(&dp);
190 if (struct_v != 1) 190 if (tkt_struct_v != 1)
191 goto bad; 191 goto bad;
192 192
193 memcpy(&old_key, &th->session_key, sizeof(old_key)); 193 memcpy(&old_key, &th->session_key, sizeof(old_key));
@@ -224,7 +224,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
224 tpend = tp + dlen; 224 tpend = tp + dlen;
225 dout(" ticket blob is %d bytes\n", dlen); 225 dout(" ticket blob is %d bytes\n", dlen);
226 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad); 226 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
227 struct_v = ceph_decode_8(&tp); 227 blob_struct_v = ceph_decode_8(&tp);
228 new_secret_id = ceph_decode_64(&tp); 228 new_secret_id = ceph_decode_64(&tp);
229 ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend); 229 ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
230 if (ret) 230 if (ret)
@@ -618,6 +618,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
618 618
619 619
620static const struct ceph_auth_client_ops ceph_x_ops = { 620static const struct ceph_auth_client_ops ceph_x_ops = {
621 .name = "x",
621 .is_authenticated = ceph_x_is_authenticated, 622 .is_authenticated = ceph_x_is_authenticated,
622 .build_request = ceph_x_build_request, 623 .build_request = ceph_x_build_request,
623 .handle_reply = ceph_x_handle_reply, 624 .handle_reply = ceph_x_handle_reply,
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index d9400534b279..0dd0b81e64f7 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -867,7 +867,8 @@ void __ceph_remove_cap(struct ceph_cap *cap)
867{ 867{
868 struct ceph_mds_session *session = cap->session; 868 struct ceph_mds_session *session = cap->session;
869 struct ceph_inode_info *ci = cap->ci; 869 struct ceph_inode_info *ci = cap->ci;
870 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc; 870 struct ceph_mds_client *mdsc =
871 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
871 int removed = 0; 872 int removed = 0;
872 873
873 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); 874 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
@@ -937,9 +938,9 @@ static int send_cap_msg(struct ceph_mds_session *session,
937 seq, issue_seq, mseq, follows, size, max_size, 938 seq, issue_seq, mseq, follows, size, max_size,
938 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); 939 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
939 940
940 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL); 941 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS);
941 if (IS_ERR(msg)) 942 if (!msg)
942 return PTR_ERR(msg); 943 return -ENOMEM;
943 944
944 msg->hdr.tid = cpu_to_le64(flush_tid); 945 msg->hdr.tid = cpu_to_le64(flush_tid);
945 946
@@ -1298,7 +1299,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
1298 */ 1299 */
1299void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) 1300void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1300{ 1301{
1301 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc; 1302 struct ceph_mds_client *mdsc =
1303 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
1302 struct inode *inode = &ci->vfs_inode; 1304 struct inode *inode = &ci->vfs_inode;
1303 int was = ci->i_dirty_caps; 1305 int was = ci->i_dirty_caps;
1304 int dirty = 0; 1306 int dirty = 0;
@@ -1336,7 +1338,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1336static int __mark_caps_flushing(struct inode *inode, 1338static int __mark_caps_flushing(struct inode *inode,
1337 struct ceph_mds_session *session) 1339 struct ceph_mds_session *session)
1338{ 1340{
1339 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 1341 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
1340 struct ceph_inode_info *ci = ceph_inode(inode); 1342 struct ceph_inode_info *ci = ceph_inode(inode);
1341 int flushing; 1343 int flushing;
1342 1344
@@ -1663,7 +1665,7 @@ ack:
1663static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, 1665static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
1664 unsigned *flush_tid) 1666 unsigned *flush_tid)
1665{ 1667{
1666 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 1668 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
1667 struct ceph_inode_info *ci = ceph_inode(inode); 1669 struct ceph_inode_info *ci = ceph_inode(inode);
1668 int unlock_session = session ? 0 : 1; 1670 int unlock_session = session ? 0 : 1;
1669 int flushing = 0; 1671 int flushing = 0;
@@ -1716,10 +1718,9 @@ out_unlocked:
1716static int caps_are_flushed(struct inode *inode, unsigned tid) 1718static int caps_are_flushed(struct inode *inode, unsigned tid)
1717{ 1719{
1718 struct ceph_inode_info *ci = ceph_inode(inode); 1720 struct ceph_inode_info *ci = ceph_inode(inode);
1719 int dirty, i, ret = 1; 1721 int i, ret = 1;
1720 1722
1721 spin_lock(&inode->i_lock); 1723 spin_lock(&inode->i_lock);
1722 dirty = __ceph_caps_dirty(ci);
1723 for (i = 0; i < CEPH_CAP_BITS; i++) 1724 for (i = 0; i < CEPH_CAP_BITS; i++)
1724 if ((ci->i_flushing_caps & (1 << i)) && 1725 if ((ci->i_flushing_caps & (1 << i)) &&
1725 ci->i_cap_flush_tid[i] <= tid) { 1726 ci->i_cap_flush_tid[i] <= tid) {
@@ -1829,7 +1830,8 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
1829 err = wait_event_interruptible(ci->i_cap_wq, 1830 err = wait_event_interruptible(ci->i_cap_wq,
1830 caps_are_flushed(inode, flush_tid)); 1831 caps_are_flushed(inode, flush_tid));
1831 } else { 1832 } else {
1832 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 1833 struct ceph_mds_client *mdsc =
1834 &ceph_sb_to_client(inode->i_sb)->mdsc;
1833 1835
1834 spin_lock(&inode->i_lock); 1836 spin_lock(&inode->i_lock);
1835 if (__ceph_caps_dirty(ci)) 1837 if (__ceph_caps_dirty(ci))
@@ -2411,7 +2413,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2411 __releases(inode->i_lock) 2413 __releases(inode->i_lock)
2412{ 2414{
2413 struct ceph_inode_info *ci = ceph_inode(inode); 2415 struct ceph_inode_info *ci = ceph_inode(inode);
2414 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 2416 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
2415 unsigned seq = le32_to_cpu(m->seq); 2417 unsigned seq = le32_to_cpu(m->seq);
2416 int dirty = le32_to_cpu(m->dirty); 2418 int dirty = le32_to_cpu(m->dirty);
2417 int cleaned = 0; 2419 int cleaned = 0;
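
A theme across the ceph hunks in this series: ceph_msg_new() drops two arguments in favor of an explicit gfp_t, and it now signals failure with NULL rather than ERR_PTR(), so every caller's error check flips. A sketch of the new convention (fragment; fc and session come from the surrounding function, as in send_cap_msg() above):

    /* Sketch: post-change message allocation and error handling. */
    struct ceph_msg *msg;

    msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS);
    if (!msg)               /* NULL, not IS_ERR(), now means failure */
            return -ENOMEM;
    ceph_con_send(&session->s_con, msg);
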
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index 0c2241ef3653..3b9eeed097b3 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -19,7 +19,7 @@
19 * Ceph release version 19 * Ceph release version
20 */ 20 */
21#define CEPH_VERSION_MAJOR 0 21#define CEPH_VERSION_MAJOR 0
22#define CEPH_VERSION_MINOR 19 22#define CEPH_VERSION_MINOR 20
23#define CEPH_VERSION_PATCH 0 23#define CEPH_VERSION_PATCH 0
24 24
25#define _CEPH_STRINGIFY(x) #x 25#define _CEPH_STRINGIFY(x) #x
@@ -36,7 +36,7 @@
36 * client-facing protocol. 36 * client-facing protocol.
37 */ 37 */
38#define CEPH_OSD_PROTOCOL 8 /* cluster internal */ 38#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
39#define CEPH_MDS_PROTOCOL 9 /* cluster internal */ 39#define CEPH_MDS_PROTOCOL 12 /* cluster internal */
40#define CEPH_MON_PROTOCOL 5 /* cluster internal */ 40#define CEPH_MON_PROTOCOL 5 /* cluster internal */
41#define CEPH_OSDC_PROTOCOL 24 /* server/client */ 41#define CEPH_OSDC_PROTOCOL 24 /* server/client */
42#define CEPH_MDSC_PROTOCOL 32 /* server/client */ 42#define CEPH_MDSC_PROTOCOL 32 /* server/client */
@@ -53,8 +53,18 @@
53/* 53/*
54 * feature bits 54 * feature bits
55 */ 55 */
56#define CEPH_FEATURE_SUPPORTED 0 56#define CEPH_FEATURE_UID 1
57#define CEPH_FEATURE_REQUIRED 0 57#define CEPH_FEATURE_NOSRCADDR 2
58#define CEPH_FEATURE_FLOCK 4
59
60#define CEPH_FEATURE_SUPPORTED_MON CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
61#define CEPH_FEATURE_REQUIRED_MON CEPH_FEATURE_UID
62#define CEPH_FEATURE_SUPPORTED_MDS CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR|CEPH_FEATURE_FLOCK
63#define CEPH_FEATURE_REQUIRED_MDS CEPH_FEATURE_UID
64#define CEPH_FEATURE_SUPPORTED_OSD CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
65#define CEPH_FEATURE_REQUIRED_OSD CEPH_FEATURE_UID
66#define CEPH_FEATURE_SUPPORTED_CLIENT CEPH_FEATURE_NOSRCADDR
67#define CEPH_FEATURE_REQUIRED_CLIENT CEPH_FEATURE_NOSRCADDR
58 68
59 69
60/* 70/*
@@ -91,6 +101,8 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
91#define CEPH_AUTH_NONE 0x1 101#define CEPH_AUTH_NONE 0x1
92#define CEPH_AUTH_CEPHX 0x2 102#define CEPH_AUTH_CEPHX 0x2
93 103
104#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
105
94 106
95/********************************************* 107/*********************************************
96 * message layer 108 * message layer
@@ -128,11 +140,27 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
128#define CEPH_MSG_CLIENT_SNAP 0x312 140#define CEPH_MSG_CLIENT_SNAP 0x312
129#define CEPH_MSG_CLIENT_CAPRELEASE 0x313 141#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
130 142
143/* pool ops */
144#define CEPH_MSG_POOLOP_REPLY 48
145#define CEPH_MSG_POOLOP 49
146
147
131/* osd */ 148/* osd */
132#define CEPH_MSG_OSD_MAP 41 149#define CEPH_MSG_OSD_MAP 41
133#define CEPH_MSG_OSD_OP 42 150#define CEPH_MSG_OSD_OP 42
134#define CEPH_MSG_OSD_OPREPLY 43 151#define CEPH_MSG_OSD_OPREPLY 43
135 152
153/* pool operations */
154enum {
155 POOL_OP_CREATE = 0x01,
156 POOL_OP_DELETE = 0x02,
157 POOL_OP_AUID_CHANGE = 0x03,
158 POOL_OP_CREATE_SNAP = 0x11,
159 POOL_OP_DELETE_SNAP = 0x12,
160 POOL_OP_CREATE_UNMANAGED_SNAP = 0x21,
161 POOL_OP_DELETE_UNMANAGED_SNAP = 0x22,
162};
163
136struct ceph_mon_request_header { 164struct ceph_mon_request_header {
137 __le64 have_version; 165 __le64 have_version;
138 __le16 session_mon; 166 __le16 session_mon;
@@ -155,6 +183,31 @@ struct ceph_mon_statfs_reply {
155 struct ceph_statfs st; 183 struct ceph_statfs st;
156} __attribute__ ((packed)); 184} __attribute__ ((packed));
157 185
186const char *ceph_pool_op_name(int op);
187
188struct ceph_mon_poolop {
189 struct ceph_mon_request_header monhdr;
190 struct ceph_fsid fsid;
191 __le32 pool;
192 __le32 op;
193 __le64 auid;
194 __le64 snapid;
195 __le32 name_len;
196} __attribute__ ((packed));
197
198struct ceph_mon_poolop_reply {
199 struct ceph_mon_request_header monhdr;
200 struct ceph_fsid fsid;
201 __le32 reply_code;
202 __le32 epoch;
203 char has_data;
204 char data[0];
205} __attribute__ ((packed));
206
207struct ceph_mon_unmanaged_snap {
208 __le64 snapid;
209} __attribute__ ((packed));
210
158struct ceph_osd_getmap { 211struct ceph_osd_getmap {
159 struct ceph_mon_request_header monhdr; 212 struct ceph_mon_request_header monhdr;
160 struct ceph_fsid fsid; 213 struct ceph_fsid fsid;
@@ -308,6 +361,7 @@ union ceph_mds_request_args {
308 struct { 361 struct {
309 __le32 frag; /* which dir fragment */ 362 __le32 frag; /* which dir fragment */
310 __le32 max_entries; /* how many dentries to grab */ 363 __le32 max_entries; /* how many dentries to grab */
364 __le32 max_bytes;
311 } __attribute__ ((packed)) readdir; 365 } __attribute__ ((packed)) readdir;
312 struct { 366 struct {
313 __le32 mode; 367 __le32 mode;
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
index 8e4be6a80c62..7503aee828ce 100644
--- a/fs/ceph/ceph_strings.c
+++ b/fs/ceph/ceph_strings.c
@@ -10,7 +10,6 @@ const char *ceph_entity_type_name(int type)
10 case CEPH_ENTITY_TYPE_OSD: return "osd"; 10 case CEPH_ENTITY_TYPE_OSD: return "osd";
11 case CEPH_ENTITY_TYPE_MON: return "mon"; 11 case CEPH_ENTITY_TYPE_MON: return "mon";
12 case CEPH_ENTITY_TYPE_CLIENT: return "client"; 12 case CEPH_ENTITY_TYPE_CLIENT: return "client";
13 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
14 case CEPH_ENTITY_TYPE_AUTH: return "auth"; 13 case CEPH_ENTITY_TYPE_AUTH: return "auth";
15 default: return "unknown"; 14 default: return "unknown";
16 } 15 }
@@ -45,6 +44,7 @@ const char *ceph_osd_op_name(int op)
45 case CEPH_OSD_OP_SETXATTRS: return "setxattrs"; 44 case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
46 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs"; 45 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
47 case CEPH_OSD_OP_RMXATTR: return "rmxattr"; 46 case CEPH_OSD_OP_RMXATTR: return "rmxattr";
47 case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
48 48
49 case CEPH_OSD_OP_PULL: return "pull"; 49 case CEPH_OSD_OP_PULL: return "pull";
50 case CEPH_OSD_OP_PUSH: return "push"; 50 case CEPH_OSD_OP_PUSH: return "push";
@@ -174,3 +174,17 @@ const char *ceph_snap_op_name(int o)
174 } 174 }
175 return "???"; 175 return "???";
176} 176}
177
178const char *ceph_pool_op_name(int op)
179{
180 switch (op) {
181 case POOL_OP_CREATE: return "create";
182 case POOL_OP_DELETE: return "delete";
183 case POOL_OP_AUID_CHANGE: return "auid change";
184 case POOL_OP_CREATE_SNAP: return "create snap";
185 case POOL_OP_DELETE_SNAP: return "delete snap";
186 case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
187 case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
188 }
189 return "???";
190}
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index f7048da92acc..3be33fb066cc 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -113,7 +113,7 @@ static int osdmap_show(struct seq_file *s, void *p)
113static int monc_show(struct seq_file *s, void *p) 113static int monc_show(struct seq_file *s, void *p)
114{ 114{
115 struct ceph_client *client = s->private; 115 struct ceph_client *client = s->private;
116 struct ceph_mon_statfs_request *req; 116 struct ceph_mon_generic_request *req;
117 struct ceph_mon_client *monc = &client->monc; 117 struct ceph_mon_client *monc = &client->monc;
118 struct rb_node *rp; 118 struct rb_node *rp;
119 119
@@ -126,9 +126,14 @@ static int monc_show(struct seq_file *s, void *p)
126 if (monc->want_next_osdmap) 126 if (monc->want_next_osdmap)
127 seq_printf(s, "want next osdmap\n"); 127 seq_printf(s, "want next osdmap\n");
128 128
129 for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) { 129 for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
130 req = rb_entry(rp, struct ceph_mon_statfs_request, node); 130 __u16 op;
131 seq_printf(s, "%lld statfs\n", req->tid); 131 req = rb_entry(rp, struct ceph_mon_generic_request, node);
132 op = le16_to_cpu(req->request->hdr.type);
133 if (op == CEPH_MSG_STATFS)
134 seq_printf(s, "%lld statfs\n", req->tid);
135 else
136 seq_printf(s, "%lld unknown\n", req->tid);
132 } 137 }
133 138
134 mutex_unlock(&monc->mutex); 139 mutex_unlock(&monc->mutex);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 650d2db5ed26..4fd30900eff7 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -51,8 +51,11 @@ int ceph_init_dentry(struct dentry *dentry)
51 return -ENOMEM; /* oh well */ 51 return -ENOMEM; /* oh well */
52 52
53 spin_lock(&dentry->d_lock); 53 spin_lock(&dentry->d_lock);
54 if (dentry->d_fsdata) /* lost a race */ 54 if (dentry->d_fsdata) {
55 /* lost a race */
56 kmem_cache_free(ceph_dentry_cachep, di);
55 goto out_unlock; 57 goto out_unlock;
58 }
56 di->dentry = dentry; 59 di->dentry = dentry;
57 di->lease_session = NULL; 60 di->lease_session = NULL;
58 dentry->d_fsdata = di; 61 dentry->d_fsdata = di;
@@ -125,7 +128,8 @@ more:
125 dentry = list_entry(p, struct dentry, d_u.d_child); 128 dentry = list_entry(p, struct dentry, d_u.d_child);
126 di = ceph_dentry(dentry); 129 di = ceph_dentry(dentry);
127 while (1) { 130 while (1) {
128 dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next, 131 dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
132 d_unhashed(dentry) ? "!hashed" : "hashed",
129 parent->d_subdirs.prev, parent->d_subdirs.next); 133 parent->d_subdirs.prev, parent->d_subdirs.next);
130 if (p == &parent->d_subdirs) { 134 if (p == &parent->d_subdirs) {
131 fi->at_end = 1; 135 fi->at_end = 1;
@@ -229,6 +233,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
229 u32 ftype; 233 u32 ftype;
230 struct ceph_mds_reply_info_parsed *rinfo; 234 struct ceph_mds_reply_info_parsed *rinfo;
231 const int max_entries = client->mount_args->max_readdir; 235 const int max_entries = client->mount_args->max_readdir;
236 const int max_bytes = client->mount_args->max_readdir_bytes;
232 237
233 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); 238 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
234 if (fi->at_end) 239 if (fi->at_end)
@@ -312,6 +317,7 @@ more:
312 req->r_readdir_offset = fi->next_offset; 317 req->r_readdir_offset = fi->next_offset;
313 req->r_args.readdir.frag = cpu_to_le32(frag); 318 req->r_args.readdir.frag = cpu_to_le32(frag);
314 req->r_args.readdir.max_entries = cpu_to_le32(max_entries); 319 req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
320 req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
315 req->r_num_caps = max_entries + 1; 321 req->r_num_caps = max_entries + 1;
316 err = ceph_mdsc_do_request(mdsc, NULL, req); 322 err = ceph_mdsc_do_request(mdsc, NULL, req);
317 if (err < 0) { 323 if (err < 0) {
@@ -335,7 +341,7 @@ more:
335 if (req->r_reply_info.dir_end) { 341 if (req->r_reply_info.dir_end) {
336 kfree(fi->last_name); 342 kfree(fi->last_name);
337 fi->last_name = NULL; 343 fi->last_name = NULL;
338 fi->next_offset = 0; 344 fi->next_offset = 2;
339 } else { 345 } else {
340 rinfo = &req->r_reply_info; 346 rinfo = &req->r_reply_info;
341 err = note_last_dentry(fi, 347 err = note_last_dentry(fi,
@@ -478,7 +484,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
478struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, 484struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
479 struct dentry *dentry, int err) 485 struct dentry *dentry, int err)
480{ 486{
481 struct ceph_client *client = ceph_client(dentry->d_sb); 487 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
482 struct inode *parent = dentry->d_parent->d_inode; 488 struct inode *parent = dentry->d_parent->d_inode;
483 489
484 /* .snap dir? */ 490 /* .snap dir? */
@@ -568,7 +574,6 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
568 !is_root_ceph_dentry(dir, dentry) && 574 !is_root_ceph_dentry(dir, dentry) &&
569 (ci->i_ceph_flags & CEPH_I_COMPLETE) && 575 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
570 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { 576 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
571 di->offset = ci->i_max_offset++;
572 spin_unlock(&dir->i_lock); 577 spin_unlock(&dir->i_lock);
573 dout(" dir %p complete, -ENOENT\n", dir); 578 dout(" dir %p complete, -ENOENT\n", dir);
574 d_add(dentry, NULL); 579 d_add(dentry, NULL);
@@ -888,13 +893,22 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
888 893
889 /* ensure target dentry is invalidated, despite 894 /* ensure target dentry is invalidated, despite
890 rehashing bug in vfs_rename_dir */ 895 rehashing bug in vfs_rename_dir */
891 new_dentry->d_time = jiffies; 896 ceph_invalidate_dentry_lease(new_dentry);
892 ceph_dentry(new_dentry)->lease_shared_gen = 0;
893 } 897 }
894 ceph_mdsc_put_request(req); 898 ceph_mdsc_put_request(req);
895 return err; 899 return err;
896} 900}
897 901
902/*
903 * Ensure a dentry lease will no longer revalidate.
904 */
905void ceph_invalidate_dentry_lease(struct dentry *dentry)
906{
907 spin_lock(&dentry->d_lock);
908 dentry->d_time = jiffies;
909 ceph_dentry(dentry)->lease_shared_gen = 0;
910 spin_unlock(&dentry->d_lock);
911}
898 912
899/* 913/*
900 * Check if dentry lease is valid. If not, delete the lease. Try to 914 * Check if dentry lease is valid. If not, delete the lease. Try to
@@ -972,8 +986,9 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
972{ 986{
973 struct inode *dir = dentry->d_parent->d_inode; 987 struct inode *dir = dentry->d_parent->d_inode;
974 988
975 dout("d_revalidate %p '%.*s' inode %p\n", dentry, 989 dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
976 dentry->d_name.len, dentry->d_name.name, dentry->d_inode); 990 dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
991 ceph_dentry(dentry)->offset);
977 992
978 /* always trust cached snapped dentries, snapdir dentry */ 993 /* always trust cached snapped dentries, snapdir dentry */
979 if (ceph_snap(dir) != CEPH_NOSNAP) { 994 if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -1050,7 +1065,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1050 struct ceph_inode_info *ci = ceph_inode(inode); 1065 struct ceph_inode_info *ci = ceph_inode(inode);
1051 int left; 1066 int left;
1052 1067
1053 if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT)) 1068 if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
1054 return -EISDIR; 1069 return -EISDIR;
1055 1070
1056 if (!cf->dir_info) { 1071 if (!cf->dir_info) {
@@ -1152,7 +1167,7 @@ void ceph_dentry_lru_add(struct dentry *dn)
1152 dout("dentry_lru_add %p %p '%.*s'\n", di, dn, 1167 dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
1153 dn->d_name.len, dn->d_name.name); 1168 dn->d_name.len, dn->d_name.name);
1154 if (di) { 1169 if (di) {
1155 mdsc = &ceph_client(dn->d_sb)->mdsc; 1170 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
1156 spin_lock(&mdsc->dentry_lru_lock); 1171 spin_lock(&mdsc->dentry_lru_lock);
1157 list_add_tail(&di->lru, &mdsc->dentry_lru); 1172 list_add_tail(&di->lru, &mdsc->dentry_lru);
1158 mdsc->num_dentry++; 1173 mdsc->num_dentry++;
@@ -1165,10 +1180,10 @@ void ceph_dentry_lru_touch(struct dentry *dn)
1165 struct ceph_dentry_info *di = ceph_dentry(dn); 1180 struct ceph_dentry_info *di = ceph_dentry(dn);
1166 struct ceph_mds_client *mdsc; 1181 struct ceph_mds_client *mdsc;
1167 1182
1168 dout("dentry_lru_touch %p %p '%.*s'\n", di, dn, 1183 dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
1169 dn->d_name.len, dn->d_name.name); 1184 dn->d_name.len, dn->d_name.name, di->offset);
1170 if (di) { 1185 if (di) {
1171 mdsc = &ceph_client(dn->d_sb)->mdsc; 1186 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
1172 spin_lock(&mdsc->dentry_lru_lock); 1187 spin_lock(&mdsc->dentry_lru_lock);
1173 list_move_tail(&di->lru, &mdsc->dentry_lru); 1188 list_move_tail(&di->lru, &mdsc->dentry_lru);
1174 spin_unlock(&mdsc->dentry_lru_lock); 1189 spin_unlock(&mdsc->dentry_lru_lock);
@@ -1183,7 +1198,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
1183 dout("dentry_lru_del %p %p '%.*s'\n", di, dn, 1198 dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
1184 dn->d_name.len, dn->d_name.name); 1199 dn->d_name.len, dn->d_name.name);
1185 if (di) { 1200 if (di) {
1186 mdsc = &ceph_client(dn->d_sb)->mdsc; 1201 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
1187 spin_lock(&mdsc->dentry_lru_lock); 1202 spin_lock(&mdsc->dentry_lru_lock);
1188 list_del_init(&di->lru); 1203 list_del_init(&di->lru);
1189 mdsc->num_dentry--; 1204 mdsc->num_dentry--;
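
Two details in the dir.c hunks: the new ceph_invalidate_dentry_lease() performs the d_time/lease_shared_gen reset under d_lock rather than inline and unlocked, and a finished directory fragment now restarts fi->next_offset at 2 because readdir offsets 0 and 1 are reserved for "." and "..". A hedged sketch of the lease test such invalidation defeats (illustrative; the real check lives in dentry_lease_is_valid() and differs in detail):

    /* Sketch: a lease holds only while the shared gen still matches and
     * the time-based lease is unexpired; the new helper clears both. */
    static int lease_valid_sketch(struct dentry *dentry, u32 dir_gen,
                                  unsigned long ttl)
    {
            if (ceph_dentry(dentry)->lease_shared_gen != dir_gen)
                    return 0;       /* generation bumped: invalid */
            return time_before(jiffies, dentry->d_time + ttl);
    }
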
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 9d67572fb328..17447644d675 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -93,11 +93,11 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
93 return ERR_PTR(-ESTALE); 93 return ERR_PTR(-ESTALE);
94 94
95 dentry = d_obtain_alias(inode); 95 dentry = d_obtain_alias(inode);
96 if (!dentry) { 96 if (IS_ERR(dentry)) {
97 pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n", 97 pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
98 fh->ino, inode); 98 fh->ino, inode);
99 iput(inode); 99 iput(inode);
100 return ERR_PTR(-ENOMEM); 100 return dentry;
101 } 101 }
102 err = ceph_init_dentry(dentry); 102 err = ceph_init_dentry(dentry);
103 103
@@ -115,7 +115,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
115static struct dentry *__cfh_to_dentry(struct super_block *sb, 115static struct dentry *__cfh_to_dentry(struct super_block *sb,
116 struct ceph_nfs_confh *cfh) 116 struct ceph_nfs_confh *cfh)
117{ 117{
118 struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc; 118 struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc;
119 struct inode *inode; 119 struct inode *inode;
120 struct dentry *dentry; 120 struct dentry *dentry;
121 struct ceph_vino vino; 121 struct ceph_vino vino;
@@ -149,11 +149,11 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
149 } 149 }
150 150
151 dentry = d_obtain_alias(inode); 151 dentry = d_obtain_alias(inode);
152 if (!dentry) { 152 if (IS_ERR(dentry)) {
153 pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n", 153 pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
154 cfh->ino, inode); 154 cfh->ino, inode);
155 iput(inode); 155 iput(inode);
156 return ERR_PTR(-ENOMEM); 156 return dentry;
157 } 157 }
158 err = ceph_init_dentry(dentry); 158 err = ceph_init_dentry(dentry);
159 if (err < 0) { 159 if (err < 0) {
@@ -202,11 +202,11 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
202 return ERR_PTR(-ESTALE); 202 return ERR_PTR(-ESTALE);
203 203
204 dentry = d_obtain_alias(inode); 204 dentry = d_obtain_alias(inode);
205 if (!dentry) { 205 if (IS_ERR(dentry)) {
206 pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n", 206 pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
207 cfh->ino, inode); 207 cfh->ino, inode);
208 iput(inode); 208 iput(inode);
209 return ERR_PTR(-ENOMEM); 209 return dentry;
210 } 210 }
211 err = ceph_init_dentry(dentry); 211 err = ceph_init_dentry(dentry);
212 if (err < 0) { 212 if (err < 0) {
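
All three export.c fixes correct the same misunderstanding: d_obtain_alias() never returns NULL; it returns either a valid dentry or an ERR_PTR() value, so the old `if (!dentry)` branches were dead code and allocation errors escaped unhandled. The corrected pattern, as a fragment (the hunks above additionally iput() the inode reference they took before returning):

    /* Sketch: failure comes back encoded in the pointer itself. */
    dentry = d_obtain_alias(inode);
    if (IS_ERR(dentry))
            return dentry;  /* propagate the ERR_PTR unchanged */
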
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index ed6f19721d6e..6512b6701b9e 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -317,16 +317,16 @@ void ceph_release_page_vector(struct page **pages, int num_pages)
317/* 317/*
318 * allocate a vector new pages 318 * allocate a vector new pages
319 */ 319 */
320static struct page **alloc_page_vector(int num_pages) 320struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
321{ 321{
322 struct page **pages; 322 struct page **pages;
323 int i; 323 int i;
324 324
325 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); 325 pages = kmalloc(sizeof(*pages) * num_pages, flags);
326 if (!pages) 326 if (!pages)
327 return ERR_PTR(-ENOMEM); 327 return ERR_PTR(-ENOMEM);
328 for (i = 0; i < num_pages; i++) { 328 for (i = 0; i < num_pages; i++) {
329 pages[i] = alloc_page(GFP_NOFS); 329 pages[i] = __page_cache_alloc(flags);
330 if (pages[i] == NULL) { 330 if (pages[i] == NULL) {
331 ceph_release_page_vector(pages, i); 331 ceph_release_page_vector(pages, i);
332 return ERR_PTR(-ENOMEM); 332 return ERR_PTR(-ENOMEM);
@@ -540,7 +540,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
540 * in sequence. 540 * in sequence.
541 */ 541 */
542 } else { 542 } else {
543 pages = alloc_page_vector(num_pages); 543 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
544 } 544 }
545 if (IS_ERR(pages)) 545 if (IS_ERR(pages))
546 return PTR_ERR(pages); 546 return PTR_ERR(pages);
@@ -649,8 +649,8 @@ more:
649 do_sync, 649 do_sync,
650 ci->i_truncate_seq, ci->i_truncate_size, 650 ci->i_truncate_seq, ci->i_truncate_size,
651 &mtime, false, 2); 651 &mtime, false, 2);
652 if (IS_ERR(req)) 652 if (!req)
653 return PTR_ERR(req); 653 return -ENOMEM;
654 654
655 num_pages = calc_pages_for(pos, len); 655 num_pages = calc_pages_for(pos, len);
656 656
@@ -668,7 +668,7 @@ more:
668 truncate_inode_pages_range(inode->i_mapping, pos, 668 truncate_inode_pages_range(inode->i_mapping, pos,
669 (pos+len) | (PAGE_CACHE_SIZE-1)); 669 (pos+len) | (PAGE_CACHE_SIZE-1));
670 } else { 670 } else {
671 pages = alloc_page_vector(num_pages); 671 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
672 if (IS_ERR(pages)) { 672 if (IS_ERR(pages)) {
673 ret = PTR_ERR(pages); 673 ret = PTR_ERR(pages);
674 goto out; 674 goto out;
@@ -809,7 +809,7 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
809 struct file *file = iocb->ki_filp; 809 struct file *file = iocb->ki_filp;
810 struct inode *inode = file->f_dentry->d_inode; 810 struct inode *inode = file->f_dentry->d_inode;
811 struct ceph_inode_info *ci = ceph_inode(inode); 811 struct ceph_inode_info *ci = ceph_inode(inode);
812 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc; 812 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
813 loff_t endoff = pos + iov->iov_len; 813 loff_t endoff = pos + iov->iov_len;
814 int got = 0; 814 int got = 0;
815 int ret, err; 815 int ret, err;
@@ -844,8 +844,7 @@ retry_snap:
844 if ((ret >= 0 || ret == -EIOCBQUEUED) && 844 if ((ret >= 0 || ret == -EIOCBQUEUED) &&
845 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) 845 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
846 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { 846 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
847 err = vfs_fsync_range(file, file->f_path.dentry, 847 err = vfs_fsync_range(file, pos, pos + ret - 1, 1);
848 pos, pos + ret - 1, 1);
849 if (err < 0) 848 if (err < 0)
850 ret = err; 849 ret = err;
851 } 850 }
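
Two interface updates in file.c: the page-vector allocator is exported as ceph_alloc_page_vector() with an explicit gfp_t (using __page_cache_alloc() so allocation obeys the caller's flags), and vfs_fsync_range() loses its dentry argument, now taking just the file, the byte range, and the datasync flag. A fragment sketching the new fsync call, with pos and ret standing in for the write position and bytes written:

    /* Sketch: flush only the range just written, as in ceph_aio_write(). */
    err = vfs_fsync_range(file, pos, pos + ret - 1, 1 /* datasync */);
    if (err < 0)
            ret = err;
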
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 85b4d2ffdeba..a81b8b662c7b 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -384,7 +384,7 @@ void ceph_destroy_inode(struct inode *inode)
384 */ 384 */
385 if (ci->i_snap_realm) { 385 if (ci->i_snap_realm) {
386 struct ceph_mds_client *mdsc = 386 struct ceph_mds_client *mdsc =
387 &ceph_client(ci->vfs_inode.i_sb)->mdsc; 387 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
388 struct ceph_snap_realm *realm = ci->i_snap_realm; 388 struct ceph_snap_realm *realm = ci->i_snap_realm;
389 389
390 dout(" dropping residual ref to snap realm %p\n", realm); 390 dout(" dropping residual ref to snap realm %p\n", realm);
@@ -619,11 +619,12 @@ static int fill_inode(struct inode *inode,
619 memcpy(ci->i_xattrs.blob->vec.iov_base, 619 memcpy(ci->i_xattrs.blob->vec.iov_base,
620 iinfo->xattr_data, iinfo->xattr_len); 620 iinfo->xattr_data, iinfo->xattr_len);
621 ci->i_xattrs.version = le64_to_cpu(info->xattr_version); 621 ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
622 xattr_blob = NULL;
622 } 623 }
623 624
624 inode->i_mapping->a_ops = &ceph_aops; 625 inode->i_mapping->a_ops = &ceph_aops;
625 inode->i_mapping->backing_dev_info = 626 inode->i_mapping->backing_dev_info =
626 &ceph_client(inode->i_sb)->backing_dev_info; 627 &ceph_sb_to_client(inode->i_sb)->backing_dev_info;
627 628
628 switch (inode->i_mode & S_IFMT) { 629 switch (inode->i_mode & S_IFMT) {
629 case S_IFIFO: 630 case S_IFIFO:
@@ -674,14 +675,15 @@ static int fill_inode(struct inode *inode,
674 /* set dir completion flag? */ 675 /* set dir completion flag? */
675 if (ci->i_files == 0 && ci->i_subdirs == 0 && 676 if (ci->i_files == 0 && ci->i_subdirs == 0 &&
676 ceph_snap(inode) == CEPH_NOSNAP && 677 ceph_snap(inode) == CEPH_NOSNAP &&
677 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) { 678 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
679 (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
678 dout(" marking %p complete (empty)\n", inode); 680 dout(" marking %p complete (empty)\n", inode);
679 ci->i_ceph_flags |= CEPH_I_COMPLETE; 681 ci->i_ceph_flags |= CEPH_I_COMPLETE;
680 ci->i_max_offset = 2; 682 ci->i_max_offset = 2;
681 } 683 }
682 684
683 /* it may be better to set st_size in getattr instead? */ 685 /* it may be better to set st_size in getattr instead? */
684 if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES)) 686 if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
685 inode->i_size = ci->i_rbytes; 687 inode->i_size = ci->i_rbytes;
686 break; 688 break;
687 default: 689 default:
@@ -802,6 +804,37 @@ out_unlock:
802} 804}
803 805
804/* 806/*
807 * Set dentry's directory position based on the current dir's max, and
808 * order it in d_subdirs, so that dcache_readdir behaves.
809 */
810static void ceph_set_dentry_offset(struct dentry *dn)
811{
812 struct dentry *dir = dn->d_parent;
813 struct inode *inode = dn->d_parent->d_inode;
814 struct ceph_dentry_info *di;
815
816 BUG_ON(!inode);
817
818 di = ceph_dentry(dn);
819
820 spin_lock(&inode->i_lock);
821 if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
822 spin_unlock(&inode->i_lock);
823 return;
824 }
825 di->offset = ceph_inode(inode)->i_max_offset++;
826 spin_unlock(&inode->i_lock);
827
828 spin_lock(&dcache_lock);
829 spin_lock(&dn->d_lock);
830 list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
831 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
832 dn->d_u.d_child.prev, dn->d_u.d_child.next);
833 spin_unlock(&dn->d_lock);
834 spin_unlock(&dcache_lock);
835}
836
837/*
805 * splice a dentry to an inode. 838 * splice a dentry to an inode.
806 * caller must hold directory i_mutex for this to be safe. 839 * caller must hold directory i_mutex for this to be safe.
807 * 840 *
@@ -814,6 +847,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
814{ 847{
815 struct dentry *realdn; 848 struct dentry *realdn;
816 849
850 BUG_ON(dn->d_inode);
851
817 /* dn must be unhashed */ 852 /* dn must be unhashed */
818 if (!d_unhashed(dn)) 853 if (!d_unhashed(dn))
819 d_drop(dn); 854 d_drop(dn);
@@ -835,44 +870,17 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
835 dn = realdn; 870 dn = realdn;
836 } else { 871 } else {
837 BUG_ON(!ceph_dentry(dn)); 872 BUG_ON(!ceph_dentry(dn));
838
839 dout("dn %p attached to %p ino %llx.%llx\n", 873 dout("dn %p attached to %p ino %llx.%llx\n",
840 dn, dn->d_inode, ceph_vinop(dn->d_inode)); 874 dn, dn->d_inode, ceph_vinop(dn->d_inode));
841 } 875 }
842 if ((!prehash || *prehash) && d_unhashed(dn)) 876 if ((!prehash || *prehash) && d_unhashed(dn))
843 d_rehash(dn); 877 d_rehash(dn);
878 ceph_set_dentry_offset(dn);
844out: 879out:
845 return dn; 880 return dn;
846} 881}
847 882
848/* 883/*
849 * Set dentry's directory position based on the current dir's max, and
850 * order it in d_subdirs, so that dcache_readdir behaves.
851 */
852static void ceph_set_dentry_offset(struct dentry *dn)
853{
854 struct dentry *dir = dn->d_parent;
855 struct inode *inode = dn->d_parent->d_inode;
856 struct ceph_dentry_info *di;
857
858 BUG_ON(!inode);
859
860 di = ceph_dentry(dn);
861
862 spin_lock(&inode->i_lock);
863 di->offset = ceph_inode(inode)->i_max_offset++;
864 spin_unlock(&inode->i_lock);
865
866 spin_lock(&dcache_lock);
867 spin_lock(&dn->d_lock);
868 list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
869 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
870 dn->d_u.d_child.prev, dn->d_u.d_child.next);
871 spin_unlock(&dn->d_lock);
872 spin_unlock(&dcache_lock);
873}
874
875/*
876 * Incorporate results into the local cache. This is either just 884 * Incorporate results into the local cache. This is either just
877 * one inode, or a directory, dentry, and possibly linked-to inode (e.g., 885 * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
878 * after a lookup). 886 * after a lookup).
@@ -933,14 +941,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
933 941
934 if (!rinfo->head->is_target && !rinfo->head->is_dentry) { 942 if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
935 dout("fill_trace reply is empty!\n"); 943 dout("fill_trace reply is empty!\n");
936 if (rinfo->head->result == 0 && req->r_locked_dir) { 944 if (rinfo->head->result == 0 && req->r_locked_dir)
937 struct ceph_inode_info *ci = 945 ceph_invalidate_dir_request(req);
938 ceph_inode(req->r_locked_dir);
939 dout(" clearing %p complete (empty trace)\n",
940 req->r_locked_dir);
941 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
942 ci->i_release_count++;
943 }
944 return 0; 946 return 0;
945 } 947 }
946 948
@@ -1011,13 +1013,18 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1011 req->r_old_dentry->d_name.len, 1013 req->r_old_dentry->d_name.len,
1012 req->r_old_dentry->d_name.name, 1014 req->r_old_dentry->d_name.name,
1013 dn, dn->d_name.len, dn->d_name.name); 1015 dn, dn->d_name.len, dn->d_name.name);
1016
1014 /* ensure target dentry is invalidated, despite 1017 /* ensure target dentry is invalidated, despite
1015 rehashing bug in vfs_rename_dir */ 1018 rehashing bug in vfs_rename_dir */
1016 dn->d_time = jiffies; 1019 ceph_invalidate_dentry_lease(dn);
1017 ceph_dentry(dn)->lease_shared_gen = 0; 1020
1018 /* take overwritten dentry's readdir offset */ 1021 /* take overwritten dentry's readdir offset */
1022 dout("dn %p gets %p offset %lld (old offset %lld)\n",
1023 req->r_old_dentry, dn, ceph_dentry(dn)->offset,
1024 ceph_dentry(req->r_old_dentry)->offset);
1019 ceph_dentry(req->r_old_dentry)->offset = 1025 ceph_dentry(req->r_old_dentry)->offset =
1020 ceph_dentry(dn)->offset; 1026 ceph_dentry(dn)->offset;
1027
1021 dn = req->r_old_dentry; /* use old_dentry */ 1028 dn = req->r_old_dentry; /* use old_dentry */
1022 in = dn->d_inode; 1029 in = dn->d_inode;
1023 } 1030 }
@@ -1059,7 +1066,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1059 goto done; 1066 goto done;
1060 } 1067 }
1061 req->r_dentry = dn; /* may have spliced */ 1068 req->r_dentry = dn; /* may have spliced */
1062 ceph_set_dentry_offset(dn);
1063 igrab(in); 1069 igrab(in);
1064 } else if (ceph_ino(in) == vino.ino && 1070 } else if (ceph_ino(in) == vino.ino &&
1065 ceph_snap(in) == vino.snap) { 1071 ceph_snap(in) == vino.snap) {
@@ -1102,7 +1108,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1102 err = PTR_ERR(dn); 1108 err = PTR_ERR(dn);
1103 goto done; 1109 goto done;
1104 } 1110 }
1105 ceph_set_dentry_offset(dn);
1106 req->r_dentry = dn; /* may have spliced */ 1111 req->r_dentry = dn; /* may have spliced */
1107 igrab(in); 1112 igrab(in);
1108 rinfo->head->is_dentry = 1; /* fool notrace handlers */ 1113 rinfo->head->is_dentry = 1; /* fool notrace handlers */
@@ -1429,7 +1434,7 @@ void ceph_queue_vmtruncate(struct inode *inode)
1429{ 1434{
1430 struct ceph_inode_info *ci = ceph_inode(inode); 1435 struct ceph_inode_info *ci = ceph_inode(inode);
1431 1436
1432 if (queue_work(ceph_client(inode->i_sb)->trunc_wq, 1437 if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
1433 &ci->i_vmtruncate_work)) { 1438 &ci->i_vmtruncate_work)) {
1434 dout("ceph_queue_vmtruncate %p\n", inode); 1439 dout("ceph_queue_vmtruncate %p\n", inode);
1435 igrab(inode); 1440 igrab(inode);
@@ -1518,7 +1523,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1518 struct inode *parent_inode = dentry->d_parent->d_inode; 1523 struct inode *parent_inode = dentry->d_parent->d_inode;
1519 const unsigned int ia_valid = attr->ia_valid; 1524 const unsigned int ia_valid = attr->ia_valid;
1520 struct ceph_mds_request *req; 1525 struct ceph_mds_request *req;
1521 struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc; 1526 struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc;
1522 int issued; 1527 int issued;
1523 int release = 0, dirtied = 0; 1528 int release = 0, dirtied = 0;
1524 int mask = 0; 1529 int mask = 0;
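
One subtle inode.c fix is the single added line `xattr_blob = NULL;` after the xattr data is copied in: once the buffer has been consumed, clearing the local pointer keeps the function's shared exit path, which frees an unconsumed blob, from freeing it a second time. The general shape, as a hedged sketch (install_blob/free_blob are hypothetical stand-ins):

    /* Sketch: forget a local pointer the moment another structure owns
     * its buffer, so common cleanup cannot double-free it. */
    install_blob(ci, xattr_blob);   /* hypothetical consumer */
    xattr_blob = NULL;              /* ownership transferred */
    /* ... */
    /* shared exit path */
    if (xattr_blob)
            free_blob(xattr_blob);  /* hypothetical release; only if unconsumed */
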
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 8a5bcae62846..d085f07756b4 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -98,7 +98,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
98 struct ceph_ioctl_dataloc dl; 98 struct ceph_ioctl_dataloc dl;
99 struct inode *inode = file->f_dentry->d_inode; 99 struct inode *inode = file->f_dentry->d_inode;
100 struct ceph_inode_info *ci = ceph_inode(inode); 100 struct ceph_inode_info *ci = ceph_inode(inode);
101 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc; 101 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
102 u64 len = 1, olen; 102 u64 len = 1, olen;
103 u64 tmp; 103 u64 tmp;
104 struct ceph_object_layout ol; 104 struct ceph_object_layout ol;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 24561a557e01..885aa5710cfd 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -40,7 +40,7 @@
40static void __wake_requests(struct ceph_mds_client *mdsc, 40static void __wake_requests(struct ceph_mds_client *mdsc,
41 struct list_head *head); 41 struct list_head *head);
42 42
43const static struct ceph_connection_operations mds_con_ops; 43static const struct ceph_connection_operations mds_con_ops;
44 44
45 45
46/* 46/*
@@ -665,10 +665,10 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
665 struct ceph_msg *msg; 665 struct ceph_msg *msg;
666 struct ceph_mds_session_head *h; 666 struct ceph_mds_session_head *h;
667 667
668 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL); 668 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS);
669 if (IS_ERR(msg)) { 669 if (!msg) {
670 pr_err("create_session_msg ENOMEM creating msg\n"); 670 pr_err("create_session_msg ENOMEM creating msg\n");
671 return ERR_PTR(PTR_ERR(msg)); 671 return NULL;
672 } 672 }
673 h = msg->front.iov_base; 673 h = msg->front.iov_base;
674 h->op = cpu_to_le32(op); 674 h->op = cpu_to_le32(op);
@@ -687,7 +687,6 @@ static int __open_session(struct ceph_mds_client *mdsc,
687 struct ceph_msg *msg; 687 struct ceph_msg *msg;
688 int mstate; 688 int mstate;
689 int mds = session->s_mds; 689 int mds = session->s_mds;
690 int err = 0;
691 690
692 /* wait for mds to go active? */ 691 /* wait for mds to go active? */
693 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); 692 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
@@ -698,13 +697,9 @@ static int __open_session(struct ceph_mds_client *mdsc,
698 697
699 /* send connect message */ 698 /* send connect message */
700 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq); 699 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
701 if (IS_ERR(msg)) { 700 if (!msg)
702 err = PTR_ERR(msg); 701 return -ENOMEM;
703 goto out;
704 }
705 ceph_con_send(&session->s_con, msg); 702 ceph_con_send(&session->s_con, msg);
706
707out:
708 return 0; 703 return 0;
709} 704}
710 705
@@ -804,12 +799,49 @@ out:
804} 799}
805 800
806static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, 801static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
807 void *arg) 802 void *arg)
808{ 803{
808{ 803{
809 struct ceph_inode_info *ci = ceph_inode(inode); 804 struct ceph_inode_info *ci = ceph_inode(inode);
805 int drop = 0;
806
810 dout("removing cap %p, ci is %p, inode is %p\n", 807 dout("removing cap %p, ci is %p, inode is %p\n",
811 cap, ci, &ci->vfs_inode); 808 cap, ci, &ci->vfs_inode);
812 ceph_remove_cap(cap); 809 spin_lock(&inode->i_lock);
810 __ceph_remove_cap(cap);
811 if (!__ceph_is_any_real_caps(ci)) {
812 struct ceph_mds_client *mdsc =
813 &ceph_sb_to_client(inode->i_sb)->mdsc;
814
815 spin_lock(&mdsc->cap_dirty_lock);
816 if (!list_empty(&ci->i_dirty_item)) {
817 pr_info(" dropping dirty %s state for %p %lld\n",
818 ceph_cap_string(ci->i_dirty_caps),
819 inode, ceph_ino(inode));
820 ci->i_dirty_caps = 0;
821 list_del_init(&ci->i_dirty_item);
822 drop = 1;
823 }
824 if (!list_empty(&ci->i_flushing_item)) {
825 pr_info(" dropping dirty+flushing %s state for %p %lld\n",
826 ceph_cap_string(ci->i_flushing_caps),
827 inode, ceph_ino(inode));
828 ci->i_flushing_caps = 0;
829 list_del_init(&ci->i_flushing_item);
830 mdsc->num_cap_flushing--;
831 drop = 1;
832 }
833 if (drop && ci->i_wrbuffer_ref) {
834 pr_info(" dropping dirty data for %p %lld\n",
835 inode, ceph_ino(inode));
836 ci->i_wrbuffer_ref = 0;
837 ci->i_wrbuffer_ref_head = 0;
838 drop++;
839 }
840 spin_unlock(&mdsc->cap_dirty_lock);
841 }
842 spin_unlock(&inode->i_lock);
843 while (drop--)
844 iput(inode);
813 return 0; 845 return 0;
814} 846}
815 847
@@ -821,6 +853,7 @@ static void remove_session_caps(struct ceph_mds_session *session)
821 dout("remove_session_caps on %p\n", session); 853 dout("remove_session_caps on %p\n", session);
822 iterate_session_caps(session, remove_session_caps_cb, NULL); 854 iterate_session_caps(session, remove_session_caps_cb, NULL);
823 BUG_ON(session->s_nr_caps > 0); 855 BUG_ON(session->s_nr_caps > 0);
856 BUG_ON(!list_empty(&session->s_cap_flushing));
824 cleanup_cap_releases(session); 857 cleanup_cap_releases(session);
825} 858}
826 859
@@ -883,8 +916,8 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
883 ceph_mds_state_name(state)); 916 ceph_mds_state_name(state));
884 msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, 917 msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
885 ++session->s_renew_seq); 918 ++session->s_renew_seq);
886 if (IS_ERR(msg)) 919 if (!msg)
887 return PTR_ERR(msg); 920 return -ENOMEM;
888 ceph_con_send(&session->s_con, msg); 921 ceph_con_send(&session->s_con, msg);
889 return 0; 922 return 0;
890} 923}
@@ -931,17 +964,15 @@ static int request_close_session(struct ceph_mds_client *mdsc,
931 struct ceph_mds_session *session) 964 struct ceph_mds_session *session)
932{ 965{
933 struct ceph_msg *msg; 966 struct ceph_msg *msg;
934 int err = 0;
935 967
936 dout("request_close_session mds%d state %s seq %lld\n", 968 dout("request_close_session mds%d state %s seq %lld\n",
937 session->s_mds, session_state_name(session->s_state), 969 session->s_mds, session_state_name(session->s_state),
938 session->s_seq); 970 session->s_seq);
939 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq); 971 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
940 if (IS_ERR(msg)) 972 if (!msg)
941 err = PTR_ERR(msg); 973 return -ENOMEM;
942 else 974 ceph_con_send(&session->s_con, msg);
943 ceph_con_send(&session->s_con, msg); 975 return 0;
944 return err;
945} 976}
946 977
947/* 978/*
@@ -1059,7 +1090,7 @@ static int add_cap_releases(struct ceph_mds_client *mdsc,
1059 while (session->s_num_cap_releases < session->s_nr_caps + extra) { 1090 while (session->s_num_cap_releases < session->s_nr_caps + extra) {
1060 spin_unlock(&session->s_cap_lock); 1091 spin_unlock(&session->s_cap_lock);
1061 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, 1092 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
1062 0, 0, NULL); 1093 GFP_NOFS);
1063 if (!msg) 1094 if (!msg)
1064 goto out_unlocked; 1095 goto out_unlocked;
1065 dout("add_cap_releases %p msg %p now %d\n", session, msg, 1096 dout("add_cap_releases %p msg %p now %d\n", session, msg,
@@ -1151,10 +1182,8 @@ static void send_cap_releases(struct ceph_mds_client *mdsc,
1151 struct ceph_msg *msg; 1182 struct ceph_msg *msg;
1152 1183
1153 dout("send_cap_releases mds%d\n", session->s_mds); 1184 dout("send_cap_releases mds%d\n", session->s_mds);
1154 while (1) { 1185 spin_lock(&session->s_cap_lock);
1155 spin_lock(&session->s_cap_lock); 1186 while (!list_empty(&session->s_cap_releases_done)) {
1156 if (list_empty(&session->s_cap_releases_done))
1157 break;
1158 msg = list_first_entry(&session->s_cap_releases_done, 1187 msg = list_first_entry(&session->s_cap_releases_done,
1159 struct ceph_msg, list_head); 1188 struct ceph_msg, list_head);
1160 list_del_init(&msg->list_head); 1189 list_del_init(&msg->list_head);
@@ -1162,10 +1191,49 @@ static void send_cap_releases(struct ceph_mds_client *mdsc,
1162 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 1191 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1163 dout("send_cap_releases mds%d %p\n", session->s_mds, msg); 1192 dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
1164 ceph_con_send(&session->s_con, msg); 1193 ceph_con_send(&session->s_con, msg);
1194 spin_lock(&session->s_cap_lock);
1165 } 1195 }
1166 spin_unlock(&session->s_cap_lock); 1196 spin_unlock(&session->s_cap_lock);
1167} 1197}
1168 1198
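Editor's note: the rewritten send_cap_releases() holds s_cap_lock while testing and detaching a queued message, drops it only around the send, and re-takes it before looking at the list again. A pthreads sketch of that drain pattern (the list and send function are stand-ins):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next; int id; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *done_list;          /* protected by lock */

static void send_msg(struct node *n)    /* may sleep; lock not held */
{
        printf("sending release msg %d\n", n->id);
        free(n);
}

static void send_cap_releases(void)
{
        pthread_mutex_lock(&lock);
        while (done_list) {
                struct node *n = done_list;

                done_list = n->next;            /* detach under lock */
                pthread_mutex_unlock(&lock);
                send_msg(n);                    /* send without lock */
                pthread_mutex_lock(&lock);      /* re-take, re-test */
        }
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        for (int i = 0; i < 3; i++) {
                struct node *n = malloc(sizeof(*n));
                n->id = i;
                n->next = done_list;
                done_list = n;
        }
        send_cap_releases();
        return 0;
}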
1199static void discard_cap_releases(struct ceph_mds_client *mdsc,
1200 struct ceph_mds_session *session)
1201{
1202 struct ceph_msg *msg;
1203 struct ceph_mds_cap_release *head;
1204 unsigned num;
1205
1206 dout("discard_cap_releases mds%d\n", session->s_mds);
1207 spin_lock(&session->s_cap_lock);
1208
1209 /* zero out the in-progress message */
1210 msg = list_first_entry(&session->s_cap_releases,
1211 struct ceph_msg, list_head);
1212 head = msg->front.iov_base;
1213 num = le32_to_cpu(head->num);
1214 dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num);
1215 head->num = cpu_to_le32(0);
1216 session->s_num_cap_releases += num;
1217
1218 /* requeue completed messages */
1219 while (!list_empty(&session->s_cap_releases_done)) {
1220 msg = list_first_entry(&session->s_cap_releases_done,
1221 struct ceph_msg, list_head);
1222 list_del_init(&msg->list_head);
1223
1224 head = msg->front.iov_base;
1225 num = le32_to_cpu(head->num);
1226 dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg,
1227 num);
1228 session->s_num_cap_releases += num;
1229 head->num = cpu_to_le32(0);
1230 msg->front.iov_len = sizeof(*head);
1231 list_add(&msg->list_head, &session->s_cap_releases);
1232 }
1233
1234 spin_unlock(&session->s_cap_lock);
1235}
1236
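Editor's note: discard_cap_releases() zeroes the per-message entry count (head->num) and credits those entries back to s_num_cap_releases so the same buffers can be refilled after a reconnect. A rough user-space model of just the accounting ("slots" stands in for s_num_cap_releases, "pending" for each message's head->num):

#include <stdio.h>

static unsigned slots;                  /* cf. s_num_cap_releases */
static unsigned pending[2] = { 3, 5 };  /* cf. per-message head->num */

static void discard_cap_releases(void)
{
        for (int i = 0; i < 2; i++) {
                slots += pending[i];    /* credit entries back */
                pending[i] = 0;         /* zero the message count */
        }
}

int main(void)
{
        discard_cap_releases();
        printf("slots available for refill: %u\n", slots);  /* 8 */
        return 0;
}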
1169/* 1237/*
1170 * requests 1238 * requests
1171 */ 1239 */
@@ -1181,6 +1249,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1181 if (!req) 1249 if (!req)
1182 return ERR_PTR(-ENOMEM); 1250 return ERR_PTR(-ENOMEM);
1183 1251
1252 mutex_init(&req->r_fill_mutex);
1184 req->r_started = jiffies; 1253 req->r_started = jiffies;
1185 req->r_resend_mds = -1; 1254 req->r_resend_mds = -1;
1186 INIT_LIST_HEAD(&req->r_unsafe_dir_item); 1255 INIT_LIST_HEAD(&req->r_unsafe_dir_item);
@@ -1251,7 +1320,7 @@ retry:
1251 len += 1 + temp->d_name.len; 1320 len += 1 + temp->d_name.len;
1252 temp = temp->d_parent; 1321 temp = temp->d_parent;
1253 if (temp == NULL) { 1322 if (temp == NULL) {
1254 pr_err("build_path_dentry corrupt dentry %p\n", dentry); 1323 pr_err("build_path corrupt dentry %p\n", dentry);
1255 return ERR_PTR(-EINVAL); 1324 return ERR_PTR(-EINVAL);
1256 } 1325 }
1257 } 1326 }
@@ -1267,7 +1336,7 @@ retry:
1267 struct inode *inode = temp->d_inode; 1336 struct inode *inode = temp->d_inode;
1268 1337
1269 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { 1338 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
1270 dout("build_path_dentry path+%d: %p SNAPDIR\n", 1339 dout("build_path path+%d: %p SNAPDIR\n",
1271 pos, temp); 1340 pos, temp);
1272 } else if (stop_on_nosnap && inode && 1341 } else if (stop_on_nosnap && inode &&
1273 ceph_snap(inode) == CEPH_NOSNAP) { 1342 ceph_snap(inode) == CEPH_NOSNAP) {
@@ -1278,20 +1347,18 @@ retry:
1278 break; 1347 break;
1279 strncpy(path + pos, temp->d_name.name, 1348 strncpy(path + pos, temp->d_name.name,
1280 temp->d_name.len); 1349 temp->d_name.len);
1281 dout("build_path_dentry path+%d: %p '%.*s'\n",
1282 pos, temp, temp->d_name.len, path + pos);
1283 } 1350 }
1284 if (pos) 1351 if (pos)
1285 path[--pos] = '/'; 1352 path[--pos] = '/';
1286 temp = temp->d_parent; 1353 temp = temp->d_parent;
1287 if (temp == NULL) { 1354 if (temp == NULL) {
1288 pr_err("build_path_dentry corrupt dentry\n"); 1355 pr_err("build_path corrupt dentry\n");
1289 kfree(path); 1356 kfree(path);
1290 return ERR_PTR(-EINVAL); 1357 return ERR_PTR(-EINVAL);
1291 } 1358 }
1292 } 1359 }
1293 if (pos != 0) { 1360 if (pos != 0) {
1294 pr_err("build_path_dentry did not end path lookup where " 1361 pr_err("build_path did not end path lookup where "
1295 "expected, namelen is %d, pos is %d\n", len, pos); 1362 "expected, namelen is %d, pos is %d\n", len, pos);
1296 /* presumably this is only possible if racing with a 1363 /* presumably this is only possible if racing with a
1297 rename of one of the parent directories (we can not 1364 rename of one of the parent directories (we can not
@@ -1303,7 +1370,7 @@ retry:
1303 1370
1304 *base = ceph_ino(temp->d_inode); 1371 *base = ceph_ino(temp->d_inode);
1305 *plen = len; 1372 *plen = len;
1306 dout("build_path_dentry on %p %d built %llx '%.*s'\n", 1373 dout("build_path on %p %d built %llx '%.*s'\n",
1307 dentry, atomic_read(&dentry->d_count), *base, len, path); 1374 dentry, atomic_read(&dentry->d_count), *base, len, path);
1308 return path; 1375 return path;
1309} 1376}
@@ -1426,9 +1493,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1426 if (req->r_old_dentry_drop) 1493 if (req->r_old_dentry_drop)
1427 len += req->r_old_dentry->d_name.len; 1494 len += req->r_old_dentry->d_name.len;
1428 1495
1429 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL); 1496 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS);
1430 if (IS_ERR(msg)) 1497 if (!msg) {
1498 msg = ERR_PTR(-ENOMEM);
1431 goto out_free2; 1499 goto out_free2;
1500 }
1432 1501
1433 msg->hdr.tid = cpu_to_le64(req->r_tid); 1502 msg->hdr.tid = cpu_to_le64(req->r_tid);
1434 1503
@@ -1517,9 +1586,9 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
1517 } 1586 }
1518 msg = create_request_message(mdsc, req, mds); 1587 msg = create_request_message(mdsc, req, mds);
1519 if (IS_ERR(msg)) { 1588 if (IS_ERR(msg)) {
1520 req->r_reply = ERR_PTR(PTR_ERR(msg)); 1589 req->r_err = PTR_ERR(msg);
1521 complete_request(mdsc, req); 1590 complete_request(mdsc, req);
1522 return -PTR_ERR(msg); 1591 return PTR_ERR(msg);
1523 } 1592 }
1524 req->r_request = msg; 1593 req->r_request = msg;
1525 1594
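Editor's note: the one-character fix above is easy to miss — PTR_ERR() already yields a negative errno, so the old `return -PTR_ERR(msg)` flipped it into a positive value that callers would not recognize as an error. A standalone demonstration (ERR_PTR/PTR_ERR re-implemented here for illustration):

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095
static inline void *ERR_PTR(long err) { return (void *)err; }
static inline long PTR_ERR(const void *p) { return (long)p; }
static inline int IS_ERR(const void *p)
{
        return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

int main(void)
{
        void *msg = ERR_PTR(-ENOMEM);

        if (IS_ERR(msg)) {
                printf("PTR_ERR(msg)  = %ld  (correct: already negative)\n",
                       PTR_ERR(msg));
                printf("-PTR_ERR(msg) = %ld  (old bug: positive 'error')\n",
                       -PTR_ERR(msg));
        }
        return 0;
}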
@@ -1552,7 +1621,7 @@ static int __do_request(struct ceph_mds_client *mdsc,
1552 int mds = -1; 1621 int mds = -1;
1553 int err = -EAGAIN; 1622 int err = -EAGAIN;
1554 1623
1555 if (req->r_reply) 1624 if (req->r_err || req->r_got_result)
1556 goto out; 1625 goto out;
1557 1626
1558 if (req->r_timeout && 1627 if (req->r_timeout &&
@@ -1609,7 +1678,7 @@ out:
1609 return err; 1678 return err;
1610 1679
1611finish: 1680finish:
1612 req->r_reply = ERR_PTR(err); 1681 req->r_err = err;
1613 complete_request(mdsc, req); 1682 complete_request(mdsc, req);
1614 goto out; 1683 goto out;
1615} 1684}
@@ -1630,10 +1699,9 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
1630 1699
1631/* 1700/*
1632 * Wake up threads with requests pending for @mds, so that they can 1701 * Wake up threads with requests pending for @mds, so that they can
1633 * resubmit their requests to a possibly different mds. If @all is set, 1702 * resubmit their requests to a possibly different mds.
1634 * wake up if their requests has been forwarded to @mds, too.
1635 */ 1703 */
1636static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all) 1704static void kick_requests(struct ceph_mds_client *mdsc, int mds)
1637{ 1705{
1638 struct ceph_mds_request *req; 1706 struct ceph_mds_request *req;
1639 struct rb_node *p; 1707 struct rb_node *p;
@@ -1689,64 +1757,78 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
1689 __register_request(mdsc, req, dir); 1757 __register_request(mdsc, req, dir);
1690 __do_request(mdsc, req); 1758 __do_request(mdsc, req);
1691 1759
1692 /* wait */ 1760 if (req->r_err) {
1693 if (!req->r_reply) { 1761 err = req->r_err;
1694 mutex_unlock(&mdsc->mutex); 1762 __unregister_request(mdsc, req);
1695 if (req->r_timeout) { 1763 dout("do_request early error %d\n", err);
1696 err = (long)wait_for_completion_interruptible_timeout( 1764 goto out;
1697 &req->r_completion, req->r_timeout);
1698 if (err == 0)
1699 req->r_reply = ERR_PTR(-EIO);
1700 else if (err < 0)
1701 req->r_reply = ERR_PTR(err);
1702 } else {
1703 err = wait_for_completion_interruptible(
1704 &req->r_completion);
1705 if (err)
1706 req->r_reply = ERR_PTR(err);
1707 }
1708 mutex_lock(&mdsc->mutex);
1709 } 1765 }
1710 1766
1711 if (IS_ERR(req->r_reply)) { 1767 /* wait */
1712 err = PTR_ERR(req->r_reply); 1768 mutex_unlock(&mdsc->mutex);
1713 req->r_reply = NULL; 1769 dout("do_request waiting\n");
1770 if (req->r_timeout) {
1771 err = (long)wait_for_completion_interruptible_timeout(
1772 &req->r_completion, req->r_timeout);
1773 if (err == 0)
1774 err = -EIO;
1775 } else {
1776 err = wait_for_completion_interruptible(&req->r_completion);
1777 }
1778 dout("do_request waited, got %d\n", err);
1779 mutex_lock(&mdsc->mutex);
1714 1780
1715 if (err == -ERESTARTSYS) { 1781 /* only abort if we didn't race with a real reply */
1716 /* aborted */ 1782 if (req->r_got_result) {
1717 req->r_aborted = true; 1783 err = le32_to_cpu(req->r_reply_info.head->result);
1784 } else if (err < 0) {
1785 dout("aborted request %lld with %d\n", req->r_tid, err);
1718 1786
1719 if (req->r_locked_dir && 1787 /*
1720 (req->r_op & CEPH_MDS_OP_WRITE)) { 1788 * ensure we aren't running concurrently with
1721 struct ceph_inode_info *ci = 1789 * ceph_fill_trace or ceph_readdir_prepopulate, which
1722 ceph_inode(req->r_locked_dir); 1790 * rely on locks (dir mutex) held by our caller.
1791 */
1792 mutex_lock(&req->r_fill_mutex);
1793 req->r_err = err;
1794 req->r_aborted = true;
1795 mutex_unlock(&req->r_fill_mutex);
1723 1796
1724 dout("aborted, clearing I_COMPLETE on %p\n", 1797 if (req->r_locked_dir &&
1725 req->r_locked_dir); 1798 (req->r_op & CEPH_MDS_OP_WRITE))
1726 spin_lock(&req->r_locked_dir->i_lock); 1799 ceph_invalidate_dir_request(req);
1727 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1728 ci->i_release_count++;
1729 spin_unlock(&req->r_locked_dir->i_lock);
1730 }
1731 } else {
1732 /* clean up this request */
1733 __unregister_request(mdsc, req);
1734 if (!list_empty(&req->r_unsafe_item))
1735 list_del_init(&req->r_unsafe_item);
1736 complete(&req->r_safe_completion);
1737 }
1738 } else if (req->r_err) {
1739 err = req->r_err;
1740 } else { 1800 } else {
1741 err = le32_to_cpu(req->r_reply_info.head->result); 1801 err = req->r_err;
1742 } 1802 }
1743 mutex_unlock(&mdsc->mutex);
1744 1803
1804out:
1805 mutex_unlock(&mdsc->mutex);
1745 dout("do_request %p done, result %d\n", req, err); 1806 dout("do_request %p done, result %d\n", req, err);
1746 return err; 1807 return err;
1747} 1808}
1748 1809
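Editor's note: the rewritten wait path only marks a request aborted if no real reply beat it, and it does so under r_fill_mutex so the abort cannot interleave with ceph_fill_trace. A simplified pthreads model of that decision (it collapses mdsc->mutex and r_fill_mutex into one lock; field names mirror the patch but this is not the kernel code):

#include <pthread.h>
#include <stdio.h>

struct request {
        pthread_mutex_t fill_mutex;
        int err, aborted, got_result, result;
};

static void waiter_interrupted(struct request *req)
{
        pthread_mutex_lock(&req->fill_mutex);
        if (!req->got_result) {         /* didn't race with a reply */
                req->err = -4;          /* -EINTR, say */
                req->aborted = 1;
        }
        pthread_mutex_unlock(&req->fill_mutex);
}

static void handle_reply(struct request *req, int result)
{
        pthread_mutex_lock(&req->fill_mutex);
        if (!req->aborted) {            /* too late if already aborted */
                req->result = result;
                req->got_result = 1;
        }
        pthread_mutex_unlock(&req->fill_mutex);
}

int main(void)
{
        struct request req = { PTHREAD_MUTEX_INITIALIZER, 0, 0, 0, 0 };

        handle_reply(&req, 0);
        waiter_interrupted(&req);       /* no-op: result already in */
        printf("aborted=%d got_result=%d\n", req.aborted, req.got_result);
        return 0;
}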
1749/* 1810/*
1811 * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS
1812 * namespace request.
1813 */
1814void ceph_invalidate_dir_request(struct ceph_mds_request *req)
1815{
1816 struct inode *inode = req->r_locked_dir;
1817 struct ceph_inode_info *ci = ceph_inode(inode);
1818
1819 dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode);
1820 spin_lock(&inode->i_lock);
1821 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1822 ci->i_release_count++;
1823 spin_unlock(&inode->i_lock);
1824
1825 if (req->r_dentry)
1826 ceph_invalidate_dentry_lease(req->r_dentry);
1827 if (req->r_old_dentry)
1828 ceph_invalidate_dentry_lease(req->r_old_dentry);
1829}
1830
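Editor's note: clearing CEPH_I_COMPLETE alone would not be enough, because a readdir already in flight could set it again; bumping i_release_count acts as a generation counter that lets the late writer notice its snapshot is stale. A user-space sketch of the trick (simplified model, not the kernel code):

#include <stdio.h>

static int complete;
static unsigned release_count;          /* cf. ci->i_release_count */

static void invalidate(void)
{
        complete = 0;
        release_count++;
}

static void readdir_finish(unsigned count_at_start)
{
        /* only mark complete if nothing was invalidated meanwhile */
        if (count_at_start == release_count)
                complete = 1;
}

int main(void)
{
        unsigned snap = release_count;  /* readdir starts */

        invalidate();                   /* aborted request races in */
        readdir_finish(snap);           /* refuses stale completion */
        printf("complete=%d\n", complete);      /* 0 */
        return 0;
}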
1831/*
1750 * Handle mds reply. 1832 * Handle mds reply.
1751 * 1833 *
1752 * We take the session mutex and parse and process the reply immediately. 1834 * We take the session mutex and parse and process the reply immediately.
@@ -1797,6 +1879,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1797 mutex_unlock(&mdsc->mutex); 1879 mutex_unlock(&mdsc->mutex);
1798 goto out; 1880 goto out;
1799 } 1881 }
1882 if (req->r_got_safe && !head->safe) {
1883 pr_warning("got unsafe after safe on %llu from mds%d\n",
1884 tid, mds);
1885 mutex_unlock(&mdsc->mutex);
1886 goto out;
1887 }
1800 1888
1801 result = le32_to_cpu(head->result); 1889 result = le32_to_cpu(head->result);
1802 1890
@@ -1838,11 +1926,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1838 mutex_unlock(&mdsc->mutex); 1926 mutex_unlock(&mdsc->mutex);
1839 goto out; 1927 goto out;
1840 } 1928 }
1841 } 1929 } else {
1842
1843 BUG_ON(req->r_reply);
1844
1845 if (!head->safe) {
1846 req->r_got_unsafe = true; 1930 req->r_got_unsafe = true;
1847 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); 1931 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
1848 } 1932 }
@@ -1871,21 +1955,30 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1871 } 1955 }
1872 1956
1873 /* insert trace into our cache */ 1957 /* insert trace into our cache */
1958 mutex_lock(&req->r_fill_mutex);
1874 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session); 1959 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
1875 if (err == 0) { 1960 if (err == 0) {
1876 if (result == 0 && rinfo->dir_nr) 1961 if (result == 0 && rinfo->dir_nr)
1877 ceph_readdir_prepopulate(req, req->r_session); 1962 ceph_readdir_prepopulate(req, req->r_session);
1878 ceph_unreserve_caps(&req->r_caps_reservation); 1963 ceph_unreserve_caps(&req->r_caps_reservation);
1879 } 1964 }
1965 mutex_unlock(&req->r_fill_mutex);
1880 1966
1881 up_read(&mdsc->snap_rwsem); 1967 up_read(&mdsc->snap_rwsem);
1882out_err: 1968out_err:
1883 if (err) { 1969 mutex_lock(&mdsc->mutex);
1884 req->r_err = err; 1970 if (!req->r_aborted) {
1971 if (err) {
1972 req->r_err = err;
1973 } else {
1974 req->r_reply = msg;
1975 ceph_msg_get(msg);
1976 req->r_got_result = true;
1977 }
1885 } else { 1978 } else {
1886 req->r_reply = msg; 1979 dout("reply arrived after request %lld was aborted\n", tid);
1887 ceph_msg_get(msg);
1888 } 1980 }
1981 mutex_unlock(&mdsc->mutex);
1889 1982
1890 add_cap_releases(mdsc, req->r_session, -1); 1983 add_cap_releases(mdsc, req->r_session, -1);
1891 mutex_unlock(&session->s_mutex); 1984 mutex_unlock(&session->s_mutex);
@@ -1984,6 +2077,8 @@ static void handle_session(struct ceph_mds_session *session,
1984 2077
1985 switch (op) { 2078 switch (op) {
1986 case CEPH_SESSION_OPEN: 2079 case CEPH_SESSION_OPEN:
2080 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
2081 pr_info("mds%d reconnect success\n", session->s_mds);
1987 session->s_state = CEPH_MDS_SESSION_OPEN; 2082 session->s_state = CEPH_MDS_SESSION_OPEN;
1988 renewed_caps(mdsc, session, 0); 2083 renewed_caps(mdsc, session, 0);
1989 wake = 1; 2084 wake = 1;
@@ -1997,10 +2092,12 @@ static void handle_session(struct ceph_mds_session *session,
1997 break; 2092 break;
1998 2093
1999 case CEPH_SESSION_CLOSE: 2094 case CEPH_SESSION_CLOSE:
2095 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
2096 pr_info("mds%d reconnect denied\n", session->s_mds);
2000 remove_session_caps(session); 2097 remove_session_caps(session);
2001 wake = 1; /* for good measure */ 2098 wake = 1; /* for good measure */
2002 complete(&mdsc->session_close_waiters); 2099 complete(&mdsc->session_close_waiters);
2003 kick_requests(mdsc, mds, 0); /* cur only */ 2100 kick_requests(mdsc, mds);
2004 break; 2101 break;
2005 2102
2006 case CEPH_SESSION_STALE: 2103 case CEPH_SESSION_STALE:
@@ -2132,54 +2229,44 @@ out:
2132 * 2229 *
2133 * called with mdsc->mutex held. 2230 * called with mdsc->mutex held.
2134 */ 2231 */
2135static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds) 2232static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2233 struct ceph_mds_session *session)
2136{ 2234{
2137 struct ceph_mds_session *session = NULL;
2138 struct ceph_msg *reply; 2235 struct ceph_msg *reply;
2139 struct rb_node *p; 2236 struct rb_node *p;
2237 int mds = session->s_mds;
2140 int err = -ENOMEM; 2238 int err = -ENOMEM;
2141 struct ceph_pagelist *pagelist; 2239 struct ceph_pagelist *pagelist;
2142 2240
2143 pr_info("reconnect to recovering mds%d\n", mds); 2241 pr_info("mds%d reconnect start\n", mds);
2144 2242
2145 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); 2243 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
2146 if (!pagelist) 2244 if (!pagelist)
2147 goto fail_nopagelist; 2245 goto fail_nopagelist;
2148 ceph_pagelist_init(pagelist); 2246 ceph_pagelist_init(pagelist);
2149 2247
2150 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL); 2248 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS);
2151 if (IS_ERR(reply)) { 2249 if (!reply)
2152 err = PTR_ERR(reply);
2153 goto fail_nomsg; 2250 goto fail_nomsg;
2154 }
2155
2156 /* find session */
2157 session = __ceph_lookup_mds_session(mdsc, mds);
2158 mutex_unlock(&mdsc->mutex); /* drop lock for duration */
2159 2251
2160 if (session) { 2252 mutex_lock(&session->s_mutex);
2161 mutex_lock(&session->s_mutex); 2253 session->s_state = CEPH_MDS_SESSION_RECONNECTING;
2254 session->s_seq = 0;
2162 2255
2163 session->s_state = CEPH_MDS_SESSION_RECONNECTING; 2256 ceph_con_open(&session->s_con,
2164 session->s_seq = 0; 2257 ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
2165 2258
2166 ceph_con_open(&session->s_con, 2259 /* replay unsafe requests */
2167 ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); 2260 replay_unsafe_requests(mdsc, session);
2168
2169 /* replay unsafe requests */
2170 replay_unsafe_requests(mdsc, session);
2171 } else {
2172 dout("no session for mds%d, will send short reconnect\n",
2173 mds);
2174 }
2175 2261
2176 down_read(&mdsc->snap_rwsem); 2262 down_read(&mdsc->snap_rwsem);
2177 2263
2178 if (!session)
2179 goto send;
2180 dout("session %p state %s\n", session, 2264 dout("session %p state %s\n", session,
2181 session_state_name(session->s_state)); 2265 session_state_name(session->s_state));
2182 2266
2267 /* drop old cap expires; we're about to reestablish that state */
2268 discard_cap_releases(mdsc, session);
2269
2183 /* traverse this session's caps */ 2270 /* traverse this session's caps */
2184 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps); 2271 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
2185 if (err) 2272 if (err)
@@ -2208,36 +2295,29 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
2208 goto fail; 2295 goto fail;
2209 } 2296 }
2210 2297
2211send:
2212 reply->pagelist = pagelist; 2298 reply->pagelist = pagelist;
2213 reply->hdr.data_len = cpu_to_le32(pagelist->length); 2299 reply->hdr.data_len = cpu_to_le32(pagelist->length);
2214 reply->nr_pages = calc_pages_for(0, pagelist->length); 2300 reply->nr_pages = calc_pages_for(0, pagelist->length);
2215 ceph_con_send(&session->s_con, reply); 2301 ceph_con_send(&session->s_con, reply);
2216 2302
2217 session->s_state = CEPH_MDS_SESSION_OPEN;
2218 mutex_unlock(&session->s_mutex); 2303 mutex_unlock(&session->s_mutex);
2219 2304
2220 mutex_lock(&mdsc->mutex); 2305 mutex_lock(&mdsc->mutex);
2221 __wake_requests(mdsc, &session->s_waiting); 2306 __wake_requests(mdsc, &session->s_waiting);
2222 mutex_unlock(&mdsc->mutex); 2307 mutex_unlock(&mdsc->mutex);
2223 2308
2224 ceph_put_mds_session(session);
2225
2226 up_read(&mdsc->snap_rwsem); 2309 up_read(&mdsc->snap_rwsem);
2227 mutex_lock(&mdsc->mutex);
2228 return; 2310 return;
2229 2311
2230fail: 2312fail:
2231 ceph_msg_put(reply); 2313 ceph_msg_put(reply);
2232 up_read(&mdsc->snap_rwsem); 2314 up_read(&mdsc->snap_rwsem);
2233 mutex_unlock(&session->s_mutex); 2315 mutex_unlock(&session->s_mutex);
2234 ceph_put_mds_session(session);
2235fail_nomsg: 2316fail_nomsg:
2236 ceph_pagelist_release(pagelist); 2317 ceph_pagelist_release(pagelist);
2237 kfree(pagelist); 2318 kfree(pagelist);
2238fail_nopagelist: 2319fail_nopagelist:
2239 pr_err("error %d preparing reconnect for mds%d\n", err, mds); 2320 pr_err("error %d preparing reconnect for mds%d\n", err, mds);
2240 mutex_lock(&mdsc->mutex);
2241 return; 2321 return;
2242} 2322}
2243 2323
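Editor's note: send_mds_reconnect() now takes the session from its caller instead of looking it up, and the caller (see the check_new_map hunk below) drops mdsc->mutex around the call rather than having the callee release and re-take it behind the caller's back. A pthreads sketch of that lock hand-off shape (not the kernel code):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t map_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t session_mutex = PTHREAD_MUTEX_INITIALIZER;

static void send_reconnect(void)        /* called without map_mutex */
{
        pthread_mutex_lock(&session_mutex);
        printf("reconnect sent\n");
        pthread_mutex_unlock(&session_mutex);
}

static void check_new_map(void)
{
        pthread_mutex_lock(&map_mutex);
        /* ... scan sessions ... */
        pthread_mutex_unlock(&map_mutex);       /* drop for the call */
        send_reconnect();
        pthread_mutex_lock(&map_mutex);         /* re-take, continue */
        /* ... */
        pthread_mutex_unlock(&map_mutex);
}

int main(void)
{
        check_new_map();
        return 0;
}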
@@ -2290,7 +2370,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
2290 } 2370 }
2291 2371
2292 /* kick any requests waiting on the recovering mds */ 2372 /* kick any requests waiting on the recovering mds */
2293 kick_requests(mdsc, i, 1); 2373 kick_requests(mdsc, i);
2294 } else if (oldstate == newstate) { 2374 } else if (oldstate == newstate) {
2295 continue; /* nothing new with this mds */ 2375 continue; /* nothing new with this mds */
2296 } 2376 }
@@ -2299,22 +2379,21 @@ static void check_new_map(struct ceph_mds_client *mdsc,
2299 * send reconnect? 2379 * send reconnect?
2300 */ 2380 */
2301 if (s->s_state == CEPH_MDS_SESSION_RESTARTING && 2381 if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
2302 newstate >= CEPH_MDS_STATE_RECONNECT) 2382 newstate >= CEPH_MDS_STATE_RECONNECT) {
2303 send_mds_reconnect(mdsc, i); 2383 mutex_unlock(&mdsc->mutex);
2384 send_mds_reconnect(mdsc, s);
2385 mutex_lock(&mdsc->mutex);
2386 }
2304 2387
2305 /* 2388 /*
2306 * kick requests on any mds that has gone active. 2389 * kick request on any mds that has gone active.
2307 *
2308 * kick requests on cur or forwarder: we may have sent
2309 * the request to mds1, mds1 told us it forwarded it
2310 * to mds2, but then we learn mds1 failed and can't be
2311 * sure it successfully forwarded our request before
2312 * it died.
2313 */ 2390 */
2314 if (oldstate < CEPH_MDS_STATE_ACTIVE && 2391 if (oldstate < CEPH_MDS_STATE_ACTIVE &&
2315 newstate >= CEPH_MDS_STATE_ACTIVE) { 2392 newstate >= CEPH_MDS_STATE_ACTIVE) {
2316 pr_info("mds%d reconnect completed\n", s->s_mds); 2393 if (oldstate != CEPH_MDS_STATE_CREATING &&
2317 kick_requests(mdsc, i, 1); 2394 oldstate != CEPH_MDS_STATE_STARTING)
2395 pr_info("mds%d recovery completed\n", s->s_mds);
2396 kick_requests(mdsc, i);
2318 ceph_kick_flushing_caps(mdsc, s); 2397 ceph_kick_flushing_caps(mdsc, s);
2319 wake_up_session_caps(s, 1); 2398 wake_up_session_caps(s, 1);
2320 } 2399 }
@@ -2457,8 +2536,8 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
2457 dnamelen = dentry->d_name.len; 2536 dnamelen = dentry->d_name.len;
2458 len += dnamelen; 2537 len += dnamelen;
2459 2538
2460 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL); 2539 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS);
2461 if (IS_ERR(msg)) 2540 if (!msg)
2462 return; 2541 return;
2463 lease = msg->front.iov_base; 2542 lease = msg->front.iov_base;
2464 lease->action = action; 2543 lease->action = action;
@@ -2603,7 +2682,9 @@ static void delayed_work(struct work_struct *work)
2603 else 2682 else
2604 ceph_con_keepalive(&s->s_con); 2683 ceph_con_keepalive(&s->s_con);
2605 add_cap_releases(mdsc, s, -1); 2684 add_cap_releases(mdsc, s, -1);
2606 send_cap_releases(mdsc, s); 2685 if (s->s_state == CEPH_MDS_SESSION_OPEN ||
2686 s->s_state == CEPH_MDS_SESSION_HUNG)
2687 send_cap_releases(mdsc, s);
2607 mutex_unlock(&s->s_mutex); 2688 mutex_unlock(&s->s_mutex);
2608 ceph_put_mds_session(s); 2689 ceph_put_mds_session(s);
2609 2690
@@ -2620,6 +2701,9 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2620 mdsc->client = client; 2701 mdsc->client = client;
2621 mutex_init(&mdsc->mutex); 2702 mutex_init(&mdsc->mutex);
2622 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); 2703 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
2704 if (mdsc->mdsmap == NULL)
2705 return -ENOMEM;
2706
2623 init_completion(&mdsc->safe_umount_waiters); 2707 init_completion(&mdsc->safe_umount_waiters);
2624 init_completion(&mdsc->session_close_waiters); 2708 init_completion(&mdsc->session_close_waiters);
2625 INIT_LIST_HEAD(&mdsc->waiting_for_map); 2709 INIT_LIST_HEAD(&mdsc->waiting_for_map);
@@ -2645,6 +2729,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2645 init_waitqueue_head(&mdsc->cap_flushing_wq); 2729 init_waitqueue_head(&mdsc->cap_flushing_wq);
2646 spin_lock_init(&mdsc->dentry_lru_lock); 2730 spin_lock_init(&mdsc->dentry_lru_lock);
2647 INIT_LIST_HEAD(&mdsc->dentry_lru); 2731 INIT_LIST_HEAD(&mdsc->dentry_lru);
2732
2648 return 0; 2733 return 0;
2649} 2734}
2650 2735
@@ -2740,6 +2825,9 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
2740{ 2825{
2741 u64 want_tid, want_flush; 2826 u64 want_tid, want_flush;
2742 2827
2828 if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
2829 return;
2830
2743 dout("sync\n"); 2831 dout("sync\n");
2744 mutex_lock(&mdsc->mutex); 2832 mutex_lock(&mdsc->mutex);
2745 want_tid = mdsc->last_tid; 2833 want_tid = mdsc->last_tid;
@@ -2922,9 +3010,10 @@ static void con_put(struct ceph_connection *con)
2922static void peer_reset(struct ceph_connection *con) 3010static void peer_reset(struct ceph_connection *con)
2923{ 3011{
2924 struct ceph_mds_session *s = con->private; 3012 struct ceph_mds_session *s = con->private;
3013 struct ceph_mds_client *mdsc = s->s_mdsc;
2925 3014
2926 pr_err("mds%d gave us the boot. IMPLEMENT RECONNECT.\n", 3015 pr_warning("mds%d closed our session\n", s->s_mds);
2927 s->s_mds); 3016 send_mds_reconnect(mdsc, s);
2928} 3017}
2929 3018
2930static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) 3019static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
@@ -3031,7 +3120,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
3031 return ceph_monc_validate_auth(&mdsc->client->monc); 3120 return ceph_monc_validate_auth(&mdsc->client->monc);
3032} 3121}
3033 3122
3034const static struct ceph_connection_operations mds_con_ops = { 3123static const struct ceph_connection_operations mds_con_ops = {
3035 .get = con_get, 3124 .get = con_get,
3036 .put = con_put, 3125 .put = con_put,
3037 .dispatch = dispatch, 3126 .dispatch = dispatch,
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 961cc6f65878..d9936c4f1212 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -165,6 +165,8 @@ struct ceph_mds_request {
165 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */ 165 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
166 struct inode *r_target_inode; /* resulting inode */ 166 struct inode *r_target_inode; /* resulting inode */
167 167
168 struct mutex r_fill_mutex;
169
168 union ceph_mds_request_args r_args; 170 union ceph_mds_request_args r_args;
169 int r_fmode; /* file mode, if expecting cap */ 171 int r_fmode; /* file mode, if expecting cap */
170 172
@@ -213,7 +215,7 @@ struct ceph_mds_request {
213 struct completion r_safe_completion; 215 struct completion r_safe_completion;
214 ceph_mds_request_callback_t r_callback; 216 ceph_mds_request_callback_t r_callback;
215 struct list_head r_unsafe_item; /* per-session unsafe list item */ 217 struct list_head r_unsafe_item; /* per-session unsafe list item */
216 bool r_got_unsafe, r_got_safe; 218 bool r_got_unsafe, r_got_safe, r_got_result;
217 219
218 bool r_did_prepopulate; 220 bool r_did_prepopulate;
219 u32 r_readdir_offset; 221 u32 r_readdir_offset;
@@ -301,6 +303,8 @@ extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
301 struct inode *inode, 303 struct inode *inode,
302 struct dentry *dn, int mask); 304 struct dentry *dn, int mask);
303 305
306extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
307
304extern struct ceph_mds_request * 308extern struct ceph_mds_request *
305ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); 309ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
306extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, 310extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index cd4fadb6491a..60b74839ebec 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -39,18 +39,6 @@ static void queue_con(struct ceph_connection *con);
39static void con_work(struct work_struct *); 39static void con_work(struct work_struct *);
40static void ceph_fault(struct ceph_connection *con); 40static void ceph_fault(struct ceph_connection *con);
41 41
42const char *ceph_name_type_str(int t)
43{
44 switch (t) {
45 case CEPH_ENTITY_TYPE_MON: return "mon";
46 case CEPH_ENTITY_TYPE_MDS: return "mds";
47 case CEPH_ENTITY_TYPE_OSD: return "osd";
48 case CEPH_ENTITY_TYPE_CLIENT: return "client";
49 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
50 default: return "???";
51 }
52}
53
54/* 42/*
55 * nicely render a sockaddr as a string. 43 * nicely render a sockaddr as a string.
56 */ 44 */
@@ -340,6 +328,7 @@ static void reset_connection(struct ceph_connection *con)
340 ceph_msg_put(con->out_msg); 328 ceph_msg_put(con->out_msg);
341 con->out_msg = NULL; 329 con->out_msg = NULL;
342 } 330 }
331 con->out_keepalive_pending = false;
343 con->in_seq = 0; 332 con->in_seq = 0;
344 con->in_seq_acked = 0; 333 con->in_seq_acked = 0;
345} 334}
@@ -357,6 +346,7 @@ void ceph_con_close(struct ceph_connection *con)
357 clear_bit(WRITE_PENDING, &con->state); 346 clear_bit(WRITE_PENDING, &con->state);
358 mutex_lock(&con->mutex); 347 mutex_lock(&con->mutex);
359 reset_connection(con); 348 reset_connection(con);
349 con->peer_global_seq = 0;
360 cancel_delayed_work(&con->work); 350 cancel_delayed_work(&con->work);
361 mutex_unlock(&con->mutex); 351 mutex_unlock(&con->mutex);
362 queue_con(con); 352 queue_con(con);
@@ -661,7 +651,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
661 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, 651 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
662 con->connect_seq, global_seq, proto); 652 con->connect_seq, global_seq, proto);
663 653
664 con->out_connect.features = CEPH_FEATURE_SUPPORTED; 654 con->out_connect.features = CEPH_FEATURE_SUPPORTED_CLIENT;
665 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); 655 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
666 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); 656 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
667 con->out_connect.global_seq = cpu_to_le32(global_seq); 657 con->out_connect.global_seq = cpu_to_le32(global_seq);
@@ -1124,8 +1114,8 @@ static void fail_protocol(struct ceph_connection *con)
1124 1114
1125static int process_connect(struct ceph_connection *con) 1115static int process_connect(struct ceph_connection *con)
1126{ 1116{
1127 u64 sup_feat = CEPH_FEATURE_SUPPORTED; 1117 u64 sup_feat = CEPH_FEATURE_SUPPORTED_CLIENT;
1128 u64 req_feat = CEPH_FEATURE_REQUIRED; 1118 u64 req_feat = CEPH_FEATURE_REQUIRED_CLIENT;
1129 u64 server_feat = le64_to_cpu(con->in_reply.features); 1119 u64 server_feat = le64_to_cpu(con->in_reply.features);
1130 1120
1131 dout("process_connect on %p tag %d\n", con, (int)con->in_tag); 1121 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
@@ -1233,6 +1223,7 @@ static int process_connect(struct ceph_connection *con)
1233 clear_bit(CONNECTING, &con->state); 1223 clear_bit(CONNECTING, &con->state);
1234 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); 1224 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1235 con->connect_seq++; 1225 con->connect_seq++;
1226 con->peer_features = server_feat;
1236 dout("process_connect got READY gseq %d cseq %d (%d)\n", 1227 dout("process_connect got READY gseq %d cseq %d (%d)\n",
1237 con->peer_global_seq, 1228 con->peer_global_seq,
1238 le32_to_cpu(con->in_reply.connect_seq), 1229 le32_to_cpu(con->in_reply.connect_seq),
@@ -1402,19 +1393,17 @@ static int read_partial_message(struct ceph_connection *con)
1402 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip); 1393 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
1403 if (skip) { 1394 if (skip) {
1404 /* skip this message */ 1395 /* skip this message */
1405 dout("alloc_msg returned NULL, skipping message\n"); 1396 dout("alloc_msg said skip message\n");
1406 con->in_base_pos = -front_len - middle_len - data_len - 1397 con->in_base_pos = -front_len - middle_len - data_len -
1407 sizeof(m->footer); 1398 sizeof(m->footer);
1408 con->in_tag = CEPH_MSGR_TAG_READY; 1399 con->in_tag = CEPH_MSGR_TAG_READY;
1409 con->in_seq++; 1400 con->in_seq++;
1410 return 0; 1401 return 0;
1411 } 1402 }
1412 if (IS_ERR(con->in_msg)) { 1403 if (!con->in_msg) {
1413 ret = PTR_ERR(con->in_msg);
1414 con->in_msg = NULL;
1415 con->error_msg = 1404 con->error_msg =
1416 "error allocating memory for incoming message"; 1405 "error allocating memory for incoming message";
1417 return ret; 1406 return -ENOMEM;
1418 } 1407 }
1419 m = con->in_msg; 1408 m = con->in_msg;
1420 m->front.iov_len = 0; /* haven't read it yet */ 1409 m->front.iov_len = 0; /* haven't read it yet */
@@ -1514,14 +1503,14 @@ static void process_message(struct ceph_connection *con)
1514 1503
1515 /* if first message, set peer_name */ 1504 /* if first message, set peer_name */
1516 if (con->peer_name.type == 0) 1505 if (con->peer_name.type == 0)
1517 con->peer_name = msg->hdr.src.name; 1506 con->peer_name = msg->hdr.src;
1518 1507
1519 con->in_seq++; 1508 con->in_seq++;
1520 mutex_unlock(&con->mutex); 1509 mutex_unlock(&con->mutex);
1521 1510
1522 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n", 1511 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
1523 msg, le64_to_cpu(msg->hdr.seq), 1512 msg, le64_to_cpu(msg->hdr.seq),
1524 ENTITY_NAME(msg->hdr.src.name), 1513 ENTITY_NAME(msg->hdr.src),
1525 le16_to_cpu(msg->hdr.type), 1514 le16_to_cpu(msg->hdr.type),
1526 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), 1515 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1527 le32_to_cpu(msg->hdr.front_len), 1516 le32_to_cpu(msg->hdr.front_len),
@@ -1546,7 +1535,6 @@ static int try_write(struct ceph_connection *con)
1546 dout("try_write start %p state %lu nref %d\n", con, con->state, 1535 dout("try_write start %p state %lu nref %d\n", con, con->state,
1547 atomic_read(&con->nref)); 1536 atomic_read(&con->nref));
1548 1537
1549 mutex_lock(&con->mutex);
1550more: 1538more:
1551 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); 1539 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
1552 1540
@@ -1639,7 +1627,6 @@ do_next:
1639done: 1627done:
1640 ret = 0; 1628 ret = 0;
1641out: 1629out:
1642 mutex_unlock(&con->mutex);
1643 dout("try_write done on %p\n", con); 1630 dout("try_write done on %p\n", con);
1644 return ret; 1631 return ret;
1645} 1632}
@@ -1651,7 +1638,6 @@ out:
1651 */ 1638 */
1652static int try_read(struct ceph_connection *con) 1639static int try_read(struct ceph_connection *con)
1653{ 1640{
1654 struct ceph_messenger *msgr;
1655 int ret = -1; 1641 int ret = -1;
1656 1642
1657 if (!con->sock) 1643 if (!con->sock)
@@ -1661,9 +1647,6 @@ static int try_read(struct ceph_connection *con)
1661 return 0; 1647 return 0;
1662 1648
1663 dout("try_read start on %p\n", con); 1649 dout("try_read start on %p\n", con);
1664 msgr = con->msgr;
1665
1666 mutex_lock(&con->mutex);
1667 1650
1668more: 1651more:
1669 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, 1652 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
@@ -1758,7 +1741,6 @@ more:
1758done: 1741done:
1759 ret = 0; 1742 ret = 0;
1760out: 1743out:
1761 mutex_unlock(&con->mutex);
1762 dout("try_read done on %p\n", con); 1744 dout("try_read done on %p\n", con);
1763 return ret; 1745 return ret;
1764 1746
@@ -1830,6 +1812,8 @@ more:
1830 dout("con_work %p start, clearing QUEUED\n", con); 1812 dout("con_work %p start, clearing QUEUED\n", con);
1831 clear_bit(QUEUED, &con->state); 1813 clear_bit(QUEUED, &con->state);
1832 1814
1815 mutex_lock(&con->mutex);
1816
1833 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ 1817 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
1834 dout("con_work CLOSED\n"); 1818 dout("con_work CLOSED\n");
1835 con_close_socket(con); 1819 con_close_socket(con);
@@ -1844,11 +1828,16 @@ more:
1844 if (test_and_clear_bit(SOCK_CLOSED, &con->state) || 1828 if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
1845 try_read(con) < 0 || 1829 try_read(con) < 0 ||
1846 try_write(con) < 0) { 1830 try_write(con) < 0) {
1831 mutex_unlock(&con->mutex);
1847 backoff = 1; 1832 backoff = 1;
1848 ceph_fault(con); /* error/fault path */ 1833 ceph_fault(con); /* error/fault path */
1834 goto done_unlocked;
1849 } 1835 }
1850 1836
1851done: 1837done:
1838 mutex_unlock(&con->mutex);
1839
1840done_unlocked:
1852 clear_bit(BUSY, &con->state); 1841 clear_bit(BUSY, &con->state);
1853 dout("con->state=%lu\n", con->state); 1842 dout("con->state=%lu\n", con->state);
1854 if (test_bit(QUEUED, &con->state)) { 1843 if (test_bit(QUEUED, &con->state)) {
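Editor's note: with the mutex_lock/mutex_unlock pairs removed from try_read() and try_write() above, con_work() now owns con->mutex for the whole pass and the helpers assume it is held; the fault path runs only after the lock is dropped (the new done_unlocked label). A sketch of the resulting locking shape (pthreads stand-ins, not the kernel code):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t con_mutex = PTHREAD_MUTEX_INITIALIZER;

static int try_io(void)                 /* caller holds con_mutex */
{
        return 0;                       /* <0 would signal a fault */
}

static void fault(void)                 /* runs without con_mutex */
{
        printf("fault path\n");
}

static void con_work(void)
{
        pthread_mutex_lock(&con_mutex);
        if (try_io() < 0) {
                pthread_mutex_unlock(&con_mutex);
                fault();
                return;                 /* "done_unlocked" */
        }
        pthread_mutex_unlock(&con_mutex);       /* "done" */
}

int main(void)
{
        con_work();
        return 0;
}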
@@ -1947,7 +1936,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
1947 1936
1948 /* the zero page is needed if a request is "canceled" while the message 1937 /* the zero page is needed if a request is "canceled" while the message
1949 * is being written over the socket */ 1938 * is being written over the socket */
1950 msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO); 1939 msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
1951 if (!msgr->zero_page) { 1940 if (!msgr->zero_page) {
1952 kfree(msgr); 1941 kfree(msgr);
1953 return ERR_PTR(-ENOMEM); 1942 return ERR_PTR(-ENOMEM);
@@ -1987,9 +1976,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
1987 } 1976 }
1988 1977
1989 /* set src+dst */ 1978 /* set src+dst */
1990 msg->hdr.src.name = con->msgr->inst.name; 1979 msg->hdr.src = con->msgr->inst.name;
1991 msg->hdr.src.addr = con->msgr->my_enc_addr;
1992 msg->hdr.orig_src = msg->hdr.src;
1993 1980
1994 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); 1981 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
1995 1982
@@ -2083,12 +2070,11 @@ void ceph_con_keepalive(struct ceph_connection *con)
2083 * construct a new message with given type, size 2070 * construct a new message with given type, size
2084 * the new msg has a ref count of 1. 2071 * the new msg has a ref count of 1.
2085 */ 2072 */
2086struct ceph_msg *ceph_msg_new(int type, int front_len, 2073struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
2087 int page_len, int page_off, struct page **pages)
2088{ 2074{
2089 struct ceph_msg *m; 2075 struct ceph_msg *m;
2090 2076
2091 m = kmalloc(sizeof(*m), GFP_NOFS); 2077 m = kmalloc(sizeof(*m), flags);
2092 if (m == NULL) 2078 if (m == NULL)
2093 goto out; 2079 goto out;
2094 kref_init(&m->kref); 2080 kref_init(&m->kref);
@@ -2100,8 +2086,8 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
2100 m->hdr.version = 0; 2086 m->hdr.version = 0;
2101 m->hdr.front_len = cpu_to_le32(front_len); 2087 m->hdr.front_len = cpu_to_le32(front_len);
2102 m->hdr.middle_len = 0; 2088 m->hdr.middle_len = 0;
2103 m->hdr.data_len = cpu_to_le32(page_len); 2089 m->hdr.data_len = 0;
2104 m->hdr.data_off = cpu_to_le16(page_off); 2090 m->hdr.data_off = 0;
2105 m->hdr.reserved = 0; 2091 m->hdr.reserved = 0;
2106 m->footer.front_crc = 0; 2092 m->footer.front_crc = 0;
2107 m->footer.middle_crc = 0; 2093 m->footer.middle_crc = 0;
@@ -2115,11 +2101,11 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
2115 /* front */ 2101 /* front */
2116 if (front_len) { 2102 if (front_len) {
2117 if (front_len > PAGE_CACHE_SIZE) { 2103 if (front_len > PAGE_CACHE_SIZE) {
2118 m->front.iov_base = __vmalloc(front_len, GFP_NOFS, 2104 m->front.iov_base = __vmalloc(front_len, flags,
2119 PAGE_KERNEL); 2105 PAGE_KERNEL);
2120 m->front_is_vmalloc = true; 2106 m->front_is_vmalloc = true;
2121 } else { 2107 } else {
2122 m->front.iov_base = kmalloc(front_len, GFP_NOFS); 2108 m->front.iov_base = kmalloc(front_len, flags);
2123 } 2109 }
2124 if (m->front.iov_base == NULL) { 2110 if (m->front.iov_base == NULL) {
2125 pr_err("msg_new can't allocate %d bytes\n", 2111 pr_err("msg_new can't allocate %d bytes\n",
@@ -2135,19 +2121,18 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
2135 m->middle = NULL; 2121 m->middle = NULL;
2136 2122
2137 /* data */ 2123 /* data */
2138 m->nr_pages = calc_pages_for(page_off, page_len); 2124 m->nr_pages = 0;
2139 m->pages = pages; 2125 m->pages = NULL;
2140 m->pagelist = NULL; 2126 m->pagelist = NULL;
2141 2127
2142 dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len, 2128 dout("ceph_msg_new %p front %d\n", m, front_len);
2143 m->nr_pages);
2144 return m; 2129 return m;
2145 2130
2146out2: 2131out2:
2147 ceph_msg_put(m); 2132 ceph_msg_put(m);
2148out: 2133out:
2149 pr_err("msg_new can't create type %d len %d\n", type, front_len); 2134 pr_err("msg_new can't create type %d front %d\n", type, front_len);
2150 return ERR_PTR(-ENOMEM); 2135 return NULL;
2151} 2136}
2152 2137
2153/* 2138/*
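Editor's note: the constructor loses its page parameters — data pages are now attached by the caller after allocation — and gains a gfp_t that is plumbed through to every allocation instead of hard-coding GFP_NOFS. A user-space sketch of the new shape (malloc stands in for kmalloc/vmalloc; the flags argument is illustrative):

#include <stdio.h>
#include <stdlib.h>

struct msg { int type; void *front; size_t front_len; };

static struct msg *msg_new(int type, size_t front_len, int flags)
{
        struct msg *m = malloc(sizeof(*m));     /* kmalloc(..., flags) */

        if (!m)
                return NULL;
        m->type = type;
        m->front_len = front_len;
        m->front = malloc(front_len);   /* kernel: vmalloc if > page */
        if (!m->front) {
                free(m);
                return NULL;
        }
        (void)flags;    /* cf. GFP_NOFS vs GFP_KERNEL call sites */
        return m;
}

int main(void)
{
        struct msg *m = msg_new(1, 8192, 0);

        if (m) {
                printf("front %zu bytes, no page params\n", m->front_len);
                free(m->front);
                free(m);
        }
        return 0;
}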
@@ -2190,29 +2175,25 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
2190 mutex_unlock(&con->mutex); 2175 mutex_unlock(&con->mutex);
2191 msg = con->ops->alloc_msg(con, hdr, skip); 2176 msg = con->ops->alloc_msg(con, hdr, skip);
2192 mutex_lock(&con->mutex); 2177 mutex_lock(&con->mutex);
2193 if (IS_ERR(msg)) 2178 if (!msg || *skip)
2194 return msg;
2195
2196 if (*skip)
2197 return NULL; 2179 return NULL;
2198 } 2180 }
2199 if (!msg) { 2181 if (!msg) {
2200 *skip = 0; 2182 *skip = 0;
2201 msg = ceph_msg_new(type, front_len, 0, 0, NULL); 2183 msg = ceph_msg_new(type, front_len, GFP_NOFS);
2202 if (!msg) { 2184 if (!msg) {
2203 pr_err("unable to allocate msg type %d len %d\n", 2185 pr_err("unable to allocate msg type %d len %d\n",
2204 type, front_len); 2186 type, front_len);
2205 return ERR_PTR(-ENOMEM); 2187 return NULL;
2206 } 2188 }
2207 } 2189 }
2208 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); 2190 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2209 2191
2210 if (middle_len) { 2192 if (middle_len && !msg->middle) {
2211 ret = ceph_alloc_middle(con, msg); 2193 ret = ceph_alloc_middle(con, msg);
2212
2213 if (ret < 0) { 2194 if (ret < 0) {
2214 ceph_msg_put(msg); 2195 ceph_msg_put(msg);
2215 return msg; 2196 return NULL;
2216 } 2197 }
2217 } 2198 }
2218 2199
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index a5caf91cc971..00a9430b1ffc 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -49,10 +49,8 @@ struct ceph_connection_operations {
49 int *skip); 49 int *skip);
50}; 50};
51 51
52extern const char *ceph_name_type_str(int t);
53
54/* use format string %s%d */ 52/* use format string %s%d */
55#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num) 53#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
56 54
57struct ceph_messenger { 55struct ceph_messenger {
58 struct ceph_entity_inst inst; /* my name+address */ 56 struct ceph_entity_inst inst; /* my name+address */
@@ -144,6 +142,7 @@ struct ceph_connection {
144 struct ceph_entity_addr peer_addr; /* peer address */ 142 struct ceph_entity_addr peer_addr; /* peer address */
145 struct ceph_entity_name peer_name; /* peer name */ 143 struct ceph_entity_name peer_name; /* peer name */
146 struct ceph_entity_addr peer_addr_for_me; 144 struct ceph_entity_addr peer_addr_for_me;
145 unsigned peer_features;
147 u32 connect_seq; /* identify the most recent connection 146 u32 connect_seq; /* identify the most recent connection
148 attempt for this connection, client */ 147 attempt for this connection, client */
149 u32 peer_global_seq; /* peer's global seq for this connection */ 148 u32 peer_global_seq; /* peer's global seq for this connection */
@@ -158,7 +157,6 @@ struct ceph_connection {
158 struct list_head out_queue; 157 struct list_head out_queue;
159 struct list_head out_sent; /* sending or sent but unacked */ 158 struct list_head out_sent; /* sending or sent but unacked */
160 u64 out_seq; /* last message queued for send */ 159 u64 out_seq; /* last message queued for send */
161 u64 out_seq_sent; /* last message sent */
162 bool out_keepalive_pending; 160 bool out_keepalive_pending;
163 161
164 u64 in_seq, in_seq_acked; /* last message received, acked */ 162 u64 in_seq, in_seq_acked; /* last message received, acked */
@@ -234,9 +232,7 @@ extern void ceph_con_keepalive(struct ceph_connection *con);
234extern struct ceph_connection *ceph_con_get(struct ceph_connection *con); 232extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
235extern void ceph_con_put(struct ceph_connection *con); 233extern void ceph_con_put(struct ceph_connection *con);
236 234
237extern struct ceph_msg *ceph_msg_new(int type, int front_len, 235extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags);
238 int page_len, int page_off,
239 struct page **pages);
240extern void ceph_msg_kfree(struct ceph_msg *m); 236extern void ceph_msg_kfree(struct ceph_msg *m);
241 237
242 238
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 8fdc011ca956..f6510a476e7e 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -28,7 +28,7 @@
28 * resend any outstanding requests. 28 * resend any outstanding requests.
29 */ 29 */
30 30
31const static struct ceph_connection_operations mon_con_ops; 31static const struct ceph_connection_operations mon_con_ops;
32 32
33static int __validate_auth(struct ceph_mon_client *monc); 33static int __validate_auth(struct ceph_mon_client *monc);
34 34
@@ -104,6 +104,7 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
104 monc->pending_auth = 1; 104 monc->pending_auth = 1;
105 monc->m_auth->front.iov_len = len; 105 monc->m_auth->front.iov_len = len;
106 monc->m_auth->hdr.front_len = cpu_to_le32(len); 106 monc->m_auth->hdr.front_len = cpu_to_le32(len);
107 ceph_con_revoke(monc->con, monc->m_auth);
107 ceph_msg_get(monc->m_auth); /* keep our ref */ 108 ceph_msg_get(monc->m_auth); /* keep our ref */
108 ceph_con_send(monc->con, monc->m_auth); 109 ceph_con_send(monc->con, monc->m_auth);
109} 110}
@@ -187,16 +188,12 @@ static void __send_subscribe(struct ceph_mon_client *monc)
187 monc->want_next_osdmap); 188 monc->want_next_osdmap);
188 if ((__sub_expired(monc) && !monc->sub_sent) || 189 if ((__sub_expired(monc) && !monc->sub_sent) ||
189 monc->want_next_osdmap == 1) { 190 monc->want_next_osdmap == 1) {
190 struct ceph_msg *msg; 191 struct ceph_msg *msg = monc->m_subscribe;
191 struct ceph_mon_subscribe_item *i; 192 struct ceph_mon_subscribe_item *i;
192 void *p, *end; 193 void *p, *end;
193 194
194 msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL);
195 if (!msg)
196 return;
197
198 p = msg->front.iov_base; 195 p = msg->front.iov_base;
199 end = p + msg->front.iov_len; 196 end = p + msg->front_max;
200 197
201 dout("__send_subscribe to 'mdsmap' %u+\n", 198 dout("__send_subscribe to 'mdsmap' %u+\n",
202 (unsigned)monc->have_mdsmap); 199 (unsigned)monc->have_mdsmap);
@@ -226,7 +223,8 @@ static void __send_subscribe(struct ceph_mon_client *monc)
226 223
227 msg->front.iov_len = p - msg->front.iov_base; 224 msg->front.iov_len = p - msg->front.iov_base;
228 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 225 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
229 ceph_con_send(monc->con, msg); 226 ceph_con_revoke(monc->con, msg);
227 ceph_con_send(monc->con, ceph_msg_get(msg));
230 228
231 monc->sub_sent = jiffies | 1; /* never 0 */ 229 monc->sub_sent = jiffies | 1; /* never 0 */
232 } 230 }
@@ -353,14 +351,14 @@ out:
353/* 351/*
354 * statfs 352 * statfs
355 */ 353 */
356static struct ceph_mon_statfs_request *__lookup_statfs( 354static struct ceph_mon_generic_request *__lookup_generic_req(
357 struct ceph_mon_client *monc, u64 tid) 355 struct ceph_mon_client *monc, u64 tid)
358{ 356{
359 struct ceph_mon_statfs_request *req; 357 struct ceph_mon_generic_request *req;
360 struct rb_node *n = monc->statfs_request_tree.rb_node; 358 struct rb_node *n = monc->generic_request_tree.rb_node;
361 359
362 while (n) { 360 while (n) {
363 req = rb_entry(n, struct ceph_mon_statfs_request, node); 361 req = rb_entry(n, struct ceph_mon_generic_request, node);
364 if (tid < req->tid) 362 if (tid < req->tid)
365 n = n->rb_left; 363 n = n->rb_left;
366 else if (tid > req->tid) 364 else if (tid > req->tid)
@@ -371,16 +369,16 @@ static struct ceph_mon_statfs_request *__lookup_statfs(
371 return NULL; 369 return NULL;
372} 370}
373 371
374static void __insert_statfs(struct ceph_mon_client *monc, 372static void __insert_generic_request(struct ceph_mon_client *monc,
375 struct ceph_mon_statfs_request *new) 373 struct ceph_mon_generic_request *new)
376{ 374{
377 struct rb_node **p = &monc->statfs_request_tree.rb_node; 375 struct rb_node **p = &monc->generic_request_tree.rb_node;
378 struct rb_node *parent = NULL; 376 struct rb_node *parent = NULL;
379 struct ceph_mon_statfs_request *req = NULL; 377 struct ceph_mon_generic_request *req = NULL;
380 378
381 while (*p) { 379 while (*p) {
382 parent = *p; 380 parent = *p;
383 req = rb_entry(parent, struct ceph_mon_statfs_request, node); 381 req = rb_entry(parent, struct ceph_mon_generic_request, node);
384 if (new->tid < req->tid) 382 if (new->tid < req->tid)
385 p = &(*p)->rb_left; 383 p = &(*p)->rb_left;
386 else if (new->tid > req->tid) 384 else if (new->tid > req->tid)
@@ -390,113 +388,157 @@ static void __insert_statfs(struct ceph_mon_client *monc,
390 } 388 }
391 389
392 rb_link_node(&new->node, parent, p); 390 rb_link_node(&new->node, parent, p);
393 rb_insert_color(&new->node, &monc->statfs_request_tree); 391 rb_insert_color(&new->node, &monc->generic_request_tree);
392}
393
394static void release_generic_request(struct kref *kref)
395{
396 struct ceph_mon_generic_request *req =
397 container_of(kref, struct ceph_mon_generic_request, kref);
398
399 if (req->reply)
400 ceph_msg_put(req->reply);
401 if (req->request)
402 ceph_msg_put(req->request);
403}
404
405static void put_generic_request(struct ceph_mon_generic_request *req)
406{
407 kref_put(&req->kref, release_generic_request);
408}
409
410static void get_generic_request(struct ceph_mon_generic_request *req)
411{
412 kref_get(&req->kref);
413}
414
415static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
416 struct ceph_msg_header *hdr,
417 int *skip)
418{
419 struct ceph_mon_client *monc = con->private;
420 struct ceph_mon_generic_request *req;
421 u64 tid = le64_to_cpu(hdr->tid);
422 struct ceph_msg *m;
423
424 mutex_lock(&monc->mutex);
425 req = __lookup_generic_req(monc, tid);
426 if (!req) {
427 dout("get_generic_reply %lld dne\n", tid);
428 *skip = 1;
429 m = NULL;
430 } else {
431 dout("get_generic_reply %lld got %p\n", tid, req->reply);
432 m = ceph_msg_get(req->reply);
433 /*
434 * we don't need to track the connection reading into
435 * this reply because we only have one open connection
436 * at a time, ever.
437 */
438 }
439 mutex_unlock(&monc->mutex);
440 return m;
394} 441}
395 442
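Editor's note: get_generic_reply() matches an incoming reply frame by tid against the registered requests, hands the request's pre-allocated reply buffer back to the messenger, and marks frames for unknown tids as skipped. A compact model of that dispatch (a fixed array stands in for the rbtree; not the kernel code):

#include <stdio.h>

struct generic_req { unsigned long tid; char reply[64]; int used; };

static struct generic_req table[4];

static char *get_generic_reply(unsigned long tid, int *skip)
{
        for (int i = 0; i < 4; i++)
                if (table[i].used && table[i].tid == tid) {
                        *skip = 0;
                        return table[i].reply;  /* read into this */
                }
        *skip = 1;                              /* tid dne: drop frame */
        return NULL;
}

int main(void)
{
        int skip;

        table[0] = (struct generic_req){ .tid = 42, .used = 1 };
        get_generic_reply(42, &skip);
        printf("tid 42 skip=%d\n", skip);       /* 0 */
        get_generic_reply(7, &skip);
        printf("tid 7  skip=%d\n", skip);       /* 1 */
        return 0;
}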
396static void handle_statfs_reply(struct ceph_mon_client *monc, 443static void handle_statfs_reply(struct ceph_mon_client *monc,
397 struct ceph_msg *msg) 444 struct ceph_msg *msg)
398{ 445{
399 struct ceph_mon_statfs_request *req; 446 struct ceph_mon_generic_request *req;
400 struct ceph_mon_statfs_reply *reply = msg->front.iov_base; 447 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
401 u64 tid; 448 u64 tid = le64_to_cpu(msg->hdr.tid);
402 449
403 if (msg->front.iov_len != sizeof(*reply)) 450 if (msg->front.iov_len != sizeof(*reply))
404 goto bad; 451 goto bad;
405 tid = le64_to_cpu(msg->hdr.tid);
406 dout("handle_statfs_reply %p tid %llu\n", msg, tid); 452 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
407 453
408 mutex_lock(&monc->mutex); 454 mutex_lock(&monc->mutex);
409 req = __lookup_statfs(monc, tid); 455 req = __lookup_generic_req(monc, tid);
410 if (req) { 456 if (req) {
411 *req->buf = reply->st; 457 *(struct ceph_statfs *)req->buf = reply->st;
412 req->result = 0; 458 req->result = 0;
459 get_generic_request(req);
413 } 460 }
414 mutex_unlock(&monc->mutex); 461 mutex_unlock(&monc->mutex);
415 if (req) 462 if (req) {
416 complete(&req->completion); 463 complete(&req->completion);
464 put_generic_request(req);
465 }
417 return; 466 return;
418 467
419bad: 468bad:
420 pr_err("corrupt statfs reply, no tid\n"); 469 pr_err("corrupt generic reply, no tid\n");
421 ceph_msg_dump(msg); 470 ceph_msg_dump(msg);
422} 471}
423 472
424/* 473/*
425 * (re)send a statfs request 474 * Do a synchronous statfs().
426 */ 475 */
427static int send_statfs(struct ceph_mon_client *monc, 476int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
428 struct ceph_mon_statfs_request *req)
429{ 477{
430 struct ceph_msg *msg; 478 struct ceph_mon_generic_request *req;
431 struct ceph_mon_statfs *h; 479 struct ceph_mon_statfs *h;
480 int err;
432 481
433 dout("send_statfs tid %llu\n", req->tid); 482 req = kzalloc(sizeof(*req), GFP_NOFS);
434 msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL); 483 if (!req)
435 if (IS_ERR(msg)) 484 return -ENOMEM;
436 return PTR_ERR(msg); 485
437 req->request = msg; 486 kref_init(&req->kref);
438 msg->hdr.tid = cpu_to_le64(req->tid); 487 req->buf = buf;
439 h = msg->front.iov_base; 488 init_completion(&req->completion);
489
490 err = -ENOMEM;
491 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
492 if (!req->request)
493 goto out;
494 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
495 if (!req->reply)
496 goto out;
497
498 /* fill out request */
499 h = req->request->front.iov_base;
440 h->monhdr.have_version = 0; 500 h->monhdr.have_version = 0;
441 h->monhdr.session_mon = cpu_to_le16(-1); 501 h->monhdr.session_mon = cpu_to_le16(-1);
442 h->monhdr.session_mon_tid = 0; 502 h->monhdr.session_mon_tid = 0;
443 h->fsid = monc->monmap->fsid; 503 h->fsid = monc->monmap->fsid;
444 ceph_con_send(monc->con, msg);
445 return 0;
446}
447
448/*
449 * Do a synchronous statfs().
450 */
451int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
452{
453 struct ceph_mon_statfs_request req;
454 int err;
455
456 req.buf = buf;
457 init_completion(&req.completion);
458
459 /* allocate memory for reply */
460 err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1);
461 if (err)
462 return err;
463 504
464 /* register request */ 505 /* register request */
465 mutex_lock(&monc->mutex); 506 mutex_lock(&monc->mutex);
466 req.tid = ++monc->last_tid; 507 req->tid = ++monc->last_tid;
467 req.last_attempt = jiffies; 508 req->request->hdr.tid = cpu_to_le64(req->tid);
468 req.delay = BASE_DELAY_INTERVAL; 509 __insert_generic_request(monc, req);
469 __insert_statfs(monc, &req); 510 monc->num_generic_requests++;
470 monc->num_statfs_requests++;
471 mutex_unlock(&monc->mutex); 511 mutex_unlock(&monc->mutex);
472 512
473 /* send request and wait */ 513 /* send request and wait */
474 err = send_statfs(monc, &req); 514 ceph_con_send(monc->con, ceph_msg_get(req->request));
475 if (!err) 515 err = wait_for_completion_interruptible(&req->completion);
476 err = wait_for_completion_interruptible(&req.completion);
477 516
478 mutex_lock(&monc->mutex); 517 mutex_lock(&monc->mutex);
479 rb_erase(&req.node, &monc->statfs_request_tree); 518 rb_erase(&req->node, &monc->generic_request_tree);
480 monc->num_statfs_requests--; 519 monc->num_generic_requests--;
481 ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
482 mutex_unlock(&monc->mutex); 520 mutex_unlock(&monc->mutex);
483 521
484 if (!err) 522 if (!err)
485 err = req.result; 523 err = req->result;
524
525out:
526 kref_put(&req->kref, release_generic_request);
486 return err; 527 return err;
487} 528}
488 529
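Editor's note: the request previously lived on the waiter's stack; it now lives on the heap behind a kref, so handle_statfs_reply() can take a reference under the mutex and complete the request safely even as the waiting thread tears down. A minimal refcount sketch of that lifetime (not the kernel kref API):

#include <stdio.h>
#include <stdlib.h>

struct generic_req { int refs; int result; };

static struct generic_req *req_alloc(void)
{
        struct generic_req *r = calloc(1, sizeof(*r));
        if (r)
                r->refs = 1;            /* the waiter's reference */
        return r;
}

static void req_get(struct generic_req *r) { r->refs++; }

static void req_put(struct generic_req *r)
{
        if (--r->refs == 0) {
                printf("last ref: freeing request\n");
                free(r);
        }
}

int main(void)
{
        struct generic_req *r = req_alloc();

        req_get(r);     /* reply handler pins it under the mutex ... */
        req_put(r);     /* ... and drops it after complete() */
        req_put(r);     /* waiter's final put frees the request */
        return 0;
}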
489/* 530/*
490 * Resend pending statfs requests. 531 * Resend pending statfs requests.
491 */ 532 */
492static void __resend_statfs(struct ceph_mon_client *monc) 533static void __resend_generic_request(struct ceph_mon_client *monc)
493{ 534{
494 struct ceph_mon_statfs_request *req; 535 struct ceph_mon_generic_request *req;
495 struct rb_node *p; 536 struct rb_node *p;
496 537
497 for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) { 538 for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
498 req = rb_entry(p, struct ceph_mon_statfs_request, node); 539 req = rb_entry(p, struct ceph_mon_generic_request, node);
499 send_statfs(monc, req); 540 ceph_con_revoke(monc->con, req->request);
541 ceph_con_send(monc->con, ceph_msg_get(req->request));
500 } 542 }
501} 543}
502 544
@@ -586,26 +628,26 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
586 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON | 628 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
587 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS; 629 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
588 630
589 /* msg pools */ 631 /* msgs */
590 err = ceph_msgpool_init(&monc->msgpool_subscribe_ack, 632 err = -ENOMEM;
591 sizeof(struct ceph_mon_subscribe_ack), 1, false); 633 monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
592 if (err < 0) 634 sizeof(struct ceph_mon_subscribe_ack),
635 GFP_NOFS);
636 if (!monc->m_subscribe_ack)
593 goto out_monmap; 637 goto out_monmap;
594 err = ceph_msgpool_init(&monc->msgpool_statfs_reply, 638
595 sizeof(struct ceph_mon_statfs_reply), 0, false); 639 monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
596 if (err < 0) 640 if (!monc->m_subscribe)
597 goto out_pool1; 641 goto out_subscribe_ack;
598 err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false); 642
599 if (err < 0) 643 monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
600 goto out_pool2; 644 if (!monc->m_auth_reply)
601 645 goto out_subscribe;
602 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL); 646
647 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
603 monc->pending_auth = 0; 648 monc->pending_auth = 0;
604 if (IS_ERR(monc->m_auth)) { 649 if (!monc->m_auth)
605 err = PTR_ERR(monc->m_auth); 650 goto out_auth_reply;
606 monc->m_auth = NULL;
607 goto out_pool3;
608 }
609 651
610 monc->cur_mon = -1; 652 monc->cur_mon = -1;
611 monc->hunting = true; 653 monc->hunting = true;
@@ -613,8 +655,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
613 monc->sub_sent = 0; 655 monc->sub_sent = 0;
614 656
615 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); 657 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
616 monc->statfs_request_tree = RB_ROOT; 658 monc->generic_request_tree = RB_ROOT;
617 monc->num_statfs_requests = 0; 659 monc->num_generic_requests = 0;
618 monc->last_tid = 0; 660 monc->last_tid = 0;
619 661
620 monc->have_mdsmap = 0; 662 monc->have_mdsmap = 0;
@@ -622,12 +664,12 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
622 monc->want_next_osdmap = 1; 664 monc->want_next_osdmap = 1;
623 return 0; 665 return 0;
624 666
625out_pool3: 667out_auth_reply:
626 ceph_msgpool_destroy(&monc->msgpool_auth_reply); 668 ceph_msg_put(monc->m_auth_reply);
627out_pool2: 669out_subscribe:
628 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); 670 ceph_msg_put(monc->m_subscribe);
629out_pool1: 671out_subscribe_ack:
630 ceph_msgpool_destroy(&monc->msgpool_statfs_reply); 672 ceph_msg_put(monc->m_subscribe_ack);
631out_monmap: 673out_monmap:
632 kfree(monc->monmap); 674 kfree(monc->monmap);
633out: 675out:
@@ -651,9 +693,9 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
651 ceph_auth_destroy(monc->auth); 693 ceph_auth_destroy(monc->auth);
652 694
653 ceph_msg_put(monc->m_auth); 695 ceph_msg_put(monc->m_auth);
654 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); 696 ceph_msg_put(monc->m_auth_reply);
655 ceph_msgpool_destroy(&monc->msgpool_statfs_reply); 697 ceph_msg_put(monc->m_subscribe);
656 ceph_msgpool_destroy(&monc->msgpool_auth_reply); 698 ceph_msg_put(monc->m_subscribe_ack);
657 699
658 kfree(monc->monmap); 700 kfree(monc->monmap);
659} 701}
@@ -681,7 +723,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
681 monc->client->msgr->inst.name.num = monc->auth->global_id; 723 monc->client->msgr->inst.name.num = monc->auth->global_id;
682 724
683 __send_subscribe(monc); 725 __send_subscribe(monc);
684 __resend_statfs(monc); 726 __resend_generic_request(monc);
685 } 727 }
686 mutex_unlock(&monc->mutex); 728 mutex_unlock(&monc->mutex);
687} 729}
@@ -770,18 +812,17 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
770 812
771 switch (type) { 813 switch (type) {
772 case CEPH_MSG_MON_SUBSCRIBE_ACK: 814 case CEPH_MSG_MON_SUBSCRIBE_ACK:
773 m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len); 815 m = ceph_msg_get(monc->m_subscribe_ack);
774 break; 816 break;
775 case CEPH_MSG_STATFS_REPLY: 817 case CEPH_MSG_STATFS_REPLY:
776 m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len); 818 return get_generic_reply(con, hdr, skip);
777 break;
778 case CEPH_MSG_AUTH_REPLY: 819 case CEPH_MSG_AUTH_REPLY:
779 m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len); 820 m = ceph_msg_get(monc->m_auth_reply);
780 break; 821 break;
781 case CEPH_MSG_MON_MAP: 822 case CEPH_MSG_MON_MAP:
782 case CEPH_MSG_MDS_MAP: 823 case CEPH_MSG_MDS_MAP:
783 case CEPH_MSG_OSD_MAP: 824 case CEPH_MSG_OSD_MAP:
784 m = ceph_msg_new(type, front_len, 0, 0, NULL); 825 m = ceph_msg_new(type, front_len, GFP_NOFS);
785 break; 826 break;
786 } 827 }
787 828
@@ -826,7 +867,7 @@ out:
826 mutex_unlock(&monc->mutex); 867 mutex_unlock(&monc->mutex);
827} 868}
828 869
829const static struct ceph_connection_operations mon_con_ops = { 870static const struct ceph_connection_operations mon_con_ops = {
830 .get = ceph_con_get, 871 .get = ceph_con_get,
831 .put = ceph_con_put, 872 .put = ceph_con_put,
832 .dispatch = dispatch, 873 .dispatch = dispatch,
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
index b958ad5afa06..174d794321d0 100644
--- a/fs/ceph/mon_client.h
+++ b/fs/ceph/mon_client.h
@@ -2,10 +2,10 @@
2#define _FS_CEPH_MON_CLIENT_H 2#define _FS_CEPH_MON_CLIENT_H
3 3
4#include <linux/completion.h> 4#include <linux/completion.h>
5#include <linux/kref.h>
5#include <linux/rbtree.h> 6#include <linux/rbtree.h>
6 7
7#include "messenger.h" 8#include "messenger.h"
8#include "msgpool.h"
9 9
10struct ceph_client; 10struct ceph_client;
11struct ceph_mount_args; 11struct ceph_mount_args;
@@ -22,7 +22,7 @@ struct ceph_monmap {
22}; 22};
23 23
24struct ceph_mon_client; 24struct ceph_mon_client;
25struct ceph_mon_statfs_request; 25struct ceph_mon_generic_request;
26 26
27 27
28/* 28/*
@@ -40,17 +40,19 @@ struct ceph_mon_request {
40}; 40};
41 41
42/* 42/*
43 * statfs() is done a bit differently because we need to get data back 43 * ceph_mon_generic_request is being used for the statfs and poolop requests
 44 * which are being done a bit differently because we need to get data back
44 * to the caller 45 * to the caller
45 */ 46 */
46struct ceph_mon_statfs_request { 47struct ceph_mon_generic_request {
48 struct kref kref;
47 u64 tid; 49 u64 tid;
48 struct rb_node node; 50 struct rb_node node;
49 int result; 51 int result;
50 struct ceph_statfs *buf; 52 void *buf;
51 struct completion completion; 53 struct completion completion;
52 unsigned long last_attempt, delay; /* jiffies */
53 struct ceph_msg *request; /* original request */ 54 struct ceph_msg *request; /* original request */
55 struct ceph_msg *reply; /* and reply */
54}; 56};
55 57
56struct ceph_mon_client { 58struct ceph_mon_client {
@@ -61,7 +63,7 @@ struct ceph_mon_client {
61 struct delayed_work delayed_work; 63 struct delayed_work delayed_work;
62 64
63 struct ceph_auth_client *auth; 65 struct ceph_auth_client *auth;
64 struct ceph_msg *m_auth; 66 struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
65 int pending_auth; 67 int pending_auth;
66 68
67 bool hunting; 69 bool hunting;
@@ -70,14 +72,9 @@ struct ceph_mon_client {
70 struct ceph_connection *con; 72 struct ceph_connection *con;
71 bool have_fsid; 73 bool have_fsid;
72 74
73 /* msg pools */ 75 /* pending generic requests */
74 struct ceph_msgpool msgpool_subscribe_ack; 76 struct rb_root generic_request_tree;
75 struct ceph_msgpool msgpool_statfs_reply; 77 int num_generic_requests;
76 struct ceph_msgpool msgpool_auth_reply;
77
78 /* pending statfs requests */
79 struct rb_root statfs_request_tree;
80 int num_statfs_requests;
81 u64 last_tid; 78 u64 last_tid;
82 79
83 /* mds/osd map */ 80 /* mds/osd map */
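
[Editor's note] The statfs-specific request machinery above is generalized into a kref-counted ceph_mon_generic_request so that other monitor round trips (the poolop requests named in the comment) can share the tid tree and completion plumbing. Below is a minimal sketch of the caller-side lifecycle implied by the hunks above; the placement of kref_init/init_completion and the sketch's function name are assumptions, since the patch does not show them:

	/* Sketch of the generic-request lifecycle; not part of the patch. */
	static int do_generic_request_sketch(struct ceph_mon_client *monc,
					     struct ceph_mon_generic_request *req)
	{
		int err;

		kref_init(&req->kref);                /* caller holds one ref (assumed) */
		init_completion(&req->completion);    /* assumed caller-side init */

		mutex_lock(&monc->mutex);
		req->tid = ++monc->last_tid;
		req->request->hdr.tid = cpu_to_le64(req->tid);
		__insert_generic_request(monc, req);  /* rbtree keyed by tid */
		monc->num_generic_requests++;
		mutex_unlock(&monc->mutex);

		/* the connection takes its own ref while the msg is in flight */
		ceph_con_send(monc->con, ceph_msg_get(req->request));
		err = wait_for_completion_interruptible(&req->completion);

		mutex_lock(&monc->mutex);
		rb_erase(&req->node, &monc->generic_request_tree);
		monc->num_generic_requests--;
		mutex_unlock(&monc->mutex);

		if (!err)
			err = req->result;
		kref_put(&req->kref, release_generic_request);
		return err;
	}
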
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
index ca3b44a89f2d..dd65a6438131 100644
--- a/fs/ceph/msgpool.c
+++ b/fs/ceph/msgpool.c
@@ -7,180 +7,58 @@
7 7
8#include "msgpool.h" 8#include "msgpool.h"
9 9
10/* 10static void *alloc_fn(gfp_t gfp_mask, void *arg)
11 * We use msg pools to preallocate memory for messages we expect to 11{
12 * receive over the wire, to avoid getting ourselves into OOM 12 struct ceph_msgpool *pool = arg;
 13 * conditions at unexpected times. We use a few different 13 void *p;
14 * strategies:
15 *
16 * - for request/response type interactions, we preallocate the
17 * memory needed for the response when we generate the request.
18 *
19 * - for messages we can receive at any time from the MDS, we preallocate
20 * a pool of messages we can re-use.
21 *
22 * - for writeback, we preallocate some number of messages to use for
23 * requests and their replies, so that we always make forward
24 * progress.
25 *
26 * The msgpool behaves like a mempool_t, but keeps preallocated
27 * ceph_msgs strung together on a list_head instead of using a pointer
28 * vector. This avoids vector reallocation when we adjust the number
29 * of preallocated items (which happens frequently).
30 */
31 14
15 p = ceph_msg_new(0, pool->front_len, gfp_mask);
16 if (!p)
17 pr_err("msgpool %s alloc failed\n", pool->name);
18 return p;
19}
32 20
33/* 21static void free_fn(void *element, void *arg)
34 * Allocate or release as necessary to meet our target pool size.
35 */
36static int __fill_msgpool(struct ceph_msgpool *pool)
37{ 22{
38 struct ceph_msg *msg; 23 ceph_msg_put(element);
39
40 while (pool->num < pool->min) {
41 dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
42 pool->min);
43 spin_unlock(&pool->lock);
44 msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
45 spin_lock(&pool->lock);
46 if (IS_ERR(msg))
47 return PTR_ERR(msg);
48 msg->pool = pool;
49 list_add(&msg->list_head, &pool->msgs);
50 pool->num++;
51 }
52 while (pool->num > pool->min) {
53 msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
54 dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
55 pool->min, msg);
56 list_del_init(&msg->list_head);
57 pool->num--;
58 ceph_msg_kfree(msg);
59 }
60 return 0;
61} 24}
62 25
63int ceph_msgpool_init(struct ceph_msgpool *pool, 26int ceph_msgpool_init(struct ceph_msgpool *pool,
64 int front_len, int min, bool blocking) 27 int front_len, int size, bool blocking, const char *name)
65{ 28{
66 int ret;
67
68 dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
69 spin_lock_init(&pool->lock);
70 pool->front_len = front_len; 29 pool->front_len = front_len;
71 INIT_LIST_HEAD(&pool->msgs); 30 pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
72 pool->num = 0; 31 if (!pool->pool)
73 pool->min = min; 32 return -ENOMEM;
74 pool->blocking = blocking; 33 pool->name = name;
75 init_waitqueue_head(&pool->wait); 34 return 0;
76
77 spin_lock(&pool->lock);
78 ret = __fill_msgpool(pool);
79 spin_unlock(&pool->lock);
80 return ret;
81} 35}
82 36
83void ceph_msgpool_destroy(struct ceph_msgpool *pool) 37void ceph_msgpool_destroy(struct ceph_msgpool *pool)
84{ 38{
85 dout("msgpool_destroy %p\n", pool); 39 mempool_destroy(pool->pool);
86 spin_lock(&pool->lock);
87 pool->min = 0;
88 __fill_msgpool(pool);
89 spin_unlock(&pool->lock);
90} 40}
91 41
92int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta) 42struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
43 int front_len)
93{ 44{
94 int ret; 45 if (front_len > pool->front_len) {
95 46 pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
96 spin_lock(&pool->lock); 47 pool->name, front_len, pool->front_len);
97 dout("msgpool_resv %p delta %d\n", pool, delta);
98 pool->min += delta;
99 ret = __fill_msgpool(pool);
100 spin_unlock(&pool->lock);
101 return ret;
102}
103
104struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
105{
106 wait_queue_t wait;
107 struct ceph_msg *msg;
108
109 if (front_len && front_len > pool->front_len) {
110 pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
111 pool, front_len, pool->front_len);
112 WARN_ON(1); 48 WARN_ON(1);
113 49
114 /* try to alloc a fresh message */ 50 /* try to alloc a fresh message */
115 msg = ceph_msg_new(0, front_len, 0, 0, NULL); 51 return ceph_msg_new(0, front_len, GFP_NOFS);
116 if (!IS_ERR(msg))
117 return msg;
118 }
119
120 if (!front_len)
121 front_len = pool->front_len;
122
123 if (pool->blocking) {
124 /* mempool_t behavior; first try to alloc */
125 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
126 if (!IS_ERR(msg))
127 return msg;
128 } 52 }
129 53
130 while (1) { 54 return mempool_alloc(pool->pool, GFP_NOFS);
131 spin_lock(&pool->lock);
132 if (likely(pool->num)) {
133 msg = list_entry(pool->msgs.next, struct ceph_msg,
134 list_head);
135 list_del_init(&msg->list_head);
136 pool->num--;
137 dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
138 pool->num, pool->min);
139 spin_unlock(&pool->lock);
140 return msg;
141 }
142 pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
143 pool->min, pool->blocking ? "waiting" : "may fail");
144 spin_unlock(&pool->lock);
145
146 if (!pool->blocking) {
147 WARN_ON(1);
148
149 /* maybe we can allocate it now? */
150 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
151 if (!IS_ERR(msg))
152 return msg;
153
154 pr_err("msgpool_get %p empty + alloc failed\n", pool);
155 return ERR_PTR(-ENOMEM);
156 }
157
158 init_wait(&wait);
159 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
160 schedule();
161 finish_wait(&pool->wait, &wait);
162 }
163} 55}
164 56
165void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg) 57void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
166{ 58{
167 spin_lock(&pool->lock); 59 /* reset msg front_len; user may have changed it */
168 if (pool->num < pool->min) { 60 msg->front.iov_len = pool->front_len;
169 /* reset msg front_len; user may have changed it */ 61 msg->hdr.front_len = cpu_to_le32(pool->front_len);
170 msg->front.iov_len = pool->front_len;
171 msg->hdr.front_len = cpu_to_le32(pool->front_len);
172 62
173 kref_set(&msg->kref, 1); /* retake a single ref */ 63 kref_init(&msg->kref); /* retake single ref */
174 list_add(&msg->list_head, &pool->msgs);
175 pool->num++;
176 dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
177 pool->num, pool->min);
178 spin_unlock(&pool->lock);
179 wake_up(&pool->wait);
180 } else {
181 dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
182 pool->num, pool->min);
183 spin_unlock(&pool->lock);
184 ceph_msg_kfree(msg);
185 }
186} 64}
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
index bc834bfcd720..a362605f9368 100644
--- a/fs/ceph/msgpool.h
+++ b/fs/ceph/msgpool.h
@@ -1,6 +1,7 @@
1#ifndef _FS_CEPH_MSGPOOL 1#ifndef _FS_CEPH_MSGPOOL
2#define _FS_CEPH_MSGPOOL 2#define _FS_CEPH_MSGPOOL
3 3
4#include <linux/mempool.h>
4#include "messenger.h" 5#include "messenger.h"
5 6
6/* 7/*
@@ -8,18 +9,15 @@
8 * avoid unexpected OOM conditions. 9 * avoid unexpected OOM conditions.
9 */ 10 */
10struct ceph_msgpool { 11struct ceph_msgpool {
11 spinlock_t lock; 12 const char *name;
13 mempool_t *pool;
12 int front_len; /* preallocated payload size */ 14 int front_len; /* preallocated payload size */
13 struct list_head msgs; /* msgs in the pool; each has 1 ref */
14 int num, min; /* cur, min # msgs in the pool */
15 bool blocking;
16 wait_queue_head_t wait;
17}; 15};
18 16
19extern int ceph_msgpool_init(struct ceph_msgpool *pool, 17extern int ceph_msgpool_init(struct ceph_msgpool *pool,
20 int front_len, int size, bool blocking); 18 int front_len, int size, bool blocking,
19 const char *name);
21extern void ceph_msgpool_destroy(struct ceph_msgpool *pool); 20extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
22extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
23extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *, 21extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
24 int front_len); 22 int front_len);
25extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *); 23extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
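
[Editor's note] With the rewrite above, a msgpool is a thin wrapper over mempool_t. A hedged usage sketch of the new API follows; the pool name, sizes, and error handling are illustrative rather than taken from the patch:

	struct ceph_msgpool pool;
	struct ceph_msg *msg;

	/* preallocate 8 messages with 512-byte fronts; fails with -ENOMEM */
	if (ceph_msgpool_init(&pool, 512, 8, true, "example") < 0)
		return -ENOMEM;

	msg = ceph_msgpool_get(&pool, 0);  /* mempool_alloc(GFP_NOFS) underneath */
	/* ... fill msg->front and send it ... */
	ceph_msgpool_put(&pool, msg);      /* resets front_len, retakes single ref */
	ceph_msgpool_destroy(&pool);

Passing a front_len larger than the pool's preallocated size falls back to a fresh ceph_msg_new() with a WARN_ON, as shown in ceph_msgpool_get above.
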
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
index 8aaab414f3f8..892a0298dfdf 100644
--- a/fs/ceph/msgr.h
+++ b/fs/ceph/msgr.h
@@ -50,7 +50,6 @@ struct ceph_entity_name {
50#define CEPH_ENTITY_TYPE_MDS 0x02 50#define CEPH_ENTITY_TYPE_MDS 0x02
51#define CEPH_ENTITY_TYPE_OSD 0x04 51#define CEPH_ENTITY_TYPE_OSD 0x04
52#define CEPH_ENTITY_TYPE_CLIENT 0x08 52#define CEPH_ENTITY_TYPE_CLIENT 0x08
53#define CEPH_ENTITY_TYPE_ADMIN 0x10
54#define CEPH_ENTITY_TYPE_AUTH 0x20 53#define CEPH_ENTITY_TYPE_AUTH 0x20
55 54
56#define CEPH_ENTITY_TYPE_ANY 0xFF 55#define CEPH_ENTITY_TYPE_ANY 0xFF
@@ -120,7 +119,7 @@ struct ceph_msg_connect_reply {
120/* 119/*
121 * message header 120 * message header
122 */ 121 */
123struct ceph_msg_header { 122struct ceph_msg_header_old {
124 __le64 seq; /* message seq# for this session */ 123 __le64 seq; /* message seq# for this session */
125 __le64 tid; /* transaction id */ 124 __le64 tid; /* transaction id */
126 __le16 type; /* message type */ 125 __le16 type; /* message type */
@@ -138,6 +137,24 @@ struct ceph_msg_header {
138 __le32 crc; /* header crc32c */ 137 __le32 crc; /* header crc32c */
139} __attribute__ ((packed)); 138} __attribute__ ((packed));
140 139
140struct ceph_msg_header {
141 __le64 seq; /* message seq# for this session */
142 __le64 tid; /* transaction id */
143 __le16 type; /* message type */
144 __le16 priority; /* priority. higher value == higher priority */
145 __le16 version; /* version of message encoding */
146
147 __le32 front_len; /* bytes in main payload */
148 __le32 middle_len;/* bytes in middle payload */
149 __le32 data_len; /* bytes of data payload */
150 __le16 data_off; /* sender: include full offset;
151 receiver: mask against ~PAGE_MASK */
152
153 struct ceph_entity_name src;
154 __le32 reserved;
155 __le32 crc; /* header crc32c */
156} __attribute__ ((packed));
157
141#define CEPH_MSG_PRIO_LOW 64 158#define CEPH_MSG_PRIO_LOW 64
142#define CEPH_MSG_PRIO_DEFAULT 127 159#define CEPH_MSG_PRIO_DEFAULT 127
143#define CEPH_MSG_PRIO_HIGH 196 160#define CEPH_MSG_PRIO_HIGH 196
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 3514f71ff85f..afa7bb3895c4 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -16,7 +16,7 @@
16#define OSD_OP_FRONT_LEN 4096 16#define OSD_OP_FRONT_LEN 4096
17#define OSD_OPREPLY_FRONT_LEN 512 17#define OSD_OPREPLY_FRONT_LEN 512
18 18
19const static struct ceph_connection_operations osd_con_ops; 19static const struct ceph_connection_operations osd_con_ops;
20static int __kick_requests(struct ceph_osd_client *osdc, 20static int __kick_requests(struct ceph_osd_client *osdc,
21 struct ceph_osd *kickosd); 21 struct ceph_osd *kickosd);
22 22
@@ -147,7 +147,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
147 req = kzalloc(sizeof(*req), GFP_NOFS); 147 req = kzalloc(sizeof(*req), GFP_NOFS);
148 } 148 }
149 if (req == NULL) 149 if (req == NULL)
150 return ERR_PTR(-ENOMEM); 150 return NULL;
151 151
152 req->r_osdc = osdc; 152 req->r_osdc = osdc;
153 req->r_mempool = use_mempool; 153 req->r_mempool = use_mempool;
@@ -164,10 +164,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); 164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
165 else 165 else
166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, 166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
167 OSD_OPREPLY_FRONT_LEN, 0, 0, NULL); 167 OSD_OPREPLY_FRONT_LEN, GFP_NOFS);
168 if (IS_ERR(msg)) { 168 if (!msg) {
169 ceph_osdc_put_request(req); 169 ceph_osdc_put_request(req);
170 return ERR_PTR(PTR_ERR(msg)); 170 return NULL;
171 } 171 }
172 req->r_reply = msg; 172 req->r_reply = msg;
173 173
@@ -178,10 +178,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
178 if (use_mempool) 178 if (use_mempool)
179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0); 179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
180 else 180 else
181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL); 181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS);
182 if (IS_ERR(msg)) { 182 if (!msg) {
183 ceph_osdc_put_request(req); 183 ceph_osdc_put_request(req);
184 return ERR_PTR(PTR_ERR(msg)); 184 return NULL;
185 } 185 }
186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP); 186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
187 memset(msg->front.iov_base, 0, msg->front.iov_len); 187 memset(msg->front.iov_base, 0, msg->front.iov_len);
@@ -715,7 +715,7 @@ static void handle_timeout(struct work_struct *work)
715 * should mark the osd as failed and we should find out about 715 * should mark the osd as failed and we should find out about
716 * it from an updated osd map. 716 * it from an updated osd map.
717 */ 717 */
718 while (!list_empty(&osdc->req_lru)) { 718 while (timeout && !list_empty(&osdc->req_lru)) {
719 req = list_entry(osdc->req_lru.next, struct ceph_osd_request, 719 req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
720 r_req_lru_item); 720 r_req_lru_item);
721 721
@@ -1078,6 +1078,7 @@ done:
1078 if (newmap) 1078 if (newmap)
1079 kick_requests(osdc, NULL); 1079 kick_requests(osdc, NULL);
1080 up_read(&osdc->map_sem); 1080 up_read(&osdc->map_sem);
1081 wake_up(&osdc->client->auth_wq);
1081 return; 1082 return;
1082 1083
1083bad: 1084bad:
@@ -1087,45 +1088,6 @@ bad:
1087 return; 1088 return;
1088} 1089}
1089 1090
1090
1091/*
1092 * A read request prepares specific pages that data is to be read into.
1093 * When a message is being read off the wire, we call prepare_pages to
1094 * find those pages.
1095 * 0 = success, -1 failure.
1096 */
1097static int __prepare_pages(struct ceph_connection *con,
1098 struct ceph_msg_header *hdr,
1099 struct ceph_osd_request *req,
1100 u64 tid,
1101 struct ceph_msg *m)
1102{
1103 struct ceph_osd *osd = con->private;
1104 struct ceph_osd_client *osdc;
1105 int ret = -1;
1106 int data_len = le32_to_cpu(hdr->data_len);
1107 unsigned data_off = le16_to_cpu(hdr->data_off);
1108
1109 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1110
1111 if (!osd)
1112 return -1;
1113
1114 osdc = osd->o_osdc;
1115
1116 dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m,
1117 tid, req->r_num_pages, want);
1118 if (unlikely(req->r_num_pages < want))
1119 goto out;
1120 m->pages = req->r_pages;
1121 m->nr_pages = req->r_num_pages;
1122 ret = 0; /* success */
1123out:
1124 BUG_ON(ret < 0 || m->nr_pages < want);
1125
1126 return ret;
1127}
1128
1129/* 1091/*
1130 * Register request, send initial attempt. 1092 * Register request, send initial attempt.
1131 */ 1093 */
@@ -1252,11 +1214,13 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1252 if (!osdc->req_mempool) 1214 if (!osdc->req_mempool)
1253 goto out; 1215 goto out;
1254 1216
1255 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true); 1217 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
1218 "osd_op");
1256 if (err < 0) 1219 if (err < 0)
1257 goto out_mempool; 1220 goto out_mempool;
1258 err = ceph_msgpool_init(&osdc->msgpool_op_reply, 1221 err = ceph_msgpool_init(&osdc->msgpool_op_reply,
1259 OSD_OPREPLY_FRONT_LEN, 10, true); 1222 OSD_OPREPLY_FRONT_LEN, 10, true,
1223 "osd_op_reply");
1260 if (err < 0) 1224 if (err < 0)
1261 goto out_msgpool; 1225 goto out_msgpool;
1262 return 0; 1226 return 0;
@@ -1302,8 +1266,8 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1302 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 1266 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1303 NULL, 0, truncate_seq, truncate_size, NULL, 1267 NULL, 0, truncate_seq, truncate_size, NULL,
1304 false, 1); 1268 false, 1);
1305 if (IS_ERR(req)) 1269 if (!req)
1306 return PTR_ERR(req); 1270 return -ENOMEM;
1307 1271
1308 /* it may be a short read due to an object boundary */ 1272 /* it may be a short read due to an object boundary */
1309 req->r_pages = pages; 1273 req->r_pages = pages;
@@ -1345,8 +1309,8 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1345 snapc, do_sync, 1309 snapc, do_sync,
1346 truncate_seq, truncate_size, mtime, 1310 truncate_seq, truncate_size, mtime,
1347 nofail, 1); 1311 nofail, 1);
1348 if (IS_ERR(req)) 1312 if (!req)
1349 return PTR_ERR(req); 1313 return -ENOMEM;
1350 1314
1351 /* it may be a short write due to an object boundary */ 1315 /* it may be a short write due to an object boundary */
1352 req->r_pages = pages; 1316 req->r_pages = pages;
@@ -1394,7 +1358,8 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1394} 1358}
1395 1359
1396/* 1360/*
1397 * lookup and return message for incoming reply 1361 * lookup and return message for incoming reply. set up reply message
1362 * pages.
1398 */ 1363 */
1399static struct ceph_msg *get_reply(struct ceph_connection *con, 1364static struct ceph_msg *get_reply(struct ceph_connection *con,
1400 struct ceph_msg_header *hdr, 1365 struct ceph_msg_header *hdr,
@@ -1407,7 +1372,6 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
1407 int front = le32_to_cpu(hdr->front_len); 1372 int front = le32_to_cpu(hdr->front_len);
1408 int data_len = le32_to_cpu(hdr->data_len); 1373 int data_len = le32_to_cpu(hdr->data_len);
1409 u64 tid; 1374 u64 tid;
1410 int err;
1411 1375
1412 tid = le64_to_cpu(hdr->tid); 1376 tid = le64_to_cpu(hdr->tid);
1413 mutex_lock(&osdc->request_mutex); 1377 mutex_lock(&osdc->request_mutex);
@@ -1425,13 +1389,14 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
1425 req->r_reply, req->r_con_filling_msg); 1389 req->r_reply, req->r_con_filling_msg);
1426 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply); 1390 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
1427 ceph_con_put(req->r_con_filling_msg); 1391 ceph_con_put(req->r_con_filling_msg);
1392 req->r_con_filling_msg = NULL;
1428 } 1393 }
1429 1394
1430 if (front > req->r_reply->front.iov_len) { 1395 if (front > req->r_reply->front.iov_len) {
1431 pr_warning("get_reply front %d > preallocated %d\n", 1396 pr_warning("get_reply front %d > preallocated %d\n",
1432 front, (int)req->r_reply->front.iov_len); 1397 front, (int)req->r_reply->front.iov_len);
1433 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL); 1398 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
1434 if (IS_ERR(m)) 1399 if (!m)
1435 goto out; 1400 goto out;
1436 ceph_msg_put(req->r_reply); 1401 ceph_msg_put(req->r_reply);
1437 req->r_reply = m; 1402 req->r_reply = m;
@@ -1439,12 +1404,19 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
1439 m = ceph_msg_get(req->r_reply); 1404 m = ceph_msg_get(req->r_reply);
1440 1405
1441 if (data_len > 0) { 1406 if (data_len > 0) {
1442 err = __prepare_pages(con, hdr, req, tid, m); 1407 unsigned data_off = le16_to_cpu(hdr->data_off);
1443 if (err < 0) { 1408 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1409
1410 if (unlikely(req->r_num_pages < want)) {
1411 pr_warning("tid %lld reply %d > expected %d pages\n",
1412 tid, want, m->nr_pages);
1444 *skip = 1; 1413 *skip = 1;
1445 ceph_msg_put(m); 1414 ceph_msg_put(m);
1446 m = ERR_PTR(err); 1415 m = NULL;
1416 goto out;
1447 } 1417 }
1418 m->pages = req->r_pages;
1419 m->nr_pages = req->r_num_pages;
1448 } 1420 }
1449 *skip = 0; 1421 *skip = 0;
1450 req->r_con_filling_msg = ceph_con_get(con); 1422 req->r_con_filling_msg = ceph_con_get(con);
@@ -1466,7 +1438,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
1466 1438
1467 switch (type) { 1439 switch (type) {
1468 case CEPH_MSG_OSD_MAP: 1440 case CEPH_MSG_OSD_MAP:
1469 return ceph_msg_new(type, front, 0, 0, NULL); 1441 return ceph_msg_new(type, front, GFP_NOFS);
1470 case CEPH_MSG_OSD_OPREPLY: 1442 case CEPH_MSG_OSD_OPREPLY:
1471 return get_reply(con, hdr, skip); 1443 return get_reply(con, hdr, skip);
1472 default: 1444 default:
@@ -1552,7 +1524,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
1552 return ceph_monc_validate_auth(&osdc->client->monc); 1524 return ceph_monc_validate_auth(&osdc->client->monc);
1553} 1525}
1554 1526
1555const static struct ceph_connection_operations osd_con_ops = { 1527static const struct ceph_connection_operations osd_con_ops = {
1556 .get = get_osd_con, 1528 .get = get_osd_con,
1557 .put = put_osd_con, 1529 .put = put_osd_con,
1558 .dispatch = dispatch, 1530 .dispatch = dispatch,
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
index 5f8dbf7c745a..b6859f47d364 100644
--- a/fs/ceph/pagelist.c
+++ b/fs/ceph/pagelist.c
@@ -20,7 +20,7 @@ int ceph_pagelist_release(struct ceph_pagelist *pl)
20 20
21static int ceph_pagelist_addpage(struct ceph_pagelist *pl) 21static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
22{ 22{
23 struct page *page = alloc_page(GFP_NOFS); 23 struct page *page = __page_cache_alloc(GFP_NOFS);
24 if (!page) 24 if (!page)
25 return -ENOMEM; 25 return -ENOMEM;
26 pl->room += PAGE_SIZE; 26 pl->room += PAGE_SIZE;
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
index fd56451a871f..8fcc023056c7 100644
--- a/fs/ceph/rados.h
+++ b/fs/ceph/rados.h
@@ -101,8 +101,8 @@ struct ceph_pg_pool {
101 __le64 snap_seq; /* seq for per-pool snapshot */ 101 __le64 snap_seq; /* seq for per-pool snapshot */
102 __le32 snap_epoch; /* epoch of last snap */ 102 __le32 snap_epoch; /* epoch of last snap */
103 __le32 num_snaps; 103 __le32 num_snaps;
104 __le32 num_removed_snap_intervals; 104 __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
105 __le64 uid; 105 __le64 auid; /* who owns the pg */
106} __attribute__ ((packed)); 106} __attribute__ ((packed));
107 107
108/* 108/*
@@ -208,6 +208,7 @@ enum {
208 /* read */ 208 /* read */
209 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, 209 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
210 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2, 210 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
211 CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
211 212
212 /* write */ 213 /* write */
213 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1, 214 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
@@ -305,6 +306,22 @@ enum {
305#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ 306#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
306#define EBLACKLISTED ESHUTDOWN /* blacklisted */ 307#define EBLACKLISTED ESHUTDOWN /* blacklisted */
307 308
309/* xattr comparison */
310enum {
311 CEPH_OSD_CMPXATTR_OP_NOP = 0,
312 CEPH_OSD_CMPXATTR_OP_EQ = 1,
313 CEPH_OSD_CMPXATTR_OP_NE = 2,
314 CEPH_OSD_CMPXATTR_OP_GT = 3,
315 CEPH_OSD_CMPXATTR_OP_GTE = 4,
316 CEPH_OSD_CMPXATTR_OP_LT = 5,
317 CEPH_OSD_CMPXATTR_OP_LTE = 6
318};
319
320enum {
321 CEPH_OSD_CMPXATTR_MODE_STRING = 1,
322 CEPH_OSD_CMPXATTR_MODE_U64 = 2
323};
324
308/* 325/*
309 * an individual object operation. each may be accompanied by some data 326 * an individual object operation. each may be accompanied by some data
310 * payload 327 * payload
@@ -321,6 +338,8 @@ struct ceph_osd_op {
321 struct { 338 struct {
322 __le32 name_len; 339 __le32 name_len;
323 __le32 value_len; 340 __le32 value_len;
341 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
342 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
324 } __attribute__ ((packed)) xattr; 343 } __attribute__ ((packed)) xattr;
325 struct { 344 struct {
326 __u8 class_len; 345 __u8 class_len;
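
[Editor's note] The cmp_op/cmp_mode fields added to the xattr op let CEPH_OSD_OP_CMPXATTR gate an operation on an attribute comparison. A hedged sketch of filling one in; the attribute name and value length are illustrative, and anything beyond the struct layout shown above is an assumption:

	struct ceph_osd_op op = { 0 };

	op.op = cpu_to_le16(CEPH_OSD_OP_CMPXATTR);
	op.xattr.name_len = cpu_to_le32(strlen("user.version"));   /* illustrative */
	op.xattr.value_len = cpu_to_le32(value_len);               /* illustrative */
	op.xattr.cmp_op = CEPH_OSD_CMPXATTR_OP_EQ;         /* succeed only on match */
	op.xattr.cmp_mode = CEPH_OSD_CMPXATTR_MODE_STRING; /* compare as a string */
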
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index d5114db70453..c0b26b6badba 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -512,7 +512,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
512 struct ceph_cap_snap *capsnap) 512 struct ceph_cap_snap *capsnap)
513{ 513{
514 struct inode *inode = &ci->vfs_inode; 514 struct inode *inode = &ci->vfs_inode;
515 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 515 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
516 516
517 BUG_ON(capsnap->writing); 517 BUG_ON(capsnap->writing);
518 capsnap->size = inode->i_size; 518 capsnap->size = inode->i_size;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 110857ba9269..7c663d9b9f81 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -8,14 +8,11 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/mount.h> 9#include <linux/mount.h>
10#include <linux/parser.h> 10#include <linux/parser.h>
11#include <linux/rwsem.h>
12#include <linux/sched.h> 11#include <linux/sched.h>
13#include <linux/seq_file.h> 12#include <linux/seq_file.h>
14#include <linux/slab.h> 13#include <linux/slab.h>
15#include <linux/statfs.h> 14#include <linux/statfs.h>
16#include <linux/string.h> 15#include <linux/string.h>
17#include <linux/version.h>
18#include <linux/vmalloc.h>
19 16
20#include "decode.h" 17#include "decode.h"
21#include "super.h" 18#include "super.h"
@@ -107,12 +104,40 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
107static int ceph_syncfs(struct super_block *sb, int wait) 104static int ceph_syncfs(struct super_block *sb, int wait)
108{ 105{
109 dout("sync_fs %d\n", wait); 106 dout("sync_fs %d\n", wait);
110 ceph_osdc_sync(&ceph_client(sb)->osdc); 107 ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
111 ceph_mdsc_sync(&ceph_client(sb)->mdsc); 108 ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
112 dout("sync_fs %d done\n", wait); 109 dout("sync_fs %d done\n", wait);
113 return 0; 110 return 0;
114} 111}
115 112
113static int default_congestion_kb(void)
114{
115 int congestion_kb;
116
117 /*
118 * Copied from NFS
119 *
120 * congestion size, scale with available memory.
121 *
122 * 64MB: 8192k
123 * 128MB: 11585k
124 * 256MB: 16384k
125 * 512MB: 23170k
126 * 1GB: 32768k
127 * 2GB: 46340k
128 * 4GB: 65536k
129 * 8GB: 92681k
130 * 16GB: 131072k
131 *
132 * This allows larger machines to have larger/more transfers.
133 * Limit the default to 256M
134 */
135 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
136 if (congestion_kb > 256*1024)
137 congestion_kb = 256*1024;
138
139 return congestion_kb;
140}
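[Editor's note] As a worked example of the formula just above: with 4 KB pages, a 1 GB machine has totalram_pages of about 262144, int_sqrt gives 512, and (16*512) << (12-10) = 32768 kB, matching the 1GB row in the table.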
116 141
117/** 142/**
118 * ceph_show_options - Show mount options in /proc/mounts 143 * ceph_show_options - Show mount options in /proc/mounts
@@ -138,6 +163,35 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
138 seq_puts(m, ",nocrc"); 163 seq_puts(m, ",nocrc");
139 if (args->flags & CEPH_OPT_NOASYNCREADDIR) 164 if (args->flags & CEPH_OPT_NOASYNCREADDIR)
140 seq_puts(m, ",noasyncreaddir"); 165 seq_puts(m, ",noasyncreaddir");
166
167 if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
168 seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
169 if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
170 seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
171 if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
172 seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
173 if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
174 seq_printf(m, ",osdkeepalivetimeout=%d",
175 args->osd_keepalive_timeout);
176 if (args->wsize)
177 seq_printf(m, ",wsize=%d", args->wsize);
178 if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
179 seq_printf(m, ",rsize=%d", args->rsize);
180 if (args->congestion_kb != default_congestion_kb())
181 seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
182 if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
183 seq_printf(m, ",caps_wanted_delay_min=%d",
184 args->caps_wanted_delay_min);
185 if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
186 seq_printf(m, ",caps_wanted_delay_max=%d",
187 args->caps_wanted_delay_max);
188 if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
189 seq_printf(m, ",cap_release_safety=%d",
190 args->cap_release_safety);
191 if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
192 seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
193 if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
194 seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
141 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) 195 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
142 seq_printf(m, ",snapdirname=%s", args->snapdir_name); 196 seq_printf(m, ",snapdirname=%s", args->snapdir_name);
143 if (args->name) 197 if (args->name)
@@ -161,35 +215,6 @@ static void ceph_inode_init_once(void *foo)
161 inode_init_once(&ci->vfs_inode); 215 inode_init_once(&ci->vfs_inode);
162} 216}
163 217
164static int default_congestion_kb(void)
165{
166 int congestion_kb;
167
168 /*
169 * Copied from NFS
170 *
171 * congestion size, scale with available memory.
172 *
173 * 64MB: 8192k
174 * 128MB: 11585k
175 * 256MB: 16384k
176 * 512MB: 23170k
177 * 1GB: 32768k
178 * 2GB: 46340k
179 * 4GB: 65536k
180 * 8GB: 92681k
181 * 16GB: 131072k
182 *
183 * This allows larger machines to have larger/more transfers.
184 * Limit the default to 256M
185 */
186 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
187 if (congestion_kb > 256*1024)
188 congestion_kb = 256*1024;
189
190 return congestion_kb;
191}
192
193static int __init init_caches(void) 218static int __init init_caches(void)
194{ 219{
195 ceph_inode_cachep = kmem_cache_create("ceph_inode_info", 220 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
@@ -308,7 +333,9 @@ enum {
308 Opt_osd_idle_ttl, 333 Opt_osd_idle_ttl,
309 Opt_caps_wanted_delay_min, 334 Opt_caps_wanted_delay_min,
310 Opt_caps_wanted_delay_max, 335 Opt_caps_wanted_delay_max,
336 Opt_cap_release_safety,
311 Opt_readdir_max_entries, 337 Opt_readdir_max_entries,
338 Opt_readdir_max_bytes,
312 Opt_congestion_kb, 339 Opt_congestion_kb,
313 Opt_last_int, 340 Opt_last_int,
314 /* int args above */ 341 /* int args above */
@@ -339,7 +366,9 @@ static match_table_t arg_tokens = {
339 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, 366 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
340 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, 367 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
341 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, 368 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
369 {Opt_cap_release_safety, "cap_release_safety=%d"},
342 {Opt_readdir_max_entries, "readdir_max_entries=%d"}, 370 {Opt_readdir_max_entries, "readdir_max_entries=%d"},
371 {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
343 {Opt_congestion_kb, "write_congestion_kb=%d"}, 372 {Opt_congestion_kb, "write_congestion_kb=%d"},
344 /* int args above */ 373 /* int args above */
345 {Opt_snapdirname, "snapdirname=%s"}, 374 {Opt_snapdirname, "snapdirname=%s"},
@@ -388,8 +417,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
388 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; 417 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
389 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT; 418 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
390 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); 419 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
391 args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4; 420 args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
392 args->max_readdir = 1024; 421 args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
422 args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
393 args->congestion_kb = default_congestion_kb(); 423 args->congestion_kb = default_congestion_kb();
394 424
395 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ 425 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
@@ -497,6 +527,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
497 case Opt_readdir_max_entries: 527 case Opt_readdir_max_entries:
498 args->max_readdir = intval; 528 args->max_readdir = intval;
499 break; 529 break;
530 case Opt_readdir_max_bytes:
531 args->max_readdir_bytes = intval;
532 break;
500 case Opt_congestion_kb: 533 case Opt_congestion_kb:
501 args->congestion_kb = intval; 534 args->congestion_kb = intval;
502 break; 535 break;
@@ -682,9 +715,10 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
682/* 715/*
683 * true if we have the mon map (and have thus joined the cluster) 716 * true if we have the mon map (and have thus joined the cluster)
684 */ 717 */
685static int have_mon_map(struct ceph_client *client) 718static int have_mon_and_osd_map(struct ceph_client *client)
686{ 719{
687 return client->monc.monmap && client->monc.monmap->epoch; 720 return client->monc.monmap && client->monc.monmap->epoch &&
721 client->osdc.osdmap && client->osdc.osdmap->epoch;
688} 722}
689 723
690/* 724/*
@@ -762,7 +796,7 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
762 if (err < 0) 796 if (err < 0)
763 goto out; 797 goto out;
764 798
765 while (!have_mon_map(client)) { 799 while (!have_mon_and_osd_map(client)) {
766 err = -EIO; 800 err = -EIO;
767 if (timeout && time_after_eq(jiffies, started + timeout)) 801 if (timeout && time_after_eq(jiffies, started + timeout))
768 goto out; 802 goto out;
@@ -770,8 +804,8 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
770 /* wait */ 804 /* wait */
771 dout("mount waiting for mon_map\n"); 805 dout("mount waiting for mon_map\n");
772 err = wait_event_interruptible_timeout(client->auth_wq, 806 err = wait_event_interruptible_timeout(client->auth_wq,
773 have_mon_map(client) || (client->auth_err < 0), 807 have_mon_and_osd_map(client) || (client->auth_err < 0),
774 timeout); 808 timeout);
775 if (err == -EINTR || err == -ERESTARTSYS) 809 if (err == -EINTR || err == -ERESTARTSYS)
776 goto out; 810 goto out;
777 if (client->auth_err < 0) { 811 if (client->auth_err < 0) {
@@ -884,6 +918,8 @@ static int ceph_compare_super(struct super_block *sb, void *data)
884/* 918/*
885 * construct our own bdi so we can control readahead, etc. 919 * construct our own bdi so we can control readahead, etc.
886 */ 920 */
921static atomic_long_t bdi_seq = ATOMIC_INIT(0);
922
887static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) 923static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
888{ 924{
889 int err; 925 int err;
@@ -893,7 +929,8 @@ static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
893 client->backing_dev_info.ra_pages = 929 client->backing_dev_info.ra_pages =
894 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1) 930 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
895 >> PAGE_SHIFT; 931 >> PAGE_SHIFT;
896 err = bdi_register_dev(&client->backing_dev_info, sb->s_dev); 932 err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d",
933 atomic_long_inc_return(&bdi_seq));
897 if (!err) 934 if (!err)
898 sb->s_bdi = &client->backing_dev_info; 935 sb->s_bdi = &client->backing_dev_info;
899 return err; 936 return err;
@@ -932,9 +969,9 @@ static int ceph_get_sb(struct file_system_type *fs_type,
932 goto out; 969 goto out;
933 } 970 }
934 971
935 if (ceph_client(sb) != client) { 972 if (ceph_sb_to_client(sb) != client) {
936 ceph_destroy_client(client); 973 ceph_destroy_client(client);
937 client = ceph_client(sb); 974 client = ceph_sb_to_client(sb);
938 dout("get_sb got existing client %p\n", client); 975 dout("get_sb got existing client %p\n", client);
939 } else { 976 } else {
940 dout("get_sb using new client %p\n", client); 977 dout("get_sb using new client %p\n", client);
@@ -952,8 +989,7 @@ static int ceph_get_sb(struct file_system_type *fs_type,
952 989
953out_splat: 990out_splat:
954 ceph_mdsc_close_sessions(&client->mdsc); 991 ceph_mdsc_close_sessions(&client->mdsc);
955 up_write(&sb->s_umount); 992 deactivate_locked_super(sb);
956 deactivate_super(sb);
957 goto out_final; 993 goto out_final;
958 994
959out: 995out:
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 13513b80d87f..3725c9ee9d08 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -52,24 +52,25 @@
52 52
53struct ceph_mount_args { 53struct ceph_mount_args {
54 int sb_flags; 54 int sb_flags;
55 int flags;
56 struct ceph_fsid fsid;
57 struct ceph_entity_addr my_addr;
55 int num_mon; 58 int num_mon;
56 struct ceph_entity_addr *mon_addr; 59 struct ceph_entity_addr *mon_addr;
57 int flags;
58 int mount_timeout; 60 int mount_timeout;
59 int osd_idle_ttl; 61 int osd_idle_ttl;
60 int caps_wanted_delay_min, caps_wanted_delay_max;
61 struct ceph_fsid fsid;
62 struct ceph_entity_addr my_addr;
63 int wsize;
64 int rsize; /* max readahead */
65 int max_readdir; /* max readdir size */
66 int congestion_kb; /* max readdir size */
67 int osd_timeout; 62 int osd_timeout;
68 int osd_keepalive_timeout; 63 int osd_keepalive_timeout;
64 int wsize;
65 int rsize; /* max readahead */
66 int congestion_kb; /* max writeback in flight */
67 int caps_wanted_delay_min, caps_wanted_delay_max;
68 int cap_release_safety;
 69 int max_readdir; /* max readdir result (entries) */
70 int max_readdir_bytes; /* max readdir result (bytes) */
69 char *snapdir_name; /* default ".snap" */ 71 char *snapdir_name; /* default ".snap" */
70 char *name; 72 char *name;
71 char *secret; 73 char *secret;
72 int cap_release_safety;
73}; 74};
74 75
75/* 76/*
@@ -80,13 +81,14 @@ struct ceph_mount_args {
80#define CEPH_OSD_KEEPALIVE_DEFAULT 5 81#define CEPH_OSD_KEEPALIVE_DEFAULT 5
81#define CEPH_OSD_IDLE_TTL_DEFAULT 60 82#define CEPH_OSD_IDLE_TTL_DEFAULT 60
82#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */ 83#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
84#define CEPH_MAX_READDIR_DEFAULT 1024
85#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
83 86
84#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) 87#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
85#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) 88#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
86 89
87#define CEPH_SNAPDIRNAME_DEFAULT ".snap" 90#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
88#define CEPH_AUTH_NAME_DEFAULT "guest" 91#define CEPH_AUTH_NAME_DEFAULT "guest"
89
90/* 92/*
91 * Delay telling the MDS we no longer want caps, in case we reopen 93 * Delay telling the MDS we no longer want caps, in case we reopen
92 * the file. Delay a minimum amount of time, even if we send a cap 94 * the file. Delay a minimum amount of time, even if we send a cap
@@ -96,6 +98,7 @@ struct ceph_mount_args {
96#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ 98#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
97#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ 99#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
98 100
101#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4)
99 102
100/* mount state */ 103/* mount state */
101enum { 104enum {
@@ -160,12 +163,6 @@ struct ceph_client {
160#endif 163#endif
161}; 164};
162 165
163static inline struct ceph_client *ceph_client(struct super_block *sb)
164{
165 return sb->s_fs_info;
166}
167
168
169/* 166/*
170 * File i/o capability. This tracks shared state with the metadata 167 * File i/o capability. This tracks shared state with the metadata
171 * server that allows us to cache or writeback attributes or to read 168 * server that allows us to cache or writeback attributes or to read
@@ -871,6 +868,7 @@ extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
871extern void ceph_dentry_lru_add(struct dentry *dn); 868extern void ceph_dentry_lru_add(struct dentry *dn);
872extern void ceph_dentry_lru_touch(struct dentry *dn); 869extern void ceph_dentry_lru_touch(struct dentry *dn);
873extern void ceph_dentry_lru_del(struct dentry *dn); 870extern void ceph_dentry_lru_del(struct dentry *dn);
871extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
874 872
875/* 873/*
876 * our d_ops vary depending on whether the inode is live, 874 * our d_ops vary depending on whether the inode is live,
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 2845422907fc..68aeebc69681 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -7,7 +7,8 @@
7 7
8static bool ceph_is_valid_xattr(const char *name) 8static bool ceph_is_valid_xattr(const char *name)
9{ 9{
10 return !strncmp(name, XATTR_SECURITY_PREFIX, 10 return !strncmp(name, "ceph.", 5) ||
11 !strncmp(name, XATTR_SECURITY_PREFIX,
11 XATTR_SECURITY_PREFIX_LEN) || 12 XATTR_SECURITY_PREFIX_LEN) ||
12 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || 13 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
13 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); 14 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
@@ -76,14 +77,14 @@ static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
76} 77}
77 78
78static struct ceph_vxattr_cb ceph_dir_vxattrs[] = { 79static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
79 { true, "user.ceph.dir.entries", ceph_vxattrcb_entries}, 80 { true, "ceph.dir.entries", ceph_vxattrcb_entries},
80 { true, "user.ceph.dir.files", ceph_vxattrcb_files}, 81 { true, "ceph.dir.files", ceph_vxattrcb_files},
81 { true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs}, 82 { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs},
82 { true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries}, 83 { true, "ceph.dir.rentries", ceph_vxattrcb_rentries},
83 { true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles}, 84 { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles},
84 { true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, 85 { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
85 { true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes}, 86 { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes},
86 { true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime}, 87 { true, "ceph.dir.rctime", ceph_vxattrcb_rctime},
87 { true, NULL, NULL } 88 { true, NULL, NULL }
88}; 89};
89 90
@@ -107,7 +108,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
107} 108}
108 109
109static struct ceph_vxattr_cb ceph_file_vxattrs[] = { 110static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
110 { true, "user.ceph.layout", ceph_vxattrcb_layout}, 111 { true, "ceph.layout", ceph_vxattrcb_layout},
111 { NULL, NULL } 112 { NULL, NULL }
112}; 113};
113 114
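
[Editor's note] With the prefix change, the virtual xattrs are now visible to userspace under "ceph." rather than "user.ceph.". A hedged userspace sketch of reading one; the mount path and buffer size are illustrative:

	#include <stdio.h>
	#include <sys/types.h>
	#include <sys/xattr.h>

	/* Read the ceph.dir.entries vxattr for a directory on a ceph mount. */
	static void show_dir_entries(const char *path)
	{
		char buf[64];
		ssize_t n = getxattr(path, "ceph.dir.entries",
				     buf, sizeof(buf) - 1);

		if (n >= 0) {
			buf[n] = '\0';
			printf("%s: %s entries\n", path, buf);
		}
	}
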
@@ -186,12 +187,6 @@ static int __set_xattr(struct ceph_inode_info *ci,
186 ci->i_xattrs.names_size -= xattr->name_len; 187 ci->i_xattrs.names_size -= xattr->name_len;
187 ci->i_xattrs.vals_size -= xattr->val_len; 188 ci->i_xattrs.vals_size -= xattr->val_len;
188 } 189 }
189 if (!xattr) {
190 pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s=%s\n",
191 &ci->vfs_inode, ceph_vinop(&ci->vfs_inode), name,
192 xattr->val);
193 return -ENOMEM;
194 }
195 ci->i_xattrs.names_size += name_len; 190 ci->i_xattrs.names_size += name_len;
196 ci->i_xattrs.vals_size += val_len; 191 ci->i_xattrs.vals_size += val_len;
197 if (val) 192 if (val)
@@ -574,7 +569,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
574 ci->i_xattrs.version, ci->i_xattrs.index_version); 569 ci->i_xattrs.version, ci->i_xattrs.index_version);
575 570
576 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && 571 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
577 (ci->i_xattrs.index_version > ci->i_xattrs.version)) { 572 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
578 goto list_xattr; 573 goto list_xattr;
579 } else { 574 } else {
580 spin_unlock(&inode->i_lock); 575 spin_unlock(&inode->i_lock);
@@ -622,7 +617,7 @@ out:
622static int ceph_sync_setxattr(struct dentry *dentry, const char *name, 617static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
623 const char *value, size_t size, int flags) 618 const char *value, size_t size, int flags)
624{ 619{
625 struct ceph_client *client = ceph_client(dentry->d_sb); 620 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
626 struct inode *inode = dentry->d_inode; 621 struct inode *inode = dentry->d_inode;
627 struct ceph_inode_info *ci = ceph_inode(inode); 622 struct ceph_inode_info *ci = ceph_inode(inode);
628 struct inode *parent_inode = dentry->d_parent->d_inode; 623 struct inode *parent_inode = dentry->d_parent->d_inode;
@@ -641,7 +636,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
641 return -ENOMEM; 636 return -ENOMEM;
642 err = -ENOMEM; 637 err = -ENOMEM;
643 for (i = 0; i < nr_pages; i++) { 638 for (i = 0; i < nr_pages; i++) {
644 pages[i] = alloc_page(GFP_NOFS); 639 pages[i] = __page_cache_alloc(GFP_NOFS);
645 if (!pages[i]) { 640 if (!pages[i]) {
646 nr_pages = i; 641 nr_pages = i;
647 goto out; 642 goto out;
@@ -779,7 +774,7 @@ out:
779 774
780static int ceph_send_removexattr(struct dentry *dentry, const char *name) 775static int ceph_send_removexattr(struct dentry *dentry, const char *name)
781{ 776{
782 struct ceph_client *client = ceph_client(dentry->d_sb); 777 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
783 struct ceph_mds_client *mdsc = &client->mdsc; 778 struct ceph_mds_client *mdsc = &client->mdsc;
784 struct inode *inode = dentry->d_inode; 779 struct inode *inode = dentry->d_inode;
785 struct inode *parent_inode = dentry->d_parent->d_inode; 780 struct inode *parent_inode = dentry->d_parent->d_inode;
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 4c813f2cdc52..7196077b1688 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -217,7 +217,7 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
217 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); 217 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
218 host_file = cfi->cfi_container; 218 host_file = cfi->cfi_container;
219 219
220 err = vfs_fsync(host_file, host_file->f_path.dentry, datasync); 220 err = vfs_fsync(host_file, datasync);
221 if ( !err && !datasync ) { 221 if ( !err && !datasync ) {
222 lock_kernel(); 222 lock_kernel();
223 err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); 223 err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 773f2ce9aa06..ca25d96d45c9 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Pioctl operations for Coda. 2 * Pioctl operations for Coda.
3 * Original version: (C) 1996 Peter Braam 3 * Original version: (C) 1996 Peter Braam
4 * Rewritten for Linux 2.1: (C) 1997 Carnegie Mellon University 4 * Rewritten for Linux 2.1: (C) 1997 Carnegie Mellon University
5 * 5 *
6 * Carnegie Mellon encourages users of this code to contribute improvements 6 * Carnegie Mellon encourages users of this code to contribute improvements
@@ -23,21 +23,22 @@
23#include <linux/coda_fs_i.h> 23#include <linux/coda_fs_i.h>
24#include <linux/coda_psdev.h> 24#include <linux/coda_psdev.h>
25 25
26#include <linux/smp_lock.h>
27
26/* pioctl ops */ 28/* pioctl ops */
27static int coda_ioctl_permission(struct inode *inode, int mask); 29static int coda_ioctl_permission(struct inode *inode, int mask);
28static int coda_pioctl(struct inode * inode, struct file * filp, 30static long coda_pioctl(struct file *filp, unsigned int cmd,
29 unsigned int cmd, unsigned long user_data); 31 unsigned long user_data);
30 32
31/* exported from this file */ 33/* exported from this file */
32const struct inode_operations coda_ioctl_inode_operations = 34const struct inode_operations coda_ioctl_inode_operations = {
33{
34 .permission = coda_ioctl_permission, 35 .permission = coda_ioctl_permission,
35 .setattr = coda_setattr, 36 .setattr = coda_setattr,
36}; 37};
37 38
38const struct file_operations coda_ioctl_operations = { 39const struct file_operations coda_ioctl_operations = {
39 .owner = THIS_MODULE, 40 .owner = THIS_MODULE,
40 .ioctl = coda_pioctl, 41 .unlocked_ioctl = coda_pioctl,
41}; 42};
42 43
43/* the coda pioctl inode ops */ 44/* the coda pioctl inode ops */
@@ -46,48 +47,53 @@ static int coda_ioctl_permission(struct inode *inode, int mask)
46 return (mask & MAY_EXEC) ? -EACCES : 0; 47 return (mask & MAY_EXEC) ? -EACCES : 0;
47} 48}
48 49
49static int coda_pioctl(struct inode * inode, struct file * filp, 50static long coda_pioctl(struct file *filp, unsigned int cmd,
50 unsigned int cmd, unsigned long user_data) 51 unsigned long user_data)
51{ 52{
52 struct path path; 53 struct path path;
53 int error; 54 int error;
54 struct PioctlData data; 55 struct PioctlData data;
55 struct inode *target_inode = NULL; 56 struct inode *inode = filp->f_dentry->d_inode;
56 struct coda_inode_info *cnp; 57 struct inode *target_inode = NULL;
58 struct coda_inode_info *cnp;
57 59
58 /* get the Pioctl data arguments from user space */ 60 lock_kernel();
59 if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) { 61
60 return -EINVAL; 62 /* get the Pioctl data arguments from user space */
61 } 63 if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) {
62 64 error = -EINVAL;
63 /* 65 goto out;
64 * Look up the pathname. Note that the pathname is in
65 * user memory, and namei takes care of this
66 */
67 if (data.follow) {
68 error = user_path(data.path, &path);
69 } else {
70 error = user_lpath(data.path, &path);
71 } 66 }
72 67
73 if ( error ) { 68 /*
74 return error; 69 * Look up the pathname. Note that the pathname is in
75 } else { 70 * user memory, and namei takes care of this
71 */
72 if (data.follow)
73 error = user_path(data.path, &path);
74 else
75 error = user_lpath(data.path, &path);
76
77 if (error)
78 goto out;
79 else
76 target_inode = path.dentry->d_inode; 80 target_inode = path.dentry->d_inode;
77 } 81
78
79 /* return if it is not a Coda inode */ 82 /* return if it is not a Coda inode */
80 if ( target_inode->i_sb != inode->i_sb ) { 83 if (target_inode->i_sb != inode->i_sb) {
81 path_put(&path); 84 path_put(&path);
82 return -EINVAL; 85 error = -EINVAL;
86 goto out;
83 } 87 }
84 88
85 /* now proceed to make the upcall */ 89 /* now proceed to make the upcall */
86 cnp = ITOC(target_inode); 90 cnp = ITOC(target_inode);
87 91
88 error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data); 92 error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data);
89 93
90 path_put(&path); 94 path_put(&path);
91 return error;
92}
93 95
96out:
97 unlock_kernel();
98 return error;
99}
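
[Editor's note] The pioctl conversion above follows the standard .ioctl to .unlocked_ioctl pushdown: the handler now takes the BKL itself instead of relying on the VFS to hold it, and derives the inode from the file pointer; the same signature change is applied to coda_psdev_ioctl below. A minimal sketch of the pattern, with hypothetical handler names:

	/* Generic shape of the BKL pushdown used above; names are illustrative. */
	static long example_unlocked_ioctl(struct file *filp, unsigned int cmd,
					   unsigned long arg)
	{
		long ret;

		lock_kernel();             /* previously taken by the old .ioctl path */
		ret = example_do_ioctl(filp, cmd, arg);   /* hypothetical helper */
		unlock_kernel();
		return ret;
	}
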
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index be4392ca2098..66b9cf79c5ba 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -73,8 +73,7 @@ static unsigned int coda_psdev_poll(struct file *file, poll_table * wait)
 	return mask;
 }
 
-static int coda_psdev_ioctl(struct inode * inode, struct file * filp,
-			    unsigned int cmd, unsigned long arg)
+static long coda_psdev_ioctl(struct file * filp, unsigned int cmd, unsigned long arg)
 {
 	unsigned int data;
 
@@ -344,7 +343,7 @@ static const struct file_operations coda_psdev_fops = {
 	.read		= coda_psdev_read,
 	.write		= coda_psdev_write,
 	.poll		= coda_psdev_poll,
-	.ioctl		= coda_psdev_ioctl,
+	.unlocked_ioctl	= coda_psdev_ioctl,
 	.open		= coda_psdev_open,
 	.release	= coda_psdev_release,
 };
diff --git a/fs/dcache.c b/fs/dcache.c
index f1358e5c3a59..d96047b4a633 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -536,7 +536,7 @@ restart:
  */
 static void prune_dcache(int count)
 {
-	struct super_block *sb;
+	struct super_block *sb, *n;
 	int w_count;
 	int unused = dentry_stat.nr_unused;
 	int prune_ratio;
@@ -545,13 +545,14 @@ static void prune_dcache(int count)
 	if (unused == 0 || count == 0)
 		return;
 	spin_lock(&dcache_lock);
-restart:
 	if (count >= unused)
 		prune_ratio = 1;
 	else
 		prune_ratio = unused / count;
 	spin_lock(&sb_lock);
-	list_for_each_entry(sb, &super_blocks, s_list) {
+	list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
+		if (list_empty(&sb->s_instances))
+			continue;
 		if (sb->s_nr_dentry_unused == 0)
 			continue;
 		sb->s_count++;
@@ -590,14 +591,10 @@ restart:
 		}
 		spin_lock(&sb_lock);
 		count -= pruned;
-		/*
-		 * restart only when sb is no longer on the list and
-		 * we have more work to do.
-		 */
-		if (__put_super_and_need_restart(sb) && count > 0) {
-			spin_unlock(&sb_lock);
-			goto restart;
-		}
+		__put_super(sb);
+		/* more work left to do? */
+		if (count <= 0)
+			break;
 	}
 	spin_unlock(&sb_lock);
 	spin_unlock(&dcache_lock);
@@ -1529,6 +1526,7 @@ void d_delete(struct dentry * dentry)
 	spin_lock(&dentry->d_lock);
 	isdir = S_ISDIR(dentry->d_inode->i_mode);
 	if (atomic_read(&dentry->d_count) == 1) {
+		dentry->d_flags &= ~DCACHE_CANT_MOUNT;
 		dentry_iput(dentry);
 		fsnotify_nameremove(dentry, isdir);
 		return;
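
The prune_dcache() change replaces the restart-from-the-top pattern with the safe-iterator idiom: __put_super() can only remove sb itself from super_blocks, so list_for_each_entry_safe() keeps a valid next pointer and the walk simply continues, skipping superblocks whose s_instances list is already empty (i.e. ones being torn down). A sketch of the idiom under those assumptions:

	struct super_block *sb, *n;

	spin_lock(&sb_lock);
	list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
		if (list_empty(&sb->s_instances))
			continue;		/* already dying, skip */
		sb->s_count++;			/* pin sb across unlocked work */
		spin_unlock(&sb_lock);
		/* ... prune this superblock's unused dentries ... */
		spin_lock(&sb_lock);
		__put_super(sb);		/* may free sb; 'n' stays valid */
	}
	spin_unlock(&sb_lock);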
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 0120247b41c0..8b3ffd5b5235 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -384,18 +384,15 @@ static int devpts_get_sb(struct file_system_type *fs_type,
 		s->s_flags |= MS_ACTIVE;
 	}
 
-	simple_set_mnt(mnt, s);
-
 	memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts));
 
 	error = mknod_ptmx(s);
 	if (error)
-		goto out_dput;
+		goto out_undo_sget;
+
+	simple_set_mnt(mnt, s);
 
 	return 0;
 
-out_dput:
-	dput(s->s_root);	/* undo dget() in simple_set_mnt() */
-
 out_undo_sget:
 	deactivate_locked_super(s);
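
The devpts hunk is a publish-last reordering: simple_set_mnt() takes a dentry reference and exposes the superblock through the vfsmount, so calling it before mknod_ptmx() forced the error path to undo that with dput(). Doing every fallible step first and publishing only on success leaves a single unwind label. Schematically (a sketch, not the full function):

	error = mknod_ptmx(s);		/* all fallible setup first */
	if (error)
		goto out_undo_sget;	/* nothing published yet */

	simple_set_mnt(mnt, s);		/* publish last */
	return 0;

out_undo_sget:
	deactivate_locked_super(s);	/* one simple unwind */
	return error;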
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 17903b491298..031dbe3a15ca 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -733,10 +733,7 @@ static void lkb_add_ordered(struct list_head *new, struct list_head *head,
 		if (lkb->lkb_rqmode < mode)
 			break;
 
-	if (!lkb)
-		list_add_tail(new, head);
-	else
-		__list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
+	__list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
 }
 
 /* add/remove lkb to rsb's grant/convert/wait queue */
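
The removed "if (!lkb)" branch was dead code: a list_for_each_entry()-style scan that runs off the end leaves the cursor's embedded node aliasing the list head, so inserting before it is exactly list_add_tail(). A compilable userspace model of the same ordered insert (simplified list primitives, not the kernel's):

#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *prev, *next; };

static void __list_add(struct list_head *n,
		       struct list_head *prev, struct list_head *next)
{
	next->prev = n; n->next = next;
	n->prev = prev; prev->next = n;
}

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct lkb { int rqmode; struct list_head node; };

/* insert so the list stays sorted by descending rqmode */
static void add_ordered(struct lkb *new_lkb, struct list_head *head)
{
	struct list_head *p;

	for (p = head->next; p != head; p = p->next)
		if (container_of(p, struct lkb, node)->rqmode <
		    new_lkb->rqmode)
			break;
	/* p == head when we fell off the end: insert-before == add-tail */
	__list_add(&new_lkb->node, p->prev, p);
}

int main(void)
{
	struct list_head head = { &head, &head }, *p;
	struct lkb a = { 5 }, b = { 3 }, c = { 4 };

	add_ordered(&a, &head);
	add_ordered(&b, &head);
	add_ordered(&c, &head);
	for (p = head.next; p != &head; p = p->next)
		printf("%d\n", container_of(p, struct lkb, node)->rqmode);
	return 0;	/* prints 5, 4, 3 */
}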
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 8b6e73c47435..b6272853130c 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -215,6 +215,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
 	if (!ast_type) {
 		kref_get(&lkb->lkb_ref);
 		list_add_tail(&lkb->lkb_astqueue, &proc->asts);
+		lkb->lkb_ast_first = type;
 		wake_up_interruptible(&proc->wait);
 	}
 	if (type == AST_COMP && (ast_type & AST_COMP))
@@ -223,7 +224,6 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
 
 	eol = lkb_is_endoflife(lkb, ua->lksb.sb_status, type);
 	if (eol) {
-		lkb->lkb_ast_type &= ~AST_BAST;
 		lkb->lkb_flags |= DLM_IFL_ENDOFLIFE;
 	}
 
@@ -706,7 +706,7 @@ static int device_close(struct inode *inode, struct file *file)
 }
 
 static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
-			       int bmode, char __user *buf, size_t count)
+			       int mode, char __user *buf, size_t count)
 {
 #ifdef CONFIG_COMPAT
 	struct dlm_lock_result32 result32;
@@ -733,7 +733,7 @@ static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
 	if (type == AST_BAST) {
 		result.user_astaddr = ua->bastaddr;
 		result.user_astparam = ua->bastparam;
-		result.bast_mode = bmode;
+		result.bast_mode = mode;
 	} else {
 		result.user_astaddr = ua->castaddr;
 		result.user_astparam = ua->castparam;
@@ -801,7 +801,9 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
 	struct dlm_user_proc *proc = file->private_data;
 	struct dlm_lkb *lkb;
 	DECLARE_WAITQUEUE(wait, current);
-	int error, type=0, bmode=0, removed = 0;
+	int error = 0, removed;
+	int ret_type, ret_mode;
+	int bastmode, castmode, do_bast, do_cast;
 
 	if (count == sizeof(struct dlm_device_version)) {
 		error = copy_version_to_user(buf, count);
@@ -820,6 +822,8 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
 #endif
 		return -EINVAL;
 
+ try_another:
+
 	/* do we really need this? can a read happen after a close? */
 	if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))
 		return -EINVAL;
@@ -855,13 +859,55 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
 
 	lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue);
 
-	if (lkb->lkb_ast_type & AST_COMP) {
-		lkb->lkb_ast_type &= ~AST_COMP;
-		type = AST_COMP;
-	} else if (lkb->lkb_ast_type & AST_BAST) {
-		lkb->lkb_ast_type &= ~AST_BAST;
-		type = AST_BAST;
-		bmode = lkb->lkb_bastmode;
+	removed = 0;
+	ret_type = 0;
+	ret_mode = 0;
+	do_bast = lkb->lkb_ast_type & AST_BAST;
+	do_cast = lkb->lkb_ast_type & AST_COMP;
+	bastmode = lkb->lkb_bastmode;
+	castmode = lkb->lkb_castmode;
+
+	/* when both are queued figure out which to do first and
+	   switch first so the other goes in the next read */
+
+	if (do_cast && do_bast) {
+		if (lkb->lkb_ast_first == AST_COMP) {
+			ret_type = AST_COMP;
+			ret_mode = castmode;
+			lkb->lkb_ast_type &= ~AST_COMP;
+			lkb->lkb_ast_first = AST_BAST;
+		} else {
+			ret_type = AST_BAST;
+			ret_mode = bastmode;
+			lkb->lkb_ast_type &= ~AST_BAST;
+			lkb->lkb_ast_first = AST_COMP;
+		}
+	} else {
+		ret_type = lkb->lkb_ast_first;
+		ret_mode = (ret_type == AST_COMP) ? castmode : bastmode;
+		lkb->lkb_ast_type &= ~ret_type;
+		lkb->lkb_ast_first = 0;
+	}
+
+	/* if we're doing a bast but the bast is unnecessary, then
+	   switch to do nothing or do a cast if that was needed next */
+
+	if ((ret_type == AST_BAST) &&
+	    dlm_modes_compat(bastmode, lkb->lkb_castmode_done)) {
+		ret_type = 0;
+		ret_mode = 0;
+
+		if (do_cast) {
+			ret_type = AST_COMP;
+			ret_mode = castmode;
+			lkb->lkb_ast_type &= ~AST_COMP;
+			lkb->lkb_ast_first = 0;
+		}
+	}
+
+	if (lkb->lkb_ast_first != lkb->lkb_ast_type) {
+		log_print("device_read %x ast_first %x ast_type %x",
+			  lkb->lkb_id, lkb->lkb_ast_first, lkb->lkb_ast_type);
 	}
 
 	if (!lkb->lkb_ast_type) {
@@ -870,15 +916,29 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
 	}
 	spin_unlock(&proc->asts_spin);
 
-	error = copy_result_to_user(lkb->lkb_ua,
-			test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
-			type, bmode, buf, count);
+	if (ret_type) {
+		error = copy_result_to_user(lkb->lkb_ua,
+				test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
+				ret_type, ret_mode, buf, count);
+
+		if (ret_type == AST_COMP)
+			lkb->lkb_castmode_done = castmode;
+		if (ret_type == AST_BAST)
+			lkb->lkb_bastmode_done = bastmode;
+	}
 
 	/* removes reference for the proc->asts lists added by
 	   dlm_user_add_ast() and may result in the lkb being freed */
+
 	if (removed)
 		dlm_put_lkb(lkb);
 
+	/* the bast that was queued was eliminated (see unnecessary above),
+	   leaving nothing to return */
+
+	if (!ret_type)
+		goto try_another;
+
 	return error;
 }
 
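
The lkb_ast_first bookkeeping above fixes ordering when a completion ast (AST_COMP) and a blocking ast (AST_BAST) are both queued on one lkb: the first-queued type is returned by this read and the flag flips so the next read returns the other. A compilable userspace model of just that arbitration (a sketch of the rule, not the kernel code):

#include <stdio.h>

#define AST_COMP 1
#define AST_BAST 2

struct lkb { int ast_type, ast_first; };

static int pick_ast(struct lkb *lkb)
{
	int ret;

	if ((lkb->ast_type & AST_COMP) && (lkb->ast_type & AST_BAST)) {
		ret = lkb->ast_first;			/* older one first */
		lkb->ast_type &= ~ret;
		lkb->ast_first = (ret == AST_COMP) ? AST_BAST : AST_COMP;
	} else {
		ret = lkb->ast_first;
		lkb->ast_type &= ~ret;
		lkb->ast_first = 0;
	}
	return ret;
}

int main(void)
{
	struct lkb lkb = { AST_COMP | AST_BAST, AST_COMP };

	/* bast queued after cast: reads return COMP (1) then BAST (2) */
	printf("%d %d\n", pick_ast(&lkb), pick_ast(&lkb));
	return 0;
}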
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 31f4b0e6d72c..83c4f600786a 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -12,7 +12,7 @@
 /* A global variable is a bit ugly, but it keeps the code simple */
 int sysctl_drop_caches;
 
-static void drop_pagecache_sb(struct super_block *sb)
+static void drop_pagecache_sb(struct super_block *sb, void *unused)
 {
 	struct inode *inode, *toput_inode = NULL;
 
@@ -33,26 +33,6 @@ static void drop_pagecache_sb(struct super_block *sb)
 	iput(toput_inode);
 }
 
-static void drop_pagecache(void)
-{
-	struct super_block *sb;
-
-	spin_lock(&sb_lock);
-restart:
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		sb->s_count++;
-		spin_unlock(&sb_lock);
-		down_read(&sb->s_umount);
-		if (sb->s_root)
-			drop_pagecache_sb(sb);
-		up_read(&sb->s_umount);
-		spin_lock(&sb_lock);
-		if (__put_super_and_need_restart(sb))
-			goto restart;
-	}
-	spin_unlock(&sb_lock);
-}
-
 static void drop_slab(void)
 {
 	int nr_objects;
@@ -68,7 +48,7 @@ int drop_caches_sysctl_handler(ctl_table *table, int write,
 	proc_dointvec_minmax(table, write, buffer, length, ppos);
 	if (write) {
 		if (sysctl_drop_caches & 1)
-			drop_pagecache();
+			iterate_supers(drop_pagecache_sb, NULL);
 		if (sysctl_drop_caches & 2)
 			drop_slab();
 	}
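
drop_pagecache() disappears because iterate_supers() now encapsulates exactly that walk (pin the superblock, drop sb_lock, take s_umount, invoke the callback, unpin); callers only supply a void (*f)(struct super_block *, void *) plus an opaque argument, which is why drop_pagecache_sb() grows the otherwise-unused void * parameter. Usage collapses to a one-liner:

	/* callback signature required by iterate_supers() */
	static void drop_pagecache_sb(struct super_block *sb, void *unused);

	/* the hand-rolled super_blocks loop becomes: */
	iterate_supers(drop_pagecache_sb, NULL);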
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index bfc2e0f78f00..0032a9f5a3a9 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -731,15 +731,14 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
 int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
 				      struct page *page_for_lower,
 				      size_t offset_in_page, size_t size);
-int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
-		   size_t size);
+int ecryptfs_write(struct inode *inode, char *data, loff_t offset, size_t size);
 int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
 			struct inode *ecryptfs_inode);
 int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
 				     pgoff_t page_index,
 				     size_t offset_in_page, size_t size,
 				     struct inode *ecryptfs_inode);
-struct page *ecryptfs_get_locked_page(struct file *file, loff_t index);
+struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index);
 int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon);
 int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid,
 				 struct user_namespace *user_ns);
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index e7440a6f5ebf..3bdddbcc785f 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -276,9 +276,7 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
 static int
 ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 {
-	return vfs_fsync(ecryptfs_file_to_lower(file),
-			 ecryptfs_dentry_to_lower(dentry),
-			 datasync);
+	return vfs_fsync(ecryptfs_file_to_lower(file), datasync);
 }
 
 static int ecryptfs_fasync(int fd, struct file *file, int flag)
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index e2d4418affac..65dee2f336ae 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -142,19 +142,10 @@ out:
 static int grow_file(struct dentry *ecryptfs_dentry)
 {
 	struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode;
-	struct file fake_file;
-	struct ecryptfs_file_info tmp_file_info;
 	char zero_virt[] = { 0x00 };
 	int rc = 0;
 
-	memset(&fake_file, 0, sizeof(fake_file));
-	fake_file.f_path.dentry = ecryptfs_dentry;
-	memset(&tmp_file_info, 0, sizeof(tmp_file_info));
-	ecryptfs_set_file_private(&fake_file, &tmp_file_info);
-	ecryptfs_set_file_lower(
-		&fake_file,
-		ecryptfs_inode_to_private(ecryptfs_inode)->lower_file);
-	rc = ecryptfs_write(&fake_file, zero_virt, 0, 1);
+	rc = ecryptfs_write(ecryptfs_inode, zero_virt, 0, 1);
 	i_size_write(ecryptfs_inode, 0);
 	rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode);
 	ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat.flags |=
@@ -784,8 +775,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
 {
 	int rc = 0;
 	struct inode *inode = dentry->d_inode;
-	struct dentry *lower_dentry;
-	struct file fake_ecryptfs_file;
 	struct ecryptfs_crypt_stat *crypt_stat;
 	loff_t i_size = i_size_read(inode);
 	loff_t lower_size_before_truncate;
@@ -796,23 +785,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
 		goto out;
 	}
 	crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
-	/* Set up a fake ecryptfs file, this is used to interface with
-	 * the file in the underlying filesystem so that the
-	 * truncation has an effect there as well. */
-	memset(&fake_ecryptfs_file, 0, sizeof(fake_ecryptfs_file));
-	fake_ecryptfs_file.f_path.dentry = dentry;
-	/* Released at out_free: label */
-	ecryptfs_set_file_private(&fake_ecryptfs_file,
-				  kmem_cache_alloc(ecryptfs_file_info_cache,
-						   GFP_KERNEL));
-	if (unlikely(!ecryptfs_file_to_private(&fake_ecryptfs_file))) {
-		rc = -ENOMEM;
-		goto out;
-	}
-	lower_dentry = ecryptfs_dentry_to_lower(dentry);
-	ecryptfs_set_file_lower(
-		&fake_ecryptfs_file,
-		ecryptfs_inode_to_private(dentry->d_inode)->lower_file);
 	/* Switch on growing or shrinking file */
 	if (ia->ia_size > i_size) {
 		char zero[] = { 0x00 };
@@ -822,7 +794,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
 		 * this triggers code that will fill in 0's throughout
 		 * the intermediate portion of the previous end of the
 		 * file and the new and of the file */
-		rc = ecryptfs_write(&fake_ecryptfs_file, zero,
+		rc = ecryptfs_write(inode, zero,
 				    (ia->ia_size - 1), 1);
 	} else { /* ia->ia_size < i_size_read(inode) */
 		/* We're chopping off all the pages down to the page
@@ -835,10 +807,10 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
 		if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
 			rc = vmtruncate(inode, ia->ia_size);
 			if (rc)
-				goto out_free;
+				goto out;
 			lower_ia->ia_size = ia->ia_size;
 			lower_ia->ia_valid |= ATTR_SIZE;
-			goto out_free;
+			goto out;
 		}
 		if (num_zeros) {
 			char *zeros_virt;
@@ -846,16 +818,16 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
 			zeros_virt = kzalloc(num_zeros, GFP_KERNEL);
 			if (!zeros_virt) {
 				rc = -ENOMEM;
-				goto out_free;
+				goto out;
 			}
-			rc = ecryptfs_write(&fake_ecryptfs_file, zeros_virt,
+			rc = ecryptfs_write(inode, zeros_virt,
 					    ia->ia_size, num_zeros);
 			kfree(zeros_virt);
 			if (rc) {
 				printk(KERN_ERR "Error attempting to zero out "
 				       "the remainder of the end page on "
 				       "reducing truncate; rc = [%d]\n", rc);
-				goto out_free;
+				goto out;
 			}
 		}
 		vmtruncate(inode, ia->ia_size);
@@ -864,7 +836,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
 			printk(KERN_ERR "Problem with "
 			       "ecryptfs_write_inode_size_to_metadata; "
 			       "rc = [%d]\n", rc);
-			goto out_free;
+			goto out;
 		}
 		/* We are reducing the size of the ecryptfs file, and need to
 		 * know if we need to reduce the size of the lower file. */
@@ -878,10 +850,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
 	} else
 		lower_ia->ia_valid &= ~ATTR_SIZE;
 	}
-out_free:
-	if (ecryptfs_file_to_private(&fake_ecryptfs_file))
-		kmem_cache_free(ecryptfs_file_info_cache,
-				ecryptfs_file_to_private(&fake_ecryptfs_file));
 out:
 	return rc;
 }
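
The theme of these ecryptfs hunks (continued in main.c, mmap.c and read_write.c below) is that ecryptfs_write() and ecryptfs_get_locked_page() only ever needed the inode, so the on-stack struct file mock-ups, their kmem_cache-allocated private data, and the whole out_free unwind can be deleted. A caller now reads, schematically:

	/* no fake struct file to assemble or free afterwards */
	rc = ecryptfs_write(inode, zeros_virt, ia->ia_size, num_zeros);
	if (rc)
		goto out;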
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 760983d0f25e..cbd4e18adb20 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -281,7 +281,7 @@ static void ecryptfs_init_mount_crypt_stat(
  *
  * Returns zero on success; non-zero on error
  */
-static int ecryptfs_parse_options(struct super_block *sb, char *options)
+static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options)
 {
 	char *p;
 	int rc = 0;
@@ -293,7 +293,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
 	int fn_cipher_key_bytes;
 	int fn_cipher_key_bytes_set = 0;
 	struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
-		&ecryptfs_superblock_to_private(sb)->mount_crypt_stat;
+		&sbi->mount_crypt_stat;
 	substring_t args[MAX_OPT_ARGS];
 	int token;
 	char *sig_src;
@@ -483,68 +483,7 @@ out:
 }
 
 struct kmem_cache *ecryptfs_sb_info_cache;
-
-/**
- * ecryptfs_fill_super
- * @sb: The ecryptfs super block
- * @raw_data: The options passed to mount
- * @silent: Not used but required by function prototype
- *
- * Sets up what we can of the sb, rest is done in ecryptfs_read_super
- *
- * Returns zero on success; non-zero otherwise
- */
-static int
-ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent)
-{
-	struct ecryptfs_sb_info *esi;
-	int rc = 0;
-
-	/* Released in ecryptfs_put_super() */
-	ecryptfs_set_superblock_private(sb,
-			kmem_cache_zalloc(ecryptfs_sb_info_cache,
-					  GFP_KERNEL));
-	esi = ecryptfs_superblock_to_private(sb);
-	if (!esi) {
-		ecryptfs_printk(KERN_WARNING, "Out of memory\n");
-		rc = -ENOMEM;
-		goto out;
-	}
-
-	rc = bdi_setup_and_register(&esi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
-	if (rc)
-		goto out;
-
-	sb->s_bdi = &esi->bdi;
-	sb->s_op = &ecryptfs_sops;
-	/* Released through deactivate_super(sb) from get_sb_nodev */
-	sb->s_root = d_alloc(NULL, &(const struct qstr) {
-			     .hash = 0,.name = "/",.len = 1});
-	if (!sb->s_root) {
-		ecryptfs_printk(KERN_ERR, "d_alloc failed\n");
-		rc = -ENOMEM;
-		goto out;
-	}
-	sb->s_root->d_op = &ecryptfs_dops;
-	sb->s_root->d_sb = sb;
-	sb->s_root->d_parent = sb->s_root;
-	/* Released in d_release when dput(sb->s_root) is called */
-	/* through deactivate_super(sb) from get_sb_nodev() */
-	ecryptfs_set_dentry_private(sb->s_root,
-				    kmem_cache_zalloc(ecryptfs_dentry_info_cache,
-						      GFP_KERNEL));
-	if (!ecryptfs_dentry_to_private(sb->s_root)) {
-		ecryptfs_printk(KERN_ERR,
-				"dentry_info_cache alloc failed\n");
-		rc = -ENOMEM;
-		goto out;
-	}
-	rc = 0;
-out:
-	/* Should be able to rely on deactivate_super called from
-	 * get_sb_nodev */
-	return rc;
-}
+static struct file_system_type ecryptfs_fs_type;
 
 /**
  * ecryptfs_read_super
@@ -565,6 +504,13 @@ static int ecryptfs_read_super(struct super_block *sb, const char *dev_name)
 		ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n");
 		goto out;
 	}
+	if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) {
+		rc = -EINVAL;
+		printk(KERN_ERR "Mount on filesystem of type "
+			"eCryptfs explicitly disallowed due to "
+			"known incompatibilities\n");
+		goto out_free;
+	}
 	ecryptfs_set_superblock_lower(sb, path.dentry->d_sb);
 	sb->s_maxbytes = path.dentry->d_sb->s_maxbytes;
 	sb->s_blocksize = path.dentry->d_sb->s_blocksize;
@@ -588,11 +534,8 @@ out:
  * @dev_name: The path to mount over
  * @raw_data: The options passed into the kernel
  *
- * The whole ecryptfs_get_sb process is broken into 4 functions:
+ * The whole ecryptfs_get_sb process is broken into 3 functions:
  * ecryptfs_parse_options(): handle options passed to ecryptfs, if any
- * ecryptfs_fill_super(): used by get_sb_nodev, fills out the super_block
- *                        with as much information as it can before needing
- *                        the lower filesystem.
 * ecryptfs_read_super(): this accesses the lower filesystem and uses
 *                        ecryptfs_interpose to perform most of the linking
 * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c)
@@ -601,30 +544,78 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
 			const char *dev_name, void *raw_data,
 			struct vfsmount *mnt)
 {
+	struct super_block *s;
+	struct ecryptfs_sb_info *sbi;
+	struct ecryptfs_dentry_info *root_info;
+	const char *err = "Getting sb failed";
 	int rc;
-	struct super_block *sb;
 
-	rc = get_sb_nodev(fs_type, flags, raw_data, ecryptfs_fill_super, mnt);
-	if (rc < 0) {
-		printk(KERN_ERR "Getting sb failed; rc = [%d]\n", rc);
+	sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL);
+	if (!sbi) {
+		rc = -ENOMEM;
 		goto out;
 	}
-	sb = mnt->mnt_sb;
-	rc = ecryptfs_parse_options(sb, raw_data);
+
+	rc = ecryptfs_parse_options(sbi, raw_data);
 	if (rc) {
-		printk(KERN_ERR "Error parsing options; rc = [%d]\n", rc);
-		goto out_abort;
+		err = "Error parsing options";
+		goto out;
+	}
+
+	s = sget(fs_type, NULL, set_anon_super, NULL);
+	if (IS_ERR(s)) {
+		rc = PTR_ERR(s);
+		goto out;
 	}
-	rc = ecryptfs_read_super(sb, dev_name);
+
+	s->s_flags = flags;
+	rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
 	if (rc) {
-		printk(KERN_ERR "Reading sb failed; rc = [%d]\n", rc);
-		goto out_abort;
+		deactivate_locked_super(s);
+		goto out;
 	}
-	goto out;
-out_abort:
-	dput(sb->s_root); /* aka mnt->mnt_root, as set by get_sb_nodev() */
-	deactivate_locked_super(sb);
+
+	ecryptfs_set_superblock_private(s, sbi);
+	s->s_bdi = &sbi->bdi;
+
+	/* ->kill_sb() will take care of sbi after that point */
+	sbi = NULL;
+	s->s_op = &ecryptfs_sops;
+
+	rc = -ENOMEM;
+	s->s_root = d_alloc(NULL, &(const struct qstr) {
+			     .hash = 0,.name = "/",.len = 1});
+	if (!s->s_root) {
+		deactivate_locked_super(s);
+		goto out;
+	}
+	s->s_root->d_op = &ecryptfs_dops;
+	s->s_root->d_sb = s;
+	s->s_root->d_parent = s->s_root;
+
+	root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
+	if (!root_info) {
+		deactivate_locked_super(s);
+		goto out;
+	}
+	/* ->kill_sb() will take care of root_info */
+	ecryptfs_set_dentry_private(s->s_root, root_info);
+	s->s_flags |= MS_ACTIVE;
+	rc = ecryptfs_read_super(s, dev_name);
+	if (rc) {
+		deactivate_locked_super(s);
+		err = "Reading sb failed";
+		goto out;
+	}
+	simple_set_mnt(mnt, s);
+	return 0;
+
 out:
+	if (sbi) {
+		ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat);
+		kmem_cache_free(ecryptfs_sb_info_cache, sbi);
+	}
+	printk(KERN_ERR "%s; rc = [%d]\n", err, rc);
 	return rc;
 }
 
@@ -633,11 +624,16 @@ out:
  * @sb: The ecryptfs super block
  *
  * Used to bring the superblock down and free the private data.
- * Private data is free'd in ecryptfs_put_super()
 */
 static void ecryptfs_kill_block_super(struct super_block *sb)
 {
-	generic_shutdown_super(sb);
+	struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb);
+	kill_anon_super(sb);
+	if (!sb_info)
+		return;
+	ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
+	bdi_destroy(&sb_info->bdi);
+	kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
 }
 
 static struct file_system_type ecryptfs_fs_type = {
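
The rewritten ecryptfs_get_sb() is the open-coded replacement for get_sb_nodev(): allocate and parse the fs-private info before any superblock exists, create an anonymous superblock with sget(), and once the private pointer is installed let ->kill_sb() own all teardown, which is why ecryptfs_put_super() is removed in super.c below. A skeleton of the pattern, with hypothetical example_* helpers standing in for the eCryptfs specifics:

static int example_get_sb(struct file_system_type *fs_type, int flags,
			  void *raw_data, struct vfsmount *mnt)
{
	struct super_block *s;
	struct example_sb_info *sbi;
	int rc;

	sbi = kmem_cache_zalloc(example_sb_info_cache, GFP_KERNEL);
	if (!sbi)
		return -ENOMEM;
	rc = example_parse_options(sbi, raw_data);	/* sb-independent */
	if (rc)
		goto out_free;

	s = sget(fs_type, NULL, set_anon_super, NULL);
	if (IS_ERR(s)) {
		rc = PTR_ERR(s);
		goto out_free;
	}
	s->s_flags = flags;
	example_set_superblock_private(s, sbi);
	sbi = NULL;			/* ->kill_sb() owns it from here */

	rc = example_finish_setup(s);	/* root dentry, lower fs, ... */
	if (rc) {
		deactivate_locked_super(s);	/* runs ->kill_sb() */
		return rc;
	}
	s->s_flags |= MS_ACTIVE;
	simple_set_mnt(mnt, s);
	return 0;

out_free:
	kmem_cache_free(example_sb_info_cache, sbi);
	return rc;
}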
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 2ee9a3a7b68c..b1d82756544b 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -44,17 +44,9 @@
  * Returns locked and up-to-date page (if ok), with increased
  * refcnt.
  */
-struct page *ecryptfs_get_locked_page(struct file *file, loff_t index)
+struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index)
 {
-	struct dentry *dentry;
-	struct inode *inode;
-	struct address_space *mapping;
-	struct page *page;
-
-	dentry = file->f_path.dentry;
-	inode = dentry->d_inode;
-	mapping = inode->i_mapping;
-	page = read_mapping_page(mapping, index, (void *)file);
+	struct page *page = read_mapping_page(inode->i_mapping, index, NULL);
 	if (!IS_ERR(page))
 		lock_page(page);
 	return page;
@@ -198,7 +190,7 @@ out:
 static int ecryptfs_readpage(struct file *file, struct page *page)
 {
 	struct ecryptfs_crypt_stat *crypt_stat =
-		&ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat;
+		&ecryptfs_inode_to_private(page->mapping->host)->crypt_stat;
 	int rc = 0;
 
 	if (!crypt_stat
@@ -300,8 +292,7 @@ static int ecryptfs_write_begin(struct file *file,
 
 	if (!PageUptodate(page)) {
 		struct ecryptfs_crypt_stat *crypt_stat =
-			&ecryptfs_inode_to_private(
-				file->f_path.dentry->d_inode)->crypt_stat;
+			&ecryptfs_inode_to_private(mapping->host)->crypt_stat;
 
 		if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)
 		    || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) {
@@ -487,7 +478,7 @@ static int ecryptfs_write_end(struct file *file,
 	unsigned to = from + copied;
 	struct inode *ecryptfs_inode = mapping->host;
 	struct ecryptfs_crypt_stat *crypt_stat =
-		&ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat;
+		&ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
 	int rc;
 
 	if (crypt_stat->flags & ECRYPTFS_NEW_FILE) {
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 0cc4fafd6552..db184ef15d3d 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -93,7 +93,7 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
 
 /**
  * ecryptfs_write
- * @ecryptfs_file: The eCryptfs file into which to write
+ * @ecryptfs_inode: The eCryptfs file into which to write
  * @data: Virtual address where data to write is located
  * @offset: Offset in the eCryptfs file at which to begin writing the
  *          data from @data
@@ -109,12 +109,11 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
 *
 * Returns zero on success; non-zero otherwise
 */
-int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
+int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
 		   size_t size)
 {
 	struct page *ecryptfs_page;
 	struct ecryptfs_crypt_stat *crypt_stat;
-	struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode;
 	char *ecryptfs_page_virt;
 	loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
 	loff_t data_offset = 0;
@@ -145,7 +144,7 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
 			if (num_bytes > total_remaining_zeros)
 				num_bytes = total_remaining_zeros;
 		}
-		ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_file,
+		ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode,
 							 ecryptfs_page_idx);
 		if (IS_ERR(ecryptfs_page)) {
 			rc = PTR_ERR(ecryptfs_page);
@@ -302,10 +301,10 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
 int ecryptfs_read(char *data, loff_t offset, size_t size,
 		  struct file *ecryptfs_file)
 {
+	struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode;
 	struct page *ecryptfs_page;
 	char *ecryptfs_page_virt;
-	loff_t ecryptfs_file_size =
-		i_size_read(ecryptfs_file->f_dentry->d_inode);
+	loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
 	loff_t data_offset = 0;
 	loff_t pos;
 	int rc = 0;
@@ -327,7 +326,7 @@ int ecryptfs_read(char *data, loff_t offset, size_t size,
 
 		if (num_bytes > total_remaining_bytes)
 			num_bytes = total_remaining_bytes;
-		ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_file,
+		ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode,
 							 ecryptfs_page_idx);
 		if (IS_ERR(ecryptfs_page)) {
 			rc = PTR_ERR(ecryptfs_page);
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 0c0ae491d231..0435886e4a9f 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -109,27 +109,6 @@ void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode)
 }
 
 /**
- * ecryptfs_put_super
- * @sb: Pointer to the ecryptfs super block
- *
- * Final actions when unmounting a file system.
- * This will handle deallocation and release of our private data.
- */
-static void ecryptfs_put_super(struct super_block *sb)
-{
-	struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb);
-
-	lock_kernel();
-
-	ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
-	bdi_destroy(&sb_info->bdi);
-	kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
-	ecryptfs_set_superblock_private(sb, NULL);
-
-	unlock_kernel();
-}
-
-/**
  * ecryptfs_statfs
  * @sb: The ecryptfs super block
  * @buf: The struct kstatfs to fill in with stats
@@ -203,7 +182,6 @@ const struct super_operations ecryptfs_sops = {
 	.alloc_inode = ecryptfs_alloc_inode,
 	.destroy_inode = ecryptfs_destroy_inode,
 	.drop_inode = generic_delete_inode,
-	.put_super = ecryptfs_put_super,
 	.statfs = ecryptfs_statfs,
 	.remount_fs = NULL,
 	.clear_inode = ecryptfs_clear_inode,
diff --git a/fs/exec.c b/fs/exec.c
index e6e94c626c2c..9badbc0bfb1d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -242,9 +242,10 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
 	 * use STACK_TOP because that can depend on attributes which aren't
 	 * configured yet.
 	 */
+	BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
 	vma->vm_end = STACK_TOP_MAX;
 	vma->vm_start = vma->vm_end - PAGE_SIZE;
-	vma->vm_flags = VM_STACK_FLAGS;
+	vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
 	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
 	err = insert_vm_struct(mm, vma);
@@ -616,6 +617,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
 	else if (executable_stack == EXSTACK_DISABLE_X)
 		vm_flags &= ~VM_EXEC;
 	vm_flags |= mm->def_flags;
+	vm_flags |= VM_STACK_INCOMPLETE_SETUP;
 
 	ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
 			vm_flags);
@@ -630,6 +632,9 @@ int setup_arg_pages(struct linux_binprm *bprm,
 		goto out_unlock;
 	}
 
+	/* mprotect_fixup is overkill to remove the temporary stack flags */
+	vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;
+
 	stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
 	stack_size = vma->vm_end - vma->vm_start;
 	/*
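
The exec.c hunks implement a mark-then-clear scheme: the freshly inserted stack VMA carries VM_STACK_INCOMPLETE_SETUP while its final flags and protections are still being decided, so other code can refuse to touch a half-built stack (the BUG_ON guards against the marker overlapping a real VM_STACK_FLAGS bit). Once mprotect_fixup() has settled the flags, a plain mask clears the marker. Schematically:

	/* creation: visible in the mm, but flagged as under construction */
	vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
	err = insert_vm_struct(mm, vma);

	/* ... later, after the final protections are applied ... */
	vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;	/* fully set up */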
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 4cfab1cc75c0..d91e9d829bc1 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -608,7 +608,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent)
 	de->inode_no = cpu_to_le64(parent->i_ino);
 	memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
 	exofs_set_de_type(de, inode);
-	kunmap_atomic(page, KM_USER0);
+	kunmap_atomic(kaddr, KM_USER0);
 	err = exofs_commit_chunk(page, 0, chunk_size);
 fail:
 	page_cache_release(page);
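
The exofs one-liner is a real bug fix: in this kernel generation kunmap_atomic() takes the mapped virtual address that kmap_atomic() returned, not the struct page, so passing page unmapped the wrong thing. The correct pairing looks like:

	void *kaddr = kmap_atomic(page, KM_USER0);
	memset(kaddr, 0, chunk_size);
	/* ... fill in the "." and ".." entries through kaddr ... */
	kunmap_atomic(kaddr, KM_USER0);	/* same address, not the page */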
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 76d2a79ef93e..4bb6ef822e46 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -755,6 +755,21 @@ static int exofs_write_end(struct file *file, struct address_space *mapping,
 	return ret;
 }
 
+static int exofs_releasepage(struct page *page, gfp_t gfp)
+{
+	EXOFS_DBGMSG("page 0x%lx\n", page->index);
+	WARN_ON(1);
+	return try_to_free_buffers(page);
+}
+
+static void exofs_invalidatepage(struct page *page, unsigned long offset)
+{
+	EXOFS_DBGMSG("page_has_buffers=>%d\n", page_has_buffers(page));
+	WARN_ON(1);
+
+	block_invalidatepage(page, offset);
+}
+
 const struct address_space_operations exofs_aops = {
 	.readpage	= exofs_readpage,
 	.readpages	= exofs_readpages,
@@ -762,6 +777,21 @@ const struct address_space_operations exofs_aops = {
 	.writepages	= exofs_writepages,
 	.write_begin	= exofs_write_begin_export,
 	.write_end	= exofs_write_end,
+	.releasepage	= exofs_releasepage,
+	.set_page_dirty	= __set_page_dirty_nobuffers,
+	.invalidatepage = exofs_invalidatepage,
+
+	/* Not implemented Yet */
+	.bmap		= NULL, /* TODO: use osd's OSD_ACT_READ_MAP */
+	.direct_IO	= NULL, /* TODO: Should be trivial to do */
+
+	/* With these NULL has special meaning or default is not exported */
+	.sync_page	= NULL,
+	.get_xip_mem	= NULL,
+	.migratepage	= NULL,
+	.launder_page	= NULL,
+	.is_partially_uptodate = NULL,
+	.error_remove_page = NULL,
 };
 
 /******************************************************************************
@@ -1123,16 +1153,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
 	sbi = sb->s_fs_info;
 
 	sb->s_dirt = 1;
-	inode->i_uid = current->cred->fsuid;
-	if (dir->i_mode & S_ISGID) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
-	} else {
-		inode->i_gid = current->cred->fsgid;
-	}
-	inode->i_mode = mode;
-
+	inode_init_owner(inode, dir, mode);
 	inode->i_ino = sbi->s_nextid++;
 	inode->i_blkbits = EXOFS_BLKSHIFT;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
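
inode_init_owner() centralizes the ownership rules this hunk used to open-code: the new inode gets the creator's fsuid, while a setgid directory donates its gid and keeps S_ISGID on new subdirectories. A compilable userspace model of the rule (a sketch, not the kernel helper):

#include <stdio.h>
#include <sys/stat.h>

struct fake_inode { unsigned uid, gid; mode_t mode; };

static void init_owner(struct fake_inode *inode,
		       const struct fake_inode *dir, mode_t mode,
		       unsigned fsuid, unsigned fsgid)
{
	inode->uid = fsuid;
	if (dir && (dir->mode & S_ISGID)) {
		inode->gid = dir->gid;		/* inherit from directory */
		if (S_ISDIR(mode))
			mode |= S_ISGID;	/* setgid propagates to dirs */
	} else {
		inode->gid = fsgid;
	}
	inode->mode = mode;
}

int main(void)
{
	struct fake_inode dir = { 0, 100, S_IFDIR | S_ISGID | 0775 };
	struct fake_inode sub;

	init_owner(&sub, &dir, S_IFDIR | 0755, 1000, 1000);
	printf("gid=%u setgid=%d\n", sub.gid, !!(sub.mode & S_ISGID));
	return 0;	/* gid=100 setgid=1, inherited from the directory */
}

The ext2 version of this conversion (fs/ext2/ialloc.c below) keeps one extra case: with the GRPID mount option the gid always comes from the directory, so that branch stays open-coded and only the default path calls inode_init_owner().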
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index a99e54318c3d..ca7e2a0ed98a 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -420,7 +420,7 @@ release_and_out:
 	return error;
 }
 
-struct xattr_handler ext2_xattr_acl_access_handler = {
+const struct xattr_handler ext2_xattr_acl_access_handler = {
 	.prefix	= POSIX_ACL_XATTR_ACCESS,
 	.flags	= ACL_TYPE_ACCESS,
 	.list	= ext2_xattr_list_acl_access,
@@ -428,7 +428,7 @@ struct xattr_handler ext2_xattr_acl_access_handler = {
 	.set	= ext2_xattr_set_acl,
 };
 
-struct xattr_handler ext2_xattr_acl_default_handler = {
+const struct xattr_handler ext2_xattr_acl_default_handler = {
 	.prefix	= POSIX_ACL_XATTR_DEFAULT,
 	.flags	= ACL_TYPE_DEFAULT,
 	.list	= ext2_xattr_list_acl_default,
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 3cf038c055d7..e8766a396776 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -1332,6 +1332,12 @@ retry_alloc:
 
 		free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
 		/*
+		 * skip this group (and avoid loading bitmap) if there
+		 * are no free blocks
+		 */
+		if (!free_blocks)
+			continue;
+		/*
 		 * skip this group if the number of
 		 * free blocks is less than half of the reservation
 		 * window size.
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index ad7d572ee8dc..938dbc739d00 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -106,7 +106,7 @@ void ext2_free_inode (struct inode * inode)
 	struct super_block * sb = inode->i_sb;
 	int is_directory;
 	unsigned long ino;
-	struct buffer_head *bitmap_bh = NULL;
+	struct buffer_head *bitmap_bh;
 	unsigned long block_group;
 	unsigned long bit;
 	struct ext2_super_block * es;
@@ -135,14 +135,13 @@ void ext2_free_inode (struct inode * inode)
 	    ino > le32_to_cpu(es->s_inodes_count)) {
 		ext2_error (sb, "ext2_free_inode",
 			    "reserved or nonexistent inode %lu", ino);
-		goto error_return;
+		return;
 	}
 	block_group = (ino - 1) / EXT2_INODES_PER_GROUP(sb);
 	bit = (ino - 1) % EXT2_INODES_PER_GROUP(sb);
-	brelse(bitmap_bh);
 	bitmap_bh = read_inode_bitmap(sb, block_group);
 	if (!bitmap_bh)
-		goto error_return;
+		return;
 
 	/* Ok, now we can actually update the inode bitmaps.. */
 	if (!ext2_clear_bit_atomic(sb_bgl_lock(EXT2_SB(sb), block_group),
@@ -154,7 +153,7 @@ void ext2_free_inode (struct inode * inode)
 	mark_buffer_dirty(bitmap_bh);
 	if (sb->s_flags & MS_SYNCHRONOUS)
 		sync_dirty_buffer(bitmap_bh);
-error_return:
+
 	brelse(bitmap_bh);
 }
 
@@ -550,16 +549,12 @@ got:
 
 	sb->s_dirt = 1;
 	mark_buffer_dirty(bh2);
-	inode->i_uid = current_fsuid();
-	if (test_opt (sb, GRPID))
-		inode->i_gid = dir->i_gid;
-	else if (dir->i_mode & S_ISGID) {
+	if (test_opt(sb, GRPID)) {
+		inode->i_mode = mode;
+		inode->i_uid = current_fsuid();
 		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
 	} else
-		inode->i_gid = current_fsgid();
-	inode->i_mode = mode;
+		inode_init_owner(inode, dir, mode);
 
 	inode->i_ino = ino;
 	inode->i_blocks = 0;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index fc13cc119aad..527c46d9bc1f 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -22,7 +22,6 @@
  *  Assorted race fixes, rewrite of ext2_get_block() by Al Viro, 2000
  */
 
-#include <linux/smp_lock.h>
 #include <linux/time.h>
 #include <linux/highuid.h>
 #include <linux/pagemap.h>
@@ -1406,11 +1405,11 @@ static int __ext2_write_inode(struct inode *inode, int do_sync)
 			/* If this is the first large file
			 * created, add a flag to the superblock.
			 */
-			lock_kernel();
+			spin_lock(&EXT2_SB(sb)->s_lock);
			ext2_update_dynamic_rev(sb);
			EXT2_SET_RO_COMPAT_FEATURE(sb,
				EXT2_FEATURE_RO_COMPAT_LARGE_FILE);
-			unlock_kernel();
+			spin_unlock(&EXT2_SB(sb)->s_lock);
			ext2_write_super(sb);
		}
	}
@@ -1467,7 +1466,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
 	if (error)
 		return error;
 
-	if (iattr->ia_valid & ATTR_SIZE)
+	if (is_quota_modification(inode, iattr))
 		dquot_initialize(inode);
 	if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
 	    (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
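
Both ext2 hunks belong to the same BKL-removal series as the super.c changes below: fields of the in-memory superblock that lock_kernel() used to guard are now protected by a per-fs spinlock, EXT2_SB(sb)->s_lock (initialized in ext2_fill_super()). The rule the series follows is to hold s_lock only around in-memory updates and to drop it before any buffer I/O, since a spinlock cannot be held across sleeping operations. A sketch, assuming the new s_lock field:

	spin_lock(&EXT2_SB(sb)->s_lock);
	ext2_update_dynamic_rev(sb);	/* must be called with s_lock held */
	EXT2_SET_RO_COMPAT_FEATURE(sb, EXT2_FEATURE_RO_COMPAT_LARGE_FILE);
	spin_unlock(&EXT2_SB(sb)->s_lock);	/* unlock before we do IO */
	ext2_write_super(sb);			/* may sleep on buffers */

The companion change, is_quota_modification() replacing the bare ATTR_SIZE test, makes dquot_initialize() run for uid/gid changes as well as size changes, which is what quota transfer on chown needs.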
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 42e4a303b675..71e9eb1fa696 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -26,7 +26,6 @@
 #include <linux/random.h>
 #include <linux/buffer_head.h>
 #include <linux/exportfs.h>
-#include <linux/smp_lock.h>
 #include <linux/vfs.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
@@ -39,7 +38,7 @@
 #include "xip.h"
 
 static void ext2_sync_super(struct super_block *sb,
-			    struct ext2_super_block *es);
+			    struct ext2_super_block *es, int wait);
 static int ext2_remount (struct super_block * sb, int * flags, char * data);
 static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
 static int ext2_sync_fs(struct super_block *sb, int wait);
@@ -52,9 +51,11 @@ void ext2_error (struct super_block * sb, const char * function,
 	struct ext2_super_block *es = sbi->s_es;
 
 	if (!(sb->s_flags & MS_RDONLY)) {
+		spin_lock(&sbi->s_lock);
 		sbi->s_mount_state |= EXT2_ERROR_FS;
 		es->s_state |= cpu_to_le16(EXT2_ERROR_FS);
-		ext2_sync_super(sb, es);
+		spin_unlock(&sbi->s_lock);
+		ext2_sync_super(sb, es, 1);
 	}
 
 	va_start(args, fmt);
@@ -84,6 +85,9 @@ void ext2_msg(struct super_block *sb, const char *prefix,
 	va_end(args);
 }
 
+/*
+ * This must be called with sbi->s_lock held.
+ */
 void ext2_update_dynamic_rev(struct super_block *sb)
 {
 	struct ext2_super_block *es = EXT2_SB(sb)->s_es;
@@ -115,8 +119,6 @@ static void ext2_put_super (struct super_block * sb)
 	int i;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
 
-	lock_kernel();
-
 	if (sb->s_dirt)
 		ext2_write_super(sb);
 
@@ -124,8 +126,10 @@ static void ext2_put_super (struct super_block * sb)
 	if (!(sb->s_flags & MS_RDONLY)) {
 		struct ext2_super_block *es = sbi->s_es;
 
+		spin_lock(&sbi->s_lock);
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
-		ext2_sync_super(sb, es);
+		spin_unlock(&sbi->s_lock);
+		ext2_sync_super(sb, es, 1);
 	}
 	db_count = sbi->s_gdb_count;
 	for (i = 0; i < db_count; i++)
@@ -140,8 +144,6 @@ static void ext2_put_super (struct super_block * sb)
 	sb->s_fs_info = NULL;
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
-
-	unlock_kernel();
 }
 
 static struct kmem_cache * ext2_inode_cachep;
@@ -209,6 +211,7 @@ static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	struct ext2_super_block *es = sbi->s_es;
 	unsigned long def_mount_opts;
 
+	spin_lock(&sbi->s_lock);
 	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
 
 	if (sbi->s_sb_block != 1)
@@ -281,6 +284,7 @@ static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (!test_opt(sb, RESERVATION))
 		seq_puts(seq, ",noreservation");
 
+	spin_unlock(&sbi->s_lock);
 	return 0;
 }
 
@@ -606,7 +610,6 @@ static int ext2_setup_super (struct super_block * sb,
 	if (!le16_to_cpu(es->s_max_mnt_count))
 		es->s_max_mnt_count = cpu_to_le16(EXT2_DFL_MAX_MNT_COUNT);
 	le16_add_cpu(&es->s_mnt_count, 1);
-	ext2_write_super(sb);
 	if (test_opt (sb, DEBUG))
 		ext2_msg(sb, KERN_INFO, "%s, %s, bs=%lu, fs=%lu, gc=%lu, "
 			"bpg=%lu, ipg=%lu, mo=%04lx]",
@@ -767,6 +770,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_fs_info = sbi;
 	sbi->s_sb_block = sb_block;
 
+	spin_lock_init(&sbi->s_lock);
+
 	/*
 	 * See what the current blocksize for the device is, and
 	 * use that as the blocksize.  Otherwise (or if the blocksize
@@ -1079,7 +1084,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL))
 		ext2_msg(sb, KERN_WARNING,
 			"warning: mounting ext3 filesystem as ext2");
-	ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY);
+	if (ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY))
+		sb->s_flags |= MS_RDONLY;
+	ext2_write_super(sb);
 	return 0;
 
 cantfind_ext2:
@@ -1120,30 +1127,26 @@ static void ext2_clear_super_error(struct super_block *sb)
 		 * be remapped.  Nothing we can do but to retry the
 		 * write and hope for the best.
 		 */
-		printk(KERN_ERR "EXT2-fs: %s previous I/O error to "
-		       "superblock detected", sb->s_id);
+		ext2_msg(sb, KERN_ERR,
+		       "previous I/O error to superblock detected\n");
 		clear_buffer_write_io_error(sbh);
 		set_buffer_uptodate(sbh);
 	}
 }
 
-static void ext2_commit_super (struct super_block * sb,
-			       struct ext2_super_block * es)
-{
-	ext2_clear_super_error(sb);
-	es->s_wtime = cpu_to_le32(get_seconds());
-	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
-	sb->s_dirt = 0;
-}
-
-static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
+static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es,
+			    int wait)
 {
 	ext2_clear_super_error(sb);
+	spin_lock(&EXT2_SB(sb)->s_lock);
 	es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
 	es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb));
 	es->s_wtime = cpu_to_le32(get_seconds());
+	/* unlock before we do IO */
+	spin_unlock(&EXT2_SB(sb)->s_lock);
 	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
-	sync_dirty_buffer(EXT2_SB(sb)->s_sbh);
+	if (wait)
+		sync_dirty_buffer(EXT2_SB(sb)->s_sbh);
 	sb->s_dirt = 0;
 }
 
@@ -1157,43 +1160,18 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
 * may have been checked while mounted and e2fsck may have
 * set s_state to EXT2_VALID_FS after some corrections.
 */
-
 static int ext2_sync_fs(struct super_block *sb, int wait)
 {
+	struct ext2_sb_info *sbi = EXT2_SB(sb);
 	struct ext2_super_block *es = EXT2_SB(sb)->s_es;
-	struct buffer_head *sbh = EXT2_SB(sb)->s_sbh;
-
-	lock_kernel();
-	if (buffer_write_io_error(sbh)) {
-		/*
-		 * Oh, dear.  A previous attempt to write the
-		 * superblock failed.  This could happen because the
-		 * USB device was yanked out.  Or it could happen to
-		 * be a transient write error and maybe the block will
-		 * be remapped.  Nothing we can do but to retry the
-		 * write and hope for the best.
-		 */
-		ext2_msg(sb, KERN_ERR,
-		       "previous I/O error to superblock detected\n");
-		clear_buffer_write_io_error(sbh);
-		set_buffer_uptodate(sbh);
-	}
 
+	spin_lock(&sbi->s_lock);
 	if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) {
 		ext2_debug("setting valid to 0\n");
 		es->s_state &= cpu_to_le16(~EXT2_VALID_FS);
-		es->s_free_blocks_count =
-			cpu_to_le32(ext2_count_free_blocks(sb));
-		es->s_free_inodes_count =
-			cpu_to_le32(ext2_count_free_inodes(sb));
-		es->s_mtime = cpu_to_le32(get_seconds());
-		ext2_sync_super(sb, es);
-	} else {
-		ext2_commit_super(sb, es);
 	}
-	sb->s_dirt = 0;
-	unlock_kernel();
-
+	spin_unlock(&sbi->s_lock);
+	ext2_sync_super(sb, es, wait);
 	return 0;
 }
 
@@ -1215,7 +1193,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 	unsigned long old_sb_flags;
 	int err;
 
-	lock_kernel();
+	spin_lock(&sbi->s_lock);
 
 	/* Store the old options */
 	old_sb_flags = sb->s_flags;
@@ -1254,13 +1232,13 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 	sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
1255 } 1233 }
1256 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { 1234 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
1257 unlock_kernel(); 1235 spin_unlock(&sbi->s_lock);
1258 return 0; 1236 return 0;
1259 } 1237 }
1260 if (*flags & MS_RDONLY) { 1238 if (*flags & MS_RDONLY) {
1261 if (le16_to_cpu(es->s_state) & EXT2_VALID_FS || 1239 if (le16_to_cpu(es->s_state) & EXT2_VALID_FS ||
1262 !(sbi->s_mount_state & EXT2_VALID_FS)) { 1240 !(sbi->s_mount_state & EXT2_VALID_FS)) {
1263 unlock_kernel(); 1241 spin_unlock(&sbi->s_lock);
1264 return 0; 1242 return 0;
1265 } 1243 }
1266 /* 1244 /*
@@ -1269,6 +1247,8 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1269 */ 1247 */
1270 es->s_state = cpu_to_le16(sbi->s_mount_state); 1248 es->s_state = cpu_to_le16(sbi->s_mount_state);
1271 es->s_mtime = cpu_to_le32(get_seconds()); 1249 es->s_mtime = cpu_to_le32(get_seconds());
1250 spin_unlock(&sbi->s_lock);
1251 ext2_sync_super(sb, es, 1);
1272 } else { 1252 } else {
1273 __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb, 1253 __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb,
1274 ~EXT2_FEATURE_RO_COMPAT_SUPP); 1254 ~EXT2_FEATURE_RO_COMPAT_SUPP);
@@ -1288,16 +1268,16 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1288 sbi->s_mount_state = le16_to_cpu(es->s_state); 1268 sbi->s_mount_state = le16_to_cpu(es->s_state);
1289 if (!ext2_setup_super (sb, es, 0)) 1269 if (!ext2_setup_super (sb, es, 0))
1290 sb->s_flags &= ~MS_RDONLY; 1270 sb->s_flags &= ~MS_RDONLY;
1271 spin_unlock(&sbi->s_lock);
1272 ext2_write_super(sb);
1291 } 1273 }
1292 ext2_sync_super(sb, es);
1293 unlock_kernel();
1294 return 0; 1274 return 0;
1295restore_opts: 1275restore_opts:
1296 sbi->s_mount_opt = old_opts.s_mount_opt; 1276 sbi->s_mount_opt = old_opts.s_mount_opt;
1297 sbi->s_resuid = old_opts.s_resuid; 1277 sbi->s_resuid = old_opts.s_resuid;
1298 sbi->s_resgid = old_opts.s_resgid; 1278 sbi->s_resgid = old_opts.s_resgid;
1299 sb->s_flags = old_sb_flags; 1279 sb->s_flags = old_sb_flags;
1300 unlock_kernel(); 1280 spin_unlock(&sbi->s_lock);
1301 return err; 1281 return err;
1302} 1282}
1303 1283
@@ -1308,6 +1288,8 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
1308 struct ext2_super_block *es = sbi->s_es; 1288 struct ext2_super_block *es = sbi->s_es;
1309 u64 fsid; 1289 u64 fsid;
1310 1290
1291 spin_lock(&sbi->s_lock);
1292
1311 if (test_opt (sb, MINIX_DF)) 1293 if (test_opt (sb, MINIX_DF))
1312 sbi->s_overhead_last = 0; 1294 sbi->s_overhead_last = 0;
1313 else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) { 1295 else if (sbi->s_blocks_last != le32_to_cpu(es->s_blocks_count)) {
@@ -1362,6 +1344,7 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
1362 le64_to_cpup((void *)es->s_uuid + sizeof(u64)); 1344 le64_to_cpup((void *)es->s_uuid + sizeof(u64));
1363 buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; 1345 buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
1364 buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; 1346 buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
1347 spin_unlock(&sbi->s_lock);
1365 return 0; 1348 return 0;
1366} 1349}
1367 1350
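The conversion above repeats one idiom everywhere: superblock fields are updated under the new sbi->s_lock spinlock, and the lock is dropped before any buffer I/O is started. A minimal sketch of that idiom, using a hypothetical helper (ext2_update_sb_field is illustrative, not part of the patch):

	/* Hypothetical helper showing the s_lock idiom: mutate the in-memory
	 * superblock under the spinlock, then drop it before the buffer is
	 * dirtied and written, since sync_dirty_buffer() blocks on I/O. */
	static void ext2_update_sb_field(struct super_block *sb, __le32 wtime)
	{
		struct ext2_sb_info *sbi = EXT2_SB(sb);

		spin_lock(&sbi->s_lock);	/* serializes sbi->s_es updates */
		sbi->s_es->s_wtime = wtime;	/* cheap, non-sleeping update */
		spin_unlock(&sbi->s_lock);	/* never held across I/O */
		mark_buffer_dirty(sbi->s_sbh);
		sync_dirty_buffer(sbi->s_sbh);	/* may block; lock already dropped */
	}

This is also why ext2_sync_super() above takes the lock, fills in the free-block and free-inode counts, and releases it at the "unlock before we do IO" comment before touching the buffer.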
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index e44dc92609be..7c3915780b19 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -101,7 +101,7 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *,
 
 static struct mb_cache *ext2_xattr_cache;
 
-static struct xattr_handler *ext2_xattr_handler_map[] = {
+static const struct xattr_handler *ext2_xattr_handler_map[] = {
 	[EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler,
 #ifdef CONFIG_EXT2_FS_POSIX_ACL
 	[EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext2_xattr_acl_access_handler,
@@ -113,7 +113,7 @@ static struct xattr_handler *ext2_xattr_handler_map[] = {
 #endif
 };
 
-struct xattr_handler *ext2_xattr_handlers[] = {
+const struct xattr_handler *ext2_xattr_handlers[] = {
 	&ext2_xattr_user_handler,
 	&ext2_xattr_trusted_handler,
 #ifdef CONFIG_EXT2_FS_POSIX_ACL
@@ -126,10 +126,10 @@ struct xattr_handler *ext2_xattr_handlers[] = {
 	NULL
 };
 
-static inline struct xattr_handler *
+static inline const struct xattr_handler *
 ext2_xattr_handler(int name_index)
 {
-	struct xattr_handler *handler = NULL;
+	const struct xattr_handler *handler = NULL;
 
 	if (name_index > 0 && name_index < ARRAY_SIZE(ext2_xattr_handler_map))
 		handler = ext2_xattr_handler_map[name_index];
@@ -298,7 +298,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list",
 	/* list the attribute names */
 	for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
 	     entry = EXT2_XATTR_NEXT(entry)) {
-		struct xattr_handler *handler =
+		const struct xattr_handler *handler =
 			ext2_xattr_handler(entry->e_name_index);
 
 		if (handler) {
@@ -345,7 +345,9 @@ static void ext2_xattr_update_super_block(struct super_block *sb)
 	if (EXT2_HAS_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR))
 		return;
 
+	spin_lock(&EXT2_SB(sb)->s_lock);
 	EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR);
+	spin_unlock(&EXT2_SB(sb)->s_lock);
 	sb->s_dirt = 1;
 	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
 }
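The same constification is applied verbatim to the ext3 and ext4 copies below. A self-contained sketch of the table-dispatch shape it hardens (all names here are invented for illustration; only the placement of const mirrors the patch):

	#include <stdio.h>

	struct xattr_handler {
		const char *prefix;
		int (*get)(const char *name);
	};

	static int demo_get(const char *name)
	{
		printf("get %s\n", name);
		return 0;
	}

	/* Handler objects and the map are const: they are shared and never
	 * written at runtime, so the compiler can place them in .rodata. */
	static const struct xattr_handler demo_user_handler = {
		.prefix = "user.",
		.get = demo_get,
	};

	static const struct xattr_handler *demo_handler_map[] = {
		[1] = &demo_user_handler,	/* cf. EXT2_XATTR_INDEX_USER */
	};

	static const struct xattr_handler *demo_handler(int name_index)
	{
		const struct xattr_handler *handler = NULL;

		if (name_index > 0 &&
		    name_index < (int)(sizeof(demo_handler_map) /
				       sizeof(demo_handler_map[0])))
			handler = demo_handler_map[name_index];
		return handler;
	}

	int main(void)
	{
		const struct xattr_handler *h = demo_handler(1);
		return h ? h->get("user.test") : 1;
	}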
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index bf8175b2ced9..a1a1c2184616 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -55,11 +55,11 @@ struct ext2_xattr_entry {
 
 # ifdef CONFIG_EXT2_FS_XATTR
 
-extern struct xattr_handler ext2_xattr_user_handler;
-extern struct xattr_handler ext2_xattr_trusted_handler;
-extern struct xattr_handler ext2_xattr_acl_access_handler;
-extern struct xattr_handler ext2_xattr_acl_default_handler;
-extern struct xattr_handler ext2_xattr_security_handler;
+extern const struct xattr_handler ext2_xattr_user_handler;
+extern const struct xattr_handler ext2_xattr_trusted_handler;
+extern const struct xattr_handler ext2_xattr_acl_access_handler;
+extern const struct xattr_handler ext2_xattr_acl_default_handler;
+extern const struct xattr_handler ext2_xattr_security_handler;
 
 extern ssize_t ext2_listxattr(struct dentry *, char *, size_t);
 
@@ -72,7 +72,7 @@ extern void ext2_xattr_put_super(struct super_block *);
 extern int init_ext2_xattr(void);
 extern void exit_ext2_xattr(void);
 
-extern struct xattr_handler *ext2_xattr_handlers[];
+extern const struct xattr_handler *ext2_xattr_handlers[];
 
 # else  /* CONFIG_EXT2_FS_XATTR */
 
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index b118c6383c6d..3004e15d5da5 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -67,7 +67,7 @@ ext2_init_security(struct inode *inode, struct inode *dir)
 	return err;
 }
 
-struct xattr_handler ext2_xattr_security_handler = {
+const struct xattr_handler ext2_xattr_security_handler = {
 	.prefix = XATTR_SECURITY_PREFIX,
 	.list = ext2_xattr_security_list,
 	.get = ext2_xattr_security_get,
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 2a26d71f4771..667e46a8d62d 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -50,7 +50,7 @@ ext2_xattr_trusted_set(struct dentry *dentry, const char *name,
 			      value, size, flags);
 }
 
-struct xattr_handler ext2_xattr_trusted_handler = {
+const struct xattr_handler ext2_xattr_trusted_handler = {
 	.prefix = XATTR_TRUSTED_PREFIX,
 	.list = ext2_xattr_trusted_list,
 	.get = ext2_xattr_trusted_get,
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index 3f6caf3684b4..099d20f47163 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -54,7 +54,7 @@ ext2_xattr_user_set(struct dentry *dentry, const char *name,
 			      name, value, size, flags);
 }
 
-struct xattr_handler ext2_xattr_user_handler = {
+const struct xattr_handler ext2_xattr_user_handler = {
 	.prefix = XATTR_USER_PREFIX,
 	.list = ext2_xattr_user_list,
 	.get = ext2_xattr_user_get,
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 82ba34158661..01552abbca3c 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -456,7 +456,7 @@ release_and_out:
 	return error;
 }
 
-struct xattr_handler ext3_xattr_acl_access_handler = {
+const struct xattr_handler ext3_xattr_acl_access_handler = {
 	.prefix = POSIX_ACL_XATTR_ACCESS,
 	.flags = ACL_TYPE_ACCESS,
 	.list = ext3_xattr_list_acl_access,
@@ -464,7 +464,7 @@ struct xattr_handler ext3_xattr_acl_access_handler = {
 	.set = ext3_xattr_set_acl,
 };
 
-struct xattr_handler ext3_xattr_acl_default_handler = {
+const struct xattr_handler ext3_xattr_acl_default_handler = {
 	.prefix = POSIX_ACL_XATTR_DEFAULT,
 	.flags = ACL_TYPE_DEFAULT,
 	.list = ext3_xattr_list_acl_default,
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index a177122a1b25..4a32511f4ded 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -1584,6 +1584,12 @@ retry_alloc:
 			goto io_error;
 		free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
 		/*
+		 * skip this group (and avoid loading bitmap) if there
+		 * are no free blocks
+		 */
+		if (!free_blocks)
+			continue;
+		/*
 		 * skip this group if the number of
 		 * free blocks is less than half of the reservation
 		 * window size.
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index 8209f266e9ad..fcf7487734b6 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -48,7 +48,7 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
 	struct inode *inode = dentry->d_inode;
 	struct ext3_inode_info *ei = EXT3_I(inode);
 	journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
-	int ret = 0;
+	int ret, needs_barrier = 0;
 	tid_t commit_tid;
 
 	if (inode->i_sb->s_flags & MS_RDONLY)
@@ -70,28 +70,27 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
 	 * (they were dirtied by commit). But that's OK - the blocks are
 	 * safe in-journal, which is all fsync() needs to ensure.
 	 */
-	if (ext3_should_journal_data(inode)) {
-		ret = ext3_force_commit(inode->i_sb);
-		goto out;
-	}
+	if (ext3_should_journal_data(inode))
+		return ext3_force_commit(inode->i_sb);
 
 	if (datasync)
 		commit_tid = atomic_read(&ei->i_datasync_tid);
 	else
 		commit_tid = atomic_read(&ei->i_sync_tid);
 
-	if (log_start_commit(journal, commit_tid)) {
-		log_wait_commit(journal, commit_tid);
-		goto out;
-	}
+	if (test_opt(inode->i_sb, BARRIER) &&
+	    !journal_trans_will_send_data_barrier(journal, commit_tid))
+		needs_barrier = 1;
+	log_start_commit(journal, commit_tid);
+	ret = log_wait_commit(journal, commit_tid);
 
 	/*
 	 * In case we didn't commit a transaction, we have to flush
 	 * disk caches manually so that data really is on persistent
 	 * storage
 	 */
-	if (test_opt(inode->i_sb, BARRIER))
-		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
-out:
+	if (needs_barrier)
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
+				   BLKDEV_IFL_WAIT);
 	return ret;
 }
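The rewritten ext3_sync_file() now starts the commit unconditionally and decides its barrier requirement up front. Restated as a standalone predicate (illustrative only; in the patch the second operand is supplied by journal_trans_will_send_data_barrier(), and the flush itself is the new four-argument blkdev_issue_flush()):

	#include <stdio.h>

	/* Issue an explicit cache flush only when barriers are enabled and
	 * the commit being waited on will not already send one. */
	static int needs_explicit_flush(int barrier_enabled,
					int commit_sends_barrier)
	{
		return barrier_enabled && !commit_sends_barrier;
	}

	int main(void)
	{
		for (int b = 0; b <= 1; b++)
			for (int c = 0; c <= 1; c++)
				printf("barrier=%d commit_flushes=%d -> flush=%d\n",
				       b, c, needs_explicit_flush(b, c));
		return 0;
	}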
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 0d0e97ed3ff6..498021eb88fb 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -538,16 +538,13 @@ got:
 	if (S_ISDIR(mode))
 		percpu_counter_inc(&sbi->s_dirs_counter);
 
-	inode->i_uid = current_fsuid();
-	if (test_opt (sb, GRPID))
-		inode->i_gid = dir->i_gid;
-	else if (dir->i_mode & S_ISGID) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
+
+	if (test_opt(sb, GRPID)) {
+		inode->i_mode = mode;
+		inode->i_uid = current_fsuid();
+		inode->i_gid = dir->i_gid;
 	} else
-		inode->i_gid = current_fsgid();
-	inode->i_mode = mode;
+		inode_init_owner(inode, dir, mode);
 
 	inode->i_ino = ino;
 	/* This is the optimal IO size (for stat), not the fs block size */
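inode_init_owner() centralizes the ownership rules that were previously open-coded here: take fsuid/fsgid from the caller, but inherit the group from an SGID directory and keep SGID on new subdirectories. A runnable model of the mode half of that rule (plain mode_t stands in for struct inode; this is a simplification, not the kernel helper):

	#include <stdio.h>
	#include <sys/stat.h>

	/* If the parent directory is SGID, a new directory inherits the
	 * SGID bit (and, in the kernel, the parent's gid). */
	static mode_t demo_init_mode(mode_t dir_mode, mode_t mode)
	{
		if ((dir_mode & S_ISGID) && S_ISDIR(mode))
			mode |= S_ISGID;
		return mode;
	}

	int main(void)
	{
		mode_t m = demo_init_mode(S_IFDIR | S_ISGID | 0775,
					  S_IFDIR | 0755);
		printf("%o\n", (unsigned int)m);	/* prints 42755 */
		return 0;
	}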
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index ea33bdf0a300..735f0190ec2a 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -3151,7 +3151,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
 	if (error)
 		return error;
 
-	if (ia_valid & ATTR_SIZE)
+	if (is_quota_modification(inode, attr))
 		dquot_initialize(inode);
 	if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
 	    (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
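is_quota_modification() widens the trigger for dquot_initialize() from size changes to anything that moves quota usage. A simplified, self-contained restatement (the real helper in the quota headers also compares the requested values against the inode's current ones, which this sketch omits):

	#include <stdio.h>

	#define ATTR_UID  (1 << 1)	/* values mirror include/linux/fs.h */
	#define ATTR_GID  (1 << 2)
	#define ATTR_SIZE (1 << 3)

	static int demo_is_quota_modification(unsigned int ia_valid)
	{
		return (ia_valid & (ATTR_SIZE | ATTR_UID | ATTR_GID)) != 0;
	}

	int main(void)
	{
		printf("%d %d\n", demo_is_quota_modification(ATTR_SIZE),
		       demo_is_quota_modification(0));
		return 0;
	}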
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 1bee604cc6cd..0fc1293d0e96 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -653,8 +653,12 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_printf(seq, ",commit=%u",
 			   (unsigned) (sbi->s_commit_interval / HZ));
 	}
-	if (test_opt(sb, BARRIER))
-		seq_puts(seq, ",barrier=1");
+
+	/*
+	 * Always display barrier state so it's clear what the status is.
+	 */
+	seq_puts(seq, ",barrier=");
+	seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
 	if (test_opt(sb, NOBH))
 		seq_puts(seq, ",nobh");
 
@@ -810,8 +814,8 @@ enum {
 	Opt_data_err_abort, Opt_data_err_ignore,
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
-	Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize,
-	Opt_usrquota, Opt_grpquota
+	Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
+	Opt_resize, Opt_usrquota, Opt_grpquota
 };
 
 static const match_table_t tokens = {
@@ -865,6 +869,8 @@ static const match_table_t tokens = {
 	{Opt_quota, "quota"},
 	{Opt_usrquota, "usrquota"},
 	{Opt_barrier, "barrier=%u"},
+	{Opt_barrier, "barrier"},
+	{Opt_nobarrier, "nobarrier"},
 	{Opt_resize, "resize"},
 	{Opt_err, NULL},
 };
@@ -967,7 +973,11 @@ static int parse_options (char *options, struct super_block *sb,
 		int token;
 		if (!*p)
 			continue;
-
+		/*
+		 * Initialize args struct so we know whether arg was
+		 * found; some options take optional arguments.
+		 */
+		args[0].to = args[0].from = 0;
 		token = match_token(p, tokens, args);
 		switch (token) {
 		case Opt_bsd_df:
@@ -1215,9 +1225,15 @@ set_qf_format:
 		case Opt_abort:
 			set_opt(sbi->s_mount_opt, ABORT);
 			break;
+		case Opt_nobarrier:
+			clear_opt(sbi->s_mount_opt, BARRIER);
+			break;
 		case Opt_barrier:
-			if (match_int(&args[0], &option))
-				return 0;
+			if (args[0].from) {
+				if (match_int(&args[0], &option))
+					return 0;
+			} else
+				option = 1;	/* No argument, default to 1 */
 			if (option)
 				set_opt(sbi->s_mount_opt, BARRIER);
 			else
@@ -1890,21 +1906,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
 
-	err = percpu_counter_init(&sbi->s_freeblocks_counter,
-			ext3_count_free_blocks(sb));
-	if (!err) {
-		err = percpu_counter_init(&sbi->s_freeinodes_counter,
-				ext3_count_free_inodes(sb));
-	}
-	if (!err) {
-		err = percpu_counter_init(&sbi->s_dirs_counter,
-				ext3_count_dirs(sb));
-	}
-	if (err) {
-		ext3_msg(sb, KERN_ERR, "error: insufficient memory");
-		goto failed_mount3;
-	}
-
 	/* per fileystem reservation list head & lock */
 	spin_lock_init(&sbi->s_rsv_window_lock);
 	sbi->s_rsv_window_root = RB_ROOT;
@@ -1945,15 +1946,29 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	if (!test_opt(sb, NOLOAD) &&
 	    EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
 		if (ext3_load_journal(sb, es, journal_devnum))
-			goto failed_mount3;
+			goto failed_mount2;
 	} else if (journal_inum) {
 		if (ext3_create_journal(sb, es, journal_inum))
-			goto failed_mount3;
+			goto failed_mount2;
 	} else {
 		if (!silent)
 			ext3_msg(sb, KERN_ERR,
 				"error: no journal found. "
 				"mounting ext3 over ext2?");
+		goto failed_mount2;
+	}
+	err = percpu_counter_init(&sbi->s_freeblocks_counter,
+			ext3_count_free_blocks(sb));
+	if (!err) {
+		err = percpu_counter_init(&sbi->s_freeinodes_counter,
+				ext3_count_free_inodes(sb));
+	}
+	if (!err) {
+		err = percpu_counter_init(&sbi->s_dirs_counter,
+				ext3_count_dirs(sb));
+	}
+	if (err) {
+		ext3_msg(sb, KERN_ERR, "error: insufficient memory");
 		goto failed_mount3;
 	}
 
@@ -1978,7 +1993,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 			ext3_msg(sb, KERN_ERR,
 				"error: journal does not support "
 				"requested data journaling mode");
-			goto failed_mount4;
+			goto failed_mount3;
 		}
 	default:
 		break;
@@ -2001,19 +2016,19 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	if (IS_ERR(root)) {
 		ext3_msg(sb, KERN_ERR, "error: get root inode failed");
 		ret = PTR_ERR(root);
-		goto failed_mount4;
+		goto failed_mount3;
 	}
 	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
 		iput(root);
 		ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck");
-		goto failed_mount4;
+		goto failed_mount3;
 	}
 	sb->s_root = d_alloc_root(root);
 	if (!sb->s_root) {
 		ext3_msg(sb, KERN_ERR, "error: get root dentry failed");
 		iput(root);
 		ret = -ENOMEM;
-		goto failed_mount4;
+		goto failed_mount3;
 	}
 
 	ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
@@ -2039,12 +2054,11 @@ cantfind_ext3:
 			sb->s_id);
 	goto failed_mount;
 
-failed_mount4:
-	journal_destroy(sbi->s_journal);
 failed_mount3:
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
+	journal_destroy(sbi->s_journal);
 failed_mount2:
 	for (i = 0; i < db_count; i++)
 		brelse(sbi->s_group_desc[i]);
@@ -2278,6 +2292,9 @@ static int ext3_load_journal(struct super_block *sb,
 		return -EINVAL;
 	}
 
+	if (!(journal->j_flags & JFS_BARRIER))
+		printk(KERN_INFO "EXT3-fs: barriers not enabled\n");
+
 	if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
 		err = journal_update_format(journal);
 		if (err) {
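The parse_options() change is the subtle part of the barrier work: "barrier" may now appear with or without "=N", so args[0] is zeroed before match_token() and an absent argument defaults to 1. A userspace sketch of the same decision table, without the match_token() machinery (names invented):

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	/* "nobarrier" clears, bare "barrier" defaults to on, and
	 * "barrier=N" parses N; anything else leaves the flag alone. */
	static int parse_barrier(const char *opt, int barrier)
	{
		if (!strcmp(opt, "nobarrier"))
			return 0;
		if (!strcmp(opt, "barrier"))
			return 1;
		if (!strncmp(opt, "barrier=", 8))
			return atoi(opt + 8) != 0;
		return barrier;
	}

	int main(void)
	{
		printf("%d %d %d\n", parse_barrier("barrier", 0),
		       parse_barrier("barrier=0", 1),
		       parse_barrier("nobarrier", 1));	/* prints 1 0 0 */
		return 0;
	}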
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 534a94c3a933..71fb8d65e54c 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -104,7 +104,7 @@ static int ext3_xattr_list(struct dentry *dentry, char *buffer,
 
 static struct mb_cache *ext3_xattr_cache;
 
-static struct xattr_handler *ext3_xattr_handler_map[] = {
+static const struct xattr_handler *ext3_xattr_handler_map[] = {
 	[EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler,
 #ifdef CONFIG_EXT3_FS_POSIX_ACL
 	[EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler,
@@ -116,7 +116,7 @@ static struct xattr_handler *ext3_xattr_handler_map[] = {
 #endif
 };
 
-struct xattr_handler *ext3_xattr_handlers[] = {
+const struct xattr_handler *ext3_xattr_handlers[] = {
 	&ext3_xattr_user_handler,
 	&ext3_xattr_trusted_handler,
 #ifdef CONFIG_EXT3_FS_POSIX_ACL
@@ -129,10 +129,10 @@ struct xattr_handler *ext3_xattr_handlers[] = {
 	NULL
 };
 
-static inline struct xattr_handler *
+static inline const struct xattr_handler *
 ext3_xattr_handler(int name_index)
 {
-	struct xattr_handler *handler = NULL;
+	const struct xattr_handler *handler = NULL;
 
 	if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map))
 		handler = ext3_xattr_handler_map[name_index];
@@ -338,7 +338,7 @@ ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry,
 	size_t rest = buffer_size;
 
 	for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) {
-		struct xattr_handler *handler =
+		const struct xattr_handler *handler =
 			ext3_xattr_handler(entry->e_name_index);
 
 		if (handler) {
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
index 148a4dfc82ab..377fe7201169 100644
--- a/fs/ext3/xattr.h
+++ b/fs/ext3/xattr.h
@@ -58,11 +58,11 @@ struct ext3_xattr_entry {
 
 # ifdef CONFIG_EXT3_FS_XATTR
 
-extern struct xattr_handler ext3_xattr_user_handler;
-extern struct xattr_handler ext3_xattr_trusted_handler;
-extern struct xattr_handler ext3_xattr_acl_access_handler;
-extern struct xattr_handler ext3_xattr_acl_default_handler;
-extern struct xattr_handler ext3_xattr_security_handler;
+extern const struct xattr_handler ext3_xattr_user_handler;
+extern const struct xattr_handler ext3_xattr_trusted_handler;
+extern const struct xattr_handler ext3_xattr_acl_access_handler;
+extern const struct xattr_handler ext3_xattr_acl_default_handler;
+extern const struct xattr_handler ext3_xattr_security_handler;
 
 extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
 
@@ -76,7 +76,7 @@ extern void ext3_xattr_put_super(struct super_block *);
 extern int init_ext3_xattr(void);
 extern void exit_ext3_xattr(void);
 
-extern struct xattr_handler *ext3_xattr_handlers[];
+extern const struct xattr_handler *ext3_xattr_handlers[];
 
 # else  /* CONFIG_EXT3_FS_XATTR */
 
82 82
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 3af91f476dff..03a99bfc59f9 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -69,7 +69,7 @@ ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
 	return err;
 }
 
-struct xattr_handler ext3_xattr_security_handler = {
+const struct xattr_handler ext3_xattr_security_handler = {
 	.prefix = XATTR_SECURITY_PREFIX,
 	.list = ext3_xattr_security_list,
 	.get = ext3_xattr_security_get,
diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c
index e5562845ed96..dc8edda9ffe0 100644
--- a/fs/ext3/xattr_trusted.c
+++ b/fs/ext3/xattr_trusted.c
@@ -51,7 +51,7 @@ ext3_xattr_trusted_set(struct dentry *dentry, const char *name,
 			      value, size, flags);
 }
 
-struct xattr_handler ext3_xattr_trusted_handler = {
+const struct xattr_handler ext3_xattr_trusted_handler = {
 	.prefix = XATTR_TRUSTED_PREFIX,
 	.list = ext3_xattr_trusted_list,
 	.get = ext3_xattr_trusted_get,
diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c
index 3bcfe9ee0a68..7a321974d584 100644
--- a/fs/ext3/xattr_user.c
+++ b/fs/ext3/xattr_user.c
@@ -54,7 +54,7 @@ ext3_xattr_user_set(struct dentry *dentry, const char *name,
 			      name, value, size, flags);
 }
 
-struct xattr_handler ext3_xattr_user_handler = {
+const struct xattr_handler ext3_xattr_user_handler = {
 	.prefix = XATTR_USER_PREFIX,
 	.list = ext3_xattr_user_list,
 	.get = ext3_xattr_user_get,
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 8a2a29d35a6f..feaf498feaa6 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -454,7 +454,7 @@ release_and_out:
 	return error;
 }
 
-struct xattr_handler ext4_xattr_acl_access_handler = {
+const struct xattr_handler ext4_xattr_acl_access_handler = {
 	.prefix = POSIX_ACL_XATTR_ACCESS,
 	.flags = ACL_TYPE_ACCESS,
 	.list = ext4_xattr_list_acl_access,
@@ -462,7 +462,7 @@ struct xattr_handler ext4_xattr_acl_access_handler = {
 	.set = ext4_xattr_set_acl,
 };
 
-struct xattr_handler ext4_xattr_acl_default_handler = {
+const struct xattr_handler ext4_xattr_acl_default_handler = {
 	.prefix = POSIX_ACL_XATTR_DEFAULT,
 	.flags = ACL_TYPE_DEFAULT,
 	.list = ext4_xattr_list_acl_default,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 0d0c3239c1cd..ef3d980e67cb 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -100,9 +100,11 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 		if (ext4_should_writeback_data(inode) &&
 		    (journal->j_fs_dev != journal->j_dev) &&
 		    (journal->j_flags & JBD2_BARRIER))
-			blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+			blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
+					NULL, BLKDEV_IFL_WAIT);
 		jbd2_log_wait_commit(journal, commit_tid);
 	} else if (journal->j_flags & JBD2_BARRIER)
-		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
+				BLKDEV_IFL_WAIT);
 	return ret;
 }
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 57f6eef6ccd6..1a0e183a2f04 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -979,16 +979,12 @@ got:
 		atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
 	}
 
-	inode->i_uid = current_fsuid();
-	if (test_opt(sb, GRPID))
+	if (test_opt(sb, GRPID)) {
+		inode->i_mode = mode;
+		inode->i_uid = current_fsuid();
 		inode->i_gid = dir->i_gid;
-	else if (dir->i_mode & S_ISGID) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
 	} else
-		inode->i_gid = current_fsgid();
-	inode->i_mode = mode;
+		inode_init_owner(inode, dir, mode);
 
 	inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
 	/* This is the optimal IO size (for stat), not the fs block size */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 81d605412844..3e0f6af9d08d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5425,7 +5425,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 	if (error)
 		return error;
 
-	if (ia_valid & ATTR_SIZE)
+	if (is_quota_modification(inode, attr))
 		dquot_initialize(inode);
 	if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
 	    (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index b4c5aa8489d8..2de0e9515089 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -97,7 +97,7 @@ static int ext4_xattr_list(struct dentry *dentry, char *buffer,
 
 static struct mb_cache *ext4_xattr_cache;
 
-static struct xattr_handler *ext4_xattr_handler_map[] = {
+static const struct xattr_handler *ext4_xattr_handler_map[] = {
 	[EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
 	[EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler,
@@ -109,7 +109,7 @@ static struct xattr_handler *ext4_xattr_handler_map[] = {
 #endif
 };
 
-struct xattr_handler *ext4_xattr_handlers[] = {
+const struct xattr_handler *ext4_xattr_handlers[] = {
 	&ext4_xattr_user_handler,
 	&ext4_xattr_trusted_handler,
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
@@ -122,10 +122,10 @@ struct xattr_handler *ext4_xattr_handlers[] = {
 	NULL
 };
 
-static inline struct xattr_handler *
+static inline const struct xattr_handler *
 ext4_xattr_handler(int name_index)
 {
-	struct xattr_handler *handler = NULL;
+	const struct xattr_handler *handler = NULL;
 
 	if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map))
 		handler = ext4_xattr_handler_map[name_index];
@@ -332,7 +332,7 @@ ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry,
 	size_t rest = buffer_size;
 
 	for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
-		struct xattr_handler *handler =
+		const struct xattr_handler *handler =
 			ext4_xattr_handler(entry->e_name_index);
 
 		if (handler) {
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 8ede88b18c29..518e96e43905 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -65,11 +65,11 @@ struct ext4_xattr_entry {
 
 # ifdef CONFIG_EXT4_FS_XATTR
 
-extern struct xattr_handler ext4_xattr_user_handler;
-extern struct xattr_handler ext4_xattr_trusted_handler;
-extern struct xattr_handler ext4_xattr_acl_access_handler;
-extern struct xattr_handler ext4_xattr_acl_default_handler;
-extern struct xattr_handler ext4_xattr_security_handler;
+extern const struct xattr_handler ext4_xattr_user_handler;
+extern const struct xattr_handler ext4_xattr_trusted_handler;
+extern const struct xattr_handler ext4_xattr_acl_access_handler;
+extern const struct xattr_handler ext4_xattr_acl_default_handler;
+extern const struct xattr_handler ext4_xattr_security_handler;
 
 extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
 
@@ -86,7 +86,7 @@ extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 extern int init_ext4_xattr(void);
 extern void exit_ext4_xattr(void);
 
-extern struct xattr_handler *ext4_xattr_handlers[];
+extern const struct xattr_handler *ext4_xattr_handlers[];
 
 # else  /* CONFIG_EXT4_FS_XATTR */
 
92 92
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 8b145e98df07..9b21268e121c 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -69,7 +69,7 @@ ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
 	return err;
 }
 
-struct xattr_handler ext4_xattr_security_handler = {
+const struct xattr_handler ext4_xattr_security_handler = {
 	.prefix = XATTR_SECURITY_PREFIX,
 	.list = ext4_xattr_security_list,
 	.get = ext4_xattr_security_get,
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index 15b50edc6587..37e6ebca2cc3 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -51,7 +51,7 @@ ext4_xattr_trusted_set(struct dentry *dentry, const char *name,
 			      name, value, size, flags);
 }
 
-struct xattr_handler ext4_xattr_trusted_handler = {
+const struct xattr_handler ext4_xattr_trusted_handler = {
 	.prefix = XATTR_TRUSTED_PREFIX,
 	.list = ext4_xattr_trusted_list,
 	.get = ext4_xattr_trusted_get,
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index c4ce05746ce1..98c375352d0e 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -54,7 +54,7 @@ ext4_xattr_user_set(struct dentry *dentry, const char *name,
 			      name, value, size, flags);
 }
 
-struct xattr_handler ext4_xattr_user_handler = {
+const struct xattr_handler ext4_xattr_user_handler = {
 	.prefix = XATTR_USER_PREFIX,
 	.list = ext4_xattr_user_list,
 	.get = ext4_xattr_user_get,
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 113f0a1e565d..ae8200f84e39 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -242,9 +242,10 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
 	while (*fclus < cluster) {
 		/* prevent the infinite loop of cluster chain */
 		if (*fclus > limit) {
-			fat_fs_error(sb, "%s: detected the cluster chain loop"
-				     " (i_pos %lld)", __func__,
-				     MSDOS_I(inode)->i_pos);
+			fat_fs_error_ratelimit(sb,
+					"%s: detected the cluster chain loop"
+					" (i_pos %lld)", __func__,
+					MSDOS_I(inode)->i_pos);
 			nr = -EIO;
 			goto out;
 		}
@@ -253,9 +254,9 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
 		if (nr < 0)
 			goto out;
 		else if (nr == FAT_ENT_FREE) {
-			fat_fs_error(sb, "%s: invalid cluster chain"
-				     " (i_pos %lld)", __func__,
-				     MSDOS_I(inode)->i_pos);
+			fat_fs_error_ratelimit(sb, "%s: invalid cluster chain"
+					       " (i_pos %lld)", __func__,
+					       MSDOS_I(inode)->i_pos);
 			nr = -EIO;
 			goto out;
 		} else if (nr == FAT_ENT_EOF) {
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 530b4ca01510..ee42b9e0b16a 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -19,6 +19,7 @@
 #include <linux/buffer_head.h>
 #include <linux/compat.h>
 #include <asm/uaccess.h>
+#include <linux/kernel.h>
 #include "fat.h"
 
 /*
@@ -140,28 +141,22 @@ static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len,
 {
 	const wchar_t *ip;
 	wchar_t ec;
-	unsigned char *op, nc;
+	unsigned char *op;
 	int charlen;
-	int k;
 
 	ip = uni;
 	op = ascii;
 
 	while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) {
 		ec = *ip++;
-		if ( (charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) {
+		if ((charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) {
 			op += charlen;
 			len -= charlen;
 		} else {
 			if (uni_xlate == 1) {
-				*op = ':';
-				for (k = 4; k > 0; k--) {
-					nc = ec & 0xF;
-					op[k] = nc > 9 ? nc + ('a' - 10)
-							: nc + '0';
-					ec >>= 4;
-				}
-				op += 5;
+				*op++ = ':';
+				op = pack_hex_byte(op, ec >> 8);
+				op = pack_hex_byte(op, ec);
 				len -= 5;
 			} else {
 				*op++ = '?';
@@ -758,9 +753,10 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *filp,
 	return ret;
 }
 
-static int fat_dir_ioctl(struct inode *inode, struct file *filp,
-			 unsigned int cmd, unsigned long arg)
+static long fat_dir_ioctl(struct file *filp, unsigned int cmd,
+			  unsigned long arg)
 {
+	struct inode *inode = filp->f_path.dentry->d_inode;
 	struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg;
 	int short_only, both;
 
@@ -774,7 +770,7 @@ static int fat_dir_ioctl(struct inode *inode, struct file *filp,
 		both = 1;
 		break;
 	default:
-		return fat_generic_ioctl(inode, filp, cmd, arg);
+		return fat_generic_ioctl(filp, cmd, arg);
 	}
 
 	if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2])))
@@ -814,7 +810,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd,
 		both = 1;
 		break;
 	default:
-		return -ENOIOCTLCMD;
+		return fat_generic_ioctl(filp, cmd, (unsigned long)arg);
 	}
 
 	if (!access_ok(VERIFY_WRITE, d1, sizeof(struct compat_dirent[2])))
@@ -836,7 +832,7 @@ const struct file_operations fat_dir_operations = {
 	.llseek = generic_file_llseek,
 	.read = generic_read_dir,
 	.readdir = fat_readdir,
-	.ioctl = fat_dir_ioctl,
+	.unlocked_ioctl = fat_dir_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl = fat_compat_dir_ioctl,
 #endif
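The rewritten escape path in uni16_to_x8() emits an untranslatable 16-bit character as ":xxxx", five bytes, which is why the unchanged "len -= 5" still holds. pack_hex_byte() (from linux/kernel.h, hence the new include) writes one byte as two lowercase hex digits and returns the advanced pointer; a userspace equivalent:

	#include <stdio.h>

	static unsigned char *demo_pack_hex_byte(unsigned char *buf,
						 unsigned char byte)
	{
		static const char hex[] = "0123456789abcdef";

		*buf++ = hex[byte >> 4];
		*buf++ = hex[byte & 0x0f];
		return buf;
	}

	int main(void)
	{
		unsigned short ec = 0x263a;	/* sample untranslatable char */
		unsigned char out[6], *op = out;

		*op++ = ':';
		op = demo_pack_hex_byte(op, ec >> 8);
		op = demo_pack_hex_byte(op, ec & 0xff);
		*op = '\0';
		printf("%s\n", (char *)out);	/* prints ":263a" */
		return 0;
	}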
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index e6efdfa0f6db..53dba57b49a1 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -6,6 +6,7 @@
 #include <linux/nls.h>
 #include <linux/fs.h>
 #include <linux/mutex.h>
+#include <linux/ratelimit.h>
 #include <linux/msdos_fs.h>
 
 /*
@@ -82,6 +83,8 @@ struct msdos_sb_info {
 	struct fatent_operations *fatent_ops;
 	struct inode *fat_inode;
 
+	struct ratelimit_state ratelimit;
+
 	spinlock_t inode_hash_lock;
 	struct hlist_head inode_hashtable[FAT_HASH_SIZE];
 };
@@ -298,8 +301,8 @@ extern int fat_free_clusters(struct inode *inode, int cluster);
 extern int fat_count_free_clusters(struct super_block *sb);
 
 /* fat/file.c */
-extern int fat_generic_ioctl(struct inode *inode, struct file *filp,
-			     unsigned int cmd, unsigned long arg);
+extern long fat_generic_ioctl(struct file *filp, unsigned int cmd,
+			      unsigned long arg);
 extern const struct file_operations fat_file_operations;
 extern const struct inode_operations fat_file_inode_operations;
 extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
@@ -322,8 +325,13 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent,
 extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
 			    struct inode *i2);
 /* fat/misc.c */
-extern void fat_fs_error(struct super_block *s, const char *fmt, ...)
-	__attribute__ ((format (printf, 2, 3))) __cold;
+extern void
+__fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
+	__attribute__ ((format (printf, 3, 4))) __cold;
+#define fat_fs_error(s, fmt, args...) \
+	__fat_fs_error(s, 1, fmt , ## args)
+#define fat_fs_error_ratelimit(s, fmt, args...) \
+	__fat_fs_error(s, __ratelimit(&MSDOS_SB(s)->ratelimit), fmt , ## args)
 extern int fat_clusters_flush(struct super_block *sb);
 extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
 extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
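The two new macros gate reporting on __ratelimit(&MSDOS_SB(s)->ratelimit), which allows a burst of messages per interval and suppresses the rest. A toy userspace version of that interval/burst scheme (simplified; the kernel version also tracks how many messages it suppressed):

	#include <stdio.h>
	#include <time.h>

	struct demo_ratelimit {
		time_t begin;
		int interval;	/* seconds, cf. DEFAULT_RATELIMIT_INTERVAL */
		int burst;	/* cf. DEFAULT_RATELIMIT_BURST */
		int printed;
	};

	/* Returns 1 if the caller may emit its message, 0 to suppress. */
	static int demo_ratelimit(struct demo_ratelimit *rs)
	{
		time_t now = time(NULL);

		if (now - rs->begin >= rs->interval) {
			rs->begin = now;	/* window expired: reset */
			rs->printed = 0;
		}
		if (rs->printed < rs->burst) {
			rs->printed++;
			return 1;
		}
		return 0;
	}

	int main(void)
	{
		struct demo_ratelimit rs = { .interval = 5, .burst = 2 };

		for (int i = 0; i < 4; i++)
			if (demo_ratelimit(&rs))
				printf("error %d reported\n", i);
		return 0;	/* only errors 0 and 1 are reported */
	}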
diff --git a/fs/fat/file.c b/fs/fat/file.c
index e8c159de236b..a14c2f6a489e 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -8,6 +8,7 @@
 
 #include <linux/capability.h>
 #include <linux/module.h>
+#include <linux/compat.h>
 #include <linux/mount.h>
 #include <linux/time.h>
 #include <linux/buffer_head.h>
@@ -114,9 +115,9 @@ out:
 	return err;
 }
 
-int fat_generic_ioctl(struct inode *inode, struct file *filp,
-		      unsigned int cmd, unsigned long arg)
+long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
+	struct inode *inode = filp->f_path.dentry->d_inode;
 	u32 __user *user_attr = (u32 __user *)arg;
 
 	switch (cmd) {
@@ -129,6 +130,15 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
 	}
 }
 
+#ifdef CONFIG_COMPAT
+static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd,
+				      unsigned long arg)
+
+{
+	return fat_generic_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
+
 static int fat_file_release(struct inode *inode, struct file *filp)
 {
 	if ((filp->f_mode & FMODE_WRITE) &&
@@ -159,7 +169,10 @@ const struct file_operations fat_file_operations = {
 	.aio_write = generic_file_aio_write,
 	.mmap = generic_file_mmap,
 	.release = fat_file_release,
-	.ioctl = fat_generic_ioctl,
+	.unlocked_ioctl = fat_generic_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = fat_generic_compat_ioctl,
+#endif
 	.fsync = fat_file_fsync,
 	.splice_read = generic_file_splice_read,
 };
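The fat_generic_ioctl() conversion follows the standard recipe for dropping the BKL-taking .ioctl hook: the unlocked entry point derives the inode from the file itself and does its own locking, and the compat entry point only rewrites the 32-bit pointer argument. A generic skeleton of that recipe (demo_* names are placeholders, not FAT code):

	/* Sketch of the conversion pattern; any needed locking must now be
	 * done per command inside the handler itself. */
	static long demo_unlocked_ioctl(struct file *filp, unsigned int cmd,
					unsigned long arg)
	{
		struct inode *inode = filp->f_path.dentry->d_inode;

		if (!inode)
			return -EBADF;
		/* ... per-command handling using inode ... */
		return -ENOTTY;
	}

	#ifdef CONFIG_COMPAT
	static long demo_compat_ioctl(struct file *filp, unsigned int cmd,
				      unsigned long arg)
	{
		/* compat_ptr() converts a 32-bit user pointer for 64-bit use */
		return demo_unlocked_ioctl(filp, cmd,
					   (unsigned long)compat_ptr(arg));
	}
	#endif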
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 0ce143bd7d56..ed33904926ee 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1250,6 +1250,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
 	sb->s_op = &fat_sops;
 	sb->s_export_op = &fat_export_ops;
 	sbi->dir_ops = fs_dir_inode_ops;
+	ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL,
+			     DEFAULT_RATELIMIT_BURST);
 
 	error = parse_options(data, isvfat, silent, &debug, &sbi->options);
 	if (error)
@@ -1497,10 +1499,8 @@ out_fail:
 	iput(fat_inode);
 	if (root_inode)
 		iput(root_inode);
-	if (sbi->nls_io)
-		unload_nls(sbi->nls_io);
-	if (sbi->nls_disk)
-		unload_nls(sbi->nls_disk);
+	unload_nls(sbi->nls_io);
+	unload_nls(sbi->nls_disk);
 	if (sbi->options.iocharset != fat_default_iocharset)
 		kfree(sbi->options.iocharset);
 	sb->s_fs_info = NULL;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index d3da05f26465..1fa23f6ffba5 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -20,27 +20,29 @@
  * In case the file system is remounted read-only, it can be made writable
  * again by remounting it.
  */
-void fat_fs_error(struct super_block *s, const char *fmt, ...)
+void __fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
 {
 	struct fat_mount_options *opts = &MSDOS_SB(s)->options;
 	va_list args;
 
-	printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id);
+	if (report) {
+		printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id);
 
-	printk(KERN_ERR "    ");
-	va_start(args, fmt);
-	vprintk(fmt, args);
-	va_end(args);
-	printk("\n");
+		printk(KERN_ERR "    ");
+		va_start(args, fmt);
+		vprintk(fmt, args);
+		va_end(args);
+		printk("\n");
+	}
 
 	if (opts->errors == FAT_ERRORS_PANIC)
-		panic("    FAT fs panic from previous error\n");
+		panic("FAT: fs panic from previous error\n");
 	else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) {
 		s->s_flags |= MS_RDONLY;
-		printk(KERN_ERR "    File system has been set read-only\n");
+		printk(KERN_ERR "FAT: Filesystem has been set read-only\n");
 	}
 }
-EXPORT_SYMBOL_GPL(fat_fs_error);
+EXPORT_SYMBOL_GPL(__fat_fs_error);
 
 /* Flushes the number of free clusters on FAT32 */
 /* XXX: Need to write one per FSINFO block. Currently only writes 1 */
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 452d02f9075e..f74d270ba155 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -14,6 +14,7 @@
14#include <linux/dnotify.h> 14#include <linux/dnotify.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/pipe_fs_i.h>
17#include <linux/security.h> 18#include <linux/security.h>
18#include <linux/ptrace.h> 19#include <linux/ptrace.h>
19#include <linux/signal.h> 20#include <linux/signal.h>
@@ -412,6 +413,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
412 case F_NOTIFY: 413 case F_NOTIFY:
413 err = fcntl_dirnotify(fd, filp, arg); 414 err = fcntl_dirnotify(fd, filp, arg);
414 break; 415 break;
416 case F_SETPIPE_SZ:
417 case F_GETPIPE_SZ:
418 err = pipe_fcntl(filp, cmd, arg);
419 break;
415 default: 420 default:
416 break; 421 break;
417 } 422 }
@@ -614,9 +619,15 @@ int send_sigurg(struct fown_struct *fown)
614 return ret; 619 return ret;
615} 620}
616 621
617static DEFINE_RWLOCK(fasync_lock); 622static DEFINE_SPINLOCK(fasync_lock);
618static struct kmem_cache *fasync_cache __read_mostly; 623static struct kmem_cache *fasync_cache __read_mostly;
619 624
625static void fasync_free_rcu(struct rcu_head *head)
626{
627 kmem_cache_free(fasync_cache,
628 container_of(head, struct fasync_struct, fa_rcu));
629}
630
620/* 631/*
621 * Remove a fasync entry. If successfully removed, return 632 * Remove a fasync entry. If successfully removed, return
622 * positive and clear the FASYNC flag. If no entry exists, 633 * positive and clear the FASYNC flag. If no entry exists,
@@ -625,8 +636,6 @@ static struct kmem_cache *fasync_cache __read_mostly;
  * NOTE! It is very important that the FASYNC flag always
  * match the state "is the filp on a fasync list".
  *
- * We always take the 'filp->f_lock', in since fasync_lock
- * needs to be irq-safe.
  */
 static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
 {
@@ -634,17 +643,22 @@ static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
 	int result = 0;
 
 	spin_lock(&filp->f_lock);
-	write_lock_irq(&fasync_lock);
+	spin_lock(&fasync_lock);
 	for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
 		if (fa->fa_file != filp)
 			continue;
+
+		spin_lock_irq(&fa->fa_lock);
+		fa->fa_file = NULL;
+		spin_unlock_irq(&fa->fa_lock);
+
 		*fp = fa->fa_next;
-		kmem_cache_free(fasync_cache, fa);
+		call_rcu(&fa->fa_rcu, fasync_free_rcu);
 		filp->f_flags &= ~FASYNC;
 		result = 1;
 		break;
 	}
-	write_unlock_irq(&fasync_lock);
+	spin_unlock(&fasync_lock);
 	spin_unlock(&filp->f_lock);
 	return result;
 }
@@ -666,25 +680,30 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa
 		return -ENOMEM;
 
 	spin_lock(&filp->f_lock);
-	write_lock_irq(&fasync_lock);
+	spin_lock(&fasync_lock);
 	for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
 		if (fa->fa_file != filp)
 			continue;
+
+		spin_lock_irq(&fa->fa_lock);
 		fa->fa_fd = fd;
+		spin_unlock_irq(&fa->fa_lock);
+
 		kmem_cache_free(fasync_cache, new);
 		goto out;
 	}
 
+	spin_lock_init(&new->fa_lock);
 	new->magic = FASYNC_MAGIC;
 	new->fa_file = filp;
 	new->fa_fd = fd;
 	new->fa_next = *fapp;
-	*fapp = new;
+	rcu_assign_pointer(*fapp, new);
 	result = 1;
 	filp->f_flags |= FASYNC;
 
 out:
-	write_unlock_irq(&fasync_lock);
+	spin_unlock(&fasync_lock);
 	spin_unlock(&filp->f_lock);
 	return result;
 }
@@ -704,37 +723,41 @@ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fap
 
 EXPORT_SYMBOL(fasync_helper);
 
-void __kill_fasync(struct fasync_struct *fa, int sig, int band)
+/*
+ * rcu_read_lock() is held
+ */
+static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
 {
 	while (fa) {
-		struct fown_struct * fown;
+		struct fown_struct *fown;
 		if (fa->magic != FASYNC_MAGIC) {
 			printk(KERN_ERR "kill_fasync: bad magic number in "
 			       "fasync_struct!\n");
 			return;
 		}
-		fown = &fa->fa_file->f_owner;
-		/* Don't send SIGURG to processes which have not set a
-		   queued signum: SIGURG has its own default signalling
-		   mechanism. */
-		if (!(sig == SIGURG && fown->signum == 0))
-			send_sigio(fown, fa->fa_fd, band);
-		fa = fa->fa_next;
+		spin_lock(&fa->fa_lock);
+		if (fa->fa_file) {
+			fown = &fa->fa_file->f_owner;
+			/* Don't send SIGURG to processes which have not set a
+			   queued signum: SIGURG has its own default signalling
+			   mechanism. */
+			if (!(sig == SIGURG && fown->signum == 0))
+				send_sigio(fown, fa->fa_fd, band);
+		}
+		spin_unlock(&fa->fa_lock);
+		fa = rcu_dereference(fa->fa_next);
 	}
 }
 
-EXPORT_SYMBOL(__kill_fasync);
-
 void kill_fasync(struct fasync_struct **fp, int sig, int band)
 {
 	/* First a quick test without locking: usually
 	 * the list is empty.
 	 */
 	if (*fp) {
-		read_lock(&fasync_lock);
-		/* reread *fp after obtaining the lock */
-		__kill_fasync(*fp, sig, band);
-		read_unlock(&fasync_lock);
+		rcu_read_lock();
+		kill_fasync_rcu(rcu_dereference(*fp), sig, band);
+		rcu_read_unlock();
 	}
 }
 EXPORT_SYMBOL(kill_fasync);
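
The net effect of the fcntl.c change: kill_fasync() no longer takes a global rwlock on every signal-driven I/O event. Readers walk the list under rcu_read_lock() and take only the per-entry fa_lock, and entries are freed through call_rcu(), so a concurrent reader can never touch recycled memory. A simplified read-side sketch (illustrative only; it omits the magic-number and SIGURG checks that the real kill_fasync_rcu() keeps):

	struct fasync_struct *fa;

	rcu_read_lock();
	for (fa = rcu_dereference(*fp); fa; fa = rcu_dereference(fa->fa_next)) {
		spin_lock(&fa->fa_lock);
		if (fa->fa_file)	/* NULL once fasync_remove_entry() detached it */
			send_sigio(&fa->fa_file->f_owner, fa->fa_fd, band);
		spin_unlock(&fa->fa_lock);
	}
	rcu_read_unlock();
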
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 4b37f7cea4dd..ea8592b90696 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -42,9 +42,10 @@ struct wb_writeback_args {
 	long nr_pages;
 	struct super_block *sb;
 	enum writeback_sync_modes sync_mode;
-	int for_kupdate:1;
-	int range_cyclic:1;
-	int for_background:1;
+	unsigned int for_kupdate:1;
+	unsigned int range_cyclic:1;
+	unsigned int for_background:1;
+	unsigned int sb_pinned:1;
 };
 
 /*
50/* 51/*
@@ -192,7 +193,8 @@ static void bdi_wait_on_work_clear(struct bdi_work *work)
 }
 
 static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
-				 struct wb_writeback_args *args)
+				 struct wb_writeback_args *args,
+				 int wait)
 {
 	struct bdi_work *work;
 
@@ -204,6 +206,8 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
 	if (work) {
 		bdi_work_init(work, args);
 		bdi_queue_work(bdi, work);
+		if (wait)
+			bdi_wait_on_work_clear(work);
 	} else {
 		struct bdi_writeback *wb = &bdi->wb;
 
@@ -230,6 +234,11 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
 		.sync_mode	= WB_SYNC_ALL,
 		.nr_pages	= LONG_MAX,
 		.range_cyclic	= 0,
+		/*
+		 * Setting sb_pinned is not necessary for WB_SYNC_ALL, but
+		 * lets make it explicitly clear.
+		 */
+		.sb_pinned	= 1,
 	};
 	struct bdi_work work;
 
@@ -245,21 +254,23 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
  * @bdi: the backing device to write from
  * @sb: write inodes from this super_block
  * @nr_pages: the number of pages to write
+ * @sb_locked: caller already holds sb umount sem.
  *
  * Description:
  *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
  *   started when this function returns, we make no guarentees on
- *   completion. Caller need not hold sb s_umount semaphore.
+ *   completion. Caller specifies whether sb umount sem is held already or not.
  *
  */
 void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
-			 long nr_pages)
+			 long nr_pages, int sb_locked)
 {
 	struct wb_writeback_args args = {
 		.sb		= sb,
 		.sync_mode	= WB_SYNC_NONE,
 		.nr_pages	= nr_pages,
 		.range_cyclic	= 1,
+		.sb_pinned	= sb_locked,
 	};
 
 	/*
@@ -271,7 +282,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
 		args.for_background = 1;
 	}
 
-	bdi_alloc_queue_work(bdi, &args);
+	bdi_alloc_queue_work(bdi, &args, sb_locked);
 }
 
 /*
@@ -398,11 +409,11 @@ static void inode_wait_for_writeback(struct inode *inode)
 	wait_queue_head_t *wqh;
 
 	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
-	do {
+	while (inode->i_state & I_SYNC) {
 		spin_unlock(&inode_lock);
 		__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
 		spin_lock(&inode_lock);
-	} while (inode->i_state & I_SYNC);
+	}
 }
 
 /*
@@ -452,11 +463,9 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 
 	BUG_ON(inode->i_state & I_SYNC);
 
-	/* Set I_SYNC, reset I_DIRTY */
-	dirty = inode->i_state & I_DIRTY;
+	/* Set I_SYNC, reset I_DIRTY_PAGES */
 	inode->i_state |= I_SYNC;
-	inode->i_state &= ~I_DIRTY;
-
+	inode->i_state &= ~I_DIRTY_PAGES;
 	spin_unlock(&inode_lock);
 
 	ret = do_writepages(mapping, wbc);
@@ -472,6 +481,15 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 		ret = err;
 	}
 
+	/*
+	 * Some filesystems may redirty the inode during the writeback
+	 * due to delalloc, clear dirty metadata flags right before
+	 * write_inode()
+	 */
+	spin_lock(&inode_lock);
+	dirty = inode->i_state & I_DIRTY;
+	inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+	spin_unlock(&inode_lock);
 	/* Don't write the inode if only I_DIRTY_PAGES was set */
 	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
 		int err = write_inode(inode, wbc);
@@ -577,7 +595,7 @@ static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc,
 	/*
 	 * Caller must already hold the ref for this
 	 */
-	if (wbc->sync_mode == WB_SYNC_ALL) {
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) {
 		WARN_ON(!rwsem_is_locked(&sb->s_umount));
 		return SB_NOT_PINNED;
 	}
@@ -751,6 +769,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 		.for_kupdate		= args->for_kupdate,
 		.for_background		= args->for_background,
 		.range_cyclic		= args->range_cyclic,
+		.sb_pinned		= args->sb_pinned,
 	};
 	unsigned long oldest_jif;
 	long wrote = 0;
@@ -852,6 +871,12 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
 	unsigned long expired;
 	long nr_pages;
 
+	/*
+	 * When set to zero, disable periodic writeback
+	 */
+	if (!dirty_writeback_interval)
+		return 0;
+
 	expired = wb->last_old_flush +
 			msecs_to_jiffies(dirty_writeback_interval * 10);
 	if (time_before(jiffies, expired))
@@ -887,6 +912,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 
 	while ((work = get_next_work_item(bdi, wb)) != NULL) {
 		struct wb_writeback_args args = work->args;
+		int post_clear;
 
 		/*
 		 * Override sync mode, in case we must wait for completion
@@ -894,11 +920,13 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 		if (force_wait)
 			work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
 
+		post_clear = WB_SYNC_ALL || args.sb_pinned;
+
 		/*
 		 * If this isn't a data integrity operation, just notify
 		 * that we have seen this work and we are now starting it.
 		 */
-		if (args.sync_mode == WB_SYNC_NONE)
+		if (!post_clear)
 			wb_clear_pending(wb, work);
 
 		wrote += wb_writeback(wb, &args);
@@ -907,7 +935,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 		 * This is a data integrity writeback, so only do the
 		 * notification when we have completed the work.
 		 */
-		if (args.sync_mode == WB_SYNC_ALL)
+		if (post_clear)
 			wb_clear_pending(wb, work);
 	}
 
@@ -947,8 +975,17 @@ int bdi_writeback_task(struct bdi_writeback *wb)
 			break;
 		}
 
-		wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
-		schedule_timeout_interruptible(wait_jiffies);
+		if (dirty_writeback_interval) {
+			wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
+			schedule_timeout_interruptible(wait_jiffies);
+		} else {
+			set_current_state(TASK_INTERRUPTIBLE);
+			if (list_empty_careful(&wb->bdi->work_list) &&
+			    !kthread_should_stop())
+				schedule();
+			__set_current_state(TASK_RUNNING);
+		}
+
 		try_to_freeze();
 	}
 
@@ -974,7 +1011,7 @@ static void bdi_writeback_all(struct super_block *sb, long nr_pages)
 		if (!bdi_has_dirty_io(bdi))
 			continue;
 
-		bdi_alloc_queue_work(bdi, &args);
+		bdi_alloc_queue_work(bdi, &args, 0);
 	}
 
 	rcu_read_unlock();
@@ -1183,6 +1220,18 @@ static void wait_sb_inodes(struct super_block *sb)
 		iput(old_inode);
 }
 
+static void __writeback_inodes_sb(struct super_block *sb, int sb_locked)
+{
+	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
+	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+	long nr_to_write;
+
+	nr_to_write = nr_dirty + nr_unstable +
+			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
+
+	bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked);
+}
+
 /**
  * writeback_inodes_sb	-	writeback dirty inodes from given super_block
  * @sb: the superblock
@@ -1194,18 +1243,23 @@ static void wait_sb_inodes(struct super_block *sb)
  */
 void writeback_inodes_sb(struct super_block *sb)
 {
-	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
-	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
-	long nr_to_write;
-
-	nr_to_write = nr_dirty + nr_unstable +
-			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
-
-	bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
+	__writeback_inodes_sb(sb, 0);
 }
 EXPORT_SYMBOL(writeback_inodes_sb);
 
 /**
+ * writeback_inodes_sb_locked	- writeback dirty inodes from given super_block
+ * @sb: the superblock
+ *
+ * Like writeback_inodes_sb(), except the caller already holds the
+ * sb umount sem.
+ */
+void writeback_inodes_sb_locked(struct super_block *sb)
+{
+	__writeback_inodes_sb(sb, 1);
+}
+
+/**
  * writeback_inodes_sb_if_idle	-	start writeback if none underway
  * @sb: the superblock
  *
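
A minimal caller sketch for the new entry point (hypothetical function; it assumes the caller already holds sb->s_umount, as the kerneldoc above requires):

	static void example_flush_locked(struct super_block *sb)
	{
		WARN_ON(!rwsem_is_locked(&sb->s_umount));
		writeback_inodes_sb_locked(sb);
	}

Because sb_locked is passed through as the wait argument to bdi_alloc_queue_work(), this variant also blocks in bdi_wait_on_work_clear() rather than returning as soon as the work is queued.
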
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index fe5df5457656..99800e564157 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -201,7 +201,7 @@ generic_check_acl(struct inode *inode, int mask)
 		return -EAGAIN;
 }
 
-struct xattr_handler generic_acl_access_handler = {
+const struct xattr_handler generic_acl_access_handler = {
 	.prefix = POSIX_ACL_XATTR_ACCESS,
 	.flags	= ACL_TYPE_ACCESS,
 	.list	= generic_acl_list,
@@ -209,7 +209,7 @@ struct xattr_handler generic_acl_access_handler = {
 	.set	= generic_acl_set,
 };
 
-struct xattr_handler generic_acl_default_handler = {
+const struct xattr_handler generic_acl_default_handler = {
 	.prefix = POSIX_ACL_XATTR_DEFAULT,
 	.flags	= ACL_TYPE_DEFAULT,
 	.list	= generic_acl_list,
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 87ee309d4c24..48171f4c943d 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -236,10 +236,14 @@ static int gfs2_xattr_system_get(struct dentry *dentry, const char *name,
 			     void *buffer, size_t size, int xtype)
 {
 	struct inode *inode = dentry->d_inode;
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct posix_acl *acl;
 	int type;
 	int error;
 
+	if (!sdp->sd_args.ar_posix_acl)
+		return -EOPNOTSUPP;
+
 	type = gfs2_acl_type(name);
 	if (type < 0)
 		return type;
@@ -335,7 +339,7 @@ out:
 	return error;
 }
 
-struct xattr_handler gfs2_xattr_system_handler = {
+const struct xattr_handler gfs2_xattr_system_handler = {
 	.prefix = XATTR_SYSTEM_PREFIX,
 	.flags  = GFS2_EATYPE_SYS,
 	.get    = gfs2_xattr_system_get,
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 9306a2e6620c..b522b0cb39ea 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -19,6 +19,6 @@
 extern int gfs2_check_acl(struct inode *inode, int mask);
 extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode);
 extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
-extern struct xattr_handler gfs2_xattr_system_handler;
+extern const struct xattr_handler gfs2_xattr_system_handler;
 
 #endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 0c1d0b82dcf1..a739a0a48067 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -418,6 +418,7 @@ static int gfs2_jdata_writepages(struct address_space *mapping,
 static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
 {
 	struct buffer_head *dibh;
+	u64 dsize = i_size_read(&ip->i_inode);
 	void *kaddr;
 	int error;
 
@@ -437,9 +438,10 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
 		return error;
 
 	kaddr = kmap_atomic(page, KM_USER0);
-	memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
-	       ip->i_disksize);
-	memset(kaddr + ip->i_disksize, 0, PAGE_CACHE_SIZE - ip->i_disksize);
+	if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
+		dsize = (dibh->b_size - sizeof(struct gfs2_dinode));
+	memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
+	memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize);
 	kunmap_atomic(kaddr, KM_USER0);
 	flush_dcache_page(page);
 	brelse(dibh);
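
Both this hunk and the matching one in bmap.c below clamp the copy length for stuffed inodes to what the dinode block can actually hold, so a bogus on-disk size can no longer overrun the buffer. A compact equivalent of the open-coded test (illustrative sketch, not part of the patch):

	static u64 stuffed_copy_size(const struct gfs2_inode *ip,
				     const struct buffer_head *dibh)
	{
		/* never copy past the end of the on-disk dinode block */
		return min_t(u64, i_size_read(&ip->i_inode),
			     dibh->b_size - sizeof(struct gfs2_dinode));
	}
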
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 5e411d5f4697..4a48c0f4b402 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -71,11 +71,13 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
 
 	if (!PageUptodate(page)) {
 		void *kaddr = kmap(page);
+		u64 dsize = i_size_read(inode);
+
+		if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
+			dsize = dibh->b_size - sizeof(struct gfs2_dinode);
 
-		memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
-		       ip->i_disksize);
-		memset(kaddr + ip->i_disksize, 0,
-		       PAGE_CACHE_SIZE - ip->i_disksize);
+		memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
+		memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize);
 		kunmap(page);
 
 		SetPageUptodate(page);
@@ -1038,13 +1040,14 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
 		goto out;
 
 	if (gfs2_is_stuffed(ip)) {
-		ip->i_disksize = size;
+		u64 dsize = size + sizeof(struct gfs2_inode);
 		ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 		gfs2_dinode_out(ip, dibh->b_data);
-		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size);
+		if (dsize > dibh->b_size)
+			dsize = dibh->b_size;
+		gfs2_buffer_clear_tail(dibh, dsize);
 		error = 1;
-
 	} else {
 		if (size & (u64)(sdp->sd_sb.sb_bsize - 1))
 			error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 25fddc100f18..8295c5b5d4a9 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1475,7 +1475,7 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name)
 			inode = gfs2_inode_lookup(dir->i_sb,
 				be16_to_cpu(dent->de_type),
 				be64_to_cpu(dent->de_inum.no_addr),
-				be64_to_cpu(dent->de_inum.no_formal_ino), 0);
+				be64_to_cpu(dent->de_inum.no_formal_ino));
 			brelse(bh);
 			return inode;
 		}
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index c22c21174833..dfe237a3f8ad 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -168,7 +168,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
 	if (error)
 		goto fail;
 
-	inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0, 0);
+	inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0);
 	if (IS_ERR(inode)) {
 		error = PTR_ERR(inode);
 		goto fail;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index e6dd2aec6f82..b20bfcc9fa2d 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -218,6 +218,11 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
 	if (error)
 		goto out_drop_write;
 
+	error = -EACCES;
+	if (!is_owner_or_cap(inode))
+		goto out;
+
+	error = 0;
 	flags = ip->i_diskflags;
 	new_flags = (flags & ~mask) | (reqflags & mask);
 	if ((new_flags ^ flags) == 0)
@@ -275,8 +280,10 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
 {
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	u32 fsflags, gfsflags;
+
 	if (get_user(fsflags, ptr))
 		return -EFAULT;
+
 	gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags);
 	if (!S_ISDIR(inode->i_mode)) {
 		if (gfsflags & GFS2_DIF_INHERIT_JDATA)
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 454d4b4eb36b..ddcdbf493536 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -855,6 +855,9 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *
 	gh->gh_flags = flags;
 	gh->gh_iflags = 0;
 	gh->gh_ip = (unsigned long)__builtin_return_address(0);
+	if (gh->gh_owner_pid)
+		put_pid(gh->gh_owner_pid);
+	gh->gh_owner_pid = get_pid(task_pid(current));
 }
 
 /**
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 3aac46f6853e..b5d7363b22da 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -439,9 +439,6 @@ struct gfs2_args {
 struct gfs2_tune {
 	spinlock_t gt_spin;
 
-	unsigned int gt_incore_log_blocks;
-	unsigned int gt_log_flush_secs;
-
 	unsigned int gt_logd_secs;
 
 	unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
@@ -462,6 +459,7 @@ enum {
 	SDF_SHUTDOWN		= 2,
 	SDF_NOBARRIERS		= 3,
 	SDF_NORECOVERY		= 4,
+	SDF_DEMOTE		= 5,
 };
 
 #define GFS2_FSNAME_LEN		256
@@ -618,6 +616,7 @@ struct gfs2_sbd {
 	unsigned int sd_log_commited_databuf;
 	int sd_log_commited_revoke;
 
+	atomic_t sd_log_pinned;
 	unsigned int sd_log_num_buf;
 	unsigned int sd_log_num_revoke;
 	unsigned int sd_log_num_rg;
@@ -629,15 +628,17 @@ struct gfs2_sbd {
 	struct list_head sd_log_le_databuf;
 	struct list_head sd_log_le_ordered;
 
+	atomic_t sd_log_thresh1;
+	atomic_t sd_log_thresh2;
 	atomic_t sd_log_blks_free;
-	struct mutex sd_log_reserve_mutex;
+	wait_queue_head_t sd_log_waitq;
+	wait_queue_head_t sd_logd_waitq;
 
 	u64 sd_log_sequence;
 	unsigned int sd_log_head;
 	unsigned int sd_log_tail;
 	int sd_log_idle;
 
-	unsigned long sd_log_flush_time;
 	struct rw_semaphore sd_log_flush_lock;
 	atomic_t sd_log_in_flight;
 	wait_queue_head_t sd_log_flush_wait;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index b1bf2694fb2b..b5612cbb62a5 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -158,7 +158,6 @@ void gfs2_set_iop(struct inode *inode)
  * @sb: The super block
  * @no_addr: The inode number
  * @type: The type of the inode
- * @skip_freeing: set this not return an inode if it is currently being freed.
  *
  * Returns: A VFS inode, or an error
  */
@@ -166,17 +165,14 @@ void gfs2_set_iop(struct inode *inode)
 struct inode *gfs2_inode_lookup(struct super_block *sb,
 				unsigned int type,
 				u64 no_addr,
-				u64 no_formal_ino, int skip_freeing)
+				u64 no_formal_ino)
 {
 	struct inode *inode;
 	struct gfs2_inode *ip;
 	struct gfs2_glock *io_gl;
 	int error;
 
-	if (skip_freeing)
-		inode = gfs2_iget_skip(sb, no_addr);
-	else
-		inode = gfs2_iget(sb, no_addr);
+	inode = gfs2_iget(sb, no_addr);
 	ip = GFS2_I(inode);
 
 	if (!inode)
@@ -234,11 +230,102 @@ fail_glock:
 fail_iopen:
 	gfs2_glock_put(io_gl);
 fail_put:
+	if (inode->i_state & I_NEW)
+		ip->i_gl->gl_object = NULL;
+	gfs2_glock_put(ip->i_gl);
+fail:
+	if (inode->i_state & I_NEW)
+		iget_failed(inode);
+	else
+		iput(inode);
+	return ERR_PTR(error);
+}
+
+/**
+ * gfs2_process_unlinked_inode - Lookup an unlinked inode for reclamation
+ *                               and try to reclaim it by doing iput.
+ *
+ * This function assumes no rgrp locks are currently held.
+ *
+ * @sb: The super block
+ * no_addr: The inode number
+ *
+ */
+
+void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
+{
+	struct gfs2_sbd *sdp;
+	struct gfs2_inode *ip;
+	struct gfs2_glock *io_gl;
+	int error;
+	struct gfs2_holder gh;
+	struct inode *inode;
+
+	inode = gfs2_iget_skip(sb, no_addr);
+
+	if (!inode)
+		return;
+
+	/* If it's not a new inode, someone's using it, so leave it alone. */
+	if (!(inode->i_state & I_NEW)) {
+		iput(inode);
+		return;
+	}
+
+	ip = GFS2_I(inode);
+	sdp = GFS2_SB(inode);
+	ip->i_no_formal_ino = -1;
+
+	error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
+	if (unlikely(error))
+		goto fail;
+	ip->i_gl->gl_object = ip;
+
+	error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
+	if (unlikely(error))
+		goto fail_put;
+
+	set_bit(GIF_INVALID, &ip->i_flags);
+	error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT,
+				   &ip->i_iopen_gh);
+	if (unlikely(error))
+		goto fail_iopen;
+
+	ip->i_iopen_gh.gh_gl->gl_object = ip;
+	gfs2_glock_put(io_gl);
+
+	inode->i_mode = DT2IF(DT_UNKNOWN);
+
+	/*
+	 * We must read the inode in order to work out its type in
+	 * this case. Note that this doesn't happen often as we normally
+	 * know the type beforehand. This code path only occurs during
+	 * unlinked inode recovery (where it is safe to do this glock,
+	 * which is not true in the general case).
+	 */
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY,
+				   &gh);
+	if (unlikely(error))
+		goto fail_glock;
+
+	/* Inode is now uptodate */
+	gfs2_glock_dq_uninit(&gh);
+	gfs2_set_iop(inode);
+
+	/* The iput will cause it to be deleted. */
+	iput(inode);
+	return;
+
+fail_glock:
+	gfs2_glock_dq(&ip->i_iopen_gh);
+fail_iopen:
+	gfs2_glock_put(io_gl);
+fail_put:
 	ip->i_gl->gl_object = NULL;
 	gfs2_glock_put(ip->i_gl);
 fail:
 	iget_failed(inode);
-	return ERR_PTR(error);
+	return;
 }
 
 static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
@@ -862,7 +949,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
 		goto fail_gunlock2;
 
 	inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr,
-				  inum.no_formal_ino, 0);
+				  inum.no_formal_ino);
 	if (IS_ERR(inode))
 		goto fail_gunlock2;
 
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index c341aaf67adb..300ada3f21de 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -83,8 +83,8 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
 
 extern void gfs2_set_iop(struct inode *inode);
 extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
-				       u64 no_addr, u64 no_formal_ino,
-				       int skip_freeing);
+				       u64 no_addr, u64 no_formal_ino);
+extern void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr);
 extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
 
 extern int gfs2_inode_refresh(struct gfs2_inode *ip);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index e5bf4b59d46e..6a857e24f947 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -168,12 +168,11 @@ static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int fl
 	return list_empty(&ai->ai_ail1_list);
 }
 
-static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
+static void gfs2_ail1_start(struct gfs2_sbd *sdp)
 {
 	struct list_head *head;
 	u64 sync_gen;
-	struct list_head *first;
-	struct gfs2_ail *first_ai, *ai, *tmp;
+	struct gfs2_ail *ai;
 	int done = 0;
 
 	gfs2_log_lock(sdp);
@@ -184,21 +183,9 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
 	}
 	sync_gen = sdp->sd_ail_sync_gen++;
 
-	first = head->prev;
-	first_ai = list_entry(first, struct gfs2_ail, ai_list);
-	first_ai->ai_sync_gen = sync_gen;
-	gfs2_ail1_start_one(sdp, first_ai); /* This may drop log lock */
-
-	if (flags & DIO_ALL)
-		first = NULL;
-
 	while(!done) {
-		if (first && (head->prev != first ||
-		    gfs2_ail1_empty_one(sdp, first_ai, 0)))
-			break;
-
 		done = 1;
-		list_for_each_entry_safe_reverse(ai, tmp, head, ai_list) {
+		list_for_each_entry_reverse(ai, head, ai_list) {
 			if (ai->ai_sync_gen >= sync_gen)
 				continue;
 			ai->ai_sync_gen = sync_gen;
@@ -290,58 +277,57 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
  * flush time, so we ensure that we have just enough free blocks at all
  * times to avoid running out during a log flush.
  *
+ * We no longer flush the log here, instead we wake up logd to do that
+ * for us. To avoid the thundering herd and to ensure that we deal fairly
+ * with queued waiters, we use an exclusive wait. This means that when we
+ * get woken with enough journal space to get our reservation, we need to
+ * wake the next waiter on the list.
+ *
  * Returns: errno
  */
 
 int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
 {
-	unsigned int try = 0;
 	unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize);
+	unsigned wanted = blks + reserved_blks;
+	DEFINE_WAIT(wait);
+	int did_wait = 0;
+	unsigned int free_blocks;
 
 	if (gfs2_assert_warn(sdp, blks) ||
 	    gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks))
 		return -EINVAL;
-
-	mutex_lock(&sdp->sd_log_reserve_mutex);
-	gfs2_log_lock(sdp);
-	while(atomic_read(&sdp->sd_log_blks_free) <= (blks + reserved_blks)) {
-		gfs2_log_unlock(sdp);
-		gfs2_ail1_empty(sdp, 0);
-		gfs2_log_flush(sdp, NULL);
-
-		if (try++)
-			gfs2_ail1_start(sdp, 0);
-		gfs2_log_lock(sdp);
+retry:
+	free_blocks = atomic_read(&sdp->sd_log_blks_free);
+	if (unlikely(free_blocks <= wanted)) {
+		do {
+			prepare_to_wait_exclusive(&sdp->sd_log_waitq, &wait,
+					TASK_UNINTERRUPTIBLE);
+			wake_up(&sdp->sd_logd_waitq);
+			did_wait = 1;
+			if (atomic_read(&sdp->sd_log_blks_free) <= wanted)
+				io_schedule();
+			free_blocks = atomic_read(&sdp->sd_log_blks_free);
+		} while(free_blocks <= wanted);
+		finish_wait(&sdp->sd_log_waitq, &wait);
 	}
-	atomic_sub(blks, &sdp->sd_log_blks_free);
+	if (atomic_cmpxchg(&sdp->sd_log_blks_free, free_blocks,
+				free_blocks - blks) != free_blocks)
+		goto retry;
 	trace_gfs2_log_blocks(sdp, -blks);
-	gfs2_log_unlock(sdp);
-	mutex_unlock(&sdp->sd_log_reserve_mutex);
+
+	/*
+	 * If we waited, then so might others, wake them up _after_ we get
+	 * our share of the log.
+	 */
+	if (unlikely(did_wait))
+		wake_up(&sdp->sd_log_waitq);
 
 	down_read(&sdp->sd_log_flush_lock);
 
 	return 0;
 }
 
-/**
- * gfs2_log_release - Release a given number of log blocks
- * @sdp: The GFS2 superblock
- * @blks: The number of blocks
- *
- */
-
-void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
-{
-
-	gfs2_log_lock(sdp);
-	atomic_add(blks, &sdp->sd_log_blks_free);
-	trace_gfs2_log_blocks(sdp, blks);
-	gfs2_assert_withdraw(sdp,
-			     atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
-	gfs2_log_unlock(sdp);
-	up_read(&sdp->sd_log_flush_lock);
-}
-
 static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
 {
 	struct gfs2_journal_extent *je;
@@ -559,11 +545,10 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail)
 
 	ail2_empty(sdp, new_tail);
 
-	gfs2_log_lock(sdp);
 	atomic_add(dist, &sdp->sd_log_blks_free);
 	trace_gfs2_log_blocks(sdp, dist);
-	gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
-	gfs2_log_unlock(sdp);
+	gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
+			     sdp->sd_jdesc->jd_blocks);
 
 	sdp->sd_log_tail = new_tail;
 }
@@ -615,6 +600,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
 	if (buffer_eopnotsupp(bh)) {
 		clear_buffer_eopnotsupp(bh);
 		set_buffer_uptodate(bh);
+		fs_info(sdp, "barrier sync failed - disabling barriers\n");
 		set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
 		lock_buffer(bh);
 skip_barrier:
@@ -710,7 +696,7 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
  *
  */
 
-void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
+void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
 {
 	struct gfs2_ail *ai;
 
@@ -822,6 +808,13 @@ static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
  * @sdp: the filesystem
  * @tr: the transaction
  *
+ * We wake up gfs2_logd if the number of pinned blocks exceed thresh1
+ * or the total number of used blocks (pinned blocks plus AIL blocks)
+ * is greater than thresh2.
+ *
+ * At mount time thresh1 is 1/3rd of journal size, thresh2 is 2/3rd of
+ * journal size.
+ *
  * Returns: errno
  */
 
@@ -832,10 +825,10 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 
 	up_read(&sdp->sd_log_flush_lock);
 
-	gfs2_log_lock(sdp);
-	if (sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks))
-		wake_up_process(sdp->sd_logd_process);
-	gfs2_log_unlock(sdp);
+	if (atomic_read(&sdp->sd_log_pinned) > atomic_read(&sdp->sd_log_thresh1) ||
+	    ((sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free)) >
+	    atomic_read(&sdp->sd_log_thresh2)))
+		wake_up(&sdp->sd_logd_waitq);
 }
 
 /**
@@ -882,13 +875,23 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
 {
 	gfs2_log_flush(sdp, NULL);
 	for (;;) {
-		gfs2_ail1_start(sdp, DIO_ALL);
+		gfs2_ail1_start(sdp);
 		if (gfs2_ail1_empty(sdp, DIO_ALL))
 			break;
 		msleep(10);
 	}
 }
 
+static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp)
+{
+	return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1));
+}
+
+static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp)
+{
+	unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free);
+	return used_blocks >= atomic_read(&sdp->sd_log_thresh2);
+}
 
 /**
  * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
@@ -901,28 +904,43 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
 int gfs2_logd(void *data)
 {
 	struct gfs2_sbd *sdp = data;
-	unsigned long t;
-	int need_flush;
+	unsigned long t = 1;
+	DEFINE_WAIT(wait);
+	unsigned preflush;
 
 	while (!kthread_should_stop()) {
-		/* Advance the log tail */
 
-		t = sdp->sd_log_flush_time +
-		    gfs2_tune_get(sdp, gt_log_flush_secs) * HZ;
+		preflush = atomic_read(&sdp->sd_log_pinned);
+		if (gfs2_jrnl_flush_reqd(sdp) || t == 0) {
+			gfs2_ail1_empty(sdp, DIO_ALL);
+			gfs2_log_flush(sdp, NULL);
+			gfs2_ail1_empty(sdp, DIO_ALL);
+		}
 
-		gfs2_ail1_empty(sdp, DIO_ALL);
-		gfs2_log_lock(sdp);
-		need_flush = sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks);
-		gfs2_log_unlock(sdp);
-		if (need_flush || time_after_eq(jiffies, t)) {
+		if (gfs2_ail_flush_reqd(sdp)) {
+			gfs2_ail1_start(sdp);
+			io_schedule();
+			gfs2_ail1_empty(sdp, 0);
 			gfs2_log_flush(sdp, NULL);
-			sdp->sd_log_flush_time = jiffies;
+			gfs2_ail1_empty(sdp, DIO_ALL);
 		}
 
+		wake_up(&sdp->sd_log_waitq);
 		t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
 		if (freezing(current))
 			refrigerator();
-		schedule_timeout_interruptible(t);
+
+		do {
+			prepare_to_wait(&sdp->sd_logd_waitq, &wait,
+					TASK_UNINTERRUPTIBLE);
+			if (!gfs2_ail_flush_reqd(sdp) &&
+			    !gfs2_jrnl_flush_reqd(sdp) &&
+			    !kthread_should_stop())
+				t = schedule_timeout(t);
+		} while(t && !gfs2_ail_flush_reqd(sdp) &&
+			!gfs2_jrnl_flush_reqd(sdp) &&
+			!kthread_should_stop());
+		finish_wait(&sdp->sd_logd_waitq, &wait);
 	}
 
 	return 0;
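
The reservation fast path is now lock-free. The shape of the algorithm, reduced to its essentials (names illustrative, not the literal patch):

	static void reserve(atomic_t *free, unsigned int want,
			    wait_queue_head_t *waitq, wait_queue_head_t *logd)
	{
		DEFINE_WAIT(wait);
		unsigned int cur;
	retry:
		cur = atomic_read(free);
		if (cur <= want) {
			do {	/* exclusive wait: wake one reserver at a time */
				prepare_to_wait_exclusive(waitq, &wait,
							  TASK_UNINTERRUPTIBLE);
				wake_up(logd);	/* ask logd to make space */
				if (atomic_read(free) <= want)
					io_schedule();
				cur = atomic_read(free);
			} while (cur <= want);
			finish_wait(waitq, &wait);
		}
		/* claim the blocks; a racing reserver forces a retry */
		if (atomic_cmpxchg(free, cur, cur - want) != cur)
			goto retry;
	}

The cmpxchg retry loop replaces sd_log_reserve_mutex, and the exclusive wait plus the post-claim wake_up() hands the queue on to the next waiter fairly.
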
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 7c64510ccfd2..0d007f920234 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -47,29 +47,21 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
 	sdp->sd_log_head = sdp->sd_log_tail = value;
 }
 
-unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
-			    unsigned int ssize);
+extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
+				    unsigned int ssize);
 
-int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
-void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
-void gfs2_log_incr_head(struct gfs2_sbd *sdp);
+extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
+extern void gfs2_log_incr_head(struct gfs2_sbd *sdp);
 
-struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
-struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
-				      struct buffer_head *real);
-void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
-
-static inline void gfs2_log_flush(struct gfs2_sbd *sbd, struct gfs2_glock *gl)
-{
-	if (!gl || test_bit(GLF_LFLUSH, &gl->gl_flags))
-		__gfs2_log_flush(sbd, gl);
-}
-
-void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
-void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
-
-void gfs2_log_shutdown(struct gfs2_sbd *sdp);
-void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
-int gfs2_logd(void *data);
+extern struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
+extern struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
+					     struct buffer_head *real);
+extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
+extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
+extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
+
+extern void gfs2_log_shutdown(struct gfs2_sbd *sdp);
+extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
+extern int gfs2_logd(void *data);
 
 #endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index adc260fbea90..bf33f822058d 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -54,6 +54,7 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
 	if (bd->bd_ail)
 		list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
 	get_bh(bh);
+	atomic_inc(&sdp->sd_log_pinned);
 	trace_gfs2_pin(bd, 1);
 }
 
@@ -94,6 +95,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
 	trace_gfs2_pin(bd, 0);
 	gfs2_log_unlock(sdp);
 	unlock_buffer(bh);
+	atomic_dec(&sdp->sd_log_pinned);
 }
 
 
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index a88fadc704bb..fb2a5f93b7c3 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -94,7 +94,7 @@ static int __init init_gfs2_fs(void)
 	if (!gfs2_glock_cachep)
 		goto fail;
 
-	gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock (aspace)",
+	gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock(aspace)",
 					sizeof(struct gfs2_glock) +
 					sizeof(struct address_space),
 					0, 0, gfs2_init_gl_aspace_once);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 0bb12c80937a..18176d0b75d7 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -34,7 +34,6 @@
 
 static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc)
 {
-	int err;
 	struct buffer_head *bh, *head;
 	int nr_underway = 0;
 	int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ?
@@ -86,11 +85,10 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
 	} while (bh != head);
 	unlock_page(page);
 
-	err = 0;
 	if (nr_underway == 0)
 		end_page_writeback(page);
 
-	return err;
+	return 0;
 }
 
 const struct address_space_operations gfs2_meta_aops = {
@@ -313,6 +311,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
 	struct gfs2_bufdata *bd = bh->b_private;
 
 	if (test_clear_buffer_pinned(bh)) {
+		atomic_dec(&sdp->sd_log_pinned);
 		list_del_init(&bd->bd_le.le_list);
 		if (meta) {
 			gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index c1309ed1c496..3593b3a7290e 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -57,8 +57,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
 {
 	spin_lock_init(&gt->gt_spin);
 
-	gt->gt_incore_log_blocks = 1024;
-	gt->gt_logd_secs = 1;
 	gt->gt_quota_simul_sync = 64;
 	gt->gt_quota_warn_period = 10;
 	gt->gt_quota_scale_num = 1;
@@ -101,14 +99,15 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 	spin_lock_init(&sdp->sd_trunc_lock);
 
 	spin_lock_init(&sdp->sd_log_lock);
-
+	atomic_set(&sdp->sd_log_pinned, 0);
 	INIT_LIST_HEAD(&sdp->sd_log_le_buf);
 	INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
 	INIT_LIST_HEAD(&sdp->sd_log_le_rg);
 	INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
 	INIT_LIST_HEAD(&sdp->sd_log_le_ordered);
 
-	mutex_init(&sdp->sd_log_reserve_mutex);
+	init_waitqueue_head(&sdp->sd_log_waitq);
+	init_waitqueue_head(&sdp->sd_logd_waitq);
 	INIT_LIST_HEAD(&sdp->sd_ail1_list);
 	INIT_LIST_HEAD(&sdp->sd_ail2_list);
 
@@ -487,7 +486,7 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
 	struct dentry *dentry;
 	struct inode *inode;
 
-	inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0);
+	inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0);
 	if (IS_ERR(inode)) {
 		fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode));
 		return PTR_ERR(inode);
@@ -733,6 +732,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 	if (sdp->sd_args.ar_spectator) {
 		sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0);
 		atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
+		atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5);
+		atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5);
 	} else {
 		if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) {
 			fs_err(sdp, "can't mount journal #%u\n",
@@ -770,6 +771,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 			goto fail_jinode_gh;
 		}
 		atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
+		atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5);
+		atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5);
 
 		/* Map the extents for this journal's blocks */
 		map_journal_extents(sdp);
@@ -951,8 +954,6 @@ static int init_threads(struct gfs2_sbd *sdp, int undo)
 	if (undo)
 		goto fail_quotad;
 
-	sdp->sd_log_flush_time = jiffies;
-
 	p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
 	error = IS_ERR(p);
 	if (error) {
@@ -1160,7 +1161,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
 					       GFS2_BASIC_BLOCK_SHIFT;
 	sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
 
-	sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit;
+	sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit;
 	sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum;
 	if (sdp->sd_args.ar_statfs_quantum) {
 		sdp->sd_tune.gt_statfs_slow = 0;
@@ -1323,7 +1324,7 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
 	memset(&args, 0, sizeof(args));
 	args.ar_quota = GFS2_QUOTA_DEFAULT;
 	args.ar_data = GFS2_DATA_DEFAULT;
-	args.ar_commit = 60;
+	args.ar_commit = 30;
 	args.ar_statfs_quantum = 30;
 	args.ar_quota_quantum = 60;
 	args.ar_errors = GFS2_ERRORS_DEFAULT;
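
For scale, a worked example with a hypothetical 32768-block journal: the init_journal() lines above give thresh1 = 2*32768/5 = 13107 blocks and thresh2 = 4*32768/5 = 26214 blocks, i.e. 2/5 and 4/5 of the journal (the comment added in log.c approximates these as 1/3 and 2/3). These are the trigger points that gfs2_log_commit() and gfs2_logd test against.
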
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 6dbcbad6ab17..49667d68769e 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -637,15 +637,40 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
 	unsigned blocksize, iblock, pos;
 	struct buffer_head *bh, *dibh;
 	struct page *page;
-	void *kaddr;
-	struct gfs2_quota *qp;
-	s64 value;
-	int err = -EIO;
+	void *kaddr, *ptr;
+	struct gfs2_quota q, *qp;
+	int err, nbytes;
 	u64 size;
 
 	if (gfs2_is_stuffed(ip))
 		gfs2_unstuff_dinode(ip, NULL);
 
+	memset(&q, 0, sizeof(struct gfs2_quota));
+	err = gfs2_internal_read(ip, NULL, (char *)&q, &loc, sizeof(q));
+	if (err < 0)
+		return err;
+
+	err = -EIO;
+	qp = &q;
+	qp->qu_value = be64_to_cpu(qp->qu_value);
+	qp->qu_value += change;
+	qp->qu_value = cpu_to_be64(qp->qu_value);
+	qd->qd_qb.qb_value = qp->qu_value;
+	if (fdq) {
+		if (fdq->d_fieldmask & FS_DQ_BSOFT) {
+			qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit);
+			qd->qd_qb.qb_warn = qp->qu_warn;
+		}
+		if (fdq->d_fieldmask & FS_DQ_BHARD) {
+			qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit);
+			qd->qd_qb.qb_limit = qp->qu_limit;
+		}
+	}
+
+	/* Write the quota into the quota file on disk */
+	ptr = qp;
+	nbytes = sizeof(struct gfs2_quota);
+get_a_page:
 	page = grab_cache_page(mapping, index);
 	if (!page)
 		return -ENOMEM;
@@ -667,7 +692,12 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
 	if (!buffer_mapped(bh)) {
 		gfs2_block_map(inode, iblock, bh, 1);
 		if (!buffer_mapped(bh))
-			goto unlock;
+			goto unlock_out;
+		/* If it's a newly allocated disk block for quota, zero it */
+		if (buffer_new(bh)) {
+			memset(bh->b_data, 0, bh->b_size);
+			set_buffer_uptodate(bh);
+		}
 	}
 
 	if (PageUptodate(page))
@@ -677,32 +707,34 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
677 ll_rw_block(READ_META, 1, &bh); 707 ll_rw_block(READ_META, 1, &bh);
678 wait_on_buffer(bh); 708 wait_on_buffer(bh);
679 if (!buffer_uptodate(bh)) 709 if (!buffer_uptodate(bh))
680 goto unlock; 710 goto unlock_out;
681 } 711 }
682 712
683 gfs2_trans_add_bh(ip->i_gl, bh, 0); 713 gfs2_trans_add_bh(ip->i_gl, bh, 0);
684 714
685 kaddr = kmap_atomic(page, KM_USER0); 715 kaddr = kmap_atomic(page, KM_USER0);
686 qp = kaddr + offset; 716 if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE)
687 value = (s64)be64_to_cpu(qp->qu_value) + change; 717 nbytes = PAGE_CACHE_SIZE - offset;
688 qp->qu_value = cpu_to_be64(value); 718 memcpy(kaddr + offset, ptr, nbytes);
689 qd->qd_qb.qb_value = qp->qu_value;
690 if (fdq) {
691 if (fdq->d_fieldmask & FS_DQ_BSOFT) {
692 qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit);
693 qd->qd_qb.qb_warn = qp->qu_warn;
694 }
695 if (fdq->d_fieldmask & FS_DQ_BHARD) {
696 qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit);
697 qd->qd_qb.qb_limit = qp->qu_limit;
698 }
699 }
700 flush_dcache_page(page); 719 flush_dcache_page(page);
701 kunmap_atomic(kaddr, KM_USER0); 720 kunmap_atomic(kaddr, KM_USER0);
721 unlock_page(page);
722 page_cache_release(page);
702 723
724 /* If quota straddles page boundary, we need to update the rest of the
725 * quota at the beginning of the next page */
726 if (offset != 0) { /* first page, offset is closer to PAGE_CACHE_SIZE */
727 ptr = ptr + nbytes;
728 nbytes = sizeof(struct gfs2_quota) - nbytes;
729 offset = 0;
730 index++;
731 goto get_a_page;
732 }
733
734 /* Update the disk inode timestamp and size (if extended) */
703 err = gfs2_meta_inode_buffer(ip, &dibh); 735 err = gfs2_meta_inode_buffer(ip, &dibh);
704 if (err) 736 if (err)
705 goto unlock; 737 goto out;
706 738
707 size = loc + sizeof(struct gfs2_quota); 739 size = loc + sizeof(struct gfs2_quota);
708 if (size > inode->i_size) { 740 if (size > inode->i_size) {
@@ -715,7 +747,9 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
715 brelse(dibh); 747 brelse(dibh);
716 mark_inode_dirty(inode); 748 mark_inode_dirty(inode);
717 749
718unlock: 750out:
751 return err;
752unlock_out:
719 unlock_page(page); 753 unlock_page(page);
720 page_cache_release(page); 754 page_cache_release(page);
721 return err; 755 return err;
@@ -779,8 +813,10 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
779 * rgrp since it won't be allocated during the transaction 813 * rgrp since it won't be allocated during the transaction
780 */ 814 */
781 al->al_requested = 1; 815 al->al_requested = 1;
782 /* +1 in the end for block requested above for unstuffing */ 816 /* +3 at the end: one for the unstuffing block, one for the inode
783 blocks = num_qd * data_blocks + RES_DINODE + num_qd + 1; 817 * size update block, and one more in case the quota straddles a
818 * page boundary and two blocks need updating instead of one */
819 blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3;
784 820
785 if (nalloc) 821 if (nalloc)
786 al->al_requested += nalloc * (data_blocks + ind_blocks); 822 al->al_requested += nalloc * (data_blocks + ind_blocks);
@@ -1418,10 +1454,18 @@ static int gfs2_quota_get_xstate(struct super_block *sb,
1418 1454
1419 memset(fqs, 0, sizeof(struct fs_quota_stat)); 1455 memset(fqs, 0, sizeof(struct fs_quota_stat));
1420 fqs->qs_version = FS_QSTAT_VERSION; 1456 fqs->qs_version = FS_QSTAT_VERSION;
1421 if (sdp->sd_args.ar_quota == GFS2_QUOTA_ON) 1457
1422 fqs->qs_flags = (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD); 1458 switch (sdp->sd_args.ar_quota) {
1423 else if (sdp->sd_args.ar_quota == GFS2_QUOTA_ACCOUNT) 1459 case GFS2_QUOTA_ON:
1424 fqs->qs_flags = (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT); 1460 fqs->qs_flags |= (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD);
1461 /*FALLTHRU*/
1462 case GFS2_QUOTA_ACCOUNT:
1463 fqs->qs_flags |= (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT);
1464 break;
1465 case GFS2_QUOTA_OFF:
1466 break;
1467 }
1468
1425 if (sdp->sd_quota_inode) { 1469 if (sdp->sd_quota_inode) {
1426 fqs->qs_uquota.qfs_ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr; 1470 fqs->qs_uquota.qfs_ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr;
1427 fqs->qs_uquota.qfs_nblks = sdp->sd_quota_inode->i_blocks; 1471 fqs->qs_uquota.qfs_nblks = sdp->sd_quota_inode->i_blocks;
@@ -1432,8 +1476,8 @@ static int gfs2_quota_get_xstate(struct super_block *sb,
1432 return 0; 1476 return 0;
1433} 1477}
1434 1478
1435static int gfs2_xquota_get(struct super_block *sb, int type, qid_t id, 1479static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id,
1436 struct fs_disk_quota *fdq) 1480 struct fs_disk_quota *fdq)
1437{ 1481{
1438 struct gfs2_sbd *sdp = sb->s_fs_info; 1482 struct gfs2_sbd *sdp = sb->s_fs_info;
1439 struct gfs2_quota_lvb *qlvb; 1483 struct gfs2_quota_lvb *qlvb;
@@ -1477,8 +1521,8 @@ out:
1477/* GFS2 only supports a subset of the XFS fields */ 1521/* GFS2 only supports a subset of the XFS fields */
1478#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD) 1522#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD)
1479 1523
1480static int gfs2_xquota_set(struct super_block *sb, int type, qid_t id, 1524static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1481 struct fs_disk_quota *fdq) 1525 struct fs_disk_quota *fdq)
1482{ 1526{
1483 struct gfs2_sbd *sdp = sb->s_fs_info; 1527 struct gfs2_sbd *sdp = sb->s_fs_info;
1484 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); 1528 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
@@ -1585,7 +1629,7 @@ out_put:
1585const struct quotactl_ops gfs2_quotactl_ops = { 1629const struct quotactl_ops gfs2_quotactl_ops = {
1586 .quota_sync = gfs2_quota_sync, 1630 .quota_sync = gfs2_quota_sync,
1587 .get_xstate = gfs2_quota_get_xstate, 1631 .get_xstate = gfs2_quota_get_xstate,
1588 .get_xquota = gfs2_xquota_get, 1632 .get_dqblk = gfs2_get_dqblk,
1589 .set_xquota = gfs2_xquota_set, 1633 .set_dqblk = gfs2_set_dqblk,
1590}; 1634};
1591 1635
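
A note on the gfs2_adjust_quota() rewrite above: rather than patching a
single field through kmap_atomic(), the function now reads the whole
on-disk record into a stack copy via gfs2_internal_read(), applies the
value change and any FS_DQ_BSOFT/FS_DQ_BHARD limits there, and writes
the record back through the page cache. Since a struct gfs2_quota may
begin close enough to the end of a page that it does not fit, the
write-back is split into at most two chunks by the get_a_page loop. A
minimal sketch of that split, with write_chunk() as a hypothetical
stand-in for the patch's grab_cache_page()/kmap_atomic()/memcpy()/
kunmap_atomic() sequence:

/* Sketch only; needs <linux/pagemap.h> for PAGE_CACHE_*.  write_chunk()
 * is a hypothetical helper, not a kernel API. */
static int write_quota_record(struct address_space *mapping, loff_t loc,
			      const struct gfs2_quota *q)
{
	const char *ptr = (const char *)q;
	unsigned long index = loc >> PAGE_CACHE_SHIFT;
	unsigned int offset = loc & (PAGE_CACHE_SIZE - 1);
	unsigned int nbytes = sizeof(struct gfs2_quota);

	while (nbytes) {
		unsigned int chunk = nbytes;

		if (offset + chunk > PAGE_CACHE_SIZE)
			chunk = PAGE_CACHE_SIZE - offset;
		if (write_chunk(mapping, index, offset, ptr, chunk))
			return -EIO;
		ptr += chunk;
		nbytes -= chunk;
		offset = 0;	/* any second chunk starts the next page */
		index++;
	}
	return 0;
}
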
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 503b842f3ba2..171a744f8e45 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -854,7 +854,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
854 if ((start + nr_sects) != blk) { 854 if ((start + nr_sects) != blk) {
855 rv = blkdev_issue_discard(bdev, start, 855 rv = blkdev_issue_discard(bdev, start,
856 nr_sects, GFP_NOFS, 856 nr_sects, GFP_NOFS,
857 DISCARD_FL_BARRIER); 857 BLKDEV_IFL_WAIT |
858 BLKDEV_IFL_BARRIER);
858 if (rv) 859 if (rv)
859 goto fail; 860 goto fail;
860 nr_sects = 0; 861 nr_sects = 0;
@@ -869,7 +870,7 @@ start_new_extent:
869 } 870 }
870 if (nr_sects) { 871 if (nr_sects) {
871 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 872 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
872 DISCARD_FL_BARRIER); 873 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
873 if (rv) 874 if (rv)
874 goto fail; 875 goto fail;
875 } 876 }
@@ -948,13 +949,13 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
948 * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes 949 * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
949 * @rgd: The rgrp 950 * @rgd: The rgrp
950 * 951 *
951 * Returns: The inode, if one has been found 952 * Returns: The block address of an unlinked
953 * inode to reclaim, or 0 if none was found.
952 */ 954 */
953 955
954static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, 956static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
955 u64 skip) 957 u64 skip)
956{ 958{
957 struct inode *inode;
958 u32 goal = 0, block; 959 u32 goal = 0, block;
959 u64 no_addr; 960 u64 no_addr;
960 struct gfs2_sbd *sdp = rgd->rd_sbd; 961 struct gfs2_sbd *sdp = rgd->rd_sbd;
@@ -979,14 +980,11 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
979 if (no_addr == skip) 980 if (no_addr == skip)
980 continue; 981 continue;
981 *last_unlinked = no_addr; 982 *last_unlinked = no_addr;
982 inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN, 983 return no_addr;
983 no_addr, -1, 1);
984 if (!IS_ERR(inode))
985 return inode;
986 } 984 }
987 985
988 rgd->rd_flags &= ~GFS2_RDF_CHECK; 986 rgd->rd_flags &= ~GFS2_RDF_CHECK;
989 return NULL; 987 return 0;
990} 988}
991 989
992/** 990/**
@@ -1067,11 +1065,12 @@ static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
1067 * Try to acquire rgrp in way which avoids contending with others. 1065 * Try to acquire rgrp in way which avoids contending with others.
1068 * 1066 *
1069 * Returns: errno 1067 * Returns: errno
1068 * unlinked: the block address of an unlinked inode to be reclaimed
1070 */ 1069 */
1071 1070
1072static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) 1071static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
1072 u64 *last_unlinked)
1073{ 1073{
1074 struct inode *inode = NULL;
1075 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1074 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1076 struct gfs2_rgrpd *rgd, *begin = NULL; 1075 struct gfs2_rgrpd *rgd, *begin = NULL;
1077 struct gfs2_alloc *al = ip->i_alloc; 1076 struct gfs2_alloc *al = ip->i_alloc;
@@ -1080,6 +1079,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1080 int loops = 0; 1079 int loops = 0;
1081 int error, rg_locked; 1080 int error, rg_locked;
1082 1081
1082 *unlinked = 0;
1083 rgd = gfs2_blk2rgrpd(sdp, ip->i_goal); 1083 rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
1084 1084
1085 while (rgd) { 1085 while (rgd) {
@@ -1096,19 +1096,24 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1096 case 0: 1096 case 0:
1097 if (try_rgrp_fit(rgd, al)) 1097 if (try_rgrp_fit(rgd, al))
1098 goto out; 1098 goto out;
1099 if (rgd->rd_flags & GFS2_RDF_CHECK) 1099 /* If the rg came in already locked, there's no
1100 inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); 1100 way we can recover from a failed try_rgrp_unlink
1101 because that would require an iput which can only
1102 happen after the rgrp is unlocked. */
1103 if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
1104 *unlinked = try_rgrp_unlink(rgd, last_unlinked,
1105 ip->i_no_addr);
1101 if (!rg_locked) 1106 if (!rg_locked)
1102 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1107 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1103 if (inode) 1108 if (*unlinked)
1104 return inode; 1109 return -EAGAIN;
1105 /* fall through */ 1110 /* fall through */
1106 case GLR_TRYFAILED: 1111 case GLR_TRYFAILED:
1107 rgd = recent_rgrp_next(rgd); 1112 rgd = recent_rgrp_next(rgd);
1108 break; 1113 break;
1109 1114
1110 default: 1115 default:
1111 return ERR_PTR(error); 1116 return error;
1112 } 1117 }
1113 } 1118 }
1114 1119
@@ -1130,12 +1135,13 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1130 case 0: 1135 case 0:
1131 if (try_rgrp_fit(rgd, al)) 1136 if (try_rgrp_fit(rgd, al))
1132 goto out; 1137 goto out;
1133 if (rgd->rd_flags & GFS2_RDF_CHECK) 1138 if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
1134 inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); 1139 *unlinked = try_rgrp_unlink(rgd, last_unlinked,
1140 ip->i_no_addr);
1135 if (!rg_locked) 1141 if (!rg_locked)
1136 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1142 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1137 if (inode) 1143 if (*unlinked)
1138 return inode; 1144 return -EAGAIN;
1139 break; 1145 break;
1140 1146
1141 case GLR_TRYFAILED: 1147 case GLR_TRYFAILED:
@@ -1143,7 +1149,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1143 break; 1149 break;
1144 1150
1145 default: 1151 default:
1146 return ERR_PTR(error); 1152 return error;
1147 } 1153 }
1148 1154
1149 rgd = gfs2_rgrpd_get_next(rgd); 1155 rgd = gfs2_rgrpd_get_next(rgd);
@@ -1152,7 +1158,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1152 1158
1153 if (rgd == begin) { 1159 if (rgd == begin) {
1154 if (++loops >= 3) 1160 if (++loops >= 3)
1155 return ERR_PTR(-ENOSPC); 1161 return -ENOSPC;
1156 if (!skipped) 1162 if (!skipped)
1157 loops++; 1163 loops++;
1158 flags = 0; 1164 flags = 0;
@@ -1172,7 +1178,7 @@ out:
1172 forward_rgrp_set(sdp, rgd); 1178 forward_rgrp_set(sdp, rgd);
1173 } 1179 }
1174 1180
1175 return NULL; 1181 return 0;
1176} 1182}
1177 1183
1178/** 1184/**
@@ -1186,9 +1192,8 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
1186{ 1192{
1187 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1193 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1188 struct gfs2_alloc *al = ip->i_alloc; 1194 struct gfs2_alloc *al = ip->i_alloc;
1189 struct inode *inode;
1190 int error = 0; 1195 int error = 0;
1191 u64 last_unlinked = NO_BLOCK; 1196 u64 last_unlinked = NO_BLOCK, unlinked;
1192 1197
1193 if (gfs2_assert_warn(sdp, al->al_requested)) 1198 if (gfs2_assert_warn(sdp, al->al_requested))
1194 return -EINVAL; 1199 return -EINVAL;
@@ -1204,17 +1209,27 @@ try_again:
1204 if (error) 1209 if (error)
1205 return error; 1210 return error;
1206 1211
1207 inode = get_local_rgrp(ip, &last_unlinked); 1212 /* Find an rgrp suitable for allocation. If it encounters any unlinked
1208 if (inode) { 1213 dinodes along the way, error will equal -EAGAIN and unlinked will
1214 contain its block address. We then need to look up that inode and
1215 try to free it, and try the allocation again. */
1216 error = get_local_rgrp(ip, &unlinked, &last_unlinked);
1217 if (error) {
1209 if (ip != GFS2_I(sdp->sd_rindex)) 1218 if (ip != GFS2_I(sdp->sd_rindex))
1210 gfs2_glock_dq_uninit(&al->al_ri_gh); 1219 gfs2_glock_dq_uninit(&al->al_ri_gh);
1211 if (IS_ERR(inode)) 1220 if (error != -EAGAIN)
1212 return PTR_ERR(inode); 1221 return error;
1213 iput(inode); 1222
1223 gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked);
1224 /* regardless of whether or not gfs2_process_unlinked_inode
1225 was successful, we don't want to repeat it again. */
1226 last_unlinked = unlinked;
1214 gfs2_log_flush(sdp, NULL); 1227 gfs2_log_flush(sdp, NULL);
1228 error = 0;
1229
1215 goto try_again; 1230 goto try_again;
1216 } 1231 }
1217 1232 /* no error, so we have the rgrp set in the inode's allocation. */
1218 al->al_file = file; 1233 al->al_file = file;
1219 al->al_line = line; 1234 al->al_line = line;
1220 1235
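
The net effect of the rgrp.c changes above is a new contract between
get_local_rgrp() and its caller: an unlinked-but-allocated dinode is no
longer looked up (and thus iput()) while rgrp locks may still be held.
Instead its block address comes back alongside -EAGAIN and the reclaim
happens outside. The caller loop from gfs2_inplace_reserve_i() above,
distilled (rindex locking and error paths trimmed):

	u64 last_unlinked = NO_BLOCK, unlinked;
	int error;

try_again:
	error = get_local_rgrp(ip, &unlinked, &last_unlinked);
	if (error == -EAGAIN) {
		/* Reclaim with the rgrp glock dropped; remember the
		 * block so the same dinode is never retried. */
		gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked);
		last_unlinked = unlinked;
		gfs2_log_flush(sdp, NULL);
		goto try_again;
	}
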
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 50aac606b990..4d1aad38f1b1 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1113,7 +1113,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1113 int error; 1113 int error;
1114 1114
1115 spin_lock(&gt->gt_spin); 1115 spin_lock(&gt->gt_spin);
1116 args.ar_commit = gt->gt_log_flush_secs; 1116 args.ar_commit = gt->gt_logd_secs;
1117 args.ar_quota_quantum = gt->gt_quota_quantum; 1117 args.ar_quota_quantum = gt->gt_quota_quantum;
1118 if (gt->gt_statfs_slow) 1118 if (gt->gt_statfs_slow)
1119 args.ar_statfs_quantum = 0; 1119 args.ar_statfs_quantum = 0;
@@ -1160,7 +1160,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1160 else 1160 else
1161 clear_bit(SDF_NOBARRIERS, &sdp->sd_flags); 1161 clear_bit(SDF_NOBARRIERS, &sdp->sd_flags);
1162 spin_lock(&gt->gt_spin); 1162 spin_lock(&gt->gt_spin);
1163 gt->gt_log_flush_secs = args.ar_commit; 1163 gt->gt_logd_secs = args.ar_commit;
1164 gt->gt_quota_quantum = args.ar_quota_quantum; 1164 gt->gt_quota_quantum = args.ar_quota_quantum;
1165 if (args.ar_statfs_quantum) { 1165 if (args.ar_statfs_quantum) {
1166 gt->gt_statfs_slow = 0; 1166 gt->gt_statfs_slow = 0;
@@ -1305,8 +1305,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1305 } 1305 }
1306 if (args->ar_discard) 1306 if (args->ar_discard)
1307 seq_printf(s, ",discard"); 1307 seq_printf(s, ",discard");
1308 val = sdp->sd_tune.gt_log_flush_secs; 1308 val = sdp->sd_tune.gt_logd_secs;
1309 if (val != 60) 1309 if (val != 30)
1310 seq_printf(s, ",commit=%d", val); 1310 seq_printf(s, ",commit=%d", val);
1311 val = sdp->sd_tune.gt_statfs_quantum; 1311 val = sdp->sd_tune.gt_statfs_quantum;
1312 if (val != 30) 1312 if (val != 30)
@@ -1334,7 +1334,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1334 } 1334 }
1335 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) 1335 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
1336 seq_printf(s, ",nobarrier"); 1336 seq_printf(s, ",nobarrier");
1337 1337 if (test_bit(SDF_DEMOTE, &sdp->sd_flags))
1338 seq_printf(s, ",demote_interface_used");
1338 return 0; 1339 return 0;
1339} 1340}
1340 1341
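
A consistency point across the two gfs2/super.c hunks above: since
gfs2_show_options() prints only values that differ from the mount-time
defaults, halving the default commit interval from 60 to 30 seconds in
gfs2_get_sb() (ops_fstype.c earlier in this diff) requires the matching
threshold change here, or every default mount would report a spurious
commit= option. Condensed:

	/* Defaults (gfs2_get_sb) and print thresholds
	 * (gfs2_show_options) must move in lockstep: */
	args.ar_commit = 30;			/* new default */
	/* ... later, in gfs2_show_options(): */
	val = sdp->sd_tune.gt_logd_secs;
	if (val != 30)				/* print non-defaults only */
		seq_printf(s, ",commit=%d", val);
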
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 3df60f2d84e3..a0464680af0b 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -54,7 +54,7 @@ extern struct file_system_type gfs2meta_fs_type;
54extern const struct export_operations gfs2_export_ops; 54extern const struct export_operations gfs2_export_ops;
55extern const struct super_operations gfs2_super_ops; 55extern const struct super_operations gfs2_super_ops;
56extern const struct dentry_operations gfs2_dops; 56extern const struct dentry_operations gfs2_dops;
57extern struct xattr_handler *gfs2_xattr_handlers[]; 57extern const struct xattr_handler *gfs2_xattr_handlers[];
58 58
59#endif /* __SUPER_DOT_H__ */ 59#endif /* __SUPER_DOT_H__ */
60 60
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 54fd98425991..37f5393e68e6 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -232,6 +232,8 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
232 glops = gfs2_glops_list[gltype]; 232 glops = gfs2_glops_list[gltype];
233 if (glops == NULL) 233 if (glops == NULL)
234 return -EINVAL; 234 return -EINVAL;
235 if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags))
236 fs_info(sdp, "demote interface used\n");
235 rv = gfs2_glock_get(sdp, glnum, glops, 0, &gl); 237 rv = gfs2_glock_get(sdp, glnum, glops, 0, &gl);
236 if (rv) 238 if (rv)
237 return rv; 239 return rv;
@@ -468,8 +470,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
468} \ 470} \
469TUNE_ATTR_2(name, name##_store) 471TUNE_ATTR_2(name, name##_store)
470 472
471TUNE_ATTR(incore_log_blocks, 0);
472TUNE_ATTR(log_flush_secs, 0);
473TUNE_ATTR(quota_warn_period, 0); 473TUNE_ATTR(quota_warn_period, 0);
474TUNE_ATTR(quota_quantum, 0); 474TUNE_ATTR(quota_quantum, 0);
475TUNE_ATTR(max_readahead, 0); 475TUNE_ATTR(max_readahead, 0);
@@ -481,8 +481,6 @@ TUNE_ATTR(statfs_quantum, 1);
481TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); 481TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
482 482
483static struct attribute *tune_attrs[] = { 483static struct attribute *tune_attrs[] = {
484 &tune_attr_incore_log_blocks.attr,
485 &tune_attr_log_flush_secs.attr,
486 &tune_attr_quota_warn_period.attr, 484 &tune_attr_quota_warn_period.attr,
487 &tune_attr_quota_quantum.attr, 485 &tune_attr_quota_quantum.attr,
488 &tune_attr_max_readahead.attr, 486 &tune_attr_max_readahead.attr,
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 4ef0e9fa3549..9ec73a854111 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -23,6 +23,7 @@
23#include "meta_io.h" 23#include "meta_io.h"
24#include "trans.h" 24#include "trans.h"
25#include "util.h" 25#include "util.h"
26#include "trace_gfs2.h"
26 27
27int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, 28int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
28 unsigned int revokes) 29 unsigned int revokes)
@@ -75,6 +76,23 @@ fail_holder_uninit:
75 return error; 76 return error;
76} 77}
77 78
79/**
80 * gfs2_log_release - Release a given number of log blocks
81 * @sdp: The GFS2 superblock
82 * @blks: The number of blocks
83 *
84 */
85
86static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
87{
88
89 atomic_add(blks, &sdp->sd_log_blks_free);
90 trace_gfs2_log_blocks(sdp, blks);
91 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
92 sdp->sd_jdesc->jd_blocks);
93 up_read(&sdp->sd_log_flush_lock);
94}
95
78void gfs2_trans_end(struct gfs2_sbd *sdp) 96void gfs2_trans_end(struct gfs2_sbd *sdp)
79{ 97{
80 struct gfs2_trans *tr = current->journal_info; 98 struct gfs2_trans *tr = current->journal_info;
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index c2ebdf2c01d4..82f93da00d1b 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1535,21 +1535,21 @@ out_alloc:
1535 return error; 1535 return error;
1536} 1536}
1537 1537
1538static struct xattr_handler gfs2_xattr_user_handler = { 1538static const struct xattr_handler gfs2_xattr_user_handler = {
1539 .prefix = XATTR_USER_PREFIX, 1539 .prefix = XATTR_USER_PREFIX,
1540 .flags = GFS2_EATYPE_USR, 1540 .flags = GFS2_EATYPE_USR,
1541 .get = gfs2_xattr_get, 1541 .get = gfs2_xattr_get,
1542 .set = gfs2_xattr_set, 1542 .set = gfs2_xattr_set,
1543}; 1543};
1544 1544
1545static struct xattr_handler gfs2_xattr_security_handler = { 1545static const struct xattr_handler gfs2_xattr_security_handler = {
1546 .prefix = XATTR_SECURITY_PREFIX, 1546 .prefix = XATTR_SECURITY_PREFIX,
1547 .flags = GFS2_EATYPE_SECURITY, 1547 .flags = GFS2_EATYPE_SECURITY,
1548 .get = gfs2_xattr_get, 1548 .get = gfs2_xattr_get,
1549 .set = gfs2_xattr_set, 1549 .set = gfs2_xattr_set,
1550}; 1550};
1551 1551
1552struct xattr_handler *gfs2_xattr_handlers[] = { 1552const struct xattr_handler *gfs2_xattr_handlers[] = {
1553 &gfs2_xattr_user_handler, 1553 &gfs2_xattr_user_handler,
1554 &gfs2_xattr_security_handler, 1554 &gfs2_xattr_security_handler,
1555 &gfs2_xattr_system_handler, 1555 &gfs2_xattr_system_handler,
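
The const-ification above (matched by the super.h declaration, and
mirrored for jffs2 further down) lets every xattr handler table live in
read-only data. Nothing in the VFS ever writes through these pointers;
generic lookup just walks a NULL-terminated array of const pointers,
roughly as in this sketch (modeled loosely on the prefix matching in
fs/xattr.c, not a quote of it):

#include <linux/string.h>
#include <linux/xattr.h>

static const struct xattr_handler *
find_xattr_handler(const struct xattr_handler **handlers, const char *name)
{
	const struct xattr_handler *h;

	for (; (h = *handlers) != NULL; handlers++) {
		if (!strncmp(h->prefix, name, strlen(h->prefix)))
			return h;
	}
	return NULL;
}
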
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 5f4023678251..764fd1bdca88 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -494,7 +494,7 @@ const struct inode_operations hfsplus_dir_inode_operations = {
494const struct file_operations hfsplus_dir_operations = { 494const struct file_operations hfsplus_dir_operations = {
495 .read = generic_read_dir, 495 .read = generic_read_dir,
496 .readdir = hfsplus_readdir, 496 .readdir = hfsplus_readdir,
497 .ioctl = hfsplus_ioctl, 497 .unlocked_ioctl = hfsplus_ioctl,
498 .llseek = generic_file_llseek, 498 .llseek = generic_file_llseek,
499 .release = hfsplus_dir_release, 499 .release = hfsplus_dir_release,
500}; 500};
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 5c10d803d9df..6505c30ad965 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -337,8 +337,7 @@ struct inode *hfsplus_new_inode(struct super_block *, int);
337void hfsplus_delete_inode(struct inode *); 337void hfsplus_delete_inode(struct inode *);
338 338
339/* ioctl.c */ 339/* ioctl.c */
340int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, 340long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
341 unsigned long arg);
342int hfsplus_setxattr(struct dentry *dentry, const char *name, 341int hfsplus_setxattr(struct dentry *dentry, const char *name,
343 const void *value, size_t size, int flags); 342 const void *value, size_t size, int flags);
344ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, 343ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 1bcf597c0562..9bbb82924a22 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -285,7 +285,7 @@ static const struct file_operations hfsplus_file_operations = {
285 .fsync = file_fsync, 285 .fsync = file_fsync,
286 .open = hfsplus_file_open, 286 .open = hfsplus_file_open,
287 .release = hfsplus_file_release, 287 .release = hfsplus_file_release,
288 .ioctl = hfsplus_ioctl, 288 .unlocked_ioctl = hfsplus_ioctl,
289}; 289};
290 290
291struct inode *hfsplus_new_inode(struct super_block *sb, int mode) 291struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index f457d2ca51ab..ac405f099026 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -17,14 +17,16 @@
17#include <linux/mount.h> 17#include <linux/mount.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/xattr.h> 19#include <linux/xattr.h>
20#include <linux/smp_lock.h>
20#include <asm/uaccess.h> 21#include <asm/uaccess.h>
21#include "hfsplus_fs.h" 22#include "hfsplus_fs.h"
22 23
23int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, 24long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
24 unsigned long arg)
25{ 25{
26 struct inode *inode = filp->f_path.dentry->d_inode;
26 unsigned int flags; 27 unsigned int flags;
27 28
29 lock_kernel();
28 switch (cmd) { 30 switch (cmd) {
29 case HFSPLUS_IOC_EXT2_GETFLAGS: 31 case HFSPLUS_IOC_EXT2_GETFLAGS:
30 flags = 0; 32 flags = 0;
@@ -38,8 +40,10 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
38 case HFSPLUS_IOC_EXT2_SETFLAGS: { 40 case HFSPLUS_IOC_EXT2_SETFLAGS: {
39 int err = 0; 41 int err = 0;
40 err = mnt_want_write(filp->f_path.mnt); 42 err = mnt_want_write(filp->f_path.mnt);
41 if (err) 43 if (err) {
44 unlock_kernel();
42 return err; 45 return err;
46 }
43 47
44 if (!is_owner_or_cap(inode)) { 48 if (!is_owner_or_cap(inode)) {
45 err = -EACCES; 49 err = -EACCES;
@@ -85,9 +89,11 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
85 mark_inode_dirty(inode); 89 mark_inode_dirty(inode);
86setflags_out: 90setflags_out:
87 mnt_drop_write(filp->f_path.mnt); 91 mnt_drop_write(filp->f_path.mnt);
92 unlock_kernel();
88 return err; 93 return err;
89 } 94 }
90 default: 95 default:
96 unlock_kernel();
91 return -ENOTTY; 97 return -ENOTTY;
92 } 98 }
93} 99}
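
The hfsplus hunks above are a standard BKL pushdown of this era: the
->ioctl method (called with the big kernel lock held) becomes
->unlocked_ioctl, the inode is derived from the file, and lock_kernel()
is taken explicitly so later patches can shrink or remove it. The
per-branch unlock_kernel() calls the patch sprinkles in are exactly why
a single-exit shape is often preferred for such conversions; a sketch,
with do_fs_ioctl() as a hypothetical helper rather than an hfsplus
function:

#include <linux/fs.h>
#include <linux/smp_lock.h>

static long fs_unlocked_ioctl(struct file *filp, unsigned int cmd,
			      unsigned long arg)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	long ret;

	lock_kernel();		/* what the old .ioctl method implied */
	ret = do_fs_ioctl(inode, filp, cmd, arg);
	unlock_kernel();	/* one unlock, no per-branch copies */
	return ret;
}
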
diff --git a/fs/inode.c b/fs/inode.c
index 258ec22bb298..2bee20ae3d65 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -286,11 +286,9 @@ static void init_once(void *foo)
286 */ 286 */
287void __iget(struct inode *inode) 287void __iget(struct inode *inode)
288{ 288{
289 if (atomic_read(&inode->i_count)) { 289 if (atomic_inc_return(&inode->i_count) != 1)
290 atomic_inc(&inode->i_count);
291 return; 290 return;
292 } 291
293 atomic_inc(&inode->i_count);
294 if (!(inode->i_state & (I_DIRTY|I_SYNC))) 292 if (!(inode->i_state & (I_DIRTY|I_SYNC)))
295 list_move(&inode->i_list, &inode_in_use); 293 list_move(&inode->i_list, &inode_in_use);
296 inodes_stat.nr_unused--; 294 inodes_stat.nr_unused--;
@@ -1608,3 +1606,23 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
1608 inode->i_ino); 1606 inode->i_ino);
1609} 1607}
1610EXPORT_SYMBOL(init_special_inode); 1608EXPORT_SYMBOL(init_special_inode);
1609
1610/**
1611 * inode_init_owner - Init uid,gid,mode for new inode according to POSIX standards
1612 * @inode: New inode
1613 * @dir: Directory inode
1614 * @mode: mode of the new inode
1615 */
1616void inode_init_owner(struct inode *inode, const struct inode *dir,
1617 mode_t mode)
1618{
1619 inode->i_uid = current_fsuid();
1620 if (dir && dir->i_mode & S_ISGID) {
1621 inode->i_gid = dir->i_gid;
1622 if (S_ISDIR(mode))
1623 mode |= S_ISGID;
1624 } else
1625 inode->i_gid = current_fsgid();
1626 inode->i_mode = mode;
1627}
1628EXPORT_SYMBOL(inode_init_owner);
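
Two independent cleanups above: __iget() replaces a read-then-increment
pair with a single atomic_inc_return(), and the new inode_init_owner()
helper centralizes uid/gid/mode setup, including the setgid-directory
rule (files inherit the directory's gid; new directories also inherit
S_ISGID). A typical call site, as the jfs conversion later in this diff
shows, collapses to:

	/* Sketch of a filesystem inode-allocation path; error handling
	 * and fs-specific fields omitted. */
	struct inode *inode = new_inode(sb);

	if (!inode)
		return ERR_PTR(-ENOMEM);
	inode_init_owner(inode, dir, mode);	/* uid, gid, mode, setgid rule */
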
diff --git a/fs/internal.h b/fs/internal.h
index 8a03a5447bdf..6b706bc60a66 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -87,6 +87,8 @@ extern struct file *get_empty_filp(void);
87 * super.c 87 * super.c
88 */ 88 */
89extern int do_remount_sb(struct super_block *, int, void *, int); 89extern int do_remount_sb(struct super_block *, int, void *, int);
90extern void __put_super(struct super_block *sb);
91extern void put_super(struct super_block *sb);
90 92
91/* 93/*
92 * open.c 94 * open.c
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 7faefb4da939..2d140a713861 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -525,15 +525,8 @@ static int ioctl_fsfreeze(struct file *filp)
525 if (sb->s_op->freeze_fs == NULL) 525 if (sb->s_op->freeze_fs == NULL)
526 return -EOPNOTSUPP; 526 return -EOPNOTSUPP;
527 527
528 /* If a blockdevice-backed filesystem isn't specified, return. */
529 if (sb->s_bdev == NULL)
530 return -EINVAL;
531
532 /* Freeze */ 528 /* Freeze */
533 sb = freeze_bdev(sb->s_bdev); 529 return freeze_super(sb);
534 if (IS_ERR(sb))
535 return PTR_ERR(sb);
536 return 0;
537} 530}
538 531
539static int ioctl_fsthaw(struct file *filp) 532static int ioctl_fsthaw(struct file *filp)
@@ -543,12 +536,8 @@ static int ioctl_fsthaw(struct file *filp)
543 if (!capable(CAP_SYS_ADMIN)) 536 if (!capable(CAP_SYS_ADMIN))
544 return -EPERM; 537 return -EPERM;
545 538
546 /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
547 if (sb->s_bdev == NULL)
548 return -EINVAL;
549
550 /* Thaw */ 539 /* Thaw */
551 return thaw_bdev(sb->s_bdev, sb); 540 return thaw_super(sb);
552} 541}
553 542
554/* 543/*
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index ecb44c94ba8d..28a9ddaa0c49 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -786,6 +786,12 @@ wait_for_iobuf:
786 786
787 jbd_debug(3, "JBD: commit phase 6\n"); 787 jbd_debug(3, "JBD: commit phase 6\n");
788 788
789 /* All metadata is written, now write commit record and do cleanup */
790 spin_lock(&journal->j_state_lock);
791 J_ASSERT(commit_transaction->t_state == T_COMMIT);
792 commit_transaction->t_state = T_COMMIT_RECORD;
793 spin_unlock(&journal->j_state_lock);
794
789 if (journal_write_commit_record(journal, commit_transaction)) 795 if (journal_write_commit_record(journal, commit_transaction))
790 err = -EIO; 796 err = -EIO;
791 797
@@ -923,7 +929,7 @@ restart_loop:
923 929
924 jbd_debug(3, "JBD: commit phase 8\n"); 930 jbd_debug(3, "JBD: commit phase 8\n");
925 931
926 J_ASSERT(commit_transaction->t_state == T_COMMIT); 932 J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD);
927 933
928 commit_transaction->t_state = T_FINISHED; 934 commit_transaction->t_state = T_FINISHED;
929 J_ASSERT(commit_transaction == journal->j_committing_transaction); 935 J_ASSERT(commit_transaction == journal->j_committing_transaction);
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index bd224eec9b07..93d1e47647bd 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -565,6 +565,38 @@ int log_wait_commit(journal_t *journal, tid_t tid)
565} 565}
566 566
567/* 567/*
568 * Return 1 if a given transaction has not yet sent a barrier request
569 * connected with a transaction commit. If 0 is returned, the transaction
570 * may or may not have sent the barrier. Used to avoid sending the barrier
571 * twice in common cases.
572 */
573int journal_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
574{
575 int ret = 0;
576 transaction_t *commit_trans;
577
578 if (!(journal->j_flags & JFS_BARRIER))
579 return 0;
580 spin_lock(&journal->j_state_lock);
581 /* Transaction already committed? */
582 if (tid_geq(journal->j_commit_sequence, tid))
583 goto out;
584 /*
585 * Transaction is being committed and we already proceeded to
586 * writing commit record?
587 */
588 commit_trans = journal->j_committing_transaction;
589 if (commit_trans && commit_trans->t_tid == tid &&
590 commit_trans->t_state >= T_COMMIT_RECORD)
591 goto out;
592 ret = 1;
593out:
594 spin_unlock(&journal->j_state_lock);
595 return ret;
596}
597EXPORT_SYMBOL(journal_trans_will_send_data_barrier);
598
599/*
568 * Log buffer allocation routines: 600 * Log buffer allocation routines:
569 */ 601 */
570 602
@@ -1157,6 +1189,7 @@ int journal_destroy(journal_t *journal)
1157{ 1189{
1158 int err = 0; 1190 int err = 0;
1159 1191
1192
1160 /* Wait for the commit thread to wake up and die. */ 1193 /* Wait for the commit thread to wake up and die. */
1161 journal_kill_thread(journal); 1194 journal_kill_thread(journal);
1162 1195
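
journal_trans_will_send_data_barrier() relies on the T_COMMIT_RECORD
state added in the jbd/commit.c hunk above: past T_COMMIT the commit
record (and with it the barrier) may already be on its way, so the
function can only promise a future barrier while the transaction sits
at or before T_COMMIT. The point of exporting it is to let a
filesystem's fsync path skip a redundant cache flush; presumably the
fs/ext3/fsync.c change in this same series uses it along these lines
(a sketch of the assumed caller shape, not a quote of that hunk):

	int needs_barrier = 0;

	if (test_opt(inode->i_sb, BARRIER) &&
	    !journal_trans_will_send_data_barrier(journal, commit_tid))
		needs_barrier = 1;	/* the commit won't flush for us */
	ret = log_wait_commit(journal, commit_tid);
	if (needs_barrier)
		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
				   BLKDEV_IFL_WAIT);
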
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 30beb11ef928..076d1cc44f95 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -530,7 +530,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
530 */ 530 */
531 if ((journal->j_fs_dev != journal->j_dev) && 531 if ((journal->j_fs_dev != journal->j_dev) &&
532 (journal->j_flags & JBD2_BARRIER)) 532 (journal->j_flags & JBD2_BARRIER))
533 blkdev_issue_flush(journal->j_fs_dev, NULL); 533 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
534 BLKDEV_IFL_WAIT);
534 if (!(journal->j_flags & JBD2_ABORT)) 535 if (!(journal->j_flags & JBD2_ABORT))
535 jbd2_journal_update_superblock(journal, 1); 536 jbd2_journal_update_superblock(journal, 1);
536 return 0; 537 return 0;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 671da7fb7ffd..75716d3d2be0 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -717,7 +717,8 @@ start_journal_io:
717 if (commit_transaction->t_flushed_data_blocks && 717 if (commit_transaction->t_flushed_data_blocks &&
718 (journal->j_fs_dev != journal->j_dev) && 718 (journal->j_fs_dev != journal->j_dev) &&
719 (journal->j_flags & JBD2_BARRIER)) 719 (journal->j_flags & JBD2_BARRIER))
720 blkdev_issue_flush(journal->j_fs_dev, NULL); 720 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
721 BLKDEV_IFL_WAIT);
721 722
722 /* Done it all: now write the commit record asynchronously. */ 723 /* Done it all: now write the commit record asynchronously. */
723 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 724 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -727,7 +728,8 @@ start_journal_io:
727 if (err) 728 if (err)
728 __jbd2_journal_abort_hard(journal); 729 __jbd2_journal_abort_hard(journal);
729 if (journal->j_flags & JBD2_BARRIER) 730 if (journal->j_flags & JBD2_BARRIER)
730 blkdev_issue_flush(journal->j_dev, NULL); 731 blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
732 BLKDEV_IFL_WAIT);
731 } 733 }
732 734
733 err = journal_finish_inode_data_buffers(journal, commit_transaction); 735 err = journal_finish_inode_data_buffers(journal, commit_transaction);
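
The jbd and jbd2 hunks above both track the blkdev_issue_flush()
interface change: the function now takes an allocation mask and a flags
word, with waiting made explicit through BLKDEV_IFL_WAIT. The
2.6.35-era prototype and the synchronous idiom used here:

#include <linux/blkdev.h>

/* Prototype as of this series (see linux/blkdev.h): */
int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
		       sector_t *error_sector, unsigned long flags);

/* Synchronous flush of the journal device, as in the hunks above: */
blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
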
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 7cdc3196476a..a33aab6b5e68 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -419,7 +419,7 @@ static int jffs2_acl_setxattr(struct dentry *dentry, const char *name,
419 return rc; 419 return rc;
420} 420}
421 421
422struct xattr_handler jffs2_acl_access_xattr_handler = { 422const struct xattr_handler jffs2_acl_access_xattr_handler = {
423 .prefix = POSIX_ACL_XATTR_ACCESS, 423 .prefix = POSIX_ACL_XATTR_ACCESS,
424 .flags = ACL_TYPE_DEFAULT, 424 .flags = ACL_TYPE_DEFAULT,
425 .list = jffs2_acl_access_listxattr, 425 .list = jffs2_acl_access_listxattr,
@@ -427,7 +427,7 @@ struct xattr_handler jffs2_acl_access_xattr_handler = {
427 .set = jffs2_acl_setxattr, 427 .set = jffs2_acl_setxattr,
428}; 428};
429 429
430struct xattr_handler jffs2_acl_default_xattr_handler = { 430const struct xattr_handler jffs2_acl_default_xattr_handler = {
431 .prefix = POSIX_ACL_XATTR_DEFAULT, 431 .prefix = POSIX_ACL_XATTR_DEFAULT,
432 .flags = ACL_TYPE_DEFAULT, 432 .flags = ACL_TYPE_DEFAULT,
433 .list = jffs2_acl_default_listxattr, 433 .list = jffs2_acl_default_listxattr,
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index f0ba63e3c36b..5e42de8d9541 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -31,8 +31,8 @@ extern int jffs2_acl_chmod(struct inode *);
31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); 31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
32extern int jffs2_init_acl_post(struct inode *); 32extern int jffs2_init_acl_post(struct inode *);
33 33
34extern struct xattr_handler jffs2_acl_access_xattr_handler; 34extern const struct xattr_handler jffs2_acl_access_xattr_handler;
35extern struct xattr_handler jffs2_acl_default_xattr_handler; 35extern const struct xattr_handler jffs2_acl_default_xattr_handler;
36 36
37#else 37#else
38 38
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 3ff50da94789..55f1dde2fa8b 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -23,10 +23,9 @@ static int jffs2_garbage_collect_thread(void *);
23 23
24void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c) 24void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c)
25{ 25{
26 spin_lock(&c->erase_completion_lock); 26 assert_spin_locked(&c->erase_completion_lock);
27 if (c->gc_task && jffs2_thread_should_wake(c)) 27 if (c->gc_task && jffs2_thread_should_wake(c))
28 send_sig(SIGHUP, c->gc_task, 1); 28 send_sig(SIGHUP, c->gc_task, 1);
29 spin_unlock(&c->erase_completion_lock);
30} 29}
31 30
32/* This must only ever be called when no GC thread is currently running */ 31/* This must only ever be called when no GC thread is currently running */
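
jffs2_garbage_collect_trigger() now asserts erase_completion_lock
rather than taking it, since nearly every call site (nodemgmt.c,
wbuf.c, erase.c, scan.c below) already holds that lock when it decides
to wake the GC thread. The caller-side contract is therefore always:

	spin_lock(&c->erase_completion_lock);
	jffs2_garbage_collect_trigger(c);
	spin_unlock(&c->erase_completion_lock);
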
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index b47679be118a..6286ad9b00f7 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -103,9 +103,10 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
103 jffs2_erase_failed(c, jeb, bad_offset); 103 jffs2_erase_failed(c, jeb, bad_offset);
104} 104}
105 105
106void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count) 106int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
107{ 107{
108 struct jffs2_eraseblock *jeb; 108 struct jffs2_eraseblock *jeb;
109 int work_done = 0;
109 110
110 mutex_lock(&c->erase_free_sem); 111 mutex_lock(&c->erase_free_sem);
111 112
@@ -121,6 +122,7 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
121 mutex_unlock(&c->erase_free_sem); 122 mutex_unlock(&c->erase_free_sem);
122 jffs2_mark_erased_block(c, jeb); 123 jffs2_mark_erased_block(c, jeb);
123 124
125 work_done++;
124 if (!--count) { 126 if (!--count) {
125 D1(printk(KERN_DEBUG "Count reached. jffs2_erase_pending_blocks leaving\n")); 127 D1(printk(KERN_DEBUG "Count reached. jffs2_erase_pending_blocks leaving\n"));
126 goto done; 128 goto done;
@@ -157,6 +159,7 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
157 mutex_unlock(&c->erase_free_sem); 159 mutex_unlock(&c->erase_free_sem);
158 done: 160 done:
159 D1(printk(KERN_DEBUG "jffs2_erase_pending_blocks completed\n")); 161 D1(printk(KERN_DEBUG "jffs2_erase_pending_blocks completed\n"));
162 return work_done;
160} 163}
161 164
162static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) 165static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
@@ -165,10 +168,11 @@ static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblo
165 mutex_lock(&c->erase_free_sem); 168 mutex_lock(&c->erase_free_sem);
166 spin_lock(&c->erase_completion_lock); 169 spin_lock(&c->erase_completion_lock);
167 list_move_tail(&jeb->list, &c->erase_complete_list); 170 list_move_tail(&jeb->list, &c->erase_complete_list);
171 /* Wake the GC thread to mark them clean */
172 jffs2_garbage_collect_trigger(c);
168 spin_unlock(&c->erase_completion_lock); 173 spin_unlock(&c->erase_completion_lock);
169 mutex_unlock(&c->erase_free_sem); 174 mutex_unlock(&c->erase_free_sem);
170 /* Ensure that kupdated calls us again to mark them clean */ 175 wake_up(&c->erase_wait);
171 jffs2_erase_pending_trigger(c);
172} 176}
173 177
174static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset) 178static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset)
@@ -487,9 +491,9 @@ filebad:
487 491
488refile: 492refile:
489 /* Stick it back on the list from whence it came and come back later */ 493 /* Stick it back on the list from whence it came and come back later */
490 jffs2_erase_pending_trigger(c);
491 mutex_lock(&c->erase_free_sem); 494 mutex_lock(&c->erase_free_sem);
492 spin_lock(&c->erase_completion_lock); 495 spin_lock(&c->erase_completion_lock);
496 jffs2_garbage_collect_trigger(c);
493 list_move(&jeb->list, &c->erase_complete_list); 497 list_move(&jeb->list, &c->erase_complete_list);
494 spin_unlock(&c->erase_completion_lock); 498 spin_unlock(&c->erase_completion_lock);
495 mutex_unlock(&c->erase_free_sem); 499 mutex_unlock(&c->erase_free_sem);
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 3451a81b2142..86e0821fc989 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -313,8 +313,8 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
313 case S_IFBLK: 313 case S_IFBLK:
314 case S_IFCHR: 314 case S_IFCHR:
315 /* Read the device numbers from the media */ 315 /* Read the device numbers from the media */
316 if (f->metadata->size != sizeof(jdev.old) && 316 if (f->metadata->size != sizeof(jdev.old_id) &&
317 f->metadata->size != sizeof(jdev.new)) { 317 f->metadata->size != sizeof(jdev.new_id)) {
318 printk(KERN_NOTICE "Device node has strange size %d\n", f->metadata->size); 318 printk(KERN_NOTICE "Device node has strange size %d\n", f->metadata->size);
319 goto error_io; 319 goto error_io;
320 } 320 }
@@ -325,10 +325,10 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
325 printk(KERN_NOTICE "Read device numbers for inode %lu failed\n", (unsigned long)inode->i_ino); 325 printk(KERN_NOTICE "Read device numbers for inode %lu failed\n", (unsigned long)inode->i_ino);
326 goto error; 326 goto error;
327 } 327 }
328 if (f->metadata->size == sizeof(jdev.old)) 328 if (f->metadata->size == sizeof(jdev.old_id))
329 rdev = old_decode_dev(je16_to_cpu(jdev.old)); 329 rdev = old_decode_dev(je16_to_cpu(jdev.old_id));
330 else 330 else
331 rdev = new_decode_dev(je32_to_cpu(jdev.new)); 331 rdev = new_decode_dev(je32_to_cpu(jdev.new_id));
332 332
333 case S_IFSOCK: 333 case S_IFSOCK:
334 case S_IFIFO: 334 case S_IFIFO:
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 3b6f2fa12cff..f5e96bd656e8 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -214,6 +214,19 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
214 return ret; 214 return ret;
215 } 215 }
216 216
217 /* If there are any blocks which need erasing, erase them now */
218 if (!list_empty(&c->erase_complete_list) ||
219 !list_empty(&c->erase_pending_list)) {
220 spin_unlock(&c->erase_completion_lock);
221 D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() erasing pending blocks\n"));
222 if (jffs2_erase_pending_blocks(c, 1)) {
223 mutex_unlock(&c->alloc_sem);
224 return 0;
225 }
226 D1(printk(KERN_DEBUG "No progress from erasing blocks; doing GC anyway\n"));
227 spin_lock(&c->erase_completion_lock);
228 }
229
217 /* First, work out which block we're garbage-collecting */ 230 /* First, work out which block we're garbage-collecting */
218 jeb = c->gcblock; 231 jeb = c->gcblock;
219 232
@@ -222,7 +235,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
222 235
223 if (!jeb) { 236 if (!jeb) {
224 /* Couldn't find a free block. But maybe we can just erase one and make 'progress'? */ 237 /* Couldn't find a free block. But maybe we can just erase one and make 'progress'? */
225 if (!list_empty(&c->erase_pending_list)) { 238 if (c->nr_erasing_blocks) {
226 spin_unlock(&c->erase_completion_lock); 239 spin_unlock(&c->erase_completion_lock);
227 mutex_unlock(&c->alloc_sem); 240 mutex_unlock(&c->alloc_sem);
228 return -EAGAIN; 241 return -EAGAIN;
@@ -435,7 +448,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
435 list_add_tail(&c->gcblock->list, &c->erase_pending_list); 448 list_add_tail(&c->gcblock->list, &c->erase_pending_list);
436 c->gcblock = NULL; 449 c->gcblock = NULL;
437 c->nr_erasing_blocks++; 450 c->nr_erasing_blocks++;
438 jffs2_erase_pending_trigger(c); 451 jffs2_garbage_collect_trigger(c);
439 } 452 }
440 spin_unlock(&c->erase_completion_lock); 453 spin_unlock(&c->erase_completion_lock);
441 454
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 507ed6ec1847..a881a42f19e3 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -312,11 +312,11 @@ static inline int jffs2_blocks_use_vmalloc(struct jffs2_sb_info *c)
312static inline int jffs2_encode_dev(union jffs2_device_node *jdev, dev_t rdev) 312static inline int jffs2_encode_dev(union jffs2_device_node *jdev, dev_t rdev)
313{ 313{
314 if (old_valid_dev(rdev)) { 314 if (old_valid_dev(rdev)) {
315 jdev->old = cpu_to_je16(old_encode_dev(rdev)); 315 jdev->old_id = cpu_to_je16(old_encode_dev(rdev));
316 return sizeof(jdev->old); 316 return sizeof(jdev->old_id);
317 } else { 317 } else {
318 jdev->new = cpu_to_je32(new_encode_dev(rdev)); 318 jdev->new_id = cpu_to_je32(new_encode_dev(rdev));
319 return sizeof(jdev->new); 319 return sizeof(jdev->new_id);
320 } 320 }
321} 321}
322 322
@@ -464,7 +464,7 @@ int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
464int jffs2_do_mount_fs(struct jffs2_sb_info *c); 464int jffs2_do_mount_fs(struct jffs2_sb_info *c);
465 465
466/* erase.c */ 466/* erase.c */
467void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count); 467int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count);
468void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb); 468void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
469 469
470#ifdef CONFIG_JFFS2_FS_WRITEBUFFER 470#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 191359dde4e1..694aa5b03505 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -116,9 +116,21 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
116 116
117 ret = jffs2_garbage_collect_pass(c); 117 ret = jffs2_garbage_collect_pass(c);
118 118
119 if (ret == -EAGAIN) 119 if (ret == -EAGAIN) {
120 jffs2_erase_pending_blocks(c, 1); 120 spin_lock(&c->erase_completion_lock);
121 else if (ret) 121 if (c->nr_erasing_blocks &&
122 list_empty(&c->erase_pending_list) &&
123 list_empty(&c->erase_complete_list)) {
124 DECLARE_WAITQUEUE(wait, current);
125 set_current_state(TASK_UNINTERRUPTIBLE);
126 add_wait_queue(&c->erase_wait, &wait);
127 D1(printk(KERN_DEBUG "%s waiting for erase to complete\n", __func__));
128 spin_unlock(&c->erase_completion_lock);
129
130 schedule();
131 } else
132 spin_unlock(&c->erase_completion_lock);
133 } else if (ret)
122 return ret; 134 return ret;
123 135
124 cond_resched(); 136 cond_resched();
@@ -217,7 +229,7 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c)
217 ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list); 229 ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list);
218 list_move_tail(&ejeb->list, &c->erase_pending_list); 230 list_move_tail(&ejeb->list, &c->erase_pending_list);
219 c->nr_erasing_blocks++; 231 c->nr_erasing_blocks++;
220 jffs2_erase_pending_trigger(c); 232 jffs2_garbage_collect_trigger(c);
221 D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n", 233 D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n",
222 ejeb->offset)); 234 ejeb->offset));
223 } 235 }
@@ -469,7 +481,9 @@ struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c,
469void jffs2_complete_reservation(struct jffs2_sb_info *c) 481void jffs2_complete_reservation(struct jffs2_sb_info *c)
470{ 482{
471 D1(printk(KERN_DEBUG "jffs2_complete_reservation()\n")); 483 D1(printk(KERN_DEBUG "jffs2_complete_reservation()\n"));
484 spin_lock(&c->erase_completion_lock);
472 jffs2_garbage_collect_trigger(c); 485 jffs2_garbage_collect_trigger(c);
486 spin_unlock(&c->erase_completion_lock);
473 mutex_unlock(&c->alloc_sem); 487 mutex_unlock(&c->alloc_sem);
474} 488}
475 489
@@ -611,7 +625,7 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
611 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n")); 625 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n"));
612 list_add_tail(&jeb->list, &c->erase_pending_list); 626 list_add_tail(&jeb->list, &c->erase_pending_list);
613 c->nr_erasing_blocks++; 627 c->nr_erasing_blocks++;
614 jffs2_erase_pending_trigger(c); 628 jffs2_garbage_collect_trigger(c);
615 } else { 629 } else {
616 /* Sometimes, however, we leave it elsewhere so it doesn't get 630 /* Sometimes, however, we leave it elsewhere so it doesn't get
617 immediately reused, and we spread the load a bit. */ 631 immediately reused, and we spread the load a bit. */
@@ -732,6 +746,10 @@ int jffs2_thread_should_wake(struct jffs2_sb_info *c)
732 int nr_very_dirty = 0; 746 int nr_very_dirty = 0;
733 struct jffs2_eraseblock *jeb; 747 struct jffs2_eraseblock *jeb;
734 748
749 if (!list_empty(&c->erase_complete_list) ||
750 !list_empty(&c->erase_pending_list))
751 return 1;
752
735 if (c->unchecked_size) { 753 if (c->unchecked_size) {
736 D1(printk(KERN_DEBUG "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n", 754 D1(printk(KERN_DEBUG "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n",
737 c->unchecked_size, c->checked_ino)); 755 c->unchecked_size, c->checked_ino));
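
The new wait in jffs2_reserve_space() above is the classic open-coded
waitqueue sleep: queue the waiter and set the task state before
dropping the lock, so the wake_up(&c->erase_wait) added in erase.c
cannot slip in between the emptiness test and schedule(). One caveat:
the on-stack wait entry must also come off the queue once the task
wakes; the hunk above omits that step, and mainline performs
remove_wait_queue() right after schedule(). The full pattern:

	DECLARE_WAITQUEUE(wait, current);

	set_current_state(TASK_UNINTERRUPTIBLE);
	add_wait_queue(&c->erase_wait, &wait);
	spin_unlock(&c->erase_completion_lock);	/* queued: no lost wakeup */
	schedule();		/* woken by wake_up(&c->erase_wait) */
	remove_wait_queue(&c->erase_wait, &wait);
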
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index a7f03b7ebcb3..035a767f958b 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -140,8 +140,7 @@ void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c);
140 140
141#endif /* WRITEBUFFER */ 141#endif /* WRITEBUFFER */
142 142
143/* erase.c */ 143static inline void jffs2_dirty_trigger(struct jffs2_sb_info *c)
144static inline void jffs2_erase_pending_trigger(struct jffs2_sb_info *c)
145{ 144{
146 OFNI_BS_2SFFJ(c)->s_dirt = 1; 145 OFNI_BS_2SFFJ(c)->s_dirt = 1;
147} 146}
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 696686cc206e..46f870d1cc36 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -260,7 +260,9 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
260 ret = -EIO; 260 ret = -EIO;
261 goto out; 261 goto out;
262 } 262 }
263 jffs2_erase_pending_trigger(c); 263 spin_lock(&c->erase_completion_lock);
264 jffs2_garbage_collect_trigger(c);
265 spin_unlock(&c->erase_completion_lock);
264 } 266 }
265 ret = 0; 267 ret = 0;
266 out: 268 out:
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index eaccee058583..239f51216a68 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -77,7 +77,7 @@ static size_t jffs2_security_listxattr(struct dentry *dentry, char *list,
77 return retlen; 77 return retlen;
78} 78}
79 79
80struct xattr_handler jffs2_security_xattr_handler = { 80const struct xattr_handler jffs2_security_xattr_handler = {
81 .prefix = XATTR_SECURITY_PREFIX, 81 .prefix = XATTR_SECURITY_PREFIX,
82 .list = jffs2_security_listxattr, 82 .list = jffs2_security_listxattr,
83 .set = jffs2_security_setxattr, 83 .set = jffs2_security_setxattr,
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 9a80e8e595d0..511e2d609d12 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -63,8 +63,6 @@ static void jffs2_write_super(struct super_block *sb)
63 63
64 if (!(sb->s_flags & MS_RDONLY)) { 64 if (!(sb->s_flags & MS_RDONLY)) {
65 D1(printk(KERN_DEBUG "jffs2_write_super()\n")); 65 D1(printk(KERN_DEBUG "jffs2_write_super()\n"));
66 jffs2_garbage_collect_trigger(c);
67 jffs2_erase_pending_blocks(c, 0);
68 jffs2_flush_wbuf_gc(c, 0); 66 jffs2_flush_wbuf_gc(c, 0);
69 } 67 }
70 68
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 5ef7bac265e5..07ee1546b2fa 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -84,7 +84,7 @@ static void jffs2_wbuf_dirties_inode(struct jffs2_sb_info *c, uint32_t ino)
84 struct jffs2_inodirty *new; 84 struct jffs2_inodirty *new;
85 85
86 /* Mark the superblock dirty so that kupdated will flush... */ 86 /* Mark the superblock dirty so that kupdated will flush... */
87 jffs2_erase_pending_trigger(c); 87 jffs2_dirty_trigger(c);
88 88
89 if (jffs2_wbuf_pending_for_ino(c, ino)) 89 if (jffs2_wbuf_pending_for_ino(c, ino))
90 return; 90 return;
@@ -121,7 +121,7 @@ static inline void jffs2_refile_wbuf_blocks(struct jffs2_sb_info *c)
121 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n")); 121 D1(printk(KERN_DEBUG "...and adding to erase_pending_list\n"));
122 list_add_tail(&jeb->list, &c->erase_pending_list); 122 list_add_tail(&jeb->list, &c->erase_pending_list);
123 c->nr_erasing_blocks++; 123 c->nr_erasing_blocks++;
124 jffs2_erase_pending_trigger(c); 124 jffs2_garbage_collect_trigger(c);
125 } else { 125 } else {
126 /* Sometimes, however, we leave it elsewhere so it doesn't get 126 /* Sometimes, however, we leave it elsewhere so it doesn't get
127 immediately reused, and we spread the load a bit. */ 127 immediately reused, and we spread the load a bit. */
@@ -152,7 +152,7 @@ static void jffs2_block_refile(struct jffs2_sb_info *c, struct jffs2_eraseblock
152 D1(printk("Refiling block at %08x to erase_pending_list\n", jeb->offset)); 152 D1(printk("Refiling block at %08x to erase_pending_list\n", jeb->offset));
153 list_add(&jeb->list, &c->erase_pending_list); 153 list_add(&jeb->list, &c->erase_pending_list);
154 c->nr_erasing_blocks++; 154 c->nr_erasing_blocks++;
155 jffs2_erase_pending_trigger(c); 155 jffs2_garbage_collect_trigger(c);
156 } 156 }
157 157
158 if (!jffs2_prealloc_raw_node_refs(c, jeb, 1)) { 158 if (!jffs2_prealloc_raw_node_refs(c, jeb, 1)) {
@@ -543,7 +543,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
543 D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset)); 543 D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset));
544 list_move(&jeb->list, &c->erase_pending_list); 544 list_move(&jeb->list, &c->erase_pending_list);
545 c->nr_erasing_blocks++; 545 c->nr_erasing_blocks++;
546 jffs2_erase_pending_trigger(c); 546 jffs2_garbage_collect_trigger(c);
547 } 547 }
548 548
549 jffs2_dbg_acct_sanity_check_nolock(c, jeb); 549 jffs2_dbg_acct_sanity_check_nolock(c, jeb);
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 9e75c62c85d6..a2d58c96f1b4 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -904,7 +904,7 @@ struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c,
904 * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags) 904 * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags)
905 * is an implementation of setxattr handler on jffs2. 905 * is an implementation of setxattr handler on jffs2.
906 * -------------------------------------------------- */ 906 * -------------------------------------------------- */
907struct xattr_handler *jffs2_xattr_handlers[] = { 907const struct xattr_handler *jffs2_xattr_handlers[] = {
908 &jffs2_user_xattr_handler, 908 &jffs2_user_xattr_handler,
909#ifdef CONFIG_JFFS2_FS_SECURITY 909#ifdef CONFIG_JFFS2_FS_SECURITY
910 &jffs2_security_xattr_handler, 910 &jffs2_security_xattr_handler,
@@ -917,8 +917,8 @@ struct xattr_handler *jffs2_xattr_handlers[] = {
917 NULL 917 NULL
918}; 918};
919 919
920static struct xattr_handler *xprefix_to_handler(int xprefix) { 920static const struct xattr_handler *xprefix_to_handler(int xprefix) {
921 struct xattr_handler *ret; 921 const struct xattr_handler *ret;
922 922
923 switch (xprefix) { 923 switch (xprefix) {
924 case JFFS2_XPREFIX_USER: 924 case JFFS2_XPREFIX_USER:
@@ -955,7 +955,7 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
955 struct jffs2_inode_cache *ic = f->inocache; 955 struct jffs2_inode_cache *ic = f->inocache;
956 struct jffs2_xattr_ref *ref, **pref; 956 struct jffs2_xattr_ref *ref, **pref;
957 struct jffs2_xattr_datum *xd; 957 struct jffs2_xattr_datum *xd;
958 struct xattr_handler *xhandle; 958 const struct xattr_handler *xhandle;
959 ssize_t len, rc; 959 ssize_t len, rc;
960 int retry = 0; 960 int retry = 0;
961 961
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index 6e3b5ddfb7ab..cf4f5759b42b 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -93,9 +93,9 @@ extern int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname
93extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname, 93extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
94 const char *buffer, size_t size, int flags); 94 const char *buffer, size_t size, int flags);
95 95
96extern struct xattr_handler *jffs2_xattr_handlers[]; 96extern const struct xattr_handler *jffs2_xattr_handlers[];
97extern struct xattr_handler jffs2_user_xattr_handler; 97extern const struct xattr_handler jffs2_user_xattr_handler;
98extern struct xattr_handler jffs2_trusted_xattr_handler; 98extern const struct xattr_handler jffs2_trusted_xattr_handler;
99 99
100extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t); 100extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
101#define jffs2_getxattr generic_getxattr 101#define jffs2_getxattr generic_getxattr
@@ -122,7 +122,7 @@ extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
122 122
123#ifdef CONFIG_JFFS2_FS_SECURITY 123#ifdef CONFIG_JFFS2_FS_SECURITY
124extern int jffs2_init_security(struct inode *inode, struct inode *dir); 124extern int jffs2_init_security(struct inode *inode, struct inode *dir);
125extern struct xattr_handler jffs2_security_xattr_handler; 125extern const struct xattr_handler jffs2_security_xattr_handler;
126#else 126#else
127#define jffs2_init_security(inode,dir) (0) 127#define jffs2_init_security(inode,dir) (0)
128#endif /* CONFIG_JFFS2_FS_SECURITY */ 128#endif /* CONFIG_JFFS2_FS_SECURITY */
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index 3e5a5e356e05..1c868194c504 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -47,7 +47,7 @@ static size_t jffs2_trusted_listxattr(struct dentry *dentry, char *list,
 	return retlen;
 }
 
-struct xattr_handler jffs2_trusted_xattr_handler = {
+const struct xattr_handler jffs2_trusted_xattr_handler = {
 	.prefix = XATTR_TRUSTED_PREFIX,
 	.list = jffs2_trusted_listxattr,
 	.set = jffs2_trusted_setxattr,
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
index 8544af67dffe..916b5c966039 100644
--- a/fs/jffs2/xattr_user.c
+++ b/fs/jffs2/xattr_user.c
@@ -47,7 +47,7 @@ static size_t jffs2_user_listxattr(struct dentry *dentry, char *list,
 	return retlen;
 }
 
-struct xattr_handler jffs2_user_xattr_handler = {
+const struct xattr_handler jffs2_user_xattr_handler = {
 	.prefix = XATTR_USER_PREFIX,
 	.list = jffs2_user_listxattr,
 	.set = jffs2_user_setxattr,
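
The jffs2 hunks above are one mechanical change: every xattr_handler table and handler instance gains a const qualifier so these function-pointer tables can live in read-only data. A minimal sketch of the resulting shape — the myfs_* names are hypothetical, only the const placement is the point:

/* Sketch: const-qualified xattr handler, as in the hunks above. */
static const struct xattr_handler myfs_user_xattr_handler = {
	.prefix	= XATTR_USER_PREFIX,
	.list	= myfs_user_listxattr,
	.get	= myfs_user_getxattr,
	.set	= myfs_user_setxattr,
};

/* NULL-terminated table of pointers to const handlers; the VFS only
 * ever reads this, so nothing here needs to be writable. */
static const struct xattr_handler *myfs_xattr_handlers[] = {
	&myfs_user_xattr_handler,
	NULL
};
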
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 14ba982b3f24..85d9ec659225 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -98,7 +98,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
 	if (rc)
 		return rc;
 
-	if (iattr->ia_valid & ATTR_SIZE)
+	if (is_quota_modification(inode, iattr))
 		dquot_initialize(inode);
 	if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
 	    (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
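
The one-line change above widens when jfs pre-initializes quota: the old test only caught truncation (ATTR_SIZE), while uid/gid changes also move usage between dquots. is_quota_modification() bundles all three cases; paraphrased from the quotaops header of this era, its test is roughly:

/* Paraphrase of is_quota_modification(): true when the setattr would
 * change what quota is charged — a real size change or ownership change. */
static inline int is_quota_modification(struct inode *inode, struct iattr *ia)
{
	return (ia->ia_valid & ATTR_SIZE && ia->ia_size != inode->i_size) ||
	       (ia->ia_valid & ATTR_UID && ia->ia_uid != inode->i_uid) ||
	       (ia->ia_valid & ATTR_GID && ia->ia_gid != inode->i_gid);
}
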
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 829921b67765..2686531e235a 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -98,14 +98,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 		goto fail_unlock;
 	}
 
-	inode->i_uid = current_fsuid();
-	if (parent->i_mode & S_ISGID) {
-		inode->i_gid = parent->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
-	} else
-		inode->i_gid = current_fsgid();
-
+	inode_init_owner(inode, parent, mode);
 	/*
 	 * New inodes need to save sane values on disk when
 	 * uid & gid mount options are used
@@ -121,7 +114,6 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 	if (rc)
 		goto fail_drop;
 
-	inode->i_mode = mode;
 	/* inherit flags from parent */
 	jfs_inode->mode2 = JFS_IP(parent)->mode2 & JFS_FL_INHERIT;
 
@@ -134,7 +126,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 		if (S_ISLNK(mode))
 			jfs_inode->mode2 &= ~(JFS_IMMUTABLE_FL|JFS_APPEND_FL);
 	}
-	jfs_inode->mode2 |= mode;
+	jfs_inode->mode2 |= inode->i_mode;
 
 	inode->i_blocks = 0;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
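
This hunk, and the logfs, minix, and nilfs2 hunks below, replace the same open-coded owner setup with the new inode_init_owner() helper. Its effect is essentially the block each caller deletes — a sketch, not the verbatim helper:

/* Rough equivalent of inode_init_owner(): owner comes from the caller's
 * credentials; the parent directory's setgid bit propagates its group,
 * and new subdirectories inherit setgid as well. */
void inode_init_owner(struct inode *inode, const struct inode *dir, int mode)
{
	inode->i_uid = current_fsuid();
	if (dir && dir->i_mode & S_ISGID) {
		inode->i_gid = dir->i_gid;
		if (S_ISDIR(mode))
			mode |= S_ISGID;
	} else
		inode->i_gid = current_fsgid();
	inode->i_mode = mode;
}

Centralizing this also removes per-filesystem drift: note how jfs, logfs, and minix each open-coded the rule slightly differently before the change.
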
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 755a92e8daa7..f602e230e162 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -358,14 +358,7 @@ struct inode *logfs_new_inode(struct inode *dir, int mode)
 	inode->i_mode = mode;
 	logfs_set_ino_generation(sb, inode);
 
-	inode->i_uid = current_fsuid();
-	inode->i_gid = current_fsgid();
-	if (dir->i_mode & S_ISGID) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			inode->i_mode |= S_ISGID;
-	}
-
+	inode_init_owner(inode, dir, mode);
 	logfs_inode_setops(inode);
 	insert_inode_hash(inode);
 
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 6ac693faae49..482779fe4e7c 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -221,7 +221,7 @@ void minix_free_inode(struct inode * inode)
 	clear_inode(inode);	/* clear in-memory copy */
 }
 
-struct inode * minix_new_inode(const struct inode * dir, int * error)
+struct inode *minix_new_inode(const struct inode *dir, int mode, int *error)
 {
 	struct super_block *sb = dir->i_sb;
 	struct minix_sb_info *sbi = minix_sb(sb);
@@ -263,8 +263,7 @@ struct inode * minix_new_inode(const struct inode * dir, int * error)
 		iput(inode);
 		return NULL;
 	}
-	inode->i_uid = current_fsuid();
-	inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current_fsgid();
+	inode_init_owner(inode, dir, mode);
 	inode->i_ino = j;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
 	inode->i_blocks = 0;
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 9dcf95b42116..111f34ee9e3b 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -46,7 +46,7 @@ struct minix_sb_info {
 extern struct inode *minix_iget(struct super_block *, unsigned long);
 extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **);
 extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **);
-extern struct inode * minix_new_inode(const struct inode * dir, int * error);
+extern struct inode * minix_new_inode(const struct inode *, int, int *);
 extern void minix_free_inode(struct inode * inode);
 extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi);
 extern int minix_new_block(struct inode * inode);
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 32b131cd6121..e20ee85955d1 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -46,10 +46,9 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, int mode, dev_
 	if (!old_valid_dev(rdev))
 		return -EINVAL;
 
-	inode = minix_new_inode(dir, &error);
+	inode = minix_new_inode(dir, mode, &error);
 
 	if (inode) {
-		inode->i_mode = mode;
 		minix_set_inode(inode, rdev);
 		mark_inode_dirty(inode);
 		error = add_nondir(dentry, inode);
@@ -73,11 +72,10 @@ static int minix_symlink(struct inode * dir, struct dentry *dentry,
 	if (i > dir->i_sb->s_blocksize)
 		goto out;
 
-	inode = minix_new_inode(dir, &err);
+	inode = minix_new_inode(dir, S_IFLNK | 0777, &err);
 	if (!inode)
 		goto out;
 
-	inode->i_mode = S_IFLNK | 0777;
 	minix_set_inode(inode, 0);
 	err = page_symlink(inode, symname, i);
 	if (err)
@@ -117,13 +115,10 @@ static int minix_mkdir(struct inode * dir, struct dentry *dentry, int mode)
 
 	inode_inc_link_count(dir);
 
-	inode = minix_new_inode(dir, &err);
+	inode = minix_new_inode(dir, mode, &err);
 	if (!inode)
 		goto out_dir;
 
-	inode->i_mode = S_IFDIR | mode;
-	if (dir->i_mode & S_ISGID)
-		inode->i_mode |= S_ISGID;
 	minix_set_inode(inode, 0);
 
 	inode_inc_link_count(inode);
diff --git a/fs/namei.c b/fs/namei.c
index b86b96fe1dc3..48e1f60520ea 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -523,9 +523,10 @@ static void path_put_conditional(struct path *path, struct nameidata *nd)
 static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
 {
 	dput(nd->path.dentry);
-	if (nd->path.mnt != path->mnt)
+	if (nd->path.mnt != path->mnt) {
 		mntput(nd->path.mnt);
-	nd->path.mnt = path->mnt;
+		nd->path.mnt = path->mnt;
+	}
 	nd->path.dentry = path->dentry;
 }
 
531 532
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 7edfcd4d5e52..92dde6f8d893 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -51,7 +51,7 @@ const struct file_operations ncp_dir_operations =
 {
 	.read		= generic_read_dir,
 	.readdir	= ncp_readdir,
-	.ioctl		= ncp_ioctl,
+	.unlocked_ioctl	= ncp_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ncp_compat_ioctl,
 #endif
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 1daabb90e0a5..b93870892892 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -295,7 +295,7 @@ const struct file_operations ncp_file_operations =
 	.llseek		= ncp_remote_llseek,
 	.read		= ncp_file_read,
 	.write		= ncp_file_write,
-	.ioctl		= ncp_ioctl,
+	.unlocked_ioctl	= ncp_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ncp_compat_ioctl,
 #endif
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 60a5e2864ea8..023c03d02070 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -20,6 +20,7 @@
 #include <linux/smp_lock.h>
 #include <linux/vmalloc.h>
 #include <linux/sched.h>
+#include <linux/smp_lock.h>
 
 #include <linux/ncp_fs.h>
 
@@ -261,9 +262,9 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
 }
 #endif /* CONFIG_NCPFS_NLS */
 
-static int __ncp_ioctl(struct inode *inode, struct file *filp,
-		       unsigned int cmd, unsigned long arg)
+static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
+	struct inode *inode = filp->f_dentry->d_inode;
 	struct ncp_server *server = NCP_SERVER(inode);
 	int result;
 	struct ncp_ioctl_request request;
@@ -841,11 +842,11 @@ static int ncp_ioctl_need_write(unsigned int cmd)
 	}
 }
 
-int ncp_ioctl(struct inode *inode, struct file *filp,
-	      unsigned int cmd, unsigned long arg)
+long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
-	int ret;
+	long ret;
 
+	lock_kernel();
 	if (ncp_ioctl_need_write(cmd)) {
 		/*
 		 * inside the ioctl(), any failures which
@@ -853,24 +854,28 @@ int ncp_ioctl(struct inode *inode, struct file *filp,
 		 * -EACCESS, so it seems consistent to keep
 		 * that here.
 		 */
-		if (mnt_want_write(filp->f_path.mnt))
-			return -EACCES;
+		if (mnt_want_write(filp->f_path.mnt)) {
+			ret = -EACCES;
+			goto out;
+		}
 	}
-	ret = __ncp_ioctl(inode, filp, cmd, arg);
+	ret = __ncp_ioctl(filp, cmd, arg);
 	if (ncp_ioctl_need_write(cmd))
 		mnt_drop_write(filp->f_path.mnt);
+
+out:
+	unlock_kernel();
 	return ret;
 }
 
 #ifdef CONFIG_COMPAT
 long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
-	int ret;
+	long ret;
 
 	lock_kernel();
 	arg = (unsigned long) compat_ptr(arg);
-	ret = ncp_ioctl(inode, file, cmd, arg);
+	ret = ncp_ioctl(file, cmd, arg);
 	unlock_kernel();
 	return ret;
 }
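
The change above is the standard ->ioctl to ->unlocked_ioctl migration of this period: the new hook receives no inode and is no longer called under the BKL, so the function fetches the inode from the file itself and takes lock_kernel()/unlock_kernel() explicitly, preserving the old serialization while freeing the VFS from implicit BKL use. Reduced to a hypothetical skeleton:

/* Sketch of an ->ioctl to ->unlocked_ioctl conversion of this era.
 * The BKL is now taken explicitly; the inode comes from the file. */
static long myfs_unlocked_ioctl(struct file *filp, unsigned int cmd,
				unsigned long arg)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	long ret;

	lock_kernel();		/* preserve the old ->ioctl locking */
	ret = myfs_do_ioctl(inode, filp, cmd, arg);
	unlock_kernel();
	return ret;
}
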
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2f8b1157daa2..04214fc5c304 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1060,7 +1060,7 @@ static int nfs_parse_mount_options(char *raw,
 				goto out_nomem;
 			rc = strict_strtoul(string, 10, &option);
 			kfree(string);
-			if (rc != 0 || option > USHORT_MAX)
+			if (rc != 0 || option > USHRT_MAX)
 				goto out_invalid_value;
 			mnt->nfs_server.port = option;
 			break;
@@ -1181,7 +1181,7 @@ static int nfs_parse_mount_options(char *raw,
 				goto out_nomem;
 			rc = strict_strtoul(string, 10, &option);
 			kfree(string);
-			if (rc != 0 || option > USHORT_MAX)
+			if (rc != 0 || option > USHRT_MAX)
 				goto out_invalid_value;
 			mnt->mount_server.port = option;
 			break;
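
Only the macro name changes here and in the nfsd hunks below: USHORT_MAX was renamed USHRT_MAX tree-wide to match the libc SHRT_MAX/USHRT_MAX family. The validation itself is untouched; with USHRT_MAX equal to 65535 it behaves like this worked instance:

/* Worked example of the bound check: any parse failure or value above
 * 65535 cannot be a TCP/UDP port.  (nfsd additionally rejects 0.) */
static int check_port(int rc, unsigned long option)
{
	if (rc != 0 || option > USHRT_MAX)
		return -EINVAL;	/* e.g. option == 70000 is rejected */
	return 0;
}
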
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 7a9ae3254a4b..7e26caab2a26 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -44,8 +44,7 @@
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
 /* Globals */
-static struct path rec_dir;
-static int rec_dir_init = 0;
+static struct file *rec_file;
 
 static int
 nfs4_save_creds(const struct cred **original_creds)
@@ -117,33 +116,28 @@ out_no_tfm:
 	return status;
 }
 
-static void
-nfsd4_sync_rec_dir(void)
-{
-	vfs_fsync(NULL, rec_dir.dentry, 0);
-}
-
 int
 nfsd4_create_clid_dir(struct nfs4_client *clp)
 {
 	const struct cred *original_cred;
 	char *dname = clp->cl_recdir;
-	struct dentry *dentry;
+	struct dentry *dir, *dentry;
 	int status;
 
 	dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname);
 
-	if (!rec_dir_init || clp->cl_firststate)
+	if (!rec_file || clp->cl_firststate)
 		return 0;
 
 	status = nfs4_save_creds(&original_cred);
 	if (status < 0)
 		return status;
 
+	dir = rec_file->f_path.dentry;
 	/* lock the parent */
-	mutex_lock(&rec_dir.dentry->d_inode->i_mutex);
+	mutex_lock(&dir->d_inode->i_mutex);
 
-	dentry = lookup_one_len(dname, rec_dir.dentry, HEXDIR_LEN-1);
+	dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1);
 	if (IS_ERR(dentry)) {
 		status = PTR_ERR(dentry);
 		goto out_unlock;
@@ -153,18 +147,18 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 		dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n");
 		goto out_put;
 	}
-	status = mnt_want_write(rec_dir.mnt);
+	status = mnt_want_write(rec_file->f_path.mnt);
 	if (status)
 		goto out_put;
-	status = vfs_mkdir(rec_dir.dentry->d_inode, dentry, S_IRWXU);
-	mnt_drop_write(rec_dir.mnt);
+	status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU);
+	mnt_drop_write(rec_file->f_path.mnt);
 out_put:
 	dput(dentry);
 out_unlock:
-	mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
+	mutex_unlock(&dir->d_inode->i_mutex);
 	if (status == 0) {
 		clp->cl_firststate = 1;
-		nfsd4_sync_rec_dir();
+		vfs_fsync(rec_file, 0);
 	}
 	nfs4_reset_creds(original_cred);
 	dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status);
@@ -206,14 +200,14 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
 	struct dentry *dentry;
 	int status;
 
-	if (!rec_dir_init)
+	if (!rec_file)
 		return 0;
 
 	status = nfs4_save_creds(&original_cred);
 	if (status < 0)
 		return status;
 
-	filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY,
+	filp = dentry_open(dget(dir), mntget(rec_file->f_path.mnt), O_RDONLY,
 			   current_cred());
 	status = PTR_ERR(filp);
 	if (IS_ERR(filp))
@@ -250,13 +244,14 @@ out:
 static int
 nfsd4_unlink_clid_dir(char *name, int namlen)
 {
-	struct dentry *dentry;
+	struct dentry *dir, *dentry;
 	int status;
 
 	dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
 
-	mutex_lock_nested(&rec_dir.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
-	dentry = lookup_one_len(name, rec_dir.dentry, namlen);
+	dir = rec_file->f_path.dentry;
+	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
+	dentry = lookup_one_len(name, dir, namlen);
 	if (IS_ERR(dentry)) {
 		status = PTR_ERR(dentry);
 		goto out_unlock;
@@ -264,11 +259,11 @@ nfsd4_unlink_clid_dir(char *name, int namlen)
 	status = -ENOENT;
 	if (!dentry->d_inode)
 		goto out;
-	status = vfs_rmdir(rec_dir.dentry->d_inode, dentry);
+	status = vfs_rmdir(dir->d_inode, dentry);
 out:
 	dput(dentry);
 out_unlock:
-	mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
+	mutex_unlock(&dir->d_inode->i_mutex);
 	return status;
 }
 
@@ -278,10 +273,10 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
 	const struct cred *original_cred;
 	int status;
 
-	if (!rec_dir_init || !clp->cl_firststate)
+	if (!rec_file || !clp->cl_firststate)
 		return;
 
-	status = mnt_want_write(rec_dir.mnt);
+	status = mnt_want_write(rec_file->f_path.mnt);
 	if (status)
 		goto out;
 	clp->cl_firststate = 0;
@@ -293,8 +288,8 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
 	status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1);
 	nfs4_reset_creds(original_cred);
 	if (status == 0)
-		nfsd4_sync_rec_dir();
-	mnt_drop_write(rec_dir.mnt);
+		vfs_fsync(rec_file, 0);
+	mnt_drop_write(rec_file->f_path.mnt);
out:
 	if (status)
 		printk("NFSD: Failed to remove expired client state directory"
@@ -323,19 +318,19 @@ void
 nfsd4_recdir_purge_old(void) {
 	int status;
 
-	if (!rec_dir_init)
+	if (!rec_file)
 		return;
-	status = mnt_want_write(rec_dir.mnt);
+	status = mnt_want_write(rec_file->f_path.mnt);
 	if (status)
 		goto out;
-	status = nfsd4_list_rec_dir(rec_dir.dentry, purge_old);
+	status = nfsd4_list_rec_dir(rec_file->f_path.dentry, purge_old);
 	if (status == 0)
-		nfsd4_sync_rec_dir();
-	mnt_drop_write(rec_dir.mnt);
+		vfs_fsync(rec_file, 0);
+	mnt_drop_write(rec_file->f_path.mnt);
out:
 	if (status)
 		printk("nfsd4: failed to purge old clients from recovery"
-			" directory %s\n", rec_dir.dentry->d_name.name);
+			" directory %s\n", rec_file->f_path.dentry->d_name.name);
 }
 
 static int
@@ -355,10 +350,13 @@ int
 nfsd4_recdir_load(void) {
 	int status;
 
-	status = nfsd4_list_rec_dir(rec_dir.dentry, load_recdir);
+	if (!rec_file)
+		return 0;
+
+	status = nfsd4_list_rec_dir(rec_file->f_path.dentry, load_recdir);
 	if (status)
 		printk("nfsd4: failed loading clients from recovery"
-			" directory %s\n", rec_dir.dentry->d_name.name);
+			" directory %s\n", rec_file->f_path.dentry->d_name.name);
 	return status;
 }
 
@@ -375,7 +373,7 @@ nfsd4_init_recdir(char *rec_dirname)
 	printk("NFSD: Using %s as the NFSv4 state recovery directory\n",
 			rec_dirname);
 
-	BUG_ON(rec_dir_init);
+	BUG_ON(rec_file);
 
 	status = nfs4_save_creds(&original_cred);
 	if (status < 0) {
@@ -385,22 +383,21 @@ nfsd4_init_recdir(char *rec_dirname)
 		return;
 	}
 
-	status = kern_path(rec_dirname, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
-			&rec_dir);
-	if (status)
+	rec_file = filp_open(rec_dirname, O_RDONLY | O_DIRECTORY, 0);
+	if (IS_ERR(rec_file)) {
 		printk("NFSD: unable to find recovery directory %s\n",
 				rec_dirname);
+		rec_file = NULL;
+	}
 
-	if (!status)
-		rec_dir_init = 1;
 	nfs4_reset_creds(original_cred);
 }
 
 void
 nfsd4_shutdown_recdir(void)
 {
-	if (!rec_dir_init)
+	if (!rec_file)
 		return;
-	rec_dir_init = 0;
-	path_put(&rec_dir);
+	fput(rec_file);
+	rec_file = NULL;
 }
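
Replacing the (rec_dir, rec_dir_init) pair with one open struct file does three jobs at once: the pointer's NULL-ness becomes the initialization flag, f_path supplies the dentry and vfsmount everywhere rec_dir was consulted, and it provides the struct file that the reworked vfs_fsync() now requires (the deleted helper had to pass a NULL file plus a dentry). A reduced sketch of the pattern, with hypothetical names:

/* Sketch: one struct file serves as handle and "initialized" flag. */
static struct file *state_dir;	/* NULL until init succeeds */

static int state_dir_init(const char *name)
{
	state_dir = filp_open(name, O_RDONLY | O_DIRECTORY, 0);
	if (IS_ERR(state_dir)) {
		state_dir = NULL;	/* keep the "not ready" encoding */
		return -ENOENT;
	}
	return 0;
}

static void state_dir_sync(void)
{
	if (state_dir)
		vfs_fsync(state_dir, 0);	/* new two-argument form */
}
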
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index bc3194ea01f5..508941c23af7 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -998,7 +998,7 @@ static ssize_t __write_ports_addxprt(char *buf)
 	if (sscanf(buf, "%15s %4u", transport, &port) != 2)
 		return -EINVAL;
 
-	if (port < 1 || port > USHORT_MAX)
+	if (port < 1 || port > USHRT_MAX)
 		return -EINVAL;
 
 	err = nfsd_create_serv();
@@ -1040,7 +1040,7 @@ static ssize_t __write_ports_delxprt(char *buf)
 	if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2)
 		return -EINVAL;
 
-	if (port < 1 || port > USHORT_MAX || nfsd_serv == NULL)
+	if (port < 1 || port > USHRT_MAX || nfsd_serv == NULL)
 		return -EINVAL;
 
 	xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 23c06f77f4ca..ebbf3b6b2457 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -999,7 +999,7 @@ static int wait_for_concurrent_writes(struct file *file)
 
 	if (inode->i_state & I_DIRTY) {
 		dprintk("nfsd: write sync %d\n", task_pid_nr(current));
-		err = vfs_fsync(file, file->f_path.dentry, 0);
+		err = vfs_fsync(file, 0);
 	}
 	last_ino = inode->i_ino;
 	last_dev = inode->i_sb->s_dev;
@@ -1175,8 +1175,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	if (err)
 		goto out;
 	if (EX_ISSYNC(fhp->fh_export)) {
-		int err2 = vfs_fsync_range(file, file->f_path.dentry,
-				offset, end, 0);
+		int err2 = vfs_fsync_range(file, offset, end, 0);
 
 		if (err2 != -EINVAL)
 			err = nfserrno(err2);
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 7cfb87e692da..d7fd696e595c 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -31,6 +31,11 @@
 #include "alloc.h"
 
 
+/**
+ * nilfs_palloc_groups_per_desc_block - get the number of groups that a group
+ * descriptor block can maintain
+ * @inode: inode of metadata file using this allocator
+ */
 static inline unsigned long
 nilfs_palloc_groups_per_desc_block(const struct inode *inode)
 {
@@ -38,12 +43,21 @@ nilfs_palloc_groups_per_desc_block(const struct inode *inode)
 		sizeof(struct nilfs_palloc_group_desc);
 }
 
+/**
+ * nilfs_palloc_groups_count - get maximum number of groups
+ * @inode: inode of metadata file using this allocator
+ */
 static inline unsigned long
 nilfs_palloc_groups_count(const struct inode *inode)
 {
 	return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */));
 }
 
+/**
+ * nilfs_palloc_init_blockgroup - initialize private variables for allocator
+ * @inode: inode of metadata file using this allocator
+ * @entry_size: size of the persistent object
+ */
 int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
 {
 	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
@@ -69,6 +83,12 @@ int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
 	return 0;
 }
 
+/**
+ * nilfs_palloc_group - get group number and offset from an entry number
+ * @inode: inode of metadata file using this allocator
+ * @nr: serial number of the entry (e.g. inode number)
+ * @offset: pointer to store offset number in the group
+ */
 static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr,
 					unsigned long *offset)
 {
@@ -78,6 +98,14 @@ static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr,
 	return group;
 }
 
+/**
+ * nilfs_palloc_desc_blkoff - get block offset of a group descriptor block
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ *
+ * nilfs_palloc_desc_blkoff() returns block offset of the descriptor
+ * block which contains a descriptor of the specified group.
+ */
 static unsigned long
 nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group)
 {
@@ -86,6 +114,14 @@ nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group)
 	return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block;
 }
 
+/**
+ * nilfs_palloc_bitmap_blkoff - get block offset of a bitmap block
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ *
+ * nilfs_palloc_bitmap_blkoff() returns block offset of the bitmap
+ * block used to allocate/deallocate entries in the specified group.
+ */
 static unsigned long
 nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
 {
@@ -95,6 +131,12 @@ nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
 		desc_offset * NILFS_MDT(inode)->mi_blocks_per_group;
 }
 
+/**
+ * nilfs_palloc_group_desc_nfrees - get the number of free entries in a group
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @desc: pointer to descriptor structure for the group
+ */
 static unsigned long
 nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
 			       const struct nilfs_palloc_group_desc *desc)
@@ -107,6 +149,13 @@ nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
 	return nfree;
 }
 
+/**
+ * nilfs_palloc_group_desc_add_entries - adjust count of free entries
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @desc: pointer to descriptor structure for the group
+ * @n: delta to be added
+ */
 static void
 nilfs_palloc_group_desc_add_entries(struct inode *inode,
 				    unsigned long group,
@@ -118,6 +167,11 @@ nilfs_palloc_group_desc_add_entries(struct inode *inode,
 	spin_unlock(nilfs_mdt_bgl_lock(inode, group));
 }
 
+/**
+ * nilfs_palloc_entry_blkoff - get block offset of an entry block
+ * @inode: inode of metadata file using this allocator
+ * @nr: serial number of the entry (e.g. inode number)
+ */
 static unsigned long
 nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr)
 {
@@ -129,6 +183,12 @@ nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr)
 		group_offset / NILFS_MDT(inode)->mi_entries_per_block;
 }
 
+/**
+ * nilfs_palloc_desc_block_init - initialize buffer of a group descriptor block
+ * @inode: inode of metadata file
+ * @bh: buffer head of the buffer to be initialized
+ * @kaddr: kernel address mapped for the page including the buffer
+ */
 static void nilfs_palloc_desc_block_init(struct inode *inode,
 					 struct buffer_head *bh, void *kaddr)
 {
@@ -179,6 +239,13 @@ static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff,
 	return ret;
 }
 
+/**
+ * nilfs_palloc_get_desc_block - get buffer head of a group descriptor block
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @create: create flag
+ * @bhp: pointer to store the resultant buffer head
+ */
 static int nilfs_palloc_get_desc_block(struct inode *inode,
 				       unsigned long group,
 				       int create, struct buffer_head **bhp)
@@ -191,6 +258,13 @@ static int nilfs_palloc_get_desc_block(struct inode *inode,
 				   bhp, &cache->prev_desc, &cache->lock);
 }
 
+/**
+ * nilfs_palloc_get_bitmap_block - get buffer head of a bitmap block
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @create: create flag
+ * @bhp: pointer to store the resultant buffer head
+ */
 static int nilfs_palloc_get_bitmap_block(struct inode *inode,
 					 unsigned long group,
 					 int create, struct buffer_head **bhp)
@@ -203,6 +277,13 @@ static int nilfs_palloc_get_bitmap_block(struct inode *inode,
 				   &cache->prev_bitmap, &cache->lock);
 }
 
+/**
+ * nilfs_palloc_get_entry_block - get buffer head of an entry block
+ * @inode: inode of metadata file using this allocator
+ * @nr: serial number of the entry (e.g. inode number)
+ * @create: create flag
+ * @bhp: pointer to store the resultant buffer head
+ */
 int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
 				 int create, struct buffer_head **bhp)
 {
@@ -214,6 +295,13 @@ int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
 				   &cache->prev_entry, &cache->lock);
 }
 
+/**
+ * nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @bh: buffer head of the buffer storing the group descriptor block
+ * @kaddr: kernel address mapped for the page including the buffer
+ */
 static struct nilfs_palloc_group_desc *
 nilfs_palloc_block_get_group_desc(const struct inode *inode,
 				  unsigned long group,
@@ -223,6 +311,13 @@ nilfs_palloc_block_get_group_desc(const struct inode *inode,
 		group % nilfs_palloc_groups_per_desc_block(inode);
 }
 
+/**
+ * nilfs_palloc_block_get_entry - get kernel address of an entry
+ * @inode: inode of metadata file using this allocator
+ * @nr: serial number of the entry (e.g. inode number)
+ * @bh: buffer head of the buffer storing the entry block
+ * @kaddr: kernel address mapped for the page including the buffer
+ */
 void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
 				   const struct buffer_head *bh, void *kaddr)
 {
@@ -235,11 +330,19 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
 		entry_offset * NILFS_MDT(inode)->mi_entry_size;
 }
 
+/**
+ * nilfs_palloc_find_available_slot - find available slot in a group
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @target: offset number of an entry in the group (start point)
+ * @bitmap: bitmap of the group
+ * @bsize: size in bits
+ */
 static int nilfs_palloc_find_available_slot(struct inode *inode,
 					    unsigned long group,
 					    unsigned long target,
 					    unsigned char *bitmap,
-					    int bsize)  /* size in bits */
+					    int bsize)
 {
 	int curr, pos, end, i;
 
@@ -277,6 +380,13 @@ static int nilfs_palloc_find_available_slot(struct inode *inode,
 	return -ENOSPC;
 }
 
+/**
+ * nilfs_palloc_rest_groups_in_desc_block - get the remaining number of groups
+ * in a group descriptor block
+ * @inode: inode of metadata file using this allocator
+ * @curr: current group number
+ * @max: maximum number of groups
+ */
 static unsigned long
 nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
 				       unsigned long curr, unsigned long max)
@@ -287,6 +397,11 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
 		     max - curr + 1);
 }
 
+/**
+ * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object
+ * @inode: inode of metadata file using this allocator
+ * @req: nilfs_palloc_req structure exchanged for the allocation
+ */
 int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
 				     struct nilfs_palloc_req *req)
 {
@@ -366,6 +481,11 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
 	return ret;
 }
 
+/**
+ * nilfs_palloc_commit_alloc_entry - finish allocation of a persistent object
+ * @inode: inode of metadata file using this allocator
+ * @req: nilfs_palloc_req structure exchanged for the allocation
+ */
 void nilfs_palloc_commit_alloc_entry(struct inode *inode,
 				     struct nilfs_palloc_req *req)
 {
@@ -377,6 +497,11 @@ void nilfs_palloc_commit_alloc_entry(struct inode *inode,
 	brelse(req->pr_desc_bh);
 }
 
+/**
+ * nilfs_palloc_commit_free_entry - finish deallocating a persistent object
+ * @inode: inode of metadata file using this allocator
+ * @req: nilfs_palloc_req structure exchanged for the removal
+ */
 void nilfs_palloc_commit_free_entry(struct inode *inode,
 				    struct nilfs_palloc_req *req)
 {
@@ -410,6 +535,11 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
 	brelse(req->pr_desc_bh);
 }
 
+/**
+ * nilfs_palloc_abort_alloc_entry - cancel allocation of a persistent object
+ * @inode: inode of metadata file using this allocator
+ * @req: nilfs_palloc_req structure exchanged for the allocation
+ */
 void nilfs_palloc_abort_alloc_entry(struct inode *inode,
 				    struct nilfs_palloc_req *req)
 {
@@ -442,6 +572,11 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
 	req->pr_desc_bh = NULL;
 }
 
+/**
+ * nilfs_palloc_prepare_free_entry - prepare to deallocate a persistent object
+ * @inode: inode of metadata file using this allocator
+ * @req: nilfs_palloc_req structure exchanged for the removal
+ */
 int nilfs_palloc_prepare_free_entry(struct inode *inode,
 				    struct nilfs_palloc_req *req)
 {
@@ -464,6 +599,11 @@ int nilfs_palloc_prepare_free_entry(struct inode *inode,
 	return 0;
 }
 
+/**
+ * nilfs_palloc_abort_free_entry - cancel deallocating a persistent object
+ * @inode: inode of metadata file using this allocator
+ * @req: nilfs_palloc_req structure exchanged for the removal
+ */
 void nilfs_palloc_abort_free_entry(struct inode *inode,
 				   struct nilfs_palloc_req *req)
 {
@@ -475,6 +615,12 @@ void nilfs_palloc_abort_free_entry(struct inode *inode,
 	req->pr_desc_bh = NULL;
 }
 
+/**
+ * nilfs_palloc_group_is_in - judge if an entry is in a group
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ * @nr: serial number of the entry (e.g. inode number)
+ */
static int
 nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
 {
@@ -485,6 +631,12 @@ nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
 	return (nr >= first) && (nr <= last);
 }
 
+/**
+ * nilfs_palloc_freev - deallocate a set of persistent objects
+ * @inode: inode of metadata file using this allocator
+ * @entry_nrs: array of entry numbers to be deallocated
+ * @nitems: number of entries stored in @entry_nrs
+ */
 int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
 {
 	struct buffer_head *desc_bh, *bitmap_bh;
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 5cccf874d692..9af34a7e6e13 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -29,6 +29,13 @@
 #include <linux/buffer_head.h>
 #include <linux/fs.h>
 
+/**
+ * nilfs_palloc_entries_per_group - get the number of entries per group
+ * @inode: inode of metadata file using this allocator
+ *
+ * The number of entries per group is defined by the number of bits
+ * that a bitmap block can maintain.
+ */
 static inline unsigned long
 nilfs_palloc_entries_per_group(const struct inode *inode)
 {
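
The new kernel-doc pins down where group geometry comes from: one bit per entry in a bitmap block, so entries-per-group equals bits-per-block. A worked instance of the formula used here:

/* With 4 KiB blocks, i_blkbits == 12 and a bitmap block holds
 * (1 << 12) bytes * 8 bits/byte = 1 << (12 + 3) = 32768 bits,
 * hence 32768 entries per group; "+ 3" is log2(8), as in the code. */
static unsigned long entries_per_group(unsigned int blkbits)
{
	return 1UL << (blkbits + 3);
}
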
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 76c38e3e19d2..b27a342c5af6 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -31,63 +31,16 @@
 #include "alloc.h"
 #include "dat.h"
 
-/**
- * struct nilfs_btree_path - A path on which B-tree operations are executed
- * @bp_bh: buffer head of node block
- * @bp_sib_bh: buffer head of sibling node block
- * @bp_index: index of child node
- * @bp_oldreq: ptr end request for old ptr
- * @bp_newreq: ptr alloc request for new ptr
- * @bp_op: rebalance operation
- */
-struct nilfs_btree_path {
-	struct buffer_head *bp_bh;
-	struct buffer_head *bp_sib_bh;
-	int bp_index;
-	union nilfs_bmap_ptr_req bp_oldreq;
-	union nilfs_bmap_ptr_req bp_newreq;
-	struct nilfs_btnode_chkey_ctxt bp_ctxt;
-	void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *,
-		      int, __u64 *, __u64 *);
-};
-
-/*
- * B-tree path operations
- */
-
-static struct kmem_cache *nilfs_btree_path_cache;
-
-int __init nilfs_btree_path_cache_init(void)
-{
-	nilfs_btree_path_cache =
-		kmem_cache_create("nilfs2_btree_path_cache",
-				  sizeof(struct nilfs_btree_path) *
-				  NILFS_BTREE_LEVEL_MAX, 0, 0, NULL);
-	return (nilfs_btree_path_cache != NULL) ? 0 : -ENOMEM;
-}
-
-void nilfs_btree_path_cache_destroy(void)
-{
-	kmem_cache_destroy(nilfs_btree_path_cache);
-}
-
-static inline struct nilfs_btree_path *nilfs_btree_alloc_path(void)
-{
-	return kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
-}
-
-static inline void nilfs_btree_free_path(struct nilfs_btree_path *path)
-{
-	kmem_cache_free(nilfs_btree_path_cache, path);
-}
-
-static void nilfs_btree_init_path(struct nilfs_btree_path *path)
+static struct nilfs_btree_path *nilfs_btree_alloc_path(void)
 {
-	int level;
+	struct nilfs_btree_path *path;
+	int level = NILFS_BTREE_LEVEL_DATA;
 
-	for (level = NILFS_BTREE_LEVEL_DATA;
-	     level < NILFS_BTREE_LEVEL_MAX;
-	     level++) {
+	path = kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
+	if (path == NULL)
+		goto out;
+
+	for (; level < NILFS_BTREE_LEVEL_MAX; level++) {
 		path[level].bp_bh = NULL;
 		path[level].bp_sib_bh = NULL;
 		path[level].bp_index = 0;
@@ -95,15 +48,19 @@ static void nilfs_btree_init_path(struct nilfs_btree_path *path)
 		path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
 		path[level].bp_op = NULL;
 	}
+
+out:
+	return path;
 }
 
-static void nilfs_btree_release_path(struct nilfs_btree_path *path)
+static void nilfs_btree_free_path(struct nilfs_btree_path *path)
 {
-	int level;
+	int level = NILFS_BTREE_LEVEL_DATA;
 
-	for (level = NILFS_BTREE_LEVEL_DATA; level < NILFS_BTREE_LEVEL_MAX;
-	     level++)
+	for (; level < NILFS_BTREE_LEVEL_MAX; level++)
 		brelse(path[level].bp_bh);
+
+	kmem_cache_free(nilfs_btree_path_cache, path);
 }
 
 /*
@@ -566,14 +523,12 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(path);
 
 	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
 
 	if (ptrp != NULL)
 		*ptrp = ptr;
 
-	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 
 	return ret;
@@ -594,7 +549,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(path);
+
 	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
 	if (ret < 0)
 		goto out;
@@ -655,7 +610,6 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
 	*ptrp = ptr;
 	ret = cnt;
  out:
-	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 	return ret;
 }
@@ -1123,7 +1077,6 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(path);
 
 	ret = nilfs_btree_do_lookup(btree, path, key, NULL,
 				    NILFS_BTREE_LEVEL_NODE_MIN);
@@ -1140,7 +1093,6 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 	nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
 
  out:
-	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 	return ret;
 }
@@ -1456,7 +1408,7 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(path);
+
 	ret = nilfs_btree_do_lookup(btree, path, key, NULL,
 				    NILFS_BTREE_LEVEL_NODE_MIN);
 	if (ret < 0)
@@ -1473,7 +1425,6 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
 	nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
 
 out:
-	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 	return ret;
 }
@@ -1488,11 +1439,9 @@ static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(path);
 
 	ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL);
 
-	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 
 	return ret;
@@ -1923,7 +1872,6 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(path);
 
 	if (buffer_nilfs_node(bh)) {
 		node = (struct nilfs_btree_node *)bh->b_data;
@@ -1947,7 +1895,6 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
 	nilfs_btree_propagate_p(btree, path, level, bh);
 
  out:
-	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 
 	return ret;
@@ -2108,7 +2055,6 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(path);
 
 	if (buffer_nilfs_node(*bh)) {
 		node = (struct nilfs_btree_node *)(*bh)->b_data;
@@ -2130,7 +2076,6 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
 	nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo);
 
  out:
-	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 
 	return ret;
@@ -2175,7 +2120,6 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
 	path = nilfs_btree_alloc_path();
 	if (path == NULL)
 		return -ENOMEM;
-	nilfs_btree_init_path(path);
 
 	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1);
 	if (ret < 0) {
@@ -2195,7 +2139,6 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
 	nilfs_bmap_set_dirty(&btree->bt_bmap);
 
  out:
-	nilfs_btree_release_path(path);
 	nilfs_btree_free_path(path);
 	return ret;
 }
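
Folding init into alloc and release into free halves the call-site protocol: there is no window where an allocated path is unusable, and a forgotten release can no longer leak buffer heads, since free now brelse()s everything before returning the array to the slab cache (the cache itself becomes extern in the btree.h hunk below). Every converted call site reduces to the same shape:

	path = nilfs_btree_alloc_path();	/* allocates and initializes */
	if (path == NULL)
		return -ENOMEM;

	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);

	nilfs_btree_free_path(path);	/* brelse()s held bhs, frees path */
	return ret;
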
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index 4b82d84ade75..af638d59e3bf 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -30,9 +30,6 @@
 #include "btnode.h"
 #include "bmap.h"
 
-struct nilfs_btree;
-struct nilfs_btree_path;
-
 /**
  * struct nilfs_btree - B-tree structure
  * @bt_bmap: bmap base structure
@@ -41,6 +38,25 @@ struct nilfs_btree {
 	struct nilfs_bmap bt_bmap;
 };
 
+/**
+ * struct nilfs_btree_path - A path on which B-tree operations are executed
+ * @bp_bh: buffer head of node block
+ * @bp_sib_bh: buffer head of sibling node block
+ * @bp_index: index of child node
+ * @bp_oldreq: ptr end request for old ptr
+ * @bp_newreq: ptr alloc request for new ptr
+ * @bp_op: rebalance operation
+ */
+struct nilfs_btree_path {
+	struct buffer_head *bp_bh;
+	struct buffer_head *bp_sib_bh;
+	int bp_index;
+	union nilfs_bmap_ptr_req bp_oldreq;
+	union nilfs_bmap_ptr_req bp_newreq;
+	struct nilfs_btnode_chkey_ctxt bp_ctxt;
+	void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *,
+		      int, __u64 *, __u64 *);
+};
 
 #define NILFS_BTREE_ROOT_SIZE		NILFS_BMAP_SIZE
 #define NILFS_BTREE_ROOT_NCHILDREN_MAX \
@@ -57,6 +73,7 @@ struct nilfs_btree {
 #define NILFS_BTREE_KEY_MIN	((__u64)0)
 #define NILFS_BTREE_KEY_MAX	(~(__u64)0)
 
+extern struct kmem_cache *nilfs_btree_path_cache;
 
 int nilfs_btree_path_cache_init(void);
 void nilfs_btree_path_cache_destroy(void);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 0957b58f909d..39e038ac8fcb 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -280,16 +280,7 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
 	/* reference count of i_bh inherits from nilfs_mdt_read_block() */
 
 	atomic_inc(&sbi->s_inodes_count);
-
-	inode->i_uid = current_fsuid();
-	if (dir->i_mode & S_ISGID) {
-		inode->i_gid = dir->i_gid;
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
-	} else
-		inode->i_gid = current_fsgid();
-
-	inode->i_mode = mode;
+	inode_init_owner(inode, dir, mode);
 	inode->i_ino = ino;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 
@@ -451,7 +442,7 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
 		inode->i_op = &nilfs_special_inode_operations;
 		init_special_inode(
 			inode, inode->i_mode,
-			new_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
+			huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
 	}
 	nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
 	brelse(bh);
@@ -511,7 +502,7 @@ void nilfs_write_inode_common(struct inode *inode,
 		nilfs_bmap_write(ii->i_bmap, raw_inode);
 	else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
 		raw_inode->i_device_code =
-			cpu_to_le64(new_encode_dev(inode->i_rdev));
+			cpu_to_le64(huge_encode_dev(inode->i_rdev));
 	/* When extending inode, nilfs->ns_inode_size should be checked
 	   for substitutions of appended fields */
 }
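
Switching from new_encode_dev() to huge_encode_dev() is a widening, not a reformat: both pack major/minor the same way, but the huge variant yields a u64, which matches the 64-bit on-disk i_device_code field and leaves headroom beyond the 32-bit new-style encoding. For instance:

/* Illustration; <linux/kdev_t.h> of this era defines huge_encode_dev()
 * as the new-style 12-bit-major/20-bit-minor packing zero-extended to
 * 64 bits, so existing device numbers round-trip losslessly. */
static void dev_code_example(void)
{
	u64 code = huge_encode_dev(MKDEV(8, 1));	/* e.g. block dev 8:1 */
	dev_t dev = huge_decode_dev(code);		/* dev == MKDEV(8, 1) */
	(void)dev;
}
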
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index ba43146f3c30..bae2a516b4ee 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -105,6 +105,8 @@ static void store_segsum_info(struct nilfs_segsum_info *ssi,
 
 	ssi->nsumblk = DIV_ROUND_UP(ssi->sumbytes, blocksize);
 	ssi->nfileblk = ssi->nblocks - ssi->nsumblk - !!NILFS_SEG_HAS_SR(ssi);
+
+	/* need to verify ->ss_bytes field if read ->ss_cno */
 }
 
 /**
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 17851f77f739..2e6a2723b8fa 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -40,35 +40,10 @@ struct nilfs_write_info {
 	sector_t		blocknr;
 };
 
-
 static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
 			      struct the_nilfs *nilfs);
 static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf);
 
-
-static struct kmem_cache *nilfs_segbuf_cachep;
-
-static void nilfs_segbuf_init_once(void *obj)
-{
-	memset(obj, 0, sizeof(struct nilfs_segment_buffer));
-}
-
-int __init nilfs_init_segbuf_cache(void)
-{
-	nilfs_segbuf_cachep =
-		kmem_cache_create("nilfs2_segbuf_cache",
-				  sizeof(struct nilfs_segment_buffer),
-				  0, SLAB_RECLAIM_ACCOUNT,
-				  nilfs_segbuf_init_once);
-
-	return (nilfs_segbuf_cachep == NULL) ? -ENOMEM : 0;
-}
-
-void nilfs_destroy_segbuf_cache(void)
-{
-	kmem_cache_destroy(nilfs_segbuf_cachep);
-}
-
 struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
 {
 	struct nilfs_segment_buffer *segbuf;
@@ -81,6 +56,7 @@ struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
 	INIT_LIST_HEAD(&segbuf->sb_list);
 	INIT_LIST_HEAD(&segbuf->sb_segsum_buffers);
 	INIT_LIST_HEAD(&segbuf->sb_payload_buffers);
+	segbuf->sb_super_root = NULL;
 
 	init_completion(&segbuf->sb_bio_event);
 	atomic_set(&segbuf->sb_err, 0);
@@ -158,7 +134,7 @@ int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf,
 }
 
 int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
-		       time_t ctime)
+		       time_t ctime, __u64 cno)
 {
 	int err;
 
@@ -171,6 +147,7 @@ int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
 	segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary);
 	segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0;
 	segbuf->sb_sum.ctime = ctime;
+	segbuf->sb_sum.cno = cno;
 	return 0;
 }
 
@@ -196,13 +173,14 @@ void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf)
 	raw_sum->ss_nfinfo = cpu_to_le32(segbuf->sb_sum.nfinfo);
 	raw_sum->ss_sumbytes = cpu_to_le32(segbuf->sb_sum.sumbytes);
 	raw_sum->ss_pad = 0;
+	raw_sum->ss_cno = cpu_to_le64(segbuf->sb_sum.cno);
 }
 
 /*
  * CRC calculation routines
  */
-void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf,
-				     u32 seed)
+static void
+nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf, u32 seed)
 {
 	struct buffer_head *bh;
 	struct nilfs_segment_summary *raw_sum;
@@ -229,8 +207,8 @@ void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf,
 	raw_sum->ss_sumsum = cpu_to_le32(crc);
 }
 
-void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
+static void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
233 u32 seed) 211 u32 seed)
234{ 212{
235 struct buffer_head *bh; 213 struct buffer_head *bh;
236 struct nilfs_segment_summary *raw_sum; 214 struct nilfs_segment_summary *raw_sum;
@@ -256,6 +234,20 @@ void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
256 raw_sum->ss_datasum = cpu_to_le32(crc); 234 raw_sum->ss_datasum = cpu_to_le32(crc);
257} 235}
258 236
237static void
238nilfs_segbuf_fill_in_super_root_crc(struct nilfs_segment_buffer *segbuf,
239 u32 seed)
240{
241 struct nilfs_super_root *raw_sr;
242 u32 crc;
243
244 raw_sr = (struct nilfs_super_root *)segbuf->sb_super_root->b_data;
245 crc = crc32_le(seed,
246 (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum),
247 NILFS_SR_BYTES - sizeof(raw_sr->sr_sum));
248 raw_sr->sr_sum = cpu_to_le32(crc);
249}
250
259static void nilfs_release_buffers(struct list_head *list) 251static void nilfs_release_buffers(struct list_head *list)
260{ 252{
261 struct buffer_head *bh, *n; 253 struct buffer_head *bh, *n;
@@ -282,6 +274,7 @@ static void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf)
282{ 274{
283 nilfs_release_buffers(&segbuf->sb_segsum_buffers); 275 nilfs_release_buffers(&segbuf->sb_segsum_buffers);
284 nilfs_release_buffers(&segbuf->sb_payload_buffers); 276 nilfs_release_buffers(&segbuf->sb_payload_buffers);
277 segbuf->sb_super_root = NULL;
285} 278}
286 279
287/* 280/*
@@ -334,6 +327,23 @@ int nilfs_wait_on_logs(struct list_head *logs)
334 return ret; 327 return ret;
335} 328}
336 329
330/**
331 * nilfs_add_checksums_on_logs - add checksums on the logs
332 * @logs: list of segment buffers storing target logs
333 * @seed: checksum seed value
334 */
335void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed)
336{
337 struct nilfs_segment_buffer *segbuf;
338
339 list_for_each_entry(segbuf, logs, sb_list) {
340 if (segbuf->sb_super_root)
341 nilfs_segbuf_fill_in_super_root_crc(segbuf, seed);
342 nilfs_segbuf_fill_in_segsum_crc(segbuf, seed);
343 nilfs_segbuf_fill_in_data_crc(segbuf, seed);
344 }
345}
346
337/* 347/*
338 * BIO operations 348 * BIO operations
339 */ 349 */
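With the helper above, a writer checksums every queued log — segment summaries, payload data, and any super root block — in one pass. The call site added later in the segment.c hunks is simply:

	nilfs_add_checksums_on_logs(&sci->sc_segbufs, nilfs->ns_crc_seed);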
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index 94dfd3517bc0..fdf1c3b6d673 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -37,6 +37,7 @@
37 * @sumbytes: Byte count of segment summary 37 * @sumbytes: Byte count of segment summary
38 * @nfileblk: Total number of file blocks 38 * @nfileblk: Total number of file blocks
39 * @seg_seq: Segment sequence number 39 * @seg_seq: Segment sequence number
40 * @cno: Checkpoint number
40 * @ctime: Creation time 41 * @ctime: Creation time
41 * @next: Block number of the next full segment 42 * @next: Block number of the next full segment
42 */ 43 */
@@ -48,6 +49,7 @@ struct nilfs_segsum_info {
48 unsigned long sumbytes; 49 unsigned long sumbytes;
49 unsigned long nfileblk; 50 unsigned long nfileblk;
50 u64 seg_seq; 51 u64 seg_seq;
52 __u64 cno;
51 time_t ctime; 53 time_t ctime;
52 sector_t next; 54 sector_t next;
53}; 55};
@@ -76,6 +78,7 @@ struct nilfs_segsum_info {
76 * @sb_rest_blocks: Number of residual blocks in the current segment 78 * @sb_rest_blocks: Number of residual blocks in the current segment
77 * @sb_segsum_buffers: List of buffers for segment summaries 79 * @sb_segsum_buffers: List of buffers for segment summaries
78 * @sb_payload_buffers: List of buffers for segment payload 80 * @sb_payload_buffers: List of buffers for segment payload
81 * @sb_super_root: Pointer to buffer storing a super root block (if present)
79 * @sb_nbio: Number of flying bio requests 82 * @sb_nbio: Number of flying bio requests
80 * @sb_err: I/O error status 83 * @sb_err: I/O error status
81 * @sb_bio_event: Completion event of log writing 84 * @sb_bio_event: Completion event of log writing
@@ -95,6 +98,7 @@ struct nilfs_segment_buffer {
95 /* Buffers */ 98 /* Buffers */
96 struct list_head sb_segsum_buffers; 99 struct list_head sb_segsum_buffers;
97 struct list_head sb_payload_buffers; /* including super root */ 100 struct list_head sb_payload_buffers; /* including super root */
101 struct buffer_head *sb_super_root;
98 102
99 /* io status */ 103 /* io status */
100 int sb_nbio; 104 int sb_nbio;
@@ -121,6 +125,7 @@ struct nilfs_segment_buffer {
121 b_assoc_buffers)) 125 b_assoc_buffers))
122#define NILFS_SEGBUF_BH_IS_LAST(bh, head) ((bh)->b_assoc_buffers.next == head) 126#define NILFS_SEGBUF_BH_IS_LAST(bh, head) ((bh)->b_assoc_buffers.next == head)
123 127
128extern struct kmem_cache *nilfs_segbuf_cachep;
124 129
125int __init nilfs_init_segbuf_cache(void); 130int __init nilfs_init_segbuf_cache(void);
126void nilfs_destroy_segbuf_cache(void); 131void nilfs_destroy_segbuf_cache(void);
@@ -132,13 +137,11 @@ void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf,
132 struct nilfs_segment_buffer *prev); 137 struct nilfs_segment_buffer *prev);
133void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64, 138void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64,
134 struct the_nilfs *); 139 struct the_nilfs *);
135int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t); 140int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t, __u64);
136int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *); 141int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *);
137int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *, 142int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *,
138 struct buffer_head **); 143 struct buffer_head **);
139void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *); 144void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *);
140void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *, u32);
141void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *, u32);
142 145
143static inline void 146static inline void
144nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf, 147nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf,
@@ -171,6 +174,7 @@ void nilfs_truncate_logs(struct list_head *logs,
171 struct nilfs_segment_buffer *last); 174 struct nilfs_segment_buffer *last);
172int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs); 175int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs);
173int nilfs_wait_on_logs(struct list_head *logs); 176int nilfs_wait_on_logs(struct list_head *logs);
177void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed);
174 178
175static inline void nilfs_destroy_logs(struct list_head *logs) 179static inline void nilfs_destroy_logs(struct list_head *logs)
176{ 180{
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 6a7dbd8451db..c9201649cc49 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -116,42 +116,6 @@ static void nilfs_dispose_list(struct nilfs_sb_info *, struct list_head *,
116#define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a) 116#define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a)
117#define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a) 117#define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a)
118 118
119/*
120 * Transaction
121 */
122static struct kmem_cache *nilfs_transaction_cachep;
123
124/**
125 * nilfs_init_transaction_cache - create a cache for nilfs_transaction_info
126 *
127 * nilfs_init_transaction_cache() creates a slab cache for the struct
128 * nilfs_transaction_info.
129 *
130 * Return Value: On success, it returns 0. On error, one of the following
131 * negative error code is returned.
132 *
133 * %-ENOMEM - Insufficient memory available.
134 */
135int nilfs_init_transaction_cache(void)
136{
137 nilfs_transaction_cachep =
138 kmem_cache_create("nilfs2_transaction_cache",
139 sizeof(struct nilfs_transaction_info),
140 0, SLAB_RECLAIM_ACCOUNT, NULL);
141 return (nilfs_transaction_cachep == NULL) ? -ENOMEM : 0;
142}
143
144/**
145 * nilfs_destroy_transaction_cache - destroy the cache for transaction info
146 *
147 * nilfs_destroy_transaction_cache() frees the slab cache for the struct
148 * nilfs_transaction_info.
149 */
150void nilfs_destroy_transaction_cache(void)
151{
152 kmem_cache_destroy(nilfs_transaction_cachep);
153}
154
155static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti) 119static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
156{ 120{
157 struct nilfs_transaction_info *cur_ti = current->journal_info; 121 struct nilfs_transaction_info *cur_ti = current->journal_info;
@@ -402,7 +366,8 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
402 366
403 if (nilfs_doing_gc()) 367 if (nilfs_doing_gc())
404 flags = NILFS_SS_GC; 368 flags = NILFS_SS_GC;
405 err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime); 369 err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime,
370 sci->sc_sbi->s_nilfs->ns_cno);
406 if (unlikely(err)) 371 if (unlikely(err))
407 return err; 372 return err;
408 373
@@ -435,7 +400,7 @@ static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci)
435 return err; 400 return err;
436 segbuf = sci->sc_curseg; 401 segbuf = sci->sc_curseg;
437 } 402 }
438 err = nilfs_segbuf_extend_payload(segbuf, &sci->sc_super_root); 403 err = nilfs_segbuf_extend_payload(segbuf, &segbuf->sb_super_root);
439 if (likely(!err)) 404 if (likely(!err))
440 segbuf->sb_sum.flags |= NILFS_SS_SR; 405 segbuf->sb_sum.flags |= NILFS_SS_SR;
441 return err; 406 return err;
@@ -599,7 +564,7 @@ static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci,
599 *vblocknr = binfo->bi_v.bi_vblocknr; 564 *vblocknr = binfo->bi_v.bi_vblocknr;
600} 565}
601 566
602struct nilfs_sc_operations nilfs_sc_file_ops = { 567static struct nilfs_sc_operations nilfs_sc_file_ops = {
603 .collect_data = nilfs_collect_file_data, 568 .collect_data = nilfs_collect_file_data,
604 .collect_node = nilfs_collect_file_node, 569 .collect_node = nilfs_collect_file_node,
605 .collect_bmap = nilfs_collect_file_bmap, 570 .collect_bmap = nilfs_collect_file_bmap,
@@ -649,7 +614,7 @@ static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci,
649 *binfo_dat = binfo->bi_dat; 614 *binfo_dat = binfo->bi_dat;
650} 615}
651 616
652struct nilfs_sc_operations nilfs_sc_dat_ops = { 617static struct nilfs_sc_operations nilfs_sc_dat_ops = {
653 .collect_data = nilfs_collect_dat_data, 618 .collect_data = nilfs_collect_dat_data,
654 .collect_node = nilfs_collect_file_node, 619 .collect_node = nilfs_collect_file_node,
655 .collect_bmap = nilfs_collect_dat_bmap, 620 .collect_bmap = nilfs_collect_dat_bmap,
@@ -657,7 +622,7 @@ struct nilfs_sc_operations nilfs_sc_dat_ops = {
657 .write_node_binfo = nilfs_write_dat_node_binfo, 622 .write_node_binfo = nilfs_write_dat_node_binfo,
658}; 623};
659 624
660struct nilfs_sc_operations nilfs_sc_dsync_ops = { 625static struct nilfs_sc_operations nilfs_sc_dsync_ops = {
661 .collect_data = nilfs_collect_file_data, 626 .collect_data = nilfs_collect_file_data,
662 .collect_node = NULL, 627 .collect_node = NULL,
663 .collect_bmap = NULL, 628 .collect_bmap = NULL,
@@ -932,43 +897,16 @@ static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci,
932 } 897 }
933} 898}
934 899
935/*
936 * CRC calculation routines
937 */
938static void nilfs_fill_in_super_root_crc(struct buffer_head *bh_sr, u32 seed)
939{
940 struct nilfs_super_root *raw_sr =
941 (struct nilfs_super_root *)bh_sr->b_data;
942 u32 crc;
943
944 crc = crc32_le(seed,
945 (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum),
946 NILFS_SR_BYTES - sizeof(raw_sr->sr_sum));
947 raw_sr->sr_sum = cpu_to_le32(crc);
948}
949
950static void nilfs_segctor_fill_in_checksums(struct nilfs_sc_info *sci,
951 u32 seed)
952{
953 struct nilfs_segment_buffer *segbuf;
954
955 if (sci->sc_super_root)
956 nilfs_fill_in_super_root_crc(sci->sc_super_root, seed);
957
958 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
959 nilfs_segbuf_fill_in_segsum_crc(segbuf, seed);
960 nilfs_segbuf_fill_in_data_crc(segbuf, seed);
961 }
962}
963
964static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, 900static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
965 struct the_nilfs *nilfs) 901 struct the_nilfs *nilfs)
966{ 902{
967 struct buffer_head *bh_sr = sci->sc_super_root; 903 struct buffer_head *bh_sr;
968 struct nilfs_super_root *raw_sr = 904 struct nilfs_super_root *raw_sr;
969 (struct nilfs_super_root *)bh_sr->b_data;
970 unsigned isz = nilfs->ns_inode_size; 905 unsigned isz = nilfs->ns_inode_size;
971 906
907 bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root;
908 raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
909
972 raw_sr->sr_bytes = cpu_to_le16(NILFS_SR_BYTES); 910 raw_sr->sr_bytes = cpu_to_le16(NILFS_SR_BYTES);
973 raw_sr->sr_nongc_ctime 911 raw_sr->sr_nongc_ctime
974 = cpu_to_le64(nilfs_doing_gc() ? 912 = cpu_to_le64(nilfs_doing_gc() ?
@@ -1491,7 +1429,6 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
1491 1429
1492 /* Collection retry loop */ 1430 /* Collection retry loop */
1493 for (;;) { 1431 for (;;) {
1494 sci->sc_super_root = NULL;
1495 sci->sc_nblk_this_inc = 0; 1432 sci->sc_nblk_this_inc = 0;
1496 sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs); 1433 sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1497 1434
@@ -1568,7 +1505,7 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
1568 ssp.offset = sizeof(struct nilfs_segment_summary); 1505 ssp.offset = sizeof(struct nilfs_segment_summary);
1569 1506
1570 list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { 1507 list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
1571 if (bh == sci->sc_super_root) 1508 if (bh == segbuf->sb_super_root)
1572 break; 1509 break;
1573 if (!finfo) { 1510 if (!finfo) {
1574 finfo = nilfs_segctor_map_segsum_entry( 1511 finfo = nilfs_segctor_map_segsum_entry(
@@ -1729,7 +1666,7 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
1729 1666
1730 list_for_each_entry(bh, &segbuf->sb_payload_buffers, 1667 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
1731 b_assoc_buffers) { 1668 b_assoc_buffers) {
1732 if (bh == sci->sc_super_root) { 1669 if (bh == segbuf->sb_super_root) {
1733 if (bh->b_page != bd_page) { 1670 if (bh->b_page != bd_page) {
1734 lock_page(bd_page); 1671 lock_page(bd_page);
1735 clear_page_dirty_for_io(bd_page); 1672 clear_page_dirty_for_io(bd_page);
@@ -1848,7 +1785,7 @@ static void nilfs_clear_copied_buffers(struct list_head *list, int err)
1848} 1785}
1849 1786
1850static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page, 1787static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page,
1851 struct buffer_head *bh_sr, int err) 1788 int err)
1852{ 1789{
1853 struct nilfs_segment_buffer *segbuf; 1790 struct nilfs_segment_buffer *segbuf;
1854 struct page *bd_page = NULL, *fs_page = NULL; 1791 struct page *bd_page = NULL, *fs_page = NULL;
@@ -1869,7 +1806,7 @@ static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page,
1869 1806
1870 list_for_each_entry(bh, &segbuf->sb_payload_buffers, 1807 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
1871 b_assoc_buffers) { 1808 b_assoc_buffers) {
1872 if (bh == bh_sr) { 1809 if (bh == segbuf->sb_super_root) {
1873 if (bh->b_page != bd_page) { 1810 if (bh->b_page != bd_page) {
1874 end_page_writeback(bd_page); 1811 end_page_writeback(bd_page);
1875 bd_page = bh->b_page; 1812 bd_page = bh->b_page;
@@ -1898,7 +1835,7 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
1898 1835
1899 list_splice_tail_init(&sci->sc_write_logs, &logs); 1836 list_splice_tail_init(&sci->sc_write_logs, &logs);
1900 ret = nilfs_wait_on_logs(&logs); 1837 ret = nilfs_wait_on_logs(&logs);
1901 nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret ? : err); 1838 nilfs_abort_logs(&logs, NULL, ret ? : err);
1902 1839
1903 list_splice_tail_init(&sci->sc_segbufs, &logs); 1840 list_splice_tail_init(&sci->sc_segbufs, &logs);
1904 nilfs_cancel_segusage(&logs, nilfs->ns_sufile); 1841 nilfs_cancel_segusage(&logs, nilfs->ns_sufile);
@@ -1914,7 +1851,6 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
1914 } 1851 }
1915 1852
1916 nilfs_destroy_logs(&logs); 1853 nilfs_destroy_logs(&logs);
1917 sci->sc_super_root = NULL;
1918} 1854}
1919 1855
1920static void nilfs_set_next_segment(struct the_nilfs *nilfs, 1856static void nilfs_set_next_segment(struct the_nilfs *nilfs,
@@ -1933,7 +1869,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1933 struct nilfs_segment_buffer *segbuf; 1869 struct nilfs_segment_buffer *segbuf;
1934 struct page *bd_page = NULL, *fs_page = NULL; 1870 struct page *bd_page = NULL, *fs_page = NULL;
1935 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; 1871 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
1936 int update_sr = (sci->sc_super_root != NULL); 1872 int update_sr = false;
1937 1873
1938 list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) { 1874 list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) {
1939 struct buffer_head *bh; 1875 struct buffer_head *bh;
@@ -1964,11 +1900,12 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1964 set_buffer_uptodate(bh); 1900 set_buffer_uptodate(bh);
1965 clear_buffer_dirty(bh); 1901 clear_buffer_dirty(bh);
1966 clear_buffer_nilfs_volatile(bh); 1902 clear_buffer_nilfs_volatile(bh);
1967 if (bh == sci->sc_super_root) { 1903 if (bh == segbuf->sb_super_root) {
1968 if (bh->b_page != bd_page) { 1904 if (bh->b_page != bd_page) {
1969 end_page_writeback(bd_page); 1905 end_page_writeback(bd_page);
1970 bd_page = bh->b_page; 1906 bd_page = bh->b_page;
1971 } 1907 }
1908 update_sr = true;
1972 break; 1909 break;
1973 } 1910 }
1974 if (bh->b_page != fs_page) { 1911 if (bh->b_page != fs_page) {
@@ -2115,7 +2052,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2115 struct nilfs_sb_info *sbi = sci->sc_sbi; 2052 struct nilfs_sb_info *sbi = sci->sc_sbi;
2116 struct the_nilfs *nilfs = sbi->s_nilfs; 2053 struct the_nilfs *nilfs = sbi->s_nilfs;
2117 struct page *failed_page; 2054 struct page *failed_page;
2118 int err, has_sr = 0; 2055 int err;
2119 2056
2120 sci->sc_stage.scnt = NILFS_ST_INIT; 2057 sci->sc_stage.scnt = NILFS_ST_INIT;
2121 2058
@@ -2143,8 +2080,6 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2143 if (unlikely(err)) 2080 if (unlikely(err))
2144 goto failed; 2081 goto failed;
2145 2082
2146 has_sr = (sci->sc_super_root != NULL);
2147
2148 /* Avoid empty segment */ 2083 /* Avoid empty segment */
2149 if (sci->sc_stage.scnt == NILFS_ST_DONE && 2084 if (sci->sc_stage.scnt == NILFS_ST_DONE &&
2150 NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) { 2085 NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) {
@@ -2159,7 +2094,8 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2159 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) 2094 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
2160 nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile); 2095 nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile);
2161 2096
2162 if (has_sr) { 2097 if (mode == SC_LSEG_SR &&
2098 sci->sc_stage.scnt >= NILFS_ST_CPFILE) {
2163 err = nilfs_segctor_fill_in_checkpoint(sci); 2099 err = nilfs_segctor_fill_in_checkpoint(sci);
2164 if (unlikely(err)) 2100 if (unlikely(err))
2165 goto failed_to_write; 2101 goto failed_to_write;
@@ -2171,11 +2107,12 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2171 /* Write partial segments */ 2107 /* Write partial segments */
2172 err = nilfs_segctor_prepare_write(sci, &failed_page); 2108 err = nilfs_segctor_prepare_write(sci, &failed_page);
2173 if (err) { 2109 if (err) {
2174 nilfs_abort_logs(&sci->sc_segbufs, failed_page, 2110 nilfs_abort_logs(&sci->sc_segbufs, failed_page, err);
2175 sci->sc_super_root, err);
2176 goto failed_to_write; 2111 goto failed_to_write;
2177 } 2112 }
2178 nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed); 2113
2114 nilfs_add_checksums_on_logs(&sci->sc_segbufs,
2115 nilfs->ns_crc_seed);
2179 2116
2180 err = nilfs_segctor_write(sci, nilfs); 2117 err = nilfs_segctor_write(sci, nilfs);
2181 if (unlikely(err)) 2118 if (unlikely(err))
@@ -2196,8 +2133,6 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2196 } 2133 }
2197 } while (sci->sc_stage.scnt != NILFS_ST_DONE); 2134 } while (sci->sc_stage.scnt != NILFS_ST_DONE);
2198 2135
2199 sci->sc_super_root = NULL;
2200
2201 out: 2136 out:
2202 nilfs_segctor_check_out_files(sci, sbi); 2137 nilfs_segctor_check_out_files(sci, sbi);
2203 return err; 2138 return err;
@@ -2224,9 +2159,9 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2224static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci) 2159static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci)
2225{ 2160{
2226 spin_lock(&sci->sc_state_lock); 2161 spin_lock(&sci->sc_state_lock);
2227 if (sci->sc_timer && !(sci->sc_state & NILFS_SEGCTOR_COMMIT)) { 2162 if (!(sci->sc_state & NILFS_SEGCTOR_COMMIT)) {
2228 sci->sc_timer->expires = jiffies + sci->sc_interval; 2163 sci->sc_timer.expires = jiffies + sci->sc_interval;
2229 add_timer(sci->sc_timer); 2164 add_timer(&sci->sc_timer);
2230 sci->sc_state |= NILFS_SEGCTOR_COMMIT; 2165 sci->sc_state |= NILFS_SEGCTOR_COMMIT;
2231 } 2166 }
2232 spin_unlock(&sci->sc_state_lock); 2167 spin_unlock(&sci->sc_state_lock);
@@ -2431,9 +2366,7 @@ static void nilfs_segctor_accept(struct nilfs_sc_info *sci)
2431 spin_lock(&sci->sc_state_lock); 2366 spin_lock(&sci->sc_state_lock);
2432 sci->sc_seq_accepted = sci->sc_seq_request; 2367 sci->sc_seq_accepted = sci->sc_seq_request;
2433 spin_unlock(&sci->sc_state_lock); 2368 spin_unlock(&sci->sc_state_lock);
2434 2369 del_timer_sync(&sci->sc_timer);
2435 if (sci->sc_timer)
2436 del_timer_sync(sci->sc_timer);
2437} 2370}
2438 2371
2439/** 2372/**
@@ -2459,9 +2392,9 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err)
2459 sci->sc_flush_request &= ~FLUSH_DAT_BIT; 2392 sci->sc_flush_request &= ~FLUSH_DAT_BIT;
2460 2393
2461 /* re-enable timer if checkpoint creation was not done */ 2394 /* re-enable timer if checkpoint creation was not done */
2462 if (sci->sc_timer && (sci->sc_state & NILFS_SEGCTOR_COMMIT) && 2395 if ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
2463 time_before(jiffies, sci->sc_timer->expires)) 2396 time_before(jiffies, sci->sc_timer.expires))
2464 add_timer(sci->sc_timer); 2397 add_timer(&sci->sc_timer);
2465 } 2398 }
2466 spin_unlock(&sci->sc_state_lock); 2399 spin_unlock(&sci->sc_state_lock);
2467} 2400}
@@ -2640,13 +2573,10 @@ static int nilfs_segctor_thread(void *arg)
2640{ 2573{
2641 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg; 2574 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
2642 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; 2575 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
2643 struct timer_list timer;
2644 int timeout = 0; 2576 int timeout = 0;
2645 2577
2646 init_timer(&timer); 2578 sci->sc_timer.data = (unsigned long)current;
2647 timer.data = (unsigned long)current; 2579 sci->sc_timer.function = nilfs_construction_timeout;
2648 timer.function = nilfs_construction_timeout;
2649 sci->sc_timer = &timer;
2650 2580
2651 /* start sync. */ 2581 /* start sync. */
2652 sci->sc_task = current; 2582 sci->sc_task = current;
@@ -2695,7 +2625,7 @@ static int nilfs_segctor_thread(void *arg)
2695 should_sleep = 0; 2625 should_sleep = 0;
2696 else if (sci->sc_state & NILFS_SEGCTOR_COMMIT) 2626 else if (sci->sc_state & NILFS_SEGCTOR_COMMIT)
2697 should_sleep = time_before(jiffies, 2627 should_sleep = time_before(jiffies,
2698 sci->sc_timer->expires); 2628 sci->sc_timer.expires);
2699 2629
2700 if (should_sleep) { 2630 if (should_sleep) {
2701 spin_unlock(&sci->sc_state_lock); 2631 spin_unlock(&sci->sc_state_lock);
@@ -2704,7 +2634,7 @@ static int nilfs_segctor_thread(void *arg)
2704 } 2634 }
2705 finish_wait(&sci->sc_wait_daemon, &wait); 2635 finish_wait(&sci->sc_wait_daemon, &wait);
2706 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && 2636 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
2707 time_after_eq(jiffies, sci->sc_timer->expires)); 2637 time_after_eq(jiffies, sci->sc_timer.expires));
2708 2638
2709 if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs)) 2639 if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs))
2710 set_nilfs_discontinued(nilfs); 2640 set_nilfs_discontinued(nilfs);
@@ -2713,8 +2643,6 @@ static int nilfs_segctor_thread(void *arg)
2713 2643
2714 end_thread: 2644 end_thread:
2715 spin_unlock(&sci->sc_state_lock); 2645 spin_unlock(&sci->sc_state_lock);
2716 del_timer_sync(sci->sc_timer);
2717 sci->sc_timer = NULL;
2718 2646
2719 /* end sync. */ 2647 /* end sync. */
2720 sci->sc_task = NULL; 2648 sci->sc_task = NULL;
@@ -2750,13 +2678,6 @@ static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
2750 } 2678 }
2751} 2679}
2752 2680
2753static int nilfs_segctor_init(struct nilfs_sc_info *sci)
2754{
2755 sci->sc_seq_done = sci->sc_seq_request;
2756
2757 return nilfs_segctor_start_thread(sci);
2758}
2759
2760/* 2681/*
2761 * Setup & clean-up functions 2682 * Setup & clean-up functions
2762 */ 2683 */
@@ -2780,6 +2701,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
2780 INIT_LIST_HEAD(&sci->sc_write_logs); 2701 INIT_LIST_HEAD(&sci->sc_write_logs);
2781 INIT_LIST_HEAD(&sci->sc_gc_inodes); 2702 INIT_LIST_HEAD(&sci->sc_gc_inodes);
2782 INIT_LIST_HEAD(&sci->sc_copied_buffers); 2703 INIT_LIST_HEAD(&sci->sc_copied_buffers);
2704 init_timer(&sci->sc_timer);
2783 2705
2784 sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; 2706 sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
2785 sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ; 2707 sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ;
@@ -2846,6 +2768,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2846 2768
2847 down_write(&sbi->s_nilfs->ns_segctor_sem); 2769 down_write(&sbi->s_nilfs->ns_segctor_sem);
2848 2770
2771 del_timer_sync(&sci->sc_timer);
2849 kfree(sci); 2772 kfree(sci);
2850} 2773}
2851 2774
@@ -2880,7 +2803,7 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
2880 return -ENOMEM; 2803 return -ENOMEM;
2881 2804
2882 nilfs_attach_writer(nilfs, sbi); 2805 nilfs_attach_writer(nilfs, sbi);
2883 err = nilfs_segctor_init(NILFS_SC(sbi)); 2806 err = nilfs_segctor_start_thread(NILFS_SC(sbi));
2884 if (err) { 2807 if (err) {
2885 nilfs_detach_writer(nilfs, sbi); 2808 nilfs_detach_writer(nilfs, sbi);
2886 kfree(sbi->s_sc_info); 2809 kfree(sbi->s_sc_info);
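Taken together, the timer hunks replace the on-stack struct timer_list (plus the sc_timer pointer) with a timer embedded in struct nilfs_sc_info, tying its lifetime to the segment constructor object instead of the thread. The resulting lifecycle, pieced together from the hunks above:

	init_timer(&sci->sc_timer);                    /* nilfs_segctor_new() */
	sci->sc_timer.data = (unsigned long)current;   /* segctor thread start */
	sci->sc_timer.function = nilfs_construction_timeout;
	/* ... add_timer()/del_timer_sync() while the thread runs ... */
	del_timer_sync(&sci->sc_timer);                /* nilfs_segctor_destroy() */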
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 82dfd6a686b9..dca142361ccf 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -100,7 +100,6 @@ struct nilfs_segsum_pointer {
100 * @sc_write_logs: List of segment buffers to hold logs under writing 100 * @sc_write_logs: List of segment buffers to hold logs under writing
101 * @sc_segbuf_nblocks: Number of available blocks in segment buffers. 101 * @sc_segbuf_nblocks: Number of available blocks in segment buffers.
102 * @sc_curseg: Current segment buffer 102 * @sc_curseg: Current segment buffer
103 * @sc_super_root: Pointer to the super root buffer
104 * @sc_stage: Collection stage 103 * @sc_stage: Collection stage
105 * @sc_finfo_ptr: pointer to the current finfo struct in the segment summary 104 * @sc_finfo_ptr: pointer to the current finfo struct in the segment summary
106 * @sc_binfo_ptr: pointer to the current binfo struct in the segment summary 105 * @sc_binfo_ptr: pointer to the current binfo struct in the segment summary
@@ -148,7 +147,6 @@ struct nilfs_sc_info {
148 struct list_head sc_write_logs; 147 struct list_head sc_write_logs;
149 unsigned long sc_segbuf_nblocks; 148 unsigned long sc_segbuf_nblocks;
150 struct nilfs_segment_buffer *sc_curseg; 149 struct nilfs_segment_buffer *sc_curseg;
151 struct buffer_head *sc_super_root;
152 150
153 struct nilfs_cstage sc_stage; 151 struct nilfs_cstage sc_stage;
154 152
@@ -179,7 +177,7 @@ struct nilfs_sc_info {
179 unsigned long sc_lseg_stime; /* in 1/HZ seconds */ 177 unsigned long sc_lseg_stime; /* in 1/HZ seconds */
180 unsigned long sc_watermark; 178 unsigned long sc_watermark;
181 179
182 struct timer_list *sc_timer; 180 struct timer_list sc_timer;
183 struct task_struct *sc_task; 181 struct task_struct *sc_task;
184}; 182};
185 183
@@ -219,6 +217,8 @@ enum {
219 */ 217 */
220#define NILFS_SC_DEFAULT_WATERMARK 3600 218#define NILFS_SC_DEFAULT_WATERMARK 3600
221 219
220/* super.c */
221extern struct kmem_cache *nilfs_transaction_cachep;
222 222
223/* segment.c */ 223/* segment.c */
224extern int nilfs_init_transaction_cache(void); 224extern int nilfs_init_transaction_cache(void);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 48145f505a6a..03b34b738993 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -67,6 +67,11 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
67 "(NILFS)"); 67 "(NILFS)");
68MODULE_LICENSE("GPL"); 68MODULE_LICENSE("GPL");
69 69
70struct kmem_cache *nilfs_inode_cachep;
71struct kmem_cache *nilfs_transaction_cachep;
72struct kmem_cache *nilfs_segbuf_cachep;
73struct kmem_cache *nilfs_btree_path_cache;
74
70static int nilfs_remount(struct super_block *sb, int *flags, char *data); 75static int nilfs_remount(struct super_block *sb, int *flags, char *data);
71 76
72/** 77/**
@@ -129,7 +134,6 @@ void nilfs_warning(struct super_block *sb, const char *function,
129 va_end(args); 134 va_end(args);
130} 135}
131 136
132static struct kmem_cache *nilfs_inode_cachep;
133 137
134struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs) 138struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs)
135{ 139{
@@ -155,34 +159,6 @@ void nilfs_destroy_inode(struct inode *inode)
155 kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); 159 kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
156} 160}
157 161
158static void init_once(void *obj)
159{
160 struct nilfs_inode_info *ii = obj;
161
162 INIT_LIST_HEAD(&ii->i_dirty);
163#ifdef CONFIG_NILFS_XATTR
164 init_rwsem(&ii->xattr_sem);
165#endif
166 nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
167 ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union;
168 inode_init_once(&ii->vfs_inode);
169}
170
171static int nilfs_init_inode_cache(void)
172{
173 nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
174 sizeof(struct nilfs_inode_info),
175 0, SLAB_RECLAIM_ACCOUNT,
176 init_once);
177
178 return (nilfs_inode_cachep == NULL) ? -ENOMEM : 0;
179}
180
181static inline void nilfs_destroy_inode_cache(void)
182{
183 kmem_cache_destroy(nilfs_inode_cachep);
184}
185
186static void nilfs_clear_inode(struct inode *inode) 162static void nilfs_clear_inode(struct inode *inode)
187{ 163{
188 struct nilfs_inode_info *ii = NILFS_I(inode); 164 struct nilfs_inode_info *ii = NILFS_I(inode);
@@ -266,8 +242,8 @@ int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb)
266 int err; 242 int err;
267 243
268 /* nilfs->sem must be locked by the caller. */ 244 /* nilfs->sem must be locked by the caller. */
269 if (sbp[0]->s_magic != NILFS_SUPER_MAGIC) { 245 if (sbp[0]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) {
270 if (sbp[1] && sbp[1]->s_magic == NILFS_SUPER_MAGIC) 246 if (sbp[1] && sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC))
271 nilfs_swap_super_block(nilfs); 247 nilfs_swap_super_block(nilfs);
272 else { 248 else {
273 printk(KERN_CRIT "NILFS: superblock broke on dev %s\n", 249 printk(KERN_CRIT "NILFS: superblock broke on dev %s\n",
@@ -470,10 +446,10 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
470 if (nilfs_test_opt(sbi, SNAPSHOT)) 446 if (nilfs_test_opt(sbi, SNAPSHOT))
471 seq_printf(seq, ",cp=%llu", 447 seq_printf(seq, ",cp=%llu",
472 (unsigned long long int)sbi->s_snapshot_cno); 448 (unsigned long long int)sbi->s_snapshot_cno);
473 if (nilfs_test_opt(sbi, ERRORS_RO))
474 seq_printf(seq, ",errors=remount-ro");
475 if (nilfs_test_opt(sbi, ERRORS_PANIC)) 449 if (nilfs_test_opt(sbi, ERRORS_PANIC))
476 seq_printf(seq, ",errors=panic"); 450 seq_printf(seq, ",errors=panic");
451 if (nilfs_test_opt(sbi, ERRORS_CONT))
452 seq_printf(seq, ",errors=continue");
477 if (nilfs_test_opt(sbi, STRICT_ORDER)) 453 if (nilfs_test_opt(sbi, STRICT_ORDER))
478 seq_printf(seq, ",order=strict"); 454 seq_printf(seq, ",order=strict");
479 if (nilfs_test_opt(sbi, NORECOVERY)) 455 if (nilfs_test_opt(sbi, NORECOVERY))
@@ -631,7 +607,7 @@ nilfs_set_default_options(struct nilfs_sb_info *sbi,
631 struct nilfs_super_block *sbp) 607 struct nilfs_super_block *sbp)
632{ 608{
633 sbi->s_mount_opt = 609 sbi->s_mount_opt =
634 NILFS_MOUNT_ERRORS_CONT | NILFS_MOUNT_BARRIER; 610 NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER;
635} 611}
636 612
637static int nilfs_setup_super(struct nilfs_sb_info *sbi) 613static int nilfs_setup_super(struct nilfs_sb_info *sbi)
@@ -778,9 +754,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
778 goto failed_sbi; 754 goto failed_sbi;
779 } 755 }
780 cno = sbi->s_snapshot_cno; 756 cno = sbi->s_snapshot_cno;
781 } else 757 }
782 /* Read-only mount */
783 sbi->s_snapshot_cno = cno;
784 } 758 }
785 759
786 err = nilfs_attach_checkpoint(sbi, cno); 760 err = nilfs_attach_checkpoint(sbi, cno);
@@ -849,7 +823,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
849 struct the_nilfs *nilfs = sbi->s_nilfs; 823 struct the_nilfs *nilfs = sbi->s_nilfs;
850 unsigned long old_sb_flags; 824 unsigned long old_sb_flags;
851 struct nilfs_mount_options old_opts; 825 struct nilfs_mount_options old_opts;
852 int err; 826 int was_snapshot, err;
853 827
854 lock_kernel(); 828 lock_kernel();
855 829
@@ -857,6 +831,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
857 old_sb_flags = sb->s_flags; 831 old_sb_flags = sb->s_flags;
858 old_opts.mount_opt = sbi->s_mount_opt; 832 old_opts.mount_opt = sbi->s_mount_opt;
859 old_opts.snapshot_cno = sbi->s_snapshot_cno; 833 old_opts.snapshot_cno = sbi->s_snapshot_cno;
834 was_snapshot = nilfs_test_opt(sbi, SNAPSHOT);
860 835
861 if (!parse_options(data, sb)) { 836 if (!parse_options(data, sb)) {
862 err = -EINVAL; 837 err = -EINVAL;
@@ -864,20 +839,32 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
864 } 839 }
865 sb->s_flags = (sb->s_flags & ~MS_POSIXACL); 840 sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
866 841
867 if ((*flags & MS_RDONLY) && 842 err = -EINVAL;
868 sbi->s_snapshot_cno != old_opts.snapshot_cno) { 843 if (was_snapshot) {
869 printk(KERN_WARNING "NILFS (device %s): couldn't " 844 if (!(*flags & MS_RDONLY)) {
870 "remount to a different snapshot.\n", 845 printk(KERN_ERR "NILFS (device %s): cannot remount "
871 sb->s_id); 846 "snapshot read/write.\n",
872 err = -EINVAL; 847 sb->s_id);
873 goto restore_opts; 848 goto restore_opts;
849 } else if (sbi->s_snapshot_cno != old_opts.snapshot_cno) {
850 printk(KERN_ERR "NILFS (device %s): cannot "
851 "remount to a different snapshot.\n",
852 sb->s_id);
853 goto restore_opts;
854 }
855 } else {
856 if (nilfs_test_opt(sbi, SNAPSHOT)) {
857 printk(KERN_ERR "NILFS (device %s): cannot change "
858 "a regular mount to a snapshot.\n",
859 sb->s_id);
860 goto restore_opts;
861 }
874 } 862 }
875 863
876 if (!nilfs_valid_fs(nilfs)) { 864 if (!nilfs_valid_fs(nilfs)) {
877 printk(KERN_WARNING "NILFS (device %s): couldn't " 865 printk(KERN_WARNING "NILFS (device %s): couldn't "
878 "remount because the filesystem is in an " 866 "remount because the filesystem is in an "
879 "incomplete recovery state.\n", sb->s_id); 867 "incomplete recovery state.\n", sb->s_id);
880 err = -EINVAL;
881 goto restore_opts; 868 goto restore_opts;
882 } 869 }
883 870
@@ -888,9 +875,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
888 nilfs_detach_segment_constructor(sbi); 875 nilfs_detach_segment_constructor(sbi);
889 sb->s_flags |= MS_RDONLY; 876 sb->s_flags |= MS_RDONLY;
890 877
891 sbi->s_snapshot_cno = nilfs_last_cno(nilfs);
892 /* nilfs_set_opt(sbi, SNAPSHOT); */
893
894 /* 878 /*
895 * Remounting a valid RW partition RDONLY, so set 879 * Remounting a valid RW partition RDONLY, so set
896 * the RDONLY flag and then mark the partition as valid again. 880 * the RDONLY flag and then mark the partition as valid again.
@@ -909,24 +893,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
909 * store the current valid flag. (It may have been changed 893 * store the current valid flag. (It may have been changed
910 * by fsck since we originally mounted the partition.) 894 * by fsck since we originally mounted the partition.)
911 */ 895 */
912 if (nilfs->ns_current && nilfs->ns_current != sbi) {
913 printk(KERN_WARNING "NILFS (device %s): couldn't "
914 "remount because an RW-mount exists.\n",
915 sb->s_id);
916 err = -EBUSY;
917 goto restore_opts;
918 }
919 if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) {
920 printk(KERN_WARNING "NILFS (device %s): couldn't "
921 "remount because the current RO-mount is not "
922 "the latest one.\n",
923 sb->s_id);
924 err = -EINVAL;
925 goto restore_opts;
926 }
927 sb->s_flags &= ~MS_RDONLY; 896 sb->s_flags &= ~MS_RDONLY;
928 nilfs_clear_opt(sbi, SNAPSHOT);
929 sbi->s_snapshot_cno = 0;
930 897
931 err = nilfs_attach_segment_constructor(sbi); 898 err = nilfs_attach_segment_constructor(sbi);
932 if (err) 899 if (err)
@@ -935,8 +902,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
935 down_write(&nilfs->ns_sem); 902 down_write(&nilfs->ns_sem);
936 nilfs_setup_super(sbi); 903 nilfs_setup_super(sbi);
937 up_write(&nilfs->ns_sem); 904 up_write(&nilfs->ns_sem);
938
939 nilfs->ns_current = sbi;
940 } 905 }
941 out: 906 out:
942 up_write(&nilfs->ns_super_sem); 907 up_write(&nilfs->ns_super_sem);
@@ -1022,10 +987,14 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1022{ 987{
1023 struct nilfs_super_data sd; 988 struct nilfs_super_data sd;
1024 struct super_block *s; 989 struct super_block *s;
990 fmode_t mode = FMODE_READ;
1025 struct the_nilfs *nilfs; 991 struct the_nilfs *nilfs;
1026 int err, need_to_close = 1; 992 int err, need_to_close = 1;
1027 993
1028 sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type); 994 if (!(flags & MS_RDONLY))
995 mode |= FMODE_WRITE;
996
997 sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type);
1029 if (IS_ERR(sd.bdev)) 998 if (IS_ERR(sd.bdev))
1030 return PTR_ERR(sd.bdev); 999 return PTR_ERR(sd.bdev);
1031 1000
@@ -1092,10 +1061,12 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1092 1061
1093 /* New superblock instance created */ 1062 /* New superblock instance created */
1094 s->s_flags = flags; 1063 s->s_flags = flags;
1064 s->s_mode = mode;
1095 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); 1065 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
1096 sb_set_blocksize(s, block_size(sd.bdev)); 1066 sb_set_blocksize(s, block_size(sd.bdev));
1097 1067
1098 err = nilfs_fill_super(s, data, flags & MS_VERBOSE, nilfs); 1068 err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0,
1069 nilfs);
1099 if (err) 1070 if (err)
1100 goto cancel_new; 1071 goto cancel_new;
1101 1072
@@ -1106,7 +1077,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1106 mutex_unlock(&nilfs->ns_mount_mutex); 1077 mutex_unlock(&nilfs->ns_mount_mutex);
1107 put_nilfs(nilfs); 1078 put_nilfs(nilfs);
1108 if (need_to_close) 1079 if (need_to_close)
1109 close_bdev_exclusive(sd.bdev, flags); 1080 close_bdev_exclusive(sd.bdev, mode);
1110 simple_set_mnt(mnt, s); 1081 simple_set_mnt(mnt, s);
1111 return 0; 1082 return 0;
1112 1083
@@ -1114,7 +1085,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1114 mutex_unlock(&nilfs->ns_mount_mutex); 1085 mutex_unlock(&nilfs->ns_mount_mutex);
1115 put_nilfs(nilfs); 1086 put_nilfs(nilfs);
1116 failed: 1087 failed:
1117 close_bdev_exclusive(sd.bdev, flags); 1088 close_bdev_exclusive(sd.bdev, mode);
1118 1089
1119 return err; 1090 return err;
1120 1091
@@ -1124,7 +1095,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1124 put_nilfs(nilfs); 1095 put_nilfs(nilfs);
1125 deactivate_locked_super(s); 1096 deactivate_locked_super(s);
1126 /* 1097 /*
1127 * deactivate_super() invokes close_bdev_exclusive(). 1098 * deactivate_locked_super() invokes close_bdev_exclusive().
1128 * We must finish all post-cleaning before this call; 1099 * We must finish all post-cleaning before this call;
1129 * put_nilfs() needs the block device. 1100 * put_nilfs() needs the block device.
1130 */ 1101 */
@@ -1139,54 +1110,93 @@ struct file_system_type nilfs_fs_type = {
1139 .fs_flags = FS_REQUIRES_DEV, 1110 .fs_flags = FS_REQUIRES_DEV,
1140}; 1111};
1141 1112
1142static int __init init_nilfs_fs(void) 1113static void nilfs_inode_init_once(void *obj)
1143{ 1114{
1144 int err; 1115 struct nilfs_inode_info *ii = obj;
1145
1146 err = nilfs_init_inode_cache();
1147 if (err)
1148 goto failed;
1149 1116
1150 err = nilfs_init_transaction_cache(); 1117 INIT_LIST_HEAD(&ii->i_dirty);
1151 if (err) 1118#ifdef CONFIG_NILFS_XATTR
1152 goto failed_inode_cache; 1119 init_rwsem(&ii->xattr_sem);
1120#endif
1121 nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
1122 ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union;
1123 inode_init_once(&ii->vfs_inode);
1124}
1153 1125
1154 err = nilfs_init_segbuf_cache(); 1126static void nilfs_segbuf_init_once(void *obj)
1155 if (err) 1127{
1156 goto failed_transaction_cache; 1128 memset(obj, 0, sizeof(struct nilfs_segment_buffer));
1129}
1157 1130
1158 err = nilfs_btree_path_cache_init(); 1131static void nilfs_destroy_cachep(void)
1159 if (err) 1132{
1160 goto failed_segbuf_cache; 1133 if (nilfs_inode_cachep)
1134 kmem_cache_destroy(nilfs_inode_cachep);
1135 if (nilfs_transaction_cachep)
1136 kmem_cache_destroy(nilfs_transaction_cachep);
1137 if (nilfs_segbuf_cachep)
1138 kmem_cache_destroy(nilfs_segbuf_cachep);
1139 if (nilfs_btree_path_cache)
1140 kmem_cache_destroy(nilfs_btree_path_cache);
1141}
1161 1142
1162 err = register_filesystem(&nilfs_fs_type); 1143static int __init nilfs_init_cachep(void)
1163 if (err) 1144{
1164 goto failed_btree_path_cache; 1145 nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
1146 sizeof(struct nilfs_inode_info), 0,
1147 SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once);
1148 if (!nilfs_inode_cachep)
1149 goto fail;
1150
1151 nilfs_transaction_cachep = kmem_cache_create("nilfs2_transaction_cache",
1152 sizeof(struct nilfs_transaction_info), 0,
1153 SLAB_RECLAIM_ACCOUNT, NULL);
1154 if (!nilfs_transaction_cachep)
1155 goto fail;
1156
1157 nilfs_segbuf_cachep = kmem_cache_create("nilfs2_segbuf_cache",
1158 sizeof(struct nilfs_segment_buffer), 0,
1159 SLAB_RECLAIM_ACCOUNT, nilfs_segbuf_init_once);
1160 if (!nilfs_segbuf_cachep)
1161 goto fail;
1162
1163 nilfs_btree_path_cache = kmem_cache_create("nilfs2_btree_path_cache",
1164 sizeof(struct nilfs_btree_path) * NILFS_BTREE_LEVEL_MAX,
1165 0, 0, NULL);
1166 if (!nilfs_btree_path_cache)
1167 goto fail;
1165 1168
1166 return 0; 1169 return 0;
1167 1170
1168 failed_btree_path_cache: 1171fail:
1169 nilfs_btree_path_cache_destroy(); 1172 nilfs_destroy_cachep();
1173 return -ENOMEM;
1174}
1175
1176static int __init init_nilfs_fs(void)
1177{
1178 int err;
1170 1179
1171 failed_segbuf_cache: 1180 err = nilfs_init_cachep();
1172 nilfs_destroy_segbuf_cache(); 1181 if (err)
1182 goto fail;
1173 1183
1174 failed_transaction_cache: 1184 err = register_filesystem(&nilfs_fs_type);
1175 nilfs_destroy_transaction_cache(); 1185 if (err)
1186 goto free_cachep;
1176 1187
1177 failed_inode_cache: 1188 printk(KERN_INFO "NILFS version 2 loaded\n");
1178 nilfs_destroy_inode_cache(); 1189 return 0;
1179 1190
1180 failed: 1191free_cachep:
1192 nilfs_destroy_cachep();
1193fail:
1181 return err; 1194 return err;
1182} 1195}
1183 1196
1184static void __exit exit_nilfs_fs(void) 1197static void __exit exit_nilfs_fs(void)
1185{ 1198{
1186 nilfs_destroy_segbuf_cache(); 1199 nilfs_destroy_cachep();
1187 nilfs_destroy_transaction_cache();
1188 nilfs_destroy_inode_cache();
1189 nilfs_btree_path_cache_destroy();
1190 unregister_filesystem(&nilfs_fs_type); 1200 unregister_filesystem(&nilfs_fs_type);
1191} 1201}
1192 1202
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 33871f7e4f01..8c1097327abc 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -486,11 +486,15 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
486 printk(KERN_WARNING 486 printk(KERN_WARNING
487 "NILFS warning: unable to read secondary superblock\n"); 487 "NILFS warning: unable to read secondary superblock\n");
488 488
489 /*
 490 * Compare the two super blocks and set swp to 1 if the secondary
 491 * super block is valid and newer. Otherwise, set swp to 0.
492 */
489 valid[0] = nilfs_valid_sb(sbp[0]); 493 valid[0] = nilfs_valid_sb(sbp[0]);
490 valid[1] = nilfs_valid_sb(sbp[1]); 494 valid[1] = nilfs_valid_sb(sbp[1]);
491 swp = valid[1] && 495 swp = valid[1] && (!valid[0] ||
492 (!valid[0] || 496 le64_to_cpu(sbp[1]->s_last_cno) >
493 le64_to_cpu(sbp[1]->s_wtime) > le64_to_cpu(sbp[0]->s_wtime)); 497 le64_to_cpu(sbp[0]->s_last_cno));
494 498
495 if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) { 499 if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) {
496 brelse(sbh[1]); 500 brelse(sbh[1]);
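Restated as a standalone predicate, the new selection rule prefers the secondary super block only when it is valid and records a strictly newer checkpoint; nilfs_prefer_secondary is a hypothetical name used purely for illustration:

	static int nilfs_prefer_secondary(const struct nilfs_super_block *sbp0,
					  const struct nilfs_super_block *sbp1,
					  int valid0, int valid1)
	{
		/* checkpoint numbers grow monotonically, so comparing
		   s_last_cno is a more robust recency test than s_wtime */
		return valid1 && (!valid0 ||
				  le64_to_cpu(sbp1->s_last_cno) >
				  le64_to_cpu(sbp0->s_last_cno));
	}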
@@ -670,7 +674,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
670 start * sects_per_block, 674 start * sects_per_block,
671 nblocks * sects_per_block, 675 nblocks * sects_per_block,
672 GFP_NOFS, 676 GFP_NOFS,
673 DISCARD_FL_BARRIER); 677 BLKDEV_IFL_BARRIER);
674 if (ret < 0) 678 if (ret < 0)
675 return ret; 679 return ret;
676 nblocks = 0; 680 nblocks = 0;
@@ -680,7 +684,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
680 ret = blkdev_issue_discard(nilfs->ns_bdev, 684 ret = blkdev_issue_discard(nilfs->ns_bdev,
681 start * sects_per_block, 685 start * sects_per_block,
682 nblocks * sects_per_block, 686 nblocks * sects_per_block,
683 GFP_NOFS, DISCARD_FL_BARRIER); 687 GFP_NOFS, BLKDEV_IFL_BARRIER);
684 return ret; 688 return ret;
685} 689}
686 690
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
index 40b1cf914ccb..27b75ebc7460 100644
--- a/fs/notify/inotify/inotify.c
+++ b/fs/notify/inotify/inotify.c
@@ -110,14 +110,10 @@ EXPORT_SYMBOL_GPL(get_inotify_watch);
110int pin_inotify_watch(struct inotify_watch *watch) 110int pin_inotify_watch(struct inotify_watch *watch)
111{ 111{
112 struct super_block *sb = watch->inode->i_sb; 112 struct super_block *sb = watch->inode->i_sb;
113 spin_lock(&sb_lock); 113 if (atomic_inc_not_zero(&sb->s_active)) {
114 if (sb->s_count >= S_BIAS) {
115 atomic_inc(&sb->s_active);
116 spin_unlock(&sb_lock);
117 atomic_inc(&watch->count); 114 atomic_inc(&watch->count);
118 return 1; 115 return 1;
119 } 116 }
120 spin_unlock(&sb_lock);
121 return 0; 117 return 0;
122} 118}
123 119
@@ -515,34 +511,8 @@ EXPORT_SYMBOL_GPL(inotify_init_watch);
515 * done. Cleanup is just deactivate_super(). However, that leaves a messy 511 * done. Cleanup is just deactivate_super(). However, that leaves a messy
516 * case - what if we *are* racing with umount() and active references to 512 * case - what if we *are* racing with umount() and active references to
517 * superblock can't be acquired anymore? We can bump ->s_count, grab 513 * superblock can't be acquired anymore? We can bump ->s_count, grab
518 * ->s_umount, which will almost certainly wait until the superblock is shut 514 * ->s_umount, which will wait until the superblock is shut down and the
519 * down and the watch in question is pining for fjords. That's fine, but 515 * watch in question is pining for fjords.
520 * there is a problem - we might have hit the window between ->s_active
521 * getting to 0 / ->s_count - below S_BIAS (i.e. the moment when superblock
522 * is past the point of no return and is heading for shutdown) and the
523 * moment when deactivate_super() acquires ->s_umount. We could just do
524 * drop_super() yield() and retry, but that's rather antisocial and this
525 * stuff is luser-triggerable. OTOH, having grabbed ->s_umount and having
526 * found that we'd got there first (i.e. that ->s_root is non-NULL) we know
527 * that we won't race with inotify_umount_inodes(). So we could grab a
528 * reference to watch and do the rest as above, just with drop_super() instead
529 * of deactivate_super(), right? Wrong. We had to drop ih->mutex before we
530 * could grab ->s_umount. So the watch could've been gone already.
531 *
532 * That still can be dealt with - we need to save watch->wd, do idr_find()
533 * and compare its result with our pointer. If they match, we either have
534 * the damn thing still alive or we'd lost not one but two races at once,
535 * the watch had been killed and a new one got created with the same ->wd
536 * at the same address. That couldn't have happened in inotify_destroy(),
537 * but inotify_rm_wd() could run into that. Still, "new one got created"
538 * is not a problem - we have every right to kill it or leave it alone,
539 * whatever's more convenient.
540 *
541 * So we can use idr_find(...) == watch && watch->inode->i_sb == sb as
542 * "grab it and kill it" check. If it's been our original watch, we are
543 * fine, if it's a newcomer - nevermind, just pretend that we'd won the
544 * race and kill the fscker anyway; we are safe since we know that its
545 * superblock won't be going away.
546 * 516 *
547 * And yes, this is far beyond mere "not very pretty"; so's the entire 517 * And yes, this is far beyond mere "not very pretty"; so's the entire
548 * concept of inotify to start with. 518 * concept of inotify to start with.
@@ -556,57 +526,31 @@ EXPORT_SYMBOL_GPL(inotify_init_watch);
556 * Called with ih->mutex held, drops it. Possible return values: 526 * Called with ih->mutex held, drops it. Possible return values:
557 * 0 - nothing to do, it has died 527 * 0 - nothing to do, it has died
558 * 1 - remove it, drop the reference and deactivate_super() 528 * 1 - remove it, drop the reference and deactivate_super()
559 * 2 - remove it, drop the reference and drop_super(); we tried hard to avoid
560 * that variant, since it involved a lot of PITA, but that's the best that
561 * could've been done.
562 */ 529 */
563static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch) 530static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch)
564{ 531{
565 struct super_block *sb = watch->inode->i_sb; 532 struct super_block *sb = watch->inode->i_sb;
566 s32 wd = watch->wd;
567 533
568 spin_lock(&sb_lock); 534 if (atomic_inc_not_zero(&sb->s_active)) {
569 if (sb->s_count >= S_BIAS) {
570 atomic_inc(&sb->s_active);
571 spin_unlock(&sb_lock);
572 get_inotify_watch(watch); 535 get_inotify_watch(watch);
573 mutex_unlock(&ih->mutex); 536 mutex_unlock(&ih->mutex);
574 return 1; /* the best outcome */ 537 return 1; /* the best outcome */
575 } 538 }
539 spin_lock(&sb_lock);
576 sb->s_count++; 540 sb->s_count++;
577 spin_unlock(&sb_lock); 541 spin_unlock(&sb_lock);
578 mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */ 542 mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */
579 down_read(&sb->s_umount); 543 down_read(&sb->s_umount);
580 if (likely(!sb->s_root)) { 544 /* fs is already shut down; the watch is dead */
581 /* fs is already shut down; the watch is dead */ 545 drop_super(sb);
582 drop_super(sb); 546 return 0;
583 return 0;
584 }
585 /* raced with the final deactivate_super() */
586 mutex_lock(&ih->mutex);
587 if (idr_find(&ih->idr, wd) != watch || watch->inode->i_sb != sb) {
588 /* the watch is dead */
589 mutex_unlock(&ih->mutex);
590 drop_super(sb);
591 return 0;
592 }
593 /* still alive or freed and reused with the same sb and wd; kill */
594 get_inotify_watch(watch);
595 mutex_unlock(&ih->mutex);
596 return 2;
597} 547}
598 548
599static void unpin_and_kill(struct inotify_watch *watch, int how) 549static void unpin_and_kill(struct inotify_watch *watch)
600{ 550{
601 struct super_block *sb = watch->inode->i_sb; 551 struct super_block *sb = watch->inode->i_sb;
602 put_inotify_watch(watch); 552 put_inotify_watch(watch);
603 switch (how) { 553 deactivate_super(sb);
604 case 1:
605 deactivate_super(sb);
606 break;
607 case 2:
608 drop_super(sb);
609 }
610} 554}
611 555
612/** 556/**
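The hunk above reduces the pinning protocol to a single primitive: atomic_inc_not_zero(&sb->s_active) either takes an active reference or fails because the superblock has already begun shutting down, which removes the old sb_lock/S_BIAS window entirely. A condensed sketch of the idiom, with hypothetical wrapper names:

	/* Try to pin sb against umount; nonzero means we hold an active ref. */
	static int pin_sb(struct super_block *sb)
	{
		return atomic_inc_not_zero(&sb->s_active);
	}

	static void unpin_sb(struct super_block *sb)
	{
		deactivate_super(sb);	/* drops the reference taken in pin_sb() */
	}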
@@ -628,7 +572,6 @@ void inotify_destroy(struct inotify_handle *ih)
628 struct list_head *watches; 572 struct list_head *watches;
629 struct super_block *sb; 573 struct super_block *sb;
630 struct inode *inode; 574 struct inode *inode;
631 int how;
632 575
633 mutex_lock(&ih->mutex); 576 mutex_lock(&ih->mutex);
634 watches = &ih->watches; 577 watches = &ih->watches;
@@ -638,8 +581,7 @@ void inotify_destroy(struct inotify_handle *ih)
638 } 581 }
639 watch = list_first_entry(watches, struct inotify_watch, h_list); 582 watch = list_first_entry(watches, struct inotify_watch, h_list);
640 sb = watch->inode->i_sb; 583 sb = watch->inode->i_sb;
641 how = pin_to_kill(ih, watch); 584 if (!pin_to_kill(ih, watch))
642 if (!how)
643 continue; 585 continue;
644 586
645 inode = watch->inode; 587 inode = watch->inode;
@@ -654,7 +596,7 @@ void inotify_destroy(struct inotify_handle *ih)
654 596
655 mutex_unlock(&ih->mutex); 597 mutex_unlock(&ih->mutex);
656 mutex_unlock(&inode->inotify_mutex); 598 mutex_unlock(&inode->inotify_mutex);
657 unpin_and_kill(watch, how); 599 unpin_and_kill(watch);
658 } 600 }
659 601
660 /* free this handle: the put matching the get in inotify_init() */ 602 /* free this handle: the put matching the get in inotify_init() */
@@ -857,7 +799,6 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
857 struct inotify_watch *watch; 799 struct inotify_watch *watch;
858 struct super_block *sb; 800 struct super_block *sb;
859 struct inode *inode; 801 struct inode *inode;
860 int how;
861 802
862 mutex_lock(&ih->mutex); 803 mutex_lock(&ih->mutex);
863 watch = idr_find(&ih->idr, wd); 804 watch = idr_find(&ih->idr, wd);
@@ -866,8 +807,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
866 return -EINVAL; 807 return -EINVAL;
867 } 808 }
868 sb = watch->inode->i_sb; 809 sb = watch->inode->i_sb;
869 how = pin_to_kill(ih, watch); 810 if (!pin_to_kill(ih, watch))
870 if (!how)
871 return 0; 811 return 0;
872 812
873 inode = watch->inode; 813 inode = watch->inode;
@@ -881,7 +821,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
881 821
882 mutex_unlock(&ih->mutex); 822 mutex_unlock(&ih->mutex);
883 mutex_unlock(&inode->inotify_mutex); 823 mutex_unlock(&inode->inotify_mutex);
884 unpin_and_kill(watch, how); 824 unpin_and_kill(watch);
885 825
886 return 0; 826 return 0;
887} 827}
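
The rewrite above collapses pin_to_kill()'s old three-way return (0/1/2) into a boolean: atomic_inc_not_zero(&sb->s_active) either takes an active superblock reference, later dropped via deactivate_super() in unpin_and_kill(), or fails because the fs is dying, in which case a passive s_count reference plus down_read(&sb->s_umount) is enough to wait out shutdown and report the watch dead. A minimal userspace sketch of the inc-not-zero idiom, using C11 atomics — struct obj and get_active_ref() are illustrative stand-ins, not kernel API:

#include <stdatomic.h>
#include <stdbool.h>

struct obj {
	atomic_int active;	/* stand-in for sb->s_active */
};

/* Take a reference only if at least one is still held; like
 * atomic_inc_not_zero(), this never resurrects a dead object. */
static bool get_active_ref(struct obj *o)
{
	int v = atomic_load(&o->active);

	while (v != 0) {
		if (atomic_compare_exchange_weak(&o->active, &v, v + 1))
			return true;	/* pinned; drop it later */
	}
	return false;		/* already dead; take the slow path */
}
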
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 8804f093ba75..a1924a0d2ab0 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -98,9 +98,6 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
98 * the page at all. For a more detailed explanation see ntfs_truncate() in 98 * the page at all. For a more detailed explanation see ntfs_truncate() in
99 * fs/ntfs/inode.c. 99 * fs/ntfs/inode.c.
100 * 100 *
101 * @cached_page and @lru_pvec are just optimizations for dealing with multiple
102 * pages.
103 *
104 * Return 0 on success and -errno on error. In the case that an error is 101 * Return 0 on success and -errno on error. In the case that an error is
105 * encountered it is possible that the initialized size will already have been 102 * encountered it is possible that the initialized size will already have been
106 * incremented some way towards @new_init_size but it is guaranteed that if 103 * incremented some way towards @new_init_size but it is guaranteed that if
@@ -110,8 +107,7 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
110 * Locking: i_mutex on the vfs inode corresponding to the ntfs inode @ni must be 107 * Locking: i_mutex on the vfs inode corresponding to the ntfs inode @ni must be
111 * held by the caller. 108 * held by the caller.
112 */ 109 */
113static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size, 110static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size)
114 struct page **cached_page, struct pagevec *lru_pvec)
115{ 111{
116 s64 old_init_size; 112 s64 old_init_size;
117 loff_t old_i_size; 113 loff_t old_i_size;
@@ -403,18 +399,13 @@ static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
403 * Obtain @nr_pages locked page cache pages from the mapping @mapping and 399 * Obtain @nr_pages locked page cache pages from the mapping @mapping and
404 * starting at index @index. 400 * starting at index @index.
405 * 401 *
406 * If a page is newly created, increment its refcount and add it to the 402 * If a page is newly created, add it to the LRU list
407 * caller's lru-buffering pagevec @lru_pvec.
408 *
409 * This is the same as mm/filemap.c::__grab_cache_page(), except that @nr_pages
410 * are obtained at once instead of just one page and that 0 is returned on
411 * success and -errno on error.
412 * 403 *
413 * Note, the page locks are obtained in ascending page index order. 404 * Note, the page locks are obtained in ascending page index order.
414 */ 405 */
415static inline int __ntfs_grab_cache_pages(struct address_space *mapping, 406static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
416 pgoff_t index, const unsigned nr_pages, struct page **pages, 407 pgoff_t index, const unsigned nr_pages, struct page **pages,
417 struct page **cached_page, struct pagevec *lru_pvec) 408 struct page **cached_page)
418{ 409{
419 int err, nr; 410 int err, nr;
420 411
@@ -430,7 +421,7 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
430 goto err_out; 421 goto err_out;
431 } 422 }
432 } 423 }
433 err = add_to_page_cache(*cached_page, mapping, index, 424 err = add_to_page_cache_lru(*cached_page, mapping, index,
434 GFP_KERNEL); 425 GFP_KERNEL);
435 if (unlikely(err)) { 426 if (unlikely(err)) {
436 if (err == -EEXIST) 427 if (err == -EEXIST)
@@ -438,9 +429,6 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
438 goto err_out; 429 goto err_out;
439 } 430 }
440 pages[nr] = *cached_page; 431 pages[nr] = *cached_page;
441 page_cache_get(*cached_page);
442 if (unlikely(!pagevec_add(lru_pvec, *cached_page)))
443 __pagevec_lru_add_file(lru_pvec);
444 *cached_page = NULL; 432 *cached_page = NULL;
445 } 433 }
446 index++; 434 index++;
@@ -1800,7 +1788,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1800 ssize_t status, written; 1788 ssize_t status, written;
1801 unsigned nr_pages; 1789 unsigned nr_pages;
1802 int err; 1790 int err;
1803 struct pagevec lru_pvec;
1804 1791
1805 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " 1792 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
1806 "pos 0x%llx, count 0x%lx.", 1793 "pos 0x%llx, count 0x%lx.",
@@ -1912,7 +1899,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1912 } 1899 }
1913 } 1900 }
1914 } 1901 }
1915 pagevec_init(&lru_pvec, 0);
1916 written = 0; 1902 written = 0;
1917 /* 1903 /*
1918 * If the write starts beyond the initialized size, extend it up to the 1904 * If the write starts beyond the initialized size, extend it up to the
@@ -1925,8 +1911,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1925 ll = ni->initialized_size; 1911 ll = ni->initialized_size;
1926 read_unlock_irqrestore(&ni->size_lock, flags); 1912 read_unlock_irqrestore(&ni->size_lock, flags);
1927 if (pos > ll) { 1913 if (pos > ll) {
1928 err = ntfs_attr_extend_initialized(ni, pos, &cached_page, 1914 err = ntfs_attr_extend_initialized(ni, pos);
1929 &lru_pvec);
1930 if (err < 0) { 1915 if (err < 0) {
1931 ntfs_error(vol->sb, "Cannot perform write to inode " 1916 ntfs_error(vol->sb, "Cannot perform write to inode "
1932 "0x%lx, attribute type 0x%x, because " 1917 "0x%lx, attribute type 0x%x, because "
@@ -2012,7 +1997,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
2012 ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes); 1997 ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes);
2013 /* Get and lock @do_pages starting at index @start_idx. */ 1998 /* Get and lock @do_pages starting at index @start_idx. */
2014 status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages, 1999 status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
2015 pages, &cached_page, &lru_pvec); 2000 pages, &cached_page);
2016 if (unlikely(status)) 2001 if (unlikely(status))
2017 break; 2002 break;
2018 /* 2003 /*
@@ -2077,7 +2062,6 @@ err_out:
2077 *ppos = pos; 2062 *ppos = pos;
2078 if (cached_page) 2063 if (cached_page)
2079 page_cache_release(cached_page); 2064 page_cache_release(cached_page);
2080 pagevec_lru_add_file(&lru_pvec);
2081 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", 2065 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
2082 written ? "written" : "status", (unsigned long)written, 2066 written ? "written" : "status", (unsigned long)written,
2083 (long)status); 2067 (long)status);
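
The ntfs changes above are the caller side of switching to add_to_page_cache_lru(): the write path used to buffer each newly created page in a local struct pagevec and batch it onto the LRU by hand, and that is exactly the bookkeeping that disappears. Condensed from the hunks above, the shape of the conversion (error handling trimmed):

	/* Before: insert into the page cache, then LRU-buffer by hand. */
	err = add_to_page_cache(page, mapping, index, GFP_KERNEL);
	if (!err) {
		page_cache_get(page);
		if (!pagevec_add(&lru_pvec, page))
			__pagevec_lru_add_file(&lru_pvec);
	}

	/* After: one call inserts the page and queues it for the LRU. */
	err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
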
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 791c0886c060..07d9fd854350 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -29,6 +29,7 @@ ocfs2-objs := \
29 mmap.o \ 29 mmap.o \
30 namei.o \ 30 namei.o \
31 refcounttree.o \ 31 refcounttree.o \
32 reservations.o \
32 resize.o \ 33 resize.o \
33 slot_map.o \ 34 slot_map.o \
34 suballoc.o \ 35 suballoc.o \
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index e13fc9e8fcdc..da702294d7e7 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -489,7 +489,7 @@ cleanup:
489 return ret; 489 return ret;
490} 490}
491 491
492struct xattr_handler ocfs2_xattr_acl_access_handler = { 492const struct xattr_handler ocfs2_xattr_acl_access_handler = {
493 .prefix = POSIX_ACL_XATTR_ACCESS, 493 .prefix = POSIX_ACL_XATTR_ACCESS,
494 .flags = ACL_TYPE_ACCESS, 494 .flags = ACL_TYPE_ACCESS,
495 .list = ocfs2_xattr_list_acl_access, 495 .list = ocfs2_xattr_list_acl_access,
@@ -497,7 +497,7 @@ struct xattr_handler ocfs2_xattr_acl_access_handler = {
497 .set = ocfs2_xattr_set_acl, 497 .set = ocfs2_xattr_set_acl,
498}; 498};
499 499
500struct xattr_handler ocfs2_xattr_acl_default_handler = { 500const struct xattr_handler ocfs2_xattr_acl_default_handler = {
501 .prefix = POSIX_ACL_XATTR_DEFAULT, 501 .prefix = POSIX_ACL_XATTR_DEFAULT,
502 .flags = ACL_TYPE_DEFAULT, 502 .flags = ACL_TYPE_DEFAULT,
503 .list = ocfs2_xattr_list_acl_default, 503 .list = ocfs2_xattr_list_acl_default,
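
Constifying the handler tables lets them live in read-only data, and it ripples into every declaration and into the superblock's s_xattr table, which becomes an array of const pointers. A minimal sketch of the resulting shape — example_handler and its prefix are hypothetical; the real ocfs2 tables, with their .list/.get/.set callbacks, live in fs/ocfs2/xattr.c:

#include <linux/xattr.h>

static const struct xattr_handler example_handler = {
	.prefix	= "user.example.",
	/* .list/.get/.set callbacks omitted in this sketch */
};

static const struct xattr_handler *example_handlers[] = {
	&example_handler,
	NULL,
};
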
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 9f8bd913c51e..215e12ce1d85 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1006,7 +1006,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1006 int count, status, i; 1006 int count, status, i;
1007 u16 suballoc_bit_start; 1007 u16 suballoc_bit_start;
1008 u32 num_got; 1008 u32 num_got;
1009 u64 first_blkno; 1009 u64 suballoc_loc, first_blkno;
1010 struct ocfs2_super *osb = 1010 struct ocfs2_super *osb =
1011 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); 1011 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
1012 struct ocfs2_extent_block *eb; 1012 struct ocfs2_extent_block *eb;
@@ -1015,10 +1015,10 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1015 1015
1016 count = 0; 1016 count = 0;
1017 while (count < wanted) { 1017 while (count < wanted) {
1018 status = ocfs2_claim_metadata(osb, 1018 status = ocfs2_claim_metadata(handle,
1019 handle,
1020 meta_ac, 1019 meta_ac,
1021 wanted - count, 1020 wanted - count,
1021 &suballoc_loc,
1022 &suballoc_bit_start, 1022 &suballoc_bit_start,
1023 &num_got, 1023 &num_got,
1024 &first_blkno); 1024 &first_blkno);
@@ -1052,6 +1052,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1052 eb->h_fs_generation = cpu_to_le32(osb->fs_generation); 1052 eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
1053 eb->h_suballoc_slot = 1053 eb->h_suballoc_slot =
1054 cpu_to_le16(meta_ac->ac_alloc_slot); 1054 cpu_to_le16(meta_ac->ac_alloc_slot);
1055 eb->h_suballoc_loc = cpu_to_le64(suballoc_loc);
1055 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1056 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1056 eb->h_list.l_count = 1057 eb->h_list.l_count =
1057 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); 1058 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -1061,11 +1062,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1061 1062
1062 /* We'll also be dirtied by the caller, so 1063 /* We'll also be dirtied by the caller, so
1063 * this isn't absolutely necessary. */ 1064 * this isn't absolutely necessary. */
1064 status = ocfs2_journal_dirty(handle, bhs[i]); 1065 ocfs2_journal_dirty(handle, bhs[i]);
1065 if (status < 0) {
1066 mlog_errno(status);
1067 goto bail;
1068 }
1069 } 1066 }
1070 1067
1071 count += num_got; 1068 count += num_got;
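
Most of the alloc.c churn below follows one mechanical pattern: ocfs2_journal_dirty() evidently no longer returns an error in this series, so every status-check-and-bail after it collapses into a bare call. The conversion, side by side:

	/* Before: */
	status = ocfs2_journal_dirty(handle, bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* After: the call no longer reports failure to the caller. */
	ocfs2_journal_dirty(handle, bh);
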
@@ -1129,8 +1126,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle,
1129 goto out; 1126 goto out;
1130 } 1127 }
1131 1128
1132 status = ocfs2_extend_trans(handle, path_num_items(path) + 1129 status = ocfs2_extend_trans(handle, path_num_items(path));
1133 handle->h_buffer_credits);
1134 if (status < 0) { 1130 if (status < 0) {
1135 mlog_errno(status); 1131 mlog_errno(status);
1136 goto out; 1132 goto out;
@@ -1270,12 +1266,7 @@ static int ocfs2_add_branch(handle_t *handle,
1270 if (!eb_el->l_tree_depth) 1266 if (!eb_el->l_tree_depth)
1271 new_last_eb_blk = le64_to_cpu(eb->h_blkno); 1267 new_last_eb_blk = le64_to_cpu(eb->h_blkno);
1272 1268
1273 status = ocfs2_journal_dirty(handle, bh); 1269 ocfs2_journal_dirty(handle, bh);
1274 if (status < 0) {
1275 mlog_errno(status);
1276 goto bail;
1277 }
1278
1279 next_blkno = le64_to_cpu(eb->h_blkno); 1270 next_blkno = le64_to_cpu(eb->h_blkno);
1280 } 1271 }
1281 1272
@@ -1321,17 +1312,10 @@ static int ocfs2_add_branch(handle_t *handle,
1321 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; 1312 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
1322 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); 1313 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
1323 1314
1324 status = ocfs2_journal_dirty(handle, *last_eb_bh); 1315 ocfs2_journal_dirty(handle, *last_eb_bh);
1325 if (status < 0) 1316 ocfs2_journal_dirty(handle, et->et_root_bh);
1326 mlog_errno(status); 1317 if (eb_bh)
1327 status = ocfs2_journal_dirty(handle, et->et_root_bh); 1318 ocfs2_journal_dirty(handle, eb_bh);
1328 if (status < 0)
1329 mlog_errno(status);
1330 if (eb_bh) {
1331 status = ocfs2_journal_dirty(handle, eb_bh);
1332 if (status < 0)
1333 mlog_errno(status);
1334 }
1335 1319
1336 /* 1320 /*
1337 * Some callers want to track the rightmost leaf so pass it 1321 * Some callers want to track the rightmost leaf so pass it
@@ -1399,11 +1383,7 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
1399 for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++) 1383 for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1400 eb_el->l_recs[i] = root_el->l_recs[i]; 1384 eb_el->l_recs[i] = root_el->l_recs[i];
1401 1385
1402 status = ocfs2_journal_dirty(handle, new_eb_bh); 1386 ocfs2_journal_dirty(handle, new_eb_bh);
1403 if (status < 0) {
1404 mlog_errno(status);
1405 goto bail;
1406 }
1407 1387
1408 status = ocfs2_et_root_journal_access(handle, et, 1388 status = ocfs2_et_root_journal_access(handle, et,
1409 OCFS2_JOURNAL_ACCESS_WRITE); 1389 OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1428,11 +1408,7 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
1428 if (root_el->l_tree_depth == cpu_to_le16(1)) 1408 if (root_el->l_tree_depth == cpu_to_le16(1))
1429 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); 1409 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
1430 1410
1431 status = ocfs2_journal_dirty(handle, et->et_root_bh); 1411 ocfs2_journal_dirty(handle, et->et_root_bh);
1432 if (status < 0) {
1433 mlog_errno(status);
1434 goto bail;
1435 }
1436 1412
1437 *ret_new_eb_bh = new_eb_bh; 1413 *ret_new_eb_bh = new_eb_bh;
1438 new_eb_bh = NULL; 1414 new_eb_bh = NULL;
@@ -2064,7 +2040,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
2064 struct ocfs2_path *right_path, 2040 struct ocfs2_path *right_path,
2065 int subtree_index) 2041 int subtree_index)
2066{ 2042{
2067 int ret, i, idx; 2043 int i, idx;
2068 struct ocfs2_extent_list *el, *left_el, *right_el; 2044 struct ocfs2_extent_list *el, *left_el, *right_el;
2069 struct ocfs2_extent_rec *left_rec, *right_rec; 2045 struct ocfs2_extent_rec *left_rec, *right_rec;
2070 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; 2046 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
@@ -2102,13 +2078,8 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
2102 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, 2078 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
2103 right_el); 2079 right_el);
2104 2080
2105 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh); 2081 ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
2106 if (ret) 2082 ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
2107 mlog_errno(ret);
2108
2109 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
2110 if (ret)
2111 mlog_errno(ret);
2112 2083
2113 /* 2084 /*
2114 * Setup our list pointers now so that the current 2085 * Setup our list pointers now so that the current
@@ -2132,9 +2103,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
2132 2103
2133 root_bh = left_path->p_node[subtree_index].bh; 2104 root_bh = left_path->p_node[subtree_index].bh;
2134 2105
2135 ret = ocfs2_journal_dirty(handle, root_bh); 2106 ocfs2_journal_dirty(handle, root_bh);
2136 if (ret)
2137 mlog_errno(ret);
2138} 2107}
2139 2108
2140static int ocfs2_rotate_subtree_right(handle_t *handle, 2109static int ocfs2_rotate_subtree_right(handle_t *handle,
@@ -2207,11 +2176,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
2207 2176
2208 ocfs2_create_empty_extent(right_el); 2177 ocfs2_create_empty_extent(right_el);
2209 2178
2210 ret = ocfs2_journal_dirty(handle, right_leaf_bh); 2179 ocfs2_journal_dirty(handle, right_leaf_bh);
2211 if (ret) {
2212 mlog_errno(ret);
2213 goto out;
2214 }
2215 2180
2216 /* Do the copy now. */ 2181 /* Do the copy now. */
2217 i = le16_to_cpu(left_el->l_next_free_rec) - 1; 2182 i = le16_to_cpu(left_el->l_next_free_rec) - 1;
@@ -2230,11 +2195,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
2230 memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 2195 memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2231 le16_add_cpu(&left_el->l_next_free_rec, 1); 2196 le16_add_cpu(&left_el->l_next_free_rec, 1);
2232 2197
2233 ret = ocfs2_journal_dirty(handle, left_leaf_bh); 2198 ocfs2_journal_dirty(handle, left_leaf_bh);
2234 if (ret) {
2235 mlog_errno(ret);
2236 goto out;
2237 }
2238 2199
2239 ocfs2_complete_edge_insert(handle, left_path, right_path, 2200 ocfs2_complete_edge_insert(handle, left_path, right_path,
2240 subtree_index); 2201 subtree_index);
@@ -2249,8 +2210,8 @@ out:
2249 * 2210 *
2250 * Will return zero if the path passed in is already the leftmost path. 2211 * Will return zero if the path passed in is already the leftmost path.
2251 */ 2212 */
2252static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, 2213int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
2253 struct ocfs2_path *path, u32 *cpos) 2214 struct ocfs2_path *path, u32 *cpos)
2254{ 2215{
2255 int i, j, ret = 0; 2216 int i, j, ret = 0;
2256 u64 blkno; 2217 u64 blkno;
@@ -2327,20 +2288,14 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
2327 int op_credits, 2288 int op_credits,
2328 struct ocfs2_path *path) 2289 struct ocfs2_path *path)
2329{ 2290{
2330 int ret; 2291 int ret = 0;
2331 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits; 2292 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
2332 2293
2333 if (handle->h_buffer_credits < credits) { 2294 if (handle->h_buffer_credits < credits)
2334 ret = ocfs2_extend_trans(handle, 2295 ret = ocfs2_extend_trans(handle,
2335 credits - handle->h_buffer_credits); 2296 credits - handle->h_buffer_credits);
2336 if (ret)
2337 return ret;
2338 2297
2339 if (unlikely(handle->h_buffer_credits < credits)) 2298 return ret;
2340 return ocfs2_extend_trans(handle, credits);
2341 }
2342
2343 return 0;
2344} 2299}
2345 2300
2346/* 2301/*
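
Several hunks here drop handle->h_buffer_credits from ocfs2_extend_trans() calls: judging by the conversions, the function now takes the number of additional credits wanted rather than a new absolute total. The caller-side pattern, as ocfs2_extend_rotate_transaction() now reads ('wanted' standing in for the computed total):

	/* Ask only for the shortfall beyond what the handle has. */
	if (handle->h_buffer_credits < wanted)
		ret = ocfs2_extend_trans(handle,
					 wanted - handle->h_buffer_credits);
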
@@ -2584,8 +2539,7 @@ static int ocfs2_update_edge_lengths(handle_t *handle,
2584 * records for all the bh in the path. 2539 * records for all the bh in the path.
2585 * So we have to allocate extra credits and access them. 2540 * So we have to allocate extra credits and access them.
2586 */ 2541 */
2587 ret = ocfs2_extend_trans(handle, 2542 ret = ocfs2_extend_trans(handle, subtree_index);
2588 handle->h_buffer_credits + subtree_index);
2589 if (ret) { 2543 if (ret) {
2590 mlog_errno(ret); 2544 mlog_errno(ret);
2591 goto out; 2545 goto out;
@@ -2823,12 +2777,8 @@ static int ocfs2_rotate_subtree_left(handle_t *handle,
2823 ocfs2_remove_empty_extent(right_leaf_el); 2777 ocfs2_remove_empty_extent(right_leaf_el);
2824 } 2778 }
2825 2779
2826 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); 2780 ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2827 if (ret) 2781 ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2828 mlog_errno(ret);
2829 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2830 if (ret)
2831 mlog_errno(ret);
2832 2782
2833 if (del_right_subtree) { 2783 if (del_right_subtree) {
2834 ocfs2_unlink_subtree(handle, et, left_path, right_path, 2784 ocfs2_unlink_subtree(handle, et, left_path, right_path,
@@ -2851,9 +2801,7 @@ static int ocfs2_rotate_subtree_left(handle_t *handle,
2851 if (right_has_empty) 2801 if (right_has_empty)
2852 ocfs2_remove_empty_extent(left_leaf_el); 2802 ocfs2_remove_empty_extent(left_leaf_el);
2853 2803
2854 ret = ocfs2_journal_dirty(handle, et_root_bh); 2804 ocfs2_journal_dirty(handle, et_root_bh);
2855 if (ret)
2856 mlog_errno(ret);
2857 2805
2858 *deleted = 1; 2806 *deleted = 1;
2859 } else 2807 } else
@@ -2962,10 +2910,7 @@ static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle,
2962 } 2910 }
2963 2911
2964 ocfs2_remove_empty_extent(el); 2912 ocfs2_remove_empty_extent(el);
2965 2913 ocfs2_journal_dirty(handle, bh);
2966 ret = ocfs2_journal_dirty(handle, bh);
2967 if (ret)
2968 mlog_errno(ret);
2969 2914
2970out: 2915out:
2971 return ret; 2916 return ret;
@@ -3506,15 +3451,9 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
3506 3451
3507 ocfs2_cleanup_merge(el, index); 3452 ocfs2_cleanup_merge(el, index);
3508 3453
3509 ret = ocfs2_journal_dirty(handle, bh); 3454 ocfs2_journal_dirty(handle, bh);
3510 if (ret)
3511 mlog_errno(ret);
3512
3513 if (right_path) { 3455 if (right_path) {
3514 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); 3456 ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
3515 if (ret)
3516 mlog_errno(ret);
3517
3518 ocfs2_complete_edge_insert(handle, left_path, right_path, 3457 ocfs2_complete_edge_insert(handle, left_path, right_path,
3519 subtree_index); 3458 subtree_index);
3520 } 3459 }
@@ -3683,14 +3622,9 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
3683 3622
3684 ocfs2_cleanup_merge(el, index); 3623 ocfs2_cleanup_merge(el, index);
3685 3624
3686 ret = ocfs2_journal_dirty(handle, bh); 3625 ocfs2_journal_dirty(handle, bh);
3687 if (ret)
3688 mlog_errno(ret);
3689
3690 if (left_path) { 3626 if (left_path) {
3691 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); 3627 ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3692 if (ret)
3693 mlog_errno(ret);
3694 3628
3695 /* 3629 /*
3696 * In the situation that the right_rec is empty and the extent 3630 * In the situation that the right_rec is empty and the extent
@@ -4016,10 +3950,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle,
4016 le32_add_cpu(&rec->e_int_clusters, 3950 le32_add_cpu(&rec->e_int_clusters,
4017 -le32_to_cpu(rec->e_cpos)); 3951 -le32_to_cpu(rec->e_cpos));
4018 3952
4019 ret = ocfs2_journal_dirty(handle, bh); 3953 ocfs2_journal_dirty(handle, bh);
4020 if (ret)
4021 mlog_errno(ret);
4022
4023 } 3954 }
4024} 3955}
4025 3956
@@ -4203,17 +4134,13 @@ static int ocfs2_insert_path(handle_t *handle,
4203 struct buffer_head *leaf_bh = path_leaf_bh(right_path); 4134 struct buffer_head *leaf_bh = path_leaf_bh(right_path);
4204 4135
4205 if (left_path) { 4136 if (left_path) {
4206 int credits = handle->h_buffer_credits;
4207
4208 /* 4137 /*
4209 * There's a chance that left_path got passed back to 4138 * There's a chance that left_path got passed back to
4210 * us without being accounted for in the 4139 * us without being accounted for in the
4211 * journal. Extend our transaction here to be sure we 4140 * journal. Extend our transaction here to be sure we
4212 * can change those blocks. 4141 * can change those blocks.
4213 */ 4142 */
4214 credits += left_path->p_tree_depth; 4143 ret = ocfs2_extend_trans(handle, left_path->p_tree_depth);
4215
4216 ret = ocfs2_extend_trans(handle, credits);
4217 if (ret < 0) { 4144 if (ret < 0) {
4218 mlog_errno(ret); 4145 mlog_errno(ret);
4219 goto out; 4146 goto out;
@@ -4251,17 +4178,13 @@ static int ocfs2_insert_path(handle_t *handle,
4251 * dirty this for us. 4178 * dirty this for us.
4252 */ 4179 */
4253 if (left_path) 4180 if (left_path)
4254 ret = ocfs2_journal_dirty(handle, 4181 ocfs2_journal_dirty(handle,
4255 path_leaf_bh(left_path)); 4182 path_leaf_bh(left_path));
4256 if (ret)
4257 mlog_errno(ret);
4258 } else 4183 } else
4259 ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path), 4184 ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
4260 insert); 4185 insert);
4261 4186
4262 ret = ocfs2_journal_dirty(handle, leaf_bh); 4187 ocfs2_journal_dirty(handle, leaf_bh);
4263 if (ret)
4264 mlog_errno(ret);
4265 4188
4266 if (left_path) { 4189 if (left_path) {
4267 /* 4190 /*
@@ -4384,9 +4307,7 @@ out_update_clusters:
4384 ocfs2_et_update_clusters(et, 4307 ocfs2_et_update_clusters(et,
4385 le16_to_cpu(insert_rec->e_leaf_clusters)); 4308 le16_to_cpu(insert_rec->e_leaf_clusters));
4386 4309
4387 ret = ocfs2_journal_dirty(handle, et->et_root_bh); 4310 ocfs2_journal_dirty(handle, et->et_root_bh);
4388 if (ret)
4389 mlog_errno(ret);
4390 4311
4391out: 4312out:
4392 ocfs2_free_path(left_path); 4313 ocfs2_free_path(left_path);
@@ -4866,7 +4787,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
4866 goto leave; 4787 goto leave;
4867 } 4788 }
4868 4789
4869 status = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 4790 status = __ocfs2_claim_clusters(handle, data_ac, 1,
4870 clusters_to_add, &bit_off, &num_bits); 4791 clusters_to_add, &bit_off, &num_bits);
4871 if (status < 0) { 4792 if (status < 0) {
4872 if (status != -ENOSPC) 4793 if (status != -ENOSPC)
@@ -4895,11 +4816,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
4895 goto leave; 4816 goto leave;
4896 } 4817 }
4897 4818
4898 status = ocfs2_journal_dirty(handle, et->et_root_bh); 4819 ocfs2_journal_dirty(handle, et->et_root_bh);
4899 if (status < 0) {
4900 mlog_errno(status);
4901 goto leave;
4902 }
4903 4820
4904 clusters_to_add -= num_bits; 4821 clusters_to_add -= num_bits;
4905 *logical_offset += num_bits; 4822 *logical_offset += num_bits;
@@ -5309,7 +5226,7 @@ static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
5309 int index, u32 new_range, 5226 int index, u32 new_range,
5310 struct ocfs2_alloc_context *meta_ac) 5227 struct ocfs2_alloc_context *meta_ac)
5311{ 5228{
5312 int ret, depth, credits = handle->h_buffer_credits; 5229 int ret, depth, credits;
5313 struct buffer_head *last_eb_bh = NULL; 5230 struct buffer_head *last_eb_bh = NULL;
5314 struct ocfs2_extent_block *eb; 5231 struct ocfs2_extent_block *eb;
5315 struct ocfs2_extent_list *rightmost_el, *el; 5232 struct ocfs2_extent_list *rightmost_el, *el;
@@ -5340,8 +5257,8 @@ static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
5340 } else 5257 } else
5341 rightmost_el = path_leaf_el(path); 5258 rightmost_el = path_leaf_el(path);
5342 5259
5343 credits += path->p_tree_depth + 5260 credits = path->p_tree_depth +
5344 ocfs2_extend_meta_needed(et->et_root_el); 5261 ocfs2_extend_meta_needed(et->et_root_el);
5345 ret = ocfs2_extend_trans(handle, credits); 5262 ret = ocfs2_extend_trans(handle, credits);
5346 if (ret) { 5263 if (ret) {
5347 mlog_errno(ret); 5264 mlog_errno(ret);
@@ -5671,19 +5588,97 @@ out:
5671 return ret; 5588 return ret;
5672} 5589}
5673 5590
5591/*
5592 * ocfs2_reserve_blocks_for_rec_trunc() is basically the same as
5593 * ocfs2_lock_allocators(), except that it accepts a count of
5594 * extra blocks to reserve, and it only handles metadata
5595 * allocations.
5596 *
5597 * Currently, only ocfs2_remove_btree_range() uses it for truncating
5598 * and punching holes.
5599 */
5600static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
5601 struct ocfs2_extent_tree *et,
5602 u32 extents_to_split,
5603 struct ocfs2_alloc_context **ac,
5604 int extra_blocks)
5605{
5606 int ret = 0, num_free_extents;
5607 unsigned int max_recs_needed = 2 * extents_to_split;
5608 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5609
5610 *ac = NULL;
5611
5612 num_free_extents = ocfs2_num_free_extents(osb, et);
5613 if (num_free_extents < 0) {
5614 ret = num_free_extents;
5615 mlog_errno(ret);
5616 goto out;
5617 }
5618
5619 if (!num_free_extents ||
5620 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
5621 extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
5622
5623 if (extra_blocks) {
5624 ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, ac);
5625 if (ret < 0) {
5626 if (ret != -ENOSPC)
5627 mlog_errno(ret);
5628 goto out;
5629 }
5630 }
5631
5632out:
5633 if (ret) {
5634 if (*ac) {
5635 ocfs2_free_alloc_context(*ac);
5636 *ac = NULL;
5637 }
5638 }
5639
5640 return ret;
5641}
5642
5674int ocfs2_remove_btree_range(struct inode *inode, 5643int ocfs2_remove_btree_range(struct inode *inode,
5675 struct ocfs2_extent_tree *et, 5644 struct ocfs2_extent_tree *et,
5676 u32 cpos, u32 phys_cpos, u32 len, 5645 u32 cpos, u32 phys_cpos, u32 len, int flags,
5677 struct ocfs2_cached_dealloc_ctxt *dealloc) 5646 struct ocfs2_cached_dealloc_ctxt *dealloc,
5647 u64 refcount_loc)
5678{ 5648{
5679 int ret; 5649 int ret, credits = 0, extra_blocks = 0;
5680 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); 5650 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
5681 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 5651 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5682 struct inode *tl_inode = osb->osb_tl_inode; 5652 struct inode *tl_inode = osb->osb_tl_inode;
5683 handle_t *handle; 5653 handle_t *handle;
5684 struct ocfs2_alloc_context *meta_ac = NULL; 5654 struct ocfs2_alloc_context *meta_ac = NULL;
5655 struct ocfs2_refcount_tree *ref_tree = NULL;
5656
5657 if ((flags & OCFS2_EXT_REFCOUNTED) && len) {
5658 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
5659 OCFS2_HAS_REFCOUNT_FL));
5660
5661 ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
5662 &ref_tree, NULL);
5663 if (ret) {
5664 mlog_errno(ret);
5665 goto out;
5666 }
5685 5667
5686 ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac); 5668 ret = ocfs2_prepare_refcount_change_for_del(inode,
5669 refcount_loc,
5670 phys_blkno,
5671 len,
5672 &credits,
5673 &extra_blocks);
5674 if (ret < 0) {
5675 mlog_errno(ret);
5676 goto out;
5677 }
5678 }
5679
5680 ret = ocfs2_reserve_blocks_for_rec_trunc(inode, et, 1, &meta_ac,
5681 extra_blocks);
5687 if (ret) { 5682 if (ret) {
5688 mlog_errno(ret); 5683 mlog_errno(ret);
5689 return ret; 5684 return ret;
@@ -5699,7 +5694,8 @@ int ocfs2_remove_btree_range(struct inode *inode,
5699 } 5694 }
5700 } 5695 }
5701 5696
5702 handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb)); 5697 handle = ocfs2_start_trans(osb,
5698 ocfs2_remove_extent_credits(osb->sb) + credits);
5703 if (IS_ERR(handle)) { 5699 if (IS_ERR(handle)) {
5704 ret = PTR_ERR(handle); 5700 ret = PTR_ERR(handle);
5705 mlog_errno(ret); 5701 mlog_errno(ret);
@@ -5724,15 +5720,22 @@ int ocfs2_remove_btree_range(struct inode *inode,
5724 5720
5725 ocfs2_et_update_clusters(et, -len); 5721 ocfs2_et_update_clusters(et, -len);
5726 5722
5727 ret = ocfs2_journal_dirty(handle, et->et_root_bh); 5723 ocfs2_journal_dirty(handle, et->et_root_bh);
5728 if (ret) {
5729 mlog_errno(ret);
5730 goto out_commit;
5731 }
5732 5724
5733 ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); 5725 if (phys_blkno) {
5734 if (ret) 5726 if (flags & OCFS2_EXT_REFCOUNTED)
5735 mlog_errno(ret); 5727 ret = ocfs2_decrease_refcount(inode, handle,
5728 ocfs2_blocks_to_clusters(osb->sb,
5729 phys_blkno),
5730 len, meta_ac,
5731 dealloc, 1);
5732 else
5733 ret = ocfs2_truncate_log_append(osb, handle,
5734 phys_blkno, len);
5735 if (ret)
5736 mlog_errno(ret);
5737
5738 }
5736 5739
5737out_commit: 5740out_commit:
5738 ocfs2_commit_trans(osb, handle); 5741 ocfs2_commit_trans(osb, handle);
@@ -5742,6 +5745,9 @@ out:
5742 if (meta_ac) 5745 if (meta_ac)
5743 ocfs2_free_alloc_context(meta_ac); 5746 ocfs2_free_alloc_context(meta_ac);
5744 5747
5748 if (ref_tree)
5749 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
5750
5745 return ret; 5751 return ret;
5746} 5752}
5747 5753
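
ocfs2_remove_btree_range() now carries the extent flags and the refcount tree location, so truncate and hole punching can route refcounted extents through ocfs2_decrease_refcount() instead of the truncate log, with the extra journal credits and metadata blocks reserved up front. A condensed sketch of the prep sequence in the new body (error handling and unlocks elided):

	if ((flags & OCFS2_EXT_REFCOUNTED) && len) {
		ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
					 &ref_tree, NULL);
		ocfs2_prepare_refcount_change_for_del(inode, refcount_loc,
						      phys_blkno, len,
						      &credits, &extra_blocks);
	}
	ocfs2_reserve_blocks_for_rec_trunc(inode, et, 1, &meta_ac,
					   extra_blocks);
	handle = ocfs2_start_trans(osb,
			ocfs2_remove_extent_credits(osb->sb) + credits);
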
@@ -5850,11 +5856,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5850 } 5856 }
5851 tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters); 5857 tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
5852 5858
5853 status = ocfs2_journal_dirty(handle, tl_bh); 5859 ocfs2_journal_dirty(handle, tl_bh);
5854 if (status < 0) {
5855 mlog_errno(status);
5856 goto bail;
5857 }
5858 5860
5859bail: 5861bail:
5860 mlog_exit(status); 5862 mlog_exit(status);
@@ -5893,11 +5895,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5893 5895
5894 tl->tl_used = cpu_to_le16(i); 5896 tl->tl_used = cpu_to_le16(i);
5895 5897
5896 status = ocfs2_journal_dirty(handle, tl_bh); 5898 ocfs2_journal_dirty(handle, tl_bh);
5897 if (status < 0) {
5898 mlog_errno(status);
5899 goto bail;
5900 }
5901 5899
5902 /* TODO: Perhaps we can calculate the bulk of the 5900 /* TODO: Perhaps we can calculate the bulk of the
5903 * credits up front rather than extending like 5901 * credits up front rather than extending like
@@ -6298,6 +6296,7 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
6298 */ 6296 */
6299struct ocfs2_cached_block_free { 6297struct ocfs2_cached_block_free {
6300 struct ocfs2_cached_block_free *free_next; 6298 struct ocfs2_cached_block_free *free_next;
6299 u64 free_bg;
6301 u64 free_blk; 6300 u64 free_blk;
6302 unsigned int free_bit; 6301 unsigned int free_bit;
6303}; 6302};
@@ -6344,8 +6343,11 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
6344 } 6343 }
6345 6344
6346 while (head) { 6345 while (head) {
6347 bg_blkno = ocfs2_which_suballoc_group(head->free_blk, 6346 if (head->free_bg)
6348 head->free_bit); 6347 bg_blkno = head->free_bg;
6348 else
6349 bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
6350 head->free_bit);
6349 mlog(0, "Free bit: (bit %u, blkno %llu)\n", 6351 mlog(0, "Free bit: (bit %u, blkno %llu)\n",
6350 head->free_bit, (unsigned long long)head->free_blk); 6352 head->free_bit, (unsigned long long)head->free_blk);
6351 6353
@@ -6393,7 +6395,7 @@ int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6393 int ret = 0; 6395 int ret = 0;
6394 struct ocfs2_cached_block_free *item; 6396 struct ocfs2_cached_block_free *item;
6395 6397
6396 item = kmalloc(sizeof(*item), GFP_NOFS); 6398 item = kzalloc(sizeof(*item), GFP_NOFS);
6397 if (item == NULL) { 6399 if (item == NULL) {
6398 ret = -ENOMEM; 6400 ret = -ENOMEM;
6399 mlog_errno(ret); 6401 mlog_errno(ret);
@@ -6533,8 +6535,8 @@ ocfs2_find_per_slot_free_list(int type,
6533} 6535}
6534 6536
6535int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 6537int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6536 int type, int slot, u64 blkno, 6538 int type, int slot, u64 suballoc,
6537 unsigned int bit) 6539 u64 blkno, unsigned int bit)
6538{ 6540{
6539 int ret; 6541 int ret;
6540 struct ocfs2_per_slot_free_list *fl; 6542 struct ocfs2_per_slot_free_list *fl;
@@ -6547,7 +6549,7 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6547 goto out; 6549 goto out;
6548 } 6550 }
6549 6551
6550 item = kmalloc(sizeof(*item), GFP_NOFS); 6552 item = kzalloc(sizeof(*item), GFP_NOFS);
6551 if (item == NULL) { 6553 if (item == NULL) {
6552 ret = -ENOMEM; 6554 ret = -ENOMEM;
6553 mlog_errno(ret); 6555 mlog_errno(ret);
@@ -6557,6 +6559,7 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6557 mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n", 6559 mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
6558 type, slot, bit, (unsigned long long)blkno); 6560 type, slot, bit, (unsigned long long)blkno);
6559 6561
6562 item->free_bg = suballoc;
6560 item->free_blk = blkno; 6563 item->free_blk = blkno;
6561 item->free_bit = bit; 6564 item->free_bit = bit;
6562 item->free_next = fl->f_first; 6565 item->free_next = fl->f_first;
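
ocfs2_cached_block_free grows a free_bg field recording the exact suballocator group a block came from — presumably because the group can no longer always be derived from the block number — and the new signature has the caller pass it in; ocfs2_free_cached_blocks() above falls back to ocfs2_which_suballoc_group() only when it is zero. The matching kmalloc-to-kzalloc switches keep free_bg zero for entries queued without a group, such as the cluster dealloc path:

	item = kzalloc(sizeof(*item), GFP_NOFS);	/* zeroes free_bg */
	if (!item)
		return -ENOMEM;
	item->free_blk = blkno;	/* free_bg == 0 => derive the group */
	item->free_bit = bit;
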
@@ -6573,433 +6576,11 @@ static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
6573{ 6576{
6574 return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE, 6577 return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
6575 le16_to_cpu(eb->h_suballoc_slot), 6578 le16_to_cpu(eb->h_suballoc_slot),
6579 le64_to_cpu(eb->h_suballoc_loc),
6576 le64_to_cpu(eb->h_blkno), 6580 le64_to_cpu(eb->h_blkno),
6577 le16_to_cpu(eb->h_suballoc_bit)); 6581 le16_to_cpu(eb->h_suballoc_bit));
6578} 6582}
6579 6583
6580/* This function will figure out whether the currently last extent
6581 * block will be deleted, and if it will, what the new last extent
6582 * block will be so we can update his h_next_leaf_blk field, as well
6583 * as the dinodes i_last_eb_blk */
6584static int ocfs2_find_new_last_ext_blk(struct inode *inode,
6585 unsigned int clusters_to_del,
6586 struct ocfs2_path *path,
6587 struct buffer_head **new_last_eb)
6588{
6589 int next_free, ret = 0;
6590 u32 cpos;
6591 struct ocfs2_extent_rec *rec;
6592 struct ocfs2_extent_block *eb;
6593 struct ocfs2_extent_list *el;
6594 struct buffer_head *bh = NULL;
6595
6596 *new_last_eb = NULL;
6597
6598 /* we have no tree, so of course, no last_eb. */
6599 if (!path->p_tree_depth)
6600 goto out;
6601
6602 /* trunc to zero special case - this makes tree_depth = 0
6603 * regardless of what it is. */
6604 if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
6605 goto out;
6606
6607 el = path_leaf_el(path);
6608 BUG_ON(!el->l_next_free_rec);
6609
6610 /*
6611 * Make sure that this extent list will actually be empty
6612 * after we clear away the data. We can shortcut out if
6613 * there's more than one non-empty extent in the
6614 * list. Otherwise, a check of the remaining extent is
6615 * necessary.
6616 */
6617 next_free = le16_to_cpu(el->l_next_free_rec);
6618 rec = NULL;
6619 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6620 if (next_free > 2)
6621 goto out;
6622
6623 /* We may have a valid extent in index 1, check it. */
6624 if (next_free == 2)
6625 rec = &el->l_recs[1];
6626
6627 /*
6628 * Fall through - no more nonempty extents, so we want
6629 * to delete this leaf.
6630 */
6631 } else {
6632 if (next_free > 1)
6633 goto out;
6634
6635 rec = &el->l_recs[0];
6636 }
6637
6638 if (rec) {
6639 /*
6640 * Check that we'll only be trimming off the end of this
6641 * cluster.
6642 */
6643 if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
6644 goto out;
6645 }
6646
6647 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
6648 if (ret) {
6649 mlog_errno(ret);
6650 goto out;
6651 }
6652
6653 ret = ocfs2_find_leaf(INODE_CACHE(inode), path_root_el(path), cpos, &bh);
6654 if (ret) {
6655 mlog_errno(ret);
6656 goto out;
6657 }
6658
6659 eb = (struct ocfs2_extent_block *) bh->b_data;
6660 el = &eb->h_list;
6661
6662 /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
6663 * Any corruption is a code bug. */
6664 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
6665
6666 *new_last_eb = bh;
6667 get_bh(*new_last_eb);
6668 mlog(0, "returning block %llu, (cpos: %u)\n",
6669 (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
6670out:
6671 brelse(bh);
6672
6673 return ret;
6674}
6675
6676/*
6677 * Trim some clusters off the rightmost edge of a tree. Only called
6678 * during truncate.
6679 *
6680 * The caller needs to:
6681 * - start journaling of each path component.
6682 * - compute and fully set up any new last ext block
6683 */
6684static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
6685 handle_t *handle, struct ocfs2_truncate_context *tc,
6686 u32 clusters_to_del, u64 *delete_start, u8 *flags)
6687{
6688 int ret, i, index = path->p_tree_depth;
6689 u32 new_edge = 0;
6690 u64 deleted_eb = 0;
6691 struct buffer_head *bh;
6692 struct ocfs2_extent_list *el;
6693 struct ocfs2_extent_rec *rec;
6694
6695 *delete_start = 0;
6696 *flags = 0;
6697
6698 while (index >= 0) {
6699 bh = path->p_node[index].bh;
6700 el = path->p_node[index].el;
6701
6702 mlog(0, "traveling tree (index = %d, block = %llu)\n",
6703 index, (unsigned long long)bh->b_blocknr);
6704
6705 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
6706
6707 if (index !=
6708 (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
6709 ocfs2_error(inode->i_sb,
6710 "Inode %lu has invalid ext. block %llu",
6711 inode->i_ino,
6712 (unsigned long long)bh->b_blocknr);
6713 ret = -EROFS;
6714 goto out;
6715 }
6716
6717find_tail_record:
6718 i = le16_to_cpu(el->l_next_free_rec) - 1;
6719 rec = &el->l_recs[i];
6720
6721 mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
6722 "next = %u\n", i, le32_to_cpu(rec->e_cpos),
6723 ocfs2_rec_clusters(el, rec),
6724 (unsigned long long)le64_to_cpu(rec->e_blkno),
6725 le16_to_cpu(el->l_next_free_rec));
6726
6727 BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
6728
6729 if (le16_to_cpu(el->l_tree_depth) == 0) {
6730 /*
6731 * If the leaf block contains a single empty
6732 * extent and no records, we can just remove
6733 * the block.
6734 */
6735 if (i == 0 && ocfs2_is_empty_extent(rec)) {
6736 memset(rec, 0,
6737 sizeof(struct ocfs2_extent_rec));
6738 el->l_next_free_rec = cpu_to_le16(0);
6739
6740 goto delete;
6741 }
6742
6743 /*
6744 * Remove any empty extents by shifting things
6745 * left. That should make life much easier on
6746 * the code below. This condition is rare
6747 * enough that we shouldn't see a performance
6748 * hit.
6749 */
6750 if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6751 le16_add_cpu(&el->l_next_free_rec, -1);
6752
6753 for(i = 0;
6754 i < le16_to_cpu(el->l_next_free_rec); i++)
6755 el->l_recs[i] = el->l_recs[i + 1];
6756
6757 memset(&el->l_recs[i], 0,
6758 sizeof(struct ocfs2_extent_rec));
6759
6760 /*
6761 * We've modified our extent list. The
6762 * simplest way to handle this change
6763 * is to begin the search from the
6764 * start again.
6765 */
6766 goto find_tail_record;
6767 }
6768
6769 le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
6770
6771 /*
6772 * We'll use "new_edge" on our way back up the
6773 * tree to know what our rightmost cpos is.
6774 */
6775 new_edge = le16_to_cpu(rec->e_leaf_clusters);
6776 new_edge += le32_to_cpu(rec->e_cpos);
6777
6778 /*
6779 * The caller will use this to delete data blocks.
6780 */
6781 *delete_start = le64_to_cpu(rec->e_blkno)
6782 + ocfs2_clusters_to_blocks(inode->i_sb,
6783 le16_to_cpu(rec->e_leaf_clusters));
6784 *flags = rec->e_flags;
6785
6786 /*
6787 * If it's now empty, remove this record.
6788 */
6789 if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
6790 memset(rec, 0,
6791 sizeof(struct ocfs2_extent_rec));
6792 le16_add_cpu(&el->l_next_free_rec, -1);
6793 }
6794 } else {
6795 if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
6796 memset(rec, 0,
6797 sizeof(struct ocfs2_extent_rec));
6798 le16_add_cpu(&el->l_next_free_rec, -1);
6799
6800 goto delete;
6801 }
6802
6803 /* Can this actually happen? */
6804 if (le16_to_cpu(el->l_next_free_rec) == 0)
6805 goto delete;
6806
6807 /*
6808 * We never actually deleted any clusters
6809 * because our leaf was empty. There's no
6810 * reason to adjust the rightmost edge then.
6811 */
6812 if (new_edge == 0)
6813 goto delete;
6814
6815 rec->e_int_clusters = cpu_to_le32(new_edge);
6816 le32_add_cpu(&rec->e_int_clusters,
6817 -le32_to_cpu(rec->e_cpos));
6818
6819 /*
6820 * A deleted child record should have been
6821 * caught above.
6822 */
6823 BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
6824 }
6825
6826delete:
6827 ret = ocfs2_journal_dirty(handle, bh);
6828 if (ret) {
6829 mlog_errno(ret);
6830 goto out;
6831 }
6832
6833 mlog(0, "extent list container %llu, after: record %d: "
6834 "(%u, %u, %llu), next = %u.\n",
6835 (unsigned long long)bh->b_blocknr, i,
6836 le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
6837 (unsigned long long)le64_to_cpu(rec->e_blkno),
6838 le16_to_cpu(el->l_next_free_rec));
6839
6840 /*
6841 * We must be careful to only attempt delete of an
6842 * extent block (and not the root inode block).
6843 */
6844 if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
6845 struct ocfs2_extent_block *eb =
6846 (struct ocfs2_extent_block *)bh->b_data;
6847
6848 /*
6849 * Save this for use when processing the
6850 * parent block.
6851 */
6852 deleted_eb = le64_to_cpu(eb->h_blkno);
6853
6854 mlog(0, "deleting this extent block.\n");
6855
6856 ocfs2_remove_from_cache(INODE_CACHE(inode), bh);
6857
6858 BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
6859 BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
6860 BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
6861
6862 ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
6863 /* An error here is not fatal. */
6864 if (ret < 0)
6865 mlog_errno(ret);
6866 } else {
6867 deleted_eb = 0;
6868 }
6869
6870 index--;
6871 }
6872
6873 ret = 0;
6874out:
6875 return ret;
6876}
6877
6878static int ocfs2_do_truncate(struct ocfs2_super *osb,
6879 unsigned int clusters_to_del,
6880 struct inode *inode,
6881 struct buffer_head *fe_bh,
6882 handle_t *handle,
6883 struct ocfs2_truncate_context *tc,
6884 struct ocfs2_path *path,
6885 struct ocfs2_alloc_context *meta_ac)
6886{
6887 int status;
6888 struct ocfs2_dinode *fe;
6889 struct ocfs2_extent_block *last_eb = NULL;
6890 struct ocfs2_extent_list *el;
6891 struct buffer_head *last_eb_bh = NULL;
6892 u64 delete_blk = 0;
6893 u8 rec_flags;
6894
6895 fe = (struct ocfs2_dinode *) fe_bh->b_data;
6896
6897 status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
6898 path, &last_eb_bh);
6899 if (status < 0) {
6900 mlog_errno(status);
6901 goto bail;
6902 }
6903
6904 /*
6905 * Each component will be touched, so we might as well journal
6906 * here to avoid having to handle errors later.
6907 */
6908 status = ocfs2_journal_access_path(INODE_CACHE(inode), handle, path);
6909 if (status < 0) {
6910 mlog_errno(status);
6911 goto bail;
6912 }
6913
6914 if (last_eb_bh) {
6915 status = ocfs2_journal_access_eb(handle, INODE_CACHE(inode), last_eb_bh,
6916 OCFS2_JOURNAL_ACCESS_WRITE);
6917 if (status < 0) {
6918 mlog_errno(status);
6919 goto bail;
6920 }
6921
6922 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6923 }
6924
6925 el = &(fe->id2.i_list);
6926
6927 /*
6928 * Lower levels depend on this never happening, but it's best
6929 * to check it up here before changing the tree.
6930 */
6931 if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
6932 ocfs2_error(inode->i_sb,
6933 "Inode %lu has an empty extent record, depth %u\n",
6934 inode->i_ino, le16_to_cpu(el->l_tree_depth));
6935 status = -EROFS;
6936 goto bail;
6937 }
6938
6939 dquot_free_space_nodirty(inode,
6940 ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
6941 spin_lock(&OCFS2_I(inode)->ip_lock);
6942 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
6943 clusters_to_del;
6944 spin_unlock(&OCFS2_I(inode)->ip_lock);
6945 le32_add_cpu(&fe->i_clusters, -clusters_to_del);
6946 inode->i_blocks = ocfs2_inode_sector_count(inode);
6947
6948 status = ocfs2_trim_tree(inode, path, handle, tc,
6949 clusters_to_del, &delete_blk, &rec_flags);
6950 if (status) {
6951 mlog_errno(status);
6952 goto bail;
6953 }
6954
6955 if (le32_to_cpu(fe->i_clusters) == 0) {
6956 /* trunc to zero is a special case. */
6957 el->l_tree_depth = 0;
6958 fe->i_last_eb_blk = 0;
6959 } else if (last_eb)
6960 fe->i_last_eb_blk = last_eb->h_blkno;
6961
6962 status = ocfs2_journal_dirty(handle, fe_bh);
6963 if (status < 0) {
6964 mlog_errno(status);
6965 goto bail;
6966 }
6967
6968 if (last_eb) {
6969 /* If there will be a new last extent block, then by
6970 * definition, there cannot be any leaves to the right of
6971 * him. */
6972 last_eb->h_next_leaf_blk = 0;
6973 status = ocfs2_journal_dirty(handle, last_eb_bh);
6974 if (status < 0) {
6975 mlog_errno(status);
6976 goto bail;
6977 }
6978 }
6979
6980 if (delete_blk) {
6981 if (rec_flags & OCFS2_EXT_REFCOUNTED)
6982 status = ocfs2_decrease_refcount(inode, handle,
6983 ocfs2_blocks_to_clusters(osb->sb,
6984 delete_blk),
6985 clusters_to_del, meta_ac,
6986 &tc->tc_dealloc, 1);
6987 else
6988 status = ocfs2_truncate_log_append(osb, handle,
6989 delete_blk,
6990 clusters_to_del);
6991 if (status < 0) {
6992 mlog_errno(status);
6993 goto bail;
6994 }
6995 }
6996 status = 0;
6997bail:
6998 brelse(last_eb_bh);
6999 mlog_exit(status);
7000 return status;
7001}
7002
7003static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh) 6584static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
7004{ 6585{
7005 set_buffer_uptodate(bh); 6586 set_buffer_uptodate(bh);
@@ -7307,7 +6888,9 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
7307 goto out_commit; 6888 goto out_commit;
7308 did_quota = 1; 6889 did_quota = 1;
7309 6890
7310 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, 6891 data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
6892
6893 ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
7311 &num); 6894 &num);
7312 if (ret) { 6895 if (ret) {
7313 mlog_errno(ret); 6896 mlog_errno(ret);
@@ -7406,26 +6989,29 @@ out:
7406 */ 6989 */
7407int ocfs2_commit_truncate(struct ocfs2_super *osb, 6990int ocfs2_commit_truncate(struct ocfs2_super *osb,
7408 struct inode *inode, 6991 struct inode *inode,
7409 struct buffer_head *fe_bh, 6992 struct buffer_head *di_bh)
7410 struct ocfs2_truncate_context *tc)
7411{ 6993{
7412 int status, i, credits, tl_sem = 0; 6994 int status = 0, i, flags = 0;
7413 u32 clusters_to_del, new_highest_cpos, range; 6995 u32 new_highest_cpos, range, trunc_cpos, trunc_len, phys_cpos, coff;
7414 u64 blkno = 0; 6996 u64 blkno = 0;
7415 struct ocfs2_extent_list *el; 6997 struct ocfs2_extent_list *el;
7416 handle_t *handle = NULL; 6998 struct ocfs2_extent_rec *rec;
7417 struct inode *tl_inode = osb->osb_tl_inode;
7418 struct ocfs2_path *path = NULL; 6999 struct ocfs2_path *path = NULL;
7419 struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; 7000 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7420 struct ocfs2_alloc_context *meta_ac = NULL; 7001 struct ocfs2_extent_list *root_el = &(di->id2.i_list);
7421 struct ocfs2_refcount_tree *ref_tree = NULL; 7002 u64 refcount_loc = le64_to_cpu(di->i_refcount_loc);
7003 struct ocfs2_extent_tree et;
7004 struct ocfs2_cached_dealloc_ctxt dealloc;
7422 7005
7423 mlog_entry_void(); 7006 mlog_entry_void();
7424 7007
7008 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
7009 ocfs2_init_dealloc_ctxt(&dealloc);
7010
7425 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, 7011 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
7426 i_size_read(inode)); 7012 i_size_read(inode));
7427 7013
7428 path = ocfs2_new_path(fe_bh, &di->id2.i_list, 7014 path = ocfs2_new_path(di_bh, &di->id2.i_list,
7429 ocfs2_journal_access_di); 7015 ocfs2_journal_access_di);
7430 if (!path) { 7016 if (!path) {
7431 status = -ENOMEM; 7017 status = -ENOMEM;
@@ -7444,8 +7030,6 @@ start:
7444 goto bail; 7030 goto bail;
7445 } 7031 }
7446 7032
7447 credits = 0;
7448
7449 /* 7033 /*
7450 * Truncate always works against the rightmost tree branch. 7034 * Truncate always works against the rightmost tree branch.
7451 */ 7035 */
@@ -7480,101 +7064,62 @@ start:
7480 } 7064 }
7481 7065
7482 i = le16_to_cpu(el->l_next_free_rec) - 1; 7066 i = le16_to_cpu(el->l_next_free_rec) - 1;
7483 range = le32_to_cpu(el->l_recs[i].e_cpos) + 7067 rec = &el->l_recs[i];
7484 ocfs2_rec_clusters(el, &el->l_recs[i]); 7068 flags = rec->e_flags;
7485 if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) { 7069 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
7486 clusters_to_del = 0; 7070
7487 } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) { 7071 if (i == 0 && ocfs2_is_empty_extent(rec)) {
7488 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]); 7072 /*
7489 blkno = le64_to_cpu(el->l_recs[i].e_blkno); 7073 * Lower levels depend on this never happening, but it's best
7074 * to check it up here before changing the tree.
7075 */
7076 if (root_el->l_tree_depth && rec->e_int_clusters == 0) {
7077 ocfs2_error(inode->i_sb, "Inode %lu has an empty "
7078 "extent record, depth %u\n", inode->i_ino,
7079 le16_to_cpu(root_el->l_tree_depth));
7080 status = -EROFS;
7081 goto bail;
7082 }
7083 trunc_cpos = le32_to_cpu(rec->e_cpos);
7084 trunc_len = 0;
7085 blkno = 0;
7086 } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) {
7087 /*
7088 * Truncate entire record.
7089 */
7090 trunc_cpos = le32_to_cpu(rec->e_cpos);
7091 trunc_len = ocfs2_rec_clusters(el, rec);
7092 blkno = le64_to_cpu(rec->e_blkno);
7490 } else if (range > new_highest_cpos) { 7093 } else if (range > new_highest_cpos) {
7491 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) + 7094 /*
7492 le32_to_cpu(el->l_recs[i].e_cpos)) - 7095 * Partial truncate. it also should be
7493 new_highest_cpos; 7096 * the last truncate we're doing.
7494 blkno = le64_to_cpu(el->l_recs[i].e_blkno) + 7097 */
7495 ocfs2_clusters_to_blocks(inode->i_sb, 7098 trunc_cpos = new_highest_cpos;
7496 ocfs2_rec_clusters(el, &el->l_recs[i]) - 7099 trunc_len = range - new_highest_cpos;
7497 clusters_to_del); 7100 coff = new_highest_cpos - le32_to_cpu(rec->e_cpos);
7101 blkno = le64_to_cpu(rec->e_blkno) +
7102 ocfs2_clusters_to_blocks(inode->i_sb, coff);
7498 } else { 7103 } else {
7104 /*
7105 * Truncate completed, leave happily.
7106 */
7499 status = 0; 7107 status = 0;
7500 goto bail; 7108 goto bail;
7501 } 7109 }
7502 7110
7503 mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n", 7111 phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
7504 clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
7505
7506 if (el->l_recs[i].e_flags & OCFS2_EXT_REFCOUNTED && clusters_to_del) {
7507 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
7508 OCFS2_HAS_REFCOUNT_FL));
7509
7510 status = ocfs2_lock_refcount_tree(osb,
7511 le64_to_cpu(di->i_refcount_loc),
7512 1, &ref_tree, NULL);
7513 if (status) {
7514 mlog_errno(status);
7515 goto bail;
7516 }
7517
7518 status = ocfs2_prepare_refcount_change_for_del(inode, fe_bh,
7519 blkno,
7520 clusters_to_del,
7521 &credits,
7522 &meta_ac);
7523 if (status < 0) {
7524 mlog_errno(status);
7525 goto bail;
7526 }
7527 }
7528
7529 mutex_lock(&tl_inode->i_mutex);
7530 tl_sem = 1;
7531 /* ocfs2_truncate_log_needs_flush guarantees us at least one
7532 * record is free for use. If there isn't any, we flush to get
7533 * an empty truncate log. */
7534 if (ocfs2_truncate_log_needs_flush(osb)) {
7535 status = __ocfs2_flush_truncate_log(osb);
7536 if (status < 0) {
7537 mlog_errno(status);
7538 goto bail;
7539 }
7540 }
7541 7112
7542 credits += ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, 7113 status = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
7543 (struct ocfs2_dinode *)fe_bh->b_data, 7114 phys_cpos, trunc_len, flags, &dealloc,
7544 el); 7115 refcount_loc);
7545 handle = ocfs2_start_trans(osb, credits);
7546 if (IS_ERR(handle)) {
7547 status = PTR_ERR(handle);
7548 handle = NULL;
7549 mlog_errno(status);
7550 goto bail;
7551 }
7552
7553 status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
7554 tc, path, meta_ac);
7555 if (status < 0) { 7116 if (status < 0) {
7556 mlog_errno(status); 7117 mlog_errno(status);
7557 goto bail; 7118 goto bail;
7558 } 7119 }
7559 7120
7560 mutex_unlock(&tl_inode->i_mutex);
7561 tl_sem = 0;
7562
7563 ocfs2_commit_trans(osb, handle);
7564 handle = NULL;
7565
7566 ocfs2_reinit_path(path, 1); 7121 ocfs2_reinit_path(path, 1);
7567 7122
7568 if (meta_ac) {
7569 ocfs2_free_alloc_context(meta_ac);
7570 meta_ac = NULL;
7571 }
7572
7573 if (ref_tree) {
7574 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
7575 ref_tree = NULL;
7576 }
7577
7578 /* 7123 /*
7579 * The check above will catch the case where we've truncated 7124 * The check above will catch the case where we've truncated
7580 * away all allocation. 7125 * away all allocation.
@@ -7585,25 +7130,10 @@ bail:
7585 7130
7586 ocfs2_schedule_truncate_log_flush(osb, 1); 7131 ocfs2_schedule_truncate_log_flush(osb, 1);
7587 7132
7588 if (tl_sem) 7133 ocfs2_run_deallocs(osb, &dealloc);
7589 mutex_unlock(&tl_inode->i_mutex);
7590
7591 if (handle)
7592 ocfs2_commit_trans(osb, handle);
7593
7594 if (meta_ac)
7595 ocfs2_free_alloc_context(meta_ac);
7596
7597 if (ref_tree)
7598 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
7599
7600 ocfs2_run_deallocs(osb, &tc->tc_dealloc);
7601 7134
7602 ocfs2_free_path(path); 7135 ocfs2_free_path(path);
7603 7136
7604 /* This will drop the ext_alloc cluster lock for us */
7605 ocfs2_free_truncate_context(tc);
7606
7607 mlog_exit(status); 7137 mlog_exit(status);
7608 return status; 7138 return status;
7609} 7139}
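
The net effect in ocfs2_commit_truncate(): the hand-rolled credit math, truncate-log locking, refcount-tree handling and ocfs2_do_truncate() call are all gone, replaced by a loop that classifies the rightmost record into (trunc_cpos, trunc_len, blkno) and hands it to ocfs2_remove_btree_range(), which now does the transaction work itself. A condensed sketch of the resulting control flow (the real code uses a start label; error paths elided):

	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
	ocfs2_init_dealloc_ctxt(&dealloc);

	for (;;) {
		/* Walk to the rightmost leaf and classify its tail
		 * record into one of the three cases above; stop once
		 * the tree is trimmed down to new_highest_cpos. */
		phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
		status = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
						  phys_cpos, trunc_len,
						  flags, &dealloc,
						  refcount_loc);
		ocfs2_reinit_path(path, 1);
	}

	ocfs2_schedule_truncate_log_flush(osb, 1);
	ocfs2_run_deallocs(osb, &dealloc);
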
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 1db4359ccb90..55762b554b99 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -140,8 +140,9 @@ int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et,
140 struct ocfs2_cached_dealloc_ctxt *dealloc); 140 struct ocfs2_cached_dealloc_ctxt *dealloc);
141int ocfs2_remove_btree_range(struct inode *inode, 141int ocfs2_remove_btree_range(struct inode *inode,
142 struct ocfs2_extent_tree *et, 142 struct ocfs2_extent_tree *et,
143 u32 cpos, u32 phys_cpos, u32 len, 143 u32 cpos, u32 phys_cpos, u32 len, int flags,
144 struct ocfs2_cached_dealloc_ctxt *dealloc); 144 struct ocfs2_cached_dealloc_ctxt *dealloc,
145 u64 refcount_loc);
145 146
146int ocfs2_num_free_extents(struct ocfs2_super *osb, 147int ocfs2_num_free_extents(struct ocfs2_super *osb,
147 struct ocfs2_extent_tree *et); 148 struct ocfs2_extent_tree *et);
@@ -209,7 +210,7 @@ static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
209int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 210int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
210 u64 blkno, unsigned int bit); 211 u64 blkno, unsigned int bit);
211int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 212int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
212 int type, int slot, u64 blkno, 213 int type, int slot, u64 suballoc, u64 blkno,
213 unsigned int bit); 214 unsigned int bit);
214static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c) 215static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
215{ 216{
@@ -233,8 +234,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
233 struct ocfs2_truncate_context **tc); 234 struct ocfs2_truncate_context **tc);
234int ocfs2_commit_truncate(struct ocfs2_super *osb, 235int ocfs2_commit_truncate(struct ocfs2_super *osb,
235 struct inode *inode, 236 struct inode *inode,
236 struct buffer_head *fe_bh, 237 struct buffer_head *di_bh);
237 struct ocfs2_truncate_context *tc);
238int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, 238int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
239 unsigned int start, unsigned int end, int trunc); 239 unsigned int start, unsigned int end, int trunc);
240 240
@@ -319,6 +319,8 @@ int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
319 struct ocfs2_path *path); 319 struct ocfs2_path *path);
320int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, 320int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
321 struct ocfs2_path *path, u32 *cpos); 321 struct ocfs2_path *path, u32 *cpos);
322int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
323 struct ocfs2_path *path, u32 *cpos);
322int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, 324int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
323 struct ocfs2_path *left, 325 struct ocfs2_path *left,
324 struct ocfs2_path *right); 326 struct ocfs2_path *right);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 21441ddb5506..3623ca20cc18 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1735,6 +1735,9 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1735 goto out; 1735 goto out;
1736 } 1736 }
1737 1737
1738 if (data_ac)
1739 data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
1740
1738 credits = ocfs2_calc_extend_credits(inode->i_sb, 1741 credits = ocfs2_calc_extend_credits(inode->i_sb,
1739 &di->id2.i_list, 1742 &di->id2.i_list,
1740 clusters_to_alloc); 1743 clusters_to_alloc);
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index b7428c5d0d3b..ec6d12339593 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -403,7 +403,7 @@ void ocfs2_block_check_compute(void *data, size_t blocksize,
403 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no 403 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
404 * larger than 16 bits. 404 * larger than 16 bits.
405 */ 405 */
406 BUG_ON(ecc > USHORT_MAX); 406 BUG_ON(ecc > USHRT_MAX);
407 407
408 bc->bc_crc32e = cpu_to_le32(crc); 408 bc->bc_crc32e = cpu_to_le32(crc);
409 bc->bc_ecc = cpu_to_le16((u16)ecc); 409 bc->bc_ecc = cpu_to_le16((u16)ecc);
@@ -508,7 +508,7 @@ void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
508 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no 508 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
509 * larger than 16 bits. 509 * larger than 16 bits.
510 */ 510 */
511 BUG_ON(ecc > USHORT_MAX); 511 BUG_ON(ecc > USHRT_MAX);
512 512
513 bc->bc_crc32e = cpu_to_le32(crc); 513 bc->bc_crc32e = cpu_to_le32(crc);
514 bc->bc_ecc = cpu_to_le16((u16)ecc); 514 bc->bc_ecc = cpu_to_le16((u16)ecc);
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 3bb928a2bf7d..c7fba396392d 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -116,6 +116,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
116 define_mask(ERROR), 116 define_mask(ERROR),
117 define_mask(NOTICE), 117 define_mask(NOTICE),
118 define_mask(KTHREAD), 118 define_mask(KTHREAD),
119 define_mask(RESERVATIONS),
119}; 120};
120 121
121static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; 122static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 3dfddbec32f2..fd96e2a2fa56 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -119,6 +119,7 @@
119#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ 119#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
120#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */ 120#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */
121#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */ 121#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */
122#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */
122 123
123#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) 124#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
124#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT) 125#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
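
Adding a log class is two lines: a new ULL bit in masklog.h and a define_mask() entry in masklog.c so it shows up as a toggleable attribute. A rough standalone illustration of how such a bit gates output (the real mlog() also applies a NOT mask and per-file defaults; everything below is simplified):

#include <stdio.h>
#include <stdint.h>

#define ML_KTHREAD       0x0000000400000000ULL
#define ML_RESERVATIONS  0x0000000800000000ULL	/* the bit added here */

static uint64_t mlog_and_bits = ML_RESERVATIONS;	/* enabled classes */

static void mlog(uint64_t mask, const char *msg)
{
	if (mask & mlog_and_bits)
		printf("%s\n", msg);
}

int main(void)
{
	mlog(ML_RESERVATIONS, "reservation window moved");	/* printed */
	mlog(ML_KTHREAD, "kthread woke");			/* filtered */
	return 0;
}
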
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 73e743eea2c8..aa75ca3f78da 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -583,6 +583,9 @@ static void o2net_state_change(struct sock *sk)
583 o2net_sc_queue_work(sc, &sc->sc_connect_work); 583 o2net_sc_queue_work(sc, &sc->sc_connect_work);
584 break; 584 break;
585 default: 585 default:
586 printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT
587 " shutdown, state %d\n",
588 SC_NODEF_ARGS(sc), sk->sk_state);
586 o2net_sc_queue_work(sc, &sc->sc_shutdown_work); 589 o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
587 break; 590 break;
588 } 591 }
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index efd77d071c80..f04ebcfffc4a 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1194,7 +1194,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
1194 else 1194 else
1195 de->inode = 0; 1195 de->inode = 0;
1196 dir->i_version++; 1196 dir->i_version++;
1197 status = ocfs2_journal_dirty(handle, bh); 1197 ocfs2_journal_dirty(handle, bh);
1198 goto bail; 1198 goto bail;
1199 } 1199 }
1200 i += le16_to_cpu(de->rec_len); 1200 i += le16_to_cpu(de->rec_len);
@@ -1752,7 +1752,7 @@ int __ocfs2_add_entry(handle_t *handle,
1752 ocfs2_recalc_free_list(dir, handle, lookup); 1752 ocfs2_recalc_free_list(dir, handle, lookup);
1753 1753
1754 dir->i_version++; 1754 dir->i_version++;
1755 status = ocfs2_journal_dirty(handle, insert_bh); 1755 ocfs2_journal_dirty(handle, insert_bh);
1756 retval = 0; 1756 retval = 0;
1757 goto bail; 1757 goto bail;
1758 } 1758 }
@@ -2297,12 +2297,7 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
2297 } 2297 }
2298 2298
2299 ocfs2_fill_initial_dirents(inode, parent, data->id_data, size); 2299 ocfs2_fill_initial_dirents(inode, parent, data->id_data, size);
2300
2301 ocfs2_journal_dirty(handle, di_bh); 2300 ocfs2_journal_dirty(handle, di_bh);
2302 if (ret) {
2303 mlog_errno(ret);
2304 goto out;
2305 }
2306 2301
2307 i_size_write(inode, size); 2302 i_size_write(inode, size);
2308 inode->i_nlink = 2; 2303 inode->i_nlink = 2;
@@ -2366,11 +2361,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
2366 ocfs2_init_dir_trailer(inode, new_bh, size); 2361 ocfs2_init_dir_trailer(inode, new_bh, size);
2367 } 2362 }
2368 2363
2369 status = ocfs2_journal_dirty(handle, new_bh); 2364 ocfs2_journal_dirty(handle, new_bh);
2370 if (status < 0) {
2371 mlog_errno(status);
2372 goto bail;
2373 }
2374 2365
2375 i_size_write(inode, inode->i_sb->s_blocksize); 2366 i_size_write(inode, inode->i_sb->s_blocksize);
2376 inode->i_nlink = 2; 2367 inode->i_nlink = 2;
@@ -2404,15 +2395,15 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2404 int ret; 2395 int ret;
2405 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; 2396 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
2406 u16 dr_suballoc_bit; 2397 u16 dr_suballoc_bit;
2407 u64 dr_blkno; 2398 u64 suballoc_loc, dr_blkno;
2408 unsigned int num_bits; 2399 unsigned int num_bits;
2409 struct buffer_head *dx_root_bh = NULL; 2400 struct buffer_head *dx_root_bh = NULL;
2410 struct ocfs2_dx_root_block *dx_root; 2401 struct ocfs2_dx_root_block *dx_root;
2411 struct ocfs2_dir_block_trailer *trailer = 2402 struct ocfs2_dir_block_trailer *trailer =
2412 ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb); 2403 ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
2413 2404
2414 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, &dr_suballoc_bit, 2405 ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
2415 &num_bits, &dr_blkno); 2406 &dr_suballoc_bit, &num_bits, &dr_blkno);
2416 if (ret) { 2407 if (ret) {
2417 mlog_errno(ret); 2408 mlog_errno(ret);
2418 goto out; 2409 goto out;
@@ -2440,6 +2431,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2440 memset(dx_root, 0, osb->sb->s_blocksize); 2431 memset(dx_root, 0, osb->sb->s_blocksize);
2441 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE); 2432 strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
2442 dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); 2433 dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
2434 dx_root->dr_suballoc_loc = cpu_to_le64(suballoc_loc);
2443 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit); 2435 dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
2444 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation); 2436 dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
2445 dx_root->dr_blkno = cpu_to_le64(dr_blkno); 2437 dx_root->dr_blkno = cpu_to_le64(dr_blkno);
@@ -2458,10 +2450,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2458 dx_root->dr_list.l_count = 2450 dx_root->dr_list.l_count =
2459 cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb)); 2451 cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
2460 } 2452 }
2461 2453 ocfs2_journal_dirty(handle, dx_root_bh);
2462 ret = ocfs2_journal_dirty(handle, dx_root_bh);
2463 if (ret)
2464 mlog_errno(ret);
2465 2454
2466 ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh, 2455 ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
2467 OCFS2_JOURNAL_ACCESS_CREATE); 2456 OCFS2_JOURNAL_ACCESS_CREATE);
@@ -2475,9 +2464,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2475 OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL; 2464 OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
2476 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); 2465 di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
2477 2466
2478 ret = ocfs2_journal_dirty(handle, di_bh); 2467 ocfs2_journal_dirty(handle, di_bh);
2479 if (ret)
2480 mlog_errno(ret);
2481 2468
2482 *ret_dx_root_bh = dx_root_bh; 2469 *ret_dx_root_bh = dx_root_bh;
2483 dx_root_bh = NULL; 2470 dx_root_bh = NULL;
@@ -2558,7 +2545,7 @@ static int __ocfs2_dx_dir_new_cluster(struct inode *dir,
2558 * chance of contiguousness as the directory grows in number 2545 * chance of contiguousness as the directory grows in number
2559 * of entries. 2546 * of entries.
2560 */ 2547 */
2561 ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 1, &phys, &num); 2548 ret = __ocfs2_claim_clusters(handle, data_ac, 1, 1, &phys, &num);
2562 if (ret) { 2549 if (ret) {
2563 mlog_errno(ret); 2550 mlog_errno(ret);
2564 goto out; 2551 goto out;
@@ -2991,7 +2978,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
2991 * if we only get one now, that's enough to continue. The rest 2978 * if we only get one now, that's enough to continue. The rest
2992 * will be claimed after the conversion to extents. 2979 * will be claimed after the conversion to extents.
2993 */ 2980 */
2994 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len); 2981 if (ocfs2_dir_resv_allowed(osb))
2982 data_ac->ac_resv = &oi->ip_la_data_resv;
2983 ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &len);
2995 if (ret) { 2984 if (ret) {
2996 mlog_errno(ret); 2985 mlog_errno(ret);
2997 goto out_commit; 2986 goto out_commit;
@@ -3034,11 +3023,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3034 ocfs2_init_dir_trailer(dir, dirdata_bh, i); 3023 ocfs2_init_dir_trailer(dir, dirdata_bh, i);
3035 } 3024 }
3036 3025
3037 ret = ocfs2_journal_dirty(handle, dirdata_bh); 3026 ocfs2_journal_dirty(handle, dirdata_bh);
3038 if (ret) {
3039 mlog_errno(ret);
3040 goto out_commit;
3041 }
3042 3027
3043 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) { 3028 if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
3044 /* 3029 /*
@@ -3104,11 +3089,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3104 */ 3089 */
3105 dir->i_blocks = ocfs2_inode_sector_count(dir); 3090 dir->i_blocks = ocfs2_inode_sector_count(dir);
3106 3091
3107 ret = ocfs2_journal_dirty(handle, di_bh); 3092 ocfs2_journal_dirty(handle, di_bh);
3108 if (ret) {
3109 mlog_errno(ret);
3110 goto out_commit;
3111 }
3112 3093
3113 if (ocfs2_supports_indexed_dirs(osb)) { 3094 if (ocfs2_supports_indexed_dirs(osb)) {
3114 ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh, 3095 ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh,
@@ -3138,7 +3119,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3138 * pass. Claim the 2nd cluster as a separate extent. 3119 * pass. Claim the 2nd cluster as a separate extent.
3139 */ 3120 */
3140 if (alloc > len) { 3121 if (alloc > len) {
3141 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, 3122 ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
3142 &len); 3123 &len);
3143 if (ret) { 3124 if (ret) {
3144 mlog_errno(ret); 3125 mlog_errno(ret);
@@ -3369,6 +3350,9 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
3369 goto bail; 3350 goto bail;
3370 } 3351 }
3371 3352
3353 if (ocfs2_dir_resv_allowed(osb))
3354 data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv;
3355
3372 credits = ocfs2_calc_extend_credits(sb, el, 1); 3356 credits = ocfs2_calc_extend_credits(sb, el, 1);
3373 } else { 3357 } else {
3374 spin_unlock(&OCFS2_I(dir)->ip_lock); 3358 spin_unlock(&OCFS2_I(dir)->ip_lock);
@@ -3423,11 +3407,7 @@ do_extend:
3423 } else { 3407 } else {
3424 de->rec_len = cpu_to_le16(sb->s_blocksize); 3408 de->rec_len = cpu_to_le16(sb->s_blocksize);
3425 } 3409 }
3426 status = ocfs2_journal_dirty(handle, new_bh); 3410 ocfs2_journal_dirty(handle, new_bh);
3427 if (status < 0) {
3428 mlog_errno(status);
3429 goto bail;
3430 }
3431 3411
3432 dir_i_size += dir->i_sb->s_blocksize; 3412 dir_i_size += dir->i_sb->s_blocksize;
3433 i_size_write(dir, dir_i_size); 3413 i_size_write(dir, dir_i_size);
@@ -3906,11 +3886,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3906 sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp, 3886 sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
3907 dx_leaf_sort_swap); 3887 dx_leaf_sort_swap);
3908 3888
3909 ret = ocfs2_journal_dirty(handle, dx_leaf_bh); 3889 ocfs2_journal_dirty(handle, dx_leaf_bh);
3910 if (ret) {
3911 mlog_errno(ret);
3912 goto out_commit;
3913 }
3914 3890
3915 ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash, 3891 ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash,
3916 &split_hash); 3892 &split_hash);
@@ -4490,7 +4466,10 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
4490 4466
4491 blk = le64_to_cpu(dx_root->dr_blkno); 4467 blk = le64_to_cpu(dx_root->dr_blkno);
4492 bit = le16_to_cpu(dx_root->dr_suballoc_bit); 4468 bit = le16_to_cpu(dx_root->dr_suballoc_bit);
4493 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 4469 if (dx_root->dr_suballoc_loc)
4470 bg_blkno = le64_to_cpu(dx_root->dr_suballoc_loc);
4471 else
4472 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
4494 ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh, 4473 ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh,
4495 bit, bg_blkno, 1); 4474 bit, bg_blkno, 1);
4496 if (ret) 4475 if (ret)
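
This hunk prefers the newly recorded dr_suballoc_loc and only falls back to deriving the group from the block/bit pair for metadata written before the field existed. A standalone sketch of that fallback, where which_suballoc_group() is a stand-in (the real computation depends on on-disk layout):

#include <stdio.h>
#include <stdint.h>

static uint64_t which_suballoc_group(uint64_t blk, unsigned bit)
{
	return blk - bit;	/* assumption: group base precedes the bit */
}

static uint64_t group_for(uint64_t suballoc_loc, uint64_t blk, unsigned bit)
{
	/* recorded location wins; compute only when it was never stored */
	return suballoc_loc ? suballoc_loc : which_suballoc_group(blk, bit);
}

int main(void)
{
	printf("recorded: %llu\n",
	       (unsigned long long)group_for(5000, 1234, 10));	/* 5000 */
	printf("computed: %llu\n",
	       (unsigned long long)group_for(0, 1234, 10));	/* 1224 */
	return 0;
}
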
@@ -4551,8 +4530,8 @@ int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
4551 4530
4552 p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno); 4531 p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);
4553 4532
4554 ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 4533 ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0,
4555 &dealloc); 4534 &dealloc, 0);
4556 if (ret) { 4535 if (ret) {
4557 mlog_errno(ret); 4536 mlog_errno(ret);
4558 goto out; 4537 goto out;
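
Every dir.c hunk above makes the same cleanup: ocfs2_journal_dirty() is now treated as unable to fail, so the status checks around each call disappear. A toy compileable sketch of the shape of that change (illustrative types only, not the kernel's):

#include <stdio.h>

typedef struct handle handle_t;
struct buffer_head { int dirty; };

/* after this patch the helper is effectively void -- it cannot fail,
 * so call sites drop their error plumbing */
static void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
{
	(void)handle;
	bh->dirty = 1;
}

int main(void)
{
	struct buffer_head bh = { 0 };

	ocfs2_journal_dirty(NULL, &bh);	/* no status to check anymore */
	printf("dirty=%d\n", bh.dirty);
	return 0;
}
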
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 12d5eb78a11a..f44999156839 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -88,7 +88,7 @@ static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
88 return 0; 88 return 0;
89} 89}
90 90
91static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 91void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
92{ 92{
93 mlog_entry_void(); 93 mlog_entry_void();
94 94
@@ -145,7 +145,7 @@ void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
145} 145}
146 146
147 147
148static void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) 148void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
149{ 149{
150 mlog_entry_void(); 150 mlog_entry_void();
151 151
@@ -451,7 +451,9 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
451 ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen, 451 ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
452 lock->ml.node, &status); 452 lock->ml.node, &status);
453 if (ret < 0) 453 if (ret < 0)
454 mlog_errno(ret); 454 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
455 "node %u\n", ret, DLM_PROXY_AST_MSG, dlm->key,
456 lock->ml.node);
455 else { 457 else {
456 if (status == DLM_RECOVERING) { 458 if (status == DLM_RECOVERING) {
457 mlog(ML_ERROR, "sent AST to node %u, it thinks this " 459 mlog(ML_ERROR, "sent AST to node %u, it thinks this "
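
The same replacement of a bare mlog_errno() with an "Error %d when sending message %u (key 0x%x) to node %u" line recurs across the dlm files below. The patch open-codes the format at each site; a hypothetical helper (not in the patch) makes the reported fields explicit:

#include <stdio.h>

/* hypothetical wrapper, shown only to name the fields each call
 * site now logs: status, message type, domain key, target node */
static void dlm_log_send_error(int status, unsigned msg_type,
			       unsigned long key, unsigned node)
{
	fprintf(stderr,
		"Error %d when sending message %u (key 0x%lx) to node %u\n",
		status, msg_type, key, node);
}

int main(void)
{
	dlm_log_send_error(-107, 505, 0xdeadbeefUL, 3);
	return 0;
}
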
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 0102be35980c..4b6ae2c13b47 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -37,7 +37,7 @@
37#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes 37#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes
38#define DLM_THREAD_MS 200 // flush at least every 200 ms 38#define DLM_THREAD_MS 200 // flush at least every 200 ms
39 39
40#define DLM_HASH_SIZE_DEFAULT (1 << 14) 40#define DLM_HASH_SIZE_DEFAULT (1 << 17)
41#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE 41#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE
42# define DLM_HASH_PAGES 1 42# define DLM_HASH_PAGES 1
43#else 43#else
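
Bumping DLM_HASH_SIZE_DEFAULT from 1 << 14 to 1 << 17 grows the default lock resource hash from 16 KiB to 128 KiB; with 4 KiB pages that is 4 pages versus 32, and the bucket count scales the same way. A quick arithmetic check (page size assumed):

#include <stdio.h>

int main(void)
{
	unsigned long old_sz = 1UL << 14, new_sz = 1UL << 17;
	unsigned long page = 4096;	/* assumed PAGE_SIZE */

	printf("old: %lu bytes = %lu pages\n", old_sz, old_sz / page);
	printf("new: %lu bytes = %lu pages\n", new_sz, new_sz / page);
	return 0;
}
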
@@ -904,6 +904,8 @@ void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
904 904
905void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); 905void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
906void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); 906void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
907void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
908void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
907void dlm_do_local_ast(struct dlm_ctxt *dlm, 909void dlm_do_local_ast(struct dlm_ctxt *dlm,
908 struct dlm_lock_resource *res, 910 struct dlm_lock_resource *res,
909 struct dlm_lock *lock); 911 struct dlm_lock *lock);
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 90803b47cd8c..9f30491e5e88 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -390,7 +390,9 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
390 } else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED) 390 } else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED)
391 dlm_error(ret); 391 dlm_error(ret);
392 } else { 392 } else {
393 mlog_errno(tmpret); 393 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
394 "node %u\n", tmpret, DLM_CONVERT_LOCK_MSG, dlm->key,
395 res->owner);
394 if (dlm_is_host_down(tmpret)) { 396 if (dlm_is_host_down(tmpret)) {
395 /* instead of logging the same network error over 397 /* instead of logging the same network error over
396 * and over, sleep here and wait for the heartbeat 398 * and over, sleep here and wait for the heartbeat
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 988c9055fd4e..6b5a492e1749 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -511,7 +511,7 @@ static void __dlm_print_nodes(struct dlm_ctxt *dlm)
511 511
512 assert_spin_locked(&dlm->spinlock); 512 assert_spin_locked(&dlm->spinlock);
513 513
514 printk(KERN_INFO "ocfs2_dlm: Nodes in domain (\"%s\"): ", dlm->name); 514 printk(KERN_NOTICE "o2dlm: Nodes in domain %s: ", dlm->name);
515 515
516 while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 516 while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
517 node + 1)) < O2NM_MAX_NODES) { 517 node + 1)) < O2NM_MAX_NODES) {
@@ -534,7 +534,7 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
534 534
535 node = exit_msg->node_idx; 535 node = exit_msg->node_idx;
536 536
537 printk(KERN_INFO "ocfs2_dlm: Node %u leaves domain %s\n", node, dlm->name); 537 printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s\n", node, dlm->name);
538 538
539 spin_lock(&dlm->spinlock); 539 spin_lock(&dlm->spinlock);
540 clear_bit(node, dlm->domain_map); 540 clear_bit(node, dlm->domain_map);
@@ -565,7 +565,9 @@ static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm,
565 status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key, 565 status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key,
566 &leave_msg, sizeof(leave_msg), node, 566 &leave_msg, sizeof(leave_msg), node,
567 NULL); 567 NULL);
568 568 if (status < 0)
569 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
570 "node %u\n", status, DLM_EXIT_DOMAIN_MSG, dlm->key, node);
569 mlog(0, "status return %d from o2net_send_message\n", status); 571 mlog(0, "status return %d from o2net_send_message\n", status);
570 572
571 return status; 573 return status;
@@ -904,7 +906,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
904 set_bit(assert->node_idx, dlm->domain_map); 906 set_bit(assert->node_idx, dlm->domain_map);
905 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); 907 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
906 908
907 printk(KERN_INFO "ocfs2_dlm: Node %u joins domain %s\n", 909 printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n",
908 assert->node_idx, dlm->name); 910 assert->node_idx, dlm->name);
909 __dlm_print_nodes(dlm); 911 __dlm_print_nodes(dlm);
910 912
@@ -962,7 +964,9 @@ static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm,
962 &cancel_msg, sizeof(cancel_msg), node, 964 &cancel_msg, sizeof(cancel_msg), node,
963 NULL); 965 NULL);
964 if (status < 0) { 966 if (status < 0) {
965 mlog_errno(status); 967 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
968 "node %u\n", status, DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
969 node);
966 goto bail; 970 goto bail;
967 } 971 }
968 972
@@ -1029,10 +1033,11 @@ static int dlm_request_join(struct dlm_ctxt *dlm,
1029 byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES); 1033 byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES);
1030 1034
1031 status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg, 1035 status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg,
1032 sizeof(join_msg), node, 1036 sizeof(join_msg), node, &join_resp);
1033 &join_resp);
1034 if (status < 0 && status != -ENOPROTOOPT) { 1037 if (status < 0 && status != -ENOPROTOOPT) {
1035 mlog_errno(status); 1038 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
1039 "node %u\n", status, DLM_QUERY_JOIN_MSG, DLM_MOD_KEY,
1040 node);
1036 goto bail; 1041 goto bail;
1037 } 1042 }
1038 dlm_query_join_wire_to_packet(join_resp, &packet); 1043 dlm_query_join_wire_to_packet(join_resp, &packet);
@@ -1103,7 +1108,9 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,
1103 &assert_msg, sizeof(assert_msg), node, 1108 &assert_msg, sizeof(assert_msg), node,
1104 NULL); 1109 NULL);
1105 if (status < 0) 1110 if (status < 0)
1106 mlog_errno(status); 1111 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
1112 "node %u\n", status, DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
1113 node);
1107 1114
1108 return status; 1115 return status;
1109} 1116}
@@ -1516,7 +1523,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1516 goto leave; 1523 goto leave;
1517 } 1524 }
1518 1525
1519 dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL); 1526 dlm->name = kstrdup(domain, GFP_KERNEL);
1520 if (dlm->name == NULL) { 1527 if (dlm->name == NULL) {
1521 mlog_errno(-ENOMEM); 1528 mlog_errno(-ENOMEM);
1522 kfree(dlm); 1529 kfree(dlm);
@@ -1550,7 +1557,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1550 for (i = 0; i < DLM_HASH_BUCKETS; i++) 1557 for (i = 0; i < DLM_HASH_BUCKETS; i++)
1551 INIT_HLIST_HEAD(dlm_master_hash(dlm, i)); 1558 INIT_HLIST_HEAD(dlm_master_hash(dlm, i));
1552 1559
1553 strcpy(dlm->name, domain);
1554 dlm->key = key; 1560 dlm->key = key;
1555 dlm->node_num = o2nm_this_node(); 1561 dlm->node_num = o2nm_this_node();
1556 1562
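
The dlm_alloc_ctxt() change folds a kmalloc(strlen(domain) + 1) plus a strcpy() that lived some thirty lines apart into one kstrdup(), removing the chance of the two drifting out of sync. The userspace equivalent:

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
	const char *domain = "webcluster";

	/* before: name = malloc(strlen(domain) + 1); ... strcpy(name, domain); */
	char *name = strdup(domain);	/* kstrdup(domain, GFP_KERNEL) in-kernel */
	if (!name)
		return 1;
	printf("%s\n", name);
	free(name);
	return 0;
}
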
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 733337772671..69cf369961c4 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -329,7 +329,9 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
329 BUG(); 329 BUG();
330 } 330 }
331 } else { 331 } else {
332 mlog_errno(tmpret); 332 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
333 "node %u\n", tmpret, DLM_CREATE_LOCK_MSG, dlm->key,
334 res->owner);
333 if (dlm_is_host_down(tmpret)) { 335 if (dlm_is_host_down(tmpret)) {
334 ret = DLM_RECOVERING; 336 ret = DLM_RECOVERING;
335 mlog(0, "node %u died so returning DLM_RECOVERING " 337 mlog(0, "node %u died so returning DLM_RECOVERING "
@@ -429,7 +431,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
429 struct dlm_lock *lock; 431 struct dlm_lock *lock;
430 int kernel_allocated = 0; 432 int kernel_allocated = 0;
431 433
432 lock = (struct dlm_lock *) kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS); 434 lock = kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS);
433 if (!lock) 435 if (!lock)
434 return NULL; 436 return NULL;
435 437
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 9289b4357d27..4a7506a4e314 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -617,13 +617,11 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
617{ 617{
618 struct dlm_lock_resource *res = NULL; 618 struct dlm_lock_resource *res = NULL;
619 619
620 res = (struct dlm_lock_resource *) 620 res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
621 kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
622 if (!res) 621 if (!res)
623 goto error; 622 goto error;
624 623
625 res->lockname.name = (char *) 624 res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
626 kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
627 if (!res->lockname.name) 625 if (!res->lockname.name)
628 goto error; 626 goto error;
629 627
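
These dlmmaster.c hunks drop the explicit casts on kmem_cache_zalloc()/kmem_cache_alloc(): both return void *, so the cast is noise and can hide a missing prototype. The userspace analogue:

#include <stdlib.h>

struct dlm_lock_resource { int dummy; };

int main(void)
{
	/* before: res = (struct dlm_lock_resource *)calloc(1, sizeof(*res)); */
	struct dlm_lock_resource *res = calloc(1, sizeof(*res));	/* after */

	free(res);
	return 0;
}
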
@@ -757,8 +755,7 @@ lookup:
757 spin_unlock(&dlm->spinlock); 755 spin_unlock(&dlm->spinlock);
758 mlog(0, "allocating a new resource\n"); 756 mlog(0, "allocating a new resource\n");
759 /* nothing found and we need to allocate one. */ 757 /* nothing found and we need to allocate one. */
760 alloc_mle = (struct dlm_master_list_entry *) 758 alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
761 kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
762 if (!alloc_mle) 759 if (!alloc_mle)
763 goto leave; 760 goto leave;
764 res = dlm_new_lockres(dlm, lockid, namelen); 761 res = dlm_new_lockres(dlm, lockid, namelen);
@@ -1542,8 +1539,7 @@ way_up_top:
1542 spin_unlock(&dlm->master_lock); 1539 spin_unlock(&dlm->master_lock);
1543 spin_unlock(&dlm->spinlock); 1540 spin_unlock(&dlm->spinlock);
1544 1541
1545 mle = (struct dlm_master_list_entry *) 1542 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
1546 kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
1547 if (!mle) { 1543 if (!mle) {
1548 response = DLM_MASTER_RESP_ERROR; 1544 response = DLM_MASTER_RESP_ERROR;
1549 mlog_errno(-ENOMEM); 1545 mlog_errno(-ENOMEM);
@@ -1666,7 +1662,9 @@ again:
1666 tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, 1662 tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
1667 &assert, sizeof(assert), to, &r); 1663 &assert, sizeof(assert), to, &r);
1668 if (tmpret < 0) { 1664 if (tmpret < 0) {
1669 mlog(0, "assert_master returned %d!\n", tmpret); 1665 mlog(ML_ERROR, "Error %d when sending message %u (key "
1666 "0x%x) to node %u\n", tmpret,
1667 DLM_ASSERT_MASTER_MSG, dlm->key, to);
1670 if (!dlm_is_host_down(tmpret)) { 1668 if (!dlm_is_host_down(tmpret)) {
1671 mlog(ML_ERROR, "unhandled error=%d!\n", tmpret); 1669 mlog(ML_ERROR, "unhandled error=%d!\n", tmpret);
1672 BUG(); 1670 BUG();
@@ -2205,7 +2203,9 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2205 ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, 2203 ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
2206 &deref, sizeof(deref), res->owner, &r); 2204 &deref, sizeof(deref), res->owner, &r);
2207 if (ret < 0) 2205 if (ret < 0)
2208 mlog_errno(ret); 2206 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
2207 "node %u\n", ret, DLM_DEREF_LOCKRES_MSG, dlm->key,
2208 res->owner);
2209 else if (r < 0) { 2209 else if (r < 0) {
2210 /* BAD. other node says I did not have a ref. */ 2210 /* BAD. other node says I did not have a ref. */
2211 mlog(ML_ERROR,"while dropping ref on %s:%.*s " 2211 mlog(ML_ERROR,"while dropping ref on %s:%.*s "
@@ -2452,8 +2452,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2452 goto leave; 2452 goto leave;
2453 } 2453 }
2454 2454
2455 mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, 2455 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
2456 GFP_NOFS);
2457 if (!mle) { 2456 if (!mle) {
2458 mlog_errno(ret); 2457 mlog_errno(ret);
2459 goto leave; 2458 goto leave;
@@ -2975,7 +2974,9 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
2975 &migrate, sizeof(migrate), nodenum, 2974 &migrate, sizeof(migrate), nodenum,
2976 &status); 2975 &status);
2977 if (ret < 0) { 2976 if (ret < 0) {
2978 mlog(0, "migrate_request returned %d!\n", ret); 2977 mlog(ML_ERROR, "Error %d when sending message %u (key "
2978 "0x%x) to node %u\n", ret, DLM_MIGRATE_REQUEST_MSG,
2979 dlm->key, nodenum);
2979 if (!dlm_is_host_down(ret)) { 2980 if (!dlm_is_host_down(ret)) {
2980 mlog(ML_ERROR, "unhandled error=%d!\n", ret); 2981 mlog(ML_ERROR, "unhandled error=%d!\n", ret);
2981 BUG(); 2982 BUG();
@@ -3033,8 +3034,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
3033 hash = dlm_lockid_hash(name, namelen); 3034 hash = dlm_lockid_hash(name, namelen);
3034 3035
3035 /* preallocate.. if this fails, abort */ 3036 /* preallocate.. if this fails, abort */
3036 mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, 3037 mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
3037 GFP_NOFS);
3038 3038
3039 if (!mle) { 3039 if (!mle) {
3040 ret = -ENOMEM; 3040 ret = -ENOMEM;
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index b4f99de2caf3..f8b75ce4be70 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -803,7 +803,9 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
803 803
804 /* negative status is handled by caller */ 804 /* negative status is handled by caller */
805 if (ret < 0) 805 if (ret < 0)
806 mlog_errno(ret); 806 mlog(ML_ERROR, "Error %d when sending message %u (key "
807 "0x%x) to node %u\n", ret, DLM_LOCK_REQUEST_MSG,
808 dlm->key, request_from);
807 809
808 // return from here, then 810 // return from here, then
809 // sleep until all received or error 811 // sleep until all received or error
@@ -955,10 +957,10 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
955 ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, 957 ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
956 sizeof(done_msg), send_to, &tmpret); 958 sizeof(done_msg), send_to, &tmpret);
957 if (ret < 0) { 959 if (ret < 0) {
960 mlog(ML_ERROR, "Error %d when sending message %u (key "
961 "0x%x) to node %u\n", ret, DLM_RECO_DATA_DONE_MSG,
962 dlm->key, send_to);
958 if (!dlm_is_host_down(ret)) { 963 if (!dlm_is_host_down(ret)) {
959 mlog_errno(ret);
960 mlog(ML_ERROR, "%s: unknown error sending data-done "
961 "to %u\n", dlm->name, send_to);
962 BUG(); 964 BUG();
963 } 965 }
964 } else 966 } else
@@ -1126,7 +1128,9 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
1126 if (ret < 0) { 1128 if (ret < 0) {
1127 /* XXX: negative status is not handled. 1129 /* XXX: negative status is not handled.
1128 * this will end up killing this node. */ 1130 * this will end up killing this node. */
1129 mlog_errno(ret); 1131 mlog(ML_ERROR, "Error %d when sending message %u (key "
1132 "0x%x) to node %u\n", ret, DLM_MIG_LOCKRES_MSG,
1133 dlm->key, send_to);
1130 } else { 1134 } else {
1131 /* might get an -ENOMEM back here */ 1135 /* might get an -ENOMEM back here */
1132 ret = status; 1136 ret = status;
@@ -1642,7 +1646,9 @@ int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
1642 &req, sizeof(req), nodenum, &status); 1646 &req, sizeof(req), nodenum, &status);
1643 /* XXX: negative status not handled properly here. */ 1647 /* XXX: negative status not handled properly here. */
1644 if (ret < 0) 1648 if (ret < 0)
1645 mlog_errno(ret); 1649 mlog(ML_ERROR, "Error %d when sending message %u (key "
1650 "0x%x) to node %u\n", ret, DLM_MASTER_REQUERY_MSG,
1651 dlm->key, nodenum);
1646 else { 1652 else {
1647 BUG_ON(status < 0); 1653 BUG_ON(status < 0);
1648 BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN); 1654 BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN);
@@ -2640,7 +2646,7 @@ retry:
2640 if (dlm_is_host_down(ret)) { 2646 if (dlm_is_host_down(ret)) {
2641 /* node is down. not involved in recovery 2647 /* node is down. not involved in recovery
2642 * so just keep going */ 2648 * so just keep going */
2643 mlog(0, "%s: node %u was down when sending " 2649 mlog(ML_NOTICE, "%s: node %u was down when sending "
2644 "begin reco msg (%d)\n", dlm->name, nodenum, ret); 2650 "begin reco msg (%d)\n", dlm->name, nodenum, ret);
2645 ret = 0; 2651 ret = 0;
2646 } 2652 }
@@ -2660,11 +2666,12 @@ retry:
2660 } 2666 }
2661 if (ret < 0) { 2667 if (ret < 0) {
2662 struct dlm_lock_resource *res; 2668 struct dlm_lock_resource *res;
2669
2663 /* this is now a serious problem, possibly ENOMEM 2670 /* this is now a serious problem, possibly ENOMEM
2664 * in the network stack. must retry */ 2671 * in the network stack. must retry */
2665 mlog_errno(ret); 2672 mlog_errno(ret);
2666 mlog(ML_ERROR, "begin reco of dlm %s to node %u " 2673 mlog(ML_ERROR, "begin reco of dlm %s to node %u "
2667 " returned %d\n", dlm->name, nodenum, ret); 2674 "returned %d\n", dlm->name, nodenum, ret);
2668 res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME, 2675 res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
2669 DLM_RECOVERY_LOCK_NAME_LEN); 2676 DLM_RECOVERY_LOCK_NAME_LEN);
2670 if (res) { 2677 if (res) {
@@ -2789,7 +2796,9 @@ stage2:
2789 if (ret >= 0) 2796 if (ret >= 0)
2790 ret = status; 2797 ret = status;
2791 if (ret < 0) { 2798 if (ret < 0) {
2792 mlog_errno(ret); 2799 mlog(ML_ERROR, "Error %d when sending message %u (key "
2800 "0x%x) to node %u\n", ret, DLM_FINALIZE_RECO_MSG,
2801 dlm->key, nodenum);
2793 if (dlm_is_host_down(ret)) { 2802 if (dlm_is_host_down(ret)) {
2794 /* this has no effect on this recovery 2803 /* this has no effect on this recovery
2795 * session, so set the status to zero to 2804 * session, so set the status to zero to
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 11a6d1fd1d35..d4f73ca68fe5 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -309,6 +309,7 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
309 * spinlock, and because we know that it is not migrating/ 309 * spinlock, and because we know that it is not migrating/
310 * recovering/in-progress, it is fine to reserve asts and 310 * recovering/in-progress, it is fine to reserve asts and
311 * basts right before queueing them all throughout */ 311 * basts right before queueing them all throughout */
312 assert_spin_locked(&dlm->ast_lock);
312 assert_spin_locked(&res->spinlock); 313 assert_spin_locked(&res->spinlock);
313 BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING| 314 BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
314 DLM_LOCK_RES_RECOVERING| 315 DLM_LOCK_RES_RECOVERING|
@@ -337,7 +338,7 @@ converting:
337 /* queue the BAST if not already */ 338 /* queue the BAST if not already */
338 if (lock->ml.highest_blocked == LKM_IVMODE) { 339 if (lock->ml.highest_blocked == LKM_IVMODE) {
339 __dlm_lockres_reserve_ast(res); 340 __dlm_lockres_reserve_ast(res);
340 dlm_queue_bast(dlm, lock); 341 __dlm_queue_bast(dlm, lock);
341 } 342 }
342 /* update the highest_blocked if needed */ 343 /* update the highest_blocked if needed */
343 if (lock->ml.highest_blocked < target->ml.convert_type) 344 if (lock->ml.highest_blocked < target->ml.convert_type)
@@ -355,7 +356,7 @@ converting:
355 can_grant = 0; 356 can_grant = 0;
356 if (lock->ml.highest_blocked == LKM_IVMODE) { 357 if (lock->ml.highest_blocked == LKM_IVMODE) {
357 __dlm_lockres_reserve_ast(res); 358 __dlm_lockres_reserve_ast(res);
358 dlm_queue_bast(dlm, lock); 359 __dlm_queue_bast(dlm, lock);
359 } 360 }
360 if (lock->ml.highest_blocked < target->ml.convert_type) 361 if (lock->ml.highest_blocked < target->ml.convert_type)
361 lock->ml.highest_blocked = 362 lock->ml.highest_blocked =
@@ -383,7 +384,7 @@ converting:
383 spin_unlock(&target->spinlock); 384 spin_unlock(&target->spinlock);
384 385
385 __dlm_lockres_reserve_ast(res); 386 __dlm_lockres_reserve_ast(res);
386 dlm_queue_ast(dlm, target); 387 __dlm_queue_ast(dlm, target);
387 /* go back and check for more */ 388 /* go back and check for more */
388 goto converting; 389 goto converting;
389 } 390 }
@@ -402,7 +403,7 @@ blocked:
402 can_grant = 0; 403 can_grant = 0;
403 if (lock->ml.highest_blocked == LKM_IVMODE) { 404 if (lock->ml.highest_blocked == LKM_IVMODE) {
404 __dlm_lockres_reserve_ast(res); 405 __dlm_lockres_reserve_ast(res);
405 dlm_queue_bast(dlm, lock); 406 __dlm_queue_bast(dlm, lock);
406 } 407 }
407 if (lock->ml.highest_blocked < target->ml.type) 408 if (lock->ml.highest_blocked < target->ml.type)
408 lock->ml.highest_blocked = target->ml.type; 409 lock->ml.highest_blocked = target->ml.type;
@@ -418,7 +419,7 @@ blocked:
418 can_grant = 0; 419 can_grant = 0;
419 if (lock->ml.highest_blocked == LKM_IVMODE) { 420 if (lock->ml.highest_blocked == LKM_IVMODE) {
420 __dlm_lockres_reserve_ast(res); 421 __dlm_lockres_reserve_ast(res);
421 dlm_queue_bast(dlm, lock); 422 __dlm_queue_bast(dlm, lock);
422 } 423 }
423 if (lock->ml.highest_blocked < target->ml.type) 424 if (lock->ml.highest_blocked < target->ml.type)
424 lock->ml.highest_blocked = target->ml.type; 425 lock->ml.highest_blocked = target->ml.type;
@@ -444,7 +445,7 @@ blocked:
444 spin_unlock(&target->spinlock); 445 spin_unlock(&target->spinlock);
445 446
446 __dlm_lockres_reserve_ast(res); 447 __dlm_lockres_reserve_ast(res);
447 dlm_queue_ast(dlm, target); 448 __dlm_queue_ast(dlm, target);
448 /* go back and check for more */ 449 /* go back and check for more */
449 goto converting; 450 goto converting;
450 } 451 }
@@ -674,6 +675,7 @@ static int dlm_thread(void *data)
674 /* lockres can be re-dirtied/re-added to the 675 /* lockres can be re-dirtied/re-added to the
675 * dirty_list in this gap, but that is ok */ 676 * dirty_list in this gap, but that is ok */
676 677
678 spin_lock(&dlm->ast_lock);
677 spin_lock(&res->spinlock); 679 spin_lock(&res->spinlock);
678 if (res->owner != dlm->node_num) { 680 if (res->owner != dlm->node_num) {
679 __dlm_print_one_lock_resource(res); 681 __dlm_print_one_lock_resource(res);
@@ -694,6 +696,7 @@ static int dlm_thread(void *data)
694 /* move it to the tail and keep going */ 696 /* move it to the tail and keep going */
695 res->state &= ~DLM_LOCK_RES_DIRTY; 697 res->state &= ~DLM_LOCK_RES_DIRTY;
696 spin_unlock(&res->spinlock); 698 spin_unlock(&res->spinlock);
699 spin_unlock(&dlm->ast_lock);
697 mlog(0, "delaying list shuffling for in-" 700 mlog(0, "delaying list shuffling for in-"
698 "progress lockres %.*s, state=%d\n", 701 "progress lockres %.*s, state=%d\n",
699 res->lockname.len, res->lockname.name, 702 res->lockname.len, res->lockname.name,
@@ -715,6 +718,7 @@ static int dlm_thread(void *data)
715 dlm_shuffle_lists(dlm, res); 718 dlm_shuffle_lists(dlm, res);
716 res->state &= ~DLM_LOCK_RES_DIRTY; 719 res->state &= ~DLM_LOCK_RES_DIRTY;
717 spin_unlock(&res->spinlock); 720 spin_unlock(&res->spinlock);
721 spin_unlock(&dlm->ast_lock);
718 722
719 dlm_lockres_calc_usage(dlm, res); 723 dlm_lockres_calc_usage(dlm, res);
720 724
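
The dlmthread.c changes move the ast_lock acquisition out of the queueing helpers and into dlm_thread() itself: the thread now takes dlm->ast_lock before res->spinlock and calls the newly exported __dlm_queue_ast()/__dlm_queue_bast(), which assume the lock is already held, so asts and basts are reserved and queued under one consistent lock order. A pthread sketch of that caller-holds-the-outer-lock pattern (stand-in names):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ast_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t res_lock = PTHREAD_MUTEX_INITIALIZER;

/* double-underscore variant: caller must already hold ast_lock,
 * mirroring __dlm_queue_ast()/__dlm_queue_bast() */
static void __queue_ast(int lock_id)
{
	/* the kernel version asserts: assert_spin_locked(&dlm->ast_lock) */
	printf("queue ast for lock %d\n", lock_id);
}

static void shuffle_lists(void)
{
	/* called with both ast_lock and res_lock held */
	__queue_ast(1);
}

int main(void)
{
	/* consistent order: ast_lock first, then the resource lock */
	pthread_mutex_lock(&ast_lock);
	pthread_mutex_lock(&res_lock);
	shuffle_lists();
	pthread_mutex_unlock(&res_lock);
	pthread_mutex_unlock(&ast_lock);
	return 0;
}
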
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index b47c1b92b82b..817287c6a6db 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -354,7 +354,8 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
354 mlog(0, "master was in-progress. retry\n"); 354 mlog(0, "master was in-progress. retry\n");
355 ret = status; 355 ret = status;
356 } else { 356 } else {
357 mlog_errno(tmpret); 357 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
358 "node %u\n", tmpret, DLM_UNLOCK_LOCK_MSG, dlm->key, owner);
358 if (dlm_is_host_down(tmpret)) { 359 if (dlm_is_host_down(tmpret)) {
359 /* NOTE: this seems strange, but it is what we want. 360 /* NOTE: this seems strange, but it is what we want.
360 * when the master goes down during a cancel or 361 * when the master goes down during a cancel or
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 50c4ee805da4..39eb16ac5f98 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3897,7 +3897,8 @@ static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
3897 oinfo->dqi_gi.dqi_free_entry = 3897 oinfo->dqi_gi.dqi_free_entry =
3898 be32_to_cpu(lvb->lvb_free_entry); 3898 be32_to_cpu(lvb->lvb_free_entry);
3899 } else { 3899 } else {
3900 status = ocfs2_read_quota_block(oinfo->dqi_gqinode, 0, &bh); 3900 status = ocfs2_read_quota_phys_block(oinfo->dqi_gqinode,
3901 oinfo->dqi_giblk, &bh);
3901 if (status) { 3902 if (status) {
3902 mlog_errno(status); 3903 mlog_errno(status);
3903 goto bail; 3904 goto bail;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index a5fbd9cea968..97e54b9e654b 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -278,10 +278,7 @@ int ocfs2_update_inode_atime(struct inode *inode,
278 inode->i_atime = CURRENT_TIME; 278 inode->i_atime = CURRENT_TIME;
279 di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); 279 di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
280 di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); 280 di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
281 281 ocfs2_journal_dirty(handle, bh);
282 ret = ocfs2_journal_dirty(handle, bh);
283 if (ret < 0)
284 mlog_errno(ret);
285 282
286out_commit: 283out_commit:
287 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 284 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
@@ -430,9 +427,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
430 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); 427 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
431 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 428 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
432 429
433 status = ocfs2_journal_dirty(handle, fe_bh); 430 ocfs2_journal_dirty(handle, fe_bh);
434 if (status < 0)
435 mlog_errno(status);
436 431
437out_commit: 432out_commit:
438 ocfs2_commit_trans(osb, handle); 433 ocfs2_commit_trans(osb, handle);
@@ -449,7 +444,6 @@ static int ocfs2_truncate_file(struct inode *inode,
449 int status = 0; 444 int status = 0;
450 struct ocfs2_dinode *fe = NULL; 445 struct ocfs2_dinode *fe = NULL;
451 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 446 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
452 struct ocfs2_truncate_context *tc = NULL;
453 447
454 mlog_entry("(inode = %llu, new_i_size = %llu\n", 448 mlog_entry("(inode = %llu, new_i_size = %llu\n",
455 (unsigned long long)OCFS2_I(inode)->ip_blkno, 449 (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -488,6 +482,9 @@ static int ocfs2_truncate_file(struct inode *inode,
488 482
489 down_write(&OCFS2_I(inode)->ip_alloc_sem); 483 down_write(&OCFS2_I(inode)->ip_alloc_sem);
490 484
485 ocfs2_resv_discard(&osb->osb_la_resmap,
486 &OCFS2_I(inode)->ip_la_data_resv);
487
491 /* 488 /*
492 * The inode lock forced other nodes to sync and drop their 489 * The inode lock forced other nodes to sync and drop their
493 * pages, which (correctly) happens even if we have a truncate 490 * pages, which (correctly) happens even if we have a truncate
@@ -517,13 +514,7 @@ static int ocfs2_truncate_file(struct inode *inode,
517 goto bail_unlock_sem; 514 goto bail_unlock_sem;
518 } 515 }
519 516
520 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); 517 status = ocfs2_commit_truncate(osb, inode, di_bh);
521 if (status < 0) {
522 mlog_errno(status);
523 goto bail_unlock_sem;
524 }
525
526 status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
527 if (status < 0) { 518 if (status < 0) {
528 mlog_errno(status); 519 mlog_errno(status);
529 goto bail_unlock_sem; 520 goto bail_unlock_sem;
@@ -666,11 +657,7 @@ restarted_transaction:
666 goto leave; 657 goto leave;
667 } 658 }
668 659
669 status = ocfs2_journal_dirty(handle, bh); 660 ocfs2_journal_dirty(handle, bh);
670 if (status < 0) {
671 mlog_errno(status);
672 goto leave;
673 }
674 661
675 spin_lock(&OCFS2_I(inode)->ip_lock); 662 spin_lock(&OCFS2_I(inode)->ip_lock);
676 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); 663 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
@@ -946,9 +933,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
946 struct ocfs2_super *osb = OCFS2_SB(sb); 933 struct ocfs2_super *osb = OCFS2_SB(sb);
947 struct buffer_head *bh = NULL; 934 struct buffer_head *bh = NULL;
948 handle_t *handle = NULL; 935 handle_t *handle = NULL;
949 int qtype;
950 struct dquot *transfer_from[MAXQUOTAS] = { };
951 struct dquot *transfer_to[MAXQUOTAS] = { }; 936 struct dquot *transfer_to[MAXQUOTAS] = { };
937 int qtype;
952 938
953 mlog_entry("(0x%p, '%.*s')\n", dentry, 939 mlog_entry("(0x%p, '%.*s')\n", dentry,
954 dentry->d_name.len, dentry->d_name.name); 940 dentry->d_name.len, dentry->d_name.name);
@@ -979,10 +965,10 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
979 if (status) 965 if (status)
980 return status; 966 return status;
981 967
968 if (is_quota_modification(inode, attr))
969 dquot_initialize(inode);
982 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; 970 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
983 if (size_change) { 971 if (size_change) {
984 dquot_initialize(inode);
985
986 status = ocfs2_rw_lock(inode, 1); 972 status = ocfs2_rw_lock(inode, 1);
987 if (status < 0) { 973 if (status < 0) {
988 mlog_errno(status); 974 mlog_errno(status);
@@ -1032,9 +1018,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1032 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { 1018 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1033 transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid, 1019 transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid,
1034 USRQUOTA); 1020 USRQUOTA);
1035 transfer_from[USRQUOTA] = dqget(sb, inode->i_uid, 1021 if (!transfer_to[USRQUOTA]) {
1036 USRQUOTA);
1037 if (!transfer_to[USRQUOTA] || !transfer_from[USRQUOTA]) {
1038 status = -ESRCH; 1022 status = -ESRCH;
1039 goto bail_unlock; 1023 goto bail_unlock;
1040 } 1024 }
@@ -1044,9 +1028,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1044 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { 1028 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1045 transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid, 1029 transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid,
1046 GRPQUOTA); 1030 GRPQUOTA);
1047 transfer_from[GRPQUOTA] = dqget(sb, inode->i_gid, 1031 if (!transfer_to[GRPQUOTA]) {
1048 GRPQUOTA);
1049 if (!transfer_to[GRPQUOTA] || !transfer_from[GRPQUOTA]) {
1050 status = -ESRCH; 1032 status = -ESRCH;
1051 goto bail_unlock; 1033 goto bail_unlock;
1052 } 1034 }
@@ -1058,7 +1040,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1058 mlog_errno(status); 1040 mlog_errno(status);
1059 goto bail_unlock; 1041 goto bail_unlock;
1060 } 1042 }
1061 status = dquot_transfer(inode, attr); 1043 status = __dquot_transfer(inode, transfer_to);
1062 if (status < 0) 1044 if (status < 0)
1063 goto bail_commit; 1045 goto bail_commit;
1064 } else { 1046 } else {
@@ -1098,10 +1080,8 @@ bail:
1098 brelse(bh); 1080 brelse(bh);
1099 1081
1100 /* Release quota pointers in case we acquired them */ 1082 /* Release quota pointers in case we acquired them */
1101 for (qtype = 0; qtype < MAXQUOTAS; qtype++) { 1083 for (qtype = 0; qtype < MAXQUOTAS; qtype++)
1102 dqput(transfer_to[qtype]); 1084 dqput(transfer_to[qtype]);
1103 dqput(transfer_from[qtype]);
1104 }
1105 1085
1106 if (!status && attr->ia_valid & ATTR_MODE) { 1086 if (!status && attr->ia_valid & ATTR_MODE) {
1107 status = ocfs2_acl_chmod(inode); 1087 status = ocfs2_acl_chmod(inode);
@@ -1195,9 +1175,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
1195 di = (struct ocfs2_dinode *) bh->b_data; 1175 di = (struct ocfs2_dinode *) bh->b_data;
1196 di->i_mode = cpu_to_le16(inode->i_mode); 1176 di->i_mode = cpu_to_le16(inode->i_mode);
1197 1177
1198 ret = ocfs2_journal_dirty(handle, bh); 1178 ocfs2_journal_dirty(handle, bh);
1199 if (ret < 0)
1200 mlog_errno(ret);
1201 1179
1202out_trans: 1180out_trans:
1203 ocfs2_commit_trans(osb, handle); 1181 ocfs2_commit_trans(osb, handle);
@@ -1434,16 +1412,90 @@ out:
1434 return ret; 1412 return ret;
1435} 1413}
1436 1414
1415static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
1416{
1417 int i;
1418 struct ocfs2_extent_rec *rec = NULL;
1419
1420 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
1421
1422 rec = &el->l_recs[i];
1423
1424 if (le32_to_cpu(rec->e_cpos) < pos)
1425 break;
1426 }
1427
1428 return i;
1429}
1430
1431/*
1432 * Helper to calculate the punching pos and length in one run; we handle the
1433 * following three cases in order:
1434 *
1435 * - remove the entire record
1436 * - remove a partial record
1437 * - no record needs to be removed (hole-punching completed)
1438 */
1439static void ocfs2_calc_trunc_pos(struct inode *inode,
1440 struct ocfs2_extent_list *el,
1441 struct ocfs2_extent_rec *rec,
1442 u32 trunc_start, u32 *trunc_cpos,
1443 u32 *trunc_len, u32 *trunc_end,
1444 u64 *blkno, int *done)
1445{
1446 int ret = 0;
1447 u32 coff, range;
1448
1449 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1450
1451 if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
1452 *trunc_cpos = le32_to_cpu(rec->e_cpos);
1453 /*
1454 * Skip holes if any.
1455 */
1456 if (range < *trunc_end)
1457 *trunc_end = range;
1458 *trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos);
1459 *blkno = le64_to_cpu(rec->e_blkno);
1460 *trunc_end = le32_to_cpu(rec->e_cpos);
1461 } else if (range > trunc_start) {
1462 *trunc_cpos = trunc_start;
1463 *trunc_len = *trunc_end - trunc_start;
1464 coff = trunc_start - le32_to_cpu(rec->e_cpos);
1465 *blkno = le64_to_cpu(rec->e_blkno) +
1466 ocfs2_clusters_to_blocks(inode->i_sb, coff);
1467 *trunc_end = trunc_start;
1468 } else {
1469 /*
1470 * There are two possibilities here:
1471 *
1472 * - the last record has been removed
1473 * - trunc_start was within a hole
1474 *
1475 * Either case means hole punching is complete.
1476 */
1477 ret = 1;
1478 }
1479
1480 *done = ret;
1481}
1482
1437static int ocfs2_remove_inode_range(struct inode *inode, 1483static int ocfs2_remove_inode_range(struct inode *inode,
1438 struct buffer_head *di_bh, u64 byte_start, 1484 struct buffer_head *di_bh, u64 byte_start,
1439 u64 byte_len) 1485 u64 byte_len)
1440{ 1486{
1441 int ret = 0; 1487 int ret = 0, flags = 0, done = 0, i;
1442 u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size; 1488 u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
1489 u32 cluster_in_el;
1443 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1490 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1444 struct ocfs2_cached_dealloc_ctxt dealloc; 1491 struct ocfs2_cached_dealloc_ctxt dealloc;
1445 struct address_space *mapping = inode->i_mapping; 1492 struct address_space *mapping = inode->i_mapping;
1446 struct ocfs2_extent_tree et; 1493 struct ocfs2_extent_tree et;
1494 struct ocfs2_path *path = NULL;
1495 struct ocfs2_extent_list *el = NULL;
1496 struct ocfs2_extent_rec *rec = NULL;
1497 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1498 u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc);
1447 1499
1448 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); 1500 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
1449 ocfs2_init_dealloc_ctxt(&dealloc); 1501 ocfs2_init_dealloc_ctxt(&dealloc);
@@ -1469,17 +1521,35 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1469 goto out; 1521 goto out;
1470 } 1522 }
1471 1523
1524 /*
1525 * For reflinks, we may need to CoW 2 clusters which might be
1526 * partially zero'd later, if the hole's start and end offsets fall
1527 * within one cluster (i.e. are not exactly aligned to the cluster size).
1528 */
1529
1530 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
1531
1532 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
1533 if (ret) {
1534 mlog_errno(ret);
1535 goto out;
1536 }
1537
1538 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len);
1539 if (ret) {
1540 mlog_errno(ret);
1541 goto out;
1542 }
1543 }
1544
1472 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start); 1545 trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
1473 trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits; 1546 trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
1474 if (trunc_len >= trunc_start) 1547 cluster_in_el = trunc_end;
1475 trunc_len -= trunc_start;
1476 else
1477 trunc_len = 0;
1478 1548
1479 mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n", 1549 mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, cend: %u\n",
1480 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1550 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1481 (unsigned long long)byte_start, 1551 (unsigned long long)byte_start,
1482 (unsigned long long)byte_len, trunc_start, trunc_len); 1552 (unsigned long long)byte_len, trunc_start, trunc_end);
1483 1553
1484 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len); 1554 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
1485 if (ret) { 1555 if (ret) {
@@ -1487,31 +1557,79 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1487 goto out; 1557 goto out;
1488 } 1558 }
1489 1559
1490 cpos = trunc_start; 1560 path = ocfs2_new_path_from_et(&et);
1491 while (trunc_len) { 1561 if (!path) {
1492 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, 1562 ret = -ENOMEM;
1493 &alloc_size, NULL); 1563 mlog_errno(ret);
1564 goto out;
1565 }
1566
1567 while (trunc_end > trunc_start) {
1568
1569 ret = ocfs2_find_path(INODE_CACHE(inode), path,
1570 cluster_in_el);
1494 if (ret) { 1571 if (ret) {
1495 mlog_errno(ret); 1572 mlog_errno(ret);
1496 goto out; 1573 goto out;
1497 } 1574 }
1498 1575
1499 if (alloc_size > trunc_len) 1576 el = path_leaf_el(path);
1500 alloc_size = trunc_len; 1577
1578 i = ocfs2_find_rec(el, trunc_end);
1579 /*
1580 * Need to go to previous extent block.
1581 */
1582 if (i < 0) {
1583 if (path->p_tree_depth == 0)
1584 break;
1501 1585
1502 /* Only do work for non-holes */ 1586 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
1503 if (phys_cpos != 0) { 1587 path,
1504 ret = ocfs2_remove_btree_range(inode, &et, cpos, 1588 &cluster_in_el);
1505 phys_cpos, alloc_size,
1506 &dealloc);
1507 if (ret) { 1589 if (ret) {
1508 mlog_errno(ret); 1590 mlog_errno(ret);
1509 goto out; 1591 goto out;
1510 } 1592 }
1593
1594 /*
1595 * We've reached the leftmost extent block,
1596 * so it's safe to leave the loop.
1597 */
1598 if (cluster_in_el == 0)
1599 break;
1600
1601 /*
1602 * The 'pos' found for the previous extent block is
1603 * always one cluster less than the actual trunc_end.
1604 */
1605 trunc_end = cluster_in_el + 1;
1606
1607 ocfs2_reinit_path(path, 1);
1608
1609 continue;
1610
1611 } else
1612 rec = &el->l_recs[i];
1613
1614 ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos,
1615 &trunc_len, &trunc_end, &blkno, &done);
1616 if (done)
1617 break;
1618
1619 flags = rec->e_flags;
1620 phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
1621
1622 ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
1623 phys_cpos, trunc_len, flags,
1624 &dealloc, refcount_loc);
1625 if (ret < 0) {
1626 mlog_errno(ret);
1627 goto out;
1511 } 1628 }
1512 1629
1513 cpos += alloc_size; 1630 cluster_in_el = trunc_end;
1514 trunc_len -= alloc_size; 1631
1632 ocfs2_reinit_path(path, 1);
1515 } 1633 }
1516 1634
1517 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len); 1635 ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
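For reference, a minimal standalone sketch (not part of this patch) of the cluster arithmetic the rewritten hole punch starts from: trunc_start rounds the hole's start up to a cluster boundary, trunc_end rounds its end down, and only the whole clusters in between are removed from the btree. A 4K cluster size is assumed and all values are illustrative.

#include <stdio.h>

int main(void)
{
	unsigned int cbits = 12;		/* assume 4K clusters */
	unsigned long long byte_start = 5000, byte_len = 10000;

	/* ocfs2_clusters_for_bytes() rounds up; the end is rounded down */
	unsigned int trunc_start = (byte_start + (1ULL << cbits) - 1) >> cbits;
	unsigned int trunc_end = (byte_start + byte_len) >> cbits;

	/* The partial head and tail are handled separately by
	 * ocfs2_zero_partial_clusters(); the loop above only removes
	 * whole clusters in [trunc_start, trunc_end). */
	printf("cstart: %u cend: %u\n", trunc_start, trunc_end);	/* 2 3 */
	return 0;
}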
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index af189887201c..abb0a95cc717 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -376,6 +376,10 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
376 376
377 OCFS2_I(inode)->ip_last_used_slot = 0; 377 OCFS2_I(inode)->ip_last_used_slot = 0;
378 OCFS2_I(inode)->ip_last_used_group = 0; 378 OCFS2_I(inode)->ip_last_used_group = 0;
379
380 if (S_ISDIR(inode->i_mode))
381 ocfs2_resv_set_type(&OCFS2_I(inode)->ip_la_data_resv,
382 OCFS2_RESV_FLAG_DIR);
379 mlog_exit_void(); 383 mlog_exit_void();
380} 384}
381 385
@@ -539,7 +543,6 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
539 struct buffer_head *fe_bh) 543 struct buffer_head *fe_bh)
540{ 544{
541 int status = 0; 545 int status = 0;
542 struct ocfs2_truncate_context *tc = NULL;
543 struct ocfs2_dinode *fe; 546 struct ocfs2_dinode *fe;
544 handle_t *handle = NULL; 547 handle_t *handle = NULL;
545 548
@@ -582,13 +585,7 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
582 ocfs2_commit_trans(osb, handle); 585 ocfs2_commit_trans(osb, handle);
583 handle = NULL; 586 handle = NULL;
584 587
585 status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc); 588 status = ocfs2_commit_truncate(osb, inode, fe_bh);
586 if (status < 0) {
587 mlog_errno(status);
588 goto out;
589 }
590
591 status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
592 if (status < 0) { 589 if (status < 0) {
593 mlog_errno(status); 590 mlog_errno(status);
594 goto out; 591 goto out;
@@ -659,12 +656,7 @@ static int ocfs2_remove_inode(struct inode *inode,
659 656
660 di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec); 657 di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
661 di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); 658 di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
662 659 ocfs2_journal_dirty(handle, di_bh);
663 status = ocfs2_journal_dirty(handle, di_bh);
664 if (status < 0) {
665 mlog_errno(status);
666 goto bail_commit;
667 }
668 660
669 ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh); 661 ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh);
670 dquot_free_inode(inode); 662 dquot_free_inode(inode);
@@ -980,7 +972,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode,
980void ocfs2_delete_inode(struct inode *inode) 972void ocfs2_delete_inode(struct inode *inode)
981{ 973{
982 int wipe, status; 974 int wipe, status;
983 sigset_t blocked, oldset; 975 sigset_t oldset;
984 struct buffer_head *di_bh = NULL; 976 struct buffer_head *di_bh = NULL;
985 977
986 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 978 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
@@ -1007,13 +999,7 @@ void ocfs2_delete_inode(struct inode *inode)
1007 * messaging paths may return us -ERESTARTSYS. Which would 999 * messaging paths may return us -ERESTARTSYS. Which would
1008 * cause us to exit early, resulting in inodes being orphaned 1000 * cause us to exit early, resulting in inodes being orphaned
1009 * forever. */ 1001 * forever. */
1010 sigfillset(&blocked); 1002 ocfs2_block_signals(&oldset);
1011 status = sigprocmask(SIG_BLOCK, &blocked, &oldset);
1012 if (status < 0) {
1013 mlog_errno(status);
1014 ocfs2_cleanup_delete_inode(inode, 1);
1015 goto bail;
1016 }
1017 1003
1018 /* 1004 /*
1019 * Synchronize us against ocfs2_get_dentry. We take this in 1005 * Synchronize us against ocfs2_get_dentry. We take this in
@@ -1087,9 +1073,7 @@ bail_unlock_nfs_sync:
1087 ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0); 1073 ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0);
1088 1074
1089bail_unblock: 1075bail_unblock:
1090 status = sigprocmask(SIG_SETMASK, &oldset, NULL); 1076 ocfs2_unblock_signals(&oldset);
1091 if (status < 0)
1092 mlog_errno(status);
1093bail: 1077bail:
1094 clear_inode(inode); 1078 clear_inode(inode);
1095 mlog_exit_void(); 1079 mlog_exit_void();
@@ -1123,6 +1107,10 @@ void ocfs2_clear_inode(struct inode *inode)
1123 ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres); 1107 ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres);
1124 ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); 1108 ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
1125 1109
1110 ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap,
1111 &oi->ip_la_data_resv);
1112 ocfs2_resv_init_once(&oi->ip_la_data_resv);
1113
1126 /* We very well may get a clear_inode before all an inodes 1114 /* We very well may get a clear_inode before all an inodes
1127 * metadata has hit disk. Of course, we can't drop any cluster 1115 * metadata has hit disk. Of course, we can't drop any cluster
1128 * locks until the journal has finished with it. The only 1116 * locks until the journal has finished with it. The only
@@ -1298,13 +1286,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
1298 fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); 1286 fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
1299 fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 1287 fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
1300 1288
1301 status = ocfs2_journal_dirty(handle, bh); 1289 ocfs2_journal_dirty(handle, bh);
1302 if (status < 0)
1303 mlog_errno(status);
1304
1305 status = 0;
1306leave: 1290leave:
1307
1308 mlog_exit(status); 1291 mlog_exit(status);
1309 return status; 1292 return status;
1310} 1293}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 0b28e1921a39..9f5f5fcadc45 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -70,6 +70,8 @@ struct ocfs2_inode_info
70 /* Only valid if the inode is the dir. */ 70 /* Only valid if the inode is the dir. */
71 u32 ip_last_used_slot; 71 u32 ip_last_used_slot;
72 u64 ip_last_used_group; 72 u64 ip_last_used_group;
73
74 struct ocfs2_alloc_reservation ip_la_data_resv;
73}; 75};
74 76
75/* 77/*
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 9336c60e3a36..47878cf16418 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -402,9 +402,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
402} 402}
403 403
404/* 404/*
405 * 'nblocks' is what you want to add to the current 405 * 'nblocks' is what you want to add to the current transaction.
406 * transaction. extend_trans will either extend the current handle by
407 * nblocks, or commit it and start a new one with nblocks credits.
408 * 406 *
409 * This might call jbd2_journal_restart() which will commit dirty buffers 407 * This might call jbd2_journal_restart() which will commit dirty buffers
410 * and then restart the transaction. Before calling 408 * and then restart the transaction. Before calling
@@ -422,11 +420,15 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
422 */ 420 */
423int ocfs2_extend_trans(handle_t *handle, int nblocks) 421int ocfs2_extend_trans(handle_t *handle, int nblocks)
424{ 422{
425 int status; 423 int status, old_nblocks;
426 424
427 BUG_ON(!handle); 425 BUG_ON(!handle);
428 BUG_ON(!nblocks); 426 BUG_ON(nblocks < 0);
427
428 if (!nblocks)
429 return 0;
429 430
431 old_nblocks = handle->h_buffer_credits;
430 mlog_entry_void(); 432 mlog_entry_void();
431 433
432 mlog(0, "Trying to extend transaction by %d blocks\n", nblocks); 434 mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
@@ -445,7 +447,8 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
445 mlog(0, 447 mlog(0,
446 "jbd2_journal_extend failed, trying " 448 "jbd2_journal_extend failed, trying "
447 "jbd2_journal_restart\n"); 449 "jbd2_journal_restart\n");
448 status = jbd2_journal_restart(handle, nblocks); 450 status = jbd2_journal_restart(handle,
451 old_nblocks + nblocks);
449 if (status < 0) { 452 if (status < 0) {
450 mlog_errno(status); 453 mlog_errno(status);
451 goto bail; 454 goto bail;
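A hedged sketch of the caller pattern the comment above describes (not code from this patch; extra_credits and di_bh are illustrative names). ocfs2_extend_trans() may restart the handle, so previously journaled buffers must be re-accessed afterwards; with this change a restart keeps the credits the handle already held rather than shrinking the transaction to just nblocks.

	ret = ocfs2_extend_trans(handle, extra_credits);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}
	/* The handle may have been restarted above; re-declare intent
	 * to modify the buffer before touching it again. */
	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);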
@@ -734,8 +737,7 @@ int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
734 return __ocfs2_journal_access(handle, ci, bh, NULL, type); 737 return __ocfs2_journal_access(handle, ci, bh, NULL, type);
735} 738}
736 739
737int ocfs2_journal_dirty(handle_t *handle, 740void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
738 struct buffer_head *bh)
739{ 741{
740 int status; 742 int status;
741 743
@@ -743,13 +745,9 @@ int ocfs2_journal_dirty(handle_t *handle,
743 (unsigned long long)bh->b_blocknr); 745 (unsigned long long)bh->b_blocknr);
744 746
745 status = jbd2_journal_dirty_metadata(handle, bh); 747 status = jbd2_journal_dirty_metadata(handle, bh);
746 if (status < 0) 748 BUG_ON(status);
747 mlog(ML_ERROR, "Could not dirty metadata buffer. "
748 "(bh->b_blocknr=%llu)\n",
749 (unsigned long long)bh->b_blocknr);
750 749
751 mlog_exit(status); 750 mlog_exit_void();
752 return status;
753} 751}
754 752
755#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) 753#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 3f74e09b0d80..b5baaa8e710f 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -325,8 +325,7 @@ int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
325 * <modify the bh> 325 * <modify the bh>
326 * ocfs2_journal_dirty(handle, bh); 326 * ocfs2_journal_dirty(handle, bh);
327 */ 327 */
328int ocfs2_journal_dirty(handle_t *handle, 328void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh);
329 struct buffer_head *bh);
330 329
331/* 330/*
332 * Credit Macros: 331 * Credit Macros:
@@ -562,6 +561,18 @@ static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
562 return blocks; 561 return blocks;
563} 562}
564 563
564/*
565 * Allocating a discontiguous block group requires the credits from
566 * ocfs2_calc_group_alloc_credits() as well as enough credits to fill
567 * the group descriptor's extent list. The caller already has started
568 * the transaction with ocfs2_calc_group_alloc_credits(). They extend
569 * it with these credits.
570 */
571static inline int ocfs2_calc_bg_discontig_credits(struct super_block *sb)
572{
573 return ocfs2_extent_recs_per_gd(sb);
574}
575
565static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, 576static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
566 unsigned int clusters_to_del, 577 unsigned int clusters_to_del,
567 struct ocfs2_dinode *fe, 578 struct ocfs2_dinode *fe,
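A hedged sketch of the two-step crediting the comment above describes (cpg, the error labels, and the second parameter of ocfs2_calc_group_alloc_credits() are assumptions, not from this hunk):

	handle = ocfs2_start_trans(osb,
			ocfs2_calc_group_alloc_credits(osb->sb, cpg));
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out;
	}

	/* Discontiguous groups additionally need room for bg_list. */
	if (ocfs2_supports_discontig_bg(osb)) {
		ret = ocfs2_extend_trans(handle,
				ocfs2_calc_bg_discontig_credits(osb->sb));
		if (ret < 0)
			goto out_commit;
	}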
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index c983715d8d8c..3d7419682dc0 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -52,7 +52,8 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
52 52
53static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, 53static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
54 struct ocfs2_dinode *alloc, 54 struct ocfs2_dinode *alloc,
55 u32 numbits); 55 u32 *numbits,
56 struct ocfs2_alloc_reservation *resv);
56 57
57static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc); 58static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc);
58 59
@@ -74,6 +75,144 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
74static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, 75static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
75 struct inode *local_alloc_inode); 76 struct inode *local_alloc_inode);
76 77
78/*
79 * ocfs2_la_default_mb() - determine a default size, in megabytes of
80 * the local alloc.
81 *
82 * Generally, we'd like to pick as large a local alloc as
83 * possible. Performance on large workloads tends to scale
84 * proportionally to la size. In addition to that, the reservations
85 * code functions more efficiently as it can reserve more windows for
86 * write.
87 *
88 * Some things work against us when trying to choose a large local alloc:
89 *
90 * - We need to ensure our sizing is picked to leave enough space in
91 * group descriptors for other allocations (such as block groups,
92 * etc). Picking default sizes which are a multiple of 4 could help,
93 * as block groups are allocated in 2MB and 4MB chunks.
94 *
95 * - Likewise, we don't want to starve other nodes of bits on small
96 * file systems. This can easily be taken care of by limiting our
97 * default to a reasonable size (256M) on larger cluster sizes.
98 *
99 * - Some file systems can't support very large sizes - 4k and 8k in
100 * particular are limited to less than 128 and 256 megabytes respectively.
101 *
102 * The following reference table shows group descriptor and local
103 * alloc maximums at various cluster sizes (4k blocksize)
104 *
105 * csize: 4K group: 126M la: 121M
106 * csize: 8K group: 252M la: 243M
107 * csize: 16K group: 504M la: 486M
108 * csize: 32K group: 1008M la: 972M
109 * csize: 64K group: 2016M la: 1944M
110 * csize: 128K group: 4032M la: 3888M
111 * csize: 256K group: 8064M la: 7776M
112 * csize: 512K group: 16128M la: 15552M
113 * csize: 1024K group: 32256M la: 31104M
114 */
115#define OCFS2_LA_MAX_DEFAULT_MB 256
116#define OCFS2_LA_OLD_DEFAULT 8
117unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb)
118{
119 unsigned int la_mb;
120 unsigned int gd_mb;
121 unsigned int megs_per_slot;
122 struct super_block *sb = osb->sb;
123
124 gd_mb = ocfs2_clusters_to_megabytes(osb->sb,
125 8 * ocfs2_group_bitmap_size(sb, 0, osb->s_feature_incompat));
126
127 /*
128 * This takes care of file systems with very small group
129 * descriptors - 512 byte blocksize at cluster sizes lower
130 * than 16K and also 1k blocksize with 4k cluster size.
131 */
132 if ((sb->s_blocksize == 512 && osb->s_clustersize <= 8192)
133 || (sb->s_blocksize == 1024 && osb->s_clustersize == 4096))
134 return OCFS2_LA_OLD_DEFAULT;
135
136 /*
137 * Leave enough room for some block groups and make the final
138 * value we work from a multiple of 4.
139 */
140 gd_mb -= 16;
141 gd_mb &= 0xFFFFFFFC;
142
143 la_mb = gd_mb;
144
145 /*
146 * Keep window sizes down to a reasonable default
147 */
148 if (la_mb > OCFS2_LA_MAX_DEFAULT_MB) {
149 /*
150 * Some clustersize / blocksize combinations will have
151 * given us a larger than OCFS2_LA_MAX_DEFAULT_MB
152 * default size, but get poor distribution when
153 * limited to exactly 256 megabytes.
154 *
155 * As an example, 16K clustersize at 4K blocksize
156 * gives us a cluster group size of 504M. Paring the
157 * local alloc size down to 256, however, would give us
158 * only one window and around 200MB left in the
159 * cluster group. Instead, find the first size below
160 * 256 which would give us an even distribution.
161 *
162 * Larger cluster group sizes actually work out pretty
163 * well when pared to 256, so we don't have to do this
164 * for any group that fits more than two
165 * OCFS2_LA_MAX_DEFAULT_MB windows.
166 */
167 if (gd_mb > (2 * OCFS2_LA_MAX_DEFAULT_MB))
168 la_mb = 256;
169 else {
170 unsigned int gd_mult = gd_mb;
171
172 while (gd_mult > 256)
173 gd_mult = gd_mult >> 1;
174
175 la_mb = gd_mult;
176 }
177 }
178
179 megs_per_slot = osb->osb_clusters_at_boot / osb->max_slots;
180 megs_per_slot = ocfs2_clusters_to_megabytes(osb->sb, megs_per_slot);
181 /* Too many nodes, too few disk clusters. */
182 if (megs_per_slot < la_mb)
183 la_mb = megs_per_slot;
184
185 return la_mb;
186}
187
188void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb)
189{
190 struct super_block *sb = osb->sb;
191 unsigned int la_default_mb = ocfs2_la_default_mb(osb);
192 unsigned int la_max_mb;
193
194 la_max_mb = ocfs2_clusters_to_megabytes(sb,
195 ocfs2_local_alloc_size(sb) * 8);
196
197 mlog(0, "requested: %dM, max: %uM, default: %uM\n",
198 requested_mb, la_max_mb, la_default_mb);
199
200 if (requested_mb == -1) {
201 /* No user request - use defaults */
202 osb->local_alloc_default_bits =
203 ocfs2_megabytes_to_clusters(sb, la_default_mb);
204 } else if (requested_mb > la_max_mb) {
205 /* Request is too big, we give the maximum available */
206 osb->local_alloc_default_bits =
207 ocfs2_megabytes_to_clusters(sb, la_max_mb);
208 } else {
209 osb->local_alloc_default_bits =
210 ocfs2_megabytes_to_clusters(sb, requested_mb);
211 }
212
213 osb->local_alloc_bits = osb->local_alloc_default_bits;
214}
215
77static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb) 216static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
78{ 217{
79 return (osb->local_alloc_state == OCFS2_LA_THROTTLED || 218 return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
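Working the comment's 16K-clustersize / 4K-blocksize case through the function above, as a standalone sketch (the 504M group span comes from the reference table; this is illustrative arithmetic, not ocfs2 code):

#include <stdio.h>

int main(void)
{
	unsigned int gd_mb = 504;	/* group span from the table above */
	unsigned int la_mb;

	gd_mb -= 16;			/* leave room for block groups */
	gd_mb &= 0xFFFFFFFC;		/* multiple of 4: still 488 */

	la_mb = gd_mb;
	if (la_mb > 256) {
		if (gd_mb > 2 * 256) {
			la_mb = 256;
		} else {
			unsigned int gd_mult = gd_mb;

			while (gd_mult > 256)
				gd_mult >>= 1;	/* 488 -> 244 */
			la_mb = gd_mult;
		}
	}
	printf("default la: %uM\n", la_mb);	/* 244M, an even split */
	return 0;
}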
@@ -156,7 +295,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
156 osb->local_alloc_bits, (osb->bitmap_cpg - 1)); 295 osb->local_alloc_bits, (osb->bitmap_cpg - 1));
157 osb->local_alloc_bits = 296 osb->local_alloc_bits =
158 ocfs2_megabytes_to_clusters(osb->sb, 297 ocfs2_megabytes_to_clusters(osb->sb,
159 OCFS2_DEFAULT_LOCAL_ALLOC_SIZE); 298 ocfs2_la_default_mb(osb));
160 } 299 }
161 300
162 /* read the alloc off disk */ 301 /* read the alloc off disk */
@@ -262,6 +401,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
262 401
263 osb->local_alloc_state = OCFS2_LA_DISABLED; 402 osb->local_alloc_state = OCFS2_LA_DISABLED;
264 403
404 ocfs2_resmap_uninit(&osb->osb_la_resmap);
405
265 main_bm_inode = ocfs2_get_system_file_inode(osb, 406 main_bm_inode = ocfs2_get_system_file_inode(osb,
266 GLOBAL_BITMAP_SYSTEM_INODE, 407 GLOBAL_BITMAP_SYSTEM_INODE,
267 OCFS2_INVALID_SLOT); 408 OCFS2_INVALID_SLOT);
@@ -305,12 +446,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
305 } 446 }
306 447
307 ocfs2_clear_local_alloc(alloc); 448 ocfs2_clear_local_alloc(alloc);
308 449 ocfs2_journal_dirty(handle, bh);
309 status = ocfs2_journal_dirty(handle, bh);
310 if (status < 0) {
311 mlog_errno(status);
312 goto out_commit;
313 }
314 450
315 brelse(bh); 451 brelse(bh);
316 osb->local_alloc_bh = NULL; 452 osb->local_alloc_bh = NULL;
@@ -481,46 +617,6 @@ out:
481 return status; 617 return status;
482} 618}
483 619
484/* Check to see if the local alloc window is within ac->ac_max_block */
485static int ocfs2_local_alloc_in_range(struct inode *inode,
486 struct ocfs2_alloc_context *ac,
487 u32 bits_wanted)
488{
489 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
490 struct ocfs2_dinode *alloc;
491 struct ocfs2_local_alloc *la;
492 int start;
493 u64 block_off;
494
495 if (!ac->ac_max_block)
496 return 1;
497
498 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
499 la = OCFS2_LOCAL_ALLOC(alloc);
500
501 start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
502 if (start == -1) {
503 mlog_errno(-ENOSPC);
504 return 0;
505 }
506
507 /*
508 * Converting (bm_off + start + bits_wanted) to blocks gives us
509 * the blkno just past our actual allocation. This is perfect
510 * to compare with ac_max_block.
511 */
512 block_off = ocfs2_clusters_to_blocks(inode->i_sb,
513 le32_to_cpu(la->la_bm_off) +
514 start + bits_wanted);
515 mlog(0, "Checking %llu against %llu\n",
516 (unsigned long long)block_off,
517 (unsigned long long)ac->ac_max_block);
518 if (block_off > ac->ac_max_block)
519 return 0;
520
521 return 1;
522}
523
524/* 620/*
525 * make sure we've got at least bits_wanted contiguous bits in the 621 * make sure we've got at least bits_wanted contiguous bits in the
526 * local alloc. You lose them when you drop i_mutex. 622 * local alloc. You lose them when you drop i_mutex.
@@ -613,17 +709,6 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
613 mlog(0, "Calling in_range for max block %llu\n", 709 mlog(0, "Calling in_range for max block %llu\n",
614 (unsigned long long)ac->ac_max_block); 710 (unsigned long long)ac->ac_max_block);
615 711
616 if (!ocfs2_local_alloc_in_range(local_alloc_inode, ac,
617 bits_wanted)) {
618 /*
619 * The window is outside ac->ac_max_block.
620 * This errno tells the caller to keep localalloc enabled
621 * but to get the allocation from the main bitmap.
622 */
623 status = -EFBIG;
624 goto bail;
625 }
626
627 ac->ac_inode = local_alloc_inode; 712 ac->ac_inode = local_alloc_inode;
628 /* We should never use localalloc from another slot */ 713 /* We should never use localalloc from another slot */
629 ac->ac_alloc_slot = osb->slot_num; 714 ac->ac_alloc_slot = osb->slot_num;
@@ -664,7 +749,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
664 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; 749 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
665 la = OCFS2_LOCAL_ALLOC(alloc); 750 la = OCFS2_LOCAL_ALLOC(alloc);
666 751
667 start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted); 752 start = ocfs2_local_alloc_find_clear_bits(osb, alloc, &bits_wanted,
753 ac->ac_resv);
668 if (start == -1) { 754 if (start == -1) {
669 /* TODO: Shouldn't we just BUG here? */ 755 /* TODO: Shouldn't we just BUG here? */
670 status = -ENOSPC; 756 status = -ENOSPC;
@@ -674,8 +760,6 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
674 760
675 bitmap = la->la_bitmap; 761 bitmap = la->la_bitmap;
676 *bit_off = le32_to_cpu(la->la_bm_off) + start; 762 *bit_off = le32_to_cpu(la->la_bm_off) + start;
677 /* local alloc is always contiguous by nature -- we never
678 * delete bits from it! */
679 *num_bits = bits_wanted; 763 *num_bits = bits_wanted;
680 764
681 status = ocfs2_journal_access_di(handle, 765 status = ocfs2_journal_access_di(handle,
@@ -687,18 +771,15 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
687 goto bail; 771 goto bail;
688 } 772 }
689 773
774 ocfs2_resmap_claimed_bits(&osb->osb_la_resmap, ac->ac_resv, start,
775 bits_wanted);
776
690 while(bits_wanted--) 777 while(bits_wanted--)
691 ocfs2_set_bit(start++, bitmap); 778 ocfs2_set_bit(start++, bitmap);
692 779
693 le32_add_cpu(&alloc->id1.bitmap1.i_used, *num_bits); 780 le32_add_cpu(&alloc->id1.bitmap1.i_used, *num_bits);
781 ocfs2_journal_dirty(handle, osb->local_alloc_bh);
694 782
695 status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
696 if (status < 0) {
697 mlog_errno(status);
698 goto bail;
699 }
700
701 status = 0;
702bail: 783bail:
703 mlog_exit(status); 784 mlog_exit(status);
704 return status; 785 return status;
@@ -722,13 +803,17 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
722} 803}
723 804
724static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, 805static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
725 struct ocfs2_dinode *alloc, 806 struct ocfs2_dinode *alloc,
726 u32 numbits) 807 u32 *numbits,
808 struct ocfs2_alloc_reservation *resv)
727{ 809{
728 int numfound, bitoff, left, startoff, lastzero; 810 int numfound, bitoff, left, startoff, lastzero;
811 int local_resv = 0;
812 struct ocfs2_alloc_reservation r;
729 void *bitmap = NULL; 813 void *bitmap = NULL;
814 struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap;
730 815
731 mlog_entry("(numbits wanted = %u)\n", numbits); 816 mlog_entry("(numbits wanted = %u)\n", *numbits);
732 817
733 if (!alloc->id1.bitmap1.i_total) { 818 if (!alloc->id1.bitmap1.i_total) {
734 mlog(0, "No bits in my window!\n"); 819 mlog(0, "No bits in my window!\n");
@@ -736,6 +821,30 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
736 goto bail; 821 goto bail;
737 } 822 }
738 823
824 if (!resv) {
825 local_resv = 1;
826 ocfs2_resv_init_once(&r);
827 ocfs2_resv_set_type(&r, OCFS2_RESV_FLAG_TMP);
828 resv = &r;
829 }
830
831 numfound = *numbits;
832 if (ocfs2_resmap_resv_bits(resmap, resv, &bitoff, &numfound) == 0) {
833 if (numfound < *numbits)
834 *numbits = numfound;
835 goto bail;
836 }
837
838 /*
839 * Code error. While reservations are enabled, local
840 * allocation should _always_ go through them.
841 */
842 BUG_ON(osb->osb_resv_level != 0);
843
844 /*
845 * Reservations are disabled. Handle this the old way.
846 */
847
739 bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap; 848 bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;
740 849
741 numfound = bitoff = startoff = 0; 850 numfound = bitoff = startoff = 0;
@@ -761,7 +870,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
761 startoff = bitoff+1; 870 startoff = bitoff+1;
762 } 871 }
763 /* we got everything we needed */ 872 /* we got everything we needed */
764 if (numfound == numbits) { 873 if (numfound == *numbits) {
765 /* mlog(0, "Found it all!\n"); */ 874 /* mlog(0, "Found it all!\n"); */
766 break; 875 break;
767 } 876 }
@@ -770,12 +879,15 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
770 mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff, 879 mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff,
771 numfound); 880 numfound);
772 881
773 if (numfound == numbits) 882 if (numfound == *numbits)
774 bitoff = startoff - numfound; 883 bitoff = startoff - numfound;
775 else 884 else
776 bitoff = -1; 885 bitoff = -1;
777 886
778bail: 887bail:
888 if (local_resv)
889 ocfs2_resv_discard(resmap, resv);
890
779 mlog_exit(bitoff); 891 mlog_exit(bitoff);
780 return bitoff; 892 return bitoff;
781} 893}
@@ -1049,7 +1161,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
1049 /* we used the generic suballoc reserve function, but we set 1161 /* we used the generic suballoc reserve function, but we set
1050 * everything up nicely, so there's no reason why we can't use 1162 * everything up nicely, so there's no reason why we can't use
1051 * the more specific cluster api to claim bits. */ 1163 * the more specific cluster api to claim bits. */
1052 status = ocfs2_claim_clusters(osb, handle, ac, osb->local_alloc_bits, 1164 status = ocfs2_claim_clusters(handle, ac, osb->local_alloc_bits,
1053 &cluster_off, &cluster_count); 1165 &cluster_off, &cluster_count);
1054 if (status == -ENOSPC) { 1166 if (status == -ENOSPC) {
1055retry_enospc: 1167retry_enospc:
@@ -1063,7 +1175,7 @@ retry_enospc:
1063 goto bail; 1175 goto bail;
1064 1176
1065 ac->ac_bits_wanted = osb->local_alloc_default_bits; 1177 ac->ac_bits_wanted = osb->local_alloc_default_bits;
1066 status = ocfs2_claim_clusters(osb, handle, ac, 1178 status = ocfs2_claim_clusters(handle, ac,
1067 osb->local_alloc_bits, 1179 osb->local_alloc_bits,
1068 &cluster_off, 1180 &cluster_off,
1069 &cluster_count); 1181 &cluster_count);
@@ -1098,6 +1210,9 @@ retry_enospc:
1098 memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0, 1210 memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0,
1099 le16_to_cpu(la->la_size)); 1211 le16_to_cpu(la->la_size));
1100 1212
1213 ocfs2_resmap_restart(&osb->osb_la_resmap, cluster_count,
1214 OCFS2_LOCAL_ALLOC(alloc)->la_bitmap);
1215
1101 mlog(0, "New window allocated:\n"); 1216 mlog(0, "New window allocated:\n");
1102 mlog(0, "window la_bm_off = %u\n", 1217 mlog(0, "window la_bm_off = %u\n",
1103 OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); 1218 OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
@@ -1169,12 +1284,7 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
1169 } 1284 }
1170 1285
1171 ocfs2_clear_local_alloc(alloc); 1286 ocfs2_clear_local_alloc(alloc);
1172 1287 ocfs2_journal_dirty(handle, osb->local_alloc_bh);
1173 status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
1174 if (status < 0) {
1175 mlog_errno(status);
1176 goto bail;
1177 }
1178 1288
1179 status = ocfs2_sync_local_to_main(osb, handle, alloc_copy, 1289 status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
1180 main_bm_inode, main_bm_bh); 1290 main_bm_inode, main_bm_bh);
@@ -1192,7 +1302,6 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
1192 1302
1193 atomic_inc(&osb->alloc_stats.moves); 1303 atomic_inc(&osb->alloc_stats.moves);
1194 1304
1195 status = 0;
1196bail: 1305bail:
1197 if (handle) 1306 if (handle)
1198 ocfs2_commit_trans(osb, handle); 1307 ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index ac5ea9f86653..1be9b5864460 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -30,6 +30,9 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb);
30 30
31void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb); 31void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb);
32 32
33void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb);
34unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb);
35
33int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, 36int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
34 int node_num, 37 int node_num,
35 struct ocfs2_dinode **alloc_copy); 38 struct ocfs2_dinode **alloc_copy);
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 7898bd3a99f5..af2b8fe1f139 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -41,44 +41,20 @@
41#include "file.h" 41#include "file.h"
42#include "inode.h" 42#include "inode.h"
43#include "mmap.h" 43#include "mmap.h"
44#include "super.h"
44 45
45static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset)
46{
47 /* The best way to deal with signals in the vm path is
48 * to block them upfront, rather than allowing the
49 * locking paths to return -ERESTARTSYS. */
50 sigfillset(blocked);
51
52 /* We should technically never get a bad return value
53 * from sigprocmask */
54 return sigprocmask(SIG_BLOCK, blocked, oldset);
55}
56
57static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset)
58{
59 return sigprocmask(SIG_SETMASK, oldset, NULL);
60}
61 46
62static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf) 47static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
63{ 48{
64 sigset_t blocked, oldset; 49 sigset_t oldset;
65 int error, ret; 50 int ret;
66 51
67 mlog_entry("(area=%p, page offset=%lu)\n", area, vmf->pgoff); 52 mlog_entry("(area=%p, page offset=%lu)\n", area, vmf->pgoff);
68 53
69 error = ocfs2_vm_op_block_sigs(&blocked, &oldset); 54 ocfs2_block_signals(&oldset);
70 if (error < 0) {
71 mlog_errno(error);
72 ret = VM_FAULT_SIGBUS;
73 goto out;
74 }
75
76 ret = filemap_fault(area, vmf); 55 ret = filemap_fault(area, vmf);
56 ocfs2_unblock_signals(&oldset);
77 57
78 error = ocfs2_vm_op_unblock_sigs(&oldset);
79 if (error < 0)
80 mlog_errno(error);
81out:
82 mlog_exit_ptr(vmf->page); 58 mlog_exit_ptr(vmf->page);
83 return ret; 59 return ret;
84} 60}
@@ -158,14 +134,10 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
158 struct page *page = vmf->page; 134 struct page *page = vmf->page;
159 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 135 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
160 struct buffer_head *di_bh = NULL; 136 struct buffer_head *di_bh = NULL;
161 sigset_t blocked, oldset; 137 sigset_t oldset;
162 int ret, ret2; 138 int ret;
163 139
164 ret = ocfs2_vm_op_block_sigs(&blocked, &oldset); 140 ocfs2_block_signals(&oldset);
165 if (ret < 0) {
166 mlog_errno(ret);
167 return ret;
168 }
169 141
170 /* 142 /*
171 * The cluster locks taken will block a truncate from another 143 * The cluster locks taken will block a truncate from another
@@ -193,9 +165,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
193 ocfs2_inode_unlock(inode, 1); 165 ocfs2_inode_unlock(inode, 1);
194 166
195out: 167out:
196 ret2 = ocfs2_vm_op_unblock_sigs(&oldset); 168 ocfs2_unblock_signals(&oldset);
197 if (ret2 < 0)
198 mlog_errno(ret2);
199 if (ret) 169 if (ret)
200 ret = VM_FAULT_SIGBUS; 170 ret = VM_FAULT_SIGBUS;
201 return ret; 171 return ret;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 4cbb18f26c5f..f171b51a74f7 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -204,14 +204,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
204 inode->i_nlink = 2; 204 inode->i_nlink = 2;
205 else 205 else
206 inode->i_nlink = 1; 206 inode->i_nlink = 1;
207 inode->i_uid = current_fsuid(); 207 inode_init_owner(inode, dir, mode);
208 if (dir->i_mode & S_ISGID) {
209 inode->i_gid = dir->i_gid;
210 if (S_ISDIR(mode))
211 mode |= S_ISGID;
212 } else
213 inode->i_gid = current_fsgid();
214 inode->i_mode = mode;
215 dquot_initialize(inode); 208 dquot_initialize(inode);
216 return inode; 209 return inode;
217} 210}
@@ -239,6 +232,8 @@ static int ocfs2_mknod(struct inode *dir,
239 }; 232 };
240 int did_quota_inode = 0; 233 int did_quota_inode = 0;
241 struct ocfs2_dir_lookup_result lookup = { NULL, }; 234 struct ocfs2_dir_lookup_result lookup = { NULL, };
235 sigset_t oldset;
236 int did_block_signals = 0;
242 237
243 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, 238 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
244 (unsigned long)dev, dentry->d_name.len, 239 (unsigned long)dev, dentry->d_name.len,
@@ -350,6 +345,10 @@ static int ocfs2_mknod(struct inode *dir,
350 goto leave; 345 goto leave;
351 } 346 }
352 347
348 /* Starting to change things, restart is no longer possible. */
349 ocfs2_block_signals(&oldset);
350 did_block_signals = 1;
351
353 status = dquot_alloc_inode(inode); 352 status = dquot_alloc_inode(inode);
354 if (status) 353 if (status)
355 goto leave; 354 goto leave;
@@ -384,11 +383,7 @@ static int ocfs2_mknod(struct inode *dir,
384 goto leave; 383 goto leave;
385 } 384 }
386 ocfs2_add_links_count(dirfe, 1); 385 ocfs2_add_links_count(dirfe, 1);
387 status = ocfs2_journal_dirty(handle, parent_fe_bh); 386 ocfs2_journal_dirty(handle, parent_fe_bh);
388 if (status < 0) {
389 mlog_errno(status);
390 goto leave;
391 }
392 inc_nlink(dir); 387 inc_nlink(dir);
393 } 388 }
394 389
@@ -439,6 +434,8 @@ leave:
439 ocfs2_commit_trans(osb, handle); 434 ocfs2_commit_trans(osb, handle);
440 435
441 ocfs2_inode_unlock(dir, 1); 436 ocfs2_inode_unlock(dir, 1);
437 if (did_block_signals)
438 ocfs2_unblock_signals(&oldset);
442 439
443 if (status == -ENOSPC) 440 if (status == -ENOSPC)
444 mlog(0, "Disk is full\n"); 441 mlog(0, "Disk is full\n");
@@ -487,14 +484,15 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
487 int status = 0; 484 int status = 0;
488 struct ocfs2_dinode *fe = NULL; 485 struct ocfs2_dinode *fe = NULL;
489 struct ocfs2_extent_list *fel; 486 struct ocfs2_extent_list *fel;
490 u64 fe_blkno = 0; 487 u64 suballoc_loc, fe_blkno = 0;
491 u16 suballoc_bit; 488 u16 suballoc_bit;
492 u16 feat; 489 u16 feat;
493 490
494 *new_fe_bh = NULL; 491 *new_fe_bh = NULL;
495 492
496 status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh, 493 status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh,
497 inode_ac, &suballoc_bit, &fe_blkno); 494 inode_ac, &suballoc_loc,
495 &suballoc_bit, &fe_blkno);
498 if (status < 0) { 496 if (status < 0) {
499 mlog_errno(status); 497 mlog_errno(status);
500 goto leave; 498 goto leave;
@@ -531,6 +529,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
531 fe->i_generation = cpu_to_le32(inode->i_generation); 529 fe->i_generation = cpu_to_le32(inode->i_generation);
532 fe->i_fs_generation = cpu_to_le32(osb->fs_generation); 530 fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
533 fe->i_blkno = cpu_to_le64(fe_blkno); 531 fe->i_blkno = cpu_to_le64(fe_blkno);
532 fe->i_suballoc_loc = cpu_to_le64(suballoc_loc);
534 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); 533 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
535 fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot); 534 fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
536 fe->i_uid = cpu_to_le32(inode->i_uid); 535 fe->i_uid = cpu_to_le32(inode->i_uid);
@@ -567,11 +566,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
567 fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb)); 566 fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb));
568 } 567 }
569 568
570 status = ocfs2_journal_dirty(handle, *new_fe_bh); 569 ocfs2_journal_dirty(handle, *new_fe_bh);
571 if (status < 0) {
572 mlog_errno(status);
573 goto leave;
574 }
575 570
576 ocfs2_populate_inode(inode, fe, 1); 571 ocfs2_populate_inode(inode, fe, 1);
577 ocfs2_ci_set_new(osb, INODE_CACHE(inode)); 572 ocfs2_ci_set_new(osb, INODE_CACHE(inode));
@@ -637,6 +632,7 @@ static int ocfs2_link(struct dentry *old_dentry,
637 struct ocfs2_dinode *fe = NULL; 632 struct ocfs2_dinode *fe = NULL;
638 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 633 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
639 struct ocfs2_dir_lookup_result lookup = { NULL, }; 634 struct ocfs2_dir_lookup_result lookup = { NULL, };
635 sigset_t oldset;
640 636
641 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino, 637 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
642 old_dentry->d_name.len, old_dentry->d_name.name, 638 old_dentry->d_name.len, old_dentry->d_name.name,
@@ -693,6 +689,9 @@ static int ocfs2_link(struct dentry *old_dentry,
693 goto out_unlock_inode; 689 goto out_unlock_inode;
694 } 690 }
695 691
692 /* Starting to change things, restart is no longer possible. */
693 ocfs2_block_signals(&oldset);
694
696 err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, 695 err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
697 OCFS2_JOURNAL_ACCESS_WRITE); 696 OCFS2_JOURNAL_ACCESS_WRITE);
698 if (err < 0) { 697 if (err < 0) {
@@ -705,14 +704,7 @@ static int ocfs2_link(struct dentry *old_dentry,
705 ocfs2_set_links_count(fe, inode->i_nlink); 704 ocfs2_set_links_count(fe, inode->i_nlink);
706 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); 705 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
707 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 706 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
708 707 ocfs2_journal_dirty(handle, fe_bh);
709 err = ocfs2_journal_dirty(handle, fe_bh);
710 if (err < 0) {
711 ocfs2_add_links_count(fe, -1);
712 drop_nlink(inode);
713 mlog_errno(err);
714 goto out_commit;
715 }
716 708
717 err = ocfs2_add_entry(handle, dentry, inode, 709 err = ocfs2_add_entry(handle, dentry, inode,
718 OCFS2_I(inode)->ip_blkno, 710 OCFS2_I(inode)->ip_blkno,
@@ -736,6 +728,7 @@ static int ocfs2_link(struct dentry *old_dentry,
736 728
737out_commit: 729out_commit:
738 ocfs2_commit_trans(osb, handle); 730 ocfs2_commit_trans(osb, handle);
731 ocfs2_unblock_signals(&oldset);
739out_unlock_inode: 732out_unlock_inode:
740 ocfs2_inode_unlock(inode, 1); 733 ocfs2_inode_unlock(inode, 1);
741 734
@@ -909,12 +902,7 @@ static int ocfs2_unlink(struct inode *dir,
909 drop_nlink(inode); 902 drop_nlink(inode);
910 drop_nlink(inode); 903 drop_nlink(inode);
911 ocfs2_set_links_count(fe, inode->i_nlink); 904 ocfs2_set_links_count(fe, inode->i_nlink);
912 905 ocfs2_journal_dirty(handle, fe_bh);
913 status = ocfs2_journal_dirty(handle, fe_bh);
914 if (status < 0) {
915 mlog_errno(status);
916 goto leave;
917 }
918 906
919 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 907 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
920 if (S_ISDIR(inode->i_mode)) 908 if (S_ISDIR(inode->i_mode))
@@ -1332,12 +1320,7 @@ static int ocfs2_rename(struct inode *old_dir,
1332 ocfs2_set_links_count(newfe, 0); 1320 ocfs2_set_links_count(newfe, 0);
1333 else 1321 else
1334 ocfs2_add_links_count(newfe, -1); 1322 ocfs2_add_links_count(newfe, -1);
1335 1323 ocfs2_journal_dirty(handle, newfe_bh);
1336 status = ocfs2_journal_dirty(handle, newfe_bh);
1337 if (status < 0) {
1338 mlog_errno(status);
1339 goto bail;
1340 }
1341 } else { 1324 } else {
1342 /* if the name was not found in new_dir, add it now */ 1325 /* if the name was not found in new_dir, add it now */
1343 status = ocfs2_add_entry(handle, new_dentry, old_inode, 1326 status = ocfs2_add_entry(handle, new_dentry, old_inode,
@@ -1356,10 +1339,7 @@ static int ocfs2_rename(struct inode *old_dir,
1356 1339
1357 old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec); 1340 old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec);
1358 old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec); 1341 old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec);
1359 1342 ocfs2_journal_dirty(handle, old_inode_bh);
1360 status = ocfs2_journal_dirty(handle, old_inode_bh);
1361 if (status < 0)
1362 mlog_errno(status);
1363 } else 1343 } else
1364 mlog_errno(status); 1344 mlog_errno(status);
1365 1345
@@ -1431,7 +1411,7 @@ static int ocfs2_rename(struct inode *old_dir,
1431 OCFS2_JOURNAL_ACCESS_WRITE); 1411 OCFS2_JOURNAL_ACCESS_WRITE);
1432 fe = (struct ocfs2_dinode *) old_dir_bh->b_data; 1412 fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
1433 ocfs2_set_links_count(fe, old_dir->i_nlink); 1413 ocfs2_set_links_count(fe, old_dir->i_nlink);
1434 status = ocfs2_journal_dirty(handle, old_dir_bh); 1414 ocfs2_journal_dirty(handle, old_dir_bh);
1435 } 1415 }
1436 } 1416 }
1437 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); 1417 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
@@ -1563,11 +1543,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
1563 (bytes_left > sb->s_blocksize) ? sb->s_blocksize : 1543 (bytes_left > sb->s_blocksize) ? sb->s_blocksize :
1564 bytes_left); 1544 bytes_left);
1565 1545
1566 status = ocfs2_journal_dirty(handle, bhs[virtual]); 1546 ocfs2_journal_dirty(handle, bhs[virtual]);
1567 if (status < 0) {
1568 mlog_errno(status);
1569 goto bail;
1570 }
1571 1547
1572 virtual++; 1548 virtual++;
1573 p_blkno++; 1549 p_blkno++;
@@ -1611,6 +1587,8 @@ static int ocfs2_symlink(struct inode *dir,
1611 }; 1587 };
1612 int did_quota = 0, did_quota_inode = 0; 1588 int did_quota = 0, did_quota_inode = 0;
1613 struct ocfs2_dir_lookup_result lookup = { NULL, }; 1589 struct ocfs2_dir_lookup_result lookup = { NULL, };
1590 sigset_t oldset;
1591 int did_block_signals = 0;
1614 1592
1615 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, 1593 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
1616 dentry, symname, dentry->d_name.len, dentry->d_name.name); 1594 dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1706,6 +1684,10 @@ static int ocfs2_symlink(struct inode *dir,
1706 goto bail; 1684 goto bail;
1707 } 1685 }
1708 1686
1687 /* Starting to change things, restart is no longer possible. */
1688 ocfs2_block_signals(&oldset);
1689 did_block_signals = 1;
1690
1709 status = dquot_alloc_inode(inode); 1691 status = dquot_alloc_inode(inode);
1710 if (status) 1692 if (status)
1711 goto bail; 1693 goto bail;
@@ -1814,6 +1796,8 @@ bail:
1814 ocfs2_commit_trans(osb, handle); 1796 ocfs2_commit_trans(osb, handle);
1815 1797
1816 ocfs2_inode_unlock(dir, 1); 1798 ocfs2_inode_unlock(dir, 1);
1799 if (did_block_signals)
1800 ocfs2_unblock_signals(&oldset);
1817 1801
1818 brelse(new_fe_bh); 1802 brelse(new_fe_bh);
1819 brelse(parent_fe_bh); 1803 brelse(parent_fe_bh);
@@ -1961,12 +1945,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1961 if (S_ISDIR(inode->i_mode)) 1945 if (S_ISDIR(inode->i_mode))
1962 ocfs2_add_links_count(orphan_fe, 1); 1946 ocfs2_add_links_count(orphan_fe, 1);
1963 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); 1947 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
1964 1948 ocfs2_journal_dirty(handle, orphan_dir_bh);
1965 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
1966 if (status < 0) {
1967 mlog_errno(status);
1968 goto leave;
1969 }
1970 1949
1971 status = __ocfs2_add_entry(handle, orphan_dir_inode, name, 1950 status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
1972 OCFS2_ORPHAN_NAMELEN, inode, 1951 OCFS2_ORPHAN_NAMELEN, inode,
@@ -2065,12 +2044,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2065 if (S_ISDIR(inode->i_mode)) 2044 if (S_ISDIR(inode->i_mode))
2066 ocfs2_add_links_count(orphan_fe, -1); 2045 ocfs2_add_links_count(orphan_fe, -1);
2067 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe); 2046 orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
2068 2047 ocfs2_journal_dirty(handle, orphan_dir_bh);
2069 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
2070 if (status < 0) {
2071 mlog_errno(status);
2072 goto leave;
2073 }
2074 2048
2075leave: 2049leave:
2076 ocfs2_free_dir_lookup_result(&lookup); 2050 ocfs2_free_dir_lookup_result(&lookup);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index adf5e2ebc2c4..c67003b6b5a2 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -47,6 +47,7 @@
47/* For struct ocfs2_blockcheck_stats */ 47/* For struct ocfs2_blockcheck_stats */
48#include "blockcheck.h" 48#include "blockcheck.h"
49 49
50#include "reservations.h"
50 51
51/* Caching of metadata buffers */ 52/* Caching of metadata buffers */
52 53
@@ -341,6 +342,9 @@ struct ocfs2_super
341 */ 342 */
342 unsigned int local_alloc_bits; 343 unsigned int local_alloc_bits;
343 unsigned int local_alloc_default_bits; 344 unsigned int local_alloc_default_bits;
345 /* osb_clusters_at_boot can become stale! Do not trust it to
346 * be up to date. */
347 unsigned int osb_clusters_at_boot;
344 348
345 enum ocfs2_local_alloc_state local_alloc_state; /* protected 349 enum ocfs2_local_alloc_state local_alloc_state; /* protected
346 * by osb_lock */ 350 * by osb_lock */
@@ -349,6 +353,11 @@ struct ocfs2_super
349 353
350 u64 la_last_gd; 354 u64 la_last_gd;
351 355
356 struct ocfs2_reservation_map osb_la_resmap;
357
358 unsigned int osb_resv_level;
359 unsigned int osb_dir_resv_level;
360
352 /* Next three fields are for local node slot recovery during 361 /* Next three fields are for local node slot recovery during
353 * mount. */ 362 * mount. */
354 int dirty; 363 int dirty;
@@ -482,6 +491,13 @@ static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb)
482 return 0; 491 return 0;
483} 492}
484 493
494static inline int ocfs2_supports_discontig_bg(struct ocfs2_super *osb)
495{
496 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)
497 return 1;
498 return 0;
499}
500
485static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb) 501static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb)
486{ 502{
487 if (ocfs2_supports_indexed_dirs(osb)) 503 if (ocfs2_supports_indexed_dirs(osb))
@@ -763,6 +779,12 @@ static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
763 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits); 779 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
764} 780}
765 781
782static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb,
783 unsigned int clusters)
784{
785 return clusters >> (20 - OCFS2_SB(sb)->s_clustersize_bits);
786}
787
766static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap) 788static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
767{ 789{
768 ext2_set_bit(bit, bitmap); 790 ext2_set_bit(bit, bitmap);
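A standalone sketch (not ocfs2 code) of the shift-based conversions above. Note they truncate, so a round trip is only exact for whole multiples; a 16K cluster size (s_clustersize_bits = 14) is assumed.

#include <stdio.h>

int main(void)
{
	unsigned int cbits = 14;			/* 16K clusters */
	unsigned int megs = 256;
	unsigned int clusters = megs << (20 - cbits);	/* 16384 */
	unsigned int back = clusters >> (20 - cbits);	/* 256 again */

	printf("%uM -> %u clusters -> %uM\n", megs, clusters, back);
	return 0;
}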
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index bb37218a7978..33f1c9a8258d 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -100,7 +100,8 @@
100 | OCFS2_FEATURE_INCOMPAT_XATTR \ 100 | OCFS2_FEATURE_INCOMPAT_XATTR \
101 | OCFS2_FEATURE_INCOMPAT_META_ECC \ 101 | OCFS2_FEATURE_INCOMPAT_META_ECC \
102 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ 102 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
103 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE) 103 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
104 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)
104#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ 105#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
105 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ 106 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
106 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) 107 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -165,6 +166,9 @@
165/* Refcount tree support */ 166/* Refcount tree support */
166#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000 167#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000
167 168
169/* Discontiguous block groups */
170#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000
171
168/* 172/*
169 * backup superblock flag is used to indicate that this volume 173 * backup superblock flag is used to indicate that this volume
170 * has backup superblocks. 174 * has backup superblocks.
@@ -283,14 +287,6 @@
283#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) 287#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
284 288
285/* 289/*
286 * Default local alloc size (in megabytes)
287 *
288 * The value chosen should be such that most allocations, including new
289 * block groups, use local alloc.
290 */
291#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8
292
293/*
294 * Inline extended attribute size (in bytes) 290 * Inline extended attribute size (in bytes)
295 * The value chosen should be aligned to 16 byte boundaries. 291 * The value chosen should be aligned to 16 byte boundaries.
296 */ 292 */
@@ -512,7 +508,10 @@ struct ocfs2_extent_block
512 block group */ 508 block group */
513 __le32 h_fs_generation; /* Must match super block */ 509 __le32 h_fs_generation; /* Must match super block */
514 __le64 h_blkno; /* Offset on disk, in blocks */ 510 __le64 h_blkno; /* Offset on disk, in blocks */
515/*20*/ __le64 h_reserved3; 511/*20*/ __le64 h_suballoc_loc; /* Suballocator block group this
512 eb belongs to. Only valid
513 if allocated from a
514 discontiguous block group */
516 __le64 h_next_leaf_blk; /* Offset on disk, in blocks, 515 __le64 h_next_leaf_blk; /* Offset on disk, in blocks,
517 of next leaf header pointing 516 of next leaf header pointing
518 to data */ 517 to data */
@@ -679,7 +678,11 @@ struct ocfs2_dinode {
679/*80*/ struct ocfs2_block_check i_check; /* Error checking */ 678/*80*/ struct ocfs2_block_check i_check; /* Error checking */
680/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */ 679/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */
681/*90*/ __le64 i_refcount_loc; 680/*90*/ __le64 i_refcount_loc;
682 __le64 i_reserved2[4]; 681 __le64 i_suballoc_loc; /* Suballocator block group this
682 inode belongs to. Only valid
683 if allocated from a
684 discontiguous block group */
685/*A0*/ __le64 i_reserved2[3];
683/*B8*/ union { 686/*B8*/ union {
684 __le64 i_pad1; /* Generic way to refer to this 687 __le64 i_pad1; /* Generic way to refer to this
685 64bit union */ 688 64bit union */
@@ -814,7 +817,12 @@ struct ocfs2_dx_root_block {
814 __le32 dr_reserved2; 817 __le32 dr_reserved2;
815 __le64 dr_free_blk; /* Pointer to head of free 818 __le64 dr_free_blk; /* Pointer to head of free
816 * unindexed block list. */ 819 * unindexed block list. */
817 __le64 dr_reserved3[15]; 820 __le64 dr_suballoc_loc; /* Suballocator block group
821 this root belongs to.
822 Only valid if allocated
823 from a discontiguous
824 block group */
825 __le64 dr_reserved3[14];
818 union { 826 union {
819 struct ocfs2_extent_list dr_list; /* Keep this aligned to 128 827 struct ocfs2_extent_list dr_list; /* Keep this aligned to 128
820 * bits for maximum space 828 * bits for maximum space
@@ -840,6 +848,13 @@ struct ocfs2_dx_leaf {
840}; 848};
841 849
842/* 850/*
851 * Largest bitmap for a block (suballocator) group in bytes. This limit
852 * does not affect cluster groups (global allocator). Cluster group
853 * bitmaps run to the end of the block.
854 */
855#define OCFS2_MAX_BG_BITMAP_SIZE 256
856
857/*
843 * On disk allocator group structure for OCFS2 858 * On disk allocator group structure for OCFS2
844 */ 859 */
845struct ocfs2_group_desc 860struct ocfs2_group_desc
@@ -860,7 +875,29 @@ struct ocfs2_group_desc
860 __le64 bg_blkno; /* Offset on disk, in blocks */ 875 __le64 bg_blkno; /* Offset on disk, in blocks */
861/*30*/ struct ocfs2_block_check bg_check; /* Error checking */ 876/*30*/ struct ocfs2_block_check bg_check; /* Error checking */
862 __le64 bg_reserved2; 877 __le64 bg_reserved2;
863/*40*/ __u8 bg_bitmap[0]; 878/*40*/ union {
879 __u8 bg_bitmap[0];
880 struct {
881 /*
882 * Block groups may be discontiguous when
883 * OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG is set.
884 * The extents of a discontiguous block group are
885 * stored in bg_list. It is a flat list.
886 * l_tree_depth must always be zero. A
887 * discontiguous group is signified by a non-zero
888 * bg_list->l_next_free_rec. Only block groups
889 * can be discontiguous; cluster groups cannot.
890 * We've never made a block group with more than
891 * 2048 blocks (256 bytes of bg_bitmap). This
892 * codifies that limit so that we can fit bg_list.
893 * bg_size of a discontiguous block group will
894 * be 256 to match bg_bitmap_filler.
895 */
896 __u8 bg_bitmap_filler[OCFS2_MAX_BG_BITMAP_SIZE];
897/*140*/ struct ocfs2_extent_list bg_list;
898 };
899 };
900/* Actual on-disk size is one block */
864}; 901};
865 902
866struct ocfs2_refcount_rec { 903struct ocfs2_refcount_rec {
@@ -905,7 +942,11 @@ struct ocfs2_refcount_block {
905/*40*/ __le32 rf_generation; /* generation number. all be the same 942/*40*/ __le32 rf_generation; /* generation number. all be the same
906 * for the same refcount tree. */ 943 * for the same refcount tree. */
907 __le32 rf_reserved0; 944 __le32 rf_reserved0;
908 __le64 rf_reserved1[7]; 945 __le64 rf_suballoc_loc; /* Suballocator block group this
946 refcount block belongs to. Only
947 valid if allocated from a
948 discontiguous block group */
949/*50*/ __le64 rf_reserved1[6];
909/*80*/ union { 950/*80*/ union {
910 struct ocfs2_refcount_list rf_records; /* List of refcount 951 struct ocfs2_refcount_list rf_records; /* List of refcount
911 records */ 952 records */
@@ -1017,7 +1058,10 @@ struct ocfs2_xattr_block {
1017 real xattr or a xattr tree. */ 1058 real xattr or a xattr tree. */
1018 __le16 xb_reserved0; 1059 __le16 xb_reserved0;
1019 __le32 xb_reserved1; 1060 __le32 xb_reserved1;
1020 __le64 xb_reserved2; 1061 __le64 xb_suballoc_loc; /* Suballocator block group this
1062 xattr block belongs to. Only
1063 valid if allocated from a
1064 discontiguous block group */
1021/*30*/ union { 1065/*30*/ union {
1022 struct ocfs2_xattr_header xb_header; /* xattr header if this 1066 struct ocfs2_xattr_header xb_header; /* xattr header if this
1023 block contains xattr */ 1067 block contains xattr */
@@ -1254,6 +1298,16 @@ static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
1254 return size / sizeof(struct ocfs2_extent_rec); 1298 return size / sizeof(struct ocfs2_extent_rec);
1255} 1299}
1256 1300
1301static inline u16 ocfs2_extent_recs_per_gd(struct super_block *sb)
1302{
1303 int size;
1304
1305 size = sb->s_blocksize -
1306 offsetof(struct ocfs2_group_desc, bg_list.l_recs);
1307
1308 return size / sizeof(struct ocfs2_extent_rec);
1309}
1310
1257static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb) 1311static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb)
1258{ 1312{
1259 int size; 1313 int size;
@@ -1284,13 +1338,23 @@ static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
1284 return size; 1338 return size;
1285} 1339}
1286 1340
1287static inline int ocfs2_group_bitmap_size(struct super_block *sb) 1341static inline int ocfs2_group_bitmap_size(struct super_block *sb,
1342 int suballocator,
1343 u32 feature_incompat)
1288{ 1344{
1289 int size; 1345 int size = sb->s_blocksize -
1290
1291 size = sb->s_blocksize -
1292 offsetof(struct ocfs2_group_desc, bg_bitmap); 1346 offsetof(struct ocfs2_group_desc, bg_bitmap);
1293 1347
1348 /*
1349 * The cluster allocator uses the entire block. Suballocators have
1350 * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older
1351 * code expects bg_size set to the maximum. Thus we must keep
1352 * bg_size as-is unless discontig_bg is enabled.
1353 */
1354 if (suballocator &&
1355 (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG))
1356 size = OCFS2_MAX_BG_BITMAP_SIZE;
1357
1294 return size; 1358 return size;
1295} 1359}
1296 1360
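A standalone sketch of the resulting bitmap sizes for a 4K block, with the group descriptor offsets from the layout above (illustrative only):

#include <stdio.h>

int main(void)
{
	unsigned int blocksize = 4096;
	unsigned int bg_bitmap_off = 0x40;	/* start of bg_bitmap */
	unsigned int cluster_bytes = blocksize - bg_bitmap_off;
	unsigned int suballoc_bytes = 256;	/* OCFS2_MAX_BG_BITMAP_SIZE */

	/* 32256 bits for cluster groups, 2048 for discontig suballocators */
	printf("cluster group: %u bits, suballocator: %u bits\n",
	       cluster_bytes * 8, suballoc_bytes * 8);
	return 0;
}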
@@ -1402,23 +1466,43 @@ static inline int ocfs2_extent_recs_per_eb(int blocksize)
1402 return size / sizeof(struct ocfs2_extent_rec); 1466 return size / sizeof(struct ocfs2_extent_rec);
1403} 1467}
1404 1468
1405static inline int ocfs2_local_alloc_size(int blocksize) 1469static inline int ocfs2_extent_recs_per_gd(int blocksize)
1406{ 1470{
1407 int size; 1471 int size;
1408 1472
1409 size = blocksize - 1473 size = blocksize -
1410 offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); 1474 offsetof(struct ocfs2_group_desc, bg_list.l_recs);
1411 1475
1412 return size; 1476 return size / sizeof(struct ocfs2_extent_rec);
1413} 1477}
1414 1478
1415static inline int ocfs2_group_bitmap_size(int blocksize) 1479static inline int ocfs2_local_alloc_size(int blocksize)
1416{ 1480{
1417 int size; 1481 int size;
1418 1482
1419 size = blocksize - 1483 size = blocksize -
1484 offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
1485
1486 return size;
1487}
1488
1489static inline int ocfs2_group_bitmap_size(int blocksize,
1490 int suballocator,
1491 uint32_t feature_incompat)
1492{
1493 int size = blocksize -
1420 offsetof(struct ocfs2_group_desc, bg_bitmap); 1494 offsetof(struct ocfs2_group_desc, bg_bitmap);
1421 1495
1496 /*
1497 * The cluster allocator uses the entire block. Suballocators have
1498 * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older
1499 * code expects bg_size set to the maximum. Thus we must keep
1500 * bg_size as-is unless discontig_bg is enabled.
1501 */
1502 if (suballocator &&
1503 (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG))
1504 size = OCFS2_MAX_BG_BITMAP_SIZE;
1505
1422 return size; 1506 return size;
1423} 1507}
1424 1508
@@ -1491,5 +1575,19 @@ static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de,
1491 de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; 1575 de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
1492} 1576}
1493 1577
1578static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd)
1579{
1580 if ((offsetof(struct ocfs2_group_desc, bg_bitmap) +
1581 le16_to_cpu(gd->bg_size)) !=
1582 offsetof(struct ocfs2_group_desc, bg_list))
1583 return 0;
1584 /*
1585 * Only valid to check l_next_free_rec if
1586 * bg_bitmap + bg_size == bg_list.
1587 */
1588 if (!gd->bg_list.l_next_free_rec)
1589 return 0;
1590 return 1;
1591}
1494#endif /* _OCFS2_FS_H */ 1592#endif /* _OCFS2_FS_H */
1495 1593
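
For context, ocfs2_gd_is_discontig() keys off the layout change above: in a discontiguous group descriptor the bitmap is capped so that bg_bitmap + bg_size lands exactly at bg_list, and the extent records in bg_list then describe the group's fragments. A sketch of how a reader might walk such a group (ocfs2_gd_is_discontig() is from the patch; the helper around it is illustrative only):

	static void example_walk_group_fragments(struct ocfs2_group_desc *gd)
	{
		int i;

		if (!ocfs2_gd_is_discontig(gd))
			return;	/* contiguous: clusters simply follow bg_blkno */

		for (i = 0; i < le16_to_cpu(gd->bg_list.l_next_free_rec); i++) {
			struct ocfs2_extent_rec *rec = &gd->bg_list.l_recs[i];
			/* Each record covers one contiguous fragment of the group. */
			(void)rec;
		}
	}
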
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 123bc520a2c0..196fcb52d95d 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -23,6 +23,7 @@
23struct ocfs2_dquot { 23struct ocfs2_dquot {
24 struct dquot dq_dquot; /* Generic VFS dquot */ 24 struct dquot dq_dquot; /* Generic VFS dquot */
25 loff_t dq_local_off; /* Offset in the local quota file */ 25 loff_t dq_local_off; /* Offset in the local quota file */
26 u64 dq_local_phys_blk; /* Physical block carrying quota structure */
26 struct ocfs2_quota_chunk *dq_chunk; /* Chunk dquot is in */ 27 struct ocfs2_quota_chunk *dq_chunk; /* Chunk dquot is in */
27 unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */ 28 unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */
28 s64 dq_origspace; /* Last globally synced space usage */ 29 s64 dq_origspace; /* Last globally synced space usage */
@@ -51,8 +52,9 @@ struct ocfs2_mem_dqinfo {
51 struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */ 52 struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */
52 struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained */ 53 struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained */
53 int dqi_gqi_count; /* Number of holders of dqi_gqi_bh */ 54 int dqi_gqi_count; /* Number of holders of dqi_gqi_bh */
55 u64 dqi_giblk; /* Block number of the global information header */
54 struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */ 56 struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */
55 struct buffer_head *dqi_ibh; /* Buffer with information header */ 57 struct buffer_head *dqi_libh; /* Buffer with local information header */
56 struct qtree_mem_dqinfo dqi_gi; /* Info about global file */ 58 struct qtree_mem_dqinfo dqi_gi; /* Info about global file */
57 struct delayed_work dqi_sync_work; /* Work for syncing dquots */ 59 struct delayed_work dqi_sync_work; /* Work for syncing dquots */
58 struct ocfs2_quota_recovery *dqi_rec; /* Pointer to recovery 60 struct ocfs2_quota_recovery *dqi_rec; /* Pointer to recovery
@@ -102,8 +104,12 @@ static inline int ocfs2_global_release_dquot(struct dquot *dquot)
102 104
103int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex); 105int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
104void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex); 106void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
105int ocfs2_read_quota_block(struct inode *inode, u64 v_block, 107int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh);
106 struct buffer_head **bh); 108int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
109 struct buffer_head **bh);
110int ocfs2_create_local_dquot(struct dquot *dquot);
111int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot);
112int ocfs2_local_write_dquot(struct dquot *dquot);
107 113
108extern const struct dquot_operations ocfs2_quota_operations; 114extern const struct dquot_operations ocfs2_quota_operations;
109extern struct quota_format_type ocfs2_quota_format; 115extern struct quota_format_type ocfs2_quota_format;
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index ab42a74c7539..2bb35fe00511 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -25,8 +25,44 @@
25#include "dlmglue.h" 25#include "dlmglue.h"
26#include "uptodate.h" 26#include "uptodate.h"
27#include "super.h" 27#include "super.h"
28#include "buffer_head_io.h"
28#include "quota.h" 29#include "quota.h"
29 30
31/*
32 * Locking of quotas with OCFS2 is rather complex. Here are rules that
33 * should be obeyed by all the functions:
34 * - any write of quota structure (either to local or global file) is protected
35 * by dqio_mutex or dquot->dq_lock.
36 * - any modification of global quota file holds inode cluster lock, i_mutex,
37 * and ip_alloc_sem of the global quota file (achieved by
38 * ocfs2_lock_global_qf). It also has to hold qinfo_lock.
39 * - an allocation of new blocks for local quota file is protected by
40 * its ip_alloc_sem
41 *
42 * A rough sketch of locking dependencies (lf = local file, gf = global file):
43 * Normal filesystem operation:
44 * start_trans -> dqio_mutex -> write to lf
45 * Syncing of local and global file:
46 * ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock ->
47 * write to gf
48 * -> write to lf
49 * Acquire dquot for the first time:
50 * dq_lock -> ocfs2_lock_global_qf -> qinfo_lock -> read from gf
51 * -> alloc space for gf
52 * -> start_trans -> qinfo_lock -> write to gf
53 * -> ip_alloc_sem of lf -> alloc space for lf
54 * -> write to lf
55 * Release last reference to dquot:
56 * dq_lock -> ocfs2_lock_global_qf -> start_trans -> qinfo_lock -> write to gf
57 * -> write to lf
58 * Note that all the above operations also hold the inode cluster lock of lf.
59 * Recovery:
60 * inode cluster lock of recovered lf
61 * -> read bitmaps -> ip_alloc_sem of lf
62 * -> ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock ->
63 * write to gf
64 */
65
30static struct workqueue_struct *ocfs2_quota_wq = NULL; 66static struct workqueue_struct *ocfs2_quota_wq = NULL;
31 67
32static void qsync_work_fn(struct work_struct *work); 68static void qsync_work_fn(struct work_struct *work);
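
The first-acquire path of the new ocfs2_acquire_dquot() (later in this patch) is the clearest instance of the ordering documented above. A condensed sketch, error handling elided (illustrative, not verbatim):

	mutex_lock(&dquot->dq_lock);			/* dq_lock first */
	ocfs2_lock_global_qf(info, 1);			/* gf cluster lock, i_mutex,
							   ip_alloc_sem */
	ocfs2_qinfo_lock(info, 0);			/* qinfo_lock */
	qtree_read_dquot(&info->dqi_gi, dquot);		/* read from gf */
	ocfs2_qinfo_unlock(info, 0);
	/* ... alloc space for gf, start_trans, qinfo_lock, write to gf ... */
	ocfs2_unlock_global_qf(info, 1);
	mutex_unlock(&dquot->dq_lock);
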
@@ -91,8 +127,7 @@ struct qtree_fmt_operations ocfs2_global_ops = {
91 .is_id = ocfs2_global_is_id, 127 .is_id = ocfs2_global_is_id,
92}; 128};
93 129
94static int ocfs2_validate_quota_block(struct super_block *sb, 130int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh)
95 struct buffer_head *bh)
96{ 131{
97 struct ocfs2_disk_dqtrailer *dqt = 132 struct ocfs2_disk_dqtrailer *dqt =
98 ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data); 133 ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data);
@@ -110,54 +145,19 @@ static int ocfs2_validate_quota_block(struct super_block *sb,
110 return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check); 145 return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check);
111} 146}
112 147
113int ocfs2_read_quota_block(struct inode *inode, u64 v_block, 148int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
114 struct buffer_head **bh) 149 struct buffer_head **bhp)
115{ 150{
116 int rc = 0; 151 int rc;
117 struct buffer_head *tmp = *bh; 152
118 153 *bhp = NULL;
119 if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) { 154 rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, 1, bhp, 0,
120 ocfs2_error(inode->i_sb, 155 ocfs2_validate_quota_block);
121 "Quota file %llu is probably corrupted! Requested "
122 "to read block %Lu but file has size only %Lu\n",
123 (unsigned long long)OCFS2_I(inode)->ip_blkno,
124 (unsigned long long)v_block,
125 (unsigned long long)i_size_read(inode));
126 return -EIO;
127 }
128 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
129 ocfs2_validate_quota_block);
130 if (rc) 156 if (rc)
131 mlog_errno(rc); 157 mlog_errno(rc);
132
133 /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
134 if (!rc && !*bh)
135 *bh = tmp;
136
137 return rc; 158 return rc;
138} 159}
139 160
140static int ocfs2_get_quota_block(struct inode *inode, int block,
141 struct buffer_head **bh)
142{
143 u64 pblock, pcount;
144 int err;
145
146 down_read(&OCFS2_I(inode)->ip_alloc_sem);
147 err = ocfs2_extent_map_get_blocks(inode, block, &pblock, &pcount, NULL);
148 up_read(&OCFS2_I(inode)->ip_alloc_sem);
149 if (err) {
150 mlog_errno(err);
151 return err;
152 }
153 *bh = sb_getblk(inode->i_sb, pblock);
154 if (!*bh) {
155 err = -EIO;
156 mlog_errno(err);
157 }
158 return err;
159}
160
161/* Read data from global quotafile - avoid pagecache and such because we cannot 161/* Read data from global quotafile - avoid pagecache and such because we cannot
162 * afford acquiring the locks... We use quota cluster lock to serialize 162 * afford acquiring the locks... We use quota cluster lock to serialize
163 * operations. Caller is responsible for acquiring it. */ 163 * operations. Caller is responsible for acquiring it. */
@@ -172,6 +172,7 @@ ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
172 int err = 0; 172 int err = 0;
173 struct buffer_head *bh; 173 struct buffer_head *bh;
174 size_t toread, tocopy; 174 size_t toread, tocopy;
175 u64 pblock = 0, pcount = 0;
175 176
176 if (off > i_size) 177 if (off > i_size)
177 return 0; 178 return 0;
@@ -180,8 +181,19 @@ ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
180 toread = len; 181 toread = len;
181 while (toread > 0) { 182 while (toread > 0) {
182 tocopy = min_t(size_t, (sb->s_blocksize - offset), toread); 183 tocopy = min_t(size_t, (sb->s_blocksize - offset), toread);
184 if (!pcount) {
185 err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock,
186 &pcount, NULL);
187 if (err) {
188 mlog_errno(err);
189 return err;
190 }
191 } else {
192 pcount--;
193 pblock++;
194 }
183 bh = NULL; 195 bh = NULL;
184 err = ocfs2_read_quota_block(gqinode, blk, &bh); 196 err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh);
185 if (err) { 197 if (err) {
186 mlog_errno(err); 198 mlog_errno(err);
187 return err; 199 return err;
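
With the switch to physical reads, the loop caches the extent-map lookup: one ocfs2_extent_map_get_blocks() call returns a run of pcount contiguous physical blocks starting at pblock, and later iterations just step through the run instead of re-mapping every block. The idiom, reduced to its core (sketch only):

	if (!pcount) {		/* run exhausted: map the next extent */
		err = ocfs2_extent_map_get_blocks(inode, blk, &pblock,
						  &pcount, NULL);
	} else {		/* still inside the contiguous run */
		pcount--;
		pblock++;
	}
	/* read physical block pblock, then blk++ for the next iteration */
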
@@ -209,6 +221,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
209 int err = 0, new = 0, ja_type; 221 int err = 0, new = 0, ja_type;
210 struct buffer_head *bh = NULL; 222 struct buffer_head *bh = NULL;
211 handle_t *handle = journal_current_handle(); 223 handle_t *handle = journal_current_handle();
224 u64 pblock, pcount;
212 225
213 if (!handle) { 226 if (!handle) {
214 mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled " 227 mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled "
@@ -221,12 +234,11 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
221 len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset; 234 len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset;
222 } 235 }
223 236
224 mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA);
225 if (gqinode->i_size < off + len) { 237 if (gqinode->i_size < off + len) {
226 loff_t rounded_end = 238 loff_t rounded_end =
227 ocfs2_align_bytes_to_blocks(sb, off + len); 239 ocfs2_align_bytes_to_blocks(sb, off + len);
228 240
229 /* Space is already allocated in ocfs2_global_read_dquot() */ 241 /* Space is already allocated in ocfs2_acquire_dquot() */
230 err = ocfs2_simple_size_update(gqinode, 242 err = ocfs2_simple_size_update(gqinode,
231 oinfo->dqi_gqi_bh, 243 oinfo->dqi_gqi_bh,
232 rounded_end); 244 rounded_end);
@@ -234,13 +246,20 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
234 goto out; 246 goto out;
235 new = 1; 247 new = 1;
236 } 248 }
249 err = ocfs2_extent_map_get_blocks(gqinode, blk, &pblock, &pcount, NULL);
250 if (err) {
251 mlog_errno(err);
252 goto out;
253 }
237 /* Not rewriting whole block? */ 254 /* Not rewriting whole block? */
238 if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) && 255 if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) &&
239 !new) { 256 !new) {
240 err = ocfs2_read_quota_block(gqinode, blk, &bh); 257 err = ocfs2_read_quota_phys_block(gqinode, pblock, &bh);
241 ja_type = OCFS2_JOURNAL_ACCESS_WRITE; 258 ja_type = OCFS2_JOURNAL_ACCESS_WRITE;
242 } else { 259 } else {
243 err = ocfs2_get_quota_block(gqinode, blk, &bh); 260 bh = sb_getblk(sb, pblock);
261 if (!bh)
262 err = -ENOMEM;
244 ja_type = OCFS2_JOURNAL_ACCESS_CREATE; 263 ja_type = OCFS2_JOURNAL_ACCESS_CREATE;
245 } 264 }
246 if (err) { 265 if (err) {
@@ -261,19 +280,15 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
261 brelse(bh); 280 brelse(bh);
262 goto out; 281 goto out;
263 } 282 }
264 err = ocfs2_journal_dirty(handle, bh); 283 ocfs2_journal_dirty(handle, bh);
265 brelse(bh); 284 brelse(bh);
266 if (err < 0)
267 goto out;
268out: 285out:
269 if (err) { 286 if (err) {
270 mutex_unlock(&gqinode->i_mutex);
271 mlog_errno(err); 287 mlog_errno(err);
272 return err; 288 return err;
273 } 289 }
274 gqinode->i_version++; 290 gqinode->i_version++;
275 ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh); 291 ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh);
276 mutex_unlock(&gqinode->i_mutex);
277 return len; 292 return len;
278} 293}
279 294
@@ -291,11 +306,23 @@ int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
291 else 306 else
292 WARN_ON(bh != oinfo->dqi_gqi_bh); 307 WARN_ON(bh != oinfo->dqi_gqi_bh);
293 spin_unlock(&dq_data_lock); 308 spin_unlock(&dq_data_lock);
309 if (ex) {
310 mutex_lock(&oinfo->dqi_gqinode->i_mutex);
311 down_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
312 } else {
313 down_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
314 }
294 return 0; 315 return 0;
295} 316}
296 317
297void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) 318void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
298{ 319{
320 if (ex) {
321 up_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
322 mutex_unlock(&oinfo->dqi_gqinode->i_mutex);
323 } else {
324 up_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
325 }
299 ocfs2_inode_unlock(oinfo->dqi_gqinode, ex); 326 ocfs2_inode_unlock(oinfo->dqi_gqinode, ex);
300 brelse(oinfo->dqi_gqi_bh); 327 brelse(oinfo->dqi_gqi_bh);
301 spin_lock(&dq_data_lock); 328 spin_lock(&dq_data_lock);
@@ -313,6 +340,7 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
313 struct ocfs2_global_disk_dqinfo dinfo; 340 struct ocfs2_global_disk_dqinfo dinfo;
314 struct mem_dqinfo *info = sb_dqinfo(sb, type); 341 struct mem_dqinfo *info = sb_dqinfo(sb, type);
315 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; 342 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
343 u64 pcount;
316 int status; 344 int status;
317 345
318 mlog_entry_void(); 346 mlog_entry_void();
@@ -339,9 +367,19 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
339 mlog_errno(status); 367 mlog_errno(status);
340 goto out_err; 368 goto out_err;
341 } 369 }
370
371 status = ocfs2_extent_map_get_blocks(gqinode, 0, &oinfo->dqi_giblk,
372 &pcount, NULL);
373 if (status < 0)
374 goto out_unlock;
375
376 status = ocfs2_qinfo_lock(oinfo, 0);
377 if (status < 0)
378 goto out_unlock;
342 status = sb->s_op->quota_read(sb, type, (char *)&dinfo, 379 status = sb->s_op->quota_read(sb, type, (char *)&dinfo,
343 sizeof(struct ocfs2_global_disk_dqinfo), 380 sizeof(struct ocfs2_global_disk_dqinfo),
344 OCFS2_GLOBAL_INFO_OFF); 381 OCFS2_GLOBAL_INFO_OFF);
382 ocfs2_qinfo_unlock(oinfo, 0);
345 ocfs2_unlock_global_qf(oinfo, 0); 383 ocfs2_unlock_global_qf(oinfo, 0);
346 if (status != sizeof(struct ocfs2_global_disk_dqinfo)) { 384 if (status != sizeof(struct ocfs2_global_disk_dqinfo)) {
347 mlog(ML_ERROR, "Cannot read global quota info (%d).\n", 385 mlog(ML_ERROR, "Cannot read global quota info (%d).\n",
@@ -368,6 +406,10 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
368out_err: 406out_err:
369 mlog_exit(status); 407 mlog_exit(status);
370 return status; 408 return status;
409out_unlock:
410 ocfs2_unlock_global_qf(oinfo, 0);
411 mlog_errno(status);
412 goto out_err;
371} 413}
372 414
373/* Write information to global quota file. Expects exclusive lock on quota 415
@@ -426,78 +468,10 @@ static int ocfs2_global_qinit_alloc(struct super_block *sb, int type)
426 468
427static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type) 469static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type)
428{ 470{
429 /* We modify all the allocated blocks, tree root, and info block */ 471 /* We modify all the allocated blocks, tree root, info block and
472 * the inode */
430 return (ocfs2_global_qinit_alloc(sb, type) + 2) * 473 return (ocfs2_global_qinit_alloc(sb, type) + 2) *
431 OCFS2_QUOTA_BLOCK_UPDATE_CREDITS; 474 OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + 1;
432}
433
434/* Read in information from global quota file and acquire a reference to it.
435 * dquot_acquire() has already started the transaction and locked quota file */
436int ocfs2_global_read_dquot(struct dquot *dquot)
437{
438 int err, err2, ex = 0;
439 struct super_block *sb = dquot->dq_sb;
440 int type = dquot->dq_type;
441 struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
442 struct ocfs2_super *osb = OCFS2_SB(sb);
443 struct inode *gqinode = info->dqi_gqinode;
444 int need_alloc = ocfs2_global_qinit_alloc(sb, type);
445 handle_t *handle = NULL;
446
447 err = ocfs2_qinfo_lock(info, 0);
448 if (err < 0)
449 goto out;
450 err = qtree_read_dquot(&info->dqi_gi, dquot);
451 if (err < 0)
452 goto out_qlock;
453 OCFS2_DQUOT(dquot)->dq_use_count++;
454 OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
455 OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
456 ocfs2_qinfo_unlock(info, 0);
457
458 if (!dquot->dq_off) { /* No real quota entry? */
459 ex = 1;
460 /*
461 * Add blocks to quota file before we start a transaction since
462 * locking allocators ranks above a transaction start
463 */
464 WARN_ON(journal_current_handle());
465 down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
466 err = ocfs2_extend_no_holes(gqinode,
467 gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
468 gqinode->i_size);
469 up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
470 if (err < 0)
471 goto out;
472 }
473
474 handle = ocfs2_start_trans(osb,
475 ocfs2_calc_global_qinit_credits(sb, type));
476 if (IS_ERR(handle)) {
477 err = PTR_ERR(handle);
478 goto out;
479 }
480 err = ocfs2_qinfo_lock(info, ex);
481 if (err < 0)
482 goto out_trans;
483 err = qtree_write_dquot(&info->dqi_gi, dquot);
484 if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) {
485 err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type);
486 if (!err)
487 err = err2;
488 }
489out_qlock:
490 if (ex)
491 ocfs2_qinfo_unlock(info, 1);
492 else
493 ocfs2_qinfo_unlock(info, 0);
494out_trans:
495 if (handle)
496 ocfs2_commit_trans(osb, handle);
497out:
498 if (err < 0)
499 mlog_errno(err);
500 return err;
501} 475}
502 476
503/* Sync local information about quota modifications with global quota file. 477/* Sync local information about quota modifications with global quota file.
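
Unpacking the new credit count (symbolic, since OCFS2_QUOTA_BLOCK_UPDATE_CREDITS itself is defined elsewhere): if ocfs2_global_qinit_alloc() returns N, the handle reserves

	(N + 2) * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS	/* N new blocks plus the
							   tree root and info block */
	+ 1						/* the inode itself */

which is exactly what the updated comment now counts.
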
@@ -638,14 +612,13 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
638 } 612 }
639 mutex_lock(&sb_dqopt(sb)->dqio_mutex); 613 mutex_lock(&sb_dqopt(sb)->dqio_mutex);
640 status = ocfs2_sync_dquot(dquot); 614 status = ocfs2_sync_dquot(dquot);
641 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
642 if (status < 0) 615 if (status < 0)
643 mlog_errno(status); 616 mlog_errno(status);
644 /* We have to write local structure as well... */ 617 /* We have to write local structure as well... */
645 dquot_mark_dquot_dirty(dquot); 618 status = ocfs2_local_write_dquot(dquot);
646 status = dquot_commit(dquot);
647 if (status < 0) 619 if (status < 0)
648 mlog_errno(status); 620 mlog_errno(status);
621 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
649 ocfs2_commit_trans(osb, handle); 622 ocfs2_commit_trans(osb, handle);
650out_ilock: 623out_ilock:
651 ocfs2_unlock_global_qf(oinfo, 1); 624 ocfs2_unlock_global_qf(oinfo, 1);
@@ -684,7 +657,9 @@ static int ocfs2_write_dquot(struct dquot *dquot)
684 mlog_errno(status); 657 mlog_errno(status);
685 goto out; 658 goto out;
686 } 659 }
687 status = dquot_commit(dquot); 660 mutex_lock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
661 status = ocfs2_local_write_dquot(dquot);
662 mutex_unlock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
688 ocfs2_commit_trans(osb, handle); 663 ocfs2_commit_trans(osb, handle);
689out: 664out:
690 mlog_exit(status); 665 mlog_exit(status);
@@ -715,6 +690,10 @@ static int ocfs2_release_dquot(struct dquot *dquot)
715 690
716 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); 691 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
717 692
693 mutex_lock(&dquot->dq_lock);
694 /* Check whether we are not racing with some other dqget() */
695 if (atomic_read(&dquot->dq_count) > 1)
696 goto out;
718 status = ocfs2_lock_global_qf(oinfo, 1); 697 status = ocfs2_lock_global_qf(oinfo, 1);
719 if (status < 0) 698 if (status < 0)
720 goto out; 699 goto out;
@@ -725,30 +704,113 @@ static int ocfs2_release_dquot(struct dquot *dquot)
725 mlog_errno(status); 704 mlog_errno(status);
726 goto out_ilock; 705 goto out_ilock;
727 } 706 }
728 status = dquot_release(dquot); 707
708 status = ocfs2_global_release_dquot(dquot);
709 if (status < 0) {
710 mlog_errno(status);
711 goto out_trans;
712 }
713 status = ocfs2_local_release_dquot(handle, dquot);
714 /*
715 * If we fail here, we cannot do much as global structure is
716 * already released. So just complain...
717 */
718 if (status < 0)
719 mlog_errno(status);
720 clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
721out_trans:
729 ocfs2_commit_trans(osb, handle); 722 ocfs2_commit_trans(osb, handle);
730out_ilock: 723out_ilock:
731 ocfs2_unlock_global_qf(oinfo, 1); 724 ocfs2_unlock_global_qf(oinfo, 1);
732out: 725out:
726 mutex_unlock(&dquot->dq_lock);
733 mlog_exit(status); 727 mlog_exit(status);
734 return status; 728 return status;
735} 729}
736 730
731/*
732 * Read global dquot structure from disk or create it if it does
733 * not exist. Also update use count of the global structure and
734 * create structure in node-local quota file.
735 */
737static int ocfs2_acquire_dquot(struct dquot *dquot) 736static int ocfs2_acquire_dquot(struct dquot *dquot)
738{ 737{
739 struct ocfs2_mem_dqinfo *oinfo = 738 int status = 0, err;
740 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; 739 int ex = 0;
741 int status = 0; 740 struct super_block *sb = dquot->dq_sb;
741 struct ocfs2_super *osb = OCFS2_SB(sb);
742 int type = dquot->dq_type;
743 struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
744 struct inode *gqinode = info->dqi_gqinode;
745 int need_alloc = ocfs2_global_qinit_alloc(sb, type);
746 handle_t *handle;
742 747
743 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); 748 mlog_entry("id=%u, type=%d", dquot->dq_id, type);
744 /* We need an exclusive lock, because we're going to update use count 749 mutex_lock(&dquot->dq_lock);
745 * and instantiate possibly new dquot structure */ 750 /*
746 status = ocfs2_lock_global_qf(oinfo, 1); 751 * We need an exclusive lock, because we're going to update use count
752 * and instantiate possibly new dquot structure
753 */
754 status = ocfs2_lock_global_qf(info, 1);
747 if (status < 0) 755 if (status < 0)
748 goto out; 756 goto out;
749 status = dquot_acquire(dquot); 757 if (!test_bit(DQ_READ_B, &dquot->dq_flags)) {
750 ocfs2_unlock_global_qf(oinfo, 1); 758 status = ocfs2_qinfo_lock(info, 0);
759 if (status < 0)
760 goto out_dq;
761 status = qtree_read_dquot(&info->dqi_gi, dquot);
762 ocfs2_qinfo_unlock(info, 0);
763 if (status < 0)
764 goto out_dq;
765 }
766 set_bit(DQ_READ_B, &dquot->dq_flags);
767
768 OCFS2_DQUOT(dquot)->dq_use_count++;
769 OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
770 OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
771 if (!dquot->dq_off) { /* No real quota entry? */
772 ex = 1;
773 /*
774 * Add blocks to quota file before we start a transaction since
775 * locking allocators ranks above a transaction start
776 */
777 WARN_ON(journal_current_handle());
778 status = ocfs2_extend_no_holes(gqinode,
779 gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
780 gqinode->i_size);
781 if (status < 0)
782 goto out_dq;
783 }
784
785 handle = ocfs2_start_trans(osb,
786 ocfs2_calc_global_qinit_credits(sb, type));
787 if (IS_ERR(handle)) {
788 status = PTR_ERR(handle);
789 goto out_dq;
790 }
791 status = ocfs2_qinfo_lock(info, ex);
792 if (status < 0)
793 goto out_trans;
794 status = qtree_write_dquot(&info->dqi_gi, dquot);
795 if (ex && info_dirty(sb_dqinfo(sb, type))) {
796 err = __ocfs2_global_write_info(sb, type);
797 if (!status)
798 status = err;
799 }
800 ocfs2_qinfo_unlock(info, ex);
801out_trans:
802 ocfs2_commit_trans(osb, handle);
803out_dq:
804 ocfs2_unlock_global_qf(info, 1);
805 if (status < 0)
806 goto out;
807
808 status = ocfs2_create_local_dquot(dquot);
809 if (status < 0)
810 goto out;
811 set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
751out: 812out:
813 mutex_unlock(&dquot->dq_lock);
752 mlog_exit(status); 814 mlog_exit(status);
753 return status; 815 return status;
754} 816}
@@ -770,7 +832,6 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
770 struct ocfs2_super *osb = OCFS2_SB(sb); 832 struct ocfs2_super *osb = OCFS2_SB(sb);
771 833
772 mlog_entry("id=%u, type=%d", dquot->dq_id, type); 834 mlog_entry("id=%u, type=%d", dquot->dq_id, type);
773 dquot_mark_dquot_dirty(dquot);
774 835
775 /* In case user set some limits, sync dquot immediately to global 836 /* In case user set some limits, sync dquot immediately to global
776 * quota file so that information propagates quicker */ 837 * quota file so that information propagates quicker */
@@ -793,14 +854,16 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
793 mlog_errno(status); 854 mlog_errno(status);
794 goto out_ilock; 855 goto out_ilock;
795 } 856 }
857 mutex_lock(&sb_dqopt(sb)->dqio_mutex);
796 status = ocfs2_sync_dquot(dquot); 858 status = ocfs2_sync_dquot(dquot);
797 if (status < 0) { 859 if (status < 0) {
798 mlog_errno(status); 860 mlog_errno(status);
799 goto out_trans; 861 goto out_dlock;
800 } 862 }
801 /* Now write updated local dquot structure */ 863 /* Now write updated local dquot structure */
802 status = dquot_commit(dquot); 864 status = ocfs2_local_write_dquot(dquot);
803out_trans: 865out_dlock:
866 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
804 ocfs2_commit_trans(osb, handle); 867 ocfs2_commit_trans(osb, handle);
805out_ilock: 868out_ilock:
806 ocfs2_unlock_global_qf(oinfo, 1); 869 ocfs2_unlock_global_qf(oinfo, 1);
@@ -852,7 +915,7 @@ static void ocfs2_destroy_dquot(struct dquot *dquot)
852} 915}
853 916
854const struct dquot_operations ocfs2_quota_operations = { 917const struct dquot_operations ocfs2_quota_operations = {
855 .write_dquot = ocfs2_write_dquot, 918 /* We never make dquot dirty so .write_dquot is never called */
856 .acquire_dquot = ocfs2_acquire_dquot, 919 .acquire_dquot = ocfs2_acquire_dquot,
857 .release_dquot = ocfs2_release_dquot, 920 .release_dquot = ocfs2_release_dquot,
858 .mark_dirty = ocfs2_mark_dquot_dirty, 921 .mark_dirty = ocfs2_mark_dquot_dirty,
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 9ad49305f450..8bd70d4d184d 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -22,6 +22,7 @@
22#include "dlmglue.h" 22#include "dlmglue.h"
23#include "quota.h" 23#include "quota.h"
24#include "uptodate.h" 24#include "uptodate.h"
25#include "super.h"
25 26
26/* Number of local quota structures per block */ 27/* Number of local quota structures per block */
27static inline unsigned int ol_quota_entries_per_block(struct super_block *sb) 28static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
@@ -119,12 +120,8 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
119 lock_buffer(bh); 120 lock_buffer(bh);
120 modify(bh, private); 121 modify(bh, private);
121 unlock_buffer(bh); 122 unlock_buffer(bh);
122 status = ocfs2_journal_dirty(handle, bh); 123 ocfs2_journal_dirty(handle, bh);
123 if (status < 0) { 124
124 mlog_errno(status);
125 ocfs2_commit_trans(OCFS2_SB(sb), handle);
126 return status;
127 }
128 status = ocfs2_commit_trans(OCFS2_SB(sb), handle); 125 status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
129 if (status < 0) { 126 if (status < 0) {
130 mlog_errno(status); 127 mlog_errno(status);
@@ -133,6 +130,39 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
133 return 0; 130 return 0;
134} 131}
135 132
133/*
134 * Read quota block from a given logical offset.
135 *
136 * This function acquires ip_alloc_sem and thus it must not be called with a
137 * transaction started.
138 */
139static int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
140 struct buffer_head **bh)
141{
142 int rc = 0;
143 struct buffer_head *tmp = *bh;
144
145 if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
146 ocfs2_error(inode->i_sb,
147 "Quota file %llu is probably corrupted! Requested "
148 "to read block %Lu but file has size only %Lu\n",
149 (unsigned long long)OCFS2_I(inode)->ip_blkno,
150 (unsigned long long)v_block,
151 (unsigned long long)i_size_read(inode));
152 return -EIO;
153 }
154 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
155 ocfs2_validate_quota_block);
156 if (rc)
157 mlog_errno(rc);
158
159 /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
160 if (!rc && !*bh)
161 *bh = tmp;
162
163 return rc;
164}
165
136/* Check whether we understand format of quota files */ 166/* Check whether we understand format of quota files */
137static int ocfs2_local_check_quota_file(struct super_block *sb, int type) 167static int ocfs2_local_check_quota_file(struct super_block *sb, int type)
138{ 168{
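
The constraint in the new comment matters for ordering: ip_alloc_sem ranks above a transaction start, so callers must finish any reads through this helper before opening a handle. The safe ordering, sketched (osb and credits are placeholders):

	status = ocfs2_read_quota_block(lqinode, v_block, &bh);	/* may take ip_alloc_sem */
	if (status)
		goto out;
	handle = ocfs2_start_trans(osb, credits);		/* only afterwards */
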
@@ -523,9 +553,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
523 ocfs2_clear_bit(bit, dchunk->dqc_bitmap); 553 ocfs2_clear_bit(bit, dchunk->dqc_bitmap);
524 le32_add_cpu(&dchunk->dqc_free, 1); 554 le32_add_cpu(&dchunk->dqc_free, 1);
525 unlock_buffer(qbh); 555 unlock_buffer(qbh);
526 status = ocfs2_journal_dirty(handle, qbh); 556 ocfs2_journal_dirty(handle, qbh);
527 if (status < 0)
528 mlog_errno(status);
529out_commit: 557out_commit:
530 mutex_unlock(&sb_dqopt(sb)->dqio_mutex); 558 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
531 ocfs2_commit_trans(OCFS2_SB(sb), handle); 559 ocfs2_commit_trans(OCFS2_SB(sb), handle);
@@ -631,9 +659,7 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
631 lock_buffer(bh); 659 lock_buffer(bh);
632 ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN); 660 ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN);
633 unlock_buffer(bh); 661 unlock_buffer(bh);
634 status = ocfs2_journal_dirty(handle, bh); 662 ocfs2_journal_dirty(handle, bh);
635 if (status < 0)
636 mlog_errno(status);
637out_trans: 663out_trans:
638 ocfs2_commit_trans(osb, handle); 664 ocfs2_commit_trans(osb, handle);
639out_bh: 665out_bh:
@@ -679,7 +705,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
679 INIT_LIST_HEAD(&oinfo->dqi_chunk); 705 INIT_LIST_HEAD(&oinfo->dqi_chunk);
680 oinfo->dqi_rec = NULL; 706 oinfo->dqi_rec = NULL;
681 oinfo->dqi_lqi_bh = NULL; 707 oinfo->dqi_lqi_bh = NULL;
682 oinfo->dqi_ibh = NULL; 708 oinfo->dqi_libh = NULL;
683 709
684 status = ocfs2_global_read_info(sb, type); 710 status = ocfs2_global_read_info(sb, type);
685 if (status < 0) 711 if (status < 0)
@@ -705,7 +731,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
705 info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags); 731 info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
706 oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks); 732 oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks);
707 oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks); 733 oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks);
708 oinfo->dqi_ibh = bh; 734 oinfo->dqi_libh = bh;
709 735
710 /* We crashed when using local quota file? */ 736 /* We crashed when using local quota file? */
711 if (!(info->dqi_flags & OLQF_CLEAN)) { 737 if (!(info->dqi_flags & OLQF_CLEAN)) {
@@ -767,7 +793,7 @@ static int ocfs2_local_write_info(struct super_block *sb, int type)
767{ 793{
768 struct mem_dqinfo *info = sb_dqinfo(sb, type); 794 struct mem_dqinfo *info = sb_dqinfo(sb, type);
769 struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv) 795 struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv)
770 ->dqi_ibh; 796 ->dqi_libh;
771 int status; 797 int status;
772 798
773 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info, 799 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info,
@@ -790,10 +816,6 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
790 int mark_clean = 1, len; 816 int mark_clean = 1, len;
791 int status; 817 int status;
792 818
793 /* At this point we know there are no more dquots and thus
794 * even if there's some sync in the pdflush queue, it won't
795 * find any dquots and return without doing anything */
796 cancel_delayed_work_sync(&oinfo->dqi_sync_work);
797 iput(oinfo->dqi_gqinode); 819 iput(oinfo->dqi_gqinode);
798 ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock); 820 ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
799 ocfs2_lock_res_free(&oinfo->dqi_gqlock); 821 ocfs2_lock_res_free(&oinfo->dqi_gqlock);
@@ -828,7 +850,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
828 /* Mark local file as clean */ 850 /* Mark local file as clean */
829 info->dqi_flags |= OLQF_CLEAN; 851 info->dqi_flags |= OLQF_CLEAN;
830 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], 852 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type],
831 oinfo->dqi_ibh, 853 oinfo->dqi_libh,
832 olq_update_info, 854 olq_update_info,
833 info); 855 info);
834 if (status < 0) { 856 if (status < 0) {
@@ -838,7 +860,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
838 860
839out: 861out:
840 ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1); 862 ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1);
841 brelse(oinfo->dqi_ibh); 863 brelse(oinfo->dqi_libh);
842 brelse(oinfo->dqi_lqi_bh); 864 brelse(oinfo->dqi_lqi_bh);
843 kfree(oinfo); 865 kfree(oinfo);
844 return 0; 866 return 0;
@@ -866,22 +888,21 @@ static void olq_set_dquot(struct buffer_head *bh, void *private)
866} 888}
867 889
868/* Write dquot to local quota file */ 890/* Write dquot to local quota file */
869static int ocfs2_local_write_dquot(struct dquot *dquot) 891int ocfs2_local_write_dquot(struct dquot *dquot)
870{ 892{
871 struct super_block *sb = dquot->dq_sb; 893 struct super_block *sb = dquot->dq_sb;
872 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); 894 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
873 struct buffer_head *bh = NULL; 895 struct buffer_head *bh;
896 struct inode *lqinode = sb_dqopt(sb)->files[dquot->dq_type];
874 int status; 897 int status;
875 898
876 status = ocfs2_read_quota_block(sb_dqopt(sb)->files[dquot->dq_type], 899 status = ocfs2_read_quota_phys_block(lqinode, od->dq_local_phys_blk,
877 ol_dqblk_file_block(sb, od->dq_local_off), 900 &bh);
878 &bh);
879 if (status) { 901 if (status) {
880 mlog_errno(status); 902 mlog_errno(status);
881 goto out; 903 goto out;
882 } 904 }
883 status = ocfs2_modify_bh(sb_dqopt(sb)->files[dquot->dq_type], bh, 905 status = ocfs2_modify_bh(lqinode, bh, olq_set_dquot, od);
884 olq_set_dquot, od);
885 if (status < 0) { 906 if (status < 0) {
886 mlog_errno(status); 907 mlog_errno(status);
887 goto out; 908 goto out;
@@ -981,10 +1002,8 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
981 } 1002 }
982 1003
983 /* Initialize chunk header */ 1004 /* Initialize chunk header */
984 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
985 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, 1005 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
986 &p_blkno, NULL, NULL); 1006 &p_blkno, NULL, NULL);
987 up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
988 if (status < 0) { 1007 if (status < 0) {
989 mlog_errno(status); 1008 mlog_errno(status);
990 goto out_trans; 1009 goto out_trans;
@@ -1009,17 +1028,11 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
1009 sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - 1028 sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
1010 OCFS2_QBLK_RESERVED_SPACE); 1029 OCFS2_QBLK_RESERVED_SPACE);
1011 unlock_buffer(bh); 1030 unlock_buffer(bh);
1012 status = ocfs2_journal_dirty(handle, bh); 1031 ocfs2_journal_dirty(handle, bh);
1013 if (status < 0) {
1014 mlog_errno(status);
1015 goto out_trans;
1016 }
1017 1032
1018 /* Initialize new block with structures */ 1033 /* Initialize new block with structures */
1019 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1020 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1, 1034 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1,
1021 &p_blkno, NULL, NULL); 1035 &p_blkno, NULL, NULL);
1022 up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1023 if (status < 0) { 1036 if (status < 0) {
1024 mlog_errno(status); 1037 mlog_errno(status);
1025 goto out_trans; 1038 goto out_trans;
@@ -1040,11 +1053,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
1040 lock_buffer(dbh); 1053 lock_buffer(dbh);
1041 memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE); 1054 memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE);
1042 unlock_buffer(dbh); 1055 unlock_buffer(dbh);
1043 status = ocfs2_journal_dirty(handle, dbh); 1056 ocfs2_journal_dirty(handle, dbh);
1044 if (status < 0) {
1045 mlog_errno(status);
1046 goto out_trans;
1047 }
1048 1057
1049 /* Update local quotafile info */ 1058 /* Update local quotafile info */
1050 oinfo->dqi_blocks += 2; 1059 oinfo->dqi_blocks += 2;
@@ -1120,10 +1129,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1120 } 1129 }
1121 1130
1122 /* Get buffer from the just added block */ 1131 /* Get buffer from the just added block */
1123 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1124 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, 1132 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
1125 &p_blkno, NULL, NULL); 1133 &p_blkno, NULL, NULL);
1126 up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
1127 if (status < 0) { 1134 if (status < 0) {
1128 mlog_errno(status); 1135 mlog_errno(status);
1129 goto out; 1136 goto out;
@@ -1155,11 +1162,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1155 lock_buffer(bh); 1162 lock_buffer(bh);
1156 memset(bh->b_data, 0, sb->s_blocksize); 1163 memset(bh->b_data, 0, sb->s_blocksize);
1157 unlock_buffer(bh); 1164 unlock_buffer(bh);
1158 status = ocfs2_journal_dirty(handle, bh); 1165 ocfs2_journal_dirty(handle, bh);
1159 if (status < 0) { 1166
1160 mlog_errno(status);
1161 goto out_trans;
1162 }
1163 /* Update chunk header */ 1167 /* Update chunk header */
1164 status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), 1168 status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode),
1165 chunk->qc_headerbh, 1169 chunk->qc_headerbh,
@@ -1173,11 +1177,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1173 lock_buffer(chunk->qc_headerbh); 1177 lock_buffer(chunk->qc_headerbh);
1174 le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb)); 1178 le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb));
1175 unlock_buffer(chunk->qc_headerbh); 1179 unlock_buffer(chunk->qc_headerbh);
1176 status = ocfs2_journal_dirty(handle, chunk->qc_headerbh); 1180 ocfs2_journal_dirty(handle, chunk->qc_headerbh);
1177 if (status < 0) { 1181
1178 mlog_errno(status);
1179 goto out_trans;
1180 }
1181 /* Update file header */ 1182 /* Update file header */
1182 oinfo->dqi_blocks++; 1183 oinfo->dqi_blocks++;
1183 status = ocfs2_local_write_info(sb, type); 1184 status = ocfs2_local_write_info(sb, type);
@@ -1210,7 +1211,7 @@ static void olq_alloc_dquot(struct buffer_head *bh, void *private)
1210} 1211}
1211 1212
1212/* Create dquot in the local file for given id */ 1213/* Create dquot in the local file for given id */
1213static int ocfs2_create_local_dquot(struct dquot *dquot) 1214int ocfs2_create_local_dquot(struct dquot *dquot)
1214{ 1215{
1215 struct super_block *sb = dquot->dq_sb; 1216 struct super_block *sb = dquot->dq_sb;
1216 int type = dquot->dq_type; 1217 int type = dquot->dq_type;
@@ -1219,17 +1220,27 @@ static int ocfs2_create_local_dquot(struct dquot *dquot)
1219 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); 1220 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
1220 int offset; 1221 int offset;
1221 int status; 1222 int status;
1223 u64 pcount;
1222 1224
1225 down_write(&OCFS2_I(lqinode)->ip_alloc_sem);
1223 chunk = ocfs2_find_free_entry(sb, type, &offset); 1226 chunk = ocfs2_find_free_entry(sb, type, &offset);
1224 if (!chunk) { 1227 if (!chunk) {
1225 chunk = ocfs2_extend_local_quota_file(sb, type, &offset); 1228 chunk = ocfs2_extend_local_quota_file(sb, type, &offset);
1226 if (IS_ERR(chunk)) 1229 if (IS_ERR(chunk)) {
1227 return PTR_ERR(chunk); 1230 status = PTR_ERR(chunk);
1231 goto out;
1232 }
1228 } else if (IS_ERR(chunk)) { 1233 } else if (IS_ERR(chunk)) {
1229 return PTR_ERR(chunk); 1234 status = PTR_ERR(chunk);
1235 goto out;
1230 } 1236 }
1231 od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset); 1237 od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset);
1232 od->dq_chunk = chunk; 1238 od->dq_chunk = chunk;
1239 status = ocfs2_extent_map_get_blocks(lqinode,
1240 ol_dqblk_block(sb, chunk->qc_num, offset),
1241 &od->dq_local_phys_blk,
1242 &pcount,
1243 NULL);
1233 1244
1234 /* Initialize dquot structure on disk */ 1245 /* Initialize dquot structure on disk */
1235 status = ocfs2_local_write_dquot(dquot); 1246 status = ocfs2_local_write_dquot(dquot);
@@ -1246,39 +1257,15 @@ static int ocfs2_create_local_dquot(struct dquot *dquot)
1246 goto out; 1257 goto out;
1247 } 1258 }
1248out: 1259out:
1260 up_write(&OCFS2_I(lqinode)->ip_alloc_sem);
1249 return status; 1261 return status;
1250} 1262}
1251 1263
1252/* Create entry in local file for dquot, load data from the global file */ 1264/*
1253static int ocfs2_local_read_dquot(struct dquot *dquot) 1265 * Release dquot structure from local quota file. ocfs2_release_dquot() has
1254{ 1266 * already started a transaction and written all changes to global quota file
1255 int status; 1267 */
1256 1268int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot)
1257 mlog_entry("id=%u, type=%d\n", dquot->dq_id, dquot->dq_type);
1258
1259 status = ocfs2_global_read_dquot(dquot);
1260 if (status < 0) {
1261 mlog_errno(status);
1262 goto out_err;
1263 }
1264
1265 /* Now create entry in the local quota file */
1266 status = ocfs2_create_local_dquot(dquot);
1267 if (status < 0) {
1268 mlog_errno(status);
1269 goto out_err;
1270 }
1271 mlog_exit(0);
1272 return 0;
1273out_err:
1274 mlog_exit(status);
1275 return status;
1276}
1277
1278/* Release dquot structure from local quota file. ocfs2_release_dquot() has
1279 * already started a transaction and obtained exclusive lock for global
1280 * quota file. */
1281static int ocfs2_local_release_dquot(struct dquot *dquot)
1282{ 1269{
1283 int status; 1270 int status;
1284 int type = dquot->dq_type; 1271 int type = dquot->dq_type;
@@ -1286,15 +1273,6 @@ static int ocfs2_local_release_dquot(struct dquot *dquot)
1286 struct super_block *sb = dquot->dq_sb; 1273 struct super_block *sb = dquot->dq_sb;
1287 struct ocfs2_local_disk_chunk *dchunk; 1274 struct ocfs2_local_disk_chunk *dchunk;
1288 int offset; 1275 int offset;
1289 handle_t *handle = journal_current_handle();
1290
1291 BUG_ON(!handle);
1292 /* First write all local changes to global file */
1293 status = ocfs2_global_release_dquot(dquot);
1294 if (status < 0) {
1295 mlog_errno(status);
1296 goto out;
1297 }
1298 1276
1299 status = ocfs2_journal_access_dq(handle, 1277 status = ocfs2_journal_access_dq(handle,
1300 INODE_CACHE(sb_dqopt(sb)->files[type]), 1278 INODE_CACHE(sb_dqopt(sb)->files[type]),
@@ -1312,12 +1290,8 @@ static int ocfs2_local_release_dquot(struct dquot *dquot)
1312 ocfs2_clear_bit(offset, dchunk->dqc_bitmap); 1290 ocfs2_clear_bit(offset, dchunk->dqc_bitmap);
1313 le32_add_cpu(&dchunk->dqc_free, 1); 1291 le32_add_cpu(&dchunk->dqc_free, 1);
1314 unlock_buffer(od->dq_chunk->qc_headerbh); 1292 unlock_buffer(od->dq_chunk->qc_headerbh);
1315 status = ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh); 1293 ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
1316 if (status < 0) { 1294
1317 mlog_errno(status);
1318 goto out;
1319 }
1320 status = 0;
1321out: 1295out:
1322 /* Clear the read bit so that next time someone uses this 1296 /* Clear the read bit so that next time someone uses this
1323 * dquot he reads fresh info from disk and allocates local 1297 * dquot he reads fresh info from disk and allocates local
@@ -1331,9 +1305,6 @@ static const struct quota_format_ops ocfs2_format_ops = {
1331 .read_file_info = ocfs2_local_read_info, 1305 .read_file_info = ocfs2_local_read_info,
1332 .write_file_info = ocfs2_global_write_info, 1306 .write_file_info = ocfs2_global_write_info,
1333 .free_file_info = ocfs2_local_free_info, 1307 .free_file_info = ocfs2_local_free_info,
1334 .read_dqblk = ocfs2_local_read_dquot,
1335 .commit_dqblk = ocfs2_local_write_dquot,
1336 .release_dqblk = ocfs2_local_release_dquot,
1337}; 1308};
1338 1309
1339struct quota_format_type ocfs2_quota_format = { 1310struct quota_format_type ocfs2_quota_format = {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 5cbcd0f008fc..4793f36f6518 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -570,7 +570,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
570 struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL; 570 struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL;
571 u16 suballoc_bit_start; 571 u16 suballoc_bit_start;
572 u32 num_got; 572 u32 num_got;
573 u64 first_blkno; 573 u64 suballoc_loc, first_blkno;
574 574
575 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); 575 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
576 576
@@ -596,7 +596,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
596 goto out_commit; 596 goto out_commit;
597 } 597 }
598 598
599 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, 599 ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
600 &suballoc_bit_start, &num_got, 600 &suballoc_bit_start, &num_got,
601 &first_blkno); 601 &first_blkno);
602 if (ret) { 602 if (ret) {
@@ -626,6 +626,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
626 memset(rb, 0, inode->i_sb->s_blocksize); 626 memset(rb, 0, inode->i_sb->s_blocksize);
627 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 627 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
628 rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); 628 rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
629 rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
629 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 630 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
630 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation); 631 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
631 rb->rf_blkno = cpu_to_le64(first_blkno); 632 rb->rf_blkno = cpu_to_le64(first_blkno);
@@ -790,7 +791,10 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
790 if (le32_to_cpu(rb->rf_count) == 1) { 791 if (le32_to_cpu(rb->rf_count) == 1) {
791 blk = le64_to_cpu(rb->rf_blkno); 792 blk = le64_to_cpu(rb->rf_blkno);
792 bit = le16_to_cpu(rb->rf_suballoc_bit); 793 bit = le16_to_cpu(rb->rf_suballoc_bit);
793 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 794 if (rb->rf_suballoc_loc)
795 bg_blkno = le64_to_cpu(rb->rf_suballoc_loc);
796 else
797 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
794 798
795 alloc_inode = ocfs2_get_system_file_inode(osb, 799 alloc_inode = ocfs2_get_system_file_inode(osb,
796 EXTENT_ALLOC_SYSTEM_INODE, 800 EXTENT_ALLOC_SYSTEM_INODE,
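
The fallback keeps old metadata working: blocks written before the feature existed still carry zero in the (formerly reserved) rf_suballoc_loc field, so the group is computed from the block and bit as before. As a standalone helper the pattern would look roughly like this (illustrative only):

	static u64 example_suballoc_group(__le64 suballoc_loc, u64 blkno, u16 bit)
	{
		/* Pre-discontig metadata left this field zero (reserved). */
		if (suballoc_loc)
			return le64_to_cpu(suballoc_loc);
		return ocfs2_which_suballoc_group(blkno, bit);
	}
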
@@ -1268,9 +1272,7 @@ static int ocfs2_change_refcount_rec(handle_t *handle,
1268 } else if (merge) 1272 } else if (merge)
1269 ocfs2_refcount_rec_merge(rb, index); 1273 ocfs2_refcount_rec_merge(rb, index);
1270 1274
1271 ret = ocfs2_journal_dirty(handle, ref_leaf_bh); 1275 ocfs2_journal_dirty(handle, ref_leaf_bh);
1272 if (ret)
1273 mlog_errno(ret);
1274out: 1276out:
1275 return ret; 1277 return ret;
1276} 1278}
@@ -1284,7 +1286,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1284 int ret; 1286 int ret;
1285 u16 suballoc_bit_start; 1287 u16 suballoc_bit_start;
1286 u32 num_got; 1288 u32 num_got;
1287 u64 blkno; 1289 u64 suballoc_loc, blkno;
1288 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 1290 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1289 struct buffer_head *new_bh = NULL; 1291 struct buffer_head *new_bh = NULL;
1290 struct ocfs2_refcount_block *new_rb; 1292 struct ocfs2_refcount_block *new_rb;
@@ -1298,7 +1300,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1298 goto out; 1300 goto out;
1299 } 1301 }
1300 1302
1301 ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1, 1303 ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
1302 &suballoc_bit_start, &num_got, 1304 &suballoc_bit_start, &num_got,
1303 &blkno); 1305 &blkno);
1304 if (ret) { 1306 if (ret) {
@@ -1330,6 +1332,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1330 1332
1331 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; 1333 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1332 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); 1334 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1335 new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
1333 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1336 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1334 new_rb->rf_blkno = cpu_to_le64(blkno); 1337 new_rb->rf_blkno = cpu_to_le64(blkno);
1335 new_rb->rf_cpos = cpu_to_le32(0); 1338 new_rb->rf_cpos = cpu_to_le32(0);
@@ -1524,7 +1527,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1524 int ret; 1527 int ret;
1525 u16 suballoc_bit_start; 1528 u16 suballoc_bit_start;
1526 u32 num_got, new_cpos; 1529 u32 num_got, new_cpos;
1527 u64 blkno; 1530 u64 suballoc_loc, blkno;
1528 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 1531 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1529 struct ocfs2_refcount_block *root_rb = 1532 struct ocfs2_refcount_block *root_rb =
1530 (struct ocfs2_refcount_block *)ref_root_bh->b_data; 1533 (struct ocfs2_refcount_block *)ref_root_bh->b_data;
@@ -1548,7 +1551,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1548 goto out; 1551 goto out;
1549 } 1552 }
1550 1553
1551 ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1, 1554 ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
1552 &suballoc_bit_start, &num_got, 1555 &suballoc_bit_start, &num_got,
1553 &blkno); 1556 &blkno);
1554 if (ret) { 1557 if (ret) {
@@ -1576,6 +1579,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1576 memset(new_rb, 0, sb->s_blocksize); 1579 memset(new_rb, 0, sb->s_blocksize);
1577 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 1580 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
1578 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); 1581 new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1582 new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
1579 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1583 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1580 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); 1584 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
1581 new_rb->rf_blkno = cpu_to_le64(blkno); 1585 new_rb->rf_blkno = cpu_to_le64(blkno);
@@ -1694,7 +1698,7 @@ static int ocfs2_adjust_refcount_rec(handle_t *handle,
1694 * 2 more credits, one for the leaf refcount block, one for 1698 * 2 more credits, one for the leaf refcount block, one for
1695 * the extent block contains the extent rec. 1699 * the extent block contains the extent rec.
1696 */ 1700 */
1697 ret = ocfs2_extend_trans(handle, handle->h_buffer_credits + 2); 1701 ret = ocfs2_extend_trans(handle, 2);
1698 if (ret < 0) { 1702 if (ret < 0) {
1699 mlog_errno(ret); 1703 mlog_errno(ret);
1700 goto out; 1704 goto out;
@@ -1802,11 +1806,7 @@ static int ocfs2_insert_refcount_rec(handle_t *handle,
1802 if (merge) 1806 if (merge)
1803 ocfs2_refcount_rec_merge(rb, index); 1807 ocfs2_refcount_rec_merge(rb, index);
1804 1808
1805 ret = ocfs2_journal_dirty(handle, ref_leaf_bh); 1809 ocfs2_journal_dirty(handle, ref_leaf_bh);
1806 if (ret) {
1807 mlog_errno(ret);
1808 goto out;
1809 }
1810 1810
1811 if (index == 0) { 1811 if (index == 0) {
1812 ret = ocfs2_adjust_refcount_rec(handle, ci, 1812 ret = ocfs2_adjust_refcount_rec(handle, ci,
@@ -1977,9 +1977,7 @@ static int ocfs2_split_refcount_rec(handle_t *handle,
1977 ocfs2_refcount_rec_merge(rb, index); 1977 ocfs2_refcount_rec_merge(rb, index);
1978 } 1978 }
1979 1979
1980 ret = ocfs2_journal_dirty(handle, ref_leaf_bh); 1980 ocfs2_journal_dirty(handle, ref_leaf_bh);
1981 if (ret)
1982 mlog_errno(ret);
1983 1981
1984out: 1982out:
1985 brelse(new_bh); 1983 brelse(new_bh);
@@ -2112,6 +2110,7 @@ static int ocfs2_remove_refcount_extent(handle_t *handle,
2112 */ 2110 */
2113 ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE, 2111 ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE,
2114 le16_to_cpu(rb->rf_suballoc_slot), 2112 le16_to_cpu(rb->rf_suballoc_slot),
2113 le64_to_cpu(rb->rf_suballoc_loc),
2115 le64_to_cpu(rb->rf_blkno), 2114 le64_to_cpu(rb->rf_blkno),
2116 le16_to_cpu(rb->rf_suballoc_bit)); 2115 le16_to_cpu(rb->rf_suballoc_bit));
2117 if (ret) { 2116 if (ret) {
@@ -2516,20 +2515,19 @@ out:
2516 * 2515 *
2517 * Normally the refcount blocks store these refcount should be 2516 * Normally the refcount blocks store these refcount should be
2518 * contiguous also, so that we can get the number easily. 2517 * contiguous also, so that we can get the number easily.
2519 * As for meta_ac, we will at most add split 2 refcount record and 2518 * We will at most split 2 refcount records and add 2 more
2520 * 2 more refcount block, so just check it in a rough way. 2519 * refcount blocks, so just check it in a rough way.
2521 * 2520 *
2522 * Caller must hold refcount tree lock. 2521 * Caller must hold refcount tree lock.
2523 */ 2522 */
2524int ocfs2_prepare_refcount_change_for_del(struct inode *inode, 2523int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2525 struct buffer_head *di_bh, 2524 u64 refcount_loc,
2526 u64 phys_blkno, 2525 u64 phys_blkno,
2527 u32 clusters, 2526 u32 clusters,
2528 int *credits, 2527 int *credits,
2529 struct ocfs2_alloc_context **meta_ac) 2528 int *ref_blocks)
2530{ 2529{
2531 int ret, ref_blocks = 0; 2530 int ret;
2532 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2533 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2531 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2534 struct buffer_head *ref_root_bh = NULL; 2532 struct buffer_head *ref_root_bh = NULL;
2535 struct ocfs2_refcount_tree *tree; 2533 struct ocfs2_refcount_tree *tree;
@@ -2546,14 +2544,13 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2546 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); 2544 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
2547 2545
2548 ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), 2546 ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
2549 le64_to_cpu(di->i_refcount_loc), &tree); 2547 refcount_loc, &tree);
2550 if (ret) { 2548 if (ret) {
2551 mlog_errno(ret); 2549 mlog_errno(ret);
2552 goto out; 2550 goto out;
2553 } 2551 }
2554 2552
2555 ret = ocfs2_read_refcount_block(&tree->rf_ci, 2553 ret = ocfs2_read_refcount_block(&tree->rf_ci, refcount_loc,
2556 le64_to_cpu(di->i_refcount_loc),
2557 &ref_root_bh); 2554 &ref_root_bh);
2558 if (ret) { 2555 if (ret) {
2559 mlog_errno(ret); 2556 mlog_errno(ret);
@@ -2564,21 +2561,14 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2564 &tree->rf_ci, 2561 &tree->rf_ci,
2565 ref_root_bh, 2562 ref_root_bh,
2566 start_cpos, clusters, 2563 start_cpos, clusters,
2567 &ref_blocks, credits); 2564 ref_blocks, credits);
2568 if (ret) { 2565 if (ret) {
2569 mlog_errno(ret); 2566 mlog_errno(ret);
2570 goto out; 2567 goto out;
2571 } 2568 }
2572 2569
2573 mlog(0, "reserve new metadata %d, credits = %d\n", 2570 mlog(0, "reserve new metadata %d blocks, credits = %d\n",
2574 ref_blocks, *credits); 2571 *ref_blocks, *credits);
2575
2576 if (ref_blocks) {
2577 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
2578 ref_blocks, meta_ac);
2579 if (ret)
2580 mlog_errno(ret);
2581 }
2582 2572
2583out: 2573out:
2584 brelse(ref_root_bh); 2574 brelse(ref_root_bh);
@@ -3040,11 +3030,7 @@ static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
3040 } 3030 }
3041 3031
3042 memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize); 3032 memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize);
3043 ret = ocfs2_journal_dirty(handle, new_bh); 3033 ocfs2_journal_dirty(handle, new_bh);
3044 if (ret) {
3045 mlog_errno(ret);
3046 break;
3047 }
3048 3034
3049 brelse(new_bh); 3035 brelse(new_bh);
3050 brelse(old_bh); 3036 brelse(old_bh);
@@ -3282,7 +3268,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
3282 } else { 3268 } else {
3283 delete = 1; 3269 delete = 1;
3284 3270
3285 ret = __ocfs2_claim_clusters(osb, handle, 3271 ret = __ocfs2_claim_clusters(handle,
3286 context->data_ac, 3272 context->data_ac,
3287 1, set_len, 3273 1, set_len,
3288 &new_bit, &new_len); 3274 &new_bit, &new_len);
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index c1d19b1d3ecc..9983ba1570e2 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -47,11 +47,11 @@ int ocfs2_decrease_refcount(struct inode *inode,
47 struct ocfs2_cached_dealloc_ctxt *dealloc, 47 struct ocfs2_cached_dealloc_ctxt *dealloc,
48 int delete); 48 int delete);
49int ocfs2_prepare_refcount_change_for_del(struct inode *inode, 49int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
50 struct buffer_head *di_bh, 50 u64 refcount_loc,
51 u64 phys_blkno, 51 u64 phys_blkno,
52 u32 clusters, 52 u32 clusters,
53 int *credits, 53 int *credits,
54 struct ocfs2_alloc_context **meta_ac); 54 int *ref_blocks);
55int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh, 55int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh,
56 u32 cpos, u32 write_len, u32 max_cpos); 56 u32 cpos, u32 write_len, u32 max_cpos);
57 57
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
new file mode 100644
index 000000000000..40650021fc24
--- /dev/null
+++ b/fs/ocfs2/reservations.c
@@ -0,0 +1,847 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * reservations.c
5 *
6 * Allocation reservations implementation
7 *
8 * Some code borrowed from fs/ext3/balloc.c and is:
9 *
10 * Copyright (C) 1992, 1993, 1994, 1995
11 * Remy Card (card@masi.ibp.fr)
12 * Laboratoire MASI - Institut Blaise Pascal
13 * Universite Pierre et Marie Curie (Paris VI)
14 *
15 * The rest is copyright (C) 2010 Novell. All rights reserved.
16 *
17 * This program is free software; you can redistribute it and/or
18 * modify it under the terms of the GNU General Public
19 * License version 2 as published by the Free Software Foundation.
20 *
21 * This program is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24 * General Public License for more details.
25 */
26
27#include <linux/fs.h>
28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/highmem.h>
31#include <linux/bitops.h>
32#include <linux/list.h>
33
34#define MLOG_MASK_PREFIX ML_RESERVATIONS
35#include <cluster/masklog.h>
36
37#include "ocfs2.h"
38
39#ifdef CONFIG_OCFS2_DEBUG_FS
40#define OCFS2_CHECK_RESERVATIONS
41#endif
42
43DEFINE_SPINLOCK(resv_lock);
44
45#define OCFS2_MIN_RESV_WINDOW_BITS 8
46#define OCFS2_MAX_RESV_WINDOW_BITS 1024
47
48int ocfs2_dir_resv_allowed(struct ocfs2_super *osb)
49{
50 return (osb->osb_resv_level && osb->osb_dir_resv_level);
51}
52
53static unsigned int ocfs2_resv_window_bits(struct ocfs2_reservation_map *resmap,
54 struct ocfs2_alloc_reservation *resv)
55{
56 struct ocfs2_super *osb = resmap->m_osb;
57 unsigned int bits;
58
59 if (!(resv->r_flags & OCFS2_RESV_FLAG_DIR)) {
60 /* 8, 16, 32, 64, 128, 256, 512, 1024 */
61 bits = 4 << osb->osb_resv_level;
62 } else {
63 bits = 4 << osb->osb_dir_resv_level;
64 }
65 return bits;
66}
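A small standalone sketch of the sizing rule above, assuming levels behave as the comment says: the window is (4 << resv_level) bits, level 0 disables reservations entirely (see ocfs2_resmap_disabled() below), and levels 1..8 give the 8..1024 progression noted in the comment.

#include <stdio.h>

/* Window size per reservation level: each level doubles the window. */
int main(void)
{
	for (int level = 1; level <= 8; level++)
		printf("resv_level %d -> %d-bit window\n", level, 4 << level);
	return 0;
}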
67
68static inline unsigned int ocfs2_resv_end(struct ocfs2_alloc_reservation *resv)
69{
70 if (resv->r_len)
71 return resv->r_start + resv->r_len - 1;
72 return resv->r_start;
73}
74
75static inline int ocfs2_resv_empty(struct ocfs2_alloc_reservation *resv)
76{
77 return !!(resv->r_len == 0);
78}
79
80static inline int ocfs2_resmap_disabled(struct ocfs2_reservation_map *resmap)
81{
82 if (resmap->m_osb->osb_resv_level == 0)
83 return 1;
84 return 0;
85}
86
87static void ocfs2_dump_resv(struct ocfs2_reservation_map *resmap)
88{
89 struct ocfs2_super *osb = resmap->m_osb;
90 struct rb_node *node;
91 struct ocfs2_alloc_reservation *resv;
92 int i = 0;
93
94 mlog(ML_NOTICE, "Dumping resmap for device %s. Bitmap length: %u\n",
95 osb->dev_str, resmap->m_bitmap_len);
96
97 node = rb_first(&resmap->m_reservations);
98 while (node) {
99 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
100
101 mlog(ML_NOTICE, "start: %u\tend: %u\tlen: %u\tlast_start: %u"
102 "\tlast_len: %u\n", resv->r_start,
103 ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
104 resv->r_last_len);
105
106 node = rb_next(node);
107 i++;
108 }
109
110 mlog(ML_NOTICE, "%d reservations found. LRU follows\n", i);
111
112 i = 0;
113 list_for_each_entry(resv, &resmap->m_lru, r_lru) {
114 mlog(ML_NOTICE, "LRU(%d) start: %u\tend: %u\tlen: %u\t"
115 "last_start: %u\tlast_len: %u\n", i, resv->r_start,
116 ocfs2_resv_end(resv), resv->r_len, resv->r_last_start,
117 resv->r_last_len);
118
119 i++;
120 }
121}
122
123#ifdef OCFS2_CHECK_RESERVATIONS
124static int ocfs2_validate_resmap_bits(struct ocfs2_reservation_map *resmap,
125 int i,
126 struct ocfs2_alloc_reservation *resv)
127{
128 char *disk_bitmap = resmap->m_disk_bitmap;
129 unsigned int start = resv->r_start;
130 unsigned int end = ocfs2_resv_end(resv);
131
132 while (start <= end) {
133 if (ocfs2_test_bit(start, disk_bitmap)) {
134 mlog(ML_ERROR,
135 "reservation %d covers an allocated area "
136 "starting at bit %u!\n", i, start);
137 return 1;
138 }
139
140 start++;
141 }
142 return 0;
143}
144
145static void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
146{
147 unsigned int off = 0;
148 int i = 0;
149 struct rb_node *node;
150 struct ocfs2_alloc_reservation *resv;
151
152 node = rb_first(&resmap->m_reservations);
153 while (node) {
154 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
155
156 if (i > 0 && resv->r_start <= off) {
157 mlog(ML_ERROR, "reservation %d has bad start off!\n",
158 i);
159 goto bad;
160 }
161
162 if (resv->r_len == 0) {
163 mlog(ML_ERROR, "reservation %d has no length!\n",
164 i);
165 goto bad;
166 }
167
168 if (resv->r_start > ocfs2_resv_end(resv)) {
169 mlog(ML_ERROR, "reservation %d has invalid range!\n",
170 i);
171 goto bad;
172 }
173
174 if (ocfs2_resv_end(resv) >= resmap->m_bitmap_len) {
175 mlog(ML_ERROR, "reservation %d extends past bitmap!\n",
176 i);
177 goto bad;
178 }
179
180 if (ocfs2_validate_resmap_bits(resmap, i, resv))
181 goto bad;
182
183 off = ocfs2_resv_end(resv);
184 node = rb_next(node);
185
186 i++;
187 }
188 return;
189
190bad:
191 ocfs2_dump_resv(resmap);
192 BUG();
193}
194#else
195static inline void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap)
196{
197
198}
199#endif
200
201void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv)
202{
203 memset(resv, 0, sizeof(*resv));
204 INIT_LIST_HEAD(&resv->r_lru);
205}
206
207void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
208 unsigned int flags)
209{
210 BUG_ON(flags & ~OCFS2_RESV_TYPES);
211
212 resv->r_flags |= flags;
213}
214
215int ocfs2_resmap_init(struct ocfs2_super *osb,
216 struct ocfs2_reservation_map *resmap)
217{
218 memset(resmap, 0, sizeof(*resmap));
219
220 resmap->m_osb = osb;
221 resmap->m_reservations = RB_ROOT;
222 /* m_bitmap_len is initialized to zero by the above memset. */
223 INIT_LIST_HEAD(&resmap->m_lru);
224
225 return 0;
226}
227
228static void ocfs2_resv_mark_lru(struct ocfs2_reservation_map *resmap,
229 struct ocfs2_alloc_reservation *resv)
230{
231 assert_spin_locked(&resv_lock);
232
233 if (!list_empty(&resv->r_lru))
234 list_del_init(&resv->r_lru);
235
236 list_add_tail(&resv->r_lru, &resmap->m_lru);
237}
238
239static void __ocfs2_resv_trunc(struct ocfs2_alloc_reservation *resv)
240{
241 resv->r_len = 0;
242 resv->r_start = 0;
243}
244
245static void ocfs2_resv_remove(struct ocfs2_reservation_map *resmap,
246 struct ocfs2_alloc_reservation *resv)
247{
248 if (resv->r_flags & OCFS2_RESV_FLAG_INUSE) {
249 list_del_init(&resv->r_lru);
250 rb_erase(&resv->r_node, &resmap->m_reservations);
251 resv->r_flags &= ~OCFS2_RESV_FLAG_INUSE;
252 }
253}
254
255static void __ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
256 struct ocfs2_alloc_reservation *resv)
257{
258 assert_spin_locked(&resv_lock);
259
260 __ocfs2_resv_trunc(resv);
261 /*
262 * last_len and last_start no longer make sense if
263 * we're changing the range of our allocations.
264 */
265 resv->r_last_len = resv->r_last_start = 0;
266
267 ocfs2_resv_remove(resmap, resv);
268}
269
270/* does nothing if 'resv' is null */
271void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
272 struct ocfs2_alloc_reservation *resv)
273{
274 if (resv) {
275 spin_lock(&resv_lock);
276 __ocfs2_resv_discard(resmap, resv);
277 spin_unlock(&resv_lock);
278 }
279}
280
281static void ocfs2_resmap_clear_all_resv(struct ocfs2_reservation_map *resmap)
282{
283 struct rb_node *node;
284 struct ocfs2_alloc_reservation *resv;
285
286 assert_spin_locked(&resv_lock);
287
288 while ((node = rb_last(&resmap->m_reservations)) != NULL) {
289 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
290
291 __ocfs2_resv_discard(resmap, resv);
292 }
293}
294
295void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
296 unsigned int clen, char *disk_bitmap)
297{
298 if (ocfs2_resmap_disabled(resmap))
299 return;
300
301 spin_lock(&resv_lock);
302
303 ocfs2_resmap_clear_all_resv(resmap);
304 resmap->m_bitmap_len = clen;
305 resmap->m_disk_bitmap = disk_bitmap;
306
307 spin_unlock(&resv_lock);
308}
309
310void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap)
311{
312 /* Does nothing for now. Keep this around for API symmetry */
313}
314
315static void ocfs2_resv_insert(struct ocfs2_reservation_map *resmap,
316 struct ocfs2_alloc_reservation *new)
317{
318 struct rb_root *root = &resmap->m_reservations;
319 struct rb_node *parent = NULL;
320 struct rb_node **p = &root->rb_node;
321 struct ocfs2_alloc_reservation *tmp;
322
323 assert_spin_locked(&resv_lock);
324
325 mlog(0, "Insert reservation start: %u len: %u\n", new->r_start,
326 new->r_len);
327
328 while (*p) {
329 parent = *p;
330
331 tmp = rb_entry(parent, struct ocfs2_alloc_reservation, r_node);
332
333 if (new->r_start < tmp->r_start) {
334 p = &(*p)->rb_left;
335
336 /*
337 * This is a good place to check for
338 * overlapping reservations.
339 */
340 BUG_ON(ocfs2_resv_end(new) >= tmp->r_start);
341 } else if (new->r_start > ocfs2_resv_end(tmp)) {
342 p = &(*p)->rb_right;
343 } else {
344 /* This should never happen! */
345 mlog(ML_ERROR, "Duplicate reservation window!\n");
346 BUG();
347 }
348 }
349
350 rb_link_node(&new->r_node, parent, p);
351 rb_insert_color(&new->r_node, root);
352 new->r_flags |= OCFS2_RESV_FLAG_INUSE;
353
354 ocfs2_resv_mark_lru(resmap, new);
355
356 ocfs2_check_resmap(resmap);
357}
358
359/**
360 * ocfs2_find_resv_lhs() - find the window which contains goal
361 * @resmap: reservation map to search
362 * @goal: which bit to search for
363 *
364 * If a window containing that goal is not found, we return the window
365 * which comes before goal. Returns NULL on empty rbtree or no window
366 * before goal.
367 */
368static struct ocfs2_alloc_reservation *
369ocfs2_find_resv_lhs(struct ocfs2_reservation_map *resmap, unsigned int goal)
370{
371 struct ocfs2_alloc_reservation *resv = NULL;
372 struct ocfs2_alloc_reservation *prev_resv = NULL;
373 struct rb_node *node = resmap->m_reservations.rb_node;
374
375 assert_spin_locked(&resv_lock);
376
377 if (!node)
378 return NULL;
379
380 node = rb_first(&resmap->m_reservations);
381 while (node) {
382 resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node);
383
384 if (resv->r_start <= goal && ocfs2_resv_end(resv) >= goal)
385 break;
386
 387 /* Did we overshoot the reservation just before goal? */
388 if (resv->r_start > goal) {
389 resv = prev_resv;
390 break;
391 }
392
393 prev_resv = resv;
394 node = rb_next(node);
395 }
396
397 return resv;
398}
399
400/*
401 * We are given a range within the bitmap, which corresponds to a gap
402 * inside the reservations tree (search_start, search_len). The range
403 * can be anything from the whole bitmap, to a gap between
404 * reservations.
405 *
406 * The start value of *rstart is insignificant.
407 *
408 * This function searches the bitmap range starting at search_start
409 * with length search_len for a set of contiguous free bits. We try
410 * to find up to 'wanted' bits, but can sometimes return less.
411 *
412 * Returns the length of allocation, 0 if no free bits are found.
413 *
 414 * *rstart and *rlen will also be populated with the result.
415 */
416static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap,
417 unsigned int wanted,
418 unsigned int search_start,
419 unsigned int search_len,
420 unsigned int *rstart,
421 unsigned int *rlen)
422{
423 void *bitmap = resmap->m_disk_bitmap;
424 unsigned int best_start, best_len = 0;
425 int offset, start, found;
426
427 mlog(0, "Find %u bits within range (%u, len %u) resmap len: %u\n",
428 wanted, search_start, search_len, resmap->m_bitmap_len);
429
430 found = best_start = best_len = 0;
431
432 start = search_start;
433 while ((offset = ocfs2_find_next_zero_bit(bitmap, resmap->m_bitmap_len,
434 start)) != -1) {
435 /* Search reached end of the region */
436 if (offset >= (search_start + search_len))
437 break;
438
439 if (offset == start) {
440 /* we found a zero */
441 found++;
442 /* move start to the next bit to test */
443 start++;
444 } else {
445 /* got a zero after some ones */
446 found = 1;
447 start = offset + 1;
448 }
449 if (found > best_len) {
450 best_len = found;
451 best_start = start - found;
452 }
453
454 if (found >= wanted)
455 break;
456 }
457
458 if (best_len == 0)
459 return 0;
460
461 if (best_len >= wanted)
462 best_len = wanted;
463
464 *rlen = best_len;
465 *rstart = best_start;
466
467 mlog(0, "Found start: %u len: %u\n", best_start, best_len);
468
469 return *rlen;
470}
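The scan above is easy to check in isolation. A userspace sketch of the same best-fit logic over a byte-per-bit array; find_free_bits() and the test map are invented for the example, where the kernel walks a real bitmap with ocfs2_find_next_zero_bit():

#include <stdio.h>

/* Remember the longest run of zero bits in [start, len), stopping early
 * once 'wanted' contiguous bits are found. Returns the run length, 0 if
 * no free bits; *rstart receives the run's offset. */
static unsigned find_free_bits(const unsigned char *bitmap, unsigned len,
			       unsigned start, unsigned wanted,
			       unsigned *rstart)
{
	unsigned best_start = 0, best_len = 0, run = 0;

	for (unsigned i = start; i < len; i++) {
		if (bitmap[i]) {		/* bit set: run broken */
			run = 0;
			continue;
		}
		if (++run > best_len) {
			best_len = run;
			best_start = i - run + 1;
		}
		if (best_len >= wanted)
			break;			/* good enough, stop early */
	}
	if (best_len > wanted)
		best_len = wanted;		/* defensive clamp */
	*rstart = best_start;
	return best_len;
}

int main(void)
{
	unsigned char map[16] = { 1,1,0,0,1,0,0,0,1,1,0,0,0,0,0,1 };
	unsigned start, got = find_free_bits(map, 16, 0, 4, &start);

	printf("found %u bits at offset %u\n", got, start);	/* 4 at 10 */
	return 0;
}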
471
472static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
473 struct ocfs2_alloc_reservation *resv,
474 unsigned int goal, unsigned int wanted)
475{
476 struct rb_root *root = &resmap->m_reservations;
477 unsigned int gap_start, gap_end, gap_len;
478 struct ocfs2_alloc_reservation *prev_resv, *next_resv;
479 struct rb_node *prev, *next;
480 unsigned int cstart, clen;
481 unsigned int best_start = 0, best_len = 0;
482
483 /*
484 * Nasty cases to consider:
485 *
486 * - rbtree is empty
487 * - our window should be first in all reservations
488 * - our window should be last in all reservations
489 * - need to make sure we don't go past end of bitmap
490 */
491
492 mlog(0, "resv start: %u resv end: %u goal: %u wanted: %u\n",
493 resv->r_start, ocfs2_resv_end(resv), goal, wanted);
494
495 assert_spin_locked(&resv_lock);
496
497 if (RB_EMPTY_ROOT(root)) {
498 /*
499 * Easiest case - empty tree. We can just take
500 * whatever window of free bits we want.
501 */
502
503 mlog(0, "Empty root\n");
504
505 clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
506 resmap->m_bitmap_len - goal,
507 &cstart, &clen);
508
509 /*
510 * This should never happen - the local alloc window
511 * will always have free bits when we're called.
512 */
513 BUG_ON(goal == 0 && clen == 0);
514
515 if (clen == 0)
516 return;
517
518 resv->r_start = cstart;
519 resv->r_len = clen;
520
521 ocfs2_resv_insert(resmap, resv);
522 return;
523 }
524
525 prev_resv = ocfs2_find_resv_lhs(resmap, goal);
526
527 if (prev_resv == NULL) {
528 mlog(0, "Goal on LHS of leftmost window\n");
529
530 /*
531 * A NULL here means that the search code couldn't
532 * find a window that starts before goal.
533 *
534 * However, we can take the first window after goal,
535 * which is also by definition, the leftmost window in
536 * the entire tree. If we can find free bits in the
537 * gap between goal and the LHS window, then the
538 * reservation can safely be placed there.
539 *
540 * Otherwise we fall back to a linear search, checking
541 * the gaps in between windows for a place to
542 * allocate.
543 */
544
545 next = rb_first(root);
546 next_resv = rb_entry(next, struct ocfs2_alloc_reservation,
547 r_node);
548
549 /*
 550 * The search should never return such a window (see the
 551 * comment above).
552 */
553 if (next_resv->r_start <= goal) {
554 mlog(ML_ERROR, "goal: %u next_resv: start %u len %u\n",
555 goal, next_resv->r_start, next_resv->r_len);
556 ocfs2_dump_resv(resmap);
557 BUG();
558 }
559
560 clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
561 next_resv->r_start - goal,
562 &cstart, &clen);
563 if (clen) {
564 best_len = clen;
565 best_start = cstart;
566 if (best_len == wanted)
567 goto out_insert;
568 }
569
570 prev_resv = next_resv;
571 next_resv = NULL;
572 }
573
574 prev = &prev_resv->r_node;
575
 576 /* Now we do a linear search for a window, starting at 'prev_resv' */
577 while (1) {
578 next = rb_next(prev);
579 if (next) {
580 mlog(0, "One more resv found in linear search\n");
581 next_resv = rb_entry(next,
582 struct ocfs2_alloc_reservation,
583 r_node);
584
585 gap_start = ocfs2_resv_end(prev_resv) + 1;
586 gap_end = next_resv->r_start - 1;
587 gap_len = gap_end - gap_start + 1;
588 } else {
589 mlog(0, "No next node\n");
590 /*
591 * We're at the rightmost edge of the
592 * tree. See if a reservation between this
593 * window and the end of the bitmap will work.
594 */
595 gap_start = ocfs2_resv_end(prev_resv) + 1;
596 gap_len = resmap->m_bitmap_len - gap_start;
597 gap_end = resmap->m_bitmap_len - 1;
598 }
599
600 /*
601 * No need to check this gap if we have already found
602 * a larger region of free bits.
603 */
604 if (gap_len <= best_len)
605 goto next_resv;
606
607 clen = ocfs2_resmap_find_free_bits(resmap, wanted, gap_start,
608 gap_len, &cstart, &clen);
609 if (clen == wanted) {
610 best_len = clen;
611 best_start = cstart;
612 goto out_insert;
613 } else if (clen > best_len) {
614 best_len = clen;
615 best_start = cstart;
616 }
617
618next_resv:
619 if (!next)
620 break;
621
622 prev = next;
623 prev_resv = rb_entry(prev, struct ocfs2_alloc_reservation,
624 r_node);
625 }
626
627out_insert:
628 if (best_len) {
629 resv->r_start = best_start;
630 resv->r_len = best_len;
631 ocfs2_resv_insert(resmap, resv);
632 }
633}
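The gap arithmetic in the walk above can be shown on its own. A sketch over a sorted array of windows instead of the rbtree (values invented): each candidate gap runs from one window's end + 1 to the next window's start - 1, and the final gap runs to the end of the bitmap. The region before the leftmost window is handled separately in the kernel via ocfs2_find_resv_lhs().

#include <stdio.h>

struct window { unsigned start, len; };

int main(void)
{
	/* Three reservations in a 64-bit bitmap (values invented). */
	struct window w[] = { { 4, 4 }, { 16, 8 }, { 40, 10 } };
	unsigned n = 3, bitmap_len = 64;

	for (unsigned i = 0; i < n; i++) {
		unsigned end = w[i].start + w[i].len - 1;
		unsigned gap_start = end + 1;
		unsigned gap_end = (i + 1 < n) ? w[i + 1].start - 1
					       : bitmap_len - 1;

		if (gap_start <= gap_end)
			printf("gap [%u, %u] (%u bits)\n", gap_start,
			       gap_end, gap_end - gap_start + 1);
	}
	return 0;
}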
634
635static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap,
636 struct ocfs2_alloc_reservation *resv,
637 unsigned int wanted)
638{
639 struct ocfs2_alloc_reservation *lru_resv;
640 int tmpwindow = !!(resv->r_flags & OCFS2_RESV_FLAG_TMP);
641 unsigned int min_bits;
642
643 if (!tmpwindow)
644 min_bits = ocfs2_resv_window_bits(resmap, resv) >> 1;
645 else
 646 min_bits = wanted; /* We at least know the temp window
 647 * will use all of these bits */
648
649 /*
650 * Take the first reservation off the LRU as our 'target'. We
651 * don't try to be smart about it. There might be a case for
652 * searching based on size but I don't have enough data to be
653 * sure. --Mark (3/16/2010)
654 */
655 lru_resv = list_first_entry(&resmap->m_lru,
656 struct ocfs2_alloc_reservation, r_lru);
657
658 mlog(0, "lru resv: start: %u len: %u end: %u\n", lru_resv->r_start,
659 lru_resv->r_len, ocfs2_resv_end(lru_resv));
660
661 /*
662 * Cannibalize (some or all) of the target reservation and
663 * feed it to the current window.
664 */
665 if (lru_resv->r_len <= min_bits) {
666 /*
667 * Discard completely if size is less than or equal to a
668 * reasonable threshold - 50% of window bits for non temporary
669 * windows.
670 */
671 resv->r_start = lru_resv->r_start;
672 resv->r_len = lru_resv->r_len;
673
674 __ocfs2_resv_discard(resmap, lru_resv);
675 } else {
676 unsigned int shrink;
677 if (tmpwindow)
678 shrink = min_bits;
679 else
680 shrink = lru_resv->r_len / 2;
681
682 lru_resv->r_len -= shrink;
683
684 resv->r_start = ocfs2_resv_end(lru_resv) + 1;
685 resv->r_len = shrink;
686 }
687
688 mlog(0, "Reservation now looks like: r_start: %u r_end: %u "
689 "r_len: %u r_last_start: %u r_last_len: %u\n",
690 resv->r_start, ocfs2_resv_end(resv), resv->r_len,
691 resv->r_last_start, resv->r_last_len);
692
693 ocfs2_resv_insert(resmap, resv);
694}
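A worked example of the non-temporary split above, with invented numbers: the LRU victim keeps the lower half of its window and the new reservation takes the upper half.

#include <stdio.h>

/* Victim window of 100 bits at offset 200; non-temporary case, so the
 * new reservation takes the top half (values invented). */
int main(void)
{
	unsigned lru_start = 200, lru_len = 100;
	unsigned shrink = lru_len / 2;

	lru_len -= shrink;
	printf("victim keeps  [%u, %u]\n", lru_start, lru_start + lru_len - 1);
	printf("new window is [%u, %u]\n", lru_start + lru_len,
	       lru_start + lru_len + shrink - 1);
	return 0;
}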
695
696static void ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
697 struct ocfs2_alloc_reservation *resv,
698 unsigned int wanted)
699{
700 unsigned int goal = 0;
701
702 BUG_ON(!ocfs2_resv_empty(resv));
703
704 /*
705 * Begin by trying to get a window as close to the previous
706 * one as possible. Using the most recent allocation as a
707 * start goal makes sense.
708 */
709 if (resv->r_last_len) {
710 goal = resv->r_last_start + resv->r_last_len;
711 if (goal >= resmap->m_bitmap_len)
712 goal = 0;
713 }
714
715 __ocfs2_resv_find_window(resmap, resv, goal, wanted);
716
717 /* Search from last alloc didn't work, try once more from beginning. */
718 if (ocfs2_resv_empty(resv) && goal != 0)
719 __ocfs2_resv_find_window(resmap, resv, 0, wanted);
720
721 if (ocfs2_resv_empty(resv)) {
722 /*
723 * Still empty? Pull oldest one off the LRU, remove it from
 724 * tree, and put this one in its place.
725 */
726 ocfs2_cannibalize_resv(resmap, resv, wanted);
727 }
728
729 BUG_ON(ocfs2_resv_empty(resv));
730}
731
732int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
733 struct ocfs2_alloc_reservation *resv,
734 int *cstart, int *clen)
735{
736 unsigned int wanted = *clen;
737
738 if (resv == NULL || ocfs2_resmap_disabled(resmap))
739 return -ENOSPC;
740
741 spin_lock(&resv_lock);
742
743 /*
744 * We don't want to over-allocate for temporary
745 * windows. Otherwise, we run the risk of fragmenting the
746 * allocation space.
747 */
748 wanted = ocfs2_resv_window_bits(resmap, resv);
749 if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
750 wanted = *clen;
751
752 if (ocfs2_resv_empty(resv)) {
753 mlog(0, "empty reservation, find new window\n");
754
755 /*
756 * Try to get a window here. If it works, we must fall
 757 * through and test the bitmap. This avoids some
 758 * ping-ponging of windows due to non-reserved space
 759 * being allocated before we initialize a window for
760 * that inode.
761 */
762 ocfs2_resv_find_window(resmap, resv, wanted);
763 }
764
765 BUG_ON(ocfs2_resv_empty(resv));
766
767 *cstart = resv->r_start;
768 *clen = resv->r_len;
769
770 spin_unlock(&resv_lock);
771 return 0;
772}
773
774static void
 775ocfs2_adjust_resv_from_alloc(struct ocfs2_reservation_map *resmap,
776 struct ocfs2_alloc_reservation *resv,
777 unsigned int start, unsigned int end)
778{
779 unsigned int rhs = 0;
780 unsigned int old_end = ocfs2_resv_end(resv);
781
782 BUG_ON(start != resv->r_start || old_end < end);
783
784 /*
785 * Completely used? We can remove it then.
786 */
787 if (old_end == end) {
788 __ocfs2_resv_discard(resmap, resv);
789 return;
790 }
791
792 rhs = old_end - end;
793
794 /*
795 * This should have been trapped above.
796 */
797 BUG_ON(rhs == 0);
798
799 resv->r_start = end + 1;
800 resv->r_len = old_end - resv->r_start + 1;
801}
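A worked example of the adjustment above (numbers invented): allocations always start at r_start, so the window either disappears entirely or loses its head.

#include <stdio.h>

int main(void)
{
	unsigned r_start = 10, r_len = 20;	/* window [10, 29] */
	unsigned a_start = 10, a_end = 14;	/* claimed [10, 14] */
	unsigned old_end = r_start + r_len - 1;

	(void)a_start;	/* must equal r_start, as the BUG_ON enforces */
	if (old_end == a_end) {
		printf("window fully used, discard\n");
	} else {
		r_start = a_end + 1;
		r_len = old_end - r_start + 1;
		printf("window now [%u, %u]\n", r_start,
		       r_start + r_len - 1);	/* [15, 29] */
	}
	return 0;
}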
802
803void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
804 struct ocfs2_alloc_reservation *resv,
805 u32 cstart, u32 clen)
806{
807 unsigned int cend = cstart + clen - 1;
808
809 if (resmap == NULL || ocfs2_resmap_disabled(resmap))
810 return;
811
812 if (resv == NULL)
813 return;
814
815 BUG_ON(cstart != resv->r_start);
816
817 spin_lock(&resv_lock);
818
819 mlog(0, "claim bits: cstart: %u cend: %u clen: %u r_start: %u "
820 "r_end: %u r_len: %u, r_last_start: %u r_last_len: %u\n",
821 cstart, cend, clen, resv->r_start, ocfs2_resv_end(resv),
822 resv->r_len, resv->r_last_start, resv->r_last_len);
823
824 BUG_ON(cstart < resv->r_start);
825 BUG_ON(cstart > ocfs2_resv_end(resv));
826 BUG_ON(cend > ocfs2_resv_end(resv));
827
828 ocfs2_adjust_resv_from_alloc(resmap, resv, cstart, cend);
829 resv->r_last_start = cstart;
830 resv->r_last_len = clen;
831
832 /*
833 * May have been discarded above from
834 * ocfs2_adjust_resv_from_alloc().
835 */
836 if (!ocfs2_resv_empty(resv))
837 ocfs2_resv_mark_lru(resmap, resv);
838
839 mlog(0, "Reservation now looks like: r_start: %u r_end: %u "
840 "r_len: %u r_last_start: %u r_last_len: %u\n",
841 resv->r_start, ocfs2_resv_end(resv), resv->r_len,
842 resv->r_last_start, resv->r_last_len);
843
844 ocfs2_check_resmap(resmap);
845
846 spin_unlock(&resv_lock);
847}
diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h
new file mode 100644
index 000000000000..1e49cc29d06c
--- /dev/null
+++ b/fs/ocfs2/reservations.h
@@ -0,0 +1,159 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * reservations.h
5 *
6 * Allocation reservations function prototypes and structures.
7 *
8 * Copyright (C) 2010 Novell. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License version 2 as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#ifndef OCFS2_RESERVATIONS_H
21#define OCFS2_RESERVATIONS_H
22
23#include <linux/rbtree.h>
24
25#define OCFS2_DEFAULT_RESV_LEVEL 2
26#define OCFS2_MAX_RESV_LEVEL 9
27#define OCFS2_MIN_RESV_LEVEL 0
28
29struct ocfs2_alloc_reservation {
30 struct rb_node r_node;
31
 32 unsigned int r_start; /* Beginning of current window */
33 unsigned int r_len; /* Length of the window */
34
35 unsigned int r_last_len; /* Length of most recent alloc */
36 unsigned int r_last_start; /* Start of most recent alloc */
37 struct list_head r_lru; /* LRU list head */
38
39 unsigned int r_flags;
40};
41
42#define OCFS2_RESV_FLAG_INUSE 0x01 /* Set when r_node is part of a btree */
43#define OCFS2_RESV_FLAG_TMP 0x02 /* Temporary reservation, will be
 44 * destroyed immediately after use */
45#define OCFS2_RESV_FLAG_DIR 0x04 /* Reservation is for an unindexed
46 * directory btree */
47
48struct ocfs2_reservation_map {
49 struct rb_root m_reservations;
50 char *m_disk_bitmap;
51
52 struct ocfs2_super *m_osb;
53
54 /* The following are not initialized to meaningful values until a disk
55 * bitmap is provided. */
56 u32 m_bitmap_len; /* Number of valid
57 * bits available */
58
59 struct list_head m_lru; /* LRU of reservations
60 * structures. */
61
62};
63
64void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv);
65
66#define OCFS2_RESV_TYPES (OCFS2_RESV_FLAG_TMP|OCFS2_RESV_FLAG_DIR)
67void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
68 unsigned int flags);
69
70int ocfs2_dir_resv_allowed(struct ocfs2_super *osb);
71
72/**
73 * ocfs2_resv_discard() - truncate a reservation
 74 * @resmap: reservation map holding the window
75 * @resv: the reservation to truncate.
76 *
77 * After this function is called, the reservation will be empty, and
78 * unlinked from the rbtree.
79 */
80void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap,
81 struct ocfs2_alloc_reservation *resv);
82
83
84/**
85 * ocfs2_resmap_init() - Initialize fields of a reservations bitmap
 86 * @osb: ocfs2 superblock owning this reservation map
 87 * @resmap: struct ocfs2_reservation_map to initialize
 88 *
 89 * The disk bitmap itself is supplied later via ocfs2_resmap_restart().
90 *
91 * Only possible return value other than '0' is -ENOMEM for failure to
 92 * allocate the mirror bitmap.
93 */
94int ocfs2_resmap_init(struct ocfs2_super *osb,
95 struct ocfs2_reservation_map *resmap);
96
97/**
98 * ocfs2_resmap_restart() - "restart" a reservation bitmap
99 * @resmap: reservations bitmap
100 * @clen: Number of valid bits in the bitmap
101 * @disk_bitmap: the disk bitmap this resmap should refer to.
102 *
103 * Re-initialize the parameters of a reservation bitmap. This is
104 * useful for local alloc window slides.
105 *
 106 * This function will truncate and discard all existing
107 * reservations. A future version will recalculate existing
108 * reservations based on the new bitmap.
109 */
110void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap,
111 unsigned int clen, char *disk_bitmap);
112
113/**
114 * ocfs2_resmap_uninit() - uninitialize a reservation bitmap structure
115 * @resmap: the struct ocfs2_reservation_map to uninitialize
116 */
117void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap);
118
119/**
120 * ocfs2_resmap_resv_bits() - Return still-valid reservation bits
121 * @resmap: reservations bitmap
122 * @resv: reservation to base search from
123 * @cstart: start of proposed allocation
124 * @clen: length (in clusters) of proposed allocation
125 *
126 * Using the reservation data from resv, this function will compare
127 * resmap and resmap->m_disk_bitmap to determine what part (if any) of
128 * the reservation window is still clear to use. If resv is empty,
129 * this function will try to allocate a window for it.
130 *
131 * On success, zero is returned and the valid allocation area is set in cstart
132 * and clen.
133 *
134 * Returns -ENOSPC if reservations are disabled.
135 */
136int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
137 struct ocfs2_alloc_reservation *resv,
138 int *cstart, int *clen);
139
140/**
141 * ocfs2_resmap_claimed_bits() - Tell the reservation code that bits were used.
142 * @resmap: reservations bitmap
 143 * @resv: optional reservation to recalculate based on new bitmap
144 * @cstart: start of allocation in clusters
 145 * @clen: length of allocation in clusters.
146 *
147 * Tell the reservation code that bits were used to fulfill allocation in
148 * resmap. The bits don't have to have been part of any existing
149 * reservation. But we must always call this function when bits are claimed.
150 * Internally, the reservations code will use this information to mark the
151 * reservations bitmap. If resv is passed, it's next allocation window will be
152 * calculated. It also expects that 'cstart' is the same as we passed back
153 * from ocfs2_resmap_resv_bits().
154 */
155void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
156 struct ocfs2_alloc_reservation *resv,
157 u32 cstart, u32 clen);
158
159#endif /* OCFS2_RESERVATIONS_H */
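Taken together, the intended calling sequence is: ask ocfs2_resmap_resv_bits() for a window, allocate from it against the real disk bitmap, then report what was taken with ocfs2_resmap_claimed_bits(). A hypothetical caller, sketched under that assumption with error handling trimmed; alloc_from_resv() and the bitmap step are invented for illustration, not real ocfs2 functions:

/* Assumes <linux/errno.h> and reservations.h. */
static int alloc_from_resv(struct ocfs2_reservation_map *resmap,
			   struct ocfs2_alloc_reservation *resv,
			   unsigned int wanted)
{
	int cstart, clen = wanted;

	/* Ask for a window; fails only when reservations are disabled. */
	if (ocfs2_resmap_resv_bits(resmap, resv, &cstart, &clen))
		return -ENOSPC;

	/* ... verify [cstart, cstart + clen) against the disk bitmap and
	 * set the bits actually taken ... */

	/* Report the claim so the window can shrink or be discarded. */
	ocfs2_resmap_claimed_bits(resmap, resv, cstart, clen);
	return 0;
}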
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 3c3d673a4d20..dacd553d8617 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -134,11 +134,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
134 le16_add_cpu(&group->bg_free_bits_count, -1 * backups); 134 le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
135 } 135 }
136 136
137 ret = ocfs2_journal_dirty(handle, group_bh); 137 ocfs2_journal_dirty(handle, group_bh);
138 if (ret < 0) {
139 mlog_errno(ret);
140 goto out_rollback;
141 }
142 138
143 /* update the inode accordingly. */ 139 /* update the inode accordingly. */
144 ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh, 140 ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh,
@@ -319,7 +315,8 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
319 BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); 315 BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
320 316
321 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != 317 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
322 ocfs2_group_bitmap_size(osb->sb) * 8) { 318 ocfs2_group_bitmap_size(osb->sb, 0,
319 osb->s_feature_incompat) * 8) {
323 mlog(ML_ERROR, "The disk is too old and small. " 320 mlog(ML_ERROR, "The disk is too old and small. "
324 "Force to do offline resize."); 321 "Force to do offline resize.");
325 ret = -EINVAL; 322 ret = -EINVAL;
@@ -500,7 +497,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
500 fe = (struct ocfs2_dinode *)main_bm_bh->b_data; 497 fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
501 498
502 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != 499 if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
503 ocfs2_group_bitmap_size(osb->sb) * 8) { 500 ocfs2_group_bitmap_size(osb->sb, 0,
501 osb->s_feature_incompat) * 8) {
504 mlog(ML_ERROR, "The disk is too old and small." 502 mlog(ML_ERROR, "The disk is too old and small."
505 " Force to do offline resize."); 503 " Force to do offline resize.");
506 ret = -EINVAL; 504 ret = -EINVAL;
@@ -545,12 +543,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
545 543
546 group = (struct ocfs2_group_desc *)group_bh->b_data; 544 group = (struct ocfs2_group_desc *)group_bh->b_data;
547 group->bg_next_group = cr->c_blkno; 545 group->bg_next_group = cr->c_blkno;
548 546 ocfs2_journal_dirty(handle, group_bh);
549 ret = ocfs2_journal_dirty(handle, group_bh);
550 if (ret < 0) {
551 mlog_errno(ret);
552 goto out_commit;
553 }
554 547
555 ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode), 548 ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode),
556 main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE); 549 main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE);
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 19ba00f28547..f4c2a9eb8c4d 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -53,6 +53,15 @@
53 53
54#define OCFS2_MAX_TO_STEAL 1024 54#define OCFS2_MAX_TO_STEAL 1024
55 55
56struct ocfs2_suballoc_result {
57 u64 sr_bg_blkno; /* The bg we allocated from. Set
58 to 0 when a block group is
59 contiguous. */
60 u64 sr_blkno; /* The first allocated block */
61 unsigned int sr_bit_offset; /* The bit in the bg */
62 unsigned int sr_bits; /* How many bits we claimed */
63};
64
56static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); 65static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
57static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); 66static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
58static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); 67static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
@@ -60,6 +69,7 @@ static int ocfs2_block_group_fill(handle_t *handle,
60 struct inode *alloc_inode, 69 struct inode *alloc_inode,
61 struct buffer_head *bg_bh, 70 struct buffer_head *bg_bh,
62 u64 group_blkno, 71 u64 group_blkno,
72 unsigned int group_clusters,
63 u16 my_chain, 73 u16 my_chain,
64 struct ocfs2_chain_list *cl); 74 struct ocfs2_chain_list *cl);
65static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 75static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
@@ -73,20 +83,17 @@ static int ocfs2_cluster_group_search(struct inode *inode,
73 struct buffer_head *group_bh, 83 struct buffer_head *group_bh,
74 u32 bits_wanted, u32 min_bits, 84 u32 bits_wanted, u32 min_bits,
75 u64 max_block, 85 u64 max_block,
76 u16 *bit_off, u16 *bits_found); 86 struct ocfs2_suballoc_result *res);
77static int ocfs2_block_group_search(struct inode *inode, 87static int ocfs2_block_group_search(struct inode *inode,
78 struct buffer_head *group_bh, 88 struct buffer_head *group_bh,
79 u32 bits_wanted, u32 min_bits, 89 u32 bits_wanted, u32 min_bits,
80 u64 max_block, 90 u64 max_block,
81 u16 *bit_off, u16 *bits_found); 91 struct ocfs2_suballoc_result *res);
82static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, 92static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
83 struct ocfs2_alloc_context *ac,
84 handle_t *handle, 93 handle_t *handle,
85 u32 bits_wanted, 94 u32 bits_wanted,
86 u32 min_bits, 95 u32 min_bits,
87 u16 *bit_off, 96 struct ocfs2_suballoc_result *res);
88 unsigned int *num_bits,
89 u64 *bg_blkno);
90static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, 97static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
91 int nr); 98 int nr);
92static inline int ocfs2_block_group_set_bits(handle_t *handle, 99static inline int ocfs2_block_group_set_bits(handle_t *handle,
@@ -130,6 +137,7 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
130 } 137 }
131 brelse(ac->ac_bh); 138 brelse(ac->ac_bh);
132 ac->ac_bh = NULL; 139 ac->ac_bh = NULL;
140 ac->ac_resv = NULL;
133} 141}
134 142
135void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) 143void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -325,14 +333,38 @@ out:
325 return rc; 333 return rc;
326} 334}
327 335
336static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
337 struct ocfs2_group_desc *bg,
338 struct ocfs2_chain_list *cl,
339 u64 p_blkno, u32 clusters)
340{
341 struct ocfs2_extent_list *el = &bg->bg_list;
342 struct ocfs2_extent_rec *rec;
343
344 BUG_ON(!ocfs2_supports_discontig_bg(osb));
345 if (!el->l_next_free_rec)
346 el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
347 rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
348 rec->e_blkno = cpu_to_le64(p_blkno);
349 rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
350 le16_to_cpu(cl->cl_bpc));
351 rec->e_leaf_clusters = cpu_to_le32(clusters);
352 le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
353 le16_add_cpu(&bg->bg_free_bits_count,
354 clusters * le16_to_cpu(cl->cl_bpc));
355 le16_add_cpu(&el->l_next_free_rec, 1);
356}
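A worked example of the bookkeeping above, with invented numbers: at cl_bpc = 4 bits per cluster, building a group from chunks of 8 then 4 clusters yields extent records at cpos 0 and 8, while bg_bits grows by clusters * bpc each time.

#include <stdio.h>

int main(void)
{
	unsigned bpc = 4, bg_bits = 0, chunks[] = { 8, 4 };

	for (int i = 0; i < 2; i++) {
		printf("rec %d: e_cpos=%u e_leaf_clusters=%u\n",
		       i, bg_bits / bpc, chunks[i]);
		bg_bits += chunks[i] * bpc;	/* 32, then 48 */
	}
	printf("bg_bits=%u\n", bg_bits);
	return 0;
}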
357
328static int ocfs2_block_group_fill(handle_t *handle, 358static int ocfs2_block_group_fill(handle_t *handle,
329 struct inode *alloc_inode, 359 struct inode *alloc_inode,
330 struct buffer_head *bg_bh, 360 struct buffer_head *bg_bh,
331 u64 group_blkno, 361 u64 group_blkno,
362 unsigned int group_clusters,
332 u16 my_chain, 363 u16 my_chain,
333 struct ocfs2_chain_list *cl) 364 struct ocfs2_chain_list *cl)
334{ 365{
335 int status = 0; 366 int status = 0;
367 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
336 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 368 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
337 struct super_block * sb = alloc_inode->i_sb; 369 struct super_block * sb = alloc_inode->i_sb;
338 370
@@ -359,19 +391,23 @@ static int ocfs2_block_group_fill(handle_t *handle,
359 memset(bg, 0, sb->s_blocksize); 391 memset(bg, 0, sb->s_blocksize);
360 strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE); 392 strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
361 bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); 393 bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
362 bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb)); 394 bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
363 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl)); 395 osb->s_feature_incompat));
364 bg->bg_chain = cpu_to_le16(my_chain); 396 bg->bg_chain = cpu_to_le16(my_chain);
365 bg->bg_next_group = cl->cl_recs[my_chain].c_blkno; 397 bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
366 bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno); 398 bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
367 bg->bg_blkno = cpu_to_le64(group_blkno); 399 bg->bg_blkno = cpu_to_le64(group_blkno);
400 if (group_clusters == le16_to_cpu(cl->cl_cpg))
401 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
402 else
403 ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
404 group_clusters);
405
368 /* set the 1st bit in the bitmap to account for the descriptor block */ 406 /* set the 1st bit in the bitmap to account for the descriptor block */
369 ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap); 407 ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
370 bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1); 408 bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
371 409
372 status = ocfs2_journal_dirty(handle, bg_bh); 410 ocfs2_journal_dirty(handle, bg_bh);
373 if (status < 0)
374 mlog_errno(status);
375 411
376 /* There is no need to zero out or otherwise initialize the 412 /* There is no need to zero out or otherwise initialize the
377 * other blocks in a group - All valid FS metadata in a block 413 * other blocks in a group - All valid FS metadata in a block
@@ -397,6 +433,238 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
397 return best; 433 return best;
398} 434}
399 435
436static struct buffer_head *
437ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
438 struct inode *alloc_inode,
439 struct ocfs2_alloc_context *ac,
440 struct ocfs2_chain_list *cl)
441{
442 int status;
443 u32 bit_off, num_bits;
444 u64 bg_blkno;
445 struct buffer_head *bg_bh;
446 unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
447
448 status = ocfs2_claim_clusters(handle, ac,
449 le16_to_cpu(cl->cl_cpg), &bit_off,
450 &num_bits);
451 if (status < 0) {
452 if (status != -ENOSPC)
453 mlog_errno(status);
454 goto bail;
455 }
456
457 /* setup the group */
458 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
459 mlog(0, "new descriptor, record %u, at block %llu\n",
460 alloc_rec, (unsigned long long)bg_blkno);
461
462 bg_bh = sb_getblk(osb->sb, bg_blkno);
463 if (!bg_bh) {
464 status = -EIO;
465 mlog_errno(status);
466 goto bail;
467 }
468 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
469
470 status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
471 bg_blkno, num_bits, alloc_rec, cl);
472 if (status < 0) {
473 brelse(bg_bh);
474 mlog_errno(status);
475 }
476
477bail:
478 return status ? ERR_PTR(status) : bg_bh;
479}
480
481static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
482 handle_t *handle,
483 struct ocfs2_alloc_context *ac,
484 unsigned int min_bits,
485 u32 *bit_off, u32 *num_bits)
486{
487 int status = 0;
488
489 while (min_bits) {
490 status = ocfs2_claim_clusters(handle, ac, min_bits,
491 bit_off, num_bits);
492 if (status != -ENOSPC)
493 break;
494
495 min_bits >>= 1;
496 }
497
498 return status;
499}
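The loop above is a simple halving back-off: each -ENOSPC halves the request until a claim fits or min_bits reaches zero. A standalone sketch, where try_claim() is an invented stand-in for ocfs2_claim_clusters():

#include <stdio.h>

/* Invented stand-in: only runs of at most 'avail' clusters exist, so
 * larger requests fail. */
static int try_claim(unsigned want, unsigned avail, unsigned *got)
{
	if (want > avail)
		return -1;	/* stands in for -ENOSPC */
	*got = want;
	return 0;
}

int main(void)
{
	unsigned min_bits = 32, got = 0, avail = 5;
	int status = -1;

	/* Same shape as the kernel loop: halve the request on each miss. */
	while (min_bits) {
		status = try_claim(min_bits, avail, &got);
		if (status != -1)
			break;
		min_bits >>= 1;
	}
	printf("status=%d claimed=%u\n", status, got);	/* claims 4 */
	return 0;
}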
500
501static int ocfs2_block_group_grow_discontig(handle_t *handle,
502 struct inode *alloc_inode,
503 struct buffer_head *bg_bh,
504 struct ocfs2_alloc_context *ac,
505 struct ocfs2_chain_list *cl,
506 unsigned int min_bits)
507{
508 int status;
509 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
510 struct ocfs2_group_desc *bg =
511 (struct ocfs2_group_desc *)bg_bh->b_data;
512 unsigned int needed = le16_to_cpu(cl->cl_cpg) -
513 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
514 u32 p_cpos, clusters;
515 u64 p_blkno;
516 struct ocfs2_extent_list *el = &bg->bg_list;
517
518 status = ocfs2_journal_access_gd(handle,
519 INODE_CACHE(alloc_inode),
520 bg_bh,
521 OCFS2_JOURNAL_ACCESS_CREATE);
522 if (status < 0) {
523 mlog_errno(status);
524 goto bail;
525 }
526
527 while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
528 le16_to_cpu(el->l_count))) {
529 if (min_bits > needed)
530 min_bits = needed;
531 status = ocfs2_block_group_claim_bits(osb, handle, ac,
532 min_bits, &p_cpos,
533 &clusters);
534 if (status < 0) {
535 if (status != -ENOSPC)
536 mlog_errno(status);
537 goto bail;
538 }
539 p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
540 ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
541 clusters);
542
543 min_bits = clusters;
544 needed = le16_to_cpu(cl->cl_cpg) -
545 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
546 }
547
548 if (needed > 0) {
549 /*
 550 * We have used up all the extent records but still can't
 551 * fill up the cpg. So bail out.
552 */
553 status = -ENOSPC;
554 goto bail;
555 }
556
557 ocfs2_journal_dirty(handle, bg_bh);
558
559bail:
560 return status;
561}
562
563static void ocfs2_bg_alloc_cleanup(handle_t *handle,
564 struct ocfs2_alloc_context *cluster_ac,
565 struct inode *alloc_inode,
566 struct buffer_head *bg_bh)
567{
568 int i, ret;
569 struct ocfs2_group_desc *bg;
570 struct ocfs2_extent_list *el;
571 struct ocfs2_extent_rec *rec;
572
573 if (!bg_bh)
574 return;
575
576 bg = (struct ocfs2_group_desc *)bg_bh->b_data;
577 el = &bg->bg_list;
578 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
579 rec = &el->l_recs[i];
580 ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
581 cluster_ac->ac_bh,
582 le64_to_cpu(rec->e_blkno),
583 le32_to_cpu(rec->e_leaf_clusters));
584 if (ret)
585 mlog_errno(ret);
 586 /* Try to free all the clusters even if one fails */
587 }
588
589 ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
590 brelse(bg_bh);
591}
592
593static struct buffer_head *
594ocfs2_block_group_alloc_discontig(handle_t *handle,
595 struct inode *alloc_inode,
596 struct ocfs2_alloc_context *ac,
597 struct ocfs2_chain_list *cl)
598{
599 int status;
600 u32 bit_off, num_bits;
601 u64 bg_blkno;
602 unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
603 struct buffer_head *bg_bh = NULL;
604 unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
605 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
606
607 if (!ocfs2_supports_discontig_bg(osb)) {
608 status = -ENOSPC;
609 goto bail;
610 }
611
612 status = ocfs2_extend_trans(handle,
613 ocfs2_calc_bg_discontig_credits(osb->sb));
614 if (status) {
615 mlog_errno(status);
616 goto bail;
617 }
618
619 /*
620 * We're going to be grabbing from multiple cluster groups.
621 * We don't have enough credits to relink them all, and the
622 * cluster groups will be staying in cache for the duration of
623 * this operation.
624 */
625 ac->ac_allow_chain_relink = 0;
626
627 /* Claim the first region */
628 status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
629 &bit_off, &num_bits);
630 if (status < 0) {
631 if (status != -ENOSPC)
632 mlog_errno(status);
633 goto bail;
634 }
635 min_bits = num_bits;
636
637 /* setup the group */
638 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
639 mlog(0, "new descriptor, record %u, at block %llu\n",
640 alloc_rec, (unsigned long long)bg_blkno);
641
642 bg_bh = sb_getblk(osb->sb, bg_blkno);
643 if (!bg_bh) {
644 status = -EIO;
645 mlog_errno(status);
646 goto bail;
647 }
648 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
649
650 status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
651 bg_blkno, num_bits, alloc_rec, cl);
652 if (status < 0) {
653 mlog_errno(status);
654 goto bail;
655 }
656
657 status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
658 bg_bh, ac, cl, min_bits);
659 if (status)
660 mlog_errno(status);
661
662bail:
663 if (status)
664 ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
665 return status ? ERR_PTR(status) : bg_bh;
666}
667
400/* 668/*
401 * We expect the block group allocator to already be locked. 669 * We expect the block group allocator to already be locked.
402 */ 670 */
@@ -412,9 +680,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
412 struct ocfs2_chain_list *cl; 680 struct ocfs2_chain_list *cl;
413 struct ocfs2_alloc_context *ac = NULL; 681 struct ocfs2_alloc_context *ac = NULL;
414 handle_t *handle = NULL; 682 handle_t *handle = NULL;
415 u32 bit_off, num_bits;
416 u16 alloc_rec; 683 u16 alloc_rec;
417 u64 bg_blkno;
418 struct buffer_head *bg_bh = NULL; 684 struct buffer_head *bg_bh = NULL;
419 struct ocfs2_group_desc *bg; 685 struct ocfs2_group_desc *bg;
420 686
@@ -447,44 +713,20 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
447 (unsigned long long)*last_alloc_group); 713 (unsigned long long)*last_alloc_group);
448 ac->ac_last_group = *last_alloc_group; 714 ac->ac_last_group = *last_alloc_group;
449 } 715 }
450 status = ocfs2_claim_clusters(osb, 716
451 handle, 717 bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
452 ac, 718 ac, cl);
453 le16_to_cpu(cl->cl_cpg), 719 if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC))
454 &bit_off, 720 bg_bh = ocfs2_block_group_alloc_discontig(handle,
455 &num_bits); 721 alloc_inode,
456 if (status < 0) { 722 ac, cl);
723 if (IS_ERR(bg_bh)) {
724 status = PTR_ERR(bg_bh);
725 bg_bh = NULL;
457 if (status != -ENOSPC) 726 if (status != -ENOSPC)
458 mlog_errno(status); 727 mlog_errno(status);
459 goto bail; 728 goto bail;
460 } 729 }
461
462 alloc_rec = ocfs2_find_smallest_chain(cl);
463
464 /* setup the group */
465 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
466 mlog(0, "new descriptor, record %u, at block %llu\n",
467 alloc_rec, (unsigned long long)bg_blkno);
468
469 bg_bh = sb_getblk(osb->sb, bg_blkno);
470 if (!bg_bh) {
471 status = -EIO;
472 mlog_errno(status);
473 goto bail;
474 }
475 ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
476
477 status = ocfs2_block_group_fill(handle,
478 alloc_inode,
479 bg_bh,
480 bg_blkno,
481 alloc_rec,
482 cl);
483 if (status < 0) {
484 mlog_errno(status);
485 goto bail;
486 }
487
488 bg = (struct ocfs2_group_desc *) bg_bh->b_data; 730 bg = (struct ocfs2_group_desc *) bg_bh->b_data;
489 731
490 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 732 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
@@ -494,10 +736,12 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
494 goto bail; 736 goto bail;
495 } 737 }
496 738
739 alloc_rec = le16_to_cpu(bg->bg_chain);
497 le32_add_cpu(&cl->cl_recs[alloc_rec].c_free, 740 le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
498 le16_to_cpu(bg->bg_free_bits_count)); 741 le16_to_cpu(bg->bg_free_bits_count));
499 le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits)); 742 le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
500 cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg_blkno); 743 le16_to_cpu(bg->bg_bits));
744 cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg->bg_blkno);
501 if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count)) 745 if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
502 le16_add_cpu(&cl->cl_next_free_rec, 1); 746 le16_add_cpu(&cl->cl_next_free_rec, 1);
503 747
@@ -506,11 +750,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
506 le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits)); 750 le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
507 le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg)); 751 le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
508 752
509 status = ocfs2_journal_dirty(handle, bh); 753 ocfs2_journal_dirty(handle, bh);
510 if (status < 0) {
511 mlog_errno(status);
512 goto bail;
513 }
514 754
515 spin_lock(&OCFS2_I(alloc_inode)->ip_lock); 755 spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
516 OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 756 OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
@@ -760,7 +1000,7 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
760 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 1000 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
761 EXTENT_ALLOC_SYSTEM_INODE, 1001 EXTENT_ALLOC_SYSTEM_INODE,
762 (u32)osb->slot_num, NULL, 1002 (u32)osb->slot_num, NULL,
763 ALLOC_NEW_GROUP); 1003 ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);
764 1004
765 1005
766 if (status >= 0) { 1006 if (status >= 0) {
@@ -946,11 +1186,7 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
946 status = ocfs2_reserve_local_alloc_bits(osb, 1186 status = ocfs2_reserve_local_alloc_bits(osb,
947 bits_wanted, 1187 bits_wanted,
948 *ac); 1188 *ac);
949 if (status == -EFBIG) { 1189 if ((status < 0) && (status != -ENOSPC)) {
950 /* The local alloc window is outside ac_max_block.
951 * use the main bitmap. */
952 status = -ENOSPC;
953 } else if ((status < 0) && (status != -ENOSPC)) {
954 mlog_errno(status); 1190 mlog_errno(status);
955 goto bail; 1191 goto bail;
956 } 1192 }
@@ -1033,8 +1269,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1033 struct buffer_head *bg_bh, 1269 struct buffer_head *bg_bh,
1034 unsigned int bits_wanted, 1270 unsigned int bits_wanted,
1035 unsigned int total_bits, 1271 unsigned int total_bits,
1036 u16 *bit_off, 1272 struct ocfs2_suballoc_result *res)
1037 u16 *bits_found)
1038{ 1273{
1039 void *bitmap; 1274 void *bitmap;
1040 u16 best_offset, best_size; 1275 u16 best_offset, best_size;
@@ -1078,14 +1313,9 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1078 } 1313 }
1079 } 1314 }
1080 1315
1081 /* XXX: I think the first clause is equivalent to the second 1316 if (best_size) {
1082 * - jlbec */ 1317 res->sr_bit_offset = best_offset;
1083 if (found == bits_wanted) { 1318 res->sr_bits = best_size;
1084 *bit_off = start - found;
1085 *bits_found = found;
1086 } else if (best_size) {
1087 *bit_off = best_offset;
1088 *bits_found = best_size;
1089 } else { 1319 } else {
1090 status = -ENOSPC; 1320 status = -ENOSPC;
1091 /* No error log here -- see the comment above 1321 /* No error log here -- see the comment above
@@ -1129,16 +1359,10 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
1129 } 1359 }
1130 1360
1131 le16_add_cpu(&bg->bg_free_bits_count, -num_bits); 1361 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1132
1133 while(num_bits--) 1362 while(num_bits--)
1134 ocfs2_set_bit(bit_off++, bitmap); 1363 ocfs2_set_bit(bit_off++, bitmap);
1135 1364
1136 status = ocfs2_journal_dirty(handle, 1365 ocfs2_journal_dirty(handle, group_bh);
1137 group_bh);
1138 if (status < 0) {
1139 mlog_errno(status);
1140 goto bail;
1141 }
1142 1366
1143bail: 1367bail:
1144 mlog_exit(status); 1368 mlog_exit(status);
@@ -1202,12 +1426,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
1202 } 1426 }
1203 1427
1204 prev_bg->bg_next_group = bg->bg_next_group; 1428 prev_bg->bg_next_group = bg->bg_next_group;
1205 1429 ocfs2_journal_dirty(handle, prev_bg_bh);
1206 status = ocfs2_journal_dirty(handle, prev_bg_bh);
1207 if (status < 0) {
1208 mlog_errno(status);
1209 goto out_rollback;
1210 }
1211 1430
1212 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 1431 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1213 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); 1432 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1217,12 +1436,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
1217 } 1436 }
1218 1437
1219 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; 1438 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
1220 1439 ocfs2_journal_dirty(handle, bg_bh);
1221 status = ocfs2_journal_dirty(handle, bg_bh);
1222 if (status < 0) {
1223 mlog_errno(status);
1224 goto out_rollback;
1225 }
1226 1440
1227 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), 1441 status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1228 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); 1442 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1232,14 +1446,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
1232 } 1446 }
1233 1447
1234 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; 1448 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
1449 ocfs2_journal_dirty(handle, fe_bh);
1235 1450
1236 status = ocfs2_journal_dirty(handle, fe_bh);
1237 if (status < 0) {
1238 mlog_errno(status);
1239 goto out_rollback;
1240 }
1241
1242 status = 0;
1243out_rollback: 1451out_rollback:
1244 if (status < 0) { 1452 if (status < 0) {
1245 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr); 1453 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
@@ -1263,14 +1471,13 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1263 struct buffer_head *group_bh, 1471 struct buffer_head *group_bh,
1264 u32 bits_wanted, u32 min_bits, 1472 u32 bits_wanted, u32 min_bits,
1265 u64 max_block, 1473 u64 max_block,
1266 u16 *bit_off, u16 *bits_found) 1474 struct ocfs2_suballoc_result *res)
1267{ 1475{
1268 int search = -ENOSPC; 1476 int search = -ENOSPC;
1269 int ret; 1477 int ret;
1270 u64 blkoff; 1478 u64 blkoff;
1271 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data; 1479 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1272 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1480 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1273 u16 tmp_off, tmp_found;
1274 unsigned int max_bits, gd_cluster_off; 1481 unsigned int max_bits, gd_cluster_off;
1275 1482
1276 BUG_ON(!ocfs2_is_cluster_bitmap(inode)); 1483 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
@@ -1297,15 +1504,15 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1297 1504
1298 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), 1505 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1299 group_bh, bits_wanted, 1506 group_bh, bits_wanted,
1300 max_bits, 1507 max_bits, res);
1301 &tmp_off, &tmp_found);
1302 if (ret) 1508 if (ret)
1303 return ret; 1509 return ret;
1304 1510
1305 if (max_block) { 1511 if (max_block) {
1306 blkoff = ocfs2_clusters_to_blocks(inode->i_sb, 1512 blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1307 gd_cluster_off + 1513 gd_cluster_off +
1308 tmp_off + tmp_found); 1514 res->sr_bit_offset +
1515 res->sr_bits);
1309 mlog(0, "Checking %llu against %llu\n", 1516 mlog(0, "Checking %llu against %llu\n",
1310 (unsigned long long)blkoff, 1517 (unsigned long long)blkoff,
1311 (unsigned long long)max_block); 1518 (unsigned long long)max_block);
@@ -1317,16 +1524,14 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1317 * return success, but we still want to return 1524 * return success, but we still want to return
1318 * -ENOSPC unless it found the minimum number 1525 * -ENOSPC unless it found the minimum number
1319 * of bits. */ 1526 * of bits. */
1320 if (min_bits <= tmp_found) { 1527 if (min_bits <= res->sr_bits)
1321 *bit_off = tmp_off;
1322 *bits_found = tmp_found;
1323 search = 0; /* success */ 1528 search = 0; /* success */
1324 } else if (tmp_found) { 1529 else if (res->sr_bits) {
1325 /* 1530 /*
1326 * Don't show bits which we'll be returning 1531 * Don't show bits which we'll be returning
1327 * for allocation to the local alloc bitmap. 1532 * for allocation to the local alloc bitmap.
1328 */ 1533 */
1329 ocfs2_local_alloc_seen_free_bits(osb, tmp_found); 1534 ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits);
1330 } 1535 }
1331 } 1536 }
1332 1537
@@ -1337,7 +1542,7 @@ static int ocfs2_block_group_search(struct inode *inode,
1337 struct buffer_head *group_bh, 1542 struct buffer_head *group_bh,
1338 u32 bits_wanted, u32 min_bits, 1543 u32 bits_wanted, u32 min_bits,
1339 u64 max_block, 1544 u64 max_block,
1340 u16 *bit_off, u16 *bits_found) 1545 struct ocfs2_suballoc_result *res)
1341{ 1546{
1342 int ret = -ENOSPC; 1547 int ret = -ENOSPC;
1343 u64 blkoff; 1548 u64 blkoff;
@@ -1350,10 +1555,10 @@ static int ocfs2_block_group_search(struct inode *inode,
1350 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), 1555 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1351 group_bh, bits_wanted, 1556 group_bh, bits_wanted,
1352 le16_to_cpu(bg->bg_bits), 1557 le16_to_cpu(bg->bg_bits),
1353 bit_off, bits_found); 1558 res);
1354 if (!ret && max_block) { 1559 if (!ret && max_block) {
1355 blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off + 1560 blkoff = le64_to_cpu(bg->bg_blkno) +
1356 *bits_found; 1561 res->sr_bit_offset + res->sr_bits;
1357 mlog(0, "Checking %llu against %llu\n", 1562 mlog(0, "Checking %llu against %llu\n",
1358 (unsigned long long)blkoff, 1563 (unsigned long long)blkoff,
1359 (unsigned long long)max_block); 1564 (unsigned long long)max_block);
@@ -1386,33 +1591,76 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1386 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); 1591 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1387 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); 1592 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1388 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); 1593 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1389 1594 ocfs2_journal_dirty(handle, di_bh);
1390 ret = ocfs2_journal_dirty(handle, di_bh);
1391 if (ret < 0)
1392 mlog_errno(ret);
1393 1595
1394out: 1596out:
1395 return ret; 1597 return ret;
1396} 1598}
1397 1599
1600static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
1601 struct ocfs2_extent_rec *rec,
1602 struct ocfs2_chain_list *cl)
1603{
1604 unsigned int bpc = le16_to_cpu(cl->cl_bpc);
1605 unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
1606 unsigned int bitcount = le32_to_cpu(rec->e_leaf_clusters) * bpc;
1607
1608 if (res->sr_bit_offset < bitoff)
1609 return 0;
1610 if (res->sr_bit_offset >= (bitoff + bitcount))
1611 return 0;
1612 res->sr_blkno = le64_to_cpu(rec->e_blkno) +
1613 (res->sr_bit_offset - bitoff);
1614 if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
1615 res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
1616 return 1;
1617}
1618
1619static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
1620 struct ocfs2_group_desc *bg,
1621 struct ocfs2_suballoc_result *res)
1622{
1623 int i;
1624 u64 bg_blkno = res->sr_bg_blkno; /* Save off */
1625 struct ocfs2_extent_rec *rec;
1626 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1627 struct ocfs2_chain_list *cl = &di->id2.i_chain;
1628
1629 if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
1630 res->sr_blkno = 0;
1631 return;
1632 }
1633
1634 res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
1635 res->sr_bg_blkno = 0; /* Clear it for contig block groups */
1636 if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
1637 !bg->bg_list.l_next_free_rec)
1638 return;
1639
1640 for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
1641 rec = &bg->bg_list.l_recs[i];
1642 if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
1643 res->sr_bg_blkno = bg_blkno; /* Restore */
1644 break;
1645 }
1646 }
1647}
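The two helpers above carry the core of the discontiguous-group support. A sketch of the mapping ocfs2_bg_discontig_fix_by_rec() performs, using the names from the hunk (illustrative restatement, not a verbatim excerpt):

    /*
     * A hit at sr_bit_offset inside an extent record covering bits
     * [bitoff, bitoff + bitcount) of the group maps to disk as:
     *
     *     sr_blkno = le64_to_cpu(rec->e_blkno) + (sr_bit_offset - bitoff);
     *
     * and the claim is clipped so it never crosses the extent boundary:
     *
     *     if (sr_bit_offset + sr_bits > bitoff + bitcount)
     *             sr_bits = (bitoff + bitcount) - sr_bit_offset;
     */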
1648
1398static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, 1649static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1399 handle_t *handle, 1650 handle_t *handle,
1400 u32 bits_wanted, 1651 u32 bits_wanted,
1401 u32 min_bits, 1652 u32 min_bits,
1402 u16 *bit_off, 1653 struct ocfs2_suballoc_result *res,
1403 unsigned int *num_bits,
1404 u64 gd_blkno,
1405 u16 *bits_left) 1654 u16 *bits_left)
1406{ 1655{
1407 int ret; 1656 int ret;
1408 u16 found;
1409 struct buffer_head *group_bh = NULL; 1657 struct buffer_head *group_bh = NULL;
1410 struct ocfs2_group_desc *gd; 1658 struct ocfs2_group_desc *gd;
1411 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data; 1659 struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1412 struct inode *alloc_inode = ac->ac_inode; 1660 struct inode *alloc_inode = ac->ac_inode;
1413 1661
1414 ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno, 1662 ret = ocfs2_read_group_descriptor(alloc_inode, di,
1415 &group_bh); 1663 res->sr_bg_blkno, &group_bh);
1416 if (ret < 0) { 1664 if (ret < 0) {
1417 mlog_errno(ret); 1665 mlog_errno(ret);
1418 return ret; 1666 return ret;
@@ -1420,17 +1668,18 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1420 1668
1421 gd = (struct ocfs2_group_desc *) group_bh->b_data; 1669 gd = (struct ocfs2_group_desc *) group_bh->b_data;
1422 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, 1670 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1423 ac->ac_max_block, bit_off, &found); 1671 ac->ac_max_block, res);
1424 if (ret < 0) { 1672 if (ret < 0) {
1425 if (ret != -ENOSPC) 1673 if (ret != -ENOSPC)
1426 mlog_errno(ret); 1674 mlog_errno(ret);
1427 goto out; 1675 goto out;
1428 } 1676 }
1429 1677
1430 *num_bits = found; 1678 if (!ret)
1679 ocfs2_bg_discontig_fix_result(ac, gd, res);
1431 1680
1432 ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, 1681 ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1433 *num_bits, 1682 res->sr_bits,
1434 le16_to_cpu(gd->bg_chain)); 1683 le16_to_cpu(gd->bg_chain));
1435 if (ret < 0) { 1684 if (ret < 0) {
1436 mlog_errno(ret); 1685 mlog_errno(ret);
@@ -1438,7 +1687,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1438 } 1687 }
1439 1688
1440 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, 1689 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1441 *bit_off, *num_bits); 1690 res->sr_bit_offset, res->sr_bits);
1442 if (ret < 0) 1691 if (ret < 0)
1443 mlog_errno(ret); 1692 mlog_errno(ret);
1444 1693
@@ -1454,13 +1703,11 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1454 handle_t *handle, 1703 handle_t *handle,
1455 u32 bits_wanted, 1704 u32 bits_wanted,
1456 u32 min_bits, 1705 u32 min_bits,
1457 u16 *bit_off, 1706 struct ocfs2_suballoc_result *res,
1458 unsigned int *num_bits,
1459 u64 *bg_blkno,
1460 u16 *bits_left) 1707 u16 *bits_left)
1461{ 1708{
1462 int status; 1709 int status;
1463 u16 chain, tmp_bits; 1710 u16 chain;
1464 u32 tmp_used; 1711 u32 tmp_used;
1465 u64 next_group; 1712 u64 next_group;
1466 struct inode *alloc_inode = ac->ac_inode; 1713 struct inode *alloc_inode = ac->ac_inode;
@@ -1489,8 +1736,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1489 * the 1st group with any empty bits. */ 1736 * the 1st group with any empty bits. */
1490 while ((status = ac->ac_group_search(alloc_inode, group_bh, 1737 while ((status = ac->ac_group_search(alloc_inode, group_bh,
1491 bits_wanted, min_bits, 1738 bits_wanted, min_bits,
1492 ac->ac_max_block, bit_off, 1739 ac->ac_max_block,
1493 &tmp_bits)) == -ENOSPC) { 1740 res)) == -ENOSPC) {
1494 if (!bg->bg_next_group) 1741 if (!bg->bg_next_group)
1495 break; 1742 break;
1496 1743
@@ -1515,11 +1762,14 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1515 } 1762 }
1516 1763
1517 mlog(0, "alloc succeeds: we give %u bits from block group %llu\n", 1764 mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
1518 tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno)); 1765 res->sr_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
1519 1766
1520 *num_bits = tmp_bits; 1767 res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);
1768
1769 BUG_ON(res->sr_bits == 0);
1770 if (!status)
1771 ocfs2_bg_discontig_fix_result(ac, bg, res);
1521 1772
1522 BUG_ON(*num_bits == 0);
1523 1773
1524 /* 1774 /*
1525 * Keep track of previous block descriptor read. When 1775 * Keep track of previous block descriptor read. When
@@ -1536,7 +1786,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1536 */ 1786 */
1537 if (ac->ac_allow_chain_relink && 1787 if (ac->ac_allow_chain_relink &&
1538 (prev_group_bh) && 1788 (prev_group_bh) &&
1539 (ocfs2_block_group_reasonably_empty(bg, *num_bits))) { 1789 (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
1540 status = ocfs2_relink_block_group(handle, alloc_inode, 1790 status = ocfs2_relink_block_group(handle, alloc_inode,
1541 ac->ac_bh, group_bh, 1791 ac->ac_bh, group_bh,
1542 prev_group_bh, chain); 1792 prev_group_bh, chain);
@@ -1558,31 +1808,24 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1558 } 1808 }
1559 1809
1560 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); 1810 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1561 fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used); 1811 fe->id1.bitmap1.i_used = cpu_to_le32(res->sr_bits + tmp_used);
1562 le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits)); 1812 le32_add_cpu(&cl->cl_recs[chain].c_free, -res->sr_bits);
1563 1813 ocfs2_journal_dirty(handle, ac->ac_bh);
1564 status = ocfs2_journal_dirty(handle,
1565 ac->ac_bh);
1566 if (status < 0) {
1567 mlog_errno(status);
1568 goto bail;
1569 }
1570 1814
1571 status = ocfs2_block_group_set_bits(handle, 1815 status = ocfs2_block_group_set_bits(handle,
1572 alloc_inode, 1816 alloc_inode,
1573 bg, 1817 bg,
1574 group_bh, 1818 group_bh,
1575 *bit_off, 1819 res->sr_bit_offset,
1576 *num_bits); 1820 res->sr_bits);
1577 if (status < 0) { 1821 if (status < 0) {
1578 mlog_errno(status); 1822 mlog_errno(status);
1579 goto bail; 1823 goto bail;
1580 } 1824 }
1581 1825
1582 mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits, 1826 mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
1583 (unsigned long long)le64_to_cpu(fe->i_blkno)); 1827 (unsigned long long)le64_to_cpu(fe->i_blkno));
1584 1828
1585 *bg_blkno = le64_to_cpu(bg->bg_blkno);
1586 *bits_left = le16_to_cpu(bg->bg_free_bits_count); 1829 *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1587bail: 1830bail:
1588 brelse(group_bh); 1831 brelse(group_bh);
@@ -1593,19 +1836,15 @@ bail:
1593} 1836}
1594 1837
1595/* will give out up to bits_wanted contiguous bits. */ 1838/* will give out up to bits_wanted contiguous bits. */
1596static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, 1839static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1597 struct ocfs2_alloc_context *ac,
1598 handle_t *handle, 1840 handle_t *handle,
1599 u32 bits_wanted, 1841 u32 bits_wanted,
1600 u32 min_bits, 1842 u32 min_bits,
1601 u16 *bit_off, 1843 struct ocfs2_suballoc_result *res)
1602 unsigned int *num_bits,
1603 u64 *bg_blkno)
1604{ 1844{
1605 int status; 1845 int status;
1606 u16 victim, i; 1846 u16 victim, i;
1607 u16 bits_left = 0; 1847 u16 bits_left = 0;
1608 u64 hint_blkno = ac->ac_last_group;
1609 struct ocfs2_chain_list *cl; 1848 struct ocfs2_chain_list *cl;
1610 struct ocfs2_dinode *fe; 1849 struct ocfs2_dinode *fe;
1611 1850
@@ -1623,7 +1862,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1623 1862
1624 if (le32_to_cpu(fe->id1.bitmap1.i_used) >= 1863 if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1625 le32_to_cpu(fe->id1.bitmap1.i_total)) { 1864 le32_to_cpu(fe->id1.bitmap1.i_total)) {
1626 ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used " 1865 ocfs2_error(ac->ac_inode->i_sb,
1866 "Chain allocator dinode %llu has %u used "
1627 "bits but only %u total.", 1867 "bits but only %u total.",
1628 (unsigned long long)le64_to_cpu(fe->i_blkno), 1868 (unsigned long long)le64_to_cpu(fe->i_blkno),
1629 le32_to_cpu(fe->id1.bitmap1.i_used), 1869 le32_to_cpu(fe->id1.bitmap1.i_used),
@@ -1632,22 +1872,16 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1632 goto bail; 1872 goto bail;
1633 } 1873 }
1634 1874
1635 if (hint_blkno) { 1875 res->sr_bg_blkno = ac->ac_last_group;
1876 if (res->sr_bg_blkno) {
1636 /* Attempt to short-circuit the usual search mechanism 1877 /* Attempt to short-circuit the usual search mechanism
1637 * by jumping straight to the most recently used 1878 * by jumping straight to the most recently used
1638 * allocation group. This helps us maintain some 1879 * allocation group. This helps us maintain some
1639 * contiguousness across allocations. */ 1880 * contiguousness across allocations. */
1640 status = ocfs2_search_one_group(ac, handle, bits_wanted, 1881 status = ocfs2_search_one_group(ac, handle, bits_wanted,
1641 min_bits, bit_off, num_bits, 1882 min_bits, res, &bits_left);
1642 hint_blkno, &bits_left); 1883 if (!status)
1643 if (!status) {
1644 /* Be careful to update *bg_blkno here as the
1645 * caller is expecting it to be filled in, and
1646 * ocfs2_search_one_group() won't do that for
1647 * us. */
1648 *bg_blkno = hint_blkno;
1649 goto set_hint; 1884 goto set_hint;
1650 }
1651 if (status < 0 && status != -ENOSPC) { 1885 if (status < 0 && status != -ENOSPC) {
1652 mlog_errno(status); 1886 mlog_errno(status);
1653 goto bail; 1887 goto bail;
@@ -1660,8 +1894,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1660 ac->ac_chain = victim; 1894 ac->ac_chain = victim;
1661 ac->ac_allow_chain_relink = 1; 1895 ac->ac_allow_chain_relink = 1;
1662 1896
1663 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, bit_off, 1897 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1664 num_bits, bg_blkno, &bits_left); 1898 res, &bits_left);
1665 if (!status) 1899 if (!status)
1666 goto set_hint; 1900 goto set_hint;
1667 if (status < 0 && status != -ENOSPC) { 1901 if (status < 0 && status != -ENOSPC) {
@@ -1685,8 +1919,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1685 1919
1686 ac->ac_chain = i; 1920 ac->ac_chain = i;
1687 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, 1921 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1688 bit_off, num_bits, bg_blkno, 1922 res, &bits_left);
1689 &bits_left);
1690 if (!status) 1923 if (!status)
1691 break; 1924 break;
1692 if (status < 0 && status != -ENOSPC) { 1925 if (status < 0 && status != -ENOSPC) {
@@ -1703,7 +1936,7 @@ set_hint:
1703 if (bits_left < min_bits) 1936 if (bits_left < min_bits)
1704 ac->ac_last_group = 0; 1937 ac->ac_last_group = 0;
1705 else 1938 else
1706 ac->ac_last_group = *bg_blkno; 1939 ac->ac_last_group = res->sr_bg_blkno;
1707 } 1940 }
1708 1941
1709bail: 1942bail:
@@ -1711,37 +1944,37 @@ bail:
1711 return status; 1944 return status;
1712} 1945}
1713 1946
1714int ocfs2_claim_metadata(struct ocfs2_super *osb, 1947int ocfs2_claim_metadata(handle_t *handle,
1715 handle_t *handle,
1716 struct ocfs2_alloc_context *ac, 1948 struct ocfs2_alloc_context *ac,
1717 u32 bits_wanted, 1949 u32 bits_wanted,
1950 u64 *suballoc_loc,
1718 u16 *suballoc_bit_start, 1951 u16 *suballoc_bit_start,
1719 unsigned int *num_bits, 1952 unsigned int *num_bits,
1720 u64 *blkno_start) 1953 u64 *blkno_start)
1721{ 1954{
1722 int status; 1955 int status;
1723 u64 bg_blkno; 1956 struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
1724 1957
1725 BUG_ON(!ac); 1958 BUG_ON(!ac);
1726 BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted)); 1959 BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1727 BUG_ON(ac->ac_which != OCFS2_AC_USE_META); 1960 BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1728 1961
1729 status = ocfs2_claim_suballoc_bits(osb, 1962 status = ocfs2_claim_suballoc_bits(ac,
1730 ac,
1731 handle, 1963 handle,
1732 bits_wanted, 1964 bits_wanted,
1733 1, 1965 1,
1734 suballoc_bit_start, 1966 &res);
1735 num_bits,
1736 &bg_blkno);
1737 if (status < 0) { 1967 if (status < 0) {
1738 mlog_errno(status); 1968 mlog_errno(status);
1739 goto bail; 1969 goto bail;
1740 } 1970 }
1741 atomic_inc(&osb->alloc_stats.bg_allocs); 1971 atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
1742 1972
1743 *blkno_start = bg_blkno + (u64) *suballoc_bit_start; 1973 *suballoc_loc = res.sr_bg_blkno;
1744 ac->ac_bits_given += (*num_bits); 1974 *suballoc_bit_start = res.sr_bit_offset;
1975 *blkno_start = res.sr_blkno;
1976 ac->ac_bits_given += res.sr_bits;
1977 *num_bits = res.sr_bits;
1745 status = 0; 1978 status = 0;
1746bail: 1979bail:
1747 mlog_exit(status); 1980 mlog_exit(status);
@@ -1749,10 +1982,10 @@ bail:
1749} 1982}
1750 1983
1751static void ocfs2_init_inode_ac_group(struct inode *dir, 1984static void ocfs2_init_inode_ac_group(struct inode *dir,
1752 struct buffer_head *parent_fe_bh, 1985 struct buffer_head *parent_di_bh,
1753 struct ocfs2_alloc_context *ac) 1986 struct ocfs2_alloc_context *ac)
1754{ 1987{
1755 struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data; 1988 struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data;
1756 /* 1989 /*
1757 * Try to allocate inodes from some specific group. 1990 * Try to allocate inodes from some specific group.
1758 * 1991 *
@@ -1766,10 +1999,14 @@ static void ocfs2_init_inode_ac_group(struct inode *dir,
1766 if (OCFS2_I(dir)->ip_last_used_group && 1999 if (OCFS2_I(dir)->ip_last_used_group &&
1767 OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot) 2000 OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
1768 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group; 2001 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
1769 else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot) 2002 else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) {
1770 ac->ac_last_group = ocfs2_which_suballoc_group( 2003 if (di->i_suballoc_loc)
1771 le64_to_cpu(fe->i_blkno), 2004 ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc);
1772 le16_to_cpu(fe->i_suballoc_bit)); 2005 else
2006 ac->ac_last_group = ocfs2_which_suballoc_group(
2007 le64_to_cpu(di->i_blkno),
2008 le16_to_cpu(di->i_suballoc_bit));
2009 }
1773} 2010}
1774 2011
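The hint logic above introduces a lookup pattern that recurs in ocfs2_free_dinode(), ocfs2_test_suballoc_bit() and ocfs2_xattr_free_block() later in this patch: when the new i_suballoc_loc field is set, it names the block group directly; otherwise the group is derived from the block number as before. Sketch:

    /* Recurring lookup pattern (sketch): */
    if (di->i_suballoc_loc)
            bg_blkno = le64_to_cpu(di->i_suballoc_loc);
    else
            bg_blkno = ocfs2_which_suballoc_group(
                            le64_to_cpu(di->i_blkno),
                            le16_to_cpu(di->i_suballoc_bit));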
1775static inline void ocfs2_save_inode_ac_group(struct inode *dir, 2012static inline void ocfs2_save_inode_ac_group(struct inode *dir,
@@ -1779,17 +2016,16 @@ static inline void ocfs2_save_inode_ac_group(struct inode *dir,
1779 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot; 2016 OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
1780} 2017}
1781 2018
1782int ocfs2_claim_new_inode(struct ocfs2_super *osb, 2019int ocfs2_claim_new_inode(handle_t *handle,
1783 handle_t *handle,
1784 struct inode *dir, 2020 struct inode *dir,
1785 struct buffer_head *parent_fe_bh, 2021 struct buffer_head *parent_fe_bh,
1786 struct ocfs2_alloc_context *ac, 2022 struct ocfs2_alloc_context *ac,
2023 u64 *suballoc_loc,
1787 u16 *suballoc_bit, 2024 u16 *suballoc_bit,
1788 u64 *fe_blkno) 2025 u64 *fe_blkno)
1789{ 2026{
1790 int status; 2027 int status;
1791 unsigned int num_bits; 2028 struct ocfs2_suballoc_result res;
1792 u64 bg_blkno;
1793 2029
1794 mlog_entry_void(); 2030 mlog_entry_void();
1795 2031
@@ -1800,23 +2036,22 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1800 2036
1801 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); 2037 ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
1802 2038
1803 status = ocfs2_claim_suballoc_bits(osb, 2039 status = ocfs2_claim_suballoc_bits(ac,
1804 ac,
1805 handle, 2040 handle,
1806 1, 2041 1,
1807 1, 2042 1,
1808 suballoc_bit, 2043 &res);
1809 &num_bits,
1810 &bg_blkno);
1811 if (status < 0) { 2044 if (status < 0) {
1812 mlog_errno(status); 2045 mlog_errno(status);
1813 goto bail; 2046 goto bail;
1814 } 2047 }
1815 atomic_inc(&osb->alloc_stats.bg_allocs); 2048 atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
1816 2049
1817 BUG_ON(num_bits != 1); 2050 BUG_ON(res.sr_bits != 1);
1818 2051
1819 *fe_blkno = bg_blkno + (u64) (*suballoc_bit); 2052 *suballoc_loc = res.sr_bg_blkno;
2053 *suballoc_bit = res.sr_bit_offset;
2054 *fe_blkno = res.sr_blkno;
1820 ac->ac_bits_given++; 2055 ac->ac_bits_given++;
1821 ocfs2_save_inode_ac_group(dir, ac); 2056 ocfs2_save_inode_ac_group(dir, ac);
1822 status = 0; 2057 status = 0;
@@ -1886,8 +2121,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
1886 * contig. allocation, set to '1' to indicate we can deal with extents 2121 * contig. allocation, set to '1' to indicate we can deal with extents
1887 * of any size. 2122 * of any size.
1888 */ 2123 */
1889int __ocfs2_claim_clusters(struct ocfs2_super *osb, 2124int __ocfs2_claim_clusters(handle_t *handle,
1890 handle_t *handle,
1891 struct ocfs2_alloc_context *ac, 2125 struct ocfs2_alloc_context *ac,
1892 u32 min_clusters, 2126 u32 min_clusters,
1893 u32 max_clusters, 2127 u32 max_clusters,
@@ -1896,8 +2130,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1896{ 2130{
1897 int status; 2131 int status;
1898 unsigned int bits_wanted = max_clusters; 2132 unsigned int bits_wanted = max_clusters;
1899 u64 bg_blkno = 0; 2133 struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
1900 u16 bg_bit_off; 2134 struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
1901 2135
1902 mlog_entry_void(); 2136 mlog_entry_void();
1903 2137
@@ -1907,6 +2141,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1907 && ac->ac_which != OCFS2_AC_USE_MAIN); 2141 && ac->ac_which != OCFS2_AC_USE_MAIN);
1908 2142
1909 if (ac->ac_which == OCFS2_AC_USE_LOCAL) { 2143 if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
2144 WARN_ON(min_clusters > 1);
2145
1910 status = ocfs2_claim_local_alloc_bits(osb, 2146 status = ocfs2_claim_local_alloc_bits(osb,
1911 handle, 2147 handle,
1912 ac, 2148 ac,
@@ -1929,20 +2165,19 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1929 if (bits_wanted > (osb->bitmap_cpg - 1)) 2165 if (bits_wanted > (osb->bitmap_cpg - 1))
1930 bits_wanted = osb->bitmap_cpg - 1; 2166 bits_wanted = osb->bitmap_cpg - 1;
1931 2167
1932 status = ocfs2_claim_suballoc_bits(osb, 2168 status = ocfs2_claim_suballoc_bits(ac,
1933 ac,
1934 handle, 2169 handle,
1935 bits_wanted, 2170 bits_wanted,
1936 min_clusters, 2171 min_clusters,
1937 &bg_bit_off, 2172 &res);
1938 num_clusters,
1939 &bg_blkno);
1940 if (!status) { 2173 if (!status) {
2174 BUG_ON(res.sr_blkno); /* cluster alloc can't set */
1941 *cluster_start = 2175 *cluster_start =
1942 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode, 2176 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
1943 bg_blkno, 2177 res.sr_bg_blkno,
1944 bg_bit_off); 2178 res.sr_bit_offset);
1945 atomic_inc(&osb->alloc_stats.bitmap_data); 2179 atomic_inc(&osb->alloc_stats.bitmap_data);
2180 *num_clusters = res.sr_bits;
1946 } 2181 }
1947 } 2182 }
1948 if (status < 0) { 2183 if (status < 0) {
@@ -1958,8 +2193,7 @@ bail:
1958 return status; 2193 return status;
1959} 2194}
1960 2195
1961int ocfs2_claim_clusters(struct ocfs2_super *osb, 2196int ocfs2_claim_clusters(handle_t *handle,
1962 handle_t *handle,
1963 struct ocfs2_alloc_context *ac, 2197 struct ocfs2_alloc_context *ac,
1964 u32 min_clusters, 2198 u32 min_clusters,
1965 u32 *cluster_start, 2199 u32 *cluster_start,
@@ -1967,7 +2201,7 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
1967{ 2201{
1968 unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; 2202 unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
1969 2203
1970 return __ocfs2_claim_clusters(osb, handle, ac, min_clusters, 2204 return __ocfs2_claim_clusters(handle, ac, min_clusters,
1971 bits_wanted, cluster_start, num_clusters); 2205 bits_wanted, cluster_start, num_clusters);
1972} 2206}
1973 2207
@@ -2023,9 +2257,7 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
2023 if (undo_fn) 2257 if (undo_fn)
2024 jbd_unlock_bh_state(group_bh); 2258 jbd_unlock_bh_state(group_bh);
2025 2259
2026 status = ocfs2_journal_dirty(handle, group_bh); 2260 ocfs2_journal_dirty(handle, group_bh);
2027 if (status < 0)
2028 mlog_errno(status);
2029bail: 2261bail:
2030 return status; 2262 return status;
2031} 2263}
@@ -2092,12 +2324,7 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle,
2092 count); 2324 count);
2093 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); 2325 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
2094 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); 2326 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
2095 2327 ocfs2_journal_dirty(handle, alloc_bh);
2096 status = ocfs2_journal_dirty(handle, alloc_bh);
2097 if (status < 0) {
2098 mlog_errno(status);
2099 goto bail;
2100 }
2101 2328
2102bail: 2329bail:
2103 brelse(group_bh); 2330 brelse(group_bh);
@@ -2126,6 +2353,8 @@ int ocfs2_free_dinode(handle_t *handle,
2126 u16 bit = le16_to_cpu(di->i_suballoc_bit); 2353 u16 bit = le16_to_cpu(di->i_suballoc_bit);
2127 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 2354 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2128 2355
2356 if (di->i_suballoc_loc)
2357 bg_blkno = le64_to_cpu(di->i_suballoc_loc);
2129 return ocfs2_free_suballoc_bits(handle, inode_alloc_inode, 2358 return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
2130 inode_alloc_bh, bit, bg_blkno, 1); 2359 inode_alloc_bh, bit, bg_blkno, 1);
2131} 2360}
@@ -2395,7 +2624,7 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2395 struct buffer_head *alloc_bh, u64 blkno, 2624 struct buffer_head *alloc_bh, u64 blkno,
2396 u16 bit, int *res) 2625 u16 bit, int *res)
2397{ 2626{
2398 struct ocfs2_dinode *alloc_fe; 2627 struct ocfs2_dinode *alloc_di;
2399 struct ocfs2_group_desc *group; 2628 struct ocfs2_group_desc *group;
2400 struct buffer_head *group_bh = NULL; 2629 struct buffer_head *group_bh = NULL;
2401 u64 bg_blkno; 2630 u64 bg_blkno;
@@ -2404,17 +2633,20 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2404 mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno, 2633 mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno,
2405 (unsigned int)bit); 2634 (unsigned int)bit);
2406 2635
2407 alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data; 2636 alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
2408 if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) { 2637 if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
2409 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n", 2638 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2410 (unsigned int)bit, 2639 (unsigned int)bit,
2411 ocfs2_bits_per_group(&alloc_fe->id2.i_chain)); 2640 ocfs2_bits_per_group(&alloc_di->id2.i_chain));
2412 status = -EINVAL; 2641 status = -EINVAL;
2413 goto bail; 2642 goto bail;
2414 } 2643 }
2415 2644
2416 bg_blkno = ocfs2_which_suballoc_group(blkno, bit); 2645 if (alloc_di->i_suballoc_loc)
2417 status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno, 2646 bg_blkno = le64_to_cpu(alloc_di->i_suballoc_loc);
2647 else
2648 bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
2649 status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
2418 &group_bh); 2650 &group_bh);
2419 if (status < 0) { 2651 if (status < 0) {
2420 mlog(ML_ERROR, "read group %llu failed %d\n", 2652 mlog(ML_ERROR, "read group %llu failed %d\n",
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index e0f46df357e6..a017dd3ee7d9 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -26,13 +26,14 @@
26#ifndef _CHAINALLOC_H_ 26#ifndef _CHAINALLOC_H_
27#define _CHAINALLOC_H_ 27#define _CHAINALLOC_H_
28 28
29struct ocfs2_suballoc_result;
29typedef int (group_search_t)(struct inode *, 30typedef int (group_search_t)(struct inode *,
30 struct buffer_head *, 31 struct buffer_head *,
31 u32, /* bits_wanted */ 32 u32, /* bits_wanted */
32 u32, /* min_bits */ 33 u32, /* min_bits */
33 u64, /* max_block */ 34 u64, /* max_block */
34 u16 *, /* *bit_off */ 35 struct ocfs2_suballoc_result *);
35 u16 *); /* *bits_found */ 36 /* found bits */
36 37
37struct ocfs2_alloc_context { 38struct ocfs2_alloc_context {
38 struct inode *ac_inode; /* which bitmap are we allocating from? */ 39 struct inode *ac_inode; /* which bitmap are we allocating from? */
@@ -54,6 +55,8 @@ struct ocfs2_alloc_context {
54 u64 ac_last_group; 55 u64 ac_last_group;
55 u64 ac_max_block; /* Highest block number to allocate. 0 is 56 u64 ac_max_block; /* Highest block number to allocate. 0 is
56 is the same as ~0 - unlimited */ 57 is the same as ~0 - unlimited */
58
59 struct ocfs2_alloc_reservation *ac_resv;
57}; 60};
58 61
59void ocfs2_init_steal_slots(struct ocfs2_super *osb); 62void ocfs2_init_steal_slots(struct ocfs2_super *osb);
@@ -80,22 +83,21 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
80 u32 bits_wanted, 83 u32 bits_wanted,
81 struct ocfs2_alloc_context **ac); 84 struct ocfs2_alloc_context **ac);
82 85
83int ocfs2_claim_metadata(struct ocfs2_super *osb, 86int ocfs2_claim_metadata(handle_t *handle,
84 handle_t *handle,
85 struct ocfs2_alloc_context *ac, 87 struct ocfs2_alloc_context *ac,
86 u32 bits_wanted, 88 u32 bits_wanted,
89 u64 *suballoc_loc,
87 u16 *suballoc_bit_start, 90 u16 *suballoc_bit_start,
88 u32 *num_bits, 91 u32 *num_bits,
89 u64 *blkno_start); 92 u64 *blkno_start);
90int ocfs2_claim_new_inode(struct ocfs2_super *osb, 93int ocfs2_claim_new_inode(handle_t *handle,
91 handle_t *handle,
92 struct inode *dir, 94 struct inode *dir,
93 struct buffer_head *parent_fe_bh, 95 struct buffer_head *parent_fe_bh,
94 struct ocfs2_alloc_context *ac, 96 struct ocfs2_alloc_context *ac,
97 u64 *suballoc_loc,
95 u16 *suballoc_bit, 98 u16 *suballoc_bit,
96 u64 *fe_blkno); 99 u64 *fe_blkno);
97int ocfs2_claim_clusters(struct ocfs2_super *osb, 100int ocfs2_claim_clusters(handle_t *handle,
98 handle_t *handle,
99 struct ocfs2_alloc_context *ac, 101 struct ocfs2_alloc_context *ac,
100 u32 min_clusters, 102 u32 min_clusters,
101 u32 *cluster_start, 103 u32 *cluster_start,
@@ -104,8 +106,7 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
104 * Use this variant of ocfs2_claim_clusters to specify a maximum 106 * Use this variant of ocfs2_claim_clusters to specify a maximum
105 * number of clusters smaller than the allocation reserved. 107 * number of clusters smaller than the allocation reserved.
106 */ 108 */
107int __ocfs2_claim_clusters(struct ocfs2_super *osb, 109int __ocfs2_claim_clusters(handle_t *handle,
108 handle_t *handle,
109 struct ocfs2_alloc_context *ac, 110 struct ocfs2_alloc_context *ac,
110 u32 min_clusters, 111 u32 min_clusters,
111 u32 max_clusters, 112 u32 max_clusters,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index dee03197a494..2c26ce251cb3 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -94,7 +94,9 @@ struct mount_options
94 unsigned long mount_opt; 94 unsigned long mount_opt;
95 unsigned int atime_quantum; 95 unsigned int atime_quantum;
96 signed short slot; 96 signed short slot;
97 unsigned int localalloc_opt; 97 int localalloc_opt;
98 unsigned int resv_level;
99 int dir_resv_level;
98 char cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; 100 char cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
99}; 101};
100 102
@@ -176,6 +178,8 @@ enum {
176 Opt_noacl, 178 Opt_noacl,
177 Opt_usrquota, 179 Opt_usrquota,
178 Opt_grpquota, 180 Opt_grpquota,
181 Opt_resv_level,
182 Opt_dir_resv_level,
179 Opt_err, 183 Opt_err,
180}; 184};
181 185
@@ -202,6 +206,8 @@ static const match_table_t tokens = {
202 {Opt_noacl, "noacl"}, 206 {Opt_noacl, "noacl"},
203 {Opt_usrquota, "usrquota"}, 207 {Opt_usrquota, "usrquota"},
204 {Opt_grpquota, "grpquota"}, 208 {Opt_grpquota, "grpquota"},
209 {Opt_resv_level, "resv_level=%u"},
210 {Opt_dir_resv_level, "dir_resv_level=%u"},
205 {Opt_err, NULL} 211 {Opt_err, NULL}
206}; 212};
207 213
@@ -932,12 +938,16 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
932 int type; 938 int type;
933 struct inode *inode; 939 struct inode *inode;
934 struct super_block *sb = osb->sb; 940 struct super_block *sb = osb->sb;
941 struct ocfs2_mem_dqinfo *oinfo;
935 942
936 /* We mostly ignore errors in this function because there's not much 943 /* We mostly ignore errors in this function because there's not much
937 * we can do when we see them */ 944 * we can do when we see them */
938 for (type = 0; type < MAXQUOTAS; type++) { 945 for (type = 0; type < MAXQUOTAS; type++) {
939 if (!sb_has_quota_loaded(sb, type)) 946 if (!sb_has_quota_loaded(sb, type))
940 continue; 947 continue;
948 /* Cancel periodic syncing before we grab dqonoff_mutex */
949 oinfo = sb_dqinfo(sb, type)->dqi_priv;
950 cancel_delayed_work_sync(&oinfo->dqi_sync_work);
941 inode = igrab(sb->s_dquot.files[type]); 951 inode = igrab(sb->s_dquot.files[type]);
942 /* Turn off quotas. This will remove all dquot structures from 952 /* Turn off quotas. This will remove all dquot structures from
943 * memory and so they will be automatically synced to global 953 * memory and so they will be automatically synced to global
@@ -1028,8 +1038,14 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1028 osb->s_atime_quantum = parsed_options.atime_quantum; 1038 osb->s_atime_quantum = parsed_options.atime_quantum;
1029 osb->preferred_slot = parsed_options.slot; 1039 osb->preferred_slot = parsed_options.slot;
1030 osb->osb_commit_interval = parsed_options.commit_interval; 1040 osb->osb_commit_interval = parsed_options.commit_interval;
1031 osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt); 1041
1032 osb->local_alloc_bits = osb->local_alloc_default_bits; 1042 ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt);
1043 osb->osb_resv_level = parsed_options.resv_level;
1044 osb->osb_dir_resv_level = parsed_options.resv_level;
1045 if (parsed_options.dir_resv_level == -1)
1046 osb->osb_dir_resv_level = parsed_options.resv_level;
1047 else
1048 osb->osb_dir_resv_level = parsed_options.dir_resv_level;
1033 1049
1034 status = ocfs2_verify_userspace_stack(osb, &parsed_options); 1050 status = ocfs2_verify_userspace_stack(osb, &parsed_options);
1035 if (status) 1051 if (status)
@@ -1285,11 +1301,13 @@ static int ocfs2_parse_options(struct super_block *sb,
1285 options ? options : "(none)"); 1301 options ? options : "(none)");
1286 1302
1287 mopt->commit_interval = 0; 1303 mopt->commit_interval = 0;
1288 mopt->mount_opt = 0; 1304 mopt->mount_opt = OCFS2_MOUNT_NOINTR;
1289 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 1305 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
1290 mopt->slot = OCFS2_INVALID_SLOT; 1306 mopt->slot = OCFS2_INVALID_SLOT;
1291 mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE; 1307 mopt->localalloc_opt = -1;
1292 mopt->cluster_stack[0] = '\0'; 1308 mopt->cluster_stack[0] = '\0';
1309 mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL;
1310 mopt->dir_resv_level = -1;
1293 1311
1294 if (!options) { 1312 if (!options) {
1295 status = 1; 1313 status = 1;
@@ -1380,7 +1398,7 @@ static int ocfs2_parse_options(struct super_block *sb,
1380 status = 0; 1398 status = 0;
1381 goto bail; 1399 goto bail;
1382 } 1400 }
1383 if (option >= 0 && (option <= ocfs2_local_alloc_size(sb) * 8)) 1401 if (option >= 0)
1384 mopt->localalloc_opt = option; 1402 mopt->localalloc_opt = option;
1385 break; 1403 break;
1386 case Opt_localflocks: 1404 case Opt_localflocks:
@@ -1433,6 +1451,28 @@ static int ocfs2_parse_options(struct super_block *sb,
1433 mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; 1451 mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
1434 mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; 1452 mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
1435 break; 1453 break;
1454 case Opt_resv_level:
1455 if (is_remount)
1456 break;
1457 if (match_int(&args[0], &option)) {
1458 status = 0;
1459 goto bail;
1460 }
1461 if (option >= OCFS2_MIN_RESV_LEVEL &&
1462 option < OCFS2_MAX_RESV_LEVEL)
1463 mopt->resv_level = option;
1464 break;
1465 case Opt_dir_resv_level:
1466 if (is_remount)
1467 break;
1468 if (match_int(&args[0], &option)) {
1469 status = 0;
1470 goto bail;
1471 }
1472 if (option >= OCFS2_MIN_RESV_LEVEL &&
1473 option < OCFS2_MAX_RESV_LEVEL)
1474 mopt->dir_resv_level = option;
1475 break;
1436 default: 1476 default:
1437 mlog(ML_ERROR, 1477 mlog(ML_ERROR,
1438 "Unrecognized mount option \"%s\" " 1478 "Unrecognized mount option \"%s\" "
@@ -1487,7 +1527,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1487 (unsigned) (osb->osb_commit_interval / HZ)); 1527 (unsigned) (osb->osb_commit_interval / HZ));
1488 1528
1489 local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits); 1529 local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits);
1490 if (local_alloc_megs != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE) 1530 if (local_alloc_megs != ocfs2_la_default_mb(osb))
1491 seq_printf(s, ",localalloc=%d", local_alloc_megs); 1531 seq_printf(s, ",localalloc=%d", local_alloc_megs);
1492 1532
1493 if (opts & OCFS2_MOUNT_LOCALFLOCKS) 1533 if (opts & OCFS2_MOUNT_LOCALFLOCKS)
@@ -1514,6 +1554,12 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1514 else 1554 else
1515 seq_printf(s, ",noacl"); 1555 seq_printf(s, ",noacl");
1516 1556
1557 if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL)
1558 seq_printf(s, ",resv_level=%d", osb->osb_resv_level);
1559
1560 if (osb->osb_dir_resv_level != osb->osb_resv_level)
1561 seq_printf(s, ",dir_resv_level=%d", osb->osb_dir_resv_level);
1562
1517 return 0; 1563 return 0;
1518} 1564}
1519 1565
@@ -1688,6 +1734,8 @@ static void ocfs2_inode_init_once(void *data)
1688 oi->ip_blkno = 0ULL; 1734 oi->ip_blkno = 0ULL;
1689 oi->ip_clusters = 0; 1735 oi->ip_clusters = 0;
1690 1736
1737 ocfs2_resv_init_once(&oi->ip_la_data_resv);
1738
1691 ocfs2_lock_res_init_once(&oi->ip_rw_lockres); 1739 ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
1692 ocfs2_lock_res_init_once(&oi->ip_inode_lockres); 1740 ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
1693 ocfs2_lock_res_init_once(&oi->ip_open_lockres); 1741 ocfs2_lock_res_init_once(&oi->ip_open_lockres);
@@ -2042,6 +2090,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
2042 2090
2043 init_waitqueue_head(&osb->osb_mount_event); 2091 init_waitqueue_head(&osb->osb_mount_event);
2044 2092
2093 status = ocfs2_resmap_init(osb, &osb->osb_la_resmap);
2094 if (status) {
2095 mlog_errno(status);
2096 goto bail;
2097 }
2098
2045 osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); 2099 osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
2046 if (!osb->vol_label) { 2100 if (!osb->vol_label) {
2047 mlog(ML_ERROR, "unable to alloc vol label\n"); 2101 mlog(ML_ERROR, "unable to alloc vol label\n");
@@ -2224,9 +2278,11 @@ static int ocfs2_initialize_super(struct super_block *sb,
2224 } 2278 }
2225 2279
2226 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; 2280 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
2281 osb->osb_clusters_at_boot = OCFS2_I(inode)->ip_clusters;
2227 iput(inode); 2282 iput(inode);
2228 2283
2229 osb->bitmap_cpg = ocfs2_group_bitmap_size(sb) * 8; 2284 osb->bitmap_cpg = ocfs2_group_bitmap_size(sb, 0,
2285 osb->s_feature_incompat) * 8;
2230 2286
2231 status = ocfs2_init_slot_info(osb); 2287 status = ocfs2_init_slot_info(osb);
2232 if (status < 0) { 2288 if (status < 0) {
@@ -2509,5 +2565,25 @@ void __ocfs2_abort(struct super_block* sb,
2509 ocfs2_handle_error(sb); 2565 ocfs2_handle_error(sb);
2510} 2566}
2511 2567
2568/*
2569 * Void signal blockers, because in-kernel sigprocmask() only fails
2570 * when SIG_* is wrong.
2571 */
2572void ocfs2_block_signals(sigset_t *oldset)
2573{
2574 int rc;
2575 sigset_t blocked;
2576
2577 sigfillset(&blocked);
2578 rc = sigprocmask(SIG_BLOCK, &blocked, oldset);
2579 BUG_ON(rc);
2580}
2581
2582void ocfs2_unblock_signals(sigset_t *oldset)
2583{
2584 int rc = sigprocmask(SIG_SETMASK, oldset, NULL);
2585 BUG_ON(rc);
2586}
2587
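A sketch of the calling pattern these two helpers appear designed for (the caller name is a placeholder, not taken from the patch):

    static void some_ocfs2_operation(void)
    {
            sigset_t oldset;

            ocfs2_block_signals(&oldset);   /* mask all signals */
            /* ... work that must not be interrupted ... */
            ocfs2_unblock_signals(&oldset); /* restore saved mask */
    }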
2512module_init(ocfs2_init); 2588module_init(ocfs2_init);
2513module_exit(ocfs2_exit); 2589module_exit(ocfs2_exit);
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 783f5270f2a1..40c7de084c10 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -45,4 +45,11 @@ void __ocfs2_abort(struct super_block *sb,
45 45
46#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) 46#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
47 47
48/*
49 * Void signal blockers, because in-kernel sigprocmask() only fails
50 * when SIG_* is wrong.
51 */
52void ocfs2_block_signals(sigset_t *oldset);
53void ocfs2_unblock_signals(sigset_t *oldset);
54
48#endif /* OCFS2_SUPER_H */ 55#endif /* OCFS2_SUPER_H */
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 3e7773089b96..e97b34842cfe 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -79,6 +79,7 @@ struct ocfs2_xattr_set_ctxt {
79 struct ocfs2_alloc_context *meta_ac; 79 struct ocfs2_alloc_context *meta_ac;
80 struct ocfs2_alloc_context *data_ac; 80 struct ocfs2_alloc_context *data_ac;
81 struct ocfs2_cached_dealloc_ctxt dealloc; 81 struct ocfs2_cached_dealloc_ctxt dealloc;
82 int set_abort;
82}; 83};
83 84
84#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root)) 85#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root))
@@ -96,7 +97,7 @@ static struct ocfs2_xattr_def_value_root def_xv = {
96 .xv.xr_list.l_count = cpu_to_le16(1), 97 .xv.xr_list.l_count = cpu_to_le16(1),
97}; 98};
98 99
99struct xattr_handler *ocfs2_xattr_handlers[] = { 100const struct xattr_handler *ocfs2_xattr_handlers[] = {
100 &ocfs2_xattr_user_handler, 101 &ocfs2_xattr_user_handler,
101 &ocfs2_xattr_acl_access_handler, 102 &ocfs2_xattr_acl_access_handler,
102 &ocfs2_xattr_acl_default_handler, 103 &ocfs2_xattr_acl_default_handler,
@@ -105,7 +106,7 @@ struct xattr_handler *ocfs2_xattr_handlers[] = {
105 NULL 106 NULL
106}; 107};
107 108
108static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { 109static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
109 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, 110 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
110 [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] 111 [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
111 = &ocfs2_xattr_acl_access_handler, 112 = &ocfs2_xattr_acl_access_handler,
@@ -539,7 +540,7 @@ static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno,
539 540
540static inline const char *ocfs2_xattr_prefix(int name_index) 541static inline const char *ocfs2_xattr_prefix(int name_index)
541{ 542{
542 struct xattr_handler *handler = NULL; 543 const struct xattr_handler *handler = NULL;
543 544
544 if (name_index > 0 && name_index < OCFS2_XATTR_MAX) 545 if (name_index > 0 && name_index < OCFS2_XATTR_MAX)
545 handler = ocfs2_xattr_handler_map[name_index]; 546 handler = ocfs2_xattr_handler_map[name_index];
@@ -739,11 +740,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
739 goto leave; 740 goto leave;
740 } 741 }
741 742
742 status = ocfs2_journal_dirty(handle, vb->vb_bh); 743 ocfs2_journal_dirty(handle, vb->vb_bh);
743 if (status < 0) {
744 mlog_errno(status);
745 goto leave;
746 }
747 744
748 clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters; 745 clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters;
749 746
@@ -786,12 +783,7 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
786 } 783 }
787 784
788 le32_add_cpu(&vb->vb_xv->xr_clusters, -len); 785 le32_add_cpu(&vb->vb_xv->xr_clusters, -len);
789 786 ocfs2_journal_dirty(handle, vb->vb_bh);
790 ret = ocfs2_journal_dirty(handle, vb->vb_bh);
791 if (ret) {
792 mlog_errno(ret);
793 goto out;
794 }
795 787
796 if (ext_flags & OCFS2_EXT_REFCOUNTED) 788 if (ext_flags & OCFS2_EXT_REFCOUNTED)
797 ret = ocfs2_decrease_refcount(inode, handle, 789 ret = ocfs2_decrease_refcount(inode, handle,
@@ -1374,11 +1366,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
1374 memset(bh->b_data + cp_len, 0, 1366 memset(bh->b_data + cp_len, 0,
1375 blocksize - cp_len); 1367 blocksize - cp_len);
1376 1368
1377 ret = ocfs2_journal_dirty(handle, bh); 1369 ocfs2_journal_dirty(handle, bh);
1378 if (ret < 0) {
1379 mlog_errno(ret);
1380 goto out;
1381 }
1382 brelse(bh); 1370 brelse(bh);
1383 bh = NULL; 1371 bh = NULL;
1384 1372
@@ -2148,15 +2136,19 @@ alloc_value:
2148 orig_clusters = ocfs2_xa_value_clusters(loc); 2136 orig_clusters = ocfs2_xa_value_clusters(loc);
2149 rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt); 2137 rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt);
2150 if (rc < 0) { 2138 if (rc < 0) {
2151 /* 2139 ctxt->set_abort = 1;
2152 * If we tried to grow an existing external value,
2153 * ocfs2_xa_cleanup_value_truncate() is going to
2154 * let it stand. We have to restore its original
2155 * value size.
2156 */
2157 loc->xl_entry->xe_value_size = orig_value_size;
2158 ocfs2_xa_cleanup_value_truncate(loc, "growing", 2140 ocfs2_xa_cleanup_value_truncate(loc, "growing",
2159 orig_clusters); 2141 orig_clusters);
2142 /*
2143 * If we were growing an existing value,
2144 * ocfs2_xa_cleanup_value_truncate() won't remove
2145 * the entry. We need to restore the original value
2146 * size.
2147 */
2148 if (loc->xl_entry) {
2149 BUG_ON(!orig_value_size);
2150 loc->xl_entry->xe_value_size = orig_value_size;
2151 }
2160 mlog_errno(rc); 2152 mlog_errno(rc);
2161 } 2153 }
2162 } 2154 }
@@ -2479,7 +2471,10 @@ static int ocfs2_xattr_free_block(struct inode *inode,
2479 xb = (struct ocfs2_xattr_block *)blk_bh->b_data; 2471 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
2480 blk = le64_to_cpu(xb->xb_blkno); 2472 blk = le64_to_cpu(xb->xb_blkno);
2481 bit = le16_to_cpu(xb->xb_suballoc_bit); 2473 bit = le16_to_cpu(xb->xb_suballoc_bit);
2482 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 2474 if (xb->xb_suballoc_loc)
2475 bg_blkno = le64_to_cpu(xb->xb_suballoc_loc);
2476 else
2477 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2483 2478
2484 xb_alloc_inode = ocfs2_get_system_file_inode(osb, 2479 xb_alloc_inode = ocfs2_get_system_file_inode(osb,
2485 EXTENT_ALLOC_SYSTEM_INODE, 2480 EXTENT_ALLOC_SYSTEM_INODE,
@@ -2594,9 +2589,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
2594 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 2589 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
2595 spin_unlock(&oi->ip_lock); 2590 spin_unlock(&oi->ip_lock);
2596 2591
2597 ret = ocfs2_journal_dirty(handle, di_bh); 2592 ocfs2_journal_dirty(handle, di_bh);
2598 if (ret < 0)
2599 mlog_errno(ret);
2600out_commit: 2593out_commit:
2601 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 2594 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
2602out: 2595out:
@@ -2724,9 +2717,7 @@ static int ocfs2_xattr_ibody_init(struct inode *inode,
2724 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 2717 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
2725 spin_unlock(&oi->ip_lock); 2718 spin_unlock(&oi->ip_lock);
2726 2719
2727 ret = ocfs2_journal_dirty(ctxt->handle, di_bh); 2720 ocfs2_journal_dirty(ctxt->handle, di_bh);
2728 if (ret < 0)
2729 mlog_errno(ret);
2730 2721
2731out: 2722out:
2732 return ret; 2723 return ret;
@@ -2846,9 +2837,8 @@ static int ocfs2_create_xattr_block(struct inode *inode,
2846 int ret; 2837 int ret;
2847 u16 suballoc_bit_start; 2838 u16 suballoc_bit_start;
2848 u32 num_got; 2839 u32 num_got;
2849 u64 first_blkno; 2840 u64 suballoc_loc, first_blkno;
2850 struct ocfs2_dinode *di = (struct ocfs2_dinode *)inode_bh->b_data; 2841 struct ocfs2_dinode *di = (struct ocfs2_dinode *)inode_bh->b_data;
2851 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2852 struct buffer_head *new_bh = NULL; 2842 struct buffer_head *new_bh = NULL;
2853 struct ocfs2_xattr_block *xblk; 2843 struct ocfs2_xattr_block *xblk;
2854 2844
@@ -2859,9 +2849,9 @@ static int ocfs2_create_xattr_block(struct inode *inode,
2859 goto end; 2849 goto end;
2860 } 2850 }
2861 2851
2862 ret = ocfs2_claim_metadata(osb, ctxt->handle, ctxt->meta_ac, 1, 2852 ret = ocfs2_claim_metadata(ctxt->handle, ctxt->meta_ac, 1,
2863 &suballoc_bit_start, &num_got, 2853 &suballoc_loc, &suballoc_bit_start,
2864 &first_blkno); 2854 &num_got, &first_blkno);
2865 if (ret < 0) { 2855 if (ret < 0) {
2866 mlog_errno(ret); 2856 mlog_errno(ret);
2867 goto end; 2857 goto end;
@@ -2883,8 +2873,10 @@ static int ocfs2_create_xattr_block(struct inode *inode,
2883 memset(xblk, 0, inode->i_sb->s_blocksize); 2873 memset(xblk, 0, inode->i_sb->s_blocksize);
2884 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); 2874 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
2885 xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot); 2875 xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot);
2876 xblk->xb_suballoc_loc = cpu_to_le64(suballoc_loc);
2886 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); 2877 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
2887 xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation); 2878 xblk->xb_fs_generation =
2879 cpu_to_le32(OCFS2_SB(inode->i_sb)->fs_generation);
2888 xblk->xb_blkno = cpu_to_le64(first_blkno); 2880 xblk->xb_blkno = cpu_to_le64(first_blkno);
2889 if (indexed) { 2881 if (indexed) {
2890 struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root; 2882 struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root;
@@ -2956,7 +2948,7 @@ static int ocfs2_xattr_block_set(struct inode *inode,
2956 ret = ocfs2_xa_set(&loc, xi, ctxt); 2948 ret = ocfs2_xa_set(&loc, xi, ctxt);
2957 if (!ret) 2949 if (!ret)
2958 xs->here = loc.xl_entry; 2950 xs->here = loc.xl_entry;
2959 else if (ret != -ENOSPC) 2951 else if ((ret != -ENOSPC) || ctxt->set_abort)
2960 goto end; 2952 goto end;
2961 else { 2953 else {
2962 ret = ocfs2_xattr_create_index_block(inode, xs, ctxt); 2954 ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
@@ -3312,14 +3304,13 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
3312 goto out; 3304 goto out;
3313 } 3305 }
3314 3306
3315 ret = ocfs2_extend_trans(ctxt->handle, credits + 3307 ret = ocfs2_extend_trans(ctxt->handle, credits);
3316 ctxt->handle->h_buffer_credits);
3317 if (ret) { 3308 if (ret) {
3318 mlog_errno(ret); 3309 mlog_errno(ret);
3319 goto out; 3310 goto out;
3320 } 3311 }
3321 ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt); 3312 ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
3322 } else if (ret == -ENOSPC) { 3313 } else if ((ret == -ENOSPC) && !ctxt->set_abort) {
3323 if (di->i_xattr_loc && !xbs->xattr_bh) { 3314 if (di->i_xattr_loc && !xbs->xattr_bh) {
3324 ret = ocfs2_xattr_block_find(inode, 3315 ret = ocfs2_xattr_block_find(inode,
3325 xi->xi_name_index, 3316 xi->xi_name_index,
@@ -3343,8 +3334,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
3343 goto out; 3334 goto out;
3344 } 3335 }
3345 3336
3346 ret = ocfs2_extend_trans(ctxt->handle, credits + 3337 ret = ocfs2_extend_trans(ctxt->handle, credits);
3347 ctxt->handle->h_buffer_credits);
3348 if (ret) { 3338 if (ret) {
3349 mlog_errno(ret); 3339 mlog_errno(ret);
 		goto out;
@@ -3378,8 +3368,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_extend_trans(ctxt->handle, credits +
-				 ctxt->handle->h_buffer_credits);
+	ret = ocfs2_extend_trans(ctxt->handle, credits);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -4249,7 +4238,6 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 	u32 bit_off, len;
 	u64 blkno;
 	handle_t *handle = ctxt->handle;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct buffer_head *xb_bh = xs->xattr_bh;
 	struct ocfs2_xattr_block *xb =
@@ -4277,7 +4265,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 		goto out;
 	}
 
-	ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac,
+	ret = __ocfs2_claim_clusters(handle, ctxt->data_ac,
 				     1, 1, &bit_off, &len);
 	if (ret) {
 		mlog_errno(ret);
@@ -4887,8 +4875,7 @@ static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
 	 * We need to update the first bucket of the old extent and all
 	 * the buckets going to the new extent.
 	 */
-	credits = ((num_buckets + 1) * blks_per_bucket) +
-		  handle->h_buffer_credits;
+	credits = ((num_buckets + 1) * blks_per_bucket);
 	ret = ocfs2_extend_trans(handle, credits);
 	if (ret) {
 		mlog_errno(ret);
@@ -4958,7 +4945,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
 				  u32 *first_hash)
 {
 	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-	int ret, credits = 2 * blk_per_bucket + handle->h_buffer_credits;
+	int ret, credits = 2 * blk_per_bucket;
 
 	BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
 
@@ -5099,7 +5086,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 		goto leave;
 	}
 
-	ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 1,
+	ret = __ocfs2_claim_clusters(handle, ctxt->data_ac, 1,
 				     clusters_to_add, &bit_off, &num_bits);
 	if (ret < 0) {
 		if (ret != -ENOSPC)
@@ -5153,9 +5140,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 		goto leave;
 	}
 
-	ret = ocfs2_journal_dirty(handle, root_bh);
-	if (ret < 0)
-		mlog_errno(ret);
+	ocfs2_journal_dirty(handle, root_bh);
 
 leave:
 	return ret;
@@ -5200,8 +5185,7 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
 	 * existing bucket. Then we add the last existing bucket, the
 	 * new bucket, and the first bucket (3 * blk_per_bucket).
 	 */
-	credits = (end_blk - target_blk) + (3 * blk_per_bucket) +
-		  handle->h_buffer_credits;
+	credits = (end_blk - target_blk) + (3 * blk_per_bucket);
 	ret = ocfs2_extend_trans(handle, credits);
 	if (ret) {
 		mlog_errno(ret);
@@ -5477,12 +5461,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
 	}
 
 	le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len);
-
-	ret = ocfs2_journal_dirty(handle, root_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
+	ocfs2_journal_dirty(handle, root_bh);
 
 	ret = ocfs2_truncate_log_append(osb, handle, blkno, len);
 	if (ret)
@@ -6935,7 +6914,7 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_claim_clusters(osb, handle, data_ac,
+	ret = ocfs2_claim_clusters(handle, data_ac,
 				   len, &p_cluster, &num_clusters);
 	if (ret) {
 		mlog_errno(ret);
@@ -7234,7 +7213,7 @@ int ocfs2_init_security_set(handle_t *handle,
 				     xattr_ac, data_ac);
 }
 
-struct xattr_handler ocfs2_xattr_security_handler = {
+const struct xattr_handler ocfs2_xattr_security_handler = {
 	.prefix	= XATTR_SECURITY_PREFIX,
 	.list	= ocfs2_xattr_security_list,
 	.get	= ocfs2_xattr_security_get,
@@ -7278,7 +7257,7 @@ static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name,
 				   name, value, size, flags);
 }
 
-struct xattr_handler ocfs2_xattr_trusted_handler = {
+const struct xattr_handler ocfs2_xattr_trusted_handler = {
 	.prefix	= XATTR_TRUSTED_PREFIX,
 	.list	= ocfs2_xattr_trusted_list,
 	.get	= ocfs2_xattr_trusted_get,
@@ -7334,7 +7313,7 @@ static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name,
 				name, value, size, flags);
 }
 
-struct xattr_handler ocfs2_xattr_user_handler = {
+const struct xattr_handler ocfs2_xattr_user_handler = {
 	.prefix	= XATTR_USER_PREFIX,
 	.list	= ocfs2_xattr_user_list,
 	.get	= ocfs2_xattr_user_get,
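The h_buffer_credits arithmetic disappears from every ocfs2_extend_trans()
call site above, which only makes sense if ocfs2_extend_trans() itself now
treats its argument as credits wanted *on top of* what the handle already
holds. A minimal sketch of that assumed convention (not the verbatim ocfs2
routine) over jbd2:

	/* Sketch: extend the running handle by "nblocks" more credits;
	 * if the journal cannot extend in place, restart the handle
	 * with the old credit count plus the new one. */
	static int extend_trans_sketch(handle_t *handle, int nblocks)
	{
		int old_nblocks = handle->h_buffer_credits;
		int status;

		if (!nblocks)
			return 0;

		status = jbd2_journal_extend(handle, nblocks);
		if (status > 0)	/* no room left: restart instead */
			status = jbd2_journal_restart(handle,
						      old_nblocks + nblocks);
		return status;
	}

The ocfs2_journal_dirty() hunks follow a second pattern: the helper's return
value is gone, so the error handling around each call collapses to a bare call.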
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index abd72a47f520..aa64bb37a65b 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -37,12 +37,12 @@ struct ocfs2_security_xattr_info {
 	size_t value_len;
 };
 
-extern struct xattr_handler ocfs2_xattr_user_handler;
-extern struct xattr_handler ocfs2_xattr_trusted_handler;
-extern struct xattr_handler ocfs2_xattr_security_handler;
-extern struct xattr_handler ocfs2_xattr_acl_access_handler;
-extern struct xattr_handler ocfs2_xattr_acl_default_handler;
-extern struct xattr_handler *ocfs2_xattr_handlers[];
+extern const struct xattr_handler ocfs2_xattr_user_handler;
+extern const struct xattr_handler ocfs2_xattr_trusted_handler;
+extern const struct xattr_handler ocfs2_xattr_security_handler;
+extern const struct xattr_handler ocfs2_xattr_acl_access_handler;
+extern const struct xattr_handler ocfs2_xattr_acl_default_handler;
+extern const struct xattr_handler *ocfs2_xattr_handlers[];
 
 ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
 int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int,
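The const-ification above lets the handler tables live in read-only data.
For reference, a trimmed, hypothetical version of the table these externs
describe (the real ocfs2 table also carries the ACL handlers):

	const struct xattr_handler *ocfs2_xattr_handlers[] = {
		&ocfs2_xattr_user_handler,
		&ocfs2_xattr_trusted_handler,
		&ocfs2_xattr_security_handler,
		NULL	/* sentinel: the VFS walks until NULL */
	};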
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index b44bb835e8ea..089839a6cc64 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -37,9 +37,7 @@ struct inode *omfs_new_inode(struct inode *dir, int mode)
 		goto fail;
 
 	inode->i_ino = new_block;
-	inode->i_mode = mode;
-	inode->i_uid = current_fsuid();
-	inode->i_gid = current_fsgid();
+	inode_init_owner(inode, NULL, mode);
 	inode->i_mapping->a_ops = &omfs_aops;
 
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
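inode_init_owner() replaces the open-coded owner setup here and in many other
filesystems in this series. Roughly, the helper centralizes fsuid/fsgid
assignment plus setgid-directory inheritance; a sketch of its behaviour (the
real helper lives in fs/inode.c):

	void inode_init_owner(struct inode *inode, const struct inode *dir,
			      mode_t mode)
	{
		inode->i_uid = current_fsuid();
		if (dir && (dir->i_mode & S_ISGID)) {
			inode->i_gid = dir->i_gid;	/* inherit group */
			if (S_ISDIR(mode))
				mode |= S_ISGID;	/* propagate setgid */
		} else
			inode->i_gid = current_fsgid();
		inode->i_mode = mode;
	}

omfs passes a NULL dir, so its group assignment still comes from
current_fsgid() exactly as before.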
diff --git a/fs/open.c b/fs/open.c
index 74e5cd9f718e..5463266db9e6 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -17,7 +17,6 @@
 #include <linux/securebits.h>
 #include <linux/security.h>
 #include <linux/mount.h>
-#include <linux/vfs.h>
 #include <linux/fcntl.h>
 #include <linux/slab.h>
 #include <asm/uaccess.h>
@@ -33,171 +32,6 @@
 
 #include "internal.h"
 
-int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
-	int retval = -ENODEV;
-
-	if (dentry) {
-		retval = -ENOSYS;
-		if (dentry->d_sb->s_op->statfs) {
-			memset(buf, 0, sizeof(*buf));
-			retval = security_sb_statfs(dentry);
-			if (retval)
-				return retval;
-			retval = dentry->d_sb->s_op->statfs(dentry, buf);
-			if (retval == 0 && buf->f_frsize == 0)
-				buf->f_frsize = buf->f_bsize;
-		}
-	}
-	return retval;
-}
-
-EXPORT_SYMBOL(vfs_statfs);
-
-static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
-{
-	struct kstatfs st;
-	int retval;
-
-	retval = vfs_statfs(dentry, &st);
-	if (retval)
-		return retval;
-
-	if (sizeof(*buf) == sizeof(st))
-		memcpy(buf, &st, sizeof(st));
-	else {
-		if (sizeof buf->f_blocks == 4) {
-			if ((st.f_blocks | st.f_bfree | st.f_bavail |
-			     st.f_bsize | st.f_frsize) &
-			    0xffffffff00000000ULL)
-				return -EOVERFLOW;
-			/*
-			 * f_files and f_ffree may be -1; it's okay to stuff
-			 * that into 32 bits
-			 */
-			if (st.f_files != -1 &&
-			    (st.f_files & 0xffffffff00000000ULL))
-				return -EOVERFLOW;
-			if (st.f_ffree != -1 &&
-			    (st.f_ffree & 0xffffffff00000000ULL))
-				return -EOVERFLOW;
-		}
-
-		buf->f_type = st.f_type;
-		buf->f_bsize = st.f_bsize;
-		buf->f_blocks = st.f_blocks;
-		buf->f_bfree = st.f_bfree;
-		buf->f_bavail = st.f_bavail;
-		buf->f_files = st.f_files;
-		buf->f_ffree = st.f_ffree;
-		buf->f_fsid = st.f_fsid;
-		buf->f_namelen = st.f_namelen;
-		buf->f_frsize = st.f_frsize;
-		memset(buf->f_spare, 0, sizeof(buf->f_spare));
-	}
-	return 0;
-}
-
-static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
-{
-	struct kstatfs st;
-	int retval;
-
-	retval = vfs_statfs(dentry, &st);
-	if (retval)
-		return retval;
-
-	if (sizeof(*buf) == sizeof(st))
-		memcpy(buf, &st, sizeof(st));
-	else {
-		buf->f_type = st.f_type;
-		buf->f_bsize = st.f_bsize;
-		buf->f_blocks = st.f_blocks;
-		buf->f_bfree = st.f_bfree;
-		buf->f_bavail = st.f_bavail;
-		buf->f_files = st.f_files;
-		buf->f_ffree = st.f_ffree;
-		buf->f_fsid = st.f_fsid;
-		buf->f_namelen = st.f_namelen;
-		buf->f_frsize = st.f_frsize;
-		memset(buf->f_spare, 0, sizeof(buf->f_spare));
-	}
-	return 0;
-}
-
-SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
-{
-	struct path path;
-	int error;
-
-	error = user_path(pathname, &path);
-	if (!error) {
-		struct statfs tmp;
-		error = vfs_statfs_native(path.dentry, &tmp);
-		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-			error = -EFAULT;
-		path_put(&path);
-	}
-	return error;
-}
-
-SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
-{
-	struct path path;
-	long error;
-
-	if (sz != sizeof(*buf))
-		return -EINVAL;
-	error = user_path(pathname, &path);
-	if (!error) {
-		struct statfs64 tmp;
-		error = vfs_statfs64(path.dentry, &tmp);
-		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-			error = -EFAULT;
-		path_put(&path);
-	}
-	return error;
-}
-
-SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
-{
-	struct file * file;
-	struct statfs tmp;
-	int error;
-
-	error = -EBADF;
-	file = fget(fd);
-	if (!file)
-		goto out;
-	error = vfs_statfs_native(file->f_path.dentry, &tmp);
-	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-		error = -EFAULT;
-	fput(file);
-out:
-	return error;
-}
-
-SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
-{
-	struct file * file;
-	struct statfs64 tmp;
-	int error;
-
-	if (sz != sizeof(*buf))
-		return -EINVAL;
-
-	error = -EBADF;
-	file = fget(fd);
-	if (!file)
-		goto out;
-	error = vfs_statfs64(file->f_path.dentry, &tmp);
-	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-		error = -EFAULT;
-	fput(file);
-out:
-	return error;
-}
-
 int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 	struct file *filp)
 {
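None of this is lost functionality: the same series reintroduces vfs_statfs()
and the four syscalls in a new fs/statfs.c (not visible in this excerpt),
which is also why the <linux/vfs.h> include goes away above. The user-visible
contract is unchanged; a quick userspace check:

	/* Compile and run against the patched kernel; behaviour should
	 * be identical to before the code move. */
	#include <stdio.h>
	#include <sys/vfs.h>		/* statfs(2) */

	int main(void)
	{
		struct statfs st;

		if (statfs("/", &st) != 0) {
			perror("statfs");
			return 1;
		}
		printf("bsize=%ld blocks=%llu bfree=%llu\n",
		       (long)st.f_bsize,
		       (unsigned long long)st.f_blocks,
		       (unsigned long long)st.f_bfree);
		return 0;
	}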
diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c
index a97b477ac0fc..6921e7890be6 100644
--- a/fs/partitions/acorn.c
+++ b/fs/partitions/acorn.c
@@ -70,14 +70,14 @@ struct riscix_record {
 
 #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
 	defined(CONFIG_ACORN_PARTITION_ADFS)
-static int
-riscix_partition(struct parsed_partitions *state, struct block_device *bdev,
-		 unsigned long first_sect, int slot, unsigned long nr_sects)
+static int riscix_partition(struct parsed_partitions *state,
+			    unsigned long first_sect, int slot,
+			    unsigned long nr_sects)
 {
 	Sector sect;
 	struct riscix_record *rr;
 
-	rr = (struct riscix_record *)read_dev_sector(bdev, first_sect, &sect);
+	rr = read_part_sector(state, first_sect, &sect);
 	if (!rr)
 		return -1;
 
@@ -123,9 +123,9 @@ struct linux_part {
 
 #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
 	defined(CONFIG_ACORN_PARTITION_ADFS)
-static int
-linux_partition(struct parsed_partitions *state, struct block_device *bdev,
-		unsigned long first_sect, int slot, unsigned long nr_sects)
+static int linux_partition(struct parsed_partitions *state,
+			   unsigned long first_sect, int slot,
+			   unsigned long nr_sects)
 {
 	Sector sect;
 	struct linux_part *linuxp;
@@ -135,7 +135,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev,
 
 	put_partition(state, slot++, first_sect, size);
 
-	linuxp = (struct linux_part *)read_dev_sector(bdev, first_sect, &sect);
+	linuxp = read_part_sector(state, first_sect, &sect);
 	if (!linuxp)
 		return -1;
 
@@ -157,8 +157,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev,
 #endif
 
 #ifdef CONFIG_ACORN_PARTITION_CUMANA
-int
-adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_CUMANA(struct parsed_partitions *state)
 {
 	unsigned long first_sector = 0;
 	unsigned int start_blk = 0;
@@ -185,7 +184,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
 		struct adfs_discrecord *dr;
 		unsigned int nr_sects;
 
-		data = read_dev_sector(bdev, start_blk * 2 + 6, &sect);
+		data = read_part_sector(state, start_blk * 2 + 6, &sect);
 		if (!data)
 			return -1;
 
@@ -217,14 +216,14 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
 #ifdef CONFIG_ACORN_PARTITION_RISCIX
 		case PARTITION_RISCIX_SCSI:
 			/* RISCiX - we don't know how to find the next one. */
-			slot = riscix_partition(state, bdev, first_sector,
-						slot, nr_sects);
+			slot = riscix_partition(state, first_sector, slot,
+						nr_sects);
 			break;
 #endif
 
 		case PARTITION_LINUX:
-			slot = linux_partition(state, bdev, first_sector,
-					       slot, nr_sects);
+			slot = linux_partition(state, first_sector, slot,
					       nr_sects);
 			break;
 		}
 		put_dev_sector(sect);
@@ -249,8 +248,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
  * hda1 = ADFS partition on first drive.
  * hda2 = non-ADFS partition.
  */
-int
-adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_ADFS(struct parsed_partitions *state)
 {
 	unsigned long start_sect, nr_sects, sectscyl, heads;
 	Sector sect;
@@ -259,7 +257,7 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
 	unsigned char id;
 	int slot = 1;
 
-	data = read_dev_sector(bdev, 6, &sect);
+	data = read_part_sector(state, 6, &sect);
 	if (!data)
 		return -1;
 
@@ -278,21 +276,21 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
 	/*
 	 * Work out start of non-adfs partition.
 	 */
-	nr_sects = (bdev->bd_inode->i_size >> 9) - start_sect;
+	nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect;
 
 	if (start_sect) {
 		switch (id) {
 #ifdef CONFIG_ACORN_PARTITION_RISCIX
 		case PARTITION_RISCIX_SCSI:
 		case PARTITION_RISCIX_MFM:
-			slot = riscix_partition(state, bdev, start_sect,
-						slot, nr_sects);
+			slot = riscix_partition(state, start_sect, slot,
+						nr_sects);
 			break;
 #endif
 
 		case PARTITION_LINUX:
-			slot = linux_partition(state, bdev, start_sect,
-					       slot, nr_sects);
+			slot = linux_partition(state, start_sect, slot,
+					       nr_sects);
 			break;
 		}
 	}
@@ -308,10 +306,11 @@ struct ics_part {
 	__le32 size;
 };
 
-static int adfspart_check_ICSLinux(struct block_device *bdev, unsigned long block)
+static int adfspart_check_ICSLinux(struct parsed_partitions *state,
+				   unsigned long block)
 {
 	Sector sect;
-	unsigned char *data = read_dev_sector(bdev, block, &sect);
+	unsigned char *data = read_part_sector(state, block, &sect);
 	int result = 0;
 
 	if (data) {
@@ -349,8 +348,7 @@ static inline int valid_ics_sector(const unsigned char *data)
  * hda2 = ADFS partition 1 on first drive.
  * ..etc..
 */
-int
-adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_ICS(struct parsed_partitions *state)
 {
 	const unsigned char *data;
 	const struct ics_part *p;
@@ -360,7 +358,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
 	/*
 	 * Try ICS style partitions - sector 0 contains partition info.
 	 */
-	data = read_dev_sector(bdev, 0, &sect);
+	data = read_part_sector(state, 0, &sect);
 	if (!data)
 		return -1;
 
@@ -392,7 +390,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
 			 * partition is. We must not make this visible
 			 * to the filesystem.
 			 */
-			if (size > 1 && adfspart_check_ICSLinux(bdev, start)) {
+			if (size > 1 && adfspart_check_ICSLinux(state, start)) {
 				start += 1;
 				size -= 1;
 			}
@@ -446,8 +444,7 @@ static inline int valid_ptec_sector(const unsigned char *data)
  * hda2 = ADFS partition 1 on first drive.
 * ..etc..
 */
-int
-adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_POWERTEC(struct parsed_partitions *state)
 {
 	Sector sect;
 	const unsigned char *data;
@@ -455,7 +452,7 @@ adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bd
 	int slot = 1;
 	int i;
 
-	data = read_dev_sector(bdev, 0, &sect);
+	data = read_part_sector(state, 0, &sect);
 	if (!data)
 		return -1;
 
@@ -508,8 +505,7 @@ static const char eesox_name[] = {
 * 1. The individual ADFS boot block entries that are placed on the disk.
 * 2. The start address of the next entry.
 */
-int
-adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_EESOX(struct parsed_partitions *state)
 {
 	Sector sect;
 	const unsigned char *data;
@@ -518,7 +514,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
 	sector_t start = 0;
 	int i, slot = 1;
 
-	data = read_dev_sector(bdev, 7, &sect);
+	data = read_part_sector(state, 7, &sect);
 	if (!data)
 		return -1;
 
@@ -545,7 +541,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
 	if (i != 0) {
 		sector_t size;
 
-		size = get_capacity(bdev->bd_disk);
+		size = get_capacity(state->bdev->bd_disk);
 		put_partition(state, slot++, start, size - start);
 		printk("\n");
 	}
diff --git a/fs/partitions/acorn.h b/fs/partitions/acorn.h
index 81fd50ecc080..ede828529692 100644
--- a/fs/partitions/acorn.h
+++ b/fs/partitions/acorn.h
@@ -7,8 +7,8 @@
  * format, and everyone stick to it?
  */
 
-int adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev);
+int adfspart_check_CUMANA(struct parsed_partitions *state);
+int adfspart_check_ADFS(struct parsed_partitions *state);
+int adfspart_check_ICS(struct parsed_partitions *state);
+int adfspart_check_POWERTEC(struct parsed_partitions *state);
+int adfspart_check_EESOX(struct parsed_partitions *state);
diff --git a/fs/partitions/amiga.c b/fs/partitions/amiga.c
index 9917a8c360f2..ba443d4229f8 100644
--- a/fs/partitions/amiga.c
+++ b/fs/partitions/amiga.c
@@ -23,8 +23,7 @@ checksum_block(__be32 *m, int size)
 	return sum;
 }
 
-int
-amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
+int amiga_partition(struct parsed_partitions *state)
 {
 	Sector sect;
 	unsigned char *data;
@@ -38,11 +37,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
 	for (blk = 0; ; blk++, put_dev_sector(sect)) {
 		if (blk == RDB_ALLOCATION_LIMIT)
 			goto rdb_done;
-		data = read_dev_sector(bdev, blk, &sect);
+		data = read_part_sector(state, blk, &sect);
 		if (!data) {
 			if (warn_no_part)
 				printk("Dev %s: unable to read RDB block %d\n",
-				       bdevname(bdev, b), blk);
+				       bdevname(state->bdev, b), blk);
 			res = -1;
 			goto rdb_done;
 		}
@@ -64,7 +63,7 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
 		}
 
 		printk("Dev %s: RDB in block %d has bad checksum\n",
-		       bdevname(bdev, b), blk);
+		       bdevname(state->bdev, b), blk);
 	}
 
 	/* blksize is blocks per 512 byte standard block */
@@ -75,11 +74,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
 	put_dev_sector(sect);
 	for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) {
 		blk *= blksize;	/* Read in terms partition table understands */
-		data = read_dev_sector(bdev, blk, &sect);
+		data = read_part_sector(state, blk, &sect);
 		if (!data) {
 			if (warn_no_part)
 				printk("Dev %s: unable to read partition block %d\n",
-				       bdevname(bdev, b), blk);
+				       bdevname(state->bdev, b), blk);
 			res = -1;
 			goto rdb_done;
 		}
diff --git a/fs/partitions/amiga.h b/fs/partitions/amiga.h
index 2f3e9ce22d53..d094585cadaa 100644
--- a/fs/partitions/amiga.h
+++ b/fs/partitions/amiga.h
@@ -2,5 +2,5 @@
  * fs/partitions/amiga.h
  */
 
-int amiga_partition(struct parsed_partitions *state, struct block_device *bdev);
+int amiga_partition(struct parsed_partitions *state);
 
diff --git a/fs/partitions/atari.c b/fs/partitions/atari.c
index 1f3572d5b755..4439ff1b6cec 100644
--- a/fs/partitions/atari.c
+++ b/fs/partitions/atari.c
@@ -30,7 +30,7 @@ static inline int OK_id(char *s)
 		memcmp (s, "RAW", 3) == 0 ;
 }
 
-int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
+int atari_partition(struct parsed_partitions *state)
 {
 	Sector sect;
 	struct rootsector *rs;
@@ -42,12 +42,12 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
 	int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */
 #endif
 
-	rs = (struct rootsector *) read_dev_sector(bdev, 0, &sect);
+	rs = read_part_sector(state, 0, &sect);
 	if (!rs)
 		return -1;
 
 	/* Verify this is an Atari rootsector: */
-	hd_size = bdev->bd_inode->i_size >> 9;
+	hd_size = state->bdev->bd_inode->i_size >> 9;
 	if (!VALID_PARTITION(&rs->part[0], hd_size) &&
 	    !VALID_PARTITION(&rs->part[1], hd_size) &&
 	    !VALID_PARTITION(&rs->part[2], hd_size) &&
@@ -84,7 +84,7 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
 		printk(" XGM<");
 		partsect = extensect = be32_to_cpu(pi->st);
 		while (1) {
-			xrs = (struct rootsector *)read_dev_sector(bdev, partsect, &sect2);
+			xrs = read_part_sector(state, partsect, &sect2);
 			if (!xrs) {
 				printk (" block %ld read failed\n", partsect);
 				put_dev_sector(sect);
diff --git a/fs/partitions/atari.h b/fs/partitions/atari.h
index 63186b00e135..fe2d32a89f36 100644
--- a/fs/partitions/atari.h
+++ b/fs/partitions/atari.h
@@ -31,4 +31,4 @@ struct rootsector
 	u16 checksum; /* checksum for bootable disks */
 } __attribute__((__packed__));
 
-int atari_partition(struct parsed_partitions *state, struct block_device *bdev);
+int atari_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index e238ab23a9e7..5dcd4b0c5533 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -45,7 +45,7 @@ extern void md_autodetect_dev(dev_t dev);
 
 int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/
 
-static int (*check_part[])(struct parsed_partitions *, struct block_device *) = {
+static int (*check_part[])(struct parsed_partitions *) = {
 	/*
 	 * Probe partition formats with tables at disk address 0
 	 * that also have an ADFS boot block at 0xdc0.
@@ -161,10 +161,11 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
 	struct parsed_partitions *state;
 	int i, res, err;
 
-	state = kmalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
+	state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
 	if (!state)
 		return NULL;
 
+	state->bdev = bdev;
 	disk_name(hd, 0, state->name);
 	printk(KERN_INFO " %s:", state->name);
 	if (isdigit(state->name[strlen(state->name)-1]))
@@ -174,7 +175,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
 	i = res = err = 0;
 	while (!res && check_part[i]) {
 		memset(&state->parts, 0, sizeof(state->parts));
-		res = check_part[i++](state, bdev);
+		res = check_part[i++](state);
 		if (res < 0) {
 			/* We have hit an I/O error which we don't report now.
 			 * But record it, and let the others do their job.
@@ -186,6 +187,8 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
 	}
 	if (res > 0)
 		return state;
+	if (state->access_beyond_eod)
+		err = -ENOSPC;
 	if (err)
 	/* The partition is unrecognized. So report I/O errors if there were any */
 		res = err;
@@ -538,12 +541,33 @@ exit:
 	disk_part_iter_exit(&piter);
 }
 
+static bool disk_unlock_native_capacity(struct gendisk *disk)
+{
+	const struct block_device_operations *bdops = disk->fops;
+
+	if (bdops->unlock_native_capacity &&
+	    !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) {
+		printk(KERN_CONT "enabling native capacity\n");
+		bdops->unlock_native_capacity(disk);
+		disk->flags |= GENHD_FL_NATIVE_CAPACITY;
+		return true;
+	} else {
+		printk(KERN_CONT "truncated\n");
+		return false;
+	}
+}
+
 int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 {
+	struct parsed_partitions *state = NULL;
 	struct disk_part_iter piter;
 	struct hd_struct *part;
-	struct parsed_partitions *state;
 	int p, highest, res;
+rescan:
+	if (state && !IS_ERR(state)) {
+		kfree(state);
+		state = NULL;
+	}
 
 	if (bdev->bd_part_count)
 		return -EBUSY;
@@ -562,8 +586,32 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 	bdev->bd_invalidated = 0;
 	if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
 		return 0;
-	if (IS_ERR(state))	/* I/O error reading the partition table */
+	if (IS_ERR(state)) {
+		/*
+		 * I/O error reading the partition table.  If any
+		 * partition code tried to read beyond EOD, retry
+		 * after unlocking native capacity.
+		 */
+		if (PTR_ERR(state) == -ENOSPC) {
+			printk(KERN_WARNING "%s: partition table beyond EOD, ",
+			       disk->disk_name);
+			if (disk_unlock_native_capacity(disk))
+				goto rescan;
+		}
 		return -EIO;
+	}
+	/*
+	 * If any partition code tried to read beyond EOD, try
+	 * unlocking native capacity even if partition table is
+	 * successfully read as we could be missing some partitions.
+	 */
+	if (state->access_beyond_eod) {
+		printk(KERN_WARNING
+		       "%s: partition table partially beyond EOD, ",
+		       disk->disk_name);
+		if (disk_unlock_native_capacity(disk))
+			goto rescan;
+	}
 
 	/* tell userspace that the media / partition table may have changed */
 	kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
@@ -581,7 +629,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 	/* add partitions */
 	for (p = 1; p < state->limit; p++) {
 		sector_t size, from;
-try_scan:
+
 		size = state->parts[p].size;
 		if (!size)
 			continue;
@@ -589,30 +637,21 @@ try_scan:
 		from = state->parts[p].from;
 		if (from >= get_capacity(disk)) {
 			printk(KERN_WARNING
-			       "%s: p%d ignored, start %llu is behind the end of the disk\n",
+			       "%s: p%d start %llu is beyond EOD, ",
 			       disk->disk_name, p, (unsigned long long) from);
+			if (disk_unlock_native_capacity(disk))
+				goto rescan;
 			continue;
 		}
 
 		if (from + size > get_capacity(disk)) {
-			const struct block_device_operations *bdops = disk->fops;
-			unsigned long long capacity;
-
 			printk(KERN_WARNING
-			       "%s: p%d size %llu exceeds device capacity, ",
+			       "%s: p%d size %llu extends beyond EOD, ",
 			       disk->disk_name, p, (unsigned long long) size);
 
-			if (bdops->set_capacity &&
-			    (disk->flags & GENHD_FL_NATIVE_CAPACITY) == 0) {
-				printk(KERN_CONT "enabling native capacity\n");
-				capacity = bdops->set_capacity(disk, ~0ULL);
-				disk->flags |= GENHD_FL_NATIVE_CAPACITY;
-				if (capacity > get_capacity(disk)) {
-					set_capacity(disk, capacity);
-					check_disk_size_change(disk, bdev);
-					bdev->bd_invalidated = 0;
-				}
-				goto try_scan;
+			if (disk_unlock_native_capacity(disk)) {
+				/* free state and restart */
+				goto rescan;
 			} else {
@@ -620,7 +659,6 @@ try_scan:
 				 * we limit them to the end of the disk to avoid
 				 * creating invalid block devices
 				 */
-				printk(KERN_CONT "limited to end of disk\n");
 				size = get_capacity(disk) - from;
 			}
 		}
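disk_unlock_native_capacity() exists because some drives ship with a Host
Protected Area hiding part of the disk; when a partition entry points past
the reported end of device, the kernel now asks the driver to unhide the
sectors and rescans, instead of silently truncating. Drivers opt in through
the new ->unlock_native_capacity() hook; a hypothetical driver-side sketch
(all mydrv_* names are made up):

	static void mydrv_unlock_native_capacity(struct gendisk *disk)
	{
		struct mydrv_dev *dev = disk->private_data;	/* hypothetical */

		/* Ask the hardware to drop its HPA, then publish the size. */
		if (mydrv_disable_hpa(dev) == 0)		/* hypothetical */
			set_capacity(disk, dev->native_sectors);
	}

	static const struct block_device_operations mydrv_fops = {
		.owner			= THIS_MODULE,
		.unlock_native_capacity	= mydrv_unlock_native_capacity,
	};

GENHD_FL_NATIVE_CAPACITY prevents looping: once the flag is set, a second
beyond-EOD hit falls through to the truncation path.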
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 98dbe1a84528..52f8bd399396 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -6,6 +6,7 @@
  * description.
  */
 struct parsed_partitions {
+	struct block_device *bdev;
 	char name[BDEVNAME_SIZE];
 	struct {
 		sector_t from;
@@ -14,8 +15,19 @@ struct parsed_partitions {
 	} parts[DISK_MAX_PARTS];
 	int next;
 	int limit;
+	bool access_beyond_eod;
 };
 
+static inline void *read_part_sector(struct parsed_partitions *state,
+				     sector_t n, Sector *p)
+{
+	if (n >= get_capacity(state->bdev->bd_disk)) {
+		state->access_beyond_eod = true;
+		return NULL;
+	}
+	return read_dev_sector(state->bdev, n, p);
+}
+
 static inline void
 put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size)
 {
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 91babdae7587..9efb2cfe2410 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -140,8 +140,7 @@ efi_crc32(const void *buf, unsigned long len)
  * the part[0] entry for this disk, and is the number of
  * physical sectors available on the disk.
  */
-static u64
-last_lba(struct block_device *bdev)
+static u64 last_lba(struct block_device *bdev)
 {
 	if (!bdev || !bdev->bd_inode)
 		return 0;
@@ -181,27 +180,28 @@ is_pmbr_valid(legacy_mbr *mbr)
 
 /**
  * read_lba(): Read bytes from disk, starting at given LBA
- * @bdev
+ * @state
  * @lba
  * @buffer
  * @size_t
  *
- * Description: Reads @count bytes from @bdev into @buffer.
+ * Description: Reads @count bytes from @state->bdev into @buffer.
  * Returns number of bytes read on success, 0 on error.
  */
-static size_t
-read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
+static size_t read_lba(struct parsed_partitions *state,
+		       u64 lba, u8 *buffer, size_t count)
 {
 	size_t totalreadcount = 0;
+	struct block_device *bdev = state->bdev;
 	sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
 
-	if (!bdev || !buffer || lba > last_lba(bdev))
+	if (!buffer || lba > last_lba(bdev))
 		return 0;
 
 	while (count) {
 		int copied = 512;
 		Sector sect;
-		unsigned char *data = read_dev_sector(bdev, n++, &sect);
+		unsigned char *data = read_part_sector(state, n++, &sect);
 		if (!data)
 			break;
 		if (copied > count)
@@ -217,19 +217,20 @@ read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
 
 /**
  * alloc_read_gpt_entries(): reads partition entries from disk
- * @bdev
+ * @state
  * @gpt - GPT header
 *
 * Description: Returns ptes on success, NULL on error.
 * Allocates space for PTEs based on information found in @gpt.
 * Notes: remember to free pte when you're done!
 */
-static gpt_entry *
-alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
+static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
+					 gpt_header *gpt)
 {
 	size_t count;
 	gpt_entry *pte;
-	if (!bdev || !gpt)
+
+	if (!gpt)
 		return NULL;
 
 	count = le32_to_cpu(gpt->num_partition_entries) *
@@ -240,7 +241,7 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
 	if (!pte)
 		return NULL;
 
-	if (read_lba(bdev, le64_to_cpu(gpt->partition_entry_lba),
+	if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba),
 			(u8 *) pte,
 			count) < count) {
 		kfree(pte);
@@ -252,27 +253,24 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
 
 /**
  * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk
- * @bdev
+ * @state
  * @lba is the Logical Block Address of the partition table
 *
 * Description: returns GPT header on success, NULL on error. Allocates
- * and fills a GPT header starting at @ from @bdev.
+ * and fills a GPT header starting at @ from @state->bdev.
 * Note: remember to free gpt when finished with it.
 */
-static gpt_header *
-alloc_read_gpt_header(struct block_device *bdev, u64 lba)
+static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state,
+					 u64 lba)
 {
 	gpt_header *gpt;
-	unsigned ssz = bdev_logical_block_size(bdev);
-
-	if (!bdev)
-		return NULL;
+	unsigned ssz = bdev_logical_block_size(state->bdev);
 
 	gpt = kzalloc(ssz, GFP_KERNEL);
 	if (!gpt)
 		return NULL;
 
-	if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) {
+	if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) {
 		kfree(gpt);
 		gpt=NULL;
 		return NULL;
@@ -283,7 +281,7 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba)
 
 /**
  * is_gpt_valid() - tests one GPT header and PTEs for validity
- * @bdev
+ * @state
  * @lba is the logical block address of the GPT header to test
  * @gpt is a GPT header ptr, filled on return.
  * @ptes is a PTEs ptr, filled on return.
@@ -291,16 +289,15 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba)
  * Description: returns 1 if valid, 0 on error.
  * If valid, returns pointers to newly allocated GPT header and PTEs.
 */
-static int
-is_gpt_valid(struct block_device *bdev, u64 lba,
-	     gpt_header **gpt, gpt_entry **ptes)
+static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
+			gpt_header **gpt, gpt_entry **ptes)
 {
 	u32 crc, origcrc;
 	u64 lastlba;
 
-	if (!bdev || !gpt || !ptes)
+	if (!ptes)
 		return 0;
-	if (!(*gpt = alloc_read_gpt_header(bdev, lba)))
+	if (!(*gpt = alloc_read_gpt_header(state, lba)))
 		return 0;
 
 	/* Check the GUID Partition Table signature */
@@ -336,7 +333,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
 	/* Check the first_usable_lba and last_usable_lba are
 	 * within the disk.
 	 */
-	lastlba = last_lba(bdev);
+	lastlba = last_lba(state->bdev);
 	if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) {
 		pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n",
 			 (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba),
@@ -350,7 +347,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
 		goto fail;
 	}
 
-	if (!(*ptes = alloc_read_gpt_entries(bdev, *gpt)))
+	if (!(*ptes = alloc_read_gpt_entries(state, *gpt)))
 		goto fail;
 
 	/* Check the GUID Partition Entry Array CRC */
@@ -495,7 +492,7 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
 
 /**
  * find_valid_gpt() - Search disk for valid GPT headers and PTEs
- * @bdev
+ * @state
  * @gpt is a GPT header ptr, filled on return.
 * @ptes is a PTEs ptr, filled on return.
 * Description: Returns 1 if valid, 0 on error.
@@ -508,24 +505,25 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
 * This protects against devices which misreport their size, and forces
 * the user to decide to use the Alternate GPT.
 */
-static int
-find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
+static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
+			  gpt_entry **ptes)
 {
 	int good_pgpt = 0, good_agpt = 0, good_pmbr = 0;
 	gpt_header *pgpt = NULL, *agpt = NULL;
 	gpt_entry *pptes = NULL, *aptes = NULL;
 	legacy_mbr *legacymbr;
 	u64 lastlba;
-	if (!bdev || !gpt || !ptes)
+
+	if (!ptes)
 		return 0;
 
-	lastlba = last_lba(bdev);
+	lastlba = last_lba(state->bdev);
 	if (!force_gpt) {
 		/* This will be added to the EFI Spec. per Intel after v1.02. */
 		legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL);
 		if (legacymbr) {
-			read_lba(bdev, 0, (u8 *) legacymbr,
+			read_lba(state, 0, (u8 *) legacymbr,
 				sizeof (*legacymbr));
 			good_pmbr = is_pmbr_valid(legacymbr);
 			kfree(legacymbr);
 		}
@@ -533,15 +531,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
 		goto fail;
 	}
 
-	good_pgpt = is_gpt_valid(bdev, GPT_PRIMARY_PARTITION_TABLE_LBA,
+	good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA,
 				 &pgpt, &pptes);
 	if (good_pgpt)
-		good_agpt = is_gpt_valid(bdev,
+		good_agpt = is_gpt_valid(state,
 					 le64_to_cpu(pgpt->alternate_lba),
 					 &agpt, &aptes);
 	if (!good_agpt && force_gpt)
-		good_agpt = is_gpt_valid(bdev, lastlba,
-					 &agpt, &aptes);
+		good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes);
 
 	/* The obviously unsuccessful case */
 	if (!good_pgpt && !good_agpt)
@@ -583,9 +580,8 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
 }
 
 /**
- * efi_partition(struct parsed_partitions *state, struct block_device *bdev)
+ * efi_partition(struct parsed_partitions *state)
  * @state
- * @bdev
 *
 * Description: called from check.c, if the disk contains GPT
 * partitions, sets up partition entries in the kernel.
@@ -602,15 +598,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
 * 1 if successful
 *
 */
-int
-efi_partition(struct parsed_partitions *state, struct block_device *bdev)
+int efi_partition(struct parsed_partitions *state)
 {
 	gpt_header *gpt = NULL;
 	gpt_entry *ptes = NULL;
 	u32 i;
-	unsigned ssz = bdev_logical_block_size(bdev) / 512;
+	unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
 
-	if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) {
+	if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
 		kfree(gpt);
 		kfree(ptes);
 		return 0;
@@ -623,7 +618,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
 		u64 size = le64_to_cpu(ptes[i].ending_lba) -
 			   le64_to_cpu(ptes[i].starting_lba) + 1ULL;
 
-		if (!is_pte_valid(&ptes[i], last_lba(bdev)))
+		if (!is_pte_valid(&ptes[i], last_lba(state->bdev)))
 			continue;
 
 		put_partition(state, i+1, start * ssz, size * ssz);
@@ -631,7 +626,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
 		/* If this is a RAID volume, tell md */
 		if (!efi_guidcmp(ptes[i].partition_type_guid,
 				 PARTITION_LINUX_RAID_GUID))
-			state->parts[i+1].flags = 1;
+			state->parts[i + 1].flags = ADDPART_FLAG_RAID;
 	}
 	kfree(ptes);
 	kfree(gpt);
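Swapping the bare "1" for ADDPART_FLAG_RAID is cosmetic today (the flag is
defined as 1) but documents what the consumer actually tests; roughly how
check.c uses it when registering the partition:

	#ifdef CONFIG_BLK_DEV_MD
		if (state->parts[p].flags & ADDPART_FLAG_RAID)
			md_autodetect_dev(part_to_dev(part)->devt);
	#endif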
diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h
index 6998b589abf9..b69ab729558f 100644
--- a/fs/partitions/efi.h
+++ b/fs/partitions/efi.h
@@ -110,7 +110,7 @@ typedef struct _legacy_mbr {
 } __attribute__ ((packed)) legacy_mbr;
 
 /* Functions */
-extern int efi_partition(struct parsed_partitions *state, struct block_device *bdev);
+extern int efi_partition(struct parsed_partitions *state);
 
 #endif
 
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index fc71aab08460..3e73de5967ff 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -58,9 +58,9 @@ cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) {
 
 /*
  */
-int
-ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
+int ibm_partition(struct parsed_partitions *state)
 {
+	struct block_device *bdev = state->bdev;
 	int blocksize, res;
 	loff_t i_size, offset, size, fmt_size;
 	dasd_information2_t *info;
@@ -100,7 +100,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 	/*
 	 * Get volume label, extract name and type.
 	 */
-	data = read_dev_sector(bdev, info->label_block*(blocksize/512), &sect);
+	data = read_part_sector(state, info->label_block*(blocksize/512),
+				&sect);
 	if (data == NULL)
 		goto out_readerr;
 
@@ -193,8 +194,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 		 */
 		blk = cchhb2blk(&label->vol.vtoc, geo) + 1;
 		counter = 0;
-		data = read_dev_sector(bdev, blk * (blocksize/512),
-				       &sect);
+		data = read_part_sector(state, blk * (blocksize/512),
+					&sect);
 		while (data != NULL) {
 			struct vtoc_format1_label f1;
 
@@ -208,9 +209,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 			    || f1.DS1FMTID == _ascebc['7']
 			    || f1.DS1FMTID == _ascebc['9']) {
 				blk++;
-				data = read_dev_sector(bdev, blk *
-						       (blocksize/512),
-						       &sect);
+				data = read_part_sector(state,
+					blk * (blocksize/512), &sect);
 				continue;
 			}
 
@@ -230,9 +230,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 				      size * (blocksize >> 9));
 			counter++;
 			blk++;
-			data = read_dev_sector(bdev,
-					       blk * (blocksize/512),
-					       &sect);
+			data = read_part_sector(state,
				blk * (blocksize/512), &sect);
 		}
 
 		if (!data)
diff --git a/fs/partitions/ibm.h b/fs/partitions/ibm.h
index 31f85a6ac459..08fb0804a812 100644
--- a/fs/partitions/ibm.h
+++ b/fs/partitions/ibm.h
@@ -1 +1 @@
-int ibm_partition(struct parsed_partitions *, struct block_device *);
+int ibm_partition(struct parsed_partitions *);
diff --git a/fs/partitions/karma.c b/fs/partitions/karma.c
index 176d89bcf123..1cc928bb762f 100644
--- a/fs/partitions/karma.c
+++ b/fs/partitions/karma.c
@@ -9,7 +9,7 @@
9#include "check.h" 9#include "check.h"
10#include "karma.h" 10#include "karma.h"
11 11
12int karma_partition(struct parsed_partitions *state, struct block_device *bdev) 12int karma_partition(struct parsed_partitions *state)
13{ 13{
14 int i; 14 int i;
15 int slot = 1; 15 int slot = 1;
@@ -29,7 +29,7 @@ int karma_partition(struct parsed_partitions *state, struct block_device *bdev)
29 } __attribute__((packed)) *label; 29 } __attribute__((packed)) *label;
30 struct d_partition *p; 30 struct d_partition *p;
31 31
32 data = read_dev_sector(bdev, 0, &sect); 32 data = read_part_sector(state, 0, &sect);
33 if (!data) 33 if (!data)
34 return -1; 34 return -1;
35 35
diff --git a/fs/partitions/karma.h b/fs/partitions/karma.h
index ecf7d3f2a3d8..c764b2e9df21 100644
--- a/fs/partitions/karma.h
+++ b/fs/partitions/karma.h
@@ -4,5 +4,5 @@
 
 #define KARMA_LABEL_MAGIC	0xAB56
 
-int karma_partition(struct parsed_partitions *state, struct block_device *bdev);
+int karma_partition(struct parsed_partitions *state);
 
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 8652fb99e962..648c9d8f3357 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -26,6 +26,7 @@
 #include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/stringify.h>
+#include <linux/kernel.h>
 #include "ldm.h"
 #include "check.h"
 #include "msdos.h"
@@ -77,17 +78,16 @@ static int ldm_parse_hexbyte (const u8 *src)
 	int h;
 
 	/* high part */
-	if ((x = src[0] - '0') <= '9'-'0') h = x;
-	else if ((x = src[0] - 'a') <= 'f'-'a') h = x+10;
-	else if ((x = src[0] - 'A') <= 'F'-'A') h = x+10;
-	else return -1;
-	h <<= 4;
+	x = h = hex_to_bin(src[0]);
+	if (h < 0)
+		return -1;
 
 	/* low part */
-	if ((x = src[1] - '0') <= '9'-'0') return h | x;
-	if ((x = src[1] - 'a') <= 'f'-'a') return h | (x+10);
-	if ((x = src[1] - 'A') <= 'F'-'A') return h | (x+10);
-	return -1;
+	h = hex_to_bin(src[1]);
+	if (h < 0)
+		return -1;
+
+	return (x << 4) + h;
 }
 
 /**
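The rewritten ldm_parse_hexbyte() above is easy to misread because h is
reused for both nibbles while x keeps the high nibble across the second
hex_to_bin() call. An equivalent, more explicit sketch of the same parse:

	static int parse_hexbyte_sketch(const u8 *src)
	{
		int hi, lo;

		hi = hex_to_bin(src[0]);	/* high nibble, -1 if not hex */
		if (hi < 0)
			return -1;

		lo = hex_to_bin(src[1]);	/* low nibble */
		if (lo < 0)
			return -1;

		return (hi << 4) | lo;
	}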
@@ -309,7 +309,7 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1,
 
 /**
  * ldm_validate_privheads - Compare the primary privhead with its backups
- * @bdev:  Device holding the LDM Database
+ * @state: Partition check state including device holding the LDM Database
  * @ph1:   Memory struct to fill with ph contents
  *
  * Read and compare all three privheads from disk.
@@ -321,8 +321,8 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1,
  * Return:  'true'   Success
  *          'false'  Error
  */
-static bool ldm_validate_privheads (struct block_device *bdev,
-				    struct privhead *ph1)
+static bool ldm_validate_privheads(struct parsed_partitions *state,
+				   struct privhead *ph1)
 {
 	static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 };
 	struct privhead *ph[3] = { ph1 };
@@ -332,7 +332,7 @@ static bool ldm_validate_privheads (struct block_device *bdev,
 	long num_sects;
 	int i;
 
-	BUG_ON (!bdev || !ph1);
+	BUG_ON (!state || !ph1);
 
 	ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL);
 	ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL);
@@ -346,8 +346,8 @@ static bool ldm_validate_privheads(struct parsed_partitions *state,
 
 	/* Read and parse privheads */
 	for (i = 0; i < 3; i++) {
-		data = read_dev_sector (bdev,
-					ph[0]->config_start + off[i], &sect);
+		data = read_part_sector(state, ph[0]->config_start + off[i],
+					&sect);
 		if (!data) {
 			ldm_crit ("Disk read failed.");
 			goto out;
@@ -363,7 +363,7 @@ static bool ldm_validate_privheads(struct parsed_partitions *state,
 		}
 	}
 
-	num_sects = bdev->bd_inode->i_size >> 9;
+	num_sects = state->bdev->bd_inode->i_size >> 9;
 
 	if ((ph[0]->config_start > num_sects) ||
 	    ((ph[0]->config_start + ph[0]->config_size) > num_sects)) {
@@ -397,20 +397,20 @@ out:
 
 /**
  * ldm_validate_tocblocks - Validate the table of contents and its backups
- * @bdev:  Device holding the LDM Database
- * @base:  Offset, into @bdev, of the database
+ * @state: Partition check state including device holding the LDM Database
+ * @base:  Offset, into @state->bdev, of the database
  * @ldb:   Cache of the database structures
 *
 * Find and compare the four tables of contents of the LDM Database stored on
- * @bdev and return the parsed information into @toc1.
+ * @state->bdev and return the parsed information into @toc1.
 *
 * The offsets and sizes of the configs are range-checked against a privhead.
 *
 * Return:  'true'   @toc1 contains validated TOCBLOCK info
 *          'false'  @toc1 contents are undefined
 */
-static bool ldm_validate_tocblocks(struct block_device *bdev,
-				   unsigned long base, struct ldmdb *ldb)
+static bool ldm_validate_tocblocks(struct parsed_partitions *state,
+				   unsigned long base, struct ldmdb *ldb)
 {
 	static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4};
 	struct tocblock *tb[4];
@@ -420,7 +420,7 @@ static bool ldm_validate_tocblocks(struct parsed_partitions *state,
 	int i, nr_tbs;
 	bool result = false;
 
-	BUG_ON(!bdev || !ldb);
+	BUG_ON(!state || !ldb);
 	ph = &ldb->ph;
 	tb[0] = &ldb->toc;
 	tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL);
@@ -437,7 +437,7 @@ static bool ldm_validate_tocblocks(struct parsed_partitions *state,
 	 * skip any that fail as long as we get at least one valid TOCBLOCK.
 	 */
 	for (nr_tbs = i = 0; i < 4; i++) {
-		data = read_dev_sector(bdev, base + off[i], &sect);
+		data = read_part_sector(state, base + off[i], &sect);
 		if (!data) {
 			ldm_error("Disk read failed for TOCBLOCK %d.", i);
 			continue;
@@ -473,7 +473,7 @@ err:
 
 /**
  * ldm_validate_vmdb - Read the VMDB and validate it
- * @bdev:  Device holding the LDM Database
+ * @state: Partition check state including device holding the LDM Database
  * @base:  Offset, into @bdev, of the database
  * @ldb:   Cache of the database structures
 *
@@ -483,8 +483,8 @@ err:
 * Return:  'true'   @ldb contains validated VBDB info
 *          'false'  @ldb contents are undefined
 */
-static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base,
-			       struct ldmdb *ldb)
+static bool ldm_validate_vmdb(struct parsed_partitions *state,
+			      unsigned long base, struct ldmdb *ldb)
 {
 	Sector sect;
 	u8 *data;
@@ -492,12 +492,12 @@ static bool ldm_validate_vmdb(struct parsed_partitions *state,
 	struct vmdb *vm;
 	struct tocblock *toc;
 
-	BUG_ON (!bdev || !ldb);
+	BUG_ON (!state || !ldb);
 
 	vm = &ldb->vm;
 	toc = &ldb->toc;
 
-	data = read_dev_sector (bdev, base + OFF_VMDB, &sect);
+	data = read_part_sector(state, base + OFF_VMDB, &sect);
 	if (!data) {
 		ldm_crit ("Disk read failed.");
 		return false;
@@ -534,21 +534,21 @@ out:
 
 /**
  * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk
- * @bdev:  Device holding the LDM Database
+ * @state: Partition check state including device holding the LDM Database
326{ 326{
327 static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 }; 327 static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 };
328 struct privhead *ph[3] = { ph1 }; 328 struct privhead *ph[3] = { ph1 };
@@ -332,7 +332,7 @@ static bool ldm_validate_privheads (struct block_device *bdev,
332 long num_sects; 332 long num_sects;
333 int i; 333 int i;
334 334
335 BUG_ON (!bdev || !ph1); 335 BUG_ON (!state || !ph1);
336 336
337 ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL); 337 ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL);
338 ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL); 338 ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL);
@@ -346,8 +346,8 @@ static bool ldm_validate_privheads (struct block_device *bdev,
346 346
347 /* Read and parse privheads */ 347 /* Read and parse privheads */
348 for (i = 0; i < 3; i++) { 348 for (i = 0; i < 3; i++) {
349 data = read_dev_sector (bdev, 349 data = read_part_sector(state, ph[0]->config_start + off[i],
350 ph[0]->config_start + off[i], &sect); 350 &sect);
351 if (!data) { 351 if (!data) {
352 ldm_crit ("Disk read failed."); 352 ldm_crit ("Disk read failed.");
353 goto out; 353 goto out;
@@ -363,7 +363,7 @@ static bool ldm_validate_privheads (struct block_device *bdev,
363 } 363 }
364 } 364 }
365 365
366 num_sects = bdev->bd_inode->i_size >> 9; 366 num_sects = state->bdev->bd_inode->i_size >> 9;
367 367
368 if ((ph[0]->config_start > num_sects) || 368 if ((ph[0]->config_start > num_sects) ||
369 ((ph[0]->config_start + ph[0]->config_size) > num_sects)) { 369 ((ph[0]->config_start + ph[0]->config_size) > num_sects)) {
@@ -397,20 +397,20 @@ out:
397 397
398/** 398/**
399 * ldm_validate_tocblocks - Validate the table of contents and its backups 399 * ldm_validate_tocblocks - Validate the table of contents and its backups
400 * @bdev: Device holding the LDM Database 400 * @state: Partition check state including device holding the LDM Database
401 * @base: Offset, into @bdev, of the database 401 * @base: Offset, into @state->bdev, of the database
402 * @ldb: Cache of the database structures 402 * @ldb: Cache of the database structures
403 * 403 *
404 * Find and compare the four tables of contents of the LDM Database stored on 404 * Find and compare the four tables of contents of the LDM Database stored on
405 * @bdev and return the parsed information into @toc1. 405 * @state->bdev and return the parsed information into @toc1.
406 * 406 *
407 * The offsets and sizes of the configs are range-checked against a privhead. 407 * The offsets and sizes of the configs are range-checked against a privhead.
408 * 408 *
409 * Return: 'true' @toc1 contains validated TOCBLOCK info 409 * Return: 'true' @toc1 contains validated TOCBLOCK info
410 * 'false' @toc1 contents are undefined 410 * 'false' @toc1 contents are undefined
411 */ 411 */
412static bool ldm_validate_tocblocks(struct block_device *bdev, 412static bool ldm_validate_tocblocks(struct parsed_partitions *state,
413 unsigned long base, struct ldmdb *ldb) 413 unsigned long base, struct ldmdb *ldb)
414{ 414{
415 static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4}; 415 static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4};
416 struct tocblock *tb[4]; 416 struct tocblock *tb[4];
@@ -420,7 +420,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev,
420 int i, nr_tbs; 420 int i, nr_tbs;
421 bool result = false; 421 bool result = false;
422 422
423 BUG_ON(!bdev || !ldb); 423 BUG_ON(!state || !ldb);
424 ph = &ldb->ph; 424 ph = &ldb->ph;
425 tb[0] = &ldb->toc; 425 tb[0] = &ldb->toc;
426 tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL); 426 tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL);
@@ -437,7 +437,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev,
437 * skip any that fail as long as we get at least one valid TOCBLOCK. 437 * skip any that fail as long as we get at least one valid TOCBLOCK.
438 */ 438 */
439 for (nr_tbs = i = 0; i < 4; i++) { 439 for (nr_tbs = i = 0; i < 4; i++) {
440 data = read_dev_sector(bdev, base + off[i], &sect); 440 data = read_part_sector(state, base + off[i], &sect);
441 if (!data) { 441 if (!data) {
442 ldm_error("Disk read failed for TOCBLOCK %d.", i); 442 ldm_error("Disk read failed for TOCBLOCK %d.", i);
443 continue; 443 continue;
@@ -473,7 +473,7 @@ err:
473 473
474/** 474/**
475 * ldm_validate_vmdb - Read the VMDB and validate it 475 * ldm_validate_vmdb - Read the VMDB and validate it
476 * @bdev: Device holding the LDM Database 476 * @state: Partition check state including device holding the LDM Database
477 * @base: Offset, into @bdev, of the database 477 * @base: Offset, into @state->bdev, of the database
478 * @ldb: Cache of the database structures 478 * @ldb: Cache of the database structures
479 * 479 *
@@ -483,8 +483,8 @@ err:
483 * Return: 'true' @ldb contains validated VBDB info 483 * Return: 'true' @ldb contains validated VBDB info
484 * 'false' @ldb contents are undefined 484 * 'false' @ldb contents are undefined
485 */ 485 */
486static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base, 486static bool ldm_validate_vmdb(struct parsed_partitions *state,
487 struct ldmdb *ldb) 487 unsigned long base, struct ldmdb *ldb)
488{ 488{
489 Sector sect; 489 Sector sect;
490 u8 *data; 490 u8 *data;
@@ -492,12 +492,12 @@ static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base,
492 struct vmdb *vm; 492 struct vmdb *vm;
493 struct tocblock *toc; 493 struct tocblock *toc;
494 494
495 BUG_ON (!bdev || !ldb); 495 BUG_ON (!state || !ldb);
496 496
497 vm = &ldb->vm; 497 vm = &ldb->vm;
498 toc = &ldb->toc; 498 toc = &ldb->toc;
499 499
500 data = read_dev_sector (bdev, base + OFF_VMDB, &sect); 500 data = read_part_sector(state, base + OFF_VMDB, &sect);
501 if (!data) { 501 if (!data) {
502 ldm_crit ("Disk read failed."); 502 ldm_crit ("Disk read failed.");
503 return false; 503 return false;
@@ -534,21 +534,21 @@ out:
534 534
535/** 535/**
536 * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk 536 * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk
537 * @bdev: Device holding the LDM Database 537 * @state: Partition check state including device holding the LDM Database
538 * 538 *
539 * This function provides a weak test to decide whether the device is a dynamic 539 * This function provides a weak test to decide whether the device is a dynamic
540 * disk or not. It looks for an MS-DOS-style partition table containing at 540 * disk or not. It looks for an MS-DOS-style partition table containing at
541 * least one partition of type 0x42 (formerly SFS, now used by Windows for 541 * least one partition of type 0x42 (formerly SFS, now used by Windows for
542 * dynamic disks). 542 * dynamic disks).
543 * 543 *
544 * N.B. The only possible error can come from the read_dev_sector and that is 544 * N.B. The only possible error can come from the read_part_sector and that is
545 * only likely to happen if the underlying device is strange. If that IS 545 * only likely to happen if the underlying device is strange. If that IS
546 * the case we should return zero to let someone else try. 546 * the case we should return zero to let someone else try.
547 * 547 *
548 * Return: 'true' @bdev is a dynamic disk 548 * Return: 'true' @state->bdev is a dynamic disk
549 * 'false' @bdev is not a dynamic disk, or an error occurred 549 * 'false' @state->bdev is not a dynamic disk, or an error occurred
550 */ 550 */
551static bool ldm_validate_partition_table (struct block_device *bdev) 551static bool ldm_validate_partition_table(struct parsed_partitions *state)
552{ 552{
553 Sector sect; 553 Sector sect;
554 u8 *data; 554 u8 *data;
@@ -556,9 +556,9 @@ static bool ldm_validate_partition_table (struct block_device *bdev)
556 int i; 556 int i;
557 bool result = false; 557 bool result = false;
558 558
559 BUG_ON (!bdev); 559 BUG_ON(!state);
560 560
561 data = read_dev_sector (bdev, 0, &sect); 561 data = read_part_sector(state, 0, &sect);
562 if (!data) { 562 if (!data) {
563 ldm_crit ("Disk read failed."); 563 ldm_crit ("Disk read failed.");
564 return false; 564 return false;
@@ -1391,8 +1391,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
1391 1391
1392/** 1392/**
1393 * ldm_get_vblks - Read the on-disk database of VBLKs into memory 1393 * ldm_get_vblks - Read the on-disk database of VBLKs into memory
1394 * @bdev: Device holding the LDM Database 1394 * @state: Partition check state including device holding the LDM Database
1395 * @base: Offset, into @bdev, of the database 1395 * @base: Offset, into @state->bdev, of the database
1396 * @ldb: Cache of the database structures 1396 * @ldb: Cache of the database structures
1397 * 1397 *
1398 * To use the information from the VBLKs, they need to be read from the disk, 1398 * To use the information from the VBLKs, they need to be read from the disk,
@@ -1401,8 +1401,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
1401 * Return: 'true' All the VBLKs were read successfully 1401 * Return: 'true' All the VBLKs were read successfully
1402 * 'false' An error occurred 1402 * 'false' An error occurred
1403 */ 1403 */
1404static bool ldm_get_vblks (struct block_device *bdev, unsigned long base, 1404static bool ldm_get_vblks(struct parsed_partitions *state, unsigned long base,
1405 struct ldmdb *ldb) 1405 struct ldmdb *ldb)
1406{ 1406{
1407 int size, perbuf, skip, finish, s, v, recs; 1407 int size, perbuf, skip, finish, s, v, recs;
1408 u8 *data = NULL; 1408 u8 *data = NULL;
@@ -1410,7 +1410,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base,
1410 bool result = false; 1410 bool result = false;
1411 LIST_HEAD (frags); 1411 LIST_HEAD (frags);
1412 1412
1413 BUG_ON (!bdev || !ldb); 1413 BUG_ON(!state || !ldb);
1414 1414
1415 size = ldb->vm.vblk_size; 1415 size = ldb->vm.vblk_size;
1416 perbuf = 512 / size; 1416 perbuf = 512 / size;
@@ -1418,7 +1418,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base,
1418 finish = (size * ldb->vm.last_vblk_seq) >> 9; 1418 finish = (size * ldb->vm.last_vblk_seq) >> 9;
1419 1419
1420 for (s = skip; s < finish; s++) { /* For each sector */ 1420 for (s = skip; s < finish; s++) { /* For each sector */
1421 data = read_dev_sector (bdev, base + OFF_VMDB + s, &sect); 1421 data = read_part_sector(state, base + OFF_VMDB + s, &sect);
1422 if (!data) { 1422 if (!data) {
1423 ldm_crit ("Disk read failed."); 1423 ldm_crit ("Disk read failed.");
1424 goto out; 1424 goto out;
@@ -1474,8 +1474,7 @@ static void ldm_free_vblks (struct list_head *lh)
1474 1474
1475/** 1475/**
1476 * ldm_partition - Find out whether a device is a dynamic disk and handle it 1476 * ldm_partition - Find out whether a device is a dynamic disk and handle it
1477 * @pp: List of the partitions parsed so far 1477 * @state: Partition check state including device holding the LDM Database
1478 * @bdev: Device holding the LDM Database
1479 * 1478 *
1480 * This determines whether the device @bdev is a dynamic disk and if so creates 1479 * This determines whether the device @state->bdev is a dynamic disk and if so creates
1481 * the partitions necessary in the gendisk structure pointed to by @hd. 1480 * the partitions necessary in the gendisk structure pointed to by @hd.
@@ -1485,21 +1484,21 @@ static void ldm_free_vblks (struct list_head *lh)
1485 * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3, 1484 * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3,
1486 * and so on: the actual data containing partitions. 1485 * and so on: the actual data containing partitions.
1487 * 1486 *
1488 * Return: 1 Success, @bdev is a dynamic disk and we handled it 1487 * Return: 1 Success, @state->bdev is a dynamic disk and we handled it
1489 * 0 Success, @bdev is not a dynamic disk 1488 * 0 Success, @state->bdev is not a dynamic disk
1490 * -1 An error occurred before enough information had been read 1489 * -1 An error occurred before enough information had been read
1491 * Or @bdev is a dynamic disk, but it may be corrupted 1490 * Or @state->bdev is a dynamic disk, but it may be corrupted
1492 */ 1491 */
1493int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev) 1492int ldm_partition(struct parsed_partitions *state)
1494{ 1493{
1495 struct ldmdb *ldb; 1494 struct ldmdb *ldb;
1496 unsigned long base; 1495 unsigned long base;
1497 int result = -1; 1496 int result = -1;
1498 1497
1499 BUG_ON (!pp || !bdev); 1498 BUG_ON(!state);
1500 1499
1501 /* Look for signs of a Dynamic Disk */ 1500 /* Look for signs of a Dynamic Disk */
1502 if (!ldm_validate_partition_table (bdev)) 1501 if (!ldm_validate_partition_table(state))
1503 return 0; 1502 return 0;
1504 1503
1505 ldb = kmalloc (sizeof (*ldb), GFP_KERNEL); 1504 ldb = kmalloc (sizeof (*ldb), GFP_KERNEL);
@@ -1509,15 +1508,15 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev)
1509 } 1508 }
1510 1509
1511 /* Parse and check privheads. */ 1510 /* Parse and check privheads. */
1512 if (!ldm_validate_privheads (bdev, &ldb->ph)) 1511 if (!ldm_validate_privheads(state, &ldb->ph))
1513 goto out; /* Already logged */ 1512 goto out; /* Already logged */
1514 1513
1515 /* All further references are relative to base (database start). */ 1514 /* All further references are relative to base (database start). */
1516 base = ldb->ph.config_start; 1515 base = ldb->ph.config_start;
1517 1516
1518 /* Parse and check tocs and vmdb. */ 1517 /* Parse and check tocs and vmdb. */
1519 if (!ldm_validate_tocblocks (bdev, base, ldb) || 1518 if (!ldm_validate_tocblocks(state, base, ldb) ||
1520 !ldm_validate_vmdb (bdev, base, ldb)) 1519 !ldm_validate_vmdb(state, base, ldb))
1521 goto out; /* Already logged */ 1520 goto out; /* Already logged */
1522 1521
1523 /* Initialize vblk lists in ldmdb struct */ 1522 /* Initialize vblk lists in ldmdb struct */
@@ -1527,13 +1526,13 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev)
1527 INIT_LIST_HEAD (&ldb->v_comp); 1526 INIT_LIST_HEAD (&ldb->v_comp);
1528 INIT_LIST_HEAD (&ldb->v_part); 1527 INIT_LIST_HEAD (&ldb->v_part);
1529 1528
1530 if (!ldm_get_vblks (bdev, base, ldb)) { 1529 if (!ldm_get_vblks(state, base, ldb)) {
1531 ldm_crit ("Failed to read the VBLKs from the database."); 1530 ldm_crit ("Failed to read the VBLKs from the database.");
1532 goto cleanup; 1531 goto cleanup;
1533 } 1532 }
1534 1533
1535 /* Finally, create the data partition devices. */ 1534 /* Finally, create the data partition devices. */
1536 if (ldm_create_data_partitions (pp, ldb)) { 1535 if (ldm_create_data_partitions(state, ldb)) {
1537 ldm_debug ("Parsed LDM database successfully."); 1536 ldm_debug ("Parsed LDM database successfully.");
1538 result = 1; 1537 result = 1;
1539 } 1538 }
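The conversion throughout ldm.c above is mechanical: every read_dev_sector(bdev, ...) becomes read_part_sector(state, ...), so the parser only touches the device through the shared partition-check state. The wrapper itself is never shown in these hunks; at this point in the series it is little more than the sketch below (it lives in fs/partitions/check.h; the exact body is assumed here, and later patches in the series extend it to range-check reads against the disk capacity).

	static inline unsigned char *read_part_sector(struct parsed_partitions *state,
						      sector_t n, Sector *p)
	{
		/* All label-parser sector I/O now funnels through the check state. */
		return read_dev_sector(state->bdev, n, p);
	}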
diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h
index 30e08e809c1d..d1fb50b28d86 100644
--- a/fs/partitions/ldm.h
+++ b/fs/partitions/ldm.h
@@ -209,7 +209,7 @@ struct ldmdb { /* Cache of the database */
209 struct list_head v_part; 209 struct list_head v_part;
210}; 210};
211 211
212int ldm_partition (struct parsed_partitions *state, struct block_device *bdev); 212int ldm_partition(struct parsed_partitions *state);
213 213
214#endif /* _FS_PT_LDM_H_ */ 214#endif /* _FS_PT_LDM_H_ */
215 215
diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c
index d4a0fad3563b..74465ff7c263 100644
--- a/fs/partitions/mac.c
+++ b/fs/partitions/mac.c
@@ -27,7 +27,7 @@ static inline void mac_fix_string(char *stg, int len)
27 stg[i] = 0; 27 stg[i] = 0;
28} 28}
29 29
30int mac_partition(struct parsed_partitions *state, struct block_device *bdev) 30int mac_partition(struct parsed_partitions *state)
31{ 31{
32 int slot = 1; 32 int slot = 1;
33 Sector sect; 33 Sector sect;
@@ -42,7 +42,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
42 struct mac_driver_desc *md; 42 struct mac_driver_desc *md;
43 43
44 /* Get 0th block and look at the first partition map entry. */ 44 /* Get 0th block and look at the first partition map entry. */
45 md = (struct mac_driver_desc *) read_dev_sector(bdev, 0, &sect); 45 md = read_part_sector(state, 0, &sect);
46 if (!md) 46 if (!md)
47 return -1; 47 return -1;
48 if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) { 48 if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) {
@@ -51,7 +51,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
51 } 51 }
52 secsize = be16_to_cpu(md->block_size); 52 secsize = be16_to_cpu(md->block_size);
53 put_dev_sector(sect); 53 put_dev_sector(sect);
54 data = read_dev_sector(bdev, secsize/512, &sect); 54 data = read_part_sector(state, secsize/512, &sect);
55 if (!data) 55 if (!data)
56 return -1; 56 return -1;
57 part = (struct mac_partition *) (data + secsize%512); 57 part = (struct mac_partition *) (data + secsize%512);
@@ -64,7 +64,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
64 for (blk = 1; blk <= blocks_in_map; ++blk) { 64 for (blk = 1; blk <= blocks_in_map; ++blk) {
65 int pos = blk * secsize; 65 int pos = blk * secsize;
66 put_dev_sector(sect); 66 put_dev_sector(sect);
67 data = read_dev_sector(bdev, pos/512, &sect); 67 data = read_part_sector(state, pos/512, &sect);
68 if (!data) 68 if (!data)
69 return -1; 69 return -1;
70 part = (struct mac_partition *) (data + pos%512); 70 part = (struct mac_partition *) (data + pos%512);
@@ -75,7 +75,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
75 be32_to_cpu(part->block_count) * (secsize/512)); 75 be32_to_cpu(part->block_count) * (secsize/512));
76 76
77 if (!strnicmp(part->type, "Linux_RAID", 10)) 77 if (!strnicmp(part->type, "Linux_RAID", 10))
78 state->parts[slot].flags = 1; 78 state->parts[slot].flags = ADDPART_FLAG_RAID;
79#ifdef CONFIG_PPC_PMAC 79#ifdef CONFIG_PPC_PMAC
80 /* 80 /*
81 * If this is the first bootable partition, tell the 81 * If this is the first bootable partition, tell the
@@ -123,7 +123,8 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
123 } 123 }
124#ifdef CONFIG_PPC_PMAC 124#ifdef CONFIG_PPC_PMAC
125 if (found_root_goodness) 125 if (found_root_goodness)
126 note_bootable_part(bdev->bd_dev, found_root, found_root_goodness); 126 note_bootable_part(state->bdev->bd_dev, found_root,
127 found_root_goodness);
127#endif 128#endif
128 129
129 put_dev_sector(sect); 130 put_dev_sector(sect);
diff --git a/fs/partitions/mac.h b/fs/partitions/mac.h
index bbf26e1386fa..3c7d98436380 100644
--- a/fs/partitions/mac.h
+++ b/fs/partitions/mac.h
@@ -41,4 +41,4 @@ struct mac_driver_desc {
41 /* ... more stuff */ 41 /* ... more stuff */
42}; 42};
43 43
44int mac_partition(struct parsed_partitions *state, struct block_device *bdev); 44int mac_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 90be97f1f5a8..15bfb7b1e044 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -64,7 +64,7 @@ msdos_magic_present(unsigned char *p)
64#define AIX_LABEL_MAGIC2 0xC2 64#define AIX_LABEL_MAGIC2 0xC2
65#define AIX_LABEL_MAGIC3 0xD4 65#define AIX_LABEL_MAGIC3 0xD4
66#define AIX_LABEL_MAGIC4 0xC1 66#define AIX_LABEL_MAGIC4 0xC1
67static int aix_magic_present(unsigned char *p, struct block_device *bdev) 67static int aix_magic_present(struct parsed_partitions *state, unsigned char *p)
68{ 68{
69 struct partition *pt = (struct partition *) (p + 0x1be); 69 struct partition *pt = (struct partition *) (p + 0x1be);
70 Sector sect; 70 Sector sect;
@@ -85,7 +85,7 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
85 is_extended_partition(pt)) 85 is_extended_partition(pt))
86 return 0; 86 return 0;
87 } 87 }
88 d = read_dev_sector(bdev, 7, &sect); 88 d = read_part_sector(state, 7, &sect);
89 if (d) { 89 if (d) {
90 if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') 90 if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M')
91 ret = 1; 91 ret = 1;
@@ -105,15 +105,14 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
105 * only for the actual data partitions. 105 * only for the actual data partitions.
106 */ 106 */
107 107
108static void 108static void parse_extended(struct parsed_partitions *state,
109parse_extended(struct parsed_partitions *state, struct block_device *bdev, 109 sector_t first_sector, sector_t first_size)
110 sector_t first_sector, sector_t first_size)
111{ 110{
112 struct partition *p; 111 struct partition *p;
113 Sector sect; 112 Sector sect;
114 unsigned char *data; 113 unsigned char *data;
115 sector_t this_sector, this_size; 114 sector_t this_sector, this_size;
116 sector_t sector_size = bdev_logical_block_size(bdev) / 512; 115 sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
117 int loopct = 0; /* number of links followed 116 int loopct = 0; /* number of links followed
118 without finding a data partition */ 117 without finding a data partition */
119 int i; 118 int i;
@@ -126,7 +125,7 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
126 return; 125 return;
127 if (state->next == state->limit) 126 if (state->next == state->limit)
128 return; 127 return;
129 data = read_dev_sector(bdev, this_sector, &sect); 128 data = read_part_sector(state, this_sector, &sect);
130 if (!data) 129 if (!data)
131 return; 130 return;
132 131
@@ -198,9 +197,8 @@ done:
198/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also 197/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also
199 indicates linux swap. Be careful before believing this is Solaris. */ 198 indicates linux swap. Be careful before believing this is Solaris. */
200 199
201static void 200static void parse_solaris_x86(struct parsed_partitions *state,
202parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev, 201 sector_t offset, sector_t size, int origin)
203 sector_t offset, sector_t size, int origin)
204{ 202{
205#ifdef CONFIG_SOLARIS_X86_PARTITION 203#ifdef CONFIG_SOLARIS_X86_PARTITION
206 Sector sect; 204 Sector sect;
@@ -208,7 +206,7 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
208 int i; 206 int i;
209 short max_nparts; 207 short max_nparts;
210 208
211 v = (struct solaris_x86_vtoc *)read_dev_sector(bdev, offset+1, &sect); 209 v = read_part_sector(state, offset + 1, &sect);
212 if (!v) 210 if (!v)
213 return; 211 return;
214 if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) { 212 if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) {
@@ -245,16 +243,15 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
245 * Create devices for BSD partitions listed in a disklabel, under a 243 * Create devices for BSD partitions listed in a disklabel, under a
246 * dos-like partition. See parse_extended() for more information. 244 * dos-like partition. See parse_extended() for more information.
247 */ 245 */
248static void 246static void parse_bsd(struct parsed_partitions *state,
249parse_bsd(struct parsed_partitions *state, struct block_device *bdev, 247 sector_t offset, sector_t size, int origin, char *flavour,
250 sector_t offset, sector_t size, int origin, char *flavour, 248 int max_partitions)
251 int max_partitions)
252{ 249{
253 Sector sect; 250 Sector sect;
254 struct bsd_disklabel *l; 251 struct bsd_disklabel *l;
255 struct bsd_partition *p; 252 struct bsd_partition *p;
256 253
257 l = (struct bsd_disklabel *)read_dev_sector(bdev, offset+1, &sect); 254 l = read_part_sector(state, offset + 1, &sect);
258 if (!l) 255 if (!l)
259 return; 256 return;
260 if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) { 257 if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) {
@@ -291,33 +288,28 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
291} 288}
292#endif 289#endif
293 290
294static void 291static void parse_freebsd(struct parsed_partitions *state,
295parse_freebsd(struct parsed_partitions *state, struct block_device *bdev, 292 sector_t offset, sector_t size, int origin)
296 sector_t offset, sector_t size, int origin)
297{ 293{
298#ifdef CONFIG_BSD_DISKLABEL 294#ifdef CONFIG_BSD_DISKLABEL
299 parse_bsd(state, bdev, offset, size, origin, 295 parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS);
300 "bsd", BSD_MAXPARTITIONS);
301#endif 296#endif
302} 297}
303 298
304static void 299static void parse_netbsd(struct parsed_partitions *state,
305parse_netbsd(struct parsed_partitions *state, struct block_device *bdev, 300 sector_t offset, sector_t size, int origin)
306 sector_t offset, sector_t size, int origin)
307{ 301{
308#ifdef CONFIG_BSD_DISKLABEL 302#ifdef CONFIG_BSD_DISKLABEL
309 parse_bsd(state, bdev, offset, size, origin, 303 parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS);
310 "netbsd", BSD_MAXPARTITIONS);
311#endif 304#endif
312} 305}
313 306
314static void 307static void parse_openbsd(struct parsed_partitions *state,
315parse_openbsd(struct parsed_partitions *state, struct block_device *bdev, 308 sector_t offset, sector_t size, int origin)
316 sector_t offset, sector_t size, int origin)
317{ 309{
318#ifdef CONFIG_BSD_DISKLABEL 310#ifdef CONFIG_BSD_DISKLABEL
319 parse_bsd(state, bdev, offset, size, origin, 311 parse_bsd(state, offset, size, origin, "openbsd",
320 "openbsd", OPENBSD_MAXPARTITIONS); 312 OPENBSD_MAXPARTITIONS);
321#endif 313#endif
322} 314}
323 315
@@ -325,16 +317,15 @@ parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
325 * Create devices for Unixware partitions listed in a disklabel, under a 317 * Create devices for Unixware partitions listed in a disklabel, under a
326 * dos-like partition. See parse_extended() for more information. 318 * dos-like partition. See parse_extended() for more information.
327 */ 319 */
328static void 320static void parse_unixware(struct parsed_partitions *state,
329parse_unixware(struct parsed_partitions *state, struct block_device *bdev, 321 sector_t offset, sector_t size, int origin)
330 sector_t offset, sector_t size, int origin)
331{ 322{
332#ifdef CONFIG_UNIXWARE_DISKLABEL 323#ifdef CONFIG_UNIXWARE_DISKLABEL
333 Sector sect; 324 Sector sect;
334 struct unixware_disklabel *l; 325 struct unixware_disklabel *l;
335 struct unixware_slice *p; 326 struct unixware_slice *p;
336 327
337 l = (struct unixware_disklabel *)read_dev_sector(bdev, offset+29, &sect); 328 l = read_part_sector(state, offset + 29, &sect);
338 if (!l) 329 if (!l)
339 return; 330 return;
340 if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC || 331 if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC ||
@@ -365,9 +356,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
365 * Anand Krishnamurthy <anandk@wiproge.med.ge.com> 356 * Anand Krishnamurthy <anandk@wiproge.med.ge.com>
366 * Rajeev V. Pillai <rajeevvp@yahoo.com> 357 * Rajeev V. Pillai <rajeevvp@yahoo.com>
367 */ 358 */
368static void 359static void parse_minix(struct parsed_partitions *state,
369parse_minix(struct parsed_partitions *state, struct block_device *bdev, 360 sector_t offset, sector_t size, int origin)
370 sector_t offset, sector_t size, int origin)
371{ 361{
372#ifdef CONFIG_MINIX_SUBPARTITION 362#ifdef CONFIG_MINIX_SUBPARTITION
373 Sector sect; 363 Sector sect;
@@ -375,7 +365,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
375 struct partition *p; 365 struct partition *p;
376 int i; 366 int i;
377 367
378 data = read_dev_sector(bdev, offset, &sect); 368 data = read_part_sector(state, offset, &sect);
379 if (!data) 369 if (!data)
380 return; 370 return;
381 371
@@ -404,8 +394,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
404 394
405static struct { 395static struct {
406 unsigned char id; 396 unsigned char id;
407 void (*parse)(struct parsed_partitions *, struct block_device *, 397 void (*parse)(struct parsed_partitions *, sector_t, sector_t, int);
408 sector_t, sector_t, int);
409} subtypes[] = { 398} subtypes[] = {
410 {FREEBSD_PARTITION, parse_freebsd}, 399 {FREEBSD_PARTITION, parse_freebsd},
411 {NETBSD_PARTITION, parse_netbsd}, 400 {NETBSD_PARTITION, parse_netbsd},
@@ -417,16 +406,16 @@ static struct {
417 {0, NULL}, 406 {0, NULL},
418}; 407};
419 408
420int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) 409int msdos_partition(struct parsed_partitions *state)
421{ 410{
422 sector_t sector_size = bdev_logical_block_size(bdev) / 512; 411 sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
423 Sector sect; 412 Sector sect;
424 unsigned char *data; 413 unsigned char *data;
425 struct partition *p; 414 struct partition *p;
426 struct fat_boot_sector *fb; 415 struct fat_boot_sector *fb;
427 int slot; 416 int slot;
428 417
429 data = read_dev_sector(bdev, 0, &sect); 418 data = read_part_sector(state, 0, &sect);
430 if (!data) 419 if (!data)
431 return -1; 420 return -1;
432 if (!msdos_magic_present(data + 510)) { 421 if (!msdos_magic_present(data + 510)) {
@@ -434,7 +423,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
434 return 0; 423 return 0;
435 } 424 }
436 425
437 if (aix_magic_present(data, bdev)) { 426 if (aix_magic_present(state, data)) {
438 put_dev_sector(sect); 427 put_dev_sector(sect);
439 printk( " [AIX]"); 428 printk( " [AIX]");
440 return 0; 429 return 0;
@@ -503,13 +492,13 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
503 put_partition(state, slot, start, n); 492 put_partition(state, slot, start, n);
504 493
505 printk(" <"); 494 printk(" <");
506 parse_extended(state, bdev, start, size); 495 parse_extended(state, start, size);
507 printk(" >"); 496 printk(" >");
508 continue; 497 continue;
509 } 498 }
510 put_partition(state, slot, start, size); 499 put_partition(state, slot, start, size);
511 if (SYS_IND(p) == LINUX_RAID_PARTITION) 500 if (SYS_IND(p) == LINUX_RAID_PARTITION)
512 state->parts[slot].flags = 1; 501 state->parts[slot].flags = ADDPART_FLAG_RAID;
513 if (SYS_IND(p) == DM6_PARTITION) 502 if (SYS_IND(p) == DM6_PARTITION)
514 printk("[DM]"); 503 printk("[DM]");
515 if (SYS_IND(p) == EZD_PARTITION) 504 if (SYS_IND(p) == EZD_PARTITION)
@@ -532,8 +521,8 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
532 521
533 if (!subtypes[n].parse) 522 if (!subtypes[n].parse)
534 continue; 523 continue;
535 subtypes[n].parse(state, bdev, start_sect(p)*sector_size, 524 subtypes[n].parse(state, start_sect(p) * sector_size,
536 nr_sects(p)*sector_size, slot); 525 nr_sects(p) * sector_size, slot);
537 } 526 }
538 put_dev_sector(sect); 527 put_dev_sector(sect);
539 return 1; 528 return 1;
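With bdev dropped from the sub-parser signature, the subtypes[] dispatch above reduces to a table keyed by partition id whose entries need only the shared state plus geometry. A sketch of that dispatch, with the table contents abbreviated for illustration:

	typedef void (*subtype_parse_t)(struct parsed_partitions *,
					sector_t, sector_t, int);

	static const struct {
		unsigned char id;
		subtype_parse_t parse;
	} subtypes[] = {
		{FREEBSD_PARTITION, parse_freebsd},
		{MINIX_PARTITION, parse_minix},
		{0, NULL},
	};

	static void parse_subtype(struct parsed_partitions *state, unsigned char id,
				  sector_t start, sector_t size, int slot)
	{
		int n;

		for (n = 0; subtypes[n].parse; n++) {
			if (subtypes[n].id != id)
				continue;
			subtypes[n].parse(state, start, size, slot);
			break;
		}
	}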
diff --git a/fs/partitions/msdos.h b/fs/partitions/msdos.h
index 01e5e0b6902d..38c781c490b3 100644
--- a/fs/partitions/msdos.h
+++ b/fs/partitions/msdos.h
@@ -4,5 +4,5 @@
4 4
5#define MSDOS_LABEL_MAGIC 0xAA55 5#define MSDOS_LABEL_MAGIC 0xAA55
6 6
7int msdos_partition(struct parsed_partitions *state, struct block_device *bdev); 7int msdos_partition(struct parsed_partitions *state);
8 8
diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c
index c05c17bc5df3..fc22b85d436a 100644
--- a/fs/partitions/osf.c
+++ b/fs/partitions/osf.c
@@ -10,7 +10,7 @@
10#include "check.h" 10#include "check.h"
11#include "osf.h" 11#include "osf.h"
12 12
13int osf_partition(struct parsed_partitions *state, struct block_device *bdev) 13int osf_partition(struct parsed_partitions *state)
14{ 14{
15 int i; 15 int i;
16 int slot = 1; 16 int slot = 1;
@@ -49,7 +49,7 @@ int osf_partition(struct parsed_partitions *state, struct block_device *bdev)
49 } * label; 49 } * label;
50 struct d_partition * partition; 50 struct d_partition * partition;
51 51
52 data = read_dev_sector(bdev, 0, &sect); 52 data = read_part_sector(state, 0, &sect);
53 if (!data) 53 if (!data)
54 return -1; 54 return -1;
55 55
diff --git a/fs/partitions/osf.h b/fs/partitions/osf.h
index 427b8eab314b..20ed2315ec16 100644
--- a/fs/partitions/osf.h
+++ b/fs/partitions/osf.h
@@ -4,4 +4,4 @@
4 4
5#define DISKLABELMAGIC (0x82564557UL) 5#define DISKLABELMAGIC (0x82564557UL)
6 6
7int osf_partition(struct parsed_partitions *state, struct block_device *bdev); 7int osf_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/sgi.c b/fs/partitions/sgi.c
index ed5ac83fe83a..43b1df9aa16c 100644
--- a/fs/partitions/sgi.c
+++ b/fs/partitions/sgi.c
@@ -27,7 +27,7 @@ struct sgi_disklabel {
27 __be32 _unused1; /* Padding */ 27 __be32 _unused1; /* Padding */
28}; 28};
29 29
30int sgi_partition(struct parsed_partitions *state, struct block_device *bdev) 30int sgi_partition(struct parsed_partitions *state)
31{ 31{
32 int i, csum; 32 int i, csum;
33 __be32 magic; 33 __be32 magic;
@@ -39,7 +39,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev)
39 struct sgi_partition *p; 39 struct sgi_partition *p;
40 char b[BDEVNAME_SIZE]; 40 char b[BDEVNAME_SIZE];
41 41
42 label = (struct sgi_disklabel *) read_dev_sector(bdev, 0, &sect); 42 label = read_part_sector(state, 0, &sect);
43 if (!label) 43 if (!label)
44 return -1; 44 return -1;
45 p = &label->partitions[0]; 45 p = &label->partitions[0];
@@ -57,7 +57,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev)
57 } 57 }
58 if(csum) { 58 if(csum) {
59 printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", 59 printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n",
60 bdevname(bdev, b)); 60 bdevname(state->bdev, b));
61 put_dev_sector(sect); 61 put_dev_sector(sect);
62 return 0; 62 return 0;
63 } 63 }
diff --git a/fs/partitions/sgi.h b/fs/partitions/sgi.h
index 5d5595c09928..b9553ebdd5a9 100644
--- a/fs/partitions/sgi.h
+++ b/fs/partitions/sgi.h
@@ -2,7 +2,7 @@
2 * fs/partitions/sgi.h 2 * fs/partitions/sgi.h
3 */ 3 */
4 4
5extern int sgi_partition(struct parsed_partitions *state, struct block_device *bdev); 5extern int sgi_partition(struct parsed_partitions *state);
6 6
7#define SGI_LABEL_MAGIC 0x0be5a941 7#define SGI_LABEL_MAGIC 0x0be5a941
8 8
diff --git a/fs/partitions/sun.c b/fs/partitions/sun.c
index c95e6a62c01d..a32660e25f7f 100644
--- a/fs/partitions/sun.c
+++ b/fs/partitions/sun.c
@@ -10,7 +10,7 @@
10#include "check.h" 10#include "check.h"
11#include "sun.h" 11#include "sun.h"
12 12
13int sun_partition(struct parsed_partitions *state, struct block_device *bdev) 13int sun_partition(struct parsed_partitions *state)
14{ 14{
15 int i; 15 int i;
16 __be16 csum; 16 __be16 csum;
@@ -61,7 +61,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev)
61 int use_vtoc; 61 int use_vtoc;
62 int nparts; 62 int nparts;
63 63
64 label = (struct sun_disklabel *)read_dev_sector(bdev, 0, &sect); 64 label = read_part_sector(state, 0, &sect);
65 if (!label) 65 if (!label)
66 return -1; 66 return -1;
67 67
@@ -78,7 +78,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev)
78 csum ^= *ush--; 78 csum ^= *ush--;
79 if (csum) { 79 if (csum) {
80 printk("Dev %s Sun disklabel: Csum bad, label corrupted\n", 80 printk("Dev %s Sun disklabel: Csum bad, label corrupted\n",
81 bdevname(bdev, b)); 81 bdevname(state->bdev, b));
82 put_dev_sector(sect); 82 put_dev_sector(sect);
83 return 0; 83 return 0;
84 } 84 }
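The checksum walk in the hunk above is the usual Sun label scheme: XOR-ing all big-endian 16-bit words of the label, the stored checksum word included, must come out zero on a consistent label. A standalone sketch:

	#include <stdint.h>

	/* Returns nonzero when the label XOR-checksums to zero, i.e. is consistent. */
	static int sun_label_ok(const uint16_t *words, unsigned int nwords)
	{
		uint16_t csum = 0;

		while (nwords--)
			csum ^= *words++;	/* the stored csum word cancels itself */
		return csum == 0;
	}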
diff --git a/fs/partitions/sun.h b/fs/partitions/sun.h
index 7f864d1f86d4..2424baa8319f 100644
--- a/fs/partitions/sun.h
+++ b/fs/partitions/sun.h
@@ -5,4 +5,4 @@
5#define SUN_LABEL_MAGIC 0xDABE 5#define SUN_LABEL_MAGIC 0xDABE
6#define SUN_VTOC_SANITY 0x600DDEEE 6#define SUN_VTOC_SANITY 0x600DDEEE
7 7
8int sun_partition(struct parsed_partitions *state, struct block_device *bdev); 8int sun_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/sysv68.c b/fs/partitions/sysv68.c
index 4eba27b78643..9030c864428e 100644
--- a/fs/partitions/sysv68.c
+++ b/fs/partitions/sysv68.c
@@ -46,7 +46,7 @@ struct slice {
46}; 46};
47 47
48 48
49int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev) 49int sysv68_partition(struct parsed_partitions *state)
50{ 50{
51 int i, slices; 51 int i, slices;
52 int slot = 1; 52 int slot = 1;
@@ -55,7 +55,7 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev)
55 struct dkblk0 *b; 55 struct dkblk0 *b;
56 struct slice *slice; 56 struct slice *slice;
57 57
58 data = read_dev_sector(bdev, 0, &sect); 58 data = read_part_sector(state, 0, &sect);
59 if (!data) 59 if (!data)
60 return -1; 60 return -1;
61 61
@@ -68,7 +68,7 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev)
68 i = be32_to_cpu(b->dk_ios.ios_slcblk); 68 i = be32_to_cpu(b->dk_ios.ios_slcblk);
69 put_dev_sector(sect); 69 put_dev_sector(sect);
70 70
71 data = read_dev_sector(bdev, i, &sect); 71 data = read_part_sector(state, i, &sect);
72 if (!data) 72 if (!data)
73 return -1; 73 return -1;
74 74
diff --git a/fs/partitions/sysv68.h b/fs/partitions/sysv68.h
index fa733f68431b..bf2f5ffa97ac 100644
--- a/fs/partitions/sysv68.h
+++ b/fs/partitions/sysv68.h
@@ -1 +1 @@
extern int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev); extern int sysv68_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/ultrix.c b/fs/partitions/ultrix.c
index ec852c11dce4..db9eef260364 100644
--- a/fs/partitions/ultrix.c
+++ b/fs/partitions/ultrix.c
@@ -9,7 +9,7 @@
9#include "check.h" 9#include "check.h"
10#include "ultrix.h" 10#include "ultrix.h"
11 11
12int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev) 12int ultrix_partition(struct parsed_partitions *state)
13{ 13{
14 int i; 14 int i;
15 Sector sect; 15 Sector sect;
@@ -26,7 +26,7 @@ int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev)
26#define PT_MAGIC 0x032957 /* Partition magic number */ 26#define PT_MAGIC 0x032957 /* Partition magic number */
27#define PT_VALID 1 /* Indicates if struct is valid */ 27#define PT_VALID 1 /* Indicates if struct is valid */
28 28
29 data = read_dev_sector(bdev, (16384 - sizeof(*label))/512, &sect); 29 data = read_part_sector(state, (16384 - sizeof(*label))/512, &sect);
30 if (!data) 30 if (!data)
31 return -1; 31 return -1;
32 32
diff --git a/fs/partitions/ultrix.h b/fs/partitions/ultrix.h
index a74bf8e2d370..a3cc00b2bded 100644
--- a/fs/partitions/ultrix.h
+++ b/fs/partitions/ultrix.h
@@ -2,4 +2,4 @@
2 * fs/partitions/ultrix.h 2 * fs/partitions/ultrix.h
3 */ 3 */
4 4
5int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev); 5int ultrix_partition(struct parsed_partitions *state);
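That completes the conversion: every label parser now has the identical prototype, int xxx_partition(struct parsed_partitions *). The payoff is in fs/partitions/check.c, which can keep a plain table of parsers and try each in turn; a simplified sketch of that dispatch (table contents abbreviated, names as in the headers above):

	static int (*check_part[])(struct parsed_partitions *) = {
		msdos_partition,
		mac_partition,
		sun_partition,
		sgi_partition,
		NULL
	};

	static int scan_partitions(struct parsed_partitions *state)
	{
		int i, res = 0;

		/* 1 = claimed and parsed, 0 = not this format, -1 = I/O error */
		for (i = 0; check_part[i] && !res; i++)
			res = check_part[i](state);
		return res;
	}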
diff --git a/fs/pipe.c b/fs/pipe.c
index 37ba29ff3158..d79872eba09a 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -11,6 +11,7 @@
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/log2.h>
14#include <linux/mount.h> 15#include <linux/mount.h>
15#include <linux/pipe_fs_i.h> 16#include <linux/pipe_fs_i.h>
16#include <linux/uio.h> 17#include <linux/uio.h>
@@ -18,11 +19,18 @@
18#include <linux/pagemap.h> 19#include <linux/pagemap.h>
19#include <linux/audit.h> 20#include <linux/audit.h>
20#include <linux/syscalls.h> 21#include <linux/syscalls.h>
22#include <linux/fcntl.h>
21 23
22#include <asm/uaccess.h> 24#include <asm/uaccess.h>
23#include <asm/ioctls.h> 25#include <asm/ioctls.h>
24 26
25/* 27/*
28 * The max size that a non-root user is allowed to grow the pipe to. Can
29 * be set by root in /proc/sys/fs/pipe-max-pages
30 */
31unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16;
32
33/*
26 * We use a start+len construction, which provides full use of the 34 * We use a start+len construction, which provides full use of the
27 * allocated memory. 35 * allocated memory.
28 * -- Florian Coosmann (FGC) 36 * -- Florian Coosmann (FGC)
@@ -390,7 +398,7 @@ redo:
390 if (!buf->len) { 398 if (!buf->len) {
391 buf->ops = NULL; 399 buf->ops = NULL;
392 ops->release(pipe, buf); 400 ops->release(pipe, buf);
393 curbuf = (curbuf + 1) & (PIPE_BUFFERS-1); 401 curbuf = (curbuf + 1) & (pipe->buffers - 1);
394 pipe->curbuf = curbuf; 402 pipe->curbuf = curbuf;
395 pipe->nrbufs = --bufs; 403 pipe->nrbufs = --bufs;
396 do_wakeup = 1; 404 do_wakeup = 1;
@@ -472,7 +480,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov,
472 chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ 480 chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
473 if (pipe->nrbufs && chars != 0) { 481 if (pipe->nrbufs && chars != 0) {
474 int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & 482 int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
475 (PIPE_BUFFERS-1); 483 (pipe->buffers - 1);
476 struct pipe_buffer *buf = pipe->bufs + lastbuf; 484 struct pipe_buffer *buf = pipe->bufs + lastbuf;
477 const struct pipe_buf_operations *ops = buf->ops; 485 const struct pipe_buf_operations *ops = buf->ops;
478 int offset = buf->offset + buf->len; 486 int offset = buf->offset + buf->len;
@@ -518,8 +526,8 @@ redo1:
518 break; 526 break;
519 } 527 }
520 bufs = pipe->nrbufs; 528 bufs = pipe->nrbufs;
521 if (bufs < PIPE_BUFFERS) { 529 if (bufs < pipe->buffers) {
522 int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1); 530 int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
523 struct pipe_buffer *buf = pipe->bufs + newbuf; 531 struct pipe_buffer *buf = pipe->bufs + newbuf;
524 struct page *page = pipe->tmp_page; 532 struct page *page = pipe->tmp_page;
525 char *src; 533 char *src;
@@ -580,7 +588,7 @@ redo2:
580 if (!total_len) 588 if (!total_len)
581 break; 589 break;
582 } 590 }
583 if (bufs < PIPE_BUFFERS) 591 if (bufs < pipe->buffers)
584 continue; 592 continue;
585 if (filp->f_flags & O_NONBLOCK) { 593 if (filp->f_flags & O_NONBLOCK) {
586 if (!ret) 594 if (!ret)
@@ -640,7 +648,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
640 nrbufs = pipe->nrbufs; 648 nrbufs = pipe->nrbufs;
641 while (--nrbufs >= 0) { 649 while (--nrbufs >= 0) {
642 count += pipe->bufs[buf].len; 650 count += pipe->bufs[buf].len;
643 buf = (buf+1) & (PIPE_BUFFERS-1); 651 buf = (buf+1) & (pipe->buffers - 1);
644 } 652 }
645 mutex_unlock(&inode->i_mutex); 653 mutex_unlock(&inode->i_mutex);
646 654
@@ -671,7 +679,7 @@ pipe_poll(struct file *filp, poll_table *wait)
671 } 679 }
672 680
673 if (filp->f_mode & FMODE_WRITE) { 681 if (filp->f_mode & FMODE_WRITE) {
674 mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0; 682 mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
675 /* 683 /*
676 * Most Unices do not set POLLERR for FIFOs but on Linux they 684 * Most Unices do not set POLLERR for FIFOs but on Linux they
677 * behave exactly like pipes for poll(). 685 * behave exactly like pipes for poll().
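All of the slot arithmetic above relies on pipe->buffers being a power of two, so a mask can replace a modulo on every wrap; this is also why pipe_set_size() further down rejects any other value. A standalone illustration of the idiom:

	#include <assert.h>

	static unsigned int next_slot(unsigned int cur, unsigned int buffers)
	{
		assert((buffers & (buffers - 1)) == 0);	/* must be a power of two */
		return (cur + 1) & (buffers - 1);	/* cheap "(cur + 1) % buffers" */
	}

	int main(void)
	{
		assert(next_slot(3, 16) == 4);
		assert(next_slot(15, 16) == 0);	/* wraps with no division */
		return 0;
	}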
@@ -877,25 +885,32 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
877 885
878 pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); 886 pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
879 if (pipe) { 887 if (pipe) {
880 init_waitqueue_head(&pipe->wait); 888 pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
881 pipe->r_counter = pipe->w_counter = 1; 889 if (pipe->bufs) {
882 pipe->inode = inode; 890 init_waitqueue_head(&pipe->wait);
891 pipe->r_counter = pipe->w_counter = 1;
892 pipe->inode = inode;
893 pipe->buffers = PIPE_DEF_BUFFERS;
894 return pipe;
895 }
896 kfree(pipe);
883 } 897 }
884 898
885 return pipe; 899 return NULL;
886} 900}
887 901
888void __free_pipe_info(struct pipe_inode_info *pipe) 902void __free_pipe_info(struct pipe_inode_info *pipe)
889{ 903{
890 int i; 904 int i;
891 905
892 for (i = 0; i < PIPE_BUFFERS; i++) { 906 for (i = 0; i < pipe->buffers; i++) {
893 struct pipe_buffer *buf = pipe->bufs + i; 907 struct pipe_buffer *buf = pipe->bufs + i;
894 if (buf->ops) 908 if (buf->ops)
895 buf->ops->release(pipe, buf); 909 buf->ops->release(pipe, buf);
896 } 910 }
897 if (pipe->tmp_page) 911 if (pipe->tmp_page)
898 __free_page(pipe->tmp_page); 912 __free_page(pipe->tmp_page);
913 kfree(pipe->bufs);
899 kfree(pipe); 914 kfree(pipe);
900} 915}
901 916
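alloc_pipe_info() now makes two allocations, the info struct and the buffer array it owns, and has to unwind the first when the second fails. The shape of that pattern, reduced to a generic sketch:

	struct ring {
		struct ring_slot {
			void *page;
		} *slots;
		unsigned int nr;
	};

	static struct ring *ring_alloc(unsigned int nr)
	{
		struct ring *r = kzalloc(sizeof(*r), GFP_KERNEL);

		if (r) {
			r->slots = kzalloc(sizeof(*r->slots) * nr, GFP_KERNEL);
			if (r->slots) {
				r->nr = nr;
				return r;
			}
			kfree(r);	/* inner allocation failed: roll back the outer one */
		}
		return NULL;
	}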
@@ -1094,6 +1109,89 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
1094} 1109}
1095 1110
1096/* 1111/*
1112 * Allocate a new array of pipe buffers and copy the info over. Returns the
1113 * pipe size if successful, or -ERROR on error.
1114 */
1115static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
1116{
1117 struct pipe_buffer *bufs;
1118
1119 /*
1120 * Must be a power-of-2 currently
1121 */
1122 if (!is_power_of_2(arg))
1123 return -EINVAL;
1124
1125 /*
1126 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
1127 * expect a lot of shrink+grow operations, just free and allocate
1128 * again like we would do for growing. If the pipe currently
1129 * contains more buffers than arg, then return busy.
1130 */
1131 if (arg < pipe->nrbufs)
1132 return -EBUSY;
1133
1134 bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL);
1135 if (unlikely(!bufs))
1136 return -ENOMEM;
1137
1138 /*
1139 * The pipe array wraps around, so just start the new one at zero
1140 * and adjust the indexes.
1141 */
1142 if (pipe->nrbufs) {
1143 const unsigned int head = min(pipe->nrbufs, pipe->buffers - pipe->curbuf);
1144 const unsigned int tail = pipe->nrbufs - head;
1145
1146 if (head)
1147 memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
1148 if (tail)
1149 memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
1150 }
1151
1152 pipe->curbuf = 0;
1153 kfree(pipe->bufs);
1154 pipe->bufs = bufs;
1155 pipe->buffers = arg;
1156 return arg;
1157}
1158
1159long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
1160{
1161 struct pipe_inode_info *pipe;
1162 long ret;
1163
1164 pipe = file->f_path.dentry->d_inode->i_pipe;
1165 if (!pipe)
1166 return -EBADF;
1167
1168 mutex_lock(&pipe->inode->i_mutex);
1169
1170 switch (cmd) {
1171 case F_SETPIPE_SZ:
1172 if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages)
1173 return -EINVAL;
1174 /*
1175 * The pipe needs to be at least 2 pages large to
1176 * guarantee POSIX behaviour.
1177 */
1178 if (arg < 2)
1179 return -EINVAL;
1180 ret = pipe_set_size(pipe, arg);
1181 break;
1182 case F_GETPIPE_SZ:
1183 ret = pipe->buffers;
1184 break;
1185 default:
1186 ret = -EINVAL;
1187 break;
1188 }
1189
1190 mutex_unlock(&pipe->inode->i_mutex);
1191 return ret;
1192}
1193
1194/*
1097 * pipefs should _never_ be mounted by userland - too much of security hassle, 1195 * pipefs should _never_ be mounted by userland - too much of security hassle,
1098 * no real gain from having the whole whorehouse mounted. So we don't need 1196 * no real gain from having the whole whorehouse mounted. So we don't need
1099 * any operations on the root directory. However, we need a non-trivial 1197 * any operations on the root directory. However, we need a non-trivial
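From userspace the new fcntls are straightforward: F_SETPIPE_SZ asks for a slot count (a power of two, at least 2, capped at pipe-max-pages for non-root) and F_GETPIPE_SZ reads the current count back. A hedged usage sketch; the constants come from linux/fcntl.h once this series is merged. (Note the F_SETPIPE_SZ sanity checks in the hunk above return with i_mutex still held, so the error paths want a follow-up cleanup.)

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fds[2];
		long sz;

		if (pipe(fds))
			return 1;
		if (fcntl(fds[1], F_SETPIPE_SZ, 64) < 0)	/* grow to 64 buffers */
			perror("F_SETPIPE_SZ");
		sz = fcntl(fds[1], F_GETPIPE_SZ);		/* read back pipe->buffers */
		printf("pipe slots: %ld\n", sz);
		close(fds[0]);
		close(fds[1]);
		return 0;
	}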
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 47f5b145f56e..aea1d3f1ffb5 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -634,6 +634,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
634 return err; 634 return err;
635} 635}
636 636
637#ifdef CONFIG_HUGETLB_PAGE
637static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset) 638static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset)
638{ 639{
639 u64 pme = 0; 640 u64 pme = 0;
@@ -664,6 +665,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
664 665
665 return err; 666 return err;
666} 667}
668#endif /* HUGETLB_PAGE */
667 669
668/* 670/*
669 * /proc/pid/pagemap - an array mapping virtual pages to pfns 671 * /proc/pid/pagemap - an array mapping virtual pages to pfns
@@ -733,7 +735,9 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
733 735
734 pagemap_walk.pmd_entry = pagemap_pte_range; 736 pagemap_walk.pmd_entry = pagemap_pte_range;
735 pagemap_walk.pte_hole = pagemap_pte_hole; 737 pagemap_walk.pte_hole = pagemap_pte_hole;
738#ifdef CONFIG_HUGETLB_PAGE
736 pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; 739 pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
740#endif
737 pagemap_walk.mm = mm; 741 pagemap_walk.mm = mm;
738 pagemap_walk.private = &pm; 742 pagemap_walk.private = &pm;
739 743
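The fix above is the standard shape for config-dependent callbacks: the callback definition and the place that wires it up must sit under the same guard, or !CONFIG_HUGETLB_PAGE builds break or warn about unused code. A generic sketch of the pattern (names illustrative, not kernel code):

	struct walker {
		int (*pmd_entry)(void *data);
	#ifdef CONFIG_HUGETLB_PAGE
		int (*hugetlb_entry)(void *data);
	#endif
	};

	#ifdef CONFIG_HUGETLB_PAGE
	static int handle_hugetlb(void *data) { return 0; }
	#endif

	static void setup_walk(struct walker *w, int (*pmd_cb)(void *))
	{
		w->pmd_entry = pmd_cb;
	#ifdef CONFIG_HUGETLB_PAGE
		w->hugetlb_entry = handle_hugetlb;	/* compiled out together */
	#endif
	}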
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 788b5802a7ce..655a4c52b8c3 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -82,7 +82,7 @@
82 82
83/* 83/*
84 * There are three quota SMP locks. dq_list_lock protects all lists with quotas 84 * There are three quota SMP locks. dq_list_lock protects all lists with quotas
85 * and quota formats, dqstats structure containing statistics about the lists 85 * and quota formats.
86 * dq_data_lock protects data from dq_dqb and also mem_dqinfo structures and 86 * dq_data_lock protects data from dq_dqb and also mem_dqinfo structures and
87 * also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes. 87 * also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes.
88 * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly 88 * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly
@@ -132,7 +132,9 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock);
132__cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock); 132__cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
133EXPORT_SYMBOL(dq_data_lock); 133EXPORT_SYMBOL(dq_data_lock);
134 134
135#if defined(CONFIG_QUOTA_DEBUG) || defined(CONFIG_PRINT_QUOTA_WARNING)
135static char *quotatypes[] = INITQFNAMES; 136static char *quotatypes[] = INITQFNAMES;
137#endif
136static struct quota_format_type *quota_formats; /* List of registered formats */ 138static struct quota_format_type *quota_formats; /* List of registered formats */
137static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES; 139static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES;
138 140
@@ -226,6 +228,10 @@ static struct hlist_head *dquot_hash;
226 228
227struct dqstats dqstats; 229struct dqstats dqstats;
228EXPORT_SYMBOL(dqstats); 230EXPORT_SYMBOL(dqstats);
231#ifdef CONFIG_SMP
232struct dqstats *dqstats_pcpu;
233EXPORT_SYMBOL(dqstats_pcpu);
234#endif
229 235
230static qsize_t inode_get_rsv_space(struct inode *inode); 236static qsize_t inode_get_rsv_space(struct inode *inode);
231static void __dquot_initialize(struct inode *inode, int type); 237static void __dquot_initialize(struct inode *inode, int type);
@@ -273,7 +279,7 @@ static struct dquot *find_dquot(unsigned int hashent, struct super_block *sb,
273static inline void put_dquot_last(struct dquot *dquot) 279static inline void put_dquot_last(struct dquot *dquot)
274{ 280{
275 list_add_tail(&dquot->dq_free, &free_dquots); 281 list_add_tail(&dquot->dq_free, &free_dquots);
276 dqstats.free_dquots++; 282 dqstats_inc(DQST_FREE_DQUOTS);
277} 283}
278 284
279static inline void remove_free_dquot(struct dquot *dquot) 285static inline void remove_free_dquot(struct dquot *dquot)
@@ -281,7 +287,7 @@ static inline void remove_free_dquot(struct dquot *dquot)
281 if (list_empty(&dquot->dq_free)) 287 if (list_empty(&dquot->dq_free))
282 return; 288 return;
283 list_del_init(&dquot->dq_free); 289 list_del_init(&dquot->dq_free);
284 dqstats.free_dquots--; 290 dqstats_dec(DQST_FREE_DQUOTS);
285} 291}
286 292
287static inline void put_inuse(struct dquot *dquot) 293static inline void put_inuse(struct dquot *dquot)
@@ -289,12 +295,12 @@ static inline void put_inuse(struct dquot *dquot)
289 /* We add to the back of inuse list so we don't have to restart 295 /* We add to the back of inuse list so we don't have to restart
290 * when traversing this list and we block */ 296 * when traversing this list and we block */
291 list_add_tail(&dquot->dq_inuse, &inuse_list); 297 list_add_tail(&dquot->dq_inuse, &inuse_list);
292 dqstats.allocated_dquots++; 298 dqstats_inc(DQST_ALLOC_DQUOTS);
293} 299}
294 300
295static inline void remove_inuse(struct dquot *dquot) 301static inline void remove_inuse(struct dquot *dquot)
296{ 302{
297 dqstats.allocated_dquots--; 303 dqstats_dec(DQST_ALLOC_DQUOTS);
298 list_del(&dquot->dq_inuse); 304 list_del(&dquot->dq_inuse);
299} 305}
300/* 306/*
@@ -317,14 +323,23 @@ static inline int mark_dquot_dirty(struct dquot *dquot)
317 return dquot->dq_sb->dq_op->mark_dirty(dquot); 323 return dquot->dq_sb->dq_op->mark_dirty(dquot);
318} 324}
319 325
326/* Mark dquot dirty in atomic manner, and return its old dirty flag state */
320int dquot_mark_dquot_dirty(struct dquot *dquot) 327int dquot_mark_dquot_dirty(struct dquot *dquot)
321{ 328{
329 int ret = 1;
330
331 /* If quota is dirty already, we don't have to acquire dq_list_lock */
332 if (test_bit(DQ_MOD_B, &dquot->dq_flags))
333 return 1;
334
322 spin_lock(&dq_list_lock); 335 spin_lock(&dq_list_lock);
323 if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) 336 if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) {
324 list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)-> 337 list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)->
325 info[dquot->dq_type].dqi_dirty_list); 338 info[dquot->dq_type].dqi_dirty_list);
339 ret = 0;
340 }
326 spin_unlock(&dq_list_lock); 341 spin_unlock(&dq_list_lock);
327 return 0; 342 return ret;
328} 343}
329EXPORT_SYMBOL(dquot_mark_dquot_dirty); 344EXPORT_SYMBOL(dquot_mark_dquot_dirty);
330 345
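The rewritten dquot_mark_dquot_dirty() above is a double-checked flag: an unlocked test_bit() skips dq_list_lock entirely once the dquot is already dirty, while the test_and_set_bit() under the lock keeps the flag transition and the list_add() atomic with respect to concurrent markers. The pattern in isolation, as a sketch:

	/* Returns the previous dirty state; queues the item on the 0 -> 1 edge. */
	static int mark_dirty(unsigned long *flags, spinlock_t *lock,
			      struct list_head *item, struct list_head *dirty_list)
	{
		int was_dirty = 1;

		if (test_bit(DQ_MOD_B, flags))		/* lock-free fast path */
			return 1;

		spin_lock(lock);
		if (!test_and_set_bit(DQ_MOD_B, flags)) {
			list_add(item, dirty_list);	/* first marker queues it */
			was_dirty = 0;
		}
		spin_unlock(lock);
		return was_dirty;
	}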
@@ -550,8 +565,8 @@ int dquot_scan_active(struct super_block *sb,
550 continue; 565 continue;
551 /* Now we have active dquot so we can just increase use count */ 566 /* Now we have active dquot so we can just increase use count */
552 atomic_inc(&dquot->dq_count); 567 atomic_inc(&dquot->dq_count);
553 dqstats.lookups++;
554 spin_unlock(&dq_list_lock); 568 spin_unlock(&dq_list_lock);
569 dqstats_inc(DQST_LOOKUPS);
555 dqput(old_dquot); 570 dqput(old_dquot);
556 old_dquot = dquot; 571 old_dquot = dquot;
557 ret = fn(dquot, priv); 572 ret = fn(dquot, priv);
@@ -596,8 +611,8 @@ int vfs_quota_sync(struct super_block *sb, int type, int wait)
596 * holding reference so we can safely just increase 611 * holding reference so we can safely just increase
597 * use count */ 612 * use count */
598 atomic_inc(&dquot->dq_count); 613 atomic_inc(&dquot->dq_count);
599 dqstats.lookups++;
600 spin_unlock(&dq_list_lock); 614 spin_unlock(&dq_list_lock);
615 dqstats_inc(DQST_LOOKUPS);
601 sb->dq_op->write_dquot(dquot); 616 sb->dq_op->write_dquot(dquot);
602 dqput(dquot); 617 dqput(dquot);
603 spin_lock(&dq_list_lock); 618 spin_lock(&dq_list_lock);
@@ -609,9 +624,7 @@ int vfs_quota_sync(struct super_block *sb, int type, int wait)
609 if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt) 624 if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt)
610 && info_dirty(&dqopt->info[cnt])) 625 && info_dirty(&dqopt->info[cnt]))
611 sb->dq_op->write_info(sb, cnt); 626 sb->dq_op->write_info(sb, cnt);
612 spin_lock(&dq_list_lock); 627 dqstats_inc(DQST_SYNCS);
613 dqstats.syncs++;
614 spin_unlock(&dq_list_lock);
615 mutex_unlock(&dqopt->dqonoff_mutex); 628 mutex_unlock(&dqopt->dqonoff_mutex);
616 629
617 if (!wait || (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)) 630 if (!wait || (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE))
@@ -663,6 +676,22 @@ static void prune_dqcache(int count)
663 } 676 }
664} 677}
665 678
679static int dqstats_read(unsigned int type)
680{
681 int count = 0;
682#ifdef CONFIG_SMP
683 int cpu;
684 for_each_possible_cpu(cpu)
685 count += per_cpu_ptr(dqstats_pcpu, cpu)->stat[type];
686 /* Statistics reading is racy, but absolute accuracy isn't required */
687 if (count < 0)
688 count = 0;
689#else
690 count = dqstats.stat[type];
691#endif
692 return count;
693}
694
666/* 695/*
667 * This is called from kswapd when we think we need some 696 * This is called from kswapd when we think we need some
668 * more memory 697 * more memory
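dqstats_read() above is the read side of per-CPU counters: each CPU bumps only its own slot, and readers sum across all possible CPUs, tolerating the documented race. The increment side this patch uses everywhere (dqstats_inc/dqstats_dec) is not shown in these hunks; a sketch of the assumed shape, the real definitions living in include/linux/quota.h:

	#ifdef CONFIG_SMP
	static inline void dqstats_inc(unsigned int type)
	{
		/* Each CPU touches only its own copy: no shared cacheline, no lock. */
		per_cpu_ptr(dqstats_pcpu, smp_processor_id())->stat[type]++;
	}
	#else
	static inline void dqstats_inc(unsigned int type)
	{
		dqstats.stat[type]++;
	}
	#endif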
@@ -675,7 +704,7 @@ static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
675 prune_dqcache(nr); 704 prune_dqcache(nr);
676 spin_unlock(&dq_list_lock); 705 spin_unlock(&dq_list_lock);
677 } 706 }
678 return (dqstats.free_dquots / 100) * sysctl_vfs_cache_pressure; 707 return (dqstats_read(DQST_FREE_DQUOTS)/100) * sysctl_vfs_cache_pressure;
679} 708}
680 709
681static struct shrinker dqcache_shrinker = { 710static struct shrinker dqcache_shrinker = {
@@ -703,10 +732,7 @@ void dqput(struct dquot *dquot)
703 BUG(); 732 BUG();
704 } 733 }
705#endif 734#endif
706 735 dqstats_inc(DQST_DROPS);
707 spin_lock(&dq_list_lock);
708 dqstats.drops++;
709 spin_unlock(&dq_list_lock);
710we_slept: 736we_slept:
711 spin_lock(&dq_list_lock); 737 spin_lock(&dq_list_lock);
712 if (atomic_read(&dquot->dq_count) > 1) { 738 if (atomic_read(&dquot->dq_count) > 1) {
@@ -823,15 +849,15 @@ we_slept:
823 put_inuse(dquot); 849 put_inuse(dquot);
824 /* hash it first so it can be found */ 850 /* hash it first so it can be found */
825 insert_dquot_hash(dquot); 851 insert_dquot_hash(dquot);
826 dqstats.lookups++;
827 spin_unlock(&dq_list_lock); 852 spin_unlock(&dq_list_lock);
853 dqstats_inc(DQST_LOOKUPS);
828 } else { 854 } else {
829 if (!atomic_read(&dquot->dq_count)) 855 if (!atomic_read(&dquot->dq_count))
830 remove_free_dquot(dquot); 856 remove_free_dquot(dquot);
831 atomic_inc(&dquot->dq_count); 857 atomic_inc(&dquot->dq_count);
832 dqstats.cache_hits++;
833 dqstats.lookups++;
834 spin_unlock(&dq_list_lock); 858 spin_unlock(&dq_list_lock);
859 dqstats_inc(DQST_CACHE_HITS);
860 dqstats_inc(DQST_LOOKUPS);
835 } 861 }
836 /* Wait for dq_lock - after this we know that either dquot_release() is 862 /* Wait for dq_lock - after this we know that either dquot_release() is
837 * already finished or it will be canceled due to dq_count > 1 test */ 863 * already finished or it will be canceled due to dq_count > 1 test */
@@ -1677,16 +1703,19 @@ EXPORT_SYMBOL(dquot_free_inode);
1677 1703
1678/* 1704/*
1679 * Transfer the number of inodes and blocks from one diskquota to another. 1705 * Transfer the number of inodes and blocks from one diskquota to another.
1706 * On success, dquot references in transfer_to are consumed and references
1707 * to original dquots that need to be released are placed there. On failure,
1708 * references are kept untouched.
1680 * 1709 *
1681 * This operation can block, but only after everything is updated 1710 * This operation can block, but only after everything is updated
1682 * A transaction must be started when entering this function. 1711 * A transaction must be started when entering this function.
1712 *
1683 */ 1713 */
1684static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask) 1714int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1685{ 1715{
1686 qsize_t space, cur_space; 1716 qsize_t space, cur_space;
1687 qsize_t rsv_space = 0; 1717 qsize_t rsv_space = 0;
1688 struct dquot *transfer_from[MAXQUOTAS]; 1718 struct dquot *transfer_from[MAXQUOTAS] = {};
1689 struct dquot *transfer_to[MAXQUOTAS];
1690 int cnt, ret = 0; 1719 int cnt, ret = 0;
1691 char warntype_to[MAXQUOTAS]; 1720 char warntype_to[MAXQUOTAS];
1692 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; 1721 char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS];
@@ -1696,19 +1725,12 @@ static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask
1696 if (IS_NOQUOTA(inode)) 1725 if (IS_NOQUOTA(inode))
1697 return 0; 1726 return 0;
1698 /* Initialize the arrays */ 1727 /* Initialize the arrays */
1699 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1728 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1700 transfer_from[cnt] = NULL;
1701 transfer_to[cnt] = NULL;
1702 warntype_to[cnt] = QUOTA_NL_NOWARN; 1729 warntype_to[cnt] = QUOTA_NL_NOWARN;
1703 }
1704 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1705 if (mask & (1 << cnt))
1706 transfer_to[cnt] = dqget(inode->i_sb, chid[cnt], cnt);
1707 }
1708 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1730 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1709 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ 1731 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */
1710 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1732 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1711 goto put_all; 1733 return 0;
1712 } 1734 }
1713 spin_lock(&dq_data_lock); 1735 spin_lock(&dq_data_lock);
1714 cur_space = inode_get_bytes(inode); 1736 cur_space = inode_get_bytes(inode);
@@ -1760,47 +1782,41 @@ static int __dquot_transfer(struct inode *inode, qid_t *chid, unsigned long mask
1760 1782
1761 mark_all_dquot_dirty(transfer_from); 1783 mark_all_dquot_dirty(transfer_from);
1762 mark_all_dquot_dirty(transfer_to); 1784 mark_all_dquot_dirty(transfer_to);
1763 /* The reference we got is transferred to the inode */ 1785 /* Pass back references to put */
1764 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1786 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1765 transfer_to[cnt] = NULL; 1787 transfer_to[cnt] = transfer_from[cnt];
1766warn_put_all: 1788warn:
1767 flush_warnings(transfer_to, warntype_to); 1789 flush_warnings(transfer_to, warntype_to);
1768 flush_warnings(transfer_from, warntype_from_inodes); 1790 flush_warnings(transfer_from, warntype_from_inodes);
1769 flush_warnings(transfer_from, warntype_from_space); 1791 flush_warnings(transfer_from, warntype_from_space);
1770put_all:
1771 dqput_all(transfer_from);
1772 dqput_all(transfer_to);
1773 return ret; 1792 return ret;
1774over_quota: 1793over_quota:
1775 spin_unlock(&dq_data_lock); 1794 spin_unlock(&dq_data_lock);
1776 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1795 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1777 /* Clear dquot pointers we don't want to dqput() */ 1796 goto warn;
1778 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1779 transfer_from[cnt] = NULL;
1780 goto warn_put_all;
1781} 1797}
1798EXPORT_SYMBOL(__dquot_transfer);
1782 1799
1783/* Wrapper for transferring ownership of an inode for uid/gid only 1800/* Wrapper for transferring ownership of an inode for uid/gid only
1784 * Called from FSXXX_setattr() 1801 * Called from FSXXX_setattr()
1785 */ 1802 */
1786int dquot_transfer(struct inode *inode, struct iattr *iattr) 1803int dquot_transfer(struct inode *inode, struct iattr *iattr)
1787{ 1804{
1788 qid_t chid[MAXQUOTAS]; 1805 struct dquot *transfer_to[MAXQUOTAS] = {};
1789 unsigned long mask = 0; 1806 struct super_block *sb = inode->i_sb;
1807 int ret;
1790 1808
1791 if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) { 1809 if (!sb_any_quota_active(sb) || IS_NOQUOTA(inode))
1792 mask |= 1 << USRQUOTA; 1810 return 0;
1793 chid[USRQUOTA] = iattr->ia_uid; 1811
1794 } 1812 if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid)
1795 if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid) { 1813 transfer_to[USRQUOTA] = dqget(sb, iattr->ia_uid, USRQUOTA);
1796 mask |= 1 << GRPQUOTA; 1814 if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)
1797 chid[GRPQUOTA] = iattr->ia_gid; 1815 transfer_to[GRPQUOTA] = dqget(sb, iattr->ia_gid, GRPQUOTA);
1798 } 1816
1799 if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) { 1817 ret = __dquot_transfer(inode, transfer_to);
1800 dquot_initialize(inode); 1818 dqput_all(transfer_to);
1801 return __dquot_transfer(inode, chid, mask); 1819 return ret;
1802 }
1803 return 0;
1804} 1820}
1805EXPORT_SYMBOL(dquot_transfer); 1821EXPORT_SYMBOL(dquot_transfer);
1806 1822
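The new calling convention is easiest to see from a caller's side. A minimal sketch of a filesystem setattr path under this API (examplefs_setattr is a hypothetical name; is_quota_modification() and dquot_initialize() are the same helpers the reiserfs hunk below switches to):

	static int examplefs_setattr(struct dentry *dentry, struct iattr *attr)
	{
		struct inode *inode = dentry->d_inode;
		int error;

		/* quota must be attached before ownership can change hands */
		if (is_quota_modification(inode, attr))
			dquot_initialize(inode);

		/* dquot_transfer() now reads the new ids from the iattr and
		 * handles the dqget()/dqput_all() pairing internally */
		error = dquot_transfer(inode, attr);
		if (error)
			return error;

		return inode_setattr(inode, attr);
	}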
@@ -2275,25 +2291,30 @@ static inline qsize_t stoqb(qsize_t space)
2275} 2291}
2276 2292
2277/* Generic routine for getting common part of quota structure */ 2293/* Generic routine for getting common part of quota structure */
2278static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di) 2294static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
2279{ 2295{
2280 struct mem_dqblk *dm = &dquot->dq_dqb; 2296 struct mem_dqblk *dm = &dquot->dq_dqb;
2281 2297
2298 memset(di, 0, sizeof(*di));
2299 di->d_version = FS_DQUOT_VERSION;
2300 di->d_flags = dquot->dq_type == USRQUOTA ?
2301 XFS_USER_QUOTA : XFS_GROUP_QUOTA;
2302 di->d_id = dquot->dq_id;
2303
2282 spin_lock(&dq_data_lock); 2304 spin_lock(&dq_data_lock);
2283 di->dqb_bhardlimit = stoqb(dm->dqb_bhardlimit); 2305 di->d_blk_hardlimit = stoqb(dm->dqb_bhardlimit);
2284 di->dqb_bsoftlimit = stoqb(dm->dqb_bsoftlimit); 2306 di->d_blk_softlimit = stoqb(dm->dqb_bsoftlimit);
2285 di->dqb_curspace = dm->dqb_curspace + dm->dqb_rsvspace; 2307 di->d_ino_hardlimit = dm->dqb_ihardlimit;
2286 di->dqb_ihardlimit = dm->dqb_ihardlimit; 2308 di->d_ino_softlimit = dm->dqb_isoftlimit;
2287 di->dqb_isoftlimit = dm->dqb_isoftlimit; 2309 di->d_bcount = dm->dqb_curspace + dm->dqb_rsvspace;
2288 di->dqb_curinodes = dm->dqb_curinodes; 2310 di->d_icount = dm->dqb_curinodes;
2289 di->dqb_btime = dm->dqb_btime; 2311 di->d_btimer = dm->dqb_btime;
2290 di->dqb_itime = dm->dqb_itime; 2312 di->d_itimer = dm->dqb_itime;
2291 di->dqb_valid = QIF_ALL;
2292 spin_unlock(&dq_data_lock); 2313 spin_unlock(&dq_data_lock);
2293} 2314}
2294 2315
2295int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, 2316int vfs_get_dqblk(struct super_block *sb, int type, qid_t id,
2296 struct if_dqblk *di) 2317 struct fs_disk_quota *di)
2297{ 2318{
2298 struct dquot *dquot; 2319 struct dquot *dquot;
2299 2320
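Limits in struct fs_disk_quota are carried in quota blocks rather than bytes, which is why the getter wraps the limit fields in stoqb(). A sketch of the conversion pair assumed here (the real helpers sit near the top of this file):

	#define QUOTABLOCK_BITS 10
	#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)

	/* bytes -> quota blocks, rounded up so a partial block still counts */
	static inline qsize_t stoqb(qsize_t space)
	{
		return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS;
	}

	/* quota blocks -> bytes */
	static inline qsize_t qbtos(qsize_t blocks)
	{
		return blocks << QUOTABLOCK_BITS;
	}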
@@ -2307,51 +2328,70 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id,
2307} 2328}
2308EXPORT_SYMBOL(vfs_get_dqblk); 2329EXPORT_SYMBOL(vfs_get_dqblk);
2309 2330
2331#define VFS_FS_DQ_MASK \
2332 (FS_DQ_BCOUNT | FS_DQ_BSOFT | FS_DQ_BHARD | \
2333 FS_DQ_ICOUNT | FS_DQ_ISOFT | FS_DQ_IHARD | \
2334 FS_DQ_BTIMER | FS_DQ_ITIMER)
2335
2310/* Generic routine for setting common part of quota structure */ 2336/* Generic routine for setting common part of quota structure */
2311static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di) 2337static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
2312{ 2338{
2313 struct mem_dqblk *dm = &dquot->dq_dqb; 2339 struct mem_dqblk *dm = &dquot->dq_dqb;
2314 int check_blim = 0, check_ilim = 0; 2340 int check_blim = 0, check_ilim = 0;
2315 struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type]; 2341 struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type];
2316 2342
2317 if ((di->dqb_valid & QIF_BLIMITS && 2343 if (di->d_fieldmask & ~VFS_FS_DQ_MASK)
2318 (di->dqb_bhardlimit > dqi->dqi_maxblimit || 2344 return -EINVAL;
2319 di->dqb_bsoftlimit > dqi->dqi_maxblimit)) || 2345
2320 (di->dqb_valid & QIF_ILIMITS && 2346 if (((di->d_fieldmask & FS_DQ_BSOFT) &&
2321 (di->dqb_ihardlimit > dqi->dqi_maxilimit || 2347 (di->d_blk_softlimit > dqi->dqi_maxblimit)) ||
2322 di->dqb_isoftlimit > dqi->dqi_maxilimit))) 2348 ((di->d_fieldmask & FS_DQ_BHARD) &&
2349 (di->d_blk_hardlimit > dqi->dqi_maxblimit)) ||
2350 ((di->d_fieldmask & FS_DQ_ISOFT) &&
2351 (di->d_ino_softlimit > dqi->dqi_maxilimit)) ||
2352 ((di->d_fieldmask & FS_DQ_IHARD) &&
2353 (di->d_ino_hardlimit > dqi->dqi_maxilimit)))
2323 return -ERANGE; 2354 return -ERANGE;
2324 2355
2325 spin_lock(&dq_data_lock); 2356 spin_lock(&dq_data_lock);
2326 if (di->dqb_valid & QIF_SPACE) { 2357 if (di->d_fieldmask & FS_DQ_BCOUNT) {
2327 dm->dqb_curspace = di->dqb_curspace - dm->dqb_rsvspace; 2358 dm->dqb_curspace = di->d_bcount - dm->dqb_rsvspace;
2328 check_blim = 1; 2359 check_blim = 1;
2329 set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); 2360 set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
2330 } 2361 }
2331 if (di->dqb_valid & QIF_BLIMITS) { 2362
2332 dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit); 2363 if (di->d_fieldmask & FS_DQ_BSOFT)
2333 dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit); 2364 dm->dqb_bsoftlimit = qbtos(di->d_blk_softlimit);
2365 if (di->d_fieldmask & FS_DQ_BHARD)
2366 dm->dqb_bhardlimit = qbtos(di->d_blk_hardlimit);
2367 if (di->d_fieldmask & (FS_DQ_BSOFT | FS_DQ_BHARD)) {
2334 check_blim = 1; 2368 check_blim = 1;
2335 set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); 2369 set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
2336 } 2370 }
2337 if (di->dqb_valid & QIF_INODES) { 2371
2338 dm->dqb_curinodes = di->dqb_curinodes; 2372 if (di->d_fieldmask & FS_DQ_ICOUNT) {
2373 dm->dqb_curinodes = di->d_icount;
2339 check_ilim = 1; 2374 check_ilim = 1;
2340 set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); 2375 set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
2341 } 2376 }
2342 if (di->dqb_valid & QIF_ILIMITS) { 2377
2343 dm->dqb_isoftlimit = di->dqb_isoftlimit; 2378 if (di->d_fieldmask & FS_DQ_ISOFT)
2344 dm->dqb_ihardlimit = di->dqb_ihardlimit; 2379 dm->dqb_isoftlimit = di->d_ino_softlimit;
2380 if (di->d_fieldmask & FS_DQ_IHARD)
2381 dm->dqb_ihardlimit = di->d_ino_hardlimit;
2382 if (di->d_fieldmask & (FS_DQ_ISOFT | FS_DQ_IHARD)) {
2345 check_ilim = 1; 2383 check_ilim = 1;
2346 set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); 2384 set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
2347 } 2385 }
2348 if (di->dqb_valid & QIF_BTIME) { 2386
2349 dm->dqb_btime = di->dqb_btime; 2387 if (di->d_fieldmask & FS_DQ_BTIMER) {
2388 dm->dqb_btime = di->d_btimer;
2350 check_blim = 1; 2389 check_blim = 1;
2351 set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); 2390 set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
2352 } 2391 }
2353 if (di->dqb_valid & QIF_ITIME) { 2392
2354 dm->dqb_itime = di->dqb_itime; 2393 if (di->d_fieldmask & FS_DQ_ITIMER) {
2394 dm->dqb_itime = di->d_itimer;
2355 check_ilim = 1; 2395 check_ilim = 1;
2356 set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); 2396 set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
2357 } 2397 }
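Because each FS_DQ_* bit is applied independently, a caller can now adjust a single limit without first fetching the whole structure. A hedged fragment (assumes sb and uid are in scope) that raises only the block limits:

	struct fs_disk_quota fdq = {
		.d_version       = FS_DQUOT_VERSION,
		.d_fieldmask     = FS_DQ_BSOFT | FS_DQ_BHARD,
		.d_blk_softlimit = 1000000,	/* quota blocks, not bytes */
		.d_blk_hardlimit = 1100000,
	};
	int ret;

	/* fields outside d_fieldmask are ignored by do_set_dqblk() */
	ret = sb->s_qcop->set_dqblk(sb, USRQUOTA, uid, &fdq);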
@@ -2361,7 +2401,7 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
2361 dm->dqb_curspace < dm->dqb_bsoftlimit) { 2401 dm->dqb_curspace < dm->dqb_bsoftlimit) {
2362 dm->dqb_btime = 0; 2402 dm->dqb_btime = 0;
2363 clear_bit(DQ_BLKS_B, &dquot->dq_flags); 2403 clear_bit(DQ_BLKS_B, &dquot->dq_flags);
2364 } else if (!(di->dqb_valid & QIF_BTIME)) 2404 } else if (!(di->d_fieldmask & FS_DQ_BTIMER))
2365 /* Set grace only if user hasn't provided his own... */ 2405 /* Set grace only if user hasn't provided his own... */
2366 dm->dqb_btime = get_seconds() + dqi->dqi_bgrace; 2406 dm->dqb_btime = get_seconds() + dqi->dqi_bgrace;
2367 } 2407 }
@@ -2370,7 +2410,7 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
2370 dm->dqb_curinodes < dm->dqb_isoftlimit) { 2410 dm->dqb_curinodes < dm->dqb_isoftlimit) {
2371 dm->dqb_itime = 0; 2411 dm->dqb_itime = 0;
2372 clear_bit(DQ_INODES_B, &dquot->dq_flags); 2412 clear_bit(DQ_INODES_B, &dquot->dq_flags);
2373 } else if (!(di->dqb_valid & QIF_ITIME)) 2413 } else if (!(di->d_fieldmask & FS_DQ_ITIMER))
2374 /* Set grace only if user hasn't provided his own... */ 2414 /* Set grace only if user hasn't provided his own... */
2375 dm->dqb_itime = get_seconds() + dqi->dqi_igrace; 2415 dm->dqb_itime = get_seconds() + dqi->dqi_igrace;
2376 } 2416 }
@@ -2386,7 +2426,7 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
2386} 2426}
2387 2427
2388int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, 2428int vfs_set_dqblk(struct super_block *sb, int type, qid_t id,
2389 struct if_dqblk *di) 2429 struct fs_disk_quota *di)
2390{ 2430{
2391 struct dquot *dquot; 2431 struct dquot *dquot;
2392 int rc; 2432 int rc;
@@ -2465,62 +2505,74 @@ const struct quotactl_ops vfs_quotactl_ops = {
2465 .set_dqblk = vfs_set_dqblk 2505 .set_dqblk = vfs_set_dqblk
2466}; 2506};
2467 2507
2508
2509static int do_proc_dqstats(struct ctl_table *table, int write,
2510 void __user *buffer, size_t *lenp, loff_t *ppos)
2511{
2512#ifdef CONFIG_SMP
2513 /* Update global table */
2514 unsigned int type = (int *)table->data - dqstats.stat;
2515 dqstats.stat[type] = dqstats_read(type);
2516#endif
2517 return proc_dointvec(table, write, buffer, lenp, ppos);
2518}
2519
2468static ctl_table fs_dqstats_table[] = { 2520static ctl_table fs_dqstats_table[] = {
2469 { 2521 {
2470 .procname = "lookups", 2522 .procname = "lookups",
2471 .data = &dqstats.lookups, 2523 .data = &dqstats.stat[DQST_LOOKUPS],
2472 .maxlen = sizeof(int), 2524 .maxlen = sizeof(int),
2473 .mode = 0444, 2525 .mode = 0444,
2474 .proc_handler = proc_dointvec, 2526 .proc_handler = do_proc_dqstats,
2475 }, 2527 },
2476 { 2528 {
2477 .procname = "drops", 2529 .procname = "drops",
2478 .data = &dqstats.drops, 2530 .data = &dqstats.stat[DQST_DROPS],
2479 .maxlen = sizeof(int), 2531 .maxlen = sizeof(int),
2480 .mode = 0444, 2532 .mode = 0444,
2481 .proc_handler = proc_dointvec, 2533 .proc_handler = do_proc_dqstats,
2482 }, 2534 },
2483 { 2535 {
2484 .procname = "reads", 2536 .procname = "reads",
2485 .data = &dqstats.reads, 2537 .data = &dqstats.stat[DQST_READS],
2486 .maxlen = sizeof(int), 2538 .maxlen = sizeof(int),
2487 .mode = 0444, 2539 .mode = 0444,
2488 .proc_handler = proc_dointvec, 2540 .proc_handler = do_proc_dqstats,
2489 }, 2541 },
2490 { 2542 {
2491 .procname = "writes", 2543 .procname = "writes",
2492 .data = &dqstats.writes, 2544 .data = &dqstats.stat[DQST_WRITES],
2493 .maxlen = sizeof(int), 2545 .maxlen = sizeof(int),
2494 .mode = 0444, 2546 .mode = 0444,
2495 .proc_handler = proc_dointvec, 2547 .proc_handler = do_proc_dqstats,
2496 }, 2548 },
2497 { 2549 {
2498 .procname = "cache_hits", 2550 .procname = "cache_hits",
2499 .data = &dqstats.cache_hits, 2551 .data = &dqstats.stat[DQST_CACHE_HITS],
2500 .maxlen = sizeof(int), 2552 .maxlen = sizeof(int),
2501 .mode = 0444, 2553 .mode = 0444,
2502 .proc_handler = proc_dointvec, 2554 .proc_handler = do_proc_dqstats,
2503 }, 2555 },
2504 { 2556 {
2505 .procname = "allocated_dquots", 2557 .procname = "allocated_dquots",
2506 .data = &dqstats.allocated_dquots, 2558 .data = &dqstats.stat[DQST_ALLOC_DQUOTS],
2507 .maxlen = sizeof(int), 2559 .maxlen = sizeof(int),
2508 .mode = 0444, 2560 .mode = 0444,
2509 .proc_handler = proc_dointvec, 2561 .proc_handler = do_proc_dqstats,
2510 }, 2562 },
2511 { 2563 {
2512 .procname = "free_dquots", 2564 .procname = "free_dquots",
2513 .data = &dqstats.free_dquots, 2565 .data = &dqstats.stat[DQST_FREE_DQUOTS],
2514 .maxlen = sizeof(int), 2566 .maxlen = sizeof(int),
2515 .mode = 0444, 2567 .mode = 0444,
2516 .proc_handler = proc_dointvec, 2568 .proc_handler = do_proc_dqstats,
2517 }, 2569 },
2518 { 2570 {
2519 .procname = "syncs", 2571 .procname = "syncs",
2520 .data = &dqstats.syncs, 2572 .data = &dqstats.stat[DQST_SYNCS],
2521 .maxlen = sizeof(int), 2573 .maxlen = sizeof(int),
2522 .mode = 0444, 2574 .mode = 0444,
2523 .proc_handler = proc_dointvec, 2575 .proc_handler = do_proc_dqstats,
2524 }, 2576 },
2525#ifdef CONFIG_PRINT_QUOTA_WARNING 2577#ifdef CONFIG_PRINT_QUOTA_WARNING
2526 { 2578 {
@@ -2572,6 +2624,13 @@ static int __init dquot_init(void)
2572 if (!dquot_hash) 2624 if (!dquot_hash)
2573 panic("Cannot create dquot hash table"); 2625 panic("Cannot create dquot hash table");
2574 2626
2627#ifdef CONFIG_SMP
2628 dqstats_pcpu = alloc_percpu(struct dqstats);
2629 if (!dqstats_pcpu)
2630 panic("Cannot create dquot stats table");
2631#endif
2632 memset(&dqstats, 0, sizeof(struct dqstats));
2633
2575 /* Find power-of-two hlist_heads which can fit into allocation */ 2634 /* Find power-of-two hlist_heads which can fit into allocation */
2576 nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head); 2635 nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head);
2577 dq_hash_bits = 0; 2636 dq_hash_bits = 0;
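dqstats_inc() and dqstats_read() come from the quota headers; a sketch of their likely shape on SMP (details may differ from the real header), which explains why the callers above no longer take dq_list_lock around counter updates:

	#ifdef CONFIG_SMP
	extern struct dqstats *dqstats_pcpu;	/* allocated in dquot_init() */

	static inline void dqstats_inc(unsigned int type)
	{
		/* each CPU bumps its own copy - no shared lock needed */
		this_cpu_inc(dqstats_pcpu->stat[type]);
	}

	static inline unsigned int dqstats_read(unsigned int type)
	{
		/* fold the per-CPU copies; a slightly stale sum is fine */
		unsigned int sum = 0;
		int cpu;

		for_each_possible_cpu(cpu)
			sum += per_cpu_ptr(dqstats_pcpu, cpu)->stat[type];
		return sum;
	}
	#else
	#define dqstats_inc(type)	(dqstats.stat[type]++)
	#define dqstats_read(type)	(dqstats.stat[type])
	#endif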
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 95388f9b7356..ce3dfd066f59 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -45,36 +45,22 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
45 return security_quotactl(cmd, type, id, sb); 45 return security_quotactl(cmd, type, id, sb);
46} 46}
47 47
48static void quota_sync_one(struct super_block *sb, void *arg)
49{
50 if (sb->s_qcop && sb->s_qcop->quota_sync)
51 sb->s_qcop->quota_sync(sb, *(int *)arg, 1);
52}
53
48static int quota_sync_all(int type) 54static int quota_sync_all(int type)
49{ 55{
50 struct super_block *sb;
51 int ret; 56 int ret;
52 57
53 if (type >= MAXQUOTAS) 58 if (type >= MAXQUOTAS)
54 return -EINVAL; 59 return -EINVAL;
55 ret = security_quotactl(Q_SYNC, type, 0, NULL); 60 ret = security_quotactl(Q_SYNC, type, 0, NULL);
56 if (ret) 61 if (!ret)
57 return ret; 62 iterate_supers(quota_sync_one, &type);
58 63 return ret;
59 spin_lock(&sb_lock);
60restart:
61 list_for_each_entry(sb, &super_blocks, s_list) {
62 if (!sb->s_qcop || !sb->s_qcop->quota_sync)
63 continue;
64
65 sb->s_count++;
66 spin_unlock(&sb_lock);
67 down_read(&sb->s_umount);
68 if (sb->s_root)
69 sb->s_qcop->quota_sync(sb, type, 1);
70 up_read(&sb->s_umount);
71 spin_lock(&sb_lock);
72 if (__put_super_and_need_restart(sb))
73 goto restart;
74 }
75 spin_unlock(&sb_lock);
76
77 return 0;
78} 64}
79 65
80static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, 66static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
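iterate_supers() absorbs the sb_lock/s_count/s_umount choreography the deleted loop open-coded, invoking the callback with s_umount held shared and only for superblocks that still have a root. A hypothetical reuse of the same pattern:

	/* count mounted filesystems that implement quotactl */
	static void count_quota_capable(struct super_block *sb, void *arg)
	{
		if (sb->s_qcop)
			(*(int *)arg)++;
	}

	static int quota_capable_supers(void)
	{
		int count = 0;

		iterate_supers(count_quota_capable, &count);
		return count;
	}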
@@ -113,8 +99,6 @@ static int quota_getinfo(struct super_block *sb, int type, void __user *addr)
113 struct if_dqinfo info; 99 struct if_dqinfo info;
114 int ret; 100 int ret;
115 101
116 if (!sb_has_quota_active(sb, type))
117 return -ESRCH;
118 if (!sb->s_qcop->get_info) 102 if (!sb->s_qcop->get_info)
119 return -ENOSYS; 103 return -ENOSYS;
120 ret = sb->s_qcop->get_info(sb, type, &info); 104 ret = sb->s_qcop->get_info(sb, type, &info);
@@ -129,43 +113,80 @@ static int quota_setinfo(struct super_block *sb, int type, void __user *addr)
129 113
130 if (copy_from_user(&info, addr, sizeof(info))) 114 if (copy_from_user(&info, addr, sizeof(info)))
131 return -EFAULT; 115 return -EFAULT;
132 if (!sb_has_quota_active(sb, type))
133 return -ESRCH;
134 if (!sb->s_qcop->set_info) 116 if (!sb->s_qcop->set_info)
135 return -ENOSYS; 117 return -ENOSYS;
136 return sb->s_qcop->set_info(sb, type, &info); 118 return sb->s_qcop->set_info(sb, type, &info);
137} 119}
138 120
121static void copy_to_if_dqblk(struct if_dqblk *dst, struct fs_disk_quota *src)
122{
123 dst->dqb_bhardlimit = src->d_blk_hardlimit;
124 dst->dqb_bsoftlimit = src->d_blk_softlimit;
125 dst->dqb_curspace = src->d_bcount;
126 dst->dqb_ihardlimit = src->d_ino_hardlimit;
127 dst->dqb_isoftlimit = src->d_ino_softlimit;
128 dst->dqb_curinodes = src->d_icount;
129 dst->dqb_btime = src->d_btimer;
130 dst->dqb_itime = src->d_itimer;
131 dst->dqb_valid = QIF_ALL;
132}
133
139static int quota_getquota(struct super_block *sb, int type, qid_t id, 134static int quota_getquota(struct super_block *sb, int type, qid_t id,
140 void __user *addr) 135 void __user *addr)
141{ 136{
137 struct fs_disk_quota fdq;
142 struct if_dqblk idq; 138 struct if_dqblk idq;
143 int ret; 139 int ret;
144 140
145 if (!sb_has_quota_active(sb, type))
146 return -ESRCH;
147 if (!sb->s_qcop->get_dqblk) 141 if (!sb->s_qcop->get_dqblk)
148 return -ENOSYS; 142 return -ENOSYS;
149 ret = sb->s_qcop->get_dqblk(sb, type, id, &idq); 143 ret = sb->s_qcop->get_dqblk(sb, type, id, &fdq);
150 if (ret) 144 if (ret)
151 return ret; 145 return ret;
146 copy_to_if_dqblk(&idq, &fdq);
152 if (copy_to_user(addr, &idq, sizeof(idq))) 147 if (copy_to_user(addr, &idq, sizeof(idq)))
153 return -EFAULT; 148 return -EFAULT;
154 return 0; 149 return 0;
155} 150}
156 151
152static void copy_from_if_dqblk(struct fs_disk_quota *dst, struct if_dqblk *src)
153{
154 dst->d_blk_hardlimit = src->dqb_bhardlimit;
155 dst->d_blk_softlimit = src->dqb_bsoftlimit;
156 dst->d_bcount = src->dqb_curspace;
157 dst->d_ino_hardlimit = src->dqb_ihardlimit;
158 dst->d_ino_softlimit = src->dqb_isoftlimit;
159 dst->d_icount = src->dqb_curinodes;
160 dst->d_btimer = src->dqb_btime;
161 dst->d_itimer = src->dqb_itime;
162
163 dst->d_fieldmask = 0;
164 if (src->dqb_valid & QIF_BLIMITS)
165 dst->d_fieldmask |= FS_DQ_BSOFT | FS_DQ_BHARD;
166 if (src->dqb_valid & QIF_SPACE)
167 dst->d_fieldmask |= FS_DQ_BCOUNT;
168 if (src->dqb_valid & QIF_ILIMITS)
169 dst->d_fieldmask |= FS_DQ_ISOFT | FS_DQ_IHARD;
170 if (src->dqb_valid & QIF_INODES)
171 dst->d_fieldmask |= FS_DQ_ICOUNT;
172 if (src->dqb_valid & QIF_BTIME)
173 dst->d_fieldmask |= FS_DQ_BTIMER;
174 if (src->dqb_valid & QIF_ITIME)
175 dst->d_fieldmask |= FS_DQ_ITIMER;
176}
177
157static int quota_setquota(struct super_block *sb, int type, qid_t id, 178static int quota_setquota(struct super_block *sb, int type, qid_t id,
158 void __user *addr) 179 void __user *addr)
159{ 180{
181 struct fs_disk_quota fdq;
160 struct if_dqblk idq; 182 struct if_dqblk idq;
161 183
162 if (copy_from_user(&idq, addr, sizeof(idq))) 184 if (copy_from_user(&idq, addr, sizeof(idq)))
163 return -EFAULT; 185 return -EFAULT;
164 if (!sb_has_quota_active(sb, type))
165 return -ESRCH;
166 if (!sb->s_qcop->set_dqblk) 186 if (!sb->s_qcop->set_dqblk)
167 return -ENOSYS; 187 return -ENOSYS;
168 return sb->s_qcop->set_dqblk(sb, type, id, &idq); 188 copy_from_if_dqblk(&fdq, &idq);
189 return sb->s_qcop->set_dqblk(sb, type, id, &fdq);
169} 190}
170 191
171static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr) 192static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr)
@@ -199,9 +220,9 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id,
199 220
200 if (copy_from_user(&fdq, addr, sizeof(fdq))) 221 if (copy_from_user(&fdq, addr, sizeof(fdq)))
201 return -EFAULT; 222 return -EFAULT;
202 if (!sb->s_qcop->set_xquota) 223 if (!sb->s_qcop->set_dqblk)
203 return -ENOSYS; 224 return -ENOSYS;
204 return sb->s_qcop->set_xquota(sb, type, id, &fdq); 225 return sb->s_qcop->set_dqblk(sb, type, id, &fdq);
205} 226}
206 227
207static int quota_getxquota(struct super_block *sb, int type, qid_t id, 228static int quota_getxquota(struct super_block *sb, int type, qid_t id,
@@ -210,9 +231,9 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id,
210 struct fs_disk_quota fdq; 231 struct fs_disk_quota fdq;
211 int ret; 232 int ret;
212 233
213 if (!sb->s_qcop->get_xquota) 234 if (!sb->s_qcop->get_dqblk)
214 return -ENOSYS; 235 return -ENOSYS;
215 ret = sb->s_qcop->get_xquota(sb, type, id, &fdq); 236 ret = sb->s_qcop->get_dqblk(sb, type, id, &fdq);
216 if (!ret && copy_to_user(addr, &fdq, sizeof(fdq))) 237 if (!ret && copy_to_user(addr, &fdq, sizeof(fdq)))
217 return -EFAULT; 238 return -EFAULT;
218 return ret; 239 return ret;
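With get_dqblk/set_dqblk speaking struct fs_disk_quota natively, one callback pair now serves both the VFS (Q_GETQUOTA/Q_SETQUOTA) and XFS-style (Q_XGETQUOTA/Q_XSETQLIM) paths. A sketch of the resulting ops table (the examplefs_* names are hypothetical):

	static const struct quotactl_ops examplefs_qctl_ops = {
		.quota_sync = examplefs_quota_sync,
		.get_dqblk  = examplefs_get_dqblk,	/* struct fs_disk_quota */
		.set_dqblk  = examplefs_set_dqblk,
	};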
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index f81f4bcfb178..24f03407eeb5 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -60,9 +60,17 @@ static ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
60static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) 60static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
61{ 61{
62 struct super_block *sb = info->dqi_sb; 62 struct super_block *sb = info->dqi_sb;
63 ssize_t ret;
63 64
64 return sb->s_op->quota_write(sb, info->dqi_type, buf, 65 ret = sb->s_op->quota_write(sb, info->dqi_type, buf,
65 info->dqi_usable_bs, blk << info->dqi_blocksize_bits); 66 info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
67 if (ret != info->dqi_usable_bs) {
68 q_warn(KERN_WARNING "VFS: dquota write failed on "
69 "dev %s\n", sb->s_id);
70 if (ret >= 0)
71 ret = -EIO;
72 }
73 return ret;
66} 74}
67 75
68/* Remove empty block from list and return it */ 76/* Remove empty block from list and return it */
@@ -152,7 +160,7 @@ static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf,
152 dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0); 160 dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
153 /* No matter whether write succeeds block is out of list */ 161 /* No matter whether write succeeds block is out of list */
154 if (write_blk(info, blk, buf) < 0) 162 if (write_blk(info, blk, buf) < 0)
155 printk(KERN_ERR 163 q_warn(KERN_ERR
156 "VFS: Can't write block (%u) with free entries.\n", 164 "VFS: Can't write block (%u) with free entries.\n",
157 blk); 165 blk);
158 return 0; 166 return 0;
@@ -244,7 +252,7 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
244 if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) { 252 if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) {
245 *err = remove_free_dqentry(info, buf, blk); 253 *err = remove_free_dqentry(info, buf, blk);
246 if (*err < 0) { 254 if (*err < 0) {
247 printk(KERN_ERR "VFS: find_free_dqentry(): Can't " 255 q_warn(KERN_ERR "VFS: find_free_dqentry(): Can't "
248 "remove block (%u) from entry free list.\n", 256 "remove block (%u) from entry free list.\n",
249 blk); 257 blk);
250 goto out_buf; 258 goto out_buf;
@@ -268,7 +276,7 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
268#endif 276#endif
269 *err = write_blk(info, blk, buf); 277 *err = write_blk(info, blk, buf);
270 if (*err < 0) { 278 if (*err < 0) {
271 printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota " 279 q_warn(KERN_ERR "VFS: find_free_dqentry(): Can't write quota "
272 "data block %u.\n", blk); 280 "data block %u.\n", blk);
273 goto out_buf; 281 goto out_buf;
274 } 282 }
@@ -303,7 +311,7 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
303 } else { 311 } else {
304 ret = read_blk(info, *treeblk, buf); 312 ret = read_blk(info, *treeblk, buf);
305 if (ret < 0) { 313 if (ret < 0) {
306 printk(KERN_ERR "VFS: Can't read tree quota block " 314 q_warn(KERN_ERR "VFS: Can't read tree quota block "
307 "%u.\n", *treeblk); 315 "%u.\n", *treeblk);
308 goto out_buf; 316 goto out_buf;
309 } 317 }
@@ -365,7 +373,7 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
365 if (!dquot->dq_off) { 373 if (!dquot->dq_off) {
366 ret = dq_insert_tree(info, dquot); 374 ret = dq_insert_tree(info, dquot);
367 if (ret < 0) { 375 if (ret < 0) {
368 printk(KERN_ERR "VFS: Error %zd occurred while " 376 q_warn(KERN_ERR "VFS: Error %zd occurred while "
369 "creating quota.\n", ret); 377 "creating quota.\n", ret);
370 kfree(ddquot); 378 kfree(ddquot);
371 return ret; 379 return ret;
@@ -377,14 +385,14 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
377 ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size, 385 ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size,
378 dquot->dq_off); 386 dquot->dq_off);
379 if (ret != info->dqi_entry_size) { 387 if (ret != info->dqi_entry_size) {
380 printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", 388 q_warn(KERN_WARNING "VFS: dquota write failed on dev %s\n",
381 sb->s_id); 389 sb->s_id);
382 if (ret >= 0) 390 if (ret >= 0)
383 ret = -ENOSPC; 391 ret = -ENOSPC;
384 } else { 392 } else {
385 ret = 0; 393 ret = 0;
386 } 394 }
387 dqstats.writes++; 395 dqstats_inc(DQST_WRITES);
388 kfree(ddquot); 396 kfree(ddquot);
389 397
390 return ret; 398 return ret;
@@ -402,14 +410,14 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
402 if (!buf) 410 if (!buf)
403 return -ENOMEM; 411 return -ENOMEM;
404 if (dquot->dq_off >> info->dqi_blocksize_bits != blk) { 412 if (dquot->dq_off >> info->dqi_blocksize_bits != blk) {
405 printk(KERN_ERR "VFS: Quota structure has offset to other " 413 q_warn(KERN_ERR "VFS: Quota structure has offset to other "
406 "block (%u) than it should (%u).\n", blk, 414 "block (%u) than it should (%u).\n", blk,
407 (uint)(dquot->dq_off >> info->dqi_blocksize_bits)); 415 (uint)(dquot->dq_off >> info->dqi_blocksize_bits));
408 goto out_buf; 416 goto out_buf;
409 } 417 }
410 ret = read_blk(info, blk, buf); 418 ret = read_blk(info, blk, buf);
411 if (ret < 0) { 419 if (ret < 0) {
412 printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk); 420 q_warn(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
413 goto out_buf; 421 goto out_buf;
414 } 422 }
415 dh = (struct qt_disk_dqdbheader *)buf; 423 dh = (struct qt_disk_dqdbheader *)buf;
@@ -419,7 +427,7 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
419 if (ret >= 0) 427 if (ret >= 0)
420 ret = put_free_dqblk(info, buf, blk); 428 ret = put_free_dqblk(info, buf, blk);
421 if (ret < 0) { 429 if (ret < 0) {
422 printk(KERN_ERR "VFS: Can't move quota data block (%u) " 430 q_warn(KERN_ERR "VFS: Can't move quota data block (%u) "
423 "to free list.\n", blk); 431 "to free list.\n", blk);
424 goto out_buf; 432 goto out_buf;
425 } 433 }
@@ -432,14 +440,14 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
432 /* Insert will write block itself */ 440 /* Insert will write block itself */
433 ret = insert_free_dqentry(info, buf, blk); 441 ret = insert_free_dqentry(info, buf, blk);
434 if (ret < 0) { 442 if (ret < 0) {
435 printk(KERN_ERR "VFS: Can't insert quota data " 443 q_warn(KERN_ERR "VFS: Can't insert quota data "
436 "block (%u) to free entry list.\n", blk); 444 "block (%u) to free entry list.\n", blk);
437 goto out_buf; 445 goto out_buf;
438 } 446 }
439 } else { 447 } else {
440 ret = write_blk(info, blk, buf); 448 ret = write_blk(info, blk, buf);
441 if (ret < 0) { 449 if (ret < 0) {
442 printk(KERN_ERR "VFS: Can't write quota data " 450 q_warn(KERN_ERR "VFS: Can't write quota data "
443 "block %u\n", blk); 451 "block %u\n", blk);
444 goto out_buf; 452 goto out_buf;
445 } 453 }
@@ -464,7 +472,7 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
464 return -ENOMEM; 472 return -ENOMEM;
465 ret = read_blk(info, *blk, buf); 473 ret = read_blk(info, *blk, buf);
466 if (ret < 0) { 474 if (ret < 0) {
467 printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk); 475 q_warn(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
468 goto out_buf; 476 goto out_buf;
469 } 477 }
470 newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); 478 newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
@@ -488,7 +496,7 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
488 } else { 496 } else {
489 ret = write_blk(info, *blk, buf); 497 ret = write_blk(info, *blk, buf);
490 if (ret < 0) 498 if (ret < 0)
491 printk(KERN_ERR "VFS: Can't write quota tree " 499 q_warn(KERN_ERR "VFS: Can't write quota tree "
492 "block %u.\n", *blk); 500 "block %u.\n", *blk);
493 } 501 }
494 } 502 }
@@ -521,7 +529,7 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
521 return -ENOMEM; 529 return -ENOMEM;
522 ret = read_blk(info, blk, buf); 530 ret = read_blk(info, blk, buf);
523 if (ret < 0) { 531 if (ret < 0) {
524 printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); 532 q_warn(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
525 goto out_buf; 533 goto out_buf;
526 } 534 }
527 ddquot = buf + sizeof(struct qt_disk_dqdbheader); 535 ddquot = buf + sizeof(struct qt_disk_dqdbheader);
@@ -531,7 +539,7 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
531 ddquot += info->dqi_entry_size; 539 ddquot += info->dqi_entry_size;
532 } 540 }
533 if (i == qtree_dqstr_in_blk(info)) { 541 if (i == qtree_dqstr_in_blk(info)) {
534 printk(KERN_ERR "VFS: Quota for id %u referenced " 542 q_warn(KERN_ERR "VFS: Quota for id %u referenced "
535 "but not present.\n", dquot->dq_id); 543 "but not present.\n", dquot->dq_id);
536 ret = -EIO; 544 ret = -EIO;
537 goto out_buf; 545 goto out_buf;
@@ -556,7 +564,7 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
556 return -ENOMEM; 564 return -ENOMEM;
557 ret = read_blk(info, blk, buf); 565 ret = read_blk(info, blk, buf);
558 if (ret < 0) { 566 if (ret < 0) {
559 printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); 567 q_warn(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
560 goto out_buf; 568 goto out_buf;
561 } 569 }
562 ret = 0; 570 ret = 0;
@@ -599,7 +607,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
599 offset = find_dqentry(info, dquot); 607 offset = find_dqentry(info, dquot);
600 if (offset <= 0) { /* Entry not present? */ 608 if (offset <= 0) { /* Entry not present? */
601 if (offset < 0) 609 if (offset < 0)
602 printk(KERN_ERR "VFS: Can't read quota " 610 q_warn(KERN_ERR "VFS: Can't read quota "
603 "structure for id %u.\n", dquot->dq_id); 611 "structure for id %u.\n", dquot->dq_id);
604 dquot->dq_off = 0; 612 dquot->dq_off = 0;
605 set_bit(DQ_FAKE_B, &dquot->dq_flags); 613 set_bit(DQ_FAKE_B, &dquot->dq_flags);
@@ -617,7 +625,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
617 if (ret != info->dqi_entry_size) { 625 if (ret != info->dqi_entry_size) {
618 if (ret >= 0) 626 if (ret >= 0)
619 ret = -EIO; 627 ret = -EIO;
620 printk(KERN_ERR "VFS: Error while reading quota " 628 q_warn(KERN_ERR "VFS: Error while reading quota "
621 "structure for id %u.\n", dquot->dq_id); 629 "structure for id %u.\n", dquot->dq_id);
622 set_bit(DQ_FAKE_B, &dquot->dq_flags); 630 set_bit(DQ_FAKE_B, &dquot->dq_flags);
623 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); 631 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
@@ -634,7 +642,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
634 spin_unlock(&dq_data_lock); 642 spin_unlock(&dq_data_lock);
635 kfree(ddquot); 643 kfree(ddquot);
636out: 644out:
637 dqstats.reads++; 645 dqstats_inc(DQST_READS);
638 return ret; 646 return ret;
639} 647}
640EXPORT_SYMBOL(qtree_read_dquot); 648EXPORT_SYMBOL(qtree_read_dquot);
diff --git a/fs/quota/quota_tree.h b/fs/quota/quota_tree.h
index a1ab8db81a51..ccc3e71fb1d8 100644
--- a/fs/quota/quota_tree.h
+++ b/fs/quota/quota_tree.h
@@ -22,4 +22,10 @@ struct qt_disk_dqdbheader {
22 22
23#define QT_TREEOFF 1 /* Offset of tree in file in blocks */ 23#define QT_TREEOFF 1 /* Offset of tree in file in blocks */
24 24
25#define q_warn(fmt, args...) \
26do { \
27 if (printk_ratelimit()) \
28 printk(fmt, ## args); \
29} while(0)
30
25#endif /* _LINUX_QUOTAIO_TREE_H */ 31#endif /* _LINUX_QUOTAIO_TREE_H */
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index 2ae757e9c008..4af344c5852a 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -71,7 +71,7 @@ static int v1_read_dqblk(struct dquot *dquot)
71 dquot->dq_dqb.dqb_ihardlimit == 0 && 71 dquot->dq_dqb.dqb_ihardlimit == 0 &&
72 dquot->dq_dqb.dqb_isoftlimit == 0) 72 dquot->dq_dqb.dqb_isoftlimit == 0)
73 set_bit(DQ_FAKE_B, &dquot->dq_flags); 73 set_bit(DQ_FAKE_B, &dquot->dq_flags);
74 dqstats.reads++; 74 dqstats_inc(DQST_READS);
75 75
76 return 0; 76 return 0;
77} 77}
@@ -104,7 +104,7 @@ static int v1_commit_dqblk(struct dquot *dquot)
104 ret = 0; 104 ret = 0;
105 105
106out: 106out:
107 dqstats.writes++; 107 dqstats_inc(DQST_WRITES);
108 108
109 return ret; 109 return ret;
110} 110}
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index e3da02f4986f..135206af1458 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -63,7 +63,7 @@ static int v2_read_header(struct super_block *sb, int type,
63 size = sb->s_op->quota_read(sb, type, (char *)dqhead, 63 size = sb->s_op->quota_read(sb, type, (char *)dqhead,
64 sizeof(struct v2_disk_dqheader), 0); 64 sizeof(struct v2_disk_dqheader), 0);
65 if (size != sizeof(struct v2_disk_dqheader)) { 65 if (size != sizeof(struct v2_disk_dqheader)) {
66 printk(KERN_WARNING "quota_v2: Failed header read:" 66 q_warn(KERN_WARNING "quota_v2: Failed header read:"
67 " expected=%zd got=%zd\n", 67 " expected=%zd got=%zd\n",
68 sizeof(struct v2_disk_dqheader), size); 68 sizeof(struct v2_disk_dqheader), size);
69 return 0; 69 return 0;
@@ -106,7 +106,7 @@ static int v2_read_file_info(struct super_block *sb, int type)
106 size = sb->s_op->quota_read(sb, type, (char *)&dinfo, 106 size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
107 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); 107 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
108 if (size != sizeof(struct v2_disk_dqinfo)) { 108 if (size != sizeof(struct v2_disk_dqinfo)) {
109 printk(KERN_WARNING "quota_v2: Can't read info structure on device %s.\n", 109 q_warn(KERN_WARNING "quota_v2: Can't read info structure on device %s.\n",
110 sb->s_id); 110 sb->s_id);
111 return -1; 111 return -1;
112 } 112 }
@@ -167,7 +167,7 @@ static int v2_write_file_info(struct super_block *sb, int type)
167 size = sb->s_op->quota_write(sb, type, (char *)&dinfo, 167 size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
168 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); 168 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
169 if (size != sizeof(struct v2_disk_dqinfo)) { 169 if (size != sizeof(struct v2_disk_dqinfo)) {
170 printk(KERN_WARNING "Can't write info structure on device %s.\n", 170 q_warn(KERN_WARNING "Can't write info structure on device %s.\n",
171 sb->s_id); 171 sb->s_id);
172 return -1; 172 return -1;
173 } 173 }
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index c94853473ca9..a5ebae70dc6d 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -52,14 +52,13 @@ static struct backing_dev_info ramfs_backing_dev_info = {
52 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP, 52 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP,
53}; 53};
54 54
55struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev) 55struct inode *ramfs_get_inode(struct super_block *sb,
56 const struct inode *dir, int mode, dev_t dev)
56{ 57{
57 struct inode * inode = new_inode(sb); 58 struct inode * inode = new_inode(sb);
58 59
59 if (inode) { 60 if (inode) {
60 inode->i_mode = mode; 61 inode_init_owner(inode, dir, mode);
61 inode->i_uid = current_fsuid();
62 inode->i_gid = current_fsgid();
63 inode->i_mapping->a_ops = &ramfs_aops; 62 inode->i_mapping->a_ops = &ramfs_aops;
64 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; 63 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
65 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); 64 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
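inode_init_owner() folds in exactly the ownership logic being deleted here and in the reiserfs hunk below; reconstructed from that removed code, it behaves roughly like this sketch (the authoritative helper lives in fs/inode.c):

	void inode_init_owner(struct inode *inode, const struct inode *dir,
			      int mode)
	{
		inode->i_uid = current_fsuid();
		if (dir && dir->i_mode & S_ISGID) {
			/* setgid directory: new inode inherits the group */
			inode->i_gid = dir->i_gid;
			if (S_ISDIR(mode))
				mode |= S_ISGID;
		} else
			inode->i_gid = current_fsgid();
		inode->i_mode = mode;
	}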
@@ -95,15 +94,10 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
95static int 94static int
96ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) 95ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
97{ 96{
98 struct inode * inode = ramfs_get_inode(dir->i_sb, mode, dev); 97 struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev);
99 int error = -ENOSPC; 98 int error = -ENOSPC;
100 99
101 if (inode) { 100 if (inode) {
102 if (dir->i_mode & S_ISGID) {
103 inode->i_gid = dir->i_gid;
104 if (S_ISDIR(mode))
105 inode->i_mode |= S_ISGID;
106 }
107 d_instantiate(dentry, inode); 101 d_instantiate(dentry, inode);
108 dget(dentry); /* Extra count - pin the dentry in core */ 102 dget(dentry); /* Extra count - pin the dentry in core */
109 error = 0; 103 error = 0;
@@ -130,13 +124,11 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char *
130 struct inode *inode; 124 struct inode *inode;
131 int error = -ENOSPC; 125 int error = -ENOSPC;
132 126
133 inode = ramfs_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0); 127 inode = ramfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
134 if (inode) { 128 if (inode) {
135 int l = strlen(symname)+1; 129 int l = strlen(symname)+1;
136 error = page_symlink(inode, symname, l); 130 error = page_symlink(inode, symname, l);
137 if (!error) { 131 if (!error) {
138 if (dir->i_mode & S_ISGID)
139 inode->i_gid = dir->i_gid;
140 d_instantiate(dentry, inode); 132 d_instantiate(dentry, inode);
141 dget(dentry); 133 dget(dentry);
142 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 134 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
@@ -214,7 +206,7 @@ static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts)
214 return 0; 206 return 0;
215} 207}
216 208
217static int ramfs_fill_super(struct super_block * sb, void * data, int silent) 209int ramfs_fill_super(struct super_block *sb, void *data, int silent)
218{ 210{
219 struct ramfs_fs_info *fsi; 211 struct ramfs_fs_info *fsi;
220 struct inode *inode = NULL; 212 struct inode *inode = NULL;
@@ -241,7 +233,7 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
241 sb->s_op = &ramfs_ops; 233 sb->s_op = &ramfs_ops;
242 sb->s_time_gran = 1; 234 sb->s_time_gran = 1;
243 235
244 inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0); 236 inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0);
245 if (!inode) { 237 if (!inode) {
246 err = -ENOMEM; 238 err = -ENOMEM;
247 goto fail; 239 goto fail;
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 1d9c12714c5c..9977df9f3a54 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -147,7 +147,8 @@ static int reiserfs_sync_file(struct file *filp,
147 barrier_done = reiserfs_commit_for_inode(inode); 147 barrier_done = reiserfs_commit_for_inode(inode);
148 reiserfs_write_unlock(inode->i_sb); 148 reiserfs_write_unlock(inode->i_sb);
149 if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) 149 if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
150 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 150 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
151 BLKDEV_IFL_WAIT);
151 if (barrier_done < 0) 152 if (barrier_done < 0)
152 return barrier_done; 153 return barrier_done;
153 return (err < 0) ? -EIO : 0; 154 return (err < 0) ? -EIO : 0;
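The extra arguments reflect the blkdev_issue_flush() signature change in this release; as assumed by this call site it is:

	int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
			       sector_t *error_sector, unsigned long flags);

	/* BLKDEV_IFL_WAIT: do not return until the flush has completed */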
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index dc2c65e04853..0f22fdaf54ac 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -3076,9 +3076,10 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3076 ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); 3076 ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
3077 3077
3078 depth = reiserfs_write_lock_once(inode->i_sb); 3078 depth = reiserfs_write_lock_once(inode->i_sb);
3079 if (attr->ia_valid & ATTR_SIZE) { 3079 if (is_quota_modification(inode, attr))
3080 dquot_initialize(inode); 3080 dquot_initialize(inode);
3081 3081
3082 if (attr->ia_valid & ATTR_SIZE) {
3082 /* version 2 items will be caught by the s_maxbytes check 3083 /* version 2 items will be caught by the s_maxbytes check
3083 ** done for us in vmtruncate 3084 ** done for us in vmtruncate
3084 */ 3085 */
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index d0c43cb99ffc..ee78d4a0086a 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -561,23 +561,13 @@ static int drop_new_inode(struct inode *inode)
561*/ 561*/
562static int new_inode_init(struct inode *inode, struct inode *dir, int mode) 562static int new_inode_init(struct inode *inode, struct inode *dir, int mode)
563{ 563{
564
565 /* the quota init calls have to know who to charge the quota to, so
566 ** we have to set uid and gid here
567 */
568 inode->i_uid = current_fsuid();
569 inode->i_mode = mode;
570 /* Make inode invalid - just in case we are going to drop it before 564 /* Make inode invalid - just in case we are going to drop it before
571 * the initialization happens */ 565 * the initialization happens */
572 INODE_PKEY(inode)->k_objectid = 0; 566 INODE_PKEY(inode)->k_objectid = 0;
573 567 /* the quota init calls have to know who to charge the quota to, so
574 if (dir->i_mode & S_ISGID) { 568 ** we have to set uid and gid here
575 inode->i_gid = dir->i_gid; 569 */
576 if (S_ISDIR(mode)) 570 inode_init_owner(inode, dir, mode);
577 inode->i_mode |= S_ISGID;
578 } else {
579 inode->i_gid = current_fsgid();
580 }
581 dquot_initialize(inode); 571 dquot_initialize(inode);
582 return 0; 572 return 0;
583} 573}
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index e7cc00e636dc..8c4cf273c672 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -723,11 +723,11 @@ out:
723 (handler) = *(handlers)++) 723 (handler) = *(handlers)++)
724 724
725/* This is the implementation for the xattr plugin infrastructure */ 725/* This is the implementation for the xattr plugin infrastructure */
726static inline struct xattr_handler * 726static inline const struct xattr_handler *
727find_xattr_handler_prefix(struct xattr_handler **handlers, 727find_xattr_handler_prefix(const struct xattr_handler **handlers,
728 const char *name) 728 const char *name)
729{ 729{
730 struct xattr_handler *xah; 730 const struct xattr_handler *xah;
731 731
732 if (!handlers) 732 if (!handlers)
733 return NULL; 733 return NULL;
@@ -748,7 +748,7 @@ ssize_t
748reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, 748reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer,
749 size_t size) 749 size_t size)
750{ 750{
751 struct xattr_handler *handler; 751 const struct xattr_handler *handler;
752 752
753 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); 753 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
754 754
@@ -767,7 +767,7 @@ int
767reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, 767reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
768 size_t size, int flags) 768 size_t size, int flags)
769{ 769{
770 struct xattr_handler *handler; 770 const struct xattr_handler *handler;
771 771
772 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); 772 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
773 773
@@ -784,7 +784,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
784 */ 784 */
785int reiserfs_removexattr(struct dentry *dentry, const char *name) 785int reiserfs_removexattr(struct dentry *dentry, const char *name)
786{ 786{
787 struct xattr_handler *handler; 787 const struct xattr_handler *handler;
788 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); 788 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
789 789
790 if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) 790 if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
@@ -807,7 +807,7 @@ static int listxattr_filler(void *buf, const char *name, int namelen,
807 size_t size; 807 size_t size;
808 if (name[0] != '.' || 808 if (name[0] != '.' ||
809 (namelen != 1 && (name[1] != '.' || namelen != 2))) { 809 (namelen != 1 && (name[1] != '.' || namelen != 2))) {
810 struct xattr_handler *handler; 810 const struct xattr_handler *handler;
811 handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr, 811 handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr,
812 name); 812 name);
813 if (!handler) /* Unsupported xattr name */ 813 if (!handler) /* Unsupported xattr name */
@@ -920,7 +920,7 @@ static int create_privroot(struct dentry *dentry) { return 0; }
920#endif 920#endif
921 921
922/* Actual operations that are exported to VFS-land */ 922/* Actual operations that are exported to VFS-land */
923struct xattr_handler *reiserfs_xattr_handlers[] = { 923const struct xattr_handler *reiserfs_xattr_handlers[] = {
924#ifdef CONFIG_REISERFS_FS_XATTR 924#ifdef CONFIG_REISERFS_FS_XATTR
925 &reiserfs_xattr_user_handler, 925 &reiserfs_xattr_user_handler,
926 &reiserfs_xattr_trusted_handler, 926 &reiserfs_xattr_trusted_handler,
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 9cdb759645a9..536d697a8a28 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -500,7 +500,7 @@ static size_t posix_acl_access_list(struct dentry *dentry, char *list,
500 return size; 500 return size;
501} 501}
502 502
503struct xattr_handler reiserfs_posix_acl_access_handler = { 503const struct xattr_handler reiserfs_posix_acl_access_handler = {
504 .prefix = POSIX_ACL_XATTR_ACCESS, 504 .prefix = POSIX_ACL_XATTR_ACCESS,
505 .flags = ACL_TYPE_ACCESS, 505 .flags = ACL_TYPE_ACCESS,
506 .get = posix_acl_get, 506 .get = posix_acl_get,
@@ -520,7 +520,7 @@ static size_t posix_acl_default_list(struct dentry *dentry, char *list,
520 return size; 520 return size;
521} 521}
522 522
523struct xattr_handler reiserfs_posix_acl_default_handler = { 523const struct xattr_handler reiserfs_posix_acl_default_handler = {
524 .prefix = POSIX_ACL_XATTR_DEFAULT, 524 .prefix = POSIX_ACL_XATTR_DEFAULT,
525 .flags = ACL_TYPE_DEFAULT, 525 .flags = ACL_TYPE_DEFAULT,
526 .get = posix_acl_get, 526 .get = posix_acl_get,
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 7271a477c041..237c6928d3c6 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -111,7 +111,7 @@ void reiserfs_security_free(struct reiserfs_security_handle *sec)
111 sec->value = NULL; 111 sec->value = NULL;
112} 112}
113 113
114struct xattr_handler reiserfs_xattr_security_handler = { 114const struct xattr_handler reiserfs_xattr_security_handler = {
115 .prefix = XATTR_SECURITY_PREFIX, 115 .prefix = XATTR_SECURITY_PREFIX,
116 .get = security_get, 116 .get = security_get,
117 .set = security_set, 117 .set = security_set,
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index 5b08aaca3daf..9883736ce3ec 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -48,7 +48,7 @@ static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size,
48 return len; 48 return len;
49} 49}
50 50
51struct xattr_handler reiserfs_xattr_trusted_handler = { 51const struct xattr_handler reiserfs_xattr_trusted_handler = {
52 .prefix = XATTR_TRUSTED_PREFIX, 52 .prefix = XATTR_TRUSTED_PREFIX,
53 .get = trusted_get, 53 .get = trusted_get,
54 .set = trusted_set, 54 .set = trusted_set,
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index 75d59c49b911..45ae1a00013a 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -44,7 +44,7 @@ static size_t user_list(struct dentry *dentry, char *list, size_t list_size,
44 return len; 44 return len;
45} 45}
46 46
47struct xattr_handler reiserfs_xattr_user_handler = { 47const struct xattr_handler reiserfs_xattr_user_handler = {
48 .prefix = XATTR_USER_PREFIX, 48 .prefix = XATTR_USER_PREFIX,
49 .get = user_get, 49 .get = user_get,
50 .set = user_set, 50 .set = user_set,
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
index 3e4803b4427e..6c978428892d 100644
--- a/fs/smbfs/dir.c
+++ b/fs/smbfs/dir.c
@@ -39,7 +39,7 @@ const struct file_operations smb_dir_operations =
39{ 39{
40 .read = generic_read_dir, 40 .read = generic_read_dir,
41 .readdir = smb_readdir, 41 .readdir = smb_readdir,
42 .ioctl = smb_ioctl, 42 .unlocked_ioctl = smb_ioctl,
43 .open = smb_dir_open, 43 .open = smb_dir_open,
44}; 44};
45 45
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index dbf6548bbf06..84ecf0e43f91 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -437,7 +437,7 @@ const struct file_operations smb_file_operations =
437 .aio_read = smb_file_aio_read, 437 .aio_read = smb_file_aio_read,
438 .write = do_sync_write, 438 .write = do_sync_write,
439 .aio_write = smb_file_aio_write, 439 .aio_write = smb_file_aio_write,
440 .ioctl = smb_ioctl, 440 .unlocked_ioctl = smb_ioctl,
441 .mmap = smb_file_mmap, 441 .mmap = smb_file_mmap,
442 .open = smb_file_open, 442 .open = smb_file_open,
443 .release = smb_file_release, 443 .release = smb_file_release,
diff --git a/fs/smbfs/ioctl.c b/fs/smbfs/ioctl.c
index dbae1f8ea26f..07215312ad39 100644
--- a/fs/smbfs/ioctl.c
+++ b/fs/smbfs/ioctl.c
@@ -13,6 +13,7 @@
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/highuid.h> 15#include <linux/highuid.h>
16#include <linux/smp_lock.h>
16#include <linux/net.h> 17#include <linux/net.h>
17 18
18#include <linux/smb_fs.h> 19#include <linux/smb_fs.h>
@@ -22,14 +23,14 @@
22 23
23#include "proto.h" 24#include "proto.h"
24 25
25int 26long
26smb_ioctl(struct inode *inode, struct file *filp, 27smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
27 unsigned int cmd, unsigned long arg)
28{ 28{
29 struct smb_sb_info *server = server_from_inode(inode); 29 struct smb_sb_info *server = server_from_inode(filp->f_path.dentry->d_inode);
30 struct smb_conn_opt opt; 30 struct smb_conn_opt opt;
31 int result = -EINVAL; 31 int result = -EINVAL;
32 32
33 lock_kernel();
33 switch (cmd) { 34 switch (cmd) {
34 uid16_t uid16; 35 uid16_t uid16;
35 uid_t uid32; 36 uid_t uid32;
@@ -62,6 +63,7 @@ smb_ioctl(struct inode *inode, struct file *filp,
62 default: 63 default:
63 break; 64 break;
64 } 65 }
66 unlock_kernel();
65 67
66 return result; 68 return result;
67} 69}
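This is the standard BKL push-down: the VFS no longer wraps ->unlocked_ioctl in the big kernel lock, so a driver not yet audited for lockless operation takes the lock itself. The general shape of the conversion (a sketch, with hypothetical examplefs names):

	static long examplefs_ioctl(struct file *filp, unsigned int cmd,
				    unsigned long arg)
	{
		long result = -ENOTTY;

		lock_kernel();	/* preserves the old .ioctl serialization */
		switch (cmd) {
		/* ... per-command handling; the inode is now reached via
		 *     filp->f_path.dentry->d_inode ... */
		}
		unlock_kernel();
		return result;
	}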
diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h
index 03f456c1b7d4..05939a6f43e6 100644
--- a/fs/smbfs/proto.h
+++ b/fs/smbfs/proto.h
@@ -67,7 +67,7 @@ extern const struct address_space_operations smb_file_aops;
67extern const struct file_operations smb_file_operations; 67extern const struct file_operations smb_file_operations;
68extern const struct inode_operations smb_file_inode_operations; 68extern const struct inode_operations smb_file_inode_operations;
69/* ioctl.c */ 69/* ioctl.c */
70extern int smb_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg); 70extern long smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
71/* smbiod.c */ 71/* smbiod.c */
72extern void smbiod_wake_up(void); 72extern void smbiod_wake_up(void);
73extern int smbiod_register_server(struct smb_sb_info *server); 73extern int smbiod_register_server(struct smb_sb_info *server);
diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c
index 54350b59046b..00b2909bd469 100644
--- a/fs/smbfs/symlink.c
+++ b/fs/smbfs/symlink.c
@@ -15,7 +15,6 @@
15#include <linux/pagemap.h> 15#include <linux/pagemap.h>
16#include <linux/net.h> 16#include <linux/net.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/slab.h>
19 18
20#include <asm/uaccess.h> 19#include <asm/uaccess.h>
21#include <asm/system.h> 20#include <asm/system.h>
diff --git a/fs/splice.c b/fs/splice.c
index 9313b6124a2e..ac22b00d86c3 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -193,8 +193,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
193 break; 193 break;
194 } 194 }
195 195
196 if (pipe->nrbufs < PIPE_BUFFERS) { 196 if (pipe->nrbufs < pipe->buffers) {
197 int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); 197 int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
198 struct pipe_buffer *buf = pipe->bufs + newbuf; 198 struct pipe_buffer *buf = pipe->bufs + newbuf;
199 199
200 buf->page = spd->pages[page_nr]; 200 buf->page = spd->pages[page_nr];
@@ -214,7 +214,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
214 214
215 if (!--spd->nr_pages) 215 if (!--spd->nr_pages)
216 break; 216 break;
217 if (pipe->nrbufs < PIPE_BUFFERS) 217 if (pipe->nrbufs < pipe->buffers)
218 continue; 218 continue;
219 219
220 break; 220 break;
@@ -265,6 +265,36 @@ static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
265 page_cache_release(spd->pages[i]); 265 page_cache_release(spd->pages[i]);
266} 266}
267 267
268/*
269 * Check if we need to grow the arrays holding pages and partial page
270 * descriptions.
271 */
272int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
273{
274 if (pipe->buffers <= PIPE_DEF_BUFFERS)
275 return 0;
276
277 spd->pages = kmalloc(pipe->buffers * sizeof(struct page *), GFP_KERNEL);
278 spd->partial = kmalloc(pipe->buffers * sizeof(struct partial_page), GFP_KERNEL);
279
280 if (spd->pages && spd->partial)
281 return 0;
282
283 kfree(spd->pages);
284 kfree(spd->partial);
285 return -ENOMEM;
286}
287
288void splice_shrink_spd(struct pipe_inode_info *pipe,
289 struct splice_pipe_desc *spd)
290{
291 if (pipe->buffers <= PIPE_DEF_BUFFERS)
292 return;
293
294 kfree(spd->pages);
295 kfree(spd->partial);
296}
297
268static int 298static int
269__generic_file_splice_read(struct file *in, loff_t *ppos, 299__generic_file_splice_read(struct file *in, loff_t *ppos,
270 struct pipe_inode_info *pipe, size_t len, 300 struct pipe_inode_info *pipe, size_t len,
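splice_grow_spd()/splice_shrink_spd() bracket every struct splice_pipe_desc user from here on, since the on-stack arrays only cover the default pipe size. The usage pattern, as a fragment:

	struct page *pages[PIPE_DEF_BUFFERS];
	struct partial_page partial[PIPE_DEF_BUFFERS];
	struct splice_pipe_desc spd = {
		.pages   = pages,
		.partial = partial,
	};

	if (splice_grow_spd(pipe, &spd))	/* kmallocs if pipe was grown */
		return -ENOMEM;
	/* ... fill spd.pages[] / spd.partial[] through spd, not the arrays ... */
	ret = splice_to_pipe(pipe, &spd);
	splice_shrink_spd(pipe, &spd);		/* frees only what grow allocated */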
@@ -272,8 +302,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 {
 	struct address_space *mapping = in->f_mapping;
 	unsigned int loff, nr_pages, req_pages;
-	struct page *pages[PIPE_BUFFERS];
-	struct partial_page partial[PIPE_BUFFERS];
+	struct page *pages[PIPE_DEF_BUFFERS];
+	struct partial_page partial[PIPE_DEF_BUFFERS];
 	struct page *page;
 	pgoff_t index, end_index;
 	loff_t isize;
@@ -286,15 +316,18 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 		.spd_release = spd_release_page,
 	};
 
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
+
 	index = *ppos >> PAGE_CACHE_SHIFT;
 	loff = *ppos & ~PAGE_CACHE_MASK;
 	req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS);
+	nr_pages = min(req_pages, pipe->buffers);
 
 	/*
 	 * Lookup the (hopefully) full range of pages we need.
 	 */
-	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
+	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages);
 	index += spd.nr_pages;
 
 	/*
@@ -335,7 +368,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 			unlock_page(page);
 		}
 
-		pages[spd.nr_pages++] = page;
+		spd.pages[spd.nr_pages++] = page;
 		index++;
 	}
 
@@ -356,7 +389,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 		 * this_len is the max we'll use from this page
 		 */
 		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
-		page = pages[page_nr];
+		page = spd.pages[page_nr];
 
 		if (PageReadahead(page))
 			page_cache_async_readahead(mapping, &in->f_ra, in,
@@ -393,8 +426,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 					error = -ENOMEM;
 					break;
 				}
-				page_cache_release(pages[page_nr]);
-				pages[page_nr] = page;
+				page_cache_release(spd.pages[page_nr]);
+				spd.pages[page_nr] = page;
 			}
 			/*
 			 * page was already under io and is now done, great
@@ -451,8 +484,8 @@ fill_it:
 			len = this_len;
 		}
 
-		partial[page_nr].offset = loff;
-		partial[page_nr].len = this_len;
+		spd.partial[page_nr].offset = loff;
+		spd.partial[page_nr].len = this_len;
 		len -= this_len;
 		loff = 0;
 		spd.nr_pages++;
@@ -464,12 +497,13 @@ fill_it:
 	 * we got, 'nr_pages' is how many pages are in the map.
 	 */
 	while (page_nr < nr_pages)
-		page_cache_release(pages[page_nr++]);
+		page_cache_release(spd.pages[page_nr++]);
 	in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
 
 	if (spd.nr_pages)
-		return splice_to_pipe(pipe, &spd);
+		error = splice_to_pipe(pipe, &spd);
 
+	splice_shrink_spd(pipe, &spd);
 	return error;
 }
 
@@ -560,9 +594,9 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	unsigned int nr_pages;
 	unsigned int nr_freed;
 	size_t offset;
-	struct page *pages[PIPE_BUFFERS];
-	struct partial_page partial[PIPE_BUFFERS];
-	struct iovec vec[PIPE_BUFFERS];
+	struct page *pages[PIPE_DEF_BUFFERS];
+	struct partial_page partial[PIPE_DEF_BUFFERS];
+	struct iovec *vec, __vec[PIPE_DEF_BUFFERS];
 	pgoff_t index;
 	ssize_t res;
 	size_t this_len;
@@ -576,11 +610,22 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 		.spd_release = spd_release_page,
 	};
 
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
+
+	res = -ENOMEM;
+	vec = __vec;
+	if (pipe->buffers > PIPE_DEF_BUFFERS) {
+		vec = kmalloc(pipe->buffers * sizeof(struct iovec), GFP_KERNEL);
+		if (!vec)
+			goto shrink_ret;
+	}
+
 	index = *ppos >> PAGE_CACHE_SHIFT;
 	offset = *ppos & ~PAGE_CACHE_MASK;
 	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
-	for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) {
+	for (i = 0; i < nr_pages && i < pipe->buffers && len; i++) {
 		struct page *page;
 
 		page = alloc_page(GFP_USER);
@@ -591,7 +636,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 		this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
 		vec[i].iov_base = (void __user *) page_address(page);
 		vec[i].iov_len = this_len;
-		pages[i] = page;
+		spd.pages[i] = page;
 		spd.nr_pages++;
 		len -= this_len;
 		offset = 0;
@@ -610,11 +655,11 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	nr_freed = 0;
 	for (i = 0; i < spd.nr_pages; i++) {
 		this_len = min_t(size_t, vec[i].iov_len, res);
-		partial[i].offset = 0;
-		partial[i].len = this_len;
+		spd.partial[i].offset = 0;
+		spd.partial[i].len = this_len;
 		if (!this_len) {
-			__free_page(pages[i]);
-			pages[i] = NULL;
+			__free_page(spd.pages[i]);
+			spd.pages[i] = NULL;
 			nr_freed++;
 		}
 		res -= this_len;
@@ -625,13 +670,18 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	if (res > 0)
 		*ppos += res;
 
+shrink_ret:
+	if (vec != __vec)
+		kfree(vec);
+	splice_shrink_spd(pipe, &spd);
 	return res;
 
 err:
 	for (i = 0; i < spd.nr_pages; i++)
-		__free_page(pages[i]);
+		__free_page(spd.pages[i]);
 
-	return error;
+	res = error;
+	goto shrink_ret;
 }
 EXPORT_SYMBOL(default_file_splice_read);
 
@@ -784,7 +834,7 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
 		if (!buf->len) {
 			buf->ops = NULL;
 			ops->release(pipe, buf);
-			pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
+			pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
 			pipe->nrbufs--;
 			if (pipe->inode)
 				sd->need_wakeup = true;
@@ -1211,7 +1261,7 @@ out_release:
 	 * If we did an incomplete transfer we must release
 	 * the pipe buffers in question:
 	 */
-	for (i = 0; i < PIPE_BUFFERS; i++) {
+	for (i = 0; i < pipe->buffers; i++) {
 		struct pipe_buffer *buf = pipe->bufs + i;
 
 		if (buf->ops) {
@@ -1371,7 +1421,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
  */
 static int get_iovec_page_array(const struct iovec __user *iov,
 				unsigned int nr_vecs, struct page **pages,
-				struct partial_page *partial, int aligned)
+				struct partial_page *partial, int aligned,
+				unsigned int pipe_buffers)
 {
 	int buffers = 0, error = 0;
 
@@ -1414,8 +1465,8 @@ static int get_iovec_page_array(const struct iovec __user *iov,
 			break;
 
 		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-		if (npages > PIPE_BUFFERS - buffers)
-			npages = PIPE_BUFFERS - buffers;
+		if (npages > pipe_buffers - buffers)
+			npages = pipe_buffers - buffers;
 
 		error = get_user_pages_fast((unsigned long)base, npages,
 					0, &pages[buffers]);
@@ -1450,7 +1501,7 @@ static int get_iovec_page_array(const struct iovec __user *iov,
 		 * or if we mapped the max number of pages that we have
 		 * room for.
 		 */
-		if (error < npages || buffers == PIPE_BUFFERS)
+		if (error < npages || buffers == pipe_buffers)
 			break;
 
 		nr_vecs--;
@@ -1593,8 +1644,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
 			     unsigned long nr_segs, unsigned int flags)
 {
 	struct pipe_inode_info *pipe;
-	struct page *pages[PIPE_BUFFERS];
-	struct partial_page partial[PIPE_BUFFERS];
+	struct page *pages[PIPE_DEF_BUFFERS];
+	struct partial_page partial[PIPE_DEF_BUFFERS];
 	struct splice_pipe_desc spd = {
 		.pages = pages,
 		.partial = partial,
@@ -1602,17 +1653,25 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
 		.ops = &user_page_pipe_buf_ops,
 		.spd_release = spd_release_page,
 	};
+	long ret;
 
 	pipe = pipe_info(file->f_path.dentry->d_inode);
 	if (!pipe)
 		return -EBADF;
 
-	spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
-					    flags & SPLICE_F_GIFT);
+	if (splice_grow_spd(pipe, &spd))
+		return -ENOMEM;
+
+	spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages,
+					    spd.partial, flags & SPLICE_F_GIFT,
+					    pipe->buffers);
 	if (spd.nr_pages <= 0)
-		return spd.nr_pages;
+		ret = spd.nr_pages;
+	else
+		ret = splice_to_pipe(pipe, &spd);
 
-	return splice_to_pipe(pipe, &spd);
+	splice_shrink_spd(pipe, &spd);
+	return ret;
 }
 
 /*
@@ -1738,13 +1797,13 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 	 * Check ->nrbufs without the inode lock first. This function
 	 * is speculative anyways, so missing one is ok.
 	 */
-	if (pipe->nrbufs < PIPE_BUFFERS)
+	if (pipe->nrbufs < pipe->buffers)
 		return 0;
 
 	ret = 0;
 	pipe_lock(pipe);
 
-	while (pipe->nrbufs >= PIPE_BUFFERS) {
+	while (pipe->nrbufs >= pipe->buffers) {
 		if (!pipe->readers) {
 			send_sig(SIGPIPE, current, 0);
 			ret = -EPIPE;
@@ -1810,7 +1869,7 @@ retry:
 		 * Cannot make any progress, because either the input
 		 * pipe is empty or the output pipe is full.
 		 */
-		if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) {
+		if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
 			/* Already processed some buffers, break */
 			if (ret)
 				break;
@@ -1831,7 +1890,7 @@ retry:
 		}
 
 		ibuf = ipipe->bufs + ipipe->curbuf;
-		nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS;
+		nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
 		obuf = opipe->bufs + nbuf;
 
 		if (len >= ibuf->len) {
@@ -1841,7 +1900,7 @@ retry:
 			*obuf = *ibuf;
 			ibuf->ops = NULL;
 			opipe->nrbufs++;
-			ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS;
+			ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
 			ipipe->nrbufs--;
 			input_wakeup = true;
 		} else {
@@ -1914,11 +1973,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 		 * If we have iterated all input buffers or ran out of
 		 * output room, break.
 		 */
-		if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
+		if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
 			break;
 
-		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
-		nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);
+		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
+		nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
 
 		/*
 		 * Get a reference to this pipe buffer,
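A detail worth noting in the conversions above: every "% PIPE_BUFFERS" becomes a mask with "pipe->buffers - 1" rather than a modulo by pipe->buffers. That is only equivalent when the buffer count is a power of two, which the resizing path (F_SETPIPE_SZ in fs/pipe.c) enforces. A sketch of the invariant:

	/*
	 * Ring-slot arithmetic assumed throughout splice.c: valid only
	 * because pipe->buffers is always kept a power of two.
	 */
	static unsigned int pipe_next_slot(unsigned int curbuf,
					   unsigned int nrbufs,
					   unsigned int buffers)
	{
		return (curbuf + nrbufs) & (buffers - 1);	/* == % buffers for 2^n */
	}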
diff --git a/fs/statfs.c b/fs/statfs.c
new file mode 100644
index 000000000000..4ef021f3b612
--- /dev/null
+++ b/fs/statfs.c
@@ -0,0 +1,196 @@
+#include <linux/syscalls.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/statfs.h>
+#include <linux/security.h>
+#include <linux/uaccess.h>
+
+int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	int retval = -ENODEV;
+
+	if (dentry) {
+		retval = -ENOSYS;
+		if (dentry->d_sb->s_op->statfs) {
+			memset(buf, 0, sizeof(*buf));
+			retval = security_sb_statfs(dentry);
+			if (retval)
+				return retval;
+			retval = dentry->d_sb->s_op->statfs(dentry, buf);
+			if (retval == 0 && buf->f_frsize == 0)
+				buf->f_frsize = buf->f_bsize;
+		}
+	}
+	return retval;
+}
+
+EXPORT_SYMBOL(vfs_statfs);
+
+static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
+{
+	struct kstatfs st;
+	int retval;
+
+	retval = vfs_statfs(dentry, &st);
+	if (retval)
+		return retval;
+
+	if (sizeof(*buf) == sizeof(st))
+		memcpy(buf, &st, sizeof(st));
+	else {
+		if (sizeof buf->f_blocks == 4) {
+			if ((st.f_blocks | st.f_bfree | st.f_bavail |
+			     st.f_bsize | st.f_frsize) &
+			    0xffffffff00000000ULL)
+				return -EOVERFLOW;
+			/*
+			 * f_files and f_ffree may be -1; it's okay to stuff
+			 * that into 32 bits
+			 */
+			if (st.f_files != -1 &&
+			    (st.f_files & 0xffffffff00000000ULL))
+				return -EOVERFLOW;
+			if (st.f_ffree != -1 &&
+			    (st.f_ffree & 0xffffffff00000000ULL))
+				return -EOVERFLOW;
+		}
+
+		buf->f_type = st.f_type;
+		buf->f_bsize = st.f_bsize;
+		buf->f_blocks = st.f_blocks;
+		buf->f_bfree = st.f_bfree;
+		buf->f_bavail = st.f_bavail;
+		buf->f_files = st.f_files;
+		buf->f_ffree = st.f_ffree;
+		buf->f_fsid = st.f_fsid;
+		buf->f_namelen = st.f_namelen;
+		buf->f_frsize = st.f_frsize;
+		memset(buf->f_spare, 0, sizeof(buf->f_spare));
+	}
+	return 0;
+}
+
+static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
+{
+	struct kstatfs st;
+	int retval;
+
+	retval = vfs_statfs(dentry, &st);
+	if (retval)
+		return retval;
+
+	if (sizeof(*buf) == sizeof(st))
+		memcpy(buf, &st, sizeof(st));
+	else {
+		buf->f_type = st.f_type;
+		buf->f_bsize = st.f_bsize;
+		buf->f_blocks = st.f_blocks;
+		buf->f_bfree = st.f_bfree;
+		buf->f_bavail = st.f_bavail;
+		buf->f_files = st.f_files;
+		buf->f_ffree = st.f_ffree;
+		buf->f_fsid = st.f_fsid;
+		buf->f_namelen = st.f_namelen;
+		buf->f_frsize = st.f_frsize;
+		memset(buf->f_spare, 0, sizeof(buf->f_spare));
+	}
+	return 0;
+}
+
+SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
+{
+	struct path path;
+	int error;
+
+	error = user_path(pathname, &path);
+	if (!error) {
+		struct statfs tmp;
+		error = vfs_statfs_native(path.dentry, &tmp);
+		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
+			error = -EFAULT;
+		path_put(&path);
+	}
+	return error;
+}
+
+SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
+{
+	struct path path;
+	long error;
+
+	if (sz != sizeof(*buf))
+		return -EINVAL;
+	error = user_path(pathname, &path);
+	if (!error) {
+		struct statfs64 tmp;
+		error = vfs_statfs64(path.dentry, &tmp);
+		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
+			error = -EFAULT;
+		path_put(&path);
+	}
+	return error;
+}
+
+SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
+{
+	struct file *file;
+	struct statfs tmp;
+	int error;
+
+	error = -EBADF;
+	file = fget(fd);
+	if (!file)
+		goto out;
+	error = vfs_statfs_native(file->f_path.dentry, &tmp);
+	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
+		error = -EFAULT;
+	fput(file);
+out:
+	return error;
+}
+
+SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
+{
+	struct file *file;
+	struct statfs64 tmp;
+	int error;
+
+	if (sz != sizeof(*buf))
+		return -EINVAL;
+
+	error = -EBADF;
+	file = fget(fd);
+	if (!file)
+		goto out;
+	error = vfs_statfs64(file->f_path.dentry, &tmp);
+	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
+		error = -EFAULT;
+	fput(file);
+out:
+	return error;
+}
+
+SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
+{
+	struct super_block *s;
+	struct ustat tmp;
+	struct kstatfs sbuf;
+	int err;
+
+	s = user_get_super(new_decode_dev(dev));
+	if (!s)
+		return -EINVAL;
+
+	err = vfs_statfs(s->s_root, &sbuf);
+	drop_super(s);
+	if (err)
+		return err;
+
+	memset(&tmp,0,sizeof(struct ustat));
+	tmp.f_tfree = sbuf.f_bfree;
+	tmp.f_tinode = sbuf.f_ffree;
+
+	return copy_to_user(ubuf, &tmp, sizeof(struct ustat)) ? -EFAULT : 0;
+}
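The only non-mechanical logic carried into the new file is the EOVERFLOW policy in vfs_statfs_native(): a 64-bit count fits the legacy 32-bit statfs field only if its upper half is clear, with f_files/f_ffree values of -1 ("unknown") exempted. The test in isolation, as a sketch:

	/*
	 * Sketch of the check above: val must fit in 32 bits, except that
	 * an "unknown" count of -1 is always allowed through.
	 */
	static int fits_32bit(u64 val, int may_be_unknown)
	{
		if (may_be_unknown && val == (u64)-1)
			return 1;
		return !(val & 0xffffffff00000000ULL);
	}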
diff --git a/fs/super.c b/fs/super.c
index 1527e6a0ee35..69688b15f1fa 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -22,23 +22,15 @@
 
 #include <linux/module.h>
 #include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/smp_lock.h>
 #include <linux/acct.h>
 #include <linux/blkdev.h>
 #include <linux/quotaops.h>
-#include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/security.h>
-#include <linux/syscalls.h>
-#include <linux/vfs.h>
 #include <linux/writeback.h>		/* for the emergency remount stuff */
 #include <linux/idr.h>
-#include <linux/kobject.h>
 #include <linux/mutex.h>
-#include <linux/file.h>
 #include <linux/backing-dev.h>
-#include <asm/uaccess.h>
 #include "internal.h"
 
 
@@ -93,9 +85,10 @@ static struct super_block *alloc_super(struct file_system_type *type)
 	 * subclass.
 	 */
	down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
-	s->s_count = S_BIAS;
+	s->s_count = 1;
 	atomic_set(&s->s_active, 1);
 	mutex_init(&s->s_vfs_rename_mutex);
+	lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
 	mutex_init(&s->s_dquot.dqio_mutex);
 	mutex_init(&s->s_dquot.dqonoff_mutex);
 	init_rwsem(&s->s_dquot.dqptr_sem);
@@ -127,39 +120,14 @@ static inline void destroy_super(struct super_block *s)
 /* Superblock refcounting  */
 
 /*
- * Drop a superblock's refcount.  Returns non-zero if the superblock was
- * destroyed.  The caller must hold sb_lock.
+ * Drop a superblock's refcount.  The caller must hold sb_lock.
  */
-static int __put_super(struct super_block *sb)
+void __put_super(struct super_block *sb)
 {
-	int ret = 0;
-
 	if (!--sb->s_count) {
+		list_del_init(&sb->s_list);
 		destroy_super(sb);
-		ret = 1;
 	}
-	return ret;
-}
-
-/*
- * Drop a superblock's refcount.
- * Returns non-zero if the superblock is about to be destroyed and
- * at least is already removed from super_blocks list, so if we are
- * making a loop through super blocks then we need to restart.
- * The caller must hold sb_lock.
- */
-int __put_super_and_need_restart(struct super_block *sb)
-{
-	/* check for race with generic_shutdown_super() */
-	if (list_empty(&sb->s_list)) {
-		/* super block is removed, need to restart... */
-		__put_super(sb);
-		return 1;
-	}
-	/* can't be the last, since s_list is still in use */
-	sb->s_count--;
-	BUG_ON(sb->s_count == 0);
-	return 0;
 }
 
 /**
@@ -178,57 +146,48 @@ void put_super(struct super_block *sb)
 
 
 /**
- * deactivate_super - drop an active reference to superblock
+ * deactivate_locked_super - drop an active reference to superblock
  * @s: superblock to deactivate
  *
- * Drops an active reference to superblock, acquiring a temprory one if
- * there is no active references left. In that case we lock superblock,
+ * Drops an active reference to superblock, converting it into a temprory
+ * one if there is no other active references left. In that case we
  * tell fs driver to shut it down and drop the temporary reference we
  * had just acquired.
+ *
+ * Caller holds exclusive lock on superblock; that lock is released.
  */
-void deactivate_super(struct super_block *s)
+void deactivate_locked_super(struct super_block *s)
 {
 	struct file_system_type *fs = s->s_type;
-	if (atomic_dec_and_lock(&s->s_active, &sb_lock)) {
-		s->s_count -= S_BIAS-1;
-		spin_unlock(&sb_lock);
+	if (atomic_dec_and_test(&s->s_active)) {
 		vfs_dq_off(s, 0);
-		down_write(&s->s_umount);
 		fs->kill_sb(s);
 		put_filesystem(fs);
 		put_super(s);
+	} else {
+		up_write(&s->s_umount);
 	}
 }
 
-EXPORT_SYMBOL(deactivate_super);
+EXPORT_SYMBOL(deactivate_locked_super);
 
 /**
- * deactivate_locked_super - drop an active reference to superblock
+ * deactivate_super - drop an active reference to superblock
  * @s: superblock to deactivate
  *
- * Equivalent of up_write(&s->s_umount); deactivate_super(s);, except that
- * it does not unlock it until it's all over.  As the result, it's safe to
- * use to dispose of new superblock on ->get_sb() failure exits - nobody
- * will see the sucker until it's all over.  Equivalent using up_write +
- * deactivate_super is safe for that purpose only if superblock is either
- * safe to use or has NULL ->s_root when we unlock.
+ * Variant of deactivate_locked_super(), except that superblock is *not*
+ * locked by caller.  If we are going to drop the final active reference,
+ * lock will be acquired prior to that.
  */
-void deactivate_locked_super(struct super_block *s)
+void deactivate_super(struct super_block *s)
 {
-	struct file_system_type *fs = s->s_type;
-	if (atomic_dec_and_lock(&s->s_active, &sb_lock)) {
-		s->s_count -= S_BIAS-1;
-		spin_unlock(&sb_lock);
-		vfs_dq_off(s, 0);
-		fs->kill_sb(s);
-		put_filesystem(fs);
-		put_super(s);
-	} else {
-		up_write(&s->s_umount);
+	if (!atomic_add_unless(&s->s_active, -1, 1)) {
+		down_write(&s->s_umount);
+		deactivate_locked_super(s);
 	}
 }
 
-EXPORT_SYMBOL(deactivate_locked_super);
+EXPORT_SYMBOL(deactivate_super);
 
 /**
  * grab_super - acquire an active reference
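With S_BIAS gone, s_active becomes a plain reference count that can only fall to zero under s_umount. The unlocked deactivate_super() exploits atomic_add_unless(): it decrements only while the count is above one, so the final reference is always dropped through the locked variant. A sketch of the resulting caller-side lifecycle (illustrative only):

	struct super_block *sb = get_active_super(bdev);	/* grabs s_active via grab_super() */
	if (sb) {
		/* ... use sb; s_umount is not held here ... */
		deactivate_super(sb);	/* last ref: takes s_umount, then ->kill_sb() */
	}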
@@ -243,22 +202,17 @@ EXPORT_SYMBOL(deactivate_locked_super);
  */
 static int grab_super(struct super_block *s) __releases(sb_lock)
 {
+	if (atomic_inc_not_zero(&s->s_active)) {
+		spin_unlock(&sb_lock);
+		return 1;
+	}
+	/* it's going away */
 	s->s_count++;
 	spin_unlock(&sb_lock);
+	/* wait for it to die */
 	down_write(&s->s_umount);
-	if (s->s_root) {
-		spin_lock(&sb_lock);
-		if (s->s_count > S_BIAS) {
-			atomic_inc(&s->s_active);
-			s->s_count--;
-			spin_unlock(&sb_lock);
-			return 1;
-		}
-		spin_unlock(&sb_lock);
-	}
 	up_write(&s->s_umount);
 	put_super(s);
-	yield();
 	return 0;
 }
 
@@ -321,8 +275,7 @@ void generic_shutdown_super(struct super_block *sb)
 	}
 	spin_lock(&sb_lock);
 	/* should be initialized for __put_super_and_need_restart() */
-	list_del_init(&sb->s_list);
-	list_del(&sb->s_instances);
+	list_del_init(&sb->s_instances);
 	spin_unlock(&sb_lock);
 	up_write(&sb->s_umount);
 }
@@ -357,6 +310,7 @@ retry:
 				up_write(&s->s_umount);
 				destroy_super(s);
 			}
+			down_write(&old->s_umount);
 			return old;
 		}
 	}
@@ -408,11 +362,12 @@ EXPORT_SYMBOL(drop_super);
  */
 void sync_supers(void)
 {
-	struct super_block *sb;
+	struct super_block *sb, *n;
 
 	spin_lock(&sb_lock);
-restart:
-	list_for_each_entry(sb, &super_blocks, s_list) {
+	list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
+		if (list_empty(&sb->s_instances))
+			continue;
 		if (sb->s_op->write_super && sb->s_dirt) {
 			sb->s_count++;
 			spin_unlock(&sb_lock);
@@ -423,14 +378,43 @@ restart:
 			up_read(&sb->s_umount);
 
 			spin_lock(&sb_lock);
-			if (__put_super_and_need_restart(sb))
-				goto restart;
+			__put_super(sb);
 		}
 	}
 	spin_unlock(&sb_lock);
 }
 
 /**
+ * iterate_supers - call function for all active superblocks
+ * @f: function to call
+ * @arg: argument to pass to it
+ *
+ * Scans the superblock list and calls given function, passing it
+ * locked superblock and given argument.
+ */
+void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
+{
+	struct super_block *sb, *n;
+
+	spin_lock(&sb_lock);
+	list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
+		if (list_empty(&sb->s_instances))
+			continue;
+		sb->s_count++;
+		spin_unlock(&sb_lock);
+
+		down_read(&sb->s_umount);
+		if (sb->s_root)
+			f(sb, arg);
+		up_read(&sb->s_umount);
+
+		spin_lock(&sb_lock);
+		__put_super(sb);
+	}
+	spin_unlock(&sb_lock);
+}
+
+/**
  * get_super - get the superblock of a device
  * @bdev: device to get the superblock for
  *
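iterate_supers() factors out the bump-ref / drop-lock / walk pattern that sync_supers() and friends used to open-code, made safe by the new rule that dying superblocks stay on s_list (with an empty s_instances) until the last passive reference is gone. A sketch of a typical callback; fs/sync.c later in this series converts sync_filesystems() to exactly this shape:

	/*
	 * Called with a passive ref held and s_umount taken for reading;
	 * only superblocks that still have a root are passed in.
	 */
	static void count_dirty_sb(struct super_block *sb, void *arg)
	{
		if (sb->s_dirt)
			(*(int *)arg)++;
	}

	/* usage: */
	int dirty = 0;
	iterate_supers(count_dirty_sb, &dirty);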
@@ -438,7 +422,7 @@ restart:
  * mounted on the device given. %NULL is returned if no match is found.
  */
 
-struct super_block * get_super(struct block_device *bdev)
+struct super_block *get_super(struct block_device *bdev)
 {
 	struct super_block *sb;
 
@@ -448,17 +432,20 @@ struct super_block * get_super(struct block_device *bdev)
 	spin_lock(&sb_lock);
 rescan:
 	list_for_each_entry(sb, &super_blocks, s_list) {
+		if (list_empty(&sb->s_instances))
+			continue;
 		if (sb->s_bdev == bdev) {
 			sb->s_count++;
 			spin_unlock(&sb_lock);
 			down_read(&sb->s_umount);
+			/* still alive? */
 			if (sb->s_root)
 				return sb;
 			up_read(&sb->s_umount);
-			/* restart only when sb is no longer on the list */
+			/* nope, got unmounted */
 			spin_lock(&sb_lock);
-			if (__put_super_and_need_restart(sb))
-				goto rescan;
+			__put_super(sb);
+			goto rescan;
 		}
 	}
 	spin_unlock(&sb_lock);
@@ -473,7 +460,7 @@ EXPORT_SYMBOL(get_super);
  *
  * Scans the superblock list and finds the superblock of the file system
  * mounted on the device given. Returns the superblock with an active
- * reference and s_umount held exclusively or %NULL if none was found.
+ * reference or %NULL if none was found.
  */
 struct super_block *get_active_super(struct block_device *bdev)
 {
@@ -482,81 +469,49 @@ struct super_block *get_active_super(struct block_device *bdev)
 	if (!bdev)
 		return NULL;
 
+restart:
 	spin_lock(&sb_lock);
 	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (sb->s_bdev != bdev)
+		if (list_empty(&sb->s_instances))
 			continue;
-
-		sb->s_count++;
-		spin_unlock(&sb_lock);
-		down_write(&sb->s_umount);
-		if (sb->s_root) {
-			spin_lock(&sb_lock);
-			if (sb->s_count > S_BIAS) {
-				atomic_inc(&sb->s_active);
-				sb->s_count--;
-				spin_unlock(&sb_lock);
+		if (sb->s_bdev == bdev) {
+			if (grab_super(sb)) /* drops sb_lock */
 				return sb;
-			}
-			spin_unlock(&sb_lock);
+			else
+				goto restart;
 		}
-		up_write(&sb->s_umount);
-		put_super(sb);
-		yield();
-		spin_lock(&sb_lock);
 	}
 	spin_unlock(&sb_lock);
 	return NULL;
 }
 
-struct super_block * user_get_super(dev_t dev)
+struct super_block *user_get_super(dev_t dev)
 {
 	struct super_block *sb;
 
 	spin_lock(&sb_lock);
 rescan:
 	list_for_each_entry(sb, &super_blocks, s_list) {
+		if (list_empty(&sb->s_instances))
+			continue;
 		if (sb->s_dev == dev) {
 			sb->s_count++;
 			spin_unlock(&sb_lock);
 			down_read(&sb->s_umount);
+			/* still alive? */
 			if (sb->s_root)
 				return sb;
 			up_read(&sb->s_umount);
-			/* restart only when sb is no longer on the list */
+			/* nope, got unmounted */
 			spin_lock(&sb_lock);
-			if (__put_super_and_need_restart(sb))
-				goto rescan;
+			__put_super(sb);
+			goto rescan;
 		}
 	}
 	spin_unlock(&sb_lock);
 	return NULL;
 }
 
-SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
-{
-	struct super_block *s;
-	struct ustat tmp;
-	struct kstatfs sbuf;
-	int err = -EINVAL;
-
-	s = user_get_super(new_decode_dev(dev));
-	if (s == NULL)
-		goto out;
-	err = vfs_statfs(s->s_root, &sbuf);
-	drop_super(s);
-	if (err)
-		goto out;
-
-	memset(&tmp,0,sizeof(struct ustat));
-	tmp.f_tfree = sbuf.f_bfree;
-	tmp.f_tinode = sbuf.f_ffree;
-
-	err = copy_to_user(ubuf,&tmp,sizeof(struct ustat)) ? -EFAULT : 0;
-out:
-	return err;
-}
-
 /**
  * do_remount_sb - asks filesystem to change mount options.
  * @sb: superblock in question
@@ -622,24 +577,24 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 
 static void do_emergency_remount(struct work_struct *work)
 {
-	struct super_block *sb;
+	struct super_block *sb, *n;
 
 	spin_lock(&sb_lock);
-	list_for_each_entry(sb, &super_blocks, s_list) {
+	list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
+		if (list_empty(&sb->s_instances))
+			continue;
 		sb->s_count++;
 		spin_unlock(&sb_lock);
 		down_write(&sb->s_umount);
 		if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) {
 			/*
-			 * ->remount_fs needs lock_kernel().
-			 *
 			 * What lock protects sb->s_flags??
 			 */
 			do_remount_sb(sb, MS_RDONLY, NULL, 1);
 		}
 		up_write(&sb->s_umount);
-		put_super(sb);
 		spin_lock(&sb_lock);
+		__put_super(sb);
 	}
 	spin_unlock(&sb_lock);
 	kfree(work);
@@ -990,6 +945,96 @@ out:
 
 EXPORT_SYMBOL_GPL(vfs_kern_mount);
 
+/**
+ * freeze_super -- lock the filesystem and force it into a consistent state
+ * @super: the super to lock
+ *
+ * Syncs the super to make sure the filesystem is consistent and calls the fs's
+ * freeze_fs. Subsequent calls to this without first thawing the fs will return
+ * -EBUSY.
+ */
+int freeze_super(struct super_block *sb)
+{
+	int ret;
+
+	atomic_inc(&sb->s_active);
+	down_write(&sb->s_umount);
+	if (sb->s_frozen) {
+		deactivate_locked_super(sb);
+		return -EBUSY;
+	}
+
+	if (sb->s_flags & MS_RDONLY) {
+		sb->s_frozen = SB_FREEZE_TRANS;
+		smp_wmb();
+		up_write(&sb->s_umount);
+		return 0;
+	}
+
+	sb->s_frozen = SB_FREEZE_WRITE;
+	smp_wmb();
+
+	sync_filesystem(sb);
+
+	sb->s_frozen = SB_FREEZE_TRANS;
+	smp_wmb();
+
+	sync_blockdev(sb->s_bdev);
+	if (sb->s_op->freeze_fs) {
+		ret = sb->s_op->freeze_fs(sb);
+		if (ret) {
+			printk(KERN_ERR
+				"VFS:Filesystem freeze failed\n");
+			sb->s_frozen = SB_UNFROZEN;
+			deactivate_locked_super(sb);
+			return ret;
+		}
+	}
+	up_write(&sb->s_umount);
+	return 0;
+}
+EXPORT_SYMBOL(freeze_super);
+
+/**
+ * thaw_super -- unlock filesystem
+ * @sb: the super to thaw
+ *
+ * Unlocks the filesystem and marks it writeable again after freeze_super().
+ */
+int thaw_super(struct super_block *sb)
+{
+	int error;
+
+	down_write(&sb->s_umount);
+	if (sb->s_frozen == SB_UNFROZEN) {
+		up_write(&sb->s_umount);
+		return -EINVAL;
+	}
+
+	if (sb->s_flags & MS_RDONLY)
+		goto out;
+
+	if (sb->s_op->unfreeze_fs) {
+		error = sb->s_op->unfreeze_fs(sb);
+		if (error) {
+			printk(KERN_ERR
+				"VFS:Filesystem thaw failed\n");
+			sb->s_frozen = SB_FREEZE_TRANS;
+			up_write(&sb->s_umount);
+			return error;
+		}
+	}
+
+out:
+	sb->s_frozen = SB_UNFROZEN;
+	smp_wmb();
+	wake_up(&sb->s_wait_unfrozen);
+	deactivate_locked_super(sb);
+
+	return 0;
+}
+EXPORT_SYMBOL(thaw_super);
+
 static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
 {
 	int err;
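freeze_super()/thaw_super() lift to the VFS what block-device snapshots previously reached through freeze_bdev()/thaw_bdev(), keyed on s_frozen plus an extra s_active reference so an unmount cannot tear the superblock down mid-freeze. A sketch of how an ioctl-style caller would drive the pair (hypothetical caller; the FIFREEZE/FITHAW plumbing lives in fs/ioctl.c):

	int err = freeze_super(sb);	/* -EBUSY if already frozen */
	if (!err) {
		/* ... writes are now blocked; take the snapshot ... */
		err = thaw_super(sb);	/* wakes writers, drops the s_active ref */
	}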
diff --git a/fs/sync.c b/fs/sync.c
index 92b228176f7c..e8cbd415e50a 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -42,7 +42,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
 	if (wait)
 		sync_inodes_sb(sb);
 	else
-		writeback_inodes_sb(sb);
+		writeback_inodes_sb_locked(sb);
 
 	if (sb->s_op->sync_fs)
 		sb->s_op->sync_fs(sb, wait);
@@ -77,50 +77,18 @@ int sync_filesystem(struct super_block *sb)
 }
 EXPORT_SYMBOL_GPL(sync_filesystem);
 
+static void sync_one_sb(struct super_block *sb, void *arg)
+{
+	if (!(sb->s_flags & MS_RDONLY) && sb->s_bdi)
+		__sync_filesystem(sb, *(int *)arg);
+}
 /*
  * Sync all the data for all the filesystems (called by sys_sync() and
  * emergency sync)
- *
- * This operation is careful to avoid the livelock which could easily happen
- * if two or more filesystems are being continuously dirtied.  s_need_sync
- * is used only here.  We set it against all filesystems and then clear it as
- * we sync them.  So redirtied filesystems are skipped.
- *
- * But if process A is currently running sync_filesystems and then process B
- * calls sync_filesystems as well, process B will set all the s_need_sync
- * flags again, which will cause process A to resync everything.  Fix that with
- * a local mutex.
 */
 static void sync_filesystems(int wait)
 {
-	struct super_block *sb;
-	static DEFINE_MUTEX(mutex);
-
-	mutex_lock(&mutex);		/* Could be down_interruptible */
-	spin_lock(&sb_lock);
-	list_for_each_entry(sb, &super_blocks, s_list)
-		sb->s_need_sync = 1;
-
-restart:
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (!sb->s_need_sync)
-			continue;
-		sb->s_need_sync = 0;
-		sb->s_count++;
-		spin_unlock(&sb_lock);
-
-		down_read(&sb->s_umount);
-		if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi)
-			__sync_filesystem(sb, wait);
-		up_read(&sb->s_umount);
-
-		/* restart only when sb is no longer on the list */
-		spin_lock(&sb_lock);
-		if (__put_super_and_need_restart(sb))
-			goto restart;
-	}
-	spin_unlock(&sb_lock);
-	mutex_unlock(&mutex);
+	iterate_supers(sync_one_sb, &wait);
 }
 
 /*
@@ -190,7 +158,6 @@ EXPORT_SYMBOL(file_fsync);
 /**
  * vfs_fsync_range - helper to sync a range of data & metadata to disk
  * @file:	file to sync
- * @dentry:	dentry of @file
  * @start:	offset in bytes of the beginning of data range to sync
  * @end:	offset in bytes of the end of data range (inclusive)
  * @datasync:	perform only datasync
@@ -198,32 +165,13 @@ EXPORT_SYMBOL(file_fsync);
 * Write back data in range @start..@end and metadata for @file to disk.  If
 * @datasync is set only metadata needed to access modified file data is
 * written.
- *
- * In case this function is called from nfsd @file may be %NULL and
- * only @dentry is set.  This can only happen when the filesystem
- * implements the export_operations API.
 */
-int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start,
-		    loff_t end, int datasync)
+int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
 {
-	const struct file_operations *fop;
-	struct address_space *mapping;
+	struct address_space *mapping = file->f_mapping;
 	int err, ret;
 
-	/*
-	 * Get mapping and operations from the file in case we have
-	 * as file, or get the default values for them in case we
-	 * don't have a struct file available.  Damn nfsd..
-	 */
-	if (file) {
-		mapping = file->f_mapping;
-		fop = file->f_op;
-	} else {
-		mapping = dentry->d_inode->i_mapping;
-		fop = dentry->d_inode->i_fop;
-	}
-
-	if (!fop || !fop->fsync) {
+	if (!file->f_op || !file->f_op->fsync) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -235,7 +183,7 @@ int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start,
 	 * livelocks in fsync_buffers_list().
 	 */
 	mutex_lock(&mapping->host->i_mutex);
-	err = fop->fsync(file, dentry, datasync);
+	err = file->f_op->fsync(file, file->f_path.dentry, datasync);
 	if (!ret)
 		ret = err;
 	mutex_unlock(&mapping->host->i_mutex);
@@ -248,19 +196,14 @@ EXPORT_SYMBOL(vfs_fsync_range);
 /**
  * vfs_fsync - perform a fsync or fdatasync on a file
  * @file:	file to sync
- * @dentry:	dentry of @file
  * @datasync:	only perform a fdatasync operation
  *
  * Write back data and metadata for @file to disk.  If @datasync is
 * set only metadata needed to access modified file data is written.
- *
- * In case this function is called from nfsd @file may be %NULL and
- * only @dentry is set.  This can only happen when the filesystem
- * implements the export_operations API.
 */
-int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int vfs_fsync(struct file *file, int datasync)
 {
-	return vfs_fsync_range(file, dentry, 0, LLONG_MAX, datasync);
+	return vfs_fsync_range(file, 0, LLONG_MAX, datasync);
 }
 EXPORT_SYMBOL(vfs_fsync);
 
@@ -271,7 +214,7 @@ static int do_fsync(unsigned int fd, int datasync)
 
 	file = fget(fd);
 	if (file) {
-		ret = vfs_fsync(file, file->f_path.dentry, datasync);
+		ret = vfs_fsync(file, datasync);
 		fput(file);
 	}
 	return ret;
@@ -299,8 +242,7 @@ int generic_write_sync(struct file *file, loff_t pos, loff_t count)
 {
 	if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host))
 		return 0;
-	return vfs_fsync_range(file, file->f_path.dentry, pos,
-			       pos + count - 1,
+	return vfs_fsync_range(file, pos, pos + count - 1,
 			       (file->f_flags & __O_SYNC) ? 0 : 1);
 }
 EXPORT_SYMBOL(generic_write_sync);
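After this change every vfs_fsync() caller must supply a real struct file; the dentry parameter existed only for nfsd, which now always has one, so the mapping and the fsync op can both be derived from the file itself. Converted call sites look like:

	ret = vfs_fsync(file, 0);	/* fsync: data + metadata */
	ret = vfs_fsync(file, 1);	/* fdatasync */
	ret = vfs_fsync_range(file, pos, pos + count - 1, 1);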
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index e9d293593e52..4e321f7353fa 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -46,9 +46,9 @@ struct bin_buffer {
 };
 
 static int
-fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count)
+fill_read(struct file *file, char *buffer, loff_t off, size_t count)
 {
-	struct sysfs_dirent *attr_sd = dentry->d_fsdata;
+	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
 	struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
 	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
 	int rc;
@@ -59,7 +59,7 @@ fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count)
 
 	rc = -EIO;
 	if (attr->read)
-		rc = attr->read(kobj, attr, buffer, off, count);
+		rc = attr->read(file, kobj, attr, buffer, off, count);
 
 	sysfs_put_active(attr_sd);
 
@@ -70,8 +70,7 @@ static ssize_t
 read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
 {
 	struct bin_buffer *bb = file->private_data;
-	struct dentry *dentry = file->f_path.dentry;
-	int size = dentry->d_inode->i_size;
+	int size = file->f_path.dentry->d_inode->i_size;
 	loff_t offs = *off;
 	int count = min_t(size_t, bytes, PAGE_SIZE);
 	char *temp;
@@ -92,7 +91,7 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
 
 	mutex_lock(&bb->mutex);
 
-	count = fill_read(dentry, bb->buffer, offs, count);
+	count = fill_read(file, bb->buffer, offs, count);
 	if (count < 0) {
 		mutex_unlock(&bb->mutex);
 		goto out_free;
@@ -117,9 +116,9 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
 }
 
 static int
-flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count)
+flush_write(struct file *file, char *buffer, loff_t offset, size_t count)
 {
-	struct sysfs_dirent *attr_sd = dentry->d_fsdata;
+	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
 	struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
 	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
 	int rc;
@@ -130,7 +129,7 @@ flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count)
 
 	rc = -EIO;
 	if (attr->write)
-		rc = attr->write(kobj, attr, buffer, offset, count);
+		rc = attr->write(file, kobj, attr, buffer, offset, count);
 
 	sysfs_put_active(attr_sd);
 
@@ -141,8 +140,7 @@ static ssize_t write(struct file *file, const char __user *userbuf,
 		     size_t bytes, loff_t *off)
 {
 	struct bin_buffer *bb = file->private_data;
-	struct dentry *dentry = file->f_path.dentry;
-	int size = dentry->d_inode->i_size;
+	int size = file->f_path.dentry->d_inode->i_size;
 	loff_t offs = *off;
 	int count = min_t(size_t, bytes, PAGE_SIZE);
 	char *temp;
@@ -165,7 +163,7 @@ static ssize_t write(struct file *file, const char __user *userbuf,
 
 	memcpy(bb->buffer, temp, count);
 
-	count = flush_write(dentry, bb->buffer, offs, count);
+	count = flush_write(file, bb->buffer, offs, count);
 	mutex_unlock(&bb->mutex);
 
 	if (count > 0)
@@ -363,7 +361,7 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
 	if (!attr->mmap)
 		goto out_put;
 
-	rc = attr->mmap(kobj, attr, vma);
+	rc = attr->mmap(file, kobj, attr, vma);
 	if (rc)
 		goto out_put;
 
@@ -501,7 +499,7 @@ int sysfs_create_bin_file(struct kobject *kobj,
 void sysfs_remove_bin_file(struct kobject *kobj,
 			   const struct bin_attribute *attr)
 {
-	sysfs_hash_and_remove(kobj->sd, attr->attr.name);
+	sysfs_hash_and_remove(kobj->sd, NULL, attr->attr.name);
 }
 
 EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
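Every bin_attribute callback (read, write, mmap) now receives the struct file of the opener as its first argument, so handlers can tell individual opens apart instead of seeing only the kobject. A sketch of a read implementation against the new signature (hypothetical attribute):

	static ssize_t my_bin_read(struct file *file, struct kobject *kobj,
				   struct bin_attribute *attr,
				   char *buf, loff_t off, size_t count)
	{
		/* per-open state, e.g. file->private_data, is now reachable */
		return 0;	/* nothing to read in this sketch */
	}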
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 590717861c7a..7e54bac8c4b0 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -380,7 +380,7 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 {
 	struct sysfs_inode_attrs *ps_iattr;
 
-	if (sysfs_find_dirent(acxt->parent_sd, sd->s_name))
+	if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name))
 		return -EEXIST;
 
 	sd->s_parent = sysfs_get(acxt->parent_sd);
@@ -533,13 +533,17 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
 * Pointer to sysfs_dirent if found, NULL if not.
 */
 struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
+				       const void *ns,
 				       const unsigned char *name)
 {
 	struct sysfs_dirent *sd;
 
-	for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling)
+	for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) {
+		if (ns && sd->s_ns && (sd->s_ns != ns))
+			continue;
 		if (!strcmp(sd->s_name, name))
 			return sd;
+	}
 	return NULL;
 }
 
@@ -558,12 +562,13 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
 * Pointer to sysfs_dirent if found, NULL if not.
 */
 struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
+				      const void *ns,
 				      const unsigned char *name)
 {
 	struct sysfs_dirent *sd;
 
 	mutex_lock(&sysfs_mutex);
-	sd = sysfs_find_dirent(parent_sd, name);
+	sd = sysfs_find_dirent(parent_sd, ns, name);
 	sysfs_get(sd);
 	mutex_unlock(&sysfs_mutex);
 
@@ -572,7 +577,8 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
 EXPORT_SYMBOL_GPL(sysfs_get_dirent);
 
 static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
-		      const char *name, struct sysfs_dirent **p_sd)
+		      enum kobj_ns_type type, const void *ns, const char *name,
+		      struct sysfs_dirent **p_sd)
 {
 	umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
 	struct sysfs_addrm_cxt acxt;
@@ -583,6 +589,9 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
 	sd = sysfs_new_dirent(name, mode, SYSFS_DIR);
 	if (!sd)
 		return -ENOMEM;
+
+	sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT);
+	sd->s_ns = ns;
 	sd->s_dir.kobj = kobj;
 
 	/* link in */
@@ -601,7 +610,33 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
 int sysfs_create_subdir(struct kobject *kobj, const char *name,
 			struct sysfs_dirent **p_sd)
 {
-	return create_dir(kobj, kobj->sd, name, p_sd);
+	return create_dir(kobj, kobj->sd,
+			  KOBJ_NS_TYPE_NONE, NULL, name, p_sd);
+}
+
+/**
+ * sysfs_read_ns_type: return associated ns_type
+ * @kobj: the kobject being queried
+ *
+ * Each kobject can be tagged with exactly one namespace type
+ * (i.e. network or user).  Return the ns_type associated with
+ * this object if any
+ */
+static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj)
+{
+	const struct kobj_ns_type_operations *ops;
+	enum kobj_ns_type type;
+
+	ops = kobj_child_ns_ops(kobj);
+	if (!ops)
+		return KOBJ_NS_TYPE_NONE;
+
+	type = ops->type;
+	BUG_ON(type <= KOBJ_NS_TYPE_NONE);
+	BUG_ON(type >= KOBJ_NS_TYPES);
+	BUG_ON(!kobj_ns_type_registered(type));
+
+	return type;
 }
 
 /**
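sysfs_read_ns_type() is the bridge between the kobject layer and the tag stored in s_flags: a kobject whose children are namespace-tagged exposes kobj_child_ns_ops(), and each tagged child supplies its tag through ktype->namespace(). A sketch of the shape such a ktype takes (entirely hypothetical names; the real user is the network device code):

	static const void *my_namespace(struct kobject *kobj)
	{
		/* return the tag, e.g. the struct net this object lives in;
		 * my_net_of() is a stand-in for the real lookup */
		return my_net_of(kobj);
	}

	static struct kobj_type my_ktype = {
		.namespace = my_namespace,
		/* .sysfs_ops, .release, ... */
	};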
@@ -610,7 +645,9 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
 */
 int sysfs_create_dir(struct kobject * kobj)
 {
+	enum kobj_ns_type type;
 	struct sysfs_dirent *parent_sd, *sd;
+	const void *ns = NULL;
 	int error = 0;
 
 	BUG_ON(!kobj);
@@ -620,7 +657,11 @@ int sysfs_create_dir(struct kobject * kobj)
 	else
 		parent_sd = &sysfs_root;
 
-	error = create_dir(kobj, parent_sd, kobject_name(kobj), &sd);
+	if (sysfs_ns_type(parent_sd))
+		ns = kobj->ktype->namespace(kobj);
+	type = sysfs_read_ns_type(kobj);
+
+	error = create_dir(kobj, parent_sd, type, ns, kobject_name(kobj), &sd);
 	if (!error)
 		kobj->sd = sd;
 	return error;
@@ -630,13 +671,19 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 				struct nameidata *nd)
 {
 	struct dentry *ret = NULL;
-	struct sysfs_dirent *parent_sd = dentry->d_parent->d_fsdata;
+	struct dentry *parent = dentry->d_parent;
+	struct sysfs_dirent *parent_sd = parent->d_fsdata;
 	struct sysfs_dirent *sd;
 	struct inode *inode;
+	enum kobj_ns_type type;
+	const void *ns;
 
 	mutex_lock(&sysfs_mutex);
 
-	sd = sysfs_find_dirent(parent_sd, dentry->d_name.name);
+	type = sysfs_ns_type(parent_sd);
+	ns = sysfs_info(dir->i_sb)->ns[type];
+
+	sd = sysfs_find_dirent(parent_sd, ns, dentry->d_name.name);
 
 	/* no such entry */
 	if (!sd) {
@@ -735,7 +782,8 @@ void sysfs_remove_dir(struct kobject * kobj)
 }
 
 int sysfs_rename(struct sysfs_dirent *sd,
-	struct sysfs_dirent *new_parent_sd, const char *new_name)
+	struct sysfs_dirent *new_parent_sd, const void *new_ns,
+	const char *new_name)
 {
 	const char *dup_name = NULL;
 	int error;
@@ -743,12 +791,12 @@ int sysfs_rename(struct sysfs_dirent *sd,
 	mutex_lock(&sysfs_mutex);
 
 	error = 0;
-	if ((sd->s_parent == new_parent_sd) &&
+	if ((sd->s_parent == new_parent_sd) && (sd->s_ns == new_ns) &&
 	    (strcmp(sd->s_name, new_name) == 0))
 		goto out;	/* nothing to rename */
 
 	error = -EEXIST;
-	if (sysfs_find_dirent(new_parent_sd, new_name))
+	if (sysfs_find_dirent(new_parent_sd, new_ns, new_name))
 		goto out;
 
 	/* rename sysfs_dirent */
@@ -770,6 +818,7 @@ int sysfs_rename(struct sysfs_dirent *sd,
 		sd->s_parent = new_parent_sd;
 		sysfs_link_sibling(sd);
 	}
+	sd->s_ns = new_ns;
 
 	error = 0;
  out:
@@ -780,19 +829,28 @@ int sysfs_rename(struct sysfs_dirent *sd,
 
 int sysfs_rename_dir(struct kobject *kobj, const char *new_name)
 {
-	return sysfs_rename(kobj->sd, kobj->sd->s_parent, new_name);
+	struct sysfs_dirent *parent_sd = kobj->sd->s_parent;
+	const void *new_ns = NULL;
+
+	if (sysfs_ns_type(parent_sd))
+		new_ns = kobj->ktype->namespace(kobj);
+
+	return sysfs_rename(kobj->sd, parent_sd, new_ns, new_name);
 }
 
 int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
 {
 	struct sysfs_dirent *sd = kobj->sd;
 	struct sysfs_dirent *new_parent_sd;
+	const void *new_ns = NULL;
 
 	BUG_ON(!sd->s_parent);
+	if (sysfs_ns_type(sd->s_parent))
+		new_ns = kobj->ktype->namespace(kobj);
 	new_parent_sd = new_parent_kobj && new_parent_kobj->sd ?
 		new_parent_kobj->sd : &sysfs_root;
 
-	return sysfs_rename(sd, new_parent_sd, sd->s_name);
+	return sysfs_rename(sd, new_parent_sd, new_ns, sd->s_name);
 }
 
 /* Relationship between s_mode and the DT_xxx types */
@@ -807,32 +865,35 @@ static int sysfs_dir_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-static struct sysfs_dirent *sysfs_dir_pos(struct sysfs_dirent *parent_sd,
-	ino_t ino, struct sysfs_dirent *pos)
+static struct sysfs_dirent *sysfs_dir_pos(const void *ns,
+	struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
 {
 	if (pos) {
 		int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) &&
 			pos->s_parent == parent_sd &&
 			ino == pos->s_ino;
 		sysfs_put(pos);
-		if (valid)
-			return pos;
+		if (!valid)
+			pos = NULL;
 	}
-	pos = NULL;
-	if ((ino > 1) && (ino < INT_MAX)) {
+	if (!pos && (ino > 1) && (ino < INT_MAX)) {
 		pos = parent_sd->s_dir.children;
 		while (pos && (ino > pos->s_ino))
 			pos = pos->s_sibling;
 	}
+	while (pos && pos->s_ns && pos->s_ns != ns)
+		pos = pos->s_sibling;
 	return pos;
 }
 
-static struct sysfs_dirent *sysfs_dir_next_pos(struct sysfs_dirent *parent_sd,
-	ino_t ino, struct sysfs_dirent *pos)
+static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
+	struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
 {
-	pos = sysfs_dir_pos(parent_sd, ino, pos);
+	pos = sysfs_dir_pos(ns, parent_sd, ino, pos);
 	if (pos)
 		pos = pos->s_sibling;
+	while (pos && pos->s_ns && pos->s_ns != ns)
+		pos = pos->s_sibling;
 	return pos;
 }
 
@@ -841,8 +902,13 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
 	struct dentry *dentry = filp->f_path.dentry;
 	struct sysfs_dirent * parent_sd = dentry->d_fsdata;
 	struct sysfs_dirent *pos = filp->private_data;
+	enum kobj_ns_type type;
+	const void *ns;
 	ino_t ino;
845 908
909 type = sysfs_ns_type(parent_sd);
910 ns = sysfs_info(dentry->d_sb)->ns[type];
911
846 if (filp->f_pos == 0) { 912 if (filp->f_pos == 0) {
847 ino = parent_sd->s_ino; 913 ino = parent_sd->s_ino;
848 if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0) 914 if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0)
@@ -857,9 +923,9 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
857 filp->f_pos++; 923 filp->f_pos++;
858 } 924 }
859 mutex_lock(&sysfs_mutex); 925 mutex_lock(&sysfs_mutex);
860 for (pos = sysfs_dir_pos(parent_sd, filp->f_pos, pos); 926 for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos);
861 pos; 927 pos;
862 pos = sysfs_dir_next_pos(parent_sd, filp->f_pos, pos)) { 928 pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) {
863 const char * name; 929 const char * name;
864 unsigned int type; 930 unsigned int type;
865 int len, ret; 931 int len, ret;
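
The lookup and readdir paths above share one filtering rule: a dirent carrying a namespace tag is visible only when the tag matches the one recorded for the mounting task's superblock, while untagged entries stay visible everywhere. A minimal standalone sketch of that walk, with toy types standing in for sysfs_dirent and s_sibling (all names here are illustrative, not the kernel's):

#include <stdio.h>

struct dirent_sketch {
	const char *name;
	const void *ns;                 /* NULL: visible in every namespace */
	struct dirent_sketch *next;     /* stands in for s_sibling */
};

/* Skip tagged entries whose tag differs from ours, as sysfs_dir_pos()
 * and sysfs_dir_next_pos() do above. */
static struct dirent_sketch *next_visible(struct dirent_sketch *pos,
					  const void *ns)
{
	while (pos && pos->ns && pos->ns != ns)
		pos = pos->next;
	return pos;
}

int main(void)
{
	int netns_a, netns_b;           /* addresses serve as opaque tags */
	struct dirent_sketch eth1 = { "eth1", &netns_b, NULL };
	struct dirent_sketch eth0 = { "eth0", &netns_a, &eth1 };
	struct dirent_sketch lo   = { "lo",   NULL,     &eth0 };

	for (struct dirent_sketch *p = next_visible(&lo, &netns_a);
	     p; p = next_visible(p->next, &netns_a))
		printf("%s\n", p->name);  /* prints lo and eth0, not eth1 */
	return 0;
}
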
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index e222b2582746..1beaa739d0a6 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -478,9 +478,12 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr)
478 mutex_lock(&sysfs_mutex); 478 mutex_lock(&sysfs_mutex);
479 479
480 if (sd && dir) 480 if (sd && dir)
481 sd = sysfs_find_dirent(sd, dir); 481 /* Only directories are tagged, so no need to pass
482 * a tag explicitly.
483 */
484 sd = sysfs_find_dirent(sd, NULL, dir);
482 if (sd && attr) 485 if (sd && attr)
483 sd = sysfs_find_dirent(sd, attr); 486 sd = sysfs_find_dirent(sd, NULL, attr);
484 if (sd) 487 if (sd)
485 sysfs_notify_dirent(sd); 488 sysfs_notify_dirent(sd);
486 489
@@ -569,7 +572,7 @@ int sysfs_add_file_to_group(struct kobject *kobj,
569 int error; 572 int error;
570 573
571 if (group) 574 if (group)
572 dir_sd = sysfs_get_dirent(kobj->sd, group); 575 dir_sd = sysfs_get_dirent(kobj->sd, NULL, group);
573 else 576 else
574 dir_sd = sysfs_get(kobj->sd); 577 dir_sd = sysfs_get(kobj->sd);
575 578
@@ -599,7 +602,7 @@ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode)
599 mutex_lock(&sysfs_mutex); 602 mutex_lock(&sysfs_mutex);
600 603
601 rc = -ENOENT; 604 rc = -ENOENT;
602 sd = sysfs_find_dirent(kobj->sd, attr->name); 605 sd = sysfs_find_dirent(kobj->sd, NULL, attr->name);
603 if (!sd) 606 if (!sd)
604 goto out; 607 goto out;
605 608
@@ -624,7 +627,7 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file);
624 627
625void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) 628void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
626{ 629{
627 sysfs_hash_and_remove(kobj->sd, attr->name); 630 sysfs_hash_and_remove(kobj->sd, NULL, attr->name);
628} 631}
629 632
630void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr) 633void sysfs_remove_files(struct kobject * kobj, const struct attribute **ptr)
@@ -646,11 +649,11 @@ void sysfs_remove_file_from_group(struct kobject *kobj,
646 struct sysfs_dirent *dir_sd; 649 struct sysfs_dirent *dir_sd;
647 650
648 if (group) 651 if (group)
649 dir_sd = sysfs_get_dirent(kobj->sd, group); 652 dir_sd = sysfs_get_dirent(kobj->sd, NULL, group);
650 else 653 else
651 dir_sd = sysfs_get(kobj->sd); 654 dir_sd = sysfs_get(kobj->sd);
652 if (dir_sd) { 655 if (dir_sd) {
653 sysfs_hash_and_remove(dir_sd, attr->name); 656 sysfs_hash_and_remove(dir_sd, NULL, attr->name);
654 sysfs_put(dir_sd); 657 sysfs_put(dir_sd);
655 } 658 }
656} 659}
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index fe611949a7f7..23c1e598792a 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -23,7 +23,7 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
23 int i; 23 int i;
24 24
25 for (i = 0, attr = grp->attrs; *attr; i++, attr++) 25 for (i = 0, attr = grp->attrs; *attr; i++, attr++)
26 sysfs_hash_and_remove(dir_sd, (*attr)->name); 26 sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
27} 27}
28 28
29static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, 29static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
@@ -39,7 +39,7 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
39 * visibility. Do this by first removing then 39 * visibility. Do this by first removing then
40 * re-adding (if required) the file */ 40 * re-adding (if required) the file */
41 if (update) 41 if (update)
42 sysfs_hash_and_remove(dir_sd, (*attr)->name); 42 sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
43 if (grp->is_visible) { 43 if (grp->is_visible) {
44 mode = grp->is_visible(kobj, *attr, i); 44 mode = grp->is_visible(kobj, *attr, i);
45 if (!mode) 45 if (!mode)
@@ -132,7 +132,7 @@ void sysfs_remove_group(struct kobject * kobj,
132 struct sysfs_dirent *sd; 132 struct sysfs_dirent *sd;
133 133
134 if (grp->name) { 134 if (grp->name) {
135 sd = sysfs_get_dirent(dir_sd, grp->name); 135 sd = sysfs_get_dirent(dir_sd, NULL, grp->name);
136 if (!sd) { 136 if (!sd) {
137 WARN(!sd, KERN_WARNING "sysfs group %p not found for " 137 WARN(!sd, KERN_WARNING "sysfs group %p not found for "
138 "kobject '%s'\n", grp, kobject_name(kobj)); 138 "kobject '%s'\n", grp, kobject_name(kobj));
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index a4a0a9419711..bbd77e95cf7f 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -324,7 +324,7 @@ void sysfs_delete_inode(struct inode *inode)
324 sysfs_put(sd); 324 sysfs_put(sd);
325} 325}
326 326
327int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name) 327int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name)
328{ 328{
329 struct sysfs_addrm_cxt acxt; 329 struct sysfs_addrm_cxt acxt;
330 struct sysfs_dirent *sd; 330 struct sysfs_dirent *sd;
@@ -334,7 +334,9 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name)
334 334
335 sysfs_addrm_start(&acxt, dir_sd); 335 sysfs_addrm_start(&acxt, dir_sd);
336 336
337 sd = sysfs_find_dirent(dir_sd, name); 337 sd = sysfs_find_dirent(dir_sd, ns, name);
338 if (sd && (sd->s_ns != ns))
339 sd = NULL;
338 if (sd) 340 if (sd)
339 sysfs_remove_one(&acxt, sd); 341 sysfs_remove_one(&acxt, sd);
340 342
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 776137828dca..281c0c9bc39f 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -35,7 +35,7 @@ static const struct super_operations sysfs_ops = {
35struct sysfs_dirent sysfs_root = { 35struct sysfs_dirent sysfs_root = {
36 .s_name = "", 36 .s_name = "",
37 .s_count = ATOMIC_INIT(1), 37 .s_count = ATOMIC_INIT(1),
38 .s_flags = SYSFS_DIR, 38 .s_flags = SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT),
39 .s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, 39 .s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
40 .s_ino = 1, 40 .s_ino = 1,
41}; 41};
@@ -72,18 +72,107 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
72 return 0; 72 return 0;
73} 73}
74 74
75static int sysfs_test_super(struct super_block *sb, void *data)
76{
77 struct sysfs_super_info *sb_info = sysfs_info(sb);
78 struct sysfs_super_info *info = data;
79 enum kobj_ns_type type;
80 int found = 1;
81
82 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) {
83 if (sb_info->ns[type] != info->ns[type])
84 found = 0;
85 }
86 return found;
87}
88
89static int sysfs_set_super(struct super_block *sb, void *data)
90{
91 int error;
92 error = set_anon_super(sb, data);
93 if (!error)
94 sb->s_fs_info = data;
95 return error;
96}
97
75static int sysfs_get_sb(struct file_system_type *fs_type, 98static int sysfs_get_sb(struct file_system_type *fs_type,
76 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 99 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
77{ 100{
78 return get_sb_single(fs_type, flags, data, sysfs_fill_super, mnt); 101 struct sysfs_super_info *info;
102 enum kobj_ns_type type;
103 struct super_block *sb;
104 int error;
105
106 error = -ENOMEM;
107 info = kzalloc(sizeof(*info), GFP_KERNEL);
108 if (!info)
109 goto out;
110
111 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
112 info->ns[type] = kobj_ns_current(type);
113
114 sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info);
115 if (IS_ERR(sb) || sb->s_fs_info != info)
116 kfree(info);
117 if (IS_ERR(sb)) {
118 error = PTR_ERR(sb);
119 goto out;
120 }
121 if (!sb->s_root) {
122 sb->s_flags = flags;
123 error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
124 if (error) {
125 deactivate_locked_super(sb);
126 goto out;
127 }
128 sb->s_flags |= MS_ACTIVE;
129 }
130
131 simple_set_mnt(mnt, sb);
132 error = 0;
133out:
134 return error;
135}
136
137static void sysfs_kill_sb(struct super_block *sb)
138{
139 struct sysfs_super_info *info = sysfs_info(sb);
140
141 /* Remove the superblock from fs_supers/s_instances
142 * so we can't find it, before freeing sysfs_super_info.
143 */
144 kill_anon_super(sb);
145 kfree(info);
79} 146}
80 147
81static struct file_system_type sysfs_fs_type = { 148static struct file_system_type sysfs_fs_type = {
82 .name = "sysfs", 149 .name = "sysfs",
83 .get_sb = sysfs_get_sb, 150 .get_sb = sysfs_get_sb,
84 .kill_sb = kill_anon_super, 151 .kill_sb = sysfs_kill_sb,
85}; 152};
86 153
154void sysfs_exit_ns(enum kobj_ns_type type, const void *ns)
155{
156 struct super_block *sb;
157
158 mutex_lock(&sysfs_mutex);
159 spin_lock(&sb_lock);
160 list_for_each_entry(sb, &sysfs_fs_type.fs_supers, s_instances) {
161 struct sysfs_super_info *info = sysfs_info(sb);
162 /*
163 * If we see a superblock on the fs_supers/s_instances
 164 * list, the unmount has not completed and sb->s_fs_info
165 * points to a valid struct sysfs_super_info.
166 */
167 /* Ignore superblocks with the wrong ns */
168 if (info->ns[type] != ns)
169 continue;
170 info->ns[type] = NULL;
171 }
172 spin_unlock(&sb_lock);
173 mutex_unlock(&sysfs_mutex);
174}
175
87int __init sysfs_init(void) 176int __init sysfs_init(void)
88{ 177{
89 int err = -ENOMEM; 178 int err = -ENOMEM;
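
sysfs_get_sb() now calls sget() with a test/set pair so that mounts sharing the same vector of namespace tags share one superblock, while a mount from a different namespace allocates a fresh one. A self-contained toy of that find-or-create pattern (toy_sget, NTAGS and the fixed-size table are invented for this sketch; the real code defers allocation, refcounting and locking to sget()):

#include <stdio.h>
#include <string.h>

#define NTAGS 2

struct toy_sb { const void *ns[NTAGS]; int used; };

static struct toy_sb supers[8];

/* Mirrors sysfs_test_super(): a superblock matches only if every
 * tag slot agrees. */
static int test_super(const struct toy_sb *sb, const void *ns[NTAGS])
{
	for (int t = 0; t < NTAGS; t++)
		if (sb->ns[t] != ns[t])
			return 0;
	return 1;
}

static struct toy_sb *toy_sget(const void *ns[NTAGS])
{
	for (int i = 0; i < 8; i++)
		if (supers[i].used && test_super(&supers[i], ns))
			return &supers[i];            /* reuse existing sb */
	for (int i = 0; i < 8; i++)
		if (!supers[i].used) {                /* "set": create new */
			memcpy(supers[i].ns, ns, sizeof(supers[i].ns));
			supers[i].used = 1;
			return &supers[i];
		}
	return NULL;
}

int main(void)
{
	int a, b;                                     /* opaque tag values */
	const void *ns1[NTAGS] = { &a, NULL };
	const void *ns2[NTAGS] = { &b, NULL };

	printf("%d\n", toy_sget(ns1) == toy_sget(ns1));  /* 1: shared sb */
	printf("%d\n", toy_sget(ns1) == toy_sget(ns2));  /* 0: fresh sb  */
	return 0;
}
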
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index b93ec51fa7ac..f71246bebfe4 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -58,6 +58,8 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
58 if (!sd) 58 if (!sd)
59 goto out_put; 59 goto out_put;
60 60
61 if (sysfs_ns_type(parent_sd))
62 sd->s_ns = target->ktype->namespace(target);
61 sd->s_symlink.target_sd = target_sd; 63 sd->s_symlink.target_sd = target_sd;
62 target_sd = NULL; /* reference is now owned by the symlink */ 64 target_sd = NULL; /* reference is now owned by the symlink */
63 65
@@ -107,6 +109,26 @@ int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target,
107} 109}
108 110
109/** 111/**
112 * sysfs_delete_link - remove symlink in object's directory.
113 * @kobj: object we're acting for.
114 * @targ: object we're pointing to.
115 * @name: name of the symlink to remove.
116 *
 117 * Unlike sysfs_remove_link, sysfs_delete_link has enough information
118 * to successfully delete symlinks in tagged directories.
119 */
120void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
121 const char *name)
122{
123 const void *ns = NULL;
124 spin_lock(&sysfs_assoc_lock);
125 if (targ->sd)
126 ns = targ->sd->s_ns;
127 spin_unlock(&sysfs_assoc_lock);
128 sysfs_hash_and_remove(kobj->sd, ns, name);
129}
130
131/**
110 * sysfs_remove_link - remove symlink in object's directory. 132 * sysfs_remove_link - remove symlink in object's directory.
111 * @kobj: object we're acting for. 133 * @kobj: object we're acting for.
112 * @name: name of the symlink to remove. 134 * @name: name of the symlink to remove.
@@ -121,7 +143,7 @@ void sysfs_remove_link(struct kobject * kobj, const char * name)
121 else 143 else
122 parent_sd = kobj->sd; 144 parent_sd = kobj->sd;
123 145
124 sysfs_hash_and_remove(parent_sd, name); 146 sysfs_hash_and_remove(parent_sd, NULL, name);
125} 147}
126 148
127/** 149/**
@@ -137,6 +159,7 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
137 const char *old, const char *new) 159 const char *old, const char *new)
138{ 160{
139 struct sysfs_dirent *parent_sd, *sd = NULL; 161 struct sysfs_dirent *parent_sd, *sd = NULL;
162 const void *old_ns = NULL, *new_ns = NULL;
140 int result; 163 int result;
141 164
142 if (!kobj) 165 if (!kobj)
@@ -144,8 +167,11 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
144 else 167 else
145 parent_sd = kobj->sd; 168 parent_sd = kobj->sd;
146 169
170 if (targ->sd)
171 old_ns = targ->sd->s_ns;
172
147 result = -ENOENT; 173 result = -ENOENT;
148 sd = sysfs_get_dirent(parent_sd, old); 174 sd = sysfs_get_dirent(parent_sd, old_ns, old);
149 if (!sd) 175 if (!sd)
150 goto out; 176 goto out;
151 177
@@ -155,7 +181,10 @@ int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
155 if (sd->s_symlink.target_sd->s_dir.kobj != targ) 181 if (sd->s_symlink.target_sd->s_dir.kobj != targ)
156 goto out; 182 goto out;
157 183
158 result = sysfs_rename(sd, parent_sd, new); 184 if (sysfs_ns_type(parent_sd))
185 new_ns = targ->ktype->namespace(targ);
186
187 result = sysfs_rename(sd, parent_sd, new_ns, new);
159 188
160out: 189out:
161 sysfs_put(sd); 190 sysfs_put(sd);
@@ -261,3 +290,4 @@ const struct inode_operations sysfs_symlink_inode_operations = {
261 290
262EXPORT_SYMBOL_GPL(sysfs_create_link); 291EXPORT_SYMBOL_GPL(sysfs_create_link);
263EXPORT_SYMBOL_GPL(sysfs_remove_link); 292EXPORT_SYMBOL_GPL(sysfs_remove_link);
293EXPORT_SYMBOL_GPL(sysfs_rename_link);
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 30f5a44fb5d3..6a13105b5594 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -58,6 +58,7 @@ struct sysfs_dirent {
58 struct sysfs_dirent *s_sibling; 58 struct sysfs_dirent *s_sibling;
59 const char *s_name; 59 const char *s_name;
60 60
61 const void *s_ns; /* namespace tag */
61 union { 62 union {
62 struct sysfs_elem_dir s_dir; 63 struct sysfs_elem_dir s_dir;
63 struct sysfs_elem_symlink s_symlink; 64 struct sysfs_elem_symlink s_symlink;
@@ -81,14 +82,27 @@ struct sysfs_dirent {
81#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) 82#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK)
82#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR) 83#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR)
83 84
84#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK 85/* identify any namespace tag on sysfs_dirents */
85#define SYSFS_FLAG_REMOVED 0x0200 86#define SYSFS_NS_TYPE_MASK 0xff00
87#define SYSFS_NS_TYPE_SHIFT 8
88
89#define SYSFS_FLAG_MASK ~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK)
90#define SYSFS_FLAG_REMOVED 0x020000
86 91
87static inline unsigned int sysfs_type(struct sysfs_dirent *sd) 92static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
88{ 93{
89 return sd->s_flags & SYSFS_TYPE_MASK; 94 return sd->s_flags & SYSFS_TYPE_MASK;
90} 95}
91 96
97/*
98 * Return any namespace tags on this dirent.
99 * enum kobj_ns_type is defined in linux/kobject.h
100 */
101static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd)
102{
103 return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT;
104}
105
92#ifdef CONFIG_DEBUG_LOCK_ALLOC 106#ifdef CONFIG_DEBUG_LOCK_ALLOC
93#define sysfs_dirent_init_lockdep(sd) \ 107#define sysfs_dirent_init_lockdep(sd) \
94do { \ 108do { \
@@ -114,6 +128,16 @@ struct sysfs_addrm_cxt {
114/* 128/*
115 * mount.c 129 * mount.c
116 */ 130 */
131
132/*
 133 * Each sb is associated with a set of namespace tags (e.g.
134 * the network namespace of the task which mounted this sysfs
135 * instance).
136 */
137struct sysfs_super_info {
138 const void *ns[KOBJ_NS_TYPES];
139};
140#define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info))
117extern struct sysfs_dirent sysfs_root; 141extern struct sysfs_dirent sysfs_root;
118extern struct kmem_cache *sysfs_dir_cachep; 142extern struct kmem_cache *sysfs_dir_cachep;
119 143
@@ -137,8 +161,10 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
137void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); 161void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
138 162
139struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, 163struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
164 const void *ns,
140 const unsigned char *name); 165 const unsigned char *name);
141struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, 166struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
167 const void *ns,
142 const unsigned char *name); 168 const unsigned char *name);
143struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type); 169struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type);
144 170
@@ -149,7 +175,7 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
149void sysfs_remove_subdir(struct sysfs_dirent *sd); 175void sysfs_remove_subdir(struct sysfs_dirent *sd);
150 176
151int sysfs_rename(struct sysfs_dirent *sd, 177int sysfs_rename(struct sysfs_dirent *sd,
152 struct sysfs_dirent *new_parent_sd, const char *new_name); 178 struct sysfs_dirent *new_parent_sd, const void *ns, const char *new_name);
153 179
154static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd) 180static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd)
155{ 181{
@@ -179,7 +205,7 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
179int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); 205int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
180int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, 206int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
181 size_t size, int flags); 207 size_t size, int flags);
182int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name); 208int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const char *name);
183int sysfs_inode_init(void); 209int sysfs_inode_init(void);
184 210
185/* 211/*
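
The new SYSFS_NS_TYPE_* macros pack the directory's namespace kind into the second byte of s_flags, which is why SYSFS_FLAG_REMOVED moves from 0x0200 up to 0x020000. A tiny standalone demonstration of the encoding; the SYSFS_TYPE_MASK value of 0x00ff and the 0x0001 stand-in for SYSFS_DIR are assumptions for this sketch, the rest is copied from the hunk above:

#include <assert.h>
#include <stdio.h>

enum kobj_ns_type { KOBJ_NS_TYPE_NONE, KOBJ_NS_TYPE_NET, KOBJ_NS_TYPES };

#define SYSFS_DIR           0x0001   /* assumed value for the sketch */
#define SYSFS_TYPE_MASK     0x00ff   /* assumed value for the sketch */
#define SYSFS_NS_TYPE_MASK  0xff00
#define SYSFS_NS_TYPE_SHIFT 8
#define SYSFS_FLAG_REMOVED  0x020000

int main(void)
{
	unsigned int flags = SYSFS_DIR
		| (KOBJ_NS_TYPE_NET << SYSFS_NS_TYPE_SHIFT)
		| SYSFS_FLAG_REMOVED;
	enum kobj_ns_type t =
		(flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT;

	/* Recover the ns type exactly as sysfs_ns_type() does. */
	assert(t == KOBJ_NS_TYPE_NET);
	/* The flag bits proper now live above both masks. */
	printf("ns type %d, flag bits %#x\n",
	       t, flags & ~(SYSFS_NS_TYPE_MASK | SYSFS_TYPE_MASK));
	return 0;
}
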
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index 241e9765cfad..bbd69bdb0fa8 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -159,15 +159,7 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode)
159 *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count); 159 *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count);
160 fs16_add(sbi, sbi->s_sb_total_free_inodes, -1); 160 fs16_add(sbi, sbi->s_sb_total_free_inodes, -1);
161 dirty_sb(sb); 161 dirty_sb(sb);
162 162 inode_init_owner(inode, dir, mode);
163 if (dir->i_mode & S_ISGID) {
164 inode->i_gid = dir->i_gid;
165 if (S_ISDIR(mode))
166 mode |= S_ISGID;
167 } else
168 inode->i_gid = current_fsgid();
169
170 inode->i_uid = current_fsuid();
171 inode->i_ino = fs16_to_cpu(sbi, ino); 163 inode->i_ino = fs16_to_cpu(sbi, ino);
172 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 164 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
173 inode->i_blocks = 0; 165 inode->i_blocks = 0;
@@ -176,7 +168,6 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode)
176 insert_inode_hash(inode); 168 insert_inode_hash(inode);
177 mark_inode_dirty(inode); 169 mark_inode_dirty(inode);
178 170
179 inode->i_mode = mode; /* for sysv_write_inode() */
180 sysv_write_inode(inode, 0); /* ensure inode not allocated again */ 171 sysv_write_inode(inode, 0); /* ensure inode not allocated again */
181 mark_inode_dirty(inode); /* cleared by sysv_write_inode() */ 172 mark_inode_dirty(inode); /* cleared by sysv_write_inode() */
182 /* That's it. */ 173 /* That's it. */
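
This sysv hunk, like the ubifs, udf and ufs hunks below, replaces the same open-coded owner setup with inode_init_owner(). A userspace toy that mirrors the rule being centralized, as visible in the deleted lines: the new inode takes the caller's fsuid, but a setgid parent directory donates its gid, and a new subdirectory inherits the setgid bit. The struct and function names are stand-ins; the real helper reads current_fsuid()/current_fsgid() itself:

#include <stdio.h>
#include <sys/stat.h>

struct toy_inode { unsigned uid, gid; mode_t mode; };

static void toy_init_owner(struct toy_inode *inode,
			   const struct toy_inode *dir, mode_t mode,
			   unsigned fsuid, unsigned fsgid)
{
	inode->uid = fsuid;
	if (dir && (dir->mode & S_ISGID)) {
		inode->gid = dir->gid;        /* inherit group from parent */
		if (S_ISDIR(mode))
			mode |= S_ISGID;      /* subdirs keep the setgid bit */
	} else {
		inode->gid = fsgid;
	}
	inode->mode = mode;
}

int main(void)
{
	struct toy_inode dir = { 0, 100, S_IFDIR | S_ISGID | 0775 };
	struct toy_inode inode;

	toy_init_owner(&inode, &dir, S_IFDIR | 0755, 1000, 1000);
	printf("gid=%u setgid=%d\n", inode.gid, !!(inode.mode & S_ISGID));
	return 0;   /* prints gid=100 setgid=1 */
}
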
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 98158de91d24..b86ab8eff79a 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -110,31 +110,14 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
110 struct timerfd_ctx *ctx = file->private_data; 110 struct timerfd_ctx *ctx = file->private_data;
111 ssize_t res; 111 ssize_t res;
112 u64 ticks = 0; 112 u64 ticks = 0;
113 DECLARE_WAITQUEUE(wait, current);
114 113
115 if (count < sizeof(ticks)) 114 if (count < sizeof(ticks))
116 return -EINVAL; 115 return -EINVAL;
117 spin_lock_irq(&ctx->wqh.lock); 116 spin_lock_irq(&ctx->wqh.lock);
118 res = -EAGAIN; 117 if (file->f_flags & O_NONBLOCK)
119 if (!ctx->ticks && !(file->f_flags & O_NONBLOCK)) { 118 res = -EAGAIN;
120 __add_wait_queue(&ctx->wqh, &wait); 119 else
121 for (res = 0;;) { 120 res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks);
122 set_current_state(TASK_INTERRUPTIBLE);
123 if (ctx->ticks) {
124 res = 0;
125 break;
126 }
127 if (signal_pending(current)) {
128 res = -ERESTARTSYS;
129 break;
130 }
131 spin_unlock_irq(&ctx->wqh.lock);
132 schedule();
133 spin_lock_irq(&ctx->wqh.lock);
134 }
135 __remove_wait_queue(&ctx->wqh, &wait);
136 __set_current_state(TASK_RUNNING);
137 }
138 if (ctx->ticks) { 121 if (ctx->ticks) {
139 ticks = ctx->ticks; 122 ticks = ctx->ticks;
140 if (ctx->expired && ctx->tintv.tv64) { 123 if (ctx->expired && ctx->tintv.tv64) {
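
The timerfd rewrite collapses the hand-rolled add_wait_queue/schedule loop into wait_event_interruptible_locked_irq(), with no behavior change visible from userspace: a blocking read still sleeps until the timer fires, then returns the expiration count as a u64. A small runnable check of that contract:

#include <stdint.h>
#include <stdio.h>
#include <sys/timerfd.h>
#include <unistd.h>

int main(void)
{
	struct itimerspec its = { .it_value = { .tv_sec = 1 } };
	uint64_t ticks;
	int fd = timerfd_create(CLOCK_MONOTONIC, 0);

	if (fd < 0 || timerfd_settime(fd, 0, &its, NULL) < 0)
		return 1;
	/* Blocking read: sleeps in the wait the patch rewrites, then
	 * returns the number of expirations since the last read. */
	if (read(fd, &ticks, sizeof(ticks)) == sizeof(ticks))
		printf("expirations: %llu\n", (unsigned long long)ticks);
	close(fd);
	return 0;
}
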
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 401e503d44a1..87ebcce72213 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -104,14 +104,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
104 */ 104 */
105 inode->i_flags |= (S_NOCMTIME); 105 inode->i_flags |= (S_NOCMTIME);
106 106
107 inode->i_uid = current_fsuid(); 107 inode_init_owner(inode, dir, mode);
108 if (dir->i_mode & S_ISGID) {
109 inode->i_gid = dir->i_gid;
110 if (S_ISDIR(mode))
111 mode |= S_ISGID;
112 } else
113 inode->i_gid = current_fsgid();
114 inode->i_mode = mode;
115 inode->i_mtime = inode->i_atime = inode->i_ctime = 108 inode->i_mtime = inode->i_atime = inode->i_ctime =
116 ubifs_current_time(inode); 109 ubifs_current_time(inode);
117 inode->i_mapping->nrpages = 0; 110 inode->i_mapping->nrpages = 0;
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 77d5cf4a7547..bcf5a16f30bb 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -64,6 +64,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
64 if (!c->ro_media) { 64 if (!c->ro_media) {
65 c->ro_media = 1; 65 c->ro_media = 1;
66 c->no_chk_data_crc = 0; 66 c->no_chk_data_crc = 0;
67 c->vfs_sb->s_flags |= MS_RDONLY;
67 ubifs_warn("switched to read-only mode, error %d", err); 68 ubifs_warn("switched to read-only mode, error %d", err);
68 dbg_dump_stack(); 69 dbg_dump_stack();
69 } 70 }
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index f0f2a436251e..3a84455c2a77 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -209,6 +209,6 @@ static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
209const struct file_operations udf_dir_operations = { 209const struct file_operations udf_dir_operations = {
210 .read = generic_read_dir, 210 .read = generic_read_dir,
211 .readdir = udf_readdir, 211 .readdir = udf_readdir,
212 .ioctl = udf_ioctl, 212 .unlocked_ioctl = udf_ioctl,
213 .fsync = simple_fsync, 213 .fsync = simple_fsync,
214}; 214};
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 4b6a46ccbf46..baae3a723946 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -37,6 +37,7 @@
37#include <linux/quotaops.h> 37#include <linux/quotaops.h>
38#include <linux/buffer_head.h> 38#include <linux/buffer_head.h>
39#include <linux/aio.h> 39#include <linux/aio.h>
40#include <linux/smp_lock.h>
40 41
41#include "udf_i.h" 42#include "udf_i.h"
42#include "udf_sb.h" 43#include "udf_sb.h"
@@ -144,50 +145,60 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
144 return retval; 145 return retval;
145} 146}
146 147
147int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, 148long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
148 unsigned long arg)
149{ 149{
150 struct inode *inode = filp->f_dentry->d_inode;
150 long old_block, new_block; 151 long old_block, new_block;
151 int result = -EINVAL; 152 int result = -EINVAL;
152 153
154 lock_kernel();
155
153 if (file_permission(filp, MAY_READ) != 0) { 156 if (file_permission(filp, MAY_READ) != 0) {
154 udf_debug("no permission to access inode %lu\n", 157 udf_debug("no permission to access inode %lu\n", inode->i_ino);
155 inode->i_ino); 158 result = -EPERM;
156 return -EPERM; 159 goto out;
157 } 160 }
158 161
159 if (!arg) { 162 if (!arg) {
160 udf_debug("invalid argument to udf_ioctl\n"); 163 udf_debug("invalid argument to udf_ioctl\n");
161 return -EINVAL; 164 result = -EINVAL;
165 goto out;
162 } 166 }
163 167
164 switch (cmd) { 168 switch (cmd) {
165 case UDF_GETVOLIDENT: 169 case UDF_GETVOLIDENT:
166 if (copy_to_user((char __user *)arg, 170 if (copy_to_user((char __user *)arg,
167 UDF_SB(inode->i_sb)->s_volume_ident, 32)) 171 UDF_SB(inode->i_sb)->s_volume_ident, 32))
168 return -EFAULT; 172 result = -EFAULT;
169 else 173 else
170 return 0; 174 result = 0;
175 goto out;
171 case UDF_RELOCATE_BLOCKS: 176 case UDF_RELOCATE_BLOCKS:
172 if (!capable(CAP_SYS_ADMIN)) 177 if (!capable(CAP_SYS_ADMIN)) {
173 return -EACCES; 178 result = -EACCES;
174 if (get_user(old_block, (long __user *)arg)) 179 goto out;
175 return -EFAULT; 180 }
181 if (get_user(old_block, (long __user *)arg)) {
182 result = -EFAULT;
183 goto out;
184 }
176 result = udf_relocate_blocks(inode->i_sb, 185 result = udf_relocate_blocks(inode->i_sb,
177 old_block, &new_block); 186 old_block, &new_block);
178 if (result == 0) 187 if (result == 0)
179 result = put_user(new_block, (long __user *)arg); 188 result = put_user(new_block, (long __user *)arg);
180 return result; 189 goto out;
181 case UDF_GETEASIZE: 190 case UDF_GETEASIZE:
182 result = put_user(UDF_I(inode)->i_lenEAttr, (int __user *)arg); 191 result = put_user(UDF_I(inode)->i_lenEAttr, (int __user *)arg);
183 break; 192 goto out;
184 case UDF_GETEABLOCK: 193 case UDF_GETEABLOCK:
185 result = copy_to_user((char __user *)arg, 194 result = copy_to_user((char __user *)arg,
186 UDF_I(inode)->i_ext.i_data, 195 UDF_I(inode)->i_ext.i_data,
187 UDF_I(inode)->i_lenEAttr) ? -EFAULT : 0; 196 UDF_I(inode)->i_lenEAttr) ? -EFAULT : 0;
188 break; 197 goto out;
189 } 198 }
190 199
200out:
201 unlock_kernel();
191 return result; 202 return result;
192} 203}
193 204
@@ -207,7 +218,7 @@ static int udf_release_file(struct inode *inode, struct file *filp)
207const struct file_operations udf_file_operations = { 218const struct file_operations udf_file_operations = {
208 .read = do_sync_read, 219 .read = do_sync_read,
209 .aio_read = generic_file_aio_read, 220 .aio_read = generic_file_aio_read,
210 .ioctl = udf_ioctl, 221 .unlocked_ioctl = udf_ioctl,
211 .open = dquot_file_open, 222 .open = dquot_file_open,
212 .mmap = generic_file_mmap, 223 .mmap = generic_file_mmap,
213 .write = do_sync_write, 224 .write = do_sync_write,
@@ -227,7 +238,7 @@ int udf_setattr(struct dentry *dentry, struct iattr *iattr)
227 if (error) 238 if (error)
228 return error; 239 return error;
229 240
230 if (iattr->ia_valid & ATTR_SIZE) 241 if (is_quota_modification(inode, iattr))
231 dquot_initialize(inode); 242 dquot_initialize(inode);
232 243
233 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || 244 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
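
udf_ioctl now matches the unlocked_ioctl prototype, so it must take and drop the BKL itself; every early return becomes result-plus-goto so the unlock on the single exit path can never be skipped. The shape of that conversion, reduced to a runnable toy with a pthread mutex standing in for lock_kernel() (names and error values are illustrative; build with -pthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;

static long toy_ioctl(unsigned int cmd, unsigned long arg)
{
	long result = -22;                 /* -EINVAL */

	pthread_mutex_lock(&big_lock);     /* was lock_kernel() */
	if (!arg)
		goto out;                  /* was: return -EINVAL */
	result = (cmd == 1) ? 0 : -25;     /* -ENOTTY for unknown cmds */
out:
	pthread_mutex_unlock(&big_lock);   /* single exit, never skipped */
	return result;
}

int main(void)
{
	printf("%ld %ld\n", toy_ioctl(1, 42), toy_ioctl(1, 0)); /* 0 -22 */
	return 0;
}
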
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index fb68c9cd0c3e..2b5586c7f02a 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -124,15 +124,8 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
124 udf_updated_lvid(sb); 124 udf_updated_lvid(sb);
125 } 125 }
126 mutex_unlock(&sbi->s_alloc_mutex); 126 mutex_unlock(&sbi->s_alloc_mutex);
127 inode->i_mode = mode; 127
128 inode->i_uid = current_fsuid(); 128 inode_init_owner(inode, dir, mode);
129 if (dir->i_mode & S_ISGID) {
130 inode->i_gid = dir->i_gid;
131 if (S_ISDIR(mode))
132 mode |= S_ISGID;
133 } else {
134 inode->i_gid = current_fsgid();
135 }
136 129
137 iinfo->i_location.logicalBlockNum = block; 130 iinfo->i_location.logicalBlockNum = block;
138 iinfo->i_location.partitionReferenceNum = 131 iinfo->i_location.partitionReferenceNum =
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 75816025f95f..585f733615dc 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -579,7 +579,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
579 inode->i_data.a_ops = &udf_aops; 579 inode->i_data.a_ops = &udf_aops;
580 inode->i_op = &udf_file_inode_operations; 580 inode->i_op = &udf_file_inode_operations;
581 inode->i_fop = &udf_file_operations; 581 inode->i_fop = &udf_file_operations;
582 inode->i_mode = mode;
583 mark_inode_dirty(inode); 582 mark_inode_dirty(inode);
584 583
585 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 584 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
@@ -627,7 +626,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
627 goto out; 626 goto out;
628 627
629 iinfo = UDF_I(inode); 628 iinfo = UDF_I(inode);
630 inode->i_uid = current_fsuid();
631 init_special_inode(inode, mode, rdev); 629 init_special_inode(inode, mode, rdev);
632 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 630 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
633 if (!fi) { 631 if (!fi) {
@@ -674,7 +672,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
674 goto out; 672 goto out;
675 673
676 err = -EIO; 674 err = -EIO;
677 inode = udf_new_inode(dir, S_IFDIR, &err); 675 inode = udf_new_inode(dir, S_IFDIR | mode, &err);
678 if (!inode) 676 if (!inode)
679 goto out; 677 goto out;
680 678
@@ -697,9 +695,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
697 FID_FILE_CHAR_DIRECTORY | FID_FILE_CHAR_PARENT; 695 FID_FILE_CHAR_DIRECTORY | FID_FILE_CHAR_PARENT;
698 udf_write_fi(inode, &cfi, fi, &fibh, NULL, NULL); 696 udf_write_fi(inode, &cfi, fi, &fibh, NULL, NULL);
699 brelse(fibh.sbh); 697 brelse(fibh.sbh);
700 inode->i_mode = S_IFDIR | mode;
701 if (dir->i_mode & S_ISGID)
702 inode->i_mode |= S_ISGID;
703 mark_inode_dirty(inode); 698 mark_inode_dirty(inode);
704 699
705 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 700 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
@@ -912,7 +907,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
912 dquot_initialize(dir); 907 dquot_initialize(dir);
913 908
914 lock_kernel(); 909 lock_kernel();
915 inode = udf_new_inode(dir, S_IFLNK, &err); 910 inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err);
916 if (!inode) 911 if (!inode)
917 goto out; 912 goto out;
918 913
@@ -923,7 +918,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
923 } 918 }
924 919
925 iinfo = UDF_I(inode); 920 iinfo = UDF_I(inode);
926 inode->i_mode = S_IFLNK | S_IRWXUGO;
927 inode->i_data.a_ops = &udf_symlink_aops; 921 inode->i_data.a_ops = &udf_symlink_aops;
928 inode->i_op = &udf_symlink_inode_operations; 922 inode->i_op = &udf_symlink_inode_operations;
929 923
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 702a1148e702..9079ff7d6255 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -130,8 +130,7 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
130 uint8_t *, uint8_t *); 130 uint8_t *, uint8_t *);
131 131
132/* file.c */ 132/* file.c */
133extern int udf_ioctl(struct inode *, struct file *, unsigned int, 133extern long udf_ioctl(struct file *, unsigned int, unsigned long);
134 unsigned long);
135extern int udf_setattr(struct dentry *dentry, struct iattr *iattr); 134extern int udf_setattr(struct dentry *dentry, struct iattr *iattr);
136/* inode.c */ 135/* inode.c */
137extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); 136extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 230ecf608026..3a959d55084d 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -303,15 +303,7 @@ cg_found:
303 sb->s_dirt = 1; 303 sb->s_dirt = 1;
304 304
305 inode->i_ino = cg * uspi->s_ipg + bit; 305 inode->i_ino = cg * uspi->s_ipg + bit;
306 inode->i_mode = mode; 306 inode_init_owner(inode, dir, mode);
307 inode->i_uid = current_fsuid();
308 if (dir->i_mode & S_ISGID) {
309 inode->i_gid = dir->i_gid;
310 if (S_ISDIR(mode))
311 inode->i_mode |= S_ISGID;
312 } else
313 inode->i_gid = current_fsgid();
314
315 inode->i_blocks = 0; 307 inode->i_blocks = 0;
316 inode->i_generation = 0; 308 inode->i_generation = 0;
317 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 309 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 80b68c3702d1..cffa756f1047 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -603,7 +603,7 @@ static void ufs_set_inode_ops(struct inode *inode)
603 if (!inode->i_blocks) 603 if (!inode->i_blocks)
604 inode->i_op = &ufs_fast_symlink_inode_operations; 604 inode->i_op = &ufs_fast_symlink_inode_operations;
605 else { 605 else {
606 inode->i_op = &page_symlink_inode_operations; 606 inode->i_op = &ufs_symlink_inode_operations;
607 inode->i_mapping->a_ops = &ufs_aops; 607 inode->i_mapping->a_ops = &ufs_aops;
608 } 608 }
609 } else 609 } else
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 118556243e7a..eabc02eb1294 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -148,7 +148,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
148 148
149 if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) { 149 if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) {
150 /* slow symlink */ 150 /* slow symlink */
151 inode->i_op = &page_symlink_inode_operations; 151 inode->i_op = &ufs_symlink_inode_operations;
152 inode->i_mapping->a_ops = &ufs_aops; 152 inode->i_mapping->a_ops = &ufs_aops;
153 err = page_symlink(inode, symname, l); 153 err = page_symlink(inode, symname, l);
154 if (err) 154 if (err)
diff --git a/fs/ufs/symlink.c b/fs/ufs/symlink.c
index c0156eda44bc..d283628b4778 100644
--- a/fs/ufs/symlink.c
+++ b/fs/ufs/symlink.c
@@ -42,4 +42,12 @@ static void *ufs_follow_link(struct dentry *dentry, struct nameidata *nd)
42const struct inode_operations ufs_fast_symlink_inode_operations = { 42const struct inode_operations ufs_fast_symlink_inode_operations = {
43 .readlink = generic_readlink, 43 .readlink = generic_readlink,
44 .follow_link = ufs_follow_link, 44 .follow_link = ufs_follow_link,
45 .setattr = ufs_setattr,
46};
47
48const struct inode_operations ufs_symlink_inode_operations = {
49 .readlink = generic_readlink,
50 .follow_link = page_follow_link_light,
51 .put_link = page_put_link,
52 .setattr = ufs_setattr,
45}; 53};
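
ufs previously used the generic page_symlink_inode_operations for slow symlinks; the split above gives both the fast flavor (target stored in the inode) and the page flavor (target stored in a data page) their own vectors so each can pick up ufs_setattr. Either flavor resolves the same way from userspace; a quick runnable check with readlink(2):

#include <limits.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[PATH_MAX];
	ssize_t n;

	/* Works the same whether the filesystem stores the target
	 * inline ("fast" symlink) or in a data page ("slow" symlink). */
	if (symlink("/tmp", "demo_link") < 0)
		perror("symlink");
	n = readlink("demo_link", buf, sizeof(buf) - 1);
	if (n >= 0) {
		buf[n] = '\0';
		printf("-> %s\n", buf);
	}
	unlink("demo_link");
	return 0;
}
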
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index d3b6270cb377..f294c44577dc 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -508,7 +508,7 @@ out:
508 * - there is no way to know old size 508 * - there is no way to know old size
509 * - there is no way inform user about error, if it happens in `truncate' 509 * - there is no way inform user about error, if it happens in `truncate'
510 */ 510 */
511static int ufs_setattr(struct dentry *dentry, struct iattr *attr) 511int ufs_setattr(struct dentry *dentry, struct iattr *attr)
512{ 512{
513 struct inode *inode = dentry->d_inode; 513 struct inode *inode = dentry->d_inode;
514 unsigned int ia_valid = attr->ia_valid; 514 unsigned int ia_valid = attr->ia_valid;
@@ -518,18 +518,18 @@ static int ufs_setattr(struct dentry *dentry, struct iattr *attr)
518 if (error) 518 if (error)
519 return error; 519 return error;
520 520
521 if (is_quota_modification(inode, attr))
522 dquot_initialize(inode);
523
521 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 524 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
522 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 525 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
523 error = dquot_transfer(inode, attr); 526 error = dquot_transfer(inode, attr);
524 if (error) 527 if (error)
525 return error; 528 return error;
526 } 529 }
527 if (ia_valid & ATTR_SIZE && 530 if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
528 attr->ia_size != i_size_read(inode)) {
529 loff_t old_i_size = inode->i_size; 531 loff_t old_i_size = inode->i_size;
530 532
531 dquot_initialize(inode);
532
533 error = vmtruncate(inode, attr->ia_size); 533 error = vmtruncate(inode, attr->ia_size);
534 if (error) 534 if (error)
535 return error; 535 return error;
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 43f9f5d5670e..179ae6b3180a 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -122,9 +122,11 @@ extern void ufs_panic (struct super_block *, const char *, const char *, ...) __
122 122
123/* symlink.c */ 123/* symlink.c */
124extern const struct inode_operations ufs_fast_symlink_inode_operations; 124extern const struct inode_operations ufs_fast_symlink_inode_operations;
125extern const struct inode_operations ufs_symlink_inode_operations;
125 126
126/* truncate.c */ 127/* truncate.c */
127extern int ufs_truncate (struct inode *, loff_t); 128extern int ufs_truncate (struct inode *, loff_t);
129extern int ufs_setattr(struct dentry *dentry, struct iattr *attr);
128 130
129static inline struct ufs_sb_info *UFS_SB(struct super_block *sb) 131static inline struct ufs_sb_info *UFS_SB(struct super_block *sb)
130{ 132{
diff --git a/fs/xattr.c b/fs/xattr.c
index 46f87e828b48..01bb8135e14a 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -590,10 +590,10 @@ strcmp_prefix(const char *a, const char *a_prefix)
590/* 590/*
591 * Find the xattr_handler with the matching prefix. 591 * Find the xattr_handler with the matching prefix.
592 */ 592 */
593static struct xattr_handler * 593static const struct xattr_handler *
594xattr_resolve_name(struct xattr_handler **handlers, const char **name) 594xattr_resolve_name(const struct xattr_handler **handlers, const char **name)
595{ 595{
596 struct xattr_handler *handler; 596 const struct xattr_handler *handler;
597 597
598 if (!*name) 598 if (!*name)
599 return NULL; 599 return NULL;
@@ -614,7 +614,7 @@ xattr_resolve_name(struct xattr_handler **handlers, const char **name)
614ssize_t 614ssize_t
615generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size) 615generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size)
616{ 616{
617 struct xattr_handler *handler; 617 const struct xattr_handler *handler;
618 618
619 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); 619 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
620 if (!handler) 620 if (!handler)
@@ -629,7 +629,7 @@ generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t s
629ssize_t 629ssize_t
630generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) 630generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
631{ 631{
632 struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr; 632 const struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr;
633 unsigned int size = 0; 633 unsigned int size = 0;
634 634
635 if (!buffer) { 635 if (!buffer) {
@@ -659,7 +659,7 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
659int 659int
660generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) 660generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags)
661{ 661{
662 struct xattr_handler *handler; 662 const struct xattr_handler *handler;
663 663
664 if (size == 0) 664 if (size == 0)
665 value = ""; /* empty EA, do not remove */ 665 value = ""; /* empty EA, do not remove */
@@ -676,7 +676,7 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz
676int 676int
677generic_removexattr(struct dentry *dentry, const char *name) 677generic_removexattr(struct dentry *dentry, const char *name)
678{ 678{
679 struct xattr_handler *handler; 679 const struct xattr_handler *handler;
680 680
681 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); 681 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
682 if (!handler) 682 if (!handler)
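
Constifying the handler tables doesn't change the dispatch: generic_*xattr still walk s_xattr for the first handler whose prefix matches the attribute name and hand the handler the remainder. A const-correct toy of that resolution (toy_handler and the two prefixes are invented; the kernel also threads dentry, flags and handler->flags through):

#include <stdio.h>
#include <string.h>

struct toy_handler { const char *prefix; };

static const struct toy_handler user_h = { "user." };
static const struct toy_handler trusted_h = { "trusted." };
static const struct toy_handler *handlers[] = { &user_h, &trusted_h, NULL };

/* Find the handler matching the name's prefix and strip the prefix,
 * as xattr_resolve_name() does above. */
static const struct toy_handler *resolve(const char **name)
{
	for (const struct toy_handler **h = handlers; *h; h++) {
		size_t n = strlen((*h)->prefix);

		if (!strncmp(*name, (*h)->prefix, n)) {
			*name += n;        /* handler sees only the suffix */
			return *h;
		}
	}
	return NULL;
}

int main(void)
{
	const char *name = "user.comment";
	const struct toy_handler *h = resolve(&name);

	printf("%s -> %s\n", h ? h->prefix : "(none)", name);
	return 0;   /* prints: user. -> comment */
}
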
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index b4769e40e8bc..c8fb13f83b3f 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -77,6 +77,7 @@ xfs-y += xfs_alloc.o \
77 xfs_itable.o \ 77 xfs_itable.o \
78 xfs_dfrag.o \ 78 xfs_dfrag.o \
79 xfs_log.o \ 79 xfs_log.o \
80 xfs_log_cil.o \
80 xfs_log_recover.o \ 81 xfs_log_recover.o \
81 xfs_mount.o \ 82 xfs_mount.o \
82 xfs_mru_cache.o \ 83 xfs_mru_cache.o \
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index a7bc925c4d60..9f769b5b38fc 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -440,14 +440,14 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
440 return error; 440 return error;
441} 441}
442 442
443struct xattr_handler xfs_xattr_acl_access_handler = { 443const struct xattr_handler xfs_xattr_acl_access_handler = {
444 .prefix = POSIX_ACL_XATTR_ACCESS, 444 .prefix = POSIX_ACL_XATTR_ACCESS,
445 .flags = ACL_TYPE_ACCESS, 445 .flags = ACL_TYPE_ACCESS,
446 .get = xfs_xattr_acl_get, 446 .get = xfs_xattr_acl_get,
447 .set = xfs_xattr_acl_set, 447 .set = xfs_xattr_acl_set,
448}; 448};
449 449
450struct xattr_handler xfs_xattr_acl_default_handler = { 450const struct xattr_handler xfs_xattr_acl_default_handler = {
451 .prefix = POSIX_ACL_XATTR_DEFAULT, 451 .prefix = POSIX_ACL_XATTR_DEFAULT,
452 .flags = ACL_TYPE_DEFAULT, 452 .flags = ACL_TYPE_DEFAULT,
453 .get = xfs_xattr_acl_get, 453 .get = xfs_xattr_acl_get,
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 0f8b9968a803..089eaca860b4 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -45,6 +45,15 @@
45#include <linux/pagevec.h> 45#include <linux/pagevec.h>
46#include <linux/writeback.h> 46#include <linux/writeback.h>
47 47
48/*
49 * Types of I/O for bmap clustering and I/O completion tracking.
50 */
51enum {
52 IO_READ, /* mapping for a read */
53 IO_DELAY, /* mapping covers delalloc region */
54 IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
55 IO_NEW /* just allocated */
56};
48 57
49/* 58/*
50 * Prime number of hash buckets since address is used as the key. 59 * Prime number of hash buckets since address is used as the key.
@@ -103,8 +112,9 @@ xfs_count_page_state(
103 112
104STATIC struct block_device * 113STATIC struct block_device *
105xfs_find_bdev_for_inode( 114xfs_find_bdev_for_inode(
106 struct xfs_inode *ip) 115 struct inode *inode)
107{ 116{
117 struct xfs_inode *ip = XFS_I(inode);
108 struct xfs_mount *mp = ip->i_mount; 118 struct xfs_mount *mp = ip->i_mount;
109 119
110 if (XFS_IS_REALTIME_INODE(ip)) 120 if (XFS_IS_REALTIME_INODE(ip))
@@ -183,7 +193,7 @@ xfs_setfilesize(
183 xfs_fsize_t isize; 193 xfs_fsize_t isize;
184 194
185 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); 195 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
186 ASSERT(ioend->io_type != IOMAP_READ); 196 ASSERT(ioend->io_type != IO_READ);
187 197
188 if (unlikely(ioend->io_error)) 198 if (unlikely(ioend->io_error))
189 return 0; 199 return 0;
@@ -214,7 +224,7 @@ xfs_finish_ioend(
214 if (atomic_dec_and_test(&ioend->io_remaining)) { 224 if (atomic_dec_and_test(&ioend->io_remaining)) {
215 struct workqueue_struct *wq; 225 struct workqueue_struct *wq;
216 226
217 wq = (ioend->io_type == IOMAP_UNWRITTEN) ? 227 wq = (ioend->io_type == IO_UNWRITTEN) ?
218 xfsconvertd_workqueue : xfsdatad_workqueue; 228 xfsconvertd_workqueue : xfsdatad_workqueue;
219 queue_work(wq, &ioend->io_work); 229 queue_work(wq, &ioend->io_work);
220 if (wait) 230 if (wait)
@@ -237,7 +247,7 @@ xfs_end_io(
237 * For unwritten extents we need to issue transactions to convert a 247 * For unwritten extents we need to issue transactions to convert a
 238 * range to normal written extents after the data I/O has finished. 248 * range to normal written extents after the data I/O has finished.
239 */ 249 */
240 if (ioend->io_type == IOMAP_UNWRITTEN && 250 if (ioend->io_type == IO_UNWRITTEN &&
241 likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { 251 likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {
242 252
243 error = xfs_iomap_write_unwritten(ip, ioend->io_offset, 253 error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
@@ -250,7 +260,7 @@ xfs_end_io(
250 * We might have to update the on-disk file size after extending 260 * We might have to update the on-disk file size after extending
251 * writes. 261 * writes.
252 */ 262 */
253 if (ioend->io_type != IOMAP_READ) { 263 if (ioend->io_type != IO_READ) {
254 error = xfs_setfilesize(ioend); 264 error = xfs_setfilesize(ioend);
255 ASSERT(!error || error == EAGAIN); 265 ASSERT(!error || error == EAGAIN);
256 } 266 }
@@ -309,21 +319,25 @@ xfs_map_blocks(
309 struct inode *inode, 319 struct inode *inode,
310 loff_t offset, 320 loff_t offset,
311 ssize_t count, 321 ssize_t count,
312 xfs_iomap_t *mapp, 322 struct xfs_bmbt_irec *imap,
313 int flags) 323 int flags)
314{ 324{
315 int nmaps = 1; 325 int nmaps = 1;
326 int new = 0;
316 327
317 return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps); 328 return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new);
318} 329}
319 330
320STATIC int 331STATIC int
321xfs_iomap_valid( 332xfs_imap_valid(
322 xfs_iomap_t *iomapp, 333 struct inode *inode,
323 loff_t offset) 334 struct xfs_bmbt_irec *imap,
335 xfs_off_t offset)
324{ 336{
325 return offset >= iomapp->iomap_offset && 337 offset >>= inode->i_blkbits;
326 offset < iomapp->iomap_offset + iomapp->iomap_bsize; 338
339 return offset >= imap->br_startoff &&
340 offset < imap->br_startoff + imap->br_blockcount;
327} 341}
328 342
329/* 343/*
@@ -554,19 +568,23 @@ xfs_add_to_ioend(
554 568
555STATIC void 569STATIC void
556xfs_map_buffer( 570xfs_map_buffer(
571 struct inode *inode,
557 struct buffer_head *bh, 572 struct buffer_head *bh,
558 xfs_iomap_t *mp, 573 struct xfs_bmbt_irec *imap,
559 xfs_off_t offset, 574 xfs_off_t offset)
560 uint block_bits)
561{ 575{
562 sector_t bn; 576 sector_t bn;
577 struct xfs_mount *m = XFS_I(inode)->i_mount;
578 xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
579 xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);
563 580
564 ASSERT(mp->iomap_bn != IOMAP_DADDR_NULL); 581 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
582 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
565 583
566 bn = (mp->iomap_bn >> (block_bits - BBSHIFT)) + 584 bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
567 ((offset - mp->iomap_offset) >> block_bits); 585 ((offset - iomap_offset) >> inode->i_blkbits);
568 586
569 ASSERT(bn || (mp->iomap_flags & IOMAP_REALTIME)); 587 ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));
570 588
571 bh->b_blocknr = bn; 589 bh->b_blocknr = bn;
572 set_buffer_mapped(bh); 590 set_buffer_mapped(bh);
@@ -574,17 +592,17 @@ xfs_map_buffer(
574 592
575STATIC void 593STATIC void
576xfs_map_at_offset( 594xfs_map_at_offset(
595 struct inode *inode,
577 struct buffer_head *bh, 596 struct buffer_head *bh,
578 loff_t offset, 597 struct xfs_bmbt_irec *imap,
579 int block_bits, 598 xfs_off_t offset)
580 xfs_iomap_t *iomapp)
581{ 599{
582 ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE)); 600 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
583 ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY)); 601 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
584 602
585 lock_buffer(bh); 603 lock_buffer(bh);
586 xfs_map_buffer(bh, iomapp, offset, block_bits); 604 xfs_map_buffer(inode, bh, imap, offset);
587 bh->b_bdev = iomapp->iomap_target->bt_bdev; 605 bh->b_bdev = xfs_find_bdev_for_inode(inode);
588 set_buffer_mapped(bh); 606 set_buffer_mapped(bh);
589 clear_buffer_delay(bh); 607 clear_buffer_delay(bh);
590 clear_buffer_unwritten(bh); 608 clear_buffer_unwritten(bh);
@@ -713,11 +731,11 @@ xfs_is_delayed_page(
713 bh = head = page_buffers(page); 731 bh = head = page_buffers(page);
714 do { 732 do {
715 if (buffer_unwritten(bh)) 733 if (buffer_unwritten(bh))
716 acceptable = (type == IOMAP_UNWRITTEN); 734 acceptable = (type == IO_UNWRITTEN);
717 else if (buffer_delay(bh)) 735 else if (buffer_delay(bh))
718 acceptable = (type == IOMAP_DELAY); 736 acceptable = (type == IO_DELAY);
719 else if (buffer_dirty(bh) && buffer_mapped(bh)) 737 else if (buffer_dirty(bh) && buffer_mapped(bh))
720 acceptable = (type == IOMAP_NEW); 738 acceptable = (type == IO_NEW);
721 else 739 else
722 break; 740 break;
723 } while ((bh = bh->b_this_page) != head); 741 } while ((bh = bh->b_this_page) != head);
@@ -740,7 +758,7 @@ xfs_convert_page(
740 struct inode *inode, 758 struct inode *inode,
741 struct page *page, 759 struct page *page,
742 loff_t tindex, 760 loff_t tindex,
743 xfs_iomap_t *mp, 761 struct xfs_bmbt_irec *imap,
744 xfs_ioend_t **ioendp, 762 xfs_ioend_t **ioendp,
745 struct writeback_control *wbc, 763 struct writeback_control *wbc,
746 int startio, 764 int startio,
@@ -750,7 +768,6 @@ xfs_convert_page(
750 xfs_off_t end_offset; 768 xfs_off_t end_offset;
751 unsigned long p_offset; 769 unsigned long p_offset;
752 unsigned int type; 770 unsigned int type;
753 int bbits = inode->i_blkbits;
754 int len, page_dirty; 771 int len, page_dirty;
755 int count = 0, done = 0, uptodate = 1; 772 int count = 0, done = 0, uptodate = 1;
756 xfs_off_t offset = page_offset(page); 773 xfs_off_t offset = page_offset(page);
@@ -802,19 +819,19 @@ xfs_convert_page(
802 819
803 if (buffer_unwritten(bh) || buffer_delay(bh)) { 820 if (buffer_unwritten(bh) || buffer_delay(bh)) {
804 if (buffer_unwritten(bh)) 821 if (buffer_unwritten(bh))
805 type = IOMAP_UNWRITTEN; 822 type = IO_UNWRITTEN;
806 else 823 else
807 type = IOMAP_DELAY; 824 type = IO_DELAY;
808 825
809 if (!xfs_iomap_valid(mp, offset)) { 826 if (!xfs_imap_valid(inode, imap, offset)) {
810 done = 1; 827 done = 1;
811 continue; 828 continue;
812 } 829 }
813 830
814 ASSERT(!(mp->iomap_flags & IOMAP_HOLE)); 831 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
815 ASSERT(!(mp->iomap_flags & IOMAP_DELAY)); 832 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
816 833
817 xfs_map_at_offset(bh, offset, bbits, mp); 834 xfs_map_at_offset(inode, bh, imap, offset);
818 if (startio) { 835 if (startio) {
819 xfs_add_to_ioend(inode, bh, offset, 836 xfs_add_to_ioend(inode, bh, offset,
820 type, ioendp, done); 837 type, ioendp, done);
@@ -826,7 +843,7 @@ xfs_convert_page(
826 page_dirty--; 843 page_dirty--;
827 count++; 844 count++;
828 } else { 845 } else {
829 type = IOMAP_NEW; 846 type = IO_NEW;
830 if (buffer_mapped(bh) && all_bh && startio) { 847 if (buffer_mapped(bh) && all_bh && startio) {
831 lock_buffer(bh); 848 lock_buffer(bh);
832 xfs_add_to_ioend(inode, bh, offset, 849 xfs_add_to_ioend(inode, bh, offset,
@@ -866,7 +883,7 @@ STATIC void
866xfs_cluster_write( 883xfs_cluster_write(
867 struct inode *inode, 884 struct inode *inode,
868 pgoff_t tindex, 885 pgoff_t tindex,
869 xfs_iomap_t *iomapp, 886 struct xfs_bmbt_irec *imap,
870 xfs_ioend_t **ioendp, 887 xfs_ioend_t **ioendp,
871 struct writeback_control *wbc, 888 struct writeback_control *wbc,
872 int startio, 889 int startio,
@@ -885,7 +902,7 @@ xfs_cluster_write(
885 902
886 for (i = 0; i < pagevec_count(&pvec); i++) { 903 for (i = 0; i < pagevec_count(&pvec); i++) {
887 done = xfs_convert_page(inode, pvec.pages[i], tindex++, 904 done = xfs_convert_page(inode, pvec.pages[i], tindex++,
888 iomapp, ioendp, wbc, startio, all_bh); 905 imap, ioendp, wbc, startio, all_bh);
889 if (done) 906 if (done)
890 break; 907 break;
891 } 908 }
@@ -930,7 +947,7 @@ xfs_aops_discard_page(
930 loff_t offset = page_offset(page); 947 loff_t offset = page_offset(page);
931 ssize_t len = 1 << inode->i_blkbits; 948 ssize_t len = 1 << inode->i_blkbits;
932 949
933 if (!xfs_is_delayed_page(page, IOMAP_DELAY)) 950 if (!xfs_is_delayed_page(page, IO_DELAY))
934 goto out_invalidate; 951 goto out_invalidate;
935 952
936 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 953 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -1042,15 +1059,15 @@ xfs_page_state_convert(
1042 int unmapped) /* also implies page uptodate */ 1059 int unmapped) /* also implies page uptodate */
1043{ 1060{
1044 struct buffer_head *bh, *head; 1061 struct buffer_head *bh, *head;
1045 xfs_iomap_t iomap; 1062 struct xfs_bmbt_irec imap;
1046 xfs_ioend_t *ioend = NULL, *iohead = NULL; 1063 xfs_ioend_t *ioend = NULL, *iohead = NULL;
1047 loff_t offset; 1064 loff_t offset;
1048 unsigned long p_offset = 0; 1065 unsigned long p_offset = 0;
1049 unsigned int type; 1066 unsigned int type;
1050 __uint64_t end_offset; 1067 __uint64_t end_offset;
1051 pgoff_t end_index, last_index, tlast; 1068 pgoff_t end_index, last_index;
1052 ssize_t size, len; 1069 ssize_t size, len;
1053 int flags, err, iomap_valid = 0, uptodate = 1; 1070 int flags, err, imap_valid = 0, uptodate = 1;
1054 int page_dirty, count = 0; 1071 int page_dirty, count = 0;
1055 int trylock = 0; 1072 int trylock = 0;
1056 int all_bh = unmapped; 1073 int all_bh = unmapped;
@@ -1097,7 +1114,7 @@ xfs_page_state_convert(
1097 bh = head = page_buffers(page); 1114 bh = head = page_buffers(page);
1098 offset = page_offset(page); 1115 offset = page_offset(page);
1099 flags = BMAPI_READ; 1116 flags = BMAPI_READ;
1100 type = IOMAP_NEW; 1117 type = IO_NEW;
1101 1118
1102 /* TODO: cleanup count and page_dirty */ 1119 /* TODO: cleanup count and page_dirty */
1103 1120
@@ -1111,12 +1128,12 @@ xfs_page_state_convert(
1111 * the iomap is actually still valid, but the ioend 1128 * the iomap is actually still valid, but the ioend
 1112 * isn't. This shouldn't happen too often. 1129 * isn't. This shouldn't happen too often.
1113 */ 1130 */
1114 iomap_valid = 0; 1131 imap_valid = 0;
1115 continue; 1132 continue;
1116 } 1133 }
1117 1134
1118 if (iomap_valid) 1135 if (imap_valid)
1119 iomap_valid = xfs_iomap_valid(&iomap, offset); 1136 imap_valid = xfs_imap_valid(inode, &imap, offset);
1120 1137
1121 /* 1138 /*
1122 * First case, map an unwritten extent and prepare for 1139 * First case, map an unwritten extent and prepare for
@@ -1137,20 +1154,20 @@ xfs_page_state_convert(
1137 * Make sure we don't use a read-only iomap 1154 * Make sure we don't use a read-only iomap
1138 */ 1155 */
1139 if (flags == BMAPI_READ) 1156 if (flags == BMAPI_READ)
1140 iomap_valid = 0; 1157 imap_valid = 0;
1141 1158
1142 if (buffer_unwritten(bh)) { 1159 if (buffer_unwritten(bh)) {
1143 type = IOMAP_UNWRITTEN; 1160 type = IO_UNWRITTEN;
1144 flags = BMAPI_WRITE | BMAPI_IGNSTATE; 1161 flags = BMAPI_WRITE | BMAPI_IGNSTATE;
1145 } else if (buffer_delay(bh)) { 1162 } else if (buffer_delay(bh)) {
1146 type = IOMAP_DELAY; 1163 type = IO_DELAY;
1147 flags = BMAPI_ALLOCATE | trylock; 1164 flags = BMAPI_ALLOCATE | trylock;
1148 } else { 1165 } else {
1149 type = IOMAP_NEW; 1166 type = IO_NEW;
1150 flags = BMAPI_WRITE | BMAPI_MMAP; 1167 flags = BMAPI_WRITE | BMAPI_MMAP;
1151 } 1168 }
1152 1169
1153 if (!iomap_valid) { 1170 if (!imap_valid) {
1154 /* 1171 /*
1155 * if we didn't have a valid mapping then we 1172 * if we didn't have a valid mapping then we
1156 * need to ensure that we put the new mapping 1173 * need to ensure that we put the new mapping
@@ -1160,7 +1177,7 @@ xfs_page_state_convert(
1160 * for unwritten extent conversion. 1177 * for unwritten extent conversion.
1161 */ 1178 */
1162 new_ioend = 1; 1179 new_ioend = 1;
1163 if (type == IOMAP_NEW) { 1180 if (type == IO_NEW) {
1164 size = xfs_probe_cluster(inode, 1181 size = xfs_probe_cluster(inode,
1165 page, bh, head, 0); 1182 page, bh, head, 0);
1166 } else { 1183 } else {
@@ -1168,14 +1185,14 @@ xfs_page_state_convert(
1168 } 1185 }
1169 1186
1170 err = xfs_map_blocks(inode, offset, size, 1187 err = xfs_map_blocks(inode, offset, size,
1171 &iomap, flags); 1188 &imap, flags);
1172 if (err) 1189 if (err)
1173 goto error; 1190 goto error;
1174 iomap_valid = xfs_iomap_valid(&iomap, offset); 1191 imap_valid = xfs_imap_valid(inode, &imap,
1192 offset);
1175 } 1193 }
1176 if (iomap_valid) { 1194 if (imap_valid) {
1177 xfs_map_at_offset(bh, offset, 1195 xfs_map_at_offset(inode, bh, &imap, offset);
1178 inode->i_blkbits, &iomap);
1179 if (startio) { 1196 if (startio) {
1180 xfs_add_to_ioend(inode, bh, offset, 1197 xfs_add_to_ioend(inode, bh, offset,
1181 type, &ioend, 1198 type, &ioend,
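
The branch rewritten above is the writeback state machine's classification step: each buffer_head is typed as unwritten, delalloc, or plain overwrite, and the BMAPI flags for the following mapping call fall out of that type. A minimal userspace sketch of just the classification; the struct and the demo in main() are the example's own, not from the patch:

#include <stdio.h>

enum io_type { IO_NEW, IO_DELAY, IO_UNWRITTEN };	/* names as in the patch */

struct bh_state { int unwritten, delay; };	/* stand-in for buffer_head bits */

/* Mirror of the classification above: unwritten extents get converted,
 * delalloc buffers get real blocks allocated, and everything else is a
 * plain overwrite/append ("new"). */
static enum io_type classify(const struct bh_state *bh)
{
	if (bh->unwritten)
		return IO_UNWRITTEN;
	if (bh->delay)
		return IO_DELAY;
	return IO_NEW;
}

int main(void)
{
	struct bh_state b = { .delay = 1 };

	printf("%d\n", classify(&b));	/* prints 1 == IO_DELAY */
	return 0;
}
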
@@ -1194,40 +1211,41 @@ xfs_page_state_convert(
1194 * That means it must already have extents allocated 1211 * That means it must already have extents allocated
1195 * underneath it. Map the extent by reading it. 1212 * underneath it. Map the extent by reading it.
1196 */ 1213 */
1197 if (!iomap_valid || flags != BMAPI_READ) { 1214 if (!imap_valid || flags != BMAPI_READ) {
1198 flags = BMAPI_READ; 1215 flags = BMAPI_READ;
1199 size = xfs_probe_cluster(inode, page, bh, 1216 size = xfs_probe_cluster(inode, page, bh,
1200 head, 1); 1217 head, 1);
1201 err = xfs_map_blocks(inode, offset, size, 1218 err = xfs_map_blocks(inode, offset, size,
1202 &iomap, flags); 1219 &imap, flags);
1203 if (err) 1220 if (err)
1204 goto error; 1221 goto error;
1205 iomap_valid = xfs_iomap_valid(&iomap, offset); 1222 imap_valid = xfs_imap_valid(inode, &imap,
1223 offset);
1206 } 1224 }
1207 1225
1208 /* 1226 /*
1209 * We set the type to IOMAP_NEW in case we are doing a 1227 * We set the type to IO_NEW in case we are doing a
1210 * small write at EOF that is extending the file but 1228 * small write at EOF that is extending the file but
1211 * without needing an allocation. We need to update the 1229 * without needing an allocation. We need to update the
1212 * file size on I/O completion in this case so it is 1230 * file size on I/O completion in this case so it is
1213 * the same case as having just allocated a new extent 1231 * the same case as having just allocated a new extent
1214 * that we are writing into for the first time. 1232 * that we are writing into for the first time.
1215 */ 1233 */
1216 type = IOMAP_NEW; 1234 type = IO_NEW;
1217 if (trylock_buffer(bh)) { 1235 if (trylock_buffer(bh)) {
1218 ASSERT(buffer_mapped(bh)); 1236 ASSERT(buffer_mapped(bh));
1219 if (iomap_valid) 1237 if (imap_valid)
1220 all_bh = 1; 1238 all_bh = 1;
1221 xfs_add_to_ioend(inode, bh, offset, type, 1239 xfs_add_to_ioend(inode, bh, offset, type,
1222 &ioend, !iomap_valid); 1240 &ioend, !imap_valid);
1223 page_dirty--; 1241 page_dirty--;
1224 count++; 1242 count++;
1225 } else { 1243 } else {
1226 iomap_valid = 0; 1244 imap_valid = 0;
1227 } 1245 }
1228 } else if ((buffer_uptodate(bh) || PageUptodate(page)) && 1246 } else if ((buffer_uptodate(bh) || PageUptodate(page)) &&
1229 (unmapped || startio)) { 1247 (unmapped || startio)) {
1230 iomap_valid = 0; 1248 imap_valid = 0;
1231 } 1249 }
1232 1250
1233 if (!iohead) 1251 if (!iohead)
@@ -1241,12 +1259,23 @@ xfs_page_state_convert(
1241 if (startio) 1259 if (startio)
1242 xfs_start_page_writeback(page, 1, count); 1260 xfs_start_page_writeback(page, 1, count);
1243 1261
1244 if (ioend && iomap_valid) { 1262 if (ioend && imap_valid) {
1245 offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >> 1263 xfs_off_t end_index;
1246 PAGE_CACHE_SHIFT; 1264
1247 tlast = min_t(pgoff_t, offset, last_index); 1265 end_index = imap.br_startoff + imap.br_blockcount;
1248 xfs_cluster_write(inode, page->index + 1, &iomap, &ioend, 1266
1249 wbc, startio, all_bh, tlast); 1267 /* to bytes */
1268 end_index <<= inode->i_blkbits;
1269
1270 /* to pages */
1271 end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;
1272
1273 /* check against file size */
1274 if (end_index > last_index)
1275 end_index = last_index;
1276
1277 xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
1278 wbc, startio, all_bh, end_index);
1250 } 1279 }
1251 1280
1252 if (iohead) 1281 if (iohead)
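
The replacement code above converts the extent end from filesystem blocks to bytes to a page index before clamping against the file size, instead of the old iomap byte arithmetic. A minimal, self-contained sketch of the same conversion; the 4k page size and the function name are assumptions of the example:

#include <stdio.h>
#include <stdint.h>

#define PAGE_CACHE_SHIFT 12	/* assume 4k pages for the example */

/* Sketch of the block -> byte -> page clamping done above. */
static uint64_t cluster_end_page(uint64_t startoff_fsb, uint64_t blockcount_fsb,
				 unsigned blkbits, uint64_t last_index)
{
	uint64_t end = startoff_fsb + blockcount_fsb;	/* fs blocks */

	end <<= blkbits;				/* to bytes */
	end = (end - 1) >> PAGE_CACHE_SHIFT;		/* to last page index */
	if (end > last_index)				/* clamp to EOF page */
		end = last_index;
	return end;
}

int main(void)
{
	/* 8-block extent at block 100, 4k blocks, EOF on page 105 */
	printf("%llu\n",
	       (unsigned long long)cluster_end_page(100, 8, 12, 105));
	return 0;
}
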
@@ -1448,10 +1477,11 @@ __xfs_get_blocks(
1448 int direct, 1477 int direct,
1449 bmapi_flags_t flags) 1478 bmapi_flags_t flags)
1450{ 1479{
1451 xfs_iomap_t iomap; 1480 struct xfs_bmbt_irec imap;
1452 xfs_off_t offset; 1481 xfs_off_t offset;
1453 ssize_t size; 1482 ssize_t size;
1454 int niomap = 1; 1483 int nimap = 1;
1484 int new = 0;
1455 int error; 1485 int error;
1456 1486
1457 offset = (xfs_off_t)iblock << inode->i_blkbits; 1487 offset = (xfs_off_t)iblock << inode->i_blkbits;
@@ -1462,22 +1492,21 @@ __xfs_get_blocks(
1462 return 0; 1492 return 0;
1463 1493
1464 error = xfs_iomap(XFS_I(inode), offset, size, 1494 error = xfs_iomap(XFS_I(inode), offset, size,
1465 create ? flags : BMAPI_READ, &iomap, &niomap); 1495 create ? flags : BMAPI_READ, &imap, &nimap, &new);
1466 if (error) 1496 if (error)
1467 return -error; 1497 return -error;
1468 if (niomap == 0) 1498 if (nimap == 0)
1469 return 0; 1499 return 0;
1470 1500
1471 if (iomap.iomap_bn != IOMAP_DADDR_NULL) { 1501 if (imap.br_startblock != HOLESTARTBLOCK &&
1502 imap.br_startblock != DELAYSTARTBLOCK) {
1472 /* 1503 /*
1473 * For unwritten extents do not report a disk address on 1504 * For unwritten extents do not report a disk address on
1474 * the read case (treat as if we're reading into a hole). 1505 * the read case (treat as if we're reading into a hole).
1475 */ 1506 */
1476 if (create || !(iomap.iomap_flags & IOMAP_UNWRITTEN)) { 1507 if (create || !ISUNWRITTEN(&imap))
1477 xfs_map_buffer(bh_result, &iomap, offset, 1508 xfs_map_buffer(inode, bh_result, &imap, offset);
1478 inode->i_blkbits); 1509 if (create && ISUNWRITTEN(&imap)) {
1479 }
1480 if (create && (iomap.iomap_flags & IOMAP_UNWRITTEN)) {
1481 if (direct) 1510 if (direct)
1482 bh_result->b_private = inode; 1511 bh_result->b_private = inode;
1483 set_buffer_unwritten(bh_result); 1512 set_buffer_unwritten(bh_result);
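
With xfs_bmbt_irec replacing xfs_iomap_t, holes and delayed allocations are no longer flagged in a separate field; they are encoded as sentinel values in br_startblock itself, which is exactly what the two-comparison test above checks. A hedged sketch of the idiom; the sentinel values and struct here are illustrative, the real ones come from the XFS headers:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative sentinels: encode "hole" and "delayed allocation"
 * directly in the start-block field rather than in separate flags. */
#define HOLESTARTBLOCK	((int64_t)-2)
#define DELAYSTARTBLOCK	((int64_t)-1)

struct irec {			/* stand-in for struct xfs_bmbt_irec */
	int64_t br_startblock;
};

static bool has_real_blocks(const struct irec *imap)
{
	return imap->br_startblock != HOLESTARTBLOCK &&
	       imap->br_startblock != DELAYSTARTBLOCK;
}

int main(void)
{
	struct irec hole = { HOLESTARTBLOCK }, real = { 12345 };

	printf("hole: %d, real: %d\n",
	       has_real_blocks(&hole), has_real_blocks(&real));
	return 0;
}
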
@@ -1488,7 +1517,7 @@ __xfs_get_blocks(
 1488 * If this is a realtime file, data may be on a different device 1517 * If this is a realtime file, data may be on a different device
1489 * to that pointed to from the buffer_head b_bdev currently. 1518 * to that pointed to from the buffer_head b_bdev currently.
1490 */ 1519 */
1491 bh_result->b_bdev = iomap.iomap_target->bt_bdev; 1520 bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
1492 1521
1493 /* 1522 /*
1494 * If we previously allocated a block out beyond eof and we are now 1523 * If we previously allocated a block out beyond eof and we are now
@@ -1502,10 +1531,10 @@ __xfs_get_blocks(
1502 if (create && 1531 if (create &&
1503 ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || 1532 ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
1504 (offset >= i_size_read(inode)) || 1533 (offset >= i_size_read(inode)) ||
1505 (iomap.iomap_flags & (IOMAP_NEW|IOMAP_UNWRITTEN)))) 1534 (new || ISUNWRITTEN(&imap))))
1506 set_buffer_new(bh_result); 1535 set_buffer_new(bh_result);
1507 1536
1508 if (iomap.iomap_flags & IOMAP_DELAY) { 1537 if (imap.br_startblock == DELAYSTARTBLOCK) {
1509 BUG_ON(direct); 1538 BUG_ON(direct);
1510 if (create) { 1539 if (create) {
1511 set_buffer_uptodate(bh_result); 1540 set_buffer_uptodate(bh_result);
@@ -1514,11 +1543,23 @@ __xfs_get_blocks(
1514 } 1543 }
1515 } 1544 }
1516 1545
1546 /*
 1546 * If this is O_DIRECT or the mpage code calling, tell them how large
1548 * the mapping is, so that we can avoid repeated get_blocks calls.
1549 */
1517 if (direct || size > (1 << inode->i_blkbits)) { 1550 if (direct || size > (1 << inode->i_blkbits)) {
1518 ASSERT(iomap.iomap_bsize - iomap.iomap_delta > 0); 1551 xfs_off_t mapping_size;
1519 offset = min_t(xfs_off_t, 1552
1520 iomap.iomap_bsize - iomap.iomap_delta, size); 1553 mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
1521 bh_result->b_size = (ssize_t)min_t(xfs_off_t, LONG_MAX, offset); 1554 mapping_size <<= inode->i_blkbits;
1555
1556 ASSERT(mapping_size > 0);
1557 if (mapping_size > size)
1558 mapping_size = size;
1559 if (mapping_size > LONG_MAX)
1560 mapping_size = LONG_MAX;
1561
1562 bh_result->b_size = mapping_size;
1522 } 1563 }
1523 1564
1524 return 0; 1565 return 0;
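
The new b_size computation above tells O_DIRECT and the mpage code how far the returned mapping extends beyond the queried block, clamped to the request size and to LONG_MAX, since b_size is a long-sized field. A standalone sketch of the arithmetic; the names and the 4k block size are the example's assumptions:

#include <stdio.h>
#include <stdint.h>
#include <limits.h>

/* Report the mapping length past the queried block, in bytes,
 * bounded by the caller's request and by LONG_MAX. Assumes the
 * mapping actually covers iblock, as the ASSERT above does. */
static long buffer_mapping_size(uint64_t startoff, uint64_t blockcount,
				uint64_t iblock, unsigned blkbits,
				int64_t size)
{
	int64_t mapping_size = (int64_t)(startoff + blockcount - iblock)
				<< blkbits;

	if (mapping_size > size)
		mapping_size = size;
	if (mapping_size > LONG_MAX)
		mapping_size = LONG_MAX;
	return (long)mapping_size;
}

int main(void)
{
	/* 16-block extent at block 0, asked about block 4, 64k requested */
	printf("%ld\n", buffer_mapping_size(0, 16, 4, 12, 65536));
	return 0;
}
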
@@ -1576,7 +1617,7 @@ xfs_end_io_direct(
1576 */ 1617 */
1577 ioend->io_offset = offset; 1618 ioend->io_offset = offset;
1578 ioend->io_size = size; 1619 ioend->io_size = size;
1579 if (ioend->io_type == IOMAP_READ) { 1620 if (ioend->io_type == IO_READ) {
1580 xfs_finish_ioend(ioend, 0); 1621 xfs_finish_ioend(ioend, 0);
1581 } else if (private && size > 0) { 1622 } else if (private && size > 0) {
1582 xfs_finish_ioend(ioend, is_sync_kiocb(iocb)); 1623 xfs_finish_ioend(ioend, is_sync_kiocb(iocb));
@@ -1587,7 +1628,7 @@ xfs_end_io_direct(
 1587 * didn't map an unwritten extent so switch its completion 1628 * didn't map an unwritten extent so switch its completion
1588 * handler. 1629 * handler.
1589 */ 1630 */
1590 ioend->io_type = IOMAP_NEW; 1631 ioend->io_type = IO_NEW;
1591 xfs_finish_ioend(ioend, 0); 1632 xfs_finish_ioend(ioend, 0);
1592 } 1633 }
1593 1634
@@ -1612,10 +1653,10 @@ xfs_vm_direct_IO(
1612 struct block_device *bdev; 1653 struct block_device *bdev;
1613 ssize_t ret; 1654 ssize_t ret;
1614 1655
1615 bdev = xfs_find_bdev_for_inode(XFS_I(inode)); 1656 bdev = xfs_find_bdev_for_inode(inode);
1616 1657
1617 iocb->private = xfs_alloc_ioend(inode, rw == WRITE ? 1658 iocb->private = xfs_alloc_ioend(inode, rw == WRITE ?
1618 IOMAP_UNWRITTEN : IOMAP_READ); 1659 IO_UNWRITTEN : IO_READ);
1619 1660
1620 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov, 1661 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
1621 offset, nr_segs, 1662 offset, nr_segs,
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 44c2b0ef9a41..649ade8ef598 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -37,6 +37,7 @@
37 37
38#include "xfs_sb.h" 38#include "xfs_sb.h"
39#include "xfs_inum.h" 39#include "xfs_inum.h"
40#include "xfs_log.h"
40#include "xfs_ag.h" 41#include "xfs_ag.h"
41#include "xfs_dmapi.h" 42#include "xfs_dmapi.h"
42#include "xfs_mount.h" 43#include "xfs_mount.h"
@@ -850,6 +851,12 @@ xfs_buf_lock_value(
850 * Note that this in no way locks the underlying pages, so it is only 851 * Note that this in no way locks the underlying pages, so it is only
851 * useful for synchronizing concurrent use of buffer objects, not for 852 * useful for synchronizing concurrent use of buffer objects, not for
852 * synchronizing independent access to the underlying pages. 853 * synchronizing independent access to the underlying pages.
854 *
855 * If we come across a stale, pinned, locked buffer, we know that we
856 * are being asked to lock a buffer that has been reallocated. Because
857 * it is pinned, we know that the log has not been pushed to disk and
858 * hence it will still be locked. Rather than sleeping until someone
859 * else pushes the log, push it ourselves before trying to get the lock.
853 */ 860 */
854void 861void
855xfs_buf_lock( 862xfs_buf_lock(
@@ -857,6 +864,8 @@ xfs_buf_lock(
857{ 864{
858 trace_xfs_buf_lock(bp, _RET_IP_); 865 trace_xfs_buf_lock(bp, _RET_IP_);
859 866
867 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
868 xfs_log_force(bp->b_mount, 0);
860 if (atomic_read(&bp->b_io_remaining)) 869 if (atomic_read(&bp->b_io_remaining))
861 blk_run_address_space(bp->b_target->bt_mapping); 870 blk_run_address_space(bp->b_target->bt_mapping);
862 down(&bp->b_sema); 871 down(&bp->b_sema);
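
The guard added to xfs_buf_lock() avoids a stall: a stale, pinned buffer stays locked until the log is pushed, so the would-be locker pushes the log itself rather than sleeping behind it. A userspace analogue of the pattern, under the assumption that "pinned" maps to deferred work that only some other agent would otherwise run; all names are illustrative:

#include <pthread.h>
#include <stdatomic.h>

struct buf {
	pthread_mutex_t lock;
	atomic_int	pin_count;	/* analogue of b_pin_count */
	int		stale;		/* analogue of XBF_STALE */
};

static void flush_log(void)
{
	/* stand-in for xfs_log_force(): run the deferred work that
	 * lets the current lock holder make progress and unlock */
}

static void buf_lock(struct buf *bp)
{
	/* kick the deferred work first instead of sleeping behind it */
	if (atomic_load(&bp->pin_count) && bp->stale)
		flush_log();
	pthread_mutex_lock(&bp->lock);
}

int main(void)
{
	struct buf b = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };

	buf_lock(&b);
	pthread_mutex_unlock(&b.lock);
	return 0;
}
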
@@ -1007,25 +1016,20 @@ xfs_bwrite(
1007 struct xfs_mount *mp, 1016 struct xfs_mount *mp,
1008 struct xfs_buf *bp) 1017 struct xfs_buf *bp)
1009{ 1018{
1010 int iowait = (bp->b_flags & XBF_ASYNC) == 0; 1019 int error;
1011 int error = 0;
1012 1020
1013 bp->b_strat = xfs_bdstrat_cb; 1021 bp->b_strat = xfs_bdstrat_cb;
1014 bp->b_mount = mp; 1022 bp->b_mount = mp;
1015 bp->b_flags |= XBF_WRITE; 1023 bp->b_flags |= XBF_WRITE;
1016 if (!iowait) 1024 bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
1017 bp->b_flags |= _XBF_RUN_QUEUES;
1018 1025
1019 xfs_buf_delwri_dequeue(bp); 1026 xfs_buf_delwri_dequeue(bp);
1020 xfs_buf_iostrategy(bp); 1027 xfs_buf_iostrategy(bp);
1021 1028
1022 if (iowait) { 1029 error = xfs_buf_iowait(bp);
1023 error = xfs_buf_iowait(bp); 1030 if (error)
1024 if (error) 1031 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1025 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1032 xfs_buf_relse(bp);
1026 xfs_buf_relse(bp);
1027 }
1028
1029 return error; 1033 return error;
1030} 1034}
1031 1035
@@ -1614,7 +1618,8 @@ xfs_mapping_buftarg(
1614 1618
1615STATIC int 1619STATIC int
1616xfs_alloc_delwrite_queue( 1620xfs_alloc_delwrite_queue(
1617 xfs_buftarg_t *btp) 1621 xfs_buftarg_t *btp,
1622 const char *fsname)
1618{ 1623{
1619 int error = 0; 1624 int error = 0;
1620 1625
@@ -1622,7 +1627,7 @@ xfs_alloc_delwrite_queue(
1622 INIT_LIST_HEAD(&btp->bt_delwrite_queue); 1627 INIT_LIST_HEAD(&btp->bt_delwrite_queue);
1623 spin_lock_init(&btp->bt_delwrite_lock); 1628 spin_lock_init(&btp->bt_delwrite_lock);
1624 btp->bt_flags = 0; 1629 btp->bt_flags = 0;
1625 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd"); 1630 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
1626 if (IS_ERR(btp->bt_task)) { 1631 if (IS_ERR(btp->bt_task)) {
1627 error = PTR_ERR(btp->bt_task); 1632 error = PTR_ERR(btp->bt_task);
1628 goto out_error; 1633 goto out_error;
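
All of the per-mount daemons touched by this patch (xfsbufd here, xfsaild and xfssyncd further down) gain the filesystem name in their thread name, since kthread_run() accepts a printf-style name format. A kernel-style sketch of the idiom; the worker function and calling convention are invented for illustration, only the naming trick comes from the patch:

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>

/* placeholder worker; the naming idiom is the point */
static int my_worker(void *data)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}

static int start_per_fs_thread(struct task_struct **task, void *ctx,
			       const char *fsname)
{
	/* the thread shows up as "myfsd/<fsname>" in ps(1) */
	*task = kthread_run(my_worker, ctx, "myfsd/%s", fsname);
	if (IS_ERR(*task))
		return PTR_ERR(*task);
	return 0;
}
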
@@ -1635,7 +1640,8 @@ out_error:
1635xfs_buftarg_t * 1640xfs_buftarg_t *
1636xfs_alloc_buftarg( 1641xfs_alloc_buftarg(
1637 struct block_device *bdev, 1642 struct block_device *bdev,
1638 int external) 1643 int external,
1644 const char *fsname)
1639{ 1645{
1640 xfs_buftarg_t *btp; 1646 xfs_buftarg_t *btp;
1641 1647
@@ -1647,7 +1653,7 @@ xfs_alloc_buftarg(
1647 goto error; 1653 goto error;
1648 if (xfs_mapping_buftarg(btp, bdev)) 1654 if (xfs_mapping_buftarg(btp, bdev))
1649 goto error; 1655 goto error;
1650 if (xfs_alloc_delwrite_queue(btp)) 1656 if (xfs_alloc_delwrite_queue(btp, fsname))
1651 goto error; 1657 goto error;
1652 xfs_alloc_bufhash(btp, external); 1658 xfs_alloc_bufhash(btp, external);
1653 return btp; 1659 return btp;
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 386e7361e50e..5fbecefa5dfd 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -390,7 +390,7 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
390/* 390/*
391 * Handling of buftargs. 391 * Handling of buftargs.
392 */ 392 */
393extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int); 393extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int, const char *);
394extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); 394extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
395extern void xfs_wait_buftarg(xfs_buftarg_t *); 395extern void xfs_wait_buftarg(xfs_buftarg_t *);
396extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); 396extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 42dd3bcfba6b..d8fb1b5d6cb5 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -115,6 +115,8 @@ xfs_file_fsync(
115 115
116 xfs_iflags_clear(ip, XFS_ITRUNCATED); 116 xfs_iflags_clear(ip, XFS_ITRUNCATED);
117 117
118 xfs_ioend_wait(ip);
119
118 /* 120 /*
119 * We always need to make sure that the required inode state is safe on 121 * We always need to make sure that the required inode state is safe on
120 * disk. The inode might be clean but we still might need to force the 122 * disk. The inode might be clean but we still might need to force the
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 7b26cc2fd284..699b60cbab9c 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -527,6 +527,10 @@ xfs_attrmulti_by_handle(
527 if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) 527 if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
528 return -XFS_ERROR(EFAULT); 528 return -XFS_ERROR(EFAULT);
529 529
530 /* overflow check */
531 if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t))
532 return -E2BIG;
533
530 dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq); 534 dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq);
531 if (IS_ERR(dentry)) 535 if (IS_ERR(dentry))
532 return PTR_ERR(dentry); 536 return PTR_ERR(dentry);
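
The added opcount check is the standard integer-overflow guard: validate a user-controlled count against INT_MAX / sizeof(element) before it ever participates in a size calculation, so count * sizeof(elem) cannot wrap. A self-contained sketch; the element type and the NULL-for-E2BIG convention are the example's choices:

#include <stdio.h>
#include <stdlib.h>
#include <limits.h>

struct op { char name[32]; void *buf; };	/* stand-in element type */

/* Reject the count before it reaches the multiplication. */
static void *alloc_ops(size_t opcount)
{
	if (opcount >= INT_MAX / sizeof(struct op))
		return NULL;			/* would overflow: -E2BIG */
	return malloc(opcount * sizeof(struct op));
}

int main(void)
{
	void *p = alloc_ops(16);

	printf("small: %s\n", p ? "ok" : "rejected");
	printf("huge:  %s\n",
	       alloc_ops((size_t)INT_MAX) ? "ok" : "rejected");
	free(p);
	return 0;
}
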
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 593c05b4df8d..9287135e9bfc 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -420,6 +420,10 @@ xfs_compat_attrmulti_by_handle(
420 sizeof(compat_xfs_fsop_attrmulti_handlereq_t))) 420 sizeof(compat_xfs_fsop_attrmulti_handlereq_t)))
421 return -XFS_ERROR(EFAULT); 421 return -XFS_ERROR(EFAULT);
422 422
423 /* overflow check */
424 if (am_hreq.opcount >= INT_MAX / sizeof(compat_xfs_attr_multiop_t))
425 return -E2BIG;
426
423 dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq); 427 dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq);
424 if (IS_ERR(dentry)) 428 if (IS_ERR(dentry))
425 return PTR_ERR(dentry); 429 return PTR_ERR(dentry);
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index e65a7937f3a4..9c8019c78c92 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -673,7 +673,10 @@ xfs_vn_fiemap(
673 bm.bmv_length = BTOBB(length); 673 bm.bmv_length = BTOBB(length);
674 674
675 /* We add one because in getbmap world count includes the header */ 675 /* We add one because in getbmap world count includes the header */
676 bm.bmv_count = fieinfo->fi_extents_max + 1; 676 bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM :
677 fieinfo->fi_extents_max + 1;
678 bm.bmv_count = min_t(__s32, bm.bmv_count,
679 (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
677 bm.bmv_iflags = BMV_IF_PREALLOC; 680 bm.bmv_iflags = BMV_IF_PREALLOC;
678 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) 681 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
679 bm.bmv_iflags |= BMV_IF_ATTRFORK; 682 bm.bmv_iflags |= BMV_IF_ATTRFORK;
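
The fiemap fix above handles two cases: fi_extents_max == 0 (the caller only wants the extent count) now maps to a large ceiling rather than a bmv_count of 1, and the result is bounded so the getbmapx working buffer stays within a few pages. A sketch of the clamping; the structure size and the MAXEXTNUM value are illustrative:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE	4096
#define MAXEXTNUM	(1 << 21)	/* illustrative "no limit" ceiling */

struct getbmapx_like { uint64_t dummy[6]; };	/* ~48-byte stand-in */

static int32_t clamp_bmv_count(uint32_t fi_extents_max)
{
	/* 0 means the caller only wants the number of extents */
	int32_t count = !fi_extents_max ? MAXEXTNUM
					: (int32_t)fi_extents_max + 1;
	int32_t limit = PAGE_SIZE * 16 / sizeof(struct getbmapx_like);

	return count < limit ? count : limit;
}

int main(void)
{
	printf("unbounded -> %d, small -> %d\n",
	       clamp_bmv_count(0), clamp_bmv_count(8));
	return 0;
}
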
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 1947514ce1ad..9ac8aea91529 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -19,6 +19,7 @@
19#include "xfs_dmapi.h" 19#include "xfs_dmapi.h"
20#include "xfs_sb.h" 20#include "xfs_sb.h"
21#include "xfs_inum.h" 21#include "xfs_inum.h"
22#include "xfs_log.h"
22#include "xfs_ag.h" 23#include "xfs_ag.h"
23#include "xfs_mount.h" 24#include "xfs_mount.h"
24#include "xfs_quota.h" 25#include "xfs_quota.h"
@@ -97,7 +98,7 @@ xfs_fs_set_xstate(
97} 98}
98 99
99STATIC int 100STATIC int
100xfs_fs_get_xquota( 101xfs_fs_get_dqblk(
101 struct super_block *sb, 102 struct super_block *sb,
102 int type, 103 int type,
103 qid_t id, 104 qid_t id,
@@ -114,7 +115,7 @@ xfs_fs_get_xquota(
114} 115}
115 116
116STATIC int 117STATIC int
117xfs_fs_set_xquota( 118xfs_fs_set_dqblk(
118 struct super_block *sb, 119 struct super_block *sb,
119 int type, 120 int type,
120 qid_t id, 121 qid_t id,
@@ -135,6 +136,6 @@ xfs_fs_set_xquota(
135const struct quotactl_ops xfs_quotactl_operations = { 136const struct quotactl_ops xfs_quotactl_operations = {
136 .get_xstate = xfs_fs_get_xstate, 137 .get_xstate = xfs_fs_get_xstate,
137 .set_xstate = xfs_fs_set_xstate, 138 .set_xstate = xfs_fs_set_xstate,
138 .get_xquota = xfs_fs_get_xquota, 139 .get_dqblk = xfs_fs_get_dqblk,
139 .set_xquota = xfs_fs_set_xquota, 140 .set_dqblk = xfs_fs_set_dqblk,
140}; 141};
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 29f1edca76de..f2d1718c9165 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -119,6 +119,8 @@ mempool_t *xfs_ioend_pool;
119#define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */ 119#define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */
120#define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */ 120#define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */
121#define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */ 121#define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */
 122#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */
 123#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */
122 124
123/* 125/*
124 * Table driven mount option parser. 126 * Table driven mount option parser.
@@ -374,6 +376,13 @@ xfs_parseargs(
374 mp->m_flags |= XFS_MOUNT_DMAPI; 376 mp->m_flags |= XFS_MOUNT_DMAPI;
375 } else if (!strcmp(this_char, MNTOPT_DMI)) { 377 } else if (!strcmp(this_char, MNTOPT_DMI)) {
376 mp->m_flags |= XFS_MOUNT_DMAPI; 378 mp->m_flags |= XFS_MOUNT_DMAPI;
379 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
380 mp->m_flags |= XFS_MOUNT_DELAYLOG;
381 cmn_err(CE_WARN,
382 "Enabling EXPERIMENTAL delayed logging feature "
383 "- use at your own risk.\n");
384 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
385 mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
377 } else if (!strcmp(this_char, "ihashsize")) { 386 } else if (!strcmp(this_char, "ihashsize")) {
378 cmn_err(CE_WARN, 387 cmn_err(CE_WARN,
379 "XFS: ihashsize no longer used, option is deprecated."); 388 "XFS: ihashsize no longer used, option is deprecated.");
@@ -535,6 +544,7 @@ xfs_showargs(
535 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, 544 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM },
536 { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI }, 545 { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI },
537 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, 546 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
547 { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG },
538 { 0, NULL } 548 { 0, NULL }
539 }; 549 };
540 static struct proc_xfs_info xfs_info_unset[] = { 550 static struct proc_xfs_info xfs_info_unset[] = {
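
The DELAYLOG entry is added to the same flag table that xfs_showargs() walks to reconstruct the mount option string for /proc/mounts: one mount-flag bit, one ",option" suffix. A minimal sketch of the table-driven printer; the flag bits and option names are illustrative:

#include <stdio.h>

#define MNT_DMAPI	(1 << 0)
#define MNT_GRPID	(1 << 1)
#define MNT_DELAYLOG	(1 << 2)

struct flag_str { unsigned long flag; const char *str; };

/* Emit the ",option" string for every set bit, table-driven. */
static void show_opts(unsigned long flags)
{
	static const struct flag_str tab[] = {
		{ MNT_DMAPI,	",dmapi" },
		{ MNT_GRPID,	",grpid" },
		{ MNT_DELAYLOG,	",delaylog" },
		{ 0, NULL }
	};
	const struct flag_str *p;

	for (p = tab; p->flag; p++)
		if (flags & p->flag)
			fputs(p->str, stdout);
	putchar('\n');
}

int main(void)
{
	show_opts(MNT_GRPID | MNT_DELAYLOG);	/* prints ,grpid,delaylog */
	return 0;
}
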
@@ -725,7 +735,8 @@ void
725xfs_blkdev_issue_flush( 735xfs_blkdev_issue_flush(
726 xfs_buftarg_t *buftarg) 736 xfs_buftarg_t *buftarg)
727{ 737{
728 blkdev_issue_flush(buftarg->bt_bdev, NULL); 738 blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL,
739 BLKDEV_IFL_WAIT);
729} 740}
730 741
731STATIC void 742STATIC void
@@ -789,18 +800,18 @@ xfs_open_devices(
789 * Setup xfs_mount buffer target pointers 800 * Setup xfs_mount buffer target pointers
790 */ 801 */
791 error = ENOMEM; 802 error = ENOMEM;
792 mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0); 803 mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0, mp->m_fsname);
793 if (!mp->m_ddev_targp) 804 if (!mp->m_ddev_targp)
794 goto out_close_rtdev; 805 goto out_close_rtdev;
795 806
796 if (rtdev) { 807 if (rtdev) {
797 mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1); 808 mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1, mp->m_fsname);
798 if (!mp->m_rtdev_targp) 809 if (!mp->m_rtdev_targp)
799 goto out_free_ddev_targ; 810 goto out_free_ddev_targ;
800 } 811 }
801 812
802 if (logdev && logdev != ddev) { 813 if (logdev && logdev != ddev) {
803 mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1); 814 mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1, mp->m_fsname);
804 if (!mp->m_logdev_targp) 815 if (!mp->m_logdev_targp)
805 goto out_free_rtdev_targ; 816 goto out_free_rtdev_targ;
806 } else { 817 } else {
@@ -902,7 +913,8 @@ xfsaild_start(
902 struct xfs_ail *ailp) 913 struct xfs_ail *ailp)
903{ 914{
904 ailp->xa_target = 0; 915 ailp->xa_target = 0;
905 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild"); 916 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
917 ailp->xa_mount->m_fsname);
906 if (IS_ERR(ailp->xa_task)) 918 if (IS_ERR(ailp->xa_task))
907 return -PTR_ERR(ailp->xa_task); 919 return -PTR_ERR(ailp->xa_task);
908 return 0; 920 return 0;
@@ -1092,6 +1104,7 @@ xfs_fs_write_inode(
1092 * the code will only flush the inode if it isn't already 1104 * the code will only flush the inode if it isn't already
1093 * being flushed. 1105 * being flushed.
1094 */ 1106 */
1107 xfs_ioend_wait(ip);
1095 xfs_ilock(ip, XFS_ILOCK_SHARED); 1108 xfs_ilock(ip, XFS_ILOCK_SHARED);
1096 if (ip->i_update_core) { 1109 if (ip->i_update_core) {
1097 error = xfs_log_inode(ip); 1110 error = xfs_log_inode(ip);
@@ -1752,7 +1765,7 @@ xfs_init_zones(void)
1752 * but it is much faster. 1765 * but it is much faster.
1753 */ 1766 */
1754 xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) + 1767 xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) +
1755 (((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) / 1768 (((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) /
1756 NBWORD) * sizeof(int))), "xfs_buf_item"); 1769 NBWORD) * sizeof(int))), "xfs_buf_item");
1757 if (!xfs_buf_item_zone) 1770 if (!xfs_buf_item_zone)
1758 goto out_destroy_trans_zone; 1771 goto out_destroy_trans_zone;
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 233d4b9881b1..519618e9279e 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -85,7 +85,7 @@ extern __uint64_t xfs_max_file_offset(unsigned int);
85extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); 85extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
86 86
87extern const struct export_operations xfs_export_operations; 87extern const struct export_operations xfs_export_operations;
88extern struct xattr_handler *xfs_xattr_handlers[]; 88extern const struct xattr_handler *xfs_xattr_handlers[];
89extern const struct quotactl_ops xfs_quotactl_operations; 89extern const struct quotactl_ops xfs_quotactl_operations;
90 90
91#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) 91#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info))
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index a427c638d909..3884e20bc14e 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -356,68 +356,23 @@ xfs_commit_dummy_trans(
356 356
357STATIC int 357STATIC int
358xfs_sync_fsdata( 358xfs_sync_fsdata(
359 struct xfs_mount *mp, 359 struct xfs_mount *mp)
360 int flags)
361{ 360{
362 struct xfs_buf *bp; 361 struct xfs_buf *bp;
363 struct xfs_buf_log_item *bip;
364 int error = 0;
365 362
366 /* 363 /*
367 * If this is xfssyncd() then only sync the superblock if we can 364 * If the buffer is pinned then push on the log so we won't get stuck
368 * lock it without sleeping and it is not pinned. 365 * waiting in the write for someone, maybe ourselves, to flush the log.
366 *
367 * Even though we just pushed the log above, we did not have the
368 * superblock buffer locked at that point so it can become pinned in
369 * between there and here.
369 */ 370 */
370 if (flags & SYNC_TRYLOCK) { 371 bp = xfs_getsb(mp, 0);
371 ASSERT(!(flags & SYNC_WAIT)); 372 if (XFS_BUF_ISPINNED(bp))
372 373 xfs_log_force(mp, 0);
373 bp = xfs_getsb(mp, XBF_TRYLOCK);
374 if (!bp)
375 goto out;
376
377 bip = XFS_BUF_FSPRIVATE(bp, struct xfs_buf_log_item *);
378 if (!bip || !xfs_buf_item_dirty(bip) || XFS_BUF_ISPINNED(bp))
379 goto out_brelse;
380 } else {
381 bp = xfs_getsb(mp, 0);
382
383 /*
384 * If the buffer is pinned then push on the log so we won't
385 * get stuck waiting in the write for someone, maybe
386 * ourselves, to flush the log.
387 *
388 * Even though we just pushed the log above, we did not have
389 * the superblock buffer locked at that point so it can
390 * become pinned in between there and here.
391 */
392 if (XFS_BUF_ISPINNED(bp))
393 xfs_log_force(mp, 0);
394 }
395
396
397 if (flags & SYNC_WAIT)
398 XFS_BUF_UNASYNC(bp);
399 else
400 XFS_BUF_ASYNC(bp);
401
402 error = xfs_bwrite(mp, bp);
403 if (error)
404 return error;
405
406 /*
407 * If this is a data integrity sync make sure all pending buffers
408 * are flushed out for the log coverage check below.
409 */
410 if (flags & SYNC_WAIT)
411 xfs_flush_buftarg(mp->m_ddev_targp, 1);
412
413 if (xfs_log_need_covered(mp))
414 error = xfs_commit_dummy_trans(mp, flags);
415 return error;
416 374
417 out_brelse: 375 return xfs_bwrite(mp, bp);
418 xfs_buf_relse(bp);
419 out:
420 return error;
421} 376}
422 377
423/* 378/*
@@ -441,7 +396,7 @@ int
441xfs_quiesce_data( 396xfs_quiesce_data(
442 struct xfs_mount *mp) 397 struct xfs_mount *mp)
443{ 398{
444 int error; 399 int error, error2 = 0;
445 400
446 /* push non-blocking */ 401 /* push non-blocking */
447 xfs_sync_data(mp, 0); 402 xfs_sync_data(mp, 0);
@@ -452,13 +407,20 @@ xfs_quiesce_data(
452 xfs_qm_sync(mp, SYNC_WAIT); 407 xfs_qm_sync(mp, SYNC_WAIT);
453 408
454 /* write superblock and hoover up shutdown errors */ 409 /* write superblock and hoover up shutdown errors */
455 error = xfs_sync_fsdata(mp, SYNC_WAIT); 410 error = xfs_sync_fsdata(mp);
411
412 /* make sure all delwri buffers are written out */
413 xfs_flush_buftarg(mp->m_ddev_targp, 1);
414
415 /* mark the log as covered if needed */
416 if (xfs_log_need_covered(mp))
417 error2 = xfs_commit_dummy_trans(mp, SYNC_WAIT);
456 418
457 /* flush data-only devices */ 419 /* flush data-only devices */
458 if (mp->m_rtdev_targp) 420 if (mp->m_rtdev_targp)
459 XFS_bflush(mp->m_rtdev_targp); 421 XFS_bflush(mp->m_rtdev_targp);
460 422
461 return error; 423 return error ? error : error2;
462} 424}
463 425
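
Note the error/error2 shape introduced in xfs_quiesce_data() above: the superblock write no longer short-circuits the remaining steps; every stage runs, and the first failure wins when the results are combined at the end. A trivial sketch of the pattern, with invented step functions:

#include <stdio.h>

static int step_a(void) { return 0; }	/* e.g. write superblock */
static int step_b(void) { return -5; }	/* e.g. cover log; pretend it fails */

/* Run every step, remember the first failure, report it last. */
static int quiesce(void)
{
	int error, error2 = 0;

	error = step_a();
	/* ... unconditional flushing work still happens here ... */
	error2 = step_b();

	return error ? error : error2;
}

int main(void)
{
	printf("quiesce() = %d\n", quiesce());
	return 0;
}
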
464STATIC void 426STATIC void
@@ -581,9 +543,9 @@ xfs_flush_inodes(
581} 543}
582 544
583/* 545/*
584 * Every sync period we need to unpin all items, reclaim inodes, sync 546 * Every sync period we need to unpin all items, reclaim inodes and sync
585 * quota and write out the superblock. We might need to cover the log 547 * disk quotas. We might need to cover the log to indicate that the
586 * to indicate it is idle. 548 * filesystem is idle.
587 */ 549 */
588STATIC void 550STATIC void
589xfs_sync_worker( 551xfs_sync_worker(
@@ -597,7 +559,8 @@ xfs_sync_worker(
597 xfs_reclaim_inodes(mp, 0); 559 xfs_reclaim_inodes(mp, 0);
598 /* dgc: errors ignored here */ 560 /* dgc: errors ignored here */
599 error = xfs_qm_sync(mp, SYNC_TRYLOCK); 561 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
600 error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); 562 if (xfs_log_need_covered(mp))
563 error = xfs_commit_dummy_trans(mp, 0);
601 } 564 }
602 mp->m_sync_seq++; 565 mp->m_sync_seq++;
603 wake_up(&mp->m_wait_single_sync_task); 566 wake_up(&mp->m_wait_single_sync_task);
@@ -660,7 +623,7 @@ xfs_syncd_init(
660 mp->m_sync_work.w_syncer = xfs_sync_worker; 623 mp->m_sync_work.w_syncer = xfs_sync_worker;
661 mp->m_sync_work.w_mount = mp; 624 mp->m_sync_work.w_mount = mp;
662 mp->m_sync_work.w_completion = NULL; 625 mp->m_sync_work.w_completion = NULL;
663 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd"); 626 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname);
664 if (IS_ERR(mp->m_sync_task)) 627 if (IS_ERR(mp->m_sync_task))
665 return -PTR_ERR(mp->m_sync_task); 628 return -PTR_ERR(mp->m_sync_task);
666 return 0; 629 return 0;
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c
index 5a107601e969..207fa77f63ae 100644
--- a/fs/xfs/linux-2.6/xfs_trace.c
+++ b/fs/xfs/linux-2.6/xfs_trace.c
@@ -41,7 +41,6 @@
41#include "xfs_alloc.h" 41#include "xfs_alloc.h"
42#include "xfs_bmap.h" 42#include "xfs_bmap.h"
43#include "xfs_attr.h" 43#include "xfs_attr.h"
44#include "xfs_attr_sf.h"
45#include "xfs_attr_leaf.h" 44#include "xfs_attr_leaf.h"
46#include "xfs_log_priv.h" 45#include "xfs_log_priv.h"
47#include "xfs_buf_item.h" 46#include "xfs_buf_item.h"
@@ -50,6 +49,9 @@
50#include "xfs_aops.h" 49#include "xfs_aops.h"
51#include "quota/xfs_dquot_item.h" 50#include "quota/xfs_dquot_item.h"
52#include "quota/xfs_dquot.h" 51#include "quota/xfs_dquot.h"
52#include "xfs_log_recover.h"
53#include "xfs_buf_item.h"
54#include "xfs_inode_item.h"
53 55
54/* 56/*
55 * We include this last to have the helpers above available for the trace 57 * We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index fcaa62f0799e..ff6bc797baf2 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -32,6 +32,10 @@ struct xfs_da_node_entry;
32struct xfs_dquot; 32struct xfs_dquot;
33struct xlog_ticket; 33struct xlog_ticket;
34struct log; 34struct log;
35struct xlog_recover;
36struct xlog_recover_item;
37struct xfs_buf_log_format;
38struct xfs_inode_log_format;
35 39
36DECLARE_EVENT_CLASS(xfs_attr_list_class, 40DECLARE_EVENT_CLASS(xfs_attr_list_class,
37 TP_PROTO(struct xfs_attr_list_context *ctx), 41 TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -562,18 +566,21 @@ DECLARE_EVENT_CLASS(xfs_inode_class,
562 __field(dev_t, dev) 566 __field(dev_t, dev)
563 __field(xfs_ino_t, ino) 567 __field(xfs_ino_t, ino)
564 __field(int, count) 568 __field(int, count)
569 __field(int, pincount)
565 __field(unsigned long, caller_ip) 570 __field(unsigned long, caller_ip)
566 ), 571 ),
567 TP_fast_assign( 572 TP_fast_assign(
568 __entry->dev = VFS_I(ip)->i_sb->s_dev; 573 __entry->dev = VFS_I(ip)->i_sb->s_dev;
569 __entry->ino = ip->i_ino; 574 __entry->ino = ip->i_ino;
570 __entry->count = atomic_read(&VFS_I(ip)->i_count); 575 __entry->count = atomic_read(&VFS_I(ip)->i_count);
576 __entry->pincount = atomic_read(&ip->i_pincount);
571 __entry->caller_ip = caller_ip; 577 __entry->caller_ip = caller_ip;
572 ), 578 ),
573 TP_printk("dev %d:%d ino 0x%llx count %d caller %pf", 579 TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pf",
574 MAJOR(__entry->dev), MINOR(__entry->dev), 580 MAJOR(__entry->dev), MINOR(__entry->dev),
575 __entry->ino, 581 __entry->ino,
576 __entry->count, 582 __entry->count,
583 __entry->pincount,
577 (char *)__entry->caller_ip) 584 (char *)__entry->caller_ip)
578) 585)
579 586
@@ -583,6 +590,10 @@ DEFINE_EVENT(xfs_inode_class, name, \
583 TP_ARGS(ip, caller_ip)) 590 TP_ARGS(ip, caller_ip))
584DEFINE_INODE_EVENT(xfs_ihold); 591DEFINE_INODE_EVENT(xfs_ihold);
585DEFINE_INODE_EVENT(xfs_irele); 592DEFINE_INODE_EVENT(xfs_irele);
593DEFINE_INODE_EVENT(xfs_inode_pin);
594DEFINE_INODE_EVENT(xfs_inode_unpin);
595DEFINE_INODE_EVENT(xfs_inode_unpin_nowait);
596
586/* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */ 597/* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */
587DEFINE_INODE_EVENT(xfs_inode); 598DEFINE_INODE_EVENT(xfs_inode);
588#define xfs_itrace_entry(ip) \ 599#define xfs_itrace_entry(ip) \
@@ -642,8 +653,6 @@ DEFINE_EVENT(xfs_dquot_class, name, \
642 TP_PROTO(struct xfs_dquot *dqp), \ 653 TP_PROTO(struct xfs_dquot *dqp), \
643 TP_ARGS(dqp)) 654 TP_ARGS(dqp))
644DEFINE_DQUOT_EVENT(xfs_dqadjust); 655DEFINE_DQUOT_EVENT(xfs_dqadjust);
645DEFINE_DQUOT_EVENT(xfs_dqshake_dirty);
646DEFINE_DQUOT_EVENT(xfs_dqshake_unlink);
647DEFINE_DQUOT_EVENT(xfs_dqreclaim_want); 656DEFINE_DQUOT_EVENT(xfs_dqreclaim_want);
648DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty); 657DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty);
649DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink); 658DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink);
@@ -658,7 +667,6 @@ DEFINE_DQUOT_EVENT(xfs_dqread_fail);
658DEFINE_DQUOT_EVENT(xfs_dqlookup_found); 667DEFINE_DQUOT_EVENT(xfs_dqlookup_found);
659DEFINE_DQUOT_EVENT(xfs_dqlookup_want); 668DEFINE_DQUOT_EVENT(xfs_dqlookup_want);
660DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist); 669DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist);
661DEFINE_DQUOT_EVENT(xfs_dqlookup_move);
662DEFINE_DQUOT_EVENT(xfs_dqlookup_done); 670DEFINE_DQUOT_EVENT(xfs_dqlookup_done);
663DEFINE_DQUOT_EVENT(xfs_dqget_hit); 671DEFINE_DQUOT_EVENT(xfs_dqget_hit);
664DEFINE_DQUOT_EVENT(xfs_dqget_miss); 672DEFINE_DQUOT_EVENT(xfs_dqget_miss);
@@ -1051,83 +1059,112 @@ TRACE_EVENT(xfs_bunmap,
1051 1059
1052); 1060);
1053 1061
1062#define XFS_BUSY_SYNC \
1063 { 0, "async" }, \
1064 { 1, "sync" }
1065
1054TRACE_EVENT(xfs_alloc_busy, 1066TRACE_EVENT(xfs_alloc_busy,
1055 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, 1067 TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno,
1056 xfs_extlen_t len, int slot), 1068 xfs_agblock_t agbno, xfs_extlen_t len, int sync),
1057 TP_ARGS(mp, agno, agbno, len, slot), 1069 TP_ARGS(trans, agno, agbno, len, sync),
1058 TP_STRUCT__entry( 1070 TP_STRUCT__entry(
1059 __field(dev_t, dev) 1071 __field(dev_t, dev)
1072 __field(struct xfs_trans *, tp)
1073 __field(int, tid)
1060 __field(xfs_agnumber_t, agno) 1074 __field(xfs_agnumber_t, agno)
1061 __field(xfs_agblock_t, agbno) 1075 __field(xfs_agblock_t, agbno)
1062 __field(xfs_extlen_t, len) 1076 __field(xfs_extlen_t, len)
1063 __field(int, slot) 1077 __field(int, sync)
1064 ), 1078 ),
1065 TP_fast_assign( 1079 TP_fast_assign(
1066 __entry->dev = mp->m_super->s_dev; 1080 __entry->dev = trans->t_mountp->m_super->s_dev;
1081 __entry->tp = trans;
1082 __entry->tid = trans->t_ticket->t_tid;
1067 __entry->agno = agno; 1083 __entry->agno = agno;
1068 __entry->agbno = agbno; 1084 __entry->agbno = agbno;
1069 __entry->len = len; 1085 __entry->len = len;
1070 __entry->slot = slot; 1086 __entry->sync = sync;
1071 ), 1087 ),
1072 TP_printk("dev %d:%d agno %u agbno %u len %u slot %d", 1088 TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s",
1073 MAJOR(__entry->dev), MINOR(__entry->dev), 1089 MAJOR(__entry->dev), MINOR(__entry->dev),
1090 __entry->tp,
1091 __entry->tid,
1074 __entry->agno, 1092 __entry->agno,
1075 __entry->agbno, 1093 __entry->agbno,
1076 __entry->len, 1094 __entry->len,
1077 __entry->slot) 1095 __print_symbolic(__entry->sync, XFS_BUSY_SYNC))
1078 1096
1079); 1097);
1080 1098
1081#define XFS_BUSY_STATES \
1082 { 0, "found" }, \
1083 { 1, "missing" }
1084
1085TRACE_EVENT(xfs_alloc_unbusy, 1099TRACE_EVENT(xfs_alloc_unbusy,
1086 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, 1100 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1087 int slot, int found), 1101 xfs_agblock_t agbno, xfs_extlen_t len),
1088 TP_ARGS(mp, agno, slot, found), 1102 TP_ARGS(mp, agno, agbno, len),
1089 TP_STRUCT__entry( 1103 TP_STRUCT__entry(
1090 __field(dev_t, dev) 1104 __field(dev_t, dev)
1091 __field(xfs_agnumber_t, agno) 1105 __field(xfs_agnumber_t, agno)
1092 __field(int, slot) 1106 __field(xfs_agblock_t, agbno)
1093 __field(int, found) 1107 __field(xfs_extlen_t, len)
1094 ), 1108 ),
1095 TP_fast_assign( 1109 TP_fast_assign(
1096 __entry->dev = mp->m_super->s_dev; 1110 __entry->dev = mp->m_super->s_dev;
1097 __entry->agno = agno; 1111 __entry->agno = agno;
1098 __entry->slot = slot; 1112 __entry->agbno = agbno;
1099 __entry->found = found; 1113 __entry->len = len;
1100 ), 1114 ),
1101 TP_printk("dev %d:%d agno %u slot %d %s", 1115 TP_printk("dev %d:%d agno %u agbno %u len %u",
1102 MAJOR(__entry->dev), MINOR(__entry->dev), 1116 MAJOR(__entry->dev), MINOR(__entry->dev),
1103 __entry->agno, 1117 __entry->agno,
1104 __entry->slot, 1118 __entry->agbno,
1105 __print_symbolic(__entry->found, XFS_BUSY_STATES)) 1119 __entry->len)
1106); 1120);
1107 1121
1122#define XFS_BUSY_STATES \
1123 { 0, "missing" }, \
1124 { 1, "found" }
1125
1108TRACE_EVENT(xfs_alloc_busysearch, 1126TRACE_EVENT(xfs_alloc_busysearch,
1109 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, 1127 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1110 xfs_extlen_t len, xfs_lsn_t lsn), 1128 xfs_agblock_t agbno, xfs_extlen_t len, int found),
1111 TP_ARGS(mp, agno, agbno, len, lsn), 1129 TP_ARGS(mp, agno, agbno, len, found),
1112 TP_STRUCT__entry( 1130 TP_STRUCT__entry(
1113 __field(dev_t, dev) 1131 __field(dev_t, dev)
1114 __field(xfs_agnumber_t, agno) 1132 __field(xfs_agnumber_t, agno)
1115 __field(xfs_agblock_t, agbno) 1133 __field(xfs_agblock_t, agbno)
1116 __field(xfs_extlen_t, len) 1134 __field(xfs_extlen_t, len)
1117 __field(xfs_lsn_t, lsn) 1135 __field(int, found)
1118 ), 1136 ),
1119 TP_fast_assign( 1137 TP_fast_assign(
1120 __entry->dev = mp->m_super->s_dev; 1138 __entry->dev = mp->m_super->s_dev;
1121 __entry->agno = agno; 1139 __entry->agno = agno;
1122 __entry->agbno = agbno; 1140 __entry->agbno = agbno;
1123 __entry->len = len; 1141 __entry->len = len;
1124 __entry->lsn = lsn; 1142 __entry->found = found;
1125 ), 1143 ),
1126 TP_printk("dev %d:%d agno %u agbno %u len %u force lsn 0x%llx", 1144 TP_printk("dev %d:%d agno %u agbno %u len %u %s",
1127 MAJOR(__entry->dev), MINOR(__entry->dev), 1145 MAJOR(__entry->dev), MINOR(__entry->dev),
1128 __entry->agno, 1146 __entry->agno,
1129 __entry->agbno, 1147 __entry->agbno,
1130 __entry->len, 1148 __entry->len,
1149 __print_symbolic(__entry->found, XFS_BUSY_STATES))
1150);
1151
1152TRACE_EVENT(xfs_trans_commit_lsn,
1153 TP_PROTO(struct xfs_trans *trans),
1154 TP_ARGS(trans),
1155 TP_STRUCT__entry(
1156 __field(dev_t, dev)
1157 __field(struct xfs_trans *, tp)
1158 __field(xfs_lsn_t, lsn)
1159 ),
1160 TP_fast_assign(
1161 __entry->dev = trans->t_mountp->m_super->s_dev;
1162 __entry->tp = trans;
1163 __entry->lsn = trans->t_commit_lsn;
1164 ),
1165 TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx",
1166 MAJOR(__entry->dev), MINOR(__entry->dev),
1167 __entry->tp,
1131 __entry->lsn) 1168 __entry->lsn)
1132); 1169);
1133 1170
@@ -1495,6 +1532,140 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \
1495DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); 1532DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
1496DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); 1533DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
1497 1534
1535DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
1536 TP_PROTO(struct log *log, struct xlog_recover *trans,
1537 struct xlog_recover_item *item, int pass),
1538 TP_ARGS(log, trans, item, pass),
1539 TP_STRUCT__entry(
1540 __field(dev_t, dev)
1541 __field(unsigned long, item)
1542 __field(xlog_tid_t, tid)
1543 __field(int, type)
1544 __field(int, pass)
1545 __field(int, count)
1546 __field(int, total)
1547 ),
1548 TP_fast_assign(
1549 __entry->dev = log->l_mp->m_super->s_dev;
1550 __entry->item = (unsigned long)item;
1551 __entry->tid = trans->r_log_tid;
1552 __entry->type = ITEM_TYPE(item);
1553 __entry->pass = pass;
1554 __entry->count = item->ri_cnt;
1555 __entry->total = item->ri_total;
1556 ),
1557 TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s "
1558 "item region count/total %d/%d",
1559 MAJOR(__entry->dev), MINOR(__entry->dev),
1560 __entry->tid,
1561 __entry->pass,
1562 (void *)__entry->item,
1563 __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
1564 __entry->count,
1565 __entry->total)
1566)
1567
1568#define DEFINE_LOG_RECOVER_ITEM(name) \
1569DEFINE_EVENT(xfs_log_recover_item_class, name, \
1570 TP_PROTO(struct log *log, struct xlog_recover *trans, \
1571 struct xlog_recover_item *item, int pass), \
1572 TP_ARGS(log, trans, item, pass))
1573
1574DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add);
1575DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add_cont);
1576DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_head);
1577DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_tail);
1578DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_recover);
1579
1580DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class,
1581 TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f),
1582 TP_ARGS(log, buf_f),
1583 TP_STRUCT__entry(
1584 __field(dev_t, dev)
1585 __field(__int64_t, blkno)
1586 __field(unsigned short, len)
1587 __field(unsigned short, flags)
1588 __field(unsigned short, size)
1589 __field(unsigned int, map_size)
1590 ),
1591 TP_fast_assign(
1592 __entry->dev = log->l_mp->m_super->s_dev;
1593 __entry->blkno = buf_f->blf_blkno;
1594 __entry->len = buf_f->blf_len;
1595 __entry->flags = buf_f->blf_flags;
1596 __entry->size = buf_f->blf_size;
1597 __entry->map_size = buf_f->blf_map_size;
1598 ),
1599 TP_printk("dev %d:%d blkno 0x%llx, len %u, flags 0x%x, size %d, "
1600 "map_size %d",
1601 MAJOR(__entry->dev), MINOR(__entry->dev),
1602 __entry->blkno,
1603 __entry->len,
1604 __entry->flags,
1605 __entry->size,
1606 __entry->map_size)
1607)
1608
1609#define DEFINE_LOG_RECOVER_BUF_ITEM(name) \
1610DEFINE_EVENT(xfs_log_recover_buf_item_class, name, \
1611 TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f), \
1612 TP_ARGS(log, buf_f))
1613
1614DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_not_cancel);
1615DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel);
1616DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add);
1617DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc);
1618DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover);
1619DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf);
1620DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf);
1621DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf);
1622
1623DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class,
1624 TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f),
1625 TP_ARGS(log, in_f),
1626 TP_STRUCT__entry(
1627 __field(dev_t, dev)
1628 __field(xfs_ino_t, ino)
1629 __field(unsigned short, size)
1630 __field(int, fields)
1631 __field(unsigned short, asize)
1632 __field(unsigned short, dsize)
1633 __field(__int64_t, blkno)
1634 __field(int, len)
1635 __field(int, boffset)
1636 ),
1637 TP_fast_assign(
1638 __entry->dev = log->l_mp->m_super->s_dev;
1639 __entry->ino = in_f->ilf_ino;
1640 __entry->size = in_f->ilf_size;
1641 __entry->fields = in_f->ilf_fields;
1642 __entry->asize = in_f->ilf_asize;
1643 __entry->dsize = in_f->ilf_dsize;
1644 __entry->blkno = in_f->ilf_blkno;
1645 __entry->len = in_f->ilf_len;
1646 __entry->boffset = in_f->ilf_boffset;
1647 ),
1648 TP_printk("dev %d:%d ino 0x%llx, size %u, fields 0x%x, asize %d, "
1649 "dsize %d, blkno 0x%llx, len %d, boffset %d",
1650 MAJOR(__entry->dev), MINOR(__entry->dev),
1651 __entry->ino,
1652 __entry->size,
1653 __entry->fields,
1654 __entry->asize,
1655 __entry->dsize,
1656 __entry->blkno,
1657 __entry->len,
1658 __entry->boffset)
1659)
1660#define DEFINE_LOG_RECOVER_INO_ITEM(name) \
1661DEFINE_EVENT(xfs_log_recover_ino_item_class, name, \
1662 TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f), \
1663 TP_ARGS(log, in_f))
1664
1665DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
1666DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
1667DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
1668
1498#endif /* _TRACE_XFS_H */ 1669#endif /* _TRACE_XFS_H */
1499 1670
1500#undef TRACE_INCLUDE_PATH 1671#undef TRACE_INCLUDE_PATH
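
Each DEFINE_EVENT above expands to a trace_<name>() function with its class's prototype, so the recovery code can fire these points with a single call. A hedged kernel-style sketch of a call site; the surrounding function is invented for illustration, only the tracepoint name comes from the patch:

/* Assumes the definitions above are visible via xfs_trace.h. */
static void recover_one_item(struct log *log, struct xlog_recover *trans,
			     struct xlog_recover_item *item, int pass)
{
	/* generated by DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_recover) */
	trace_xfs_log_recover_item_recover(log, trans, item, pass);

	/* ... item replay would follow ... */
}
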
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c
index fa01b9daba6b..87d3e03878c8 100644
--- a/fs/xfs/linux-2.6/xfs_xattr.c
+++ b/fs/xfs/linux-2.6/xfs_xattr.c
@@ -72,28 +72,28 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
72 (void *)value, size, xflags); 72 (void *)value, size, xflags);
73} 73}
74 74
75static struct xattr_handler xfs_xattr_user_handler = { 75static const struct xattr_handler xfs_xattr_user_handler = {
76 .prefix = XATTR_USER_PREFIX, 76 .prefix = XATTR_USER_PREFIX,
77 .flags = 0, /* no flags implies user namespace */ 77 .flags = 0, /* no flags implies user namespace */
78 .get = xfs_xattr_get, 78 .get = xfs_xattr_get,
79 .set = xfs_xattr_set, 79 .set = xfs_xattr_set,
80}; 80};
81 81
82static struct xattr_handler xfs_xattr_trusted_handler = { 82static const struct xattr_handler xfs_xattr_trusted_handler = {
83 .prefix = XATTR_TRUSTED_PREFIX, 83 .prefix = XATTR_TRUSTED_PREFIX,
84 .flags = ATTR_ROOT, 84 .flags = ATTR_ROOT,
85 .get = xfs_xattr_get, 85 .get = xfs_xattr_get,
86 .set = xfs_xattr_set, 86 .set = xfs_xattr_set,
87}; 87};
88 88
89static struct xattr_handler xfs_xattr_security_handler = { 89static const struct xattr_handler xfs_xattr_security_handler = {
90 .prefix = XATTR_SECURITY_PREFIX, 90 .prefix = XATTR_SECURITY_PREFIX,
91 .flags = ATTR_SECURE, 91 .flags = ATTR_SECURE,
92 .get = xfs_xattr_get, 92 .get = xfs_xattr_get,
93 .set = xfs_xattr_set, 93 .set = xfs_xattr_set,
94}; 94};
95 95
96struct xattr_handler *xfs_xattr_handlers[] = { 96const struct xattr_handler *xfs_xattr_handlers[] = {
97 &xfs_xattr_user_handler, 97 &xfs_xattr_user_handler,
98 &xfs_xattr_trusted_handler, 98 &xfs_xattr_trusted_handler,
99 &xfs_xattr_security_handler, 99 &xfs_xattr_security_handler,
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 5f79dd78626b..585e7633dfc7 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -101,7 +101,7 @@ xfs_qm_dqinit(
101 * No need to re-initialize these if this is a reclaimed dquot. 101 * No need to re-initialize these if this is a reclaimed dquot.
102 */ 102 */
103 if (brandnewdquot) { 103 if (brandnewdquot) {
104 dqp->dq_flnext = dqp->dq_flprev = dqp; 104 INIT_LIST_HEAD(&dqp->q_freelist);
105 mutex_init(&dqp->q_qlock); 105 mutex_init(&dqp->q_qlock);
106 init_waitqueue_head(&dqp->q_pinwait); 106 init_waitqueue_head(&dqp->q_pinwait);
107 107
@@ -119,20 +119,20 @@ xfs_qm_dqinit(
119 * Only the q_core portion was zeroed in dqreclaim_one(). 119 * Only the q_core portion was zeroed in dqreclaim_one().
120 * So, we need to reset others. 120 * So, we need to reset others.
121 */ 121 */
122 dqp->q_nrefs = 0; 122 dqp->q_nrefs = 0;
123 dqp->q_blkno = 0; 123 dqp->q_blkno = 0;
124 dqp->MPL_NEXT = dqp->HL_NEXT = NULL; 124 INIT_LIST_HEAD(&dqp->q_mplist);
125 dqp->HL_PREVP = dqp->MPL_PREVP = NULL; 125 INIT_LIST_HEAD(&dqp->q_hashlist);
126 dqp->q_bufoffset = 0; 126 dqp->q_bufoffset = 0;
127 dqp->q_fileoffset = 0; 127 dqp->q_fileoffset = 0;
128 dqp->q_transp = NULL; 128 dqp->q_transp = NULL;
129 dqp->q_gdquot = NULL; 129 dqp->q_gdquot = NULL;
130 dqp->q_res_bcount = 0; 130 dqp->q_res_bcount = 0;
131 dqp->q_res_icount = 0; 131 dqp->q_res_icount = 0;
132 dqp->q_res_rtbcount = 0; 132 dqp->q_res_rtbcount = 0;
133 atomic_set(&dqp->q_pincount, 0); 133 atomic_set(&dqp->q_pincount, 0);
134 dqp->q_hash = NULL; 134 dqp->q_hash = NULL;
135 ASSERT(dqp->dq_flnext == dqp->dq_flprev); 135 ASSERT(list_empty(&dqp->q_freelist));
136 136
137 trace_xfs_dqreuse(dqp); 137 trace_xfs_dqreuse(dqp);
138 } 138 }
@@ -158,7 +158,7 @@ void
158xfs_qm_dqdestroy( 158xfs_qm_dqdestroy(
159 xfs_dquot_t *dqp) 159 xfs_dquot_t *dqp)
160{ 160{
161 ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp)); 161 ASSERT(list_empty(&dqp->q_freelist));
162 162
163 mutex_destroy(&dqp->q_qlock); 163 mutex_destroy(&dqp->q_qlock);
164 sv_destroy(&dqp->q_pinwait); 164 sv_destroy(&dqp->q_pinwait);
@@ -252,7 +252,7 @@ xfs_qm_adjust_dqtimers(
252 (be64_to_cpu(d->d_bcount) >= 252 (be64_to_cpu(d->d_bcount) >=
253 be64_to_cpu(d->d_blk_hardlimit)))) { 253 be64_to_cpu(d->d_blk_hardlimit)))) {
254 d->d_btimer = cpu_to_be32(get_seconds() + 254 d->d_btimer = cpu_to_be32(get_seconds() +
255 XFS_QI_BTIMELIMIT(mp)); 255 mp->m_quotainfo->qi_btimelimit);
256 } else { 256 } else {
257 d->d_bwarns = 0; 257 d->d_bwarns = 0;
258 } 258 }
@@ -275,7 +275,7 @@ xfs_qm_adjust_dqtimers(
275 (be64_to_cpu(d->d_icount) >= 275 (be64_to_cpu(d->d_icount) >=
276 be64_to_cpu(d->d_ino_hardlimit)))) { 276 be64_to_cpu(d->d_ino_hardlimit)))) {
277 d->d_itimer = cpu_to_be32(get_seconds() + 277 d->d_itimer = cpu_to_be32(get_seconds() +
278 XFS_QI_ITIMELIMIT(mp)); 278 mp->m_quotainfo->qi_itimelimit);
279 } else { 279 } else {
280 d->d_iwarns = 0; 280 d->d_iwarns = 0;
281 } 281 }
@@ -298,7 +298,7 @@ xfs_qm_adjust_dqtimers(
298 (be64_to_cpu(d->d_rtbcount) >= 298 (be64_to_cpu(d->d_rtbcount) >=
299 be64_to_cpu(d->d_rtb_hardlimit)))) { 299 be64_to_cpu(d->d_rtb_hardlimit)))) {
300 d->d_rtbtimer = cpu_to_be32(get_seconds() + 300 d->d_rtbtimer = cpu_to_be32(get_seconds() +
301 XFS_QI_RTBTIMELIMIT(mp)); 301 mp->m_quotainfo->qi_rtbtimelimit);
302 } else { 302 } else {
303 d->d_rtbwarns = 0; 303 d->d_rtbwarns = 0;
304 } 304 }
@@ -325,6 +325,7 @@ xfs_qm_init_dquot_blk(
325 uint type, 325 uint type,
326 xfs_buf_t *bp) 326 xfs_buf_t *bp)
327{ 327{
328 struct xfs_quotainfo *q = mp->m_quotainfo;
328 xfs_dqblk_t *d; 329 xfs_dqblk_t *d;
329 int curid, i; 330 int curid, i;
330 331
@@ -337,16 +338,16 @@ xfs_qm_init_dquot_blk(
337 /* 338 /*
338 * ID of the first dquot in the block - id's are zero based. 339 * ID of the first dquot in the block - id's are zero based.
339 */ 340 */
340 curid = id - (id % XFS_QM_DQPERBLK(mp)); 341 curid = id - (id % q->qi_dqperchunk);
341 ASSERT(curid >= 0); 342 ASSERT(curid >= 0);
342 memset(d, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp))); 343 memset(d, 0, BBTOB(q->qi_dqchunklen));
343 for (i = 0; i < XFS_QM_DQPERBLK(mp); i++, d++, curid++) 344 for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++)
344 xfs_qm_dqinit_core(curid, type, d); 345 xfs_qm_dqinit_core(curid, type, d);
345 xfs_trans_dquot_buf(tp, bp, 346 xfs_trans_dquot_buf(tp, bp,
346 (type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF : 347 (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF :
347 ((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF : 348 ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF :
348 XFS_BLI_GDQUOT_BUF))); 349 XFS_BLF_GDQUOT_BUF)));
349 xfs_trans_log_buf(tp, bp, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1); 350 xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
350} 351}
351 352
352 353
@@ -419,7 +420,7 @@ xfs_qm_dqalloc(
419 /* now we can just get the buffer (there's nothing to read yet) */ 420 /* now we can just get the buffer (there's nothing to read yet) */
420 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, 421 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
421 dqp->q_blkno, 422 dqp->q_blkno,
422 XFS_QI_DQCHUNKLEN(mp), 423 mp->m_quotainfo->qi_dqchunklen,
423 0); 424 0);
424 if (!bp || (error = XFS_BUF_GETERROR(bp))) 425 if (!bp || (error = XFS_BUF_GETERROR(bp)))
425 goto error1; 426 goto error1;
@@ -500,7 +501,8 @@ xfs_qm_dqtobp(
500 */ 501 */
501 if (dqp->q_blkno == (xfs_daddr_t) 0) { 502 if (dqp->q_blkno == (xfs_daddr_t) 0) {
502 /* We use the id as an index */ 503 /* We use the id as an index */
503 dqp->q_fileoffset = (xfs_fileoff_t)id / XFS_QM_DQPERBLK(mp); 504 dqp->q_fileoffset = (xfs_fileoff_t)id /
505 mp->m_quotainfo->qi_dqperchunk;
504 nmaps = 1; 506 nmaps = 1;
505 quotip = XFS_DQ_TO_QIP(dqp); 507 quotip = XFS_DQ_TO_QIP(dqp);
506 xfs_ilock(quotip, XFS_ILOCK_SHARED); 508 xfs_ilock(quotip, XFS_ILOCK_SHARED);
@@ -529,7 +531,7 @@ xfs_qm_dqtobp(
529 /* 531 /*
530 * offset of dquot in the (fixed sized) dquot chunk. 532 * offset of dquot in the (fixed sized) dquot chunk.
531 */ 533 */
532 dqp->q_bufoffset = (id % XFS_QM_DQPERBLK(mp)) * 534 dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
533 sizeof(xfs_dqblk_t); 535 sizeof(xfs_dqblk_t);
534 if (map.br_startblock == HOLESTARTBLOCK) { 536 if (map.br_startblock == HOLESTARTBLOCK) {
535 /* 537 /*
@@ -559,15 +561,13 @@ xfs_qm_dqtobp(
559 * Read in the buffer, unless we've just done the allocation 561 * Read in the buffer, unless we've just done the allocation
560 * (in which case we already have the buf). 562 * (in which case we already have the buf).
561 */ 563 */
562 if (! newdquot) { 564 if (!newdquot) {
563 trace_xfs_dqtobp_read(dqp); 565 trace_xfs_dqtobp_read(dqp);
564 566
565 if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 567 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
566 dqp->q_blkno, 568 dqp->q_blkno,
567 XFS_QI_DQCHUNKLEN(mp), 569 mp->m_quotainfo->qi_dqchunklen,
568 0, &bp))) { 570 0, &bp);
569 return (error);
570 }
571 if (error || !bp) 571 if (error || !bp)
572 return XFS_ERROR(error); 572 return XFS_ERROR(error);
573 } 573 }
@@ -689,14 +689,14 @@ xfs_qm_idtodq(
 	tp = NULL;
 	if (flags & XFS_QMOPT_DQALLOC) {
 		tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
-		if ((error = xfs_trans_reserve(tp,
-				       XFS_QM_DQALLOC_SPACE_RES(mp),
-				       XFS_WRITE_LOG_RES(mp) +
-				       BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1 +
-				       128,
-				       0,
-				       XFS_TRANS_PERM_LOG_RES,
-				       XFS_WRITE_LOG_COUNT))) {
+		error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp),
+				XFS_WRITE_LOG_RES(mp) +
+				BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 +
+				128,
+				0,
+				XFS_TRANS_PERM_LOG_RES,
+				XFS_WRITE_LOG_COUNT);
+		if (error) {
 			cancelflags = 0;
 			goto error0;
 		}
@@ -751,7 +751,6 @@ xfs_qm_dqlookup(
 {
 	xfs_dquot_t	*dqp;
 	uint		flist_locked;
-	xfs_dquot_t	*d;
 
 	ASSERT(mutex_is_locked(&qh->qh_lock));
 
@@ -760,7 +759,7 @@ xfs_qm_dqlookup(
 	/*
 	 * Traverse the hashchain looking for a match
 	 */
-	for (dqp = qh->qh_next; dqp != NULL; dqp = dqp->HL_NEXT) {
+	list_for_each_entry(dqp, &qh->qh_list, q_hashlist) {
 		/*
 		 * We already have the hashlock. We don't need the
 		 * dqlock to look at the id field of the dquot, since the
@@ -772,12 +771,12 @@ xfs_qm_dqlookup(
 			/*
 			 * All in core dquots must be on the dqlist of mp
 			 */
-			ASSERT(dqp->MPL_PREVP != NULL);
+			ASSERT(!list_empty(&dqp->q_mplist));
 
 			xfs_dqlock(dqp);
 			if (dqp->q_nrefs == 0) {
-				ASSERT (XFS_DQ_IS_ON_FREELIST(dqp));
-				if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) {
+				ASSERT(!list_empty(&dqp->q_freelist));
+				if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) {
 					trace_xfs_dqlookup_want(dqp);
 
 					/*
@@ -787,7 +786,7 @@ xfs_qm_dqlookup(
 					 */
 					dqp->dq_flags |= XFS_DQ_WANT;
 					xfs_dqunlock(dqp);
-					xfs_qm_freelist_lock(xfs_Gqm);
+					mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
 					xfs_dqlock(dqp);
 					dqp->dq_flags &= ~(XFS_DQ_WANT);
 				}
@@ -802,46 +801,28 @@ xfs_qm_dqlookup(
 
 			if (flist_locked) {
 				if (dqp->q_nrefs != 0) {
-					xfs_qm_freelist_unlock(xfs_Gqm);
+					mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
 					flist_locked = B_FALSE;
 				} else {
-					/*
-					 * take it off the freelist
-					 */
+					/* take it off the freelist */
 					trace_xfs_dqlookup_freelist(dqp);
-					XQM_FREELIST_REMOVE(dqp);
-					/* xfs_qm_freelist_print(&(xfs_Gqm->
-							qm_dqfreelist),
-							"after removal"); */
+					list_del_init(&dqp->q_freelist);
+					xfs_Gqm->qm_dqfrlist_cnt--;
 				}
 			}
 
-			/*
-			 * grab a reference
-			 */
 			XFS_DQHOLD(dqp);
 
 			if (flist_locked)
-				xfs_qm_freelist_unlock(xfs_Gqm);
+				mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
 			/*
 			 * move the dquot to the front of the hashchain
 			 */
 			ASSERT(mutex_is_locked(&qh->qh_lock));
-			if (dqp->HL_PREVP != &qh->qh_next) {
-				trace_xfs_dqlookup_move(dqp);
-				if ((d = dqp->HL_NEXT))
-					d->HL_PREVP = dqp->HL_PREVP;
-				*(dqp->HL_PREVP) = d;
-				d = qh->qh_next;
-				d->HL_PREVP = &dqp->HL_NEXT;
-				dqp->HL_NEXT = d;
-				dqp->HL_PREVP = &qh->qh_next;
-				qh->qh_next = dqp;
-			}
+			list_move(&dqp->q_hashlist, &qh->qh_list);
 			trace_xfs_dqlookup_done(dqp);
 			*O_dqpp = dqp;
-			ASSERT(mutex_is_locked(&qh->qh_lock));
-			return (0);
+			return 0;
 		}
 	}
 
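The nine lines of hand-rolled pointer surgery deleted above are exactly what list_move() does generically: unlink the node from wherever it sits and splice it in right after the head. A self-contained rendition of the idiom, simplified from include/linux/list.h:

    struct list_head {
            struct list_head *next, *prev;
    };

    static void list_del_entry(struct list_head *e)
    {
            e->prev->next = e->next;
            e->next->prev = e->prev;
    }

    static void list_add_head(struct list_head *e, struct list_head *head)
    {
            e->next = head->next;
            e->prev = head;
            head->next->prev = e;
            head->next = e;
    }

    /* list_move(): unlink 'e' from its current list, put it at the front. */
    static void list_move_front(struct list_head *e, struct list_head *head)
    {
            list_del_entry(e);
            list_add_head(e, head);
    }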
@@ -975,16 +956,17 @@ xfs_qm_dqget(
 	 */
 	if (ip) {
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		if (! XFS_IS_DQTYPE_ON(mp, type)) {
-			/* inode stays locked on return */
-			xfs_qm_dqdestroy(dqp);
-			return XFS_ERROR(ESRCH);
-		}
+
 		/*
 		 * A dquot could be attached to this inode by now, since
 		 * we had dropped the ilock.
 		 */
 		if (type == XFS_DQ_USER) {
+			if (!XFS_IS_UQUOTA_ON(mp)) {
+				/* inode stays locked on return */
+				xfs_qm_dqdestroy(dqp);
+				return XFS_ERROR(ESRCH);
+			}
 			if (ip->i_udquot) {
 				xfs_qm_dqdestroy(dqp);
 				dqp = ip->i_udquot;
@@ -992,6 +974,11 @@ xfs_qm_dqget(
 				goto dqret;
 			}
 		} else {
+			if (!XFS_IS_OQUOTA_ON(mp)) {
+				/* inode stays locked on return */
+				xfs_qm_dqdestroy(dqp);
+				return XFS_ERROR(ESRCH);
+			}
 			if (ip->i_gdquot) {
 				xfs_qm_dqdestroy(dqp);
 				dqp = ip->i_gdquot;
@@ -1033,13 +1020,14 @@ xfs_qm_dqget(
 	 */
 	ASSERT(mutex_is_locked(&h->qh_lock));
 	dqp->q_hash = h;
-	XQM_HASHLIST_INSERT(h, dqp);
+	list_add(&dqp->q_hashlist, &h->qh_list);
+	h->qh_version++;
 
 	/*
 	 * Attach this dquot to this filesystem's list of all dquots,
 	 * kept inside the mount structure in m_quotainfo field
 	 */
-	xfs_qm_mplist_lock(mp);
+	mutex_lock(&mp->m_quotainfo->qi_dqlist_lock);
 
 	/*
 	 * We return a locked dquot to the caller, with a reference taken
@@ -1047,9 +1035,9 @@ xfs_qm_dqget(
 	xfs_dqlock(dqp);
 	dqp->q_nrefs = 1;
 
-	XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp);
-
-	xfs_qm_mplist_unlock(mp);
+	list_add(&dqp->q_mplist, &mp->m_quotainfo->qi_dqlist);
+	mp->m_quotainfo->qi_dquots++;
+	mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
 	mutex_unlock(&h->qh_lock);
  dqret:
 	ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
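The qh_version bump added next to the insert matters because lookups may drop the chain lock while they sleep: a caller samples the version, blocks, retakes the lock, and restarts the walk if the number moved. A hedged userspace sketch of that revalidation check (types and names are illustrative):

    #include <pthread.h>

    struct chain_sketch {
            pthread_mutex_t lock;
            unsigned int    version;        /* bumped on every insert/remove */
    };

    /* Returns nonzero if the chain mutated since 'seen' was sampled. */
    static int chain_changed(struct chain_sketch *qh, unsigned int seen)
    {
            int changed;

            pthread_mutex_lock(&qh->lock);
            changed = (qh->version != seen);
            pthread_mutex_unlock(&qh->lock);
            return changed;
    }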
@@ -1086,10 +1074,10 @@ xfs_qm_dqput(
 	 * drop the dqlock and acquire the freelist and dqlock
 	 * in the right order; but try to get it out-of-order first
 	 */
-	if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) {
+	if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) {
 		trace_xfs_dqput_wait(dqp);
 		xfs_dqunlock(dqp);
-		xfs_qm_freelist_lock(xfs_Gqm);
+		mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
 		xfs_dqlock(dqp);
 	}
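This trylock-then-backoff sequence is the standard answer to acquiring a lock that sorts *before* one already held: try it out of order, and only on failure drop, retake in order, and revalidate. In portable form (pthread names, purely illustrative):

    #include <pthread.h>

    /* Illustrative: acquire 'outer' while already holding 'inner', when the
     * documented order is outer -> inner. */
    static void lock_outer_with_inner_held(pthread_mutex_t *outer,
                                           pthread_mutex_t *inner)
    {
            if (pthread_mutex_trylock(outer) != 0) {
                    pthread_mutex_unlock(inner);    /* give up inner ... */
                    pthread_mutex_lock(outer);      /* ... take outer in order */
                    pthread_mutex_lock(inner);      /* ... and reacquire inner */
            }
            /* state guarded by 'inner' may have changed while it was
             * dropped and must be revalidated by the caller */
    }

Any state guarded by the dropped lock has to be rechecked afterwards, which is why the surrounding code only acts on the reference count after the dance completes.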
1095 1083
@@ -1100,10 +1088,8 @@ xfs_qm_dqput(
1100 if (--dqp->q_nrefs == 0) { 1088 if (--dqp->q_nrefs == 0) {
1101 trace_xfs_dqput_free(dqp); 1089 trace_xfs_dqput_free(dqp);
1102 1090
1103 /* 1091 list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist);
1104 * insert at end of the freelist. 1092 xfs_Gqm->qm_dqfrlist_cnt++;
1105 */
1106 XQM_FREELIST_INSERT(&(xfs_Gqm->qm_dqfreelist), dqp);
1107 1093
1108 /* 1094 /*
1109 * If we just added a udquot to the freelist, then 1095 * If we just added a udquot to the freelist, then
@@ -1118,10 +1104,6 @@ xfs_qm_dqput(
1118 xfs_dqlock(gdqp); 1104 xfs_dqlock(gdqp);
1119 dqp->q_gdquot = NULL; 1105 dqp->q_gdquot = NULL;
1120 } 1106 }
1121
1122 /* xfs_qm_freelist_print(&(xfs_Gqm->qm_dqfreelist),
1123 "@@@@@++ Free list (after append) @@@@@+");
1124 */
1125 } 1107 }
1126 xfs_dqunlock(dqp); 1108 xfs_dqunlock(dqp);
1127 1109
@@ -1133,7 +1115,7 @@ xfs_qm_dqput(
 			break;
 		dqp = gdqp;
 	}
-	xfs_qm_freelist_unlock(xfs_Gqm);
+	mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
 }
 
 /*
@@ -1386,10 +1368,10 @@ int
 xfs_qm_dqpurge(
 	xfs_dquot_t	*dqp)
 {
-	xfs_dqhash_t	*thishash;
+	xfs_dqhash_t	*qh = dqp->q_hash;
 	xfs_mount_t	*mp = dqp->q_mount;
 
-	ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
+	ASSERT(mutex_is_locked(&mp->m_quotainfo->qi_dqlist_lock));
 	ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock));
 
 	xfs_dqlock(dqp);
@@ -1407,7 +1389,7 @@ xfs_qm_dqpurge(
 		return (1);
 	}
 
-	ASSERT(XFS_DQ_IS_ON_FREELIST(dqp));
+	ASSERT(!list_empty(&dqp->q_freelist));
 
 	/*
 	 * If we're turning off quotas, we have to make sure that, for
@@ -1452,14 +1434,16 @@ xfs_qm_dqpurge(
 	ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
 	       !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
 
-	thishash = dqp->q_hash;
-	XQM_HASHLIST_REMOVE(thishash, dqp);
-	XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(mp)), dqp);
+	list_del_init(&dqp->q_hashlist);
+	qh->qh_version++;
+	list_del_init(&dqp->q_mplist);
+	mp->m_quotainfo->qi_dqreclaims++;
+	mp->m_quotainfo->qi_dquots--;
 	/*
 	 * XXX Move this to the front of the freelist, if we can get the
 	 * freelist lock.
 	 */
-	ASSERT(XFS_DQ_IS_ON_FREELIST(dqp));
+	ASSERT(!list_empty(&dqp->q_freelist));
 
 	dqp->q_mount = NULL;
 	dqp->q_hash = NULL;
@@ -1467,7 +1451,7 @@ xfs_qm_dqpurge(
 	memset(&dqp->q_core, 0, sizeof(dqp->q_core));
 	xfs_dqfunlock(dqp);
 	xfs_dqunlock(dqp);
-	mutex_unlock(&thishash->qh_lock);
+	mutex_unlock(&qh->qh_lock);
 	return (0);
 }
 
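Note that the purge path removes the dquot with list_del_init() rather than plain list_del(): the _init variant re-points the node at itself, so the list_empty() ASSERTs used throughout this patch remain meaningful on detached entries. Simplified, using the list_head sketch above:

    /* list_del_init(): unlink 'e' and leave it self-linked, so a later
     * "is this on a list?" test via list_empty(&e) works. */
    static void list_del_init_sketch(struct list_head *e)
    {
            e->prev->next = e->next;
            e->next->prev = e->prev;
            e->next = e;            /* re-point at self ... */
            e->prev = e;            /* ... so "empty" tests work */
    }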
@@ -1517,6 +1501,7 @@ void
 xfs_qm_dqflock_pushbuf_wait(
 	xfs_dquot_t	*dqp)
 {
+	xfs_mount_t	*mp = dqp->q_mount;
 	xfs_buf_t	*bp;
 
 	/*
@@ -1525,14 +1510,14 @@ xfs_qm_dqflock_pushbuf_wait(
 	 * out immediately. We'll be able to acquire
 	 * the flush lock when the I/O completes.
 	 */
-	bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno,
-		    XFS_QI_DQCHUNKLEN(dqp->q_mount), XBF_TRYLOCK);
+	bp = xfs_incore(mp->m_ddev_targp, dqp->q_blkno,
+			mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
 	if (!bp)
 		goto out_lock;
 
 	if (XFS_BUF_ISDELAYWRITE(bp)) {
 		if (XFS_BUF_ISPINNED(bp))
-			xfs_log_force(dqp->q_mount, 0);
+			xfs_log_force(mp, 0);
 		xfs_buf_delwri_promote(bp);
 		wake_up_process(bp->b_target->bt_task);
 	}
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index a0f7da586d1b..5da3a23b820d 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -33,40 +33,23 @@
  * The hash chain headers (hash buckets)
  */
 typedef struct xfs_dqhash {
-	struct xfs_dquot *qh_next;
+	struct list_head  qh_list;
 	struct mutex	  qh_lock;
 	uint		  qh_version;	/* ever increasing version */
 	uint		  qh_nelems;	/* number of dquots on the list */
 } xfs_dqhash_t;
 
-typedef struct xfs_dqlink {
-	struct xfs_dquot  *ql_next;	/* forward link */
-	struct xfs_dquot **ql_prevp;	/* pointer to prev ql_next */
-} xfs_dqlink_t;
-
 struct xfs_mount;
 struct xfs_trans;
 
 /*
- * This is the marker which is designed to occupy the first few
- * bytes of the xfs_dquot_t structure. Even inside this, the freelist pointers
- * must come first.
- * This serves as the marker ("sentinel") when we have to restart list
- * iterations because of locking considerations.
- */
-typedef struct xfs_dqmarker {
-	struct xfs_dquot*dqm_flnext;	/* link to freelist: must be first */
-	struct xfs_dquot*dqm_flprev;
-	xfs_dqlink_t	dqm_mplist;	/* link to mount's list of dquots */
-	xfs_dqlink_t	dqm_hashlist;	/* link to the hash chain */
-	uint		dqm_flags;	/* various flags (XFS_DQ_*) */
-} xfs_dqmarker_t;
-
-/*
  * The incore dquot structure
  */
 typedef struct xfs_dquot {
-	xfs_dqmarker_t	 q_lists;	/* list ptrs, q_flags (marker) */
+	uint		 dq_flags;	/* various flags (XFS_DQ_*) */
+	struct list_head q_freelist;	/* global free list of dquots */
+	struct list_head q_mplist;	/* mount's list of dquots */
+	struct list_head q_hashlist;	/* global hash list of dquots */
 	xfs_dqhash_t	*q_hash;	/* the hashchain header */
 	struct xfs_mount*q_mount;	/* filesystem this relates to */
 	struct xfs_trans*q_transp;	/* trans this belongs to currently */
@@ -87,13 +70,6 @@ typedef struct xfs_dquot {
 	wait_queue_head_t q_pinwait;	/* dquot pinning wait queue */
 } xfs_dquot_t;
 
-
-#define dq_flnext	q_lists.dqm_flnext
-#define dq_flprev	q_lists.dqm_flprev
-#define dq_mplist	q_lists.dqm_mplist
-#define dq_hashlist	q_lists.dqm_hashlist
-#define dq_flags	q_lists.dqm_flags
-
 /*
  * Lock hierarchy for q_qlock:
  *  XFS_QLOCK_NORMAL is the implicit default,
@@ -127,7 +103,6 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
 }
 
 #define XFS_DQ_IS_LOCKED(dqp)	(mutex_is_locked(&((dqp)->q_qlock)))
-#define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp))
 #define XFS_DQ_IS_DIRTY(dqp)	((dqp)->dq_flags & XFS_DQ_DIRTY)
 #define XFS_QM_ISUDQ(dqp)	((dqp)->dq_flags & XFS_DQ_USER)
 #define XFS_QM_ISPDQ(dqp)	((dqp)->dq_flags & XFS_DQ_PROJ)
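With the marker struct gone, each dquot simply embeds three independent list_head nodes, and generic list code recovers the containing dquot by subtracting the member's offset. That offset subtraction is the whole trick behind list_entry()/list_for_each_entry(); a self-contained sketch:

    #include <stddef.h>

    struct list_head { struct list_head *next, *prev; };

    /* Illustrative container with embedded list nodes. */
    struct dquot_sketch {
            unsigned int     flags;
            struct list_head freelist;      /* node on a global free list */
            struct list_head hashlist;      /* node on one hash chain */
    };

    #define container_of_sketch(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    static struct dquot_sketch *from_hash_node(struct list_head *node)
    {
            return container_of_sketch(node, struct dquot_sketch, hashlist);
    }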
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 4e4ee9a57194..8d89a24ae324 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -107,8 +107,7 @@ xfs_qm_dquot_logitem_pin(
 /* ARGSUSED */
 STATIC void
 xfs_qm_dquot_logitem_unpin(
-	xfs_dq_logitem_t *logitem,
-	int		  stale)
+	xfs_dq_logitem_t *logitem)
 {
 	xfs_dquot_t *dqp = logitem->qli_dquot;
 
@@ -123,7 +122,7 @@ xfs_qm_dquot_logitem_unpin_remove(
 	xfs_dq_logitem_t *logitem,
 	xfs_trans_t	 *tp)
 {
-	xfs_qm_dquot_logitem_unpin(logitem, 0);
+	xfs_qm_dquot_logitem_unpin(logitem);
 }
 
 /*
@@ -228,7 +227,7 @@ xfs_qm_dquot_logitem_pushbuf(
 	}
 	mp = dqp->q_mount;
 	bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno,
-		    XFS_QI_DQCHUNKLEN(mp), XBF_TRYLOCK);
+		    mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
 	xfs_dqunlock(dqp);
 	if (!bp)
 		return;
@@ -329,8 +328,7 @@ static struct xfs_item_ops xfs_dquot_item_ops = {
 	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
 					xfs_qm_dquot_logitem_format,
 	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin,
-	.iop_unpin	= (void(*)(xfs_log_item_t*, int))
-					xfs_qm_dquot_logitem_unpin,
+	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unpin,
 	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
 					xfs_qm_dquot_logitem_unpin_remove,
 	.iop_trylock	= (uint(*)(xfs_log_item_t*))
@@ -357,9 +355,8 @@ xfs_qm_dquot_logitem_init(
 	xfs_dq_logitem_t  *lp;
 	lp = &dqp->q_logitem;
 
-	lp->qli_item.li_type = XFS_LI_DQUOT;
-	lp->qli_item.li_ops = &xfs_dquot_item_ops;
-	lp->qli_item.li_mountp = dqp->q_mount;
+	xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT,
+					&xfs_dquot_item_ops);
 	lp->qli_dquot = dqp;
 	lp->qli_format.qlf_type = XFS_LI_DQUOT;
 	lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id);
@@ -426,7 +423,7 @@ xfs_qm_qoff_logitem_pin(xfs_qoff_logitem_t *qf)
  */
 /*ARGSUSED*/
 STATIC void
-xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf, int stale)
+xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf)
 {
 	return;
 }
@@ -537,8 +534,7 @@ static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
 	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
 					xfs_qm_qoff_logitem_format,
 	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
-	.iop_unpin	= (void(*)(xfs_log_item_t* ,int))
-					xfs_qm_qoff_logitem_unpin,
+	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin,
 	.iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
 					xfs_qm_qoff_logitem_unpin_remove,
 	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
@@ -559,8 +555,7 @@ static struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
 	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
 					xfs_qm_qoff_logitem_format,
 	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
-	.iop_unpin	= (void(*)(xfs_log_item_t*, int))
-					xfs_qm_qoff_logitem_unpin,
+	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin,
 	.iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
 					xfs_qm_qoff_logitem_unpin_remove,
 	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
@@ -586,11 +581,8 @@ xfs_qm_qoff_logitem_init(
 
 	qf = (xfs_qoff_logitem_t*) kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP);
 
-	qf->qql_item.li_type = XFS_LI_QUOTAOFF;
-	if (start)
-		qf->qql_item.li_ops = &xfs_qm_qoffend_logitem_ops;
-	else
-		qf->qql_item.li_ops = &xfs_qm_qoff_logitem_ops;
+	xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
+			&xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
 	qf->qql_item.li_mountp = mp;
 	qf->qql_format.qf_type = XFS_LI_QUOTAOFF;
 	qf->qql_format.qf_flags = flags;
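Both conversions in this file trade three open-coded field assignments for one call to xfs_log_item_init(mp, item, type, ops). The helper's signature is inferred from the call sites above; a sketch of what such a helper plausibly does, with hypothetical stand-in types:

    /* Hypothetical shapes, for illustration only. */
    struct ops_sketch;
    struct item_sketch {
            int                      li_type;
            void                    *li_mountp;
            const struct ops_sketch *li_ops;
    };

    static void log_item_init_sketch(void *mp, struct item_sketch *item,
                                     int type, const struct ops_sketch *ops)
    {
            item->li_mountp = mp;   /* one place to grow common init later */
            item->li_type   = type;
            item->li_ops    = ops;
    }

Centralizing the init means later patches can add common log-item state without touching every item type's constructor.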
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 417e61e3d9dd..38e764146644 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -67,9 +67,6 @@ static cred_t xfs_zerocr;
 STATIC void	xfs_qm_list_init(xfs_dqlist_t *, char *, int);
 STATIC void	xfs_qm_list_destroy(xfs_dqlist_t *);
 
-STATIC void	xfs_qm_freelist_init(xfs_frlist_t *);
-STATIC void	xfs_qm_freelist_destroy(xfs_frlist_t *);
-
 STATIC int	xfs_qm_init_quotainos(xfs_mount_t *);
 STATIC int	xfs_qm_init_quotainfo(xfs_mount_t *);
 STATIC int	xfs_qm_shake(int, gfp_t);
@@ -84,21 +81,25 @@ extern struct mutex qcheck_lock;
 #endif
 
 #ifdef QUOTADEBUG
-#define XQM_LIST_PRINT(l, NXT, title) \
-{ \
-	xfs_dquot_t	*dqp; int i = 0; \
-	cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \
-	for (dqp = (l)->qh_next; dqp != NULL; dqp = dqp->NXT) { \
-		cmn_err(CE_DEBUG, "   %d. \"%d (%s)\" " \
-				  "bcnt = %d, icnt = %d, refs = %d", \
-			++i, (int) be32_to_cpu(dqp->q_core.d_id), \
-			DQFLAGTO_TYPESTR(dqp), \
-			(int) be64_to_cpu(dqp->q_core.d_bcount), \
-			(int) be64_to_cpu(dqp->q_core.d_icount), \
-			(int) dqp->q_nrefs);  } \
-}
+static void
+xfs_qm_dquot_list_print(
+	struct xfs_mount *mp)
+{
+	xfs_dquot_t	*dqp;
+	int		i = 0;
+
+	list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist, q_mplist) {
+		cmn_err(CE_DEBUG, "   %d. \"%d (%s)\" "
+				  "bcnt = %lld, icnt = %lld, refs = %d",
+			i++, be32_to_cpu(dqp->q_core.d_id),
+			DQFLAGTO_TYPESTR(dqp),
+			(long long)be64_to_cpu(dqp->q_core.d_bcount),
+			(long long)be64_to_cpu(dqp->q_core.d_icount),
+			dqp->q_nrefs);
+	}
+}
 #else
-#define XQM_LIST_PRINT(l, NXT, title) do { } while (0)
+static void xfs_qm_dquot_list_print(struct xfs_mount *mp) { }
 #endif
 
 /*
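The debug printer is also a compact demonstration of list_for_each_entry(): the iterator takes the list head (qi_dqlist) and the name of the embedded member (q_mplist), walking raw nodes and performing the container_of step at every hop. Roughly, reusing container_of_sketch from the header note above:

    /* Simplified expansion of list_for_each_entry(pos, head, member): */
    #define list_for_each_entry_sketch(pos, head, member, type)             \
            for (pos = container_of_sketch((head)->next, type, member);     \
                 &pos->member != (head);                                    \
                 pos = container_of_sketch(pos->member.next, type, member))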
@@ -144,7 +145,9 @@ xfs_Gqm_init(void)
 	/*
 	 * Freelist of all dquots of all file systems
 	 */
-	xfs_qm_freelist_init(&(xqm->qm_dqfreelist));
+	INIT_LIST_HEAD(&xqm->qm_dqfrlist);
+	xqm->qm_dqfrlist_cnt = 0;
+	mutex_init(&xqm->qm_dqfrlist_lock);
 
 	/*
 	 * dquot zone. we register our own low-memory callback.
@@ -189,6 +192,7 @@ STATIC void
 xfs_qm_destroy(
 	struct xfs_qm	*xqm)
 {
+	struct xfs_dquot *dqp, *n;
 	int		hsize, i;
 
 	ASSERT(xqm != NULL);
@@ -204,7 +208,21 @@ xfs_qm_destroy(
 	xqm->qm_usr_dqhtable = NULL;
 	xqm->qm_grp_dqhtable = NULL;
 	xqm->qm_dqhashmask = 0;
-	xfs_qm_freelist_destroy(&(xqm->qm_dqfreelist));
+
+	/* frlist cleanup */
+	mutex_lock(&xqm->qm_dqfrlist_lock);
+	list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) {
+		xfs_dqlock(dqp);
+#ifdef QUOTADEBUG
+		cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp);
+#endif
+		list_del_init(&dqp->q_freelist);
+		xfs_Gqm->qm_dqfrlist_cnt--;
+		xfs_dqunlock(dqp);
+		xfs_qm_dqdestroy(dqp);
+	}
+	mutex_unlock(&xqm->qm_dqfrlist_lock);
+	mutex_destroy(&xqm->qm_dqfrlist_lock);
 #ifdef DEBUG
 	mutex_destroy(&qcheck_lock);
 #endif
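The destroy path must use the _safe iterator because the loop body frees the current entry; the variant stashes the next node before the body runs, so freeing `pos` cannot corrupt the walk. Simplified, in the same style as the earlier sketches:

    /* Why the "_safe" variant: the body may free 'pos', so 'n' caches the
     * successor before the body executes. */
    #define list_for_each_entry_safe_sketch(pos, n, head, member, type)        \
            for (pos = container_of_sketch((head)->next, type, member),        \
                 n = container_of_sketch(pos->member.next, type, member);      \
                 &pos->member != (head);                                       \
                 pos = n, n = container_of_sketch(n->member.next, type, member))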
@@ -256,7 +274,7 @@ STATIC void
 xfs_qm_rele_quotafs_ref(
 	struct xfs_mount *mp)
 {
-	xfs_dquot_t	*dqp, *nextdqp;
+	xfs_dquot_t	*dqp, *n;
 
 	ASSERT(xfs_Gqm);
 	ASSERT(xfs_Gqm->qm_nrefs > 0);
@@ -264,26 +282,24 @@ xfs_qm_rele_quotafs_ref(
 	/*
 	 * Go thru the freelist and destroy all inactive dquots.
 	 */
-	xfs_qm_freelist_lock(xfs_Gqm);
+	mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
 
-	for (dqp = xfs_Gqm->qm_dqfreelist.qh_next;
-	     dqp != (xfs_dquot_t *)&(xfs_Gqm->qm_dqfreelist); ) {
+	list_for_each_entry_safe(dqp, n, &xfs_Gqm->qm_dqfrlist, q_freelist) {
 		xfs_dqlock(dqp);
-		nextdqp = dqp->dq_flnext;
 		if (dqp->dq_flags & XFS_DQ_INACTIVE) {
 			ASSERT(dqp->q_mount == NULL);
 			ASSERT(! XFS_DQ_IS_DIRTY(dqp));
-			ASSERT(dqp->HL_PREVP == NULL);
-			ASSERT(dqp->MPL_PREVP == NULL);
-			XQM_FREELIST_REMOVE(dqp);
+			ASSERT(list_empty(&dqp->q_hashlist));
+			ASSERT(list_empty(&dqp->q_mplist));
+			list_del_init(&dqp->q_freelist);
+			xfs_Gqm->qm_dqfrlist_cnt--;
 			xfs_dqunlock(dqp);
 			xfs_qm_dqdestroy(dqp);
 		} else {
 			xfs_dqunlock(dqp);
 		}
-		dqp = nextdqp;
 	}
-	xfs_qm_freelist_unlock(xfs_Gqm);
+	mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
 
 	/*
 	 * Destroy the entire XQM. If somebody mounts with quotaon, this'll
@@ -305,7 +321,7 @@ xfs_qm_unmount(
 	struct xfs_mount	*mp)
 {
 	if (mp->m_quotainfo) {
-		xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING);
+		xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
 		xfs_qm_destroy_quotainfo(mp);
 	}
 }
@@ -449,20 +465,21 @@ xfs_qm_unmount_quotas(
  */
 STATIC int
 xfs_qm_dqflush_all(
-	xfs_mount_t		*mp,
+	struct xfs_mount	*mp,
 	int			sync_mode)
 {
-	int		recl;
-	xfs_dquot_t	*dqp;
-	int		niters;
-	int		error;
+	struct xfs_quotainfo	*q = mp->m_quotainfo;
+	int			recl;
+	struct xfs_dquot	*dqp;
+	int			niters;
+	int			error;
 
-	if (mp->m_quotainfo == NULL)
+	if (!q)
 		return 0;
 	niters = 0;
 again:
-	xfs_qm_mplist_lock(mp);
-	FOREACH_DQUOT_IN_MP(dqp, mp) {
+	mutex_lock(&q->qi_dqlist_lock);
+	list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
 		xfs_dqlock(dqp);
 		if (! XFS_DQ_IS_DIRTY(dqp)) {
 			xfs_dqunlock(dqp);
@@ -470,7 +487,7 @@ again:
 		}
 
 		/* XXX a sentinel would be better */
-		recl = XFS_QI_MPLRECLAIMS(mp);
+		recl = q->qi_dqreclaims;
 		if (!xfs_dqflock_nowait(dqp)) {
 			/*
 			 * If we can't grab the flush lock then check
@@ -485,21 +502,21 @@ again:
 		 * Let go of the mplist lock. We don't want to hold it
 		 * across a disk write.
 		 */
-		xfs_qm_mplist_unlock(mp);
+		mutex_unlock(&q->qi_dqlist_lock);
 		error = xfs_qm_dqflush(dqp, sync_mode);
 		xfs_dqunlock(dqp);
 		if (error)
 			return error;
 
-		xfs_qm_mplist_lock(mp);
-		if (recl != XFS_QI_MPLRECLAIMS(mp)) {
-			xfs_qm_mplist_unlock(mp);
+		mutex_lock(&q->qi_dqlist_lock);
+		if (recl != q->qi_dqreclaims) {
+			mutex_unlock(&q->qi_dqlist_lock);
 			/* XXX restart limit */
 			goto again;
 		}
 	}
 
-	xfs_qm_mplist_unlock(mp);
+	mutex_unlock(&q->qi_dqlist_lock);
 	/* return ! busy */
 	return 0;
 }
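qi_dqreclaims is doing duty here as a generation counter: sample it, drop the list lock for the disk write, retake the lock, and restart the whole walk if any dquot was reclaimed in between (the "XXX a sentinel would be better" comment concedes this can restart often). The pattern in isolation, as a hedged userspace sketch reusing the list_head type from earlier:

    #include <pthread.h>

    struct gen_list {
            pthread_mutex_t  lock;
            unsigned long    gen;           /* bumped on every removal */
            struct list_head head;
    };

    /* Walk the list doing blocking work per entry; restart whenever the
     * generation moved while the lock was dropped. Illustrative only. */
    static void walk_restarting(struct gen_list *l,
                                void (*work)(struct list_head *))
    {
            struct list_head *pos;
            unsigned long gen;

    again:
            pthread_mutex_lock(&l->lock);
            for (pos = l->head.next; pos != &l->head; pos = pos->next) {
                    gen = l->gen;
                    pthread_mutex_unlock(&l->lock);
                    work(pos);                      /* may sleep */
                    pthread_mutex_lock(&l->lock);
                    if (gen != l->gen) {            /* 'pos' may be stale */
                            pthread_mutex_unlock(&l->lock);
                            goto again;
                    }
            }
            pthread_mutex_unlock(&l->lock);
    }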
@@ -509,15 +526,15 @@ again:
  */
 STATIC void
 xfs_qm_detach_gdquots(
-	xfs_mount_t	*mp)
+	struct xfs_mount	*mp)
 {
-	xfs_dquot_t	*dqp, *gdqp;
-	int		nrecl;
+	struct xfs_quotainfo	*q = mp->m_quotainfo;
+	struct xfs_dquot	*dqp, *gdqp;
+	int			nrecl;
 
  again:
-	ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
-	dqp = XFS_QI_MPLNEXT(mp);
-	while (dqp) {
+	ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
+	list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
 		xfs_dqlock(dqp);
 		if ((gdqp = dqp->q_gdquot)) {
 			xfs_dqlock(gdqp);
@@ -530,15 +547,14 @@ xfs_qm_detach_gdquots(
 			 * Can't hold the mplist lock across a dqput.
 			 * XXXmust convert to marker based iterations here.
 			 */
-			nrecl = XFS_QI_MPLRECLAIMS(mp);
-			xfs_qm_mplist_unlock(mp);
+			nrecl = q->qi_dqreclaims;
+			mutex_unlock(&q->qi_dqlist_lock);
 			xfs_qm_dqput(gdqp);
 
-			xfs_qm_mplist_lock(mp);
-			if (nrecl != XFS_QI_MPLRECLAIMS(mp))
+			mutex_lock(&q->qi_dqlist_lock);
+			if (nrecl != q->qi_dqreclaims)
 				goto again;
 		}
-		dqp = dqp->MPL_NEXT;
 	}
 }
 
@@ -550,23 +566,23 @@ xfs_qm_detach_gdquots(
  */
 STATIC int
 xfs_qm_dqpurge_int(
-	xfs_mount_t	*mp,
-	uint		flags)	/* QUOTAOFF/UMOUNTING/UQUOTA/PQUOTA/GQUOTA */
+	struct xfs_mount	*mp,
+	uint			flags)
 {
-	xfs_dquot_t	*dqp;
-	uint		dqtype;
-	int		nrecl;
-	xfs_dquot_t	*nextdqp;
-	int		nmisses;
+	struct xfs_quotainfo	*q = mp->m_quotainfo;
+	struct xfs_dquot	*dqp, *n;
+	uint			dqtype;
+	int			nrecl;
+	int			nmisses;
 
-	if (mp->m_quotainfo == NULL)
+	if (!q)
 		return 0;
 
 	dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0;
 	dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0;
 	dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0;
 
-	xfs_qm_mplist_lock(mp);
+	mutex_lock(&q->qi_dqlist_lock);
 
 	/*
 	 * In the first pass through all incore dquots of this filesystem,
@@ -578,28 +594,25 @@ xfs_qm_dqpurge_int(
 
  again:
 	nmisses = 0;
-	ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
+	ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
 	/*
 	 * Try to get rid of all of the unwanted dquots. The idea is to
 	 * get them off mplist and hashlist, but leave them on freelist.
 	 */
-	dqp = XFS_QI_MPLNEXT(mp);
-	while (dqp) {
+	list_for_each_entry_safe(dqp, n, &q->qi_dqlist, q_mplist) {
 		/*
 		 * It's OK to look at the type without taking dqlock here.
 		 * We're holding the mplist lock here, and that's needed for
 		 * a dqreclaim.
 		 */
-		if ((dqp->dq_flags & dqtype) == 0) {
-			dqp = dqp->MPL_NEXT;
+		if ((dqp->dq_flags & dqtype) == 0)
 			continue;
-		}
 
 		if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
-			nrecl = XFS_QI_MPLRECLAIMS(mp);
-			xfs_qm_mplist_unlock(mp);
+			nrecl = q->qi_dqreclaims;
+			mutex_unlock(&q->qi_dqlist_lock);
 			mutex_lock(&dqp->q_hash->qh_lock);
-			xfs_qm_mplist_lock(mp);
+			mutex_lock(&q->qi_dqlist_lock);
 
 			/*
 			 * XXXTheoretically, we can get into a very long
@@ -607,7 +620,7 @@ xfs_qm_dqpurge_int(
 			 * No one can be adding dquots to the mplist at
 			 * this point, but somebody might be taking things off.
 			 */
-			if (nrecl != XFS_QI_MPLRECLAIMS(mp)) {
+			if (nrecl != q->qi_dqreclaims) {
 				mutex_unlock(&dqp->q_hash->qh_lock);
 				goto again;
 			}
@@ -617,11 +630,9 @@ xfs_qm_dqpurge_int(
 		 * Take the dquot off the mplist and hashlist. It may remain on
 		 * freelist in INACTIVE state.
 		 */
-		nextdqp = dqp->MPL_NEXT;
 		nmisses += xfs_qm_dqpurge(dqp);
-		dqp = nextdqp;
 	}
-	xfs_qm_mplist_unlock(mp);
+	mutex_unlock(&q->qi_dqlist_lock);
 	return nmisses;
 }
 
@@ -921,12 +932,13 @@ xfs_qm_dqdetach(
 
 int
 xfs_qm_sync(
-	xfs_mount_t	*mp,
+	struct xfs_mount	*mp,
 	int		flags)
 {
-	int		recl, restarts;
-	xfs_dquot_t	*dqp;
-	int		error;
+	struct xfs_quotainfo	*q = mp->m_quotainfo;
+	int			recl, restarts;
+	struct xfs_dquot	*dqp;
+	int			error;
 
 	if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
 		return 0;
@@ -934,18 +946,19 @@ xfs_qm_sync(
 	restarts = 0;
 
  again:
-	xfs_qm_mplist_lock(mp);
+	mutex_lock(&q->qi_dqlist_lock);
 	/*
 	 * dqpurge_all() also takes the mplist lock and iterate thru all dquots
 	 * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared
 	 * when we have the mplist lock, we know that dquots will be consistent
 	 * as long as we have it locked.
 	 */
-	if (! XFS_IS_QUOTA_ON(mp)) {
-		xfs_qm_mplist_unlock(mp);
+	if (!XFS_IS_QUOTA_ON(mp)) {
+		mutex_unlock(&q->qi_dqlist_lock);
 		return 0;
 	}
-	FOREACH_DQUOT_IN_MP(dqp, mp) {
+	ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
+	list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
 		/*
 		 * If this is vfs_sync calling, then skip the dquots that
 		 * don't 'seem' to be dirty. ie. don't acquire dqlock.
@@ -969,7 +982,7 @@ xfs_qm_sync(
 		}
 
 		/* XXX a sentinel would be better */
-		recl = XFS_QI_MPLRECLAIMS(mp);
+		recl = q->qi_dqreclaims;
 		if (!xfs_dqflock_nowait(dqp)) {
 			if (flags & SYNC_TRYLOCK) {
 				xfs_dqunlock(dqp);
@@ -989,7 +1002,7 @@ xfs_qm_sync(
 		 * Let go of the mplist lock. We don't want to hold it
 		 * across a disk write
 		 */
-		xfs_qm_mplist_unlock(mp);
+		mutex_unlock(&q->qi_dqlist_lock);
 		error = xfs_qm_dqflush(dqp, flags);
 		xfs_dqunlock(dqp);
 		if (error && XFS_FORCED_SHUTDOWN(mp))
@@ -997,17 +1010,17 @@ xfs_qm_sync(
 		else if (error)
 			return error;
 
-		xfs_qm_mplist_lock(mp);
-		if (recl != XFS_QI_MPLRECLAIMS(mp)) {
+		mutex_lock(&q->qi_dqlist_lock);
+		if (recl != q->qi_dqreclaims) {
 			if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS)
 				break;
 
-			xfs_qm_mplist_unlock(mp);
+			mutex_unlock(&q->qi_dqlist_lock);
 			goto again;
 		}
 	}
 
-	xfs_qm_mplist_unlock(mp);
+	mutex_unlock(&q->qi_dqlist_lock);
 	return 0;
 }
 
@@ -1052,8 +1065,9 @@ xfs_qm_init_quotainfo(
 		return error;
 	}
 
-	xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0);
-	lockdep_set_class(&qinf->qi_dqlist.qh_lock, &xfs_quota_mplist_class);
+	INIT_LIST_HEAD(&qinf->qi_dqlist);
+	mutex_init(&qinf->qi_dqlist_lock);
+	lockdep_set_class(&qinf->qi_dqlist_lock, &xfs_quota_mplist_class);
 
 	qinf->qi_dqreclaims = 0;
 
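The lockdep_set_class() call survives the conversion because every mount's qi_dqlist_lock would otherwise share the generic mutex key; keying the quota mplist locks as their own class lets the validator model their ordering against the dquot and hash locks separately. The idiom in brief, as a hedged kernel-style fragment:

    #include <linux/mutex.h>
    #include <linux/lockdep.h>

    static struct lock_class_key xfs_quota_mplist_class;  /* one key per class */

    static void init_dqlist_lock(struct mutex *lock)
    {
            mutex_init(lock);
            /* re-key this mutex so lockdep tracks it as its own class */
            lockdep_set_class(lock, &xfs_quota_mplist_class);
    }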
@@ -1150,7 +1164,8 @@ xfs_qm_destroy_quotainfo(
 	 */
 	xfs_qm_rele_quotafs_ref(mp);
 
-	xfs_qm_list_destroy(&qi->qi_dqlist);
+	ASSERT(list_empty(&qi->qi_dqlist));
+	mutex_destroy(&qi->qi_dqlist_lock);
 
 	if (qi->qi_uquotaip) {
 		IRELE(qi->qi_uquotaip);
@@ -1177,7 +1192,7 @@ xfs_qm_list_init(
 	int		n)
 {
 	mutex_init(&list->qh_lock);
-	list->qh_next = NULL;
+	INIT_LIST_HEAD(&list->qh_list);
 	list->qh_version = 0;
 	list->qh_nelems = 0;
 }
@@ -1316,9 +1331,6 @@ xfs_qm_qino_alloc(
 	 */
 	spin_lock(&mp->m_sb_lock);
 	if (flags & XFS_QMOPT_SBVERSION) {
-#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
-		unsigned oldv = mp->m_sb.sb_versionnum;
-#endif
 		ASSERT(!xfs_sb_version_hasquota(&mp->m_sb));
 		ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
 				   XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) ==
@@ -1331,11 +1343,6 @@ xfs_qm_qino_alloc(
 
 		/* qflags will get updated _after_ quotacheck */
 		mp->m_sb.sb_qflags = 0;
-#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
-		cmn_err(CE_NOTE,
-			"Old superblock version %x, converting to %x.",
-			oldv, mp->m_sb.sb_versionnum);
-#endif
 	}
 	if (flags & XFS_QMOPT_UQUOTA)
 		mp->m_sb.sb_uquotino = (*ip)->i_ino;
@@ -1371,10 +1378,10 @@ xfs_qm_reset_dqcounts(
 #ifdef DEBUG
 	j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
 	do_div(j, sizeof(xfs_dqblk_t));
-	ASSERT(XFS_QM_DQPERBLK(mp) == j);
+	ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
 #endif
 	ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp);
-	for (j = 0; j < XFS_QM_DQPERBLK(mp); j++) {
+	for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) {
 		/*
 		 * Do a sanity check, and if needed, repair the dqblk. Don't
 		 * output any warnings because it's perfectly possible to
@@ -1429,7 +1436,7 @@ xfs_qm_dqiter_bufs(
 	while (blkcnt--) {
 		error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
 			      XFS_FSB_TO_DADDR(mp, bno),
-			      (int)XFS_QI_DQCHUNKLEN(mp), 0, &bp);
+			      mp->m_quotainfo->qi_dqchunklen, 0, &bp);
 		if (error)
 			break;
 
@@ -1439,7 +1446,7 @@ xfs_qm_dqiter_bufs(
 		 * goto the next block.
 		 */
 		bno++;
-		firstid += XFS_QM_DQPERBLK(mp);
+		firstid += mp->m_quotainfo->qi_dqperchunk;
 	}
 	return error;
 }
@@ -1505,7 +1512,7 @@ xfs_qm_dqiterate(
 			continue;
 
 		firstid = (xfs_dqid_t) map[i].br_startoff *
-			  XFS_QM_DQPERBLK(mp);
+			  mp->m_quotainfo->qi_dqperchunk;
 		/*
 		 * Do a read-ahead on the next extent.
 		 */
@@ -1516,7 +1523,7 @@ xfs_qm_dqiterate(
 			while (rablkcnt--) {
 				xfs_baread(mp->m_ddev_targp,
 					XFS_FSB_TO_DADDR(mp, rablkno),
-					(int)XFS_QI_DQCHUNKLEN(mp));
+					mp->m_quotainfo->qi_dqchunklen);
 				rablkno++;
 			}
 		}
@@ -1576,8 +1583,10 @@ xfs_qm_quotacheck_dqadjust(
 
 	/*
 	 * Set default limits, adjust timers (since we changed usages)
+	 *
+	 * There are no timers for the default values set in the root dquot.
 	 */
-	if (! XFS_IS_SUSER_DQUOT(dqp)) {
+	if (dqp->q_core.d_id) {
 		xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core);
 		xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core);
 	}
@@ -1747,14 +1756,14 @@ xfs_qm_quotacheck(
 	lastino = 0;
 	flags = 0;
 
-	ASSERT(XFS_QI_UQIP(mp) || XFS_QI_GQIP(mp));
+	ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip);
 	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
 
 	/*
 	 * There should be no cached dquots. The (simplistic) quotacheck
 	 * algorithm doesn't like that.
 	 */
-	ASSERT(XFS_QI_MPLNDQUOTS(mp) == 0);
+	ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist));
 
 	cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname);
 
@@ -1763,15 +1772,19 @@ xfs_qm_quotacheck(
 	 * their counters to zero. We need a clean slate.
 	 * We don't log our changes till later.
 	 */
-	if ((uip = XFS_QI_UQIP(mp))) {
-		if ((error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA)))
+	uip = mp->m_quotainfo->qi_uquotaip;
+	if (uip) {
+		error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA);
+		if (error)
 			goto error_return;
 		flags |= XFS_UQUOTA_CHKD;
 	}
 
-	if ((gip = XFS_QI_GQIP(mp))) {
-		if ((error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
-					XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA)))
+	gip = mp->m_quotainfo->qi_gquotaip;
+	if (gip) {
+		error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
+					XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA);
+		if (error)
 			goto error_return;
 		flags |= XFS_OQUOTA_CHKD;
 	}
@@ -1804,7 +1817,7 @@ xfs_qm_quotacheck(
 	 * at this point (because we intentionally didn't in dqget_noattach).
 	 */
 	if (error) {
-		xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_QUOTAOFF);
+		xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
 		goto error_return;
 	}
 
@@ -1825,7 +1838,7 @@ xfs_qm_quotacheck(
 	mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD);
 	mp->m_qflags |= flags;
 
-	XQM_LIST_PRINT(&(XFS_QI_MPL_LIST(mp)), MPL_NEXT, "++++ Mp list +++");
+	xfs_qm_dquot_list_print(mp);
 
  error_return:
 	if (error) {
@@ -1920,59 +1933,53 @@ xfs_qm_init_quotainos(
 		}
 	}
 
-	XFS_QI_UQIP(mp) = uip;
-	XFS_QI_GQIP(mp) = gip;
+	mp->m_quotainfo->qi_uquotaip = uip;
+	mp->m_quotainfo->qi_gquotaip = gip;
 
 	return 0;
 }
 
 
+
 /*
- * Traverse the freelist of dquots and attempt to reclaim a maximum of
- * 'howmany' dquots. This operation races with dqlookup(), and attempts to
- * favor the lookup function ...
- * XXXsup merge this with qm_reclaim_one().
+ * Just pop the least recently used dquot off the freelist and
+ * recycle it. The returned dquot is locked.
  */
-STATIC int
-xfs_qm_shake_freelist(
-	int howmany)
+STATIC xfs_dquot_t *
+xfs_qm_dqreclaim_one(void)
 {
-	int		nreclaimed;
-	xfs_dqhash_t	*hash;
-	xfs_dquot_t	*dqp, *nextdqp;
+	xfs_dquot_t	*dqpout;
+	xfs_dquot_t	*dqp;
 	int		restarts;
-	int		nflushes;
-
-	if (howmany <= 0)
-		return 0;
 
-	nreclaimed = 0;
 	restarts = 0;
-	nflushes = 0;
+	dqpout = NULL;
 
-#ifdef QUOTADEBUG
-	cmn_err(CE_DEBUG, "Shake free 0x%x", howmany);
-#endif
-	/* lock order is : hashchainlock, freelistlock, mplistlock */
- tryagain:
-	xfs_qm_freelist_lock(xfs_Gqm);
+	/* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
+startagain:
+	mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
 
-	for (dqp = xfs_Gqm->qm_dqfreelist.qh_next;
-	     ((dqp != (xfs_dquot_t *) &xfs_Gqm->qm_dqfreelist) &&
-	      nreclaimed < howmany); ) {
+	list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) {
+		struct xfs_mount *mp = dqp->q_mount;
 		xfs_dqlock(dqp);
 
 		/*
 		 * We are racing with dqlookup here. Naturally we don't
-		 * want to reclaim a dquot that lookup wants.
+		 * want to reclaim a dquot that lookup wants. We release the
+		 * freelist lock and start over, so that lookup will grab
+		 * both the dquot and the freelistlock.
 		 */
 		if (dqp->dq_flags & XFS_DQ_WANT) {
+			ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
+
+			trace_xfs_dqreclaim_want(dqp);
+
 			xfs_dqunlock(dqp);
-			xfs_qm_freelist_unlock(xfs_Gqm);
+			mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
 			if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
-				return nreclaimed;
+				return NULL;
 			XQM_STATS_INC(xqmstats.xs_qm_dqwants);
-			goto tryagain;
+			goto startagain;
 		}
 
 		/*
@@ -1981,23 +1988,27 @@ xfs_qm_shake_freelist(
 		 * life easier.
 		 */
 		if (dqp->dq_flags & XFS_DQ_INACTIVE) {
-			ASSERT(dqp->q_mount == NULL);
+			ASSERT(mp == NULL);
 			ASSERT(! XFS_DQ_IS_DIRTY(dqp));
-			ASSERT(dqp->HL_PREVP == NULL);
-			ASSERT(dqp->MPL_PREVP == NULL);
+			ASSERT(list_empty(&dqp->q_hashlist));
+			ASSERT(list_empty(&dqp->q_mplist));
+			list_del_init(&dqp->q_freelist);
+			xfs_Gqm->qm_dqfrlist_cnt--;
+			xfs_dqunlock(dqp);
+			dqpout = dqp;
 			XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
-			nextdqp = dqp->dq_flnext;
-			goto off_freelist;
+			break;
 		}
 
-		ASSERT(dqp->MPL_PREVP);
+		ASSERT(dqp->q_hash);
+		ASSERT(!list_empty(&dqp->q_mplist));
+
 		/*
 		 * Try to grab the flush lock. If this dquot is in the process of
 		 * getting flushed to disk, we don't want to reclaim it.
 		 */
 		if (!xfs_dqflock_nowait(dqp)) {
 			xfs_dqunlock(dqp);
-			dqp = dqp->dq_flnext;
 			continue;
 		}
 
@@ -2010,21 +2021,21 @@ xfs_qm_shake_freelist(
 		if (XFS_DQ_IS_DIRTY(dqp)) {
 			int	error;
 
-			trace_xfs_dqshake_dirty(dqp);
+			trace_xfs_dqreclaim_dirty(dqp);
 
 			/*
 			 * We flush it delayed write, so don't bother
-			 * releasing the mplock.
+			 * releasing the freelist lock.
 			 */
 			error = xfs_qm_dqflush(dqp, 0);
 			if (error) {
-				xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
-			"xfs_qm_dqflush_all: dquot %p flush failed", dqp);
+				xfs_fs_cmn_err(CE_WARN, mp,
+			"xfs_qm_dqreclaim: dquot %p flush failed", dqp);
 			}
 			xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
-			dqp = dqp->dq_flnext;
 			continue;
 		}
+
 		/*
 		 * We're trying to get the hashlock out of order. This races
 		 * with dqlookup; so, we giveup and goto the next dquot if
@@ -2033,56 +2044,74 @@ xfs_qm_shake_freelist(
 		 * waiting for the freelist lock.
 		 */
 		if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
-			xfs_dqfunlock(dqp);
-			xfs_dqunlock(dqp);
-			dqp = dqp->dq_flnext;
-			continue;
+			restarts++;
+			goto dqfunlock;
 		}
+
 		/*
 		 * This races with dquot allocation code as well as dqflush_all
 		 * and reclaim code. So, if we failed to grab the mplist lock,
 		 * giveup everything and start over.
 		 */
-		hash = dqp->q_hash;
-		ASSERT(hash);
-		if (! xfs_qm_mplist_nowait(dqp->q_mount)) {
-			/* XXX put a sentinel so that we can come back here */
+		if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) {
+			restarts++;
+			mutex_unlock(&dqp->q_hash->qh_lock);
 			xfs_dqfunlock(dqp);
 			xfs_dqunlock(dqp);
-			mutex_unlock(&hash->qh_lock);
-			xfs_qm_freelist_unlock(xfs_Gqm);
-			if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
-				return nreclaimed;
-			goto tryagain;
+			mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
+			if (restarts++ >= XFS_QM_RECLAIM_MAX_RESTARTS)
+				return NULL;
+			goto startagain;
 		}
 
-		trace_xfs_dqshake_unlink(dqp);
-
-#ifdef QUOTADEBUG
-		cmn_err(CE_DEBUG, "Shake 0x%p, ID 0x%x\n",
-			dqp, be32_to_cpu(dqp->q_core.d_id));
-#endif
 		ASSERT(dqp->q_nrefs == 0);
-		nextdqp = dqp->dq_flnext;
-		XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp);
-		XQM_HASHLIST_REMOVE(hash, dqp);
+		list_del_init(&dqp->q_mplist);
+		mp->m_quotainfo->qi_dquots--;
+		mp->m_quotainfo->qi_dqreclaims++;
+		list_del_init(&dqp->q_hashlist);
+		dqp->q_hash->qh_version++;
+		list_del_init(&dqp->q_freelist);
+		xfs_Gqm->qm_dqfrlist_cnt--;
+		dqpout = dqp;
+		mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
+		mutex_unlock(&dqp->q_hash->qh_lock);
+dqfunlock:
 		xfs_dqfunlock(dqp);
-		xfs_qm_mplist_unlock(dqp->q_mount);
-		mutex_unlock(&hash->qh_lock);
-
- off_freelist:
-		XQM_FREELIST_REMOVE(dqp);
 		xfs_dqunlock(dqp);
-		nreclaimed++;
-		XQM_STATS_INC(xqmstats.xs_qm_dqshake_reclaims);
+		if (dqpout)
+			break;
+		if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
+			return NULL;
+	}
+	mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
+	return dqpout;
+}
+
+/*
+ * Traverse the freelist of dquots and attempt to reclaim a maximum of
+ * 'howmany' dquots. This operation races with dqlookup(), and attempts to
+ * favor the lookup function ...
+ */
+STATIC int
+xfs_qm_shake_freelist(
+	int howmany)
+{
+	int		nreclaimed = 0;
+	xfs_dquot_t	*dqp;
+
+	if (howmany <= 0)
+		return 0;
+
+	while (nreclaimed < howmany) {
+		dqp = xfs_qm_dqreclaim_one();
+		if (!dqp)
+			return nreclaimed;
 		xfs_qm_dqdestroy(dqp);
-		dqp = nextdqp;
+		nreclaimed++;
 	}
-	xfs_qm_freelist_unlock(xfs_Gqm);
 	return nreclaimed;
 }
 
-
 /*
  * The kmem_shake interface is invoked when memory is running low.
  */
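The net effect of this hunk is a classic extract-helper refactor: all of the locking hairiness lives in xfs_qm_dqreclaim_one(), and the shaker becomes a trivial counting loop over it. The shape of that split in isolation, with illustrative names:

    /* Sketch of the split: a helper that reclaims exactly one object, and a
     * thin loop that gives the shaker its batch semantics. */
    struct dquot_sketch;
    static struct dquot_sketch *reclaim_one(void);      /* NULL when stuck */
    static void destroy(struct dquot_sketch *dqp);

    static int shake_freelist_sketch(int howmany)
    {
            int nreclaimed = 0;

            while (nreclaimed < howmany) {
                    struct dquot_sketch *dqp = reclaim_one();

                    if (!dqp)
                            break;  /* freelist empty or too contended */
                    destroy(dqp);
                    nreclaimed++;
            }
            return nreclaimed;
    }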
@@ -2097,7 +2126,7 @@ xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask)
 	if (!xfs_Gqm)
 		return 0;
 
-	nfree = xfs_Gqm->qm_dqfreelist.qh_nelems; /* free dquots */
+	nfree = xfs_Gqm->qm_dqfrlist_cnt; /* free dquots */
 	/* incore dquots in all f/s's */
 	ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree;
 
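For context on the xfs_qm_shake() prototype: shrinker callbacks of this kernel generation took (nr_to_scan, gfp_mask) and returned an estimate of the remaining freeable pool, which the VM uses to size its next scan. A hedged sketch of that contract (names and the stub are illustrative):

    /* Illustrative only; 'gfp_mask' is the allocation context of the caller. */
    static int freeable_count_sketch(void) { return 0; }   /* stub */

    static int shaker_sketch(int nr_to_scan, unsigned int gfp_mask)
    {
            if (nr_to_scan) {
                    /* free up to nr_to_scan cached objects, LRU first,
                     * honouring gfp_mask (e.g. no FS recursion without
                     * __GFP_FS) */
            }
            return freeable_count_sketch();  /* what's still reclaimable */
    }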
@@ -2113,131 +2142,6 @@ xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask)
 }
 
 
-/*
- * Just pop the least recently used dquot off the freelist and
- * recycle it. The returned dquot is locked.
- */
-STATIC xfs_dquot_t *
-xfs_qm_dqreclaim_one(void)
-{
-	xfs_dquot_t	*dqpout;
-	xfs_dquot_t	*dqp;
-	int		restarts;
-	int		nflushes;
-
-	restarts = 0;
-	dqpout = NULL;
-	nflushes = 0;
-
-	/* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
- startagain:
-	xfs_qm_freelist_lock(xfs_Gqm);
-
-	FOREACH_DQUOT_IN_FREELIST(dqp, &(xfs_Gqm->qm_dqfreelist)) {
-		xfs_dqlock(dqp);
-
-		/*
-		 * We are racing with dqlookup here. Naturally we don't
-		 * want to reclaim a dquot that lookup wants. We release the
-		 * freelist lock and start over, so that lookup will grab
-		 * both the dquot and the freelistlock.
-		 */
-		if (dqp->dq_flags & XFS_DQ_WANT) {
-			ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
-
-			trace_xfs_dqreclaim_want(dqp);
-
-			xfs_dqunlock(dqp);
-			xfs_qm_freelist_unlock(xfs_Gqm);
-			if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
-				return NULL;
-			XQM_STATS_INC(xqmstats.xs_qm_dqwants);
-			goto startagain;
-		}
-
-		/*
-		 * If the dquot is inactive, we are assured that it is
-		 * not on the mplist or the hashlist, and that makes our
-		 * life easier.
-		 */
-		if (dqp->dq_flags & XFS_DQ_INACTIVE) {
-			ASSERT(dqp->q_mount == NULL);
-			ASSERT(! XFS_DQ_IS_DIRTY(dqp));
-			ASSERT(dqp->HL_PREVP == NULL);
-			ASSERT(dqp->MPL_PREVP == NULL);
-			XQM_FREELIST_REMOVE(dqp);
-			xfs_dqunlock(dqp);
-			dqpout = dqp;
-			XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
-			break;
-		}
-
-		ASSERT(dqp->q_hash);
-		ASSERT(dqp->MPL_PREVP);
-
-		/*
-		 * Try to grab the flush lock. If this dquot is in the process of
-		 * getting flushed to disk, we don't want to reclaim it.
-		 */
-		if (!xfs_dqflock_nowait(dqp)) {
-			xfs_dqunlock(dqp);
-			continue;
-		}
-
-		/*
-		 * We have the flush lock so we know that this is not in the
-		 * process of being flushed. So, if this is dirty, flush it
-		 * DELWRI so that we don't get a freelist infested with
-		 * dirty dquots.
-		 */
-		if (XFS_DQ_IS_DIRTY(dqp)) {
-			int	error;
-
-			trace_xfs_dqreclaim_dirty(dqp);
-
-			/*
-			 * We flush it delayed write, so don't bother
-			 * releasing the freelist lock.
-			 */
-			error = xfs_qm_dqflush(dqp, 0);
-			if (error) {
-				xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
-			"xfs_qm_dqreclaim: dquot %p flush failed", dqp);
-			}
-			xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
-			continue;
-		}
-
-		if (! xfs_qm_mplist_nowait(dqp->q_mount)) {
-			xfs_dqfunlock(dqp);
-			xfs_dqunlock(dqp);
-			continue;
-		}
-
-		if (!mutex_trylock(&dqp->q_hash->qh_lock))
-			goto mplistunlock;
-
-		trace_xfs_dqreclaim_unlink(dqp);
-
-		ASSERT(dqp->q_nrefs == 0);
-		XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp);
-		XQM_HASHLIST_REMOVE(dqp->q_hash, dqp);
-		XQM_FREELIST_REMOVE(dqp);
-		dqpout = dqp;
-		mutex_unlock(&dqp->q_hash->qh_lock);
- mplistunlock:
-		xfs_qm_mplist_unlock(dqp->q_mount);
-		xfs_dqfunlock(dqp);
-		xfs_dqunlock(dqp);
-		if (dqpout)
-			break;
-	}
-
-	xfs_qm_freelist_unlock(xfs_Gqm);
-	return dqpout;
-}
-
-
 /*------------------------------------------------------------------*/
 
 /*
@@ -2662,66 +2566,3 @@ xfs_qm_vop_create_dqattach(
2662 } 2566 }
2663} 2567}
2664 2568
2665/* ------------- list stuff -----------------*/
2666STATIC void
2667xfs_qm_freelist_init(xfs_frlist_t *ql)
2668{
2669 ql->qh_next = ql->qh_prev = (xfs_dquot_t *) ql;
2670 mutex_init(&ql->qh_lock);
2671 ql->qh_version = 0;
2672 ql->qh_nelems = 0;
2673}
2674
2675STATIC void
2676xfs_qm_freelist_destroy(xfs_frlist_t *ql)
2677{
2678 xfs_dquot_t *dqp, *nextdqp;
2679
2680 mutex_lock(&ql->qh_lock);
2681 for (dqp = ql->qh_next;
2682 dqp != (xfs_dquot_t *)ql; ) {
2683 xfs_dqlock(dqp);
2684 nextdqp = dqp->dq_flnext;
2685#ifdef QUOTADEBUG
2686 cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp);
2687#endif
2688 XQM_FREELIST_REMOVE(dqp);
2689 xfs_dqunlock(dqp);
2690 xfs_qm_dqdestroy(dqp);
2691 dqp = nextdqp;
2692 }
2693 mutex_unlock(&ql->qh_lock);
2694 mutex_destroy(&ql->qh_lock);
2695
2696 ASSERT(ql->qh_nelems == 0);
2697}
2698
2699STATIC void
2700xfs_qm_freelist_insert(xfs_frlist_t *ql, xfs_dquot_t *dq)
2701{
2702 dq->dq_flnext = ql->qh_next;
2703 dq->dq_flprev = (xfs_dquot_t *)ql;
2704 ql->qh_next = dq;
2705 dq->dq_flnext->dq_flprev = dq;
2706 xfs_Gqm->qm_dqfreelist.qh_nelems++;
2707 xfs_Gqm->qm_dqfreelist.qh_version++;
2708}
2709
2710void
2711xfs_qm_freelist_unlink(xfs_dquot_t *dq)
2712{
2713 xfs_dquot_t *next = dq->dq_flnext;
2714 xfs_dquot_t *prev = dq->dq_flprev;
2715
2716 next->dq_flprev = prev;
2717 prev->dq_flnext = next;
2718 dq->dq_flnext = dq->dq_flprev = dq;
2719 xfs_Gqm->qm_dqfreelist.qh_nelems--;
2720 xfs_Gqm->qm_dqfreelist.qh_version++;
2721}
2722
2723void
2724xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq)
2725{
2726 xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq);
2727}
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index 495564b8af38..c9446f1c726d 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -72,17 +72,6 @@ extern kmem_zone_t *qm_dqtrxzone;
72#define XFS_QM_MAX_DQCLUSTER_LOGSZ 3 72#define XFS_QM_MAX_DQCLUSTER_LOGSZ 3
73 73
74typedef xfs_dqhash_t xfs_dqlist_t; 74typedef xfs_dqhash_t xfs_dqlist_t;
75/*
76 * The freelist head. The first two fields match the first two in the
77 * xfs_dquot_t structure (in xfs_dqmarker_t)
78 */
79typedef struct xfs_frlist {
80 struct xfs_dquot *qh_next;
81 struct xfs_dquot *qh_prev;
82 struct mutex qh_lock;
83 uint qh_version;
84 uint qh_nelems;
85} xfs_frlist_t;
86 75
87/* 76/*
88 * Quota Manager (global) structure. Lives only in core. 77 * Quota Manager (global) structure. Lives only in core.
@@ -91,7 +80,9 @@ typedef struct xfs_qm {
91 xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */ 80 xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */
92 xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */ 81 xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */
93 uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */ 82 uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */
94 xfs_frlist_t qm_dqfreelist; /* freelist of dquots */ 83 struct list_head qm_dqfrlist; /* freelist of dquots */
84 struct mutex qm_dqfrlist_lock;
85 int qm_dqfrlist_cnt;
95 atomic_t qm_totaldquots; /* total incore dquots */ 86 atomic_t qm_totaldquots; /* total incore dquots */
96 uint qm_nrefs; /* file systems with quota on */ 87 uint qm_nrefs; /* file systems with quota on */
97 int qm_dqfree_ratio;/* ratio of free to inuse dquots */ 88 int qm_dqfree_ratio;/* ratio of free to inuse dquots */
@@ -106,7 +97,9 @@ typedef struct xfs_qm {
106typedef struct xfs_quotainfo { 97typedef struct xfs_quotainfo {
107 xfs_inode_t *qi_uquotaip; /* user quota inode */ 98 xfs_inode_t *qi_uquotaip; /* user quota inode */
108 xfs_inode_t *qi_gquotaip; /* group quota inode */ 99 xfs_inode_t *qi_gquotaip; /* group quota inode */
109 xfs_dqlist_t qi_dqlist; /* all dquots in filesys */ 100 struct list_head qi_dqlist; /* all dquots in filesys */
101 struct mutex qi_dqlist_lock;
102 int qi_dquots;
110 int qi_dqreclaims; /* a change here indicates 103 int qi_dqreclaims; /* a change here indicates
111 a removal in the dqlist */ 104 a removal in the dqlist */
112 time_t qi_btimelimit; /* limit for blks timer */ 105 time_t qi_btimelimit; /* limit for blks timer */
@@ -175,10 +168,6 @@ extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
175extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint); 168extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint);
176extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint); 169extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint);
177 170
178/* list stuff */
179extern void xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *);
180extern void xfs_qm_freelist_unlink(xfs_dquot_t *);
181
182#ifdef DEBUG 171#ifdef DEBUG
183extern int xfs_qm_internalqcheck(xfs_mount_t *); 172extern int xfs_qm_internalqcheck(xfs_mount_t *);
184#else 173#else
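
The new qm_dqfrlist/qm_dqfrlist_lock/qm_dqfrlist_cnt trio replaces the removed xfs_frlist_t head. A sketch of how such fields would be brought up at quota-manager init, assuming the patched struct xfs_qm definition above; the function name here is hypothetical, and the real initialization lives elsewhere in this patch (not shown in this hunk):

	static void demo_qm_init_freelist(struct xfs_qm *xqm)
	{
		INIT_LIST_HEAD(&xqm->qm_dqfrlist);
		mutex_init(&xqm->qm_dqfrlist_lock);
		xqm->qm_dqfrlist_cnt = 0;
	}
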
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
index 83e7ea3e25fa..3d1fc79532e2 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -55,7 +55,7 @@ static int xqm_proc_show(struct seq_file *m, void *v)
55 ndquot, 55 ndquot,
56 xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0, 56 xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
57 xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0, 57 xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
58 xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0); 58 xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0);
59 return 0; 59 return 0;
60} 60}
61 61
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 50bee07d6b0e..92b002f1805f 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -79,6 +79,7 @@ xfs_qm_scall_quotaoff(
79 xfs_mount_t *mp, 79 xfs_mount_t *mp,
80 uint flags) 80 uint flags)
81{ 81{
82 struct xfs_quotainfo *q = mp->m_quotainfo;
82 uint dqtype; 83 uint dqtype;
83 int error; 84 int error;
84 uint inactivate_flags; 85 uint inactivate_flags;
@@ -102,11 +103,8 @@ xfs_qm_scall_quotaoff(
102 * critical thing. 103 * critical thing.
103 * If quotaoff, then we must be dealing with the root filesystem. 104 * If quotaoff, then we must be dealing with the root filesystem.
104 */ 105 */
105 ASSERT(mp->m_quotainfo); 106 ASSERT(q);
106 if (mp->m_quotainfo) 107 mutex_lock(&q->qi_quotaofflock);
107 mutex_lock(&(XFS_QI_QOFFLOCK(mp)));
108
109 ASSERT(mp->m_quotainfo);
110 108
111 /* 109 /*
112 * If we're just turning off quota enforcement, change mp and go. 110 * If we're just turning off quota enforcement, change mp and go.
@@ -117,7 +115,7 @@ xfs_qm_scall_quotaoff(
117 spin_lock(&mp->m_sb_lock); 115 spin_lock(&mp->m_sb_lock);
118 mp->m_sb.sb_qflags = mp->m_qflags; 116 mp->m_sb.sb_qflags = mp->m_qflags;
119 spin_unlock(&mp->m_sb_lock); 117 spin_unlock(&mp->m_sb_lock);
120 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 118 mutex_unlock(&q->qi_quotaofflock);
121 119
122 /* XXX what to do if error ? Revert back to old vals incore ? */ 120 /* XXX what to do if error ? Revert back to old vals incore ? */
123 error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS); 121 error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS);
@@ -150,10 +148,8 @@ xfs_qm_scall_quotaoff(
150 * Nothing to do? Don't complain. This happens when we're just 148 * Nothing to do? Don't complain. This happens when we're just
151 * turning off quota enforcement. 149 * turning off quota enforcement.
152 */ 150 */
153 if ((mp->m_qflags & flags) == 0) { 151 if ((mp->m_qflags & flags) == 0)
154 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 152 goto out_unlock;
155 return (0);
156 }
157 153
158 /* 154 /*
159 * Write the LI_QUOTAOFF log record, and do SB changes atomically, 155 * Write the LI_QUOTAOFF log record, and do SB changes atomically,
@@ -162,7 +158,7 @@ xfs_qm_scall_quotaoff(
162 */ 158 */
163 error = xfs_qm_log_quotaoff(mp, &qoffstart, flags); 159 error = xfs_qm_log_quotaoff(mp, &qoffstart, flags);
164 if (error) 160 if (error)
165 goto out_error; 161 goto out_unlock;
166 162
167 /* 163 /*
168 * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct 164 * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct
@@ -204,7 +200,7 @@ xfs_qm_scall_quotaoff(
204 * So, if we couldn't purge all the dquots from the filesystem, 200 * So, if we couldn't purge all the dquots from the filesystem,
205 * we can't get rid of the incore data structures. 201 * we can't get rid of the incore data structures.
206 */ 202 */
207 while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype|XFS_QMOPT_QUOTAOFF))) 203 while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype)))
208 delay(10 * nculprits); 204 delay(10 * nculprits);
209 205
210 /* 206 /*
@@ -222,7 +218,7 @@ xfs_qm_scall_quotaoff(
222 if (error) { 218 if (error) {
223 /* We're screwed now. Shutdown is the only option. */ 219 /* We're screwed now. Shutdown is the only option. */
224 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 220 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
225 goto out_error; 221 goto out_unlock;
226 } 222 }
227 223
228 /* 224 /*
@@ -230,27 +226,26 @@ xfs_qm_scall_quotaoff(
230 */ 226 */
231 if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) || 227 if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) ||
232 ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) { 228 ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) {
233 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 229 mutex_unlock(&q->qi_quotaofflock);
234 xfs_qm_destroy_quotainfo(mp); 230 xfs_qm_destroy_quotainfo(mp);
235 return (0); 231 return (0);
236 } 232 }
237 233
238 /* 234 /*
239 * Release our quotainode references, and vn_purge them, 235 * Release our quotainode references if we don't need them anymore.
240 * if we don't need them anymore.
241 */ 236 */
242 if ((dqtype & XFS_QMOPT_UQUOTA) && XFS_QI_UQIP(mp)) { 237 if ((dqtype & XFS_QMOPT_UQUOTA) && q->qi_uquotaip) {
243 IRELE(XFS_QI_UQIP(mp)); 238 IRELE(q->qi_uquotaip);
244 XFS_QI_UQIP(mp) = NULL; 239 q->qi_uquotaip = NULL;
245 } 240 }
246 if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && XFS_QI_GQIP(mp)) { 241 if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && q->qi_gquotaip) {
247 IRELE(XFS_QI_GQIP(mp)); 242 IRELE(q->qi_gquotaip);
248 XFS_QI_GQIP(mp) = NULL; 243 q->qi_gquotaip = NULL;
249 } 244 }
250out_error:
251 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
252 245
253 return (error); 246out_unlock:
247 mutex_unlock(&q->qi_quotaofflock);
248 return error;
254} 249}
255 250
256int 251int
@@ -379,9 +374,9 @@ xfs_qm_scall_quotaon(
379 /* 374 /*
380 * Switch on quota enforcement in core. 375 * Switch on quota enforcement in core.
381 */ 376 */
382 mutex_lock(&(XFS_QI_QOFFLOCK(mp))); 377 mutex_lock(&mp->m_quotainfo->qi_quotaofflock);
383 mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD); 378 mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD);
384 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 379 mutex_unlock(&mp->m_quotainfo->qi_quotaofflock);
385 380
386 return (0); 381 return (0);
387} 382}
@@ -392,11 +387,12 @@ xfs_qm_scall_quotaon(
392 */ 387 */
393int 388int
394xfs_qm_scall_getqstat( 389xfs_qm_scall_getqstat(
395 xfs_mount_t *mp, 390 struct xfs_mount *mp,
396 fs_quota_stat_t *out) 391 struct fs_quota_stat *out)
397{ 392{
398 xfs_inode_t *uip, *gip; 393 struct xfs_quotainfo *q = mp->m_quotainfo;
399 boolean_t tempuqip, tempgqip; 394 struct xfs_inode *uip, *gip;
395 boolean_t tempuqip, tempgqip;
400 396
401 uip = gip = NULL; 397 uip = gip = NULL;
402 tempuqip = tempgqip = B_FALSE; 398 tempuqip = tempgqip = B_FALSE;
@@ -415,9 +411,9 @@ xfs_qm_scall_getqstat(
415 out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino; 411 out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
416 out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; 412 out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
417 413
418 if (mp->m_quotainfo) { 414 if (q) {
419 uip = mp->m_quotainfo->qi_uquotaip; 415 uip = q->qi_uquotaip;
420 gip = mp->m_quotainfo->qi_gquotaip; 416 gip = q->qi_gquotaip;
421 } 417 }
422 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { 418 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
423 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 419 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
@@ -441,17 +437,20 @@ xfs_qm_scall_getqstat(
441 if (tempgqip) 437 if (tempgqip)
442 IRELE(gip); 438 IRELE(gip);
443 } 439 }
444 if (mp->m_quotainfo) { 440 if (q) {
445 out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp); 441 out->qs_incoredqs = q->qi_dquots;
446 out->qs_btimelimit = XFS_QI_BTIMELIMIT(mp); 442 out->qs_btimelimit = q->qi_btimelimit;
447 out->qs_itimelimit = XFS_QI_ITIMELIMIT(mp); 443 out->qs_itimelimit = q->qi_itimelimit;
448 out->qs_rtbtimelimit = XFS_QI_RTBTIMELIMIT(mp); 444 out->qs_rtbtimelimit = q->qi_rtbtimelimit;
449 out->qs_bwarnlimit = XFS_QI_BWARNLIMIT(mp); 445 out->qs_bwarnlimit = q->qi_bwarnlimit;
450 out->qs_iwarnlimit = XFS_QI_IWARNLIMIT(mp); 446 out->qs_iwarnlimit = q->qi_iwarnlimit;
451 } 447 }
452 return (0); 448 return 0;
453} 449}
454 450
451#define XFS_DQ_MASK \
452 (FS_DQ_LIMIT_MASK | FS_DQ_TIMER_MASK | FS_DQ_WARNS_MASK)
453
455/* 454/*
456 * Adjust quota limits, and start/stop timers accordingly. 455 * Adjust quota limits, and start/stop timers accordingly.
457 */ 456 */
@@ -462,15 +461,17 @@ xfs_qm_scall_setqlim(
462 uint type, 461 uint type,
463 fs_disk_quota_t *newlim) 462 fs_disk_quota_t *newlim)
464{ 463{
464 struct xfs_quotainfo *q = mp->m_quotainfo;
465 xfs_disk_dquot_t *ddq; 465 xfs_disk_dquot_t *ddq;
466 xfs_dquot_t *dqp; 466 xfs_dquot_t *dqp;
467 xfs_trans_t *tp; 467 xfs_trans_t *tp;
468 int error; 468 int error;
469 xfs_qcnt_t hard, soft; 469 xfs_qcnt_t hard, soft;
470 470
471 if ((newlim->d_fieldmask & 471 if (newlim->d_fieldmask & ~XFS_DQ_MASK)
472 (FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK|FS_DQ_WARNS_MASK)) == 0) 472 return EINVAL;
473 return (0); 473 if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0)
474 return 0;
474 475
475 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); 476 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
476 if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128, 477 if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128,
@@ -485,7 +486,7 @@ xfs_qm_scall_setqlim(
485 * a quotaoff from happening). (XXXThis doesn't currently happen 486 * a quotaoff from happening). (XXXThis doesn't currently happen
486 * because we take the vfslock before calling xfs_qm_sysent). 487 * because we take the vfslock before calling xfs_qm_sysent).
487 */ 488 */
488 mutex_lock(&(XFS_QI_QOFFLOCK(mp))); 489 mutex_lock(&q->qi_quotaofflock);
489 490
490 /* 491 /*
491 * Get the dquot (locked), and join it to the transaction. 492 * Get the dquot (locked), and join it to the transaction.
@@ -493,9 +494,8 @@ xfs_qm_scall_setqlim(
493 */ 494 */
494 if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) { 495 if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) {
495 xfs_trans_cancel(tp, XFS_TRANS_ABORT); 496 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
496 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
497 ASSERT(error != ENOENT); 497 ASSERT(error != ENOENT);
498 return (error); 498 goto out_unlock;
499 } 499 }
500 xfs_trans_dqjoin(tp, dqp); 500 xfs_trans_dqjoin(tp, dqp);
501 ddq = &dqp->q_core; 501 ddq = &dqp->q_core;
@@ -513,8 +513,8 @@ xfs_qm_scall_setqlim(
513 ddq->d_blk_hardlimit = cpu_to_be64(hard); 513 ddq->d_blk_hardlimit = cpu_to_be64(hard);
514 ddq->d_blk_softlimit = cpu_to_be64(soft); 514 ddq->d_blk_softlimit = cpu_to_be64(soft);
515 if (id == 0) { 515 if (id == 0) {
516 mp->m_quotainfo->qi_bhardlimit = hard; 516 q->qi_bhardlimit = hard;
517 mp->m_quotainfo->qi_bsoftlimit = soft; 517 q->qi_bsoftlimit = soft;
518 } 518 }
519 } else { 519 } else {
520 qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft); 520 qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft);
@@ -529,8 +529,8 @@ xfs_qm_scall_setqlim(
529 ddq->d_rtb_hardlimit = cpu_to_be64(hard); 529 ddq->d_rtb_hardlimit = cpu_to_be64(hard);
530 ddq->d_rtb_softlimit = cpu_to_be64(soft); 530 ddq->d_rtb_softlimit = cpu_to_be64(soft);
531 if (id == 0) { 531 if (id == 0) {
532 mp->m_quotainfo->qi_rtbhardlimit = hard; 532 q->qi_rtbhardlimit = hard;
533 mp->m_quotainfo->qi_rtbsoftlimit = soft; 533 q->qi_rtbsoftlimit = soft;
534 } 534 }
535 } else { 535 } else {
536 qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft); 536 qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft);
@@ -546,8 +546,8 @@ xfs_qm_scall_setqlim(
546 ddq->d_ino_hardlimit = cpu_to_be64(hard); 546 ddq->d_ino_hardlimit = cpu_to_be64(hard);
547 ddq->d_ino_softlimit = cpu_to_be64(soft); 547 ddq->d_ino_softlimit = cpu_to_be64(soft);
548 if (id == 0) { 548 if (id == 0) {
549 mp->m_quotainfo->qi_ihardlimit = hard; 549 q->qi_ihardlimit = hard;
550 mp->m_quotainfo->qi_isoftlimit = soft; 550 q->qi_isoftlimit = soft;
551 } 551 }
552 } else { 552 } else {
553 qdprintk("ihard %Ld < isoft %Ld\n", hard, soft); 553 qdprintk("ihard %Ld < isoft %Ld\n", hard, soft);
@@ -572,23 +572,23 @@ xfs_qm_scall_setqlim(
572 * for warnings. 572 * for warnings.
573 */ 573 */
574 if (newlim->d_fieldmask & FS_DQ_BTIMER) { 574 if (newlim->d_fieldmask & FS_DQ_BTIMER) {
575 mp->m_quotainfo->qi_btimelimit = newlim->d_btimer; 575 q->qi_btimelimit = newlim->d_btimer;
576 ddq->d_btimer = cpu_to_be32(newlim->d_btimer); 576 ddq->d_btimer = cpu_to_be32(newlim->d_btimer);
577 } 577 }
578 if (newlim->d_fieldmask & FS_DQ_ITIMER) { 578 if (newlim->d_fieldmask & FS_DQ_ITIMER) {
579 mp->m_quotainfo->qi_itimelimit = newlim->d_itimer; 579 q->qi_itimelimit = newlim->d_itimer;
580 ddq->d_itimer = cpu_to_be32(newlim->d_itimer); 580 ddq->d_itimer = cpu_to_be32(newlim->d_itimer);
581 } 581 }
582 if (newlim->d_fieldmask & FS_DQ_RTBTIMER) { 582 if (newlim->d_fieldmask & FS_DQ_RTBTIMER) {
583 mp->m_quotainfo->qi_rtbtimelimit = newlim->d_rtbtimer; 583 q->qi_rtbtimelimit = newlim->d_rtbtimer;
584 ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer); 584 ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer);
585 } 585 }
586 if (newlim->d_fieldmask & FS_DQ_BWARNS) 586 if (newlim->d_fieldmask & FS_DQ_BWARNS)
587 mp->m_quotainfo->qi_bwarnlimit = newlim->d_bwarns; 587 q->qi_bwarnlimit = newlim->d_bwarns;
588 if (newlim->d_fieldmask & FS_DQ_IWARNS) 588 if (newlim->d_fieldmask & FS_DQ_IWARNS)
589 mp->m_quotainfo->qi_iwarnlimit = newlim->d_iwarns; 589 q->qi_iwarnlimit = newlim->d_iwarns;
590 if (newlim->d_fieldmask & FS_DQ_RTBWARNS) 590 if (newlim->d_fieldmask & FS_DQ_RTBWARNS)
591 mp->m_quotainfo->qi_rtbwarnlimit = newlim->d_rtbwarns; 591 q->qi_rtbwarnlimit = newlim->d_rtbwarns;
592 } else { 592 } else {
593 /* 593 /*
594 * If the user is now over quota, start the timelimit. 594 * If the user is now over quota, start the timelimit.
@@ -605,8 +605,9 @@ xfs_qm_scall_setqlim(
605 error = xfs_trans_commit(tp, 0); 605 error = xfs_trans_commit(tp, 0);
606 xfs_qm_dqprint(dqp); 606 xfs_qm_dqprint(dqp);
607 xfs_qm_dqrele(dqp); 607 xfs_qm_dqrele(dqp);
608 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
609 608
609 out_unlock:
610 mutex_unlock(&q->qi_quotaofflock);
610 return error; 611 return error;
611} 612}
612 613
@@ -853,7 +854,8 @@ xfs_dqrele_inode(
853 int error; 854 int error;
854 855
855 /* skip quota inodes */ 856 /* skip quota inodes */
856 if (ip == XFS_QI_UQIP(ip->i_mount) || ip == XFS_QI_GQIP(ip->i_mount)) { 857 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
858 ip == ip->i_mount->m_quotainfo->qi_gquotaip) {
857 ASSERT(ip->i_udquot == NULL); 859 ASSERT(ip->i_udquot == NULL);
858 ASSERT(ip->i_gdquot == NULL); 860 ASSERT(ip->i_gdquot == NULL);
859 read_unlock(&pag->pag_ici_lock); 861 read_unlock(&pag->pag_ici_lock);
@@ -931,7 +933,8 @@ struct mutex qcheck_lock;
931} 933}
932 934
933typedef struct dqtest { 935typedef struct dqtest {
934 xfs_dqmarker_t q_lists; 936 uint dq_flags; /* various flags (XFS_DQ_*) */
937 struct list_head q_hashlist;
935 xfs_dqhash_t *q_hash; /* the hashchain header */ 938 xfs_dqhash_t *q_hash; /* the hashchain header */
936 xfs_mount_t *q_mount; /* filesystem this relates to */ 939 xfs_mount_t *q_mount; /* filesystem this relates to */
937 xfs_dqid_t d_id; /* user id or group id */ 940 xfs_dqid_t d_id; /* user id or group id */
@@ -942,14 +945,9 @@ typedef struct dqtest {
942STATIC void 945STATIC void
943xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp) 946xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp)
944{ 947{
945 xfs_dquot_t *d; 948 list_add(&dqp->q_hashlist, &h->qh_list);
946 if (((d) = (h)->qh_next)) 949 h->qh_version++;
947 (d)->HL_PREVP = &((dqp)->HL_NEXT); 950 h->qh_nelems++;
948 (dqp)->HL_NEXT = d;
949 (dqp)->HL_PREVP = &((h)->qh_next);
950 (h)->qh_next = (xfs_dquot_t *)dqp;
951 (h)->qh_version++;
952 (h)->qh_nelems++;
953} 951}
954STATIC void 952STATIC void
955xfs_qm_dqtest_print( 953xfs_qm_dqtest_print(
@@ -1061,9 +1059,7 @@ xfs_qm_internalqcheck_dqget(
1061 xfs_dqhash_t *h; 1059 xfs_dqhash_t *h;
1062 1060
1063 h = DQTEST_HASH(mp, id, type); 1061 h = DQTEST_HASH(mp, id, type);
1064 for (d = (xfs_dqtest_t *) h->qh_next; d != NULL; 1062 list_for_each_entry(d, &h->qh_list, q_hashlist) {
1065 d = (xfs_dqtest_t *) d->HL_NEXT) {
1066 /* DQTEST_LIST_PRINT(h, HL_NEXT, "@@@@@ dqtestlist @@@@@"); */
1067 if (d->d_id == id && mp == d->q_mount) { 1063 if (d->d_id == id && mp == d->q_mount) {
1068 *O_dq = d; 1064 *O_dq = d;
1069 return (0); 1065 return (0);
@@ -1074,6 +1070,7 @@ xfs_qm_internalqcheck_dqget(
1074 d->d_id = id; 1070 d->d_id = id;
1075 d->q_mount = mp; 1071 d->q_mount = mp;
1076 d->q_hash = h; 1072 d->q_hash = h;
1073 INIT_LIST_HEAD(&d->q_hashlist);
1077 xfs_qm_hashinsert(h, d); 1074 xfs_qm_hashinsert(h, d);
1078 *O_dq = d; 1075 *O_dq = d;
1079 return (0); 1076 return (0);
@@ -1180,8 +1177,6 @@ xfs_qm_internalqcheck(
1180 xfs_ino_t lastino; 1177 xfs_ino_t lastino;
1181 int done, count; 1178 int done, count;
1182 int i; 1179 int i;
1183 xfs_dqtest_t *d, *e;
1184 xfs_dqhash_t *h1;
1185 int error; 1180 int error;
1186 1181
1187 lastino = 0; 1182 lastino = 0;
@@ -1221,19 +1216,18 @@ xfs_qm_internalqcheck(
1221 } 1216 }
1222 cmn_err(CE_DEBUG, "Checking results against system dquots"); 1217 cmn_err(CE_DEBUG, "Checking results against system dquots");
1223 for (i = 0; i < qmtest_hashmask; i++) { 1218 for (i = 0; i < qmtest_hashmask; i++) {
1224 h1 = &qmtest_udqtab[i]; 1219 xfs_dqtest_t *d, *n;
1225 for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { 1220 xfs_dqhash_t *h;
1221
1222 h = &qmtest_udqtab[i];
1223 list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) {
1226 xfs_dqtest_cmp(d); 1224 xfs_dqtest_cmp(d);
1227 e = (xfs_dqtest_t *) d->HL_NEXT;
1228 kmem_free(d); 1225 kmem_free(d);
1229 d = e;
1230 } 1226 }
1231 h1 = &qmtest_gdqtab[i]; 1227 h = &qmtest_gdqtab[i];
1232 for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { 1228 list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) {
1233 xfs_dqtest_cmp(d); 1229 xfs_dqtest_cmp(d);
1234 e = (xfs_dqtest_t *) d->HL_NEXT;
1235 kmem_free(d); 1230 kmem_free(d);
1236 d = e;
1237 } 1231 }
1238 } 1232 }
1239 1233
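
The internalqcheck hunk above frees each dqtest entry while walking its hash bucket, which is the textbook case for list_for_each_entry_safe(): the second cursor caches the successor before the current entry is destroyed. A generic sketch of the idiom with illustrative types:

	struct demo_node {
		struct list_head link;
	};

	static void demo_free_all(struct list_head *head)
	{
		struct demo_node *d, *n;

		/* 'n' holds the next entry, so freeing 'd' mid-walk is safe */
		list_for_each_entry_safe(d, n, head, link)
			kmem_free(d);	/* XFS's kmem.h wrapper around kfree() */
	}

A plain list_for_each_entry() here would dereference freed memory when advancing the cursor.
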
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h
index 8286b2842b6b..94a3d927d716 100644
--- a/fs/xfs/quota/xfs_quota_priv.h
+++ b/fs/xfs/quota/xfs_quota_priv.h
@@ -24,43 +24,6 @@
24 */ 24 */
25#define XFS_DQITER_MAP_SIZE 10 25#define XFS_DQITER_MAP_SIZE 10
26 26
27/* Number of dquots that fit in to a dquot block */
28#define XFS_QM_DQPERBLK(mp) ((mp)->m_quotainfo->qi_dqperchunk)
29
30#define XFS_DQ_IS_ADDEDTO_TRX(t, d) ((d)->q_transp == (t))
31
32#define XFS_QI_MPLRECLAIMS(mp) ((mp)->m_quotainfo->qi_dqreclaims)
33#define XFS_QI_UQIP(mp) ((mp)->m_quotainfo->qi_uquotaip)
34#define XFS_QI_GQIP(mp) ((mp)->m_quotainfo->qi_gquotaip)
35#define XFS_QI_DQCHUNKLEN(mp) ((mp)->m_quotainfo->qi_dqchunklen)
36#define XFS_QI_BTIMELIMIT(mp) ((mp)->m_quotainfo->qi_btimelimit)
37#define XFS_QI_RTBTIMELIMIT(mp) ((mp)->m_quotainfo->qi_rtbtimelimit)
38#define XFS_QI_ITIMELIMIT(mp) ((mp)->m_quotainfo->qi_itimelimit)
39#define XFS_QI_BWARNLIMIT(mp) ((mp)->m_quotainfo->qi_bwarnlimit)
40#define XFS_QI_RTBWARNLIMIT(mp) ((mp)->m_quotainfo->qi_rtbwarnlimit)
41#define XFS_QI_IWARNLIMIT(mp) ((mp)->m_quotainfo->qi_iwarnlimit)
42#define XFS_QI_QOFFLOCK(mp) ((mp)->m_quotainfo->qi_quotaofflock)
43
44#define XFS_QI_MPL_LIST(mp) ((mp)->m_quotainfo->qi_dqlist)
45#define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next)
46#define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems)
47
48#define xfs_qm_mplist_lock(mp) \
49 mutex_lock(&(XFS_QI_MPL_LIST(mp).qh_lock))
50#define xfs_qm_mplist_nowait(mp) \
51 mutex_trylock(&(XFS_QI_MPL_LIST(mp).qh_lock))
52#define xfs_qm_mplist_unlock(mp) \
53 mutex_unlock(&(XFS_QI_MPL_LIST(mp).qh_lock))
54#define XFS_QM_IS_MPLIST_LOCKED(mp) \
55 mutex_is_locked(&(XFS_QI_MPL_LIST(mp).qh_lock))
56
57#define xfs_qm_freelist_lock(qm) \
58 mutex_lock(&((qm)->qm_dqfreelist.qh_lock))
59#define xfs_qm_freelist_lock_nowait(qm) \
60 mutex_trylock(&((qm)->qm_dqfreelist.qh_lock))
61#define xfs_qm_freelist_unlock(qm) \
62 mutex_unlock(&((qm)->qm_dqfreelist.qh_lock))
63
64/* 27/*
65 * Hash into a bucket in the dquot hash table, based on <mp, id>. 28 * Hash into a bucket in the dquot hash table, based on <mp, id>.
66 */ 29 */
@@ -72,9 +35,6 @@
72 XFS_DQ_HASHVAL(mp, id)) : \ 35 XFS_DQ_HASHVAL(mp, id)) : \
73 (xfs_Gqm->qm_grp_dqhtable + \ 36 (xfs_Gqm->qm_grp_dqhtable + \
74 XFS_DQ_HASHVAL(mp, id))) 37 XFS_DQ_HASHVAL(mp, id)))
75#define XFS_IS_DQTYPE_ON(mp, type) (type == XFS_DQ_USER ? \
76 XFS_IS_UQUOTA_ON(mp) : \
77 XFS_IS_OQUOTA_ON(mp))
78#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ 38#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
79 !dqp->q_core.d_blk_hardlimit && \ 39 !dqp->q_core.d_blk_hardlimit && \
80 !dqp->q_core.d_blk_softlimit && \ 40 !dqp->q_core.d_blk_softlimit && \
@@ -86,68 +46,6 @@
86 !dqp->q_core.d_rtbcount && \ 46 !dqp->q_core.d_rtbcount && \
87 !dqp->q_core.d_icount) 47 !dqp->q_core.d_icount)
88 48
89#define HL_PREVP dq_hashlist.ql_prevp
90#define HL_NEXT dq_hashlist.ql_next
91#define MPL_PREVP dq_mplist.ql_prevp
92#define MPL_NEXT dq_mplist.ql_next
93
94
95#define _LIST_REMOVE(h, dqp, PVP, NXT) \
96 { \
97 xfs_dquot_t *d; \
98 if (((d) = (dqp)->NXT)) \
99 (d)->PVP = (dqp)->PVP; \
100 *((dqp)->PVP) = d; \
101 (dqp)->NXT = NULL; \
102 (dqp)->PVP = NULL; \
103 (h)->qh_version++; \
104 (h)->qh_nelems--; \
105 }
106
107#define _LIST_INSERT(h, dqp, PVP, NXT) \
108 { \
109 xfs_dquot_t *d; \
110 if (((d) = (h)->qh_next)) \
111 (d)->PVP = &((dqp)->NXT); \
112 (dqp)->NXT = d; \
113 (dqp)->PVP = &((h)->qh_next); \
114 (h)->qh_next = dqp; \
115 (h)->qh_version++; \
116 (h)->qh_nelems++; \
117 }
118
119#define FOREACH_DQUOT_IN_MP(dqp, mp) \
120 for ((dqp) = XFS_QI_MPLNEXT(mp); (dqp) != NULL; (dqp) = (dqp)->MPL_NEXT)
121
122#define FOREACH_DQUOT_IN_FREELIST(dqp, qlist) \
123for ((dqp) = (qlist)->qh_next; (dqp) != (xfs_dquot_t *)(qlist); \
124 (dqp) = (dqp)->dq_flnext)
125
126#define XQM_HASHLIST_INSERT(h, dqp) \
127 _LIST_INSERT(h, dqp, HL_PREVP, HL_NEXT)
128
129#define XQM_FREELIST_INSERT(h, dqp) \
130 xfs_qm_freelist_append(h, dqp)
131
132#define XQM_MPLIST_INSERT(h, dqp) \
133 _LIST_INSERT(h, dqp, MPL_PREVP, MPL_NEXT)
134
135#define XQM_HASHLIST_REMOVE(h, dqp) \
136 _LIST_REMOVE(h, dqp, HL_PREVP, HL_NEXT)
137#define XQM_FREELIST_REMOVE(dqp) \
138 xfs_qm_freelist_unlink(dqp)
139#define XQM_MPLIST_REMOVE(h, dqp) \
140 { _LIST_REMOVE(h, dqp, MPL_PREVP, MPL_NEXT); \
141 XFS_QI_MPLRECLAIMS((dqp)->q_mount)++; }
142
143#define XFS_DQ_IS_LOGITEM_INITD(dqp) ((dqp)->q_logitem.qli_dquot == (dqp))
144
145#define XFS_QM_DQP_TO_DQACCT(tp, dqp) (XFS_QM_ISUDQ(dqp) ? \
146 (tp)->t_dqinfo->dqa_usrdquots : \
147 (tp)->t_dqinfo->dqa_grpdquots)
148#define XFS_IS_SUSER_DQUOT(dqp) \
149 (!((dqp)->q_core.d_id))
150
151#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \ 49#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \
152 (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \ 50 (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \
153 (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???"))) 51 (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???")))
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index c3ab75cb1d9a..061d827da33c 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -59,12 +59,11 @@ xfs_trans_dqjoin(
59 xfs_trans_t *tp, 59 xfs_trans_t *tp,
60 xfs_dquot_t *dqp) 60 xfs_dquot_t *dqp)
61{ 61{
62 xfs_dq_logitem_t *lp; 62 xfs_dq_logitem_t *lp = &dqp->q_logitem;
63 63
64 ASSERT(! XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); 64 ASSERT(dqp->q_transp != tp);
65 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 65 ASSERT(XFS_DQ_IS_LOCKED(dqp));
66 ASSERT(XFS_DQ_IS_LOGITEM_INITD(dqp)); 66 ASSERT(lp->qli_dquot == dqp);
67 lp = &dqp->q_logitem;
68 67
69 /* 68 /*
70 * Get a log_item_desc to point at the new item. 69 * Get a log_item_desc to point at the new item.
@@ -96,7 +95,7 @@ xfs_trans_log_dquot(
96{ 95{
97 xfs_log_item_desc_t *lidp; 96 xfs_log_item_desc_t *lidp;
98 97
99 ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); 98 ASSERT(dqp->q_transp == tp);
100 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 99 ASSERT(XFS_DQ_IS_LOCKED(dqp));
101 100
102 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem)); 101 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem));
@@ -198,16 +197,16 @@ xfs_trans_get_dqtrx(
198 int i; 197 int i;
199 xfs_dqtrx_t *qa; 198 xfs_dqtrx_t *qa;
200 199
201 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { 200 qa = XFS_QM_ISUDQ(dqp) ?
202 qa = XFS_QM_DQP_TO_DQACCT(tp, dqp); 201 tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots;
203 202
203 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
204 if (qa[i].qt_dquot == NULL || 204 if (qa[i].qt_dquot == NULL ||
205 qa[i].qt_dquot == dqp) { 205 qa[i].qt_dquot == dqp)
206 return (&qa[i]); 206 return &qa[i];
207 }
208 } 207 }
209 208
210 return (NULL); 209 return NULL;
211} 210}
212 211
213/* 212/*
@@ -381,7 +380,7 @@ xfs_trans_apply_dquot_deltas(
381 break; 380 break;
382 381
383 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 382 ASSERT(XFS_DQ_IS_LOCKED(dqp));
384 ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); 383 ASSERT(dqp->q_transp == tp);
385 384
386 /* 385 /*
387 * adjust the actual number of blocks used 386 * adjust the actual number of blocks used
@@ -639,7 +638,7 @@ xfs_trans_dqresv(
639 softlimit = q->qi_bsoftlimit; 638 softlimit = q->qi_bsoftlimit;
640 timer = be32_to_cpu(dqp->q_core.d_btimer); 639 timer = be32_to_cpu(dqp->q_core.d_btimer);
641 warns = be16_to_cpu(dqp->q_core.d_bwarns); 640 warns = be16_to_cpu(dqp->q_core.d_bwarns);
642 warnlimit = XFS_QI_BWARNLIMIT(dqp->q_mount); 641 warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit;
643 resbcountp = &dqp->q_res_bcount; 642 resbcountp = &dqp->q_res_bcount;
644 } else { 643 } else {
645 ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS); 644 ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS);
@@ -651,7 +650,7 @@ xfs_trans_dqresv(
651 softlimit = q->qi_rtbsoftlimit; 650 softlimit = q->qi_rtbsoftlimit;
652 timer = be32_to_cpu(dqp->q_core.d_rtbtimer); 651 timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
653 warns = be16_to_cpu(dqp->q_core.d_rtbwarns); 652 warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
654 warnlimit = XFS_QI_RTBWARNLIMIT(dqp->q_mount); 653 warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit;
655 resbcountp = &dqp->q_res_rtbcount; 654 resbcountp = &dqp->q_res_rtbcount;
656 } 655 }
657 656
@@ -691,7 +690,7 @@ xfs_trans_dqresv(
691 count = be64_to_cpu(dqp->q_core.d_icount); 690 count = be64_to_cpu(dqp->q_core.d_icount);
692 timer = be32_to_cpu(dqp->q_core.d_itimer); 691 timer = be32_to_cpu(dqp->q_core.d_itimer);
693 warns = be16_to_cpu(dqp->q_core.d_iwarns); 692 warns = be16_to_cpu(dqp->q_core.d_iwarns);
694 warnlimit = XFS_QI_IWARNLIMIT(dqp->q_mount); 693 warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit;
695 hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); 694 hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
696 if (!hardlimit) 695 if (!hardlimit)
697 hardlimit = q->qi_ihardlimit; 696 hardlimit = q->qi_ihardlimit;
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index d13eeba2c8f8..0135e2a669d7 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -49,8 +49,8 @@ extern int xfs_acl_chmod(struct inode *inode);
49extern int posix_acl_access_exists(struct inode *inode); 49extern int posix_acl_access_exists(struct inode *inode);
50extern int posix_acl_default_exists(struct inode *inode); 50extern int posix_acl_default_exists(struct inode *inode);
51 51
52extern struct xattr_handler xfs_xattr_acl_access_handler; 52extern const struct xattr_handler xfs_xattr_acl_access_handler;
53extern struct xattr_handler xfs_xattr_acl_default_handler; 53extern const struct xattr_handler xfs_xattr_acl_default_handler;
54#else 54#else
55# define xfs_check_acl NULL 55# define xfs_check_acl NULL
56# define xfs_get_acl(inode, type) NULL 56# define xfs_get_acl(inode, type) NULL
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index abb8222b88c9..401f364ad36c 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -175,14 +175,20 @@ typedef struct xfs_agfl {
175} xfs_agfl_t; 175} xfs_agfl_t;
176 176
177/* 177/*
178 * Busy block/extent entry. Used in perag to mark blocks that have been freed 178 * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that
179 * but whose transactions aren't committed to disk yet. 179 * have been freed but whose transactions aren't committed to disk yet.
180 *
181 * Note that we use the transaction ID to record the transaction, not the
182 * transaction structure itself. See xfs_alloc_busy_insert() for details.
180 */ 183 */
181typedef struct xfs_perag_busy { 184struct xfs_busy_extent {
182 xfs_agblock_t busy_start; 185 struct rb_node rb_node; /* ag by-bno indexed search tree */
183 xfs_extlen_t busy_length; 186 struct list_head list; /* transaction busy extent list */
184 struct xfs_trans *busy_tp; /* transaction that did the free */ 187 xfs_agnumber_t agno;
185} xfs_perag_busy_t; 188 xfs_agblock_t bno;
189 xfs_extlen_t length;
190 xlog_tid_t tid; /* transaction that created this */
191};
186 192
187/* 193/*
188 * Per-ag incore structure, copies of information in agf and agi, 194 * Per-ag incore structure, copies of information in agf and agi,
@@ -216,7 +222,8 @@ typedef struct xfs_perag {
216 xfs_agino_t pagl_leftrec; 222 xfs_agino_t pagl_leftrec;
217 xfs_agino_t pagl_rightrec; 223 xfs_agino_t pagl_rightrec;
218#ifdef __KERNEL__ 224#ifdef __KERNEL__
219 spinlock_t pagb_lock; /* lock for pagb_list */ 225 spinlock_t pagb_lock; /* lock for pagb_tree */
226 struct rb_root pagb_tree; /* ordered tree of busy extents */
220 227
221 atomic_t pagf_fstrms; /* # of filestreams active in this AG */ 228 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
222 229
@@ -226,7 +233,6 @@ typedef struct xfs_perag {
226 int pag_ici_reclaimable; /* reclaimable inodes */ 233 int pag_ici_reclaimable; /* reclaimable inodes */
227#endif 234#endif
228 int pagb_count; /* pagb slots in use */ 235 int pagb_count; /* pagb slots in use */
229 xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */
230} xfs_perag_t; 236} xfs_perag_t;
231 237
232/* 238/*
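
struct xfs_busy_extent above is indexed two ways at once: rb_node hangs it in the per-AG pagb_tree for by-bno overlap searches, while list threads it onto the owning transaction's busy list for cleanup at commit. Getting from a tree node back to the containing extent uses rb_entry(), the rbtree spelling of container_of(). A minimal sketch, assuming a node returned by a tree walk (the demo_ name is illustrative):

	#include <linux/rbtree.h>

	static struct xfs_busy_extent *
	demo_busy_from_node(struct rb_node *node)
	{
		/* rb_entry() maps the embedded rb_node back to its container */
		return rb_entry(node, struct xfs_busy_extent, rb_node);
	}
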
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 94cddbfb2560..a7fbe8a99b12 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -46,11 +46,9 @@
46#define XFSA_FIXUP_BNO_OK 1 46#define XFSA_FIXUP_BNO_OK 1
47#define XFSA_FIXUP_CNT_OK 2 47#define XFSA_FIXUP_CNT_OK 2
48 48
49STATIC void 49static int
50xfs_alloc_search_busy(xfs_trans_t *tp, 50xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
51 xfs_agnumber_t agno, 51 xfs_agblock_t bno, xfs_extlen_t len);
52 xfs_agblock_t bno,
53 xfs_extlen_t len);
54 52
55/* 53/*
56 * Prototypes for per-ag allocation routines 54 * Prototypes for per-ag allocation routines
@@ -540,9 +538,16 @@ xfs_alloc_ag_vextent(
540 be32_to_cpu(agf->agf_length)); 538 be32_to_cpu(agf->agf_length));
541 xfs_alloc_log_agf(args->tp, args->agbp, 539 xfs_alloc_log_agf(args->tp, args->agbp,
542 XFS_AGF_FREEBLKS); 540 XFS_AGF_FREEBLKS);
543 /* search the busylist for these blocks */ 541 /*
544 xfs_alloc_search_busy(args->tp, args->agno, 542 * Search the busylist for these blocks and mark the
545 args->agbno, args->len); 543 * transaction as synchronous if blocks are found. This
544 * avoids the need to block due to a synchronous log
545 * force to ensure correct ordering as the synchronous
546 * transaction will guarantee that for us.
547 */
548 if (xfs_alloc_busy_search(args->mp, args->agno,
549 args->agbno, args->len))
550 xfs_trans_set_sync(args->tp);
546 } 551 }
547 if (!args->isfl) 552 if (!args->isfl)
548 xfs_trans_mod_sb(args->tp, 553 xfs_trans_mod_sb(args->tp,
@@ -1693,7 +1698,7 @@ xfs_free_ag_extent(
1693 * when the iclog commits to disk. If a busy block is allocated, 1698 * when the iclog commits to disk. If a busy block is allocated,
1694 * the iclog is pushed up to the LSN that freed the block. 1699 * the iclog is pushed up to the LSN that freed the block.
1695 */ 1700 */
1696 xfs_alloc_mark_busy(tp, agno, bno, len); 1701 xfs_alloc_busy_insert(tp, agno, bno, len);
1697 return 0; 1702 return 0;
1698 1703
1699 error0: 1704 error0:
@@ -1989,14 +1994,20 @@ xfs_alloc_get_freelist(
1989 *bnop = bno; 1994 *bnop = bno;
1990 1995
1991 /* 1996 /*
1992 * As blocks are freed, they are added to the per-ag busy list 1997 * As blocks are freed, they are added to the per-ag busy list and
1993 * and remain there until the freeing transaction is committed to 1998 * remain there until the freeing transaction is committed to disk.
1994 * disk. Now that we have allocated blocks, this list must be 1999 * Now that we have allocated blocks, this list must be searched to see
1995 * searched to see if a block is being reused. If one is, then 2000 * if a block is being reused. If one is, then the freeing transaction
1996 * the freeing transaction must be pushed to disk NOW by forcing 2001 * must be pushed to disk before this transaction.
1997 * to disk all iclogs up that transaction's LSN. 2002 *
2003 * We do this by setting the current transaction to a sync transaction
2004 * which guarantees that the freeing transaction is on disk before this
2005 * transaction. This is done instead of a synchronous log force here so
2006 * that we don't sit and wait with the AGF locked in the transaction
2007 * during the log force.
1998 */ 2008 */
1999 xfs_alloc_search_busy(tp, be32_to_cpu(agf->agf_seqno), bno, 1); 2009 if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1))
2010 xfs_trans_set_sync(tp);
2000 return 0; 2011 return 0;
2001} 2012}
2002 2013
@@ -2201,7 +2212,7 @@ xfs_alloc_read_agf(
2201 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); 2212 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
2202 spin_lock_init(&pag->pagb_lock); 2213 spin_lock_init(&pag->pagb_lock);
2203 pag->pagb_count = 0; 2214 pag->pagb_count = 0;
2204 memset(pag->pagb_list, 0, sizeof(pag->pagb_list)); 2215 pag->pagb_tree = RB_ROOT;
2205 pag->pagf_init = 1; 2216 pag->pagf_init = 1;
2206 } 2217 }
2207#ifdef DEBUG 2218#ifdef DEBUG
@@ -2479,127 +2490,263 @@ error0:
2479 * list is reused, the transaction that freed it must be forced to disk 2490 * list is reused, the transaction that freed it must be forced to disk
2480 * before continuing to use the block. 2491 * before continuing to use the block.
2481 * 2492 *
2482 * xfs_alloc_mark_busy - add to the per-ag busy list 2493 * xfs_alloc_busy_insert - add to the per-ag busy list
2483 * xfs_alloc_clear_busy - remove an item from the per-ag busy list 2494 * xfs_alloc_busy_clear - remove an item from the per-ag busy list
2495 * xfs_alloc_busy_search - search for a busy extent
2496 */
2497
2498/*
2499 * Insert a new extent into the busy tree.
2500 *
2501 * The busy extent tree is indexed by the start block of the busy extent.
2502 * there can be multiple overlapping ranges in the busy extent tree but only
2503 * ever one entry at a given start block. The reason for this is that
2504 * multi-block extents can be freed, then smaller chunks of that extent
2505 * allocated and freed again before the first transaction commit is on disk.
2506 * If the exact same start block is freed a second time, we have to wait for
2507 * that busy extent to pass out of the tree before the new extent is inserted.
2508 * There are two main cases we have to handle here.
2509 *
2510 * The first case is a transaction that triggers a "free - allocate - free"
2511 * cycle. This can occur during btree manipulations as a btree block is freed
2512 * to the freelist, then allocated from the free list, then freed again. In
2513 * this case, the second extent free is what triggers the duplicate and as
2514 * such the transaction IDs should match. Because the extent was allocated in
2515 * this transaction, the transaction must be marked as synchronous. This is
2516 * true for all cases where the free/alloc/free occurs in the one transaction,
2517 * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case.
2518 * This serves to catch violations of the second case quite effectively.
2519 *
2520 * The second case is where the free/alloc/free occur in different
2521 * transactions. In this case, the thread freeing the extent the second time
2522 * can't mark the extent busy immediately because it is already tracked in a
2523 * transaction that may be committing. When the log commit for the existing
2524 * busy extent completes, the busy extent will be removed from the tree. If we
2525 * allow the second busy insert to continue using that busy extent structure,
2526 * it can be freed before this transaction is safely in the log. Hence our
2527 * only option in this case is to force the log to remove the existing busy
2528 * extent from the list before we insert the new one with the current
2529 * transaction ID.
2530 *
2531 * The problem we are trying to avoid in the free-alloc-free in separate
2532 * transactions is most easily described with a timeline:
2533 *
2534 * Thread 1 Thread 2 Thread 3 xfslogd
2535 * xact alloc
2536 * free X
2537 * mark busy
2538 * commit xact
2539 * free xact
2540 * xact alloc
2541 * alloc X
2542 * busy search
2543 * mark xact sync
2544 * commit xact
2545 * free xact
2546 * force log
2547 * checkpoint starts
2548 * ....
2549 * xact alloc
2550 * free X
2551 * mark busy
2552 * finds match
2553 * *** KABOOM! ***
2554 * ....
2555 * log IO completes
2556 * unbusy X
2557 * checkpoint completes
2558 *
2559 * By issuing a log force in thread 3 @ "KABOOM", the thread will block until
2560 * the checkpoint completes, and the busy extent it matched will have been
2561 * removed from the tree when it is woken. Hence it can then continue safely.
2562 *
2563 * However, to ensure this matching process is robust, we need to use the
2564 * transaction ID for identifying the transaction, as delayed logging results in
2565 * the busy extent and transaction lifecycles being different. i.e. the busy
2566 * extent is active for a lot longer than the transaction. Hence the
2567 * transaction structure can be freed and reallocated, then used to mark the same
2568 * extent busy again in the new transaction. In this case the new transaction
2569 * will have a different tid but can have the same address, and hence we need
2570 * to check against the tid.
2571 *
2572 * Future: for delayed logging, we could avoid the log force if the extent was
2573 * first freed in the current checkpoint sequence. This, however, requires the
2574 * ability to pin the current checkpoint in memory until this transaction
2575 * commits to ensure that both the original free and the current one combine
2576 * logically into the one checkpoint. If the checkpoint sequences are
2577 * different, however, we still need to wait on a log force.
2484 */ 2578 */
2485void 2579void
2486xfs_alloc_mark_busy(xfs_trans_t *tp, 2580xfs_alloc_busy_insert(
2487 xfs_agnumber_t agno, 2581 struct xfs_trans *tp,
2488 xfs_agblock_t bno, 2582 xfs_agnumber_t agno,
2489 xfs_extlen_t len) 2583 xfs_agblock_t bno,
2584 xfs_extlen_t len)
2490{ 2585{
2491 xfs_perag_busy_t *bsy; 2586 struct xfs_busy_extent *new;
2587 struct xfs_busy_extent *busyp;
2492 struct xfs_perag *pag; 2588 struct xfs_perag *pag;
2493 int n; 2589 struct rb_node **rbp;
2590 struct rb_node *parent;
2591 int match;
2494 2592
2495 pag = xfs_perag_get(tp->t_mountp, agno);
2496 spin_lock(&pag->pagb_lock);
2497 2593
2498 /* search pagb_list for an open slot */ 2594 new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
2499 for (bsy = pag->pagb_list, n = 0; 2595 if (!new) {
2500 n < XFS_PAGB_NUM_SLOTS; 2596 /*
2501 bsy++, n++) { 2597 * No Memory! Since it is now not possible to track the free
2502 if (bsy->busy_tp == NULL) { 2598 * block, make this a synchronous transaction to ensure that
2503 break; 2599 * the block is not reused before this transaction commits.
2504 } 2600 */
2601 trace_xfs_alloc_busy(tp, agno, bno, len, 1);
2602 xfs_trans_set_sync(tp);
2603 return;
2505 } 2604 }
2506 2605
2507 trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len, n); 2606 new->agno = agno;
2607 new->bno = bno;
2608 new->length = len;
2609 new->tid = xfs_log_get_trans_ident(tp);
2508 2610
2509 if (n < XFS_PAGB_NUM_SLOTS) { 2611 INIT_LIST_HEAD(&new->list);
2510 bsy = &pag->pagb_list[n]; 2612
2511 pag->pagb_count++; 2613 /* trace before insert to be able to see failed inserts */
2512 bsy->busy_start = bno; 2614 trace_xfs_alloc_busy(tp, agno, bno, len, 0);
2513 bsy->busy_length = len; 2615
2514 bsy->busy_tp = tp; 2616 pag = xfs_perag_get(tp->t_mountp, new->agno);
2515 xfs_trans_add_busy(tp, agno, n); 2617restart:
2516 } else { 2618 spin_lock(&pag->pagb_lock);
2619 rbp = &pag->pagb_tree.rb_node;
2620 parent = NULL;
2621 busyp = NULL;
2622 match = 0;
2623 while (*rbp && match >= 0) {
2624 parent = *rbp;
2625 busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
2626
2627 if (new->bno < busyp->bno) {
2628 /* may overlap, but exact start block is lower */
2629 rbp = &(*rbp)->rb_left;
2630 if (new->bno + new->length > busyp->bno)
2631 match = busyp->tid == new->tid ? 1 : -1;
2632 } else if (new->bno > busyp->bno) {
2633 /* may overlap, but exact start block is higher */
2634 rbp = &(*rbp)->rb_right;
2635 if (bno < busyp->bno + busyp->length)
2636 match = busyp->tid == new->tid ? 1 : -1;
2637 } else {
2638 match = busyp->tid == new->tid ? 1 : -1;
2639 break;
2640 }
2641 }
2642 if (match < 0) {
2643 /* overlap marked busy in different transaction */
2644 spin_unlock(&pag->pagb_lock);
2645 xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
2646 goto restart;
2647 }
2648 if (match > 0) {
2517 /* 2649 /*
2518 * The busy list is full! Since it is now not possible to 2650 * overlap marked busy in same transaction. Update if exact
2519 * track the free block, make this a synchronous transaction 2651 * start block match, otherwise combine the busy extents into
2520 * to insure that the block is not reused before this 2652 * a single range.
2521 * transaction commits.
2522 */ 2653 */
2523 xfs_trans_set_sync(tp); 2654 if (busyp->bno == new->bno) {
2524 } 2655 busyp->length = max(busyp->length, new->length);
2656 spin_unlock(&pag->pagb_lock);
2657 ASSERT(tp->t_flags & XFS_TRANS_SYNC);
2658 xfs_perag_put(pag);
2659 kmem_free(new);
2660 return;
2661 }
2662 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2663 new->length = max(busyp->bno + busyp->length,
2664 new->bno + new->length) -
2665 min(busyp->bno, new->bno);
2666 new->bno = min(busyp->bno, new->bno);
2667 } else
2668 busyp = NULL;
2525 2669
2670 rb_link_node(&new->rb_node, parent, rbp);
2671 rb_insert_color(&new->rb_node, &pag->pagb_tree);
2672
2673 list_add(&new->list, &tp->t_busy);
2526 spin_unlock(&pag->pagb_lock); 2674 spin_unlock(&pag->pagb_lock);
2527 xfs_perag_put(pag); 2675 xfs_perag_put(pag);
2676 kmem_free(busyp);
2528} 2677}
2529 2678
2530void 2679/*
2531xfs_alloc_clear_busy(xfs_trans_t *tp, 2680 * Search for a busy extent within the range of the extent we are about to
2532 xfs_agnumber_t agno, 2681 * allocate. You need to be holding the busy extent tree lock when calling
2533 int idx) 2682 * xfs_alloc_busy_search(). This function returns 0 for no overlapping busy
2683 * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact
2684 * match. This is done so that a non-zero return indicates an overlap that
2685 * will require a synchronous transaction, but it can still be
2686 * used to distinguish between a partial and an exact match.
2687 */
2688static int
2689xfs_alloc_busy_search(
2690 struct xfs_mount *mp,
2691 xfs_agnumber_t agno,
2692 xfs_agblock_t bno,
2693 xfs_extlen_t len)
2534{ 2694{
2535 struct xfs_perag *pag; 2695 struct xfs_perag *pag;
2536 xfs_perag_busy_t *list; 2696 struct rb_node *rbp;
2697 struct xfs_busy_extent *busyp;
2698 int match = 0;
2537 2699
2538 ASSERT(idx < XFS_PAGB_NUM_SLOTS); 2700 pag = xfs_perag_get(mp, agno);
2539 pag = xfs_perag_get(tp->t_mountp, agno);
2540 spin_lock(&pag->pagb_lock); 2701 spin_lock(&pag->pagb_lock);
2541 list = pag->pagb_list;
2542 2702
2543 trace_xfs_alloc_unbusy(tp->t_mountp, agno, idx, list[idx].busy_tp == tp); 2703 rbp = pag->pagb_tree.rb_node;
2544 2704
2545 if (list[idx].busy_tp == tp) { 2705 /* find closest start bno overlap */
2546 list[idx].busy_tp = NULL; 2706 while (rbp) {
2547 pag->pagb_count--; 2707 busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node);
2708 if (bno < busyp->bno) {
2709 /* may overlap, but exact start block is lower */
2710 if (bno + len > busyp->bno)
2711 match = -1;
2712 rbp = rbp->rb_left;
2713 } else if (bno > busyp->bno) {
2714 /* may overlap, but exact start block is higher */
2715 if (bno < busyp->bno + busyp->length)
2716 match = -1;
2717 rbp = rbp->rb_right;
2718 } else {
2719 /* bno matches busyp, length determines exact match */
2720 match = (busyp->length == len) ? 1 : -1;
2721 break;
2722 }
2548 } 2723 }
2549
2550 spin_unlock(&pag->pagb_lock); 2724 spin_unlock(&pag->pagb_lock);
2725 trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match);
2551 xfs_perag_put(pag); 2726 xfs_perag_put(pag);
2727 return match;
2552} 2728}
2553 2729
2554 2730void
2555/* 2731xfs_alloc_busy_clear(
2556 * If we find the extent in the busy list, force the log out to get the 2732 struct xfs_mount *mp,
2557 * extent out of the busy list so the caller can use it straight away. 2733 struct xfs_busy_extent *busyp)
2558 */
2559STATIC void
2560xfs_alloc_search_busy(xfs_trans_t *tp,
2561 xfs_agnumber_t agno,
2562 xfs_agblock_t bno,
2563 xfs_extlen_t len)
2564{ 2734{
2565 struct xfs_perag *pag; 2735 struct xfs_perag *pag;
2566 xfs_perag_busy_t *bsy;
2567 xfs_agblock_t uend, bend;
2568 xfs_lsn_t lsn = 0;
2569 int cnt;
2570 2736
2571 pag = xfs_perag_get(tp->t_mountp, agno); 2737 trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno,
2572 spin_lock(&pag->pagb_lock); 2738 busyp->length);
2573 cnt = pag->pagb_count;
2574 2739
2575 /* 2740 ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno,
2576 * search pagb_list for this slot, skipping open slots. We have to 2741 busyp->length) == 1);
2577 * search the entire array as there may be multiple overlaps and
2578 * we have to get the most recent LSN for the log force to push out
2579 * all the transactions that span the range.
2580 */
2581 uend = bno + len - 1;
2582 for (cnt = 0; cnt < pag->pagb_count; cnt++) {
2583 bsy = &pag->pagb_list[cnt];
2584 if (!bsy->busy_tp)
2585 continue;
2586 2742
2587 bend = bsy->busy_start + bsy->busy_length - 1; 2743 list_del_init(&busyp->list);
2588 if (bno > bend || uend < bsy->busy_start)
2589 continue;
2590 2744
2591 /* (start1,length1) within (start2, length2) */ 2745 pag = xfs_perag_get(mp, busyp->agno);
2592 if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0) 2746 spin_lock(&pag->pagb_lock);
2593 lsn = bsy->busy_tp->t_commit_lsn; 2747 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2594 }
2595 spin_unlock(&pag->pagb_lock); 2748 spin_unlock(&pag->pagb_lock);
2596 xfs_perag_put(pag); 2749 xfs_perag_put(pag);
2597 trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn);
2598 2750
2599 /* 2751 kmem_free(busyp);
2600 * If a block was found, force the log through the LSN of the
2601 * transaction that freed the block
2602 */
2603 if (lsn)
2604 xfs_log_force_lsn(tp->t_mountp, lsn, XFS_LOG_SYNC);
2605} 2752}
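
The 0/-1/1 return convention of xfs_alloc_busy_search() reduces to half-open interval overlap plus an exact-match check. A standalone sketch of the classification, detached from the rbtree walk (the demo_ name is illustrative; wrap-around of the 32-bit block arithmetic is ignored for brevity):

	/*
	 *  0 - [bno, bno + len) does not touch the busy extent
	 * -1 - the ranges overlap but are not identical
	 *  1 - exact match of both start block and length
	 */
	static int
	demo_busy_classify(xfs_agblock_t bno, xfs_extlen_t len,
			   xfs_agblock_t busy_bno, xfs_extlen_t busy_len)
	{
		if (bno + len <= busy_bno || busy_bno + busy_len <= bno)
			return 0;		/* disjoint */
		if (bno == busy_bno && len == busy_len)
			return 1;		/* exact match */
		return -1;			/* partial overlap */
	}

Any non-zero result forces the allocating transaction synchronous; the exact/partial distinction is kept because xfs_alloc_busy_clear() asserts an exact match when removing an extent.
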
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 599bffa39784..6d05199b667c 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -22,6 +22,7 @@ struct xfs_buf;
22struct xfs_mount; 22struct xfs_mount;
23struct xfs_perag; 23struct xfs_perag;
24struct xfs_trans; 24struct xfs_trans;
25struct xfs_busy_extent;
25 26
26/* 27/*
27 * Freespace allocation types. Argument to xfs_alloc_[v]extent. 28 * Freespace allocation types. Argument to xfs_alloc_[v]extent.
@@ -119,15 +120,13 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
119#ifdef __KERNEL__ 120#ifdef __KERNEL__
120 121
121void 122void
122xfs_alloc_mark_busy(xfs_trans_t *tp, 123xfs_alloc_busy_insert(xfs_trans_t *tp,
123 xfs_agnumber_t agno, 124 xfs_agnumber_t agno,
124 xfs_agblock_t bno, 125 xfs_agblock_t bno,
125 xfs_extlen_t len); 126 xfs_extlen_t len);
126 127
127void 128void
128xfs_alloc_clear_busy(xfs_trans_t *tp, 129xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
129 xfs_agnumber_t ag,
130 int idx);
131 130
132#endif /* __KERNEL__ */ 131#endif /* __KERNEL__ */
133 132
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index b726e10d2c1c..83f494218759 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -134,7 +134,7 @@ xfs_allocbt_free_block(
134 * disk. If a busy block is allocated, the iclog is pushed up to the 134 * disk. If a busy block is allocated, the iclog is pushed up to the
135 * LSN that freed the block. 135 * LSN that freed the block.
136 */ 136 */
137 xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); 137 xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
138 xfs_trans_agbtree_delta(cur->bc_tp, -1); 138 xfs_trans_agbtree_delta(cur->bc_tp, -1);
139 return 0; 139 return 0;
140} 140}
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 5c11e4d17010..99587ded043f 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -3829,7 +3829,7 @@ xfs_bmap_add_attrfork(
3829 } 3829 }
3830 if ((error = xfs_bmap_finish(&tp, &flist, &committed))) 3830 if ((error = xfs_bmap_finish(&tp, &flist, &committed)))
3831 goto error2; 3831 goto error2;
3832 error = xfs_trans_commit(tp, XFS_TRANS_PERM_LOG_RES); 3832 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3833 ASSERT(ip->i_df.if_ext_max == 3833 ASSERT(ip->i_df.if_ext_max ==
3834 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); 3834 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
3835 return error; 3835 return error;
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index f3c49e69eab9..02a80984aa05 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -64,7 +64,7 @@ xfs_buf_item_log_debug(
64 nbytes = last - first + 1; 64 nbytes = last - first + 1;
65 bfset(bip->bli_logged, first, nbytes); 65 bfset(bip->bli_logged, first, nbytes);
66 for (x = 0; x < nbytes; x++) { 66 for (x = 0; x < nbytes; x++) {
67 chunk_num = byte >> XFS_BLI_SHIFT; 67 chunk_num = byte >> XFS_BLF_SHIFT;
68 word_num = chunk_num >> BIT_TO_WORD_SHIFT; 68 word_num = chunk_num >> BIT_TO_WORD_SHIFT;
69 bit_num = chunk_num & (NBWORD - 1); 69 bit_num = chunk_num & (NBWORD - 1);
70 wordp = &(bip->bli_format.blf_data_map[word_num]); 70 wordp = &(bip->bli_format.blf_data_map[word_num]);
@@ -166,7 +166,7 @@ xfs_buf_item_size(
166 * cancel flag in it. 166 * cancel flag in it.
167 */ 167 */
168 trace_xfs_buf_item_size_stale(bip); 168 trace_xfs_buf_item_size_stale(bip);
169 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 169 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
170 return 1; 170 return 1;
171 } 171 }
172 172
@@ -197,9 +197,9 @@ xfs_buf_item_size(
197 } else if (next_bit != last_bit + 1) { 197 } else if (next_bit != last_bit + 1) {
198 last_bit = next_bit; 198 last_bit = next_bit;
199 nvecs++; 199 nvecs++;
200 } else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) != 200 } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) !=
201 (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) + 201 (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) +
202 XFS_BLI_CHUNK)) { 202 XFS_BLF_CHUNK)) {
203 last_bit = next_bit; 203 last_bit = next_bit;
204 nvecs++; 204 nvecs++;
205 } else { 205 } else {
@@ -254,6 +254,20 @@ xfs_buf_item_format(
254 vecp++; 254 vecp++;
255 nvecs = 1; 255 nvecs = 1;
256 256
257 /*
258 * If it is an inode buffer, transfer the in-memory state to the
259 * format flags and clear the in-memory state. We do not transfer
260 * this state if the inode buffer allocation has not yet been committed
 261 * to the log as setting the XFS_BLF_INODE_BUF flag will prevent
262 * correct replay of the inode allocation.
263 */
264 if (bip->bli_flags & XFS_BLI_INODE_BUF) {
265 if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
266 xfs_log_item_in_current_chkpt(&bip->bli_item)))
267 bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF;
268 bip->bli_flags &= ~XFS_BLI_INODE_BUF;
269 }
270
257 if (bip->bli_flags & XFS_BLI_STALE) { 271 if (bip->bli_flags & XFS_BLI_STALE) {
258 /* 272 /*
259 * The buffer is stale, so all we need to log 273 * The buffer is stale, so all we need to log
@@ -261,7 +275,7 @@ xfs_buf_item_format(
261 * cancel flag in it. 275 * cancel flag in it.
262 */ 276 */
263 trace_xfs_buf_item_format_stale(bip); 277 trace_xfs_buf_item_format_stale(bip);
264 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 278 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
265 bip->bli_format.blf_size = nvecs; 279 bip->bli_format.blf_size = nvecs;
266 return; 280 return;
267 } 281 }
@@ -294,28 +308,28 @@ xfs_buf_item_format(
294 * keep counting and scanning. 308 * keep counting and scanning.
295 */ 309 */
296 if (next_bit == -1) { 310 if (next_bit == -1) {
297 buffer_offset = first_bit * XFS_BLI_CHUNK; 311 buffer_offset = first_bit * XFS_BLF_CHUNK;
298 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 312 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
299 vecp->i_len = nbits * XFS_BLI_CHUNK; 313 vecp->i_len = nbits * XFS_BLF_CHUNK;
300 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 314 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
301 nvecs++; 315 nvecs++;
302 break; 316 break;
303 } else if (next_bit != last_bit + 1) { 317 } else if (next_bit != last_bit + 1) {
304 buffer_offset = first_bit * XFS_BLI_CHUNK; 318 buffer_offset = first_bit * XFS_BLF_CHUNK;
305 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 319 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
306 vecp->i_len = nbits * XFS_BLI_CHUNK; 320 vecp->i_len = nbits * XFS_BLF_CHUNK;
307 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 321 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
308 nvecs++; 322 nvecs++;
309 vecp++; 323 vecp++;
310 first_bit = next_bit; 324 first_bit = next_bit;
311 last_bit = next_bit; 325 last_bit = next_bit;
312 nbits = 1; 326 nbits = 1;
313 } else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) != 327 } else if (xfs_buf_offset(bp, next_bit << XFS_BLF_SHIFT) !=
314 (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) + 328 (xfs_buf_offset(bp, last_bit << XFS_BLF_SHIFT) +
315 XFS_BLI_CHUNK)) { 329 XFS_BLF_CHUNK)) {
316 buffer_offset = first_bit * XFS_BLI_CHUNK; 330 buffer_offset = first_bit * XFS_BLF_CHUNK;
317 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 331 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
318 vecp->i_len = nbits * XFS_BLI_CHUNK; 332 vecp->i_len = nbits * XFS_BLF_CHUNK;
319 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 333 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
320/* You would think we need to bump the nvecs here too, but we do not; 334/* You would think we need to bump the nvecs here too, but we do not;
321 * this number is used by recovery, and it gets confused by the boundary 335 * this number is used by recovery, and it gets confused by the boundary
@@ -341,10 +355,15 @@ xfs_buf_item_format(
341} 355}
342 356
343/* 357/*
344 * This is called to pin the buffer associated with the buf log 358 * This is called to pin the buffer associated with the buf log item in memory
345 * item in memory so it cannot be written out. Simply call bpin() 359 * so it cannot be written out. Simply call bpin() on the buffer to do this.
346 * on the buffer to do this. 360 *
361 * We also always take a reference to the buffer log item here so that the bli
362 * is held while the item is pinned in memory. This means that we can
363 * unconditionally drop the reference count a transaction holds when the
364 * transaction is completed.
347 */ 365 */
366
348STATIC void 367STATIC void
349xfs_buf_item_pin( 368xfs_buf_item_pin(
350 xfs_buf_log_item_t *bip) 369 xfs_buf_log_item_t *bip)
@@ -356,6 +375,7 @@ xfs_buf_item_pin(
356 ASSERT(atomic_read(&bip->bli_refcount) > 0); 375 ASSERT(atomic_read(&bip->bli_refcount) > 0);
357 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || 376 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
358 (bip->bli_flags & XFS_BLI_STALE)); 377 (bip->bli_flags & XFS_BLI_STALE));
378 atomic_inc(&bip->bli_refcount);
359 trace_xfs_buf_item_pin(bip); 379 trace_xfs_buf_item_pin(bip);
360 xfs_bpin(bp); 380 xfs_bpin(bp);
361} 381}
@@ -372,12 +392,12 @@ xfs_buf_item_pin(
372 */ 392 */
373STATIC void 393STATIC void
374xfs_buf_item_unpin( 394xfs_buf_item_unpin(
375 xfs_buf_log_item_t *bip, 395 xfs_buf_log_item_t *bip)
376 int stale)
377{ 396{
378 struct xfs_ail *ailp; 397 struct xfs_ail *ailp;
379 xfs_buf_t *bp; 398 xfs_buf_t *bp;
380 int freed; 399 int freed;
400 int stale = bip->bli_flags & XFS_BLI_STALE;
381 401
382 bp = bip->bli_buf; 402 bp = bip->bli_buf;
383 ASSERT(bp != NULL); 403 ASSERT(bp != NULL);
@@ -393,7 +413,7 @@ xfs_buf_item_unpin(
393 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 413 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
394 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); 414 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
395 ASSERT(XFS_BUF_ISSTALE(bp)); 415 ASSERT(XFS_BUF_ISSTALE(bp));
396 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 416 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
397 trace_xfs_buf_item_unpin_stale(bip); 417 trace_xfs_buf_item_unpin_stale(bip);
398 418
399 /* 419 /*
@@ -428,40 +448,34 @@ xfs_buf_item_unpin_remove(
428 xfs_buf_log_item_t *bip, 448 xfs_buf_log_item_t *bip,
429 xfs_trans_t *tp) 449 xfs_trans_t *tp)
430{ 450{
431 xfs_buf_t *bp; 451 /* will xfs_buf_item_unpin() call xfs_buf_item_relse()? */
432 xfs_log_item_desc_t *lidp;
433 int stale = 0;
434
435 bp = bip->bli_buf;
436 /*
437 * will xfs_buf_item_unpin() call xfs_buf_item_relse()?
438 */
439 if ((atomic_read(&bip->bli_refcount) == 1) && 452 if ((atomic_read(&bip->bli_refcount) == 1) &&
440 (bip->bli_flags & XFS_BLI_STALE)) { 453 (bip->bli_flags & XFS_BLI_STALE)) {
454 /*
455 * yes -- We can safely do some work here and then call
456 * buf_item_unpin to do the rest because we are
 457 * holding the buffer locked so no one else will be
458 * able to bump up the refcount. We have to remove the
459 * log item from the transaction as we are about to release
460 * our reference to the buffer. If we don't, the unlock that
461 * occurs later in the xfs_trans_uncommit() will try to
462 * reference the buffer which we no longer have a hold on.
463 */
464 struct xfs_log_item_desc *lidp;
465
441 ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0); 466 ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0);
442 trace_xfs_buf_item_unpin_stale(bip); 467 trace_xfs_buf_item_unpin_stale(bip);
443 468
444 /* 469 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)bip);
445 * yes -- clear the xaction descriptor in-use flag
446 * and free the chunk if required. We can safely
447 * do some work here and then call buf_item_unpin
448 * to do the rest because if the if is true, then
449 * we are holding the buffer locked so no one else
450 * will be able to bump up the refcount.
451 */
452 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) bip);
453 stale = lidp->lid_flags & XFS_LID_BUF_STALE;
454 xfs_trans_free_item(tp, lidp); 470 xfs_trans_free_item(tp, lidp);
471
455 /* 472 /*
456 * Since the transaction no longer refers to the buffer, 473 * Since the transaction no longer refers to the buffer, the
457 * the buffer should no longer refer to the transaction. 474 * buffer should no longer refer to the transaction.
458 */ 475 */
459 XFS_BUF_SET_FSPRIVATE2(bp, NULL); 476 XFS_BUF_SET_FSPRIVATE2(bip->bli_buf, NULL);
460 } 477 }
461 478 xfs_buf_item_unpin(bip);
462 xfs_buf_item_unpin(bip, stale);
463
464 return;
465} 479}
466 480
467/* 481/*
@@ -495,20 +509,23 @@ xfs_buf_item_trylock(
495} 509}
496 510
497/* 511/*
498 * Release the buffer associated with the buf log item. 512 * Release the buffer associated with the buf log item. If there is no dirty
499 * If there is no dirty logged data associated with the 513 * logged data associated with the buffer recorded in the buf log item, then
500 * buffer recorded in the buf log item, then free the 514 * free the buf log item and remove the reference to it in the buffer.
501 * buf log item and remove the reference to it in the 515 *
502 * buffer. 516 * This call ignores the recursion count. It is only called when the buffer
517 * should REALLY be unlocked, regardless of the recursion count.
503 * 518 *
504 * This call ignores the recursion count. It is only called 519 * We unconditionally drop the transaction's reference to the log item. If the
505 * when the buffer should REALLY be unlocked, regardless 520 * item was logged, then another reference was taken when it was pinned, so we
506 * of the recursion count. 521 * can safely drop the transaction reference now. This also allows us to avoid
522 * potential races with the unpin code freeing the bli by not referencing the
523 * bli after we've dropped the reference count.
507 * 524 *
508 * If the XFS_BLI_HOLD flag is set in the buf log item, then 525 * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
509 * free the log item if necessary but do not unlock the buffer. 526 * if necessary but do not unlock the buffer. This is for support of
510 * This is for support of xfs_trans_bhold(). Make sure the 527 * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
511 * XFS_BLI_HOLD field is cleared if we don't free the item. 528 * free the item.
512 */ 529 */
513STATIC void 530STATIC void
514xfs_buf_item_unlock( 531xfs_buf_item_unlock(
@@ -520,73 +537,54 @@ xfs_buf_item_unlock(
520 537
521 bp = bip->bli_buf; 538 bp = bip->bli_buf;
522 539
523 /* 540 /* Clear the buffer's association with this transaction. */
524 * Clear the buffer's association with this transaction.
525 */
526 XFS_BUF_SET_FSPRIVATE2(bp, NULL); 541 XFS_BUF_SET_FSPRIVATE2(bp, NULL);
527 542
528 /* 543 /*
529 * If this is a transaction abort, don't return early. 544 * If this is a transaction abort, don't return early. Instead, allow
530 * Instead, allow the brelse to happen. 545 * the brelse to happen. Normally it would be done for stale
531 * Normally it would be done for stale (cancelled) buffers 546 * (cancelled) buffers at unpin time, but we'll never go through the
532 * at unpin time, but we'll never go through the pin/unpin 547 * pin/unpin cycle if we abort inside commit.
533 * cycle if we abort inside commit.
534 */ 548 */
535 aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0; 549 aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0;
536 550
537 /* 551 /*
538 * If the buf item is marked stale, then don't do anything. 552 * Before possibly freeing the buf item, determine if we should
539 * We'll unlock the buffer and free the buf item when the 553 * release the buffer at the end of this routine.
540 * buffer is unpinned for the last time.
541 */ 554 */
542 if (bip->bli_flags & XFS_BLI_STALE) { 555 hold = bip->bli_flags & XFS_BLI_HOLD;
543 bip->bli_flags &= ~XFS_BLI_LOGGED; 556
544 trace_xfs_buf_item_unlock_stale(bip); 557 /* Clear the per transaction state. */
545 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 558 bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD);
546 if (!aborted)
547 return;
548 }
549 559
550 /* 560 /*
551 * Drop the transaction's reference to the log item if 561 * If the buf item is marked stale, then don't do anything. We'll
552 * it was not logged as part of the transaction. Otherwise 562 * unlock the buffer and free the buf item when the buffer is unpinned
553 * we'll drop the reference in xfs_buf_item_unpin() when 563 * for the last time.
554 * the transaction is really through with the buffer.
555 */ 564 */
556 if (!(bip->bli_flags & XFS_BLI_LOGGED)) { 565 if (bip->bli_flags & XFS_BLI_STALE) {
557 atomic_dec(&bip->bli_refcount); 566 trace_xfs_buf_item_unlock_stale(bip);
558 } else { 567 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
559 /* 568 if (!aborted) {
560 * Clear the logged flag since this is per 569 atomic_dec(&bip->bli_refcount);
561 * transaction state. 570 return;
562 */ 571 }
563 bip->bli_flags &= ~XFS_BLI_LOGGED;
564 } 572 }
565 573
566 /*
567 * Before possibly freeing the buf item, determine if we should
568 * release the buffer at the end of this routine.
569 */
570 hold = bip->bli_flags & XFS_BLI_HOLD;
571 trace_xfs_buf_item_unlock(bip); 574 trace_xfs_buf_item_unlock(bip);
572 575
573 /* 576 /*
574 * If the buf item isn't tracking any data, free it. 577 * If the buf item isn't tracking any data, free it, otherwise drop the
575 * Otherwise, if XFS_BLI_HOLD is set clear it. 578 * reference we hold to it.
576 */ 579 */
577 if (xfs_bitmap_empty(bip->bli_format.blf_data_map, 580 if (xfs_bitmap_empty(bip->bli_format.blf_data_map,
578 bip->bli_format.blf_map_size)) { 581 bip->bli_format.blf_map_size))
579 xfs_buf_item_relse(bp); 582 xfs_buf_item_relse(bp);
580 } else if (hold) { 583 else
581 bip->bli_flags &= ~XFS_BLI_HOLD; 584 atomic_dec(&bip->bli_refcount);
582 }
583 585
584 /* 586 if (!hold)
585 * Release the buffer if XFS_BLI_HOLD was not set.
586 */
587 if (!hold) {
588 xfs_buf_relse(bp); 587 xfs_buf_relse(bp);
589 }
590} 588}
591 589
592/* 590/*
@@ -675,7 +673,7 @@ static struct xfs_item_ops xfs_buf_item_ops = {
675 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 673 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
676 xfs_buf_item_format, 674 xfs_buf_item_format,
677 .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin, 675 .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin,
678 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_buf_item_unpin, 676 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_buf_item_unpin,
679 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) 677 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
680 xfs_buf_item_unpin_remove, 678 xfs_buf_item_unpin_remove,
681 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock, 679 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock,
@@ -723,20 +721,17 @@ xfs_buf_item_init(
723 } 721 }
724 722
725 /* 723 /*
726 * chunks is the number of XFS_BLI_CHUNK size pieces 724 * chunks is the number of XFS_BLF_CHUNK size pieces
727 * the buffer can be divided into. Make sure not to 725 * the buffer can be divided into. Make sure not to
728 * truncate any pieces. map_size is the size of the 726 * truncate any pieces. map_size is the size of the
729 * bitmap needed to describe the chunks of the buffer. 727 * bitmap needed to describe the chunks of the buffer.
730 */ 728 */
731 chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT); 729 chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT);
732 map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); 730 map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT);
733 731
734 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, 732 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone,
735 KM_SLEEP); 733 KM_SLEEP);
736 bip->bli_item.li_type = XFS_LI_BUF; 734 xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
737 bip->bli_item.li_ops = &xfs_buf_item_ops;
738 bip->bli_item.li_mountp = mp;
739 bip->bli_item.li_ailp = mp->m_ail;
740 bip->bli_buf = bp; 735 bip->bli_buf = bp;
741 xfs_buf_hold(bp); 736 xfs_buf_hold(bp);
742 bip->bli_format.blf_type = XFS_LI_BUF; 737 bip->bli_format.blf_type = XFS_LI_BUF;
@@ -799,8 +794,8 @@ xfs_buf_item_log(
799 /* 794 /*
800 * Convert byte offsets to bit numbers. 795 * Convert byte offsets to bit numbers.
801 */ 796 */
802 first_bit = first >> XFS_BLI_SHIFT; 797 first_bit = first >> XFS_BLF_SHIFT;
803 last_bit = last >> XFS_BLI_SHIFT; 798 last_bit = last >> XFS_BLF_SHIFT;
804 799
805 /* 800 /*
806 * Calculate the total number of bits to be set. 801 * Calculate the total number of bits to be set.
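
The xfs_buf_item.c changes above amount to a simpler reference-counting protocol for the buf log item: ->iop_pin now takes its own reference, so ->iop_unlock can drop the transaction's reference unconditionally and the final unpin frees the bli. A toy user-space model of that lifecycle (all names hypothetical; illustration only, not kernel code):

	#include <assert.h>
	#include <stdio.h>

	struct toy_bli { int refcount; };

	static void toy_trans_hold(struct toy_bli *b) { b->refcount++; }
	static void toy_pin(struct toy_bli *b)        { b->refcount++; }
	static void toy_unlock(struct toy_bli *b)     { b->refcount--; }
	static int  toy_unpin(struct toy_bli *b)      { return --b->refcount == 0; }

	int main(void)
	{
		struct toy_bli bli = { 0 };

		toy_trans_hold(&bli);	/* transaction references the bli */
		toy_pin(&bli);		/* commit pins it: second reference */
		toy_unlock(&bli);	/* trans ref dropped unconditionally */
		assert(bli.refcount == 1);	/* pin ref keeps it alive */
		if (toy_unpin(&bli))	/* I/O completion drops the last ref */
			printf("bli freed at unpin\n");
		return 0;
	}
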
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 217f34af00cb..f20bb472d582 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -26,7 +26,7 @@ extern kmem_zone_t *xfs_buf_item_zone;
26 * have been logged. 26 * have been logged.
27 * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything. 27 * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything.
28 */ 28 */
29typedef struct xfs_buf_log_format_t { 29typedef struct xfs_buf_log_format {
30 unsigned short blf_type; /* buf log item type indicator */ 30 unsigned short blf_type; /* buf log item type indicator */
31 unsigned short blf_size; /* size of this item */ 31 unsigned short blf_size; /* size of this item */
32 ushort blf_flags; /* misc state */ 32 ushort blf_flags; /* misc state */
@@ -41,22 +41,22 @@ typedef struct xfs_buf_log_format_t {
41 * This flag indicates that the buffer contains on disk inodes 41 * This flag indicates that the buffer contains on disk inodes
42 * and requires special recovery handling. 42 * and requires special recovery handling.
43 */ 43 */
44#define XFS_BLI_INODE_BUF 0x1 44#define XFS_BLF_INODE_BUF 0x1
45/* 45/*
46 * This flag indicates that the buffer should not be replayed 46 * This flag indicates that the buffer should not be replayed
47 * during recovery because its blocks are being freed. 47 * during recovery because its blocks are being freed.
48 */ 48 */
49#define XFS_BLI_CANCEL 0x2 49#define XFS_BLF_CANCEL 0x2
50/* 50/*
51 * This flag indicates that the buffer contains on disk 51 * This flag indicates that the buffer contains on disk
52 * user or group dquots and may require special recovery handling. 52 * user or group dquots and may require special recovery handling.
53 */ 53 */
54#define XFS_BLI_UDQUOT_BUF 0x4 54#define XFS_BLF_UDQUOT_BUF 0x4
55#define XFS_BLI_PDQUOT_BUF 0x8 55#define XFS_BLF_PDQUOT_BUF 0x8
56#define XFS_BLI_GDQUOT_BUF 0x10 56#define XFS_BLF_GDQUOT_BUF 0x10
57 57
58#define XFS_BLI_CHUNK 128 58#define XFS_BLF_CHUNK 128
59#define XFS_BLI_SHIFT 7 59#define XFS_BLF_SHIFT 7
60#define BIT_TO_WORD_SHIFT 5 60#define BIT_TO_WORD_SHIFT 5
61#define NBWORD (NBBY * sizeof(unsigned int)) 61#define NBWORD (NBBY * sizeof(unsigned int))
62 62
@@ -69,6 +69,7 @@ typedef struct xfs_buf_log_format_t {
69#define XFS_BLI_LOGGED 0x08 69#define XFS_BLI_LOGGED 0x08
70#define XFS_BLI_INODE_ALLOC_BUF 0x10 70#define XFS_BLI_INODE_ALLOC_BUF 0x10
71#define XFS_BLI_STALE_INODE 0x20 71#define XFS_BLI_STALE_INODE 0x20
72#define XFS_BLI_INODE_BUF 0x40
72 73
73#define XFS_BLI_FLAGS \ 74#define XFS_BLI_FLAGS \
74 { XFS_BLI_HOLD, "HOLD" }, \ 75 { XFS_BLI_HOLD, "HOLD" }, \
@@ -76,7 +77,8 @@ typedef struct xfs_buf_log_format_t {
76 { XFS_BLI_STALE, "STALE" }, \ 77 { XFS_BLI_STALE, "STALE" }, \
77 { XFS_BLI_LOGGED, "LOGGED" }, \ 78 { XFS_BLI_LOGGED, "LOGGED" }, \
78 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ 79 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
79 { XFS_BLI_STALE_INODE, "STALE_INODE" } 80 { XFS_BLI_STALE_INODE, "STALE_INODE" }, \
81 { XFS_BLI_INODE_BUF, "INODE_BUF" }
80 82
81 83
82#ifdef __KERNEL__ 84#ifdef __KERNEL__
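
The renamed XFS_BLF_* constants above drive the dirty-chunk bitmap sizing in xfs_buf_item_init(): the buffer is divided into 128-byte chunks, with one dirty bit per chunk, rounded up to whole bitmap words. Worked through for a 4096-byte buffer (standalone user-space sketch; constants copied from the header):

	#include <stdio.h>

	#define XFS_BLF_CHUNK		128
	#define XFS_BLF_SHIFT		7
	#define BIT_TO_WORD_SHIFT	5
	#define NBWORD			(8 * sizeof(unsigned int))	/* NBBY == 8 */

	int main(void)
	{
		unsigned int count = 4096;	/* buffer size in bytes */
		/* round up so no partial chunk is truncated */
		int chunks = (count + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT;
		/* one bit per chunk, rounded up to whole words */
		int map_size = (chunks + NBWORD) >> BIT_TO_WORD_SHIFT;

		/* prints: 4096 bytes -> 32 chunks -> 2 bitmap words */
		printf("%u bytes -> %d chunks -> %d bitmap words\n",
		       count, chunks, map_size);
		return 0;
	}
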
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 92d5cd5bf4f2..047b8a8e5c29 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -170,7 +170,7 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
170 va_list ap; 170 va_list ap;
171 171
172#ifdef DEBUG 172#ifdef DEBUG
173 xfs_panic_mask |= XFS_PTAG_SHUTDOWN_CORRUPT; 173 xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
174#endif 174#endif
175 175
176 if (xfs_panic_mask && (xfs_panic_mask & panic_tag) 176 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
@@ -186,18 +186,18 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
186 186
187void 187void
188xfs_error_report( 188xfs_error_report(
189 char *tag, 189 const char *tag,
190 int level, 190 int level,
191 xfs_mount_t *mp, 191 struct xfs_mount *mp,
192 char *fname, 192 const char *filename,
193 int linenum, 193 int linenum,
194 inst_t *ra) 194 inst_t *ra)
195{ 195{
196 if (level <= xfs_error_level) { 196 if (level <= xfs_error_level) {
197 xfs_cmn_err(XFS_PTAG_ERROR_REPORT, 197 xfs_cmn_err(XFS_PTAG_ERROR_REPORT,
198 CE_ALERT, mp, 198 CE_ALERT, mp,
199 "XFS internal error %s at line %d of file %s. Caller 0x%p\n", 199 "XFS internal error %s at line %d of file %s. Caller 0x%p\n",
200 tag, linenum, fname, ra); 200 tag, linenum, filename, ra);
201 201
202 xfs_stack_trace(); 202 xfs_stack_trace();
203 } 203 }
@@ -205,15 +205,15 @@ xfs_error_report(
205 205
206void 206void
207xfs_corruption_error( 207xfs_corruption_error(
208 char *tag, 208 const char *tag,
209 int level, 209 int level,
210 xfs_mount_t *mp, 210 struct xfs_mount *mp,
211 void *p, 211 void *p,
212 char *fname, 212 const char *filename,
213 int linenum, 213 int linenum,
214 inst_t *ra) 214 inst_t *ra)
215{ 215{
216 if (level <= xfs_error_level) 216 if (level <= xfs_error_level)
217 xfs_hex_dump(p, 16); 217 xfs_hex_dump(p, 16);
218 xfs_error_report(tag, level, mp, fname, linenum, ra); 218 xfs_error_report(tag, level, mp, filename, linenum, ra);
219} 219}
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 0c93051c4651..c2c1a072bb82 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -29,10 +29,11 @@ extern int xfs_error_trap(int);
29 29
30struct xfs_mount; 30struct xfs_mount;
31 31
32extern void xfs_error_report(char *tag, int level, struct xfs_mount *mp, 32extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
33 char *fname, int linenum, inst_t *ra); 33 const char *filename, int linenum, inst_t *ra);
34extern void xfs_corruption_error(char *tag, int level, struct xfs_mount *mp, 34extern void xfs_corruption_error(const char *tag, int level,
35 void *p, char *fname, int linenum, inst_t *ra); 35 struct xfs_mount *mp, void *p, const char *filename,
36 int linenum, inst_t *ra);
36 37
37#define XFS_ERROR_REPORT(e, lvl, mp) \ 38#define XFS_ERROR_REPORT(e, lvl, mp) \
38 xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address) 39 xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address)
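
The constification above changes no call sites; the macro still captures the location automatically. A hypothetical caller, for illustration only:

	if (bad_magic)		/* "bad_magic" is a made-up condition */
		XFS_ERROR_REPORT("bad magic number", XFS_ERRLEVEL_LOW, mp);

	/* which expands to:
	 * xfs_error_report("bad magic number", XFS_ERRLEVEL_LOW, mp,
	 *		    __FILE__, __LINE__, __return_address);
	 */
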
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 6f35ed1b39b9..409fe81585fd 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -106,7 +106,7 @@ xfs_efi_item_pin(xfs_efi_log_item_t *efip)
106 */ 106 */
107/*ARGSUSED*/ 107/*ARGSUSED*/
108STATIC void 108STATIC void
109xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale) 109xfs_efi_item_unpin(xfs_efi_log_item_t *efip)
110{ 110{
111 struct xfs_ail *ailp = efip->efi_item.li_ailp; 111 struct xfs_ail *ailp = efip->efi_item.li_ailp;
112 112
@@ -224,7 +224,7 @@ static struct xfs_item_ops xfs_efi_item_ops = {
224 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 224 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
225 xfs_efi_item_format, 225 xfs_efi_item_format,
226 .iop_pin = (void(*)(xfs_log_item_t*))xfs_efi_item_pin, 226 .iop_pin = (void(*)(xfs_log_item_t*))xfs_efi_item_pin,
227 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efi_item_unpin, 227 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_efi_item_unpin,
228 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) 228 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
229 xfs_efi_item_unpin_remove, 229 xfs_efi_item_unpin_remove,
230 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock, 230 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock,
@@ -259,10 +259,7 @@ xfs_efi_init(xfs_mount_t *mp,
259 KM_SLEEP); 259 KM_SLEEP);
260 } 260 }
261 261
262 efip->efi_item.li_type = XFS_LI_EFI; 262 xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
263 efip->efi_item.li_ops = &xfs_efi_item_ops;
264 efip->efi_item.li_mountp = mp;
265 efip->efi_item.li_ailp = mp->m_ail;
266 efip->efi_format.efi_nextents = nextents; 263 efip->efi_format.efi_nextents = nextents;
267 efip->efi_format.efi_id = (__psint_t)(void*)efip; 264 efip->efi_format.efi_id = (__psint_t)(void*)efip;
268 265
@@ -428,7 +425,7 @@ xfs_efd_item_pin(xfs_efd_log_item_t *efdp)
428 */ 425 */
429/*ARGSUSED*/ 426/*ARGSUSED*/
430STATIC void 427STATIC void
431xfs_efd_item_unpin(xfs_efd_log_item_t *efdp, int stale) 428xfs_efd_item_unpin(xfs_efd_log_item_t *efdp)
432{ 429{
433 return; 430 return;
434} 431}
@@ -518,7 +515,7 @@ static struct xfs_item_ops xfs_efd_item_ops = {
518 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 515 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
519 xfs_efd_item_format, 516 xfs_efd_item_format,
520 .iop_pin = (void(*)(xfs_log_item_t*))xfs_efd_item_pin, 517 .iop_pin = (void(*)(xfs_log_item_t*))xfs_efd_item_pin,
521 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efd_item_unpin, 518 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_efd_item_unpin,
522 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) 519 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
523 xfs_efd_item_unpin_remove, 520 xfs_efd_item_unpin_remove,
524 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock, 521 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock,
@@ -554,10 +551,7 @@ xfs_efd_init(xfs_mount_t *mp,
554 KM_SLEEP); 551 KM_SLEEP);
555 } 552 }
556 553
557 efdp->efd_item.li_type = XFS_LI_EFD; 554 xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops);
558 efdp->efd_item.li_ops = &xfs_efd_item_ops;
559 efdp->efd_item.li_mountp = mp;
560 efdp->efd_item.li_ailp = mp->m_ail;
561 efdp->efd_efip = efip; 555 efdp->efd_efip = efip;
562 efdp->efd_format.efd_nextents = nextents; 556 efdp->efd_format.efd_nextents = nextents;
563 efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; 557 efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 0ffd56447045..8cd6e8d8fe9c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2449,6 +2449,8 @@ xfs_iunpin_nowait(
2449{ 2449{
2450 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2450 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2451 2451
2452 trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
2453
2452 /* Give the log a push to start the unpinning I/O */ 2454 /* Give the log a push to start the unpinning I/O */
2453 xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0); 2455 xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
2454 2456
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 7bfea8540159..cf8249a60004 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -543,6 +543,7 @@ xfs_inode_item_pin(
543{ 543{
544 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); 544 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL));
545 545
546 trace_xfs_inode_pin(iip->ili_inode, _RET_IP_);
546 atomic_inc(&iip->ili_inode->i_pincount); 547 atomic_inc(&iip->ili_inode->i_pincount);
547} 548}
548 549
@@ -556,11 +557,11 @@ xfs_inode_item_pin(
556/* ARGSUSED */ 557/* ARGSUSED */
557STATIC void 558STATIC void
558xfs_inode_item_unpin( 559xfs_inode_item_unpin(
559 xfs_inode_log_item_t *iip, 560 xfs_inode_log_item_t *iip)
560 int stale)
561{ 561{
562 struct xfs_inode *ip = iip->ili_inode; 562 struct xfs_inode *ip = iip->ili_inode;
563 563
564 trace_xfs_inode_unpin(ip, _RET_IP_);
564 ASSERT(atomic_read(&ip->i_pincount) > 0); 565 ASSERT(atomic_read(&ip->i_pincount) > 0);
565 if (atomic_dec_and_test(&ip->i_pincount)) 566 if (atomic_dec_and_test(&ip->i_pincount))
566 wake_up(&ip->i_ipin_wait); 567 wake_up(&ip->i_ipin_wait);
@@ -572,7 +573,7 @@ xfs_inode_item_unpin_remove(
572 xfs_inode_log_item_t *iip, 573 xfs_inode_log_item_t *iip,
573 xfs_trans_t *tp) 574 xfs_trans_t *tp)
574{ 575{
575 xfs_inode_item_unpin(iip, 0); 576 xfs_inode_item_unpin(iip);
576} 577}
577 578
578/* 579/*
@@ -838,7 +839,7 @@ static struct xfs_item_ops xfs_inode_item_ops = {
838 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 839 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
839 xfs_inode_item_format, 840 xfs_inode_item_format,
840 .iop_pin = (void(*)(xfs_log_item_t*))xfs_inode_item_pin, 841 .iop_pin = (void(*)(xfs_log_item_t*))xfs_inode_item_pin,
841 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_inode_item_unpin, 842 .iop_unpin = (void(*)(xfs_log_item_t*))xfs_inode_item_unpin,
842 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) 843 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
843 xfs_inode_item_unpin_remove, 844 xfs_inode_item_unpin_remove,
844 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock, 845 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock,
@@ -865,17 +866,9 @@ xfs_inode_item_init(
865 ASSERT(ip->i_itemp == NULL); 866 ASSERT(ip->i_itemp == NULL);
866 iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP); 867 iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP);
867 868
868 iip->ili_item.li_type = XFS_LI_INODE;
869 iip->ili_item.li_ops = &xfs_inode_item_ops;
870 iip->ili_item.li_mountp = mp;
871 iip->ili_item.li_ailp = mp->m_ail;
872 iip->ili_inode = ip; 869 iip->ili_inode = ip;
873 870 xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
874 /* 871 &xfs_inode_item_ops);
875 We have zeroed memory. No need ...
876 iip->ili_extents_buf = NULL;
877 */
878
879 iip->ili_format.ilf_type = XFS_LI_INODE; 872 iip->ili_format.ilf_type = XFS_LI_INODE;
880 iip->ili_format.ilf_ino = ip->i_ino; 873 iip->ili_format.ilf_ino = ip->i_ino;
881 iip->ili_format.ilf_blkno = ip->i_imap.im_blkno; 874 iip->ili_format.ilf_blkno = ip->i_imap.im_blkno;
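
The inode item change above, like the EFI/EFD changes before it, makes the same substitution everywhere: the hand-rolled four-field setup is replaced by the new xfs_log_item_init() helper, which also prepares the CIL state (li_lv and the li_ail/li_cil list heads) that delayed logging depends on. Schematically (a fragment; field names as in the patch):

	/* before: each log item type open-coded this */
	iip->ili_item.li_type   = XFS_LI_INODE;
	iip->ili_item.li_ops    = &xfs_inode_item_ops;
	iip->ili_item.li_mountp = mp;
	iip->ili_item.li_ailp   = mp->m_ail;

	/* after: one helper, which also initialises the CIL fields */
	xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
			  &xfs_inode_item_ops);
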
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 0b65039951a0..ef14943829da 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -55,71 +55,33 @@
55#define XFS_STRAT_WRITE_IMAPS 2 55#define XFS_STRAT_WRITE_IMAPS 2
56#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP 56#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
57 57
58STATIC int 58STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
59xfs_imap_to_bmap( 59 int, struct xfs_bmbt_irec *, int *);
60 xfs_inode_t *ip, 60STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
61 xfs_off_t offset, 61 struct xfs_bmbt_irec *, int *);
62 xfs_bmbt_irec_t *imap, 62STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
63 xfs_iomap_t *iomapp, 63 struct xfs_bmbt_irec *, int *);
64 int imaps, /* Number of imap entries */
65 int iomaps, /* Number of iomap entries */
66 int flags)
67{
68 xfs_mount_t *mp = ip->i_mount;
69 int pbm;
70 xfs_fsblock_t start_block;
71
72
73 for (pbm = 0; imaps && pbm < iomaps; imaps--, iomapp++, imap++, pbm++) {
74 iomapp->iomap_offset = XFS_FSB_TO_B(mp, imap->br_startoff);
75 iomapp->iomap_delta = offset - iomapp->iomap_offset;
76 iomapp->iomap_bsize = XFS_FSB_TO_B(mp, imap->br_blockcount);
77 iomapp->iomap_flags = flags;
78
79 if (XFS_IS_REALTIME_INODE(ip)) {
80 iomapp->iomap_flags |= IOMAP_REALTIME;
81 iomapp->iomap_target = mp->m_rtdev_targp;
82 } else {
83 iomapp->iomap_target = mp->m_ddev_targp;
84 }
85 start_block = imap->br_startblock;
86 if (start_block == HOLESTARTBLOCK) {
87 iomapp->iomap_bn = IOMAP_DADDR_NULL;
88 iomapp->iomap_flags |= IOMAP_HOLE;
89 } else if (start_block == DELAYSTARTBLOCK) {
90 iomapp->iomap_bn = IOMAP_DADDR_NULL;
91 iomapp->iomap_flags |= IOMAP_DELAY;
92 } else {
93 iomapp->iomap_bn = xfs_fsb_to_db(ip, start_block);
94 if (ISUNWRITTEN(imap))
95 iomapp->iomap_flags |= IOMAP_UNWRITTEN;
96 }
97
98 offset += iomapp->iomap_bsize - iomapp->iomap_delta;
99 }
100 return pbm; /* Return the number filled */
101}
102 64
103int 65int
104xfs_iomap( 66xfs_iomap(
105 xfs_inode_t *ip, 67 struct xfs_inode *ip,
106 xfs_off_t offset, 68 xfs_off_t offset,
107 ssize_t count, 69 ssize_t count,
108 int flags, 70 int flags,
109 xfs_iomap_t *iomapp, 71 struct xfs_bmbt_irec *imap,
110 int *niomaps) 72 int *nimaps,
73 int *new)
111{ 74{
112 xfs_mount_t *mp = ip->i_mount; 75 struct xfs_mount *mp = ip->i_mount;
113 xfs_fileoff_t offset_fsb, end_fsb; 76 xfs_fileoff_t offset_fsb, end_fsb;
114 int error = 0; 77 int error = 0;
115 int lockmode = 0; 78 int lockmode = 0;
116 xfs_bmbt_irec_t imap; 79 int bmapi_flags = 0;
117 int nimaps = 1;
118 int bmapi_flags = 0;
119 int iomap_flags = 0;
120 80
121 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); 81 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
122 82
83 *new = 0;
84
123 if (XFS_FORCED_SHUTDOWN(mp)) 85 if (XFS_FORCED_SHUTDOWN(mp))
124 return XFS_ERROR(EIO); 86 return XFS_ERROR(EIO);
125 87
@@ -160,8 +122,8 @@ xfs_iomap(
160 122
161 error = xfs_bmapi(NULL, ip, offset_fsb, 123 error = xfs_bmapi(NULL, ip, offset_fsb,
162 (xfs_filblks_t)(end_fsb - offset_fsb), 124 (xfs_filblks_t)(end_fsb - offset_fsb),
163 bmapi_flags, NULL, 0, &imap, 125 bmapi_flags, NULL, 0, imap,
164 &nimaps, NULL, NULL); 126 nimaps, NULL, NULL);
165 127
166 if (error) 128 if (error)
167 goto out; 129 goto out;
@@ -169,46 +131,41 @@ xfs_iomap(
169 switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) { 131 switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) {
170 case BMAPI_WRITE: 132 case BMAPI_WRITE:
171 /* If we found an extent, return it */ 133 /* If we found an extent, return it */
172 if (nimaps && 134 if (*nimaps &&
173 (imap.br_startblock != HOLESTARTBLOCK) && 135 (imap->br_startblock != HOLESTARTBLOCK) &&
174 (imap.br_startblock != DELAYSTARTBLOCK)) { 136 (imap->br_startblock != DELAYSTARTBLOCK)) {
175 trace_xfs_iomap_found(ip, offset, count, flags, &imap); 137 trace_xfs_iomap_found(ip, offset, count, flags, imap);
176 break; 138 break;
177 } 139 }
178 140
179 if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) { 141 if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) {
180 error = xfs_iomap_write_direct(ip, offset, count, flags, 142 error = xfs_iomap_write_direct(ip, offset, count, flags,
181 &imap, &nimaps, nimaps); 143 imap, nimaps);
182 } else { 144 } else {
183 error = xfs_iomap_write_delay(ip, offset, count, flags, 145 error = xfs_iomap_write_delay(ip, offset, count, flags,
184 &imap, &nimaps); 146 imap, nimaps);
185 } 147 }
186 if (!error) { 148 if (!error) {
187 trace_xfs_iomap_alloc(ip, offset, count, flags, &imap); 149 trace_xfs_iomap_alloc(ip, offset, count, flags, imap);
188 } 150 }
189 iomap_flags = IOMAP_NEW; 151 *new = 1;
190 break; 152 break;
191 case BMAPI_ALLOCATE: 153 case BMAPI_ALLOCATE:
192 /* If we found an extent, return it */ 154 /* If we found an extent, return it */
193 xfs_iunlock(ip, lockmode); 155 xfs_iunlock(ip, lockmode);
194 lockmode = 0; 156 lockmode = 0;
195 157
196 if (nimaps && !isnullstartblock(imap.br_startblock)) { 158 if (*nimaps && !isnullstartblock(imap->br_startblock)) {
197 trace_xfs_iomap_found(ip, offset, count, flags, &imap); 159 trace_xfs_iomap_found(ip, offset, count, flags, imap);
198 break; 160 break;
199 } 161 }
200 162
201 error = xfs_iomap_write_allocate(ip, offset, count, 163 error = xfs_iomap_write_allocate(ip, offset, count,
202 &imap, &nimaps); 164 imap, nimaps);
203 break; 165 break;
204 } 166 }
205 167
206 if (nimaps) { 168 ASSERT(*nimaps <= 1);
207 *niomaps = xfs_imap_to_bmap(ip, offset, &imap,
208 iomapp, nimaps, *niomaps, iomap_flags);
209 } else if (niomaps) {
210 *niomaps = 0;
211 }
212 169
213out: 170out:
214 if (lockmode) 171 if (lockmode)
@@ -216,7 +173,6 @@ out:
216 return XFS_ERROR(error); 173 return XFS_ERROR(error);
217} 174}
218 175
219
220STATIC int 176STATIC int
221xfs_iomap_eof_align_last_fsb( 177xfs_iomap_eof_align_last_fsb(
222 xfs_mount_t *mp, 178 xfs_mount_t *mp,
@@ -285,15 +241,14 @@ xfs_cmn_err_fsblock_zero(
285 return EFSCORRUPTED; 241 return EFSCORRUPTED;
286} 242}
287 243
288int 244STATIC int
289xfs_iomap_write_direct( 245xfs_iomap_write_direct(
290 xfs_inode_t *ip, 246 xfs_inode_t *ip,
291 xfs_off_t offset, 247 xfs_off_t offset,
292 size_t count, 248 size_t count,
293 int flags, 249 int flags,
294 xfs_bmbt_irec_t *ret_imap, 250 xfs_bmbt_irec_t *ret_imap,
295 int *nmaps, 251 int *nmaps)
296 int found)
297{ 252{
298 xfs_mount_t *mp = ip->i_mount; 253 xfs_mount_t *mp = ip->i_mount;
299 xfs_fileoff_t offset_fsb; 254 xfs_fileoff_t offset_fsb;
@@ -330,7 +285,7 @@ xfs_iomap_write_direct(
330 if (error) 285 if (error)
331 goto error_out; 286 goto error_out;
332 } else { 287 } else {
333 if (found && (ret_imap->br_startblock == HOLESTARTBLOCK)) 288 if (*nmaps && (ret_imap->br_startblock == HOLESTARTBLOCK))
334 last_fsb = MIN(last_fsb, (xfs_fileoff_t) 289 last_fsb = MIN(last_fsb, (xfs_fileoff_t)
335 ret_imap->br_blockcount + 290 ret_imap->br_blockcount +
336 ret_imap->br_startoff); 291 ret_imap->br_startoff);
@@ -485,7 +440,7 @@ xfs_iomap_eof_want_preallocate(
485 return 0; 440 return 0;
486} 441}
487 442
488int 443STATIC int
489xfs_iomap_write_delay( 444xfs_iomap_write_delay(
490 xfs_inode_t *ip, 445 xfs_inode_t *ip,
491 xfs_off_t offset, 446 xfs_off_t offset,
@@ -588,7 +543,7 @@ retry:
588 * We no longer bother to look at the incoming map - all we have to 543 * We no longer bother to look at the incoming map - all we have to
589 * guarantee is that whatever we allocate fills the required range. 544 * guarantee is that whatever we allocate fills the required range.
590 */ 545 */
591int 546STATIC int
592xfs_iomap_write_allocate( 547xfs_iomap_write_allocate(
593 xfs_inode_t *ip, 548 xfs_inode_t *ip,
594 xfs_off_t offset, 549 xfs_off_t offset,
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 174f29990991..81ac4afd45b3 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,19 +18,6 @@
18#ifndef __XFS_IOMAP_H__ 18#ifndef __XFS_IOMAP_H__
19#define __XFS_IOMAP_H__ 19#define __XFS_IOMAP_H__
20 20
21#define IOMAP_DADDR_NULL ((xfs_daddr_t) (-1LL))
22
23
24typedef enum { /* iomap_flags values */
25 IOMAP_READ = 0, /* mapping for a read */
26 IOMAP_HOLE = 0x02, /* mapping covers a hole */
27 IOMAP_DELAY = 0x04, /* mapping covers delalloc region */
28 IOMAP_REALTIME = 0x10, /* mapping on the realtime device */
29 IOMAP_UNWRITTEN = 0x20, /* mapping covers allocated */
30 /* but uninitialized file data */
31 IOMAP_NEW = 0x40 /* just allocate */
32} iomap_flags_t;
33
34typedef enum { 21typedef enum {
35 /* base extent manipulation calls */ 22 /* base extent manipulation calls */
36 BMAPI_READ = (1 << 0), /* read extents */ 23 BMAPI_READ = (1 << 0), /* read extents */
@@ -52,43 +39,11 @@ typedef enum {
52 { BMAPI_MMAP, "MMAP" }, \ 39 { BMAPI_MMAP, "MMAP" }, \
53 { BMAPI_TRYLOCK, "TRYLOCK" } 40 { BMAPI_TRYLOCK, "TRYLOCK" }
54 41
55/*
56 * xfs_iomap_t: File system I/O map
57 *
58 * The iomap_bn field is expressed in 512-byte blocks, and is where the
59 * mapping starts on disk.
60 *
61 * The iomap_offset, iomap_bsize and iomap_delta fields are in bytes.
62 * iomap_offset is the offset of the mapping in the file itself.
63 * iomap_bsize is the size of the mapping, iomap_delta is the
64 * desired data's offset into the mapping, given the offset supplied
65 * to the file I/O map routine.
66 *
67 * When a request is made to read beyond the logical end of the object,
68 * iomap_size may be set to 0, but iomap_offset and iomap_length should be set
69 * to the actual amount of underlying storage that has been allocated, if any.
70 */
71
72typedef struct xfs_iomap {
73 xfs_daddr_t iomap_bn; /* first 512B blk of mapping */
74 xfs_buftarg_t *iomap_target;
75 xfs_off_t iomap_offset; /* offset of mapping, bytes */
76 xfs_off_t iomap_bsize; /* size of mapping, bytes */
77 xfs_off_t iomap_delta; /* offset into mapping, bytes */
78 iomap_flags_t iomap_flags;
79} xfs_iomap_t;
80
81struct xfs_inode; 42struct xfs_inode;
82struct xfs_bmbt_irec; 43struct xfs_bmbt_irec;
83 44
84extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int, 45extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int,
85 struct xfs_iomap *, int *); 46 struct xfs_bmbt_irec *, int *, int *);
86extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
87 int, struct xfs_bmbt_irec *, int *, int);
88extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
89 struct xfs_bmbt_irec *, int *);
90extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
91 struct xfs_bmbt_irec *, int *);
92extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); 47extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
93 48
94#endif /* __XFS_IOMAP_H__*/ 49#endif /* __XFS_IOMAP_H__*/
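
With xfs_iomap_t gone, xfs_iomap() hands back the raw struct xfs_bmbt_irec plus a separate new-allocation flag, and callers do their own conversion to disk addresses. A hypothetical caller under the new signature (sketch only):

	struct xfs_bmbt_irec	imap;
	int			nimaps = 1;
	int			new_alloc = 0;
	int			error;

	error = xfs_iomap(ip, offset, count, BMAPI_WRITE,
			  &imap, &nimaps, &new_alloc);
	if (!error && nimaps) {
		/*
		 * imap.br_startoff/br_startblock/br_blockcount describe the
		 * mapping in filesystem blocks; the caller now applies the
		 * xfs_fsb_to_db()-style conversion that the removed
		 * xfs_imap_to_bmap() used to do.
		 */
	}
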
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 2be019136287..5215abc8023a 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -44,13 +44,8 @@
44 44
45kmem_zone_t *xfs_log_ticket_zone; 45kmem_zone_t *xfs_log_ticket_zone;
46 46
47#define xlog_write_adv_cnt(ptr, len, off, bytes) \
48 { (ptr) += (bytes); \
49 (len) -= (bytes); \
50 (off) += (bytes);}
51
52/* Local miscellaneous function prototypes */ 47/* Local miscellaneous function prototypes */
53STATIC int xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket, 48STATIC int xlog_commit_record(struct log *log, struct xlog_ticket *ticket,
54 xlog_in_core_t **, xfs_lsn_t *); 49 xlog_in_core_t **, xfs_lsn_t *);
55STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, 50STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
56 xfs_buftarg_t *log_target, 51 xfs_buftarg_t *log_target,
@@ -59,11 +54,6 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
59STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); 54STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes);
60STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); 55STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
61STATIC void xlog_dealloc_log(xlog_t *log); 56STATIC void xlog_dealloc_log(xlog_t *log);
62STATIC int xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[],
63 int nentries, struct xlog_ticket *tic,
64 xfs_lsn_t *start_lsn,
65 xlog_in_core_t **commit_iclog,
66 uint flags);
67 57
68/* local state machine functions */ 58/* local state machine functions */
69STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); 59STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
@@ -93,16 +83,8 @@ STATIC int xlog_regrant_write_log_space(xlog_t *log,
93STATIC void xlog_ungrant_log_space(xlog_t *log, 83STATIC void xlog_ungrant_log_space(xlog_t *log,
94 xlog_ticket_t *ticket); 84 xlog_ticket_t *ticket);
95 85
96
97/* local ticket functions */
98STATIC xlog_ticket_t *xlog_ticket_alloc(xlog_t *log,
99 int unit_bytes,
100 int count,
101 char clientid,
102 uint flags);
103
104#if defined(DEBUG) 86#if defined(DEBUG)
105STATIC void xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr); 87STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr);
106STATIC void xlog_verify_grant_head(xlog_t *log, int equals); 88STATIC void xlog_verify_grant_head(xlog_t *log, int equals);
107STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, 89STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
108 int count, boolean_t syncing); 90 int count, boolean_t syncing);
@@ -258,7 +240,7 @@ xfs_log_done(
258 * If we get an error, just continue and give back the log ticket. 240 * If we get an error, just continue and give back the log ticket.
259 */ 241 */
260 (((ticket->t_flags & XLOG_TIC_INITED) == 0) && 242 (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
261 (xlog_commit_record(mp, ticket, iclog, &lsn)))) { 243 (xlog_commit_record(log, ticket, iclog, &lsn)))) {
262 lsn = (xfs_lsn_t) -1; 244 lsn = (xfs_lsn_t) -1;
263 if (ticket->t_flags & XLOG_TIC_PERM_RESERV) { 245 if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
264 flags |= XFS_LOG_REL_PERM_RESERV; 246 flags |= XFS_LOG_REL_PERM_RESERV;
@@ -367,6 +349,15 @@ xfs_log_reserve(
367 ASSERT(flags & XFS_LOG_PERM_RESERV); 349 ASSERT(flags & XFS_LOG_PERM_RESERV);
368 internal_ticket = *ticket; 350 internal_ticket = *ticket;
369 351
352 /*
353 * this is a new transaction on the ticket, so we need to
354 * change the transaction ID so that the next transaction has a
355 * different TID in the log. Just add one to the existing tid
356 * so that we can see chains of rolling transactions in the log
357 * easily.
358 */
359 internal_ticket->t_tid++;
360
370 trace_xfs_log_reserve(log, internal_ticket); 361 trace_xfs_log_reserve(log, internal_ticket);
371 362
372 xlog_grant_push_ail(mp, internal_ticket->t_unit_res); 363 xlog_grant_push_ail(mp, internal_ticket->t_unit_res);
@@ -374,7 +365,8 @@ xfs_log_reserve(
374 } else { 365 } else {
375 /* may sleep if need to allocate more tickets */ 366 /* may sleep if need to allocate more tickets */
376 internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt, 367 internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt,
377 client, flags); 368 client, flags,
369 KM_SLEEP|KM_MAYFAIL);
378 if (!internal_ticket) 370 if (!internal_ticket)
379 return XFS_ERROR(ENOMEM); 371 return XFS_ERROR(ENOMEM);
380 internal_ticket->t_trans_type = t_type; 372 internal_ticket->t_trans_type = t_type;
@@ -459,6 +451,13 @@ xfs_log_mount(
459 /* Normal transactions can now occur */ 451 /* Normal transactions can now occur */
460 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; 452 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
461 453
454 /*
 455 * Now the log has been fully initialised and we know where our
456 * space grant counters are, we can initialise the permanent ticket
457 * needed for delayed logging to work.
458 */
459 xlog_cil_init_post_recovery(mp->m_log);
460
462 return 0; 461 return 0;
463 462
464out_destroy_ail: 463out_destroy_ail:
@@ -516,18 +515,10 @@ xfs_log_unmount_write(xfs_mount_t *mp)
516#ifdef DEBUG 515#ifdef DEBUG
517 xlog_in_core_t *first_iclog; 516 xlog_in_core_t *first_iclog;
518#endif 517#endif
519 xfs_log_iovec_t reg[1];
520 xlog_ticket_t *tic = NULL; 518 xlog_ticket_t *tic = NULL;
521 xfs_lsn_t lsn; 519 xfs_lsn_t lsn;
522 int error; 520 int error;
523 521
524 /* the data section must be 32 bit size aligned */
525 struct {
526 __uint16_t magic;
527 __uint16_t pad1;
528 __uint32_t pad2; /* may as well make it 64 bits */
529 } magic = { XLOG_UNMOUNT_TYPE, 0, 0 };
530
531 /* 522 /*
532 * Don't write out unmount record on read-only mounts. 523 * Don't write out unmount record on read-only mounts.
533 * Or, if we are doing a forced umount (typically because of IO errors). 524 * Or, if we are doing a forced umount (typically because of IO errors).
@@ -549,16 +540,30 @@ xfs_log_unmount_write(xfs_mount_t *mp)
549 } while (iclog != first_iclog); 540 } while (iclog != first_iclog);
550#endif 541#endif
551 if (! (XLOG_FORCED_SHUTDOWN(log))) { 542 if (! (XLOG_FORCED_SHUTDOWN(log))) {
552 reg[0].i_addr = (void*)&magic;
553 reg[0].i_len = sizeof(magic);
554 reg[0].i_type = XLOG_REG_TYPE_UNMOUNT;
555
556 error = xfs_log_reserve(mp, 600, 1, &tic, 543 error = xfs_log_reserve(mp, 600, 1, &tic,
557 XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE); 544 XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE);
558 if (!error) { 545 if (!error) {
546 /* the data section must be 32 bit size aligned */
547 struct {
548 __uint16_t magic;
549 __uint16_t pad1;
550 __uint32_t pad2; /* may as well make it 64 bits */
551 } magic = {
552 .magic = XLOG_UNMOUNT_TYPE,
553 };
554 struct xfs_log_iovec reg = {
555 .i_addr = (void *)&magic,
556 .i_len = sizeof(magic),
557 .i_type = XLOG_REG_TYPE_UNMOUNT,
558 };
559 struct xfs_log_vec vec = {
560 .lv_niovecs = 1,
561 .lv_iovecp = &reg,
562 };
563
559 /* remove inited flag */ 564 /* remove inited flag */
560 ((xlog_ticket_t *)tic)->t_flags = 0; 565 tic->t_flags = 0;
561 error = xlog_write(mp, reg, 1, tic, &lsn, 566 error = xlog_write(log, &vec, tic, &lsn,
562 NULL, XLOG_UNMOUNT_TRANS); 567 NULL, XLOG_UNMOUNT_TRANS);
563 /* 568 /*
564 * At this point, we're umounting anyway, 569 * At this point, we're umounting anyway,
@@ -648,10 +653,30 @@ xfs_log_unmount(xfs_mount_t *mp)
648 xlog_dealloc_log(mp->m_log); 653 xlog_dealloc_log(mp->m_log);
649} 654}
650 655
656void
657xfs_log_item_init(
658 struct xfs_mount *mp,
659 struct xfs_log_item *item,
660 int type,
661 struct xfs_item_ops *ops)
662{
663 item->li_mountp = mp;
664 item->li_ailp = mp->m_ail;
665 item->li_type = type;
666 item->li_ops = ops;
667 item->li_lv = NULL;
668
669 INIT_LIST_HEAD(&item->li_ail);
670 INIT_LIST_HEAD(&item->li_cil);
671}
672
651/* 673/*
652 * Write region vectors to log. The write happens using the space reservation 674 * Write region vectors to log. The write happens using the space reservation
653 * of the ticket (tic). It is not a requirement that all writes for a given 675 * of the ticket (tic). It is not a requirement that all writes for a given
654 * transaction occur with one call to xfs_log_write(). 676 * transaction occur with one call to xfs_log_write(). However, it is important
677 * to note that the transaction reservation code makes an assumption about the
678 * number of log headers a transaction requires that may be violated if you
679 * don't pass all the transaction vectors in one call....
655 */ 680 */
656int 681int
657xfs_log_write( 682xfs_log_write(
@@ -663,11 +688,15 @@ xfs_log_write(
663{ 688{
664 struct log *log = mp->m_log; 689 struct log *log = mp->m_log;
665 int error; 690 int error;
691 struct xfs_log_vec vec = {
692 .lv_niovecs = nentries,
693 .lv_iovecp = reg,
694 };
666 695
667 if (XLOG_FORCED_SHUTDOWN(log)) 696 if (XLOG_FORCED_SHUTDOWN(log))
668 return XFS_ERROR(EIO); 697 return XFS_ERROR(EIO);
669 698
670 error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0); 699 error = xlog_write(log, &vec, tic, start_lsn, NULL, 0);
671 if (error) 700 if (error)
672 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 701 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
673 return error; 702 return error;
@@ -1020,6 +1049,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1020 int i; 1049 int i;
1021 int iclogsize; 1050 int iclogsize;
1022 int error = ENOMEM; 1051 int error = ENOMEM;
1052 uint log2_size = 0;
1023 1053
1024 log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL); 1054 log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL);
1025 if (!log) { 1055 if (!log) {
@@ -1045,29 +1075,30 @@ xlog_alloc_log(xfs_mount_t *mp,
1045 1075
1046 error = EFSCORRUPTED; 1076 error = EFSCORRUPTED;
1047 if (xfs_sb_version_hassector(&mp->m_sb)) { 1077 if (xfs_sb_version_hassector(&mp->m_sb)) {
1048 log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT; 1078 log2_size = mp->m_sb.sb_logsectlog;
1049 if (log->l_sectbb_log < 0 || 1079 if (log2_size < BBSHIFT) {
1050 log->l_sectbb_log > mp->m_sectbb_log) { 1080 xlog_warn("XFS: Log sector size too small "
1051 xlog_warn("XFS: Log sector size (0x%x) out of range.", 1081 "(0x%x < 0x%x)", log2_size, BBSHIFT);
1052 log->l_sectbb_log);
1053 goto out_free_log; 1082 goto out_free_log;
1054 } 1083 }
1055 1084
1056 /* for larger sector sizes, must have v2 or external log */ 1085 log2_size -= BBSHIFT;
1057 if (log->l_sectbb_log != 0 && 1086 if (log2_size > mp->m_sectbb_log) {
1058 (log->l_logBBstart != 0 && 1087 xlog_warn("XFS: Log sector size too large "
1059 !xfs_sb_version_haslogv2(&mp->m_sb))) { 1088 "(0x%x > 0x%x)", log2_size, mp->m_sectbb_log);
1060 xlog_warn("XFS: log sector size (0x%x) invalid "
1061 "for configuration.", log->l_sectbb_log);
1062 goto out_free_log; 1089 goto out_free_log;
1063 } 1090 }
1064 if (mp->m_sb.sb_logsectlog < BBSHIFT) { 1091
1065 xlog_warn("XFS: Log sector log (0x%x) too small.", 1092 /* for larger sector sizes, must have v2 or external log */
1066 mp->m_sb.sb_logsectlog); 1093 if (log2_size && log->l_logBBstart > 0 &&
1094 !xfs_sb_version_haslogv2(&mp->m_sb)) {
1095
1096 xlog_warn("XFS: log sector size (0x%x) invalid "
1097 "for configuration.", log2_size);
1067 goto out_free_log; 1098 goto out_free_log;
1068 } 1099 }
1069 } 1100 }
1070 log->l_sectbb_mask = (1 << log->l_sectbb_log) - 1; 1101 log->l_sectBBsize = 1 << log2_size;
1071 1102
1072 xlog_get_iclog_buffer_size(mp, log); 1103 xlog_get_iclog_buffer_size(mp, log);
1073 1104
@@ -1147,6 +1178,9 @@ xlog_alloc_log(xfs_mount_t *mp,
1147 *iclogp = log->l_iclog; /* complete ring */ 1178 *iclogp = log->l_iclog; /* complete ring */
1148 log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ 1179 log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */
1149 1180
1181 error = xlog_cil_init(log);
1182 if (error)
1183 goto out_free_iclog;
1150 return log; 1184 return log;
1151 1185
1152out_free_iclog: 1186out_free_iclog:
@@ -1174,26 +1208,31 @@ out:
1174 * ticket. Return the lsn of the commit record. 1208 * ticket. Return the lsn of the commit record.
1175 */ 1209 */
1176STATIC int 1210STATIC int
1177xlog_commit_record(xfs_mount_t *mp, 1211xlog_commit_record(
1178 xlog_ticket_t *ticket, 1212 struct log *log,
1179 xlog_in_core_t **iclog, 1213 struct xlog_ticket *ticket,
1180 xfs_lsn_t *commitlsnp) 1214 struct xlog_in_core **iclog,
1215 xfs_lsn_t *commitlsnp)
1181{ 1216{
1182 int error; 1217 struct xfs_mount *mp = log->l_mp;
1183 xfs_log_iovec_t reg[1]; 1218 int error;
1184 1219 struct xfs_log_iovec reg = {
1185 reg[0].i_addr = NULL; 1220 .i_addr = NULL,
1186 reg[0].i_len = 0; 1221 .i_len = 0,
1187 reg[0].i_type = XLOG_REG_TYPE_COMMIT; 1222 .i_type = XLOG_REG_TYPE_COMMIT,
1223 };
1224 struct xfs_log_vec vec = {
1225 .lv_niovecs = 1,
1226 .lv_iovecp = &reg,
1227 };
1188 1228
1189 ASSERT_ALWAYS(iclog); 1229 ASSERT_ALWAYS(iclog);
1190 if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp, 1230 error = xlog_write(log, &vec, ticket, commitlsnp, iclog,
1191 iclog, XLOG_COMMIT_TRANS))) { 1231 XLOG_COMMIT_TRANS);
1232 if (error)
1192 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 1233 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
1193 }
1194 return error; 1234 return error;
1195} /* xlog_commit_record */ 1235}
1196
1197 1236
1198/* 1237/*
1199 * Push on the buffer cache code if we ever use more than 75% of the on-disk 1238 * Push on the buffer cache code if we ever use more than 75% of the on-disk
@@ -1468,6 +1507,8 @@ xlog_dealloc_log(xlog_t *log)
1468 xlog_in_core_t *iclog, *next_iclog; 1507 xlog_in_core_t *iclog, *next_iclog;
1469 int i; 1508 int i;
1470 1509
1510 xlog_cil_destroy(log);
1511
1471 iclog = log->l_iclog; 1512 iclog = log->l_iclog;
1472 for (i=0; i<log->l_iclog_bufs; i++) { 1513 for (i=0; i<log->l_iclog_bufs; i++) {
1473 sv_destroy(&iclog->ic_force_wait); 1514 sv_destroy(&iclog->ic_force_wait);
@@ -1510,8 +1551,10 @@ xlog_state_finish_copy(xlog_t *log,
1510 * print out info relating to regions written which consume 1551 * print out info relating to regions written which consume
1511 * the reservation 1552 * the reservation
1512 */ 1553 */
1513STATIC void 1554void
1514xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) 1555xlog_print_tic_res(
1556 struct xfs_mount *mp,
1557 struct xlog_ticket *ticket)
1515{ 1558{
1516 uint i; 1559 uint i;
1517 uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); 1560 uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
@@ -1611,6 +1654,196 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
 			    "bad-rtype" : res_type_str[r_type-1]),
 			    ticket->t_res_arr[i].r_len);
 	}
+
+	xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
+		"xfs_log_write: reservation ran out. Need to up reservation");
+	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+}
+
+/*
+ * Calculate the potential space needed by the log vector.  Each region gets
+ * its own xlog_op_header_t and may need to be double word aligned.
+ */
+static int
+xlog_write_calc_vec_length(
+	struct xlog_ticket	*ticket,
+	struct xfs_log_vec	*log_vector)
+{
+	struct xfs_log_vec	*lv;
+	int			headers = 0;
+	int			len = 0;
+	int			i;
+
+	/* acct for start rec of xact */
+	if (ticket->t_flags & XLOG_TIC_INITED)
+		headers++;
+
+	for (lv = log_vector; lv; lv = lv->lv_next) {
+		headers += lv->lv_niovecs;
+
+		for (i = 0; i < lv->lv_niovecs; i++) {
+			struct xfs_log_iovec	*vecp = &lv->lv_iovecp[i];
+
+			len += vecp->i_len;
+			xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type);
+		}
+	}
+
+	ticket->t_res_num_ophdrs += headers;
+	len += headers * sizeof(struct xlog_op_header);
+
+	return len;
+}
+
+/*
+ * If this is the first write for the transaction, insert the start record.
+ * We can't be trying to commit if we are inited.  We can't have any
+ * "partial_copy" if we are inited.
+ */
+static int
+xlog_write_start_rec(
+	struct xlog_op_header	*ophdr,
+	struct xlog_ticket	*ticket)
+{
+	if (!(ticket->t_flags & XLOG_TIC_INITED))
+		return 0;
+
+	ophdr->oh_tid	= cpu_to_be32(ticket->t_tid);
+	ophdr->oh_clientid = ticket->t_clientid;
+	ophdr->oh_len = 0;
+	ophdr->oh_flags = XLOG_START_TRANS;
+	ophdr->oh_res2 = 0;
+
+	ticket->t_flags &= ~XLOG_TIC_INITED;
+
+	return sizeof(struct xlog_op_header);
+}
+
+static xlog_op_header_t *
+xlog_write_setup_ophdr(
+	struct log		*log,
+	struct xlog_op_header	*ophdr,
+	struct xlog_ticket	*ticket,
+	uint			flags)
+{
+	ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+	ophdr->oh_clientid = ticket->t_clientid;
+	ophdr->oh_res2 = 0;
+
+	/* are we copying a commit or unmount record? */
+	ophdr->oh_flags = flags;
+
+	/*
+	 * We've seen logs corrupted with bad transaction client ids.  This
+	 * makes sure that XFS doesn't generate them on.  Turn this into an EIO
+	 * and shut down the filesystem.
+	 */
+	switch (ophdr->oh_clientid) {
+	case XFS_TRANSACTION:
+	case XFS_VOLUME:
+	case XFS_LOG:
+		break;
+	default:
+		xfs_fs_cmn_err(CE_WARN, log->l_mp,
+			"Bad XFS transaction clientid 0x%x in ticket 0x%p",
+			ophdr->oh_clientid, ticket);
+		return NULL;
+	}
+
+	return ophdr;
+}
+
+/*
+ * Set up the parameters of the region copy into the log.  This has to handle
+ * a region write split across multiple log buffers - this state is kept
+ * external to this function so that the code can be written in an obvious,
+ * self documenting manner.
+ */
+static int
+xlog_write_setup_copy(
+	struct xlog_ticket	*ticket,
+	struct xlog_op_header	*ophdr,
+	int			space_available,
+	int			space_required,
+	int			*copy_off,
+	int			*copy_len,
+	int			*last_was_partial_copy,
+	int			*bytes_consumed)
+{
+	int			still_to_copy;
+
+	still_to_copy = space_required - *bytes_consumed;
+	*copy_off = *bytes_consumed;
+
+	if (still_to_copy <= space_available) {
+		/* write of region completes here */
+		*copy_len = still_to_copy;
+		ophdr->oh_len = cpu_to_be32(*copy_len);
+		if (*last_was_partial_copy)
+			ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
+		*last_was_partial_copy = 0;
+		*bytes_consumed = 0;
+		return 0;
+	}
+
+	/* partial write of region, needs extra log op header reservation */
+	*copy_len = space_available;
+	ophdr->oh_len = cpu_to_be32(*copy_len);
+	ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
+	if (*last_was_partial_copy)
+		ophdr->oh_flags |= XLOG_WAS_CONT_TRANS;
+	*bytes_consumed += *copy_len;
+	(*last_was_partial_copy)++;
+
+	/* account for new log op header */
+	ticket->t_curr_res -= sizeof(struct xlog_op_header);
+	ticket->t_res_num_ophdrs++;
+
+	return sizeof(struct xlog_op_header);
+}
+
+static int
+xlog_write_copy_finish(
+	struct log		*log,
+	struct xlog_in_core	*iclog,
+	uint			flags,
+	int			*record_cnt,
+	int			*data_cnt,
+	int			*partial_copy,
+	int			*partial_copy_len,
+	int			log_offset,
+	struct xlog_in_core	**commit_iclog)
+{
+	if (*partial_copy) {
+		/*
+		 * This iclog has already been marked WANT_SYNC by
+		 * xlog_state_get_iclog_space.
+		 */
+		xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
+		*record_cnt = 0;
+		*data_cnt = 0;
+		return xlog_state_release_iclog(log, iclog);
+	}
+
+	*partial_copy = 0;
+	*partial_copy_len = 0;
+
+	if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
+		/* no more space in this iclog - push it. */
+		xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
+		*record_cnt = 0;
+		*data_cnt = 0;
+
+		spin_lock(&log->l_icloglock);
+		xlog_state_want_sync(log, iclog);
+		spin_unlock(&log->l_icloglock);
+
+		if (!commit_iclog)
+			return xlog_state_release_iclog(log, iclog);
+		ASSERT(flags & XLOG_COMMIT_TRANS);
+		*commit_iclog = iclog;
+	}
+
+	return 0;
 }
 
 /*
@@ -1653,211 +1886,163 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
  * we don't update ic_offset until the end when we know exactly how many
  * bytes have been written out.
  */
-STATIC int
+int
 xlog_write(
-	struct xfs_mount	*mp,
-	struct xfs_log_iovec	reg[],
-	int			nentries,
+	struct log		*log,
+	struct xfs_log_vec	*log_vector,
 	struct xlog_ticket	*ticket,
 	xfs_lsn_t		*start_lsn,
 	struct xlog_in_core	**commit_iclog,
 	uint			flags)
 {
-	xlog_t		*log = mp->m_log;
-	xlog_in_core_t	*iclog = NULL;	 /* ptr to current in-core log */
-	xlog_op_header_t *logop_head;	 /* ptr to log operation header */
-	__psint_t	 ptr;		 /* copy address into data region */
-	int		 len;		 /* # xlog_write() bytes 2 still copy */
-	int		 index;		 /* region index currently copying */
-	int		 log_offset;	 /* offset (from 0) into data region */
-	int		 start_rec_copy; /* # bytes to copy for start record */
-	int		 partial_copy;	 /* did we split a region? */
-	int		 partial_copy_len;/* # bytes copied if split region */
-	int		 need_copy;	 /* # bytes need to memcpy this region */
-	int		 copy_len;	 /* # bytes actually memcpy'ing */
-	int		 copy_off;	 /* # bytes from entry start */
-	int		 contwr;	 /* continued write of in-core log? */
-	int		 error;
-	int		 record_cnt = 0, data_cnt = 0;
-
-	partial_copy_len = partial_copy = 0;
-
-	/* Calculate potential maximum space.  Each region gets its own
-	 * xlog_op_header_t and may need to be double word aligned.
-	 */
-	len = 0;
-	if (ticket->t_flags & XLOG_TIC_INITED) {    /* acct for start rec of xact */
-		len += sizeof(xlog_op_header_t);
-		ticket->t_res_num_ophdrs++;
-	}
-
-	for (index = 0; index < nentries; index++) {
-		len += sizeof(xlog_op_header_t);    /* each region gets >= 1 */
-		ticket->t_res_num_ophdrs++;
-		len += reg[index].i_len;
-		xlog_tic_add_region(ticket, reg[index].i_len, reg[index].i_type);
-	}
-	contwr = *start_lsn = 0;
-
-	if (ticket->t_curr_res < len) {
-		xlog_print_tic_res(mp, ticket);
-#ifdef DEBUG
-		xlog_panic(
-	"xfs_log_write: reservation ran out. Need to up reservation");
-#else
-		/* Customer configurable panic */
-		xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
-	"xfs_log_write: reservation ran out. Need to up reservation");
-		/* If we did not panic, shutdown the filesystem */
-		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-#endif
-	} else
-		ticket->t_curr_res -= len;
-
-	for (index = 0; index < nentries; ) {
-		if ((error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
-						  &contwr, &log_offset)))
-			return error;
-
-		ASSERT(log_offset <= iclog->ic_size - 1);
-		ptr = (__psint_t) ((char *)iclog->ic_datap+log_offset);
-
-		/* start_lsn is the first lsn written to. That's all we need. */
-		if (! *start_lsn)
-			*start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
-
-		/* This loop writes out as many regions as can fit in the amount
-		 * of space which was allocated by xlog_state_get_iclog_space().
-		 */
-		while (index < nentries) {
-			ASSERT(reg[index].i_len % sizeof(__int32_t) == 0);
-			ASSERT((__psint_t)ptr % sizeof(__int32_t) == 0);
-			start_rec_copy = 0;
-
-			/* If first write for transaction, insert start record.
-			 * We can't be trying to commit if we are inited.  We can't
-			 * have any "partial_copy" if we are inited.
-			 */
-			if (ticket->t_flags & XLOG_TIC_INITED) {
-				logop_head		= (xlog_op_header_t *)ptr;
-				logop_head->oh_tid	= cpu_to_be32(ticket->t_tid);
-				logop_head->oh_clientid	= ticket->t_clientid;
-				logop_head->oh_len	= 0;
-				logop_head->oh_flags	= XLOG_START_TRANS;
-				logop_head->oh_res2	= 0;
-				ticket->t_flags		&= ~XLOG_TIC_INITED;	/* clear bit */
-				record_cnt++;
-
-				start_rec_copy = sizeof(xlog_op_header_t);
-				xlog_write_adv_cnt(ptr, len, log_offset, start_rec_copy);
-			}
-
-			/* Copy log operation header directly into data section */
-			logop_head			= (xlog_op_header_t *)ptr;
-			logop_head->oh_tid		= cpu_to_be32(ticket->t_tid);
-			logop_head->oh_clientid		= ticket->t_clientid;
-			logop_head->oh_res2		= 0;
-
-			/* header copied directly */
-			xlog_write_adv_cnt(ptr, len, log_offset, sizeof(xlog_op_header_t));
-
-			/* are we copying a commit or unmount record? */
-			logop_head->oh_flags = flags;
-
-			/*
-			 * We've seen logs corrupted with bad transaction client
-			 * ids.  This makes sure that XFS doesn't generate them on.
-			 * Turn this into an EIO and shut down the filesystem.
-			 */
-			switch (logop_head->oh_clientid)  {
-			case XFS_TRANSACTION:
-			case XFS_VOLUME:
-			case XFS_LOG:
-				break;
-			default:
-				xfs_fs_cmn_err(CE_WARN, mp,
-					"Bad XFS transaction clientid 0x%x in ticket 0x%p",
-					logop_head->oh_clientid, ticket);
-				return XFS_ERROR(EIO);
-			}
-
-			/* Partial write last time? => (partial_copy != 0)
-			 * need_copy is the amount we'd like to copy if everything could
-			 * fit in the current memcpy.
-			 */
-			need_copy = reg[index].i_len - partial_copy_len;
-
-			copy_off = partial_copy_len;
-			if (need_copy <= iclog->ic_size - log_offset) { /* complete write */
-				copy_len = need_copy;
-				logop_head->oh_len = cpu_to_be32(copy_len);
-				if (partial_copy)
-					logop_head->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
-				partial_copy_len = partial_copy = 0;
-			} else {					/* partial write */
-				copy_len = iclog->ic_size - log_offset;
-				logop_head->oh_len = cpu_to_be32(copy_len);
-				logop_head->oh_flags |= XLOG_CONTINUE_TRANS;
-				if (partial_copy)
-					logop_head->oh_flags |= XLOG_WAS_CONT_TRANS;
-				partial_copy_len += copy_len;
-				partial_copy++;
-				len += sizeof(xlog_op_header_t); /* from splitting of region */
-				/* account for new log op header */
-				ticket->t_curr_res -= sizeof(xlog_op_header_t);
-				ticket->t_res_num_ophdrs++;
-			}
-			xlog_verify_dest_ptr(log, ptr);
-
-			/* copy region */
-			ASSERT(copy_len >= 0);
-			memcpy((xfs_caddr_t)ptr, reg[index].i_addr + copy_off, copy_len);
-			xlog_write_adv_cnt(ptr, len, log_offset, copy_len);
-
-			/* make copy_len total bytes copied, including headers */
-			copy_len += start_rec_copy + sizeof(xlog_op_header_t);
-			record_cnt++;
-			data_cnt += contwr ? copy_len : 0;
-			if (partial_copy) {			/* copied partial region */
-				/* already marked WANT_SYNC by xlog_state_get_iclog_space */
-				xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
-				record_cnt = data_cnt = 0;
-				if ((error = xlog_state_release_iclog(log, iclog)))
-					return error;
-				break;			/* don't increment index */
-			} else {				/* copied entire region */
-				index++;
-				partial_copy_len = partial_copy = 0;
-
-				if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
-					xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
-					record_cnt = data_cnt = 0;
-					spin_lock(&log->l_icloglock);
-					xlog_state_want_sync(log, iclog);
-					spin_unlock(&log->l_icloglock);
-					if (commit_iclog) {
-						ASSERT(flags & XLOG_COMMIT_TRANS);
-						*commit_iclog = iclog;
-					} else if ((error = xlog_state_release_iclog(log, iclog)))
-						return error;
-					if (index == nentries)
-						return 0;	/* we are done */
-					else
-						break;
-				}
-			}	/* if (partial_copy) */
-		}	/* while (index < nentries) */
-	}	/* for (index = 0; index < nentries; ) */
-	ASSERT(len == 0);
+	struct xlog_in_core	*iclog = NULL;
+	struct xfs_log_iovec	*vecp;
+	struct xfs_log_vec	*lv;
+	int			len;
+	int			index;
+	int			partial_copy = 0;
+	int			partial_copy_len = 0;
+	int			contwr = 0;
+	int			record_cnt = 0;
+	int			data_cnt = 0;
+	int			error;
+
+	*start_lsn = 0;
+
+	len = xlog_write_calc_vec_length(ticket, log_vector);
+	if (log->l_cilp) {
+		/*
+		 * Region headers and bytes are already accounted for.
+		 * We only need to take into account start records and
+		 * split regions in this function.
+		 */
+		if (ticket->t_flags & XLOG_TIC_INITED)
+			ticket->t_curr_res -= sizeof(xlog_op_header_t);
+
+		/*
+		 * Commit record headers need to be accounted for. These
+		 * come in as separate writes so are easy to detect.
+		 */
+		if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
+			ticket->t_curr_res -= sizeof(xlog_op_header_t);
+	} else
+		ticket->t_curr_res -= len;
+
+	if (ticket->t_curr_res < 0)
+		xlog_print_tic_res(log->l_mp, ticket);
+
+	index = 0;
+	lv = log_vector;
+	vecp = lv->lv_iovecp;
+	while (lv && index < lv->lv_niovecs) {
+		void		*ptr;
+		int		log_offset;
+
+		error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
+						   &contwr, &log_offset);
+		if (error)
+			return error;
+
+		ASSERT(log_offset <= iclog->ic_size - 1);
+		ptr = iclog->ic_datap + log_offset;
+
+		/* start_lsn is the first lsn written to. That's all we need. */
+		if (!*start_lsn)
+			*start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+
+		/*
+		 * This loop writes out as many regions as can fit in the amount
+		 * of space which was allocated by xlog_state_get_iclog_space().
+		 */
+		while (lv && index < lv->lv_niovecs) {
+			struct xfs_log_iovec	*reg = &vecp[index];
+			struct xlog_op_header	*ophdr;
+			int			start_rec_copy;
+			int			copy_len;
+			int			copy_off;
+
+			ASSERT(reg->i_len % sizeof(__int32_t) == 0);
+			ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0);
+
+			start_rec_copy = xlog_write_start_rec(ptr, ticket);
+			if (start_rec_copy) {
+				record_cnt++;
+				xlog_write_adv_cnt(&ptr, &len, &log_offset,
+						   start_rec_copy);
+			}
+
+			ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags);
+			if (!ophdr)
+				return XFS_ERROR(EIO);
+
+			xlog_write_adv_cnt(&ptr, &len, &log_offset,
+					   sizeof(struct xlog_op_header));
+
+			len += xlog_write_setup_copy(ticket, ophdr,
+						     iclog->ic_size-log_offset,
+						     reg->i_len,
+						     &copy_off, &copy_len,
+						     &partial_copy,
+						     &partial_copy_len);
+			xlog_verify_dest_ptr(log, ptr);
+
+			/* copy region */
+			ASSERT(copy_len >= 0);
+			memcpy(ptr, reg->i_addr + copy_off, copy_len);
+			xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len);
+
+			copy_len += start_rec_copy + sizeof(xlog_op_header_t);
+			record_cnt++;
+			data_cnt += contwr ? copy_len : 0;
+
+			error = xlog_write_copy_finish(log, iclog, flags,
+						       &record_cnt, &data_cnt,
+						       &partial_copy,
+						       &partial_copy_len,
+						       log_offset,
+						       commit_iclog);
+			if (error)
+				return error;
+
+			/*
+			 * if we had a partial copy, we need to get more iclog
+			 * space but we don't want to increment the region
+			 * index because there is still more in this region to
+			 * write.
+			 *
+			 * If we completed writing this region, and we flushed
+			 * the iclog (indicated by resetting of the record
+			 * count), then we also need to get more log space. If
+			 * this was the last record, though, we are done and
+			 * can just return.
+			 */
+			if (partial_copy)
+				break;
+
+			if (++index == lv->lv_niovecs) {
+				lv = lv->lv_next;
+				index = 0;
+				if (lv)
+					vecp = lv->lv_iovecp;
+			}
+			if (record_cnt == 0) {
+				if (!lv)
+					return 0;
+				break;
+			}
+		}
+	}
+
+	ASSERT(len == 0);
 
 	xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
-	if (commit_iclog) {
-		ASSERT(flags & XLOG_COMMIT_TRANS);
-		*commit_iclog = iclog;
-		return 0;
-	}
-	return xlog_state_release_iclog(log, iclog);
-}	/* xlog_write */
+	if (!commit_iclog)
+		return xlog_state_release_iclog(log, iclog);
 
+	ASSERT(flags & XLOG_COMMIT_TRANS);
+	*commit_iclog = iclog;
+	return 0;
+}
 
 /*****************************************************************************
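
The split-region state that xlog_write() used to juggle inline now lives in xlog_write_setup_copy(), which decides per pass whether a region completes in the current iclog or must be continued in the next one with a fresh op header. The following standalone C sketch models only that decision; it is illustrative, not part of the patch: setup_copy(), OPHDR_SIZE and the byte counts in main() are invented stand-ins for the kernel's function, sizeof(xlog_op_header_t) and real region sizes.

#include <stdio.h>

#define OPHDR_SIZE 16	/* stand-in for sizeof(xlog_op_header_t) */

/* Returns the extra reservation consumed: zero when the region completes,
 * one op header's worth when the copy is split and must be continued. */
static int setup_copy(int space_available, int space_required,
		      int *copy_len, int *bytes_consumed)
{
	int still_to_copy = space_required - *bytes_consumed;

	if (still_to_copy <= space_available) {
		*copy_len = still_to_copy;	/* region completes here */
		*bytes_consumed = 0;
		return 0;
	}
	*copy_len = space_available;		/* partial copy, continue later */
	*bytes_consumed += *copy_len;
	return OPHDR_SIZE;			/* continuation needs a new ophdr */
}

int main(void)
{
	int consumed = 0, copy_len, extra;

	/* a 300-byte region meets a 200-byte hole, then a fresh iclog */
	extra = setup_copy(200, 300, &copy_len, &consumed);
	printf("first pass:  copied %d, extra reservation %d\n", copy_len, extra);
	extra = setup_copy(32768, 300, &copy_len, &consumed);
	printf("second pass: copied %d, extra reservation %d\n", copy_len, extra);
	return 0;
}

The first call copies 200 bytes and charges one extra op header; the second copies the remaining 100 bytes and resets the split state, mirroring the CONTINUE/END flag handling above.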
@@ -2840,6 +3025,8 @@ _xfs_log_force(
 
 	XFS_STATS_INC(xs_log_force);
 
+	xlog_cil_push(log, 1);
+
 	spin_lock(&log->l_icloglock);
 
 	iclog = log->l_iclog;
@@ -2989,6 +3176,12 @@ _xfs_log_force_lsn(
 
 	XFS_STATS_INC(xs_log_force);
 
+	if (log->l_cilp) {
+		lsn = xlog_cil_push_lsn(log, lsn);
+		if (lsn == NULLCOMMITLSN)
+			return 0;
+	}
+
 try_again:
 	spin_lock(&log->l_icloglock);
 	iclog = log->l_iclog;
@@ -3153,20 +3346,30 @@ xfs_log_ticket_get(
 	return ticket;
 }
 
+xlog_tid_t
+xfs_log_get_trans_ident(
+	struct xfs_trans	*tp)
+{
+	return tp->t_ticket->t_tid;
+}
+
 /*
  * Allocate and initialise a new log ticket.
  */
-STATIC xlog_ticket_t *
-xlog_ticket_alloc(xlog_t *log,
-		int unit_bytes,
-		int cnt,
-		char client,
-		uint xflags)
+xlog_ticket_t *
+xlog_ticket_alloc(
+	struct log	*log,
+	int		unit_bytes,
+	int		cnt,
+	char		client,
+	uint		xflags,
+	int		alloc_flags)
 {
-	xlog_ticket_t	*tic;
+	struct xlog_ticket *tic;
 	uint		num_headers;
+	int		iclog_space;
 
-	tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL);
+	tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
 	if (!tic)
 		return NULL;
 
@@ -3208,16 +3411,40 @@ xlog_ticket_alloc(xlog_t *log,
 	/* for start-rec */
 	unit_bytes += sizeof(xlog_op_header_t);
 
-	/* for LR headers */
-	num_headers = ((unit_bytes + log->l_iclog_size-1) >> log->l_iclog_size_log);
+	/*
+	 * for LR headers - the space for data in an iclog is the size minus
+	 * the space used for the headers. If we use the iclog size, then we
+	 * undercalculate the number of headers required.
+	 *
+	 * Furthermore - the addition of op headers for split-recs might
+	 * increase the space required enough to require more log and op
+	 * headers, so take that into account too.
+	 *
+	 * IMPORTANT: This reservation makes the assumption that if this
+	 * transaction is the first in an iclog and hence has the LR headers
+	 * accounted to it, then the remaining space in the iclog is
+	 * exclusively for this transaction.  i.e. if the transaction is
+	 * larger than the iclog, it will be the only thing in that iclog.
+	 * Fundamentally, this means we must pass the entire log vector to
+	 * xlog_write to guarantee this.
+	 */
+	iclog_space = log->l_iclog_size - log->l_iclog_hsize;
+	num_headers = howmany(unit_bytes, iclog_space);
+
+	/* for split-recs - ophdrs added when data split over LRs */
+	unit_bytes += sizeof(xlog_op_header_t) * num_headers;
+
+	/* add extra header reservations if we overrun */
+	while (!num_headers ||
+	       howmany(unit_bytes, iclog_space) > num_headers) {
+		unit_bytes += sizeof(xlog_op_header_t);
+		num_headers++;
+	}
 	unit_bytes += log->l_iclog_hsize * num_headers;
 
 	/* for commit-rec LR header - note: padding will subsume the ophdr */
 	unit_bytes += log->l_iclog_hsize;
 
-	/* for split-recs - ophdrs added when data split over LRs */
-	unit_bytes += sizeof(xlog_op_header_t) * num_headers;
-
 	/* for roundoff padding for transaction data and one for commit record */
 	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
 	    log->l_mp->m_sb.sb_logsunit > 1) {
@@ -3233,13 +3460,13 @@ xlog_ticket_alloc(xlog_t *log,
 	tic->t_curr_res		= unit_bytes;
 	tic->t_cnt		= cnt;
 	tic->t_ocnt		= cnt;
-	tic->t_tid		= (xlog_tid_t)((__psint_t)tic & 0xffffffff);
+	tic->t_tid		= random32();
 	tic->t_clientid		= client;
 	tic->t_flags		= XLOG_TIC_INITED;
 	tic->t_trans_type	= 0;
 	if (xflags & XFS_LOG_PERM_RESERV)
 		tic->t_flags |= XLOG_TIC_PERM_RESERV;
-	sv_init(&(tic->t_wait), SV_DEFAULT, "logtick");
+	sv_init(&tic->t_wait, SV_DEFAULT, "logtick");
 
 	xlog_tic_reset_res(tic);
 
@@ -3260,20 +3487,22 @@ xlog_ticket_alloc(xlog_t *log,
  * part of the log in case we trash the log structure.
  */
 void
-xlog_verify_dest_ptr(xlog_t     *log,
-		     __psint_t  ptr)
+xlog_verify_dest_ptr(
+	struct log	*log,
+	char		*ptr)
 {
 	int i;
 	int good_ptr = 0;
 
-	for (i=0; i < log->l_iclog_bufs; i++) {
-		if (ptr >= (__psint_t)log->l_iclog_bak[i] &&
-		    ptr <= (__psint_t)log->l_iclog_bak[i]+log->l_iclog_size)
+	for (i = 0; i < log->l_iclog_bufs; i++) {
+		if (ptr >= log->l_iclog_bak[i] &&
+		    ptr <= log->l_iclog_bak[i] + log->l_iclog_size)
 			good_ptr++;
 	}
-	if (! good_ptr)
+
+	if (!good_ptr)
 		xlog_panic("xlog_verify_dest_ptr: invalid ptr");
-}	/* xlog_verify_dest_ptr */
+}
 
 STATIC void
 xlog_verify_grant_head(xlog_t *log, int equals)
@@ -3459,6 +3688,11 @@ xlog_state_ioerror(
  *	c. nothing new gets queued up after (a) and (b) are done.
  *	d. if !logerror, flush the iclogs to disk, then seal them off
  *	   for business.
+ *
+ * Note: for delayed logging the !logerror case needs to flush the regions
+ * held in memory out to the iclogs before flushing them to disk. This needs
+ * to be done before the log is marked as shutdown, otherwise the flush to the
+ * iclogs will fail.
  */
 int
 xfs_log_force_umount(
@@ -3492,6 +3726,16 @@ xfs_log_force_umount(
 		return 1;
 	}
 	retval = 0;
+
+	/*
+	 * Flush the in memory commit item list before marking the log as
+	 * being shut down. We need to do it in this order to ensure all the
+	 * completed transactions are flushed to disk with the xfs_log_force()
+	 * call below.
+	 */
+	if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG))
+		xlog_cil_push(log, 1);
+
 	/*
 	 * We must hold both the GRANT lock and the LOG lock,
 	 * before we mark the filesystem SHUTDOWN and wake
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 97a24c7795a4..04c78e642cc8 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -19,7 +19,6 @@
 #define __XFS_LOG_H__
 
 /* get lsn fields */
-
 #define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
 #define BLOCK_LSN(lsn) ((uint)(lsn))
 
@@ -110,6 +109,15 @@ typedef struct xfs_log_iovec {
 	uint		i_type;		/* type of region */
 } xfs_log_iovec_t;
 
+struct xfs_log_vec {
+	struct xfs_log_vec	*lv_next;	/* next lv in build list */
+	int			lv_niovecs;	/* number of iovecs in lv */
+	struct xfs_log_iovec	*lv_iovecp;	/* iovec array */
+	struct xfs_log_item	*lv_item;	/* owner */
+	char			*lv_buf;	/* formatted buffer */
+	int			lv_buf_len;	/* size of formatted buffer */
+};
+
 /*
  * Structure used to pass callback function and the function's argument
  * to the log manager.
@@ -126,6 +134,14 @@ typedef struct xfs_log_callback {
 struct xfs_mount;
 struct xlog_in_core;
 struct xlog_ticket;
+struct xfs_log_item;
+struct xfs_item_ops;
+struct xfs_trans;
+
+void	xfs_log_item_init(struct xfs_mount	*mp,
+			struct xfs_log_item	*item,
+			int			type,
+			struct xfs_item_ops	*ops);
 
 xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
 		       struct xlog_ticket *ticket,
@@ -174,9 +190,16 @@ int xfs_log_need_covered(struct xfs_mount *mp);
 
 void	  xlog_iodone(struct xfs_buf *);
 
-struct xlog_ticket * xfs_log_ticket_get(struct xlog_ticket *ticket);
+struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
 void	  xfs_log_ticket_put(struct xlog_ticket *ticket);
 
+xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
+
+int	xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
+				struct xfs_log_vec *log_vector,
+				xfs_lsn_t *commit_lsn, int flags);
+bool	xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
+
 #endif
 
 
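The new struct xfs_log_vec is deliberately chain-friendly: each dirty log item gets one vector, and callers link them through lv_next so an entire transaction can be handed to xlog_write() in one call, which the ticket reservation logic above depends on. A hedged sketch of that shape follows; the types are stand-ins, not the kernel definitions, and prepend() is an invented helper rather than a patch function.

#include <stddef.h>

struct sk_log_vec {
	struct sk_log_vec	*lv_next;
	int			lv_niovecs;
	void			*lv_iovecp;
};

/* Prepend a header vector to an item chain, the way xlog_cil_push()
 * later links the checkpoint transaction header in front of lv_chain. */
static struct sk_log_vec *prepend(struct sk_log_vec *hdr,
				  struct sk_log_vec *chain)
{
	hdr->lv_next = chain;
	return hdr;		/* the writer walks hdr, then the chain */
}

int main(void)
{
	struct sk_log_vec item_b = { NULL, 1, NULL };
	struct sk_log_vec item_a = { &item_b, 1, NULL };
	struct sk_log_vec hdr = { NULL, 1, NULL };

	return prepend(&hdr, &item_a) == &hdr ? 0 : 1;
}
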
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
new file mode 100644
index 000000000000..bb17cc044bf3
--- /dev/null
+++ b/fs/xfs/xfs_log_cil.c
@@ -0,0 +1,725 @@
+/*
+ * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log_priv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_dmapi.h"
+#include "xfs_mount.h"
+#include "xfs_error.h"
+#include "xfs_alloc.h"
+
+/*
+ * Perform initial CIL structure initialisation. If the CIL is not
+ * enabled in this filesystem, ensure the log->l_cilp is null so
+ * we can check this conditional to determine if we are doing delayed
+ * logging or not.
+ */
+int
+xlog_cil_init(
+	struct log	*log)
+{
+	struct xfs_cil	*cil;
+	struct xfs_cil_ctx *ctx;
+
+	log->l_cilp = NULL;
+	if (!(log->l_mp->m_flags & XFS_MOUNT_DELAYLOG))
+		return 0;
+
+	cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
+	if (!cil)
+		return ENOMEM;
+
+	ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
+	if (!ctx) {
+		kmem_free(cil);
+		return ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&cil->xc_cil);
+	INIT_LIST_HEAD(&cil->xc_committing);
+	spin_lock_init(&cil->xc_cil_lock);
+	init_rwsem(&cil->xc_ctx_lock);
+	sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait");
+
+	INIT_LIST_HEAD(&ctx->committing);
+	INIT_LIST_HEAD(&ctx->busy_extents);
+	ctx->sequence = 1;
+	ctx->cil = cil;
+	cil->xc_ctx = ctx;
+
+	cil->xc_log = log;
+	log->l_cilp = cil;
+	return 0;
+}
+
+void
+xlog_cil_destroy(
+	struct log	*log)
+{
+	if (!log->l_cilp)
+		return;
+
+	if (log->l_cilp->xc_ctx) {
+		if (log->l_cilp->xc_ctx->ticket)
+			xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
+		kmem_free(log->l_cilp->xc_ctx);
+	}
+
+	ASSERT(list_empty(&log->l_cilp->xc_cil));
+	kmem_free(log->l_cilp);
+}
+
+/*
+ * Allocate a new ticket. Failing to get a new ticket makes it really hard to
+ * recover, so we don't allow failure here. Also, we allocate in a context that
+ * we don't want to be issuing transactions from, so we need to tell the
+ * allocation code this as well.
+ *
+ * We don't reserve any space for the ticket - we are going to steal whatever
+ * space we require from transactions as they commit. To ensure we reserve all
+ * the space required, we need to set the current reservation of the ticket to
+ * zero so that we know to steal the initial transaction overhead from the
+ * first transaction commit.
+ */
+static struct xlog_ticket *
+xlog_cil_ticket_alloc(
+	struct log	*log)
+{
+	struct xlog_ticket *tic;
+
+	tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
+				KM_SLEEP|KM_NOFS);
+	tic->t_trans_type = XFS_TRANS_CHECKPOINT;
+
+	/*
+	 * set the current reservation to zero so we know to steal the basic
+	 * transaction overhead reservation from the first transaction commit.
+	 */
+	tic->t_curr_res = 0;
+	return tic;
+}
+
+/*
+ * After the first stage of log recovery is done, we know where the head and
+ * tail of the log are. We need this log initialisation done before we can
+ * initialise the first CIL checkpoint context.
+ *
+ * Here we allocate a log ticket to track space usage during a CIL push. This
+ * ticket is passed to xlog_write() directly so that we don't slowly leak log
+ * space by failing to account for space used by log headers and additional
+ * region headers for split regions.
+ */
+void
+xlog_cil_init_post_recovery(
+	struct log	*log)
+{
+	if (!log->l_cilp)
+		return;
+
+	log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
+	log->l_cilp->xc_ctx->sequence = 1;
+	log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle,
+							  log->l_curr_block);
+}
+
+/*
+ * Insert the log item into the CIL and calculate the difference in space
+ * consumed by the item. Add the space to the checkpoint ticket and calculate
+ * if the change requires additional log metadata. If it does, take that space
+ * as well. Remove the amount of space we added to the checkpoint ticket from
+ * the current transaction ticket so that the accounting works out correctly.
+ *
+ * If this is the first time the item is being placed into the CIL in this
+ * context, pin it so it can't be written to disk until the CIL is flushed to
+ * the iclog and the iclog written to disk.
+ */
+static void
+xlog_cil_insert(
+	struct log		*log,
+	struct xlog_ticket	*ticket,
+	struct xfs_log_item	*item,
+	struct xfs_log_vec	*lv)
+{
+	struct xfs_cil		*cil = log->l_cilp;
+	struct xfs_log_vec	*old = lv->lv_item->li_lv;
+	struct xfs_cil_ctx	*ctx = cil->xc_ctx;
+	int			len;
+	int			diff_iovecs;
+	int			iclog_space;
+
+	if (old) {
+		/* existing lv on log item, space used is a delta */
+		ASSERT(!list_empty(&item->li_cil));
+		ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
+
+		len = lv->lv_buf_len - old->lv_buf_len;
+		diff_iovecs = lv->lv_niovecs - old->lv_niovecs;
+		kmem_free(old->lv_buf);
+		kmem_free(old);
+	} else {
+		/* new lv, must pin the log item */
+		ASSERT(!lv->lv_item->li_lv);
+		ASSERT(list_empty(&item->li_cil));
+
+		len = lv->lv_buf_len;
+		diff_iovecs = lv->lv_niovecs;
+		IOP_PIN(lv->lv_item);
+
+	}
+	len += diff_iovecs * sizeof(xlog_op_header_t);
+
+	/* attach new log vector to log item */
+	lv->lv_item->li_lv = lv;
+
+	spin_lock(&cil->xc_cil_lock);
+	list_move_tail(&item->li_cil, &cil->xc_cil);
+	ctx->nvecs += diff_iovecs;
+
+	/*
+	 * If this is the first time the item is being committed to the CIL,
+	 * store the sequence number on the log item so we can tell
+	 * in future commits whether this is the first checkpoint the item is
+	 * being committed into.
+	 */
+	if (!item->li_seq)
+		item->li_seq = ctx->sequence;
+
+	/*
+	 * Now transfer enough transaction reservation to the context ticket
+	 * for the checkpoint. The context ticket is special - the unit
+	 * reservation has to grow as well as the current reservation as we
+	 * steal from tickets so we can correctly determine the space used
+	 * during the transaction commit.
+	 */
+	if (ctx->ticket->t_curr_res == 0) {
+		/* first commit in checkpoint, steal the header reservation */
+		ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
+		ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
+		ticket->t_curr_res -= ctx->ticket->t_unit_res;
+	}
+
+	/* do we need space for more log record headers? */
+	iclog_space = log->l_iclog_size - log->l_iclog_hsize;
+	if (len > 0 && (ctx->space_used / iclog_space !=
+				(ctx->space_used + len) / iclog_space)) {
+		int hdrs;
+
+		hdrs = (len + iclog_space - 1) / iclog_space;
+		/* need to take into account split region headers, too */
+		hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
+		ctx->ticket->t_unit_res += hdrs;
+		ctx->ticket->t_curr_res += hdrs;
+		ticket->t_curr_res -= hdrs;
+		ASSERT(ticket->t_curr_res >= len);
+	}
+	ticket->t_curr_res -= len;
+	ctx->space_used += len;
+
+	spin_unlock(&cil->xc_cil_lock);
+}
+
+/*
+ * Format log items into flat buffers.
+ *
+ * For delayed logging, we need to hold a formatted buffer containing all the
+ * changes on the log item. This enables us to relog the item in memory and
+ * write it out asynchronously without needing to relock the object that was
+ * modified at the time it gets written into the iclog.
+ *
+ * This function builds a vector for the changes in each log item in the
+ * transaction. It then works out the length of the buffer needed for each log
+ * item, allocates them and formats the vector for the item into the buffer.
+ * The buffers are then attached to the log items and inserted into the
+ * Committed Item List for tracking until the next checkpoint is written out.
+ *
+ * We don't set up region headers during this process; we simply copy the
+ * regions into the flat buffer. We can do this because we still have to do a
+ * formatting step to write the regions into the iclog buffer.  Writing the
+ * ophdrs during the iclog write means that we can support splitting large
+ * regions across iclog boundaries without needing a change in the format of
+ * the item/region encapsulation.
+ *
+ * Hence what we need to do now is rewrite the vector array to point to the
+ * copied region inside the buffer we just allocated. This allows us to format
+ * the regions into the iclog as though they are being formatted directly out
+ * of the objects themselves.
+ */
+static void
+xlog_cil_format_items(
+	struct log		*log,
+	struct xfs_log_vec	*log_vector,
+	struct xlog_ticket	*ticket,
+	xfs_lsn_t		*start_lsn)
+{
+	struct xfs_log_vec *lv;
+
+	if (start_lsn)
+		*start_lsn = log->l_cilp->xc_ctx->sequence;
+
+	ASSERT(log_vector);
+	for (lv = log_vector; lv; lv = lv->lv_next) {
+		void	*ptr;
+		int	index;
+		int	len = 0;
+
+		/* build the vector array and calculate its length */
+		IOP_FORMAT(lv->lv_item, lv->lv_iovecp);
+		for (index = 0; index < lv->lv_niovecs; index++)
+			len += lv->lv_iovecp[index].i_len;
+
+		lv->lv_buf_len = len;
+		lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
+		ptr = lv->lv_buf;
+
+		for (index = 0; index < lv->lv_niovecs; index++) {
+			struct xfs_log_iovec *vec = &lv->lv_iovecp[index];
+
+			memcpy(ptr, vec->i_addr, vec->i_len);
+			vec->i_addr = ptr;
+			ptr += vec->i_len;
+		}
+		ASSERT(ptr == lv->lv_buf + lv->lv_buf_len);
+
+		xlog_cil_insert(log, ticket, lv->lv_item, lv);
+	}
+}
+
+static void
+xlog_cil_free_logvec(
+	struct xfs_log_vec	*log_vector)
+{
+	struct xfs_log_vec	*lv;
+
+	for (lv = log_vector; lv; ) {
+		struct xfs_log_vec *next = lv->lv_next;
+		kmem_free(lv->lv_buf);
+		kmem_free(lv);
+		lv = next;
+	}
+}
+
+/*
+ * Commit a transaction with the given vector to the Committed Item List.
+ *
+ * To do this, we need to format the item, pin it in memory if required and
+ * account for the space used by the transaction. Once we have done that we
+ * need to release the unused reservation for the transaction, attach the
+ * transaction to the checkpoint context so we carry the busy extents through
+ * to checkpoint completion, and then unlock all the items in the transaction.
+ *
+ * For more specific information about the order of operations in
+ * xfs_log_commit_cil() please refer to the comments in
+ * xfs_trans_commit_iclog().
+ *
+ * Called with the context lock already held in read mode to lock out
+ * background commit, returns without it held once background commits are
+ * allowed again.
+ */
+int
+xfs_log_commit_cil(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_log_vec	*log_vector,
+	xfs_lsn_t		*commit_lsn,
+	int			flags)
+{
+	struct log		*log = mp->m_log;
+	int			log_flags = 0;
+	int			push = 0;
+
+	if (flags & XFS_TRANS_RELEASE_LOG_RES)
+		log_flags = XFS_LOG_REL_PERM_RESERV;
+
+	if (XLOG_FORCED_SHUTDOWN(log)) {
+		xlog_cil_free_logvec(log_vector);
+		return XFS_ERROR(EIO);
+	}
+
+	/* lock out background commit */
+	down_read(&log->l_cilp->xc_ctx_lock);
+	xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn);
+
+	/* check we didn't blow the reservation */
+	if (tp->t_ticket->t_curr_res < 0)
+		xlog_print_tic_res(log->l_mp, tp->t_ticket);
+
+	/* attach the transaction to the CIL if it has any busy extents */
+	if (!list_empty(&tp->t_busy)) {
+		spin_lock(&log->l_cilp->xc_cil_lock);
+		list_splice_init(&tp->t_busy,
+					&log->l_cilp->xc_ctx->busy_extents);
+		spin_unlock(&log->l_cilp->xc_cil_lock);
+	}
+
+	tp->t_commit_lsn = *commit_lsn;
+	xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+	xfs_trans_unreserve_and_mod_sb(tp);
+
+	/* check for background commit before unlock */
+	if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
+		push = 1;
+	up_read(&log->l_cilp->xc_ctx_lock);
+
+	/*
+	 * We need to push CIL every so often so we don't cache more than we
+	 * can fit in the log. The limit really is that a checkpoint can't be
+	 * more than half the log (the current checkpoint is not allowed to
+	 * overwrite the previous checkpoint), but commit latency and memory
+	 * usage limit this to a smaller size in most cases.
+	 */
+	if (push)
+		xlog_cil_push(log, 0);
+	return 0;
+}
+
+/*
+ * Mark all items committed and clear busy extents. We free the log vector
+ * chains in a separate pass so that we unpin the log items as quickly as
+ * possible.
+ */
+static void
+xlog_cil_committed(
+	void	*args,
+	int	abort)
+{
+	struct xfs_cil_ctx	*ctx = args;
+	struct xfs_log_vec	*lv;
+	int			abortflag = abort ? XFS_LI_ABORTED : 0;
+	struct xfs_busy_extent	*busyp, *n;
+
+	/* unpin all the log items */
+	for (lv = ctx->lv_chain; lv; lv = lv->lv_next) {
+		xfs_trans_item_committed(lv->lv_item, ctx->start_lsn,
+							abortflag);
+	}
+
+	list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
+		xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
+
+	spin_lock(&ctx->cil->xc_cil_lock);
+	list_del(&ctx->committing);
+	spin_unlock(&ctx->cil->xc_cil_lock);
+
+	xlog_cil_free_logvec(ctx->lv_chain);
+	kmem_free(ctx);
+}
+
+/*
+ * Push the Committed Item List to the log. If the push_now flag is not set,
+ * then it is a background flush and so we can choose to ignore it.
+ */
+int
+xlog_cil_push(
+	struct log	*log,
+	int		push_now)
+{
+	struct xfs_cil		*cil = log->l_cilp;
+	struct xfs_log_vec	*lv;
+	struct xfs_cil_ctx	*ctx;
+	struct xfs_cil_ctx	*new_ctx;
+	struct xlog_in_core	*commit_iclog;
+	struct xlog_ticket	*tic;
+	int			num_lv;
+	int			num_iovecs;
+	int			len;
+	int			error = 0;
+	struct xfs_trans_header thdr;
+	struct xfs_log_iovec	lhdr;
+	struct xfs_log_vec	lvhdr = { NULL };
+	xfs_lsn_t		commit_lsn;
+
+	if (!cil)
+		return 0;
+
+	new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
+	new_ctx->ticket = xlog_cil_ticket_alloc(log);
+
+	/* lock out transaction commit, but don't block on background push */
+	if (!down_write_trylock(&cil->xc_ctx_lock)) {
+		if (!push_now)
+			goto out_free_ticket;
+		down_write(&cil->xc_ctx_lock);
+	}
+	ctx = cil->xc_ctx;
+
+	/* check if we've anything to push */
+	if (list_empty(&cil->xc_cil))
+		goto out_skip;
+
+	/* check for spurious background flush */
+	if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+		goto out_skip;
+
+	/*
+	 * pull all the log vectors off the items in the CIL, and
+	 * remove the items from the CIL. We don't need the CIL lock
+	 * here because it's only needed on the transaction commit
+	 * side which is currently locked out by the flush lock.
+	 */
+	lv = NULL;
+	num_lv = 0;
+	num_iovecs = 0;
+	len = 0;
+	while (!list_empty(&cil->xc_cil)) {
+		struct xfs_log_item	*item;
+		int			i;
+
+		item = list_first_entry(&cil->xc_cil,
+					struct xfs_log_item, li_cil);
+		list_del_init(&item->li_cil);
+		if (!ctx->lv_chain)
+			ctx->lv_chain = item->li_lv;
+		else
+			lv->lv_next = item->li_lv;
+		lv = item->li_lv;
+		item->li_lv = NULL;
+
+		num_lv++;
+		num_iovecs += lv->lv_niovecs;
+		for (i = 0; i < lv->lv_niovecs; i++)
+			len += lv->lv_iovecp[i].i_len;
+	}
+
+	/*
+	 * initialise the new context and attach it to the CIL. Then attach
+	 * the current context to the CIL committing list so it can be found
+	 * during log forces to extract the commit lsn of the sequence that
+	 * needs to be forced.
+	 */
+	INIT_LIST_HEAD(&new_ctx->committing);
+	INIT_LIST_HEAD(&new_ctx->busy_extents);
+	new_ctx->sequence = ctx->sequence + 1;
+	new_ctx->cil = cil;
+	cil->xc_ctx = new_ctx;
+
+	/*
+	 * The switch is now done, so we can drop the context lock and move out
+	 * of a shared context. We can't just go straight to the commit record,
+	 * though - we need to synchronise with previous and future commits so
+	 * that the commit records are correctly ordered in the log to ensure
+	 * that we process items during log IO completion in the correct order.
+	 *
+	 * For example, if we get an EFI in one checkpoint and the EFD in the
+	 * next (e.g. due to log forces), we do not want the checkpoint with
+	 * the EFD to be committed before the checkpoint with the EFI. Hence
+	 * we must strictly order the commit records of the checkpoints so
+	 * that: a) the checkpoint callbacks are attached to the iclogs in the
+	 * correct order; and b) the checkpoints are replayed in correct order
+	 * in log recovery.
+	 *
+	 * Hence we need to add this context to the committing context list so
+	 * that higher sequences will wait for us to write out a commit record
+	 * before they do.
+	 */
+	spin_lock(&cil->xc_cil_lock);
+	list_add(&ctx->committing, &cil->xc_committing);
+	spin_unlock(&cil->xc_cil_lock);
+	up_write(&cil->xc_ctx_lock);
+
+	/*
+	 * Build a checkpoint transaction header and write it to the log to
+	 * begin the transaction. We need to account for the space used by the
+	 * transaction header here as it is not accounted for in xlog_write().
+	 *
+	 * The LSN we need to pass to the log items on transaction commit is
+	 * the LSN reported by the first log vector write. If we use the commit
+	 * record lsn then we can move the tail beyond the grant write head.
+	 */
+	tic = ctx->ticket;
+	thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
+	thdr.th_type = XFS_TRANS_CHECKPOINT;
+	thdr.th_tid = tic->t_tid;
+	thdr.th_num_items = num_iovecs;
+	lhdr.i_addr = (xfs_caddr_t)&thdr;
+	lhdr.i_len = sizeof(xfs_trans_header_t);
+	lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
+	tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);
+
+	lvhdr.lv_niovecs = 1;
+	lvhdr.lv_iovecp = &lhdr;
+	lvhdr.lv_next = ctx->lv_chain;
+
+	error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
+	if (error)
+		goto out_abort;
+
+	/*
+	 * now that we've written the checkpoint into the log, strictly
+	 * order the commit records so replay will get them in the right order.
+	 */
+restart:
+	spin_lock(&cil->xc_cil_lock);
+	list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
+		/*
+		 * Higher sequences will wait for this one so skip them.
+		 * Don't wait for our own sequence, either.
+		 */
+		if (new_ctx->sequence >= ctx->sequence)
+			continue;
+		if (!new_ctx->commit_lsn) {
+			/*
+			 * It is still being pushed! Wait for the push to
+			 * complete, then start again from the beginning.
+			 */
+			sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
+			goto restart;
+		}
+	}
+	spin_unlock(&cil->xc_cil_lock);
+
+	commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
+	if (error || commit_lsn == -1)
+		goto out_abort;
+
+	/* attach all the transactions w/ busy extents to iclog */
+	ctx->log_cb.cb_func = xlog_cil_committed;
+	ctx->log_cb.cb_arg = ctx;
+	error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb);
+	if (error)
+		goto out_abort;
+
+	/*
+	 * now the checkpoint commit is complete and we've attached the
+	 * callbacks to the iclog we can assign the commit LSN to the context
+	 * and wake up anyone who is waiting for the commit to complete.
+	 */
+	spin_lock(&cil->xc_cil_lock);
+	ctx->commit_lsn = commit_lsn;
+	sv_broadcast(&cil->xc_commit_wait);
+	spin_unlock(&cil->xc_cil_lock);
+
+	/* release the hounds! */
+	return xfs_log_release_iclog(log->l_mp, commit_iclog);
+
+out_skip:
+	up_write(&cil->xc_ctx_lock);
+out_free_ticket:
+	xfs_log_ticket_put(new_ctx->ticket);
+	kmem_free(new_ctx);
+	return 0;
+
+out_abort:
+	xlog_cil_committed(ctx, XFS_LI_ABORTED);
+	return XFS_ERROR(EIO);
+}
+
+/*
+ * Conditionally push the CIL based on the sequence passed in.
+ *
+ * We only need to push if we haven't already pushed the sequence
+ * number given. Hence the only time we will trigger a push here is
+ * if the push sequence is the same as the current context.
+ *
+ * We return the current commit lsn to allow the callers to determine if an
+ * iclog flush is necessary following this call.
+ *
+ * XXX: Initially, just push the CIL unconditionally and return whatever
+ * commit lsn is there. It'll be empty, so this is broken for now.
+ */
+xfs_lsn_t
+xlog_cil_push_lsn(
+	struct log	*log,
+	xfs_lsn_t	push_seq)
+{
+	struct xfs_cil		*cil = log->l_cilp;
+	struct xfs_cil_ctx	*ctx;
+	xfs_lsn_t		commit_lsn = NULLCOMMITLSN;
+
+restart:
+	down_write(&cil->xc_ctx_lock);
+	ASSERT(push_seq <= cil->xc_ctx->sequence);
+
+	/* check to see if we need to force out the current context */
+	if (push_seq == cil->xc_ctx->sequence) {
+		up_write(&cil->xc_ctx_lock);
+		xlog_cil_push(log, 1);
+		goto restart;
+	}
+
+	/*
+	 * See if we can find a previous sequence still committing.
+	 * We can drop the flush lock as soon as we have the cil lock
+	 * because we are now only comparing contexts protected by
+	 * the cil lock.
+	 *
+	 * We need to wait for all previous sequence commits to complete
+	 * before allowing the force of push_seq to go ahead. Hence block
+	 * on commits for those as well.
+	 */
+	spin_lock(&cil->xc_cil_lock);
+	up_write(&cil->xc_ctx_lock);
+	list_for_each_entry(ctx, &cil->xc_committing, committing) {
+		if (ctx->sequence > push_seq)
+			continue;
+		if (!ctx->commit_lsn) {
+			/*
+			 * It is still being pushed! Wait for the push to
+			 * complete, then start again from the beginning.
+			 */
+			sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
+			goto restart;
+		}
+		if (ctx->sequence != push_seq)
+			continue;
+		/* found it! */
+		commit_lsn = ctx->commit_lsn;
+	}
+	spin_unlock(&cil->xc_cil_lock);
+	return commit_lsn;
+}
+
+/*
+ * Check if the current log item was first committed in this sequence.
+ * We can't rely on just the log item being in the CIL, we have to check
+ * the recorded commit sequence number.
+ *
+ * Note: for this to be used in a non-racy manner, it has to be called with
+ * CIL flushing locked out. As a result, it should only be used during the
+ * transaction commit process when deciding what to format into the item.
+ */
+bool
+xfs_log_item_in_current_chkpt(
+	struct xfs_log_item *lip)
+{
+	struct xfs_cil_ctx *ctx;
+
+	if (!(lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG))
+		return false;
+	if (list_empty(&lip->li_cil))
+		return false;
+
+	ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
+
+	/*
+	 * li_seq is written on the first commit of a log item to record the
+	 * first checkpoint it is written to. Hence if it is different to the
+	 * current sequence, we're in a new checkpoint.
+	 */
+	if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0)
+		return false;
+	return true;
+}
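
The "reservation stealing" in xlog_cil_insert() above is easy to sanity-check outside the kernel. The model below is an assumption-laden sketch: the ticket struct, the 512-byte unit reservation and the byte counts are all invented for illustration. The transfer logic, however, mirrors the function above: the first commit into a checkpoint donates the context ticket's unit reservation, and every commit then donates the bytes its formatted changes consume.

#include <stdio.h>

struct sk_ticket {
	int	t_curr_res;	/* bytes of reservation held */
	int	t_unit_res;	/* per-checkpoint overhead unit */
};

static int space_used;		/* models ctx->space_used */

static void steal(struct sk_ticket *trans, struct sk_ticket *ctx, int len)
{
	if (ctx->t_curr_res == 0) {
		/* first commit in this checkpoint: steal the header unit */
		ctx->t_curr_res = ctx->t_unit_res;
		trans->t_curr_res -= ctx->t_unit_res;
	}
	trans->t_curr_res -= len;	/* item bytes come off the trans ticket */
	space_used += len;
}

int main(void)
{
	struct sk_ticket trans = { .t_curr_res = 4096 };
	struct sk_ticket ctx = { .t_curr_res = 0, .t_unit_res = 512 };

	steal(&trans, &ctx, 1024);	/* donates 512 + 1024 */
	steal(&trans, &ctx, 256);	/* donates just 256 */

	/* prints: trans left 2304, ctx holds 512, cil space 1280 */
	printf("trans left %d, ctx holds %d, cil space %d\n",
	       trans.t_curr_res, ctx.t_curr_res, space_used);
	return 0;
}
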
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index fd02a18facd5..8c072618965c 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -152,8 +152,6 @@ static inline uint xlog_get_client_id(__be32 i)
 #define XLOG_RECOVERY_NEEDED	0x4	/* log was recovered */
 #define XLOG_IO_ERROR		0x8	/* log hit an I/O error, and being
 					   shutdown */
-typedef __uint32_t xlog_tid_t;
-
 
 #ifdef __KERNEL__
 /*
@@ -379,6 +377,99 @@ typedef struct xlog_in_core {
 } xlog_in_core_t;
 
 /*
+ * The CIL context is used to aggregate per-transaction details as well as be
+ * passed to the iclog for checkpoint post-commit processing.  After being
+ * passed to the iclog, another context needs to be allocated for tracking the
+ * next set of transactions to be aggregated into a checkpoint.
+ */
+struct xfs_cil;
+
+struct xfs_cil_ctx {
+	struct xfs_cil		*cil;
+	xfs_lsn_t		sequence;	/* chkpt sequence # */
+	xfs_lsn_t		start_lsn;	/* first LSN of chkpt commit */
+	xfs_lsn_t		commit_lsn;	/* chkpt commit record lsn */
+	struct xlog_ticket	*ticket;	/* chkpt ticket */
+	int			nvecs;		/* number of regions */
+	int			space_used;	/* aggregate size of regions */
+	struct list_head	busy_extents;	/* busy extents in chkpt */
+	struct xfs_log_vec	*lv_chain;	/* logvecs being pushed */
+	xfs_log_callback_t	log_cb;		/* completion callback hook. */
+	struct list_head	committing;	/* ctx committing list */
+};
+
+/*
+ * Committed Item List structure
+ *
+ * This structure is used to track log items that have been committed but not
+ * yet written into the log. It is used only when the delayed logging mount
+ * option is enabled.
+ *
+ * This structure tracks the list of committing checkpoint contexts so
+ * we can avoid the problem of having to hold out new transactions during a
+ * flush until we have the commit record LSN of the checkpoint. We can
+ * traverse the list of committing contexts in xlog_cil_push_lsn() to find a
+ * sequence match and extract the commit LSN directly from there. If the
+ * checkpoint is still in the process of committing, we can block waiting for
+ * the commit LSN to be determined as well. This should make synchronous
+ * operations almost as efficient as the old logging methods.
+ */
+struct xfs_cil {
+	struct log		*xc_log;
+	struct list_head	xc_cil;
+	spinlock_t		xc_cil_lock;
+	struct xfs_cil_ctx	*xc_ctx;
+	struct rw_semaphore	xc_ctx_lock;
+	struct list_head	xc_committing;
+	sv_t			xc_commit_wait;
+};
+
+/*
+ * The amount of log space we should allow the CIL to aggregate is difficult
+ * to size. Whatever we choose, we have to make sure we can get a reservation
+ * for the log space effectively, that it is large enough to capture
+ * sufficient relogging to reduce log buffer IO significantly, but it is not
+ * too large for the log or induces too much latency when writing out through
+ * the iclogs. We track both space consumed and the number of vectors in the
+ * checkpoint context, so we need to decide which to use for limiting.
+ *
+ * Every log buffer we write out during a push needs a header reserved, which
+ * is at least one sector and more for v2 logs. Hence we need a reservation of
+ * at least 512 bytes per 32k of log space just for the LR headers. That means
+ * 16KB of reservation per megabyte of delayed logging space we will consume,
+ * plus various headers.  The number of headers will vary based on the num of
+ * io vectors, so limiting on a specific number of vectors is going to result
+ * in transactions of varying size. IOWs, it is more consistent to track and
+ * limit space consumed in the log rather than by the number of objects being
+ * logged in order to prevent checkpoint ticket overruns.
+ *
+ * Further, use of static reservations through the log grant mechanism is
+ * problematic. It introduces a lot of complexity (e.g. reserve grant vs write
+ * grant) and a significant deadlock potential because regranting write space
+ * can block on log pushes. Hence if we have to regrant log space during a log
+ * push, we can deadlock.
+ *
+ * However, we can avoid this by use of a dynamic "reservation stealing"
+ * technique during transaction commit whereby unused reservation space in the
+ * transaction ticket is transferred to the CIL ctx commit ticket to cover the
+ * space needed by the checkpoint transaction. This means that we never need
+ * to specifically reserve space for the CIL checkpoint transaction, nor do we
+ * need to regrant space once the checkpoint completes. This also means the
+ * checkpoint transaction ticket is specific to the checkpoint context, rather
+ * than the CIL itself.
+ *
+ * With dynamic reservations, we can basically make up arbitrary limits for
+ * the checkpoint size so long as they don't violate any other size rules.
+ * Hence the initial maximum size for the checkpoint transaction will be set
+ * to a quarter of the log or 8MB, whichever is smaller. 8MB is an arbitrary
+ * limit right now based on the latency of writing out a large amount of data
+ * through the circular iclog buffers.
+ */
+
+#define XLOG_CIL_SPACE_LIMIT(log)	\
+	(min((log->l_logsize >> 2), (8 * 1024 * 1024)))
+
+/*
  * The reservation head lsn is not made up of a cycle number and block number.
  * Instead, it uses a cycle number and byte number.  Logs don't expect to
  * overflow 31 bits worth of byte offset, so using a byte number will mean
@@ -388,6 +479,7 @@ typedef struct log {
 	/* The following fields don't need locking */
 	struct xfs_mount	*l_mp;	        /* mount point */
 	struct xfs_ail		*l_ailp;	/* AIL log is working with */
+	struct xfs_cil		*l_cilp;	/* CIL log is working with */
 	struct xfs_buf		*l_xbuf;	/* extra buffer for log
 						 * wrapping */
 	struct xfs_buftarg	*l_targ;        /* buftarg of log */
@@ -396,9 +488,7 @@ typedef struct log {
396 struct xfs_buf_cancel **l_buf_cancel_table; 488 struct xfs_buf_cancel **l_buf_cancel_table;
397 int l_iclog_hsize; /* size of iclog header */ 489 int l_iclog_hsize; /* size of iclog header */
398 int l_iclog_heads; /* # of iclog header sectors */ 490 int l_iclog_heads; /* # of iclog header sectors */
399 uint l_sectbb_log; /* log2 of sector size in BBs */ 491 uint l_sectBBsize; /* sector size in BBs (2^n) */
400 uint l_sectbb_mask; /* sector size (in BBs)
401 * alignment mask */
402 int l_iclog_size; /* size of log in bytes */ 492 int l_iclog_size; /* size of log in bytes */
403 int l_iclog_size_log; /* log power size of log */ 493 int l_iclog_size_log; /* log power size of log */
404 int l_iclog_bufs; /* number of iclog buffers */ 494 int l_iclog_bufs; /* number of iclog buffers */
@@ -440,14 +530,40 @@ typedef struct log {
440 530
441#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) 531#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
442 532
443
444/* common routines */ 533/* common routines */
445extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); 534extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
446extern int xlog_recover(xlog_t *log); 535extern int xlog_recover(xlog_t *log);
447extern int xlog_recover_finish(xlog_t *log); 536extern int xlog_recover_finish(xlog_t *log);
448extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); 537extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
449 538
450extern kmem_zone_t *xfs_log_ticket_zone; 539extern kmem_zone_t *xfs_log_ticket_zone;
540struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes,
541 int count, char client, uint xflags,
542 int alloc_flags);
543
544
545static inline void
546xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
547{
548 *ptr += bytes;
549 *len -= bytes;
550 *off += bytes;
551}
552
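A hedged usage sketch of the cursor-advance helper above, as a self-contained userspace mock (the buffer and byte counts are invented; the kernel version relies on the GCC extension of void-pointer arithmetic, made explicit here with a cast):

#include <stddef.h>
#include <stdio.h>

/* Same semantics as xlog_write_adv_cnt(): advance the write cursor,
 * shrink the remaining length and grow the record offset in lockstep. */
static inline void adv_cnt(void **ptr, int *len, int *off, size_t bytes)
{
	*ptr = (char *)*ptr + bytes;
	*len -= bytes;
	*off += bytes;
}

int main(void)
{
	char buf[64];
	void *p = buf;
	int len = sizeof(buf), off = 0;

	adv_cnt(&p, &len, &off, 16);	/* pretend we copied a 16-byte region */
	printf("remaining=%d offset=%d\n", len, off);	/* remaining=48 offset=16 */
	return 0;
}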
553void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
554int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
555 struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
556 xlog_in_core_t **commit_iclog, uint flags);
557
558/*
559 * Committed Item List interfaces
560 */
561int xlog_cil_init(struct log *log);
562void xlog_cil_init_post_recovery(struct log *log);
563void xlog_cil_destroy(struct log *log);
564
565int xlog_cil_push(struct log *log, int push_now);
566xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence);
451 567
452/* 568/*
453 * Unmount record type is used as a pseudo transaction type for the ticket. 569 * Unmount record type is used as a pseudo transaction type for the ticket.
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 22e6efdc17ea..14a69aec2c0b 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -56,33 +56,61 @@ STATIC void xlog_recover_check_summary(xlog_t *);
56#define xlog_recover_check_summary(log) 56#define xlog_recover_check_summary(log)
57#endif 57#endif
58 58
59
60/* 59/*
61 * Sector aligned buffer routines for buffer create/read/write/access 60 * Sector aligned buffer routines for buffer create/read/write/access
62 */ 61 */
63 62
64#define XLOG_SECTOR_ROUNDUP_BBCOUNT(log, bbs) \ 63/*
 65 ( ((log)->l_sectbb_mask && (bbs & (log)->l_sectbb_mask)) ? \ 64 * Verify that the given count of basic blocks is a valid number of blocks
66 ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) ) 65 * to specify for an operation involving the given XFS log buffer.
67#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask) 66 * Returns nonzero if the count is valid, 0 otherwise.
67 */
68 68
69static inline int
70xlog_buf_bbcount_valid(
71 xlog_t *log,
72 int bbcount)
73{
74 return bbcount > 0 && bbcount <= log->l_logBBsize;
75}
76
77/*
78 * Allocate a buffer to hold log data. The buffer needs to be able
79 * to map to a range of nbblks basic blocks at any valid (basic
80 * block) offset within the log.
81 */
69STATIC xfs_buf_t * 82STATIC xfs_buf_t *
70xlog_get_bp( 83xlog_get_bp(
71 xlog_t *log, 84 xlog_t *log,
72 int nbblks) 85 int nbblks)
73{ 86{
74 if (nbblks <= 0 || nbblks > log->l_logBBsize) { 87 if (!xlog_buf_bbcount_valid(log, nbblks)) {
75 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); 88 xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
76 XFS_ERROR_REPORT("xlog_get_bp(1)", 89 nbblks);
77 XFS_ERRLEVEL_HIGH, log->l_mp); 90 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
78 return NULL; 91 return NULL;
79 } 92 }
80 93
81 if (log->l_sectbb_log) { 94 /*
82 if (nbblks > 1) 95 * We do log I/O in units of log sectors (a power-of-2
83 nbblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); 96 * multiple of the basic block size), so we round up the
 84 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); 97 * requested size to accommodate the basic blocks required
85 } 98 * for complete log sectors.
99 *
100 * In addition, the buffer may be used for a non-sector-
101 * aligned block offset, in which case an I/O of the
102 * requested size could extend beyond the end of the
103 * buffer. If the requested size is only 1 basic block it
104 * will never straddle a sector boundary, so this won't be
105 * an issue. Nor will this be a problem if the log I/O is
106 * done in basic blocks (sector size 1). But otherwise we
107 * extend the buffer by one extra log sector to ensure
 108 * there's space to accommodate this possibility.
109 */
110 if (nbblks > 1 && log->l_sectBBsize > 1)
111 nbblks += log->l_sectBBsize;
112 nbblks = round_up(nbblks, log->l_sectBBsize);
113
86 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp); 114 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
87} 115}
88 116
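To make the sector-rounding logic above concrete, a small standalone sketch of the buffer sizing (plain C with a stand-in round_up; the sector and block counts are example values):

#include <stdio.h>

/* Stand-in for the kernel's round_up(); this version works for any
 * positive multiple, while the kernel macro requires a power of two. */
#define round_up(x, to)	((((x) + (to) - 1) / (to)) * (to))

/* Model of the xlog_get_bp() sizing: pad multi-block reads by one
 * log sector to cover unaligned offsets, then round to whole sectors. */
static int bp_size_bblks(int nbblks, int sect_bbsize)
{
	if (nbblks > 1 && sect_bbsize > 1)
		nbblks += sect_bbsize;
	return round_up(nbblks, sect_bbsize);
}

int main(void)
{
	/* e.g. a 5-block read on a log with 4BB (2KB) sectors */
	printf("%d\n", bp_size_bblks(5, 4));	/* 5 + 4, rounded up -> 12 */
	/* a single-block read never straddles a sector: no padding */
	printf("%d\n", bp_size_bblks(1, 4));	/* rounds up to 4 */
	return 0;
}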
@@ -93,6 +121,10 @@ xlog_put_bp(
93 xfs_buf_free(bp); 121 xfs_buf_free(bp);
94} 122}
95 123
124/*
125 * Return the address of the start of the given block number's data
126 * in a log buffer. The buffer covers a log sector-aligned region.
127 */
96STATIC xfs_caddr_t 128STATIC xfs_caddr_t
97xlog_align( 129xlog_align(
98 xlog_t *log, 130 xlog_t *log,
@@ -100,14 +132,14 @@ xlog_align(
100 int nbblks, 132 int nbblks,
101 xfs_buf_t *bp) 133 xfs_buf_t *bp)
102{ 134{
135 xfs_daddr_t offset;
103 xfs_caddr_t ptr; 136 xfs_caddr_t ptr;
104 137
105 if (!log->l_sectbb_log) 138 offset = blk_no & ((xfs_daddr_t) log->l_sectBBsize - 1);
106 return XFS_BUF_PTR(bp); 139 ptr = XFS_BUF_PTR(bp) + BBTOB(offset);
140
141 ASSERT(ptr + BBTOB(nbblks) <= XFS_BUF_PTR(bp) + XFS_BUF_SIZE(bp));
107 142
108 ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
109 ASSERT(XFS_BUF_SIZE(bp) >=
110 BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
111 return ptr; 143 return ptr;
112} 144}
113 145
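The alignment above relies on l_sectBBsize being a power of two, so masking with (size - 1) yields the offset within a sector; a tiny illustrative sketch (values invented):

#include <stdio.h>

int main(void)
{
	long sect_bbsize = 4;	/* power-of-2 sector size in basic blocks */
	long blk_no = 123;

	/* offset of blk_no within its sector, as xlog_align() computes it */
	long offset = blk_no & (sect_bbsize - 1);	/* 123 mod 4 == 3 */

	printf("offset in sector: %ld\n", offset);
	return 0;
}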
@@ -124,21 +156,18 @@ xlog_bread_noalign(
124{ 156{
125 int error; 157 int error;
126 158
127 if (nbblks <= 0 || nbblks > log->l_logBBsize) { 159 if (!xlog_buf_bbcount_valid(log, nbblks)) {
128 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); 160 xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
129 XFS_ERROR_REPORT("xlog_bread(1)", 161 nbblks);
130 XFS_ERRLEVEL_HIGH, log->l_mp); 162 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
131 return EFSCORRUPTED; 163 return EFSCORRUPTED;
132 } 164 }
133 165
134 if (log->l_sectbb_log) { 166 blk_no = round_down(blk_no, log->l_sectBBsize);
135 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); 167 nbblks = round_up(nbblks, log->l_sectBBsize);
136 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
137 }
138 168
139 ASSERT(nbblks > 0); 169 ASSERT(nbblks > 0);
140 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); 170 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
141 ASSERT(bp);
142 171
143 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); 172 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
144 XFS_BUF_READ(bp); 173 XFS_BUF_READ(bp);
@@ -186,17 +215,15 @@ xlog_bwrite(
186{ 215{
187 int error; 216 int error;
188 217
189 if (nbblks <= 0 || nbblks > log->l_logBBsize) { 218 if (!xlog_buf_bbcount_valid(log, nbblks)) {
190 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); 219 xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
191 XFS_ERROR_REPORT("xlog_bwrite(1)", 220 nbblks);
192 XFS_ERRLEVEL_HIGH, log->l_mp); 221 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
193 return EFSCORRUPTED; 222 return EFSCORRUPTED;
194 } 223 }
195 224
196 if (log->l_sectbb_log) { 225 blk_no = round_down(blk_no, log->l_sectBBsize);
197 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); 226 nbblks = round_up(nbblks, log->l_sectBBsize);
198 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
199 }
200 227
201 ASSERT(nbblks > 0); 228 ASSERT(nbblks > 0);
202 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); 229 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
@@ -327,39 +354,38 @@ xlog_find_cycle_start(
327{ 354{
328 xfs_caddr_t offset; 355 xfs_caddr_t offset;
329 xfs_daddr_t mid_blk; 356 xfs_daddr_t mid_blk;
357 xfs_daddr_t end_blk;
330 uint mid_cycle; 358 uint mid_cycle;
331 int error; 359 int error;
332 360
333 mid_blk = BLK_AVG(first_blk, *last_blk); 361 end_blk = *last_blk;
334 while (mid_blk != first_blk && mid_blk != *last_blk) { 362 mid_blk = BLK_AVG(first_blk, end_blk);
363 while (mid_blk != first_blk && mid_blk != end_blk) {
335 error = xlog_bread(log, mid_blk, 1, bp, &offset); 364 error = xlog_bread(log, mid_blk, 1, bp, &offset);
336 if (error) 365 if (error)
337 return error; 366 return error;
338 mid_cycle = xlog_get_cycle(offset); 367 mid_cycle = xlog_get_cycle(offset);
339 if (mid_cycle == cycle) { 368 if (mid_cycle == cycle)
340 *last_blk = mid_blk; 369 end_blk = mid_blk; /* last_half_cycle == mid_cycle */
341 /* last_half_cycle == mid_cycle */ 370 else
342 } else { 371 first_blk = mid_blk; /* first_half_cycle == mid_cycle */
343 first_blk = mid_blk; 372 mid_blk = BLK_AVG(first_blk, end_blk);
344 /* first_half_cycle == mid_cycle */
345 }
346 mid_blk = BLK_AVG(first_blk, *last_blk);
347 } 373 }
348 ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) || 374 ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
349 (mid_blk == *last_blk && mid_blk-1 == first_blk)); 375 (mid_blk == end_blk && mid_blk-1 == first_blk));
376
377 *last_blk = end_blk;
350 378
351 return 0; 379 return 0;
352} 380}
353 381
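A userspace model of the binary search above may help: cycles[] stands in for reading one block at a time with xlog_bread(), and the loop mirrors the first/end narrowing (the on-disk layout shown is hypothetical):

#include <stdio.h>

#define BLK_AVG(a, b)	(((a) + (b)) >> 1)

/* Model of xlog_find_cycle_start(): binary-search for the first block
 * whose cycle number equals 'cycle'. */
static long find_cycle_start(const unsigned *cycles, long first_blk,
			     long end_blk, unsigned cycle)
{
	long mid_blk = BLK_AVG(first_blk, end_blk);

	while (mid_blk != first_blk && mid_blk != end_blk) {
		if (cycles[mid_blk] == cycle)
			end_blk = mid_blk;	/* match: look earlier */
		else
			first_blk = mid_blk;	/* no match: look later */
		mid_blk = BLK_AVG(first_blk, end_blk);
	}
	return end_blk;
}

int main(void)
{
	/* the cycle number drops from 2 to 1 at block 5 */
	unsigned cycles[] = { 2, 2, 2, 2, 2, 1, 1, 1 };

	printf("%ld\n", find_cycle_start(cycles, 0, 7, 1));	/* prints 5 */
	return 0;
}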
354/* 382/*
355 * Check that the range of blocks does not contain the cycle number 383 * Check that a range of blocks does not contain stop_on_cycle_no.
356 * given. The scan needs to occur from front to back and the ptr into the 384 * Fill in *new_blk with the block offset where such a block is
357 * region must be updated since a later routine will need to perform another 385 * found, or with -1 (an invalid block number) if there is no such
358 * test. If the region is completely good, we end up returning the same 386 * block in the range. The scan needs to occur from front to back
359 * last block number. 387 * and the pointer into the region must be updated since a later
360 * 388 * routine will need to perform another test.
361 * Set blkno to -1 if we encounter no errors. This is an invalid block number
362 * since we don't ever expect logs to get this large.
363 */ 389 */
364STATIC int 390STATIC int
365xlog_find_verify_cycle( 391xlog_find_verify_cycle(
@@ -376,12 +402,16 @@ xlog_find_verify_cycle(
376 xfs_caddr_t buf = NULL; 402 xfs_caddr_t buf = NULL;
377 int error = 0; 403 int error = 0;
378 404
405 /*
406 * Greedily allocate a buffer big enough to handle the full
407 * range of basic blocks we'll be examining. If that fails,
408 * try a smaller size. We need to be able to read at least
409 * a log sector, or we're out of luck.
410 */
379 bufblks = 1 << ffs(nbblks); 411 bufblks = 1 << ffs(nbblks);
380
381 while (!(bp = xlog_get_bp(log, bufblks))) { 412 while (!(bp = xlog_get_bp(log, bufblks))) {
382 /* can't get enough memory to do everything in one big buffer */
383 bufblks >>= 1; 413 bufblks >>= 1;
384 if (bufblks <= log->l_sectbb_log) 414 if (bufblks < log->l_sectBBsize)
385 return ENOMEM; 415 return ENOMEM;
386 } 416 }
387 417
@@ -629,7 +659,7 @@ xlog_find_head(
629 * In this case we want to find the first block with cycle 659 * In this case we want to find the first block with cycle
630 * number matching last_half_cycle. We expect the log to be 660 * number matching last_half_cycle. We expect the log to be
631 * some variation on 661 * some variation on
632 * x + 1 ... | x ... 662 * x + 1 ... | x ... | x
633 * The first block with cycle number x (last_half_cycle) will 663 * The first block with cycle number x (last_half_cycle) will
634 * be where the new head belongs. First we do a binary search 664 * be where the new head belongs. First we do a binary search
635 * for the first occurrence of last_half_cycle. The binary 665 * for the first occurrence of last_half_cycle. The binary
@@ -639,11 +669,13 @@ xlog_find_head(
639 * the log, then we look for occurrences of last_half_cycle - 1 669 * the log, then we look for occurrences of last_half_cycle - 1
640 * at the end of the log. The cases we're looking for look 670 * at the end of the log. The cases we're looking for look
641 * like 671 * like
642 * x + 1 ... | x | x + 1 | x ... 672 * v binary search stopped here
643 * ^ binary search stopped here 673 * x + 1 ... | x | x + 1 | x ... | x
674 * ^ but we want to locate this spot
644 * or 675 * or
645 * x + 1 ... | x ... | x - 1 | x
646 * <---------> less than scan distance 676 * <---------> less than scan distance
677 * x + 1 ... | x ... | x - 1 | x
678 * ^ we want to locate this spot
647 */ 679 */
648 stop_on_cycle = last_half_cycle; 680 stop_on_cycle = last_half_cycle;
649 if ((error = xlog_find_cycle_start(log, bp, first_blk, 681 if ((error = xlog_find_cycle_start(log, bp, first_blk,
@@ -699,16 +731,16 @@ xlog_find_head(
699 * certainly not the head of the log. By searching for 731 * certainly not the head of the log. By searching for
700 * last_half_cycle-1 we accomplish that. 732 * last_half_cycle-1 we accomplish that.
701 */ 733 */
702 start_blk = log_bbnum - num_scan_bblks + head_blk;
703 ASSERT(head_blk <= INT_MAX && 734 ASSERT(head_blk <= INT_MAX &&
704 (xfs_daddr_t) num_scan_bblks - head_blk >= 0); 735 (xfs_daddr_t) num_scan_bblks >= head_blk);
736 start_blk = log_bbnum - (num_scan_bblks - head_blk);
705 if ((error = xlog_find_verify_cycle(log, start_blk, 737 if ((error = xlog_find_verify_cycle(log, start_blk,
706 num_scan_bblks - (int)head_blk, 738 num_scan_bblks - (int)head_blk,
707 (stop_on_cycle - 1), &new_blk))) 739 (stop_on_cycle - 1), &new_blk)))
708 goto bp_err; 740 goto bp_err;
709 if (new_blk != -1) { 741 if (new_blk != -1) {
710 head_blk = new_blk; 742 head_blk = new_blk;
711 goto bad_blk; 743 goto validate_head;
712 } 744 }
713 745
714 /* 746 /*
@@ -726,7 +758,7 @@ xlog_find_head(
726 head_blk = new_blk; 758 head_blk = new_blk;
727 } 759 }
728 760
729 bad_blk: 761validate_head:
730 /* 762 /*
731 * Now we need to make sure head_blk is not pointing to a block in 763 * Now we need to make sure head_blk is not pointing to a block in
732 * the middle of a log record. 764 * the middle of a log record.
@@ -748,7 +780,7 @@ xlog_find_head(
748 if ((error = xlog_find_verify_log_record(log, start_blk, 780 if ((error = xlog_find_verify_log_record(log, start_blk,
749 &head_blk, 0)) == -1) { 781 &head_blk, 0)) == -1) {
750 /* We hit the beginning of the log during our search */ 782 /* We hit the beginning of the log during our search */
751 start_blk = log_bbnum - num_scan_bblks + head_blk; 783 start_blk = log_bbnum - (num_scan_bblks - head_blk);
752 new_blk = log_bbnum; 784 new_blk = log_bbnum;
753 ASSERT(start_blk <= INT_MAX && 785 ASSERT(start_blk <= INT_MAX &&
754 (xfs_daddr_t) log_bbnum-start_blk >= 0); 786 (xfs_daddr_t) log_bbnum-start_blk >= 0);
@@ -833,12 +865,12 @@ xlog_find_tail(
833 if (*head_blk == 0) { /* special case */ 865 if (*head_blk == 0) { /* special case */
834 error = xlog_bread(log, 0, 1, bp, &offset); 866 error = xlog_bread(log, 0, 1, bp, &offset);
835 if (error) 867 if (error)
836 goto bread_err; 868 goto done;
837 869
838 if (xlog_get_cycle(offset) == 0) { 870 if (xlog_get_cycle(offset) == 0) {
839 *tail_blk = 0; 871 *tail_blk = 0;
840 /* leave all other log inited values alone */ 872 /* leave all other log inited values alone */
841 goto exit; 873 goto done;
842 } 874 }
843 } 875 }
844 876
@@ -849,7 +881,7 @@ xlog_find_tail(
849 for (i = (int)(*head_blk) - 1; i >= 0; i--) { 881 for (i = (int)(*head_blk) - 1; i >= 0; i--) {
850 error = xlog_bread(log, i, 1, bp, &offset); 882 error = xlog_bread(log, i, 1, bp, &offset);
851 if (error) 883 if (error)
852 goto bread_err; 884 goto done;
853 885
854 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { 886 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) {
855 found = 1; 887 found = 1;
@@ -866,7 +898,7 @@ xlog_find_tail(
866 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { 898 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
867 error = xlog_bread(log, i, 1, bp, &offset); 899 error = xlog_bread(log, i, 1, bp, &offset);
868 if (error) 900 if (error)
869 goto bread_err; 901 goto done;
870 902
871 if (XLOG_HEADER_MAGIC_NUM == 903 if (XLOG_HEADER_MAGIC_NUM ==
872 be32_to_cpu(*(__be32 *)offset)) { 904 be32_to_cpu(*(__be32 *)offset)) {
@@ -941,7 +973,7 @@ xlog_find_tail(
941 umount_data_blk = (i + hblks) % log->l_logBBsize; 973 umount_data_blk = (i + hblks) % log->l_logBBsize;
942 error = xlog_bread(log, umount_data_blk, 1, bp, &offset); 974 error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
943 if (error) 975 if (error)
944 goto bread_err; 976 goto done;
945 977
946 op_head = (xlog_op_header_t *)offset; 978 op_head = (xlog_op_header_t *)offset;
947 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { 979 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
@@ -987,12 +1019,10 @@ xlog_find_tail(
987 * But... if the -device- itself is readonly, just skip this. 1019 * But... if the -device- itself is readonly, just skip this.
988 * We can't recover this device anyway, so it won't matter. 1020 * We can't recover this device anyway, so it won't matter.
989 */ 1021 */
990 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) { 1022 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
991 error = xlog_clear_stale_blocks(log, tail_lsn); 1023 error = xlog_clear_stale_blocks(log, tail_lsn);
992 }
993 1024
994bread_err: 1025done:
995exit:
996 xlog_put_bp(bp); 1026 xlog_put_bp(bp);
997 1027
998 if (error) 1028 if (error)
@@ -1152,16 +1182,22 @@ xlog_write_log_records(
1152 xfs_caddr_t offset; 1182 xfs_caddr_t offset;
1153 xfs_buf_t *bp; 1183 xfs_buf_t *bp;
1154 int balign, ealign; 1184 int balign, ealign;
1155 int sectbb = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); 1185 int sectbb = log->l_sectBBsize;
1156 int end_block = start_block + blocks; 1186 int end_block = start_block + blocks;
1157 int bufblks; 1187 int bufblks;
1158 int error = 0; 1188 int error = 0;
1159 int i, j = 0; 1189 int i, j = 0;
1160 1190
1191 /*
1192 * Greedily allocate a buffer big enough to handle the full
1193 * range of basic blocks to be written. If that fails, try
1194 * a smaller size. We need to be able to write at least a
1195 * log sector, or we're out of luck.
1196 */
1161 bufblks = 1 << ffs(blocks); 1197 bufblks = 1 << ffs(blocks);
1162 while (!(bp = xlog_get_bp(log, bufblks))) { 1198 while (!(bp = xlog_get_bp(log, bufblks))) {
1163 bufblks >>= 1; 1199 bufblks >>= 1;
1164 if (bufblks <= log->l_sectbb_log) 1200 if (bufblks < sectbb)
1165 return ENOMEM; 1201 return ENOMEM;
1166 } 1202 }
1167 1203
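A rough standalone model of the greedy allocation fallback above (try_alloc() is an invented stand-in for xlog_get_bp(); the memory threshold is arbitrary):

#include <stdio.h>
#include <strings.h>	/* ffs() */

/* Invented stand-in for xlog_get_bp(): pretend only buffers of 64
 * blocks or fewer can be allocated. */
static int try_alloc(int bufblks)
{
	return bufblks <= 64;
}

/* Model of the greedy sizing: start from 1 << ffs(blocks), halve on
 * allocation failure, give up below one log sector. */
static int pick_bufblks(int blocks, int sectbb)
{
	int bufblks = 1 << ffs(blocks);

	while (!try_alloc(bufblks)) {
		bufblks >>= 1;
		if (bufblks < sectbb)
			return -1;	/* the kernel code returns ENOMEM */
	}
	return bufblks;
}

int main(void)
{
	printf("%d\n", pick_bufblks(200, 4));	/* 1 << ffs(200) = 16 */
	return 0;
}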
@@ -1169,7 +1205,7 @@ xlog_write_log_records(
1169 * the buffer in the starting sector not covered by the first 1205 * the buffer in the starting sector not covered by the first
1170 * write below. 1206 * write below.
1171 */ 1207 */
1172 balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block); 1208 balign = round_down(start_block, sectbb);
1173 if (balign != start_block) { 1209 if (balign != start_block) {
1174 error = xlog_bread_noalign(log, start_block, 1, bp); 1210 error = xlog_bread_noalign(log, start_block, 1, bp);
1175 if (error) 1211 if (error)
@@ -1188,7 +1224,7 @@ xlog_write_log_records(
1188 * the buffer in the final sector not covered by the write. 1224 * the buffer in the final sector not covered by the write.
1189 * If this is the same sector as the above read, skip it. 1225 * If this is the same sector as the above read, skip it.
1190 */ 1226 */
1191 ealign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, end_block); 1227 ealign = round_down(end_block, sectbb);
1192 if (j == 0 && (start_block + endcount > ealign)) { 1228 if (j == 0 && (start_block + endcount > ealign)) {
1193 offset = XFS_BUF_PTR(bp); 1229 offset = XFS_BUF_PTR(bp);
1194 balign = BBTOB(ealign - start_block); 1230 balign = BBTOB(ealign - start_block);
@@ -1408,6 +1444,7 @@ xlog_recover_add_item(
1408 1444
1409STATIC int 1445STATIC int
1410xlog_recover_add_to_cont_trans( 1446xlog_recover_add_to_cont_trans(
1447 struct log *log,
1411 xlog_recover_t *trans, 1448 xlog_recover_t *trans,
1412 xfs_caddr_t dp, 1449 xfs_caddr_t dp,
1413 int len) 1450 int len)
@@ -1434,6 +1471,7 @@ xlog_recover_add_to_cont_trans(
1434 memcpy(&ptr[old_len], dp, len); /* d, s, l */ 1471 memcpy(&ptr[old_len], dp, len); /* d, s, l */
1435 item->ri_buf[item->ri_cnt-1].i_len += len; 1472 item->ri_buf[item->ri_cnt-1].i_len += len;
1436 item->ri_buf[item->ri_cnt-1].i_addr = ptr; 1473 item->ri_buf[item->ri_cnt-1].i_addr = ptr;
1474 trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
1437 return 0; 1475 return 0;
1438} 1476}
1439 1477
@@ -1452,6 +1490,7 @@ xlog_recover_add_to_cont_trans(
1452 */ 1490 */
1453STATIC int 1491STATIC int
1454xlog_recover_add_to_trans( 1492xlog_recover_add_to_trans(
1493 struct log *log,
1455 xlog_recover_t *trans, 1494 xlog_recover_t *trans,
1456 xfs_caddr_t dp, 1495 xfs_caddr_t dp,
1457 int len) 1496 int len)
@@ -1510,6 +1549,7 @@ xlog_recover_add_to_trans(
1510 item->ri_buf[item->ri_cnt].i_addr = ptr; 1549 item->ri_buf[item->ri_cnt].i_addr = ptr;
1511 item->ri_buf[item->ri_cnt].i_len = len; 1550 item->ri_buf[item->ri_cnt].i_len = len;
1512 item->ri_cnt++; 1551 item->ri_cnt++;
1552 trace_xfs_log_recover_item_add(log, trans, item, 0);
1513 return 0; 1553 return 0;
1514} 1554}
1515 1555
@@ -1521,7 +1561,9 @@ xlog_recover_add_to_trans(
1521 */ 1561 */
1522STATIC int 1562STATIC int
1523xlog_recover_reorder_trans( 1563xlog_recover_reorder_trans(
1524 xlog_recover_t *trans) 1564 struct log *log,
1565 xlog_recover_t *trans,
1566 int pass)
1525{ 1567{
1526 xlog_recover_item_t *item, *n; 1568 xlog_recover_item_t *item, *n;
1527 LIST_HEAD(sort_list); 1569 LIST_HEAD(sort_list);
@@ -1534,7 +1576,9 @@ xlog_recover_reorder_trans(
1534 1576
1535 switch (ITEM_TYPE(item)) { 1577 switch (ITEM_TYPE(item)) {
1536 case XFS_LI_BUF: 1578 case XFS_LI_BUF:
1537 if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) { 1579 if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1580 trace_xfs_log_recover_item_reorder_head(log,
1581 trans, item, pass);
1538 list_move(&item->ri_list, &trans->r_itemq); 1582 list_move(&item->ri_list, &trans->r_itemq);
1539 break; 1583 break;
1540 } 1584 }
@@ -1543,6 +1587,8 @@ xlog_recover_reorder_trans(
1543 case XFS_LI_QUOTAOFF: 1587 case XFS_LI_QUOTAOFF:
1544 case XFS_LI_EFD: 1588 case XFS_LI_EFD:
1545 case XFS_LI_EFI: 1589 case XFS_LI_EFI:
1590 trace_xfs_log_recover_item_reorder_tail(log,
1591 trans, item, pass);
1546 list_move_tail(&item->ri_list, &trans->r_itemq); 1592 list_move_tail(&item->ri_list, &trans->r_itemq);
1547 break; 1593 break;
1548 default: 1594 default:
@@ -1592,8 +1638,10 @@ xlog_recover_do_buffer_pass1(
1592 /* 1638 /*
1593 * If this isn't a cancel buffer item, then just return. 1639 * If this isn't a cancel buffer item, then just return.
1594 */ 1640 */
1595 if (!(flags & XFS_BLI_CANCEL)) 1641 if (!(flags & XFS_BLF_CANCEL)) {
1642 trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1596 return; 1643 return;
1644 }
1597 1645
1598 /* 1646 /*
1599 * Insert an xfs_buf_cancel record into the hash table of 1647 * Insert an xfs_buf_cancel record into the hash table of
@@ -1627,6 +1675,7 @@ xlog_recover_do_buffer_pass1(
1627 while (nextp != NULL) { 1675 while (nextp != NULL) {
1628 if (nextp->bc_blkno == blkno && nextp->bc_len == len) { 1676 if (nextp->bc_blkno == blkno && nextp->bc_len == len) {
1629 nextp->bc_refcount++; 1677 nextp->bc_refcount++;
1678 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
1630 return; 1679 return;
1631 } 1680 }
1632 prevp = nextp; 1681 prevp = nextp;
@@ -1640,13 +1689,14 @@ xlog_recover_do_buffer_pass1(
1640 bcp->bc_refcount = 1; 1689 bcp->bc_refcount = 1;
1641 bcp->bc_next = NULL; 1690 bcp->bc_next = NULL;
1642 prevp->bc_next = bcp; 1691 prevp->bc_next = bcp;
1692 trace_xfs_log_recover_buf_cancel_add(log, buf_f);
1643} 1693}
1644 1694
1645/* 1695/*
1646 * Check to see whether the buffer being recovered has a corresponding 1696 * Check to see whether the buffer being recovered has a corresponding
1647 * entry in the buffer cancel record table. If it does then return 1 1697 * entry in the buffer cancel record table. If it does then return 1
1648 * so that it will be cancelled, otherwise return 0. If the buffer is 1698 * so that it will be cancelled, otherwise return 0. If the buffer is
1649 * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement 1699 * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement
1650 * the refcount on the entry in the table and remove it from the table 1700 * the refcount on the entry in the table and remove it from the table
1651 * if this is the last reference. 1701 * if this is the last reference.
1652 * 1702 *
@@ -1671,7 +1721,7 @@ xlog_check_buffer_cancelled(
1671 * There is nothing in the table built in pass one, 1721 * There is nothing in the table built in pass one,
1672 * so this buffer must not be cancelled. 1722 * so this buffer must not be cancelled.
1673 */ 1723 */
1674 ASSERT(!(flags & XFS_BLI_CANCEL)); 1724 ASSERT(!(flags & XFS_BLF_CANCEL));
1675 return 0; 1725 return 0;
1676 } 1726 }
1677 1727
@@ -1683,7 +1733,7 @@ xlog_check_buffer_cancelled(
1683 * There is no corresponding entry in the table built 1733 * There is no corresponding entry in the table built
1684 * in pass one, so this buffer has not been cancelled. 1734 * in pass one, so this buffer has not been cancelled.
1685 */ 1735 */
1686 ASSERT(!(flags & XFS_BLI_CANCEL)); 1736 ASSERT(!(flags & XFS_BLF_CANCEL));
1687 return 0; 1737 return 0;
1688 } 1738 }
1689 1739
@@ -1702,7 +1752,7 @@ xlog_check_buffer_cancelled(
1702 * one in the table and remove it if this is the 1752 * one in the table and remove it if this is the
1703 * last reference. 1753 * last reference.
1704 */ 1754 */
1705 if (flags & XFS_BLI_CANCEL) { 1755 if (flags & XFS_BLF_CANCEL) {
1706 bcp->bc_refcount--; 1756 bcp->bc_refcount--;
1707 if (bcp->bc_refcount == 0) { 1757 if (bcp->bc_refcount == 0) {
1708 if (prevp == NULL) { 1758 if (prevp == NULL) {
@@ -1722,7 +1772,7 @@ xlog_check_buffer_cancelled(
1722 * We didn't find a corresponding entry in the table, so 1772 * We didn't find a corresponding entry in the table, so
1723 * return 0 so that the buffer is NOT cancelled. 1773 * return 0 so that the buffer is NOT cancelled.
1724 */ 1774 */
1725 ASSERT(!(flags & XFS_BLI_CANCEL)); 1775 ASSERT(!(flags & XFS_BLF_CANCEL));
1726 return 0; 1776 return 0;
1727} 1777}
1728 1778
@@ -1779,6 +1829,8 @@ xlog_recover_do_inode_buffer(
1779 unsigned int *data_map = NULL; 1829 unsigned int *data_map = NULL;
1780 unsigned int map_size = 0; 1830 unsigned int map_size = 0;
1781 1831
1832 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
1833
1782 switch (buf_f->blf_type) { 1834 switch (buf_f->blf_type) {
1783 case XFS_LI_BUF: 1835 case XFS_LI_BUF:
1784 data_map = buf_f->blf_data_map; 1836 data_map = buf_f->blf_data_map;
@@ -1822,8 +1874,8 @@ xlog_recover_do_inode_buffer(
1822 nbits = xfs_contig_bits(data_map, map_size, 1874 nbits = xfs_contig_bits(data_map, map_size,
1823 bit); 1875 bit);
1824 ASSERT(nbits > 0); 1876 ASSERT(nbits > 0);
1825 reg_buf_offset = bit << XFS_BLI_SHIFT; 1877 reg_buf_offset = bit << XFS_BLF_SHIFT;
1826 reg_buf_bytes = nbits << XFS_BLI_SHIFT; 1878 reg_buf_bytes = nbits << XFS_BLF_SHIFT;
1827 item_index++; 1879 item_index++;
1828 } 1880 }
1829 1881
@@ -1837,7 +1889,7 @@ xlog_recover_do_inode_buffer(
1837 } 1889 }
1838 1890
1839 ASSERT(item->ri_buf[item_index].i_addr != NULL); 1891 ASSERT(item->ri_buf[item_index].i_addr != NULL);
1840 ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0); 1892 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
1841 ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp)); 1893 ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));
1842 1894
1843 /* 1895 /*
@@ -1874,6 +1926,7 @@ xlog_recover_do_inode_buffer(
1874/*ARGSUSED*/ 1926/*ARGSUSED*/
1875STATIC void 1927STATIC void
1876xlog_recover_do_reg_buffer( 1928xlog_recover_do_reg_buffer(
1929 struct xfs_mount *mp,
1877 xlog_recover_item_t *item, 1930 xlog_recover_item_t *item,
1878 xfs_buf_t *bp, 1931 xfs_buf_t *bp,
1879 xfs_buf_log_format_t *buf_f) 1932 xfs_buf_log_format_t *buf_f)
@@ -1885,6 +1938,8 @@ xlog_recover_do_reg_buffer(
1885 unsigned int map_size = 0; 1938 unsigned int map_size = 0;
1886 int error; 1939 int error;
1887 1940
1941 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
1942
1888 switch (buf_f->blf_type) { 1943 switch (buf_f->blf_type) {
1889 case XFS_LI_BUF: 1944 case XFS_LI_BUF:
1890 data_map = buf_f->blf_data_map; 1945 data_map = buf_f->blf_data_map;
@@ -1900,9 +1955,9 @@ xlog_recover_do_reg_buffer(
1900 nbits = xfs_contig_bits(data_map, map_size, bit); 1955 nbits = xfs_contig_bits(data_map, map_size, bit);
1901 ASSERT(nbits > 0); 1956 ASSERT(nbits > 0);
1902 ASSERT(item->ri_buf[i].i_addr != NULL); 1957 ASSERT(item->ri_buf[i].i_addr != NULL);
1903 ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0); 1958 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
1904 ASSERT(XFS_BUF_COUNT(bp) >= 1959 ASSERT(XFS_BUF_COUNT(bp) >=
1905 ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT)); 1960 ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT));
1906 1961
1907 /* 1962 /*
1908 * Do a sanity check if this is a dquot buffer. Just checking 1963 * Do a sanity check if this is a dquot buffer. Just checking
@@ -1911,7 +1966,7 @@ xlog_recover_do_reg_buffer(
1911 */ 1966 */
1912 error = 0; 1967 error = 0;
1913 if (buf_f->blf_flags & 1968 if (buf_f->blf_flags &
1914 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { 1969 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
1915 if (item->ri_buf[i].i_addr == NULL) { 1970 if (item->ri_buf[i].i_addr == NULL) {
1916 cmn_err(CE_ALERT, 1971 cmn_err(CE_ALERT,
1917 "XFS: NULL dquot in %s.", __func__); 1972 "XFS: NULL dquot in %s.", __func__);
@@ -1932,9 +1987,9 @@ xlog_recover_do_reg_buffer(
1932 } 1987 }
1933 1988
1934 memcpy(xfs_buf_offset(bp, 1989 memcpy(xfs_buf_offset(bp,
1935 (uint)bit << XFS_BLI_SHIFT), /* dest */ 1990 (uint)bit << XFS_BLF_SHIFT), /* dest */
1936 item->ri_buf[i].i_addr, /* source */ 1991 item->ri_buf[i].i_addr, /* source */
1937 nbits<<XFS_BLI_SHIFT); /* length */ 1992 nbits<<XFS_BLF_SHIFT); /* length */
1938 next: 1993 next:
1939 i++; 1994 i++;
1940 bit += nbits; 1995 bit += nbits;
@@ -2083,6 +2138,8 @@ xlog_recover_do_dquot_buffer(
2083{ 2138{
2084 uint type; 2139 uint type;
2085 2140
2141 trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
2142
2086 /* 2143 /*
2087 * Filesystems are required to send in quota flags at mount time. 2144 * Filesystems are required to send in quota flags at mount time.
2088 */ 2145 */
@@ -2091,11 +2148,11 @@ xlog_recover_do_dquot_buffer(
2091 } 2148 }
2092 2149
2093 type = 0; 2150 type = 0;
2094 if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF) 2151 if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
2095 type |= XFS_DQ_USER; 2152 type |= XFS_DQ_USER;
2096 if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF) 2153 if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
2097 type |= XFS_DQ_PROJ; 2154 type |= XFS_DQ_PROJ;
2098 if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF) 2155 if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
2099 type |= XFS_DQ_GROUP; 2156 type |= XFS_DQ_GROUP;
2100 /* 2157 /*
2101 * This type of quotas was turned off, so ignore this buffer 2158 * This type of quotas was turned off, so ignore this buffer
@@ -2103,7 +2160,7 @@ xlog_recover_do_dquot_buffer(
2103 if (log->l_quotaoffs_flag & type) 2160 if (log->l_quotaoffs_flag & type)
2104 return; 2161 return;
2105 2162
2106 xlog_recover_do_reg_buffer(item, bp, buf_f); 2163 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2107} 2164}
2108 2165
2109/* 2166/*
@@ -2116,7 +2173,7 @@ xlog_recover_do_dquot_buffer(
2116 * here which overlaps that may be stale. 2173 * here which overlaps that may be stale.
2117 * 2174 *
2118 * When meta-data buffers are freed at run time we log a buffer item 2175 * When meta-data buffers are freed at run time we log a buffer item
2119 * with the XFS_BLI_CANCEL bit set to indicate that previous copies 2176 * with the XFS_BLF_CANCEL bit set to indicate that previous copies
2120 * of the buffer in the log should not be replayed at recovery time. 2177 * of the buffer in the log should not be replayed at recovery time.
2121 * This is so that if the blocks covered by the buffer are reused for 2178 * This is so that if the blocks covered by the buffer are reused for
2122 * file data before we crash we don't end up replaying old, freed 2179 * file data before we crash we don't end up replaying old, freed
@@ -2150,7 +2207,7 @@ xlog_recover_do_buffer_trans(
2150 if (pass == XLOG_RECOVER_PASS1) { 2207 if (pass == XLOG_RECOVER_PASS1) {
2151 /* 2208 /*
2152 * In this pass we're only looking for buf items 2209 * In this pass we're only looking for buf items
2153 * with the XFS_BLI_CANCEL bit set. 2210 * with the XFS_BLF_CANCEL bit set.
2154 */ 2211 */
2155 xlog_recover_do_buffer_pass1(log, buf_f); 2212 xlog_recover_do_buffer_pass1(log, buf_f);
2156 return 0; 2213 return 0;
@@ -2164,9 +2221,11 @@ xlog_recover_do_buffer_trans(
2164 */ 2221 */
2165 cancel = xlog_recover_do_buffer_pass2(log, buf_f); 2222 cancel = xlog_recover_do_buffer_pass2(log, buf_f);
2166 if (cancel) { 2223 if (cancel) {
2224 trace_xfs_log_recover_buf_cancel(log, buf_f);
2167 return 0; 2225 return 0;
2168 } 2226 }
2169 } 2227 }
2228 trace_xfs_log_recover_buf_recover(log, buf_f);
2170 switch (buf_f->blf_type) { 2229 switch (buf_f->blf_type) {
2171 case XFS_LI_BUF: 2230 case XFS_LI_BUF:
2172 blkno = buf_f->blf_blkno; 2231 blkno = buf_f->blf_blkno;
@@ -2185,7 +2244,7 @@ xlog_recover_do_buffer_trans(
2185 2244
2186 mp = log->l_mp; 2245 mp = log->l_mp;
2187 buf_flags = XBF_LOCK; 2246 buf_flags = XBF_LOCK;
2188 if (!(flags & XFS_BLI_INODE_BUF)) 2247 if (!(flags & XFS_BLF_INODE_BUF))
2189 buf_flags |= XBF_MAPPED; 2248 buf_flags |= XBF_MAPPED;
2190 2249
2191 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); 2250 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags);
@@ -2198,13 +2257,13 @@ xlog_recover_do_buffer_trans(
2198 } 2257 }
2199 2258
2200 error = 0; 2259 error = 0;
2201 if (flags & XFS_BLI_INODE_BUF) { 2260 if (flags & XFS_BLF_INODE_BUF) {
2202 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 2261 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2203 } else if (flags & 2262 } else if (flags &
2204 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { 2263 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2205 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 2264 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2206 } else { 2265 } else {
2207 xlog_recover_do_reg_buffer(item, bp, buf_f); 2266 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2208 } 2267 }
2209 if (error) 2268 if (error)
2210 return XFS_ERROR(error); 2269 return XFS_ERROR(error);
@@ -2284,8 +2343,10 @@ xlog_recover_do_inode_trans(
2284 if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno, 2343 if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
2285 in_f->ilf_len, 0)) { 2344 in_f->ilf_len, 0)) {
2286 error = 0; 2345 error = 0;
2346 trace_xfs_log_recover_inode_cancel(log, in_f);
2287 goto error; 2347 goto error;
2288 } 2348 }
2349 trace_xfs_log_recover_inode_recover(log, in_f);
2289 2350
2290 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 2351 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
2291 XBF_LOCK); 2352 XBF_LOCK);
@@ -2337,6 +2398,7 @@ xlog_recover_do_inode_trans(
2337 /* do nothing */ 2398 /* do nothing */
2338 } else { 2399 } else {
2339 xfs_buf_relse(bp); 2400 xfs_buf_relse(bp);
2401 trace_xfs_log_recover_inode_skip(log, in_f);
2340 error = 0; 2402 error = 0;
2341 goto error; 2403 goto error;
2342 } 2404 }
@@ -2758,11 +2820,12 @@ xlog_recover_do_trans(
2758 int error = 0; 2820 int error = 0;
2759 xlog_recover_item_t *item; 2821 xlog_recover_item_t *item;
2760 2822
2761 error = xlog_recover_reorder_trans(trans); 2823 error = xlog_recover_reorder_trans(log, trans, pass);
2762 if (error) 2824 if (error)
2763 return error; 2825 return error;
2764 2826
2765 list_for_each_entry(item, &trans->r_itemq, ri_list) { 2827 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2828 trace_xfs_log_recover_item_recover(log, trans, item, pass);
2766 switch (ITEM_TYPE(item)) { 2829 switch (ITEM_TYPE(item)) {
2767 case XFS_LI_BUF: 2830 case XFS_LI_BUF:
2768 error = xlog_recover_do_buffer_trans(log, item, pass); 2831 error = xlog_recover_do_buffer_trans(log, item, pass);
@@ -2919,8 +2982,9 @@ xlog_recover_process_data(
2919 error = xlog_recover_unmount_trans(trans); 2982 error = xlog_recover_unmount_trans(trans);
2920 break; 2983 break;
2921 case XLOG_WAS_CONT_TRANS: 2984 case XLOG_WAS_CONT_TRANS:
2922 error = xlog_recover_add_to_cont_trans(trans, 2985 error = xlog_recover_add_to_cont_trans(log,
2923 dp, be32_to_cpu(ohead->oh_len)); 2986 trans, dp,
2987 be32_to_cpu(ohead->oh_len));
2924 break; 2988 break;
2925 case XLOG_START_TRANS: 2989 case XLOG_START_TRANS:
2926 xlog_warn( 2990 xlog_warn(
@@ -2930,7 +2994,7 @@ xlog_recover_process_data(
2930 break; 2994 break;
2931 case 0: 2995 case 0:
2932 case XLOG_CONTINUE_TRANS: 2996 case XLOG_CONTINUE_TRANS:
2933 error = xlog_recover_add_to_trans(trans, 2997 error = xlog_recover_add_to_trans(log, trans,
2934 dp, be32_to_cpu(ohead->oh_len)); 2998 dp, be32_to_cpu(ohead->oh_len));
2935 break; 2999 break;
2936 default: 3000 default:
@@ -3331,42 +3395,6 @@ xlog_pack_data(
3331 } 3395 }
3332} 3396}
3333 3397
3334#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
3335STATIC void
3336xlog_unpack_data_checksum(
3337 xlog_rec_header_t *rhead,
3338 xfs_caddr_t dp,
3339 xlog_t *log)
3340{
3341 __be32 *up = (__be32 *)dp;
3342 uint chksum = 0;
3343 int i;
3344
3345 /* divide length by 4 to get # words */
3346 for (i=0; i < be32_to_cpu(rhead->h_len) >> 2; i++) {
3347 chksum ^= be32_to_cpu(*up);
3348 up++;
3349 }
3350 if (chksum != be32_to_cpu(rhead->h_chksum)) {
3351 if (rhead->h_chksum ||
3352 ((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) {
3353 cmn_err(CE_DEBUG,
3354 "XFS: LogR chksum mismatch: was (0x%x) is (0x%x)\n",
3355 be32_to_cpu(rhead->h_chksum), chksum);
3356 cmn_err(CE_DEBUG,
3357"XFS: Disregard message if filesystem was created with non-DEBUG kernel");
3358 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3359 cmn_err(CE_DEBUG,
3360 "XFS: LogR this is a LogV2 filesystem\n");
3361 }
3362 log->l_flags |= XLOG_CHKSUM_MISMATCH;
3363 }
3364 }
3365}
3366#else
3367#define xlog_unpack_data_checksum(rhead, dp, log)
3368#endif
3369
3370STATIC void 3398STATIC void
3371xlog_unpack_data( 3399xlog_unpack_data(
3372 xlog_rec_header_t *rhead, 3400 xlog_rec_header_t *rhead,
@@ -3390,8 +3418,6 @@ xlog_unpack_data(
3390 dp += BBSIZE; 3418 dp += BBSIZE;
3391 } 3419 }
3392 } 3420 }
3393
3394 xlog_unpack_data_checksum(rhead, dp, log);
3395} 3421}
3396 3422
3397STATIC int 3423STATIC int
@@ -3490,7 +3516,7 @@ xlog_do_recovery_pass(
3490 hblks = 1; 3516 hblks = 1;
3491 } 3517 }
3492 } else { 3518 } else {
3493 ASSERT(log->l_sectbb_log == 0); 3519 ASSERT(log->l_sectBBsize == 1);
3494 hblks = 1; 3520 hblks = 1;
3495 hbp = xlog_get_bp(log, 1); 3521 hbp = xlog_get_bp(log, 1);
3496 h_size = XLOG_BIG_RECORD_BSIZE; 3522 h_size = XLOG_BIG_RECORD_BSIZE;
@@ -3946,10 +3972,6 @@ xlog_recover_check_summary(
3946 xfs_agf_t *agfp; 3972 xfs_agf_t *agfp;
3947 xfs_buf_t *agfbp; 3973 xfs_buf_t *agfbp;
3948 xfs_buf_t *agibp; 3974 xfs_buf_t *agibp;
3949 xfs_buf_t *sbbp;
3950#ifdef XFS_LOUD_RECOVERY
3951 xfs_sb_t *sbp;
3952#endif
3953 xfs_agnumber_t agno; 3975 xfs_agnumber_t agno;
3954 __uint64_t freeblks; 3976 __uint64_t freeblks;
3955 __uint64_t itotal; 3977 __uint64_t itotal;
@@ -3984,30 +4006,5 @@ xlog_recover_check_summary(
3984 xfs_buf_relse(agibp); 4006 xfs_buf_relse(agibp);
3985 } 4007 }
3986 } 4008 }
3987
3988 sbbp = xfs_getsb(mp, 0);
3989#ifdef XFS_LOUD_RECOVERY
3990 sbp = &mp->m_sb;
3991 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(sbbp));
3992 cmn_err(CE_NOTE,
3993 "xlog_recover_check_summary: sb_icount %Lu itotal %Lu",
3994 sbp->sb_icount, itotal);
3995 cmn_err(CE_NOTE,
3996 "xlog_recover_check_summary: sb_ifree %Lu itotal %Lu",
3997 sbp->sb_ifree, ifree);
3998 cmn_err(CE_NOTE,
3999 "xlog_recover_check_summary: sb_fdblocks %Lu freeblks %Lu",
4000 sbp->sb_fdblocks, freeblks);
4001#if 0
4002 /*
4003 * This is turned off until I account for the allocation
4004 * btree blocks which live in free space.
4005 */
4006 ASSERT(sbp->sb_icount == itotal);
4007 ASSERT(sbp->sb_ifree == ifree);
4008 ASSERT(sbp->sb_fdblocks == freeblks);
4009#endif
4010#endif
4011 xfs_buf_relse(sbbp);
4012} 4009}
4013#endif /* DEBUG */ 4010#endif /* DEBUG */
diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h
index 75d749207258..1c55ccbb379d 100644
--- a/fs/xfs/xfs_log_recover.h
+++ b/fs/xfs/xfs_log_recover.h
@@ -28,7 +28,7 @@
28#define XLOG_RHASH(tid) \ 28#define XLOG_RHASH(tid) \
29 ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1)) 29 ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1))
30 30
31#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK / 2 + 1) 31#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1)
32 32
33 33
34/* 34/*
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index e79b56b4bca6..d7bf38c8cd1c 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1405,13 +1405,6 @@ xfs_mountfs(
1405 xfs_qm_mount_quotas(mp); 1405 xfs_qm_mount_quotas(mp);
1406 } 1406 }
1407 1407
1408#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
1409 if (XFS_IS_QUOTA_ON(mp))
1410 xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas turned on");
1411 else
1412 xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas not turned on");
1413#endif
1414
1415 /* 1408 /*
1416 * Now we are mounted, reserve a small amount of unused space for 1409 * Now we are mounted, reserve a small amount of unused space for
1417 * privileged transactions. This is needed so that transaction 1410 * privileged transactions. This is needed so that transaction
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 9ff48a16a7ee..1d2c7eed4eda 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -268,6 +268,7 @@ typedef struct xfs_mount {
268#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops 268#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops
269 must be synchronous except 269 must be synchronous except
270 for space allocations */ 270 for space allocations */
271#define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */
271#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */ 272#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */
272#define XFS_MOUNT_WAS_CLEAN (1ULL << 3) 273#define XFS_MOUNT_WAS_CLEAN (1ULL << 3)
273#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem 274#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index fdcab3f81dde..e0e64b113bd6 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -201,9 +201,6 @@ typedef struct xfs_qoff_logformat {
201#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ 201#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */
202#define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */ 202#define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */
203#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ 203#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */
204#define XFS_QMOPT_QUOTAOFF 0x0000080 /* quotas are being turned off */
205#define XFS_QMOPT_UMOUNTING 0x0000100 /* filesys is being unmounted */
206#define XFS_QMOPT_DOLOG 0x0000200 /* log buf changes (in quotacheck) */
207#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ 204#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */
208#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ 205#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */
209#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ 206#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index f73e358bae8d..ce558efa2ea0 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -44,24 +44,14 @@
44#include "xfs_trans_priv.h" 44#include "xfs_trans_priv.h"
45#include "xfs_trans_space.h" 45#include "xfs_trans_space.h"
46#include "xfs_inode_item.h" 46#include "xfs_inode_item.h"
47 47#include "xfs_trace.h"
48
49STATIC void xfs_trans_apply_sb_deltas(xfs_trans_t *);
50STATIC uint xfs_trans_count_vecs(xfs_trans_t *);
51STATIC void xfs_trans_fill_vecs(xfs_trans_t *, xfs_log_iovec_t *);
52STATIC void xfs_trans_uncommit(xfs_trans_t *, uint);
53STATIC void xfs_trans_committed(xfs_trans_t *, int);
54STATIC void xfs_trans_chunk_committed(xfs_log_item_chunk_t *, xfs_lsn_t, int);
55STATIC void xfs_trans_free(xfs_trans_t *);
56 48
57kmem_zone_t *xfs_trans_zone; 49kmem_zone_t *xfs_trans_zone;
58 50
59
60/* 51/*
61 * Reservation functions here avoid a huge stack in xfs_trans_init 52 * Reservation functions here avoid a huge stack in xfs_trans_init
62 * due to register overflow from temporaries in the calculations. 53 * due to register overflow from temporaries in the calculations.
63 */ 54 */
64
65STATIC uint 55STATIC uint
66xfs_calc_write_reservation(xfs_mount_t *mp) 56xfs_calc_write_reservation(xfs_mount_t *mp)
67{ 57{
@@ -254,13 +244,30 @@ _xfs_trans_alloc(
254 tp->t_type = type; 244 tp->t_type = type;
255 tp->t_mountp = mp; 245 tp->t_mountp = mp;
256 tp->t_items_free = XFS_LIC_NUM_SLOTS; 246 tp->t_items_free = XFS_LIC_NUM_SLOTS;
257 tp->t_busy_free = XFS_LBC_NUM_SLOTS;
258 xfs_lic_init(&(tp->t_items)); 247 xfs_lic_init(&(tp->t_items));
259 XFS_LBC_INIT(&(tp->t_busy)); 248 INIT_LIST_HEAD(&tp->t_busy);
260 return tp; 249 return tp;
261} 250}
262 251
263/* 252/*
253 * Free the transaction structure. If there is more clean up
254 * to do when the structure is freed, add it here.
255 */
256STATIC void
257xfs_trans_free(
258 struct xfs_trans *tp)
259{
260 struct xfs_busy_extent *busyp, *n;
261
262 list_for_each_entry_safe(busyp, n, &tp->t_busy, list)
263 xfs_alloc_busy_clear(tp->t_mountp, busyp);
264
265 atomic_dec(&tp->t_mountp->m_active_trans);
266 xfs_trans_free_dqinfo(tp);
267 kmem_zone_free(xfs_trans_zone, tp);
268}
269
270/*
264 * This is called to create a new transaction which will share the 271 * This is called to create a new transaction which will share the
265 * permanent log reservation of the given transaction. The remaining 272 * permanent log reservation of the given transaction. The remaining
266 * unused block and rt extent reservations are also inherited. This 273 * unused block and rt extent reservations are also inherited. This
@@ -283,9 +290,8 @@ xfs_trans_dup(
283 ntp->t_type = tp->t_type; 290 ntp->t_type = tp->t_type;
284 ntp->t_mountp = tp->t_mountp; 291 ntp->t_mountp = tp->t_mountp;
285 ntp->t_items_free = XFS_LIC_NUM_SLOTS; 292 ntp->t_items_free = XFS_LIC_NUM_SLOTS;
286 ntp->t_busy_free = XFS_LBC_NUM_SLOTS;
287 xfs_lic_init(&(ntp->t_items)); 293 xfs_lic_init(&(ntp->t_items));
288 XFS_LBC_INIT(&(ntp->t_busy)); 294 INIT_LIST_HEAD(&ntp->t_busy);
289 295
290 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 296 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
291 ASSERT(tp->t_ticket != NULL); 297 ASSERT(tp->t_ticket != NULL);
@@ -421,7 +427,6 @@ undo_blocks:
421 return error; 427 return error;
422} 428}
423 429
424
425/* 430/*
426 * Record the indicated change to the given field for application 431 * Record the indicated change to the given field for application
427 * to the file system's superblock when the transaction commits. 432 * to the file system's superblock when the transaction commits.
@@ -650,7 +655,7 @@ xfs_trans_apply_sb_deltas(
650 * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we 655 * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we
651 * still need to update the incore superblock with the changes. 656 * still need to update the incore superblock with the changes.
652 */ 657 */
653STATIC void 658void
654xfs_trans_unreserve_and_mod_sb( 659xfs_trans_unreserve_and_mod_sb(
655 xfs_trans_t *tp) 660 xfs_trans_t *tp)
656{ 661{
@@ -764,94 +769,256 @@ xfs_trans_unreserve_and_mod_sb(
764 } 769 }
765} 770}
766 771
772/*
773 * Total up the number of log iovecs needed to commit this
774 * transaction. The transaction itself needs one for the
775 * transaction header. Ask each dirty item in turn how many
776 * it needs to get the total.
777 */
778static uint
779xfs_trans_count_vecs(
780 struct xfs_trans *tp)
781{
782 int nvecs;
783 xfs_log_item_desc_t *lidp;
784
785 nvecs = 1;
786 lidp = xfs_trans_first_item(tp);
787 ASSERT(lidp != NULL);
788
 789	/* In the non-debug case we need to bail out if we didn't
 790	 * find a log item here: return zero and let trans_commit
 791	 * deal with it.
792 */
793 if (lidp == NULL)
794 return 0;
795
796 while (lidp != NULL) {
797 /*
798 * Skip items which aren't dirty in this transaction.
799 */
800 if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
801 lidp = xfs_trans_next_item(tp, lidp);
802 continue;
803 }
804 lidp->lid_size = IOP_SIZE(lidp->lid_item);
805 nvecs += lidp->lid_size;
806 lidp = xfs_trans_next_item(tp, lidp);
807 }
808
809 return nvecs;
810}
767 811
768/* 812/*
769 * xfs_trans_commit 813 * Fill in the vector with pointers to data to be logged
814 * by this transaction. The transaction header takes
815 * the first vector, and then each dirty item takes the
816 * number of vectors it indicated it needed in xfs_trans_count_vecs().
770 * 817 *
771 * Commit the given transaction to the log a/synchronously. 818 * As each item fills in the entries it needs, also pin the item
819 * so that it cannot be flushed out until the log write completes.
820 */
821static void
822xfs_trans_fill_vecs(
823 struct xfs_trans *tp,
824 struct xfs_log_iovec *log_vector)
825{
826 xfs_log_item_desc_t *lidp;
827 struct xfs_log_iovec *vecp;
828 uint nitems;
829
830 /*
831 * Skip over the entry for the transaction header, we'll
832 * fill that in at the end.
833 */
834 vecp = log_vector + 1;
835
836 nitems = 0;
837 lidp = xfs_trans_first_item(tp);
838 ASSERT(lidp);
839 while (lidp) {
840 /* Skip items which aren't dirty in this transaction. */
841 if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
842 lidp = xfs_trans_next_item(tp, lidp);
843 continue;
844 }
845
846 /*
 847		 * An item may be marked dirty yet log nothing; this is used
 848		 * to get a callback when the transaction commits.
849 */
850 if (lidp->lid_size)
851 nitems++;
852 IOP_FORMAT(lidp->lid_item, vecp);
853 vecp += lidp->lid_size;
854 IOP_PIN(lidp->lid_item);
855 lidp = xfs_trans_next_item(tp, lidp);
856 }
857
858 /*
859 * Now that we've counted the number of items in this transaction, fill
860 * in the transaction header. Note that the transaction header does not
861 * have a log item.
862 */
863 tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC;
864 tp->t_header.th_type = tp->t_type;
865 tp->t_header.th_num_items = nitems;
866 log_vector->i_addr = (xfs_caddr_t)&tp->t_header;
867 log_vector->i_len = sizeof(xfs_trans_header_t);
868 log_vector->i_type = XLOG_REG_TYPE_TRANSHDR;
869}
870
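A toy model of the count-then-fill pattern above: one vector for the transaction header plus however many each dirty item reports (the item layout and counts are invented for illustration):

#include <stdio.h>

/* Invented item descriptor: dirty flag plus the number of iovecs the
 * item said it needs (what IOP_SIZE() reports in the kernel). */
struct item {
	int	dirty;
	int	nvecs;
};

/* One vector for the transaction header, plus the vectors of every
 * dirty item -- the same total xfs_trans_count_vecs() computes. */
static int count_vecs(const struct item *items, int n)
{
	int nvecs = 1;

	for (int i = 0; i < n; i++)
		if (items[i].dirty)
			nvecs += items[i].nvecs;
	return nvecs;
}

int main(void)
{
	struct item items[] = { { 1, 2 }, { 0, 3 }, { 1, 1 } };

	printf("%d\n", count_vecs(items, 3));	/* 1 + 2 + 1 = 4 */
	return 0;
}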
871/*
872 * The committed item processing consists of calling the committed routine of
873 * each logged item, updating the item's position in the AIL if necessary, and
874 * unpinning each item. If the committed routine returns -1, then do nothing
875 * further with the item because it may have been freed.
772 * 876 *
773 * XFS disk error handling mechanism is not based on a typical 877 * Since items are unlocked when they are copied to the incore log, it is
774 * transaction abort mechanism. Logically after the filesystem 878 * possible for two transactions to be completing and manipulating the same
775 * gets marked 'SHUTDOWN', we can't let any new transactions 879 * item simultaneously. The AIL lock will protect the lsn field of each item.
776 * be durable - ie. committed to disk - because some metadata might 880 * The value of this field can never go backwards.
777 * be inconsistent. In such cases, this returns an error, and the 881 *
778 * caller may assume that all locked objects joined to the transaction 882 * We unpin the items after repositioning them in the AIL, because otherwise
779 * have already been unlocked as if the commit had succeeded. 883 * they could be immediately flushed and we'd have to race with the flusher
780 * Do not reference the transaction structure after this call. 884 * trying to pull the item from the AIL as we add it.
781 */ 885 */
782 /*ARGSUSED*/ 886void
783int 887xfs_trans_item_committed(
784_xfs_trans_commit( 888 struct xfs_log_item *lip,
785 xfs_trans_t *tp, 889 xfs_lsn_t commit_lsn,
786 uint flags, 890 int aborted)
787 int *log_flushed)
788{ 891{
789 xfs_log_iovec_t *log_vector; 892 xfs_lsn_t item_lsn;
790 int nvec; 893 struct xfs_ail *ailp;
791 xfs_mount_t *mp;
792 xfs_lsn_t commit_lsn;
793 /* REFERENCED */
794 int error;
795 int log_flags;
796 int sync;
797#define XFS_TRANS_LOGVEC_COUNT 16
798 xfs_log_iovec_t log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
799 struct xlog_in_core *commit_iclog;
800 int shutdown;
801 894
802 commit_lsn = -1; 895 if (aborted)
896 lip->li_flags |= XFS_LI_ABORTED;
897 item_lsn = IOP_COMMITTED(lip, commit_lsn);
898
899 /* If the committed routine returns -1, item has been freed. */
900 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
901 return;
803 902
804 /* 903 /*
805 * Determine whether this commit is releasing a permanent 904 * If the returned lsn is greater than what it contained before, update
806 * log reservation or not. 905 * the location of the item in the AIL. If it is not, then do nothing.
906 * Items can never move backwards in the AIL.
907 *
908 * While the new lsn should usually be greater, it is possible that a
909 * later transaction completing simultaneously with an earlier one
910 * using the same item could complete first with a higher lsn. This
911 * would cause the earlier transaction to fail the test below.
807 */ 912 */
808 if (flags & XFS_TRANS_RELEASE_LOG_RES) { 913 ailp = lip->li_ailp;
809 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 914 spin_lock(&ailp->xa_lock);
810 log_flags = XFS_LOG_REL_PERM_RESERV; 915 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
916 /*
917 * This will set the item's lsn to item_lsn and update the
918 * position of the item in the AIL.
919 *
920 * xfs_trans_ail_update() drops the AIL lock.
921 */
922 xfs_trans_ail_update(ailp, lip, item_lsn);
811 } else { 923 } else {
812 log_flags = 0; 924 spin_unlock(&ailp->xa_lock);
813 } 925 }
814 mp = tp->t_mountp;
815 926
816 /* 927 /*
817 * If there is nothing to be logged by the transaction, 928 * Now that we've repositioned the item in the AIL, unpin it so it can
818 * then unlock all of the items associated with the 929 * be flushed. Pass information about buffer stale state down from the
819 * transaction and free the transaction structure. 930 * log item flags, if anyone else stales the buffer we do not want to
820 * Also make sure to return any reserved blocks to 931 * pay any attention to it.
821 * the free pool.
822 */ 932 */
823shut_us_down: 933 IOP_UNPIN(lip);
824 shutdown = XFS_FORCED_SHUTDOWN(mp) ? EIO : 0; 934}
825 if (!(tp->t_flags & XFS_TRANS_DIRTY) || shutdown) { 935
826 xfs_trans_unreserve_and_mod_sb(tp); 936/*
 937 * This is typically called by the log manager when a transaction has been fully
938 * committed to disk. It needs to unpin the items which have
939 * been logged by the transaction and update their positions
940 * in the AIL if necessary.
941 *
 942 * This also gets called when a transaction didn't get written out
 943 * because of an I/O error; in that case the abort flag (XFS_LI_ABORTED) is set.
944 */
945STATIC void
946xfs_trans_committed(
947 struct xfs_trans *tp,
948 int abortflag)
949{
950 xfs_log_item_desc_t *lidp;
951 xfs_log_item_chunk_t *licp;
952 xfs_log_item_chunk_t *next_licp;
953
954 /* Call the transaction's completion callback if there is one. */
955 if (tp->t_callback != NULL)
956 tp->t_callback(tp, tp->t_callarg);
957
958 for (lidp = xfs_trans_first_item(tp);
959 lidp != NULL;
960 lidp = xfs_trans_next_item(tp, lidp)) {
961 xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag);
962 }
963
964 /* free the item chunks, ignoring the embedded chunk */
965 for (licp = tp->t_items.lic_next; licp != NULL; licp = next_licp) {
966 next_licp = licp->lic_next;
967 kmem_free(licp);
968 }
969
970 xfs_trans_free(tp);
971}
972
973/*
974 * Called from the trans_commit code when we notice that
975 * the filesystem is in the middle of a forced shutdown.
976 */
977STATIC void
978xfs_trans_uncommit(
979 struct xfs_trans *tp,
980 uint flags)
981{
982 xfs_log_item_desc_t *lidp;
983
984 for (lidp = xfs_trans_first_item(tp);
985 lidp != NULL;
986 lidp = xfs_trans_next_item(tp, lidp)) {
827 /* 987 /*
828 * It is indeed possible for the transaction to be 988 * Unpin all but those that aren't dirty.
829 * not dirty but the dqinfo portion to be. All that
830 * means is that we have some (non-persistent) quota
831 * reservations that need to be unreserved.
832 */ 989 */
833 xfs_trans_unreserve_and_mod_dquots(tp); 990 if (lidp->lid_flags & XFS_LID_DIRTY)
834 if (tp->t_ticket) { 991 IOP_UNPIN_REMOVE(lidp->lid_item, tp);
835 commit_lsn = xfs_log_done(mp, tp->t_ticket,
836 NULL, log_flags);
837 if (commit_lsn == -1 && !shutdown)
838 shutdown = XFS_ERROR(EIO);
839 }
840 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
841 xfs_trans_free_items(tp, shutdown? XFS_TRANS_ABORT : 0);
842 xfs_trans_free_busy(tp);
843 xfs_trans_free(tp);
844 XFS_STATS_INC(xs_trans_empty);
845 return (shutdown);
846 } 992 }
847 ASSERT(tp->t_ticket != NULL);
848 993
849 /* 994 xfs_trans_unreserve_and_mod_sb(tp);
850 * If we need to update the superblock, then do it now. 995 xfs_trans_unreserve_and_mod_dquots(tp);
851 */ 996
852 if (tp->t_flags & XFS_TRANS_SB_DIRTY) 997 xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
853 xfs_trans_apply_sb_deltas(tp); 998 xfs_trans_free(tp);
854 xfs_trans_apply_dquot_deltas(tp); 999}
1000
1001/*
1002 * Format the transaction direct to the iclog. This isolates the physical
1003 * transaction commit operation from the logical operation and hence allows
1004 * other methods to be introduced without affecting the existing commit path.
1005 */
1006static int
1007xfs_trans_commit_iclog(
1008 struct xfs_mount *mp,
1009 struct xfs_trans *tp,
1010 xfs_lsn_t *commit_lsn,
1011 int flags)
1012{
1013 int shutdown;
1014 int error;
1015 int log_flags = 0;
1016 struct xlog_in_core *commit_iclog;
1017#define XFS_TRANS_LOGVEC_COUNT 16
1018 struct xfs_log_iovec log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
1019 struct xfs_log_iovec *log_vector;
1020 uint nvec;
1021
855 1022
856 /* 1023 /*
857 * Ask each log item how many log_vector entries it will 1024 * Ask each log item how many log_vector entries it will
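The forward-only rule in the new xfs_trans_item_committed() above is the subtle part: an item's LSN may only move forward in the AIL, even when two transactions complete against the same item out of order. The following stand-alone sketch models just the XFS_LSN_CMP test (LSNs reduced to signed 64-bit integers; the AIL lock and list handling are deliberately elided, and all names here are illustrative stubs, not kernel code):

#include <assert.h>
#include <stdint.h>

typedef int64_t lsn_t;			/* stand-in for xfs_lsn_t */

/*
 * Model of the forward-only LSN update: return the LSN an item ends up
 * with after a commit completes at commit_lsn.  The item only moves if
 * the new LSN is strictly greater, mirroring the XFS_LSN_CMP test.
 */
static lsn_t item_committed_lsn(lsn_t item_lsn, lsn_t commit_lsn)
{
	return commit_lsn > item_lsn ? commit_lsn : item_lsn;
}

int main(void)
{
	lsn_t lsn = 100;

	lsn = item_committed_lsn(lsn, 150);	/* later commit: moves forward */
	assert(lsn == 150);
	lsn = item_committed_lsn(lsn, 120);	/* earlier commit finishing late: no move */
	assert(lsn == 150);
	return 0;
}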
@@ -861,8 +1028,7 @@ shut_us_down:
 	 */
 	nvec = xfs_trans_count_vecs(tp);
 	if (nvec == 0) {
-		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
-		goto shut_us_down;
+		return ENOMEM;	/* triggers a shutdown! */
 	} else if (nvec <= XFS_TRANS_LOGVEC_COUNT) {
 		log_vector = log_vector_fast;
 	} else {
@@ -877,6 +1043,9 @@ shut_us_down:
 	 */
 	xfs_trans_fill_vecs(tp, log_vector);
 
+	if (flags & XFS_TRANS_RELEASE_LOG_RES)
+		log_flags = XFS_LOG_REL_PERM_RESERV;
+
 	error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket, &(tp->t_lsn));
 
 	/*
@@ -884,18 +1053,19 @@ shut_us_down:
 	 * at any time after this call. However, all the items associated
 	 * with the transaction are still locked and pinned in memory.
 	 */
-	commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags);
+	*commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags);
 
-	tp->t_commit_lsn = commit_lsn;
-	if (nvec > XFS_TRANS_LOGVEC_COUNT) {
+	tp->t_commit_lsn = *commit_lsn;
+	trace_xfs_trans_commit_lsn(tp);
+
+	if (nvec > XFS_TRANS_LOGVEC_COUNT)
 		kmem_free(log_vector);
-	}
 
 	/*
 	 * If we got a log write error. Unpin the logitems that we
 	 * had pinned, clean up, free trans structure, and return error.
 	 */
-	if (error || commit_lsn == -1) {
+	if (error || *commit_lsn == -1) {
 		current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 		xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT);
 		return XFS_ERROR(EIO);
@@ -909,8 +1079,6 @@ shut_us_down:
 	 */
 	xfs_trans_unreserve_and_mod_sb(tp);
 
-	sync = tp->t_flags & XFS_TRANS_SYNC;
-
 	/*
 	 * Tell the LM to call the transaction completion routine
 	 * when the log write with LSN commit_lsn completes (e.g.
@@ -953,7 +1121,7 @@ shut_us_down:
 	 * the commit lsn of this transaction for dependency tracking
 	 * purposes.
 	 */
-	xfs_trans_unlock_items(tp, commit_lsn);
+	xfs_trans_unlock_items(tp, *commit_lsn);
 
 	/*
 	 * If we detected a log error earlier, finish committing
@@ -973,156 +1141,204 @@ shut_us_down:
  * and the items are released we can finally allow the iclog to
  * go to disk.
  */
-	error = xfs_log_release_iclog(mp, commit_iclog);
-
-	/*
-	 * If the transaction needs to be synchronous, then force the
-	 * log out now and wait for it.
-	 */
-	if (sync) {
-		if (!error) {
-			error = _xfs_log_force_lsn(mp, commit_lsn,
-				      XFS_LOG_SYNC, log_flushed);
-		}
-		XFS_STATS_INC(xs_trans_sync);
-	} else {
-		XFS_STATS_INC(xs_trans_async);
-	}
-
-	return (error);
+	return xfs_log_release_iclog(mp, commit_iclog);
 }
 
-
 /*
- * Total up the number of log iovecs needed to commit this
- * transaction.  The transaction itself needs one for the
- * transaction header.  Ask each dirty item in turn how many
- * it needs to get the total.
+ * Walk the log items and allocate log vector structures for
+ * each item large enough to fit all the vectors they require.
+ * Note that this format differs from the old log vector format in
+ * that there is no transaction header in these log vectors.
  */
-STATIC uint
-xfs_trans_count_vecs(
+STATIC struct xfs_log_vec *
+xfs_trans_alloc_log_vecs(
 	xfs_trans_t	*tp)
 {
-	int			nvecs;
 	xfs_log_item_desc_t	*lidp;
+	struct xfs_log_vec	*lv = NULL;
+	struct xfs_log_vec	*ret_lv = NULL;
 
-	nvecs = 1;
 	lidp = xfs_trans_first_item(tp);
-	ASSERT(lidp != NULL);
 
-	/* In the non-debug case we need to start bailing out if we
-	 * didn't find a log_item here, return zero and let trans_commit
-	 * deal with it.
-	 */
-	if (lidp == NULL)
-		return 0;
+	/* Bail out if we didn't find a log item. */
+	if (!lidp) {
+		ASSERT(0);
+		return NULL;
+	}
 
 	while (lidp != NULL) {
-		/*
-		 * Skip items which aren't dirty in this transaction.
-		 */
+		struct xfs_log_vec *new_lv;
+
+		/* Skip items which aren't dirty in this transaction. */
 		if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
 			lidp = xfs_trans_next_item(tp, lidp);
 			continue;
 		}
+
+		/* Skip items that do not have any vectors for writing */
 		lidp->lid_size = IOP_SIZE(lidp->lid_item);
-		nvecs += lidp->lid_size;
+		if (!lidp->lid_size) {
+			lidp = xfs_trans_next_item(tp, lidp);
+			continue;
+		}
+
+		new_lv = kmem_zalloc(sizeof(*new_lv) +
+				lidp->lid_size * sizeof(struct xfs_log_iovec),
+				KM_SLEEP);
+
+		/* The allocated iovec region lies beyond the log vector. */
+		new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
+		new_lv->lv_niovecs = lidp->lid_size;
+		new_lv->lv_item = lidp->lid_item;
+		if (!ret_lv)
+			ret_lv = new_lv;
+		else
+			lv->lv_next = new_lv;
+		lv = new_lv;
 		lidp = xfs_trans_next_item(tp, lidp);
 	}
 
-	return nvecs;
+	return ret_lv;
 }
 
-/*
- * Called from the trans_commit code when we notice that
- * the filesystem is in the middle of a forced shutdown.
- */
-STATIC void
-xfs_trans_uncommit(
-	xfs_trans_t		*tp,
-	uint			flags)
+static int
+xfs_trans_commit_cil(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_lsn_t		*commit_lsn,
+	int			flags)
 {
-	xfs_log_item_desc_t	*lidp;
+	struct xfs_log_vec	*log_vector;
+	int			error;
 
-	for (lidp = xfs_trans_first_item(tp);
-	     lidp != NULL;
-	     lidp = xfs_trans_next_item(tp, lidp)) {
-		/*
-		 * Unpin all but those that aren't dirty.
-		 */
-		if (lidp->lid_flags & XFS_LID_DIRTY)
-			IOP_UNPIN_REMOVE(lidp->lid_item, tp);
-	}
+	/*
+	 * Get each log item to allocate a vector structure for
+	 * the log item to pass to the log write code. The
+	 * CIL commit code will format the vector and save it away.
+	 */
+	log_vector = xfs_trans_alloc_log_vecs(tp);
+	if (!log_vector)
+		return ENOMEM;
 
-	xfs_trans_unreserve_and_mod_sb(tp);
-	xfs_trans_unreserve_and_mod_dquots(tp);
+	error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
+	if (error)
+		return error;
 
-	xfs_trans_free_items(tp, flags);
-	xfs_trans_free_busy(tp);
+	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+
+	/* xfs_trans_free_items() unlocks them first */
+	xfs_trans_free_items(tp, *commit_lsn, 0);
 	xfs_trans_free(tp);
+	return 0;
 }
 
 /*
- * Fill in the vector with pointers to data to be logged
- * by this transaction.  The transaction header takes
- * the first vector, and then each dirty item takes the
- * number of vectors it indicated it needed in xfs_trans_count_vecs().
+ * xfs_trans_commit
  *
- * As each item fills in the entries it needs, also pin the item
- * so that it cannot be flushed out until the log write completes.
+ * Commit the given transaction to the log a/synchronously.
+ *
+ * XFS disk error handling mechanism is not based on a typical
+ * transaction abort mechanism. Logically after the filesystem
+ * gets marked 'SHUTDOWN', we can't let any new transactions
+ * be durable - ie. committed to disk - because some metadata might
+ * be inconsistent. In such cases, this returns an error, and the
+ * caller may assume that all locked objects joined to the transaction
+ * have already been unlocked as if the commit had succeeded.
+ * Do not reference the transaction structure after this call.
  */
-STATIC void
-xfs_trans_fill_vecs(
-	xfs_trans_t		*tp,
-	xfs_log_iovec_t		*log_vector)
+int
+_xfs_trans_commit(
+	struct xfs_trans	*tp,
+	uint			flags,
+	int			*log_flushed)
 {
-	xfs_log_item_desc_t	*lidp;
-	xfs_log_iovec_t		*vecp;
-	uint			nitems;
+	struct xfs_mount	*mp = tp->t_mountp;
+	xfs_lsn_t		commit_lsn = -1;
+	int			error = 0;
+	int			log_flags = 0;
+	int			sync = tp->t_flags & XFS_TRANS_SYNC;
 
 	/*
-	 * Skip over the entry for the transaction header, we'll
-	 * fill that in at the end.
+	 * Determine whether this commit is releasing a permanent
+	 * log reservation or not.
 	 */
-	vecp = log_vector + 1;		/* pointer arithmetic */
+	if (flags & XFS_TRANS_RELEASE_LOG_RES) {
+		ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+		log_flags = XFS_LOG_REL_PERM_RESERV;
+	}
 
-	nitems = 0;
-	lidp = xfs_trans_first_item(tp);
-	ASSERT(lidp != NULL);
-	while (lidp != NULL) {
-		/*
-		 * Skip items which aren't dirty in this transaction.
-		 */
-		if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
-			lidp = xfs_trans_next_item(tp, lidp);
-			continue;
-		}
-		/*
-		 * The item may be marked dirty but not log anything.
-		 * This can be used to get called when a transaction
-		 * is committed.
-		 */
-		if (lidp->lid_size) {
-			nitems++;
+	/*
+	 * If there is nothing to be logged by the transaction,
+	 * then unlock all of the items associated with the
+	 * transaction and free the transaction structure.
+	 * Also make sure to return any reserved blocks to
+	 * the free pool.
+	 */
+	if (!(tp->t_flags & XFS_TRANS_DIRTY))
+		goto out_unreserve;
+
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		error = XFS_ERROR(EIO);
+		goto out_unreserve;
+	}
+
+	ASSERT(tp->t_ticket != NULL);
+
+	/*
+	 * If we need to update the superblock, then do it now.
+	 */
+	if (tp->t_flags & XFS_TRANS_SB_DIRTY)
+		xfs_trans_apply_sb_deltas(tp);
+	xfs_trans_apply_dquot_deltas(tp);
+
+	if (mp->m_flags & XFS_MOUNT_DELAYLOG)
+		error = xfs_trans_commit_cil(mp, tp, &commit_lsn, flags);
+	else
+		error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags);
+
+	if (error == ENOMEM) {
+		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
+		error = XFS_ERROR(EIO);
+		goto out_unreserve;
+	}
+
+	/*
+	 * If the transaction needs to be synchronous, then force the
+	 * log out now and wait for it.
+	 */
+	if (sync) {
+		if (!error) {
+			error = _xfs_log_force_lsn(mp, commit_lsn,
+				      XFS_LOG_SYNC, log_flushed);
 		}
-		IOP_FORMAT(lidp->lid_item, vecp);
-		vecp += lidp->lid_size;	/* pointer arithmetic */
-		IOP_PIN(lidp->lid_item);
-		lidp = xfs_trans_next_item(tp, lidp);
+		XFS_STATS_INC(xs_trans_sync);
+	} else {
+		XFS_STATS_INC(xs_trans_async);
 	}
 
+	return error;
+
+out_unreserve:
+	xfs_trans_unreserve_and_mod_sb(tp);
+
 	/*
-	 * Now that we've counted the number of items in this
-	 * transaction, fill in the transaction header.
+	 * It is indeed possible for the transaction to be not dirty but
+	 * the dqinfo portion to be. All that means is that we have some
+	 * (non-persistent) quota reservations that need to be unreserved.
 	 */
-	tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC;
-	tp->t_header.th_type = tp->t_type;
-	tp->t_header.th_num_items = nitems;
-	log_vector->i_addr = (xfs_caddr_t)&tp->t_header;
-	log_vector->i_len = sizeof(xfs_trans_header_t);
-	log_vector->i_type = XLOG_REG_TYPE_TRANSHDR;
-}
+	xfs_trans_unreserve_and_mod_dquots(tp);
+	if (tp->t_ticket) {
+		commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+		if (commit_lsn == -1 && !error)
+			error = XFS_ERROR(EIO);
+	}
+	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+	xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0);
+	xfs_trans_free(tp);
 
+	XFS_STATS_INC(xs_trans_empty);
+	return error;
+}
 
 /*
  * Unlock all of the transaction's items and free the transaction.
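xfs_trans_alloc_log_vecs() above sizes each allocation to hold the xfs_log_vec header plus its trailing iovec array in a single kmem_zalloc(), then points lv_iovecp just past the header. A minimal user-space model of that single-allocation layout may help make the pointer arithmetic concrete (the stub types and names below are hypothetical, not the kernel's):

#include <stdio.h>
#include <stdlib.h>

struct iovec_stub {			/* stand-in for struct xfs_log_iovec */
	void	*addr;
	int	len;
};

struct log_vec_stub {			/* stand-in for struct xfs_log_vec */
	struct log_vec_stub	*next;
	struct iovec_stub	*iovecp;	/* points into the same allocation */
	int			niovecs;
};

/* One zeroed allocation covers the header and its trailing iovec array. */
static struct log_vec_stub *alloc_log_vec(int niovecs)
{
	struct log_vec_stub *lv;

	lv = calloc(1, sizeof(*lv) + niovecs * sizeof(struct iovec_stub));
	if (!lv)
		return NULL;
	lv->iovecp = (struct iovec_stub *)&lv[1];	/* region beyond the header */
	lv->niovecs = niovecs;
	return lv;
}

int main(void)
{
	struct log_vec_stub *lv = alloc_log_vec(4);

	if (!lv)
		return 1;
	printf("header %p, iovecs start %p\n", (void *)lv, (void *)lv->iovecp);
	free(lv);			/* one free releases both regions */
	return 0;
}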
@@ -1195,25 +1411,10 @@ xfs_trans_cancel(
 	/* mark this thread as no longer being in a transaction */
 	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 
-	xfs_trans_free_items(tp, flags);
-	xfs_trans_free_busy(tp);
+	xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
 	xfs_trans_free(tp);
 }
 
-
-/*
- * Free the transaction structure.  If there is more clean up
- * to do when the structure is freed, add it here.
- */
-STATIC void
-xfs_trans_free(
-	xfs_trans_t	*tp)
-{
-	atomic_dec(&tp->t_mountp->m_active_trans);
-	xfs_trans_free_dqinfo(tp);
-	kmem_zone_free(xfs_trans_zone, tp);
-}
-
 /*
  * Roll from one trans in the sequence of PERMANENT transactions to
  * the next: permanent transactions are only flushed out when
@@ -1283,174 +1484,3 @@ xfs_trans_roll(
 	xfs_trans_ihold(trans, dp);
 	return 0;
 }
1286
1287/*
1288 * THIS SHOULD BE REWRITTEN TO USE xfs_trans_next_item().
1289 *
1290 * This is typically called by the LM when a transaction has been fully
1291 * committed to disk. It needs to unpin the items which have
1292 * been logged by the transaction and update their positions
1293 * in the AIL if necessary.
1294 * This also gets called when the transactions didn't get written out
1295 * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then.
1296 *
1297 * Call xfs_trans_chunk_committed() to process the items in
1298 * each chunk.
1299 */
1300STATIC void
1301xfs_trans_committed(
1302 xfs_trans_t *tp,
1303 int abortflag)
1304{
1305 xfs_log_item_chunk_t *licp;
1306 xfs_log_item_chunk_t *next_licp;
1307 xfs_log_busy_chunk_t *lbcp;
1308 xfs_log_busy_slot_t *lbsp;
1309 int i;
1310
1311 /*
1312 * Call the transaction's completion callback if there
1313 * is one.
1314 */
1315 if (tp->t_callback != NULL) {
1316 tp->t_callback(tp, tp->t_callarg);
1317 }
1318
1319 /*
1320 * Special case the chunk embedded in the transaction.
1321 */
1322 licp = &(tp->t_items);
1323 if (!(xfs_lic_are_all_free(licp))) {
1324 xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
1325 }
1326
1327 /*
1328 * Process the items in each chunk in turn.
1329 */
1330 licp = licp->lic_next;
1331 while (licp != NULL) {
1332 ASSERT(!xfs_lic_are_all_free(licp));
1333 xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
1334 next_licp = licp->lic_next;
1335 kmem_free(licp);
1336 licp = next_licp;
1337 }
1338
1339 /*
1340 * Clear all the per-AG busy list items listed in this transaction
1341 */
1342 lbcp = &tp->t_busy;
1343 while (lbcp != NULL) {
1344 for (i = 0, lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) {
1345 if (!XFS_LBC_ISFREE(lbcp, i)) {
1346 xfs_alloc_clear_busy(tp, lbsp->lbc_ag,
1347 lbsp->lbc_idx);
1348 }
1349 }
1350 lbcp = lbcp->lbc_next;
1351 }
1352 xfs_trans_free_busy(tp);
1353
1354 /*
1355 * That's it for the transaction structure. Free it.
1356 */
1357 xfs_trans_free(tp);
1358}
1359
1360/*
1361 * This is called to perform the commit processing for each
1362 * item described by the given chunk.
1363 *
1364 * The commit processing consists of unlocking items which were
1365 * held locked with the SYNC_UNLOCK attribute, calling the committed
1366 * routine of each logged item, updating the item's position in the AIL
1367 * if necessary, and unpinning each item. If the committed routine
1368 * returns -1, then do nothing further with the item because it
1369 * may have been freed.
1370 *
1371 * Since items are unlocked when they are copied to the incore
1372 * log, it is possible for two transactions to be completing
1373 * and manipulating the same item simultaneously. The AIL lock
1374 * will protect the lsn field of each item. The value of this
1375 * field can never go backwards.
1376 *
1377 * We unpin the items after repositioning them in the AIL, because
1378 * otherwise they could be immediately flushed and we'd have to race
1379 * with the flusher trying to pull the item from the AIL as we add it.
1380 */
1381STATIC void
1382xfs_trans_chunk_committed(
1383 xfs_log_item_chunk_t *licp,
1384 xfs_lsn_t lsn,
1385 int aborted)
1386{
1387 xfs_log_item_desc_t *lidp;
1388 xfs_log_item_t *lip;
1389 xfs_lsn_t item_lsn;
1390 int i;
1391
1392 lidp = licp->lic_descs;
1393 for (i = 0; i < licp->lic_unused; i++, lidp++) {
1394 struct xfs_ail *ailp;
1395
1396 if (xfs_lic_isfree(licp, i)) {
1397 continue;
1398 }
1399
1400 lip = lidp->lid_item;
1401 if (aborted)
1402 lip->li_flags |= XFS_LI_ABORTED;
1403
1404 /*
1405 * Send in the ABORTED flag to the COMMITTED routine
1406 * so that it knows whether the transaction was aborted
1407 * or not.
1408 */
1409 item_lsn = IOP_COMMITTED(lip, lsn);
1410
1411 /*
1412 * If the committed routine returns -1, make
1413 * no more references to the item.
1414 */
1415 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) {
1416 continue;
1417 }
1418
1419 /*
1420 * If the returned lsn is greater than what it
1421 * contained before, update the location of the
1422 * item in the AIL. If it is not, then do nothing.
1423 * Items can never move backwards in the AIL.
1424 *
1425 * While the new lsn should usually be greater, it
1426 * is possible that a later transaction completing
1427 * simultaneously with an earlier one using the
1428 * same item could complete first with a higher lsn.
1429 * This would cause the earlier transaction to fail
1430 * the test below.
1431 */
1432 ailp = lip->li_ailp;
1433 spin_lock(&ailp->xa_lock);
1434 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
1435 /*
1436 * This will set the item's lsn to item_lsn
1437 * and update the position of the item in
1438 * the AIL.
1439 *
1440 * xfs_trans_ail_update() drops the AIL lock.
1441 */
1442 xfs_trans_ail_update(ailp, lip, item_lsn);
1443 } else {
1444 spin_unlock(&ailp->xa_lock);
1445 }
1446
1447 /*
1448 * Now that we've repositioned the item in the AIL,
1449 * unpin it so it can be flushed. Pass information
1450 * about buffer stale state down from the log item
1451 * flags, if anyone else stales the buffer we do not
1452 * want to pay any attention to it.
1453 */
1454 IOP_UNPIN(lip, lidp->lid_flags & XFS_LID_BUF_STALE);
1455 }
1456}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 79c8bab9dfff..8c69e7824f68 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -49,6 +49,15 @@ typedef struct xfs_trans_header {
 #define XFS_LI_DQUOT		0x123d
 #define XFS_LI_QUOTAOFF		0x123e
 
+#define XFS_LI_TYPE_DESC \
+	{ XFS_LI_EFI,		"XFS_LI_EFI" }, \
+	{ XFS_LI_EFD,		"XFS_LI_EFD" }, \
+	{ XFS_LI_IUNLINK,	"XFS_LI_IUNLINK" }, \
+	{ XFS_LI_INODE,		"XFS_LI_INODE" }, \
+	{ XFS_LI_BUF,		"XFS_LI_BUF" }, \
+	{ XFS_LI_DQUOT,		"XFS_LI_DQUOT" }, \
+	{ XFS_LI_QUOTAOFF,	"XFS_LI_QUOTAOFF" }
+
 /*
  * Transaction types.  Used to distinguish types of buffers.
  */
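The XFS_LI_TYPE_DESC table added above pairs each log item type with a printable name; in the kernel such value/name tables are typically consumed by trace formatting helpers. A stand-alone sketch of the same lookup pattern (values copied from the table above, helper names invented here for illustration):

#include <stdio.h>

/* Value/name pairs in the same shape as the XFS_LI_TYPE_DESC table. */
static const struct { unsigned val; const char *name; } li_types[] = {
	{ 0x123d, "XFS_LI_DQUOT" },
	{ 0x123e, "XFS_LI_QUOTAOFF" },
};

static const char *li_type_name(unsigned val)
{
	for (size_t i = 0; i < sizeof(li_types) / sizeof(li_types[0]); i++)
		if (li_types[i].val == val)
			return li_types[i].name;
	return "UNKNOWN";
}

int main(void)
{
	printf("%s\n", li_type_name(0x123d));	/* -> XFS_LI_DQUOT */
	return 0;
}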
@@ -97,7 +106,8 @@ typedef struct xfs_trans_header {
 #define XFS_TRANS_GROWFSRT_FREE		39
 #define XFS_TRANS_SWAPEXT		40
 #define XFS_TRANS_SB_COUNT		41
-#define XFS_TRANS_TYPE_MAX		41
+#define XFS_TRANS_CHECKPOINT		42
+#define XFS_TRANS_TYPE_MAX		42
 /* new transaction types need to be reflected in xfs_logprint(8) */
 
 #define XFS_TRANS_TYPES \
@@ -139,6 +149,7 @@ typedef struct xfs_trans_header {
 	{ XFS_TRANS_GROWFSRT_FREE,	"GROWFSRT_FREE" }, \
 	{ XFS_TRANS_SWAPEXT,		"SWAPEXT" }, \
 	{ XFS_TRANS_SB_COUNT,		"SB_COUNT" }, \
+	{ XFS_TRANS_CHECKPOINT,		"CHECKPOINT" }, \
 	{ XFS_TRANS_DUMMY1,		"DUMMY1" }, \
 	{ XFS_TRANS_DUMMY2,		"DUMMY2" }, \
 	{ XLOG_UNMOUNT_REC_TYPE,	"UNMOUNT" }
@@ -159,7 +170,6 @@ typedef struct xfs_log_item_desc {
 
 #define XFS_LID_DIRTY		0x1
 #define XFS_LID_PINNED		0x2
-#define XFS_LID_BUF_STALE	0x8
 
 /*
  * This structure is used to maintain a chunk list of log_item_desc
@@ -805,6 +815,7 @@ struct xfs_log_item_desc;
 struct xfs_mount;
 struct xfs_trans;
 struct xfs_dquot_acct;
+struct xfs_busy_extent;
 
 typedef struct xfs_log_item {
 	struct list_head		li_ail;		/* AIL pointers */
@@ -820,6 +831,11 @@ typedef struct xfs_log_item {
 						/* buffer item iodone */
 						/* callback func */
 	struct xfs_item_ops		*li_ops;	/* function list */
+
+	/* delayed logging */
+	struct list_head		li_cil;		/* CIL pointers */
+	struct xfs_log_vec		*li_lv;		/* active log vector */
+	xfs_lsn_t			li_seq;		/* CIL commit seq */
 } xfs_log_item_t;
 
 #define	XFS_LI_IN_AIL	0x1
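The three delayed-logging fields added to xfs_log_item above let an item sit on the CIL in commit order, tagged with the checkpoint sequence it belongs to. A user-space sketch of that intrusive-list shape (a toy list_head and stub item, assuming nothing beyond what the structure change shows; this is not the kernel implementation):

#include <assert.h>

/* Minimal intrusive list in the style of the kernel's list_head. */
struct list_head { struct list_head *next, *prev; };

static void list_init(struct list_head *h) { h->next = h->prev = h; }
static void list_add_tail(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev; n->next = h;
	h->prev->next = n; h->prev = n;
}

/* A log item stub carrying the delayed-logging fields added above. */
struct log_item_stub {
	struct list_head	li_cil;		/* CIL membership */
	long long		li_seq;		/* checkpoint sequence */
};

int main(void)
{
	struct list_head cil;
	struct log_item_stub a = { .li_seq = 1 }, b = { .li_seq = 1 };

	list_init(&cil);
	list_add_tail(&a.li_cil, &cil);		/* items accumulate in commit order */
	list_add_tail(&b.li_cil, &cil);
	assert(cil.next == &a.li_cil && cil.prev == &b.li_cil);
	return 0;
}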
@@ -833,7 +849,7 @@ typedef struct xfs_item_ops {
 	uint (*iop_size)(xfs_log_item_t *);
 	void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
 	void (*iop_pin)(xfs_log_item_t *);
-	void (*iop_unpin)(xfs_log_item_t *, int);
+	void (*iop_unpin)(xfs_log_item_t *);
 	void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
 	uint (*iop_trylock)(xfs_log_item_t *);
 	void (*iop_unlock)(xfs_log_item_t *);
@@ -846,7 +862,7 @@ typedef struct xfs_item_ops {
 #define IOP_SIZE(ip)		(*(ip)->li_ops->iop_size)(ip)
 #define IOP_FORMAT(ip,vp)	(*(ip)->li_ops->iop_format)(ip, vp)
 #define IOP_PIN(ip)		(*(ip)->li_ops->iop_pin)(ip)
-#define IOP_UNPIN(ip, flags)	(*(ip)->li_ops->iop_unpin)(ip, flags)
+#define IOP_UNPIN(ip)		(*(ip)->li_ops->iop_unpin)(ip)
 #define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp)
 #define IOP_TRYLOCK(ip)		(*(ip)->li_ops->iop_trylock)(ip)
 #define IOP_UNLOCK(ip)		(*(ip)->li_ops->iop_unlock)(ip)
@@ -864,34 +880,6 @@ typedef struct xfs_item_ops {
 #define XFS_ITEM_PUSHBUF	3
 
 /*
- * This structure is used to maintain a list of block ranges that have been
- * freed in the transaction.  The ranges are listed in the perag[] busy list
- * between when they're freed and the transaction is committed to disk.
- */
-
-typedef struct xfs_log_busy_slot {
-	xfs_agnumber_t		lbc_ag;
-	ushort			lbc_idx;	/* index in perag.busy[] */
-} xfs_log_busy_slot_t;
-
-#define XFS_LBC_NUM_SLOTS	31
-typedef struct xfs_log_busy_chunk {
-	struct xfs_log_busy_chunk	*lbc_next;
-	uint				lbc_free;	/* free slots bitmask */
-	ushort				lbc_unused;	/* first unused */
-	xfs_log_busy_slot_t		lbc_busy[XFS_LBC_NUM_SLOTS];
-} xfs_log_busy_chunk_t;
-
-#define	XFS_LBC_MAX_SLOT	(XFS_LBC_NUM_SLOTS - 1)
-#define	XFS_LBC_FREEMASK	((1U << XFS_LBC_NUM_SLOTS) - 1)
-
-#define	XFS_LBC_INIT(cp)	((cp)->lbc_free = XFS_LBC_FREEMASK)
-#define	XFS_LBC_CLAIM(cp, slot)	((cp)->lbc_free &= ~(1 << (slot)))
-#define	XFS_LBC_SLOT(cp, slot)	(&((cp)->lbc_busy[(slot)]))
-#define	XFS_LBC_VACANCY(cp)	(((cp)->lbc_free) & XFS_LBC_FREEMASK)
-#define	XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
-
-/*
  * This is the type of function which can be given to xfs_trans_callback()
  * to be called upon the transaction's commit to disk.
  */
@@ -942,8 +930,7 @@ typedef struct xfs_trans {
 	unsigned int		t_items_free;	/* log item descs free */
 	xfs_log_item_chunk_t	t_items;	/* first log item desc chunk */
 	xfs_trans_header_t	t_header;	/* header for in-log trans */
-	unsigned int		t_busy_free;	/* busy descs free */
-	xfs_log_busy_chunk_t	t_busy;		/* busy/async free blocks */
+	struct list_head	t_busy;		/* list of busy extents */
 	unsigned long		t_pflags;	/* saved process flags state */
 } xfs_trans_t;
 
@@ -1017,9 +1004,6 @@ int _xfs_trans_commit(xfs_trans_t *,
 void		xfs_trans_cancel(xfs_trans_t *, int);
 int		xfs_trans_ail_init(struct xfs_mount *);
 void		xfs_trans_ail_destroy(struct xfs_mount *);
-xfs_log_busy_slot_t		*xfs_trans_add_busy(xfs_trans_t *tp,
-					xfs_agnumber_t ag,
-					xfs_extlen_t idx);
 
 extern kmem_zone_t	*xfs_trans_zone;
 
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index fb586360d1c9..63d81a22f4fd 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -40,11 +40,51 @@
40#include "xfs_rw.h" 40#include "xfs_rw.h"
41#include "xfs_trace.h" 41#include "xfs_trace.h"
42 42
43/*
44 * Check to see if a buffer matching the given parameters is already
45 * a part of the given transaction.
46 */
47STATIC struct xfs_buf *
48xfs_trans_buf_item_match(
49 struct xfs_trans *tp,
50 struct xfs_buftarg *target,
51 xfs_daddr_t blkno,
52 int len)
53{
54 xfs_log_item_chunk_t *licp;
55 xfs_log_item_desc_t *lidp;
56 xfs_buf_log_item_t *blip;
57 int i;
58
59 len = BBTOB(len);
60 for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) {
61 if (xfs_lic_are_all_free(licp)) {
62 ASSERT(licp == &tp->t_items);
63 ASSERT(licp->lic_next == NULL);
64 return NULL;
65 }
66
67 for (i = 0; i < licp->lic_unused; i++) {
68 /*
69 * Skip unoccupied slots.
70 */
71 if (xfs_lic_isfree(licp, i))
72 continue;
73
74 lidp = xfs_lic_slot(licp, i);
75 blip = (xfs_buf_log_item_t *)lidp->lid_item;
76 if (blip->bli_item.li_type != XFS_LI_BUF)
77 continue;
78
79 if (XFS_BUF_TARGET(blip->bli_buf) == target &&
80 XFS_BUF_ADDR(blip->bli_buf) == blkno &&
81 XFS_BUF_COUNT(blip->bli_buf) == len)
82 return blip->bli_buf;
83 }
84 }
43 85
44STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *, 86 return NULL;
45 xfs_daddr_t, int); 87}
46STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *,
47 xfs_daddr_t, int);
48 88
49/* 89/*
50 * Add the locked buffer to the transaction. 90 * Add the locked buffer to the transaction.
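The new xfs_trans_buf_item_match() above is a straight linear scan of the transaction's buf items for a (target, blkno, length) triple. The same matching logic, reduced to a stand-alone sketch with hypothetical stub types (no chunk list, just an array):

#include <stdio.h>
#include <stddef.h>

struct buf_stub {			/* stand-in for the fields the match checks */
	int		target;		/* which device */
	long long	blkno;		/* starting disk address */
	int		len;		/* length in bytes */
};

/* Linear scan over a transaction's buffers, as in xfs_trans_buf_item_match(). */
static struct buf_stub *buf_match(struct buf_stub *bufs, size_t n,
				  int target, long long blkno, int len)
{
	for (size_t i = 0; i < n; i++)
		if (bufs[i].target == target &&
		    bufs[i].blkno == blkno &&
		    bufs[i].len == len)
			return &bufs[i];
	return NULL;			/* caller falls back to get_buf()/read_buf() */
}

int main(void)
{
	struct buf_stub bufs[] = { { 1, 64, 4096 }, { 1, 72, 4096 } };

	printf("hit:  %p\n", (void *)buf_match(bufs, 2, 1, 72, 4096));
	printf("miss: %p\n", (void *)buf_match(bufs, 2, 1, 80, 4096));
	return 0;
}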
@@ -74,7 +114,7 @@ _xfs_trans_bjoin(
 	xfs_buf_item_init(bp, tp->t_mountp);
 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
 	ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
 	if (reset_recur)
 		bip->bli_recur = 0;
@@ -112,14 +152,6 @@ xfs_trans_bjoin(
  * within the transaction, just increment its lock recursion count
  * and return a pointer to it.
  *
- * Use the fast path function xfs_trans_buf_item_match() or the buffer
- * cache routine incore_match() to find the buffer
- * if it is already owned by this transaction.
- *
- * If we don't already own the buffer, use get_buf() to get it.
- * If it doesn't yet have an associated xfs_buf_log_item structure,
- * then allocate one and add the item to this transaction.
- *
  * If the transaction pointer is NULL, make this just a normal
  * get_buf() call.
  */
@@ -149,11 +181,7 @@ xfs_trans_get_buf(xfs_trans_t *tp,
 	 * have it locked.  In this case we just increment the lock
 	 * recursion count and return the buffer to the caller.
 	 */
-	if (tp->t_items.lic_next == NULL) {
-		bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len);
-	} else {
-		bp = xfs_trans_buf_item_match_all(tp, target_dev, blkno, len);
-	}
+	bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len);
 	if (bp != NULL) {
 		ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
 		if (XFS_FORCED_SHUTDOWN(tp->t_mountp))
@@ -259,14 +287,6 @@ int xfs_error_mod = 33;
  * within the transaction and already read in, just increment its
  * lock recursion count and return a pointer to it.
  *
- * Use the fast path function xfs_trans_buf_item_match() or the buffer
- * cache routine incore_match() to find the buffer
- * if it is already owned by this transaction.
- *
- * If we don't already own the buffer, use read_buf() to get it.
- * If it doesn't yet have an associated xfs_buf_log_item structure,
- * then allocate one and add the item to this transaction.
- *
  * If the transaction pointer is NULL, make this just a normal
  * read_buf() call.
  */
@@ -328,11 +348,7 @@ xfs_trans_read_buf(
 	 * If the buffer is not yet read in, then we read it in, increment
 	 * the lock recursion count, and return it to the caller.
 	 */
-	if (tp->t_items.lic_next == NULL) {
-		bp = xfs_trans_buf_item_match(tp, target, blkno, len);
-	} else {
-		bp = xfs_trans_buf_item_match_all(tp, target, blkno, len);
-	}
+	bp = xfs_trans_buf_item_match(tp, target, blkno, len);
 	if (bp != NULL) {
 		ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
 		ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
@@ -495,7 +511,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
 	ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 
 	/*
@@ -603,7 +619,7 @@ xfs_trans_bhold(xfs_trans_t *tp,
 
 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 	bip->bli_flags |= XFS_BLI_HOLD;
 	trace_xfs_trans_bhold(bip);
@@ -625,7 +641,7 @@ xfs_trans_bhold_release(xfs_trans_t *tp,
 
 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 	ASSERT(bip->bli_flags & XFS_BLI_HOLD);
 	bip->bli_flags &= ~XFS_BLI_HOLD;
@@ -688,7 +704,7 @@ xfs_trans_log_buf(xfs_trans_t *tp,
 		bip->bli_flags &= ~XFS_BLI_STALE;
 		ASSERT(XFS_BUF_ISSTALE(bp));
 		XFS_BUF_UNSTALE(bp);
-		bip->bli_format.blf_flags &= ~XFS_BLI_CANCEL;
+		bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL;
 	}
 
 	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
@@ -696,7 +712,6 @@ xfs_trans_log_buf(xfs_trans_t *tp,
 
 	tp->t_flags |= XFS_TRANS_DIRTY;
 	lidp->lid_flags |= XFS_LID_DIRTY;
-	lidp->lid_flags &= ~XFS_LID_BUF_STALE;
 	bip->bli_flags |= XFS_BLI_LOGGED;
 	xfs_buf_item_log(bip, first, last);
 }
@@ -747,8 +762,8 @@ xfs_trans_binval(
 		ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
 		ASSERT(XFS_BUF_ISSTALE(bp));
 		ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
-		ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_INODE_BUF));
-		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
+		ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF));
+		ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
 		ASSERT(lidp->lid_flags & XFS_LID_DIRTY);
 		ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
 		return;
@@ -759,7 +774,7 @@ xfs_trans_binval(
 	 * in the buf log item.  The STALE flag will be used in
 	 * xfs_buf_item_unpin() to determine if it should clean up
 	 * when the last reference to the buf item is given up.
-	 * We set the XFS_BLI_CANCEL flag in the buf log format structure
+	 * We set the XFS_BLF_CANCEL flag in the buf log format structure
 	 * and log the buf item.  This will be used at recovery time
 	 * to determine that copies of the buffer in the log before
 	 * this should not be replayed.
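The comment above states the recovery contract behind XFS_BLF_CANCEL: once a buffer's latest log copy is marked cancelled, earlier log copies of it must not be replayed. A toy model of that rule (the flag value and all names here are illustrative only; this is not the kernel's recovery code):

#include <stdio.h>

#define BLF_CANCEL	(1 << 2)	/* illustrative bit, not the kernel value */

struct logged_buf {
	long long	blkno;
	unsigned	flags;
};

/*
 * Recovery-time model: a buffer whose most recent log copy is marked
 * cancelled must not have earlier log copies replayed over it.
 */
static int should_replay(const struct logged_buf *lb, int cancel_table_hit)
{
	if (lb->flags & BLF_CANCEL)
		return 0;		/* records the cancellation instead */
	return !cancel_table_hit;	/* suppressed if a later copy cancelled it */
}

int main(void)
{
	struct logged_buf early = { 128, 0 };
	struct logged_buf late  = { 128, BLF_CANCEL };

	printf("late copy replayed?  %d\n", should_replay(&late, 0));	/* 0 */
	printf("early copy replayed? %d\n", should_replay(&early, 1));	/* 0: cancelled later */
	return 0;
}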
@@ -777,26 +792,26 @@ xfs_trans_binval(
 	XFS_BUF_UNDELAYWRITE(bp);
 	XFS_BUF_STALE(bp);
 	bip->bli_flags |= XFS_BLI_STALE;
-	bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_DIRTY);
-	bip->bli_format.blf_flags &= ~XFS_BLI_INODE_BUF;
-	bip->bli_format.blf_flags |= XFS_BLI_CANCEL;
+	bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
+	bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
+	bip->bli_format.blf_flags |= XFS_BLF_CANCEL;
 	memset((char *)(bip->bli_format.blf_data_map), 0,
 	       (bip->bli_format.blf_map_size * sizeof(uint)));
-	lidp->lid_flags |= XFS_LID_DIRTY|XFS_LID_BUF_STALE;
+	lidp->lid_flags |= XFS_LID_DIRTY;
 	tp->t_flags |= XFS_TRANS_DIRTY;
 }
 
 /*
- * This call is used to indicate that the buffer contains on-disk
- * inodes which must be handled specially during recovery.  They
- * require special handling because only the di_next_unlinked from
- * the inodes in the buffer should be recovered.  The rest of the
- * data in the buffer is logged via the inodes themselves.
+ * This call is used to indicate that the buffer contains on-disk inodes which
+ * must be handled specially during recovery.  They require special handling
+ * because only the di_next_unlinked from the inodes in the buffer should be
+ * recovered.  The rest of the data in the buffer is logged via the inodes
+ * themselves.
  *
- * All we do is set the XFS_BLI_INODE_BUF flag in the buffer's log
- * format structure so that we'll know what to do at recovery time.
+ * All we do is set the XFS_BLI_INODE_BUF flag in the items flags so it can be
+ * transferred to the buffer's log format structure so that we'll know what to
+ * do at recovery time.
  */
-/* ARGSUSED */
 void
 xfs_trans_inode_buf(
 	xfs_trans_t	*tp,
@@ -811,7 +826,7 @@ xfs_trans_inode_buf(
 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 
-	bip->bli_format.blf_flags |= XFS_BLI_INODE_BUF;
+	bip->bli_flags |= XFS_BLI_INODE_BUF;
 }
 
 /*
@@ -893,120 +908,12 @@ xfs_trans_dquot_buf(
 	ASSERT(XFS_BUF_ISBUSY(bp));
 	ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
 	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
-	ASSERT(type == XFS_BLI_UDQUOT_BUF ||
-	       type == XFS_BLI_PDQUOT_BUF ||
-	       type == XFS_BLI_GDQUOT_BUF);
+	ASSERT(type == XFS_BLF_UDQUOT_BUF ||
+	       type == XFS_BLF_PDQUOT_BUF ||
+	       type == XFS_BLF_GDQUOT_BUF);
 
 	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 
 	bip->bli_format.blf_flags |= type;
 }
905
906/*
907 * Check to see if a buffer matching the given parameters is already
908 * a part of the given transaction. Only check the first, embedded
909 * chunk, since we don't want to spend all day scanning large transactions.
910 */
911STATIC xfs_buf_t *
912xfs_trans_buf_item_match(
913 xfs_trans_t *tp,
914 xfs_buftarg_t *target,
915 xfs_daddr_t blkno,
916 int len)
917{
918 xfs_log_item_chunk_t *licp;
919 xfs_log_item_desc_t *lidp;
920 xfs_buf_log_item_t *blip;
921 xfs_buf_t *bp;
922 int i;
923
924 bp = NULL;
925 len = BBTOB(len);
926 licp = &tp->t_items;
927 if (!xfs_lic_are_all_free(licp)) {
928 for (i = 0; i < licp->lic_unused; i++) {
929 /*
930 * Skip unoccupied slots.
931 */
932 if (xfs_lic_isfree(licp, i)) {
933 continue;
934 }
935
936 lidp = xfs_lic_slot(licp, i);
937 blip = (xfs_buf_log_item_t *)lidp->lid_item;
938 if (blip->bli_item.li_type != XFS_LI_BUF) {
939 continue;
940 }
941
942 bp = blip->bli_buf;
943 if ((XFS_BUF_TARGET(bp) == target) &&
944 (XFS_BUF_ADDR(bp) == blkno) &&
945 (XFS_BUF_COUNT(bp) == len)) {
946 /*
947 * We found it. Break out and
948 * return the pointer to the buffer.
949 */
950 break;
951 } else {
952 bp = NULL;
953 }
954 }
955 }
956 return bp;
957}
958
959/*
960 * Check to see if a buffer matching the given parameters is already
961 * a part of the given transaction. Check all the chunks, we
962 * want to be thorough.
963 */
964STATIC xfs_buf_t *
965xfs_trans_buf_item_match_all(
966 xfs_trans_t *tp,
967 xfs_buftarg_t *target,
968 xfs_daddr_t blkno,
969 int len)
970{
971 xfs_log_item_chunk_t *licp;
972 xfs_log_item_desc_t *lidp;
973 xfs_buf_log_item_t *blip;
974 xfs_buf_t *bp;
975 int i;
976
977 bp = NULL;
978 len = BBTOB(len);
979 for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) {
980 if (xfs_lic_are_all_free(licp)) {
981 ASSERT(licp == &tp->t_items);
982 ASSERT(licp->lic_next == NULL);
983 return NULL;
984 }
985 for (i = 0; i < licp->lic_unused; i++) {
986 /*
987 * Skip unoccupied slots.
988 */
989 if (xfs_lic_isfree(licp, i)) {
990 continue;
991 }
992
993 lidp = xfs_lic_slot(licp, i);
994 blip = (xfs_buf_log_item_t *)lidp->lid_item;
995 if (blip->bli_item.li_type != XFS_LI_BUF) {
996 continue;
997 }
998
999 bp = blip->bli_buf;
1000 if ((XFS_BUF_TARGET(bp) == target) &&
1001 (XFS_BUF_ADDR(bp) == blkno) &&
1002 (XFS_BUF_COUNT(bp) == len)) {
1003 /*
1004 * We found it. Break out and
1005 * return the pointer to the buffer.
1006 */
1007 return bp;
1008 }
1009 }
1010 }
1011 return NULL;
1012}
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index eb3fc57f9eef..f11d37d06dcc 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -299,6 +299,7 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
 void
 xfs_trans_free_items(
 	xfs_trans_t	*tp,
+	xfs_lsn_t	commit_lsn,
 	int		flags)
 {
 	xfs_log_item_chunk_t	*licp;
@@ -311,7 +312,7 @@ xfs_trans_free_items(
 	 * Special case the embedded chunk so we don't free it below.
 	 */
 	if (!xfs_lic_are_all_free(licp)) {
-		(void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
+		(void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
 		xfs_lic_all_free(licp);
 		licp->lic_unused = 0;
 	}
@@ -322,7 +323,7 @@ xfs_trans_free_items(
 	 */
 	while (licp != NULL) {
 		ASSERT(!xfs_lic_are_all_free(licp));
-		(void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
+		(void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
 		next_licp = licp->lic_next;
 		kmem_free(licp);
 		licp = next_licp;
@@ -438,112 +439,3 @@ xfs_trans_unlock_chunk(
 
 	return freed;
 }
441
442
443/*
444 * This is called to add the given busy item to the transaction's
445 * list of busy items. It must find a free busy item descriptor
446 * or allocate a new one and add the item to that descriptor.
447 * The function returns a pointer to busy descriptor used to point
448 * to the new busy entry. The log busy entry will now point to its new
449 * descriptor with its ???? field.
450 */
451xfs_log_busy_slot_t *
452xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx)
453{
454 xfs_log_busy_chunk_t *lbcp;
455 xfs_log_busy_slot_t *lbsp;
456 int i=0;
457
458 /*
459 * If there are no free descriptors, allocate a new chunk
460 * of them and put it at the front of the chunk list.
461 */
462 if (tp->t_busy_free == 0) {
463 lbcp = (xfs_log_busy_chunk_t*)
464 kmem_alloc(sizeof(xfs_log_busy_chunk_t), KM_SLEEP);
465 ASSERT(lbcp != NULL);
466 /*
467 * Initialize the chunk, and then
468 * claim the first slot in the newly allocated chunk.
469 */
470 XFS_LBC_INIT(lbcp);
471 XFS_LBC_CLAIM(lbcp, 0);
472 lbcp->lbc_unused = 1;
473 lbsp = XFS_LBC_SLOT(lbcp, 0);
474
475 /*
476 * Link in the new chunk and update the free count.
477 */
478 lbcp->lbc_next = tp->t_busy.lbc_next;
479 tp->t_busy.lbc_next = lbcp;
480 tp->t_busy_free = XFS_LIC_NUM_SLOTS - 1;
481
482 /*
483 * Initialize the descriptor and the generic portion
484 * of the log item.
485 *
486 * Point the new slot at this item and return it.
487 * Also point the log item at its currently active
488 * descriptor and set the item's mount pointer.
489 */
490 lbsp->lbc_ag = ag;
491 lbsp->lbc_idx = idx;
492 return lbsp;
493 }
494
495 /*
496 * Find the free descriptor. It is somewhere in the chunklist
497 * of descriptors.
498 */
499 lbcp = &tp->t_busy;
500 while (lbcp != NULL) {
501 if (XFS_LBC_VACANCY(lbcp)) {
502 if (lbcp->lbc_unused <= XFS_LBC_MAX_SLOT) {
503 i = lbcp->lbc_unused;
504 break;
505 } else {
506 /* out-of-order vacancy */
507 cmn_err(CE_DEBUG, "OOO vacancy lbcp 0x%p\n", lbcp);
508 ASSERT(0);
509 }
510 }
511 lbcp = lbcp->lbc_next;
512 }
513 ASSERT(lbcp != NULL);
514 /*
515 * If we find a free descriptor, claim it,
516 * initialize it, and return it.
517 */
518 XFS_LBC_CLAIM(lbcp, i);
519 if (lbcp->lbc_unused <= i) {
520 lbcp->lbc_unused = i + 1;
521 }
522 lbsp = XFS_LBC_SLOT(lbcp, i);
523 tp->t_busy_free--;
524 lbsp->lbc_ag = ag;
525 lbsp->lbc_idx = idx;
526 return lbsp;
527}
528
529
530/*
531 * xfs_trans_free_busy
532 * Free all of the busy lists from a transaction
533 */
534void
535xfs_trans_free_busy(xfs_trans_t *tp)
536{
537 xfs_log_busy_chunk_t *lbcp;
538 xfs_log_busy_chunk_t *lbcq;
539
540 lbcp = tp->t_busy.lbc_next;
541 while (lbcp != NULL) {
542 lbcq = lbcp->lbc_next;
543 kmem_free(lbcp);
544 lbcp = lbcq;
545 }
546
547 XFS_LBC_INIT(&tp->t_busy);
548 tp->t_busy.lbc_unused = 0;
549}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 73e2ad397432..c6e4f2c8de6e 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -35,13 +35,14 @@ struct xfs_log_item_desc *xfs_trans_find_item(struct xfs_trans *,
 struct xfs_log_item_desc	*xfs_trans_first_item(struct xfs_trans *);
 struct xfs_log_item_desc	*xfs_trans_next_item(struct xfs_trans *,
 					struct xfs_log_item_desc *);
-void				xfs_trans_free_items(struct xfs_trans *, int);
-void				xfs_trans_unlock_items(struct xfs_trans *,
-						xfs_lsn_t);
-void				xfs_trans_free_busy(xfs_trans_t *tp);
-xfs_log_busy_slot_t		*xfs_trans_add_busy(xfs_trans_t *tp,
-					xfs_agnumber_t ag,
-					xfs_extlen_t idx);
+
+void	xfs_trans_unlock_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn);
+void	xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
+				int flags);
+
+void	xfs_trans_item_committed(struct xfs_log_item *lip,
+					xfs_lsn_t commit_lsn, int aborted);
+void	xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
 
 /*
  * AIL traversal cursor.
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index b09904555d07..320775295e32 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -75,6 +75,8 @@ typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */
 
 typedef __uint16_t	xfs_prid_t;	/* prid_t truncated to 16bits in XFS */
 
+typedef __uint32_t	xlog_tid_t;	/* transaction ID type */
+
 /*
  * These types are 64 bits on disk but are either 32 or 64 bits in memory.
  * Disk based types: