Diffstat (limited to 'fs')
237 files changed, 14750 insertions, 5711 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index ff0e81980207..32883589ee54 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -189,6 +189,8 @@ config OCFS2_FS
189 | select CONFIGFS_FS | 189 | select CONFIGFS_FS |
190 | select JBD2 | 190 | select JBD2 |
191 | select CRC32 | 191 | select CRC32 |
192 | select QUOTA | ||
193 | select QUOTA_TREE | ||
192 | help | 194 | help |
193 | OCFS2 is a general purpose extent based shared disk cluster file | 195 | OCFS2 is a general purpose extent based shared disk cluster file |
194 | system with many similarities to ext3. It supports 64 bit inode | 196 | system with many similarities to ext3. It supports 64 bit inode |
@@ -258,15 +260,14 @@ config OCFS2_DEBUG_FS
258 | this option for debugging only as it is likely to decrease | 260 | this option for debugging only as it is likely to decrease |
259 | performance of the filesystem. | 261 | performance of the filesystem. |
260 | 262 | ||
261 | config OCFS2_COMPAT_JBD | 263 | config OCFS2_FS_POSIX_ACL |
262 | bool "Use JBD for compatibility" | 264 | bool "OCFS2 POSIX Access Control Lists" |
263 | depends on OCFS2_FS | 265 | depends on OCFS2_FS |
266 | select FS_POSIX_ACL | ||
264 | default n | 267 | default n |
265 | select JBD | ||
266 | help | 268 | help |
267 | The ocfs2 filesystem now uses JBD2 for its journalling. JBD2 | 269 | Posix Access Control Lists (ACLs) support permissions for users and |
268 | is backwards compatible with JBD. It is safe to say N here. | 270 | groups beyond the owner/group/world scheme. |
269 | However, if you really want to use the original JBD, say Y here. | ||
270 | 271 | ||
271 | endif # BLOCK | 272 | endif # BLOCK |
272 | 273 | ||
@@ -303,6 +304,10 @@ config PRINT_QUOTA_WARNING
303 | Note that this behavior is currently deprecated and may go away in | 304 | Note that this behavior is currently deprecated and may go away in |
304 | future. Please use notification via netlink socket instead. | 305 | future. Please use notification via netlink socket instead. |
305 | 306 | ||
307 | # Generic support for tree structured quota files. Selected when needed. | ||
308 | config QUOTA_TREE | ||
309 | tristate | ||
310 | |||
306 | config QFMT_V1 | 311 | config QFMT_V1 |
307 | tristate "Old quota format support" | 312 | tristate "Old quota format support" |
308 | depends on QUOTA | 313 | depends on QUOTA |
@@ -314,6 +319,7 @@ config QFMT_V1
314 | config QFMT_V2 | 319 | config QFMT_V2 |
315 | tristate "Quota format v2 support" | 320 | tristate "Quota format v2 support" |
316 | depends on QUOTA | 321 | depends on QUOTA |
322 | select QUOTA_TREE | ||
317 | help | 323 | help |
318 | This quota format allows using quotas with 32-bit UIDs/GIDs. If you | 324 | This quota format allows using quotas with 32-bit UIDs/GIDs. If you |
319 | need this functionality say Y here. | 325 | need this functionality say Y here. |
@@ -715,7 +721,20 @@ config CONFIGFS_FS
715 | 721 | ||
716 | endmenu | 722 | endmenu |
717 | 723 | ||
718 | menu "Miscellaneous filesystems" | 724 | menuconfig MISC_FILESYSTEMS |
725 | bool "Miscellaneous filesystems" | ||
726 | default y | ||
727 | ---help--- | ||
728 | Say Y here to get to see options for various miscellaneous | ||
729 | filesystems, such as filesystems that came from other | ||
730 | operating systems. | ||
731 | |||
732 | This option alone does not add any kernel code. | ||
733 | |||
734 | If you say N, all options in this submenu will be skipped and | ||
735 | disabled; if unsure, say Y here. | ||
736 | |||
737 | if MISC_FILESYSTEMS | ||
719 | 738 | ||
720 | config ADFS_FS | 739 | config ADFS_FS |
721 | tristate "ADFS file system support (EXPERIMENTAL)" | 740 | tristate "ADFS file system support (EXPERIMENTAL)" |
@@ -1085,7 +1104,7 @@ config UFS_DEBUG
1085 | Y here. This will result in _many_ additional debugging messages to be | 1104 | Y here. This will result in _many_ additional debugging messages to be |
1086 | written to the system log. | 1105 | written to the system log. |
1087 | 1106 | ||
1088 | endmenu | 1107 | endif # MISC_FILESYSTEMS |
1089 | 1108 | ||
1090 | menuconfig NETWORK_FILESYSTEMS | 1109 | menuconfig NETWORK_FILESYSTEMS |
1091 | bool "Network File Systems" | 1110 | bool "Network File Systems" |
diff --git a/fs/Makefile b/fs/Makefile
index e6f423d1d228..c830611550d3 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -54,6 +54,7 @@ obj-$(CONFIG_GENERIC_ACL) += generic_acl.o
54 | obj-$(CONFIG_QUOTA) += dquot.o | 54 | obj-$(CONFIG_QUOTA) += dquot.o |
55 | obj-$(CONFIG_QFMT_V1) += quota_v1.o | 55 | obj-$(CONFIG_QFMT_V1) += quota_v1.o |
56 | obj-$(CONFIG_QFMT_V2) += quota_v2.o | 56 | obj-$(CONFIG_QFMT_V2) += quota_v2.o |
57 | obj-$(CONFIG_QUOTA_TREE) += quota_tree.o | ||
57 | obj-$(CONFIG_QUOTACTL) += quota.o | 58 | obj-$(CONFIG_QUOTACTL) += quota.o |
58 | 59 | ||
59 | obj-$(CONFIG_PROC_FS) += proc/ | 60 | obj-$(CONFIG_PROC_FS) += proc/ |
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 415d9c67ac16..3c4ec7d864c4 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -119,8 +119,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
119 | goto bad_inode; | 119 | goto bad_inode; |
120 | #else | 120 | #else |
121 | inode->i_mode |= S_IFDIR; | 121 | inode->i_mode |= S_IFDIR; |
122 | inode->i_op = NULL; | 122 | /* ... and leave ->i_op and ->i_fop pointing to empty */ |
123 | inode->i_fop = NULL; | ||
124 | break; | 123 | break; |
125 | #endif | 124 | #endif |
126 | case ST_LINKFILE: | 125 | case ST_LINKFILE: |
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index c773680d5c60..e1734f2d6e26 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -251,13 +251,11 @@ struct inode *autofs_iget(struct super_block *sb, unsigned long ino)
251 | inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; | 251 | inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; |
252 | inode->i_nlink = 2; | 252 | inode->i_nlink = 2; |
253 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; | 253 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; |
254 | inode->i_blocks = 0; | ||
255 | 254 | ||
256 | if (ino == AUTOFS_ROOT_INO) { | 255 | if (ino == AUTOFS_ROOT_INO) { |
257 | inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; | 256 | inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; |
258 | inode->i_op = &autofs_root_inode_operations; | 257 | inode->i_op = &autofs_root_inode_operations; |
259 | inode->i_fop = &autofs_root_operations; | 258 | inode->i_fop = &autofs_root_operations; |
260 | inode->i_uid = inode->i_gid = 0; /* Changed in read_super */ | ||
261 | goto done; | 259 | goto done; |
262 | } | 260 | } |
263 | 261 | ||
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index e0f16da00e54..a76803108d06 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -25,8 +25,6 @@
25 | #define AUTOFS_DEV_IOCTL_IOC_FIRST (AUTOFS_DEV_IOCTL_VERSION) | 25 | #define AUTOFS_DEV_IOCTL_IOC_FIRST (AUTOFS_DEV_IOCTL_VERSION) |
26 | #define AUTOFS_DEV_IOCTL_IOC_COUNT (AUTOFS_IOC_COUNT - 11) | 26 | #define AUTOFS_DEV_IOCTL_IOC_COUNT (AUTOFS_IOC_COUNT - 11) |
27 | 27 | ||
28 | #define AUTOFS_TYPE_TRIGGER (AUTOFS_TYPE_DIRECT|AUTOFS_TYPE_OFFSET) | ||
29 | |||
30 | #include <linux/kernel.h> | 28 | #include <linux/kernel.h> |
31 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
32 | #include <linux/time.h> | 30 | #include <linux/time.h> |
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 63b7c7afe8df..025e105bffea 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -124,7 +124,7 @@ static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
124 | 124 | ||
125 | /* | 125 | /* |
126 | * Check sanity of parameter control fields and if a path is present | 126 | * Check sanity of parameter control fields and if a path is present |
127 | * check that it has a "/" and is terminated. | 127 | * check that it is terminated and contains at least one "/". |
128 | */ | 128 | */ |
129 | static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) | 129 | static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) |
130 | { | 130 | { |
@@ -138,15 +138,16 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
138 | } | 138 | } |
139 | 139 | ||
140 | if (param->size > sizeof(*param)) { | 140 | if (param->size > sizeof(*param)) { |
141 | err = check_name(param->path); | 141 | err = invalid_str(param->path, |
142 | (void *) ((size_t) param + param->size)); | ||
142 | if (err) { | 143 | if (err) { |
143 | AUTOFS_WARN("invalid path supplied for cmd(0x%08x)", | 144 | AUTOFS_WARN( |
144 | cmd); | 145 | "path string terminator missing for cmd(0x%08x)", |
146 | cmd); | ||
145 | goto out; | 147 | goto out; |
146 | } | 148 | } |
147 | 149 | ||
148 | err = invalid_str(param->path, | 150 | err = check_name(param->path); |
149 | (void *) ((size_t) param + param->size)); | ||
150 | if (err) { | 151 | if (err) { |
151 | AUTOFS_WARN("invalid path supplied for cmd(0x%08x)", | 152 | AUTOFS_WARN("invalid path supplied for cmd(0x%08x)", |
152 | cmd); | 153 | cmd); |
@@ -180,7 +181,7 @@ static int autofs_dev_ioctl_protover(struct file *fp,
180 | struct autofs_sb_info *sbi, | 181 | struct autofs_sb_info *sbi, |
181 | struct autofs_dev_ioctl *param) | 182 | struct autofs_dev_ioctl *param) |
182 | { | 183 | { |
183 | param->arg1 = sbi->version; | 184 | param->protover.version = sbi->version; |
184 | return 0; | 185 | return 0; |
185 | } | 186 | } |
186 | 187 | ||
@@ -189,7 +190,7 @@ static int autofs_dev_ioctl_protosubver(struct file *fp,
189 | struct autofs_sb_info *sbi, | 190 | struct autofs_sb_info *sbi, |
190 | struct autofs_dev_ioctl *param) | 191 | struct autofs_dev_ioctl *param) |
191 | { | 192 | { |
192 | param->arg1 = sbi->sub_version; | 193 | param->protosubver.sub_version = sbi->sub_version; |
193 | return 0; | 194 | return 0; |
194 | } | 195 | } |
195 | 196 | ||
@@ -335,13 +336,13 @@ static int autofs_dev_ioctl_openmount(struct file *fp,
335 | int err, fd; | 336 | int err, fd; |
336 | 337 | ||
337 | /* param->path has already been checked */ | 338 | /* param->path has already been checked */ |
338 | if (!param->arg1) | 339 | if (!param->openmount.devid) |
339 | return -EINVAL; | 340 | return -EINVAL; |
340 | 341 | ||
341 | param->ioctlfd = -1; | 342 | param->ioctlfd = -1; |
342 | 343 | ||
343 | path = param->path; | 344 | path = param->path; |
344 | devid = param->arg1; | 345 | devid = param->openmount.devid; |
345 | 346 | ||
346 | err = 0; | 347 | err = 0; |
347 | fd = autofs_dev_ioctl_open_mountpoint(path, devid); | 348 | fd = autofs_dev_ioctl_open_mountpoint(path, devid); |
@@ -373,7 +374,7 @@ static int autofs_dev_ioctl_ready(struct file *fp,
373 | { | 374 | { |
374 | autofs_wqt_t token; | 375 | autofs_wqt_t token; |
375 | 376 | ||
376 | token = (autofs_wqt_t) param->arg1; | 377 | token = (autofs_wqt_t) param->ready.token; |
377 | return autofs4_wait_release(sbi, token, 0); | 378 | return autofs4_wait_release(sbi, token, 0); |
378 | } | 379 | } |
379 | 380 | ||
@@ -388,8 +389,8 @@ static int autofs_dev_ioctl_fail(struct file *fp,
388 | autofs_wqt_t token; | 389 | autofs_wqt_t token; |
389 | int status; | 390 | int status; |
390 | 391 | ||
391 | token = (autofs_wqt_t) param->arg1; | 392 | token = (autofs_wqt_t) param->fail.token; |
392 | status = param->arg2 ? param->arg2 : -ENOENT; | 393 | status = param->fail.status ? param->fail.status : -ENOENT; |
393 | return autofs4_wait_release(sbi, token, status); | 394 | return autofs4_wait_release(sbi, token, status); |
394 | } | 395 | } |
395 | 396 | ||
@@ -412,10 +413,10 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
412 | int pipefd; | 413 | int pipefd; |
413 | int err = 0; | 414 | int err = 0; |
414 | 415 | ||
415 | if (param->arg1 == -1) | 416 | if (param->setpipefd.pipefd == -1) |
416 | return -EINVAL; | 417 | return -EINVAL; |
417 | 418 | ||
418 | pipefd = param->arg1; | 419 | pipefd = param->setpipefd.pipefd; |
419 | 420 | ||
420 | mutex_lock(&sbi->wq_mutex); | 421 | mutex_lock(&sbi->wq_mutex); |
421 | if (!sbi->catatonic) { | 422 | if (!sbi->catatonic) { |
@@ -457,8 +458,8 @@ static int autofs_dev_ioctl_timeout(struct file *fp,
457 | { | 458 | { |
458 | unsigned long timeout; | 459 | unsigned long timeout; |
459 | 460 | ||
460 | timeout = param->arg1; | 461 | timeout = param->timeout.timeout; |
461 | param->arg1 = sbi->exp_timeout / HZ; | 462 | param->timeout.timeout = sbi->exp_timeout / HZ; |
462 | sbi->exp_timeout = timeout * HZ; | 463 | sbi->exp_timeout = timeout * HZ; |
463 | return 0; | 464 | return 0; |
464 | } | 465 | } |
@@ -489,7 +490,7 @@ static int autofs_dev_ioctl_requester(struct file *fp,
489 | path = param->path; | 490 | path = param->path; |
490 | devid = sbi->sb->s_dev; | 491 | devid = sbi->sb->s_dev; |
491 | 492 | ||
492 | param->arg1 = param->arg2 = -1; | 493 | param->requester.uid = param->requester.gid = -1; |
493 | 494 | ||
494 | /* Get nameidata of the parent directory */ | 495 | /* Get nameidata of the parent directory */ |
495 | err = path_lookup(path, LOOKUP_PARENT, &nd); | 496 | err = path_lookup(path, LOOKUP_PARENT, &nd); |
@@ -505,8 +506,8 @@ static int autofs_dev_ioctl_requester(struct file *fp,
505 | err = 0; | 506 | err = 0; |
506 | autofs4_expire_wait(nd.path.dentry); | 507 | autofs4_expire_wait(nd.path.dentry); |
507 | spin_lock(&sbi->fs_lock); | 508 | spin_lock(&sbi->fs_lock); |
508 | param->arg1 = ino->uid; | 509 | param->requester.uid = ino->uid; |
509 | param->arg2 = ino->gid; | 510 | param->requester.gid = ino->gid; |
510 | spin_unlock(&sbi->fs_lock); | 511 | spin_unlock(&sbi->fs_lock); |
511 | } | 512 | } |
512 | 513 | ||
@@ -529,10 +530,10 @@ static int autofs_dev_ioctl_expire(struct file *fp,
529 | int err = -EAGAIN; | 530 | int err = -EAGAIN; |
530 | int how; | 531 | int how; |
531 | 532 | ||
532 | how = param->arg1; | 533 | how = param->expire.how; |
533 | mnt = fp->f_path.mnt; | 534 | mnt = fp->f_path.mnt; |
534 | 535 | ||
535 | if (sbi->type & AUTOFS_TYPE_TRIGGER) | 536 | if (autofs_type_trigger(sbi->type)) |
536 | dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how); | 537 | dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how); |
537 | else | 538 | else |
538 | dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how); | 539 | dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how); |
@@ -565,9 +566,9 @@ static int autofs_dev_ioctl_askumount(struct file *fp,
565 | struct autofs_sb_info *sbi, | 566 | struct autofs_sb_info *sbi, |
566 | struct autofs_dev_ioctl *param) | 567 | struct autofs_dev_ioctl *param) |
567 | { | 568 | { |
568 | param->arg1 = 0; | 569 | param->askumount.may_umount = 0; |
569 | if (may_umount(fp->f_path.mnt)) | 570 | if (may_umount(fp->f_path.mnt)) |
570 | param->arg1 = 1; | 571 | param->askumount.may_umount = 1; |
571 | return 0; | 572 | return 0; |
572 | } | 573 | } |
573 | 574 | ||
@@ -600,6 +601,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
600 | struct nameidata nd; | 601 | struct nameidata nd; |
601 | const char *path; | 602 | const char *path; |
602 | unsigned int type; | 603 | unsigned int type; |
604 | unsigned int devid, magic; | ||
603 | int err = -ENOENT; | 605 | int err = -ENOENT; |
604 | 606 | ||
605 | if (param->size <= sizeof(*param)) { | 607 | if (param->size <= sizeof(*param)) { |
@@ -608,13 +610,13 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
608 | } | 610 | } |
609 | 611 | ||
610 | path = param->path; | 612 | path = param->path; |
611 | type = param->arg1; | 613 | type = param->ismountpoint.in.type; |
612 | 614 | ||
613 | param->arg1 = 0; | 615 | param->ismountpoint.out.devid = devid = 0; |
614 | param->arg2 = 0; | 616 | param->ismountpoint.out.magic = magic = 0; |
615 | 617 | ||
616 | if (!fp || param->ioctlfd == -1) { | 618 | if (!fp || param->ioctlfd == -1) { |
617 | if (type == AUTOFS_TYPE_ANY) { | 619 | if (autofs_type_any(type)) { |
618 | struct super_block *sb; | 620 | struct super_block *sb; |
619 | 621 | ||
620 | err = path_lookup(path, LOOKUP_FOLLOW, &nd); | 622 | err = path_lookup(path, LOOKUP_FOLLOW, &nd); |
@@ -622,7 +624,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
622 | goto out; | 624 | goto out; |
623 | 625 | ||
624 | sb = nd.path.dentry->d_sb; | 626 | sb = nd.path.dentry->d_sb; |
625 | param->arg1 = new_encode_dev(sb->s_dev); | 627 | devid = new_encode_dev(sb->s_dev); |
626 | } else { | 628 | } else { |
627 | struct autofs_info *ino; | 629 | struct autofs_info *ino; |
628 | 630 | ||
@@ -635,38 +637,41 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
635 | goto out_release; | 637 | goto out_release; |
636 | 638 | ||
637 | ino = autofs4_dentry_ino(nd.path.dentry); | 639 | ino = autofs4_dentry_ino(nd.path.dentry); |
638 | param->arg1 = autofs4_get_dev(ino->sbi); | 640 | devid = autofs4_get_dev(ino->sbi); |
639 | } | 641 | } |
640 | 642 | ||
641 | err = 0; | 643 | err = 0; |
642 | if (nd.path.dentry->d_inode && | 644 | if (nd.path.dentry->d_inode && |
643 | nd.path.mnt->mnt_root == nd.path.dentry) { | 645 | nd.path.mnt->mnt_root == nd.path.dentry) { |
644 | err = 1; | 646 | err = 1; |
645 | param->arg2 = nd.path.dentry->d_inode->i_sb->s_magic; | 647 | magic = nd.path.dentry->d_inode->i_sb->s_magic; |
646 | } | 648 | } |
647 | } else { | 649 | } else { |
648 | dev_t devid = new_encode_dev(sbi->sb->s_dev); | 650 | dev_t dev = autofs4_get_dev(sbi); |
649 | 651 | ||
650 | err = path_lookup(path, LOOKUP_PARENT, &nd); | 652 | err = path_lookup(path, LOOKUP_PARENT, &nd); |
651 | if (err) | 653 | if (err) |
652 | goto out; | 654 | goto out; |
653 | 655 | ||
654 | err = autofs_dev_ioctl_find_super(&nd, devid); | 656 | err = autofs_dev_ioctl_find_super(&nd, dev); |
655 | if (err) | 657 | if (err) |
656 | goto out_release; | 658 | goto out_release; |
657 | 659 | ||
658 | param->arg1 = autofs4_get_dev(sbi); | 660 | devid = dev; |
659 | 661 | ||
660 | err = have_submounts(nd.path.dentry); | 662 | err = have_submounts(nd.path.dentry); |
661 | 663 | ||
662 | if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) { | 664 | if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) { |
663 | if (follow_down(&nd.path.mnt, &nd.path.dentry)) { | 665 | if (follow_down(&nd.path.mnt, &nd.path.dentry)) { |
664 | struct inode *inode = nd.path.dentry->d_inode; | 666 | struct inode *inode = nd.path.dentry->d_inode; |
665 | param->arg2 = inode->i_sb->s_magic; | 667 | magic = inode->i_sb->s_magic; |
666 | } | 668 | } |
667 | } | 669 | } |
668 | } | 670 | } |
669 | 671 | ||
672 | param->ismountpoint.out.devid = devid; | ||
673 | param->ismountpoint.out.magic = magic; | ||
674 | |||
670 | out_release: | 675 | out_release: |
671 | path_put(&nd.path); | 676 | path_put(&nd.path); |
672 | out: | 677 | out: |
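The dev-ioctl.c hunks above stop using the old catch-all arg1/arg2 fields and instead address named members of a per-command union. The union itself is defined in include/linux/auto_dev-ioctl.h, which is outside this fs/ diff; the following is only a sketch whose field names are inferred from the accesses above, not a verified copy of that header:

#include <linux/types.h>

/* Sketch only: per-command argument blocks, named after the accesses above. */
struct args_protover     { __u32 version; };
struct args_protosubver  { __u32 sub_version; };
struct args_openmount    { __u32 devid; };
struct args_ready        { __u32 token; };
struct args_fail         { __u32 token; __s32 status; };
struct args_setpipefd    { __s32 pipefd; };
struct args_timeout      { __u64 timeout; };
struct args_requester    { __u32 uid; __u32 gid; };
struct args_expire       { __u32 how; };
struct args_askumount    { __u32 may_umount; };
struct args_ismountpoint {
	union {
		struct { __u32 type; } in;
		struct { __u32 devid; __u32 magic; } out;
	};
};

struct autofs_dev_ioctl {
	__u32 ver_major;
	__u32 ver_minor;
	__u32 size;		/* total size, including any trailing path */
	__s32 ioctlfd;
	union {			/* one member per ioctl command */
		struct args_protover		protover;
		struct args_protosubver		protosubver;
		struct args_openmount		openmount;
		struct args_ready		ready;
		struct args_fail		fail;
		struct args_setpipefd		setpipefd;
		struct args_timeout		timeout;
		struct args_requester		requester;
		struct args_expire		expire;
		struct args_askumount		askumount;
		struct args_ismountpoint	ismountpoint;
	};
	char path[0];		/* present when size > sizeof(struct autofs_dev_ioctl) */
};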
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 4b6fb3f628c0..e3bd50776f9e 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -63,7 +63,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
63 | struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); | 63 | struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); |
64 | 64 | ||
65 | /* This is an autofs submount, we can't expire it */ | 65 | /* This is an autofs submount, we can't expire it */ |
66 | if (sbi->type == AUTOFS_TYPE_INDIRECT) | 66 | if (autofs_type_indirect(sbi->type)) |
67 | goto done; | 67 | goto done; |
68 | 68 | ||
69 | /* | 69 | /* |
@@ -490,7 +490,7 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
490 | if (arg && get_user(do_now, arg)) | 490 | if (arg && get_user(do_now, arg)) |
491 | return -EFAULT; | 491 | return -EFAULT; |
492 | 492 | ||
493 | if (sbi->type & AUTOFS_TYPE_TRIGGER) | 493 | if (autofs_type_trigger(sbi->type)) |
494 | dentry = autofs4_expire_direct(sb, mnt, sbi, do_now); | 494 | dentry = autofs4_expire_direct(sb, mnt, sbi, do_now); |
495 | else | 495 | else |
496 | dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now); | 496 | dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now); |
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 7b19802cfef4..716e12b627b2 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -197,9 +197,9 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
197 | seq_printf(m, ",minproto=%d", sbi->min_proto); | 197 | seq_printf(m, ",minproto=%d", sbi->min_proto); |
198 | seq_printf(m, ",maxproto=%d", sbi->max_proto); | 198 | seq_printf(m, ",maxproto=%d", sbi->max_proto); |
199 | 199 | ||
200 | if (sbi->type & AUTOFS_TYPE_OFFSET) | 200 | if (autofs_type_offset(sbi->type)) |
201 | seq_printf(m, ",offset"); | 201 | seq_printf(m, ",offset"); |
202 | else if (sbi->type & AUTOFS_TYPE_DIRECT) | 202 | else if (autofs_type_direct(sbi->type)) |
203 | seq_printf(m, ",direct"); | 203 | seq_printf(m, ",direct"); |
204 | else | 204 | else |
205 | seq_printf(m, ",indirect"); | 205 | seq_printf(m, ",indirect"); |
@@ -284,13 +284,13 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
284 | *maxproto = option; | 284 | *maxproto = option; |
285 | break; | 285 | break; |
286 | case Opt_indirect: | 286 | case Opt_indirect: |
287 | *type = AUTOFS_TYPE_INDIRECT; | 287 | set_autofs_type_indirect(type); |
288 | break; | 288 | break; |
289 | case Opt_direct: | 289 | case Opt_direct: |
290 | *type = AUTOFS_TYPE_DIRECT; | 290 | set_autofs_type_direct(type); |
291 | break; | 291 | break; |
292 | case Opt_offset: | 292 | case Opt_offset: |
293 | *type = AUTOFS_TYPE_OFFSET; | 293 | set_autofs_type_offset(type); |
294 | break; | 294 | break; |
295 | default: | 295 | default: |
296 | return 1; | 296 | return 1; |
@@ -338,7 +338,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
338 | sbi->sb = s; | 338 | sbi->sb = s; |
339 | sbi->version = 0; | 339 | sbi->version = 0; |
340 | sbi->sub_version = 0; | 340 | sbi->sub_version = 0; |
341 | sbi->type = AUTOFS_TYPE_INDIRECT; | 341 | set_autofs_type_indirect(&sbi->type); |
342 | sbi->min_proto = 0; | 342 | sbi->min_proto = 0; |
343 | sbi->max_proto = 0; | 343 | sbi->max_proto = 0; |
344 | mutex_init(&sbi->wq_mutex); | 344 | mutex_init(&sbi->wq_mutex); |
@@ -380,7 +380,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
380 | } | 380 | } |
381 | 381 | ||
382 | root_inode->i_fop = &autofs4_root_operations; | 382 | root_inode->i_fop = &autofs4_root_operations; |
383 | root_inode->i_op = sbi->type & AUTOFS_TYPE_TRIGGER ? | 383 | root_inode->i_op = autofs_type_trigger(sbi->type) ? |
384 | &autofs4_direct_root_inode_operations : | 384 | &autofs4_direct_root_inode_operations : |
385 | &autofs4_indirect_root_inode_operations; | 385 | &autofs4_indirect_root_inode_operations; |
386 | 386 | ||
@@ -455,11 +455,7 @@ struct inode *autofs4_get_inode(struct super_block *sb,
455 | if (sb->s_root) { | 455 | if (sb->s_root) { |
456 | inode->i_uid = sb->s_root->d_inode->i_uid; | 456 | inode->i_uid = sb->s_root->d_inode->i_uid; |
457 | inode->i_gid = sb->s_root->d_inode->i_gid; | 457 | inode->i_gid = sb->s_root->d_inode->i_gid; |
458 | } else { | ||
459 | inode->i_uid = 0; | ||
460 | inode->i_gid = 0; | ||
461 | } | 458 | } |
462 | inode->i_blocks = 0; | ||
463 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 459 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
464 | 460 | ||
465 | if (S_ISDIR(inf->mode)) { | 461 | if (S_ISDIR(inf->mode)) { |
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index e02cc8ae5eb3..eeb246845909 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -337,7 +337,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
337 | * is very similar for indirect mounts except only dentrys | 337 | * is very similar for indirect mounts except only dentrys |
338 | * in the root of the autofs file system may be negative. | 338 | * in the root of the autofs file system may be negative. |
339 | */ | 339 | */ |
340 | if (sbi->type & AUTOFS_TYPE_TRIGGER) | 340 | if (autofs_type_trigger(sbi->type)) |
341 | return -ENOENT; | 341 | return -ENOENT; |
342 | else if (!IS_ROOT(dentry->d_parent)) | 342 | else if (!IS_ROOT(dentry->d_parent)) |
343 | return -ENOENT; | 343 | return -ENOENT; |
@@ -348,7 +348,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
348 | return -ENOMEM; | 348 | return -ENOMEM; |
349 | 349 | ||
350 | /* If this is a direct mount request create a dummy name */ | 350 | /* If this is a direct mount request create a dummy name */ |
351 | if (IS_ROOT(dentry) && sbi->type & AUTOFS_TYPE_TRIGGER) | 351 | if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type)) |
352 | qstr.len = sprintf(name, "%p", dentry); | 352 | qstr.len = sprintf(name, "%p", dentry); |
353 | else { | 353 | else { |
354 | qstr.len = autofs4_getpath(sbi, dentry, &name); | 354 | qstr.len = autofs4_getpath(sbi, dentry, &name); |
@@ -406,11 +406,11 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
406 | type = autofs_ptype_expire_multi; | 406 | type = autofs_ptype_expire_multi; |
407 | } else { | 407 | } else { |
408 | if (notify == NFY_MOUNT) | 408 | if (notify == NFY_MOUNT) |
409 | type = (sbi->type & AUTOFS_TYPE_TRIGGER) ? | 409 | type = autofs_type_trigger(sbi->type) ? |
410 | autofs_ptype_missing_direct : | 410 | autofs_ptype_missing_direct : |
411 | autofs_ptype_missing_indirect; | 411 | autofs_ptype_missing_indirect; |
412 | else | 412 | else |
413 | type = (sbi->type & AUTOFS_TYPE_TRIGGER) ? | 413 | type = autofs_type_trigger(sbi->type) ? |
414 | autofs_ptype_expire_direct : | 414 | autofs_ptype_expire_direct : |
415 | autofs_ptype_expire_indirect; | 415 | autofs_ptype_expire_indirect; |
416 | } | 416 | } |
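Several hunks above (dev-ioctl.c, expire.c, inode.c and waitq.c) replace direct AUTOFS_TYPE_* bit tests with autofs_type_*() helper predicates. Those helpers are introduced in the shared autofs header, outside this fs/ diff; a plausible sketch consistent with the call sites, assuming the mount type becomes a plain enumerated value (the AUTOFS_TYPE_* constants from linux/auto_fs4.h are presumed, not defined here):

/* Sketch only: definitions inferred from usage, not copied from the header. */
static inline unsigned int autofs_type_indirect(unsigned int type)
{
	return type == AUTOFS_TYPE_INDIRECT;
}

static inline unsigned int autofs_type_direct(unsigned int type)
{
	return type == AUTOFS_TYPE_DIRECT;
}

static inline unsigned int autofs_type_offset(unsigned int type)
{
	return type == AUTOFS_TYPE_OFFSET;
}

/* Direct and offset mounts both behave as mount triggers. */
static inline unsigned int autofs_type_trigger(unsigned int type)
{
	return autofs_type_direct(type) || autofs_type_offset(type);
}

static inline unsigned int autofs_type_any(unsigned int type)
{
	return type == AUTOFS_TYPE_ANY;
}

static inline void set_autofs_type_indirect(unsigned int *type)
{
	*type = AUTOFS_TYPE_INDIRECT;
}

static inline void set_autofs_type_direct(unsigned int *type)
{
	*type = AUTOFS_TYPE_DIRECT;
}

static inline void set_autofs_type_offset(unsigned int *type)
{
	*type = AUTOFS_TYPE_OFFSET;
}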
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 0ed57b5ee012..cc4062d12ca2 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -213,6 +213,9 @@ static void bfs_put_super(struct super_block *s)
213 | { | 213 | { |
214 | struct bfs_sb_info *info = BFS_SB(s); | 214 | struct bfs_sb_info *info = BFS_SB(s); |
215 | 215 | ||
216 | if (!info) | ||
217 | return; | ||
218 | |||
216 | brelse(info->si_sbh); | 219 | brelse(info->si_sbh); |
217 | mutex_destroy(&info->bfs_lock); | 220 | mutex_destroy(&info->bfs_lock); |
218 | kfree(info->si_imap); | 221 | kfree(info->si_imap); |
@@ -327,6 +330,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
327 | unsigned i, imap_len; | 330 | unsigned i, imap_len; |
328 | struct bfs_sb_info *info; | 331 | struct bfs_sb_info *info; |
329 | long ret = -EINVAL; | 332 | long ret = -EINVAL; |
333 | unsigned long i_sblock, i_eblock, i_eoff, s_size; | ||
330 | 334 | ||
331 | info = kzalloc(sizeof(*info), GFP_KERNEL); | 335 | info = kzalloc(sizeof(*info), GFP_KERNEL); |
332 | if (!info) | 336 | if (!info) |
@@ -350,6 +354,12 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
350 | 354 | ||
351 | s->s_magic = BFS_MAGIC; | 355 | s->s_magic = BFS_MAGIC; |
352 | info->si_sbh = bh; | 356 | info->si_sbh = bh; |
357 | |||
358 | if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) { | ||
359 | printf("Superblock is corrupted\n"); | ||
360 | goto out; | ||
361 | } | ||
362 | |||
353 | info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / | 363 | info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / |
354 | sizeof(struct bfs_inode) | 364 | sizeof(struct bfs_inode) |
355 | + BFS_ROOT_INO - 1; | 365 | + BFS_ROOT_INO - 1; |
@@ -380,6 +390,18 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
380 | - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS; | 390 | - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS; |
381 | info->si_freei = 0; | 391 | info->si_freei = 0; |
382 | info->si_lf_eblk = 0; | 392 | info->si_lf_eblk = 0; |
393 | |||
394 | /* can we read the last block? */ | ||
395 | bh = sb_bread(s, info->si_blocks - 1); | ||
396 | if (!bh) { | ||
397 | printf("Last block not available: %lu\n", info->si_blocks - 1); | ||
398 | iput(inode); | ||
399 | ret = -EIO; | ||
400 | kfree(info->si_imap); | ||
401 | goto out; | ||
402 | } | ||
403 | brelse(bh); | ||
404 | |||
383 | bh = NULL; | 405 | bh = NULL; |
384 | for (i = BFS_ROOT_INO; i <= info->si_lasti; i++) { | 406 | for (i = BFS_ROOT_INO; i <= info->si_lasti; i++) { |
385 | struct bfs_inode *di; | 407 | struct bfs_inode *di; |
@@ -397,6 +419,29 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
397 | 419 | ||
398 | di = (struct bfs_inode *)bh->b_data + off; | 420 | di = (struct bfs_inode *)bh->b_data + off; |
399 | 421 | ||
422 | /* test if filesystem is not corrupted */ | ||
423 | |||
424 | i_eoff = le32_to_cpu(di->i_eoffset); | ||
425 | i_sblock = le32_to_cpu(di->i_sblock); | ||
426 | i_eblock = le32_to_cpu(di->i_eblock); | ||
427 | s_size = le32_to_cpu(bfs_sb->s_end); | ||
428 | |||
429 | if (i_sblock > info->si_blocks || | ||
430 | i_eblock > info->si_blocks || | ||
431 | i_sblock > i_eblock || | ||
432 | i_eoff > s_size || | ||
433 | i_sblock * BFS_BSIZE > i_eoff) { | ||
434 | |||
435 | printf("Inode 0x%08x corrupted\n", i); | ||
436 | |||
437 | brelse(bh); | ||
438 | s->s_root = NULL; | ||
439 | kfree(info->si_imap); | ||
440 | kfree(info); | ||
441 | s->s_fs_info = NULL; | ||
442 | return -EIO; | ||
443 | } | ||
444 | |||
400 | if (!di->i_ino) { | 445 | if (!di->i_ino) { |
401 | info->si_freei++; | 446 | info->si_freei++; |
402 | continue; | 447 | continue; |
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index c41fa2af7677..e3ff2b9e602f 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -152,8 +152,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
152 | elf_addr_t __user *sp; | 152 | elf_addr_t __user *sp; |
153 | elf_addr_t __user *u_platform; | 153 | elf_addr_t __user *u_platform; |
154 | elf_addr_t __user *u_base_platform; | 154 | elf_addr_t __user *u_base_platform; |
155 | elf_addr_t __user *u_rand_bytes; | ||
155 | const char *k_platform = ELF_PLATFORM; | 156 | const char *k_platform = ELF_PLATFORM; |
156 | const char *k_base_platform = ELF_BASE_PLATFORM; | 157 | const char *k_base_platform = ELF_BASE_PLATFORM; |
158 | unsigned char k_rand_bytes[16]; | ||
157 | int items; | 159 | int items; |
158 | elf_addr_t *elf_info; | 160 | elf_addr_t *elf_info; |
159 | int ei_index = 0; | 161 | int ei_index = 0; |
@@ -196,6 +198,15 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
196 | return -EFAULT; | 198 | return -EFAULT; |
197 | } | 199 | } |
198 | 200 | ||
201 | /* | ||
202 | * Generate 16 random bytes for userspace PRNG seeding. | ||
203 | */ | ||
204 | get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes)); | ||
205 | u_rand_bytes = (elf_addr_t __user *) | ||
206 | STACK_ALLOC(p, sizeof(k_rand_bytes)); | ||
207 | if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes))) | ||
208 | return -EFAULT; | ||
209 | |||
199 | /* Create the ELF interpreter info */ | 210 | /* Create the ELF interpreter info */ |
200 | elf_info = (elf_addr_t *)current->mm->saved_auxv; | 211 | elf_info = (elf_addr_t *)current->mm->saved_auxv; |
201 | /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */ | 212 | /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */ |
@@ -228,6 +239,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
228 | NEW_AUX_ENT(AT_GID, cred->gid); | 239 | NEW_AUX_ENT(AT_GID, cred->gid); |
229 | NEW_AUX_ENT(AT_EGID, cred->egid); | 240 | NEW_AUX_ENT(AT_EGID, cred->egid); |
230 | NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); | 241 | NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); |
242 | NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes); | ||
231 | NEW_AUX_ENT(AT_EXECFN, bprm->exec); | 243 | NEW_AUX_ENT(AT_EXECFN, bprm->exec); |
232 | if (k_platform) { | 244 | if (k_platform) { |
233 | NEW_AUX_ENT(AT_PLATFORM, | 245 | NEW_AUX_ENT(AT_PLATFORM, |
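The binfmt_elf.c change places 16 kernel-generated random bytes on the initial process stack and publishes their address through a new AT_RANDOM auxiliary-vector entry (glibc later came to use them for its stack-protector and pointer-mangling guards). A minimal user-space sketch of how a program could locate those bytes by walking the auxiliary vector past its environment; this is illustration only, assumes a 64-bit ELF target, and is not part of the commit:

#include <elf.h>
#include <stdint.h>
#include <stdio.h>

#ifndef AT_RANDOM
#define AT_RANDOM 25	/* auxv tag added by this change */
#endif

int main(int argc, char **argv, char **envp)
{
	Elf64_auxv_t *aux;

	(void)argc;
	(void)argv;

	/* The auxiliary vector begins just after the NULL that ends envp. */
	while (*envp)
		envp++;
	for (aux = (Elf64_auxv_t *)(envp + 1); aux->a_type != AT_NULL; aux++) {
		if (aux->a_type == AT_RANDOM) {
			unsigned char *r = (unsigned char *)(uintptr_t)aux->a_un.a_val;
			int i;

			for (i = 0; i < 16; i++)
				printf("%02x", r[i]);
			printf("\n");
		}
	}
	return 0;
}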
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index f2744ab4e5b3..c4e83537ead7 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -496,9 +496,6 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
496 | 496 | ||
497 | if (inode) { | 497 | if (inode) { |
498 | inode->i_mode = mode; | 498 | inode->i_mode = mode; |
499 | inode->i_uid = 0; | ||
500 | inode->i_gid = 0; | ||
501 | inode->i_blocks = 0; | ||
502 | inode->i_atime = inode->i_mtime = inode->i_ctime = | 499 | inode->i_atime = inode->i_mtime = inode->i_ctime = |
503 | current_fs_time(inode->i_sb); | 500 | current_fs_time(inode->i_sb); |
504 | } | 501 | } |
@@ -652,7 +649,7 @@ static const struct file_operations bm_register_operations = {
652 | static ssize_t | 649 | static ssize_t |
653 | bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) | 650 | bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) |
654 | { | 651 | { |
655 | char *s = enabled ? "enabled" : "disabled"; | 652 | char *s = enabled ? "enabled\n" : "disabled\n"; |
656 | 653 | ||
657 | return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s)); | 654 | return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s)); |
658 | } | 655 | } |
diff --git a/fs/bio.c b/fs/bio.c
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -788,6 +788,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
788 | int i, ret; | 788 | int i, ret; |
789 | int nr_pages = 0; | 789 | int nr_pages = 0; |
790 | unsigned int len = 0; | 790 | unsigned int len = 0; |
791 | unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0; | ||
791 | 792 | ||
792 | for (i = 0; i < iov_count; i++) { | 793 | for (i = 0; i < iov_count; i++) { |
793 | unsigned long uaddr; | 794 | unsigned long uaddr; |
@@ -814,35 +815,42 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
814 | bio->bi_rw |= (!write_to_vm << BIO_RW); | 815 | bio->bi_rw |= (!write_to_vm << BIO_RW); |
815 | 816 | ||
816 | ret = 0; | 817 | ret = 0; |
817 | i = 0; | 818 | |
819 | if (map_data) { | ||
820 | nr_pages = 1 << map_data->page_order; | ||
821 | i = map_data->offset / PAGE_SIZE; | ||
822 | } | ||
818 | while (len) { | 823 | while (len) { |
819 | unsigned int bytes; | 824 | unsigned int bytes = PAGE_SIZE; |
820 | 825 | ||
821 | if (map_data) | 826 | bytes -= offset; |
822 | bytes = 1U << (PAGE_SHIFT + map_data->page_order); | ||
823 | else | ||
824 | bytes = PAGE_SIZE; | ||
825 | 827 | ||
826 | if (bytes > len) | 828 | if (bytes > len) |
827 | bytes = len; | 829 | bytes = len; |
828 | 830 | ||
829 | if (map_data) { | 831 | if (map_data) { |
830 | if (i == map_data->nr_entries) { | 832 | if (i == map_data->nr_entries * nr_pages) { |
831 | ret = -ENOMEM; | 833 | ret = -ENOMEM; |
832 | break; | 834 | break; |
833 | } | 835 | } |
834 | page = map_data->pages[i++]; | 836 | |
835 | } else | 837 | page = map_data->pages[i / nr_pages]; |
838 | page += (i % nr_pages); | ||
839 | |||
840 | i++; | ||
841 | } else { | ||
836 | page = alloc_page(q->bounce_gfp | gfp_mask); | 842 | page = alloc_page(q->bounce_gfp | gfp_mask); |
837 | if (!page) { | 843 | if (!page) { |
838 | ret = -ENOMEM; | 844 | ret = -ENOMEM; |
839 | break; | 845 | break; |
846 | } | ||
840 | } | 847 | } |
841 | 848 | ||
842 | if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) | 849 | if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) |
843 | break; | 850 | break; |
844 | 851 | ||
845 | len -= bytes; | 852 | len -= bytes; |
853 | offset = 0; | ||
846 | } | 854 | } |
847 | 855 | ||
848 | if (ret) | 856 | if (ret) |
@@ -851,7 +859,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
851 | /* | 859 | /* |
852 | * success | 860 | * success |
853 | */ | 861 | */ |
854 | if (!write_to_vm) { | 862 | if (!write_to_vm && (!map_data || !map_data->null_mapped)) { |
855 | ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0); | 863 | ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0); |
856 | if (ret) | 864 | if (ret) |
857 | goto cleanup; | 865 | goto cleanup; |
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1dd07e66e98a..ac7031f12ea5 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1005,6 +1005,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1005 | } | 1005 | } |
1006 | 1006 | ||
1007 | lock_kernel(); | 1007 | lock_kernel(); |
1008 | restart: | ||
1008 | 1009 | ||
1009 | ret = -ENXIO; | 1010 | ret = -ENXIO; |
1010 | disk = get_gendisk(bdev->bd_dev, &partno); | 1011 | disk = get_gendisk(bdev->bd_dev, &partno); |
@@ -1025,6 +1026,19 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1025 | 1026 | ||
1026 | if (disk->fops->open) { | 1027 | if (disk->fops->open) { |
1027 | ret = disk->fops->open(bdev, mode); | 1028 | ret = disk->fops->open(bdev, mode); |
1029 | if (ret == -ERESTARTSYS) { | ||
1030 | /* Lost a race with 'disk' being | ||
1031 | * deleted, try again. | ||
1032 | * See md.c | ||
1033 | */ | ||
1034 | disk_put_part(bdev->bd_part); | ||
1035 | bdev->bd_part = NULL; | ||
1036 | module_put(disk->fops->owner); | ||
1037 | put_disk(disk); | ||
1038 | bdev->bd_disk = NULL; | ||
1039 | mutex_unlock(&bdev->bd_mutex); | ||
1040 | goto restart; | ||
1041 | } | ||
1028 | if (ret) | 1042 | if (ret) |
1029 | goto out_clear; | 1043 | goto out_clear; |
1030 | } | 1044 | } |
@@ -1277,7 +1291,7 @@ EXPORT_SYMBOL(ioctl_by_bdev);
1277 | 1291 | ||
1278 | /** | 1292 | /** |
1279 | * lookup_bdev - lookup a struct block_device by name | 1293 | * lookup_bdev - lookup a struct block_device by name |
1280 | * @path: special file representing the block device | 1294 | * @pathname: special file representing the block device |
1281 | * | 1295 | * |
1282 | * Get a reference to the blockdevice at @pathname in the current | 1296 | * Get a reference to the blockdevice at @pathname in the current |
1283 | * namespace if possible and return it. Return ERR_PTR(error) | 1297 | * namespace if possible and return it. Return ERR_PTR(error) |
diff --git a/fs/buffer.c b/fs/buffer.c
index a13f09b696f7..c26da785938a 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2022,7 +2022,6 @@ int block_write_begin(struct file *file, struct address_space *mapping,
2022 | if (pos + len > inode->i_size) | 2022 | if (pos + len > inode->i_size) |
2023 | vmtruncate(inode, inode->i_size); | 2023 | vmtruncate(inode, inode->i_size); |
2024 | } | 2024 | } |
2025 | goto out; | ||
2026 | } | 2025 | } |
2027 | 2026 | ||
2028 | out: | 2027 | out: |
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 700697a72618..38f71222a552 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -120,7 +120,7 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor,
120 | cd->major = major; | 120 | cd->major = major; |
121 | cd->baseminor = baseminor; | 121 | cd->baseminor = baseminor; |
122 | cd->minorct = minorct; | 122 | cd->minorct = minorct; |
123 | strncpy(cd->name,name, 64); | 123 | strlcpy(cd->name, name, sizeof(cd->name)); |
124 | 124 | ||
125 | i = major_to_index(major); | 125 | i = major_to_index(major); |
126 | 126 | ||
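The char_dev.c hunk replaces strncpy() with strlcpy() when filling the fixed 64-byte cd->name buffer. The point of the change, sketched here for reference (strlcpy is provided by the kernel's lib/string.c rather than standard C, and the helper below is purely illustrative):

#include <linux/string.h>
#include <linux/types.h>

/* Sketch: contrast of the two copy semantics the hunk is switching between. */
static void copy_name_example(char *dst, size_t dstlen, const char *src)
{
	/* strncpy() zero-pads short sources but does NOT NUL-terminate when
	 * src is dstlen bytes or longer, leaving an unterminated buffer. */
	strncpy(dst, src, dstlen);

	/* strlcpy() copies at most dstlen - 1 bytes and always appends the
	 * terminating NUL, truncating src if it does not fit. */
	strlcpy(dst, src, dstlen);
}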
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index f247da9f4edc..5ab9896fdcb2 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1641,7 +1641,7 @@ do_expand:
1641 | i_size_write(inode, offset); | 1641 | i_size_write(inode, offset); |
1642 | spin_unlock(&inode->i_lock); | 1642 | spin_unlock(&inode->i_lock); |
1643 | out_truncate: | 1643 | out_truncate: |
1644 | if (inode->i_op && inode->i_op->truncate) | 1644 | if (inode->i_op->truncate) |
1645 | inode->i_op->truncate(inode); | 1645 | inode->i_op->truncate(inode); |
1646 | return 0; | 1646 | return 0; |
1647 | out_sig: | 1647 | out_sig: |
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 466303db2df6..6a347fbc998a 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -201,8 +201,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
201 | int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync) | 201 | int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync) |
202 | { | 202 | { |
203 | struct file *host_file; | 203 | struct file *host_file; |
204 | struct dentry *host_dentry; | 204 | struct inode *coda_inode = coda_dentry->d_inode; |
205 | struct inode *host_inode, *coda_inode = coda_dentry->d_inode; | ||
206 | struct coda_file_info *cfi; | 205 | struct coda_file_info *cfi; |
207 | int err = 0; | 206 | int err = 0; |
208 | 207 | ||
@@ -214,14 +213,7 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
214 | BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); | 213 | BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); |
215 | host_file = cfi->cfi_container; | 214 | host_file = cfi->cfi_container; |
216 | 215 | ||
217 | if (host_file->f_op && host_file->f_op->fsync) { | 216 | err = vfs_fsync(host_file, host_file->f_path.dentry, datasync); |
218 | host_dentry = host_file->f_path.dentry; | ||
219 | host_inode = host_dentry->d_inode; | ||
220 | mutex_lock(&host_inode->i_mutex); | ||
221 | err = host_file->f_op->fsync(host_file, host_dentry, datasync); | ||
222 | mutex_unlock(&host_inode->i_mutex); | ||
223 | } | ||
224 | |||
225 | if ( !err && !datasync ) { | 217 | if ( !err && !datasync ) { |
226 | lock_kernel(); | 218 | lock_kernel(); |
227 | err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); | 219 | err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); |
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index 81b7771c6465..43c96ce29614 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -11,7 +11,9 @@
11 | 11 | ||
12 | #include "coda_int.h" | 12 | #include "coda_int.h" |
13 | 13 | ||
14 | #ifdef CONFIG_SYSCTL | ||
14 | static struct ctl_table_header *fs_table_header; | 15 | static struct ctl_table_header *fs_table_header; |
16 | #endif | ||
15 | 17 | ||
16 | static ctl_table coda_table[] = { | 18 | static ctl_table coda_table[] = { |
17 | { | 19 | { |
@@ -41,6 +43,7 @@ static ctl_table coda_table[] = {
41 | {} | 43 | {} |
42 | }; | 44 | }; |
43 | 45 | ||
46 | #ifdef CONFIG_SYSCTL | ||
44 | static ctl_table fs_table[] = { | 47 | static ctl_table fs_table[] = { |
45 | { | 48 | { |
46 | .ctl_name = CTL_UNNUMBERED, | 49 | .ctl_name = CTL_UNNUMBERED, |
@@ -50,7 +53,7 @@ static ctl_table fs_table[] = {
50 | }, | 53 | }, |
51 | {} | 54 | {} |
52 | }; | 55 | }; |
53 | 56 | #endif | |
54 | 57 | ||
55 | void coda_sysctl_init(void) | 58 | void coda_sysctl_init(void) |
56 | { | 59 | { |
diff --git a/fs/compat.c b/fs/compat.c
index d1ece79b6411..30f2faa22f5c 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1187,6 +1187,9 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsign
1187 | ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos); | 1187 | ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos); |
1188 | 1188 | ||
1189 | out: | 1189 | out: |
1190 | if (ret > 0) | ||
1191 | add_rchar(current, ret); | ||
1192 | inc_syscr(current); | ||
1190 | fput(file); | 1193 | fput(file); |
1191 | return ret; | 1194 | return ret; |
1192 | } | 1195 | } |
@@ -1210,6 +1213,9 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsig
1210 | ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos); | 1213 | ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos); |
1211 | 1214 | ||
1212 | out: | 1215 | out: |
1216 | if (ret > 0) | ||
1217 | add_wchar(current, ret); | ||
1218 | inc_syscw(current); | ||
1213 | fput(file); | 1219 | fput(file); |
1214 | return ret; | 1220 | return ret; |
1215 | } | 1221 | } |
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 4803ccc94480..5d349d38e056 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -117,8 +117,6 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
117 | static inline void set_default_inode_attr(struct inode * inode, mode_t mode) | 117 | static inline void set_default_inode_attr(struct inode * inode, mode_t mode) |
118 | { | 118 | { |
119 | inode->i_mode = mode; | 119 | inode->i_mode = mode; |
120 | inode->i_uid = 0; | ||
121 | inode->i_gid = 0; | ||
122 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 120 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
123 | } | 121 | } |
124 | 122 | ||
@@ -136,7 +134,6 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
136 | { | 134 | { |
137 | struct inode * inode = new_inode(configfs_sb); | 135 | struct inode * inode = new_inode(configfs_sb); |
138 | if (inode) { | 136 | if (inode) { |
139 | inode->i_blocks = 0; | ||
140 | inode->i_mapping->a_ops = &configfs_aops; | 137 | inode->i_mapping->a_ops = &configfs_aops; |
141 | inode->i_mapping->backing_dev_info = &configfs_backing_dev_info; | 138 | inode->i_mapping->backing_dev_info = &configfs_backing_dev_info; |
142 | inode->i_op = &configfs_inode_operations; | 139 | inode->i_op = &configfs_inode_operations; |
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index f40423eb1a14..a07338d2d140 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -83,8 +83,6 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
83 | inode->i_op = &page_symlink_inode_operations; | 83 | inode->i_op = &page_symlink_inode_operations; |
84 | inode->i_data.a_ops = &cramfs_aops; | 84 | inode->i_data.a_ops = &cramfs_aops; |
85 | } else { | 85 | } else { |
86 | inode->i_size = 0; | ||
87 | inode->i_blocks = 0; | ||
88 | init_special_inode(inode, inode->i_mode, | 86 | init_special_inode(inode, inode->i_mode, |
89 | old_decode_dev(cramfs_inode->size)); | 87 | old_decode_dev(cramfs_inode->size)); |
90 | } | 88 | } |
diff --git a/fs/dcache.c b/fs/dcache.c
index e88c23b85a32..4547f66884a0 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1567,10 +1567,6 @@ void d_rehash(struct dentry * entry)
1567 | spin_unlock(&dcache_lock); | 1567 | spin_unlock(&dcache_lock); |
1568 | } | 1568 | } |
1569 | 1569 | ||
1570 | #define do_switch(x,y) do { \ | ||
1571 | __typeof__ (x) __tmp = x; \ | ||
1572 | x = y; y = __tmp; } while (0) | ||
1573 | |||
1574 | /* | 1570 | /* |
1575 | * When switching names, the actual string doesn't strictly have to | 1571 | * When switching names, the actual string doesn't strictly have to |
1576 | * be preserved in the target - because we're dropping the target | 1572 | * be preserved in the target - because we're dropping the target |
@@ -1589,7 +1585,7 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
1589 | /* | 1585 | /* |
1590 | * Both external: swap the pointers | 1586 | * Both external: swap the pointers |
1591 | */ | 1587 | */ |
1592 | do_switch(target->d_name.name, dentry->d_name.name); | 1588 | swap(target->d_name.name, dentry->d_name.name); |
1593 | } else { | 1589 | } else { |
1594 | /* | 1590 | /* |
1595 | * dentry:internal, target:external. Steal target's | 1591 | * dentry:internal, target:external. Steal target's |
@@ -1620,7 +1616,7 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
1620 | return; | 1616 | return; |
1621 | } | 1617 | } |
1622 | } | 1618 | } |
1623 | do_switch(dentry->d_name.len, target->d_name.len); | 1619 | swap(dentry->d_name.len, target->d_name.len); |
1624 | } | 1620 | } |
1625 | 1621 | ||
1626 | /* | 1622 | /* |
@@ -1680,7 +1676,7 @@ already_unhashed:
1680 | 1676 | ||
1681 | /* Switch the names.. */ | 1677 | /* Switch the names.. */ |
1682 | switch_names(dentry, target); | 1678 | switch_names(dentry, target); |
1683 | do_switch(dentry->d_name.hash, target->d_name.hash); | 1679 | swap(dentry->d_name.hash, target->d_name.hash); |
1684 | 1680 | ||
1685 | /* ... and switch the parents */ | 1681 | /* ... and switch the parents */ |
1686 | if (IS_ROOT(dentry)) { | 1682 | if (IS_ROOT(dentry)) { |
@@ -1688,7 +1684,7 @@ already_unhashed:
1688 | target->d_parent = target; | 1684 | target->d_parent = target; |
1689 | INIT_LIST_HEAD(&target->d_u.d_child); | 1685 | INIT_LIST_HEAD(&target->d_u.d_child); |
1690 | } else { | 1686 | } else { |
1691 | do_switch(dentry->d_parent, target->d_parent); | 1687 | swap(dentry->d_parent, target->d_parent); |
1692 | 1688 | ||
1693 | /* And add them back to the (new) parent lists */ | 1689 | /* And add them back to the (new) parent lists */ |
1694 | list_add(&target->d_u.d_child, &target->d_parent->d_subdirs); | 1690 | list_add(&target->d_u.d_child, &target->d_parent->d_subdirs); |
@@ -1789,7 +1785,7 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
1789 | struct dentry *dparent, *aparent; | 1785 | struct dentry *dparent, *aparent; |
1790 | 1786 | ||
1791 | switch_names(dentry, anon); | 1787 | switch_names(dentry, anon); |
1792 | do_switch(dentry->d_name.hash, anon->d_name.hash); | 1788 | swap(dentry->d_name.hash, anon->d_name.hash); |
1793 | 1789 | ||
1794 | dparent = dentry->d_parent; | 1790 | dparent = dentry->d_parent; |
1795 | aparent = anon->d_parent; | 1791 | aparent = anon->d_parent; |
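The dcache.c hunks drop the file-local do_switch() macro in favour of the generic swap() helper, which has the same shape: a type-safe exchange of two lvalues through a temporary. Roughly (the generic definition lives in include/linux/kernel.h, not in this diff):

/* Equivalent of the removed do_switch() macro. */
#define swap(a, b) \
	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)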
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 159a5efd6a8a..33a90120f6ad 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -294,6 +294,38 @@ struct dentry *debugfs_create_x32(const char *name, mode_t mode,
294 | } | 294 | } |
295 | EXPORT_SYMBOL_GPL(debugfs_create_x32); | 295 | EXPORT_SYMBOL_GPL(debugfs_create_x32); |
296 | 296 | ||
297 | |||
298 | static int debugfs_size_t_set(void *data, u64 val) | ||
299 | { | ||
300 | *(size_t *)data = val; | ||
301 | return 0; | ||
302 | } | ||
303 | static int debugfs_size_t_get(void *data, u64 *val) | ||
304 | { | ||
305 | *val = *(size_t *)data; | ||
306 | return 0; | ||
307 | } | ||
308 | DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set, | ||
309 | "%llu\n"); /* %llu and %zu are more or less the same */ | ||
310 | |||
311 | /** | ||
312 | * debugfs_create_size_t - create a debugfs file that is used to read and write an size_t value | ||
313 | * @name: a pointer to a string containing the name of the file to create. | ||
314 | * @mode: the permission that the file should have | ||
315 | * @parent: a pointer to the parent dentry for this file. This should be a | ||
316 | * directory dentry if set. If this parameter is %NULL, then the | ||
317 | * file will be created in the root of the debugfs filesystem. | ||
318 | * @value: a pointer to the variable that the file should read to and write | ||
319 | * from. | ||
320 | */ | ||
321 | struct dentry *debugfs_create_size_t(const char *name, mode_t mode, | ||
322 | struct dentry *parent, size_t *value) | ||
323 | { | ||
324 | return debugfs_create_file(name, mode, parent, value, &fops_size_t); | ||
325 | } | ||
326 | EXPORT_SYMBOL_GPL(debugfs_create_size_t); | ||
327 | |||
328 | |||
297 | static ssize_t read_file_bool(struct file *file, char __user *user_buf, | 329 | static ssize_t read_file_bool(struct file *file, char __user *user_buf, |
298 | size_t count, loff_t *ppos) | 330 | size_t count, loff_t *ppos) |
299 | { | 331 | { |
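The new debugfs_create_size_t() follows the same pattern as the existing debugfs_create_u32()/x32() helpers: it wires a size_t variable to a simple attribute file. A usage sketch; the "foo" driver, directory and variable names are invented for illustration and are not part of the commit:

#include <linux/debugfs.h>
#include <linux/init.h>
#include <linux/module.h>

static size_t foo_buffer_size = 4096;		/* hypothetical tunable */
static struct dentry *foo_debug_dir;

static int __init foo_debugfs_init(void)
{
	foo_debug_dir = debugfs_create_dir("foo", NULL);
	if (!foo_debug_dir)
		return -ENOMEM;

	/* Reads and writes of <debugfs>/foo/buffer_size now go straight to
	 * foo_buffer_size, formatted as a decimal value. */
	debugfs_create_size_t("buffer_size", 0644, foo_debug_dir,
			      &foo_buffer_size);
	return 0;
}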
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 3dbe2169cf36..81ae9ea3c6e1 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -37,9 +37,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
37 | 37 | ||
38 | if (inode) { | 38 | if (inode) { |
39 | inode->i_mode = mode; | 39 | inode->i_mode = mode; |
40 | inode->i_uid = 0; | ||
41 | inode->i_gid = 0; | ||
42 | inode->i_blocks = 0; | ||
43 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 40 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
44 | switch (mode & S_IFMT) { | 41 | switch (mode & S_IFMT) { |
45 | default: | 42 | default: |
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index fff96e152c0c..5f3231b9633f 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -189,8 +189,6 @@ static int mknod_ptmx(struct super_block *sb)
189 | } | 189 | } |
190 | 190 | ||
191 | inode->i_ino = 2; | 191 | inode->i_ino = 2; |
192 | inode->i_uid = inode->i_gid = 0; | ||
193 | inode->i_blocks = 0; | ||
194 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; | 192 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; |
195 | 193 | ||
196 | mode = S_IFCHR|opts->ptmxmode; | 194 | mode = S_IFCHR|opts->ptmxmode; |
@@ -300,8 +298,6 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
300 | goto free_fsi; | 298 | goto free_fsi; |
301 | inode->i_ino = 1; | 299 | inode->i_ino = 1; |
302 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; | 300 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; |
303 | inode->i_blocks = 0; | ||
304 | inode->i_uid = inode->i_gid = 0; | ||
305 | inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; | 301 | inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; |
306 | inode->i_op = &simple_dir_inode_operations; | 302 | inode->i_op = &simple_dir_inode_operations; |
307 | inode->i_fop = &simple_dir_operations; | 303 | inode->i_fop = &simple_dir_operations; |
diff --git a/fs/direct-io.c b/fs/direct-io.c
index af0558dbe8b7..b6d43908ff7a 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1209,6 +1209,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1209 | retval = direct_io_worker(rw, iocb, inode, iov, offset, | 1209 | retval = direct_io_worker(rw, iocb, inode, iov, offset, |
1210 | nr_segs, blkbits, get_block, end_io, dio); | 1210 | nr_segs, blkbits, get_block, end_io, dio); |
1211 | 1211 | ||
1212 | /* | ||
1213 | * In case of error extending write may have instantiated a few | ||
1214 | * blocks outside i_size. Trim these off again for DIO_LOCKING. | ||
1215 | * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by | ||
1216 | * it's own meaner. | ||
1217 | */ | ||
1218 | if (unlikely(retval < 0 && (rw & WRITE))) { | ||
1219 | loff_t isize = i_size_read(inode); | ||
1220 | |||
1221 | if (end > isize && dio_lock_type == DIO_LOCKING) | ||
1222 | vmtruncate(inode, isize); | ||
1223 | } | ||
1224 | |||
1212 | if (rw == READ && dio_lock_type == DIO_LOCKING) | 1225 | if (rw == READ && dio_lock_type == DIO_LOCKING) |
1213 | release_i_mutex = 0; | 1226 | release_i_mutex = 0; |
1214 | 1227 | ||
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 8bf31e3fbf01..dc2ad6008b2d 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -2,7 +2,7 @@
2 | ******************************************************************************* | 2 | ******************************************************************************* |
3 | ** | 3 | ** |
4 | ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | 4 | ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. |
5 | ** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. | 5 | ** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. |
6 | ** | 6 | ** |
7 | ** This copyrighted material is made available to anyone wishing to use, | 7 | ** This copyrighted material is made available to anyone wishing to use, |
8 | ** modify, copy, or redistribute it subject to the terms and conditions | 8 | ** modify, copy, or redistribute it subject to the terms and conditions |
@@ -33,10 +33,10 @@ void dlm_del_ast(struct dlm_lkb *lkb)
33 | spin_unlock(&ast_queue_lock); | 33 | spin_unlock(&ast_queue_lock); |
34 | } | 34 | } |
35 | 35 | ||
36 | void dlm_add_ast(struct dlm_lkb *lkb, int type) | 36 | void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode) |
37 | { | 37 | { |
38 | if (lkb->lkb_flags & DLM_IFL_USER) { | 38 | if (lkb->lkb_flags & DLM_IFL_USER) { |
39 | dlm_user_add_ast(lkb, type); | 39 | dlm_user_add_ast(lkb, type, bastmode); |
40 | return; | 40 | return; |
41 | } | 41 | } |
42 | 42 | ||
@@ -46,6 +46,8 @@ void dlm_add_ast(struct dlm_lkb *lkb, int type)
46 | list_add_tail(&lkb->lkb_astqueue, &ast_queue); | 46 | list_add_tail(&lkb->lkb_astqueue, &ast_queue); |
47 | } | 47 | } |
48 | lkb->lkb_ast_type |= type; | 48 | lkb->lkb_ast_type |= type; |
49 | if (bastmode) | ||
50 | lkb->lkb_bastmode = bastmode; | ||
49 | spin_unlock(&ast_queue_lock); | 51 | spin_unlock(&ast_queue_lock); |
50 | 52 | ||
51 | set_bit(WAKE_ASTS, &astd_wakeflags); | 53 | set_bit(WAKE_ASTS, &astd_wakeflags); |
@@ -59,50 +61,40 @@ static void process_asts(void)
59 | struct dlm_lkb *lkb; | 61 | struct dlm_lkb *lkb; |
60 | void (*cast) (void *astparam); | 62 | void (*cast) (void *astparam); |
61 | void (*bast) (void *astparam, int mode); | 63 | void (*bast) (void *astparam, int mode); |
62 | int type = 0, found, bmode; | 64 | int type = 0, bastmode; |
63 | 65 | ||
64 | for (;;) { | 66 | repeat: |
65 | found = 0; | 67 | spin_lock(&ast_queue_lock); |
66 | spin_lock(&ast_queue_lock); | 68 | list_for_each_entry(lkb, &ast_queue, lkb_astqueue) { |
67 | list_for_each_entry(lkb, &ast_queue, lkb_astqueue) { | 69 | r = lkb->lkb_resource; |
68 | r = lkb->lkb_resource; | 70 | ls = r->res_ls; |
69 | ls = r->res_ls; | 71 | |
70 | 72 | if (dlm_locking_stopped(ls)) | |
71 | if (dlm_locking_stopped(ls)) | 73 | continue; |
72 | continue; | ||
73 | |||
74 | list_del(&lkb->lkb_astqueue); | ||
75 | type = lkb->lkb_ast_type; | ||
76 | lkb->lkb_ast_type = 0; | ||
77 | found = 1; | ||
78 | break; | ||
79 | } | ||
80 | spin_unlock(&ast_queue_lock); | ||
81 | 74 | ||
82 | if (!found) | 75 | list_del(&lkb->lkb_astqueue); |
83 | break; | 76 | type = lkb->lkb_ast_type; |
77 | lkb->lkb_ast_type = 0; | ||
78 | bastmode = lkb->lkb_bastmode; | ||
84 | 79 | ||
80 | spin_unlock(&ast_queue_lock); | ||
85 | cast = lkb->lkb_astfn; | 81 | cast = lkb->lkb_astfn; |
86 | bast = lkb->lkb_bastfn; | 82 | bast = lkb->lkb_bastfn; |
87 | bmode = lkb->lkb_bastmode; | ||
88 | 83 | ||
89 | if ((type & AST_COMP) && cast) | 84 | if ((type & AST_COMP) && cast) |
90 | cast(lkb->lkb_astparam); | 85 | cast(lkb->lkb_astparam); |
91 | 86 | ||
92 | /* FIXME: Is it safe to look at lkb_grmode here | ||
93 | without doing a lock_rsb() ? | ||
94 | Look at other checks in v1 to avoid basts. */ | ||
95 | |||
96 | if ((type & AST_BAST) && bast) | 87 | if ((type & AST_BAST) && bast) |
97 | if (!dlm_modes_compat(lkb->lkb_grmode, bmode)) | 88 | bast(lkb->lkb_astparam, bastmode); |
98 | bast(lkb->lkb_astparam, bmode); | ||
99 | 89 | ||
100 | /* this removes the reference added by dlm_add_ast | 90 | /* this removes the reference added by dlm_add_ast |
101 | and may result in the lkb being freed */ | 91 | and may result in the lkb being freed */ |
102 | dlm_put_lkb(lkb); | 92 | dlm_put_lkb(lkb); |
103 | 93 | ||
104 | schedule(); | 94 | cond_resched(); |
95 | goto repeat; | ||
105 | } | 96 | } |
97 | spin_unlock(&ast_queue_lock); | ||
106 | } | 98 | } |
107 | 99 | ||
108 | static inline int no_asts(void) | 100 | static inline int no_asts(void) |
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h index 6ee276c74c52..1b5fc5f428fd 100644 --- a/fs/dlm/ast.h +++ b/fs/dlm/ast.h | |||
@@ -1,7 +1,7 @@ | |||
1 | /****************************************************************************** | 1 | /****************************************************************************** |
2 | ******************************************************************************* | 2 | ******************************************************************************* |
3 | ** | 3 | ** |
4 | ** Copyright (C) 2005 Red Hat, Inc. All rights reserved. | 4 | ** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. |
5 | ** | 5 | ** |
6 | ** This copyrighted material is made available to anyone wishing to use, | 6 | ** This copyrighted material is made available to anyone wishing to use, |
7 | ** modify, copy, or redistribute it subject to the terms and conditions | 7 | ** modify, copy, or redistribute it subject to the terms and conditions |
@@ -13,7 +13,7 @@ | |||
13 | #ifndef __ASTD_DOT_H__ | 13 | #ifndef __ASTD_DOT_H__ |
14 | #define __ASTD_DOT_H__ | 14 | #define __ASTD_DOT_H__ |
15 | 15 | ||
16 | void dlm_add_ast(struct dlm_lkb *lkb, int type); | 16 | void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode); |
17 | void dlm_del_ast(struct dlm_lkb *lkb); | 17 | void dlm_del_ast(struct dlm_lkb *lkb); |
18 | 18 | ||
19 | void dlm_astd_wake(void); | 19 | void dlm_astd_wake(void); |
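The signature change above means the blocking-ast mode now travels into dlm_add_ast() and is stored under ast_queue_lock, instead of being read from lkb_bastmode later in process_asts() without the lock. A minimal sketch of the two resulting call patterns, mirroring the queue_cast()/queue_bast() hunks in fs/dlm/lock.c later in this series (the lkb field names are taken from those hunks):

    /* Completion ast: no blocking mode to report. */
    static void queue_cast_sketch(struct dlm_lkb *lkb, int rv)
    {
            lkb->lkb_lksb->sb_status = rv;
            dlm_add_ast(lkb, AST_COMP, 0);
    }

    /* Blocking ast: the requested mode is captured while queueing. */
    static void queue_bast_sketch(struct dlm_lkb *lkb, int rqmode)
    {
            dlm_add_ast(lkb, AST_BAST, rqmode);
    }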
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 8fc24f4507a3..2f107d1a6a45 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /****************************************************************************** | 1 | /****************************************************************************** |
2 | ******************************************************************************* | 2 | ******************************************************************************* |
3 | ** | 3 | ** |
4 | ** Copyright (C) 2005 Red Hat, Inc. All rights reserved. | 4 | ** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. |
5 | ** | 5 | ** |
6 | ** This copyrighted material is made available to anyone wishing to use, | 6 | ** This copyrighted material is made available to anyone wishing to use, |
7 | ** modify, copy, or redistribute it subject to the terms and conditions | 7 | ** modify, copy, or redistribute it subject to the terms and conditions |
@@ -27,7 +27,7 @@ static struct dentry *dlm_root; | |||
27 | 27 | ||
28 | struct rsb_iter { | 28 | struct rsb_iter { |
29 | int entry; | 29 | int entry; |
30 | int locks; | 30 | int format; |
31 | int header; | 31 | int header; |
32 | struct dlm_ls *ls; | 32 | struct dlm_ls *ls; |
33 | struct list_head *next; | 33 | struct list_head *next; |
@@ -60,8 +60,8 @@ static char *print_lockmode(int mode) | |||
60 | } | 60 | } |
61 | } | 61 | } |
62 | 62 | ||
63 | static void print_resource_lock(struct seq_file *s, struct dlm_lkb *lkb, | 63 | static void print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb, |
64 | struct dlm_rsb *res) | 64 | struct dlm_rsb *res) |
65 | { | 65 | { |
66 | seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode)); | 66 | seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode)); |
67 | 67 | ||
@@ -83,7 +83,7 @@ static void print_resource_lock(struct seq_file *s, struct dlm_lkb *lkb, | |||
83 | seq_printf(s, "\n"); | 83 | seq_printf(s, "\n"); |
84 | } | 84 | } |
85 | 85 | ||
86 | static int print_resource(struct dlm_rsb *res, struct seq_file *s) | 86 | static int print_format1(struct dlm_rsb *res, struct seq_file *s) |
87 | { | 87 | { |
88 | struct dlm_lkb *lkb; | 88 | struct dlm_lkb *lkb; |
89 | int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list; | 89 | int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list; |
@@ -134,15 +134,15 @@ static int print_resource(struct dlm_rsb *res, struct seq_file *s) | |||
134 | /* Print the locks attached to this resource */ | 134 | /* Print the locks attached to this resource */ |
135 | seq_printf(s, "Granted Queue\n"); | 135 | seq_printf(s, "Granted Queue\n"); |
136 | list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue) | 136 | list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue) |
137 | print_resource_lock(s, lkb, res); | 137 | print_format1_lock(s, lkb, res); |
138 | 138 | ||
139 | seq_printf(s, "Conversion Queue\n"); | 139 | seq_printf(s, "Conversion Queue\n"); |
140 | list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue) | 140 | list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue) |
141 | print_resource_lock(s, lkb, res); | 141 | print_format1_lock(s, lkb, res); |
142 | 142 | ||
143 | seq_printf(s, "Waiting Queue\n"); | 143 | seq_printf(s, "Waiting Queue\n"); |
144 | list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue) | 144 | list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue) |
145 | print_resource_lock(s, lkb, res); | 145 | print_format1_lock(s, lkb, res); |
146 | 146 | ||
147 | if (list_empty(&res->res_lookup)) | 147 | if (list_empty(&res->res_lookup)) |
148 | goto out; | 148 | goto out; |
@@ -160,23 +160,24 @@ static int print_resource(struct dlm_rsb *res, struct seq_file *s) | |||
160 | return 0; | 160 | return 0; |
161 | } | 161 | } |
162 | 162 | ||
163 | static void print_lock(struct seq_file *s, struct dlm_lkb *lkb, struct dlm_rsb *r) | 163 | static void print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb, |
164 | struct dlm_rsb *r) | ||
164 | { | 165 | { |
165 | unsigned int waiting = 0; | 166 | u64 xid = 0; |
166 | uint64_t xid = 0; | 167 | u64 us; |
167 | 168 | ||
168 | if (lkb->lkb_flags & DLM_IFL_USER) { | 169 | if (lkb->lkb_flags & DLM_IFL_USER) { |
169 | if (lkb->lkb_ua) | 170 | if (lkb->lkb_ua) |
170 | xid = lkb->lkb_ua->xid; | 171 | xid = lkb->lkb_ua->xid; |
171 | } | 172 | } |
172 | 173 | ||
173 | if (lkb->lkb_timestamp) | 174 | /* microseconds since lkb was added to current queue */ |
174 | waiting = jiffies_to_msecs(jiffies - lkb->lkb_timestamp); | 175 | us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_timestamp)); |
175 | 176 | ||
176 | /* id nodeid remid pid xid exflags flags sts grmode rqmode time_ms | 177 | /* id nodeid remid pid xid exflags flags sts grmode rqmode time_us |
177 | r_nodeid r_len r_name */ | 178 | r_nodeid r_len r_name */ |
178 | 179 | ||
179 | seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %u %u %d \"%s\"\n", | 180 | seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %llu %u %d \"%s\"\n", |
180 | lkb->lkb_id, | 181 | lkb->lkb_id, |
181 | lkb->lkb_nodeid, | 182 | lkb->lkb_nodeid, |
182 | lkb->lkb_remid, | 183 | lkb->lkb_remid, |
@@ -187,26 +188,114 @@ static void print_lock(struct seq_file *s, struct dlm_lkb *lkb, struct dlm_rsb * | |||
187 | lkb->lkb_status, | 188 | lkb->lkb_status, |
188 | lkb->lkb_grmode, | 189 | lkb->lkb_grmode, |
189 | lkb->lkb_rqmode, | 190 | lkb->lkb_rqmode, |
190 | waiting, | 191 | (unsigned long long)us, |
191 | r->res_nodeid, | 192 | r->res_nodeid, |
192 | r->res_length, | 193 | r->res_length, |
193 | r->res_name); | 194 | r->res_name); |
194 | } | 195 | } |
195 | 196 | ||
196 | static int print_locks(struct dlm_rsb *r, struct seq_file *s) | 197 | static int print_format2(struct dlm_rsb *r, struct seq_file *s) |
197 | { | 198 | { |
198 | struct dlm_lkb *lkb; | 199 | struct dlm_lkb *lkb; |
199 | 200 | ||
200 | lock_rsb(r); | 201 | lock_rsb(r); |
201 | 202 | ||
202 | list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) | 203 | list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) |
203 | print_lock(s, lkb, r); | 204 | print_format2_lock(s, lkb, r); |
204 | 205 | ||
205 | list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) | 206 | list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) |
206 | print_lock(s, lkb, r); | 207 | print_format2_lock(s, lkb, r); |
207 | 208 | ||
208 | list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) | 209 | list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) |
209 | print_lock(s, lkb, r); | 210 | print_format2_lock(s, lkb, r); |
211 | |||
212 | unlock_rsb(r); | ||
213 | return 0; | ||
214 | } | ||
215 | |||
216 | static void print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb, | ||
217 | int rsb_lookup) | ||
218 | { | ||
219 | u64 xid = 0; | ||
220 | |||
221 | if (lkb->lkb_flags & DLM_IFL_USER) { | ||
222 | if (lkb->lkb_ua) | ||
223 | xid = lkb->lkb_ua->xid; | ||
224 | } | ||
225 | |||
226 | seq_printf(s, "lkb %x %d %x %u %llu %x %x %d %d %d %d %d %d %u %llu %llu\n", | ||
227 | lkb->lkb_id, | ||
228 | lkb->lkb_nodeid, | ||
229 | lkb->lkb_remid, | ||
230 | lkb->lkb_ownpid, | ||
231 | (unsigned long long)xid, | ||
232 | lkb->lkb_exflags, | ||
233 | lkb->lkb_flags, | ||
234 | lkb->lkb_status, | ||
235 | lkb->lkb_grmode, | ||
236 | lkb->lkb_rqmode, | ||
237 | lkb->lkb_highbast, | ||
238 | rsb_lookup, | ||
239 | lkb->lkb_wait_type, | ||
240 | lkb->lkb_lvbseq, | ||
241 | (unsigned long long)ktime_to_ns(lkb->lkb_timestamp), | ||
242 | (unsigned long long)ktime_to_ns(lkb->lkb_time_bast)); | ||
243 | } | ||
244 | |||
245 | static int print_format3(struct dlm_rsb *r, struct seq_file *s) | ||
246 | { | ||
247 | struct dlm_lkb *lkb; | ||
248 | int i, lvblen = r->res_ls->ls_lvblen; | ||
249 | int print_name = 1; | ||
250 | |||
251 | lock_rsb(r); | ||
252 | |||
253 | seq_printf(s, "rsb %p %d %x %lx %d %d %u %d ", | ||
254 | r, | ||
255 | r->res_nodeid, | ||
256 | r->res_first_lkid, | ||
257 | r->res_flags, | ||
258 | !list_empty(&r->res_root_list), | ||
259 | !list_empty(&r->res_recover_list), | ||
260 | r->res_recover_locks_count, | ||
261 | r->res_length); | ||
262 | |||
263 | for (i = 0; i < r->res_length; i++) { | ||
264 | if (!isascii(r->res_name[i]) || !isprint(r->res_name[i])) | ||
265 | print_name = 0; | ||
266 | } | ||
267 | |||
268 | seq_printf(s, "%s", print_name ? "str " : "hex"); | ||
269 | |||
270 | for (i = 0; i < r->res_length; i++) { | ||
271 | if (print_name) | ||
272 | seq_printf(s, "%c", r->res_name[i]); | ||
273 | else | ||
274 | seq_printf(s, " %02x", (unsigned char)r->res_name[i]); | ||
275 | } | ||
276 | seq_printf(s, "\n"); | ||
277 | |||
278 | if (!r->res_lvbptr) | ||
279 | goto do_locks; | ||
280 | |||
281 | seq_printf(s, "lvb %u %d", r->res_lvbseq, lvblen); | ||
282 | |||
283 | for (i = 0; i < lvblen; i++) | ||
284 | seq_printf(s, " %02x", (unsigned char)r->res_lvbptr[i]); | ||
285 | seq_printf(s, "\n"); | ||
286 | |||
287 | do_locks: | ||
288 | list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) | ||
289 | print_format3_lock(s, lkb, 0); | ||
290 | |||
291 | list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) | ||
292 | print_format3_lock(s, lkb, 0); | ||
293 | |||
294 | list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) | ||
295 | print_format3_lock(s, lkb, 0); | ||
296 | |||
297 | list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup) | ||
298 | print_format3_lock(s, lkb, 1); | ||
210 | 299 | ||
211 | unlock_rsb(r); | 300 | unlock_rsb(r); |
212 | return 0; | 301 | return 0; |
@@ -231,7 +320,7 @@ static int rsb_iter_next(struct rsb_iter *ri) | |||
231 | break; | 320 | break; |
232 | } | 321 | } |
233 | read_unlock(&ls->ls_rsbtbl[i].lock); | 322 | read_unlock(&ls->ls_rsbtbl[i].lock); |
234 | } | 323 | } |
235 | ri->entry = i; | 324 | ri->entry = i; |
236 | 325 | ||
237 | if (ri->entry >= ls->ls_rsbtbl_size) | 326 | if (ri->entry >= ls->ls_rsbtbl_size) |
@@ -248,7 +337,7 @@ static int rsb_iter_next(struct rsb_iter *ri) | |||
248 | read_unlock(&ls->ls_rsbtbl[i].lock); | 337 | read_unlock(&ls->ls_rsbtbl[i].lock); |
249 | dlm_put_rsb(old); | 338 | dlm_put_rsb(old); |
250 | goto top; | 339 | goto top; |
251 | } | 340 | } |
252 | ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain); | 341 | ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain); |
253 | dlm_hold_rsb(ri->rsb); | 342 | dlm_hold_rsb(ri->rsb); |
254 | read_unlock(&ls->ls_rsbtbl[i].lock); | 343 | read_unlock(&ls->ls_rsbtbl[i].lock); |
@@ -274,6 +363,7 @@ static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls) | |||
274 | ri->ls = ls; | 363 | ri->ls = ls; |
275 | ri->entry = 0; | 364 | ri->entry = 0; |
276 | ri->next = NULL; | 365 | ri->next = NULL; |
366 | ri->format = 1; | ||
277 | 367 | ||
278 | if (rsb_iter_next(ri)) { | 368 | if (rsb_iter_next(ri)) { |
279 | rsb_iter_free(ri); | 369 | rsb_iter_free(ri); |
@@ -325,16 +415,26 @@ static int rsb_seq_show(struct seq_file *file, void *iter_ptr) | |||
325 | { | 415 | { |
326 | struct rsb_iter *ri = iter_ptr; | 416 | struct rsb_iter *ri = iter_ptr; |
327 | 417 | ||
328 | if (ri->locks) { | 418 | switch (ri->format) { |
419 | case 1: | ||
420 | print_format1(ri->rsb, file); | ||
421 | break; | ||
422 | case 2: | ||
329 | if (ri->header) { | 423 | if (ri->header) { |
330 | seq_printf(file, "id nodeid remid pid xid exflags flags " | 424 | seq_printf(file, "id nodeid remid pid xid exflags " |
331 | "sts grmode rqmode time_ms r_nodeid " | 425 | "flags sts grmode rqmode time_ms " |
332 | "r_len r_name\n"); | 426 | "r_nodeid r_len r_name\n"); |
333 | ri->header = 0; | 427 | ri->header = 0; |
334 | } | 428 | } |
335 | print_locks(ri->rsb, file); | 429 | print_format2(ri->rsb, file); |
336 | } else { | 430 | break; |
337 | print_resource(ri->rsb, file); | 431 | case 3: |
432 | if (ri->header) { | ||
433 | seq_printf(file, "version rsb 1.1 lvb 1.1 lkb 1.1\n"); | ||
434 | ri->header = 0; | ||
435 | } | ||
436 | print_format3(ri->rsb, file); | ||
437 | break; | ||
338 | } | 438 | } |
339 | 439 | ||
340 | return 0; | 440 | return 0; |
@@ -385,7 +485,7 @@ static struct rsb_iter *locks_iter_init(struct dlm_ls *ls, loff_t *pos) | |||
385 | ri->ls = ls; | 485 | ri->ls = ls; |
386 | ri->entry = 0; | 486 | ri->entry = 0; |
387 | ri->next = NULL; | 487 | ri->next = NULL; |
388 | ri->locks = 1; | 488 | ri->format = 2; |
389 | 489 | ||
390 | if (*pos == 0) | 490 | if (*pos == 0) |
391 | ri->header = 1; | 491 | ri->header = 1; |
@@ -448,6 +548,84 @@ static const struct file_operations locks_fops = { | |||
448 | }; | 548 | }; |
449 | 549 | ||
450 | /* | 550 | /* |
551 | * Dump all rsb/lvb/lkb state in a compact listing, more complete than _locks | ||
552 | * This can replace both formats 1 and 2 eventually. | ||
553 | */ | ||
554 | |||
555 | static struct rsb_iter *all_iter_init(struct dlm_ls *ls, loff_t *pos) | ||
556 | { | ||
557 | struct rsb_iter *ri; | ||
558 | |||
559 | ri = kzalloc(sizeof *ri, GFP_KERNEL); | ||
560 | if (!ri) | ||
561 | return NULL; | ||
562 | |||
563 | ri->ls = ls; | ||
564 | ri->entry = 0; | ||
565 | ri->next = NULL; | ||
566 | ri->format = 3; | ||
567 | |||
568 | if (*pos == 0) | ||
569 | ri->header = 1; | ||
570 | |||
571 | if (rsb_iter_next(ri)) { | ||
572 | rsb_iter_free(ri); | ||
573 | return NULL; | ||
574 | } | ||
575 | |||
576 | return ri; | ||
577 | } | ||
578 | |||
579 | static void *all_seq_start(struct seq_file *file, loff_t *pos) | ||
580 | { | ||
581 | struct rsb_iter *ri; | ||
582 | loff_t n = *pos; | ||
583 | |||
584 | ri = all_iter_init(file->private, pos); | ||
585 | if (!ri) | ||
586 | return NULL; | ||
587 | |||
588 | while (n--) { | ||
589 | if (rsb_iter_next(ri)) { | ||
590 | rsb_iter_free(ri); | ||
591 | return NULL; | ||
592 | } | ||
593 | } | ||
594 | |||
595 | return ri; | ||
596 | } | ||
597 | |||
598 | static struct seq_operations all_seq_ops = { | ||
599 | .start = all_seq_start, | ||
600 | .next = rsb_seq_next, | ||
601 | .stop = rsb_seq_stop, | ||
602 | .show = rsb_seq_show, | ||
603 | }; | ||
604 | |||
605 | static int all_open(struct inode *inode, struct file *file) | ||
606 | { | ||
607 | struct seq_file *seq; | ||
608 | int ret; | ||
609 | |||
610 | ret = seq_open(file, &all_seq_ops); | ||
611 | if (ret) | ||
612 | return ret; | ||
613 | |||
614 | seq = file->private_data; | ||
615 | seq->private = inode->i_private; | ||
616 | |||
617 | return 0; | ||
618 | } | ||
619 | |||
620 | static const struct file_operations all_fops = { | ||
621 | .owner = THIS_MODULE, | ||
622 | .open = all_open, | ||
623 | .read = seq_read, | ||
624 | .llseek = seq_lseek, | ||
625 | .release = seq_release | ||
626 | }; | ||
627 | |||
628 | /* | ||
451 | * dump lkb's on the ls_waiters list | 629 | * dump lkb's on the ls_waiters list |
452 | */ | 630 | */ |
453 | 631 | ||
@@ -489,30 +667,33 @@ static const struct file_operations waiters_fops = { | |||
489 | .read = waiters_read | 667 | .read = waiters_read |
490 | }; | 668 | }; |
491 | 669 | ||
670 | void dlm_delete_debug_file(struct dlm_ls *ls) | ||
671 | { | ||
672 | if (ls->ls_debug_rsb_dentry) | ||
673 | debugfs_remove(ls->ls_debug_rsb_dentry); | ||
674 | if (ls->ls_debug_waiters_dentry) | ||
675 | debugfs_remove(ls->ls_debug_waiters_dentry); | ||
676 | if (ls->ls_debug_locks_dentry) | ||
677 | debugfs_remove(ls->ls_debug_locks_dentry); | ||
678 | if (ls->ls_debug_all_dentry) | ||
679 | debugfs_remove(ls->ls_debug_all_dentry); | ||
680 | } | ||
681 | |||
492 | int dlm_create_debug_file(struct dlm_ls *ls) | 682 | int dlm_create_debug_file(struct dlm_ls *ls) |
493 | { | 683 | { |
494 | char name[DLM_LOCKSPACE_LEN+8]; | 684 | char name[DLM_LOCKSPACE_LEN+8]; |
495 | 685 | ||
686 | /* format 1 */ | ||
687 | |||
496 | ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name, | 688 | ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name, |
497 | S_IFREG | S_IRUGO, | 689 | S_IFREG | S_IRUGO, |
498 | dlm_root, | 690 | dlm_root, |
499 | ls, | 691 | ls, |
500 | &rsb_fops); | 692 | &rsb_fops); |
501 | if (!ls->ls_debug_rsb_dentry) | 693 | if (!ls->ls_debug_rsb_dentry) |
502 | return -ENOMEM; | 694 | goto fail; |
503 | 695 | ||
504 | memset(name, 0, sizeof(name)); | 696 | /* format 2 */ |
505 | snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name); | ||
506 | |||
507 | ls->ls_debug_waiters_dentry = debugfs_create_file(name, | ||
508 | S_IFREG | S_IRUGO, | ||
509 | dlm_root, | ||
510 | ls, | ||
511 | &waiters_fops); | ||
512 | if (!ls->ls_debug_waiters_dentry) { | ||
513 | debugfs_remove(ls->ls_debug_rsb_dentry); | ||
514 | return -ENOMEM; | ||
515 | } | ||
516 | 697 | ||
517 | memset(name, 0, sizeof(name)); | 698 | memset(name, 0, sizeof(name)); |
518 | snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_locks", ls->ls_name); | 699 | snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_locks", ls->ls_name); |
@@ -522,23 +703,38 @@ int dlm_create_debug_file(struct dlm_ls *ls) | |||
522 | dlm_root, | 703 | dlm_root, |
523 | ls, | 704 | ls, |
524 | &locks_fops); | 705 | &locks_fops); |
525 | if (!ls->ls_debug_locks_dentry) { | 706 | if (!ls->ls_debug_locks_dentry) |
526 | debugfs_remove(ls->ls_debug_waiters_dentry); | 707 | goto fail; |
527 | debugfs_remove(ls->ls_debug_rsb_dentry); | 708 | |
528 | return -ENOMEM; | 709 | /* format 3 */ |
529 | } | 710 | |
711 | memset(name, 0, sizeof(name)); | ||
712 | snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_all", ls->ls_name); | ||
713 | |||
714 | ls->ls_debug_all_dentry = debugfs_create_file(name, | ||
715 | S_IFREG | S_IRUGO, | ||
716 | dlm_root, | ||
717 | ls, | ||
718 | &all_fops); | ||
719 | if (!ls->ls_debug_all_dentry) | ||
720 | goto fail; | ||
721 | |||
722 | memset(name, 0, sizeof(name)); | ||
723 | snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name); | ||
724 | |||
725 | ls->ls_debug_waiters_dentry = debugfs_create_file(name, | ||
726 | S_IFREG | S_IRUGO, | ||
727 | dlm_root, | ||
728 | ls, | ||
729 | &waiters_fops); | ||
730 | if (!ls->ls_debug_waiters_dentry) | ||
731 | goto fail; | ||
530 | 732 | ||
531 | return 0; | 733 | return 0; |
532 | } | ||
533 | 734 | ||
534 | void dlm_delete_debug_file(struct dlm_ls *ls) | 735 | fail: |
535 | { | 736 | dlm_delete_debug_file(ls); |
536 | if (ls->ls_debug_rsb_dentry) | 737 | return -ENOMEM; |
537 | debugfs_remove(ls->ls_debug_rsb_dentry); | ||
538 | if (ls->ls_debug_waiters_dentry) | ||
539 | debugfs_remove(ls->ls_debug_waiters_dentry); | ||
540 | if (ls->ls_debug_locks_dentry) | ||
541 | debugfs_remove(ls->ls_debug_locks_dentry); | ||
542 | } | 738 | } |
543 | 739 | ||
544 | int __init dlm_register_debugfs(void) | 740 | int __init dlm_register_debugfs(void) |
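With four debugfs files per lockspace, dlm_create_debug_file() now unwinds through a single fail label and reuses dlm_delete_debug_file(), which tolerates dentries that were never created. A sketch of the same pattern with hypothetical names (my_obj, root, a_fops, b_fops and delete_all() are illustrative, not from the patch):

    static int create_all(struct my_obj *o)
    {
            o->a = debugfs_create_file("a", S_IFREG | S_IRUGO, root, o, &a_fops);
            if (!o->a)
                    goto fail;

            o->b = debugfs_create_file("b", S_IFREG | S_IRUGO, root, o, &b_fops);
            if (!o->b)
                    goto fail;

            return 0;
     fail:
            delete_all(o);          /* removes only the dentries that exist */
            return -ENOMEM;
    }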
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c index 85defeb64df4..92969f879a17 100644 --- a/fs/dlm/dir.c +++ b/fs/dlm/dir.c | |||
@@ -374,7 +374,7 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, | |||
374 | struct list_head *list; | 374 | struct list_head *list; |
375 | struct dlm_rsb *r; | 375 | struct dlm_rsb *r; |
376 | int offset = 0, dir_nodeid; | 376 | int offset = 0, dir_nodeid; |
377 | uint16_t be_namelen; | 377 | __be16 be_namelen; |
378 | 378 | ||
379 | down_read(&ls->ls_root_sem); | 379 | down_read(&ls->ls_root_sem); |
380 | 380 | ||
@@ -410,15 +410,15 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, | |||
410 | 410 | ||
411 | if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) { | 411 | if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) { |
412 | /* Write end-of-block record */ | 412 | /* Write end-of-block record */ |
413 | be_namelen = 0; | 413 | be_namelen = cpu_to_be16(0); |
414 | memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t)); | 414 | memcpy(outbuf + offset, &be_namelen, sizeof(__be16)); |
415 | offset += sizeof(uint16_t); | 415 | offset += sizeof(__be16); |
416 | goto out; | 416 | goto out; |
417 | } | 417 | } |
418 | 418 | ||
419 | be_namelen = cpu_to_be16(r->res_length); | 419 | be_namelen = cpu_to_be16(r->res_length); |
420 | memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t)); | 420 | memcpy(outbuf + offset, &be_namelen, sizeof(__be16)); |
421 | offset += sizeof(uint16_t); | 421 | offset += sizeof(__be16); |
422 | memcpy(outbuf + offset, r->res_name, r->res_length); | 422 | memcpy(outbuf + offset, r->res_name, r->res_length); |
423 | offset += r->res_length; | 423 | offset += r->res_length; |
424 | } | 424 | } |
@@ -430,9 +430,9 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, | |||
430 | 430 | ||
431 | if ((list == &ls->ls_root_list) && | 431 | if ((list == &ls->ls_root_list) && |
432 | (offset + sizeof(uint16_t) <= outlen)) { | 432 | (offset + sizeof(uint16_t) <= outlen)) { |
433 | be_namelen = 0xFFFF; | 433 | be_namelen = cpu_to_be16(0xFFFF); |
434 | memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t)); | 434 | memcpy(outbuf + offset, &be_namelen, sizeof(__be16)); |
435 | offset += sizeof(uint16_t); | 435 | offset += sizeof(__be16); |
436 | } | 436 | } |
437 | 437 | ||
438 | out: | 438 | out: |
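The dir.c hunks make the on-wire name length a __be16 and route every store, including the 0 and 0xFFFF end markers, through cpu_to_be16(), so sparse can flag any raw assignment. A small sketch of appending one length-prefixed name this way (put_name(), outbuf and len are illustrative; only the __be16/cpu_to_be16 usage reflects the patch):

    #include <linux/types.h>
    #include <linux/string.h>

    static int put_name(char *outbuf, int offset, const char *name, u16 len)
    {
            __be16 be_namelen = cpu_to_be16(len);   /* endian-annotated on-wire value */

            memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
            offset += sizeof(__be16);
            memcpy(outbuf + offset, name, len);
            return offset + len;
    }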
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 868e4c9ef127..ef2f1e353966 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h | |||
@@ -245,7 +245,8 @@ struct dlm_lkb { | |||
245 | struct list_head lkb_astqueue; /* need ast to be sent */ | 245 | struct list_head lkb_astqueue; /* need ast to be sent */ |
246 | struct list_head lkb_ownqueue; /* list of locks for a process */ | 246 | struct list_head lkb_ownqueue; /* list of locks for a process */ |
247 | struct list_head lkb_time_list; | 247 | struct list_head lkb_time_list; |
248 | unsigned long lkb_timestamp; | 248 | ktime_t lkb_time_bast; /* for debugging */ |
249 | ktime_t lkb_timestamp; | ||
249 | unsigned long lkb_timeout_cs; | 250 | unsigned long lkb_timeout_cs; |
250 | 251 | ||
251 | char *lkb_lvbptr; | 252 | char *lkb_lvbptr; |
@@ -481,6 +482,7 @@ struct dlm_ls { | |||
481 | struct dentry *ls_debug_rsb_dentry; /* debugfs */ | 482 | struct dentry *ls_debug_rsb_dentry; /* debugfs */ |
482 | struct dentry *ls_debug_waiters_dentry; /* debugfs */ | 483 | struct dentry *ls_debug_waiters_dentry; /* debugfs */ |
483 | struct dentry *ls_debug_locks_dentry; /* debugfs */ | 484 | struct dentry *ls_debug_locks_dentry; /* debugfs */ |
485 | struct dentry *ls_debug_all_dentry; /* debugfs */ | ||
484 | 486 | ||
485 | wait_queue_head_t ls_uevent_wait; /* user part of join/leave */ | 487 | wait_queue_head_t ls_uevent_wait; /* user part of join/leave */ |
486 | int ls_uevent_result; | 488 | int ls_uevent_result; |
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 724ddac91538..6cfe65bbf4a2 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c | |||
@@ -307,7 +307,7 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) | |||
307 | lkb->lkb_lksb->sb_status = rv; | 307 | lkb->lkb_lksb->sb_status = rv; |
308 | lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags; | 308 | lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags; |
309 | 309 | ||
310 | dlm_add_ast(lkb, AST_COMP); | 310 | dlm_add_ast(lkb, AST_COMP, 0); |
311 | } | 311 | } |
312 | 312 | ||
313 | static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb) | 313 | static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb) |
@@ -318,12 +318,12 @@ static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb) | |||
318 | 318 | ||
319 | static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode) | 319 | static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode) |
320 | { | 320 | { |
321 | lkb->lkb_time_bast = ktime_get(); | ||
322 | |||
321 | if (is_master_copy(lkb)) | 323 | if (is_master_copy(lkb)) |
322 | send_bast(r, lkb, rqmode); | 324 | send_bast(r, lkb, rqmode); |
323 | else { | 325 | else |
324 | lkb->lkb_bastmode = rqmode; | 326 | dlm_add_ast(lkb, AST_BAST, rqmode); |
325 | dlm_add_ast(lkb, AST_BAST); | ||
326 | } | ||
327 | } | 327 | } |
328 | 328 | ||
329 | /* | 329 | /* |
@@ -744,6 +744,8 @@ static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status) | |||
744 | 744 | ||
745 | DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb);); | 745 | DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb);); |
746 | 746 | ||
747 | lkb->lkb_timestamp = ktime_get(); | ||
748 | |||
747 | lkb->lkb_status = status; | 749 | lkb->lkb_status = status; |
748 | 750 | ||
749 | switch (status) { | 751 | switch (status) { |
@@ -1013,10 +1015,8 @@ static void add_timeout(struct dlm_lkb *lkb) | |||
1013 | { | 1015 | { |
1014 | struct dlm_ls *ls = lkb->lkb_resource->res_ls; | 1016 | struct dlm_ls *ls = lkb->lkb_resource->res_ls; |
1015 | 1017 | ||
1016 | if (is_master_copy(lkb)) { | 1018 | if (is_master_copy(lkb)) |
1017 | lkb->lkb_timestamp = jiffies; | ||
1018 | return; | 1019 | return; |
1019 | } | ||
1020 | 1020 | ||
1021 | if (test_bit(LSFL_TIMEWARN, &ls->ls_flags) && | 1021 | if (test_bit(LSFL_TIMEWARN, &ls->ls_flags) && |
1022 | !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) { | 1022 | !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) { |
@@ -1031,7 +1031,6 @@ static void add_timeout(struct dlm_lkb *lkb) | |||
1031 | DLM_ASSERT(list_empty(&lkb->lkb_time_list), dlm_print_lkb(lkb);); | 1031 | DLM_ASSERT(list_empty(&lkb->lkb_time_list), dlm_print_lkb(lkb);); |
1032 | mutex_lock(&ls->ls_timeout_mutex); | 1032 | mutex_lock(&ls->ls_timeout_mutex); |
1033 | hold_lkb(lkb); | 1033 | hold_lkb(lkb); |
1034 | lkb->lkb_timestamp = jiffies; | ||
1035 | list_add_tail(&lkb->lkb_time_list, &ls->ls_timeout); | 1034 | list_add_tail(&lkb->lkb_time_list, &ls->ls_timeout); |
1036 | mutex_unlock(&ls->ls_timeout_mutex); | 1035 | mutex_unlock(&ls->ls_timeout_mutex); |
1037 | } | 1036 | } |
@@ -1059,6 +1058,7 @@ void dlm_scan_timeout(struct dlm_ls *ls) | |||
1059 | struct dlm_rsb *r; | 1058 | struct dlm_rsb *r; |
1060 | struct dlm_lkb *lkb; | 1059 | struct dlm_lkb *lkb; |
1061 | int do_cancel, do_warn; | 1060 | int do_cancel, do_warn; |
1061 | s64 wait_us; | ||
1062 | 1062 | ||
1063 | for (;;) { | 1063 | for (;;) { |
1064 | if (dlm_locking_stopped(ls)) | 1064 | if (dlm_locking_stopped(ls)) |
@@ -1069,14 +1069,15 @@ void dlm_scan_timeout(struct dlm_ls *ls) | |||
1069 | mutex_lock(&ls->ls_timeout_mutex); | 1069 | mutex_lock(&ls->ls_timeout_mutex); |
1070 | list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) { | 1070 | list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) { |
1071 | 1071 | ||
1072 | wait_us = ktime_to_us(ktime_sub(ktime_get(), | ||
1073 | lkb->lkb_timestamp)); | ||
1074 | |||
1072 | if ((lkb->lkb_exflags & DLM_LKF_TIMEOUT) && | 1075 | if ((lkb->lkb_exflags & DLM_LKF_TIMEOUT) && |
1073 | time_after_eq(jiffies, lkb->lkb_timestamp + | 1076 | wait_us >= (lkb->lkb_timeout_cs * 10000)) |
1074 | lkb->lkb_timeout_cs * HZ/100)) | ||
1075 | do_cancel = 1; | 1077 | do_cancel = 1; |
1076 | 1078 | ||
1077 | if ((lkb->lkb_flags & DLM_IFL_WATCH_TIMEWARN) && | 1079 | if ((lkb->lkb_flags & DLM_IFL_WATCH_TIMEWARN) && |
1078 | time_after_eq(jiffies, lkb->lkb_timestamp + | 1080 | wait_us >= dlm_config.ci_timewarn_cs * 10000) |
1079 | dlm_config.ci_timewarn_cs * HZ/100)) | ||
1080 | do_warn = 1; | 1081 | do_warn = 1; |
1081 | 1082 | ||
1082 | if (!do_cancel && !do_warn) | 1083 | if (!do_cancel && !do_warn) |
@@ -1122,12 +1123,12 @@ void dlm_scan_timeout(struct dlm_ls *ls) | |||
1122 | void dlm_adjust_timeouts(struct dlm_ls *ls) | 1123 | void dlm_adjust_timeouts(struct dlm_ls *ls) |
1123 | { | 1124 | { |
1124 | struct dlm_lkb *lkb; | 1125 | struct dlm_lkb *lkb; |
1125 | long adj = jiffies - ls->ls_recover_begin; | 1126 | u64 adj_us = jiffies_to_usecs(jiffies - ls->ls_recover_begin); |
1126 | 1127 | ||
1127 | ls->ls_recover_begin = 0; | 1128 | ls->ls_recover_begin = 0; |
1128 | mutex_lock(&ls->ls_timeout_mutex); | 1129 | mutex_lock(&ls->ls_timeout_mutex); |
1129 | list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) | 1130 | list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) |
1130 | lkb->lkb_timestamp += adj; | 1131 | lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us); |
1131 | mutex_unlock(&ls->ls_timeout_mutex); | 1132 | mutex_unlock(&ls->ls_timeout_mutex); |
1132 | } | 1133 | } |
1133 | 1134 | ||
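Lock timestamps move from jiffies to ktime_t: add_lkb() stamps the lkb with ktime_get(), and dlm_scan_timeout() compares the elapsed microseconds against timeouts expressed in centiseconds, hence the * 10000 factors. A minimal sketch of that comparison (waited_too_long() is illustrative; the ktime helpers are standard kernel API):

    #include <linux/ktime.h>

    /* True if more than timeout_cs centiseconds have passed since timestamp. */
    static int waited_too_long(ktime_t timestamp, unsigned long timeout_cs)
    {
            s64 wait_us = ktime_to_us(ktime_sub(ktime_get(), timestamp));

            return wait_us >= (s64)timeout_cs * 10000;      /* 1 cs == 10000 us */
    }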
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 3962262f991a..103a5ebd1371 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c | |||
@@ -295,6 +295,7 @@ static int add_sock(struct socket *sock, struct connection *con) | |||
295 | con->sock->sk->sk_write_space = lowcomms_write_space; | 295 | con->sock->sk->sk_write_space = lowcomms_write_space; |
296 | con->sock->sk->sk_state_change = lowcomms_state_change; | 296 | con->sock->sk->sk_state_change = lowcomms_state_change; |
297 | con->sock->sk->sk_user_data = con; | 297 | con->sock->sk->sk_user_data = con; |
298 | con->sock->sk->sk_allocation = GFP_NOFS; | ||
298 | return 0; | 299 | return 0; |
299 | } | 300 | } |
300 | 301 | ||
@@ -823,7 +824,6 @@ static void sctp_init_assoc(struct connection *con) | |||
823 | len = e->len; | 824 | len = e->len; |
824 | offset = e->offset; | 825 | offset = e->offset; |
825 | spin_unlock(&con->writequeue_lock); | 826 | spin_unlock(&con->writequeue_lock); |
826 | kmap(e->page); | ||
827 | 827 | ||
828 | /* Send the first block off the write queue */ | 828 | /* Send the first block off the write queue */ |
829 | iov[0].iov_base = page_address(e->page)+offset; | 829 | iov[0].iov_base = page_address(e->page)+offset; |
@@ -854,7 +854,6 @@ static void sctp_init_assoc(struct connection *con) | |||
854 | 854 | ||
855 | if (e->len == 0 && e->users == 0) { | 855 | if (e->len == 0 && e->users == 0) { |
856 | list_del(&e->list); | 856 | list_del(&e->list); |
857 | kunmap(e->page); | ||
858 | free_entry(e); | 857 | free_entry(e); |
859 | } | 858 | } |
860 | spin_unlock(&con->writequeue_lock); | 859 | spin_unlock(&con->writequeue_lock); |
@@ -1203,8 +1202,6 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) | |||
1203 | 1202 | ||
1204 | if (e) { | 1203 | if (e) { |
1205 | got_one: | 1204 | got_one: |
1206 | if (users == 0) | ||
1207 | kmap(e->page); | ||
1208 | *ppc = page_address(e->page) + offset; | 1205 | *ppc = page_address(e->page) + offset; |
1209 | return e; | 1206 | return e; |
1210 | } | 1207 | } |
@@ -1233,7 +1230,6 @@ void dlm_lowcomms_commit_buffer(void *mh) | |||
1233 | if (users) | 1230 | if (users) |
1234 | goto out; | 1231 | goto out; |
1235 | e->len = e->end - e->offset; | 1232 | e->len = e->end - e->offset; |
1236 | kunmap(e->page); | ||
1237 | spin_unlock(&con->writequeue_lock); | 1233 | spin_unlock(&con->writequeue_lock); |
1238 | 1234 | ||
1239 | if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) { | 1235 | if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) { |
@@ -1272,7 +1268,6 @@ static void send_to_sock(struct connection *con) | |||
1272 | offset = e->offset; | 1268 | offset = e->offset; |
1273 | BUG_ON(len == 0 && e->users == 0); | 1269 | BUG_ON(len == 0 && e->users == 0); |
1274 | spin_unlock(&con->writequeue_lock); | 1270 | spin_unlock(&con->writequeue_lock); |
1275 | kmap(e->page); | ||
1276 | 1271 | ||
1277 | ret = 0; | 1272 | ret = 0; |
1278 | if (len) { | 1273 | if (len) { |
@@ -1294,7 +1289,6 @@ static void send_to_sock(struct connection *con) | |||
1294 | 1289 | ||
1295 | if (e->len == 0 && e->users == 0) { | 1290 | if (e->len == 0 && e->users == 0) { |
1296 | list_del(&e->list); | 1291 | list_del(&e->list); |
1297 | kunmap(e->page); | ||
1298 | free_entry(e); | 1292 | free_entry(e); |
1299 | continue; | 1293 | continue; |
1300 | } | 1294 | } |
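The kmap()/kunmap() pairs around the write-queue pages go away because those pages are allocated from lowmem (plain GFP allocations without __GFP_HIGHMEM), so page_address() is always valid; the new sk_allocation = GFP_NOFS additionally keeps socket allocations from recursing back into the filesystem. A sketch of the assumption the kmap removal relies on (alloc_writequeue_page() is illustrative):

    #include <linux/gfp.h>
    #include <linux/mm.h>

    static void *alloc_writequeue_page(void)
    {
            struct page *page = alloc_page(GFP_NOFS);       /* lowmem, never highmem */

            return page ? page_address(page) : NULL;        /* no kmap() needed */
    }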
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c index 54c14c6d06cb..c1775b84ebab 100644 --- a/fs/dlm/memory.c +++ b/fs/dlm/memory.c | |||
@@ -39,7 +39,7 @@ char *dlm_allocate_lvb(struct dlm_ls *ls) | |||
39 | { | 39 | { |
40 | char *p; | 40 | char *p; |
41 | 41 | ||
42 | p = kzalloc(ls->ls_lvblen, GFP_KERNEL); | 42 | p = kzalloc(ls->ls_lvblen, ls->ls_allocation); |
43 | return p; | 43 | return p; |
44 | } | 44 | } |
45 | 45 | ||
@@ -57,7 +57,7 @@ struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen) | |||
57 | 57 | ||
58 | DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,); | 58 | DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,); |
59 | 59 | ||
60 | r = kzalloc(sizeof(*r) + namelen, GFP_KERNEL); | 60 | r = kzalloc(sizeof(*r) + namelen, ls->ls_allocation); |
61 | return r; | 61 | return r; |
62 | } | 62 | } |
63 | 63 | ||
@@ -72,7 +72,7 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls) | |||
72 | { | 72 | { |
73 | struct dlm_lkb *lkb; | 73 | struct dlm_lkb *lkb; |
74 | 74 | ||
75 | lkb = kmem_cache_zalloc(lkb_cache, GFP_KERNEL); | 75 | lkb = kmem_cache_zalloc(lkb_cache, ls->ls_allocation); |
76 | return lkb; | 76 | return lkb; |
77 | } | 77 | } |
78 | 78 | ||
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 07ac709f3ed7..f3396c622aec 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c | |||
@@ -112,7 +112,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base, | |||
112 | ordinary messages). */ | 112 | ordinary messages). */ |
113 | 113 | ||
114 | if (msglen > sizeof(__tmp) && p == &__tmp.p) { | 114 | if (msglen > sizeof(__tmp) && p == &__tmp.p) { |
115 | p = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL); | 115 | p = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS); |
116 | if (p == NULL) | 116 | if (p == NULL) |
117 | return ret; | 117 | return ret; |
118 | } | 118 | } |
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index aa2a5775a027..ccc9d62c462d 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c | |||
@@ -115,7 +115,6 @@ static void fill_data(struct dlm_lock_data *data, struct dlm_lkb *lkb) | |||
115 | data->status = lkb->lkb_status; | 115 | data->status = lkb->lkb_status; |
116 | data->grmode = lkb->lkb_grmode; | 116 | data->grmode = lkb->lkb_grmode; |
117 | data->rqmode = lkb->lkb_rqmode; | 117 | data->rqmode = lkb->lkb_rqmode; |
118 | data->timestamp = lkb->lkb_timestamp; | ||
119 | if (lkb->lkb_ua) | 118 | if (lkb->lkb_ua) |
120 | data->xid = lkb->lkb_ua->xid; | 119 | data->xid = lkb->lkb_ua->xid; |
121 | if (r) { | 120 | if (r) { |
diff --git a/fs/dlm/user.c b/fs/dlm/user.c index b3832c67194a..065149e84f42 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c | |||
@@ -175,7 +175,7 @@ static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type) | |||
175 | /* we could possibly check if the cancel of an orphan has resulted in the lkb | 175 | /* we could possibly check if the cancel of an orphan has resulted in the lkb |
176 | being removed and then remove that lkb from the orphans list and free it */ | 176 | being removed and then remove that lkb from the orphans list and free it */ |
177 | 177 | ||
178 | void dlm_user_add_ast(struct dlm_lkb *lkb, int type) | 178 | void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode) |
179 | { | 179 | { |
180 | struct dlm_ls *ls; | 180 | struct dlm_ls *ls; |
181 | struct dlm_user_args *ua; | 181 | struct dlm_user_args *ua; |
@@ -208,6 +208,8 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type) | |||
208 | 208 | ||
209 | ast_type = lkb->lkb_ast_type; | 209 | ast_type = lkb->lkb_ast_type; |
210 | lkb->lkb_ast_type |= type; | 210 | lkb->lkb_ast_type |= type; |
211 | if (bastmode) | ||
212 | lkb->lkb_bastmode = bastmode; | ||
211 | 213 | ||
212 | if (!ast_type) { | 214 | if (!ast_type) { |
213 | kref_get(&lkb->lkb_ref); | 215 | kref_get(&lkb->lkb_ref); |
diff --git a/fs/dlm/user.h b/fs/dlm/user.h index 35eb6a13d616..1c9686492286 100644 --- a/fs/dlm/user.h +++ b/fs/dlm/user.h | |||
@@ -9,7 +9,7 @@ | |||
9 | #ifndef __USER_DOT_H__ | 9 | #ifndef __USER_DOT_H__ |
10 | #define __USER_DOT_H__ | 10 | #define __USER_DOT_H__ |
11 | 11 | ||
12 | void dlm_user_add_ast(struct dlm_lkb *lkb, int type); | 12 | void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode); |
13 | int dlm_user_init(void); | 13 | int dlm_user_init(void); |
14 | void dlm_user_exit(void); | 14 | void dlm_user_exit(void); |
15 | int dlm_device_deregister(struct dlm_ls *ls); | 15 | int dlm_device_deregister(struct dlm_ls *ls); |
diff --git a/fs/dquot.c b/fs/dquot.c index c237ccc8581c..48c0571f831d 100644 --- a/fs/dquot.c +++ b/fs/dquot.c | |||
@@ -211,8 +211,6 @@ static struct hlist_head *dquot_hash; | |||
211 | 211 | ||
212 | struct dqstats dqstats; | 212 | struct dqstats dqstats; |
213 | 213 | ||
214 | static void dqput(struct dquot *dquot); | ||
215 | |||
216 | static inline unsigned int | 214 | static inline unsigned int |
217 | hashfn(const struct super_block *sb, unsigned int id, int type) | 215 | hashfn(const struct super_block *sb, unsigned int id, int type) |
218 | { | 216 | { |
@@ -415,6 +413,17 @@ out_dqlock: | |||
415 | return ret; | 413 | return ret; |
416 | } | 414 | } |
417 | 415 | ||
416 | void dquot_destroy(struct dquot *dquot) | ||
417 | { | ||
418 | kmem_cache_free(dquot_cachep, dquot); | ||
419 | } | ||
420 | EXPORT_SYMBOL(dquot_destroy); | ||
421 | |||
422 | static inline void do_destroy_dquot(struct dquot *dquot) | ||
423 | { | ||
424 | dquot->dq_sb->dq_op->destroy_dquot(dquot); | ||
425 | } | ||
426 | |||
418 | /* Invalidate all dquots on the list. Note that this function is called after | 427 | /* Invalidate all dquots on the list. Note that this function is called after |
419 | * quota is disabled and pointers from inodes removed so there cannot be new | 428 | * quota is disabled and pointers from inodes removed so there cannot be new |
420 | * quota users. There can still be some users of quotas due to inodes being | 429 | * quota users. There can still be some users of quotas due to inodes being |
@@ -463,9 +472,44 @@ restart: | |||
463 | remove_dquot_hash(dquot); | 472 | remove_dquot_hash(dquot); |
464 | remove_free_dquot(dquot); | 473 | remove_free_dquot(dquot); |
465 | remove_inuse(dquot); | 474 | remove_inuse(dquot); |
466 | kmem_cache_free(dquot_cachep, dquot); | 475 | do_destroy_dquot(dquot); |
476 | } | ||
477 | spin_unlock(&dq_list_lock); | ||
478 | } | ||
479 | |||
480 | /* Call callback for every active dquot on given filesystem */ | ||
481 | int dquot_scan_active(struct super_block *sb, | ||
482 | int (*fn)(struct dquot *dquot, unsigned long priv), | ||
483 | unsigned long priv) | ||
484 | { | ||
485 | struct dquot *dquot, *old_dquot = NULL; | ||
486 | int ret = 0; | ||
487 | |||
488 | mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); | ||
489 | spin_lock(&dq_list_lock); | ||
490 | list_for_each_entry(dquot, &inuse_list, dq_inuse) { | ||
491 | if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) | ||
492 | continue; | ||
493 | if (dquot->dq_sb != sb) | ||
494 | continue; | ||
495 | /* Now we have an active dquot, so we can just increase its use count */ | ||
496 | atomic_inc(&dquot->dq_count); | ||
497 | dqstats.lookups++; | ||
498 | spin_unlock(&dq_list_lock); | ||
499 | dqput(old_dquot); | ||
500 | old_dquot = dquot; | ||
501 | ret = fn(dquot, priv); | ||
502 | if (ret < 0) | ||
503 | goto out; | ||
504 | spin_lock(&dq_list_lock); | ||
505 | /* We are safe to continue now because our dquot could not | ||
506 | * be moved out of the inuse list while we hold the reference */ | ||
467 | } | 507 | } |
468 | spin_unlock(&dq_list_lock); | 508 | spin_unlock(&dq_list_lock); |
509 | out: | ||
510 | dqput(old_dquot); | ||
511 | mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); | ||
512 | return ret; | ||
469 | } | 513 | } |
470 | 514 | ||
471 | int vfs_quota_sync(struct super_block *sb, int type) | 515 | int vfs_quota_sync(struct super_block *sb, int type) |
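dquot_scan_active() is new: it walks every DQ_ACTIVE dquot of a superblock, pins each one with a reference before dropping dq_list_lock, and stops early if the callback returns a negative value. A hedged usage sketch (count_one() and count_active_dquots() are hypothetical callers; only the dquot_scan_active() signature comes from the hunk above):

    static int count_one(struct dquot *dquot, unsigned long priv)
    {
            unsigned long *count = (unsigned long *)priv;

            (*count)++;
            return 0;               /* a negative return would abort the scan */
    }

    static unsigned long count_active_dquots(struct super_block *sb)
    {
            unsigned long count = 0;

            dquot_scan_active(sb, count_one, (unsigned long)&count);
            return count;
    }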
@@ -479,7 +523,7 @@ int vfs_quota_sync(struct super_block *sb, int type) | |||
479 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) { | 523 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) { |
480 | if (type != -1 && cnt != type) | 524 | if (type != -1 && cnt != type) |
481 | continue; | 525 | continue; |
482 | if (!sb_has_quota_enabled(sb, cnt)) | 526 | if (!sb_has_quota_active(sb, cnt)) |
483 | continue; | 527 | continue; |
484 | spin_lock(&dq_list_lock); | 528 | spin_lock(&dq_list_lock); |
485 | dirty = &dqopt->info[cnt].dqi_dirty_list; | 529 | dirty = &dqopt->info[cnt].dqi_dirty_list; |
@@ -504,8 +548,8 @@ int vfs_quota_sync(struct super_block *sb, int type) | |||
504 | } | 548 | } |
505 | 549 | ||
506 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) | 550 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) |
507 | if ((cnt == type || type == -1) && sb_has_quota_enabled(sb, cnt) | 551 | if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt) |
508 | && info_dirty(&dqopt->info[cnt])) | 552 | && info_dirty(&dqopt->info[cnt])) |
509 | sb->dq_op->write_info(sb, cnt); | 553 | sb->dq_op->write_info(sb, cnt); |
510 | spin_lock(&dq_list_lock); | 554 | spin_lock(&dq_list_lock); |
511 | dqstats.syncs++; | 555 | dqstats.syncs++; |
@@ -527,7 +571,7 @@ static void prune_dqcache(int count) | |||
527 | remove_dquot_hash(dquot); | 571 | remove_dquot_hash(dquot); |
528 | remove_free_dquot(dquot); | 572 | remove_free_dquot(dquot); |
529 | remove_inuse(dquot); | 573 | remove_inuse(dquot); |
530 | kmem_cache_free(dquot_cachep, dquot); | 574 | do_destroy_dquot(dquot); |
531 | count--; | 575 | count--; |
532 | head = free_dquots.prev; | 576 | head = free_dquots.prev; |
533 | } | 577 | } |
@@ -558,7 +602,7 @@ static struct shrinker dqcache_shrinker = { | |||
558 | * NOTE: If you change this function please check whether dqput_blocks() works right... | 602 | * NOTE: If you change this function please check whether dqput_blocks() works right... |
559 | * MUST be called with either dqptr_sem or dqonoff_mutex held | 603 | * MUST be called with either dqptr_sem or dqonoff_mutex held |
560 | */ | 604 | */ |
561 | static void dqput(struct dquot *dquot) | 605 | void dqput(struct dquot *dquot) |
562 | { | 606 | { |
563 | int ret; | 607 | int ret; |
564 | 608 | ||
@@ -584,7 +628,7 @@ we_slept: | |||
584 | /* We have more than one user... nothing to do */ | 628 | /* We have more than one user... nothing to do */ |
585 | atomic_dec(&dquot->dq_count); | 629 | atomic_dec(&dquot->dq_count); |
586 | /* Releasing dquot during quotaoff phase? */ | 630 | /* Releasing dquot during quotaoff phase? */ |
587 | if (!sb_has_quota_enabled(dquot->dq_sb, dquot->dq_type) && | 631 | if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_type) && |
588 | atomic_read(&dquot->dq_count) == 1) | 632 | atomic_read(&dquot->dq_count) == 1) |
589 | wake_up(&dquot->dq_wait_unused); | 633 | wake_up(&dquot->dq_wait_unused); |
590 | spin_unlock(&dq_list_lock); | 634 | spin_unlock(&dq_list_lock); |
@@ -625,11 +669,17 @@ we_slept: | |||
625 | spin_unlock(&dq_list_lock); | 669 | spin_unlock(&dq_list_lock); |
626 | } | 670 | } |
627 | 671 | ||
672 | struct dquot *dquot_alloc(struct super_block *sb, int type) | ||
673 | { | ||
674 | return kmem_cache_zalloc(dquot_cachep, GFP_NOFS); | ||
675 | } | ||
676 | EXPORT_SYMBOL(dquot_alloc); | ||
677 | |||
628 | static struct dquot *get_empty_dquot(struct super_block *sb, int type) | 678 | static struct dquot *get_empty_dquot(struct super_block *sb, int type) |
629 | { | 679 | { |
630 | struct dquot *dquot; | 680 | struct dquot *dquot; |
631 | 681 | ||
632 | dquot = kmem_cache_zalloc(dquot_cachep, GFP_NOFS); | 682 | dquot = sb->dq_op->alloc_dquot(sb, type); |
633 | if(!dquot) | 683 | if(!dquot) |
634 | return NODQUOT; | 684 | return NODQUOT; |
635 | 685 | ||
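get_empty_dquot() now allocates through sb->dq_op->alloc_dquot(), with dquot_alloc()/dquot_destroy() exported as the generic defaults, so a filesystem can embed struct dquot in a larger object of its own. A sketch of what such a filesystem might wire up (struct myfs_dquot and the myfs_* names are hypothetical):

    struct myfs_dquot {
            struct dquot dq_dquot;          /* generic part must be embedded */
            /* filesystem-private bookkeeping would follow */
    };

    static struct dquot *myfs_alloc_dquot(struct super_block *sb, int type)
    {
            struct myfs_dquot *dq = kzalloc(sizeof(*dq), GFP_NOFS);

            return dq ? &dq->dq_dquot : NULL;
    }

    static void myfs_destroy_dquot(struct dquot *dquot)
    {
            kfree(container_of(dquot, struct myfs_dquot, dq_dquot));
    }

    static struct dquot_operations myfs_quota_ops = {
            /* ...the generic callbacks would go here... */
            .alloc_dquot    = myfs_alloc_dquot,
            .destroy_dquot  = myfs_destroy_dquot,
    };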
@@ -647,15 +697,33 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type) | |||
647 | } | 697 | } |
648 | 698 | ||
649 | /* | 699 | /* |
700 | * Check whether dquot is in memory. | ||
701 | * MUST be called with either dqptr_sem or dqonoff_mutex held | ||
702 | */ | ||
703 | int dquot_is_cached(struct super_block *sb, unsigned int id, int type) | ||
704 | { | ||
705 | unsigned int hashent = hashfn(sb, id, type); | ||
706 | int ret = 0; | ||
707 | |||
708 | if (!sb_has_quota_active(sb, type)) | ||
709 | return 0; | ||
710 | spin_lock(&dq_list_lock); | ||
711 | if (find_dquot(hashent, sb, id, type) != NODQUOT) | ||
712 | ret = 1; | ||
713 | spin_unlock(&dq_list_lock); | ||
714 | return ret; | ||
715 | } | ||
716 | |||
717 | /* | ||
650 | * Get reference to dquot | 718 | * Get reference to dquot |
651 | * MUST be called with either dqptr_sem or dqonoff_mutex held | 719 | * MUST be called with either dqptr_sem or dqonoff_mutex held |
652 | */ | 720 | */ |
653 | static struct dquot *dqget(struct super_block *sb, unsigned int id, int type) | 721 | struct dquot *dqget(struct super_block *sb, unsigned int id, int type) |
654 | { | 722 | { |
655 | unsigned int hashent = hashfn(sb, id, type); | 723 | unsigned int hashent = hashfn(sb, id, type); |
656 | struct dquot *dquot, *empty = NODQUOT; | 724 | struct dquot *dquot, *empty = NODQUOT; |
657 | 725 | ||
658 | if (!sb_has_quota_enabled(sb, type)) | 726 | if (!sb_has_quota_active(sb, type)) |
659 | return NODQUOT; | 727 | return NODQUOT; |
660 | we_slept: | 728 | we_slept: |
661 | spin_lock(&dq_list_lock); | 729 | spin_lock(&dq_list_lock); |
@@ -682,7 +750,7 @@ we_slept: | |||
682 | dqstats.lookups++; | 750 | dqstats.lookups++; |
683 | spin_unlock(&dq_list_lock); | 751 | spin_unlock(&dq_list_lock); |
684 | if (empty) | 752 | if (empty) |
685 | kmem_cache_free(dquot_cachep, empty); | 753 | do_destroy_dquot(empty); |
686 | } | 754 | } |
687 | /* Wait for dq_lock - after this we know that either dquot_release() is already | 755 | /* Wait for dq_lock - after this we know that either dquot_release() is already |
688 | * finished or it will be canceled due to dq_count > 1 test */ | 756 | * finished or it will be canceled due to dq_count > 1 test */ |
@@ -820,7 +888,7 @@ static void drop_dquot_ref(struct super_block *sb, int type) | |||
820 | } | 888 | } |
821 | } | 889 | } |
822 | 890 | ||
823 | static inline void dquot_incr_inodes(struct dquot *dquot, unsigned long number) | 891 | static inline void dquot_incr_inodes(struct dquot *dquot, qsize_t number) |
824 | { | 892 | { |
825 | dquot->dq_dqb.dqb_curinodes += number; | 893 | dquot->dq_dqb.dqb_curinodes += number; |
826 | } | 894 | } |
@@ -830,9 +898,10 @@ static inline void dquot_incr_space(struct dquot *dquot, qsize_t number) | |||
830 | dquot->dq_dqb.dqb_curspace += number; | 898 | dquot->dq_dqb.dqb_curspace += number; |
831 | } | 899 | } |
832 | 900 | ||
833 | static inline void dquot_decr_inodes(struct dquot *dquot, unsigned long number) | 901 | static inline void dquot_decr_inodes(struct dquot *dquot, qsize_t number) |
834 | { | 902 | { |
835 | if (dquot->dq_dqb.dqb_curinodes > number) | 903 | if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE || |
904 | dquot->dq_dqb.dqb_curinodes >= number) | ||
836 | dquot->dq_dqb.dqb_curinodes -= number; | 905 | dquot->dq_dqb.dqb_curinodes -= number; |
837 | else | 906 | else |
838 | dquot->dq_dqb.dqb_curinodes = 0; | 907 | dquot->dq_dqb.dqb_curinodes = 0; |
@@ -843,11 +912,12 @@ static inline void dquot_decr_inodes(struct dquot *dquot, unsigned long number) | |||
843 | 912 | ||
844 | static inline void dquot_decr_space(struct dquot *dquot, qsize_t number) | 913 | static inline void dquot_decr_space(struct dquot *dquot, qsize_t number) |
845 | { | 914 | { |
846 | if (dquot->dq_dqb.dqb_curspace > number) | 915 | if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE || |
916 | dquot->dq_dqb.dqb_curspace >= number) | ||
847 | dquot->dq_dqb.dqb_curspace -= number; | 917 | dquot->dq_dqb.dqb_curspace -= number; |
848 | else | 918 | else |
849 | dquot->dq_dqb.dqb_curspace = 0; | 919 | dquot->dq_dqb.dqb_curspace = 0; |
850 | if (toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit) | 920 | if (dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit) |
851 | dquot->dq_dqb.dqb_btime = (time_t) 0; | 921 | dquot->dq_dqb.dqb_btime = (time_t) 0; |
852 | clear_bit(DQ_BLKS_B, &dquot->dq_flags); | 922 | clear_bit(DQ_BLKS_B, &dquot->dq_flags); |
853 | } | 923 | } |
@@ -1023,10 +1093,11 @@ static inline char ignore_hardlimit(struct dquot *dquot) | |||
1023 | } | 1093 | } |
1024 | 1094 | ||
1025 | /* needs dq_data_lock */ | 1095 | /* needs dq_data_lock */ |
1026 | static int check_idq(struct dquot *dquot, ulong inodes, char *warntype) | 1096 | static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype) |
1027 | { | 1097 | { |
1028 | *warntype = QUOTA_NL_NOWARN; | 1098 | *warntype = QUOTA_NL_NOWARN; |
1029 | if (inodes <= 0 || test_bit(DQ_FAKE_B, &dquot->dq_flags)) | 1099 | if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) || |
1100 | test_bit(DQ_FAKE_B, &dquot->dq_flags)) | ||
1030 | return QUOTA_OK; | 1101 | return QUOTA_OK; |
1031 | 1102 | ||
1032 | if (dquot->dq_dqb.dqb_ihardlimit && | 1103 | if (dquot->dq_dqb.dqb_ihardlimit && |
@@ -1058,11 +1129,12 @@ static int check_idq(struct dquot *dquot, ulong inodes, char *warntype) | |||
1058 | static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype) | 1129 | static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype) |
1059 | { | 1130 | { |
1060 | *warntype = QUOTA_NL_NOWARN; | 1131 | *warntype = QUOTA_NL_NOWARN; |
1061 | if (space <= 0 || test_bit(DQ_FAKE_B, &dquot->dq_flags)) | 1132 | if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) || |
1133 | test_bit(DQ_FAKE_B, &dquot->dq_flags)) | ||
1062 | return QUOTA_OK; | 1134 | return QUOTA_OK; |
1063 | 1135 | ||
1064 | if (dquot->dq_dqb.dqb_bhardlimit && | 1136 | if (dquot->dq_dqb.dqb_bhardlimit && |
1065 | toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bhardlimit && | 1137 | dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bhardlimit && |
1066 | !ignore_hardlimit(dquot)) { | 1138 | !ignore_hardlimit(dquot)) { |
1067 | if (!prealloc) | 1139 | if (!prealloc) |
1068 | *warntype = QUOTA_NL_BHARDWARN; | 1140 | *warntype = QUOTA_NL_BHARDWARN; |
@@ -1070,7 +1142,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war | |||
1070 | } | 1142 | } |
1071 | 1143 | ||
1072 | if (dquot->dq_dqb.dqb_bsoftlimit && | 1144 | if (dquot->dq_dqb.dqb_bsoftlimit && |
1073 | toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bsoftlimit && | 1145 | dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit && |
1074 | dquot->dq_dqb.dqb_btime && get_seconds() >= dquot->dq_dqb.dqb_btime && | 1146 | dquot->dq_dqb.dqb_btime && get_seconds() >= dquot->dq_dqb.dqb_btime && |
1075 | !ignore_hardlimit(dquot)) { | 1147 | !ignore_hardlimit(dquot)) { |
1076 | if (!prealloc) | 1148 | if (!prealloc) |
@@ -1079,7 +1151,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war | |||
1079 | } | 1151 | } |
1080 | 1152 | ||
1081 | if (dquot->dq_dqb.dqb_bsoftlimit && | 1153 | if (dquot->dq_dqb.dqb_bsoftlimit && |
1082 | toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bsoftlimit && | 1154 | dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit && |
1083 | dquot->dq_dqb.dqb_btime == 0) { | 1155 | dquot->dq_dqb.dqb_btime == 0) { |
1084 | if (!prealloc) { | 1156 | if (!prealloc) { |
1085 | *warntype = QUOTA_NL_BSOFTWARN; | 1157 | *warntype = QUOTA_NL_BSOFTWARN; |
@@ -1096,10 +1168,11 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war | |||
1096 | return QUOTA_OK; | 1168 | return QUOTA_OK; |
1097 | } | 1169 | } |
1098 | 1170 | ||
1099 | static int info_idq_free(struct dquot *dquot, ulong inodes) | 1171 | static int info_idq_free(struct dquot *dquot, qsize_t inodes) |
1100 | { | 1172 | { |
1101 | if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || | 1173 | if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || |
1102 | dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit) | 1174 | dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit || |
1175 | !sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type)) | ||
1103 | return QUOTA_NL_NOWARN; | 1176 | return QUOTA_NL_NOWARN; |
1104 | 1177 | ||
1105 | if (dquot->dq_dqb.dqb_curinodes - inodes <= dquot->dq_dqb.dqb_isoftlimit) | 1178 | if (dquot->dq_dqb.dqb_curinodes - inodes <= dquot->dq_dqb.dqb_isoftlimit) |
@@ -1113,15 +1186,13 @@ static int info_idq_free(struct dquot *dquot, ulong inodes) | |||
1113 | static int info_bdq_free(struct dquot *dquot, qsize_t space) | 1186 | static int info_bdq_free(struct dquot *dquot, qsize_t space) |
1114 | { | 1187 | { |
1115 | if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || | 1188 | if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || |
1116 | toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit) | 1189 | dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit) |
1117 | return QUOTA_NL_NOWARN; | 1190 | return QUOTA_NL_NOWARN; |
1118 | 1191 | ||
1119 | if (toqb(dquot->dq_dqb.dqb_curspace - space) <= | 1192 | if (dquot->dq_dqb.dqb_curspace - space <= dquot->dq_dqb.dqb_bsoftlimit) |
1120 | dquot->dq_dqb.dqb_bsoftlimit) | ||
1121 | return QUOTA_NL_BSOFTBELOW; | 1193 | return QUOTA_NL_BSOFTBELOW; |
1122 | if (toqb(dquot->dq_dqb.dqb_curspace) >= dquot->dq_dqb.dqb_bhardlimit && | 1194 | if (dquot->dq_dqb.dqb_curspace >= dquot->dq_dqb.dqb_bhardlimit && |
1123 | toqb(dquot->dq_dqb.dqb_curspace - space) < | 1195 | dquot->dq_dqb.dqb_curspace - space < dquot->dq_dqb.dqb_bhardlimit) |
1124 | dquot->dq_dqb.dqb_bhardlimit) | ||
1125 | return QUOTA_NL_BHARDBELOW; | 1196 | return QUOTA_NL_BHARDBELOW; |
1126 | return QUOTA_NL_NOWARN; | 1197 | return QUOTA_NL_NOWARN; |
1127 | } | 1198 | } |
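The toqb() calls disappear from these checks because dqb_curspace and the block limits are now compared in the same unit inside the core (both kept in bytes); rounding to quota blocks only matters at the quotactl boundary. A sketch of that boundary conversion, assuming the usual 1 KiB quota block (QUOTABLOCK_BITS == 10):

    #define QUOTABLOCK_BITS 10
    #define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)

    /* bytes -> quota blocks, rounded up (what toqb() did in the old checks) */
    static inline unsigned long long toqb_sketch(unsigned long long space)
    {
            return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS;
    }

    /* quota blocks -> bytes */
    static inline unsigned long long qbtos_sketch(unsigned long long blocks)
    {
            return blocks << QUOTABLOCK_BITS;
    }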
@@ -1166,17 +1237,23 @@ out_err: | |||
1166 | * Release all quotas referenced by inode | 1237 | * Release all quotas referenced by inode |
1167 | * Transaction must be started at an entry | 1238 | * Transaction must be started at an entry |
1168 | */ | 1239 | */ |
1169 | int dquot_drop(struct inode *inode) | 1240 | int dquot_drop_locked(struct inode *inode) |
1170 | { | 1241 | { |
1171 | int cnt; | 1242 | int cnt; |
1172 | 1243 | ||
1173 | down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); | ||
1174 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) { | 1244 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) { |
1175 | if (inode->i_dquot[cnt] != NODQUOT) { | 1245 | if (inode->i_dquot[cnt] != NODQUOT) { |
1176 | dqput(inode->i_dquot[cnt]); | 1246 | dqput(inode->i_dquot[cnt]); |
1177 | inode->i_dquot[cnt] = NODQUOT; | 1247 | inode->i_dquot[cnt] = NODQUOT; |
1178 | } | 1248 | } |
1179 | } | 1249 | } |
1250 | return 0; | ||
1251 | } | ||
1252 | |||
1253 | int dquot_drop(struct inode *inode) | ||
1254 | { | ||
1255 | down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); | ||
1256 | dquot_drop_locked(inode); | ||
1180 | up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); | 1257 | up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); |
1181 | return 0; | 1258 | return 0; |
1182 | } | 1259 | } |
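Splitting dquot_drop() into a locked helper lets a filesystem that already holds dqptr_sem, for example to order it against its own cluster or journal locks, drop the per-inode dquot references without taking the semaphore twice. A hypothetical caller, not taken from the patch ("examplefs" is an invented name; headers indicative):

#include <linux/quota.h>
#include <linux/quotaops.h>

static void examplefs_drop_inode_quota(struct inode *inode)
{
        struct quota_info *dqopt = sb_dqopt(inode->i_sb);

        down_write(&dqopt->dqptr_sem);
        /* ... any filesystem-private locking that must nest inside ... */
        dquot_drop_locked(inode);
        up_write(&dqopt->dqptr_sem);
}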
@@ -1264,7 +1341,7 @@ warn_put_all: | |||
1264 | /* | 1341 | /* |
1265 | * This operation can block, but only after everything is updated | 1342 | * This operation can block, but only after everything is updated |
1266 | */ | 1343 | */ |
1267 | int dquot_alloc_inode(const struct inode *inode, unsigned long number) | 1344 | int dquot_alloc_inode(const struct inode *inode, qsize_t number) |
1268 | { | 1345 | { |
1269 | int cnt, ret = NO_QUOTA; | 1346 | int cnt, ret = NO_QUOTA; |
1270 | char warntype[MAXQUOTAS]; | 1347 | char warntype[MAXQUOTAS]; |
@@ -1349,7 +1426,7 @@ out_sub: | |||
1349 | /* | 1426 | /* |
1350 | * This operation can block, but only after everything is updated | 1427 | * This operation can block, but only after everything is updated |
1351 | */ | 1428 | */ |
1352 | int dquot_free_inode(const struct inode *inode, unsigned long number) | 1429 | int dquot_free_inode(const struct inode *inode, qsize_t number) |
1353 | { | 1430 | { |
1354 | unsigned int cnt; | 1431 | unsigned int cnt; |
1355 | char warntype[MAXQUOTAS]; | 1432 | char warntype[MAXQUOTAS]; |
@@ -1495,7 +1572,7 @@ warn_put_all: | |||
1495 | /* Wrapper for transferring ownership of an inode */ | 1572 | /* Wrapper for transferring ownership of an inode */ |
1496 | int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) | 1573 | int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) |
1497 | { | 1574 | { |
1498 | if (sb_any_quota_enabled(inode->i_sb) && !IS_NOQUOTA(inode)) { | 1575 | if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) { |
1499 | vfs_dq_init(inode); | 1576 | vfs_dq_init(inode); |
1500 | if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA) | 1577 | if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA) |
1501 | return 1; | 1578 | return 1; |
@@ -1533,54 +1610,27 @@ struct dquot_operations dquot_operations = { | |||
1533 | .acquire_dquot = dquot_acquire, | 1610 | .acquire_dquot = dquot_acquire, |
1534 | .release_dquot = dquot_release, | 1611 | .release_dquot = dquot_release, |
1535 | .mark_dirty = dquot_mark_dquot_dirty, | 1612 | .mark_dirty = dquot_mark_dquot_dirty, |
1536 | .write_info = dquot_commit_info | 1613 | .write_info = dquot_commit_info, |
1614 | .alloc_dquot = dquot_alloc, | ||
1615 | .destroy_dquot = dquot_destroy, | ||
1537 | }; | 1616 | }; |
1538 | 1617 | ||
1539 | static inline void set_enable_flags(struct quota_info *dqopt, int type) | ||
1540 | { | ||
1541 | switch (type) { | ||
1542 | case USRQUOTA: | ||
1543 | dqopt->flags |= DQUOT_USR_ENABLED; | ||
1544 | dqopt->flags &= ~DQUOT_USR_SUSPENDED; | ||
1545 | break; | ||
1546 | case GRPQUOTA: | ||
1547 | dqopt->flags |= DQUOT_GRP_ENABLED; | ||
1548 | dqopt->flags &= ~DQUOT_GRP_SUSPENDED; | ||
1549 | break; | ||
1550 | } | ||
1551 | } | ||
1552 | |||
1553 | static inline void reset_enable_flags(struct quota_info *dqopt, int type, | ||
1554 | int remount) | ||
1555 | { | ||
1556 | switch (type) { | ||
1557 | case USRQUOTA: | ||
1558 | dqopt->flags &= ~DQUOT_USR_ENABLED; | ||
1559 | if (remount) | ||
1560 | dqopt->flags |= DQUOT_USR_SUSPENDED; | ||
1561 | else | ||
1562 | dqopt->flags &= ~DQUOT_USR_SUSPENDED; | ||
1563 | break; | ||
1564 | case GRPQUOTA: | ||
1565 | dqopt->flags &= ~DQUOT_GRP_ENABLED; | ||
1566 | if (remount) | ||
1567 | dqopt->flags |= DQUOT_GRP_SUSPENDED; | ||
1568 | else | ||
1569 | dqopt->flags &= ~DQUOT_GRP_SUSPENDED; | ||
1570 | break; | ||
1571 | } | ||
1572 | } | ||
1573 | |||
1574 | |||
1575 | /* | 1618 | /* |
1576 | * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) | 1619 | * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) |
1577 | */ | 1620 | */ |
1578 | int vfs_quota_off(struct super_block *sb, int type, int remount) | 1621 | int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags) |
1579 | { | 1622 | { |
1580 | int cnt, ret = 0; | 1623 | int cnt, ret = 0; |
1581 | struct quota_info *dqopt = sb_dqopt(sb); | 1624 | struct quota_info *dqopt = sb_dqopt(sb); |
1582 | struct inode *toputinode[MAXQUOTAS]; | 1625 | struct inode *toputinode[MAXQUOTAS]; |
1583 | 1626 | ||
1627 | /* Cannot turn off usage accounting without also turning off limits, and | ||
1628 | * cannot suspend quotas while simultaneously turning them off. */ | ||
1629 | if ((flags & DQUOT_USAGE_ENABLED && !(flags & DQUOT_LIMITS_ENABLED)) | ||
1630 | || (flags & DQUOT_SUSPENDED && flags & (DQUOT_LIMITS_ENABLED | | ||
1631 | DQUOT_USAGE_ENABLED))) | ||
1632 | return -EINVAL; | ||
1633 | |||
1584 | /* We need to serialize quota_off() for device */ | 1634 | /* We need to serialize quota_off() for device */ |
1585 | mutex_lock(&dqopt->dqonoff_mutex); | 1635 | mutex_lock(&dqopt->dqonoff_mutex); |
1586 | 1636 | ||
@@ -1589,7 +1639,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount) | |||
1589 | * sometimes we are called when fill_super() failed and calling | 1639 | * sometimes we are called when fill_super() failed and calling |
1590 | * sync_fs() in such cases does no good. | 1640 | * sync_fs() in such cases does no good. |
1591 | */ | 1641 | */ |
1592 | if (!sb_any_quota_enabled(sb) && !sb_any_quota_suspended(sb)) { | 1642 | if (!sb_any_quota_loaded(sb)) { |
1593 | mutex_unlock(&dqopt->dqonoff_mutex); | 1643 | mutex_unlock(&dqopt->dqonoff_mutex); |
1594 | return 0; | 1644 | return 0; |
1595 | } | 1645 | } |
@@ -1597,17 +1647,28 @@ int vfs_quota_off(struct super_block *sb, int type, int remount) | |||
1597 | toputinode[cnt] = NULL; | 1647 | toputinode[cnt] = NULL; |
1598 | if (type != -1 && cnt != type) | 1648 | if (type != -1 && cnt != type) |
1599 | continue; | 1649 | continue; |
1600 | /* If we keep inodes of quota files after remount and quotaoff | 1650 | if (!sb_has_quota_loaded(sb, cnt)) |
1601 | * is called, drop kept inodes. */ | ||
1602 | if (!remount && sb_has_quota_suspended(sb, cnt)) { | ||
1603 | iput(dqopt->files[cnt]); | ||
1604 | dqopt->files[cnt] = NULL; | ||
1605 | reset_enable_flags(dqopt, cnt, 0); | ||
1606 | continue; | 1651 | continue; |
1652 | |||
1653 | if (flags & DQUOT_SUSPENDED) { | ||
1654 | dqopt->flags |= | ||
1655 | dquot_state_flag(DQUOT_SUSPENDED, cnt); | ||
1656 | } else { | ||
1657 | dqopt->flags &= ~dquot_state_flag(flags, cnt); | ||
1658 | /* Turning off suspended quotas? */ | ||
1659 | if (!sb_has_quota_loaded(sb, cnt) && | ||
1660 | sb_has_quota_suspended(sb, cnt)) { | ||
1661 | dqopt->flags &= ~dquot_state_flag( | ||
1662 | DQUOT_SUSPENDED, cnt); | ||
1663 | iput(dqopt->files[cnt]); | ||
1664 | dqopt->files[cnt] = NULL; | ||
1665 | continue; | ||
1666 | } | ||
1607 | } | 1667 | } |
1608 | if (!sb_has_quota_enabled(sb, cnt)) | 1668 | |
1669 | /* We still have to keep quota loaded? */ | ||
1670 | if (sb_has_quota_loaded(sb, cnt) && !(flags & DQUOT_SUSPENDED)) | ||
1609 | continue; | 1671 | continue; |
1610 | reset_enable_flags(dqopt, cnt, remount); | ||
1611 | 1672 | ||
1612 | /* Note: these are blocking operations */ | 1673 | /* Note: these are blocking operations */ |
1613 | drop_dquot_ref(sb, cnt); | 1674 | drop_dquot_ref(sb, cnt); |
@@ -1623,7 +1684,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount) | |||
1623 | put_quota_format(dqopt->info[cnt].dqi_format); | 1684 | put_quota_format(dqopt->info[cnt].dqi_format); |
1624 | 1685 | ||
1625 | toputinode[cnt] = dqopt->files[cnt]; | 1686 | toputinode[cnt] = dqopt->files[cnt]; |
1626 | if (!remount) | 1687 | if (!sb_has_quota_loaded(sb, cnt)) |
1627 | dqopt->files[cnt] = NULL; | 1688 | dqopt->files[cnt] = NULL; |
1628 | dqopt->info[cnt].dqi_flags = 0; | 1689 | dqopt->info[cnt].dqi_flags = 0; |
1629 | dqopt->info[cnt].dqi_igrace = 0; | 1690 | dqopt->info[cnt].dqi_igrace = 0; |
@@ -1631,6 +1692,11 @@ int vfs_quota_off(struct super_block *sb, int type, int remount) | |||
1631 | dqopt->ops[cnt] = NULL; | 1692 | dqopt->ops[cnt] = NULL; |
1632 | } | 1693 | } |
1633 | mutex_unlock(&dqopt->dqonoff_mutex); | 1694 | mutex_unlock(&dqopt->dqonoff_mutex); |
1695 | |||
1696 | /* Skip syncing and setting flags if quota files are hidden */ | ||
1697 | if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) | ||
1698 | goto put_inodes; | ||
1699 | |||
1634 | /* Sync the superblock so that buffers with quota data are written to | 1700 | /* Sync the superblock so that buffers with quota data are written to |
1635 | * disk (and so userspace sees correct data afterwards). */ | 1701 | * disk (and so userspace sees correct data afterwards). */ |
1636 | if (sb->s_op->sync_fs) | 1702 | if (sb->s_op->sync_fs) |
@@ -1646,7 +1712,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount) | |||
1646 | mutex_lock(&dqopt->dqonoff_mutex); | 1712 | mutex_lock(&dqopt->dqonoff_mutex); |
1647 | /* If quota was reenabled in the meantime, we have | 1713 | /* If quota was reenabled in the meantime, we have |
1648 | * nothing to do */ | 1714 | * nothing to do */ |
1649 | if (!sb_has_quota_enabled(sb, cnt)) { | 1715 | if (!sb_has_quota_loaded(sb, cnt)) { |
1650 | mutex_lock_nested(&toputinode[cnt]->i_mutex, I_MUTEX_QUOTA); | 1716 | mutex_lock_nested(&toputinode[cnt]->i_mutex, I_MUTEX_QUOTA); |
1651 | toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | | 1717 | toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | |
1652 | S_NOATIME | S_NOQUOTA); | 1718 | S_NOATIME | S_NOQUOTA); |
@@ -1655,26 +1721,43 @@ int vfs_quota_off(struct super_block *sb, int type, int remount) | |||
1655 | mark_inode_dirty(toputinode[cnt]); | 1721 | mark_inode_dirty(toputinode[cnt]); |
1656 | } | 1722 | } |
1657 | mutex_unlock(&dqopt->dqonoff_mutex); | 1723 | mutex_unlock(&dqopt->dqonoff_mutex); |
1724 | } | ||
1725 | if (sb->s_bdev) | ||
1726 | invalidate_bdev(sb->s_bdev); | ||
1727 | put_inodes: | ||
1728 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) | ||
1729 | if (toputinode[cnt]) { | ||
1658 | /* On remount RO, we keep the inode pointer so that we | 1730 | /* On remount RO, we keep the inode pointer so that we |
1659 | * can reenable quota on the subsequent remount RW. | 1731 | * can reenable quota on the subsequent remount RW. We |
1660 | * But we have better not keep inode pointer when there | 1732 | * have to check 'flags' variable and not use sb_has_ |
1661 | * is pending delete on the quota file... */ | 1733 | * function because another quotaon / quotaoff could |
1662 | if (!remount) | 1734 | * change global state before we got here. We refuse |
1735 | * to suspend quotas when there is pending delete on | ||
1736 | * the quota file... */ | ||
1737 | if (!(flags & DQUOT_SUSPENDED)) | ||
1663 | iput(toputinode[cnt]); | 1738 | iput(toputinode[cnt]); |
1664 | else if (!toputinode[cnt]->i_nlink) | 1739 | else if (!toputinode[cnt]->i_nlink) |
1665 | ret = -EBUSY; | 1740 | ret = -EBUSY; |
1666 | } | 1741 | } |
1667 | if (sb->s_bdev) | ||
1668 | invalidate_bdev(sb->s_bdev); | ||
1669 | return ret; | 1742 | return ret; |
1670 | } | 1743 | } |
1671 | 1744 | ||
1745 | int vfs_quota_off(struct super_block *sb, int type, int remount) | ||
1746 | { | ||
1747 | return vfs_quota_disable(sb, type, remount ? DQUOT_SUSPENDED : | ||
1748 | (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED)); | ||
1749 | } | ||
1750 | |||
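With the old on/off interface folded into vfs_quota_disable(), the flag check at the top of that function defines which combinations are legal: limits may be switched off while accounting stays on, but not the other way round, and DQUOT_SUSPENDED cannot be mixed with the other two flags. A hedged example call ("examplefs" is invented; headers indicative):

#include <linux/quota.h>
#include <linux/quotaops.h>

static int examplefs_stop_enforcement(struct super_block *sb)
{
        /* Keep usage accounting but stop enforcing limits for user quota.
         * Clearing DQUOT_USAGE_ENABLED alone would be rejected (-EINVAL),
         * and DQUOT_SUSPENDED cannot be combined with either flag. */
        return vfs_quota_disable(sb, USRQUOTA, DQUOT_LIMITS_ENABLED);
}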
1672 | /* | 1751 | /* |
1673 | * Turn quotas on on a device | 1752 | * Turn quotas on on a device |
1674 | */ | 1753 | */ |
1675 | 1754 | ||
1676 | /* Helper function when we already have the inode */ | 1755 | /* |
1677 | static int vfs_quota_on_inode(struct inode *inode, int type, int format_id) | 1756 | * Helper function to turn quotas on when we already have the inode of |
1757 | * quota file and no quota information is loaded. | ||
1758 | */ | ||
1759 | static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, | ||
1760 | unsigned int flags) | ||
1678 | { | 1761 | { |
1679 | struct quota_format_type *fmt = find_quota_format(format_id); | 1762 | struct quota_format_type *fmt = find_quota_format(format_id); |
1680 | struct super_block *sb = inode->i_sb; | 1763 | struct super_block *sb = inode->i_sb; |
@@ -1696,27 +1779,37 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id) | |||
1696 | error = -EINVAL; | 1779 | error = -EINVAL; |
1697 | goto out_fmt; | 1780 | goto out_fmt; |
1698 | } | 1781 | } |
1782 | /* Usage always has to be set... */ | ||
1783 | if (!(flags & DQUOT_USAGE_ENABLED)) { | ||
1784 | error = -EINVAL; | ||
1785 | goto out_fmt; | ||
1786 | } | ||
1699 | 1787 | ||
1700 | /* As we bypass the pagecache we must now flush the inode so that | 1788 | if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { |
1701 | * we see all the changes from userspace... */ | 1789 | /* As we bypass the pagecache we must now flush the inode so |
1702 | write_inode_now(inode, 1); | 1790 | * that we see all the changes from userspace... */ |
1703 | /* And now flush the block cache so that kernel sees the changes */ | 1791 | write_inode_now(inode, 1); |
1704 | invalidate_bdev(sb->s_bdev); | 1792 | /* And now flush the block cache so that kernel sees the |
1793 | * changes */ | ||
1794 | invalidate_bdev(sb->s_bdev); | ||
1795 | } | ||
1705 | mutex_lock(&inode->i_mutex); | 1796 | mutex_lock(&inode->i_mutex); |
1706 | mutex_lock(&dqopt->dqonoff_mutex); | 1797 | mutex_lock(&dqopt->dqonoff_mutex); |
1707 | if (sb_has_quota_enabled(sb, type) || | 1798 | if (sb_has_quota_loaded(sb, type)) { |
1708 | sb_has_quota_suspended(sb, type)) { | ||
1709 | error = -EBUSY; | 1799 | error = -EBUSY; |
1710 | goto out_lock; | 1800 | goto out_lock; |
1711 | } | 1801 | } |
1712 | /* We don't want quota and atime on quota files (deadlocks possible) | 1802 | |
1713 | * Also nobody should write to the file - we use special IO operations | 1803 | if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { |
1714 | * which ignore the immutable bit. */ | 1804 | /* We don't want quota and atime on quota files (deadlocks |
1715 | down_write(&dqopt->dqptr_sem); | 1805 | * possible) Also nobody should write to the file - we use |
1716 | oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA); | 1806 | * special IO operations which ignore the immutable bit. */ |
1717 | inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; | 1807 | down_write(&dqopt->dqptr_sem); |
1718 | up_write(&dqopt->dqptr_sem); | 1808 | oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA); |
1719 | sb->dq_op->drop(inode); | 1809 | inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; |
1810 | up_write(&dqopt->dqptr_sem); | ||
1811 | sb->dq_op->drop(inode); | ||
1812 | } | ||
1720 | 1813 | ||
1721 | error = -EIO; | 1814 | error = -EIO; |
1722 | dqopt->files[type] = igrab(inode); | 1815 | dqopt->files[type] = igrab(inode); |
@@ -1737,7 +1830,7 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id) | |||
1737 | } | 1830 | } |
1738 | mutex_unlock(&dqopt->dqio_mutex); | 1831 | mutex_unlock(&dqopt->dqio_mutex); |
1739 | mutex_unlock(&inode->i_mutex); | 1832 | mutex_unlock(&inode->i_mutex); |
1740 | set_enable_flags(dqopt, type); | 1833 | dqopt->flags |= dquot_state_flag(flags, type); |
1741 | 1834 | ||
1742 | add_dquot_ref(sb, type); | 1835 | add_dquot_ref(sb, type); |
1743 | mutex_unlock(&dqopt->dqonoff_mutex); | 1836 | mutex_unlock(&dqopt->dqonoff_mutex); |
@@ -1770,20 +1863,23 @@ static int vfs_quota_on_remount(struct super_block *sb, int type) | |||
1770 | struct quota_info *dqopt = sb_dqopt(sb); | 1863 | struct quota_info *dqopt = sb_dqopt(sb); |
1771 | struct inode *inode; | 1864 | struct inode *inode; |
1772 | int ret; | 1865 | int ret; |
1866 | unsigned int flags; | ||
1773 | 1867 | ||
1774 | mutex_lock(&dqopt->dqonoff_mutex); | 1868 | mutex_lock(&dqopt->dqonoff_mutex); |
1775 | if (!sb_has_quota_suspended(sb, type)) { | 1869 | if (!sb_has_quota_suspended(sb, type)) { |
1776 | mutex_unlock(&dqopt->dqonoff_mutex); | 1870 | mutex_unlock(&dqopt->dqonoff_mutex); |
1777 | return 0; | 1871 | return 0; |
1778 | } | 1872 | } |
1779 | BUG_ON(sb_has_quota_enabled(sb, type)); | ||
1780 | |||
1781 | inode = dqopt->files[type]; | 1873 | inode = dqopt->files[type]; |
1782 | dqopt->files[type] = NULL; | 1874 | dqopt->files[type] = NULL; |
1783 | reset_enable_flags(dqopt, type, 0); | 1875 | flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED | |
1876 | DQUOT_LIMITS_ENABLED, type); | ||
1877 | dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type); | ||
1784 | mutex_unlock(&dqopt->dqonoff_mutex); | 1878 | mutex_unlock(&dqopt->dqonoff_mutex); |
1785 | 1879 | ||
1786 | ret = vfs_quota_on_inode(inode, type, dqopt->info[type].dqi_fmt_id); | 1880 | flags = dquot_generic_flag(flags, type); |
1881 | ret = vfs_load_quota_inode(inode, type, dqopt->info[type].dqi_fmt_id, | ||
1882 | flags); | ||
1787 | iput(inode); | 1883 | iput(inode); |
1788 | 1884 | ||
1789 | return ret; | 1885 | return ret; |
@@ -1799,12 +1895,12 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id, | |||
1799 | if (path->mnt->mnt_sb != sb) | 1895 | if (path->mnt->mnt_sb != sb) |
1800 | error = -EXDEV; | 1896 | error = -EXDEV; |
1801 | else | 1897 | else |
1802 | error = vfs_quota_on_inode(path->dentry->d_inode, type, | 1898 | error = vfs_load_quota_inode(path->dentry->d_inode, type, |
1803 | format_id); | 1899 | format_id, DQUOT_USAGE_ENABLED | |
1900 | DQUOT_LIMITS_ENABLED); | ||
1804 | return error; | 1901 | return error; |
1805 | } | 1902 | } |
1806 | 1903 | ||
1807 | /* Actual function called from quotactl() */ | ||
1808 | int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name, | 1904 | int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name, |
1809 | int remount) | 1905 | int remount) |
1810 | { | 1906 | { |
@@ -1823,6 +1919,50 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name, | |||
1823 | } | 1919 | } |
1824 | 1920 | ||
1825 | /* | 1921 | /* |
1922 | * More powerful function for turning on quotas allowing setting | ||
1923 | * of individual quota flags | ||
1924 | */ | ||
1925 | int vfs_quota_enable(struct inode *inode, int type, int format_id, | ||
1926 | unsigned int flags) | ||
1927 | { | ||
1928 | int ret = 0; | ||
1929 | struct super_block *sb = inode->i_sb; | ||
1930 | struct quota_info *dqopt = sb_dqopt(sb); | ||
1931 | |||
1932 | /* Just unsuspend quotas? */ | ||
1933 | if (flags & DQUOT_SUSPENDED) | ||
1934 | return vfs_quota_on_remount(sb, type); | ||
1935 | if (!flags) | ||
1936 | return 0; | ||
1937 | /* Just updating flags needed? */ | ||
1938 | if (sb_has_quota_loaded(sb, type)) { | ||
1939 | mutex_lock(&dqopt->dqonoff_mutex); | ||
1940 | /* Now do a reliable test... */ | ||
1941 | if (!sb_has_quota_loaded(sb, type)) { | ||
1942 | mutex_unlock(&dqopt->dqonoff_mutex); | ||
1943 | goto load_quota; | ||
1944 | } | ||
1945 | if (flags & DQUOT_USAGE_ENABLED && | ||
1946 | sb_has_quota_usage_enabled(sb, type)) { | ||
1947 | ret = -EBUSY; | ||
1948 | goto out_lock; | ||
1949 | } | ||
1950 | if (flags & DQUOT_LIMITS_ENABLED && | ||
1951 | sb_has_quota_limits_enabled(sb, type)) { | ||
1952 | ret = -EBUSY; | ||
1953 | goto out_lock; | ||
1954 | } | ||
1955 | sb_dqopt(sb)->flags |= dquot_state_flag(flags, type); | ||
1956 | out_lock: | ||
1957 | mutex_unlock(&dqopt->dqonoff_mutex); | ||
1958 | return ret; | ||
1959 | } | ||
1960 | |||
1961 | load_quota: | ||
1962 | return vfs_load_quota_inode(inode, type, format_id, flags); | ||
1963 | } | ||
1964 | |||
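vfs_quota_enable() takes the split state flags, so a filesystem can first load a quota inode for accounting only and later switch enforcement on without reloading it; the "Just updating flags needed?" branch above handles the second call. A hypothetical two-step sequence (names and the format id are placeholders):

#include <linux/quota.h>
#include <linux/quotaops.h>

static int examplefs_quota_setup(struct inode *quota_inode, int type, int fmt)
{
        int err;

        /* Load the quota file and start accounting only. */
        err = vfs_quota_enable(quota_inode, type, fmt, DQUOT_USAGE_ENABLED);
        if (err)
                return err;
        /* Later (e.g. on an explicit quotaon), add limit enforcement to the
         * already loaded quota; only the state flags are updated here. */
        return vfs_quota_enable(quota_inode, type, fmt, DQUOT_LIMITS_ENABLED);
}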
1965 | /* | ||
1826 | * This function is used when filesystem needs to initialize quotas | 1966 | * This function is used when filesystem needs to initialize quotas |
1827 | * during mount time. | 1967 | * during mount time. |
1828 | */ | 1968 | */ |
@@ -1843,7 +1983,8 @@ int vfs_quota_on_mount(struct super_block *sb, char *qf_name, | |||
1843 | 1983 | ||
1844 | error = security_quota_on(dentry); | 1984 | error = security_quota_on(dentry); |
1845 | if (!error) | 1985 | if (!error) |
1846 | error = vfs_quota_on_inode(dentry->d_inode, type, format_id); | 1986 | error = vfs_load_quota_inode(dentry->d_inode, type, format_id, |
1987 | DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); | ||
1847 | 1988 | ||
1848 | out: | 1989 | out: |
1849 | dput(dentry); | 1990 | dput(dentry); |
@@ -1866,14 +2007,24 @@ int vfs_dq_quota_on_remount(struct super_block *sb) | |||
1866 | return ret; | 2007 | return ret; |
1867 | } | 2008 | } |
1868 | 2009 | ||
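vfs_quota_off(sb, type, 1) now maps to vfs_quota_disable() with DQUOT_SUSPENDED, and vfs_dq_quota_on_remount() resumes whatever was suspended from the kept inode pointers. A simplified, hypothetical remount path showing how the two fit together (error handling trimmed; headers indicative):

#include <linux/fs.h>
#include <linux/quotaops.h>

static int examplefs_remount(struct super_block *sb, int *flags)
{
        if (*flags & MS_RDONLY)
                /* Going read-only: suspend all quota types, keep the inodes. */
                return vfs_quota_off(sb, -1, 1);
        /* Going read-write again: re-enable the suspended types. */
        return vfs_dq_quota_on_remount(sb);
}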
2010 | static inline qsize_t qbtos(qsize_t blocks) | ||
2011 | { | ||
2012 | return blocks << QIF_DQBLKSIZE_BITS; | ||
2013 | } | ||
2014 | |||
2015 | static inline qsize_t stoqb(qsize_t space) | ||
2016 | { | ||
2017 | return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS; | ||
2018 | } | ||
2019 | |||
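These two helpers convert between the 1 KiB quota blocks that the quotactl interface still speaks and the byte counts now held in struct mem_dqblk; stoqb() rounds up so a partial block still counts as a full one. A stand-alone sketch of the arithmetic (QIF_DQBLKSIZE_BITS is 10 in the quota headers, restated here only for the demo):

#include <stdio.h>

#define QIF_DQBLKSIZE_BITS 10
#define QIF_DQBLKSIZE (1ULL << QIF_DQBLKSIZE_BITS)
typedef unsigned long long qsize_t;

static qsize_t qbtos(qsize_t blocks) { return blocks << QIF_DQBLKSIZE_BITS; }
static qsize_t stoqb(qsize_t space)  { return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS; }

int main(void)
{
        printf("%llu\n", qbtos(5));     /* 5 blocks   -> 5120 bytes */
        printf("%llu\n", stoqb(5120));  /* 5120 bytes -> 5 blocks   */
        printf("%llu\n", stoqb(5121));  /* 5121 bytes -> 6 blocks (rounded up) */
        return 0;
}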
1869 | /* Generic routine for getting common part of quota structure */ | 2020 | /* Generic routine for getting common part of quota structure */ |
1870 | static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di) | 2021 | static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di) |
1871 | { | 2022 | { |
1872 | struct mem_dqblk *dm = &dquot->dq_dqb; | 2023 | struct mem_dqblk *dm = &dquot->dq_dqb; |
1873 | 2024 | ||
1874 | spin_lock(&dq_data_lock); | 2025 | spin_lock(&dq_data_lock); |
1875 | di->dqb_bhardlimit = dm->dqb_bhardlimit; | 2026 | di->dqb_bhardlimit = stoqb(dm->dqb_bhardlimit); |
1876 | di->dqb_bsoftlimit = dm->dqb_bsoftlimit; | 2027 | di->dqb_bsoftlimit = stoqb(dm->dqb_bsoftlimit); |
1877 | di->dqb_curspace = dm->dqb_curspace; | 2028 | di->dqb_curspace = dm->dqb_curspace; |
1878 | di->dqb_ihardlimit = dm->dqb_ihardlimit; | 2029 | di->dqb_ihardlimit = dm->dqb_ihardlimit; |
1879 | di->dqb_isoftlimit = dm->dqb_isoftlimit; | 2030 | di->dqb_isoftlimit = dm->dqb_isoftlimit; |
@@ -1918,28 +2069,38 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di) | |||
1918 | if (di->dqb_valid & QIF_SPACE) { | 2069 | if (di->dqb_valid & QIF_SPACE) { |
1919 | dm->dqb_curspace = di->dqb_curspace; | 2070 | dm->dqb_curspace = di->dqb_curspace; |
1920 | check_blim = 1; | 2071 | check_blim = 1; |
2072 | __set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); | ||
1921 | } | 2073 | } |
1922 | if (di->dqb_valid & QIF_BLIMITS) { | 2074 | if (di->dqb_valid & QIF_BLIMITS) { |
1923 | dm->dqb_bsoftlimit = di->dqb_bsoftlimit; | 2075 | dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit); |
1924 | dm->dqb_bhardlimit = di->dqb_bhardlimit; | 2076 | dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit); |
1925 | check_blim = 1; | 2077 | check_blim = 1; |
2078 | __set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); | ||
1926 | } | 2079 | } |
1927 | if (di->dqb_valid & QIF_INODES) { | 2080 | if (di->dqb_valid & QIF_INODES) { |
1928 | dm->dqb_curinodes = di->dqb_curinodes; | 2081 | dm->dqb_curinodes = di->dqb_curinodes; |
1929 | check_ilim = 1; | 2082 | check_ilim = 1; |
2083 | __set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); | ||
1930 | } | 2084 | } |
1931 | if (di->dqb_valid & QIF_ILIMITS) { | 2085 | if (di->dqb_valid & QIF_ILIMITS) { |
1932 | dm->dqb_isoftlimit = di->dqb_isoftlimit; | 2086 | dm->dqb_isoftlimit = di->dqb_isoftlimit; |
1933 | dm->dqb_ihardlimit = di->dqb_ihardlimit; | 2087 | dm->dqb_ihardlimit = di->dqb_ihardlimit; |
1934 | check_ilim = 1; | 2088 | check_ilim = 1; |
2089 | __set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); | ||
1935 | } | 2090 | } |
1936 | if (di->dqb_valid & QIF_BTIME) | 2091 | if (di->dqb_valid & QIF_BTIME) { |
1937 | dm->dqb_btime = di->dqb_btime; | 2092 | dm->dqb_btime = di->dqb_btime; |
1938 | if (di->dqb_valid & QIF_ITIME) | 2093 | check_blim = 1; |
2094 | __set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); | ||
2095 | } | ||
2096 | if (di->dqb_valid & QIF_ITIME) { | ||
1939 | dm->dqb_itime = di->dqb_itime; | 2097 | dm->dqb_itime = di->dqb_itime; |
2098 | check_ilim = 1; | ||
2099 | __set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); | ||
2100 | } | ||
1940 | 2101 | ||
1941 | if (check_blim) { | 2102 | if (check_blim) { |
1942 | if (!dm->dqb_bsoftlimit || toqb(dm->dqb_curspace) < dm->dqb_bsoftlimit) { | 2103 | if (!dm->dqb_bsoftlimit || dm->dqb_curspace < dm->dqb_bsoftlimit) { |
1943 | dm->dqb_btime = 0; | 2104 | dm->dqb_btime = 0; |
1944 | clear_bit(DQ_BLKS_B, &dquot->dq_flags); | 2105 | clear_bit(DQ_BLKS_B, &dquot->dq_flags); |
1945 | } | 2106 | } |
@@ -1970,12 +2131,14 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d | |||
1970 | int rc; | 2131 | int rc; |
1971 | 2132 | ||
1972 | mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); | 2133 | mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); |
1973 | if (!(dquot = dqget(sb, id, type))) { | 2134 | dquot = dqget(sb, id, type); |
1974 | mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); | 2135 | if (!dquot) { |
1975 | return -ESRCH; | 2136 | rc = -ESRCH; |
2137 | goto out; | ||
1976 | } | 2138 | } |
1977 | rc = do_set_dqblk(dquot, di); | 2139 | rc = do_set_dqblk(dquot, di); |
1978 | dqput(dquot); | 2140 | dqput(dquot); |
2141 | out: | ||
1979 | mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); | 2142 | mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); |
1980 | return rc; | 2143 | return rc; |
1981 | } | 2144 | } |
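do_set_dqblk() now records which fields the last Q_SETQUOTA call touched by setting DQ_LASTSET_B + QIF_*_B bits in dq_flags. A hypothetical consumer, sketched only to show the intended use of the bits (the helper name is invented):

#include <linux/quota.h>

static bool examplefs_limits_just_set(struct dquot *dquot)
{
        /* True if the most recent setquota call changed either the block or
         * the inode limits; a filesystem could use this to decide what it
         * has to write back or journal. */
        return test_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags) ||
               test_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
}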
@@ -1986,7 +2149,7 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) | |||
1986 | struct mem_dqinfo *mi; | 2149 | struct mem_dqinfo *mi; |
1987 | 2150 | ||
1988 | mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); | 2151 | mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); |
1989 | if (!sb_has_quota_enabled(sb, type)) { | 2152 | if (!sb_has_quota_active(sb, type)) { |
1990 | mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); | 2153 | mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); |
1991 | return -ESRCH; | 2154 | return -ESRCH; |
1992 | } | 2155 | } |
@@ -2005,11 +2168,12 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) | |||
2005 | int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) | 2168 | int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) |
2006 | { | 2169 | { |
2007 | struct mem_dqinfo *mi; | 2170 | struct mem_dqinfo *mi; |
2171 | int err = 0; | ||
2008 | 2172 | ||
2009 | mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); | 2173 | mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); |
2010 | if (!sb_has_quota_enabled(sb, type)) { | 2174 | if (!sb_has_quota_active(sb, type)) { |
2011 | mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); | 2175 | err = -ESRCH; |
2012 | return -ESRCH; | 2176 | goto out; |
2013 | } | 2177 | } |
2014 | mi = sb_dqopt(sb)->info + type; | 2178 | mi = sb_dqopt(sb)->info + type; |
2015 | spin_lock(&dq_data_lock); | 2179 | spin_lock(&dq_data_lock); |
@@ -2023,8 +2187,9 @@ int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) | |||
2023 | mark_info_dirty(sb, type); | 2187 | mark_info_dirty(sb, type); |
2024 | /* Force write to disk */ | 2188 | /* Force write to disk */ |
2025 | sb->dq_op->write_info(sb, type); | 2189 | sb->dq_op->write_info(sb, type); |
2190 | out: | ||
2026 | mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); | 2191 | mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); |
2027 | return 0; | 2192 | return err; |
2028 | } | 2193 | } |
2029 | 2194 | ||
2030 | struct quotactl_ops vfs_quotactl_ops = { | 2195 | struct quotactl_ops vfs_quotactl_ops = { |
@@ -2186,10 +2351,13 @@ EXPORT_SYMBOL(register_quota_format); | |||
2186 | EXPORT_SYMBOL(unregister_quota_format); | 2351 | EXPORT_SYMBOL(unregister_quota_format); |
2187 | EXPORT_SYMBOL(dqstats); | 2352 | EXPORT_SYMBOL(dqstats); |
2188 | EXPORT_SYMBOL(dq_data_lock); | 2353 | EXPORT_SYMBOL(dq_data_lock); |
2354 | EXPORT_SYMBOL(vfs_quota_enable); | ||
2189 | EXPORT_SYMBOL(vfs_quota_on); | 2355 | EXPORT_SYMBOL(vfs_quota_on); |
2190 | EXPORT_SYMBOL(vfs_quota_on_path); | 2356 | EXPORT_SYMBOL(vfs_quota_on_path); |
2191 | EXPORT_SYMBOL(vfs_quota_on_mount); | 2357 | EXPORT_SYMBOL(vfs_quota_on_mount); |
2358 | EXPORT_SYMBOL(vfs_quota_disable); | ||
2192 | EXPORT_SYMBOL(vfs_quota_off); | 2359 | EXPORT_SYMBOL(vfs_quota_off); |
2360 | EXPORT_SYMBOL(dquot_scan_active); | ||
2193 | EXPORT_SYMBOL(vfs_quota_sync); | 2361 | EXPORT_SYMBOL(vfs_quota_sync); |
2194 | EXPORT_SYMBOL(vfs_get_dqinfo); | 2362 | EXPORT_SYMBOL(vfs_get_dqinfo); |
2195 | EXPORT_SYMBOL(vfs_set_dqinfo); | 2363 | EXPORT_SYMBOL(vfs_set_dqinfo); |
@@ -2202,7 +2370,11 @@ EXPORT_SYMBOL(dquot_release); | |||
2202 | EXPORT_SYMBOL(dquot_mark_dquot_dirty); | 2370 | EXPORT_SYMBOL(dquot_mark_dquot_dirty); |
2203 | EXPORT_SYMBOL(dquot_initialize); | 2371 | EXPORT_SYMBOL(dquot_initialize); |
2204 | EXPORT_SYMBOL(dquot_drop); | 2372 | EXPORT_SYMBOL(dquot_drop); |
2373 | EXPORT_SYMBOL(dquot_drop_locked); | ||
2205 | EXPORT_SYMBOL(vfs_dq_drop); | 2374 | EXPORT_SYMBOL(vfs_dq_drop); |
2375 | EXPORT_SYMBOL(dqget); | ||
2376 | EXPORT_SYMBOL(dqput); | ||
2377 | EXPORT_SYMBOL(dquot_is_cached); | ||
2206 | EXPORT_SYMBOL(dquot_alloc_space); | 2378 | EXPORT_SYMBOL(dquot_alloc_space); |
2207 | EXPORT_SYMBOL(dquot_alloc_inode); | 2379 | EXPORT_SYMBOL(dquot_alloc_inode); |
2208 | EXPORT_SYMBOL(dquot_free_space); | 2380 | EXPORT_SYMBOL(dquot_free_space); |
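dqget(), dqput() and dquot_is_cached() are newly exported so a filesystem can hold a dquot reference across its own operations. A hypothetical call site, using only the dqget(sb, id, type) signature visible in vfs_set_dqblk() above:

#include <linux/quota.h>
#include <linux/quotaops.h>

static void examplefs_inspect_quota(struct super_block *sb, qid_t id, int type)
{
        struct dquot *dquot = dqget(sb, id, type);

        if (!dquot)
                return;
        spin_lock(&dq_data_lock);
        /* ... read dquot->dq_dqb (usage, limits) for logging, etc. ... */
        spin_unlock(&dq_data_lock);
        dqput(dquot);
}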
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 6046239465a1..c01e043670e2 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c | |||
@@ -175,8 +175,8 @@ out: | |||
175 | * | 175 | * |
176 | * Returns zero on success; non-zero on error. | 176 | * Returns zero on success; non-zero on error. |
177 | */ | 177 | */ |
178 | static int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, | 178 | int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, |
179 | loff_t offset) | 179 | loff_t offset) |
180 | { | 180 | { |
181 | int rc = 0; | 181 | int rc = 0; |
182 | char dst[MD5_DIGEST_SIZE]; | 182 | char dst[MD5_DIGEST_SIZE]; |
@@ -924,6 +924,15 @@ static void ecryptfs_copy_mount_wide_flags_to_inode_flags( | |||
924 | crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; | 924 | crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; |
925 | if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) | 925 | if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) |
926 | crypt_stat->flags |= ECRYPTFS_VIEW_AS_ENCRYPTED; | 926 | crypt_stat->flags |= ECRYPTFS_VIEW_AS_ENCRYPTED; |
927 | if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) { | ||
928 | crypt_stat->flags |= ECRYPTFS_ENCRYPT_FILENAMES; | ||
929 | if (mount_crypt_stat->flags | ||
930 | & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK) | ||
931 | crypt_stat->flags |= ECRYPTFS_ENCFN_USE_MOUNT_FNEK; | ||
932 | else if (mount_crypt_stat->flags | ||
933 | & ECRYPTFS_GLOBAL_ENCFN_USE_FEK) | ||
934 | crypt_stat->flags |= ECRYPTFS_ENCFN_USE_FEK; | ||
935 | } | ||
927 | } | 936 | } |
928 | 937 | ||
929 | static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs( | 938 | static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs( |
@@ -1060,7 +1069,8 @@ struct ecryptfs_flag_map_elem { | |||
1060 | static struct ecryptfs_flag_map_elem ecryptfs_flag_map[] = { | 1069 | static struct ecryptfs_flag_map_elem ecryptfs_flag_map[] = { |
1061 | {0x00000001, ECRYPTFS_ENABLE_HMAC}, | 1070 | {0x00000001, ECRYPTFS_ENABLE_HMAC}, |
1062 | {0x00000002, ECRYPTFS_ENCRYPTED}, | 1071 | {0x00000002, ECRYPTFS_ENCRYPTED}, |
1063 | {0x00000004, ECRYPTFS_METADATA_IN_XATTR} | 1072 | {0x00000004, ECRYPTFS_METADATA_IN_XATTR}, |
1073 | {0x00000008, ECRYPTFS_ENCRYPT_FILENAMES} | ||
1064 | }; | 1074 | }; |
1065 | 1075 | ||
1066 | /** | 1076 | /** |
@@ -1149,19 +1159,20 @@ ecryptfs_cipher_code_str_map[] = { | |||
1149 | 1159 | ||
1150 | /** | 1160 | /** |
1151 | * ecryptfs_code_for_cipher_string | 1161 | * ecryptfs_code_for_cipher_string |
1152 | * @crypt_stat: The cryptographic context | 1162 | * @cipher_name: The string alias for the cipher |
1163 | * @key_bytes: Length of key in bytes; used for AES code selection | ||
1153 | * | 1164 | * |
1154 | * Returns zero on no match, or the cipher code on match | 1165 | * Returns zero on no match, or the cipher code on match |
1155 | */ | 1166 | */ |
1156 | u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat) | 1167 | u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes) |
1157 | { | 1168 | { |
1158 | int i; | 1169 | int i; |
1159 | u8 code = 0; | 1170 | u8 code = 0; |
1160 | struct ecryptfs_cipher_code_str_map_elem *map = | 1171 | struct ecryptfs_cipher_code_str_map_elem *map = |
1161 | ecryptfs_cipher_code_str_map; | 1172 | ecryptfs_cipher_code_str_map; |
1162 | 1173 | ||
1163 | if (strcmp(crypt_stat->cipher, "aes") == 0) { | 1174 | if (strcmp(cipher_name, "aes") == 0) { |
1164 | switch (crypt_stat->key_size) { | 1175 | switch (key_bytes) { |
1165 | case 16: | 1176 | case 16: |
1166 | code = RFC2440_CIPHER_AES_128; | 1177 | code = RFC2440_CIPHER_AES_128; |
1167 | break; | 1178 | break; |
@@ -1173,7 +1184,7 @@ u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat) | |||
1173 | } | 1184 | } |
1174 | } else { | 1185 | } else { |
1175 | for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++) | 1186 | for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++) |
1176 | if (strcmp(crypt_stat->cipher, map[i].cipher_str) == 0){ | 1187 | if (strcmp(cipher_name, map[i].cipher_str) == 0) { |
1177 | code = map[i].cipher_code; | 1188 | code = map[i].cipher_code; |
1178 | break; | 1189 | break; |
1179 | } | 1190 | } |
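The lookup now takes a bare cipher name and key length instead of a crypt_stat, so callers that only have mount-wide options (such as the filename-encryption key) can use it too. An illustrative wrapper, relying only on the AES-128 mapping visible in the hunk above:

#include "ecryptfs_kernel.h"

static u8 example_default_cipher_code(void)
{
        /* A 16-byte AES key maps to RFC2440_CIPHER_AES_128 per the switch
         * above; other lengths and ciphers go through the string map. */
        return ecryptfs_code_for_cipher_string("aes", 16);
}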
@@ -1212,6 +1223,8 @@ int ecryptfs_read_and_validate_header_region(char *data, | |||
1212 | &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); | 1223 | &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); |
1213 | int rc; | 1224 | int rc; |
1214 | 1225 | ||
1226 | if (crypt_stat->extent_size == 0) | ||
1227 | crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE; | ||
1215 | rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size, | 1228 | rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size, |
1216 | ecryptfs_inode); | 1229 | ecryptfs_inode); |
1217 | if (rc) { | 1230 | if (rc) { |
@@ -1221,7 +1234,6 @@ int ecryptfs_read_and_validate_header_region(char *data, | |||
1221 | } | 1234 | } |
1222 | if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) { | 1235 | if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) { |
1223 | rc = -EINVAL; | 1236 | rc = -EINVAL; |
1224 | ecryptfs_printk(KERN_DEBUG, "Valid marker not found\n"); | ||
1225 | } | 1237 | } |
1226 | out: | 1238 | out: |
1227 | return rc; | 1239 | return rc; |
@@ -1628,95 +1640,95 @@ out: | |||
1628 | } | 1640 | } |
1629 | 1641 | ||
1630 | /** | 1642 | /** |
1631 | * ecryptfs_encode_filename - converts a plaintext file name to cipher text | 1643 | * ecryptfs_encrypt_filename - encrypt filename |
1632 | * @crypt_stat: The crypt_stat struct associated with the file anem to encode | ||
1633 | * @name: The plaintext name | ||
1634 | * @length: The length of the plaintext | ||
1635 | * @encoded_name: The encypted name | ||
1636 | * | 1644 | * |
1637 | * Encrypts and encodes a filename into something that constitutes a | 1645 | * CBC-encrypts the filename. We do not want to encrypt the same |
1638 | * valid filename for a filesystem, with printable characters. | 1646 | * filename with the same key and IV, which may happen with hard |
1647 | * links, so we prepend random bits to each filename. | ||
1639 | * | 1648 | * |
1640 | * We assume that we have a properly initialized crypto context, | 1649 | * Returns zero on success; non-zero otherwise |
1641 | * pointed to by crypt_stat->tfm. | ||
1642 | * | ||
1643 | * TODO: Implement filename decoding and decryption here, in place of | ||
1644 | * memcpy. We are keeping the framework around for now to (1) | ||
1645 | * facilitate testing of the components needed to implement filename | ||
1646 | * encryption and (2) to provide a code base from which other | ||
1647 | * developers in the community can easily implement this feature. | ||
1648 | * | ||
1649 | * Returns the length of encoded filename; negative if error | ||
1650 | */ | 1650 | */ |
1651 | int | 1651 | static int |
1652 | ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat, | 1652 | ecryptfs_encrypt_filename(struct ecryptfs_filename *filename, |
1653 | const char *name, int length, char **encoded_name) | 1653 | struct ecryptfs_crypt_stat *crypt_stat, |
1654 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat) | ||
1654 | { | 1655 | { |
1655 | int error = 0; | 1656 | int rc = 0; |
1656 | 1657 | ||
1657 | (*encoded_name) = kmalloc(length + 2, GFP_KERNEL); | 1658 | filename->encrypted_filename = NULL; |
1658 | if (!(*encoded_name)) { | 1659 | filename->encrypted_filename_size = 0; |
1659 | error = -ENOMEM; | 1660 | if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCFN_USE_MOUNT_FNEK)) |
1661 | || (mount_crypt_stat && (mount_crypt_stat->flags | ||
1662 | & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) { | ||
1663 | size_t packet_size; | ||
1664 | size_t remaining_bytes; | ||
1665 | |||
1666 | rc = ecryptfs_write_tag_70_packet( | ||
1667 | NULL, NULL, | ||
1668 | &filename->encrypted_filename_size, | ||
1669 | mount_crypt_stat, NULL, | ||
1670 | filename->filename_size); | ||
1671 | if (rc) { | ||
1672 | printk(KERN_ERR "%s: Error attempting to get packet " | ||
1673 | "size for tag 72; rc = [%d]\n", __func__, | ||
1674 | rc); | ||
1675 | filename->encrypted_filename_size = 0; | ||
1676 | goto out; | ||
1677 | } | ||
1678 | filename->encrypted_filename = | ||
1679 | kmalloc(filename->encrypted_filename_size, GFP_KERNEL); | ||
1680 | if (!filename->encrypted_filename) { | ||
1681 | printk(KERN_ERR "%s: Out of memory whilst attempting " | ||
1682 | "to kmalloc [%zd] bytes\n", __func__, | ||
1683 | filename->encrypted_filename_size); | ||
1684 | rc = -ENOMEM; | ||
1685 | goto out; | ||
1686 | } | ||
1687 | remaining_bytes = filename->encrypted_filename_size; | ||
1688 | rc = ecryptfs_write_tag_70_packet(filename->encrypted_filename, | ||
1689 | &remaining_bytes, | ||
1690 | &packet_size, | ||
1691 | mount_crypt_stat, | ||
1692 | filename->filename, | ||
1693 | filename->filename_size); | ||
1694 | if (rc) { | ||
1695 | printk(KERN_ERR "%s: Error attempting to generate " | ||
1696 | "tag 70 packet; rc = [%d]\n", __func__, | ||
1697 | rc); | ||
1698 | kfree(filename->encrypted_filename); | ||
1699 | filename->encrypted_filename = NULL; | ||
1700 | filename->encrypted_filename_size = 0; | ||
1701 | goto out; | ||
1702 | } | ||
1703 | filename->encrypted_filename_size = packet_size; | ||
1704 | } else { | ||
1705 | printk(KERN_ERR "%s: No support for requested filename " | ||
1706 | "encryption method in this release\n", __func__); | ||
1707 | rc = -ENOTSUPP; | ||
1660 | goto out; | 1708 | goto out; |
1661 | } | 1709 | } |
1662 | /* TODO: Filename encryption is a scheduled feature for a | ||
1663 | * future version of eCryptfs. This function is here only for | ||
1664 | * the purpose of providing a framework for other developers | ||
1665 | * to easily implement filename encryption. Hint: Replace this | ||
1666 | * memcpy() with a call to encrypt and encode the | ||
1667 | * filename, the set the length accordingly. */ | ||
1668 | memcpy((void *)(*encoded_name), (void *)name, length); | ||
1669 | (*encoded_name)[length] = '\0'; | ||
1670 | error = length + 1; | ||
1671 | out: | 1710 | out: |
1672 | return error; | 1711 | return rc; |
1673 | } | 1712 | } |
1674 | 1713 | ||
1675 | /** | 1714 | static int ecryptfs_copy_filename(char **copied_name, size_t *copied_name_size, |
1676 | * ecryptfs_decode_filename - converts the cipher text name to plaintext | 1715 | const char *name, size_t name_size) |
1677 | * @crypt_stat: The crypt_stat struct associated with the file | ||
1678 | * @name: The filename in cipher text | ||
1679 | * @length: The length of the cipher text name | ||
1680 | * @decrypted_name: The plaintext name | ||
1681 | * | ||
1682 | * Decodes and decrypts the filename. | ||
1683 | * | ||
1684 | * We assume that we have a properly initialized crypto context, | ||
1685 | * pointed to by crypt_stat->tfm. | ||
1686 | * | ||
1687 | * TODO: Implement filename decoding and decryption here, in place of | ||
1688 | * memcpy. We are keeping the framework around for now to (1) | ||
1689 | * facilitate testing of the components needed to implement filename | ||
1690 | * encryption and (2) to provide a code base from which other | ||
1691 | * developers in the community can easily implement this feature. | ||
1692 | * | ||
1693 | * Returns the length of decoded filename; negative if error | ||
1694 | */ | ||
1695 | int | ||
1696 | ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat, | ||
1697 | const char *name, int length, char **decrypted_name) | ||
1698 | { | 1716 | { |
1699 | int error = 0; | 1717 | int rc = 0; |
1700 | 1718 | ||
1701 | (*decrypted_name) = kmalloc(length + 2, GFP_KERNEL); | 1719 | (*copied_name) = kmalloc((name_size + 2), GFP_KERNEL); |
1702 | if (!(*decrypted_name)) { | 1720 | if (!(*copied_name)) { |
1703 | error = -ENOMEM; | 1721 | rc = -ENOMEM; |
1704 | goto out; | 1722 | goto out; |
1705 | } | 1723 | } |
1706 | /* TODO: Filename encryption is a scheduled feature for a | 1724 | memcpy((void *)(*copied_name), (void *)name, name_size); |
1707 | * future version of eCryptfs. This function is here only for | 1725 | (*copied_name)[(name_size)] = '\0'; /* Only for convenience |
1708 | * the purpose of providing a framework for other developers | ||
1709 | * to easily implement filename encryption. Hint: Replace this | ||
1710 | * memcpy() with a call to decode and decrypt the | ||
1711 | * filename, the set the length accordingly. */ | ||
1712 | memcpy((void *)(*decrypted_name), (void *)name, length); | ||
1713 | (*decrypted_name)[length + 1] = '\0'; /* Only for convenience | ||
1714 | * in printing out the | 1726 | * in printing out the |
1715 | * string in debug | 1727 | * string in debug |
1716 | * messages */ | 1728 | * messages */ |
1717 | error = length; | 1729 | (*copied_name_size) = (name_size + 1); |
1718 | out: | 1730 | out: |
1719 | return error; | 1731 | return rc; |
1720 | } | 1732 | } |
1721 | 1733 | ||
1722 | /** | 1734 | /** |
@@ -1740,7 +1752,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm, | |||
1740 | *key_tfm = NULL; | 1752 | *key_tfm = NULL; |
1741 | if (*key_size > ECRYPTFS_MAX_KEY_BYTES) { | 1753 | if (*key_size > ECRYPTFS_MAX_KEY_BYTES) { |
1742 | rc = -EINVAL; | 1754 | rc = -EINVAL; |
1743 | printk(KERN_ERR "Requested key size is [%Zd] bytes; maximum " | 1755 | printk(KERN_ERR "Requested key size is [%zd] bytes; maximum " |
1744 | "allowable is [%d]\n", *key_size, ECRYPTFS_MAX_KEY_BYTES); | 1756 | "allowable is [%d]\n", *key_size, ECRYPTFS_MAX_KEY_BYTES); |
1745 | goto out; | 1757 | goto out; |
1746 | } | 1758 | } |
@@ -1765,7 +1777,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm, | |||
1765 | get_random_bytes(dummy_key, *key_size); | 1777 | get_random_bytes(dummy_key, *key_size); |
1766 | rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size); | 1778 | rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size); |
1767 | if (rc) { | 1779 | if (rc) { |
1768 | printk(KERN_ERR "Error attempting to set key of size [%Zd] for " | 1780 | printk(KERN_ERR "Error attempting to set key of size [%zd] for " |
1769 | "cipher [%s]; rc = [%d]\n", *key_size, cipher_name, rc); | 1781 | "cipher [%s]; rc = [%d]\n", *key_size, cipher_name, rc); |
1770 | rc = -EINVAL; | 1782 | rc = -EINVAL; |
1771 | goto out; | 1783 | goto out; |
@@ -1910,3 +1922,341 @@ out: | |||
1910 | mutex_unlock(&key_tfm_list_mutex); | 1922 | mutex_unlock(&key_tfm_list_mutex); |
1911 | return rc; | 1923 | return rc; |
1912 | } | 1924 | } |
1925 | |||
1926 | /* 64 characters forming a 6-bit target field */ | ||
1927 | static unsigned char *portable_filename_chars = ("-.0123456789ABCD" | ||
1928 | "EFGHIJKLMNOPQRST" | ||
1929 | "UVWXYZabcdefghij" | ||
1930 | "klmnopqrstuvwxyz"); | ||
1931 | |||
1932 | /* We could either offset on every reverse map or just pad some 0x00's | ||
1933 | * at the front here */ | ||
1934 | static const unsigned char filename_rev_map[] = { | ||
1935 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */ | ||
1936 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */ | ||
1937 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */ | ||
1938 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 31 */ | ||
1939 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 39 */ | ||
1940 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* 47 */ | ||
1941 | 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, /* 55 */ | ||
1942 | 0x0A, 0x0B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 63 */ | ||
1943 | 0x00, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, /* 71 */ | ||
1944 | 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, /* 79 */ | ||
1945 | 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, /* 87 */ | ||
1946 | 0x23, 0x24, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, /* 95 */ | ||
1947 | 0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */ | ||
1948 | 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */ | ||
1949 | 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */ | ||
1950 | 0x3D, 0x3E, 0x3F | ||
1951 | }; | ||
1952 | |||
1953 | /** | ||
1954 | * ecryptfs_encode_for_filename | ||
1955 | * @dst: Destination location for encoded filename | ||
1956 | * @dst_size: Size of the encoded filename in bytes | ||
1957 | * @src: Source location for the filename to encode | ||
1958 | * @src_size: Size of the source in bytes | ||
1959 | */ | ||
1960 | void ecryptfs_encode_for_filename(unsigned char *dst, size_t *dst_size, | ||
1961 | unsigned char *src, size_t src_size) | ||
1962 | { | ||
1963 | size_t num_blocks; | ||
1964 | size_t block_num = 0; | ||
1965 | size_t dst_offset = 0; | ||
1966 | unsigned char last_block[3]; | ||
1967 | |||
1968 | if (src_size == 0) { | ||
1969 | (*dst_size) = 0; | ||
1970 | goto out; | ||
1971 | } | ||
1972 | num_blocks = (src_size / 3); | ||
1973 | if ((src_size % 3) == 0) { | ||
1974 | memcpy(last_block, (&src[src_size - 3]), 3); | ||
1975 | } else { | ||
1976 | num_blocks++; | ||
1977 | last_block[2] = 0x00; | ||
1978 | switch (src_size % 3) { | ||
1979 | case 1: | ||
1980 | last_block[0] = src[src_size - 1]; | ||
1981 | last_block[1] = 0x00; | ||
1982 | break; | ||
1983 | case 2: | ||
1984 | last_block[0] = src[src_size - 2]; | ||
1985 | last_block[1] = src[src_size - 1]; | ||
1986 | } | ||
1987 | } | ||
1988 | (*dst_size) = (num_blocks * 4); | ||
1989 | if (!dst) | ||
1990 | goto out; | ||
1991 | while (block_num < num_blocks) { | ||
1992 | unsigned char *src_block; | ||
1993 | unsigned char dst_block[4]; | ||
1994 | |||
1995 | if (block_num == (num_blocks - 1)) | ||
1996 | src_block = last_block; | ||
1997 | else | ||
1998 | src_block = &src[block_num * 3]; | ||
1999 | dst_block[0] = ((src_block[0] >> 2) & 0x3F); | ||
2000 | dst_block[1] = (((src_block[0] << 4) & 0x30) | ||
2001 | | ((src_block[1] >> 4) & 0x0F)); | ||
2002 | dst_block[2] = (((src_block[1] << 2) & 0x3C) | ||
2003 | | ((src_block[2] >> 6) & 0x03)); | ||
2004 | dst_block[3] = (src_block[2] & 0x3F); | ||
2005 | dst[dst_offset++] = portable_filename_chars[dst_block[0]]; | ||
2006 | dst[dst_offset++] = portable_filename_chars[dst_block[1]]; | ||
2007 | dst[dst_offset++] = portable_filename_chars[dst_block[2]]; | ||
2008 | dst[dst_offset++] = portable_filename_chars[dst_block[3]]; | ||
2009 | block_num++; | ||
2010 | } | ||
2011 | out: | ||
2012 | return; | ||
2013 | } | ||
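ecryptfs_encode_for_filename() is essentially base64 with a filesystem-safe, sort-friendly alphabet: every 3 input bytes become 4 characters from portable_filename_chars, and a short final block is zero-padded. A stand-alone sketch of one block, mirroring the bit-slicing above (the expected output was worked out by hand and is illustrative only):

#include <stdio.h>

static const char portable_filename_chars[] =
        "-.0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

int main(void)
{
        const unsigned char src[3] = { 'a', 'b', 'c' };
        unsigned char idx[4];

        idx[0] = (src[0] >> 2) & 0x3F;
        idx[1] = ((src[0] << 4) & 0x30) | ((src[1] >> 4) & 0x0F);
        idx[2] = ((src[1] << 2) & 0x3C) | ((src[2] >> 6) & 0x03);
        idx[3] = src[2] & 0x3F;
        /* "abc" encodes to "MK7X" with this alphabet. */
        printf("%c%c%c%c\n", portable_filename_chars[idx[0]],
               portable_filename_chars[idx[1]],
               portable_filename_chars[idx[2]],
               portable_filename_chars[idx[3]]);
        return 0;
}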
2014 | |||
2015 | /** | ||
2016 | * ecryptfs_decode_from_filename | ||
2017 | * @dst: If NULL, this function only sets @dst_size and returns. If | ||
2018 | * non-NULL, this function decodes the encoded octets in @src | ||
2019 | * into the memory that @dst points to. | ||
2020 | * @dst_size: Set to the size of the decoded string. | ||
2021 | * @src: The encoded set of octets to decode. | ||
2022 | * @src_size: The size of the encoded set of octets to decode. | ||
2023 | */ | ||
2024 | static void | ||
2025 | ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size, | ||
2026 | const unsigned char *src, size_t src_size) | ||
2027 | { | ||
2028 | u8 current_bit_offset = 0; | ||
2029 | size_t src_byte_offset = 0; | ||
2030 | size_t dst_byte_offset = 0; | ||
2031 | |||
2032 | if (dst == NULL) { | ||
2033 | /* Not exact; conservatively long. Every block of 4 | ||
2034 | * encoded characters decodes into a block of 3 | ||
2035 | * decoded characters. This segment of code provides | ||
2036 | * the caller with the maximum amount of allocated | ||
2037 | * space that @dst will need to point to in a | ||
2038 | * subsequent call. */ | ||
2039 | (*dst_size) = (((src_size + 1) * 3) / 4); | ||
2040 | goto out; | ||
2041 | } | ||
2042 | while (src_byte_offset < src_size) { | ||
2043 | unsigned char src_byte = | ||
2044 | filename_rev_map[(int)src[src_byte_offset]]; | ||
2045 | |||
2046 | switch (current_bit_offset) { | ||
2047 | case 0: | ||
2048 | dst[dst_byte_offset] = (src_byte << 2); | ||
2049 | current_bit_offset = 6; | ||
2050 | break; | ||
2051 | case 6: | ||
2052 | dst[dst_byte_offset++] |= (src_byte >> 4); | ||
2053 | dst[dst_byte_offset] = ((src_byte & 0xF) | ||
2054 | << 4); | ||
2055 | current_bit_offset = 4; | ||
2056 | break; | ||
2057 | case 4: | ||
2058 | dst[dst_byte_offset++] |= (src_byte >> 2); | ||
2059 | dst[dst_byte_offset] = (src_byte << 6); | ||
2060 | current_bit_offset = 2; | ||
2061 | break; | ||
2062 | case 2: | ||
2063 | dst[dst_byte_offset++] |= (src_byte); | ||
2064 | dst[dst_byte_offset] = 0; | ||
2065 | current_bit_offset = 0; | ||
2066 | break; | ||
2067 | } | ||
2068 | src_byte_offset++; | ||
2069 | } | ||
2070 | (*dst_size) = dst_byte_offset; | ||
2071 | out: | ||
2072 | return; | ||
2073 | } | ||
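The decoder reverses the mapping by looking each character up in filename_rev_map and re-packing the 6-bit values, which is why the size estimate above is ((src_size + 1) * 3) / 4. Continuing the small example from the previous note, "MK7X" decodes back to "abc"; in the user-space sketch below strchr() into the alphabet stands in for filename_rev_map[]:

#include <stdio.h>
#include <string.h>

static const char portable_filename_chars[] =
        "-.0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

int main(void)
{
        const char *enc = "MK7X";
        unsigned char v[4], dst[3];
        int i;

        /* Recover the 6-bit index of each encoded character. */
        for (i = 0; i < 4; i++)
                v[i] = (unsigned char)(strchr(portable_filename_chars, enc[i])
                                       - portable_filename_chars);
        dst[0] = ((v[0] << 2) | (v[1] >> 4)) & 0xFF;
        dst[1] = ((v[1] << 4) | (v[2] >> 2)) & 0xFF;
        dst[2] = ((v[2] << 6) | v[3]) & 0xFF;
        printf("%c%c%c\n", dst[0], dst[1], dst[2]);     /* prints "abc" */
        return 0;
}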
2074 | |||
2075 | /** | ||
2076 | * ecryptfs_encrypt_and_encode_filename - converts a plaintext file name to cipher text | ||
2077 | * @crypt_stat: The crypt_stat struct associated with the file name to encode | ||
2078 | * @name: The plaintext name | ||
2079 | * @name_size: The length of the plaintext name | ||
2080 | * @encoded_name: Where to return the encrypted and encoded name | ||
2081 | * | ||
2082 | * Encrypts and encodes a filename into something that constitutes a | ||
2083 | * valid filename for a filesystem, with printable characters. | ||
2084 | * | ||
2085 | * We assume that we have a properly initialized crypto context, | ||
2086 | * pointed to by crypt_stat->tfm. | ||
2087 | * | ||
2088 | * Returns zero on success; non-zero otherwise | ||
2089 | */ | ||
2090 | int ecryptfs_encrypt_and_encode_filename( | ||
2091 | char **encoded_name, | ||
2092 | size_t *encoded_name_size, | ||
2093 | struct ecryptfs_crypt_stat *crypt_stat, | ||
2094 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat, | ||
2095 | const char *name, size_t name_size) | ||
2096 | { | ||
2097 | size_t encoded_name_no_prefix_size; | ||
2098 | int rc = 0; | ||
2099 | |||
2100 | (*encoded_name) = NULL; | ||
2101 | (*encoded_name_size) = 0; | ||
2102 | if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES)) | ||
2103 | || (mount_crypt_stat && (mount_crypt_stat->flags | ||
2104 | & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) { | ||
2105 | struct ecryptfs_filename *filename; | ||
2106 | |||
2107 | filename = kzalloc(sizeof(*filename), GFP_KERNEL); | ||
2108 | if (!filename) { | ||
2109 | printk(KERN_ERR "%s: Out of memory whilst attempting " | ||
2110 | "to kzalloc [%zd] bytes\n", __func__, | ||
2111 | sizeof(*filename)); | ||
2112 | rc = -ENOMEM; | ||
2113 | goto out; | ||
2114 | } | ||
2115 | filename->filename = (char *)name; | ||
2116 | filename->filename_size = name_size; | ||
2117 | rc = ecryptfs_encrypt_filename(filename, crypt_stat, | ||
2118 | mount_crypt_stat); | ||
2119 | if (rc) { | ||
2120 | printk(KERN_ERR "%s: Error attempting to encrypt " | ||
2121 | "filename; rc = [%d]\n", __func__, rc); | ||
2122 | kfree(filename); | ||
2123 | goto out; | ||
2124 | } | ||
2125 | ecryptfs_encode_for_filename( | ||
2126 | NULL, &encoded_name_no_prefix_size, | ||
2127 | filename->encrypted_filename, | ||
2128 | filename->encrypted_filename_size); | ||
2129 | if ((crypt_stat && (crypt_stat->flags | ||
2130 | & ECRYPTFS_ENCFN_USE_MOUNT_FNEK)) | ||
2131 | || (mount_crypt_stat | ||
2132 | && (mount_crypt_stat->flags | ||
2133 | & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) | ||
2134 | (*encoded_name_size) = | ||
2135 | (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE | ||
2136 | + encoded_name_no_prefix_size); | ||
2137 | else | ||
2138 | (*encoded_name_size) = | ||
2139 | (ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE | ||
2140 | + encoded_name_no_prefix_size); | ||
2141 | (*encoded_name) = kmalloc((*encoded_name_size) + 1, GFP_KERNEL); | ||
2142 | if (!(*encoded_name)) { | ||
2143 | printk(KERN_ERR "%s: Out of memory whilst attempting " | ||
2144 | "to kzalloc [%zd] bytes\n", __func__, | ||
2145 | (*encoded_name_size)); | ||
2146 | rc = -ENOMEM; | ||
2147 | kfree(filename->encrypted_filename); | ||
2148 | kfree(filename); | ||
2149 | goto out; | ||
2150 | } | ||
2151 | if ((crypt_stat && (crypt_stat->flags | ||
2152 | & ECRYPTFS_ENCFN_USE_MOUNT_FNEK)) | ||
2153 | || (mount_crypt_stat | ||
2154 | && (mount_crypt_stat->flags | ||
2155 | & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) { | ||
2156 | memcpy((*encoded_name), | ||
2157 | ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX, | ||
2158 | ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE); | ||
2159 | ecryptfs_encode_for_filename( | ||
2160 | ((*encoded_name) | ||
2161 | + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE), | ||
2162 | &encoded_name_no_prefix_size, | ||
2163 | filename->encrypted_filename, | ||
2164 | filename->encrypted_filename_size); | ||
2165 | (*encoded_name_size) = | ||
2166 | (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE | ||
2167 | + encoded_name_no_prefix_size); | ||
2168 | (*encoded_name)[(*encoded_name_size)] = '\0'; | ||
2169 | (*encoded_name_size)++; | ||
2170 | } else { | ||
2171 | rc = -ENOTSUPP; | ||
2172 | } | ||
2173 | if (rc) { | ||
2174 | printk(KERN_ERR "%s: Error attempting to encode " | ||
2175 | "encrypted filename; rc = [%d]\n", __func__, | ||
2176 | rc); | ||
2177 | kfree((*encoded_name)); | ||
2178 | (*encoded_name) = NULL; | ||
2179 | (*encoded_name_size) = 0; | ||
2180 | } | ||
2181 | kfree(filename->encrypted_filename); | ||
2182 | kfree(filename); | ||
2183 | } else { | ||
2184 | rc = ecryptfs_copy_filename(encoded_name, | ||
2185 | encoded_name_size, | ||
2186 | name, name_size); | ||
2187 | } | ||
2188 | out: | ||
2189 | return rc; | ||
2190 | } | ||
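The typical caller of the encode side is the lookup path, which needs the encrypted-and-encoded name to find the corresponding lower dentry. A heavily simplified, hypothetical sketch of such a call (error handling and the actual lower lookup elided; the function name is invented):

#include "ecryptfs_kernel.h"

static int example_build_lower_name(struct dentry *dir_dentry,
                                    const char *name, size_t name_size)
{
        struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
                &ecryptfs_superblock_to_private(dir_dentry->d_sb)->mount_crypt_stat;
        char *encrypted_and_encoded_name = NULL;
        size_t len = 0;
        int rc;

        rc = ecryptfs_encrypt_and_encode_filename(&encrypted_and_encoded_name,
                                                  &len, NULL, mount_crypt_stat,
                                                  name, name_size);
        if (rc)
                return rc;
        /* ... look the encoded name up in the lower directory ... */
        kfree(encrypted_and_encoded_name);
        return 0;
}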
2191 | |||
2192 | /** | ||
2193 | * ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext | ||
2194 | * @plaintext_name: The plaintext name | ||
2195 | * @plaintext_name_size: The plaintext name size | ||
2196 | * @ecryptfs_dir_dentry: eCryptfs directory dentry | ||
2197 | * @name: The filename in cipher text | ||
2198 | * @name_size: The cipher text name size | ||
2199 | * | ||
2200 | * Decrypts and decodes the filename. | ||
2201 | * | ||
2202 | * Returns zero on success; non-zero otherwise | ||
2203 | */ | ||
2204 | int ecryptfs_decode_and_decrypt_filename(char **plaintext_name, | ||
2205 | size_t *plaintext_name_size, | ||
2206 | struct dentry *ecryptfs_dir_dentry, | ||
2207 | const char *name, size_t name_size) | ||
2208 | { | ||
2209 | char *decoded_name; | ||
2210 | size_t decoded_name_size; | ||
2211 | size_t packet_size; | ||
2212 | int rc = 0; | ||
2213 | |||
2214 | if ((name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) | ||
2215 | && (strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX, | ||
2216 | ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) == 0)) { | ||
2217 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat = | ||
2218 | &ecryptfs_superblock_to_private( | ||
2219 | ecryptfs_dir_dentry->d_sb)->mount_crypt_stat; | ||
2220 | const char *orig_name = name; | ||
2221 | size_t orig_name_size = name_size; | ||
2222 | |||
2223 | name += ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE; | ||
2224 | name_size -= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE; | ||
2225 | ecryptfs_decode_from_filename(NULL, &decoded_name_size, | ||
2226 | name, name_size); | ||
2227 | decoded_name = kmalloc(decoded_name_size, GFP_KERNEL); | ||
2228 | if (!decoded_name) { | ||
2229 | printk(KERN_ERR "%s: Out of memory whilst attempting " | ||
2230 | "to kmalloc [%zd] bytes\n", __func__, | ||
2231 | decoded_name_size); | ||
2232 | rc = -ENOMEM; | ||
2233 | goto out; | ||
2234 | } | ||
2235 | ecryptfs_decode_from_filename(decoded_name, &decoded_name_size, | ||
2236 | name, name_size); | ||
2237 | rc = ecryptfs_parse_tag_70_packet(plaintext_name, | ||
2238 | plaintext_name_size, | ||
2239 | &packet_size, | ||
2240 | mount_crypt_stat, | ||
2241 | decoded_name, | ||
2242 | decoded_name_size); | ||
2243 | if (rc) { | ||
2244 | printk(KERN_INFO "%s: Could not parse tag 70 packet " | ||
2245 | "from filename; copying through filename " | ||
2246 | "as-is\n", __func__); | ||
2247 | rc = ecryptfs_copy_filename(plaintext_name, | ||
2248 | plaintext_name_size, | ||
2249 | orig_name, orig_name_size); | ||
2250 | goto out_free; | ||
2251 | } | ||
2252 | } else { | ||
2253 | rc = ecryptfs_copy_filename(plaintext_name, | ||
2254 | plaintext_name_size, | ||
2255 | name, name_size); | ||
2256 | goto out; | ||
2257 | } | ||
2258 | out_free: | ||
2259 | kfree(decoded_name); | ||
2260 | out: | ||
2261 | return rc; | ||
2262 | } | ||
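The decode path above keys entirely off the literal "ECRYPTFS_FNEK_ENCRYPTED." dentry-name prefix: only lower names carrying it are base-decoded and handed to the tag 70 parser, while everything else is copied through unchanged. Below is a minimal userspace sketch of that dispatch; decode_payload() is a hypothetical stand-in for ecryptfs_decode_from_filename() plus ecryptfs_parse_tag_70_packet() and here merely duplicates the payload so the sketch stays self-contained.

    #include <errno.h>
    #include <stdlib.h>
    #include <string.h>

    #define FNEK_PREFIX      "ECRYPTFS_FNEK_ENCRYPTED."
    #define FNEK_PREFIX_SIZE (sizeof(FNEK_PREFIX) - 1)   /* 24, as in the header */

    /* Hypothetical stand-in for decode-from-filename plus tag 70 parsing;
     * here it simply duplicates the payload. */
    static int decode_payload(char **plain, size_t *plain_size,
                              const char *payload, size_t payload_size)
    {
            *plain = strndup(payload, payload_size);
            *plain_size = payload_size;
            return *plain ? 0 : -ENOMEM;
    }

    /* Lower names carrying the FNEK prefix get decoded; all others pass through. */
    int demo_decode_name(char **plain, size_t *plain_size,
                         const char *lower_name, size_t lower_size)
    {
            if (lower_size > FNEK_PREFIX_SIZE &&
                !strncmp(lower_name, FNEK_PREFIX, FNEK_PREFIX_SIZE))
                    return decode_payload(plain, plain_size,
                                          lower_name + FNEK_PREFIX_SIZE,
                                          lower_size - FNEK_PREFIX_SIZE);
            *plain = strndup(lower_name, lower_size);
            *plain_size = lower_size;
            return *plain ? 0 : -ENOMEM;
    }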
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index a75026d35d16..c11fc95714ab 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h | |||
@@ -51,12 +51,16 @@ | |||
51 | #define ECRYPTFS_VERSIONING_XATTR 0x00000010 | 51 | #define ECRYPTFS_VERSIONING_XATTR 0x00000010 |
52 | #define ECRYPTFS_VERSIONING_MULTKEY 0x00000020 | 52 | #define ECRYPTFS_VERSIONING_MULTKEY 0x00000020 |
53 | #define ECRYPTFS_VERSIONING_DEVMISC 0x00000040 | 53 | #define ECRYPTFS_VERSIONING_DEVMISC 0x00000040 |
54 | #define ECRYPTFS_VERSIONING_HMAC 0x00000080 | ||
55 | #define ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION 0x00000100 | ||
56 | #define ECRYPTFS_VERSIONING_GCM 0x00000200 | ||
54 | #define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \ | 57 | #define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \ |
55 | | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \ | 58 | | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \ |
56 | | ECRYPTFS_VERSIONING_PUBKEY \ | 59 | | ECRYPTFS_VERSIONING_PUBKEY \ |
57 | | ECRYPTFS_VERSIONING_XATTR \ | 60 | | ECRYPTFS_VERSIONING_XATTR \ |
58 | | ECRYPTFS_VERSIONING_MULTKEY \ | 61 | | ECRYPTFS_VERSIONING_MULTKEY \ |
59 | | ECRYPTFS_VERSIONING_DEVMISC) | 62 | | ECRYPTFS_VERSIONING_DEVMISC \ |
63 | | ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION) | ||
60 | #define ECRYPTFS_MAX_PASSWORD_LENGTH 64 | 64 | #define ECRYPTFS_MAX_PASSWORD_LENGTH 64 |
61 | #define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH | 65 | #define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH |
62 | #define ECRYPTFS_SALT_SIZE 8 | 66 | #define ECRYPTFS_SALT_SIZE 8 |
@@ -199,6 +203,7 @@ ecryptfs_get_key_payload_data(struct key *key) | |||
199 | #define ECRYPTFS_DEFAULT_CIPHER "aes" | 203 | #define ECRYPTFS_DEFAULT_CIPHER "aes" |
200 | #define ECRYPTFS_DEFAULT_KEY_BYTES 16 | 204 | #define ECRYPTFS_DEFAULT_KEY_BYTES 16 |
201 | #define ECRYPTFS_DEFAULT_HASH "md5" | 205 | #define ECRYPTFS_DEFAULT_HASH "md5" |
206 | #define ECRYPTFS_TAG_70_DIGEST ECRYPTFS_DEFAULT_HASH | ||
202 | #define ECRYPTFS_TAG_1_PACKET_TYPE 0x01 | 207 | #define ECRYPTFS_TAG_1_PACKET_TYPE 0x01 |
203 | #define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C | 208 | #define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C |
204 | #define ECRYPTFS_TAG_11_PACKET_TYPE 0xED | 209 | #define ECRYPTFS_TAG_11_PACKET_TYPE 0xED |
@@ -206,30 +211,64 @@ ecryptfs_get_key_payload_data(struct key *key) | |||
206 | #define ECRYPTFS_TAG_65_PACKET_TYPE 0x41 | 211 | #define ECRYPTFS_TAG_65_PACKET_TYPE 0x41 |
207 | #define ECRYPTFS_TAG_66_PACKET_TYPE 0x42 | 212 | #define ECRYPTFS_TAG_66_PACKET_TYPE 0x42 |
208 | #define ECRYPTFS_TAG_67_PACKET_TYPE 0x43 | 213 | #define ECRYPTFS_TAG_67_PACKET_TYPE 0x43 |
214 | #define ECRYPTFS_TAG_70_PACKET_TYPE 0x46 /* FNEK-encrypted filename | ||
215 | * as dentry name */ | ||
216 | #define ECRYPTFS_TAG_71_PACKET_TYPE 0x47 /* FNEK-encrypted filename in | ||
217 | * metadata */ | ||
218 | #define ECRYPTFS_TAG_72_PACKET_TYPE 0x48 /* FEK-encrypted filename as | ||
219 | * dentry name */ | ||
220 | #define ECRYPTFS_TAG_73_PACKET_TYPE 0x49 /* FEK-encrypted filename as | ||
221 | * metadata */ | ||
222 | /* Constraint: ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES >= | ||
223 | * ECRYPTFS_MAX_IV_BYTES */ | ||
224 | #define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16 | ||
225 | #define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */ | ||
209 | #define MD5_DIGEST_SIZE 16 | 226 | #define MD5_DIGEST_SIZE 16 |
227 | #define ECRYPTFS_TAG_70_DIGEST_SIZE MD5_DIGEST_SIZE | ||
228 | #define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FEK_ENCRYPTED." | ||
229 | #define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE 23 | ||
230 | #define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FNEK_ENCRYPTED." | ||
231 | #define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE 24 | ||
232 | #define ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN (18 + 1 + 4 + 1 + 32) | ||
210 | 233 | ||
211 | struct ecryptfs_key_sig { | 234 | struct ecryptfs_key_sig { |
212 | struct list_head crypt_stat_list; | 235 | struct list_head crypt_stat_list; |
213 | char keysig[ECRYPTFS_SIG_SIZE_HEX]; | 236 | char keysig[ECRYPTFS_SIG_SIZE_HEX]; |
214 | }; | 237 | }; |
215 | 238 | ||
239 | struct ecryptfs_filename { | ||
240 | struct list_head crypt_stat_list; | ||
241 | #define ECRYPTFS_FILENAME_CONTAINS_DECRYPTED 0x00000001 | ||
242 | u32 flags; | ||
243 | u32 seq_no; | ||
244 | char *filename; | ||
245 | char *encrypted_filename; | ||
246 | size_t filename_size; | ||
247 | size_t encrypted_filename_size; | ||
248 | char fnek_sig[ECRYPTFS_SIG_SIZE_HEX]; | ||
249 | char dentry_name[ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN + 1]; | ||
250 | }; | ||
251 | |||
216 | /** | 252 | /** |
217 | * This is the primary struct associated with each encrypted file. | 253 | * This is the primary struct associated with each encrypted file. |
218 | * | 254 | * |
219 | * TODO: cache align/pack? | 255 | * TODO: cache align/pack? |
220 | */ | 256 | */ |
221 | struct ecryptfs_crypt_stat { | 257 | struct ecryptfs_crypt_stat { |
222 | #define ECRYPTFS_STRUCT_INITIALIZED 0x00000001 | 258 | #define ECRYPTFS_STRUCT_INITIALIZED 0x00000001 |
223 | #define ECRYPTFS_POLICY_APPLIED 0x00000002 | 259 | #define ECRYPTFS_POLICY_APPLIED 0x00000002 |
224 | #define ECRYPTFS_NEW_FILE 0x00000004 | 260 | #define ECRYPTFS_NEW_FILE 0x00000004 |
225 | #define ECRYPTFS_ENCRYPTED 0x00000008 | 261 | #define ECRYPTFS_ENCRYPTED 0x00000008 |
226 | #define ECRYPTFS_SECURITY_WARNING 0x00000010 | 262 | #define ECRYPTFS_SECURITY_WARNING 0x00000010 |
227 | #define ECRYPTFS_ENABLE_HMAC 0x00000020 | 263 | #define ECRYPTFS_ENABLE_HMAC 0x00000020 |
228 | #define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000040 | 264 | #define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000040 |
229 | #define ECRYPTFS_KEY_VALID 0x00000080 | 265 | #define ECRYPTFS_KEY_VALID 0x00000080 |
230 | #define ECRYPTFS_METADATA_IN_XATTR 0x00000100 | 266 | #define ECRYPTFS_METADATA_IN_XATTR 0x00000100 |
231 | #define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000200 | 267 | #define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000200 |
232 | #define ECRYPTFS_KEY_SET 0x00000400 | 268 | #define ECRYPTFS_KEY_SET 0x00000400 |
269 | #define ECRYPTFS_ENCRYPT_FILENAMES 0x00000800 | ||
270 | #define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00001000 | ||
271 | #define ECRYPTFS_ENCFN_USE_FEK 0x00002000 | ||
233 | u32 flags; | 272 | u32 flags; |
234 | unsigned int file_version; | 273 | unsigned int file_version; |
235 | size_t iv_bytes; | 274 | size_t iv_bytes; |
@@ -332,13 +371,20 @@ struct ecryptfs_mount_crypt_stat { | |||
332 | #define ECRYPTFS_XATTR_METADATA_ENABLED 0x00000002 | 371 | #define ECRYPTFS_XATTR_METADATA_ENABLED 0x00000002 |
333 | #define ECRYPTFS_ENCRYPTED_VIEW_ENABLED 0x00000004 | 372 | #define ECRYPTFS_ENCRYPTED_VIEW_ENABLED 0x00000004 |
334 | #define ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED 0x00000008 | 373 | #define ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED 0x00000008 |
374 | #define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES 0x00000010 | ||
375 | #define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK 0x00000020 | ||
376 | #define ECRYPTFS_GLOBAL_ENCFN_USE_FEK 0x00000040 | ||
335 | u32 flags; | 377 | u32 flags; |
336 | struct list_head global_auth_tok_list; | 378 | struct list_head global_auth_tok_list; |
337 | struct mutex global_auth_tok_list_mutex; | 379 | struct mutex global_auth_tok_list_mutex; |
338 | size_t num_global_auth_toks; | 380 | size_t num_global_auth_toks; |
339 | size_t global_default_cipher_key_size; | 381 | size_t global_default_cipher_key_size; |
382 | size_t global_default_fn_cipher_key_bytes; | ||
340 | unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE | 383 | unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE |
341 | + 1]; | 384 | + 1]; |
385 | unsigned char global_default_fn_cipher_name[ | ||
386 | ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; | ||
387 | char global_default_fnek_sig[ECRYPTFS_SIG_SIZE_HEX + 1]; | ||
342 | }; | 388 | }; |
343 | 389 | ||
344 | /* superblock private data. */ | 390 | /* superblock private data. */ |
@@ -571,13 +617,22 @@ struct ecryptfs_open_req { | |||
571 | int ecryptfs_interpose(struct dentry *hidden_dentry, | 617 | int ecryptfs_interpose(struct dentry *hidden_dentry, |
572 | struct dentry *this_dentry, struct super_block *sb, | 618 | struct dentry *this_dentry, struct super_block *sb, |
573 | u32 flags); | 619 | u32 flags); |
620 | int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, | ||
621 | struct dentry *lower_dentry, | ||
622 | struct ecryptfs_crypt_stat *crypt_stat, | ||
623 | struct inode *ecryptfs_dir_inode, | ||
624 | struct nameidata *ecryptfs_nd); | ||
625 | int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, | ||
626 | size_t *decrypted_name_size, | ||
627 | struct dentry *ecryptfs_dentry, | ||
628 | const char *name, size_t name_size); | ||
574 | int ecryptfs_fill_zeros(struct file *file, loff_t new_length); | 629 | int ecryptfs_fill_zeros(struct file *file, loff_t new_length); |
575 | int ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat, | 630 | int ecryptfs_encrypt_and_encode_filename( |
576 | const char *name, int length, | 631 | char **encoded_name, |
577 | char **decrypted_name); | 632 | size_t *encoded_name_size, |
578 | int ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat, | 633 | struct ecryptfs_crypt_stat *crypt_stat, |
579 | const char *name, int length, | 634 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat, |
580 | char **encoded_name); | 635 | const char *name, size_t name_size); |
581 | struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry); | 636 | struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry); |
582 | void ecryptfs_dump_hex(char *data, int bytes); | 637 | void ecryptfs_dump_hex(char *data, int bytes); |
583 | int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, | 638 | int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, |
@@ -599,7 +654,7 @@ int ecryptfs_read_and_validate_header_region(char *data, | |||
599 | struct inode *ecryptfs_inode); | 654 | struct inode *ecryptfs_inode); |
600 | int ecryptfs_read_and_validate_xattr_region(char *page_virt, | 655 | int ecryptfs_read_and_validate_xattr_region(char *page_virt, |
601 | struct dentry *ecryptfs_dentry); | 656 | struct dentry *ecryptfs_dentry); |
602 | u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat); | 657 | u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes); |
603 | int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code); | 658 | int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code); |
604 | void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat); | 659 | void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat); |
605 | int ecryptfs_generate_key_packet_set(char *dest_base, | 660 | int ecryptfs_generate_key_packet_set(char *dest_base, |
@@ -694,5 +749,17 @@ int ecryptfs_privileged_open(struct file **lower_file, | |||
694 | struct vfsmount *lower_mnt, | 749 | struct vfsmount *lower_mnt, |
695 | const struct cred *cred); | 750 | const struct cred *cred); |
696 | int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry); | 751 | int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry); |
752 | int | ||
753 | ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, | ||
754 | size_t *packet_size, | ||
755 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat, | ||
756 | char *filename, size_t filename_size); | ||
757 | int | ||
758 | ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, | ||
759 | size_t *packet_size, | ||
760 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat, | ||
761 | char *data, size_t max_packet_size); | ||
762 | int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, | ||
763 | loff_t offset); | ||
697 | 764 | ||
698 | #endif /* #ifndef ECRYPTFS_KERNEL_H */ | 765 | #endif /* #ifndef ECRYPTFS_KERNEL_H */ |
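The header now tracks filename-encryption policy in two places: per-inode flags on ecryptfs_crypt_stat (ECRYPTFS_ENCRYPT_FILENAMES, ECRYPTFS_ENCFN_USE_MOUNT_FNEK, ECRYPTFS_ENCFN_USE_FEK) and mount-wide counterparts on ecryptfs_mount_crypt_stat. The lookup and encode paths gate on either source. A hedged sketch of that either/or test, using trimmed-down stand-in structures rather than the real ones:

    #include <stdbool.h>

    /* Trimmed-down stand-ins for the real crypt_stat structures. */
    struct demo_crypt_stat       { unsigned int flags; };
    struct demo_mount_crypt_stat { unsigned int flags; };

    #define DEMO_ENCRYPT_FILENAMES        0x00000800   /* per-inode flag */
    #define DEMO_GLOBAL_ENCRYPT_FILENAMES 0x00000010   /* mount-wide flag */

    /* Filename encryption applies if either the inode or the mount asks for it;
     * this mirrors only the either/or test, not the FNEK-vs-FEK policy choice. */
    static bool demo_filenames_encrypted(const struct demo_crypt_stat *cs,
                                         const struct demo_mount_crypt_stat *mcs)
    {
            return (cs && (cs->flags & DEMO_ENCRYPT_FILENAMES)) ||
                   (mcs && (mcs->flags & DEMO_GLOBAL_ENCRYPT_FILENAMES));
    }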
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index eb3dc4c7ac06..9e944057001b 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c | |||
@@ -77,27 +77,27 @@ struct ecryptfs_getdents_callback { | |||
77 | 77 | ||
78 | /* Inspired by generic filldir in fs/readdir.c */ | 78 | /* Inspired by generic filldir in fs/readdir.c */ |
79 | static int | 79 | static int |
80 | ecryptfs_filldir(void *dirent, const char *name, int namelen, loff_t offset, | 80 | ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen, |
81 | u64 ino, unsigned int d_type) | 81 | loff_t offset, u64 ino, unsigned int d_type) |
82 | { | 82 | { |
83 | struct ecryptfs_crypt_stat *crypt_stat; | ||
84 | struct ecryptfs_getdents_callback *buf = | 83 | struct ecryptfs_getdents_callback *buf = |
85 | (struct ecryptfs_getdents_callback *)dirent; | 84 | (struct ecryptfs_getdents_callback *)dirent; |
85 | size_t name_size; | ||
86 | char *name; | ||
86 | int rc; | 87 | int rc; |
87 | int decoded_length; | ||
88 | char *decoded_name; | ||
89 | 88 | ||
90 | crypt_stat = ecryptfs_dentry_to_private(buf->dentry)->crypt_stat; | ||
91 | buf->filldir_called++; | 89 | buf->filldir_called++; |
92 | decoded_length = ecryptfs_decode_filename(crypt_stat, name, namelen, | 90 | rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size, |
93 | &decoded_name); | 91 | buf->dentry, lower_name, |
94 | if (decoded_length < 0) { | 92 | lower_namelen); |
95 | rc = decoded_length; | 93 | if (rc) { |
94 | printk(KERN_ERR "%s: Error attempting to decode and decrypt " | ||
95 | "filename [%s]; rc = [%d]\n", __func__, lower_name, | ||
96 | rc); | ||
96 | goto out; | 97 | goto out; |
97 | } | 98 | } |
98 | rc = buf->filldir(buf->dirent, decoded_name, decoded_length, offset, | 99 | rc = buf->filldir(buf->dirent, name, name_size, offset, ino, d_type); |
99 | ino, d_type); | 100 | kfree(name); |
100 | kfree(decoded_name); | ||
101 | if (rc >= 0) | 101 | if (rc >= 0) |
102 | buf->entries_written++; | 102 | buf->entries_written++; |
103 | out: | 103 | out: |
@@ -106,8 +106,8 @@ out: | |||
106 | 106 | ||
107 | /** | 107 | /** |
108 | * ecryptfs_readdir | 108 | * ecryptfs_readdir |
109 | * @file: The ecryptfs file struct | 109 | * @file: The eCryptfs directory file |
110 | * @dirent: Directory entry | 110 | * @dirent: Directory entry handle |
111 | * @filldir: The filldir callback function | 111 | * @filldir: The filldir callback function |
112 | */ | 112 | */ |
113 | static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir) | 113 | static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir) |
@@ -275,18 +275,9 @@ static int ecryptfs_release(struct inode *inode, struct file *file) | |||
275 | static int | 275 | static int |
276 | ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync) | 276 | ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync) |
277 | { | 277 | { |
278 | struct file *lower_file = ecryptfs_file_to_lower(file); | 278 | return vfs_fsync(ecryptfs_file_to_lower(file), |
279 | struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); | 279 | ecryptfs_dentry_to_lower(dentry), |
280 | struct inode *lower_inode = lower_dentry->d_inode; | 280 | datasync); |
281 | int rc = -EINVAL; | ||
282 | |||
283 | if (lower_inode->i_fop->fsync) { | ||
284 | mutex_lock(&lower_inode->i_mutex); | ||
285 | rc = lower_inode->i_fop->fsync(lower_file, lower_dentry, | ||
286 | datasync); | ||
287 | mutex_unlock(&lower_inode->i_mutex); | ||
288 | } | ||
289 | return rc; | ||
290 | } | 281 | } |
291 | 282 | ||
292 | static int ecryptfs_fasync(int fd, struct file *file, int flag) | 283 | static int ecryptfs_fasync(int fd, struct file *file, int flag) |
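The readdir change follows the usual stacked-filesystem pattern: the lower directory iteration is reused, but every lower name is translated before the caller's filldir sees it. A rough sketch of that callback-wrapping pattern with simplified, hypothetical types (the real code uses the kernel's filldir_t signature and ecryptfs_decode_and_decrypt_filename()):

    #include <stddef.h>
    #include <stdlib.h>
    #include <string.h>

    typedef int (*demo_filldir_t)(void *ctx, const char *name, size_t len);

    struct demo_getdents_ctx {
            demo_filldir_t real_filldir;   /* the caller's callback */
            void *real_ctx;
    };

    /* Hypothetical stand-in for decode-and-decrypt; here a plain copy. */
    static int demo_translate_name(char **out, size_t *out_len,
                                   const char *lower, size_t lower_len)
    {
            *out = strndup(lower, lower_len);
            *out_len = lower_len;
            return *out ? 0 : -1;
    }

    /* Wrapper invoked once per lower directory entry: translate, delegate, free. */
    static int demo_filldir(void *ctx, const char *lower, size_t lower_len)
    {
            struct demo_getdents_ctx *buf = ctx;
            char *name;
            size_t len;
            int rc;

            rc = demo_translate_name(&name, &len, lower, lower_len);
            if (rc)
                    return rc;
            rc = buf->real_filldir(buf->real_ctx, name, len);
            free(name);
            return rc;
    }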
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 5e78fc179886..5697899a168d 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c | |||
@@ -52,8 +52,7 @@ static void unlock_dir(struct dentry *dir) | |||
52 | /** | 52 | /** |
53 | * ecryptfs_create_underlying_file | 53 | * ecryptfs_create_underlying_file |
54 | * @lower_dir_inode: inode of the parent in the lower fs of the new file | 54 | * @lower_dir_inode: inode of the parent in the lower fs of the new file |
55 | * @lower_dentry: New file's dentry in the lower fs | 55 | * @dentry: New file's dentry |
56 | * @ecryptfs_dentry: New file's dentry in ecryptfs | ||
57 | * @mode: The mode of the new file | 56 | * @mode: The mode of the new file |
58 | * @nd: nameidata of ecryptfs' parent's dentry & vfsmount | 57 | * @nd: nameidata of ecryptfs' parent's dentry & vfsmount |
59 | * | 58 | * |
@@ -228,8 +227,7 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, | |||
228 | { | 227 | { |
229 | int rc; | 228 | int rc; |
230 | 229 | ||
231 | /* ecryptfs_do_create() calls ecryptfs_interpose(), which opens | 230 | /* ecryptfs_do_create() calls ecryptfs_interpose() */ |
232 | * the crypt_stat->lower_file (persistent file) */ | ||
233 | rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode, nd); | 231 | rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode, nd); |
234 | if (unlikely(rc)) { | 232 | if (unlikely(rc)) { |
235 | ecryptfs_printk(KERN_WARNING, "Failed to create file in" | 233 | ecryptfs_printk(KERN_WARNING, "Failed to create file in" |
@@ -244,141 +242,91 @@ out: | |||
244 | } | 242 | } |
245 | 243 | ||
246 | /** | 244 | /** |
247 | * ecryptfs_lookup | 245 | * ecryptfs_lookup_and_interpose_lower - Perform a lookup |
248 | * @dir: inode | ||
249 | * @dentry: The dentry | ||
250 | * @nd: nameidata, may be NULL | ||
251 | * | ||
252 | * Find a file on disk. If the file does not exist, then we'll add it to the | ||
253 | * dentry cache and continue on to read it from the disk. | ||
254 | */ | 246 | */ |
255 | static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry, | 247 | int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, |
256 | struct nameidata *nd) | 248 | struct dentry *lower_dentry, |
249 | struct ecryptfs_crypt_stat *crypt_stat, | ||
250 | struct inode *ecryptfs_dir_inode, | ||
251 | struct nameidata *ecryptfs_nd) | ||
257 | { | 252 | { |
258 | int rc = 0; | ||
259 | struct dentry *lower_dir_dentry; | 253 | struct dentry *lower_dir_dentry; |
260 | struct dentry *lower_dentry; | ||
261 | struct vfsmount *lower_mnt; | 254 | struct vfsmount *lower_mnt; |
262 | char *encoded_name; | 255 | struct inode *lower_inode; |
263 | int encoded_namelen; | ||
264 | struct ecryptfs_crypt_stat *crypt_stat = NULL; | ||
265 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat; | 256 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat; |
266 | char *page_virt = NULL; | 257 | char *page_virt = NULL; |
267 | struct inode *lower_inode; | ||
268 | u64 file_size; | 258 | u64 file_size; |
259 | int rc = 0; | ||
269 | 260 | ||
270 | lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent); | 261 | lower_dir_dentry = lower_dentry->d_parent; |
271 | dentry->d_op = &ecryptfs_dops; | 262 | lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt( |
272 | if ((dentry->d_name.len == 1 && !strcmp(dentry->d_name.name, ".")) | 263 | ecryptfs_dentry->d_parent)); |
273 | || (dentry->d_name.len == 2 | ||
274 | && !strcmp(dentry->d_name.name, ".."))) { | ||
275 | d_drop(dentry); | ||
276 | goto out; | ||
277 | } | ||
278 | encoded_namelen = ecryptfs_encode_filename(crypt_stat, | ||
279 | dentry->d_name.name, | ||
280 | dentry->d_name.len, | ||
281 | &encoded_name); | ||
282 | if (encoded_namelen < 0) { | ||
283 | rc = encoded_namelen; | ||
284 | d_drop(dentry); | ||
285 | goto out; | ||
286 | } | ||
287 | ecryptfs_printk(KERN_DEBUG, "encoded_name = [%s]; encoded_namelen " | ||
288 | "= [%d]\n", encoded_name, encoded_namelen); | ||
289 | lower_dentry = lookup_one_len(encoded_name, lower_dir_dentry, | ||
290 | encoded_namelen - 1); | ||
291 | kfree(encoded_name); | ||
292 | if (IS_ERR(lower_dentry)) { | ||
293 | ecryptfs_printk(KERN_ERR, "ERR from lower_dentry\n"); | ||
294 | rc = PTR_ERR(lower_dentry); | ||
295 | d_drop(dentry); | ||
296 | goto out; | ||
297 | } | ||
298 | lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); | ||
299 | ecryptfs_printk(KERN_DEBUG, "lower_dentry = [%p]; lower_dentry->" | ||
300 | "d_name.name = [%s]\n", lower_dentry, | ||
301 | lower_dentry->d_name.name); | ||
302 | lower_inode = lower_dentry->d_inode; | 264 | lower_inode = lower_dentry->d_inode; |
303 | fsstack_copy_attr_atime(dir, lower_dir_dentry->d_inode); | 265 | fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode); |
304 | BUG_ON(!atomic_read(&lower_dentry->d_count)); | 266 | BUG_ON(!atomic_read(&lower_dentry->d_count)); |
305 | ecryptfs_set_dentry_private(dentry, | 267 | ecryptfs_set_dentry_private(ecryptfs_dentry, |
306 | kmem_cache_alloc(ecryptfs_dentry_info_cache, | 268 | kmem_cache_alloc(ecryptfs_dentry_info_cache, |
307 | GFP_KERNEL)); | 269 | GFP_KERNEL)); |
308 | if (!ecryptfs_dentry_to_private(dentry)) { | 270 | if (!ecryptfs_dentry_to_private(ecryptfs_dentry)) { |
309 | rc = -ENOMEM; | 271 | rc = -ENOMEM; |
310 | ecryptfs_printk(KERN_ERR, "Out of memory whilst attempting " | 272 | printk(KERN_ERR "%s: Out of memory whilst attempting " |
311 | "to allocate ecryptfs_dentry_info struct\n"); | 273 | "to allocate ecryptfs_dentry_info struct\n", |
274 | __func__); | ||
312 | goto out_dput; | 275 | goto out_dput; |
313 | } | 276 | } |
314 | ecryptfs_set_dentry_lower(dentry, lower_dentry); | 277 | ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry); |
315 | ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt); | 278 | ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt); |
316 | if (!lower_dentry->d_inode) { | 279 | if (!lower_dentry->d_inode) { |
317 | /* We want to add because we couldn't find in lower */ | 280 | /* We want to add because we couldn't find in lower */ |
318 | d_add(dentry, NULL); | 281 | d_add(ecryptfs_dentry, NULL); |
319 | goto out; | 282 | goto out; |
320 | } | 283 | } |
321 | rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, | 284 | rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry, |
322 | ECRYPTFS_INTERPOSE_FLAG_D_ADD); | 285 | ecryptfs_dir_inode->i_sb, 1); |
323 | if (rc) { | 286 | if (rc) { |
324 | ecryptfs_printk(KERN_ERR, "Error interposing\n"); | 287 | printk(KERN_ERR "%s: Error interposing; rc = [%d]\n", |
288 | __func__, rc); | ||
325 | goto out; | 289 | goto out; |
326 | } | 290 | } |
327 | if (S_ISDIR(lower_inode->i_mode)) { | 291 | if (S_ISDIR(lower_inode->i_mode)) |
328 | ecryptfs_printk(KERN_DEBUG, "Is a directory; returning\n"); | ||
329 | goto out; | 292 | goto out; |
330 | } | 293 | if (S_ISLNK(lower_inode->i_mode)) |
331 | if (S_ISLNK(lower_inode->i_mode)) { | ||
332 | ecryptfs_printk(KERN_DEBUG, "Is a symlink; returning\n"); | ||
333 | goto out; | 294 | goto out; |
334 | } | 295 | if (special_file(lower_inode->i_mode)) |
335 | if (special_file(lower_inode->i_mode)) { | ||
336 | ecryptfs_printk(KERN_DEBUG, "Is a special file; returning\n"); | ||
337 | goto out; | 296 | goto out; |
338 | } | 297 | if (!ecryptfs_nd) |
339 | if (!nd) { | ||
340 | ecryptfs_printk(KERN_DEBUG, "We have a NULL nd, just leave" | ||
341 | "as we *think* we are about to unlink\n"); | ||
342 | goto out; | 298 | goto out; |
343 | } | ||
344 | /* Released in this function */ | 299 | /* Released in this function */ |
345 | page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, | 300 | page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER); |
346 | GFP_USER); | ||
347 | if (!page_virt) { | 301 | if (!page_virt) { |
302 | printk(KERN_ERR "%s: Cannot kmem_cache_zalloc() a page\n", | ||
303 | __func__); | ||
348 | rc = -ENOMEM; | 304 | rc = -ENOMEM; |
349 | ecryptfs_printk(KERN_ERR, | ||
350 | "Cannot ecryptfs_kmalloc a page\n"); | ||
351 | goto out; | 305 | goto out; |
352 | } | 306 | } |
353 | crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; | 307 | if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) { |
354 | if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) | 308 | rc = ecryptfs_init_persistent_file(ecryptfs_dentry); |
355 | ecryptfs_set_default_sizes(crypt_stat); | ||
356 | if (!ecryptfs_inode_to_private(dentry->d_inode)->lower_file) { | ||
357 | rc = ecryptfs_init_persistent_file(dentry); | ||
358 | if (rc) { | 309 | if (rc) { |
359 | printk(KERN_ERR "%s: Error attempting to initialize " | 310 | printk(KERN_ERR "%s: Error attempting to initialize " |
360 | "the persistent file for the dentry with name " | 311 | "the persistent file for the dentry with name " |
361 | "[%s]; rc = [%d]\n", __func__, | 312 | "[%s]; rc = [%d]\n", __func__, |
362 | dentry->d_name.name, rc); | 313 | ecryptfs_dentry->d_name.name, rc); |
363 | goto out; | 314 | goto out_free_kmem; |
364 | } | 315 | } |
365 | } | 316 | } |
366 | rc = ecryptfs_read_and_validate_header_region(page_virt, | 317 | rc = ecryptfs_read_and_validate_header_region(page_virt, |
367 | dentry->d_inode); | 318 | ecryptfs_dentry->d_inode); |
368 | if (rc) { | 319 | if (rc) { |
369 | rc = ecryptfs_read_and_validate_xattr_region(page_virt, dentry); | 320 | rc = ecryptfs_read_and_validate_xattr_region(page_virt, |
321 | ecryptfs_dentry); | ||
370 | if (rc) { | 322 | if (rc) { |
371 | printk(KERN_DEBUG "Valid metadata not found in header " | ||
372 | "region or xattr region; treating file as " | ||
373 | "unencrypted\n"); | ||
374 | rc = 0; | 323 | rc = 0; |
375 | kmem_cache_free(ecryptfs_header_cache_2, page_virt); | 324 | goto out_free_kmem; |
376 | goto out; | ||
377 | } | 325 | } |
378 | crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; | 326 | crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; |
379 | } | 327 | } |
380 | mount_crypt_stat = &ecryptfs_superblock_to_private( | 328 | mount_crypt_stat = &ecryptfs_superblock_to_private( |
381 | dentry->d_sb)->mount_crypt_stat; | 329 | ecryptfs_dentry->d_sb)->mount_crypt_stat; |
382 | if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) { | 330 | if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) { |
383 | if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) | 331 | if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) |
384 | file_size = (crypt_stat->num_header_bytes_at_front | 332 | file_size = (crypt_stat->num_header_bytes_at_front |
@@ -388,14 +336,103 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry, | |||
388 | } else { | 336 | } else { |
389 | file_size = get_unaligned_be64(page_virt); | 337 | file_size = get_unaligned_be64(page_virt); |
390 | } | 338 | } |
391 | i_size_write(dentry->d_inode, (loff_t)file_size); | 339 | i_size_write(ecryptfs_dentry->d_inode, (loff_t)file_size); |
340 | out_free_kmem: | ||
392 | kmem_cache_free(ecryptfs_header_cache_2, page_virt); | 341 | kmem_cache_free(ecryptfs_header_cache_2, page_virt); |
393 | goto out; | 342 | goto out; |
394 | |||
395 | out_dput: | 343 | out_dput: |
396 | dput(lower_dentry); | 344 | dput(lower_dentry); |
397 | d_drop(dentry); | 345 | d_drop(ecryptfs_dentry); |
346 | out: | ||
347 | return rc; | ||
348 | } | ||
349 | |||
350 | /** | ||
351 | * ecryptfs_lookup | ||
352 | * @ecryptfs_dir_inode: The eCryptfs directory inode | ||
353 | * @ecryptfs_dentry: The eCryptfs dentry that we are looking up | ||
354 | * @ecryptfs_nd: nameidata; may be NULL | ||
355 | * | ||
356 | * Find a file on disk. If the file does not exist, then we'll add it to the | ||
357 | * dentry cache and continue on to read it from the disk. | ||
358 | */ | ||
359 | static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, | ||
360 | struct dentry *ecryptfs_dentry, | ||
361 | struct nameidata *ecryptfs_nd) | ||
362 | { | ||
363 | char *encrypted_and_encoded_name = NULL; | ||
364 | size_t encrypted_and_encoded_name_size; | ||
365 | struct ecryptfs_crypt_stat *crypt_stat = NULL; | ||
366 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; | ||
367 | struct ecryptfs_inode_info *inode_info; | ||
368 | struct dentry *lower_dir_dentry, *lower_dentry; | ||
369 | int rc = 0; | ||
370 | |||
371 | ecryptfs_dentry->d_op = &ecryptfs_dops; | ||
372 | if ((ecryptfs_dentry->d_name.len == 1 | ||
373 | && !strcmp(ecryptfs_dentry->d_name.name, ".")) | ||
374 | || (ecryptfs_dentry->d_name.len == 2 | ||
375 | && !strcmp(ecryptfs_dentry->d_name.name, ".."))) { | ||
376 | goto out_d_drop; | ||
377 | } | ||
378 | lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); | ||
379 | lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, | ||
380 | lower_dir_dentry, | ||
381 | ecryptfs_dentry->d_name.len); | ||
382 | if (IS_ERR(lower_dentry)) { | ||
383 | rc = PTR_ERR(lower_dentry); | ||
384 | printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " | ||
385 | "lower_dentry = [%s]\n", __func__, rc, | ||
386 | ecryptfs_dentry->d_name.name); | ||
387 | goto out_d_drop; | ||
388 | } | ||
389 | if (lower_dentry->d_inode) | ||
390 | goto lookup_and_interpose; | ||
391 | inode_info = ecryptfs_inode_to_private(ecryptfs_dentry->d_inode); | ||
392 | if (inode_info) { | ||
393 | crypt_stat = &inode_info->crypt_stat; | ||
394 | /* TODO: lock for crypt_stat comparison */ | ||
395 | if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) | ||
396 | ecryptfs_set_default_sizes(crypt_stat); | ||
397 | } | ||
398 | if (crypt_stat) | ||
399 | mount_crypt_stat = crypt_stat->mount_crypt_stat; | ||
400 | else | ||
401 | mount_crypt_stat = &ecryptfs_superblock_to_private( | ||
402 | ecryptfs_dentry->d_sb)->mount_crypt_stat; | ||
403 | if (!(crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES)) | ||
404 | && !(mount_crypt_stat && (mount_crypt_stat->flags | ||
405 | & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) | ||
406 | goto lookup_and_interpose; | ||
407 | dput(lower_dentry); | ||
408 | rc = ecryptfs_encrypt_and_encode_filename( | ||
409 | &encrypted_and_encoded_name, &encrypted_and_encoded_name_size, | ||
410 | crypt_stat, mount_crypt_stat, ecryptfs_dentry->d_name.name, | ||
411 | ecryptfs_dentry->d_name.len); | ||
412 | if (rc) { | ||
413 | printk(KERN_ERR "%s: Error attempting to encrypt and encode " | ||
414 | "filename; rc = [%d]\n", __func__, rc); | ||
415 | goto out_d_drop; | ||
416 | } | ||
417 | lower_dentry = lookup_one_len(encrypted_and_encoded_name, | ||
418 | lower_dir_dentry, | ||
419 | encrypted_and_encoded_name_size - 1); | ||
420 | if (IS_ERR(lower_dentry)) { | ||
421 | rc = PTR_ERR(lower_dentry); | ||
422 | printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " | ||
423 | "lower_dentry = [%s]\n", __func__, rc, | ||
424 | encrypted_and_encoded_name); | ||
425 | goto out_d_drop; | ||
426 | } | ||
427 | lookup_and_interpose: | ||
428 | rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry, | ||
429 | crypt_stat, ecryptfs_dir_inode, | ||
430 | ecryptfs_nd); | ||
431 | goto out; | ||
432 | out_d_drop: | ||
433 | d_drop(ecryptfs_dentry); | ||
398 | out: | 434 | out: |
435 | kfree(encrypted_and_encoded_name); | ||
399 | return ERR_PTR(rc); | 436 | return ERR_PTR(rc); |
400 | } | 437 | } |
401 | 438 | ||
@@ -466,19 +503,21 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, | |||
466 | struct dentry *lower_dentry; | 503 | struct dentry *lower_dentry; |
467 | struct dentry *lower_dir_dentry; | 504 | struct dentry *lower_dir_dentry; |
468 | char *encoded_symname; | 505 | char *encoded_symname; |
469 | int encoded_symlen; | 506 | size_t encoded_symlen; |
470 | struct ecryptfs_crypt_stat *crypt_stat = NULL; | 507 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; |
471 | 508 | ||
472 | lower_dentry = ecryptfs_dentry_to_lower(dentry); | 509 | lower_dentry = ecryptfs_dentry_to_lower(dentry); |
473 | dget(lower_dentry); | 510 | dget(lower_dentry); |
474 | lower_dir_dentry = lock_parent(lower_dentry); | 511 | lower_dir_dentry = lock_parent(lower_dentry); |
475 | encoded_symlen = ecryptfs_encode_filename(crypt_stat, symname, | 512 | mount_crypt_stat = &ecryptfs_superblock_to_private( |
476 | strlen(symname), | 513 | dir->i_sb)->mount_crypt_stat; |
477 | &encoded_symname); | 514 | rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname, |
478 | if (encoded_symlen < 0) { | 515 | &encoded_symlen, |
479 | rc = encoded_symlen; | 516 | NULL, |
517 | mount_crypt_stat, symname, | ||
518 | strlen(symname)); | ||
519 | if (rc) | ||
480 | goto out_lock; | 520 | goto out_lock; |
481 | } | ||
482 | rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry, | 521 | rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry, |
483 | encoded_symname); | 522 | encoded_symname); |
484 | kfree(encoded_symname); | 523 | kfree(encoded_symname); |
@@ -602,53 +641,54 @@ out_lock: | |||
602 | } | 641 | } |
603 | 642 | ||
604 | static int | 643 | static int |
605 | ecryptfs_readlink(struct dentry *dentry, char __user * buf, int bufsiz) | 644 | ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) |
606 | { | 645 | { |
607 | int rc; | ||
608 | struct dentry *lower_dentry; | ||
609 | char *decoded_name; | ||
610 | char *lower_buf; | 646 | char *lower_buf; |
611 | mm_segment_t old_fs; | 647 | struct dentry *lower_dentry; |
612 | struct ecryptfs_crypt_stat *crypt_stat; | 648 | struct ecryptfs_crypt_stat *crypt_stat; |
649 | char *plaintext_name; | ||
650 | size_t plaintext_name_size; | ||
651 | mm_segment_t old_fs; | ||
652 | int rc; | ||
613 | 653 | ||
614 | lower_dentry = ecryptfs_dentry_to_lower(dentry); | 654 | lower_dentry = ecryptfs_dentry_to_lower(dentry); |
615 | if (!lower_dentry->d_inode->i_op || | 655 | if (!lower_dentry->d_inode->i_op->readlink) { |
616 | !lower_dentry->d_inode->i_op->readlink) { | ||
617 | rc = -EINVAL; | 656 | rc = -EINVAL; |
618 | goto out; | 657 | goto out; |
619 | } | 658 | } |
659 | crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; | ||
620 | /* Released in this function */ | 660 | /* Released in this function */ |
621 | lower_buf = kmalloc(bufsiz, GFP_KERNEL); | 661 | lower_buf = kmalloc(bufsiz, GFP_KERNEL); |
622 | if (lower_buf == NULL) { | 662 | if (lower_buf == NULL) { |
623 | ecryptfs_printk(KERN_ERR, "Out of memory\n"); | 663 | printk(KERN_ERR "%s: Out of memory whilst attempting to " |
664 | "kmalloc [%d] bytes\n", __func__, bufsiz); | ||
624 | rc = -ENOMEM; | 665 | rc = -ENOMEM; |
625 | goto out; | 666 | goto out; |
626 | } | 667 | } |
627 | old_fs = get_fs(); | 668 | old_fs = get_fs(); |
628 | set_fs(get_ds()); | 669 | set_fs(get_ds()); |
629 | ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ " | ||
630 | "lower_dentry->d_name.name = [%s]\n", | ||
631 | lower_dentry->d_name.name); | ||
632 | rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, | 670 | rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, |
633 | (char __user *)lower_buf, | 671 | (char __user *)lower_buf, |
634 | bufsiz); | 672 | bufsiz); |
635 | set_fs(old_fs); | 673 | set_fs(old_fs); |
636 | if (rc >= 0) { | 674 | if (rc >= 0) { |
637 | crypt_stat = NULL; | 675 | rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name, |
638 | rc = ecryptfs_decode_filename(crypt_stat, lower_buf, rc, | 676 | &plaintext_name_size, |
639 | &decoded_name); | 677 | dentry, lower_buf, |
640 | if (rc == -ENOMEM) | 678 | rc); |
679 | if (rc) { | ||
680 | printk(KERN_ERR "%s: Error attempting to decode and " | ||
681 | "decrypt filename; rc = [%d]\n", __func__, | ||
682 | rc); | ||
641 | goto out_free_lower_buf; | 683 | goto out_free_lower_buf; |
642 | if (rc > 0) { | ||
643 | ecryptfs_printk(KERN_DEBUG, "Copying [%d] bytes " | ||
644 | "to userspace: [%*s]\n", rc, | ||
645 | decoded_name); | ||
646 | if (copy_to_user(buf, decoded_name, rc)) | ||
647 | rc = -EFAULT; | ||
648 | } | 684 | } |
649 | kfree(decoded_name); | 685 | rc = copy_to_user(buf, plaintext_name, plaintext_name_size); |
650 | fsstack_copy_attr_atime(dentry->d_inode, | 686 | if (rc) |
651 | lower_dentry->d_inode); | 687 | rc = -EFAULT; |
688 | else | ||
689 | rc = plaintext_name_size; | ||
690 | kfree(plaintext_name); | ||
691 | fsstack_copy_attr_atime(dentry->d_inode, lower_dentry->d_inode); | ||
652 | } | 692 | } |
653 | out_free_lower_buf: | 693 | out_free_lower_buf: |
654 | kfree(lower_buf); | 694 | kfree(lower_buf); |
@@ -670,8 +710,6 @@ static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd) | |||
670 | } | 710 | } |
671 | old_fs = get_fs(); | 711 | old_fs = get_fs(); |
672 | set_fs(get_ds()); | 712 | set_fs(get_ds()); |
673 | ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ " | ||
674 | "dentry->d_name.name = [%s]\n", dentry->d_name.name); | ||
675 | rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len); | 713 | rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len); |
676 | set_fs(old_fs); | 714 | set_fs(old_fs); |
677 | if (rc < 0) | 715 | if (rc < 0) |
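The "encrypted_and_encoded_name_size - 1" passed to lookup_one_len() above mirrors how ecryptfs_encrypt_and_encode_filename() sizes its result: the count covers the "ECRYPTFS_FNEK_ENCRYPTED." prefix, the encoded tag 70 payload, and a trailing NUL, so the lookup drops that final byte. A small sketch of the bookkeeping, where encoded_len() is a hypothetical placeholder for the real encoder's output width:

    #include <stdio.h>

    #define FNEK_PREFIX      "ECRYPTFS_FNEK_ENCRYPTED."
    #define FNEK_PREFIX_SIZE (sizeof(FNEK_PREFIX) - 1)   /* 24 */

    /* Hypothetical: characters produced by base-encoding a payload of this size. */
    static size_t encoded_len(size_t payload_size)
    {
            return ((payload_size + 2) / 3) * 4;   /* assume 3 bytes -> 4 chars */
    }

    int main(void)
    {
            size_t payload = 64;   /* size of the encrypted tag 70 packet */
            size_t name_size = FNEK_PREFIX_SIZE + encoded_len(payload) + 1; /* +1: NUL */

            /* lookup_one_len() takes the name length without the trailing NUL. */
            printf("store %zu bytes, look up %zu\n", name_size, name_size - 1);
            return 0;
    }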
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 0d713b691941..ff539420cc6f 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c | |||
@@ -358,7 +358,7 @@ parse_tag_67_packet(struct ecryptfs_key_record *key_rec, | |||
358 | /* verify that everything through the encrypted FEK size is present */ | 358 | /* verify that everything through the encrypted FEK size is present */ |
359 | if (message_len < 4) { | 359 | if (message_len < 4) { |
360 | rc = -EIO; | 360 | rc = -EIO; |
361 | printk(KERN_ERR "%s: message_len is [%Zd]; minimum acceptable " | 361 | printk(KERN_ERR "%s: message_len is [%zd]; minimum acceptable " |
362 | "message length is [%d]\n", __func__, message_len, 4); | 362 | "message length is [%d]\n", __func__, message_len, 4); |
363 | goto out; | 363 | goto out; |
364 | } | 364 | } |
@@ -385,13 +385,13 @@ parse_tag_67_packet(struct ecryptfs_key_record *key_rec, | |||
385 | i += data_len; | 385 | i += data_len; |
386 | if (message_len < (i + key_rec->enc_key_size)) { | 386 | if (message_len < (i + key_rec->enc_key_size)) { |
387 | rc = -EIO; | 387 | rc = -EIO; |
388 | printk(KERN_ERR "%s: message_len [%Zd]; max len is [%Zd]\n", | 388 | printk(KERN_ERR "%s: message_len [%zd]; max len is [%zd]\n", |
389 | __func__, message_len, (i + key_rec->enc_key_size)); | 389 | __func__, message_len, (i + key_rec->enc_key_size)); |
390 | goto out; | 390 | goto out; |
391 | } | 391 | } |
392 | if (key_rec->enc_key_size > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { | 392 | if (key_rec->enc_key_size > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { |
393 | rc = -EIO; | 393 | rc = -EIO; |
394 | printk(KERN_ERR "%s: Encrypted key_size [%Zd] larger than " | 394 | printk(KERN_ERR "%s: Encrypted key_size [%zd] larger than " |
395 | "the maximum key size [%d]\n", __func__, | 395 | "the maximum key size [%d]\n", __func__, |
396 | key_rec->enc_key_size, | 396 | key_rec->enc_key_size, |
397 | ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES); | 397 | ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES); |
@@ -403,6 +403,580 @@ out: | |||
403 | } | 403 | } |
404 | 404 | ||
405 | static int | 405 | static int |
406 | ecryptfs_find_global_auth_tok_for_sig( | ||
407 | struct ecryptfs_global_auth_tok **global_auth_tok, | ||
408 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig) | ||
409 | { | ||
410 | struct ecryptfs_global_auth_tok *walker; | ||
411 | int rc = 0; | ||
412 | |||
413 | (*global_auth_tok) = NULL; | ||
414 | mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); | ||
415 | list_for_each_entry(walker, | ||
416 | &mount_crypt_stat->global_auth_tok_list, | ||
417 | mount_crypt_stat_list) { | ||
418 | if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) { | ||
419 | (*global_auth_tok) = walker; | ||
420 | goto out; | ||
421 | } | ||
422 | } | ||
423 | rc = -EINVAL; | ||
424 | out: | ||
425 | mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); | ||
426 | return rc; | ||
427 | } | ||
428 | |||
429 | /** | ||
430 | * ecryptfs_find_auth_tok_for_sig | ||
431 | * @auth_tok: Set to the matching auth_tok; NULL if not found | ||
432 | * @mount_crypt_stat: Mount-wide cryptographic context | ||
433 | * @sig: Sig of auth_tok to find | ||
434 | * | ||
435 | * For now, this function simply looks at the registered auth_tok's | ||
436 | * linked off the mount_crypt_stat, so all the auth_toks that can be | ||
437 | * used must be registered at mount time. This function could | ||
438 | * potentially try a lot harder to find auth_tok's (e.g., by calling | ||
439 | * out to ecryptfsd to dynamically retrieve an auth_tok object) so | ||
440 | * that static registration of auth_tok's will no longer be necessary. | ||
441 | * | ||
442 | * Returns zero on no error; non-zero on error | ||
443 | */ | ||
444 | static int | ||
445 | ecryptfs_find_auth_tok_for_sig( | ||
446 | struct ecryptfs_auth_tok **auth_tok, | ||
447 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat, | ||
448 | char *sig) | ||
449 | { | ||
450 | struct ecryptfs_global_auth_tok *global_auth_tok; | ||
451 | int rc = 0; | ||
452 | |||
453 | (*auth_tok) = NULL; | ||
454 | if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok, | ||
455 | mount_crypt_stat, sig)) { | ||
456 | struct key *auth_tok_key; | ||
457 | |||
458 | rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok, | ||
459 | sig); | ||
460 | } else | ||
461 | (*auth_tok) = global_auth_tok->global_auth_tok; | ||
462 | return rc; | ||
463 | } | ||
464 | |||
465 | /** | ||
466 | * write_tag_70_packet can gobble a lot of stack space. We stuff most | ||
467 | * of the function's parameters in a kmalloc'd struct to help reduce | ||
468 | * eCryptfs' overall stack usage. | ||
469 | */ | ||
470 | struct ecryptfs_write_tag_70_packet_silly_stack { | ||
471 | u8 cipher_code; | ||
472 | size_t max_packet_size; | ||
473 | size_t packet_size_len; | ||
474 | size_t block_aligned_filename_size; | ||
475 | size_t block_size; | ||
476 | size_t i; | ||
477 | size_t j; | ||
478 | size_t num_rand_bytes; | ||
479 | struct mutex *tfm_mutex; | ||
480 | char *block_aligned_filename; | ||
481 | struct ecryptfs_auth_tok *auth_tok; | ||
482 | struct scatterlist src_sg; | ||
483 | struct scatterlist dst_sg; | ||
484 | struct blkcipher_desc desc; | ||
485 | char iv[ECRYPTFS_MAX_IV_BYTES]; | ||
486 | char hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; | ||
487 | char tmp_hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; | ||
488 | struct hash_desc hash_desc; | ||
489 | struct scatterlist hash_sg; | ||
490 | }; | ||
491 | |||
492 | /** | ||
493 | * write_tag_70_packet - Write encrypted filename (EFN) packet against FNEK | ||
494 | * @filename: NULL-terminated filename string | ||
495 | * | ||
496 | * This is the simplest mechanism for achieving filename encryption in | ||
497 | * eCryptfs. It encrypts the given filename with the mount-wide | ||
498 | * filename encryption key (FNEK) and stores it in a packet to @dest, | ||
499 | * which the caller will encode and write directly into the dentry | ||
500 | * name. | ||
501 | */ | ||
502 | int | ||
503 | ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, | ||
504 | size_t *packet_size, | ||
505 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat, | ||
506 | char *filename, size_t filename_size) | ||
507 | { | ||
508 | struct ecryptfs_write_tag_70_packet_silly_stack *s; | ||
509 | int rc = 0; | ||
510 | |||
511 | s = kmalloc(sizeof(*s), GFP_KERNEL); | ||
512 | if (!s) { | ||
513 | printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " | ||
514 | "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); | ||
515 | goto out; | ||
516 | } | ||
517 | s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; | ||
518 | (*packet_size) = 0; | ||
519 | rc = ecryptfs_get_tfm_and_mutex_for_cipher_name( | ||
520 | &s->desc.tfm, | ||
521 | &s->tfm_mutex, mount_crypt_stat->global_default_fn_cipher_name); | ||
522 | if (unlikely(rc)) { | ||
523 | printk(KERN_ERR "Internal error whilst attempting to get " | ||
524 | "tfm and mutex for cipher name [%s]; rc = [%d]\n", | ||
525 | mount_crypt_stat->global_default_fn_cipher_name, rc); | ||
526 | goto out; | ||
527 | } | ||
528 | mutex_lock(s->tfm_mutex); | ||
529 | s->block_size = crypto_blkcipher_blocksize(s->desc.tfm); | ||
530 | /* Plus one for the \0 separator between the random prefix | ||
531 | * and the plaintext filename */ | ||
532 | s->num_rand_bytes = (ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + 1); | ||
533 | s->block_aligned_filename_size = (s->num_rand_bytes + filename_size); | ||
534 | if ((s->block_aligned_filename_size % s->block_size) != 0) { | ||
535 | s->num_rand_bytes += (s->block_size | ||
536 | - (s->block_aligned_filename_size | ||
537 | % s->block_size)); | ||
538 | s->block_aligned_filename_size = (s->num_rand_bytes | ||
539 | + filename_size); | ||
540 | } | ||
541 | /* Octet 0: Tag 70 identifier | ||
542 | * Octets 1-N1: Tag 70 packet size (includes cipher identifier | ||
543 | * and block-aligned encrypted filename size) | ||
544 | * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE) | ||
545 | * Octet N2-N3: Cipher identifier (1 octet) | ||
546 | * Octets N3-N4: Block-aligned encrypted filename | ||
547 | * - Consists of a minimum number of random characters, a \0 | ||
548 | * separator, and then the filename */ | ||
549 | s->max_packet_size = (1 /* Tag 70 identifier */ | ||
550 | + 3 /* Max Tag 70 packet size */ | ||
551 | + ECRYPTFS_SIG_SIZE /* FNEK sig */ | ||
552 | + 1 /* Cipher identifier */ | ||
553 | + s->block_aligned_filename_size); | ||
554 | if (dest == NULL) { | ||
555 | (*packet_size) = s->max_packet_size; | ||
556 | goto out_unlock; | ||
557 | } | ||
558 | if (s->max_packet_size > (*remaining_bytes)) { | ||
559 | printk(KERN_WARNING "%s: Require [%zd] bytes to write; only " | ||
560 | "[%zd] available\n", __func__, s->max_packet_size, | ||
561 | (*remaining_bytes)); | ||
562 | rc = -EINVAL; | ||
563 | goto out_unlock; | ||
564 | } | ||
565 | s->block_aligned_filename = kzalloc(s->block_aligned_filename_size, | ||
566 | GFP_KERNEL); | ||
567 | if (!s->block_aligned_filename) { | ||
568 | printk(KERN_ERR "%s: Out of kernel memory whilst attempting to " | ||
569 | "kzalloc [%zd] bytes\n", __func__, | ||
570 | s->block_aligned_filename_size); | ||
571 | rc = -ENOMEM; | ||
572 | goto out_unlock; | ||
573 | } | ||
574 | s->i = 0; | ||
575 | dest[s->i++] = ECRYPTFS_TAG_70_PACKET_TYPE; | ||
576 | rc = ecryptfs_write_packet_length(&dest[s->i], | ||
577 | (ECRYPTFS_SIG_SIZE | ||
578 | + 1 /* Cipher code */ | ||
579 | + s->block_aligned_filename_size), | ||
580 | &s->packet_size_len); | ||
581 | if (rc) { | ||
582 | printk(KERN_ERR "%s: Error generating tag 70 packet " | ||
583 | "header; cannot generate packet length; rc = [%d]\n", | ||
584 | __func__, rc); | ||
585 | goto out_free_unlock; | ||
586 | } | ||
587 | s->i += s->packet_size_len; | ||
588 | ecryptfs_from_hex(&dest[s->i], | ||
589 | mount_crypt_stat->global_default_fnek_sig, | ||
590 | ECRYPTFS_SIG_SIZE); | ||
591 | s->i += ECRYPTFS_SIG_SIZE; | ||
592 | s->cipher_code = ecryptfs_code_for_cipher_string( | ||
593 | mount_crypt_stat->global_default_fn_cipher_name, | ||
594 | mount_crypt_stat->global_default_fn_cipher_key_bytes); | ||
595 | if (s->cipher_code == 0) { | ||
596 | printk(KERN_WARNING "%s: Unable to generate code for " | ||
597 | "cipher [%s] with key bytes [%zd]\n", __func__, | ||
598 | mount_crypt_stat->global_default_fn_cipher_name, | ||
599 | mount_crypt_stat->global_default_fn_cipher_key_bytes); | ||
600 | rc = -EINVAL; | ||
601 | goto out_free_unlock; | ||
602 | } | ||
603 | dest[s->i++] = s->cipher_code; | ||
604 | rc = ecryptfs_find_auth_tok_for_sig( | ||
605 | &s->auth_tok, mount_crypt_stat, | ||
606 | mount_crypt_stat->global_default_fnek_sig); | ||
607 | if (rc) { | ||
608 | printk(KERN_ERR "%s: Error attempting to find auth tok for " | ||
609 | "fnek sig [%s]; rc = [%d]\n", __func__, | ||
610 | mount_crypt_stat->global_default_fnek_sig, rc); | ||
611 | goto out_free_unlock; | ||
612 | } | ||
613 | /* TODO: Support other key modules than passphrase for | ||
614 | * filename encryption */ | ||
615 | BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD); | ||
616 | sg_init_one( | ||
617 | &s->hash_sg, | ||
618 | (u8 *)s->auth_tok->token.password.session_key_encryption_key, | ||
619 | s->auth_tok->token.password.session_key_encryption_key_bytes); | ||
620 | s->hash_desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; | ||
621 | s->hash_desc.tfm = crypto_alloc_hash(ECRYPTFS_TAG_70_DIGEST, 0, | ||
622 | CRYPTO_ALG_ASYNC); | ||
623 | if (IS_ERR(s->hash_desc.tfm)) { | ||
624 | rc = PTR_ERR(s->hash_desc.tfm); | ||
625 | printk(KERN_ERR "%s: Error attempting to " | ||
626 | "allocate hash crypto context; rc = [%d]\n", | ||
627 | __func__, rc); | ||
628 | goto out_free_unlock; | ||
629 | } | ||
630 | rc = crypto_hash_init(&s->hash_desc); | ||
631 | if (rc) { | ||
632 | printk(KERN_ERR | ||
633 | "%s: Error initializing crypto hash; rc = [%d]\n", | ||
634 | __func__, rc); | ||
635 | goto out_release_free_unlock; | ||
636 | } | ||
637 | rc = crypto_hash_update( | ||
638 | &s->hash_desc, &s->hash_sg, | ||
639 | s->auth_tok->token.password.session_key_encryption_key_bytes); | ||
640 | if (rc) { | ||
641 | printk(KERN_ERR | ||
642 | "%s: Error updating crypto hash; rc = [%d]\n", | ||
643 | __func__, rc); | ||
644 | goto out_release_free_unlock; | ||
645 | } | ||
646 | rc = crypto_hash_final(&s->hash_desc, s->hash); | ||
647 | if (rc) { | ||
648 | printk(KERN_ERR | ||
649 | "%s: Error finalizing crypto hash; rc = [%d]\n", | ||
650 | __func__, rc); | ||
651 | goto out_release_free_unlock; | ||
652 | } | ||
653 | for (s->j = 0; s->j < (s->num_rand_bytes - 1); s->j++) { | ||
654 | s->block_aligned_filename[s->j] = | ||
655 | s->hash[(s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)]; | ||
656 | if ((s->j % ECRYPTFS_TAG_70_DIGEST_SIZE) | ||
657 | == (ECRYPTFS_TAG_70_DIGEST_SIZE - 1)) { | ||
658 | sg_init_one(&s->hash_sg, (u8 *)s->hash, | ||
659 | ECRYPTFS_TAG_70_DIGEST_SIZE); | ||
660 | rc = crypto_hash_init(&s->hash_desc); | ||
661 | if (rc) { | ||
662 | printk(KERN_ERR | ||
663 | "%s: Error initializing crypto hash; " | ||
664 | "rc = [%d]\n", __func__, rc); | ||
665 | goto out_release_free_unlock; | ||
666 | } | ||
667 | rc = crypto_hash_update(&s->hash_desc, &s->hash_sg, | ||
668 | ECRYPTFS_TAG_70_DIGEST_SIZE); | ||
669 | if (rc) { | ||
670 | printk(KERN_ERR | ||
671 | "%s: Error updating crypto hash; " | ||
672 | "rc = [%d]\n", __func__, rc); | ||
673 | goto out_release_free_unlock; | ||
674 | } | ||
675 | rc = crypto_hash_final(&s->hash_desc, s->tmp_hash); | ||
676 | if (rc) { | ||
677 | printk(KERN_ERR | ||
678 | "%s: Error finalizing crypto hash; " | ||
679 | "rc = [%d]\n", __func__, rc); | ||
680 | goto out_release_free_unlock; | ||
681 | } | ||
682 | memcpy(s->hash, s->tmp_hash, | ||
683 | ECRYPTFS_TAG_70_DIGEST_SIZE); | ||
684 | } | ||
685 | if (s->block_aligned_filename[s->j] == '\0') | ||
686 | s->block_aligned_filename[s->j] = ECRYPTFS_NON_NULL; | ||
687 | } | ||
688 | memcpy(&s->block_aligned_filename[s->num_rand_bytes], filename, | ||
689 | filename_size); | ||
690 | rc = virt_to_scatterlist(s->block_aligned_filename, | ||
691 | s->block_aligned_filename_size, &s->src_sg, 1); | ||
692 | if (rc != 1) { | ||
693 | printk(KERN_ERR "%s: Internal error whilst attempting to " | ||
694 | "convert filename memory to scatterlist; " | ||
695 | "expected rc = 1; got rc = [%d]. " | ||
696 | "block_aligned_filename_size = [%zd]\n", __func__, rc, | ||
697 | s->block_aligned_filename_size); | ||
698 | goto out_release_free_unlock; | ||
699 | } | ||
700 | rc = virt_to_scatterlist(&dest[s->i], s->block_aligned_filename_size, | ||
701 | &s->dst_sg, 1); | ||
702 | if (rc != 1) { | ||
703 | printk(KERN_ERR "%s: Internal error whilst attempting to " | ||
704 | "convert encrypted filename memory to scatterlist; " | ||
705 | "expected rc = 1; got rc = [%d]. " | ||
706 | "block_aligned_filename_size = [%zd]\n", __func__, rc, | ||
707 | s->block_aligned_filename_size); | ||
708 | goto out_release_free_unlock; | ||
709 | } | ||
710 | /* The characters in the first block effectively do the job | ||
711 | * of the IV here, so we just use 0's for the IV. Note the | ||
712 | * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES | ||
713 | * >= ECRYPTFS_MAX_IV_BYTES. */ | ||
714 | memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES); | ||
715 | s->desc.info = s->iv; | ||
716 | rc = crypto_blkcipher_setkey( | ||
717 | s->desc.tfm, | ||
718 | s->auth_tok->token.password.session_key_encryption_key, | ||
719 | mount_crypt_stat->global_default_fn_cipher_key_bytes); | ||
720 | if (rc < 0) { | ||
721 | printk(KERN_ERR "%s: Error setting key for crypto context; " | ||
722 | "rc = [%d]. s->auth_tok->token.password.session_key_" | ||
723 | "encryption_key = [0x%p]; mount_crypt_stat->" | ||
724 | "global_default_fn_cipher_key_bytes = [%zd]\n", __func__, | ||
725 | rc, | ||
726 | s->auth_tok->token.password.session_key_encryption_key, | ||
727 | mount_crypt_stat->global_default_fn_cipher_key_bytes); | ||
728 | goto out_release_free_unlock; | ||
729 | } | ||
730 | rc = crypto_blkcipher_encrypt_iv(&s->desc, &s->dst_sg, &s->src_sg, | ||
731 | s->block_aligned_filename_size); | ||
732 | if (rc) { | ||
733 | printk(KERN_ERR "%s: Error attempting to encrypt filename; " | ||
734 | "rc = [%d]\n", __func__, rc); | ||
735 | goto out_release_free_unlock; | ||
736 | } | ||
737 | s->i += s->block_aligned_filename_size; | ||
738 | (*packet_size) = s->i; | ||
739 | (*remaining_bytes) -= (*packet_size); | ||
740 | out_release_free_unlock: | ||
741 | crypto_free_hash(s->hash_desc.tfm); | ||
742 | out_free_unlock: | ||
743 | memset(s->block_aligned_filename, 0, s->block_aligned_filename_size); | ||
744 | kfree(s->block_aligned_filename); | ||
745 | out_unlock: | ||
746 | mutex_unlock(s->tfm_mutex); | ||
747 | out: | ||
748 | kfree(s); | ||
749 | return rc; | ||
750 | } | ||
751 | |||
752 | struct ecryptfs_parse_tag_70_packet_silly_stack { | ||
753 | u8 cipher_code; | ||
754 | size_t max_packet_size; | ||
755 | size_t packet_size_len; | ||
756 | size_t parsed_tag_70_packet_size; | ||
757 | size_t block_aligned_filename_size; | ||
758 | size_t block_size; | ||
759 | size_t i; | ||
760 | struct mutex *tfm_mutex; | ||
761 | char *decrypted_filename; | ||
762 | struct ecryptfs_auth_tok *auth_tok; | ||
763 | struct scatterlist src_sg; | ||
764 | struct scatterlist dst_sg; | ||
765 | struct blkcipher_desc desc; | ||
766 | char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1]; | ||
767 | char iv[ECRYPTFS_MAX_IV_BYTES]; | ||
768 | char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE]; | ||
769 | }; | ||
770 | |||
771 | /** | ||
772 | * parse_tag_70_packet - Parse and process FNEK-encrypted passphrase packet | ||
773 | * @filename: This function kmalloc's the memory for the filename | ||
774 | * @filename_size: This function sets this to the amount of memory | ||
775 | * kmalloc'd for the filename | ||
776 | * @packet_size: This function sets this to the number of octets | ||
777 | * in the packet parsed | ||
778 | * @mount_crypt_stat: The mount-wide cryptographic context | ||
779 | * @data: The memory location containing the start of the tag 70 | ||
780 | * packet | ||
781 | * @max_packet_size: The maximum legal size of the packet to be parsed | ||
782 | * from @data | ||
783 | * | ||
784 | * Returns zero on success; non-zero otherwise | ||
785 | */ | ||
786 | int | ||
787 | ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, | ||
788 | size_t *packet_size, | ||
789 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat, | ||
790 | char *data, size_t max_packet_size) | ||
791 | { | ||
792 | struct ecryptfs_parse_tag_70_packet_silly_stack *s; | ||
793 | int rc = 0; | ||
794 | |||
795 | (*packet_size) = 0; | ||
796 | (*filename_size) = 0; | ||
797 | (*filename) = NULL; | ||
798 | s = kmalloc(sizeof(*s), GFP_KERNEL); | ||
799 | if (!s) { | ||
800 | printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " | ||
801 | "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); | ||
802 | goto out; | ||
803 | } | ||
804 | s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; | ||
805 | if (max_packet_size < (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)) { | ||
806 | printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be " | ||
807 | "at least [%d]\n", __func__, max_packet_size, | ||
808 | (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)); | ||
809 | rc = -EINVAL; | ||
810 | goto out; | ||
811 | } | ||
812 | /* Octet 0: Tag 70 identifier | ||
813 | * Octets 1-N1: Tag 70 packet size (includes cipher identifier | ||
814 | * and block-aligned encrypted filename size) | ||
815 | * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE) | ||
816 | * Octet N2-N3: Cipher identifier (1 octet) | ||
817 | * Octets N3-N4: Block-aligned encrypted filename | ||
818 | * - Consists of a minimum number of random numbers, a \0 | ||
819 | * separator, and then the filename */ | ||
820 | if (data[(*packet_size)++] != ECRYPTFS_TAG_70_PACKET_TYPE) { | ||
821 | printk(KERN_WARNING "%s: Invalid packet tag [0x%.2x]; must be " | ||
822 | "tag [0x%.2x]\n", __func__, | ||
823 | data[((*packet_size) - 1)], ECRYPTFS_TAG_70_PACKET_TYPE); | ||
824 | rc = -EINVAL; | ||
825 | goto out; | ||
826 | } | ||
827 | rc = ecryptfs_parse_packet_length(&data[(*packet_size)], | ||
828 | &s->parsed_tag_70_packet_size, | ||
829 | &s->packet_size_len); | ||
830 | if (rc) { | ||
831 | printk(KERN_WARNING "%s: Error parsing packet length; " | ||
832 | "rc = [%d]\n", __func__, rc); | ||
833 | goto out; | ||
834 | } | ||
835 | s->block_aligned_filename_size = (s->parsed_tag_70_packet_size | ||
836 | - ECRYPTFS_SIG_SIZE - 1); | ||
837 | if ((1 + s->packet_size_len + s->parsed_tag_70_packet_size) | ||
838 | > max_packet_size) { | ||
839 | printk(KERN_WARNING "%s: max_packet_size is [%zd]; real packet " | ||
840 | "size is [%zd]\n", __func__, max_packet_size, | ||
841 | (1 + s->packet_size_len + 1 | ||
842 | + s->block_aligned_filename_size)); | ||
843 | rc = -EINVAL; | ||
844 | goto out; | ||
845 | } | ||
846 | (*packet_size) += s->packet_size_len; | ||
847 | ecryptfs_to_hex(s->fnek_sig_hex, &data[(*packet_size)], | ||
848 | ECRYPTFS_SIG_SIZE); | ||
849 | s->fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX] = '\0'; | ||
850 | (*packet_size) += ECRYPTFS_SIG_SIZE; | ||
851 | s->cipher_code = data[(*packet_size)++]; | ||
852 | rc = ecryptfs_cipher_code_to_string(s->cipher_string, s->cipher_code); | ||
853 | if (rc) { | ||
854 | printk(KERN_WARNING "%s: Cipher code [%d] is invalid\n", | ||
855 | __func__, s->cipher_code); | ||
856 | goto out; | ||
857 | } | ||
858 | rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->desc.tfm, | ||
859 | &s->tfm_mutex, | ||
860 | s->cipher_string); | ||
861 | if (unlikely(rc)) { | ||
862 | printk(KERN_ERR "Internal error whilst attempting to get " | ||
863 | "tfm and mutex for cipher name [%s]; rc = [%d]\n", | ||
864 | s->cipher_string, rc); | ||
865 | goto out; | ||
866 | } | ||
867 | mutex_lock(s->tfm_mutex); | ||
868 | rc = virt_to_scatterlist(&data[(*packet_size)], | ||
869 | s->block_aligned_filename_size, &s->src_sg, 1); | ||
870 | if (rc != 1) { | ||
871 | printk(KERN_ERR "%s: Internal error whilst attempting to " | ||
872 | "convert encrypted filename memory to scatterlist; " | ||
873 | "expected rc = 1; got rc = [%d]. " | ||
874 | "block_aligned_filename_size = [%zd]\n", __func__, rc, | ||
875 | s->block_aligned_filename_size); | ||
876 | goto out_unlock; | ||
877 | } | ||
878 | (*packet_size) += s->block_aligned_filename_size; | ||
879 | s->decrypted_filename = kmalloc(s->block_aligned_filename_size, | ||
880 | GFP_KERNEL); | ||
881 | if (!s->decrypted_filename) { | ||
882 | printk(KERN_ERR "%s: Out of memory whilst attempting to " | ||
883 | "kmalloc [%zd] bytes\n", __func__, | ||
884 | s->block_aligned_filename_size); | ||
885 | rc = -ENOMEM; | ||
886 | goto out_unlock; | ||
887 | } | ||
888 | rc = virt_to_scatterlist(s->decrypted_filename, | ||
889 | s->block_aligned_filename_size, &s->dst_sg, 1); | ||
890 | if (rc != 1) { | ||
891 | printk(KERN_ERR "%s: Internal error whilst attempting to " | ||
892 | "convert decrypted filename memory to scatterlist; " | ||
893 | "expected rc = 1; got rc = [%d]. " | ||
894 | "block_aligned_filename_size = [%zd]\n", __func__, rc, | ||
895 | s->block_aligned_filename_size); | ||
896 | goto out_free_unlock; | ||
897 | } | ||
898 | /* The characters in the first block effectively do the job of | ||
899 | * the IV here, so we just use 0's for the IV. Note the | ||
900 | * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES | ||
901 | * >= ECRYPTFS_MAX_IV_BYTES. */ | ||
902 | memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES); | ||
903 | s->desc.info = s->iv; | ||
904 | rc = ecryptfs_find_auth_tok_for_sig(&s->auth_tok, mount_crypt_stat, | ||
905 | s->fnek_sig_hex); | ||
906 | if (rc) { | ||
907 | printk(KERN_ERR "%s: Error attempting to find auth tok for " | ||
908 | "fnek sig [%s]; rc = [%d]\n", __func__, s->fnek_sig_hex, | ||
909 | rc); | ||
910 | goto out_free_unlock; | ||
911 | } | ||
912 | /* TODO: Support other key modules than passphrase for | ||
913 | * filename encryption */ | ||
914 | BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD); | ||
915 | rc = crypto_blkcipher_setkey( | ||
916 | s->desc.tfm, | ||
917 | s->auth_tok->token.password.session_key_encryption_key, | ||
918 | mount_crypt_stat->global_default_fn_cipher_key_bytes); | ||
919 | if (rc < 0) { | ||
920 | printk(KERN_ERR "%s: Error setting key for crypto context; " | ||
921 | "rc = [%d]. s->auth_tok->token.password.session_key_" | ||
922 | "encryption_key = [0x%p]; mount_crypt_stat->" | ||
923 | "global_default_fn_cipher_key_bytes = [%zd]\n", __func__, | ||
924 | rc, | ||
925 | s->auth_tok->token.password.session_key_encryption_key, | ||
926 | mount_crypt_stat->global_default_fn_cipher_key_bytes); | ||
927 | goto out_free_unlock; | ||
928 | } | ||
929 | rc = crypto_blkcipher_decrypt_iv(&s->desc, &s->dst_sg, &s->src_sg, | ||
930 | s->block_aligned_filename_size); | ||
931 | if (rc) { | ||
932 | printk(KERN_ERR "%s: Error attempting to decrypt filename; " | ||
933 | "rc = [%d]\n", __func__, rc); | ||
934 | goto out_free_unlock; | ||
935 | } | ||
936 | s->i = 0; | ||
937 | while (s->decrypted_filename[s->i] != '\0' | ||
938 | && s->i < s->block_aligned_filename_size) | ||
939 | s->i++; | ||
940 | if (s->i == s->block_aligned_filename_size) { | ||
941 | printk(KERN_WARNING "%s: Invalid tag 70 packet; could not " | ||
942 | "find valid separator between random characters and " | ||
943 | "the filename\n", __func__); | ||
944 | rc = -EINVAL; | ||
945 | goto out_free_unlock; | ||
946 | } | ||
947 | s->i++; | ||
948 | (*filename_size) = (s->block_aligned_filename_size - s->i); | ||
949 | if (!((*filename_size) > 0 && (*filename_size < PATH_MAX))) { | ||
950 | printk(KERN_WARNING "%s: Filename size is [%zd], which is " | ||
951 | "invalid\n", __func__, (*filename_size)); | ||
952 | rc = -EINVAL; | ||
953 | goto out_free_unlock; | ||
954 | } | ||
955 | (*filename) = kmalloc(((*filename_size) + 1), GFP_KERNEL); | ||
956 | if (!(*filename)) { | ||
957 | printk(KERN_ERR "%s: Out of memory whilst attempting to " | ||
958 | "kmalloc [%zd] bytes\n", __func__, | ||
959 | ((*filename_size) + 1)); | ||
960 | rc = -ENOMEM; | ||
961 | goto out_free_unlock; | ||
962 | } | ||
963 | memcpy((*filename), &s->decrypted_filename[s->i], (*filename_size)); | ||
964 | (*filename)[(*filename_size)] = '\0'; | ||
965 | out_free_unlock: | ||
966 | kfree(s->decrypted_filename); | ||
967 | out_unlock: | ||
968 | mutex_unlock(s->tfm_mutex); | ||
969 | out: | ||
970 | if (rc) { | ||
971 | (*packet_size) = 0; | ||
972 | (*filename_size) = 0; | ||
973 | (*filename) = NULL; | ||
974 | } | ||
975 | kfree(s); | ||
976 | return rc; | ||
977 | } | ||
978 | |||
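Putting the layout comment and the parsing steps together, a tag 70 packet is: one tag byte, a variable-length packet-size field, the FNEK signature (ECRYPTFS_SIG_SIZE bytes), one cipher-code byte, and the block-aligned ciphertext whose plaintext is random padding, a '\0' separator, and the filename. The sketch below walks that header in plain userspace C as a reading aid only; the length field is simplified to a single byte and the tag value and sizes are placeholders, not the on-disk format:

#include <stdio.h>

#define SIG_SIZE 8          /* stand-in for ECRYPTFS_SIG_SIZE */
#define TAG_70   0x46       /* illustrative tag value only    */

/* Walk the header of a tag-70-style packet:
 * [tag][length][signature][cipher code][block-aligned ciphertext]
 * The real length field is variable-width; one byte keeps the sketch short. */
static int parse_header(const unsigned char *p, size_t len)
{
	size_t i = 0;
	size_t body_len;

	if (len < 1 + 1 + SIG_SIZE + 1 + 1)
		return -1;                       /* too short to be valid */
	if (p[i++] != TAG_70)
		return -1;                       /* wrong tag byte */
	body_len = p[i++];                       /* sig + cipher + filename blob */
	if (1 + 1 + body_len > len)
		return -1;                       /* claimed size exceeds buffer */
	printf("sig: %.*s\n", SIG_SIZE, (const char *)&p[i]);
	i += SIG_SIZE;
	printf("cipher code: %d\n", p[i++]);
	printf("ciphertext bytes: %zu\n", body_len - SIG_SIZE - 1);
	return 0;
}

int main(void)
{
	unsigned char pkt[] = { TAG_70, 13, 'A','B','C','D','E','F','G','H', 7,
				0xde, 0xad, 0xbe, 0xef };
	return parse_header(pkt, sizeof(pkt)) ? 1 : 0;
}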
979 | static int | ||
406 | ecryptfs_get_auth_tok_sig(char **sig, struct ecryptfs_auth_tok *auth_tok) | 980 | ecryptfs_get_auth_tok_sig(char **sig, struct ecryptfs_auth_tok *auth_tok) |
407 | { | 981 | { |
408 | int rc = 0; | 982 | int rc = 0; |
@@ -897,30 +1471,6 @@ out: | |||
897 | return rc; | 1471 | return rc; |
898 | } | 1472 | } |
899 | 1473 | ||
900 | static int | ||
901 | ecryptfs_find_global_auth_tok_for_sig( | ||
902 | struct ecryptfs_global_auth_tok **global_auth_tok, | ||
903 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig) | ||
904 | { | ||
905 | struct ecryptfs_global_auth_tok *walker; | ||
906 | int rc = 0; | ||
907 | |||
908 | (*global_auth_tok) = NULL; | ||
909 | mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); | ||
910 | list_for_each_entry(walker, | ||
911 | &mount_crypt_stat->global_auth_tok_list, | ||
912 | mount_crypt_stat_list) { | ||
913 | if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) { | ||
914 | (*global_auth_tok) = walker; | ||
915 | goto out; | ||
916 | } | ||
917 | } | ||
918 | rc = -EINVAL; | ||
919 | out: | ||
920 | mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); | ||
921 | return rc; | ||
922 | } | ||
923 | |||
924 | /** | 1474 | /** |
925 | * ecryptfs_verify_version | 1475 | * ecryptfs_verify_version |
926 | * @version: The version number to confirm | 1476 | * @version: The version number to confirm |
@@ -990,43 +1540,6 @@ out: | |||
990 | } | 1540 | } |
991 | 1541 | ||
992 | /** | 1542 | /** |
993 | * ecryptfs_find_auth_tok_for_sig | ||
994 | * @auth_tok: Set to the matching auth_tok; NULL if not found | ||
995 | * @crypt_stat: inode crypt_stat crypto context | ||
996 | * @sig: Sig of auth_tok to find | ||
997 | * | ||
998 | * For now, this function simply looks at the registered auth_tok's | ||
999 | * linked off the mount_crypt_stat, so all the auth_toks that can be | ||
1000 | * used must be registered at mount time. This function could | ||
1001 | * potentially try a lot harder to find auth_tok's (e.g., by calling | ||
1002 | * out to ecryptfsd to dynamically retrieve an auth_tok object) so | ||
1003 | * that static registration of auth_tok's will no longer be necessary. | ||
1004 | * | ||
1005 | * Returns zero on no error; non-zero on error | ||
1006 | */ | ||
1007 | static int | ||
1008 | ecryptfs_find_auth_tok_for_sig( | ||
1009 | struct ecryptfs_auth_tok **auth_tok, | ||
1010 | struct ecryptfs_crypt_stat *crypt_stat, char *sig) | ||
1011 | { | ||
1012 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat = | ||
1013 | crypt_stat->mount_crypt_stat; | ||
1014 | struct ecryptfs_global_auth_tok *global_auth_tok; | ||
1015 | int rc = 0; | ||
1016 | |||
1017 | (*auth_tok) = NULL; | ||
1018 | if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok, | ||
1019 | mount_crypt_stat, sig)) { | ||
1020 | struct key *auth_tok_key; | ||
1021 | |||
1022 | rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok, | ||
1023 | sig); | ||
1024 | } else | ||
1025 | (*auth_tok) = global_auth_tok->global_auth_tok; | ||
1026 | return rc; | ||
1027 | } | ||
1028 | |||
1029 | /** | ||
1030 | * decrypt_passphrase_encrypted_session_key - Decrypt the session key with the given auth_tok. | 1543 | * decrypt_passphrase_encrypted_session_key - Decrypt the session key with the given auth_tok. |
1031 | * @auth_tok: The passphrase authentication token to use to encrypt the FEK | 1544 | * @auth_tok: The passphrase authentication token to use to encrypt the FEK |
1032 | * @crypt_stat: The cryptographic context | 1545 | * @crypt_stat: The cryptographic context |
@@ -1256,7 +1769,8 @@ find_next_matching_auth_tok: | |||
1256 | rc = -EINVAL; | 1769 | rc = -EINVAL; |
1257 | goto out_wipe_list; | 1770 | goto out_wipe_list; |
1258 | } | 1771 | } |
1259 | ecryptfs_find_auth_tok_for_sig(&matching_auth_tok, crypt_stat, | 1772 | ecryptfs_find_auth_tok_for_sig(&matching_auth_tok, |
1773 | crypt_stat->mount_crypt_stat, | ||
1260 | candidate_auth_tok_sig); | 1774 | candidate_auth_tok_sig); |
1261 | if (matching_auth_tok) { | 1775 | if (matching_auth_tok) { |
1262 | found_auth_tok = 1; | 1776 | found_auth_tok = 1; |
@@ -1336,7 +1850,9 @@ pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok, | |||
1336 | int rc; | 1850 | int rc; |
1337 | 1851 | ||
1338 | rc = write_tag_66_packet(auth_tok->token.private_key.signature, | 1852 | rc = write_tag_66_packet(auth_tok->token.private_key.signature, |
1339 | ecryptfs_code_for_cipher_string(crypt_stat), | 1853 | ecryptfs_code_for_cipher_string( |
1854 | crypt_stat->cipher, | ||
1855 | crypt_stat->key_size), | ||
1340 | crypt_stat, &payload, &payload_len); | 1856 | crypt_stat, &payload, &payload_len); |
1341 | if (rc) { | 1857 | if (rc) { |
1342 | ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n"); | 1858 | ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n"); |
@@ -1696,7 +2212,8 @@ encrypted_session_key_set: | |||
1696 | dest[(*packet_size)++] = 0x04; /* version 4 */ | 2212 | dest[(*packet_size)++] = 0x04; /* version 4 */ |
1697 | /* TODO: Break from RFC2440 so that arbitrary ciphers can be | 2213 | /* TODO: Break from RFC2440 so that arbitrary ciphers can be |
1698 | * specified with strings */ | 2214 | * specified with strings */ |
1699 | cipher_code = ecryptfs_code_for_cipher_string(crypt_stat); | 2215 | cipher_code = ecryptfs_code_for_cipher_string(crypt_stat->cipher, |
2216 | crypt_stat->key_size); | ||
1700 | if (cipher_code == 0) { | 2217 | if (cipher_code == 0) { |
1701 | ecryptfs_printk(KERN_WARNING, "Unable to generate code for " | 2218 | ecryptfs_printk(KERN_WARNING, "Unable to generate code for " |
1702 | "cipher [%s]\n", crypt_stat->cipher); | 2219 | "cipher [%s]\n", crypt_stat->cipher); |
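Both call sites in this file now pass the cipher name and key size separately because ecryptfs_code_for_cipher_string() no longer takes the whole crypt_stat; the same cipher name maps to different RFC 2440 codes depending on key length (OpenPGP gives AES-128/192/256 distinct IDs). A hedged sketch of that kind of lookup, using the public OpenPGP symmetric-algorithm IDs rather than eCryptfs's actual table:

#include <stdio.h>
#include <string.h>

/* Map a cipher name plus key size to an OpenPGP (RFC 2440/4880)
 * symmetric-algorithm code.  Only a few entries are shown. */
static unsigned char code_for_cipher(const char *name, size_t key_bytes)
{
	if (strcmp(name, "aes") == 0) {
		if (key_bytes == 16)
			return 7;   /* AES-128 */
		if (key_bytes == 24)
			return 8;   /* AES-192 */
		if (key_bytes == 32)
			return 9;   /* AES-256 */
	}
	if (strcmp(name, "blowfish") == 0)
		return 4;           /* Blowfish */
	return 0;                   /* 0 means "no code", as the caller above checks */
}

int main(void)
{
	printf("aes/16 -> %u\n", code_for_cipher("aes", 16));
	printf("aes/32 -> %u\n", code_for_cipher("aes", 32));
	return 0;
}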
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index fd630713c5c7..789cf2e1be1e 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c | |||
@@ -206,7 +206,9 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, | |||
206 | ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher, | 206 | ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher, |
207 | ecryptfs_opt_ecryptfs_key_bytes, | 207 | ecryptfs_opt_ecryptfs_key_bytes, |
208 | ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, | 208 | ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, |
209 | ecryptfs_opt_encrypted_view, ecryptfs_opt_err }; | 209 | ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, |
210 | ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, | ||
211 | ecryptfs_opt_err }; | ||
210 | 212 | ||
211 | static const match_table_t tokens = { | 213 | static const match_table_t tokens = { |
212 | {ecryptfs_opt_sig, "sig=%s"}, | 214 | {ecryptfs_opt_sig, "sig=%s"}, |
@@ -217,6 +219,9 @@ static const match_table_t tokens = { | |||
217 | {ecryptfs_opt_passthrough, "ecryptfs_passthrough"}, | 219 | {ecryptfs_opt_passthrough, "ecryptfs_passthrough"}, |
218 | {ecryptfs_opt_xattr_metadata, "ecryptfs_xattr_metadata"}, | 220 | {ecryptfs_opt_xattr_metadata, "ecryptfs_xattr_metadata"}, |
219 | {ecryptfs_opt_encrypted_view, "ecryptfs_encrypted_view"}, | 221 | {ecryptfs_opt_encrypted_view, "ecryptfs_encrypted_view"}, |
222 | {ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"}, | ||
223 | {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"}, | ||
224 | {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, | ||
220 | {ecryptfs_opt_err, NULL} | 225 | {ecryptfs_opt_err, NULL} |
221 | }; | 226 | }; |
222 | 227 | ||
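The three new tokens give filename encryption its own mount-time knobs: ecryptfs_fnek_sig selects the filename encryption key by signature, while ecryptfs_fn_cipher and ecryptfs_fn_key_bytes optionally pick a cipher and key size for filenames distinct from the content cipher. An options string would look roughly like ecryptfs_sig=...,ecryptfs_fnek_sig=...,ecryptfs_fn_cipher=aes,ecryptfs_fn_key_bytes=16 (the signature values here are placeholders); as the parse_options() changes below show, the filename cipher and key size fall back to the content-cipher defaults when the last two options are omitted.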
@@ -281,8 +286,11 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) | |||
281 | int rc = 0; | 286 | int rc = 0; |
282 | int sig_set = 0; | 287 | int sig_set = 0; |
283 | int cipher_name_set = 0; | 288 | int cipher_name_set = 0; |
289 | int fn_cipher_name_set = 0; | ||
284 | int cipher_key_bytes; | 290 | int cipher_key_bytes; |
285 | int cipher_key_bytes_set = 0; | 291 | int cipher_key_bytes_set = 0; |
292 | int fn_cipher_key_bytes; | ||
293 | int fn_cipher_key_bytes_set = 0; | ||
286 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat = | 294 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat = |
287 | &ecryptfs_superblock_to_private(sb)->mount_crypt_stat; | 295 | &ecryptfs_superblock_to_private(sb)->mount_crypt_stat; |
288 | substring_t args[MAX_OPT_ARGS]; | 296 | substring_t args[MAX_OPT_ARGS]; |
@@ -290,7 +298,12 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) | |||
290 | char *sig_src; | 298 | char *sig_src; |
291 | char *cipher_name_dst; | 299 | char *cipher_name_dst; |
292 | char *cipher_name_src; | 300 | char *cipher_name_src; |
301 | char *fn_cipher_name_dst; | ||
302 | char *fn_cipher_name_src; | ||
303 | char *fnek_dst; | ||
304 | char *fnek_src; | ||
293 | char *cipher_key_bytes_src; | 305 | char *cipher_key_bytes_src; |
306 | char *fn_cipher_key_bytes_src; | ||
294 | 307 | ||
295 | if (!options) { | 308 | if (!options) { |
296 | rc = -EINVAL; | 309 | rc = -EINVAL; |
@@ -322,10 +335,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) | |||
322 | global_default_cipher_name; | 335 | global_default_cipher_name; |
323 | strncpy(cipher_name_dst, cipher_name_src, | 336 | strncpy(cipher_name_dst, cipher_name_src, |
324 | ECRYPTFS_MAX_CIPHER_NAME_SIZE); | 337 | ECRYPTFS_MAX_CIPHER_NAME_SIZE); |
325 | ecryptfs_printk(KERN_DEBUG, | 338 | cipher_name_dst[ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0'; |
326 | "The mount_crypt_stat " | ||
327 | "global_default_cipher_name set to: " | ||
328 | "[%s]\n", cipher_name_dst); | ||
329 | cipher_name_set = 1; | 339 | cipher_name_set = 1; |
330 | break; | 340 | break; |
331 | case ecryptfs_opt_ecryptfs_key_bytes: | 341 | case ecryptfs_opt_ecryptfs_key_bytes: |
@@ -335,11 +345,6 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) | |||
335 | &cipher_key_bytes_src, 0); | 345 | &cipher_key_bytes_src, 0); |
336 | mount_crypt_stat->global_default_cipher_key_size = | 346 | mount_crypt_stat->global_default_cipher_key_size = |
337 | cipher_key_bytes; | 347 | cipher_key_bytes; |
338 | ecryptfs_printk(KERN_DEBUG, | ||
339 | "The mount_crypt_stat " | ||
340 | "global_default_cipher_key_size " | ||
341 | "set to: [%d]\n", mount_crypt_stat-> | ||
342 | global_default_cipher_key_size); | ||
343 | cipher_key_bytes_set = 1; | 348 | cipher_key_bytes_set = 1; |
344 | break; | 349 | break; |
345 | case ecryptfs_opt_passthrough: | 350 | case ecryptfs_opt_passthrough: |
@@ -356,11 +361,51 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) | |||
356 | mount_crypt_stat->flags |= | 361 | mount_crypt_stat->flags |= |
357 | ECRYPTFS_ENCRYPTED_VIEW_ENABLED; | 362 | ECRYPTFS_ENCRYPTED_VIEW_ENABLED; |
358 | break; | 363 | break; |
364 | case ecryptfs_opt_fnek_sig: | ||
365 | fnek_src = args[0].from; | ||
366 | fnek_dst = | ||
367 | mount_crypt_stat->global_default_fnek_sig; | ||
368 | strncpy(fnek_dst, fnek_src, ECRYPTFS_SIG_SIZE_HEX); | ||
369 | mount_crypt_stat->global_default_fnek_sig[ | ||
370 | ECRYPTFS_SIG_SIZE_HEX] = '\0'; | ||
371 | rc = ecryptfs_add_global_auth_tok( | ||
372 | mount_crypt_stat, | ||
373 | mount_crypt_stat->global_default_fnek_sig); | ||
374 | if (rc) { | ||
375 | printk(KERN_ERR "Error attempting to register " | ||
376 | "global fnek sig [%s]; rc = [%d]\n", | ||
377 | mount_crypt_stat->global_default_fnek_sig, | ||
378 | rc); | ||
379 | goto out; | ||
380 | } | ||
381 | mount_crypt_stat->flags |= | ||
382 | (ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES | ||
383 | | ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK); | ||
384 | break; | ||
385 | case ecryptfs_opt_fn_cipher: | ||
386 | fn_cipher_name_src = args[0].from; | ||
387 | fn_cipher_name_dst = | ||
388 | mount_crypt_stat->global_default_fn_cipher_name; | ||
389 | strncpy(fn_cipher_name_dst, fn_cipher_name_src, | ||
390 | ECRYPTFS_MAX_CIPHER_NAME_SIZE); | ||
391 | mount_crypt_stat->global_default_fn_cipher_name[ | ||
392 | ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0'; | ||
393 | fn_cipher_name_set = 1; | ||
394 | break; | ||
395 | case ecryptfs_opt_fn_cipher_key_bytes: | ||
396 | fn_cipher_key_bytes_src = args[0].from; | ||
397 | fn_cipher_key_bytes = | ||
398 | (int)simple_strtol(fn_cipher_key_bytes_src, | ||
399 | &fn_cipher_key_bytes_src, 0); | ||
400 | mount_crypt_stat->global_default_fn_cipher_key_bytes = | ||
401 | fn_cipher_key_bytes; | ||
402 | fn_cipher_key_bytes_set = 1; | ||
403 | break; | ||
359 | case ecryptfs_opt_err: | 404 | case ecryptfs_opt_err: |
360 | default: | 405 | default: |
361 | ecryptfs_printk(KERN_WARNING, | 406 | printk(KERN_WARNING |
362 | "eCryptfs: unrecognized option '%s'\n", | 407 | "%s: eCryptfs: unrecognized option [%s]\n", |
363 | p); | 408 | __func__, p); |
364 | } | 409 | } |
365 | } | 410 | } |
366 | if (!sig_set) { | 411 | if (!sig_set) { |
@@ -374,33 +419,60 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) | |||
374 | int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER); | 419 | int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER); |
375 | 420 | ||
376 | BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE); | 421 | BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE); |
377 | |||
378 | strcpy(mount_crypt_stat->global_default_cipher_name, | 422 | strcpy(mount_crypt_stat->global_default_cipher_name, |
379 | ECRYPTFS_DEFAULT_CIPHER); | 423 | ECRYPTFS_DEFAULT_CIPHER); |
380 | } | 424 | } |
381 | if (!cipher_key_bytes_set) { | 425 | if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) |
426 | && !fn_cipher_name_set) | ||
427 | strcpy(mount_crypt_stat->global_default_fn_cipher_name, | ||
428 | mount_crypt_stat->global_default_cipher_name); | ||
429 | if (!cipher_key_bytes_set) | ||
382 | mount_crypt_stat->global_default_cipher_key_size = 0; | 430 | mount_crypt_stat->global_default_cipher_key_size = 0; |
383 | } | 431 | if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) |
432 | && !fn_cipher_key_bytes_set) | ||
433 | mount_crypt_stat->global_default_fn_cipher_key_bytes = | ||
434 | mount_crypt_stat->global_default_cipher_key_size; | ||
384 | mutex_lock(&key_tfm_list_mutex); | 435 | mutex_lock(&key_tfm_list_mutex); |
385 | if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name, | 436 | if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name, |
386 | NULL)) | 437 | NULL)) { |
387 | rc = ecryptfs_add_new_key_tfm( | 438 | rc = ecryptfs_add_new_key_tfm( |
388 | NULL, mount_crypt_stat->global_default_cipher_name, | 439 | NULL, mount_crypt_stat->global_default_cipher_name, |
389 | mount_crypt_stat->global_default_cipher_key_size); | 440 | mount_crypt_stat->global_default_cipher_key_size); |
390 | mutex_unlock(&key_tfm_list_mutex); | 441 | if (rc) { |
391 | if (rc) { | 442 | printk(KERN_ERR "Error attempting to initialize " |
392 | printk(KERN_ERR "Error attempting to initialize cipher with " | 443 | "cipher with name = [%s] and key size = [%td]; " |
393 | "name = [%s] and key size = [%td]; rc = [%d]\n", | 444 | "rc = [%d]\n", |
394 | mount_crypt_stat->global_default_cipher_name, | 445 | mount_crypt_stat->global_default_cipher_name, |
395 | mount_crypt_stat->global_default_cipher_key_size, rc); | 446 | mount_crypt_stat->global_default_cipher_key_size, |
396 | rc = -EINVAL; | 447 | rc); |
397 | goto out; | 448 | rc = -EINVAL; |
449 | mutex_unlock(&key_tfm_list_mutex); | ||
450 | goto out; | ||
451 | } | ||
398 | } | 452 | } |
453 | if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) | ||
454 | && !ecryptfs_tfm_exists( | ||
455 | mount_crypt_stat->global_default_fn_cipher_name, NULL)) { | ||
456 | rc = ecryptfs_add_new_key_tfm( | ||
457 | NULL, mount_crypt_stat->global_default_fn_cipher_name, | ||
458 | mount_crypt_stat->global_default_fn_cipher_key_bytes); | ||
459 | if (rc) { | ||
460 | printk(KERN_ERR "Error attempting to initialize " | ||
461 | "cipher with name = [%s] and key size = [%td]; " | ||
462 | "rc = [%d]\n", | ||
463 | mount_crypt_stat->global_default_fn_cipher_name, | ||
464 | mount_crypt_stat->global_default_fn_cipher_key_bytes, | ||
465 | rc); | ||
466 | rc = -EINVAL; | ||
467 | mutex_unlock(&key_tfm_list_mutex); | ||
468 | goto out; | ||
469 | } | ||
470 | } | ||
471 | mutex_unlock(&key_tfm_list_mutex); | ||
399 | rc = ecryptfs_init_global_auth_toks(mount_crypt_stat); | 472 | rc = ecryptfs_init_global_auth_toks(mount_crypt_stat); |
400 | if (rc) { | 473 | if (rc) |
401 | printk(KERN_WARNING "One or more global auth toks could not " | 474 | printk(KERN_WARNING "One or more global auth toks could not " |
402 | "properly register; rc = [%d]\n", rc); | 475 | "properly register; rc = [%d]\n", rc); |
403 | } | ||
404 | out: | 476 | out: |
405 | return rc; | 477 | return rc; |
406 | } | 478 | } |
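A small but recurring detail in the option handling above: every strncpy() into a fixed-size name or signature buffer is followed by an explicit terminator store at the buffer's last legal index, because strncpy() does not NUL-terminate when the source is at least as long as the count. The pattern in isolation, with an illustrative buffer size:

#include <stdio.h>
#include <string.h>

#define NAME_SIZE 8

int main(void)
{
	/* Destination reserves one extra byte for the terminator, mirroring
	 * cipher_name_dst[ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0' above. */
	char dst[NAME_SIZE + 1];
	const char *src = "a-cipher-name-longer-than-the-buffer";

	strncpy(dst, src, NAME_SIZE);
	dst[NAME_SIZE] = '\0';          /* strncpy() may leave dst unterminated */
	printf("%s\n", dst);            /* safe to print now */
	return 0;
}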
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 6913f727624d..96ef51489e01 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c | |||
@@ -193,7 +193,7 @@ ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid, | |||
193 | (*daemon) = kzalloc(sizeof(**daemon), GFP_KERNEL); | 193 | (*daemon) = kzalloc(sizeof(**daemon), GFP_KERNEL); |
194 | if (!(*daemon)) { | 194 | if (!(*daemon)) { |
195 | rc = -ENOMEM; | 195 | rc = -ENOMEM; |
196 | printk(KERN_ERR "%s: Failed to allocate [%Zd] bytes of " | 196 | printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of " |
197 | "GFP_KERNEL memory\n", __func__, sizeof(**daemon)); | 197 | "GFP_KERNEL memory\n", __func__, sizeof(**daemon)); |
198 | goto out; | 198 | goto out; |
199 | } | 199 | } |
@@ -435,7 +435,7 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid, | |||
435 | msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL); | 435 | msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL); |
436 | if (!msg_ctx->msg) { | 436 | if (!msg_ctx->msg) { |
437 | rc = -ENOMEM; | 437 | rc = -ENOMEM; |
438 | printk(KERN_ERR "%s: Failed to allocate [%Zd] bytes of " | 438 | printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of " |
439 | "GFP_KERNEL memory\n", __func__, msg_size); | 439 | "GFP_KERNEL memory\n", __func__, msg_size); |
440 | goto unlock; | 440 | goto unlock; |
441 | } | 441 | } |
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index efd95a0ed1ea..a67fea655f49 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c | |||
@@ -199,7 +199,7 @@ int ecryptfs_send_miscdev(char *data, size_t data_size, | |||
199 | if (!msg_ctx->msg) { | 199 | if (!msg_ctx->msg) { |
200 | rc = -ENOMEM; | 200 | rc = -ENOMEM; |
201 | printk(KERN_ERR "%s: Out of memory whilst attempting " | 201 | printk(KERN_ERR "%s: Out of memory whilst attempting " |
202 | "to kmalloc(%Zd, GFP_KERNEL)\n", __func__, | 202 | "to kmalloc(%zd, GFP_KERNEL)\n", __func__, |
203 | (sizeof(*msg_ctx->msg) + data_size)); | 203 | (sizeof(*msg_ctx->msg) + data_size)); |
204 | goto out_unlock; | 204 | goto out_unlock; |
205 | } | 205 | } |
@@ -322,7 +322,7 @@ check_list: | |||
322 | if (count < total_length) { | 322 | if (count < total_length) { |
323 | rc = 0; | 323 | rc = 0; |
324 | printk(KERN_WARNING "%s: Only given user buffer of " | 324 | printk(KERN_WARNING "%s: Only given user buffer of " |
325 | "size [%Zd], but we need [%Zd] to read the " | 325 | "size [%zd], but we need [%zd] to read the " |
326 | "pending message\n", __func__, count, total_length); | 326 | "pending message\n", __func__, count, total_length); |
327 | goto out_unlock_msg_ctx; | 327 | goto out_unlock_msg_ctx; |
328 | } | 328 | } |
@@ -376,7 +376,7 @@ static int ecryptfs_miscdev_response(char *data, size_t data_size, | |||
376 | 376 | ||
377 | if ((sizeof(*msg) + msg->data_len) != data_size) { | 377 | if ((sizeof(*msg) + msg->data_len) != data_size) { |
378 | printk(KERN_WARNING "%s: (sizeof(*msg) + msg->data_len) = " | 378 | printk(KERN_WARNING "%s: (sizeof(*msg) + msg->data_len) = " |
379 | "[%Zd]; data_size = [%Zd]. Invalid packet.\n", __func__, | 379 | "[%zd]; data_size = [%zd]. Invalid packet.\n", __func__, |
380 | (sizeof(*msg) + msg->data_len), data_size); | 380 | (sizeof(*msg) + msg->data_len), data_size); |
381 | rc = -EINVAL; | 381 | rc = -EINVAL; |
382 | goto out; | 382 | goto out; |
@@ -421,7 +421,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, | |||
421 | data = kmalloc(count, GFP_KERNEL); | 421 | data = kmalloc(count, GFP_KERNEL); |
422 | if (!data) { | 422 | if (!data) { |
423 | printk(KERN_ERR "%s: Out of memory whilst attempting to " | 423 | printk(KERN_ERR "%s: Out of memory whilst attempting to " |
424 | "kmalloc([%Zd], GFP_KERNEL)\n", __func__, count); | 424 | "kmalloc([%zd], GFP_KERNEL)\n", __func__, count); |
425 | goto out; | 425 | goto out; |
426 | } | 426 | } |
427 | rc = copy_from_user(data, buf, count); | 427 | rc = copy_from_user(data, buf, count); |
@@ -436,8 +436,8 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, | |||
436 | case ECRYPTFS_MSG_RESPONSE: | 436 | case ECRYPTFS_MSG_RESPONSE: |
437 | if (count < (1 + 4 + 1 + sizeof(struct ecryptfs_message))) { | 437 | if (count < (1 + 4 + 1 + sizeof(struct ecryptfs_message))) { |
438 | printk(KERN_WARNING "%s: Minimum acceptable packet " | 438 | printk(KERN_WARNING "%s: Minimum acceptable packet " |
439 | "size is [%Zd], but amount of data written is " | 439 | "size is [%zd], but amount of data written is " |
440 | "only [%Zd]. Discarding response packet.\n", | 440 | "only [%zd]. Discarding response packet.\n", |
441 | __func__, | 441 | __func__, |
442 | (1 + 4 + 1 + sizeof(struct ecryptfs_message)), | 442 | (1 + 4 + 1 + sizeof(struct ecryptfs_message)), |
443 | count); | 443 | count); |
@@ -455,9 +455,9 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, | |||
455 | } | 455 | } |
456 | i += packet_size_length; | 456 | i += packet_size_length; |
457 | if ((1 + 4 + packet_size_length + packet_size) != count) { | 457 | if ((1 + 4 + packet_size_length + packet_size) != count) { |
458 | printk(KERN_WARNING "%s: (1 + packet_size_length([%Zd])" | 458 | printk(KERN_WARNING "%s: (1 + packet_size_length([%zd])" |
459 | " + packet_size([%Zd]))([%Zd]) != " | 459 | " + packet_size([%zd]))([%zd]) != " |
460 | "count([%Zd]). Invalid packet format.\n", | 460 | "count([%zd]). Invalid packet format.\n", |
461 | __func__, packet_size_length, packet_size, | 461 | __func__, packet_size_length, packet_size, |
462 | (1 + packet_size_length + packet_size), count); | 462 | (1 + packet_size_length + packet_size), count); |
463 | goto out_free; | 463 | goto out_free; |
diff --git a/fs/exec.c b/fs/exec.c --- a/fs/exec.c +++ b/fs/exec.c | |||
@@ -51,6 +51,7 @@ | |||
51 | #include <linux/audit.h> | 51 | #include <linux/audit.h> |
52 | #include <linux/tracehook.h> | 52 | #include <linux/tracehook.h> |
53 | #include <linux/kmod.h> | 53 | #include <linux/kmod.h> |
54 | #include <linux/fsnotify.h> | ||
54 | 55 | ||
55 | #include <asm/uaccess.h> | 56 | #include <asm/uaccess.h> |
56 | #include <asm/mmu_context.h> | 57 | #include <asm/mmu_context.h> |
@@ -132,6 +133,8 @@ asmlinkage long sys_uselib(const char __user * library) | |||
132 | if (IS_ERR(file)) | 133 | if (IS_ERR(file)) |
133 | goto out; | 134 | goto out; |
134 | 135 | ||
136 | fsnotify_open(file->f_path.dentry); | ||
137 | |||
135 | error = -ENOEXEC; | 138 | error = -ENOEXEC; |
136 | if(file->f_op) { | 139 | if(file->f_op) { |
137 | struct linux_binfmt * fmt; | 140 | struct linux_binfmt * fmt; |
@@ -229,13 +232,13 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, | |||
229 | 232 | ||
230 | static int __bprm_mm_init(struct linux_binprm *bprm) | 233 | static int __bprm_mm_init(struct linux_binprm *bprm) |
231 | { | 234 | { |
232 | int err = -ENOMEM; | 235 | int err; |
233 | struct vm_area_struct *vma = NULL; | 236 | struct vm_area_struct *vma = NULL; |
234 | struct mm_struct *mm = bprm->mm; | 237 | struct mm_struct *mm = bprm->mm; |
235 | 238 | ||
236 | bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); | 239 | bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); |
237 | if (!vma) | 240 | if (!vma) |
238 | goto err; | 241 | return -ENOMEM; |
239 | 242 | ||
240 | down_write(&mm->mmap_sem); | 243 | down_write(&mm->mmap_sem); |
241 | vma->vm_mm = mm; | 244 | vma->vm_mm = mm; |
@@ -248,28 +251,20 @@ static int __bprm_mm_init(struct linux_binprm *bprm) | |||
248 | */ | 251 | */ |
249 | vma->vm_end = STACK_TOP_MAX; | 252 | vma->vm_end = STACK_TOP_MAX; |
250 | vma->vm_start = vma->vm_end - PAGE_SIZE; | 253 | vma->vm_start = vma->vm_end - PAGE_SIZE; |
251 | |||
252 | vma->vm_flags = VM_STACK_FLAGS; | 254 | vma->vm_flags = VM_STACK_FLAGS; |
253 | vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); | 255 | vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); |
254 | err = insert_vm_struct(mm, vma); | 256 | err = insert_vm_struct(mm, vma); |
255 | if (err) { | 257 | if (err) |
256 | up_write(&mm->mmap_sem); | ||
257 | goto err; | 258 | goto err; |
258 | } | ||
259 | 259 | ||
260 | mm->stack_vm = mm->total_vm = 1; | 260 | mm->stack_vm = mm->total_vm = 1; |
261 | up_write(&mm->mmap_sem); | 261 | up_write(&mm->mmap_sem); |
262 | |||
263 | bprm->p = vma->vm_end - sizeof(void *); | 262 | bprm->p = vma->vm_end - sizeof(void *); |
264 | |||
265 | return 0; | 263 | return 0; |
266 | |||
267 | err: | 264 | err: |
268 | if (vma) { | 265 | up_write(&mm->mmap_sem); |
269 | bprm->vma = NULL; | 266 | bprm->vma = NULL; |
270 | kmem_cache_free(vm_area_cachep, vma); | 267 | kmem_cache_free(vm_area_cachep, vma); |
271 | } | ||
272 | |||
273 | return err; | 268 | return err; |
274 | } | 269 | } |
275 | 270 | ||
@@ -684,6 +679,8 @@ struct file *open_exec(const char *name) | |||
684 | if (IS_ERR(file)) | 679 | if (IS_ERR(file)) |
685 | return file; | 680 | return file; |
686 | 681 | ||
682 | fsnotify_open(file->f_path.dentry); | ||
683 | |||
687 | err = deny_write_access(file); | 684 | err = deny_write_access(file); |
688 | if (err) { | 685 | if (err) { |
689 | fput(file); | 686 | fput(file); |
@@ -1689,7 +1686,7 @@ int get_dumpable(struct mm_struct *mm) | |||
1689 | return (ret >= 2) ? 2 : ret; | 1686 | return (ret >= 2) ? 2 : ret; |
1690 | } | 1687 | } |
1691 | 1688 | ||
1692 | int do_coredump(long signr, int exit_code, struct pt_regs * regs) | 1689 | void do_coredump(long signr, int exit_code, struct pt_regs *regs) |
1693 | { | 1690 | { |
1694 | struct core_state core_state; | 1691 | struct core_state core_state; |
1695 | char corename[CORENAME_MAX_SIZE + 1]; | 1692 | char corename[CORENAME_MAX_SIZE + 1]; |
@@ -1773,6 +1770,11 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) | |||
1773 | 1770 | ||
1774 | if (ispipe) { | 1771 | if (ispipe) { |
1775 | helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc); | 1772 | helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc); |
1773 | if (!helper_argv) { | ||
1774 | printk(KERN_WARNING "%s failed to allocate memory\n", | ||
1775 | __func__); | ||
1776 | goto fail_unlock; | ||
1777 | } | ||
1776 | /* Terminate the string before the first option */ | 1778 | /* Terminate the string before the first option */ |
1777 | delimit = strchr(corename, ' '); | 1779 | delimit = strchr(corename, ' '); |
1778 | if (delimit) | 1780 | if (delimit) |
@@ -1840,5 +1842,5 @@ fail_unlock: | |||
1840 | put_cred(cred); | 1842 | put_cred(cred); |
1841 | coredump_finish(mm); | 1843 | coredump_finish(mm); |
1842 | fail: | 1844 | fail: |
1843 | return retval; | 1845 | return; |
1844 | } | 1846 | } |
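The two fsnotify_open() calls added above mean the exec path now raises the same open notification as a normal open(): both the main executable opened in open_exec() and libraries loaded through sys_uselib() become visible to watchers such as inotify. A minimal userspace observer that would pick up those events (the watched path is only an example, and error handling is trimmed):

#include <stdio.h>
#include <unistd.h>
#include <sys/inotify.h>

int main(void)
{
	char buf[4096];
	int fd = inotify_init();
	if (fd < 0)
		return 1;
	/* Watch a binary for open events; exec'ing it now also triggers IN_OPEN. */
	if (inotify_add_watch(fd, "/bin/true", IN_OPEN) < 0)
		return 1;
	ssize_t len = read(fd, buf, sizeof(buf));   /* blocks until an event */
	printf("got %zd bytes of inotify events\n", len);
	close(fd);
	return 0;
}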
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index c454d5db28a5..66321a877e74 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c | |||
@@ -565,12 +565,8 @@ got: | |||
565 | inode->i_blocks = 0; | 565 | inode->i_blocks = 0; |
566 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; | 566 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; |
567 | memset(ei->i_data, 0, sizeof(ei->i_data)); | 567 | memset(ei->i_data, 0, sizeof(ei->i_data)); |
568 | ei->i_flags = EXT2_I(dir)->i_flags & ~EXT2_BTREE_FL; | 568 | ei->i_flags = |
569 | if (S_ISLNK(mode)) | 569 | ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED); |
570 | ei->i_flags &= ~(EXT2_IMMUTABLE_FL|EXT2_APPEND_FL); | ||
571 | /* dirsync is only applied to directories */ | ||
572 | if (!S_ISDIR(mode)) | ||
573 | ei->i_flags &= ~EXT2_DIRSYNC_FL; | ||
574 | ei->i_faddr = 0; | 570 | ei->i_faddr = 0; |
575 | ei->i_frag_no = 0; | 571 | ei->i_frag_no = 0; |
576 | ei->i_frag_size = 0; | 572 | ei->i_frag_size = 0; |
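The open-coded special cases (clearing IMMUTABLE and APPEND on symlinks, clearing DIRSYNC on non-directories) are replaced by one mask helper keyed on the inode type; ext3 gets the identical treatment further down, and the ioctl paths use the same helper when userspace sets flags. The helper's body is not visible in these hunks; a plausible shape, mirroring how ext4 does the same masking, with assumed mask values:

#include <sys/types.h>
#include <sys/stat.h>

/* Sketch of a type-dependent flag mask in the style of ext4_mask_flags();
 * the mask constants are assumptions, not the real ext2 values. */
#define FLMASK_REG   0x000000FF   /* flags meaningful for regular files */
#define FLMASK_OTHER 0x0000000F   /* flags meaningful for links, devices, ... */

static unsigned int mask_flags(mode_t mode, unsigned int flags)
{
	if (S_ISDIR(mode))
		return flags;                 /* directories may carry any flag */
	if (S_ISREG(mode))
		return flags & FLMASK_REG;
	return flags & FLMASK_OTHER;          /* symlinks, devices, fifos, sockets */
}

int main(void)
{
	return mask_flags(S_IFREG | 0644, 0x12345678) != 0x78;
}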
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 02b39a5deb74..23fff2f87783 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c | |||
@@ -498,8 +498,6 @@ static int ext2_alloc_branch(struct inode *inode, | |||
498 | * ext2_splice_branch - splice the allocated branch onto inode. | 498 | * ext2_splice_branch - splice the allocated branch onto inode. |
499 | * @inode: owner | 499 | * @inode: owner |
500 | * @block: (logical) number of block we are adding | 500 | * @block: (logical) number of block we are adding |
501 | * @chain: chain of indirect blocks (with a missing link - see | ||
502 | * ext2_alloc_branch) | ||
503 | * @where: location of missing link | 501 | * @where: location of missing link |
504 | * @num: number of indirect blocks we are adding | 502 | * @num: number of indirect blocks we are adding |
505 | * @blks: number of direct blocks we are adding | 503 | * @blks: number of direct blocks we are adding |
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index de876fa793e1..7cb4badef927 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c | |||
@@ -50,8 +50,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) | |||
50 | goto setflags_out; | 50 | goto setflags_out; |
51 | } | 51 | } |
52 | 52 | ||
53 | if (!S_ISDIR(inode->i_mode)) | 53 | flags = ext2_mask_flags(inode->i_mode, flags); |
54 | flags &= ~EXT2_DIRSYNC_FL; | ||
55 | 54 | ||
56 | mutex_lock(&inode->i_mutex); | 55 | mutex_lock(&inode->i_mutex); |
57 | /* Is it quota file? Do not allow user to mess with it */ | 56 | /* Is it quota file? Do not allow user to mess with it */ |
diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 647cd888ac87..da8bdeaa2e6d 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c | |||
@@ -132,6 +132,7 @@ static void ext2_put_super (struct super_block * sb) | |||
132 | percpu_counter_destroy(&sbi->s_dirs_counter); | 132 | percpu_counter_destroy(&sbi->s_dirs_counter); |
133 | brelse (sbi->s_sbh); | 133 | brelse (sbi->s_sbh); |
134 | sb->s_fs_info = NULL; | 134 | sb->s_fs_info = NULL; |
135 | kfree(sbi->s_blockgroup_lock); | ||
135 | kfree(sbi); | 136 | kfree(sbi); |
136 | 137 | ||
137 | return; | 138 | return; |
@@ -756,6 +757,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) | |||
756 | sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); | 757 | sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); |
757 | if (!sbi) | 758 | if (!sbi) |
758 | return -ENOMEM; | 759 | return -ENOMEM; |
760 | |||
761 | sbi->s_blockgroup_lock = | ||
762 | kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); | ||
763 | if (!sbi->s_blockgroup_lock) { | ||
764 | kfree(sbi); | ||
765 | return -ENOMEM; | ||
766 | } | ||
759 | sb->s_fs_info = sbi; | 767 | sb->s_fs_info = sbi; |
760 | sbi->s_sb_block = sb_block; | 768 | sbi->s_sb_block = sb_block; |
761 | 769 | ||
@@ -983,7 +991,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) | |||
983 | printk ("EXT2-fs: not enough memory\n"); | 991 | printk ("EXT2-fs: not enough memory\n"); |
984 | goto failed_mount; | 992 | goto failed_mount; |
985 | } | 993 | } |
986 | bgl_lock_init(&sbi->s_blockgroup_lock); | 994 | bgl_lock_init(sbi->s_blockgroup_lock); |
987 | sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL); | 995 | sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL); |
988 | if (!sbi->s_debts) { | 996 | if (!sbi->s_debts) { |
989 | printk ("EXT2-fs: not enough memory\n"); | 997 | printk ("EXT2-fs: not enough memory\n"); |
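s_blockgroup_lock moves out of ext2_sb_info into its own kzalloc'd object (ext3 below receives the identical change): struct blockgroup_lock holds an array of spinlocks and can be sizeable on large-NR_CPUS builds, so splitting it out keeps the superblock-info allocation small; the matching kfree() lands in ext2_put_super above and bgl_lock_init() now takes a pointer. The failure ordering matters: if the second allocation fails, the first has to be undone, as in this reduced sketch with generic names:

#include <stdlib.h>

struct inner { int locks[128]; };          /* stands in for blockgroup_lock */
struct info  { struct inner *inner; int other_state; };

/* Allocate the large sub-object separately; if the second allocation
 * fails, free the first before reporting failure. */
static struct info *info_alloc(void)
{
	struct info *sbi = calloc(1, sizeof(*sbi));
	if (!sbi)
		return NULL;
	sbi->inner = calloc(1, sizeof(*sbi->inner));
	if (!sbi->inner) {
		free(sbi);
		return NULL;
	}
	return sbi;
}

static void info_free(struct info *sbi)
{
	free(sbi->inner);       /* mirrors the kfree() added to put_super */
	free(sbi);
}

int main(void)
{
	struct info *sbi = info_alloc();
	if (!sbi)
		return 1;
	info_free(sbi);
	return 0;
}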
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 5655fbcbd11f..8de6c720e510 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c | |||
@@ -559,12 +559,8 @@ got: | |||
559 | ei->i_dir_start_lookup = 0; | 559 | ei->i_dir_start_lookup = 0; |
560 | ei->i_disksize = 0; | 560 | ei->i_disksize = 0; |
561 | 561 | ||
562 | ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; | 562 | ei->i_flags = |
563 | if (S_ISLNK(mode)) | 563 | ext3_mask_flags(mode, EXT3_I(dir)->i_flags & EXT3_FL_INHERITED); |
564 | ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); | ||
565 | /* dirsync only applies to directories */ | ||
566 | if (!S_ISDIR(mode)) | ||
567 | ei->i_flags &= ~EXT3_DIRSYNC_FL; | ||
568 | #ifdef EXT3_FRAGMENTS | 564 | #ifdef EXT3_FRAGMENTS |
569 | ei->i_faddr = 0; | 565 | ei->i_faddr = 0; |
570 | ei->i_frag_no = 0; | 566 | ei->i_frag_no = 0; |
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index b7394d05ee8e..5e86ce9a86e0 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c | |||
@@ -53,8 +53,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, | |||
53 | goto flags_out; | 53 | goto flags_out; |
54 | } | 54 | } |
55 | 55 | ||
56 | if (!S_ISDIR(inode->i_mode)) | 56 | flags = ext3_mask_flags(inode->i_mode, flags); |
57 | flags &= ~EXT3_DIRSYNC_FL; | ||
58 | 57 | ||
59 | mutex_lock(&inode->i_mutex); | 58 | mutex_lock(&inode->i_mutex); |
60 | /* Is it quota file? Do not allow user to mess with it */ | 59 | /* Is it quota file? Do not allow user to mess with it */ |
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 2c2d700c1ccf..69a3d19ca9fd 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c | |||
@@ -74,10 +74,6 @@ static struct buffer_head *ext3_append(handle_t *handle, | |||
74 | #define assert(test) J_ASSERT(test) | 74 | #define assert(test) J_ASSERT(test) |
75 | #endif | 75 | #endif |
76 | 76 | ||
77 | #ifndef swap | ||
78 | #define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0) | ||
79 | #endif | ||
80 | |||
81 | #ifdef DX_DEBUG | 77 | #ifdef DX_DEBUG |
82 | #define dxtrace(command) command | 78 | #define dxtrace(command) command |
83 | #else | 79 | #else |
diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 6900ff05e3ab..5d047a030a73 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c | |||
@@ -439,6 +439,7 @@ static void ext3_put_super (struct super_block * sb) | |||
439 | ext3_blkdev_remove(sbi); | 439 | ext3_blkdev_remove(sbi); |
440 | } | 440 | } |
441 | sb->s_fs_info = NULL; | 441 | sb->s_fs_info = NULL; |
442 | kfree(sbi->s_blockgroup_lock); | ||
442 | kfree(sbi); | 443 | kfree(sbi); |
443 | return; | 444 | return; |
444 | } | 445 | } |
@@ -733,7 +734,9 @@ static struct dquot_operations ext3_quota_operations = { | |||
733 | .acquire_dquot = ext3_acquire_dquot, | 734 | .acquire_dquot = ext3_acquire_dquot, |
734 | .release_dquot = ext3_release_dquot, | 735 | .release_dquot = ext3_release_dquot, |
735 | .mark_dirty = ext3_mark_dquot_dirty, | 736 | .mark_dirty = ext3_mark_dquot_dirty, |
736 | .write_info = ext3_write_info | 737 | .write_info = ext3_write_info, |
738 | .alloc_dquot = dquot_alloc, | ||
739 | .destroy_dquot = dquot_destroy, | ||
737 | }; | 740 | }; |
738 | 741 | ||
739 | static struct quotactl_ops ext3_qctl_operations = { | 742 | static struct quotactl_ops ext3_qctl_operations = { |
@@ -1056,8 +1059,7 @@ static int parse_options (char *options, struct super_block *sb, | |||
1056 | case Opt_grpjquota: | 1059 | case Opt_grpjquota: |
1057 | qtype = GRPQUOTA; | 1060 | qtype = GRPQUOTA; |
1058 | set_qf_name: | 1061 | set_qf_name: |
1059 | if ((sb_any_quota_enabled(sb) || | 1062 | if (sb_any_quota_loaded(sb) && |
1060 | sb_any_quota_suspended(sb)) && | ||
1061 | !sbi->s_qf_names[qtype]) { | 1063 | !sbi->s_qf_names[qtype]) { |
1062 | printk(KERN_ERR | 1064 | printk(KERN_ERR |
1063 | "EXT3-fs: Cannot change journaled " | 1065 | "EXT3-fs: Cannot change journaled " |
@@ -1096,8 +1098,7 @@ set_qf_name: | |||
1096 | case Opt_offgrpjquota: | 1098 | case Opt_offgrpjquota: |
1097 | qtype = GRPQUOTA; | 1099 | qtype = GRPQUOTA; |
1098 | clear_qf_name: | 1100 | clear_qf_name: |
1099 | if ((sb_any_quota_enabled(sb) || | 1101 | if (sb_any_quota_loaded(sb) && |
1100 | sb_any_quota_suspended(sb)) && | ||
1101 | sbi->s_qf_names[qtype]) { | 1102 | sbi->s_qf_names[qtype]) { |
1102 | printk(KERN_ERR "EXT3-fs: Cannot change " | 1103 | printk(KERN_ERR "EXT3-fs: Cannot change " |
1103 | "journaled quota options when " | 1104 | "journaled quota options when " |
@@ -1116,8 +1117,7 @@ clear_qf_name: | |||
1116 | case Opt_jqfmt_vfsv0: | 1117 | case Opt_jqfmt_vfsv0: |
1117 | qfmt = QFMT_VFS_V0; | 1118 | qfmt = QFMT_VFS_V0; |
1118 | set_qf_format: | 1119 | set_qf_format: |
1119 | if ((sb_any_quota_enabled(sb) || | 1120 | if (sb_any_quota_loaded(sb) && |
1120 | sb_any_quota_suspended(sb)) && | ||
1121 | sbi->s_jquota_fmt != qfmt) { | 1121 | sbi->s_jquota_fmt != qfmt) { |
1122 | printk(KERN_ERR "EXT3-fs: Cannot change " | 1122 | printk(KERN_ERR "EXT3-fs: Cannot change " |
1123 | "journaled quota options when " | 1123 | "journaled quota options when " |
@@ -1136,8 +1136,7 @@ set_qf_format: | |||
1136 | set_opt(sbi->s_mount_opt, GRPQUOTA); | 1136 | set_opt(sbi->s_mount_opt, GRPQUOTA); |
1137 | break; | 1137 | break; |
1138 | case Opt_noquota: | 1138 | case Opt_noquota: |
1139 | if (sb_any_quota_enabled(sb) || | 1139 | if (sb_any_quota_loaded(sb)) { |
1140 | sb_any_quota_suspended(sb)) { | ||
1141 | printk(KERN_ERR "EXT3-fs: Cannot change quota " | 1140 | printk(KERN_ERR "EXT3-fs: Cannot change quota " |
1142 | "options when quota turned on.\n"); | 1141 | "options when quota turned on.\n"); |
1143 | return 0; | 1142 | return 0; |
@@ -1569,6 +1568,13 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) | |||
1569 | sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); | 1568 | sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); |
1570 | if (!sbi) | 1569 | if (!sbi) |
1571 | return -ENOMEM; | 1570 | return -ENOMEM; |
1571 | |||
1572 | sbi->s_blockgroup_lock = | ||
1573 | kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); | ||
1574 | if (!sbi->s_blockgroup_lock) { | ||
1575 | kfree(sbi); | ||
1576 | return -ENOMEM; | ||
1577 | } | ||
1572 | sb->s_fs_info = sbi; | 1578 | sb->s_fs_info = sbi; |
1573 | sbi->s_mount_opt = 0; | 1579 | sbi->s_mount_opt = 0; |
1574 | sbi->s_resuid = EXT3_DEF_RESUID; | 1580 | sbi->s_resuid = EXT3_DEF_RESUID; |
@@ -1821,7 +1827,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) | |||
1821 | goto failed_mount; | 1827 | goto failed_mount; |
1822 | } | 1828 | } |
1823 | 1829 | ||
1824 | bgl_lock_init(&sbi->s_blockgroup_lock); | 1830 | bgl_lock_init(sbi->s_blockgroup_lock); |
1825 | 1831 | ||
1826 | for (i = 0; i < db_count; i++) { | 1832 | for (i = 0; i < db_count; i++) { |
1827 | block = descriptor_loc(sb, logic_sb_block, i); | 1833 | block = descriptor_loc(sb, logic_sb_block, i); |
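The repeated sb_any_quota_enabled(sb) || sb_any_quota_suspended(sb) tests collapse into a single sb_any_quota_loaded() predicate, and by construction it has to cover both states, so quota that is merely suspended still blocks changes to the journaled-quota options; the ext4 option parser below is converted the same way. A toy model of the predicate (the real helper lives in the quota headers and is not shown in this patch):

#include <stdio.h>

/* Toy model: quota state per type is off, enabled, or suspended;
 * "loaded" means enabled or suspended. */
enum qstate { Q_OFF, Q_ENABLED, Q_SUSPENDED };

static int any_quota_loaded(const enum qstate st[], int ntypes)
{
	for (int i = 0; i < ntypes; i++)
		if (st[i] == Q_ENABLED || st[i] == Q_SUSPENDED)
			return 1;
	return 0;
}

int main(void)
{
	enum qstate st[2] = { Q_OFF, Q_SUSPENDED };   /* user off, group suspended */
	printf("loaded: %d\n", any_quota_loaded(st, 2));
	return 0;
}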
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index db1718833f58..c668e4377d76 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h | |||
@@ -1247,11 +1247,11 @@ do { \ | |||
1247 | } while (0) | 1247 | } while (0) |
1248 | 1248 | ||
1249 | #ifdef CONFIG_SMP | 1249 | #ifdef CONFIG_SMP |
1250 | /* Each CPU can accumulate FBC_BATCH blocks in their local | 1250 | /* Each CPU can accumulate percpu_counter_batch blocks in their local |
1251 | * counters. So we need to make sure we have free blocks more | 1251 | * counters. So we need to make sure we have free blocks more |
1252 | * than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times. | 1252 | * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times. |
1253 | */ | 1253 | */ |
1254 | #define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids)) | 1254 | #define EXT4_FREEBLOCKS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) |
1255 | #else | 1255 | #else |
1256 | #define EXT4_FREEBLOCKS_WATERMARK 0 | 1256 | #define EXT4_FREEBLOCKS_WATERMARK 0 |
1257 | #endif | 1257 | #endif |
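The comment and macro change only track a rename (FBC_BATCH became the exported variable percpu_counter_batch), but the arithmetic behind the watermark is worth spelling out: each CPU may hold up to one batch of blocks that has not yet been folded into the global counter, so the global value can drift by batch * nr_cpus, and the watermark keeps four times that slack. In numbers, with assumed example values:

#include <stdio.h>

int main(void)
{
	/* Worst-case drift of a per-CPU counter and the 4x safety window used
	 * for EXT4_FREEBLOCKS_WATERMARK; batch and CPU count are examples,
	 * not the running system's values. */
	long batch = 32, nr_cpus = 16;
	long max_drift = batch * nr_cpus;
	long watermark = 4 * max_drift;

	printf("max drift %ld blocks, watermark %ld blocks\n",
	       max_drift, watermark);
	return 0;
}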
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 240cf0daad4b..54bf0623a9ae 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c | |||
@@ -2533,7 +2533,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, | |||
2533 | */ | 2533 | */ |
2534 | newdepth = ext_depth(inode); | 2534 | newdepth = ext_depth(inode); |
2535 | /* | 2535 | /* |
2536 | * update the extent length after successfull insert of the | 2536 | * update the extent length after successful insert of the |
2537 | * split extent | 2537 | * split extent |
2538 | */ | 2538 | */ |
2539 | orig_ex.ee_len = cpu_to_le16(ee_len - | 2539 | orig_ex.ee_len = cpu_to_le16(ee_len - |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4cac8da4e0c1..a6444cee0c7e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -2592,7 +2592,7 @@ static int ext4_nonda_switch(struct super_block *sb) | |||
2592 | /* | 2592 | /* |
2593 | * switch to non delalloc mode if we are running low | 2593 | * switch to non delalloc mode if we are running low |
2594 | * on free block. The free block accounting via percpu | 2594 | * on free block. The free block accounting via percpu |
2595 | * counters can get slightly wrong with FBC_BATCH getting | 2595 | * counters can get slightly wrong with percpu_counter_batch getting |
2596 | * accumulated on each CPU without updating global counters | 2596 | * accumulated on each CPU without updating global counters |
2597 | * Delalloc need an accurate free block accounting. So switch | 2597 | * Delalloc need an accurate free block accounting. So switch |
2598 | * to non delalloc when we are near to error range. | 2598 | * to non delalloc when we are near to error range. |
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 183a09a8b14e..fec0b4c2f5f1 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c | |||
@@ -74,10 +74,6 @@ static struct buffer_head *ext4_append(handle_t *handle, | |||
74 | #define assert(test) J_ASSERT(test) | 74 | #define assert(test) J_ASSERT(test) |
75 | #endif | 75 | #endif |
76 | 76 | ||
77 | #ifndef swap | ||
78 | #define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0) | ||
79 | #endif | ||
80 | |||
81 | #ifdef DX_DEBUG | 77 | #ifdef DX_DEBUG |
82 | #define dxtrace(command) command | 78 | #define dxtrace(command) command |
83 | #else | 79 | #else |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index acb69c00fd42..8f7e0be8ab1b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
@@ -953,7 +953,9 @@ static struct dquot_operations ext4_quota_operations = { | |||
953 | .acquire_dquot = ext4_acquire_dquot, | 953 | .acquire_dquot = ext4_acquire_dquot, |
954 | .release_dquot = ext4_release_dquot, | 954 | .release_dquot = ext4_release_dquot, |
955 | .mark_dirty = ext4_mark_dquot_dirty, | 955 | .mark_dirty = ext4_mark_dquot_dirty, |
956 | .write_info = ext4_write_info | 956 | .write_info = ext4_write_info, |
957 | .alloc_dquot = dquot_alloc, | ||
958 | .destroy_dquot = dquot_destroy, | ||
957 | }; | 959 | }; |
958 | 960 | ||
959 | static struct quotactl_ops ext4_qctl_operations = { | 961 | static struct quotactl_ops ext4_qctl_operations = { |
@@ -1302,8 +1304,7 @@ static int parse_options(char *options, struct super_block *sb, | |||
1302 | case Opt_grpjquota: | 1304 | case Opt_grpjquota: |
1303 | qtype = GRPQUOTA; | 1305 | qtype = GRPQUOTA; |
1304 | set_qf_name: | 1306 | set_qf_name: |
1305 | if ((sb_any_quota_enabled(sb) || | 1307 | if (sb_any_quota_loaded(sb) && |
1306 | sb_any_quota_suspended(sb)) && | ||
1307 | !sbi->s_qf_names[qtype]) { | 1308 | !sbi->s_qf_names[qtype]) { |
1308 | printk(KERN_ERR | 1309 | printk(KERN_ERR |
1309 | "EXT4-fs: Cannot change journaled " | 1310 | "EXT4-fs: Cannot change journaled " |
@@ -1342,8 +1343,7 @@ set_qf_name: | |||
1342 | case Opt_offgrpjquota: | 1343 | case Opt_offgrpjquota: |
1343 | qtype = GRPQUOTA; | 1344 | qtype = GRPQUOTA; |
1344 | clear_qf_name: | 1345 | clear_qf_name: |
1345 | if ((sb_any_quota_enabled(sb) || | 1346 | if (sb_any_quota_loaded(sb) && |
1346 | sb_any_quota_suspended(sb)) && | ||
1347 | sbi->s_qf_names[qtype]) { | 1347 | sbi->s_qf_names[qtype]) { |
1348 | printk(KERN_ERR "EXT4-fs: Cannot change " | 1348 | printk(KERN_ERR "EXT4-fs: Cannot change " |
1349 | "journaled quota options when " | 1349 | "journaled quota options when " |
@@ -1362,8 +1362,7 @@ clear_qf_name: | |||
1362 | case Opt_jqfmt_vfsv0: | 1362 | case Opt_jqfmt_vfsv0: |
1363 | qfmt = QFMT_VFS_V0; | 1363 | qfmt = QFMT_VFS_V0; |
1364 | set_qf_format: | 1364 | set_qf_format: |
1365 | if ((sb_any_quota_enabled(sb) || | 1365 | if (sb_any_quota_loaded(sb) && |
1366 | sb_any_quota_suspended(sb)) && | ||
1367 | sbi->s_jquota_fmt != qfmt) { | 1366 | sbi->s_jquota_fmt != qfmt) { |
1368 | printk(KERN_ERR "EXT4-fs: Cannot change " | 1367 | printk(KERN_ERR "EXT4-fs: Cannot change " |
1369 | "journaled quota options when " | 1368 | "journaled quota options when " |
@@ -1382,7 +1381,7 @@ set_qf_format: | |||
1382 | set_opt(sbi->s_mount_opt, GRPQUOTA); | 1381 | set_opt(sbi->s_mount_opt, GRPQUOTA); |
1383 | break; | 1382 | break; |
1384 | case Opt_noquota: | 1383 | case Opt_noquota: |
1385 | if (sb_any_quota_enabled(sb)) { | 1384 | if (sb_any_quota_loaded(sb)) { |
1386 | printk(KERN_ERR "EXT4-fs: Cannot change quota " | 1385 | printk(KERN_ERR "EXT4-fs: Cannot change quota " |
1387 | "options when quota turned on.\n"); | 1386 | "options when quota turned on.\n"); |
1388 | return 0; | 1387 | return 0; |
diff --git a/fs/filesystems.c b/fs/filesystems.c index d0e20ced62dd..d488dcd7f2bb 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c | |||
@@ -253,24 +253,27 @@ static int __init proc_filesystems_init(void) | |||
253 | module_init(proc_filesystems_init); | 253 | module_init(proc_filesystems_init); |
254 | #endif | 254 | #endif |
255 | 255 | ||
256 | struct file_system_type *get_fs_type(const char *name) | 256 | static struct file_system_type *__get_fs_type(const char *name, int len) |
257 | { | 257 | { |
258 | struct file_system_type *fs; | 258 | struct file_system_type *fs; |
259 | const char *dot = strchr(name, '.'); | ||
260 | unsigned len = dot ? dot - name : strlen(name); | ||
261 | 259 | ||
262 | read_lock(&file_systems_lock); | 260 | read_lock(&file_systems_lock); |
263 | fs = *(find_filesystem(name, len)); | 261 | fs = *(find_filesystem(name, len)); |
264 | if (fs && !try_module_get(fs->owner)) | 262 | if (fs && !try_module_get(fs->owner)) |
265 | fs = NULL; | 263 | fs = NULL; |
266 | read_unlock(&file_systems_lock); | 264 | read_unlock(&file_systems_lock); |
267 | if (!fs && (request_module("%.*s", len, name) == 0)) { | 265 | return fs; |
268 | read_lock(&file_systems_lock); | 266 | } |
269 | fs = *(find_filesystem(name, len)); | 267 | |
270 | if (fs && !try_module_get(fs->owner)) | 268 | struct file_system_type *get_fs_type(const char *name) |
271 | fs = NULL; | 269 | { |
272 | read_unlock(&file_systems_lock); | 270 | struct file_system_type *fs; |
273 | } | 271 | const char *dot = strchr(name, '.'); |
272 | int len = dot ? dot - name : strlen(name); | ||
273 | |||
274 | fs = __get_fs_type(name, len); | ||
275 | if (!fs && (request_module("%.*s", len, name) == 0)) | ||
276 | fs = __get_fs_type(name, len); | ||
274 | 277 | ||
275 | if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) { | 278 | if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) { |
276 | put_filesystem(fs); | 279 | put_filesystem(fs); |
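get_fs_type() is refactored so the look-up, request_module(), retry sequence is written once, while the surrounding behaviour is unchanged: the name is split at the first '.', only the part before the dot names the driver to find or modprobe, and the dotted form is rejected afterwards unless the driver sets FS_HAS_SUBTYPE (the "fuse.sshfs" style). The split in isolation:

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* "fuse.sshfs" is the customary example of a subtyped filesystem name;
	 * only "fuse" is used to look up (and modprobe) the driver. */
	const char *name = "fuse.sshfs";
	const char *dot = strchr(name, '.');
	int len = dot ? (int)(dot - name) : (int)strlen(name);

	printf("driver: %.*s\n", len, name);
	if (dot)
		printf("subtype: %s\n", dot + 1);
	return 0;
}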
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d0ff0b8cf309..e5eaa62fd17f 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c | |||
@@ -421,9 +421,6 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) | |||
421 | * If we're a pdflush thread, then implement pdflush collision avoidance | 421 | * If we're a pdflush thread, then implement pdflush collision avoidance |
422 | * against the entire list. | 422 | * against the entire list. |
423 | * | 423 | * |
424 | * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so | ||
425 | * that it can be located for waiting on in __writeback_single_inode(). | ||
426 | * | ||
427 | * If `bdi' is non-zero then we're being asked to writeback a specific queue. | 424 | * If `bdi' is non-zero then we're being asked to writeback a specific queue. |
428 | * This function assumes that the blockdev superblock's inodes are backed by | 425 | * This function assumes that the blockdev superblock's inodes are backed by |
429 | * a variety of queues, so all inodes are searched. For other superblocks, | 426 | * a variety of queues, so all inodes are searched. For other superblocks, |
@@ -443,6 +440,7 @@ void generic_sync_sb_inodes(struct super_block *sb, | |||
443 | struct writeback_control *wbc) | 440 | struct writeback_control *wbc) |
444 | { | 441 | { |
445 | const unsigned long start = jiffies; /* livelock avoidance */ | 442 | const unsigned long start = jiffies; /* livelock avoidance */ |
443 | int sync = wbc->sync_mode == WB_SYNC_ALL; | ||
446 | 444 | ||
447 | spin_lock(&inode_lock); | 445 | spin_lock(&inode_lock); |
448 | if (!wbc->for_kupdate || list_empty(&sb->s_io)) | 446 | if (!wbc->for_kupdate || list_empty(&sb->s_io)) |
@@ -499,10 +497,6 @@ void generic_sync_sb_inodes(struct super_block *sb, | |||
499 | __iget(inode); | 497 | __iget(inode); |
500 | pages_skipped = wbc->pages_skipped; | 498 | pages_skipped = wbc->pages_skipped; |
501 | __writeback_single_inode(inode, wbc); | 499 | __writeback_single_inode(inode, wbc); |
502 | if (wbc->sync_mode == WB_SYNC_HOLD) { | ||
503 | inode->dirtied_when = jiffies; | ||
504 | list_move(&inode->i_list, &sb->s_dirty); | ||
505 | } | ||
506 | if (current_is_pdflush()) | 500 | if (current_is_pdflush()) |
507 | writeback_release(bdi); | 501 | writeback_release(bdi); |
508 | if (wbc->pages_skipped != pages_skipped) { | 502 | if (wbc->pages_skipped != pages_skipped) { |
@@ -523,7 +517,49 @@ void generic_sync_sb_inodes(struct super_block *sb, | |||
523 | if (!list_empty(&sb->s_more_io)) | 517 | if (!list_empty(&sb->s_more_io)) |
524 | wbc->more_io = 1; | 518 | wbc->more_io = 1; |
525 | } | 519 | } |
526 | spin_unlock(&inode_lock); | 520 | |
521 | if (sync) { | ||
522 | struct inode *inode, *old_inode = NULL; | ||
523 | |||
524 | /* | ||
525 | * Data integrity sync. Must wait for all pages under writeback, | ||
526 | * because there may have been pages dirtied before our sync | ||
527 | * call, but which had writeout started before we write it out. | ||
528 | * In which case, the inode may not be on the dirty list, but | ||
529 | * we still have to wait for that writeout. | ||
530 | */ | ||
531 | list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { | ||
532 | struct address_space *mapping; | ||
533 | |||
534 | if (inode->i_state & (I_FREEING|I_WILL_FREE)) | ||
535 | continue; | ||
536 | mapping = inode->i_mapping; | ||
537 | if (mapping->nrpages == 0) | ||
538 | continue; | ||
539 | __iget(inode); | ||
540 | spin_unlock(&inode_lock); | ||
541 | /* | ||
542 | * We hold a reference to 'inode' so it couldn't have | ||
543 | * been removed from s_inodes list while we dropped the | ||
544 | * inode_lock. We cannot iput the inode now as we can | ||
545 | * be holding the last reference and we cannot iput it | ||
546 | * under inode_lock. So we keep the reference and iput | ||
547 | * it later. | ||
548 | */ | ||
549 | iput(old_inode); | ||
550 | old_inode = inode; | ||
551 | |||
552 | filemap_fdatawait(mapping); | ||
553 | |||
554 | cond_resched(); | ||
555 | |||
556 | spin_lock(&inode_lock); | ||
557 | } | ||
558 | spin_unlock(&inode_lock); | ||
559 | iput(old_inode); | ||
560 | } else | ||
561 | spin_unlock(&inode_lock); | ||
562 | |||
527 | return; /* Leave any unwritten inodes on s_io */ | 563 | return; /* Leave any unwritten inodes on s_io */ |
528 | } | 564 | } |
529 | EXPORT_SYMBOL_GPL(generic_sync_sb_inodes); | 565 | EXPORT_SYMBOL_GPL(generic_sync_sb_inodes); |
@@ -588,8 +624,7 @@ restart: | |||
588 | 624 | ||
589 | /* | 625 | /* |
590 | * writeback and wait upon the filesystem's dirty inodes. The caller will | 626 | * writeback and wait upon the filesystem's dirty inodes. The caller will |
591 | * do this in two passes - one to write, and one to wait. WB_SYNC_HOLD is | 627 | * do this in two passes - one to write, and one to wait. |
592 | * used to park the written inodes on sb->s_dirty for the wait pass. | ||
593 | * | 628 | * |
594 | * A finite limit is set on the number of pages which will be written. | 629 | * A finite limit is set on the number of pages which will be written. |
595 | * To prevent infinite livelock of sys_sync(). | 630 | * To prevent infinite livelock of sys_sync(). |
@@ -600,30 +635,21 @@ restart: | |||
600 | void sync_inodes_sb(struct super_block *sb, int wait) | 635 | void sync_inodes_sb(struct super_block *sb, int wait) |
601 | { | 636 | { |
602 | struct writeback_control wbc = { | 637 | struct writeback_control wbc = { |
603 | .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_HOLD, | 638 | .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, |
604 | .range_start = 0, | 639 | .range_start = 0, |
605 | .range_end = LLONG_MAX, | 640 | .range_end = LLONG_MAX, |
606 | }; | 641 | }; |
607 | unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); | ||
608 | unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); | ||
609 | 642 | ||
610 | wbc.nr_to_write = nr_dirty + nr_unstable + | 643 | if (!wait) { |
611 | (inodes_stat.nr_inodes - inodes_stat.nr_unused) + | 644 | unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); |
612 | nr_dirty + nr_unstable; | 645 | unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); |
613 | wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */ | ||
614 | sync_sb_inodes(sb, &wbc); | ||
615 | } | ||
616 | 646 | ||
617 | /* | 647 | wbc.nr_to_write = nr_dirty + nr_unstable + |
618 | * Rather lame livelock avoidance. | 648 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); |
619 | */ | 649 | } else |
620 | static void set_sb_syncing(int val) | 650 | wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */ |
621 | { | 651 | |
622 | struct super_block *sb; | 652 | sync_sb_inodes(sb, &wbc); |
623 | spin_lock(&sb_lock); | ||
624 | list_for_each_entry_reverse(sb, &super_blocks, s_list) | ||
625 | sb->s_syncing = val; | ||
626 | spin_unlock(&sb_lock); | ||
627 | } | 653 | } |
628 | 654 | ||
629 | /** | 655 | /** |
@@ -652,9 +678,6 @@ static void __sync_inodes(int wait) | |||
652 | spin_lock(&sb_lock); | 678 | spin_lock(&sb_lock); |
653 | restart: | 679 | restart: |
654 | list_for_each_entry(sb, &super_blocks, s_list) { | 680 | list_for_each_entry(sb, &super_blocks, s_list) { |
655 | if (sb->s_syncing) | ||
656 | continue; | ||
657 | sb->s_syncing = 1; | ||
658 | sb->s_count++; | 681 | sb->s_count++; |
659 | spin_unlock(&sb_lock); | 682 | spin_unlock(&sb_lock); |
660 | down_read(&sb->s_umount); | 683 | down_read(&sb->s_umount); |
@@ -672,13 +695,10 @@ restart: | |||
672 | 695 | ||
673 | void sync_inodes(int wait) | 696 | void sync_inodes(int wait) |
674 | { | 697 | { |
675 | set_sb_syncing(0); | ||
676 | __sync_inodes(0); | 698 | __sync_inodes(0); |
677 | 699 | ||
678 | if (wait) { | 700 | if (wait) |
679 | set_sb_syncing(0); | ||
680 | __sync_inodes(1); | 701 | __sync_inodes(1); |
681 | } | ||
682 | } | 702 | } |
683 | 703 | ||
684 | /** | 704 | /** |
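The new wait pass in generic_sync_sb_inodes() walks sb->s_inodes under inode_lock, but iput() may drop the last reference and sleep, so it cannot run while the lock is held; the patch therefore keeps the previous inode pinned and releases it only after the lock has been dropped again. Below is a minimal userspace analogue of that deferral idiom using a pthread mutex and an invented refcounted node type; it sketches the pattern only and is not the kernel code.

#include <pthread.h>
#include <stdlib.h>

/* Userspace analogue of the iput-deferral idiom: the final put may block
 * (here it re-takes the lock), so it must never run under list_lock.
 * All names are invented for this sketch; a referenced node is assumed
 * to stay linked, mirroring inodes on sb->s_inodes. */
struct node {
	struct node *next;
	int refcount;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *head;

static void node_get(struct node *n)		/* caller holds list_lock */
{
	n->refcount++;
}

static void node_put(struct node *n)		/* must not hold list_lock */
{
	int last;

	pthread_mutex_lock(&list_lock);
	last = (--n->refcount == 0);
	pthread_mutex_unlock(&list_lock);
	if (last)
		free(n);
}

static void wait_on_node(struct node *n)	/* stands in for filemap_fdatawait() */
{
	(void)n;
}

void walk_and_wait(void)
{
	struct node *n, *old = NULL;

	pthread_mutex_lock(&list_lock);
	for (n = head; n; n = n->next) {
		node_get(n);			/* pin n across the unlock */
		pthread_mutex_unlock(&list_lock);

		if (old)
			node_put(old);		/* safe: lock not held */
		old = n;

		wait_on_node(n);		/* the potentially slow part */

		pthread_mutex_lock(&list_lock);
	}
	pthread_mutex_unlock(&list_lock);
	if (old)
		node_put(old);
}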
diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 4f3cab321415..99c99dfb0373 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c | |||
@@ -1,6 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | FUSE: Filesystem in Userspace | 2 | FUSE: Filesystem in Userspace |
3 | Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> | 3 | Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> |
4 | 4 | ||
5 | This program can be distributed under the terms of the GNU GPL. | 5 | This program can be distributed under the terms of the GNU GPL. |
6 | See the file COPYING. | 6 | See the file COPYING. |
@@ -48,11 +48,13 @@ static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf, | |||
48 | size_t size; | 48 | size_t size; |
49 | 49 | ||
50 | if (!*ppos) { | 50 | if (!*ppos) { |
51 | long value; | ||
51 | struct fuse_conn *fc = fuse_ctl_file_conn_get(file); | 52 | struct fuse_conn *fc = fuse_ctl_file_conn_get(file); |
52 | if (!fc) | 53 | if (!fc) |
53 | return 0; | 54 | return 0; |
54 | 55 | ||
55 | file->private_data=(void *)(long)atomic_read(&fc->num_waiting); | 56 | value = atomic_read(&fc->num_waiting); |
57 | file->private_data = (void *)value; | ||
56 | fuse_conn_put(fc); | 58 | fuse_conn_put(fc); |
57 | } | 59 | } |
58 | size = sprintf(tmp, "%ld\n", (long)file->private_data); | 60 | size = sprintf(tmp, "%ld\n", (long)file->private_data); |
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index fba571648a8e..e0c7ada08a1f 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c | |||
@@ -1,6 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | FUSE: Filesystem in Userspace | 2 | FUSE: Filesystem in Userspace |
3 | Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> | 3 | Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> |
4 | 4 | ||
5 | This program can be distributed under the terms of the GNU GPL. | 5 | This program can be distributed under the terms of the GNU GPL. |
6 | See the file COPYING. | 6 | See the file COPYING. |
@@ -269,7 +269,7 @@ static void flush_bg_queue(struct fuse_conn *fc) | |||
269 | * Called with fc->lock, unlocks it | 269 | * Called with fc->lock, unlocks it |
270 | */ | 270 | */ |
271 | static void request_end(struct fuse_conn *fc, struct fuse_req *req) | 271 | static void request_end(struct fuse_conn *fc, struct fuse_req *req) |
272 | __releases(fc->lock) | 272 | __releases(&fc->lock) |
273 | { | 273 | { |
274 | void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; | 274 | void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; |
275 | req->end = NULL; | 275 | req->end = NULL; |
@@ -293,13 +293,13 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) | |||
293 | wake_up(&req->waitq); | 293 | wake_up(&req->waitq); |
294 | if (end) | 294 | if (end) |
295 | end(fc, req); | 295 | end(fc, req); |
296 | else | 296 | fuse_put_request(fc, req); |
297 | fuse_put_request(fc, req); | ||
298 | } | 297 | } |
299 | 298 | ||
300 | static void wait_answer_interruptible(struct fuse_conn *fc, | 299 | static void wait_answer_interruptible(struct fuse_conn *fc, |
301 | struct fuse_req *req) | 300 | struct fuse_req *req) |
302 | __releases(fc->lock) __acquires(fc->lock) | 301 | __releases(&fc->lock) |
302 | __acquires(&fc->lock) | ||
303 | { | 303 | { |
304 | if (signal_pending(current)) | 304 | if (signal_pending(current)) |
305 | return; | 305 | return; |
@@ -317,7 +317,8 @@ static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req) | |||
317 | } | 317 | } |
318 | 318 | ||
319 | static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) | 319 | static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) |
320 | __releases(fc->lock) __acquires(fc->lock) | 320 | __releases(&fc->lock) |
321 | __acquires(&fc->lock) | ||
321 | { | 322 | { |
322 | if (!fc->no_interrupt) { | 323 | if (!fc->no_interrupt) { |
323 | /* Any signal may interrupt this */ | 324 | /* Any signal may interrupt this */ |
@@ -380,7 +381,7 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) | |||
380 | } | 381 | } |
381 | } | 382 | } |
382 | 383 | ||
383 | void request_send(struct fuse_conn *fc, struct fuse_req *req) | 384 | void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) |
384 | { | 385 | { |
385 | req->isreply = 1; | 386 | req->isreply = 1; |
386 | spin_lock(&fc->lock); | 387 | spin_lock(&fc->lock); |
@@ -399,8 +400,8 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req) | |||
399 | spin_unlock(&fc->lock); | 400 | spin_unlock(&fc->lock); |
400 | } | 401 | } |
401 | 402 | ||
402 | static void request_send_nowait_locked(struct fuse_conn *fc, | 403 | static void fuse_request_send_nowait_locked(struct fuse_conn *fc, |
403 | struct fuse_req *req) | 404 | struct fuse_req *req) |
404 | { | 405 | { |
405 | req->background = 1; | 406 | req->background = 1; |
406 | fc->num_background++; | 407 | fc->num_background++; |
@@ -414,11 +415,11 @@ static void request_send_nowait_locked(struct fuse_conn *fc, | |||
414 | flush_bg_queue(fc); | 415 | flush_bg_queue(fc); |
415 | } | 416 | } |
416 | 417 | ||
417 | static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) | 418 | static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) |
418 | { | 419 | { |
419 | spin_lock(&fc->lock); | 420 | spin_lock(&fc->lock); |
420 | if (fc->connected) { | 421 | if (fc->connected) { |
421 | request_send_nowait_locked(fc, req); | 422 | fuse_request_send_nowait_locked(fc, req); |
422 | spin_unlock(&fc->lock); | 423 | spin_unlock(&fc->lock); |
423 | } else { | 424 | } else { |
424 | req->out.h.error = -ENOTCONN; | 425 | req->out.h.error = -ENOTCONN; |
@@ -426,16 +427,16 @@ static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) | |||
426 | } | 427 | } |
427 | } | 428 | } |
428 | 429 | ||
429 | void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req) | 430 | void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req) |
430 | { | 431 | { |
431 | req->isreply = 0; | 432 | req->isreply = 0; |
432 | request_send_nowait(fc, req); | 433 | fuse_request_send_nowait(fc, req); |
433 | } | 434 | } |
434 | 435 | ||
435 | void request_send_background(struct fuse_conn *fc, struct fuse_req *req) | 436 | void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) |
436 | { | 437 | { |
437 | req->isreply = 1; | 438 | req->isreply = 1; |
438 | request_send_nowait(fc, req); | 439 | fuse_request_send_nowait(fc, req); |
439 | } | 440 | } |
440 | 441 | ||
441 | /* | 442 | /* |
@@ -443,10 +444,11 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req) | |||
443 | * | 444 | * |
444 | * fc->connected must have been checked previously | 445 | * fc->connected must have been checked previously |
445 | */ | 446 | */ |
446 | void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req) | 447 | void fuse_request_send_background_locked(struct fuse_conn *fc, |
448 | struct fuse_req *req) | ||
447 | { | 449 | { |
448 | req->isreply = 1; | 450 | req->isreply = 1; |
449 | request_send_nowait_locked(fc, req); | 451 | fuse_request_send_nowait_locked(fc, req); |
450 | } | 452 | } |
451 | 453 | ||
452 | /* | 454 | /* |
@@ -539,8 +541,8 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) | |||
539 | BUG_ON(!cs->nr_segs); | 541 | BUG_ON(!cs->nr_segs); |
540 | cs->seglen = cs->iov[0].iov_len; | 542 | cs->seglen = cs->iov[0].iov_len; |
541 | cs->addr = (unsigned long) cs->iov[0].iov_base; | 543 | cs->addr = (unsigned long) cs->iov[0].iov_base; |
542 | cs->iov ++; | 544 | cs->iov++; |
543 | cs->nr_segs --; | 545 | cs->nr_segs--; |
544 | } | 546 | } |
545 | down_read(¤t->mm->mmap_sem); | 547 | down_read(¤t->mm->mmap_sem); |
546 | err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0, | 548 | err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0, |
@@ -589,9 +591,11 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page, | |||
589 | kunmap_atomic(mapaddr, KM_USER1); | 591 | kunmap_atomic(mapaddr, KM_USER1); |
590 | } | 592 | } |
591 | while (count) { | 593 | while (count) { |
592 | int err; | 594 | if (!cs->len) { |
593 | if (!cs->len && (err = fuse_copy_fill(cs))) | 595 | int err = fuse_copy_fill(cs); |
594 | return err; | 596 | if (err) |
597 | return err; | ||
598 | } | ||
595 | if (page) { | 599 | if (page) { |
596 | void *mapaddr = kmap_atomic(page, KM_USER1); | 600 | void *mapaddr = kmap_atomic(page, KM_USER1); |
597 | void *buf = mapaddr + offset; | 601 | void *buf = mapaddr + offset; |
@@ -631,9 +635,11 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, | |||
631 | static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size) | 635 | static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size) |
632 | { | 636 | { |
633 | while (size) { | 637 | while (size) { |
634 | int err; | 638 | if (!cs->len) { |
635 | if (!cs->len && (err = fuse_copy_fill(cs))) | 639 | int err = fuse_copy_fill(cs); |
636 | return err; | 640 | if (err) |
641 | return err; | ||
642 | } | ||
637 | fuse_copy_do(cs, &val, &size); | 643 | fuse_copy_do(cs, &val, &size); |
638 | } | 644 | } |
639 | return 0; | 645 | return 0; |
@@ -664,6 +670,8 @@ static int request_pending(struct fuse_conn *fc) | |||
664 | 670 | ||
665 | /* Wait until a request is available on the pending list */ | 671 | /* Wait until a request is available on the pending list */ |
666 | static void request_wait(struct fuse_conn *fc) | 672 | static void request_wait(struct fuse_conn *fc) |
673 | __releases(&fc->lock) | ||
674 | __acquires(&fc->lock) | ||
667 | { | 675 | { |
668 | DECLARE_WAITQUEUE(wait, current); | 676 | DECLARE_WAITQUEUE(wait, current); |
669 | 677 | ||
@@ -691,7 +699,7 @@ static void request_wait(struct fuse_conn *fc) | |||
691 | */ | 699 | */ |
692 | static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req, | 700 | static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req, |
693 | const struct iovec *iov, unsigned long nr_segs) | 701 | const struct iovec *iov, unsigned long nr_segs) |
694 | __releases(fc->lock) | 702 | __releases(&fc->lock) |
695 | { | 703 | { |
696 | struct fuse_copy_state cs; | 704 | struct fuse_copy_state cs; |
697 | struct fuse_in_header ih; | 705 | struct fuse_in_header ih; |
@@ -813,6 +821,34 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, | |||
813 | return err; | 821 | return err; |
814 | } | 822 | } |
815 | 823 | ||
824 | static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size, | ||
825 | struct fuse_copy_state *cs) | ||
826 | { | ||
827 | struct fuse_notify_poll_wakeup_out outarg; | ||
828 | int err; | ||
829 | |||
830 | if (size != sizeof(outarg)) | ||
831 | return -EINVAL; | ||
832 | |||
833 | err = fuse_copy_one(cs, &outarg, sizeof(outarg)); | ||
834 | if (err) | ||
835 | return err; | ||
836 | |||
837 | return fuse_notify_poll_wakeup(fc, &outarg); | ||
838 | } | ||
839 | |||
840 | static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, | ||
841 | unsigned int size, struct fuse_copy_state *cs) | ||
842 | { | ||
843 | switch (code) { | ||
844 | case FUSE_NOTIFY_POLL: | ||
845 | return fuse_notify_poll(fc, size, cs); | ||
846 | |||
847 | default: | ||
848 | return -EINVAL; | ||
849 | } | ||
850 | } | ||
851 | |||
816 | /* Look up request on processing list by unique ID */ | 852 | /* Look up request on processing list by unique ID */ |
817 | static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique) | 853 | static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique) |
818 | { | 854 | { |
@@ -876,9 +912,23 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, | |||
876 | err = fuse_copy_one(&cs, &oh, sizeof(oh)); | 912 | err = fuse_copy_one(&cs, &oh, sizeof(oh)); |
877 | if (err) | 913 | if (err) |
878 | goto err_finish; | 914 | goto err_finish; |
915 | |||
916 | err = -EINVAL; | ||
917 | if (oh.len != nbytes) | ||
918 | goto err_finish; | ||
919 | |||
920 | /* | ||
921 | * Zero oh.unique indicates unsolicited notification message | ||
922 | * and error contains notification code. | ||
923 | */ | ||
924 | if (!oh.unique) { | ||
925 | err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs); | ||
926 | fuse_copy_finish(&cs); | ||
927 | return err ? err : nbytes; | ||
928 | } | ||
929 | |||
879 | err = -EINVAL; | 930 | err = -EINVAL; |
880 | if (!oh.unique || oh.error <= -1000 || oh.error > 0 || | 931 | if (oh.error <= -1000 || oh.error > 0) |
881 | oh.len != nbytes) | ||
882 | goto err_finish; | 932 | goto err_finish; |
883 | 933 | ||
884 | spin_lock(&fc->lock); | 934 | spin_lock(&fc->lock); |
@@ -966,6 +1016,8 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait) | |||
966 | * This function releases and reacquires fc->lock | 1016 | * This function releases and reacquires fc->lock |
967 | */ | 1017 | */ |
968 | static void end_requests(struct fuse_conn *fc, struct list_head *head) | 1018 | static void end_requests(struct fuse_conn *fc, struct list_head *head) |
1019 | __releases(&fc->lock) | ||
1020 | __acquires(&fc->lock) | ||
969 | { | 1021 | { |
970 | while (!list_empty(head)) { | 1022 | while (!list_empty(head)) { |
971 | struct fuse_req *req; | 1023 | struct fuse_req *req; |
@@ -988,7 +1040,8 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head) | |||
988 | * locked). | 1040 | * locked). |
989 | */ | 1041 | */ |
990 | static void end_io_requests(struct fuse_conn *fc) | 1042 | static void end_io_requests(struct fuse_conn *fc) |
991 | __releases(fc->lock) __acquires(fc->lock) | 1043 | __releases(&fc->lock) |
1044 | __acquires(&fc->lock) | ||
992 | { | 1045 | { |
993 | while (!list_empty(&fc->io)) { | 1046 | while (!list_empty(&fc->io)) { |
994 | struct fuse_req *req = | 1047 | struct fuse_req *req = |
@@ -1002,11 +1055,11 @@ static void end_io_requests(struct fuse_conn *fc) | |||
1002 | wake_up(&req->waitq); | 1055 | wake_up(&req->waitq); |
1003 | if (end) { | 1056 | if (end) { |
1004 | req->end = NULL; | 1057 | req->end = NULL; |
1005 | /* The end function will consume this reference */ | ||
1006 | __fuse_get_request(req); | 1058 | __fuse_get_request(req); |
1007 | spin_unlock(&fc->lock); | 1059 | spin_unlock(&fc->lock); |
1008 | wait_event(req->waitq, !req->locked); | 1060 | wait_event(req->waitq, !req->locked); |
1009 | end(fc, req); | 1061 | end(fc, req); |
1062 | fuse_put_request(fc, req); | ||
1010 | spin_lock(&fc->lock); | 1063 | spin_lock(&fc->lock); |
1011 | } | 1064 | } |
1012 | } | 1065 | } |
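With the check above, a write to /dev/fuse whose fuse_out_header has unique == 0 is treated as an unsolicited notification, and the header's error field carries the notification code rather than an errno. A minimal sketch of the userspace side for the FUSE_NOTIFY_POLL case introduced by this series is given below; it assumes the server already holds the kernel poll handle (kh) from an earlier FUSE_POLL request and an open /dev/fuse descriptor fd, and error handling is elided.

#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <linux/fuse.h>

/* Sketch: ask the kernel to wake poll waiters for the file identified by
 * kernel handle 'kh'.  'fd' is the server's open /dev/fuse descriptor.
 * unique == 0 marks the message as an unsolicited notification, and the
 * header's error field carries the notification code. */
static int notify_poll_wakeup(int fd, uint64_t kh)
{
	struct fuse_out_header out;
	struct fuse_notify_poll_wakeup_out arg;
	char buf[sizeof(out) + sizeof(arg)];

	memset(&out, 0, sizeof(out));
	memset(&arg, 0, sizeof(arg));
	out.len = sizeof(buf);		/* must match the write size exactly */
	out.error = FUSE_NOTIFY_POLL;	/* notification code, not an errno */
	out.unique = 0;
	arg.kh = kh;

	memcpy(buf, &out, sizeof(out));
	memcpy(buf + sizeof(out), &arg, sizeof(arg));

	/* One write() per message; the kernel checks oh.len against the
	 * number of bytes written and dispatches to fuse_notify(). */
	return write(fd, buf, sizeof(buf)) == (ssize_t)sizeof(buf) ? 0 : -1;
}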
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 95bc22bdd060..fdff346e96fd 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c | |||
@@ -1,6 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | FUSE: Filesystem in Userspace | 2 | FUSE: Filesystem in Userspace |
3 | Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> | 3 | Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> |
4 | 4 | ||
5 | This program can be distributed under the terms of the GNU GPL. | 5 | This program can be distributed under the terms of the GNU GPL. |
6 | See the file COPYING. | 6 | See the file COPYING. |
@@ -189,7 +189,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) | |||
189 | parent = dget_parent(entry); | 189 | parent = dget_parent(entry); |
190 | fuse_lookup_init(fc, req, get_node_id(parent->d_inode), | 190 | fuse_lookup_init(fc, req, get_node_id(parent->d_inode), |
191 | &entry->d_name, &outarg); | 191 | &entry->d_name, &outarg); |
192 | request_send(fc, req); | 192 | fuse_request_send(fc, req); |
193 | dput(parent); | 193 | dput(parent); |
194 | err = req->out.h.error; | 194 | err = req->out.h.error; |
195 | fuse_put_request(fc, req); | 195 | fuse_put_request(fc, req); |
@@ -204,7 +204,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) | |||
204 | return 0; | 204 | return 0; |
205 | } | 205 | } |
206 | spin_lock(&fc->lock); | 206 | spin_lock(&fc->lock); |
207 | fi->nlookup ++; | 207 | fi->nlookup++; |
208 | spin_unlock(&fc->lock); | 208 | spin_unlock(&fc->lock); |
209 | } | 209 | } |
210 | fuse_put_request(fc, forget_req); | 210 | fuse_put_request(fc, forget_req); |
@@ -283,7 +283,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, | |||
283 | attr_version = fuse_get_attr_version(fc); | 283 | attr_version = fuse_get_attr_version(fc); |
284 | 284 | ||
285 | fuse_lookup_init(fc, req, nodeid, name, outarg); | 285 | fuse_lookup_init(fc, req, nodeid, name, outarg); |
286 | request_send(fc, req); | 286 | fuse_request_send(fc, req); |
287 | err = req->out.h.error; | 287 | err = req->out.h.error; |
288 | fuse_put_request(fc, req); | 288 | fuse_put_request(fc, req); |
289 | /* Zero nodeid is same as -ENOENT, but with valid timeout */ | 289 | /* Zero nodeid is same as -ENOENT, but with valid timeout */ |
@@ -369,7 +369,7 @@ static void fuse_sync_release(struct fuse_conn *fc, struct fuse_file *ff, | |||
369 | { | 369 | { |
370 | fuse_release_fill(ff, nodeid, flags, FUSE_RELEASE); | 370 | fuse_release_fill(ff, nodeid, flags, FUSE_RELEASE); |
371 | ff->reserved_req->force = 1; | 371 | ff->reserved_req->force = 1; |
372 | request_send(fc, ff->reserved_req); | 372 | fuse_request_send(fc, ff->reserved_req); |
373 | fuse_put_request(fc, ff->reserved_req); | 373 | fuse_put_request(fc, ff->reserved_req); |
374 | kfree(ff); | 374 | kfree(ff); |
375 | } | 375 | } |
@@ -408,7 +408,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, | |||
408 | goto out_put_forget_req; | 408 | goto out_put_forget_req; |
409 | 409 | ||
410 | err = -ENOMEM; | 410 | err = -ENOMEM; |
411 | ff = fuse_file_alloc(); | 411 | ff = fuse_file_alloc(fc); |
412 | if (!ff) | 412 | if (!ff) |
413 | goto out_put_request; | 413 | goto out_put_request; |
414 | 414 | ||
@@ -432,7 +432,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, | |||
432 | req->out.args[0].value = &outentry; | 432 | req->out.args[0].value = &outentry; |
433 | req->out.args[1].size = sizeof(outopen); | 433 | req->out.args[1].size = sizeof(outopen); |
434 | req->out.args[1].value = &outopen; | 434 | req->out.args[1].value = &outopen; |
435 | request_send(fc, req); | 435 | fuse_request_send(fc, req); |
436 | err = req->out.h.error; | 436 | err = req->out.h.error; |
437 | if (err) { | 437 | if (err) { |
438 | if (err == -ENOSYS) | 438 | if (err == -ENOSYS) |
@@ -502,7 +502,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, | |||
502 | else | 502 | else |
503 | req->out.args[0].size = sizeof(outarg); | 503 | req->out.args[0].size = sizeof(outarg); |
504 | req->out.args[0].value = &outarg; | 504 | req->out.args[0].value = &outarg; |
505 | request_send(fc, req); | 505 | fuse_request_send(fc, req); |
506 | err = req->out.h.error; | 506 | err = req->out.h.error; |
507 | fuse_put_request(fc, req); | 507 | fuse_put_request(fc, req); |
508 | if (err) | 508 | if (err) |
@@ -631,15 +631,17 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) | |||
631 | req->in.numargs = 1; | 631 | req->in.numargs = 1; |
632 | req->in.args[0].size = entry->d_name.len + 1; | 632 | req->in.args[0].size = entry->d_name.len + 1; |
633 | req->in.args[0].value = entry->d_name.name; | 633 | req->in.args[0].value = entry->d_name.name; |
634 | request_send(fc, req); | 634 | fuse_request_send(fc, req); |
635 | err = req->out.h.error; | 635 | err = req->out.h.error; |
636 | fuse_put_request(fc, req); | 636 | fuse_put_request(fc, req); |
637 | if (!err) { | 637 | if (!err) { |
638 | struct inode *inode = entry->d_inode; | 638 | struct inode *inode = entry->d_inode; |
639 | 639 | ||
640 | /* Set nlink to zero so the inode can be cleared, if | 640 | /* |
641 | the inode does have more links this will be | 641 | * Set nlink to zero so the inode can be cleared, if the inode |
642 | discovered at the next lookup/getattr */ | 642 | * does have more links this will be discovered at the next |
643 | * lookup/getattr. | ||
644 | */ | ||
643 | clear_nlink(inode); | 645 | clear_nlink(inode); |
644 | fuse_invalidate_attr(inode); | 646 | fuse_invalidate_attr(inode); |
645 | fuse_invalidate_attr(dir); | 647 | fuse_invalidate_attr(dir); |
@@ -662,7 +664,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) | |||
662 | req->in.numargs = 1; | 664 | req->in.numargs = 1; |
663 | req->in.args[0].size = entry->d_name.len + 1; | 665 | req->in.args[0].size = entry->d_name.len + 1; |
664 | req->in.args[0].value = entry->d_name.name; | 666 | req->in.args[0].value = entry->d_name.name; |
665 | request_send(fc, req); | 667 | fuse_request_send(fc, req); |
666 | err = req->out.h.error; | 668 | err = req->out.h.error; |
667 | fuse_put_request(fc, req); | 669 | fuse_put_request(fc, req); |
668 | if (!err) { | 670 | if (!err) { |
@@ -695,7 +697,7 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, | |||
695 | req->in.args[1].value = oldent->d_name.name; | 697 | req->in.args[1].value = oldent->d_name.name; |
696 | req->in.args[2].size = newent->d_name.len + 1; | 698 | req->in.args[2].size = newent->d_name.len + 1; |
697 | req->in.args[2].value = newent->d_name.name; | 699 | req->in.args[2].value = newent->d_name.name; |
698 | request_send(fc, req); | 700 | fuse_request_send(fc, req); |
699 | err = req->out.h.error; | 701 | err = req->out.h.error; |
700 | fuse_put_request(fc, req); | 702 | fuse_put_request(fc, req); |
701 | if (!err) { | 703 | if (!err) { |
@@ -811,7 +813,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, | |||
811 | else | 813 | else |
812 | req->out.args[0].size = sizeof(outarg); | 814 | req->out.args[0].size = sizeof(outarg); |
813 | req->out.args[0].value = &outarg; | 815 | req->out.args[0].value = &outarg; |
814 | request_send(fc, req); | 816 | fuse_request_send(fc, req); |
815 | err = req->out.h.error; | 817 | err = req->out.h.error; |
816 | fuse_put_request(fc, req); | 818 | fuse_put_request(fc, req); |
817 | if (!err) { | 819 | if (!err) { |
@@ -911,7 +913,7 @@ static int fuse_access(struct inode *inode, int mask) | |||
911 | req->in.numargs = 1; | 913 | req->in.numargs = 1; |
912 | req->in.args[0].size = sizeof(inarg); | 914 | req->in.args[0].size = sizeof(inarg); |
913 | req->in.args[0].value = &inarg; | 915 | req->in.args[0].value = &inarg; |
914 | request_send(fc, req); | 916 | fuse_request_send(fc, req); |
915 | err = req->out.h.error; | 917 | err = req->out.h.error; |
916 | fuse_put_request(fc, req); | 918 | fuse_put_request(fc, req); |
917 | if (err == -ENOSYS) { | 919 | if (err == -ENOSYS) { |
@@ -1033,7 +1035,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) | |||
1033 | req->num_pages = 1; | 1035 | req->num_pages = 1; |
1034 | req->pages[0] = page; | 1036 | req->pages[0] = page; |
1035 | fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR); | 1037 | fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR); |
1036 | request_send(fc, req); | 1038 | fuse_request_send(fc, req); |
1037 | nbytes = req->out.args[0].size; | 1039 | nbytes = req->out.args[0].size; |
1038 | err = req->out.h.error; | 1040 | err = req->out.h.error; |
1039 | fuse_put_request(fc, req); | 1041 | fuse_put_request(fc, req); |
@@ -1067,7 +1069,7 @@ static char *read_link(struct dentry *dentry) | |||
1067 | req->out.numargs = 1; | 1069 | req->out.numargs = 1; |
1068 | req->out.args[0].size = PAGE_SIZE - 1; | 1070 | req->out.args[0].size = PAGE_SIZE - 1; |
1069 | req->out.args[0].value = link; | 1071 | req->out.args[0].value = link; |
1070 | request_send(fc, req); | 1072 | fuse_request_send(fc, req); |
1071 | if (req->out.h.error) { | 1073 | if (req->out.h.error) { |
1072 | free_page((unsigned long) link); | 1074 | free_page((unsigned long) link); |
1073 | link = ERR_PTR(req->out.h.error); | 1075 | link = ERR_PTR(req->out.h.error); |
@@ -1273,7 +1275,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, | |||
1273 | else | 1275 | else |
1274 | req->out.args[0].size = sizeof(outarg); | 1276 | req->out.args[0].size = sizeof(outarg); |
1275 | req->out.args[0].value = &outarg; | 1277 | req->out.args[0].value = &outarg; |
1276 | request_send(fc, req); | 1278 | fuse_request_send(fc, req); |
1277 | err = req->out.h.error; | 1279 | err = req->out.h.error; |
1278 | fuse_put_request(fc, req); | 1280 | fuse_put_request(fc, req); |
1279 | if (err) { | 1281 | if (err) { |
@@ -1367,7 +1369,7 @@ static int fuse_setxattr(struct dentry *entry, const char *name, | |||
1367 | req->in.args[1].value = name; | 1369 | req->in.args[1].value = name; |
1368 | req->in.args[2].size = size; | 1370 | req->in.args[2].size = size; |
1369 | req->in.args[2].value = value; | 1371 | req->in.args[2].value = value; |
1370 | request_send(fc, req); | 1372 | fuse_request_send(fc, req); |
1371 | err = req->out.h.error; | 1373 | err = req->out.h.error; |
1372 | fuse_put_request(fc, req); | 1374 | fuse_put_request(fc, req); |
1373 | if (err == -ENOSYS) { | 1375 | if (err == -ENOSYS) { |
@@ -1413,7 +1415,7 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name, | |||
1413 | req->out.args[0].size = sizeof(outarg); | 1415 | req->out.args[0].size = sizeof(outarg); |
1414 | req->out.args[0].value = &outarg; | 1416 | req->out.args[0].value = &outarg; |
1415 | } | 1417 | } |
1416 | request_send(fc, req); | 1418 | fuse_request_send(fc, req); |
1417 | ret = req->out.h.error; | 1419 | ret = req->out.h.error; |
1418 | if (!ret) | 1420 | if (!ret) |
1419 | ret = size ? req->out.args[0].size : outarg.size; | 1421 | ret = size ? req->out.args[0].size : outarg.size; |
@@ -1463,7 +1465,7 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) | |||
1463 | req->out.args[0].size = sizeof(outarg); | 1465 | req->out.args[0].size = sizeof(outarg); |
1464 | req->out.args[0].value = &outarg; | 1466 | req->out.args[0].value = &outarg; |
1465 | } | 1467 | } |
1466 | request_send(fc, req); | 1468 | fuse_request_send(fc, req); |
1467 | ret = req->out.h.error; | 1469 | ret = req->out.h.error; |
1468 | if (!ret) | 1470 | if (!ret) |
1469 | ret = size ? req->out.args[0].size : outarg.size; | 1471 | ret = size ? req->out.args[0].size : outarg.size; |
@@ -1496,7 +1498,7 @@ static int fuse_removexattr(struct dentry *entry, const char *name) | |||
1496 | req->in.numargs = 1; | 1498 | req->in.numargs = 1; |
1497 | req->in.args[0].size = strlen(name) + 1; | 1499 | req->in.args[0].size = strlen(name) + 1; |
1498 | req->in.args[0].value = name; | 1500 | req->in.args[0].value = name; |
1499 | request_send(fc, req); | 1501 | fuse_request_send(fc, req); |
1500 | err = req->out.h.error; | 1502 | err = req->out.h.error; |
1501 | fuse_put_request(fc, req); | 1503 | fuse_put_request(fc, req); |
1502 | if (err == -ENOSYS) { | 1504 | if (err == -ENOSYS) { |
diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4c9ee7011265..e8162646a9b5 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c | |||
@@ -1,6 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | FUSE: Filesystem in Userspace | 2 | FUSE: Filesystem in Userspace |
3 | Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> | 3 | Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> |
4 | 4 | ||
5 | This program can be distributed under the terms of the GNU GPL. | 5 | This program can be distributed under the terms of the GNU GPL. |
6 | See the file COPYING. | 6 | See the file COPYING. |
@@ -39,14 +39,14 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir, | |||
39 | req->out.numargs = 1; | 39 | req->out.numargs = 1; |
40 | req->out.args[0].size = sizeof(*outargp); | 40 | req->out.args[0].size = sizeof(*outargp); |
41 | req->out.args[0].value = outargp; | 41 | req->out.args[0].value = outargp; |
42 | request_send(fc, req); | 42 | fuse_request_send(fc, req); |
43 | err = req->out.h.error; | 43 | err = req->out.h.error; |
44 | fuse_put_request(fc, req); | 44 | fuse_put_request(fc, req); |
45 | 45 | ||
46 | return err; | 46 | return err; |
47 | } | 47 | } |
48 | 48 | ||
49 | struct fuse_file *fuse_file_alloc(void) | 49 | struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) |
50 | { | 50 | { |
51 | struct fuse_file *ff; | 51 | struct fuse_file *ff; |
52 | ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL); | 52 | ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL); |
@@ -58,7 +58,12 @@ struct fuse_file *fuse_file_alloc(void) | |||
58 | } else { | 58 | } else { |
59 | INIT_LIST_HEAD(&ff->write_entry); | 59 | INIT_LIST_HEAD(&ff->write_entry); |
60 | atomic_set(&ff->count, 0); | 60 | atomic_set(&ff->count, 0); |
61 | spin_lock(&fc->lock); | ||
62 | ff->kh = ++fc->khctr; | ||
63 | spin_unlock(&fc->lock); | ||
61 | } | 64 | } |
65 | RB_CLEAR_NODE(&ff->polled_node); | ||
66 | init_waitqueue_head(&ff->poll_wait); | ||
62 | } | 67 | } |
63 | return ff; | 68 | return ff; |
64 | } | 69 | } |
@@ -79,7 +84,6 @@ static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) | |||
79 | { | 84 | { |
80 | dput(req->misc.release.dentry); | 85 | dput(req->misc.release.dentry); |
81 | mntput(req->misc.release.vfsmount); | 86 | mntput(req->misc.release.vfsmount); |
82 | fuse_put_request(fc, req); | ||
83 | } | 87 | } |
84 | 88 | ||
85 | static void fuse_file_put(struct fuse_file *ff) | 89 | static void fuse_file_put(struct fuse_file *ff) |
@@ -89,7 +93,7 @@ static void fuse_file_put(struct fuse_file *ff) | |||
89 | struct inode *inode = req->misc.release.dentry->d_inode; | 93 | struct inode *inode = req->misc.release.dentry->d_inode; |
90 | struct fuse_conn *fc = get_fuse_conn(inode); | 94 | struct fuse_conn *fc = get_fuse_conn(inode); |
91 | req->end = fuse_release_end; | 95 | req->end = fuse_release_end; |
92 | request_send_background(fc, req); | 96 | fuse_request_send_background(fc, req); |
93 | kfree(ff); | 97 | kfree(ff); |
94 | } | 98 | } |
95 | } | 99 | } |
@@ -109,6 +113,7 @@ void fuse_finish_open(struct inode *inode, struct file *file, | |||
109 | 113 | ||
110 | int fuse_open_common(struct inode *inode, struct file *file, int isdir) | 114 | int fuse_open_common(struct inode *inode, struct file *file, int isdir) |
111 | { | 115 | { |
116 | struct fuse_conn *fc = get_fuse_conn(inode); | ||
112 | struct fuse_open_out outarg; | 117 | struct fuse_open_out outarg; |
113 | struct fuse_file *ff; | 118 | struct fuse_file *ff; |
114 | int err; | 119 | int err; |
@@ -121,7 +126,7 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir) | |||
121 | if (err) | 126 | if (err) |
122 | return err; | 127 | return err; |
123 | 128 | ||
124 | ff = fuse_file_alloc(); | 129 | ff = fuse_file_alloc(fc); |
125 | if (!ff) | 130 | if (!ff) |
126 | return -ENOMEM; | 131 | return -ENOMEM; |
127 | 132 | ||
@@ -167,7 +172,11 @@ int fuse_release_common(struct inode *inode, struct file *file, int isdir) | |||
167 | 172 | ||
168 | spin_lock(&fc->lock); | 173 | spin_lock(&fc->lock); |
169 | list_del(&ff->write_entry); | 174 | list_del(&ff->write_entry); |
175 | if (!RB_EMPTY_NODE(&ff->polled_node)) | ||
176 | rb_erase(&ff->polled_node, &fc->polled_files); | ||
170 | spin_unlock(&fc->lock); | 177 | spin_unlock(&fc->lock); |
178 | |||
179 | wake_up_interruptible_sync(&ff->poll_wait); | ||
171 | /* | 180 | /* |
172 | * Normally this will send the RELEASE request, | 181 | * Normally this will send the RELEASE request, |
173 | * however if some asynchronous READ or WRITE requests | 182 | * however if some asynchronous READ or WRITE requests |
@@ -280,7 +289,7 @@ static int fuse_flush(struct file *file, fl_owner_t id) | |||
280 | req->in.args[0].size = sizeof(inarg); | 289 | req->in.args[0].size = sizeof(inarg); |
281 | req->in.args[0].value = &inarg; | 290 | req->in.args[0].value = &inarg; |
282 | req->force = 1; | 291 | req->force = 1; |
283 | request_send(fc, req); | 292 | fuse_request_send(fc, req); |
284 | err = req->out.h.error; | 293 | err = req->out.h.error; |
285 | fuse_put_request(fc, req); | 294 | fuse_put_request(fc, req); |
286 | if (err == -ENOSYS) { | 295 | if (err == -ENOSYS) { |
@@ -344,7 +353,7 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, | |||
344 | req->in.numargs = 1; | 353 | req->in.numargs = 1; |
345 | req->in.args[0].size = sizeof(inarg); | 354 | req->in.args[0].size = sizeof(inarg); |
346 | req->in.args[0].value = &inarg; | 355 | req->in.args[0].value = &inarg; |
347 | request_send(fc, req); | 356 | fuse_request_send(fc, req); |
348 | err = req->out.h.error; | 357 | err = req->out.h.error; |
349 | fuse_put_request(fc, req); | 358 | fuse_put_request(fc, req); |
350 | if (err == -ENOSYS) { | 359 | if (err == -ENOSYS) { |
@@ -396,7 +405,7 @@ static size_t fuse_send_read(struct fuse_req *req, struct file *file, | |||
396 | inarg->read_flags |= FUSE_READ_LOCKOWNER; | 405 | inarg->read_flags |= FUSE_READ_LOCKOWNER; |
397 | inarg->lock_owner = fuse_lock_owner_id(fc, owner); | 406 | inarg->lock_owner = fuse_lock_owner_id(fc, owner); |
398 | } | 407 | } |
399 | request_send(fc, req); | 408 | fuse_request_send(fc, req); |
400 | return req->out.args[0].size; | 409 | return req->out.args[0].size; |
401 | } | 410 | } |
402 | 411 | ||
@@ -493,7 +502,6 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) | |||
493 | } | 502 | } |
494 | if (req->ff) | 503 | if (req->ff) |
495 | fuse_file_put(req->ff); | 504 | fuse_file_put(req->ff); |
496 | fuse_put_request(fc, req); | ||
497 | } | 505 | } |
498 | 506 | ||
499 | static void fuse_send_readpages(struct fuse_req *req, struct file *file, | 507 | static void fuse_send_readpages(struct fuse_req *req, struct file *file, |
@@ -509,10 +517,11 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file, | |||
509 | struct fuse_file *ff = file->private_data; | 517 | struct fuse_file *ff = file->private_data; |
510 | req->ff = fuse_file_get(ff); | 518 | req->ff = fuse_file_get(ff); |
511 | req->end = fuse_readpages_end; | 519 | req->end = fuse_readpages_end; |
512 | request_send_background(fc, req); | 520 | fuse_request_send_background(fc, req); |
513 | } else { | 521 | } else { |
514 | request_send(fc, req); | 522 | fuse_request_send(fc, req); |
515 | fuse_readpages_end(fc, req); | 523 | fuse_readpages_end(fc, req); |
524 | fuse_put_request(fc, req); | ||
516 | } | 525 | } |
517 | } | 526 | } |
518 | 527 | ||
@@ -543,7 +552,7 @@ static int fuse_readpages_fill(void *_data, struct page *page) | |||
543 | } | 552 | } |
544 | } | 553 | } |
545 | req->pages[req->num_pages] = page; | 554 | req->pages[req->num_pages] = page; |
546 | req->num_pages ++; | 555 | req->num_pages++; |
547 | return 0; | 556 | return 0; |
548 | } | 557 | } |
549 | 558 | ||
@@ -636,7 +645,7 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file, | |||
636 | inarg->write_flags |= FUSE_WRITE_LOCKOWNER; | 645 | inarg->write_flags |= FUSE_WRITE_LOCKOWNER; |
637 | inarg->lock_owner = fuse_lock_owner_id(fc, owner); | 646 | inarg->lock_owner = fuse_lock_owner_id(fc, owner); |
638 | } | 647 | } |
639 | request_send(fc, req); | 648 | fuse_request_send(fc, req); |
640 | return req->misc.write.out.size; | 649 | return req->misc.write.out.size; |
641 | } | 650 | } |
642 | 651 | ||
@@ -1042,7 +1051,6 @@ static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) | |||
1042 | { | 1051 | { |
1043 | __free_page(req->pages[0]); | 1052 | __free_page(req->pages[0]); |
1044 | fuse_file_put(req->ff); | 1053 | fuse_file_put(req->ff); |
1045 | fuse_put_request(fc, req); | ||
1046 | } | 1054 | } |
1047 | 1055 | ||
1048 | static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) | 1056 | static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) |
@@ -1060,6 +1068,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) | |||
1060 | 1068 | ||
1061 | /* Called under fc->lock, may release and reacquire it */ | 1069 | /* Called under fc->lock, may release and reacquire it */ |
1062 | static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) | 1070 | static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) |
1071 | __releases(&fc->lock) | ||
1072 | __acquires(&fc->lock) | ||
1063 | { | 1073 | { |
1064 | struct fuse_inode *fi = get_fuse_inode(req->inode); | 1074 | struct fuse_inode *fi = get_fuse_inode(req->inode); |
1065 | loff_t size = i_size_read(req->inode); | 1075 | loff_t size = i_size_read(req->inode); |
@@ -1079,13 +1089,14 @@ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) | |||
1079 | 1089 | ||
1080 | req->in.args[1].size = inarg->size; | 1090 | req->in.args[1].size = inarg->size; |
1081 | fi->writectr++; | 1091 | fi->writectr++; |
1082 | request_send_background_locked(fc, req); | 1092 | fuse_request_send_background_locked(fc, req); |
1083 | return; | 1093 | return; |
1084 | 1094 | ||
1085 | out_free: | 1095 | out_free: |
1086 | fuse_writepage_finish(fc, req); | 1096 | fuse_writepage_finish(fc, req); |
1087 | spin_unlock(&fc->lock); | 1097 | spin_unlock(&fc->lock); |
1088 | fuse_writepage_free(fc, req); | 1098 | fuse_writepage_free(fc, req); |
1099 | fuse_put_request(fc, req); | ||
1089 | spin_lock(&fc->lock); | 1100 | spin_lock(&fc->lock); |
1090 | } | 1101 | } |
1091 | 1102 | ||
@@ -1096,6 +1107,8 @@ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) | |||
1096 | * Called with fc->lock | 1107 | * Called with fc->lock |
1097 | */ | 1108 | */ |
1098 | void fuse_flush_writepages(struct inode *inode) | 1109 | void fuse_flush_writepages(struct inode *inode) |
1110 | __releases(&fc->lock) | ||
1111 | __acquires(&fc->lock) | ||
1099 | { | 1112 | { |
1100 | struct fuse_conn *fc = get_fuse_conn(inode); | 1113 | struct fuse_conn *fc = get_fuse_conn(inode); |
1101 | struct fuse_inode *fi = get_fuse_inode(inode); | 1114 | struct fuse_inode *fi = get_fuse_inode(inode); |
@@ -1325,7 +1338,7 @@ static int fuse_getlk(struct file *file, struct file_lock *fl) | |||
1325 | req->out.numargs = 1; | 1338 | req->out.numargs = 1; |
1326 | req->out.args[0].size = sizeof(outarg); | 1339 | req->out.args[0].size = sizeof(outarg); |
1327 | req->out.args[0].value = &outarg; | 1340 | req->out.args[0].value = &outarg; |
1328 | request_send(fc, req); | 1341 | fuse_request_send(fc, req); |
1329 | err = req->out.h.error; | 1342 | err = req->out.h.error; |
1330 | fuse_put_request(fc, req); | 1343 | fuse_put_request(fc, req); |
1331 | if (!err) | 1344 | if (!err) |
@@ -1357,7 +1370,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) | |||
1357 | return PTR_ERR(req); | 1370 | return PTR_ERR(req); |
1358 | 1371 | ||
1359 | fuse_lk_fill(req, file, fl, opcode, pid, flock); | 1372 | fuse_lk_fill(req, file, fl, opcode, pid, flock); |
1360 | request_send(fc, req); | 1373 | fuse_request_send(fc, req); |
1361 | err = req->out.h.error; | 1374 | err = req->out.h.error; |
1362 | /* locking is restartable */ | 1375 | /* locking is restartable */ |
1363 | if (err == -EINTR) | 1376 | if (err == -EINTR) |
@@ -1433,7 +1446,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) | |||
1433 | req->out.numargs = 1; | 1446 | req->out.numargs = 1; |
1434 | req->out.args[0].size = sizeof(outarg); | 1447 | req->out.args[0].size = sizeof(outarg); |
1435 | req->out.args[0].value = &outarg; | 1448 | req->out.args[0].value = &outarg; |
1436 | request_send(fc, req); | 1449 | fuse_request_send(fc, req); |
1437 | err = req->out.h.error; | 1450 | err = req->out.h.error; |
1438 | fuse_put_request(fc, req); | 1451 | fuse_put_request(fc, req); |
1439 | if (err == -ENOSYS) | 1452 | if (err == -ENOSYS) |
@@ -1470,6 +1483,406 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin) | |||
1470 | return retval; | 1483 | return retval; |
1471 | } | 1484 | } |
1472 | 1485 | ||
1486 | static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov, | ||
1487 | unsigned int nr_segs, size_t bytes, bool to_user) | ||
1488 | { | ||
1489 | struct iov_iter ii; | ||
1490 | int page_idx = 0; | ||
1491 | |||
1492 | if (!bytes) | ||
1493 | return 0; | ||
1494 | |||
1495 | iov_iter_init(&ii, iov, nr_segs, bytes, 0); | ||
1496 | |||
1497 | while (iov_iter_count(&ii)) { | ||
1498 | struct page *page = pages[page_idx++]; | ||
1499 | size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii)); | ||
1500 | void *kaddr, *map; | ||
1501 | |||
1502 | kaddr = map = kmap(page); | ||
1503 | |||
1504 | while (todo) { | ||
1505 | char __user *uaddr = ii.iov->iov_base + ii.iov_offset; | ||
1506 | size_t iov_len = ii.iov->iov_len - ii.iov_offset; | ||
1507 | size_t copy = min(todo, iov_len); | ||
1508 | size_t left; | ||
1509 | |||
1510 | if (!to_user) | ||
1511 | left = copy_from_user(kaddr, uaddr, copy); | ||
1512 | else | ||
1513 | left = copy_to_user(uaddr, kaddr, copy); | ||
1514 | |||
1515 | if (unlikely(left)) | ||
1516 | return -EFAULT; | ||
1517 | |||
1518 | iov_iter_advance(&ii, copy); | ||
1519 | todo -= copy; | ||
1520 | kaddr += copy; | ||
1521 | } | ||
1522 | |||
1523 | kunmap(map); | ||
1524 | } | ||
1525 | |||
1526 | return 0; | ||
1527 | } | ||
1528 | |||
1529 | /* | ||
1530 | * For ioctls, there is no generic way to determine how much memory | ||
1531 | * needs to be read and/or written. Furthermore, ioctls are allowed | ||
1532 | * to dereference the passed pointer, so the parameter requires deep | ||
1533 | * copying but FUSE has no idea whatsoever about what to copy in or | ||
1534 | * out. | ||
1535 | * | ||
1536 | * This is solved by allowing FUSE server to retry ioctl with | ||
1537 | * necessary in/out iovecs. Let's assume the ioctl implementation | ||
1538 | * needs to read in the following structure. | ||
1539 | * | ||
1540 | * struct a { | ||
1541 | * char *buf; | ||
1542 | * size_t buflen; | ||
1543 | * } | ||
1544 | * | ||
1545 | * On the first callout to FUSE server, inarg->in_size and | ||
1546 | * inarg->out_size will be NULL; then, the server completes the ioctl | ||
1547 | * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and | ||
1548 | * the actual iov array to | ||
1549 | * | ||
1550 | * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) } } | ||
1551 | * | ||
1552 | * which tells FUSE to copy in the requested area and retry the ioctl. | ||
1553 | * On the second round, the server has access to the structure and | ||
1554 | * from that it can tell what to look for next, so on the invocation, | ||
1555 | * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to | ||
1556 | * | ||
1557 | * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) }, | ||
1558 | * { .iov_base = a.buf, .iov_len = a.buflen } } | ||
1559 | * | ||
1560 | * FUSE will copy both struct a and the pointed buffer from the | ||
1561 | * process doing the ioctl and retry ioctl with both struct a and the | ||
1562 | * buffer. | ||
1563 | * | ||
1564 | * This time, FUSE server has everything it needs and completes ioctl | ||
1565 | * without FUSE_IOCTL_RETRY which finishes the ioctl call. | ||
1566 | * | ||
1567 | * Copying data out works the same way. | ||
1568 | * | ||
1569 | * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel | ||
1570 | * automatically initializes in and out iovs by decoding @cmd with | ||
1571 | * _IOC_* macros and the server is not allowed to request RETRY. This | ||
1572 | * limits ioctl data transfers to well-formed ioctls and is the forced | ||
1573 | * behavior for all FUSE servers. | ||
1574 | */ | ||
1575 | static long fuse_file_do_ioctl(struct file *file, unsigned int cmd, | ||
1576 | unsigned long arg, unsigned int flags) | ||
1577 | { | ||
1578 | struct inode *inode = file->f_dentry->d_inode; | ||
1579 | struct fuse_file *ff = file->private_data; | ||
1580 | struct fuse_conn *fc = get_fuse_conn(inode); | ||
1581 | struct fuse_ioctl_in inarg = { | ||
1582 | .fh = ff->fh, | ||
1583 | .cmd = cmd, | ||
1584 | .arg = arg, | ||
1585 | .flags = flags | ||
1586 | }; | ||
1587 | struct fuse_ioctl_out outarg; | ||
1588 | struct fuse_req *req = NULL; | ||
1589 | struct page **pages = NULL; | ||
1590 | struct page *iov_page = NULL; | ||
1591 | struct iovec *in_iov = NULL, *out_iov = NULL; | ||
1592 | unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages; | ||
1593 | size_t in_size, out_size, transferred; | ||
1594 | int err; | ||
1595 | |||
1596 | /* assume all the iovs returned by client always fit in a page */ | ||
1597 | BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); | ||
1598 | |||
1599 | if (!fuse_allow_task(fc, current)) | ||
1600 | return -EACCES; | ||
1601 | |||
1602 | err = -EIO; | ||
1603 | if (is_bad_inode(inode)) | ||
1604 | goto out; | ||
1605 | |||
1606 | err = -ENOMEM; | ||
1607 | pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL); | ||
1608 | iov_page = alloc_page(GFP_KERNEL); | ||
1609 | if (!pages || !iov_page) | ||
1610 | goto out; | ||
1611 | |||
1612 | /* | ||
1613 | * If restricted, initialize IO parameters as encoded in @cmd. | ||
1614 | * RETRY from server is not allowed. | ||
1615 | */ | ||
1616 | if (!(flags & FUSE_IOCTL_UNRESTRICTED)) { | ||
1617 | struct iovec *iov = page_address(iov_page); | ||
1618 | |||
1619 | iov->iov_base = (void __user *)arg; | ||
1620 | iov->iov_len = _IOC_SIZE(cmd); | ||
1621 | |||
1622 | if (_IOC_DIR(cmd) & _IOC_WRITE) { | ||
1623 | in_iov = iov; | ||
1624 | in_iovs = 1; | ||
1625 | } | ||
1626 | |||
1627 | if (_IOC_DIR(cmd) & _IOC_READ) { | ||
1628 | out_iov = iov; | ||
1629 | out_iovs = 1; | ||
1630 | } | ||
1631 | } | ||
1632 | |||
1633 | retry: | ||
1634 | inarg.in_size = in_size = iov_length(in_iov, in_iovs); | ||
1635 | inarg.out_size = out_size = iov_length(out_iov, out_iovs); | ||
1636 | |||
1637 | /* | ||
1638 | * Out data can be used either for actual out data or iovs, | ||
1639 | * make sure there always is at least one page. | ||
1640 | */ | ||
1641 | out_size = max_t(size_t, out_size, PAGE_SIZE); | ||
1642 | max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE); | ||
1643 | |||
1644 | /* make sure there are enough buffer pages and init request with them */ | ||
1645 | err = -ENOMEM; | ||
1646 | if (max_pages > FUSE_MAX_PAGES_PER_REQ) | ||
1647 | goto out; | ||
1648 | while (num_pages < max_pages) { | ||
1649 | pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); | ||
1650 | if (!pages[num_pages]) | ||
1651 | goto out; | ||
1652 | num_pages++; | ||
1653 | } | ||
1654 | |||
1655 | req = fuse_get_req(fc); | ||
1656 | if (IS_ERR(req)) { | ||
1657 | err = PTR_ERR(req); | ||
1658 | req = NULL; | ||
1659 | goto out; | ||
1660 | } | ||
1661 | memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages); | ||
1662 | req->num_pages = num_pages; | ||
1663 | |||
1664 | /* okay, let's send it to the client */ | ||
1665 | req->in.h.opcode = FUSE_IOCTL; | ||
1666 | req->in.h.nodeid = get_node_id(inode); | ||
1667 | req->in.numargs = 1; | ||
1668 | req->in.args[0].size = sizeof(inarg); | ||
1669 | req->in.args[0].value = &inarg; | ||
1670 | if (in_size) { | ||
1671 | req->in.numargs++; | ||
1672 | req->in.args[1].size = in_size; | ||
1673 | req->in.argpages = 1; | ||
1674 | |||
1675 | err = fuse_ioctl_copy_user(pages, in_iov, in_iovs, in_size, | ||
1676 | false); | ||
1677 | if (err) | ||
1678 | goto out; | ||
1679 | } | ||
1680 | |||
1681 | req->out.numargs = 2; | ||
1682 | req->out.args[0].size = sizeof(outarg); | ||
1683 | req->out.args[0].value = &outarg; | ||
1684 | req->out.args[1].size = out_size; | ||
1685 | req->out.argpages = 1; | ||
1686 | req->out.argvar = 1; | ||
1687 | |||
1688 | fuse_request_send(fc, req); | ||
1689 | err = req->out.h.error; | ||
1690 | transferred = req->out.args[1].size; | ||
1691 | fuse_put_request(fc, req); | ||
1692 | req = NULL; | ||
1693 | if (err) | ||
1694 | goto out; | ||
1695 | |||
1696 | /* did it ask for retry? */ | ||
1697 | if (outarg.flags & FUSE_IOCTL_RETRY) { | ||
1698 | char *vaddr; | ||
1699 | |||
1700 | /* no retry if in restricted mode */ | ||
1701 | err = -EIO; | ||
1702 | if (!(flags & FUSE_IOCTL_UNRESTRICTED)) | ||
1703 | goto out; | ||
1704 | |||
1705 | in_iovs = outarg.in_iovs; | ||
1706 | out_iovs = outarg.out_iovs; | ||
1707 | |||
1708 | /* | ||
1709 | * Make sure things are in boundary, separate checks | ||
1710 | * are to protect against overflow. | ||
1711 | */ | ||
1712 | err = -ENOMEM; | ||
1713 | if (in_iovs > FUSE_IOCTL_MAX_IOV || | ||
1714 | out_iovs > FUSE_IOCTL_MAX_IOV || | ||
1715 | in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) | ||
1716 | goto out; | ||
1717 | |||
1718 | err = -EIO; | ||
1719 | if ((in_iovs + out_iovs) * sizeof(struct iovec) != transferred) | ||
1720 | goto out; | ||
1721 | |||
1722 | /* okay, copy in iovs and retry */ | ||
1723 | vaddr = kmap_atomic(pages[0], KM_USER0); | ||
1724 | memcpy(page_address(iov_page), vaddr, transferred); | ||
1725 | kunmap_atomic(vaddr, KM_USER0); | ||
1726 | |||
1727 | in_iov = page_address(iov_page); | ||
1728 | out_iov = in_iov + in_iovs; | ||
1729 | |||
1730 | goto retry; | ||
1731 | } | ||
1732 | |||
1733 | err = -EIO; | ||
1734 | if (transferred > inarg.out_size) | ||
1735 | goto out; | ||
1736 | |||
1737 | err = fuse_ioctl_copy_user(pages, out_iov, out_iovs, transferred, true); | ||
1738 | out: | ||
1739 | if (req) | ||
1740 | fuse_put_request(fc, req); | ||
1741 | if (iov_page) | ||
1742 | __free_page(iov_page); | ||
1743 | while (num_pages) | ||
1744 | __free_page(pages[--num_pages]); | ||
1745 | kfree(pages); | ||
1746 | |||
1747 | return err ? err : outarg.result; | ||
1748 | } | ||
1749 | |||
1750 | static long fuse_file_ioctl(struct file *file, unsigned int cmd, | ||
1751 | unsigned long arg) | ||
1752 | { | ||
1753 | return fuse_file_do_ioctl(file, cmd, arg, 0); | ||
1754 | } | ||
1755 | |||
1756 | static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, | ||
1757 | unsigned long arg) | ||
1758 | { | ||
1759 | return fuse_file_do_ioctl(file, cmd, arg, FUSE_IOCTL_COMPAT); | ||
1760 | } | ||
1761 | |||
1762 | /* | ||
1763 | * All files which have been polled are linked to RB tree | ||
1764 | * fuse_conn->polled_files which is indexed by kh. Walk the tree and | ||
1765 | * find the matching one. | ||
1766 | */ | ||
1767 | static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh, | ||
1768 | struct rb_node **parent_out) | ||
1769 | { | ||
1770 | struct rb_node **link = &fc->polled_files.rb_node; | ||
1771 | struct rb_node *last = NULL; | ||
1772 | |||
1773 | while (*link) { | ||
1774 | struct fuse_file *ff; | ||
1775 | |||
1776 | last = *link; | ||
1777 | ff = rb_entry(last, struct fuse_file, polled_node); | ||
1778 | |||
1779 | if (kh < ff->kh) | ||
1780 | link = &last->rb_left; | ||
1781 | else if (kh > ff->kh) | ||
1782 | link = &last->rb_right; | ||
1783 | else | ||
1784 | return link; | ||
1785 | } | ||
1786 | |||
1787 | if (parent_out) | ||
1788 | *parent_out = last; | ||
1789 | return link; | ||
1790 | } | ||
1791 | |||
1792 | /* | ||
1793 | * The file is about to be polled. Make sure it's on the polled_files | ||
1794 | * RB tree. Note that files once added to the polled_files tree are | ||
1795 | * not removed before the file is released. This is because a file | ||
1796 | * polled once is likely to be polled again. | ||
1797 | */ | ||
1798 | static void fuse_register_polled_file(struct fuse_conn *fc, | ||
1799 | struct fuse_file *ff) | ||
1800 | { | ||
1801 | spin_lock(&fc->lock); | ||
1802 | if (RB_EMPTY_NODE(&ff->polled_node)) { | ||
1803 | struct rb_node **link, *parent; | ||
1804 | |||
1805 | link = fuse_find_polled_node(fc, ff->kh, &parent); | ||
1806 | BUG_ON(*link); | ||
1807 | rb_link_node(&ff->polled_node, parent, link); | ||
1808 | rb_insert_color(&ff->polled_node, &fc->polled_files); | ||
1809 | } | ||
1810 | spin_unlock(&fc->lock); | ||
1811 | } | ||
1812 | |||
1813 | static unsigned fuse_file_poll(struct file *file, poll_table *wait) | ||
1814 | { | ||
1815 | struct inode *inode = file->f_dentry->d_inode; | ||
1816 | struct fuse_file *ff = file->private_data; | ||
1817 | struct fuse_conn *fc = get_fuse_conn(inode); | ||
1818 | struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh }; | ||
1819 | struct fuse_poll_out outarg; | ||
1820 | struct fuse_req *req; | ||
1821 | int err; | ||
1822 | |||
1823 | if (fc->no_poll) | ||
1824 | return DEFAULT_POLLMASK; | ||
1825 | |||
1826 | poll_wait(file, &ff->poll_wait, wait); | ||
1827 | |||
1828 | /* | ||
1829 | * Ask for notification iff there's someone waiting for it. | ||
1830 | * The client may ignore the flag and always notify. | ||
1831 | */ | ||
1832 | if (waitqueue_active(&ff->poll_wait)) { | ||
1833 | inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY; | ||
1834 | fuse_register_polled_file(fc, ff); | ||
1835 | } | ||
1836 | |||
1837 | req = fuse_get_req(fc); | ||
1838 | if (IS_ERR(req)) | ||
1839 | return PTR_ERR(req); | ||
1840 | |||
1841 | req->in.h.opcode = FUSE_POLL; | ||
1842 | req->in.h.nodeid = get_node_id(inode); | ||
1843 | req->in.numargs = 1; | ||
1844 | req->in.args[0].size = sizeof(inarg); | ||
1845 | req->in.args[0].value = &inarg; | ||
1846 | req->out.numargs = 1; | ||
1847 | req->out.args[0].size = sizeof(outarg); | ||
1848 | req->out.args[0].value = &outarg; | ||
1849 | fuse_request_send(fc, req); | ||
1850 | err = req->out.h.error; | ||
1851 | fuse_put_request(fc, req); | ||
1852 | |||
1853 | if (!err) | ||
1854 | return outarg.revents; | ||
1855 | if (err == -ENOSYS) { | ||
1856 | fc->no_poll = 1; | ||
1857 | return DEFAULT_POLLMASK; | ||
1858 | } | ||
1859 | return POLLERR; | ||
1860 | } | ||
1861 | |||
1862 | /* | ||
1863 | * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and | ||
1864 | * wakes up the poll waiters. | ||
1865 | */ | ||
1866 | int fuse_notify_poll_wakeup(struct fuse_conn *fc, | ||
1867 | struct fuse_notify_poll_wakeup_out *outarg) | ||
1868 | { | ||
1869 | u64 kh = outarg->kh; | ||
1870 | struct rb_node **link; | ||
1871 | |||
1872 | spin_lock(&fc->lock); | ||
1873 | |||
1874 | link = fuse_find_polled_node(fc, kh, NULL); | ||
1875 | if (*link) { | ||
1876 | struct fuse_file *ff; | ||
1877 | |||
1878 | ff = rb_entry(*link, struct fuse_file, polled_node); | ||
1879 | wake_up_interruptible_sync(&ff->poll_wait); | ||
1880 | } | ||
1881 | |||
1882 | spin_unlock(&fc->lock); | ||
1883 | return 0; | ||
1884 | } | ||
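Taken together, fuse_file_poll() and fuse_notify_poll_wakeup() define both halves of the poll protocol. A sketch of the matching server side, assuming direct /dev/fuse access (reply_to_kernel() and send_notify() are hypothetical helpers, not an existing API):

        /* 1) Answer a FUSE_POLL request with the events that are ready now. */
        static int reply_poll(uint64_t unique, uint32_t revents)
        {
                struct fuse_poll_out out = { .revents = revents };
                return reply_to_kernel(unique, &out, sizeof(out));
        }

        /* 2) Later, if FUSE_POLL_SCHEDULE_NOTIFY was set in the request and the
         *    file becomes ready, send a notification carrying the kernel handle;
         *    this is what lands in fuse_notify_poll_wakeup() above and wakes the
         *    waiters on ff->poll_wait. */
        static int notify_poll_wakeup(uint64_t kh)
        {
                struct fuse_notify_poll_wakeup_out out = { .kh = kh };
                return send_notify(FUSE_NOTIFY_POLL, &out, sizeof(out));
        }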
1885 | |||
1473 | static const struct file_operations fuse_file_operations = { | 1886 | static const struct file_operations fuse_file_operations = { |
1474 | .llseek = fuse_file_llseek, | 1887 | .llseek = fuse_file_llseek, |
1475 | .read = do_sync_read, | 1888 | .read = do_sync_read, |
@@ -1484,6 +1897,9 @@ static const struct file_operations fuse_file_operations = { | |||
1484 | .lock = fuse_file_lock, | 1897 | .lock = fuse_file_lock, |
1485 | .flock = fuse_file_flock, | 1898 | .flock = fuse_file_flock, |
1486 | .splice_read = generic_file_splice_read, | 1899 | .splice_read = generic_file_splice_read, |
1900 | .unlocked_ioctl = fuse_file_ioctl, | ||
1901 | .compat_ioctl = fuse_file_compat_ioctl, | ||
1902 | .poll = fuse_file_poll, | ||
1487 | }; | 1903 | }; |
1488 | 1904 | ||
1489 | static const struct file_operations fuse_direct_io_file_operations = { | 1905 | static const struct file_operations fuse_direct_io_file_operations = { |
@@ -1496,6 +1912,9 @@ static const struct file_operations fuse_direct_io_file_operations = { | |||
1496 | .fsync = fuse_fsync, | 1912 | .fsync = fuse_fsync, |
1497 | .lock = fuse_file_lock, | 1913 | .lock = fuse_file_lock, |
1498 | .flock = fuse_file_flock, | 1914 | .flock = fuse_file_flock, |
1915 | .unlocked_ioctl = fuse_file_ioctl, | ||
1916 | .compat_ioctl = fuse_file_compat_ioctl, | ||
1917 | .poll = fuse_file_poll, | ||
1499 | /* no mmap and splice_read */ | 1918 | /* no mmap and splice_read */ |
1500 | }; | 1919 | }; |
1501 | 1920 | ||
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 35accfdd747f..5e64b815a5a1 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h | |||
@@ -1,6 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | FUSE: Filesystem in Userspace | 2 | FUSE: Filesystem in Userspace |
3 | Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> | 3 | Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> |
4 | 4 | ||
5 | This program can be distributed under the terms of the GNU GPL. | 5 | This program can be distributed under the terms of the GNU GPL. |
6 | See the file COPYING. | 6 | See the file COPYING. |
@@ -19,6 +19,8 @@ | |||
19 | #include <linux/backing-dev.h> | 19 | #include <linux/backing-dev.h> |
20 | #include <linux/mutex.h> | 20 | #include <linux/mutex.h> |
21 | #include <linux/rwsem.h> | 21 | #include <linux/rwsem.h> |
22 | #include <linux/rbtree.h> | ||
23 | #include <linux/poll.h> | ||
22 | 24 | ||
23 | /** Max number of pages that can be used in a single read request */ | 25 | /** Max number of pages that can be used in a single read request */ |
24 | #define FUSE_MAX_PAGES_PER_REQ 32 | 26 | #define FUSE_MAX_PAGES_PER_REQ 32 |
@@ -100,6 +102,9 @@ struct fuse_file { | |||
100 | /** Request reserved for flush and release */ | 102 | /** Request reserved for flush and release */ |
101 | struct fuse_req *reserved_req; | 103 | struct fuse_req *reserved_req; |
102 | 104 | ||
105 | /** Kernel file handle guaranteed to be unique */ | ||
106 | u64 kh; | ||
107 | |||
103 | /** File handle used by userspace */ | 108 | /** File handle used by userspace */ |
104 | u64 fh; | 109 | u64 fh; |
105 | 110 | ||
@@ -108,6 +113,12 @@ struct fuse_file { | |||
108 | 113 | ||
109 | /** Entry on inode's write_files list */ | 114 | /** Entry on inode's write_files list */ |
110 | struct list_head write_entry; | 115 | struct list_head write_entry; |
116 | |||
117 | /** RB node to be linked on fuse_conn->polled_files */ | ||
118 | struct rb_node polled_node; | ||
119 | |||
120 | /** Wait queue head for poll */ | ||
121 | wait_queue_head_t poll_wait; | ||
111 | }; | 122 | }; |
112 | 123 | ||
113 | /** One input argument of a request */ | 124 | /** One input argument of a request */ |
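The allocation-side counterpart is not part of this hunk, but given the new fuse_file_alloc(struct fuse_conn *fc) prototype further down, the new fields are presumably initialized roughly as follows (a sketch of the likely shape, not the literal patch):

        ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL);
        if (ff) {
                /* ... existing initialization ... */
                spin_lock(&fc->lock);
                ff->kh = ++fc->khctr;             /* hand out a unique kernel handle */
                spin_unlock(&fc->lock);
                RB_CLEAR_NODE(&ff->polled_node);  /* not on the polled_files tree yet */
                init_waitqueue_head(&ff->poll_wait);
        }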
@@ -322,6 +333,12 @@ struct fuse_conn { | |||
322 | /** The list of requests under I/O */ | 333 | /** The list of requests under I/O */ |
323 | struct list_head io; | 334 | struct list_head io; |
324 | 335 | ||
336 | /** The next unique kernel file handle */ | ||
337 | u64 khctr; | ||
338 | |||
339 | /** rbtree of fuse_files waiting for poll events indexed by kh */ | ||
340 | struct rb_root polled_files; | ||
341 | |||
325 | /** Number of requests currently in the background */ | 342 | /** Number of requests currently in the background */ |
326 | unsigned num_background; | 343 | unsigned num_background; |
327 | 344 | ||
@@ -355,19 +372,19 @@ struct fuse_conn { | |||
355 | /** Connection failed (version mismatch). Cannot race with | 372 | /** Connection failed (version mismatch). Cannot race with |
356 | setting other bitfields since it is only set once in INIT | 373 | setting other bitfields since it is only set once in INIT |
357 | reply, before any other request, and never cleared */ | 374 | reply, before any other request, and never cleared */ |
358 | unsigned conn_error : 1; | 375 | unsigned conn_error:1; |
359 | 376 | ||
360 | /** Connection successful. Only set in INIT */ | 377 | /** Connection successful. Only set in INIT */ |
361 | unsigned conn_init : 1; | 378 | unsigned conn_init:1; |
362 | 379 | ||
363 | /** Do readpages asynchronously? Only set in INIT */ | 380 | /** Do readpages asynchronously? Only set in INIT */ |
364 | unsigned async_read : 1; | 381 | unsigned async_read:1; |
365 | 382 | ||
366 | /** Do not send separate SETATTR request before open(O_TRUNC) */ | 383 | /** Do not send separate SETATTR request before open(O_TRUNC) */ |
367 | unsigned atomic_o_trunc : 1; | 384 | unsigned atomic_o_trunc:1; |
368 | 385 | ||
369 | /** Filesystem supports NFS exporting. Only set in INIT */ | 386 | /** Filesystem supports NFS exporting. Only set in INIT */ |
370 | unsigned export_support : 1; | 387 | unsigned export_support:1; |
371 | 388 | ||
372 | /* | 389 | /* |
373 | * The following bitfields are only for optimization purposes | 390 | * The following bitfields are only for optimization purposes |
@@ -375,43 +392,46 @@ struct fuse_conn { | |||
375 | */ | 392 | */ |
376 | 393 | ||
377 | /** Is fsync not implemented by fs? */ | 394 | /** Is fsync not implemented by fs? */ |
378 | unsigned no_fsync : 1; | 395 | unsigned no_fsync:1; |
379 | 396 | ||
380 | /** Is fsyncdir not implemented by fs? */ | 397 | /** Is fsyncdir not implemented by fs? */ |
381 | unsigned no_fsyncdir : 1; | 398 | unsigned no_fsyncdir:1; |
382 | 399 | ||
383 | /** Is flush not implemented by fs? */ | 400 | /** Is flush not implemented by fs? */ |
384 | unsigned no_flush : 1; | 401 | unsigned no_flush:1; |
385 | 402 | ||
386 | /** Is setxattr not implemented by fs? */ | 403 | /** Is setxattr not implemented by fs? */ |
387 | unsigned no_setxattr : 1; | 404 | unsigned no_setxattr:1; |
388 | 405 | ||
389 | /** Is getxattr not implemented by fs? */ | 406 | /** Is getxattr not implemented by fs? */ |
390 | unsigned no_getxattr : 1; | 407 | unsigned no_getxattr:1; |
391 | 408 | ||
392 | /** Is listxattr not implemented by fs? */ | 409 | /** Is listxattr not implemented by fs? */ |
393 | unsigned no_listxattr : 1; | 410 | unsigned no_listxattr:1; |
394 | 411 | ||
395 | /** Is removexattr not implemented by fs? */ | 412 | /** Is removexattr not implemented by fs? */ |
396 | unsigned no_removexattr : 1; | 413 | unsigned no_removexattr:1; |
397 | 414 | ||
398 | /** Are file locking primitives not implemented by fs? */ | 415 | /** Are file locking primitives not implemented by fs? */ |
399 | unsigned no_lock : 1; | 416 | unsigned no_lock:1; |
400 | 417 | ||
401 | /** Is access not implemented by fs? */ | 418 | /** Is access not implemented by fs? */ |
402 | unsigned no_access : 1; | 419 | unsigned no_access:1; |
403 | 420 | ||
404 | /** Is create not implemented by fs? */ | 421 | /** Is create not implemented by fs? */ |
405 | unsigned no_create : 1; | 422 | unsigned no_create:1; |
406 | 423 | ||
407 | /** Is interrupt not implemented by fs? */ | 424 | /** Is interrupt not implemented by fs? */ |
408 | unsigned no_interrupt : 1; | 425 | unsigned no_interrupt:1; |
409 | 426 | ||
410 | /** Is bmap not implemented by fs? */ | 427 | /** Is bmap not implemented by fs? */ |
411 | unsigned no_bmap : 1; | 428 | unsigned no_bmap:1; |
429 | |||
430 | /** Is poll not implemented by fs? */ | ||
431 | unsigned no_poll:1; | ||
412 | 432 | ||
413 | /** Do multi-page cached writes */ | 433 | /** Do multi-page cached writes */ |
414 | unsigned big_writes : 1; | 434 | unsigned big_writes:1; |
415 | 435 | ||
416 | /** The number of requests waiting for completion */ | 436 | /** The number of requests waiting for completion */ |
417 | atomic_t num_waiting; | 437 | atomic_t num_waiting; |
@@ -445,6 +465,9 @@ struct fuse_conn { | |||
445 | 465 | ||
446 | /** Version counter for attribute changes */ | 466 | /** Version counter for attribute changes */ |
447 | u64 attr_version; | 467 | u64 attr_version; |
468 | |||
469 | /** Called on final put */ | ||
470 | void (*release)(struct fuse_conn *); | ||
448 | }; | 471 | }; |
449 | 472 | ||
450 | static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) | 473 | static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) |
@@ -499,7 +522,7 @@ void fuse_read_fill(struct fuse_req *req, struct file *file, | |||
499 | */ | 522 | */ |
500 | int fuse_open_common(struct inode *inode, struct file *file, int isdir); | 523 | int fuse_open_common(struct inode *inode, struct file *file, int isdir); |
501 | 524 | ||
502 | struct fuse_file *fuse_file_alloc(void); | 525 | struct fuse_file *fuse_file_alloc(struct fuse_conn *fc); |
503 | void fuse_file_free(struct fuse_file *ff); | 526 | void fuse_file_free(struct fuse_file *ff); |
504 | void fuse_finish_open(struct inode *inode, struct file *file, | 527 | void fuse_finish_open(struct inode *inode, struct file *file, |
505 | struct fuse_file *ff, struct fuse_open_out *outarg); | 528 | struct fuse_file *ff, struct fuse_open_out *outarg); |
@@ -519,6 +542,12 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, | |||
519 | int isdir); | 542 | int isdir); |
520 | 543 | ||
521 | /** | 544 | /** |
545 | * Notify poll wakeup | ||
546 | */ | ||
547 | int fuse_notify_poll_wakeup(struct fuse_conn *fc, | ||
548 | struct fuse_notify_poll_wakeup_out *outarg); | ||
549 | |||
550 | /** | ||
522 | * Initialize file operations on a regular file | 551 | * Initialize file operations on a regular file |
523 | */ | 552 | */ |
524 | void fuse_init_file_inode(struct inode *inode); | 553 | void fuse_init_file_inode(struct inode *inode); |
@@ -593,19 +622,20 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); | |||
593 | /** | 622 | /** |
594 | * Send a request (synchronous) | 623 | * Send a request (synchronous) |
595 | */ | 624 | */ |
596 | void request_send(struct fuse_conn *fc, struct fuse_req *req); | 625 | void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req); |
597 | 626 | ||
598 | /** | 627 | /** |
599 | * Send a request with no reply | 628 | * Send a request with no reply |
600 | */ | 629 | */ |
601 | void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req); | 630 | void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req); |
602 | 631 | ||
603 | /** | 632 | /** |
604 | * Send a request in the background | 633 | * Send a request in the background |
605 | */ | 634 | */ |
606 | void request_send_background(struct fuse_conn *fc, struct fuse_req *req); | 635 | void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); |
607 | 636 | ||
608 | void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req); | 637 | void fuse_request_send_background_locked(struct fuse_conn *fc, |
638 | struct fuse_req *req); | ||
609 | 639 | ||
610 | /* Abort all requests */ | 640 | /* Abort all requests */ |
611 | void fuse_abort_conn(struct fuse_conn *fc); | 641 | void fuse_abort_conn(struct fuse_conn *fc); |
@@ -623,6 +653,11 @@ void fuse_invalidate_entry_cache(struct dentry *entry); | |||
623 | struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); | 653 | struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); |
624 | 654 | ||
625 | /** | 655 | /** |
656 | * Initialize fuse_conn | ||
657 | */ | ||
658 | int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb); | ||
659 | |||
660 | /** | ||
626 | * Release reference to fuse_conn | 661 | * Release reference to fuse_conn |
627 | */ | 662 | */ |
628 | void fuse_conn_put(struct fuse_conn *fc); | 663 | void fuse_conn_put(struct fuse_conn *fc); |
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 2e99f34b4435..47c96fdca1ac 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c | |||
@@ -1,6 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | FUSE: Filesystem in Userspace | 2 | FUSE: Filesystem in Userspace |
3 | Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> | 3 | Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> |
4 | 4 | ||
5 | This program can be distributed under the terms of the GNU GPL. | 5 | This program can be distributed under the terms of the GNU GPL. |
6 | See the file COPYING. | 6 | See the file COPYING. |
@@ -37,10 +37,10 @@ struct fuse_mount_data { | |||
37 | unsigned rootmode; | 37 | unsigned rootmode; |
38 | unsigned user_id; | 38 | unsigned user_id; |
39 | unsigned group_id; | 39 | unsigned group_id; |
40 | unsigned fd_present : 1; | 40 | unsigned fd_present:1; |
41 | unsigned rootmode_present : 1; | 41 | unsigned rootmode_present:1; |
42 | unsigned user_id_present : 1; | 42 | unsigned user_id_present:1; |
43 | unsigned group_id_present : 1; | 43 | unsigned group_id_present:1; |
44 | unsigned flags; | 44 | unsigned flags; |
45 | unsigned max_read; | 45 | unsigned max_read; |
46 | unsigned blksize; | 46 | unsigned blksize; |
@@ -94,7 +94,7 @@ void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req, | |||
94 | req->in.numargs = 1; | 94 | req->in.numargs = 1; |
95 | req->in.args[0].size = sizeof(struct fuse_forget_in); | 95 | req->in.args[0].size = sizeof(struct fuse_forget_in); |
96 | req->in.args[0].value = inarg; | 96 | req->in.args[0].value = inarg; |
97 | request_send_noreply(fc, req); | 97 | fuse_request_send_noreply(fc, req); |
98 | } | 98 | } |
99 | 99 | ||
100 | static void fuse_clear_inode(struct inode *inode) | 100 | static void fuse_clear_inode(struct inode *inode) |
@@ -250,7 +250,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, | |||
250 | 250 | ||
251 | fi = get_fuse_inode(inode); | 251 | fi = get_fuse_inode(inode); |
252 | spin_lock(&fc->lock); | 252 | spin_lock(&fc->lock); |
253 | fi->nlookup ++; | 253 | fi->nlookup++; |
254 | spin_unlock(&fc->lock); | 254 | spin_unlock(&fc->lock); |
255 | fuse_change_attributes(inode, attr, attr_valid, attr_version); | 255 | fuse_change_attributes(inode, attr, attr_valid, attr_version); |
256 | 256 | ||
@@ -269,7 +269,7 @@ static void fuse_send_destroy(struct fuse_conn *fc) | |||
269 | fc->destroy_req = NULL; | 269 | fc->destroy_req = NULL; |
270 | req->in.h.opcode = FUSE_DESTROY; | 270 | req->in.h.opcode = FUSE_DESTROY; |
271 | req->force = 1; | 271 | req->force = 1; |
272 | request_send(fc, req); | 272 | fuse_request_send(fc, req); |
273 | fuse_put_request(fc, req); | 273 | fuse_put_request(fc, req); |
274 | } | 274 | } |
275 | } | 275 | } |
@@ -334,7 +334,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
334 | req->out.args[0].size = | 334 | req->out.args[0].size = |
335 | fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg); | 335 | fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg); |
336 | req->out.args[0].value = &outarg; | 336 | req->out.args[0].value = &outarg; |
337 | request_send(fc, req); | 337 | fuse_request_send(fc, req); |
338 | err = req->out.h.error; | 338 | err = req->out.h.error; |
339 | if (!err) | 339 | if (!err) |
340 | convert_fuse_statfs(buf, &outarg.st); | 340 | convert_fuse_statfs(buf, &outarg.st); |
@@ -462,68 +462,69 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt) | |||
462 | return 0; | 462 | return 0; |
463 | } | 463 | } |
464 | 464 | ||
465 | static struct fuse_conn *new_conn(struct super_block *sb) | 465 | int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb) |
466 | { | 466 | { |
467 | struct fuse_conn *fc; | ||
468 | int err; | 467 | int err; |
469 | 468 | ||
470 | fc = kzalloc(sizeof(*fc), GFP_KERNEL); | 469 | memset(fc, 0, sizeof(*fc)); |
471 | if (fc) { | 470 | spin_lock_init(&fc->lock); |
472 | spin_lock_init(&fc->lock); | 471 | mutex_init(&fc->inst_mutex); |
473 | mutex_init(&fc->inst_mutex); | 472 | atomic_set(&fc->count, 1); |
474 | atomic_set(&fc->count, 1); | 473 | init_waitqueue_head(&fc->waitq); |
475 | init_waitqueue_head(&fc->waitq); | 474 | init_waitqueue_head(&fc->blocked_waitq); |
476 | init_waitqueue_head(&fc->blocked_waitq); | 475 | init_waitqueue_head(&fc->reserved_req_waitq); |
477 | init_waitqueue_head(&fc->reserved_req_waitq); | 476 | INIT_LIST_HEAD(&fc->pending); |
478 | INIT_LIST_HEAD(&fc->pending); | 477 | INIT_LIST_HEAD(&fc->processing); |
479 | INIT_LIST_HEAD(&fc->processing); | 478 | INIT_LIST_HEAD(&fc->io); |
480 | INIT_LIST_HEAD(&fc->io); | 479 | INIT_LIST_HEAD(&fc->interrupts); |
481 | INIT_LIST_HEAD(&fc->interrupts); | 480 | INIT_LIST_HEAD(&fc->bg_queue); |
482 | INIT_LIST_HEAD(&fc->bg_queue); | 481 | INIT_LIST_HEAD(&fc->entry); |
483 | atomic_set(&fc->num_waiting, 0); | 482 | atomic_set(&fc->num_waiting, 0); |
484 | fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; | 483 | fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; |
485 | fc->bdi.unplug_io_fn = default_unplug_io_fn; | 484 | fc->bdi.unplug_io_fn = default_unplug_io_fn; |
486 | /* fuse does its own writeback accounting */ | 485 | /* fuse does its own writeback accounting */ |
487 | fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB; | 486 | fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB; |
488 | fc->dev = sb->s_dev; | 487 | fc->khctr = 0; |
489 | err = bdi_init(&fc->bdi); | 488 | fc->polled_files = RB_ROOT; |
490 | if (err) | 489 | fc->dev = sb->s_dev; |
491 | goto error_kfree; | 490 | err = bdi_init(&fc->bdi); |
492 | if (sb->s_bdev) { | 491 | if (err) |
493 | err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk", | 492 | goto error_mutex_destroy; |
494 | MAJOR(fc->dev), MINOR(fc->dev)); | 493 | if (sb->s_bdev) { |
495 | } else { | 494 | err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk", |
496 | err = bdi_register_dev(&fc->bdi, fc->dev); | 495 | MAJOR(fc->dev), MINOR(fc->dev)); |
497 | } | 496 | } else { |
498 | if (err) | 497 | err = bdi_register_dev(&fc->bdi, fc->dev); |
499 | goto error_bdi_destroy; | ||
500 | /* | ||
501 | * For a single fuse filesystem use max 1% of dirty + | ||
502 | * writeback threshold. | ||
503 | * | ||
504 | * This gives about 1M of write buffer for memory maps on a | ||
505 | * machine with 1G and 10% dirty_ratio, which should be more | ||
506 | * than enough. | ||
507 | * | ||
508 | * Privileged users can raise it by writing to | ||
509 | * | ||
510 | * /sys/class/bdi/<bdi>/max_ratio | ||
511 | */ | ||
512 | bdi_set_max_ratio(&fc->bdi, 1); | ||
513 | fc->reqctr = 0; | ||
514 | fc->blocked = 1; | ||
515 | fc->attr_version = 1; | ||
516 | get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); | ||
517 | } | 498 | } |
518 | return fc; | 499 | if (err) |
500 | goto error_bdi_destroy; | ||
501 | /* | ||
502 | * For a single fuse filesystem use max 1% of dirty + | ||
503 | * writeback threshold. | ||
504 | * | ||
505 | * This gives about 1M of write buffer for memory maps on a | ||
506 | * machine with 1G and 10% dirty_ratio, which should be more | ||
507 | * than enough. | ||
508 | * | ||
509 | * Privileged users can raise it by writing to | ||
510 | * | ||
511 | * /sys/class/bdi/<bdi>/max_ratio | ||
512 | */ | ||
513 | bdi_set_max_ratio(&fc->bdi, 1); | ||
514 | fc->reqctr = 0; | ||
515 | fc->blocked = 1; | ||
516 | fc->attr_version = 1; | ||
517 | get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); | ||
519 | 518 | ||
520 | error_bdi_destroy: | 519 | return 0; |
520 | |||
521 | error_bdi_destroy: | ||
521 | bdi_destroy(&fc->bdi); | 522 | bdi_destroy(&fc->bdi); |
522 | error_kfree: | 523 | error_mutex_destroy: |
523 | mutex_destroy(&fc->inst_mutex); | 524 | mutex_destroy(&fc->inst_mutex); |
524 | kfree(fc); | 525 | return err; |
525 | return NULL; | ||
526 | } | 526 | } |
527 | EXPORT_SYMBOL_GPL(fuse_conn_init); | ||
527 | 528 | ||
528 | void fuse_conn_put(struct fuse_conn *fc) | 529 | void fuse_conn_put(struct fuse_conn *fc) |
529 | { | 530 | { |
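To put numbers on the 1% bdi limit set in fuse_conn_init() above: with 1 GiB of RAM and a 10% dirty_ratio, the global dirty+writeback threshold is roughly 100 MiB, and 1% of that gives each fuse filesystem on the order of 1 MiB of write buffer, which matches the figure quoted in the comment.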
@@ -532,7 +533,7 @@ void fuse_conn_put(struct fuse_conn *fc) | |||
532 | fuse_request_free(fc->destroy_req); | 533 | fuse_request_free(fc->destroy_req); |
533 | mutex_destroy(&fc->inst_mutex); | 534 | mutex_destroy(&fc->inst_mutex); |
534 | bdi_destroy(&fc->bdi); | 535 | bdi_destroy(&fc->bdi); |
535 | kfree(fc); | 536 | fc->release(fc); |
536 | } | 537 | } |
537 | } | 538 | } |
538 | 539 | ||
@@ -542,7 +543,7 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) | |||
542 | return fc; | 543 | return fc; |
543 | } | 544 | } |
544 | 545 | ||
545 | static struct inode *get_root_inode(struct super_block *sb, unsigned mode) | 546 | static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) |
546 | { | 547 | { |
547 | struct fuse_attr attr; | 548 | struct fuse_attr attr; |
548 | memset(&attr, 0, sizeof(attr)); | 549 | memset(&attr, 0, sizeof(attr)); |
@@ -553,8 +554,7 @@ static struct inode *get_root_inode(struct super_block *sb, unsigned mode) | |||
553 | return fuse_iget(sb, 1, 0, &attr, 0, 0); | 554 | return fuse_iget(sb, 1, 0, &attr, 0, 0); |
554 | } | 555 | } |
555 | 556 | ||
556 | struct fuse_inode_handle | 557 | struct fuse_inode_handle { |
557 | { | ||
558 | u64 nodeid; | 558 | u64 nodeid; |
559 | u32 generation; | 559 | u32 generation; |
560 | }; | 560 | }; |
@@ -761,7 +761,6 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) | |||
761 | fc->max_write = max_t(unsigned, 4096, fc->max_write); | 761 | fc->max_write = max_t(unsigned, 4096, fc->max_write); |
762 | fc->conn_init = 1; | 762 | fc->conn_init = 1; |
763 | } | 763 | } |
764 | fuse_put_request(fc, req); | ||
765 | fc->blocked = 0; | 764 | fc->blocked = 0; |
766 | wake_up_all(&fc->blocked_waitq); | 765 | wake_up_all(&fc->blocked_waitq); |
767 | } | 766 | } |
@@ -787,7 +786,12 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) | |||
787 | req->out.args[0].size = sizeof(struct fuse_init_out); | 786 | req->out.args[0].size = sizeof(struct fuse_init_out); |
788 | req->out.args[0].value = &req->misc.init_out; | 787 | req->out.args[0].value = &req->misc.init_out; |
789 | req->end = process_init_reply; | 788 | req->end = process_init_reply; |
790 | request_send_background(fc, req); | 789 | fuse_request_send_background(fc, req); |
790 | } | ||
791 | |||
792 | static void fuse_free_conn(struct fuse_conn *fc) | ||
793 | { | ||
794 | kfree(fc); | ||
791 | } | 795 | } |
792 | 796 | ||
793 | static int fuse_fill_super(struct super_block *sb, void *data, int silent) | 797 | static int fuse_fill_super(struct super_block *sb, void *data, int silent) |
@@ -828,10 +832,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) | |||
828 | if (file->f_op != &fuse_dev_operations) | 832 | if (file->f_op != &fuse_dev_operations) |
829 | return -EINVAL; | 833 | return -EINVAL; |
830 | 834 | ||
831 | fc = new_conn(sb); | 835 | fc = kmalloc(sizeof(*fc), GFP_KERNEL); |
832 | if (!fc) | 836 | if (!fc) |
833 | return -ENOMEM; | 837 | return -ENOMEM; |
834 | 838 | ||
839 | err = fuse_conn_init(fc, sb); | ||
840 | if (err) { | ||
841 | kfree(fc); | ||
842 | return err; | ||
843 | } | ||
844 | |||
845 | fc->release = fuse_free_conn; | ||
835 | fc->flags = d.flags; | 846 | fc->flags = d.flags; |
836 | fc->user_id = d.user_id; | 847 | fc->user_id = d.user_id; |
837 | fc->group_id = d.group_id; | 848 | fc->group_id = d.group_id; |
@@ -841,7 +852,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) | |||
841 | sb->s_fs_info = fc; | 852 | sb->s_fs_info = fc; |
842 | 853 | ||
843 | err = -ENOMEM; | 854 | err = -ENOMEM; |
844 | root = get_root_inode(sb, d.rootmode); | 855 | root = fuse_get_root_inode(sb, d.rootmode); |
845 | if (!root) | 856 | if (!root) |
846 | goto err; | 857 | goto err; |
847 | 858 | ||
@@ -952,7 +963,7 @@ static inline void unregister_fuseblk(void) | |||
952 | 963 | ||
953 | static void fuse_inode_init_once(void *foo) | 964 | static void fuse_inode_init_once(void *foo) |
954 | { | 965 | { |
955 | struct inode * inode = foo; | 966 | struct inode *inode = foo; |
956 | 967 | ||
957 | inode_init_once(inode); | 968 | inode_init_once(inode); |
958 | } | 969 | } |
@@ -1031,7 +1042,7 @@ static int __init fuse_init(void) | |||
1031 | { | 1042 | { |
1032 | int res; | 1043 | int res; |
1033 | 1044 | ||
1034 | printk("fuse init (API version %i.%i)\n", | 1045 | printk(KERN_INFO "fuse init (API version %i.%i)\n", |
1035 | FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); | 1046 | FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); |
1036 | 1047 | ||
1037 | INIT_LIST_HEAD(&fuse_conn_list); | 1048 | INIT_LIST_HEAD(&fuse_conn_list); |
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index ab2f57e3fb87..e563a6449811 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig | |||
@@ -1,6 +1,6 @@ | |||
1 | config GFS2_FS | 1 | config GFS2_FS |
2 | tristate "GFS2 file system support" | 2 | tristate "GFS2 file system support" |
3 | depends on EXPERIMENTAL && (64BIT || (LSF && LBD)) | 3 | depends on EXPERIMENTAL && (64BIT || LBD) |
4 | select FS_POSIX_ACL | 4 | select FS_POSIX_ACL |
5 | select CRC32 | 5 | select CRC32 |
6 | help | 6 | help |
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile index ec65851ec80a..c1b4ec6a9650 100644 --- a/fs/gfs2/Makefile +++ b/fs/gfs2/Makefile | |||
@@ -1,5 +1,5 @@ | |||
1 | obj-$(CONFIG_GFS2_FS) += gfs2.o | 1 | obj-$(CONFIG_GFS2_FS) += gfs2.o |
2 | gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \ | 2 | gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \ |
3 | glops.o inode.o log.o lops.o locking.o main.o meta_io.o \ | 3 | glops.o inode.o log.o lops.o locking.o main.o meta_io.o \ |
4 | mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \ | 4 | mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \ |
5 | ops_fstype.o ops_inode.o ops_super.o quota.o \ | 5 | ops_fstype.o ops_inode.o ops_super.o quota.o \ |
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 3e9bd46f27e3..e335dceb6a4f 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c | |||
@@ -91,7 +91,7 @@ static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl, | |||
91 | struct gfs2_ea_location el_this; | 91 | struct gfs2_ea_location el_this; |
92 | int error; | 92 | int error; |
93 | 93 | ||
94 | if (!ip->i_di.di_eattr) | 94 | if (!ip->i_eattr) |
95 | return 0; | 95 | return 0; |
96 | 96 | ||
97 | memset(&er, 0, sizeof(struct gfs2_ea_request)); | 97 | memset(&er, 0, sizeof(struct gfs2_ea_request)); |
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index bec76b1c2bb0..11ffc56f1f81 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c | |||
@@ -75,9 +75,9 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, | |||
75 | void *kaddr = kmap(page); | 75 | void *kaddr = kmap(page); |
76 | 76 | ||
77 | memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), | 77 | memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), |
78 | ip->i_di.di_size); | 78 | ip->i_disksize); |
79 | memset(kaddr + ip->i_di.di_size, 0, | 79 | memset(kaddr + ip->i_disksize, 0, |
80 | PAGE_CACHE_SIZE - ip->i_di.di_size); | 80 | PAGE_CACHE_SIZE - ip->i_disksize); |
81 | kunmap(page); | 81 | kunmap(page); |
82 | 82 | ||
83 | SetPageUptodate(page); | 83 | SetPageUptodate(page); |
@@ -132,7 +132,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) | |||
132 | if (error) | 132 | if (error) |
133 | goto out; | 133 | goto out; |
134 | 134 | ||
135 | if (ip->i_di.di_size) { | 135 | if (ip->i_disksize) { |
136 | /* Get a free block, fill it with the stuffed data, | 136 | /* Get a free block, fill it with the stuffed data, |
137 | and write it out to disk */ | 137 | and write it out to disk */ |
138 | 138 | ||
@@ -159,7 +159,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) | |||
159 | di = (struct gfs2_dinode *)dibh->b_data; | 159 | di = (struct gfs2_dinode *)dibh->b_data; |
160 | gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); | 160 | gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); |
161 | 161 | ||
162 | if (ip->i_di.di_size) { | 162 | if (ip->i_disksize) { |
163 | *(__be64 *)(di + 1) = cpu_to_be64(block); | 163 | *(__be64 *)(di + 1) = cpu_to_be64(block); |
164 | gfs2_add_inode_blocks(&ip->i_inode, 1); | 164 | gfs2_add_inode_blocks(&ip->i_inode, 1); |
165 | di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); | 165 | di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); |
@@ -926,7 +926,7 @@ static int do_grow(struct gfs2_inode *ip, u64 size) | |||
926 | } | 926 | } |
927 | } | 927 | } |
928 | 928 | ||
929 | ip->i_di.di_size = size; | 929 | ip->i_disksize = size; |
930 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; | 930 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; |
931 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 931 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); |
932 | gfs2_dinode_out(ip, dibh->b_data); | 932 | gfs2_dinode_out(ip, dibh->b_data); |
@@ -1033,7 +1033,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size) | |||
1033 | goto out; | 1033 | goto out; |
1034 | 1034 | ||
1035 | if (gfs2_is_stuffed(ip)) { | 1035 | if (gfs2_is_stuffed(ip)) { |
1036 | ip->i_di.di_size = size; | 1036 | ip->i_disksize = size; |
1037 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; | 1037 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; |
1038 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 1038 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); |
1039 | gfs2_dinode_out(ip, dibh->b_data); | 1039 | gfs2_dinode_out(ip, dibh->b_data); |
@@ -1045,9 +1045,9 @@ static int trunc_start(struct gfs2_inode *ip, u64 size) | |||
1045 | error = gfs2_block_truncate_page(ip->i_inode.i_mapping); | 1045 | error = gfs2_block_truncate_page(ip->i_inode.i_mapping); |
1046 | 1046 | ||
1047 | if (!error) { | 1047 | if (!error) { |
1048 | ip->i_di.di_size = size; | 1048 | ip->i_disksize = size; |
1049 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; | 1049 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; |
1050 | ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG; | 1050 | ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG; |
1051 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 1051 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); |
1052 | gfs2_dinode_out(ip, dibh->b_data); | 1052 | gfs2_dinode_out(ip, dibh->b_data); |
1053 | } | 1053 | } |
@@ -1114,13 +1114,13 @@ static int trunc_end(struct gfs2_inode *ip) | |||
1114 | if (error) | 1114 | if (error) |
1115 | goto out; | 1115 | goto out; |
1116 | 1116 | ||
1117 | if (!ip->i_di.di_size) { | 1117 | if (!ip->i_disksize) { |
1118 | ip->i_height = 0; | 1118 | ip->i_height = 0; |
1119 | ip->i_goal = ip->i_no_addr; | 1119 | ip->i_goal = ip->i_no_addr; |
1120 | gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); | 1120 | gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); |
1121 | } | 1121 | } |
1122 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; | 1122 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; |
1123 | ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG; | 1123 | ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG; |
1124 | 1124 | ||
1125 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 1125 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); |
1126 | gfs2_dinode_out(ip, dibh->b_data); | 1126 | gfs2_dinode_out(ip, dibh->b_data); |
@@ -1205,9 +1205,9 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size) | |||
1205 | if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode))) | 1205 | if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode))) |
1206 | return -EINVAL; | 1206 | return -EINVAL; |
1207 | 1207 | ||
1208 | if (size > ip->i_di.di_size) | 1208 | if (size > ip->i_disksize) |
1209 | error = do_grow(ip, size); | 1209 | error = do_grow(ip, size); |
1210 | else if (size < ip->i_di.di_size) | 1210 | else if (size < ip->i_disksize) |
1211 | error = do_shrink(ip, size); | 1211 | error = do_shrink(ip, size); |
1212 | else | 1212 | else |
1213 | /* update time stamps */ | 1213 | /* update time stamps */ |
@@ -1219,7 +1219,7 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size) | |||
1219 | int gfs2_truncatei_resume(struct gfs2_inode *ip) | 1219 | int gfs2_truncatei_resume(struct gfs2_inode *ip) |
1220 | { | 1220 | { |
1221 | int error; | 1221 | int error; |
1222 | error = trunc_dealloc(ip, ip->i_di.di_size); | 1222 | error = trunc_dealloc(ip, ip->i_disksize); |
1223 | if (!error) | 1223 | if (!error) |
1224 | error = trunc_end(ip); | 1224 | error = trunc_end(ip); |
1225 | return error; | 1225 | return error; |
@@ -1231,35 +1231,6 @@ int gfs2_file_dealloc(struct gfs2_inode *ip) | |||
1231 | } | 1231 | } |
1232 | 1232 | ||
1233 | /** | 1233 | /** |
1234 | * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file | ||
1235 | * @ip: the file | ||
1236 | * @len: the number of bytes to be written to the file | ||
1237 | * @data_blocks: returns the number of data blocks required | ||
1238 | * @ind_blocks: returns the number of indirect blocks required | ||
1239 | * | ||
1240 | */ | ||
1241 | |||
1242 | void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len, | ||
1243 | unsigned int *data_blocks, unsigned int *ind_blocks) | ||
1244 | { | ||
1245 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | ||
1246 | unsigned int tmp; | ||
1247 | |||
1248 | if (gfs2_is_dir(ip)) { | ||
1249 | *data_blocks = DIV_ROUND_UP(len, sdp->sd_jbsize) + 2; | ||
1250 | *ind_blocks = 3 * (sdp->sd_max_jheight - 1); | ||
1251 | } else { | ||
1252 | *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3; | ||
1253 | *ind_blocks = 3 * (sdp->sd_max_height - 1); | ||
1254 | } | ||
1255 | |||
1256 | for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) { | ||
1257 | tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); | ||
1258 | *ind_blocks += tmp; | ||
1259 | } | ||
1260 | } | ||
1261 | |||
1262 | /** | ||
1263 | * gfs2_write_alloc_required - figure out if a write will require an allocation | 1234 | * gfs2_write_alloc_required - figure out if a write will require an allocation |
1264 | * @ip: the file being written to | 1235 | * @ip: the file being written to |
1265 | * @offset: the offset to write to | 1236 | * @offset: the offset to write to |
@@ -1276,6 +1247,7 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, | |||
1276 | struct buffer_head bh; | 1247 | struct buffer_head bh; |
1277 | unsigned int shift; | 1248 | unsigned int shift; |
1278 | u64 lblock, lblock_stop, size; | 1249 | u64 lblock, lblock_stop, size; |
1250 | u64 end_of_file; | ||
1279 | 1251 | ||
1280 | *alloc_required = 0; | 1252 | *alloc_required = 0; |
1281 | 1253 | ||
@@ -1291,19 +1263,12 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, | |||
1291 | 1263 | ||
1292 | *alloc_required = 1; | 1264 | *alloc_required = 1; |
1293 | shift = sdp->sd_sb.sb_bsize_shift; | 1265 | shift = sdp->sd_sb.sb_bsize_shift; |
1294 | if (gfs2_is_dir(ip)) { | 1266 | BUG_ON(gfs2_is_dir(ip)); |
1295 | unsigned int bsize = sdp->sd_jbsize; | 1267 | end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift; |
1296 | lblock = offset; | 1268 | lblock = offset >> shift; |
1297 | do_div(lblock, bsize); | 1269 | lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; |
1298 | lblock_stop = offset + len + bsize - 1; | 1270 | if (lblock_stop > end_of_file) |
1299 | do_div(lblock_stop, bsize); | 1271 | return 0; |
1300 | } else { | ||
1301 | u64 end_of_file = (ip->i_di.di_size + sdp->sd_sb.sb_bsize - 1) >> shift; | ||
1302 | lblock = offset >> shift; | ||
1303 | lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; | ||
1304 | if (lblock_stop > end_of_file) | ||
1305 | return 0; | ||
1306 | } | ||
1307 | 1272 | ||
1308 | size = (lblock_stop - lblock) << shift; | 1273 | size = (lblock_stop - lblock) << shift; |
1309 | do { | 1274 | do { |
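A quick worked example of the simplified bounds check above, assuming a 4 KiB block size (shift = 12): a 3000-byte write at offset 5000 gives lblock = 5000 >> 12 = 1 and lblock_stop = (5000 + 3000 + 4095) >> 12 = 2; with i_disksize = 16384, end_of_file = 4, so lblock_stop <= end_of_file and the extent map is consulted block by block. If the write instead ended beyond end_of_file, the function would return 0 immediately, with *alloc_required already set to 1.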
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h index 4e6cde2943bd..c983177e05ac 100644 --- a/fs/gfs2/bmap.h +++ b/fs/gfs2/bmap.h | |||
@@ -10,10 +10,40 @@ | |||
10 | #ifndef __BMAP_DOT_H__ | 10 | #ifndef __BMAP_DOT_H__ |
11 | #define __BMAP_DOT_H__ | 11 | #define __BMAP_DOT_H__ |
12 | 12 | ||
13 | #include "inode.h" | ||
14 | |||
13 | struct inode; | 15 | struct inode; |
14 | struct gfs2_inode; | 16 | struct gfs2_inode; |
15 | struct page; | 17 | struct page; |
16 | 18 | ||
19 | |||
20 | /** | ||
21 | * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file | ||
22 | * @ip: the file | ||
23 | * @len: the number of bytes to be written to the file | ||
24 | * @data_blocks: returns the number of data blocks required | ||
25 | * @ind_blocks: returns the number of indirect blocks required | ||
26 | * | ||
27 | */ | ||
28 | |||
29 | static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip, | ||
30 | unsigned int len, | ||
31 | unsigned int *data_blocks, | ||
32 | unsigned int *ind_blocks) | ||
33 | { | ||
34 | const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | ||
35 | unsigned int tmp; | ||
36 | |||
37 | BUG_ON(gfs2_is_dir(ip)); | ||
38 | *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3; | ||
39 | *ind_blocks = 3 * (sdp->sd_max_height - 1); | ||
40 | |||
41 | for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) { | ||
42 | tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); | ||
43 | *ind_blocks += tmp; | ||
44 | } | ||
45 | } | ||
46 | |||
17 | int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); | 47 | int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); |
18 | int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create); | 48 | int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create); |
19 | int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen); | 49 | int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen); |
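Since the helper is now a header inline restricted to non-directories, a typical caller looks roughly like the following fragment (illustrative only; the surrounding rgrp/quota reservation and transaction calls are elided):

        unsigned int data_blocks, ind_blocks;

        gfs2_write_calc_reserv(ip, write_len, &data_blocks, &ind_blocks);
        /* reserve data_blocks + ind_blocks (plus reservation overhead)
           before starting the transaction that performs the write */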
@@ -21,10 +51,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi | |||
21 | int gfs2_truncatei(struct gfs2_inode *ip, u64 size); | 51 | int gfs2_truncatei(struct gfs2_inode *ip, u64 size); |
22 | int gfs2_truncatei_resume(struct gfs2_inode *ip); | 52 | int gfs2_truncatei_resume(struct gfs2_inode *ip); |
23 | int gfs2_file_dealloc(struct gfs2_inode *ip); | 53 | int gfs2_file_dealloc(struct gfs2_inode *ip); |
24 | |||
25 | void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len, | ||
26 | unsigned int *data_blocks, | ||
27 | unsigned int *ind_blocks); | ||
28 | int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, | 54 | int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, |
29 | unsigned int len, int *alloc_required); | 55 | unsigned int len, int *alloc_required); |
30 | 56 | ||
diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c deleted file mode 100644 index e51991947d2c..000000000000 --- a/fs/gfs2/daemon.c +++ /dev/null | |||
@@ -1,136 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | ||
3 | * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. | ||
4 | * | ||
5 | * This copyrighted material is made available to anyone wishing to use, | ||
6 | * modify, copy, or redistribute it subject to the terms and conditions | ||
7 | * of the GNU General Public License version 2. | ||
8 | */ | ||
9 | |||
10 | #include <linux/sched.h> | ||
11 | #include <linux/slab.h> | ||
12 | #include <linux/spinlock.h> | ||
13 | #include <linux/completion.h> | ||
14 | #include <linux/buffer_head.h> | ||
15 | #include <linux/kthread.h> | ||
16 | #include <linux/delay.h> | ||
17 | #include <linux/gfs2_ondisk.h> | ||
18 | #include <linux/lm_interface.h> | ||
19 | #include <linux/freezer.h> | ||
20 | |||
21 | #include "gfs2.h" | ||
22 | #include "incore.h" | ||
23 | #include "daemon.h" | ||
24 | #include "glock.h" | ||
25 | #include "log.h" | ||
26 | #include "quota.h" | ||
27 | #include "recovery.h" | ||
28 | #include "super.h" | ||
29 | #include "util.h" | ||
30 | |||
31 | /* This uses schedule_timeout() instead of msleep() because it's good for | ||
32 | the daemons to wake up more often than the timeout when unmounting so | ||
33 | the user's unmount doesn't sit there forever. | ||
34 | |||
35 | The kthread functions used to start these daemons block and flush signals. */ | ||
36 | |||
37 | /** | ||
38 | * gfs2_glockd - Reclaim unused glock structures | ||
39 | * @sdp: Pointer to GFS2 superblock | ||
40 | * | ||
41 | * One or more of these daemons run, reclaiming glocks on sd_reclaim_list. | ||
42 | * Number of daemons can be set by user, with num_glockd mount option. | ||
43 | */ | ||
44 | |||
45 | int gfs2_glockd(void *data) | ||
46 | { | ||
47 | struct gfs2_sbd *sdp = data; | ||
48 | |||
49 | while (!kthread_should_stop()) { | ||
50 | while (atomic_read(&sdp->sd_reclaim_count)) | ||
51 | gfs2_reclaim_glock(sdp); | ||
52 | |||
53 | wait_event_interruptible(sdp->sd_reclaim_wq, | ||
54 | (atomic_read(&sdp->sd_reclaim_count) || | ||
55 | kthread_should_stop())); | ||
56 | if (freezing(current)) | ||
57 | refrigerator(); | ||
58 | } | ||
59 | |||
60 | return 0; | ||
61 | } | ||
62 | |||
63 | /** | ||
64 | * gfs2_recoverd - Recover dead machine's journals | ||
65 | * @sdp: Pointer to GFS2 superblock | ||
66 | * | ||
67 | */ | ||
68 | |||
69 | int gfs2_recoverd(void *data) | ||
70 | { | ||
71 | struct gfs2_sbd *sdp = data; | ||
72 | unsigned long t; | ||
73 | |||
74 | while (!kthread_should_stop()) { | ||
75 | gfs2_check_journals(sdp); | ||
76 | t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ; | ||
77 | if (freezing(current)) | ||
78 | refrigerator(); | ||
79 | schedule_timeout_interruptible(t); | ||
80 | } | ||
81 | |||
82 | return 0; | ||
83 | } | ||
84 | |||
85 | /** | ||
86 | * gfs2_quotad - Write cached quota changes into the quota file | ||
87 | * @sdp: Pointer to GFS2 superblock | ||
88 | * | ||
89 | */ | ||
90 | |||
91 | int gfs2_quotad(void *data) | ||
92 | { | ||
93 | struct gfs2_sbd *sdp = data; | ||
94 | unsigned long t; | ||
95 | int error; | ||
96 | |||
97 | while (!kthread_should_stop()) { | ||
98 | /* Update the master statfs file */ | ||
99 | |||
100 | t = sdp->sd_statfs_sync_time + | ||
101 | gfs2_tune_get(sdp, gt_statfs_quantum) * HZ; | ||
102 | |||
103 | if (time_after_eq(jiffies, t)) { | ||
104 | error = gfs2_statfs_sync(sdp); | ||
105 | if (error && | ||
106 | error != -EROFS && | ||
107 | !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) | ||
108 | fs_err(sdp, "quotad: (1) error=%d\n", error); | ||
109 | sdp->sd_statfs_sync_time = jiffies; | ||
110 | } | ||
111 | |||
112 | /* Update quota file */ | ||
113 | |||
114 | t = sdp->sd_quota_sync_time + | ||
115 | gfs2_tune_get(sdp, gt_quota_quantum) * HZ; | ||
116 | |||
117 | if (time_after_eq(jiffies, t)) { | ||
118 | error = gfs2_quota_sync(sdp); | ||
119 | if (error && | ||
120 | error != -EROFS && | ||
121 | !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) | ||
122 | fs_err(sdp, "quotad: (2) error=%d\n", error); | ||
123 | sdp->sd_quota_sync_time = jiffies; | ||
124 | } | ||
125 | |||
126 | gfs2_quota_scan(sdp); | ||
127 | |||
128 | t = gfs2_tune_get(sdp, gt_quotad_secs) * HZ; | ||
129 | if (freezing(current)) | ||
130 | refrigerator(); | ||
131 | schedule_timeout_interruptible(t); | ||
132 | } | ||
133 | |||
134 | return 0; | ||
135 | } | ||
136 | |||
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h deleted file mode 100644 index 4be084fb6a62..000000000000 --- a/fs/gfs2/daemon.h +++ /dev/null | |||
@@ -1,17 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | ||
3 | * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. | ||
4 | * | ||
5 | * This copyrighted material is made available to anyone wishing to use, | ||
6 | * modify, copy, or redistribute it subject to the terms and conditions | ||
7 | * of the GNU General Public License version 2. | ||
8 | */ | ||
9 | |||
10 | #ifndef __DAEMON_DOT_H__ | ||
11 | #define __DAEMON_DOT_H__ | ||
12 | |||
13 | int gfs2_glockd(void *data); | ||
14 | int gfs2_recoverd(void *data); | ||
15 | int gfs2_quotad(void *data); | ||
16 | |||
17 | #endif /* __DAEMON_DOT_H__ */ | ||
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index eed040d8ba3a..b7c8e5c70791 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c | |||
@@ -36,7 +36,7 @@ | |||
36 | * the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the | 36 | * the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the |
37 | * beginning of the leaf block. The dirents reside in leaves when | 37 | * beginning of the leaf block. The dirents reside in leaves when |
38 | * | 38 | * |
39 | * dip->i_di.di_flags & GFS2_DIF_EXHASH is true | 39 | * dip->i_diskflags & GFS2_DIF_EXHASH is true |
40 | * | 40 | * |
41 | * Otherwise, the dirents are "linear", within a single stuffed dinode block. | 41 | * Otherwise, the dirents are "linear", within a single stuffed dinode block. |
42 | * | 42 | * |
@@ -128,8 +128,8 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf, | |||
128 | 128 | ||
129 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 129 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); |
130 | memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); | 130 | memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); |
131 | if (ip->i_di.di_size < offset + size) | 131 | if (ip->i_disksize < offset + size) |
132 | ip->i_di.di_size = offset + size; | 132 | ip->i_disksize = offset + size; |
133 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; | 133 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; |
134 | gfs2_dinode_out(ip, dibh->b_data); | 134 | gfs2_dinode_out(ip, dibh->b_data); |
135 | 135 | ||
@@ -226,8 +226,8 @@ out: | |||
226 | if (error) | 226 | if (error) |
227 | return error; | 227 | return error; |
228 | 228 | ||
229 | if (ip->i_di.di_size < offset + copied) | 229 | if (ip->i_disksize < offset + copied) |
230 | ip->i_di.di_size = offset + copied; | 230 | ip->i_disksize = offset + copied; |
231 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; | 231 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; |
232 | 232 | ||
233 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 233 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); |
@@ -277,11 +277,11 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, | |||
277 | int copied = 0; | 277 | int copied = 0; |
278 | int error = 0; | 278 | int error = 0; |
279 | 279 | ||
280 | if (offset >= ip->i_di.di_size) | 280 | if (offset >= ip->i_disksize) |
281 | return 0; | 281 | return 0; |
282 | 282 | ||
283 | if (offset + size > ip->i_di.di_size) | 283 | if (offset + size > ip->i_disksize) |
284 | size = ip->i_di.di_size - offset; | 284 | size = ip->i_disksize - offset; |
285 | 285 | ||
286 | if (!size) | 286 | if (!size) |
287 | return 0; | 287 | return 0; |
@@ -755,12 +755,12 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode, | |||
755 | struct gfs2_inode *ip = GFS2_I(inode); | 755 | struct gfs2_inode *ip = GFS2_I(inode); |
756 | int error; | 756 | int error; |
757 | 757 | ||
758 | if (ip->i_di.di_flags & GFS2_DIF_EXHASH) { | 758 | if (ip->i_diskflags & GFS2_DIF_EXHASH) { |
759 | struct gfs2_leaf *leaf; | 759 | struct gfs2_leaf *leaf; |
760 | unsigned hsize = 1 << ip->i_depth; | 760 | unsigned hsize = 1 << ip->i_depth; |
761 | unsigned index; | 761 | unsigned index; |
762 | u64 ln; | 762 | u64 ln; |
763 | if (hsize * sizeof(u64) != ip->i_di.di_size) { | 763 | if (hsize * sizeof(u64) != ip->i_disksize) { |
764 | gfs2_consist_inode(ip); | 764 | gfs2_consist_inode(ip); |
765 | return ERR_PTR(-EIO); | 765 | return ERR_PTR(-EIO); |
766 | } | 766 | } |
@@ -858,8 +858,8 @@ static int dir_make_exhash(struct inode *inode) | |||
858 | return -ENOSPC; | 858 | return -ENOSPC; |
859 | bn = bh->b_blocknr; | 859 | bn = bh->b_blocknr; |
860 | 860 | ||
861 | gfs2_assert(sdp, dip->i_di.di_entries < (1 << 16)); | 861 | gfs2_assert(sdp, dip->i_entries < (1 << 16)); |
862 | leaf->lf_entries = cpu_to_be16(dip->i_di.di_entries); | 862 | leaf->lf_entries = cpu_to_be16(dip->i_entries); |
863 | 863 | ||
864 | /* Copy dirents */ | 864 | /* Copy dirents */ |
865 | 865 | ||
@@ -905,9 +905,9 @@ static int dir_make_exhash(struct inode *inode) | |||
905 | for (x = sdp->sd_hash_ptrs; x--; lp++) | 905 | for (x = sdp->sd_hash_ptrs; x--; lp++) |
906 | *lp = cpu_to_be64(bn); | 906 | *lp = cpu_to_be64(bn); |
907 | 907 | ||
908 | dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2; | 908 | dip->i_disksize = sdp->sd_sb.sb_bsize / 2; |
909 | gfs2_add_inode_blocks(&dip->i_inode, 1); | 909 | gfs2_add_inode_blocks(&dip->i_inode, 1); |
910 | dip->i_di.di_flags |= GFS2_DIF_EXHASH; | 910 | dip->i_diskflags |= GFS2_DIF_EXHASH; |
911 | 911 | ||
912 | for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ; | 912 | for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ; |
913 | dip->i_depth = y; | 913 | dip->i_depth = y; |
@@ -1082,7 +1082,7 @@ static int dir_double_exhash(struct gfs2_inode *dip) | |||
1082 | int error = 0; | 1082 | int error = 0; |
1083 | 1083 | ||
1084 | hsize = 1 << dip->i_depth; | 1084 | hsize = 1 << dip->i_depth; |
1085 | if (hsize * sizeof(u64) != dip->i_di.di_size) { | 1085 | if (hsize * sizeof(u64) != dip->i_disksize) { |
1086 | gfs2_consist_inode(dip); | 1086 | gfs2_consist_inode(dip); |
1087 | return -EIO; | 1087 | return -EIO; |
1088 | } | 1088 | } |
@@ -1091,7 +1091,7 @@ static int dir_double_exhash(struct gfs2_inode *dip) | |||
1091 | 1091 | ||
1092 | buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL); | 1092 | buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL); |
1093 | 1093 | ||
1094 | for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) { | 1094 | for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) { |
1095 | error = gfs2_dir_read_data(dip, (char *)buf, | 1095 | error = gfs2_dir_read_data(dip, (char *)buf, |
1096 | block * sdp->sd_hash_bsize, | 1096 | block * sdp->sd_hash_bsize, |
1097 | sdp->sd_hash_bsize, 1); | 1097 | sdp->sd_hash_bsize, 1); |
@@ -1370,7 +1370,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, | |||
1370 | unsigned depth = 0; | 1370 | unsigned depth = 0; |
1371 | 1371 | ||
1372 | hsize = 1 << dip->i_depth; | 1372 | hsize = 1 << dip->i_depth; |
1373 | if (hsize * sizeof(u64) != dip->i_di.di_size) { | 1373 | if (hsize * sizeof(u64) != dip->i_disksize) { |
1374 | gfs2_consist_inode(dip); | 1374 | gfs2_consist_inode(dip); |
1375 | return -EIO; | 1375 | return -EIO; |
1376 | } | 1376 | } |
@@ -1426,10 +1426,10 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, | |||
1426 | int copied = 0; | 1426 | int copied = 0; |
1427 | int error; | 1427 | int error; |
1428 | 1428 | ||
1429 | if (!dip->i_di.di_entries) | 1429 | if (!dip->i_entries) |
1430 | return 0; | 1430 | return 0; |
1431 | 1431 | ||
1432 | if (dip->i_di.di_flags & GFS2_DIF_EXHASH) | 1432 | if (dip->i_diskflags & GFS2_DIF_EXHASH) |
1433 | return dir_e_read(inode, offset, opaque, filldir); | 1433 | return dir_e_read(inode, offset, opaque, filldir); |
1434 | 1434 | ||
1435 | if (!gfs2_is_stuffed(dip)) { | 1435 | if (!gfs2_is_stuffed(dip)) { |
@@ -1453,17 +1453,17 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, | |||
1453 | error = PTR_ERR(dent); | 1453 | error = PTR_ERR(dent); |
1454 | goto out; | 1454 | goto out; |
1455 | } | 1455 | } |
1456 | if (dip->i_di.di_entries != g.offset) { | 1456 | if (dip->i_entries != g.offset) { |
1457 | fs_warn(sdp, "Number of entries corrupt in dir %llu, " | 1457 | fs_warn(sdp, "Number of entries corrupt in dir %llu, " |
1458 | "ip->i_di.di_entries (%u) != g.offset (%u)\n", | 1458 | "ip->i_entries (%u) != g.offset (%u)\n", |
1459 | (unsigned long long)dip->i_no_addr, | 1459 | (unsigned long long)dip->i_no_addr, |
1460 | dip->i_di.di_entries, | 1460 | dip->i_entries, |
1461 | g.offset); | 1461 | g.offset); |
1462 | error = -EIO; | 1462 | error = -EIO; |
1463 | goto out; | 1463 | goto out; |
1464 | } | 1464 | } |
1465 | error = do_filldir_main(dip, offset, opaque, filldir, darr, | 1465 | error = do_filldir_main(dip, offset, opaque, filldir, darr, |
1466 | dip->i_di.di_entries, &copied); | 1466 | dip->i_entries, &copied); |
1467 | out: | 1467 | out: |
1468 | kfree(darr); | 1468 | kfree(darr); |
1469 | } | 1469 | } |
@@ -1612,7 +1612,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, | |||
1612 | dent = gfs2_init_dirent(inode, dent, name, bh); | 1612 | dent = gfs2_init_dirent(inode, dent, name, bh); |
1613 | gfs2_inum_out(nip, dent); | 1613 | gfs2_inum_out(nip, dent); |
1614 | dent->de_type = cpu_to_be16(type); | 1614 | dent->de_type = cpu_to_be16(type); |
1615 | if (ip->i_di.di_flags & GFS2_DIF_EXHASH) { | 1615 | if (ip->i_diskflags & GFS2_DIF_EXHASH) { |
1616 | leaf = (struct gfs2_leaf *)bh->b_data; | 1616 | leaf = (struct gfs2_leaf *)bh->b_data; |
1617 | be16_add_cpu(&leaf->lf_entries, 1); | 1617 | be16_add_cpu(&leaf->lf_entries, 1); |
1618 | } | 1618 | } |
@@ -1621,14 +1621,14 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, | |||
1621 | if (error) | 1621 | if (error) |
1622 | break; | 1622 | break; |
1623 | gfs2_trans_add_bh(ip->i_gl, bh, 1); | 1623 | gfs2_trans_add_bh(ip->i_gl, bh, 1); |
1624 | ip->i_di.di_entries++; | 1624 | ip->i_entries++; |
1625 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; | 1625 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; |
1626 | gfs2_dinode_out(ip, bh->b_data); | 1626 | gfs2_dinode_out(ip, bh->b_data); |
1627 | brelse(bh); | 1627 | brelse(bh); |
1628 | error = 0; | 1628 | error = 0; |
1629 | break; | 1629 | break; |
1630 | } | 1630 | } |
1631 | if (!(ip->i_di.di_flags & GFS2_DIF_EXHASH)) { | 1631 | if (!(ip->i_diskflags & GFS2_DIF_EXHASH)) { |
1632 | error = dir_make_exhash(inode); | 1632 | error = dir_make_exhash(inode); |
1633 | if (error) | 1633 | if (error) |
1634 | break; | 1634 | break; |
@@ -1691,7 +1691,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name) | |||
1691 | } | 1691 | } |
1692 | 1692 | ||
1693 | dirent_del(dip, bh, prev, dent); | 1693 | dirent_del(dip, bh, prev, dent); |
1694 | if (dip->i_di.di_flags & GFS2_DIF_EXHASH) { | 1694 | if (dip->i_diskflags & GFS2_DIF_EXHASH) { |
1695 | struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data; | 1695 | struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data; |
1696 | u16 entries = be16_to_cpu(leaf->lf_entries); | 1696 | u16 entries = be16_to_cpu(leaf->lf_entries); |
1697 | if (!entries) | 1697 | if (!entries) |
@@ -1704,10 +1704,10 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name) | |||
1704 | if (error) | 1704 | if (error) |
1705 | return error; | 1705 | return error; |
1706 | 1706 | ||
1707 | if (!dip->i_di.di_entries) | 1707 | if (!dip->i_entries) |
1708 | gfs2_consist_inode(dip); | 1708 | gfs2_consist_inode(dip); |
1709 | gfs2_trans_add_bh(dip->i_gl, bh, 1); | 1709 | gfs2_trans_add_bh(dip->i_gl, bh, 1); |
1710 | dip->i_di.di_entries--; | 1710 | dip->i_entries--; |
1711 | dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; | 1711 | dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; |
1712 | gfs2_dinode_out(dip, bh->b_data); | 1712 | gfs2_dinode_out(dip, bh->b_data); |
1713 | brelse(bh); | 1713 | brelse(bh); |
@@ -1748,7 +1748,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, | |||
1748 | gfs2_inum_out(nip, dent); | 1748 | gfs2_inum_out(nip, dent); |
1749 | dent->de_type = cpu_to_be16(new_type); | 1749 | dent->de_type = cpu_to_be16(new_type); |
1750 | 1750 | ||
1751 | if (dip->i_di.di_flags & GFS2_DIF_EXHASH) { | 1751 | if (dip->i_diskflags & GFS2_DIF_EXHASH) { |
1752 | brelse(bh); | 1752 | brelse(bh); |
1753 | error = gfs2_meta_inode_buffer(dip, &bh); | 1753 | error = gfs2_meta_inode_buffer(dip, &bh); |
1754 | if (error) | 1754 | if (error) |
@@ -1784,7 +1784,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data) | |||
1784 | int error = 0; | 1784 | int error = 0; |
1785 | 1785 | ||
1786 | hsize = 1 << dip->i_depth; | 1786 | hsize = 1 << dip->i_depth; |
1787 | if (hsize * sizeof(u64) != dip->i_di.di_size) { | 1787 | if (hsize * sizeof(u64) != dip->i_disksize) { |
1788 | gfs2_consist_inode(dip); | 1788 | gfs2_consist_inode(dip); |
1789 | return -EIO; | 1789 | return -EIO; |
1790 | } | 1790 | } |
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h index 8a468cac9328..4f919440c3be 100644 --- a/fs/gfs2/dir.h +++ b/fs/gfs2/dir.h | |||
@@ -11,6 +11,7 @@ | |||
11 | #define __DIR_DOT_H__ | 11 | #define __DIR_DOT_H__ |
12 | 12 | ||
13 | #include <linux/dcache.h> | 13 | #include <linux/dcache.h> |
14 | #include <linux/crc32.h> | ||
14 | 15 | ||
15 | struct inode; | 16 | struct inode; |
16 | struct gfs2_inode; | 17 | struct gfs2_inode; |
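
The only change to dir.h shown here is the new <linux/crc32.h> include, which suggests a crc32_le()-based filename-hash helper now lives in this header. The helper below is an assumption modelled on GFS2's on-disk directory hash convention (inverted initial value, inverted result); the name demo_disk_hash is made up for illustration and is not taken from the patch:

    #include <linux/crc32.h>
    #include <linux/types.h>

    /* Hedged sketch of a crc32_le()-based name hash as dir.h might carry it. */
    static inline u32 demo_disk_hash(const char *data, int len)
    {
        return crc32_le((u32)~0, (const unsigned char *)data, len) ^ (u32)~0;
    }
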
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c index e3f76f451b0a..0d1c76d906ae 100644 --- a/fs/gfs2/eattr.c +++ b/fs/gfs2/eattr.c | |||
@@ -114,11 +114,11 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data) | |||
114 | __be64 *eablk, *end; | 114 | __be64 *eablk, *end; |
115 | int error; | 115 | int error; |
116 | 116 | ||
117 | error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &bh); | 117 | error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &bh); |
118 | if (error) | 118 | if (error) |
119 | return error; | 119 | return error; |
120 | 120 | ||
121 | if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) { | 121 | if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) { |
122 | error = ea_foreach_i(ip, bh, ea_call, data); | 122 | error = ea_foreach_i(ip, bh, ea_call, data); |
123 | goto out; | 123 | goto out; |
124 | } | 124 | } |
@@ -414,7 +414,7 @@ int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er) | |||
414 | if (error) | 414 | if (error) |
415 | return error; | 415 | return error; |
416 | 416 | ||
417 | if (ip->i_di.di_eattr) { | 417 | if (ip->i_eattr) { |
418 | struct ea_list ei = { .ei_er = er, .ei_size = 0 }; | 418 | struct ea_list ei = { .ei_er = er, .ei_size = 0 }; |
419 | 419 | ||
420 | error = ea_foreach(ip, ea_list_i, &ei); | 420 | error = ea_foreach(ip, ea_list_i, &ei); |
@@ -514,7 +514,7 @@ int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) | |||
514 | struct gfs2_ea_location el; | 514 | struct gfs2_ea_location el; |
515 | int error; | 515 | int error; |
516 | 516 | ||
517 | if (!ip->i_di.di_eattr) | 517 | if (!ip->i_eattr) |
518 | return -ENODATA; | 518 | return -ENODATA; |
519 | 519 | ||
520 | error = gfs2_ea_find(ip, er, &el); | 520 | error = gfs2_ea_find(ip, er, &el); |
@@ -741,7 +741,7 @@ static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, | |||
741 | if (error) | 741 | if (error) |
742 | return error; | 742 | return error; |
743 | 743 | ||
744 | ip->i_di.di_eattr = bh->b_blocknr; | 744 | ip->i_eattr = bh->b_blocknr; |
745 | error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er); | 745 | error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er); |
746 | 746 | ||
747 | brelse(bh); | 747 | brelse(bh); |
@@ -935,10 +935,10 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, | |||
935 | int error; | 935 | int error; |
936 | int mh_size = sizeof(struct gfs2_meta_header); | 936 | int mh_size = sizeof(struct gfs2_meta_header); |
937 | 937 | ||
938 | if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) { | 938 | if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { |
939 | __be64 *end; | 939 | __be64 *end; |
940 | 940 | ||
941 | error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, | 941 | error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, |
942 | &indbh); | 942 | &indbh); |
943 | if (error) | 943 | if (error) |
944 | return error; | 944 | return error; |
@@ -972,9 +972,9 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, | |||
972 | gfs2_buffer_clear_tail(indbh, mh_size); | 972 | gfs2_buffer_clear_tail(indbh, mh_size); |
973 | 973 | ||
974 | eablk = (__be64 *)(indbh->b_data + mh_size); | 974 | eablk = (__be64 *)(indbh->b_data + mh_size); |
975 | *eablk = cpu_to_be64(ip->i_di.di_eattr); | 975 | *eablk = cpu_to_be64(ip->i_eattr); |
976 | ip->i_di.di_eattr = blk; | 976 | ip->i_eattr = blk; |
977 | ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT; | 977 | ip->i_diskflags |= GFS2_DIF_EA_INDIRECT; |
978 | gfs2_add_inode_blocks(&ip->i_inode, 1); | 978 | gfs2_add_inode_blocks(&ip->i_inode, 1); |
979 | 979 | ||
980 | eablk++; | 980 | eablk++; |
@@ -1015,7 +1015,7 @@ static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, | |||
1015 | if (error) | 1015 | if (error) |
1016 | return error; | 1016 | return error; |
1017 | 1017 | ||
1018 | if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) | 1018 | if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) |
1019 | blks++; | 1019 | blks++; |
1020 | if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize) | 1020 | if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize) |
1021 | blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize); | 1021 | blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize); |
@@ -1040,7 +1040,7 @@ int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) | |||
1040 | struct gfs2_ea_location el; | 1040 | struct gfs2_ea_location el; |
1041 | int error; | 1041 | int error; |
1042 | 1042 | ||
1043 | if (!ip->i_di.di_eattr) { | 1043 | if (!ip->i_eattr) { |
1044 | if (er->er_flags & XATTR_REPLACE) | 1044 | if (er->er_flags & XATTR_REPLACE) |
1045 | return -ENODATA; | 1045 | return -ENODATA; |
1046 | return ea_init(ip, er); | 1046 | return ea_init(ip, er); |
@@ -1051,7 +1051,7 @@ int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) | |||
1051 | return error; | 1051 | return error; |
1052 | 1052 | ||
1053 | if (el.el_ea) { | 1053 | if (el.el_ea) { |
1054 | if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY) { | 1054 | if (ip->i_diskflags & GFS2_DIF_APPENDONLY) { |
1055 | brelse(el.el_bh); | 1055 | brelse(el.el_bh); |
1056 | return -EPERM; | 1056 | return -EPERM; |
1057 | } | 1057 | } |
@@ -1145,7 +1145,7 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) | |||
1145 | struct gfs2_ea_location el; | 1145 | struct gfs2_ea_location el; |
1146 | int error; | 1146 | int error; |
1147 | 1147 | ||
1148 | if (!ip->i_di.di_eattr) | 1148 | if (!ip->i_eattr) |
1149 | return -ENODATA; | 1149 | return -ENODATA; |
1150 | 1150 | ||
1151 | error = gfs2_ea_find(ip, er, &el); | 1151 | error = gfs2_ea_find(ip, er, &el); |
@@ -1309,7 +1309,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) | |||
1309 | 1309 | ||
1310 | memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); | 1310 | memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); |
1311 | 1311 | ||
1312 | error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &indbh); | 1312 | error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh); |
1313 | if (error) | 1313 | if (error) |
1314 | return error; | 1314 | return error; |
1315 | 1315 | ||
@@ -1388,7 +1388,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) | |||
1388 | if (bstart) | 1388 | if (bstart) |
1389 | gfs2_free_meta(ip, bstart, blen); | 1389 | gfs2_free_meta(ip, bstart, blen); |
1390 | 1390 | ||
1391 | ip->i_di.di_flags &= ~GFS2_DIF_EA_INDIRECT; | 1391 | ip->i_diskflags &= ~GFS2_DIF_EA_INDIRECT; |
1392 | 1392 | ||
1393 | error = gfs2_meta_inode_buffer(ip, &dibh); | 1393 | error = gfs2_meta_inode_buffer(ip, &dibh); |
1394 | if (!error) { | 1394 | if (!error) { |
@@ -1416,7 +1416,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip) | |||
1416 | struct buffer_head *dibh; | 1416 | struct buffer_head *dibh; |
1417 | int error; | 1417 | int error; |
1418 | 1418 | ||
1419 | rgd = gfs2_blk2rgrpd(sdp, ip->i_di.di_eattr); | 1419 | rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr); |
1420 | if (!rgd) { | 1420 | if (!rgd) { |
1421 | gfs2_consist_inode(ip); | 1421 | gfs2_consist_inode(ip); |
1422 | return -EIO; | 1422 | return -EIO; |
@@ -1432,9 +1432,9 @@ static int ea_dealloc_block(struct gfs2_inode *ip) | |||
1432 | if (error) | 1432 | if (error) |
1433 | goto out_gunlock; | 1433 | goto out_gunlock; |
1434 | 1434 | ||
1435 | gfs2_free_meta(ip, ip->i_di.di_eattr, 1); | 1435 | gfs2_free_meta(ip, ip->i_eattr, 1); |
1436 | 1436 | ||
1437 | ip->i_di.di_eattr = 0; | 1437 | ip->i_eattr = 0; |
1438 | gfs2_add_inode_blocks(&ip->i_inode, -1); | 1438 | gfs2_add_inode_blocks(&ip->i_inode, -1); |
1439 | 1439 | ||
1440 | error = gfs2_meta_inode_buffer(ip, &dibh); | 1440 | error = gfs2_meta_inode_buffer(ip, &dibh); |
@@ -1479,7 +1479,7 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip) | |||
1479 | if (error) | 1479 | if (error) |
1480 | goto out_rindex; | 1480 | goto out_rindex; |
1481 | 1481 | ||
1482 | if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) { | 1482 | if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { |
1483 | error = ea_dealloc_indirect(ip); | 1483 | error = ea_dealloc_indirect(ip); |
1484 | if (error) | 1484 | if (error) |
1485 | goto out_rindex; | 1485 | goto out_rindex; |
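
Throughout eattr.c the same field flattening applies: ip->i_di.di_eattr becomes ip->i_eattr and ip->i_di.di_flags becomes ip->i_diskflags. The ea_set_block() hunk also shows the conversion from a single extended-attribute block to an indirect one. A user-space sketch of that step, under the assumption that the layout is "old direct block number goes into slot 0 of the new pointer array"; on disk the pointers are big-endian (__be64), which is elided here, and the flag value is illustrative only:

    #include <stdint.h>
    #include <stdio.h>

    #define DEMO_DIF_EA_INDIRECT 0x00000008u   /* illustrative flag value */

    struct demo_inode {
        uint64_t i_eattr;        /* block no. of the EA (or indirect) block */
        uint32_t i_diskflags;
    };

    static void make_ea_indirect(struct demo_inode *ip, uint64_t indirect_blk,
                                 uint64_t *indirect_array)
    {
        indirect_array[0] = ip->i_eattr;   /* old direct EA block -> slot 0 */
        ip->i_eattr = indirect_blk;        /* inode now points at the array */
        ip->i_diskflags |= DEMO_DIF_EA_INDIRECT;
    }

    int main(void)
    {
        struct demo_inode ip = { .i_eattr = 1234, .i_diskflags = 0 };
        uint64_t blocks[64] = { 0 };

        make_ea_indirect(&ip, 5678, blocks);
        printf("i_eattr=%llu flags=%#x\n",
               (unsigned long long)ip.i_eattr, ip.i_diskflags);
        return 0;
    }
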
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index c962283d4e7f..6b983aef785d 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c | |||
@@ -40,6 +40,7 @@ | |||
40 | #include "quota.h" | 40 | #include "quota.h" |
41 | #include "super.h" | 41 | #include "super.h" |
42 | #include "util.h" | 42 | #include "util.h" |
43 | #include "bmap.h" | ||
43 | 44 | ||
44 | struct gfs2_gl_hash_bucket { | 45 | struct gfs2_gl_hash_bucket { |
45 | struct hlist_head hb_list; | 46 | struct hlist_head hb_list; |
@@ -61,9 +62,10 @@ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int | |||
61 | 62 | ||
62 | static DECLARE_RWSEM(gfs2_umount_flush_sem); | 63 | static DECLARE_RWSEM(gfs2_umount_flush_sem); |
63 | static struct dentry *gfs2_root; | 64 | static struct dentry *gfs2_root; |
64 | static struct task_struct *scand_process; | ||
65 | static unsigned int scand_secs = 5; | ||
66 | static struct workqueue_struct *glock_workqueue; | 65 | static struct workqueue_struct *glock_workqueue; |
66 | static LIST_HEAD(lru_list); | ||
67 | static atomic_t lru_count = ATOMIC_INIT(0); | ||
68 | static DEFINE_SPINLOCK(lru_lock); | ||
67 | 69 | ||
68 | #define GFS2_GL_HASH_SHIFT 15 | 70 | #define GFS2_GL_HASH_SHIFT 15 |
69 | #define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) | 71 | #define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) |
@@ -174,6 +176,22 @@ static void gfs2_glock_hold(struct gfs2_glock *gl) | |||
174 | } | 176 | } |
175 | 177 | ||
176 | /** | 178 | /** |
179 | * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list | ||
180 | * @gl: the glock | ||
181 | * | ||
182 | */ | ||
183 | |||
184 | static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) | ||
185 | { | ||
186 | spin_lock(&lru_lock); | ||
187 | if (list_empty(&gl->gl_lru) && gl->gl_state != LM_ST_UNLOCKED) { | ||
188 | list_add_tail(&gl->gl_lru, &lru_list); | ||
189 | atomic_inc(&lru_count); | ||
190 | } | ||
191 | spin_unlock(&lru_lock); | ||
192 | } | ||
193 | |||
194 | /** | ||
177 | * gfs2_glock_put() - Decrement reference count on glock | 195 | * gfs2_glock_put() - Decrement reference count on glock |
178 | * @gl: The glock to put | 196 | * @gl: The glock to put |
179 | * | 197 | * |
@@ -187,14 +205,23 @@ int gfs2_glock_put(struct gfs2_glock *gl) | |||
187 | if (atomic_dec_and_test(&gl->gl_ref)) { | 205 | if (atomic_dec_and_test(&gl->gl_ref)) { |
188 | hlist_del(&gl->gl_list); | 206 | hlist_del(&gl->gl_list); |
189 | write_unlock(gl_lock_addr(gl->gl_hash)); | 207 | write_unlock(gl_lock_addr(gl->gl_hash)); |
208 | spin_lock(&lru_lock); | ||
209 | if (!list_empty(&gl->gl_lru)) { | ||
210 | list_del_init(&gl->gl_lru); | ||
211 | atomic_dec(&lru_count); | ||
212 | } | ||
213 | spin_unlock(&lru_lock); | ||
190 | GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED); | 214 | GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED); |
191 | GLOCK_BUG_ON(gl, !list_empty(&gl->gl_reclaim)); | 215 | GLOCK_BUG_ON(gl, !list_empty(&gl->gl_lru)); |
192 | GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); | 216 | GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); |
193 | glock_free(gl); | 217 | glock_free(gl); |
194 | rv = 1; | 218 | rv = 1; |
195 | goto out; | 219 | goto out; |
196 | } | 220 | } |
197 | write_unlock(gl_lock_addr(gl->gl_hash)); | 221 | write_unlock(gl_lock_addr(gl->gl_hash)); |
222 | /* 1 for being hashed, 1 for having state != LM_ST_UNLOCKED */ | ||
223 | if (atomic_read(&gl->gl_ref) == 2) | ||
224 | gfs2_glock_schedule_for_reclaim(gl); | ||
198 | out: | 225 | out: |
199 | return rv; | 226 | return rv; |
200 | } | 227 | } |
@@ -289,10 +316,13 @@ static void gfs2_holder_wake(struct gfs2_holder *gh) | |||
289 | * do_promote - promote as many requests as possible on the current queue | 316 | * do_promote - promote as many requests as possible on the current queue |
290 | * @gl: The glock | 317 | * @gl: The glock |
291 | * | 318 | * |
292 | * Returns: true if there is a blocked holder at the head of the list | 319 | * Returns: 1 if there is a blocked holder at the head of the list, or 2 |
320 | * if a type specific operation is underway. | ||
293 | */ | 321 | */ |
294 | 322 | ||
295 | static int do_promote(struct gfs2_glock *gl) | 323 | static int do_promote(struct gfs2_glock *gl) |
324 | __releases(&gl->gl_spin) | ||
325 | __acquires(&gl->gl_spin) | ||
296 | { | 326 | { |
297 | const struct gfs2_glock_operations *glops = gl->gl_ops; | 327 | const struct gfs2_glock_operations *glops = gl->gl_ops; |
298 | struct gfs2_holder *gh, *tmp; | 328 | struct gfs2_holder *gh, *tmp; |
@@ -310,6 +340,8 @@ restart: | |||
310 | ret = glops->go_lock(gh); | 340 | ret = glops->go_lock(gh); |
311 | spin_lock(&gl->gl_spin); | 341 | spin_lock(&gl->gl_spin); |
312 | if (ret) { | 342 | if (ret) { |
343 | if (ret == 1) | ||
344 | return 2; | ||
313 | gh->gh_error = ret; | 345 | gh->gh_error = ret; |
314 | list_del_init(&gh->gh_list); | 346 | list_del_init(&gh->gh_list); |
315 | gfs2_holder_wake(gh); | 347 | gfs2_holder_wake(gh); |
@@ -414,6 +446,7 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) | |||
414 | const struct gfs2_glock_operations *glops = gl->gl_ops; | 446 | const struct gfs2_glock_operations *glops = gl->gl_ops; |
415 | struct gfs2_holder *gh; | 447 | struct gfs2_holder *gh; |
416 | unsigned state = ret & LM_OUT_ST_MASK; | 448 | unsigned state = ret & LM_OUT_ST_MASK; |
449 | int rv; | ||
417 | 450 | ||
418 | spin_lock(&gl->gl_spin); | 451 | spin_lock(&gl->gl_spin); |
419 | state_change(gl, state); | 452 | state_change(gl, state); |
@@ -468,7 +501,6 @@ retry: | |||
468 | gfs2_demote_wake(gl); | 501 | gfs2_demote_wake(gl); |
469 | if (state != LM_ST_UNLOCKED) { | 502 | if (state != LM_ST_UNLOCKED) { |
470 | if (glops->go_xmote_bh) { | 503 | if (glops->go_xmote_bh) { |
471 | int rv; | ||
472 | spin_unlock(&gl->gl_spin); | 504 | spin_unlock(&gl->gl_spin); |
473 | rv = glops->go_xmote_bh(gl, gh); | 505 | rv = glops->go_xmote_bh(gl, gh); |
474 | if (rv == -EAGAIN) | 506 | if (rv == -EAGAIN) |
@@ -479,10 +511,13 @@ retry: | |||
479 | goto out; | 511 | goto out; |
480 | } | 512 | } |
481 | } | 513 | } |
482 | do_promote(gl); | 514 | rv = do_promote(gl); |
515 | if (rv == 2) | ||
516 | goto out_locked; | ||
483 | } | 517 | } |
484 | out: | 518 | out: |
485 | clear_bit(GLF_LOCK, &gl->gl_flags); | 519 | clear_bit(GLF_LOCK, &gl->gl_flags); |
520 | out_locked: | ||
486 | spin_unlock(&gl->gl_spin); | 521 | spin_unlock(&gl->gl_spin); |
487 | gfs2_glock_put(gl); | 522 | gfs2_glock_put(gl); |
488 | } | 523 | } |
@@ -511,6 +546,8 @@ static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, | |||
511 | */ | 546 | */ |
512 | 547 | ||
513 | static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target) | 548 | static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target) |
549 | __releases(&gl->gl_spin) | ||
550 | __acquires(&gl->gl_spin) | ||
514 | { | 551 | { |
515 | const struct gfs2_glock_operations *glops = gl->gl_ops; | 552 | const struct gfs2_glock_operations *glops = gl->gl_ops; |
516 | struct gfs2_sbd *sdp = gl->gl_sbd; | 553 | struct gfs2_sbd *sdp = gl->gl_sbd; |
@@ -576,8 +613,11 @@ static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl) | |||
576 | */ | 613 | */ |
577 | 614 | ||
578 | static void run_queue(struct gfs2_glock *gl, const int nonblock) | 615 | static void run_queue(struct gfs2_glock *gl, const int nonblock) |
616 | __releases(&gl->gl_spin) | ||
617 | __acquires(&gl->gl_spin) | ||
579 | { | 618 | { |
580 | struct gfs2_holder *gh = NULL; | 619 | struct gfs2_holder *gh = NULL; |
620 | int ret; | ||
581 | 621 | ||
582 | if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) | 622 | if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) |
583 | return; | 623 | return; |
@@ -596,8 +636,11 @@ static void run_queue(struct gfs2_glock *gl, const int nonblock) | |||
596 | } else { | 636 | } else { |
597 | if (test_bit(GLF_DEMOTE, &gl->gl_flags)) | 637 | if (test_bit(GLF_DEMOTE, &gl->gl_flags)) |
598 | gfs2_demote_wake(gl); | 638 | gfs2_demote_wake(gl); |
599 | if (do_promote(gl) == 0) | 639 | ret = do_promote(gl); |
640 | if (ret == 0) | ||
600 | goto out; | 641 | goto out; |
642 | if (ret == 2) | ||
643 | return; | ||
601 | gh = find_first_waiter(gl); | 644 | gh = find_first_waiter(gl); |
602 | gl->gl_target = gh->gh_state; | 645 | gl->gl_target = gh->gh_state; |
603 | if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) | 646 | if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) |
@@ -820,7 +863,7 @@ static void wait_on_demote(struct gfs2_glock *gl) | |||
820 | */ | 863 | */ |
821 | 864 | ||
822 | static void handle_callback(struct gfs2_glock *gl, unsigned int state, | 865 | static void handle_callback(struct gfs2_glock *gl, unsigned int state, |
823 | int remote, unsigned long delay) | 866 | unsigned long delay) |
824 | { | 867 | { |
825 | int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE; | 868 | int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE; |
826 | 869 | ||
@@ -828,9 +871,6 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state, | |||
828 | if (gl->gl_demote_state == LM_ST_EXCLUSIVE) { | 871 | if (gl->gl_demote_state == LM_ST_EXCLUSIVE) { |
829 | gl->gl_demote_state = state; | 872 | gl->gl_demote_state = state; |
830 | gl->gl_demote_time = jiffies; | 873 | gl->gl_demote_time = jiffies; |
831 | if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN && | ||
832 | gl->gl_object) | ||
833 | gfs2_glock_schedule_for_reclaim(gl); | ||
834 | } else if (gl->gl_demote_state != LM_ST_UNLOCKED && | 874 | } else if (gl->gl_demote_state != LM_ST_UNLOCKED && |
835 | gl->gl_demote_state != state) { | 875 | gl->gl_demote_state != state) { |
836 | gl->gl_demote_state = LM_ST_UNLOCKED; | 876 | gl->gl_demote_state = LM_ST_UNLOCKED; |
@@ -877,6 +917,8 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) | |||
877 | */ | 917 | */ |
878 | 918 | ||
879 | static inline void add_to_queue(struct gfs2_holder *gh) | 919 | static inline void add_to_queue(struct gfs2_holder *gh) |
920 | __releases(&gl->gl_spin) | ||
921 | __acquires(&gl->gl_spin) | ||
880 | { | 922 | { |
881 | struct gfs2_glock *gl = gh->gh_gl; | 923 | struct gfs2_glock *gl = gh->gh_gl; |
882 | struct gfs2_sbd *sdp = gl->gl_sbd; | 924 | struct gfs2_sbd *sdp = gl->gl_sbd; |
@@ -998,7 +1040,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh) | |||
998 | 1040 | ||
999 | spin_lock(&gl->gl_spin); | 1041 | spin_lock(&gl->gl_spin); |
1000 | if (gh->gh_flags & GL_NOCACHE) | 1042 | if (gh->gh_flags & GL_NOCACHE) |
1001 | handle_callback(gl, LM_ST_UNLOCKED, 0, 0); | 1043 | handle_callback(gl, LM_ST_UNLOCKED, 0); |
1002 | 1044 | ||
1003 | list_del_init(&gh->gh_list); | 1045 | list_del_init(&gh->gh_list); |
1004 | if (find_first_holder(gl) == NULL) { | 1046 | if (find_first_holder(gl) == NULL) { |
@@ -1269,12 +1311,26 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name, | |||
1269 | delay = gl->gl_ops->go_min_hold_time; | 1311 | delay = gl->gl_ops->go_min_hold_time; |
1270 | 1312 | ||
1271 | spin_lock(&gl->gl_spin); | 1313 | spin_lock(&gl->gl_spin); |
1272 | handle_callback(gl, state, 1, delay); | 1314 | handle_callback(gl, state, delay); |
1273 | spin_unlock(&gl->gl_spin); | 1315 | spin_unlock(&gl->gl_spin); |
1274 | if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) | 1316 | if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) |
1275 | gfs2_glock_put(gl); | 1317 | gfs2_glock_put(gl); |
1276 | } | 1318 | } |
1277 | 1319 | ||
1320 | static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid) | ||
1321 | { | ||
1322 | struct gfs2_jdesc *jd; | ||
1323 | |||
1324 | spin_lock(&sdp->sd_jindex_spin); | ||
1325 | list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { | ||
1326 | if (jd->jd_jid != jid) | ||
1327 | continue; | ||
1328 | jd->jd_dirty = 1; | ||
1329 | break; | ||
1330 | } | ||
1331 | spin_unlock(&sdp->sd_jindex_spin); | ||
1332 | } | ||
1333 | |||
1278 | /** | 1334 | /** |
1279 | * gfs2_glock_cb - Callback used by locking module | 1335 | * gfs2_glock_cb - Callback used by locking module |
1280 | * @sdp: Pointer to the superblock | 1336 | * @sdp: Pointer to the superblock |
@@ -1338,80 +1394,83 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data) | |||
1338 | * Returns: 1 if it's ok | 1394 | * Returns: 1 if it's ok |
1339 | */ | 1395 | */ |
1340 | 1396 | ||
1341 | static int demote_ok(struct gfs2_glock *gl) | 1397 | static int demote_ok(const struct gfs2_glock *gl) |
1342 | { | 1398 | { |
1343 | const struct gfs2_glock_operations *glops = gl->gl_ops; | 1399 | const struct gfs2_glock_operations *glops = gl->gl_ops; |
1344 | int demote = 1; | ||
1345 | |||
1346 | if (test_bit(GLF_STICKY, &gl->gl_flags)) | ||
1347 | demote = 0; | ||
1348 | else if (glops->go_demote_ok) | ||
1349 | demote = glops->go_demote_ok(gl); | ||
1350 | |||
1351 | return demote; | ||
1352 | } | ||
1353 | |||
1354 | /** | ||
1355 | * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list | ||
1356 | * @gl: the glock | ||
1357 | * | ||
1358 | */ | ||
1359 | |||
1360 | void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) | ||
1361 | { | ||
1362 | struct gfs2_sbd *sdp = gl->gl_sbd; | ||
1363 | 1400 | ||
1364 | spin_lock(&sdp->sd_reclaim_lock); | 1401 | if (gl->gl_state == LM_ST_UNLOCKED) |
1365 | if (list_empty(&gl->gl_reclaim)) { | 1402 | return 0; |
1366 | gfs2_glock_hold(gl); | 1403 | if (!list_empty(&gl->gl_holders)) |
1367 | list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list); | 1404 | return 0; |
1368 | atomic_inc(&sdp->sd_reclaim_count); | 1405 | if (glops->go_demote_ok) |
1369 | spin_unlock(&sdp->sd_reclaim_lock); | 1406 | return glops->go_demote_ok(gl); |
1370 | wake_up(&sdp->sd_reclaim_wq); | 1407 | return 1; |
1371 | } else | ||
1372 | spin_unlock(&sdp->sd_reclaim_lock); | ||
1373 | } | 1408 | } |
1374 | 1409 | ||
1375 | /** | ||
1376 | * gfs2_reclaim_glock - process the next glock on the filesystem's reclaim list | ||
1377 | * @sdp: the filesystem | ||
1378 | * | ||
1379 | * Called from gfs2_glockd() glock reclaim daemon, or when promoting a | ||
1380 | * different glock and we notice that there are a lot of glocks in the | ||
1381 | * reclaim list. | ||
1382 | * | ||
1383 | */ | ||
1384 | 1410 | ||
1385 | void gfs2_reclaim_glock(struct gfs2_sbd *sdp) | 1411 | static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask) |
1386 | { | 1412 | { |
1387 | struct gfs2_glock *gl; | 1413 | struct gfs2_glock *gl; |
1388 | int done_callback = 0; | 1414 | int may_demote; |
1415 | int nr_skipped = 0; | ||
1416 | int got_ref = 0; | ||
1417 | LIST_HEAD(skipped); | ||
1389 | 1418 | ||
1390 | spin_lock(&sdp->sd_reclaim_lock); | 1419 | if (nr == 0) |
1391 | if (list_empty(&sdp->sd_reclaim_list)) { | 1420 | goto out; |
1392 | spin_unlock(&sdp->sd_reclaim_lock); | ||
1393 | return; | ||
1394 | } | ||
1395 | gl = list_entry(sdp->sd_reclaim_list.next, | ||
1396 | struct gfs2_glock, gl_reclaim); | ||
1397 | list_del_init(&gl->gl_reclaim); | ||
1398 | spin_unlock(&sdp->sd_reclaim_lock); | ||
1399 | 1421 | ||
1400 | atomic_dec(&sdp->sd_reclaim_count); | 1422 | if (!(gfp_mask & __GFP_FS)) |
1401 | atomic_inc(&sdp->sd_reclaimed); | 1423 | return -1; |
1402 | 1424 | ||
1403 | spin_lock(&gl->gl_spin); | 1425 | spin_lock(&lru_lock); |
1404 | if (find_first_holder(gl) == NULL && | 1426 | while(nr && !list_empty(&lru_list)) { |
1405 | gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) { | 1427 | gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru); |
1406 | handle_callback(gl, LM_ST_UNLOCKED, 0, 0); | 1428 | list_del_init(&gl->gl_lru); |
1407 | done_callback = 1; | 1429 | atomic_dec(&lru_count); |
1430 | |||
1431 | /* Test for being demotable */ | ||
1432 | if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { | ||
1433 | gfs2_glock_hold(gl); | ||
1434 | got_ref = 1; | ||
1435 | spin_unlock(&lru_lock); | ||
1436 | spin_lock(&gl->gl_spin); | ||
1437 | may_demote = demote_ok(gl); | ||
1438 | spin_unlock(&gl->gl_spin); | ||
1439 | clear_bit(GLF_LOCK, &gl->gl_flags); | ||
1440 | if (may_demote) { | ||
1441 | handle_callback(gl, LM_ST_UNLOCKED, 0); | ||
1442 | nr--; | ||
1443 | if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) | ||
1444 | gfs2_glock_put(gl); | ||
1445 | } | ||
1446 | spin_lock(&lru_lock); | ||
1447 | if (may_demote) | ||
1448 | continue; | ||
1449 | } | ||
1450 | if (list_empty(&gl->gl_lru) && | ||
1451 | (atomic_read(&gl->gl_ref) <= (2 + got_ref))) { | ||
1452 | nr_skipped++; | ||
1453 | list_add(&gl->gl_lru, &skipped); | ||
1454 | } | ||
1455 | if (got_ref) { | ||
1456 | spin_unlock(&lru_lock); | ||
1457 | gfs2_glock_put(gl); | ||
1458 | spin_lock(&lru_lock); | ||
1459 | got_ref = 0; | ||
1460 | } | ||
1408 | } | 1461 | } |
1409 | spin_unlock(&gl->gl_spin); | 1462 | list_splice(&skipped, &lru_list); |
1410 | if (!done_callback || | 1463 | atomic_add(nr_skipped, &lru_count); |
1411 | queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) | 1464 | spin_unlock(&lru_lock); |
1412 | gfs2_glock_put(gl); | 1465 | out: |
1466 | return (atomic_read(&lru_count) / 100) * sysctl_vfs_cache_pressure; | ||
1413 | } | 1467 | } |
1414 | 1468 | ||
1469 | static struct shrinker glock_shrinker = { | ||
1470 | .shrink = gfs2_shrink_glock_memory, | ||
1471 | .seeks = DEFAULT_SEEKS, | ||
1472 | }; | ||
1473 | |||
1415 | /** | 1474 | /** |
1416 | * examine_bucket - Call a function for glock in a hash bucket | 1475 | * examine_bucket - Call a function for glock in a hash bucket |
1417 | * @examiner: the function | 1476 | * @examiner: the function |
@@ -1457,26 +1516,6 @@ out: | |||
1457 | } | 1516 | } |
1458 | 1517 | ||
1459 | /** | 1518 | /** |
1460 | * scan_glock - look at a glock and see if we can reclaim it | ||
1461 | * @gl: the glock to look at | ||
1462 | * | ||
1463 | */ | ||
1464 | |||
1465 | static void scan_glock(struct gfs2_glock *gl) | ||
1466 | { | ||
1467 | if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) | ||
1468 | return; | ||
1469 | if (test_bit(GLF_LOCK, &gl->gl_flags)) | ||
1470 | return; | ||
1471 | |||
1472 | spin_lock(&gl->gl_spin); | ||
1473 | if (find_first_holder(gl) == NULL && | ||
1474 | gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) | ||
1475 | gfs2_glock_schedule_for_reclaim(gl); | ||
1476 | spin_unlock(&gl->gl_spin); | ||
1477 | } | ||
1478 | |||
1479 | /** | ||
1480 | * clear_glock - look at a glock and see if we can free it from glock cache | 1519 | * clear_glock - look at a glock and see if we can free it from glock cache |
1481 | * @gl: the glock to look at | 1520 | * @gl: the glock to look at |
1482 | * | 1521 | * |
@@ -1484,23 +1523,16 @@ static void scan_glock(struct gfs2_glock *gl) | |||
1484 | 1523 | ||
1485 | static void clear_glock(struct gfs2_glock *gl) | 1524 | static void clear_glock(struct gfs2_glock *gl) |
1486 | { | 1525 | { |
1487 | struct gfs2_sbd *sdp = gl->gl_sbd; | 1526 | spin_lock(&lru_lock); |
1488 | int released; | 1527 | if (!list_empty(&gl->gl_lru)) { |
1489 | 1528 | list_del_init(&gl->gl_lru); | |
1490 | spin_lock(&sdp->sd_reclaim_lock); | 1529 | atomic_dec(&lru_count); |
1491 | if (!list_empty(&gl->gl_reclaim)) { | ||
1492 | list_del_init(&gl->gl_reclaim); | ||
1493 | atomic_dec(&sdp->sd_reclaim_count); | ||
1494 | spin_unlock(&sdp->sd_reclaim_lock); | ||
1495 | released = gfs2_glock_put(gl); | ||
1496 | gfs2_assert(sdp, !released); | ||
1497 | } else { | ||
1498 | spin_unlock(&sdp->sd_reclaim_lock); | ||
1499 | } | 1530 | } |
1531 | spin_unlock(&lru_lock); | ||
1500 | 1532 | ||
1501 | spin_lock(&gl->gl_spin); | 1533 | spin_lock(&gl->gl_spin); |
1502 | if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED) | 1534 | if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED) |
1503 | handle_callback(gl, LM_ST_UNLOCKED, 0, 0); | 1535 | handle_callback(gl, LM_ST_UNLOCKED, 0); |
1504 | spin_unlock(&gl->gl_spin); | 1536 | spin_unlock(&gl->gl_spin); |
1505 | gfs2_glock_hold(gl); | 1537 | gfs2_glock_hold(gl); |
1506 | if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) | 1538 | if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) |
@@ -1548,6 +1580,20 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) | |||
1548 | } | 1580 | } |
1549 | } | 1581 | } |
1550 | 1582 | ||
1583 | void gfs2_glock_finish_truncate(struct gfs2_inode *ip) | ||
1584 | { | ||
1585 | struct gfs2_glock *gl = ip->i_gl; | ||
1586 | int ret; | ||
1587 | |||
1588 | ret = gfs2_truncatei_resume(ip); | ||
1589 | gfs2_assert_withdraw(gl->gl_sbd, ret == 0); | ||
1590 | |||
1591 | spin_lock(&gl->gl_spin); | ||
1592 | clear_bit(GLF_LOCK, &gl->gl_flags); | ||
1593 | run_queue(gl, 1); | ||
1594 | spin_unlock(&gl->gl_spin); | ||
1595 | } | ||
1596 | |||
1551 | static const char *state2str(unsigned state) | 1597 | static const char *state2str(unsigned state) |
1552 | { | 1598 | { |
1553 | switch(state) { | 1599 | switch(state) { |
@@ -1623,8 +1669,6 @@ static const char *gflags2str(char *buf, const unsigned long *gflags) | |||
1623 | char *p = buf; | 1669 | char *p = buf; |
1624 | if (test_bit(GLF_LOCK, gflags)) | 1670 | if (test_bit(GLF_LOCK, gflags)) |
1625 | *p++ = 'l'; | 1671 | *p++ = 'l'; |
1626 | if (test_bit(GLF_STICKY, gflags)) | ||
1627 | *p++ = 's'; | ||
1628 | if (test_bit(GLF_DEMOTE, gflags)) | 1672 | if (test_bit(GLF_DEMOTE, gflags)) |
1629 | *p++ = 'D'; | 1673 | *p++ = 'D'; |
1630 | if (test_bit(GLF_PENDING_DEMOTE, gflags)) | 1674 | if (test_bit(GLF_PENDING_DEMOTE, gflags)) |
@@ -1743,34 +1787,6 @@ static int gfs2_dump_lockstate(struct gfs2_sbd *sdp) | |||
1743 | return error; | 1787 | return error; |
1744 | } | 1788 | } |
1745 | 1789 | ||
1746 | /** | ||
1747 | * gfs2_scand - Look for cached glocks and inodes to toss from memory | ||
1748 | * @sdp: Pointer to GFS2 superblock | ||
1749 | * | ||
1750 | * One of these daemons runs, finding candidates to add to sd_reclaim_list. | ||
1751 | * See gfs2_glockd() | ||
1752 | */ | ||
1753 | |||
1754 | static int gfs2_scand(void *data) | ||
1755 | { | ||
1756 | unsigned x; | ||
1757 | unsigned delay; | ||
1758 | |||
1759 | while (!kthread_should_stop()) { | ||
1760 | for (x = 0; x < GFS2_GL_HASH_SIZE; x++) | ||
1761 | examine_bucket(scan_glock, NULL, x); | ||
1762 | if (freezing(current)) | ||
1763 | refrigerator(); | ||
1764 | delay = scand_secs; | ||
1765 | if (delay < 1) | ||
1766 | delay = 1; | ||
1767 | schedule_timeout_interruptible(delay * HZ); | ||
1768 | } | ||
1769 | |||
1770 | return 0; | ||
1771 | } | ||
1772 | |||
1773 | |||
1774 | 1790 | ||
1775 | int __init gfs2_glock_init(void) | 1791 | int __init gfs2_glock_init(void) |
1776 | { | 1792 | { |
@@ -1784,28 +1800,21 @@ int __init gfs2_glock_init(void) | |||
1784 | } | 1800 | } |
1785 | #endif | 1801 | #endif |
1786 | 1802 | ||
1787 | scand_process = kthread_run(gfs2_scand, NULL, "gfs2_scand"); | ||
1788 | if (IS_ERR(scand_process)) | ||
1789 | return PTR_ERR(scand_process); | ||
1790 | |||
1791 | glock_workqueue = create_workqueue("glock_workqueue"); | 1803 | glock_workqueue = create_workqueue("glock_workqueue"); |
1792 | if (IS_ERR(glock_workqueue)) { | 1804 | if (IS_ERR(glock_workqueue)) |
1793 | kthread_stop(scand_process); | ||
1794 | return PTR_ERR(glock_workqueue); | 1805 | return PTR_ERR(glock_workqueue); |
1795 | } | 1806 | |
1807 | register_shrinker(&glock_shrinker); | ||
1796 | 1808 | ||
1797 | return 0; | 1809 | return 0; |
1798 | } | 1810 | } |
1799 | 1811 | ||
1800 | void gfs2_glock_exit(void) | 1812 | void gfs2_glock_exit(void) |
1801 | { | 1813 | { |
1814 | unregister_shrinker(&glock_shrinker); | ||
1802 | destroy_workqueue(glock_workqueue); | 1815 | destroy_workqueue(glock_workqueue); |
1803 | kthread_stop(scand_process); | ||
1804 | } | 1816 | } |
1805 | 1817 | ||
1806 | module_param(scand_secs, uint, S_IRUGO|S_IWUSR); | ||
1807 | MODULE_PARM_DESC(scand_secs, "The number of seconds between scand runs"); | ||
1808 | |||
1809 | static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) | 1818 | static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) |
1810 | { | 1819 | { |
1811 | struct gfs2_glock *gl; | 1820 | struct gfs2_glock *gl; |
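
The glock.c changes above drop the scand/reclaim daemon and its reclaim list in favour of a global LRU (lru_list, lru_count, lru_lock) walked by a registered shrinker under memory pressure: demotable glocks are demoted, non-demotable ones are collected on a local "skipped" list and spliced back onto the LRU. A compact user-space sketch of that scan-demote-requeue shape, using an array in place of the kernel list_head machinery (everything here is a stand-in, not the kernel types):

    #include <stdbool.h>
    #include <stdio.h>

    struct demo_glock {
        int  id;
        bool in_use;    /* stands in for "has holders / not demotable" */
        bool demoted;
    };

    /* Scan the LRU, demote up to nr_to_scan idle entries, keep the rest. */
    static int shrink_lru(struct demo_glock *lru[], int *count, int nr_to_scan)
    {
        struct demo_glock *keep[16];
        int nkeep = 0, demoted = 0, i;

        for (i = 0; i < *count; i++) {
            struct demo_glock *gl = lru[i];
            if (demoted < nr_to_scan && !gl->in_use) {   /* demote_ok() */
                gl->demoted = true;   /* handle_callback(LM_ST_UNLOCKED, 0) */
                demoted++;
            } else {
                keep[nkeep++] = gl;   /* skipped entry, stays cached */
            }
        }
        for (i = 0; i < nkeep; i++)   /* list_splice(&skipped, &lru_list) */
            lru[i] = keep[i];
        *count = nkeep;
        return demoted;
    }

    int main(void)
    {
        struct demo_glock g[4] = {
            { 1, false }, { 2, true }, { 3, false }, { 4, true },
        };
        struct demo_glock *lru[4] = { &g[0], &g[1], &g[2], &g[3] };
        int count = 4;
        int demoted = shrink_lru(lru, &count, 8);

        printf("demoted %d glock(s), %d left on the lru\n", demoted, count);
        return 0;
    }
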
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 695c6b193611..543ec7ecfbda 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h | |||
@@ -129,9 +129,9 @@ int gfs2_lvb_hold(struct gfs2_glock *gl); | |||
129 | void gfs2_lvb_unhold(struct gfs2_glock *gl); | 129 | void gfs2_lvb_unhold(struct gfs2_glock *gl); |
130 | 130 | ||
131 | void gfs2_glock_cb(void *cb_data, unsigned int type, void *data); | 131 | void gfs2_glock_cb(void *cb_data, unsigned int type, void *data); |
132 | void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl); | ||
133 | void gfs2_reclaim_glock(struct gfs2_sbd *sdp); | 132 | void gfs2_reclaim_glock(struct gfs2_sbd *sdp); |
134 | void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); | 133 | void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); |
134 | void gfs2_glock_finish_truncate(struct gfs2_inode *ip); | ||
135 | 135 | ||
136 | int __init gfs2_glock_init(void); | 136 | int __init gfs2_glock_init(void); |
137 | void gfs2_glock_exit(void); | 137 | void gfs2_glock_exit(void); |
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index c6c318c2a0f6..8522d3aa64fc 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c | |||
@@ -201,19 +201,12 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags) | |||
201 | * Returns: 1 if it's ok | 201 | * Returns: 1 if it's ok |
202 | */ | 202 | */ |
203 | 203 | ||
204 | static int inode_go_demote_ok(struct gfs2_glock *gl) | 204 | static int inode_go_demote_ok(const struct gfs2_glock *gl) |
205 | { | 205 | { |
206 | struct gfs2_sbd *sdp = gl->gl_sbd; | 206 | struct gfs2_sbd *sdp = gl->gl_sbd; |
207 | int demote = 0; | 207 | if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object) |
208 | 208 | return 0; | |
209 | if (!gl->gl_object && !gl->gl_aspace->i_mapping->nrpages) | 209 | return 1; |
210 | demote = 1; | ||
211 | else if (!sdp->sd_args.ar_localcaching && | ||
212 | time_after_eq(jiffies, gl->gl_stamp + | ||
213 | gfs2_tune_get(sdp, gt_demote_secs) * HZ)) | ||
214 | demote = 1; | ||
215 | |||
216 | return demote; | ||
217 | } | 210 | } |
218 | 211 | ||
219 | /** | 212 | /** |
@@ -227,6 +220,7 @@ static int inode_go_demote_ok(struct gfs2_glock *gl) | |||
227 | static int inode_go_lock(struct gfs2_holder *gh) | 220 | static int inode_go_lock(struct gfs2_holder *gh) |
228 | { | 221 | { |
229 | struct gfs2_glock *gl = gh->gh_gl; | 222 | struct gfs2_glock *gl = gh->gh_gl; |
223 | struct gfs2_sbd *sdp = gl->gl_sbd; | ||
230 | struct gfs2_inode *ip = gl->gl_object; | 224 | struct gfs2_inode *ip = gl->gl_object; |
231 | int error = 0; | 225 | int error = 0; |
232 | 226 | ||
@@ -239,10 +233,16 @@ static int inode_go_lock(struct gfs2_holder *gh) | |||
239 | return error; | 233 | return error; |
240 | } | 234 | } |
241 | 235 | ||
242 | if ((ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG) && | 236 | if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) && |
243 | (gl->gl_state == LM_ST_EXCLUSIVE) && | 237 | (gl->gl_state == LM_ST_EXCLUSIVE) && |
244 | (gh->gh_state == LM_ST_EXCLUSIVE)) | 238 | (gh->gh_state == LM_ST_EXCLUSIVE)) { |
245 | error = gfs2_truncatei_resume(ip); | 239 | spin_lock(&sdp->sd_trunc_lock); |
240 | if (list_empty(&ip->i_trunc_list)) | ||
241 | list_add(&sdp->sd_trunc_list, &ip->i_trunc_list); | ||
242 | spin_unlock(&sdp->sd_trunc_lock); | ||
243 | wake_up(&sdp->sd_quota_wait); | ||
244 | return 1; | ||
245 | } | ||
246 | 246 | ||
247 | return error; | 247 | return error; |
248 | } | 248 | } |
@@ -260,10 +260,13 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) | |||
260 | const struct gfs2_inode *ip = gl->gl_object; | 260 | const struct gfs2_inode *ip = gl->gl_object; |
261 | if (ip == NULL) | 261 | if (ip == NULL) |
262 | return 0; | 262 | return 0; |
263 | gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%08lx\n", | 263 | gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu/%llu\n", |
264 | (unsigned long long)ip->i_no_formal_ino, | 264 | (unsigned long long)ip->i_no_formal_ino, |
265 | (unsigned long long)ip->i_no_addr, | 265 | (unsigned long long)ip->i_no_addr, |
266 | IF2DT(ip->i_inode.i_mode), ip->i_flags); | 266 | IF2DT(ip->i_inode.i_mode), ip->i_flags, |
267 | (unsigned int)ip->i_diskflags, | ||
268 | (unsigned long long)ip->i_inode.i_size, | ||
269 | (unsigned long long)ip->i_disksize); | ||
267 | return 0; | 270 | return 0; |
268 | } | 271 | } |
269 | 272 | ||
@@ -274,7 +277,7 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) | |||
274 | * Returns: 1 if it's ok | 277 | * Returns: 1 if it's ok |
275 | */ | 278 | */ |
276 | 279 | ||
277 | static int rgrp_go_demote_ok(struct gfs2_glock *gl) | 280 | static int rgrp_go_demote_ok(const struct gfs2_glock *gl) |
278 | { | 281 | { |
279 | return !gl->gl_aspace->i_mapping->nrpages; | 282 | return !gl->gl_aspace->i_mapping->nrpages; |
280 | } | 283 | } |
@@ -318,7 +321,9 @@ static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) | |||
318 | const struct gfs2_rgrpd *rgd = gl->gl_object; | 321 | const struct gfs2_rgrpd *rgd = gl->gl_object; |
319 | if (rgd == NULL) | 322 | if (rgd == NULL) |
320 | return 0; | 323 | return 0; |
321 | gfs2_print_dbg(seq, " R: n:%llu\n", (unsigned long long)rgd->rd_addr); | 324 | gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n", |
325 | (unsigned long long)rgd->rd_addr, rgd->rd_flags, | ||
326 | rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes); | ||
322 | return 0; | 327 | return 0; |
323 | } | 328 | } |
324 | 329 | ||
@@ -374,13 +379,25 @@ static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh) | |||
374 | } | 379 | } |
375 | 380 | ||
376 | /** | 381 | /** |
382 | * trans_go_demote_ok | ||
383 | * @gl: the glock | ||
384 | * | ||
385 | * Always returns 0 | ||
386 | */ | ||
387 | |||
388 | static int trans_go_demote_ok(const struct gfs2_glock *gl) | ||
389 | { | ||
390 | return 0; | ||
391 | } | ||
392 | |||
393 | /** | ||
377 | * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock | 394 | * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock |
378 | * @gl: the glock | 395 | * @gl: the glock |
379 | * | 396 | * |
380 | * Returns: 1 if it's ok | 397 | * Returns: 1 if it's ok |
381 | */ | 398 | */ |
382 | 399 | ||
383 | static int quota_go_demote_ok(struct gfs2_glock *gl) | 400 | static int quota_go_demote_ok(const struct gfs2_glock *gl) |
384 | { | 401 | { |
385 | return !atomic_read(&gl->gl_lvb_count); | 402 | return !atomic_read(&gl->gl_lvb_count); |
386 | } | 403 | } |
@@ -414,6 +431,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = { | |||
414 | const struct gfs2_glock_operations gfs2_trans_glops = { | 431 | const struct gfs2_glock_operations gfs2_trans_glops = { |
415 | .go_xmote_th = trans_go_sync, | 432 | .go_xmote_th = trans_go_sync, |
416 | .go_xmote_bh = trans_go_xmote_bh, | 433 | .go_xmote_bh = trans_go_xmote_bh, |
434 | .go_demote_ok = trans_go_demote_ok, | ||
417 | .go_type = LM_TYPE_NONDISK, | 435 | .go_type = LM_TYPE_NONDISK, |
418 | }; | 436 | }; |
419 | 437 | ||
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index f566ec1b4e8e..608849d00021 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h | |||
@@ -68,12 +68,6 @@ struct gfs2_bitmap { | |||
68 | u32 bi_len; | 68 | u32 bi_len; |
69 | }; | 69 | }; |
70 | 70 | ||
71 | struct gfs2_rgrp_host { | ||
72 | u32 rg_free; | ||
73 | u32 rg_dinodes; | ||
74 | u64 rg_igeneration; | ||
75 | }; | ||
76 | |||
77 | struct gfs2_rgrpd { | 71 | struct gfs2_rgrpd { |
78 | struct list_head rd_list; /* Link with superblock */ | 72 | struct list_head rd_list; /* Link with superblock */ |
79 | struct list_head rd_list_mru; | 73 | struct list_head rd_list_mru; |
@@ -83,14 +77,16 @@ struct gfs2_rgrpd { | |||
83 | u32 rd_length; /* length of rgrp header in fs blocks */ | 77 | u32 rd_length; /* length of rgrp header in fs blocks */ |
84 | u32 rd_data; /* num of data blocks in rgrp */ | 78 | u32 rd_data; /* num of data blocks in rgrp */ |
85 | u32 rd_bitbytes; /* number of bytes in data bitmaps */ | 79 | u32 rd_bitbytes; /* number of bytes in data bitmaps */ |
86 | struct gfs2_rgrp_host rd_rg; | 80 | u32 rd_free; |
81 | u32 rd_free_clone; | ||
82 | u32 rd_dinodes; | ||
83 | u64 rd_igeneration; | ||
87 | struct gfs2_bitmap *rd_bits; | 84 | struct gfs2_bitmap *rd_bits; |
88 | unsigned int rd_bh_count; | ||
89 | struct mutex rd_mutex; | 85 | struct mutex rd_mutex; |
90 | u32 rd_free_clone; | ||
91 | struct gfs2_log_element rd_le; | 86 | struct gfs2_log_element rd_le; |
92 | u32 rd_last_alloc; | ||
93 | struct gfs2_sbd *rd_sbd; | 87 | struct gfs2_sbd *rd_sbd; |
88 | unsigned int rd_bh_count; | ||
89 | u32 rd_last_alloc; | ||
94 | unsigned char rd_flags; | 90 | unsigned char rd_flags; |
95 | #define GFS2_RDF_CHECK 0x01 /* Need to check for unlinked inodes */ | 91 | #define GFS2_RDF_CHECK 0x01 /* Need to check for unlinked inodes */ |
96 | #define GFS2_RDF_NOALLOC 0x02 /* rg prohibits allocation */ | 92 | #define GFS2_RDF_NOALLOC 0x02 /* rg prohibits allocation */ |
@@ -129,7 +125,7 @@ struct gfs2_glock_operations { | |||
129 | void (*go_xmote_th) (struct gfs2_glock *gl); | 125 | void (*go_xmote_th) (struct gfs2_glock *gl); |
130 | int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh); | 126 | int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh); |
131 | void (*go_inval) (struct gfs2_glock *gl, int flags); | 127 | void (*go_inval) (struct gfs2_glock *gl, int flags); |
132 | int (*go_demote_ok) (struct gfs2_glock *gl); | 128 | int (*go_demote_ok) (const struct gfs2_glock *gl); |
133 | int (*go_lock) (struct gfs2_holder *gh); | 129 | int (*go_lock) (struct gfs2_holder *gh); |
134 | void (*go_unlock) (struct gfs2_holder *gh); | 130 | void (*go_unlock) (struct gfs2_holder *gh); |
135 | int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); | 131 | int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); |
@@ -159,7 +155,6 @@ struct gfs2_holder { | |||
159 | 155 | ||
160 | enum { | 156 | enum { |
161 | GLF_LOCK = 1, | 157 | GLF_LOCK = 1, |
162 | GLF_STICKY = 2, | ||
163 | GLF_DEMOTE = 3, | 158 | GLF_DEMOTE = 3, |
164 | GLF_PENDING_DEMOTE = 4, | 159 | GLF_PENDING_DEMOTE = 4, |
165 | GLF_DEMOTE_IN_PROGRESS = 5, | 160 | GLF_DEMOTE_IN_PROGRESS = 5, |
@@ -194,7 +189,7 @@ struct gfs2_glock { | |||
194 | unsigned long gl_tchange; | 189 | unsigned long gl_tchange; |
195 | void *gl_object; | 190 | void *gl_object; |
196 | 191 | ||
197 | struct list_head gl_reclaim; | 192 | struct list_head gl_lru; |
198 | 193 | ||
199 | struct gfs2_sbd *gl_sbd; | 194 | struct gfs2_sbd *gl_sbd; |
200 | 195 | ||
@@ -233,29 +228,24 @@ enum { | |||
233 | GIF_USER = 4, /* user inode, not metadata addr space */ | 228 | GIF_USER = 4, /* user inode, not metadata addr space */ |
234 | }; | 229 | }; |
235 | 230 | ||
236 | struct gfs2_dinode_host { | ||
237 | u64 di_size; /* number of bytes in file */ | ||
238 | u64 di_generation; /* generation number for NFS */ | ||
239 | u32 di_flags; /* GFS2_DIF_... */ | ||
240 | /* These only apply to directories */ | ||
241 | u32 di_entries; /* The number of entries in the directory */ | ||
242 | u64 di_eattr; /* extended attribute block number */ | ||
243 | }; | ||
244 | 231 | ||
245 | struct gfs2_inode { | 232 | struct gfs2_inode { |
246 | struct inode i_inode; | 233 | struct inode i_inode; |
247 | u64 i_no_addr; | 234 | u64 i_no_addr; |
248 | u64 i_no_formal_ino; | 235 | u64 i_no_formal_ino; |
236 | u64 i_generation; | ||
237 | u64 i_eattr; | ||
238 | loff_t i_disksize; | ||
249 | unsigned long i_flags; /* GIF_... */ | 239 | unsigned long i_flags; /* GIF_... */ |
250 | |||
251 | struct gfs2_dinode_host i_di; /* To be replaced by ref to block */ | ||
252 | |||
253 | struct gfs2_glock *i_gl; /* Move into i_gh? */ | 240 | struct gfs2_glock *i_gl; /* Move into i_gh? */ |
254 | struct gfs2_holder i_iopen_gh; | 241 | struct gfs2_holder i_iopen_gh; |
255 | struct gfs2_holder i_gh; /* for prepare/commit_write only */ | 242 | struct gfs2_holder i_gh; /* for prepare/commit_write only */ |
256 | struct gfs2_alloc *i_alloc; | 243 | struct gfs2_alloc *i_alloc; |
257 | u64 i_goal; /* goal block for allocations */ | 244 | u64 i_goal; /* goal block for allocations */ |
258 | struct rw_semaphore i_rw_mutex; | 245 | struct rw_semaphore i_rw_mutex; |
246 | struct list_head i_trunc_list; | ||
247 | u32 i_entries; | ||
248 | u32 i_diskflags; | ||
259 | u8 i_height; | 249 | u8 i_height; |
260 | u8 i_depth; | 250 | u8 i_depth; |
261 | }; | 251 | }; |
@@ -406,13 +396,11 @@ struct gfs2_args { | |||
406 | struct gfs2_tune { | 396 | struct gfs2_tune { |
407 | spinlock_t gt_spin; | 397 | spinlock_t gt_spin; |
408 | 398 | ||
409 | unsigned int gt_demote_secs; /* Cache retention for unheld glock */ | ||
410 | unsigned int gt_incore_log_blocks; | 399 | unsigned int gt_incore_log_blocks; |
411 | unsigned int gt_log_flush_secs; | 400 | unsigned int gt_log_flush_secs; |
412 | 401 | ||
413 | unsigned int gt_recoverd_secs; | 402 | unsigned int gt_recoverd_secs; |
414 | unsigned int gt_logd_secs; | 403 | unsigned int gt_logd_secs; |
415 | unsigned int gt_quotad_secs; | ||
416 | 404 | ||
417 | unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ | 405 | unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ |
418 | unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */ | 406 | unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */ |
@@ -488,10 +476,6 @@ struct gfs2_sbd { | |||
488 | /* Lock Stuff */ | 476 | /* Lock Stuff */ |
489 | 477 | ||
490 | struct lm_lockstruct sd_lockstruct; | 478 | struct lm_lockstruct sd_lockstruct; |
491 | struct list_head sd_reclaim_list; | ||
492 | spinlock_t sd_reclaim_lock; | ||
493 | wait_queue_head_t sd_reclaim_wq; | ||
494 | atomic_t sd_reclaim_count; | ||
495 | struct gfs2_holder sd_live_gh; | 479 | struct gfs2_holder sd_live_gh; |
496 | struct gfs2_glock *sd_rename_gl; | 480 | struct gfs2_glock *sd_rename_gl; |
497 | struct gfs2_glock *sd_trans_gl; | 481 | struct gfs2_glock *sd_trans_gl; |
@@ -519,7 +503,6 @@ struct gfs2_sbd { | |||
519 | spinlock_t sd_statfs_spin; | 503 | spinlock_t sd_statfs_spin; |
520 | struct gfs2_statfs_change_host sd_statfs_master; | 504 | struct gfs2_statfs_change_host sd_statfs_master; |
521 | struct gfs2_statfs_change_host sd_statfs_local; | 505 | struct gfs2_statfs_change_host sd_statfs_local; |
522 | unsigned long sd_statfs_sync_time; | ||
523 | 506 | ||
524 | /* Resource group stuff */ | 507 | /* Resource group stuff */ |
525 | 508 | ||
@@ -552,8 +535,6 @@ struct gfs2_sbd { | |||
552 | struct task_struct *sd_recoverd_process; | 535 | struct task_struct *sd_recoverd_process; |
553 | struct task_struct *sd_logd_process; | 536 | struct task_struct *sd_logd_process; |
554 | struct task_struct *sd_quotad_process; | 537 | struct task_struct *sd_quotad_process; |
555 | struct task_struct *sd_glockd_process[GFS2_GLOCKD_MAX]; | ||
556 | unsigned int sd_glockd_num; | ||
557 | 538 | ||
558 | /* Quota stuff */ | 539 | /* Quota stuff */ |
559 | 540 | ||
@@ -561,13 +542,15 @@ struct gfs2_sbd { | |||
561 | atomic_t sd_quota_count; | 542 | atomic_t sd_quota_count; |
562 | spinlock_t sd_quota_spin; | 543 | spinlock_t sd_quota_spin; |
563 | struct mutex sd_quota_mutex; | 544 | struct mutex sd_quota_mutex; |
545 | wait_queue_head_t sd_quota_wait; | ||
546 | struct list_head sd_trunc_list; | ||
547 | spinlock_t sd_trunc_lock; | ||
564 | 548 | ||
565 | unsigned int sd_quota_slots; | 549 | unsigned int sd_quota_slots; |
566 | unsigned int sd_quota_chunks; | 550 | unsigned int sd_quota_chunks; |
567 | unsigned char **sd_quota_bitmap; | 551 | unsigned char **sd_quota_bitmap; |
568 | 552 | ||
569 | u64 sd_quota_sync_gen; | 553 | u64 sd_quota_sync_gen; |
570 | unsigned long sd_quota_sync_time; | ||
571 | 554 | ||
572 | /* Log stuff */ | 555 | /* Log stuff */ |
573 | 556 | ||
@@ -624,10 +607,6 @@ struct gfs2_sbd { | |||
624 | struct mutex sd_freeze_lock; | 607 | struct mutex sd_freeze_lock; |
625 | unsigned int sd_freeze_count; | 608 | unsigned int sd_freeze_count; |
626 | 609 | ||
627 | /* Counters */ | ||
628 | |||
629 | atomic_t sd_reclaimed; | ||
630 | |||
631 | char sd_fsname[GFS2_FSNAME_LEN]; | 610 | char sd_fsname[GFS2_FSNAME_LEN]; |
632 | char sd_table_name[GFS2_FSNAME_LEN]; | 611 | char sd_table_name[GFS2_FSNAME_LEN]; |
633 | char sd_proto_name[GFS2_FSNAME_LEN]; | 612 | char sd_proto_name[GFS2_FSNAME_LEN]; |
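
The incore.h hunks show the structure flattening behind all the field renames in this patch: gfs2_rgrp_host and gfs2_dinode_host disappear and their members become plain fields of gfs2_rgrpd and gfs2_inode, gl_reclaim becomes gl_lru, and the per-sb reclaim/scand state is replaced by the quota-thread truncate list. A user-space sketch of the inode side of that flattening, where the structs only mirror the shape of the change (they are not the kernel definitions):

    #include <stdint.h>
    #include <stdio.h>

    struct old_dinode_host {        /* the nested struct that was removed */
        uint64_t di_size;
        uint64_t di_generation;
        uint32_t di_flags;
        uint32_t di_entries;
        uint64_t di_eattr;
    };

    struct old_inode {
        struct old_dinode_host i_di;
    };

    struct new_inode {              /* fields hoisted to the top level */
        uint64_t i_generation;
        uint64_t i_eattr;
        uint64_t i_disksize;
        uint32_t i_entries;
        uint32_t i_diskflags;
    };

    int main(void)
    {
        struct old_inode o = { .i_di.di_size = 4096 };
        struct new_inode n = { .i_disksize = 4096 };

        /* ip->i_di.di_size becomes ip->i_disksize after the flattening. */
        printf("old: %llu new: %llu\n",
               (unsigned long long)o.i_di.di_size,
               (unsigned long long)n.i_disksize);
        return 0;
    }
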
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index d57616840e89..3b87c188da41 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c | |||
@@ -32,7 +32,6 @@ | |||
32 | #include "log.h" | 32 | #include "log.h" |
33 | #include "meta_io.h" | 33 | #include "meta_io.h" |
34 | #include "ops_address.h" | 34 | #include "ops_address.h" |
35 | #include "ops_inode.h" | ||
36 | #include "quota.h" | 35 | #include "quota.h" |
37 | #include "rgrp.h" | 36 | #include "rgrp.h" |
38 | #include "trans.h" | 37 | #include "trans.h" |
@@ -248,7 +247,6 @@ fail: | |||
248 | 247 | ||
249 | static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) | 248 | static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) |
250 | { | 249 | { |
251 | struct gfs2_dinode_host *di = &ip->i_di; | ||
252 | const struct gfs2_dinode *str = buf; | 250 | const struct gfs2_dinode *str = buf; |
253 | struct timespec atime; | 251 | struct timespec atime; |
254 | u16 height, depth; | 252 | u16 height, depth; |
@@ -274,8 +272,8 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) | |||
274 | * to do that. | 272 | * to do that. |
275 | */ | 273 | */ |
276 | ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink); | 274 | ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink); |
277 | di->di_size = be64_to_cpu(str->di_size); | 275 | ip->i_disksize = be64_to_cpu(str->di_size); |
278 | i_size_write(&ip->i_inode, di->di_size); | 276 | i_size_write(&ip->i_inode, ip->i_disksize); |
279 | gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); | 277 | gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); |
280 | atime.tv_sec = be64_to_cpu(str->di_atime); | 278 | atime.tv_sec = be64_to_cpu(str->di_atime); |
281 | atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); | 279 | atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); |
@@ -287,9 +285,9 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) | |||
287 | ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec); | 285 | ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec); |
288 | 286 | ||
289 | ip->i_goal = be64_to_cpu(str->di_goal_meta); | 287 | ip->i_goal = be64_to_cpu(str->di_goal_meta); |
290 | di->di_generation = be64_to_cpu(str->di_generation); | 288 | ip->i_generation = be64_to_cpu(str->di_generation); |
291 | 289 | ||
292 | di->di_flags = be32_to_cpu(str->di_flags); | 290 | ip->i_diskflags = be32_to_cpu(str->di_flags); |
293 | gfs2_set_inode_flags(&ip->i_inode); | 291 | gfs2_set_inode_flags(&ip->i_inode); |
294 | height = be16_to_cpu(str->di_height); | 292 | height = be16_to_cpu(str->di_height); |
295 | if (unlikely(height > GFS2_MAX_META_HEIGHT)) | 293 | if (unlikely(height > GFS2_MAX_META_HEIGHT)) |
@@ -300,9 +298,9 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) | |||
300 | if (unlikely(depth > GFS2_DIR_MAX_DEPTH)) | 298 | if (unlikely(depth > GFS2_DIR_MAX_DEPTH)) |
301 | goto corrupt; | 299 | goto corrupt; |
302 | ip->i_depth = (u8)depth; | 300 | ip->i_depth = (u8)depth; |
303 | di->di_entries = be32_to_cpu(str->di_entries); | 301 | ip->i_entries = be32_to_cpu(str->di_entries); |
304 | 302 | ||
305 | di->di_eattr = be64_to_cpu(str->di_eattr); | 303 | ip->i_eattr = be64_to_cpu(str->di_eattr); |
306 | if (S_ISREG(ip->i_inode.i_mode)) | 304 | if (S_ISREG(ip->i_inode.i_mode)) |
307 | gfs2_set_aops(&ip->i_inode); | 305 | gfs2_set_aops(&ip->i_inode); |
308 | 306 | ||
@@ -388,7 +386,6 @@ int gfs2_dinode_dealloc(struct gfs2_inode *ip) | |||
388 | gfs2_free_di(rgd, ip); | 386 | gfs2_free_di(rgd, ip); |
389 | 387 | ||
390 | gfs2_trans_end(sdp); | 388 | gfs2_trans_end(sdp); |
391 | clear_bit(GLF_STICKY, &ip->i_gl->gl_flags); | ||
392 | 389 | ||
393 | out_rg_gunlock: | 390 | out_rg_gunlock: |
394 | gfs2_glock_dq_uninit(&al->al_rgd_gh); | 391 | gfs2_glock_dq_uninit(&al->al_rgd_gh); |
@@ -690,7 +687,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name, | |||
690 | return error; | 687 | return error; |
691 | } | 688 | } |
692 | 689 | ||
693 | if (dip->i_di.di_entries == (u32)-1) | 690 | if (dip->i_entries == (u32)-1) |
694 | return -EFBIG; | 691 | return -EFBIG; |
695 | if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1) | 692 | if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1) |
696 | return -EMLINK; | 693 | return -EMLINK; |
@@ -790,11 +787,11 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, | |||
790 | di->di_flags = 0; | 787 | di->di_flags = 0; |
791 | 788 | ||
792 | if (S_ISREG(mode)) { | 789 | if (S_ISREG(mode)) { |
793 | if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) || | 790 | if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) || |
794 | gfs2_tune_get(sdp, gt_new_files_jdata)) | 791 | gfs2_tune_get(sdp, gt_new_files_jdata)) |
795 | di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA); | 792 | di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA); |
796 | } else if (S_ISDIR(mode)) { | 793 | } else if (S_ISDIR(mode)) { |
797 | di->di_flags |= cpu_to_be32(dip->i_di.di_flags & | 794 | di->di_flags |= cpu_to_be32(dip->i_diskflags & |
798 | GFS2_DIF_INHERIT_JDATA); | 795 | GFS2_DIF_INHERIT_JDATA); |
799 | } | 796 | } |
800 | 797 | ||
@@ -1068,7 +1065,7 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, | |||
1068 | struct qstr dotname; | 1065 | struct qstr dotname; |
1069 | int error; | 1066 | int error; |
1070 | 1067 | ||
1071 | if (ip->i_di.di_entries != 2) { | 1068 | if (ip->i_entries != 2) { |
1072 | if (gfs2_consist_inode(ip)) | 1069 | if (gfs2_consist_inode(ip)) |
1073 | gfs2_dinode_print(ip); | 1070 | gfs2_dinode_print(ip); |
1074 | return -EIO; | 1071 | return -EIO; |
@@ -1168,7 +1165,7 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len) | |||
1168 | return error; | 1165 | return error; |
1169 | } | 1166 | } |
1170 | 1167 | ||
1171 | if (!ip->i_di.di_size) { | 1168 | if (!ip->i_disksize) { |
1172 | gfs2_consist_inode(ip); | 1169 | gfs2_consist_inode(ip); |
1173 | error = -EIO; | 1170 | error = -EIO; |
1174 | goto out; | 1171 | goto out; |
@@ -1178,7 +1175,7 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len) | |||
1178 | if (error) | 1175 | if (error) |
1179 | goto out; | 1176 | goto out; |
1180 | 1177 | ||
1181 | x = ip->i_di.di_size + 1; | 1178 | x = ip->i_disksize + 1; |
1182 | if (x > *len) { | 1179 | if (x > *len) { |
1183 | *buf = kmalloc(x, GFP_NOFS); | 1180 | *buf = kmalloc(x, GFP_NOFS); |
1184 | if (!*buf) { | 1181 | if (!*buf) { |
@@ -1242,7 +1239,6 @@ int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) | |||
1242 | 1239 | ||
1243 | void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) | 1240 | void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) |
1244 | { | 1241 | { |
1245 | const struct gfs2_dinode_host *di = &ip->i_di; | ||
1246 | struct gfs2_dinode *str = buf; | 1242 | struct gfs2_dinode *str = buf; |
1247 | 1243 | ||
1248 | str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC); | 1244 | str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC); |
@@ -1256,7 +1252,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) | |||
1256 | str->di_uid = cpu_to_be32(ip->i_inode.i_uid); | 1252 | str->di_uid = cpu_to_be32(ip->i_inode.i_uid); |
1257 | str->di_gid = cpu_to_be32(ip->i_inode.i_gid); | 1253 | str->di_gid = cpu_to_be32(ip->i_inode.i_gid); |
1258 | str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); | 1254 | str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); |
1259 | str->di_size = cpu_to_be64(di->di_size); | 1255 | str->di_size = cpu_to_be64(ip->i_disksize); |
1260 | str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); | 1256 | str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); |
1261 | str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); | 1257 | str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); |
1262 | str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); | 1258 | str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); |
@@ -1264,17 +1260,17 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) | |||
1264 | 1260 | ||
1265 | str->di_goal_meta = cpu_to_be64(ip->i_goal); | 1261 | str->di_goal_meta = cpu_to_be64(ip->i_goal); |
1266 | str->di_goal_data = cpu_to_be64(ip->i_goal); | 1262 | str->di_goal_data = cpu_to_be64(ip->i_goal); |
1267 | str->di_generation = cpu_to_be64(di->di_generation); | 1263 | str->di_generation = cpu_to_be64(ip->i_generation); |
1268 | 1264 | ||
1269 | str->di_flags = cpu_to_be32(di->di_flags); | 1265 | str->di_flags = cpu_to_be32(ip->i_diskflags); |
1270 | str->di_height = cpu_to_be16(ip->i_height); | 1266 | str->di_height = cpu_to_be16(ip->i_height); |
1271 | str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) && | 1267 | str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) && |
1272 | !(ip->i_di.di_flags & GFS2_DIF_EXHASH) ? | 1268 | !(ip->i_diskflags & GFS2_DIF_EXHASH) ? |
1273 | GFS2_FORMAT_DE : 0); | 1269 | GFS2_FORMAT_DE : 0); |
1274 | str->di_depth = cpu_to_be16(ip->i_depth); | 1270 | str->di_depth = cpu_to_be16(ip->i_depth); |
1275 | str->di_entries = cpu_to_be32(di->di_entries); | 1271 | str->di_entries = cpu_to_be32(ip->i_entries); |
1276 | 1272 | ||
1277 | str->di_eattr = cpu_to_be64(di->di_eattr); | 1273 | str->di_eattr = cpu_to_be64(ip->i_eattr); |
1278 | str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec); | 1274 | str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec); |
1279 | str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec); | 1275 | str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec); |
1280 | str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec); | 1276 | str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec); |
@@ -1282,22 +1278,21 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) | |||
1282 | 1278 | ||
1283 | void gfs2_dinode_print(const struct gfs2_inode *ip) | 1279 | void gfs2_dinode_print(const struct gfs2_inode *ip) |
1284 | { | 1280 | { |
1285 | const struct gfs2_dinode_host *di = &ip->i_di; | ||
1286 | |||
1287 | printk(KERN_INFO " no_formal_ino = %llu\n", | 1281 | printk(KERN_INFO " no_formal_ino = %llu\n", |
1288 | (unsigned long long)ip->i_no_formal_ino); | 1282 | (unsigned long long)ip->i_no_formal_ino); |
1289 | printk(KERN_INFO " no_addr = %llu\n", | 1283 | printk(KERN_INFO " no_addr = %llu\n", |
1290 | (unsigned long long)ip->i_no_addr); | 1284 | (unsigned long long)ip->i_no_addr); |
1291 | printk(KERN_INFO " di_size = %llu\n", (unsigned long long)di->di_size); | 1285 | printk(KERN_INFO " i_disksize = %llu\n", |
1286 | (unsigned long long)ip->i_disksize); | ||
1292 | printk(KERN_INFO " blocks = %llu\n", | 1287 | printk(KERN_INFO " blocks = %llu\n", |
1293 | (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode)); | 1288 | (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode)); |
1294 | printk(KERN_INFO " i_goal = %llu\n", | 1289 | printk(KERN_INFO " i_goal = %llu\n", |
1295 | (unsigned long long)ip->i_goal); | 1290 | (unsigned long long)ip->i_goal); |
1296 | printk(KERN_INFO " di_flags = 0x%.8X\n", di->di_flags); | 1291 | printk(KERN_INFO " i_diskflags = 0x%.8X\n", ip->i_diskflags); |
1297 | printk(KERN_INFO " i_height = %u\n", ip->i_height); | 1292 | printk(KERN_INFO " i_height = %u\n", ip->i_height); |
1298 | printk(KERN_INFO " i_depth = %u\n", ip->i_depth); | 1293 | printk(KERN_INFO " i_depth = %u\n", ip->i_depth); |
1299 | printk(KERN_INFO " di_entries = %u\n", di->di_entries); | 1294 | printk(KERN_INFO " i_entries = %u\n", ip->i_entries); |
1300 | printk(KERN_INFO " di_eattr = %llu\n", | 1295 | printk(KERN_INFO " i_eattr = %llu\n", |
1301 | (unsigned long long)di->di_eattr); | 1296 | (unsigned long long)ip->i_eattr); |
1302 | } | 1297 | } |
1303 | 1298 | ||
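The hunks above fold the old gfs2_dinode_host fields (di_size, di_flags, di_generation, di_entries, di_eattr) directly into struct gfs2_inode as i_disksize, i_diskflags, i_generation, i_entries and i_eattr; the on-disk conversion pattern itself is unchanged. A minimal sketch of that in/out pattern, using hypothetical structure names rather than the real gfs2 types:

#include <linux/types.h>
#include <asm/byteorder.h>

struct example_ondisk {		/* fixed-size, big-endian on disk */
	__be64 size;
	__be32 flags;
};

struct example_incore {		/* host-endian in memory */
	u64 disksize;
	u32 diskflags;
};

static void example_in(struct example_incore *ip, const struct example_ondisk *str)
{
	ip->disksize  = be64_to_cpu(str->size);
	ip->diskflags = be32_to_cpu(str->flags);
}

static void example_out(const struct example_incore *ip, struct example_ondisk *str)
{
	str->size  = cpu_to_be64(ip->disksize);
	str->flags = cpu_to_be32(ip->diskflags);
}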
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 2d43f69610a0..d5329364cdff 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h | |||
@@ -10,6 +10,7 @@ | |||
10 | #ifndef __INODE_DOT_H__ | 10 | #ifndef __INODE_DOT_H__ |
11 | #define __INODE_DOT_H__ | 11 | #define __INODE_DOT_H__ |
12 | 12 | ||
13 | #include <linux/fs.h> | ||
13 | #include "util.h" | 14 | #include "util.h" |
14 | 15 | ||
15 | static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) | 16 | static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) |
@@ -19,7 +20,7 @@ static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) | |||
19 | 20 | ||
20 | static inline int gfs2_is_jdata(const struct gfs2_inode *ip) | 21 | static inline int gfs2_is_jdata(const struct gfs2_inode *ip) |
21 | { | 22 | { |
22 | return ip->i_di.di_flags & GFS2_DIF_JDATA; | 23 | return ip->i_diskflags & GFS2_DIF_JDATA; |
23 | } | 24 | } |
24 | 25 | ||
25 | static inline int gfs2_is_writeback(const struct gfs2_inode *ip) | 26 | static inline int gfs2_is_writeback(const struct gfs2_inode *ip) |
@@ -97,5 +98,15 @@ struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); | |||
97 | void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); | 98 | void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); |
98 | void gfs2_dinode_print(const struct gfs2_inode *ip); | 99 | void gfs2_dinode_print(const struct gfs2_inode *ip); |
99 | 100 | ||
101 | extern const struct inode_operations gfs2_file_iops; | ||
102 | extern const struct inode_operations gfs2_dir_iops; | ||
103 | extern const struct inode_operations gfs2_symlink_iops; | ||
104 | extern const struct file_operations gfs2_file_fops; | ||
105 | extern const struct file_operations gfs2_dir_fops; | ||
106 | extern const struct file_operations gfs2_file_fops_nolock; | ||
107 | extern const struct file_operations gfs2_dir_fops_nolock; | ||
108 | |||
109 | extern void gfs2_set_inode_flags(struct inode *inode); | ||
110 | |||
100 | #endif /* __INODE_DOT_H__ */ | 111 | #endif /* __INODE_DOT_H__ */ |
101 | 112 | ||
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c index 0c4cbe6c8285..1aa7eb6a0226 100644 --- a/fs/gfs2/locking/dlm/mount.c +++ b/fs/gfs2/locking/dlm/mount.c | |||
@@ -194,17 +194,25 @@ out: | |||
194 | static void gdlm_recovery_done(void *lockspace, unsigned int jid, | 194 | static void gdlm_recovery_done(void *lockspace, unsigned int jid, |
195 | unsigned int message) | 195 | unsigned int message) |
196 | { | 196 | { |
197 | char env_jid[20]; | ||
198 | char env_status[20]; | ||
199 | char *envp[] = { env_jid, env_status, NULL }; | ||
197 | struct gdlm_ls *ls = lockspace; | 200 | struct gdlm_ls *ls = lockspace; |
198 | ls->recover_jid_done = jid; | 201 | ls->recover_jid_done = jid; |
199 | ls->recover_jid_status = message; | 202 | ls->recover_jid_status = message; |
200 | kobject_uevent(&ls->kobj, KOBJ_CHANGE); | 203 | sprintf(env_jid, "JID=%d", jid); |
204 | sprintf(env_status, "RECOVERY=%s", | ||
205 | message == LM_RD_SUCCESS ? "Done" : "Failed"); | ||
206 | kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp); | ||
201 | } | 207 | } |
202 | 208 | ||
203 | static void gdlm_others_may_mount(void *lockspace) | 209 | static void gdlm_others_may_mount(void *lockspace) |
204 | { | 210 | { |
211 | char *message = "FIRSTMOUNT=Done"; | ||
212 | char *envp[] = { message, NULL }; | ||
205 | struct gdlm_ls *ls = lockspace; | 213 | struct gdlm_ls *ls = lockspace; |
206 | ls->first_done = 1; | 214 | ls->first_done = 1; |
207 | kobject_uevent(&ls->kobj, KOBJ_CHANGE); | 215 | kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp); |
208 | } | 216 | } |
209 | 217 | ||
210 | /* Userspace gets the offline uevent, blocks new gfs locks on | 218 | /* Userspace gets the offline uevent, blocks new gfs locks on |
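The JID= and RECOVERY= variables added above travel to userspace with the KOBJ_CHANGE uevent. A minimal listener sketch (not part of the patch; it assumes the standard kernel uevent netlink multicast group and trims error handling) that prints the recovery-related variables:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>

int main(void)
{
	struct sockaddr_nl nl = {
		.nl_family = AF_NETLINK,
		.nl_groups = 1,		/* kernel uevent multicast group */
	};
	char buf[4096];
	int s = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_KOBJECT_UEVENT);

	if (s < 0 || bind(s, (struct sockaddr *)&nl, sizeof(nl)) < 0)
		return 1;

	for (;;) {
		ssize_t len = recv(s, buf, sizeof(buf) - 1, 0);
		ssize_t off;

		if (len <= 0)
			break;
		buf[len] = '\0';
		/* Payload: "action@devpath" followed by NUL-separated
		 * KEY=VALUE strings, e.g. LOCKTABLE=..., JID=0, RECOVERY=Done. */
		for (off = 0; off < len; off += strlen(buf + off) + 1)
			if (!strncmp(buf + off, "JID=", 4) ||
			    !strncmp(buf + off, "RECOVERY=", 9) ||
			    !strncmp(buf + off, "LOCKTABLE=", 10))
				printf("%s\n", buf + off);
	}
	close(s);
	return 0;
}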
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c index 4ec571c3d8a9..9b7edcf7bd49 100644 --- a/fs/gfs2/locking/dlm/sysfs.c +++ b/fs/gfs2/locking/dlm/sysfs.c | |||
@@ -195,9 +195,23 @@ void gdlm_kobject_release(struct gdlm_ls *ls) | |||
195 | kobject_put(&ls->kobj); | 195 | kobject_put(&ls->kobj); |
196 | } | 196 | } |
197 | 197 | ||
198 | static int gdlm_uevent(struct kset *kset, struct kobject *kobj, | ||
199 | struct kobj_uevent_env *env) | ||
200 | { | ||
201 | struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj); | ||
202 | add_uevent_var(env, "LOCKTABLE=%s:%s", ls->clustername, ls->fsname); | ||
203 | add_uevent_var(env, "LOCKPROTO=lock_dlm"); | ||
204 | return 0; | ||
205 | } | ||
206 | |||
207 | static struct kset_uevent_ops gdlm_uevent_ops = { | ||
208 | .uevent = gdlm_uevent, | ||
209 | }; | ||
210 | |||
211 | |||
198 | int gdlm_sysfs_init(void) | 212 | int gdlm_sysfs_init(void) |
199 | { | 213 | { |
200 | gdlm_kset = kset_create_and_add("lock_dlm", NULL, kernel_kobj); | 214 | gdlm_kset = kset_create_and_add("lock_dlm", &gdlm_uevent_ops, kernel_kobj); |
201 | if (!gdlm_kset) { | 215 | if (!gdlm_kset) { |
202 | printk(KERN_WARNING "%s: can not create kset\n", __func__); | 216 | printk(KERN_WARNING "%s: can not create kset\n", __func__); |
203 | return -ENOMEM; | 217 | return -ENOMEM; |
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index bb2cc303ac29..7cacfde32194 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c | |||
@@ -19,7 +19,7 @@ | |||
19 | 19 | ||
20 | #include "gfs2.h" | 20 | #include "gfs2.h" |
21 | #include "incore.h" | 21 | #include "incore.h" |
22 | #include "ops_fstype.h" | 22 | #include "super.h" |
23 | #include "sys.h" | 23 | #include "sys.h" |
24 | #include "util.h" | 24 | #include "util.h" |
25 | #include "glock.h" | 25 | #include "glock.h" |
@@ -30,6 +30,7 @@ static void gfs2_init_inode_once(void *foo) | |||
30 | 30 | ||
31 | inode_init_once(&ip->i_inode); | 31 | inode_init_once(&ip->i_inode); |
32 | init_rwsem(&ip->i_rw_mutex); | 32 | init_rwsem(&ip->i_rw_mutex); |
33 | INIT_LIST_HEAD(&ip->i_trunc_list); | ||
33 | ip->i_alloc = NULL; | 34 | ip->i_alloc = NULL; |
34 | } | 35 | } |
35 | 36 | ||
@@ -42,7 +43,7 @@ static void gfs2_init_glock_once(void *foo) | |||
42 | INIT_LIST_HEAD(&gl->gl_holders); | 43 | INIT_LIST_HEAD(&gl->gl_holders); |
43 | gl->gl_lvb = NULL; | 44 | gl->gl_lvb = NULL; |
44 | atomic_set(&gl->gl_lvb_count, 0); | 45 | atomic_set(&gl->gl_lvb_count, 0); |
45 | INIT_LIST_HEAD(&gl->gl_reclaim); | 46 | INIT_LIST_HEAD(&gl->gl_lru); |
46 | INIT_LIST_HEAD(&gl->gl_ail_list); | 47 | INIT_LIST_HEAD(&gl->gl_ail_list); |
47 | atomic_set(&gl->gl_ail_count, 0); | 48 | atomic_set(&gl->gl_ail_count, 0); |
48 | } | 49 | } |
@@ -93,6 +94,12 @@ static int __init init_gfs2_fs(void) | |||
93 | if (!gfs2_rgrpd_cachep) | 94 | if (!gfs2_rgrpd_cachep) |
94 | goto fail; | 95 | goto fail; |
95 | 96 | ||
97 | gfs2_quotad_cachep = kmem_cache_create("gfs2_quotad", | ||
98 | sizeof(struct gfs2_quota_data), | ||
99 | 0, 0, NULL); | ||
100 | if (!gfs2_quotad_cachep) | ||
101 | goto fail; | ||
102 | |||
96 | error = register_filesystem(&gfs2_fs_type); | 103 | error = register_filesystem(&gfs2_fs_type); |
97 | if (error) | 104 | if (error) |
98 | goto fail; | 105 | goto fail; |
@@ -112,6 +119,9 @@ fail_unregister: | |||
112 | fail: | 119 | fail: |
113 | gfs2_glock_exit(); | 120 | gfs2_glock_exit(); |
114 | 121 | ||
122 | if (gfs2_quotad_cachep) | ||
123 | kmem_cache_destroy(gfs2_quotad_cachep); | ||
124 | |||
115 | if (gfs2_rgrpd_cachep) | 125 | if (gfs2_rgrpd_cachep) |
116 | kmem_cache_destroy(gfs2_rgrpd_cachep); | 126 | kmem_cache_destroy(gfs2_rgrpd_cachep); |
117 | 127 | ||
@@ -140,6 +150,7 @@ static void __exit exit_gfs2_fs(void) | |||
140 | unregister_filesystem(&gfs2_fs_type); | 150 | unregister_filesystem(&gfs2_fs_type); |
141 | unregister_filesystem(&gfs2meta_fs_type); | 151 | unregister_filesystem(&gfs2meta_fs_type); |
142 | 152 | ||
153 | kmem_cache_destroy(gfs2_quotad_cachep); | ||
143 | kmem_cache_destroy(gfs2_rgrpd_cachep); | 154 | kmem_cache_destroy(gfs2_rgrpd_cachep); |
144 | kmem_cache_destroy(gfs2_bufdata_cachep); | 155 | kmem_cache_destroy(gfs2_bufdata_cachep); |
145 | kmem_cache_destroy(gfs2_inode_cachep); | 156 | kmem_cache_destroy(gfs2_inode_cachep); |
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c index f96eb90a2cfa..3cb0a44ba023 100644 --- a/fs/gfs2/mount.c +++ b/fs/gfs2/mount.c | |||
@@ -32,7 +32,6 @@ enum { | |||
32 | Opt_debug, | 32 | Opt_debug, |
33 | Opt_nodebug, | 33 | Opt_nodebug, |
34 | Opt_upgrade, | 34 | Opt_upgrade, |
35 | Opt_num_glockd, | ||
36 | Opt_acl, | 35 | Opt_acl, |
37 | Opt_noacl, | 36 | Opt_noacl, |
38 | Opt_quota_off, | 37 | Opt_quota_off, |
@@ -57,7 +56,6 @@ static const match_table_t tokens = { | |||
57 | {Opt_debug, "debug"}, | 56 | {Opt_debug, "debug"}, |
58 | {Opt_nodebug, "nodebug"}, | 57 | {Opt_nodebug, "nodebug"}, |
59 | {Opt_upgrade, "upgrade"}, | 58 | {Opt_upgrade, "upgrade"}, |
60 | {Opt_num_glockd, "num_glockd=%d"}, | ||
61 | {Opt_acl, "acl"}, | 59 | {Opt_acl, "acl"}, |
62 | {Opt_noacl, "noacl"}, | 60 | {Opt_noacl, "noacl"}, |
63 | {Opt_quota_off, "quota=off"}, | 61 | {Opt_quota_off, "quota=off"}, |
@@ -87,16 +85,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount) | |||
87 | int error = 0; | 85 | int error = 0; |
88 | 86 | ||
89 | if (!remount) { | 87 | if (!remount) { |
90 | /* If someone preloaded options, use those instead */ | ||
91 | spin_lock(&gfs2_sys_margs_lock); | ||
92 | if (gfs2_sys_margs) { | ||
93 | data = gfs2_sys_margs; | ||
94 | gfs2_sys_margs = NULL; | ||
95 | } | ||
96 | spin_unlock(&gfs2_sys_margs_lock); | ||
97 | |||
98 | /* Set some defaults */ | 88 | /* Set some defaults */ |
99 | args->ar_num_glockd = GFS2_GLOCKD_DEFAULT; | ||
100 | args->ar_quota = GFS2_QUOTA_DEFAULT; | 89 | args->ar_quota = GFS2_QUOTA_DEFAULT; |
101 | args->ar_data = GFS2_DATA_DEFAULT; | 90 | args->ar_data = GFS2_DATA_DEFAULT; |
102 | } | 91 | } |
@@ -105,7 +94,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount) | |||
105 | process them */ | 94 | process them */ |
106 | 95 | ||
107 | for (options = data; (o = strsep(&options, ",")); ) { | 96 | for (options = data; (o = strsep(&options, ",")); ) { |
108 | int token, option; | 97 | int token; |
109 | substring_t tmp[MAX_OPT_ARGS]; | 98 | substring_t tmp[MAX_OPT_ARGS]; |
110 | 99 | ||
111 | if (!*o) | 100 | if (!*o) |
@@ -196,22 +185,6 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount) | |||
196 | goto cant_remount; | 185 | goto cant_remount; |
197 | args->ar_upgrade = 1; | 186 | args->ar_upgrade = 1; |
198 | break; | 187 | break; |
199 | case Opt_num_glockd: | ||
200 | if ((error = match_int(&tmp[0], &option))) { | ||
201 | fs_info(sdp, "problem getting num_glockd\n"); | ||
202 | goto out_error; | ||
203 | } | ||
204 | |||
205 | if (remount && option != args->ar_num_glockd) | ||
206 | goto cant_remount; | ||
207 | if (!option || option > GFS2_GLOCKD_MAX) { | ||
208 | fs_info(sdp, "0 < num_glockd <= %u (not %u)\n", | ||
209 | GFS2_GLOCKD_MAX, option); | ||
210 | error = -EINVAL; | ||
211 | goto out_error; | ||
212 | } | ||
213 | args->ar_num_glockd = option; | ||
214 | break; | ||
215 | case Opt_acl: | 188 | case Opt_acl: |
216 | args->ar_posix_acl = 1; | 189 | args->ar_posix_acl = 1; |
217 | sdp->sd_vfs->s_flags |= MS_POSIXACL; | 190 | sdp->sd_vfs->s_flags |= MS_POSIXACL; |
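The option table above is driven by the kernel's strsep()/match_token() parser, which is why dropping num_glockd only needs the enum entry, the table row and the switch case removed. A minimal sketch of that parsing loop, with illustrative option names rather than the real gfs2 table:

#include <linux/parser.h>
#include <linux/string.h>
#include <linux/errno.h>

enum { Opt_quota_on, Opt_quota_off, Opt_err };

static const match_table_t example_tokens = {
	{Opt_quota_on,  "quota=on"},
	{Opt_quota_off, "quota=off"},
	{Opt_err,       NULL}
};

static int example_parse(char *data, int *quota)
{
	substring_t tmp[MAX_OPT_ARGS];
	char *o;

	while ((o = strsep(&data, ",")) != NULL) {
		if (!*o)
			continue;
		switch (match_token(o, example_tokens, tmp)) {
		case Opt_quota_on:
			*quota = 1;
			break;
		case Opt_quota_off:
			*quota = 0;
			break;
		default:
			return -EINVAL;	/* unknown mount option */
		}
	}
	return 0;
}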
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index 15f710f2d4da..4ddab67867eb 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c | |||
@@ -210,25 +210,23 @@ static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc | |||
210 | { | 210 | { |
211 | struct inode *inode = page->mapping->host; | 211 | struct inode *inode = page->mapping->host; |
212 | struct gfs2_sbd *sdp = GFS2_SB(inode); | 212 | struct gfs2_sbd *sdp = GFS2_SB(inode); |
213 | int error; | 213 | int ret; |
214 | int done_trans = 0; | 214 | int done_trans = 0; |
215 | 215 | ||
216 | error = gfs2_writepage_common(page, wbc); | ||
217 | if (error <= 0) | ||
218 | return error; | ||
219 | |||
220 | if (PageChecked(page)) { | 216 | if (PageChecked(page)) { |
221 | if (wbc->sync_mode != WB_SYNC_ALL) | 217 | if (wbc->sync_mode != WB_SYNC_ALL) |
222 | goto out_ignore; | 218 | goto out_ignore; |
223 | error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0); | 219 | ret = gfs2_trans_begin(sdp, RES_DINODE + 1, 0); |
224 | if (error) | 220 | if (ret) |
225 | goto out_ignore; | 221 | goto out_ignore; |
226 | done_trans = 1; | 222 | done_trans = 1; |
227 | } | 223 | } |
228 | error = __gfs2_jdata_writepage(page, wbc); | 224 | ret = gfs2_writepage_common(page, wbc); |
225 | if (ret > 0) | ||
226 | ret = __gfs2_jdata_writepage(page, wbc); | ||
229 | if (done_trans) | 227 | if (done_trans) |
230 | gfs2_trans_end(sdp); | 228 | gfs2_trans_end(sdp); |
231 | return error; | 229 | return ret; |
232 | 230 | ||
233 | out_ignore: | 231 | out_ignore: |
234 | redirty_page_for_writepage(wbc, page); | 232 | redirty_page_for_writepage(wbc, page); |
@@ -453,8 +451,8 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) | |||
453 | 451 | ||
454 | kaddr = kmap_atomic(page, KM_USER0); | 452 | kaddr = kmap_atomic(page, KM_USER0); |
455 | memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), | 453 | memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), |
456 | ip->i_di.di_size); | 454 | ip->i_disksize); |
457 | memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size); | 455 | memset(kaddr + ip->i_disksize, 0, PAGE_CACHE_SIZE - ip->i_disksize); |
458 | kunmap_atomic(kaddr, KM_USER0); | 456 | kunmap_atomic(kaddr, KM_USER0); |
459 | flush_dcache_page(page); | 457 | flush_dcache_page(page); |
460 | brelse(dibh); | 458 | brelse(dibh); |
@@ -627,7 +625,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, | |||
627 | { | 625 | { |
628 | struct gfs2_inode *ip = GFS2_I(mapping->host); | 626 | struct gfs2_inode *ip = GFS2_I(mapping->host); |
629 | struct gfs2_sbd *sdp = GFS2_SB(mapping->host); | 627 | struct gfs2_sbd *sdp = GFS2_SB(mapping->host); |
630 | unsigned int data_blocks, ind_blocks, rblocks; | 628 | unsigned int data_blocks = 0, ind_blocks = 0, rblocks; |
631 | int alloc_required; | 629 | int alloc_required; |
632 | int error = 0; | 630 | int error = 0; |
633 | struct gfs2_alloc *al; | 631 | struct gfs2_alloc *al; |
@@ -641,11 +639,13 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, | |||
641 | if (unlikely(error)) | 639 | if (unlikely(error)) |
642 | goto out_uninit; | 640 | goto out_uninit; |
643 | 641 | ||
644 | gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks); | ||
645 | error = gfs2_write_alloc_required(ip, pos, len, &alloc_required); | 642 | error = gfs2_write_alloc_required(ip, pos, len, &alloc_required); |
646 | if (error) | 643 | if (error) |
647 | goto out_unlock; | 644 | goto out_unlock; |
648 | 645 | ||
646 | if (alloc_required || gfs2_is_jdata(ip)) | ||
647 | gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks); | ||
648 | |||
649 | if (alloc_required) { | 649 | if (alloc_required) { |
650 | al = gfs2_alloc_get(ip); | 650 | al = gfs2_alloc_get(ip); |
651 | if (!al) { | 651 | if (!al) { |
@@ -675,6 +675,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, | |||
675 | goto out_trans_fail; | 675 | goto out_trans_fail; |
676 | 676 | ||
677 | error = -ENOMEM; | 677 | error = -ENOMEM; |
678 | flags |= AOP_FLAG_NOFS; | ||
678 | page = grab_cache_page_write_begin(mapping, index, flags); | 679 | page = grab_cache_page_write_begin(mapping, index, flags); |
679 | *pagep = page; | 680 | *pagep = page; |
680 | if (unlikely(!page)) | 681 | if (unlikely(!page)) |
@@ -782,7 +783,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, | |||
782 | 783 | ||
783 | if (inode->i_size < to) { | 784 | if (inode->i_size < to) { |
784 | i_size_write(inode, to); | 785 | i_size_write(inode, to); |
785 | ip->i_di.di_size = inode->i_size; | 786 | ip->i_disksize = inode->i_size; |
786 | di->di_size = cpu_to_be64(inode->i_size); | 787 | di->di_size = cpu_to_be64(inode->i_size); |
787 | mark_inode_dirty(inode); | 788 | mark_inode_dirty(inode); |
788 | } | 789 | } |
@@ -847,9 +848,9 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, | |||
847 | 848 | ||
848 | ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); | 849 | ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); |
849 | 850 | ||
850 | if (likely(ret >= 0) && (inode->i_size > ip->i_di.di_size)) { | 851 | if (likely(ret >= 0) && (inode->i_size > ip->i_disksize)) { |
851 | di = (struct gfs2_dinode *)dibh->b_data; | 852 | di = (struct gfs2_dinode *)dibh->b_data; |
852 | ip->i_di.di_size = inode->i_size; | 853 | ip->i_disksize = inode->i_size; |
853 | di->di_size = cpu_to_be64(inode->i_size); | 854 | di->di_size = cpu_to_be64(inode->i_size); |
854 | mark_inode_dirty(inode); | 855 | mark_inode_dirty(inode); |
855 | } | 856 | } |
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c index 4a5e676b4420..c2ad36330ca3 100644 --- a/fs/gfs2/ops_dentry.c +++ b/fs/gfs2/ops_dentry.c | |||
@@ -19,7 +19,7 @@ | |||
19 | #include "incore.h" | 19 | #include "incore.h" |
20 | #include "dir.h" | 20 | #include "dir.h" |
21 | #include "glock.h" | 21 | #include "glock.h" |
22 | #include "ops_dentry.h" | 22 | #include "super.h" |
23 | #include "util.h" | 23 | #include "util.h" |
24 | #include "inode.h" | 24 | #include "inode.h" |
25 | 25 | ||
diff --git a/fs/gfs2/ops_dentry.h b/fs/gfs2/ops_dentry.h deleted file mode 100644 index 5caa3db4d3f5..000000000000 --- a/fs/gfs2/ops_dentry.h +++ /dev/null | |||
@@ -1,17 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | ||
3 | * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. | ||
4 | * | ||
5 | * This copyrighted material is made available to anyone wishing to use, | ||
6 | * modify, copy, or redistribute it subject to the terms and conditions | ||
7 | * of the GNU General Public License version 2. | ||
8 | */ | ||
9 | |||
10 | #ifndef __OPS_DENTRY_DOT_H__ | ||
11 | #define __OPS_DENTRY_DOT_H__ | ||
12 | |||
13 | #include <linux/dcache.h> | ||
14 | |||
15 | extern struct dentry_operations gfs2_dops; | ||
16 | |||
17 | #endif /* __OPS_DENTRY_DOT_H__ */ | ||
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c index bbb8c36403a9..7fdeb14ddd1a 100644 --- a/fs/gfs2/ops_export.c +++ b/fs/gfs2/ops_export.c | |||
@@ -22,8 +22,7 @@ | |||
22 | #include "glock.h" | 22 | #include "glock.h" |
23 | #include "glops.h" | 23 | #include "glops.h" |
24 | #include "inode.h" | 24 | #include "inode.h" |
25 | #include "ops_dentry.h" | 25 | #include "super.h" |
26 | #include "ops_fstype.h" | ||
27 | #include "rgrp.h" | 26 | #include "rgrp.h" |
28 | #include "util.h" | 27 | #include "util.h" |
29 | 28 | ||
@@ -214,7 +213,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, | |||
214 | } | 213 | } |
215 | 214 | ||
216 | error = -EIO; | 215 | error = -EIO; |
217 | if (GFS2_I(inode)->i_di.di_flags & GFS2_DIF_SYSTEM) { | 216 | if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) { |
218 | iput(inode); | 217 | iput(inode); |
219 | goto fail; | 218 | goto fail; |
220 | } | 219 | } |
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index 3a747f8e2188..93fe41b67f97 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c | |||
@@ -39,7 +39,6 @@ | |||
39 | #include "util.h" | 39 | #include "util.h" |
40 | #include "eaops.h" | 40 | #include "eaops.h" |
41 | #include "ops_address.h" | 41 | #include "ops_address.h" |
42 | #include "ops_inode.h" | ||
43 | 42 | ||
44 | /** | 43 | /** |
45 | * gfs2_llseek - seek to a location in a file | 44 | * gfs2_llseek - seek to a location in a file |
@@ -158,8 +157,8 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr) | |||
158 | if (error) | 157 | if (error) |
159 | return error; | 158 | return error; |
160 | 159 | ||
161 | fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags); | 160 | fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags); |
162 | if (!S_ISDIR(inode->i_mode) && ip->i_di.di_flags & GFS2_DIF_JDATA) | 161 | if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA) |
163 | fsflags |= FS_JOURNAL_DATA_FL; | 162 | fsflags |= FS_JOURNAL_DATA_FL; |
164 | if (put_user(fsflags, ptr)) | 163 | if (put_user(fsflags, ptr)) |
165 | error = -EFAULT; | 164 | error = -EFAULT; |
@@ -172,17 +171,16 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr) | |||
172 | void gfs2_set_inode_flags(struct inode *inode) | 171 | void gfs2_set_inode_flags(struct inode *inode) |
173 | { | 172 | { |
174 | struct gfs2_inode *ip = GFS2_I(inode); | 173 | struct gfs2_inode *ip = GFS2_I(inode); |
175 | struct gfs2_dinode_host *di = &ip->i_di; | ||
176 | unsigned int flags = inode->i_flags; | 174 | unsigned int flags = inode->i_flags; |
177 | 175 | ||
178 | flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); | 176 | flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); |
179 | if (di->di_flags & GFS2_DIF_IMMUTABLE) | 177 | if (ip->i_diskflags & GFS2_DIF_IMMUTABLE) |
180 | flags |= S_IMMUTABLE; | 178 | flags |= S_IMMUTABLE; |
181 | if (di->di_flags & GFS2_DIF_APPENDONLY) | 179 | if (ip->i_diskflags & GFS2_DIF_APPENDONLY) |
182 | flags |= S_APPEND; | 180 | flags |= S_APPEND; |
183 | if (di->di_flags & GFS2_DIF_NOATIME) | 181 | if (ip->i_diskflags & GFS2_DIF_NOATIME) |
184 | flags |= S_NOATIME; | 182 | flags |= S_NOATIME; |
185 | if (di->di_flags & GFS2_DIF_SYNC) | 183 | if (ip->i_diskflags & GFS2_DIF_SYNC) |
186 | flags |= S_SYNC; | 184 | flags |= S_SYNC; |
187 | inode->i_flags = flags; | 185 | inode->i_flags = flags; |
188 | } | 186 | } |
@@ -221,7 +219,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) | |||
221 | if (error) | 219 | if (error) |
222 | goto out_drop_write; | 220 | goto out_drop_write; |
223 | 221 | ||
224 | flags = ip->i_di.di_flags; | 222 | flags = ip->i_diskflags; |
225 | new_flags = (flags & ~mask) | (reqflags & mask); | 223 | new_flags = (flags & ~mask) | (reqflags & mask); |
226 | if ((new_flags ^ flags) == 0) | 224 | if ((new_flags ^ flags) == 0) |
227 | goto out; | 225 | goto out; |
@@ -260,7 +258,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) | |||
260 | if (error) | 258 | if (error) |
261 | goto out_trans_end; | 259 | goto out_trans_end; |
262 | gfs2_trans_add_bh(ip->i_gl, bh, 1); | 260 | gfs2_trans_add_bh(ip->i_gl, bh, 1); |
263 | ip->i_di.di_flags = new_flags; | 261 | ip->i_diskflags = new_flags; |
264 | gfs2_dinode_out(ip, bh->b_data); | 262 | gfs2_dinode_out(ip, bh->b_data); |
265 | brelse(bh); | 263 | brelse(bh); |
266 | gfs2_set_inode_flags(inode); | 264 | gfs2_set_inode_flags(inode); |
@@ -344,7 +342,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) | |||
344 | struct gfs2_inode *ip = GFS2_I(inode); | 342 | struct gfs2_inode *ip = GFS2_I(inode); |
345 | struct gfs2_sbd *sdp = GFS2_SB(inode); | 343 | struct gfs2_sbd *sdp = GFS2_SB(inode); |
346 | unsigned long last_index; | 344 | unsigned long last_index; |
347 | u64 pos = page->index << (PAGE_CACHE_SIZE - inode->i_blkbits); | 345 | u64 pos = page->index << PAGE_CACHE_SHIFT; |
348 | unsigned int data_blocks, ind_blocks, rblocks; | 346 | unsigned int data_blocks, ind_blocks, rblocks; |
349 | int alloc_required = 0; | 347 | int alloc_required = 0; |
350 | struct gfs2_holder gh; | 348 | struct gfs2_holder gh; |
@@ -357,7 +355,6 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) | |||
357 | goto out; | 355 | goto out; |
358 | 356 | ||
359 | set_bit(GIF_SW_PAGED, &ip->i_flags); | 357 | set_bit(GIF_SW_PAGED, &ip->i_flags); |
360 | gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); | ||
361 | ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required); | 358 | ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required); |
362 | if (ret || !alloc_required) | 359 | if (ret || !alloc_required) |
363 | goto out_unlock; | 360 | goto out_unlock; |
@@ -369,6 +366,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) | |||
369 | ret = gfs2_quota_lock_check(ip); | 366 | ret = gfs2_quota_lock_check(ip); |
370 | if (ret) | 367 | if (ret) |
371 | goto out_alloc_put; | 368 | goto out_alloc_put; |
369 | gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); | ||
372 | al->al_requested = data_blocks + ind_blocks; | 370 | al->al_requested = data_blocks + ind_blocks; |
373 | ret = gfs2_inplace_reserve(ip); | 371 | ret = gfs2_inplace_reserve(ip); |
374 | if (ret) | 372 | if (ret) |
@@ -479,7 +477,7 @@ static int gfs2_open(struct inode *inode, struct file *file) | |||
479 | goto fail; | 477 | goto fail; |
480 | 478 | ||
481 | if (!(file->f_flags & O_LARGEFILE) && | 479 | if (!(file->f_flags & O_LARGEFILE) && |
482 | ip->i_di.di_size > MAX_NON_LFS) { | 480 | ip->i_disksize > MAX_NON_LFS) { |
483 | error = -EOVERFLOW; | 481 | error = -EOVERFLOW; |
484 | goto fail_gunlock; | 482 | goto fail_gunlock; |
485 | } | 483 | } |
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index b117fcf2c4f5..f91eebdde581 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c | |||
@@ -22,20 +22,18 @@ | |||
22 | #include "gfs2.h" | 22 | #include "gfs2.h" |
23 | #include "incore.h" | 23 | #include "incore.h" |
24 | #include "bmap.h" | 24 | #include "bmap.h" |
25 | #include "daemon.h" | ||
26 | #include "glock.h" | 25 | #include "glock.h" |
27 | #include "glops.h" | 26 | #include "glops.h" |
28 | #include "inode.h" | 27 | #include "inode.h" |
29 | #include "mount.h" | 28 | #include "mount.h" |
30 | #include "ops_fstype.h" | ||
31 | #include "ops_dentry.h" | ||
32 | #include "ops_super.h" | ||
33 | #include "recovery.h" | 29 | #include "recovery.h" |
34 | #include "rgrp.h" | 30 | #include "rgrp.h" |
35 | #include "super.h" | 31 | #include "super.h" |
36 | #include "sys.h" | 32 | #include "sys.h" |
37 | #include "util.h" | 33 | #include "util.h" |
38 | #include "log.h" | 34 | #include "log.h" |
35 | #include "quota.h" | ||
36 | #include "dir.h" | ||
39 | 37 | ||
40 | #define DO 0 | 38 | #define DO 0 |
41 | #define UNDO 1 | 39 | #define UNDO 1 |
@@ -58,12 +56,10 @@ static void gfs2_tune_init(struct gfs2_tune *gt) | |||
58 | { | 56 | { |
59 | spin_lock_init(>->gt_spin); | 57 | spin_lock_init(>->gt_spin); |
60 | 58 | ||
61 | gt->gt_demote_secs = 300; | ||
62 | gt->gt_incore_log_blocks = 1024; | 59 | gt->gt_incore_log_blocks = 1024; |
63 | gt->gt_log_flush_secs = 60; | 60 | gt->gt_log_flush_secs = 60; |
64 | gt->gt_recoverd_secs = 60; | 61 | gt->gt_recoverd_secs = 60; |
65 | gt->gt_logd_secs = 1; | 62 | gt->gt_logd_secs = 1; |
66 | gt->gt_quotad_secs = 5; | ||
67 | gt->gt_quota_simul_sync = 64; | 63 | gt->gt_quota_simul_sync = 64; |
68 | gt->gt_quota_warn_period = 10; | 64 | gt->gt_quota_warn_period = 10; |
69 | gt->gt_quota_scale_num = 1; | 65 | gt->gt_quota_scale_num = 1; |
@@ -91,10 +87,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) | |||
91 | 87 | ||
92 | gfs2_tune_init(&sdp->sd_tune); | 88 | gfs2_tune_init(&sdp->sd_tune); |
93 | 89 | ||
94 | INIT_LIST_HEAD(&sdp->sd_reclaim_list); | ||
95 | spin_lock_init(&sdp->sd_reclaim_lock); | ||
96 | init_waitqueue_head(&sdp->sd_reclaim_wq); | ||
97 | |||
98 | mutex_init(&sdp->sd_inum_mutex); | 90 | mutex_init(&sdp->sd_inum_mutex); |
99 | spin_lock_init(&sdp->sd_statfs_spin); | 91 | spin_lock_init(&sdp->sd_statfs_spin); |
100 | 92 | ||
@@ -110,6 +102,9 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) | |||
110 | INIT_LIST_HEAD(&sdp->sd_quota_list); | 102 | INIT_LIST_HEAD(&sdp->sd_quota_list); |
111 | spin_lock_init(&sdp->sd_quota_spin); | 103 | spin_lock_init(&sdp->sd_quota_spin); |
112 | mutex_init(&sdp->sd_quota_mutex); | 104 | mutex_init(&sdp->sd_quota_mutex); |
105 | init_waitqueue_head(&sdp->sd_quota_wait); | ||
106 | INIT_LIST_HEAD(&sdp->sd_trunc_list); | ||
107 | spin_lock_init(&sdp->sd_trunc_lock); | ||
113 | 108 | ||
114 | spin_lock_init(&sdp->sd_log_lock); | 109 | spin_lock_init(&sdp->sd_log_lock); |
115 | 110 | ||
@@ -443,24 +438,11 @@ out: | |||
443 | static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh, | 438 | static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh, |
444 | int undo) | 439 | int undo) |
445 | { | 440 | { |
446 | struct task_struct *p; | ||
447 | int error = 0; | 441 | int error = 0; |
448 | 442 | ||
449 | if (undo) | 443 | if (undo) |
450 | goto fail_trans; | 444 | goto fail_trans; |
451 | 445 | ||
452 | for (sdp->sd_glockd_num = 0; | ||
453 | sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd; | ||
454 | sdp->sd_glockd_num++) { | ||
455 | p = kthread_run(gfs2_glockd, sdp, "gfs2_glockd"); | ||
456 | error = IS_ERR(p); | ||
457 | if (error) { | ||
458 | fs_err(sdp, "can't start glockd thread: %d\n", error); | ||
459 | goto fail; | ||
460 | } | ||
461 | sdp->sd_glockd_process[sdp->sd_glockd_num] = p; | ||
462 | } | ||
463 | |||
464 | error = gfs2_glock_nq_num(sdp, | 446 | error = gfs2_glock_nq_num(sdp, |
465 | GFS2_MOUNT_LOCK, &gfs2_nondisk_glops, | 447 | GFS2_MOUNT_LOCK, &gfs2_nondisk_glops, |
466 | LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE, | 448 | LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE, |
@@ -493,7 +475,6 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh, | |||
493 | fs_err(sdp, "can't create transaction glock: %d\n", error); | 475 | fs_err(sdp, "can't create transaction glock: %d\n", error); |
494 | goto fail_rename; | 476 | goto fail_rename; |
495 | } | 477 | } |
496 | set_bit(GLF_STICKY, &sdp->sd_trans_gl->gl_flags); | ||
497 | 478 | ||
498 | return 0; | 479 | return 0; |
499 | 480 | ||
@@ -506,9 +487,6 @@ fail_live: | |||
506 | fail_mount: | 487 | fail_mount: |
507 | gfs2_glock_dq_uninit(mount_gh); | 488 | gfs2_glock_dq_uninit(mount_gh); |
508 | fail: | 489 | fail: |
509 | while (sdp->sd_glockd_num--) | ||
510 | kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]); | ||
511 | |||
512 | return error; | 490 | return error; |
513 | } | 491 | } |
514 | 492 | ||
@@ -620,7 +598,7 @@ static int map_journal_extents(struct gfs2_sbd *sdp) | |||
620 | 598 | ||
621 | prev_db = 0; | 599 | prev_db = 0; |
622 | 600 | ||
623 | for (lb = 0; lb < ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; lb++) { | 601 | for (lb = 0; lb < ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; lb++) { |
624 | bh.b_state = 0; | 602 | bh.b_state = 0; |
625 | bh.b_blocknr = 0; | 603 | bh.b_blocknr = 0; |
626 | bh.b_size = 1 << ip->i_inode.i_blkbits; | 604 | bh.b_size = 1 << ip->i_inode.i_blkbits; |
@@ -661,6 +639,72 @@ static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp) | |||
661 | sdp->sd_lockstruct.ls_lockspace); | 639 | sdp->sd_lockstruct.ls_lockspace); |
662 | } | 640 | } |
663 | 641 | ||
642 | /** | ||
643 | * gfs2_jindex_hold - Grab a lock on the jindex | ||
644 | * @sdp: The GFS2 superblock | ||
645 | * @ji_gh: the holder for the jindex glock | ||
646 | * | ||
647 | * Returns: errno | ||
648 | */ | ||
649 | |||
650 | static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) | ||
651 | { | ||
652 | struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex); | ||
653 | struct qstr name; | ||
654 | char buf[20]; | ||
655 | struct gfs2_jdesc *jd; | ||
656 | int error; | ||
657 | |||
658 | name.name = buf; | ||
659 | |||
660 | mutex_lock(&sdp->sd_jindex_mutex); | ||
661 | |||
662 | for (;;) { | ||
663 | error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh); | ||
664 | if (error) | ||
665 | break; | ||
666 | |||
667 | name.len = sprintf(buf, "journal%u", sdp->sd_journals); | ||
668 | name.hash = gfs2_disk_hash(name.name, name.len); | ||
669 | |||
670 | error = gfs2_dir_check(sdp->sd_jindex, &name, NULL); | ||
671 | if (error == -ENOENT) { | ||
672 | error = 0; | ||
673 | break; | ||
674 | } | ||
675 | |||
676 | gfs2_glock_dq_uninit(ji_gh); | ||
677 | |||
678 | if (error) | ||
679 | break; | ||
680 | |||
681 | error = -ENOMEM; | ||
682 | jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL); | ||
683 | if (!jd) | ||
684 | break; | ||
685 | |||
686 | INIT_LIST_HEAD(&jd->extent_list); | ||
687 | jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); | ||
688 | if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { | ||
689 | if (!jd->jd_inode) | ||
690 | error = -ENOENT; | ||
691 | else | ||
692 | error = PTR_ERR(jd->jd_inode); | ||
693 | kfree(jd); | ||
694 | break; | ||
695 | } | ||
696 | |||
697 | spin_lock(&sdp->sd_jindex_spin); | ||
698 | jd->jd_jid = sdp->sd_journals++; | ||
699 | list_add_tail(&jd->jd_list, &sdp->sd_jindex_list); | ||
700 | spin_unlock(&sdp->sd_jindex_spin); | ||
701 | } | ||
702 | |||
703 | mutex_unlock(&sdp->sd_jindex_mutex); | ||
704 | |||
705 | return error; | ||
706 | } | ||
707 | |||
664 | static int init_journal(struct gfs2_sbd *sdp, int undo) | 708 | static int init_journal(struct gfs2_sbd *sdp, int undo) |
665 | { | 709 | { |
666 | struct inode *master = sdp->sd_master_dir->d_inode; | 710 | struct inode *master = sdp->sd_master_dir->d_inode; |
@@ -681,7 +725,6 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) | |||
681 | return PTR_ERR(sdp->sd_jindex); | 725 | return PTR_ERR(sdp->sd_jindex); |
682 | } | 726 | } |
683 | ip = GFS2_I(sdp->sd_jindex); | 727 | ip = GFS2_I(sdp->sd_jindex); |
684 | set_bit(GLF_STICKY, &ip->i_gl->gl_flags); | ||
685 | 728 | ||
686 | /* Load in the journal index special file */ | 729 | /* Load in the journal index special file */ |
687 | 730 | ||
@@ -832,7 +875,6 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) | |||
832 | goto fail_statfs; | 875 | goto fail_statfs; |
833 | } | 876 | } |
834 | ip = GFS2_I(sdp->sd_rindex); | 877 | ip = GFS2_I(sdp->sd_rindex); |
835 | set_bit(GLF_STICKY, &ip->i_gl->gl_flags); | ||
836 | sdp->sd_rindex_uptodate = 0; | 878 | sdp->sd_rindex_uptodate = 0; |
837 | 879 | ||
838 | /* Read in the quota inode */ | 880 | /* Read in the quota inode */ |
@@ -973,9 +1015,6 @@ static int init_threads(struct gfs2_sbd *sdp, int undo) | |||
973 | } | 1015 | } |
974 | sdp->sd_logd_process = p; | 1016 | sdp->sd_logd_process = p; |
975 | 1017 | ||
976 | sdp->sd_statfs_sync_time = jiffies; | ||
977 | sdp->sd_quota_sync_time = jiffies; | ||
978 | |||
979 | p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); | 1018 | p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); |
980 | error = IS_ERR(p); | 1019 | error = IS_ERR(p); |
981 | if (error) { | 1020 | if (error) { |
@@ -1224,17 +1263,21 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, | |||
1224 | static void gfs2_kill_sb(struct super_block *sb) | 1263 | static void gfs2_kill_sb(struct super_block *sb) |
1225 | { | 1264 | { |
1226 | struct gfs2_sbd *sdp = sb->s_fs_info; | 1265 | struct gfs2_sbd *sdp = sb->s_fs_info; |
1227 | if (sdp) { | 1266 | |
1228 | gfs2_meta_syncfs(sdp); | 1267 | if (sdp == NULL) { |
1229 | dput(sdp->sd_root_dir); | 1268 | kill_block_super(sb); |
1230 | dput(sdp->sd_master_dir); | 1269 | return; |
1231 | sdp->sd_root_dir = NULL; | ||
1232 | sdp->sd_master_dir = NULL; | ||
1233 | } | 1270 | } |
1271 | |||
1272 | gfs2_meta_syncfs(sdp); | ||
1273 | dput(sdp->sd_root_dir); | ||
1274 | dput(sdp->sd_master_dir); | ||
1275 | sdp->sd_root_dir = NULL; | ||
1276 | sdp->sd_master_dir = NULL; | ||
1234 | shrink_dcache_sb(sb); | 1277 | shrink_dcache_sb(sb); |
1235 | kill_block_super(sb); | 1278 | kill_block_super(sb); |
1236 | if (sdp) | 1279 | gfs2_delete_debugfs_file(sdp); |
1237 | gfs2_delete_debugfs_file(sdp); | 1280 | kfree(sdp); |
1238 | } | 1281 | } |
1239 | 1282 | ||
1240 | struct file_system_type gfs2_fs_type = { | 1283 | struct file_system_type gfs2_fs_type = { |
diff --git a/fs/gfs2/ops_fstype.h b/fs/gfs2/ops_fstype.h deleted file mode 100644 index da8490511836..000000000000 --- a/fs/gfs2/ops_fstype.h +++ /dev/null | |||
@@ -1,19 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | ||
3 | * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. | ||
4 | * | ||
5 | * This copyrighted material is made available to anyone wishing to use, | ||
6 | * modify, copy, or redistribute it subject to the terms and conditions | ||
7 | * of the GNU General Public License version 2. | ||
8 | */ | ||
9 | |||
10 | #ifndef __OPS_FSTYPE_DOT_H__ | ||
11 | #define __OPS_FSTYPE_DOT_H__ | ||
12 | |||
13 | #include <linux/fs.h> | ||
14 | |||
15 | extern struct file_system_type gfs2_fs_type; | ||
16 | extern struct file_system_type gfs2meta_fs_type; | ||
17 | extern const struct export_operations gfs2_export_ops; | ||
18 | |||
19 | #endif /* __OPS_FSTYPE_DOT_H__ */ | ||
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index d232991b9046..49877546beb9 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c | |||
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/gfs2_ondisk.h> | 19 | #include <linux/gfs2_ondisk.h> |
20 | #include <linux/crc32.h> | 20 | #include <linux/crc32.h> |
21 | #include <linux/lm_interface.h> | 21 | #include <linux/lm_interface.h> |
22 | #include <linux/fiemap.h> | ||
22 | #include <asm/uaccess.h> | 23 | #include <asm/uaccess.h> |
23 | 24 | ||
24 | #include "gfs2.h" | 25 | #include "gfs2.h" |
@@ -31,12 +32,11 @@ | |||
31 | #include "glock.h" | 32 | #include "glock.h" |
32 | #include "inode.h" | 33 | #include "inode.h" |
33 | #include "meta_io.h" | 34 | #include "meta_io.h" |
34 | #include "ops_dentry.h" | ||
35 | #include "ops_inode.h" | ||
36 | #include "quota.h" | 35 | #include "quota.h" |
37 | #include "rgrp.h" | 36 | #include "rgrp.h" |
38 | #include "trans.h" | 37 | #include "trans.h" |
39 | #include "util.h" | 38 | #include "util.h" |
39 | #include "super.h" | ||
40 | 40 | ||
41 | /** | 41 | /** |
42 | * gfs2_create - Create a file | 42 | * gfs2_create - Create a file |
@@ -185,7 +185,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, | |||
185 | if (!dip->i_inode.i_nlink) | 185 | if (!dip->i_inode.i_nlink) |
186 | goto out_gunlock; | 186 | goto out_gunlock; |
187 | error = -EFBIG; | 187 | error = -EFBIG; |
188 | if (dip->i_di.di_entries == (u32)-1) | 188 | if (dip->i_entries == (u32)-1) |
189 | goto out_gunlock; | 189 | goto out_gunlock; |
190 | error = -EPERM; | 190 | error = -EPERM; |
191 | if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) | 191 | if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) |
@@ -371,7 +371,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry, | |||
371 | 371 | ||
372 | ip = ghs[1].gh_gl->gl_object; | 372 | ip = ghs[1].gh_gl->gl_object; |
373 | 373 | ||
374 | ip->i_di.di_size = size; | 374 | ip->i_disksize = size; |
375 | 375 | ||
376 | error = gfs2_meta_inode_buffer(ip, &dibh); | 376 | error = gfs2_meta_inode_buffer(ip, &dibh); |
377 | 377 | ||
@@ -425,9 +425,9 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) | |||
425 | ip = ghs[1].gh_gl->gl_object; | 425 | ip = ghs[1].gh_gl->gl_object; |
426 | 426 | ||
427 | ip->i_inode.i_nlink = 2; | 427 | ip->i_inode.i_nlink = 2; |
428 | ip->i_di.di_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); | 428 | ip->i_disksize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); |
429 | ip->i_di.di_flags |= GFS2_DIF_JDATA; | 429 | ip->i_diskflags |= GFS2_DIF_JDATA; |
430 | ip->i_di.di_entries = 2; | 430 | ip->i_entries = 2; |
431 | 431 | ||
432 | error = gfs2_meta_inode_buffer(ip, &dibh); | 432 | error = gfs2_meta_inode_buffer(ip, &dibh); |
433 | 433 | ||
@@ -517,13 +517,13 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry) | |||
517 | if (error) | 517 | if (error) |
518 | goto out_gunlock; | 518 | goto out_gunlock; |
519 | 519 | ||
520 | if (ip->i_di.di_entries < 2) { | 520 | if (ip->i_entries < 2) { |
521 | if (gfs2_consist_inode(ip)) | 521 | if (gfs2_consist_inode(ip)) |
522 | gfs2_dinode_print(ip); | 522 | gfs2_dinode_print(ip); |
523 | error = -EIO; | 523 | error = -EIO; |
524 | goto out_gunlock; | 524 | goto out_gunlock; |
525 | } | 525 | } |
526 | if (ip->i_di.di_entries > 2) { | 526 | if (ip->i_entries > 2) { |
527 | error = -ENOTEMPTY; | 527 | error = -ENOTEMPTY; |
528 | goto out_gunlock; | 528 | goto out_gunlock; |
529 | } | 529 | } |
@@ -726,13 +726,13 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, | |||
726 | goto out_gunlock; | 726 | goto out_gunlock; |
727 | 727 | ||
728 | if (S_ISDIR(nip->i_inode.i_mode)) { | 728 | if (S_ISDIR(nip->i_inode.i_mode)) { |
729 | if (nip->i_di.di_entries < 2) { | 729 | if (nip->i_entries < 2) { |
730 | if (gfs2_consist_inode(nip)) | 730 | if (gfs2_consist_inode(nip)) |
731 | gfs2_dinode_print(nip); | 731 | gfs2_dinode_print(nip); |
732 | error = -EIO; | 732 | error = -EIO; |
733 | goto out_gunlock; | 733 | goto out_gunlock; |
734 | } | 734 | } |
735 | if (nip->i_di.di_entries > 2) { | 735 | if (nip->i_entries > 2) { |
736 | error = -ENOTEMPTY; | 736 | error = -ENOTEMPTY; |
737 | goto out_gunlock; | 737 | goto out_gunlock; |
738 | } | 738 | } |
@@ -758,7 +758,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, | |||
758 | error = -EINVAL; | 758 | error = -EINVAL; |
759 | goto out_gunlock; | 759 | goto out_gunlock; |
760 | } | 760 | } |
761 | if (ndip->i_di.di_entries == (u32)-1) { | 761 | if (ndip->i_entries == (u32)-1) { |
762 | error = -EFBIG; | 762 | error = -EFBIG; |
763 | goto out_gunlock; | 763 | goto out_gunlock; |
764 | } | 764 | } |
@@ -990,7 +990,7 @@ static int setattr_size(struct inode *inode, struct iattr *attr) | |||
990 | struct gfs2_sbd *sdp = GFS2_SB(inode); | 990 | struct gfs2_sbd *sdp = GFS2_SB(inode); |
991 | int error; | 991 | int error; |
992 | 992 | ||
993 | if (attr->ia_size != ip->i_di.di_size) { | 993 | if (attr->ia_size != ip->i_disksize) { |
994 | error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); | 994 | error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); |
995 | if (error) | 995 | if (error) |
996 | return error; | 996 | return error; |
@@ -1001,8 +1001,8 @@ static int setattr_size(struct inode *inode, struct iattr *attr) | |||
1001 | } | 1001 | } |
1002 | 1002 | ||
1003 | error = gfs2_truncatei(ip, attr->ia_size); | 1003 | error = gfs2_truncatei(ip, attr->ia_size); |
1004 | if (error && (inode->i_size != ip->i_di.di_size)) | 1004 | if (error && (inode->i_size != ip->i_disksize)) |
1005 | i_size_write(inode, ip->i_di.di_size); | 1005 | i_size_write(inode, ip->i_disksize); |
1006 | 1006 | ||
1007 | return error; | 1007 | return error; |
1008 | } | 1008 | } |
@@ -1212,6 +1212,48 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name) | |||
1212 | return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er); | 1212 | return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er); |
1213 | } | 1213 | } |
1214 | 1214 | ||
1215 | static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, | ||
1216 | u64 start, u64 len) | ||
1217 | { | ||
1218 | struct gfs2_inode *ip = GFS2_I(inode); | ||
1219 | struct gfs2_holder gh; | ||
1220 | int ret; | ||
1221 | |||
1222 | ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); | ||
1223 | if (ret) | ||
1224 | return ret; | ||
1225 | |||
1226 | mutex_lock(&inode->i_mutex); | ||
1227 | |||
1228 | ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); | ||
1229 | if (ret) | ||
1230 | goto out; | ||
1231 | |||
1232 | if (gfs2_is_stuffed(ip)) { | ||
1233 | u64 phys = ip->i_no_addr << inode->i_blkbits; | ||
1234 | u64 size = i_size_read(inode); | ||
1235 | u32 flags = FIEMAP_EXTENT_LAST|FIEMAP_EXTENT_NOT_ALIGNED| | ||
1236 | FIEMAP_EXTENT_DATA_INLINE; | ||
1237 | phys += sizeof(struct gfs2_dinode); | ||
1238 | phys += start; | ||
1239 | if (start + len > size) | ||
1240 | len = size - start; | ||
1241 | if (start < size) | ||
1242 | ret = fiemap_fill_next_extent(fieinfo, start, phys, | ||
1243 | len, flags); | ||
1244 | if (ret == 1) | ||
1245 | ret = 0; | ||
1246 | } else { | ||
1247 | ret = __generic_block_fiemap(inode, fieinfo, start, len, | ||
1248 | gfs2_block_map); | ||
1249 | } | ||
1250 | |||
1251 | gfs2_glock_dq_uninit(&gh); | ||
1252 | out: | ||
1253 | mutex_unlock(&inode->i_mutex); | ||
1254 | return ret; | ||
1255 | } | ||
1256 | |||
1215 | const struct inode_operations gfs2_file_iops = { | 1257 | const struct inode_operations gfs2_file_iops = { |
1216 | .permission = gfs2_permission, | 1258 | .permission = gfs2_permission, |
1217 | .setattr = gfs2_setattr, | 1259 | .setattr = gfs2_setattr, |
@@ -1220,6 +1262,7 @@ const struct inode_operations gfs2_file_iops = { | |||
1220 | .getxattr = gfs2_getxattr, | 1262 | .getxattr = gfs2_getxattr, |
1221 | .listxattr = gfs2_listxattr, | 1263 | .listxattr = gfs2_listxattr, |
1222 | .removexattr = gfs2_removexattr, | 1264 | .removexattr = gfs2_removexattr, |
1265 | .fiemap = gfs2_fiemap, | ||
1223 | }; | 1266 | }; |
1224 | 1267 | ||
1225 | const struct inode_operations gfs2_dir_iops = { | 1268 | const struct inode_operations gfs2_dir_iops = { |
@@ -1239,6 +1282,7 @@ const struct inode_operations gfs2_dir_iops = { | |||
1239 | .getxattr = gfs2_getxattr, | 1282 | .getxattr = gfs2_getxattr, |
1240 | .listxattr = gfs2_listxattr, | 1283 | .listxattr = gfs2_listxattr, |
1241 | .removexattr = gfs2_removexattr, | 1284 | .removexattr = gfs2_removexattr, |
1285 | .fiemap = gfs2_fiemap, | ||
1242 | }; | 1286 | }; |
1243 | 1287 | ||
1244 | const struct inode_operations gfs2_symlink_iops = { | 1288 | const struct inode_operations gfs2_symlink_iops = { |
@@ -1251,5 +1295,6 @@ const struct inode_operations gfs2_symlink_iops = { | |||
1251 | .getxattr = gfs2_getxattr, | 1295 | .getxattr = gfs2_getxattr, |
1252 | .listxattr = gfs2_listxattr, | 1296 | .listxattr = gfs2_listxattr, |
1253 | .removexattr = gfs2_removexattr, | 1297 | .removexattr = gfs2_removexattr, |
1298 | .fiemap = gfs2_fiemap, | ||
1254 | }; | 1299 | }; |
1255 | 1300 | ||
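The new gfs2_fiemap() entry point is reached through the generic FS_IOC_FIEMAP ioctl. A minimal userspace sketch (not part of the patch; the file path and extent count are arbitrary and error handling is abbreviated) that exercises it:

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fiemap.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	struct fiemap *fm;
	unsigned int i;
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;

	/* room for up to 32 extents in one call */
	fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
	if (!fm)
		return 1;
	fm->fm_start = 0;
	fm->fm_length = FIEMAP_MAX_OFFSET;
	fm->fm_flags = FIEMAP_FLAG_SYNC;
	fm->fm_extent_count = 32;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
		perror("FS_IOC_FIEMAP");
		return 1;
	}
	for (i = 0; i < fm->fm_mapped_extents; i++)
		printf("logical %llu physical %llu length %llu flags 0x%x\n",
		       (unsigned long long)fm->fm_extents[i].fe_logical,
		       (unsigned long long)fm->fm_extents[i].fe_physical,
		       (unsigned long long)fm->fm_extents[i].fe_length,
		       fm->fm_extents[i].fe_flags);
	free(fm);
	close(fd);
	return 0;
}

On a stuffed (inline-data) inode the output is a single extent carrying FIEMAP_EXTENT_DATA_INLINE and FIEMAP_EXTENT_NOT_ALIGNED, per the gfs2_is_stuffed() branch in the hunk above; otherwise the extents come from __generic_block_fiemap() walking gfs2_block_map().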
diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h deleted file mode 100644 index 14b4b797622a..000000000000 --- a/fs/gfs2/ops_inode.h +++ /dev/null | |||
@@ -1,25 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | ||
3 | * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. | ||
4 | * | ||
5 | * This copyrighted material is made available to anyone wishing to use, | ||
6 | * modify, copy, or redistribute it subject to the terms and conditions | ||
7 | * of the GNU General Public License version 2. | ||
8 | */ | ||
9 | |||
10 | #ifndef __OPS_INODE_DOT_H__ | ||
11 | #define __OPS_INODE_DOT_H__ | ||
12 | |||
13 | #include <linux/fs.h> | ||
14 | |||
15 | extern const struct inode_operations gfs2_file_iops; | ||
16 | extern const struct inode_operations gfs2_dir_iops; | ||
17 | extern const struct inode_operations gfs2_symlink_iops; | ||
18 | extern const struct file_operations gfs2_file_fops; | ||
19 | extern const struct file_operations gfs2_dir_fops; | ||
20 | extern const struct file_operations gfs2_file_fops_nolock; | ||
21 | extern const struct file_operations gfs2_dir_fops_nolock; | ||
22 | |||
23 | extern void gfs2_set_inode_flags(struct inode *inode); | ||
24 | |||
25 | #endif /* __OPS_INODE_DOT_H__ */ | ||
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index d5355d9b5926..777783deddcb 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c | |||
@@ -28,7 +28,6 @@ | |||
28 | #include "inode.h" | 28 | #include "inode.h" |
29 | #include "log.h" | 29 | #include "log.h" |
30 | #include "mount.h" | 30 | #include "mount.h" |
31 | #include "ops_super.h" | ||
32 | #include "quota.h" | 31 | #include "quota.h" |
33 | #include "recovery.h" | 32 | #include "recovery.h" |
34 | #include "rgrp.h" | 33 | #include "rgrp.h" |
@@ -143,8 +142,6 @@ static void gfs2_put_super(struct super_block *sb) | |||
143 | kthread_stop(sdp->sd_quotad_process); | 142 | kthread_stop(sdp->sd_quotad_process); |
144 | kthread_stop(sdp->sd_logd_process); | 143 | kthread_stop(sdp->sd_logd_process); |
145 | kthread_stop(sdp->sd_recoverd_process); | 144 | kthread_stop(sdp->sd_recoverd_process); |
146 | while (sdp->sd_glockd_num--) | ||
147 | kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]); | ||
148 | 145 | ||
149 | if (!(sb->s_flags & MS_RDONLY)) { | 146 | if (!(sb->s_flags & MS_RDONLY)) { |
150 | error = gfs2_make_fs_ro(sdp); | 147 | error = gfs2_make_fs_ro(sdp); |
@@ -185,7 +182,6 @@ static void gfs2_put_super(struct super_block *sb) | |||
185 | 182 | ||
186 | /* At this point, we're through participating in the lockspace */ | 183 | /* At this point, we're through participating in the lockspace */ |
187 | gfs2_sys_fs_del(sdp); | 184 | gfs2_sys_fs_del(sdp); |
188 | kfree(sdp); | ||
189 | } | 185 | } |
190 | 186 | ||
191 | /** | 187 | /** |
@@ -260,6 +256,137 @@ static void gfs2_unlockfs(struct super_block *sb) | |||
260 | } | 256 | } |
261 | 257 | ||
262 | /** | 258 | /** |
259 | * statfs_slow_fill - fill in the sc for a given RG | ||
260 | * @rgd: the RG | ||
261 | * @sc: the sc structure | ||
262 | * | ||
263 | * Returns: 0 on success, -ESTALE if the LVB is invalid | ||
264 | */ | ||
265 | |||
266 | static int statfs_slow_fill(struct gfs2_rgrpd *rgd, | ||
267 | struct gfs2_statfs_change_host *sc) | ||
268 | { | ||
269 | gfs2_rgrp_verify(rgd); | ||
270 | sc->sc_total += rgd->rd_data; | ||
271 | sc->sc_free += rgd->rd_free; | ||
272 | sc->sc_dinodes += rgd->rd_dinodes; | ||
273 | return 0; | ||
274 | } | ||
275 | |||
276 | /** | ||
277 | * gfs2_statfs_slow - Stat a filesystem using asynchronous locking | ||
278 | * @sdp: the filesystem | ||
279 | * @sc: the sc info that will be returned | ||
280 | * | ||
281 | * Any error (other than a signal) will cause this routine to fall back | ||
282 | * to the synchronous version. | ||
283 | * | ||
284 | * FIXME: This really shouldn't busy wait like this. | ||
285 | * | ||
286 | * Returns: errno | ||
287 | */ | ||
288 | |||
289 | static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) | ||
290 | { | ||
291 | struct gfs2_holder ri_gh; | ||
292 | struct gfs2_rgrpd *rgd_next; | ||
293 | struct gfs2_holder *gha, *gh; | ||
294 | unsigned int slots = 64; | ||
295 | unsigned int x; | ||
296 | int done; | ||
297 | int error = 0, err; | ||
298 | |||
299 | memset(sc, 0, sizeof(struct gfs2_statfs_change_host)); | ||
300 | gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL); | ||
301 | if (!gha) | ||
302 | return -ENOMEM; | ||
303 | |||
304 | error = gfs2_rindex_hold(sdp, &ri_gh); | ||
305 | if (error) | ||
306 | goto out; | ||
307 | |||
308 | rgd_next = gfs2_rgrpd_get_first(sdp); | ||
309 | |||
310 | for (;;) { | ||
311 | done = 1; | ||
312 | |||
313 | for (x = 0; x < slots; x++) { | ||
314 | gh = gha + x; | ||
315 | |||
316 | if (gh->gh_gl && gfs2_glock_poll(gh)) { | ||
317 | err = gfs2_glock_wait(gh); | ||
318 | if (err) { | ||
319 | gfs2_holder_uninit(gh); | ||
320 | error = err; | ||
321 | } else { | ||
322 | if (!error) | ||
323 | error = statfs_slow_fill( | ||
324 | gh->gh_gl->gl_object, sc); | ||
325 | gfs2_glock_dq_uninit(gh); | ||
326 | } | ||
327 | } | ||
328 | |||
329 | if (gh->gh_gl) | ||
330 | done = 0; | ||
331 | else if (rgd_next && !error) { | ||
332 | error = gfs2_glock_nq_init(rgd_next->rd_gl, | ||
333 | LM_ST_SHARED, | ||
334 | GL_ASYNC, | ||
335 | gh); | ||
336 | rgd_next = gfs2_rgrpd_get_next(rgd_next); | ||
337 | done = 0; | ||
338 | } | ||
339 | |||
340 | if (signal_pending(current)) | ||
341 | error = -ERESTARTSYS; | ||
342 | } | ||
343 | |||
344 | if (done) | ||
345 | break; | ||
346 | |||
347 | yield(); | ||
348 | } | ||
349 | |||
350 | gfs2_glock_dq_uninit(&ri_gh); | ||
351 | |||
352 | out: | ||
353 | kfree(gha); | ||
354 | return error; | ||
355 | } | ||
356 | |||
357 | /** | ||
358 | * gfs2_statfs_i - Do a statfs | ||
359 | * @sdp: the filesystem | ||
360 | * @sg: the sg structure | ||
361 | * | ||
362 | * Returns: errno | ||
363 | */ | ||
364 | |||
365 | static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) | ||
366 | { | ||
367 | struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; | ||
368 | struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; | ||
369 | |||
370 | spin_lock(&sdp->sd_statfs_spin); | ||
371 | |||
372 | *sc = *m_sc; | ||
373 | sc->sc_total += l_sc->sc_total; | ||
374 | sc->sc_free += l_sc->sc_free; | ||
375 | sc->sc_dinodes += l_sc->sc_dinodes; | ||
376 | |||
377 | spin_unlock(&sdp->sd_statfs_spin); | ||
378 | |||
379 | if (sc->sc_free < 0) | ||
380 | sc->sc_free = 0; | ||
381 | if (sc->sc_free > sc->sc_total) | ||
382 | sc->sc_free = sc->sc_total; | ||
383 | if (sc->sc_dinodes < 0) | ||
384 | sc->sc_dinodes = 0; | ||
385 | |||
386 | return 0; | ||
387 | } | ||
388 | |||
389 | /** | ||
263 | * gfs2_statfs - Gather and return stats about the filesystem | 390 | * gfs2_statfs - Gather and return stats about the filesystem |
264 | * @sb: The superblock | 391 | * @sb: The superblock |
265 | * @statfsbuf: The buffer | 392 | * @statfsbuf: The buffer |
@@ -370,7 +497,6 @@ static void gfs2_clear_inode(struct inode *inode) | |||
370 | */ | 497 | */ |
371 | if (test_bit(GIF_USER, &ip->i_flags)) { | 498 | if (test_bit(GIF_USER, &ip->i_flags)) { |
372 | ip->i_gl->gl_object = NULL; | 499 | ip->i_gl->gl_object = NULL; |
373 | gfs2_glock_schedule_for_reclaim(ip->i_gl); | ||
374 | gfs2_glock_put(ip->i_gl); | 500 | gfs2_glock_put(ip->i_gl); |
375 | ip->i_gl = NULL; | 501 | ip->i_gl = NULL; |
376 | if (ip->i_iopen_gh.gh_gl) { | 502 | if (ip->i_iopen_gh.gh_gl) { |
@@ -423,8 +549,6 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) | |||
423 | seq_printf(s, ",debug"); | 549 | seq_printf(s, ",debug"); |
424 | if (args->ar_upgrade) | 550 | if (args->ar_upgrade) |
425 | seq_printf(s, ",upgrade"); | 551 | seq_printf(s, ",upgrade"); |
426 | if (args->ar_num_glockd != GFS2_GLOCKD_DEFAULT) | ||
427 | seq_printf(s, ",num_glockd=%u", args->ar_num_glockd); | ||
428 | if (args->ar_posix_acl) | 552 | if (args->ar_posix_acl) |
429 | seq_printf(s, ",acl"); | 553 | seq_printf(s, ",acl"); |
430 | if (args->ar_quota != GFS2_QUOTA_DEFAULT) { | 554 | if (args->ar_quota != GFS2_QUOTA_DEFAULT) { |
@@ -494,16 +618,16 @@ static void gfs2_delete_inode(struct inode *inode) | |||
494 | gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); | 618 | gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); |
495 | error = gfs2_glock_nq(&ip->i_iopen_gh); | 619 | error = gfs2_glock_nq(&ip->i_iopen_gh); |
496 | if (error) | 620 | if (error) |
497 | goto out_uninit; | 621 | goto out_truncate; |
498 | 622 | ||
499 | if (S_ISDIR(inode->i_mode) && | 623 | if (S_ISDIR(inode->i_mode) && |
500 | (ip->i_di.di_flags & GFS2_DIF_EXHASH)) { | 624 | (ip->i_diskflags & GFS2_DIF_EXHASH)) { |
501 | error = gfs2_dir_exhash_dealloc(ip); | 625 | error = gfs2_dir_exhash_dealloc(ip); |
502 | if (error) | 626 | if (error) |
503 | goto out_unlock; | 627 | goto out_unlock; |
504 | } | 628 | } |
505 | 629 | ||
506 | if (ip->i_di.di_eattr) { | 630 | if (ip->i_eattr) { |
507 | error = gfs2_ea_dealloc(ip); | 631 | error = gfs2_ea_dealloc(ip); |
508 | if (error) | 632 | if (error) |
509 | goto out_unlock; | 633 | goto out_unlock; |
@@ -519,6 +643,7 @@ static void gfs2_delete_inode(struct inode *inode) | |||
519 | if (error) | 643 | if (error) |
520 | goto out_unlock; | 644 | goto out_unlock; |
521 | 645 | ||
646 | out_truncate: | ||
522 | error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); | 647 | error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); |
523 | if (error) | 648 | if (error) |
524 | goto out_unlock; | 649 | goto out_unlock; |
@@ -527,8 +652,8 @@ static void gfs2_delete_inode(struct inode *inode) | |||
527 | gfs2_trans_end(sdp); | 652 | gfs2_trans_end(sdp); |
528 | 653 | ||
529 | out_unlock: | 654 | out_unlock: |
530 | gfs2_glock_dq(&ip->i_iopen_gh); | 655 | if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) |
531 | out_uninit: | 656 | gfs2_glock_dq(&ip->i_iopen_gh); |
532 | gfs2_holder_uninit(&ip->i_iopen_gh); | 657 | gfs2_holder_uninit(&ip->i_iopen_gh); |
533 | gfs2_glock_dq_uninit(&gh); | 658 | gfs2_glock_dq_uninit(&gh); |
534 | if (error && error != GLR_TRYFAILED) | 659 | if (error && error != GLR_TRYFAILED) |
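The ops_super.c hunks above pull gfs2_statfs_i() and gfs2_statfs_slow() in from super.c. The fast path simply folds the per-node local statfs delta into the master copy under sd_statfs_spin and clamps the result. A rough sketch of that combine-and-clamp idea, using illustrative names rather than the real kernel structures:

struct statfs_change {
	long long total;
	long long free;
	long long dinodes;
};

static void statfs_combine(const struct statfs_change *master,
			   const struct statfs_change *local,
			   struct statfs_change *out)
{
	out->total   = master->total   + local->total;
	out->free    = master->free    + local->free;
	out->dinodes = master->dinodes + local->dinodes;

	/* A stale local delta can transiently drive the sums out of range. */
	if (out->free < 0)
		out->free = 0;
	if (out->free > out->total)
		out->free = out->total;
	if (out->dinodes < 0)
		out->dinodes = 0;
}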
diff --git a/fs/gfs2/ops_super.h b/fs/gfs2/ops_super.h deleted file mode 100644 index 442a274c6272..000000000000 --- a/fs/gfs2/ops_super.h +++ /dev/null | |||
@@ -1,17 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | ||
3 | * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. | ||
4 | * | ||
5 | * This copyrighted material is made available to anyone wishing to use, | ||
6 | * modify, copy, or redistribute it subject to the terms and conditions | ||
7 | * of the GNU General Public License version 2. | ||
8 | */ | ||
9 | |||
10 | #ifndef __OPS_SUPER_DOT_H__ | ||
11 | #define __OPS_SUPER_DOT_H__ | ||
12 | |||
13 | #include <linux/fs.h> | ||
14 | |||
15 | extern const struct super_operations gfs2_super_ops; | ||
16 | |||
17 | #endif /* __OPS_SUPER_DOT_H__ */ | ||
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 3e073f5144fa..b08d09696b3e 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c | |||
@@ -46,6 +46,8 @@ | |||
46 | #include <linux/bio.h> | 46 | #include <linux/bio.h> |
47 | #include <linux/gfs2_ondisk.h> | 47 | #include <linux/gfs2_ondisk.h> |
48 | #include <linux/lm_interface.h> | 48 | #include <linux/lm_interface.h> |
49 | #include <linux/kthread.h> | ||
50 | #include <linux/freezer.h> | ||
49 | 51 | ||
50 | #include "gfs2.h" | 52 | #include "gfs2.h" |
51 | #include "incore.h" | 53 | #include "incore.h" |
@@ -94,7 +96,7 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id, | |||
94 | struct gfs2_quota_data *qd; | 96 | struct gfs2_quota_data *qd; |
95 | int error; | 97 | int error; |
96 | 98 | ||
97 | qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_NOFS); | 99 | qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS); |
98 | if (!qd) | 100 | if (!qd) |
99 | return -ENOMEM; | 101 | return -ENOMEM; |
100 | 102 | ||
@@ -119,7 +121,7 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id, | |||
119 | return 0; | 121 | return 0; |
120 | 122 | ||
121 | fail: | 123 | fail: |
122 | kfree(qd); | 124 | kmem_cache_free(gfs2_quotad_cachep, qd); |
123 | return error; | 125 | return error; |
124 | } | 126 | } |
125 | 127 | ||
@@ -158,7 +160,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create, | |||
158 | if (qd || !create) { | 160 | if (qd || !create) { |
159 | if (new_qd) { | 161 | if (new_qd) { |
160 | gfs2_lvb_unhold(new_qd->qd_gl); | 162 | gfs2_lvb_unhold(new_qd->qd_gl); |
161 | kfree(new_qd); | 163 | kmem_cache_free(gfs2_quotad_cachep, new_qd); |
162 | } | 164 | } |
163 | *qdp = qd; | 165 | *qdp = qd; |
164 | return 0; | 166 | return 0; |
@@ -1013,7 +1015,7 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change, | |||
1013 | 1015 | ||
1014 | if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change)) | 1016 | if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change)) |
1015 | return; | 1017 | return; |
1016 | if (ip->i_di.di_flags & GFS2_DIF_SYSTEM) | 1018 | if (ip->i_diskflags & GFS2_DIF_SYSTEM) |
1017 | return; | 1019 | return; |
1018 | 1020 | ||
1019 | for (x = 0; x < al->al_qd_num; x++) { | 1021 | for (x = 0; x < al->al_qd_num; x++) { |
@@ -1100,15 +1102,15 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void * | |||
1100 | int gfs2_quota_init(struct gfs2_sbd *sdp) | 1102 | int gfs2_quota_init(struct gfs2_sbd *sdp) |
1101 | { | 1103 | { |
1102 | struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); | 1104 | struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); |
1103 | unsigned int blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; | 1105 | unsigned int blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; |
1104 | unsigned int x, slot = 0; | 1106 | unsigned int x, slot = 0; |
1105 | unsigned int found = 0; | 1107 | unsigned int found = 0; |
1106 | u64 dblock; | 1108 | u64 dblock; |
1107 | u32 extlen = 0; | 1109 | u32 extlen = 0; |
1108 | int error; | 1110 | int error; |
1109 | 1111 | ||
1110 | if (!ip->i_di.di_size || ip->i_di.di_size > (64 << 20) || | 1112 | if (!ip->i_disksize || ip->i_disksize > (64 << 20) || |
1111 | ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1)) { | 1113 | ip->i_disksize & (sdp->sd_sb.sb_bsize - 1)) { |
1112 | gfs2_consist_inode(ip); | 1114 | gfs2_consist_inode(ip); |
1113 | return -EIO; | 1115 | return -EIO; |
1114 | } | 1116 | } |
@@ -1195,7 +1197,7 @@ fail: | |||
1195 | return error; | 1197 | return error; |
1196 | } | 1198 | } |
1197 | 1199 | ||
1198 | void gfs2_quota_scan(struct gfs2_sbd *sdp) | 1200 | static void gfs2_quota_scan(struct gfs2_sbd *sdp) |
1199 | { | 1201 | { |
1200 | struct gfs2_quota_data *qd, *safe; | 1202 | struct gfs2_quota_data *qd, *safe; |
1201 | LIST_HEAD(dead); | 1203 | LIST_HEAD(dead); |
@@ -1222,7 +1224,7 @@ void gfs2_quota_scan(struct gfs2_sbd *sdp) | |||
1222 | gfs2_assert_warn(sdp, !qd->qd_bh_count); | 1224 | gfs2_assert_warn(sdp, !qd->qd_bh_count); |
1223 | 1225 | ||
1224 | gfs2_lvb_unhold(qd->qd_gl); | 1226 | gfs2_lvb_unhold(qd->qd_gl); |
1225 | kfree(qd); | 1227 | kmem_cache_free(gfs2_quotad_cachep, qd); |
1226 | } | 1228 | } |
1227 | } | 1229 | } |
1228 | 1230 | ||
@@ -1257,7 +1259,7 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) | |||
1257 | gfs2_assert_warn(sdp, !qd->qd_bh_count); | 1259 | gfs2_assert_warn(sdp, !qd->qd_bh_count); |
1258 | 1260 | ||
1259 | gfs2_lvb_unhold(qd->qd_gl); | 1261 | gfs2_lvb_unhold(qd->qd_gl); |
1260 | kfree(qd); | 1262 | kmem_cache_free(gfs2_quotad_cachep, qd); |
1261 | 1263 | ||
1262 | spin_lock(&sdp->sd_quota_spin); | 1264 | spin_lock(&sdp->sd_quota_spin); |
1263 | } | 1265 | } |
@@ -1272,3 +1274,94 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) | |||
1272 | } | 1274 | } |
1273 | } | 1275 | } |
1274 | 1276 | ||
1277 | static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error) | ||
1278 | { | ||
1279 | if (error == 0 || error == -EROFS) | ||
1280 | return; | ||
1281 | if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) | ||
1282 | fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error); | ||
1283 | } | ||
1284 | |||
1285 | static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg, | ||
1286 | int (*fxn)(struct gfs2_sbd *sdp), | ||
1287 | unsigned long t, unsigned long *timeo, | ||
1288 | unsigned int *new_timeo) | ||
1289 | { | ||
1290 | if (t >= *timeo) { | ||
1291 | int error = fxn(sdp); | ||
1292 | quotad_error(sdp, msg, error); | ||
1293 | *timeo = gfs2_tune_get_i(&sdp->sd_tune, new_timeo) * HZ; | ||
1294 | } else { | ||
1295 | *timeo -= t; | ||
1296 | } | ||
1297 | } | ||
1298 | |||
1299 | static void quotad_check_trunc_list(struct gfs2_sbd *sdp) | ||
1300 | { | ||
1301 | struct gfs2_inode *ip; | ||
1302 | |||
1303 | while(1) { | ||
1304 | ip = NULL; | ||
1305 | spin_lock(&sdp->sd_trunc_lock); | ||
1306 | if (!list_empty(&sdp->sd_trunc_list)) { | ||
1307 | ip = list_entry(sdp->sd_trunc_list.next, | ||
1308 | struct gfs2_inode, i_trunc_list); | ||
1309 | list_del_init(&ip->i_trunc_list); | ||
1310 | } | ||
1311 | spin_unlock(&sdp->sd_trunc_lock); | ||
1312 | if (ip == NULL) | ||
1313 | return; | ||
1314 | gfs2_glock_finish_truncate(ip); | ||
1315 | } | ||
1316 | } | ||
1317 | |||
1318 | /** | ||
1319 | * gfs2_quotad - Write cached quota changes into the quota file | ||
1320 | * @sdp: Pointer to GFS2 superblock | ||
1321 | * | ||
1322 | */ | ||
1323 | |||
1324 | int gfs2_quotad(void *data) | ||
1325 | { | ||
1326 | struct gfs2_sbd *sdp = data; | ||
1327 | struct gfs2_tune *tune = &sdp->sd_tune; | ||
1328 | unsigned long statfs_timeo = 0; | ||
1329 | unsigned long quotad_timeo = 0; | ||
1330 | unsigned long t = 0; | ||
1331 | DEFINE_WAIT(wait); | ||
1332 | int empty; | ||
1333 | |||
1334 | while (!kthread_should_stop()) { | ||
1335 | |||
1336 | /* Update the master statfs file */ | ||
1337 | quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t, | ||
1338 | &statfs_timeo, &tune->gt_statfs_quantum); | ||
1339 | |||
1340 | /* Update quota file */ | ||
1341 | quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t, | ||
1342 | "ad_timeo, &tune->gt_quota_quantum); | ||
1343 | |||
1344 | /* FIXME: This should be turned into a shrinker */ | ||
1345 | gfs2_quota_scan(sdp); | ||
1346 | |||
1347 | /* Check for & recover partially truncated inodes */ | ||
1348 | quotad_check_trunc_list(sdp); | ||
1349 | |||
1350 | if (freezing(current)) | ||
1351 | refrigerator(); | ||
1352 | t = min(quotad_timeo, statfs_timeo); | ||
1353 | |||
1354 | prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_UNINTERRUPTIBLE); | ||
1355 | spin_lock(&sdp->sd_trunc_lock); | ||
1356 | empty = list_empty(&sdp->sd_trunc_list); | ||
1357 | spin_unlock(&sdp->sd_trunc_lock); | ||
1358 | if (empty) | ||
1359 | t -= schedule_timeout(t); | ||
1360 | else | ||
1361 | t = 0; | ||
1362 | finish_wait(&sdp->sd_quota_wait, &wait); | ||
1363 | } | ||
1364 | |||
1365 | return 0; | ||
1366 | } | ||
1367 | |||
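The new gfs2_quotad() above multiplexes several periodic jobs (statfs sync, quota sync, quota scanning, truncate recovery) in one freezable kthread. quotad_check_timeo() keeps a per-job countdown that is decremented by however long the thread actually slept and re-armed from its tunable once the job runs; the main loop then sleeps for the smaller of the remaining countdowns, waking early if the truncate list becomes non-empty. A minimal sketch of the countdown bookkeeping, with made-up names:

static void check_timeo(unsigned long slept, unsigned long *remaining,
			unsigned long interval, void (*job)(void))
{
	if (slept >= *remaining) {
		job();			/* countdown expired: run the job */
		*remaining = interval;	/* re-arm it from its tunable */
	} else {
		*remaining -= slept;	/* otherwise just account the sleep */
	}
}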
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 3b7f4b0e5dfe..cec9032be97d 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h | |||
@@ -15,22 +15,22 @@ struct gfs2_sbd; | |||
15 | 15 | ||
16 | #define NO_QUOTA_CHANGE ((u32)-1) | 16 | #define NO_QUOTA_CHANGE ((u32)-1) |
17 | 17 | ||
18 | int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid); | 18 | extern int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid); |
19 | void gfs2_quota_unhold(struct gfs2_inode *ip); | 19 | extern void gfs2_quota_unhold(struct gfs2_inode *ip); |
20 | 20 | ||
21 | int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid); | 21 | extern int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid); |
22 | void gfs2_quota_unlock(struct gfs2_inode *ip); | 22 | extern void gfs2_quota_unlock(struct gfs2_inode *ip); |
23 | 23 | ||
24 | int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid); | 24 | extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid); |
25 | void gfs2_quota_change(struct gfs2_inode *ip, s64 change, | 25 | extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change, |
26 | u32 uid, u32 gid); | 26 | u32 uid, u32 gid); |
27 | 27 | ||
28 | int gfs2_quota_sync(struct gfs2_sbd *sdp); | 28 | extern int gfs2_quota_sync(struct gfs2_sbd *sdp); |
29 | int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id); | 29 | extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id); |
30 | 30 | ||
31 | int gfs2_quota_init(struct gfs2_sbd *sdp); | 31 | extern int gfs2_quota_init(struct gfs2_sbd *sdp); |
32 | void gfs2_quota_scan(struct gfs2_sbd *sdp); | 32 | extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp); |
33 | void gfs2_quota_cleanup(struct gfs2_sbd *sdp); | 33 | extern int gfs2_quotad(void *data); |
34 | 34 | ||
35 | static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) | 35 | static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) |
36 | { | 36 | { |
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index d5e91f4f6a0b..efd09c3d2b26 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c | |||
@@ -14,6 +14,8 @@ | |||
14 | #include <linux/gfs2_ondisk.h> | 14 | #include <linux/gfs2_ondisk.h> |
15 | #include <linux/crc32.h> | 15 | #include <linux/crc32.h> |
16 | #include <linux/lm_interface.h> | 16 | #include <linux/lm_interface.h> |
17 | #include <linux/kthread.h> | ||
18 | #include <linux/freezer.h> | ||
17 | 19 | ||
18 | #include "gfs2.h" | 20 | #include "gfs2.h" |
19 | #include "incore.h" | 21 | #include "incore.h" |
@@ -583,13 +585,35 @@ fail: | |||
583 | return error; | 585 | return error; |
584 | } | 586 | } |
585 | 587 | ||
588 | static struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp) | ||
589 | { | ||
590 | struct gfs2_jdesc *jd; | ||
591 | int found = 0; | ||
592 | |||
593 | spin_lock(&sdp->sd_jindex_spin); | ||
594 | |||
595 | list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { | ||
596 | if (jd->jd_dirty) { | ||
597 | jd->jd_dirty = 0; | ||
598 | found = 1; | ||
599 | break; | ||
600 | } | ||
601 | } | ||
602 | spin_unlock(&sdp->sd_jindex_spin); | ||
603 | |||
604 | if (!found) | ||
605 | jd = NULL; | ||
606 | |||
607 | return jd; | ||
608 | } | ||
609 | |||
586 | /** | 610 | /** |
587 | * gfs2_check_journals - Recover any dirty journals | 611 | * gfs2_check_journals - Recover any dirty journals |
588 | * @sdp: the filesystem | 612 | * @sdp: the filesystem |
589 | * | 613 | * |
590 | */ | 614 | */ |
591 | 615 | ||
592 | void gfs2_check_journals(struct gfs2_sbd *sdp) | 616 | static void gfs2_check_journals(struct gfs2_sbd *sdp) |
593 | { | 617 | { |
594 | struct gfs2_jdesc *jd; | 618 | struct gfs2_jdesc *jd; |
595 | 619 | ||
@@ -603,3 +627,25 @@ void gfs2_check_journals(struct gfs2_sbd *sdp) | |||
603 | } | 627 | } |
604 | } | 628 | } |
605 | 629 | ||
630 | /** | ||
631 | * gfs2_recoverd - Recover dead machine's journals | ||
632 | * @sdp: Pointer to GFS2 superblock | ||
633 | * | ||
634 | */ | ||
635 | |||
636 | int gfs2_recoverd(void *data) | ||
637 | { | ||
638 | struct gfs2_sbd *sdp = data; | ||
639 | unsigned long t; | ||
640 | |||
641 | while (!kthread_should_stop()) { | ||
642 | gfs2_check_journals(sdp); | ||
643 | t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ; | ||
644 | if (freezing(current)) | ||
645 | refrigerator(); | ||
646 | schedule_timeout_interruptible(t); | ||
647 | } | ||
648 | |||
649 | return 0; | ||
650 | } | ||
651 | |||
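gfs2_recoverd() above follows the same freezable-kthread shape as gfs2_quotad(): do the work, honour the freezer, sleep for a tunable interval, and exit when kthread_stop() is called. A hedged usage sketch (not part of this patch, assuming the usual gfs2 headers and a hypothetical helper name) of how such a daemon is typically started at mount time and stopped at unmount, matching the kthread_stop() calls visible in gfs2_put_super() earlier:

static int start_recoverd(struct gfs2_sbd *sdp)
{
	struct task_struct *p;

	p = kthread_run(gfs2_recoverd, sdp, "gfs2_recoverd");
	if (IS_ERR(p))
		return PTR_ERR(p);
	sdp->sd_recoverd_process = p;
	return 0;
}

/* ... and on unmount: kthread_stop(sdp->sd_recoverd_process); */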
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h index f7235e61c723..a8218ea15b57 100644 --- a/fs/gfs2/recovery.h +++ b/fs/gfs2/recovery.h | |||
@@ -18,17 +18,17 @@ static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk) | |||
18 | *blk = 0; | 18 | *blk = 0; |
19 | } | 19 | } |
20 | 20 | ||
21 | int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, | 21 | extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, |
22 | struct buffer_head **bh); | 22 | struct buffer_head **bh); |
23 | 23 | ||
24 | int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); | 24 | extern int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); |
25 | int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); | 25 | extern int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); |
26 | void gfs2_revoke_clean(struct gfs2_sbd *sdp); | 26 | extern void gfs2_revoke_clean(struct gfs2_sbd *sdp); |
27 | 27 | ||
28 | int gfs2_find_jhead(struct gfs2_jdesc *jd, | 28 | extern int gfs2_find_jhead(struct gfs2_jdesc *jd, |
29 | struct gfs2_log_header_host *head); | 29 | struct gfs2_log_header_host *head); |
30 | int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd); | 30 | extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd); |
31 | void gfs2_check_journals(struct gfs2_sbd *sdp); | 31 | extern int gfs2_recoverd(void *data); |
32 | 32 | ||
33 | #endif /* __RECOVERY_DOT_H__ */ | 33 | #endif /* __RECOVERY_DOT_H__ */ |
34 | 34 | ||
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 2d90fb253505..8b01c635d925 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c | |||
@@ -269,16 +269,14 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd) | |||
269 | bi->bi_len, x); | 269 | bi->bi_len, x); |
270 | } | 270 | } |
271 | 271 | ||
272 | if (count[0] != rgd->rd_rg.rg_free) { | 272 | if (count[0] != rgd->rd_free) { |
273 | if (gfs2_consist_rgrpd(rgd)) | 273 | if (gfs2_consist_rgrpd(rgd)) |
274 | fs_err(sdp, "free data mismatch: %u != %u\n", | 274 | fs_err(sdp, "free data mismatch: %u != %u\n", |
275 | count[0], rgd->rd_rg.rg_free); | 275 | count[0], rgd->rd_free); |
276 | return; | 276 | return; |
277 | } | 277 | } |
278 | 278 | ||
279 | tmp = rgd->rd_data - | 279 | tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes; |
280 | rgd->rd_rg.rg_free - | ||
281 | rgd->rd_rg.rg_dinodes; | ||
282 | if (count[1] + count[2] != tmp) { | 280 | if (count[1] + count[2] != tmp) { |
283 | if (gfs2_consist_rgrpd(rgd)) | 281 | if (gfs2_consist_rgrpd(rgd)) |
284 | fs_err(sdp, "used data mismatch: %u != %u\n", | 282 | fs_err(sdp, "used data mismatch: %u != %u\n", |
@@ -286,10 +284,10 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd) | |||
286 | return; | 284 | return; |
287 | } | 285 | } |
288 | 286 | ||
289 | if (count[3] != rgd->rd_rg.rg_dinodes) { | 287 | if (count[3] != rgd->rd_dinodes) { |
290 | if (gfs2_consist_rgrpd(rgd)) | 288 | if (gfs2_consist_rgrpd(rgd)) |
291 | fs_err(sdp, "used metadata mismatch: %u != %u\n", | 289 | fs_err(sdp, "used metadata mismatch: %u != %u\n", |
292 | count[3], rgd->rd_rg.rg_dinodes); | 290 | count[3], rgd->rd_dinodes); |
293 | return; | 291 | return; |
294 | } | 292 | } |
295 | 293 | ||
@@ -501,7 +499,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp) | |||
501 | for (rgrps = 0;; rgrps++) { | 499 | for (rgrps = 0;; rgrps++) { |
502 | loff_t pos = rgrps * sizeof(struct gfs2_rindex); | 500 | loff_t pos = rgrps * sizeof(struct gfs2_rindex); |
503 | 501 | ||
504 | if (pos + sizeof(struct gfs2_rindex) >= ip->i_di.di_size) | 502 | if (pos + sizeof(struct gfs2_rindex) >= ip->i_disksize) |
505 | break; | 503 | break; |
506 | error = gfs2_internal_read(ip, &ra_state, buf, &pos, | 504 | error = gfs2_internal_read(ip, &ra_state, buf, &pos, |
507 | sizeof(struct gfs2_rindex)); | 505 | sizeof(struct gfs2_rindex)); |
@@ -590,7 +588,7 @@ static int gfs2_ri_update(struct gfs2_inode *ip) | |||
590 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | 588 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); |
591 | struct inode *inode = &ip->i_inode; | 589 | struct inode *inode = &ip->i_inode; |
592 | struct file_ra_state ra_state; | 590 | struct file_ra_state ra_state; |
593 | u64 rgrp_count = ip->i_di.di_size; | 591 | u64 rgrp_count = ip->i_disksize; |
594 | int error; | 592 | int error; |
595 | 593 | ||
596 | if (do_div(rgrp_count, sizeof(struct gfs2_rindex))) { | 594 | if (do_div(rgrp_count, sizeof(struct gfs2_rindex))) { |
@@ -634,7 +632,7 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip) | |||
634 | for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) { | 632 | for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) { |
635 | /* Ignore partials */ | 633 | /* Ignore partials */ |
636 | if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) > | 634 | if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) > |
637 | ip->i_di.di_size) | 635 | ip->i_disksize) |
638 | break; | 636 | break; |
639 | error = read_rindex_entry(ip, &ra_state); | 637 | error = read_rindex_entry(ip, &ra_state); |
640 | if (error) { | 638 | if (error) { |
@@ -692,7 +690,6 @@ int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh) | |||
692 | static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf) | 690 | static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf) |
693 | { | 691 | { |
694 | const struct gfs2_rgrp *str = buf; | 692 | const struct gfs2_rgrp *str = buf; |
695 | struct gfs2_rgrp_host *rg = &rgd->rd_rg; | ||
696 | u32 rg_flags; | 693 | u32 rg_flags; |
697 | 694 | ||
698 | rg_flags = be32_to_cpu(str->rg_flags); | 695 | rg_flags = be32_to_cpu(str->rg_flags); |
@@ -700,24 +697,23 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf) | |||
700 | rgd->rd_flags |= GFS2_RDF_NOALLOC; | 697 | rgd->rd_flags |= GFS2_RDF_NOALLOC; |
701 | else | 698 | else |
702 | rgd->rd_flags &= ~GFS2_RDF_NOALLOC; | 699 | rgd->rd_flags &= ~GFS2_RDF_NOALLOC; |
703 | rg->rg_free = be32_to_cpu(str->rg_free); | 700 | rgd->rd_free = be32_to_cpu(str->rg_free); |
704 | rg->rg_dinodes = be32_to_cpu(str->rg_dinodes); | 701 | rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes); |
705 | rg->rg_igeneration = be64_to_cpu(str->rg_igeneration); | 702 | rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration); |
706 | } | 703 | } |
707 | 704 | ||
708 | static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) | 705 | static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) |
709 | { | 706 | { |
710 | struct gfs2_rgrp *str = buf; | 707 | struct gfs2_rgrp *str = buf; |
711 | struct gfs2_rgrp_host *rg = &rgd->rd_rg; | ||
712 | u32 rg_flags = 0; | 708 | u32 rg_flags = 0; |
713 | 709 | ||
714 | if (rgd->rd_flags & GFS2_RDF_NOALLOC) | 710 | if (rgd->rd_flags & GFS2_RDF_NOALLOC) |
715 | rg_flags |= GFS2_RGF_NOALLOC; | 711 | rg_flags |= GFS2_RGF_NOALLOC; |
716 | str->rg_flags = cpu_to_be32(rg_flags); | 712 | str->rg_flags = cpu_to_be32(rg_flags); |
717 | str->rg_free = cpu_to_be32(rg->rg_free); | 713 | str->rg_free = cpu_to_be32(rgd->rd_free); |
718 | str->rg_dinodes = cpu_to_be32(rg->rg_dinodes); | 714 | str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes); |
719 | str->__pad = cpu_to_be32(0); | 715 | str->__pad = cpu_to_be32(0); |
720 | str->rg_igeneration = cpu_to_be64(rg->rg_igeneration); | 716 | str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration); |
721 | memset(&str->rg_reserved, 0, sizeof(str->rg_reserved)); | 717 | memset(&str->rg_reserved, 0, sizeof(str->rg_reserved)); |
722 | } | 718 | } |
723 | 719 | ||
@@ -776,7 +772,7 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) | |||
776 | } | 772 | } |
777 | 773 | ||
778 | spin_lock(&sdp->sd_rindex_spin); | 774 | spin_lock(&sdp->sd_rindex_spin); |
779 | rgd->rd_free_clone = rgd->rd_rg.rg_free; | 775 | rgd->rd_free_clone = rgd->rd_free; |
780 | rgd->rd_bh_count++; | 776 | rgd->rd_bh_count++; |
781 | spin_unlock(&sdp->sd_rindex_spin); | 777 | spin_unlock(&sdp->sd_rindex_spin); |
782 | 778 | ||
@@ -850,7 +846,7 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd) | |||
850 | } | 846 | } |
851 | 847 | ||
852 | spin_lock(&sdp->sd_rindex_spin); | 848 | spin_lock(&sdp->sd_rindex_spin); |
853 | rgd->rd_free_clone = rgd->rd_rg.rg_free; | 849 | rgd->rd_free_clone = rgd->rd_free; |
854 | spin_unlock(&sdp->sd_rindex_spin); | 850 | spin_unlock(&sdp->sd_rindex_spin); |
855 | } | 851 | } |
856 | 852 | ||
@@ -1403,8 +1399,8 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n) | |||
1403 | block = rgd->rd_data0 + blk; | 1399 | block = rgd->rd_data0 + blk; |
1404 | ip->i_goal = block; | 1400 | ip->i_goal = block; |
1405 | 1401 | ||
1406 | gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free >= *n); | 1402 | gfs2_assert_withdraw(sdp, rgd->rd_free >= *n); |
1407 | rgd->rd_rg.rg_free -= *n; | 1403 | rgd->rd_free -= *n; |
1408 | 1404 | ||
1409 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); | 1405 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); |
1410 | gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); | 1406 | gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); |
@@ -1445,10 +1441,10 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation) | |||
1445 | 1441 | ||
1446 | block = rgd->rd_data0 + blk; | 1442 | block = rgd->rd_data0 + blk; |
1447 | 1443 | ||
1448 | gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free); | 1444 | gfs2_assert_withdraw(sdp, rgd->rd_free); |
1449 | rgd->rd_rg.rg_free--; | 1445 | rgd->rd_free--; |
1450 | rgd->rd_rg.rg_dinodes++; | 1446 | rgd->rd_dinodes++; |
1451 | *generation = rgd->rd_rg.rg_igeneration++; | 1447 | *generation = rgd->rd_igeneration++; |
1452 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); | 1448 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); |
1453 | gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); | 1449 | gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); |
1454 | 1450 | ||
@@ -1481,7 +1477,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) | |||
1481 | if (!rgd) | 1477 | if (!rgd) |
1482 | return; | 1478 | return; |
1483 | 1479 | ||
1484 | rgd->rd_rg.rg_free += blen; | 1480 | rgd->rd_free += blen; |
1485 | 1481 | ||
1486 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); | 1482 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); |
1487 | gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); | 1483 | gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); |
@@ -1509,7 +1505,7 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen) | |||
1509 | if (!rgd) | 1505 | if (!rgd) |
1510 | return; | 1506 | return; |
1511 | 1507 | ||
1512 | rgd->rd_rg.rg_free += blen; | 1508 | rgd->rd_free += blen; |
1513 | 1509 | ||
1514 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); | 1510 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); |
1515 | gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); | 1511 | gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); |
@@ -1546,10 +1542,10 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) | |||
1546 | return; | 1542 | return; |
1547 | gfs2_assert_withdraw(sdp, rgd == tmp_rgd); | 1543 | gfs2_assert_withdraw(sdp, rgd == tmp_rgd); |
1548 | 1544 | ||
1549 | if (!rgd->rd_rg.rg_dinodes) | 1545 | if (!rgd->rd_dinodes) |
1550 | gfs2_consist_rgrpd(rgd); | 1546 | gfs2_consist_rgrpd(rgd); |
1551 | rgd->rd_rg.rg_dinodes--; | 1547 | rgd->rd_dinodes--; |
1552 | rgd->rd_rg.rg_free++; | 1548 | rgd->rd_free++; |
1553 | 1549 | ||
1554 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); | 1550 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); |
1555 | gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); | 1551 | gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); |
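The rgrp.c hunks above drop the embedded gfs2_rgrp_host and keep rd_free, rd_dinodes and rd_igeneration directly in struct gfs2_rgrpd; gfs2_rgrp_in()/gfs2_rgrp_out() remain the only places that translate between the native-endian in-core fields and the big-endian on-disk resource-group header. A minimal sketch of that in/out pattern, with illustrative struct and field names (kernel endian helpers assumed):

struct disk_rgrp { __be32 free; __be32 dinodes; __be64 igeneration; };
struct core_rgrp { u32 rd_free; u32 rd_dinodes; u64 rd_igeneration; };

static void rgrp_in(struct core_rgrp *rgd, const struct disk_rgrp *str)
{
	rgd->rd_free        = be32_to_cpu(str->free);
	rgd->rd_dinodes     = be32_to_cpu(str->dinodes);
	rgd->rd_igeneration = be64_to_cpu(str->igeneration);
}

static void rgrp_out(const struct core_rgrp *rgd, struct disk_rgrp *str)
{
	str->free        = cpu_to_be32(rgd->rd_free);
	str->dinodes     = cpu_to_be32(rgd->rd_dinodes);
	str->igeneration = cpu_to_be64(rgd->rd_igeneration);
}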
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index c3ba3d9d0aac..141b781f2fcc 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c | |||
@@ -34,76 +34,6 @@ | |||
34 | #include "util.h" | 34 | #include "util.h" |
35 | 35 | ||
36 | /** | 36 | /** |
37 | * gfs2_jindex_hold - Grab a lock on the jindex | ||
38 | * @sdp: The GFS2 superblock | ||
39 | * @ji_gh: the holder for the jindex glock | ||
40 | * | ||
41 | * This is very similar to the gfs2_rindex_hold() function, except that | ||
42 | * in general we hold the jindex lock for longer periods of time and | ||
43 | * we grab it far less frequently (in general) then the rgrp lock. | ||
44 | * | ||
45 | * Returns: errno | ||
46 | */ | ||
47 | |||
48 | int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) | ||
49 | { | ||
50 | struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex); | ||
51 | struct qstr name; | ||
52 | char buf[20]; | ||
53 | struct gfs2_jdesc *jd; | ||
54 | int error; | ||
55 | |||
56 | name.name = buf; | ||
57 | |||
58 | mutex_lock(&sdp->sd_jindex_mutex); | ||
59 | |||
60 | for (;;) { | ||
61 | error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh); | ||
62 | if (error) | ||
63 | break; | ||
64 | |||
65 | name.len = sprintf(buf, "journal%u", sdp->sd_journals); | ||
66 | name.hash = gfs2_disk_hash(name.name, name.len); | ||
67 | |||
68 | error = gfs2_dir_check(sdp->sd_jindex, &name, NULL); | ||
69 | if (error == -ENOENT) { | ||
70 | error = 0; | ||
71 | break; | ||
72 | } | ||
73 | |||
74 | gfs2_glock_dq_uninit(ji_gh); | ||
75 | |||
76 | if (error) | ||
77 | break; | ||
78 | |||
79 | error = -ENOMEM; | ||
80 | jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL); | ||
81 | if (!jd) | ||
82 | break; | ||
83 | |||
84 | INIT_LIST_HEAD(&jd->extent_list); | ||
85 | jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); | ||
86 | if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { | ||
87 | if (!jd->jd_inode) | ||
88 | error = -ENOENT; | ||
89 | else | ||
90 | error = PTR_ERR(jd->jd_inode); | ||
91 | kfree(jd); | ||
92 | break; | ||
93 | } | ||
94 | |||
95 | spin_lock(&sdp->sd_jindex_spin); | ||
96 | jd->jd_jid = sdp->sd_journals++; | ||
97 | list_add_tail(&jd->jd_list, &sdp->sd_jindex_list); | ||
98 | spin_unlock(&sdp->sd_jindex_spin); | ||
99 | } | ||
100 | |||
101 | mutex_unlock(&sdp->sd_jindex_mutex); | ||
102 | |||
103 | return error; | ||
104 | } | ||
105 | |||
106 | /** | ||
107 | * gfs2_jindex_free - Clear all the journal index information | 37 | * gfs2_jindex_free - Clear all the journal index information |
108 | * @sdp: The GFS2 superblock | 38 | * @sdp: The GFS2 superblock |
109 | * | 39 | * |
@@ -166,39 +96,6 @@ struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid) | |||
166 | return jd; | 96 | return jd; |
167 | } | 97 | } |
168 | 98 | ||
169 | void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid) | ||
170 | { | ||
171 | struct gfs2_jdesc *jd; | ||
172 | |||
173 | spin_lock(&sdp->sd_jindex_spin); | ||
174 | jd = jdesc_find_i(&sdp->sd_jindex_list, jid); | ||
175 | if (jd) | ||
176 | jd->jd_dirty = 1; | ||
177 | spin_unlock(&sdp->sd_jindex_spin); | ||
178 | } | ||
179 | |||
180 | struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp) | ||
181 | { | ||
182 | struct gfs2_jdesc *jd; | ||
183 | int found = 0; | ||
184 | |||
185 | spin_lock(&sdp->sd_jindex_spin); | ||
186 | |||
187 | list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { | ||
188 | if (jd->jd_dirty) { | ||
189 | jd->jd_dirty = 0; | ||
190 | found = 1; | ||
191 | break; | ||
192 | } | ||
193 | } | ||
194 | spin_unlock(&sdp->sd_jindex_spin); | ||
195 | |||
196 | if (!found) | ||
197 | jd = NULL; | ||
198 | |||
199 | return jd; | ||
200 | } | ||
201 | |||
202 | int gfs2_jdesc_check(struct gfs2_jdesc *jd) | 99 | int gfs2_jdesc_check(struct gfs2_jdesc *jd) |
203 | { | 100 | { |
204 | struct gfs2_inode *ip = GFS2_I(jd->jd_inode); | 101 | struct gfs2_inode *ip = GFS2_I(jd->jd_inode); |
@@ -206,14 +103,14 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd) | |||
206 | int ar; | 103 | int ar; |
207 | int error; | 104 | int error; |
208 | 105 | ||
209 | if (ip->i_di.di_size < (8 << 20) || ip->i_di.di_size > (1 << 30) || | 106 | if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) || |
210 | (ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1))) { | 107 | (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) { |
211 | gfs2_consist_inode(ip); | 108 | gfs2_consist_inode(ip); |
212 | return -EIO; | 109 | return -EIO; |
213 | } | 110 | } |
214 | jd->jd_blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; | 111 | jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; |
215 | 112 | ||
216 | error = gfs2_write_alloc_required(ip, 0, ip->i_di.di_size, &ar); | 113 | error = gfs2_write_alloc_required(ip, 0, ip->i_disksize, &ar); |
217 | if (!error && ar) { | 114 | if (!error && ar) { |
218 | gfs2_consist_inode(ip); | 115 | gfs2_consist_inode(ip); |
219 | error = -EIO; | 116 | error = -EIO; |
@@ -423,137 +320,6 @@ out: | |||
423 | return error; | 320 | return error; |
424 | } | 321 | } |
425 | 322 | ||
426 | /** | ||
427 | * gfs2_statfs_i - Do a statfs | ||
428 | * @sdp: the filesystem | ||
429 | * @sg: the sg structure | ||
430 | * | ||
431 | * Returns: errno | ||
432 | */ | ||
433 | |||
434 | int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) | ||
435 | { | ||
436 | struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; | ||
437 | struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; | ||
438 | |||
439 | spin_lock(&sdp->sd_statfs_spin); | ||
440 | |||
441 | *sc = *m_sc; | ||
442 | sc->sc_total += l_sc->sc_total; | ||
443 | sc->sc_free += l_sc->sc_free; | ||
444 | sc->sc_dinodes += l_sc->sc_dinodes; | ||
445 | |||
446 | spin_unlock(&sdp->sd_statfs_spin); | ||
447 | |||
448 | if (sc->sc_free < 0) | ||
449 | sc->sc_free = 0; | ||
450 | if (sc->sc_free > sc->sc_total) | ||
451 | sc->sc_free = sc->sc_total; | ||
452 | if (sc->sc_dinodes < 0) | ||
453 | sc->sc_dinodes = 0; | ||
454 | |||
455 | return 0; | ||
456 | } | ||
457 | |||
458 | /** | ||
459 | * statfs_fill - fill in the sg for a given RG | ||
460 | * @rgd: the RG | ||
461 | * @sc: the sc structure | ||
462 | * | ||
463 | * Returns: 0 on success, -ESTALE if the LVB is invalid | ||
464 | */ | ||
465 | |||
466 | static int statfs_slow_fill(struct gfs2_rgrpd *rgd, | ||
467 | struct gfs2_statfs_change_host *sc) | ||
468 | { | ||
469 | gfs2_rgrp_verify(rgd); | ||
470 | sc->sc_total += rgd->rd_data; | ||
471 | sc->sc_free += rgd->rd_rg.rg_free; | ||
472 | sc->sc_dinodes += rgd->rd_rg.rg_dinodes; | ||
473 | return 0; | ||
474 | } | ||
475 | |||
476 | /** | ||
477 | * gfs2_statfs_slow - Stat a filesystem using asynchronous locking | ||
478 | * @sdp: the filesystem | ||
479 | * @sc: the sc info that will be returned | ||
480 | * | ||
481 | * Any error (other than a signal) will cause this routine to fall back | ||
482 | * to the synchronous version. | ||
483 | * | ||
484 | * FIXME: This really shouldn't busy wait like this. | ||
485 | * | ||
486 | * Returns: errno | ||
487 | */ | ||
488 | |||
489 | int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) | ||
490 | { | ||
491 | struct gfs2_holder ri_gh; | ||
492 | struct gfs2_rgrpd *rgd_next; | ||
493 | struct gfs2_holder *gha, *gh; | ||
494 | unsigned int slots = 64; | ||
495 | unsigned int x; | ||
496 | int done; | ||
497 | int error = 0, err; | ||
498 | |||
499 | memset(sc, 0, sizeof(struct gfs2_statfs_change_host)); | ||
500 | gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL); | ||
501 | if (!gha) | ||
502 | return -ENOMEM; | ||
503 | |||
504 | error = gfs2_rindex_hold(sdp, &ri_gh); | ||
505 | if (error) | ||
506 | goto out; | ||
507 | |||
508 | rgd_next = gfs2_rgrpd_get_first(sdp); | ||
509 | |||
510 | for (;;) { | ||
511 | done = 1; | ||
512 | |||
513 | for (x = 0; x < slots; x++) { | ||
514 | gh = gha + x; | ||
515 | |||
516 | if (gh->gh_gl && gfs2_glock_poll(gh)) { | ||
517 | err = gfs2_glock_wait(gh); | ||
518 | if (err) { | ||
519 | gfs2_holder_uninit(gh); | ||
520 | error = err; | ||
521 | } else { | ||
522 | if (!error) | ||
523 | error = statfs_slow_fill( | ||
524 | gh->gh_gl->gl_object, sc); | ||
525 | gfs2_glock_dq_uninit(gh); | ||
526 | } | ||
527 | } | ||
528 | |||
529 | if (gh->gh_gl) | ||
530 | done = 0; | ||
531 | else if (rgd_next && !error) { | ||
532 | error = gfs2_glock_nq_init(rgd_next->rd_gl, | ||
533 | LM_ST_SHARED, | ||
534 | GL_ASYNC, | ||
535 | gh); | ||
536 | rgd_next = gfs2_rgrpd_get_next(rgd_next); | ||
537 | done = 0; | ||
538 | } | ||
539 | |||
540 | if (signal_pending(current)) | ||
541 | error = -ERESTARTSYS; | ||
542 | } | ||
543 | |||
544 | if (done) | ||
545 | break; | ||
546 | |||
547 | yield(); | ||
548 | } | ||
549 | |||
550 | gfs2_glock_dq_uninit(&ri_gh); | ||
551 | |||
552 | out: | ||
553 | kfree(gha); | ||
554 | return error; | ||
555 | } | ||
556 | |||
557 | struct lfcc { | 323 | struct lfcc { |
558 | struct list_head list; | 324 | struct list_head list; |
559 | struct gfs2_holder gh; | 325 | struct gfs2_holder gh; |
@@ -580,10 +346,6 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, | |||
580 | struct gfs2_log_header_host lh; | 346 | struct gfs2_log_header_host lh; |
581 | int error; | 347 | int error; |
582 | 348 | ||
583 | error = gfs2_jindex_hold(sdp, &ji_gh); | ||
584 | if (error) | ||
585 | return error; | ||
586 | |||
587 | list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { | 349 | list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { |
588 | lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL); | 350 | lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL); |
589 | if (!lfcc) { | 351 | if (!lfcc) { |
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index 50a4c9b1215e..f6b8b00ad881 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h | |||
@@ -10,6 +10,8 @@ | |||
10 | #ifndef __SUPER_DOT_H__ | 10 | #ifndef __SUPER_DOT_H__ |
11 | #define __SUPER_DOT_H__ | 11 | #define __SUPER_DOT_H__ |
12 | 12 | ||
13 | #include <linux/fs.h> | ||
14 | #include <linux/dcache.h> | ||
13 | #include "incore.h" | 15 | #include "incore.h" |
14 | 16 | ||
15 | void gfs2_lm_unmount(struct gfs2_sbd *sdp); | 17 | void gfs2_lm_unmount(struct gfs2_sbd *sdp); |
@@ -23,12 +25,9 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) | |||
23 | return x; | 25 | return x; |
24 | } | 26 | } |
25 | 27 | ||
26 | int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh); | ||
27 | void gfs2_jindex_free(struct gfs2_sbd *sdp); | 28 | void gfs2_jindex_free(struct gfs2_sbd *sdp); |
28 | 29 | ||
29 | struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); | 30 | struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); |
30 | void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid); | ||
31 | struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp); | ||
32 | int gfs2_jdesc_check(struct gfs2_jdesc *jd); | 31 | int gfs2_jdesc_check(struct gfs2_jdesc *jd); |
33 | 32 | ||
34 | int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, | 33 | int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, |
@@ -40,11 +39,15 @@ int gfs2_statfs_init(struct gfs2_sbd *sdp); | |||
40 | void gfs2_statfs_change(struct gfs2_sbd *sdp, | 39 | void gfs2_statfs_change(struct gfs2_sbd *sdp, |
41 | s64 total, s64 free, s64 dinodes); | 40 | s64 total, s64 free, s64 dinodes); |
42 | int gfs2_statfs_sync(struct gfs2_sbd *sdp); | 41 | int gfs2_statfs_sync(struct gfs2_sbd *sdp); |
43 | int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc); | ||
44 | int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc); | ||
45 | 42 | ||
46 | int gfs2_freeze_fs(struct gfs2_sbd *sdp); | 43 | int gfs2_freeze_fs(struct gfs2_sbd *sdp); |
47 | void gfs2_unfreeze_fs(struct gfs2_sbd *sdp); | 44 | void gfs2_unfreeze_fs(struct gfs2_sbd *sdp); |
48 | 45 | ||
46 | extern struct file_system_type gfs2_fs_type; | ||
47 | extern struct file_system_type gfs2meta_fs_type; | ||
48 | extern const struct export_operations gfs2_export_ops; | ||
49 | extern const struct super_operations gfs2_super_ops; | ||
50 | extern struct dentry_operations gfs2_dops; | ||
51 | |||
49 | #endif /* __SUPER_DOT_H__ */ | 52 | #endif /* __SUPER_DOT_H__ */ |
50 | 53 | ||
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 7e1879f1a02c..26c1fa777a95 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c | |||
@@ -26,9 +26,6 @@ | |||
26 | #include "quota.h" | 26 | #include "quota.h" |
27 | #include "util.h" | 27 | #include "util.h" |
28 | 28 | ||
29 | char *gfs2_sys_margs; | ||
30 | spinlock_t gfs2_sys_margs_lock; | ||
31 | |||
32 | static ssize_t id_show(struct gfs2_sbd *sdp, char *buf) | 29 | static ssize_t id_show(struct gfs2_sbd *sdp, char *buf) |
33 | { | 30 | { |
34 | return snprintf(buf, PAGE_SIZE, "%u:%u\n", | 31 | return snprintf(buf, PAGE_SIZE, "%u:%u\n", |
@@ -263,7 +260,6 @@ ARGS_ATTR(localcaching, "%d\n"); | |||
263 | ARGS_ATTR(localflocks, "%d\n"); | 260 | ARGS_ATTR(localflocks, "%d\n"); |
264 | ARGS_ATTR(debug, "%d\n"); | 261 | ARGS_ATTR(debug, "%d\n"); |
265 | ARGS_ATTR(upgrade, "%d\n"); | 262 | ARGS_ATTR(upgrade, "%d\n"); |
266 | ARGS_ATTR(num_glockd, "%u\n"); | ||
267 | ARGS_ATTR(posix_acl, "%d\n"); | 263 | ARGS_ATTR(posix_acl, "%d\n"); |
268 | ARGS_ATTR(quota, "%u\n"); | 264 | ARGS_ATTR(quota, "%u\n"); |
269 | ARGS_ATTR(suiddir, "%d\n"); | 265 | ARGS_ATTR(suiddir, "%d\n"); |
@@ -279,7 +275,6 @@ static struct attribute *args_attrs[] = { | |||
279 | &args_attr_localflocks.attr, | 275 | &args_attr_localflocks.attr, |
280 | &args_attr_debug.attr, | 276 | &args_attr_debug.attr, |
281 | &args_attr_upgrade.attr, | 277 | &args_attr_upgrade.attr, |
282 | &args_attr_num_glockd.attr, | ||
283 | &args_attr_posix_acl.attr, | 278 | &args_attr_posix_acl.attr, |
284 | &args_attr_quota.attr, | 279 | &args_attr_quota.attr, |
285 | &args_attr_suiddir.attr, | 280 | &args_attr_suiddir.attr, |
@@ -288,30 +283,6 @@ static struct attribute *args_attrs[] = { | |||
288 | }; | 283 | }; |
289 | 284 | ||
290 | /* | 285 | /* |
291 | * display counters from superblock | ||
292 | */ | ||
293 | |||
294 | struct counters_attr { | ||
295 | struct attribute attr; | ||
296 | ssize_t (*show)(struct gfs2_sbd *, char *); | ||
297 | }; | ||
298 | |||
299 | #define COUNTERS_ATTR(name, fmt) \ | ||
300 | static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ | ||
301 | { \ | ||
302 | return snprintf(buf, PAGE_SIZE, fmt, \ | ||
303 | (unsigned int)atomic_read(&sdp->sd_##name)); \ | ||
304 | } \ | ||
305 | static struct counters_attr counters_attr_##name = __ATTR_RO(name) | ||
306 | |||
307 | COUNTERS_ATTR(reclaimed, "%u\n"); | ||
308 | |||
309 | static struct attribute *counters_attrs[] = { | ||
310 | &counters_attr_reclaimed.attr, | ||
311 | NULL, | ||
312 | }; | ||
313 | |||
314 | /* | ||
315 | * get and set struct gfs2_tune fields | 286 | * get and set struct gfs2_tune fields |
316 | */ | 287 | */ |
317 | 288 | ||
@@ -393,7 +364,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\ | |||
393 | } \ | 364 | } \ |
394 | TUNE_ATTR_2(name, name##_store) | 365 | TUNE_ATTR_2(name, name##_store) |
395 | 366 | ||
396 | TUNE_ATTR(demote_secs, 0); | ||
397 | TUNE_ATTR(incore_log_blocks, 0); | 367 | TUNE_ATTR(incore_log_blocks, 0); |
398 | TUNE_ATTR(log_flush_secs, 0); | 368 | TUNE_ATTR(log_flush_secs, 0); |
399 | TUNE_ATTR(quota_warn_period, 0); | 369 | TUNE_ATTR(quota_warn_period, 0); |
@@ -408,11 +378,9 @@ TUNE_ATTR(stall_secs, 1); | |||
408 | TUNE_ATTR(statfs_quantum, 1); | 378 | TUNE_ATTR(statfs_quantum, 1); |
409 | TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process); | 379 | TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process); |
410 | TUNE_ATTR_DAEMON(logd_secs, logd_process); | 380 | TUNE_ATTR_DAEMON(logd_secs, logd_process); |
411 | TUNE_ATTR_DAEMON(quotad_secs, quotad_process); | ||
412 | TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); | 381 | TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); |
413 | 382 | ||
414 | static struct attribute *tune_attrs[] = { | 383 | static struct attribute *tune_attrs[] = { |
415 | &tune_attr_demote_secs.attr, | ||
416 | &tune_attr_incore_log_blocks.attr, | 384 | &tune_attr_incore_log_blocks.attr, |
417 | &tune_attr_log_flush_secs.attr, | 385 | &tune_attr_log_flush_secs.attr, |
418 | &tune_attr_quota_warn_period.attr, | 386 | &tune_attr_quota_warn_period.attr, |
@@ -426,7 +394,6 @@ static struct attribute *tune_attrs[] = { | |||
426 | &tune_attr_statfs_quantum.attr, | 394 | &tune_attr_statfs_quantum.attr, |
427 | &tune_attr_recoverd_secs.attr, | 395 | &tune_attr_recoverd_secs.attr, |
428 | &tune_attr_logd_secs.attr, | 396 | &tune_attr_logd_secs.attr, |
429 | &tune_attr_quotad_secs.attr, | ||
430 | &tune_attr_quota_scale.attr, | 397 | &tune_attr_quota_scale.attr, |
431 | &tune_attr_new_files_jdata.attr, | 398 | &tune_attr_new_files_jdata.attr, |
432 | NULL, | 399 | NULL, |
@@ -437,11 +404,6 @@ static struct attribute_group lockstruct_group = { | |||
437 | .attrs = lockstruct_attrs, | 404 | .attrs = lockstruct_attrs, |
438 | }; | 405 | }; |
439 | 406 | ||
440 | static struct attribute_group counters_group = { | ||
441 | .name = "counters", | ||
442 | .attrs = counters_attrs, | ||
443 | }; | ||
444 | |||
445 | static struct attribute_group args_group = { | 407 | static struct attribute_group args_group = { |
446 | .name = "args", | 408 | .name = "args", |
447 | .attrs = args_attrs, | 409 | .attrs = args_attrs, |
@@ -466,13 +428,9 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp) | |||
466 | if (error) | 428 | if (error) |
467 | goto fail_reg; | 429 | goto fail_reg; |
468 | 430 | ||
469 | error = sysfs_create_group(&sdp->sd_kobj, &counters_group); | ||
470 | if (error) | ||
471 | goto fail_lockstruct; | ||
472 | |||
473 | error = sysfs_create_group(&sdp->sd_kobj, &args_group); | 431 | error = sysfs_create_group(&sdp->sd_kobj, &args_group); |
474 | if (error) | 432 | if (error) |
475 | goto fail_counters; | 433 | goto fail_lockstruct; |
476 | 434 | ||
477 | error = sysfs_create_group(&sdp->sd_kobj, &tune_group); | 435 | error = sysfs_create_group(&sdp->sd_kobj, &tune_group); |
478 | if (error) | 436 | if (error) |
@@ -483,8 +441,6 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp) | |||
483 | 441 | ||
484 | fail_args: | 442 | fail_args: |
485 | sysfs_remove_group(&sdp->sd_kobj, &args_group); | 443 | sysfs_remove_group(&sdp->sd_kobj, &args_group); |
486 | fail_counters: | ||
487 | sysfs_remove_group(&sdp->sd_kobj, &counters_group); | ||
488 | fail_lockstruct: | 444 | fail_lockstruct: |
489 | sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); | 445 | sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); |
490 | fail_reg: | 446 | fail_reg: |
@@ -498,16 +454,27 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp) | |||
498 | { | 454 | { |
499 | sysfs_remove_group(&sdp->sd_kobj, &tune_group); | 455 | sysfs_remove_group(&sdp->sd_kobj, &tune_group); |
500 | sysfs_remove_group(&sdp->sd_kobj, &args_group); | 456 | sysfs_remove_group(&sdp->sd_kobj, &args_group); |
501 | sysfs_remove_group(&sdp->sd_kobj, &counters_group); | ||
502 | sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); | 457 | sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); |
503 | kobject_put(&sdp->sd_kobj); | 458 | kobject_put(&sdp->sd_kobj); |
504 | } | 459 | } |
505 | 460 | ||
461 | static int gfs2_uevent(struct kset *kset, struct kobject *kobj, | ||
462 | struct kobj_uevent_env *env) | ||
463 | { | ||
464 | struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); | ||
465 | add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); | ||
466 | add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); | ||
467 | return 0; | ||
468 | } | ||
469 | |||
470 | static struct kset_uevent_ops gfs2_uevent_ops = { | ||
471 | .uevent = gfs2_uevent, | ||
472 | }; | ||
473 | |||
474 | |||
506 | int gfs2_sys_init(void) | 475 | int gfs2_sys_init(void) |
507 | { | 476 | { |
508 | gfs2_sys_margs = NULL; | 477 | gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj); |
509 | spin_lock_init(&gfs2_sys_margs_lock); | ||
510 | gfs2_kset = kset_create_and_add("gfs2", NULL, fs_kobj); | ||
511 | if (!gfs2_kset) | 478 | if (!gfs2_kset) |
512 | return -ENOMEM; | 479 | return -ENOMEM; |
513 | return 0; | 480 | return 0; |
@@ -515,7 +482,6 @@ int gfs2_sys_init(void) | |||
515 | 482 | ||
516 | void gfs2_sys_uninit(void) | 483 | void gfs2_sys_uninit(void) |
517 | { | 484 | { |
518 | kfree(gfs2_sys_margs); | ||
519 | kset_unregister(gfs2_kset); | 485 | kset_unregister(gfs2_kset); |
520 | } | 486 | } |
521 | 487 | ||
diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h index 1ca8cdac5304..e94560e836d7 100644 --- a/fs/gfs2/sys.h +++ b/fs/gfs2/sys.h | |||
@@ -13,10 +13,6 @@ | |||
13 | #include <linux/spinlock.h> | 13 | #include <linux/spinlock.h> |
14 | struct gfs2_sbd; | 14 | struct gfs2_sbd; |
15 | 15 | ||
16 | /* Allow args to be passed to GFS2 when using an initial ram disk */ | ||
17 | extern char *gfs2_sys_margs; | ||
18 | extern spinlock_t gfs2_sys_margs_lock; | ||
19 | |||
20 | int gfs2_sys_fs_add(struct gfs2_sbd *sdp); | 16 | int gfs2_sys_fs_add(struct gfs2_sbd *sdp); |
21 | void gfs2_sys_fs_del(struct gfs2_sbd *sdp); | 17 | void gfs2_sys_fs_del(struct gfs2_sbd *sdp); |
22 | 18 | ||
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index d31e355c61fb..374f50e95496 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c | |||
@@ -25,6 +25,7 @@ struct kmem_cache *gfs2_glock_cachep __read_mostly; | |||
25 | struct kmem_cache *gfs2_inode_cachep __read_mostly; | 25 | struct kmem_cache *gfs2_inode_cachep __read_mostly; |
26 | struct kmem_cache *gfs2_bufdata_cachep __read_mostly; | 26 | struct kmem_cache *gfs2_bufdata_cachep __read_mostly; |
27 | struct kmem_cache *gfs2_rgrpd_cachep __read_mostly; | 27 | struct kmem_cache *gfs2_rgrpd_cachep __read_mostly; |
28 | struct kmem_cache *gfs2_quotad_cachep __read_mostly; | ||
28 | 29 | ||
29 | void gfs2_assert_i(struct gfs2_sbd *sdp) | 30 | void gfs2_assert_i(struct gfs2_sbd *sdp) |
30 | { | 31 | { |
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index 7f48576289c9..33e96b0ce9ab 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h | |||
@@ -148,6 +148,7 @@ extern struct kmem_cache *gfs2_glock_cachep; | |||
148 | extern struct kmem_cache *gfs2_inode_cachep; | 148 | extern struct kmem_cache *gfs2_inode_cachep; |
149 | extern struct kmem_cache *gfs2_bufdata_cachep; | 149 | extern struct kmem_cache *gfs2_bufdata_cachep; |
150 | extern struct kmem_cache *gfs2_rgrpd_cachep; | 150 | extern struct kmem_cache *gfs2_rgrpd_cachep; |
151 | extern struct kmem_cache *gfs2_quotad_cachep; | ||
151 | 152 | ||
152 | static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, | 153 | static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, |
153 | unsigned int *p) | 154 | unsigned int *p) |
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 7d479ce3aceb..6903d37af037 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c | |||
@@ -252,6 +252,7 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, | |||
252 | for (;;) { | 252 | for (;;) { |
253 | struct page *page; | 253 | struct page *page; |
254 | unsigned long nr, ret; | 254 | unsigned long nr, ret; |
255 | int ra; | ||
255 | 256 | ||
256 | /* nr is the maximum number of bytes to copy from this page */ | 257 | /* nr is the maximum number of bytes to copy from this page */ |
257 | nr = huge_page_size(h); | 258 | nr = huge_page_size(h); |
@@ -274,16 +275,19 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, | |||
274 | */ | 275 | */ |
275 | ret = len < nr ? len : nr; | 276 | ret = len < nr ? len : nr; |
276 | if (clear_user(buf, ret)) | 277 | if (clear_user(buf, ret)) |
277 | ret = -EFAULT; | 278 | ra = -EFAULT; |
279 | else | ||
280 | ra = 0; | ||
278 | } else { | 281 | } else { |
279 | /* | 282 | /* |
280 | * We have the page, copy it to user space buffer. | 283 | * We have the page, copy it to user space buffer. |
281 | */ | 284 | */ |
282 | ret = hugetlbfs_read_actor(page, offset, buf, len, nr); | 285 | ra = hugetlbfs_read_actor(page, offset, buf, len, nr); |
286 | ret = ra; | ||
283 | } | 287 | } |
284 | if (ret < 0) { | 288 | if (ra < 0) { |
285 | if (retval == 0) | 289 | if (retval == 0) |
286 | retval = ret; | 290 | retval = ra; |
287 | if (page) | 291 | if (page) |
288 | page_cache_release(page); | 292 | page_cache_release(page); |
289 | goto out; | 293 | goto out; |
@@ -506,7 +510,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, | |||
506 | inode->i_mode = mode; | 510 | inode->i_mode = mode; |
507 | inode->i_uid = uid; | 511 | inode->i_uid = uid; |
508 | inode->i_gid = gid; | 512 | inode->i_gid = gid; |
509 | inode->i_blocks = 0; | ||
510 | inode->i_mapping->a_ops = &hugetlbfs_aops; | 513 | inode->i_mapping->a_ops = &hugetlbfs_aops; |
511 | inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; | 514 | inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; |
512 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 515 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
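The hugetlbfs_read() hunk above fixes a signedness bug: ret is an unsigned long, so a negative errno stored in it could never satisfy the "ret < 0" test and the error path was skipped. The fix carries the error in a signed int (ra) instead. A tiny userspace illustration of the pitfall (not from the patch):

#include <errno.h>
#include <stdio.h>

int main(void)
{
	unsigned long ret = -EFAULT;	/* wraps to a huge positive value */
	int ra = -EFAULT;		/* stays negative */

	if (ret < 0)			/* always false: ret is unsigned */
		printf("never printed\n");
	if (ra < 0)			/* true: the error is visible */
		printf("error %d detected\n", ra);
	return 0;
}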
diff --git a/fs/inode.c b/fs/inode.c index 7de1cda92489..0013ac1af8e7 100644 --- a/fs/inode.c +++ b/fs/inode.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/bootmem.h> | 22 | #include <linux/bootmem.h> |
23 | #include <linux/inotify.h> | 23 | #include <linux/inotify.h> |
24 | #include <linux/mount.h> | 24 | #include <linux/mount.h> |
25 | #include <linux/async.h> | ||
25 | 26 | ||
26 | /* | 27 | /* |
27 | * This is needed for the following functions: | 28 | * This is needed for the following functions: |
@@ -110,8 +111,8 @@ static void wake_up_inode(struct inode *inode) | |||
110 | 111 | ||
111 | /** | 112 | /** |
112 | * inode_init_always - perform inode structure intialisation | 113 | * inode_init_always - perform inode structure intialisation |
113 | * @sb - superblock inode belongs to. | 114 | * @sb: superblock inode belongs to |
114 | * @inode - inode to initialise | 115 | * @inode: inode to initialise |
115 | * | 116 | * |
116 | * These are initializations that need to be done on every inode | 117 | * These are initializations that need to be done on every inode |
117 | * allocation as the fields are not initialised by slab allocation. | 118 | * allocation as the fields are not initialised by slab allocation. |
@@ -131,6 +132,8 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) | |||
131 | inode->i_op = &empty_iops; | 132 | inode->i_op = &empty_iops; |
132 | inode->i_fop = &empty_fops; | 133 | inode->i_fop = &empty_fops; |
133 | inode->i_nlink = 1; | 134 | inode->i_nlink = 1; |
135 | inode->i_uid = 0; | ||
136 | inode->i_gid = 0; | ||
134 | atomic_set(&inode->i_writecount, 0); | 137 | atomic_set(&inode->i_writecount, 0); |
135 | inode->i_size = 0; | 138 | inode->i_size = 0; |
136 | inode->i_blocks = 0; | 139 | inode->i_blocks = 0; |
@@ -164,7 +167,7 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) | |||
164 | mapping->a_ops = &empty_aops; | 167 | mapping->a_ops = &empty_aops; |
165 | mapping->host = inode; | 168 | mapping->host = inode; |
166 | mapping->flags = 0; | 169 | mapping->flags = 0; |
167 | mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE); | 170 | mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); |
168 | mapping->assoc_mapping = NULL; | 171 | mapping->assoc_mapping = NULL; |
169 | mapping->backing_dev_info = &default_backing_dev_info; | 172 | mapping->backing_dev_info = &default_backing_dev_info; |
170 | mapping->writeback_index = 0; | 173 | mapping->writeback_index = 0; |
@@ -574,8 +577,8 @@ __inode_add_to_lists(struct super_block *sb, struct hlist_head *head, | |||
574 | 577 | ||
575 | /** | 578 | /** |
576 | * inode_add_to_lists - add a new inode to relevant lists | 579 | * inode_add_to_lists - add a new inode to relevant lists |
577 | * @sb - superblock inode belongs to. | 580 | * @sb: superblock inode belongs to |
578 | * @inode - inode to mark in use | 581 | * @inode: inode to mark in use |
579 | * | 582 | * |
580 | * When an inode is allocated it needs to be accounted for, added to the in use | 583 | * When an inode is allocated it needs to be accounted for, added to the in use |
581 | * list, the owning superblock and the inode hash. This needs to be done under | 584 | * list, the owning superblock and the inode hash. This needs to be done under |
@@ -599,7 +602,7 @@ EXPORT_SYMBOL_GPL(inode_add_to_lists); | |||
599 | * @sb: superblock | 602 | * @sb: superblock |
600 | * | 603 | * |
601 | * Allocates a new inode for given superblock. The default gfp_mask | 604 | * Allocates a new inode for given superblock. The default gfp_mask |
602 | * for allocations related to inode->i_mapping is GFP_HIGHUSER_PAGECACHE. | 605 | * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE. |
603 | * If HIGHMEM pages are unsuitable or it is known that pages allocated | 606 | * If HIGHMEM pages are unsuitable or it is known that pages allocated |
604 | * for the page cache are not reclaimable or migratable, | 607 | * for the page cache are not reclaimable or migratable, |
605 | * mapping_set_gfp_mask() must be called with suitable flags on the | 608 | * mapping_set_gfp_mask() must be called with suitable flags on the |
@@ -1136,16 +1139,11 @@ EXPORT_SYMBOL(remove_inode_hash); | |||
1136 | * I_FREEING is set so that no-one will take a new reference to the inode while | 1139 | * I_FREEING is set so that no-one will take a new reference to the inode while |
1137 | * it is being deleted. | 1140 | * it is being deleted. |
1138 | */ | 1141 | */ |
1139 | void generic_delete_inode(struct inode *inode) | 1142 | static void generic_delete_inode_async(void *data, async_cookie_t cookie) |
1140 | { | 1143 | { |
1144 | struct inode *inode = data; | ||
1141 | const struct super_operations *op = inode->i_sb->s_op; | 1145 | const struct super_operations *op = inode->i_sb->s_op; |
1142 | 1146 | ||
1143 | list_del_init(&inode->i_list); | ||
1144 | list_del_init(&inode->i_sb_list); | ||
1145 | inode->i_state |= I_FREEING; | ||
1146 | inodes_stat.nr_inodes--; | ||
1147 | spin_unlock(&inode_lock); | ||
1148 | |||
1149 | security_inode_delete(inode); | 1147 | security_inode_delete(inode); |
1150 | 1148 | ||
1151 | if (op->delete_inode) { | 1149 | if (op->delete_inode) { |
@@ -1169,6 +1167,16 @@ void generic_delete_inode(struct inode *inode) | |||
1169 | destroy_inode(inode); | 1167 | destroy_inode(inode); |
1170 | } | 1168 | } |
1171 | 1169 | ||
1170 | void generic_delete_inode(struct inode *inode) | ||
1171 | { | ||
1172 | list_del_init(&inode->i_list); | ||
1173 | list_del_init(&inode->i_sb_list); | ||
1174 | inode->i_state |= I_FREEING; | ||
1175 | inodes_stat.nr_inodes--; | ||
1176 | spin_unlock(&inode_lock); | ||
1177 | async_schedule_special(generic_delete_inode_async, inode, &inode->i_sb->s_async_list); | ||
1178 | } | ||
1179 | |||
1172 | EXPORT_SYMBOL(generic_delete_inode); | 1180 | EXPORT_SYMBOL(generic_delete_inode); |
1173 | 1181 | ||
1174 | static void generic_forget_inode(struct inode *inode) | 1182 | static void generic_forget_inode(struct inode *inode) |
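
The reshuffle above keeps the list manipulation and the I_FREEING marking synchronous under inode_lock, while the potentially slow ->delete_inode work now runs as an async function serialized on the superblock's s_async_list. A hedged sketch of the same split in isolation; the object and function names here are illustrative, and async_schedule_special() is used with the three-argument form seen in the hunk:

#include <linux/async.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

/* plays the role of sb->s_async_list: one domain to serialize teardown work */
static LIST_HEAD(teardown_domain);

struct my_object {
	struct list_head link;		/* on a list protected by the caller's lock */
};

/* slow half: runs later via the async framework, off the caller's path */
static void my_object_teardown(void *data, async_cookie_t cookie)
{
	struct my_object *obj = data;

	kfree(obj);			/* stands in for the ->delete_inode work */
}

/* fast half: only quick bookkeeping happens under the lock */
static void my_object_drop(struct my_object *obj, spinlock_t *lock)
{
	spin_lock(lock);
	list_del_init(&obj->link);
	spin_unlock(lock);
	async_schedule_special(my_object_teardown, obj, &teardown_domain);
}
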
diff --git a/fs/ioctl.c b/fs/ioctl.c index 43e8b2c0664b..cc3f1aa1cf7b 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c | |||
@@ -231,7 +231,8 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg) | |||
231 | #define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits) | 231 | #define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits) |
232 | #define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits); | 232 | #define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits); |
233 | 233 | ||
234 | /* | 234 | /** |
235 | * __generic_block_fiemap - FIEMAP for block based inodes (no locking) | ||
235 | * @inode - the inode to map | 236 | * @inode - the inode to map |
236 | * @arg - the pointer to userspace where we copy everything to | 237 | * @arg - the pointer to userspace where we copy everything to |
237 | * @get_block - the fs's get_block function | 238 | * @get_block - the fs's get_block function |
@@ -242,11 +243,15 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg) | |||
242 | * | 243 | * |
243 | * If it is possible to have data blocks beyond a hole past @inode->i_size, then | 244 | * If it is possible to have data blocks beyond a hole past @inode->i_size, then |
244 | * please do not use this function, it will stop at the first unmapped block | 245 | * please do not use this function, it will stop at the first unmapped block |
245 | * beyond i_size | 246 | * beyond i_size. |
247 | * | ||
248 | * If you use this function directly, you need to do your own locking. Use | ||
249 | * generic_block_fiemap if you want the locking done for you. | ||
246 | */ | 250 | */ |
247 | int generic_block_fiemap(struct inode *inode, | 251 | |
248 | struct fiemap_extent_info *fieinfo, u64 start, | 252 | int __generic_block_fiemap(struct inode *inode, |
249 | u64 len, get_block_t *get_block) | 253 | struct fiemap_extent_info *fieinfo, u64 start, |
254 | u64 len, get_block_t *get_block) | ||
250 | { | 255 | { |
251 | struct buffer_head tmp; | 256 | struct buffer_head tmp; |
252 | unsigned int start_blk; | 257 | unsigned int start_blk; |
@@ -260,9 +265,6 @@ int generic_block_fiemap(struct inode *inode, | |||
260 | 265 | ||
261 | start_blk = logical_to_blk(inode, start); | 266 | start_blk = logical_to_blk(inode, start); |
262 | 267 | ||
263 | /* guard against change */ | ||
264 | mutex_lock(&inode->i_mutex); | ||
265 | |||
266 | length = (long long)min_t(u64, len, i_size_read(inode)); | 268 | length = (long long)min_t(u64, len, i_size_read(inode)); |
267 | map_len = length; | 269 | map_len = length; |
268 | 270 | ||
@@ -334,14 +336,36 @@ int generic_block_fiemap(struct inode *inode, | |||
334 | cond_resched(); | 336 | cond_resched(); |
335 | } while (1); | 337 | } while (1); |
336 | 338 | ||
337 | mutex_unlock(&inode->i_mutex); | ||
338 | |||
339 | /* if ret is 1 then we just hit the end of the extent array */ | 339 | /* if ret is 1 then we just hit the end of the extent array */ |
340 | if (ret == 1) | 340 | if (ret == 1) |
341 | ret = 0; | 341 | ret = 0; |
342 | 342 | ||
343 | return ret; | 343 | return ret; |
344 | } | 344 | } |
345 | EXPORT_SYMBOL(__generic_block_fiemap); | ||
346 | |||
347 | /** | ||
348 | * generic_block_fiemap - FIEMAP for block based inodes | ||
349 | * @inode: The inode to map | ||
350 | * @fieinfo: The mapping information | ||
351 | * @start: The initial block to map | ||
352 | * @len: The length of the extent to attempt to map | ||
353 | * @get_block: The block mapping function for the fs | ||
354 | * | ||
355 | * Calls __generic_block_fiemap to map the inode, after taking | ||
356 | * the inode's mutex lock. | ||
357 | */ | ||
358 | |||
359 | int generic_block_fiemap(struct inode *inode, | ||
360 | struct fiemap_extent_info *fieinfo, u64 start, | ||
361 | u64 len, get_block_t *get_block) | ||
362 | { | ||
363 | int ret; | ||
364 | mutex_lock(&inode->i_mutex); | ||
365 | ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block); | ||
366 | mutex_unlock(&inode->i_mutex); | ||
367 | return ret; | ||
368 | } | ||
345 | EXPORT_SYMBOL(generic_block_fiemap); | 369 | EXPORT_SYMBOL(generic_block_fiemap); |
346 | 370 | ||
347 | #endif /* CONFIG_BLOCK */ | 371 | #endif /* CONFIG_BLOCK */ |
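
Splitting the helper gives filesystems that already serialize on i_mutex (or that must not take it at this point) a lock-free entry point, while generic_block_fiemap() remains the drop-in variant that handles the locking itself. A hedged sketch of how a filesystem's ->fiemap method might use either form; foo_get_block is a placeholder for that filesystem's get_block_t routine:

#include <linux/fs.h>
#include <linux/buffer_head.h>

/* placeholder for the filesystem's block-mapping routine */
static int foo_get_block(struct inode *inode, sector_t iblock,
			 struct buffer_head *bh_result, int create);

/* simple case: let the helper take and drop inode->i_mutex for us */
static int foo_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		      u64 start, u64 len)
{
	return generic_block_fiemap(inode, fieinfo, start, len, foo_get_block);
}

/* caller-managed locking: use the __ variant inside our own critical section */
static int foo_fiemap_locked(struct inode *inode,
			     struct fiemap_extent_info *fieinfo,
			     u64 start, u64 len)
{
	int ret;

	mutex_lock(&inode->i_mutex);
	ret = __generic_block_fiemap(inode, fieinfo, start, len, foo_get_block);
	mutex_unlock(&inode->i_mutex);
	return ret;
}
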
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 3f8af0f1505b..6147ec3643a0 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c | |||
@@ -855,10 +855,6 @@ root_found: | |||
855 | } | 855 | } |
856 | sbi->s_joliet_level = joliet_level; | 856 | sbi->s_joliet_level = joliet_level; |
857 | 857 | ||
858 | /* check the root inode */ | ||
859 | if (!inode->i_op) | ||
860 | goto out_bad_root; | ||
861 | |||
862 | /* Make sure the root inode is a directory */ | 858 | /* Make sure the root inode is a directory */ |
863 | if (!S_ISDIR(inode->i_mode)) { | 859 | if (!S_ISDIR(inode->i_mode)) { |
864 | printk(KERN_WARNING | 860 | printk(KERN_WARNING |
@@ -886,8 +882,6 @@ root_found: | |||
886 | /* | 882 | /* |
887 | * Display error messages and free resources. | 883 | * Display error messages and free resources. |
888 | */ | 884 | */ |
889 | out_bad_root: | ||
890 | printk(KERN_WARNING "%s: root inode not initialized\n", __func__); | ||
891 | out_iput: | 885 | out_iput: |
892 | iput(inode); | 886 | iput(inode); |
893 | goto out_no_inode; | 887 | goto out_no_inode; |
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 25719d902c51..3fbffb1ea714 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c | |||
@@ -306,6 +306,8 @@ void journal_commit_transaction(journal_t *journal) | |||
306 | int flags; | 306 | int flags; |
307 | int err; | 307 | int err; |
308 | unsigned long blocknr; | 308 | unsigned long blocknr; |
309 | ktime_t start_time; | ||
310 | u64 commit_time; | ||
309 | char *tagp = NULL; | 311 | char *tagp = NULL; |
310 | journal_header_t *header; | 312 | journal_header_t *header; |
311 | journal_block_tag_t *tag = NULL; | 313 | journal_block_tag_t *tag = NULL; |
@@ -418,6 +420,7 @@ void journal_commit_transaction(journal_t *journal) | |||
418 | commit_transaction->t_state = T_FLUSH; | 420 | commit_transaction->t_state = T_FLUSH; |
419 | journal->j_committing_transaction = commit_transaction; | 421 | journal->j_committing_transaction = commit_transaction; |
420 | journal->j_running_transaction = NULL; | 422 | journal->j_running_transaction = NULL; |
423 | start_time = ktime_get(); | ||
421 | commit_transaction->t_log_start = journal->j_head; | 424 | commit_transaction->t_log_start = journal->j_head; |
422 | wake_up(&journal->j_wait_transaction_locked); | 425 | wake_up(&journal->j_wait_transaction_locked); |
423 | spin_unlock(&journal->j_state_lock); | 426 | spin_unlock(&journal->j_state_lock); |
@@ -913,6 +916,18 @@ restart_loop: | |||
913 | J_ASSERT(commit_transaction == journal->j_committing_transaction); | 916 | J_ASSERT(commit_transaction == journal->j_committing_transaction); |
914 | journal->j_commit_sequence = commit_transaction->t_tid; | 917 | journal->j_commit_sequence = commit_transaction->t_tid; |
915 | journal->j_committing_transaction = NULL; | 918 | journal->j_committing_transaction = NULL; |
919 | commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); | ||
920 | |||
921 | /* | ||
922 | * weight the commit time higher than the average time so we don't | ||
923 | * react too strongly to vast changes in commit time | ||
924 | */ | ||
925 | if (likely(journal->j_average_commit_time)) | ||
926 | journal->j_average_commit_time = (commit_time*3 + | ||
927 | journal->j_average_commit_time) / 4; | ||
928 | else | ||
929 | journal->j_average_commit_time = commit_time; | ||
930 | |||
916 | spin_unlock(&journal->j_state_lock); | 931 | spin_unlock(&journal->j_state_lock); |
917 | 932 | ||
918 | if (commit_transaction->t_checkpoint_list == NULL && | 933 | if (commit_transaction->t_checkpoint_list == NULL && |
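
The update rule above is a simple 3:1 weighted average: the most recent commit dominates so the estimate tracks the device quickly, while a single outlier cannot completely replace the history. A standalone illustration of how the estimate evolves (times in nanoseconds, sample values invented for the example):

#include <stdio.h>

/* the same update rule as j_average_commit_time, in isolation */
static unsigned long long update_avg(unsigned long long avg,
				     unsigned long long commit_time)
{
	if (avg == 0)			/* first sample seeds the average */
		return commit_time;
	return (commit_time * 3 + avg) / 4;
}

int main(void)
{
	unsigned long long avg = 0;
	unsigned long long samples[] = { 4000000, 4200000, 20000000, 4100000 };
	int i;

	for (i = 0; i < 4; i++) {
		avg = update_avg(avg, samples[i]);
		printf("sample %llu ns -> average %llu ns\n", samples[i], avg);
	}
	return 0;
}
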
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 60d4c32c8808..e6a117431277 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/timer.h> | 25 | #include <linux/timer.h> |
26 | #include <linux/mm.h> | 26 | #include <linux/mm.h> |
27 | #include <linux/highmem.h> | 27 | #include <linux/highmem.h> |
28 | #include <linux/hrtimer.h> | ||
28 | 29 | ||
29 | static void __journal_temp_unlink_buffer(struct journal_head *jh); | 30 | static void __journal_temp_unlink_buffer(struct journal_head *jh); |
30 | 31 | ||
@@ -49,6 +50,7 @@ get_transaction(journal_t *journal, transaction_t *transaction) | |||
49 | { | 50 | { |
50 | transaction->t_journal = journal; | 51 | transaction->t_journal = journal; |
51 | transaction->t_state = T_RUNNING; | 52 | transaction->t_state = T_RUNNING; |
53 | transaction->t_start_time = ktime_get(); | ||
52 | transaction->t_tid = journal->j_transaction_sequence++; | 54 | transaction->t_tid = journal->j_transaction_sequence++; |
53 | transaction->t_expires = jiffies + journal->j_commit_interval; | 55 | transaction->t_expires = jiffies + journal->j_commit_interval; |
54 | spin_lock_init(&transaction->t_handle_lock); | 56 | spin_lock_init(&transaction->t_handle_lock); |
@@ -752,7 +754,6 @@ out: | |||
752 | * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. | 754 | * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. |
753 | * @handle: transaction to add buffer modifications to | 755 | * @handle: transaction to add buffer modifications to |
754 | * @bh: bh to be used for metadata writes | 756 | * @bh: bh to be used for metadata writes |
755 | * @credits: variable that will receive credits for the buffer | ||
756 | * | 757 | * |
757 | * Returns an error code or 0 on success. | 758 | * Returns an error code or 0 on success. |
758 | * | 759 | * |
@@ -1370,7 +1371,7 @@ int journal_stop(handle_t *handle) | |||
1370 | { | 1371 | { |
1371 | transaction_t *transaction = handle->h_transaction; | 1372 | transaction_t *transaction = handle->h_transaction; |
1372 | journal_t *journal = transaction->t_journal; | 1373 | journal_t *journal = transaction->t_journal; |
1373 | int old_handle_count, err; | 1374 | int err; |
1374 | pid_t pid; | 1375 | pid_t pid; |
1375 | 1376 | ||
1376 | J_ASSERT(journal_current_handle() == handle); | 1377 | J_ASSERT(journal_current_handle() == handle); |
@@ -1399,6 +1400,17 @@ int journal_stop(handle_t *handle) | |||
1399 | * on IO anyway. Speeds up many-threaded, many-dir operations | 1400 | * on IO anyway. Speeds up many-threaded, many-dir operations |
1400 | * by 30x or more... | 1401 | * by 30x or more... |
1401 | * | 1402 | * |
1403 | * We try and optimize the sleep time against what the underlying disk | ||
1404 | * can do, instead of having a static sleep time. This is useful for | ||
1405 | * the case where our storage is so fast that it is more optimal to go | ||
1406 | * ahead and force a flush and wait for the transaction to be committed | ||
1407 | * than it is to wait for an arbitrary amount of time for new writers to | ||
1408 | * join the transaction. We achieve this by measuring how long it takes | ||
1409 | * to commit a transaction, and compare it with how long this | ||
1410 | * transaction has been running, and if run time < commit time then we | ||
1411 | * sleep for the delta and commit. This greatly helps super fast disks | ||
1412 | * that would see slowdowns as more threads started doing fsyncs. | ||
1413 | * | ||
1402 | * But don't do this if this process was the most recent one to | 1414 | * But don't do this if this process was the most recent one to |
1403 | * perform a synchronous write. We do this to detect the case where a | 1415 | * perform a synchronous write. We do this to detect the case where a |
1404 | * single process is doing a stream of sync writes. No point in waiting | 1416 | * single process is doing a stream of sync writes. No point in waiting |
@@ -1406,11 +1418,26 @@ int journal_stop(handle_t *handle) | |||
1406 | */ | 1418 | */ |
1407 | pid = current->pid; | 1419 | pid = current->pid; |
1408 | if (handle->h_sync && journal->j_last_sync_writer != pid) { | 1420 | if (handle->h_sync && journal->j_last_sync_writer != pid) { |
1421 | u64 commit_time, trans_time; | ||
1422 | |||
1409 | journal->j_last_sync_writer = pid; | 1423 | journal->j_last_sync_writer = pid; |
1410 | do { | 1424 | |
1411 | old_handle_count = transaction->t_handle_count; | 1425 | spin_lock(&journal->j_state_lock); |
1412 | schedule_timeout_uninterruptible(1); | 1426 | commit_time = journal->j_average_commit_time; |
1413 | } while (old_handle_count != transaction->t_handle_count); | 1427 | spin_unlock(&journal->j_state_lock); |
1428 | |||
1429 | trans_time = ktime_to_ns(ktime_sub(ktime_get(), | ||
1430 | transaction->t_start_time)); | ||
1431 | |||
1432 | commit_time = min_t(u64, commit_time, | ||
1433 | 1000*jiffies_to_usecs(1)); | ||
1434 | |||
1435 | if (trans_time < commit_time) { | ||
1436 | ktime_t expires = ktime_add_ns(ktime_get(), | ||
1437 | commit_time); | ||
1438 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
1439 | schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); | ||
1440 | } | ||
1414 | } | 1441 | } |
1415 | 1442 | ||
1416 | current->journal_info = NULL; | 1443 | current->journal_info = NULL; |
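
In place of the old poll-every-jiffy loop, the sync path now asks one question: has this transaction been running for less than the measured average commit time, clamped to at most one jiffy? If so, it sleeps on a high-resolution timer for roughly one commit interval so more writers can join; otherwise it commits immediately. A standalone illustration of that decision; the jiffy length and timings below are invented example values:

#include <stdio.h>

int main(void)
{
	unsigned long long jiffy_ns      = 4000000;	/* example: HZ=250 -> 4 ms */
	unsigned long long avg_commit_ns =  900000;	/* fast device: ~0.9 ms */
	unsigned long long trans_run_ns  =  300000;	/* handle has run for 0.3 ms */
	unsigned long long commit_time;

	/* never wait longer than the old one-jiffy poll interval */
	commit_time = avg_commit_ns < jiffy_ns ? avg_commit_ns : jiffy_ns;

	if (trans_run_ns < commit_time)
		printf("young transaction: sleep ~%llu ns for more writers\n",
		       commit_time);
	else
		printf("transaction already older than a typical commit: commit now\n");
	return 0;
}
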
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 073124a29b8c..62804e57a44c 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c | |||
@@ -535,6 +535,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
535 | if (is_journal_aborted(journal)) { | 535 | if (is_journal_aborted(journal)) { |
536 | clear_buffer_jbddirty(jh2bh(jh)); | 536 | clear_buffer_jbddirty(jh2bh(jh)); |
537 | JBUFFER_TRACE(jh, "journal is aborting: refile"); | 537 | JBUFFER_TRACE(jh, "journal is aborting: refile"); |
538 | jbd2_buffer_abort_trigger(jh, | ||
539 | jh->b_frozen_data ? | ||
540 | jh->b_frozen_triggers : | ||
541 | jh->b_triggers); | ||
538 | jbd2_journal_refile_buffer(journal, jh); | 542 | jbd2_journal_refile_buffer(journal, jh); |
539 | /* If that was the last one, we need to clean up | 543 | /* If that was the last one, we need to clean up |
540 | * any descriptor buffers which may have been | 544 | * any descriptor buffers which may have been |
@@ -870,6 +874,9 @@ restart_loop: | |||
870 | * data. | 874 | * data. |
871 | * | 875 | * |
872 | * Otherwise, we can just throw away the frozen data now. | 876 | * Otherwise, we can just throw away the frozen data now. |
877 | * | ||
878 | * We also know that the frozen data has already fired | ||
879 | * its triggers if they exist, so we can clear that too. | ||
873 | */ | 880 | */ |
874 | if (jh->b_committed_data) { | 881 | if (jh->b_committed_data) { |
875 | jbd2_free(jh->b_committed_data, bh->b_size); | 882 | jbd2_free(jh->b_committed_data, bh->b_size); |
@@ -877,10 +884,12 @@ restart_loop: | |||
877 | if (jh->b_frozen_data) { | 884 | if (jh->b_frozen_data) { |
878 | jh->b_committed_data = jh->b_frozen_data; | 885 | jh->b_committed_data = jh->b_frozen_data; |
879 | jh->b_frozen_data = NULL; | 886 | jh->b_frozen_data = NULL; |
887 | jh->b_frozen_triggers = NULL; | ||
880 | } | 888 | } |
881 | } else if (jh->b_frozen_data) { | 889 | } else if (jh->b_frozen_data) { |
882 | jbd2_free(jh->b_frozen_data, bh->b_size); | 890 | jbd2_free(jh->b_frozen_data, bh->b_size); |
883 | jh->b_frozen_data = NULL; | 891 | jh->b_frozen_data = NULL; |
892 | jh->b_frozen_triggers = NULL; | ||
884 | } | 893 | } |
885 | 894 | ||
886 | spin_lock(&journal->j_list_lock); | 895 | spin_lock(&journal->j_list_lock); |
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 2932c8f55199..56675306ed81 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c | |||
@@ -51,6 +51,7 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates); | |||
51 | EXPORT_SYMBOL(jbd2_journal_get_write_access); | 51 | EXPORT_SYMBOL(jbd2_journal_get_write_access); |
52 | EXPORT_SYMBOL(jbd2_journal_get_create_access); | 52 | EXPORT_SYMBOL(jbd2_journal_get_create_access); |
53 | EXPORT_SYMBOL(jbd2_journal_get_undo_access); | 53 | EXPORT_SYMBOL(jbd2_journal_get_undo_access); |
54 | EXPORT_SYMBOL(jbd2_journal_set_triggers); | ||
54 | EXPORT_SYMBOL(jbd2_journal_dirty_metadata); | 55 | EXPORT_SYMBOL(jbd2_journal_dirty_metadata); |
55 | EXPORT_SYMBOL(jbd2_journal_release_buffer); | 56 | EXPORT_SYMBOL(jbd2_journal_release_buffer); |
56 | EXPORT_SYMBOL(jbd2_journal_forget); | 57 | EXPORT_SYMBOL(jbd2_journal_forget); |
@@ -291,6 +292,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, | |||
291 | struct page *new_page; | 292 | struct page *new_page; |
292 | unsigned int new_offset; | 293 | unsigned int new_offset; |
293 | struct buffer_head *bh_in = jh2bh(jh_in); | 294 | struct buffer_head *bh_in = jh2bh(jh_in); |
295 | struct jbd2_buffer_trigger_type *triggers; | ||
294 | 296 | ||
295 | /* | 297 | /* |
296 | * The buffer really shouldn't be locked: only the current committing | 298 | * The buffer really shouldn't be locked: only the current committing |
@@ -315,13 +317,23 @@ repeat: | |||
315 | done_copy_out = 1; | 317 | done_copy_out = 1; |
316 | new_page = virt_to_page(jh_in->b_frozen_data); | 318 | new_page = virt_to_page(jh_in->b_frozen_data); |
317 | new_offset = offset_in_page(jh_in->b_frozen_data); | 319 | new_offset = offset_in_page(jh_in->b_frozen_data); |
320 | triggers = jh_in->b_frozen_triggers; | ||
318 | } else { | 321 | } else { |
319 | new_page = jh2bh(jh_in)->b_page; | 322 | new_page = jh2bh(jh_in)->b_page; |
320 | new_offset = offset_in_page(jh2bh(jh_in)->b_data); | 323 | new_offset = offset_in_page(jh2bh(jh_in)->b_data); |
324 | triggers = jh_in->b_triggers; | ||
321 | } | 325 | } |
322 | 326 | ||
323 | mapped_data = kmap_atomic(new_page, KM_USER0); | 327 | mapped_data = kmap_atomic(new_page, KM_USER0); |
324 | /* | 328 | /* |
329 | * Fire any commit trigger. Do this before checking for escaping, | ||
330 | * as the trigger may modify the magic offset. If a copy-out | ||
331 | * happens afterwards, it will have the correct data in the buffer. | ||
332 | */ | ||
333 | jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset, | ||
334 | triggers); | ||
335 | |||
336 | /* | ||
325 | * Check for escaping | 337 | * Check for escaping |
326 | */ | 338 | */ |
327 | if (*((__be32 *)(mapped_data + new_offset)) == | 339 | if (*((__be32 *)(mapped_data + new_offset)) == |
@@ -353,6 +365,13 @@ repeat: | |||
353 | new_page = virt_to_page(tmp); | 365 | new_page = virt_to_page(tmp); |
354 | new_offset = offset_in_page(tmp); | 366 | new_offset = offset_in_page(tmp); |
355 | done_copy_out = 1; | 367 | done_copy_out = 1; |
368 | |||
369 | /* | ||
370 | * This isn't strictly necessary, as we're using frozen | ||
371 | * data for the escaping, but it keeps consistency with | ||
372 | * b_frozen_data usage. | ||
373 | */ | ||
374 | jh_in->b_frozen_triggers = jh_in->b_triggers; | ||
356 | } | 375 | } |
357 | 376 | ||
358 | /* | 377 | /* |
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 48c21bac5a56..46b4e347ed7d 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c | |||
@@ -743,6 +743,12 @@ done: | |||
743 | source = kmap_atomic(page, KM_USER0); | 743 | source = kmap_atomic(page, KM_USER0); |
744 | memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); | 744 | memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); |
745 | kunmap_atomic(source, KM_USER0); | 745 | kunmap_atomic(source, KM_USER0); |
746 | |||
747 | /* | ||
748 | * Now that the frozen data is saved off, we need to store | ||
749 | * any matching triggers. | ||
750 | */ | ||
751 | jh->b_frozen_triggers = jh->b_triggers; | ||
746 | } | 752 | } |
747 | jbd_unlock_bh_state(bh); | 753 | jbd_unlock_bh_state(bh); |
748 | 754 | ||
@@ -946,6 +952,47 @@ out: | |||
946 | } | 952 | } |
947 | 953 | ||
948 | /** | 954 | /** |
955 | * void jbd2_journal_set_triggers() - Add triggers for commit writeout | ||
956 | * @bh: buffer to trigger on | ||
957 | * @type: struct jbd2_buffer_trigger_type containing the trigger(s). | ||
958 | * | ||
959 | * Set any triggers on this journal_head. This is always safe, because | ||
960 | * triggers for a committing buffer will be saved off, and triggers for | ||
961 | * a running transaction will match the buffer in that transaction. | ||
962 | * | ||
963 | * Call with NULL to clear the triggers. | ||
964 | */ | ||
965 | void jbd2_journal_set_triggers(struct buffer_head *bh, | ||
966 | struct jbd2_buffer_trigger_type *type) | ||
967 | { | ||
968 | struct journal_head *jh = bh2jh(bh); | ||
969 | |||
970 | jh->b_triggers = type; | ||
971 | } | ||
972 | |||
973 | void jbd2_buffer_commit_trigger(struct journal_head *jh, void *mapped_data, | ||
974 | struct jbd2_buffer_trigger_type *triggers) | ||
975 | { | ||
976 | struct buffer_head *bh = jh2bh(jh); | ||
977 | |||
978 | if (!triggers || !triggers->t_commit) | ||
979 | return; | ||
980 | |||
981 | triggers->t_commit(triggers, bh, mapped_data, bh->b_size); | ||
982 | } | ||
983 | |||
984 | void jbd2_buffer_abort_trigger(struct journal_head *jh, | ||
985 | struct jbd2_buffer_trigger_type *triggers) | ||
986 | { | ||
987 | if (!triggers || !triggers->t_abort) | ||
988 | return; | ||
989 | |||
990 | triggers->t_abort(triggers, jh2bh(jh)); | ||
991 | } | ||
992 | |||
993 | |||
994 | |||
995 | /** | ||
949 | * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata | 996 | * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata |
950 | * @handle: transaction to add buffer to. | 997 | * @handle: transaction to add buffer to. |
951 | * @bh: buffer to mark | 998 | * @bh: buffer to mark |
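
Taken together with the jbd2/commit.c and jbd2/journal.c hunks, this gives journal clients a small hook API: attach a trigger to a metadata buffer with jbd2_journal_set_triggers(), have t_commit run on the buffer image just before it is written (or copied out), and t_abort run if the journal aborts. A hedged sketch of how a caller might wire this up; the t_commit/t_abort field names and signatures are inferred from the calls shown above, and my_csum_update() is a purely illustrative placeholder, not code from this series:

#include <linux/jbd2.h>

/* placeholder: a real user would recompute a checksum over the image here */
static void my_csum_update(void *data, size_t size)
{
}

static void my_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
			      struct buffer_head *bh, void *mapped_data,
			      size_t size)
{
	my_csum_update(mapped_data, size);
}

static void my_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
			     struct buffer_head *bh)
{
	/* journal is aborting: nothing useful to fix up in this sketch */
}

static struct jbd2_buffer_trigger_type my_triggers = {
	.t_commit = my_commit_trigger,
	.t_abort  = my_abort_trigger,
};

/* attach the triggers while modifying the buffer; NULL would detach them */
static int my_dirty_block(handle_t *handle, struct buffer_head *bh)
{
	int err = jbd2_journal_get_write_access(handle, bh);

	if (err)
		return err;
	jbd2_journal_set_triggers(bh, &my_triggers);
	/* ... modify bh->b_data here ... */
	return jbd2_journal_dirty_metadata(handle, bh);
}
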
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index d6363d8309d0..0f94381ca6d0 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c | |||
@@ -58,9 +58,9 @@ | |||
58 | 58 | ||
59 | /* | 59 | /* |
60 | * __mark_inode_dirty expects inodes to be hashed. Since we don't want | 60 | * __mark_inode_dirty expects inodes to be hashed. Since we don't want |
61 | * special inodes in the fileset inode space, we hash them to a dummy head | 61 | * special inodes in the fileset inode space, we make them appear hashed, |
62 | * but do not put on any lists. | ||
62 | */ | 63 | */ |
63 | static HLIST_HEAD(aggregate_hash); | ||
64 | 64 | ||
65 | /* | 65 | /* |
66 | * imap locks | 66 | * imap locks |
@@ -496,7 +496,11 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) | |||
496 | /* release the page */ | 496 | /* release the page */ |
497 | release_metapage(mp); | 497 | release_metapage(mp); |
498 | 498 | ||
499 | hlist_add_head(&ip->i_hash, &aggregate_hash); | 499 | /* |
500 | * that will look hashed, but won't be on any list; hlist_del() | ||
501 | * will work fine and require no locking. | ||
502 | */ | ||
503 | ip->i_hash.pprev = &ip->i_hash.next; | ||
500 | 504 | ||
501 | return (ip); | 505 | return (ip); |
502 | } | 506 | } |
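
The replacement line relies on a small hlist property: a node whose pprev points at its own next field looks hashed (hlist_unhashed() returns false) and can be passed to hlist_del() safely, yet it sits on no shared chain and therefore needs no hash locking. A standalone sketch of the trick, using a minimal user-space re-implementation of the relevant hlist operations for illustration only:

#include <stdio.h>
#include <stddef.h>

/* minimal re-implementation of the relevant hlist operations */
struct hlist_node {
	struct hlist_node *next, **pprev;
};

static int hlist_unhashed(const struct hlist_node *h)
{
	return !h->pprev;
}

static void hlist_del(struct hlist_node *n)
{
	struct hlist_node *next = n->next;
	struct hlist_node **pprev = n->pprev;

	*pprev = next;
	if (next)
		next->pprev = pprev;
}

int main(void)
{
	struct hlist_node fake = { .next = NULL, .pprev = NULL };

	/* the trick from diReadSpecial(): point pprev at our own next field */
	fake.pprev = &fake.next;

	printf("unhashed? %d\n", hlist_unhashed(&fake));	/* 0: looks hashed */
	hlist_del(&fake);	/* touches only our own fields; no list, no lock */
	printf("still fine after hlist_del()\n");
	return 0;
}
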
diff --git a/fs/libfs.c b/fs/libfs.c index bdaec17fa388..49b44099dabb 100644 --- a/fs/libfs.c +++ b/fs/libfs.c | |||
@@ -231,7 +231,6 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name, | |||
231 | */ | 231 | */ |
232 | root->i_ino = 1; | 232 | root->i_ino = 1; |
233 | root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; | 233 | root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; |
234 | root->i_uid = root->i_gid = 0; | ||
235 | root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME; | 234 | root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME; |
236 | dentry = d_alloc(NULL, &d_name); | 235 | dentry = d_alloc(NULL, &d_name); |
237 | if (!dentry) { | 236 | if (!dentry) { |
@@ -436,8 +435,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files | |||
436 | */ | 435 | */ |
437 | inode->i_ino = 1; | 436 | inode->i_ino = 1; |
438 | inode->i_mode = S_IFDIR | 0755; | 437 | inode->i_mode = S_IFDIR | 0755; |
439 | inode->i_uid = inode->i_gid = 0; | ||
440 | inode->i_blocks = 0; | ||
441 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 438 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
442 | inode->i_op = &simple_dir_inode_operations; | 439 | inode->i_op = &simple_dir_inode_operations; |
443 | inode->i_fop = &simple_dir_operations; | 440 | inode->i_fop = &simple_dir_operations; |
@@ -464,8 +461,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files | |||
464 | if (!inode) | 461 | if (!inode) |
465 | goto out; | 462 | goto out; |
466 | inode->i_mode = S_IFREG | files->mode; | 463 | inode->i_mode = S_IFREG | files->mode; |
467 | inode->i_uid = inode->i_gid = 0; | ||
468 | inode->i_blocks = 0; | ||
469 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 464 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
470 | inode->i_fop = files->ops; | 465 | inode->i_fop = files->ops; |
471 | inode->i_ino = i; | 466 | inode->i_ino = i; |
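
These removals (and the matching one in hugetlbfs above) are possible because inode_init_always() now clears i_uid and i_gid, and already cleared i_blocks, so every inode returned by new_inode() starts with those defaults. A hedged sketch of the simplified pattern for a hypothetical in-memory filesystem; names are illustrative:

#include <linux/fs.h>

static struct inode *foo_make_inode(struct super_block *sb, int mode)
{
	struct inode *inode = new_inode(sb);

	if (!inode)
		return NULL;

	inode->i_mode = mode;
	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
	/* no need to zero i_uid, i_gid or i_blocks any more */
	return inode;
}
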
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 31668b690e03..dd7957064a8c 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c | |||
@@ -16,7 +16,6 @@ | |||
16 | #include <linux/sunrpc/clnt.h> | 16 | #include <linux/sunrpc/clnt.h> |
17 | #include <linux/sunrpc/svc.h> | 17 | #include <linux/sunrpc/svc.h> |
18 | #include <linux/lockd/lockd.h> | 18 | #include <linux/lockd/lockd.h> |
19 | #include <linux/lockd/sm_inter.h> | ||
20 | 19 | ||
21 | #define NLMDBG_FACILITY NLMDBG_CLIENT | 20 | #define NLMDBG_FACILITY NLMDBG_CLIENT |
22 | #define NLMCLNT_GRACE_WAIT (5*HZ) | 21 | #define NLMCLNT_GRACE_WAIT (5*HZ) |
@@ -518,11 +517,9 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl) | |||
518 | unsigned char fl_type; | 517 | unsigned char fl_type; |
519 | int status = -ENOLCK; | 518 | int status = -ENOLCK; |
520 | 519 | ||
521 | if (nsm_monitor(host) < 0) { | 520 | if (nsm_monitor(host) < 0) |
522 | printk(KERN_NOTICE "lockd: failed to monitor %s\n", | ||
523 | host->h_name); | ||
524 | goto out; | 521 | goto out; |
525 | } | 522 | |
526 | fl->fl_flags |= FL_ACCESS; | 523 | fl->fl_flags |= FL_ACCESS; |
527 | status = do_vfs_lock(fl); | 524 | status = do_vfs_lock(fl); |
528 | fl->fl_flags = fl_flags; | 525 | fl->fl_flags = fl_flags; |
diff --git a/fs/lockd/host.c b/fs/lockd/host.c index abdebf76b820..99d737bd4325 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c | |||
@@ -15,7 +15,6 @@ | |||
15 | #include <linux/sunrpc/clnt.h> | 15 | #include <linux/sunrpc/clnt.h> |
16 | #include <linux/sunrpc/svc.h> | 16 | #include <linux/sunrpc/svc.h> |
17 | #include <linux/lockd/lockd.h> | 17 | #include <linux/lockd/lockd.h> |
18 | #include <linux/lockd/sm_inter.h> | ||
19 | #include <linux/mutex.h> | 18 | #include <linux/mutex.h> |
20 | 19 | ||
21 | #include <net/ipv6.h> | 20 | #include <net/ipv6.h> |
@@ -32,11 +31,6 @@ static int nrhosts; | |||
32 | static DEFINE_MUTEX(nlm_host_mutex); | 31 | static DEFINE_MUTEX(nlm_host_mutex); |
33 | 32 | ||
34 | static void nlm_gc_hosts(void); | 33 | static void nlm_gc_hosts(void); |
35 | static struct nsm_handle *nsm_find(const struct sockaddr *sap, | ||
36 | const size_t salen, | ||
37 | const char *hostname, | ||
38 | const size_t hostname_len, | ||
39 | const int create); | ||
40 | 34 | ||
41 | struct nlm_lookup_host_info { | 35 | struct nlm_lookup_host_info { |
42 | const int server; /* search for server|client */ | 36 | const int server; /* search for server|client */ |
@@ -105,32 +99,6 @@ static void nlm_clear_port(struct sockaddr *sap) | |||
105 | } | 99 | } |
106 | } | 100 | } |
107 | 101 | ||
108 | static void nlm_display_address(const struct sockaddr *sap, | ||
109 | char *buf, const size_t len) | ||
110 | { | ||
111 | const struct sockaddr_in *sin = (struct sockaddr_in *)sap; | ||
112 | const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; | ||
113 | |||
114 | switch (sap->sa_family) { | ||
115 | case AF_UNSPEC: | ||
116 | snprintf(buf, len, "unspecified"); | ||
117 | break; | ||
118 | case AF_INET: | ||
119 | snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr); | ||
120 | break; | ||
121 | case AF_INET6: | ||
122 | if (ipv6_addr_v4mapped(&sin6->sin6_addr)) | ||
123 | snprintf(buf, len, "%pI4", | ||
124 | &sin6->sin6_addr.s6_addr32[3]); | ||
125 | else | ||
126 | snprintf(buf, len, "%pI6", &sin6->sin6_addr); | ||
127 | break; | ||
128 | default: | ||
129 | snprintf(buf, len, "unsupported address family"); | ||
130 | break; | ||
131 | } | ||
132 | } | ||
133 | |||
134 | /* | 102 | /* |
135 | * Common host lookup routine for server & client | 103 | * Common host lookup routine for server & client |
136 | */ | 104 | */ |
@@ -190,8 +158,8 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) | |||
190 | atomic_inc(&nsm->sm_count); | 158 | atomic_inc(&nsm->sm_count); |
191 | else { | 159 | else { |
192 | host = NULL; | 160 | host = NULL; |
193 | nsm = nsm_find(ni->sap, ni->salen, | 161 | nsm = nsm_get_handle(ni->sap, ni->salen, |
194 | ni->hostname, ni->hostname_len, 1); | 162 | ni->hostname, ni->hostname_len); |
195 | if (!nsm) { | 163 | if (!nsm) { |
196 | dprintk("lockd: nlm_lookup_host failed; " | 164 | dprintk("lockd: nlm_lookup_host failed; " |
197 | "no nsm handle\n"); | 165 | "no nsm handle\n"); |
@@ -206,6 +174,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) | |||
206 | goto out; | 174 | goto out; |
207 | } | 175 | } |
208 | host->h_name = nsm->sm_name; | 176 | host->h_name = nsm->sm_name; |
177 | host->h_addrbuf = nsm->sm_addrbuf; | ||
209 | memcpy(nlm_addr(host), ni->sap, ni->salen); | 178 | memcpy(nlm_addr(host), ni->sap, ni->salen); |
210 | host->h_addrlen = ni->salen; | 179 | host->h_addrlen = ni->salen; |
211 | nlm_clear_port(nlm_addr(host)); | 180 | nlm_clear_port(nlm_addr(host)); |
@@ -232,11 +201,6 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) | |||
232 | 201 | ||
233 | nrhosts++; | 202 | nrhosts++; |
234 | 203 | ||
235 | nlm_display_address((struct sockaddr *)&host->h_addr, | ||
236 | host->h_addrbuf, sizeof(host->h_addrbuf)); | ||
237 | nlm_display_address((struct sockaddr *)&host->h_srcaddr, | ||
238 | host->h_srcaddrbuf, sizeof(host->h_srcaddrbuf)); | ||
239 | |||
240 | dprintk("lockd: nlm_lookup_host created host %s\n", | 204 | dprintk("lockd: nlm_lookup_host created host %s\n", |
241 | host->h_name); | 205 | host->h_name); |
242 | 206 | ||
@@ -256,10 +220,8 @@ nlm_destroy_host(struct nlm_host *host) | |||
256 | BUG_ON(!list_empty(&host->h_lockowners)); | 220 | BUG_ON(!list_empty(&host->h_lockowners)); |
257 | BUG_ON(atomic_read(&host->h_count)); | 221 | BUG_ON(atomic_read(&host->h_count)); |
258 | 222 | ||
259 | /* | ||
260 | * Release NSM handle and unmonitor host. | ||
261 | */ | ||
262 | nsm_unmonitor(host); | 223 | nsm_unmonitor(host); |
224 | nsm_release(host->h_nsmhandle); | ||
263 | 225 | ||
264 | clnt = host->h_rpcclnt; | 226 | clnt = host->h_rpcclnt; |
265 | if (clnt != NULL) | 227 | if (clnt != NULL) |
@@ -378,8 +340,8 @@ nlm_bind_host(struct nlm_host *host) | |||
378 | { | 340 | { |
379 | struct rpc_clnt *clnt; | 341 | struct rpc_clnt *clnt; |
380 | 342 | ||
381 | dprintk("lockd: nlm_bind_host %s (%s), my addr=%s\n", | 343 | dprintk("lockd: nlm_bind_host %s (%s)\n", |
382 | host->h_name, host->h_addrbuf, host->h_srcaddrbuf); | 344 | host->h_name, host->h_addrbuf); |
383 | 345 | ||
384 | /* Lock host handle */ | 346 | /* Lock host handle */ |
385 | mutex_lock(&host->h_mutex); | 347 | mutex_lock(&host->h_mutex); |
@@ -481,35 +443,23 @@ void nlm_release_host(struct nlm_host *host) | |||
481 | } | 443 | } |
482 | } | 444 | } |
483 | 445 | ||
484 | /* | 446 | /** |
485 | * We were notified that the host indicated by address &sin | 447 | * nlm_host_rebooted - Release all resources held by rebooted host |
486 | * has rebooted. | 448 | * @info: pointer to decoded results of NLM_SM_NOTIFY call |
487 | * Release all resources held by that peer. | 449 | * |
450 | * We were notified that the specified host has rebooted. Release | ||
451 | * all resources held by that peer. | ||
488 | */ | 452 | */ |
489 | void nlm_host_rebooted(const struct sockaddr_in *sin, | 453 | void nlm_host_rebooted(const struct nlm_reboot *info) |
490 | const char *hostname, | ||
491 | unsigned int hostname_len, | ||
492 | u32 new_state) | ||
493 | { | 454 | { |
494 | struct hlist_head *chain; | 455 | struct hlist_head *chain; |
495 | struct hlist_node *pos; | 456 | struct hlist_node *pos; |
496 | struct nsm_handle *nsm; | 457 | struct nsm_handle *nsm; |
497 | struct nlm_host *host; | 458 | struct nlm_host *host; |
498 | 459 | ||
499 | nsm = nsm_find((struct sockaddr *)sin, sizeof(*sin), | 460 | nsm = nsm_reboot_lookup(info); |
500 | hostname, hostname_len, 0); | 461 | if (unlikely(nsm == NULL)) |
501 | if (nsm == NULL) { | ||
502 | dprintk("lockd: never saw rebooted peer '%.*s' before\n", | ||
503 | hostname_len, hostname); | ||
504 | return; | 462 | return; |
505 | } | ||
506 | |||
507 | dprintk("lockd: nlm_host_rebooted(%.*s, %s)\n", | ||
508 | hostname_len, hostname, nsm->sm_addrbuf); | ||
509 | |||
510 | /* When reclaiming locks on this peer, make sure that | ||
511 | * we set up a new notification */ | ||
512 | nsm->sm_monitored = 0; | ||
513 | 463 | ||
514 | /* Mark all hosts tied to this NSM state as having rebooted. | 464 | /* Mark all hosts tied to this NSM state as having rebooted. |
515 | * We run the loop repeatedly, because we drop the host table | 465 | * We run the loop repeatedly, because we drop the host table |
@@ -520,8 +470,8 @@ again: mutex_lock(&nlm_host_mutex); | |||
520 | for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { | 470 | for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { |
521 | hlist_for_each_entry(host, pos, chain, h_hash) { | 471 | hlist_for_each_entry(host, pos, chain, h_hash) { |
522 | if (host->h_nsmhandle == nsm | 472 | if (host->h_nsmhandle == nsm |
523 | && host->h_nsmstate != new_state) { | 473 | && host->h_nsmstate != info->state) { |
524 | host->h_nsmstate = new_state; | 474 | host->h_nsmstate = info->state; |
525 | host->h_state++; | 475 | host->h_state++; |
526 | 476 | ||
527 | nlm_get_host(host); | 477 | nlm_get_host(host); |
@@ -629,89 +579,3 @@ nlm_gc_hosts(void) | |||
629 | 579 | ||
630 | next_gc = jiffies + NLM_HOST_COLLECT; | 580 | next_gc = jiffies + NLM_HOST_COLLECT; |
631 | } | 581 | } |
632 | |||
633 | |||
634 | /* | ||
635 | * Manage NSM handles | ||
636 | */ | ||
637 | static LIST_HEAD(nsm_handles); | ||
638 | static DEFINE_SPINLOCK(nsm_lock); | ||
639 | |||
640 | static struct nsm_handle *nsm_find(const struct sockaddr *sap, | ||
641 | const size_t salen, | ||
642 | const char *hostname, | ||
643 | const size_t hostname_len, | ||
644 | const int create) | ||
645 | { | ||
646 | struct nsm_handle *nsm = NULL; | ||
647 | struct nsm_handle *pos; | ||
648 | |||
649 | if (!sap) | ||
650 | return NULL; | ||
651 | |||
652 | if (hostname && memchr(hostname, '/', hostname_len) != NULL) { | ||
653 | if (printk_ratelimit()) { | ||
654 | printk(KERN_WARNING "Invalid hostname \"%.*s\" " | ||
655 | "in NFS lock request\n", | ||
656 | (int)hostname_len, hostname); | ||
657 | } | ||
658 | return NULL; | ||
659 | } | ||
660 | |||
661 | retry: | ||
662 | spin_lock(&nsm_lock); | ||
663 | list_for_each_entry(pos, &nsm_handles, sm_link) { | ||
664 | |||
665 | if (hostname && nsm_use_hostnames) { | ||
666 | if (strlen(pos->sm_name) != hostname_len | ||
667 | || memcmp(pos->sm_name, hostname, hostname_len)) | ||
668 | continue; | ||
669 | } else if (!nlm_cmp_addr(nsm_addr(pos), sap)) | ||
670 | continue; | ||
671 | atomic_inc(&pos->sm_count); | ||
672 | kfree(nsm); | ||
673 | nsm = pos; | ||
674 | goto found; | ||
675 | } | ||
676 | if (nsm) { | ||
677 | list_add(&nsm->sm_link, &nsm_handles); | ||
678 | goto found; | ||
679 | } | ||
680 | spin_unlock(&nsm_lock); | ||
681 | |||
682 | if (!create) | ||
683 | return NULL; | ||
684 | |||
685 | nsm = kzalloc(sizeof(*nsm) + hostname_len + 1, GFP_KERNEL); | ||
686 | if (nsm == NULL) | ||
687 | return NULL; | ||
688 | |||
689 | memcpy(nsm_addr(nsm), sap, salen); | ||
690 | nsm->sm_addrlen = salen; | ||
691 | nsm->sm_name = (char *) (nsm + 1); | ||
692 | memcpy(nsm->sm_name, hostname, hostname_len); | ||
693 | nsm->sm_name[hostname_len] = '\0'; | ||
694 | nlm_display_address((struct sockaddr *)&nsm->sm_addr, | ||
695 | nsm->sm_addrbuf, sizeof(nsm->sm_addrbuf)); | ||
696 | atomic_set(&nsm->sm_count, 1); | ||
697 | goto retry; | ||
698 | |||
699 | found: | ||
700 | spin_unlock(&nsm_lock); | ||
701 | return nsm; | ||
702 | } | ||
703 | |||
704 | /* | ||
705 | * Release an NSM handle | ||
706 | */ | ||
707 | void | ||
708 | nsm_release(struct nsm_handle *nsm) | ||
709 | { | ||
710 | if (!nsm) | ||
711 | return; | ||
712 | if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) { | ||
713 | list_del(&nsm->sm_link); | ||
714 | spin_unlock(&nsm_lock); | ||
715 | kfree(nsm); | ||
716 | } | ||
717 | } | ||
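
With nsm_find() and nsm_release() moved out of host.c, lockd's host code only deals with the handle's public lifetime: nsm_get_handle() to find-or-create a handle (taking a reference), nsm_release() to drop it, with nsm_monitor()/nsm_unmonitor() used in between, and nlm_host_rebooted() matching reboot notifications through nsm_reboot_lookup() rather than re-running an address lookup. A hedged sketch of that lifetime, using the signatures visible in these hunks; the peer data is illustrative and error handling is trimmed:

#include <linux/lockd/lockd.h>

static void example_nsm_lifetime(const struct sockaddr *sap, size_t salen,
				 const char *name, size_t namelen)
{
	struct nsm_handle *nsm;

	nsm = nsm_get_handle(sap, salen, name, namelen);	/* reference taken */
	if (nsm == NULL)
		return;

	/*
	 * A struct nlm_host would normally hold this reference and call
	 * nsm_monitor()/nsm_unmonitor() around its lock traffic.
	 */

	nsm_release(nsm);					/* reference dropped */
}
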
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index ffd3461f75ef..5e2c4d5ac827 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c | |||
@@ -9,35 +9,123 @@ | |||
9 | #include <linux/types.h> | 9 | #include <linux/types.h> |
10 | #include <linux/utsname.h> | 10 | #include <linux/utsname.h> |
11 | #include <linux/kernel.h> | 11 | #include <linux/kernel.h> |
12 | #include <linux/ktime.h> | ||
13 | |||
12 | #include <linux/sunrpc/clnt.h> | 14 | #include <linux/sunrpc/clnt.h> |
13 | #include <linux/sunrpc/xprtsock.h> | 15 | #include <linux/sunrpc/xprtsock.h> |
14 | #include <linux/sunrpc/svc.h> | 16 | #include <linux/sunrpc/svc.h> |
15 | #include <linux/lockd/lockd.h> | 17 | #include <linux/lockd/lockd.h> |
16 | #include <linux/lockd/sm_inter.h> | ||
17 | |||
18 | 18 | ||
19 | #define NLMDBG_FACILITY NLMDBG_MONITOR | 19 | #define NLMDBG_FACILITY NLMDBG_MONITOR |
20 | #define NSM_PROGRAM 100024 | ||
21 | #define NSM_VERSION 1 | ||
22 | |||
23 | enum { | ||
24 | NSMPROC_NULL, | ||
25 | NSMPROC_STAT, | ||
26 | NSMPROC_MON, | ||
27 | NSMPROC_UNMON, | ||
28 | NSMPROC_UNMON_ALL, | ||
29 | NSMPROC_SIMU_CRASH, | ||
30 | NSMPROC_NOTIFY, | ||
31 | }; | ||
32 | |||
33 | struct nsm_args { | ||
34 | struct nsm_private *priv; | ||
35 | u32 prog; /* RPC callback info */ | ||
36 | u32 vers; | ||
37 | u32 proc; | ||
20 | 38 | ||
21 | #define XDR_ADDRBUF_LEN (20) | 39 | char *mon_name; |
40 | }; | ||
22 | 41 | ||
23 | static struct rpc_clnt * nsm_create(void); | 42 | struct nsm_res { |
43 | u32 status; | ||
44 | u32 state; | ||
45 | }; | ||
24 | 46 | ||
25 | static struct rpc_program nsm_program; | 47 | static struct rpc_program nsm_program; |
48 | static LIST_HEAD(nsm_handles); | ||
49 | static DEFINE_SPINLOCK(nsm_lock); | ||
26 | 50 | ||
27 | /* | 51 | /* |
28 | * Local NSM state | 52 | * Local NSM state |
29 | */ | 53 | */ |
30 | int nsm_local_state; | 54 | int __read_mostly nsm_local_state; |
55 | int __read_mostly nsm_use_hostnames; | ||
31 | 56 | ||
32 | /* | 57 | static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm) |
33 | * Common procedure for SM_MON/SM_UNMON calls | 58 | { |
34 | */ | 59 | return (struct sockaddr *)&nsm->sm_addr; |
35 | static int | 60 | } |
36 | nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) | 61 | |
62 | static void nsm_display_ipv4_address(const struct sockaddr *sap, char *buf, | ||
63 | const size_t len) | ||
64 | { | ||
65 | const struct sockaddr_in *sin = (struct sockaddr_in *)sap; | ||
66 | snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr); | ||
67 | } | ||
68 | |||
69 | static void nsm_display_ipv6_address(const struct sockaddr *sap, char *buf, | ||
70 | const size_t len) | ||
71 | { | ||
72 | const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; | ||
73 | |||
74 | if (ipv6_addr_v4mapped(&sin6->sin6_addr)) | ||
75 | snprintf(buf, len, "%pI4", &sin6->sin6_addr.s6_addr32[3]); | ||
76 | else if (sin6->sin6_scope_id != 0) | ||
77 | snprintf(buf, len, "%pI6%%%u", &sin6->sin6_addr, | ||
78 | sin6->sin6_scope_id); | ||
79 | else | ||
80 | snprintf(buf, len, "%pI6", &sin6->sin6_addr); | ||
81 | } | ||
82 | |||
83 | static void nsm_display_address(const struct sockaddr *sap, | ||
84 | char *buf, const size_t len) | ||
85 | { | ||
86 | switch (sap->sa_family) { | ||
87 | case AF_INET: | ||
88 | nsm_display_ipv4_address(sap, buf, len); | ||
89 | break; | ||
90 | case AF_INET6: | ||
91 | nsm_display_ipv6_address(sap, buf, len); | ||
92 | break; | ||
93 | default: | ||
94 | snprintf(buf, len, "unsupported address family"); | ||
95 | break; | ||
96 | } | ||
97 | } | ||
98 | |||
99 | static struct rpc_clnt *nsm_create(void) | ||
100 | { | ||
101 | struct sockaddr_in sin = { | ||
102 | .sin_family = AF_INET, | ||
103 | .sin_addr.s_addr = htonl(INADDR_LOOPBACK), | ||
104 | }; | ||
105 | struct rpc_create_args args = { | ||
106 | .protocol = XPRT_TRANSPORT_UDP, | ||
107 | .address = (struct sockaddr *)&sin, | ||
108 | .addrsize = sizeof(sin), | ||
109 | .servername = "rpc.statd", | ||
110 | .program = &nsm_program, | ||
111 | .version = NSM_VERSION, | ||
112 | .authflavor = RPC_AUTH_NULL, | ||
113 | }; | ||
114 | |||
115 | return rpc_create(&args); | ||
116 | } | ||
117 | |||
118 | static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) | ||
37 | { | 119 | { |
38 | struct rpc_clnt *clnt; | 120 | struct rpc_clnt *clnt; |
39 | int status; | 121 | int status; |
40 | struct nsm_args args; | 122 | struct nsm_args args = { |
123 | .priv = &nsm->sm_priv, | ||
124 | .prog = NLM_PROGRAM, | ||
125 | .vers = 3, | ||
126 | .proc = NLMPROC_NSM_NOTIFY, | ||
127 | .mon_name = nsm->sm_mon_name, | ||
128 | }; | ||
41 | struct rpc_message msg = { | 129 | struct rpc_message msg = { |
42 | .rpc_argp = &args, | 130 | .rpc_argp = &args, |
43 | .rpc_resp = res, | 131 | .rpc_resp = res, |
@@ -46,22 +134,18 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) | |||
46 | clnt = nsm_create(); | 134 | clnt = nsm_create(); |
47 | if (IS_ERR(clnt)) { | 135 | if (IS_ERR(clnt)) { |
48 | status = PTR_ERR(clnt); | 136 | status = PTR_ERR(clnt); |
137 | dprintk("lockd: failed to create NSM upcall transport, " | ||
138 | "status=%d\n", status); | ||
49 | goto out; | 139 | goto out; |
50 | } | 140 | } |
51 | 141 | ||
52 | memset(&args, 0, sizeof(args)); | ||
53 | args.mon_name = nsm->sm_name; | ||
54 | args.addr = nsm_addr_in(nsm)->sin_addr.s_addr; | ||
55 | args.prog = NLM_PROGRAM; | ||
56 | args.vers = 3; | ||
57 | args.proc = NLMPROC_NSM_NOTIFY; | ||
58 | memset(res, 0, sizeof(*res)); | 142 | memset(res, 0, sizeof(*res)); |
59 | 143 | ||
60 | msg.rpc_proc = &clnt->cl_procinfo[proc]; | 144 | msg.rpc_proc = &clnt->cl_procinfo[proc]; |
61 | status = rpc_call_sync(clnt, &msg, 0); | 145 | status = rpc_call_sync(clnt, &msg, 0); |
62 | if (status < 0) | 146 | if (status < 0) |
63 | printk(KERN_DEBUG "nsm_mon_unmon: rpc failed, status=%d\n", | 147 | dprintk("lockd: NSM upcall RPC failed, status=%d\n", |
64 | status); | 148 | status); |
65 | else | 149 | else |
66 | status = 0; | 150 | status = 0; |
67 | rpc_shutdown_client(clnt); | 151 | rpc_shutdown_client(clnt); |
@@ -69,82 +153,272 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) | |||
69 | return status; | 153 | return status; |
70 | } | 154 | } |
71 | 155 | ||
72 | /* | 156 | /** |
73 | * Set up monitoring of a remote host | 157 | * nsm_monitor - Notify a peer in case we reboot |
158 | * @host: pointer to nlm_host of peer to notify | ||
159 | * | ||
160 | * If this peer is not already monitored, this function sends an | ||
161 | * upcall to the local rpc.statd to record the name/address of | ||
162 | * the peer to notify in case we reboot. | ||
163 | * | ||
164 | * Returns zero if the peer is monitored by the local rpc.statd; | ||
165 | * otherwise a negative errno value is returned. | ||
74 | */ | 166 | */ |
75 | int | 167 | int nsm_monitor(const struct nlm_host *host) |
76 | nsm_monitor(struct nlm_host *host) | ||
77 | { | 168 | { |
78 | struct nsm_handle *nsm = host->h_nsmhandle; | 169 | struct nsm_handle *nsm = host->h_nsmhandle; |
79 | struct nsm_res res; | 170 | struct nsm_res res; |
80 | int status; | 171 | int status; |
81 | 172 | ||
82 | dprintk("lockd: nsm_monitor(%s)\n", host->h_name); | 173 | dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name); |
83 | BUG_ON(nsm == NULL); | ||
84 | 174 | ||
85 | if (nsm->sm_monitored) | 175 | if (nsm->sm_monitored) |
86 | return 0; | 176 | return 0; |
87 | 177 | ||
88 | status = nsm_mon_unmon(nsm, SM_MON, &res); | 178 | /* |
179 | * Choose whether to record the caller_name or IP address of | ||
180 | * this peer in the local rpc.statd's database. | ||
181 | */ | ||
182 | nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf; | ||
89 | 183 | ||
90 | if (status < 0 || res.status != 0) | 184 | status = nsm_mon_unmon(nsm, NSMPROC_MON, &res); |
91 | printk(KERN_NOTICE "lockd: cannot monitor %s\n", host->h_name); | 185 | if (res.status != 0) |
186 | status = -EIO; | ||
187 | if (status < 0) | ||
188 | printk(KERN_NOTICE "lockd: cannot monitor %s\n", nsm->sm_name); | ||
92 | else | 189 | else |
93 | nsm->sm_monitored = 1; | 190 | nsm->sm_monitored = 1; |
94 | return status; | 191 | return status; |
95 | } | 192 | } |
96 | 193 | ||
97 | /* | 194 | /** |
98 | * Cease to monitor remote host | 195 | * nsm_unmonitor - Unregister peer notification |
196 | * @host: pointer to nlm_host of peer to stop monitoring | ||
197 | * | ||
198 | * If this peer is monitored, this function sends an upcall to | ||
199 | * tell the local rpc.statd not to send this peer a notification | ||
200 | * when we reboot. | ||
99 | */ | 201 | */ |
100 | int | 202 | void nsm_unmonitor(const struct nlm_host *host) |
101 | nsm_unmonitor(struct nlm_host *host) | ||
102 | { | 203 | { |
103 | struct nsm_handle *nsm = host->h_nsmhandle; | 204 | struct nsm_handle *nsm = host->h_nsmhandle; |
104 | struct nsm_res res; | 205 | struct nsm_res res; |
105 | int status = 0; | 206 | int status; |
106 | |||
107 | if (nsm == NULL) | ||
108 | return 0; | ||
109 | host->h_nsmhandle = NULL; | ||
110 | 207 | ||
111 | if (atomic_read(&nsm->sm_count) == 1 | 208 | if (atomic_read(&nsm->sm_count) == 1 |
112 | && nsm->sm_monitored && !nsm->sm_sticky) { | 209 | && nsm->sm_monitored && !nsm->sm_sticky) { |
113 | dprintk("lockd: nsm_unmonitor(%s)\n", host->h_name); | 210 | dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name); |
114 | 211 | ||
115 | status = nsm_mon_unmon(nsm, SM_UNMON, &res); | 212 | status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res); |
213 | if (res.status != 0) | ||
214 | status = -EIO; | ||
116 | if (status < 0) | 215 | if (status < 0) |
117 | printk(KERN_NOTICE "lockd: cannot unmonitor %s\n", | 216 | printk(KERN_NOTICE "lockd: cannot unmonitor %s\n", |
118 | host->h_name); | 217 | nsm->sm_name); |
119 | else | 218 | else |
120 | nsm->sm_monitored = 0; | 219 | nsm->sm_monitored = 0; |
121 | } | 220 | } |
122 | nsm_release(nsm); | 221 | } |
123 | return status; | 222 | |
223 | static struct nsm_handle *nsm_lookup_hostname(const char *hostname, | ||
224 | const size_t len) | ||
225 | { | ||
226 | struct nsm_handle *nsm; | ||
227 | |||
228 | list_for_each_entry(nsm, &nsm_handles, sm_link) | ||
229 | if (strlen(nsm->sm_name) == len && | ||
230 | memcmp(nsm->sm_name, hostname, len) == 0) | ||
231 | return nsm; | ||
232 | return NULL; | ||
233 | } | ||
234 | |||
235 | static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap) | ||
236 | { | ||
237 | struct nsm_handle *nsm; | ||
238 | |||
239 | list_for_each_entry(nsm, &nsm_handles, sm_link) | ||
240 | if (nlm_cmp_addr(nsm_addr(nsm), sap)) | ||
241 | return nsm; | ||
242 | return NULL; | ||
243 | } | ||
244 | |||
245 | static struct nsm_handle *nsm_lookup_priv(const struct nsm_private *priv) | ||
246 | { | ||
247 | struct nsm_handle *nsm; | ||
248 | |||
249 | list_for_each_entry(nsm, &nsm_handles, sm_link) | ||
250 | if (memcmp(nsm->sm_priv.data, priv->data, | ||
251 | sizeof(priv->data)) == 0) | ||
252 | return nsm; | ||
253 | return NULL; | ||
124 | } | 254 | } |
125 | 255 | ||
126 | /* | 256 | /* |
127 | * Create NSM client for the local host | 257 | * Construct a unique cookie to match this nsm_handle to this monitored |
258 | * host. It is passed to the local rpc.statd via NSMPROC_MON, and | ||
259 | * returned via NLMPROC_SM_NOTIFY, in the "priv" field of these | ||
260 | * requests. | ||
261 | * | ||
262 | * The NSM protocol requires that these cookies be unique while the | ||
263 | * system is running. We prefer a stronger requirement of making them | ||
264 | * unique across reboots. If user space bugs cause a stale cookie to | ||
265 | * be sent to the kernel, it could cause the wrong host to lose its | ||
266 | * lock state if cookies were not unique across reboots. | ||
267 | * | ||
268 | * The cookies are exposed only to local user space via loopback. They | ||
269 | * do not appear on the physical network. If we want greater security | ||
270 | * for some reason, nsm_init_private() could perform a one-way hash to | ||
271 | * obscure the contents of the cookie. | ||
128 | */ | 272 | */ |
129 | static struct rpc_clnt * | 273 | static void nsm_init_private(struct nsm_handle *nsm) |
130 | nsm_create(void) | ||
131 | { | 274 | { |
132 | struct sockaddr_in sin = { | 275 | u64 *p = (u64 *)&nsm->sm_priv.data; |
133 | .sin_family = AF_INET, | 276 | struct timespec ts; |
134 | .sin_addr.s_addr = htonl(INADDR_LOOPBACK), | ||
135 | .sin_port = 0, | ||
136 | }; | ||
137 | struct rpc_create_args args = { | ||
138 | .protocol = XPRT_TRANSPORT_UDP, | ||
139 | .address = (struct sockaddr *)&sin, | ||
140 | .addrsize = sizeof(sin), | ||
141 | .servername = "localhost", | ||
142 | .program = &nsm_program, | ||
143 | .version = SM_VERSION, | ||
144 | .authflavor = RPC_AUTH_NULL, | ||
145 | }; | ||
146 | 277 | ||
147 | return rpc_create(&args); | 278 | ktime_get_ts(&ts); |
279 | *p++ = timespec_to_ns(&ts); | ||
280 | *p = (unsigned long)nsm; | ||
281 | } | ||
282 | |||
283 | static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap, | ||
284 | const size_t salen, | ||
285 | const char *hostname, | ||
286 | const size_t hostname_len) | ||
287 | { | ||
288 | struct nsm_handle *new; | ||
289 | |||
290 | new = kzalloc(sizeof(*new) + hostname_len + 1, GFP_KERNEL); | ||
291 | if (unlikely(new == NULL)) | ||
292 | return NULL; | ||
293 | |||
294 | atomic_set(&new->sm_count, 1); | ||
295 | new->sm_name = (char *)(new + 1); | ||
296 | memcpy(nsm_addr(new), sap, salen); | ||
297 | new->sm_addrlen = salen; | ||
298 | nsm_init_private(new); | ||
299 | nsm_display_address((const struct sockaddr *)&new->sm_addr, | ||
300 | new->sm_addrbuf, sizeof(new->sm_addrbuf)); | ||
301 | memcpy(new->sm_name, hostname, hostname_len); | ||
302 | new->sm_name[hostname_len] = '\0'; | ||
303 | |||
304 | return new; | ||
305 | } | ||
306 | |||
307 | /** | ||
308 | * nsm_get_handle - Find or create a cached nsm_handle | ||
309 | * @sap: pointer to socket address of handle to find | ||
310 | * @salen: length of socket address | ||
311 | * @hostname: pointer to C string containing hostname to find | ||
312 | * @hostname_len: length of C string | ||
313 | * | ||
314 | * Behavior is modulated by the global nsm_use_hostnames variable. | ||
315 | * | ||
316 | * Returns a cached nsm_handle after bumping its ref count, or | ||
317 | * returns a fresh nsm_handle if a handle that matches @sap and/or | ||
318 | * @hostname cannot be found in the handle cache. Returns NULL if | ||
319 | * an error occurs. | ||
320 | */ | ||
321 | struct nsm_handle *nsm_get_handle(const struct sockaddr *sap, | ||
322 | const size_t salen, const char *hostname, | ||
323 | const size_t hostname_len) | ||
324 | { | ||
325 | struct nsm_handle *cached, *new = NULL; | ||
326 | |||
327 | if (hostname && memchr(hostname, '/', hostname_len) != NULL) { | ||
328 | if (printk_ratelimit()) { | ||
329 | printk(KERN_WARNING "Invalid hostname \"%.*s\" " | ||
330 | "in NFS lock request\n", | ||
331 | (int)hostname_len, hostname); | ||
332 | } | ||
333 | return NULL; | ||
334 | } | ||
335 | |||
336 | retry: | ||
337 | spin_lock(&nsm_lock); | ||
338 | |||
339 | if (nsm_use_hostnames && hostname != NULL) | ||
340 | cached = nsm_lookup_hostname(hostname, hostname_len); | ||
341 | else | ||
342 | cached = nsm_lookup_addr(sap); | ||
343 | |||
344 | if (cached != NULL) { | ||
345 | atomic_inc(&cached->sm_count); | ||
346 | spin_unlock(&nsm_lock); | ||
347 | kfree(new); | ||
348 | dprintk("lockd: found nsm_handle for %s (%s), " | ||
349 | "cnt %d\n", cached->sm_name, | ||
350 | cached->sm_addrbuf, | ||
351 | atomic_read(&cached->sm_count)); | ||
352 | return cached; | ||
353 | } | ||
354 | |||
355 | if (new != NULL) { | ||
356 | list_add(&new->sm_link, &nsm_handles); | ||
357 | spin_unlock(&nsm_lock); | ||
358 | dprintk("lockd: created nsm_handle for %s (%s)\n", | ||
359 | new->sm_name, new->sm_addrbuf); | ||
360 | return new; | ||
361 | } | ||
362 | |||
363 | spin_unlock(&nsm_lock); | ||
364 | |||
365 | new = nsm_create_handle(sap, salen, hostname, hostname_len); | ||
366 | if (unlikely(new == NULL)) | ||
367 | return NULL; | ||
368 | goto retry; | ||
369 | } | ||
370 | |||
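The retry loop in nsm_get_handle() is the classic "allocate outside the spinlock, then look up again" pattern: the GFP_KERNEL allocation may sleep, so it cannot happen under nsm_lock, and the second pass either inserts the new handle or frees it if another CPU won the race. A generic, compilable sketch of the same pattern using pthreads (all names here are illustrative, not kernel symbols):

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct entry {
	struct entry *next;
	char key[32];
	int refcount;
};

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
static struct entry *cache_head;

static struct entry *lookup_locked(const char *key)
{
	struct entry *e;

	for (e = cache_head; e != NULL; e = e->next)
		if (strcmp(e->key, key) == 0)
			return e;
	return NULL;
}

struct entry *get_entry(const char *key)
{
	struct entry *cached, *new = NULL;

retry:
	pthread_mutex_lock(&cache_lock);
	cached = lookup_locked(key);
	if (cached != NULL) {
		cached->refcount++;
		pthread_mutex_unlock(&cache_lock);
		free(new);		/* lost the race; drop our private copy */
		return cached;
	}
	if (new != NULL) {		/* second pass: publish our copy */
		new->next = cache_head;
		cache_head = new;
		pthread_mutex_unlock(&cache_lock);
		return new;
	}
	pthread_mutex_unlock(&cache_lock);

	new = calloc(1, sizeof(*new));	/* may block; done without the lock */
	if (new == NULL)
		return NULL;
	strncpy(new->key, key, sizeof(new->key) - 1);
	new->refcount = 1;
	goto retry;
}

int main(void)
{
	return get_entry("client.example.net") ? 0 : 1;
}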
371 | /** | ||
372 | * nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle | ||
373 | * @info: pointer to NLMPROC_SM_NOTIFY arguments | ||
374 | * | ||
375 | * Returns a matching nsm_handle if found in the nsm cache; the returned | ||
376 | * nsm_handle's reference count is bumped and sm_monitored is cleared. | ||
377 | * Otherwise returns NULL if some error occurred. | ||
378 | */ | ||
379 | struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info) | ||
380 | { | ||
381 | struct nsm_handle *cached; | ||
382 | |||
383 | spin_lock(&nsm_lock); | ||
384 | |||
385 | cached = nsm_lookup_priv(&info->priv); | ||
386 | if (unlikely(cached == NULL)) { | ||
387 | spin_unlock(&nsm_lock); | ||
388 | dprintk("lockd: never saw rebooted peer '%.*s' before\n", | ||
389 | info->len, info->mon); | ||
390 | return cached; | ||
391 | } | ||
392 | |||
393 | atomic_inc(&cached->sm_count); | ||
394 | spin_unlock(&nsm_lock); | ||
395 | |||
396 | /* | ||
397 | * During subsequent lock activity, force a fresh | ||
398 | * notification to be set up for this host. | ||
399 | */ | ||
400 | cached->sm_monitored = 0; | ||
401 | |||
402 | dprintk("lockd: host %s (%s) rebooted, cnt %d\n", | ||
403 | cached->sm_name, cached->sm_addrbuf, | ||
404 | atomic_read(&cached->sm_count)); | ||
405 | return cached; | ||
406 | } | ||
407 | |||
408 | /** | ||
409 | * nsm_release - Release an NSM handle | ||
410 | * @nsm: pointer to handle to be released | ||
411 | * | ||
412 | */ | ||
413 | void nsm_release(struct nsm_handle *nsm) | ||
414 | { | ||
415 | if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) { | ||
416 | list_del(&nsm->sm_link); | ||
417 | spin_unlock(&nsm_lock); | ||
418 | dprintk("lockd: destroyed nsm_handle for %s (%s)\n", | ||
419 | nsm->sm_name, nsm->sm_addrbuf); | ||
420 | kfree(nsm); | ||
421 | } | ||
148 | } | 422 | } |
149 | 423 | ||
150 | /* | 424 | /* |
@@ -154,127 +428,132 @@ nsm_create(void) | |||
154 | * Status Monitor wire protocol. | 428 | * Status Monitor wire protocol. |
155 | */ | 429 | */ |
156 | 430 | ||
157 | static __be32 *xdr_encode_nsm_string(__be32 *p, char *string) | 431 | static int encode_nsm_string(struct xdr_stream *xdr, const char *string) |
158 | { | 432 | { |
159 | size_t len = strlen(string); | 433 | const u32 len = strlen(string); |
160 | 434 | __be32 *p; | |
161 | if (len > SM_MAXSTRLEN) | 435 | |
162 | len = SM_MAXSTRLEN; | 436 | if (unlikely(len > SM_MAXSTRLEN)) |
163 | return xdr_encode_opaque(p, string, len); | 437 | return -EIO; |
438 | p = xdr_reserve_space(xdr, sizeof(u32) + len); | ||
439 | if (unlikely(p == NULL)) | ||
440 | return -EIO; | ||
441 | xdr_encode_opaque(p, string, len); | ||
442 | return 0; | ||
164 | } | 443 | } |
165 | 444 | ||
166 | /* | 445 | /* |
167 | * "mon_name" specifies the host to be monitored. | 446 | * "mon_name" specifies the host to be monitored. |
168 | * | ||
169 | * Linux uses a text version of the IP address of the remote | ||
170 | * host as the host identifier (the "mon_name" argument). | ||
171 | * | ||
172 | * Linux statd always looks up the canonical hostname first for | ||
173 | * whatever remote hostname it receives, so this works alright. | ||
174 | */ | 447 | */ |
175 | static __be32 *xdr_encode_mon_name(__be32 *p, struct nsm_args *argp) | 448 | static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp) |
176 | { | 449 | { |
177 | char buffer[XDR_ADDRBUF_LEN + 1]; | 450 | return encode_nsm_string(xdr, argp->mon_name); |
178 | char *name = argp->mon_name; | ||
179 | |||
180 | if (!nsm_use_hostnames) { | ||
181 | snprintf(buffer, XDR_ADDRBUF_LEN, | ||
182 | "%pI4", &argp->addr); | ||
183 | name = buffer; | ||
184 | } | ||
185 | |||
186 | return xdr_encode_nsm_string(p, name); | ||
187 | } | 451 | } |
188 | 452 | ||
189 | /* | 453 | /* |
190 | * The "my_id" argument specifies the hostname and RPC procedure | 454 | * The "my_id" argument specifies the hostname and RPC procedure |
191 | * to be called when the status manager receives notification | 455 | * to be called when the status manager receives notification |
192 | * (via the SM_NOTIFY call) that the state of host "mon_name" | 456 | * (via the NLMPROC_SM_NOTIFY call) that the state of host "mon_name" |
193 | * has changed. | 457 | * has changed. |
194 | */ | 458 | */ |
195 | static __be32 *xdr_encode_my_id(__be32 *p, struct nsm_args *argp) | 459 | static int encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp) |
196 | { | 460 | { |
197 | p = xdr_encode_nsm_string(p, utsname()->nodename); | 461 | int status; |
198 | if (!p) | 462 | __be32 *p; |
199 | return ERR_PTR(-EIO); | 463 | |
200 | 464 | status = encode_nsm_string(xdr, utsname()->nodename); | |
465 | if (unlikely(status != 0)) | ||
466 | return status; | ||
467 | p = xdr_reserve_space(xdr, 3 * sizeof(u32)); | ||
468 | if (unlikely(p == NULL)) | ||
469 | return -EIO; | ||
201 | *p++ = htonl(argp->prog); | 470 | *p++ = htonl(argp->prog); |
202 | *p++ = htonl(argp->vers); | 471 | *p++ = htonl(argp->vers); |
203 | *p++ = htonl(argp->proc); | 472 | *p++ = htonl(argp->proc); |
204 | 473 | return 0; | |
205 | return p; | ||
206 | } | 474 | } |
207 | 475 | ||
208 | /* | 476 | /* |
209 | * The "mon_id" argument specifies the non-private arguments | 477 | * The "mon_id" argument specifies the non-private arguments |
210 | * of an SM_MON or SM_UNMON call. | 478 | * of an NSMPROC_MON or NSMPROC_UNMON call. |
211 | */ | 479 | */ |
212 | static __be32 *xdr_encode_mon_id(__be32 *p, struct nsm_args *argp) | 480 | static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp) |
213 | { | 481 | { |
214 | p = xdr_encode_mon_name(p, argp); | 482 | int status; |
215 | if (!p) | ||
216 | return ERR_PTR(-EIO); | ||
217 | 483 | ||
218 | return xdr_encode_my_id(p, argp); | 484 | status = encode_mon_name(xdr, argp); |
485 | if (unlikely(status != 0)) | ||
486 | return status; | ||
487 | return encode_my_id(xdr, argp); | ||
219 | } | 488 | } |
220 | 489 | ||
221 | /* | 490 | /* |
222 | * The "priv" argument may contain private information required | 491 | * The "priv" argument may contain private information required |
223 | * by the SM_MON call. This information will be supplied in the | 492 | * by the NSMPROC_MON call. This information will be supplied in the |
224 | * SM_NOTIFY call. | 493 | * NLMPROC_SM_NOTIFY call. |
225 | * | ||
226 | * Linux provides the raw IP address of the monitored host, | ||
227 | * left in network byte order. | ||
228 | */ | 494 | */ |
229 | static __be32 *xdr_encode_priv(__be32 *p, struct nsm_args *argp) | 495 | static int encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp) |
230 | { | 496 | { |
231 | *p++ = argp->addr; | 497 | __be32 *p; |
232 | *p++ = 0; | ||
233 | *p++ = 0; | ||
234 | *p++ = 0; | ||
235 | 498 | ||
236 | return p; | 499 | p = xdr_reserve_space(xdr, SM_PRIV_SIZE); |
500 | if (unlikely(p == NULL)) | ||
501 | return -EIO; | ||
502 | xdr_encode_opaque_fixed(p, argp->priv->data, SM_PRIV_SIZE); | ||
503 | return 0; | ||
237 | } | 504 | } |
238 | 505 | ||
239 | static int | 506 | static int xdr_enc_mon(struct rpc_rqst *req, __be32 *p, |
240 | xdr_encode_mon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) | 507 | const struct nsm_args *argp) |
241 | { | 508 | { |
242 | p = xdr_encode_mon_id(p, argp); | 509 | struct xdr_stream xdr; |
243 | if (IS_ERR(p)) | 510 | int status; |
244 | return PTR_ERR(p); | 511 | |
245 | 512 | xdr_init_encode(&xdr, &req->rq_snd_buf, p); | |
246 | p = xdr_encode_priv(p, argp); | 513 | status = encode_mon_id(&xdr, argp); |
247 | if (IS_ERR(p)) | 514 | if (unlikely(status)) |
248 | return PTR_ERR(p); | 515 | return status; |
249 | 516 | return encode_priv(&xdr, argp); | |
250 | rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p); | ||
251 | return 0; | ||
252 | } | 517 | } |
253 | 518 | ||
254 | static int | 519 | static int xdr_enc_unmon(struct rpc_rqst *req, __be32 *p, |
255 | xdr_encode_unmon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) | 520 | const struct nsm_args *argp) |
256 | { | 521 | { |
257 | p = xdr_encode_mon_id(p, argp); | 522 | struct xdr_stream xdr; |
258 | if (IS_ERR(p)) | 523 | |
259 | return PTR_ERR(p); | 524 | xdr_init_encode(&xdr, &req->rq_snd_buf, p); |
260 | rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p); | 525 | return encode_mon_id(&xdr, argp); |
261 | return 0; | ||
262 | } | 526 | } |
263 | 527 | ||
264 | static int | 528 | static int xdr_dec_stat_res(struct rpc_rqst *rqstp, __be32 *p, |
265 | xdr_decode_stat_res(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp) | 529 | struct nsm_res *resp) |
266 | { | 530 | { |
531 | struct xdr_stream xdr; | ||
532 | |||
533 | xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); | ||
534 | p = xdr_inline_decode(&xdr, 2 * sizeof(u32)); | ||
535 | if (unlikely(p == NULL)) | ||
536 | return -EIO; | ||
267 | resp->status = ntohl(*p++); | 537 | resp->status = ntohl(*p++); |
268 | resp->state = ntohl(*p++); | 538 | resp->state = ntohl(*p); |
269 | dprintk("nsm: xdr_decode_stat_res status %d state %d\n", | 539 | |
540 | dprintk("lockd: xdr_dec_stat_res status %d state %d\n", | ||
270 | resp->status, resp->state); | 541 | resp->status, resp->state); |
271 | return 0; | 542 | return 0; |
272 | } | 543 | } |
273 | 544 | ||
274 | static int | 545 | static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p, |
275 | xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp) | 546 | struct nsm_res *resp) |
276 | { | 547 | { |
277 | resp->state = ntohl(*p++); | 548 | struct xdr_stream xdr; |
549 | |||
550 | xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); | ||
551 | p = xdr_inline_decode(&xdr, sizeof(u32)); | ||
552 | if (unlikely(p == NULL)) | ||
553 | return -EIO; | ||
554 | resp->state = ntohl(*p); | ||
555 | |||
556 | dprintk("lockd: xdr_dec_stat state %d\n", resp->state); | ||
278 | return 0; | 557 | return 0; |
279 | } | 558 | } |
280 | 559 | ||
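Taken together, the encoders above put the NSMPROC_MON arguments on the wire as an XDR string for mon_name, an XDR string plus three 32-bit integers for my_id, and finally the fixed 16-byte priv cookie. The hand-rolled sketch below shows that byte layout explicitly; it is purely illustrative (the kernel uses the xdr_stream helpers shown in the diff, not open-coded buffer writes), and the helper names are invented for the example.

#include <arpa/inet.h>	/* htonl() */
#include <stdint.h>
#include <string.h>

#define SM_PRIV_SIZE 16

static unsigned char *enc_u32(unsigned char *p, uint32_t val)
{
	uint32_t be = htonl(val);

	memcpy(p, &be, 4);
	return p + 4;
}

static unsigned char *enc_string(unsigned char *p, const char *s)
{
	size_t len = strlen(s);

	p = enc_u32(p, (uint32_t)len);		/* XDR length word */
	memcpy(p, s, len);			/* bytes, zero-padded to 4 */
	memset(p + len, 0, (4 - len % 4) % 4);
	return p + ((len + 3) & ~(size_t)3);
}

size_t enc_mon_args(unsigned char *p, const char *mon_name,
		    const char *my_name, uint32_t prog, uint32_t vers,
		    uint32_t proc, const unsigned char priv[SM_PRIV_SIZE])
{
	unsigned char *start = p;

	p = enc_string(p, mon_name);		/* mon_id.mon_name */
	p = enc_string(p, my_name);		/* mon_id.my_id hostname */
	p = enc_u32(p, prog);			/* my_id program */
	p = enc_u32(p, vers);			/* my_id version */
	p = enc_u32(p, proc);			/* my_id procedure */
	memcpy(p, priv, SM_PRIV_SIZE);		/* fixed-length opaque priv */
	return (size_t)(p + SM_PRIV_SIZE - start);
}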
@@ -288,22 +567,22 @@ xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp) | |||
288 | #define SM_unmonres_sz 1 | 567 | #define SM_unmonres_sz 1 |
289 | 568 | ||
290 | static struct rpc_procinfo nsm_procedures[] = { | 569 | static struct rpc_procinfo nsm_procedures[] = { |
291 | [SM_MON] = { | 570 | [NSMPROC_MON] = { |
292 | .p_proc = SM_MON, | 571 | .p_proc = NSMPROC_MON, |
293 | .p_encode = (kxdrproc_t) xdr_encode_mon, | 572 | .p_encode = (kxdrproc_t)xdr_enc_mon, |
294 | .p_decode = (kxdrproc_t) xdr_decode_stat_res, | 573 | .p_decode = (kxdrproc_t)xdr_dec_stat_res, |
295 | .p_arglen = SM_mon_sz, | 574 | .p_arglen = SM_mon_sz, |
296 | .p_replen = SM_monres_sz, | 575 | .p_replen = SM_monres_sz, |
297 | .p_statidx = SM_MON, | 576 | .p_statidx = NSMPROC_MON, |
298 | .p_name = "MONITOR", | 577 | .p_name = "MONITOR", |
299 | }, | 578 | }, |
300 | [SM_UNMON] = { | 579 | [NSMPROC_UNMON] = { |
301 | .p_proc = SM_UNMON, | 580 | .p_proc = NSMPROC_UNMON, |
302 | .p_encode = (kxdrproc_t) xdr_encode_unmon, | 581 | .p_encode = (kxdrproc_t)xdr_enc_unmon, |
303 | .p_decode = (kxdrproc_t) xdr_decode_stat, | 582 | .p_decode = (kxdrproc_t)xdr_dec_stat, |
304 | .p_arglen = SM_mon_id_sz, | 583 | .p_arglen = SM_mon_id_sz, |
305 | .p_replen = SM_unmonres_sz, | 584 | .p_replen = SM_unmonres_sz, |
306 | .p_statidx = SM_UNMON, | 585 | .p_statidx = NSMPROC_UNMON, |
307 | .p_name = "UNMONITOR", | 586 | .p_name = "UNMONITOR", |
308 | }, | 587 | }, |
309 | }; | 588 | }; |
@@ -322,7 +601,7 @@ static struct rpc_stat nsm_stats; | |||
322 | 601 | ||
323 | static struct rpc_program nsm_program = { | 602 | static struct rpc_program nsm_program = { |
324 | .name = "statd", | 603 | .name = "statd", |
325 | .number = SM_PROGRAM, | 604 | .number = NSM_PROGRAM, |
326 | .nrvers = ARRAY_SIZE(nsm_version), | 605 | .nrvers = ARRAY_SIZE(nsm_version), |
327 | .version = nsm_version, | 606 | .version = nsm_version, |
328 | .stats = &nsm_stats | 607 | .stats = &nsm_stats |
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 252d80163d02..64f1c31b5853 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c | |||
@@ -35,7 +35,6 @@ | |||
35 | #include <linux/sunrpc/svcsock.h> | 35 | #include <linux/sunrpc/svcsock.h> |
36 | #include <net/ip.h> | 36 | #include <net/ip.h> |
37 | #include <linux/lockd/lockd.h> | 37 | #include <linux/lockd/lockd.h> |
38 | #include <linux/lockd/sm_inter.h> | ||
39 | #include <linux/nfs.h> | 38 | #include <linux/nfs.h> |
40 | 39 | ||
41 | #define NLMDBG_FACILITY NLMDBG_SVC | 40 | #define NLMDBG_FACILITY NLMDBG_SVC |
@@ -54,13 +53,26 @@ static struct svc_rqst *nlmsvc_rqst; | |||
54 | unsigned long nlmsvc_timeout; | 53 | unsigned long nlmsvc_timeout; |
55 | 54 | ||
56 | /* | 55 | /* |
56 | * If the kernel has IPv6 support available, always listen for | ||
57 | * both AF_INET and AF_INET6 requests. | ||
58 | */ | ||
59 | #if (defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) && \ | ||
60 | defined(CONFIG_SUNRPC_REGISTER_V4) | ||
61 | static const sa_family_t nlmsvc_family = AF_INET6; | ||
62 | #else /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */ | ||
63 | static const sa_family_t nlmsvc_family = AF_INET; | ||
64 | #endif /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */ | ||
65 | |||
66 | /* | ||
57 | * These can be set at insmod time (useful for NFS as root filesystem), | 67 | * These can be set at insmod time (useful for NFS as root filesystem), |
58 | * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003 | 68 | * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003 |
59 | */ | 69 | */ |
60 | static unsigned long nlm_grace_period; | 70 | static unsigned long nlm_grace_period; |
61 | static unsigned long nlm_timeout = LOCKD_DFLT_TIMEO; | 71 | static unsigned long nlm_timeout = LOCKD_DFLT_TIMEO; |
62 | static int nlm_udpport, nlm_tcpport; | 72 | static int nlm_udpport, nlm_tcpport; |
63 | int nsm_use_hostnames = 0; | 73 | |
74 | /* RLIM_NOFILE defaults to 1024. That seems like a reasonable default here. */ | ||
75 | static unsigned int nlm_max_connections = 1024; | ||
64 | 76 | ||
65 | /* | 77 | /* |
66 | * Constants needed for the sysctl interface. | 78 | * Constants needed for the sysctl interface. |
@@ -143,6 +155,9 @@ lockd(void *vrqstp) | |||
143 | long timeout = MAX_SCHEDULE_TIMEOUT; | 155 | long timeout = MAX_SCHEDULE_TIMEOUT; |
144 | RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); | 156 | RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); |
145 | 157 | ||
158 | /* update sv_maxconn if it has changed */ | ||
159 | rqstp->rq_server->sv_maxconn = nlm_max_connections; | ||
160 | |||
146 | if (signalled()) { | 161 | if (signalled()) { |
147 | flush_signals(current); | 162 | flush_signals(current); |
148 | if (nlmsvc_ops) { | 163 | if (nlmsvc_ops) { |
@@ -189,6 +204,19 @@ lockd(void *vrqstp) | |||
189 | return 0; | 204 | return 0; |
190 | } | 205 | } |
191 | 206 | ||
207 | static int create_lockd_listener(struct svc_serv *serv, char *name, | ||
208 | unsigned short port) | ||
209 | { | ||
210 | struct svc_xprt *xprt; | ||
211 | |||
212 | xprt = svc_find_xprt(serv, name, 0, 0); | ||
213 | if (xprt == NULL) | ||
214 | return svc_create_xprt(serv, name, port, SVC_SOCK_DEFAULTS); | ||
215 | |||
216 | svc_xprt_put(xprt); | ||
217 | return 0; | ||
218 | } | ||
219 | |||
192 | /* | 220 | /* |
193 | * Ensure there are active UDP and TCP listeners for lockd. | 221 | * Ensure there are active UDP and TCP listeners for lockd. |
194 | * | 222 | * |
@@ -202,29 +230,23 @@ lockd(void *vrqstp) | |||
202 | static int make_socks(struct svc_serv *serv) | 230 | static int make_socks(struct svc_serv *serv) |
203 | { | 231 | { |
204 | static int warned; | 232 | static int warned; |
205 | struct svc_xprt *xprt; | 233 | int err; |
206 | int err = 0; | ||
207 | 234 | ||
208 | xprt = svc_find_xprt(serv, "udp", 0, 0); | 235 | err = create_lockd_listener(serv, "udp", nlm_udpport); |
209 | if (!xprt) | 236 | if (err < 0) |
210 | err = svc_create_xprt(serv, "udp", nlm_udpport, | 237 | goto out_err; |
211 | SVC_SOCK_DEFAULTS); | 238 | |
212 | else | 239 | err = create_lockd_listener(serv, "tcp", nlm_tcpport); |
213 | svc_xprt_put(xprt); | 240 | if (err < 0) |
214 | if (err >= 0) { | 241 | goto out_err; |
215 | xprt = svc_find_xprt(serv, "tcp", 0, 0); | 242 | |
216 | if (!xprt) | 243 | warned = 0; |
217 | err = svc_create_xprt(serv, "tcp", nlm_tcpport, | 244 | return 0; |
218 | SVC_SOCK_DEFAULTS); | 245 | |
219 | else | 246 | out_err: |
220 | svc_xprt_put(xprt); | 247 | if (warned++ == 0) |
221 | } | ||
222 | if (err >= 0) { | ||
223 | warned = 0; | ||
224 | err = 0; | ||
225 | } else if (warned++ == 0) | ||
226 | printk(KERN_WARNING | 248 | printk(KERN_WARNING |
227 | "lockd_up: makesock failed, error=%d\n", err); | 249 | "lockd_up: makesock failed, error=%d\n", err); |
228 | return err; | 250 | return err; |
229 | } | 251 | } |
230 | 252 | ||
@@ -252,7 +274,7 @@ int lockd_up(void) | |||
252 | "lockd_up: no pid, %d users??\n", nlmsvc_users); | 274 | "lockd_up: no pid, %d users??\n", nlmsvc_users); |
253 | 275 | ||
254 | error = -ENOMEM; | 276 | error = -ENOMEM; |
255 | serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, AF_INET, NULL); | 277 | serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, nlmsvc_family, NULL); |
256 | if (!serv) { | 278 | if (!serv) { |
257 | printk(KERN_WARNING "lockd_up: create service failed\n"); | 279 | printk(KERN_WARNING "lockd_up: create service failed\n"); |
258 | goto out; | 280 | goto out; |
@@ -276,6 +298,7 @@ int lockd_up(void) | |||
276 | } | 298 | } |
277 | 299 | ||
278 | svc_sock_update_bufs(serv); | 300 | svc_sock_update_bufs(serv); |
301 | serv->sv_maxconn = nlm_max_connections; | ||
279 | 302 | ||
280 | nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name); | 303 | nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name); |
281 | if (IS_ERR(nlmsvc_task)) { | 304 | if (IS_ERR(nlmsvc_task)) { |
@@ -485,6 +508,7 @@ module_param_call(nlm_udpport, param_set_port, param_get_int, | |||
485 | module_param_call(nlm_tcpport, param_set_port, param_get_int, | 508 | module_param_call(nlm_tcpport, param_set_port, param_get_int, |
486 | &nlm_tcpport, 0644); | 509 | &nlm_tcpport, 0644); |
487 | module_param(nsm_use_hostnames, bool, 0644); | 510 | module_param(nsm_use_hostnames, bool, 0644); |
511 | module_param(nlm_max_connections, uint, 0644); | ||
488 | 512 | ||
489 | /* | 513 | /* |
490 | * Initialising and terminating the module. | 514 | * Initialising and terminating the module. |
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 4dfdcbc6bf68..1725037374c5 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c | |||
@@ -16,8 +16,6 @@ | |||
16 | #include <linux/nfsd/nfsd.h> | 16 | #include <linux/nfsd/nfsd.h> |
17 | #include <linux/lockd/lockd.h> | 17 | #include <linux/lockd/lockd.h> |
18 | #include <linux/lockd/share.h> | 18 | #include <linux/lockd/share.h> |
19 | #include <linux/lockd/sm_inter.h> | ||
20 | |||
21 | 19 | ||
22 | #define NLMDBG_FACILITY NLMDBG_CLIENT | 20 | #define NLMDBG_FACILITY NLMDBG_CLIENT |
23 | 21 | ||
@@ -419,8 +417,6 @@ static __be32 | |||
419 | nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, | 417 | nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, |
420 | void *resp) | 418 | void *resp) |
421 | { | 419 | { |
422 | struct sockaddr_in saddr; | ||
423 | |||
424 | dprintk("lockd: SM_NOTIFY called\n"); | 420 | dprintk("lockd: SM_NOTIFY called\n"); |
425 | 421 | ||
426 | if (!nlm_privileged_requester(rqstp)) { | 422 | if (!nlm_privileged_requester(rqstp)) { |
@@ -430,14 +426,7 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, | |||
430 | return rpc_system_err; | 426 | return rpc_system_err; |
431 | } | 427 | } |
432 | 428 | ||
433 | /* Obtain the host pointer for this NFS server and try to | 429 | nlm_host_rebooted(argp); |
434 | * reclaim all locks we hold on this server. | ||
435 | */ | ||
436 | memset(&saddr, 0, sizeof(saddr)); | ||
437 | saddr.sin_family = AF_INET; | ||
438 | saddr.sin_addr.s_addr = argp->addr; | ||
439 | nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state); | ||
440 | |||
441 | return rpc_success; | 430 | return rpc_success; |
442 | } | 431 | } |
443 | 432 | ||
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 3ca89e2a9381..3688e55901fc 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c | |||
@@ -16,8 +16,6 @@ | |||
16 | #include <linux/nfsd/nfsd.h> | 16 | #include <linux/nfsd/nfsd.h> |
17 | #include <linux/lockd/lockd.h> | 17 | #include <linux/lockd/lockd.h> |
18 | #include <linux/lockd/share.h> | 18 | #include <linux/lockd/share.h> |
19 | #include <linux/lockd/sm_inter.h> | ||
20 | |||
21 | 19 | ||
22 | #define NLMDBG_FACILITY NLMDBG_CLIENT | 20 | #define NLMDBG_FACILITY NLMDBG_CLIENT |
23 | 21 | ||
@@ -451,8 +449,6 @@ static __be32 | |||
451 | nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, | 449 | nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, |
452 | void *resp) | 450 | void *resp) |
453 | { | 451 | { |
454 | struct sockaddr_in saddr; | ||
455 | |||
456 | dprintk("lockd: SM_NOTIFY called\n"); | 452 | dprintk("lockd: SM_NOTIFY called\n"); |
457 | 453 | ||
458 | if (!nlm_privileged_requester(rqstp)) { | 454 | if (!nlm_privileged_requester(rqstp)) { |
@@ -462,14 +458,7 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, | |||
462 | return rpc_system_err; | 458 | return rpc_system_err; |
463 | } | 459 | } |
464 | 460 | ||
465 | /* Obtain the host pointer for this NFS server and try to | 461 | nlm_host_rebooted(argp); |
466 | * reclaim all locks we hold on this server. | ||
467 | */ | ||
468 | memset(&saddr, 0, sizeof(saddr)); | ||
469 | saddr.sin_family = AF_INET; | ||
470 | saddr.sin_addr.s_addr = argp->addr; | ||
471 | nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state); | ||
472 | |||
473 | return rpc_success; | 462 | return rpc_success; |
474 | } | 463 | } |
475 | 464 | ||
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 34c2766e27c7..9e4d6aab611b 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c | |||
@@ -17,7 +17,6 @@ | |||
17 | #include <linux/nfsd/export.h> | 17 | #include <linux/nfsd/export.h> |
18 | #include <linux/lockd/lockd.h> | 18 | #include <linux/lockd/lockd.h> |
19 | #include <linux/lockd/share.h> | 19 | #include <linux/lockd/share.h> |
20 | #include <linux/lockd/sm_inter.h> | ||
21 | #include <linux/module.h> | 20 | #include <linux/module.h> |
22 | #include <linux/mount.h> | 21 | #include <linux/mount.h> |
23 | 22 | ||
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 1f226290c67c..0336f2beacde 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c | |||
@@ -16,7 +16,6 @@ | |||
16 | #include <linux/sunrpc/svc.h> | 16 | #include <linux/sunrpc/svc.h> |
17 | #include <linux/sunrpc/stats.h> | 17 | #include <linux/sunrpc/stats.h> |
18 | #include <linux/lockd/lockd.h> | 18 | #include <linux/lockd/lockd.h> |
19 | #include <linux/lockd/sm_inter.h> | ||
20 | 19 | ||
21 | #define NLMDBG_FACILITY NLMDBG_XDR | 20 | #define NLMDBG_FACILITY NLMDBG_XDR |
22 | 21 | ||
@@ -349,8 +348,8 @@ nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp) | |||
349 | if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) | 348 | if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) |
350 | return 0; | 349 | return 0; |
351 | argp->state = ntohl(*p++); | 350 | argp->state = ntohl(*p++); |
352 | /* Preserve the address in network byte order */ | 351 | memcpy(&argp->priv.data, p, sizeof(argp->priv.data)); |
353 | argp->addr = *p++; | 352 | p += XDR_QUADLEN(SM_PRIV_SIZE); |
354 | return xdr_argsize_check(rqstp, p); | 353 | return xdr_argsize_check(rqstp, p); |
355 | } | 354 | } |
356 | 355 | ||
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index 50c493a8ad8e..e1d528653192 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c | |||
@@ -17,7 +17,6 @@ | |||
17 | #include <linux/sunrpc/svc.h> | 17 | #include <linux/sunrpc/svc.h> |
18 | #include <linux/sunrpc/stats.h> | 18 | #include <linux/sunrpc/stats.h> |
19 | #include <linux/lockd/lockd.h> | 19 | #include <linux/lockd/lockd.h> |
20 | #include <linux/lockd/sm_inter.h> | ||
21 | 20 | ||
22 | #define NLMDBG_FACILITY NLMDBG_XDR | 21 | #define NLMDBG_FACILITY NLMDBG_XDR |
23 | 22 | ||
@@ -356,8 +355,8 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp | |||
356 | if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) | 355 | if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) |
357 | return 0; | 356 | return 0; |
358 | argp->state = ntohl(*p++); | 357 | argp->state = ntohl(*p++); |
359 | /* Preserve the address in network byte order */ | 358 | memcpy(&argp->priv.data, p, sizeof(argp->priv.data)); |
360 | argp->addr = *p++; | 359 | p += XDR_QUADLEN(SM_PRIV_SIZE); |
361 | return xdr_argsize_check(rqstp, p); | 360 | return xdr_argsize_check(rqstp, p); |
362 | } | 361 | } |
363 | 362 | ||
diff --git a/fs/minix/dir.c b/fs/minix/dir.c index f70433816a38..d4946c4c90e2 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c | |||
@@ -280,7 +280,7 @@ int minix_add_link(struct dentry *dentry, struct inode *inode) | |||
280 | return -EINVAL; | 280 | return -EINVAL; |
281 | 281 | ||
282 | got_it: | 282 | got_it: |
283 | pos = (page->index >> PAGE_CACHE_SHIFT) + p - (char*)page_address(page); | 283 | pos = page_offset(page) + p - (char *)page_address(page); |
284 | err = __minix_write_begin(NULL, page->mapping, pos, sbi->s_dirsize, | 284 | err = __minix_write_begin(NULL, page->mapping, pos, sbi->s_dirsize, |
285 | AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); | 285 | AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); |
286 | if (err) | 286 | if (err) |
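The minix hunk above is a bug fix rather than a cleanup: the byte position of a directory entry is the page index shifted left by PAGE_CACHE_SHIFT plus the offset within the page, which is what page_offset() computes, whereas the removed expression shifted the index right. A minimal check of the arithmetic (userspace, assuming 4 KiB pages for the demo):

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12		/* assume 4 KiB pages for this demo */

int main(void)
{
	unsigned long index = 3;	/* fourth page of the directory */
	unsigned long in_page = 64;	/* entry offset within that page */
	long long pos = ((long long)index << PAGE_CACHE_SHIFT) + in_page;

	printf("pos = %lld\n", pos);	/* 3*4096 + 64 = 12352 */
	return 0;
}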
diff --git a/fs/mpage.c b/fs/mpage.c index 552b80b3facc..16c3ef37eae3 100644 --- a/fs/mpage.c +++ b/fs/mpage.c | |||
@@ -241,7 +241,6 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, | |||
241 | first_hole = page_block; | 241 | first_hole = page_block; |
242 | page_block++; | 242 | page_block++; |
243 | block_in_file++; | 243 | block_in_file++; |
244 | clear_buffer_mapped(map_bh); | ||
245 | continue; | 244 | continue; |
246 | } | 245 | } |
247 | 246 | ||
@@ -308,7 +307,10 @@ alloc_new: | |||
308 | goto alloc_new; | 307 | goto alloc_new; |
309 | } | 308 | } |
310 | 309 | ||
311 | if (buffer_boundary(map_bh) || (first_hole != blocks_per_page)) | 310 | relative_block = block_in_file - *first_logical_block; |
311 | nblocks = map_bh->b_size >> blkbits; | ||
312 | if ((buffer_boundary(map_bh) && relative_block == nblocks) || | ||
313 | (first_hole != blocks_per_page)) | ||
312 | bio = mpage_bio_submit(READ, bio); | 314 | bio = mpage_bio_submit(READ, bio); |
313 | else | 315 | else |
314 | *last_block_in_bio = blocks[blocks_per_page - 1]; | 316 | *last_block_in_bio = blocks[blocks_per_page - 1]; |
diff --git a/fs/namei.c b/fs/namei.c index df2d3df4f049..f05bed242422 100644 --- a/fs/namei.c +++ b/fs/namei.c | |||
@@ -257,7 +257,7 @@ int inode_permission(struct inode *inode, int mask) | |||
257 | return -EACCES; | 257 | return -EACCES; |
258 | } | 258 | } |
259 | 259 | ||
260 | if (inode->i_op && inode->i_op->permission) | 260 | if (inode->i_op->permission) |
261 | retval = inode->i_op->permission(inode, mask); | 261 | retval = inode->i_op->permission(inode, mask); |
262 | else | 262 | else |
263 | retval = generic_permission(inode, mask, NULL); | 263 | retval = generic_permission(inode, mask, NULL); |
@@ -432,7 +432,7 @@ static int exec_permission_lite(struct inode *inode) | |||
432 | { | 432 | { |
433 | umode_t mode = inode->i_mode; | 433 | umode_t mode = inode->i_mode; |
434 | 434 | ||
435 | if (inode->i_op && inode->i_op->permission) | 435 | if (inode->i_op->permission) |
436 | return -EAGAIN; | 436 | return -EAGAIN; |
437 | 437 | ||
438 | if (current_fsuid() == inode->i_uid) | 438 | if (current_fsuid() == inode->i_uid) |
@@ -908,9 +908,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd) | |||
908 | inode = next.dentry->d_inode; | 908 | inode = next.dentry->d_inode; |
909 | if (!inode) | 909 | if (!inode) |
910 | goto out_dput; | 910 | goto out_dput; |
911 | err = -ENOTDIR; | ||
912 | if (!inode->i_op) | ||
913 | goto out_dput; | ||
914 | 911 | ||
915 | if (inode->i_op->follow_link) { | 912 | if (inode->i_op->follow_link) { |
916 | err = do_follow_link(&next, nd); | 913 | err = do_follow_link(&next, nd); |
@@ -920,9 +917,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd) | |||
920 | inode = nd->path.dentry->d_inode; | 917 | inode = nd->path.dentry->d_inode; |
921 | if (!inode) | 918 | if (!inode) |
922 | break; | 919 | break; |
923 | err = -ENOTDIR; | ||
924 | if (!inode->i_op) | ||
925 | break; | ||
926 | } else | 920 | } else |
927 | path_to_nameidata(&next, nd); | 921 | path_to_nameidata(&next, nd); |
928 | err = -ENOTDIR; | 922 | err = -ENOTDIR; |
@@ -961,7 +955,7 @@ last_component: | |||
961 | break; | 955 | break; |
962 | inode = next.dentry->d_inode; | 956 | inode = next.dentry->d_inode; |
963 | if ((lookup_flags & LOOKUP_FOLLOW) | 957 | if ((lookup_flags & LOOKUP_FOLLOW) |
964 | && inode && inode->i_op && inode->i_op->follow_link) { | 958 | && inode && inode->i_op->follow_link) { |
965 | err = do_follow_link(&next, nd); | 959 | err = do_follow_link(&next, nd); |
966 | if (err) | 960 | if (err) |
967 | goto return_err; | 961 | goto return_err; |
@@ -973,7 +967,7 @@ last_component: | |||
973 | break; | 967 | break; |
974 | if (lookup_flags & LOOKUP_DIRECTORY) { | 968 | if (lookup_flags & LOOKUP_DIRECTORY) { |
975 | err = -ENOTDIR; | 969 | err = -ENOTDIR; |
976 | if (!inode->i_op || !inode->i_op->lookup) | 970 | if (!inode->i_op->lookup) |
977 | break; | 971 | break; |
978 | } | 972 | } |
979 | goto return_base; | 973 | goto return_base; |
@@ -1469,7 +1463,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode, | |||
1469 | if (error) | 1463 | if (error) |
1470 | return error; | 1464 | return error; |
1471 | 1465 | ||
1472 | if (!dir->i_op || !dir->i_op->create) | 1466 | if (!dir->i_op->create) |
1473 | return -EACCES; /* shouldn't it be ENOSYS? */ | 1467 | return -EACCES; /* shouldn't it be ENOSYS? */ |
1474 | mode &= S_IALLUGO; | 1468 | mode &= S_IALLUGO; |
1475 | mode |= S_IFREG; | 1469 | mode |= S_IFREG; |
@@ -1752,7 +1746,7 @@ do_last: | |||
1752 | error = -ENOENT; | 1746 | error = -ENOENT; |
1753 | if (!path.dentry->d_inode) | 1747 | if (!path.dentry->d_inode) |
1754 | goto exit_dput; | 1748 | goto exit_dput; |
1755 | if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link) | 1749 | if (path.dentry->d_inode->i_op->follow_link) |
1756 | goto do_link; | 1750 | goto do_link; |
1757 | 1751 | ||
1758 | path_to_nameidata(&path, &nd); | 1752 | path_to_nameidata(&path, &nd); |
@@ -1933,7 +1927,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) | |||
1933 | if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) | 1927 | if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) |
1934 | return -EPERM; | 1928 | return -EPERM; |
1935 | 1929 | ||
1936 | if (!dir->i_op || !dir->i_op->mknod) | 1930 | if (!dir->i_op->mknod) |
1937 | return -EPERM; | 1931 | return -EPERM; |
1938 | 1932 | ||
1939 | error = devcgroup_inode_mknod(mode, dev); | 1933 | error = devcgroup_inode_mknod(mode, dev); |
@@ -2035,7 +2029,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) | |||
2035 | if (error) | 2029 | if (error) |
2036 | return error; | 2030 | return error; |
2037 | 2031 | ||
2038 | if (!dir->i_op || !dir->i_op->mkdir) | 2032 | if (!dir->i_op->mkdir) |
2039 | return -EPERM; | 2033 | return -EPERM; |
2040 | 2034 | ||
2041 | mode &= (S_IRWXUGO|S_ISVTX); | 2035 | mode &= (S_IRWXUGO|S_ISVTX); |
@@ -2126,7 +2120,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) | |||
2126 | if (error) | 2120 | if (error) |
2127 | return error; | 2121 | return error; |
2128 | 2122 | ||
2129 | if (!dir->i_op || !dir->i_op->rmdir) | 2123 | if (!dir->i_op->rmdir) |
2130 | return -EPERM; | 2124 | return -EPERM; |
2131 | 2125 | ||
2132 | DQUOT_INIT(dir); | 2126 | DQUOT_INIT(dir); |
@@ -2213,7 +2207,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry) | |||
2213 | if (error) | 2207 | if (error) |
2214 | return error; | 2208 | return error; |
2215 | 2209 | ||
2216 | if (!dir->i_op || !dir->i_op->unlink) | 2210 | if (!dir->i_op->unlink) |
2217 | return -EPERM; | 2211 | return -EPERM; |
2218 | 2212 | ||
2219 | DQUOT_INIT(dir); | 2213 | DQUOT_INIT(dir); |
@@ -2320,7 +2314,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) | |||
2320 | if (error) | 2314 | if (error) |
2321 | return error; | 2315 | return error; |
2322 | 2316 | ||
2323 | if (!dir->i_op || !dir->i_op->symlink) | 2317 | if (!dir->i_op->symlink) |
2324 | return -EPERM; | 2318 | return -EPERM; |
2325 | 2319 | ||
2326 | error = security_inode_symlink(dir, dentry, oldname); | 2320 | error = security_inode_symlink(dir, dentry, oldname); |
@@ -2401,7 +2395,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de | |||
2401 | */ | 2395 | */ |
2402 | if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) | 2396 | if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) |
2403 | return -EPERM; | 2397 | return -EPERM; |
2404 | if (!dir->i_op || !dir->i_op->link) | 2398 | if (!dir->i_op->link) |
2405 | return -EPERM; | 2399 | return -EPERM; |
2406 | if (S_ISDIR(inode->i_mode)) | 2400 | if (S_ISDIR(inode->i_mode)) |
2407 | return -EPERM; | 2401 | return -EPERM; |
@@ -2608,7 +2602,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
2608 | if (error) | 2602 | if (error) |
2609 | return error; | 2603 | return error; |
2610 | 2604 | ||
2611 | if (!old_dir->i_op || !old_dir->i_op->rename) | 2605 | if (!old_dir->i_op->rename) |
2612 | return -EPERM; | 2606 | return -EPERM; |
2613 | 2607 | ||
2614 | DQUOT_INIT(old_dir); | 2608 | DQUOT_INIT(old_dir); |
diff --git a/fs/ncpfs/getopt.c b/fs/ncpfs/getopt.c index 335b003dddf9..0af3349de851 100644 --- a/fs/ncpfs/getopt.c +++ b/fs/ncpfs/getopt.c | |||
@@ -16,7 +16,6 @@ | |||
16 | * @opts: an array of &struct option entries controlling parser operations | 16 | * @opts: an array of &struct option entries controlling parser operations |
17 | * @optopt: output; will contain the current option | 17 | * @optopt: output; will contain the current option |
18 | * @optarg: output; will contain the value (if one exists) | 18 | * @optarg: output; will contain the value (if one exists) |
19 | * @flag: output; may be NULL; should point to a long for or'ing flags | ||
20 | * @value: output; may be NULL; will be overwritten with the integer value | 19 | * @value: output; may be NULL; will be overwritten with the integer value |
21 | * of the current argument. | 20 | * of the current argument. |
22 | * | 21 | * |
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index 6d04e050c74e..f54360f50a9c 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c | |||
@@ -98,7 +98,7 @@ struct compat_ncp_objectname_ioctl | |||
98 | { | 98 | { |
99 | s32 auth_type; | 99 | s32 auth_type; |
100 | u32 object_name_len; | 100 | u32 object_name_len; |
101 | compat_caddr_t object_name; /* an userspace data, in most cases user name */ | 101 | compat_caddr_t object_name; /* a userspace data, in most cases user name */ |
102 | }; | 102 | }; |
103 | 103 | ||
104 | struct compat_ncp_fs_info_v2 { | 104 | struct compat_ncp_fs_info_v2 { |
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 0184fe9b514c..c903e04aa217 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c | |||
@@ -76,10 +76,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) | |||
76 | 76 | ||
77 | ret = set_groups(new, gi); | 77 | ret = set_groups(new, gi); |
78 | put_group_info(gi); | 78 | put_group_info(gi); |
79 | if (!ret) | 79 | if (ret < 0) |
80 | goto error; | 80 | goto error; |
81 | 81 | ||
82 | if (new->uid) | 82 | if (new->fsuid) |
83 | new->cap_effective = cap_drop_nfsd_set(new->cap_effective); | 83 | new->cap_effective = cap_drop_nfsd_set(new->cap_effective); |
84 | else | 84 | else |
85 | new->cap_effective = cap_raise_nfsd_set(new->cap_effective, | 85 | new->cap_effective = cap_raise_nfsd_set(new->cap_effective, |
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 6d7d8c02c197..c464181b5994 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c | |||
@@ -53,9 +53,6 @@ | |||
53 | #define NFSPROC4_CB_NULL 0 | 53 | #define NFSPROC4_CB_NULL 0 |
54 | #define NFSPROC4_CB_COMPOUND 1 | 54 | #define NFSPROC4_CB_COMPOUND 1 |
55 | 55 | ||
56 | /* declarations */ | ||
57 | static const struct rpc_call_ops nfs4_cb_null_ops; | ||
58 | |||
59 | /* Index of predefined Linux callback client operations */ | 56 | /* Index of predefined Linux callback client operations */ |
60 | 57 | ||
61 | enum { | 58 | enum { |
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 669461e291ae..9fa60a3ad48c 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c | |||
@@ -946,6 +946,11 @@ encode_op: | |||
946 | nfsd4_encode_operation(resp, op); | 946 | nfsd4_encode_operation(resp, op); |
947 | status = op->status; | 947 | status = op->status; |
948 | } | 948 | } |
949 | |||
950 | dprintk("nfsv4 compound op %p opcnt %d #%d: %d: status %d\n", | ||
951 | args->ops, args->opcnt, resp->opcnt, op->opnum, | ||
952 | be32_to_cpu(status)); | ||
953 | |||
949 | if (cstate->replay_owner) { | 954 | if (cstate->replay_owner) { |
950 | nfs4_put_stateowner(cstate->replay_owner); | 955 | nfs4_put_stateowner(cstate->replay_owner); |
951 | cstate->replay_owner = NULL; | 956 | cstate->replay_owner = NULL; |
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 0f9d6efaa62b..74f7b67567fd 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c | |||
@@ -116,9 +116,9 @@ nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname) | |||
116 | 116 | ||
117 | md5_to_hex(dname, cksum.data); | 117 | md5_to_hex(dname, cksum.data); |
118 | 118 | ||
119 | kfree(cksum.data); | ||
120 | status = nfs_ok; | 119 | status = nfs_ok; |
121 | out: | 120 | out: |
121 | kfree(cksum.data); | ||
122 | crypto_free_hash(desc.tfm); | 122 | crypto_free_hash(desc.tfm); |
123 | out_no_tfm: | 123 | out_no_tfm: |
124 | return status; | 124 | return status; |
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 13e0e074dbb8..88db7d3ec120 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c | |||
@@ -2416,6 +2416,26 @@ out: | |||
2416 | #define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS) | 2416 | #define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS) |
2417 | #define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1) | 2417 | #define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1) |
2418 | 2418 | ||
2419 | static inline u64 | ||
2420 | end_offset(u64 start, u64 len) | ||
2421 | { | ||
2422 | u64 end; | ||
2423 | |||
2424 | end = start + len; | ||
2425 | return end >= start ? end: NFS4_MAX_UINT64; | ||
2426 | } | ||
2427 | |||
2428 | /* last octet in a range */ | ||
2429 | static inline u64 | ||
2430 | last_byte_offset(u64 start, u64 len) | ||
2431 | { | ||
2432 | u64 end; | ||
2433 | |||
2434 | BUG_ON(!len); | ||
2435 | end = start + len; | ||
2436 | return end > start ? end - 1: NFS4_MAX_UINT64; | ||
2437 | } | ||
2438 | |||
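The two helpers just added clamp an NFSv4 (offset, length) pair against 64-bit wraparound: end_offset() yields the first byte past the range, last_byte_offset() the last byte inside it, and both saturate when start + len overflows, so a length of ~0 means "to the end of the file". A standalone check of that arithmetic, assuming NFS4_MAX_UINT64 is ~0 as in the diff:

#include <assert.h>
#include <stdint.h>

#define NFS4_MAX_UINT64 (~(uint64_t)0)

static uint64_t last_byte_offset(uint64_t start, uint64_t len)
{
	uint64_t end = start + len;

	/* len == 0 is rejected separately (see check_lock_length) */
	return end > start ? end - 1 : NFS4_MAX_UINT64;
}

int main(void)
{
	/* ordinary range: 100 bytes starting at offset 100 -> bytes 100..199 */
	assert(last_byte_offset(100, 100) == 199);
	/* "lock to EOF": length ~0 wraps, so the range stays open-ended */
	assert(last_byte_offset(100, NFS4_MAX_UINT64) == NFS4_MAX_UINT64);
	return 0;
}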
2419 | #define lockownerid_hashval(id) \ | 2439 | #define lockownerid_hashval(id) \ |
2420 | ((id) & LOCK_HASH_MASK) | 2440 | ((id) & LOCK_HASH_MASK) |
2421 | 2441 | ||
@@ -2435,13 +2455,13 @@ static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE]; | |||
2435 | static struct nfs4_stateid * | 2455 | static struct nfs4_stateid * |
2436 | find_stateid(stateid_t *stid, int flags) | 2456 | find_stateid(stateid_t *stid, int flags) |
2437 | { | 2457 | { |
2438 | struct nfs4_stateid *local = NULL; | 2458 | struct nfs4_stateid *local; |
2439 | u32 st_id = stid->si_stateownerid; | 2459 | u32 st_id = stid->si_stateownerid; |
2440 | u32 f_id = stid->si_fileid; | 2460 | u32 f_id = stid->si_fileid; |
2441 | unsigned int hashval; | 2461 | unsigned int hashval; |
2442 | 2462 | ||
2443 | dprintk("NFSD: find_stateid flags 0x%x\n",flags); | 2463 | dprintk("NFSD: find_stateid flags 0x%x\n",flags); |
2444 | if ((flags & LOCK_STATE) || (flags & RD_STATE) || (flags & WR_STATE)) { | 2464 | if (flags & (LOCK_STATE | RD_STATE | WR_STATE)) { |
2445 | hashval = stateid_hashval(st_id, f_id); | 2465 | hashval = stateid_hashval(st_id, f_id); |
2446 | list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) { | 2466 | list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) { |
2447 | if ((local->st_stateid.si_stateownerid == st_id) && | 2467 | if ((local->st_stateid.si_stateownerid == st_id) && |
@@ -2449,7 +2469,8 @@ find_stateid(stateid_t *stid, int flags) | |||
2449 | return local; | 2469 | return local; |
2450 | } | 2470 | } |
2451 | } | 2471 | } |
2452 | if ((flags & OPEN_STATE) || (flags & RD_STATE) || (flags & WR_STATE)) { | 2472 | |
2473 | if (flags & (OPEN_STATE | RD_STATE | WR_STATE)) { | ||
2453 | hashval = stateid_hashval(st_id, f_id); | 2474 | hashval = stateid_hashval(st_id, f_id); |
2454 | list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) { | 2475 | list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) { |
2455 | if ((local->st_stateid.si_stateownerid == st_id) && | 2476 | if ((local->st_stateid.si_stateownerid == st_id) && |
@@ -2518,8 +2539,8 @@ nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) | |||
2518 | deny->ld_clientid.cl_id = 0; | 2539 | deny->ld_clientid.cl_id = 0; |
2519 | } | 2540 | } |
2520 | deny->ld_start = fl->fl_start; | 2541 | deny->ld_start = fl->fl_start; |
2521 | deny->ld_length = ~(u64)0; | 2542 | deny->ld_length = NFS4_MAX_UINT64; |
2522 | if (fl->fl_end != ~(u64)0) | 2543 | if (fl->fl_end != NFS4_MAX_UINT64) |
2523 | deny->ld_length = fl->fl_end - fl->fl_start + 1; | 2544 | deny->ld_length = fl->fl_end - fl->fl_start + 1; |
2524 | deny->ld_type = NFS4_READ_LT; | 2545 | deny->ld_type = NFS4_READ_LT; |
2525 | if (fl->fl_type != F_RDLCK) | 2546 | if (fl->fl_type != F_RDLCK) |
@@ -2616,7 +2637,7 @@ out: | |||
2616 | static int | 2637 | static int |
2617 | check_lock_length(u64 offset, u64 length) | 2638 | check_lock_length(u64 offset, u64 length) |
2618 | { | 2639 | { |
2619 | return ((length == 0) || ((length != ~(u64)0) && | 2640 | return ((length == 0) || ((length != NFS4_MAX_UINT64) && |
2620 | LOFF_OVERFLOW(offset, length))); | 2641 | LOFF_OVERFLOW(offset, length))); |
2621 | } | 2642 | } |
2622 | 2643 | ||
@@ -2736,11 +2757,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
2736 | file_lock.fl_lmops = &nfsd_posix_mng_ops; | 2757 | file_lock.fl_lmops = &nfsd_posix_mng_ops; |
2737 | 2758 | ||
2738 | file_lock.fl_start = lock->lk_offset; | 2759 | file_lock.fl_start = lock->lk_offset; |
2739 | if ((lock->lk_length == ~(u64)0) || | 2760 | file_lock.fl_end = last_byte_offset(lock->lk_offset, lock->lk_length); |
2740 | LOFF_OVERFLOW(lock->lk_offset, lock->lk_length)) | ||
2741 | file_lock.fl_end = ~(u64)0; | ||
2742 | else | ||
2743 | file_lock.fl_end = lock->lk_offset + lock->lk_length - 1; | ||
2744 | nfs4_transform_lock_offset(&file_lock); | 2761 | nfs4_transform_lock_offset(&file_lock); |
2745 | 2762 | ||
2746 | /* | 2763 | /* |
@@ -2781,6 +2798,25 @@ out: | |||
2781 | } | 2798 | } |
2782 | 2799 | ||
2783 | /* | 2800 | /* |
2801 | * The NFSv4 spec allows a client to do a LOCKT without holding an OPEN, | ||
2802 | * so we do a temporary open here just to get an open file to pass to | ||
2803 | * vfs_test_lock. (Arguably perhaps test_lock should be done with an | ||
2804 | * inode operation.) | ||
2805 | */ | ||
2806 | static int nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock) | ||
2807 | { | ||
2808 | struct file *file; | ||
2809 | int err; | ||
2810 | |||
2811 | err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); | ||
2812 | if (err) | ||
2813 | return err; | ||
2814 | err = vfs_test_lock(file, lock); | ||
2815 | nfsd_close(file); | ||
2816 | return err; | ||
2817 | } | ||
2818 | |||
2819 | /* | ||
2784 | * LOCKT operation | 2820 | * LOCKT operation |
2785 | */ | 2821 | */ |
2786 | __be32 | 2822 | __be32 |
@@ -2788,7 +2824,6 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
2788 | struct nfsd4_lockt *lockt) | 2824 | struct nfsd4_lockt *lockt) |
2789 | { | 2825 | { |
2790 | struct inode *inode; | 2826 | struct inode *inode; |
2791 | struct file file; | ||
2792 | struct file_lock file_lock; | 2827 | struct file_lock file_lock; |
2793 | int error; | 2828 | int error; |
2794 | __be32 status; | 2829 | __be32 status; |
@@ -2839,23 +2874,12 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
2839 | file_lock.fl_lmops = &nfsd_posix_mng_ops; | 2874 | file_lock.fl_lmops = &nfsd_posix_mng_ops; |
2840 | 2875 | ||
2841 | file_lock.fl_start = lockt->lt_offset; | 2876 | file_lock.fl_start = lockt->lt_offset; |
2842 | if ((lockt->lt_length == ~(u64)0) || LOFF_OVERFLOW(lockt->lt_offset, lockt->lt_length)) | 2877 | file_lock.fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length); |
2843 | file_lock.fl_end = ~(u64)0; | ||
2844 | else | ||
2845 | file_lock.fl_end = lockt->lt_offset + lockt->lt_length - 1; | ||
2846 | 2878 | ||
2847 | nfs4_transform_lock_offset(&file_lock); | 2879 | nfs4_transform_lock_offset(&file_lock); |
2848 | 2880 | ||
2849 | /* vfs_test_lock uses the struct file _only_ to resolve the inode. | ||
2850 | * since LOCKT doesn't require an OPEN, and therefore a struct | ||
2851 | * file may not exist, pass vfs_test_lock a struct file with | ||
2852 | * only the dentry:inode set. | ||
2853 | */ | ||
2854 | memset(&file, 0, sizeof (struct file)); | ||
2855 | file.f_path.dentry = cstate->current_fh.fh_dentry; | ||
2856 | |||
2857 | status = nfs_ok; | 2881 | status = nfs_ok; |
2858 | error = vfs_test_lock(&file, &file_lock); | 2882 | error = nfsd_test_lock(rqstp, &cstate->current_fh, &file_lock); |
2859 | if (error) { | 2883 | if (error) { |
2860 | status = nfserrno(error); | 2884 | status = nfserrno(error); |
2861 | goto out; | 2885 | goto out; |
@@ -2906,10 +2930,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, | |||
2906 | file_lock.fl_lmops = &nfsd_posix_mng_ops; | 2930 | file_lock.fl_lmops = &nfsd_posix_mng_ops; |
2907 | file_lock.fl_start = locku->lu_offset; | 2931 | file_lock.fl_start = locku->lu_offset; |
2908 | 2932 | ||
2909 | if ((locku->lu_length == ~(u64)0) || LOFF_OVERFLOW(locku->lu_offset, locku->lu_length)) | 2933 | file_lock.fl_end = last_byte_offset(locku->lu_offset, locku->lu_length); |
2910 | file_lock.fl_end = ~(u64)0; | ||
2911 | else | ||
2912 | file_lock.fl_end = locku->lu_offset + locku->lu_length - 1; | ||
2913 | nfs4_transform_lock_offset(&file_lock); | 2934 | nfs4_transform_lock_offset(&file_lock); |
2914 | 2935 | ||
2915 | /* | 2936 | /* |
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index afcdf4b76843..f65953be39c0 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c | |||
@@ -1,6 +1,4 @@ | |||
1 | /* | 1 | /* |
2 | * fs/nfs/nfs4xdr.c | ||
3 | * | ||
4 | * Server-side XDR for NFSv4 | 2 | * Server-side XDR for NFSv4 |
5 | * | 3 | * |
6 | * Copyright (c) 2002 The Regents of the University of Michigan. | 4 | * Copyright (c) 2002 The Regents of the University of Michigan. |
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 77d7b8c531a6..3d93b2064ce5 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c | |||
@@ -84,6 +84,8 @@ static ssize_t write_unexport(struct file *file, char *buf, size_t size); | |||
84 | static ssize_t write_getfd(struct file *file, char *buf, size_t size); | 84 | static ssize_t write_getfd(struct file *file, char *buf, size_t size); |
85 | static ssize_t write_getfs(struct file *file, char *buf, size_t size); | 85 | static ssize_t write_getfs(struct file *file, char *buf, size_t size); |
86 | static ssize_t write_filehandle(struct file *file, char *buf, size_t size); | 86 | static ssize_t write_filehandle(struct file *file, char *buf, size_t size); |
87 | static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size); | ||
88 | static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size); | ||
87 | static ssize_t write_threads(struct file *file, char *buf, size_t size); | 89 | static ssize_t write_threads(struct file *file, char *buf, size_t size); |
88 | static ssize_t write_pool_threads(struct file *file, char *buf, size_t size); | 90 | static ssize_t write_pool_threads(struct file *file, char *buf, size_t size); |
89 | static ssize_t write_versions(struct file *file, char *buf, size_t size); | 91 | static ssize_t write_versions(struct file *file, char *buf, size_t size); |
@@ -94,9 +96,6 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size); | |||
94 | static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); | 96 | static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); |
95 | #endif | 97 | #endif |
96 | 98 | ||
97 | static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size); | ||
98 | static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size); | ||
99 | |||
100 | static ssize_t (*write_op[])(struct file *, char *, size_t) = { | 99 | static ssize_t (*write_op[])(struct file *, char *, size_t) = { |
101 | [NFSD_Svc] = write_svc, | 100 | [NFSD_Svc] = write_svc, |
102 | [NFSD_Add] = write_add, | 101 | [NFSD_Add] = write_add, |
@@ -106,8 +105,8 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = { | |||
106 | [NFSD_Getfd] = write_getfd, | 105 | [NFSD_Getfd] = write_getfd, |
107 | [NFSD_Getfs] = write_getfs, | 106 | [NFSD_Getfs] = write_getfs, |
108 | [NFSD_Fh] = write_filehandle, | 107 | [NFSD_Fh] = write_filehandle, |
109 | [NFSD_FO_UnlockIP] = failover_unlock_ip, | 108 | [NFSD_FO_UnlockIP] = write_unlock_ip, |
110 | [NFSD_FO_UnlockFS] = failover_unlock_fs, | 109 | [NFSD_FO_UnlockFS] = write_unlock_fs, |
111 | [NFSD_Threads] = write_threads, | 110 | [NFSD_Threads] = write_threads, |
112 | [NFSD_Pool_Threads] = write_pool_threads, | 111 | [NFSD_Pool_Threads] = write_pool_threads, |
113 | [NFSD_Versions] = write_versions, | 112 | [NFSD_Versions] = write_versions, |
@@ -176,10 +175,24 @@ static const struct file_operations exports_operations = { | |||
176 | /*----------------------------------------------------------------------------*/ | 175 | /*----------------------------------------------------------------------------*/ |
177 | /* | 176 | /* |
178 | * payload - write methods | 177 | * payload - write methods |
179 | * If the method has a response, the response should be put in buf, | ||
180 | * and the length returned. Otherwise return 0 or and -error. | ||
181 | */ | 178 | */ |
182 | 179 | ||
180 | /** | ||
181 | * write_svc - Start kernel's NFSD server | ||
182 | * | ||
183 | * Deprecated. /proc/fs/nfsd/threads is preferred. | ||
184 | * Function remains to support old versions of nfs-utils. | ||
185 | * | ||
186 | * Input: | ||
187 | * buf: struct nfsctl_svc | ||
188 | * svc_port: port number of this | ||
189 | * server's listener | ||
190 | * svc_nthreads: number of threads to start | ||
191 | * size: size in bytes of passed in nfsctl_svc | ||
192 | * Output: | ||
193 | * On success: returns zero | ||
194 | * On error: return code is negative errno value | ||
195 | */ | ||
183 | static ssize_t write_svc(struct file *file, char *buf, size_t size) | 196 | static ssize_t write_svc(struct file *file, char *buf, size_t size) |
184 | { | 197 | { |
185 | struct nfsctl_svc *data; | 198 | struct nfsctl_svc *data; |
@@ -189,6 +202,30 @@ static ssize_t write_svc(struct file *file, char *buf, size_t size) | |||
189 | return nfsd_svc(data->svc_port, data->svc_nthreads); | 202 | return nfsd_svc(data->svc_port, data->svc_nthreads); |
190 | } | 203 | } |
191 | 204 | ||
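As the new comment notes, write_svc() only survives for old nfs-utils; the preferred interface it points to is simply writing a decimal thread count to /proc/fs/nfsd/threads. A minimal sketch of that preferred path (assuming the nfsd filesystem is mounted at /proc/fs/nfsd, as is conventional):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/fs/nfsd/threads", "w");

	if (f == NULL) {
		perror("/proc/fs/nfsd/threads");
		return 1;
	}
	fprintf(f, "%d\n", 8);		/* start 8 nfsd threads */
	return fclose(f) == 0 ? 0 : 1;
}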
205 | /** | ||
206 | * write_add - Add or modify client entry in auth unix cache | ||
207 | * | ||
208 | * Deprecated. /proc/net/rpc/auth.unix.ip is preferred. | ||
209 | * Function remains to support old versions of nfs-utils. | ||
210 | * | ||
211 | * Input: | ||
212 | * buf: struct nfsctl_client | ||
213 | * cl_ident: '\0'-terminated C string | ||
214 | * containing domain name | ||
215 | * of client | ||
216 | * cl_naddr: no. of items in cl_addrlist | ||
217 | * cl_addrlist: array of client addresses | ||
218 | * cl_fhkeytype: ignored | ||
219 | * cl_fhkeylen: ignored | ||
220 | * cl_fhkey: ignored | ||
221 | * size: size in bytes of passed in nfsctl_client | ||
222 | * Output: | ||
223 | * On success: returns zero | ||
224 | * On error: return code is negative errno value | ||
225 | * | ||
226 | * Note: Only AF_INET client addresses are passed in, since | ||
227 | * nfsctl_client.cl_addrlist contains only in_addr fields for addresses. | ||
228 | */ | ||
192 | static ssize_t write_add(struct file *file, char *buf, size_t size) | 229 | static ssize_t write_add(struct file *file, char *buf, size_t size) |
193 | { | 230 | { |
194 | struct nfsctl_client *data; | 231 | struct nfsctl_client *data; |
@@ -198,6 +235,30 @@ static ssize_t write_add(struct file *file, char *buf, size_t size) | |||
198 | return exp_addclient(data); | 235 | return exp_addclient(data); |
199 | } | 236 | } |
200 | 237 | ||
238 | /** | ||
239 | * write_del - Remove client from auth unix cache | ||
240 | * | ||
241 | * Deprecated. /proc/net/rpc/auth.unix.ip is preferred. | ||
242 | * Function remains to support old versions of nfs-utils. | ||
243 | * | ||
244 | * Input: | ||
245 | * buf: struct nfsctl_client | ||
246 | * cl_ident: '\0'-terminated C string | ||
247 | * containing domain name | ||
248 | * of client | ||
249 | * cl_naddr: ignored | ||
250 | * cl_addrlist: ignored | ||
251 | * cl_fhkeytype: ignored | ||
252 | * cl_fhkeylen: ignored | ||
253 | * cl_fhkey: ignored | ||
254 | * size: size in bytes of passed in nfsctl_client | ||
255 | * Output: | ||
256 | * On success: returns zero | ||
257 | * On error: return code is negative errno value | ||
258 | * | ||
259 | * Note: Only AF_INET client addresses are passed in, since | ||
260 | * nfsctl_client.cl_addrlist contains only in_addr fields for addresses. | ||
261 | */ | ||
201 | static ssize_t write_del(struct file *file, char *buf, size_t size) | 262 | static ssize_t write_del(struct file *file, char *buf, size_t size) |
202 | { | 263 | { |
203 | struct nfsctl_client *data; | 264 | struct nfsctl_client *data; |
@@ -207,6 +268,33 @@ static ssize_t write_del(struct file *file, char *buf, size_t size) | |||
207 | return exp_delclient(data); | 268 | return exp_delclient(data); |
208 | } | 269 | } |
209 | 270 | ||
271 | /** | ||
272 | * write_export - Export part or all of a local file system | ||
273 | * | ||
274 | * Deprecated. /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred. | ||
275 | * Function remains to support old versions of nfs-utils. | ||
276 | * | ||
277 | * Input: | ||
278 | * buf: struct nfsctl_export | ||
279 | * ex_client: '\0'-terminated C string | ||
280 | * containing domain name | ||
281 | * of client allowed to access | ||
282 | * this export | ||
283 | * ex_path: '\0'-terminated C string | ||
284 | * containing pathname of | ||
285 | * directory in local file system | ||
286 | * ex_dev: fsid to use for this export | ||
287 | * ex_ino: ignored | ||
288 | * ex_flags: export flags for this export | ||
289 | * ex_anon_uid: UID to use for anonymous | ||
290 | * requests | ||
291 | * ex_anon_gid: GID to use for anonymous | ||
292 | * requests | ||
293 | * size: size in bytes of passed in nfsctl_export | ||
294 | * Output: | ||
295 | * On success: returns zero | ||
296 | * On error: return code is negative errno value | ||
297 | */ | ||
210 | static ssize_t write_export(struct file *file, char *buf, size_t size) | 298 | static ssize_t write_export(struct file *file, char *buf, size_t size) |
211 | { | 299 | { |
212 | struct nfsctl_export *data; | 300 | struct nfsctl_export *data; |
@@ -216,6 +304,31 @@ static ssize_t write_export(struct file *file, char *buf, size_t size) | |||
216 | return exp_export(data); | 304 | return exp_export(data); |
217 | } | 305 | } |
218 | 306 | ||
307 | /** | ||
308 | * write_unexport - Unexport a previously exported file system | ||
309 | * | ||
310 | * Deprecated. /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred. | ||
311 | * Function remains to support old versions of nfs-utils. | ||
312 | * | ||
313 | * Input: | ||
314 | * buf: struct nfsctl_export | ||
315 | * ex_client: '\0'-terminated C string | ||
316 | * containing domain name | ||
317 | * of client no longer allowed | ||
318 | * to access this export | ||
319 | * ex_path: '\0'-terminated C string | ||
320 | * containing pathname of | ||
321 | * directory in local file system | ||
322 | * ex_dev: ignored | ||
323 | * ex_ino: ignored | ||
324 | * ex_flags: ignored | ||
325 | * ex_anon_uid: ignored | ||
326 | * ex_anon_gid: ignored | ||
327 | * size: size in bytes of passed-in nfsctl_export | ||
328 | * Output: | ||
329 | * On success: returns zero | ||
330 | * On error: return code is negative errno value | ||
331 | */ | ||
219 | static ssize_t write_unexport(struct file *file, char *buf, size_t size) | 332 | static ssize_t write_unexport(struct file *file, char *buf, size_t size) |
220 | { | 333 | { |
221 | struct nfsctl_export *data; | 334 | struct nfsctl_export *data; |
@@ -226,6 +339,30 @@ static ssize_t write_unexport(struct file *file, char *buf, size_t size) | |||
226 | return exp_unexport(data); | 339 | return exp_unexport(data); |
227 | } | 340 | } |
228 | 341 | ||
342 | /** | ||
343 | * write_getfs - Get a variable-length NFS file handle by path | ||
344 | * | ||
345 | * Deprecated. /proc/fs/nfsd/filehandle is preferred. | ||
346 | * Function remains to support old versions of nfs-utils. | ||
347 | * | ||
348 | * Input: | ||
349 | * buf: struct nfsctl_fsparm | ||
350 | * gd_addr: socket address of client | ||
351 | * gd_path: '\0'-terminated C string | ||
352 | * containing pathname of | ||
353 | * directory in local file system | ||
354 | * gd_maxlen: maximum size of returned file | ||
355 | * handle | ||
356 | * size: size in bytes of passed-in nfsctl_fsparm | ||
357 | * Output: | ||
358 | * On success: passed-in buffer filled with a knfsd_fh structure | ||
359 | * (a variable-length raw NFS file handle); | ||
360 | * return code is the size in bytes of the file handle | ||
361 | * On error: return code is negative errno value | ||
362 | * | ||
363 | * Note: Only AF_INET client addresses are passed in, since gd_addr | ||
364 | * is the same size as a struct sockaddr_in. | ||
365 | */ | ||
229 | static ssize_t write_getfs(struct file *file, char *buf, size_t size) | 366 | static ssize_t write_getfs(struct file *file, char *buf, size_t size) |
230 | { | 367 | { |
231 | struct nfsctl_fsparm *data; | 368 | struct nfsctl_fsparm *data; |
@@ -265,6 +402,29 @@ static ssize_t write_getfs(struct file *file, char *buf, size_t size) | |||
265 | return err; | 402 | return err; |
266 | } | 403 | } |
267 | 404 | ||
405 | /** | ||
406 | * write_getfd - Get a fixed-length NFS file handle by path (used by mountd) | ||
407 | * | ||
408 | * Deprecated. /proc/fs/nfsd/filehandle is preferred. | ||
409 | * Function remains to support old versions of nfs-utils. | ||
410 | * | ||
411 | * Input: | ||
412 | * buf: struct nfsctl_fdparm | ||
413 | * gd_addr: socket address of client | ||
414 | * gd_path: '\0'-terminated C string | ||
415 | * containing pathname of | ||
416 | * directory in local file system | ||
417 | * gd_version: fdparm structure version | ||
418 | * size: size in bytes of passed-in nfsctl_fdparm | ||
419 | * Output: | ||
420 | * On success: passed-in buffer filled with nfsctl_res | ||
421 | * (a fixed-length raw NFS file handle); | ||
422 | * return code is the size in bytes of the file handle | ||
423 | * On error: return code is negative errno value | ||
424 | * | ||
425 | * Note: Only AF_INET client addresses are passed in, since gd_addr | ||
426 | * is the same size as a struct sockaddr_in. | ||
427 | */ | ||
268 | static ssize_t write_getfd(struct file *file, char *buf, size_t size) | 428 | static ssize_t write_getfd(struct file *file, char *buf, size_t size) |
269 | { | 429 | { |
270 | struct nfsctl_fdparm *data; | 430 | struct nfsctl_fdparm *data; |
@@ -309,7 +469,23 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size) | |||
309 | return err; | 469 | return err; |
310 | } | 470 | } |
311 | 471 | ||
312 | static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size) | 472 | /** |
473 | * write_unlock_ip - Release all locks used by a client | ||
474 | * | ||
475 | * Experimental. | ||
476 | * | ||
477 | * Input: | ||
478 | * buf: '\n'-terminated C string containing a | ||
479 | * presentation format IPv4 address | ||
480 | * size: length of C string in @buf | ||
481 | * Output: | ||
482 | * On success: returns zero if all specified locks were released; | ||
483 | * returns one if one or more locks were not released | ||
484 | * On error: return code is negative errno value | ||
485 | * | ||
486 | * Note: Only AF_INET client addresses are passed in | ||
487 | */ | ||
488 | static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) | ||
313 | { | 489 | { |
314 | struct sockaddr_in sin = { | 490 | struct sockaddr_in sin = { |
315 | .sin_family = AF_INET, | 491 | .sin_family = AF_INET, |
@@ -339,7 +515,21 @@ static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size) | |||
339 | return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin); | 515 | return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin); |
340 | } | 516 | } |
341 | 517 | ||
342 | static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size) | 518 | /** |
519 | * write_unlock_fs - Release all locks on a local file system | ||
520 | * | ||
521 | * Experimental. | ||
522 | * | ||
523 | * Input: | ||
524 | * buf: '\n'-terminated C string containing the | ||
525 | * absolute pathname of a local file system | ||
526 | * size: length of C string in @buf | ||
527 | * Output: | ||
528 | * On success: returns zero if all specified locks were released; | ||
529 | * returns one if one or more locks were not released | ||
530 | * On error: return code is negative errno value | ||
531 | */ | ||
532 | static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size) | ||
343 | { | 533 | { |
344 | struct path path; | 534 | struct path path; |
345 | char *fo_path; | 535 | char *fo_path; |
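The two experimental failover interfaces above each take a single '\n'-terminated string. A minimal user-space sketch of driving them, assuming the control files are exposed as /proc/fs/nfsd/unlock_ip and /proc/fs/nfsd/unlock_filesystem (those file names are not shown in this hunk):

/* Sketch only: release server-side locks via the nfsd failover files.
 * The /proc paths, client address and export path below are placeholders. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Write one '\n'-terminated request to an nfsd control file. */
static int nfsd_ctl_write(const char *file, const char *req)
{
	int fd = open(file, O_WRONLY);
	ssize_t n;

	if (fd < 0) {
		perror(file);
		return -1;
	}
	n = write(fd, req, strlen(req));
	if (n < 0)
		perror("write");
	close(fd);
	return n < 0 ? -1 : 0;
}

int main(void)
{
	/* Drop every lock held for one client address (AF_INET only) ... */
	nfsd_ctl_write("/proc/fs/nfsd/unlock_ip", "192.0.2.1\n");
	/* ... then every lock held against one exported file system. */
	nfsd_ctl_write("/proc/fs/nfsd/unlock_filesystem", "/export/home\n");
	return 0;
}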
@@ -360,21 +550,44 @@ static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size) | |||
360 | if (error) | 550 | if (error) |
361 | return error; | 551 | return error; |
362 | 552 | ||
553 | /* | ||
554 | * XXX: Needs better sanity checking. Otherwise we could end up | ||
555 | * releasing locks on the wrong file system. | ||
556 | * | ||
557 | * For example: | ||
558 | * 1. Does the path refer to a directory? | ||
559 | * 2. Is that directory a mount point, or | ||
560 | * 3. Is that directory the root of an exported file system? | ||
561 | */ | ||
363 | error = nlmsvc_unlock_all_by_sb(path.mnt->mnt_sb); | 562 | error = nlmsvc_unlock_all_by_sb(path.mnt->mnt_sb); |
364 | 563 | ||
365 | path_put(&path); | 564 | path_put(&path); |
366 | return error; | 565 | return error; |
367 | } | 566 | } |
368 | 567 | ||
568 | /** | ||
569 | * write_filehandle - Get a variable-length NFS file handle by path | ||
570 | * | ||
571 | * On input, the buffer contains a '\n'-terminated C string consisting of | ||
572 | * three alphanumeric words separated by whitespace. The string may | ||
573 | * contain escape sequences. | ||
574 | * | ||
575 | * Input: | ||
576 | * buf: | ||
577 | * domain: client domain name | ||
578 | * path: export pathname | ||
579 | * maxsize: numeric maximum size of | ||
580 | * @buf | ||
581 | * size: length of C string in @buf | ||
582 | * Output: | ||
583 | * On success: passed-in buffer filled with '\n'-terminated C | ||
584 | * string containing an ASCII hex text version | ||
585 | * of the NFS file handle; | ||
586 | * return code is the size in bytes of the string | ||
587 | * On error: return code is negative errno value | ||
588 | */ | ||
369 | static ssize_t write_filehandle(struct file *file, char *buf, size_t size) | 589 | static ssize_t write_filehandle(struct file *file, char *buf, size_t size) |
370 | { | 590 | { |
371 | /* request is: | ||
372 | * domain path maxsize | ||
373 | * response is | ||
374 | * filehandle | ||
375 | * | ||
376 | * qword quoting is used, so filehandle will be \x.... | ||
377 | */ | ||
378 | char *dname, *path; | 591 | char *dname, *path; |
379 | int uninitialized_var(maxsize); | 592 | int uninitialized_var(maxsize); |
380 | char *mesg = buf; | 593 | char *mesg = buf; |
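The nfsctl files are transaction files: the reply is read back on the same descriptor after the request has been written. A sketch of the three-word "domain path maxsize" request documented above; only the /proc/fs/nfsd/filehandle name comes from the comments, the transaction read-back is assumed:

/* Sketch only: ask knfsd for the file handle of an exported directory. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* client domain, export pathname, maximum handle size in bytes */
	static const char req[] = "*.example.com /export 64\n";
	char reply[512];
	ssize_t n;
	int fd;

	fd = open("/proc/fs/nfsd/filehandle", O_RDWR);
	if (fd < 0) {
		perror("filehandle");
		return 1;
	}
	if (write(fd, req, strlen(req)) < 0) {
		perror("write");
		close(fd);
		return 1;
	}
	n = read(fd, reply, sizeof(reply) - 1);
	if (n > 0) {
		reply[n] = '\0';
		printf("handle: %s", reply);	/* "\x...."-style hex handle */
	}
	close(fd);
	return 0;
}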
@@ -391,11 +604,13 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) | |||
391 | 604 | ||
392 | dname = mesg; | 605 | dname = mesg; |
393 | len = qword_get(&mesg, dname, size); | 606 | len = qword_get(&mesg, dname, size); |
394 | if (len <= 0) return -EINVAL; | 607 | if (len <= 0) |
608 | return -EINVAL; | ||
395 | 609 | ||
396 | path = dname+len+1; | 610 | path = dname+len+1; |
397 | len = qword_get(&mesg, path, size); | 611 | len = qword_get(&mesg, path, size); |
398 | if (len <= 0) return -EINVAL; | 612 | if (len <= 0) |
613 | return -EINVAL; | ||
399 | 614 | ||
400 | len = get_int(&mesg, &maxsize); | 615 | len = get_int(&mesg, &maxsize); |
401 | if (len) | 616 | if (len) |
@@ -419,17 +634,43 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) | |||
419 | if (len) | 634 | if (len) |
420 | return len; | 635 | return len; |
421 | 636 | ||
422 | mesg = buf; len = SIMPLE_TRANSACTION_LIMIT; | 637 | mesg = buf; |
638 | len = SIMPLE_TRANSACTION_LIMIT; | ||
423 | qword_addhex(&mesg, &len, (char*)&fh.fh_base, fh.fh_size); | 639 | qword_addhex(&mesg, &len, (char*)&fh.fh_base, fh.fh_size); |
424 | mesg[-1] = '\n'; | 640 | mesg[-1] = '\n'; |
425 | return mesg - buf; | 641 | return mesg - buf; |
426 | } | 642 | } |
427 | 643 | ||
644 | /** | ||
645 | * write_threads - Start NFSD, or report the current number of running threads | ||
646 | * | ||
647 | * Input: | ||
648 | * buf: ignored | ||
649 | * size: zero | ||
650 | * Output: | ||
651 | * On success: passed-in buffer filled with '\n'-terminated C | ||
652 | * string numeric value representing the number of | ||
653 | * running NFSD threads; | ||
654 | * return code is the size in bytes of the string | ||
655 | * On error: return code is zero | ||
656 | * | ||
657 | * OR | ||
658 | * | ||
659 | * Input: | ||
660 | * buf: C string containing an unsigned | ||
661 | * integer value representing the | ||
662 | * number of NFSD threads to start | ||
663 | * size: non-zero length of C string in @buf | ||
664 | * Output: | ||
665 | * On success: NFS service is started; | ||
666 | * passed-in buffer filled with '\n'-terminated C | ||
667 | * string numeric value representing the number of | ||
668 | * running NFSD threads; | ||
669 | * return code is the size in bytes of the string | ||
670 | * On error: return code is zero or a negative errno value | ||
671 | */ | ||
428 | static ssize_t write_threads(struct file *file, char *buf, size_t size) | 672 | static ssize_t write_threads(struct file *file, char *buf, size_t size) |
429 | { | 673 | { |
430 | /* if size > 0, look for a number of threads and call nfsd_svc | ||
431 | * then write out number of threads as reply | ||
432 | */ | ||
433 | char *mesg = buf; | 674 | char *mesg = buf; |
434 | int rv; | 675 | int rv; |
435 | if (size > 0) { | 676 | if (size > 0) { |
@@ -437,9 +678,9 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size) | |||
437 | rv = get_int(&mesg, &newthreads); | 678 | rv = get_int(&mesg, &newthreads); |
438 | if (rv) | 679 | if (rv) |
439 | return rv; | 680 | return rv; |
440 | if (newthreads <0) | 681 | if (newthreads < 0) |
441 | return -EINVAL; | 682 | return -EINVAL; |
442 | rv = nfsd_svc(2049, newthreads); | 683 | rv = nfsd_svc(NFS_PORT, newthreads); |
443 | if (rv) | 684 | if (rv) |
444 | return rv; | 685 | return rv; |
445 | } | 686 | } |
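The thread-count file follows the same write-then-read pattern (the literal 2049 above is only replaced by the NFS_PORT constant, not changed in value). A short sketch, assuming the file is /proc/fs/nfsd/threads:

/* Sketch only: start (or resize to) eight nfsd threads, then read back
 * the number of threads actually running. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[32];
	ssize_t n;
	int fd = open("/proc/fs/nfsd/threads", O_RDWR);

	if (fd < 0)
		return 1;
	if (write(fd, "8\n", 2) < 0)
		perror("write");
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		printf("threads: %s", buf);
	}
	close(fd);
	return 0;
}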
@@ -447,6 +688,28 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size) | |||
447 | return strlen(buf); | 688 | return strlen(buf); |
448 | } | 689 | } |
449 | 690 | ||
691 | /** | ||
692 | * write_pool_threads - Set or report the current number of threads per pool | ||
693 | * | ||
694 | * Input: | ||
695 | * buf: ignored | ||
696 | * size: zero | ||
697 | * | ||
698 | * OR | ||
699 | * | ||
700 | * Input: | ||
701 | * buf: C string containing whitespace- | ||
702 | * separated unsigned integer values | ||
703 | * representing the number of NFSD | ||
704 | * threads to start in each pool | ||
705 | * size: non-zero length of C string in @buf | ||
706 | * Output: | ||
707 | * On success: passed-in buffer filled with '\n'-terminated C | ||
708 | * string containing integer values representing the | ||
709 | * number of NFSD threads in each pool; | ||
710 | * return code is the size in bytes of the string | ||
711 | * On error: return code is zero or a negative errno value | ||
712 | */ | ||
450 | static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) | 713 | static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) |
451 | { | 714 | { |
452 | /* if size > 0, look for an array of number of threads per node | 715 | /* if size > 0, look for an array of number of threads per node |
@@ -517,10 +780,6 @@ out_free: | |||
517 | 780 | ||
518 | static ssize_t __write_versions(struct file *file, char *buf, size_t size) | 781 | static ssize_t __write_versions(struct file *file, char *buf, size_t size) |
519 | { | 782 | { |
520 | /* | ||
521 | * Format: | ||
522 | * [-/+]vers [-/+]vers ... | ||
523 | */ | ||
524 | char *mesg = buf; | 783 | char *mesg = buf; |
525 | char *vers, sign; | 784 | char *vers, sign; |
526 | int len, num; | 785 | int len, num; |
@@ -578,6 +837,38 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) | |||
578 | return len; | 837 | return len; |
579 | } | 838 | } |
580 | 839 | ||
840 | /** | ||
841 | * write_versions - Set or report the available NFS protocol versions | ||
842 | * | ||
843 | * Input: | ||
844 | * buf: ignored | ||
845 | * size: zero | ||
846 | * Output: | ||
847 | * On success: passed-in buffer filled with '\n'-terminated C | ||
848 | * string containing positive or negative integer | ||
849 | * values representing the current status of each | ||
850 | * protocol version; | ||
851 | * return code is the size in bytes of the string | ||
852 | * On error: return code is zero or a negative errno value | ||
853 | * | ||
854 | * OR | ||
855 | * | ||
856 | * Input: | ||
857 | * buf: C string containing whitespace- | ||
858 | * separated positive or negative | ||
859 | * integer values representing NFS | ||
860 | * protocol versions to enable ("+n") | ||
861 | * or disable ("-n") | ||
862 | * size: non-zero length of C string in @buf | ||
863 | * Output: | ||
864 | * On success: status of zero or more protocol versions has | ||
865 | * been updated; passed-in buffer filled with | ||
866 | * '\n'-terminated C string containing positive | ||
867 | * or negative integer values representing the | ||
868 | * current status of each protocol version; | ||
869 | * return code is the size in bytes of the string | ||
870 | * On error: return code is zero or a negative errno value | ||
871 | */ | ||
581 | static ssize_t write_versions(struct file *file, char *buf, size_t size) | 872 | static ssize_t write_versions(struct file *file, char *buf, size_t size) |
582 | { | 873 | { |
583 | ssize_t rv; | 874 | ssize_t rv; |
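Version toggles use the signed "+n"/"-n" tokens described above; such writes are normally rejected once nfsd threads are running, so they are issued first. A hedged sketch, assuming the file is /proc/fs/nfsd/versions:

/* Sketch only: enable NFSv3 and NFSv4 and disable NFSv2 before the
 * server is started. */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	static const char req[] = "+3 +4 -2\n";
	int fd = open("/proc/fs/nfsd/versions", O_WRONLY);

	if (fd < 0)
		return 1;
	if (write(fd, req, strlen(req)) < 0) {
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}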
@@ -687,6 +978,75 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size) | |||
687 | return -EINVAL; | 978 | return -EINVAL; |
688 | } | 979 | } |
689 | 980 | ||
981 | /** | ||
982 | * write_ports - Pass a socket file descriptor or transport name to listen on | ||
983 | * | ||
984 | * Input: | ||
985 | * buf: ignored | ||
986 | * size: zero | ||
987 | * Output: | ||
988 | * On success: passed-in buffer filled with a '\n'-terminated C | ||
989 | * string containing a whitespace-separated list of | ||
990 | * named NFSD listeners; | ||
991 | * return code is the size in bytes of the string | ||
992 | * On error: return code is zero or a negative errno value | ||
993 | * | ||
994 | * OR | ||
995 | * | ||
996 | * Input: | ||
997 | * buf: C string containing an unsigned | ||
998 | * integer value representing a bound | ||
999 | * but unconnected socket that is to be | ||
1000 | * used as an NFSD listener | ||
1001 | * size: non-zero length of C string in @buf | ||
1002 | * Output: | ||
1003 | * On success: NFS service is started; | ||
1004 | * passed-in buffer filled with a '\n'-terminated C | ||
1005 | * string containing a unique alphanumeric name of | ||
1006 | * the listener; | ||
1007 | * return code is the size in bytes of the string | ||
1008 | * On error: return code is a negative errno value | ||
1009 | * | ||
1010 | * OR | ||
1011 | * | ||
1012 | * Input: | ||
1013 | * buf: C string containing a "-" followed | ||
1014 | * by an integer value representing a | ||
1015 | * previously passed-in socket file | ||
1016 | * descriptor | ||
1017 | * size: non-zero length of C string in @buf | ||
1018 | * Output: | ||
1019 | * On success: NFS service no longer listens on that socket; | ||
1020 | * passed-in buffer filled with a '\n'-terminated C | ||
1021 | * string containing a unique name of the listener; | ||
1022 | * return code is the size in bytes of the string | ||
1023 | * On error: return code is a negative errno value | ||
1024 | * | ||
1025 | * OR | ||
1026 | * | ||
1027 | * Input: | ||
1028 | * buf: C string containing a transport | ||
1029 | * name and an unsigned integer value | ||
1030 | * representing the port to listen on, | ||
1031 | * separated by whitespace | ||
1032 | * size: non-zero length of C string in @buf | ||
1033 | * Output: | ||
1034 | * On success: returns zero; NFS service is started | ||
1035 | * On error: return code is a negative errno value | ||
1036 | * | ||
1037 | * OR | ||
1038 | * | ||
1039 | * Input: | ||
1040 | * buf: C string containing a "-" followed | ||
1041 | * by a transport name and an unsigned | ||
1042 | * integer value representing the port | ||
1043 | * to listen on, separated by whitespace | ||
1044 | * size: non-zero length of C string in @buf | ||
1045 | * Output: | ||
1046 | * On success: returns zero; NFS service no longer listens | ||
1047 | * on that transport | ||
1048 | * On error: return code is a negative errno value | ||
1049 | */ | ||
690 | static ssize_t write_ports(struct file *file, char *buf, size_t size) | 1050 | static ssize_t write_ports(struct file *file, char *buf, size_t size) |
691 | { | 1051 | { |
692 | ssize_t rv; | 1052 | ssize_t rv; |
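write_ports() accepts either an already-bound socket passed by file descriptor or a "transport port" pair. A sketch of the second form, assuming the file is exposed as /proc/fs/nfsd/portlist (the name is not part of this hunk):

/* Sketch only: ask nfsd to add a TCP listener on port 2049; writing
 * "-tcp 2049" would remove it again, and writing a plain decimal number
 * instead hands over a bound but unconnected socket by its descriptor. */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	static const char req[] = "tcp 2049\n";
	int fd = open("/proc/fs/nfsd/portlist", O_WRONLY);

	if (fd < 0)
		return 1;
	if (write(fd, req, strlen(req)) < 0) {
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}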
@@ -700,6 +1060,27 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size) | |||
700 | 1060 | ||
701 | int nfsd_max_blksize; | 1061 | int nfsd_max_blksize; |
702 | 1062 | ||
1063 | /** | ||
1064 | * write_maxblksize - Set or report the current NFS blksize | ||
1065 | * | ||
1066 | * Input: | ||
1067 | * buf: ignored | ||
1068 | * size: zero | ||
1069 | * | ||
1070 | * OR | ||
1071 | * | ||
1072 | * Input: | ||
1073 | * buf: C string containing an unsigned | ||
1074 | * integer value representing the new | ||
1075 | * NFS blksize | ||
1076 | * size: non-zero length of C string in @buf | ||
1077 | * Output: | ||
1078 | * On success: passed-in buffer filled with '\n'-terminated C string | ||
1079 | * containing numeric value of the current NFS blksize | ||
1080 | * setting; | ||
1081 | * return code is the size in bytes of the string | ||
1082 | * On error: return code is zero or a negative errno value | ||
1083 | */ | ||
703 | static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) | 1084 | static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) |
704 | { | 1085 | { |
705 | char *mesg = buf; | 1086 | char *mesg = buf; |
@@ -752,6 +1133,27 @@ static ssize_t __write_leasetime(struct file *file, char *buf, size_t size) | |||
752 | return strlen(buf); | 1133 | return strlen(buf); |
753 | } | 1134 | } |
754 | 1135 | ||
1136 | /** | ||
1137 | * write_leasetime - Set or report the current NFSv4 lease time | ||
1138 | * | ||
1139 | * Input: | ||
1140 | * buf: ignored | ||
1141 | * size: zero | ||
1142 | * | ||
1143 | * OR | ||
1144 | * | ||
1145 | * Input: | ||
1146 | * buf: C string containing an unsigned | ||
1147 | * integer value representing the new | ||
1148 | * NFSv4 lease expiry time | ||
1149 | * size: non-zero length of C string in @buf | ||
1150 | * Output: | ||
1151 | * On success: passed-in buffer filled with '\n'-terminated C | ||
1152 | * string containing unsigned integer value of the | ||
1153 | * current lease expiry time; | ||
1154 | * return code is the size in bytes of the string | ||
1155 | * On error: return code is zero or a negative errno value | ||
1156 | */ | ||
755 | static ssize_t write_leasetime(struct file *file, char *buf, size_t size) | 1157 | static ssize_t write_leasetime(struct file *file, char *buf, size_t size) |
756 | { | 1158 | { |
757 | ssize_t rv; | 1159 | ssize_t rv; |
@@ -788,6 +1190,27 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size) | |||
788 | return strlen(buf); | 1190 | return strlen(buf); |
789 | } | 1191 | } |
790 | 1192 | ||
1193 | /** | ||
1194 | * write_recoverydir - Set or report the pathname of the recovery directory | ||
1195 | * | ||
1196 | * Input: | ||
1197 | * buf: ignored | ||
1198 | * size: zero | ||
1199 | * | ||
1200 | * OR | ||
1201 | * | ||
1202 | * Input: | ||
1203 | * buf: C string containing the pathname | ||
1204 | * of the directory on a local file | ||
1205 | * system containing permanent NFSv4 | ||
1206 | * recovery data | ||
1207 | * size: non-zero length of C string in @buf | ||
1208 | * Output: | ||
1209 | * On success: passed-in buffer filled with '\n'-terminated C string | ||
1210 | * containing the current recovery pathname setting; | ||
1211 | * return code is the size in bytes of the string | ||
1212 | * On error: return code is zero or a negative errno value | ||
1213 | */ | ||
791 | static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) | 1214 | static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) |
792 | { | 1215 | { |
793 | ssize_t rv; | 1216 | ssize_t rv; |
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index f0da7d9c3a92..9f1ca17293d3 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c | |||
@@ -258,14 +258,32 @@ out: | |||
258 | return error; | 258 | return error; |
259 | } | 259 | } |
260 | 260 | ||
261 | /* | 261 | /** |
262 | * Perform sanity checks on the dentry in a client's file handle. | 262 | * fh_verify - filehandle lookup and access checking |
263 | * @rqstp: pointer to current rpc request | ||
264 | * @fhp: filehandle to be verified | ||
265 | * @type: expected type of object pointed to by filehandle | ||
266 | * @access: type of access needed to object | ||
267 | * | ||
268 | * Look up a dentry from the on-the-wire filehandle, check the client's | ||
269 | * access to the export, and set the current task's credentials. | ||
270 | * | ||
271 | * Regardless of success or failure of fh_verify(), fh_put() should be | ||
272 | * called on @fhp when the caller is finished with the filehandle. | ||
263 | * | 273 | * |
264 | * Note that the file handle dentry may need to be freed even after | 274 | * fh_verify() may be called multiple times on a given filehandle, for |
265 | * an error return. | 275 | * example, when processing an NFSv4 compound. The first call will look |
276 | * up a dentry using the on-the-wire filehandle. Subsequent calls will | ||
277 | * skip the lookup and just perform the other checks and possibly change | ||
278 | * the current task's credentials. | ||
266 | * | 279 | * |
267 | * This is only called at the start of an nfsproc call, so fhp points to | 280 | * @type specifies the type of object expected using one of the S_IF* |
268 | * a svc_fh which is all 0 except for the over-the-wire file handle. | 281 | * constants defined in include/linux/stat.h. The caller may use zero |
282 | * to indicate that it doesn't care, or a negative integer to indicate | ||
283 | * that it expects something not of the given type. | ||
284 | * | ||
285 | * @access is formed from the NFSD_MAY_* constants defined in | ||
286 | * include/linux/nfsd/nfsd.h. | ||
269 | */ | 287 | */ |
270 | __be32 | 288 | __be32 |
271 | fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) | 289 | fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) |
@@ -466,6 +484,8 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, | |||
466 | goto retry; | 484 | goto retry; |
467 | break; | 485 | break; |
468 | } | 486 | } |
487 | } else if (exp->ex_flags & NFSEXP_FSID) { | ||
488 | fsid_type = FSID_NUM; | ||
469 | } else if (exp->ex_uuid) { | 489 | } else if (exp->ex_uuid) { |
470 | if (fhp->fh_maxsize >= 64) { | 490 | if (fhp->fh_maxsize >= 64) { |
471 | if (root_export) | 491 | if (root_export) |
@@ -478,9 +498,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, | |||
478 | else | 498 | else |
479 | fsid_type = FSID_UUID4_INUM; | 499 | fsid_type = FSID_UUID4_INUM; |
480 | } | 500 | } |
481 | } else if (exp->ex_flags & NFSEXP_FSID) | 501 | } else if (!old_valid_dev(ex_dev)) |
482 | fsid_type = FSID_NUM; | ||
483 | else if (!old_valid_dev(ex_dev)) | ||
484 | /* for newer device numbers, we must use a newer fsid format */ | 502 | /* for newer device numbers, we must use a newer fsid format */ |
485 | fsid_type = FSID_ENCODE_DEV; | 503 | fsid_type = FSID_ENCODE_DEV; |
486 | else | 504 | else |
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 5cffeca7acef..6f7f26351227 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c | |||
@@ -622,6 +622,7 @@ nfserrno (int errno) | |||
622 | { nfserr_badname, -ESRCH }, | 622 | { nfserr_badname, -ESRCH }, |
623 | { nfserr_io, -ETXTBSY }, | 623 | { nfserr_io, -ETXTBSY }, |
624 | { nfserr_notsupp, -EOPNOTSUPP }, | 624 | { nfserr_notsupp, -EOPNOTSUPP }, |
625 | { nfserr_toosmall, -ETOOSMALL }, | ||
625 | }; | 626 | }; |
626 | int i; | 627 | int i; |
627 | 628 | ||
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index d1c5f787b365..6e50aaa56ca2 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c | |||
@@ -764,7 +764,6 @@ static inline int nfsd_dosync(struct file *filp, struct dentry *dp, | |||
764 | 764 | ||
765 | return err; | 765 | return err; |
766 | } | 766 | } |
767 | |||
768 | 767 | ||
769 | static int | 768 | static int |
770 | nfsd_sync(struct file *filp) | 769 | nfsd_sync(struct file *filp) |
@@ -1211,7 +1210,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, | |||
1211 | dirp = dentry->d_inode; | 1210 | dirp = dentry->d_inode; |
1212 | 1211 | ||
1213 | err = nfserr_notdir; | 1212 | err = nfserr_notdir; |
1214 | if(!dirp->i_op || !dirp->i_op->lookup) | 1213 | if (!dirp->i_op->lookup) |
1215 | goto out; | 1214 | goto out; |
1216 | /* | 1215 | /* |
1217 | * Check whether the response file handle has been verified yet. | 1216 | * Check whether the response file handle has been verified yet. |
@@ -1347,7 +1346,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, | |||
1347 | /* Get all the sanity checks out of the way before | 1346 | /* Get all the sanity checks out of the way before |
1348 | * we lock the parent. */ | 1347 | * we lock the parent. */ |
1349 | err = nfserr_notdir; | 1348 | err = nfserr_notdir; |
1350 | if(!dirp->i_op || !dirp->i_op->lookup) | 1349 | if (!dirp->i_op->lookup) |
1351 | goto out; | 1350 | goto out; |
1352 | fh_lock_nested(fhp, I_MUTEX_PARENT); | 1351 | fh_lock_nested(fhp, I_MUTEX_PARENT); |
1353 | 1352 | ||
@@ -1482,7 +1481,7 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp) | |||
1482 | inode = dentry->d_inode; | 1481 | inode = dentry->d_inode; |
1483 | 1482 | ||
1484 | err = nfserr_inval; | 1483 | err = nfserr_inval; |
1485 | if (!inode->i_op || !inode->i_op->readlink) | 1484 | if (!inode->i_op->readlink) |
1486 | goto out; | 1485 | goto out; |
1487 | 1486 | ||
1488 | touch_atime(fhp->fh_export->ex_path.mnt, dentry); | 1487 | touch_atime(fhp->fh_export->ex_path.mnt, dentry); |
@@ -2162,7 +2161,7 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl) | |||
2162 | size_t size; | 2161 | size_t size; |
2163 | int error; | 2162 | int error; |
2164 | 2163 | ||
2165 | if (!IS_POSIXACL(inode) || !inode->i_op || | 2164 | if (!IS_POSIXACL(inode) || |
2166 | !inode->i_op->setxattr || !inode->i_op->removexattr) | 2165 | !inode->i_op->setxattr || !inode->i_op->removexattr) |
2167 | return -EOPNOTSUPP; | 2166 | return -EOPNOTSUPP; |
2168 | switch(type) { | 2167 | switch(type) { |
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 400f8064a548..81b8644b0136 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c | |||
@@ -704,7 +704,7 @@ fput_and_out: | |||
704 | return ret; | 704 | return ret; |
705 | } | 705 | } |
706 | 706 | ||
707 | asmlinkage long sys_inotify_rm_watch(int fd, u32 wd) | 707 | asmlinkage long sys_inotify_rm_watch(int fd, __s32 wd) |
708 | { | 708 | { |
709 | struct file *filp; | 709 | struct file *filp; |
710 | struct inotify_device *dev; | 710 | struct inotify_device *dev; |
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index e9da092e2772..86bef156cf0a 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c | |||
@@ -1406,9 +1406,6 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi) | |||
1406 | ni->allocated_size = sle64_to_cpu( | 1406 | ni->allocated_size = sle64_to_cpu( |
1407 | a->data.non_resident.allocated_size); | 1407 | a->data.non_resident.allocated_size); |
1408 | } | 1408 | } |
1409 | /* Setup the operations for this attribute inode. */ | ||
1410 | vi->i_op = NULL; | ||
1411 | vi->i_fop = NULL; | ||
1412 | if (NInoMstProtected(ni)) | 1409 | if (NInoMstProtected(ni)) |
1413 | vi->i_mapping->a_ops = &ntfs_mst_aops; | 1410 | vi->i_mapping->a_ops = &ntfs_mst_aops; |
1414 | else | 1411 | else |
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index 589dcdfdfe3c..01596079dd63 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile | |||
@@ -12,6 +12,7 @@ obj-$(CONFIG_OCFS2_FS_USERSPACE_CLUSTER) += ocfs2_stack_user.o | |||
12 | ocfs2-objs := \ | 12 | ocfs2-objs := \ |
13 | alloc.o \ | 13 | alloc.o \ |
14 | aops.o \ | 14 | aops.o \ |
15 | blockcheck.o \ | ||
15 | buffer_head_io.o \ | 16 | buffer_head_io.o \ |
16 | dcache.o \ | 17 | dcache.o \ |
17 | dir.o \ | 18 | dir.o \ |
@@ -35,8 +36,14 @@ ocfs2-objs := \ | |||
35 | sysfile.o \ | 36 | sysfile.o \ |
36 | uptodate.o \ | 37 | uptodate.o \ |
37 | ver.o \ | 38 | ver.o \ |
39 | quota_local.o \ | ||
40 | quota_global.o \ | ||
38 | xattr.o | 41 | xattr.o |
39 | 42 | ||
43 | ifeq ($(CONFIG_OCFS2_FS_POSIX_ACL),y) | ||
44 | ocfs2-objs += acl.o | ||
45 | endif | ||
46 | |||
40 | ocfs2_stackglue-objs := stackglue.o | 47 | ocfs2_stackglue-objs := stackglue.o |
41 | ocfs2_stack_o2cb-objs := stack_o2cb.o | 48 | ocfs2_stack_o2cb-objs := stack_o2cb.o |
42 | ocfs2_stack_user-objs := stack_user.o | 49 | ocfs2_stack_user-objs := stack_user.o |
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c new file mode 100644 index 000000000000..12dfb44c22e5 --- /dev/null +++ b/fs/ocfs2/acl.c | |||
@@ -0,0 +1,479 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * acl.c | ||
5 | * | ||
6 | * Copyright (C) 2004, 2008 Oracle. All rights reserved. | ||
7 | * | ||
8 | * CREDITS: | ||
9 | * Lots of code in this file is copied from linux/fs/ext3/acl.c. | ||
10 | * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or | ||
13 | * modify it under the terms of the GNU General Public | ||
14 | * License version 2 as published by the Free Software Foundation. | ||
15 | * | ||
16 | * This program is distributed in the hope that it will be useful, | ||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
19 | * General Public License for more details. | ||
20 | */ | ||
21 | |||
22 | #include <linux/init.h> | ||
23 | #include <linux/module.h> | ||
24 | #include <linux/string.h> | ||
25 | |||
26 | #define MLOG_MASK_PREFIX ML_INODE | ||
27 | #include <cluster/masklog.h> | ||
28 | |||
29 | #include "ocfs2.h" | ||
30 | #include "alloc.h" | ||
31 | #include "dlmglue.h" | ||
32 | #include "file.h" | ||
33 | #include "ocfs2_fs.h" | ||
34 | |||
35 | #include "xattr.h" | ||
36 | #include "acl.h" | ||
37 | |||
38 | /* | ||
39 | * Convert from xattr value to acl struct. | ||
40 | */ | ||
41 | static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size) | ||
42 | { | ||
43 | int n, count; | ||
44 | struct posix_acl *acl; | ||
45 | |||
46 | if (!value) | ||
47 | return NULL; | ||
48 | if (size < sizeof(struct posix_acl_entry)) | ||
49 | return ERR_PTR(-EINVAL); | ||
50 | |||
51 | count = size / sizeof(struct posix_acl_entry); | ||
52 | if (count < 0) | ||
53 | return ERR_PTR(-EINVAL); | ||
54 | if (count == 0) | ||
55 | return NULL; | ||
56 | |||
57 | acl = posix_acl_alloc(count, GFP_NOFS); | ||
58 | if (!acl) | ||
59 | return ERR_PTR(-ENOMEM); | ||
60 | for (n = 0; n < count; n++) { | ||
61 | struct ocfs2_acl_entry *entry = | ||
62 | (struct ocfs2_acl_entry *)value; | ||
63 | |||
64 | acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); | ||
65 | acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); | ||
66 | acl->a_entries[n].e_id = le32_to_cpu(entry->e_id); | ||
67 | value += sizeof(struct posix_acl_entry); | ||
68 | |||
69 | } | ||
70 | return acl; | ||
71 | } | ||
72 | |||
73 | /* | ||
74 | * Convert acl struct to xattr value. | ||
75 | */ | ||
76 | static void *ocfs2_acl_to_xattr(const struct posix_acl *acl, size_t *size) | ||
77 | { | ||
78 | struct ocfs2_acl_entry *entry = NULL; | ||
79 | char *ocfs2_acl; | ||
80 | size_t n; | ||
81 | |||
82 | *size = acl->a_count * sizeof(struct posix_acl_entry); | ||
83 | |||
84 | ocfs2_acl = kmalloc(*size, GFP_NOFS); | ||
85 | if (!ocfs2_acl) | ||
86 | return ERR_PTR(-ENOMEM); | ||
87 | |||
88 | entry = (struct ocfs2_acl_entry *)ocfs2_acl; | ||
89 | for (n = 0; n < acl->a_count; n++, entry++) { | ||
90 | entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); | ||
91 | entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); | ||
92 | entry->e_id = cpu_to_le32(acl->a_entries[n].e_id); | ||
93 | } | ||
94 | return ocfs2_acl; | ||
95 | } | ||
96 | |||
97 | static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode, | ||
98 | int type, | ||
99 | struct buffer_head *di_bh) | ||
100 | { | ||
101 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
102 | int name_index; | ||
103 | char *value = NULL; | ||
104 | struct posix_acl *acl; | ||
105 | int retval; | ||
106 | |||
107 | if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) | ||
108 | return NULL; | ||
109 | |||
110 | switch (type) { | ||
111 | case ACL_TYPE_ACCESS: | ||
112 | name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; | ||
113 | break; | ||
114 | case ACL_TYPE_DEFAULT: | ||
115 | name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT; | ||
116 | break; | ||
117 | default: | ||
118 | return ERR_PTR(-EINVAL); | ||
119 | } | ||
120 | |||
121 | retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index, "", NULL, 0); | ||
122 | if (retval > 0) { | ||
123 | value = kmalloc(retval, GFP_NOFS); | ||
124 | if (!value) | ||
125 | return ERR_PTR(-ENOMEM); | ||
126 | retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index, | ||
127 | "", value, retval); | ||
128 | } | ||
129 | |||
130 | if (retval > 0) | ||
131 | acl = ocfs2_acl_from_xattr(value, retval); | ||
132 | else if (retval == -ENODATA || retval == 0) | ||
133 | acl = NULL; | ||
134 | else | ||
135 | acl = ERR_PTR(retval); | ||
136 | |||
137 | kfree(value); | ||
138 | |||
139 | return acl; | ||
140 | } | ||
141 | |||
142 | |||
143 | /* | ||
144 | * Get posix acl. | ||
145 | */ | ||
146 | static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type) | ||
147 | { | ||
148 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
149 | struct buffer_head *di_bh = NULL; | ||
150 | struct posix_acl *acl; | ||
151 | int ret; | ||
152 | |||
153 | if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) | ||
154 | return NULL; | ||
155 | |||
156 | ret = ocfs2_inode_lock(inode, &di_bh, 0); | ||
157 | if (ret < 0) { | ||
158 | mlog_errno(ret); | ||
159 | acl = ERR_PTR(ret); | ||
160 | return acl; | ||
161 | } | ||
162 | |||
163 | acl = ocfs2_get_acl_nolock(inode, type, di_bh); | ||
164 | |||
165 | ocfs2_inode_unlock(inode, 0); | ||
166 | |||
167 | brelse(di_bh); | ||
168 | |||
169 | return acl; | ||
170 | } | ||
171 | |||
172 | /* | ||
173 | * Set the access or default ACL of an inode. | ||
174 | */ | ||
175 | static int ocfs2_set_acl(handle_t *handle, | ||
176 | struct inode *inode, | ||
177 | struct buffer_head *di_bh, | ||
178 | int type, | ||
179 | struct posix_acl *acl, | ||
180 | struct ocfs2_alloc_context *meta_ac, | ||
181 | struct ocfs2_alloc_context *data_ac) | ||
182 | { | ||
183 | int name_index; | ||
184 | void *value = NULL; | ||
185 | size_t size = 0; | ||
186 | int ret; | ||
187 | |||
188 | if (S_ISLNK(inode->i_mode)) | ||
189 | return -EOPNOTSUPP; | ||
190 | |||
191 | switch (type) { | ||
192 | case ACL_TYPE_ACCESS: | ||
193 | name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; | ||
194 | if (acl) { | ||
195 | mode_t mode = inode->i_mode; | ||
196 | ret = posix_acl_equiv_mode(acl, &mode); | ||
197 | if (ret < 0) | ||
198 | return ret; | ||
199 | else { | ||
200 | inode->i_mode = mode; | ||
201 | if (ret == 0) | ||
202 | acl = NULL; | ||
203 | } | ||
204 | } | ||
205 | break; | ||
206 | case ACL_TYPE_DEFAULT: | ||
207 | name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT; | ||
208 | if (!S_ISDIR(inode->i_mode)) | ||
209 | return acl ? -EACCES : 0; | ||
210 | break; | ||
211 | default: | ||
212 | return -EINVAL; | ||
213 | } | ||
214 | |||
215 | if (acl) { | ||
216 | value = ocfs2_acl_to_xattr(acl, &size); | ||
217 | if (IS_ERR(value)) | ||
218 | return (int)PTR_ERR(value); | ||
219 | } | ||
220 | |||
221 | if (handle) | ||
222 | ret = ocfs2_xattr_set_handle(handle, inode, di_bh, name_index, | ||
223 | "", value, size, 0, | ||
224 | meta_ac, data_ac); | ||
225 | else | ||
226 | ret = ocfs2_xattr_set(inode, name_index, "", value, size, 0); | ||
227 | |||
228 | kfree(value); | ||
229 | |||
230 | return ret; | ||
231 | } | ||
232 | |||
233 | int ocfs2_check_acl(struct inode *inode, int mask) | ||
234 | { | ||
235 | struct posix_acl *acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS); | ||
236 | |||
237 | if (IS_ERR(acl)) | ||
238 | return PTR_ERR(acl); | ||
239 | if (acl) { | ||
240 | int ret = posix_acl_permission(inode, acl, mask); | ||
241 | posix_acl_release(acl); | ||
242 | return ret; | ||
243 | } | ||
244 | |||
245 | return -EAGAIN; | ||
246 | } | ||
247 | |||
248 | int ocfs2_acl_chmod(struct inode *inode) | ||
249 | { | ||
250 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
251 | struct posix_acl *acl, *clone; | ||
252 | int ret; | ||
253 | |||
254 | if (S_ISLNK(inode->i_mode)) | ||
255 | return -EOPNOTSUPP; | ||
256 | |||
257 | if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) | ||
258 | return 0; | ||
259 | |||
260 | acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS); | ||
261 | if (IS_ERR(acl) || !acl) | ||
262 | return PTR_ERR(acl); | ||
263 | clone = posix_acl_clone(acl, GFP_KERNEL); | ||
264 | posix_acl_release(acl); | ||
265 | if (!clone) | ||
266 | return -ENOMEM; | ||
267 | ret = posix_acl_chmod_masq(clone, inode->i_mode); | ||
268 | if (!ret) | ||
269 | ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS, | ||
270 | clone, NULL, NULL); | ||
271 | posix_acl_release(clone); | ||
272 | return ret; | ||
273 | } | ||
274 | |||
275 | /* | ||
276 | * Initialize the ACLs of a new inode. If the parent directory has a | ||
277 | * default ACL, it is cloned to the new inode. Called from ocfs2_mknod. | ||
278 | */ | ||
279 | int ocfs2_init_acl(handle_t *handle, | ||
280 | struct inode *inode, | ||
281 | struct inode *dir, | ||
282 | struct buffer_head *di_bh, | ||
283 | struct buffer_head *dir_bh, | ||
284 | struct ocfs2_alloc_context *meta_ac, | ||
285 | struct ocfs2_alloc_context *data_ac) | ||
286 | { | ||
287 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
288 | struct posix_acl *acl = NULL; | ||
289 | int ret = 0; | ||
290 | |||
291 | if (!S_ISLNK(inode->i_mode)) { | ||
292 | if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { | ||
293 | acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT, | ||
294 | dir_bh); | ||
295 | if (IS_ERR(acl)) | ||
296 | return PTR_ERR(acl); | ||
297 | } | ||
298 | if (!acl) | ||
299 | inode->i_mode &= ~current->fs->umask; | ||
300 | } | ||
301 | if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { | ||
302 | struct posix_acl *clone; | ||
303 | mode_t mode; | ||
304 | |||
305 | if (S_ISDIR(inode->i_mode)) { | ||
306 | ret = ocfs2_set_acl(handle, inode, di_bh, | ||
307 | ACL_TYPE_DEFAULT, acl, | ||
308 | meta_ac, data_ac); | ||
309 | if (ret) | ||
310 | goto cleanup; | ||
311 | } | ||
312 | clone = posix_acl_clone(acl, GFP_NOFS); | ||
313 | ret = -ENOMEM; | ||
314 | if (!clone) | ||
315 | goto cleanup; | ||
316 | |||
317 | mode = inode->i_mode; | ||
318 | ret = posix_acl_create_masq(clone, &mode); | ||
319 | if (ret >= 0) { | ||
320 | inode->i_mode = mode; | ||
321 | if (ret > 0) { | ||
322 | ret = ocfs2_set_acl(handle, inode, | ||
323 | di_bh, ACL_TYPE_ACCESS, | ||
324 | clone, meta_ac, data_ac); | ||
325 | } | ||
326 | } | ||
327 | posix_acl_release(clone); | ||
328 | } | ||
329 | cleanup: | ||
330 | posix_acl_release(acl); | ||
331 | return ret; | ||
332 | } | ||
333 | |||
334 | static size_t ocfs2_xattr_list_acl_access(struct inode *inode, | ||
335 | char *list, | ||
336 | size_t list_len, | ||
337 | const char *name, | ||
338 | size_t name_len) | ||
339 | { | ||
340 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
341 | const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); | ||
342 | |||
343 | if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) | ||
344 | return 0; | ||
345 | |||
346 | if (list && size <= list_len) | ||
347 | memcpy(list, POSIX_ACL_XATTR_ACCESS, size); | ||
348 | return size; | ||
349 | } | ||
350 | |||
351 | static size_t ocfs2_xattr_list_acl_default(struct inode *inode, | ||
352 | char *list, | ||
353 | size_t list_len, | ||
354 | const char *name, | ||
355 | size_t name_len) | ||
356 | { | ||
357 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
358 | const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); | ||
359 | |||
360 | if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) | ||
361 | return 0; | ||
362 | |||
363 | if (list && size <= list_len) | ||
364 | memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); | ||
365 | return size; | ||
366 | } | ||
367 | |||
368 | static int ocfs2_xattr_get_acl(struct inode *inode, | ||
369 | int type, | ||
370 | void *buffer, | ||
371 | size_t size) | ||
372 | { | ||
373 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
374 | struct posix_acl *acl; | ||
375 | int ret; | ||
376 | |||
377 | if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) | ||
378 | return -EOPNOTSUPP; | ||
379 | |||
380 | acl = ocfs2_get_acl(inode, type); | ||
381 | if (IS_ERR(acl)) | ||
382 | return PTR_ERR(acl); | ||
383 | if (acl == NULL) | ||
384 | return -ENODATA; | ||
385 | ret = posix_acl_to_xattr(acl, buffer, size); | ||
386 | posix_acl_release(acl); | ||
387 | |||
388 | return ret; | ||
389 | } | ||
390 | |||
391 | static int ocfs2_xattr_get_acl_access(struct inode *inode, | ||
392 | const char *name, | ||
393 | void *buffer, | ||
394 | size_t size) | ||
395 | { | ||
396 | if (strcmp(name, "") != 0) | ||
397 | return -EINVAL; | ||
398 | return ocfs2_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size); | ||
399 | } | ||
400 | |||
401 | static int ocfs2_xattr_get_acl_default(struct inode *inode, | ||
402 | const char *name, | ||
403 | void *buffer, | ||
404 | size_t size) | ||
405 | { | ||
406 | if (strcmp(name, "") != 0) | ||
407 | return -EINVAL; | ||
408 | return ocfs2_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size); | ||
409 | } | ||
410 | |||
411 | static int ocfs2_xattr_set_acl(struct inode *inode, | ||
412 | int type, | ||
413 | const void *value, | ||
414 | size_t size) | ||
415 | { | ||
416 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
417 | struct posix_acl *acl; | ||
418 | int ret = 0; | ||
419 | |||
420 | if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) | ||
421 | return -EOPNOTSUPP; | ||
422 | |||
423 | if (!is_owner_or_cap(inode)) | ||
424 | return -EPERM; | ||
425 | |||
426 | if (value) { | ||
427 | acl = posix_acl_from_xattr(value, size); | ||
428 | if (IS_ERR(acl)) | ||
429 | return PTR_ERR(acl); | ||
430 | else if (acl) { | ||
431 | ret = posix_acl_valid(acl); | ||
432 | if (ret) | ||
433 | goto cleanup; | ||
434 | } | ||
435 | } else | ||
436 | acl = NULL; | ||
437 | |||
438 | ret = ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL); | ||
439 | |||
440 | cleanup: | ||
441 | posix_acl_release(acl); | ||
442 | return ret; | ||
443 | } | ||
444 | |||
445 | static int ocfs2_xattr_set_acl_access(struct inode *inode, | ||
446 | const char *name, | ||
447 | const void *value, | ||
448 | size_t size, | ||
449 | int flags) | ||
450 | { | ||
451 | if (strcmp(name, "") != 0) | ||
452 | return -EINVAL; | ||
453 | return ocfs2_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); | ||
454 | } | ||
455 | |||
456 | static int ocfs2_xattr_set_acl_default(struct inode *inode, | ||
457 | const char *name, | ||
458 | const void *value, | ||
459 | size_t size, | ||
460 | int flags) | ||
461 | { | ||
462 | if (strcmp(name, "") != 0) | ||
463 | return -EINVAL; | ||
464 | return ocfs2_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); | ||
465 | } | ||
466 | |||
467 | struct xattr_handler ocfs2_xattr_acl_access_handler = { | ||
468 | .prefix = POSIX_ACL_XATTR_ACCESS, | ||
469 | .list = ocfs2_xattr_list_acl_access, | ||
470 | .get = ocfs2_xattr_get_acl_access, | ||
471 | .set = ocfs2_xattr_set_acl_access, | ||
472 | }; | ||
473 | |||
474 | struct xattr_handler ocfs2_xattr_acl_default_handler = { | ||
475 | .prefix = POSIX_ACL_XATTR_DEFAULT, | ||
476 | .list = ocfs2_xattr_list_acl_default, | ||
477 | .get = ocfs2_xattr_get_acl_default, | ||
478 | .set = ocfs2_xattr_set_acl_default, | ||
479 | }; | ||
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h new file mode 100644 index 000000000000..8f6389ed4da5 --- /dev/null +++ b/fs/ocfs2/acl.h | |||
@@ -0,0 +1,58 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * acl.h | ||
5 | * | ||
6 | * Copyright (C) 2004, 2008 Oracle. All rights reserved. | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public | ||
10 | * License version 2 as published by the Free Software Foundation. | ||
11 | * | ||
12 | * This program is distributed in the hope that it will be useful, | ||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
15 | * General Public License for more details. | ||
16 | */ | ||
17 | |||
18 | #ifndef OCFS2_ACL_H | ||
19 | #define OCFS2_ACL_H | ||
20 | |||
21 | #include <linux/posix_acl_xattr.h> | ||
22 | |||
23 | struct ocfs2_acl_entry { | ||
24 | __le16 e_tag; | ||
25 | __le16 e_perm; | ||
26 | __le32 e_id; | ||
27 | }; | ||
28 | |||
29 | #ifdef CONFIG_OCFS2_FS_POSIX_ACL | ||
30 | |||
31 | extern int ocfs2_check_acl(struct inode *, int); | ||
32 | extern int ocfs2_acl_chmod(struct inode *); | ||
33 | extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, | ||
34 | struct buffer_head *, struct buffer_head *, | ||
35 | struct ocfs2_alloc_context *, | ||
36 | struct ocfs2_alloc_context *); | ||
37 | |||
38 | #else /* CONFIG_OCFS2_FS_POSIX_ACL*/ | ||
39 | |||
40 | #define ocfs2_check_acl NULL | ||
41 | static inline int ocfs2_acl_chmod(struct inode *inode) | ||
42 | { | ||
43 | return 0; | ||
44 | } | ||
45 | static inline int ocfs2_init_acl(handle_t *handle, | ||
46 | struct inode *inode, | ||
47 | struct inode *dir, | ||
48 | struct buffer_head *di_bh, | ||
49 | struct buffer_head *dir_bh, | ||
50 | struct ocfs2_alloc_context *meta_ac, | ||
51 | struct ocfs2_alloc_context *data_ac) | ||
52 | { | ||
53 | return 0; | ||
54 | } | ||
55 | |||
56 | #endif /* CONFIG_OCFS2_FS_POSIX_ACL*/ | ||
57 | |||
58 | #endif /* OCFS2_ACL_H */ | ||
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 0cc2deb9394c..d861096c9d81 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/slab.h> | 28 | #include <linux/slab.h> |
29 | #include <linux/highmem.h> | 29 | #include <linux/highmem.h> |
30 | #include <linux/swap.h> | 30 | #include <linux/swap.h> |
31 | #include <linux/quotaops.h> | ||
31 | 32 | ||
32 | #define MLOG_MASK_PREFIX ML_DISK_ALLOC | 33 | #define MLOG_MASK_PREFIX ML_DISK_ALLOC |
33 | #include <cluster/masklog.h> | 34 | #include <cluster/masklog.h> |
@@ -36,6 +37,7 @@ | |||
36 | 37 | ||
37 | #include "alloc.h" | 38 | #include "alloc.h" |
38 | #include "aops.h" | 39 | #include "aops.h" |
40 | #include "blockcheck.h" | ||
39 | #include "dlmglue.h" | 41 | #include "dlmglue.h" |
40 | #include "extent_map.h" | 42 | #include "extent_map.h" |
41 | #include "inode.h" | 43 | #include "inode.h" |
@@ -46,6 +48,7 @@ | |||
46 | #include "file.h" | 48 | #include "file.h" |
47 | #include "super.h" | 49 | #include "super.h" |
48 | #include "uptodate.h" | 50 | #include "uptodate.h" |
51 | #include "xattr.h" | ||
49 | 52 | ||
50 | #include "buffer_head_io.h" | 53 | #include "buffer_head_io.h" |
51 | 54 | ||
@@ -187,20 +190,12 @@ static int ocfs2_dinode_insert_check(struct inode *inode, | |||
187 | static int ocfs2_dinode_sanity_check(struct inode *inode, | 190 | static int ocfs2_dinode_sanity_check(struct inode *inode, |
188 | struct ocfs2_extent_tree *et) | 191 | struct ocfs2_extent_tree *et) |
189 | { | 192 | { |
190 | int ret = 0; | 193 | struct ocfs2_dinode *di = et->et_object; |
191 | struct ocfs2_dinode *di; | ||
192 | 194 | ||
193 | BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); | 195 | BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); |
196 | BUG_ON(!OCFS2_IS_VALID_DINODE(di)); | ||
194 | 197 | ||
195 | di = et->et_object; | 198 | return 0; |
196 | if (!OCFS2_IS_VALID_DINODE(di)) { | ||
197 | ret = -EIO; | ||
198 | ocfs2_error(inode->i_sb, | ||
199 | "Inode %llu has invalid path root", | ||
200 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
201 | } | ||
202 | |||
203 | return ret; | ||
204 | } | 199 | } |
205 | 200 | ||
206 | static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et) | 201 | static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et) |
@@ -213,36 +208,33 @@ static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et) | |||
213 | 208 | ||
214 | static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et) | 209 | static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et) |
215 | { | 210 | { |
216 | struct ocfs2_xattr_value_root *xv = et->et_object; | 211 | struct ocfs2_xattr_value_buf *vb = et->et_object; |
217 | 212 | ||
218 | et->et_root_el = &xv->xr_list; | 213 | et->et_root_el = &vb->vb_xv->xr_list; |
219 | } | 214 | } |
220 | 215 | ||
221 | static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et, | 216 | static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et, |
222 | u64 blkno) | 217 | u64 blkno) |
223 | { | 218 | { |
224 | struct ocfs2_xattr_value_root *xv = | 219 | struct ocfs2_xattr_value_buf *vb = et->et_object; |
225 | (struct ocfs2_xattr_value_root *)et->et_object; | ||
226 | 220 | ||
227 | xv->xr_last_eb_blk = cpu_to_le64(blkno); | 221 | vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno); |
228 | } | 222 | } |
229 | 223 | ||
230 | static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et) | 224 | static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et) |
231 | { | 225 | { |
232 | struct ocfs2_xattr_value_root *xv = | 226 | struct ocfs2_xattr_value_buf *vb = et->et_object; |
233 | (struct ocfs2_xattr_value_root *) et->et_object; | ||
234 | 227 | ||
235 | return le64_to_cpu(xv->xr_last_eb_blk); | 228 | return le64_to_cpu(vb->vb_xv->xr_last_eb_blk); |
236 | } | 229 | } |
237 | 230 | ||
238 | static void ocfs2_xattr_value_update_clusters(struct inode *inode, | 231 | static void ocfs2_xattr_value_update_clusters(struct inode *inode, |
239 | struct ocfs2_extent_tree *et, | 232 | struct ocfs2_extent_tree *et, |
240 | u32 clusters) | 233 | u32 clusters) |
241 | { | 234 | { |
242 | struct ocfs2_xattr_value_root *xv = | 235 | struct ocfs2_xattr_value_buf *vb = et->et_object; |
243 | (struct ocfs2_xattr_value_root *)et->et_object; | ||
244 | 236 | ||
245 | le32_add_cpu(&xv->xr_clusters, clusters); | 237 | le32_add_cpu(&vb->vb_xv->xr_clusters, clusters); |
246 | } | 238 | } |
247 | 239 | ||
248 | static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { | 240 | static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { |
@@ -304,11 +296,13 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { | |||
304 | static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, | 296 | static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, |
305 | struct inode *inode, | 297 | struct inode *inode, |
306 | struct buffer_head *bh, | 298 | struct buffer_head *bh, |
299 | ocfs2_journal_access_func access, | ||
307 | void *obj, | 300 | void *obj, |
308 | struct ocfs2_extent_tree_operations *ops) | 301 | struct ocfs2_extent_tree_operations *ops) |
309 | { | 302 | { |
310 | et->et_ops = ops; | 303 | et->et_ops = ops; |
311 | et->et_root_bh = bh; | 304 | et->et_root_bh = bh; |
305 | et->et_root_journal_access = access; | ||
312 | if (!obj) | 306 | if (!obj) |
313 | obj = (void *)bh->b_data; | 307 | obj = (void *)bh->b_data; |
314 | et->et_object = obj; | 308 | et->et_object = obj; |
@@ -324,23 +318,23 @@ void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, | |||
324 | struct inode *inode, | 318 | struct inode *inode, |
325 | struct buffer_head *bh) | 319 | struct buffer_head *bh) |
326 | { | 320 | { |
327 | __ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops); | 321 | __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_di, |
322 | NULL, &ocfs2_dinode_et_ops); | ||
328 | } | 323 | } |
329 | 324 | ||
330 | void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, | 325 | void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, |
331 | struct inode *inode, | 326 | struct inode *inode, |
332 | struct buffer_head *bh) | 327 | struct buffer_head *bh) |
333 | { | 328 | { |
334 | __ocfs2_init_extent_tree(et, inode, bh, NULL, | 329 | __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_xb, |
335 | &ocfs2_xattr_tree_et_ops); | 330 | NULL, &ocfs2_xattr_tree_et_ops); |
336 | } | 331 | } |
337 | 332 | ||
338 | void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, | 333 | void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, |
339 | struct inode *inode, | 334 | struct inode *inode, |
340 | struct buffer_head *bh, | 335 | struct ocfs2_xattr_value_buf *vb) |
341 | struct ocfs2_xattr_value_root *xv) | ||
342 | { | 336 | { |
343 | __ocfs2_init_extent_tree(et, inode, bh, xv, | 337 | __ocfs2_init_extent_tree(et, inode, vb->vb_bh, vb->vb_access, vb, |
344 | &ocfs2_xattr_value_et_ops); | 338 | &ocfs2_xattr_value_et_ops); |
345 | } | 339 | } |
346 | 340 | ||
@@ -362,6 +356,15 @@ static inline void ocfs2_et_update_clusters(struct inode *inode, | |||
362 | et->et_ops->eo_update_clusters(inode, et, clusters); | 356 | et->et_ops->eo_update_clusters(inode, et, clusters); |
363 | } | 357 | } |
364 | 358 | ||
359 | static inline int ocfs2_et_root_journal_access(handle_t *handle, | ||
360 | struct inode *inode, | ||
361 | struct ocfs2_extent_tree *et, | ||
362 | int type) | ||
363 | { | ||
364 | return et->et_root_journal_access(handle, inode, et->et_root_bh, | ||
365 | type); | ||
366 | } | ||
367 | |||
365 | static inline int ocfs2_et_insert_check(struct inode *inode, | 368 | static inline int ocfs2_et_insert_check(struct inode *inode, |
366 | struct ocfs2_extent_tree *et, | 369 | struct ocfs2_extent_tree *et, |
367 | struct ocfs2_extent_rec *rec) | 370 | struct ocfs2_extent_rec *rec) |
@@ -402,12 +405,14 @@ struct ocfs2_path_item { | |||
402 | #define OCFS2_MAX_PATH_DEPTH 5 | 405 | #define OCFS2_MAX_PATH_DEPTH 5 |
403 | 406 | ||
404 | struct ocfs2_path { | 407 | struct ocfs2_path { |
405 | int p_tree_depth; | 408 | int p_tree_depth; |
406 | struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; | 409 | ocfs2_journal_access_func p_root_access; |
410 | struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; | ||
407 | }; | 411 | }; |
408 | 412 | ||
409 | #define path_root_bh(_path) ((_path)->p_node[0].bh) | 413 | #define path_root_bh(_path) ((_path)->p_node[0].bh) |
410 | #define path_root_el(_path) ((_path)->p_node[0].el) | 414 | #define path_root_el(_path) ((_path)->p_node[0].el) |
415 | #define path_root_access(_path)((_path)->p_root_access) | ||
411 | #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh) | 416 | #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh) |
412 | #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) | 417 | #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) |
413 | #define path_num_items(_path) ((_path)->p_tree_depth + 1) | 418 | #define path_num_items(_path) ((_path)->p_tree_depth + 1) |
@@ -440,6 +445,8 @@ static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) | |||
440 | */ | 445 | */ |
441 | if (keep_root) | 446 | if (keep_root) |
442 | depth = le16_to_cpu(path_root_el(path)->l_tree_depth); | 447 | depth = le16_to_cpu(path_root_el(path)->l_tree_depth); |
448 | else | ||
449 | path_root_access(path) = NULL; | ||
443 | 450 | ||
444 | path->p_tree_depth = depth; | 451 | path->p_tree_depth = depth; |
445 | } | 452 | } |
@@ -465,6 +472,7 @@ static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src) | |||
465 | 472 | ||
466 | BUG_ON(path_root_bh(dest) != path_root_bh(src)); | 473 | BUG_ON(path_root_bh(dest) != path_root_bh(src)); |
467 | BUG_ON(path_root_el(dest) != path_root_el(src)); | 474 | BUG_ON(path_root_el(dest) != path_root_el(src)); |
475 | BUG_ON(path_root_access(dest) != path_root_access(src)); | ||
468 | 476 | ||
469 | ocfs2_reinit_path(dest, 1); | 477 | ocfs2_reinit_path(dest, 1); |
470 | 478 | ||
@@ -486,6 +494,7 @@ static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src) | |||
486 | int i; | 494 | int i; |
487 | 495 | ||
488 | BUG_ON(path_root_bh(dest) != path_root_bh(src)); | 496 | BUG_ON(path_root_bh(dest) != path_root_bh(src)); |
497 | BUG_ON(path_root_access(dest) != path_root_access(src)); | ||
489 | 498 | ||
490 | for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { | 499 | for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { |
491 | brelse(dest->p_node[i].bh); | 500 | brelse(dest->p_node[i].bh); |
@@ -521,7 +530,8 @@ static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index, | |||
521 | } | 530 | } |
522 | 531 | ||
523 | static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, | 532 | static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, |
524 | struct ocfs2_extent_list *root_el) | 533 | struct ocfs2_extent_list *root_el, |
534 | ocfs2_journal_access_func access) | ||
525 | { | 535 | { |
526 | struct ocfs2_path *path; | 536 | struct ocfs2_path *path; |
527 | 537 | ||
@@ -533,11 +543,48 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, | |||
533 | get_bh(root_bh); | 543 | get_bh(root_bh); |
534 | path_root_bh(path) = root_bh; | 544 | path_root_bh(path) = root_bh; |
535 | path_root_el(path) = root_el; | 545 | path_root_el(path) = root_el; |
546 | path_root_access(path) = access; | ||
536 | } | 547 | } |
537 | 548 | ||
538 | return path; | 549 | return path; |
539 | } | 550 | } |
540 | 551 | ||
552 | static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path) | ||
553 | { | ||
554 | return ocfs2_new_path(path_root_bh(path), path_root_el(path), | ||
555 | path_root_access(path)); | ||
556 | } | ||
557 | |||
558 | static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et) | ||
559 | { | ||
560 | return ocfs2_new_path(et->et_root_bh, et->et_root_el, | ||
561 | et->et_root_journal_access); | ||
562 | } | ||
563 | |||
564 | /* | ||
565 | * Journal the buffer at depth idx. All idx>0 are extent_blocks, | ||
566 | * otherwise it's the root_access function. | ||
567 | * | ||
568 | * I don't like the way this function's name looks next to | ||
569 | * ocfs2_journal_access_path(), but I don't have a better one. | ||
570 | */ | ||
571 | static int ocfs2_path_bh_journal_access(handle_t *handle, | ||
572 | struct inode *inode, | ||
573 | struct ocfs2_path *path, | ||
574 | int idx) | ||
575 | { | ||
576 | ocfs2_journal_access_func access = path_root_access(path); | ||
577 | |||
578 | if (!access) | ||
579 | access = ocfs2_journal_access; | ||
580 | |||
581 | if (idx) | ||
582 | access = ocfs2_journal_access_eb; | ||
583 | |||
584 | return access(handle, inode, path->p_node[idx].bh, | ||
585 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
586 | } | ||
587 | |||
541 | /* | 588 | /* |
542 | * Convenience function to journal all components in a path. | 589 | * Convenience function to journal all components in a path. |
543 | */ | 590 | */ |
@@ -550,8 +597,7 @@ static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle, | |||
550 | goto out; | 597 | goto out; |
551 | 598 | ||
552 | for(i = 0; i < path_num_items(path); i++) { | 599 | for(i = 0; i < path_num_items(path); i++) { |
553 | ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh, | 600 | ret = ocfs2_path_bh_journal_access(handle, inode, path, i); |
554 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
555 | if (ret < 0) { | 601 | if (ret < 0) { |
556 | mlog_errno(ret); | 602 | mlog_errno(ret); |
557 | goto out; | 603 | goto out; |
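The per-path journal access plumbing above dispatches on depth: index 0 uses the access function stored in the path root (falling back to plain ocfs2_journal_access when none was set), while every deeper node is an extent block and goes through ocfs2_journal_access_eb. A minimal caller sketch within alloc.c, assuming the pre-existing static helpers ocfs2_find_path() and ocfs2_free_path(); inode, di_bh, handle and cpos stand in for the caller's context:

	struct ocfs2_extent_tree et;
	struct ocfs2_path *path;
	int ret;

	ocfs2_init_dinode_extent_tree(&et, inode, di_bh);

	/* The path inherits ocfs2_journal_access_di as its root access fn. */
	path = ocfs2_new_path_from_et(&et);
	if (!path)
		return -ENOMEM;

	ret = ocfs2_find_path(inode, path, cpos);
	if (!ret)
		/* Root bh is journaled with _di, every lower bh with _eb. */
		ret = ocfs2_journal_access_path(inode, handle, path);

	ocfs2_free_path(path);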
@@ -686,6 +732,80 @@ struct ocfs2_merge_ctxt { | |||
686 | int c_split_covers_rec; | 732 | int c_split_covers_rec; |
687 | }; | 733 | }; |
688 | 734 | ||
735 | static int ocfs2_validate_extent_block(struct super_block *sb, | ||
736 | struct buffer_head *bh) | ||
737 | { | ||
738 | int rc; | ||
739 | struct ocfs2_extent_block *eb = | ||
740 | (struct ocfs2_extent_block *)bh->b_data; | ||
741 | |||
742 | mlog(0, "Validating extent block %llu\n", | ||
743 | (unsigned long long)bh->b_blocknr); | ||
744 | |||
745 | BUG_ON(!buffer_uptodate(bh)); | ||
746 | |||
747 | /* | ||
748 | * If the ecc fails, we return the error but otherwise | ||
749 | * leave the filesystem running. We know any error is | ||
750 | * local to this block. | ||
751 | */ | ||
752 | rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check); | ||
753 | if (rc) { | ||
754 | mlog(ML_ERROR, "Checksum failed for extent block %llu\n", | ||
755 | (unsigned long long)bh->b_blocknr); | ||
756 | return rc; | ||
757 | } | ||
758 | |||
759 | /* | ||
760 | * Errors after here are fatal. | ||
761 | */ | ||
762 | |||
763 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
764 | ocfs2_error(sb, | ||
765 | "Extent block #%llu has bad signature %.*s", | ||
766 | (unsigned long long)bh->b_blocknr, 7, | ||
767 | eb->h_signature); | ||
768 | return -EINVAL; | ||
769 | } | ||
770 | |||
771 | if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) { | ||
772 | ocfs2_error(sb, | ||
773 | "Extent block #%llu has an invalid h_blkno " | ||
774 | "of %llu", | ||
775 | (unsigned long long)bh->b_blocknr, | ||
776 | (unsigned long long)le64_to_cpu(eb->h_blkno)); | ||
777 | return -EINVAL; | ||
778 | } | ||
779 | |||
780 | if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) { | ||
781 | ocfs2_error(sb, | ||
782 | "Extent block #%llu has an invalid " | ||
783 | "h_fs_generation of #%u", | ||
784 | (unsigned long long)bh->b_blocknr, | ||
785 | le32_to_cpu(eb->h_fs_generation)); | ||
786 | return -EINVAL; | ||
787 | } | ||
788 | |||
789 | return 0; | ||
790 | } | ||
791 | |||
792 | int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno, | ||
793 | struct buffer_head **bh) | ||
794 | { | ||
795 | int rc; | ||
796 | struct buffer_head *tmp = *bh; | ||
797 | |||
798 | rc = ocfs2_read_block(inode, eb_blkno, &tmp, | ||
799 | ocfs2_validate_extent_block); | ||
800 | |||
801 | /* If ocfs2_read_block() got us a new bh, pass it up. */ | ||
802 | if (!rc && !*bh) | ||
803 | *bh = tmp; | ||
804 | |||
805 | return rc; | ||
806 | } | ||
807 | |||
808 | |||
689 | /* | 809 | /* |
690 | * How many free extents have we got before we need more meta data? | 810 | * How many free extents have we got before we need more meta data? |
691 | */ | 811 | */ |
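The ocfs2_validate_extent_block()/ocfs2_read_extent_block() pair added above centralizes the metadata ECC check and the signature/h_blkno/h_fs_generation sanity checks, which is why the per-caller OCFS2_IS_VALID_EXTENT_BLOCK() checks are dropped throughout the rest of this patch. A minimal caller sketch, with inode and eb_blkno supplied by the caller:

	struct buffer_head *eb_bh = NULL;
	struct ocfs2_extent_block *eb;
	int ret;

	ret = ocfs2_read_extent_block(inode, eb_blkno, &eb_bh);
	if (ret) {
		mlog_errno(ret);
		return ret;
	}

	/* Signature, h_blkno, generation and ECC were verified on read. */
	eb = (struct ocfs2_extent_block *)eb_bh->b_data;
	/* ... use eb->h_list ... */
	brelse(eb_bh);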
@@ -705,8 +825,7 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb, | |||
705 | last_eb_blk = ocfs2_et_get_last_eb_blk(et); | 825 | last_eb_blk = ocfs2_et_get_last_eb_blk(et); |
706 | 826 | ||
707 | if (last_eb_blk) { | 827 | if (last_eb_blk) { |
708 | retval = ocfs2_read_block(inode, last_eb_blk, | 828 | retval = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh); |
709 | &eb_bh); | ||
710 | if (retval < 0) { | 829 | if (retval < 0) { |
711 | mlog_errno(retval); | 830 | mlog_errno(retval); |
712 | goto bail; | 831 | goto bail; |
@@ -768,8 +887,8 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, | |||
768 | } | 887 | } |
769 | ocfs2_set_new_buffer_uptodate(inode, bhs[i]); | 888 | ocfs2_set_new_buffer_uptodate(inode, bhs[i]); |
770 | 889 | ||
771 | status = ocfs2_journal_access(handle, inode, bhs[i], | 890 | status = ocfs2_journal_access_eb(handle, inode, bhs[i], |
772 | OCFS2_JOURNAL_ACCESS_CREATE); | 891 | OCFS2_JOURNAL_ACCESS_CREATE); |
773 | if (status < 0) { | 892 | if (status < 0) { |
774 | mlog_errno(status); | 893 | mlog_errno(status); |
775 | goto bail; | 894 | goto bail; |
@@ -908,15 +1027,12 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, | |||
908 | for(i = 0; i < new_blocks; i++) { | 1027 | for(i = 0; i < new_blocks; i++) { |
909 | bh = new_eb_bhs[i]; | 1028 | bh = new_eb_bhs[i]; |
910 | eb = (struct ocfs2_extent_block *) bh->b_data; | 1029 | eb = (struct ocfs2_extent_block *) bh->b_data; |
911 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | 1030 | /* ocfs2_create_new_meta_bhs() should create it right! */ |
912 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | 1031 | BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb)); |
913 | status = -EIO; | ||
914 | goto bail; | ||
915 | } | ||
916 | eb_el = &eb->h_list; | 1032 | eb_el = &eb->h_list; |
917 | 1033 | ||
918 | status = ocfs2_journal_access(handle, inode, bh, | 1034 | status = ocfs2_journal_access_eb(handle, inode, bh, |
919 | OCFS2_JOURNAL_ACCESS_CREATE); | 1035 | OCFS2_JOURNAL_ACCESS_CREATE); |
920 | if (status < 0) { | 1036 | if (status < 0) { |
921 | mlog_errno(status); | 1037 | mlog_errno(status); |
922 | goto bail; | 1038 | goto bail; |
@@ -955,21 +1071,21 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, | |||
955 | * journal_dirty erroring as it won't unless we've aborted the | 1071 | * journal_dirty erroring as it won't unless we've aborted the |
956 | * handle (in which case we would never be here) so reserving | 1072 | * handle (in which case we would never be here) so reserving |
957 | * the write with journal_access is all we need to do. */ | 1073 | * the write with journal_access is all we need to do. */ |
958 | status = ocfs2_journal_access(handle, inode, *last_eb_bh, | 1074 | status = ocfs2_journal_access_eb(handle, inode, *last_eb_bh, |
959 | OCFS2_JOURNAL_ACCESS_WRITE); | 1075 | OCFS2_JOURNAL_ACCESS_WRITE); |
960 | if (status < 0) { | 1076 | if (status < 0) { |
961 | mlog_errno(status); | 1077 | mlog_errno(status); |
962 | goto bail; | 1078 | goto bail; |
963 | } | 1079 | } |
964 | status = ocfs2_journal_access(handle, inode, et->et_root_bh, | 1080 | status = ocfs2_et_root_journal_access(handle, inode, et, |
965 | OCFS2_JOURNAL_ACCESS_WRITE); | 1081 | OCFS2_JOURNAL_ACCESS_WRITE); |
966 | if (status < 0) { | 1082 | if (status < 0) { |
967 | mlog_errno(status); | 1083 | mlog_errno(status); |
968 | goto bail; | 1084 | goto bail; |
969 | } | 1085 | } |
970 | if (eb_bh) { | 1086 | if (eb_bh) { |
971 | status = ocfs2_journal_access(handle, inode, eb_bh, | 1087 | status = ocfs2_journal_access_eb(handle, inode, eb_bh, |
972 | OCFS2_JOURNAL_ACCESS_WRITE); | 1088 | OCFS2_JOURNAL_ACCESS_WRITE); |
973 | if (status < 0) { | 1089 | if (status < 0) { |
974 | mlog_errno(status); | 1090 | mlog_errno(status); |
975 | goto bail; | 1091 | goto bail; |
@@ -1052,17 +1168,14 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, | |||
1052 | } | 1168 | } |
1053 | 1169 | ||
1054 | eb = (struct ocfs2_extent_block *) new_eb_bh->b_data; | 1170 | eb = (struct ocfs2_extent_block *) new_eb_bh->b_data; |
1055 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | 1171 | /* ocfs2_create_new_meta_bhs() should create it right! */ |
1056 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | 1172 | BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb)); |
1057 | status = -EIO; | ||
1058 | goto bail; | ||
1059 | } | ||
1060 | 1173 | ||
1061 | eb_el = &eb->h_list; | 1174 | eb_el = &eb->h_list; |
1062 | root_el = et->et_root_el; | 1175 | root_el = et->et_root_el; |
1063 | 1176 | ||
1064 | status = ocfs2_journal_access(handle, inode, new_eb_bh, | 1177 | status = ocfs2_journal_access_eb(handle, inode, new_eb_bh, |
1065 | OCFS2_JOURNAL_ACCESS_CREATE); | 1178 | OCFS2_JOURNAL_ACCESS_CREATE); |
1066 | if (status < 0) { | 1179 | if (status < 0) { |
1067 | mlog_errno(status); | 1180 | mlog_errno(status); |
1068 | goto bail; | 1181 | goto bail; |
@@ -1080,8 +1193,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, | |||
1080 | goto bail; | 1193 | goto bail; |
1081 | } | 1194 | } |
1082 | 1195 | ||
1083 | status = ocfs2_journal_access(handle, inode, et->et_root_bh, | 1196 | status = ocfs2_et_root_journal_access(handle, inode, et, |
1084 | OCFS2_JOURNAL_ACCESS_WRITE); | 1197 | OCFS2_JOURNAL_ACCESS_WRITE); |
1085 | if (status < 0) { | 1198 | if (status < 0) { |
1086 | mlog_errno(status); | 1199 | mlog_errno(status); |
1087 | goto bail; | 1200 | goto bail; |
@@ -1176,18 +1289,13 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb, | |||
1176 | brelse(bh); | 1289 | brelse(bh); |
1177 | bh = NULL; | 1290 | bh = NULL; |
1178 | 1291 | ||
1179 | status = ocfs2_read_block(inode, blkno, &bh); | 1292 | status = ocfs2_read_extent_block(inode, blkno, &bh); |
1180 | if (status < 0) { | 1293 | if (status < 0) { |
1181 | mlog_errno(status); | 1294 | mlog_errno(status); |
1182 | goto bail; | 1295 | goto bail; |
1183 | } | 1296 | } |
1184 | 1297 | ||
1185 | eb = (struct ocfs2_extent_block *) bh->b_data; | 1298 | eb = (struct ocfs2_extent_block *) bh->b_data; |
1186 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
1187 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
1188 | status = -EIO; | ||
1189 | goto bail; | ||
1190 | } | ||
1191 | el = &eb->h_list; | 1299 | el = &eb->h_list; |
1192 | 1300 | ||
1193 | if (le16_to_cpu(el->l_next_free_rec) < | 1301 | if (le16_to_cpu(el->l_next_free_rec) < |
@@ -1540,7 +1648,7 @@ static int __ocfs2_find_path(struct inode *inode, | |||
1540 | 1648 | ||
1541 | brelse(bh); | 1649 | brelse(bh); |
1542 | bh = NULL; | 1650 | bh = NULL; |
1543 | ret = ocfs2_read_block(inode, blkno, &bh); | 1651 | ret = ocfs2_read_extent_block(inode, blkno, &bh); |
1544 | if (ret) { | 1652 | if (ret) { |
1545 | mlog_errno(ret); | 1653 | mlog_errno(ret); |
1546 | goto out; | 1654 | goto out; |
@@ -1548,11 +1656,6 @@ static int __ocfs2_find_path(struct inode *inode, | |||
1548 | 1656 | ||
1549 | eb = (struct ocfs2_extent_block *) bh->b_data; | 1657 | eb = (struct ocfs2_extent_block *) bh->b_data; |
1550 | el = &eb->h_list; | 1658 | el = &eb->h_list; |
1551 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
1552 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
1553 | ret = -EIO; | ||
1554 | goto out; | ||
1555 | } | ||
1556 | 1659 | ||
1557 | if (le16_to_cpu(el->l_next_free_rec) > | 1660 | if (le16_to_cpu(el->l_next_free_rec) > |
1558 | le16_to_cpu(el->l_count)) { | 1661 | le16_to_cpu(el->l_count)) { |
@@ -1860,25 +1963,23 @@ static int ocfs2_rotate_subtree_right(struct inode *inode, | |||
1860 | root_bh = left_path->p_node[subtree_index].bh; | 1963 | root_bh = left_path->p_node[subtree_index].bh; |
1861 | BUG_ON(root_bh != right_path->p_node[subtree_index].bh); | 1964 | BUG_ON(root_bh != right_path->p_node[subtree_index].bh); |
1862 | 1965 | ||
1863 | ret = ocfs2_journal_access(handle, inode, root_bh, | 1966 | ret = ocfs2_path_bh_journal_access(handle, inode, right_path, |
1864 | OCFS2_JOURNAL_ACCESS_WRITE); | 1967 | subtree_index); |
1865 | if (ret) { | 1968 | if (ret) { |
1866 | mlog_errno(ret); | 1969 | mlog_errno(ret); |
1867 | goto out; | 1970 | goto out; |
1868 | } | 1971 | } |
1869 | 1972 | ||
1870 | for(i = subtree_index + 1; i < path_num_items(right_path); i++) { | 1973 | for(i = subtree_index + 1; i < path_num_items(right_path); i++) { |
1871 | ret = ocfs2_journal_access(handle, inode, | 1974 | ret = ocfs2_path_bh_journal_access(handle, inode, |
1872 | right_path->p_node[i].bh, | 1975 | right_path, i); |
1873 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1874 | if (ret) { | 1976 | if (ret) { |
1875 | mlog_errno(ret); | 1977 | mlog_errno(ret); |
1876 | goto out; | 1978 | goto out; |
1877 | } | 1979 | } |
1878 | 1980 | ||
1879 | ret = ocfs2_journal_access(handle, inode, | 1981 | ret = ocfs2_path_bh_journal_access(handle, inode, |
1880 | left_path->p_node[i].bh, | 1982 | left_path, i); |
1881 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1882 | if (ret) { | 1983 | if (ret) { |
1883 | mlog_errno(ret); | 1984 | mlog_errno(ret); |
1884 | goto out; | 1985 | goto out; |
@@ -2102,8 +2203,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode, | |||
2102 | 2203 | ||
2103 | *ret_left_path = NULL; | 2204 | *ret_left_path = NULL; |
2104 | 2205 | ||
2105 | left_path = ocfs2_new_path(path_root_bh(right_path), | 2206 | left_path = ocfs2_new_path_from_path(right_path); |
2106 | path_root_el(right_path)); | ||
2107 | if (!left_path) { | 2207 | if (!left_path) { |
2108 | ret = -ENOMEM; | 2208 | ret = -ENOMEM; |
2109 | mlog_errno(ret); | 2209 | mlog_errno(ret); |
@@ -2398,9 +2498,9 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, | |||
2398 | return -EAGAIN; | 2498 | return -EAGAIN; |
2399 | 2499 | ||
2400 | if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) { | 2500 | if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) { |
2401 | ret = ocfs2_journal_access(handle, inode, | 2501 | ret = ocfs2_journal_access_eb(handle, inode, |
2402 | path_leaf_bh(right_path), | 2502 | path_leaf_bh(right_path), |
2403 | OCFS2_JOURNAL_ACCESS_WRITE); | 2503 | OCFS2_JOURNAL_ACCESS_WRITE); |
2404 | if (ret) { | 2504 | if (ret) { |
2405 | mlog_errno(ret); | 2505 | mlog_errno(ret); |
2406 | goto out; | 2506 | goto out; |
@@ -2417,8 +2517,8 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, | |||
2417 | * We have to update i_last_eb_blk during the meta | 2517 | * We have to update i_last_eb_blk during the meta |
2418 | * data delete. | 2518 | * data delete. |
2419 | */ | 2519 | */ |
2420 | ret = ocfs2_journal_access(handle, inode, et_root_bh, | 2520 | ret = ocfs2_et_root_journal_access(handle, inode, et, |
2421 | OCFS2_JOURNAL_ACCESS_WRITE); | 2521 | OCFS2_JOURNAL_ACCESS_WRITE); |
2422 | if (ret) { | 2522 | if (ret) { |
2423 | mlog_errno(ret); | 2523 | mlog_errno(ret); |
2424 | goto out; | 2524 | goto out; |
@@ -2433,25 +2533,23 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, | |||
2433 | */ | 2533 | */ |
2434 | BUG_ON(right_has_empty && !del_right_subtree); | 2534 | BUG_ON(right_has_empty && !del_right_subtree); |
2435 | 2535 | ||
2436 | ret = ocfs2_journal_access(handle, inode, root_bh, | 2536 | ret = ocfs2_path_bh_journal_access(handle, inode, right_path, |
2437 | OCFS2_JOURNAL_ACCESS_WRITE); | 2537 | subtree_index); |
2438 | if (ret) { | 2538 | if (ret) { |
2439 | mlog_errno(ret); | 2539 | mlog_errno(ret); |
2440 | goto out; | 2540 | goto out; |
2441 | } | 2541 | } |
2442 | 2542 | ||
2443 | for(i = subtree_index + 1; i < path_num_items(right_path); i++) { | 2543 | for(i = subtree_index + 1; i < path_num_items(right_path); i++) { |
2444 | ret = ocfs2_journal_access(handle, inode, | 2544 | ret = ocfs2_path_bh_journal_access(handle, inode, |
2445 | right_path->p_node[i].bh, | 2545 | right_path, i); |
2446 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
2447 | if (ret) { | 2546 | if (ret) { |
2448 | mlog_errno(ret); | 2547 | mlog_errno(ret); |
2449 | goto out; | 2548 | goto out; |
2450 | } | 2549 | } |
2451 | 2550 | ||
2452 | ret = ocfs2_journal_access(handle, inode, | 2551 | ret = ocfs2_path_bh_journal_access(handle, inode, |
2453 | left_path->p_node[i].bh, | 2552 | left_path, i); |
2454 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
2455 | if (ret) { | 2553 | if (ret) { |
2456 | mlog_errno(ret); | 2554 | mlog_errno(ret); |
2457 | goto out; | 2555 | goto out; |
@@ -2596,16 +2694,17 @@ out: | |||
2596 | 2694 | ||
2597 | static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode, | 2695 | static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode, |
2598 | handle_t *handle, | 2696 | handle_t *handle, |
2599 | struct buffer_head *bh, | 2697 | struct ocfs2_path *path) |
2600 | struct ocfs2_extent_list *el) | ||
2601 | { | 2698 | { |
2602 | int ret; | 2699 | int ret; |
2700 | struct buffer_head *bh = path_leaf_bh(path); | ||
2701 | struct ocfs2_extent_list *el = path_leaf_el(path); | ||
2603 | 2702 | ||
2604 | if (!ocfs2_is_empty_extent(&el->l_recs[0])) | 2703 | if (!ocfs2_is_empty_extent(&el->l_recs[0])) |
2605 | return 0; | 2704 | return 0; |
2606 | 2705 | ||
2607 | ret = ocfs2_journal_access(handle, inode, bh, | 2706 | ret = ocfs2_path_bh_journal_access(handle, inode, path, |
2608 | OCFS2_JOURNAL_ACCESS_WRITE); | 2707 | path_num_items(path) - 1); |
2609 | if (ret) { | 2708 | if (ret) { |
2610 | mlog_errno(ret); | 2709 | mlog_errno(ret); |
2611 | goto out; | 2710 | goto out; |
@@ -2644,8 +2743,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode, | |||
2644 | goto out; | 2743 | goto out; |
2645 | } | 2744 | } |
2646 | 2745 | ||
2647 | left_path = ocfs2_new_path(path_root_bh(path), | 2746 | left_path = ocfs2_new_path_from_path(path); |
2648 | path_root_el(path)); | ||
2649 | if (!left_path) { | 2747 | if (!left_path) { |
2650 | ret = -ENOMEM; | 2748 | ret = -ENOMEM; |
2651 | mlog_errno(ret); | 2749 | mlog_errno(ret); |
@@ -2654,8 +2752,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode, | |||
2654 | 2752 | ||
2655 | ocfs2_cp_path(left_path, path); | 2753 | ocfs2_cp_path(left_path, path); |
2656 | 2754 | ||
2657 | right_path = ocfs2_new_path(path_root_bh(path), | 2755 | right_path = ocfs2_new_path_from_path(path); |
2658 | path_root_el(path)); | ||
2659 | if (!right_path) { | 2756 | if (!right_path) { |
2660 | ret = -ENOMEM; | 2757 | ret = -ENOMEM; |
2661 | mlog_errno(ret); | 2758 | mlog_errno(ret); |
@@ -2689,9 +2786,8 @@ static int __ocfs2_rotate_tree_left(struct inode *inode, | |||
2689 | * Caller might still want to make changes to the | 2786 | * Caller might still want to make changes to the |
2690 | * tree root, so re-add it to the journal here. | 2787 | * tree root, so re-add it to the journal here. |
2691 | */ | 2788 | */ |
2692 | ret = ocfs2_journal_access(handle, inode, | 2789 | ret = ocfs2_path_bh_journal_access(handle, inode, |
2693 | path_root_bh(left_path), | 2790 | left_path, 0); |
2694 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
2695 | if (ret) { | 2791 | if (ret) { |
2696 | mlog_errno(ret); | 2792 | mlog_errno(ret); |
2697 | goto out; | 2793 | goto out; |
@@ -2785,8 +2881,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, | |||
2785 | * We have a path to the left of this one - it needs | 2881 | * We have a path to the left of this one - it needs |
2786 | * an update too. | 2882 | * an update too. |
2787 | */ | 2883 | */ |
2788 | left_path = ocfs2_new_path(path_root_bh(path), | 2884 | left_path = ocfs2_new_path_from_path(path); |
2789 | path_root_el(path)); | ||
2790 | if (!left_path) { | 2885 | if (!left_path) { |
2791 | ret = -ENOMEM; | 2886 | ret = -ENOMEM; |
2792 | mlog_errno(ret); | 2887 | mlog_errno(ret); |
@@ -2875,8 +2970,7 @@ rightmost_no_delete: | |||
2875 | * it up front. | 2970 | * it up front. |
2876 | */ | 2971 | */ |
2877 | ret = ocfs2_rotate_rightmost_leaf_left(inode, handle, | 2972 | ret = ocfs2_rotate_rightmost_leaf_left(inode, handle, |
2878 | path_leaf_bh(path), | 2973 | path); |
2879 | path_leaf_el(path)); | ||
2880 | if (ret) | 2974 | if (ret) |
2881 | mlog_errno(ret); | 2975 | mlog_errno(ret); |
2882 | goto out; | 2976 | goto out; |
@@ -3027,8 +3121,7 @@ static int ocfs2_get_right_path(struct inode *inode, | |||
3027 | /* This function shouldn't be called for the rightmost leaf. */ | 3121 | /* This function shouldn't be called for the rightmost leaf. */ |
3028 | BUG_ON(right_cpos == 0); | 3122 | BUG_ON(right_cpos == 0); |
3029 | 3123 | ||
3030 | right_path = ocfs2_new_path(path_root_bh(left_path), | 3124 | right_path = ocfs2_new_path_from_path(left_path); |
3031 | path_root_el(left_path)); | ||
3032 | if (!right_path) { | 3125 | if (!right_path) { |
3033 | ret = -ENOMEM; | 3126 | ret = -ENOMEM; |
3034 | mlog_errno(ret); | 3127 | mlog_errno(ret); |
@@ -3111,8 +3204,8 @@ static int ocfs2_merge_rec_right(struct inode *inode, | |||
3111 | root_bh = left_path->p_node[subtree_index].bh; | 3204 | root_bh = left_path->p_node[subtree_index].bh; |
3112 | BUG_ON(root_bh != right_path->p_node[subtree_index].bh); | 3205 | BUG_ON(root_bh != right_path->p_node[subtree_index].bh); |
3113 | 3206 | ||
3114 | ret = ocfs2_journal_access(handle, inode, root_bh, | 3207 | ret = ocfs2_path_bh_journal_access(handle, inode, right_path, |
3115 | OCFS2_JOURNAL_ACCESS_WRITE); | 3208 | subtree_index); |
3116 | if (ret) { | 3209 | if (ret) { |
3117 | mlog_errno(ret); | 3210 | mlog_errno(ret); |
3118 | goto out; | 3211 | goto out; |
@@ -3120,17 +3213,15 @@ static int ocfs2_merge_rec_right(struct inode *inode, | |||
3120 | 3213 | ||
3121 | for (i = subtree_index + 1; | 3214 | for (i = subtree_index + 1; |
3122 | i < path_num_items(right_path); i++) { | 3215 | i < path_num_items(right_path); i++) { |
3123 | ret = ocfs2_journal_access(handle, inode, | 3216 | ret = ocfs2_path_bh_journal_access(handle, inode, |
3124 | right_path->p_node[i].bh, | 3217 | right_path, i); |
3125 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
3126 | if (ret) { | 3218 | if (ret) { |
3127 | mlog_errno(ret); | 3219 | mlog_errno(ret); |
3128 | goto out; | 3220 | goto out; |
3129 | } | 3221 | } |
3130 | 3222 | ||
3131 | ret = ocfs2_journal_access(handle, inode, | 3223 | ret = ocfs2_path_bh_journal_access(handle, inode, |
3132 | left_path->p_node[i].bh, | 3224 | left_path, i); |
3133 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
3134 | if (ret) { | 3225 | if (ret) { |
3135 | mlog_errno(ret); | 3226 | mlog_errno(ret); |
3136 | goto out; | 3227 | goto out; |
@@ -3142,8 +3233,8 @@ static int ocfs2_merge_rec_right(struct inode *inode, | |||
3142 | right_rec = &el->l_recs[index + 1]; | 3233 | right_rec = &el->l_recs[index + 1]; |
3143 | } | 3234 | } |
3144 | 3235 | ||
3145 | ret = ocfs2_journal_access(handle, inode, bh, | 3236 | ret = ocfs2_path_bh_journal_access(handle, inode, left_path, |
3146 | OCFS2_JOURNAL_ACCESS_WRITE); | 3237 | path_num_items(left_path) - 1); |
3147 | if (ret) { | 3238 | if (ret) { |
3148 | mlog_errno(ret); | 3239 | mlog_errno(ret); |
3149 | goto out; | 3240 | goto out; |
@@ -3199,8 +3290,7 @@ static int ocfs2_get_left_path(struct inode *inode, | |||
3199 | /* This function shouldn't be called for the leftmost leaf. */ | 3290 | /* This function shouldn't be called for the leftmost leaf. */ |
3200 | BUG_ON(left_cpos == 0); | 3291 | BUG_ON(left_cpos == 0); |
3201 | 3292 | ||
3202 | left_path = ocfs2_new_path(path_root_bh(right_path), | 3293 | left_path = ocfs2_new_path_from_path(right_path); |
3203 | path_root_el(right_path)); | ||
3204 | if (!left_path) { | 3294 | if (!left_path) { |
3205 | ret = -ENOMEM; | 3295 | ret = -ENOMEM; |
3206 | mlog_errno(ret); | 3296 | mlog_errno(ret); |
@@ -3283,8 +3373,8 @@ static int ocfs2_merge_rec_left(struct inode *inode, | |||
3283 | root_bh = left_path->p_node[subtree_index].bh; | 3373 | root_bh = left_path->p_node[subtree_index].bh; |
3284 | BUG_ON(root_bh != right_path->p_node[subtree_index].bh); | 3374 | BUG_ON(root_bh != right_path->p_node[subtree_index].bh); |
3285 | 3375 | ||
3286 | ret = ocfs2_journal_access(handle, inode, root_bh, | 3376 | ret = ocfs2_path_bh_journal_access(handle, inode, right_path, |
3287 | OCFS2_JOURNAL_ACCESS_WRITE); | 3377 | subtree_index); |
3288 | if (ret) { | 3378 | if (ret) { |
3289 | mlog_errno(ret); | 3379 | mlog_errno(ret); |
3290 | goto out; | 3380 | goto out; |
@@ -3292,17 +3382,15 @@ static int ocfs2_merge_rec_left(struct inode *inode, | |||
3292 | 3382 | ||
3293 | for (i = subtree_index + 1; | 3383 | for (i = subtree_index + 1; |
3294 | i < path_num_items(right_path); i++) { | 3384 | i < path_num_items(right_path); i++) { |
3295 | ret = ocfs2_journal_access(handle, inode, | 3385 | ret = ocfs2_path_bh_journal_access(handle, inode, |
3296 | right_path->p_node[i].bh, | 3386 | right_path, i); |
3297 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
3298 | if (ret) { | 3387 | if (ret) { |
3299 | mlog_errno(ret); | 3388 | mlog_errno(ret); |
3300 | goto out; | 3389 | goto out; |
3301 | } | 3390 | } |
3302 | 3391 | ||
3303 | ret = ocfs2_journal_access(handle, inode, | 3392 | ret = ocfs2_path_bh_journal_access(handle, inode, |
3304 | left_path->p_node[i].bh, | 3393 | left_path, i); |
3305 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
3306 | if (ret) { | 3394 | if (ret) { |
3307 | mlog_errno(ret); | 3395 | mlog_errno(ret); |
3308 | goto out; | 3396 | goto out; |
@@ -3314,8 +3402,8 @@ static int ocfs2_merge_rec_left(struct inode *inode, | |||
3314 | has_empty_extent = 1; | 3402 | has_empty_extent = 1; |
3315 | } | 3403 | } |
3316 | 3404 | ||
3317 | ret = ocfs2_journal_access(handle, inode, bh, | 3405 | ret = ocfs2_path_bh_journal_access(handle, inode, right_path, |
3318 | OCFS2_JOURNAL_ACCESS_WRITE); | 3406 | path_num_items(right_path) - 1); |
3319 | if (ret) { | 3407 | if (ret) { |
3320 | mlog_errno(ret); | 3408 | mlog_errno(ret); |
3321 | goto out; | 3409 | goto out; |
@@ -3732,8 +3820,7 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, | |||
3732 | * leftmost leaf. | 3820 | * leftmost leaf. |
3733 | */ | 3821 | */ |
3734 | if (left_cpos) { | 3822 | if (left_cpos) { |
3735 | left_path = ocfs2_new_path(path_root_bh(right_path), | 3823 | left_path = ocfs2_new_path_from_path(right_path); |
3736 | path_root_el(right_path)); | ||
3737 | if (!left_path) { | 3824 | if (!left_path) { |
3738 | ret = -ENOMEM; | 3825 | ret = -ENOMEM; |
3739 | mlog_errno(ret); | 3826 | mlog_errno(ret); |
@@ -3781,7 +3868,7 @@ static void ocfs2_split_record(struct inode *inode, | |||
3781 | struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el; | 3868 | struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el; |
3782 | struct ocfs2_extent_rec *rec, *tmprec; | 3869 | struct ocfs2_extent_rec *rec, *tmprec; |
3783 | 3870 | ||
3784 | right_el = path_leaf_el(right_path);; | 3871 | right_el = path_leaf_el(right_path); |
3785 | if (left_path) | 3872 | if (left_path) |
3786 | left_el = path_leaf_el(left_path); | 3873 | left_el = path_leaf_el(left_path); |
3787 | 3874 | ||
@@ -3958,8 +4045,8 @@ static int ocfs2_do_insert_extent(struct inode *inode, | |||
3958 | 4045 | ||
3959 | el = et->et_root_el; | 4046 | el = et->et_root_el; |
3960 | 4047 | ||
3961 | ret = ocfs2_journal_access(handle, inode, et->et_root_bh, | 4048 | ret = ocfs2_et_root_journal_access(handle, inode, et, |
3962 | OCFS2_JOURNAL_ACCESS_WRITE); | 4049 | OCFS2_JOURNAL_ACCESS_WRITE); |
3963 | if (ret) { | 4050 | if (ret) { |
3964 | mlog_errno(ret); | 4051 | mlog_errno(ret); |
3965 | goto out; | 4052 | goto out; |
@@ -3970,7 +4057,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, | |||
3970 | goto out_update_clusters; | 4057 | goto out_update_clusters; |
3971 | } | 4058 | } |
3972 | 4059 | ||
3973 | right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el); | 4060 | right_path = ocfs2_new_path_from_et(et); |
3974 | if (!right_path) { | 4061 | if (!right_path) { |
3975 | ret = -ENOMEM; | 4062 | ret = -ENOMEM; |
3976 | mlog_errno(ret); | 4063 | mlog_errno(ret); |
@@ -4020,8 +4107,8 @@ static int ocfs2_do_insert_extent(struct inode *inode, | |||
4020 | * ocfs2_rotate_tree_right() might have extended the | 4107 | * ocfs2_rotate_tree_right() might have extended the |
4021 | * transaction without re-journaling our tree root. | 4108 | * transaction without re-journaling our tree root. |
4022 | */ | 4109 | */ |
4023 | ret = ocfs2_journal_access(handle, inode, et->et_root_bh, | 4110 | ret = ocfs2_et_root_journal_access(handle, inode, et, |
4024 | OCFS2_JOURNAL_ACCESS_WRITE); | 4111 | OCFS2_JOURNAL_ACCESS_WRITE); |
4025 | if (ret) { | 4112 | if (ret) { |
4026 | mlog_errno(ret); | 4113 | mlog_errno(ret); |
4027 | goto out; | 4114 | goto out; |
@@ -4082,8 +4169,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, | |||
4082 | goto out; | 4169 | goto out; |
4083 | 4170 | ||
4084 | if (left_cpos != 0) { | 4171 | if (left_cpos != 0) { |
4085 | left_path = ocfs2_new_path(path_root_bh(path), | 4172 | left_path = ocfs2_new_path_from_path(path); |
4086 | path_root_el(path)); | ||
4087 | if (!left_path) | 4173 | if (!left_path) |
4088 | goto out; | 4174 | goto out; |
4089 | 4175 | ||
@@ -4097,8 +4183,15 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, | |||
4097 | le16_to_cpu(new_el->l_count)) { | 4183 | le16_to_cpu(new_el->l_count)) { |
4098 | bh = path_leaf_bh(left_path); | 4184 | bh = path_leaf_bh(left_path); |
4099 | eb = (struct ocfs2_extent_block *)bh->b_data; | 4185 | eb = (struct ocfs2_extent_block *)bh->b_data; |
4100 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, | 4186 | ocfs2_error(inode->i_sb, |
4101 | eb); | 4187 | "Extent block #%llu has an " |
4188 | "invalid l_next_free_rec of " | ||
4189 | "%d. It should have " | ||
4190 | "matched the l_count of %d", | ||
4191 | (unsigned long long)le64_to_cpu(eb->h_blkno), | ||
4192 | le16_to_cpu(new_el->l_next_free_rec), | ||
4193 | le16_to_cpu(new_el->l_count)); | ||
4194 | status = -EINVAL; | ||
4102 | goto out; | 4195 | goto out; |
4103 | } | 4196 | } |
4104 | rec = &new_el->l_recs[ | 4197 | rec = &new_el->l_recs[ |
@@ -4132,8 +4225,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, | |||
4132 | if (right_cpos == 0) | 4225 | if (right_cpos == 0) |
4133 | goto out; | 4226 | goto out; |
4134 | 4227 | ||
4135 | right_path = ocfs2_new_path(path_root_bh(path), | 4228 | right_path = ocfs2_new_path_from_path(path); |
4136 | path_root_el(path)); | ||
4137 | if (!right_path) | 4229 | if (!right_path) |
4138 | goto out; | 4230 | goto out; |
4139 | 4231 | ||
@@ -4147,8 +4239,12 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, | |||
4147 | if (le16_to_cpu(new_el->l_next_free_rec) <= 1) { | 4239 | if (le16_to_cpu(new_el->l_next_free_rec) <= 1) { |
4148 | bh = path_leaf_bh(right_path); | 4240 | bh = path_leaf_bh(right_path); |
4149 | eb = (struct ocfs2_extent_block *)bh->b_data; | 4241 | eb = (struct ocfs2_extent_block *)bh->b_data; |
4150 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, | 4242 | ocfs2_error(inode->i_sb, |
4151 | eb); | 4243 | "Extent block #%llu has an " |
4244 | "invalid l_next_free_rec of %d", | ||
4245 | (unsigned long long)le64_to_cpu(eb->h_blkno), | ||
4246 | le16_to_cpu(new_el->l_next_free_rec)); | ||
4247 | status = -EINVAL; | ||
4152 | goto out; | 4248 | goto out; |
4153 | } | 4249 | } |
4154 | rec = &new_el->l_recs[1]; | 4250 | rec = &new_el->l_recs[1]; |
@@ -4294,7 +4390,9 @@ static int ocfs2_figure_insert_type(struct inode *inode, | |||
4294 | * ocfs2_figure_insert_type() and ocfs2_add_branch() | 4390 | * ocfs2_figure_insert_type() and ocfs2_add_branch() |
4295 | * may want it later. | 4391 | * may want it later. |
4296 | */ | 4392 | */ |
4297 | ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), &bh); | 4393 | ret = ocfs2_read_extent_block(inode, |
4394 | ocfs2_et_get_last_eb_blk(et), | ||
4395 | &bh); | ||
4298 | if (ret) { | 4396 | if (ret) { |
4299 | mlog_exit(ret); | 4397 | mlog_exit(ret); |
4300 | goto out; | 4398 | goto out; |
@@ -4320,7 +4418,7 @@ static int ocfs2_figure_insert_type(struct inode *inode, | |||
4320 | return 0; | 4418 | return 0; |
4321 | } | 4419 | } |
4322 | 4420 | ||
4323 | path = ocfs2_new_path(et->et_root_bh, et->et_root_el); | 4421 | path = ocfs2_new_path_from_et(et); |
4324 | if (!path) { | 4422 | if (!path) { |
4325 | ret = -ENOMEM; | 4423 | ret = -ENOMEM; |
4326 | mlog_errno(ret); | 4424 | mlog_errno(ret); |
@@ -4531,9 +4629,9 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb, | |||
4531 | 4629 | ||
4532 | BUG_ON(num_bits > clusters_to_add); | 4630 | BUG_ON(num_bits > clusters_to_add); |
4533 | 4631 | ||
4534 | /* reserve our write early -- insert_extent may update the inode */ | 4632 | /* reserve our write early -- insert_extent may update the tree root */ |
4535 | status = ocfs2_journal_access(handle, inode, et->et_root_bh, | 4633 | status = ocfs2_et_root_journal_access(handle, inode, et, |
4536 | OCFS2_JOURNAL_ACCESS_WRITE); | 4634 | OCFS2_JOURNAL_ACCESS_WRITE); |
4537 | if (status < 0) { | 4635 | if (status < 0) { |
4538 | mlog_errno(status); | 4636 | mlog_errno(status); |
4539 | goto leave; | 4637 | goto leave; |
@@ -4760,20 +4858,15 @@ static int __ocfs2_mark_extent_written(struct inode *inode, | |||
4760 | if (path->p_tree_depth) { | 4858 | if (path->p_tree_depth) { |
4761 | struct ocfs2_extent_block *eb; | 4859 | struct ocfs2_extent_block *eb; |
4762 | 4860 | ||
4763 | ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), | 4861 | ret = ocfs2_read_extent_block(inode, |
4764 | &last_eb_bh); | 4862 | ocfs2_et_get_last_eb_blk(et), |
4863 | &last_eb_bh); | ||
4765 | if (ret) { | 4864 | if (ret) { |
4766 | mlog_exit(ret); | 4865 | mlog_exit(ret); |
4767 | goto out; | 4866 | goto out; |
4768 | } | 4867 | } |
4769 | 4868 | ||
4770 | eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; | 4869 | eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; |
4771 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
4772 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
4773 | ret = -EROFS; | ||
4774 | goto out; | ||
4775 | } | ||
4776 | |||
4777 | rightmost_el = &eb->h_list; | 4870 | rightmost_el = &eb->h_list; |
4778 | } else | 4871 | } else |
4779 | rightmost_el = path_root_el(path); | 4872 | rightmost_el = path_root_el(path); |
@@ -4854,7 +4947,7 @@ int ocfs2_mark_extent_written(struct inode *inode, | |||
4854 | if (et->et_ops == &ocfs2_dinode_et_ops) | 4947 | if (et->et_ops == &ocfs2_dinode_et_ops) |
4855 | ocfs2_extent_map_trunc(inode, 0); | 4948 | ocfs2_extent_map_trunc(inode, 0); |
4856 | 4949 | ||
4857 | left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el); | 4950 | left_path = ocfs2_new_path_from_et(et); |
4858 | if (!left_path) { | 4951 | if (!left_path) { |
4859 | ret = -ENOMEM; | 4952 | ret = -ENOMEM; |
4860 | mlog_errno(ret); | 4953 | mlog_errno(ret); |
@@ -4918,8 +5011,9 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et, | |||
4918 | 5011 | ||
4919 | depth = path->p_tree_depth; | 5012 | depth = path->p_tree_depth; |
4920 | if (depth > 0) { | 5013 | if (depth > 0) { |
4921 | ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), | 5014 | ret = ocfs2_read_extent_block(inode, |
4922 | &last_eb_bh); | 5015 | ocfs2_et_get_last_eb_blk(et), |
5016 | &last_eb_bh); | ||
4923 | if (ret < 0) { | 5017 | if (ret < 0) { |
4924 | mlog_errno(ret); | 5018 | mlog_errno(ret); |
4925 | goto out; | 5019 | goto out; |
@@ -5025,8 +5119,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, | |||
5025 | } | 5119 | } |
5026 | 5120 | ||
5027 | if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) { | 5121 | if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) { |
5028 | left_path = ocfs2_new_path(path_root_bh(path), | 5122 | left_path = ocfs2_new_path_from_path(path); |
5029 | path_root_el(path)); | ||
5030 | if (!left_path) { | 5123 | if (!left_path) { |
5031 | ret = -ENOMEM; | 5124 | ret = -ENOMEM; |
5032 | mlog_errno(ret); | 5125 | mlog_errno(ret); |
@@ -5135,7 +5228,7 @@ int ocfs2_remove_extent(struct inode *inode, | |||
5135 | 5228 | ||
5136 | ocfs2_extent_map_trunc(inode, 0); | 5229 | ocfs2_extent_map_trunc(inode, 0); |
5137 | 5230 | ||
5138 | path = ocfs2_new_path(et->et_root_bh, et->et_root_el); | 5231 | path = ocfs2_new_path_from_et(et); |
5139 | if (!path) { | 5232 | if (!path) { |
5140 | ret = -ENOMEM; | 5233 | ret = -ENOMEM; |
5141 | mlog_errno(ret); | 5234 | mlog_errno(ret); |
@@ -5255,6 +5348,78 @@ out: | |||
5255 | return ret; | 5348 | return ret; |
5256 | } | 5349 | } |
5257 | 5350 | ||
5351 | int ocfs2_remove_btree_range(struct inode *inode, | ||
5352 | struct ocfs2_extent_tree *et, | ||
5353 | u32 cpos, u32 phys_cpos, u32 len, | ||
5354 | struct ocfs2_cached_dealloc_ctxt *dealloc) | ||
5355 | { | ||
5356 | int ret; | ||
5357 | u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); | ||
5358 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
5359 | struct inode *tl_inode = osb->osb_tl_inode; | ||
5360 | handle_t *handle; | ||
5361 | struct ocfs2_alloc_context *meta_ac = NULL; | ||
5362 | |||
5363 | ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac); | ||
5364 | if (ret) { | ||
5365 | mlog_errno(ret); | ||
5366 | return ret; | ||
5367 | } | ||
5368 | |||
5369 | mutex_lock(&tl_inode->i_mutex); | ||
5370 | |||
5371 | if (ocfs2_truncate_log_needs_flush(osb)) { | ||
5372 | ret = __ocfs2_flush_truncate_log(osb); | ||
5373 | if (ret < 0) { | ||
5374 | mlog_errno(ret); | ||
5375 | goto out; | ||
5376 | } | ||
5377 | } | ||
5378 | |||
5379 | handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb)); | ||
5380 | if (IS_ERR(handle)) { | ||
5381 | ret = PTR_ERR(handle); | ||
5382 | mlog_errno(ret); | ||
5383 | goto out; | ||
5384 | } | ||
5385 | |||
5386 | ret = ocfs2_et_root_journal_access(handle, inode, et, | ||
5387 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
5388 | if (ret) { | ||
5389 | mlog_errno(ret); | ||
5390 | goto out; | ||
5391 | } | ||
5392 | |||
5393 | ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac, | ||
5394 | dealloc); | ||
5395 | if (ret) { | ||
5396 | mlog_errno(ret); | ||
5397 | goto out_commit; | ||
5398 | } | ||
5399 | |||
5400 | ocfs2_et_update_clusters(inode, et, -len); | ||
5401 | |||
5402 | ret = ocfs2_journal_dirty(handle, et->et_root_bh); | ||
5403 | if (ret) { | ||
5404 | mlog_errno(ret); | ||
5405 | goto out_commit; | ||
5406 | } | ||
5407 | |||
5408 | ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); | ||
5409 | if (ret) | ||
5410 | mlog_errno(ret); | ||
5411 | |||
5412 | out_commit: | ||
5413 | ocfs2_commit_trans(osb, handle); | ||
5414 | out: | ||
5415 | mutex_unlock(&tl_inode->i_mutex); | ||
5416 | |||
5417 | if (meta_ac) | ||
5418 | ocfs2_free_alloc_context(meta_ac); | ||
5419 | |||
5420 | return ret; | ||
5421 | } | ||
5422 | |||
5258 | int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) | 5423 | int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) |
5259 | { | 5424 | { |
5260 | struct buffer_head *tl_bh = osb->osb_tl_bh; | 5425 | struct buffer_head *tl_bh = osb->osb_tl_bh; |
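ocfs2_remove_btree_range() above bundles the allocator locking, truncate-log flush, transaction start, extent removal and cluster-count update into one call; the caller is left to push the freed space out afterwards. A hedged caller sketch, assuming et and dealloc were already set up with ocfs2_init_dinode_extent_tree() and ocfs2_init_dealloc_ctxt():

	ret = ocfs2_remove_btree_range(inode, &et, cpos, phys_cpos,
				       len, &dealloc);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Data clusters were appended to the truncate log; any freed
	 * metadata blocks are still queued on the dealloc context. */
	ocfs2_schedule_truncate_log_flush(osb, 1);
	ret = ocfs2_run_deallocs(osb, &dealloc);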
@@ -5308,13 +5473,13 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, | |||
5308 | start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); | 5473 | start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); |
5309 | 5474 | ||
5310 | di = (struct ocfs2_dinode *) tl_bh->b_data; | 5475 | di = (struct ocfs2_dinode *) tl_bh->b_data; |
5311 | tl = &di->id2.i_dealloc; | ||
5312 | if (!OCFS2_IS_VALID_DINODE(di)) { | ||
5313 | OCFS2_RO_ON_INVALID_DINODE(osb->sb, di); | ||
5314 | status = -EIO; | ||
5315 | goto bail; | ||
5316 | } | ||
5317 | 5476 | ||
5477 | /* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated | ||
5478 | * by the underlying call to ocfs2_read_inode_block(), so any | ||
5479 | * corruption is a code bug */ | ||
5480 | BUG_ON(!OCFS2_IS_VALID_DINODE(di)); | ||
5481 | |||
5482 | tl = &di->id2.i_dealloc; | ||
5318 | tl_count = le16_to_cpu(tl->tl_count); | 5483 | tl_count = le16_to_cpu(tl->tl_count); |
5319 | mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) || | 5484 | mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) || |
5320 | tl_count == 0, | 5485 | tl_count == 0, |
@@ -5332,8 +5497,8 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, | |||
5332 | goto bail; | 5497 | goto bail; |
5333 | } | 5498 | } |
5334 | 5499 | ||
5335 | status = ocfs2_journal_access(handle, tl_inode, tl_bh, | 5500 | status = ocfs2_journal_access_di(handle, tl_inode, tl_bh, |
5336 | OCFS2_JOURNAL_ACCESS_WRITE); | 5501 | OCFS2_JOURNAL_ACCESS_WRITE); |
5337 | if (status < 0) { | 5502 | if (status < 0) { |
5338 | mlog_errno(status); | 5503 | mlog_errno(status); |
5339 | goto bail; | 5504 | goto bail; |
@@ -5394,8 +5559,8 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, | |||
5394 | while (i >= 0) { | 5559 | while (i >= 0) { |
5395 | /* Caller has given us at least enough credits to | 5560 | /* Caller has given us at least enough credits to |
5396 | * update the truncate log dinode */ | 5561 | * update the truncate log dinode */ |
5397 | status = ocfs2_journal_access(handle, tl_inode, tl_bh, | 5562 | status = ocfs2_journal_access_di(handle, tl_inode, tl_bh, |
5398 | OCFS2_JOURNAL_ACCESS_WRITE); | 5563 | OCFS2_JOURNAL_ACCESS_WRITE); |
5399 | if (status < 0) { | 5564 | if (status < 0) { |
5400 | mlog_errno(status); | 5565 | mlog_errno(status); |
5401 | goto bail; | 5566 | goto bail; |
@@ -5464,13 +5629,13 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) | |||
5464 | BUG_ON(mutex_trylock(&tl_inode->i_mutex)); | 5629 | BUG_ON(mutex_trylock(&tl_inode->i_mutex)); |
5465 | 5630 | ||
5466 | di = (struct ocfs2_dinode *) tl_bh->b_data; | 5631 | di = (struct ocfs2_dinode *) tl_bh->b_data; |
5467 | tl = &di->id2.i_dealloc; | ||
5468 | if (!OCFS2_IS_VALID_DINODE(di)) { | ||
5469 | OCFS2_RO_ON_INVALID_DINODE(osb->sb, di); | ||
5470 | status = -EIO; | ||
5471 | goto out; | ||
5472 | } | ||
5473 | 5632 | ||
5633 | /* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated | ||
5634 | * by the underlying call to ocfs2_read_inode_block(), so any | ||
5635 | * corruption is a code bug */ | ||
5636 | BUG_ON(!OCFS2_IS_VALID_DINODE(di)); | ||
5637 | |||
5638 | tl = &di->id2.i_dealloc; | ||
5474 | num_to_flush = le16_to_cpu(tl->tl_used); | 5639 | num_to_flush = le16_to_cpu(tl->tl_used); |
5475 | mlog(0, "Flush %u records from truncate log #%llu\n", | 5640 | mlog(0, "Flush %u records from truncate log #%llu\n", |
5476 | num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno); | 5641 | num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno); |
@@ -5586,7 +5751,7 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, | |||
5586 | goto bail; | 5751 | goto bail; |
5587 | } | 5752 | } |
5588 | 5753 | ||
5589 | status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); | 5754 | status = ocfs2_read_inode_block(inode, &bh); |
5590 | if (status < 0) { | 5755 | if (status < 0) { |
5591 | iput(inode); | 5756 | iput(inode); |
5592 | mlog_errno(status); | 5757 | mlog_errno(status); |
@@ -5625,13 +5790,13 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, | |||
5625 | } | 5790 | } |
5626 | 5791 | ||
5627 | di = (struct ocfs2_dinode *) tl_bh->b_data; | 5792 | di = (struct ocfs2_dinode *) tl_bh->b_data; |
5628 | tl = &di->id2.i_dealloc; | ||
5629 | if (!OCFS2_IS_VALID_DINODE(di)) { | ||
5630 | OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di); | ||
5631 | status = -EIO; | ||
5632 | goto bail; | ||
5633 | } | ||
5634 | 5793 | ||
5794 | /* tl_bh is loaded from ocfs2_get_truncate_log_info(). It's | ||
5795 | * validated by the underlying call to ocfs2_read_inode_block(), | ||
5796 | * so any corruption is a code bug */ | ||
5797 | BUG_ON(!OCFS2_IS_VALID_DINODE(di)); | ||
5798 | |||
5799 | tl = &di->id2.i_dealloc; | ||
5635 | if (le16_to_cpu(tl->tl_used)) { | 5800 | if (le16_to_cpu(tl->tl_used)) { |
5636 | mlog(0, "We'll have %u logs to recover\n", | 5801 | mlog(0, "We'll have %u logs to recover\n", |
5637 | le16_to_cpu(tl->tl_used)); | 5802 | le16_to_cpu(tl->tl_used)); |
@@ -5651,6 +5816,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, | |||
5651 | * tl_used. */ | 5816 | * tl_used. */ |
5652 | tl->tl_used = 0; | 5817 | tl->tl_used = 0; |
5653 | 5818 | ||
5819 | ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check); | ||
5654 | status = ocfs2_write_block(osb, tl_bh, tl_inode); | 5820 | status = ocfs2_write_block(osb, tl_bh, tl_inode); |
5655 | if (status < 0) { | 5821 | if (status < 0) { |
5656 | mlog_errno(status); | 5822 | mlog_errno(status); |
@@ -5800,7 +5966,10 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb) | |||
5800 | */ | 5966 | */ |
5801 | 5967 | ||
5802 | /* | 5968 | /* |
5803 | * Describes a single block free from a suballocator | 5969 | * Describe a single bit freed from a suballocator. For the block |
5970 | * suballocators, it represents one block. For the global cluster | ||
5971 | * allocator, it represents some clusters and free_bit indicates | ||
5972 | * the number of clusters. | ||
5804 | */ | 5973 | */ |
5805 | struct ocfs2_cached_block_free { | 5974 | struct ocfs2_cached_block_free { |
5806 | struct ocfs2_cached_block_free *free_next; | 5975 | struct ocfs2_cached_block_free *free_next; |
@@ -5815,10 +5984,10 @@ struct ocfs2_per_slot_free_list { | |||
5815 | struct ocfs2_cached_block_free *f_first; | 5984 | struct ocfs2_cached_block_free *f_first; |
5816 | }; | 5985 | }; |
5817 | 5986 | ||
5818 | static int ocfs2_free_cached_items(struct ocfs2_super *osb, | 5987 | static int ocfs2_free_cached_blocks(struct ocfs2_super *osb, |
5819 | int sysfile_type, | 5988 | int sysfile_type, |
5820 | int slot, | 5989 | int slot, |
5821 | struct ocfs2_cached_block_free *head) | 5990 | struct ocfs2_cached_block_free *head) |
5822 | { | 5991 | { |
5823 | int ret; | 5992 | int ret; |
5824 | u64 bg_blkno; | 5993 | u64 bg_blkno; |
@@ -5893,6 +6062,82 @@ out: | |||
5893 | return ret; | 6062 | return ret; |
5894 | } | 6063 | } |
5895 | 6064 | ||
6065 | int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, | ||
6066 | u64 blkno, unsigned int bit) | ||
6067 | { | ||
6068 | int ret = 0; | ||
6069 | struct ocfs2_cached_block_free *item; | ||
6070 | |||
6071 | item = kmalloc(sizeof(*item), GFP_NOFS); | ||
6072 | if (item == NULL) { | ||
6073 | ret = -ENOMEM; | ||
6074 | mlog_errno(ret); | ||
6075 | return ret; | ||
6076 | } | ||
6077 | |||
6078 | mlog(0, "Insert clusters: (bit %u, blk %llu)\n", | ||
6079 | bit, (unsigned long long)blkno); | ||
6080 | |||
6081 | item->free_blk = blkno; | ||
6082 | item->free_bit = bit; | ||
6083 | item->free_next = ctxt->c_global_allocator; | ||
6084 | |||
6085 | ctxt->c_global_allocator = item; | ||
6086 | return ret; | ||
6087 | } | ||
6088 | |||
6089 | static int ocfs2_free_cached_clusters(struct ocfs2_super *osb, | ||
6090 | struct ocfs2_cached_block_free *head) | ||
6091 | { | ||
6092 | struct ocfs2_cached_block_free *tmp; | ||
6093 | struct inode *tl_inode = osb->osb_tl_inode; | ||
6094 | handle_t *handle; | ||
6095 | int ret = 0; | ||
6096 | |||
6097 | mutex_lock(&tl_inode->i_mutex); | ||
6098 | |||
6099 | while (head) { | ||
6100 | if (ocfs2_truncate_log_needs_flush(osb)) { | ||
6101 | ret = __ocfs2_flush_truncate_log(osb); | ||
6102 | if (ret < 0) { | ||
6103 | mlog_errno(ret); | ||
6104 | break; | ||
6105 | } | ||
6106 | } | ||
6107 | |||
6108 | handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); | ||
6109 | if (IS_ERR(handle)) { | ||
6110 | ret = PTR_ERR(handle); | ||
6111 | mlog_errno(ret); | ||
6112 | break; | ||
6113 | } | ||
6114 | |||
6115 | ret = ocfs2_truncate_log_append(osb, handle, head->free_blk, | ||
6116 | head->free_bit); | ||
6117 | |||
6118 | ocfs2_commit_trans(osb, handle); | ||
6119 | tmp = head; | ||
6120 | head = head->free_next; | ||
6121 | kfree(tmp); | ||
6122 | |||
6123 | if (ret < 0) { | ||
6124 | mlog_errno(ret); | ||
6125 | break; | ||
6126 | } | ||
6127 | } | ||
6128 | |||
6129 | mutex_unlock(&tl_inode->i_mutex); | ||
6130 | |||
6131 | while (head) { | ||
6132 | /* Premature exit may have left some dangling items. */ | ||
6133 | tmp = head; | ||
6134 | head = head->free_next; | ||
6135 | kfree(tmp); | ||
6136 | } | ||
6137 | |||
6138 | return ret; | ||
6139 | } | ||
6140 | |||
5896 | int ocfs2_run_deallocs(struct ocfs2_super *osb, | 6141 | int ocfs2_run_deallocs(struct ocfs2_super *osb, |
5897 | struct ocfs2_cached_dealloc_ctxt *ctxt) | 6142 | struct ocfs2_cached_dealloc_ctxt *ctxt) |
5898 | { | 6143 | { |
@@ -5908,8 +6153,10 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb, | |||
5908 | if (fl->f_first) { | 6153 | if (fl->f_first) { |
5909 | mlog(0, "Free items: (type %u, slot %d)\n", | 6154 | mlog(0, "Free items: (type %u, slot %d)\n", |
5910 | fl->f_inode_type, fl->f_slot); | 6155 | fl->f_inode_type, fl->f_slot); |
5911 | ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type, | 6156 | ret2 = ocfs2_free_cached_blocks(osb, |
5912 | fl->f_slot, fl->f_first); | 6157 | fl->f_inode_type, |
6158 | fl->f_slot, | ||
6159 | fl->f_first); | ||
5913 | if (ret2) | 6160 | if (ret2) |
5914 | mlog_errno(ret2); | 6161 | mlog_errno(ret2); |
5915 | if (!ret) | 6162 | if (!ret) |
@@ -5920,6 +6167,17 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb, | |||
5920 | kfree(fl); | 6167 | kfree(fl); |
5921 | } | 6168 | } |
5922 | 6169 | ||
6170 | if (ctxt->c_global_allocator) { | ||
6171 | ret2 = ocfs2_free_cached_clusters(osb, | ||
6172 | ctxt->c_global_allocator); | ||
6173 | if (ret2) | ||
6174 | mlog_errno(ret2); | ||
6175 | if (!ret) | ||
6176 | ret = ret2; | ||
6177 | |||
6178 | ctxt->c_global_allocator = NULL; | ||
6179 | } | ||
6180 | |||
5923 | return ret; | 6181 | return ret; |
5924 | } | 6182 | } |
5925 | 6183 | ||
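With ocfs2_cache_cluster_dealloc() and the c_global_allocator handling in ocfs2_run_deallocs() above, clusters destined for the global bitmap can be queued on a dealloc context and replayed through the truncate log later. A minimal sketch (it assumes ocfs2_init_dealloc_ctxt() is also taught to clear c_global_allocator elsewhere in this patch):

	struct ocfs2_cached_dealloc_ctxt dealloc;
	int ret;

	ocfs2_init_dealloc_ctxt(&dealloc);

	/* Queue num_clusters clusters whose first block is blkno. */
	ret = ocfs2_cache_cluster_dealloc(&dealloc, blkno, num_clusters);
	if (!ret)
		/* Appends each queued range to the truncate log. */
		ret = ocfs2_run_deallocs(osb, &dealloc);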
@@ -6075,11 +6333,10 @@ static int ocfs2_find_new_last_ext_blk(struct inode *inode, | |||
6075 | 6333 | ||
6076 | eb = (struct ocfs2_extent_block *) bh->b_data; | 6334 | eb = (struct ocfs2_extent_block *) bh->b_data; |
6077 | el = &eb->h_list; | 6335 | el = &eb->h_list; |
6078 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | 6336 | |
6079 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | 6337 | /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block(). |
6080 | ret = -EROFS; | 6338 | * Any corruption is a code bug. */ |
6081 | goto out; | 6339 | BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb)); |
6082 | } | ||
6083 | 6340 | ||
6084 | *new_last_eb = bh; | 6341 | *new_last_eb = bh; |
6085 | get_bh(*new_last_eb); | 6342 | get_bh(*new_last_eb); |
@@ -6326,8 +6583,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, | |||
6326 | } | 6583 | } |
6327 | 6584 | ||
6328 | if (last_eb_bh) { | 6585 | if (last_eb_bh) { |
6329 | status = ocfs2_journal_access(handle, inode, last_eb_bh, | 6586 | status = ocfs2_journal_access_eb(handle, inode, last_eb_bh, |
6330 | OCFS2_JOURNAL_ACCESS_WRITE); | 6587 | OCFS2_JOURNAL_ACCESS_WRITE); |
6331 | if (status < 0) { | 6588 | if (status < 0) { |
6332 | mlog_errno(status); | 6589 | mlog_errno(status); |
6333 | goto bail; | 6590 | goto bail; |
@@ -6350,6 +6607,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, | |||
6350 | goto bail; | 6607 | goto bail; |
6351 | } | 6608 | } |
6352 | 6609 | ||
6610 | vfs_dq_free_space_nodirty(inode, | ||
6611 | ocfs2_clusters_to_bytes(osb->sb, clusters_to_del)); | ||
6353 | spin_lock(&OCFS2_I(inode)->ip_lock); | 6612 | spin_lock(&OCFS2_I(inode)->ip_lock); |
6354 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - | 6613 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - |
6355 | clusters_to_del; | 6614 | clusters_to_del; |
@@ -6436,11 +6695,6 @@ static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, | |||
6436 | mlog_errno(ret); | 6695 | mlog_errno(ret); |
6437 | else if (ocfs2_should_order_data(inode)) { | 6696 | else if (ocfs2_should_order_data(inode)) { |
6438 | ret = ocfs2_jbd2_file_inode(handle, inode); | 6697 | ret = ocfs2_jbd2_file_inode(handle, inode); |
6439 | #ifdef CONFIG_OCFS2_COMPAT_JBD | ||
6440 | ret = walk_page_buffers(handle, page_buffers(page), | ||
6441 | from, to, &partial, | ||
6442 | ocfs2_journal_dirty_data); | ||
6443 | #endif | ||
6444 | if (ret < 0) | 6698 | if (ret < 0) |
6445 | mlog_errno(ret); | 6699 | mlog_errno(ret); |
6446 | } | 6700 | } |
@@ -6663,6 +6917,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, | |||
6663 | struct page **pages = NULL; | 6917 | struct page **pages = NULL; |
6664 | loff_t end = osb->s_clustersize; | 6918 | loff_t end = osb->s_clustersize; |
6665 | struct ocfs2_extent_tree et; | 6919 | struct ocfs2_extent_tree et; |
6920 | int did_quota = 0; | ||
6666 | 6921 | ||
6667 | has_data = i_size_read(inode) ? 1 : 0; | 6922 | has_data = i_size_read(inode) ? 1 : 0; |
6668 | 6923 | ||
@@ -6682,15 +6937,16 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, | |||
6682 | } | 6937 | } |
6683 | } | 6938 | } |
6684 | 6939 | ||
6685 | handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS); | 6940 | handle = ocfs2_start_trans(osb, |
6941 | ocfs2_inline_to_extents_credits(osb->sb)); | ||
6686 | if (IS_ERR(handle)) { | 6942 | if (IS_ERR(handle)) { |
6687 | ret = PTR_ERR(handle); | 6943 | ret = PTR_ERR(handle); |
6688 | mlog_errno(ret); | 6944 | mlog_errno(ret); |
6689 | goto out_unlock; | 6945 | goto out_unlock; |
6690 | } | 6946 | } |
6691 | 6947 | ||
6692 | ret = ocfs2_journal_access(handle, inode, di_bh, | 6948 | ret = ocfs2_journal_access_di(handle, inode, di_bh, |
6693 | OCFS2_JOURNAL_ACCESS_WRITE); | 6949 | OCFS2_JOURNAL_ACCESS_WRITE); |
6694 | if (ret) { | 6950 | if (ret) { |
6695 | mlog_errno(ret); | 6951 | mlog_errno(ret); |
6696 | goto out_commit; | 6952 | goto out_commit; |
@@ -6701,6 +6957,13 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, | |||
6701 | unsigned int page_end; | 6957 | unsigned int page_end; |
6702 | u64 phys; | 6958 | u64 phys; |
6703 | 6959 | ||
6960 | if (vfs_dq_alloc_space_nodirty(inode, | ||
6961 | ocfs2_clusters_to_bytes(osb->sb, 1))) { | ||
6962 | ret = -EDQUOT; | ||
6963 | goto out_commit; | ||
6964 | } | ||
6965 | did_quota = 1; | ||
6966 | |||
6704 | ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, | 6967 | ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, |
6705 | &num); | 6968 | &num); |
6706 | if (ret) { | 6969 | if (ret) { |
@@ -6774,6 +7037,10 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, | |||
6774 | } | 7037 | } |
6775 | 7038 | ||
6776 | out_commit: | 7039 | out_commit: |
7040 | if (ret < 0 && did_quota) | ||
7041 | vfs_dq_free_space_nodirty(inode, | ||
7042 | ocfs2_clusters_to_bytes(osb->sb, 1)); | ||
7043 | |||
6777 | ocfs2_commit_trans(osb, handle); | 7044 | ocfs2_commit_trans(osb, handle); |
6778 | 7045 | ||
6779 | out_unlock: | 7046 | out_unlock: |
@@ -6813,7 +7080,8 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, | |||
6813 | new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, | 7080 | new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, |
6814 | i_size_read(inode)); | 7081 | i_size_read(inode)); |
6815 | 7082 | ||
6816 | path = ocfs2_new_path(fe_bh, &di->id2.i_list); | 7083 | path = ocfs2_new_path(fe_bh, &di->id2.i_list, |
7084 | ocfs2_journal_access_di); | ||
6817 | if (!path) { | 7085 | if (!path) { |
6818 | status = -ENOMEM; | 7086 | status = -ENOMEM; |
6819 | mlog_errno(status); | 7087 | mlog_errno(status); |
@@ -6984,20 +7252,14 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, | |||
6984 | ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc); | 7252 | ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc); |
6985 | 7253 | ||
6986 | if (fe->id2.i_list.l_tree_depth) { | 7254 | if (fe->id2.i_list.l_tree_depth) { |
6987 | status = ocfs2_read_block(inode, le64_to_cpu(fe->i_last_eb_blk), | 7255 | status = ocfs2_read_extent_block(inode, |
6988 | &last_eb_bh); | 7256 | le64_to_cpu(fe->i_last_eb_blk), |
7257 | &last_eb_bh); | ||
6989 | if (status < 0) { | 7258 | if (status < 0) { |
6990 | mlog_errno(status); | 7259 | mlog_errno(status); |
6991 | goto bail; | 7260 | goto bail; |
6992 | } | 7261 | } |
6993 | eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; | 7262 | eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; |
6994 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
6995 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
6996 | |||
6997 | brelse(last_eb_bh); | ||
6998 | status = -EIO; | ||
6999 | goto bail; | ||
7000 | } | ||
7001 | } | 7263 | } |
7002 | 7264 | ||
7003 | (*tc)->tc_last_eb_bh = last_eb_bh; | 7265 | (*tc)->tc_last_eb_bh = last_eb_bh; |
@@ -7052,8 +7314,8 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, | |||
7052 | goto out; | 7314 | goto out; |
7053 | } | 7315 | } |
7054 | 7316 | ||
7055 | ret = ocfs2_journal_access(handle, inode, di_bh, | 7317 | ret = ocfs2_journal_access_di(handle, inode, di_bh, |
7056 | OCFS2_JOURNAL_ACCESS_WRITE); | 7318 | OCFS2_JOURNAL_ACCESS_WRITE); |
7057 | if (ret) { | 7319 | if (ret) { |
7058 | mlog_errno(ret); | 7320 | mlog_errno(ret); |
7059 | goto out_commit; | 7321 | goto out_commit; |
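Note on the quota hunks above: ocfs2_do_truncate() now releases the truncated clusters' quota with vfs_dq_free_space_nodirty(), while ocfs2_convert_inline_data_to_extents() charges quota inside the running transaction and remembers the charge in did_quota so the error path can undo it before commit. The sketch below pulls that claim-then-rollback shape out of the diff for readability; ocfs2_do_one_cluster() is a hypothetical stand-in for the real claim/insert steps, everything else uses only the calls visible above.

        /* Sketch only: the quota charge/rollback idiom used above. */
        static int ocfs2_quota_charge_sketch(struct ocfs2_super *osb,
                                             struct inode *inode)
        {
                int ret, did_quota = 0;

                if (vfs_dq_alloc_space_nodirty(inode,
                                ocfs2_clusters_to_bytes(osb->sb, 1))) {
                        ret = -EDQUOT;  /* nothing charged, nothing to undo */
                        goto out;
                }
                did_quota = 1;          /* remember the charge for the error path */

                ret = ocfs2_do_one_cluster(inode);      /* hypothetical allocation step */
        out:
                if (ret < 0 && did_quota)
                        vfs_dq_free_space_nodirty(inode,
                                        ocfs2_clusters_to_bytes(osb->sb, 1));
                return ret;
        }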
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 70257c84cfbe..cceff5c37f47 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h | |||
@@ -45,7 +45,9 @@ | |||
45 | * | 45 | * |
46 | * ocfs2_extent_tree contains info for the root of the b-tree, it must have a | 46 | * ocfs2_extent_tree contains info for the root of the b-tree, it must have a |
47 | * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree | 47 | * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree |
48 | * functions. | 48 | * functions. With metadata ecc, we now call different journal_access |
49 | * functions for each type of metadata, so it must have the | ||
50 | * root_journal_access function. | ||
49 | * ocfs2_extent_tree_operations abstract the normal operations we do for | 51 | * ocfs2_extent_tree_operations abstract the normal operations we do for |
50 | * the root of extent b-tree. | 52 | * the root of extent b-tree. |
51 | */ | 53 | */ |
@@ -54,6 +56,7 @@ struct ocfs2_extent_tree { | |||
54 | struct ocfs2_extent_tree_operations *et_ops; | 56 | struct ocfs2_extent_tree_operations *et_ops; |
55 | struct buffer_head *et_root_bh; | 57 | struct buffer_head *et_root_bh; |
56 | struct ocfs2_extent_list *et_root_el; | 58 | struct ocfs2_extent_list *et_root_el; |
59 | ocfs2_journal_access_func et_root_journal_access; | ||
57 | void *et_object; | 60 | void *et_object; |
58 | unsigned int et_max_leaf_clusters; | 61 | unsigned int et_max_leaf_clusters; |
59 | }; | 62 | }; |
@@ -68,10 +71,18 @@ void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, | |||
68 | void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, | 71 | void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, |
69 | struct inode *inode, | 72 | struct inode *inode, |
70 | struct buffer_head *bh); | 73 | struct buffer_head *bh); |
74 | struct ocfs2_xattr_value_buf; | ||
71 | void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, | 75 | void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, |
72 | struct inode *inode, | 76 | struct inode *inode, |
73 | struct buffer_head *bh, | 77 | struct ocfs2_xattr_value_buf *vb); |
74 | struct ocfs2_xattr_value_root *xv); | 78 | |
79 | /* | ||
80 | * Read an extent block into *bh. If *bh is NULL, a bh will be | ||
81 | * allocated. This is a cached read. The extent block will be validated | ||
82 | * with ocfs2_validate_extent_block(). | ||
83 | */ | ||
84 | int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno, | ||
85 | struct buffer_head **bh); | ||
75 | 86 | ||
76 | struct ocfs2_alloc_context; | 87 | struct ocfs2_alloc_context; |
77 | int ocfs2_insert_extent(struct ocfs2_super *osb, | 88 | int ocfs2_insert_extent(struct ocfs2_super *osb, |
@@ -110,6 +121,11 @@ int ocfs2_remove_extent(struct inode *inode, | |||
110 | u32 cpos, u32 len, handle_t *handle, | 121 | u32 cpos, u32 len, handle_t *handle, |
111 | struct ocfs2_alloc_context *meta_ac, | 122 | struct ocfs2_alloc_context *meta_ac, |
112 | struct ocfs2_cached_dealloc_ctxt *dealloc); | 123 | struct ocfs2_cached_dealloc_ctxt *dealloc); |
124 | int ocfs2_remove_btree_range(struct inode *inode, | ||
125 | struct ocfs2_extent_tree *et, | ||
126 | u32 cpos, u32 phys_cpos, u32 len, | ||
127 | struct ocfs2_cached_dealloc_ctxt *dealloc); | ||
128 | |||
113 | int ocfs2_num_free_extents(struct ocfs2_super *osb, | 129 | int ocfs2_num_free_extents(struct ocfs2_super *osb, |
114 | struct inode *inode, | 130 | struct inode *inode, |
115 | struct ocfs2_extent_tree *et); | 131 | struct ocfs2_extent_tree *et); |
@@ -167,10 +183,18 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb); | |||
167 | */ | 183 | */ |
168 | struct ocfs2_cached_dealloc_ctxt { | 184 | struct ocfs2_cached_dealloc_ctxt { |
169 | struct ocfs2_per_slot_free_list *c_first_suballocator; | 185 | struct ocfs2_per_slot_free_list *c_first_suballocator; |
186 | struct ocfs2_cached_block_free *c_global_allocator; | ||
170 | }; | 187 | }; |
171 | static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c) | 188 | static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c) |
172 | { | 189 | { |
173 | c->c_first_suballocator = NULL; | 190 | c->c_first_suballocator = NULL; |
191 | c->c_global_allocator = NULL; | ||
192 | } | ||
193 | int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, | ||
194 | u64 blkno, unsigned int bit); | ||
195 | static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c) | ||
196 | { | ||
197 | return c->c_global_allocator != NULL; | ||
174 | } | 198 | } |
175 | int ocfs2_run_deallocs(struct ocfs2_super *osb, | 199 | int ocfs2_run_deallocs(struct ocfs2_super *osb, |
176 | struct ocfs2_cached_dealloc_ctxt *ctxt); | 200 | struct ocfs2_cached_dealloc_ctxt *ctxt); |
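The c_global_allocator member added above lets callers queue whole clusters for deferred freeing next to the existing per-slot suballocator lists. A hedged sketch of the intended flow, using only the helpers declared in this header (osb, blkno and bit are assumed to come from the caller; error handling trimmed):

        struct ocfs2_cached_dealloc_ctxt dealloc;
        int ret;

        ocfs2_init_dealloc_ctxt(&dealloc);

        /* Record the cluster for deferred freeing instead of freeing it
         * while allocator locks are still held. */
        ret = ocfs2_cache_cluster_dealloc(&dealloc, blkno, bit);
        if (ret)
                mlog_errno(ret);

        /* ... extent/truncate work continues here ... */

        if (ocfs2_dealloc_has_cluster(&dealloc))
                mlog(0, "clusters are queued for deferred freeing\n");

        /* Frees everything cached above, including the c_global_allocator list. */
        ret = ocfs2_run_deallocs(osb, &dealloc);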
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index c22543b33420..a067a6cffb01 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/swap.h> | 27 | #include <linux/swap.h> |
28 | #include <linux/pipe_fs_i.h> | 28 | #include <linux/pipe_fs_i.h> |
29 | #include <linux/mpage.h> | 29 | #include <linux/mpage.h> |
30 | #include <linux/quotaops.h> | ||
30 | 31 | ||
31 | #define MLOG_MASK_PREFIX ML_FILE_IO | 32 | #define MLOG_MASK_PREFIX ML_FILE_IO |
32 | #include <cluster/masklog.h> | 33 | #include <cluster/masklog.h> |
@@ -68,20 +69,13 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, | |||
68 | goto bail; | 69 | goto bail; |
69 | } | 70 | } |
70 | 71 | ||
71 | status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); | 72 | status = ocfs2_read_inode_block(inode, &bh); |
72 | if (status < 0) { | 73 | if (status < 0) { |
73 | mlog_errno(status); | 74 | mlog_errno(status); |
74 | goto bail; | 75 | goto bail; |
75 | } | 76 | } |
76 | fe = (struct ocfs2_dinode *) bh->b_data; | 77 | fe = (struct ocfs2_dinode *) bh->b_data; |
77 | 78 | ||
78 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
79 | mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n", | ||
80 | (unsigned long long)le64_to_cpu(fe->i_blkno), 7, | ||
81 | fe->i_signature); | ||
82 | goto bail; | ||
83 | } | ||
84 | |||
85 | if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb, | 79 | if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb, |
86 | le32_to_cpu(fe->i_clusters))) { | 80 | le32_to_cpu(fe->i_clusters))) { |
87 | mlog(ML_ERROR, "block offset is outside the allocated size: " | 81 | mlog(ML_ERROR, "block offset is outside the allocated size: " |
@@ -262,7 +256,7 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page) | |||
262 | BUG_ON(!PageLocked(page)); | 256 | BUG_ON(!PageLocked(page)); |
263 | BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); | 257 | BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); |
264 | 258 | ||
265 | ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh); | 259 | ret = ocfs2_read_inode_block(inode, &di_bh); |
266 | if (ret) { | 260 | if (ret) { |
267 | mlog_errno(ret); | 261 | mlog_errno(ret); |
268 | goto out; | 262 | goto out; |
@@ -481,12 +475,6 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode, | |||
481 | 475 | ||
482 | if (ocfs2_should_order_data(inode)) { | 476 | if (ocfs2_should_order_data(inode)) { |
483 | ret = ocfs2_jbd2_file_inode(handle, inode); | 477 | ret = ocfs2_jbd2_file_inode(handle, inode); |
484 | #ifdef CONFIG_OCFS2_COMPAT_JBD | ||
485 | ret = walk_page_buffers(handle, | ||
486 | page_buffers(page), | ||
487 | from, to, NULL, | ||
488 | ocfs2_journal_dirty_data); | ||
489 | #endif | ||
490 | if (ret < 0) | 478 | if (ret < 0) |
491 | mlog_errno(ret); | 479 | mlog_errno(ret); |
492 | } | 480 | } |
@@ -1072,15 +1060,8 @@ static void ocfs2_write_failure(struct inode *inode, | |||
1072 | tmppage = wc->w_pages[i]; | 1060 | tmppage = wc->w_pages[i]; |
1073 | 1061 | ||
1074 | if (page_has_buffers(tmppage)) { | 1062 | if (page_has_buffers(tmppage)) { |
1075 | if (ocfs2_should_order_data(inode)) { | 1063 | if (ocfs2_should_order_data(inode)) |
1076 | ocfs2_jbd2_file_inode(wc->w_handle, inode); | 1064 | ocfs2_jbd2_file_inode(wc->w_handle, inode); |
1077 | #ifdef CONFIG_OCFS2_COMPAT_JBD | ||
1078 | walk_page_buffers(wc->w_handle, | ||
1079 | page_buffers(tmppage), | ||
1080 | from, to, NULL, | ||
1081 | ocfs2_journal_dirty_data); | ||
1082 | #endif | ||
1083 | } | ||
1084 | 1065 | ||
1085 | block_commit_write(tmppage, from, to); | 1066 | block_commit_write(tmppage, from, to); |
1086 | } | 1067 | } |
@@ -1531,8 +1512,8 @@ static int ocfs2_write_begin_inline(struct address_space *mapping, | |||
1531 | goto out; | 1512 | goto out; |
1532 | } | 1513 | } |
1533 | 1514 | ||
1534 | ret = ocfs2_journal_access(handle, inode, wc->w_di_bh, | 1515 | ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh, |
1535 | OCFS2_JOURNAL_ACCESS_WRITE); | 1516 | OCFS2_JOURNAL_ACCESS_WRITE); |
1536 | if (ret) { | 1517 | if (ret) { |
1537 | ocfs2_commit_trans(osb, handle); | 1518 | ocfs2_commit_trans(osb, handle); |
1538 | 1519 | ||
@@ -1750,15 +1731,20 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, | |||
1750 | 1731 | ||
1751 | wc->w_handle = handle; | 1732 | wc->w_handle = handle; |
1752 | 1733 | ||
1734 | if (clusters_to_alloc && vfs_dq_alloc_space_nodirty(inode, | ||
1735 | ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc))) { | ||
1736 | ret = -EDQUOT; | ||
1737 | goto out_commit; | ||
1738 | } | ||
1753 | /* | 1739 | /* |
1754 | * We don't want this to fail in ocfs2_write_end(), so do it | 1740 | * We don't want this to fail in ocfs2_write_end(), so do it |
1755 | * here. | 1741 | * here. |
1756 | */ | 1742 | */ |
1757 | ret = ocfs2_journal_access(handle, inode, wc->w_di_bh, | 1743 | ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh, |
1758 | OCFS2_JOURNAL_ACCESS_WRITE); | 1744 | OCFS2_JOURNAL_ACCESS_WRITE); |
1759 | if (ret) { | 1745 | if (ret) { |
1760 | mlog_errno(ret); | 1746 | mlog_errno(ret); |
1761 | goto out_commit; | 1747 | goto out_quota; |
1762 | } | 1748 | } |
1763 | 1749 | ||
1764 | /* | 1750 | /* |
@@ -1771,14 +1757,14 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, | |||
1771 | mmap_page); | 1757 | mmap_page); |
1772 | if (ret) { | 1758 | if (ret) { |
1773 | mlog_errno(ret); | 1759 | mlog_errno(ret); |
1774 | goto out_commit; | 1760 | goto out_quota; |
1775 | } | 1761 | } |
1776 | 1762 | ||
1777 | ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos, | 1763 | ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos, |
1778 | len); | 1764 | len); |
1779 | if (ret) { | 1765 | if (ret) { |
1780 | mlog_errno(ret); | 1766 | mlog_errno(ret); |
1781 | goto out_commit; | 1767 | goto out_quota; |
1782 | } | 1768 | } |
1783 | 1769 | ||
1784 | if (data_ac) | 1770 | if (data_ac) |
@@ -1790,6 +1776,10 @@ success: | |||
1790 | *pagep = wc->w_target_page; | 1776 | *pagep = wc->w_target_page; |
1791 | *fsdata = wc; | 1777 | *fsdata = wc; |
1792 | return 0; | 1778 | return 0; |
1779 | out_quota: | ||
1780 | if (clusters_to_alloc) | ||
1781 | vfs_dq_free_space(inode, | ||
1782 | ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc)); | ||
1793 | out_commit: | 1783 | out_commit: |
1794 | ocfs2_commit_trans(osb, handle); | 1784 | ocfs2_commit_trans(osb, handle); |
1795 | 1785 | ||
@@ -1919,15 +1909,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping, | |||
1919 | } | 1909 | } |
1920 | 1910 | ||
1921 | if (page_has_buffers(tmppage)) { | 1911 | if (page_has_buffers(tmppage)) { |
1922 | if (ocfs2_should_order_data(inode)) { | 1912 | if (ocfs2_should_order_data(inode)) |
1923 | ocfs2_jbd2_file_inode(wc->w_handle, inode); | 1913 | ocfs2_jbd2_file_inode(wc->w_handle, inode); |
1924 | #ifdef CONFIG_OCFS2_COMPAT_JBD | ||
1925 | walk_page_buffers(wc->w_handle, | ||
1926 | page_buffers(tmppage), | ||
1927 | from, to, NULL, | ||
1928 | ocfs2_journal_dirty_data); | ||
1929 | #endif | ||
1930 | } | ||
1931 | block_commit_write(tmppage, from, to); | 1914 | block_commit_write(tmppage, from, to); |
1932 | } | 1915 | } |
1933 | } | 1916 | } |
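The write-path hunks above add a second unwind label: the quota reservation is made right after the handle is started, and every later failure jumps to out_quota so the reservation is dropped before the transaction commits. A condensed sketch of that control flow, lifted from the hunks (all names come from ocfs2_write_begin_nolock() as shown in the diff):

        if (clusters_to_alloc && vfs_dq_alloc_space_nodirty(inode,
                        ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc))) {
                ret = -EDQUOT;
                goto out_commit;        /* nothing was reserved */
        }

        ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret)
                goto out_quota;         /* release the reservation first */

        /* ... grab pages and write out the clusters ... */

        out_quota:
                if (clusters_to_alloc)
                        vfs_dq_free_space(inode,
                                ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
        out_commit:
                ocfs2_commit_trans(osb, handle);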
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c new file mode 100644 index 000000000000..2a947c44e594 --- /dev/null +++ b/fs/ocfs2/blockcheck.c | |||
@@ -0,0 +1,477 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * blockcheck.c | ||
5 | * | ||
6 | * Checksum and ECC codes for the OCFS2 userspace library. | ||
7 | * | ||
8 | * Copyright (C) 2006, 2008 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License, version 2, as published by the Free Software Foundation. | ||
13 | * | ||
14 | * This program is distributed in the hope that it will be useful, | ||
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
17 | * General Public License for more details. | ||
18 | */ | ||
19 | |||
20 | #include <linux/kernel.h> | ||
21 | #include <linux/types.h> | ||
22 | #include <linux/crc32.h> | ||
23 | #include <linux/buffer_head.h> | ||
24 | #include <linux/bitops.h> | ||
25 | #include <asm/byteorder.h> | ||
26 | |||
27 | #include <cluster/masklog.h> | ||
28 | |||
29 | #include "ocfs2.h" | ||
30 | |||
31 | #include "blockcheck.h" | ||
32 | |||
33 | |||
34 | /* | ||
35 | * We use the following conventions: | ||
36 | * | ||
37 | * d = # data bits | ||
38 | * p = # parity bits | ||
39 | * c = # total code bits (d + p) | ||
40 | */ | ||
41 | |||
42 | |||
43 | /* | ||
44 | * Calculate the bit offset in the hamming code buffer based on the bit's | ||
45 | * offset in the data buffer. Since the hamming code reserves all | ||
46 | * power-of-two bits for parity, the data bit number and the code bit | ||
47 | * number are offset by all the parity bits beforehand. | ||
48 | * | ||
49 | * Recall that bit numbers in hamming code are 1-based. This function | ||
50 | * takes the 0-based data bit from the caller. | ||
51 | * | ||
52 | * An example. Take bit 1 of the data buffer. 1 is a power of two (2^0), | ||
53 | * so it's a parity bit. 2 is a power of two (2^1), so it's a parity bit. | ||
54 | * 3 is not a power of two. So bit 1 of the data buffer ends up as bit 3 | ||
55 | * in the code buffer. | ||
56 | * | ||
57 | * The caller can pass in *p if it wants to keep track of the most recent | ||
58 | * number of parity bits added. This allows the function to start the | ||
59 | * calculation at the last place. | ||
60 | */ | ||
61 | static unsigned int calc_code_bit(unsigned int i, unsigned int *p_cache) | ||
62 | { | ||
63 | unsigned int b, p = 0; | ||
64 | |||
65 | /* | ||
66 | * Data bits are 0-based, but we're talking code bits, which | ||
67 | * are 1-based. | ||
68 | */ | ||
69 | b = i + 1; | ||
70 | |||
71 | /* Use the cache if it is there */ | ||
72 | if (p_cache) | ||
73 | p = *p_cache; | ||
74 | b += p; | ||
75 | |||
76 | /* | ||
77 | * For every power of two below our bit number, bump our bit. | ||
78 | * | ||
79 | * We compare with (b + 1) because we have to compare with what b | ||
80 | * would be _if_ it were bumped up by the parity bit. Capice? | ||
81 | * | ||
82 | * p is set above. | ||
83 | */ | ||
84 | for (; (1 << p) < (b + 1); p++) | ||
85 | b++; | ||
86 | |||
87 | if (p_cache) | ||
88 | *p_cache = p; | ||
89 | |||
90 | return b; | ||
91 | } | ||
92 | |||
93 | /* | ||
94 | * This is the low level encoder function. It can be called across | ||
95 | * multiple hunks just like the crc32 code. 'd' is the number of bits | ||
96 | * _in_this_hunk_. nr is the bit offset of this hunk. So, if you had | ||
97 | * two 512B buffers, you would do it like so: | ||
98 | * | ||
99 | * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0); | ||
100 | * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8); | ||
101 | * | ||
102 | * If you just have one buffer, use ocfs2_hamming_encode_block(). | ||
103 | */ | ||
104 | u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, unsigned int nr) | ||
105 | { | ||
106 | unsigned int i, b, p = 0; | ||
107 | |||
108 | BUG_ON(!d); | ||
109 | |||
110 | /* | ||
111 | * b is the hamming code bit number. Hamming code specifies a | ||
112 | * 1-based array, but C uses 0-based. So 'i' is for C, and 'b' is | ||
113 | * for the algorithm. | ||
114 | * | ||
115 | * The i++ in the for loop is so that the start offset passed | ||
116 | * to ocfs2_find_next_bit_set() is one greater than the previously | ||
117 | * found bit. | ||
118 | */ | ||
119 | for (i = 0; (i = ocfs2_find_next_bit(data, d, i)) < d; i++) | ||
120 | { | ||
121 | /* | ||
122 | * i is the offset in this hunk, nr + i is the total bit | ||
123 | * offset. | ||
124 | */ | ||
125 | b = calc_code_bit(nr + i, &p); | ||
126 | |||
127 | /* | ||
128 | * Data bits in the resultant code are checked by | ||
129 | * parity bits that are part of the bit number | ||
130 | * representation. Huh? | ||
131 | * | ||
132 | * <wikipedia href="http://en.wikipedia.org/wiki/Hamming_code"> | ||
133 | * In other words, the parity bit at position 2^k | ||
134 | * checks bits in positions having bit k set in | ||
135 | * their binary representation. Conversely, for | ||
136 | * instance, bit 13, i.e. 1101(2), is checked by | ||
137 | * bits 1000(2) = 8, 0100(2)=4 and 0001(2) = 1. | ||
138 | * </wikipedia> | ||
139 | * | ||
140 | * Note that 'k' is the _code_ bit number. 'b' in | ||
141 | * our loop. | ||
142 | */ | ||
143 | parity ^= b; | ||
144 | } | ||
145 | |||
146 | /* While the data buffer was treated as little endian, the | ||
147 | * return value is in host endian. */ | ||
148 | return parity; | ||
149 | } | ||
150 | |||
151 | u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize) | ||
152 | { | ||
153 | return ocfs2_hamming_encode(0, data, blocksize * 8, 0); | ||
154 | } | ||
155 | |||
156 | /* | ||
157 | * Like ocfs2_hamming_encode(), this can handle hunks. nr is the bit | ||
158 | * offset of the current hunk. If bit to be fixed is not part of the | ||
159 | * current hunk, this does nothing. | ||
160 | * | ||
161 | * If you only have one hunk, use ocfs2_hamming_fix_block(). | ||
162 | */ | ||
163 | void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr, | ||
164 | unsigned int fix) | ||
165 | { | ||
166 | unsigned int i, b; | ||
167 | |||
168 | BUG_ON(!d); | ||
169 | |||
170 | /* | ||
171 | * If the bit to fix has an hweight of 1, it's a parity bit. One | ||
172 | * busted parity bit is its own error. Nothing to do here. | ||
173 | */ | ||
174 | if (hweight32(fix) == 1) | ||
175 | return; | ||
176 | |||
177 | /* | ||
178 | * nr + d is the bit right past the data hunk we're looking at. | ||
179 | * If the fix is after that, nothing to do | ||
180 | */ | ||
181 | if (fix >= calc_code_bit(nr + d, NULL)) | ||
182 | return; | ||
183 | |||
184 | /* | ||
185 | * nr is the offset in the data hunk we're starting at. Let's | ||
186 | * start b at the offset in the code buffer. See hamming_encode() | ||
187 | * for a more detailed description of 'b'. | ||
188 | */ | ||
189 | b = calc_code_bit(nr, NULL); | ||
190 | /* If the fix is before this hunk, nothing to do */ | ||
191 | if (fix < b) | ||
192 | return; | ||
193 | |||
194 | for (i = 0; i < d; i++, b++) | ||
195 | { | ||
196 | /* Skip past parity bits */ | ||
197 | while (hweight32(b) == 1) | ||
198 | b++; | ||
199 | |||
200 | /* | ||
201 | * i is the offset in this data hunk. | ||
202 | * nr + i is the offset in the total data buffer. | ||
203 | * b is the offset in the total code buffer. | ||
204 | * | ||
205 | * Thus, when b == fix, bit i in the current hunk needs | ||
206 | * fixing. | ||
207 | */ | ||
208 | if (b == fix) | ||
209 | { | ||
210 | if (ocfs2_test_bit(i, data)) | ||
211 | ocfs2_clear_bit(i, data); | ||
212 | else | ||
213 | ocfs2_set_bit(i, data); | ||
214 | break; | ||
215 | } | ||
216 | } | ||
217 | } | ||
218 | |||
219 | void ocfs2_hamming_fix_block(void *data, unsigned int blocksize, | ||
220 | unsigned int fix) | ||
221 | { | ||
222 | ocfs2_hamming_fix(data, blocksize * 8, 0, fix); | ||
223 | } | ||
224 | |||
225 | /* | ||
226 | * This function generates check information for a block. | ||
227 | * data is the block to be checked. bc is a pointer to the | ||
228 | * ocfs2_block_check structure describing the crc32 and the ecc. | ||
229 | * | ||
230 | * bc should be a pointer inside data, as the function will | ||
231 | * take care of zeroing it before calculating the check information. If | ||
232 | * bc does not point inside data, the caller must make sure any inline | ||
233 | * ocfs2_block_check structures are zeroed. | ||
234 | * | ||
235 | * The data buffer must be in on-disk endian (little endian for ocfs2). | ||
236 | * bc will be filled with little-endian values and will be ready to go to | ||
237 | * disk. | ||
238 | */ | ||
239 | void ocfs2_block_check_compute(void *data, size_t blocksize, | ||
240 | struct ocfs2_block_check *bc) | ||
241 | { | ||
242 | u32 crc; | ||
243 | u32 ecc; | ||
244 | |||
245 | memset(bc, 0, sizeof(struct ocfs2_block_check)); | ||
246 | |||
247 | crc = crc32_le(~0, data, blocksize); | ||
248 | ecc = ocfs2_hamming_encode_block(data, blocksize); | ||
249 | |||
250 | /* | ||
251 | * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no | ||
252 | * larger than 16 bits. | ||
253 | */ | ||
254 | BUG_ON(ecc > USHORT_MAX); | ||
255 | |||
256 | bc->bc_crc32e = cpu_to_le32(crc); | ||
257 | bc->bc_ecc = cpu_to_le16((u16)ecc); | ||
258 | } | ||
259 | |||
260 | /* | ||
261 | * This function validates existing check information. Like _compute, | ||
262 | * the function will take care of zeroing bc before calculating check codes. | ||
263 | * If bc is not a pointer inside data, the caller must have zeroed any | ||
264 | * inline ocfs2_block_check structures. | ||
265 | * | ||
266 | * Again, the data passed in should be the on-disk endian. | ||
267 | */ | ||
268 | int ocfs2_block_check_validate(void *data, size_t blocksize, | ||
269 | struct ocfs2_block_check *bc) | ||
270 | { | ||
271 | int rc = 0; | ||
272 | struct ocfs2_block_check check; | ||
273 | u32 crc, ecc; | ||
274 | |||
275 | check.bc_crc32e = le32_to_cpu(bc->bc_crc32e); | ||
276 | check.bc_ecc = le16_to_cpu(bc->bc_ecc); | ||
277 | |||
278 | memset(bc, 0, sizeof(struct ocfs2_block_check)); | ||
279 | |||
280 | /* Fast path - if the crc32 validates, we're good to go */ | ||
281 | crc = crc32_le(~0, data, blocksize); | ||
282 | if (crc == check.bc_crc32e) | ||
283 | goto out; | ||
284 | |||
285 | mlog(ML_ERROR, | ||
286 | "CRC32 failed: stored: %u, computed %u. Applying ECC.\n", | ||
287 | (unsigned int)check.bc_crc32e, (unsigned int)crc); | ||
288 | |||
289 | /* Ok, try ECC fixups */ | ||
290 | ecc = ocfs2_hamming_encode_block(data, blocksize); | ||
291 | ocfs2_hamming_fix_block(data, blocksize, ecc ^ check.bc_ecc); | ||
292 | |||
293 | /* And check the crc32 again */ | ||
294 | crc = crc32_le(~0, data, blocksize); | ||
295 | if (crc == check.bc_crc32e) | ||
296 | goto out; | ||
297 | |||
298 | mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n", | ||
299 | (unsigned int)check.bc_crc32e, (unsigned int)crc); | ||
300 | |||
301 | rc = -EIO; | ||
302 | |||
303 | out: | ||
304 | bc->bc_crc32e = cpu_to_le32(check.bc_crc32e); | ||
305 | bc->bc_ecc = cpu_to_le16(check.bc_ecc); | ||
306 | |||
307 | return rc; | ||
308 | } | ||
309 | |||
310 | /* | ||
311 | * This function generates check information for a list of buffer_heads. | ||
312 | * bhs is the blocks to be checked. bc is a pointer to the | ||
313 | * ocfs2_block_check structure describing the crc32 and the ecc. | ||
314 | * | ||
315 | * bc should be a pointer inside data, as the function will | ||
316 | * take care of zeroing it before calculating the check information. If | ||
317 | * bc does not point inside data, the caller must make sure any inline | ||
318 | * ocfs2_block_check structures are zeroed. | ||
319 | * | ||
320 | * The data buffer must be in on-disk endian (little endian for ocfs2). | ||
321 | * bc will be filled with little-endian values and will be ready to go to | ||
322 | * disk. | ||
323 | */ | ||
324 | void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr, | ||
325 | struct ocfs2_block_check *bc) | ||
326 | { | ||
327 | int i; | ||
328 | u32 crc, ecc; | ||
329 | |||
330 | BUG_ON(nr < 0); | ||
331 | |||
332 | if (!nr) | ||
333 | return; | ||
334 | |||
335 | memset(bc, 0, sizeof(struct ocfs2_block_check)); | ||
336 | |||
337 | for (i = 0, crc = ~0, ecc = 0; i < nr; i++) { | ||
338 | crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size); | ||
339 | /* | ||
340 | * The number of bits in a buffer is obviously b_size*8. | ||
341 | * The offset of this buffer is b_size*i, so the bit offset | ||
342 | * of this buffer is b_size*8*i. | ||
343 | */ | ||
344 | ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data, | ||
345 | bhs[i]->b_size * 8, | ||
346 | bhs[i]->b_size * 8 * i); | ||
347 | } | ||
348 | |||
349 | /* | ||
350 | * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no | ||
351 | * larger than 16 bits. | ||
352 | */ | ||
353 | BUG_ON(ecc > USHORT_MAX); | ||
354 | |||
355 | bc->bc_crc32e = cpu_to_le32(crc); | ||
356 | bc->bc_ecc = cpu_to_le16((u16)ecc); | ||
357 | } | ||
358 | |||
359 | /* | ||
360 | * This function validates existing check information on a list of | ||
361 | * buffer_heads. Like _compute_bhs, the function will take care of | ||
362 | * zeroing bc before calculating check codes. If bc is not a pointer | ||
363 | * inside data, the caller must have zeroed any inline | ||
364 | * ocfs2_block_check structures. | ||
365 | * | ||
366 | * Again, the data passed in should be the on-disk endian. | ||
367 | */ | ||
368 | int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr, | ||
369 | struct ocfs2_block_check *bc) | ||
370 | { | ||
371 | int i, rc = 0; | ||
372 | struct ocfs2_block_check check; | ||
373 | u32 crc, ecc, fix; | ||
374 | |||
375 | BUG_ON(nr < 0); | ||
376 | |||
377 | if (!nr) | ||
378 | return 0; | ||
379 | |||
380 | check.bc_crc32e = le32_to_cpu(bc->bc_crc32e); | ||
381 | check.bc_ecc = le16_to_cpu(bc->bc_ecc); | ||
382 | |||
383 | memset(bc, 0, sizeof(struct ocfs2_block_check)); | ||
384 | |||
385 | /* Fast path - if the crc32 validates, we're good to go */ | ||
386 | for (i = 0, crc = ~0; i < nr; i++) | ||
387 | crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size); | ||
388 | if (crc == check.bc_crc32e) | ||
389 | goto out; | ||
390 | |||
391 | mlog(ML_ERROR, | ||
392 | "CRC32 failed: stored: %u, computed %u. Applying ECC.\n", | ||
393 | (unsigned int)check.bc_crc32e, (unsigned int)crc); | ||
394 | |||
395 | /* Ok, try ECC fixups */ | ||
396 | for (i = 0, ecc = 0; i < nr; i++) { | ||
397 | /* | ||
398 | * The number of bits in a buffer is obviously b_size*8. | ||
399 | * The offset of this buffer is b_size*i, so the bit offset | ||
400 | * of this buffer is b_size*8*i. | ||
401 | */ | ||
402 | ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data, | ||
403 | bhs[i]->b_size * 8, | ||
404 | bhs[i]->b_size * 8 * i); | ||
405 | } | ||
406 | fix = ecc ^ check.bc_ecc; | ||
407 | for (i = 0; i < nr; i++) { | ||
408 | /* | ||
409 | * Try the fix against each buffer. It will only affect | ||
410 | * one of them. | ||
411 | */ | ||
412 | ocfs2_hamming_fix(bhs[i]->b_data, bhs[i]->b_size * 8, | ||
413 | bhs[i]->b_size * 8 * i, fix); | ||
414 | } | ||
415 | |||
416 | /* And check the crc32 again */ | ||
417 | for (i = 0, crc = ~0; i < nr; i++) | ||
418 | crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size); | ||
419 | if (crc == check.bc_crc32e) | ||
420 | goto out; | ||
421 | |||
422 | mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n", | ||
423 | (unsigned int)check.bc_crc32e, (unsigned int)crc); | ||
424 | |||
425 | rc = -EIO; | ||
426 | |||
427 | out: | ||
428 | bc->bc_crc32e = cpu_to_le32(check.bc_crc32e); | ||
429 | bc->bc_ecc = cpu_to_le16(check.bc_ecc); | ||
430 | |||
431 | return rc; | ||
432 | } | ||
433 | |||
434 | /* | ||
435 | * These are the main API. They check the superblock flag before | ||
436 | * calling the underlying operations. | ||
437 | * | ||
438 | * They expect the buffer(s) to be in disk format. | ||
439 | */ | ||
440 | void ocfs2_compute_meta_ecc(struct super_block *sb, void *data, | ||
441 | struct ocfs2_block_check *bc) | ||
442 | { | ||
443 | if (ocfs2_meta_ecc(OCFS2_SB(sb))) | ||
444 | ocfs2_block_check_compute(data, sb->s_blocksize, bc); | ||
445 | } | ||
446 | |||
447 | int ocfs2_validate_meta_ecc(struct super_block *sb, void *data, | ||
448 | struct ocfs2_block_check *bc) | ||
449 | { | ||
450 | int rc = 0; | ||
451 | |||
452 | if (ocfs2_meta_ecc(OCFS2_SB(sb))) | ||
453 | rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc); | ||
454 | |||
455 | return rc; | ||
456 | } | ||
457 | |||
458 | void ocfs2_compute_meta_ecc_bhs(struct super_block *sb, | ||
459 | struct buffer_head **bhs, int nr, | ||
460 | struct ocfs2_block_check *bc) | ||
461 | { | ||
462 | if (ocfs2_meta_ecc(OCFS2_SB(sb))) | ||
463 | ocfs2_block_check_compute_bhs(bhs, nr, bc); | ||
464 | } | ||
465 | |||
466 | int ocfs2_validate_meta_ecc_bhs(struct super_block *sb, | ||
467 | struct buffer_head **bhs, int nr, | ||
468 | struct ocfs2_block_check *bc) | ||
469 | { | ||
470 | int rc = 0; | ||
471 | |||
472 | if (ocfs2_meta_ecc(OCFS2_SB(sb))) | ||
473 | rc = ocfs2_block_check_validate_bhs(bhs, nr, bc); | ||
474 | |||
475 | return rc; | ||
476 | } | ||
477 | |||
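The data-bit to code-bit mapping that calc_code_bit() implements is easier to see with concrete numbers: code positions 1, 2, 4, 8, ... are reserved for parity, so data bit 0 lands at code bit 3, data bit 1 at 5, and so on. The standalone program below (not ocfs2 code, plain userspace C) reproduces that mapping without the parity-count cache:

        /* hamming_map.c - build with: cc -o hamming_map hamming_map.c */
        #include <stdio.h>

        /* Same mapping as calc_code_bit() above, minus the *p_cache shortcut. */
        static unsigned int code_bit(unsigned int i)
        {
                unsigned int b = i + 1; /* data bits 0-based, code bits 1-based */
                unsigned int p;

                for (p = 0; (1u << p) < (b + 1); p++)
                        b++;            /* skip one slot per parity bit below us */

                return b;
        }

        int main(void)
        {
                unsigned int i;

                /* Prints: 0->3, 1->5, 2->6, 3->7, 4->9, 5->10, 6->11, 7->12 */
                for (i = 0; i < 8; i++)
                        printf("data bit %u -> code bit %u\n", i, code_bit(i));
                return 0;
        }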
diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h new file mode 100644 index 000000000000..70ec3feda32f --- /dev/null +++ b/fs/ocfs2/blockcheck.h | |||
@@ -0,0 +1,82 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * blockcheck.h | ||
5 | * | ||
6 | * Checksum and ECC codes for the OCFS2 userspace library. | ||
7 | * | ||
8 | * Copyright (C) 2004, 2008 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License, version 2, as published by the Free Software Foundation. | ||
13 | * | ||
14 | * This program is distributed in the hope that it will be useful, | ||
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
17 | * General Public License for more details. | ||
18 | */ | ||
19 | |||
20 | #ifndef OCFS2_BLOCKCHECK_H | ||
21 | #define OCFS2_BLOCKCHECK_H | ||
22 | |||
23 | |||
24 | /* High level block API */ | ||
25 | void ocfs2_compute_meta_ecc(struct super_block *sb, void *data, | ||
26 | struct ocfs2_block_check *bc); | ||
27 | int ocfs2_validate_meta_ecc(struct super_block *sb, void *data, | ||
28 | struct ocfs2_block_check *bc); | ||
29 | void ocfs2_compute_meta_ecc_bhs(struct super_block *sb, | ||
30 | struct buffer_head **bhs, int nr, | ||
31 | struct ocfs2_block_check *bc); | ||
32 | int ocfs2_validate_meta_ecc_bhs(struct super_block *sb, | ||
33 | struct buffer_head **bhs, int nr, | ||
34 | struct ocfs2_block_check *bc); | ||
35 | |||
36 | /* Lower level API */ | ||
37 | void ocfs2_block_check_compute(void *data, size_t blocksize, | ||
38 | struct ocfs2_block_check *bc); | ||
39 | int ocfs2_block_check_validate(void *data, size_t blocksize, | ||
40 | struct ocfs2_block_check *bc); | ||
41 | void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr, | ||
42 | struct ocfs2_block_check *bc); | ||
43 | int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr, | ||
44 | struct ocfs2_block_check *bc); | ||
45 | |||
46 | /* | ||
47 | * Hamming code functions | ||
48 | */ | ||
49 | |||
50 | /* | ||
51 | * Encoding hamming code parity bits for a buffer. | ||
52 | * | ||
53 | * This is the low level encoder function. It can be called across | ||
54 | * multiple hunks just like the crc32 code. 'd' is the number of bits | ||
55 | * _in_this_hunk_. nr is the bit offset of this hunk. So, if you had | ||
56 | * two 512B buffers, you would do it like so: | ||
57 | * | ||
58 | * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0); | ||
59 | * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8); | ||
60 | * | ||
61 | * If you just have one buffer, use ocfs2_hamming_encode_block(). | ||
62 | */ | ||
63 | u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, | ||
64 | unsigned int nr); | ||
65 | /* | ||
66 | * Fix a buffer with a bit error. The 'fix' is the original parity | ||
67 | * xor'd with the parity calculated now. | ||
68 | * | ||
69 | * Like ocfs2_hamming_encode(), this can handle hunks. nr is the bit | ||
70 | * offset of the current hunk. If bit to be fixed is not part of the | ||
71 | * current hunk, this does nothing. | ||
72 | * | ||
73 | * If you only have one buffer, use ocfs2_hamming_fix_block(). | ||
74 | */ | ||
75 | void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr, | ||
76 | unsigned int fix); | ||
77 | |||
78 | /* Convenience wrappers for a single buffer of data */ | ||
79 | extern u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize); | ||
80 | extern void ocfs2_hamming_fix_block(void *data, unsigned int blocksize, | ||
81 | unsigned int fix); | ||
82 | #endif | ||
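A short usage sketch of the high-level API declared above: compute the check before a metadata buffer is written, and validate it in the read-side callback. The struct is hypothetical; in ocfs2 the ocfs2_block_check member is embedded in each on-disk metadata structure.

        struct my_meta_block {                  /* hypothetical layout */
                __le64                   mb_blkno;
                struct ocfs2_block_check mb_check;
                /* ... remainder of the block ... */
        };

        /* Just before the buffer goes to the journal/disk: */
        static void my_prepare_write(struct super_block *sb, struct buffer_head *bh)
        {
                struct my_meta_block *mb = (struct my_meta_block *)bh->b_data;

                ocfs2_compute_meta_ecc(sb, bh->b_data, &mb->mb_check);
        }

        /* In the read-side validate callback: */
        static int my_validate(struct super_block *sb, struct buffer_head *bh)
        {
                struct my_meta_block *mb = (struct my_meta_block *)bh->b_data;

                /* Returns -EIO only if the crc32 still fails after the ECC fix. */
                return ocfs2_validate_meta_ecc(sb, bh->b_data, &mb->mb_check);
        }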
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 3a178ec48d7c..15c8e6deee2e 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c | |||
@@ -39,6 +39,18 @@ | |||
39 | 39 | ||
40 | #include "buffer_head_io.h" | 40 | #include "buffer_head_io.h" |
41 | 41 | ||
42 | /* | ||
43 | * Bits on bh->b_state used by ocfs2. | ||
44 | * | ||
45 | * These MUST be after the JBD2 bits. Hence, we use BH_JBDPrivateStart. | ||
46 | */ | ||
47 | enum ocfs2_state_bits { | ||
48 | BH_NeedsValidate = BH_JBDPrivateStart, | ||
49 | }; | ||
50 | |||
51 | /* Expand the magic b_state functions */ | ||
52 | BUFFER_FNS(NeedsValidate, needs_validate); | ||
53 | |||
42 | int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, | 54 | int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, |
43 | struct inode *inode) | 55 | struct inode *inode) |
44 | { | 56 | { |
@@ -166,7 +178,9 @@ bail: | |||
166 | } | 178 | } |
167 | 179 | ||
168 | int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, | 180 | int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, |
169 | struct buffer_head *bhs[], int flags) | 181 | struct buffer_head *bhs[], int flags, |
182 | int (*validate)(struct super_block *sb, | ||
183 | struct buffer_head *bh)) | ||
170 | { | 184 | { |
171 | int status = 0; | 185 | int status = 0; |
172 | int i, ignore_cache = 0; | 186 | int i, ignore_cache = 0; |
@@ -298,6 +312,8 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, | |||
298 | 312 | ||
299 | clear_buffer_uptodate(bh); | 313 | clear_buffer_uptodate(bh); |
300 | get_bh(bh); /* for end_buffer_read_sync() */ | 314 | get_bh(bh); /* for end_buffer_read_sync() */ |
315 | if (validate) | ||
316 | set_buffer_needs_validate(bh); | ||
301 | bh->b_end_io = end_buffer_read_sync; | 317 | bh->b_end_io = end_buffer_read_sync; |
302 | submit_bh(READ, bh); | 318 | submit_bh(READ, bh); |
303 | continue; | 319 | continue; |
@@ -328,6 +344,20 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, | |||
328 | bhs[i] = NULL; | 344 | bhs[i] = NULL; |
329 | continue; | 345 | continue; |
330 | } | 346 | } |
347 | |||
348 | if (buffer_needs_validate(bh)) { | ||
349 | /* We never set NeedsValidate if the | ||
350 | * buffer was held by the journal, so | ||
351 | * that better not have changed */ | ||
352 | BUG_ON(buffer_jbd(bh)); | ||
353 | clear_buffer_needs_validate(bh); | ||
354 | status = validate(inode->i_sb, bh); | ||
355 | if (status) { | ||
356 | put_bh(bh); | ||
357 | bhs[i] = NULL; | ||
358 | continue; | ||
359 | } | ||
360 | } | ||
331 | } | 361 | } |
332 | 362 | ||
333 | /* Always set the buffer in the cache, even if it was | 363 | /* Always set the buffer in the cache, even if it was |
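The NeedsValidate bit above is what makes the validate callback fire only for buffers that were actually submitted to disk; cache hits skip it. A caller typically wraps ocfs2_read_blocks() like the sketch below, mirroring the reader helpers elsewhere in this series (ocfs2_validate_my_block() is hypothetical):

        static int ocfs2_read_my_block(struct inode *inode, u64 blkno,
                                       struct buffer_head **bh)
        {
                int rc;
                struct buffer_head *tmp = *bh;

                rc = ocfs2_read_blocks(inode, blkno, 1, &tmp, 0,
                                       ocfs2_validate_my_block);

                /* If ocfs2_read_blocks() allocated a new bh, hand it back. */
                if (!rc && !*bh)
                        *bh = tmp;

                return rc;
        }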
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h index 75e1dcb1ade7..c75d682dadd8 100644 --- a/fs/ocfs2/buffer_head_io.h +++ b/fs/ocfs2/buffer_head_io.h | |||
@@ -31,21 +31,24 @@ | |||
31 | void ocfs2_end_buffer_io_sync(struct buffer_head *bh, | 31 | void ocfs2_end_buffer_io_sync(struct buffer_head *bh, |
32 | int uptodate); | 32 | int uptodate); |
33 | 33 | ||
34 | static inline int ocfs2_read_block(struct inode *inode, | ||
35 | u64 off, | ||
36 | struct buffer_head **bh); | ||
37 | |||
38 | int ocfs2_write_block(struct ocfs2_super *osb, | 34 | int ocfs2_write_block(struct ocfs2_super *osb, |
39 | struct buffer_head *bh, | 35 | struct buffer_head *bh, |
40 | struct inode *inode); | 36 | struct inode *inode); |
41 | int ocfs2_read_blocks(struct inode *inode, | ||
42 | u64 block, | ||
43 | int nr, | ||
44 | struct buffer_head *bhs[], | ||
45 | int flags); | ||
46 | int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, | 37 | int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, |
47 | unsigned int nr, struct buffer_head *bhs[]); | 38 | unsigned int nr, struct buffer_head *bhs[]); |
48 | 39 | ||
40 | /* | ||
41 | * If not NULL, validate() will be called on a buffer that is freshly | ||
42 | * read from disk. It will not be called if the buffer was in cache. | ||
43 | * Note that if validate() is being used for this buffer, it needs to | ||
44 | * be set even for a READAHEAD call, as it marks the buffer for later | ||
45 | * validation. | ||
46 | */ | ||
47 | int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, | ||
48 | struct buffer_head *bhs[], int flags, | ||
49 | int (*validate)(struct super_block *sb, | ||
50 | struct buffer_head *bh)); | ||
51 | |||
49 | int ocfs2_write_super_or_backup(struct ocfs2_super *osb, | 52 | int ocfs2_write_super_or_backup(struct ocfs2_super *osb, |
50 | struct buffer_head *bh); | 53 | struct buffer_head *bh); |
51 | 54 | ||
@@ -53,7 +56,9 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb, | |||
53 | #define OCFS2_BH_READAHEAD 8 | 56 | #define OCFS2_BH_READAHEAD 8 |
54 | 57 | ||
55 | static inline int ocfs2_read_block(struct inode *inode, u64 off, | 58 | static inline int ocfs2_read_block(struct inode *inode, u64 off, |
56 | struct buffer_head **bh) | 59 | struct buffer_head **bh, |
60 | int (*validate)(struct super_block *sb, | ||
61 | struct buffer_head *bh)) | ||
57 | { | 62 | { |
58 | int status = 0; | 63 | int status = 0; |
59 | 64 | ||
@@ -63,7 +68,7 @@ static inline int ocfs2_read_block(struct inode *inode, u64 off, | |||
63 | goto bail; | 68 | goto bail; |
64 | } | 69 | } |
65 | 70 | ||
66 | status = ocfs2_read_blocks(inode, off, 1, bh, 0); | 71 | status = ocfs2_read_blocks(inode, off, 1, bh, 0, validate); |
67 | 72 | ||
68 | bail: | 73 | bail: |
69 | return status; | 74 | return status; |
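One consequence of the comment above: the validate function has to be supplied even on a readahead call, because the buffer is only flagged for later validation when the I/O is submitted. A minimal sketch of that pairing, with my_validate(), blkno and inode assumed from the caller:

        int ret;
        struct buffer_head *bh = NULL;

        /* Kick the read off early; nothing is validated yet. */
        ocfs2_read_blocks(inode, blkno, 1, &bh, OCFS2_BH_READAHEAD, my_validate);

        /* ... later, the blocking read validates the freshly read buffer ... */
        ret = ocfs2_read_block(inode, blkno, &bh, my_validate);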
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 6ebaa58e2c03..04697ba7f73e 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c | |||
@@ -854,7 +854,7 @@ static int o2hb_thread(void *data) | |||
854 | 854 | ||
855 | while (!kthread_should_stop() && !reg->hr_unclean_stop) { | 855 | while (!kthread_should_stop() && !reg->hr_unclean_stop) { |
856 | /* We track the time spent inside | 856 | /* We track the time spent inside |
857 | * o2hb_do_disk_heartbeat so that we avoid more then | 857 | * o2hb_do_disk_heartbeat so that we avoid more than |
858 | * hr_timeout_ms between disk writes. On busy systems | 858 | * hr_timeout_ms between disk writes. On busy systems |
859 | * this should result in a heartbeat which is less | 859 | * this should result in a heartbeat which is less |
860 | * likely to time itself out. */ | 860 | * likely to time itself out. */ |
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index d8a0cb92cef6..96df5416993e 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c | |||
@@ -110,6 +110,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { | |||
110 | define_mask(QUORUM), | 110 | define_mask(QUORUM), |
111 | define_mask(EXPORT), | 111 | define_mask(EXPORT), |
112 | define_mask(XATTR), | 112 | define_mask(XATTR), |
113 | define_mask(QUOTA), | ||
113 | define_mask(ERROR), | 114 | define_mask(ERROR), |
114 | define_mask(NOTICE), | 115 | define_mask(NOTICE), |
115 | define_mask(KTHREAD), | 116 | define_mask(KTHREAD), |
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 57670c680471..7e72a81bc2d4 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h | |||
@@ -113,6 +113,7 @@ | |||
113 | #define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */ | 113 | #define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */ |
114 | #define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */ | 114 | #define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */ |
115 | #define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ | 115 | #define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ |
116 | #define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */ | ||
116 | /* bits that are infrequently given and frequently matched in the high word */ | 117 | /* bits that are infrequently given and frequently matched in the high word */ |
117 | #define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ | 118 | #define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ |
118 | #define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ | 119 | #define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ |
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 026e6eb85187..f2c4098cf337 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c | |||
@@ -40,6 +40,7 @@ | |||
40 | #include <linux/types.h> | 40 | #include <linux/types.h> |
41 | #include <linux/slab.h> | 41 | #include <linux/slab.h> |
42 | #include <linux/highmem.h> | 42 | #include <linux/highmem.h> |
43 | #include <linux/quotaops.h> | ||
43 | 44 | ||
44 | #define MLOG_MASK_PREFIX ML_NAMEI | 45 | #define MLOG_MASK_PREFIX ML_NAMEI |
45 | #include <cluster/masklog.h> | 46 | #include <cluster/masklog.h> |
@@ -47,6 +48,7 @@ | |||
47 | #include "ocfs2.h" | 48 | #include "ocfs2.h" |
48 | 49 | ||
49 | #include "alloc.h" | 50 | #include "alloc.h" |
51 | #include "blockcheck.h" | ||
50 | #include "dir.h" | 52 | #include "dir.h" |
51 | #include "dlmglue.h" | 53 | #include "dlmglue.h" |
52 | #include "extent_map.h" | 54 | #include "extent_map.h" |
@@ -82,47 +84,72 @@ static int ocfs2_do_extend_dir(struct super_block *sb, | |||
82 | struct ocfs2_alloc_context *meta_ac, | 84 | struct ocfs2_alloc_context *meta_ac, |
83 | struct buffer_head **new_bh); | 85 | struct buffer_head **new_bh); |
84 | 86 | ||
85 | static struct buffer_head *ocfs2_bread(struct inode *inode, | 87 | /* |
86 | int block, int *err, int reada) | 88 | * These are distinct checks because future versions of the file system will |
89 | * want to have a trailing dirent structure independent of indexing. | ||
90 | */ | ||
91 | static int ocfs2_dir_has_trailer(struct inode *dir) | ||
87 | { | 92 | { |
88 | struct buffer_head *bh = NULL; | 93 | if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) |
89 | int tmperr; | 94 | return 0; |
90 | u64 p_blkno; | ||
91 | int readflags = 0; | ||
92 | 95 | ||
93 | if (reada) | 96 | return ocfs2_meta_ecc(OCFS2_SB(dir->i_sb)); |
94 | readflags |= OCFS2_BH_READAHEAD; | 97 | } |
95 | 98 | ||
96 | if (((u64)block << inode->i_sb->s_blocksize_bits) >= | 99 | static int ocfs2_supports_dir_trailer(struct ocfs2_super *osb) |
97 | i_size_read(inode)) { | 100 | { |
98 | BUG_ON(!reada); | 101 | return ocfs2_meta_ecc(osb); |
99 | return NULL; | 102 | } |
100 | } | ||
101 | 103 | ||
102 | down_read(&OCFS2_I(inode)->ip_alloc_sem); | 104 | static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb) |
103 | tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, | 105 | { |
104 | NULL); | 106 | return sb->s_blocksize - sizeof(struct ocfs2_dir_block_trailer); |
105 | up_read(&OCFS2_I(inode)->ip_alloc_sem); | 107 | } |
106 | if (tmperr < 0) { | ||
107 | mlog_errno(tmperr); | ||
108 | goto fail; | ||
109 | } | ||
110 | 108 | ||
111 | tmperr = ocfs2_read_blocks(inode, p_blkno, 1, &bh, readflags); | 109 | #define ocfs2_trailer_from_bh(_bh, _sb) ((struct ocfs2_dir_block_trailer *) ((_bh)->b_data + ocfs2_dir_trailer_blk_off((_sb)))) |
112 | if (tmperr < 0) | ||
113 | goto fail; | ||
114 | 110 | ||
115 | tmperr = 0; | 111 | /* XXX ocfs2_block_dqtrailer() is similar but not quite - can we make |
112 | * them more consistent? */ | ||
113 | struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize, | ||
114 | void *data) | ||
115 | { | ||
116 | char *p = data; | ||
116 | 117 | ||
117 | *err = 0; | 118 | p += blocksize - sizeof(struct ocfs2_dir_block_trailer); |
118 | return bh; | 119 | return (struct ocfs2_dir_block_trailer *)p; |
120 | } | ||
119 | 121 | ||
120 | fail: | 122 | /* |
121 | brelse(bh); | 123 | * XXX: This is executed once on every dirent. We should consider optimizing |
122 | bh = NULL; | 124 | * it. |
125 | */ | ||
126 | static int ocfs2_skip_dir_trailer(struct inode *dir, | ||
127 | struct ocfs2_dir_entry *de, | ||
128 | unsigned long offset, | ||
129 | unsigned long blklen) | ||
130 | { | ||
131 | unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer); | ||
123 | 132 | ||
124 | *err = -EIO; | 133 | if (!ocfs2_dir_has_trailer(dir)) |
125 | return NULL; | 134 | return 0; |
135 | |||
136 | if (offset != toff) | ||
137 | return 0; | ||
138 | |||
139 | return 1; | ||
140 | } | ||
141 | |||
142 | static void ocfs2_init_dir_trailer(struct inode *inode, | ||
143 | struct buffer_head *bh) | ||
144 | { | ||
145 | struct ocfs2_dir_block_trailer *trailer; | ||
146 | |||
147 | trailer = ocfs2_trailer_from_bh(bh, inode->i_sb); | ||
148 | strcpy(trailer->db_signature, OCFS2_DIR_TRAILER_SIGNATURE); | ||
149 | trailer->db_compat_rec_len = | ||
150 | cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer)); | ||
151 | trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno); | ||
152 | trailer->db_blkno = cpu_to_le64(bh->b_blocknr); | ||
126 | } | 153 | } |
127 | 154 | ||
128 | /* | 155 | /* |
@@ -231,7 +258,7 @@ static struct buffer_head *ocfs2_find_entry_id(const char *name, | |||
231 | struct ocfs2_dinode *di; | 258 | struct ocfs2_dinode *di; |
232 | struct ocfs2_inline_data *data; | 259 | struct ocfs2_inline_data *data; |
233 | 260 | ||
234 | ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh); | 261 | ret = ocfs2_read_inode_block(dir, &di_bh); |
235 | if (ret) { | 262 | if (ret) { |
236 | mlog_errno(ret); | 263 | mlog_errno(ret); |
237 | goto out; | 264 | goto out; |
@@ -250,6 +277,108 @@ out: | |||
250 | return NULL; | 277 | return NULL; |
251 | } | 278 | } |
252 | 279 | ||
280 | static int ocfs2_validate_dir_block(struct super_block *sb, | ||
281 | struct buffer_head *bh) | ||
282 | { | ||
283 | int rc; | ||
284 | struct ocfs2_dir_block_trailer *trailer = | ||
285 | ocfs2_trailer_from_bh(bh, sb); | ||
286 | |||
287 | |||
288 | /* | ||
289 | * We don't validate dirents here, that's handled | ||
290 | * in-place when the code walks them. | ||
291 | */ | ||
292 | mlog(0, "Validating dirblock %llu\n", | ||
293 | (unsigned long long)bh->b_blocknr); | ||
294 | |||
295 | BUG_ON(!buffer_uptodate(bh)); | ||
296 | |||
297 | /* | ||
298 | * If the ecc fails, we return the error but otherwise | ||
299 | * leave the filesystem running. We know any error is | ||
300 | * local to this block. | ||
301 | * | ||
302 | * Note that we are safe to call this even if the directory | ||
303 | * doesn't have a trailer. Filesystems without metaecc will do | ||
304 | * nothing, and filesystems with it will have one. | ||
305 | */ | ||
306 | rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &trailer->db_check); | ||
307 | if (rc) | ||
308 | mlog(ML_ERROR, "Checksum failed for dinode %llu\n", | ||
309 | (unsigned long long)bh->b_blocknr); | ||
310 | |||
311 | return rc; | ||
312 | } | ||
313 | |||
314 | /* | ||
315 | * This function forces all errors to -EIO for consistency with its | ||
316 | * predecessor, ocfs2_bread(). We haven't audited what returning the | ||
317 | * real error codes would do to callers. We log the real codes with | ||
318 | * mlog_errno() before we squash them. | ||
319 | */ | ||
320 | static int ocfs2_read_dir_block(struct inode *inode, u64 v_block, | ||
321 | struct buffer_head **bh, int flags) | ||
322 | { | ||
323 | int rc = 0; | ||
324 | struct buffer_head *tmp = *bh; | ||
325 | struct ocfs2_dir_block_trailer *trailer; | ||
326 | |||
327 | rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags, | ||
328 | ocfs2_validate_dir_block); | ||
329 | if (rc) { | ||
330 | mlog_errno(rc); | ||
331 | goto out; | ||
332 | } | ||
333 | |||
334 | /* | ||
335 | * We check the trailer here rather than in | ||
336 | * ocfs2_validate_dir_block() because that function doesn't have | ||
337 | * the inode to test. | ||
338 | */ | ||
339 | if (!(flags & OCFS2_BH_READAHEAD) && | ||
340 | ocfs2_dir_has_trailer(inode)) { | ||
341 | trailer = ocfs2_trailer_from_bh(tmp, inode->i_sb); | ||
342 | if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) { | ||
343 | rc = -EINVAL; | ||
344 | ocfs2_error(inode->i_sb, | ||
345 | "Invalid dirblock #%llu: " | ||
346 | "signature = %.*s\n", | ||
347 | (unsigned long long)tmp->b_blocknr, 7, | ||
348 | trailer->db_signature); | ||
349 | goto out; | ||
350 | } | ||
351 | if (le64_to_cpu(trailer->db_blkno) != tmp->b_blocknr) { | ||
352 | rc = -EINVAL; | ||
353 | ocfs2_error(inode->i_sb, | ||
354 | "Directory block #%llu has an invalid " | ||
355 | "db_blkno of %llu", | ||
356 | (unsigned long long)tmp->b_blocknr, | ||
357 | (unsigned long long)le64_to_cpu(trailer->db_blkno)); | ||
358 | goto out; | ||
359 | } | ||
360 | if (le64_to_cpu(trailer->db_parent_dinode) != | ||
361 | OCFS2_I(inode)->ip_blkno) { | ||
362 | rc = -EINVAL; | ||
363 | ocfs2_error(inode->i_sb, | ||
364 | "Directory block #%llu on dinode " | ||
365 | "#%llu has an invalid parent_dinode " | ||
366 | "of %llu", | ||
367 | (unsigned long long)tmp->b_blocknr, | ||
368 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
369 | (unsigned long long)le64_to_cpu(trailer->db_parent_dinode)); | ||
370 | goto out; | ||
371 | } | ||
372 | } | ||
373 | |||
374 | /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */ | ||
375 | if (!*bh) | ||
376 | *bh = tmp; | ||
377 | |||
378 | out: | ||
379 | return rc ? -EIO : 0; | ||
380 | } | ||
381 | |||
253 | static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen, | 382 | static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen, |
254 | struct inode *dir, | 383 | struct inode *dir, |
255 | struct ocfs2_dir_entry **res_dir) | 384 | struct ocfs2_dir_entry **res_dir) |
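For reference, the trailer introduced above always occupies the last sizeof(struct ocfs2_dir_block_trailer) bytes of a directory block; ocfs2_dir_trailer_blk_off() and ocfs2_trailer_from_bh() are two views of the same offset, and dirent walkers compare against it via ocfs2_skip_dir_trailer(). A small sketch, assuming dir, bh, de and offset come from a surrounding dirent walk inside dir.c:

        unsigned int toff = ocfs2_dir_trailer_blk_off(dir->i_sb);
        struct ocfs2_dir_block_trailer *trailer =
                ocfs2_trailer_from_bh(bh, dir->i_sb);

        /* Both point at the same tail-of-block bytes. */
        BUG_ON((char *)trailer != bh->b_data + toff);

        /* A walker must not treat the trailer as a real directory entry. */
        if (ocfs2_skip_dir_trailer(dir, de, offset, dir->i_sb->s_blocksize))
                mlog(0, "offset %lu is the trailer, not a dirent\n",
                     (unsigned long)offset);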
@@ -296,15 +425,17 @@ restart: | |||
296 | } | 425 | } |
297 | num++; | 426 | num++; |
298 | 427 | ||
299 | bh = ocfs2_bread(dir, b++, &err, 1); | 428 | bh = NULL; |
429 | err = ocfs2_read_dir_block(dir, b++, &bh, | ||
430 | OCFS2_BH_READAHEAD); | ||
300 | bh_use[ra_max] = bh; | 431 | bh_use[ra_max] = bh; |
301 | } | 432 | } |
302 | } | 433 | } |
303 | if ((bh = bh_use[ra_ptr++]) == NULL) | 434 | if ((bh = bh_use[ra_ptr++]) == NULL) |
304 | goto next; | 435 | goto next; |
305 | if (ocfs2_read_block(dir, block, &bh)) { | 436 | if (ocfs2_read_dir_block(dir, block, &bh, 0)) { |
306 | /* read error, skip block & hope for the best. | 437 | /* read error, skip block & hope for the best. |
307 | * ocfs2_read_block() has released the bh. */ | 438 | * ocfs2_read_dir_block() has released the bh. */ |
308 | ocfs2_error(dir->i_sb, "reading directory %llu, " | 439 | ocfs2_error(dir->i_sb, "reading directory %llu, " |
309 | "offset %lu\n", | 440 | "offset %lu\n", |
310 | (unsigned long long)OCFS2_I(dir)->ip_blkno, | 441 | (unsigned long long)OCFS2_I(dir)->ip_blkno, |
@@ -381,14 +512,18 @@ int ocfs2_update_entry(struct inode *dir, handle_t *handle, | |||
381 | struct inode *new_entry_inode) | 512 | struct inode *new_entry_inode) |
382 | { | 513 | { |
383 | int ret; | 514 | int ret; |
515 | ocfs2_journal_access_func access = ocfs2_journal_access_db; | ||
384 | 516 | ||
385 | /* | 517 | /* |
386 | * The same code works fine for both inline-data and extent | 518 | * The same code works fine for both inline-data and extent |
387 | * based directories, so no need to split this up. | 519 | * based directories, so no need to split this up. The only |
520 | * difference is the journal_access function. | ||
388 | */ | 521 | */ |
389 | 522 | ||
390 | ret = ocfs2_journal_access(handle, dir, de_bh, | 523 | if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) |
391 | OCFS2_JOURNAL_ACCESS_WRITE); | 524 | access = ocfs2_journal_access_di; |
525 | |||
526 | ret = access(handle, dir, de_bh, OCFS2_JOURNAL_ACCESS_WRITE); | ||
392 | if (ret) { | 527 | if (ret) { |
393 | mlog_errno(ret); | 528 | mlog_errno(ret); |
394 | goto out; | 529 | goto out; |
@@ -410,9 +545,13 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, | |||
410 | { | 545 | { |
411 | struct ocfs2_dir_entry *de, *pde; | 546 | struct ocfs2_dir_entry *de, *pde; |
412 | int i, status = -ENOENT; | 547 | int i, status = -ENOENT; |
548 | ocfs2_journal_access_func access = ocfs2_journal_access_db; | ||
413 | 549 | ||
414 | mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh); | 550 | mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh); |
415 | 551 | ||
552 | if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) | ||
553 | access = ocfs2_journal_access_di; | ||
554 | |||
416 | i = 0; | 555 | i = 0; |
417 | pde = NULL; | 556 | pde = NULL; |
418 | de = (struct ocfs2_dir_entry *) first_de; | 557 | de = (struct ocfs2_dir_entry *) first_de; |
@@ -423,8 +562,8 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, | |||
423 | goto bail; | 562 | goto bail; |
424 | } | 563 | } |
425 | if (de == de_del) { | 564 | if (de == de_del) { |
426 | status = ocfs2_journal_access(handle, dir, bh, | 565 | status = access(handle, dir, bh, |
427 | OCFS2_JOURNAL_ACCESS_WRITE); | 566 | OCFS2_JOURNAL_ACCESS_WRITE); |
428 | if (status < 0) { | 567 | if (status < 0) { |
429 | status = -EIO; | 568 | status = -EIO; |
430 | mlog_errno(status); | 569 | mlog_errno(status); |
@@ -458,7 +597,7 @@ static inline int ocfs2_delete_entry_id(handle_t *handle, | |||
458 | struct ocfs2_dinode *di; | 597 | struct ocfs2_dinode *di; |
459 | struct ocfs2_inline_data *data; | 598 | struct ocfs2_inline_data *data; |
460 | 599 | ||
461 | ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh); | 600 | ret = ocfs2_read_inode_block(dir, &di_bh); |
462 | if (ret) { | 601 | if (ret) { |
463 | mlog_errno(ret); | 602 | mlog_errno(ret); |
464 | goto out; | 603 | goto out; |
@@ -576,6 +715,16 @@ int __ocfs2_add_entry(handle_t *handle, | |||
576 | goto bail; | 715 | goto bail; |
577 | } | 716 | } |
578 | 717 | ||
718 | /* We're guaranteed that we should have space, so we | ||
719 | * can't possibly have hit the trailer...right? */ | ||
720 | mlog_bug_on_msg(ocfs2_skip_dir_trailer(dir, de, offset, size), | ||
721 | "Hit dir trailer trying to insert %.*s " | ||
722 | "(namelen %d) into directory %llu. " | ||
723 | "offset is %lu, trailer offset is %d\n", | ||
724 | namelen, name, namelen, | ||
725 | (unsigned long long)parent_fe_bh->b_blocknr, | ||
726 | offset, ocfs2_dir_trailer_blk_off(dir->i_sb)); | ||
727 | |||
579 | if (ocfs2_dirent_would_fit(de, rec_len)) { | 728 | if (ocfs2_dirent_would_fit(de, rec_len)) { |
580 | dir->i_mtime = dir->i_ctime = CURRENT_TIME; | 729 | dir->i_mtime = dir->i_ctime = CURRENT_TIME; |
581 | retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); | 730 | retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); |
@@ -584,8 +733,14 @@ int __ocfs2_add_entry(handle_t *handle, | |||
584 | goto bail; | 733 | goto bail; |
585 | } | 734 | } |
586 | 735 | ||
587 | status = ocfs2_journal_access(handle, dir, insert_bh, | 736 | if (insert_bh == parent_fe_bh) |
588 | OCFS2_JOURNAL_ACCESS_WRITE); | 737 | status = ocfs2_journal_access_di(handle, dir, |
738 | insert_bh, | ||
739 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
740 | else | ||
741 | status = ocfs2_journal_access_db(handle, dir, | ||
742 | insert_bh, | ||
743 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
589 | /* By now the buffer is marked for journaling */ | 744 | /* By now the buffer is marked for journaling */ |
590 | offset += le16_to_cpu(de->rec_len); | 745 | offset += le16_to_cpu(de->rec_len); |
591 | if (le64_to_cpu(de->inode)) { | 746 | if (le64_to_cpu(de->inode)) { |
@@ -611,6 +766,7 @@ int __ocfs2_add_entry(handle_t *handle, | |||
611 | retval = 0; | 766 | retval = 0; |
612 | goto bail; | 767 | goto bail; |
613 | } | 768 | } |
769 | |||
614 | offset += le16_to_cpu(de->rec_len); | 770 | offset += le16_to_cpu(de->rec_len); |
615 | de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len)); | 771 | de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len)); |
616 | } | 772 | } |
@@ -636,7 +792,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode, | |||
636 | struct ocfs2_inline_data *data; | 792 | struct ocfs2_inline_data *data; |
637 | struct ocfs2_dir_entry *de; | 793 | struct ocfs2_dir_entry *de; |
638 | 794 | ||
639 | ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh); | 795 | ret = ocfs2_read_inode_block(inode, &di_bh); |
640 | if (ret) { | 796 | if (ret) { |
641 | mlog(ML_ERROR, "Unable to read inode block for dir %llu\n", | 797 | mlog(ML_ERROR, "Unable to read inode block for dir %llu\n", |
642 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | 798 | (unsigned long long)OCFS2_I(inode)->ip_blkno); |
@@ -724,7 +880,6 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode, | |||
724 | int i, stored; | 880 | int i, stored; |
725 | struct buffer_head * bh, * tmp; | 881 | struct buffer_head * bh, * tmp; |
726 | struct ocfs2_dir_entry * de; | 882 | struct ocfs2_dir_entry * de; |
727 | int err; | ||
728 | struct super_block * sb = inode->i_sb; | 883 | struct super_block * sb = inode->i_sb; |
729 | unsigned int ra_sectors = 16; | 884 | unsigned int ra_sectors = 16; |
730 | 885 | ||
@@ -735,12 +890,8 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode, | |||
735 | 890 | ||
736 | while (!error && !stored && *f_pos < i_size_read(inode)) { | 891 | while (!error && !stored && *f_pos < i_size_read(inode)) { |
737 | blk = (*f_pos) >> sb->s_blocksize_bits; | 892 | blk = (*f_pos) >> sb->s_blocksize_bits; |
738 | bh = ocfs2_bread(inode, blk, &err, 0); | 893 | if (ocfs2_read_dir_block(inode, blk, &bh, 0)) { |
739 | if (!bh) { | 894 | /* Skip the corrupt dirblock and keep trying */ |
740 | mlog(ML_ERROR, | ||
741 | "directory #%llu contains a hole at offset %lld\n", | ||
742 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
743 | *f_pos); | ||
744 | *f_pos += sb->s_blocksize - offset; | 895 | *f_pos += sb->s_blocksize - offset; |
745 | continue; | 896 | continue; |
746 | } | 897 | } |
@@ -754,8 +905,10 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode, | |||
754 | || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) { | 905 | || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) { |
755 | for (i = ra_sectors >> (sb->s_blocksize_bits - 9); | 906 | for (i = ra_sectors >> (sb->s_blocksize_bits - 9); |
756 | i > 0; i--) { | 907 | i > 0; i--) { |
757 | tmp = ocfs2_bread(inode, ++blk, &err, 1); | 908 | tmp = NULL; |
758 | brelse(tmp); | 909 | if (!ocfs2_read_dir_block(inode, ++blk, &tmp, |
910 | OCFS2_BH_READAHEAD)) | ||
911 | brelse(tmp); | ||
759 | } | 912 | } |
760 | last_ra_blk = blk; | 913 | last_ra_blk = blk; |
761 | ra_sectors = 8; | 914 | ra_sectors = 8; |
@@ -828,6 +981,7 @@ revalidate: | |||
828 | } | 981 | } |
829 | offset = 0; | 982 | offset = 0; |
830 | brelse(bh); | 983 | brelse(bh); |
984 | bh = NULL; | ||
831 | } | 985 | } |
832 | 986 | ||
833 | stored = 0; | 987 | stored = 0; |
@@ -1050,9 +1204,15 @@ int ocfs2_empty_dir(struct inode *inode) | |||
1050 | return !priv.seen_other; | 1204 | return !priv.seen_other; |
1051 | } | 1205 | } |
1052 | 1206 | ||
1053 | static void ocfs2_fill_initial_dirents(struct inode *inode, | 1207 | /* |
1054 | struct inode *parent, | 1208 | * Fills "." and ".." dirents in a new directory block. Returns dirent for |
1055 | char *start, unsigned int size) | 1209 | * "..", which might be used during creation of a directory with a trailing |
1210 | * header. It is otherwise safe to ignore the return code. | ||
1211 | */ | ||
1212 | static struct ocfs2_dir_entry *ocfs2_fill_initial_dirents(struct inode *inode, | ||
1213 | struct inode *parent, | ||
1214 | char *start, | ||
1215 | unsigned int size) | ||
1056 | { | 1216 | { |
1057 | struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start; | 1217 | struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start; |
1058 | 1218 | ||
@@ -1069,6 +1229,8 @@ static void ocfs2_fill_initial_dirents(struct inode *inode, | |||
1069 | de->name_len = 2; | 1229 | de->name_len = 2; |
1070 | strcpy(de->name, ".."); | 1230 | strcpy(de->name, ".."); |
1071 | ocfs2_set_de_type(de, S_IFDIR); | 1231 | ocfs2_set_de_type(de, S_IFDIR); |
1232 | |||
1233 | return de; | ||
1072 | } | 1234 | } |
1073 | 1235 | ||
1074 | /* | 1236 | /* |
@@ -1086,8 +1248,8 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb, | |||
1086 | struct ocfs2_inline_data *data = &di->id2.i_data; | 1248 | struct ocfs2_inline_data *data = &di->id2.i_data; |
1087 | unsigned int size = le16_to_cpu(data->id_count); | 1249 | unsigned int size = le16_to_cpu(data->id_count); |
1088 | 1250 | ||
1089 | ret = ocfs2_journal_access(handle, inode, di_bh, | 1251 | ret = ocfs2_journal_access_di(handle, inode, di_bh, |
1090 | OCFS2_JOURNAL_ACCESS_WRITE); | 1252 | OCFS2_JOURNAL_ACCESS_WRITE); |
1091 | if (ret) { | 1253 | if (ret) { |
1092 | mlog_errno(ret); | 1254 | mlog_errno(ret); |
1093 | goto out; | 1255 | goto out; |
@@ -1121,10 +1283,15 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, | |||
1121 | struct ocfs2_alloc_context *data_ac) | 1283 | struct ocfs2_alloc_context *data_ac) |
1122 | { | 1284 | { |
1123 | int status; | 1285 | int status; |
1286 | unsigned int size = osb->sb->s_blocksize; | ||
1124 | struct buffer_head *new_bh = NULL; | 1287 | struct buffer_head *new_bh = NULL; |
1288 | struct ocfs2_dir_entry *de; | ||
1125 | 1289 | ||
1126 | mlog_entry_void(); | 1290 | mlog_entry_void(); |
1127 | 1291 | ||
1292 | if (ocfs2_supports_dir_trailer(osb)) | ||
1293 | size = ocfs2_dir_trailer_blk_off(parent->i_sb); | ||
1294 | |||
1128 | status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh, | 1295 | status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh, |
1129 | data_ac, NULL, &new_bh); | 1296 | data_ac, NULL, &new_bh); |
1130 | if (status < 0) { | 1297 | if (status < 0) { |
@@ -1134,16 +1301,17 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, | |||
1134 | 1301 | ||
1135 | ocfs2_set_new_buffer_uptodate(inode, new_bh); | 1302 | ocfs2_set_new_buffer_uptodate(inode, new_bh); |
1136 | 1303 | ||
1137 | status = ocfs2_journal_access(handle, inode, new_bh, | 1304 | status = ocfs2_journal_access_db(handle, inode, new_bh, |
1138 | OCFS2_JOURNAL_ACCESS_CREATE); | 1305 | OCFS2_JOURNAL_ACCESS_CREATE); |
1139 | if (status < 0) { | 1306 | if (status < 0) { |
1140 | mlog_errno(status); | 1307 | mlog_errno(status); |
1141 | goto bail; | 1308 | goto bail; |
1142 | } | 1309 | } |
1143 | memset(new_bh->b_data, 0, osb->sb->s_blocksize); | 1310 | memset(new_bh->b_data, 0, osb->sb->s_blocksize); |
1144 | 1311 | ||
1145 | ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, | 1312 | de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size); |
1146 | osb->sb->s_blocksize); | 1313 | if (ocfs2_supports_dir_trailer(osb)) |
1314 | ocfs2_init_dir_trailer(inode, new_bh); | ||
1147 | 1315 | ||
1148 | status = ocfs2_journal_dirty(handle, new_bh); | 1316 | status = ocfs2_journal_dirty(handle, new_bh); |
1149 | if (status < 0) { | 1317 | if (status < 0) { |
@@ -1184,13 +1352,27 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb, | |||
1184 | data_ac); | 1352 | data_ac); |
1185 | } | 1353 | } |
1186 | 1354 | ||
1355 | /* | ||
1356 | * Expand rec_len of the rightmost dirent in a directory block so that it | ||
1357 | * contains the end of our valid space for dirents. We do this during | ||
1358 | * expansion from an inline directory to one with extents. The first dir block | ||
1359 | * in that case is taken from the inline data portion of the inode block. | ||
1360 | * | ||
1361 | * We add the dir trailer if this filesystem wants it. | ||
1362 | */ | ||
1187 | static void ocfs2_expand_last_dirent(char *start, unsigned int old_size, | 1363 | static void ocfs2_expand_last_dirent(char *start, unsigned int old_size, |
1188 | unsigned int new_size) | 1364 | struct super_block *sb) |
1189 | { | 1365 | { |
1190 | struct ocfs2_dir_entry *de; | 1366 | struct ocfs2_dir_entry *de; |
1191 | struct ocfs2_dir_entry *prev_de; | 1367 | struct ocfs2_dir_entry *prev_de; |
1192 | char *de_buf, *limit; | 1368 | char *de_buf, *limit; |
1193 | unsigned int bytes = new_size - old_size; | 1369 | unsigned int new_size = sb->s_blocksize; |
1370 | unsigned int bytes; | ||
1371 | |||
1372 | if (ocfs2_supports_dir_trailer(OCFS2_SB(sb))) | ||
1373 | new_size = ocfs2_dir_trailer_blk_off(sb); | ||
1374 | |||
1375 | bytes = new_size - old_size; | ||
1194 | 1376 | ||
1195 | limit = start + old_size; | 1377 | limit = start + old_size; |
1196 | de_buf = start; | 1378 | de_buf = start; |
@@ -1216,9 +1398,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, | |||
1216 | unsigned int blocks_wanted, | 1398 | unsigned int blocks_wanted, |
1217 | struct buffer_head **first_block_bh) | 1399 | struct buffer_head **first_block_bh) |
1218 | { | 1400 | { |
1219 | int ret, credits = OCFS2_INLINE_TO_EXTENTS_CREDITS; | ||
1220 | u32 alloc, bit_off, len; | 1401 | u32 alloc, bit_off, len; |
1221 | struct super_block *sb = dir->i_sb; | 1402 | struct super_block *sb = dir->i_sb; |
1403 | int ret, credits = ocfs2_inline_to_extents_credits(sb); | ||
1222 | u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits; | 1404 | u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits; |
1223 | struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); | 1405 | struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); |
1224 | struct ocfs2_inode_info *oi = OCFS2_I(dir); | 1406 | struct ocfs2_inode_info *oi = OCFS2_I(dir); |
@@ -1227,6 +1409,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, | |||
1227 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; | 1409 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; |
1228 | handle_t *handle; | 1410 | handle_t *handle; |
1229 | struct ocfs2_extent_tree et; | 1411 | struct ocfs2_extent_tree et; |
1412 | int did_quota = 0; | ||
1230 | 1413 | ||
1231 | ocfs2_init_dinode_extent_tree(&et, dir, di_bh); | 1414 | ocfs2_init_dinode_extent_tree(&et, dir, di_bh); |
1232 | 1415 | ||
@@ -1264,6 +1447,12 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, | |||
1264 | goto out_sem; | 1447 | goto out_sem; |
1265 | } | 1448 | } |
1266 | 1449 | ||
1450 | if (vfs_dq_alloc_space_nodirty(dir, | ||
1451 | ocfs2_clusters_to_bytes(osb->sb, alloc))) { | ||
1452 | ret = -EDQUOT; | ||
1453 | goto out_commit; | ||
1454 | } | ||
1455 | did_quota = 1; | ||
1267 | /* | 1456 | /* |
1268 | * Try to claim as many clusters as the bitmap can give though | 1457 | * Try to claim as many clusters as the bitmap can give though |
1269 | * if we only get one now, that's enough to continue. The rest | 1458 | * if we only get one now, that's enough to continue. The rest |
@@ -1290,8 +1479,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, | |||
1290 | 1479 | ||
1291 | ocfs2_set_new_buffer_uptodate(dir, dirdata_bh); | 1480 | ocfs2_set_new_buffer_uptodate(dir, dirdata_bh); |
1292 | 1481 | ||
1293 | ret = ocfs2_journal_access(handle, dir, dirdata_bh, | 1482 | ret = ocfs2_journal_access_db(handle, dir, dirdata_bh, |
1294 | OCFS2_JOURNAL_ACCESS_CREATE); | 1483 | OCFS2_JOURNAL_ACCESS_CREATE); |
1295 | if (ret) { | 1484 | if (ret) { |
1296 | mlog_errno(ret); | 1485 | mlog_errno(ret); |
1297 | goto out_commit; | 1486 | goto out_commit; |
@@ -1300,8 +1489,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, | |||
1300 | memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir)); | 1489 | memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir)); |
1301 | memset(dirdata_bh->b_data + i_size_read(dir), 0, | 1490 | memset(dirdata_bh->b_data + i_size_read(dir), 0, |
1302 | sb->s_blocksize - i_size_read(dir)); | 1491 | sb->s_blocksize - i_size_read(dir)); |
1303 | ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), | 1492 | ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), sb); |
1304 | sb->s_blocksize); | 1493 | if (ocfs2_supports_dir_trailer(osb)) |
1494 | ocfs2_init_dir_trailer(dir, dirdata_bh); | ||
1305 | 1495 | ||
1306 | ret = ocfs2_journal_dirty(handle, dirdata_bh); | 1496 | ret = ocfs2_journal_dirty(handle, dirdata_bh); |
1307 | if (ret) { | 1497 | if (ret) { |
@@ -1317,8 +1507,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, | |||
1317 | * We let the later dirent insert modify c/mtime - to the user | 1507 | * We let the later dirent insert modify c/mtime - to the user |
1318 | * the data hasn't changed. | 1508 | * the data hasn't changed. |
1319 | */ | 1509 | */ |
1320 | ret = ocfs2_journal_access(handle, dir, di_bh, | 1510 | ret = ocfs2_journal_access_di(handle, dir, di_bh, |
1321 | OCFS2_JOURNAL_ACCESS_CREATE); | 1511 | OCFS2_JOURNAL_ACCESS_CREATE); |
1322 | if (ret) { | 1512 | if (ret) { |
1323 | mlog_errno(ret); | 1513 | mlog_errno(ret); |
1324 | goto out_commit; | 1514 | goto out_commit; |
@@ -1386,6 +1576,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, | |||
1386 | dirdata_bh = NULL; | 1576 | dirdata_bh = NULL; |
1387 | 1577 | ||
1388 | out_commit: | 1578 | out_commit: |
1579 | if (ret < 0 && did_quota) | ||
1580 | vfs_dq_free_space_nodirty(dir, | ||
1581 | ocfs2_clusters_to_bytes(osb->sb, alloc)); | ||
1389 | ocfs2_commit_trans(osb, handle); | 1582 | ocfs2_commit_trans(osb, handle); |
1390 | 1583 | ||
1391 | out_sem: | 1584 | out_sem: |
@@ -1410,7 +1603,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb, | |||
1410 | struct buffer_head **new_bh) | 1603 | struct buffer_head **new_bh) |
1411 | { | 1604 | { |
1412 | int status; | 1605 | int status; |
1413 | int extend; | 1606 | int extend, did_quota = 0; |
1414 | u64 p_blkno, v_blkno; | 1607 | u64 p_blkno, v_blkno; |
1415 | 1608 | ||
1416 | spin_lock(&OCFS2_I(dir)->ip_lock); | 1609 | spin_lock(&OCFS2_I(dir)->ip_lock); |
@@ -1420,6 +1613,13 @@ static int ocfs2_do_extend_dir(struct super_block *sb, | |||
1420 | if (extend) { | 1613 | if (extend) { |
1421 | u32 offset = OCFS2_I(dir)->ip_clusters; | 1614 | u32 offset = OCFS2_I(dir)->ip_clusters; |
1422 | 1615 | ||
1616 | if (vfs_dq_alloc_space_nodirty(dir, | ||
1617 | ocfs2_clusters_to_bytes(sb, 1))) { | ||
1618 | status = -EDQUOT; | ||
1619 | goto bail; | ||
1620 | } | ||
1621 | did_quota = 1; | ||
1622 | |||
1423 | status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset, | 1623 | status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset, |
1424 | 1, 0, parent_fe_bh, handle, | 1624 | 1, 0, parent_fe_bh, handle, |
1425 | data_ac, meta_ac, NULL); | 1625 | data_ac, meta_ac, NULL); |
@@ -1445,6 +1645,8 @@ static int ocfs2_do_extend_dir(struct super_block *sb, | |||
1445 | } | 1645 | } |
1446 | status = 0; | 1646 | status = 0; |
1447 | bail: | 1647 | bail: |
1648 | if (did_quota && status < 0) | ||
1649 | vfs_dq_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1)); | ||
1448 | mlog_exit(status); | 1650 | mlog_exit(status); |
1449 | return status; | 1651 | return status; |
1450 | } | 1652 | } |
@@ -1569,16 +1771,22 @@ do_extend: | |||
1569 | 1771 | ||
1570 | ocfs2_set_new_buffer_uptodate(dir, new_bh); | 1772 | ocfs2_set_new_buffer_uptodate(dir, new_bh); |
1571 | 1773 | ||
1572 | status = ocfs2_journal_access(handle, dir, new_bh, | 1774 | status = ocfs2_journal_access_db(handle, dir, new_bh, |
1573 | OCFS2_JOURNAL_ACCESS_CREATE); | 1775 | OCFS2_JOURNAL_ACCESS_CREATE); |
1574 | if (status < 0) { | 1776 | if (status < 0) { |
1575 | mlog_errno(status); | 1777 | mlog_errno(status); |
1576 | goto bail; | 1778 | goto bail; |
1577 | } | 1779 | } |
1578 | memset(new_bh->b_data, 0, sb->s_blocksize); | 1780 | memset(new_bh->b_data, 0, sb->s_blocksize); |
1781 | |||
1579 | de = (struct ocfs2_dir_entry *) new_bh->b_data; | 1782 | de = (struct ocfs2_dir_entry *) new_bh->b_data; |
1580 | de->inode = 0; | 1783 | de->inode = 0; |
1581 | de->rec_len = cpu_to_le16(sb->s_blocksize); | 1784 | if (ocfs2_dir_has_trailer(dir)) { |
1785 | de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb)); | ||
1786 | ocfs2_init_dir_trailer(dir, new_bh); | ||
1787 | } else { | ||
1788 | de->rec_len = cpu_to_le16(sb->s_blocksize); | ||
1789 | } | ||
1582 | status = ocfs2_journal_dirty(handle, new_bh); | 1790 | status = ocfs2_journal_dirty(handle, new_bh); |
1583 | if (status < 0) { | 1791 | if (status < 0) { |
1584 | mlog_errno(status); | 1792 | mlog_errno(status); |
@@ -1620,11 +1828,21 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh, | |||
1620 | unsigned int *blocks_wanted) | 1828 | unsigned int *blocks_wanted) |
1621 | { | 1829 | { |
1622 | int ret; | 1830 | int ret; |
1831 | struct super_block *sb = dir->i_sb; | ||
1623 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; | 1832 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; |
1624 | struct ocfs2_dir_entry *de, *last_de = NULL; | 1833 | struct ocfs2_dir_entry *de, *last_de = NULL; |
1625 | char *de_buf, *limit; | 1834 | char *de_buf, *limit; |
1626 | unsigned long offset = 0; | 1835 | unsigned long offset = 0; |
1627 | unsigned int rec_len, new_rec_len; | 1836 | unsigned int rec_len, new_rec_len, free_space = dir->i_sb->s_blocksize; |
1837 | |||
1838 | /* | ||
1839 | * This calculates how many free bytes we'd have in block zero, should | ||
1840 | * this function force expansion to an extent tree. | ||
1841 | */ | ||
1842 | if (ocfs2_supports_dir_trailer(OCFS2_SB(sb))) | ||
1843 | free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir); | ||
1844 | else | ||
1845 | free_space = dir->i_sb->s_blocksize - i_size_read(dir); | ||
1628 | 1846 | ||
1629 | de_buf = di->id2.i_data.id_data; | 1847 | de_buf = di->id2.i_data.id_data; |
1630 | limit = de_buf + i_size_read(dir); | 1848 | limit = de_buf + i_size_read(dir); |
@@ -1641,6 +1859,11 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh, | |||
1641 | ret = -EEXIST; | 1859 | ret = -EEXIST; |
1642 | goto out; | 1860 | goto out; |
1643 | } | 1861 | } |
1862 | /* | ||
1863 | * No need to check for a trailing dirent record here as | ||
1864 | * they're not used for inline dirs. | ||
1865 | */ | ||
1866 | |||
1644 | if (ocfs2_dirent_would_fit(de, rec_len)) { | 1867 | if (ocfs2_dirent_would_fit(de, rec_len)) { |
1645 | /* Ok, we found a spot. Return this bh and let | 1868 | /* Ok, we found a spot. Return this bh and let |
1646 | * the caller actually fill it in. */ | 1869 | * the caller actually fill it in. */ |
@@ -1661,7 +1884,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh, | |||
1661 | * dirent can be found. | 1884 | * dirent can be found. |
1662 | */ | 1885 | */ |
1663 | *blocks_wanted = 1; | 1886 | *blocks_wanted = 1; |
1664 | new_rec_len = le16_to_cpu(last_de->rec_len) + (dir->i_sb->s_blocksize - i_size_read(dir)); | 1887 | new_rec_len = le16_to_cpu(last_de->rec_len) + free_space; |
1665 | if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len))) | 1888 | if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len))) |
1666 | *blocks_wanted = 2; | 1889 | *blocks_wanted = 2; |
1667 | 1890 | ||
@@ -1679,9 +1902,10 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name, | |||
1679 | struct ocfs2_dir_entry *de; | 1902 | struct ocfs2_dir_entry *de; |
1680 | struct super_block *sb = dir->i_sb; | 1903 | struct super_block *sb = dir->i_sb; |
1681 | int status; | 1904 | int status; |
1905 | int blocksize = dir->i_sb->s_blocksize; | ||
1682 | 1906 | ||
1683 | bh = ocfs2_bread(dir, 0, &status, 0); | 1907 | status = ocfs2_read_dir_block(dir, 0, &bh, 0); |
1684 | if (!bh) { | 1908 | if (status) { |
1685 | mlog_errno(status); | 1909 | mlog_errno(status); |
1686 | goto bail; | 1910 | goto bail; |
1687 | } | 1911 | } |
@@ -1702,11 +1926,10 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name, | |||
1702 | status = -ENOSPC; | 1926 | status = -ENOSPC; |
1703 | goto bail; | 1927 | goto bail; |
1704 | } | 1928 | } |
1705 | bh = ocfs2_bread(dir, | 1929 | status = ocfs2_read_dir_block(dir, |
1706 | offset >> sb->s_blocksize_bits, | 1930 | offset >> sb->s_blocksize_bits, |
1707 | &status, | 1931 | &bh, 0); |
1708 | 0); | 1932 | if (status) { |
1709 | if (!bh) { | ||
1710 | mlog_errno(status); | 1933 | mlog_errno(status); |
1711 | goto bail; | 1934 | goto bail; |
1712 | } | 1935 | } |
@@ -1721,6 +1944,11 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name, | |||
1721 | status = -EEXIST; | 1944 | status = -EEXIST; |
1722 | goto bail; | 1945 | goto bail; |
1723 | } | 1946 | } |
1947 | |||
1948 | if (ocfs2_skip_dir_trailer(dir, de, offset % blocksize, | ||
1949 | blocksize)) | ||
1950 | goto next; | ||
1951 | |||
1724 | if (ocfs2_dirent_would_fit(de, rec_len)) { | 1952 | if (ocfs2_dirent_would_fit(de, rec_len)) { |
1725 | /* Ok, we found a spot. Return this bh and let | 1953 | /* Ok, we found a spot. Return this bh and let |
1726 | * the caller actually fill it in. */ | 1954 | * the caller actually fill it in. */ |
@@ -1729,6 +1957,7 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name, | |||
1729 | status = 0; | 1957 | status = 0; |
1730 | goto bail; | 1958 | goto bail; |
1731 | } | 1959 | } |
1960 | next: | ||
1732 | offset += le16_to_cpu(de->rec_len); | 1961 | offset += le16_to_cpu(de->rec_len); |
1733 | de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len)); | 1962 | de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len)); |
1734 | } | 1963 | } |
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h index ce48b9080d87..c511e2e18e9f 100644 --- a/fs/ocfs2/dir.h +++ b/fs/ocfs2/dir.h | |||
@@ -83,4 +83,6 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb, | |||
83 | struct buffer_head *fe_bh, | 83 | struct buffer_head *fe_bh, |
84 | struct ocfs2_alloc_context *data_ac); | 84 | struct ocfs2_alloc_context *data_ac); |
85 | 85 | ||
86 | struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize, | ||
87 | void *data); | ||
86 | #endif /* OCFS2_DIR_H */ | 88 | #endif /* OCFS2_DIR_H */ |
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index 644bee55d8ba..d07ddbe4b283 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c | |||
@@ -275,6 +275,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, | |||
275 | struct list_head *iter, *head=NULL; | 275 | struct list_head *iter, *head=NULL; |
276 | u64 cookie; | 276 | u64 cookie; |
277 | u32 flags; | 277 | u32 flags; |
278 | u8 node; | ||
278 | 279 | ||
279 | if (!dlm_grab(dlm)) { | 280 | if (!dlm_grab(dlm)) { |
280 | dlm_error(DLM_REJECTED); | 281 | dlm_error(DLM_REJECTED); |
@@ -286,18 +287,21 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, | |||
286 | 287 | ||
287 | name = past->name; | 288 | name = past->name; |
288 | locklen = past->namelen; | 289 | locklen = past->namelen; |
289 | cookie = be64_to_cpu(past->cookie); | 290 | cookie = past->cookie; |
290 | flags = be32_to_cpu(past->flags); | 291 | flags = be32_to_cpu(past->flags); |
292 | node = past->node_idx; | ||
291 | 293 | ||
292 | if (locklen > DLM_LOCKID_NAME_MAX) { | 294 | if (locklen > DLM_LOCKID_NAME_MAX) { |
293 | ret = DLM_IVBUFLEN; | 295 | ret = DLM_IVBUFLEN; |
294 | mlog(ML_ERROR, "Invalid name length in proxy ast handler!\n"); | 296 | mlog(ML_ERROR, "Invalid name length (%d) in proxy ast " |
297 | "handler!\n", locklen); | ||
295 | goto leave; | 298 | goto leave; |
296 | } | 299 | } |
297 | 300 | ||
298 | if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) == | 301 | if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) == |
299 | (LKM_PUT_LVB|LKM_GET_LVB)) { | 302 | (LKM_PUT_LVB|LKM_GET_LVB)) { |
300 | mlog(ML_ERROR, "both PUT and GET lvb specified\n"); | 303 | mlog(ML_ERROR, "Both PUT and GET lvb specified, (0x%x)\n", |
304 | flags); | ||
301 | ret = DLM_BADARGS; | 305 | ret = DLM_BADARGS; |
302 | goto leave; | 306 | goto leave; |
303 | } | 307 | } |
@@ -310,22 +314,21 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, | |||
310 | if (past->type != DLM_AST && | 314 | if (past->type != DLM_AST && |
311 | past->type != DLM_BAST) { | 315 | past->type != DLM_BAST) { |
312 | mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu" | 316 | mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu" |
313 | "name=%.*s\n", past->type, | 317 | "name=%.*s, node=%u\n", past->type, |
314 | dlm_get_lock_cookie_node(cookie), | 318 | dlm_get_lock_cookie_node(be64_to_cpu(cookie)), |
315 | dlm_get_lock_cookie_seq(cookie), | 319 | dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), |
316 | locklen, name); | 320 | locklen, name, node); |
317 | ret = DLM_IVLOCKID; | 321 | ret = DLM_IVLOCKID; |
318 | goto leave; | 322 | goto leave; |
319 | } | 323 | } |
320 | 324 | ||
321 | res = dlm_lookup_lockres(dlm, name, locklen); | 325 | res = dlm_lookup_lockres(dlm, name, locklen); |
322 | if (!res) { | 326 | if (!res) { |
323 | mlog(0, "got %sast for unknown lockres! " | 327 | mlog(0, "Got %sast for unknown lockres! cookie=%u:%llu, " |
324 | "cookie=%u:%llu, name=%.*s, namelen=%u\n", | 328 | "name=%.*s, node=%u\n", (past->type == DLM_AST ? "" : "b"), |
325 | past->type == DLM_AST ? "" : "b", | 329 | dlm_get_lock_cookie_node(be64_to_cpu(cookie)), |
326 | dlm_get_lock_cookie_node(cookie), | 330 | dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), |
327 | dlm_get_lock_cookie_seq(cookie), | 331 | locklen, name, node); |
328 | locklen, name, locklen); | ||
329 | ret = DLM_IVLOCKID; | 332 | ret = DLM_IVLOCKID; |
330 | goto leave; | 333 | goto leave; |
331 | } | 334 | } |
@@ -337,12 +340,12 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, | |||
337 | 340 | ||
338 | spin_lock(&res->spinlock); | 341 | spin_lock(&res->spinlock); |
339 | if (res->state & DLM_LOCK_RES_RECOVERING) { | 342 | if (res->state & DLM_LOCK_RES_RECOVERING) { |
340 | mlog(0, "responding with DLM_RECOVERING!\n"); | 343 | mlog(0, "Responding with DLM_RECOVERING!\n"); |
341 | ret = DLM_RECOVERING; | 344 | ret = DLM_RECOVERING; |
342 | goto unlock_out; | 345 | goto unlock_out; |
343 | } | 346 | } |
344 | if (res->state & DLM_LOCK_RES_MIGRATING) { | 347 | if (res->state & DLM_LOCK_RES_MIGRATING) { |
345 | mlog(0, "responding with DLM_MIGRATING!\n"); | 348 | mlog(0, "Responding with DLM_MIGRATING!\n"); |
346 | ret = DLM_MIGRATING; | 349 | ret = DLM_MIGRATING; |
347 | goto unlock_out; | 350 | goto unlock_out; |
348 | } | 351 | } |
@@ -351,7 +354,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, | |||
351 | lock = NULL; | 354 | lock = NULL; |
352 | list_for_each(iter, head) { | 355 | list_for_each(iter, head) { |
353 | lock = list_entry (iter, struct dlm_lock, list); | 356 | lock = list_entry (iter, struct dlm_lock, list); |
354 | if (be64_to_cpu(lock->ml.cookie) == cookie) | 357 | if (lock->ml.cookie == cookie) |
355 | goto do_ast; | 358 | goto do_ast; |
356 | } | 359 | } |
357 | 360 | ||
@@ -363,15 +366,15 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, | |||
363 | 366 | ||
364 | list_for_each(iter, head) { | 367 | list_for_each(iter, head) { |
365 | lock = list_entry (iter, struct dlm_lock, list); | 368 | lock = list_entry (iter, struct dlm_lock, list); |
366 | if (be64_to_cpu(lock->ml.cookie) == cookie) | 369 | if (lock->ml.cookie == cookie) |
367 | goto do_ast; | 370 | goto do_ast; |
368 | } | 371 | } |
369 | 372 | ||
370 | mlog(0, "got %sast for unknown lock! cookie=%u:%llu, " | 373 | mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, " |
371 | "name=%.*s, namelen=%u\n", past->type == DLM_AST ? "" : "b", | 374 | "node=%u\n", past->type == DLM_AST ? "" : "b", |
372 | dlm_get_lock_cookie_node(cookie), | 375 | dlm_get_lock_cookie_node(be64_to_cpu(cookie)), |
373 | dlm_get_lock_cookie_seq(cookie), | 376 | dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), |
374 | locklen, name, locklen); | 377 | locklen, name, node); |
375 | 378 | ||
376 | ret = DLM_NORMAL; | 379 | ret = DLM_NORMAL; |
377 | unlock_out: | 380 | unlock_out: |
@@ -383,8 +386,8 @@ do_ast: | |||
383 | if (past->type == DLM_AST) { | 386 | if (past->type == DLM_AST) { |
384 | /* do not alter lock refcount. switching lists. */ | 387 | /* do not alter lock refcount. switching lists. */ |
385 | list_move_tail(&lock->list, &res->granted); | 388 | list_move_tail(&lock->list, &res->granted); |
386 | mlog(0, "ast: adding to granted list... type=%d, " | 389 | mlog(0, "ast: Adding to granted list... type=%d, " |
387 | "convert_type=%d\n", lock->ml.type, lock->ml.convert_type); | 390 | "convert_type=%d\n", lock->ml.type, lock->ml.convert_type); |
388 | if (lock->ml.convert_type != LKM_IVMODE) { | 391 | if (lock->ml.convert_type != LKM_IVMODE) { |
389 | lock->ml.type = lock->ml.convert_type; | 392 | lock->ml.type = lock->ml.convert_type; |
390 | lock->ml.convert_type = LKM_IVMODE; | 393 | lock->ml.convert_type = LKM_IVMODE; |
@@ -408,7 +411,6 @@ do_ast: | |||
408 | dlm_do_local_bast(dlm, res, lock, past->blocked_type); | 411 | dlm_do_local_bast(dlm, res, lock, past->blocked_type); |
409 | 412 | ||
410 | leave: | 413 | leave: |
411 | |||
412 | if (res) | 414 | if (res) |
413 | dlm_lockres_put(res); | 415 | dlm_lockres_put(res); |
414 | 416 | ||
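Editorial sketch (not part of the patch): the dlmast.c hunk keeps past->cookie in wire (big-endian) order and compares it directly against lock->ml.cookie, converting with be64_to_cpu() only when formatting log messages. That is safe because equality of two 64-bit values does not depend on byte order:

static int sketch_cookie_matches(__be64 wire_cookie, __be64 lock_cookie)
{
	/* both sides use the same byte order, so a plain compare works;
	 * be64_to_cpu() is only needed to print node/seq for humans */
	return wire_cookie == lock_cookie;
}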
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index d5a86fb81a49..bb53714813ab 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h | |||
@@ -140,6 +140,7 @@ struct dlm_ctxt | |||
140 | unsigned int purge_count; | 140 | unsigned int purge_count; |
141 | spinlock_t spinlock; | 141 | spinlock_t spinlock; |
142 | spinlock_t ast_lock; | 142 | spinlock_t ast_lock; |
143 | spinlock_t track_lock; | ||
143 | char *name; | 144 | char *name; |
144 | u8 node_num; | 145 | u8 node_num; |
145 | u32 key; | 146 | u32 key; |
@@ -316,6 +317,8 @@ struct dlm_lock_resource | |||
316 | * put on a list for the dlm thread to run. */ | 317 | * put on a list for the dlm thread to run. */ |
317 | unsigned long last_used; | 318 | unsigned long last_used; |
318 | 319 | ||
320 | struct dlm_ctxt *dlm; | ||
321 | |||
319 | unsigned migration_pending:1; | 322 | unsigned migration_pending:1; |
320 | atomic_t asts_reserved; | 323 | atomic_t asts_reserved; |
321 | spinlock_t spinlock; | 324 | spinlock_t spinlock; |
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 1b81dcba175d..b32f60a5acfb 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c | |||
@@ -630,43 +630,38 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos) | |||
630 | { | 630 | { |
631 | struct debug_lockres *dl = m->private; | 631 | struct debug_lockres *dl = m->private; |
632 | struct dlm_ctxt *dlm = dl->dl_ctxt; | 632 | struct dlm_ctxt *dlm = dl->dl_ctxt; |
633 | struct dlm_lock_resource *oldres = dl->dl_res; | ||
633 | struct dlm_lock_resource *res = NULL; | 634 | struct dlm_lock_resource *res = NULL; |
635 | struct list_head *track_list; | ||
634 | 636 | ||
635 | spin_lock(&dlm->spinlock); | 637 | spin_lock(&dlm->track_lock); |
638 | if (oldres) | ||
639 | track_list = &oldres->tracking; | ||
640 | else | ||
641 | track_list = &dlm->tracking_list; | ||
636 | 642 | ||
637 | if (dl->dl_res) { | 643 | list_for_each_entry(res, track_list, tracking) { |
638 | list_for_each_entry(res, &dl->dl_res->tracking, tracking) { | 644 | if (&res->tracking == &dlm->tracking_list) |
639 | if (dl->dl_res) { | 645 | res = NULL; |
640 | dlm_lockres_put(dl->dl_res); | 646 | else |
641 | dl->dl_res = NULL; | ||
642 | } | ||
643 | if (&res->tracking == &dlm->tracking_list) { | ||
644 | mlog(0, "End of list found, %p\n", res); | ||
645 | dl = NULL; | ||
646 | break; | ||
647 | } | ||
648 | dlm_lockres_get(res); | 647 | dlm_lockres_get(res); |
649 | dl->dl_res = res; | 648 | break; |
650 | break; | ||
651 | } | ||
652 | } else { | ||
653 | if (!list_empty(&dlm->tracking_list)) { | ||
654 | list_for_each_entry(res, &dlm->tracking_list, tracking) | ||
655 | break; | ||
656 | dlm_lockres_get(res); | ||
657 | dl->dl_res = res; | ||
658 | } else | ||
659 | dl = NULL; | ||
660 | } | 649 | } |
650 | spin_unlock(&dlm->track_lock); | ||
661 | 651 | ||
662 | if (dl) { | 652 | if (oldres) |
663 | spin_lock(&dl->dl_res->spinlock); | 653 | dlm_lockres_put(oldres); |
664 | dump_lockres(dl->dl_res, dl->dl_buf, dl->dl_len - 1); | ||
665 | spin_unlock(&dl->dl_res->spinlock); | ||
666 | } | ||
667 | 654 | ||
668 | spin_unlock(&dlm->spinlock); | 655 | dl->dl_res = res; |
656 | |||
657 | if (res) { | ||
658 | spin_lock(&res->spinlock); | ||
659 | dump_lockres(res, dl->dl_buf, dl->dl_len - 1); | ||
660 | spin_unlock(&res->spinlock); | ||
661 | } else | ||
662 | dl = NULL; | ||
669 | 663 | ||
664 | /* passed to seq_show */ | ||
670 | return dl; | 665 | return dl; |
671 | } | 666 | } |
672 | 667 | ||
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 63f8125824e8..d8d578f45613 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c | |||
@@ -1550,6 +1550,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, | |||
1550 | spin_lock_init(&dlm->spinlock); | 1550 | spin_lock_init(&dlm->spinlock); |
1551 | spin_lock_init(&dlm->master_lock); | 1551 | spin_lock_init(&dlm->master_lock); |
1552 | spin_lock_init(&dlm->ast_lock); | 1552 | spin_lock_init(&dlm->ast_lock); |
1553 | spin_lock_init(&dlm->track_lock); | ||
1553 | INIT_LIST_HEAD(&dlm->list); | 1554 | INIT_LIST_HEAD(&dlm->list); |
1554 | INIT_LIST_HEAD(&dlm->dirty_list); | 1555 | INIT_LIST_HEAD(&dlm->dirty_list); |
1555 | INIT_LIST_HEAD(&dlm->reco.resources); | 1556 | INIT_LIST_HEAD(&dlm->reco.resources); |
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c index 6f7a77d54020..1c9efb406a96 100644 --- a/fs/ocfs2/dlm/dlmfs.c +++ b/fs/ocfs2/dlm/dlmfs.c | |||
@@ -341,7 +341,6 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb) | |||
341 | inode->i_mode = mode; | 341 | inode->i_mode = mode; |
342 | inode->i_uid = current_fsuid(); | 342 | inode->i_uid = current_fsuid(); |
343 | inode->i_gid = current_fsgid(); | 343 | inode->i_gid = current_fsgid(); |
344 | inode->i_blocks = 0; | ||
345 | inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; | 344 | inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; |
346 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 345 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
347 | inc_nlink(inode); | 346 | inc_nlink(inode); |
@@ -367,7 +366,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent, | |||
367 | inode->i_mode = mode; | 366 | inode->i_mode = mode; |
368 | inode->i_uid = current_fsuid(); | 367 | inode->i_uid = current_fsuid(); |
369 | inode->i_gid = current_fsgid(); | 368 | inode->i_gid = current_fsgid(); |
370 | inode->i_blocks = 0; | ||
371 | inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; | 369 | inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; |
372 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 370 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
373 | 371 | ||
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 44f87caf3683..54e182a27caf 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c | |||
@@ -505,8 +505,10 @@ void dlm_change_lockres_owner(struct dlm_ctxt *dlm, | |||
505 | static void dlm_lockres_release(struct kref *kref) | 505 | static void dlm_lockres_release(struct kref *kref) |
506 | { | 506 | { |
507 | struct dlm_lock_resource *res; | 507 | struct dlm_lock_resource *res; |
508 | struct dlm_ctxt *dlm; | ||
508 | 509 | ||
509 | res = container_of(kref, struct dlm_lock_resource, refs); | 510 | res = container_of(kref, struct dlm_lock_resource, refs); |
511 | dlm = res->dlm; | ||
510 | 512 | ||
511 | /* This should not happen -- all lockres' have a name | 513 | /* This should not happen -- all lockres' have a name |
512 | * associated with them at init time. */ | 514 | * associated with them at init time. */ |
@@ -515,6 +517,7 @@ static void dlm_lockres_release(struct kref *kref) | |||
515 | mlog(0, "destroying lockres %.*s\n", res->lockname.len, | 517 | mlog(0, "destroying lockres %.*s\n", res->lockname.len, |
516 | res->lockname.name); | 518 | res->lockname.name); |
517 | 519 | ||
520 | spin_lock(&dlm->track_lock); | ||
518 | if (!list_empty(&res->tracking)) | 521 | if (!list_empty(&res->tracking)) |
519 | list_del_init(&res->tracking); | 522 | list_del_init(&res->tracking); |
520 | else { | 523 | else { |
@@ -522,6 +525,9 @@ static void dlm_lockres_release(struct kref *kref) | |||
522 | res->lockname.len, res->lockname.name); | 525 | res->lockname.len, res->lockname.name); |
523 | dlm_print_one_lock_resource(res); | 526 | dlm_print_one_lock_resource(res); |
524 | } | 527 | } |
528 | spin_unlock(&dlm->track_lock); | ||
529 | |||
530 | dlm_put(dlm); | ||
525 | 531 | ||
526 | if (!hlist_unhashed(&res->hash_node) || | 532 | if (!hlist_unhashed(&res->hash_node) || |
527 | !list_empty(&res->granted) || | 533 | !list_empty(&res->granted) || |
@@ -595,6 +601,10 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, | |||
595 | res->migration_pending = 0; | 601 | res->migration_pending = 0; |
596 | res->inflight_locks = 0; | 602 | res->inflight_locks = 0; |
597 | 603 | ||
604 | /* put in dlm_lockres_release */ | ||
605 | dlm_grab(dlm); | ||
606 | res->dlm = dlm; | ||
607 | |||
598 | kref_init(&res->refs); | 608 | kref_init(&res->refs); |
599 | 609 | ||
600 | /* just for consistency */ | 610 | /* just for consistency */ |
@@ -722,14 +732,21 @@ lookup: | |||
722 | if (tmpres) { | 732 | if (tmpres) { |
723 | int dropping_ref = 0; | 733 | int dropping_ref = 0; |
724 | 734 | ||
735 | spin_unlock(&dlm->spinlock); | ||
736 | |||
725 | spin_lock(&tmpres->spinlock); | 737 | spin_lock(&tmpres->spinlock); |
738 | /* We wait for the other thread that is mastering the resource */ | ||
739 | if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { | ||
740 | __dlm_wait_on_lockres(tmpres); | ||
741 | BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN); | ||
742 | } | ||
743 | |||
726 | if (tmpres->owner == dlm->node_num) { | 744 | if (tmpres->owner == dlm->node_num) { |
727 | BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF); | 745 | BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF); |
728 | dlm_lockres_grab_inflight_ref(dlm, tmpres); | 746 | dlm_lockres_grab_inflight_ref(dlm, tmpres); |
729 | } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) | 747 | } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) |
730 | dropping_ref = 1; | 748 | dropping_ref = 1; |
731 | spin_unlock(&tmpres->spinlock); | 749 | spin_unlock(&tmpres->spinlock); |
732 | spin_unlock(&dlm->spinlock); | ||
733 | 750 | ||
734 | /* wait until done messaging the master, drop our ref to allow | 751 | /* wait until done messaging the master, drop our ref to allow |
735 | * the lockres to be purged, start over. */ | 752 | * the lockres to be purged, start over. */ |
@@ -2949,7 +2966,7 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm, | |||
2949 | struct dlm_node_iter *iter) | 2966 | struct dlm_node_iter *iter) |
2950 | { | 2967 | { |
2951 | struct dlm_migrate_request migrate; | 2968 | struct dlm_migrate_request migrate; |
2952 | int ret, status = 0; | 2969 | int ret, skip, status = 0; |
2953 | int nodenum; | 2970 | int nodenum; |
2954 | 2971 | ||
2955 | memset(&migrate, 0, sizeof(migrate)); | 2972 | memset(&migrate, 0, sizeof(migrate)); |
@@ -2966,12 +2983,27 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm, | |||
2966 | nodenum == new_master) | 2983 | nodenum == new_master) |
2967 | continue; | 2984 | continue; |
2968 | 2985 | ||
2986 | /* We could race exit domain. If exited, skip. */ | ||
2987 | spin_lock(&dlm->spinlock); | ||
2988 | skip = (!test_bit(nodenum, dlm->domain_map)); | ||
2989 | spin_unlock(&dlm->spinlock); | ||
2990 | if (skip) { | ||
2991 | clear_bit(nodenum, iter->node_map); | ||
2992 | continue; | ||
2993 | } | ||
2994 | |||
2969 | ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key, | 2995 | ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key, |
2970 | &migrate, sizeof(migrate), nodenum, | 2996 | &migrate, sizeof(migrate), nodenum, |
2971 | &status); | 2997 | &status); |
2972 | if (ret < 0) | 2998 | if (ret < 0) { |
2973 | mlog_errno(ret); | 2999 | mlog(0, "migrate_request returned %d!\n", ret); |
2974 | else if (status < 0) { | 3000 | if (!dlm_is_host_down(ret)) { |
3001 | mlog(ML_ERROR, "unhandled error=%d!\n", ret); | ||
3002 | BUG(); | ||
3003 | } | ||
3004 | clear_bit(nodenum, iter->node_map); | ||
3005 | ret = 0; | ||
3006 | } else if (status < 0) { | ||
2975 | mlog(0, "migrate request (node %u) returned %d!\n", | 3007 | mlog(0, "migrate request (node %u) returned %d!\n", |
2976 | nodenum, status); | 3008 | nodenum, status); |
2977 | ret = status; | 3009 | ret = status; |
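Editorial sketch (not part of the patch): dlm_do_migrate_request() above now tolerates a target node leaving the domain mid-migration. Nodes missing from the domain map are skipped up front, a host-down error from o2net_send_message() clears the node from the iterator and continues, and any other send failure is treated as fatal. The classification, with a hypothetical helper name (dlm_is_host_down() is the predicate the patch itself uses):

static int sketch_classify_migrate_send(int ret)
{
	if (ret >= 0)
		return 0;		/* delivered; caller inspects status */
	if (dlm_is_host_down(ret))
		return 1;		/* target died or left: skip the node */
	BUG();				/* unexpected network error */
	return -1;
}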
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 4060bb328bc8..d1295203029f 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c | |||
@@ -181,7 +181,8 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm, | |||
181 | 181 | ||
182 | spin_lock(&res->spinlock); | 182 | spin_lock(&res->spinlock); |
183 | /* This ensures that clear refmap is sent after the set */ | 183 | /* This ensures that clear refmap is sent after the set */ |
184 | __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); | 184 | __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_SETREF_INPROG | |
185 | DLM_LOCK_RES_MIGRATING)); | ||
185 | spin_unlock(&res->spinlock); | 186 | spin_unlock(&res->spinlock); |
186 | 187 | ||
187 | /* clear our bit from the master's refmap, ignore errors */ | 188 | /* clear our bit from the master's refmap, ignore errors */ |
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 6e6cc0a2e5f7..b0c4cadd4c45 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c | |||
@@ -32,6 +32,7 @@ | |||
32 | #include <linux/debugfs.h> | 32 | #include <linux/debugfs.h> |
33 | #include <linux/seq_file.h> | 33 | #include <linux/seq_file.h> |
34 | #include <linux/time.h> | 34 | #include <linux/time.h> |
35 | #include <linux/quotaops.h> | ||
35 | 36 | ||
36 | #define MLOG_MASK_PREFIX ML_DLM_GLUE | 37 | #define MLOG_MASK_PREFIX ML_DLM_GLUE |
37 | #include <cluster/masklog.h> | 38 | #include <cluster/masklog.h> |
@@ -51,6 +52,7 @@ | |||
51 | #include "slot_map.h" | 52 | #include "slot_map.h" |
52 | #include "super.h" | 53 | #include "super.h" |
53 | #include "uptodate.h" | 54 | #include "uptodate.h" |
55 | #include "quota.h" | ||
54 | 56 | ||
55 | #include "buffer_head_io.h" | 57 | #include "buffer_head_io.h" |
56 | 58 | ||
@@ -68,6 +70,7 @@ struct ocfs2_mask_waiter { | |||
68 | static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres); | 70 | static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres); |
69 | static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres); | 71 | static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres); |
70 | static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres); | 72 | static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres); |
73 | static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres); | ||
71 | 74 | ||
72 | /* | 75 | /* |
73 | * Return value from ->downconvert_worker functions. | 76 | * Return value from ->downconvert_worker functions. |
@@ -102,6 +105,7 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres, | |||
102 | static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb, | 105 | static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb, |
103 | struct ocfs2_lock_res *lockres); | 106 | struct ocfs2_lock_res *lockres); |
104 | 107 | ||
108 | static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres); | ||
105 | 109 | ||
106 | #define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres) | 110 | #define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres) |
107 | 111 | ||
@@ -111,8 +115,7 @@ static void ocfs2_dump_meta_lvb_info(u64 level, | |||
111 | unsigned int line, | 115 | unsigned int line, |
112 | struct ocfs2_lock_res *lockres) | 116 | struct ocfs2_lock_res *lockres) |
113 | { | 117 | { |
114 | struct ocfs2_meta_lvb *lvb = | 118 | struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb); |
115 | (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); | ||
116 | 119 | ||
117 | mlog(level, "LVB information for %s (called from %s:%u):\n", | 120 | mlog(level, "LVB information for %s (called from %s:%u):\n", |
118 | lockres->l_name, function, line); | 121 | lockres->l_name, function, line); |
@@ -258,6 +261,12 @@ static struct ocfs2_lock_res_ops ocfs2_flock_lops = { | |||
258 | .flags = 0, | 261 | .flags = 0, |
259 | }; | 262 | }; |
260 | 263 | ||
264 | static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = { | ||
265 | .set_lvb = ocfs2_set_qinfo_lvb, | ||
266 | .get_osb = ocfs2_get_qinfo_osb, | ||
267 | .flags = LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB, | ||
268 | }; | ||
269 | |||
261 | static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) | 270 | static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) |
262 | { | 271 | { |
263 | return lockres->l_type == OCFS2_LOCK_TYPE_META || | 272 | return lockres->l_type == OCFS2_LOCK_TYPE_META || |
@@ -279,6 +288,13 @@ static inline struct ocfs2_dentry_lock *ocfs2_lock_res_dl(struct ocfs2_lock_res | |||
279 | return (struct ocfs2_dentry_lock *)lockres->l_priv; | 288 | return (struct ocfs2_dentry_lock *)lockres->l_priv; |
280 | } | 289 | } |
281 | 290 | ||
291 | static inline struct ocfs2_mem_dqinfo *ocfs2_lock_res_qinfo(struct ocfs2_lock_res *lockres) | ||
292 | { | ||
293 | BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_QINFO); | ||
294 | |||
295 | return (struct ocfs2_mem_dqinfo *)lockres->l_priv; | ||
296 | } | ||
297 | |||
282 | static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres) | 298 | static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres) |
283 | { | 299 | { |
284 | if (lockres->l_ops->get_osb) | 300 | if (lockres->l_ops->get_osb) |
@@ -507,6 +523,13 @@ static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres) | |||
507 | return OCFS2_SB(inode->i_sb); | 523 | return OCFS2_SB(inode->i_sb); |
508 | } | 524 | } |
509 | 525 | ||
526 | static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres) | ||
527 | { | ||
528 | struct ocfs2_mem_dqinfo *info = lockres->l_priv; | ||
529 | |||
530 | return OCFS2_SB(info->dqi_gi.dqi_sb); | ||
531 | } | ||
532 | |||
510 | static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres) | 533 | static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres) |
511 | { | 534 | { |
512 | struct ocfs2_file_private *fp = lockres->l_priv; | 535 | struct ocfs2_file_private *fp = lockres->l_priv; |
@@ -609,6 +632,17 @@ void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, | |||
609 | lockres->l_flags |= OCFS2_LOCK_NOCACHE; | 632 | lockres->l_flags |= OCFS2_LOCK_NOCACHE; |
610 | } | 633 | } |
611 | 634 | ||
635 | void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres, | ||
636 | struct ocfs2_mem_dqinfo *info) | ||
637 | { | ||
638 | ocfs2_lock_res_init_once(lockres); | ||
639 | ocfs2_build_lock_name(OCFS2_LOCK_TYPE_QINFO, info->dqi_gi.dqi_type, | ||
640 | 0, lockres->l_name); | ||
641 | ocfs2_lock_res_init_common(OCFS2_SB(info->dqi_gi.dqi_sb), lockres, | ||
642 | OCFS2_LOCK_TYPE_QINFO, &ocfs2_qinfo_lops, | ||
643 | info); | ||
644 | } | ||
645 | |||
612 | void ocfs2_lock_res_free(struct ocfs2_lock_res *res) | 646 | void ocfs2_lock_res_free(struct ocfs2_lock_res *res) |
613 | { | 647 | { |
614 | mlog_entry_void(); | 648 | mlog_entry_void(); |
@@ -1290,7 +1324,7 @@ again: | |||
1290 | goto out; | 1324 | goto out; |
1291 | } | 1325 | } |
1292 | 1326 | ||
1293 | mlog(0, "lock %s, successfull return from ocfs2_dlm_lock\n", | 1327 | mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n", |
1294 | lockres->l_name); | 1328 | lockres->l_name); |
1295 | 1329 | ||
1296 | /* At this point we've gone inside the dlm and need to | 1330 | /* At this point we've gone inside the dlm and need to |
@@ -1829,7 +1863,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode) | |||
1829 | 1863 | ||
1830 | mlog_entry_void(); | 1864 | mlog_entry_void(); |
1831 | 1865 | ||
1832 | lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); | 1866 | lvb = ocfs2_dlm_lvb(&lockres->l_lksb); |
1833 | 1867 | ||
1834 | /* | 1868 | /* |
1835 | * Invalidate the LVB of a deleted inode - this way other | 1869 | * Invalidate the LVB of a deleted inode - this way other |
@@ -1881,7 +1915,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) | |||
1881 | 1915 | ||
1882 | mlog_meta_lvb(0, lockres); | 1916 | mlog_meta_lvb(0, lockres); |
1883 | 1917 | ||
1884 | lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); | 1918 | lvb = ocfs2_dlm_lvb(&lockres->l_lksb); |
1885 | 1919 | ||
1886 | /* We're safe here without the lockres lock... */ | 1920 | /* We're safe here without the lockres lock... */ |
1887 | spin_lock(&oi->ip_lock); | 1921 | spin_lock(&oi->ip_lock); |
@@ -1916,8 +1950,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) | |||
1916 | static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode, | 1950 | static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode, |
1917 | struct ocfs2_lock_res *lockres) | 1951 | struct ocfs2_lock_res *lockres) |
1918 | { | 1952 | { |
1919 | struct ocfs2_meta_lvb *lvb = | 1953 | struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb); |
1920 | (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); | ||
1921 | 1954 | ||
1922 | if (lvb->lvb_version == OCFS2_LVB_VERSION | 1955 | if (lvb->lvb_version == OCFS2_LVB_VERSION |
1923 | && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation) | 1956 | && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation) |
@@ -2024,7 +2057,7 @@ static int ocfs2_inode_lock_update(struct inode *inode, | |||
2024 | } else { | 2057 | } else { |
2025 | /* Boo, we have to go to disk. */ | 2058 | /* Boo, we have to go to disk. */ |
2026 | /* read bh, cast, ocfs2_refresh_inode */ | 2059 | /* read bh, cast, ocfs2_refresh_inode */ |
2027 | status = ocfs2_read_block(inode, oi->ip_blkno, bh); | 2060 | status = ocfs2_read_inode_block(inode, bh); |
2028 | if (status < 0) { | 2061 | if (status < 0) { |
2029 | mlog_errno(status); | 2062 | mlog_errno(status); |
2030 | goto bail_refresh; | 2063 | goto bail_refresh; |
@@ -2032,18 +2065,14 @@ static int ocfs2_inode_lock_update(struct inode *inode, | |||
2032 | fe = (struct ocfs2_dinode *) (*bh)->b_data; | 2065 | fe = (struct ocfs2_dinode *) (*bh)->b_data; |
2033 | 2066 | ||
2034 | /* This is a good chance to make sure we're not | 2067 | /* This is a good chance to make sure we're not |
2035 | * locking an invalid object. | 2068 | * locking an invalid object. ocfs2_read_inode_block() |
2069 | * already checked that the inode block is sane. | ||
2036 | * | 2070 | * |
2037 | * We bug on a stale inode here because we checked | 2071 | * We bug on a stale inode here because we checked |
2038 | * above whether it was wiped from disk. The wiping | 2072 | * above whether it was wiped from disk. The wiping |
2039 | * node provides a guarantee that we receive that | 2073 | * node provides a guarantee that we receive that |
2040 | * message and can mark the inode before dropping any | 2074 | * message and can mark the inode before dropping any |
2041 | * locks associated with it. */ | 2075 | * locks associated with it. */ |
2042 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
2043 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); | ||
2044 | status = -EIO; | ||
2045 | goto bail_refresh; | ||
2046 | } | ||
2047 | mlog_bug_on_msg(inode->i_generation != | 2076 | mlog_bug_on_msg(inode->i_generation != |
2048 | le32_to_cpu(fe->i_generation), | 2077 | le32_to_cpu(fe->i_generation), |
2049 | "Invalid dinode %llu disk generation: %u " | 2078 | "Invalid dinode %llu disk generation: %u " |
@@ -2085,7 +2114,7 @@ static int ocfs2_assign_bh(struct inode *inode, | |||
2085 | return 0; | 2114 | return 0; |
2086 | } | 2115 | } |
2087 | 2116 | ||
2088 | status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, ret_bh); | 2117 | status = ocfs2_read_inode_block(inode, ret_bh); |
2089 | if (status < 0) | 2118 | if (status < 0) |
2090 | mlog_errno(status); | 2119 | mlog_errno(status); |
2091 | 2120 | ||
@@ -2922,7 +2951,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb, | |||
2922 | ocfs2_dlm_dump_lksb(&lockres->l_lksb); | 2951 | ocfs2_dlm_dump_lksb(&lockres->l_lksb); |
2923 | BUG(); | 2952 | BUG(); |
2924 | } | 2953 | } |
2925 | mlog(0, "lock %s, successfull return from ocfs2_dlm_unlock\n", | 2954 | mlog(0, "lock %s, successful return from ocfs2_dlm_unlock\n", |
2926 | lockres->l_name); | 2955 | lockres->l_name); |
2927 | 2956 | ||
2928 | ocfs2_wait_on_busy_lock(lockres); | 2957 | ocfs2_wait_on_busy_lock(lockres); |
@@ -3449,6 +3478,117 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres, | |||
3449 | return UNBLOCK_CONTINUE_POST; | 3478 | return UNBLOCK_CONTINUE_POST; |
3450 | } | 3479 | } |
3451 | 3480 | ||
3481 | static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres) | ||
3482 | { | ||
3483 | struct ocfs2_qinfo_lvb *lvb; | ||
3484 | struct ocfs2_mem_dqinfo *oinfo = ocfs2_lock_res_qinfo(lockres); | ||
3485 | struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb, | ||
3486 | oinfo->dqi_gi.dqi_type); | ||
3487 | |||
3488 | mlog_entry_void(); | ||
3489 | |||
3490 | lvb = ocfs2_dlm_lvb(&lockres->l_lksb); | ||
3491 | lvb->lvb_version = OCFS2_QINFO_LVB_VERSION; | ||
3492 | lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace); | ||
3493 | lvb->lvb_igrace = cpu_to_be32(info->dqi_igrace); | ||
3494 | lvb->lvb_syncms = cpu_to_be32(oinfo->dqi_syncms); | ||
3495 | lvb->lvb_blocks = cpu_to_be32(oinfo->dqi_gi.dqi_blocks); | ||
3496 | lvb->lvb_free_blk = cpu_to_be32(oinfo->dqi_gi.dqi_free_blk); | ||
3497 | lvb->lvb_free_entry = cpu_to_be32(oinfo->dqi_gi.dqi_free_entry); | ||
3498 | |||
3499 | mlog_exit_void(); | ||
3500 | } | ||
3501 | |||
3502 | void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex) | ||
3503 | { | ||
3504 | struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock; | ||
3505 | struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb); | ||
3506 | int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; | ||
3507 | |||
3508 | mlog_entry_void(); | ||
3509 | if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) | ||
3510 | ocfs2_cluster_unlock(osb, lockres, level); | ||
3511 | mlog_exit_void(); | ||
3512 | } | ||
3513 | |||
3514 | static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo) | ||
3515 | { | ||
3516 | struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb, | ||
3517 | oinfo->dqi_gi.dqi_type); | ||
3518 | struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock; | ||
3519 | struct ocfs2_qinfo_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb); | ||
3520 | struct buffer_head *bh = NULL; | ||
3521 | struct ocfs2_global_disk_dqinfo *gdinfo; | ||
3522 | int status = 0; | ||
3523 | |||
3524 | if (lvb->lvb_version == OCFS2_QINFO_LVB_VERSION) { | ||
3525 | info->dqi_bgrace = be32_to_cpu(lvb->lvb_bgrace); | ||
3526 | info->dqi_igrace = be32_to_cpu(lvb->lvb_igrace); | ||
3527 | oinfo->dqi_syncms = be32_to_cpu(lvb->lvb_syncms); | ||
3528 | oinfo->dqi_gi.dqi_blocks = be32_to_cpu(lvb->lvb_blocks); | ||
3529 | oinfo->dqi_gi.dqi_free_blk = be32_to_cpu(lvb->lvb_free_blk); | ||
3530 | oinfo->dqi_gi.dqi_free_entry = | ||
3531 | be32_to_cpu(lvb->lvb_free_entry); | ||
3532 | } else { | ||
3533 | status = ocfs2_read_quota_block(oinfo->dqi_gqinode, 0, &bh); | ||
3534 | if (status) { | ||
3535 | mlog_errno(status); | ||
3536 | goto bail; | ||
3537 | } | ||
3538 | gdinfo = (struct ocfs2_global_disk_dqinfo *) | ||
3539 | (bh->b_data + OCFS2_GLOBAL_INFO_OFF); | ||
3540 | info->dqi_bgrace = le32_to_cpu(gdinfo->dqi_bgrace); | ||
3541 | info->dqi_igrace = le32_to_cpu(gdinfo->dqi_igrace); | ||
3542 | oinfo->dqi_syncms = le32_to_cpu(gdinfo->dqi_syncms); | ||
3543 | oinfo->dqi_gi.dqi_blocks = le32_to_cpu(gdinfo->dqi_blocks); | ||
3544 | oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(gdinfo->dqi_free_blk); | ||
3545 | oinfo->dqi_gi.dqi_free_entry = | ||
3546 | le32_to_cpu(gdinfo->dqi_free_entry); | ||
3547 | brelse(bh); | ||
3548 | ocfs2_track_lock_refresh(lockres); | ||
3549 | } | ||
3550 | |||
3551 | bail: | ||
3552 | return status; | ||
3553 | } | ||
3554 | |||
3555 | /* Lock quota info. This function expects at least a shared lock on the quota file | ||
3556 | * so that we can safely refresh quota info from disk. */ | ||
3557 | int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex) | ||
3558 | { | ||
3559 | struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock; | ||
3560 | struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb); | ||
3561 | int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; | ||
3562 | int status = 0; | ||
3563 | |||
3564 | mlog_entry_void(); | ||
3565 | |||
3566 | /* On RO devices, locking really isn't needed... */ | ||
3567 | if (ocfs2_is_hard_readonly(osb)) { | ||
3568 | if (ex) | ||
3569 | status = -EROFS; | ||
3570 | goto bail; | ||
3571 | } | ||
3572 | if (ocfs2_mount_local(osb)) | ||
3573 | goto bail; | ||
3574 | |||
3575 | status = ocfs2_cluster_lock(osb, lockres, level, 0, 0); | ||
3576 | if (status < 0) { | ||
3577 | mlog_errno(status); | ||
3578 | goto bail; | ||
3579 | } | ||
3580 | if (!ocfs2_should_refresh_lock_res(lockres)) | ||
3581 | goto bail; | ||
3582 | /* OK, we have the lock but we need to refresh the quota info */ | ||
3583 | status = ocfs2_refresh_qinfo(oinfo); | ||
3584 | if (status) | ||
3585 | ocfs2_qinfo_unlock(oinfo, ex); | ||
3586 | ocfs2_complete_lock_res_refresh(lockres, status); | ||
3587 | bail: | ||
3588 | mlog_exit(status); | ||
3589 | return status; | ||
3590 | } | ||
3591 | |||
3452 | /* | 3592 | /* |
3453 | * This is the filesystem locking protocol. It provides the lock handling | 3593 | * This is the filesystem locking protocol. It provides the lock handling |
3454 | * hooks for the underlying DLM. It has a maximum version number. | 3594 | * hooks for the underlying DLM. It has a maximum version number. |
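The quota-info lock added above follows the same LVB caching pattern as the inode metadata lock: if the lock value block is stale, the info is re-read from the global quota file, otherwise it is taken straight from the LVB. A condensed sketch of how a caller would use the new helpers (the function name is hypothetical, error handling trimmed):

/* Hypothetical reader of the cluster-wide quota info for one type.
 * Assumes dqi_gqlock was set up with ocfs2_qinfo_lock_res_init(). */
static int example_read_quota_info(struct ocfs2_mem_dqinfo *oinfo)
{
	int status;

	/* A PR lock is enough for reading; pass ex = 1 when the grace
	 * times or the quota file layout are going to change. */
	status = ocfs2_qinfo_lock(oinfo, 0);
	if (status < 0)
		return status;

	/* dqi_bgrace, dqi_igrace, dqi_blocks, ... are now current,
	 * either copied from the LVB or refreshed from disk. */

	ocfs2_qinfo_unlock(oinfo, 0);
	return 0;
}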
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index 2bb01f09c1b1..3f8d9986b8e0 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h | |||
@@ -49,6 +49,19 @@ struct ocfs2_meta_lvb { | |||
49 | __be32 lvb_reserved2; | 49 | __be32 lvb_reserved2; |
50 | }; | 50 | }; |
51 | 51 | ||
52 | #define OCFS2_QINFO_LVB_VERSION 1 | ||
53 | |||
54 | struct ocfs2_qinfo_lvb { | ||
55 | __u8 lvb_version; | ||
56 | __u8 lvb_reserved[3]; | ||
57 | __be32 lvb_bgrace; | ||
58 | __be32 lvb_igrace; | ||
59 | __be32 lvb_syncms; | ||
60 | __be32 lvb_blocks; | ||
61 | __be32 lvb_free_blk; | ||
62 | __be32 lvb_free_entry; | ||
63 | }; | ||
64 | |||
52 | /* ocfs2_inode_lock_full() 'arg_flags' flags */ | 65 | /* ocfs2_inode_lock_full() 'arg_flags' flags */ |
53 | /* don't wait on recovery. */ | 66 | /* don't wait on recovery. */ |
54 | #define OCFS2_META_LOCK_RECOVERY (0x01) | 67 | #define OCFS2_META_LOCK_RECOVERY (0x01) |
@@ -69,6 +82,9 @@ void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl, | |||
69 | struct ocfs2_file_private; | 82 | struct ocfs2_file_private; |
70 | void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, | 83 | void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, |
71 | struct ocfs2_file_private *fp); | 84 | struct ocfs2_file_private *fp); |
85 | struct ocfs2_mem_dqinfo; | ||
86 | void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres, | ||
87 | struct ocfs2_mem_dqinfo *info); | ||
72 | void ocfs2_lock_res_free(struct ocfs2_lock_res *res); | 88 | void ocfs2_lock_res_free(struct ocfs2_lock_res *res); |
73 | int ocfs2_create_new_inode_locks(struct inode *inode); | 89 | int ocfs2_create_new_inode_locks(struct inode *inode); |
74 | int ocfs2_drop_inode_locks(struct inode *inode); | 90 | int ocfs2_drop_inode_locks(struct inode *inode); |
@@ -103,6 +119,9 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex); | |||
103 | void ocfs2_dentry_unlock(struct dentry *dentry, int ex); | 119 | void ocfs2_dentry_unlock(struct dentry *dentry, int ex); |
104 | int ocfs2_file_lock(struct file *file, int ex, int trylock); | 120 | int ocfs2_file_lock(struct file *file, int ex, int trylock); |
105 | void ocfs2_file_unlock(struct file *file); | 121 | void ocfs2_file_unlock(struct file *file); |
122 | int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex); | ||
123 | void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex); | ||
124 | |||
106 | 125 | ||
107 | void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); | 126 | void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); |
108 | void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, | 127 | void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, |
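One constraint worth noting on struct ocfs2_qinfo_lvb: it has to fit inside the DLM lock value block, which is 64 bytes for the o2cb stack (the size is an assumption here, not something stated in this hunk). A compile-time check in that spirit:

static inline void ocfs2_qinfo_lvb_fits(void)
{
	/* 1 + 3 + 6 * 4 = 28 bytes, well inside an assumed 64-byte LVB */
	BUILD_BUG_ON(sizeof(struct ocfs2_qinfo_lvb) > 64);
}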
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 2baedac58234..f2bb1a04d253 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c | |||
@@ -293,7 +293,7 @@ static int ocfs2_last_eb_is_empty(struct inode *inode, | |||
293 | struct ocfs2_extent_block *eb; | 293 | struct ocfs2_extent_block *eb; |
294 | struct ocfs2_extent_list *el; | 294 | struct ocfs2_extent_list *el; |
295 | 295 | ||
296 | ret = ocfs2_read_block(inode, last_eb_blk, &eb_bh); | 296 | ret = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh); |
297 | if (ret) { | 297 | if (ret) { |
298 | mlog_errno(ret); | 298 | mlog_errno(ret); |
299 | goto out; | 299 | goto out; |
@@ -302,12 +302,6 @@ static int ocfs2_last_eb_is_empty(struct inode *inode, | |||
302 | eb = (struct ocfs2_extent_block *) eb_bh->b_data; | 302 | eb = (struct ocfs2_extent_block *) eb_bh->b_data; |
303 | el = &eb->h_list; | 303 | el = &eb->h_list; |
304 | 304 | ||
305 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { | ||
306 | ret = -EROFS; | ||
307 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); | ||
308 | goto out; | ||
309 | } | ||
310 | |||
311 | if (el->l_tree_depth) { | 305 | if (el->l_tree_depth) { |
312 | ocfs2_error(inode->i_sb, | 306 | ocfs2_error(inode->i_sb, |
313 | "Inode %lu has non zero tree depth in " | 307 | "Inode %lu has non zero tree depth in " |
@@ -381,23 +375,16 @@ static int ocfs2_figure_hole_clusters(struct inode *inode, | |||
381 | if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) | 375 | if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) |
382 | goto no_more_extents; | 376 | goto no_more_extents; |
383 | 377 | ||
384 | ret = ocfs2_read_block(inode, | 378 | ret = ocfs2_read_extent_block(inode, |
385 | le64_to_cpu(eb->h_next_leaf_blk), | 379 | le64_to_cpu(eb->h_next_leaf_blk), |
386 | &next_eb_bh); | 380 | &next_eb_bh); |
387 | if (ret) { | 381 | if (ret) { |
388 | mlog_errno(ret); | 382 | mlog_errno(ret); |
389 | goto out; | 383 | goto out; |
390 | } | 384 | } |
391 | next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data; | ||
392 | |||
393 | if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) { | ||
394 | ret = -EROFS; | ||
395 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb); | ||
396 | goto out; | ||
397 | } | ||
398 | 385 | ||
386 | next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data; | ||
399 | el = &next_eb->h_list; | 387 | el = &next_eb->h_list; |
400 | |||
401 | i = ocfs2_search_for_hole_index(el, v_cluster); | 388 | i = ocfs2_search_for_hole_index(el, v_cluster); |
402 | } | 389 | } |
403 | 390 | ||
@@ -630,7 +617,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, | |||
630 | if (ret == 0) | 617 | if (ret == 0) |
631 | goto out; | 618 | goto out; |
632 | 619 | ||
633 | ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh); | 620 | ret = ocfs2_read_inode_block(inode, &di_bh); |
634 | if (ret) { | 621 | if (ret) { |
635 | mlog_errno(ret); | 622 | mlog_errno(ret); |
636 | goto out; | 623 | goto out; |
@@ -819,3 +806,74 @@ out: | |||
819 | 806 | ||
820 | return ret; | 807 | return ret; |
821 | } | 808 | } |
809 | |||
810 | int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, | ||
811 | struct buffer_head *bhs[], int flags, | ||
812 | int (*validate)(struct super_block *sb, | ||
813 | struct buffer_head *bh)) | ||
814 | { | ||
815 | int rc = 0; | ||
816 | u64 p_block, p_count; | ||
817 | int i, count, done = 0; | ||
818 | |||
819 | mlog_entry("(inode = %p, v_block = %llu, nr = %d, bhs = %p, " | ||
820 | "flags = %x, validate = %p)\n", | ||
821 | inode, (unsigned long long)v_block, nr, bhs, flags, | ||
822 | validate); | ||
823 | |||
824 | if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >= | ||
825 | i_size_read(inode)) { | ||
826 | BUG_ON(!(flags & OCFS2_BH_READAHEAD)); | ||
827 | goto out; | ||
828 | } | ||
829 | |||
830 | while (done < nr) { | ||
831 | down_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
832 | rc = ocfs2_extent_map_get_blocks(inode, v_block + done, | ||
833 | &p_block, &p_count, NULL); | ||
834 | up_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
835 | if (rc) { | ||
836 | mlog_errno(rc); | ||
837 | break; | ||
838 | } | ||
839 | |||
840 | if (!p_block) { | ||
841 | rc = -EIO; | ||
842 | mlog(ML_ERROR, | ||
843 | "Inode #%llu contains a hole at offset %llu\n", | ||
844 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
845 | (unsigned long long)(v_block + done) << | ||
846 | inode->i_sb->s_blocksize_bits); | ||
847 | break; | ||
848 | } | ||
849 | |||
850 | count = nr - done; | ||
851 | if (p_count < count) | ||
852 | count = p_count; | ||
853 | |||
854 | /* | ||
855 | * If the caller passed us bhs, they should have come | ||
856 | * from a previous readahead call to this function. Thus, | ||
857 | * they should have the right b_blocknr. | ||
858 | */ | ||
859 | for (i = 0; i < count; i++) { | ||
860 | if (!bhs[done + i]) | ||
861 | continue; | ||
862 | BUG_ON(bhs[done + i]->b_blocknr != (p_block + i)); | ||
863 | } | ||
864 | |||
865 | rc = ocfs2_read_blocks(inode, p_block, count, bhs + done, | ||
866 | flags, validate); | ||
867 | if (rc) { | ||
868 | mlog_errno(rc); | ||
869 | break; | ||
870 | } | ||
871 | done += count; | ||
872 | } | ||
873 | |||
874 | out: | ||
875 | mlog_exit(rc); | ||
876 | return rc; | ||
877 | } | ||
878 | |||
879 | |||
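ocfs2_read_virt_blocks() maps a run of file-relative blocks through the extent map and hands each contiguous physical run to ocfs2_read_blocks() with the caller's validator; holes produce -EIO and only readahead is allowed to reach past i_size. A hedged usage sketch (the wrapper name is invented, everything it calls comes from the hunk above):

/* Read 'nr' logical blocks starting at v_start, validated.  The caller
 * must stay inside i_size for the blocking pass. */
static int example_read_virt_run(struct inode *inode, u64 v_start, int nr,
				 struct buffer_head *bhs[],
				 int (*validate)(struct super_block *sb,
						 struct buffer_head *bh))
{
	/* Optional readahead pass; errors are ignored on purpose, the
	 * blocking pass below will report them. */
	ocfs2_read_virt_blocks(inode, v_start, nr, bhs,
			       OCFS2_BH_READAHEAD, validate);

	/* Blocking, validated pass.  Buffer heads attached by the
	 * readahead pass are reused (their b_blocknr must match). */
	return ocfs2_read_virt_blocks(inode, v_start, nr, bhs, 0, validate);
}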
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h index 1c4aa8b06f34..b7dd9731b462 100644 --- a/fs/ocfs2/extent_map.h +++ b/fs/ocfs2/extent_map.h | |||
@@ -57,4 +57,28 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, | |||
57 | u32 *p_cluster, u32 *num_clusters, | 57 | u32 *p_cluster, u32 *num_clusters, |
58 | struct ocfs2_extent_list *el); | 58 | struct ocfs2_extent_list *el); |
59 | 59 | ||
60 | int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, | ||
61 | struct buffer_head *bhs[], int flags, | ||
62 | int (*validate)(struct super_block *sb, | ||
63 | struct buffer_head *bh)); | ||
64 | static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block, | ||
65 | struct buffer_head **bh, | ||
66 | int (*validate)(struct super_block *sb, | ||
67 | struct buffer_head *bh)) | ||
68 | { | ||
69 | int status = 0; | ||
70 | |||
71 | if (bh == NULL) { | ||
72 | printk("ocfs2: bh == NULL\n"); | ||
73 | status = -EINVAL; | ||
74 | goto bail; | ||
75 | } | ||
76 | |||
77 | status = ocfs2_read_virt_blocks(inode, v_block, 1, bh, 0, validate); | ||
78 | |||
79 | bail: | ||
80 | return status; | ||
81 | } | ||
82 | |||
83 | |||
60 | #endif /* _EXTENT_MAP_H */ | 84 | #endif /* _EXTENT_MAP_H */ |
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index e2570a3bc2b2..a5887df2cd8a 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
@@ -35,6 +35,7 @@ | |||
35 | #include <linux/mount.h> | 35 | #include <linux/mount.h> |
36 | #include <linux/writeback.h> | 36 | #include <linux/writeback.h> |
37 | #include <linux/falloc.h> | 37 | #include <linux/falloc.h> |
38 | #include <linux/quotaops.h> | ||
38 | 39 | ||
39 | #define MLOG_MASK_PREFIX ML_INODE | 40 | #define MLOG_MASK_PREFIX ML_INODE |
40 | #include <cluster/masklog.h> | 41 | #include <cluster/masklog.h> |
@@ -56,6 +57,8 @@ | |||
56 | #include "suballoc.h" | 57 | #include "suballoc.h" |
57 | #include "super.h" | 58 | #include "super.h" |
58 | #include "xattr.h" | 59 | #include "xattr.h" |
60 | #include "acl.h" | ||
61 | #include "quota.h" | ||
59 | 62 | ||
60 | #include "buffer_head_io.h" | 63 | #include "buffer_head_io.h" |
61 | 64 | ||
@@ -253,8 +256,8 @@ int ocfs2_update_inode_atime(struct inode *inode, | |||
253 | goto out; | 256 | goto out; |
254 | } | 257 | } |
255 | 258 | ||
256 | ret = ocfs2_journal_access(handle, inode, bh, | 259 | ret = ocfs2_journal_access_di(handle, inode, bh, |
257 | OCFS2_JOURNAL_ACCESS_WRITE); | 260 | OCFS2_JOURNAL_ACCESS_WRITE); |
258 | if (ret) { | 261 | if (ret) { |
259 | mlog_errno(ret); | 262 | mlog_errno(ret); |
260 | goto out_commit; | 263 | goto out_commit; |
@@ -303,9 +306,9 @@ bail: | |||
303 | return status; | 306 | return status; |
304 | } | 307 | } |
305 | 308 | ||
306 | static int ocfs2_simple_size_update(struct inode *inode, | 309 | int ocfs2_simple_size_update(struct inode *inode, |
307 | struct buffer_head *di_bh, | 310 | struct buffer_head *di_bh, |
308 | u64 new_i_size) | 311 | u64 new_i_size) |
309 | { | 312 | { |
310 | int ret; | 313 | int ret; |
311 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 314 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
@@ -350,8 +353,8 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, | |||
350 | goto out; | 353 | goto out; |
351 | } | 354 | } |
352 | 355 | ||
353 | status = ocfs2_journal_access(handle, inode, fe_bh, | 356 | status = ocfs2_journal_access_di(handle, inode, fe_bh, |
354 | OCFS2_JOURNAL_ACCESS_WRITE); | 357 | OCFS2_JOURNAL_ACCESS_WRITE); |
355 | if (status < 0) { | 358 | if (status < 0) { |
356 | mlog_errno(status); | 359 | mlog_errno(status); |
357 | goto out_commit; | 360 | goto out_commit; |
@@ -401,12 +404,9 @@ static int ocfs2_truncate_file(struct inode *inode, | |||
401 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 404 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
402 | (unsigned long long)new_i_size); | 405 | (unsigned long long)new_i_size); |
403 | 406 | ||
407 | /* We trust di_bh because it comes from ocfs2_inode_lock(), which | ||
408 | * already validated it */ | ||
404 | fe = (struct ocfs2_dinode *) di_bh->b_data; | 409 | fe = (struct ocfs2_dinode *) di_bh->b_data; |
405 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
406 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); | ||
407 | status = -EIO; | ||
408 | goto bail; | ||
409 | } | ||
410 | 410 | ||
411 | mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), | 411 | mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), |
412 | "Inode %llu, inode i_size = %lld != di " | 412 | "Inode %llu, inode i_size = %lld != di " |
@@ -536,6 +536,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, | |||
536 | enum ocfs2_alloc_restarted why; | 536 | enum ocfs2_alloc_restarted why; |
537 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 537 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
538 | struct ocfs2_extent_tree et; | 538 | struct ocfs2_extent_tree et; |
539 | int did_quota = 0; | ||
539 | 540 | ||
540 | mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); | 541 | mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); |
541 | 542 | ||
@@ -545,18 +546,12 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, | |||
545 | */ | 546 | */ |
546 | BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); | 547 | BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); |
547 | 548 | ||
548 | status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); | 549 | status = ocfs2_read_inode_block(inode, &bh); |
549 | if (status < 0) { | 550 | if (status < 0) { |
550 | mlog_errno(status); | 551 | mlog_errno(status); |
551 | goto leave; | 552 | goto leave; |
552 | } | 553 | } |
553 | |||
554 | fe = (struct ocfs2_dinode *) bh->b_data; | 554 | fe = (struct ocfs2_dinode *) bh->b_data; |
555 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
556 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); | ||
557 | status = -EIO; | ||
558 | goto leave; | ||
559 | } | ||
560 | 555 | ||
561 | restart_all: | 556 | restart_all: |
562 | BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); | 557 | BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); |
@@ -585,11 +580,18 @@ restart_all: | |||
585 | } | 580 | } |
586 | 581 | ||
587 | restarted_transaction: | 582 | restarted_transaction: |
583 | if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, | ||
584 | clusters_to_add))) { | ||
585 | status = -EDQUOT; | ||
586 | goto leave; | ||
587 | } | ||
588 | did_quota = 1; | ||
589 | |||
588 | /* reserve a write to the file entry early on - that way if we | 590 | /* reserve a write to the file entry early on - that way if we |
589 | * run out of credits in the allocation path, we can still | 591 | * run out of credits in the allocation path, we can still |
590 | * update i_size. */ | 592 | * update i_size. */ |
591 | status = ocfs2_journal_access(handle, inode, bh, | 593 | status = ocfs2_journal_access_di(handle, inode, bh, |
592 | OCFS2_JOURNAL_ACCESS_WRITE); | 594 | OCFS2_JOURNAL_ACCESS_WRITE); |
593 | if (status < 0) { | 595 | if (status < 0) { |
594 | mlog_errno(status); | 596 | mlog_errno(status); |
595 | goto leave; | 597 | goto leave; |
@@ -622,6 +624,10 @@ restarted_transaction: | |||
622 | spin_lock(&OCFS2_I(inode)->ip_lock); | 624 | spin_lock(&OCFS2_I(inode)->ip_lock); |
623 | clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); | 625 | clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); |
624 | spin_unlock(&OCFS2_I(inode)->ip_lock); | 626 | spin_unlock(&OCFS2_I(inode)->ip_lock); |
627 | /* Release unused quota reservation */ | ||
628 | vfs_dq_free_space(inode, | ||
629 | ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); | ||
630 | did_quota = 0; | ||
625 | 631 | ||
626 | if (why != RESTART_NONE && clusters_to_add) { | 632 | if (why != RESTART_NONE && clusters_to_add) { |
627 | if (why == RESTART_META) { | 633 | if (why == RESTART_META) { |
@@ -654,6 +660,9 @@ restarted_transaction: | |||
654 | OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode)); | 660 | OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode)); |
655 | 661 | ||
656 | leave: | 662 | leave: |
663 | if (status < 0 && did_quota) | ||
664 | vfs_dq_free_space(inode, | ||
665 | ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); | ||
657 | if (handle) { | 666 | if (handle) { |
658 | ocfs2_commit_trans(osb, handle); | 667 | ocfs2_commit_trans(osb, handle); |
659 | handle = NULL; | 668 | handle = NULL; |
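The extend-allocation path now brackets the cluster allocation with a quota reservation: claim the worst case with vfs_dq_alloc_space_nodirty() before the transaction, hand back the unused part once the real allocation is known, and return everything on error. The same pattern in isolation (the allocation helper is hypothetical):

static int example_alloc_with_quota(struct inode *inode, u64 want_bytes,
				    u64 *got_bytes)
{
	int status;

	if (vfs_dq_alloc_space_nodirty(inode, want_bytes))
		return -EDQUOT;		/* over quota, nothing to undo */

	status = do_the_allocation(inode, want_bytes, got_bytes);  /* hypothetical */
	if (status < 0) {
		/* nothing was consumed, roll back the whole reservation */
		vfs_dq_free_space(inode, want_bytes);
		return status;
	}

	/* keep only what was actually allocated */
	if (*got_bytes < want_bytes)
		vfs_dq_free_space(inode, want_bytes - *got_bytes);
	return 0;
}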
@@ -885,6 +894,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) | |||
885 | struct ocfs2_super *osb = OCFS2_SB(sb); | 894 | struct ocfs2_super *osb = OCFS2_SB(sb); |
886 | struct buffer_head *bh = NULL; | 895 | struct buffer_head *bh = NULL; |
887 | handle_t *handle = NULL; | 896 | handle_t *handle = NULL; |
897 | int locked[MAXQUOTAS] = {0, 0}; | ||
898 | int credits, qtype; | ||
899 | struct ocfs2_mem_dqinfo *oinfo; | ||
888 | 900 | ||
889 | mlog_entry("(0x%p, '%.*s')\n", dentry, | 901 | mlog_entry("(0x%p, '%.*s')\n", dentry, |
890 | dentry->d_name.len, dentry->d_name.name); | 902 | dentry->d_name.len, dentry->d_name.name); |
@@ -955,11 +967,47 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) | |||
955 | } | 967 | } |
956 | } | 968 | } |
957 | 969 | ||
958 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | 970 | if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || |
959 | if (IS_ERR(handle)) { | 971 | (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { |
960 | status = PTR_ERR(handle); | 972 | credits = OCFS2_INODE_UPDATE_CREDITS; |
961 | mlog_errno(status); | 973 | if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid |
962 | goto bail_unlock; | 974 | && OCFS2_HAS_RO_COMPAT_FEATURE(sb, |
975 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { | ||
976 | oinfo = sb_dqinfo(sb, USRQUOTA)->dqi_priv; | ||
977 | status = ocfs2_lock_global_qf(oinfo, 1); | ||
978 | if (status < 0) | ||
979 | goto bail_unlock; | ||
980 | credits += ocfs2_calc_qinit_credits(sb, USRQUOTA) + | ||
981 | ocfs2_calc_qdel_credits(sb, USRQUOTA); | ||
982 | locked[USRQUOTA] = 1; | ||
983 | } | ||
984 | if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid | ||
985 | && OCFS2_HAS_RO_COMPAT_FEATURE(sb, | ||
986 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { | ||
987 | oinfo = sb_dqinfo(sb, GRPQUOTA)->dqi_priv; | ||
988 | status = ocfs2_lock_global_qf(oinfo, 1); | ||
989 | if (status < 0) | ||
990 | goto bail_unlock; | ||
991 | credits += ocfs2_calc_qinit_credits(sb, GRPQUOTA) + | ||
992 | ocfs2_calc_qdel_credits(sb, GRPQUOTA); | ||
993 | locked[GRPQUOTA] = 1; | ||
994 | } | ||
995 | handle = ocfs2_start_trans(osb, credits); | ||
996 | if (IS_ERR(handle)) { | ||
997 | status = PTR_ERR(handle); | ||
998 | mlog_errno(status); | ||
999 | goto bail_unlock; | ||
1000 | } | ||
1001 | status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; | ||
1002 | if (status < 0) | ||
1003 | goto bail_commit; | ||
1004 | } else { | ||
1005 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | ||
1006 | if (IS_ERR(handle)) { | ||
1007 | status = PTR_ERR(handle); | ||
1008 | mlog_errno(status); | ||
1009 | goto bail_unlock; | ||
1010 | } | ||
963 | } | 1011 | } |
964 | 1012 | ||
965 | /* | 1013 | /* |
@@ -982,6 +1030,12 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) | |||
982 | bail_commit: | 1030 | bail_commit: |
983 | ocfs2_commit_trans(osb, handle); | 1031 | ocfs2_commit_trans(osb, handle); |
984 | bail_unlock: | 1032 | bail_unlock: |
1033 | for (qtype = 0; qtype < MAXQUOTAS; qtype++) { | ||
1034 | if (!locked[qtype]) | ||
1035 | continue; | ||
1036 | oinfo = sb_dqinfo(sb, qtype)->dqi_priv; | ||
1037 | ocfs2_unlock_global_qf(oinfo, 1); | ||
1038 | } | ||
985 | ocfs2_inode_unlock(inode, 1); | 1039 | ocfs2_inode_unlock(inode, 1); |
986 | bail_unlock_rw: | 1040 | bail_unlock_rw: |
987 | if (size_change) | 1041 | if (size_change) |
@@ -989,6 +1043,12 @@ bail_unlock_rw: | |||
989 | bail: | 1043 | bail: |
990 | brelse(bh); | 1044 | brelse(bh); |
991 | 1045 | ||
1046 | if (!status && attr->ia_valid & ATTR_MODE) { | ||
1047 | status = ocfs2_acl_chmod(inode); | ||
1048 | if (status < 0) | ||
1049 | mlog_errno(status); | ||
1050 | } | ||
1051 | |||
992 | mlog_exit(status); | 1052 | mlog_exit(status); |
993 | return status; | 1053 | return status; |
994 | } | 1054 | } |
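For a chown/chgrp, ocfs2_setattr() now has to take the global quota file lock and reserve the extra journal credits for moving the inode between dquots before the transaction starts, and only then call vfs_dq_transfer() inside the handle. The per-type preparation, reduced to a sketch (helper name hypothetical):

/* Work ocfs2_setattr() does per quota type before ocfs2_start_trans()
 * when the owner is changing and that quota type is enabled. */
static int example_chown_quota_prep(struct super_block *sb, int type,
				    int *credits)
{
	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
	int status;

	/* The global quota file lock is taken before the transaction,
	 * matching the ordering in the hunk above. */
	status = ocfs2_lock_global_qf(oinfo, 1);
	if (status < 0)
		return status;

	/* Worst case: init a dquot for the new id, drop the old one. */
	*credits += ocfs2_calc_qinit_credits(sb, type) +
		    ocfs2_calc_qdel_credits(sb, type);
	return 0;
}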
@@ -1035,7 +1095,7 @@ int ocfs2_permission(struct inode *inode, int mask) | |||
1035 | goto out; | 1095 | goto out; |
1036 | } | 1096 | } |
1037 | 1097 | ||
1038 | ret = generic_permission(inode, mask, NULL); | 1098 | ret = generic_permission(inode, mask, ocfs2_check_acl); |
1039 | 1099 | ||
1040 | ocfs2_inode_unlock(inode, 0); | 1100 | ocfs2_inode_unlock(inode, 0); |
1041 | out: | 1101 | out: |
@@ -1061,8 +1121,8 @@ static int __ocfs2_write_remove_suid(struct inode *inode, | |||
1061 | goto out; | 1121 | goto out; |
1062 | } | 1122 | } |
1063 | 1123 | ||
1064 | ret = ocfs2_journal_access(handle, inode, bh, | 1124 | ret = ocfs2_journal_access_di(handle, inode, bh, |
1065 | OCFS2_JOURNAL_ACCESS_WRITE); | 1125 | OCFS2_JOURNAL_ACCESS_WRITE); |
1066 | if (ret < 0) { | 1126 | if (ret < 0) { |
1067 | mlog_errno(ret); | 1127 | mlog_errno(ret); |
1068 | goto out_trans; | 1128 | goto out_trans; |
@@ -1128,9 +1188,8 @@ static int ocfs2_write_remove_suid(struct inode *inode) | |||
1128 | { | 1188 | { |
1129 | int ret; | 1189 | int ret; |
1130 | struct buffer_head *bh = NULL; | 1190 | struct buffer_head *bh = NULL; |
1131 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | ||
1132 | 1191 | ||
1133 | ret = ocfs2_read_block(inode, oi->ip_blkno, &bh); | 1192 | ret = ocfs2_read_inode_block(inode, &bh); |
1134 | if (ret < 0) { | 1193 | if (ret < 0) { |
1135 | mlog_errno(ret); | 1194 | mlog_errno(ret); |
1136 | goto out; | 1195 | goto out; |
@@ -1156,8 +1215,7 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode, | |||
1156 | struct buffer_head *di_bh = NULL; | 1215 | struct buffer_head *di_bh = NULL; |
1157 | 1216 | ||
1158 | if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { | 1217 | if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { |
1159 | ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, | 1218 | ret = ocfs2_read_inode_block(inode, &di_bh); |
1160 | &di_bh); | ||
1161 | if (ret) { | 1219 | if (ret) { |
1162 | mlog_errno(ret); | 1220 | mlog_errno(ret); |
1163 | goto out; | 1221 | goto out; |
@@ -1226,83 +1284,6 @@ out: | |||
1226 | return ret; | 1284 | return ret; |
1227 | } | 1285 | } |
1228 | 1286 | ||
1229 | static int __ocfs2_remove_inode_range(struct inode *inode, | ||
1230 | struct buffer_head *di_bh, | ||
1231 | u32 cpos, u32 phys_cpos, u32 len, | ||
1232 | struct ocfs2_cached_dealloc_ctxt *dealloc) | ||
1233 | { | ||
1234 | int ret; | ||
1235 | u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); | ||
1236 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1237 | struct inode *tl_inode = osb->osb_tl_inode; | ||
1238 | handle_t *handle; | ||
1239 | struct ocfs2_alloc_context *meta_ac = NULL; | ||
1240 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; | ||
1241 | struct ocfs2_extent_tree et; | ||
1242 | |||
1243 | ocfs2_init_dinode_extent_tree(&et, inode, di_bh); | ||
1244 | |||
1245 | ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac); | ||
1246 | if (ret) { | ||
1247 | mlog_errno(ret); | ||
1248 | return ret; | ||
1249 | } | ||
1250 | |||
1251 | mutex_lock(&tl_inode->i_mutex); | ||
1252 | |||
1253 | if (ocfs2_truncate_log_needs_flush(osb)) { | ||
1254 | ret = __ocfs2_flush_truncate_log(osb); | ||
1255 | if (ret < 0) { | ||
1256 | mlog_errno(ret); | ||
1257 | goto out; | ||
1258 | } | ||
1259 | } | ||
1260 | |||
1261 | handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS); | ||
1262 | if (IS_ERR(handle)) { | ||
1263 | ret = PTR_ERR(handle); | ||
1264 | mlog_errno(ret); | ||
1265 | goto out; | ||
1266 | } | ||
1267 | |||
1268 | ret = ocfs2_journal_access(handle, inode, di_bh, | ||
1269 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1270 | if (ret) { | ||
1271 | mlog_errno(ret); | ||
1272 | goto out; | ||
1273 | } | ||
1274 | |||
1275 | ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac, | ||
1276 | dealloc); | ||
1277 | if (ret) { | ||
1278 | mlog_errno(ret); | ||
1279 | goto out_commit; | ||
1280 | } | ||
1281 | |||
1282 | OCFS2_I(inode)->ip_clusters -= len; | ||
1283 | di->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters); | ||
1284 | |||
1285 | ret = ocfs2_journal_dirty(handle, di_bh); | ||
1286 | if (ret) { | ||
1287 | mlog_errno(ret); | ||
1288 | goto out_commit; | ||
1289 | } | ||
1290 | |||
1291 | ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); | ||
1292 | if (ret) | ||
1293 | mlog_errno(ret); | ||
1294 | |||
1295 | out_commit: | ||
1296 | ocfs2_commit_trans(osb, handle); | ||
1297 | out: | ||
1298 | mutex_unlock(&tl_inode->i_mutex); | ||
1299 | |||
1300 | if (meta_ac) | ||
1301 | ocfs2_free_alloc_context(meta_ac); | ||
1302 | |||
1303 | return ret; | ||
1304 | } | ||
1305 | |||
1306 | /* | 1287 | /* |
1307 | * Truncate a byte range, avoiding pages within partial clusters. This | 1288 | * Truncate a byte range, avoiding pages within partial clusters. This |
1308 | * preserves those pages for the zeroing code to write to. | 1289 | * preserves those pages for the zeroing code to write to. |
@@ -1402,7 +1383,9 @@ static int ocfs2_remove_inode_range(struct inode *inode, | |||
1402 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 1383 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
1403 | struct ocfs2_cached_dealloc_ctxt dealloc; | 1384 | struct ocfs2_cached_dealloc_ctxt dealloc; |
1404 | struct address_space *mapping = inode->i_mapping; | 1385 | struct address_space *mapping = inode->i_mapping; |
1386 | struct ocfs2_extent_tree et; | ||
1405 | 1387 | ||
1388 | ocfs2_init_dinode_extent_tree(&et, inode, di_bh); | ||
1406 | ocfs2_init_dealloc_ctxt(&dealloc); | 1389 | ocfs2_init_dealloc_ctxt(&dealloc); |
1407 | 1390 | ||
1408 | if (byte_len == 0) | 1391 | if (byte_len == 0) |
@@ -1458,9 +1441,9 @@ static int ocfs2_remove_inode_range(struct inode *inode, | |||
1458 | 1441 | ||
1459 | /* Only do work for non-holes */ | 1442 | /* Only do work for non-holes */ |
1460 | if (phys_cpos != 0) { | 1443 | if (phys_cpos != 0) { |
1461 | ret = __ocfs2_remove_inode_range(inode, di_bh, cpos, | 1444 | ret = ocfs2_remove_btree_range(inode, &et, cpos, |
1462 | phys_cpos, alloc_size, | 1445 | phys_cpos, alloc_size, |
1463 | &dealloc); | 1446 | &dealloc); |
1464 | if (ret) { | 1447 | if (ret) { |
1465 | mlog_errno(ret); | 1448 | mlog_errno(ret); |
1466 | goto out; | 1449 | goto out; |
@@ -1622,7 +1605,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd, | |||
1622 | struct ocfs2_space_resv *sr) | 1605 | struct ocfs2_space_resv *sr) |
1623 | { | 1606 | { |
1624 | struct inode *inode = file->f_path.dentry->d_inode; | 1607 | struct inode *inode = file->f_path.dentry->d_inode; |
1625 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);; | 1608 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
1626 | 1609 | ||
1627 | if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && | 1610 | if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && |
1628 | !ocfs2_writes_unwritten_extents(osb)) | 1611 | !ocfs2_writes_unwritten_extents(osb)) |
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index e92382cbca5f..172f9fbc9fc7 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h | |||
@@ -51,6 +51,9 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb, | |||
51 | struct ocfs2_alloc_context *data_ac, | 51 | struct ocfs2_alloc_context *data_ac, |
52 | struct ocfs2_alloc_context *meta_ac, | 52 | struct ocfs2_alloc_context *meta_ac, |
53 | enum ocfs2_alloc_restarted *reason_ret); | 53 | enum ocfs2_alloc_restarted *reason_ret); |
54 | int ocfs2_simple_size_update(struct inode *inode, | ||
55 | struct buffer_head *di_bh, | ||
56 | u64 new_i_size); | ||
54 | int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, | 57 | int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, |
55 | u64 zero_to); | 58 | u64 zero_to); |
56 | int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); | 59 | int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); |
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 7aa00d511874..229e707bc050 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/slab.h> | 28 | #include <linux/slab.h> |
29 | #include <linux/highmem.h> | 29 | #include <linux/highmem.h> |
30 | #include <linux/pagemap.h> | 30 | #include <linux/pagemap.h> |
31 | #include <linux/quotaops.h> | ||
31 | 32 | ||
32 | #include <asm/byteorder.h> | 33 | #include <asm/byteorder.h> |
33 | 34 | ||
@@ -37,6 +38,7 @@ | |||
37 | #include "ocfs2.h" | 38 | #include "ocfs2.h" |
38 | 39 | ||
39 | #include "alloc.h" | 40 | #include "alloc.h" |
41 | #include "blockcheck.h" | ||
40 | #include "dlmglue.h" | 42 | #include "dlmglue.h" |
41 | #include "extent_map.h" | 43 | #include "extent_map.h" |
42 | #include "file.h" | 44 | #include "file.h" |
@@ -214,12 +216,11 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) | |||
214 | return 0; | 216 | return 0; |
215 | } | 217 | } |
216 | 218 | ||
217 | int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, | 219 | void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, |
218 | int create_ino) | 220 | int create_ino) |
219 | { | 221 | { |
220 | struct super_block *sb; | 222 | struct super_block *sb; |
221 | struct ocfs2_super *osb; | 223 | struct ocfs2_super *osb; |
222 | int status = -EINVAL; | ||
223 | int use_plocks = 1; | 224 | int use_plocks = 1; |
224 | 225 | ||
225 | mlog_entry("(0x%p, size:%llu)\n", inode, | 226 | mlog_entry("(0x%p, size:%llu)\n", inode, |
@@ -232,25 +233,17 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, | |||
232 | ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks()) | 233 | ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks()) |
233 | use_plocks = 0; | 234 | use_plocks = 0; |
234 | 235 | ||
235 | /* this means that read_inode cannot create a superblock inode | 236 | /* |
236 | * today. change if needed. */ | 237 | * These have all been checked by ocfs2_read_inode_block() or set |
237 | if (!OCFS2_IS_VALID_DINODE(fe) || | 238 | * by ocfs2_mknod_locked(), so a failure is a code bug. |
238 | !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { | 239 | */ |
239 | mlog(0, "Invalid dinode: i_ino=%lu, i_blkno=%llu, " | 240 | BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); /* This means that read_inode |
240 | "signature = %.*s, flags = 0x%x\n", | 241 | cannot create a superblock |
241 | inode->i_ino, | 242 | inode today. change if |
242 | (unsigned long long)le64_to_cpu(fe->i_blkno), 7, | 243 | that is needed. */ |
243 | fe->i_signature, le32_to_cpu(fe->i_flags)); | 244 | BUG_ON(!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))); |
244 | goto bail; | 245 | BUG_ON(le32_to_cpu(fe->i_fs_generation) != osb->fs_generation); |
245 | } | ||
246 | 246 | ||
247 | if (le32_to_cpu(fe->i_fs_generation) != osb->fs_generation) { | ||
248 | mlog(ML_ERROR, "file entry generation does not match " | ||
249 | "superblock! osb->fs_generation=%x, " | ||
250 | "fe->i_fs_generation=%x\n", | ||
251 | osb->fs_generation, le32_to_cpu(fe->i_fs_generation)); | ||
252 | goto bail; | ||
253 | } | ||
254 | 247 | ||
255 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); | 248 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); |
256 | OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); | 249 | OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); |
@@ -284,14 +277,18 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, | |||
284 | 277 | ||
285 | inode->i_nlink = le16_to_cpu(fe->i_links_count); | 278 | inode->i_nlink = le16_to_cpu(fe->i_links_count); |
286 | 279 | ||
287 | if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) | 280 | if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) { |
288 | OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; | 281 | OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; |
282 | inode->i_flags |= S_NOQUOTA; | ||
283 | } | ||
289 | 284 | ||
290 | if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) { | 285 | if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) { |
291 | OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; | 286 | OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; |
292 | mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino); | 287 | mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino); |
293 | } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) { | 288 | } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) { |
294 | OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; | 289 | OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; |
290 | } else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) { | ||
291 | inode->i_flags |= S_NOQUOTA; | ||
295 | } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) { | 292 | } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) { |
296 | mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino); | 293 | mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino); |
297 | /* we can't actually hit this as read_inode can't | 294 | /* we can't actually hit this as read_inode can't |
@@ -354,10 +351,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, | |||
354 | 351 | ||
355 | ocfs2_set_inode_flags(inode); | 352 | ocfs2_set_inode_flags(inode); |
356 | 353 | ||
357 | status = 0; | 354 | mlog_exit_void(); |
358 | bail: | ||
359 | mlog_exit(status); | ||
360 | return status; | ||
361 | } | 355 | } |
362 | 356 | ||
363 | static int ocfs2_read_locked_inode(struct inode *inode, | 357 | static int ocfs2_read_locked_inode(struct inode *inode, |
@@ -460,11 +454,14 @@ static int ocfs2_read_locked_inode(struct inode *inode, | |||
460 | } | 454 | } |
461 | } | 455 | } |
462 | 456 | ||
463 | if (can_lock) | 457 | if (can_lock) { |
464 | status = ocfs2_read_blocks(inode, args->fi_blkno, 1, &bh, | 458 | status = ocfs2_read_inode_block_full(inode, &bh, |
465 | OCFS2_BH_IGNORE_CACHE); | 459 | OCFS2_BH_IGNORE_CACHE); |
466 | else | 460 | } else { |
467 | status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); | 461 | status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); |
462 | if (!status) | ||
463 | status = ocfs2_validate_inode_block(osb->sb, bh); | ||
464 | } | ||
468 | if (status < 0) { | 465 | if (status < 0) { |
469 | mlog_errno(status); | 466 | mlog_errno(status); |
470 | goto bail; | 467 | goto bail; |
@@ -472,12 +469,6 @@ static int ocfs2_read_locked_inode(struct inode *inode, | |||
472 | 469 | ||
473 | status = -EINVAL; | 470 | status = -EINVAL; |
474 | fe = (struct ocfs2_dinode *) bh->b_data; | 471 | fe = (struct ocfs2_dinode *) bh->b_data; |
475 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
476 | mlog(0, "Invalid dinode #%llu: signature = %.*s\n", | ||
477 | (unsigned long long)args->fi_blkno, 7, | ||
478 | fe->i_signature); | ||
479 | goto bail; | ||
480 | } | ||
481 | 472 | ||
482 | /* | 473 | /* |
483 | * This is a code bug. Right now the caller needs to | 474 | * This is a code bug. Right now the caller needs to |
@@ -491,10 +482,9 @@ static int ocfs2_read_locked_inode(struct inode *inode, | |||
491 | 482 | ||
492 | if (S_ISCHR(le16_to_cpu(fe->i_mode)) || | 483 | if (S_ISCHR(le16_to_cpu(fe->i_mode)) || |
493 | S_ISBLK(le16_to_cpu(fe->i_mode))) | 484 | S_ISBLK(le16_to_cpu(fe->i_mode))) |
494 | inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); | 485 | inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); |
495 | 486 | ||
496 | if (ocfs2_populate_inode(inode, fe, 0) < 0) | 487 | ocfs2_populate_inode(inode, fe, 0); |
497 | goto bail; | ||
498 | 488 | ||
499 | BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); | 489 | BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); |
500 | 490 | ||
@@ -547,8 +537,8 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, | |||
547 | goto out; | 537 | goto out; |
548 | } | 538 | } |
549 | 539 | ||
550 | status = ocfs2_journal_access(handle, inode, fe_bh, | 540 | status = ocfs2_journal_access_di(handle, inode, fe_bh, |
551 | OCFS2_JOURNAL_ACCESS_WRITE); | 541 | OCFS2_JOURNAL_ACCESS_WRITE); |
552 | if (status < 0) { | 542 | if (status < 0) { |
553 | mlog_errno(status); | 543 | mlog_errno(status); |
554 | goto out; | 544 | goto out; |
@@ -615,7 +605,8 @@ static int ocfs2_remove_inode(struct inode *inode, | |||
615 | goto bail; | 605 | goto bail; |
616 | } | 606 | } |
617 | 607 | ||
618 | handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS); | 608 | handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS + |
609 | ocfs2_quota_trans_credits(inode->i_sb)); | ||
619 | if (IS_ERR(handle)) { | 610 | if (IS_ERR(handle)) { |
620 | status = PTR_ERR(handle); | 611 | status = PTR_ERR(handle); |
621 | mlog_errno(status); | 612 | mlog_errno(status); |
@@ -630,8 +621,8 @@ static int ocfs2_remove_inode(struct inode *inode, | |||
630 | } | 621 | } |
631 | 622 | ||
632 | /* set the inodes dtime */ | 623 | /* set the inodes dtime */ |
633 | status = ocfs2_journal_access(handle, inode, di_bh, | 624 | status = ocfs2_journal_access_di(handle, inode, di_bh, |
634 | OCFS2_JOURNAL_ACCESS_WRITE); | 625 | OCFS2_JOURNAL_ACCESS_WRITE); |
635 | if (status < 0) { | 626 | if (status < 0) { |
636 | mlog_errno(status); | 627 | mlog_errno(status); |
637 | goto bail_commit; | 628 | goto bail_commit; |
@@ -647,6 +638,7 @@ static int ocfs2_remove_inode(struct inode *inode, | |||
647 | } | 638 | } |
648 | 639 | ||
649 | ocfs2_remove_from_cache(inode, di_bh); | 640 | ocfs2_remove_from_cache(inode, di_bh); |
641 | vfs_dq_free_inode(inode); | ||
650 | 642 | ||
651 | status = ocfs2_free_dinode(handle, inode_alloc_inode, | 643 | status = ocfs2_free_dinode(handle, inode_alloc_inode, |
652 | inode_alloc_bh, di); | 644 | inode_alloc_bh, di); |
@@ -929,7 +921,10 @@ void ocfs2_delete_inode(struct inode *inode) | |||
929 | 921 | ||
930 | mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); | 922 | mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); |
931 | 923 | ||
932 | if (is_bad_inode(inode)) { | 924 | /* When we fail in read_inode() we mark inode as bad. The second test |
925 | * catches the case when inode allocation fails before allocating | ||
926 | * a block for inode. */ | ||
927 | if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) { | ||
933 | mlog(0, "Skipping delete of bad inode\n"); | 928 | mlog(0, "Skipping delete of bad inode\n"); |
934 | goto bail; | 929 | goto bail; |
935 | } | 930 | } |
@@ -1195,8 +1190,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle, | |||
1195 | mlog_entry("(inode %llu)\n", | 1190 | mlog_entry("(inode %llu)\n", |
1196 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | 1191 | (unsigned long long)OCFS2_I(inode)->ip_blkno); |
1197 | 1192 | ||
1198 | status = ocfs2_journal_access(handle, inode, bh, | 1193 | status = ocfs2_journal_access_di(handle, inode, bh, |
1199 | OCFS2_JOURNAL_ACCESS_WRITE); | 1194 | OCFS2_JOURNAL_ACCESS_WRITE); |
1200 | if (status < 0) { | 1195 | if (status < 0) { |
1201 | mlog_errno(status); | 1196 | mlog_errno(status); |
1202 | goto leave; | 1197 | goto leave; |
@@ -1264,3 +1259,89 @@ void ocfs2_refresh_inode(struct inode *inode, | |||
1264 | 1259 | ||
1265 | spin_unlock(&OCFS2_I(inode)->ip_lock); | 1260 | spin_unlock(&OCFS2_I(inode)->ip_lock); |
1266 | } | 1261 | } |
1262 | |||
1263 | int ocfs2_validate_inode_block(struct super_block *sb, | ||
1264 | struct buffer_head *bh) | ||
1265 | { | ||
1266 | int rc; | ||
1267 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; | ||
1268 | |||
1269 | mlog(0, "Validating dinode %llu\n", | ||
1270 | (unsigned long long)bh->b_blocknr); | ||
1271 | |||
1272 | BUG_ON(!buffer_uptodate(bh)); | ||
1273 | |||
1274 | /* | ||
1275 | * If the ecc fails, we return the error but otherwise | ||
1276 | * leave the filesystem running. We know any error is | ||
1277 | * local to this block. | ||
1278 | */ | ||
1279 | rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check); | ||
1280 | if (rc) { | ||
1281 | mlog(ML_ERROR, "Checksum failed for dinode %llu\n", | ||
1282 | (unsigned long long)bh->b_blocknr); | ||
1283 | goto bail; | ||
1284 | } | ||
1285 | |||
1286 | /* | ||
1287 | * Errors after here are fatal. | ||
1288 | */ | ||
1289 | |||
1290 | rc = -EINVAL; | ||
1291 | |||
1292 | if (!OCFS2_IS_VALID_DINODE(di)) { | ||
1293 | ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n", | ||
1294 | (unsigned long long)bh->b_blocknr, 7, | ||
1295 | di->i_signature); | ||
1296 | goto bail; | ||
1297 | } | ||
1298 | |||
1299 | if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { | ||
1300 | ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n", | ||
1301 | (unsigned long long)bh->b_blocknr, | ||
1302 | (unsigned long long)le64_to_cpu(di->i_blkno)); | ||
1303 | goto bail; | ||
1304 | } | ||
1305 | |||
1306 | if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { | ||
1307 | ocfs2_error(sb, | ||
1308 | "Invalid dinode #%llu: OCFS2_VALID_FL not set\n", | ||
1309 | (unsigned long long)bh->b_blocknr); | ||
1310 | goto bail; | ||
1311 | } | ||
1312 | |||
1313 | if (le32_to_cpu(di->i_fs_generation) != | ||
1314 | OCFS2_SB(sb)->fs_generation) { | ||
1315 | ocfs2_error(sb, | ||
1316 | "Invalid dinode #%llu: fs_generation is %u\n", | ||
1317 | (unsigned long long)bh->b_blocknr, | ||
1318 | le32_to_cpu(di->i_fs_generation)); | ||
1319 | goto bail; | ||
1320 | } | ||
1321 | |||
1322 | rc = 0; | ||
1323 | |||
1324 | bail: | ||
1325 | return rc; | ||
1326 | } | ||
1327 | |||
1328 | int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh, | ||
1329 | int flags) | ||
1330 | { | ||
1331 | int rc; | ||
1332 | struct buffer_head *tmp = *bh; | ||
1333 | |||
1334 | rc = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, &tmp, | ||
1335 | flags, ocfs2_validate_inode_block); | ||
1336 | |||
1337 | /* If ocfs2_read_blocks() got us a new bh, pass it up. */ | ||
1338 | if (!rc && !*bh) | ||
1339 | *bh = tmp; | ||
1340 | |||
1341 | return rc; | ||
1342 | } | ||
1343 | |||
1344 | int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh) | ||
1345 | { | ||
1346 | return ocfs2_read_inode_block_full(inode, bh, 0); | ||
1347 | } | ||
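ocfs2_validate_inode_block() is the model for the per-type validators that ocfs2_read_blocks() now runs on every cached read: a recoverable ecc check first, then fatal structural checks reported through ocfs2_error(). A minimal validator for an invented block type, following the same shape (struct example_block, its fields and EXAMPLE_SIGNATURE are illustrative only):

static int example_validate_block(struct super_block *sb,
				  struct buffer_head *bh)
{
	struct example_block *blk = (struct example_block *)bh->b_data;
	int rc;

	BUG_ON(!buffer_uptodate(bh));

	/* an ecc failure stays local to this block, just pass it up */
	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &blk->e_check);
	if (rc)
		return rc;

	/* everything below is fatal for the filesystem */
	if (memcmp(blk->e_signature, EXAMPLE_SIGNATURE, 8)) {
		ocfs2_error(sb, "Invalid block %llu: bad signature\n",
			    (unsigned long long)bh->b_blocknr);
		return -EINVAL;
	}
	return 0;
}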
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 2f37af9bcc4a..eb3c302b38d3 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h | |||
@@ -128,8 +128,8 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags, | |||
128 | int sysfile_type); | 128 | int sysfile_type); |
129 | int ocfs2_inode_init_private(struct inode *inode); | 129 | int ocfs2_inode_init_private(struct inode *inode); |
130 | int ocfs2_inode_revalidate(struct dentry *dentry); | 130 | int ocfs2_inode_revalidate(struct dentry *dentry); |
131 | int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, | 131 | void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, |
132 | int create_ino); | 132 | int create_ino); |
133 | void ocfs2_read_inode(struct inode *inode); | 133 | void ocfs2_read_inode(struct inode *inode); |
134 | void ocfs2_read_inode2(struct inode *inode, void *opaque); | 134 | void ocfs2_read_inode2(struct inode *inode, void *opaque); |
135 | ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf, | 135 | ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf, |
@@ -142,6 +142,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle, | |||
142 | struct buffer_head *bh); | 142 | struct buffer_head *bh); |
143 | int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb); | 143 | int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb); |
144 | int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb); | 144 | int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb); |
145 | struct buffer_head *ocfs2_bread(struct inode *inode, | ||
146 | int block, int *err, int reada); | ||
145 | 147 | ||
146 | void ocfs2_set_inode_flags(struct inode *inode); | 148 | void ocfs2_set_inode_flags(struct inode *inode); |
147 | void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi); | 149 | void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi); |
@@ -153,4 +155,16 @@ static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode) | |||
153 | return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits); | 155 | return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits); |
154 | } | 156 | } |
155 | 157 | ||
158 | /* Validate that a bh contains a valid inode */ | ||
159 | int ocfs2_validate_inode_block(struct super_block *sb, | ||
160 | struct buffer_head *bh); | ||
161 | /* | ||
162 | * Read an inode block into *bh. If *bh is NULL, a bh will be allocated. | ||
163 | * This is a cached read. The inode will be validated with | ||
164 | * ocfs2_validate_inode_block(). | ||
165 | */ | ||
166 | int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh); | ||
167 | /* The same, but can be passed OCFS2_BH_* flags */ | ||
168 | int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh, | ||
169 | int flags); | ||
156 | #endif /* OCFS2_INODE_H */ | 170 | #endif /* OCFS2_INODE_H */ |
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 99fe9d584f3c..57d7d25a2b9a 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c | |||
@@ -35,6 +35,7 @@ | |||
35 | #include "ocfs2.h" | 35 | #include "ocfs2.h" |
36 | 36 | ||
37 | #include "alloc.h" | 37 | #include "alloc.h" |
38 | #include "blockcheck.h" | ||
38 | #include "dir.h" | 39 | #include "dir.h" |
39 | #include "dlmglue.h" | 40 | #include "dlmglue.h" |
40 | #include "extent_map.h" | 41 | #include "extent_map.h" |
@@ -45,6 +46,7 @@ | |||
45 | #include "slot_map.h" | 46 | #include "slot_map.h" |
46 | #include "super.h" | 47 | #include "super.h" |
47 | #include "sysfile.h" | 48 | #include "sysfile.h" |
49 | #include "quota.h" | ||
48 | 50 | ||
49 | #include "buffer_head_io.h" | 51 | #include "buffer_head_io.h" |
50 | 52 | ||
@@ -52,10 +54,10 @@ DEFINE_SPINLOCK(trans_inc_lock); | |||
52 | 54 | ||
53 | static int ocfs2_force_read_journal(struct inode *inode); | 55 | static int ocfs2_force_read_journal(struct inode *inode); |
54 | static int ocfs2_recover_node(struct ocfs2_super *osb, | 56 | static int ocfs2_recover_node(struct ocfs2_super *osb, |
55 | int node_num); | 57 | int node_num, int slot_num); |
56 | static int __ocfs2_recovery_thread(void *arg); | 58 | static int __ocfs2_recovery_thread(void *arg); |
57 | static int ocfs2_commit_cache(struct ocfs2_super *osb); | 59 | static int ocfs2_commit_cache(struct ocfs2_super *osb); |
58 | static int ocfs2_wait_on_mount(struct ocfs2_super *osb); | 60 | static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota); |
59 | static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, | 61 | static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, |
60 | int dirty, int replayed); | 62 | int dirty, int replayed); |
61 | static int ocfs2_trylock_journal(struct ocfs2_super *osb, | 63 | static int ocfs2_trylock_journal(struct ocfs2_super *osb, |
@@ -64,6 +66,17 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, | |||
64 | int slot); | 66 | int slot); |
65 | static int ocfs2_commit_thread(void *arg); | 67 | static int ocfs2_commit_thread(void *arg); |
66 | 68 | ||
69 | static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb) | ||
70 | { | ||
71 | return __ocfs2_wait_on_mount(osb, 0); | ||
72 | } | ||
73 | |||
74 | static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb) | ||
75 | { | ||
76 | return __ocfs2_wait_on_mount(osb, 1); | ||
77 | } | ||
78 | |||
79 | |||
67 | 80 | ||
68 | /* | 81 | /* |
69 | * The recovery_list is a simple linked list of node numbers to recover. | 82 | * The recovery_list is a simple linked list of node numbers to recover. |
@@ -256,11 +269,9 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) | |||
256 | BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE); | 269 | BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE); |
257 | BUG_ON(max_buffs <= 0); | 270 | BUG_ON(max_buffs <= 0); |
258 | 271 | ||
259 | /* JBD might support this, but our journalling code doesn't yet. */ | 272 | /* Nested transaction? Just return the handle... */ |
260 | if (journal_current_handle()) { | 273 | if (journal_current_handle()) |
261 | mlog(ML_ERROR, "Recursive transaction attempted!\n"); | 274 | return jbd2_journal_start(journal, max_buffs); |
262 | BUG(); | ||
263 | } | ||
264 | 275 | ||
265 | down_read(&osb->journal->j_trans_barrier); | 276 | down_read(&osb->journal->j_trans_barrier); |
266 | 277 | ||
@@ -285,16 +296,18 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) | |||
285 | int ocfs2_commit_trans(struct ocfs2_super *osb, | 296 | int ocfs2_commit_trans(struct ocfs2_super *osb, |
286 | handle_t *handle) | 297 | handle_t *handle) |
287 | { | 298 | { |
288 | int ret; | 299 | int ret, nested; |
289 | struct ocfs2_journal *journal = osb->journal; | 300 | struct ocfs2_journal *journal = osb->journal; |
290 | 301 | ||
291 | BUG_ON(!handle); | 302 | BUG_ON(!handle); |
292 | 303 | ||
304 | nested = handle->h_ref > 1; | ||
293 | ret = jbd2_journal_stop(handle); | 305 | ret = jbd2_journal_stop(handle); |
294 | if (ret < 0) | 306 | if (ret < 0) |
295 | mlog_errno(ret); | 307 | mlog_errno(ret); |
296 | 308 | ||
297 | up_read(&journal->j_trans_barrier); | 309 | if (!nested) |
310 | up_read(&journal->j_trans_barrier); | ||
298 | 311 | ||
299 | return ret; | 312 | return ret; |
300 | } | 313 | } |
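With the recursion BUG removed, a second ocfs2_start_trans() inside a running transaction simply returns the current JBD2 handle with h_ref bumped, and ocfs2_commit_trans() only drops j_trans_barrier when it stops the outermost reference; the quota code relies on this because dquot callbacks can be invoked from inside an ocfs2 transaction. In outline (caller name invented):

static int inner_op(struct ocfs2_super *osb, struct inode *inode)
{
	handle_t *handle;

	/* journal_current_handle() is non-NULL here, so this returns
	 * the outer handle with h_ref incremented instead of taking
	 * j_trans_barrier again. */
	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	/* ... journal access and metadata updates ... */

	/* h_ref > 1, so only the JBD2 reference is dropped and the
	 * outer transaction keeps j_trans_barrier. */
	return ocfs2_commit_trans(osb, handle);
}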
@@ -357,10 +370,137 @@ bail: | |||
357 | return status; | 370 | return status; |
358 | } | 371 | } |
359 | 372 | ||
360 | int ocfs2_journal_access(handle_t *handle, | 373 | struct ocfs2_triggers { |
361 | struct inode *inode, | 374 | struct jbd2_buffer_trigger_type ot_triggers; |
362 | struct buffer_head *bh, | 375 | int ot_offset; |
363 | int type) | 376 | }; |
377 | |||
378 | static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger_type *triggers) | ||
379 | { | ||
380 | return container_of(triggers, struct ocfs2_triggers, ot_triggers); | ||
381 | } | ||
382 | |||
383 | static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers, | ||
384 | struct buffer_head *bh, | ||
385 | void *data, size_t size) | ||
386 | { | ||
387 | struct ocfs2_triggers *ot = to_ocfs2_trigger(triggers); | ||
388 | |||
389 | /* | ||
390 | * We aren't guaranteed to have the superblock here, so we | ||
391 | * must unconditionally compute the ecc data. | ||
392 | * __ocfs2_journal_access() will only set the triggers if | ||
393 | * metaecc is enabled. | ||
394 | */ | ||
395 | ocfs2_block_check_compute(data, size, data + ot->ot_offset); | ||
396 | } | ||
397 | |||
398 | /* | ||
399 | * Quota blocks have their own trigger because the struct ocfs2_block_check | ||
400 | * offset depends on the blocksize. | ||
401 | */ | ||
402 | static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers, | ||
403 | struct buffer_head *bh, | ||
404 | void *data, size_t size) | ||
405 | { | ||
406 | struct ocfs2_disk_dqtrailer *dqt = | ||
407 | ocfs2_block_dqtrailer(size, data); | ||
408 | |||
409 | /* | ||
410 | * We aren't guaranteed to have the superblock here, so we | ||
411 | * must unconditionally compute the ecc data. | ||
412 | * __ocfs2_journal_access() will only set the triggers if | ||
413 | * metaecc is enabled. | ||
414 | */ | ||
415 | ocfs2_block_check_compute(data, size, &dqt->dq_check); | ||
416 | } | ||
417 | |||
418 | /* | ||
419 | * Directory blocks also have their own trigger because the | ||
420 | * struct ocfs2_block_check offset depends on the blocksize. | ||
421 | */ | ||
422 | static void ocfs2_db_commit_trigger(struct jbd2_buffer_trigger_type *triggers, | ||
423 | struct buffer_head *bh, | ||
424 | void *data, size_t size) | ||
425 | { | ||
426 | struct ocfs2_dir_block_trailer *trailer = | ||
427 | ocfs2_dir_trailer_from_size(size, data); | ||
428 | |||
429 | /* | ||
430 | * We aren't guaranteed to have the superblock here, so we | ||
431 | * must unconditionally compute the ecc data. | ||
432 | * __ocfs2_journal_access() will only set the triggers if | ||
433 | * metaecc is enabled. | ||
434 | */ | ||
435 | ocfs2_block_check_compute(data, size, &trailer->db_check); | ||
436 | } | ||
437 | |||
438 | static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers, | ||
439 | struct buffer_head *bh) | ||
440 | { | ||
441 | mlog(ML_ERROR, | ||
442 | "ocfs2_abort_trigger called by JBD2. bh = 0x%lx, " | ||
443 | "bh->b_blocknr = %llu\n", | ||
444 | (unsigned long)bh, | ||
445 | (unsigned long long)bh->b_blocknr); | ||
446 | |||
447 | /* We aren't guaranteed to have the superblock here - but if we | ||
448 | * don't, it'll just crash. */ | ||
449 | ocfs2_error(bh->b_assoc_map->host->i_sb, | ||
450 | "JBD2 has aborted our journal, ocfs2 cannot continue\n"); | ||
451 | } | ||
452 | |||
453 | static struct ocfs2_triggers di_triggers = { | ||
454 | .ot_triggers = { | ||
455 | .t_commit = ocfs2_commit_trigger, | ||
456 | .t_abort = ocfs2_abort_trigger, | ||
457 | }, | ||
458 | .ot_offset = offsetof(struct ocfs2_dinode, i_check), | ||
459 | }; | ||
460 | |||
461 | static struct ocfs2_triggers eb_triggers = { | ||
462 | .ot_triggers = { | ||
463 | .t_commit = ocfs2_commit_trigger, | ||
464 | .t_abort = ocfs2_abort_trigger, | ||
465 | }, | ||
466 | .ot_offset = offsetof(struct ocfs2_extent_block, h_check), | ||
467 | }; | ||
468 | |||
469 | static struct ocfs2_triggers gd_triggers = { | ||
470 | .ot_triggers = { | ||
471 | .t_commit = ocfs2_commit_trigger, | ||
472 | .t_abort = ocfs2_abort_trigger, | ||
473 | }, | ||
474 | .ot_offset = offsetof(struct ocfs2_group_desc, bg_check), | ||
475 | }; | ||
476 | |||
477 | static struct ocfs2_triggers db_triggers = { | ||
478 | .ot_triggers = { | ||
479 | .t_commit = ocfs2_db_commit_trigger, | ||
480 | .t_abort = ocfs2_abort_trigger, | ||
481 | }, | ||
482 | }; | ||
483 | |||
484 | static struct ocfs2_triggers xb_triggers = { | ||
485 | .ot_triggers = { | ||
486 | .t_commit = ocfs2_commit_trigger, | ||
487 | .t_abort = ocfs2_abort_trigger, | ||
488 | }, | ||
489 | .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check), | ||
490 | }; | ||
491 | |||
492 | static struct ocfs2_triggers dq_triggers = { | ||
493 | .ot_triggers = { | ||
494 | .t_commit = ocfs2_dq_commit_trigger, | ||
495 | .t_abort = ocfs2_abort_trigger, | ||
496 | }, | ||
497 | }; | ||
498 | |||
499 | static int __ocfs2_journal_access(handle_t *handle, | ||
500 | struct inode *inode, | ||
501 | struct buffer_head *bh, | ||
502 | struct ocfs2_triggers *triggers, | ||
503 | int type) | ||
364 | { | 504 | { |
365 | int status; | 505 | int status; |
366 | 506 | ||
@@ -406,6 +546,8 @@ int ocfs2_journal_access(handle_t *handle, | |||
406 | status = -EINVAL; | 546 | status = -EINVAL; |
407 | mlog(ML_ERROR, "Unknown access type!\n"); | 547 | mlog(ML_ERROR, "Unknown access type!\n"); |
408 | } | 548 | } |
549 | if (!status && ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)) && triggers) | ||
550 | jbd2_journal_set_triggers(bh, &triggers->ot_triggers); | ||
409 | mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); | 551 | mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); |
410 | 552 | ||
411 | if (status < 0) | 553 | if (status < 0) |
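The trigger plumbing introduced above is what lets jbd2 recompute each metadata block's check data at commit time; the only per-type difference is where struct ocfs2_block_check lives, hence the ot_offset field (or a dedicated callback for dir and quota blocks, whose trailer position depends on blocksize). Below is a stand-alone sketch of the same offsetof/container_of pattern, assuming a toy checksum and invented struct layouts.

/* Stand-alone model of the per-block-type commit trigger above.  The struct
 * layout, names and checksum are all invented for illustration; only the
 * container_of()/offsetof() pattern mirrors the ocfs2 code. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct trigger_type {
	void (*t_commit)(struct trigger_type *t, void *data, size_t size);
};

struct my_triggers {
	struct trigger_type ot_triggers;
	size_t ot_offset;                       /* where the check field sits */
};

static struct my_triggers *to_my_trigger(struct trigger_type *t)
{
	return (struct my_triggers *)((char *)t -
			offsetof(struct my_triggers, ot_triggers));
}

static uint32_t toy_crc(const void *data, size_t size)
{
	const unsigned char *p = data;
	uint32_t c = 0;

	while (size--)
		c = c * 31 + *p++;
	return c;
}

/* Runs on the copy of the block about to be written by the journal. */
static void commit_trigger(struct trigger_type *t, void *data, size_t size)
{
	struct my_triggers *ot = to_my_trigger(t);
	uint32_t *check = (uint32_t *)((char *)data + ot->ot_offset);

	if (ot->ot_offset + sizeof(*check) > size)
		return;
	/* toy checksum over the payload; the real helper covers the whole
	 * block and folds the check field out of the computation */
	*check = toy_crc(data, ot->ot_offset);
}

struct fake_dinode {
	char payload[60];
	uint32_t i_check;                       /* ocfs2_block_check stand-in */
};

static struct my_triggers di_triggers = {
	.ot_triggers = { .t_commit = commit_trigger },
	.ot_offset   = offsetof(struct fake_dinode, i_check),
};

int main(void)
{
	struct fake_dinode d = { .payload = "some inode payload" };

	di_triggers.ot_triggers.t_commit(&di_triggers.ot_triggers,
					 &d, sizeof(d));
	printf("computed check: %u\n", d.i_check);
	return 0;
}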
@@ -416,6 +558,54 @@ int ocfs2_journal_access(handle_t *handle, | |||
416 | return status; | 558 | return status; |
417 | } | 559 | } |
418 | 560 | ||
561 | int ocfs2_journal_access_di(handle_t *handle, struct inode *inode, | ||
562 | struct buffer_head *bh, int type) | ||
563 | { | ||
564 | return __ocfs2_journal_access(handle, inode, bh, &di_triggers, | ||
565 | type); | ||
566 | } | ||
567 | |||
568 | int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode, | ||
569 | struct buffer_head *bh, int type) | ||
570 | { | ||
571 | return __ocfs2_journal_access(handle, inode, bh, &eb_triggers, | ||
572 | type); | ||
573 | } | ||
574 | |||
575 | int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode, | ||
576 | struct buffer_head *bh, int type) | ||
577 | { | ||
578 | return __ocfs2_journal_access(handle, inode, bh, &gd_triggers, | ||
579 | type); | ||
580 | } | ||
581 | |||
582 | int ocfs2_journal_access_db(handle_t *handle, struct inode *inode, | ||
583 | struct buffer_head *bh, int type) | ||
584 | { | ||
585 | return __ocfs2_journal_access(handle, inode, bh, &db_triggers, | ||
586 | type); | ||
587 | } | ||
588 | |||
589 | int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode, | ||
590 | struct buffer_head *bh, int type) | ||
591 | { | ||
592 | return __ocfs2_journal_access(handle, inode, bh, &xb_triggers, | ||
593 | type); | ||
594 | } | ||
595 | |||
596 | int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode, | ||
597 | struct buffer_head *bh, int type) | ||
598 | { | ||
599 | return __ocfs2_journal_access(handle, inode, bh, &dq_triggers, | ||
600 | type); | ||
601 | } | ||
602 | |||
603 | int ocfs2_journal_access(handle_t *handle, struct inode *inode, | ||
604 | struct buffer_head *bh, int type) | ||
605 | { | ||
606 | return __ocfs2_journal_access(handle, inode, bh, NULL, type); | ||
607 | } | ||
608 | |||
419 | int ocfs2_journal_dirty(handle_t *handle, | 609 | int ocfs2_journal_dirty(handle_t *handle, |
420 | struct buffer_head *bh) | 610 | struct buffer_head *bh) |
421 | { | 611 | { |
@@ -434,20 +624,6 @@ int ocfs2_journal_dirty(handle_t *handle, | |||
434 | return status; | 624 | return status; |
435 | } | 625 | } |
436 | 626 | ||
437 | #ifdef CONFIG_OCFS2_COMPAT_JBD | ||
438 | int ocfs2_journal_dirty_data(handle_t *handle, | ||
439 | struct buffer_head *bh) | ||
440 | { | ||
441 | int err = journal_dirty_data(handle, bh); | ||
442 | if (err) | ||
443 | mlog_errno(err); | ||
444 | /* TODO: When we can handle it, abort the handle and go RO on | ||
445 | * error here. */ | ||
446 | |||
447 | return err; | ||
448 | } | ||
449 | #endif | ||
450 | |||
451 | #define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) | 627 | #define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) |
452 | 628 | ||
453 | void ocfs2_set_journal_params(struct ocfs2_super *osb) | 629 | void ocfs2_set_journal_params(struct ocfs2_super *osb) |
@@ -587,17 +763,11 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, | |||
587 | mlog_entry_void(); | 763 | mlog_entry_void(); |
588 | 764 | ||
589 | fe = (struct ocfs2_dinode *)bh->b_data; | 765 | fe = (struct ocfs2_dinode *)bh->b_data; |
590 | if (!OCFS2_IS_VALID_DINODE(fe)) { | 766 | |
591 | /* This is called from startup/shutdown which will | 767 | /* The journal bh on the osb always comes from ocfs2_journal_init() |
592 | * handle the errors in a specific manner, so no need | 768 | * and was validated there inside ocfs2_inode_lock_full(). It's a |
593 | * to call ocfs2_error() here. */ | 769 | * code bug if we mess it up. */ |
594 | mlog(ML_ERROR, "Journal dinode %llu has invalid " | 770 | BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); |
595 | "signature: %.*s", | ||
596 | (unsigned long long)le64_to_cpu(fe->i_blkno), 7, | ||
597 | fe->i_signature); | ||
598 | status = -EIO; | ||
599 | goto out; | ||
600 | } | ||
601 | 771 | ||
602 | flags = le32_to_cpu(fe->id1.journal1.ij_flags); | 772 | flags = le32_to_cpu(fe->id1.journal1.ij_flags); |
603 | if (dirty) | 773 | if (dirty) |
@@ -609,11 +779,11 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, | |||
609 | if (replayed) | 779 | if (replayed) |
610 | ocfs2_bump_recovery_generation(fe); | 780 | ocfs2_bump_recovery_generation(fe); |
611 | 781 | ||
782 | ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check); | ||
612 | status = ocfs2_write_block(osb, bh, journal->j_inode); | 783 | status = ocfs2_write_block(osb, bh, journal->j_inode); |
613 | if (status < 0) | 784 | if (status < 0) |
614 | mlog_errno(status); | 785 | mlog_errno(status); |
615 | 786 | ||
616 | out: | ||
617 | mlog_exit(status); | 787 | mlog_exit(status); |
618 | return status; | 788 | return status; |
619 | } | 789 | } |
@@ -878,6 +1048,7 @@ struct ocfs2_la_recovery_item { | |||
878 | int lri_slot; | 1048 | int lri_slot; |
879 | struct ocfs2_dinode *lri_la_dinode; | 1049 | struct ocfs2_dinode *lri_la_dinode; |
880 | struct ocfs2_dinode *lri_tl_dinode; | 1050 | struct ocfs2_dinode *lri_tl_dinode; |
1051 | struct ocfs2_quota_recovery *lri_qrec; | ||
881 | }; | 1052 | }; |
882 | 1053 | ||
883 | /* Does the second half of the recovery process. By this point, the | 1054 | /* Does the second half of the recovery process. By this point, the |
@@ -898,6 +1069,7 @@ void ocfs2_complete_recovery(struct work_struct *work) | |||
898 | struct ocfs2_super *osb = journal->j_osb; | 1069 | struct ocfs2_super *osb = journal->j_osb; |
899 | struct ocfs2_dinode *la_dinode, *tl_dinode; | 1070 | struct ocfs2_dinode *la_dinode, *tl_dinode; |
900 | struct ocfs2_la_recovery_item *item, *n; | 1071 | struct ocfs2_la_recovery_item *item, *n; |
1072 | struct ocfs2_quota_recovery *qrec; | ||
901 | LIST_HEAD(tmp_la_list); | 1073 | LIST_HEAD(tmp_la_list); |
902 | 1074 | ||
903 | mlog_entry_void(); | 1075 | mlog_entry_void(); |
@@ -913,6 +1085,8 @@ void ocfs2_complete_recovery(struct work_struct *work) | |||
913 | 1085 | ||
914 | mlog(0, "Complete recovery for slot %d\n", item->lri_slot); | 1086 | mlog(0, "Complete recovery for slot %d\n", item->lri_slot); |
915 | 1087 | ||
1088 | ocfs2_wait_on_quotas(osb); | ||
1089 | |||
916 | la_dinode = item->lri_la_dinode; | 1090 | la_dinode = item->lri_la_dinode; |
917 | if (la_dinode) { | 1091 | if (la_dinode) { |
918 | mlog(0, "Clean up local alloc %llu\n", | 1092 | mlog(0, "Clean up local alloc %llu\n", |
@@ -943,6 +1117,16 @@ void ocfs2_complete_recovery(struct work_struct *work) | |||
943 | if (ret < 0) | 1117 | if (ret < 0) |
944 | mlog_errno(ret); | 1118 | mlog_errno(ret); |
945 | 1119 | ||
1120 | qrec = item->lri_qrec; | ||
1121 | if (qrec) { | ||
1122 | mlog(0, "Recovering quota files"); | ||
1123 | ret = ocfs2_finish_quota_recovery(osb, qrec, | ||
1124 | item->lri_slot); | ||
1125 | if (ret < 0) | ||
1126 | mlog_errno(ret); | ||
1127 | /* Recovery info is already freed now */ | ||
1128 | } | ||
1129 | |||
946 | kfree(item); | 1130 | kfree(item); |
947 | } | 1131 | } |
948 | 1132 | ||
@@ -956,7 +1140,8 @@ void ocfs2_complete_recovery(struct work_struct *work) | |||
956 | static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, | 1140 | static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, |
957 | int slot_num, | 1141 | int slot_num, |
958 | struct ocfs2_dinode *la_dinode, | 1142 | struct ocfs2_dinode *la_dinode, |
959 | struct ocfs2_dinode *tl_dinode) | 1143 | struct ocfs2_dinode *tl_dinode, |
1144 | struct ocfs2_quota_recovery *qrec) | ||
960 | { | 1145 | { |
961 | struct ocfs2_la_recovery_item *item; | 1146 | struct ocfs2_la_recovery_item *item; |
962 | 1147 | ||
@@ -971,6 +1156,9 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, | |||
971 | if (tl_dinode) | 1156 | if (tl_dinode) |
972 | kfree(tl_dinode); | 1157 | kfree(tl_dinode); |
973 | 1158 | ||
1159 | if (qrec) | ||
1160 | ocfs2_free_quota_recovery(qrec); | ||
1161 | |||
974 | mlog_errno(-ENOMEM); | 1162 | mlog_errno(-ENOMEM); |
975 | return; | 1163 | return; |
976 | } | 1164 | } |
@@ -979,6 +1167,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, | |||
979 | item->lri_la_dinode = la_dinode; | 1167 | item->lri_la_dinode = la_dinode; |
980 | item->lri_slot = slot_num; | 1168 | item->lri_slot = slot_num; |
981 | item->lri_tl_dinode = tl_dinode; | 1169 | item->lri_tl_dinode = tl_dinode; |
1170 | item->lri_qrec = qrec; | ||
982 | 1171 | ||
983 | spin_lock(&journal->j_lock); | 1172 | spin_lock(&journal->j_lock); |
984 | list_add_tail(&item->lri_list, &journal->j_la_cleanups); | 1173 | list_add_tail(&item->lri_list, &journal->j_la_cleanups); |
@@ -998,6 +1187,7 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) | |||
998 | ocfs2_queue_recovery_completion(journal, | 1187 | ocfs2_queue_recovery_completion(journal, |
999 | osb->slot_num, | 1188 | osb->slot_num, |
1000 | osb->local_alloc_copy, | 1189 | osb->local_alloc_copy, |
1190 | NULL, | ||
1001 | NULL); | 1191 | NULL); |
1002 | ocfs2_schedule_truncate_log_flush(osb, 0); | 1192 | ocfs2_schedule_truncate_log_flush(osb, 0); |
1003 | 1193 | ||
@@ -1006,11 +1196,26 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) | |||
1006 | } | 1196 | } |
1007 | } | 1197 | } |
1008 | 1198 | ||
1199 | void ocfs2_complete_quota_recovery(struct ocfs2_super *osb) | ||
1200 | { | ||
1201 | if (osb->quota_rec) { | ||
1202 | ocfs2_queue_recovery_completion(osb->journal, | ||
1203 | osb->slot_num, | ||
1204 | NULL, | ||
1205 | NULL, | ||
1206 | osb->quota_rec); | ||
1207 | osb->quota_rec = NULL; | ||
1208 | } | ||
1209 | } | ||
1210 | |||
1009 | static int __ocfs2_recovery_thread(void *arg) | 1211 | static int __ocfs2_recovery_thread(void *arg) |
1010 | { | 1212 | { |
1011 | int status, node_num; | 1213 | int status, node_num, slot_num; |
1012 | struct ocfs2_super *osb = arg; | 1214 | struct ocfs2_super *osb = arg; |
1013 | struct ocfs2_recovery_map *rm = osb->recovery_map; | 1215 | struct ocfs2_recovery_map *rm = osb->recovery_map; |
1216 | int *rm_quota = NULL; | ||
1217 | int rm_quota_used = 0, i; | ||
1218 | struct ocfs2_quota_recovery *qrec; | ||
1014 | 1219 | ||
1015 | mlog_entry_void(); | 1220 | mlog_entry_void(); |
1016 | 1221 | ||
@@ -1019,6 +1224,11 @@ static int __ocfs2_recovery_thread(void *arg) | |||
1019 | goto bail; | 1224 | goto bail; |
1020 | } | 1225 | } |
1021 | 1226 | ||
1227 | rm_quota = kzalloc(osb->max_slots * sizeof(int), GFP_NOFS); | ||
1228 | if (!rm_quota) { | ||
1229 | status = -ENOMEM; | ||
1230 | goto bail; | ||
1231 | } | ||
1022 | restart: | 1232 | restart: |
1023 | status = ocfs2_super_lock(osb, 1); | 1233 | status = ocfs2_super_lock(osb, 1); |
1024 | if (status < 0) { | 1234 | if (status < 0) { |
@@ -1032,8 +1242,28 @@ restart: | |||
1032 | * clear it until ocfs2_recover_node() has succeeded. */ | 1242 | * clear it until ocfs2_recover_node() has succeeded. */ |
1033 | node_num = rm->rm_entries[0]; | 1243 | node_num = rm->rm_entries[0]; |
1034 | spin_unlock(&osb->osb_lock); | 1244 | spin_unlock(&osb->osb_lock); |
1035 | 1245 | mlog(0, "checking node %d\n", node_num); | |
1036 | status = ocfs2_recover_node(osb, node_num); | 1246 | slot_num = ocfs2_node_num_to_slot(osb, node_num); |
1247 | if (slot_num == -ENOENT) { | ||
1248 | status = 0; | ||
1249 | mlog(0, "no slot for this node, so no recovery " | ||
1250 | "required.\n"); | ||
1251 | goto skip_recovery; | ||
1252 | } | ||
1253 | mlog(0, "node %d was using slot %d\n", node_num, slot_num); | ||
1254 | |||
1255 | /* Quota recovery is a bit subtle. We cannot do it | ||
1256 | * immediately because we have to obtain cluster locks from | ||
1257 | * quota files and we also don't want to just skip it because | ||
1258 | * then quota usage would be out of sync until some node takes | ||
1259 | * the slot. So we remember which nodes need quota recovery | ||
1260 | * and when everything else is done, we recover quotas. */ | ||
1261 | for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++); | ||
1262 | if (i == rm_quota_used) | ||
1263 | rm_quota[rm_quota_used++] = slot_num; | ||
1264 | |||
1265 | status = ocfs2_recover_node(osb, node_num, slot_num); | ||
1266 | skip_recovery: | ||
1037 | if (!status) { | 1267 | if (!status) { |
1038 | ocfs2_recovery_map_clear(osb, node_num); | 1268 | ocfs2_recovery_map_clear(osb, node_num); |
1039 | } else { | 1269 | } else { |
@@ -1055,13 +1285,27 @@ restart: | |||
1055 | if (status < 0) | 1285 | if (status < 0) |
1056 | mlog_errno(status); | 1286 | mlog_errno(status); |
1057 | 1287 | ||
1288 | /* Now is the right time to recover quotas... We have to do this under | ||
1289 | * superblock lock so that no one can start using the slot (and crash) | ||
1290 | * before we recover it */ | ||
1291 | for (i = 0; i < rm_quota_used; i++) { | ||
1292 | qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]); | ||
1293 | if (IS_ERR(qrec)) { | ||
1294 | status = PTR_ERR(qrec); | ||
1295 | mlog_errno(status); | ||
1296 | continue; | ||
1297 | } | ||
1298 | ocfs2_queue_recovery_completion(osb->journal, rm_quota[i], | ||
1299 | NULL, NULL, qrec); | ||
1300 | } | ||
1301 | |||
1058 | ocfs2_super_unlock(osb, 1); | 1302 | ocfs2_super_unlock(osb, 1); |
1059 | 1303 | ||
1060 | /* We always run recovery on our own orphan dir - the dead | 1304 | /* We always run recovery on our own orphan dir - the dead |
1061 | * node(s) may have disallowed a previous inode delete. Re-processing | 1305 | * node(s) may have disallowed a previous inode delete. Re-processing |
1062 | * is therefore required. */ | 1306 | * is therefore required. */ |
1063 | ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, | 1307 | ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, |
1064 | NULL); | 1308 | NULL, NULL); |
1065 | 1309 | ||
1066 | bail: | 1310 | bail: |
1067 | mutex_lock(&osb->recovery_lock); | 1311 | mutex_lock(&osb->recovery_lock); |
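The recovery-thread hunks above make dead-slot handling two-phase: journals are replayed right away, while quota recovery is only recorded in rm_quota[] and run later, still under the super lock, via ocfs2_begin_quota_recovery() plus the queued ocfs2_finish_quota_recovery() work. A compressed model of that remember-now, recover-later flow, with invented helper names:

/* Toy model of the two-pass slot handling used above.  Helper names are
 * invented; only the control flow mirrors __ocfs2_recovery_thread(). */
#include <stdio.h>

#define MAX_SLOTS 8

static int rm_quota[MAX_SLOTS];
static int rm_quota_used;

static void remember_quota_slot(int slot_num)
{
	int i;

	/* same dedup loop as in the patch */
	for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++)
		;
	if (i == rm_quota_used)
		rm_quota[rm_quota_used++] = slot_num;
}

static void replay_journal(int slot_num)
{
	printf("replaying journal for slot %d\n", slot_num);
}

static void recover_quota(int slot_num)
{
	printf("recovering quota for slot %d\n", slot_num);
}

int main(void)
{
	int dead_slots[] = { 2, 5, 2 };          /* slot 2 appears twice */
	int i;

	/* pass 1: journal replay; quota work is only remembered */
	for (i = 0; i < 3; i++) {
		replay_journal(dead_slots[i]);
		remember_quota_slot(dead_slots[i]);
	}

	/* pass 2: quota recovery, once per slot, before "unlocking" */
	for (i = 0; i < rm_quota_used; i++)
		recover_quota(rm_quota[i]);
	return 0;
}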
@@ -1076,6 +1320,9 @@ bail: | |||
1076 | 1320 | ||
1077 | mutex_unlock(&osb->recovery_lock); | 1321 | mutex_unlock(&osb->recovery_lock); |
1078 | 1322 | ||
1323 | if (rm_quota) | ||
1324 | kfree(rm_quota); | ||
1325 | |||
1079 | mlog_exit(status); | 1326 | mlog_exit(status); |
1080 | /* no one is calling kthread_stop() for us so the kthread() api | 1327 | /* no one is calling kthread_stop() for us so the kthread() api |
1081 | * requires that we call do_exit(). And it isn't exported, but | 1328 | * requires that we call do_exit(). And it isn't exported, but |
@@ -1135,8 +1382,7 @@ static int ocfs2_read_journal_inode(struct ocfs2_super *osb, | |||
1135 | } | 1382 | } |
1136 | SET_INODE_JOURNAL(inode); | 1383 | SET_INODE_JOURNAL(inode); |
1137 | 1384 | ||
1138 | status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, bh, | 1385 | status = ocfs2_read_inode_block_full(inode, bh, OCFS2_BH_IGNORE_CACHE); |
1139 | OCFS2_BH_IGNORE_CACHE); | ||
1140 | if (status < 0) { | 1386 | if (status < 0) { |
1141 | mlog_errno(status); | 1387 | mlog_errno(status); |
1142 | goto bail; | 1388 | goto bail; |
@@ -1268,6 +1514,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, | |||
1268 | osb->slot_recovery_generations[slot_num] = | 1514 | osb->slot_recovery_generations[slot_num] = |
1269 | ocfs2_get_recovery_generation(fe); | 1515 | ocfs2_get_recovery_generation(fe); |
1270 | 1516 | ||
1517 | ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check); | ||
1271 | status = ocfs2_write_block(osb, bh, inode); | 1518 | status = ocfs2_write_block(osb, bh, inode); |
1272 | if (status < 0) | 1519 | if (status < 0) |
1273 | mlog_errno(status); | 1520 | mlog_errno(status); |
@@ -1304,31 +1551,19 @@ done: | |||
1304 | * far less concerning. | 1551 | * far less concerning. |
1305 | */ | 1552 | */ |
1306 | static int ocfs2_recover_node(struct ocfs2_super *osb, | 1553 | static int ocfs2_recover_node(struct ocfs2_super *osb, |
1307 | int node_num) | 1554 | int node_num, int slot_num) |
1308 | { | 1555 | { |
1309 | int status = 0; | 1556 | int status = 0; |
1310 | int slot_num; | ||
1311 | struct ocfs2_dinode *la_copy = NULL; | 1557 | struct ocfs2_dinode *la_copy = NULL; |
1312 | struct ocfs2_dinode *tl_copy = NULL; | 1558 | struct ocfs2_dinode *tl_copy = NULL; |
1313 | 1559 | ||
1314 | mlog_entry("(node_num=%d, osb->node_num = %d)\n", | 1560 | mlog_entry("(node_num=%d, slot_num=%d, osb->node_num = %d)\n", |
1315 | node_num, osb->node_num); | 1561 | node_num, slot_num, osb->node_num); |
1316 | |||
1317 | mlog(0, "checking node %d\n", node_num); | ||
1318 | 1562 | ||
1319 | /* Should not ever be called to recover ourselves -- in that | 1563 | /* Should not ever be called to recover ourselves -- in that |
1320 | * case we should've called ocfs2_journal_load instead. */ | 1564 | * case we should've called ocfs2_journal_load instead. */ |
1321 | BUG_ON(osb->node_num == node_num); | 1565 | BUG_ON(osb->node_num == node_num); |
1322 | 1566 | ||
1323 | slot_num = ocfs2_node_num_to_slot(osb, node_num); | ||
1324 | if (slot_num == -ENOENT) { | ||
1325 | status = 0; | ||
1326 | mlog(0, "no slot for this node, so no recovery required.\n"); | ||
1327 | goto done; | ||
1328 | } | ||
1329 | |||
1330 | mlog(0, "node %d was using slot %d\n", node_num, slot_num); | ||
1331 | |||
1332 | status = ocfs2_replay_journal(osb, node_num, slot_num); | 1567 | status = ocfs2_replay_journal(osb, node_num, slot_num); |
1333 | if (status < 0) { | 1568 | if (status < 0) { |
1334 | if (status == -EBUSY) { | 1569 | if (status == -EBUSY) { |
@@ -1364,7 +1599,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, | |||
1364 | 1599 | ||
1365 | /* This will kfree the memory pointed to by la_copy and tl_copy */ | 1600 | /* This will kfree the memory pointed to by la_copy and tl_copy */ |
1366 | ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy, | 1601 | ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy, |
1367 | tl_copy); | 1602 | tl_copy, NULL); |
1368 | 1603 | ||
1369 | status = 0; | 1604 | status = 0; |
1370 | done: | 1605 | done: |
@@ -1659,13 +1894,14 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, | |||
1659 | return ret; | 1894 | return ret; |
1660 | } | 1895 | } |
1661 | 1896 | ||
1662 | static int ocfs2_wait_on_mount(struct ocfs2_super *osb) | 1897 | static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota) |
1663 | { | 1898 | { |
1664 | /* This check is good because ocfs2 will wait on our recovery | 1899 | /* This check is good because ocfs2 will wait on our recovery |
1665 | * thread before changing it to something other than MOUNTED | 1900 | * thread before changing it to something other than MOUNTED |
1666 | * or DISABLED. */ | 1901 | * or DISABLED. */ |
1667 | wait_event(osb->osb_mount_event, | 1902 | wait_event(osb->osb_mount_event, |
1668 | atomic_read(&osb->vol_state) == VOLUME_MOUNTED || | 1903 | (!quota && atomic_read(&osb->vol_state) == VOLUME_MOUNTED) || |
1904 | atomic_read(&osb->vol_state) == VOLUME_MOUNTED_QUOTAS || | ||
1669 | atomic_read(&osb->vol_state) == VOLUME_DISABLED); | 1905 | atomic_read(&osb->vol_state) == VOLUME_DISABLED); |
1670 | 1906 | ||
1671 | /* If there's an error on mount, then we may never get to the | 1907 | /* If there's an error on mount, then we may never get to the |
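__ocfs2_wait_on_mount() now distinguishes plain waiters from quota waiters through the new VOLUME_MOUNTED_QUOTAS state. The fragment below restates the wait_event() condition as a plain predicate; the enum values are placeholders for osb->vol_state.

/* The wait condition above as a plain predicate; VOLUME_* values are
 * placeholders for osb->vol_state. */
#include <stdio.h>

enum vol_state { VOLUME_MOUNTED, VOLUME_MOUNTED_QUOTAS, VOLUME_DISABLED };

static int mount_wait_done(enum vol_state state, int quota)
{
	return (!quota && state == VOLUME_MOUNTED) ||
	       state == VOLUME_MOUNTED_QUOTAS ||
	       state == VOLUME_DISABLED;
}

int main(void)
{
	/* a quota waiter keeps sleeping while the volume is merely MOUNTED */
	printf("quota waiter, MOUNTED:        %d\n",
	       mount_wait_done(VOLUME_MOUNTED, 1));
	printf("quota waiter, MOUNTED_QUOTAS: %d\n",
	       mount_wait_done(VOLUME_MOUNTED_QUOTAS, 1));
	printf("plain waiter, MOUNTED:        %d\n",
	       mount_wait_done(VOLUME_MOUNTED, 0));
	return 0;
}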
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index d4d14e9a3cea..3c3532e1307c 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h | |||
@@ -27,12 +27,7 @@ | |||
27 | #define OCFS2_JOURNAL_H | 27 | #define OCFS2_JOURNAL_H |
28 | 28 | ||
29 | #include <linux/fs.h> | 29 | #include <linux/fs.h> |
30 | #ifndef CONFIG_OCFS2_COMPAT_JBD | 30 | #include <linux/jbd2.h> |
31 | # include <linux/jbd2.h> | ||
32 | #else | ||
33 | # include <linux/jbd.h> | ||
34 | # include "ocfs2_jbd_compat.h" | ||
35 | #endif | ||
36 | 31 | ||
37 | enum ocfs2_journal_state { | 32 | enum ocfs2_journal_state { |
38 | OCFS2_JOURNAL_FREE = 0, | 33 | OCFS2_JOURNAL_FREE = 0, |
@@ -173,6 +168,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, | |||
173 | int node_num); | 168 | int node_num); |
174 | int ocfs2_mark_dead_nodes(struct ocfs2_super *osb); | 169 | int ocfs2_mark_dead_nodes(struct ocfs2_super *osb); |
175 | void ocfs2_complete_mount_recovery(struct ocfs2_super *osb); | 170 | void ocfs2_complete_mount_recovery(struct ocfs2_super *osb); |
171 | void ocfs2_complete_quota_recovery(struct ocfs2_super *osb); | ||
176 | 172 | ||
177 | static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb) | 173 | static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb) |
178 | { | 174 | { |
@@ -216,9 +212,12 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode) | |||
216 | * ocfs2_extend_trans - Extend a handle by nblocks credits. This may | 212 | * ocfs2_extend_trans - Extend a handle by nblocks credits. This may |
217 | * commit the handle to disk in the process, but will | 213 | * commit the handle to disk in the process, but will |
218 | * not release any locks taken during the transaction. | 214 | * not release any locks taken during the transaction. |
219 | * ocfs2_journal_access - Notify the handle that we want to journal this | 215 | * ocfs2_journal_access* - Notify the handle that we want to journal this |
220 | * buffer. Will have to call ocfs2_journal_dirty once | 216 | * buffer. Will have to call ocfs2_journal_dirty once |
221 | * we've actually dirtied it. Type is one of . or . | 217 | * we've actually dirtied it. Type is one of . or . |
218 | * Always call the specific flavor of | ||
219 | * ocfs2_journal_access_*() unless you intend to | ||
220 | * manage the checksum by hand. | ||
222 | * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. | 221 | * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. |
223 | * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before | 222 | * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before |
224 | * the current handle commits. | 223 | * the current handle commits. |
@@ -248,10 +247,29 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks); | |||
248 | #define OCFS2_JOURNAL_ACCESS_WRITE 1 | 247 | #define OCFS2_JOURNAL_ACCESS_WRITE 1 |
249 | #define OCFS2_JOURNAL_ACCESS_UNDO 2 | 248 | #define OCFS2_JOURNAL_ACCESS_UNDO 2 |
250 | 249 | ||
251 | int ocfs2_journal_access(handle_t *handle, | 250 | |
252 | struct inode *inode, | 251 | /* ocfs2_inode */ |
253 | struct buffer_head *bh, | 252 | int ocfs2_journal_access_di(handle_t *handle, struct inode *inode, |
254 | int type); | 253 | struct buffer_head *bh, int type); |
254 | /* ocfs2_extent_block */ | ||
255 | int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode, | ||
256 | struct buffer_head *bh, int type); | ||
257 | /* ocfs2_group_desc */ | ||
258 | int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode, | ||
259 | struct buffer_head *bh, int type); | ||
260 | /* ocfs2_xattr_block */ | ||
261 | int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode, | ||
262 | struct buffer_head *bh, int type); | ||
263 | /* quota blocks */ | ||
264 | int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode, | ||
265 | struct buffer_head *bh, int type); | ||
266 | /* dirblock */ | ||
267 | int ocfs2_journal_access_db(handle_t *handle, struct inode *inode, | ||
268 | struct buffer_head *bh, int type); | ||
269 | /* Anything that has no ecc */ | ||
270 | int ocfs2_journal_access(handle_t *handle, struct inode *inode, | ||
271 | struct buffer_head *bh, int type); | ||
272 | |||
255 | /* | 273 | /* |
256 | * A word about the journal_access/journal_dirty "dance". It is | 274 | * A word about the journal_access/journal_dirty "dance". It is |
257 | * entirely legal to journal_access a buffer more than once (as long | 275 | * entirely legal to journal_access a buffer more than once (as long |
@@ -273,10 +291,6 @@ int ocfs2_journal_access(handle_t *handle, | |||
273 | */ | 291 | */ |
274 | int ocfs2_journal_dirty(handle_t *handle, | 292 | int ocfs2_journal_dirty(handle_t *handle, |
275 | struct buffer_head *bh); | 293 | struct buffer_head *bh); |
276 | #ifdef CONFIG_OCFS2_COMPAT_JBD | ||
277 | int ocfs2_journal_dirty_data(handle_t *handle, | ||
278 | struct buffer_head *bh); | ||
279 | #endif | ||
280 | 294 | ||
281 | /* | 295 | /* |
282 | * Credit Macros: | 296 | * Credit Macros: |
@@ -293,6 +307,37 @@ int ocfs2_journal_dirty_data(handle_t *handle, | |||
293 | /* extended attribute block update */ | 307 | /* extended attribute block update */ |
294 | #define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1 | 308 | #define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1 |
295 | 309 | ||
310 | /* global quotafile inode update, data block */ | ||
311 | #define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) | ||
312 | |||
313 | /* | ||
314 | * The two writes below can accidentally see global info dirty due | ||
315 | * to set_info() quotactl, so they must be prepared for that write too. | ||
316 | */ | ||
317 | /* quota data block, global info */ | ||
318 | /* Write to local quota file */ | ||
319 | #define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + 1) | ||
320 | |||
321 | /* global quota data block, local quota data block, global quota inode, | ||
322 | * global quota info */ | ||
323 | #define OCFS2_QSYNC_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 3) | ||
324 | |||
325 | static inline int ocfs2_quota_trans_credits(struct super_block *sb) | ||
326 | { | ||
327 | int credits = 0; | ||
328 | |||
329 | if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) | ||
330 | credits += OCFS2_QWRITE_CREDITS; | ||
331 | if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) | ||
332 | credits += OCFS2_QWRITE_CREDITS; | ||
333 | return credits; | ||
334 | } | ||
335 | |||
336 | /* Number of credits needed for removing quota structure from file */ | ||
337 | int ocfs2_calc_qdel_credits(struct super_block *sb, int type); | ||
338 | /* Number of credits needed for initialization of new quota structure */ | ||
339 | int ocfs2_calc_qinit_credits(struct super_block *sb, int type); | ||
340 | |||
296 | /* group extend. inode update and last group update. */ | 341 | /* group extend. inode update and last group update. */ |
297 | #define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) | 342 | #define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) |
298 | 343 | ||
@@ -303,8 +348,11 @@ int ocfs2_journal_dirty_data(handle_t *handle, | |||
303 | * prev. group desc. if we relink. */ | 348 | * prev. group desc. if we relink. */ |
304 | #define OCFS2_SUBALLOC_ALLOC (3) | 349 | #define OCFS2_SUBALLOC_ALLOC (3) |
305 | 350 | ||
306 | #define OCFS2_INLINE_TO_EXTENTS_CREDITS (OCFS2_SUBALLOC_ALLOC \ | 351 | static inline int ocfs2_inline_to_extents_credits(struct super_block *sb) |
307 | + OCFS2_INODE_UPDATE_CREDITS) | 352 | { |
353 | return OCFS2_SUBALLOC_ALLOC + OCFS2_INODE_UPDATE_CREDITS + | ||
354 | ocfs2_quota_trans_credits(sb); | ||
355 | } | ||
308 | 356 | ||
309 | /* dinode + group descriptor update. We don't relink on free yet. */ | 357 | /* dinode + group descriptor update. We don't relink on free yet. */ |
310 | #define OCFS2_SUBALLOC_FREE (2) | 358 | #define OCFS2_SUBALLOC_FREE (2) |
@@ -313,16 +361,23 @@ int ocfs2_journal_dirty_data(handle_t *handle, | |||
313 | #define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \ | 361 | #define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \ |
314 | + OCFS2_TRUNCATE_LOG_UPDATE) | 362 | + OCFS2_TRUNCATE_LOG_UPDATE) |
315 | 363 | ||
316 | #define OCFS2_REMOVE_EXTENT_CREDITS (OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS) | 364 | static inline int ocfs2_remove_extent_credits(struct super_block *sb) |
365 | { | ||
366 | return OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS + | ||
367 | ocfs2_quota_trans_credits(sb); | ||
368 | } | ||
317 | 369 | ||
318 | /* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + | 370 | /* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + |
319 | * bitmap block for the new bit) */ | 371 | * bitmap block for the new bit) */ |
320 | #define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2) | 372 | #define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2) |
321 | 373 | ||
322 | /* parent fe, parent block, new file entry, inode alloc fe, inode alloc | 374 | /* parent fe, parent block, new file entry, inode alloc fe, inode alloc |
323 | * group descriptor + mkdir/symlink blocks */ | 375 | * group descriptor + mkdir/symlink blocks + quota update */ |
324 | #define OCFS2_MKNOD_CREDITS (3 + OCFS2_SUBALLOC_ALLOC \ | 376 | static inline int ocfs2_mknod_credits(struct super_block *sb) |
325 | + OCFS2_DIR_LINK_ADDITIONAL_CREDITS) | 377 | { |
378 | return 3 + OCFS2_SUBALLOC_ALLOC + OCFS2_DIR_LINK_ADDITIONAL_CREDITS + | ||
379 | ocfs2_quota_trans_credits(sb); | ||
380 | } | ||
326 | 381 | ||
327 | /* local alloc metadata change + main bitmap updates */ | 382 | /* local alloc metadata change + main bitmap updates */ |
328 | #define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS \ | 383 | #define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS \ |
@@ -332,13 +387,21 @@ int ocfs2_journal_dirty_data(handle_t *handle, | |||
332 | * for the dinode, one for the new block. */ | 387 | * for the dinode, one for the new block. */ |
333 | #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) | 388 | #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) |
334 | 389 | ||
335 | /* file update (nlink, etc) + directory mtime/ctime + dir entry block */ | 390 | /* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota |
336 | #define OCFS2_LINK_CREDITS (2*OCFS2_INODE_UPDATE_CREDITS + 1) | 391 | * update on dir */ |
392 | static inline int ocfs2_link_credits(struct super_block *sb) | ||
393 | { | ||
394 | return 2*OCFS2_INODE_UPDATE_CREDITS + 1 + | ||
395 | ocfs2_quota_trans_credits(sb); | ||
396 | } | ||
337 | 397 | ||
338 | /* inode + dir inode (if we unlink a dir), + dir entry block + orphan | 398 | /* inode + dir inode (if we unlink a dir), + dir entry block + orphan |
339 | * dir inode link */ | 399 | * dir inode link */ |
340 | #define OCFS2_UNLINK_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 1 \ | 400 | static inline int ocfs2_unlink_credits(struct super_block *sb) |
341 | + OCFS2_LINK_CREDITS) | 401 | { |
402 | /* The quota update from ocfs2_link_credits is unused here... */ | ||
403 | return 2 * OCFS2_INODE_UPDATE_CREDITS + 1 + ocfs2_link_credits(sb); | ||
404 | } | ||
342 | 405 | ||
343 | /* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry + | 406 | /* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry + |
344 | * inode alloc group descriptor */ | 407 | * inode alloc group descriptor */ |
@@ -347,8 +410,10 @@ int ocfs2_journal_dirty_data(handle_t *handle, | |||
347 | /* dinode update, old dir dinode update, new dir dinode update, old | 410 | /* dinode update, old dir dinode update, new dir dinode update, old |
348 | * dir dir entry, new dir dir entry, dir entry update for renaming | 411 | * dir dir entry, new dir dir entry, dir entry update for renaming |
349 | * directory + target unlink */ | 412 | * directory + target unlink */ |
350 | #define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \ | 413 | static inline int ocfs2_rename_credits(struct super_block *sb) |
351 | + OCFS2_UNLINK_CREDITS) | 414 | { |
415 | return 3 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_unlink_credits(sb); | ||
416 | } | ||
352 | 417 | ||
353 | /* global bitmap dinode, group desc., relinked group, | 418 | /* global bitmap dinode, group desc., relinked group, |
354 | * suballocator dinode, group desc., relinked group, | 419 | * suballocator dinode, group desc., relinked group, |
@@ -386,18 +451,19 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb, | |||
386 | * credit for the dinode there. */ | 451 | * credit for the dinode there. */ |
387 | extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth); | 452 | extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth); |
388 | 453 | ||
389 | return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks; | 454 | return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks + |
455 | ocfs2_quota_trans_credits(sb); | ||
390 | } | 456 | } |
391 | 457 | ||
392 | static inline int ocfs2_calc_symlink_credits(struct super_block *sb) | 458 | static inline int ocfs2_calc_symlink_credits(struct super_block *sb) |
393 | { | 459 | { |
394 | int blocks = OCFS2_MKNOD_CREDITS; | 460 | int blocks = ocfs2_mknod_credits(sb); |
395 | 461 | ||
396 | /* links can be longer than one block so we may update many | 462 | /* links can be longer than one block so we may update many |
397 | * within our single allocated extent. */ | 463 | * within our single allocated extent. */ |
398 | blocks += ocfs2_clusters_to_blocks(sb, 1); | 464 | blocks += ocfs2_clusters_to_blocks(sb, 1); |
399 | 465 | ||
400 | return blocks; | 466 | return blocks + ocfs2_quota_trans_credits(sb); |
401 | } | 467 | } |
402 | 468 | ||
403 | static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb, | 469 | static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb, |
@@ -434,6 +500,8 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, | |||
434 | /* update to the truncate log. */ | 500 | /* update to the truncate log. */ |
435 | credits += OCFS2_TRUNCATE_LOG_UPDATE; | 501 | credits += OCFS2_TRUNCATE_LOG_UPDATE; |
436 | 502 | ||
503 | credits += ocfs2_quota_trans_credits(sb); | ||
504 | |||
437 | return credits; | 505 | return credits; |
438 | } | 506 | } |
439 | 507 | ||
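The journal.h changes above convert several fixed credit macros into per-superblock helpers so that any transaction which may touch quota also reserves ocfs2_quota_trans_credits(). The small program below reproduces the arithmetic with stand-in constants (OCFS2_INODE_UPDATE_CREDITS is assumed to be 1 here, and the RO_COMPAT feature flags are mocked):

/* Back-of-the-envelope model of the credit helpers above.  The constants
 * mirror the macros in the patch; OCFS2_INODE_UPDATE_CREDITS is assumed to
 * be 1, and the RO_COMPAT feature bits are mocked. */
#include <stdio.h>

#define INODE_UPDATE_CREDITS	1
#define QINFO_WRITE_CREDITS	(INODE_UPDATE_CREDITS + 1)
#define QWRITE_CREDITS		(QINFO_WRITE_CREDITS + 1)
#define SUBALLOC_ALLOC		3
#define DIR_LINK_ADDITIONAL	(1 + 2)

#define FEAT_USRQUOTA		0x1
#define FEAT_GRPQUOTA		0x2

static int quota_trans_credits(unsigned int features)
{
	int credits = 0;

	if (features & FEAT_USRQUOTA)
		credits += QWRITE_CREDITS;
	if (features & FEAT_GRPQUOTA)
		credits += QWRITE_CREDITS;
	return credits;
}

static int mknod_credits(unsigned int features)
{
	/* 3 blocks + suballocator + dir link blocks + quota updates */
	return 3 + SUBALLOC_ALLOC + DIR_LINK_ADDITIONAL +
	       quota_trans_credits(features);
}

int main(void)
{
	printf("mknod, no quota:      %d credits\n", mknod_credits(0));
	printf("mknod, usr+grp quota: %d credits\n",
	       mknod_credits(FEAT_USRQUOTA | FEAT_GRPQUOTA));
	return 0;
}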
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 687b28713c32..ec70cdbe77fc 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c | |||
@@ -36,6 +36,7 @@ | |||
36 | #include "ocfs2.h" | 36 | #include "ocfs2.h" |
37 | 37 | ||
38 | #include "alloc.h" | 38 | #include "alloc.h" |
39 | #include "blockcheck.h" | ||
39 | #include "dlmglue.h" | 40 | #include "dlmglue.h" |
40 | #include "inode.h" | 41 | #include "inode.h" |
41 | #include "journal.h" | 42 | #include "journal.h" |
@@ -248,8 +249,8 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) | |||
248 | goto bail; | 249 | goto bail; |
249 | } | 250 | } |
250 | 251 | ||
251 | status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, | 252 | status = ocfs2_read_inode_block_full(inode, &alloc_bh, |
252 | &alloc_bh, OCFS2_BH_IGNORE_CACHE); | 253 | OCFS2_BH_IGNORE_CACHE); |
253 | if (status < 0) { | 254 | if (status < 0) { |
254 | mlog_errno(status); | 255 | mlog_errno(status); |
255 | goto bail; | 256 | goto bail; |
@@ -382,8 +383,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) | |||
382 | } | 383 | } |
383 | memcpy(alloc_copy, alloc, bh->b_size); | 384 | memcpy(alloc_copy, alloc, bh->b_size); |
384 | 385 | ||
385 | status = ocfs2_journal_access(handle, local_alloc_inode, bh, | 386 | status = ocfs2_journal_access_di(handle, local_alloc_inode, bh, |
386 | OCFS2_JOURNAL_ACCESS_WRITE); | 387 | OCFS2_JOURNAL_ACCESS_WRITE); |
387 | if (status < 0) { | 388 | if (status < 0) { |
388 | mlog_errno(status); | 389 | mlog_errno(status); |
389 | goto out_commit; | 390 | goto out_commit; |
@@ -459,8 +460,8 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, | |||
459 | 460 | ||
460 | mutex_lock(&inode->i_mutex); | 461 | mutex_lock(&inode->i_mutex); |
461 | 462 | ||
462 | status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, | 463 | status = ocfs2_read_inode_block_full(inode, &alloc_bh, |
463 | &alloc_bh, OCFS2_BH_IGNORE_CACHE); | 464 | OCFS2_BH_IGNORE_CACHE); |
464 | if (status < 0) { | 465 | if (status < 0) { |
465 | mlog_errno(status); | 466 | mlog_errno(status); |
466 | goto bail; | 467 | goto bail; |
@@ -476,6 +477,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, | |||
476 | alloc = (struct ocfs2_dinode *) alloc_bh->b_data; | 477 | alloc = (struct ocfs2_dinode *) alloc_bh->b_data; |
477 | ocfs2_clear_local_alloc(alloc); | 478 | ocfs2_clear_local_alloc(alloc); |
478 | 479 | ||
480 | ocfs2_compute_meta_ecc(osb->sb, alloc_bh->b_data, &alloc->i_check); | ||
479 | status = ocfs2_write_block(osb, alloc_bh, inode); | 481 | status = ocfs2_write_block(osb, alloc_bh, inode); |
480 | if (status < 0) | 482 | if (status < 0) |
481 | mlog_errno(status); | 483 | mlog_errno(status); |
@@ -762,9 +764,9 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, | |||
762 | * delete bits from it! */ | 764 | * delete bits from it! */ |
763 | *num_bits = bits_wanted; | 765 | *num_bits = bits_wanted; |
764 | 766 | ||
765 | status = ocfs2_journal_access(handle, local_alloc_inode, | 767 | status = ocfs2_journal_access_di(handle, local_alloc_inode, |
766 | osb->local_alloc_bh, | 768 | osb->local_alloc_bh, |
767 | OCFS2_JOURNAL_ACCESS_WRITE); | 769 | OCFS2_JOURNAL_ACCESS_WRITE); |
768 | if (status < 0) { | 770 | if (status < 0) { |
769 | mlog_errno(status); | 771 | mlog_errno(status); |
770 | goto bail; | 772 | goto bail; |
@@ -1240,9 +1242,9 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, | |||
1240 | } | 1242 | } |
1241 | memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size); | 1243 | memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size); |
1242 | 1244 | ||
1243 | status = ocfs2_journal_access(handle, local_alloc_inode, | 1245 | status = ocfs2_journal_access_di(handle, local_alloc_inode, |
1244 | osb->local_alloc_bh, | 1246 | osb->local_alloc_bh, |
1245 | OCFS2_JOURNAL_ACCESS_WRITE); | 1247 | OCFS2_JOURNAL_ACCESS_WRITE); |
1246 | if (status < 0) { | 1248 | if (status < 0) { |
1247 | mlog_errno(status); | 1249 | mlog_errno(status); |
1248 | goto bail; | 1250 | goto bail; |
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 2545e7402efe..084aba86c3b2 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c | |||
@@ -40,6 +40,7 @@ | |||
40 | #include <linux/types.h> | 40 | #include <linux/types.h> |
41 | #include <linux/slab.h> | 41 | #include <linux/slab.h> |
42 | #include <linux/highmem.h> | 42 | #include <linux/highmem.h> |
43 | #include <linux/quotaops.h> | ||
43 | 44 | ||
44 | #define MLOG_MASK_PREFIX ML_NAMEI | 45 | #define MLOG_MASK_PREFIX ML_NAMEI |
45 | #include <cluster/masklog.h> | 46 | #include <cluster/masklog.h> |
@@ -61,17 +62,18 @@ | |||
61 | #include "sysfile.h" | 62 | #include "sysfile.h" |
62 | #include "uptodate.h" | 63 | #include "uptodate.h" |
63 | #include "xattr.h" | 64 | #include "xattr.h" |
65 | #include "acl.h" | ||
64 | 66 | ||
65 | #include "buffer_head_io.h" | 67 | #include "buffer_head_io.h" |
66 | 68 | ||
67 | static int ocfs2_mknod_locked(struct ocfs2_super *osb, | 69 | static int ocfs2_mknod_locked(struct ocfs2_super *osb, |
68 | struct inode *dir, | 70 | struct inode *dir, |
69 | struct dentry *dentry, int mode, | 71 | struct inode *inode, |
72 | struct dentry *dentry, | ||
70 | dev_t dev, | 73 | dev_t dev, |
71 | struct buffer_head **new_fe_bh, | 74 | struct buffer_head **new_fe_bh, |
72 | struct buffer_head *parent_fe_bh, | 75 | struct buffer_head *parent_fe_bh, |
73 | handle_t *handle, | 76 | handle_t *handle, |
74 | struct inode **ret_inode, | ||
75 | struct ocfs2_alloc_context *inode_ac); | 77 | struct ocfs2_alloc_context *inode_ac); |
76 | 78 | ||
77 | static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, | 79 | static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, |
@@ -186,6 +188,35 @@ bail: | |||
186 | return ret; | 188 | return ret; |
187 | } | 189 | } |
188 | 190 | ||
191 | static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode) | ||
192 | { | ||
193 | struct inode *inode; | ||
194 | |||
195 | inode = new_inode(dir->i_sb); | ||
196 | if (!inode) { | ||
197 | mlog(ML_ERROR, "new_inode failed!\n"); | ||
198 | return NULL; | ||
199 | } | ||
200 | |||
201 | /* populate as many fields early on as possible - many of | ||
202 | * these are used by the support functions here and in | ||
203 | * callers. */ | ||
204 | if (S_ISDIR(mode)) | ||
205 | inode->i_nlink = 2; | ||
206 | else | ||
207 | inode->i_nlink = 1; | ||
208 | inode->i_uid = current_fsuid(); | ||
209 | if (dir->i_mode & S_ISGID) { | ||
210 | inode->i_gid = dir->i_gid; | ||
211 | if (S_ISDIR(mode)) | ||
212 | mode |= S_ISGID; | ||
213 | } else | ||
214 | inode->i_gid = current_fsgid(); | ||
215 | inode->i_mode = mode; | ||
216 | vfs_dq_init(inode); | ||
217 | return inode; | ||
218 | } | ||
219 | |||
189 | static int ocfs2_mknod(struct inode *dir, | 220 | static int ocfs2_mknod(struct inode *dir, |
190 | struct dentry *dentry, | 221 | struct dentry *dentry, |
191 | int mode, | 222 | int mode, |
@@ -201,6 +232,13 @@ static int ocfs2_mknod(struct inode *dir, | |||
201 | struct inode *inode = NULL; | 232 | struct inode *inode = NULL; |
202 | struct ocfs2_alloc_context *inode_ac = NULL; | 233 | struct ocfs2_alloc_context *inode_ac = NULL; |
203 | struct ocfs2_alloc_context *data_ac = NULL; | 234 | struct ocfs2_alloc_context *data_ac = NULL; |
235 | struct ocfs2_alloc_context *xattr_ac = NULL; | ||
236 | int want_clusters = 0; | ||
237 | int xattr_credits = 0; | ||
238 | struct ocfs2_security_xattr_info si = { | ||
239 | .enable = 1, | ||
240 | }; | ||
241 | int did_quota_inode = 0; | ||
204 | 242 | ||
205 | mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, | 243 | mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, |
206 | (unsigned long)dev, dentry->d_name.len, | 244 | (unsigned long)dev, dentry->d_name.len, |
@@ -250,17 +288,46 @@ static int ocfs2_mknod(struct inode *dir, | |||
250 | goto leave; | 288 | goto leave; |
251 | } | 289 | } |
252 | 290 | ||
253 | /* Reserve a cluster if creating an extent based directory. */ | 291 | inode = ocfs2_get_init_inode(dir, mode); |
254 | if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) { | 292 | if (!inode) { |
255 | status = ocfs2_reserve_clusters(osb, 1, &data_ac); | 293 | status = -ENOMEM; |
256 | if (status < 0) { | 294 | mlog_errno(status); |
257 | if (status != -ENOSPC) | 295 | goto leave; |
258 | mlog_errno(status); | 296 | } |
297 | |||
298 | /* get security xattr */ | ||
299 | status = ocfs2_init_security_get(inode, dir, &si); | ||
300 | if (status) { | ||
301 | if (status == -EOPNOTSUPP) | ||
302 | si.enable = 0; | ||
303 | else { | ||
304 | mlog_errno(status); | ||
259 | goto leave; | 305 | goto leave; |
260 | } | 306 | } |
261 | } | 307 | } |
262 | 308 | ||
263 | handle = ocfs2_start_trans(osb, OCFS2_MKNOD_CREDITS); | 309 | /* calculate meta data/clusters for setting security and acl xattr */ |
310 | status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode, | ||
311 | &si, &want_clusters, | ||
312 | &xattr_credits, &xattr_ac); | ||
313 | if (status < 0) { | ||
314 | mlog_errno(status); | ||
315 | goto leave; | ||
316 | } | ||
317 | |||
318 | /* Reserve a cluster if creating an extent based directory. */ | ||
319 | if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) | ||
320 | want_clusters += 1; | ||
321 | |||
322 | status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac); | ||
323 | if (status < 0) { | ||
324 | if (status != -ENOSPC) | ||
325 | mlog_errno(status); | ||
326 | goto leave; | ||
327 | } | ||
328 | |||
329 | handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb) + | ||
330 | xattr_credits); | ||
264 | if (IS_ERR(handle)) { | 331 | if (IS_ERR(handle)) { |
265 | status = PTR_ERR(handle); | 332 | status = PTR_ERR(handle); |
266 | handle = NULL; | 333 | handle = NULL; |
@@ -268,10 +335,19 @@ static int ocfs2_mknod(struct inode *dir, | |||
268 | goto leave; | 335 | goto leave; |
269 | } | 336 | } |
270 | 337 | ||
338 | /* We don't use standard VFS wrapper because we don't want vfs_dq_init | ||
339 | * to be called. */ | ||
340 | if (sb_any_quota_active(osb->sb) && | ||
341 | osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) { | ||
342 | status = -EDQUOT; | ||
343 | goto leave; | ||
344 | } | ||
345 | did_quota_inode = 1; | ||
346 | |||
271 | /* do the real work now. */ | 347 | /* do the real work now. */ |
272 | status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev, | 348 | status = ocfs2_mknod_locked(osb, dir, inode, dentry, dev, |
273 | &new_fe_bh, parent_fe_bh, handle, | 349 | &new_fe_bh, parent_fe_bh, handle, |
274 | &inode, inode_ac); | 350 | inode_ac); |
275 | if (status < 0) { | 351 | if (status < 0) { |
276 | mlog_errno(status); | 352 | mlog_errno(status); |
277 | goto leave; | 353 | goto leave; |
@@ -285,8 +361,8 @@ static int ocfs2_mknod(struct inode *dir, | |||
285 | goto leave; | 361 | goto leave; |
286 | } | 362 | } |
287 | 363 | ||
288 | status = ocfs2_journal_access(handle, dir, parent_fe_bh, | 364 | status = ocfs2_journal_access_di(handle, dir, parent_fe_bh, |
289 | OCFS2_JOURNAL_ACCESS_WRITE); | 365 | OCFS2_JOURNAL_ACCESS_WRITE); |
290 | if (status < 0) { | 366 | if (status < 0) { |
291 | mlog_errno(status); | 367 | mlog_errno(status); |
292 | goto leave; | 368 | goto leave; |
@@ -300,6 +376,22 @@ static int ocfs2_mknod(struct inode *dir, | |||
300 | inc_nlink(dir); | 376 | inc_nlink(dir); |
301 | } | 377 | } |
302 | 378 | ||
379 | status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh, | ||
380 | xattr_ac, data_ac); | ||
381 | if (status < 0) { | ||
382 | mlog_errno(status); | ||
383 | goto leave; | ||
384 | } | ||
385 | |||
386 | if (si.enable) { | ||
387 | status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si, | ||
388 | xattr_ac, data_ac); | ||
389 | if (status < 0) { | ||
390 | mlog_errno(status); | ||
391 | goto leave; | ||
392 | } | ||
393 | } | ||
394 | |||
303 | status = ocfs2_add_entry(handle, dentry, inode, | 395 | status = ocfs2_add_entry(handle, dentry, inode, |
304 | OCFS2_I(inode)->ip_blkno, parent_fe_bh, | 396 | OCFS2_I(inode)->ip_blkno, parent_fe_bh, |
305 | de_bh); | 397 | de_bh); |
@@ -320,6 +412,8 @@ static int ocfs2_mknod(struct inode *dir, | |||
320 | d_instantiate(dentry, inode); | 412 | d_instantiate(dentry, inode); |
321 | status = 0; | 413 | status = 0; |
322 | leave: | 414 | leave: |
415 | if (status < 0 && did_quota_inode) | ||
416 | vfs_dq_free_inode(inode); | ||
323 | if (handle) | 417 | if (handle) |
324 | ocfs2_commit_trans(osb, handle); | 418 | ocfs2_commit_trans(osb, handle); |
325 | 419 | ||
@@ -331,9 +425,13 @@ leave: | |||
331 | brelse(new_fe_bh); | 425 | brelse(new_fe_bh); |
332 | brelse(de_bh); | 426 | brelse(de_bh); |
333 | brelse(parent_fe_bh); | 427 | brelse(parent_fe_bh); |
428 | kfree(si.name); | ||
429 | kfree(si.value); | ||
334 | 430 | ||
335 | if ((status < 0) && inode) | 431 | if ((status < 0) && inode) { |
432 | clear_nlink(inode); | ||
336 | iput(inode); | 433 | iput(inode); |
434 | } | ||
337 | 435 | ||
338 | if (inode_ac) | 436 | if (inode_ac) |
339 | ocfs2_free_alloc_context(inode_ac); | 437 | ocfs2_free_alloc_context(inode_ac); |
@@ -341,6 +439,9 @@ leave: | |||
341 | if (data_ac) | 439 | if (data_ac) |
342 | ocfs2_free_alloc_context(data_ac); | 440 | ocfs2_free_alloc_context(data_ac); |
343 | 441 | ||
442 | if (xattr_ac) | ||
443 | ocfs2_free_alloc_context(xattr_ac); | ||
444 | |||
344 | mlog_exit(status); | 445 | mlog_exit(status); |
345 | 446 | ||
346 | return status; | 447 | return status; |
@@ -348,12 +449,12 @@ leave: | |||
348 | 449 | ||
349 | static int ocfs2_mknod_locked(struct ocfs2_super *osb, | 450 | static int ocfs2_mknod_locked(struct ocfs2_super *osb, |
350 | struct inode *dir, | 451 | struct inode *dir, |
351 | struct dentry *dentry, int mode, | 452 | struct inode *inode, |
453 | struct dentry *dentry, | ||
352 | dev_t dev, | 454 | dev_t dev, |
353 | struct buffer_head **new_fe_bh, | 455 | struct buffer_head **new_fe_bh, |
354 | struct buffer_head *parent_fe_bh, | 456 | struct buffer_head *parent_fe_bh, |
355 | handle_t *handle, | 457 | handle_t *handle, |
356 | struct inode **ret_inode, | ||
357 | struct ocfs2_alloc_context *inode_ac) | 458 | struct ocfs2_alloc_context *inode_ac) |
358 | { | 459 | { |
359 | int status = 0; | 460 | int status = 0; |
@@ -361,14 +462,12 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, | |||
361 | struct ocfs2_extent_list *fel; | 462 | struct ocfs2_extent_list *fel; |
362 | u64 fe_blkno = 0; | 463 | u64 fe_blkno = 0; |
363 | u16 suballoc_bit; | 464 | u16 suballoc_bit; |
364 | struct inode *inode = NULL; | ||
365 | 465 | ||
366 | mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, | 466 | mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, |
367 | (unsigned long)dev, dentry->d_name.len, | 467 | inode->i_mode, (unsigned long)dev, dentry->d_name.len, |
368 | dentry->d_name.name); | 468 | dentry->d_name.name); |
369 | 469 | ||
370 | *new_fe_bh = NULL; | 470 | *new_fe_bh = NULL; |
371 | *ret_inode = NULL; | ||
372 | 471 | ||
373 | status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit, | 472 | status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit, |
374 | &fe_blkno); | 473 | &fe_blkno); |
@@ -377,23 +476,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, | |||
377 | goto leave; | 476 | goto leave; |
378 | } | 477 | } |
379 | 478 | ||
380 | inode = new_inode(dir->i_sb); | ||
381 | if (!inode) { | ||
382 | status = -ENOMEM; | ||
383 | mlog(ML_ERROR, "new_inode failed!\n"); | ||
384 | goto leave; | ||
385 | } | ||
386 | |||
387 | /* populate as many fields early on as possible - many of | 479 | /* populate as many fields early on as possible - many of |
388 | * these are used by the support functions here and in | 480 | * these are used by the support functions here and in |
389 | * callers. */ | 481 | * callers. */ |
390 | inode->i_ino = ino_from_blkno(osb->sb, fe_blkno); | 482 | inode->i_ino = ino_from_blkno(osb->sb, fe_blkno); |
391 | OCFS2_I(inode)->ip_blkno = fe_blkno; | 483 | OCFS2_I(inode)->ip_blkno = fe_blkno; |
392 | if (S_ISDIR(mode)) | ||
393 | inode->i_nlink = 2; | ||
394 | else | ||
395 | inode->i_nlink = 1; | ||
396 | inode->i_mode = mode; | ||
397 | spin_lock(&osb->osb_lock); | 484 | spin_lock(&osb->osb_lock); |
398 | inode->i_generation = osb->s_next_generation++; | 485 | inode->i_generation = osb->s_next_generation++; |
399 | spin_unlock(&osb->osb_lock); | 486 | spin_unlock(&osb->osb_lock); |
@@ -406,8 +493,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, | |||
406 | } | 493 | } |
407 | ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh); | 494 | ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh); |
408 | 495 | ||
409 | status = ocfs2_journal_access(handle, inode, *new_fe_bh, | 496 | status = ocfs2_journal_access_di(handle, inode, *new_fe_bh, |
410 | OCFS2_JOURNAL_ACCESS_CREATE); | 497 | OCFS2_JOURNAL_ACCESS_CREATE); |
411 | if (status < 0) { | 498 | if (status < 0) { |
412 | mlog_errno(status); | 499 | mlog_errno(status); |
413 | goto leave; | 500 | goto leave; |
@@ -421,17 +508,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, | |||
421 | fe->i_blkno = cpu_to_le64(fe_blkno); | 508 | fe->i_blkno = cpu_to_le64(fe_blkno); |
422 | fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); | 509 | fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); |
423 | fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot); | 510 | fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot); |
424 | fe->i_uid = cpu_to_le32(current_fsuid()); | 511 | fe->i_uid = cpu_to_le32(inode->i_uid); |
425 | if (dir->i_mode & S_ISGID) { | 512 | fe->i_gid = cpu_to_le32(inode->i_gid); |
426 | fe->i_gid = cpu_to_le32(dir->i_gid); | 513 | fe->i_mode = cpu_to_le16(inode->i_mode); |
427 | if (S_ISDIR(mode)) | 514 | if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) |
428 | mode |= S_ISGID; | ||
429 | } else | ||
430 | fe->i_gid = cpu_to_le32(current_fsgid()); | ||
431 | fe->i_mode = cpu_to_le16(mode); | ||
432 | if (S_ISCHR(mode) || S_ISBLK(mode)) | ||
433 | fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev)); | 515 | fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev)); |
434 | |||
435 | fe->i_links_count = cpu_to_le16(inode->i_nlink); | 516 | fe->i_links_count = cpu_to_le16(inode->i_nlink); |
436 | 517 | ||
437 | fe->i_last_eb_blk = 0; | 518 | fe->i_last_eb_blk = 0; |
@@ -446,7 +527,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, | |||
446 | /* | 527 | /* |
447 | * If supported, directories start with inline data. | 528 | * If supported, directories start with inline data. |
448 | */ | 529 | */ |
449 | if (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) { | 530 | if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) { |
450 | u16 feat = le16_to_cpu(fe->i_dyn_features); | 531 | u16 feat = le16_to_cpu(fe->i_dyn_features); |
451 | 532 | ||
452 | fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL); | 533 | fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL); |
@@ -465,15 +546,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, | |||
465 | goto leave; | 546 | goto leave; |
466 | } | 547 | } |
467 | 548 | ||
468 | if (ocfs2_populate_inode(inode, fe, 1) < 0) { | 549 | ocfs2_populate_inode(inode, fe, 1); |
469 | mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, " | ||
470 | "i_blkno=%llu, i_ino=%lu\n", | ||
471 | (unsigned long long)(*new_fe_bh)->b_blocknr, | ||
472 | (unsigned long long)le64_to_cpu(fe->i_blkno), | ||
473 | inode->i_ino); | ||
474 | BUG(); | ||
475 | } | ||
476 | |||
477 | ocfs2_inode_set_new(osb, inode); | 550 | ocfs2_inode_set_new(osb, inode); |
478 | if (!ocfs2_mount_local(osb)) { | 551 | if (!ocfs2_mount_local(osb)) { |
479 | status = ocfs2_create_new_inode_locks(inode); | 552 | status = ocfs2_create_new_inode_locks(inode); |
@@ -484,17 +557,12 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, | |||
484 | status = 0; /* error in ocfs2_create_new_inode_locks is not | 557 | status = 0; /* error in ocfs2_create_new_inode_locks is not |
485 | * critical */ | 558 | * critical */ |
486 | 559 | ||
487 | *ret_inode = inode; | ||
488 | leave: | 560 | leave: |
489 | if (status < 0) { | 561 | if (status < 0) { |
490 | if (*new_fe_bh) { | 562 | if (*new_fe_bh) { |
491 | brelse(*new_fe_bh); | 563 | brelse(*new_fe_bh); |
492 | *new_fe_bh = NULL; | 564 | *new_fe_bh = NULL; |
493 | } | 565 | } |
494 | if (inode) { | ||
495 | clear_nlink(inode); | ||
496 | iput(inode); | ||
497 | } | ||
498 | } | 566 | } |
499 | 567 | ||
500 | mlog_exit(status); | 568 | mlog_exit(status); |
@@ -588,7 +656,7 @@ static int ocfs2_link(struct dentry *old_dentry, | |||
588 | goto out_unlock_inode; | 656 | goto out_unlock_inode; |
589 | } | 657 | } |
590 | 658 | ||
591 | handle = ocfs2_start_trans(osb, OCFS2_LINK_CREDITS); | 659 | handle = ocfs2_start_trans(osb, ocfs2_link_credits(osb->sb)); |
592 | if (IS_ERR(handle)) { | 660 | if (IS_ERR(handle)) { |
593 | err = PTR_ERR(handle); | 661 | err = PTR_ERR(handle); |
594 | handle = NULL; | 662 | handle = NULL; |
@@ -596,8 +664,8 @@ static int ocfs2_link(struct dentry *old_dentry, | |||
596 | goto out_unlock_inode; | 664 | goto out_unlock_inode; |
597 | } | 665 | } |
598 | 666 | ||
599 | err = ocfs2_journal_access(handle, inode, fe_bh, | 667 | err = ocfs2_journal_access_di(handle, inode, fe_bh, |
600 | OCFS2_JOURNAL_ACCESS_WRITE); | 668 | OCFS2_JOURNAL_ACCESS_WRITE); |
601 | if (err < 0) { | 669 | if (err < 0) { |
602 | mlog_errno(err); | 670 | mlog_errno(err); |
603 | goto out_commit; | 671 | goto out_commit; |
@@ -775,7 +843,7 @@ static int ocfs2_unlink(struct inode *dir, | |||
775 | } | 843 | } |
776 | } | 844 | } |
777 | 845 | ||
778 | handle = ocfs2_start_trans(osb, OCFS2_UNLINK_CREDITS); | 846 | handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb)); |
779 | if (IS_ERR(handle)) { | 847 | if (IS_ERR(handle)) { |
780 | status = PTR_ERR(handle); | 848 | status = PTR_ERR(handle); |
781 | handle = NULL; | 849 | handle = NULL; |
@@ -783,8 +851,8 @@ static int ocfs2_unlink(struct inode *dir, | |||
783 | goto leave; | 851 | goto leave; |
784 | } | 852 | } |
785 | 853 | ||
786 | status = ocfs2_journal_access(handle, inode, fe_bh, | 854 | status = ocfs2_journal_access_di(handle, inode, fe_bh, |
787 | OCFS2_JOURNAL_ACCESS_WRITE); | 855 | OCFS2_JOURNAL_ACCESS_WRITE); |
788 | if (status < 0) { | 856 | if (status < 0) { |
789 | mlog_errno(status); | 857 | mlog_errno(status); |
790 | goto leave; | 858 | goto leave; |
@@ -1181,7 +1249,7 @@ static int ocfs2_rename(struct inode *old_dir, | |||
1181 | } | 1249 | } |
1182 | } | 1250 | } |
1183 | 1251 | ||
1184 | handle = ocfs2_start_trans(osb, OCFS2_RENAME_CREDITS); | 1252 | handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb)); |
1185 | if (IS_ERR(handle)) { | 1253 | if (IS_ERR(handle)) { |
1186 | status = PTR_ERR(handle); | 1254 | status = PTR_ERR(handle); |
1187 | handle = NULL; | 1255 | handle = NULL; |
@@ -1197,8 +1265,8 @@ static int ocfs2_rename(struct inode *old_dir, | |||
1197 | goto bail; | 1265 | goto bail; |
1198 | } | 1266 | } |
1199 | } | 1267 | } |
1200 | status = ocfs2_journal_access(handle, new_inode, newfe_bh, | 1268 | status = ocfs2_journal_access_di(handle, new_inode, newfe_bh, |
1201 | OCFS2_JOURNAL_ACCESS_WRITE); | 1269 | OCFS2_JOURNAL_ACCESS_WRITE); |
1202 | if (status < 0) { | 1270 | if (status < 0) { |
1203 | mlog_errno(status); | 1271 | mlog_errno(status); |
1204 | goto bail; | 1272 | goto bail; |
@@ -1244,8 +1312,8 @@ static int ocfs2_rename(struct inode *old_dir, | |||
1244 | old_inode->i_ctime = CURRENT_TIME; | 1312 | old_inode->i_ctime = CURRENT_TIME; |
1245 | mark_inode_dirty(old_inode); | 1313 | mark_inode_dirty(old_inode); |
1246 | 1314 | ||
1247 | status = ocfs2_journal_access(handle, old_inode, old_inode_bh, | 1315 | status = ocfs2_journal_access_di(handle, old_inode, old_inode_bh, |
1248 | OCFS2_JOURNAL_ACCESS_WRITE); | 1316 | OCFS2_JOURNAL_ACCESS_WRITE); |
1249 | if (status >= 0) { | 1317 | if (status >= 0) { |
1250 | old_di = (struct ocfs2_dinode *) old_inode_bh->b_data; | 1318 | old_di = (struct ocfs2_dinode *) old_inode_bh->b_data; |
1251 | 1319 | ||
@@ -1321,9 +1389,9 @@ static int ocfs2_rename(struct inode *old_dir, | |||
1321 | (int)old_dir_nlink, old_dir->i_nlink); | 1389 | (int)old_dir_nlink, old_dir->i_nlink); |
1322 | } else { | 1390 | } else { |
1323 | struct ocfs2_dinode *fe; | 1391 | struct ocfs2_dinode *fe; |
1324 | status = ocfs2_journal_access(handle, old_dir, | 1392 | status = ocfs2_journal_access_di(handle, old_dir, |
1325 | old_dir_bh, | 1393 | old_dir_bh, |
1326 | OCFS2_JOURNAL_ACCESS_WRITE); | 1394 | OCFS2_JOURNAL_ACCESS_WRITE); |
1327 | fe = (struct ocfs2_dinode *) old_dir_bh->b_data; | 1395 | fe = (struct ocfs2_dinode *) old_dir_bh->b_data; |
1328 | fe->i_links_count = cpu_to_le16(old_dir->i_nlink); | 1396 | fe->i_links_count = cpu_to_le16(old_dir->i_nlink); |
1329 | status = ocfs2_journal_dirty(handle, old_dir_bh); | 1397 | status = ocfs2_journal_dirty(handle, old_dir_bh); |
@@ -1496,6 +1564,13 @@ static int ocfs2_symlink(struct inode *dir, | |||
1496 | handle_t *handle = NULL; | 1564 | handle_t *handle = NULL; |
1497 | struct ocfs2_alloc_context *inode_ac = NULL; | 1565 | struct ocfs2_alloc_context *inode_ac = NULL; |
1498 | struct ocfs2_alloc_context *data_ac = NULL; | 1566 | struct ocfs2_alloc_context *data_ac = NULL; |
1567 | struct ocfs2_alloc_context *xattr_ac = NULL; | ||
1568 | int want_clusters = 0; | ||
1569 | int xattr_credits = 0; | ||
1570 | struct ocfs2_security_xattr_info si = { | ||
1571 | .enable = 1, | ||
1572 | }; | ||
1573 | int did_quota = 0, did_quota_inode = 0; | ||
1499 | 1574 | ||
1500 | mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, | 1575 | mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, |
1501 | dentry, symname, dentry->d_name.len, dentry->d_name.name); | 1576 | dentry, symname, dentry->d_name.len, dentry->d_name.name); |
@@ -1542,17 +1617,46 @@ static int ocfs2_symlink(struct inode *dir, | |||
1542 | goto bail; | 1617 | goto bail; |
1543 | } | 1618 | } |
1544 | 1619 | ||
1545 | /* don't reserve bitmap space for fast symlinks. */ | 1620 | inode = ocfs2_get_init_inode(dir, S_IFLNK | S_IRWXUGO); |
1546 | if (l > ocfs2_fast_symlink_chars(sb)) { | 1621 | if (!inode) { |
1547 | status = ocfs2_reserve_clusters(osb, 1, &data_ac); | 1622 | status = -ENOMEM; |
1623 | mlog_errno(status); | ||
1624 | goto bail; | ||
1625 | } | ||
1626 | |||
1627 | /* get security xattr */ | ||
1628 | status = ocfs2_init_security_get(inode, dir, &si); | ||
1629 | if (status) { | ||
1630 | if (status == -EOPNOTSUPP) | ||
1631 | si.enable = 0; | ||
1632 | else { | ||
1633 | mlog_errno(status); | ||
1634 | goto bail; | ||
1635 | } | ||
1636 | } | ||
1637 | |||
1638 | /* calculate meta data/clusters for setting security xattr */ | ||
1639 | if (si.enable) { | ||
1640 | status = ocfs2_calc_security_init(dir, &si, &want_clusters, | ||
1641 | &xattr_credits, &xattr_ac); | ||
1548 | if (status < 0) { | 1642 | if (status < 0) { |
1549 | if (status != -ENOSPC) | 1643 | mlog_errno(status); |
1550 | mlog_errno(status); | ||
1551 | goto bail; | 1644 | goto bail; |
1552 | } | 1645 | } |
1553 | } | 1646 | } |
1554 | 1647 | ||
1555 | handle = ocfs2_start_trans(osb, credits); | 1648 | /* don't reserve bitmap space for fast symlinks. */ |
1649 | if (l > ocfs2_fast_symlink_chars(sb)) | ||
1650 | want_clusters += 1; | ||
1651 | |||
1652 | status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac); | ||
1653 | if (status < 0) { | ||
1654 | if (status != -ENOSPC) | ||
1655 | mlog_errno(status); | ||
1656 | goto bail; | ||
1657 | } | ||
1658 | |||
1659 | handle = ocfs2_start_trans(osb, credits + xattr_credits); | ||
1556 | if (IS_ERR(handle)) { | 1660 | if (IS_ERR(handle)) { |
1557 | status = PTR_ERR(handle); | 1661 | status = PTR_ERR(handle); |
1558 | handle = NULL; | 1662 | handle = NULL; |
@@ -1560,10 +1664,18 @@ static int ocfs2_symlink(struct inode *dir, | |||
1560 | goto bail; | 1664 | goto bail; |
1561 | } | 1665 | } |
1562 | 1666 | ||
1563 | status = ocfs2_mknod_locked(osb, dir, dentry, | 1667 | /* We don't use standard VFS wrapper because we don't want vfs_dq_init |
1564 | S_IFLNK | S_IRWXUGO, 0, | 1668 | * to be called. */ |
1565 | &new_fe_bh, parent_fe_bh, handle, | 1669 | if (sb_any_quota_active(osb->sb) && |
1566 | &inode, inode_ac); | 1670 | osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) { |
1671 | status = -EDQUOT; | ||
1672 | goto bail; | ||
1673 | } | ||
1674 | did_quota_inode = 1; | ||
1675 | |||
1676 | status = ocfs2_mknod_locked(osb, dir, inode, dentry, | ||
1677 | 0, &new_fe_bh, parent_fe_bh, handle, | ||
1678 | inode_ac); | ||
1567 | if (status < 0) { | 1679 | if (status < 0) { |
1568 | mlog_errno(status); | 1680 | mlog_errno(status); |
1569 | goto bail; | 1681 | goto bail; |
@@ -1576,6 +1688,12 @@ static int ocfs2_symlink(struct inode *dir, | |||
1576 | u32 offset = 0; | 1688 | u32 offset = 0; |
1577 | 1689 | ||
1578 | inode->i_op = &ocfs2_symlink_inode_operations; | 1690 | inode->i_op = &ocfs2_symlink_inode_operations; |
1691 | if (vfs_dq_alloc_space_nodirty(inode, | ||
1692 | ocfs2_clusters_to_bytes(osb->sb, 1))) { | ||
1693 | status = -EDQUOT; | ||
1694 | goto bail; | ||
1695 | } | ||
1696 | did_quota = 1; | ||
1579 | status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0, | 1697 | status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0, |
1580 | new_fe_bh, | 1698 | new_fe_bh, |
1581 | handle, data_ac, NULL, | 1699 | handle, data_ac, NULL, |
@@ -1614,6 +1732,15 @@ static int ocfs2_symlink(struct inode *dir, | |||
1614 | } | 1732 | } |
1615 | } | 1733 | } |
1616 | 1734 | ||
1735 | if (si.enable) { | ||
1736 | status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si, | ||
1737 | xattr_ac, data_ac); | ||
1738 | if (status < 0) { | ||
1739 | mlog_errno(status); | ||
1740 | goto bail; | ||
1741 | } | ||
1742 | } | ||
1743 | |||
1617 | status = ocfs2_add_entry(handle, dentry, inode, | 1744 | status = ocfs2_add_entry(handle, dentry, inode, |
1618 | le64_to_cpu(fe->i_blkno), parent_fe_bh, | 1745 | le64_to_cpu(fe->i_blkno), parent_fe_bh, |
1619 | de_bh); | 1746 | de_bh); |
@@ -1632,6 +1759,11 @@ static int ocfs2_symlink(struct inode *dir, | |||
1632 | dentry->d_op = &ocfs2_dentry_ops; | 1759 | dentry->d_op = &ocfs2_dentry_ops; |
1633 | d_instantiate(dentry, inode); | 1760 | d_instantiate(dentry, inode); |
1634 | bail: | 1761 | bail: |
1762 | if (status < 0 && did_quota) | ||
1763 | vfs_dq_free_space_nodirty(inode, | ||
1764 | ocfs2_clusters_to_bytes(osb->sb, 1)); | ||
1765 | if (status < 0 && did_quota_inode) | ||
1766 | vfs_dq_free_inode(inode); | ||
1635 | if (handle) | 1767 | if (handle) |
1636 | ocfs2_commit_trans(osb, handle); | 1768 | ocfs2_commit_trans(osb, handle); |
1637 | 1769 | ||
@@ -1640,12 +1772,18 @@ bail: | |||
1640 | brelse(new_fe_bh); | 1772 | brelse(new_fe_bh); |
1641 | brelse(parent_fe_bh); | 1773 | brelse(parent_fe_bh); |
1642 | brelse(de_bh); | 1774 | brelse(de_bh); |
1775 | kfree(si.name); | ||
1776 | kfree(si.value); | ||
1643 | if (inode_ac) | 1777 | if (inode_ac) |
1644 | ocfs2_free_alloc_context(inode_ac); | 1778 | ocfs2_free_alloc_context(inode_ac); |
1645 | if (data_ac) | 1779 | if (data_ac) |
1646 | ocfs2_free_alloc_context(data_ac); | 1780 | ocfs2_free_alloc_context(data_ac); |
1647 | if ((status < 0) && inode) | 1781 | if (xattr_ac) |
1782 | ocfs2_free_alloc_context(xattr_ac); | ||
1783 | if ((status < 0) && inode) { | ||
1784 | clear_nlink(inode); | ||
1648 | iput(inode); | 1785 | iput(inode); |
1786 | } | ||
1649 | 1787 | ||
1650 | mlog_exit(status); | 1788 | mlog_exit(status); |
1651 | 1789 | ||
@@ -1754,16 +1892,14 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, | |||
1754 | 1892 | ||
1755 | mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); | 1893 | mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); |
1756 | 1894 | ||
1757 | status = ocfs2_read_block(orphan_dir_inode, | 1895 | status = ocfs2_read_inode_block(orphan_dir_inode, &orphan_dir_bh); |
1758 | OCFS2_I(orphan_dir_inode)->ip_blkno, | ||
1759 | &orphan_dir_bh); | ||
1760 | if (status < 0) { | 1896 | if (status < 0) { |
1761 | mlog_errno(status); | 1897 | mlog_errno(status); |
1762 | goto leave; | 1898 | goto leave; |
1763 | } | 1899 | } |
1764 | 1900 | ||
1765 | status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh, | 1901 | status = ocfs2_journal_access_di(handle, orphan_dir_inode, orphan_dir_bh, |
1766 | OCFS2_JOURNAL_ACCESS_WRITE); | 1902 | OCFS2_JOURNAL_ACCESS_WRITE); |
1767 | if (status < 0) { | 1903 | if (status < 0) { |
1768 | mlog_errno(status); | 1904 | mlog_errno(status); |
1769 | goto leave; | 1905 | goto leave; |
@@ -1850,8 +1986,8 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, | |||
1850 | goto leave; | 1986 | goto leave; |
1851 | } | 1987 | } |
1852 | 1988 | ||
1853 | status = ocfs2_journal_access(handle,orphan_dir_inode, orphan_dir_bh, | 1989 | status = ocfs2_journal_access_di(handle, orphan_dir_inode, orphan_dir_bh, |
1854 | OCFS2_JOURNAL_ACCESS_WRITE); | 1990 | OCFS2_JOURNAL_ACCESS_WRITE); |
1855 | if (status < 0) { | 1991 | if (status < 0) { |
1856 | mlog_errno(status); | 1992 | mlog_errno(status); |
1857 | goto leave; | 1993 | goto leave; |
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 3fed9e3d8992..ad5c24a29edd 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h | |||
@@ -161,6 +161,7 @@ enum ocfs2_vol_state | |||
161 | { | 161 | { |
162 | VOLUME_INIT = 0, | 162 | VOLUME_INIT = 0, |
163 | VOLUME_MOUNTED, | 163 | VOLUME_MOUNTED, |
164 | VOLUME_MOUNTED_QUOTAS, | ||
164 | VOLUME_DISMOUNTED, | 165 | VOLUME_DISMOUNTED, |
165 | VOLUME_DISABLED | 166 | VOLUME_DISABLED |
166 | }; | 167 | }; |
@@ -195,6 +196,9 @@ enum ocfs2_mount_options | |||
195 | OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */ | 196 | OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */ |
196 | OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */ | 197 | OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */ |
197 | OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */ | 198 | OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */ |
199 | OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* POSIX access control lists */ | ||
200 | OCFS2_MOUNT_USRQUOTA = 1 << 9, /* We support user quotas */ | ||
201 | OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */ | ||
198 | }; | 202 | }; |
199 | 203 | ||
200 | #define OCFS2_OSB_SOFT_RO 0x0001 | 204 | #define OCFS2_OSB_SOFT_RO 0x0001 |
@@ -205,6 +209,7 @@ enum ocfs2_mount_options | |||
205 | struct ocfs2_journal; | 209 | struct ocfs2_journal; |
206 | struct ocfs2_slot_info; | 210 | struct ocfs2_slot_info; |
207 | struct ocfs2_recovery_map; | 211 | struct ocfs2_recovery_map; |
212 | struct ocfs2_quota_recovery; | ||
208 | struct ocfs2_super | 213 | struct ocfs2_super |
209 | { | 214 | { |
210 | struct task_struct *commit_task; | 215 | struct task_struct *commit_task; |
@@ -286,10 +291,11 @@ struct ocfs2_super | |||
286 | char *local_alloc_debug_buf; | 291 | char *local_alloc_debug_buf; |
287 | #endif | 292 | #endif |
288 | 293 | ||
289 | /* Next two fields are for local node slot recovery during | 294 | /* Next three fields are for local node slot recovery during |
290 | * mount. */ | 295 | * mount. */ |
291 | int dirty; | 296 | int dirty; |
292 | struct ocfs2_dinode *local_alloc_copy; | 297 | struct ocfs2_dinode *local_alloc_copy; |
298 | struct ocfs2_quota_recovery *quota_rec; | ||
293 | 299 | ||
294 | struct ocfs2_alloc_stats alloc_stats; | 300 | struct ocfs2_alloc_stats alloc_stats; |
295 | char dev_str[20]; /* "major,minor" of the device */ | 301 | char dev_str[20]; /* "major,minor" of the device */ |
@@ -333,6 +339,10 @@ struct ocfs2_super | |||
333 | 339 | ||
334 | #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) | 340 | #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) |
335 | 341 | ||
342 | /* Useful typedef for passing around journal access functions */ | ||
343 | typedef int (*ocfs2_journal_access_func)(handle_t *handle, struct inode *inode, | ||
344 | struct buffer_head *bh, int type); | ||
345 | |||
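The typedef above is there so that helpers which manipulate several kinds of metadata can be handed the right journal access routine by the caller (for example ocfs2_journal_access_di when the buffer holds a dinode, as in the namei.c hunks earlier). A minimal sketch of the pattern follows; sketch_prepare_bh_for_write is a hypothetical name, not part of the ocfs2 API.

/*
 * Hedged sketch (not an ocfs2 helper): generic code takes the
 * per-metadata-type journal access routine as a parameter.
 */
static int sketch_prepare_bh_for_write(handle_t *handle, struct inode *inode,
                                       struct buffer_head *bh,
                                       ocfs2_journal_access_func access)
{
        int ret;

        /* The caller-chosen routine requests journal write access and knows
         * how to handle any per-type checks (e.g. metadata ECC). */
        ret = access(handle, inode, bh, OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret < 0)
                mlog_errno(ret);

        return ret;
}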
336 | static inline int ocfs2_should_order_data(struct inode *inode) | 346 | static inline int ocfs2_should_order_data(struct inode *inode) |
337 | { | 347 | { |
338 | if (!S_ISREG(inode->i_mode)) | 348 | if (!S_ISREG(inode->i_mode)) |
@@ -376,6 +386,13 @@ static inline int ocfs2_supports_xattr(struct ocfs2_super *osb) | |||
376 | return 0; | 386 | return 0; |
377 | } | 387 | } |
378 | 388 | ||
389 | static inline int ocfs2_meta_ecc(struct ocfs2_super *osb) | ||
390 | { | ||
391 | if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_META_ECC) | ||
392 | return 1; | ||
393 | return 0; | ||
394 | } | ||
395 | |||
379 | /* set / clear functions because cluster events can make these happen | 396 | /* set / clear functions because cluster events can make these happen |
380 | * in parallel so we want the transitions to be atomic. this also | 397 | * in parallel so we want the transitions to be atomic. this also |
381 | * means that any future flags osb_flags must be protected by spinlock | 398 | * means that any future flags osb_flags must be protected by spinlock |
@@ -443,39 +460,19 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb) | |||
443 | #define OCFS2_IS_VALID_DINODE(ptr) \ | 460 | #define OCFS2_IS_VALID_DINODE(ptr) \ |
444 | (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE)) | 461 | (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE)) |
445 | 462 | ||
446 | #define OCFS2_RO_ON_INVALID_DINODE(__sb, __di) do { \ | ||
447 | typeof(__di) ____di = (__di); \ | ||
448 | ocfs2_error((__sb), \ | ||
449 | "Dinode # %llu has bad signature %.*s", \ | ||
450 | (unsigned long long)le64_to_cpu((____di)->i_blkno), 7, \ | ||
451 | (____di)->i_signature); \ | ||
452 | } while (0) | ||
453 | |||
454 | #define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \ | 463 | #define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \ |
455 | (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE)) | 464 | (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE)) |
456 | 465 | ||
457 | #define OCFS2_RO_ON_INVALID_EXTENT_BLOCK(__sb, __eb) do { \ | ||
458 | typeof(__eb) ____eb = (__eb); \ | ||
459 | ocfs2_error((__sb), \ | ||
460 | "Extent Block # %llu has bad signature %.*s", \ | ||
461 | (unsigned long long)le64_to_cpu((____eb)->h_blkno), 7, \ | ||
462 | (____eb)->h_signature); \ | ||
463 | } while (0) | ||
464 | |||
465 | #define OCFS2_IS_VALID_GROUP_DESC(ptr) \ | 466 | #define OCFS2_IS_VALID_GROUP_DESC(ptr) \ |
466 | (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE)) | 467 | (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE)) |
467 | 468 | ||
468 | #define OCFS2_RO_ON_INVALID_GROUP_DESC(__sb, __gd) do { \ | ||
469 | typeof(__gd) ____gd = (__gd); \ | ||
470 | ocfs2_error((__sb), \ | ||
471 | "Group Descriptor # %llu has bad signature %.*s", \ | ||
472 | (unsigned long long)le64_to_cpu((____gd)->bg_blkno), 7, \ | ||
473 | (____gd)->bg_signature); \ | ||
474 | } while (0) | ||
475 | 469 | ||
476 | #define OCFS2_IS_VALID_XATTR_BLOCK(ptr) \ | 470 | #define OCFS2_IS_VALID_XATTR_BLOCK(ptr) \ |
477 | (!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE)) | 471 | (!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE)) |
478 | 472 | ||
473 | #define OCFS2_IS_VALID_DIR_TRAILER(ptr) \ | ||
474 | (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE)) | ||
475 | |||
479 | static inline unsigned long ino_from_blkno(struct super_block *sb, | 476 | static inline unsigned long ino_from_blkno(struct super_block *sb, |
480 | u64 blkno) | 477 | u64 blkno) |
481 | { | 478 | { |
@@ -632,5 +629,6 @@ static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb) | |||
632 | #define ocfs2_clear_bit ext2_clear_bit | 629 | #define ocfs2_clear_bit ext2_clear_bit |
633 | #define ocfs2_test_bit ext2_test_bit | 630 | #define ocfs2_test_bit ext2_test_bit |
634 | #define ocfs2_find_next_zero_bit ext2_find_next_zero_bit | 631 | #define ocfs2_find_next_zero_bit ext2_find_next_zero_bit |
632 | #define ocfs2_find_next_bit ext2_find_next_bit | ||
635 | #endif /* OCFS2_H */ | 633 | #endif /* OCFS2_H */ |
636 | 634 | ||
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 5e0c0d0aef7d..c7ae45aaa36c 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h | |||
@@ -65,6 +65,7 @@ | |||
65 | #define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01" | 65 | #define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01" |
66 | #define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" | 66 | #define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" |
67 | #define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01" | 67 | #define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01" |
68 | #define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1" | ||
68 | 69 | ||
69 | /* Compatibility flags */ | 70 | /* Compatibility flags */ |
70 | #define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ | 71 | #define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ |
@@ -93,8 +94,11 @@ | |||
93 | | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \ | 94 | | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \ |
94 | | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ | 95 | | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ |
95 | | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ | 96 | | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ |
96 | | OCFS2_FEATURE_INCOMPAT_XATTR) | 97 | | OCFS2_FEATURE_INCOMPAT_XATTR \ |
97 | #define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN | 98 | | OCFS2_FEATURE_INCOMPAT_META_ECC) |
99 | #define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ | ||
100 | | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ | ||
101 | | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) | ||
98 | 102 | ||
99 | /* | 103 | /* |
100 | * Heartbeat-only devices are missing journals and other files. The | 104 | * Heartbeat-only devices are missing journals and other files. The |
@@ -147,6 +151,9 @@ | |||
147 | /* Support for extended attributes */ | 151 | /* Support for extended attributes */ |
148 | #define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 | 152 | #define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 |
149 | 153 | ||
154 | /* Metadata checksum and error correction */ | ||
155 | #define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800 | ||
156 | |||
150 | /* | 157 | /* |
151 | * backup superblock flag is used to indicate that this volume | 158 | * backup superblock flag is used to indicate that this volume |
152 | * has backup superblocks. | 159 | * has backup superblocks. |
@@ -163,6 +170,12 @@ | |||
163 | */ | 170 | */ |
164 | #define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001 | 171 | #define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001 |
165 | 172 | ||
173 | /* | ||
174 | * Maintain quota information for this filesystem | ||
175 | */ | ||
176 | #define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002 | ||
177 | #define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004 | ||
178 | |||
166 | /* The byte offset of the first backup block will be 1G. | 179 | /* The byte offset of the first backup block will be 1G. |
167 | * The following will be 4G, 16G, 64G, 256G and 1T. | 180 | * The following will be 4G, 16G, 64G, 256G and 1T. |
168 | */ | 181 | */ |
@@ -192,6 +205,7 @@ | |||
192 | #define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */ | 205 | #define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */ |
193 | #define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */ | 206 | #define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */ |
194 | #define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */ | 207 | #define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */ |
208 | #define OCFS2_QUOTA_FL (0x00001000) /* Quota file */ | ||
195 | 209 | ||
196 | /* | 210 | /* |
197 | * Flags on ocfs2_dinode.i_dyn_features | 211 | * Flags on ocfs2_dinode.i_dyn_features |
@@ -329,13 +343,17 @@ enum { | |||
329 | #define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE | 343 | #define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE |
330 | HEARTBEAT_SYSTEM_INODE, | 344 | HEARTBEAT_SYSTEM_INODE, |
331 | GLOBAL_BITMAP_SYSTEM_INODE, | 345 | GLOBAL_BITMAP_SYSTEM_INODE, |
332 | #define OCFS2_LAST_GLOBAL_SYSTEM_INODE GLOBAL_BITMAP_SYSTEM_INODE | 346 | USER_QUOTA_SYSTEM_INODE, |
347 | GROUP_QUOTA_SYSTEM_INODE, | ||
348 | #define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE | ||
333 | ORPHAN_DIR_SYSTEM_INODE, | 349 | ORPHAN_DIR_SYSTEM_INODE, |
334 | EXTENT_ALLOC_SYSTEM_INODE, | 350 | EXTENT_ALLOC_SYSTEM_INODE, |
335 | INODE_ALLOC_SYSTEM_INODE, | 351 | INODE_ALLOC_SYSTEM_INODE, |
336 | JOURNAL_SYSTEM_INODE, | 352 | JOURNAL_SYSTEM_INODE, |
337 | LOCAL_ALLOC_SYSTEM_INODE, | 353 | LOCAL_ALLOC_SYSTEM_INODE, |
338 | TRUNCATE_LOG_SYSTEM_INODE, | 354 | TRUNCATE_LOG_SYSTEM_INODE, |
355 | LOCAL_USER_QUOTA_SYSTEM_INODE, | ||
356 | LOCAL_GROUP_QUOTA_SYSTEM_INODE, | ||
339 | NUM_SYSTEM_INODES | 357 | NUM_SYSTEM_INODES |
340 | }; | 358 | }; |
341 | 359 | ||
@@ -349,6 +367,8 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { | |||
349 | [SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 }, | 367 | [SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 }, |
350 | [HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 }, | 368 | [HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 }, |
351 | [GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 }, | 369 | [GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 }, |
370 | [USER_QUOTA_SYSTEM_INODE] = { "aquota.user", OCFS2_QUOTA_FL, S_IFREG | 0644 }, | ||
371 | [GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group", OCFS2_QUOTA_FL, S_IFREG | 0644 }, | ||
352 | 372 | ||
353 | /* Slot-specific system inodes (one copy per slot) */ | 373 | /* Slot-specific system inodes (one copy per slot) */ |
354 | [ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 }, | 374 | [ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 }, |
@@ -356,7 +376,9 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { | |||
356 | [INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, | 376 | [INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, |
357 | [JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 }, | 377 | [JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 }, |
358 | [LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 }, | 378 | [LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 }, |
359 | [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 } | 379 | [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 }, |
380 | [LOCAL_USER_QUOTA_SYSTEM_INODE] = { "aquota.user:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 }, | ||
381 | [LOCAL_GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 }, | ||
360 | }; | 382 | }; |
361 | 383 | ||
362 | /* Parameter passed from mount.ocfs2 to module */ | 384 | /* Parameter passed from mount.ocfs2 to module */ |
@@ -410,6 +432,22 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = { | |||
410 | #define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super)) | 432 | #define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super)) |
411 | 433 | ||
412 | /* | 434 | /* |
435 | * Block checking structure. This is used in metadata to validate the | ||
436 | * contents. If OCFS2_FEATURE_INCOMPAT_META_ECC is not set, it is all | ||
437 | * zeros. | ||
438 | */ | ||
439 | struct ocfs2_block_check { | ||
440 | /*00*/ __le32 bc_crc32e; /* 802.3 Ethernet II CRC32 */ | ||
441 | __le16 bc_ecc; /* Single-error-correction parity vector. | ||
442 | This is a simple Hamming code dependent | ||
443 | on the blocksize. OCFS2's maximum | ||
444 | blocksize, 4K, requires 16 parity bits, | ||
445 | so we fit in __le16. */ | ||
446 | __le16 bc_reserved1; | ||
447 | /*08*/ | ||
448 | }; | ||
449 | |||
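This structure only fixes the on-disk layout; the actual CRC/ECC code lives in the new blockcheck code referenced elsewhere in this series (see the "blockcheck.h" include in quota_global.c below). As a rough, hedged sketch of what filling in such a check could look like, assume the CRC covers the whole block with the check fields zeroed and that bc_ecc holds Hamming parity bits over the block's data bits; the sketch_* names are hypothetical and this is not the ocfs2 implementation.

#include <linux/crc32.h>
#include <linux/log2.h>

/* Parity accumulator for a Hamming code over the block's data bits: data
 * bits occupy the codeword positions that are not powers of two, and bit k
 * of a position selects parity bit k. */
static u16 sketch_hamming_parity(const u8 *data, size_t len)
{
        u16 parity = 0;
        size_t i, b, pos = 3;   /* first non-power-of-two codeword position */

        for (i = 0; i < len; i++) {
                for (b = 0; b < 8; b++) {
                        if (data[i] & (1 << b))
                                parity ^= (u16)pos;
                        do {
                                pos++;
                        } while (is_power_of_2(pos));
                }
        }
        return parity;
}

static void sketch_compute_block_check(void *block, size_t blocksize,
                                       struct ocfs2_block_check *bc)
{
        u32 crc;
        u16 ecc;

        /* bc typically lives inside the block, so compute both values while
         * the check fields are zero and only store them afterwards, so the
         * stored values do not feed back into the computation. */
        bc->bc_crc32e = 0;
        bc->bc_ecc = 0;
        bc->bc_reserved1 = 0;
        crc = crc32_le(~0, block, blocksize);
        ecc = sketch_hamming_parity(block, blocksize);
        bc->bc_crc32e = cpu_to_le32(crc);
        bc->bc_ecc = cpu_to_le16(ecc);
}

A verifier would recompute both values the same way and compare; consistent with the struct comment above, the 16 parity bits are enough to locate and flip a single bad bit in a 4K block.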
450 | /* | ||
413 | * On disk extent record for OCFS2 | 451 | * On disk extent record for OCFS2 |
414 | * It describes a range of clusters on disk. | 452 | * It describes a range of clusters on disk. |
415 | * | 453 | * |
@@ -496,7 +534,7 @@ struct ocfs2_truncate_log { | |||
496 | struct ocfs2_extent_block | 534 | struct ocfs2_extent_block |
497 | { | 535 | { |
498 | /*00*/ __u8 h_signature[8]; /* Signature for verification */ | 536 | /*00*/ __u8 h_signature[8]; /* Signature for verification */ |
499 | __le64 h_reserved1; | 537 | struct ocfs2_block_check h_check; /* Error checking */ |
500 | /*10*/ __le16 h_suballoc_slot; /* Slot suballocator this | 538 | /*10*/ __le16 h_suballoc_slot; /* Slot suballocator this |
501 | extent_header belongs to */ | 539 | extent_header belongs to */ |
502 | __le16 h_suballoc_bit; /* Bit offset in suballocator | 540 | __le16 h_suballoc_bit; /* Bit offset in suballocator |
@@ -666,7 +704,8 @@ struct ocfs2_dinode { | |||
666 | was set in i_flags */ | 704 | was set in i_flags */ |
667 | __le16 i_dyn_features; | 705 | __le16 i_dyn_features; |
668 | __le64 i_xattr_loc; | 706 | __le64 i_xattr_loc; |
669 | /*80*/ __le64 i_reserved2[7]; | 707 | /*80*/ struct ocfs2_block_check i_check; /* Error checking */ |
708 | /*88*/ __le64 i_reserved2[6]; | ||
670 | /*B8*/ union { | 709 | /*B8*/ union { |
671 | __le64 i_pad1; /* Generic way to refer to this | 710 | __le64 i_pad1; /* Generic way to refer to this |
672 | 64bit union */ | 711 | 64bit union */ |
@@ -715,6 +754,34 @@ struct ocfs2_dir_entry { | |||
715 | } __attribute__ ((packed)); | 754 | } __attribute__ ((packed)); |
716 | 755 | ||
717 | /* | 756 | /* |
757 | * Per-block record for the unindexed directory btree. This is carefully | ||
758 | * crafted so that the rec_len and name_len records of an ocfs2_dir_entry are | ||
759 | * mirrored. That way, the directory manipulation code needs a minimal amount | ||
760 | * of update. | ||
761 | * | ||
762 | * NOTE: Keep this structure aligned to a multiple of 4 bytes. | ||
763 | */ | ||
764 | struct ocfs2_dir_block_trailer { | ||
765 | /*00*/ __le64 db_compat_inode; /* Always zero. Was inode */ | ||
766 | |||
767 | __le16 db_compat_rec_len; /* Backwards compatible with | ||
768 | * ocfs2_dir_entry. */ | ||
769 | __u8 db_compat_name_len; /* Always zero. Was name_len */ | ||
770 | __u8 db_reserved0; | ||
771 | __le16 db_reserved1; | ||
772 | __le16 db_free_rec_len; /* Size of largest empty hole | ||
773 | * in this block. (unused) */ | ||
774 | /*10*/ __u8 db_signature[8]; /* Signature for verification */ | ||
775 | __le64 db_reserved2; | ||
776 | __le64 db_free_next; /* Next block in list (unused) */ | ||
777 | /*20*/ __le64 db_blkno; /* Offset on disk, in blocks */ | ||
778 | __le64 db_parent_dinode; /* dinode which owns me, in | ||
779 | blocks */ | ||
780 | /*30*/ struct ocfs2_block_check db_check; /* Error checking */ | ||
781 | /*40*/ | ||
782 | }; | ||
783 | |||
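Because db_compat_inode and db_compat_name_len stay zero while db_compat_rec_len mirrors a dirent's rec_len, existing dirent walkers treat the trailer as an unused entry and simply step over it. A hedged sketch of how a trailer at the end of an unindexed directory block might be located and stamped follows; the sketch_* helpers are illustrative names, not the ocfs2 functions, and the usual ocfs2 kernel context (<linux/string.h>, "ocfs2.h", "ocfs2_fs.h") is assumed.

/* Locate the trailer occupying the last sizeof(struct
 * ocfs2_dir_block_trailer) bytes of a directory block. */
static struct ocfs2_dir_block_trailer *
sketch_dir_trailer_from_block(void *data, unsigned long blocksize)
{
        char *p = data;

        p += blocksize - sizeof(struct ocfs2_dir_block_trailer);
        return (struct ocfs2_dir_block_trailer *)p;
}

static void sketch_init_dir_trailer(struct inode *dir, void *data,
                                    unsigned long blocksize, u64 blkno)
{
        struct ocfs2_dir_block_trailer *trailer =
                sketch_dir_trailer_from_block(data, blocksize);

        memset(trailer, 0, sizeof(*trailer));
        memcpy(trailer->db_signature, OCFS2_DIR_TRAILER_SIGNATURE,
               sizeof(trailer->db_signature));
        /*
         * Mirror an unused ocfs2_dir_entry: inode and name_len stay zero and
         * rec_len covers the trailer itself, so legacy dirent walkers skip it.
         */
        trailer->db_compat_rec_len =
                cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer));
        trailer->db_blkno = cpu_to_le64(blkno);
        trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(dir)->ip_blkno);
}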
784 | /* | ||
718 | * On disk allocator group structure for OCFS2 | 785 | * On disk allocator group structure for OCFS2 |
719 | */ | 786 | */ |
720 | struct ocfs2_group_desc | 787 | struct ocfs2_group_desc |
@@ -733,7 +800,8 @@ struct ocfs2_group_desc | |||
733 | /*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in | 800 | /*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in |
734 | blocks */ | 801 | blocks */ |
735 | __le64 bg_blkno; /* Offset on disk, in blocks */ | 802 | __le64 bg_blkno; /* Offset on disk, in blocks */ |
736 | /*30*/ __le64 bg_reserved2[2]; | 803 | /*30*/ struct ocfs2_block_check bg_check; /* Error checking */ |
804 | __le64 bg_reserved2; | ||
737 | /*40*/ __u8 bg_bitmap[0]; | 805 | /*40*/ __u8 bg_bitmap[0]; |
738 | }; | 806 | }; |
739 | 807 | ||
@@ -776,7 +844,12 @@ struct ocfs2_xattr_header { | |||
776 | in this extent record, | 844 | in this extent record, |
777 | only valid in the first | 845 | only valid in the first |
778 | bucket. */ | 846 | bucket. */ |
779 | __le64 xh_csum; | 847 | struct ocfs2_block_check xh_check; /* Error checking |
848 | (Note, this is only | ||
849 | used for xattr | ||
850 | buckets. A block uses | ||
851 | xb_check and sets | ||
852 | this field to zero.) */ | ||
780 | struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */ | 853 | struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */ |
781 | }; | 854 | }; |
782 | 855 | ||
@@ -827,7 +900,7 @@ struct ocfs2_xattr_block { | |||
827 | block group */ | 900 | block group */ |
828 | __le32 xb_fs_generation; /* Must match super block */ | 901 | __le32 xb_fs_generation; /* Must match super block */ |
829 | /*10*/ __le64 xb_blkno; /* Offset on disk, in blocks */ | 902 | /*10*/ __le64 xb_blkno; /* Offset on disk, in blocks */ |
830 | __le64 xb_csum; | 903 | struct ocfs2_block_check xb_check; /* Error checking */ |
831 | /*20*/ __le16 xb_flags; /* Indicates whether this block contains | 904 | /*20*/ __le16 xb_flags; /* Indicates whether this block contains |
832 | real xattr or a xattr tree. */ | 905 | real xattr or a xattr tree. */ |
833 | __le16 xb_reserved0; | 906 | __le16 xb_reserved0; |
@@ -868,6 +941,128 @@ static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe) | |||
868 | return xe->xe_type & OCFS2_XATTR_TYPE_MASK; | 941 | return xe->xe_type & OCFS2_XATTR_TYPE_MASK; |
869 | } | 942 | } |
870 | 943 | ||
944 | /* | ||
945 | * On disk structures for global quota file | ||
946 | */ | ||
947 | |||
948 | /* Magic numbers and known versions for global quota files */ | ||
949 | #define OCFS2_GLOBAL_QMAGICS {\ | ||
950 | 0x0cf52470, /* USRQUOTA */ \ | ||
951 | 0x0cf52471 /* GRPQUOTA */ \ | ||
952 | } | ||
953 | |||
954 | #define OCFS2_GLOBAL_QVERSIONS {\ | ||
955 | 0, \ | ||
956 | 0, \ | ||
957 | } | ||
958 | |||
959 | |||
960 | /* Each block of each quota file has a certain fixed number of bytes reserved | ||
961 | * for OCFS2 internal use at its end. OCFS2 can use it for things like | ||
962 | * checksums, etc. */ | ||
963 | #define OCFS2_QBLK_RESERVED_SPACE 8 | ||
964 | |||
965 | /* Generic header of all quota files */ | ||
966 | struct ocfs2_disk_dqheader { | ||
967 | __le32 dqh_magic; /* Magic number identifying file */ | ||
968 | __le32 dqh_version; /* Quota format version */ | ||
969 | }; | ||
970 | |||
971 | #define OCFS2_GLOBAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader)) | ||
972 | |||
973 | /* Information header of global quota file (immediately follows the generic | ||
974 | * header) */ | ||
975 | struct ocfs2_global_disk_dqinfo { | ||
976 | /*00*/ __le32 dqi_bgrace; /* Grace time for space softlimit excess */ | ||
977 | __le32 dqi_igrace; /* Grace time for inode softlimit excess */ | ||
978 | __le32 dqi_syncms; /* Time after which we sync local changes to | ||
979 | * global quota file */ | ||
980 | __le32 dqi_blocks; /* Number of blocks in quota file */ | ||
981 | /*10*/ __le32 dqi_free_blk; /* First free block in quota file */ | ||
982 | __le32 dqi_free_entry; /* First block with free dquot entry in quota | ||
983 | * file */ | ||
984 | }; | ||
985 | |||
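The generic header sits at offset 0 of the quota file and the global info immediately follows at OCFS2_GLOBAL_INFO_OFF. A hedged sketch of reading and sanity-checking that header is shown below, assuming the ocfs2 kernel context and the ocfs2_quota_read() helper declared in fs/ocfs2/quota.h later in this series; the magic/version tables are indexed by quota type (USRQUOTA/GRPQUOTA) and sketch_* is an illustrative name, not the real format code.

static int sketch_check_global_quota_header(struct super_block *sb, int type)
{
        static const u32 magics[] = OCFS2_GLOBAL_QMAGICS;
        static const u32 versions[] = OCFS2_GLOBAL_QVERSIONS;
        struct ocfs2_disk_dqheader header;
        ssize_t size;

        /* Read the generic header at offset 0 of the global quota file. */
        size = ocfs2_quota_read(sb, type, (char *)&header, sizeof(header), 0);
        if (size != sizeof(header))
                return -EIO;
        if (le32_to_cpu(header.dqh_magic) != magics[type] ||
            le32_to_cpu(header.dqh_version) != versions[type])
                return -EINVAL;
        /* struct ocfs2_global_disk_dqinfo follows at OCFS2_GLOBAL_INFO_OFF. */
        return 0;
}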
986 | /* Structure with global user / group information. We reserve some space | ||
987 | * for future use. */ | ||
988 | struct ocfs2_global_disk_dqblk { | ||
989 | /*00*/ __le32 dqb_id; /* ID the structure belongs to */ | ||
990 | __le32 dqb_use_count; /* Number of nodes having reference to this structure */ | ||
991 | __le64 dqb_ihardlimit; /* absolute limit on allocated inodes */ | ||
992 | /*10*/ __le64 dqb_isoftlimit; /* preferred inode limit */ | ||
993 | __le64 dqb_curinodes; /* current # allocated inodes */ | ||
994 | /*20*/ __le64 dqb_bhardlimit; /* absolute limit on disk space */ | ||
995 | __le64 dqb_bsoftlimit; /* preferred limit on disk space */ | ||
996 | /*30*/ __le64 dqb_curspace; /* current space occupied */ | ||
997 | __le64 dqb_btime; /* time limit for excessive disk use */ | ||
998 | /*40*/ __le64 dqb_itime; /* time limit for excessive inode use */ | ||
999 | __le64 dqb_pad1; | ||
1000 | /*50*/ __le64 dqb_pad2; | ||
1001 | }; | ||
1002 | |||
1003 | /* | ||
1004 | * On-disk structures for local quota file | ||
1005 | */ | ||
1006 | |||
1007 | /* Magic numbers and known versions for local quota files */ | ||
1008 | #define OCFS2_LOCAL_QMAGICS {\ | ||
1009 | 0x0cf524c0, /* USRQUOTA */ \ | ||
1010 | 0x0cf524c1 /* GRPQUOTA */ \ | ||
1011 | } | ||
1012 | |||
1013 | #define OCFS2_LOCAL_QVERSIONS {\ | ||
1014 | 0, \ | ||
1015 | 0, \ | ||
1016 | } | ||
1017 | |||
1018 | /* Quota flags in dqinfo header */ | ||
1019 | #define OLQF_CLEAN 0x0001 /* Quota file is empty (this should be after | ||
1020 | * quota has been cleanly turned off) */ | ||
1021 | |||
1022 | #define OCFS2_LOCAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader)) | ||
1023 | |||
1024 | /* Information header of local quota file (immediately follows the generic | ||
1025 | * header) */ | ||
1026 | struct ocfs2_local_disk_dqinfo { | ||
1027 | __le32 dqi_flags; /* Flags for quota file */ | ||
1028 | __le32 dqi_chunks; /* Number of chunks of quota structures | ||
1029 | * with a bitmap */ | ||
1030 | __le32 dqi_blocks; /* Number of blocks allocated for quota file */ | ||
1031 | }; | ||
1032 | |||
1033 | /* Header of one chunk of a quota file */ | ||
1034 | struct ocfs2_local_disk_chunk { | ||
1035 | __le32 dqc_free; /* Number of free entries in the bitmap */ | ||
1036 | u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding | ||
1037 | * chunk of quota file */ | ||
1038 | }; | ||
1039 | |||
1040 | /* One entry in local quota file */ | ||
1041 | struct ocfs2_local_disk_dqblk { | ||
1042 | /*00*/ __le64 dqb_id; /* id this quota applies to */ | ||
1043 | __le64 dqb_spacemod; /* Change in the amount of used space */ | ||
1044 | /*10*/ __le64 dqb_inodemod; /* Change in the amount of used inodes */ | ||
1045 | }; | ||
1046 | |||
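A chunk is a header block holding struct ocfs2_local_disk_chunk (a free count plus a bitmap) followed by blocks of struct ocfs2_local_disk_dqblk entries, and every quota block keeps OCFS2_QBLK_RESERVED_SPACE bytes at its end for the trailer. A hedged sketch of the capacity arithmetic this layout implies follows; the formulas are assumptions drawn from the structures above and the sketch_* names are illustrative, not the ocfs2 helpers.

static inline unsigned int sketch_local_dqblks_per_block(struct super_block *sb)
{
        /* ocfs2_local_disk_dqblk entries that fit in one data block */
        return (sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) /
                sizeof(struct ocfs2_local_disk_dqblk);
}

static inline unsigned int sketch_local_chunk_entries(struct super_block *sb)
{
        /*
         * Bits available in dqc_bitmap of the chunk header block, i.e. the
         * maximum number of dquot entries one chunk can track.
         */
        return (sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE -
                sizeof(struct ocfs2_local_disk_chunk)) * 8;
}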
1047 | |||
1048 | /* | ||
1049 | * The quota trailer lives at the end of each quota block. | ||
1050 | */ | ||
1051 | |||
1052 | struct ocfs2_disk_dqtrailer { | ||
1053 | /*00*/ struct ocfs2_block_check dq_check; /* Error checking */ | ||
1054 | /*08*/ /* Cannot be larger than OCFS2_QBLK_RESERVED_SPACE */ | ||
1055 | }; | ||
1056 | |||
1057 | static inline struct ocfs2_disk_dqtrailer *ocfs2_block_dqtrailer(int blocksize, | ||
1058 | void *buf) | ||
1059 | { | ||
1060 | char *ptr = buf; | ||
1061 | ptr += blocksize - OCFS2_QBLK_RESERVED_SPACE; | ||
1062 | |||
1063 | return (struct ocfs2_disk_dqtrailer *)ptr; | ||
1064 | } | ||
1065 | |||
871 | #ifdef __KERNEL__ | 1066 | #ifdef __KERNEL__ |
872 | static inline int ocfs2_fast_symlink_chars(struct super_block *sb) | 1067 | static inline int ocfs2_fast_symlink_chars(struct super_block *sb) |
873 | { | 1068 | { |
diff --git a/fs/ocfs2/ocfs2_jbd_compat.h b/fs/ocfs2/ocfs2_jbd_compat.h deleted file mode 100644 index b91c78f8f558..000000000000 --- a/fs/ocfs2/ocfs2_jbd_compat.h +++ /dev/null | |||
@@ -1,82 +0,0 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * ocfs2_jbd_compat.h | ||
5 | * | ||
6 | * Compatibility defines for JBD. | ||
7 | * | ||
8 | * Copyright (C) 2008 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License version 2 as published by the Free Software Foundation. | ||
13 | * | ||
14 | * This program is distributed in the hope that it will be useful, | ||
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
17 | * General Public License for more details. | ||
18 | */ | ||
19 | |||
20 | #ifndef OCFS2_JBD_COMPAT_H | ||
21 | #define OCFS2_JBD_COMPAT_H | ||
22 | |||
23 | #ifndef CONFIG_OCFS2_COMPAT_JBD | ||
24 | # error Should not have been included | ||
25 | #endif | ||
26 | |||
27 | struct jbd2_inode { | ||
28 | unsigned int dummy; | ||
29 | }; | ||
30 | |||
31 | #define JBD2_BARRIER JFS_BARRIER | ||
32 | #define JBD2_DEFAULT_MAX_COMMIT_AGE JBD_DEFAULT_MAX_COMMIT_AGE | ||
33 | |||
34 | #define jbd2_journal_ack_err journal_ack_err | ||
35 | #define jbd2_journal_clear_err journal_clear_err | ||
36 | #define jbd2_journal_destroy journal_destroy | ||
37 | #define jbd2_journal_dirty_metadata journal_dirty_metadata | ||
38 | #define jbd2_journal_errno journal_errno | ||
39 | #define jbd2_journal_extend journal_extend | ||
40 | #define jbd2_journal_flush journal_flush | ||
41 | #define jbd2_journal_force_commit journal_force_commit | ||
42 | #define jbd2_journal_get_write_access journal_get_write_access | ||
43 | #define jbd2_journal_get_undo_access journal_get_undo_access | ||
44 | #define jbd2_journal_init_inode journal_init_inode | ||
45 | #define jbd2_journal_invalidatepage journal_invalidatepage | ||
46 | #define jbd2_journal_load journal_load | ||
47 | #define jbd2_journal_lock_updates journal_lock_updates | ||
48 | #define jbd2_journal_restart journal_restart | ||
49 | #define jbd2_journal_start journal_start | ||
50 | #define jbd2_journal_start_commit journal_start_commit | ||
51 | #define jbd2_journal_stop journal_stop | ||
52 | #define jbd2_journal_try_to_free_buffers journal_try_to_free_buffers | ||
53 | #define jbd2_journal_unlock_updates journal_unlock_updates | ||
54 | #define jbd2_journal_wipe journal_wipe | ||
55 | #define jbd2_log_wait_commit log_wait_commit | ||
56 | |||
57 | static inline int jbd2_journal_file_inode(handle_t *handle, | ||
58 | struct jbd2_inode *inode) | ||
59 | { | ||
60 | return 0; | ||
61 | } | ||
62 | |||
63 | static inline int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, | ||
64 | loff_t new_size) | ||
65 | { | ||
66 | return 0; | ||
67 | } | ||
68 | |||
69 | static inline void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, | ||
70 | struct inode *inode) | ||
71 | { | ||
72 | return; | ||
73 | } | ||
74 | |||
75 | static inline void jbd2_journal_release_jbd_inode(journal_t *journal, | ||
76 | struct jbd2_inode *jinode) | ||
77 | { | ||
78 | return; | ||
79 | } | ||
80 | |||
81 | |||
82 | #endif /* OCFS2_JBD_COMPAT_H */ | ||
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index 82c200f7a8f1..eb6f50c9ceca 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h | |||
@@ -46,6 +46,7 @@ enum ocfs2_lock_type { | |||
46 | OCFS2_LOCK_TYPE_DENTRY, | 46 | OCFS2_LOCK_TYPE_DENTRY, |
47 | OCFS2_LOCK_TYPE_OPEN, | 47 | OCFS2_LOCK_TYPE_OPEN, |
48 | OCFS2_LOCK_TYPE_FLOCK, | 48 | OCFS2_LOCK_TYPE_FLOCK, |
49 | OCFS2_LOCK_TYPE_QINFO, | ||
49 | OCFS2_NUM_LOCK_TYPES | 50 | OCFS2_NUM_LOCK_TYPES |
50 | }; | 51 | }; |
51 | 52 | ||
@@ -77,6 +78,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) | |||
77 | case OCFS2_LOCK_TYPE_FLOCK: | 78 | case OCFS2_LOCK_TYPE_FLOCK: |
78 | c = 'F'; | 79 | c = 'F'; |
79 | break; | 80 | break; |
81 | case OCFS2_LOCK_TYPE_QINFO: | ||
82 | c = 'Q'; | ||
83 | break; | ||
80 | default: | 84 | default: |
81 | c = '\0'; | 85 | c = '\0'; |
82 | } | 86 | } |
@@ -95,6 +99,7 @@ static char *ocfs2_lock_type_strings[] = { | |||
95 | [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", | 99 | [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", |
96 | [OCFS2_LOCK_TYPE_OPEN] = "Open", | 100 | [OCFS2_LOCK_TYPE_OPEN] = "Open", |
97 | [OCFS2_LOCK_TYPE_FLOCK] = "Flock", | 101 | [OCFS2_LOCK_TYPE_FLOCK] = "Flock", |
102 | [OCFS2_LOCK_TYPE_QINFO] = "Quota", | ||
98 | }; | 103 | }; |
99 | 104 | ||
100 | static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) | 105 | static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) |
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h new file mode 100644 index 000000000000..7365e2e08706 --- /dev/null +++ b/fs/ocfs2/quota.h | |||
@@ -0,0 +1,119 @@ | |||
1 | /* | ||
2 | * quota.h for OCFS2 | ||
3 | * | ||
4 | * On-disk quota structures for the local and global quota files, plus the | ||
5 | * related in-memory structures. | ||
6 | * | ||
7 | */ | ||
8 | |||
9 | #ifndef _OCFS2_QUOTA_H | ||
10 | #define _OCFS2_QUOTA_H | ||
11 | |||
12 | #include <linux/types.h> | ||
13 | #include <linux/slab.h> | ||
14 | #include <linux/quota.h> | ||
15 | #include <linux/list.h> | ||
16 | #include <linux/dqblk_qtree.h> | ||
17 | |||
18 | #include "ocfs2.h" | ||
19 | |||
20 | /* Common stuff */ | ||
21 | /* id number of quota format */ | ||
22 | #define QFMT_OCFS2 3 | ||
23 | |||
24 | /* | ||
25 | * In-memory structures | ||
26 | */ | ||
27 | struct ocfs2_dquot { | ||
28 | struct dquot dq_dquot; /* Generic VFS dquot */ | ||
29 | loff_t dq_local_off; /* Offset in the local quota file */ | ||
30 | struct ocfs2_quota_chunk *dq_chunk; /* Chunk dquot is in */ | ||
31 | unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */ | ||
32 | s64 dq_origspace; /* Last globally synced space usage */ | ||
33 | s64 dq_originodes; /* Last globally synced inode usage */ | ||
34 | }; | ||
35 | |||
36 | /* Description of one chunk to recover in memory */ | ||
37 | struct ocfs2_recovery_chunk { | ||
38 | struct list_head rc_list; /* List of chunks */ | ||
39 | int rc_chunk; /* Chunk number */ | ||
40 | unsigned long *rc_bitmap; /* Bitmap of entries to recover */ | ||
41 | }; | ||
42 | |||
43 | struct ocfs2_quota_recovery { | ||
44 | struct list_head r_list[MAXQUOTAS]; /* List of chunks to recover */ | ||
45 | }; | ||
46 | |||
47 | /* In-memory structure with quota header information */ | ||
48 | struct ocfs2_mem_dqinfo { | ||
49 | unsigned int dqi_type; /* Quota type this structure describes */ | ||
50 | unsigned int dqi_chunks; /* Number of chunks in local quota file */ | ||
51 | unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */ | ||
52 | unsigned int dqi_syncms; /* How often should we sync with other nodes */ | ||
53 | unsigned int dqi_syncjiff; /* Precomputed dqi_syncms in jiffies */ | ||
54 | struct list_head dqi_chunk; /* List of chunks */ | ||
55 | struct inode *dqi_gqinode; /* Global quota file inode */ | ||
56 | struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */ | ||
57 | struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained */ | ||
58 | int dqi_gqi_count; /* Number of holders of dqi_gqi_bh */ | ||
59 | struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */ | ||
60 | struct buffer_head *dqi_ibh; /* Buffer with information header */ | ||
61 | struct qtree_mem_dqinfo dqi_gi; /* Info about global file */ | ||
62 | struct delayed_work dqi_sync_work; /* Work for syncing dquots */ | ||
63 | struct ocfs2_quota_recovery *dqi_rec; /* Pointer to recovery | ||
64 | * information, in case we | ||
65 | * enable quotas on file | ||
66 | * needing it */ | ||
67 | }; | ||
68 | |||
69 | static inline struct ocfs2_dquot *OCFS2_DQUOT(struct dquot *dquot) | ||
70 | { | ||
71 | return container_of(dquot, struct ocfs2_dquot, dq_dquot); | ||
72 | } | ||
73 | |||
74 | struct ocfs2_quota_chunk { | ||
75 | struct list_head qc_chunk; /* List of quotafile chunks */ | ||
76 | int qc_num; /* Number of quota chunk */ | ||
77 | struct buffer_head *qc_headerbh; /* Buffer head with chunk header */ | ||
78 | }; | ||
79 | |||
80 | extern struct kmem_cache *ocfs2_dquot_cachep; | ||
81 | extern struct kmem_cache *ocfs2_qf_chunk_cachep; | ||
82 | |||
83 | extern struct qtree_fmt_operations ocfs2_global_ops; | ||
84 | |||
85 | struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery( | ||
86 | struct ocfs2_super *osb, int slot_num); | ||
87 | int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, | ||
88 | struct ocfs2_quota_recovery *rec, | ||
89 | int slot_num); | ||
90 | void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec); | ||
91 | ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data, | ||
92 | size_t len, loff_t off); | ||
93 | ssize_t ocfs2_quota_write(struct super_block *sb, int type, | ||
94 | const char *data, size_t len, loff_t off); | ||
95 | int ocfs2_global_read_info(struct super_block *sb, int type); | ||
96 | int ocfs2_global_write_info(struct super_block *sb, int type); | ||
97 | int ocfs2_global_read_dquot(struct dquot *dquot); | ||
98 | int __ocfs2_sync_dquot(struct dquot *dquot, int freeing); | ||
99 | static inline int ocfs2_sync_dquot(struct dquot *dquot) | ||
100 | { | ||
101 | return __ocfs2_sync_dquot(dquot, 0); | ||
102 | } | ||
103 | static inline int ocfs2_global_release_dquot(struct dquot *dquot) | ||
104 | { | ||
105 | return __ocfs2_sync_dquot(dquot, 1); | ||
106 | } | ||
107 | |||
108 | int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex); | ||
109 | void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex); | ||
110 | int ocfs2_read_quota_block(struct inode *inode, u64 v_block, | ||
111 | struct buffer_head **bh); | ||
112 | |||
113 | extern struct dquot_operations ocfs2_quota_operations; | ||
114 | extern struct quota_format_type ocfs2_quota_format; | ||
115 | |||
116 | int ocfs2_quota_setup(void); | ||
117 | void ocfs2_quota_shutdown(void); | ||
118 | |||
119 | #endif /* _OCFS2_QUOTA_H */ | ||
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c new file mode 100644 index 000000000000..6aff8f2d3e49 --- /dev/null +++ b/fs/ocfs2/quota_global.c | |||
@@ -0,0 +1,1025 @@ | |||
1 | /* | ||
2 | * Implementation of operations over global quota file | ||
3 | */ | ||
4 | #include <linux/spinlock.h> | ||
5 | #include <linux/fs.h> | ||
6 | #include <linux/quota.h> | ||
7 | #include <linux/quotaops.h> | ||
8 | #include <linux/dqblk_qtree.h> | ||
9 | #include <linux/jiffies.h> | ||
10 | #include <linux/writeback.h> | ||
11 | #include <linux/workqueue.h> | ||
12 | |||
13 | #define MLOG_MASK_PREFIX ML_QUOTA | ||
14 | #include <cluster/masklog.h> | ||
15 | |||
16 | #include "ocfs2_fs.h" | ||
17 | #include "ocfs2.h" | ||
18 | #include "alloc.h" | ||
19 | #include "blockcheck.h" | ||
20 | #include "inode.h" | ||
21 | #include "journal.h" | ||
22 | #include "file.h" | ||
23 | #include "sysfile.h" | ||
24 | #include "dlmglue.h" | ||
25 | #include "uptodate.h" | ||
26 | #include "quota.h" | ||
27 | |||
28 | static struct workqueue_struct *ocfs2_quota_wq = NULL; | ||
29 | |||
30 | static void qsync_work_fn(struct work_struct *work); | ||
31 | |||
32 | static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp) | ||
33 | { | ||
34 | struct ocfs2_global_disk_dqblk *d = dp; | ||
35 | struct mem_dqblk *m = &dquot->dq_dqb; | ||
36 | |||
37 | /* Update from disk only the entries not set by the admin */ | ||
38 | if (!test_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags)) { | ||
39 | m->dqb_ihardlimit = le64_to_cpu(d->dqb_ihardlimit); | ||
40 | m->dqb_isoftlimit = le64_to_cpu(d->dqb_isoftlimit); | ||
41 | } | ||
42 | if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags)) | ||
43 | m->dqb_curinodes = le64_to_cpu(d->dqb_curinodes); | ||
44 | if (!test_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags)) { | ||
45 | m->dqb_bhardlimit = le64_to_cpu(d->dqb_bhardlimit); | ||
46 | m->dqb_bsoftlimit = le64_to_cpu(d->dqb_bsoftlimit); | ||
47 | } | ||
48 | if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags)) | ||
49 | m->dqb_curspace = le64_to_cpu(d->dqb_curspace); | ||
50 | if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags)) | ||
51 | m->dqb_btime = le64_to_cpu(d->dqb_btime); | ||
52 | if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags)) | ||
53 | m->dqb_itime = le64_to_cpu(d->dqb_itime); | ||
54 | OCFS2_DQUOT(dquot)->dq_use_count = le32_to_cpu(d->dqb_use_count); | ||
55 | } | ||
56 | |||
57 | static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot) | ||
58 | { | ||
59 | struct ocfs2_global_disk_dqblk *d = dp; | ||
60 | struct mem_dqblk *m = &dquot->dq_dqb; | ||
61 | |||
62 | d->dqb_id = cpu_to_le32(dquot->dq_id); | ||
63 | d->dqb_use_count = cpu_to_le32(OCFS2_DQUOT(dquot)->dq_use_count); | ||
64 | d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit); | ||
65 | d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit); | ||
66 | d->dqb_curinodes = cpu_to_le64(m->dqb_curinodes); | ||
67 | d->dqb_bhardlimit = cpu_to_le64(m->dqb_bhardlimit); | ||
68 | d->dqb_bsoftlimit = cpu_to_le64(m->dqb_bsoftlimit); | ||
69 | d->dqb_curspace = cpu_to_le64(m->dqb_curspace); | ||
70 | d->dqb_btime = cpu_to_le64(m->dqb_btime); | ||
71 | d->dqb_itime = cpu_to_le64(m->dqb_itime); | ||
72 | } | ||
73 | |||
74 | static int ocfs2_global_is_id(void *dp, struct dquot *dquot) | ||
75 | { | ||
76 | struct ocfs2_global_disk_dqblk *d = dp; | ||
77 | struct ocfs2_mem_dqinfo *oinfo = | ||
78 | sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; | ||
79 | |||
80 | if (qtree_entry_unused(&oinfo->dqi_gi, dp)) | ||
81 | return 0; | ||
82 | return le32_to_cpu(d->dqb_id) == dquot->dq_id; | ||
83 | } | ||
84 | |||
85 | struct qtree_fmt_operations ocfs2_global_ops = { | ||
86 | .mem2disk_dqblk = ocfs2_global_mem2diskdqb, | ||
87 | .disk2mem_dqblk = ocfs2_global_disk2memdqb, | ||
88 | .is_id = ocfs2_global_is_id, | ||
89 | }; | ||
90 | |||
91 | static int ocfs2_validate_quota_block(struct super_block *sb, | ||
92 | struct buffer_head *bh) | ||
93 | { | ||
94 | struct ocfs2_disk_dqtrailer *dqt = | ||
95 | ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data); | ||
96 | |||
97 | mlog(0, "Validating quota block %llu\n", | ||
98 | (unsigned long long)bh->b_blocknr); | ||
99 | |||
100 | BUG_ON(!buffer_uptodate(bh)); | ||
101 | |||
102 | /* | ||
103 | * If the ecc fails, we return the error but otherwise | ||
104 | * leave the filesystem running. We know any error is | ||
105 | * local to this block. | ||
106 | */ | ||
107 | return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check); | ||
108 | } | ||
109 | |||
110 | int ocfs2_read_quota_block(struct inode *inode, u64 v_block, | ||
111 | struct buffer_head **bh) | ||
112 | { | ||
113 | int rc = 0; | ||
114 | struct buffer_head *tmp = *bh; | ||
115 | |||
116 | rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0, | ||
117 | ocfs2_validate_quota_block); | ||
118 | if (rc) | ||
119 | mlog_errno(rc); | ||
120 | |||
121 | /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */ | ||
122 | if (!rc && !*bh) | ||
123 | *bh = tmp; | ||
124 | |||
125 | return rc; | ||
126 | } | ||
127 | |||
128 | static int ocfs2_get_quota_block(struct inode *inode, int block, | ||
129 | struct buffer_head **bh) | ||
130 | { | ||
131 | u64 pblock, pcount; | ||
132 | int err; | ||
133 | |||
134 | down_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
135 | err = ocfs2_extent_map_get_blocks(inode, block, &pblock, &pcount, NULL); | ||
136 | up_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
137 | if (err) { | ||
138 | mlog_errno(err); | ||
139 | return err; | ||
140 | } | ||
141 | *bh = sb_getblk(inode->i_sb, pblock); | ||
142 | if (!*bh) { | ||
143 | err = -EIO; | ||
144 | mlog_errno(err); | ||
145 | } | ||
146 | return err; | ||
147 | } | ||
148 | |||
149 | /* Read data from global quotafile - avoid pagecache and such because we cannot | ||
150 | * afford acquiring the locks... We use quota cluster lock to serialize | ||
151 | * operations. Caller is responsible for acquiring it. */ | ||
152 | ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data, | ||
153 | size_t len, loff_t off) | ||
154 | { | ||
155 | struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; | ||
156 | struct inode *gqinode = oinfo->dqi_gqinode; | ||
157 | loff_t i_size = i_size_read(gqinode); | ||
158 | int offset = off & (sb->s_blocksize - 1); | ||
159 | sector_t blk = off >> sb->s_blocksize_bits; | ||
160 | int err = 0; | ||
161 | struct buffer_head *bh; | ||
162 | size_t toread, tocopy; | ||
163 | |||
164 | if (off > i_size) | ||
165 | return 0; | ||
166 | if (off + len > i_size) | ||
167 | len = i_size - off; | ||
168 | toread = len; | ||
169 | while (toread > 0) { | ||
170 | tocopy = min_t(size_t, (sb->s_blocksize - offset), toread); | ||
171 | bh = NULL; | ||
172 | err = ocfs2_read_quota_block(gqinode, blk, &bh); | ||
173 | if (err) { | ||
174 | mlog_errno(err); | ||
175 | return err; | ||
176 | } | ||
177 | memcpy(data, bh->b_data + offset, tocopy); | ||
178 | brelse(bh); | ||
179 | offset = 0; | ||
180 | toread -= tocopy; | ||
181 | data += tocopy; | ||
182 | blk++; | ||
183 | } | ||
184 | return len; | ||
185 | } | ||
186 | |||
187 | /* Write to quotafile (we know the transaction is already started and has | ||
188 | * enough credits) */ | ||
189 | ssize_t ocfs2_quota_write(struct super_block *sb, int type, | ||
190 | const char *data, size_t len, loff_t off) | ||
191 | { | ||
192 | struct mem_dqinfo *info = sb_dqinfo(sb, type); | ||
193 | struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; | ||
194 | struct inode *gqinode = oinfo->dqi_gqinode; | ||
195 | int offset = off & (sb->s_blocksize - 1); | ||
196 | sector_t blk = off >> sb->s_blocksize_bits; | ||
197 | int err = 0, new = 0, ja_type; | ||
198 | struct buffer_head *bh = NULL; | ||
199 | handle_t *handle = journal_current_handle(); | ||
200 | |||
201 | if (!handle) { | ||
202 | mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled " | ||
203 | "because transaction was not started.\n", | ||
204 | (unsigned long long)off, (unsigned long long)len); | ||
205 | return -EIO; | ||
206 | } | ||
207 | if (len > sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset) { | ||
208 | WARN_ON(1); | ||
209 | len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset; | ||
210 | } | ||
211 | |||
212 | mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA); | ||
213 | if (gqinode->i_size < off + len) { | ||
214 | down_write(&OCFS2_I(gqinode)->ip_alloc_sem); | ||
215 | err = ocfs2_extend_no_holes(gqinode, off + len, off); | ||
216 | up_write(&OCFS2_I(gqinode)->ip_alloc_sem); | ||
217 | if (err < 0) | ||
218 | goto out; | ||
219 | err = ocfs2_simple_size_update(gqinode, | ||
220 | oinfo->dqi_gqi_bh, | ||
221 | off + len); | ||
222 | if (err < 0) | ||
223 | goto out; | ||
224 | new = 1; | ||
225 | } | ||
226 | /* Not rewriting whole block? */ | ||
227 | if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) && | ||
228 | !new) { | ||
229 | err = ocfs2_read_quota_block(gqinode, blk, &bh); | ||
230 | ja_type = OCFS2_JOURNAL_ACCESS_WRITE; | ||
231 | } else { | ||
232 | err = ocfs2_get_quota_block(gqinode, blk, &bh); | ||
233 | ja_type = OCFS2_JOURNAL_ACCESS_CREATE; | ||
234 | } | ||
235 | if (err) { | ||
236 | mlog_errno(err); | ||
237 | 		goto out; | ||
238 | } | ||
239 | lock_buffer(bh); | ||
240 | if (new) | ||
241 | memset(bh->b_data, 0, sb->s_blocksize); | ||
242 | memcpy(bh->b_data + offset, data, len); | ||
243 | flush_dcache_page(bh->b_page); | ||
244 | set_buffer_uptodate(bh); | ||
245 | unlock_buffer(bh); | ||
246 | ocfs2_set_buffer_uptodate(gqinode, bh); | ||
247 | err = ocfs2_journal_access_dq(handle, gqinode, bh, ja_type); | ||
248 | if (err < 0) { | ||
249 | brelse(bh); | ||
250 | goto out; | ||
251 | } | ||
252 | err = ocfs2_journal_dirty(handle, bh); | ||
253 | brelse(bh); | ||
254 | if (err < 0) | ||
255 | goto out; | ||
256 | out: | ||
257 | if (err) { | ||
258 | mutex_unlock(&gqinode->i_mutex); | ||
259 | mlog_errno(err); | ||
260 | return err; | ||
261 | } | ||
262 | gqinode->i_version++; | ||
263 | ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh); | ||
264 | mutex_unlock(&gqinode->i_mutex); | ||
265 | return len; | ||
266 | } | ||
267 | |||
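| /* Take the cluster lock on the global quota file inode. dqi_gqi_bh | ||
|  * caches the inode's buffer head so that helpers running under the | ||
|  * lock (e.g. ocfs2_quota_write()) can reach it; dqi_gqi_count counts | ||
|  * the nested lock holders sharing that cached buffer head. */ | ||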
268 | int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) | ||
269 | { | ||
270 | int status; | ||
271 | struct buffer_head *bh = NULL; | ||
272 | |||
273 | status = ocfs2_inode_lock(oinfo->dqi_gqinode, &bh, ex); | ||
274 | if (status < 0) | ||
275 | return status; | ||
276 | spin_lock(&dq_data_lock); | ||
277 | if (!oinfo->dqi_gqi_count++) | ||
278 | oinfo->dqi_gqi_bh = bh; | ||
279 | else | ||
280 | WARN_ON(bh != oinfo->dqi_gqi_bh); | ||
281 | spin_unlock(&dq_data_lock); | ||
282 | return 0; | ||
283 | } | ||
284 | |||
285 | void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) | ||
286 | { | ||
287 | ocfs2_inode_unlock(oinfo->dqi_gqinode, ex); | ||
288 | brelse(oinfo->dqi_gqi_bh); | ||
289 | spin_lock(&dq_data_lock); | ||
290 | if (!--oinfo->dqi_gqi_count) | ||
291 | oinfo->dqi_gqi_bh = NULL; | ||
292 | spin_unlock(&dq_data_lock); | ||
293 | } | ||
294 | |||
295 | /* Read information header from global quota file */ | ||
296 | int ocfs2_global_read_info(struct super_block *sb, int type) | ||
297 | { | ||
298 | struct inode *gqinode = NULL; | ||
299 | unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, | ||
300 | GROUP_QUOTA_SYSTEM_INODE }; | ||
301 | struct ocfs2_global_disk_dqinfo dinfo; | ||
302 | struct mem_dqinfo *info = sb_dqinfo(sb, type); | ||
303 | struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; | ||
304 | int status; | ||
305 | |||
306 | mlog_entry_void(); | ||
307 | |||
308 | /* Read global header */ | ||
309 | gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type], | ||
310 | OCFS2_INVALID_SLOT); | ||
311 | if (!gqinode) { | ||
312 | mlog(ML_ERROR, "failed to get global quota inode (type=%d)\n", | ||
313 | type); | ||
314 | status = -EINVAL; | ||
315 | goto out_err; | ||
316 | } | ||
317 | oinfo->dqi_gi.dqi_sb = sb; | ||
318 | oinfo->dqi_gi.dqi_type = type; | ||
319 | ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo); | ||
320 | oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk); | ||
321 | oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops; | ||
322 | oinfo->dqi_gqi_bh = NULL; | ||
323 | oinfo->dqi_gqi_count = 0; | ||
324 | oinfo->dqi_gqinode = gqinode; | ||
325 | status = ocfs2_lock_global_qf(oinfo, 0); | ||
326 | if (status < 0) { | ||
327 | mlog_errno(status); | ||
328 | goto out_err; | ||
329 | } | ||
330 | status = sb->s_op->quota_read(sb, type, (char *)&dinfo, | ||
331 | sizeof(struct ocfs2_global_disk_dqinfo), | ||
332 | OCFS2_GLOBAL_INFO_OFF); | ||
333 | ocfs2_unlock_global_qf(oinfo, 0); | ||
334 | if (status != sizeof(struct ocfs2_global_disk_dqinfo)) { | ||
335 | mlog(ML_ERROR, "Cannot read global quota info (%d).\n", | ||
336 | status); | ||
337 | if (status >= 0) | ||
338 | status = -EIO; | ||
339 | mlog_errno(status); | ||
340 | goto out_err; | ||
341 | } | ||
342 | info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); | ||
343 | info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); | ||
344 | oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms); | ||
345 | oinfo->dqi_syncjiff = msecs_to_jiffies(oinfo->dqi_syncms); | ||
346 | oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); | ||
347 | oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk); | ||
348 | oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry); | ||
349 | oinfo->dqi_gi.dqi_blocksize_bits = sb->s_blocksize_bits; | ||
350 | oinfo->dqi_gi.dqi_usable_bs = sb->s_blocksize - | ||
351 | OCFS2_QBLK_RESERVED_SPACE; | ||
352 | oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi); | ||
353 | INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn); | ||
354 | queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, | ||
355 | oinfo->dqi_syncjiff); | ||
356 | |||
357 | out_err: | ||
358 | mlog_exit(status); | ||
359 | return status; | ||
360 | } | ||
361 | |||
362 | /* Write information to global quota file. Expects exclusive lock on quota | ||
363 | * file inode and quota info */ | ||
364 | static int __ocfs2_global_write_info(struct super_block *sb, int type) | ||
365 | { | ||
366 | struct mem_dqinfo *info = sb_dqinfo(sb, type); | ||
367 | struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; | ||
368 | struct ocfs2_global_disk_dqinfo dinfo; | ||
369 | ssize_t size; | ||
370 | |||
371 | spin_lock(&dq_data_lock); | ||
372 | info->dqi_flags &= ~DQF_INFO_DIRTY; | ||
373 | dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace); | ||
374 | dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace); | ||
375 | spin_unlock(&dq_data_lock); | ||
376 | dinfo.dqi_syncms = cpu_to_le32(oinfo->dqi_syncms); | ||
377 | dinfo.dqi_blocks = cpu_to_le32(oinfo->dqi_gi.dqi_blocks); | ||
378 | dinfo.dqi_free_blk = cpu_to_le32(oinfo->dqi_gi.dqi_free_blk); | ||
379 | dinfo.dqi_free_entry = cpu_to_le32(oinfo->dqi_gi.dqi_free_entry); | ||
380 | size = sb->s_op->quota_write(sb, type, (char *)&dinfo, | ||
381 | sizeof(struct ocfs2_global_disk_dqinfo), | ||
382 | OCFS2_GLOBAL_INFO_OFF); | ||
383 | if (size != sizeof(struct ocfs2_global_disk_dqinfo)) { | ||
384 | mlog(ML_ERROR, "Cannot write global quota info structure\n"); | ||
385 | if (size >= 0) | ||
386 | size = -EIO; | ||
387 | return size; | ||
388 | } | ||
389 | return 0; | ||
390 | } | ||
391 | |||
392 | int ocfs2_global_write_info(struct super_block *sb, int type) | ||
393 | { | ||
394 | int err; | ||
395 | struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; | ||
396 | |||
397 | err = ocfs2_qinfo_lock(info, 1); | ||
398 | if (err < 0) | ||
399 | return err; | ||
400 | err = __ocfs2_global_write_info(sb, type); | ||
401 | ocfs2_qinfo_unlock(info, 1); | ||
402 | return err; | ||
403 | } | ||
404 | |||
405 | /* Read in information from global quota file and acquire a reference to it. | ||
406 | * dquot_acquire() has already started the transaction and locked quota file */ | ||
407 | int ocfs2_global_read_dquot(struct dquot *dquot) | ||
408 | { | ||
409 | int err, err2, ex = 0; | ||
410 | struct ocfs2_mem_dqinfo *info = | ||
411 | sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; | ||
412 | |||
413 | err = ocfs2_qinfo_lock(info, 0); | ||
414 | if (err < 0) | ||
415 | goto out; | ||
416 | err = qtree_read_dquot(&info->dqi_gi, dquot); | ||
417 | if (err < 0) | ||
418 | goto out_qlock; | ||
419 | OCFS2_DQUOT(dquot)->dq_use_count++; | ||
420 | OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; | ||
421 | OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes; | ||
422 | if (!dquot->dq_off) { /* No real quota entry? */ | ||
423 | /* Upgrade to exclusive lock for allocation */ | ||
424 | err = ocfs2_qinfo_lock(info, 1); | ||
425 | if (err < 0) | ||
426 | goto out_qlock; | ||
427 | ex = 1; | ||
428 | } | ||
429 | err = qtree_write_dquot(&info->dqi_gi, dquot); | ||
430 | if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) { | ||
431 | err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type); | ||
432 | if (!err) | ||
433 | err = err2; | ||
434 | } | ||
435 | out_qlock: | ||
436 | if (ex) | ||
437 | ocfs2_qinfo_unlock(info, 1); | ||
438 | ocfs2_qinfo_unlock(info, 0); | ||
439 | out: | ||
440 | if (err < 0) | ||
441 | mlog_errno(err); | ||
442 | return err; | ||
443 | } | ||
444 | |||
445 | /* Sync local information about quota modifications with global quota file. | ||
446 | * Caller must have started the transaction and obtained exclusive lock for | ||
447 | * global quota file inode */ | ||
448 | int __ocfs2_sync_dquot(struct dquot *dquot, int freeing) | ||
449 | { | ||
450 | int err, err2; | ||
451 | struct super_block *sb = dquot->dq_sb; | ||
452 | int type = dquot->dq_type; | ||
453 | struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; | ||
454 | struct ocfs2_global_disk_dqblk dqblk; | ||
455 | s64 spacechange, inodechange; | ||
456 | time_t olditime, oldbtime; | ||
457 | |||
458 | err = sb->s_op->quota_read(sb, type, (char *)&dqblk, | ||
459 | sizeof(struct ocfs2_global_disk_dqblk), | ||
460 | dquot->dq_off); | ||
461 | if (err != sizeof(struct ocfs2_global_disk_dqblk)) { | ||
462 | if (err >= 0) { | ||
463 | mlog(ML_ERROR, "Short read from global quota file " | ||
464 | "(%u read)\n", err); | ||
465 | err = -EIO; | ||
466 | } | ||
467 | goto out; | ||
468 | } | ||
469 | |||
470 | 	/* Update space and inode usage. Also pick up other information from | ||
471 | 	 * the global quota file so that we don't overwrite any changes made | ||
472 | 	 * there by other cluster nodes. */ | ||
473 | spin_lock(&dq_data_lock); | ||
474 | spacechange = dquot->dq_dqb.dqb_curspace - | ||
475 | OCFS2_DQUOT(dquot)->dq_origspace; | ||
476 | inodechange = dquot->dq_dqb.dqb_curinodes - | ||
477 | OCFS2_DQUOT(dquot)->dq_originodes; | ||
478 | olditime = dquot->dq_dqb.dqb_itime; | ||
479 | oldbtime = dquot->dq_dqb.dqb_btime; | ||
480 | ocfs2_global_disk2memdqb(dquot, &dqblk); | ||
481 | mlog(0, "Syncing global dquot %u space %lld+%lld, inodes %lld+%lld\n", | ||
482 | dquot->dq_id, dquot->dq_dqb.dqb_curspace, (long long)spacechange, | ||
483 | dquot->dq_dqb.dqb_curinodes, (long long)inodechange); | ||
484 | if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags)) | ||
485 | dquot->dq_dqb.dqb_curspace += spacechange; | ||
486 | if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags)) | ||
487 | dquot->dq_dqb.dqb_curinodes += inodechange; | ||
488 | /* Set properly space grace time... */ | ||
489 | if (dquot->dq_dqb.dqb_bsoftlimit && | ||
490 | dquot->dq_dqb.dqb_curspace > dquot->dq_dqb.dqb_bsoftlimit) { | ||
491 | if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags) && | ||
492 | oldbtime > 0) { | ||
493 | if (dquot->dq_dqb.dqb_btime > 0) | ||
494 | dquot->dq_dqb.dqb_btime = | ||
495 | min(dquot->dq_dqb.dqb_btime, oldbtime); | ||
496 | else | ||
497 | dquot->dq_dqb.dqb_btime = oldbtime; | ||
498 | } | ||
499 | } else { | ||
500 | dquot->dq_dqb.dqb_btime = 0; | ||
501 | clear_bit(DQ_BLKS_B, &dquot->dq_flags); | ||
502 | } | ||
503 | /* Set properly inode grace time... */ | ||
504 | if (dquot->dq_dqb.dqb_isoftlimit && | ||
505 | dquot->dq_dqb.dqb_curinodes > dquot->dq_dqb.dqb_isoftlimit) { | ||
506 | if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags) && | ||
507 | olditime > 0) { | ||
508 | if (dquot->dq_dqb.dqb_itime > 0) | ||
509 | dquot->dq_dqb.dqb_itime = | ||
510 | min(dquot->dq_dqb.dqb_itime, olditime); | ||
511 | else | ||
512 | dquot->dq_dqb.dqb_itime = olditime; | ||
513 | } | ||
514 | } else { | ||
515 | dquot->dq_dqb.dqb_itime = 0; | ||
516 | clear_bit(DQ_INODES_B, &dquot->dq_flags); | ||
517 | } | ||
518 | /* All information is properly updated, clear the flags */ | ||
519 | __clear_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); | ||
520 | __clear_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); | ||
521 | __clear_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); | ||
522 | __clear_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); | ||
523 | __clear_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); | ||
524 | __clear_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); | ||
525 | OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; | ||
526 | OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes; | ||
527 | spin_unlock(&dq_data_lock); | ||
528 | err = ocfs2_qinfo_lock(info, freeing); | ||
529 | if (err < 0) { | ||
530 | 		mlog(ML_ERROR, "Failed to lock quota info, losing quota write" | ||
531 | " (type=%d, id=%u)\n", dquot->dq_type, | ||
532 | (unsigned)dquot->dq_id); | ||
533 | goto out; | ||
534 | } | ||
535 | if (freeing) | ||
536 | OCFS2_DQUOT(dquot)->dq_use_count--; | ||
537 | err = qtree_write_dquot(&info->dqi_gi, dquot); | ||
538 | if (err < 0) | ||
539 | goto out_qlock; | ||
540 | if (freeing && !OCFS2_DQUOT(dquot)->dq_use_count) { | ||
541 | err = qtree_release_dquot(&info->dqi_gi, dquot); | ||
542 | if (info_dirty(sb_dqinfo(sb, type))) { | ||
543 | err2 = __ocfs2_global_write_info(sb, type); | ||
544 | if (!err) | ||
545 | err = err2; | ||
546 | } | ||
547 | } | ||
548 | out_qlock: | ||
549 | ocfs2_qinfo_unlock(info, freeing); | ||
550 | out: | ||
551 | if (err < 0) | ||
552 | mlog_errno(err); | ||
553 | return err; | ||
554 | } | ||
555 | |||
556 | /* | ||
557 | * Functions for periodic syncing of dquots with global file | ||
558 | */ | ||
559 | static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type) | ||
560 | { | ||
561 | handle_t *handle; | ||
562 | struct super_block *sb = dquot->dq_sb; | ||
563 | struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; | ||
564 | struct ocfs2_super *osb = OCFS2_SB(sb); | ||
565 | int status = 0; | ||
566 | |||
567 | mlog_entry("id=%u qtype=%u type=%lu device=%s\n", dquot->dq_id, | ||
568 | dquot->dq_type, type, sb->s_id); | ||
569 | if (type != dquot->dq_type) | ||
570 | goto out; | ||
571 | status = ocfs2_lock_global_qf(oinfo, 1); | ||
572 | if (status < 0) | ||
573 | goto out; | ||
574 | |||
575 | handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS); | ||
576 | if (IS_ERR(handle)) { | ||
577 | status = PTR_ERR(handle); | ||
578 | mlog_errno(status); | ||
579 | goto out_ilock; | ||
580 | } | ||
581 | mutex_lock(&sb_dqopt(sb)->dqio_mutex); | ||
582 | status = ocfs2_sync_dquot(dquot); | ||
583 | mutex_unlock(&sb_dqopt(sb)->dqio_mutex); | ||
584 | if (status < 0) | ||
585 | mlog_errno(status); | ||
586 | /* We have to write local structure as well... */ | ||
587 | dquot_mark_dquot_dirty(dquot); | ||
588 | status = dquot_commit(dquot); | ||
589 | if (status < 0) | ||
590 | mlog_errno(status); | ||
591 | ocfs2_commit_trans(osb, handle); | ||
592 | out_ilock: | ||
593 | ocfs2_unlock_global_qf(oinfo, 1); | ||
594 | out: | ||
595 | mlog_exit(status); | ||
596 | return status; | ||
597 | } | ||
598 | |||
599 | static void qsync_work_fn(struct work_struct *work) | ||
600 | { | ||
601 | struct ocfs2_mem_dqinfo *oinfo = container_of(work, | ||
602 | struct ocfs2_mem_dqinfo, | ||
603 | dqi_sync_work.work); | ||
604 | struct super_block *sb = oinfo->dqi_gqinode->i_sb; | ||
605 | |||
606 | dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type); | ||
607 | queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, | ||
608 | oinfo->dqi_syncjiff); | ||
609 | } | ||
610 | |||
611 | /* | ||
612 | * Wrappers for generic quota functions | ||
613 | */ | ||
614 | |||
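| /* Commit the in-memory dquot to the local quota file inside a single | ||
|  * ocfs2 transaction; the actual write is done by the local quota | ||
|  * format's commit_dqblk operation called from dquot_commit(). */ | ||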
615 | static int ocfs2_write_dquot(struct dquot *dquot) | ||
616 | { | ||
617 | handle_t *handle; | ||
618 | struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); | ||
619 | int status = 0; | ||
620 | |||
621 | mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); | ||
622 | |||
623 | handle = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS); | ||
624 | if (IS_ERR(handle)) { | ||
625 | status = PTR_ERR(handle); | ||
626 | mlog_errno(status); | ||
627 | goto out; | ||
628 | } | ||
629 | status = dquot_commit(dquot); | ||
630 | ocfs2_commit_trans(osb, handle); | ||
631 | out: | ||
632 | mlog_exit(status); | ||
633 | return status; | ||
634 | } | ||
635 | |||
636 | int ocfs2_calc_qdel_credits(struct super_block *sb, int type) | ||
637 | { | ||
638 | struct ocfs2_mem_dqinfo *oinfo; | ||
639 | int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, | ||
640 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA }; | ||
641 | |||
642 | if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type])) | ||
643 | return 0; | ||
644 | |||
645 | oinfo = sb_dqinfo(sb, type)->dqi_priv; | ||
646 | /* We modify tree, leaf block, global info, local chunk header, | ||
647 | * global and local inode */ | ||
648 | return oinfo->dqi_gi.dqi_qtree_depth + 2 + 1 + | ||
649 | 2 * OCFS2_INODE_UPDATE_CREDITS; | ||
650 | } | ||
651 | |||
652 | static int ocfs2_release_dquot(struct dquot *dquot) | ||
653 | { | ||
654 | handle_t *handle; | ||
655 | struct ocfs2_mem_dqinfo *oinfo = | ||
656 | sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; | ||
657 | struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); | ||
658 | int status = 0; | ||
659 | |||
660 | mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); | ||
661 | |||
662 | status = ocfs2_lock_global_qf(oinfo, 1); | ||
663 | if (status < 0) | ||
664 | goto out; | ||
665 | handle = ocfs2_start_trans(osb, | ||
666 | ocfs2_calc_qdel_credits(dquot->dq_sb, dquot->dq_type)); | ||
667 | if (IS_ERR(handle)) { | ||
668 | status = PTR_ERR(handle); | ||
669 | mlog_errno(status); | ||
670 | goto out_ilock; | ||
671 | } | ||
672 | status = dquot_release(dquot); | ||
673 | ocfs2_commit_trans(osb, handle); | ||
674 | out_ilock: | ||
675 | ocfs2_unlock_global_qf(oinfo, 1); | ||
676 | out: | ||
677 | mlog_exit(status); | ||
678 | return status; | ||
679 | } | ||
680 | |||
681 | int ocfs2_calc_qinit_credits(struct super_block *sb, int type) | ||
682 | { | ||
683 | struct ocfs2_mem_dqinfo *oinfo; | ||
684 | int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, | ||
685 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA }; | ||
686 | struct ocfs2_dinode *lfe, *gfe; | ||
687 | |||
688 | if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type])) | ||
689 | return 0; | ||
690 | |||
691 | oinfo = sb_dqinfo(sb, type)->dqi_priv; | ||
692 | gfe = (struct ocfs2_dinode *)oinfo->dqi_gqi_bh->b_data; | ||
693 | lfe = (struct ocfs2_dinode *)oinfo->dqi_lqi_bh->b_data; | ||
694 | /* We can extend local file + global file. In local file we | ||
695 | * can modify info, chunk header block and dquot block. In | ||
696 | * global file we can modify info, tree and leaf block */ | ||
697 | return ocfs2_calc_extend_credits(sb, &lfe->id2.i_list, 0) + | ||
698 | ocfs2_calc_extend_credits(sb, &gfe->id2.i_list, 0) + | ||
699 | 3 + oinfo->dqi_gi.dqi_qtree_depth + 2; | ||
700 | } | ||
701 | |||
702 | static int ocfs2_acquire_dquot(struct dquot *dquot) | ||
703 | { | ||
704 | handle_t *handle; | ||
705 | struct ocfs2_mem_dqinfo *oinfo = | ||
706 | sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; | ||
707 | struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); | ||
708 | int status = 0; | ||
709 | |||
710 | mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); | ||
711 | /* We need an exclusive lock, because we're going to update use count | ||
712 | * and instantiate possibly new dquot structure */ | ||
713 | status = ocfs2_lock_global_qf(oinfo, 1); | ||
714 | if (status < 0) | ||
715 | goto out; | ||
716 | handle = ocfs2_start_trans(osb, | ||
717 | ocfs2_calc_qinit_credits(dquot->dq_sb, dquot->dq_type)); | ||
718 | if (IS_ERR(handle)) { | ||
719 | status = PTR_ERR(handle); | ||
720 | mlog_errno(status); | ||
721 | goto out_ilock; | ||
722 | } | ||
723 | status = dquot_acquire(dquot); | ||
724 | ocfs2_commit_trans(osb, handle); | ||
725 | out_ilock: | ||
726 | ocfs2_unlock_global_qf(oinfo, 1); | ||
727 | out: | ||
728 | mlog_exit(status); | ||
729 | return status; | ||
730 | } | ||
731 | |||
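| /* Mark the dquot dirty and decide how urgently it must reach disk: if | ||
|  * limits, usage or grace times were just set from userspace (any of | ||
|  * the DQ_LASTSET_B bits below), sync the dquot to the global quota | ||
|  * file immediately so other nodes see the change; otherwise only the | ||
|  * local quota file is written and the periodic sync worker propagates | ||
|  * the usage deltas later. */ | ||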
732 | static int ocfs2_mark_dquot_dirty(struct dquot *dquot) | ||
733 | { | ||
734 | unsigned long mask = (1 << (DQ_LASTSET_B + QIF_ILIMITS_B)) | | ||
735 | (1 << (DQ_LASTSET_B + QIF_BLIMITS_B)) | | ||
736 | (1 << (DQ_LASTSET_B + QIF_INODES_B)) | | ||
737 | (1 << (DQ_LASTSET_B + QIF_SPACE_B)) | | ||
738 | (1 << (DQ_LASTSET_B + QIF_BTIME_B)) | | ||
739 | (1 << (DQ_LASTSET_B + QIF_ITIME_B)); | ||
740 | int sync = 0; | ||
741 | int status; | ||
742 | struct super_block *sb = dquot->dq_sb; | ||
743 | int type = dquot->dq_type; | ||
744 | struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; | ||
745 | handle_t *handle; | ||
746 | struct ocfs2_super *osb = OCFS2_SB(sb); | ||
747 | |||
748 | mlog_entry("id=%u, type=%d", dquot->dq_id, type); | ||
749 | dquot_mark_dquot_dirty(dquot); | ||
750 | |||
751 | /* In case user set some limits, sync dquot immediately to global | ||
752 | * quota file so that information propagates quicker */ | ||
753 | spin_lock(&dq_data_lock); | ||
754 | if (dquot->dq_flags & mask) | ||
755 | sync = 1; | ||
756 | spin_unlock(&dq_data_lock); | ||
757 | if (!sync) { | ||
758 | status = ocfs2_write_dquot(dquot); | ||
759 | goto out; | ||
760 | } | ||
761 | status = ocfs2_lock_global_qf(oinfo, 1); | ||
762 | if (status < 0) | ||
763 | goto out; | ||
764 | handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS); | ||
765 | if (IS_ERR(handle)) { | ||
766 | status = PTR_ERR(handle); | ||
767 | mlog_errno(status); | ||
768 | goto out_ilock; | ||
769 | } | ||
770 | status = ocfs2_sync_dquot(dquot); | ||
771 | if (status < 0) { | ||
772 | mlog_errno(status); | ||
773 | goto out_trans; | ||
774 | } | ||
775 | /* Now write updated local dquot structure */ | ||
776 | status = dquot_commit(dquot); | ||
777 | out_trans: | ||
778 | ocfs2_commit_trans(osb, handle); | ||
779 | out_ilock: | ||
780 | ocfs2_unlock_global_qf(oinfo, 1); | ||
781 | out: | ||
782 | mlog_exit(status); | ||
783 | return status; | ||
784 | } | ||
785 | |||
786 | /* This should happen only after set_dqinfo(). */ | ||
787 | static int ocfs2_write_info(struct super_block *sb, int type) | ||
788 | { | ||
789 | handle_t *handle; | ||
790 | int status = 0; | ||
791 | struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; | ||
792 | |||
793 | mlog_entry_void(); | ||
794 | |||
795 | status = ocfs2_lock_global_qf(oinfo, 1); | ||
796 | if (status < 0) | ||
797 | goto out; | ||
798 | handle = ocfs2_start_trans(OCFS2_SB(sb), OCFS2_QINFO_WRITE_CREDITS); | ||
799 | if (IS_ERR(handle)) { | ||
800 | status = PTR_ERR(handle); | ||
801 | mlog_errno(status); | ||
802 | goto out_ilock; | ||
803 | } | ||
804 | status = dquot_commit_info(sb, type); | ||
805 | ocfs2_commit_trans(OCFS2_SB(sb), handle); | ||
806 | out_ilock: | ||
807 | ocfs2_unlock_global_qf(oinfo, 1); | ||
808 | out: | ||
809 | mlog_exit(status); | ||
810 | return status; | ||
811 | } | ||
812 | |||
813 | /* This is difficult. We have to lock quota inode and start transaction | ||
814 |  * in this function but we don't want to take the penalty of an exclusive | ||
815 |  * quota file lock when we are just going to use cached structures. So | ||
816 |  * we just take the read lock, check whether we have the dquot cached and, | ||
817 |  * if so, we don't have to take the write lock... */ | ||
818 | static int ocfs2_dquot_initialize(struct inode *inode, int type) | ||
819 | { | ||
820 | handle_t *handle = NULL; | ||
821 | int status = 0; | ||
822 | struct super_block *sb = inode->i_sb; | ||
823 | struct ocfs2_mem_dqinfo *oinfo; | ||
824 | int exclusive = 0; | ||
825 | int cnt; | ||
826 | qid_t id; | ||
827 | |||
828 | mlog_entry_void(); | ||
829 | |||
830 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) { | ||
831 | if (type != -1 && cnt != type) | ||
832 | continue; | ||
833 | if (!sb_has_quota_active(sb, cnt)) | ||
834 | continue; | ||
835 | oinfo = sb_dqinfo(sb, cnt)->dqi_priv; | ||
836 | status = ocfs2_lock_global_qf(oinfo, 0); | ||
837 | if (status < 0) | ||
838 | goto out; | ||
839 | 		/* This is just a performance optimization, not a reliable test. | ||
840 | 		 * Since we hold an inode lock, no one can actually release | ||
841 | * the structure until we are finished with initialization. */ | ||
842 | if (inode->i_dquot[cnt] != NODQUOT) { | ||
843 | ocfs2_unlock_global_qf(oinfo, 0); | ||
844 | continue; | ||
845 | } | ||
846 | /* When we have inode lock, we know that no dquot_release() can | ||
847 | * run and thus we can safely check whether we need to | ||
848 | * read+modify global file to get quota information or whether | ||
849 | * our node already has it. */ | ||
850 | if (cnt == USRQUOTA) | ||
851 | id = inode->i_uid; | ||
852 | else if (cnt == GRPQUOTA) | ||
853 | id = inode->i_gid; | ||
854 | else | ||
855 | BUG(); | ||
856 | /* Obtain exclusion from quota off... */ | ||
857 | down_write(&sb_dqopt(sb)->dqptr_sem); | ||
858 | exclusive = !dquot_is_cached(sb, id, cnt); | ||
859 | up_write(&sb_dqopt(sb)->dqptr_sem); | ||
860 | if (exclusive) { | ||
861 | status = ocfs2_lock_global_qf(oinfo, 1); | ||
862 | if (status < 0) { | ||
863 | exclusive = 0; | ||
864 | mlog_errno(status); | ||
865 | goto out_ilock; | ||
866 | } | ||
867 | handle = ocfs2_start_trans(OCFS2_SB(sb), | ||
868 | ocfs2_calc_qinit_credits(sb, cnt)); | ||
869 | if (IS_ERR(handle)) { | ||
870 | status = PTR_ERR(handle); | ||
871 | mlog_errno(status); | ||
872 | goto out_ilock; | ||
873 | } | ||
874 | } | ||
875 | dquot_initialize(inode, cnt); | ||
876 | if (exclusive) { | ||
877 | ocfs2_commit_trans(OCFS2_SB(sb), handle); | ||
878 | ocfs2_unlock_global_qf(oinfo, 1); | ||
879 | } | ||
880 | ocfs2_unlock_global_qf(oinfo, 0); | ||
881 | } | ||
882 | mlog_exit(0); | ||
883 | return 0; | ||
884 | out_ilock: | ||
885 | if (exclusive) | ||
886 | ocfs2_unlock_global_qf(oinfo, 1); | ||
887 | ocfs2_unlock_global_qf(oinfo, 0); | ||
888 | out: | ||
889 | mlog_exit(status); | ||
890 | return status; | ||
891 | } | ||
892 | |||
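| /* Slow path of dquot dropping, used when we might be the last holder | ||
|  * of a dquot: take the global quota file locks for all active quota | ||
|  * types and run dquot_drop() under a transaction sized for writing | ||
|  * both the user and group dquots back. */ | ||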
893 | static int ocfs2_dquot_drop_slow(struct inode *inode) | ||
894 | { | ||
895 | int status = 0; | ||
896 | int cnt; | ||
897 | int got_lock[MAXQUOTAS] = {0, 0}; | ||
898 | handle_t *handle; | ||
899 | struct super_block *sb = inode->i_sb; | ||
900 | struct ocfs2_mem_dqinfo *oinfo; | ||
901 | |||
902 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) { | ||
903 | if (!sb_has_quota_active(sb, cnt)) | ||
904 | continue; | ||
905 | oinfo = sb_dqinfo(sb, cnt)->dqi_priv; | ||
906 | status = ocfs2_lock_global_qf(oinfo, 1); | ||
907 | if (status < 0) | ||
908 | goto out; | ||
909 | got_lock[cnt] = 1; | ||
910 | } | ||
911 | handle = ocfs2_start_trans(OCFS2_SB(sb), | ||
912 | ocfs2_calc_qinit_credits(sb, USRQUOTA) + | ||
913 | ocfs2_calc_qinit_credits(sb, GRPQUOTA)); | ||
914 | if (IS_ERR(handle)) { | ||
915 | status = PTR_ERR(handle); | ||
916 | mlog_errno(status); | ||
917 | goto out; | ||
918 | } | ||
919 | dquot_drop(inode); | ||
920 | ocfs2_commit_trans(OCFS2_SB(sb), handle); | ||
921 | out: | ||
922 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) | ||
923 | if (got_lock[cnt]) { | ||
924 | oinfo = sb_dqinfo(sb, cnt)->dqi_priv; | ||
925 | ocfs2_unlock_global_qf(oinfo, 1); | ||
926 | } | ||
927 | return status; | ||
928 | } | ||
929 | |||
930 | /* See the comment before ocfs2_dquot_initialize. */ | ||
931 | static int ocfs2_dquot_drop(struct inode *inode) | ||
932 | { | ||
933 | int status = 0; | ||
934 | struct super_block *sb = inode->i_sb; | ||
935 | struct ocfs2_mem_dqinfo *oinfo; | ||
936 | int exclusive = 0; | ||
937 | int cnt; | ||
938 | int got_lock[MAXQUOTAS] = {0, 0}; | ||
939 | |||
940 | mlog_entry_void(); | ||
941 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) { | ||
942 | if (!sb_has_quota_active(sb, cnt)) | ||
943 | continue; | ||
944 | oinfo = sb_dqinfo(sb, cnt)->dqi_priv; | ||
945 | status = ocfs2_lock_global_qf(oinfo, 0); | ||
946 | if (status < 0) | ||
947 | goto out; | ||
948 | got_lock[cnt] = 1; | ||
949 | } | ||
950 | 	/* Lock against anyone releasing references so that when we check, | ||
951 | 	 * we know we are not going to be the last ones to release the dquot */ | ||
952 | down_write(&sb_dqopt(sb)->dqptr_sem); | ||
953 | /* Urgh, this is a terrible hack :( */ | ||
954 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) { | ||
955 | if (inode->i_dquot[cnt] != NODQUOT && | ||
956 | atomic_read(&inode->i_dquot[cnt]->dq_count) > 1) { | ||
957 | exclusive = 1; | ||
958 | break; | ||
959 | } | ||
960 | } | ||
961 | if (!exclusive) | ||
962 | dquot_drop_locked(inode); | ||
963 | up_write(&sb_dqopt(sb)->dqptr_sem); | ||
964 | out: | ||
965 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) | ||
966 | if (got_lock[cnt]) { | ||
967 | oinfo = sb_dqinfo(sb, cnt)->dqi_priv; | ||
968 | ocfs2_unlock_global_qf(oinfo, 0); | ||
969 | } | ||
970 | 	/* In case we bailed out because we had to do expensive locking, | ||
971 | * do it now... */ | ||
972 | if (exclusive) | ||
973 | status = ocfs2_dquot_drop_slow(inode); | ||
974 | mlog_exit(status); | ||
975 | return status; | ||
976 | } | ||
977 | |||
978 | static struct dquot *ocfs2_alloc_dquot(struct super_block *sb, int type) | ||
979 | { | ||
980 | struct ocfs2_dquot *dquot = | ||
981 | kmem_cache_zalloc(ocfs2_dquot_cachep, GFP_NOFS); | ||
982 | |||
983 | if (!dquot) | ||
984 | return NULL; | ||
985 | return &dquot->dq_dquot; | ||
986 | } | ||
987 | |||
988 | static void ocfs2_destroy_dquot(struct dquot *dquot) | ||
989 | { | ||
990 | kmem_cache_free(ocfs2_dquot_cachep, dquot); | ||
991 | } | ||
992 | |||
993 | struct dquot_operations ocfs2_quota_operations = { | ||
994 | .initialize = ocfs2_dquot_initialize, | ||
995 | .drop = ocfs2_dquot_drop, | ||
996 | .alloc_space = dquot_alloc_space, | ||
997 | .alloc_inode = dquot_alloc_inode, | ||
998 | .free_space = dquot_free_space, | ||
999 | .free_inode = dquot_free_inode, | ||
1000 | .transfer = dquot_transfer, | ||
1001 | .write_dquot = ocfs2_write_dquot, | ||
1002 | .acquire_dquot = ocfs2_acquire_dquot, | ||
1003 | .release_dquot = ocfs2_release_dquot, | ||
1004 | .mark_dirty = ocfs2_mark_dquot_dirty, | ||
1005 | .write_info = ocfs2_write_info, | ||
1006 | .alloc_dquot = ocfs2_alloc_dquot, | ||
1007 | .destroy_dquot = ocfs2_destroy_dquot, | ||
1008 | }; | ||
1009 | |||
1010 | int ocfs2_quota_setup(void) | ||
1011 | { | ||
1012 | ocfs2_quota_wq = create_workqueue("o2quot"); | ||
1013 | if (!ocfs2_quota_wq) | ||
1014 | return -ENOMEM; | ||
1015 | return 0; | ||
1016 | } | ||
1017 | |||
1018 | void ocfs2_quota_shutdown(void) | ||
1019 | { | ||
1020 | if (ocfs2_quota_wq) { | ||
1021 | flush_workqueue(ocfs2_quota_wq); | ||
1022 | destroy_workqueue(ocfs2_quota_wq); | ||
1023 | ocfs2_quota_wq = NULL; | ||
1024 | } | ||
1025 | } | ||
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c new file mode 100644 index 000000000000..07deec5e9721 --- /dev/null +++ b/fs/ocfs2/quota_local.c | |||
@@ -0,0 +1,1253 @@ | |||
1 | /* | ||
2 | * Implementation of operations over local quota file | ||
3 | */ | ||
4 | |||
5 | #include <linux/fs.h> | ||
6 | #include <linux/quota.h> | ||
7 | #include <linux/quotaops.h> | ||
8 | #include <linux/module.h> | ||
9 | |||
10 | #define MLOG_MASK_PREFIX ML_QUOTA | ||
11 | #include <cluster/masklog.h> | ||
12 | |||
13 | #include "ocfs2_fs.h" | ||
14 | #include "ocfs2.h" | ||
15 | #include "inode.h" | ||
16 | #include "alloc.h" | ||
17 | #include "file.h" | ||
18 | #include "buffer_head_io.h" | ||
19 | #include "journal.h" | ||
20 | #include "sysfile.h" | ||
21 | #include "dlmglue.h" | ||
22 | #include "quota.h" | ||
23 | |||
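| /* | ||
|  * A local quota file starts with one block holding the local dqinfo | ||
|  * header, followed by chunks. Every chunk consists of one header block | ||
|  * carrying an ocfs2_local_disk_chunk (with a bitmap of used entries) | ||
|  * and ol_chunk_blocks(sb) blocks densely packed with | ||
|  * ocfs2_local_disk_dqblk entries. The helpers below translate between | ||
|  * (chunk, entry offset) pairs and file block / byte offsets. | ||
|  */ | ||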
24 | /* Number of local quota structures per block */ | ||
25 | static inline unsigned int ol_quota_entries_per_block(struct super_block *sb) | ||
26 | { | ||
27 | return ((sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) / | ||
28 | sizeof(struct ocfs2_local_disk_dqblk)); | ||
29 | } | ||
30 | |||
31 | /* Number of blocks with entries in one chunk */ | ||
32 | static inline unsigned int ol_chunk_blocks(struct super_block *sb) | ||
33 | { | ||
34 | return ((sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - | ||
35 | OCFS2_QBLK_RESERVED_SPACE) << 3) / | ||
36 | ol_quota_entries_per_block(sb); | ||
37 | } | ||
38 | |||
39 | /* Number of entries in a chunk bitmap */ | ||
40 | static unsigned int ol_chunk_entries(struct super_block *sb) | ||
41 | { | ||
42 | return ol_chunk_blocks(sb) * ol_quota_entries_per_block(sb); | ||
43 | } | ||
44 | |||
45 | /* Offset of the chunk in quota file */ | ||
46 | static unsigned int ol_quota_chunk_block(struct super_block *sb, int c) | ||
47 | { | ||
48 | /* 1 block for local quota file info, 1 block per chunk for chunk info */ | ||
49 | return 1 + (ol_chunk_blocks(sb) + 1) * c; | ||
50 | } | ||
51 | |||
52 | static unsigned int ol_dqblk_block(struct super_block *sb, int c, int off) | ||
53 | { | ||
54 | int epb = ol_quota_entries_per_block(sb); | ||
55 | |||
56 | return ol_quota_chunk_block(sb, c) + 1 + off / epb; | ||
57 | } | ||
58 | |||
59 | static unsigned int ol_dqblk_block_off(struct super_block *sb, int c, int off) | ||
60 | { | ||
61 | int epb = ol_quota_entries_per_block(sb); | ||
62 | |||
63 | return (off % epb) * sizeof(struct ocfs2_local_disk_dqblk); | ||
64 | } | ||
65 | |||
66 | /* Offset of the dquot structure in the quota file */ | ||
67 | static loff_t ol_dqblk_off(struct super_block *sb, int c, int off) | ||
68 | { | ||
69 | return (ol_dqblk_block(sb, c, off) << sb->s_blocksize_bits) + | ||
70 | ol_dqblk_block_off(sb, c, off); | ||
71 | } | ||
72 | |||
73 | /* Compute block number from given offset */ | ||
74 | static inline unsigned int ol_dqblk_file_block(struct super_block *sb, loff_t off) | ||
75 | { | ||
76 | return off >> sb->s_blocksize_bits; | ||
77 | } | ||
78 | |||
79 | static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off) | ||
80 | { | ||
81 | return off & ((1 << sb->s_blocksize_bits) - 1); | ||
82 | } | ||
83 | |||
84 | /* Compute offset in the chunk of a structure with the given offset */ | ||
85 | static int ol_dqblk_chunk_off(struct super_block *sb, int c, loff_t off) | ||
86 | { | ||
87 | int epb = ol_quota_entries_per_block(sb); | ||
88 | |||
89 | return ((off >> sb->s_blocksize_bits) - | ||
90 | ol_quota_chunk_block(sb, c) - 1) * epb | ||
91 | + ((unsigned int)(off & ((1 << sb->s_blocksize_bits) - 1))) / | ||
92 | sizeof(struct ocfs2_local_disk_dqblk); | ||
93 | } | ||
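| /* | ||
|  * Worked example of the arithmetic above (the concrete sizes are | ||
|  * illustrative assumptions, not values taken from ocfs2_fs.h): with a | ||
|  * 4096-byte block, 64 reserved bytes per block and a 40-byte | ||
|  * ocfs2_local_disk_dqblk, ol_quota_entries_per_block() yields | ||
|  * (4096 - 64) / 40 = 100 entries per block, so entry 205 of chunk c | ||
|  * sits in the third data block of that chunk, file block | ||
|  * ol_quota_chunk_block(sb, c) + 1 + 205 / 100, at byte offset | ||
|  * (205 % 100) * 40 = 200 within that block. | ||
|  */ | ||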
94 | |||
95 | /* Write bufferhead into the fs */ | ||
96 | static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh, | ||
97 | void (*modify)(struct buffer_head *, void *), void *private) | ||
98 | { | ||
99 | struct super_block *sb = inode->i_sb; | ||
100 | handle_t *handle; | ||
101 | int status; | ||
102 | |||
103 | handle = ocfs2_start_trans(OCFS2_SB(sb), 1); | ||
104 | if (IS_ERR(handle)) { | ||
105 | status = PTR_ERR(handle); | ||
106 | mlog_errno(status); | ||
107 | return status; | ||
108 | } | ||
109 | status = ocfs2_journal_access_dq(handle, inode, bh, | ||
110 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
111 | if (status < 0) { | ||
112 | mlog_errno(status); | ||
113 | ocfs2_commit_trans(OCFS2_SB(sb), handle); | ||
114 | return status; | ||
115 | } | ||
116 | lock_buffer(bh); | ||
117 | modify(bh, private); | ||
118 | unlock_buffer(bh); | ||
119 | status = ocfs2_journal_dirty(handle, bh); | ||
120 | if (status < 0) { | ||
121 | mlog_errno(status); | ||
122 | ocfs2_commit_trans(OCFS2_SB(sb), handle); | ||
123 | return status; | ||
124 | } | ||
125 | status = ocfs2_commit_trans(OCFS2_SB(sb), handle); | ||
126 | if (status < 0) { | ||
127 | mlog_errno(status); | ||
128 | return status; | ||
129 | } | ||
130 | return 0; | ||
131 | } | ||
132 | |||
133 | /* Check whether we understand format of quota files */ | ||
134 | static int ocfs2_local_check_quota_file(struct super_block *sb, int type) | ||
135 | { | ||
136 | unsigned int lmagics[MAXQUOTAS] = OCFS2_LOCAL_QMAGICS; | ||
137 | unsigned int lversions[MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS; | ||
138 | unsigned int gmagics[MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS; | ||
139 | unsigned int gversions[MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS; | ||
140 | unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, | ||
141 | GROUP_QUOTA_SYSTEM_INODE }; | ||
142 | struct buffer_head *bh = NULL; | ||
143 | struct inode *linode = sb_dqopt(sb)->files[type]; | ||
144 | struct inode *ginode = NULL; | ||
145 | struct ocfs2_disk_dqheader *dqhead; | ||
146 | int status, ret = 0; | ||
147 | |||
148 | /* First check whether we understand local quota file */ | ||
149 | status = ocfs2_read_quota_block(linode, 0, &bh); | ||
150 | if (status) { | ||
151 | mlog_errno(status); | ||
152 | mlog(ML_ERROR, "failed to read quota file header (type=%d)\n", | ||
153 | type); | ||
154 | goto out_err; | ||
155 | } | ||
156 | dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data); | ||
157 | if (le32_to_cpu(dqhead->dqh_magic) != lmagics[type]) { | ||
158 | mlog(ML_ERROR, "quota file magic does not match (%u != %u)," | ||
159 | " type=%d\n", le32_to_cpu(dqhead->dqh_magic), | ||
160 | lmagics[type], type); | ||
161 | goto out_err; | ||
162 | } | ||
163 | if (le32_to_cpu(dqhead->dqh_version) != lversions[type]) { | ||
164 | mlog(ML_ERROR, "quota file version does not match (%u != %u)," | ||
165 | " type=%d\n", le32_to_cpu(dqhead->dqh_version), | ||
166 | lversions[type], type); | ||
167 | goto out_err; | ||
168 | } | ||
169 | brelse(bh); | ||
170 | bh = NULL; | ||
171 | |||
172 | /* Next check whether we understand global quota file */ | ||
173 | ginode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type], | ||
174 | OCFS2_INVALID_SLOT); | ||
175 | if (!ginode) { | ||
176 | mlog(ML_ERROR, "cannot get global quota file inode " | ||
177 | "(type=%d)\n", type); | ||
178 | goto out_err; | ||
179 | } | ||
180 | /* Since the header is read only, we don't care about locking */ | ||
181 | status = ocfs2_read_quota_block(ginode, 0, &bh); | ||
182 | if (status) { | ||
183 | mlog_errno(status); | ||
184 | mlog(ML_ERROR, "failed to read global quota file header " | ||
185 | "(type=%d)\n", type); | ||
186 | goto out_err; | ||
187 | } | ||
188 | dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data); | ||
189 | if (le32_to_cpu(dqhead->dqh_magic) != gmagics[type]) { | ||
190 | mlog(ML_ERROR, "global quota file magic does not match " | ||
191 | "(%u != %u), type=%d\n", | ||
192 | le32_to_cpu(dqhead->dqh_magic), gmagics[type], type); | ||
193 | goto out_err; | ||
194 | } | ||
195 | if (le32_to_cpu(dqhead->dqh_version) != gversions[type]) { | ||
196 | mlog(ML_ERROR, "global quota file version does not match " | ||
197 | "(%u != %u), type=%d\n", | ||
198 | le32_to_cpu(dqhead->dqh_version), gversions[type], | ||
199 | type); | ||
200 | goto out_err; | ||
201 | } | ||
202 | |||
203 | ret = 1; | ||
204 | out_err: | ||
205 | brelse(bh); | ||
206 | iput(ginode); | ||
207 | return ret; | ||
208 | } | ||
209 | |||
210 | /* Release given list of quota file chunks */ | ||
211 | static void ocfs2_release_local_quota_bitmaps(struct list_head *head) | ||
212 | { | ||
213 | struct ocfs2_quota_chunk *pos, *next; | ||
214 | |||
215 | list_for_each_entry_safe(pos, next, head, qc_chunk) { | ||
216 | list_del(&pos->qc_chunk); | ||
217 | brelse(pos->qc_headerbh); | ||
218 | kmem_cache_free(ocfs2_qf_chunk_cachep, pos); | ||
219 | } | ||
220 | } | ||
221 | |||
222 | /* Load quota bitmaps into memory */ | ||
223 | static int ocfs2_load_local_quota_bitmaps(struct inode *inode, | ||
224 | struct ocfs2_local_disk_dqinfo *ldinfo, | ||
225 | struct list_head *head) | ||
226 | { | ||
227 | struct ocfs2_quota_chunk *newchunk; | ||
228 | int i, status; | ||
229 | |||
230 | INIT_LIST_HEAD(head); | ||
231 | for (i = 0; i < le32_to_cpu(ldinfo->dqi_chunks); i++) { | ||
232 | newchunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS); | ||
233 | if (!newchunk) { | ||
234 | ocfs2_release_local_quota_bitmaps(head); | ||
235 | return -ENOMEM; | ||
236 | } | ||
237 | newchunk->qc_num = i; | ||
238 | newchunk->qc_headerbh = NULL; | ||
239 | status = ocfs2_read_quota_block(inode, | ||
240 | ol_quota_chunk_block(inode->i_sb, i), | ||
241 | &newchunk->qc_headerbh); | ||
242 | if (status) { | ||
243 | mlog_errno(status); | ||
244 | kmem_cache_free(ocfs2_qf_chunk_cachep, newchunk); | ||
245 | ocfs2_release_local_quota_bitmaps(head); | ||
246 | return status; | ||
247 | } | ||
248 | list_add_tail(&newchunk->qc_chunk, head); | ||
249 | } | ||
250 | return 0; | ||
251 | } | ||
252 | |||
253 | static void olq_update_info(struct buffer_head *bh, void *private) | ||
254 | { | ||
255 | struct mem_dqinfo *info = private; | ||
256 | struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; | ||
257 | struct ocfs2_local_disk_dqinfo *ldinfo; | ||
258 | |||
259 | ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + | ||
260 | OCFS2_LOCAL_INFO_OFF); | ||
261 | spin_lock(&dq_data_lock); | ||
262 | ldinfo->dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK); | ||
263 | ldinfo->dqi_chunks = cpu_to_le32(oinfo->dqi_chunks); | ||
264 | ldinfo->dqi_blocks = cpu_to_le32(oinfo->dqi_blocks); | ||
265 | spin_unlock(&dq_data_lock); | ||
266 | } | ||
267 | |||
268 | static int ocfs2_add_recovery_chunk(struct super_block *sb, | ||
269 | struct ocfs2_local_disk_chunk *dchunk, | ||
270 | int chunk, | ||
271 | struct list_head *head) | ||
272 | { | ||
273 | struct ocfs2_recovery_chunk *rc; | ||
274 | |||
275 | rc = kmalloc(sizeof(struct ocfs2_recovery_chunk), GFP_NOFS); | ||
276 | if (!rc) | ||
277 | return -ENOMEM; | ||
278 | rc->rc_chunk = chunk; | ||
279 | rc->rc_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS); | ||
280 | if (!rc->rc_bitmap) { | ||
281 | kfree(rc); | ||
282 | return -ENOMEM; | ||
283 | } | ||
284 | memcpy(rc->rc_bitmap, dchunk->dqc_bitmap, | ||
285 | (ol_chunk_entries(sb) + 7) >> 3); | ||
286 | list_add_tail(&rc->rc_list, head); | ||
287 | return 0; | ||
288 | } | ||
289 | |||
290 | static void free_recovery_list(struct list_head *head) | ||
291 | { | ||
292 | struct ocfs2_recovery_chunk *next; | ||
293 | struct ocfs2_recovery_chunk *rchunk; | ||
294 | |||
295 | list_for_each_entry_safe(rchunk, next, head, rc_list) { | ||
296 | list_del(&rchunk->rc_list); | ||
297 | kfree(rchunk->rc_bitmap); | ||
298 | kfree(rchunk); | ||
299 | } | ||
300 | } | ||
301 | |||
302 | void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec) | ||
303 | { | ||
304 | int type; | ||
305 | |||
306 | for (type = 0; type < MAXQUOTAS; type++) | ||
307 | free_recovery_list(&(rec->r_list[type])); | ||
308 | kfree(rec); | ||
309 | } | ||
310 | |||
311 | /* Load the entries from our quota file that we have to recover */ | ||
312 | static int ocfs2_recovery_load_quota(struct inode *lqinode, | ||
313 | struct ocfs2_local_disk_dqinfo *ldinfo, | ||
314 | int type, | ||
315 | struct list_head *head) | ||
316 | { | ||
317 | struct super_block *sb = lqinode->i_sb; | ||
318 | struct buffer_head *hbh; | ||
319 | struct ocfs2_local_disk_chunk *dchunk; | ||
320 | int i, chunks = le32_to_cpu(ldinfo->dqi_chunks); | ||
321 | int status = 0; | ||
322 | |||
323 | for (i = 0; i < chunks; i++) { | ||
324 | hbh = NULL; | ||
325 | status = ocfs2_read_quota_block(lqinode, | ||
326 | ol_quota_chunk_block(sb, i), | ||
327 | &hbh); | ||
328 | if (status) { | ||
329 | mlog_errno(status); | ||
330 | break; | ||
331 | } | ||
332 | dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data; | ||
333 | if (le32_to_cpu(dchunk->dqc_free) < ol_chunk_entries(sb)) | ||
334 | status = ocfs2_add_recovery_chunk(sb, dchunk, i, head); | ||
335 | brelse(hbh); | ||
336 | if (status < 0) | ||
337 | break; | ||
338 | } | ||
339 | if (status < 0) | ||
340 | free_recovery_list(head); | ||
341 | return status; | ||
342 | } | ||
343 | |||
344 | static struct ocfs2_quota_recovery *ocfs2_alloc_quota_recovery(void) | ||
345 | { | ||
346 | int type; | ||
347 | struct ocfs2_quota_recovery *rec; | ||
348 | |||
349 | rec = kmalloc(sizeof(struct ocfs2_quota_recovery), GFP_NOFS); | ||
350 | if (!rec) | ||
351 | return NULL; | ||
352 | for (type = 0; type < MAXQUOTAS; type++) | ||
353 | INIT_LIST_HEAD(&(rec->r_list[type])); | ||
354 | return rec; | ||
355 | } | ||
356 | |||
357 | /* Load information we need for quota recovery into memory */ | ||
358 | struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery( | ||
359 | struct ocfs2_super *osb, | ||
360 | int slot_num) | ||
361 | { | ||
362 | unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, | ||
363 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; | ||
364 | unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, | ||
365 | LOCAL_GROUP_QUOTA_SYSTEM_INODE }; | ||
366 | struct super_block *sb = osb->sb; | ||
367 | struct ocfs2_local_disk_dqinfo *ldinfo; | ||
368 | struct inode *lqinode; | ||
369 | struct buffer_head *bh; | ||
370 | int type; | ||
371 | int status = 0; | ||
372 | struct ocfs2_quota_recovery *rec; | ||
373 | |||
374 | mlog(ML_NOTICE, "Beginning quota recovery in slot %u\n", slot_num); | ||
375 | rec = ocfs2_alloc_quota_recovery(); | ||
376 | if (!rec) | ||
377 | return ERR_PTR(-ENOMEM); | ||
378 | /* First init... */ | ||
379 | |||
380 | for (type = 0; type < MAXQUOTAS; type++) { | ||
381 | if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) | ||
382 | continue; | ||
383 | /* At this point, journal of the slot is already replayed so | ||
384 | * we can trust metadata and data of the quota file */ | ||
385 | lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num); | ||
386 | if (!lqinode) { | ||
387 | status = -ENOENT; | ||
388 | goto out; | ||
389 | } | ||
390 | status = ocfs2_inode_lock_full(lqinode, NULL, 1, | ||
391 | OCFS2_META_LOCK_RECOVERY); | ||
392 | if (status < 0) { | ||
393 | mlog_errno(status); | ||
394 | goto out_put; | ||
395 | } | ||
396 | /* Now read local header */ | ||
397 | bh = NULL; | ||
398 | status = ocfs2_read_quota_block(lqinode, 0, &bh); | ||
399 | if (status) { | ||
400 | mlog_errno(status); | ||
401 | mlog(ML_ERROR, "failed to read quota file info header " | ||
402 | "(slot=%d type=%d)\n", slot_num, type); | ||
403 | goto out_lock; | ||
404 | } | ||
405 | ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + | ||
406 | OCFS2_LOCAL_INFO_OFF); | ||
407 | status = ocfs2_recovery_load_quota(lqinode, ldinfo, type, | ||
408 | &rec->r_list[type]); | ||
409 | brelse(bh); | ||
410 | out_lock: | ||
411 | ocfs2_inode_unlock(lqinode, 1); | ||
412 | out_put: | ||
413 | iput(lqinode); | ||
414 | if (status < 0) | ||
415 | break; | ||
416 | } | ||
417 | out: | ||
418 | if (status < 0) { | ||
419 | ocfs2_free_quota_recovery(rec); | ||
420 | rec = ERR_PTR(status); | ||
421 | } | ||
422 | return rec; | ||
423 | } | ||
424 | |||
425 | /* Sync changes in local quota file into global quota file and | ||
426 | * reinitialize local quota file. | ||
427 | * The function expects local quota file to be already locked and | ||
428 | * dqonoff_mutex locked. */ | ||
429 | static int ocfs2_recover_local_quota_file(struct inode *lqinode, | ||
430 | int type, | ||
431 | struct ocfs2_quota_recovery *rec) | ||
432 | { | ||
433 | struct super_block *sb = lqinode->i_sb; | ||
434 | struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; | ||
435 | struct ocfs2_local_disk_chunk *dchunk; | ||
436 | struct ocfs2_local_disk_dqblk *dqblk; | ||
437 | struct dquot *dquot; | ||
438 | handle_t *handle; | ||
439 | struct buffer_head *hbh = NULL, *qbh = NULL; | ||
440 | int status = 0; | ||
441 | int bit, chunk; | ||
442 | struct ocfs2_recovery_chunk *rchunk, *next; | ||
443 | qsize_t spacechange, inodechange; | ||
444 | |||
445 | mlog_entry("ino=%lu type=%u", (unsigned long)lqinode->i_ino, type); | ||
446 | |||
447 | status = ocfs2_lock_global_qf(oinfo, 1); | ||
448 | if (status < 0) | ||
449 | goto out; | ||
450 | |||
451 | list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) { | ||
452 | chunk = rchunk->rc_chunk; | ||
453 | hbh = NULL; | ||
454 | status = ocfs2_read_quota_block(lqinode, | ||
455 | ol_quota_chunk_block(sb, chunk), | ||
456 | &hbh); | ||
457 | if (status) { | ||
458 | mlog_errno(status); | ||
459 | break; | ||
460 | } | ||
461 | dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data; | ||
462 | for_each_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) { | ||
463 | qbh = NULL; | ||
464 | status = ocfs2_read_quota_block(lqinode, | ||
465 | ol_dqblk_block(sb, chunk, bit), | ||
466 | &qbh); | ||
467 | if (status) { | ||
468 | mlog_errno(status); | ||
469 | break; | ||
470 | } | ||
471 | dqblk = (struct ocfs2_local_disk_dqblk *)(qbh->b_data + | ||
472 | ol_dqblk_block_off(sb, chunk, bit)); | ||
473 | dquot = dqget(sb, le64_to_cpu(dqblk->dqb_id), type); | ||
474 | if (!dquot) { | ||
475 | status = -EIO; | ||
476 | mlog(ML_ERROR, "Failed to get quota structure " | ||
477 | "for id %u, type %d. Cannot finish quota " | ||
478 | "file recovery.\n", | ||
479 | (unsigned)le64_to_cpu(dqblk->dqb_id), | ||
480 | type); | ||
481 | goto out_put_bh; | ||
482 | } | ||
483 | handle = ocfs2_start_trans(OCFS2_SB(sb), | ||
484 | OCFS2_QSYNC_CREDITS); | ||
485 | if (IS_ERR(handle)) { | ||
486 | status = PTR_ERR(handle); | ||
487 | mlog_errno(status); | ||
488 | goto out_put_dquot; | ||
489 | } | ||
490 | mutex_lock(&sb_dqopt(sb)->dqio_mutex); | ||
491 | spin_lock(&dq_data_lock); | ||
492 | /* Add usage from quota entry into quota changes | ||
493 | * of our node. Auxiliary variables are important | ||
494 | * due to signedness */ | ||
495 | spacechange = le64_to_cpu(dqblk->dqb_spacemod); | ||
496 | inodechange = le64_to_cpu(dqblk->dqb_inodemod); | ||
497 | dquot->dq_dqb.dqb_curspace += spacechange; | ||
498 | dquot->dq_dqb.dqb_curinodes += inodechange; | ||
499 | spin_unlock(&dq_data_lock); | ||
500 | /* We want to drop reference held by the crashed | ||
501 | * node. Since we have our own reference we know | ||
502 | * global structure actually won't be freed. */ | ||
503 | status = ocfs2_global_release_dquot(dquot); | ||
504 | if (status < 0) { | ||
505 | mlog_errno(status); | ||
506 | goto out_commit; | ||
507 | } | ||
508 | /* Release local quota file entry */ | ||
509 | status = ocfs2_journal_access_dq(handle, lqinode, | ||
510 | qbh, OCFS2_JOURNAL_ACCESS_WRITE); | ||
511 | if (status < 0) { | ||
512 | mlog_errno(status); | ||
513 | goto out_commit; | ||
514 | } | ||
515 | lock_buffer(qbh); | ||
516 | WARN_ON(!ocfs2_test_bit(bit, dchunk->dqc_bitmap)); | ||
517 | ocfs2_clear_bit(bit, dchunk->dqc_bitmap); | ||
518 | le32_add_cpu(&dchunk->dqc_free, 1); | ||
519 | unlock_buffer(qbh); | ||
520 | status = ocfs2_journal_dirty(handle, qbh); | ||
521 | if (status < 0) | ||
522 | mlog_errno(status); | ||
523 | out_commit: | ||
524 | mutex_unlock(&sb_dqopt(sb)->dqio_mutex); | ||
525 | ocfs2_commit_trans(OCFS2_SB(sb), handle); | ||
526 | out_put_dquot: | ||
527 | dqput(dquot); | ||
528 | out_put_bh: | ||
529 | brelse(qbh); | ||
530 | if (status < 0) | ||
531 | break; | ||
532 | } | ||
533 | brelse(hbh); | ||
534 | list_del(&rchunk->rc_list); | ||
535 | kfree(rchunk->rc_bitmap); | ||
536 | kfree(rchunk); | ||
537 | if (status < 0) | ||
538 | break; | ||
539 | } | ||
540 | ocfs2_unlock_global_qf(oinfo, 1); | ||
541 | out: | ||
542 | if (status < 0) | ||
543 | free_recovery_list(&(rec->r_list[type])); | ||
544 | mlog_exit(status); | ||
545 | return status; | ||
546 | } | ||
547 | |||
548 | /* Recover local quota files for given node different from us */ | ||
549 | int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, | ||
550 | struct ocfs2_quota_recovery *rec, | ||
551 | int slot_num) | ||
552 | { | ||
553 | unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, | ||
554 | LOCAL_GROUP_QUOTA_SYSTEM_INODE }; | ||
555 | struct super_block *sb = osb->sb; | ||
556 | struct ocfs2_local_disk_dqinfo *ldinfo; | ||
557 | struct buffer_head *bh; | ||
558 | handle_t *handle; | ||
559 | int type; | ||
560 | int status = 0; | ||
561 | struct inode *lqinode; | ||
562 | unsigned int flags; | ||
563 | |||
564 | mlog(ML_NOTICE, "Finishing quota recovery in slot %u\n", slot_num); | ||
565 | mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); | ||
566 | for (type = 0; type < MAXQUOTAS; type++) { | ||
567 | if (list_empty(&(rec->r_list[type]))) | ||
568 | continue; | ||
569 | mlog(0, "Recovering quota in slot %d\n", slot_num); | ||
570 | lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num); | ||
571 | if (!lqinode) { | ||
572 | status = -ENOENT; | ||
573 | goto out; | ||
574 | } | ||
575 | status = ocfs2_inode_lock_full(lqinode, NULL, 1, | ||
576 | OCFS2_META_LOCK_NOQUEUE); | ||
577 | /* Someone else is holding the lock? Then he must be | ||
578 | * doing the recovery. Just skip the file... */ | ||
579 | if (status == -EAGAIN) { | ||
580 | mlog(ML_NOTICE, "skipping quota recovery for slot %d " | ||
581 | "because quota file is locked.\n", slot_num); | ||
582 | status = 0; | ||
583 | goto out_put; | ||
584 | } else if (status < 0) { | ||
585 | mlog_errno(status); | ||
586 | goto out_put; | ||
587 | } | ||
588 | /* Now read local header */ | ||
589 | bh = NULL; | ||
590 | status = ocfs2_read_quota_block(lqinode, 0, &bh); | ||
591 | if (status) { | ||
592 | mlog_errno(status); | ||
593 | mlog(ML_ERROR, "failed to read quota file info header " | ||
594 | "(slot=%d type=%d)\n", slot_num, type); | ||
595 | goto out_lock; | ||
596 | } | ||
597 | ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + | ||
598 | OCFS2_LOCAL_INFO_OFF); | ||
599 | /* Is recovery still needed? */ | ||
600 | flags = le32_to_cpu(ldinfo->dqi_flags); | ||
601 | if (!(flags & OLQF_CLEAN)) | ||
602 | status = ocfs2_recover_local_quota_file(lqinode, | ||
603 | type, | ||
604 | rec); | ||
605 | /* We don't want to mark file as clean when it is actually | ||
606 | * active */ | ||
607 | if (slot_num == osb->slot_num) | ||
608 | goto out_bh; | ||
609 | /* Mark quota file as clean if we are recovering quota file of | ||
610 | * some other node. */ | ||
611 | handle = ocfs2_start_trans(osb, 1); | ||
612 | if (IS_ERR(handle)) { | ||
613 | status = PTR_ERR(handle); | ||
614 | mlog_errno(status); | ||
615 | goto out_bh; | ||
616 | } | ||
617 | status = ocfs2_journal_access_dq(handle, lqinode, bh, | ||
618 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
619 | if (status < 0) { | ||
620 | mlog_errno(status); | ||
621 | goto out_trans; | ||
622 | } | ||
623 | lock_buffer(bh); | ||
624 | ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN); | ||
625 | unlock_buffer(bh); | ||
626 | status = ocfs2_journal_dirty(handle, bh); | ||
627 | if (status < 0) | ||
628 | mlog_errno(status); | ||
629 | out_trans: | ||
630 | ocfs2_commit_trans(osb, handle); | ||
631 | out_bh: | ||
632 | brelse(bh); | ||
633 | out_lock: | ||
634 | ocfs2_inode_unlock(lqinode, 1); | ||
635 | out_put: | ||
636 | iput(lqinode); | ||
637 | if (status < 0) | ||
638 | break; | ||
639 | } | ||
640 | out: | ||
641 | mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); | ||
642 | kfree(rec); | ||
643 | return status; | ||
644 | } | ||
645 | |||
646 | /* Read information header from quota file */ | ||
647 | static int ocfs2_local_read_info(struct super_block *sb, int type) | ||
648 | { | ||
649 | struct ocfs2_local_disk_dqinfo *ldinfo; | ||
650 | struct mem_dqinfo *info = sb_dqinfo(sb, type); | ||
651 | struct ocfs2_mem_dqinfo *oinfo; | ||
652 | struct inode *lqinode = sb_dqopt(sb)->files[type]; | ||
653 | int status; | ||
654 | struct buffer_head *bh = NULL; | ||
655 | struct ocfs2_quota_recovery *rec; | ||
656 | int locked = 0; | ||
657 | |||
658 | info->dqi_maxblimit = 0x7fffffffffffffffLL; | ||
659 | info->dqi_maxilimit = 0x7fffffffffffffffLL; | ||
660 | oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS); | ||
661 | if (!oinfo) { | ||
662 | mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota" | ||
663 | " info."); | ||
664 | goto out_err; | ||
665 | } | ||
666 | info->dqi_priv = oinfo; | ||
667 | oinfo->dqi_type = type; | ||
668 | INIT_LIST_HEAD(&oinfo->dqi_chunk); | ||
669 | oinfo->dqi_rec = NULL; | ||
670 | oinfo->dqi_lqi_bh = NULL; | ||
671 | oinfo->dqi_ibh = NULL; | ||
672 | |||
673 | status = ocfs2_global_read_info(sb, type); | ||
674 | if (status < 0) | ||
675 | goto out_err; | ||
676 | |||
677 | status = ocfs2_inode_lock(lqinode, &oinfo->dqi_lqi_bh, 1); | ||
678 | if (status < 0) { | ||
679 | mlog_errno(status); | ||
680 | goto out_err; | ||
681 | } | ||
682 | locked = 1; | ||
683 | |||
684 | /* Now read local header */ | ||
685 | status = ocfs2_read_quota_block(lqinode, 0, &bh); | ||
686 | if (status) { | ||
687 | mlog_errno(status); | ||
688 | mlog(ML_ERROR, "failed to read quota file info header " | ||
689 | "(type=%d)\n", type); | ||
690 | goto out_err; | ||
691 | } | ||
692 | ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + | ||
693 | OCFS2_LOCAL_INFO_OFF); | ||
694 | info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags); | ||
695 | oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks); | ||
696 | oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks); | ||
697 | oinfo->dqi_ibh = bh; | ||
698 | |||
699 | /* We crashed when using local quota file? */ | ||
700 | if (!(info->dqi_flags & OLQF_CLEAN)) { | ||
701 | rec = OCFS2_SB(sb)->quota_rec; | ||
702 | if (!rec) { | ||
703 | rec = ocfs2_alloc_quota_recovery(); | ||
704 | if (!rec) { | ||
705 | status = -ENOMEM; | ||
706 | mlog_errno(status); | ||
707 | goto out_err; | ||
708 | } | ||
709 | OCFS2_SB(sb)->quota_rec = rec; | ||
710 | } | ||
711 | |||
712 | status = ocfs2_recovery_load_quota(lqinode, ldinfo, type, | ||
713 | &rec->r_list[type]); | ||
714 | if (status < 0) { | ||
715 | mlog_errno(status); | ||
716 | goto out_err; | ||
717 | } | ||
718 | } | ||
719 | |||
720 | status = ocfs2_load_local_quota_bitmaps(lqinode, | ||
721 | ldinfo, | ||
722 | &oinfo->dqi_chunk); | ||
723 | if (status < 0) { | ||
724 | mlog_errno(status); | ||
725 | goto out_err; | ||
726 | } | ||
727 | |||
728 | /* Now mark quota file as used */ | ||
729 | info->dqi_flags &= ~OLQF_CLEAN; | ||
730 | status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info); | ||
731 | if (status < 0) { | ||
732 | mlog_errno(status); | ||
733 | goto out_err; | ||
734 | } | ||
735 | |||
736 | return 0; | ||
737 | out_err: | ||
738 | if (oinfo) { | ||
739 | iput(oinfo->dqi_gqinode); | ||
740 | ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock); | ||
741 | ocfs2_lock_res_free(&oinfo->dqi_gqlock); | ||
742 | brelse(oinfo->dqi_lqi_bh); | ||
743 | if (locked) | ||
744 | ocfs2_inode_unlock(lqinode, 1); | ||
745 | ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk); | ||
746 | kfree(oinfo); | ||
747 | } | ||
748 | brelse(bh); | ||
749 | return -1; | ||
750 | } | ||
751 | |||
752 | /* Write local info to quota file */ | ||
753 | static int ocfs2_local_write_info(struct super_block *sb, int type) | ||
754 | { | ||
755 | struct mem_dqinfo *info = sb_dqinfo(sb, type); | ||
756 | struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv) | ||
757 | ->dqi_ibh; | ||
758 | int status; | ||
759 | |||
760 | status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info, | ||
761 | info); | ||
762 | if (status < 0) { | ||
763 | mlog_errno(status); | ||
764 | return -1; | ||
765 | } | ||
766 | |||
767 | return 0; | ||
768 | } | ||
769 | |||
770 | /* Release info from memory */ | ||
771 | static int ocfs2_local_free_info(struct super_block *sb, int type) | ||
772 | { | ||
773 | struct mem_dqinfo *info = sb_dqinfo(sb, type); | ||
774 | struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; | ||
775 | struct ocfs2_quota_chunk *chunk; | ||
776 | struct ocfs2_local_disk_chunk *dchunk; | ||
777 | int mark_clean = 1, len; | ||
778 | int status; | ||
779 | |||
780 | /* At this point we know there are no more dquots and thus | ||
781 | * even if there's some sync in the pdflush queue, it won't | ||
782 | * find any dquots and return without doing anything */ | ||
783 | cancel_delayed_work_sync(&oinfo->dqi_sync_work); | ||
784 | iput(oinfo->dqi_gqinode); | ||
785 | ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock); | ||
786 | ocfs2_lock_res_free(&oinfo->dqi_gqlock); | ||
787 | list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) { | ||
788 | dchunk = (struct ocfs2_local_disk_chunk *) | ||
789 | (chunk->qc_headerbh->b_data); | ||
790 | if (chunk->qc_num < oinfo->dqi_chunks - 1) { | ||
791 | len = ol_chunk_entries(sb); | ||
792 | } else { | ||
793 | len = (oinfo->dqi_blocks - | ||
794 | ol_quota_chunk_block(sb, chunk->qc_num) - 1) | ||
795 | * ol_quota_entries_per_block(sb); | ||
796 | } | ||
797 | /* Not all entries free? Bug! */ | ||
798 | if (le32_to_cpu(dchunk->dqc_free) != len) { | ||
799 | mlog(ML_ERROR, "releasing quota file with used " | ||
800 | "entries (type=%d)\n", type); | ||
801 | mark_clean = 0; | ||
802 | } | ||
803 | } | ||
804 | ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk); | ||
805 | |||
806 | /* dqonoff_mutex protects us against racing with recovery thread... */ | ||
807 | if (oinfo->dqi_rec) { | ||
808 | ocfs2_free_quota_recovery(oinfo->dqi_rec); | ||
809 | mark_clean = 0; | ||
810 | } | ||
811 | |||
812 | if (!mark_clean) | ||
813 | goto out; | ||
814 | |||
815 | /* Mark local file as clean */ | ||
816 | info->dqi_flags |= OLQF_CLEAN; | ||
817 | status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], | ||
818 | oinfo->dqi_ibh, | ||
819 | olq_update_info, | ||
820 | info); | ||
821 | if (status < 0) { | ||
822 | mlog_errno(status); | ||
823 | goto out; | ||
824 | } | ||
825 | |||
826 | out: | ||
827 | ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1); | ||
828 | brelse(oinfo->dqi_ibh); | ||
829 | brelse(oinfo->dqi_lqi_bh); | ||
830 | kfree(oinfo); | ||
831 | return 0; | ||
832 | } | ||
833 | |||
834 | static void olq_set_dquot(struct buffer_head *bh, void *private) | ||
835 | { | ||
836 | struct ocfs2_dquot *od = private; | ||
837 | struct ocfs2_local_disk_dqblk *dqblk; | ||
838 | struct super_block *sb = od->dq_dquot.dq_sb; | ||
839 | |||
840 | dqblk = (struct ocfs2_local_disk_dqblk *)(bh->b_data | ||
841 | + ol_dqblk_block_offset(sb, od->dq_local_off)); | ||
842 | |||
843 | dqblk->dqb_id = cpu_to_le64(od->dq_dquot.dq_id); | ||
844 | spin_lock(&dq_data_lock); | ||
845 | dqblk->dqb_spacemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curspace - | ||
846 | od->dq_origspace); | ||
847 | dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes - | ||
848 | od->dq_originodes); | ||
849 | spin_unlock(&dq_data_lock); | ||
850 | mlog(0, "Writing local dquot %u space %lld inodes %lld\n", | ||
851 | od->dq_dquot.dq_id, (long long)le64_to_cpu(dqblk->dqb_spacemod), | ||
852 | (long long)le64_to_cpu(dqblk->dqb_inodemod)); | ||
853 | } | ||
854 | |||
855 | /* Write dquot to local quota file */ | ||
856 | static int ocfs2_local_write_dquot(struct dquot *dquot) | ||
857 | { | ||
858 | struct super_block *sb = dquot->dq_sb; | ||
859 | struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); | ||
860 | struct buffer_head *bh = NULL; | ||
861 | int status; | ||
862 | |||
863 | status = ocfs2_read_quota_block(sb_dqopt(sb)->files[dquot->dq_type], | ||
864 | ol_dqblk_file_block(sb, od->dq_local_off), | ||
865 | &bh); | ||
866 | if (status) { | ||
867 | mlog_errno(status); | ||
868 | goto out; | ||
869 | } | ||
870 | status = ocfs2_modify_bh(sb_dqopt(sb)->files[dquot->dq_type], bh, | ||
871 | olq_set_dquot, od); | ||
872 | if (status < 0) { | ||
873 | mlog_errno(status); | ||
874 | goto out; | ||
875 | } | ||
876 | out: | ||
877 | brelse(bh); | ||
878 | return status; | ||
879 | } | ||
880 | |||
881 | /* Find free entry in local quota file */ | ||
882 | static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb, | ||
883 | int type, | ||
884 | int *offset) | ||
885 | { | ||
886 | struct mem_dqinfo *info = sb_dqinfo(sb, type); | ||
887 | struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; | ||
888 | struct ocfs2_quota_chunk *chunk; | ||
889 | struct ocfs2_local_disk_chunk *dchunk; | ||
890 | int found = 0, len; | ||
891 | |||
892 | list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) { | ||
893 | dchunk = (struct ocfs2_local_disk_chunk *) | ||
894 | chunk->qc_headerbh->b_data; | ||
895 | if (le32_to_cpu(dchunk->dqc_free) > 0) { | ||
896 | found = 1; | ||
897 | break; | ||
898 | } | ||
899 | } | ||
900 | if (!found) | ||
901 | return NULL; | ||
902 | |||
903 | if (chunk->qc_num < oinfo->dqi_chunks - 1) { | ||
904 | len = ol_chunk_entries(sb); | ||
905 | } else { | ||
906 | len = (oinfo->dqi_blocks - | ||
907 | ol_quota_chunk_block(sb, chunk->qc_num) - 1) | ||
908 | * ol_quota_entries_per_block(sb); | ||
909 | } | ||
910 | |||
911 | found = ocfs2_find_next_zero_bit(dchunk->dqc_bitmap, len, 0); | ||
912 | /* We failed? */ | ||
913 | if (found == len) { | ||
914 | mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u" | ||
915 | " entries free (type=%d)\n", chunk->qc_num, | ||
916 | le32_to_cpu(dchunk->dqc_free), type); | ||
917 | return ERR_PTR(-EIO); | ||
918 | } | ||
919 | *offset = found; | ||
920 | return chunk; | ||
921 | } | ||
922 | |||
923 | /* Add new chunk to the local quota file */ | ||
924 | static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( | ||
925 | struct super_block *sb, | ||
926 | int type, | ||
927 | int *offset) | ||
928 | { | ||
929 | struct mem_dqinfo *info = sb_dqinfo(sb, type); | ||
930 | struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; | ||
931 | struct inode *lqinode = sb_dqopt(sb)->files[type]; | ||
932 | struct ocfs2_quota_chunk *chunk = NULL; | ||
933 | struct ocfs2_local_disk_chunk *dchunk; | ||
934 | int status; | ||
935 | handle_t *handle; | ||
936 | struct buffer_head *bh = NULL; | ||
937 | u64 p_blkno; | ||
938 | |||
939 | /* We are protected by dqio_sem so no locking needed */ | ||
940 | status = ocfs2_extend_no_holes(lqinode, | ||
941 | lqinode->i_size + 2 * sb->s_blocksize, | ||
942 | lqinode->i_size); | ||
943 | if (status < 0) { | ||
944 | mlog_errno(status); | ||
945 | goto out; | ||
946 | } | ||
947 | status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh, | ||
948 | lqinode->i_size + 2 * sb->s_blocksize); | ||
949 | if (status < 0) { | ||
950 | mlog_errno(status); | ||
951 | goto out; | ||
952 | } | ||
953 | |||
954 | chunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS); | ||
955 | if (!chunk) { | ||
956 | status = -ENOMEM; | ||
957 | mlog_errno(status); | ||
958 | goto out; | ||
959 | } | ||
960 | |||
961 | down_read(&OCFS2_I(lqinode)->ip_alloc_sem); | ||
962 | status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, | ||
963 | &p_blkno, NULL, NULL); | ||
964 | up_read(&OCFS2_I(lqinode)->ip_alloc_sem); | ||
965 | if (status < 0) { | ||
966 | mlog_errno(status); | ||
967 | goto out; | ||
968 | } | ||
969 | bh = sb_getblk(sb, p_blkno); | ||
970 | if (!bh) { | ||
971 | status = -ENOMEM; | ||
972 | mlog_errno(status); | ||
973 | goto out; | ||
974 | } | ||
975 | dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; | ||
976 | |||
977 | handle = ocfs2_start_trans(OCFS2_SB(sb), 2); | ||
978 | if (IS_ERR(handle)) { | ||
979 | status = PTR_ERR(handle); | ||
980 | mlog_errno(status); | ||
981 | goto out; | ||
982 | } | ||
983 | |||
984 | status = ocfs2_journal_access_dq(handle, lqinode, bh, | ||
985 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
986 | if (status < 0) { | ||
987 | mlog_errno(status); | ||
988 | goto out_trans; | ||
989 | } | ||
990 | lock_buffer(bh); | ||
991 | dchunk->dqc_free = cpu_to_le32(ol_quota_entries_per_block(sb)); | ||
992 | memset(dchunk->dqc_bitmap, 0, | ||
993 | sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - | ||
994 | OCFS2_QBLK_RESERVED_SPACE); | ||
995 | set_buffer_uptodate(bh); | ||
996 | unlock_buffer(bh); | ||
997 | status = ocfs2_journal_dirty(handle, bh); | ||
998 | if (status < 0) { | ||
999 | mlog_errno(status); | ||
1000 | goto out_trans; | ||
1001 | } | ||
1002 | |||
1003 | oinfo->dqi_blocks += 2; | ||
1004 | oinfo->dqi_chunks++; | ||
1005 | status = ocfs2_local_write_info(sb, type); | ||
1006 | if (status < 0) { | ||
1007 | mlog_errno(status); | ||
1008 | goto out_trans; | ||
1009 | } | ||
1010 | status = ocfs2_commit_trans(OCFS2_SB(sb), handle); | ||
1011 | if (status < 0) { | ||
1012 | mlog_errno(status); | ||
1013 | goto out; | ||
1014 | } | ||
1015 | |||
1016 | list_add_tail(&chunk->qc_chunk, &oinfo->dqi_chunk); | ||
1017 | chunk->qc_num = list_entry(chunk->qc_chunk.prev, | ||
1018 | struct ocfs2_quota_chunk, | ||
1019 | qc_chunk)->qc_num + 1; | ||
1020 | chunk->qc_headerbh = bh; | ||
1021 | *offset = 0; | ||
1022 | return chunk; | ||
1023 | out_trans: | ||
1024 | ocfs2_commit_trans(OCFS2_SB(sb), handle); | ||
1025 | out: | ||
1026 | brelse(bh); | ||
1027 | kmem_cache_free(ocfs2_qf_chunk_cachep, chunk); | ||
1028 | return ERR_PTR(status); | ||
1029 | } | ||
1030 | |||
1031 | /* Extend the local quota file to make room for a new dquot entry */ | ||
1032 | static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( | ||
1033 | struct super_block *sb, | ||
1034 | int type, | ||
1035 | int *offset) | ||
1036 | { | ||
1037 | struct mem_dqinfo *info = sb_dqinfo(sb, type); | ||
1038 | struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; | ||
1039 | struct ocfs2_quota_chunk *chunk; | ||
1040 | struct inode *lqinode = sb_dqopt(sb)->files[type]; | ||
1041 | struct ocfs2_local_disk_chunk *dchunk; | ||
1042 | int epb = ol_quota_entries_per_block(sb); | ||
1043 | unsigned int chunk_blocks; | ||
1044 | int status; | ||
1045 | handle_t *handle; | ||
1046 | |||
1047 | if (list_empty(&oinfo->dqi_chunk)) | ||
1048 | return ocfs2_local_quota_add_chunk(sb, type, offset); | ||
1049 | /* Is the last chunk full? */ | ||
1050 | chunk = list_entry(oinfo->dqi_chunk.prev, | ||
1051 | struct ocfs2_quota_chunk, qc_chunk); | ||
1052 | chunk_blocks = oinfo->dqi_blocks - | ||
1053 | ol_quota_chunk_block(sb, chunk->qc_num) - 1; | ||
1054 | if (ol_chunk_blocks(sb) == chunk_blocks) | ||
1055 | return ocfs2_local_quota_add_chunk(sb, type, offset); | ||
1056 | |||
1057 | /* We are protected by dqio_sem so no locking needed */ | ||
1058 | status = ocfs2_extend_no_holes(lqinode, | ||
1059 | lqinode->i_size + sb->s_blocksize, | ||
1060 | lqinode->i_size); | ||
1061 | if (status < 0) { | ||
1062 | mlog_errno(status); | ||
1063 | goto out; | ||
1064 | } | ||
1065 | status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh, | ||
1066 | lqinode->i_size + sb->s_blocksize); | ||
1067 | if (status < 0) { | ||
1068 | mlog_errno(status); | ||
1069 | goto out; | ||
1070 | } | ||
1071 | handle = ocfs2_start_trans(OCFS2_SB(sb), 2); | ||
1072 | if (IS_ERR(handle)) { | ||
1073 | status = PTR_ERR(handle); | ||
1074 | mlog_errno(status); | ||
1075 | goto out; | ||
1076 | } | ||
1077 | status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh, | ||
1078 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1079 | if (status < 0) { | ||
1080 | mlog_errno(status); | ||
1081 | goto out_trans; | ||
1082 | } | ||
1083 | |||
1084 | dchunk = (struct ocfs2_local_disk_chunk *)chunk->qc_headerbh->b_data; | ||
1085 | lock_buffer(chunk->qc_headerbh); | ||
1086 | le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb)); | ||
1087 | unlock_buffer(chunk->qc_headerbh); | ||
1088 | status = ocfs2_journal_dirty(handle, chunk->qc_headerbh); | ||
1089 | if (status < 0) { | ||
1090 | mlog_errno(status); | ||
1091 | goto out_trans; | ||
1092 | } | ||
1093 | oinfo->dqi_blocks++; | ||
1094 | status = ocfs2_local_write_info(sb, type); | ||
1095 | if (status < 0) { | ||
1096 | mlog_errno(status); | ||
1097 | goto out_trans; | ||
1098 | } | ||
1099 | |||
1100 | status = ocfs2_commit_trans(OCFS2_SB(sb), handle); | ||
1101 | if (status < 0) { | ||
1102 | mlog_errno(status); | ||
1103 | goto out; | ||
1104 | } | ||
1105 | *offset = chunk_blocks * epb; | ||
1106 | return chunk; | ||
1107 | out_trans: | ||
1108 | ocfs2_commit_trans(OCFS2_SB(sb), handle); | ||
1109 | out: | ||
1110 | return ERR_PTR(status); | ||
1111 | } | ||
1112 | |||
1113 | static void olq_alloc_dquot(struct buffer_head *bh, void *private) | ||
1114 | { | ||
1115 | int *offset = private; | ||
1116 | struct ocfs2_local_disk_chunk *dchunk; | ||
1117 | |||
1118 | dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; | ||
1119 | ocfs2_set_bit(*offset, dchunk->dqc_bitmap); | ||
1120 | le32_add_cpu(&dchunk->dqc_free, -1); | ||
1121 | } | ||
1122 | |||
1123 | /* Create dquot in the local file for given id */ | ||
1124 | static int ocfs2_create_local_dquot(struct dquot *dquot) | ||
1125 | { | ||
1126 | struct super_block *sb = dquot->dq_sb; | ||
1127 | int type = dquot->dq_type; | ||
1128 | struct inode *lqinode = sb_dqopt(sb)->files[type]; | ||
1129 | struct ocfs2_quota_chunk *chunk; | ||
1130 | struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); | ||
1131 | int offset; | ||
1132 | int status; | ||
1133 | |||
1134 | chunk = ocfs2_find_free_entry(sb, type, &offset); | ||
1135 | if (!chunk) { | ||
1136 | chunk = ocfs2_extend_local_quota_file(sb, type, &offset); | ||
1137 | if (IS_ERR(chunk)) | ||
1138 | return PTR_ERR(chunk); | ||
1139 | } else if (IS_ERR(chunk)) { | ||
1140 | return PTR_ERR(chunk); | ||
1141 | } | ||
1142 | od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset); | ||
1143 | od->dq_chunk = chunk; | ||
1144 | |||
1145 | /* Initialize dquot structure on disk */ | ||
1146 | status = ocfs2_local_write_dquot(dquot); | ||
1147 | if (status < 0) { | ||
1148 | mlog_errno(status); | ||
1149 | goto out; | ||
1150 | } | ||
1151 | |||
1152 | /* Mark structure as allocated */ | ||
1153 | status = ocfs2_modify_bh(lqinode, chunk->qc_headerbh, olq_alloc_dquot, | ||
1154 | &offset); | ||
1155 | if (status < 0) { | ||
1156 | mlog_errno(status); | ||
1157 | goto out; | ||
1158 | } | ||
1159 | out: | ||
1160 | return status; | ||
1161 | } | ||
1162 | |||
1163 | /* Create entry in local file for dquot, load data from the global file */ | ||
1164 | static int ocfs2_local_read_dquot(struct dquot *dquot) | ||
1165 | { | ||
1166 | int status; | ||
1167 | |||
1168 | mlog_entry("id=%u, type=%d\n", dquot->dq_id, dquot->dq_type); | ||
1169 | |||
1170 | status = ocfs2_global_read_dquot(dquot); | ||
1171 | if (status < 0) { | ||
1172 | mlog_errno(status); | ||
1173 | goto out_err; | ||
1174 | } | ||
1175 | |||
1176 | /* Now create entry in the local quota file */ | ||
1177 | status = ocfs2_create_local_dquot(dquot); | ||
1178 | if (status < 0) { | ||
1179 | mlog_errno(status); | ||
1180 | goto out_err; | ||
1181 | } | ||
1182 | mlog_exit(0); | ||
1183 | return 0; | ||
1184 | out_err: | ||
1185 | mlog_exit(status); | ||
1186 | return status; | ||
1187 | } | ||
1188 | |||
1189 | /* Release dquot structure from local quota file. ocfs2_release_dquot() has | ||
1190 | * already started a transaction and obtained exclusive lock for global | ||
1191 | * quota file. */ | ||
1192 | static int ocfs2_local_release_dquot(struct dquot *dquot) | ||
1193 | { | ||
1194 | int status; | ||
1195 | int type = dquot->dq_type; | ||
1196 | struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); | ||
1197 | struct super_block *sb = dquot->dq_sb; | ||
1198 | struct ocfs2_local_disk_chunk *dchunk; | ||
1199 | int offset; | ||
1200 | handle_t *handle = journal_current_handle(); | ||
1201 | |||
1202 | BUG_ON(!handle); | ||
1203 | /* First write all local changes to global file */ | ||
1204 | status = ocfs2_global_release_dquot(dquot); | ||
1205 | if (status < 0) { | ||
1206 | mlog_errno(status); | ||
1207 | goto out; | ||
1208 | } | ||
1209 | |||
1210 | status = ocfs2_journal_access_dq(handle, sb_dqopt(sb)->files[type], | ||
1211 | od->dq_chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE); | ||
1212 | if (status < 0) { | ||
1213 | mlog_errno(status); | ||
1214 | goto out; | ||
1215 | } | ||
1216 | offset = ol_dqblk_chunk_off(sb, od->dq_chunk->qc_num, | ||
1217 | od->dq_local_off); | ||
1218 | dchunk = (struct ocfs2_local_disk_chunk *) | ||
1219 | (od->dq_chunk->qc_headerbh->b_data); | ||
1220 | /* Mark structure as freed */ | ||
1221 | lock_buffer(od->dq_chunk->qc_headerbh); | ||
1222 | ocfs2_clear_bit(offset, dchunk->dqc_bitmap); | ||
1223 | le32_add_cpu(&dchunk->dqc_free, 1); | ||
1224 | unlock_buffer(od->dq_chunk->qc_headerbh); | ||
1225 | status = ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh); | ||
1226 | if (status < 0) { | ||
1227 | mlog_errno(status); | ||
1228 | goto out; | ||
1229 | } | ||
1230 | status = 0; | ||
1231 | out: | ||
1232 | /* Clear the read bit so that next time someone uses this | ||
1233 | * dquot he reads fresh info from disk and allocates local | ||
1234 | * dquot structure */ | ||
1235 | clear_bit(DQ_READ_B, &dquot->dq_flags); | ||
1236 | return status; | ||
1237 | } | ||
1238 | |||
1239 | static struct quota_format_ops ocfs2_format_ops = { | ||
1240 | .check_quota_file = ocfs2_local_check_quota_file, | ||
1241 | .read_file_info = ocfs2_local_read_info, | ||
1242 | .write_file_info = ocfs2_global_write_info, | ||
1243 | .free_file_info = ocfs2_local_free_info, | ||
1244 | .read_dqblk = ocfs2_local_read_dquot, | ||
1245 | .commit_dqblk = ocfs2_local_write_dquot, | ||
1246 | .release_dqblk = ocfs2_local_release_dquot, | ||
1247 | }; | ||
1248 | |||
1249 | struct quota_format_type ocfs2_quota_format = { | ||
1250 | .qf_fmt_id = QFMT_OCFS2, | ||
1251 | .qf_ops = &ocfs2_format_ops, | ||
1252 | .qf_owner = THIS_MODULE | ||
1253 | }; | ||
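The ocfs2_quota_format type above is what plugs the local-format callbacks into the generic quota core. As a minimal sketch only (the registration site is not part of this hunk, and the init/exit names below are placeholders), a format type like this is typically made available with register_quota_format()/unregister_quota_format():

#include <linux/module.h>
#include <linux/quota.h>

/* Sketch: once registered, the quota core matches a mount's requested
 * format against qf_fmt_id (QFMT_OCFS2 here). */
static int __init ocfs2_quota_format_init(void)
{
	return register_quota_format(&ocfs2_quota_format);
}

static void __exit ocfs2_quota_format_exit(void)
{
	unregister_quota_format(&ocfs2_quota_format);
}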
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index ffd48db229a7..424adaa5f900 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c | |||
@@ -106,8 +106,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, | |||
106 | mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n", | 106 | mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n", |
107 | new_clusters, first_new_cluster); | 107 | new_clusters, first_new_cluster); |
108 | 108 | ||
109 | ret = ocfs2_journal_access(handle, bm_inode, group_bh, | 109 | ret = ocfs2_journal_access_gd(handle, bm_inode, group_bh, |
110 | OCFS2_JOURNAL_ACCESS_WRITE); | 110 | OCFS2_JOURNAL_ACCESS_WRITE); |
111 | if (ret < 0) { | 111 | if (ret < 0) { |
112 | mlog_errno(ret); | 112 | mlog_errno(ret); |
113 | goto out; | 113 | goto out; |
@@ -141,8 +141,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, | |||
141 | } | 141 | } |
142 | 142 | ||
143 | /* update the inode accordingly. */ | 143 | /* update the inode accordingly. */ |
144 | ret = ocfs2_journal_access(handle, bm_inode, bm_bh, | 144 | ret = ocfs2_journal_access_di(handle, bm_inode, bm_bh, |
145 | OCFS2_JOURNAL_ACCESS_WRITE); | 145 | OCFS2_JOURNAL_ACCESS_WRITE); |
146 | if (ret < 0) { | 146 | if (ret < 0) { |
147 | mlog_errno(ret); | 147 | mlog_errno(ret); |
148 | goto out_rollback; | 148 | goto out_rollback; |
@@ -314,6 +314,10 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters) | |||
314 | 314 | ||
315 | fe = (struct ocfs2_dinode *)main_bm_bh->b_data; | 315 | fe = (struct ocfs2_dinode *)main_bm_bh->b_data; |
316 | 316 | ||
317 | /* main_bm_bh is validated by inode read inside ocfs2_inode_lock(), | ||
318 | * so any corruption is a code bug. */ | ||
319 | BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); | ||
320 | |||
317 | if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != | 321 | if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != |
318 | ocfs2_group_bitmap_size(osb->sb) * 8) { | 322 | ocfs2_group_bitmap_size(osb->sb) * 8) { |
319 | mlog(ML_ERROR, "The disk is too old and small. " | 323 | mlog(ML_ERROR, "The disk is too old and small. " |
@@ -322,30 +326,18 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters) | |||
322 | goto out_unlock; | 326 | goto out_unlock; |
323 | } | 327 | } |
324 | 328 | ||
325 | if (!OCFS2_IS_VALID_DINODE(fe)) { | ||
326 | OCFS2_RO_ON_INVALID_DINODE(main_bm_inode->i_sb, fe); | ||
327 | ret = -EIO; | ||
328 | goto out_unlock; | ||
329 | } | ||
330 | |||
331 | first_new_cluster = le32_to_cpu(fe->i_clusters); | 329 | first_new_cluster = le32_to_cpu(fe->i_clusters); |
332 | lgd_blkno = ocfs2_which_cluster_group(main_bm_inode, | 330 | lgd_blkno = ocfs2_which_cluster_group(main_bm_inode, |
333 | first_new_cluster - 1); | 331 | first_new_cluster - 1); |
334 | 332 | ||
335 | ret = ocfs2_read_block(main_bm_inode, lgd_blkno, &group_bh); | 333 | ret = ocfs2_read_group_descriptor(main_bm_inode, fe, lgd_blkno, |
334 | &group_bh); | ||
336 | if (ret < 0) { | 335 | if (ret < 0) { |
337 | mlog_errno(ret); | 336 | mlog_errno(ret); |
338 | goto out_unlock; | 337 | goto out_unlock; |
339 | } | 338 | } |
340 | |||
341 | group = (struct ocfs2_group_desc *)group_bh->b_data; | 339 | group = (struct ocfs2_group_desc *)group_bh->b_data; |
342 | 340 | ||
343 | ret = ocfs2_check_group_descriptor(inode->i_sb, fe, group); | ||
344 | if (ret) { | ||
345 | mlog_errno(ret); | ||
346 | goto out_unlock; | ||
347 | } | ||
348 | |||
349 | cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc); | 341 | cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc); |
350 | if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters > | 342 | if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters > |
351 | le16_to_cpu(fe->id2.i_chain.cl_cpg)) { | 343 | le16_to_cpu(fe->id2.i_chain.cl_cpg)) { |
@@ -398,41 +390,16 @@ static int ocfs2_check_new_group(struct inode *inode, | |||
398 | struct buffer_head *group_bh) | 390 | struct buffer_head *group_bh) |
399 | { | 391 | { |
400 | int ret; | 392 | int ret; |
401 | struct ocfs2_group_desc *gd; | 393 | struct ocfs2_group_desc *gd = |
394 | (struct ocfs2_group_desc *)group_bh->b_data; | ||
402 | u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc); | 395 | u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc); |
403 | unsigned int max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * | ||
404 | le16_to_cpu(di->id2.i_chain.cl_bpc); | ||
405 | |||
406 | 396 | ||
407 | gd = (struct ocfs2_group_desc *)group_bh->b_data; | 397 | ret = ocfs2_check_group_descriptor(inode->i_sb, di, group_bh); |
398 | if (ret) | ||
399 | goto out; | ||
408 | 400 | ||
409 | ret = -EIO; | 401 | ret = -EINVAL; |
410 | if (!OCFS2_IS_VALID_GROUP_DESC(gd)) | 402 | if (le16_to_cpu(gd->bg_chain) != input->chain) |
411 | mlog(ML_ERROR, "Group descriptor # %llu isn't valid.\n", | ||
412 | (unsigned long long)le64_to_cpu(gd->bg_blkno)); | ||
413 | else if (di->i_blkno != gd->bg_parent_dinode) | ||
414 | mlog(ML_ERROR, "Group descriptor # %llu has bad parent " | ||
415 | "pointer (%llu, expected %llu)\n", | ||
416 | (unsigned long long)le64_to_cpu(gd->bg_blkno), | ||
417 | (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), | ||
418 | (unsigned long long)le64_to_cpu(di->i_blkno)); | ||
419 | else if (le16_to_cpu(gd->bg_bits) > max_bits) | ||
420 | mlog(ML_ERROR, "Group descriptor # %llu has bit count of %u\n", | ||
421 | (unsigned long long)le64_to_cpu(gd->bg_blkno), | ||
422 | le16_to_cpu(gd->bg_bits)); | ||
423 | else if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) | ||
424 | mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but " | ||
425 | "claims that %u are free\n", | ||
426 | (unsigned long long)le64_to_cpu(gd->bg_blkno), | ||
427 | le16_to_cpu(gd->bg_bits), | ||
428 | le16_to_cpu(gd->bg_free_bits_count)); | ||
429 | else if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) | ||
430 | mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but " | ||
431 | "max bitmap bits of %u\n", | ||
432 | (unsigned long long)le64_to_cpu(gd->bg_blkno), | ||
433 | le16_to_cpu(gd->bg_bits), | ||
434 | 8 * le16_to_cpu(gd->bg_size)); | ||
435 | else if (le16_to_cpu(gd->bg_chain) != input->chain) | ||
436 | mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u " | 403 | mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u " |
437 | "while input has %u set.\n", | 404 | "while input has %u set.\n", |
438 | (unsigned long long)le64_to_cpu(gd->bg_blkno), | 405 | (unsigned long long)le64_to_cpu(gd->bg_blkno), |
@@ -451,6 +418,7 @@ static int ocfs2_check_new_group(struct inode *inode, | |||
451 | else | 418 | else |
452 | ret = 0; | 419 | ret = 0; |
453 | 420 | ||
421 | out: | ||
454 | return ret; | 422 | return ret; |
455 | } | 423 | } |
456 | 424 | ||
@@ -568,8 +536,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) | |||
568 | cl = &fe->id2.i_chain; | 536 | cl = &fe->id2.i_chain; |
569 | cr = &cl->cl_recs[input->chain]; | 537 | cr = &cl->cl_recs[input->chain]; |
570 | 538 | ||
571 | ret = ocfs2_journal_access(handle, main_bm_inode, group_bh, | 539 | ret = ocfs2_journal_access_gd(handle, main_bm_inode, group_bh, |
572 | OCFS2_JOURNAL_ACCESS_WRITE); | 540 | OCFS2_JOURNAL_ACCESS_WRITE); |
573 | if (ret < 0) { | 541 | if (ret < 0) { |
574 | mlog_errno(ret); | 542 | mlog_errno(ret); |
575 | goto out_commit; | 543 | goto out_commit; |
@@ -584,8 +552,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) | |||
584 | goto out_commit; | 552 | goto out_commit; |
585 | } | 553 | } |
586 | 554 | ||
587 | ret = ocfs2_journal_access(handle, main_bm_inode, main_bm_bh, | 555 | ret = ocfs2_journal_access_di(handle, main_bm_inode, main_bm_bh, |
588 | OCFS2_JOURNAL_ACCESS_WRITE); | 556 | OCFS2_JOURNAL_ACCESS_WRITE); |
589 | if (ret < 0) { | 557 | if (ret < 0) { |
590 | mlog_errno(ret); | 558 | mlog_errno(ret); |
591 | goto out_commit; | 559 | goto out_commit; |
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index bdda2d8f8508..40661e7824e9 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c | |||
@@ -151,7 +151,7 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb) | |||
151 | * this is not true, the read of -1 (UINT64_MAX) will fail. | 151 | * this is not true, the read of -1 (UINT64_MAX) will fail. |
152 | */ | 152 | */ |
153 | ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh, | 153 | ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh, |
154 | OCFS2_BH_IGNORE_CACHE); | 154 | OCFS2_BH_IGNORE_CACHE, NULL); |
155 | if (ret == 0) { | 155 | if (ret == 0) { |
156 | spin_lock(&osb->osb_lock); | 156 | spin_lock(&osb->osb_lock); |
157 | ocfs2_update_slot_info(si); | 157 | ocfs2_update_slot_info(si); |
@@ -405,7 +405,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, | |||
405 | 405 | ||
406 | bh = NULL; /* Acquire a fresh bh */ | 406 | bh = NULL; /* Acquire a fresh bh */ |
407 | status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh, | 407 | status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh, |
408 | OCFS2_BH_IGNORE_CACHE); | 408 | OCFS2_BH_IGNORE_CACHE, NULL); |
409 | if (status < 0) { | 409 | if (status < 0) { |
410 | mlog_errno(status); | 410 | mlog_errno(status); |
411 | goto bail; | 411 | goto bail; |
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index c5ff18b46b57..a69628603e18 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c | |||
@@ -35,6 +35,7 @@ | |||
35 | #include "ocfs2.h" | 35 | #include "ocfs2.h" |
36 | 36 | ||
37 | #include "alloc.h" | 37 | #include "alloc.h" |
38 | #include "blockcheck.h" | ||
38 | #include "dlmglue.h" | 39 | #include "dlmglue.h" |
39 | #include "inode.h" | 40 | #include "inode.h" |
40 | #include "journal.h" | 41 | #include "journal.h" |
@@ -145,62 +146,183 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl) | |||
145 | return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc); | 146 | return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc); |
146 | } | 147 | } |
147 | 148 | ||
148 | /* somewhat more expensive than our other checks, so use sparingly. */ | 149 | #define do_error(fmt, ...) \ |
149 | int ocfs2_check_group_descriptor(struct super_block *sb, | 150 | do { \ |
150 | struct ocfs2_dinode *di, | 151 | if (clean_error) \ |
151 | struct ocfs2_group_desc *gd) | 152 | mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \ |
153 | else \ | ||
154 | ocfs2_error(sb, fmt, ##__VA_ARGS__); \ | ||
155 | } while (0) | ||
156 | |||
157 | static int ocfs2_validate_gd_self(struct super_block *sb, | ||
158 | struct buffer_head *bh, | ||
159 | int clean_error) | ||
152 | { | 160 | { |
153 | unsigned int max_bits; | 161 | struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; |
154 | 162 | ||
155 | if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { | 163 | if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { |
156 | OCFS2_RO_ON_INVALID_GROUP_DESC(sb, gd); | 164 | do_error("Group descriptor #%llu has bad signature %.*s", |
157 | return -EIO; | 165 | (unsigned long long)bh->b_blocknr, 7, |
166 | gd->bg_signature); | ||
167 | return -EINVAL; | ||
158 | } | 168 | } |
159 | 169 | ||
170 | if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) { | ||
171 | do_error("Group descriptor #%llu has an invalid bg_blkno " | ||
172 | "of %llu", | ||
173 | (unsigned long long)bh->b_blocknr, | ||
174 | (unsigned long long)le64_to_cpu(gd->bg_blkno)); | ||
175 | return -EINVAL; | ||
176 | } | ||
177 | |||
178 | if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) { | ||
179 | do_error("Group descriptor #%llu has an invalid " | ||
180 | "fs_generation of #%u", | ||
181 | (unsigned long long)bh->b_blocknr, | ||
182 | le32_to_cpu(gd->bg_generation)); | ||
183 | return -EINVAL; | ||
184 | } | ||
185 | |||
186 | if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) { | ||
187 | do_error("Group descriptor #%llu has bit count %u but " | ||
188 | "claims that %u are free", | ||
189 | (unsigned long long)bh->b_blocknr, | ||
190 | le16_to_cpu(gd->bg_bits), | ||
191 | le16_to_cpu(gd->bg_free_bits_count)); | ||
192 | return -EINVAL; | ||
193 | } | ||
194 | |||
195 | if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) { | ||
196 | do_error("Group descriptor #%llu has bit count %u but " | ||
197 | "max bitmap bits of %u", | ||
198 | (unsigned long long)bh->b_blocknr, | ||
199 | le16_to_cpu(gd->bg_bits), | ||
200 | 8 * le16_to_cpu(gd->bg_size)); | ||
201 | return -EINVAL; | ||
202 | } | ||
203 | |||
204 | return 0; | ||
205 | } | ||
206 | |||
207 | static int ocfs2_validate_gd_parent(struct super_block *sb, | ||
208 | struct ocfs2_dinode *di, | ||
209 | struct buffer_head *bh, | ||
210 | int clean_error) | ||
211 | { | ||
212 | unsigned int max_bits; | ||
213 | struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; | ||
214 | |||
160 | if (di->i_blkno != gd->bg_parent_dinode) { | 215 | if (di->i_blkno != gd->bg_parent_dinode) { |
161 | ocfs2_error(sb, "Group descriptor # %llu has bad parent " | 216 | do_error("Group descriptor #%llu has bad parent " |
162 | "pointer (%llu, expected %llu)", | 217 | "pointer (%llu, expected %llu)", |
163 | (unsigned long long)le64_to_cpu(gd->bg_blkno), | 218 | (unsigned long long)bh->b_blocknr, |
164 | (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), | 219 | (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), |
165 | (unsigned long long)le64_to_cpu(di->i_blkno)); | 220 | (unsigned long long)le64_to_cpu(di->i_blkno)); |
166 | return -EIO; | 221 | return -EINVAL; |
167 | } | 222 | } |
168 | 223 | ||
169 | max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc); | 224 | max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc); |
170 | if (le16_to_cpu(gd->bg_bits) > max_bits) { | 225 | if (le16_to_cpu(gd->bg_bits) > max_bits) { |
171 | ocfs2_error(sb, "Group descriptor # %llu has bit count of %u", | 226 | do_error("Group descriptor #%llu has bit count of %u", |
172 | (unsigned long long)le64_to_cpu(gd->bg_blkno), | 227 | (unsigned long long)bh->b_blocknr, |
173 | le16_to_cpu(gd->bg_bits)); | 228 | le16_to_cpu(gd->bg_bits)); |
174 | return -EIO; | 229 | return -EINVAL; |
175 | } | 230 | } |
176 | 231 | ||
177 | if (le16_to_cpu(gd->bg_chain) >= | 232 | if (le16_to_cpu(gd->bg_chain) >= |
178 | le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) { | 233 | le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) { |
179 | ocfs2_error(sb, "Group descriptor # %llu has bad chain %u", | 234 | do_error("Group descriptor #%llu has bad chain %u", |
180 | (unsigned long long)le64_to_cpu(gd->bg_blkno), | 235 | (unsigned long long)bh->b_blocknr, |
181 | le16_to_cpu(gd->bg_chain)); | 236 | le16_to_cpu(gd->bg_chain)); |
182 | return -EIO; | 237 | return -EINVAL; |
183 | } | 238 | } |
184 | 239 | ||
185 | if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) { | 240 | return 0; |
186 | ocfs2_error(sb, "Group descriptor # %llu has bit count %u but " | 241 | } |
187 | "claims that %u are free", | ||
188 | (unsigned long long)le64_to_cpu(gd->bg_blkno), | ||
189 | le16_to_cpu(gd->bg_bits), | ||
190 | le16_to_cpu(gd->bg_free_bits_count)); | ||
191 | return -EIO; | ||
192 | } | ||
193 | 242 | ||
194 | if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) { | 243 | #undef do_error |
195 | ocfs2_error(sb, "Group descriptor # %llu has bit count %u but " | 244 | |
196 | "max bitmap bits of %u", | 245 | /* |
197 | (unsigned long long)le64_to_cpu(gd->bg_blkno), | 246 | * This version only prints errors. It does not fail the filesystem, and |
198 | le16_to_cpu(gd->bg_bits), | 247 | * exists only for resize. |
199 | 8 * le16_to_cpu(gd->bg_size)); | 248 | */ |
200 | return -EIO; | 249 | int ocfs2_check_group_descriptor(struct super_block *sb, |
250 | struct ocfs2_dinode *di, | ||
251 | struct buffer_head *bh) | ||
252 | { | ||
253 | int rc; | ||
254 | struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; | ||
255 | |||
256 | BUG_ON(!buffer_uptodate(bh)); | ||
257 | |||
258 | /* | ||
259 | * If the ecc fails, we return the error but otherwise | ||
260 | * leave the filesystem running. We know any error is | ||
261 | * local to this block. | ||
262 | */ | ||
263 | rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check); | ||
264 | if (rc) { | ||
265 | mlog(ML_ERROR, | ||
266 | "Checksum failed for group descriptor %llu\n", | ||
267 | (unsigned long long)bh->b_blocknr); | ||
268 | } else | ||
269 | rc = ocfs2_validate_gd_self(sb, bh, 1); | ||
270 | if (!rc) | ||
271 | rc = ocfs2_validate_gd_parent(sb, di, bh, 1); | ||
272 | |||
273 | return rc; | ||
274 | } | ||
275 | |||
276 | static int ocfs2_validate_group_descriptor(struct super_block *sb, | ||
277 | struct buffer_head *bh) | ||
278 | { | ||
279 | int rc; | ||
280 | struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; | ||
281 | |||
282 | mlog(0, "Validating group descriptor %llu\n", | ||
283 | (unsigned long long)bh->b_blocknr); | ||
284 | |||
285 | BUG_ON(!buffer_uptodate(bh)); | ||
286 | |||
287 | /* | ||
288 | * If the ecc fails, we return the error but otherwise | ||
289 | * leave the filesystem running. We know any error is | ||
290 | * local to this block. | ||
291 | */ | ||
292 | rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check); | ||
293 | if (rc) | ||
294 | return rc; | ||
295 | |||
296 | /* | ||
297 | * Errors after here are fatal. | ||
298 | */ | ||
299 | |||
300 | return ocfs2_validate_gd_self(sb, bh, 0); | ||
301 | } | ||
302 | |||
303 | int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di, | ||
304 | u64 gd_blkno, struct buffer_head **bh) | ||
305 | { | ||
306 | int rc; | ||
307 | struct buffer_head *tmp = *bh; | ||
308 | |||
309 | rc = ocfs2_read_block(inode, gd_blkno, &tmp, | ||
310 | ocfs2_validate_group_descriptor); | ||
311 | if (rc) | ||
312 | goto out; | ||
313 | |||
314 | rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0); | ||
315 | if (rc) { | ||
316 | brelse(tmp); | ||
317 | goto out; | ||
201 | } | 318 | } |
202 | 319 | ||
203 | return 0; | 320 | /* If ocfs2_read_block() got us a new bh, pass it up. */ |
321 | if (!*bh) | ||
322 | *bh = tmp; | ||
323 | |||
324 | out: | ||
325 | return rc; | ||
204 | } | 326 | } |
205 | 327 | ||
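Based on the call sites converted later in this patch (ocfs2_search_one_group(), ocfs2_search_chain(), ocfs2_free_suballoc_bits()), the caller pattern for the new helper is roughly the sketch below; alloc_inode, di and gd_blkno stand in for whatever the caller already holds:

struct buffer_head *gd_bh = NULL;	/* NULL asks the helper to allocate a bh */
struct ocfs2_group_desc *gd;
int ret;

ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno, &gd_bh);
if (ret) {
	mlog_errno(ret);
	return ret;
}

/* The descriptor behind gd_bh has already passed the ECC, self and
 * parent checks, so later validity tests can be plain BUG_ON()s. */
gd = (struct ocfs2_group_desc *)gd_bh->b_data;

/* ... use gd ... */

brelse(gd_bh);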
206 | static int ocfs2_block_group_fill(handle_t *handle, | 328 | static int ocfs2_block_group_fill(handle_t *handle, |
@@ -225,10 +347,10 @@ static int ocfs2_block_group_fill(handle_t *handle, | |||
225 | goto bail; | 347 | goto bail; |
226 | } | 348 | } |
227 | 349 | ||
228 | status = ocfs2_journal_access(handle, | 350 | status = ocfs2_journal_access_gd(handle, |
229 | alloc_inode, | 351 | alloc_inode, |
230 | bg_bh, | 352 | bg_bh, |
231 | OCFS2_JOURNAL_ACCESS_CREATE); | 353 | OCFS2_JOURNAL_ACCESS_CREATE); |
232 | if (status < 0) { | 354 | if (status < 0) { |
233 | mlog_errno(status); | 355 | mlog_errno(status); |
234 | goto bail; | 356 | goto bail; |
@@ -358,8 +480,8 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, | |||
358 | 480 | ||
359 | bg = (struct ocfs2_group_desc *) bg_bh->b_data; | 481 | bg = (struct ocfs2_group_desc *) bg_bh->b_data; |
360 | 482 | ||
361 | status = ocfs2_journal_access(handle, alloc_inode, | 483 | status = ocfs2_journal_access_di(handle, alloc_inode, |
362 | bh, OCFS2_JOURNAL_ACCESS_WRITE); | 484 | bh, OCFS2_JOURNAL_ACCESS_WRITE); |
363 | if (status < 0) { | 485 | if (status < 0) { |
364 | mlog_errno(status); | 486 | mlog_errno(status); |
365 | goto bail; | 487 | goto bail; |
@@ -441,11 +563,11 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, | |||
441 | ac->ac_alloc_slot = slot; | 563 | ac->ac_alloc_slot = slot; |
442 | 564 | ||
443 | fe = (struct ocfs2_dinode *) bh->b_data; | 565 | fe = (struct ocfs2_dinode *) bh->b_data; |
444 | if (!OCFS2_IS_VALID_DINODE(fe)) { | 566 | |
445 | OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); | 567 | /* The bh was validated by the inode read inside |
446 | status = -EIO; | 568 | * ocfs2_inode_lock(). Any corruption is a code bug. */ |
447 | goto bail; | 569 | BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); |
448 | } | 570 | |
449 | if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) { | 571 | if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) { |
450 | ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu", | 572 | ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu", |
451 | (unsigned long long)le64_to_cpu(fe->i_blkno)); | 573 | (unsigned long long)le64_to_cpu(fe->i_blkno)); |
@@ -790,10 +912,9 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, | |||
790 | int offset, start, found, status = 0; | 912 | int offset, start, found, status = 0; |
791 | struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; | 913 | struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; |
792 | 914 | ||
793 | if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { | 915 | /* Callers got this descriptor from |
794 | OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg); | 916 | * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ |
795 | return -EIO; | 917 | BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); |
796 | } | ||
797 | 918 | ||
798 | found = start = best_offset = best_size = 0; | 919 | found = start = best_offset = best_size = 0; |
799 | bitmap = bg->bg_bitmap; | 920 | bitmap = bg->bg_bitmap; |
@@ -858,11 +979,9 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle, | |||
858 | 979 | ||
859 | mlog_entry_void(); | 980 | mlog_entry_void(); |
860 | 981 | ||
861 | if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { | 982 | /* All callers get the descriptor via |
862 | OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); | 983 | * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ |
863 | status = -EIO; | 984 | BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); |
864 | goto bail; | ||
865 | } | ||
866 | BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); | 985 | BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); |
867 | 986 | ||
868 | mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, | 987 | mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, |
@@ -871,10 +990,10 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle, | |||
871 | if (ocfs2_is_cluster_bitmap(alloc_inode)) | 990 | if (ocfs2_is_cluster_bitmap(alloc_inode)) |
872 | journal_type = OCFS2_JOURNAL_ACCESS_UNDO; | 991 | journal_type = OCFS2_JOURNAL_ACCESS_UNDO; |
873 | 992 | ||
874 | status = ocfs2_journal_access(handle, | 993 | status = ocfs2_journal_access_gd(handle, |
875 | alloc_inode, | 994 | alloc_inode, |
876 | group_bh, | 995 | group_bh, |
877 | journal_type); | 996 | journal_type); |
878 | if (status < 0) { | 997 | if (status < 0) { |
879 | mlog_errno(status); | 998 | mlog_errno(status); |
880 | goto bail; | 999 | goto bail; |
@@ -931,21 +1050,10 @@ static int ocfs2_relink_block_group(handle_t *handle, | |||
931 | struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; | 1050 | struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; |
932 | struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data; | 1051 | struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data; |
933 | 1052 | ||
934 | if (!OCFS2_IS_VALID_DINODE(fe)) { | 1053 | /* The caller got these descriptors from |
935 | OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); | 1054 | * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ |
936 | status = -EIO; | 1055 | BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); |
937 | goto out; | 1056 | BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg)); |
938 | } | ||
939 | if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { | ||
940 | OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); | ||
941 | status = -EIO; | ||
942 | goto out; | ||
943 | } | ||
944 | if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) { | ||
945 | OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg); | ||
946 | status = -EIO; | ||
947 | goto out; | ||
948 | } | ||
949 | 1057 | ||
950 | mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n", | 1058 | mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n", |
951 | (unsigned long long)le64_to_cpu(fe->i_blkno), chain, | 1059 | (unsigned long long)le64_to_cpu(fe->i_blkno), chain, |
@@ -956,8 +1064,8 @@ static int ocfs2_relink_block_group(handle_t *handle, | |||
956 | bg_ptr = le64_to_cpu(bg->bg_next_group); | 1064 | bg_ptr = le64_to_cpu(bg->bg_next_group); |
957 | prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group); | 1065 | prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group); |
958 | 1066 | ||
959 | status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh, | 1067 | status = ocfs2_journal_access_gd(handle, alloc_inode, prev_bg_bh, |
960 | OCFS2_JOURNAL_ACCESS_WRITE); | 1068 | OCFS2_JOURNAL_ACCESS_WRITE); |
961 | if (status < 0) { | 1069 | if (status < 0) { |
962 | mlog_errno(status); | 1070 | mlog_errno(status); |
963 | goto out_rollback; | 1071 | goto out_rollback; |
@@ -971,8 +1079,8 @@ static int ocfs2_relink_block_group(handle_t *handle, | |||
971 | goto out_rollback; | 1079 | goto out_rollback; |
972 | } | 1080 | } |
973 | 1081 | ||
974 | status = ocfs2_journal_access(handle, alloc_inode, bg_bh, | 1082 | status = ocfs2_journal_access_gd(handle, alloc_inode, bg_bh, |
975 | OCFS2_JOURNAL_ACCESS_WRITE); | 1083 | OCFS2_JOURNAL_ACCESS_WRITE); |
976 | if (status < 0) { | 1084 | if (status < 0) { |
977 | mlog_errno(status); | 1085 | mlog_errno(status); |
978 | goto out_rollback; | 1086 | goto out_rollback; |
@@ -986,8 +1094,8 @@ static int ocfs2_relink_block_group(handle_t *handle, | |||
986 | goto out_rollback; | 1094 | goto out_rollback; |
987 | } | 1095 | } |
988 | 1096 | ||
989 | status = ocfs2_journal_access(handle, alloc_inode, fe_bh, | 1097 | status = ocfs2_journal_access_di(handle, alloc_inode, fe_bh, |
990 | OCFS2_JOURNAL_ACCESS_WRITE); | 1098 | OCFS2_JOURNAL_ACCESS_WRITE); |
991 | if (status < 0) { | 1099 | if (status < 0) { |
992 | mlog_errno(status); | 1100 | mlog_errno(status); |
993 | goto out_rollback; | 1101 | goto out_rollback; |
@@ -1008,7 +1116,7 @@ out_rollback: | |||
1008 | bg->bg_next_group = cpu_to_le64(bg_ptr); | 1116 | bg->bg_next_group = cpu_to_le64(bg_ptr); |
1009 | prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); | 1117 | prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); |
1010 | } | 1118 | } |
1011 | out: | 1119 | |
1012 | mlog_exit(status); | 1120 | mlog_exit(status); |
1013 | return status; | 1121 | return status; |
1014 | } | 1122 | } |
@@ -1138,8 +1246,8 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode, | |||
1138 | struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; | 1246 | struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; |
1139 | struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain; | 1247 | struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain; |
1140 | 1248 | ||
1141 | ret = ocfs2_journal_access(handle, inode, di_bh, | 1249 | ret = ocfs2_journal_access_di(handle, inode, di_bh, |
1142 | OCFS2_JOURNAL_ACCESS_WRITE); | 1250 | OCFS2_JOURNAL_ACCESS_WRITE); |
1143 | if (ret < 0) { | 1251 | if (ret < 0) { |
1144 | mlog_errno(ret); | 1252 | mlog_errno(ret); |
1145 | goto out; | 1253 | goto out; |
@@ -1170,21 +1278,17 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, | |||
1170 | u16 found; | 1278 | u16 found; |
1171 | struct buffer_head *group_bh = NULL; | 1279 | struct buffer_head *group_bh = NULL; |
1172 | struct ocfs2_group_desc *gd; | 1280 | struct ocfs2_group_desc *gd; |
1281 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data; | ||
1173 | struct inode *alloc_inode = ac->ac_inode; | 1282 | struct inode *alloc_inode = ac->ac_inode; |
1174 | 1283 | ||
1175 | ret = ocfs2_read_block(alloc_inode, gd_blkno, &group_bh); | 1284 | ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno, |
1285 | &group_bh); | ||
1176 | if (ret < 0) { | 1286 | if (ret < 0) { |
1177 | mlog_errno(ret); | 1287 | mlog_errno(ret); |
1178 | return ret; | 1288 | return ret; |
1179 | } | 1289 | } |
1180 | 1290 | ||
1181 | gd = (struct ocfs2_group_desc *) group_bh->b_data; | 1291 | gd = (struct ocfs2_group_desc *) group_bh->b_data; |
1182 | if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { | ||
1183 | OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, gd); | ||
1184 | ret = -EIO; | ||
1185 | goto out; | ||
1186 | } | ||
1187 | |||
1188 | ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, | 1292 | ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, |
1189 | ac->ac_max_block, bit_off, &found); | 1293 | ac->ac_max_block, bit_off, &found); |
1190 | if (ret < 0) { | 1294 | if (ret < 0) { |
@@ -1241,19 +1345,14 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, | |||
1241 | bits_wanted, chain, | 1345 | bits_wanted, chain, |
1242 | (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno); | 1346 | (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno); |
1243 | 1347 | ||
1244 | status = ocfs2_read_block(alloc_inode, | 1348 | status = ocfs2_read_group_descriptor(alloc_inode, fe, |
1245 | le64_to_cpu(cl->cl_recs[chain].c_blkno), | 1349 | le64_to_cpu(cl->cl_recs[chain].c_blkno), |
1246 | &group_bh); | 1350 | &group_bh); |
1247 | if (status < 0) { | 1351 | if (status < 0) { |
1248 | mlog_errno(status); | 1352 | mlog_errno(status); |
1249 | goto bail; | 1353 | goto bail; |
1250 | } | 1354 | } |
1251 | bg = (struct ocfs2_group_desc *) group_bh->b_data; | 1355 | bg = (struct ocfs2_group_desc *) group_bh->b_data; |
1252 | status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg); | ||
1253 | if (status) { | ||
1254 | mlog_errno(status); | ||
1255 | goto bail; | ||
1256 | } | ||
1257 | 1356 | ||
1258 | status = -ENOSPC; | 1357 | status = -ENOSPC; |
1259 | /* for now, the chain search is a bit simplistic. We just use | 1358 | /* for now, the chain search is a bit simplistic. We just use |
@@ -1271,18 +1370,13 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, | |||
1271 | next_group = le64_to_cpu(bg->bg_next_group); | 1370 | next_group = le64_to_cpu(bg->bg_next_group); |
1272 | prev_group_bh = group_bh; | 1371 | prev_group_bh = group_bh; |
1273 | group_bh = NULL; | 1372 | group_bh = NULL; |
1274 | status = ocfs2_read_block(alloc_inode, | 1373 | status = ocfs2_read_group_descriptor(alloc_inode, fe, |
1275 | next_group, &group_bh); | 1374 | next_group, &group_bh); |
1276 | if (status < 0) { | 1375 | if (status < 0) { |
1277 | mlog_errno(status); | 1376 | mlog_errno(status); |
1278 | goto bail; | 1377 | goto bail; |
1279 | } | 1378 | } |
1280 | bg = (struct ocfs2_group_desc *) group_bh->b_data; | 1379 | bg = (struct ocfs2_group_desc *) group_bh->b_data; |
1281 | status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg); | ||
1282 | if (status) { | ||
1283 | mlog_errno(status); | ||
1284 | goto bail; | ||
1285 | } | ||
1286 | } | 1380 | } |
1287 | if (status < 0) { | 1381 | if (status < 0) { |
1288 | if (status != -ENOSPC) | 1382 | if (status != -ENOSPC) |
@@ -1324,10 +1418,10 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, | |||
1324 | 1418 | ||
1325 | /* Ok, claim our bits now: set the info on dinode, chainlist | 1419 | /* Ok, claim our bits now: set the info on dinode, chainlist |
1326 | * and then the group */ | 1420 | * and then the group */ |
1327 | status = ocfs2_journal_access(handle, | 1421 | status = ocfs2_journal_access_di(handle, |
1328 | alloc_inode, | 1422 | alloc_inode, |
1329 | ac->ac_bh, | 1423 | ac->ac_bh, |
1330 | OCFS2_JOURNAL_ACCESS_WRITE); | 1424 | OCFS2_JOURNAL_ACCESS_WRITE); |
1331 | if (status < 0) { | 1425 | if (status < 0) { |
1332 | mlog_errno(status); | 1426 | mlog_errno(status); |
1333 | goto bail; | 1427 | goto bail; |
@@ -1392,11 +1486,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, | |||
1392 | BUG_ON(!ac->ac_bh); | 1486 | BUG_ON(!ac->ac_bh); |
1393 | 1487 | ||
1394 | fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; | 1488 | fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; |
1395 | if (!OCFS2_IS_VALID_DINODE(fe)) { | 1489 | |
1396 | OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe); | 1490 | /* The bh was validated by the inode read during |
1397 | status = -EIO; | 1491 | * ocfs2_reserve_suballoc_bits(). Any corruption is a code bug. */ |
1398 | goto bail; | 1492 | BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); |
1399 | } | 1493 | |
1400 | if (le32_to_cpu(fe->id1.bitmap1.i_used) >= | 1494 | if (le32_to_cpu(fe->id1.bitmap1.i_used) >= |
1401 | le32_to_cpu(fe->id1.bitmap1.i_total)) { | 1495 | le32_to_cpu(fe->id1.bitmap1.i_total)) { |
1402 | ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used " | 1496 | ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used " |
@@ -1725,19 +1819,17 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle, | |||
1725 | 1819 | ||
1726 | mlog_entry_void(); | 1820 | mlog_entry_void(); |
1727 | 1821 | ||
1728 | if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { | 1822 | /* The caller got this descriptor from |
1729 | OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); | 1823 | * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ |
1730 | status = -EIO; | 1824 | BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); |
1731 | goto bail; | ||
1732 | } | ||
1733 | 1825 | ||
1734 | mlog(0, "off = %u, num = %u\n", bit_off, num_bits); | 1826 | mlog(0, "off = %u, num = %u\n", bit_off, num_bits); |
1735 | 1827 | ||
1736 | if (ocfs2_is_cluster_bitmap(alloc_inode)) | 1828 | if (ocfs2_is_cluster_bitmap(alloc_inode)) |
1737 | journal_type = OCFS2_JOURNAL_ACCESS_UNDO; | 1829 | journal_type = OCFS2_JOURNAL_ACCESS_UNDO; |
1738 | 1830 | ||
1739 | status = ocfs2_journal_access(handle, alloc_inode, group_bh, | 1831 | status = ocfs2_journal_access_gd(handle, alloc_inode, group_bh, |
1740 | journal_type); | 1832 | journal_type); |
1741 | if (status < 0) { | 1833 | if (status < 0) { |
1742 | mlog_errno(status); | 1834 | mlog_errno(status); |
1743 | goto bail; | 1835 | goto bail; |
@@ -1782,29 +1874,26 @@ int ocfs2_free_suballoc_bits(handle_t *handle, | |||
1782 | 1874 | ||
1783 | mlog_entry_void(); | 1875 | mlog_entry_void(); |
1784 | 1876 | ||
1785 | if (!OCFS2_IS_VALID_DINODE(fe)) { | 1877 | /* The alloc_bh comes from ocfs2_free_dinode() or |
1786 | OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); | 1878 | * ocfs2_free_clusters(). The callers have all locked the |
1787 | status = -EIO; | 1879 | * allocator and gotten alloc_bh from the lock call. This |
1788 | goto bail; | 1880 | * validates the dinode buffer. Any corruption that has happended |
1789 | } | 1881 | * is a code bug. */ |
1882 | BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); | ||
1790 | BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl)); | 1883 | BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl)); |
1791 | 1884 | ||
1792 | mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n", | 1885 | mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n", |
1793 | (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count, | 1886 | (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count, |
1794 | (unsigned long long)bg_blkno, start_bit); | 1887 | (unsigned long long)bg_blkno, start_bit); |
1795 | 1888 | ||
1796 | status = ocfs2_read_block(alloc_inode, bg_blkno, &group_bh); | 1889 | status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno, |
1890 | &group_bh); | ||
1797 | if (status < 0) { | 1891 | if (status < 0) { |
1798 | mlog_errno(status); | 1892 | mlog_errno(status); |
1799 | goto bail; | 1893 | goto bail; |
1800 | } | 1894 | } |
1801 | |||
1802 | group = (struct ocfs2_group_desc *) group_bh->b_data; | 1895 | group = (struct ocfs2_group_desc *) group_bh->b_data; |
1803 | status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, group); | 1896 | |
1804 | if (status) { | ||
1805 | mlog_errno(status); | ||
1806 | goto bail; | ||
1807 | } | ||
1808 | BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits)); | 1897 | BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits)); |
1809 | 1898 | ||
1810 | status = ocfs2_block_group_clear_bits(handle, alloc_inode, | 1899 | status = ocfs2_block_group_clear_bits(handle, alloc_inode, |
@@ -1815,8 +1904,8 @@ int ocfs2_free_suballoc_bits(handle_t *handle, | |||
1815 | goto bail; | 1904 | goto bail; |
1816 | } | 1905 | } |
1817 | 1906 | ||
1818 | status = ocfs2_journal_access(handle, alloc_inode, alloc_bh, | 1907 | status = ocfs2_journal_access_di(handle, alloc_inode, alloc_bh, |
1819 | OCFS2_JOURNAL_ACCESS_WRITE); | 1908 | OCFS2_JOURNAL_ACCESS_WRITE); |
1820 | if (status < 0) { | 1909 | if (status < 0) { |
1821 | mlog_errno(status); | 1910 | mlog_errno(status); |
1822 | goto bail; | 1911 | goto bail; |
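The suballoc.c hunks above all make the same conversion: buffers now arrive through validating readers (the inode read behind ocfs2_reserve_suballoc_bits(), ocfs2_read_group_descriptor()), so the old OCFS2_IS_VALID_*()/-EIO fallbacks become BUG_ON() assertions, and the generic ocfs2_journal_access() calls become the typed ocfs2_journal_access_di()/_gd() variants. A minimal caller-side sketch of the resulting pattern, assuming the fs/ocfs2 build context (the function name is illustrative and error handling is abbreviated):

    /* Illustrative sketch only; not a compilable unit outside fs/ocfs2. */
    static int example_modify_group(handle_t *handle, struct inode *alloc_inode,
                                    struct ocfs2_dinode *di, u64 gd_blkno)
    {
            struct buffer_head *group_bh = NULL;
            struct ocfs2_group_desc *gd;
            int status;

            /* The read helper validates the descriptor (or calls ocfs2_error()). */
            status = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno, &group_bh);
            if (status < 0)
                    return status;

            gd = (struct ocfs2_group_desc *)group_bh->b_data;
            /* Past this point, corruption can only be a code bug. */
            BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(gd));

            /* Typed journal access: _gd for group descriptors, _di for dinodes. */
            status = ocfs2_journal_access_gd(handle, alloc_inode, group_bh,
                                             OCFS2_JOURNAL_ACCESS_WRITE);
            /* ... modify the descriptor and dirty group_bh ... */

            brelse(group_bh);
            return status;
    }

Keeping validation in the read path means on-disk corruption is reported exactly once, when the block enters memory; everything downstream may treat an invalid descriptor as a programming error.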
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index 4df159d8f450..e3c13c77f9e8 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h | |||
@@ -164,10 +164,24 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac); | |||
164 | * and return that block offset. */ | 164 | * and return that block offset. */ |
165 | u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster); | 165 | u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster); |
166 | 166 | ||
167 | /* somewhat more expensive than our other checks, so use sparingly. */ | 167 | /* |
168 | * By default, ocfs2_read_group_descriptor() calls ocfs2_error() when it | ||
169 | * finds a problem. A caller that wants to check a group descriptor | ||
170 | * without going readonly should read the block with ocfs2_read_block[s]() | ||
171 | * and then check it with this function. In practice only resize does this. | ||
172 | * Everyone else should be using ocfs2_read_group_descriptor(). | ||
173 | */ | ||
168 | int ocfs2_check_group_descriptor(struct super_block *sb, | 174 | int ocfs2_check_group_descriptor(struct super_block *sb, |
169 | struct ocfs2_dinode *di, | 175 | struct ocfs2_dinode *di, |
170 | struct ocfs2_group_desc *gd); | 176 | struct buffer_head *bh); |
177 | /* | ||
178 | * Read a group descriptor block into *bh. If *bh is NULL, a bh will be | ||
179 | * allocated. This is a cached read. The descriptor will be validated with | ||
180 | * ocfs2_validate_group_descriptor(). | ||
181 | */ | ||
182 | int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di, | ||
183 | u64 gd_blkno, struct buffer_head **bh); | ||
184 | |||
171 | int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et, | 185 | int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et, |
172 | u32 clusters_to_add, u32 extents_to_split, | 186 | u32 clusters_to_add, u32 extents_to_split, |
173 | struct ocfs2_alloc_context **data_ac, | 187 | struct ocfs2_alloc_context **data_ac, |
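For the one caller that must not trip ocfs2_error() and flip the volume read-only (the resize case mentioned in the comment above), ocfs2_check_group_descriptor() stays exported, now taking the buffer_head itself. A hedged sketch of that path, assuming the four-argument ocfs2_read_block() form used elsewhere in this series with a NULL validator (the function name is illustrative):

    static int example_check_gd_for_resize(struct inode *alloc_inode,
                                           struct ocfs2_dinode *di, u64 gd_blkno)
    {
            struct buffer_head *bh = NULL;
            int status;

            /* Plain cached read, no validation callback, so no ocfs2_error(). */
            status = ocfs2_read_block(alloc_inode, gd_blkno, &bh, NULL);
            if (status < 0)
                    return status;

            /* Explicit check; on failure the caller decides what to do. */
            status = ocfs2_check_group_descriptor(alloc_inode->i_sb, di, bh);

            brelse(bh);
            return status;
    }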
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 304b63ac78cf..43ed11345b59 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c | |||
@@ -41,6 +41,7 @@ | |||
41 | #include <linux/debugfs.h> | 41 | #include <linux/debugfs.h> |
42 | #include <linux/mount.h> | 42 | #include <linux/mount.h> |
43 | #include <linux/seq_file.h> | 43 | #include <linux/seq_file.h> |
44 | #include <linux/quotaops.h> | ||
44 | 45 | ||
45 | #define MLOG_MASK_PREFIX ML_SUPER | 46 | #define MLOG_MASK_PREFIX ML_SUPER |
46 | #include <cluster/masklog.h> | 47 | #include <cluster/masklog.h> |
@@ -51,6 +52,7 @@ | |||
51 | #include "ocfs1_fs_compat.h" | 52 | #include "ocfs1_fs_compat.h" |
52 | 53 | ||
53 | #include "alloc.h" | 54 | #include "alloc.h" |
55 | #include "blockcheck.h" | ||
54 | #include "dlmglue.h" | 56 | #include "dlmglue.h" |
55 | #include "export.h" | 57 | #include "export.h" |
56 | #include "extent_map.h" | 58 | #include "extent_map.h" |
@@ -65,10 +67,13 @@ | |||
65 | #include "uptodate.h" | 67 | #include "uptodate.h" |
66 | #include "ver.h" | 68 | #include "ver.h" |
67 | #include "xattr.h" | 69 | #include "xattr.h" |
70 | #include "quota.h" | ||
68 | 71 | ||
69 | #include "buffer_head_io.h" | 72 | #include "buffer_head_io.h" |
70 | 73 | ||
71 | static struct kmem_cache *ocfs2_inode_cachep = NULL; | 74 | static struct kmem_cache *ocfs2_inode_cachep = NULL; |
75 | struct kmem_cache *ocfs2_dquot_cachep; | ||
76 | struct kmem_cache *ocfs2_qf_chunk_cachep; | ||
72 | 77 | ||
73 | /* OCFS2 needs to schedule several different types of work which | 78 | /* OCFS2 needs to schedule several different types of work which |
74 | * require cluster locking, disk I/O, recovery waits, etc. Since these | 79 | * require cluster locking, disk I/O, recovery waits, etc. Since these |
@@ -124,6 +129,9 @@ static int ocfs2_get_sector(struct super_block *sb, | |||
124 | static void ocfs2_write_super(struct super_block *sb); | 129 | static void ocfs2_write_super(struct super_block *sb); |
125 | static struct inode *ocfs2_alloc_inode(struct super_block *sb); | 130 | static struct inode *ocfs2_alloc_inode(struct super_block *sb); |
126 | static void ocfs2_destroy_inode(struct inode *inode); | 131 | static void ocfs2_destroy_inode(struct inode *inode); |
132 | static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend); | ||
133 | static int ocfs2_enable_quotas(struct ocfs2_super *osb); | ||
134 | static void ocfs2_disable_quotas(struct ocfs2_super *osb); | ||
127 | 135 | ||
128 | static const struct super_operations ocfs2_sops = { | 136 | static const struct super_operations ocfs2_sops = { |
129 | .statfs = ocfs2_statfs, | 137 | .statfs = ocfs2_statfs, |
@@ -137,6 +145,8 @@ static const struct super_operations ocfs2_sops = { | |||
137 | .put_super = ocfs2_put_super, | 145 | .put_super = ocfs2_put_super, |
138 | .remount_fs = ocfs2_remount, | 146 | .remount_fs = ocfs2_remount, |
139 | .show_options = ocfs2_show_options, | 147 | .show_options = ocfs2_show_options, |
148 | .quota_read = ocfs2_quota_read, | ||
149 | .quota_write = ocfs2_quota_write, | ||
140 | }; | 150 | }; |
141 | 151 | ||
142 | enum { | 152 | enum { |
@@ -158,6 +168,10 @@ enum { | |||
158 | Opt_user_xattr, | 168 | Opt_user_xattr, |
159 | Opt_nouser_xattr, | 169 | Opt_nouser_xattr, |
160 | Opt_inode64, | 170 | Opt_inode64, |
171 | Opt_acl, | ||
172 | Opt_noacl, | ||
173 | Opt_usrquota, | ||
174 | Opt_grpquota, | ||
161 | Opt_err, | 175 | Opt_err, |
162 | }; | 176 | }; |
163 | 177 | ||
@@ -180,6 +194,10 @@ static const match_table_t tokens = { | |||
180 | {Opt_user_xattr, "user_xattr"}, | 194 | {Opt_user_xattr, "user_xattr"}, |
181 | {Opt_nouser_xattr, "nouser_xattr"}, | 195 | {Opt_nouser_xattr, "nouser_xattr"}, |
182 | {Opt_inode64, "inode64"}, | 196 | {Opt_inode64, "inode64"}, |
197 | {Opt_acl, "acl"}, | ||
198 | {Opt_noacl, "noacl"}, | ||
199 | {Opt_usrquota, "usrquota"}, | ||
200 | {Opt_grpquota, "grpquota"}, | ||
183 | {Opt_err, NULL} | 201 | {Opt_err, NULL} |
184 | }; | 202 | }; |
185 | 203 | ||
@@ -221,6 +239,19 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait) | |||
221 | return 0; | 239 | return 0; |
222 | } | 240 | } |
223 | 241 | ||
242 | static int ocfs2_need_system_inode(struct ocfs2_super *osb, int ino) | ||
243 | { | ||
244 | if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA) | ||
245 | && (ino == USER_QUOTA_SYSTEM_INODE | ||
246 | || ino == LOCAL_USER_QUOTA_SYSTEM_INODE)) | ||
247 | return 0; | ||
248 | if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) | ||
249 | && (ino == GROUP_QUOTA_SYSTEM_INODE | ||
250 | || ino == LOCAL_GROUP_QUOTA_SYSTEM_INODE)) | ||
251 | return 0; | ||
252 | return 1; | ||
253 | } | ||
254 | |||
224 | static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) | 255 | static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) |
225 | { | 256 | { |
226 | struct inode *new = NULL; | 257 | struct inode *new = NULL; |
@@ -247,6 +278,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) | |||
247 | 278 | ||
248 | for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE; | 279 | for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE; |
249 | i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) { | 280 | i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) { |
281 | if (!ocfs2_need_system_inode(osb, i)) | ||
282 | continue; | ||
250 | new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); | 283 | new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); |
251 | if (!new) { | 284 | if (!new) { |
252 | ocfs2_release_system_inodes(osb); | 285 | ocfs2_release_system_inodes(osb); |
@@ -277,6 +310,8 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb) | |||
277 | for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1; | 310 | for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1; |
278 | i < NUM_SYSTEM_INODES; | 311 | i < NUM_SYSTEM_INODES; |
279 | i++) { | 312 | i++) { |
313 | if (!ocfs2_need_system_inode(osb, i)) | ||
314 | continue; | ||
280 | new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); | 315 | new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); |
281 | if (!new) { | 316 | if (!new) { |
282 | ocfs2_release_system_inodes(osb); | 317 | ocfs2_release_system_inodes(osb); |
@@ -426,6 +461,12 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) | |||
426 | 461 | ||
427 | /* We're going to/from readonly mode. */ | 462 | /* We're going to/from readonly mode. */ |
428 | if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { | 463 | if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { |
464 | /* Disable quota accounting before remounting RO */ | ||
465 | if (*flags & MS_RDONLY) { | ||
466 | ret = ocfs2_susp_quotas(osb, 0); | ||
467 | if (ret < 0) | ||
468 | goto out; | ||
469 | } | ||
429 | /* Lock here so the check of HARD_RO and the potential | 470 | /* Lock here so the check of HARD_RO and the potential |
430 | * setting of SOFT_RO is atomic. */ | 471 | * setting of SOFT_RO is atomic. */ |
431 | spin_lock(&osb->osb_lock); | 472 | spin_lock(&osb->osb_lock); |
@@ -461,11 +502,28 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) | |||
461 | } | 502 | } |
462 | unlock_osb: | 503 | unlock_osb: |
463 | spin_unlock(&osb->osb_lock); | 504 | spin_unlock(&osb->osb_lock); |
505 | /* Enable quota accounting after remounting RW */ | ||
506 | if (!ret && !(*flags & MS_RDONLY)) { | ||
507 | if (sb_any_quota_suspended(sb)) | ||
508 | ret = ocfs2_susp_quotas(osb, 1); | ||
509 | else | ||
510 | ret = ocfs2_enable_quotas(osb); | ||
511 | if (ret < 0) { | ||
512 | /* Roll back the changes... */ | ||
513 | spin_lock(&osb->osb_lock); | ||
514 | sb->s_flags |= MS_RDONLY; | ||
515 | osb->osb_flags |= OCFS2_OSB_SOFT_RO; | ||
516 | spin_unlock(&osb->osb_lock); | ||
517 | goto out; | ||
518 | } | ||
519 | } | ||
464 | } | 520 | } |
465 | 521 | ||
466 | if (!ret) { | 522 | if (!ret) { |
467 | /* Only save off the new mount options in case of a successful | 523 | /* Only save off the new mount options in case of a successful |
468 | * remount. */ | 524 | * remount. */ |
525 | if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR)) | ||
526 | parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; | ||
469 | osb->s_mount_opt = parsed_options.mount_opt; | 527 | osb->s_mount_opt = parsed_options.mount_opt; |
470 | osb->s_atime_quantum = parsed_options.atime_quantum; | 528 | osb->s_atime_quantum = parsed_options.atime_quantum; |
471 | osb->preferred_slot = parsed_options.slot; | 529 | osb->preferred_slot = parsed_options.slot; |
@@ -619,6 +677,131 @@ static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb, | |||
619 | return 0; | 677 | return 0; |
620 | } | 678 | } |
621 | 679 | ||
680 | static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend) | ||
681 | { | ||
682 | int type; | ||
683 | struct super_block *sb = osb->sb; | ||
684 | unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, | ||
685 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; | ||
686 | int status = 0; | ||
687 | |||
688 | for (type = 0; type < MAXQUOTAS; type++) { | ||
689 | if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) | ||
690 | continue; | ||
691 | if (unsuspend) | ||
692 | status = vfs_quota_enable( | ||
693 | sb_dqopt(sb)->files[type], | ||
694 | type, QFMT_OCFS2, | ||
695 | DQUOT_SUSPENDED); | ||
696 | else | ||
697 | status = vfs_quota_disable(sb, type, | ||
698 | DQUOT_SUSPENDED); | ||
699 | if (status < 0) | ||
700 | break; | ||
701 | } | ||
702 | if (status < 0) | ||
703 | mlog(ML_ERROR, "Failed to suspend/unsuspend quotas on " | ||
704 | "remount (error = %d).\n", status); | ||
705 | return status; | ||
706 | } | ||
707 | |||
708 | static int ocfs2_enable_quotas(struct ocfs2_super *osb) | ||
709 | { | ||
710 | struct inode *inode[MAXQUOTAS] = { NULL, NULL }; | ||
711 | struct super_block *sb = osb->sb; | ||
712 | unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, | ||
713 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; | ||
714 | unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, | ||
715 | LOCAL_GROUP_QUOTA_SYSTEM_INODE }; | ||
716 | int status; | ||
717 | int type; | ||
718 | |||
719 | sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE; | ||
720 | for (type = 0; type < MAXQUOTAS; type++) { | ||
721 | if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) | ||
722 | continue; | ||
723 | inode[type] = ocfs2_get_system_file_inode(osb, ino[type], | ||
724 | osb->slot_num); | ||
725 | if (!inode[type]) { | ||
726 | status = -ENOENT; | ||
727 | goto out_quota_off; | ||
728 | } | ||
729 | status = vfs_quota_enable(inode[type], type, QFMT_OCFS2, | ||
730 | DQUOT_USAGE_ENABLED); | ||
731 | if (status < 0) | ||
732 | goto out_quota_off; | ||
733 | } | ||
734 | |||
735 | for (type = 0; type < MAXQUOTAS; type++) | ||
736 | iput(inode[type]); | ||
737 | return 0; | ||
738 | out_quota_off: | ||
739 | ocfs2_disable_quotas(osb); | ||
740 | for (type = 0; type < MAXQUOTAS; type++) | ||
741 | iput(inode[type]); | ||
742 | mlog_errno(status); | ||
743 | return status; | ||
744 | } | ||
745 | |||
746 | static void ocfs2_disable_quotas(struct ocfs2_super *osb) | ||
747 | { | ||
748 | int type; | ||
749 | struct inode *inode; | ||
750 | struct super_block *sb = osb->sb; | ||
751 | |||
752 | /* We mostly ignore errors in this function because there's not much | ||
753 | * we can do when we see them */ | ||
754 | for (type = 0; type < MAXQUOTAS; type++) { | ||
755 | if (!sb_has_quota_loaded(sb, type)) | ||
756 | continue; | ||
757 | inode = igrab(sb->s_dquot.files[type]); | ||
758 | /* Turn off quotas. This will remove all dquot structures from | ||
759 | * memory and so they will be automatically synced to global | ||
760 | * quota files */ | ||
761 | vfs_quota_disable(sb, type, DQUOT_USAGE_ENABLED | | ||
762 | DQUOT_LIMITS_ENABLED); | ||
763 | if (!inode) | ||
764 | continue; | ||
765 | iput(inode); | ||
766 | } | ||
767 | } | ||
768 | |||
769 | /* Handle quota on quotactl */ | ||
770 | static int ocfs2_quota_on(struct super_block *sb, int type, int format_id, | ||
771 | char *path, int remount) | ||
772 | { | ||
773 | unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, | ||
774 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; | ||
775 | |||
776 | if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) | ||
777 | return -EINVAL; | ||
778 | |||
779 | if (remount) | ||
780 | return 0; /* Just ignore it; it has been handled in | ||
781 | * ocfs2_remount() */ | ||
782 | return vfs_quota_enable(sb_dqopt(sb)->files[type], type, | ||
783 | format_id, DQUOT_LIMITS_ENABLED); | ||
784 | } | ||
785 | |||
786 | /* Handle quota off quotactl */ | ||
787 | static int ocfs2_quota_off(struct super_block *sb, int type, int remount) | ||
788 | { | ||
789 | if (remount) | ||
790 | return 0; /* Ignore now and handle later in | ||
791 | * ocfs2_remount() */ | ||
792 | return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED); | ||
793 | } | ||
794 | |||
795 | static struct quotactl_ops ocfs2_quotactl_ops = { | ||
796 | .quota_on = ocfs2_quota_on, | ||
797 | .quota_off = ocfs2_quota_off, | ||
798 | .quota_sync = vfs_quota_sync, | ||
799 | .get_info = vfs_get_dqinfo, | ||
800 | .set_info = vfs_set_dqinfo, | ||
801 | .get_dqblk = vfs_get_dqblk, | ||
802 | .set_dqblk = vfs_set_dqblk, | ||
803 | }; | ||
804 | |||
622 | static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) | 805 | static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) |
623 | { | 806 | { |
624 | struct dentry *root; | 807 | struct dentry *root; |
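Two points are worth pulling out of the quota plumbing above. Every routine gates each quota type on the matching RO_COMPAT feature bit, so a type the volume was not formatted for is never touched. And usage accounting is decoupled from limit enforcement: mount-time ocfs2_enable_quotas() turns on DQUOT_USAGE_ENABLED against the local quota system inodes, while a later quotaon/quotaoff through ocfs2_quota_on()/ocfs2_quota_off() only toggles DQUOT_LIMITS_ENABLED, so accounting keeps running for as long as the filesystem is writable. A minimal sketch of the shared gate (the helper name is illustrative only):

    /* Illustrative only: the per-type feature gate the routines above share. */
    static int example_quota_type_supported(struct super_block *sb, int type)
    {
            unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
                                                OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };

            return OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]);
    }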
@@ -651,12 +834,32 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) | |||
651 | } | 834 | } |
652 | brelse(bh); | 835 | brelse(bh); |
653 | bh = NULL; | 836 | bh = NULL; |
837 | |||
838 | if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR)) | ||
839 | parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; | ||
840 | |||
654 | osb->s_mount_opt = parsed_options.mount_opt; | 841 | osb->s_mount_opt = parsed_options.mount_opt; |
655 | osb->s_atime_quantum = parsed_options.atime_quantum; | 842 | osb->s_atime_quantum = parsed_options.atime_quantum; |
656 | osb->preferred_slot = parsed_options.slot; | 843 | osb->preferred_slot = parsed_options.slot; |
657 | osb->osb_commit_interval = parsed_options.commit_interval; | 844 | osb->osb_commit_interval = parsed_options.commit_interval; |
658 | osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt); | 845 | osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt); |
659 | osb->local_alloc_bits = osb->local_alloc_default_bits; | 846 | osb->local_alloc_bits = osb->local_alloc_default_bits; |
847 | if (osb->s_mount_opt & OCFS2_MOUNT_USRQUOTA && | ||
848 | !OCFS2_HAS_RO_COMPAT_FEATURE(sb, | ||
849 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { | ||
850 | status = -EINVAL; | ||
851 | mlog(ML_ERROR, "User quotas were requested, but this " | ||
852 | "filesystem does not have the feature enabled.\n"); | ||
853 | goto read_super_error; | ||
854 | } | ||
855 | if (osb->s_mount_opt & OCFS2_MOUNT_GRPQUOTA && | ||
856 | !OCFS2_HAS_RO_COMPAT_FEATURE(sb, | ||
857 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { | ||
858 | status = -EINVAL; | ||
859 | mlog(ML_ERROR, "Group quotas were requested, but this " | ||
860 | "filesystem does not have the feature enabled.\n"); | ||
861 | goto read_super_error; | ||
862 | } | ||
660 | 863 | ||
661 | status = ocfs2_verify_userspace_stack(osb, &parsed_options); | 864 | status = ocfs2_verify_userspace_stack(osb, &parsed_options); |
662 | if (status) | 865 | if (status) |
@@ -664,6 +867,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) | |||
664 | 867 | ||
665 | sb->s_magic = OCFS2_SUPER_MAGIC; | 868 | sb->s_magic = OCFS2_SUPER_MAGIC; |
666 | 869 | ||
870 | sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | | ||
871 | ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); | ||
872 | |||
667 | /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, | 873 | /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, |
668 | * heartbeat=none */ | 874 | * heartbeat=none */ |
669 | if (bdev_read_only(sb->s_bdev)) { | 875 | if (bdev_read_only(sb->s_bdev)) { |
@@ -758,6 +964,28 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) | |||
758 | atomic_set(&osb->vol_state, VOLUME_MOUNTED); | 964 | atomic_set(&osb->vol_state, VOLUME_MOUNTED); |
759 | wake_up(&osb->osb_mount_event); | 965 | wake_up(&osb->osb_mount_event); |
760 | 966 | ||
967 | /* Now we can initialize quotas because we can afford to wait | ||
968 | * for cluster lock recovery now. That also means that truncation | ||
969 | * log recovery can happen but that waits for proper quota setup */ | ||
970 | if (!(sb->s_flags & MS_RDONLY)) { | ||
971 | status = ocfs2_enable_quotas(osb); | ||
972 | if (status < 0) { | ||
973 | /* We have to err-out specially here because | ||
974 | * s_root is already set */ | ||
975 | mlog_errno(status); | ||
976 | atomic_set(&osb->vol_state, VOLUME_DISABLED); | ||
977 | wake_up(&osb->osb_mount_event); | ||
978 | mlog_exit(status); | ||
979 | return status; | ||
980 | } | ||
981 | } | ||
982 | |||
983 | ocfs2_complete_quota_recovery(osb); | ||
984 | |||
985 | /* Now we wake up again for processes waiting for quotas */ | ||
986 | atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS); | ||
987 | wake_up(&osb->osb_mount_event); | ||
988 | |||
761 | mlog_exit(status); | 989 | mlog_exit(status); |
762 | return status; | 990 | return status; |
763 | 991 | ||
@@ -945,6 +1173,41 @@ static int ocfs2_parse_options(struct super_block *sb, | |||
945 | case Opt_inode64: | 1173 | case Opt_inode64: |
946 | mopt->mount_opt |= OCFS2_MOUNT_INODE64; | 1174 | mopt->mount_opt |= OCFS2_MOUNT_INODE64; |
947 | break; | 1175 | break; |
1176 | case Opt_usrquota: | ||
1177 | /* We check only on remount, otherwise features | ||
1178 | * aren't yet initialized. */ | ||
1179 | if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb, | ||
1180 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { | ||
1181 | mlog(ML_ERROR, "User quota requested but " | ||
1182 | "filesystem feature is not set\n"); | ||
1183 | status = 0; | ||
1184 | goto bail; | ||
1185 | } | ||
1186 | mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA; | ||
1187 | break; | ||
1188 | case Opt_grpquota: | ||
1189 | if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb, | ||
1190 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { | ||
1191 | mlog(ML_ERROR, "Group quota requested but " | ||
1192 | "filesystem feature is not set\n"); | ||
1193 | status = 0; | ||
1194 | goto bail; | ||
1195 | } | ||
1196 | mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; | ||
1197 | break; | ||
1198 | #ifdef CONFIG_OCFS2_FS_POSIX_ACL | ||
1199 | case Opt_acl: | ||
1200 | mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; | ||
1201 | break; | ||
1202 | case Opt_noacl: | ||
1203 | mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; | ||
1204 | break; | ||
1205 | #else | ||
1206 | case Opt_acl: | ||
1207 | case Opt_noacl: | ||
1208 | printk(KERN_INFO "ocfs2 (no)acl options not supported\n"); | ||
1209 | break; | ||
1210 | #endif | ||
948 | default: | 1211 | default: |
949 | mlog(ML_ERROR, | 1212 | mlog(ML_ERROR, |
950 | "Unrecognized mount option \"%s\" " | 1213 | "Unrecognized mount option \"%s\" " |
@@ -1008,6 +1271,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) | |||
1008 | if (osb->osb_cluster_stack[0]) | 1271 | if (osb->osb_cluster_stack[0]) |
1009 | seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, | 1272 | seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, |
1010 | osb->osb_cluster_stack); | 1273 | osb->osb_cluster_stack); |
1274 | if (opts & OCFS2_MOUNT_USRQUOTA) | ||
1275 | seq_printf(s, ",usrquota"); | ||
1276 | if (opts & OCFS2_MOUNT_GRPQUOTA) | ||
1277 | seq_printf(s, ",grpquota"); | ||
1011 | 1278 | ||
1012 | if (opts & OCFS2_MOUNT_NOUSERXATTR) | 1279 | if (opts & OCFS2_MOUNT_NOUSERXATTR) |
1013 | seq_printf(s, ",nouser_xattr"); | 1280 | seq_printf(s, ",nouser_xattr"); |
@@ -1017,6 +1284,13 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) | |||
1017 | if (opts & OCFS2_MOUNT_INODE64) | 1284 | if (opts & OCFS2_MOUNT_INODE64) |
1018 | seq_printf(s, ",inode64"); | 1285 | seq_printf(s, ",inode64"); |
1019 | 1286 | ||
1287 | #ifdef CONFIG_OCFS2_FS_POSIX_ACL | ||
1288 | if (opts & OCFS2_MOUNT_POSIX_ACL) | ||
1289 | seq_printf(s, ",acl"); | ||
1290 | else | ||
1291 | seq_printf(s, ",noacl"); | ||
1292 | #endif | ||
1293 | |||
1020 | return 0; | 1294 | return 0; |
1021 | } | 1295 | } |
1022 | 1296 | ||
@@ -1052,10 +1326,16 @@ static int __init ocfs2_init(void) | |||
1052 | mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); | 1326 | mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); |
1053 | } | 1327 | } |
1054 | 1328 | ||
1329 | status = ocfs2_quota_setup(); | ||
1330 | if (status) | ||
1331 | goto leave; | ||
1332 | |||
1055 | ocfs2_set_locking_protocol(); | 1333 | ocfs2_set_locking_protocol(); |
1056 | 1334 | ||
1335 | status = register_quota_format(&ocfs2_quota_format); | ||
1057 | leave: | 1336 | leave: |
1058 | if (status < 0) { | 1337 | if (status < 0) { |
1338 | ocfs2_quota_shutdown(); | ||
1059 | ocfs2_free_mem_caches(); | 1339 | ocfs2_free_mem_caches(); |
1060 | exit_ocfs2_uptodate_cache(); | 1340 | exit_ocfs2_uptodate_cache(); |
1061 | } | 1341 | } |
@@ -1072,11 +1352,15 @@ static void __exit ocfs2_exit(void) | |||
1072 | { | 1352 | { |
1073 | mlog_entry_void(); | 1353 | mlog_entry_void(); |
1074 | 1354 | ||
1355 | ocfs2_quota_shutdown(); | ||
1356 | |||
1075 | if (ocfs2_wq) { | 1357 | if (ocfs2_wq) { |
1076 | flush_workqueue(ocfs2_wq); | 1358 | flush_workqueue(ocfs2_wq); |
1077 | destroy_workqueue(ocfs2_wq); | 1359 | destroy_workqueue(ocfs2_wq); |
1078 | } | 1360 | } |
1079 | 1361 | ||
1362 | unregister_quota_format(&ocfs2_quota_format); | ||
1363 | |||
1080 | debugfs_remove(ocfs2_debugfs_root); | 1364 | debugfs_remove(ocfs2_debugfs_root); |
1081 | 1365 | ||
1082 | ocfs2_free_mem_caches(); | 1366 | ocfs2_free_mem_caches(); |
@@ -1192,8 +1476,27 @@ static int ocfs2_initialize_mem_caches(void) | |||
1192 | (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| | 1476 | (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| |
1193 | SLAB_MEM_SPREAD), | 1477 | SLAB_MEM_SPREAD), |
1194 | ocfs2_inode_init_once); | 1478 | ocfs2_inode_init_once); |
1195 | if (!ocfs2_inode_cachep) | 1479 | ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache", |
1480 | sizeof(struct ocfs2_dquot), | ||
1481 | 0, | ||
1482 | (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| | ||
1483 | SLAB_MEM_SPREAD), | ||
1484 | NULL); | ||
1485 | ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache", | ||
1486 | sizeof(struct ocfs2_quota_chunk), | ||
1487 | 0, | ||
1488 | (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), | ||
1489 | NULL); | ||
1490 | if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep || | ||
1491 | !ocfs2_qf_chunk_cachep) { | ||
1492 | if (ocfs2_inode_cachep) | ||
1493 | kmem_cache_destroy(ocfs2_inode_cachep); | ||
1494 | if (ocfs2_dquot_cachep) | ||
1495 | kmem_cache_destroy(ocfs2_dquot_cachep); | ||
1496 | if (ocfs2_qf_chunk_cachep) | ||
1497 | kmem_cache_destroy(ocfs2_qf_chunk_cachep); | ||
1196 | return -ENOMEM; | 1498 | return -ENOMEM; |
1499 | } | ||
1197 | 1500 | ||
1198 | return 0; | 1501 | return 0; |
1199 | } | 1502 | } |
@@ -1202,8 +1505,15 @@ static void ocfs2_free_mem_caches(void) | |||
1202 | { | 1505 | { |
1203 | if (ocfs2_inode_cachep) | 1506 | if (ocfs2_inode_cachep) |
1204 | kmem_cache_destroy(ocfs2_inode_cachep); | 1507 | kmem_cache_destroy(ocfs2_inode_cachep); |
1205 | |||
1206 | ocfs2_inode_cachep = NULL; | 1508 | ocfs2_inode_cachep = NULL; |
1509 | |||
1510 | if (ocfs2_dquot_cachep) | ||
1511 | kmem_cache_destroy(ocfs2_dquot_cachep); | ||
1512 | ocfs2_dquot_cachep = NULL; | ||
1513 | |||
1514 | if (ocfs2_qf_chunk_cachep) | ||
1515 | kmem_cache_destroy(ocfs2_qf_chunk_cachep); | ||
1516 | ocfs2_qf_chunk_cachep = NULL; | ||
1207 | } | 1517 | } |
1208 | 1518 | ||
1209 | static int ocfs2_get_sector(struct super_block *sb, | 1519 | static int ocfs2_get_sector(struct super_block *sb, |
@@ -1303,6 +1613,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) | |||
1303 | osb = OCFS2_SB(sb); | 1613 | osb = OCFS2_SB(sb); |
1304 | BUG_ON(!osb); | 1614 | BUG_ON(!osb); |
1305 | 1615 | ||
1616 | ocfs2_disable_quotas(osb); | ||
1617 | |||
1306 | ocfs2_shutdown_local_alloc(osb); | 1618 | ocfs2_shutdown_local_alloc(osb); |
1307 | 1619 | ||
1308 | ocfs2_truncate_log_shutdown(osb); | 1620 | ocfs2_truncate_log_shutdown(osb); |
@@ -1413,6 +1725,8 @@ static int ocfs2_initialize_super(struct super_block *sb, | |||
1413 | sb->s_fs_info = osb; | 1725 | sb->s_fs_info = osb; |
1414 | sb->s_op = &ocfs2_sops; | 1726 | sb->s_op = &ocfs2_sops; |
1415 | sb->s_export_op = &ocfs2_export_ops; | 1727 | sb->s_export_op = &ocfs2_export_ops; |
1728 | sb->s_qcop = &ocfs2_quotactl_ops; | ||
1729 | sb->dq_op = &ocfs2_quota_operations; | ||
1416 | sb->s_xattr = ocfs2_xattr_handlers; | 1730 | sb->s_xattr = ocfs2_xattr_handlers; |
1417 | sb->s_time_gran = 1; | 1731 | sb->s_time_gran = 1; |
1418 | sb->s_flags |= MS_NOATIME; | 1732 | sb->s_flags |= MS_NOATIME; |
@@ -1676,6 +1990,15 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di, | |||
1676 | 1990 | ||
1677 | if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE, | 1991 | if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE, |
1678 | strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) { | 1992 | strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) { |
1993 | /* We have to do a raw check of the feature here */ | ||
1994 | if (le32_to_cpu(di->id2.i_super.s_feature_incompat) & | ||
1995 | OCFS2_FEATURE_INCOMPAT_META_ECC) { | ||
1996 | status = ocfs2_block_check_validate(bh->b_data, | ||
1997 | bh->b_size, | ||
1998 | &di->i_check); | ||
1999 | if (status) | ||
2000 | goto out; | ||
2001 | } | ||
1679 | status = -EINVAL; | 2002 | status = -EINVAL; |
1680 | if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) { | 2003 | if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) { |
1681 | mlog(ML_ERROR, "found superblock with incorrect block " | 2004 | mlog(ML_ERROR, "found superblock with incorrect block " |
@@ -1717,6 +2040,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di, | |||
1717 | } | 2040 | } |
1718 | } | 2041 | } |
1719 | 2042 | ||
2043 | out: | ||
1720 | mlog_exit(status); | 2044 | mlog_exit(status); |
1721 | return status; | 2045 | return status; |
1722 | } | 2046 | } |
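The META_ECC hunk in ocfs2_verify_volume() checks the feature bit "raw" because at this point in mount there is no populated ocfs2_super to ask: the incompat bits are read straight out of the candidate superblock buffer, and only when the ecc passes do the usual signature, blocksize and cluster-size checks run. A sketch of that raw test (the osb-based feature macros are assumed to be unavailable this early):

    /* Raw check, usable before an ocfs2_super exists (as in the hunk above). */
    static int example_raw_has_meta_ecc(struct ocfs2_dinode *di)
    {
            return le32_to_cpu(di->id2.i_super.s_feature_incompat) &
                   OCFS2_FEATURE_INCOMPAT_META_ECC;
    }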
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index cbd03dfdc7b9..ed0a0cfd68d2 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c | |||
@@ -84,7 +84,7 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode, | |||
84 | 84 | ||
85 | mlog_entry_void(); | 85 | mlog_entry_void(); |
86 | 86 | ||
87 | status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, bh); | 87 | status = ocfs2_read_inode_block(inode, bh); |
88 | if (status < 0) { | 88 | if (status < 0) { |
89 | mlog_errno(status); | 89 | mlog_errno(status); |
90 | link = ERR_PTR(status); | 90 | link = ERR_PTR(status); |
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 74d7367ade13..e1d638af6ac3 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c | |||
@@ -35,12 +35,14 @@ | |||
35 | #include <linux/init.h> | 35 | #include <linux/init.h> |
36 | #include <linux/module.h> | 36 | #include <linux/module.h> |
37 | #include <linux/string.h> | 37 | #include <linux/string.h> |
38 | #include <linux/security.h> | ||
38 | 39 | ||
39 | #define MLOG_MASK_PREFIX ML_XATTR | 40 | #define MLOG_MASK_PREFIX ML_XATTR |
40 | #include <cluster/masklog.h> | 41 | #include <cluster/masklog.h> |
41 | 42 | ||
42 | #include "ocfs2.h" | 43 | #include "ocfs2.h" |
43 | #include "alloc.h" | 44 | #include "alloc.h" |
45 | #include "blockcheck.h" | ||
44 | #include "dlmglue.h" | 46 | #include "dlmglue.h" |
45 | #include "file.h" | 47 | #include "file.h" |
46 | #include "symlink.h" | 48 | #include "symlink.h" |
@@ -61,12 +63,32 @@ struct ocfs2_xattr_def_value_root { | |||
61 | }; | 63 | }; |
62 | 64 | ||
63 | struct ocfs2_xattr_bucket { | 65 | struct ocfs2_xattr_bucket { |
64 | struct buffer_head *bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET]; | 66 | /* The inode these xattrs are associated with */ |
65 | struct ocfs2_xattr_header *xh; | 67 | struct inode *bu_inode; |
68 | |||
69 | /* The actual buffers that make up the bucket */ | ||
70 | struct buffer_head *bu_bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET]; | ||
71 | |||
72 | /* How many blocks make up one bucket for this filesystem */ | ||
73 | int bu_blocks; | ||
74 | }; | ||
75 | |||
76 | struct ocfs2_xattr_set_ctxt { | ||
77 | handle_t *handle; | ||
78 | struct ocfs2_alloc_context *meta_ac; | ||
79 | struct ocfs2_alloc_context *data_ac; | ||
80 | struct ocfs2_cached_dealloc_ctxt dealloc; | ||
66 | }; | 81 | }; |
67 | 82 | ||
68 | #define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root)) | 83 | #define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root)) |
69 | #define OCFS2_XATTR_INLINE_SIZE 80 | 84 | #define OCFS2_XATTR_INLINE_SIZE 80 |
85 | #define OCFS2_XATTR_FREE_IN_IBODY (OCFS2_MIN_XATTR_INLINE_SIZE \ | ||
86 | - sizeof(struct ocfs2_xattr_header) \ | ||
87 | - sizeof(__u32)) | ||
88 | #define OCFS2_XATTR_FREE_IN_BLOCK(ptr) ((ptr)->i_sb->s_blocksize \ | ||
89 | - sizeof(struct ocfs2_xattr_block) \ | ||
90 | - sizeof(struct ocfs2_xattr_header) \ | ||
91 | - sizeof(__u32)) | ||
70 | 92 | ||
71 | static struct ocfs2_xattr_def_value_root def_xv = { | 93 | static struct ocfs2_xattr_def_value_root def_xv = { |
72 | .xv.xr_list.l_count = cpu_to_le16(1), | 94 | .xv.xr_list.l_count = cpu_to_le16(1), |
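OCFS2_XATTR_FREE_IN_IBODY and OCFS2_XATTR_FREE_IN_BLOCK bound how much xattr payload fits in the inode body and in a standalone xattr block; the reservation helpers added later in this patch compare a candidate entry's size against them to decide whether to reserve an extra metadata block (or, at the 512-byte minimum blocksize, a whole cluster for a bucket). A hedged sketch of that comparison, reusing ocfs2_xattr_entry_real_size() which is defined further down in this file (the wrapper name is illustrative):

    /* Illustrative only: is this xattr guaranteed to fit in the inode body? */
    static int example_xattr_fits_in_ibody(struct inode *dir, int name_len,
                                           size_t value_len)
    {
            int s_size = ocfs2_xattr_entry_real_size(name_len, value_len);

            /* e.g. the security xattr case quoted below:
             * 256(name) + 80(value) + 16(entry) = 352 bytes. */
            return dir->i_sb->s_blocksize != OCFS2_MIN_BLOCKSIZE &&
                   s_size <= OCFS2_XATTR_FREE_IN_IBODY;
    }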
@@ -74,13 +96,25 @@ static struct ocfs2_xattr_def_value_root def_xv = { | |||
74 | 96 | ||
75 | struct xattr_handler *ocfs2_xattr_handlers[] = { | 97 | struct xattr_handler *ocfs2_xattr_handlers[] = { |
76 | &ocfs2_xattr_user_handler, | 98 | &ocfs2_xattr_user_handler, |
99 | #ifdef CONFIG_OCFS2_FS_POSIX_ACL | ||
100 | &ocfs2_xattr_acl_access_handler, | ||
101 | &ocfs2_xattr_acl_default_handler, | ||
102 | #endif | ||
77 | &ocfs2_xattr_trusted_handler, | 103 | &ocfs2_xattr_trusted_handler, |
104 | &ocfs2_xattr_security_handler, | ||
78 | NULL | 105 | NULL |
79 | }; | 106 | }; |
80 | 107 | ||
81 | static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { | 108 | static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { |
82 | [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, | 109 | [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, |
110 | #ifdef CONFIG_OCFS2_FS_POSIX_ACL | ||
111 | [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] | ||
112 | = &ocfs2_xattr_acl_access_handler, | ||
113 | [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] | ||
114 | = &ocfs2_xattr_acl_default_handler, | ||
115 | #endif | ||
83 | [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, | 116 | [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, |
117 | [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler, | ||
84 | }; | 118 | }; |
85 | 119 | ||
86 | struct ocfs2_xattr_info { | 120 | struct ocfs2_xattr_info { |
@@ -98,7 +132,7 @@ struct ocfs2_xattr_search { | |||
98 | */ | 132 | */ |
99 | struct buffer_head *xattr_bh; | 133 | struct buffer_head *xattr_bh; |
100 | struct ocfs2_xattr_header *header; | 134 | struct ocfs2_xattr_header *header; |
101 | struct ocfs2_xattr_bucket bucket; | 135 | struct ocfs2_xattr_bucket *bucket; |
102 | void *base; | 136 | void *base; |
103 | void *end; | 137 | void *end; |
104 | struct ocfs2_xattr_entry *here; | 138 | struct ocfs2_xattr_entry *here; |
@@ -127,14 +161,20 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode, | |||
127 | size_t buffer_size); | 161 | size_t buffer_size); |
128 | 162 | ||
129 | static int ocfs2_xattr_create_index_block(struct inode *inode, | 163 | static int ocfs2_xattr_create_index_block(struct inode *inode, |
130 | struct ocfs2_xattr_search *xs); | 164 | struct ocfs2_xattr_search *xs, |
165 | struct ocfs2_xattr_set_ctxt *ctxt); | ||
131 | 166 | ||
132 | static int ocfs2_xattr_set_entry_index_block(struct inode *inode, | 167 | static int ocfs2_xattr_set_entry_index_block(struct inode *inode, |
133 | struct ocfs2_xattr_info *xi, | 168 | struct ocfs2_xattr_info *xi, |
134 | struct ocfs2_xattr_search *xs); | 169 | struct ocfs2_xattr_search *xs, |
170 | struct ocfs2_xattr_set_ctxt *ctxt); | ||
135 | 171 | ||
136 | static int ocfs2_delete_xattr_index_block(struct inode *inode, | 172 | static int ocfs2_delete_xattr_index_block(struct inode *inode, |
137 | struct buffer_head *xb_bh); | 173 | struct buffer_head *xb_bh); |
174 | static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle, | ||
175 | u64 src_blk, u64 last_blk, u64 to_blk, | ||
176 | unsigned int start_bucket, | ||
177 | u32 *first_hash); | ||
138 | 178 | ||
139 | static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb) | 179 | static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb) |
140 | { | 180 | { |
@@ -154,6 +194,216 @@ static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb) | |||
154 | return len / sizeof(struct ocfs2_xattr_entry); | 194 | return len / sizeof(struct ocfs2_xattr_entry); |
155 | } | 195 | } |
156 | 196 | ||
197 | #define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr) | ||
198 | #define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data) | ||
199 | #define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0)) | ||
200 | |||
201 | static struct ocfs2_xattr_bucket *ocfs2_xattr_bucket_new(struct inode *inode) | ||
202 | { | ||
203 | struct ocfs2_xattr_bucket *bucket; | ||
204 | int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb); | ||
205 | |||
206 | BUG_ON(blks > OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET); | ||
207 | |||
208 | bucket = kzalloc(sizeof(struct ocfs2_xattr_bucket), GFP_NOFS); | ||
209 | if (bucket) { | ||
210 | bucket->bu_inode = inode; | ||
211 | bucket->bu_blocks = blks; | ||
212 | } | ||
213 | |||
214 | return bucket; | ||
215 | } | ||
216 | |||
217 | static void ocfs2_xattr_bucket_relse(struct ocfs2_xattr_bucket *bucket) | ||
218 | { | ||
219 | int i; | ||
220 | |||
221 | for (i = 0; i < bucket->bu_blocks; i++) { | ||
222 | brelse(bucket->bu_bhs[i]); | ||
223 | bucket->bu_bhs[i] = NULL; | ||
224 | } | ||
225 | } | ||
226 | |||
227 | static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket) | ||
228 | { | ||
229 | if (bucket) { | ||
230 | ocfs2_xattr_bucket_relse(bucket); | ||
231 | bucket->bu_inode = NULL; | ||
232 | kfree(bucket); | ||
233 | } | ||
234 | } | ||
235 | |||
236 | /* | ||
237 | * A bucket that has never been written to disk doesn't need to be | ||
238 | * read. We just need the buffer_heads. Don't call this for | ||
239 | * buckets that are already on disk. ocfs2_read_xattr_bucket() initializes | ||
240 | * them fully. | ||
241 | */ | ||
242 | static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket, | ||
243 | u64 xb_blkno) | ||
244 | { | ||
245 | int i, rc = 0; | ||
246 | |||
247 | for (i = 0; i < bucket->bu_blocks; i++) { | ||
248 | bucket->bu_bhs[i] = sb_getblk(bucket->bu_inode->i_sb, | ||
249 | xb_blkno + i); | ||
250 | if (!bucket->bu_bhs[i]) { | ||
251 | rc = -EIO; | ||
252 | mlog_errno(rc); | ||
253 | break; | ||
254 | } | ||
255 | |||
256 | if (!ocfs2_buffer_uptodate(bucket->bu_inode, | ||
257 | bucket->bu_bhs[i])) | ||
258 | ocfs2_set_new_buffer_uptodate(bucket->bu_inode, | ||
259 | bucket->bu_bhs[i]); | ||
260 | } | ||
261 | |||
262 | if (rc) | ||
263 | ocfs2_xattr_bucket_relse(bucket); | ||
264 | return rc; | ||
265 | } | ||
266 | |||
267 | /* Read the xattr bucket at xb_blkno */ | ||
268 | static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket, | ||
269 | u64 xb_blkno) | ||
270 | { | ||
271 | int rc; | ||
272 | |||
273 | rc = ocfs2_read_blocks(bucket->bu_inode, xb_blkno, | ||
274 | bucket->bu_blocks, bucket->bu_bhs, 0, | ||
275 | NULL); | ||
276 | if (!rc) { | ||
277 | rc = ocfs2_validate_meta_ecc_bhs(bucket->bu_inode->i_sb, | ||
278 | bucket->bu_bhs, | ||
279 | bucket->bu_blocks, | ||
280 | &bucket_xh(bucket)->xh_check); | ||
281 | if (rc) | ||
282 | mlog_errno(rc); | ||
283 | } | ||
284 | |||
285 | if (rc) | ||
286 | ocfs2_xattr_bucket_relse(bucket); | ||
287 | return rc; | ||
288 | } | ||
289 | |||
290 | static int ocfs2_xattr_bucket_journal_access(handle_t *handle, | ||
291 | struct ocfs2_xattr_bucket *bucket, | ||
292 | int type) | ||
293 | { | ||
294 | int i, rc = 0; | ||
295 | |||
296 | for (i = 0; i < bucket->bu_blocks; i++) { | ||
297 | rc = ocfs2_journal_access(handle, bucket->bu_inode, | ||
298 | bucket->bu_bhs[i], type); | ||
299 | if (rc) { | ||
300 | mlog_errno(rc); | ||
301 | break; | ||
302 | } | ||
303 | } | ||
304 | |||
305 | return rc; | ||
306 | } | ||
307 | |||
308 | static void ocfs2_xattr_bucket_journal_dirty(handle_t *handle, | ||
309 | struct ocfs2_xattr_bucket *bucket) | ||
310 | { | ||
311 | int i; | ||
312 | |||
313 | ocfs2_compute_meta_ecc_bhs(bucket->bu_inode->i_sb, | ||
314 | bucket->bu_bhs, bucket->bu_blocks, | ||
315 | &bucket_xh(bucket)->xh_check); | ||
316 | |||
317 | for (i = 0; i < bucket->bu_blocks; i++) | ||
318 | ocfs2_journal_dirty(handle, bucket->bu_bhs[i]); | ||
319 | } | ||
320 | |||
321 | static void ocfs2_xattr_bucket_copy_data(struct ocfs2_xattr_bucket *dest, | ||
322 | struct ocfs2_xattr_bucket *src) | ||
323 | { | ||
324 | int i; | ||
325 | int blocksize = src->bu_inode->i_sb->s_blocksize; | ||
326 | |||
327 | BUG_ON(dest->bu_blocks != src->bu_blocks); | ||
328 | BUG_ON(dest->bu_inode != src->bu_inode); | ||
329 | |||
330 | for (i = 0; i < src->bu_blocks; i++) { | ||
331 | memcpy(bucket_block(dest, i), bucket_block(src, i), | ||
332 | blocksize); | ||
333 | } | ||
334 | } | ||
335 | |||
336 | static int ocfs2_validate_xattr_block(struct super_block *sb, | ||
337 | struct buffer_head *bh) | ||
338 | { | ||
339 | int rc; | ||
340 | struct ocfs2_xattr_block *xb = | ||
341 | (struct ocfs2_xattr_block *)bh->b_data; | ||
342 | |||
343 | mlog(0, "Validating xattr block %llu\n", | ||
344 | (unsigned long long)bh->b_blocknr); | ||
345 | |||
346 | BUG_ON(!buffer_uptodate(bh)); | ||
347 | |||
348 | /* | ||
349 | * If the ecc fails, we return the error but otherwise | ||
350 | * leave the filesystem running. We know any error is | ||
351 | * local to this block. | ||
352 | */ | ||
353 | rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &xb->xb_check); | ||
354 | if (rc) | ||
355 | return rc; | ||
356 | |||
357 | /* | ||
358 | * Errors after here are fatal | ||
359 | */ | ||
360 | |||
361 | if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { | ||
362 | ocfs2_error(sb, | ||
363 | "Extended attribute block #%llu has bad " | ||
364 | "signature %.*s", | ||
365 | (unsigned long long)bh->b_blocknr, 7, | ||
366 | xb->xb_signature); | ||
367 | return -EINVAL; | ||
368 | } | ||
369 | |||
370 | if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) { | ||
371 | ocfs2_error(sb, | ||
372 | "Extended attribute block #%llu has an " | ||
373 | "invalid xb_blkno of %llu", | ||
374 | (unsigned long long)bh->b_blocknr, | ||
375 | (unsigned long long)le64_to_cpu(xb->xb_blkno)); | ||
376 | return -EINVAL; | ||
377 | } | ||
378 | |||
379 | if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) { | ||
380 | ocfs2_error(sb, | ||
381 | "Extended attribute block #%llu has an invalid " | ||
382 | "xb_fs_generation of #%u", | ||
383 | (unsigned long long)bh->b_blocknr, | ||
384 | le32_to_cpu(xb->xb_fs_generation)); | ||
385 | return -EINVAL; | ||
386 | } | ||
387 | |||
388 | return 0; | ||
389 | } | ||
390 | |||
391 | static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno, | ||
392 | struct buffer_head **bh) | ||
393 | { | ||
394 | int rc; | ||
395 | struct buffer_head *tmp = *bh; | ||
396 | |||
397 | rc = ocfs2_read_block(inode, xb_blkno, &tmp, | ||
398 | ocfs2_validate_xattr_block); | ||
399 | |||
400 | /* If ocfs2_read_block() got us a new bh, pass it up. */ | ||
401 | if (!rc && !*bh) | ||
402 | *bh = tmp; | ||
403 | |||
404 | return rc; | ||
405 | } | ||
406 | |||
157 | static inline const char *ocfs2_xattr_prefix(int name_index) | 407 | static inline const char *ocfs2_xattr_prefix(int name_index) |
158 | { | 408 | { |
159 | struct xattr_handler *handler = NULL; | 409 | struct xattr_handler *handler = NULL; |
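With struct ocfs2_xattr_bucket wrapping the bu_blocks buffer_heads that make up one bucket, the helpers above give every bucket a uniform lifecycle: allocate the wrapper, read it (with its metaecc check) or init fresh buffers for a bucket that has never hit disk, take journal access on all of its blocks, modify it through bucket_xh()/bucket_block(), dirty it (which recomputes the ecc), and free it. A compressed sketch of that sequence for an existing bucket (illustrative function name, error handling abbreviated):

    static int example_touch_bucket(struct inode *inode, handle_t *handle,
                                    u64 blkno)
    {
            struct ocfs2_xattr_bucket *bucket;
            int ret;

            bucket = ocfs2_xattr_bucket_new(inode);         /* allocate wrapper */
            if (!bucket)
                    return -ENOMEM;

            ret = ocfs2_read_xattr_bucket(bucket, blkno);   /* read + ecc check */
            if (ret)
                    goto out;

            ret = ocfs2_xattr_bucket_journal_access(handle, bucket,
                                                    OCFS2_JOURNAL_ACCESS_WRITE);
            if (ret)
                    goto out;

            /* ... modify bucket_xh(bucket) / bucket_block(bucket, i) ... */

            /* recomputes the ecc over all blocks, then dirties each bh */
            ocfs2_xattr_bucket_journal_dirty(handle, bucket);
    out:
            ocfs2_xattr_bucket_free(bucket);                /* releases the bhs too */
            return ret;
    }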
@@ -200,54 +450,163 @@ static void ocfs2_xattr_hash_entry(struct inode *inode, | |||
200 | return; | 450 | return; |
201 | } | 451 | } |
202 | 452 | ||
453 | static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len) | ||
454 | { | ||
455 | int size = 0; | ||
456 | |||
457 | if (value_len <= OCFS2_XATTR_INLINE_SIZE) | ||
458 | size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len); | ||
459 | else | ||
460 | size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; | ||
461 | size += sizeof(struct ocfs2_xattr_entry); | ||
462 | |||
463 | return size; | ||
464 | } | ||
465 | |||
466 | int ocfs2_calc_security_init(struct inode *dir, | ||
467 | struct ocfs2_security_xattr_info *si, | ||
468 | int *want_clusters, | ||
469 | int *xattr_credits, | ||
470 | struct ocfs2_alloc_context **xattr_ac) | ||
471 | { | ||
472 | int ret = 0; | ||
473 | struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); | ||
474 | int s_size = ocfs2_xattr_entry_real_size(strlen(si->name), | ||
475 | si->value_len); | ||
476 | |||
477 | /* | ||
478 | * The maximum space a security xattr can take inline is | ||
479 | * 256(name) + 80(value) + 16(entry) = 352 bytes, | ||
480 | * so reserving one metadata block for it is enough. | ||
481 | */ | ||
482 | if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE || | ||
483 | s_size > OCFS2_XATTR_FREE_IN_IBODY) { | ||
484 | ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac); | ||
485 | if (ret) { | ||
486 | mlog_errno(ret); | ||
487 | return ret; | ||
488 | } | ||
489 | *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; | ||
490 | } | ||
491 | |||
492 | /* reserve clusters for the xattr value which will be stored in the B tree */ | ||
493 | if (si->value_len > OCFS2_XATTR_INLINE_SIZE) { | ||
494 | int new_clusters = ocfs2_clusters_for_bytes(dir->i_sb, | ||
495 | si->value_len); | ||
496 | |||
497 | *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb, | ||
498 | new_clusters); | ||
499 | *want_clusters += new_clusters; | ||
500 | } | ||
501 | return ret; | ||
502 | } | ||
503 | |||
504 | int ocfs2_calc_xattr_init(struct inode *dir, | ||
505 | struct buffer_head *dir_bh, | ||
506 | int mode, | ||
507 | struct ocfs2_security_xattr_info *si, | ||
508 | int *want_clusters, | ||
509 | int *xattr_credits, | ||
510 | struct ocfs2_alloc_context **xattr_ac) | ||
511 | { | ||
512 | int ret = 0; | ||
513 | struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); | ||
514 | int s_size = 0, a_size = 0, acl_len = 0, new_clusters; | ||
515 | |||
516 | if (si->enable) | ||
517 | s_size = ocfs2_xattr_entry_real_size(strlen(si->name), | ||
518 | si->value_len); | ||
519 | |||
520 | if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { | ||
521 | acl_len = ocfs2_xattr_get_nolock(dir, dir_bh, | ||
522 | OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT, | ||
523 | "", NULL, 0); | ||
524 | if (acl_len > 0) { | ||
525 | a_size = ocfs2_xattr_entry_real_size(0, acl_len); | ||
526 | if (S_ISDIR(mode)) | ||
527 | a_size <<= 1; | ||
528 | } else if (acl_len != 0 && acl_len != -ENODATA) { | ||
529 | mlog_errno(acl_len); | ||
530 | return acl_len; | ||
531 | } | ||
532 | } | ||
533 | |||
534 | if (!(s_size + a_size)) | ||
535 | return ret; | ||
536 | |||
537 | /* | ||
538 | * The maximum space a security xattr can take inline is | ||
539 | * 256(name) + 80(value) + 16(entry) = 352 bytes. | ||
540 | * The maximum space the acl xattrs can take inline is | ||
541 | * (80(value) + 16(entry)) * 2(if directory) = 192 bytes. | ||
542 | * When blocksize = 512 we may need to reserve one more cluster | ||
543 | * for an xattr bucket; otherwise reserving one metadata block | ||
544 | * for them is enough. | ||
545 | */ | ||
546 | if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE || | ||
547 | (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) { | ||
548 | ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac); | ||
549 | if (ret) { | ||
550 | mlog_errno(ret); | ||
551 | return ret; | ||
552 | } | ||
553 | *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; | ||
554 | } | ||
555 | |||
556 | if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE && | ||
557 | (s_size + a_size) > OCFS2_XATTR_FREE_IN_BLOCK(dir)) { | ||
558 | *want_clusters += 1; | ||
559 | *xattr_credits += ocfs2_blocks_per_xattr_bucket(dir->i_sb); | ||
560 | } | ||
561 | |||
562 | /* | ||
563 | * Reserve credits and clusters for xattrs which have large values | ||
564 | * and have to be set outside the inode. | ||
565 | */ | ||
566 | if (si->enable && si->value_len > OCFS2_XATTR_INLINE_SIZE) { | ||
567 | new_clusters = ocfs2_clusters_for_bytes(dir->i_sb, | ||
568 | si->value_len); | ||
569 | *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb, | ||
570 | new_clusters); | ||
571 | *want_clusters += new_clusters; | ||
572 | } | ||
573 | if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL && | ||
574 | acl_len > OCFS2_XATTR_INLINE_SIZE) { | ||
575 | /* a directory has both DEFAULT and ACCESS types of acls */ | ||
576 | new_clusters = (S_ISDIR(mode) ? 2 : 1) * | ||
577 | ocfs2_clusters_for_bytes(dir->i_sb, acl_len); | ||
578 | *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb, | ||
579 | new_clusters); | ||
580 | *want_clusters += new_clusters; | ||
581 | } | ||
582 | |||
583 | return ret; | ||
584 | } | ||
585 | |||
203 | static int ocfs2_xattr_extend_allocation(struct inode *inode, | 586 | static int ocfs2_xattr_extend_allocation(struct inode *inode, |
204 | u32 clusters_to_add, | 587 | u32 clusters_to_add, |
205 | struct buffer_head *xattr_bh, | 588 | struct ocfs2_xattr_value_buf *vb, |
206 | struct ocfs2_xattr_value_root *xv) | 589 | struct ocfs2_xattr_set_ctxt *ctxt) |
207 | { | 590 | { |
208 | int status = 0; | 591 | int status = 0; |
209 | int restart_func = 0; | 592 | handle_t *handle = ctxt->handle; |
210 | int credits = 0; | ||
211 | handle_t *handle = NULL; | ||
212 | struct ocfs2_alloc_context *data_ac = NULL; | ||
213 | struct ocfs2_alloc_context *meta_ac = NULL; | ||
214 | enum ocfs2_alloc_restarted why; | 593 | enum ocfs2_alloc_restarted why; |
215 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 594 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
216 | u32 prev_clusters, logical_start = le32_to_cpu(xv->xr_clusters); | 595 | u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters); |
217 | struct ocfs2_extent_tree et; | 596 | struct ocfs2_extent_tree et; |
218 | 597 | ||
219 | mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add); | 598 | mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add); |
220 | 599 | ||
221 | ocfs2_init_xattr_value_extent_tree(&et, inode, xattr_bh, xv); | 600 | ocfs2_init_xattr_value_extent_tree(&et, inode, vb); |
222 | |||
223 | restart_all: | ||
224 | |||
225 | status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, | ||
226 | &data_ac, &meta_ac); | ||
227 | if (status) { | ||
228 | mlog_errno(status); | ||
229 | goto leave; | ||
230 | } | ||
231 | |||
232 | credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el, | ||
233 | clusters_to_add); | ||
234 | handle = ocfs2_start_trans(osb, credits); | ||
235 | if (IS_ERR(handle)) { | ||
236 | status = PTR_ERR(handle); | ||
237 | handle = NULL; | ||
238 | mlog_errno(status); | ||
239 | goto leave; | ||
240 | } | ||
241 | 601 | ||
242 | restarted_transaction: | 602 | status = vb->vb_access(handle, inode, vb->vb_bh, |
243 | status = ocfs2_journal_access(handle, inode, xattr_bh, | 603 | OCFS2_JOURNAL_ACCESS_WRITE); |
244 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
245 | if (status < 0) { | 604 | if (status < 0) { |
246 | mlog_errno(status); | 605 | mlog_errno(status); |
247 | goto leave; | 606 | goto leave; |
248 | } | 607 | } |
249 | 608 | ||
250 | prev_clusters = le32_to_cpu(xv->xr_clusters); | 609 | prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters); |
251 | status = ocfs2_add_clusters_in_btree(osb, | 610 | status = ocfs2_add_clusters_in_btree(osb, |
252 | inode, | 611 | inode, |
253 | &logical_start, | 612 | &logical_start, |
@@ -255,157 +614,84 @@ restarted_transaction: | |||
255 | 0, | 614 | 0, |
256 | &et, | 615 | &et, |
257 | handle, | 616 | handle, |
258 | data_ac, | 617 | ctxt->data_ac, |
259 | meta_ac, | 618 | ctxt->meta_ac, |
260 | &why); | 619 | &why); |
261 | if ((status < 0) && (status != -EAGAIN)) { | 620 | if (status < 0) { |
262 | if (status != -ENOSPC) | 621 | mlog_errno(status); |
263 | mlog_errno(status); | ||
264 | goto leave; | 622 | goto leave; |
265 | } | 623 | } |
266 | 624 | ||
267 | status = ocfs2_journal_dirty(handle, xattr_bh); | 625 | status = ocfs2_journal_dirty(handle, vb->vb_bh); |
268 | if (status < 0) { | 626 | if (status < 0) { |
269 | mlog_errno(status); | 627 | mlog_errno(status); |
270 | goto leave; | 628 | goto leave; |
271 | } | 629 | } |
272 | 630 | ||
273 | clusters_to_add -= le32_to_cpu(xv->xr_clusters) - prev_clusters; | 631 | clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters; |
274 | 632 | ||
275 | if (why != RESTART_NONE && clusters_to_add) { | 633 | /* |
276 | if (why == RESTART_META) { | 634 | * We should have already allocated enough space before the transaction, |
277 | mlog(0, "restarting function.\n"); | 635 | * so no need to restart. |
278 | restart_func = 1; | 636 | */ |
279 | } else { | 637 | BUG_ON(why != RESTART_NONE || clusters_to_add); |
280 | BUG_ON(why != RESTART_TRANS); | ||
281 | |||
282 | mlog(0, "restarting transaction.\n"); | ||
283 | /* TODO: This can be more intelligent. */ | ||
284 | credits = ocfs2_calc_extend_credits(osb->sb, | ||
285 | et.et_root_el, | ||
286 | clusters_to_add); | ||
287 | status = ocfs2_extend_trans(handle, credits); | ||
288 | if (status < 0) { | ||
289 | /* handle still has to be committed at | ||
290 | * this point. */ | ||
291 | status = -ENOMEM; | ||
292 | mlog_errno(status); | ||
293 | goto leave; | ||
294 | } | ||
295 | goto restarted_transaction; | ||
296 | } | ||
297 | } | ||
298 | 638 | ||
299 | leave: | 639 | leave: |
300 | if (handle) { | ||
301 | ocfs2_commit_trans(osb, handle); | ||
302 | handle = NULL; | ||
303 | } | ||
304 | if (data_ac) { | ||
305 | ocfs2_free_alloc_context(data_ac); | ||
306 | data_ac = NULL; | ||
307 | } | ||
308 | if (meta_ac) { | ||
309 | ocfs2_free_alloc_context(meta_ac); | ||
310 | meta_ac = NULL; | ||
311 | } | ||
312 | if ((!status) && restart_func) { | ||
313 | restart_func = 0; | ||
314 | goto restart_all; | ||
315 | } | ||
316 | 640 | ||
317 | return status; | 641 | return status; |
318 | } | 642 | } |
319 | 643 | ||
320 | static int __ocfs2_remove_xattr_range(struct inode *inode, | 644 | static int __ocfs2_remove_xattr_range(struct inode *inode, |
321 | struct buffer_head *root_bh, | 645 | struct ocfs2_xattr_value_buf *vb, |
322 | struct ocfs2_xattr_value_root *xv, | ||
323 | u32 cpos, u32 phys_cpos, u32 len, | 646 | u32 cpos, u32 phys_cpos, u32 len, |
324 | struct ocfs2_cached_dealloc_ctxt *dealloc) | 647 | struct ocfs2_xattr_set_ctxt *ctxt) |
325 | { | 648 | { |
326 | int ret; | 649 | int ret; |
327 | u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); | 650 | u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); |
328 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 651 | handle_t *handle = ctxt->handle; |
329 | struct inode *tl_inode = osb->osb_tl_inode; | ||
330 | handle_t *handle; | ||
331 | struct ocfs2_alloc_context *meta_ac = NULL; | ||
332 | struct ocfs2_extent_tree et; | 652 | struct ocfs2_extent_tree et; |
333 | 653 | ||
334 | ocfs2_init_xattr_value_extent_tree(&et, inode, root_bh, xv); | 654 | ocfs2_init_xattr_value_extent_tree(&et, inode, vb); |
335 | 655 | ||
336 | ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac); | 656 | ret = vb->vb_access(handle, inode, vb->vb_bh, |
657 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
337 | if (ret) { | 658 | if (ret) { |
338 | mlog_errno(ret); | 659 | mlog_errno(ret); |
339 | return ret; | ||
340 | } | ||
341 | |||
342 | mutex_lock(&tl_inode->i_mutex); | ||
343 | |||
344 | if (ocfs2_truncate_log_needs_flush(osb)) { | ||
345 | ret = __ocfs2_flush_truncate_log(osb); | ||
346 | if (ret < 0) { | ||
347 | mlog_errno(ret); | ||
348 | goto out; | ||
349 | } | ||
350 | } | ||
351 | |||
352 | handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS); | ||
353 | if (IS_ERR(handle)) { | ||
354 | ret = PTR_ERR(handle); | ||
355 | mlog_errno(ret); | ||
356 | goto out; | 660 | goto out; |
357 | } | 661 | } |
358 | 662 | ||
359 | ret = ocfs2_journal_access(handle, inode, root_bh, | 663 | ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, ctxt->meta_ac, |
360 | OCFS2_JOURNAL_ACCESS_WRITE); | 664 | &ctxt->dealloc); |
361 | if (ret) { | ||
362 | mlog_errno(ret); | ||
363 | goto out_commit; | ||
364 | } | ||
365 | |||
366 | ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac, | ||
367 | dealloc); | ||
368 | if (ret) { | 665 | if (ret) { |
369 | mlog_errno(ret); | 666 | mlog_errno(ret); |
370 | goto out_commit; | 667 | goto out; |
371 | } | 668 | } |
372 | 669 | ||
373 | le32_add_cpu(&xv->xr_clusters, -len); | 670 | le32_add_cpu(&vb->vb_xv->xr_clusters, -len); |
374 | 671 | ||
375 | ret = ocfs2_journal_dirty(handle, root_bh); | 672 | ret = ocfs2_journal_dirty(handle, vb->vb_bh); |
376 | if (ret) { | 673 | if (ret) { |
377 | mlog_errno(ret); | 674 | mlog_errno(ret); |
378 | goto out_commit; | 675 | goto out; |
379 | } | 676 | } |
380 | 677 | ||
381 | ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); | 678 | ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc, phys_blkno, len); |
382 | if (ret) | 679 | if (ret) |
383 | mlog_errno(ret); | 680 | mlog_errno(ret); |
384 | 681 | ||
385 | out_commit: | ||
386 | ocfs2_commit_trans(osb, handle); | ||
387 | out: | 682 | out: |
388 | mutex_unlock(&tl_inode->i_mutex); | ||
389 | |||
390 | if (meta_ac) | ||
391 | ocfs2_free_alloc_context(meta_ac); | ||
392 | |||
393 | return ret; | 683 | return ret; |
394 | } | 684 | } |
395 | 685 | ||
396 | static int ocfs2_xattr_shrink_size(struct inode *inode, | 686 | static int ocfs2_xattr_shrink_size(struct inode *inode, |
397 | u32 old_clusters, | 687 | u32 old_clusters, |
398 | u32 new_clusters, | 688 | u32 new_clusters, |
399 | struct buffer_head *root_bh, | 689 | struct ocfs2_xattr_value_buf *vb, |
400 | struct ocfs2_xattr_value_root *xv) | 690 | struct ocfs2_xattr_set_ctxt *ctxt) |
401 | { | 691 | { |
402 | int ret = 0; | 692 | int ret = 0; |
403 | u32 trunc_len, cpos, phys_cpos, alloc_size; | 693 | u32 trunc_len, cpos, phys_cpos, alloc_size; |
404 | u64 block; | 694 | u64 block; |
405 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
406 | struct ocfs2_cached_dealloc_ctxt dealloc; | ||
407 | |||
408 | ocfs2_init_dealloc_ctxt(&dealloc); | ||
409 | 695 | ||
410 | if (old_clusters <= new_clusters) | 696 | if (old_clusters <= new_clusters) |
411 | return 0; | 697 | return 0; |
@@ -414,7 +700,8 @@ static int ocfs2_xattr_shrink_size(struct inode *inode, | |||
414 | trunc_len = old_clusters - new_clusters; | 700 | trunc_len = old_clusters - new_clusters; |
415 | while (trunc_len) { | 701 | while (trunc_len) { |
416 | ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos, | 702 | ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos, |
417 | &alloc_size, &xv->xr_list); | 703 | &alloc_size, |
704 | &vb->vb_xv->xr_list); | ||
418 | if (ret) { | 705 | if (ret) { |
419 | mlog_errno(ret); | 706 | mlog_errno(ret); |
420 | goto out; | 707 | goto out; |
@@ -423,9 +710,9 @@ static int ocfs2_xattr_shrink_size(struct inode *inode, | |||
423 | if (alloc_size > trunc_len) | 710 | if (alloc_size > trunc_len) |
424 | alloc_size = trunc_len; | 711 | alloc_size = trunc_len; |
425 | 712 | ||
426 | ret = __ocfs2_remove_xattr_range(inode, root_bh, xv, cpos, | 713 | ret = __ocfs2_remove_xattr_range(inode, vb, cpos, |
427 | phys_cpos, alloc_size, | 714 | phys_cpos, alloc_size, |
428 | &dealloc); | 715 | ctxt); |
429 | if (ret) { | 716 | if (ret) { |
430 | mlog_errno(ret); | 717 | mlog_errno(ret); |
431 | goto out; | 718 | goto out; |
@@ -439,20 +726,17 @@ static int ocfs2_xattr_shrink_size(struct inode *inode, | |||
439 | } | 726 | } |
440 | 727 | ||
441 | out: | 728 | out: |
442 | ocfs2_schedule_truncate_log_flush(osb, 1); | ||
443 | ocfs2_run_deallocs(osb, &dealloc); | ||
444 | |||
445 | return ret; | 729 | return ret; |
446 | } | 730 | } |
447 | 731 | ||
448 | static int ocfs2_xattr_value_truncate(struct inode *inode, | 732 | static int ocfs2_xattr_value_truncate(struct inode *inode, |
449 | struct buffer_head *root_bh, | 733 | struct ocfs2_xattr_value_buf *vb, |
450 | struct ocfs2_xattr_value_root *xv, | 734 | int len, |
451 | int len) | 735 | struct ocfs2_xattr_set_ctxt *ctxt) |
452 | { | 736 | { |
453 | int ret; | 737 | int ret; |
454 | u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len); | 738 | u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len); |
455 | u32 old_clusters = le32_to_cpu(xv->xr_clusters); | 739 | u32 old_clusters = le32_to_cpu(vb->vb_xv->xr_clusters); |
456 | 740 | ||
457 | if (new_clusters == old_clusters) | 741 | if (new_clusters == old_clusters) |
458 | return 0; | 742 | return 0; |
@@ -460,11 +744,11 @@ static int ocfs2_xattr_value_truncate(struct inode *inode, | |||
460 | if (new_clusters > old_clusters) | 744 | if (new_clusters > old_clusters) |
461 | ret = ocfs2_xattr_extend_allocation(inode, | 745 | ret = ocfs2_xattr_extend_allocation(inode, |
462 | new_clusters - old_clusters, | 746 | new_clusters - old_clusters, |
463 | root_bh, xv); | 747 | vb, ctxt); |
464 | else | 748 | else |
465 | ret = ocfs2_xattr_shrink_size(inode, | 749 | ret = ocfs2_xattr_shrink_size(inode, |
466 | old_clusters, new_clusters, | 750 | old_clusters, new_clusters, |
467 | root_bh, xv); | 751 | vb, ctxt); |
468 | 752 | ||
469 | return ret; | 753 | return ret; |
470 | } | 754 | } |
@@ -554,18 +838,14 @@ static int ocfs2_xattr_block_list(struct inode *inode, | |||
554 | if (!di->i_xattr_loc) | 838 | if (!di->i_xattr_loc) |
555 | return ret; | 839 | return ret; |
556 | 840 | ||
557 | ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh); | 841 | ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc), |
842 | &blk_bh); | ||
558 | if (ret < 0) { | 843 | if (ret < 0) { |
559 | mlog_errno(ret); | 844 | mlog_errno(ret); |
560 | return ret; | 845 | return ret; |
561 | } | 846 | } |
562 | 847 | ||
563 | xb = (struct ocfs2_xattr_block *)blk_bh->b_data; | 848 | xb = (struct ocfs2_xattr_block *)blk_bh->b_data; |
564 | if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { | ||
565 | ret = -EIO; | ||
566 | goto cleanup; | ||
567 | } | ||
568 | |||
569 | if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { | 849 | if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { |
570 | struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header; | 850 | struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header; |
571 | ret = ocfs2_xattr_list_entries(inode, header, | 851 | ret = ocfs2_xattr_list_entries(inode, header, |
@@ -575,7 +855,7 @@ static int ocfs2_xattr_block_list(struct inode *inode, | |||
575 | ret = ocfs2_xattr_tree_list_index_block(inode, xt, | 855 | ret = ocfs2_xattr_tree_list_index_block(inode, xt, |
576 | buffer, buffer_size); | 856 | buffer, buffer_size); |
577 | } | 857 | } |
578 | cleanup: | 858 | |
579 | brelse(blk_bh); | 859 | brelse(blk_bh); |
580 | 860 | ||
581 | return ret; | 861 | return ret; |
@@ -685,7 +965,7 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode, | |||
685 | blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); | 965 | blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); |
686 | /* Copy ocfs2_xattr_value */ | 966 | /* Copy ocfs2_xattr_value */ |
687 | for (i = 0; i < num_clusters * bpc; i++, blkno++) { | 967 | for (i = 0; i < num_clusters * bpc; i++, blkno++) { |
688 | ret = ocfs2_read_block(inode, blkno, &bh); | 968 | ret = ocfs2_read_block(inode, blkno, &bh, NULL); |
689 | if (ret) { | 969 | if (ret) { |
690 | mlog_errno(ret); | 970 | mlog_errno(ret); |
691 | goto out; | 971 | goto out; |
@@ -769,7 +1049,12 @@ static int ocfs2_xattr_block_get(struct inode *inode, | |||
769 | size_t size; | 1049 | size_t size; |
770 | int ret = -ENODATA, name_offset, name_len, block_off, i; | 1050 | int ret = -ENODATA, name_offset, name_len, block_off, i; |
771 | 1051 | ||
772 | memset(&xs->bucket, 0, sizeof(xs->bucket)); | 1052 | xs->bucket = ocfs2_xattr_bucket_new(inode); |
1053 | if (!xs->bucket) { | ||
1054 | ret = -ENOMEM; | ||
1055 | mlog_errno(ret); | ||
1056 | goto cleanup; | ||
1057 | } | ||
773 | 1058 | ||
774 | ret = ocfs2_xattr_block_find(inode, name_index, name, xs); | 1059 | ret = ocfs2_xattr_block_find(inode, name_index, name, xs); |
775 | if (ret) { | 1060 | if (ret) { |
@@ -795,11 +1080,11 @@ static int ocfs2_xattr_block_get(struct inode *inode, | |||
795 | 1080 | ||
796 | if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { | 1081 | if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { |
797 | ret = ocfs2_xattr_bucket_get_name_value(inode, | 1082 | ret = ocfs2_xattr_bucket_get_name_value(inode, |
798 | xs->bucket.xh, | 1083 | bucket_xh(xs->bucket), |
799 | i, | 1084 | i, |
800 | &block_off, | 1085 | &block_off, |
801 | &name_offset); | 1086 | &name_offset); |
802 | xs->base = xs->bucket.bhs[block_off]->b_data; | 1087 | xs->base = bucket_block(xs->bucket, block_off); |
803 | } | 1088 | } |
804 | if (ocfs2_xattr_is_local(xs->here)) { | 1089 | if (ocfs2_xattr_is_local(xs->here)) { |
805 | memcpy(buffer, (void *)xs->base + | 1090 | memcpy(buffer, (void *)xs->base + |
@@ -817,21 +1102,15 @@ static int ocfs2_xattr_block_get(struct inode *inode, | |||
817 | } | 1102 | } |
818 | ret = size; | 1103 | ret = size; |
819 | cleanup: | 1104 | cleanup: |
820 | for (i = 0; i < OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET; i++) | 1105 | ocfs2_xattr_bucket_free(xs->bucket); |
821 | brelse(xs->bucket.bhs[i]); | ||
822 | memset(&xs->bucket, 0, sizeof(xs->bucket)); | ||
823 | 1106 | ||
824 | brelse(xs->xattr_bh); | 1107 | brelse(xs->xattr_bh); |
825 | xs->xattr_bh = NULL; | 1108 | xs->xattr_bh = NULL; |
826 | return ret; | 1109 | return ret; |
827 | } | 1110 | } |
828 | 1111 | ||
829 | /* ocfs2_xattr_get() | 1112 | int ocfs2_xattr_get_nolock(struct inode *inode, |
830 | * | 1113 | struct buffer_head *di_bh, |
831 | * Copy an extended attribute into the buffer provided. | ||
832 | * Buffer is NULL to compute the size of buffer required. | ||
833 | */ | ||
834 | static int ocfs2_xattr_get(struct inode *inode, | ||
835 | int name_index, | 1114 | int name_index, |
836 | const char *name, | 1115 | const char *name, |
837 | void *buffer, | 1116 | void *buffer, |
@@ -839,7 +1118,6 @@ static int ocfs2_xattr_get(struct inode *inode, | |||
839 | { | 1118 | { |
840 | int ret; | 1119 | int ret; |
841 | struct ocfs2_dinode *di = NULL; | 1120 | struct ocfs2_dinode *di = NULL; |
842 | struct buffer_head *di_bh = NULL; | ||
843 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | 1121 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
844 | struct ocfs2_xattr_search xis = { | 1122 | struct ocfs2_xattr_search xis = { |
845 | .not_found = -ENODATA, | 1123 | .not_found = -ENODATA, |
@@ -854,11 +1132,6 @@ static int ocfs2_xattr_get(struct inode *inode, | |||
854 | if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) | 1132 | if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) |
855 | ret = -ENODATA; | 1133 | ret = -ENODATA; |
856 | 1134 | ||
857 | ret = ocfs2_inode_lock(inode, &di_bh, 0); | ||
858 | if (ret < 0) { | ||
859 | mlog_errno(ret); | ||
860 | return ret; | ||
861 | } | ||
862 | xis.inode_bh = xbs.inode_bh = di_bh; | 1135 | xis.inode_bh = xbs.inode_bh = di_bh; |
863 | di = (struct ocfs2_dinode *)di_bh->b_data; | 1136 | di = (struct ocfs2_dinode *)di_bh->b_data; |
864 | 1137 | ||
@@ -869,6 +1142,32 @@ static int ocfs2_xattr_get(struct inode *inode, | |||
869 | ret = ocfs2_xattr_block_get(inode, name_index, name, buffer, | 1142 | ret = ocfs2_xattr_block_get(inode, name_index, name, buffer, |
870 | buffer_size, &xbs); | 1143 | buffer_size, &xbs); |
871 | up_read(&oi->ip_xattr_sem); | 1144 | up_read(&oi->ip_xattr_sem); |
1145 | |||
1146 | return ret; | ||
1147 | } | ||
1148 | |||
1149 | /* ocfs2_xattr_get() | ||
1150 | * | ||
1151 | * Copy an extended attribute into the buffer provided. | ||
1152 | * Buffer is NULL to compute the size of buffer required. | ||
1153 | */ | ||
1154 | static int ocfs2_xattr_get(struct inode *inode, | ||
1155 | int name_index, | ||
1156 | const char *name, | ||
1157 | void *buffer, | ||
1158 | size_t buffer_size) | ||
1159 | { | ||
1160 | int ret; | ||
1161 | struct buffer_head *di_bh = NULL; | ||
1162 | |||
1163 | ret = ocfs2_inode_lock(inode, &di_bh, 0); | ||
1164 | if (ret < 0) { | ||
1165 | mlog_errno(ret); | ||
1166 | return ret; | ||
1167 | } | ||
1168 | ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index, | ||
1169 | name, buffer, buffer_size); | ||
1170 | |||
872 | ocfs2_inode_unlock(inode, 0); | 1171 | ocfs2_inode_unlock(inode, 0); |
873 | 1172 | ||
874 | brelse(di_bh); | 1173 | brelse(di_bh); |
@@ -877,44 +1176,36 @@ static int ocfs2_xattr_get(struct inode *inode, | |||
877 | } | 1176 | } |
878 | 1177 | ||
879 | static int __ocfs2_xattr_set_value_outside(struct inode *inode, | 1178 | static int __ocfs2_xattr_set_value_outside(struct inode *inode, |
1179 | handle_t *handle, | ||
880 | struct ocfs2_xattr_value_root *xv, | 1180 | struct ocfs2_xattr_value_root *xv, |
881 | const void *value, | 1181 | const void *value, |
882 | int value_len) | 1182 | int value_len) |
883 | { | 1183 | { |
884 | int ret = 0, i, cp_len, credits; | 1184 | int ret = 0, i, cp_len; |
885 | u16 blocksize = inode->i_sb->s_blocksize; | 1185 | u16 blocksize = inode->i_sb->s_blocksize; |
886 | u32 p_cluster, num_clusters; | 1186 | u32 p_cluster, num_clusters; |
887 | u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); | 1187 | u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); |
888 | u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len); | 1188 | u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len); |
889 | u64 blkno; | 1189 | u64 blkno; |
890 | struct buffer_head *bh = NULL; | 1190 | struct buffer_head *bh = NULL; |
891 | handle_t *handle; | ||
892 | 1191 | ||
893 | BUG_ON(clusters > le32_to_cpu(xv->xr_clusters)); | 1192 | BUG_ON(clusters > le32_to_cpu(xv->xr_clusters)); |
894 | 1193 | ||
895 | credits = clusters * bpc; | ||
896 | handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), credits); | ||
897 | if (IS_ERR(handle)) { | ||
898 | ret = PTR_ERR(handle); | ||
899 | mlog_errno(ret); | ||
900 | goto out; | ||
901 | } | ||
902 | |||
903 | while (cpos < clusters) { | 1194 | while (cpos < clusters) { |
904 | ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, | 1195 | ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, |
905 | &num_clusters, &xv->xr_list); | 1196 | &num_clusters, &xv->xr_list); |
906 | if (ret) { | 1197 | if (ret) { |
907 | mlog_errno(ret); | 1198 | mlog_errno(ret); |
908 | goto out_commit; | 1199 | goto out; |
909 | } | 1200 | } |
910 | 1201 | ||
911 | blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); | 1202 | blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); |
912 | 1203 | ||
913 | for (i = 0; i < num_clusters * bpc; i++, blkno++) { | 1204 | for (i = 0; i < num_clusters * bpc; i++, blkno++) { |
914 | ret = ocfs2_read_block(inode, blkno, &bh); | 1205 | ret = ocfs2_read_block(inode, blkno, &bh, NULL); |
915 | if (ret) { | 1206 | if (ret) { |
916 | mlog_errno(ret); | 1207 | mlog_errno(ret); |
917 | goto out_commit; | 1208 | goto out; |
918 | } | 1209 | } |
919 | 1210 | ||
920 | ret = ocfs2_journal_access(handle, | 1211 | ret = ocfs2_journal_access(handle, |
@@ -923,7 +1214,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode, | |||
923 | OCFS2_JOURNAL_ACCESS_WRITE); | 1214 | OCFS2_JOURNAL_ACCESS_WRITE); |
924 | if (ret < 0) { | 1215 | if (ret < 0) { |
925 | mlog_errno(ret); | 1216 | mlog_errno(ret); |
926 | goto out_commit; | 1217 | goto out; |
927 | } | 1218 | } |
928 | 1219 | ||
929 | cp_len = value_len > blocksize ? blocksize : value_len; | 1220 | cp_len = value_len > blocksize ? blocksize : value_len; |
@@ -937,7 +1228,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode, | |||
937 | ret = ocfs2_journal_dirty(handle, bh); | 1228 | ret = ocfs2_journal_dirty(handle, bh); |
938 | if (ret < 0) { | 1229 | if (ret < 0) { |
939 | mlog_errno(ret); | 1230 | mlog_errno(ret); |
940 | goto out_commit; | 1231 | goto out; |
941 | } | 1232 | } |
942 | brelse(bh); | 1233 | brelse(bh); |
943 | bh = NULL; | 1234 | bh = NULL; |
@@ -951,8 +1242,6 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode, | |||
951 | } | 1242 | } |
952 | cpos += num_clusters; | 1243 | cpos += num_clusters; |
953 | } | 1244 | } |
954 | out_commit: | ||
955 | ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); | ||
956 | out: | 1245 | out: |
957 | brelse(bh); | 1246 | brelse(bh); |
958 | 1247 | ||
@@ -960,28 +1249,22 @@ out: | |||
960 | } | 1249 | } |
961 | 1250 | ||
962 | static int ocfs2_xattr_cleanup(struct inode *inode, | 1251 | static int ocfs2_xattr_cleanup(struct inode *inode, |
1252 | handle_t *handle, | ||
963 | struct ocfs2_xattr_info *xi, | 1253 | struct ocfs2_xattr_info *xi, |
964 | struct ocfs2_xattr_search *xs, | 1254 | struct ocfs2_xattr_search *xs, |
1255 | struct ocfs2_xattr_value_buf *vb, | ||
965 | size_t offs) | 1256 | size_t offs) |
966 | { | 1257 | { |
967 | handle_t *handle = NULL; | ||
968 | int ret = 0; | 1258 | int ret = 0; |
969 | size_t name_len = strlen(xi->name); | 1259 | size_t name_len = strlen(xi->name); |
970 | void *val = xs->base + offs; | 1260 | void *val = xs->base + offs; |
971 | size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; | 1261 | size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; |
972 | 1262 | ||
973 | handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), | 1263 | ret = vb->vb_access(handle, inode, vb->vb_bh, |
974 | OCFS2_XATTR_BLOCK_UPDATE_CREDITS); | 1264 | OCFS2_JOURNAL_ACCESS_WRITE); |
975 | if (IS_ERR(handle)) { | ||
976 | ret = PTR_ERR(handle); | ||
977 | mlog_errno(ret); | ||
978 | goto out; | ||
979 | } | ||
980 | ret = ocfs2_journal_access(handle, inode, xs->xattr_bh, | ||
981 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
982 | if (ret) { | 1265 | if (ret) { |
983 | mlog_errno(ret); | 1266 | mlog_errno(ret); |
984 | goto out_commit; | 1267 | goto out; |
985 | } | 1268 | } |
986 | /* Decrease xattr count */ | 1269 | /* Decrease xattr count */ |
987 | le16_add_cpu(&xs->header->xh_count, -1); | 1270 | le16_add_cpu(&xs->header->xh_count, -1); |
@@ -989,35 +1272,27 @@ static int ocfs2_xattr_cleanup(struct inode *inode, | |||
989 | memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry)); | 1272 | memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry)); |
990 | memset(val, 0, size); | 1273 | memset(val, 0, size); |
991 | 1274 | ||
992 | ret = ocfs2_journal_dirty(handle, xs->xattr_bh); | 1275 | ret = ocfs2_journal_dirty(handle, vb->vb_bh); |
993 | if (ret < 0) | 1276 | if (ret < 0) |
994 | mlog_errno(ret); | 1277 | mlog_errno(ret); |
995 | out_commit: | ||
996 | ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); | ||
997 | out: | 1278 | out: |
998 | return ret; | 1279 | return ret; |
999 | } | 1280 | } |
1000 | 1281 | ||
1001 | static int ocfs2_xattr_update_entry(struct inode *inode, | 1282 | static int ocfs2_xattr_update_entry(struct inode *inode, |
1283 | handle_t *handle, | ||
1002 | struct ocfs2_xattr_info *xi, | 1284 | struct ocfs2_xattr_info *xi, |
1003 | struct ocfs2_xattr_search *xs, | 1285 | struct ocfs2_xattr_search *xs, |
1286 | struct ocfs2_xattr_value_buf *vb, | ||
1004 | size_t offs) | 1287 | size_t offs) |
1005 | { | 1288 | { |
1006 | handle_t *handle = NULL; | 1289 | int ret; |
1007 | int ret = 0; | ||
1008 | 1290 | ||
1009 | handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), | 1291 | ret = vb->vb_access(handle, inode, vb->vb_bh, |
1010 | OCFS2_XATTR_BLOCK_UPDATE_CREDITS); | 1292 | OCFS2_JOURNAL_ACCESS_WRITE); |
1011 | if (IS_ERR(handle)) { | ||
1012 | ret = PTR_ERR(handle); | ||
1013 | mlog_errno(ret); | ||
1014 | goto out; | ||
1015 | } | ||
1016 | ret = ocfs2_journal_access(handle, inode, xs->xattr_bh, | ||
1017 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1018 | if (ret) { | 1293 | if (ret) { |
1019 | mlog_errno(ret); | 1294 | mlog_errno(ret); |
1020 | goto out_commit; | 1295 | goto out; |
1021 | } | 1296 | } |
1022 | 1297 | ||
1023 | xs->here->xe_name_offset = cpu_to_le16(offs); | 1298 | xs->here->xe_name_offset = cpu_to_le16(offs); |
@@ -1028,11 +1303,9 @@ static int ocfs2_xattr_update_entry(struct inode *inode, | |||
1028 | ocfs2_xattr_set_local(xs->here, 0); | 1303 | ocfs2_xattr_set_local(xs->here, 0); |
1029 | ocfs2_xattr_hash_entry(inode, xs->header, xs->here); | 1304 | ocfs2_xattr_hash_entry(inode, xs->header, xs->here); |
1030 | 1305 | ||
1031 | ret = ocfs2_journal_dirty(handle, xs->xattr_bh); | 1306 | ret = ocfs2_journal_dirty(handle, vb->vb_bh); |
1032 | if (ret < 0) | 1307 | if (ret < 0) |
1033 | mlog_errno(ret); | 1308 | mlog_errno(ret); |
1034 | out_commit: | ||
1035 | ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); | ||
1036 | out: | 1309 | out: |
1037 | return ret; | 1310 | return ret; |
1038 | } | 1311 | } |
@@ -1045,6 +1318,8 @@ out: | |||
1045 | static int ocfs2_xattr_set_value_outside(struct inode *inode, | 1318 | static int ocfs2_xattr_set_value_outside(struct inode *inode, |
1046 | struct ocfs2_xattr_info *xi, | 1319 | struct ocfs2_xattr_info *xi, |
1047 | struct ocfs2_xattr_search *xs, | 1320 | struct ocfs2_xattr_search *xs, |
1321 | struct ocfs2_xattr_set_ctxt *ctxt, | ||
1322 | struct ocfs2_xattr_value_buf *vb, | ||
1048 | size_t offs) | 1323 | size_t offs) |
1049 | { | 1324 | { |
1050 | size_t name_len = strlen(xi->name); | 1325 | size_t name_len = strlen(xi->name); |
@@ -1062,20 +1337,20 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode, | |||
1062 | xv->xr_list.l_tree_depth = 0; | 1337 | xv->xr_list.l_tree_depth = 0; |
1063 | xv->xr_list.l_count = cpu_to_le16(1); | 1338 | xv->xr_list.l_count = cpu_to_le16(1); |
1064 | xv->xr_list.l_next_free_rec = 0; | 1339 | xv->xr_list.l_next_free_rec = 0; |
1340 | vb->vb_xv = xv; | ||
1065 | 1341 | ||
1066 | ret = ocfs2_xattr_value_truncate(inode, xs->xattr_bh, xv, | 1342 | ret = ocfs2_xattr_value_truncate(inode, vb, xi->value_len, ctxt); |
1067 | xi->value_len); | ||
1068 | if (ret < 0) { | 1343 | if (ret < 0) { |
1069 | mlog_errno(ret); | 1344 | mlog_errno(ret); |
1070 | return ret; | 1345 | return ret; |
1071 | } | 1346 | } |
1072 | ret = __ocfs2_xattr_set_value_outside(inode, xv, xi->value, | 1347 | ret = ocfs2_xattr_update_entry(inode, ctxt->handle, xi, xs, vb, offs); |
1073 | xi->value_len); | ||
1074 | if (ret < 0) { | 1348 | if (ret < 0) { |
1075 | mlog_errno(ret); | 1349 | mlog_errno(ret); |
1076 | return ret; | 1350 | return ret; |
1077 | } | 1351 | } |
1078 | ret = ocfs2_xattr_update_entry(inode, xi, xs, offs); | 1352 | ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb->vb_xv, |
1353 | xi->value, xi->value_len); | ||
1079 | if (ret < 0) | 1354 | if (ret < 0) |
1080 | mlog_errno(ret); | 1355 | mlog_errno(ret); |
1081 | 1356 | ||
@@ -1195,6 +1470,7 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode, | |||
1195 | static int ocfs2_xattr_set_entry(struct inode *inode, | 1470 | static int ocfs2_xattr_set_entry(struct inode *inode, |
1196 | struct ocfs2_xattr_info *xi, | 1471 | struct ocfs2_xattr_info *xi, |
1197 | struct ocfs2_xattr_search *xs, | 1472 | struct ocfs2_xattr_search *xs, |
1473 | struct ocfs2_xattr_set_ctxt *ctxt, | ||
1198 | int flag) | 1474 | int flag) |
1199 | { | 1475 | { |
1200 | struct ocfs2_xattr_entry *last; | 1476 | struct ocfs2_xattr_entry *last; |
@@ -1202,7 +1478,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode, | |||
1202 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; | 1478 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; |
1203 | size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name); | 1479 | size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name); |
1204 | size_t size_l = 0; | 1480 | size_t size_l = 0; |
1205 | handle_t *handle = NULL; | 1481 | handle_t *handle = ctxt->handle; |
1206 | int free, i, ret; | 1482 | int free, i, ret; |
1207 | struct ocfs2_xattr_info xi_l = { | 1483 | struct ocfs2_xattr_info xi_l = { |
1208 | .name_index = xi->name_index, | 1484 | .name_index = xi->name_index, |
@@ -1210,6 +1486,16 @@ static int ocfs2_xattr_set_entry(struct inode *inode, | |||
1210 | .value = xi->value, | 1486 | .value = xi->value, |
1211 | .value_len = xi->value_len, | 1487 | .value_len = xi->value_len, |
1212 | }; | 1488 | }; |
1489 | struct ocfs2_xattr_value_buf vb = { | ||
1490 | .vb_bh = xs->xattr_bh, | ||
1491 | .vb_access = ocfs2_journal_access_di, | ||
1492 | }; | ||
1493 | |||
1494 | if (!(flag & OCFS2_INLINE_XATTR_FL)) { | ||
1495 | BUG_ON(xs->xattr_bh == xs->inode_bh); | ||
1496 | vb.vb_access = ocfs2_journal_access_xb; | ||
1497 | } else | ||
1498 | BUG_ON(xs->xattr_bh != xs->inode_bh); | ||
1213 | 1499 | ||
1214 | /* Compute min_offs, last and free space. */ | 1500 | /* Compute min_offs, last and free space. */ |
1215 | last = xs->header->xh_entries; | 1501 | last = xs->header->xh_entries; |
@@ -1265,15 +1551,14 @@ static int ocfs2_xattr_set_entry(struct inode *inode, | |||
1265 | if (ocfs2_xattr_is_local(xs->here) && size == size_l) { | 1551 | if (ocfs2_xattr_is_local(xs->here) && size == size_l) { |
1266 | /* Replace existing local xattr with tree root */ | 1552 | /* Replace existing local xattr with tree root */ |
1267 | ret = ocfs2_xattr_set_value_outside(inode, xi, xs, | 1553 | ret = ocfs2_xattr_set_value_outside(inode, xi, xs, |
1268 | offs); | 1554 | ctxt, &vb, offs); |
1269 | if (ret < 0) | 1555 | if (ret < 0) |
1270 | mlog_errno(ret); | 1556 | mlog_errno(ret); |
1271 | goto out; | 1557 | goto out; |
1272 | } else if (!ocfs2_xattr_is_local(xs->here)) { | 1558 | } else if (!ocfs2_xattr_is_local(xs->here)) { |
1273 | /* For existing xattr which has value outside */ | 1559 | /* For existing xattr which has value outside */ |
1274 | struct ocfs2_xattr_value_root *xv = NULL; | 1560 | vb.vb_xv = (struct ocfs2_xattr_value_root *) |
1275 | xv = (struct ocfs2_xattr_value_root *)(val + | 1561 | (val + OCFS2_XATTR_SIZE(name_len)); |
1276 | OCFS2_XATTR_SIZE(name_len)); | ||
1277 | 1562 | ||
1278 | if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { | 1563 | if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { |
1279 | /* | 1564 | /* |
@@ -1282,27 +1567,30 @@ static int ocfs2_xattr_set_entry(struct inode *inode, | |||
1282 | * then set new value with set_value_outside(). | 1567 | * then set new value with set_value_outside(). |
1283 | */ | 1568 | */ |
1284 | ret = ocfs2_xattr_value_truncate(inode, | 1569 | ret = ocfs2_xattr_value_truncate(inode, |
1285 | xs->xattr_bh, | 1570 | &vb, |
1286 | xv, | 1571 | xi->value_len, |
1287 | xi->value_len); | 1572 | ctxt); |
1288 | if (ret < 0) { | 1573 | if (ret < 0) { |
1289 | mlog_errno(ret); | 1574 | mlog_errno(ret); |
1290 | goto out; | 1575 | goto out; |
1291 | } | 1576 | } |
1292 | 1577 | ||
1293 | ret = __ocfs2_xattr_set_value_outside(inode, | 1578 | ret = ocfs2_xattr_update_entry(inode, |
1294 | xv, | 1579 | handle, |
1295 | xi->value, | 1580 | xi, |
1296 | xi->value_len); | 1581 | xs, |
1582 | &vb, | ||
1583 | offs); | ||
1297 | if (ret < 0) { | 1584 | if (ret < 0) { |
1298 | mlog_errno(ret); | 1585 | mlog_errno(ret); |
1299 | goto out; | 1586 | goto out; |
1300 | } | 1587 | } |
1301 | 1588 | ||
1302 | ret = ocfs2_xattr_update_entry(inode, | 1589 | ret = __ocfs2_xattr_set_value_outside(inode, |
1303 | xi, | 1590 | handle, |
1304 | xs, | 1591 | vb.vb_xv, |
1305 | offs); | 1592 | xi->value, |
1593 | xi->value_len); | ||
1306 | if (ret < 0) | 1594 | if (ret < 0) |
1307 | mlog_errno(ret); | 1595 | mlog_errno(ret); |
1308 | goto out; | 1596 | goto out; |
@@ -1312,44 +1600,28 @@ static int ocfs2_xattr_set_entry(struct inode *inode, | |||
1312 | * just truncate the old value to zero. | 1600 | * just truncate the old value to zero. |
1313 | */ | 1601 | */ |
1314 | ret = ocfs2_xattr_value_truncate(inode, | 1602 | ret = ocfs2_xattr_value_truncate(inode, |
1315 | xs->xattr_bh, | 1603 | &vb, |
1316 | xv, | 1604 | 0, |
1317 | 0); | 1605 | ctxt); |
1318 | if (ret < 0) | 1606 | if (ret < 0) |
1319 | mlog_errno(ret); | 1607 | mlog_errno(ret); |
1320 | } | 1608 | } |
1321 | } | 1609 | } |
1322 | } | 1610 | } |
1323 | 1611 | ||
1324 | handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), | 1612 | ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh, |
1325 | OCFS2_INODE_UPDATE_CREDITS); | 1613 | OCFS2_JOURNAL_ACCESS_WRITE); |
1326 | if (IS_ERR(handle)) { | ||
1327 | ret = PTR_ERR(handle); | ||
1328 | mlog_errno(ret); | ||
1329 | goto out; | ||
1330 | } | ||
1331 | |||
1332 | ret = ocfs2_journal_access(handle, inode, xs->inode_bh, | ||
1333 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1334 | if (ret) { | 1614 | if (ret) { |
1335 | mlog_errno(ret); | 1615 | mlog_errno(ret); |
1336 | goto out_commit; | 1616 | goto out; |
1337 | } | 1617 | } |
1338 | 1618 | ||
1339 | if (!(flag & OCFS2_INLINE_XATTR_FL)) { | 1619 | if (!(flag & OCFS2_INLINE_XATTR_FL)) { |
1340 | /* set extended attribute in external block. */ | 1620 | ret = vb.vb_access(handle, inode, vb.vb_bh, |
1341 | ret = ocfs2_extend_trans(handle, | 1621 | OCFS2_JOURNAL_ACCESS_WRITE); |
1342 | OCFS2_INODE_UPDATE_CREDITS + | ||
1343 | OCFS2_XATTR_BLOCK_UPDATE_CREDITS); | ||
1344 | if (ret) { | ||
1345 | mlog_errno(ret); | ||
1346 | goto out_commit; | ||
1347 | } | ||
1348 | ret = ocfs2_journal_access(handle, inode, xs->xattr_bh, | ||
1349 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1350 | if (ret) { | 1622 | if (ret) { |
1351 | mlog_errno(ret); | 1623 | mlog_errno(ret); |
1352 | goto out_commit; | 1624 | goto out; |
1353 | } | 1625 | } |
1354 | } | 1626 | } |
1355 | 1627 | ||
@@ -1363,7 +1635,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode, | |||
1363 | ret = ocfs2_journal_dirty(handle, xs->xattr_bh); | 1635 | ret = ocfs2_journal_dirty(handle, xs->xattr_bh); |
1364 | if (ret < 0) { | 1636 | if (ret < 0) { |
1365 | mlog_errno(ret); | 1637 | mlog_errno(ret); |
1366 | goto out_commit; | 1638 | goto out; |
1367 | } | 1639 | } |
1368 | } | 1640 | } |
1369 | 1641 | ||
@@ -1391,25 +1663,19 @@ static int ocfs2_xattr_set_entry(struct inode *inode, | |||
1391 | oi->ip_dyn_features |= flag; | 1663 | oi->ip_dyn_features |= flag; |
1392 | di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); | 1664 | di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); |
1393 | spin_unlock(&oi->ip_lock); | 1665 | spin_unlock(&oi->ip_lock); |
1394 | /* Update inode ctime */ | ||
1395 | inode->i_ctime = CURRENT_TIME; | ||
1396 | di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); | ||
1397 | di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); | ||
1398 | 1666 | ||
1399 | ret = ocfs2_journal_dirty(handle, xs->inode_bh); | 1667 | ret = ocfs2_journal_dirty(handle, xs->inode_bh); |
1400 | if (ret < 0) | 1668 | if (ret < 0) |
1401 | mlog_errno(ret); | 1669 | mlog_errno(ret); |
1402 | 1670 | ||
1403 | out_commit: | ||
1404 | ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); | ||
1405 | |||
1406 | if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { | 1671 | if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { |
1407 | /* | 1672 | /* |
1408 | * Set value outside in B tree. | 1673 | * Set value outside in B tree. |
1409 | * This is the second step for value size > INLINE_SIZE. | 1674 | * This is the second step for value size > INLINE_SIZE. |
1410 | */ | 1675 | */ |
1411 | size_t offs = le16_to_cpu(xs->here->xe_name_offset); | 1676 | size_t offs = le16_to_cpu(xs->here->xe_name_offset); |
1412 | ret = ocfs2_xattr_set_value_outside(inode, xi, xs, offs); | 1677 | ret = ocfs2_xattr_set_value_outside(inode, xi, xs, ctxt, |
1678 | &vb, offs); | ||
1413 | if (ret < 0) { | 1679 | if (ret < 0) { |
1414 | int ret2; | 1680 | int ret2; |
1415 | 1681 | ||
@@ -1418,41 +1684,56 @@ out_commit: | |||
1418 | * If set value outside failed, we have to clean | 1684 | * If set value outside failed, we have to clean |
1419 | * the junk tree root we have already set in local. | 1685 | * the junk tree root we have already set in local. |
1420 | */ | 1686 | */ |
1421 | ret2 = ocfs2_xattr_cleanup(inode, xi, xs, offs); | 1687 | ret2 = ocfs2_xattr_cleanup(inode, ctxt->handle, |
1688 | xi, xs, &vb, offs); | ||
1422 | if (ret2 < 0) | 1689 | if (ret2 < 0) |
1423 | mlog_errno(ret2); | 1690 | mlog_errno(ret2); |
1424 | } | 1691 | } |
1425 | } | 1692 | } |
1426 | out: | 1693 | out: |
1427 | return ret; | 1694 | return ret; |
1428 | |||
1429 | } | 1695 | } |
1430 | 1696 | ||
1431 | static int ocfs2_remove_value_outside(struct inode*inode, | 1697 | static int ocfs2_remove_value_outside(struct inode*inode, |
1432 | struct buffer_head *bh, | 1698 | struct ocfs2_xattr_value_buf *vb, |
1433 | struct ocfs2_xattr_header *header) | 1699 | struct ocfs2_xattr_header *header) |
1434 | { | 1700 | { |
1435 | int ret = 0, i; | 1701 | int ret = 0, i; |
1702 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1703 | struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, }; | ||
1704 | |||
1705 | ocfs2_init_dealloc_ctxt(&ctxt.dealloc); | ||
1706 | |||
1707 | ctxt.handle = ocfs2_start_trans(osb, | ||
1708 | ocfs2_remove_extent_credits(osb->sb)); | ||
1709 | if (IS_ERR(ctxt.handle)) { | ||
1710 | ret = PTR_ERR(ctxt.handle); | ||
1711 | mlog_errno(ret); | ||
1712 | goto out; | ||
1713 | } | ||
1436 | 1714 | ||
1437 | for (i = 0; i < le16_to_cpu(header->xh_count); i++) { | 1715 | for (i = 0; i < le16_to_cpu(header->xh_count); i++) { |
1438 | struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; | 1716 | struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; |
1439 | 1717 | ||
1440 | if (!ocfs2_xattr_is_local(entry)) { | 1718 | if (!ocfs2_xattr_is_local(entry)) { |
1441 | struct ocfs2_xattr_value_root *xv; | ||
1442 | void *val; | 1719 | void *val; |
1443 | 1720 | ||
1444 | val = (void *)header + | 1721 | val = (void *)header + |
1445 | le16_to_cpu(entry->xe_name_offset); | 1722 | le16_to_cpu(entry->xe_name_offset); |
1446 | xv = (struct ocfs2_xattr_value_root *) | 1723 | vb->vb_xv = (struct ocfs2_xattr_value_root *) |
1447 | (val + OCFS2_XATTR_SIZE(entry->xe_name_len)); | 1724 | (val + OCFS2_XATTR_SIZE(entry->xe_name_len)); |
1448 | ret = ocfs2_xattr_value_truncate(inode, bh, xv, 0); | 1725 | ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt); |
1449 | if (ret < 0) { | 1726 | if (ret < 0) { |
1450 | mlog_errno(ret); | 1727 | mlog_errno(ret); |
1451 | return ret; | 1728 | break; |
1452 | } | 1729 | } |
1453 | } | 1730 | } |
1454 | } | 1731 | } |
1455 | 1732 | ||
1733 | ocfs2_commit_trans(osb, ctxt.handle); | ||
1734 | ocfs2_schedule_truncate_log_flush(osb, 1); | ||
1735 | ocfs2_run_deallocs(osb, &ctxt.dealloc); | ||
1736 | out: | ||
1456 | return ret; | 1737 | return ret; |
1457 | } | 1738 | } |
1458 | 1739 | ||
@@ -1463,12 +1744,16 @@ static int ocfs2_xattr_ibody_remove(struct inode *inode, | |||
1463 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; | 1744 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; |
1464 | struct ocfs2_xattr_header *header; | 1745 | struct ocfs2_xattr_header *header; |
1465 | int ret; | 1746 | int ret; |
1747 | struct ocfs2_xattr_value_buf vb = { | ||
1748 | .vb_bh = di_bh, | ||
1749 | .vb_access = ocfs2_journal_access_di, | ||
1750 | }; | ||
1466 | 1751 | ||
1467 | header = (struct ocfs2_xattr_header *) | 1752 | header = (struct ocfs2_xattr_header *) |
1468 | ((void *)di + inode->i_sb->s_blocksize - | 1753 | ((void *)di + inode->i_sb->s_blocksize - |
1469 | le16_to_cpu(di->i_xattr_inline_size)); | 1754 | le16_to_cpu(di->i_xattr_inline_size)); |
1470 | 1755 | ||
1471 | ret = ocfs2_remove_value_outside(inode, di_bh, header); | 1756 | ret = ocfs2_remove_value_outside(inode, &vb, header); |
1472 | 1757 | ||
1473 | return ret; | 1758 | return ret; |
1474 | } | 1759 | } |
@@ -1478,11 +1763,15 @@ static int ocfs2_xattr_block_remove(struct inode *inode, | |||
1478 | { | 1763 | { |
1479 | struct ocfs2_xattr_block *xb; | 1764 | struct ocfs2_xattr_block *xb; |
1480 | int ret = 0; | 1765 | int ret = 0; |
1766 | struct ocfs2_xattr_value_buf vb = { | ||
1767 | .vb_bh = blk_bh, | ||
1768 | .vb_access = ocfs2_journal_access_xb, | ||
1769 | }; | ||
1481 | 1770 | ||
1482 | xb = (struct ocfs2_xattr_block *)blk_bh->b_data; | 1771 | xb = (struct ocfs2_xattr_block *)blk_bh->b_data; |
1483 | if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { | 1772 | if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { |
1484 | struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header); | 1773 | struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header); |
1485 | ret = ocfs2_remove_value_outside(inode, blk_bh, header); | 1774 | ret = ocfs2_remove_value_outside(inode, &vb, header); |
1486 | } else | 1775 | } else |
1487 | ret = ocfs2_delete_xattr_index_block(inode, blk_bh); | 1776 | ret = ocfs2_delete_xattr_index_block(inode, blk_bh); |
1488 | 1777 | ||
@@ -1502,24 +1791,19 @@ static int ocfs2_xattr_free_block(struct inode *inode, | |||
1502 | u64 blk, bg_blkno; | 1791 | u64 blk, bg_blkno; |
1503 | u16 bit; | 1792 | u16 bit; |
1504 | 1793 | ||
1505 | ret = ocfs2_read_block(inode, block, &blk_bh); | 1794 | ret = ocfs2_read_xattr_block(inode, block, &blk_bh); |
1506 | if (ret < 0) { | 1795 | if (ret < 0) { |
1507 | mlog_errno(ret); | 1796 | mlog_errno(ret); |
1508 | goto out; | 1797 | goto out; |
1509 | } | 1798 | } |
1510 | 1799 | ||
1511 | xb = (struct ocfs2_xattr_block *)blk_bh->b_data; | ||
1512 | if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { | ||
1513 | ret = -EIO; | ||
1514 | goto out; | ||
1515 | } | ||
1516 | |||
1517 | ret = ocfs2_xattr_block_remove(inode, blk_bh); | 1800 | ret = ocfs2_xattr_block_remove(inode, blk_bh); |
1518 | if (ret < 0) { | 1801 | if (ret < 0) { |
1519 | mlog_errno(ret); | 1802 | mlog_errno(ret); |
1520 | goto out; | 1803 | goto out; |
1521 | } | 1804 | } |
1522 | 1805 | ||
1806 | xb = (struct ocfs2_xattr_block *)blk_bh->b_data; | ||
1523 | blk = le64_to_cpu(xb->xb_blkno); | 1807 | blk = le64_to_cpu(xb->xb_blkno); |
1524 | bit = le16_to_cpu(xb->xb_suballoc_bit); | 1808 | bit = le16_to_cpu(xb->xb_suballoc_bit); |
1525 | bg_blkno = ocfs2_which_suballoc_group(blk, bit); | 1809 | bg_blkno = ocfs2_which_suballoc_group(blk, bit); |
@@ -1606,8 +1890,8 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) | |||
1606 | mlog_errno(ret); | 1890 | mlog_errno(ret); |
1607 | goto out; | 1891 | goto out; |
1608 | } | 1892 | } |
1609 | ret = ocfs2_journal_access(handle, inode, di_bh, | 1893 | ret = ocfs2_journal_access_di(handle, inode, di_bh, |
1610 | OCFS2_JOURNAL_ACCESS_WRITE); | 1894 | OCFS2_JOURNAL_ACCESS_WRITE); |
1611 | if (ret) { | 1895 | if (ret) { |
1612 | mlog_errno(ret); | 1896 | mlog_errno(ret); |
1613 | goto out_commit; | 1897 | goto out_commit; |
@@ -1714,7 +1998,8 @@ static int ocfs2_xattr_ibody_find(struct inode *inode, | |||
1714 | */ | 1998 | */ |
1715 | static int ocfs2_xattr_ibody_set(struct inode *inode, | 1999 | static int ocfs2_xattr_ibody_set(struct inode *inode, |
1716 | struct ocfs2_xattr_info *xi, | 2000 | struct ocfs2_xattr_info *xi, |
1717 | struct ocfs2_xattr_search *xs) | 2001 | struct ocfs2_xattr_search *xs, |
2002 | struct ocfs2_xattr_set_ctxt *ctxt) | ||
1718 | { | 2003 | { |
1719 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | 2004 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
1720 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; | 2005 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; |
@@ -1731,7 +2016,7 @@ static int ocfs2_xattr_ibody_set(struct inode *inode, | |||
1731 | } | 2016 | } |
1732 | } | 2017 | } |
1733 | 2018 | ||
1734 | ret = ocfs2_xattr_set_entry(inode, xi, xs, | 2019 | ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt, |
1735 | (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL)); | 2020 | (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL)); |
1736 | out: | 2021 | out: |
1737 | up_write(&oi->ip_alloc_sem); | 2022 | up_write(&oi->ip_alloc_sem); |
@@ -1758,19 +2043,15 @@ static int ocfs2_xattr_block_find(struct inode *inode, | |||
1758 | if (!di->i_xattr_loc) | 2043 | if (!di->i_xattr_loc) |
1759 | return ret; | 2044 | return ret; |
1760 | 2045 | ||
1761 | ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh); | 2046 | ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc), |
2047 | &blk_bh); | ||
1762 | if (ret < 0) { | 2048 | if (ret < 0) { |
1763 | mlog_errno(ret); | 2049 | mlog_errno(ret); |
1764 | return ret; | 2050 | return ret; |
1765 | } | 2051 | } |
1766 | 2052 | ||
1767 | xb = (struct ocfs2_xattr_block *)blk_bh->b_data; | ||
1768 | if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { | ||
1769 | ret = -EIO; | ||
1770 | goto cleanup; | ||
1771 | } | ||
1772 | |||
1773 | xs->xattr_bh = blk_bh; | 2053 | xs->xattr_bh = blk_bh; |
2054 | xb = (struct ocfs2_xattr_block *)blk_bh->b_data; | ||
1774 | 2055 | ||
1775 | if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { | 2056 | if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { |
1776 | xs->header = &xb->xb_attrs.xb_header; | 2057 | xs->header = &xb->xb_attrs.xb_header; |
@@ -1804,13 +2085,13 @@ cleanup: | |||
1804 | */ | 2085 | */ |
1805 | static int ocfs2_xattr_block_set(struct inode *inode, | 2086 | static int ocfs2_xattr_block_set(struct inode *inode, |
1806 | struct ocfs2_xattr_info *xi, | 2087 | struct ocfs2_xattr_info *xi, |
1807 | struct ocfs2_xattr_search *xs) | 2088 | struct ocfs2_xattr_search *xs, |
2089 | struct ocfs2_xattr_set_ctxt *ctxt) | ||
1808 | { | 2090 | { |
1809 | struct buffer_head *new_bh = NULL; | 2091 | struct buffer_head *new_bh = NULL; |
1810 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 2092 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
1811 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; | 2093 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; |
1812 | struct ocfs2_alloc_context *meta_ac = NULL; | 2094 | handle_t *handle = ctxt->handle; |
1813 | handle_t *handle = NULL; | ||
1814 | struct ocfs2_xattr_block *xblk = NULL; | 2095 | struct ocfs2_xattr_block *xblk = NULL; |
1815 | u16 suballoc_bit_start; | 2096 | u16 suballoc_bit_start; |
1816 | u32 num_got; | 2097 | u32 num_got; |
@@ -1818,45 +2099,29 @@ static int ocfs2_xattr_block_set(struct inode *inode, | |||
1818 | int ret; | 2099 | int ret; |
1819 | 2100 | ||
1820 | if (!xs->xattr_bh) { | 2101 | if (!xs->xattr_bh) { |
1821 | /* | 2102 | ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh, |
1822 | * Alloc one external block for extended attribute | 2103 | OCFS2_JOURNAL_ACCESS_CREATE); |
1823 | * outside of inode. | ||
1824 | */ | ||
1825 | ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); | ||
1826 | if (ret < 0) { | 2104 | if (ret < 0) { |
1827 | mlog_errno(ret); | 2105 | mlog_errno(ret); |
1828 | goto out; | 2106 | goto end; |
1829 | } | ||
1830 | handle = ocfs2_start_trans(osb, | ||
1831 | OCFS2_XATTR_BLOCK_CREATE_CREDITS); | ||
1832 | if (IS_ERR(handle)) { | ||
1833 | ret = PTR_ERR(handle); | ||
1834 | mlog_errno(ret); | ||
1835 | goto out; | ||
1836 | } | ||
1837 | ret = ocfs2_journal_access(handle, inode, xs->inode_bh, | ||
1838 | OCFS2_JOURNAL_ACCESS_CREATE); | ||
1839 | if (ret < 0) { | ||
1840 | mlog_errno(ret); | ||
1841 | goto out_commit; | ||
1842 | } | 2107 | } |
1843 | 2108 | ||
1844 | ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, | 2109 | ret = ocfs2_claim_metadata(osb, handle, ctxt->meta_ac, 1, |
1845 | &suballoc_bit_start, &num_got, | 2110 | &suballoc_bit_start, &num_got, |
1846 | &first_blkno); | 2111 | &first_blkno); |
1847 | if (ret < 0) { | 2112 | if (ret < 0) { |
1848 | mlog_errno(ret); | 2113 | mlog_errno(ret); |
1849 | goto out_commit; | 2114 | goto end; |
1850 | } | 2115 | } |
1851 | 2116 | ||
1852 | new_bh = sb_getblk(inode->i_sb, first_blkno); | 2117 | new_bh = sb_getblk(inode->i_sb, first_blkno); |
1853 | ocfs2_set_new_buffer_uptodate(inode, new_bh); | 2118 | ocfs2_set_new_buffer_uptodate(inode, new_bh); |
1854 | 2119 | ||
1855 | ret = ocfs2_journal_access(handle, inode, new_bh, | 2120 | ret = ocfs2_journal_access_xb(handle, inode, new_bh, |
1856 | OCFS2_JOURNAL_ACCESS_CREATE); | 2121 | OCFS2_JOURNAL_ACCESS_CREATE); |
1857 | if (ret < 0) { | 2122 | if (ret < 0) { |
1858 | mlog_errno(ret); | 2123 | mlog_errno(ret); |
1859 | goto out_commit; | 2124 | goto end; |
1860 | } | 2125 | } |
1861 | 2126 | ||
1862 | /* Initialize ocfs2_xattr_block */ | 2127 | /* Initialize ocfs2_xattr_block */ |
@@ -1874,44 +2139,555 @@ static int ocfs2_xattr_block_set(struct inode *inode, | |||
1874 | xs->end = (void *)xblk + inode->i_sb->s_blocksize; | 2139 | xs->end = (void *)xblk + inode->i_sb->s_blocksize; |
1875 | xs->here = xs->header->xh_entries; | 2140 | xs->here = xs->header->xh_entries; |
1876 | 2141 | ||
1877 | |||
1878 | ret = ocfs2_journal_dirty(handle, new_bh); | 2142 | ret = ocfs2_journal_dirty(handle, new_bh); |
1879 | if (ret < 0) { | 2143 | if (ret < 0) { |
1880 | mlog_errno(ret); | 2144 | mlog_errno(ret); |
1881 | goto out_commit; | 2145 | goto end; |
1882 | } | 2146 | } |
1883 | di->i_xattr_loc = cpu_to_le64(first_blkno); | 2147 | di->i_xattr_loc = cpu_to_le64(first_blkno); |
1884 | ret = ocfs2_journal_dirty(handle, xs->inode_bh); | 2148 | ocfs2_journal_dirty(handle, xs->inode_bh); |
1885 | if (ret < 0) | ||
1886 | mlog_errno(ret); | ||
1887 | out_commit: | ||
1888 | ocfs2_commit_trans(osb, handle); | ||
1889 | out: | ||
1890 | if (meta_ac) | ||
1891 | ocfs2_free_alloc_context(meta_ac); | ||
1892 | if (ret < 0) | ||
1893 | return ret; | ||
1894 | } else | 2149 | } else |
1895 | xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; | 2150 | xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; |
1896 | 2151 | ||
1897 | if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) { | 2152 | if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) { |
1898 | /* Set extended attribute into external block */ | 2153 | /* Set extended attribute into external block */ |
1899 | ret = ocfs2_xattr_set_entry(inode, xi, xs, OCFS2_HAS_XATTR_FL); | 2154 | ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt, |
2155 | OCFS2_HAS_XATTR_FL); | ||
1900 | if (!ret || ret != -ENOSPC) | 2156 | if (!ret || ret != -ENOSPC) |
1901 | goto end; | 2157 | goto end; |
1902 | 2158 | ||
1903 | ret = ocfs2_xattr_create_index_block(inode, xs); | 2159 | ret = ocfs2_xattr_create_index_block(inode, xs, ctxt); |
1904 | if (ret) | 2160 | if (ret) |
1905 | goto end; | 2161 | goto end; |
1906 | } | 2162 | } |
1907 | 2163 | ||
1908 | ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs); | 2164 | ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt); |
1909 | 2165 | ||
1910 | end: | 2166 | end: |
1911 | 2167 | ||
1912 | return ret; | 2168 | return ret; |
1913 | } | 2169 | } |
1914 | 2170 | ||
2171 | /* Check whether the new xattr can be inserted into the inode. */ | ||
2172 | static int ocfs2_xattr_can_be_in_inode(struct inode *inode, | ||
2173 | struct ocfs2_xattr_info *xi, | ||
2174 | struct ocfs2_xattr_search *xs) | ||
2175 | { | ||
2176 | u64 value_size; | ||
2177 | struct ocfs2_xattr_entry *last; | ||
2178 | int free, i; | ||
2179 | size_t min_offs = xs->end - xs->base; | ||
2180 | |||
2181 | if (!xs->header) | ||
2182 | return 0; | ||
2183 | |||
2184 | last = xs->header->xh_entries; | ||
2185 | |||
2186 | for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) { | ||
2187 | size_t offs = le16_to_cpu(last->xe_name_offset); | ||
2188 | if (offs < min_offs) | ||
2189 | min_offs = offs; | ||
2190 | last += 1; | ||
2191 | } | ||
2192 | |||
2193 | free = min_offs - ((void *)last - xs->base) - sizeof(__u32); | ||
2194 | if (free < 0) | ||
2195 | return 0; | ||
2196 | |||
2197 | BUG_ON(!xs->not_found); | ||
2198 | |||
2199 | if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) | ||
2200 | value_size = OCFS2_XATTR_ROOT_SIZE; | ||
2201 | else | ||
2202 | value_size = OCFS2_XATTR_SIZE(xi->value_len); | ||
2203 | |||
2204 | if (free >= sizeof(struct ocfs2_xattr_entry) + | ||
2205 | OCFS2_XATTR_SIZE(strlen(xi->name)) + value_size) | ||
2206 | return 1; | ||
2207 | |||
2208 | return 0; | ||
2209 | } | ||
2210 | |||
2211 | static int ocfs2_calc_xattr_set_need(struct inode *inode, | ||
2212 | struct ocfs2_dinode *di, | ||
2213 | struct ocfs2_xattr_info *xi, | ||
2214 | struct ocfs2_xattr_search *xis, | ||
2215 | struct ocfs2_xattr_search *xbs, | ||
2216 | int *clusters_need, | ||
2217 | int *meta_need, | ||
2218 | int *credits_need) | ||
2219 | { | ||
2220 | int ret = 0, old_in_xb = 0; | ||
2221 | int clusters_add = 0, meta_add = 0, credits = 0; | ||
2222 | struct buffer_head *bh = NULL; | ||
2223 | struct ocfs2_xattr_block *xb = NULL; | ||
2224 | struct ocfs2_xattr_entry *xe = NULL; | ||
2225 | struct ocfs2_xattr_value_root *xv = NULL; | ||
2226 | char *base = NULL; | ||
2227 | int name_offset, name_len = 0; | ||
2228 | u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, | ||
2229 | xi->value_len); | ||
2230 | u64 value_size; | ||
2231 | |||
2232 | /* | ||
2233 | * Calculate the clusters we need to write. | ||
2234 | * No matter whether we replace an old one or add a new one, | ||
2235 | * we need this for writing. | ||
2236 | */ | ||
2237 | if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) | ||
2238 | credits += new_clusters * | ||
2239 | ocfs2_clusters_to_blocks(inode->i_sb, 1); | ||
2240 | |||
2241 | if (xis->not_found && xbs->not_found) { | ||
2242 | credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); | ||
2243 | |||
2244 | if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { | ||
2245 | clusters_add += new_clusters; | ||
2246 | credits += ocfs2_calc_extend_credits(inode->i_sb, | ||
2247 | &def_xv.xv.xr_list, | ||
2248 | new_clusters); | ||
2249 | } | ||
2250 | |||
2251 | goto meta_guess; | ||
2252 | } | ||
2253 | |||
2254 | if (!xis->not_found) { | ||
2255 | xe = xis->here; | ||
2256 | name_offset = le16_to_cpu(xe->xe_name_offset); | ||
2257 | name_len = OCFS2_XATTR_SIZE(xe->xe_name_len); | ||
2258 | base = xis->base; | ||
2259 | credits += OCFS2_INODE_UPDATE_CREDITS; | ||
2260 | } else { | ||
2261 | int i, block_off = 0; | ||
2262 | xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data; | ||
2263 | xe = xbs->here; | ||
2264 | name_offset = le16_to_cpu(xe->xe_name_offset); | ||
2265 | name_len = OCFS2_XATTR_SIZE(xe->xe_name_len); | ||
2266 | i = xbs->here - xbs->header->xh_entries; | ||
2267 | old_in_xb = 1; | ||
2268 | |||
2269 | if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { | ||
2270 | ret = ocfs2_xattr_bucket_get_name_value(inode, | ||
2271 | bucket_xh(xbs->bucket), | ||
2272 | i, &block_off, | ||
2273 | &name_offset); | ||
2274 | base = bucket_block(xbs->bucket, block_off); | ||
2275 | credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); | ||
2276 | } else { | ||
2277 | base = xbs->base; | ||
2278 | credits += OCFS2_XATTR_BLOCK_UPDATE_CREDITS; | ||
2279 | } | ||
2280 | } | ||
2281 | |||
2282 | /* | ||
2283 | * Deleting an xattr doesn't need metadata or cluster allocation, | ||
2284 | * so just calculate the credits and return. | ||
2285 | * | ||
2286 | * The credits for removing the value tree will be extended | ||
2287 | * by ocfs2_remove_extent itself. | ||
2288 | */ | ||
2289 | if (!xi->value) { | ||
2290 | if (!ocfs2_xattr_is_local(xe)) | ||
2291 | credits += ocfs2_remove_extent_credits(inode->i_sb); | ||
2292 | |||
2293 | goto out; | ||
2294 | } | ||
2295 | |||
2296 | /* do cluster allocation guess first. */ | ||
2297 | value_size = le64_to_cpu(xe->xe_value_size); | ||
2298 | |||
2299 | if (old_in_xb) { | ||
2300 | /* | ||
2301 | * In xattr set, we always try to set the xe in inode first, | ||
2302 | * so if it can be inserted into the inode successfully, the old | ||
2303 | * one will be removed from the xattr block, and this xattr | ||
2304 | * will be inserted into the inode as a new in-inode xattr. | ||
2305 | */ | ||
2306 | if (ocfs2_xattr_can_be_in_inode(inode, xi, xis)) { | ||
2307 | clusters_add += new_clusters; | ||
2308 | credits += ocfs2_remove_extent_credits(inode->i_sb) + | ||
2309 | OCFS2_INODE_UPDATE_CREDITS; | ||
2310 | if (!ocfs2_xattr_is_local(xe)) | ||
2311 | credits += ocfs2_calc_extend_credits( | ||
2312 | inode->i_sb, | ||
2313 | &def_xv.xv.xr_list, | ||
2314 | new_clusters); | ||
2315 | goto out; | ||
2316 | } | ||
2317 | } | ||
2318 | |||
2319 | if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { | ||
2320 | /* the new values will be stored outside. */ | ||
2321 | u32 old_clusters = 0; | ||
2322 | |||
2323 | if (!ocfs2_xattr_is_local(xe)) { | ||
2324 | old_clusters = ocfs2_clusters_for_bytes(inode->i_sb, | ||
2325 | value_size); | ||
2326 | xv = (struct ocfs2_xattr_value_root *) | ||
2327 | (base + name_offset + name_len); | ||
2328 | value_size = OCFS2_XATTR_ROOT_SIZE; | ||
2329 | } else | ||
2330 | xv = &def_xv.xv; | ||
2331 | |||
2332 | if (old_clusters >= new_clusters) { | ||
2333 | credits += ocfs2_remove_extent_credits(inode->i_sb); | ||
2334 | goto out; | ||
2335 | } else { | ||
2336 | meta_add += ocfs2_extend_meta_needed(&xv->xr_list); | ||
2337 | clusters_add += new_clusters - old_clusters; | ||
2338 | credits += ocfs2_calc_extend_credits(inode->i_sb, | ||
2339 | &xv->xr_list, | ||
2340 | new_clusters - | ||
2341 | old_clusters); | ||
2342 | if (value_size >= OCFS2_XATTR_ROOT_SIZE) | ||
2343 | goto out; | ||
2344 | } | ||
2345 | } else { | ||
2346 | /* | ||
2347 | * Now the new value will be stored inside. So if the new | ||
2348 | * value is smaller than the size of value root or the old | ||
2349 | * value, we don't need any allocation, otherwise we have | ||
2350 | * to guess metadata allocation. | ||
2351 | */ | ||
2352 | if ((ocfs2_xattr_is_local(xe) && value_size >= xi->value_len) || | ||
2353 | (!ocfs2_xattr_is_local(xe) && | ||
2354 | OCFS2_XATTR_ROOT_SIZE >= xi->value_len)) | ||
2355 | goto out; | ||
2356 | } | ||
2357 | |||
2358 | meta_guess: | ||
2359 | /* calculate metadata allocation. */ | ||
2360 | if (di->i_xattr_loc) { | ||
2361 | if (!xbs->xattr_bh) { | ||
2362 | ret = ocfs2_read_xattr_block(inode, | ||
2363 | le64_to_cpu(di->i_xattr_loc), | ||
2364 | &bh); | ||
2365 | if (ret) { | ||
2366 | mlog_errno(ret); | ||
2367 | goto out; | ||
2368 | } | ||
2369 | |||
2370 | xb = (struct ocfs2_xattr_block *)bh->b_data; | ||
2371 | } else | ||
2372 | xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data; | ||
2373 | |||
2374 | /* | ||
2375 | * If there is already an xattr tree, good, we can calculate | ||
2376 | * like other b-trees. Otherwise we may need to create | ||
2377 | * a tree; the credit calculation is borrowed from | ||
2378 | * ocfs2_calc_extend_credits with root_el = NULL. And the | ||
2379 | * new tree will be cluster based, so no meta is needed. | ||
2380 | */ | ||
2381 | if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { | ||
2382 | struct ocfs2_extent_list *el = | ||
2383 | &xb->xb_attrs.xb_root.xt_list; | ||
2384 | meta_add += ocfs2_extend_meta_needed(el); | ||
2385 | credits += ocfs2_calc_extend_credits(inode->i_sb, | ||
2386 | el, 1); | ||
2387 | } else | ||
2388 | credits += OCFS2_SUBALLOC_ALLOC + 1; | ||
2389 | |||
2390 | /* | ||
2391 | * This cluster will be used either for new bucket or for | ||
2392 | * new xattr block. | ||
2393 | * If the cluster size is the same as the bucket size, one | ||
2394 | * more is needed since we may need to extend the bucket | ||
2395 | * also. | ||
2396 | */ | ||
2397 | clusters_add += 1; | ||
2398 | credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); | ||
2399 | if (OCFS2_XATTR_BUCKET_SIZE == | ||
2400 | OCFS2_SB(inode->i_sb)->s_clustersize) { | ||
2401 | credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); | ||
2402 | clusters_add += 1; | ||
2403 | } | ||
2404 | } else { | ||
2405 | meta_add += 1; | ||
2406 | credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; | ||
2407 | } | ||
2408 | out: | ||
2409 | if (clusters_need) | ||
2410 | *clusters_need = clusters_add; | ||
2411 | if (meta_need) | ||
2412 | *meta_need = meta_add; | ||
2413 | if (credits_need) | ||
2414 | *credits_need = credits; | ||
2415 | brelse(bh); | ||
2416 | return ret; | ||
2417 | } | ||
2418 | |||
2419 | static int ocfs2_init_xattr_set_ctxt(struct inode *inode, | ||
2420 | struct ocfs2_dinode *di, | ||
2421 | struct ocfs2_xattr_info *xi, | ||
2422 | struct ocfs2_xattr_search *xis, | ||
2423 | struct ocfs2_xattr_search *xbs, | ||
2424 | struct ocfs2_xattr_set_ctxt *ctxt, | ||
2425 | int *credits) | ||
2426 | { | ||
2427 | int clusters_add, meta_add, ret; | ||
2428 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
2429 | |||
2430 | memset(ctxt, 0, sizeof(struct ocfs2_xattr_set_ctxt)); | ||
2431 | |||
2432 | ocfs2_init_dealloc_ctxt(&ctxt->dealloc); | ||
2433 | |||
2434 | ret = ocfs2_calc_xattr_set_need(inode, di, xi, xis, xbs, | ||
2435 | &clusters_add, &meta_add, credits); | ||
2436 | if (ret) { | ||
2437 | mlog_errno(ret); | ||
2438 | return ret; | ||
2439 | } | ||
2440 | |||
2441 | mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, " | ||
2442 | "credits = %d\n", xi->name, meta_add, clusters_add, *credits); | ||
2443 | |||
2444 | if (meta_add) { | ||
2445 | ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, | ||
2446 | &ctxt->meta_ac); | ||
2447 | if (ret) { | ||
2448 | mlog_errno(ret); | ||
2449 | goto out; | ||
2450 | } | ||
2451 | } | ||
2452 | |||
2453 | if (clusters_add) { | ||
2454 | ret = ocfs2_reserve_clusters(osb, clusters_add, &ctxt->data_ac); | ||
2455 | if (ret) | ||
2456 | mlog_errno(ret); | ||
2457 | } | ||
2458 | out: | ||
2459 | if (ret) { | ||
2460 | if (ctxt->meta_ac) { | ||
2461 | ocfs2_free_alloc_context(ctxt->meta_ac); | ||
2462 | ctxt->meta_ac = NULL; | ||
2463 | } | ||
2464 | |||
2465 | /* | ||
2466 | * We cannot have an error and a non-NULL ctxt->data_ac. | ||
2467 | */ | ||
2468 | } | ||
2469 | |||
2470 | return ret; | ||
2471 | } | ||
2472 | |||
2473 | static int __ocfs2_xattr_set_handle(struct inode *inode, | ||
2474 | struct ocfs2_dinode *di, | ||
2475 | struct ocfs2_xattr_info *xi, | ||
2476 | struct ocfs2_xattr_search *xis, | ||
2477 | struct ocfs2_xattr_search *xbs, | ||
2478 | struct ocfs2_xattr_set_ctxt *ctxt) | ||
2479 | { | ||
2480 | int ret = 0, credits, old_found; | ||
2481 | |||
2482 | if (!xi->value) { | ||
2483 | /* Remove existing extended attribute */ | ||
2484 | if (!xis->not_found) | ||
2485 | ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt); | ||
2486 | else if (!xbs->not_found) | ||
2487 | ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt); | ||
2488 | } else { | ||
2489 | /* We always try to set the extended attribute in the inode first. */ | ||
2490 | ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt); | ||
2491 | if (!ret && !xbs->not_found) { | ||
2492 | /* | ||
2493 | * If that succeeded and the extended attribute also exists | ||
2494 | * in an external block, remove it from there. | ||
2495 | */ | ||
2496 | xi->value = NULL; | ||
2497 | xi->value_len = 0; | ||
2498 | |||
2499 | old_found = xis->not_found; | ||
2500 | xis->not_found = -ENODATA; | ||
2501 | ret = ocfs2_calc_xattr_set_need(inode, | ||
2502 | di, | ||
2503 | xi, | ||
2504 | xis, | ||
2505 | xbs, | ||
2506 | NULL, | ||
2507 | NULL, | ||
2508 | &credits); | ||
2509 | xis->not_found = old_found; | ||
2510 | if (ret) { | ||
2511 | mlog_errno(ret); | ||
2512 | goto out; | ||
2513 | } | ||
2514 | |||
2515 | ret = ocfs2_extend_trans(ctxt->handle, credits + | ||
2516 | ctxt->handle->h_buffer_credits); | ||
2517 | if (ret) { | ||
2518 | mlog_errno(ret); | ||
2519 | goto out; | ||
2520 | } | ||
2521 | ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt); | ||
2522 | } else if (ret == -ENOSPC) { | ||
2523 | if (di->i_xattr_loc && !xbs->xattr_bh) { | ||
2524 | ret = ocfs2_xattr_block_find(inode, | ||
2525 | xi->name_index, | ||
2526 | xi->name, xbs); | ||
2527 | if (ret) | ||
2528 | goto out; | ||
2529 | |||
2530 | old_found = xis->not_found; | ||
2531 | xis->not_found = -ENODATA; | ||
2532 | ret = ocfs2_calc_xattr_set_need(inode, | ||
2533 | di, | ||
2534 | xi, | ||
2535 | xis, | ||
2536 | xbs, | ||
2537 | NULL, | ||
2538 | NULL, | ||
2539 | &credits); | ||
2540 | xis->not_found = old_found; | ||
2541 | if (ret) { | ||
2542 | mlog_errno(ret); | ||
2543 | goto out; | ||
2544 | } | ||
2545 | |||
2546 | ret = ocfs2_extend_trans(ctxt->handle, credits + | ||
2547 | ctxt->handle->h_buffer_credits); | ||
2548 | if (ret) { | ||
2549 | mlog_errno(ret); | ||
2550 | goto out; | ||
2551 | } | ||
2552 | } | ||
2553 | /* | ||
2554 | * If there is no space in the inode, set the extended | ||
2555 | * attribute in an external block. | ||
2556 | */ | ||
2557 | ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt); | ||
2558 | if (ret) | ||
2559 | goto out; | ||
2560 | if (!xis->not_found) { | ||
2561 | /* | ||
2562 | * If that succeeded and the extended attribute | ||
2563 | * also exists in the inode, remove it from there. | ||
2564 | */ | ||
2565 | xi->value = NULL; | ||
2566 | xi->value_len = 0; | ||
2567 | xbs->not_found = -ENODATA; | ||
2568 | ret = ocfs2_calc_xattr_set_need(inode, | ||
2569 | di, | ||
2570 | xi, | ||
2571 | xis, | ||
2572 | xbs, | ||
2573 | NULL, | ||
2574 | NULL, | ||
2575 | &credits); | ||
2576 | if (ret) { | ||
2577 | mlog_errno(ret); | ||
2578 | goto out; | ||
2579 | } | ||
2580 | |||
2581 | ret = ocfs2_extend_trans(ctxt->handle, credits + | ||
2582 | ctxt->handle->h_buffer_credits); | ||
2583 | if (ret) { | ||
2584 | mlog_errno(ret); | ||
2585 | goto out; | ||
2586 | } | ||
2587 | ret = ocfs2_xattr_ibody_set(inode, xi, | ||
2588 | xis, ctxt); | ||
2589 | } | ||
2590 | } | ||
2591 | } | ||
2592 | |||
2593 | if (!ret) { | ||
2594 | /* Update inode ctime. */ | ||
2595 | ret = ocfs2_journal_access(ctxt->handle, inode, xis->inode_bh, | ||
2596 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
2597 | if (ret) { | ||
2598 | mlog_errno(ret); | ||
2599 | goto out; | ||
2600 | } | ||
2601 | |||
2602 | inode->i_ctime = CURRENT_TIME; | ||
2603 | di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); | ||
2604 | di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); | ||
2605 | ocfs2_journal_dirty(ctxt->handle, xis->inode_bh); | ||
2606 | } | ||
2607 | out: | ||
2608 | return ret; | ||
2609 | } | ||
2610 | |||
2611 | /* | ||
2612 | * This function is only called during inode creation, to | ||
2613 | * initialize the security/ACL xattrs of the new inode. | ||
2614 | * All transaction credits have been reserved in mknod. | ||
2615 | */ | ||
2616 | int ocfs2_xattr_set_handle(handle_t *handle, | ||
2617 | struct inode *inode, | ||
2618 | struct buffer_head *di_bh, | ||
2619 | int name_index, | ||
2620 | const char *name, | ||
2621 | const void *value, | ||
2622 | size_t value_len, | ||
2623 | int flags, | ||
2624 | struct ocfs2_alloc_context *meta_ac, | ||
2625 | struct ocfs2_alloc_context *data_ac) | ||
2626 | { | ||
2627 | struct ocfs2_dinode *di; | ||
2628 | int ret; | ||
2629 | |||
2630 | struct ocfs2_xattr_info xi = { | ||
2631 | .name_index = name_index, | ||
2632 | .name = name, | ||
2633 | .value = value, | ||
2634 | .value_len = value_len, | ||
2635 | }; | ||
2636 | |||
2637 | struct ocfs2_xattr_search xis = { | ||
2638 | .not_found = -ENODATA, | ||
2639 | }; | ||
2640 | |||
2641 | struct ocfs2_xattr_search xbs = { | ||
2642 | .not_found = -ENODATA, | ||
2643 | }; | ||
2644 | |||
2645 | struct ocfs2_xattr_set_ctxt ctxt = { | ||
2646 | .handle = handle, | ||
2647 | .meta_ac = meta_ac, | ||
2648 | .data_ac = data_ac, | ||
2649 | }; | ||
2650 | |||
2651 | if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) | ||
2652 | return -EOPNOTSUPP; | ||
2653 | |||
2654 | /* | ||
2655 | * In the extreme case of a very small block size we may | ||
2656 | * need an xattr bucket; the credits for the bucket have | ||
2657 | * already been reserved in mknod. | ||
2658 | */ | ||
2659 | if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) { | ||
2660 | xbs.bucket = ocfs2_xattr_bucket_new(inode); | ||
2661 | if (!xbs.bucket) { | ||
2662 | mlog_errno(-ENOMEM); | ||
2663 | return -ENOMEM; | ||
2664 | } | ||
2665 | } | ||
2666 | |||
2667 | xis.inode_bh = xbs.inode_bh = di_bh; | ||
2668 | di = (struct ocfs2_dinode *)di_bh->b_data; | ||
2669 | |||
2670 | down_write(&OCFS2_I(inode)->ip_xattr_sem); | ||
2671 | |||
2672 | ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis); | ||
2673 | if (ret) | ||
2674 | goto cleanup; | ||
2675 | if (xis.not_found) { | ||
2676 | ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs); | ||
2677 | if (ret) | ||
2678 | goto cleanup; | ||
2679 | } | ||
2680 | |||
2681 | ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt); | ||
2682 | |||
2683 | cleanup: | ||
2684 | up_write(&OCFS2_I(inode)->ip_xattr_sem); | ||
2685 | brelse(xbs.xattr_bh); | ||
2686 | ocfs2_xattr_bucket_free(xbs.bucket); | ||
2687 | |||
2688 | return ret; | ||
2689 | } | ||
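/*
 * A hedged usage sketch for ocfs2_xattr_set_handle(): at mknod time the
 * caller already holds a running transaction and the meta_ac/data_ac
 * reservations, and simply attaches e.g. a security xattr to the fresh
 * inode. The name_index constant and the attribute name used here are
 * illustrative assumptions, not taken from this patch.
 */
ret = ocfs2_xattr_set_handle(handle, inode, new_di_bh,
			     OCFS2_XATTR_INDEX_SECURITY,
			     "selinux", value, value_len, 0,
			     meta_ac, data_ac);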
2690 | |||
1915 | /* | 2691 | /* |
1916 | * ocfs2_xattr_set() | 2692 | * ocfs2_xattr_set() |
1917 | * | 2693 | * |
@@ -1928,8 +2704,10 @@ int ocfs2_xattr_set(struct inode *inode, | |||
1928 | { | 2704 | { |
1929 | struct buffer_head *di_bh = NULL; | 2705 | struct buffer_head *di_bh = NULL; |
1930 | struct ocfs2_dinode *di; | 2706 | struct ocfs2_dinode *di; |
1931 | int ret; | 2707 | int ret, credits; |
1932 | u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); | 2708 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
2709 | struct inode *tl_inode = osb->osb_tl_inode; | ||
2710 | struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, }; | ||
1933 | 2711 | ||
1934 | struct ocfs2_xattr_info xi = { | 2712 | struct ocfs2_xattr_info xi = { |
1935 | .name_index = name_index, | 2713 | .name_index = name_index, |
@@ -1949,10 +2727,20 @@ int ocfs2_xattr_set(struct inode *inode, | |||
1949 | if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) | 2727 | if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) |
1950 | return -EOPNOTSUPP; | 2728 | return -EOPNOTSUPP; |
1951 | 2729 | ||
2730 | /* | ||
2731 | * Only xbs will be used on indexed trees. xis doesn't need a | ||
2732 | * bucket. | ||
2733 | */ | ||
2734 | xbs.bucket = ocfs2_xattr_bucket_new(inode); | ||
2735 | if (!xbs.bucket) { | ||
2736 | mlog_errno(-ENOMEM); | ||
2737 | return -ENOMEM; | ||
2738 | } | ||
2739 | |||
1952 | ret = ocfs2_inode_lock(inode, &di_bh, 1); | 2740 | ret = ocfs2_inode_lock(inode, &di_bh, 1); |
1953 | if (ret < 0) { | 2741 | if (ret < 0) { |
1954 | mlog_errno(ret); | 2742 | mlog_errno(ret); |
1955 | return ret; | 2743 | goto cleanup_nolock; |
1956 | } | 2744 | } |
1957 | xis.inode_bh = xbs.inode_bh = di_bh; | 2745 | xis.inode_bh = xbs.inode_bh = di_bh; |
1958 | di = (struct ocfs2_dinode *)di_bh->b_data; | 2746 | di = (struct ocfs2_dinode *)di_bh->b_data; |
@@ -1984,55 +2772,53 @@ int ocfs2_xattr_set(struct inode *inode, | |||
1984 | goto cleanup; | 2772 | goto cleanup; |
1985 | } | 2773 | } |
1986 | 2774 | ||
1987 | if (!value) { | 2775 | |
1988 | /* Remove existing extended attribute */ | 2776 | mutex_lock(&tl_inode->i_mutex); |
1989 | if (!xis.not_found) | 2777 | |
1990 | ret = ocfs2_xattr_ibody_set(inode, &xi, &xis); | 2778 | if (ocfs2_truncate_log_needs_flush(osb)) { |
1991 | else if (!xbs.not_found) | 2779 | ret = __ocfs2_flush_truncate_log(osb); |
1992 | ret = ocfs2_xattr_block_set(inode, &xi, &xbs); | 2780 | if (ret < 0) { |
1993 | } else { | 2781 | mutex_unlock(&tl_inode->i_mutex); |
1994 | /* We always try to set extended attribute into inode first*/ | 2782 | mlog_errno(ret); |
1995 | ret = ocfs2_xattr_ibody_set(inode, &xi, &xis); | 2783 | goto cleanup; |
1996 | if (!ret && !xbs.not_found) { | ||
1997 | /* | ||
1998 | * If succeed and that extended attribute existing in | ||
1999 | * external block, then we will remove it. | ||
2000 | */ | ||
2001 | xi.value = NULL; | ||
2002 | xi.value_len = 0; | ||
2003 | ret = ocfs2_xattr_block_set(inode, &xi, &xbs); | ||
2004 | } else if (ret == -ENOSPC) { | ||
2005 | if (di->i_xattr_loc && !xbs.xattr_bh) { | ||
2006 | ret = ocfs2_xattr_block_find(inode, name_index, | ||
2007 | name, &xbs); | ||
2008 | if (ret) | ||
2009 | goto cleanup; | ||
2010 | } | ||
2011 | /* | ||
2012 | * If no space in inode, we will set extended attribute | ||
2013 | * into external block. | ||
2014 | */ | ||
2015 | ret = ocfs2_xattr_block_set(inode, &xi, &xbs); | ||
2016 | if (ret) | ||
2017 | goto cleanup; | ||
2018 | if (!xis.not_found) { | ||
2019 | /* | ||
2020 | * If succeed and that extended attribute | ||
2021 | * existing in inode, we will remove it. | ||
2022 | */ | ||
2023 | xi.value = NULL; | ||
2024 | xi.value_len = 0; | ||
2025 | ret = ocfs2_xattr_ibody_set(inode, &xi, &xis); | ||
2026 | } | ||
2027 | } | 2784 | } |
2028 | } | 2785 | } |
2786 | mutex_unlock(&tl_inode->i_mutex); | ||
2787 | |||
2788 | ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis, | ||
2789 | &xbs, &ctxt, &credits); | ||
2790 | if (ret) { | ||
2791 | mlog_errno(ret); | ||
2792 | goto cleanup; | ||
2793 | } | ||
2794 | |||
2795 | /* We need to update the inode's ctime field, so add credits for it. */ | ||
2796 | credits += OCFS2_INODE_UPDATE_CREDITS; | ||
2797 | ctxt.handle = ocfs2_start_trans(osb, credits); | ||
2798 | if (IS_ERR(ctxt.handle)) { | ||
2799 | ret = PTR_ERR(ctxt.handle); | ||
2800 | mlog_errno(ret); | ||
2801 | goto cleanup; | ||
2802 | } | ||
2803 | |||
2804 | ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt); | ||
2805 | |||
2806 | ocfs2_commit_trans(osb, ctxt.handle); | ||
2807 | |||
2808 | if (ctxt.data_ac) | ||
2809 | ocfs2_free_alloc_context(ctxt.data_ac); | ||
2810 | if (ctxt.meta_ac) | ||
2811 | ocfs2_free_alloc_context(ctxt.meta_ac); | ||
2812 | if (ocfs2_dealloc_has_cluster(&ctxt.dealloc)) | ||
2813 | ocfs2_schedule_truncate_log_flush(osb, 1); | ||
2814 | ocfs2_run_deallocs(osb, &ctxt.dealloc); | ||
2029 | cleanup: | 2815 | cleanup: |
2030 | up_write(&OCFS2_I(inode)->ip_xattr_sem); | 2816 | up_write(&OCFS2_I(inode)->ip_xattr_sem); |
2031 | ocfs2_inode_unlock(inode, 1); | 2817 | ocfs2_inode_unlock(inode, 1); |
2818 | cleanup_nolock: | ||
2032 | brelse(di_bh); | 2819 | brelse(di_bh); |
2033 | brelse(xbs.xattr_bh); | 2820 | brelse(xbs.xattr_bh); |
2034 | for (i = 0; i < blk_per_bucket; i++) | 2821 | ocfs2_xattr_bucket_free(xbs.bucket); |
2035 | brelse(xbs.bucket.bhs[i]); | ||
2036 | 2822 | ||
2037 | return ret; | 2823 | return ret; |
2038 | } | 2824 | } |
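/*
 * Note on the error paths above: xbs.bucket is allocated before the
 * cluster lock is taken, so a failure in ocfs2_inode_lock() jumps to
 * cleanup_nolock, which releases the bucket and any buffer heads
 * without ever touching ip_xattr_sem or the inode lock.
 */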
@@ -2107,7 +2893,7 @@ typedef int (xattr_bucket_func)(struct inode *inode, | |||
2107 | void *para); | 2893 | void *para); |
2108 | 2894 | ||
2109 | static int ocfs2_find_xe_in_bucket(struct inode *inode, | 2895 | static int ocfs2_find_xe_in_bucket(struct inode *inode, |
2110 | struct buffer_head *header_bh, | 2896 | struct ocfs2_xattr_bucket *bucket, |
2111 | int name_index, | 2897 | int name_index, |
2112 | const char *name, | 2898 | const char *name, |
2113 | u32 name_hash, | 2899 | u32 name_hash, |
@@ -2115,11 +2901,9 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode, | |||
2115 | int *found) | 2901 | int *found) |
2116 | { | 2902 | { |
2117 | int i, ret = 0, cmp = 1, block_off, new_offset; | 2903 | int i, ret = 0, cmp = 1, block_off, new_offset; |
2118 | struct ocfs2_xattr_header *xh = | 2904 | struct ocfs2_xattr_header *xh = bucket_xh(bucket); |
2119 | (struct ocfs2_xattr_header *)header_bh->b_data; | ||
2120 | size_t name_len = strlen(name); | 2905 | size_t name_len = strlen(name); |
2121 | struct ocfs2_xattr_entry *xe = NULL; | 2906 | struct ocfs2_xattr_entry *xe = NULL; |
2122 | struct buffer_head *name_bh = NULL; | ||
2123 | char *xe_name; | 2907 | char *xe_name; |
2124 | 2908 | ||
2125 | /* | 2909 | /* |
@@ -2150,19 +2934,9 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode, | |||
2150 | break; | 2934 | break; |
2151 | } | 2935 | } |
2152 | 2936 | ||
2153 | ret = ocfs2_read_block(inode, header_bh->b_blocknr + block_off, | ||
2154 | &name_bh); | ||
2155 | if (ret) { | ||
2156 | mlog_errno(ret); | ||
2157 | break; | ||
2158 | } | ||
2159 | xe_name = name_bh->b_data + new_offset; | ||
2160 | 2937 | ||
2161 | cmp = memcmp(name, xe_name, name_len); | 2938 | xe_name = bucket_block(bucket, block_off) + new_offset; |
2162 | brelse(name_bh); | 2939 | if (!memcmp(name, xe_name, name_len)) { |
2163 | name_bh = NULL; | ||
2164 | |||
2165 | if (cmp == 0) { | ||
2166 | *xe_index = i; | 2940 | *xe_index = i; |
2167 | *found = 1; | 2941 | *found = 1; |
2168 | ret = 0; | 2942 | ret = 0; |
@@ -2192,39 +2966,42 @@ static int ocfs2_xattr_bucket_find(struct inode *inode, | |||
2192 | struct ocfs2_xattr_search *xs) | 2966 | struct ocfs2_xattr_search *xs) |
2193 | { | 2967 | { |
2194 | int ret, found = 0; | 2968 | int ret, found = 0; |
2195 | struct buffer_head *bh = NULL; | ||
2196 | struct buffer_head *lower_bh = NULL; | ||
2197 | struct ocfs2_xattr_header *xh = NULL; | 2969 | struct ocfs2_xattr_header *xh = NULL; |
2198 | struct ocfs2_xattr_entry *xe = NULL; | 2970 | struct ocfs2_xattr_entry *xe = NULL; |
2199 | u16 index = 0; | 2971 | u16 index = 0; |
2200 | u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); | 2972 | u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); |
2201 | int low_bucket = 0, bucket, high_bucket; | 2973 | int low_bucket = 0, bucket, high_bucket; |
2974 | struct ocfs2_xattr_bucket *search; | ||
2202 | u32 last_hash; | 2975 | u32 last_hash; |
2203 | u64 blkno; | 2976 | u64 blkno, lower_blkno = 0; |
2204 | 2977 | ||
2205 | ret = ocfs2_read_block(inode, p_blkno, &bh); | 2978 | search = ocfs2_xattr_bucket_new(inode); |
2979 | if (!search) { | ||
2980 | ret = -ENOMEM; | ||
2981 | mlog_errno(ret); | ||
2982 | goto out; | ||
2983 | } | ||
2984 | |||
2985 | ret = ocfs2_read_xattr_bucket(search, p_blkno); | ||
2206 | if (ret) { | 2986 | if (ret) { |
2207 | mlog_errno(ret); | 2987 | mlog_errno(ret); |
2208 | goto out; | 2988 | goto out; |
2209 | } | 2989 | } |
2210 | 2990 | ||
2211 | xh = (struct ocfs2_xattr_header *)bh->b_data; | 2991 | xh = bucket_xh(search); |
2212 | high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1; | 2992 | high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1; |
2213 | |||
2214 | while (low_bucket <= high_bucket) { | 2993 | while (low_bucket <= high_bucket) { |
2215 | brelse(bh); | 2994 | ocfs2_xattr_bucket_relse(search); |
2216 | bh = NULL; | ||
2217 | bucket = (low_bucket + high_bucket) / 2; | ||
2218 | 2995 | ||
2996 | bucket = (low_bucket + high_bucket) / 2; | ||
2219 | blkno = p_blkno + bucket * blk_per_bucket; | 2997 | blkno = p_blkno + bucket * blk_per_bucket; |
2220 | 2998 | ret = ocfs2_read_xattr_bucket(search, blkno); | |
2221 | ret = ocfs2_read_block(inode, blkno, &bh); | ||
2222 | if (ret) { | 2999 | if (ret) { |
2223 | mlog_errno(ret); | 3000 | mlog_errno(ret); |
2224 | goto out; | 3001 | goto out; |
2225 | } | 3002 | } |
2226 | 3003 | ||
2227 | xh = (struct ocfs2_xattr_header *)bh->b_data; | 3004 | xh = bucket_xh(search); |
2228 | xe = &xh->xh_entries[0]; | 3005 | xe = &xh->xh_entries[0]; |
2229 | if (name_hash < le32_to_cpu(xe->xe_name_hash)) { | 3006 | if (name_hash < le32_to_cpu(xe->xe_name_hash)) { |
2230 | high_bucket = bucket - 1; | 3007 | high_bucket = bucket - 1; |
@@ -2241,10 +3018,8 @@ static int ocfs2_xattr_bucket_find(struct inode *inode, | |||
2241 | 3018 | ||
2242 | last_hash = le32_to_cpu(xe->xe_name_hash); | 3019 | last_hash = le32_to_cpu(xe->xe_name_hash); |
2243 | 3020 | ||
2244 | /* record lower_bh which may be the insert place. */ | 3021 | /* record lower_blkno which may be the insert place. */ |
2245 | brelse(lower_bh); | 3022 | lower_blkno = blkno; |
2246 | lower_bh = bh; | ||
2247 | bh = NULL; | ||
2248 | 3023 | ||
2249 | if (name_hash > le32_to_cpu(xe->xe_name_hash)) { | 3024 | if (name_hash > le32_to_cpu(xe->xe_name_hash)) { |
2250 | low_bucket = bucket + 1; | 3025 | low_bucket = bucket + 1; |
@@ -2252,7 +3027,7 @@ static int ocfs2_xattr_bucket_find(struct inode *inode, | |||
2252 | } | 3027 | } |
2253 | 3028 | ||
2254 | /* the searched xattr should reside in this bucket if exists. */ | 3029 | /* the searched xattr should reside in this bucket if exists. */ |
2255 | ret = ocfs2_find_xe_in_bucket(inode, lower_bh, | 3030 | ret = ocfs2_find_xe_in_bucket(inode, search, |
2256 | name_index, name, name_hash, | 3031 | name_index, name, name_hash, |
2257 | &index, &found); | 3032 | &index, &found); |
2258 | if (ret) { | 3033 | if (ret) { |
@@ -2267,46 +3042,29 @@ static int ocfs2_xattr_bucket_find(struct inode *inode, | |||
2267 | * When the xattr's hash value falls in the gap between two buckets, | 3042 | * When the xattr's hash value falls in the gap between two buckets,
2268 | * we always place it in the previous bucket. | 3043 | * we always place it in the previous bucket.
2269 | */ | 3044 | */ |
2270 | if (!lower_bh) { | 3045 | if (!lower_blkno) |
2271 | /* | 3046 | lower_blkno = p_blkno; |
2272 | * We can't find any bucket whose first name_hash is less | 3047 | |
2273 | * than the find name_hash. | 3048 | /* This should be in cache - we just read it during the search */ |
2274 | */ | 3049 | ret = ocfs2_read_xattr_bucket(xs->bucket, lower_blkno); |
2275 | BUG_ON(bh->b_blocknr != p_blkno); | 3050 | if (ret) { |
2276 | lower_bh = bh; | 3051 | mlog_errno(ret); |
2277 | bh = NULL; | 3052 | goto out; |
2278 | } | 3053 | } |
2279 | xs->bucket.bhs[0] = lower_bh; | ||
2280 | xs->bucket.xh = (struct ocfs2_xattr_header *) | ||
2281 | xs->bucket.bhs[0]->b_data; | ||
2282 | lower_bh = NULL; | ||
2283 | 3054 | ||
2284 | xs->header = xs->bucket.xh; | 3055 | xs->header = bucket_xh(xs->bucket); |
2285 | xs->base = xs->bucket.bhs[0]->b_data; | 3056 | xs->base = bucket_block(xs->bucket, 0); |
2286 | xs->end = xs->base + inode->i_sb->s_blocksize; | 3057 | xs->end = xs->base + inode->i_sb->s_blocksize; |
2287 | 3058 | ||
2288 | if (found) { | 3059 | if (found) { |
2289 | /* | ||
2290 | * If we have found the xattr enty, read all the blocks in | ||
2291 | * this bucket. | ||
2292 | */ | ||
2293 | ret = ocfs2_read_blocks(inode, xs->bucket.bhs[0]->b_blocknr + 1, | ||
2294 | blk_per_bucket - 1, &xs->bucket.bhs[1], | ||
2295 | 0); | ||
2296 | if (ret) { | ||
2297 | mlog_errno(ret); | ||
2298 | goto out; | ||
2299 | } | ||
2300 | |||
2301 | xs->here = &xs->header->xh_entries[index]; | 3060 | xs->here = &xs->header->xh_entries[index]; |
2302 | mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name, | 3061 | mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name, |
2303 | (unsigned long long)xs->bucket.bhs[0]->b_blocknr, index); | 3062 | (unsigned long long)bucket_blkno(xs->bucket), index); |
2304 | } else | 3063 | } else |
2305 | ret = -ENODATA; | 3064 | ret = -ENODATA; |
2306 | 3065 | ||
2307 | out: | 3066 | out: |
2308 | brelse(bh); | 3067 | ocfs2_xattr_bucket_free(search); |
2309 | brelse(lower_bh); | ||
2310 | return ret; | 3068 | return ret; |
2311 | } | 3069 | } |
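/*
 * Worked example of the "gap" rule above (hash values are illustrative):
 * with buckets whose first hashes are 100, 300 and 500, a lookup for
 * name_hash 350 narrows the binary search to the bucket starting at 300;
 * lower_blkno remembers that bucket, so even though no entry matches,
 * xs->bucket is read from it and becomes the insertion target.
 */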
2312 | 3070 | ||
@@ -2357,53 +3115,50 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode, | |||
2357 | xattr_bucket_func *func, | 3115 | xattr_bucket_func *func, |
2358 | void *para) | 3116 | void *para) |
2359 | { | 3117 | { |
2360 | int i, j, ret = 0; | 3118 | int i, ret = 0; |
2361 | int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); | ||
2362 | u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)); | 3119 | u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)); |
2363 | u32 num_buckets = clusters * bpc; | 3120 | u32 num_buckets = clusters * bpc; |
2364 | struct ocfs2_xattr_bucket bucket; | 3121 | struct ocfs2_xattr_bucket *bucket; |
2365 | 3122 | ||
2366 | memset(&bucket, 0, sizeof(bucket)); | 3123 | bucket = ocfs2_xattr_bucket_new(inode); |
3124 | if (!bucket) { | ||
3125 | mlog_errno(-ENOMEM); | ||
3126 | return -ENOMEM; | ||
3127 | } | ||
2367 | 3128 | ||
2368 | mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n", | 3129 | mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n", |
2369 | clusters, (unsigned long long)blkno); | 3130 | clusters, (unsigned long long)blkno); |
2370 | 3131 | ||
2371 | for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) { | 3132 | for (i = 0; i < num_buckets; i++, blkno += bucket->bu_blocks) { |
2372 | ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, | 3133 | ret = ocfs2_read_xattr_bucket(bucket, blkno); |
2373 | bucket.bhs, 0); | ||
2374 | if (ret) { | 3134 | if (ret) { |
2375 | mlog_errno(ret); | 3135 | mlog_errno(ret); |
2376 | goto out; | 3136 | break; |
2377 | } | 3137 | } |
2378 | 3138 | ||
2379 | bucket.xh = (struct ocfs2_xattr_header *)bucket.bhs[0]->b_data; | ||
2380 | /* | 3139 | /* |
2381 | * The real bucket num in this series of blocks is stored | 3140 | * The real bucket num in this series of blocks is stored |
2382 | * in the 1st bucket. | 3141 | * in the 1st bucket. |
2383 | */ | 3142 | */ |
2384 | if (i == 0) | 3143 | if (i == 0) |
2385 | num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets); | 3144 | num_buckets = le16_to_cpu(bucket_xh(bucket)->xh_num_buckets); |
2386 | 3145 | ||
2387 | mlog(0, "iterating xattr bucket %llu, first hash %u\n", | 3146 | mlog(0, "iterating xattr bucket %llu, first hash %u\n", |
2388 | (unsigned long long)blkno, | 3147 | (unsigned long long)blkno, |
2389 | le32_to_cpu(bucket.xh->xh_entries[0].xe_name_hash)); | 3148 | le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash)); |
2390 | if (func) { | 3149 | if (func) { |
2391 | ret = func(inode, &bucket, para); | 3150 | ret = func(inode, bucket, para); |
2392 | if (ret) { | 3151 | if (ret) |
2393 | mlog_errno(ret); | 3152 | mlog_errno(ret); |
2394 | break; | 3153 | /* Fall through to bucket_relse() */ |
2395 | } | ||
2396 | } | 3154 | } |
2397 | 3155 | ||
2398 | for (j = 0; j < blk_per_bucket; j++) | 3156 | ocfs2_xattr_bucket_relse(bucket); |
2399 | brelse(bucket.bhs[j]); | 3157 | if (ret) |
2400 | memset(&bucket, 0, sizeof(bucket)); | 3158 | break; |
2401 | } | 3159 | } |
2402 | 3160 | ||
2403 | out: | 3161 | ocfs2_xattr_bucket_free(bucket); |
2404 | for (j = 0; j < blk_per_bucket; j++) | ||
2405 | brelse(bucket.bhs[j]); | ||
2406 | |||
2407 | return ret; | 3162 | return ret; |
2408 | } | 3163 | } |
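/*
 * A minimal sketch of an iteration callback matching xattr_bucket_func;
 * the function name and the message are illustrative assumptions.
 */
static int ocfs2_show_bucket(struct inode *inode,
			     struct ocfs2_xattr_bucket *bucket,
			     void *para)
{
	mlog(0, "bucket %llu holds %u entries\n",
	     (unsigned long long)bucket_blkno(bucket),
	     le16_to_cpu(bucket_xh(bucket)->xh_count));
	return 0;
}

/* ...which can then be handed straight to the iterator: */
ret = ocfs2_iterate_xattr_buckets(inode, blkno, clusters,
				  ocfs2_show_bucket, NULL);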
2409 | 3164 | ||
@@ -2441,21 +3196,21 @@ static int ocfs2_list_xattr_bucket(struct inode *inode, | |||
2441 | int i, block_off, new_offset; | 3196 | int i, block_off, new_offset; |
2442 | const char *prefix, *name; | 3197 | const char *prefix, *name; |
2443 | 3198 | ||
2444 | for (i = 0 ; i < le16_to_cpu(bucket->xh->xh_count); i++) { | 3199 | for (i = 0 ; i < le16_to_cpu(bucket_xh(bucket)->xh_count); i++) { |
2445 | struct ocfs2_xattr_entry *entry = &bucket->xh->xh_entries[i]; | 3200 | struct ocfs2_xattr_entry *entry = &bucket_xh(bucket)->xh_entries[i]; |
2446 | type = ocfs2_xattr_get_type(entry); | 3201 | type = ocfs2_xattr_get_type(entry); |
2447 | prefix = ocfs2_xattr_prefix(type); | 3202 | prefix = ocfs2_xattr_prefix(type); |
2448 | 3203 | ||
2449 | if (prefix) { | 3204 | if (prefix) { |
2450 | ret = ocfs2_xattr_bucket_get_name_value(inode, | 3205 | ret = ocfs2_xattr_bucket_get_name_value(inode, |
2451 | bucket->xh, | 3206 | bucket_xh(bucket), |
2452 | i, | 3207 | i, |
2453 | &block_off, | 3208 | &block_off, |
2454 | &new_offset); | 3209 | &new_offset); |
2455 | if (ret) | 3210 | if (ret) |
2456 | break; | 3211 | break; |
2457 | 3212 | ||
2458 | name = (const char *)bucket->bhs[block_off]->b_data + | 3213 | name = (const char *)bucket_block(bucket, block_off) + |
2459 | new_offset; | 3214 | new_offset; |
2460 | ret = ocfs2_xattr_list_entry(xl->buffer, | 3215 | ret = ocfs2_xattr_list_entry(xl->buffer, |
2461 | xl->buffer_size, | 3216 | xl->buffer_size, |
@@ -2540,32 +3295,34 @@ static void swap_xe(void *a, void *b, int size) | |||
2540 | /* | 3295 | /* |
2541 | * When the ocfs2_xattr_block is filled up, new bucket will be created | 3296 | * When the ocfs2_xattr_block is filled up, new bucket will be created |
2542 | * and all the xattr entries will be moved to the new bucket. | 3297 | * and all the xattr entries will be moved to the new bucket. |
3298 | * The header goes at the start of the bucket, and the names+values are | ||
3299 | * filled from the end. This is why *target starts as the last buffer. | ||
2543 | * Note: we need to sort the entries since they are not saved in order | 3300 | * Note: we need to sort the entries since they are not saved in order |
2544 | * in the ocfs2_xattr_block. | 3301 | * in the ocfs2_xattr_block. |
2545 | */ | 3302 | */ |
2546 | static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode, | 3303 | static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode, |
2547 | struct buffer_head *xb_bh, | 3304 | struct buffer_head *xb_bh, |
2548 | struct buffer_head *xh_bh, | 3305 | struct ocfs2_xattr_bucket *bucket) |
2549 | struct buffer_head *data_bh) | ||
2550 | { | 3306 | { |
2551 | int i, blocksize = inode->i_sb->s_blocksize; | 3307 | int i, blocksize = inode->i_sb->s_blocksize; |
3308 | int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb); | ||
2552 | u16 offset, size, off_change; | 3309 | u16 offset, size, off_change; |
2553 | struct ocfs2_xattr_entry *xe; | 3310 | struct ocfs2_xattr_entry *xe; |
2554 | struct ocfs2_xattr_block *xb = | 3311 | struct ocfs2_xattr_block *xb = |
2555 | (struct ocfs2_xattr_block *)xb_bh->b_data; | 3312 | (struct ocfs2_xattr_block *)xb_bh->b_data; |
2556 | struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header; | 3313 | struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header; |
2557 | struct ocfs2_xattr_header *xh = | 3314 | struct ocfs2_xattr_header *xh = bucket_xh(bucket); |
2558 | (struct ocfs2_xattr_header *)xh_bh->b_data; | ||
2559 | u16 count = le16_to_cpu(xb_xh->xh_count); | 3315 | u16 count = le16_to_cpu(xb_xh->xh_count); |
2560 | char *target = xh_bh->b_data, *src = xb_bh->b_data; | 3316 | char *src = xb_bh->b_data; |
3317 | char *target = bucket_block(bucket, blks - 1); | ||
2561 | 3318 | ||
2562 | mlog(0, "cp xattr from block %llu to bucket %llu\n", | 3319 | mlog(0, "cp xattr from block %llu to bucket %llu\n", |
2563 | (unsigned long long)xb_bh->b_blocknr, | 3320 | (unsigned long long)xb_bh->b_blocknr, |
2564 | (unsigned long long)xh_bh->b_blocknr); | 3321 | (unsigned long long)bucket_blkno(bucket)); |
3322 | |||
3323 | for (i = 0; i < blks; i++) | ||
3324 | memset(bucket_block(bucket, i), 0, blocksize); | ||
2565 | 3325 | ||
2566 | memset(xh_bh->b_data, 0, blocksize); | ||
2567 | if (data_bh) | ||
2568 | memset(data_bh->b_data, 0, blocksize); | ||
2569 | /* | 3326 | /* |
2570 | * Since the xe_name_offset is based on ocfs2_xattr_header, | 3327 | * Since the xe_name_offset is based on ocfs2_xattr_header, |
2571 | * there is a offset change corresponding to the change of | 3328 | * there is a offset change corresponding to the change of |
@@ -2577,8 +3334,6 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode, | |||
2577 | size = blocksize - offset; | 3334 | size = blocksize - offset; |
2578 | 3335 | ||
2579 | /* copy all the names and values. */ | 3336 | /* copy all the names and values. */ |
2580 | if (data_bh) | ||
2581 | target = data_bh->b_data; | ||
2582 | memcpy(target + offset, src + offset, size); | 3337 | memcpy(target + offset, src + offset, size); |
2583 | 3338 | ||
2584 | /* Init new header now. */ | 3339 | /* Init new header now. */ |
@@ -2588,7 +3343,7 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode, | |||
2588 | xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size); | 3343 | xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size); |
2589 | 3344 | ||
2590 | /* copy all the entries. */ | 3345 | /* copy all the entries. */ |
2591 | target = xh_bh->b_data; | 3346 | target = bucket_block(bucket, 0); |
2592 | offset = offsetof(struct ocfs2_xattr_header, xh_entries); | 3347 | offset = offsetof(struct ocfs2_xattr_header, xh_entries); |
2593 | size = count * sizeof(struct ocfs2_xattr_entry); | 3348 | size = count * sizeof(struct ocfs2_xattr_entry); |
2594 | memcpy(target + offset, (char *)xb_xh + offset, size); | 3349 | memcpy(target + offset, (char *)xb_xh + offset, size); |
@@ -2614,73 +3369,47 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode, | |||
2614 | * While if the entry is in index b-tree, "bucket" indicates the | 3369 | * While if the entry is in index b-tree, "bucket" indicates the |
2615 | * real place of the xattr. | 3370 | * real place of the xattr. |
2616 | */ | 3371 | */ |
2617 | static int ocfs2_xattr_update_xattr_search(struct inode *inode, | 3372 | static void ocfs2_xattr_update_xattr_search(struct inode *inode, |
2618 | struct ocfs2_xattr_search *xs, | 3373 | struct ocfs2_xattr_search *xs, |
2619 | struct buffer_head *old_bh, | 3374 | struct buffer_head *old_bh) |
2620 | struct buffer_head *new_bh) | ||
2621 | { | 3375 | { |
2622 | int ret = 0; | ||
2623 | char *buf = old_bh->b_data; | 3376 | char *buf = old_bh->b_data; |
2624 | struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf; | 3377 | struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf; |
2625 | struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header; | 3378 | struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header; |
2626 | int i, blocksize = inode->i_sb->s_blocksize; | 3379 | int i; |
2627 | u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); | ||
2628 | |||
2629 | xs->bucket.bhs[0] = new_bh; | ||
2630 | get_bh(new_bh); | ||
2631 | xs->bucket.xh = (struct ocfs2_xattr_header *)xs->bucket.bhs[0]->b_data; | ||
2632 | xs->header = xs->bucket.xh; | ||
2633 | 3380 | ||
2634 | xs->base = new_bh->b_data; | 3381 | xs->header = bucket_xh(xs->bucket); |
3382 | xs->base = bucket_block(xs->bucket, 0); | ||
2635 | xs->end = xs->base + inode->i_sb->s_blocksize; | 3383 | xs->end = xs->base + inode->i_sb->s_blocksize; |
2636 | 3384 | ||
2637 | if (!xs->not_found) { | 3385 | if (xs->not_found) |
2638 | if (OCFS2_XATTR_BUCKET_SIZE != blocksize) { | 3386 | return; |
2639 | ret = ocfs2_read_blocks(inode, | ||
2640 | xs->bucket.bhs[0]->b_blocknr + 1, | ||
2641 | blk_per_bucket - 1, &xs->bucket.bhs[1], | ||
2642 | 0); | ||
2643 | if (ret) { | ||
2644 | mlog_errno(ret); | ||
2645 | return ret; | ||
2646 | } | ||
2647 | |||
2648 | } | ||
2649 | i = xs->here - old_xh->xh_entries; | ||
2650 | xs->here = &xs->header->xh_entries[i]; | ||
2651 | } | ||
2652 | 3387 | ||
2653 | return ret; | 3388 | i = xs->here - old_xh->xh_entries; |
3389 | xs->here = &xs->header->xh_entries[i]; | ||
2654 | } | 3390 | } |
2655 | 3391 | ||
2656 | static int ocfs2_xattr_create_index_block(struct inode *inode, | 3392 | static int ocfs2_xattr_create_index_block(struct inode *inode, |
2657 | struct ocfs2_xattr_search *xs) | 3393 | struct ocfs2_xattr_search *xs, |
3394 | struct ocfs2_xattr_set_ctxt *ctxt) | ||
2658 | { | 3395 | { |
2659 | int ret, credits = OCFS2_SUBALLOC_ALLOC; | 3396 | int ret; |
2660 | u32 bit_off, len; | 3397 | u32 bit_off, len; |
2661 | u64 blkno; | 3398 | u64 blkno; |
2662 | handle_t *handle; | 3399 | handle_t *handle = ctxt->handle; |
2663 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 3400 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
2664 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | 3401 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
2665 | struct ocfs2_alloc_context *data_ac; | ||
2666 | struct buffer_head *xh_bh = NULL, *data_bh = NULL; | ||
2667 | struct buffer_head *xb_bh = xs->xattr_bh; | 3402 | struct buffer_head *xb_bh = xs->xattr_bh; |
2668 | struct ocfs2_xattr_block *xb = | 3403 | struct ocfs2_xattr_block *xb = |
2669 | (struct ocfs2_xattr_block *)xb_bh->b_data; | 3404 | (struct ocfs2_xattr_block *)xb_bh->b_data; |
2670 | struct ocfs2_xattr_tree_root *xr; | 3405 | struct ocfs2_xattr_tree_root *xr; |
2671 | u16 xb_flags = le16_to_cpu(xb->xb_flags); | 3406 | u16 xb_flags = le16_to_cpu(xb->xb_flags); |
2672 | u16 bpb = ocfs2_blocks_per_xattr_bucket(inode->i_sb); | ||
2673 | 3407 | ||
2674 | mlog(0, "create xattr index block for %llu\n", | 3408 | mlog(0, "create xattr index block for %llu\n", |
2675 | (unsigned long long)xb_bh->b_blocknr); | 3409 | (unsigned long long)xb_bh->b_blocknr); |
2676 | 3410 | ||
2677 | BUG_ON(xb_flags & OCFS2_XATTR_INDEXED); | 3411 | BUG_ON(xb_flags & OCFS2_XATTR_INDEXED); |
2678 | 3412 | BUG_ON(!xs->bucket); | |
2679 | ret = ocfs2_reserve_clusters(osb, 1, &data_ac); | ||
2680 | if (ret) { | ||
2681 | mlog_errno(ret); | ||
2682 | goto out; | ||
2683 | } | ||
2684 | 3413 | ||
2685 | /* | 3414 | /* |
2686 | * XXX: | 3415 | * XXX: |
@@ -2689,29 +3418,18 @@ static int ocfs2_xattr_create_index_block(struct inode *inode, | |||
2689 | */ | 3418 | */ |
2690 | down_write(&oi->ip_alloc_sem); | 3419 | down_write(&oi->ip_alloc_sem); |
2691 | 3420 | ||
2692 | /* | 3421 | ret = ocfs2_journal_access_xb(handle, inode, xb_bh, |
2693 | * 3 more credits, one for xattr block update, one for the 1st block | 3422 | OCFS2_JOURNAL_ACCESS_WRITE); |
2694 | * of the new xattr bucket and one for the value/data. | ||
2695 | */ | ||
2696 | credits += 3; | ||
2697 | handle = ocfs2_start_trans(osb, credits); | ||
2698 | if (IS_ERR(handle)) { | ||
2699 | ret = PTR_ERR(handle); | ||
2700 | mlog_errno(ret); | ||
2701 | goto out_sem; | ||
2702 | } | ||
2703 | |||
2704 | ret = ocfs2_journal_access(handle, inode, xb_bh, | ||
2705 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
2706 | if (ret) { | 3423 | if (ret) { |
2707 | mlog_errno(ret); | 3424 | mlog_errno(ret); |
2708 | goto out_commit; | 3425 | goto out; |
2709 | } | 3426 | } |
2710 | 3427 | ||
2711 | ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len); | 3428 | ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, |
3429 | 1, 1, &bit_off, &len); | ||
2712 | if (ret) { | 3430 | if (ret) { |
2713 | mlog_errno(ret); | 3431 | mlog_errno(ret); |
2714 | goto out_commit; | 3432 | goto out; |
2715 | } | 3433 | } |
2716 | 3434 | ||
2717 | /* | 3435 | /* |
@@ -2724,51 +3442,23 @@ static int ocfs2_xattr_create_index_block(struct inode *inode, | |||
2724 | mlog(0, "allocate 1 cluster from %llu to xattr block\n", | 3442 | mlog(0, "allocate 1 cluster from %llu to xattr block\n", |
2725 | (unsigned long long)blkno); | 3443 | (unsigned long long)blkno); |
2726 | 3444 | ||
2727 | xh_bh = sb_getblk(inode->i_sb, blkno); | 3445 | ret = ocfs2_init_xattr_bucket(xs->bucket, blkno); |
2728 | if (!xh_bh) { | 3446 | if (ret) { |
2729 | ret = -EIO; | ||
2730 | mlog_errno(ret); | 3447 | mlog_errno(ret); |
2731 | goto out_commit; | 3448 | goto out; |
2732 | } | 3449 | } |
2733 | 3450 | ||
2734 | ocfs2_set_new_buffer_uptodate(inode, xh_bh); | 3451 | ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket, |
2735 | 3452 | OCFS2_JOURNAL_ACCESS_CREATE); | |
2736 | ret = ocfs2_journal_access(handle, inode, xh_bh, | ||
2737 | OCFS2_JOURNAL_ACCESS_CREATE); | ||
2738 | if (ret) { | 3453 | if (ret) { |
2739 | mlog_errno(ret); | 3454 | mlog_errno(ret); |
2740 | goto out_commit; | 3455 | goto out; |
2741 | } | ||
2742 | |||
2743 | if (bpb > 1) { | ||
2744 | data_bh = sb_getblk(inode->i_sb, blkno + bpb - 1); | ||
2745 | if (!data_bh) { | ||
2746 | ret = -EIO; | ||
2747 | mlog_errno(ret); | ||
2748 | goto out_commit; | ||
2749 | } | ||
2750 | |||
2751 | ocfs2_set_new_buffer_uptodate(inode, data_bh); | ||
2752 | |||
2753 | ret = ocfs2_journal_access(handle, inode, data_bh, | ||
2754 | OCFS2_JOURNAL_ACCESS_CREATE); | ||
2755 | if (ret) { | ||
2756 | mlog_errno(ret); | ||
2757 | goto out_commit; | ||
2758 | } | ||
2759 | } | 3456 | } |
2760 | 3457 | ||
2761 | ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xh_bh, data_bh); | 3458 | ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xs->bucket); |
3459 | ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket); | ||
2762 | 3460 | ||
2763 | ocfs2_journal_dirty(handle, xh_bh); | 3461 | ocfs2_xattr_update_xattr_search(inode, xs, xb_bh); |
2764 | if (data_bh) | ||
2765 | ocfs2_journal_dirty(handle, data_bh); | ||
2766 | |||
2767 | ret = ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh); | ||
2768 | if (ret) { | ||
2769 | mlog_errno(ret); | ||
2770 | goto out_commit; | ||
2771 | } | ||
2772 | 3462 | ||
2773 | /* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */ | 3463 | /* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */ |
2774 | memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize - | 3464 | memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize - |
@@ -2787,24 +3477,10 @@ static int ocfs2_xattr_create_index_block(struct inode *inode, | |||
2787 | 3477 | ||
2788 | xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED); | 3478 | xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED); |
2789 | 3479 | ||
2790 | ret = ocfs2_journal_dirty(handle, xb_bh); | 3480 | ocfs2_journal_dirty(handle, xb_bh); |
2791 | if (ret) { | ||
2792 | mlog_errno(ret); | ||
2793 | goto out_commit; | ||
2794 | } | ||
2795 | |||
2796 | out_commit: | ||
2797 | ocfs2_commit_trans(osb, handle); | ||
2798 | |||
2799 | out_sem: | ||
2800 | up_write(&oi->ip_alloc_sem); | ||
2801 | 3481 | ||
2802 | out: | 3482 | out: |
2803 | if (data_ac) | 3483 | up_write(&oi->ip_alloc_sem); |
2804 | ocfs2_free_alloc_context(data_ac); | ||
2805 | |||
2806 | brelse(xh_bh); | ||
2807 | brelse(data_bh); | ||
2808 | 3484 | ||
2809 | return ret; | 3485 | return ret; |
2810 | } | 3486 | } |
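/*
 * Note on the flow above: with the set_ctxt conversion the index-block
 * creation no longer starts its own transaction or reserves its own
 * clusters. It journals the existing xattr block, claims one cluster
 * from ctxt->data_ac, initializes xs->bucket there, copies the flat
 * xattr header into the bucket, and finally marks the block
 * OCFS2_XATTR_INDEXED with an xattr tree root in its place.
 */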
@@ -2829,29 +3505,18 @@ static int cmp_xe_offset(const void *a, const void *b) | |||
2829 | * so that we can spare some space for insertion. | 3505 | * so that we can spare some space for insertion. |
2830 | */ | 3506 | */ |
2831 | static int ocfs2_defrag_xattr_bucket(struct inode *inode, | 3507 | static int ocfs2_defrag_xattr_bucket(struct inode *inode, |
3508 | handle_t *handle, | ||
2832 | struct ocfs2_xattr_bucket *bucket) | 3509 | struct ocfs2_xattr_bucket *bucket) |
2833 | { | 3510 | { |
2834 | int ret, i; | 3511 | int ret, i; |
2835 | size_t end, offset, len, value_len; | 3512 | size_t end, offset, len, value_len; |
2836 | struct ocfs2_xattr_header *xh; | 3513 | struct ocfs2_xattr_header *xh; |
2837 | char *entries, *buf, *bucket_buf = NULL; | 3514 | char *entries, *buf, *bucket_buf = NULL; |
2838 | u64 blkno = bucket->bhs[0]->b_blocknr; | 3515 | u64 blkno = bucket_blkno(bucket); |
2839 | u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); | ||
2840 | u16 xh_free_start; | 3516 | u16 xh_free_start; |
2841 | size_t blocksize = inode->i_sb->s_blocksize; | 3517 | size_t blocksize = inode->i_sb->s_blocksize; |
2842 | handle_t *handle; | ||
2843 | struct buffer_head **bhs; | ||
2844 | struct ocfs2_xattr_entry *xe; | 3518 | struct ocfs2_xattr_entry *xe; |
2845 | 3519 | ||
2846 | bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket, | ||
2847 | GFP_NOFS); | ||
2848 | if (!bhs) | ||
2849 | return -ENOMEM; | ||
2850 | |||
2851 | ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, bhs, 0); | ||
2852 | if (ret) | ||
2853 | goto out; | ||
2854 | |||
2855 | /* | 3520 | /* |
2856 | * In order to make the operation more efficient and generic, | 3521 | * In order to make the operation more efficient and generic, |
2857 | * we copy all the blocks into a contiguous memory and do the | 3522 | * we copy all the blocks into a contiguous memory and do the |
@@ -2865,26 +3530,16 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode, | |||
2865 | } | 3530 | } |
2866 | 3531 | ||
2867 | buf = bucket_buf; | 3532 | buf = bucket_buf; |
2868 | for (i = 0; i < blk_per_bucket; i++, buf += blocksize) | 3533 | for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize) |
2869 | memcpy(buf, bhs[i]->b_data, blocksize); | 3534 | memcpy(buf, bucket_block(bucket, i), blocksize); |
2870 | 3535 | ||
2871 | handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), blk_per_bucket); | 3536 | ret = ocfs2_xattr_bucket_journal_access(handle, bucket, |
2872 | if (IS_ERR(handle)) { | 3537 | OCFS2_JOURNAL_ACCESS_WRITE); |
2873 | ret = PTR_ERR(handle); | 3538 | if (ret < 0) { |
2874 | handle = NULL; | ||
2875 | mlog_errno(ret); | 3539 | mlog_errno(ret); |
2876 | goto out; | 3540 | goto out; |
2877 | } | 3541 | } |
2878 | 3542 | ||
2879 | for (i = 0; i < blk_per_bucket; i++) { | ||
2880 | ret = ocfs2_journal_access(handle, inode, bhs[i], | ||
2881 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
2882 | if (ret < 0) { | ||
2883 | mlog_errno(ret); | ||
2884 | goto commit; | ||
2885 | } | ||
2886 | } | ||
2887 | |||
2888 | xh = (struct ocfs2_xattr_header *)bucket_buf; | 3543 | xh = (struct ocfs2_xattr_header *)bucket_buf; |
2889 | entries = (char *)xh->xh_entries; | 3544 | entries = (char *)xh->xh_entries; |
2890 | xh_free_start = le16_to_cpu(xh->xh_free_start); | 3545 | xh_free_start = le16_to_cpu(xh->xh_free_start); |
@@ -2940,7 +3595,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode, | |||
2940 | "bucket %llu\n", (unsigned long long)blkno); | 3595 | "bucket %llu\n", (unsigned long long)blkno); |
2941 | 3596 | ||
2942 | if (xh_free_start == end) | 3597 | if (xh_free_start == end) |
2943 | goto commit; | 3598 | goto out; |
2944 | 3599 | ||
2945 | memset(bucket_buf + xh_free_start, 0, end - xh_free_start); | 3600 | memset(bucket_buf + xh_free_start, 0, end - xh_free_start); |
2946 | xh->xh_free_start = cpu_to_le16(end); | 3601 | xh->xh_free_start = cpu_to_le16(end); |
@@ -2951,169 +3606,94 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode, | |||
2951 | cmp_xe, swap_xe); | 3606 | cmp_xe, swap_xe); |
2952 | 3607 | ||
2953 | buf = bucket_buf; | 3608 | buf = bucket_buf; |
2954 | for (i = 0; i < blk_per_bucket; i++, buf += blocksize) { | 3609 | for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize) |
2955 | memcpy(bhs[i]->b_data, buf, blocksize); | 3610 | memcpy(bucket_block(bucket, i), buf, blocksize); |
2956 | ocfs2_journal_dirty(handle, bhs[i]); | 3611 | ocfs2_xattr_bucket_journal_dirty(handle, bucket); |
2957 | } | ||
2958 | 3612 | ||
2959 | commit: | ||
2960 | ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); | ||
2961 | out: | 3613 | out: |
2962 | |||
2963 | if (bhs) { | ||
2964 | for (i = 0; i < blk_per_bucket; i++) | ||
2965 | brelse(bhs[i]); | ||
2966 | } | ||
2967 | kfree(bhs); | ||
2968 | |||
2969 | kfree(bucket_buf); | 3614 | kfree(bucket_buf); |
2970 | return ret; | 3615 | return ret; |
2971 | } | 3616 | } |
2972 | 3617 | ||
2973 | /* | 3618 | /* |
2974 | * Move half nums of the xattr bucket in the previous cluster to this new | 3619 | * prev_blkno points to the start of an existing extent. new_blkno |
2975 | * cluster. We only touch the last cluster of the previous extend record. | 3620 | * points to a newly allocated extent. Because we know each of our |
3621 | * clusters contains more than one bucket, we can easily split one cluster | ||
3622 | * at a bucket boundary. So we take the last cluster of the existing | ||
3623 | * extent and split it down the middle. We move the last half of the | ||
3624 | * buckets in the last cluster of the existing extent over to the new | ||
3625 | * extent. | ||
3626 | * | ||
3627 | * "first" is the bucket at prev_blkno, so we can update the existing | ||
3628 | * extent's bucket count. "target" is the bucket where we were hoping | ||
3629 | * to insert our xattr. If the bucket move places the target in the new | ||
3630 | * extent, we'll update "first" and "target" after modifying the old | ||
3631 | * extent. | ||
2976 | * | 3632 | * |
2977 | * first_bh is the first buffer_head of a series of bucket in the same | 3633 | * first_hash will be set as the 1st xe's name_hash in the new extent. |
2978 | * extent rec and header_bh is the header of one bucket in this cluster. | ||
2979 | * They will be updated if we move the data header_bh contains to the new | ||
2980 | * cluster. first_hash will be set as the 1st xe's name_hash of the new cluster. | ||
2981 | */ | 3634 | */ |
2982 | static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode, | 3635 | static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode, |
2983 | handle_t *handle, | 3636 | handle_t *handle, |
2984 | struct buffer_head **first_bh, | 3637 | struct ocfs2_xattr_bucket *first, |
2985 | struct buffer_head **header_bh, | 3638 | struct ocfs2_xattr_bucket *target, |
2986 | u64 new_blkno, | 3639 | u64 new_blkno, |
2987 | u64 prev_blkno, | ||
2988 | u32 num_clusters, | 3640 | u32 num_clusters, |
2989 | u32 *first_hash) | 3641 | u32 *first_hash) |
2990 | { | 3642 | { |
2991 | int i, ret, credits; | 3643 | int ret; |
2992 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 3644 | struct super_block *sb = inode->i_sb; |
2993 | int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); | 3645 | int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(sb); |
2994 | int num_buckets = ocfs2_xattr_buckets_per_cluster(osb); | 3646 | int num_buckets = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb)); |
2995 | int blocksize = inode->i_sb->s_blocksize; | 3647 | int to_move = num_buckets / 2; |
2996 | struct buffer_head *old_bh, *new_bh, *prev_bh, *new_first_bh = NULL; | 3648 | u64 src_blkno; |
2997 | struct ocfs2_xattr_header *new_xh; | 3649 | u64 last_cluster_blkno = bucket_blkno(first) + |
2998 | struct ocfs2_xattr_header *xh = | 3650 | ((num_clusters - 1) * ocfs2_clusters_to_blocks(sb, 1)); |
2999 | (struct ocfs2_xattr_header *)((*first_bh)->b_data); | ||
3000 | |||
3001 | BUG_ON(le16_to_cpu(xh->xh_num_buckets) < num_buckets); | ||
3002 | BUG_ON(OCFS2_XATTR_BUCKET_SIZE == osb->s_clustersize); | ||
3003 | |||
3004 | prev_bh = *first_bh; | ||
3005 | get_bh(prev_bh); | ||
3006 | xh = (struct ocfs2_xattr_header *)prev_bh->b_data; | ||
3007 | 3651 | ||
3008 | prev_blkno += (num_clusters - 1) * bpc + bpc / 2; | 3652 | BUG_ON(le16_to_cpu(bucket_xh(first)->xh_num_buckets) < num_buckets); |
3653 | BUG_ON(OCFS2_XATTR_BUCKET_SIZE == OCFS2_SB(sb)->s_clustersize); | ||
3009 | 3654 | ||
3010 | mlog(0, "move half of xattrs in cluster %llu to %llu\n", | 3655 | mlog(0, "move half of xattrs in cluster %llu to %llu\n", |
3011 | (unsigned long long)prev_blkno, (unsigned long long)new_blkno); | 3656 | (unsigned long long)last_cluster_blkno, (unsigned long long)new_blkno); |
3012 | 3657 | ||
3013 | /* | 3658 | ret = ocfs2_mv_xattr_buckets(inode, handle, bucket_blkno(first), |
3014 | * We need to update the 1st half of the new cluster and | 3659 | last_cluster_blkno, new_blkno, |
3015 | * 1 more for the update of the 1st bucket of the previous | 3660 | to_move, first_hash); |
3016 | * extent record. | ||
3017 | */ | ||
3018 | credits = bpc / 2 + 1; | ||
3019 | ret = ocfs2_extend_trans(handle, credits); | ||
3020 | if (ret) { | 3661 | if (ret) { |
3021 | mlog_errno(ret); | 3662 | mlog_errno(ret); |
3022 | goto out; | 3663 | goto out; |
3023 | } | 3664 | } |
3024 | 3665 | ||
3025 | ret = ocfs2_journal_access(handle, inode, prev_bh, | 3666 | /* This is the first bucket that got moved */ |
3026 | OCFS2_JOURNAL_ACCESS_WRITE); | 3667 | src_blkno = last_cluster_blkno + (to_move * blks_per_bucket); |
3027 | if (ret) { | ||
3028 | mlog_errno(ret); | ||
3029 | goto out; | ||
3030 | } | ||
3031 | 3668 | ||
3032 | for (i = 0; i < bpc / 2; i++, prev_blkno++, new_blkno++) { | 3669 | /* |
3033 | old_bh = new_bh = NULL; | 3670 | * If the target bucket was part of the moved buckets, we need to |
3034 | new_bh = sb_getblk(inode->i_sb, new_blkno); | 3671 | * update first and target. |
3035 | if (!new_bh) { | 3672 | */ |
3036 | ret = -EIO; | 3673 | if (bucket_blkno(target) >= src_blkno) { |
3037 | mlog_errno(ret); | 3674 | /* Find the block for the new target bucket */ |
3038 | goto out; | 3675 | src_blkno = new_blkno + |
3039 | } | 3676 | (bucket_blkno(target) - src_blkno); |
3040 | 3677 | ||
3041 | ocfs2_set_new_buffer_uptodate(inode, new_bh); | 3678 | ocfs2_xattr_bucket_relse(first); |
3679 | ocfs2_xattr_bucket_relse(target); | ||
3042 | 3680 | ||
3043 | ret = ocfs2_journal_access(handle, inode, new_bh, | 3681 | /* |
3044 | OCFS2_JOURNAL_ACCESS_CREATE); | 3682 | * These shouldn't fail - the buffers are in the |
3045 | if (ret < 0) { | 3683 | * journal from ocfs2_cp_xattr_bucket(). |
3684 | */ | ||
3685 | ret = ocfs2_read_xattr_bucket(first, new_blkno); | ||
3686 | if (ret) { | ||
3046 | mlog_errno(ret); | 3687 | mlog_errno(ret); |
3047 | brelse(new_bh); | ||
3048 | goto out; | 3688 | goto out; |
3049 | } | 3689 | } |
3050 | 3690 | ret = ocfs2_read_xattr_bucket(target, src_blkno); | |
3051 | ret = ocfs2_read_block(inode, prev_blkno, &old_bh); | 3691 | if (ret) |
3052 | if (ret < 0) { | ||
3053 | mlog_errno(ret); | 3692 | mlog_errno(ret); |
3054 | brelse(new_bh); | ||
3055 | goto out; | ||
3056 | } | ||
3057 | 3693 | ||
3058 | memcpy(new_bh->b_data, old_bh->b_data, blocksize); | ||
3059 | |||
3060 | if (i == 0) { | ||
3061 | new_xh = (struct ocfs2_xattr_header *)new_bh->b_data; | ||
3062 | new_xh->xh_num_buckets = cpu_to_le16(num_buckets / 2); | ||
3063 | |||
3064 | if (first_hash) | ||
3065 | *first_hash = le32_to_cpu( | ||
3066 | new_xh->xh_entries[0].xe_name_hash); | ||
3067 | new_first_bh = new_bh; | ||
3068 | get_bh(new_first_bh); | ||
3069 | } | ||
3070 | |||
3071 | ocfs2_journal_dirty(handle, new_bh); | ||
3072 | |||
3073 | if (*header_bh == old_bh) { | ||
3074 | brelse(*header_bh); | ||
3075 | *header_bh = new_bh; | ||
3076 | get_bh(*header_bh); | ||
3077 | |||
3078 | brelse(*first_bh); | ||
3079 | *first_bh = new_first_bh; | ||
3080 | get_bh(*first_bh); | ||
3081 | } | ||
3082 | brelse(new_bh); | ||
3083 | brelse(old_bh); | ||
3084 | } | 3694 | } |
3085 | 3695 | ||
3086 | le16_add_cpu(&xh->xh_num_buckets, -(num_buckets / 2)); | ||
3087 | |||
3088 | ocfs2_journal_dirty(handle, prev_bh); | ||
3089 | out: | 3696 | out: |
3090 | brelse(prev_bh); | ||
3091 | brelse(new_first_bh); | ||
3092 | return ret; | ||
3093 | } | ||
3094 | |||
3095 | static int ocfs2_read_xattr_bucket(struct inode *inode, | ||
3096 | u64 blkno, | ||
3097 | struct buffer_head **bhs, | ||
3098 | int new) | ||
3099 | { | ||
3100 | int ret = 0; | ||
3101 | u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); | ||
3102 | |||
3103 | if (!new) | ||
3104 | return ocfs2_read_blocks(inode, blkno, | ||
3105 | blk_per_bucket, bhs, 0); | ||
3106 | |||
3107 | for (i = 0; i < blk_per_bucket; i++) { | ||
3108 | bhs[i] = sb_getblk(inode->i_sb, blkno + i); | ||
3109 | if (bhs[i] == NULL) { | ||
3110 | ret = -EIO; | ||
3111 | mlog_errno(ret); | ||
3112 | break; | ||
3113 | } | ||
3114 | ocfs2_set_new_buffer_uptodate(inode, bhs[i]); | ||
3115 | } | ||
3116 | |||
3117 | return ret; | 3697 | return ret; |
3118 | } | 3698 | } |
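/*
 * Illustrative arithmetic for the move above (assuming a 4K bucket and
 * a 16K cluster): ocfs2_xattr_buckets_per_cluster() gives 4, so
 * to_move = 2. The last two buckets of the existing extent's final
 * cluster are copied to new_blkno, and if the target bucket was among
 * them, "first" and "target" are re-read from the new extent.
 */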
3119 | 3699 | ||
@@ -3178,8 +3758,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode, | |||
3178 | { | 3758 | { |
3179 | int ret, i; | 3759 | int ret, i; |
3180 | int count, start, len, name_value_len = 0, xe_len, name_offset = 0; | 3760 | int count, start, len, name_value_len = 0, xe_len, name_offset = 0; |
3181 | u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); | 3761 | struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL; |
3182 | struct buffer_head **s_bhs, **t_bhs = NULL; | ||
3183 | struct ocfs2_xattr_header *xh; | 3762 | struct ocfs2_xattr_header *xh; |
3184 | struct ocfs2_xattr_entry *xe; | 3763 | struct ocfs2_xattr_entry *xe; |
3185 | int blocksize = inode->i_sb->s_blocksize; | 3764 | int blocksize = inode->i_sb->s_blocksize; |
@@ -3187,47 +3766,52 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode, | |||
3187 | mlog(0, "move some of xattrs from bucket %llu to %llu\n", | 3766 | mlog(0, "move some of xattrs from bucket %llu to %llu\n", |
3188 | (unsigned long long)blk, (unsigned long long)new_blk); | 3767 | (unsigned long long)blk, (unsigned long long)new_blk); |
3189 | 3768 | ||
3190 | s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS); | 3769 | s_bucket = ocfs2_xattr_bucket_new(inode); |
3191 | if (!s_bhs) | 3770 | t_bucket = ocfs2_xattr_bucket_new(inode); |
3192 | return -ENOMEM; | 3771 | if (!s_bucket || !t_bucket) { |
3193 | 3772 | ret = -ENOMEM; | |
3194 | ret = ocfs2_read_xattr_bucket(inode, blk, s_bhs, 0); | ||
3195 | if (ret) { | ||
3196 | mlog_errno(ret); | 3773 | mlog_errno(ret); |
3197 | goto out; | 3774 | goto out; |
3198 | } | 3775 | } |
3199 | 3776 | ||
3200 | ret = ocfs2_journal_access(handle, inode, s_bhs[0], | 3777 | ret = ocfs2_read_xattr_bucket(s_bucket, blk); |
3201 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
3202 | if (ret) { | 3778 | if (ret) { |
3203 | mlog_errno(ret); | 3779 | mlog_errno(ret); |
3204 | goto out; | 3780 | goto out; |
3205 | } | 3781 | } |
3206 | 3782 | ||
3207 | t_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS); | 3783 | ret = ocfs2_xattr_bucket_journal_access(handle, s_bucket, |
3208 | if (!t_bhs) { | 3784 | OCFS2_JOURNAL_ACCESS_WRITE); |
3209 | ret = -ENOMEM; | 3785 | if (ret) { |
3786 | mlog_errno(ret); | ||
3210 | goto out; | 3787 | goto out; |
3211 | } | 3788 | } |
3212 | 3789 | ||
3213 | ret = ocfs2_read_xattr_bucket(inode, new_blk, t_bhs, new_bucket_head); | 3790 | /* |
3791 | * Even if !new_bucket_head, we're overwriting t_bucket. Thus, | ||
3792 | * there's no need to read it. | ||
3793 | */ | ||
3794 | ret = ocfs2_init_xattr_bucket(t_bucket, new_blk); | ||
3214 | if (ret) { | 3795 | if (ret) { |
3215 | mlog_errno(ret); | 3796 | mlog_errno(ret); |
3216 | goto out; | 3797 | goto out; |
3217 | } | 3798 | } |
3218 | 3799 | ||
3219 | for (i = 0; i < blk_per_bucket; i++) { | 3800 | /* |
3220 | ret = ocfs2_journal_access(handle, inode, t_bhs[i], | 3801 | * Hey, if we're overwriting t_bucket, what difference does |
3221 | new_bucket_head ? | 3802 | * ACCESS_CREATE vs ACCESS_WRITE make? See the comment in the |
3222 | OCFS2_JOURNAL_ACCESS_CREATE : | 3803 | * same part of ocfs2_cp_xattr_bucket(). |
3223 | OCFS2_JOURNAL_ACCESS_WRITE); | 3804 | */ |
3224 | if (ret) { | 3805 | ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket, |
3225 | mlog_errno(ret); | 3806 | new_bucket_head ? |
3226 | goto out; | 3807 | OCFS2_JOURNAL_ACCESS_CREATE : |
3227 | } | 3808 | OCFS2_JOURNAL_ACCESS_WRITE); |
3809 | if (ret) { | ||
3810 | mlog_errno(ret); | ||
3811 | goto out; | ||
3228 | } | 3812 | } |
3229 | 3813 | ||
3230 | xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data; | 3814 | xh = bucket_xh(s_bucket); |
3231 | count = le16_to_cpu(xh->xh_count); | 3815 | count = le16_to_cpu(xh->xh_count); |
3232 | start = ocfs2_xattr_find_divide_pos(xh); | 3816 | start = ocfs2_xattr_find_divide_pos(xh); |
3233 | 3817 | ||
@@ -3239,10 +3823,10 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode, | |||
3239 | * The hash value is set as one larger than | 3823 | * The hash value is set as one larger than |
3240 | * that of the last entry in the previous bucket. | 3824 | * that of the last entry in the previous bucket. |
3241 | */ | 3825 | */ |
3242 | for (i = 0; i < blk_per_bucket; i++) | 3826 | for (i = 0; i < t_bucket->bu_blocks; i++) |
3243 | memset(t_bhs[i]->b_data, 0, blocksize); | 3827 | memset(bucket_block(t_bucket, i), 0, blocksize); |
3244 | 3828 | ||
3245 | xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data; | 3829 | xh = bucket_xh(t_bucket); |
3246 | xh->xh_free_start = cpu_to_le16(blocksize); | 3830 | xh->xh_free_start = cpu_to_le16(blocksize); |
3247 | xh->xh_entries[0].xe_name_hash = xe->xe_name_hash; | 3831 | xh->xh_entries[0].xe_name_hash = xe->xe_name_hash; |
3248 | le32_add_cpu(&xh->xh_entries[0].xe_name_hash, 1); | 3832 | le32_add_cpu(&xh->xh_entries[0].xe_name_hash, 1); |
@@ -3251,11 +3835,10 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode, | |||
3251 | } | 3835 | } |
3252 | 3836 | ||
3253 | /* copy the whole bucket to the new first. */ | 3837 | /* copy the whole bucket to the new first. */ |
3254 | for (i = 0; i < blk_per_bucket; i++) | 3838 | ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket); |
3255 | memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize); | ||
3256 | 3839 | ||
3257 | /* update the new bucket. */ | 3840 | /* update the new bucket. */ |
3258 | xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data; | 3841 | xh = bucket_xh(t_bucket); |
3259 | 3842 | ||
3260 | /* | 3843 | /* |
3261 | * Calculate the total name/value len and xh_free_start for | 3844 | * Calculate the total name/value len and xh_free_start for |
@@ -3319,11 +3902,7 @@ set_num_buckets: | |||
3319 | else | 3902 | else |
3320 | xh->xh_num_buckets = 0; | 3903 | xh->xh_num_buckets = 0; |
3321 | 3904 | ||
3322 | for (i = 0; i < blk_per_bucket; i++) { | 3905 | ocfs2_xattr_bucket_journal_dirty(handle, t_bucket); |
3323 | ocfs2_journal_dirty(handle, t_bhs[i]); | ||
3324 | if (ret) | ||
3325 | mlog_errno(ret); | ||
3326 | } | ||
3327 | 3906 | ||
3328 | /* store the first_hash of the new bucket. */ | 3907 | /* store the first_hash of the new bucket. */ |
3329 | if (first_hash) | 3908 | if (first_hash) |
@@ -3337,29 +3916,18 @@ set_num_buckets: | |||
3337 | if (start == count) | 3916 | if (start == count) |
3338 | goto out; | 3917 | goto out; |
3339 | 3918 | ||
3340 | xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data; | 3919 | xh = bucket_xh(s_bucket); |
3341 | memset(&xh->xh_entries[start], 0, | 3920 | memset(&xh->xh_entries[start], 0, |
3342 | sizeof(struct ocfs2_xattr_entry) * (count - start)); | 3921 | sizeof(struct ocfs2_xattr_entry) * (count - start)); |
3343 | xh->xh_count = cpu_to_le16(start); | 3922 | xh->xh_count = cpu_to_le16(start); |
3344 | xh->xh_free_start = cpu_to_le16(name_offset); | 3923 | xh->xh_free_start = cpu_to_le16(name_offset); |
3345 | xh->xh_name_value_len = cpu_to_le16(name_value_len); | 3924 | xh->xh_name_value_len = cpu_to_le16(name_value_len); |
3346 | 3925 | ||
3347 | ocfs2_journal_dirty(handle, s_bhs[0]); | 3926 | ocfs2_xattr_bucket_journal_dirty(handle, s_bucket); |
3348 | if (ret) | ||
3349 | mlog_errno(ret); | ||
3350 | 3927 | ||
3351 | out: | 3928 | out: |
3352 | if (s_bhs) { | 3929 | ocfs2_xattr_bucket_free(s_bucket); |
3353 | for (i = 0; i < blk_per_bucket; i++) | 3930 | ocfs2_xattr_bucket_free(t_bucket); |
3354 | brelse(s_bhs[i]); | ||
3355 | } | ||
3356 | kfree(s_bhs); | ||
3357 | |||
3358 | if (t_bhs) { | ||
3359 | for (i = 0; i < blk_per_bucket; i++) | ||
3360 | brelse(t_bhs[i]); | ||
3361 | } | ||
3362 | kfree(t_bhs); | ||
3363 | 3931 | ||
3364 | return ret; | 3932 | return ret; |
3365 | } | 3933 | } |
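One detail of the divide path above that is easy to miss: when the split point falls on a hash boundary, the target bucket is zeroed and its first-entry hash is seeded as one larger than the last hash kept in the source bucket. A tiny standalone sketch of just that seeding; the hash value is made up for the example, and real hashes would come from the name-hash code, not a constant.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Hash of the last entry that stays in the source bucket (made-up value). */
	uint32_t last_hash_in_source = 0x2f1a77c3;

	/*
	 * The target bucket starts out empty; its first-entry hash is set to
	 * "one past" the source bucket's last hash, mirroring the
	 * le32_add_cpu(&xh->xh_entries[0].xe_name_hash, 1) step above.
	 */
	uint32_t target_first_hash = last_hash_in_source + 1;

	printf("source bucket ends at hash 0x%08x, target bucket starts at 0x%08x\n",
	       last_hash_in_source, target_first_hash);
	return 0;
}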
@@ -3376,10 +3944,8 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode, | |||
3376 | u64 t_blkno, | 3944 | u64 t_blkno, |
3377 | int t_is_new) | 3945 | int t_is_new) |
3378 | { | 3946 | { |
3379 | int ret, i; | 3947 | int ret; |
3380 | int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); | 3948 | struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL; |
3381 | int blocksize = inode->i_sb->s_blocksize; | ||
3382 | struct buffer_head **s_bhs, **t_bhs = NULL; | ||
3383 | 3949 | ||
3384 | BUG_ON(s_blkno == t_blkno); | 3950 | BUG_ON(s_blkno == t_blkno); |
3385 | 3951 | ||
@@ -3387,92 +3953,115 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode, | |||
3387 | (unsigned long long)s_blkno, (unsigned long long)t_blkno, | 3953 | (unsigned long long)s_blkno, (unsigned long long)t_blkno, |
3388 | t_is_new); | 3954 | t_is_new); |
3389 | 3955 | ||
3390 | s_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket, | 3956 | s_bucket = ocfs2_xattr_bucket_new(inode); |
3391 | GFP_NOFS); | 3957 | t_bucket = ocfs2_xattr_bucket_new(inode); |
3392 | if (!s_bhs) | 3958 | if (!s_bucket || !t_bucket) { |
3393 | return -ENOMEM; | 3959 | ret = -ENOMEM; |
3960 | mlog_errno(ret); | ||
3961 | goto out; | ||
3962 | } | ||
3394 | 3963 | ||
3395 | ret = ocfs2_read_xattr_bucket(inode, s_blkno, s_bhs, 0); | 3964 | ret = ocfs2_read_xattr_bucket(s_bucket, s_blkno); |
3396 | if (ret) | 3965 | if (ret) |
3397 | goto out; | 3966 | goto out; |
3398 | 3967 | ||
3399 | t_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket, | 3968 | /* |
3400 | GFP_NOFS); | 3969 | * Even if !t_is_new, we're overwriting t_bucket. Thus, |
3401 | if (!t_bhs) { | 3970 | * there's no need to read it. |
3402 | ret = -ENOMEM; | 3971 | */ |
3972 | ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno); | ||
3973 | if (ret) | ||
3403 | goto out; | 3974 | goto out; |
3404 | } | ||
3405 | 3975 | ||
3406 | ret = ocfs2_read_xattr_bucket(inode, t_blkno, t_bhs, t_is_new); | 3976 | /* |
3977 | * Hey, if we're overwriting t_bucket, what difference does | ||
3978 | * ACCESS_CREATE vs ACCESS_WRITE make? Well, if we allocated a new | ||
3979 | * cluster to fill, we came here from | ||
3980 | * ocfs2_mv_xattr_buckets(), and it is really new - | ||
3981 | * ACCESS_CREATE is required. But we also might have moved data | ||
3982 | * out of t_bucket before extending back into it. | ||
3983 | * ocfs2_add_new_xattr_bucket() can do this - its call to | ||
3984 | * ocfs2_add_new_xattr_cluster() may have created a new extent | ||
3985 | * and copied out the end of the old extent. Then it re-extends | ||
3986 | * the old extent back to create space for new xattrs. That's | ||
3987 | * how we get here, and the bucket isn't really new. | ||
3988 | */ | ||
3989 | ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket, | ||
3990 | t_is_new ? | ||
3991 | OCFS2_JOURNAL_ACCESS_CREATE : | ||
3992 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
3407 | if (ret) | 3993 | if (ret) |
3408 | goto out; | 3994 | goto out; |
3409 | 3995 | ||
3410 | for (i = 0; i < blk_per_bucket; i++) { | 3996 | ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket); |
3411 | ret = ocfs2_journal_access(handle, inode, t_bhs[i], | 3997 | ocfs2_xattr_bucket_journal_dirty(handle, t_bucket); |
3412 | t_is_new ? | ||
3413 | OCFS2_JOURNAL_ACCESS_CREATE : | ||
3414 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
3415 | if (ret) | ||
3416 | goto out; | ||
3417 | } | ||
3418 | |||
3419 | for (i = 0; i < blk_per_bucket; i++) { | ||
3420 | memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize); | ||
3421 | ocfs2_journal_dirty(handle, t_bhs[i]); | ||
3422 | } | ||
3423 | 3998 | ||
3424 | out: | 3999 | out: |
3425 | if (s_bhs) { | 4000 | ocfs2_xattr_bucket_free(t_bucket); |
3426 | for (i = 0; i < blk_per_bucket; i++) | 4001 | ocfs2_xattr_bucket_free(s_bucket); |
3427 | brelse(s_bhs[i]); | ||
3428 | } | ||
3429 | kfree(s_bhs); | ||
3430 | |||
3431 | if (t_bhs) { | ||
3432 | for (i = 0; i < blk_per_bucket; i++) | ||
3433 | brelse(t_bhs[i]); | ||
3434 | } | ||
3435 | kfree(t_bhs); | ||
3436 | 4002 | ||
3437 | return ret; | 4003 | return ret; |
3438 | } | 4004 | } |
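For readers following the bucket abstraction introduced by this patch: the copy path above boils down to "materialize two buckets, take journal access on the target, copy every block, dirty the target". The standalone toy below only illustrates that shape; the struct, the sizes, and the helper name are invented for the example and are not the ocfs2 API.

#include <stdio.h>
#include <string.h>

#define BLOCKSIZE       4096   /* example-only block size */
#define BLKS_PER_BUCKET 4      /* example-only: 16K buckets made of 4K blocks */

/* A toy stand-in for the bucket abstraction: one buffer per block. */
struct toy_bucket {
	unsigned long long blkno;
	char blocks[BLKS_PER_BUCKET][BLOCKSIZE];
};

/* Conceptually what a copy-data helper does: duplicate every block of the bucket. */
static void toy_bucket_copy_data(struct toy_bucket *dst, const struct toy_bucket *src)
{
	for (int i = 0; i < BLKS_PER_BUCKET; i++)
		memcpy(dst->blocks[i], src->blocks[i], BLOCKSIZE);
}

int main(void)
{
	static struct toy_bucket s, t;

	s.blkno = 100;
	t.blkno = 200;
	strcpy(s.blocks[0], "xattr header lives in block 0 of the bucket");

	/* Mirrors the cp path: set up both buckets, then copy all blocks at once. */
	toy_bucket_copy_data(&t, &s);
	printf("copied bucket at block %llu -> block %llu: \"%s\"\n",
	       s.blkno, t.blkno, t.blocks[0]);
	return 0;
}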
3439 | 4005 | ||
3440 | /* | 4006 | /* |
3441 | * Copy one xattr cluster from src_blk to to_blk. | 4007 | * src_blk points to the start of an existing extent. last_blk points to |
3442 | * The to_blk will become the first bucket header of the cluster, so its | 4008 | * the last cluster in that extent. to_blk points to a newly allocated |
3443 | * xh_num_buckets will be initialized as the bucket num in the cluster. | 4009 | * extent. We copy the buckets from the cluster at last_blk to the new |
4010 | * extent. If start_bucket is non-zero, we skip that many buckets before | ||
4011 | * we start copying. The new extent's xh_num_buckets gets set to the | ||
4012 | * number of buckets we copied. The old extent's xh_num_buckets shrinks | ||
4013 | * by the same amount. | ||
3444 | */ | 4014 | */ |
3445 | static int ocfs2_cp_xattr_cluster(struct inode *inode, | 4015 | static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle, |
3446 | handle_t *handle, | 4016 | u64 src_blk, u64 last_blk, u64 to_blk, |
3447 | struct buffer_head *first_bh, | 4017 | unsigned int start_bucket, |
3448 | u64 src_blk, | ||
3449 | u64 to_blk, | ||
3450 | u32 *first_hash) | 4018 | u32 *first_hash) |
3451 | { | 4019 | { |
3452 | int i, ret, credits; | 4020 | int i, ret, credits; |
3453 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 4021 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
3454 | int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); | 4022 | int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); |
3455 | int num_buckets = ocfs2_xattr_buckets_per_cluster(osb); | 4023 | int num_buckets = ocfs2_xattr_buckets_per_cluster(osb); |
3456 | struct buffer_head *bh = NULL; | 4024 | struct ocfs2_xattr_bucket *old_first, *new_first; |
3457 | struct ocfs2_xattr_header *xh; | 4025 | |
3458 | u64 to_blk_start = to_blk; | 4026 | mlog(0, "mv xattrs from cluster %llu to %llu\n", |
4027 | (unsigned long long)last_blk, (unsigned long long)to_blk); | ||
4028 | |||
4029 | BUG_ON(start_bucket >= num_buckets); | ||
4030 | if (start_bucket) { | ||
4031 | num_buckets -= start_bucket; | ||
4032 | last_blk += (start_bucket * blks_per_bucket); | ||
4033 | } | ||
4034 | |||
4035 | /* The first bucket of the original extent */ | ||
4036 | old_first = ocfs2_xattr_bucket_new(inode); | ||
4037 | /* The first bucket of the new extent */ | ||
4038 | new_first = ocfs2_xattr_bucket_new(inode); | ||
4039 | if (!old_first || !new_first) { | ||
4040 | ret = -ENOMEM; | ||
4041 | mlog_errno(ret); | ||
4042 | goto out; | ||
4043 | } | ||
3459 | 4044 | ||
3460 | mlog(0, "cp xattrs from cluster %llu to %llu\n", | 4045 | ret = ocfs2_read_xattr_bucket(old_first, src_blk); |
3461 | (unsigned long long)src_blk, (unsigned long long)to_blk); | 4046 | if (ret) { |
4047 | mlog_errno(ret); | ||
4048 | goto out; | ||
4049 | } | ||
3462 | 4050 | ||
3463 | /* | 4051 | /* |
3464 | * We need to update the new cluster and 1 more for the update of | 4052 | * We need to update the first bucket of the old extent and all |
3465 | * the 1st bucket of the previous extent rec. | 4053 | * the buckets going to the new extent. |
3466 | */ | 4054 | */ |
3467 | credits = bpc + 1; | 4055 | credits = ((num_buckets + 1) * blks_per_bucket) + |
4056 | handle->h_buffer_credits; | ||
3468 | ret = ocfs2_extend_trans(handle, credits); | 4057 | ret = ocfs2_extend_trans(handle, credits); |
3469 | if (ret) { | 4058 | if (ret) { |
3470 | mlog_errno(ret); | 4059 | mlog_errno(ret); |
3471 | goto out; | 4060 | goto out; |
3472 | } | 4061 | } |
3473 | 4062 | ||
3474 | ret = ocfs2_journal_access(handle, inode, first_bh, | 4063 | ret = ocfs2_xattr_bucket_journal_access(handle, old_first, |
3475 | OCFS2_JOURNAL_ACCESS_WRITE); | 4064 | OCFS2_JOURNAL_ACCESS_WRITE); |
3476 | if (ret) { | 4065 | if (ret) { |
3477 | mlog_errno(ret); | 4066 | mlog_errno(ret); |
3478 | goto out; | 4067 | goto out; |
@@ -3480,45 +4069,45 @@ static int ocfs2_cp_xattr_cluster(struct inode *inode, | |||
3480 | 4069 | ||
3481 | for (i = 0; i < num_buckets; i++) { | 4070 | for (i = 0; i < num_buckets; i++) { |
3482 | ret = ocfs2_cp_xattr_bucket(inode, handle, | 4071 | ret = ocfs2_cp_xattr_bucket(inode, handle, |
3483 | src_blk, to_blk, 1); | 4072 | last_blk + (i * blks_per_bucket), |
4073 | to_blk + (i * blks_per_bucket), | ||
4074 | 1); | ||
3484 | if (ret) { | 4075 | if (ret) { |
3485 | mlog_errno(ret); | 4076 | mlog_errno(ret); |
3486 | goto out; | 4077 | goto out; |
3487 | } | 4078 | } |
3488 | |||
3489 | src_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb); | ||
3490 | to_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb); | ||
3491 | } | 4079 | } |
3492 | 4080 | ||
3493 | /* update the old bucket header. */ | 4081 | /* |
3494 | xh = (struct ocfs2_xattr_header *)first_bh->b_data; | 4082 | * Get the new bucket ready before we dirty anything |
3495 | le16_add_cpu(&xh->xh_num_buckets, -num_buckets); | 4083 | * (This actually shouldn't fail, because we already dirtied |
3496 | 4084 | * it once in ocfs2_cp_xattr_bucket()). | |
3497 | ocfs2_journal_dirty(handle, first_bh); | 4085 | */ |
3498 | 4086 | ret = ocfs2_read_xattr_bucket(new_first, to_blk); | |
3499 | /* update the new bucket header. */ | 4087 | if (ret) { |
3500 | ret = ocfs2_read_block(inode, to_blk_start, &bh); | ||
3501 | if (ret < 0) { | ||
3502 | mlog_errno(ret); | 4088 | mlog_errno(ret); |
3503 | goto out; | 4089 | goto out; |
3504 | } | 4090 | } |
3505 | 4091 | ret = ocfs2_xattr_bucket_journal_access(handle, new_first, | |
3506 | ret = ocfs2_journal_access(handle, inode, bh, | 4092 | OCFS2_JOURNAL_ACCESS_WRITE); |
3507 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
3508 | if (ret) { | 4093 | if (ret) { |
3509 | mlog_errno(ret); | 4094 | mlog_errno(ret); |
3510 | goto out; | 4095 | goto out; |
3511 | } | 4096 | } |
3512 | 4097 | ||
3513 | xh = (struct ocfs2_xattr_header *)bh->b_data; | 4098 | /* Now update the headers */ |
3514 | xh->xh_num_buckets = cpu_to_le16(num_buckets); | 4099 | le16_add_cpu(&bucket_xh(old_first)->xh_num_buckets, -num_buckets); |
4100 | ocfs2_xattr_bucket_journal_dirty(handle, old_first); | ||
3515 | 4101 | ||
3516 | ocfs2_journal_dirty(handle, bh); | 4102 | bucket_xh(new_first)->xh_num_buckets = cpu_to_le16(num_buckets); |
4103 | ocfs2_xattr_bucket_journal_dirty(handle, new_first); | ||
3517 | 4104 | ||
3518 | if (first_hash) | 4105 | if (first_hash) |
3519 | *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash); | 4106 | *first_hash = le32_to_cpu(bucket_xh(new_first)->xh_entries[0].xe_name_hash); |
4107 | |||
3520 | out: | 4108 | out: |
3521 | brelse(bh); | 4109 | ocfs2_xattr_bucket_free(new_first); |
4110 | ocfs2_xattr_bucket_free(old_first); | ||
3522 | return ret; | 4111 | return ret; |
3523 | } | 4112 | } |
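A rough, standalone illustration of the arithmetic in ocfs2_mv_xattr_buckets() above: skip start_bucket buckets, then extend the handle by one block credit per block of every bucket that will be dirtied, plus one extra bucket's worth for the old extent's first bucket. The block size, bucket size, and starting numbers below are assumptions chosen only for the example.

#include <stdio.h>

/* Example-only geometry: 16K buckets on a 4K-block, 32K-cluster filesystem. */
#define BLKS_PER_BUCKET     4
#define BUCKETS_PER_CLUSTER 2

int main(void)
{
	unsigned long long last_blk = 1000; /* start of the last cluster in the old extent */
	unsigned int start_bucket = 1;      /* buckets to skip before copying */
	unsigned int num_buckets = BUCKETS_PER_CLUSTER;
	int existing_credits = 8;           /* stand-in for handle->h_buffer_credits */

	if (start_bucket) {
		num_buckets -= start_bucket;
		last_blk += (unsigned long long)start_bucket * BLKS_PER_BUCKET;
	}

	/* One extra bucket's worth of credits covers the old extent's first bucket. */
	int credits = (num_buckets + 1) * BLKS_PER_BUCKET + existing_credits;

	printf("copy %u bucket(s) starting at block %llu; extend the handle to %d credits\n",
	       num_buckets, last_blk, credits);
	return 0;
}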
3524 | 4113 | ||
@@ -3534,7 +4123,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode, | |||
3534 | u32 *first_hash) | 4123 | u32 *first_hash) |
3535 | { | 4124 | { |
3536 | u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); | 4125 | u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); |
3537 | int ret, credits = 2 * blk_per_bucket; | 4126 | int ret, credits = 2 * blk_per_bucket + handle->h_buffer_credits; |
3538 | 4127 | ||
3539 | BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize); | 4128 | BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize); |
3540 | 4129 | ||
@@ -3577,43 +4166,49 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode, | |||
3577 | */ | 4166 | */ |
3578 | static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode, | 4167 | static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode, |
3579 | handle_t *handle, | 4168 | handle_t *handle, |
3580 | struct buffer_head **first_bh, | 4169 | struct ocfs2_xattr_bucket *first, |
3581 | struct buffer_head **header_bh, | 4170 | struct ocfs2_xattr_bucket *target, |
3582 | u64 new_blk, | 4171 | u64 new_blk, |
3583 | u64 prev_blk, | ||
3584 | u32 prev_clusters, | 4172 | u32 prev_clusters, |
3585 | u32 *v_start, | 4173 | u32 *v_start, |
3586 | int *extend) | 4174 | int *extend) |
3587 | { | 4175 | { |
3588 | int ret = 0; | 4176 | int ret; |
3589 | int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); | ||
3590 | 4177 | ||
3591 | mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n", | 4178 | mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n", |
3592 | (unsigned long long)prev_blk, prev_clusters, | 4179 | (unsigned long long)bucket_blkno(first), prev_clusters, |
3593 | (unsigned long long)new_blk); | 4180 | (unsigned long long)new_blk); |
3594 | 4181 | ||
3595 | if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) | 4182 | if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) { |
3596 | ret = ocfs2_mv_xattr_bucket_cross_cluster(inode, | 4183 | ret = ocfs2_mv_xattr_bucket_cross_cluster(inode, |
3597 | handle, | 4184 | handle, |
3598 | first_bh, | 4185 | first, target, |
3599 | header_bh, | ||
3600 | new_blk, | 4186 | new_blk, |
3601 | prev_blk, | ||
3602 | prev_clusters, | 4187 | prev_clusters, |
3603 | v_start); | 4188 | v_start); |
3604 | else { | 4189 | if (ret) |
3605 | u64 last_blk = prev_blk + bpc * (prev_clusters - 1); | 4190 | mlog_errno(ret); |
3606 | 4191 | } else { | |
3607 | if (prev_clusters > 1 && (*header_bh)->b_blocknr != last_blk) | 4192 | /* The start of the last cluster in the first extent */ |
3608 | ret = ocfs2_cp_xattr_cluster(inode, handle, *first_bh, | 4193 | u64 last_blk = bucket_blkno(first) + |
3609 | last_blk, new_blk, | 4194 | ((prev_clusters - 1) * |
4195 | ocfs2_clusters_to_blocks(inode->i_sb, 1)); | ||
4196 | |||
4197 | if (prev_clusters > 1 && bucket_blkno(target) != last_blk) { | ||
4198 | ret = ocfs2_mv_xattr_buckets(inode, handle, | ||
4199 | bucket_blkno(first), | ||
4200 | last_blk, new_blk, 0, | ||
3610 | v_start); | 4201 | v_start); |
3611 | else { | 4202 | if (ret) |
4203 | mlog_errno(ret); | ||
4204 | } else { | ||
3612 | ret = ocfs2_divide_xattr_cluster(inode, handle, | 4205 | ret = ocfs2_divide_xattr_cluster(inode, handle, |
3613 | last_blk, new_blk, | 4206 | last_blk, new_blk, |
3614 | v_start); | 4207 | v_start); |
4208 | if (ret) | ||
4209 | mlog_errno(ret); | ||
3615 | 4210 | ||
3616 | if ((*header_bh)->b_blocknr == last_blk && extend) | 4211 | if ((bucket_blkno(target) == last_blk) && extend) |
3617 | *extend = 0; | 4212 | *extend = 0; |
3618 | } | 4213 | } |
3619 | } | 4214 | } |
@@ -3639,56 +4234,37 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode, | |||
3639 | */ | 4234 | */ |
3640 | static int ocfs2_add_new_xattr_cluster(struct inode *inode, | 4235 | static int ocfs2_add_new_xattr_cluster(struct inode *inode, |
3641 | struct buffer_head *root_bh, | 4236 | struct buffer_head *root_bh, |
3642 | struct buffer_head **first_bh, | 4237 | struct ocfs2_xattr_bucket *first, |
3643 | struct buffer_head **header_bh, | 4238 | struct ocfs2_xattr_bucket *target, |
3644 | u32 *num_clusters, | 4239 | u32 *num_clusters, |
3645 | u32 prev_cpos, | 4240 | u32 prev_cpos, |
3646 | u64 prev_blkno, | 4241 | int *extend, |
3647 | int *extend) | 4242 | struct ocfs2_xattr_set_ctxt *ctxt) |
3648 | { | 4243 | { |
3649 | int ret, credits; | 4244 | int ret; |
3650 | u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); | 4245 | u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); |
3651 | u32 prev_clusters = *num_clusters; | 4246 | u32 prev_clusters = *num_clusters; |
3652 | u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0; | 4247 | u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0; |
3653 | u64 block; | 4248 | u64 block; |
3654 | handle_t *handle = NULL; | 4249 | handle_t *handle = ctxt->handle; |
3655 | struct ocfs2_alloc_context *data_ac = NULL; | ||
3656 | struct ocfs2_alloc_context *meta_ac = NULL; | ||
3657 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 4250 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
3658 | struct ocfs2_extent_tree et; | 4251 | struct ocfs2_extent_tree et; |
3659 | 4252 | ||
3660 | mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, " | 4253 | mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, " |
3661 | "previous xattr blkno = %llu\n", | 4254 | "previous xattr blkno = %llu\n", |
3662 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 4255 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
3663 | prev_cpos, (unsigned long long)prev_blkno); | 4256 | prev_cpos, (unsigned long long)bucket_blkno(first)); |
3664 | 4257 | ||
3665 | ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh); | 4258 | ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh); |
3666 | 4259 | ||
3667 | ret = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, | 4260 | ret = ocfs2_journal_access_xb(handle, inode, root_bh, |
3668 | &data_ac, &meta_ac); | 4261 | OCFS2_JOURNAL_ACCESS_WRITE); |
3669 | if (ret) { | ||
3670 | mlog_errno(ret); | ||
3671 | goto leave; | ||
3672 | } | ||
3673 | |||
3674 | credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el, | ||
3675 | clusters_to_add); | ||
3676 | handle = ocfs2_start_trans(osb, credits); | ||
3677 | if (IS_ERR(handle)) { | ||
3678 | ret = PTR_ERR(handle); | ||
3679 | handle = NULL; | ||
3680 | mlog_errno(ret); | ||
3681 | goto leave; | ||
3682 | } | ||
3683 | |||
3684 | ret = ocfs2_journal_access(handle, inode, root_bh, | ||
3685 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
3686 | if (ret < 0) { | 4262 | if (ret < 0) { |
3687 | mlog_errno(ret); | 4263 | mlog_errno(ret); |
3688 | goto leave; | 4264 | goto leave; |
3689 | } | 4265 | } |
3690 | 4266 | ||
3691 | ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, | 4267 | ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 1, |
3692 | clusters_to_add, &bit_off, &num_bits); | 4268 | clusters_to_add, &bit_off, &num_bits); |
3693 | if (ret < 0) { | 4269 | if (ret < 0) { |
3694 | if (ret != -ENOSPC) | 4270 | if (ret != -ENOSPC) |
@@ -3702,7 +4278,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode, | |||
3702 | mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n", | 4278 | mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n", |
3703 | num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); | 4279 | num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); |
3704 | 4280 | ||
3705 | if (prev_blkno + prev_clusters * bpc == block && | 4281 | if (bucket_blkno(first) + (prev_clusters * bpc) == block && |
3706 | (prev_clusters + num_bits) << osb->s_clustersize_bits <= | 4282 | (prev_clusters + num_bits) << osb->s_clustersize_bits <= |
3707 | OCFS2_MAX_XATTR_TREE_LEAF_SIZE) { | 4283 | OCFS2_MAX_XATTR_TREE_LEAF_SIZE) { |
3708 | /* | 4284 | /* |
@@ -3721,10 +4297,9 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode, | |||
3721 | } else { | 4297 | } else { |
3722 | ret = ocfs2_adjust_xattr_cross_cluster(inode, | 4298 | ret = ocfs2_adjust_xattr_cross_cluster(inode, |
3723 | handle, | 4299 | handle, |
3724 | first_bh, | 4300 | first, |
3725 | header_bh, | 4301 | target, |
3726 | block, | 4302 | block, |
3727 | prev_blkno, | ||
3728 | prev_clusters, | 4303 | prev_clusters, |
3729 | &v_start, | 4304 | &v_start, |
3730 | extend); | 4305 | extend); |
@@ -3734,149 +4309,137 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode, | |||
3734 | } | 4309 | } |
3735 | } | 4310 | } |
3736 | 4311 | ||
3737 | if (handle->h_buffer_credits < credits) { | ||
3738 | /* | ||
3739 | * The journal has been restarted before, and don't | ||
3740 | * have enough space for the insertion, so extend it | ||
3741 | * here. | ||
3742 | */ | ||
3743 | ret = ocfs2_extend_trans(handle, credits); | ||
3744 | if (ret) { | ||
3745 | mlog_errno(ret); | ||
3746 | goto leave; | ||
3747 | } | ||
3748 | } | ||
3749 | mlog(0, "Insert %u clusters at block %llu for xattr at %u\n", | 4312 | mlog(0, "Insert %u clusters at block %llu for xattr at %u\n", |
3750 | num_bits, (unsigned long long)block, v_start); | 4313 | num_bits, (unsigned long long)block, v_start); |
3751 | ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block, | 4314 | ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block, |
3752 | num_bits, 0, meta_ac); | 4315 | num_bits, 0, ctxt->meta_ac); |
3753 | if (ret < 0) { | 4316 | if (ret < 0) { |
3754 | mlog_errno(ret); | 4317 | mlog_errno(ret); |
3755 | goto leave; | 4318 | goto leave; |
3756 | } | 4319 | } |
3757 | 4320 | ||
3758 | ret = ocfs2_journal_dirty(handle, root_bh); | 4321 | ret = ocfs2_journal_dirty(handle, root_bh); |
3759 | if (ret < 0) { | 4322 | if (ret < 0) |
3760 | mlog_errno(ret); | 4323 | mlog_errno(ret); |
3761 | goto leave; | ||
3762 | } | ||
3763 | 4324 | ||
3764 | leave: | 4325 | leave: |
3765 | if (handle) | ||
3766 | ocfs2_commit_trans(osb, handle); | ||
3767 | if (data_ac) | ||
3768 | ocfs2_free_alloc_context(data_ac); | ||
3769 | if (meta_ac) | ||
3770 | ocfs2_free_alloc_context(meta_ac); | ||
3771 | |||
3772 | return ret; | 4326 | return ret; |
3773 | } | 4327 | } |
3774 | 4328 | ||
3775 | /* | 4329 | /* |
3776 | * Extend a new xattr bucket and move xattrs to the end one by one until | 4330 | * We are given an extent. 'first' is the bucket at the very front of |
3777 | * We meet with start_bh. Only move half of the xattrs to the bucket after it. | 4331 | * the extent. The extent has space for an additional bucket past |
4332 | * bucket_xh(first)->xh_num_buckets. 'target_blkno' is the block number | ||
4333 | * of the target bucket. We wish to shift every bucket past the target | ||
4334 | * down one, filling in that additional space. When we get back to the | ||
4335 | * target, we split the target between itself and the now-empty bucket | ||
4336 | * at target+1 (aka, target_blkno + blks_per_bucket). | ||
3778 | */ | 4337 | */ |
3779 | static int ocfs2_extend_xattr_bucket(struct inode *inode, | 4338 | static int ocfs2_extend_xattr_bucket(struct inode *inode, |
3780 | struct buffer_head *first_bh, | 4339 | handle_t *handle, |
3781 | struct buffer_head *start_bh, | 4340 | struct ocfs2_xattr_bucket *first, |
4341 | u64 target_blk, | ||
3782 | u32 num_clusters) | 4342 | u32 num_clusters) |
3783 | { | 4343 | { |
3784 | int ret, credits; | 4344 | int ret, credits; |
3785 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 4345 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
3786 | u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); | 4346 | u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); |
3787 | u64 start_blk = start_bh->b_blocknr, end_blk; | 4347 | u64 end_blk; |
3788 | u32 num_buckets = num_clusters * ocfs2_xattr_buckets_per_cluster(osb); | 4348 | u16 new_bucket = le16_to_cpu(bucket_xh(first)->xh_num_buckets); |
3789 | handle_t *handle; | ||
3790 | struct ocfs2_xattr_header *first_xh = | ||
3791 | (struct ocfs2_xattr_header *)first_bh->b_data; | ||
3792 | u16 bucket = le16_to_cpu(first_xh->xh_num_buckets); | ||
3793 | 4349 | ||
3794 | mlog(0, "extend xattr bucket in %llu, xattr extend rec starting " | 4350 | mlog(0, "extend xattr bucket in %llu, xattr extend rec starting " |
3795 | "from %llu, len = %u\n", (unsigned long long)start_blk, | 4351 | "from %llu, len = %u\n", (unsigned long long)target_blk, |
3796 | (unsigned long long)first_bh->b_blocknr, num_clusters); | 4352 | (unsigned long long)bucket_blkno(first), num_clusters); |
3797 | 4353 | ||
3798 | BUG_ON(bucket >= num_buckets); | 4354 | /* The extent must have room for an additional bucket */ |
4355 | BUG_ON(new_bucket >= | ||
4356 | (num_clusters * ocfs2_xattr_buckets_per_cluster(osb))); | ||
3799 | 4357 | ||
3800 | end_blk = first_bh->b_blocknr + (bucket - 1) * blk_per_bucket; | 4358 | /* end_blk points to the last existing bucket */ |
4359 | end_blk = bucket_blkno(first) + ((new_bucket - 1) * blk_per_bucket); | ||
3801 | 4360 | ||
3802 | /* | 4361 | /* |
3803 | * We will touch all the buckets after the start_bh(include it). | 4362 | * end_blk is the start of the last existing bucket. |
3804 | * Add one more bucket and modify the first_bh. | 4363 | * Thus, (end_blk - target_blk) covers the target bucket and |
4364 | * every bucket after it up to, but not including, the last | ||
4365 | * existing bucket. Then we add the last existing bucket, the | ||
4366 | * new bucket, and the first bucket (3 * blk_per_bucket). | ||
3805 | */ | 4367 | */ |
3806 | credits = end_blk - start_blk + 2 * blk_per_bucket + 1; | 4368 | credits = (end_blk - target_blk) + (3 * blk_per_bucket) + |
3807 | handle = ocfs2_start_trans(osb, credits); | 4369 | handle->h_buffer_credits; |
3808 | if (IS_ERR(handle)) { | 4370 | ret = ocfs2_extend_trans(handle, credits); |
3809 | ret = PTR_ERR(handle); | 4371 | if (ret) { |
3810 | handle = NULL; | ||
3811 | mlog_errno(ret); | 4372 | mlog_errno(ret); |
3812 | goto out; | 4373 | goto out; |
3813 | } | 4374 | } |
3814 | 4375 | ||
3815 | ret = ocfs2_journal_access(handle, inode, first_bh, | 4376 | ret = ocfs2_xattr_bucket_journal_access(handle, first, |
3816 | OCFS2_JOURNAL_ACCESS_WRITE); | 4377 | OCFS2_JOURNAL_ACCESS_WRITE); |
3817 | if (ret) { | 4378 | if (ret) { |
3818 | mlog_errno(ret); | 4379 | mlog_errno(ret); |
3819 | goto commit; | 4380 | goto out; |
3820 | } | 4381 | } |
3821 | 4382 | ||
3822 | while (end_blk != start_blk) { | 4383 | while (end_blk != target_blk) { |
3823 | ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk, | 4384 | ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk, |
3824 | end_blk + blk_per_bucket, 0); | 4385 | end_blk + blk_per_bucket, 0); |
3825 | if (ret) | 4386 | if (ret) |
3826 | goto commit; | 4387 | goto out; |
3827 | end_blk -= blk_per_bucket; | 4388 | end_blk -= blk_per_bucket; |
3828 | } | 4389 | } |
3829 | 4390 | ||
3830 | /* Move half of the xattr in start_blk to the next bucket. */ | 4391 | /* Move half of the xattr in target_blkno to the next bucket. */ |
3831 | ret = ocfs2_divide_xattr_bucket(inode, handle, start_blk, | 4392 | ret = ocfs2_divide_xattr_bucket(inode, handle, target_blk, |
3832 | start_blk + blk_per_bucket, NULL, 0); | 4393 | target_blk + blk_per_bucket, NULL, 0); |
3833 | 4394 | ||
3834 | le16_add_cpu(&first_xh->xh_num_buckets, 1); | 4395 | le16_add_cpu(&bucket_xh(first)->xh_num_buckets, 1); |
3835 | ocfs2_journal_dirty(handle, first_bh); | 4396 | ocfs2_xattr_bucket_journal_dirty(handle, first); |
3836 | 4397 | ||
3837 | commit: | ||
3838 | ocfs2_commit_trans(osb, handle); | ||
3839 | out: | 4398 | out: |
3840 | return ret; | 4399 | return ret; |
3841 | } | 4400 | } |
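The extend path above shifts buckets from the end of the extent toward the target, then splits the target. A minimal sketch of just that loop, with an assumed bucket size and made-up block numbers; it only prints the copy/divide order and touches no filesystem state.

#include <stdio.h>

#define BLK_PER_BUCKET 4   /* example-only: 16K buckets made of 4K blocks */

int main(void)
{
	/* Made-up layout: five buckets in use, with room for a sixth at the end. */
	unsigned long long first_blk  = 2000;                            /* bucket 0 */
	unsigned long long target_blk = first_blk + 2 * BLK_PER_BUCKET;  /* bucket 2 */
	unsigned long long end_blk    = first_blk + 4 * BLK_PER_BUCKET;  /* bucket 4, last in use */

	/* Shift every bucket after the target down by one slot, last bucket first. */
	while (end_blk != target_blk) {
		printf("copy bucket at block %llu -> block %llu\n",
		       end_blk, end_blk + BLK_PER_BUCKET);
		end_blk -= BLK_PER_BUCKET;
	}

	/* Finally split the target between itself and the now-empty slot after it. */
	printf("divide bucket at block %llu into blocks %llu and %llu\n",
	       target_blk, target_blk, target_blk + BLK_PER_BUCKET);
	return 0;
}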
3842 | 4401 | ||
3843 | /* | 4402 | /* |
3844 | * Add new xattr bucket in an extent record and adjust the buckets accordingly. | 4403 | * Add new xattr bucket in an extent record and adjust the buckets |
3845 | * xb_bh is the ocfs2_xattr_block. | 4404 | * accordingly. xb_bh is the ocfs2_xattr_block, and target is the |
3846 | * We will move all the buckets starting from header_bh to the next place. As | 4405 | * bucket we want to insert into. |
3847 | * for this one, half num of its xattrs will be moved to the next one. | 4406 | * |
4407 | * In the easy case, we will move all the buckets after target down by | ||
4408 | * one. Half of target's xattrs will be moved to the next bucket. | ||
3848 | * | 4409 | * |
3849 | * We will allocate a new cluster if current cluster is full and adjust | 4410 | * If the current cluster is full, we'll allocate a new one. This may not |
3850 | * header_bh and first_bh if the insert place is moved to the new cluster. | 4411 | * be contiguous. The underlying calls will make sure that there is |
4412 | * space for the insert, shifting buckets around if necessary. | ||
4413 | * 'target' may be moved by those calls. | ||
3851 | */ | 4414 | */ |
3852 | static int ocfs2_add_new_xattr_bucket(struct inode *inode, | 4415 | static int ocfs2_add_new_xattr_bucket(struct inode *inode, |
3853 | struct buffer_head *xb_bh, | 4416 | struct buffer_head *xb_bh, |
3854 | struct buffer_head *header_bh) | 4417 | struct ocfs2_xattr_bucket *target, |
4418 | struct ocfs2_xattr_set_ctxt *ctxt) | ||
3855 | { | 4419 | { |
3856 | struct ocfs2_xattr_header *first_xh = NULL; | ||
3857 | struct buffer_head *first_bh = NULL; | ||
3858 | struct ocfs2_xattr_block *xb = | 4420 | struct ocfs2_xattr_block *xb = |
3859 | (struct ocfs2_xattr_block *)xb_bh->b_data; | 4421 | (struct ocfs2_xattr_block *)xb_bh->b_data; |
3860 | struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root; | 4422 | struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root; |
3861 | struct ocfs2_extent_list *el = &xb_root->xt_list; | 4423 | struct ocfs2_extent_list *el = &xb_root->xt_list; |
3862 | struct ocfs2_xattr_header *xh = | 4424 | u32 name_hash = |
3863 | (struct ocfs2_xattr_header *)header_bh->b_data; | 4425 | le32_to_cpu(bucket_xh(target)->xh_entries[0].xe_name_hash); |
3864 | u32 name_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash); | 4426 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
3865 | struct super_block *sb = inode->i_sb; | ||
3866 | struct ocfs2_super *osb = OCFS2_SB(sb); | ||
3867 | int ret, num_buckets, extend = 1; | 4427 | int ret, num_buckets, extend = 1; |
3868 | u64 p_blkno; | 4428 | u64 p_blkno; |
3869 | u32 e_cpos, num_clusters; | 4429 | u32 e_cpos, num_clusters; |
4430 | /* The bucket at the front of the extent */ | ||
4431 | struct ocfs2_xattr_bucket *first; | ||
3870 | 4432 | ||
3871 | mlog(0, "Add new xattr bucket starting form %llu\n", | 4433 | mlog(0, "Add new xattr bucket starting from %llu\n", |
3872 | (unsigned long long)header_bh->b_blocknr); | 4434 | (unsigned long long)bucket_blkno(target)); |
3873 | 4435 | ||
3874 | /* | 4436 | /* The first bucket of the original extent */ |
3875 | * Add refrence for header_bh here because it may be | 4437 | first = ocfs2_xattr_bucket_new(inode); |
3876 | * changed in ocfs2_add_new_xattr_cluster and we need | 4438 | if (!first) { |
3877 | * to free it in the end. | 4439 | ret = -ENOMEM; |
3878 | */ | 4440 | mlog_errno(ret); |
3879 | get_bh(header_bh); | 4441 | goto out; |
4442 | } | ||
3880 | 4443 | ||
3881 | ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos, | 4444 | ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos, |
3882 | &num_clusters, el); | 4445 | &num_clusters, el); |
@@ -3885,40 +4448,45 @@ static int ocfs2_add_new_xattr_bucket(struct inode *inode, | |||
3885 | goto out; | 4448 | goto out; |
3886 | } | 4449 | } |
3887 | 4450 | ||
3888 | ret = ocfs2_read_block(inode, p_blkno, &first_bh); | 4451 | ret = ocfs2_read_xattr_bucket(first, p_blkno); |
3889 | if (ret) { | 4452 | if (ret) { |
3890 | mlog_errno(ret); | 4453 | mlog_errno(ret); |
3891 | goto out; | 4454 | goto out; |
3892 | } | 4455 | } |
3893 | 4456 | ||
3894 | num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters; | 4457 | num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters; |
3895 | first_xh = (struct ocfs2_xattr_header *)first_bh->b_data; | 4458 | if (num_buckets == le16_to_cpu(bucket_xh(first)->xh_num_buckets)) { |
3896 | 4459 | /* | |
3897 | if (num_buckets == le16_to_cpu(first_xh->xh_num_buckets)) { | 4460 | * This can move first+target if the target bucket moves |
4461 | * to the new extent. | ||
4462 | */ | ||
3898 | ret = ocfs2_add_new_xattr_cluster(inode, | 4463 | ret = ocfs2_add_new_xattr_cluster(inode, |
3899 | xb_bh, | 4464 | xb_bh, |
3900 | &first_bh, | 4465 | first, |
3901 | &header_bh, | 4466 | target, |
3902 | &num_clusters, | 4467 | &num_clusters, |
3903 | e_cpos, | 4468 | e_cpos, |
3904 | p_blkno, | 4469 | &extend, |
3905 | &extend); | 4470 | ctxt); |
3906 | if (ret) { | 4471 | if (ret) { |
3907 | mlog_errno(ret); | 4472 | mlog_errno(ret); |
3908 | goto out; | 4473 | goto out; |
3909 | } | 4474 | } |
3910 | } | 4475 | } |
3911 | 4476 | ||
3912 | if (extend) | 4477 | if (extend) { |
3913 | ret = ocfs2_extend_xattr_bucket(inode, | 4478 | ret = ocfs2_extend_xattr_bucket(inode, |
3914 | first_bh, | 4479 | ctxt->handle, |
3915 | header_bh, | 4480 | first, |
4481 | bucket_blkno(target), | ||
3916 | num_clusters); | 4482 | num_clusters); |
3917 | if (ret) | 4483 | if (ret) |
3918 | mlog_errno(ret); | 4484 | mlog_errno(ret); |
4485 | } | ||
4486 | |||
3919 | out: | 4487 | out: |
3920 | brelse(first_bh); | 4488 | ocfs2_xattr_bucket_free(first); |
3921 | brelse(header_bh); | 4489 | |
3922 | return ret; | 4490 | return ret; |
3923 | } | 4491 | } |
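The top-level decision in ocfs2_add_new_xattr_bucket() is simply whether the extent is already saturated with buckets: if so, grab a new cluster first; if not, shift buckets within the extent and split the target. A small standalone sketch of that check, with example-only geometry and counts.

#include <stdio.h>

#define BUCKETS_PER_CLUSTER 2   /* example-only */

int main(void)
{
	unsigned int num_clusters = 3;   /* clusters in this xattr extent */
	unsigned int used_buckets = 6;   /* stand-in for xh_num_buckets of the first bucket */
	unsigned int capacity = num_clusters * BUCKETS_PER_CLUSTER;

	/* Mirrors the "extent full?" test in the function above. */
	if (used_buckets == capacity)
		printf("extent full (%u/%u buckets): allocate a new cluster first\n",
		       used_buckets, capacity);
	else
		printf("room left (%u/%u buckets): shift buckets and split the target\n",
		       used_buckets, capacity);
	return 0;
}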
3924 | 4492 | ||
@@ -3929,7 +4497,7 @@ static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode, | |||
3929 | int block_off = offs >> inode->i_sb->s_blocksize_bits; | 4497 | int block_off = offs >> inode->i_sb->s_blocksize_bits; |
3930 | 4498 | ||
3931 | offs = offs % inode->i_sb->s_blocksize; | 4499 | offs = offs % inode->i_sb->s_blocksize; |
3932 | return bucket->bhs[block_off]->b_data + offs; | 4500 | return bucket_block(bucket, block_off) + offs; |
3933 | } | 4501 | } |
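ocfs2_xattr_bucket_get_val() translates a bucket-relative offset into a (block index, byte offset) pair with a shift and a modulo. A worked example, assuming a 4 KB block size (s_blocksize_bits = 12); the offset is an arbitrary illustrative value.

#include <stdio.h>

int main(void)
{
	unsigned int blocksize_bits = 12;                 /* assumed 4 KB blocks */
	unsigned int blocksize = 1u << blocksize_bits;
	unsigned int offs = 5000;                         /* offset within the bucket */

	unsigned int block_off = offs >> blocksize_bits;  /* which block of the bucket */
	unsigned int in_block  = offs % blocksize;        /* byte offset inside that block */

	printf("bucket offset %u -> block %u, byte %u\n", offs, block_off, in_block);
	/* prints: bucket offset 5000 -> block 1, byte 904 */
	return 0;
}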
3934 | 4502 | ||
3935 | /* | 4503 | /* |
@@ -3984,7 +4552,7 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode, | |||
3984 | xe->xe_value_size = 0; | 4552 | xe->xe_value_size = 0; |
3985 | 4553 | ||
3986 | val = ocfs2_xattr_bucket_get_val(inode, | 4554 | val = ocfs2_xattr_bucket_get_val(inode, |
3987 | &xs->bucket, offs); | 4555 | xs->bucket, offs); |
3988 | memset(val + OCFS2_XATTR_SIZE(name_len), 0, | 4556 | memset(val + OCFS2_XATTR_SIZE(name_len), 0, |
3989 | size - OCFS2_XATTR_SIZE(name_len)); | 4557 | size - OCFS2_XATTR_SIZE(name_len)); |
3990 | if (OCFS2_XATTR_SIZE(xi->value_len) > 0) | 4558 | if (OCFS2_XATTR_SIZE(xi->value_len) > 0) |
@@ -4062,8 +4630,7 @@ set_new_name_value: | |||
4062 | xh->xh_free_start = cpu_to_le16(offs); | 4630 | xh->xh_free_start = cpu_to_le16(offs); |
4063 | } | 4631 | } |
4064 | 4632 | ||
4065 | val = ocfs2_xattr_bucket_get_val(inode, | 4633 | val = ocfs2_xattr_bucket_get_val(inode, xs->bucket, offs - size); |
4066 | &xs->bucket, offs - size); | ||
4067 | xe->xe_name_offset = cpu_to_le16(offs - size); | 4634 | xe->xe_name_offset = cpu_to_le16(offs - size); |
4068 | 4635 | ||
4069 | memset(val, 0, size); | 4636 | memset(val, 0, size); |
@@ -4079,125 +4646,45 @@ set_new_name_value: | |||
4079 | return; | 4646 | return; |
4080 | } | 4647 | } |
4081 | 4648 | ||
4082 | static int ocfs2_xattr_bucket_handle_journal(struct inode *inode, | ||
4083 | handle_t *handle, | ||
4084 | struct ocfs2_xattr_search *xs, | ||
4085 | struct buffer_head **bhs, | ||
4086 | u16 bh_num) | ||
4087 | { | ||
4088 | int ret = 0, off, block_off; | ||
4089 | struct ocfs2_xattr_entry *xe = xs->here; | ||
4090 | |||
4091 | /* | ||
4092 | * First calculate all the blocks we should journal_access | ||
4093 | * and journal_dirty. The first block should always be touched. | ||
4094 | */ | ||
4095 | ret = ocfs2_journal_dirty(handle, bhs[0]); | ||
4096 | if (ret) | ||
4097 | mlog_errno(ret); | ||
4098 | |||
4099 | /* calc the data. */ | ||
4100 | off = le16_to_cpu(xe->xe_name_offset); | ||
4101 | block_off = off >> inode->i_sb->s_blocksize_bits; | ||
4102 | ret = ocfs2_journal_dirty(handle, bhs[block_off]); | ||
4103 | if (ret) | ||
4104 | mlog_errno(ret); | ||
4105 | |||
4106 | return ret; | ||
4107 | } | ||
4108 | |||
4109 | /* | 4649 | /* |
4110 | * Set the xattr entry in the specified bucket. | 4650 | * Set the xattr entry in the specified bucket. |
4111 | * The bucket is indicated by xs->bucket and it should have enough | 4651 | * The bucket is indicated by xs->bucket and it should have enough |
4112 | * space for the xattr insertion. | 4652 | * space for the xattr insertion. |
4113 | */ | 4653 | */ |
4114 | static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode, | 4654 | static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode, |
4655 | handle_t *handle, | ||
4115 | struct ocfs2_xattr_info *xi, | 4656 | struct ocfs2_xattr_info *xi, |
4116 | struct ocfs2_xattr_search *xs, | 4657 | struct ocfs2_xattr_search *xs, |
4117 | u32 name_hash, | 4658 | u32 name_hash, |
4118 | int local) | 4659 | int local) |
4119 | { | 4660 | { |
4120 | int i, ret; | 4661 | int ret; |
4121 | handle_t *handle = NULL; | 4662 | u64 blkno; |
4122 | u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); | ||
4123 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
4124 | 4663 | ||
4125 | mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n", | 4664 | mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n", |
4126 | (unsigned long)xi->value_len, xi->name_index, | 4665 | (unsigned long)xi->value_len, xi->name_index, |
4127 | (unsigned long long)xs->bucket.bhs[0]->b_blocknr); | 4666 | (unsigned long long)bucket_blkno(xs->bucket)); |
4128 | 4667 | ||
4129 | if (!xs->bucket.bhs[1]) { | 4668 | if (!xs->bucket->bu_bhs[1]) { |
4130 | ret = ocfs2_read_blocks(inode, | 4669 | blkno = bucket_blkno(xs->bucket); |
4131 | xs->bucket.bhs[0]->b_blocknr + 1, | 4670 | ocfs2_xattr_bucket_relse(xs->bucket); |
4132 | blk_per_bucket - 1, &xs->bucket.bhs[1], | 4671 | ret = ocfs2_read_xattr_bucket(xs->bucket, blkno); |
4133 | 0); | ||
4134 | if (ret) { | 4672 | if (ret) { |
4135 | mlog_errno(ret); | 4673 | mlog_errno(ret); |
4136 | goto out; | 4674 | goto out; |
4137 | } | 4675 | } |
4138 | } | 4676 | } |
4139 | 4677 | ||
4140 | handle = ocfs2_start_trans(osb, blk_per_bucket); | 4678 | ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket, |
4141 | if (IS_ERR(handle)) { | 4679 | OCFS2_JOURNAL_ACCESS_WRITE); |
4142 | ret = PTR_ERR(handle); | 4680 | if (ret < 0) { |
4143 | handle = NULL; | ||
4144 | mlog_errno(ret); | 4681 | mlog_errno(ret); |
4145 | goto out; | 4682 | goto out; |
4146 | } | 4683 | } |
4147 | 4684 | ||
4148 | for (i = 0; i < blk_per_bucket; i++) { | ||
4149 | ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[i], | ||
4150 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
4151 | if (ret < 0) { | ||
4152 | mlog_errno(ret); | ||
4153 | goto out; | ||
4154 | } | ||
4155 | } | ||
4156 | |||
4157 | ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local); | 4685 | ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local); |
4686 | ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket); | ||
4158 | 4687 | ||
4159 | /*Only dirty the blocks we have touched in set xattr. */ | ||
4160 | ret = ocfs2_xattr_bucket_handle_journal(inode, handle, xs, | ||
4161 | xs->bucket.bhs, blk_per_bucket); | ||
4162 | if (ret) | ||
4163 | mlog_errno(ret); | ||
4164 | out: | ||
4165 | ocfs2_commit_trans(osb, handle); | ||
4166 | |||
4167 | return ret; | ||
4168 | } | ||
4169 | |||
4170 | static int ocfs2_xattr_value_update_size(struct inode *inode, | ||
4171 | struct buffer_head *xe_bh, | ||
4172 | struct ocfs2_xattr_entry *xe, | ||
4173 | u64 new_size) | ||
4174 | { | ||
4175 | int ret; | ||
4176 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
4177 | handle_t *handle = NULL; | ||
4178 | |||
4179 | handle = ocfs2_start_trans(osb, 1); | ||
4180 | if (IS_ERR(handle)) { | ||
4181 | ret = -ENOMEM; | ||
4182 | mlog_errno(ret); | ||
4183 | goto out; | ||
4184 | } | ||
4185 | |||
4186 | ret = ocfs2_journal_access(handle, inode, xe_bh, | ||
4187 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
4188 | if (ret < 0) { | ||
4189 | mlog_errno(ret); | ||
4190 | goto out_commit; | ||
4191 | } | ||
4192 | |||
4193 | xe->xe_value_size = cpu_to_le64(new_size); | ||
4194 | |||
4195 | ret = ocfs2_journal_dirty(handle, xe_bh); | ||
4196 | if (ret < 0) | ||
4197 | mlog_errno(ret); | ||
4198 | |||
4199 | out_commit: | ||
4200 | ocfs2_commit_trans(osb, handle); | ||
4201 | out: | 4688 | out: |
4202 | return ret; | 4689 | return ret; |
4203 | } | 4690 | } |
@@ -4210,18 +4697,19 @@ out: | |||
4210 | * Copy the new updated xe and xe_value_root to new_xe and new_xv if needed. | 4697 | * Copy the new updated xe and xe_value_root to new_xe and new_xv if needed. |
4211 | */ | 4698 | */ |
4212 | static int ocfs2_xattr_bucket_value_truncate(struct inode *inode, | 4699 | static int ocfs2_xattr_bucket_value_truncate(struct inode *inode, |
4213 | struct buffer_head *header_bh, | 4700 | struct ocfs2_xattr_bucket *bucket, |
4214 | int xe_off, | 4701 | int xe_off, |
4215 | int len) | 4702 | int len, |
4703 | struct ocfs2_xattr_set_ctxt *ctxt) | ||
4216 | { | 4704 | { |
4217 | int ret, offset; | 4705 | int ret, offset; |
4218 | u64 value_blk; | 4706 | u64 value_blk; |
4219 | struct buffer_head *value_bh = NULL; | ||
4220 | struct ocfs2_xattr_value_root *xv; | ||
4221 | struct ocfs2_xattr_entry *xe; | 4707 | struct ocfs2_xattr_entry *xe; |
4222 | struct ocfs2_xattr_header *xh = | 4708 | struct ocfs2_xattr_header *xh = bucket_xh(bucket); |
4223 | (struct ocfs2_xattr_header *)header_bh->b_data; | ||
4224 | size_t blocksize = inode->i_sb->s_blocksize; | 4709 | size_t blocksize = inode->i_sb->s_blocksize; |
4710 | struct ocfs2_xattr_value_buf vb = { | ||
4711 | .vb_access = ocfs2_journal_access, | ||
4712 | }; | ||
4225 | 4713 | ||
4226 | xe = &xh->xh_entries[xe_off]; | 4714 | xe = &xh->xh_entries[xe_off]; |
4227 | 4715 | ||
@@ -4234,49 +4722,58 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode, | |||
4234 | 4722 | ||
4235 | /* We don't allow ocfs2_xattr_value to be stored in different block. */ | 4723 | /* We don't allow ocfs2_xattr_value to be stored in different block. */ |
4236 | BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize); | 4724 | BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize); |
4237 | value_blk += header_bh->b_blocknr; | ||
4238 | 4725 | ||
4239 | ret = ocfs2_read_block(inode, value_blk, &value_bh); | 4726 | vb.vb_bh = bucket->bu_bhs[value_blk]; |
4240 | if (ret) { | 4727 | BUG_ON(!vb.vb_bh); |
4241 | mlog_errno(ret); | ||
4242 | goto out; | ||
4243 | } | ||
4244 | 4728 | ||
4245 | xv = (struct ocfs2_xattr_value_root *) | 4729 | vb.vb_xv = (struct ocfs2_xattr_value_root *) |
4246 | (value_bh->b_data + offset % blocksize); | 4730 | (vb.vb_bh->b_data + offset % blocksize); |
4247 | 4731 | ||
4248 | mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n", | 4732 | ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket, |
4249 | xe_off, (unsigned long long)header_bh->b_blocknr, len); | 4733 | OCFS2_JOURNAL_ACCESS_WRITE); |
4250 | ret = ocfs2_xattr_value_truncate(inode, value_bh, xv, len); | ||
4251 | if (ret) { | 4734 | if (ret) { |
4252 | mlog_errno(ret); | 4735 | mlog_errno(ret); |
4253 | goto out; | 4736 | goto out; |
4254 | } | 4737 | } |
4255 | 4738 | ||
4256 | ret = ocfs2_xattr_value_update_size(inode, header_bh, xe, len); | 4739 | /* |
4740 | * From here on out we have to dirty the bucket. The generic | ||
4741 | * value calls only modify one of the bucket's bhs, but we need | ||
4742 | * to send the bucket at once. So if they error, they *could* have | ||
4743 | * modified something. We have to assume they did, and dirty | ||
4744 | * the whole bucket. This leaves us in a consistent state. | ||
4745 | */ | ||
4746 | mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n", | ||
4747 | xe_off, (unsigned long long)bucket_blkno(bucket), len); | ||
4748 | ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt); | ||
4257 | if (ret) { | 4749 | if (ret) { |
4258 | mlog_errno(ret); | 4750 | mlog_errno(ret); |
4259 | goto out; | 4751 | goto out_dirty; |
4260 | } | 4752 | } |
4261 | 4753 | ||
4754 | xe->xe_value_size = cpu_to_le64(len); | ||
4755 | |||
4756 | out_dirty: | ||
4757 | ocfs2_xattr_bucket_journal_dirty(ctxt->handle, bucket); | ||
4758 | |||
4262 | out: | 4759 | out: |
4263 | brelse(value_bh); | ||
4264 | return ret; | 4760 | return ret; |
4265 | } | 4761 | } |
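The BUG_ON in ocfs2_xattr_bucket_value_truncate() encodes the rule that an xattr value root may not straddle two blocks of the bucket. A tiny standalone check of that condition; the block size, root size, and offset are made-up numbers for illustration only, since OCFS2_XATTR_ROOT_SIZE's real value is not shown in this diff.

#include <stdio.h>

int main(void)
{
	unsigned int blocksize = 4096;   /* assumed block size */
	unsigned int root_size = 96;     /* hypothetical size of the value root */
	unsigned int offset = 4060;      /* name/value offset within the bucket */

	unsigned int start_blk = offset / blocksize;
	unsigned int end_blk = (offset + root_size - 1) / blocksize;

	/* The kernel asserts start and end land in the same block; here we just report it. */
	if (start_blk != end_blk)
		printf("value root at offset %u would straddle blocks %u and %u\n",
		       offset, start_blk, end_blk);
	else
		printf("value root at offset %u fits entirely in block %u\n",
		       offset, start_blk);
	return 0;
}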
4266 | 4762 | ||
4267 | static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode, | 4763 | static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode, |
4268 | struct ocfs2_xattr_search *xs, | 4764 | struct ocfs2_xattr_search *xs, |
4269 | int len) | 4765 | int len, |
4766 | struct ocfs2_xattr_set_ctxt *ctxt) | ||
4270 | { | 4767 | { |
4271 | int ret, offset; | 4768 | int ret, offset; |
4272 | struct ocfs2_xattr_entry *xe = xs->here; | 4769 | struct ocfs2_xattr_entry *xe = xs->here; |
4273 | struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base; | 4770 | struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base; |
4274 | 4771 | ||
4275 | BUG_ON(!xs->bucket.bhs[0] || !xe || ocfs2_xattr_is_local(xe)); | 4772 | BUG_ON(!xs->bucket->bu_bhs[0] || !xe || ocfs2_xattr_is_local(xe)); |
4276 | 4773 | ||
4277 | offset = xe - xh->xh_entries; | 4774 | offset = xe - xh->xh_entries; |
4278 | ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket.bhs[0], | 4775 | ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket, |
4279 | offset, len); | 4776 | offset, len, ctxt); |
4280 | if (ret) | 4777 | if (ret) |
4281 | mlog_errno(ret); | 4778 | mlog_errno(ret); |
4282 | 4779 | ||
@@ -4284,6 +4781,7 @@ static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode, | |||
4284 | } | 4781 | } |
4285 | 4782 | ||
4286 | static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode, | 4783 | static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode, |
4784 | handle_t *handle, | ||
4287 | struct ocfs2_xattr_search *xs, | 4785 | struct ocfs2_xattr_search *xs, |
4288 | char *val, | 4786 | char *val, |
4289 | int value_len) | 4787 | int value_len) |
@@ -4299,7 +4797,8 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode, | |||
4299 | 4797 | ||
4300 | xv = (struct ocfs2_xattr_value_root *)(xs->base + offset); | 4798 | xv = (struct ocfs2_xattr_value_root *)(xs->base + offset); |
4301 | 4799 | ||
4302 | return __ocfs2_xattr_set_value_outside(inode, xv, val, value_len); | 4800 | return __ocfs2_xattr_set_value_outside(inode, handle, |
4801 | xv, val, value_len); | ||
4303 | } | 4802 | } |
4304 | 4803 | ||
4305 | static int ocfs2_rm_xattr_cluster(struct inode *inode, | 4804 | static int ocfs2_rm_xattr_cluster(struct inode *inode, |
@@ -4343,15 +4842,15 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode, | |||
4343 | } | 4842 | } |
4344 | } | 4843 | } |
4345 | 4844 | ||
4346 | handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS); | 4845 | handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb)); |
4347 | if (IS_ERR(handle)) { | 4846 | if (IS_ERR(handle)) { |
4348 | ret = -ENOMEM; | 4847 | ret = -ENOMEM; |
4349 | mlog_errno(ret); | 4848 | mlog_errno(ret); |
4350 | goto out; | 4849 | goto out; |
4351 | } | 4850 | } |
4352 | 4851 | ||
4353 | ret = ocfs2_journal_access(handle, inode, root_bh, | 4852 | ret = ocfs2_journal_access_xb(handle, inode, root_bh, |
4354 | OCFS2_JOURNAL_ACCESS_WRITE); | 4853 | OCFS2_JOURNAL_ACCESS_WRITE); |
4355 | if (ret) { | 4854 | if (ret) { |
4356 | mlog_errno(ret); | 4855 | mlog_errno(ret); |
4357 | goto out_commit; | 4856 | goto out_commit; |
@@ -4392,26 +4891,19 @@ out: | |||
4392 | } | 4891 | } |
4393 | 4892 | ||
4394 | static void ocfs2_xattr_bucket_remove_xs(struct inode *inode, | 4893 | static void ocfs2_xattr_bucket_remove_xs(struct inode *inode, |
4894 | handle_t *handle, | ||
4395 | struct ocfs2_xattr_search *xs) | 4895 | struct ocfs2_xattr_search *xs) |
4396 | { | 4896 | { |
4397 | handle_t *handle = NULL; | 4897 | struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket); |
4398 | struct ocfs2_xattr_header *xh = xs->bucket.xh; | ||
4399 | struct ocfs2_xattr_entry *last = &xh->xh_entries[ | 4898 | struct ocfs2_xattr_entry *last = &xh->xh_entries[ |
4400 | le16_to_cpu(xh->xh_count) - 1]; | 4899 | le16_to_cpu(xh->xh_count) - 1]; |
4401 | int ret = 0; | 4900 | int ret = 0; |
4402 | 4901 | ||
4403 | handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1); | 4902 | ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket, |
4404 | if (IS_ERR(handle)) { | 4903 | OCFS2_JOURNAL_ACCESS_WRITE); |
4405 | ret = PTR_ERR(handle); | ||
4406 | mlog_errno(ret); | ||
4407 | return; | ||
4408 | } | ||
4409 | |||
4410 | ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[0], | ||
4411 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
4412 | if (ret) { | 4904 | if (ret) { |
4413 | mlog_errno(ret); | 4905 | mlog_errno(ret); |
4414 | goto out_commit; | 4906 | return; |
4415 | } | 4907 | } |
4416 | 4908 | ||
4417 | /* Remove the old entry. */ | 4909 | /* Remove the old entry. */ |
@@ -4420,11 +4912,7 @@ static void ocfs2_xattr_bucket_remove_xs(struct inode *inode, | |||
4420 | memset(last, 0, sizeof(struct ocfs2_xattr_entry)); | 4912 | memset(last, 0, sizeof(struct ocfs2_xattr_entry)); |
4421 | le16_add_cpu(&xh->xh_count, -1); | 4913 | le16_add_cpu(&xh->xh_count, -1); |
4422 | 4914 | ||
4423 | ret = ocfs2_journal_dirty(handle, xs->bucket.bhs[0]); | 4915 | ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket); |
4424 | if (ret < 0) | ||
4425 | mlog_errno(ret); | ||
4426 | out_commit: | ||
4427 | ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); | ||
4428 | } | 4916 | } |
4429 | 4917 | ||
4430 | /* | 4918 | /* |
@@ -4440,7 +4928,8 @@ out_commit: | |||
4440 | */ | 4928 | */ |
4441 | static int ocfs2_xattr_set_in_bucket(struct inode *inode, | 4929 | static int ocfs2_xattr_set_in_bucket(struct inode *inode, |
4442 | struct ocfs2_xattr_info *xi, | 4930 | struct ocfs2_xattr_info *xi, |
4443 | struct ocfs2_xattr_search *xs) | 4931 | struct ocfs2_xattr_search *xs, |
4932 | struct ocfs2_xattr_set_ctxt *ctxt) | ||
4444 | { | 4933 | { |
4445 | int ret, local = 1; | 4934 | int ret, local = 1; |
4446 | size_t value_len; | 4935 | size_t value_len; |
@@ -4468,7 +4957,8 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode, | |||
4468 | value_len = 0; | 4957 | value_len = 0; |
4469 | 4958 | ||
4470 | ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs, | 4959 | ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs, |
4471 | value_len); | 4960 | value_len, |
4961 | ctxt); | ||
4472 | if (ret) | 4962 | if (ret) |
4473 | goto out; | 4963 | goto out; |
4474 | 4964 | ||
@@ -4488,7 +4978,8 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode, | |||
4488 | xi->value_len = OCFS2_XATTR_ROOT_SIZE; | 4978 | xi->value_len = OCFS2_XATTR_ROOT_SIZE; |
4489 | } | 4979 | } |
4490 | 4980 | ||
4491 | ret = ocfs2_xattr_set_entry_in_bucket(inode, xi, xs, name_hash, local); | 4981 | ret = ocfs2_xattr_set_entry_in_bucket(inode, ctxt->handle, xi, xs, |
4982 | name_hash, local); | ||
4492 | if (ret) { | 4983 | if (ret) { |
4493 | mlog_errno(ret); | 4984 | mlog_errno(ret); |
4494 | goto out; | 4985 | goto out; |
@@ -4499,7 +4990,7 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode, | |||
4499 | 4990 | ||
4500 | /* allocate the space now for the outside block storage. */ | 4991 | /* allocate the space now for the outside block storage. */ |
4501 | ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs, | 4992 | ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs, |
4502 | value_len); | 4993 | value_len, ctxt); |
4503 | if (ret) { | 4994 | if (ret) { |
4504 | mlog_errno(ret); | 4995 | mlog_errno(ret); |
4505 | 4996 | ||
@@ -4509,13 +5000,14 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode, | |||
4509 | * storage and we have allocated xattr already, | 5000 | * storage and we have allocated xattr already, |
4510 | * so need to remove it. | 5001 | * so need to remove it. |
4511 | */ | 5002 | */ |
4512 | ocfs2_xattr_bucket_remove_xs(inode, xs); | 5003 | ocfs2_xattr_bucket_remove_xs(inode, ctxt->handle, xs); |
4513 | } | 5004 | } |
4514 | goto out; | 5005 | goto out; |
4515 | } | 5006 | } |
4516 | 5007 | ||
4517 | set_value_outside: | 5008 | set_value_outside: |
4518 | ret = ocfs2_xattr_bucket_set_value_outside(inode, xs, val, value_len); | 5009 | ret = ocfs2_xattr_bucket_set_value_outside(inode, ctxt->handle, |
5010 | xs, val, value_len); | ||
4519 | out: | 5011 | out: |
4520 | return ret; | 5012 | return ret; |
4521 | } | 5013 | } |
@@ -4530,7 +5022,7 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode, | |||
4530 | struct ocfs2_xattr_bucket *bucket, | 5022 | struct ocfs2_xattr_bucket *bucket, |
4531 | const char *name) | 5023 | const char *name) |
4532 | { | 5024 | { |
4533 | struct ocfs2_xattr_header *xh = bucket->xh; | 5025 | struct ocfs2_xattr_header *xh = bucket_xh(bucket); |
4534 | u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name)); | 5026 | u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name)); |
4535 | 5027 | ||
4536 | if (name_hash != le32_to_cpu(xh->xh_entries[0].xe_name_hash)) | 5028 | if (name_hash != le32_to_cpu(xh->xh_entries[0].xe_name_hash)) |
@@ -4540,7 +5032,7 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode, | |||
4540 | xh->xh_entries[0].xe_name_hash) { | 5032 | xh->xh_entries[0].xe_name_hash) { |
4541 | mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, " | 5033 | mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, " |
4542 | "hash = %u\n", | 5034 | "hash = %u\n", |
4543 | (unsigned long long)bucket->bhs[0]->b_blocknr, | 5035 | (unsigned long long)bucket_blkno(bucket), |
4544 | le32_to_cpu(xh->xh_entries[0].xe_name_hash)); | 5036 | le32_to_cpu(xh->xh_entries[0].xe_name_hash)); |
4545 | return -ENOSPC; | 5037 | return -ENOSPC; |
4546 | } | 5038 | } |
@@ -4550,16 +5042,16 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode, | |||
4550 | 5042 | ||
4551 | static int ocfs2_xattr_set_entry_index_block(struct inode *inode, | 5043 | static int ocfs2_xattr_set_entry_index_block(struct inode *inode, |
4552 | struct ocfs2_xattr_info *xi, | 5044 | struct ocfs2_xattr_info *xi, |
4553 | struct ocfs2_xattr_search *xs) | 5045 | struct ocfs2_xattr_search *xs, |
5046 | struct ocfs2_xattr_set_ctxt *ctxt) | ||
4554 | { | 5047 | { |
4555 | struct ocfs2_xattr_header *xh; | 5048 | struct ocfs2_xattr_header *xh; |
4556 | struct ocfs2_xattr_entry *xe; | 5049 | struct ocfs2_xattr_entry *xe; |
4557 | u16 count, header_size, xh_free_start; | 5050 | u16 count, header_size, xh_free_start; |
4558 | int i, free, max_free, need, old; | 5051 | int free, max_free, need, old; |
4559 | size_t value_size = 0, name_len = strlen(xi->name); | 5052 | size_t value_size = 0, name_len = strlen(xi->name); |
4560 | size_t blocksize = inode->i_sb->s_blocksize; | 5053 | size_t blocksize = inode->i_sb->s_blocksize; |
4561 | int ret, allocation = 0; | 5054 | int ret, allocation = 0; |
4562 | u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); | ||
4563 | 5055 | ||
4564 | mlog_entry("Set xattr %s in xattr index block\n", xi->name); | 5056 | mlog_entry("Set xattr %s in xattr index block\n", xi->name); |
4565 | 5057 | ||
@@ -4574,7 +5066,7 @@ try_again: | |||
4574 | 5066 | ||
4575 | mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size " | 5067 | mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size " |
4576 | "of %u which exceed block size\n", | 5068 | "of %u which exceed block size\n", |
4577 | (unsigned long long)xs->bucket.bhs[0]->b_blocknr, | 5069 | (unsigned long long)bucket_blkno(xs->bucket), |
4578 | header_size); | 5070 | header_size); |
4579 | 5071 | ||
4580 | if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) | 5072 | if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) |
@@ -4614,11 +5106,13 @@ try_again: | |||
4614 | mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, " | 5106 | mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, " |
4615 | "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len =" | 5107 | "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len =" |
4616 | " %u\n", xs->not_found, | 5108 | " %u\n", xs->not_found, |
4617 | (unsigned long long)xs->bucket.bhs[0]->b_blocknr, | 5109 | (unsigned long long)bucket_blkno(xs->bucket), |
4618 | free, need, max_free, le16_to_cpu(xh->xh_free_start), | 5110 | free, need, max_free, le16_to_cpu(xh->xh_free_start), |
4619 | le16_to_cpu(xh->xh_name_value_len)); | 5111 | le16_to_cpu(xh->xh_name_value_len)); |
4620 | 5112 | ||
4621 | if (free < need || count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) { | 5113 | if (free < need || |
5114 | (xs->not_found && | ||
5115 | count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb))) { | ||
4622 | if (need <= max_free && | 5116 | if (need <= max_free && |
4623 | count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) { | 5117 | count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) { |
4624 | /* | 5118 | /* |
@@ -4626,7 +5120,8 @@ try_again: | |||
4626 | * name/value will be moved, the xe shouldn't be changed | 5120 | * name/value will be moved, the xe shouldn't be changed |
4627 | * in xs. | 5121 | * in xs. |
4628 | */ | 5122 | */ |
4629 | ret = ocfs2_defrag_xattr_bucket(inode, &xs->bucket); | 5123 | ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle, |
5124 | xs->bucket); | ||
4630 | if (ret) { | 5125 | if (ret) { |
4631 | mlog_errno(ret); | 5126 | mlog_errno(ret); |
4632 | goto out; | 5127 | goto out; |
@@ -4658,7 +5153,7 @@ try_again: | |||
4658 | * add a new bucket for the insert. | 5153 | * add a new bucket for the insert. |
4659 | */ | 5154 | */ |
4660 | ret = ocfs2_check_xattr_bucket_collision(inode, | 5155 | ret = ocfs2_check_xattr_bucket_collision(inode, |
4661 | &xs->bucket, | 5156 | xs->bucket, |
4662 | xi->name); | 5157 | xi->name); |
4663 | if (ret) { | 5158 | if (ret) { |
4664 | mlog_errno(ret); | 5159 | mlog_errno(ret); |
@@ -4667,17 +5162,21 @@ try_again: | |||
4667 | 5162 | ||
4668 | ret = ocfs2_add_new_xattr_bucket(inode, | 5163 | ret = ocfs2_add_new_xattr_bucket(inode, |
4669 | xs->xattr_bh, | 5164 | xs->xattr_bh, |
4670 | xs->bucket.bhs[0]); | 5165 | xs->bucket, |
5166 | ctxt); | ||
4671 | if (ret) { | 5167 | if (ret) { |
4672 | mlog_errno(ret); | 5168 | mlog_errno(ret); |
4673 | goto out; | 5169 | goto out; |
4674 | } | 5170 | } |
4675 | 5171 | ||
4676 | for (i = 0; i < blk_per_bucket; i++) | 5172 | /* |
4677 | brelse(xs->bucket.bhs[i]); | 5173 | * ocfs2_add_new_xattr_bucket() will have updated |
4678 | 5174 | * xs->bucket if it moved, but it will not have updated | |
4679 | memset(&xs->bucket, 0, sizeof(xs->bucket)); | 5175 | * any of the other search fields. Thus, we drop it and |
4680 | 5176 | * re-search. Everything should be cached, so it'll be | |
5177 | * quick. | ||
5178 | */ | ||
5179 | ocfs2_xattr_bucket_relse(xs->bucket); | ||
4681 | ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh, | 5180 | ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh, |
4682 | xi->name_index, | 5181 | xi->name_index, |
4683 | xi->name, xs); | 5182 | xi->name, xs); |
@@ -4689,7 +5188,7 @@ try_again: | |||
4689 | } | 5188 | } |
4690 | 5189 | ||
4691 | xattr_set: | 5190 | xattr_set: |
4692 | ret = ocfs2_xattr_set_in_bucket(inode, xi, xs); | 5191 | ret = ocfs2_xattr_set_in_bucket(inode, xi, xs, ctxt); |
4693 | out: | 5192 | out: |
4694 | mlog_exit(ret); | 5193 | mlog_exit(ret); |
4695 | return ret; | 5194 | return ret; |
@@ -4700,24 +5199,41 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode, | |||
4700 | void *para) | 5199 | void *para) |
4701 | { | 5200 | { |
4702 | int ret = 0; | 5201 | int ret = 0; |
4703 | struct ocfs2_xattr_header *xh = bucket->xh; | 5202 | struct ocfs2_xattr_header *xh = bucket_xh(bucket); |
4704 | u16 i; | 5203 | u16 i; |
4705 | struct ocfs2_xattr_entry *xe; | 5204 | struct ocfs2_xattr_entry *xe; |
5205 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
5206 | struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,}; | ||
5207 | int credits = ocfs2_remove_extent_credits(osb->sb) + | ||
5208 | ocfs2_blocks_per_xattr_bucket(inode->i_sb); | ||
5209 | |||
5210 | |||
5211 | ocfs2_init_dealloc_ctxt(&ctxt.dealloc); | ||
4706 | 5212 | ||
4707 | for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { | 5213 | for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { |
4708 | xe = &xh->xh_entries[i]; | 5214 | xe = &xh->xh_entries[i]; |
4709 | if (ocfs2_xattr_is_local(xe)) | 5215 | if (ocfs2_xattr_is_local(xe)) |
4710 | continue; | 5216 | continue; |
4711 | 5217 | ||
4712 | ret = ocfs2_xattr_bucket_value_truncate(inode, | 5218 | ctxt.handle = ocfs2_start_trans(osb, credits); |
4713 | bucket->bhs[0], | 5219 | if (IS_ERR(ctxt.handle)) { |
4714 | i, 0); | 5220 | ret = PTR_ERR(ctxt.handle); |
5221 | mlog_errno(ret); | ||
5222 | break; | ||
5223 | } | ||
5224 | |||
5225 | ret = ocfs2_xattr_bucket_value_truncate(inode, bucket, | ||
5226 | i, 0, &ctxt); | ||
5227 | |||
5228 | ocfs2_commit_trans(osb, ctxt.handle); | ||
4715 | if (ret) { | 5229 | if (ret) { |
4716 | mlog_errno(ret); | 5230 | mlog_errno(ret); |
4717 | break; | 5231 | break; |
4718 | } | 5232 | } |
4719 | } | 5233 | } |
4720 | 5234 | ||
5235 | ocfs2_schedule_truncate_log_flush(osb, 1); | ||
5236 | ocfs2_run_deallocs(osb, &ctxt.dealloc); | ||
4721 | return ret; | 5237 | return ret; |
4722 | } | 5238 | } |
4723 | 5239 | ||
@@ -4768,6 +5284,74 @@ out: | |||
4768 | } | 5284 | } |
4769 | 5285 | ||
4770 | /* | 5286 | /* |
5287 | * 'security' attributes support | ||
5288 | */ | ||
5289 | static size_t ocfs2_xattr_security_list(struct inode *inode, char *list, | ||
5290 | size_t list_size, const char *name, | ||
5291 | size_t name_len) | ||
5292 | { | ||
5293 | const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; | ||
5294 | const size_t total_len = prefix_len + name_len + 1; | ||
5295 | |||
5296 | if (list && total_len <= list_size) { | ||
5297 | memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); | ||
5298 | memcpy(list + prefix_len, name, name_len); | ||
5299 | list[prefix_len + name_len] = '\0'; | ||
5300 | } | ||
5301 | return total_len; | ||
5302 | } | ||
5303 | |||
5304 | static int ocfs2_xattr_security_get(struct inode *inode, const char *name, | ||
5305 | void *buffer, size_t size) | ||
5306 | { | ||
5307 | if (strcmp(name, "") == 0) | ||
5308 | return -EINVAL; | ||
5309 | return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_SECURITY, name, | ||
5310 | buffer, size); | ||
5311 | } | ||
5312 | |||
5313 | static int ocfs2_xattr_security_set(struct inode *inode, const char *name, | ||
5314 | const void *value, size_t size, int flags) | ||
5315 | { | ||
5316 | if (strcmp(name, "") == 0) | ||
5317 | return -EINVAL; | ||
5318 | |||
5319 | return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, name, value, | ||
5320 | size, flags); | ||
5321 | } | ||
5322 | |||
5323 | int ocfs2_init_security_get(struct inode *inode, | ||
5324 | struct inode *dir, | ||
5325 | struct ocfs2_security_xattr_info *si) | ||
5326 | { | ||
5327 | /* check whether ocfs2 supports the xattr feature */ | ||
5328 | if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb))) | ||
5329 | return -EOPNOTSUPP; | ||
5330 | return security_inode_init_security(inode, dir, &si->name, &si->value, | ||
5331 | &si->value_len); | ||
5332 | } | ||
5333 | |||
5334 | int ocfs2_init_security_set(handle_t *handle, | ||
5335 | struct inode *inode, | ||
5336 | struct buffer_head *di_bh, | ||
5337 | struct ocfs2_security_xattr_info *si, | ||
5338 | struct ocfs2_alloc_context *xattr_ac, | ||
5339 | struct ocfs2_alloc_context *data_ac) | ||
5340 | { | ||
5341 | return ocfs2_xattr_set_handle(handle, inode, di_bh, | ||
5342 | OCFS2_XATTR_INDEX_SECURITY, | ||
5343 | si->name, si->value, si->value_len, 0, | ||
5344 | xattr_ac, data_ac); | ||
5345 | } | ||
5346 | |||
5347 | struct xattr_handler ocfs2_xattr_security_handler = { | ||
5348 | .prefix = XATTR_SECURITY_PREFIX, | ||
5349 | .list = ocfs2_xattr_security_list, | ||
5350 | .get = ocfs2_xattr_security_get, | ||
5351 | .set = ocfs2_xattr_security_set, | ||
5352 | }; | ||
5353 | |||
5354 | /* | ||
4771 | * 'trusted' attributes support | 5355 | * 'trusted' attributes support |
4772 | */ | 5356 | */ |
4773 | static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list, | 5357 | static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list, |
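(Editor's note) The new handlers above hook inode creation into the LSM: ocfs2_init_security_get() asks security_inode_init_security() for an initial label, and ocfs2_init_security_set() stores it via ocfs2_xattr_set_handle() inside the caller's transaction. A minimal sketch of how a create path might wire the two together follows; the function name, error handling, and the way handle/di_bh/xattr_ac/data_ac are obtained are illustrative assumptions, not taken from the patch.

	/* Hedged sketch only -- names not shown in the patch are hypothetical. */
	static int example_apply_initial_security(handle_t *handle,
						  struct inode *inode,
						  struct inode *dir,
						  struct buffer_head *di_bh,
						  struct ocfs2_alloc_context *xattr_ac,
						  struct ocfs2_alloc_context *data_ac)
	{
		struct ocfs2_security_xattr_info si = { .enable = 1, };
		int ret;

		ret = ocfs2_init_security_get(inode, dir, &si);
		if (ret == -EOPNOTSUPP)		/* xattr feature disabled or no label */
			return 0;
		if (ret)
			return ret;

		/* Store the LSM-provided name/value in the same transaction. */
		ret = ocfs2_init_security_set(handle, inode, di_bh, &si,
					      xattr_ac, data_ac);
		kfree(si.name);			/* assumption: the LSM allocated both */
		kfree(si.value);
		return ret;
	}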
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 1d8314c7656d..5a1ebc789f7e 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h | |||
@@ -30,13 +30,58 @@ enum ocfs2_xattr_type { | |||
30 | OCFS2_XATTR_MAX | 30 | OCFS2_XATTR_MAX |
31 | }; | 31 | }; |
32 | 32 | ||
33 | struct ocfs2_security_xattr_info { | ||
34 | int enable; | ||
35 | char *name; | ||
36 | void *value; | ||
37 | size_t value_len; | ||
38 | }; | ||
39 | |||
33 | extern struct xattr_handler ocfs2_xattr_user_handler; | 40 | extern struct xattr_handler ocfs2_xattr_user_handler; |
34 | extern struct xattr_handler ocfs2_xattr_trusted_handler; | 41 | extern struct xattr_handler ocfs2_xattr_trusted_handler; |
42 | extern struct xattr_handler ocfs2_xattr_security_handler; | ||
43 | #ifdef CONFIG_OCFS2_FS_POSIX_ACL | ||
44 | extern struct xattr_handler ocfs2_xattr_acl_access_handler; | ||
45 | extern struct xattr_handler ocfs2_xattr_acl_default_handler; | ||
46 | #endif | ||
35 | extern struct xattr_handler *ocfs2_xattr_handlers[]; | 47 | extern struct xattr_handler *ocfs2_xattr_handlers[]; |
36 | 48 | ||
37 | ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); | 49 | ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); |
50 | int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int, | ||
51 | const char *, void *, size_t); | ||
38 | int ocfs2_xattr_set(struct inode *, int, const char *, const void *, | 52 | int ocfs2_xattr_set(struct inode *, int, const char *, const void *, |
39 | size_t, int); | 53 | size_t, int); |
54 | int ocfs2_xattr_set_handle(handle_t *, struct inode *, struct buffer_head *, | ||
55 | int, const char *, const void *, size_t, int, | ||
56 | struct ocfs2_alloc_context *, | ||
57 | struct ocfs2_alloc_context *); | ||
40 | int ocfs2_xattr_remove(struct inode *, struct buffer_head *); | 58 | int ocfs2_xattr_remove(struct inode *, struct buffer_head *); |
59 | int ocfs2_init_security_get(struct inode *, struct inode *, | ||
60 | struct ocfs2_security_xattr_info *); | ||
61 | int ocfs2_init_security_set(handle_t *, struct inode *, | ||
62 | struct buffer_head *, | ||
63 | struct ocfs2_security_xattr_info *, | ||
64 | struct ocfs2_alloc_context *, | ||
65 | struct ocfs2_alloc_context *); | ||
66 | int ocfs2_calc_security_init(struct inode *, | ||
67 | struct ocfs2_security_xattr_info *, | ||
68 | int *, int *, struct ocfs2_alloc_context **); | ||
69 | int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *, | ||
70 | int, struct ocfs2_security_xattr_info *, | ||
71 | int *, int *, struct ocfs2_alloc_context **); | ||
72 | |||
73 | /* | ||
74 | * xattrs can live inside an inode, as part of an external xattr block, | ||
75 | * or inside an xattr bucket, which is the leaf of a tree rooted in an | ||
76 | * xattr block. Some of the xattr calls, especially the value setting | ||
77 | * functions, want to treat each of these locations as equal. Let's wrap | ||
78 | * them in a structure that we can pass around instead of raw buffer_heads. | ||
79 | */ | ||
80 | struct ocfs2_xattr_value_buf { | ||
81 | struct buffer_head *vb_bh; | ||
82 | ocfs2_journal_access_func vb_access; | ||
83 | struct ocfs2_xattr_value_root *vb_xv; | ||
84 | }; | ||
85 | |||
41 | 86 | ||
42 | #endif /* OCFS2_XATTR_H */ | 87 | #endif /* OCFS2_XATTR_H */ |
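(Editor's note) The ocfs2_xattr_value_buf wrapper exists so the value-manipulation helpers can be handed any of the three xattr homes (inode, external block, bucket) through one interface. A hedged sketch of describing a value root that lives in an external xattr block is shown below; "my_journal_access_xb" and "value_root_offset" are placeholders for whatever accessor and offset the real call sites compute, not names from the patch.

	struct ocfs2_xattr_value_buf vb = {
		.vb_bh     = xattr_block_bh,		/* bh that holds the value root */
		.vb_access = my_journal_access_xb,	/* placeholder journal-access callback */
		.vb_xv     = (struct ocfs2_xattr_value_root *)
				(xattr_block_bh->b_data + value_root_offset),
	};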
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 6afe57c84f84..633e9dc972bb 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c | |||
@@ -39,7 +39,6 @@ struct inode *omfs_new_inode(struct inode *dir, int mode) | |||
39 | inode->i_mode = mode; | 39 | inode->i_mode = mode; |
40 | inode->i_uid = current_fsuid(); | 40 | inode->i_uid = current_fsuid(); |
41 | inode->i_gid = current_fsgid(); | 41 | inode->i_gid = current_fsgid(); |
42 | inode->i_blocks = 0; | ||
43 | inode->i_mapping->a_ops = &omfs_aops; | 42 | inode->i_mapping->a_ops = &omfs_aops; |
44 | 43 | ||
45 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 44 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
diff --git a/fs/open.c b/fs/open.c --- a/fs/open.c +++ b/fs/open.c | |||
@@ -412,7 +412,7 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len) | 412 | if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) | 412 | if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) |
412 | if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) | 412 | if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) |
413 | goto out_fput; | 413 | goto out_fput; |
414 | 414 | ||
415 | if (inode->i_op && inode->i_op->fallocate) | 415 | if (inode->i_op->fallocate) |
416 | ret = inode->i_op->fallocate(inode, mode, offset, len); | 416 | ret = inode->i_op->fallocate(inode, mode, offset, len); |
417 | else | 417 | else |
418 | ret = -EOPNOTSUPP; | 418 | ret = -EOPNOTSUPP; |
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index d41bdc784de4..ffcd04f0012c 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c | |||
@@ -256,9 +256,6 @@ found: | |||
256 | break; | 256 | break; |
257 | } | 257 | } |
258 | 258 | ||
259 | inode->i_gid = 0; | ||
260 | inode->i_uid = 0; | ||
261 | |||
262 | d_add(dentry, inode); | 259 | d_add(dentry, inode); |
263 | return NULL; | 260 | return NULL; |
264 | } | 261 | } |
diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 6d5b213b8a9b..5198ada67398 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c | |||
@@ -384,9 +384,9 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, | |||
384 | 384 | ||
385 | dname = dev_name(ddev); | 385 | dname = dev_name(ddev); |
386 | if (isdigit(dname[strlen(dname) - 1])) | 386 | if (isdigit(dname[strlen(dname) - 1])) |
387 | snprintf(pdev->bus_id, BUS_ID_SIZE, "%sp%d", dname, partno); | 387 | dev_set_name(pdev, "%sp%d", dname, partno); |
388 | else | 388 | else |
389 | snprintf(pdev->bus_id, BUS_ID_SIZE, "%s%d", dname, partno); | 389 | dev_set_name(pdev, "%s%d", dname, partno); |
390 | 390 | ||
391 | device_initialize(pdev); | 391 | device_initialize(pdev); |
392 | pdev->class = &block_class; | 392 | pdev->class = &block_class; |
@@ -447,16 +447,11 @@ void register_disk(struct gendisk *disk) | |||
447 | struct block_device *bdev; | 447 | struct block_device *bdev; |
448 | struct disk_part_iter piter; | 448 | struct disk_part_iter piter; |
449 | struct hd_struct *part; | 449 | struct hd_struct *part; |
450 | char *s; | ||
451 | int err; | 450 | int err; |
452 | 451 | ||
453 | ddev->parent = disk->driverfs_dev; | 452 | ddev->parent = disk->driverfs_dev; |
454 | 453 | ||
455 | strlcpy(ddev->bus_id, disk->disk_name, BUS_ID_SIZE); | 454 | dev_set_name(ddev, disk->disk_name); |
456 | /* ewww... some of these buggers have / in the name... */ | ||
457 | s = strchr(ddev->bus_id, '/'); | ||
458 | if (s) | ||
459 | *s = '!'; | ||
460 | 455 | ||
461 | /* delay uevents, until we scanned partition table */ | 456 | /* delay uevents, until we scanned partition table */ |
462 | ddev->uevent_suppress = 1; | 457 | ddev->uevent_suppress = 1; |
diff --git a/fs/proc/base.c b/fs/proc/base.c index cad92c1ac2b3..0c9de19a1633 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c | |||
@@ -65,6 +65,7 @@ | |||
65 | #include <linux/mm.h> | 65 | #include <linux/mm.h> |
66 | #include <linux/rcupdate.h> | 66 | #include <linux/rcupdate.h> |
67 | #include <linux/kallsyms.h> | 67 | #include <linux/kallsyms.h> |
68 | #include <linux/stacktrace.h> | ||
68 | #include <linux/resource.h> | 69 | #include <linux/resource.h> |
69 | #include <linux/module.h> | 70 | #include <linux/module.h> |
70 | #include <linux/mount.h> | 71 | #include <linux/mount.h> |
@@ -109,25 +110,22 @@ struct pid_entry { | |||
109 | .op = OP, \ | 110 | .op = OP, \ |
110 | } | 111 | } |
111 | 112 | ||
112 | #define DIR(NAME, MODE, OTYPE) \ | 113 | #define DIR(NAME, MODE, iops, fops) \ |
113 | NOD(NAME, (S_IFDIR|(MODE)), \ | 114 | NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} ) |
114 | &proc_##OTYPE##_inode_operations, &proc_##OTYPE##_operations, \ | 115 | #define LNK(NAME, get_link) \ |
115 | {} ) | ||
116 | #define LNK(NAME, OTYPE) \ | ||
117 | NOD(NAME, (S_IFLNK|S_IRWXUGO), \ | 116 | NOD(NAME, (S_IFLNK|S_IRWXUGO), \ |
118 | &proc_pid_link_inode_operations, NULL, \ | 117 | &proc_pid_link_inode_operations, NULL, \ |
119 | { .proc_get_link = &proc_##OTYPE##_link } ) | 118 | { .proc_get_link = get_link } ) |
120 | #define REG(NAME, MODE, OTYPE) \ | 119 | #define REG(NAME, MODE, fops) \ |
121 | NOD(NAME, (S_IFREG|(MODE)), NULL, \ | 120 | NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {}) |
122 | &proc_##OTYPE##_operations, {}) | 121 | #define INF(NAME, MODE, read) \ |
123 | #define INF(NAME, MODE, OTYPE) \ | ||
124 | NOD(NAME, (S_IFREG|(MODE)), \ | 122 | NOD(NAME, (S_IFREG|(MODE)), \ |
125 | NULL, &proc_info_file_operations, \ | 123 | NULL, &proc_info_file_operations, \ |
126 | { .proc_read = &proc_##OTYPE } ) | 124 | { .proc_read = read } ) |
127 | #define ONE(NAME, MODE, OTYPE) \ | 125 | #define ONE(NAME, MODE, show) \ |
128 | NOD(NAME, (S_IFREG|(MODE)), \ | 126 | NOD(NAME, (S_IFREG|(MODE)), \ |
129 | NULL, &proc_single_file_operations, \ | 127 | NULL, &proc_single_file_operations, \ |
130 | { .proc_show = &proc_##OTYPE } ) | 128 | { .proc_show = show } ) |
131 | 129 | ||
132 | /* | 130 | /* |
133 | * Count the number of hardlinks for the pid_entry table, excluding the . | 131 | * Count the number of hardlinks for the pid_entry table, excluding the . |
@@ -308,9 +306,9 @@ static int proc_pid_auxv(struct task_struct *task, char *buffer) | |||
308 | struct mm_struct *mm = get_task_mm(task); | 306 | struct mm_struct *mm = get_task_mm(task); |
309 | if (mm) { | 307 | if (mm) { |
310 | unsigned int nwords = 0; | 308 | unsigned int nwords = 0; |
311 | do | 309 | do { |
312 | nwords += 2; | 310 | nwords += 2; |
313 | while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ | 311 | } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ |
314 | res = nwords * sizeof(mm->saved_auxv[0]); | 312 | res = nwords * sizeof(mm->saved_auxv[0]); |
315 | if (res > PAGE_SIZE) | 313 | if (res > PAGE_SIZE) |
316 | res = PAGE_SIZE; | 314 | res = PAGE_SIZE; |
@@ -340,6 +338,37 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer) | |||
340 | } | 338 | } |
341 | #endif /* CONFIG_KALLSYMS */ | 339 | #endif /* CONFIG_KALLSYMS */ |
342 | 340 | ||
341 | #ifdef CONFIG_STACKTRACE | ||
342 | |||
343 | #define MAX_STACK_TRACE_DEPTH 64 | ||
344 | |||
345 | static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, | ||
346 | struct pid *pid, struct task_struct *task) | ||
347 | { | ||
348 | struct stack_trace trace; | ||
349 | unsigned long *entries; | ||
350 | int i; | ||
351 | |||
352 | entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL); | ||
353 | if (!entries) | ||
354 | return -ENOMEM; | ||
355 | |||
356 | trace.nr_entries = 0; | ||
357 | trace.max_entries = MAX_STACK_TRACE_DEPTH; | ||
358 | trace.entries = entries; | ||
359 | trace.skip = 0; | ||
360 | save_stack_trace_tsk(task, &trace); | ||
361 | |||
362 | for (i = 0; i < trace.nr_entries; i++) { | ||
363 | seq_printf(m, "[<%p>] %pS\n", | ||
364 | (void *)entries[i], (void *)entries[i]); | ||
365 | } | ||
366 | kfree(entries); | ||
367 | |||
368 | return 0; | ||
369 | } | ||
370 | #endif | ||
371 | |||
343 | #ifdef CONFIG_SCHEDSTATS | 372 | #ifdef CONFIG_SCHEDSTATS |
344 | /* | 373 | /* |
345 | * Provides /proc/PID/schedstat | 374 | * Provides /proc/PID/schedstat |
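(Editor's note) The new proc_pid_stack() show function backs a /proc/PID/stack file (added to the tgid and tid tables further down): it snapshots the task's kernel stack with save_stack_trace_tsk() and prints one "[<address>] symbol+offset/size" line per frame. A minimal userspace reader, assuming sufficient privileges (the entry is mode 0400):

	#include <stdio.h>

	int main(void)
	{
		char line[256];
		FILE *f = fopen("/proc/1/stack", "r");	/* any PID you are allowed to trace */

		if (!f)
			return 1;
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
		return 0;
	}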
@@ -1186,8 +1215,6 @@ static int sched_show(struct seq_file *m, void *v) | |||
1186 | struct inode *inode = m->private; | 1215 | struct inode *inode = m->private; |
1187 | struct task_struct *p; | 1216 | struct task_struct *p; |
1188 | 1217 | ||
1189 | WARN_ON(!inode); | ||
1190 | |||
1191 | p = get_proc_task(inode); | 1218 | p = get_proc_task(inode); |
1192 | if (!p) | 1219 | if (!p) |
1193 | return -ESRCH; | 1220 | return -ESRCH; |
@@ -1205,8 +1232,6 @@ sched_write(struct file *file, const char __user *buf, | |||
1205 | struct inode *inode = file->f_path.dentry->d_inode; | 1232 | struct inode *inode = file->f_path.dentry->d_inode; |
1206 | struct task_struct *p; | 1233 | struct task_struct *p; |
1207 | 1234 | ||
1208 | WARN_ON(!inode); | ||
1209 | |||
1210 | p = get_proc_task(inode); | 1235 | p = get_proc_task(inode); |
1211 | if (!p) | 1236 | if (!p) |
1212 | return -ESRCH; | 1237 | return -ESRCH; |
@@ -1426,8 +1451,6 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st | |||
1426 | if (!ei->pid) | 1451 | if (!ei->pid) |
1427 | goto out_unlock; | 1452 | goto out_unlock; |
1428 | 1453 | ||
1429 | inode->i_uid = 0; | ||
1430 | inode->i_gid = 0; | ||
1431 | if (task_dumpable(task)) { | 1454 | if (task_dumpable(task)) { |
1432 | rcu_read_lock(); | 1455 | rcu_read_lock(); |
1433 | cred = __task_cred(task); | 1456 | cred = __task_cred(task); |
@@ -1976,13 +1999,11 @@ static struct dentry *proc_pident_lookup(struct inode *dir, | |||
1976 | const struct pid_entry *ents, | 1999 | const struct pid_entry *ents, |
1977 | unsigned int nents) | 2000 | unsigned int nents) |
1978 | { | 2001 | { |
1979 | struct inode *inode; | ||
1980 | struct dentry *error; | 2002 | struct dentry *error; |
1981 | struct task_struct *task = get_proc_task(dir); | 2003 | struct task_struct *task = get_proc_task(dir); |
1982 | const struct pid_entry *p, *last; | 2004 | const struct pid_entry *p, *last; |
1983 | 2005 | ||
1984 | error = ERR_PTR(-ENOENT); | 2006 | error = ERR_PTR(-ENOENT); |
1985 | inode = NULL; | ||
1986 | 2007 | ||
1987 | if (!task) | 2008 | if (!task) |
1988 | goto out_no_task; | 2009 | goto out_no_task; |
@@ -2138,12 +2159,12 @@ static const struct file_operations proc_pid_attr_operations = { | |||
2138 | }; | 2159 | }; |
2139 | 2160 | ||
2140 | static const struct pid_entry attr_dir_stuff[] = { | 2161 | static const struct pid_entry attr_dir_stuff[] = { |
2141 | REG("current", S_IRUGO|S_IWUGO, pid_attr), | 2162 | REG("current", S_IRUGO|S_IWUGO, proc_pid_attr_operations), |
2142 | REG("prev", S_IRUGO, pid_attr), | 2163 | REG("prev", S_IRUGO, proc_pid_attr_operations), |
2143 | REG("exec", S_IRUGO|S_IWUGO, pid_attr), | 2164 | REG("exec", S_IRUGO|S_IWUGO, proc_pid_attr_operations), |
2144 | REG("fscreate", S_IRUGO|S_IWUGO, pid_attr), | 2165 | REG("fscreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), |
2145 | REG("keycreate", S_IRUGO|S_IWUGO, pid_attr), | 2166 | REG("keycreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), |
2146 | REG("sockcreate", S_IRUGO|S_IWUGO, pid_attr), | 2167 | REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), |
2147 | }; | 2168 | }; |
2148 | 2169 | ||
2149 | static int proc_attr_dir_readdir(struct file * filp, | 2170 | static int proc_attr_dir_readdir(struct file * filp, |
@@ -2349,8 +2370,6 @@ static struct dentry *proc_base_instantiate(struct inode *dir, | |||
2349 | if (!ei->pid) | 2370 | if (!ei->pid) |
2350 | goto out_iput; | 2371 | goto out_iput; |
2351 | 2372 | ||
2352 | inode->i_uid = 0; | ||
2353 | inode->i_gid = 0; | ||
2354 | inode->i_mode = p->mode; | 2373 | inode->i_mode = p->mode; |
2355 | if (S_ISDIR(inode->i_mode)) | 2374 | if (S_ISDIR(inode->i_mode)) |
2356 | inode->i_nlink = 2; | 2375 | inode->i_nlink = 2; |
@@ -2465,74 +2484,77 @@ static const struct file_operations proc_task_operations; | |||
2465 | static const struct inode_operations proc_task_inode_operations; | 2484 | static const struct inode_operations proc_task_inode_operations; |
2466 | 2485 | ||
2467 | static const struct pid_entry tgid_base_stuff[] = { | 2486 | static const struct pid_entry tgid_base_stuff[] = { |
2468 | DIR("task", S_IRUGO|S_IXUGO, task), | 2487 | DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), |
2469 | DIR("fd", S_IRUSR|S_IXUSR, fd), | 2488 | DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), |
2470 | DIR("fdinfo", S_IRUSR|S_IXUSR, fdinfo), | 2489 | DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), |
2471 | #ifdef CONFIG_NET | 2490 | #ifdef CONFIG_NET |
2472 | DIR("net", S_IRUGO|S_IXUGO, net), | 2491 | DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), |
2473 | #endif | 2492 | #endif |
2474 | REG("environ", S_IRUSR, environ), | 2493 | REG("environ", S_IRUSR, proc_environ_operations), |
2475 | INF("auxv", S_IRUSR, pid_auxv), | 2494 | INF("auxv", S_IRUSR, proc_pid_auxv), |
2476 | ONE("status", S_IRUGO, pid_status), | 2495 | ONE("status", S_IRUGO, proc_pid_status), |
2477 | ONE("personality", S_IRUSR, pid_personality), | 2496 | ONE("personality", S_IRUSR, proc_pid_personality), |
2478 | INF("limits", S_IRUSR, pid_limits), | 2497 | INF("limits", S_IRUSR, proc_pid_limits), |
2479 | #ifdef CONFIG_SCHED_DEBUG | 2498 | #ifdef CONFIG_SCHED_DEBUG |
2480 | REG("sched", S_IRUGO|S_IWUSR, pid_sched), | 2499 | REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), |
2481 | #endif | 2500 | #endif |
2482 | #ifdef CONFIG_HAVE_ARCH_TRACEHOOK | 2501 | #ifdef CONFIG_HAVE_ARCH_TRACEHOOK |
2483 | INF("syscall", S_IRUSR, pid_syscall), | 2502 | INF("syscall", S_IRUSR, proc_pid_syscall), |
2484 | #endif | 2503 | #endif |
2485 | INF("cmdline", S_IRUGO, pid_cmdline), | 2504 | INF("cmdline", S_IRUGO, proc_pid_cmdline), |
2486 | ONE("stat", S_IRUGO, tgid_stat), | 2505 | ONE("stat", S_IRUGO, proc_tgid_stat), |
2487 | ONE("statm", S_IRUGO, pid_statm), | 2506 | ONE("statm", S_IRUGO, proc_pid_statm), |
2488 | REG("maps", S_IRUGO, maps), | 2507 | REG("maps", S_IRUGO, proc_maps_operations), |
2489 | #ifdef CONFIG_NUMA | 2508 | #ifdef CONFIG_NUMA |
2490 | REG("numa_maps", S_IRUGO, numa_maps), | 2509 | REG("numa_maps", S_IRUGO, proc_numa_maps_operations), |
2491 | #endif | 2510 | #endif |
2492 | REG("mem", S_IRUSR|S_IWUSR, mem), | 2511 | REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), |
2493 | LNK("cwd", cwd), | 2512 | LNK("cwd", proc_cwd_link), |
2494 | LNK("root", root), | 2513 | LNK("root", proc_root_link), |
2495 | LNK("exe", exe), | 2514 | LNK("exe", proc_exe_link), |
2496 | REG("mounts", S_IRUGO, mounts), | 2515 | REG("mounts", S_IRUGO, proc_mounts_operations), |
2497 | REG("mountinfo", S_IRUGO, mountinfo), | 2516 | REG("mountinfo", S_IRUGO, proc_mountinfo_operations), |
2498 | REG("mountstats", S_IRUSR, mountstats), | 2517 | REG("mountstats", S_IRUSR, proc_mountstats_operations), |
2499 | #ifdef CONFIG_PROC_PAGE_MONITOR | 2518 | #ifdef CONFIG_PROC_PAGE_MONITOR |
2500 | REG("clear_refs", S_IWUSR, clear_refs), | 2519 | REG("clear_refs", S_IWUSR, proc_clear_refs_operations), |
2501 | REG("smaps", S_IRUGO, smaps), | 2520 | REG("smaps", S_IRUGO, proc_smaps_operations), |
2502 | REG("pagemap", S_IRUSR, pagemap), | 2521 | REG("pagemap", S_IRUSR, proc_pagemap_operations), |
2503 | #endif | 2522 | #endif |
2504 | #ifdef CONFIG_SECURITY | 2523 | #ifdef CONFIG_SECURITY |
2505 | DIR("attr", S_IRUGO|S_IXUGO, attr_dir), | 2524 | DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), |
2506 | #endif | 2525 | #endif |
2507 | #ifdef CONFIG_KALLSYMS | 2526 | #ifdef CONFIG_KALLSYMS |
2508 | INF("wchan", S_IRUGO, pid_wchan), | 2527 | INF("wchan", S_IRUGO, proc_pid_wchan), |
2528 | #endif | ||
2529 | #ifdef CONFIG_STACKTRACE | ||
2530 | ONE("stack", S_IRUSR, proc_pid_stack), | ||
2509 | #endif | 2531 | #endif |
2510 | #ifdef CONFIG_SCHEDSTATS | 2532 | #ifdef CONFIG_SCHEDSTATS |
2511 | INF("schedstat", S_IRUGO, pid_schedstat), | 2533 | INF("schedstat", S_IRUGO, proc_pid_schedstat), |
2512 | #endif | 2534 | #endif |
2513 | #ifdef CONFIG_LATENCYTOP | 2535 | #ifdef CONFIG_LATENCYTOP |
2514 | REG("latency", S_IRUGO, lstats), | 2536 | REG("latency", S_IRUGO, proc_lstats_operations), |
2515 | #endif | 2537 | #endif |
2516 | #ifdef CONFIG_PROC_PID_CPUSET | 2538 | #ifdef CONFIG_PROC_PID_CPUSET |
2517 | REG("cpuset", S_IRUGO, cpuset), | 2539 | REG("cpuset", S_IRUGO, proc_cpuset_operations), |
2518 | #endif | 2540 | #endif |
2519 | #ifdef CONFIG_CGROUPS | 2541 | #ifdef CONFIG_CGROUPS |
2520 | REG("cgroup", S_IRUGO, cgroup), | 2542 | REG("cgroup", S_IRUGO, proc_cgroup_operations), |
2521 | #endif | 2543 | #endif |
2522 | INF("oom_score", S_IRUGO, oom_score), | 2544 | INF("oom_score", S_IRUGO, proc_oom_score), |
2523 | REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust), | 2545 | REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), |
2524 | #ifdef CONFIG_AUDITSYSCALL | 2546 | #ifdef CONFIG_AUDITSYSCALL |
2525 | REG("loginuid", S_IWUSR|S_IRUGO, loginuid), | 2547 | REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), |
2526 | REG("sessionid", S_IRUGO, sessionid), | 2548 | REG("sessionid", S_IRUGO, proc_sessionid_operations), |
2527 | #endif | 2549 | #endif |
2528 | #ifdef CONFIG_FAULT_INJECTION | 2550 | #ifdef CONFIG_FAULT_INJECTION |
2529 | REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject), | 2551 | REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), |
2530 | #endif | 2552 | #endif |
2531 | #if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) | 2553 | #if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) |
2532 | REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter), | 2554 | REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations), |
2533 | #endif | 2555 | #endif |
2534 | #ifdef CONFIG_TASK_IO_ACCOUNTING | 2556 | #ifdef CONFIG_TASK_IO_ACCOUNTING |
2535 | INF("io", S_IRUGO, tgid_io_accounting), | 2557 | INF("io", S_IRUGO, proc_tgid_io_accounting), |
2536 | #endif | 2558 | #endif |
2537 | }; | 2559 | }; |
2538 | 2560 | ||
@@ -2805,66 +2827,69 @@ out_no_task: | |||
2805 | * Tasks | 2827 | * Tasks |
2806 | */ | 2828 | */ |
2807 | static const struct pid_entry tid_base_stuff[] = { | 2829 | static const struct pid_entry tid_base_stuff[] = { |
2808 | DIR("fd", S_IRUSR|S_IXUSR, fd), | 2830 | DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), |
2809 | DIR("fdinfo", S_IRUSR|S_IXUSR, fdinfo), | 2831 | DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fd_operations), |
2810 | REG("environ", S_IRUSR, environ), | 2832 | REG("environ", S_IRUSR, proc_environ_operations), |
2811 | INF("auxv", S_IRUSR, pid_auxv), | 2833 | INF("auxv", S_IRUSR, proc_pid_auxv), |
2812 | ONE("status", S_IRUGO, pid_status), | 2834 | ONE("status", S_IRUGO, proc_pid_status), |
2813 | ONE("personality", S_IRUSR, pid_personality), | 2835 | ONE("personality", S_IRUSR, proc_pid_personality), |
2814 | INF("limits", S_IRUSR, pid_limits), | 2836 | INF("limits", S_IRUSR, proc_pid_limits), |
2815 | #ifdef CONFIG_SCHED_DEBUG | 2837 | #ifdef CONFIG_SCHED_DEBUG |
2816 | REG("sched", S_IRUGO|S_IWUSR, pid_sched), | 2838 | REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), |
2817 | #endif | 2839 | #endif |
2818 | #ifdef CONFIG_HAVE_ARCH_TRACEHOOK | 2840 | #ifdef CONFIG_HAVE_ARCH_TRACEHOOK |
2819 | INF("syscall", S_IRUSR, pid_syscall), | 2841 | INF("syscall", S_IRUSR, proc_pid_syscall), |
2820 | #endif | 2842 | #endif |
2821 | INF("cmdline", S_IRUGO, pid_cmdline), | 2843 | INF("cmdline", S_IRUGO, proc_pid_cmdline), |
2822 | ONE("stat", S_IRUGO, tid_stat), | 2844 | ONE("stat", S_IRUGO, proc_tid_stat), |
2823 | ONE("statm", S_IRUGO, pid_statm), | 2845 | ONE("statm", S_IRUGO, proc_pid_statm), |
2824 | REG("maps", S_IRUGO, maps), | 2846 | REG("maps", S_IRUGO, proc_maps_operations), |
2825 | #ifdef CONFIG_NUMA | 2847 | #ifdef CONFIG_NUMA |
2826 | REG("numa_maps", S_IRUGO, numa_maps), | 2848 | REG("numa_maps", S_IRUGO, proc_numa_maps_operations), |
2827 | #endif | 2849 | #endif |
2828 | REG("mem", S_IRUSR|S_IWUSR, mem), | 2850 | REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), |
2829 | LNK("cwd", cwd), | 2851 | LNK("cwd", proc_cwd_link), |
2830 | LNK("root", root), | 2852 | LNK("root", proc_root_link), |
2831 | LNK("exe", exe), | 2853 | LNK("exe", proc_exe_link), |
2832 | REG("mounts", S_IRUGO, mounts), | 2854 | REG("mounts", S_IRUGO, proc_mounts_operations), |
2833 | REG("mountinfo", S_IRUGO, mountinfo), | 2855 | REG("mountinfo", S_IRUGO, proc_mountinfo_operations), |
2834 | #ifdef CONFIG_PROC_PAGE_MONITOR | 2856 | #ifdef CONFIG_PROC_PAGE_MONITOR |
2835 | REG("clear_refs", S_IWUSR, clear_refs), | 2857 | REG("clear_refs", S_IWUSR, proc_clear_refs_operations), |
2836 | REG("smaps", S_IRUGO, smaps), | 2858 | REG("smaps", S_IRUGO, proc_smaps_operations), |
2837 | REG("pagemap", S_IRUSR, pagemap), | 2859 | REG("pagemap", S_IRUSR, proc_pagemap_operations), |
2838 | #endif | 2860 | #endif |
2839 | #ifdef CONFIG_SECURITY | 2861 | #ifdef CONFIG_SECURITY |
2840 | DIR("attr", S_IRUGO|S_IXUGO, attr_dir), | 2862 | DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), |
2841 | #endif | 2863 | #endif |
2842 | #ifdef CONFIG_KALLSYMS | 2864 | #ifdef CONFIG_KALLSYMS |
2843 | INF("wchan", S_IRUGO, pid_wchan), | 2865 | INF("wchan", S_IRUGO, proc_pid_wchan), |
2866 | #endif | ||
2867 | #ifdef CONFIG_STACKTRACE | ||
2868 | ONE("stack", S_IRUSR, proc_pid_stack), | ||
2844 | #endif | 2869 | #endif |
2845 | #ifdef CONFIG_SCHEDSTATS | 2870 | #ifdef CONFIG_SCHEDSTATS |
2846 | INF("schedstat", S_IRUGO, pid_schedstat), | 2871 | INF("schedstat", S_IRUGO, proc_pid_schedstat), |
2847 | #endif | 2872 | #endif |
2848 | #ifdef CONFIG_LATENCYTOP | 2873 | #ifdef CONFIG_LATENCYTOP |
2849 | REG("latency", S_IRUGO, lstats), | 2874 | REG("latency", S_IRUGO, proc_lstats_operations), |
2850 | #endif | 2875 | #endif |
2851 | #ifdef CONFIG_PROC_PID_CPUSET | 2876 | #ifdef CONFIG_PROC_PID_CPUSET |
2852 | REG("cpuset", S_IRUGO, cpuset), | 2877 | REG("cpuset", S_IRUGO, proc_cpuset_operations), |
2853 | #endif | 2878 | #endif |
2854 | #ifdef CONFIG_CGROUPS | 2879 | #ifdef CONFIG_CGROUPS |
2855 | REG("cgroup", S_IRUGO, cgroup), | 2880 | REG("cgroup", S_IRUGO, proc_cgroup_operations), |
2856 | #endif | 2881 | #endif |
2857 | INF("oom_score", S_IRUGO, oom_score), | 2882 | INF("oom_score", S_IRUGO, proc_oom_score), |
2858 | REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust), | 2883 | REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), |
2859 | #ifdef CONFIG_AUDITSYSCALL | 2884 | #ifdef CONFIG_AUDITSYSCALL |
2860 | REG("loginuid", S_IWUSR|S_IRUGO, loginuid), | 2885 | REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), |
2861 | REG("sessionid", S_IRUSR, sessionid), | 2886 | REG("sessionid", S_IRUSR, proc_sessionid_operations), |
2862 | #endif | 2887 | #endif |
2863 | #ifdef CONFIG_FAULT_INJECTION | 2888 | #ifdef CONFIG_FAULT_INJECTION |
2864 | REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject), | 2889 | REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), |
2865 | #endif | 2890 | #endif |
2866 | #ifdef CONFIG_TASK_IO_ACCOUNTING | 2891 | #ifdef CONFIG_TASK_IO_ACCOUNTING |
2867 | INF("io", S_IRUGO, tid_io_accounting), | 2892 | INF("io", S_IRUGO, proc_tid_io_accounting), |
2868 | #endif | 2893 | #endif |
2869 | }; | 2894 | }; |
2870 | 2895 | ||
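(Editor's note) With the reworked macros, a table entry now names its file_operations (or read/show callback) directly instead of relying on proc_##OTYPE token pasting, which keeps the symbols greppable. Adding a hypothetical entry would now look like this ("foo", "bar", proc_foo_operations and proc_pid_bar are invented names, purely for illustration):

	REG("foo", S_IRUGO, proc_foo_operations),	/* full struct file_operations */
	ONE("bar", S_IRUGO, proc_pid_bar),		/* single seq_file show function */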
diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 60a359b35582..db7fa5cab988 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c | |||
@@ -14,7 +14,6 @@ | |||
14 | #include <linux/stat.h> | 14 | #include <linux/stat.h> |
15 | #include <linux/module.h> | 15 | #include <linux/module.h> |
16 | #include <linux/mount.h> | 16 | #include <linux/mount.h> |
17 | #include <linux/smp_lock.h> | ||
18 | #include <linux/init.h> | 17 | #include <linux/init.h> |
19 | #include <linux/idr.h> | 18 | #include <linux/idr.h> |
20 | #include <linux/namei.h> | 19 | #include <linux/namei.h> |
@@ -379,7 +378,6 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, | |||
379 | struct inode *inode = NULL; | 378 | struct inode *inode = NULL; |
380 | int error = -ENOENT; | 379 | int error = -ENOENT; |
381 | 380 | ||
382 | lock_kernel(); | ||
383 | spin_lock(&proc_subdir_lock); | 381 | spin_lock(&proc_subdir_lock); |
384 | for (de = de->subdir; de ; de = de->next) { | 382 | for (de = de->subdir; de ; de = de->next) { |
385 | if (de->namelen != dentry->d_name.len) | 383 | if (de->namelen != dentry->d_name.len) |
@@ -397,7 +395,6 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, | |||
397 | } | 395 | } |
398 | spin_unlock(&proc_subdir_lock); | 396 | spin_unlock(&proc_subdir_lock); |
399 | out_unlock: | 397 | out_unlock: |
400 | unlock_kernel(); | ||
401 | 398 | ||
402 | if (inode) { | 399 | if (inode) { |
403 | dentry->d_op = &proc_dentry_operations; | 400 | dentry->d_op = &proc_dentry_operations; |
@@ -432,8 +429,6 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, | |||
432 | struct inode *inode = filp->f_path.dentry->d_inode; | 429 | struct inode *inode = filp->f_path.dentry->d_inode; |
433 | int ret = 0; | 430 | int ret = 0; |
434 | 431 | ||
435 | lock_kernel(); | ||
436 | |||
437 | ino = inode->i_ino; | 432 | ino = inode->i_ino; |
438 | i = filp->f_pos; | 433 | i = filp->f_pos; |
439 | switch (i) { | 434 | switch (i) { |
@@ -487,7 +482,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, | |||
487 | spin_unlock(&proc_subdir_lock); | 482 | spin_unlock(&proc_subdir_lock); |
488 | } | 483 | } |
489 | ret = 1; | 484 | ret = 1; |
490 | out: unlock_kernel(); | 485 | out: |
491 | return ret; | 486 | return ret; |
492 | } | 487 | } |
493 | 488 | ||
@@ -504,6 +499,7 @@ int proc_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
504 | * the /proc directory. | 499 | * the /proc directory. |
505 | */ | 500 | */ |
506 | static const struct file_operations proc_dir_operations = { | 501 | static const struct file_operations proc_dir_operations = { |
502 | .llseek = generic_file_llseek, | ||
507 | .read = generic_read_dir, | 503 | .read = generic_read_dir, |
508 | .readdir = proc_readdir, | 504 | .readdir = proc_readdir, |
509 | }; | 505 | }; |
diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 2543fd00c658..3e76bb9b3ad6 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c | |||
@@ -35,16 +35,13 @@ struct proc_dir_entry *de_get(struct proc_dir_entry *de) | |||
35 | */ | 35 | */ |
36 | void de_put(struct proc_dir_entry *de) | 36 | void de_put(struct proc_dir_entry *de) |
37 | { | 37 | { |
38 | lock_kernel(); | ||
39 | if (!atomic_read(&de->count)) { | 38 | if (!atomic_read(&de->count)) { |
40 | printk("de_put: entry %s already free!\n", de->name); | 39 | printk("de_put: entry %s already free!\n", de->name); |
41 | unlock_kernel(); | ||
42 | return; | 40 | return; |
43 | } | 41 | } |
44 | 42 | ||
45 | if (atomic_dec_and_test(&de->count)) | 43 | if (atomic_dec_and_test(&de->count)) |
46 | free_proc_entry(de); | 44 | free_proc_entry(de); |
47 | unlock_kernel(); | ||
48 | } | 45 | } |
49 | 46 | ||
50 | /* | 47 | /* |
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 7bc296f424ae..04d1270f1c38 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c | |||
@@ -18,7 +18,6 @@ | |||
18 | #include <linux/sched.h> | 18 | #include <linux/sched.h> |
19 | #include <linux/module.h> | 19 | #include <linux/module.h> |
20 | #include <linux/bitops.h> | 20 | #include <linux/bitops.h> |
21 | #include <linux/smp_lock.h> | ||
22 | #include <linux/mount.h> | 21 | #include <linux/mount.h> |
23 | #include <linux/nsproxy.h> | 22 | #include <linux/nsproxy.h> |
24 | #include <net/net_namespace.h> | 23 | #include <net/net_namespace.h> |
@@ -172,6 +171,7 @@ static int proc_tgid_net_readdir(struct file *filp, void *dirent, | |||
172 | } | 171 | } |
173 | 172 | ||
174 | const struct file_operations proc_net_operations = { | 173 | const struct file_operations proc_net_operations = { |
174 | .llseek = generic_file_llseek, | ||
175 | .read = generic_read_dir, | 175 | .read = generic_read_dir, |
176 | .readdir = proc_tgid_net_readdir, | 176 | .readdir = proc_tgid_net_readdir, |
177 | }; | 177 | }; |
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 06ed10b7da9e..94fcfff6863a 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c | |||
@@ -31,7 +31,6 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, | |||
31 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; | 31 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; |
32 | inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */ | 32 | inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */ |
33 | inode->i_mode = table->mode; | 33 | inode->i_mode = table->mode; |
34 | inode->i_uid = inode->i_gid = 0; | ||
35 | if (!table->child) { | 34 | if (!table->child) { |
36 | inode->i_mode |= S_IFREG; | 35 | inode->i_mode |= S_IFREG; |
37 | inode->i_op = &proc_sys_inode_operations; | 36 | inode->i_op = &proc_sys_inode_operations; |
diff --git a/fs/proc/root.c b/fs/proc/root.c index 7761602af9de..f6299a25594e 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c | |||
@@ -16,7 +16,6 @@ | |||
16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
17 | #include <linux/module.h> | 17 | #include <linux/module.h> |
18 | #include <linux/bitops.h> | 18 | #include <linux/bitops.h> |
19 | #include <linux/smp_lock.h> | ||
20 | #include <linux/mount.h> | 19 | #include <linux/mount.h> |
21 | #include <linux/pid_namespace.h> | 20 | #include <linux/pid_namespace.h> |
22 | 21 | ||
@@ -162,17 +161,12 @@ static int proc_root_readdir(struct file * filp, | |||
162 | unsigned int nr = filp->f_pos; | 161 | unsigned int nr = filp->f_pos; |
163 | int ret; | 162 | int ret; |
164 | 163 | ||
165 | lock_kernel(); | ||
166 | |||
167 | if (nr < FIRST_PROCESS_ENTRY) { | 164 | if (nr < FIRST_PROCESS_ENTRY) { |
168 | int error = proc_readdir(filp, dirent, filldir); | 165 | int error = proc_readdir(filp, dirent, filldir); |
169 | if (error <= 0) { | 166 | if (error <= 0) |
170 | unlock_kernel(); | ||
171 | return error; | 167 | return error; |
172 | } | ||
173 | filp->f_pos = FIRST_PROCESS_ENTRY; | 168 | filp->f_pos = FIRST_PROCESS_ENTRY; |
174 | } | 169 | } |
175 | unlock_kernel(); | ||
176 | 170 | ||
177 | ret = proc_pid_readdir(filp, dirent, filldir); | 171 | ret = proc_pid_readdir(filp, dirent, filldir); |
178 | return ret; | 172 | return ret; |
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3a8bdd7f5756..94063840832a 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c | |||
@@ -396,7 +396,9 @@ static int show_smap(struct seq_file *m, void *v) | |||
396 | "Private_Clean: %8lu kB\n" | 396 | "Private_Clean: %8lu kB\n" |
397 | "Private_Dirty: %8lu kB\n" | 397 | "Private_Dirty: %8lu kB\n" |
398 | "Referenced: %8lu kB\n" | 398 | "Referenced: %8lu kB\n" |
399 | "Swap: %8lu kB\n", | 399 | "Swap: %8lu kB\n" |
400 | "KernelPageSize: %8lu kB\n" | ||
401 | "MMUPageSize: %8lu kB\n", | ||
400 | (vma->vm_end - vma->vm_start) >> 10, | 402 | (vma->vm_end - vma->vm_start) >> 10, |
401 | mss.resident >> 10, | 403 | mss.resident >> 10, |
402 | (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), | 404 | (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), |
@@ -405,7 +407,9 @@ static int show_smap(struct seq_file *m, void *v) | |||
405 | mss.private_clean >> 10, | 407 | mss.private_clean >> 10, |
406 | mss.private_dirty >> 10, | 408 | mss.private_dirty >> 10, |
407 | mss.referenced >> 10, | 409 | mss.referenced >> 10, |
408 | mss.swap >> 10); | 410 | mss.swap >> 10, |
411 | vma_kernel_pagesize(vma) >> 10, | ||
412 | vma_mmu_pagesize(vma) >> 10); | ||
409 | 413 | ||
410 | if (m->count < m->size) /* vma is copied successfully */ | 414 | if (m->count < m->size) /* vma is copied successfully */ |
411 | m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; | 415 | m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; |
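(Editor's note) Each /proc/PID/smaps record now ends with two extra lines reporting the page size the kernel uses for the mapping (vma_kernel_pagesize()) and the page size the MMU actually uses (vma_mmu_pagesize()); for ordinary mappings both match the base page size, while hugetlb areas report the huge page size. An illustrative tail of one record, with made-up values:

	Referenced:          128 kB
	Swap:                  0 kB
	KernelPageSize:        4 kB
	MMUPageSize:           4 kB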
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 219bd79ea894..d4a8be32b902 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c | |||
@@ -9,7 +9,7 @@ | |||
9 | 9 | ||
10 | /* | 10 | /* |
11 | * Logic: we've got two memory sums for each process, "shared", and | 11 | * Logic: we've got two memory sums for each process, "shared", and |
12 | * "non-shared". Shared memory may get counted more then once, for | 12 | * "non-shared". Shared memory may get counted more than once, for |
13 | * each process that owns it. Non-shared memory is counted | 13 | * each process that owns it. Non-shared memory is counted |
14 | * accurately. | 14 | * accurately. |
15 | */ | 15 | */ |
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 03ec59504906..5edcc3f92ba7 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c | |||
@@ -47,8 +47,6 @@ static ssize_t read_from_oldmem(char *buf, size_t count, | |||
47 | 47 | ||
48 | offset = (unsigned long)(*ppos % PAGE_SIZE); | 48 | offset = (unsigned long)(*ppos % PAGE_SIZE); |
49 | pfn = (unsigned long)(*ppos / PAGE_SIZE); | 49 | pfn = (unsigned long)(*ppos / PAGE_SIZE); |
50 | if (pfn > saved_max_pfn) | ||
51 | return -EINVAL; | ||
52 | 50 | ||
53 | do { | 51 | do { |
54 | if (count > (PAGE_SIZE - offset)) | 52 | if (count > (PAGE_SIZE - offset)) |
diff --git a/fs/quota.c b/fs/quota.c index b7fe44e01618..4a8c94f05f76 100644 --- a/fs/quota.c +++ b/fs/quota.c | |||
@@ -73,7 +73,7 @@ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid | |||
73 | case Q_SETQUOTA: | 73 | case Q_SETQUOTA: |
74 | case Q_GETQUOTA: | 74 | case Q_GETQUOTA: |
75 | /* This is just informative test so we are satisfied without a lock */ | 75 | /* This is just informative test so we are satisfied without a lock */ |
76 | if (!sb_has_quota_enabled(sb, type)) | 76 | if (!sb_has_quota_active(sb, type)) |
77 | return -ESRCH; | 77 | return -ESRCH; |
78 | } | 78 | } |
79 | 79 | ||
@@ -160,6 +160,9 @@ static void quota_sync_sb(struct super_block *sb, int type) | |||
160 | int cnt; | 160 | int cnt; |
161 | 161 | ||
162 | sb->s_qcop->quota_sync(sb, type); | 162 | sb->s_qcop->quota_sync(sb, type); |
163 | |||
164 | if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE) | ||
165 | return; | ||
163 | /* This is not very clever (and fast) but currently I don't know about | 166 | /* This is not very clever (and fast) but currently I don't know about |
164 | * any other simple way of getting quota data to disk and we must get | 167 | * any other simple way of getting quota data to disk and we must get |
165 | * them there for userspace to be visible... */ | 168 | * them there for userspace to be visible... */ |
@@ -175,7 +178,7 @@ static void quota_sync_sb(struct super_block *sb, int type) | |||
175 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) { | 178 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) { |
176 | if (type != -1 && cnt != type) | 179 | if (type != -1 && cnt != type) |
177 | continue; | 180 | continue; |
178 | if (!sb_has_quota_enabled(sb, cnt)) | 181 | if (!sb_has_quota_active(sb, cnt)) |
179 | continue; | 182 | continue; |
180 | mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, I_MUTEX_QUOTA); | 183 | mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, I_MUTEX_QUOTA); |
181 | truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0); | 184 | truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0); |
@@ -201,7 +204,7 @@ restart: | |||
201 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) { | 204 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) { |
202 | if (type != -1 && type != cnt) | 205 | if (type != -1 && type != cnt) |
203 | continue; | 206 | continue; |
204 | if (!sb_has_quota_enabled(sb, cnt)) | 207 | if (!sb_has_quota_active(sb, cnt)) |
205 | continue; | 208 | continue; |
206 | if (!info_dirty(&sb_dqopt(sb)->info[cnt]) && | 209 | if (!info_dirty(&sb_dqopt(sb)->info[cnt]) && |
207 | list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list)) | 210 | list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list)) |
@@ -245,7 +248,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void | |||
245 | __u32 fmt; | 248 | __u32 fmt; |
246 | 249 | ||
247 | down_read(&sb_dqopt(sb)->dqptr_sem); | 250 | down_read(&sb_dqopt(sb)->dqptr_sem); |
248 | if (!sb_has_quota_enabled(sb, type)) { | 251 | if (!sb_has_quota_active(sb, type)) { |
249 | up_read(&sb_dqopt(sb)->dqptr_sem); | 252 | up_read(&sb_dqopt(sb)->dqptr_sem); |
250 | return -ESRCH; | 253 | return -ESRCH; |
251 | } | 254 | } |
diff --git a/fs/quota_tree.c b/fs/quota_tree.c new file mode 100644 index 000000000000..953404c95b17 --- /dev/null +++ b/fs/quota_tree.c | |||
@@ -0,0 +1,645 @@ | |||
1 | /* | ||
2 | * vfsv0 quota IO operations on file | ||
3 | */ | ||
4 | |||
5 | #include <linux/errno.h> | ||
6 | #include <linux/fs.h> | ||
7 | #include <linux/mount.h> | ||
8 | #include <linux/dqblk_v2.h> | ||
9 | #include <linux/kernel.h> | ||
10 | #include <linux/init.h> | ||
11 | #include <linux/module.h> | ||
12 | #include <linux/slab.h> | ||
13 | #include <linux/quotaops.h> | ||
14 | |||
15 | #include <asm/byteorder.h> | ||
16 | |||
17 | #include "quota_tree.h" | ||
18 | |||
19 | MODULE_AUTHOR("Jan Kara"); | ||
20 | MODULE_DESCRIPTION("Quota trie support"); | ||
21 | MODULE_LICENSE("GPL"); | ||
22 | |||
23 | #define __QUOTA_QT_PARANOIA | ||
24 | |||
25 | typedef char *dqbuf_t; | ||
26 | |||
27 | static int get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth) | ||
28 | { | ||
29 | unsigned int epb = info->dqi_usable_bs >> 2; | ||
30 | |||
31 | depth = info->dqi_qtree_depth - depth - 1; | ||
32 | while (depth--) | ||
33 | id /= epb; | ||
34 | return id % epb; | ||
35 | } | ||
36 | |||
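/*
 * Editor's worked example (not part of the patch): with the defaults the
 * v2 format is expected to use -- a 1024-byte tree block, so epb == 256
 * ids per block, and dqi_qtree_depth == 4 -- id 70000 is resolved as:
 *
 *   get_index(info, 70000, 0) == (70000 / 256 / 256 / 256) % 256 ==   0
 *   get_index(info, 70000, 1) == (70000 / 256 / 256)       % 256 ==   1
 *   get_index(info, 70000, 2) == (70000 / 256)             % 256 ==  17
 *   get_index(info, 70000, 3) ==  70000                    % 256 == 112
 *
 * one index per tree level, most significant first.
 */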
37 | /* Number of entries in one block */ | ||
38 | static inline int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info) | ||
39 | { | ||
40 | return (info->dqi_usable_bs - sizeof(struct qt_disk_dqdbheader)) | ||
41 | / info->dqi_entry_size; | ||
42 | } | ||
43 | |||
44 | static dqbuf_t getdqbuf(size_t size) | ||
45 | { | ||
46 | dqbuf_t buf = kmalloc(size, GFP_NOFS); | ||
47 | if (!buf) | ||
48 | printk(KERN_WARNING "VFS: Not enough memory for quota buffers.\n"); | ||
49 | return buf; | ||
50 | } | ||
51 | |||
52 | static inline void freedqbuf(dqbuf_t buf) | ||
53 | { | ||
54 | kfree(buf); | ||
55 | } | ||
56 | |||
57 | static inline ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf) | ||
58 | { | ||
59 | struct super_block *sb = info->dqi_sb; | ||
60 | |||
61 | memset(buf, 0, info->dqi_usable_bs); | ||
62 | return sb->s_op->quota_read(sb, info->dqi_type, (char *)buf, | ||
63 | info->dqi_usable_bs, blk << info->dqi_blocksize_bits); | ||
64 | } | ||
65 | |||
66 | static inline ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf) | ||
67 | { | ||
68 | struct super_block *sb = info->dqi_sb; | ||
69 | |||
70 | return sb->s_op->quota_write(sb, info->dqi_type, (char *)buf, | ||
71 | info->dqi_usable_bs, blk << info->dqi_blocksize_bits); | ||
72 | } | ||
73 | |||
74 | /* Remove empty block from list and return it */ | ||
75 | static int get_free_dqblk(struct qtree_mem_dqinfo *info) | ||
76 | { | ||
77 | dqbuf_t buf = getdqbuf(info->dqi_usable_bs); | ||
78 | struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; | ||
79 | int ret, blk; | ||
80 | |||
81 | if (!buf) | ||
82 | return -ENOMEM; | ||
83 | if (info->dqi_free_blk) { | ||
84 | blk = info->dqi_free_blk; | ||
85 | ret = read_blk(info, blk, buf); | ||
86 | if (ret < 0) | ||
87 | goto out_buf; | ||
88 | info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free); | ||
89 | } | ||
90 | else { | ||
91 | memset(buf, 0, info->dqi_usable_bs); | ||
92 | /* Assure block allocation... */ | ||
93 | ret = write_blk(info, info->dqi_blocks, buf); | ||
94 | if (ret < 0) | ||
95 | goto out_buf; | ||
96 | blk = info->dqi_blocks++; | ||
97 | } | ||
98 | mark_info_dirty(info->dqi_sb, info->dqi_type); | ||
99 | ret = blk; | ||
100 | out_buf: | ||
101 | freedqbuf(buf); | ||
102 | return ret; | ||
103 | } | ||
104 | |||
105 | /* Insert empty block to the list */ | ||
106 | static int put_free_dqblk(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk) | ||
107 | { | ||
108 | struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; | ||
109 | int err; | ||
110 | |||
111 | dh->dqdh_next_free = cpu_to_le32(info->dqi_free_blk); | ||
112 | dh->dqdh_prev_free = cpu_to_le32(0); | ||
113 | dh->dqdh_entries = cpu_to_le16(0); | ||
114 | err = write_blk(info, blk, buf); | ||
115 | if (err < 0) | ||
116 | return err; | ||
117 | info->dqi_free_blk = blk; | ||
118 | mark_info_dirty(info->dqi_sb, info->dqi_type); | ||
119 | return 0; | ||
120 | } | ||
121 | |||
122 | /* Remove given block from the list of blocks with free entries */ | ||
123 | static int remove_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk) | ||
124 | { | ||
125 | dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs); | ||
126 | struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; | ||
127 | uint nextblk = le32_to_cpu(dh->dqdh_next_free); | ||
128 | uint prevblk = le32_to_cpu(dh->dqdh_prev_free); | ||
129 | int err; | ||
130 | |||
131 | if (!tmpbuf) | ||
132 | return -ENOMEM; | ||
133 | if (nextblk) { | ||
134 | err = read_blk(info, nextblk, tmpbuf); | ||
135 | if (err < 0) | ||
136 | goto out_buf; | ||
137 | ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = | ||
138 | dh->dqdh_prev_free; | ||
139 | err = write_blk(info, nextblk, tmpbuf); | ||
140 | if (err < 0) | ||
141 | goto out_buf; | ||
142 | } | ||
143 | if (prevblk) { | ||
144 | err = read_blk(info, prevblk, tmpbuf); | ||
145 | if (err < 0) | ||
146 | goto out_buf; | ||
147 | ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_next_free = | ||
148 | dh->dqdh_next_free; | ||
149 | err = write_blk(info, prevblk, tmpbuf); | ||
150 | if (err < 0) | ||
151 | goto out_buf; | ||
152 | } else { | ||
153 | info->dqi_free_entry = nextblk; | ||
154 | mark_info_dirty(info->dqi_sb, info->dqi_type); | ||
155 | } | ||
156 | freedqbuf(tmpbuf); | ||
157 | dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0); | ||
158 | /* No matter whether write succeeds block is out of list */ | ||
159 | if (write_blk(info, blk, buf) < 0) | ||
160 | printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk); | ||
161 | return 0; | ||
162 | out_buf: | ||
163 | freedqbuf(tmpbuf); | ||
164 | return err; | ||
165 | } | ||
166 | |||
167 | /* Insert given block to the beginning of list with free entries */ | ||
168 | static int insert_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk) | ||
169 | { | ||
170 | dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs); | ||
171 | struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; | ||
172 | int err; | ||
173 | |||
174 | if (!tmpbuf) | ||
175 | return -ENOMEM; | ||
176 | dh->dqdh_next_free = cpu_to_le32(info->dqi_free_entry); | ||
177 | dh->dqdh_prev_free = cpu_to_le32(0); | ||
178 | err = write_blk(info, blk, buf); | ||
179 | if (err < 0) | ||
180 | goto out_buf; | ||
181 | if (info->dqi_free_entry) { | ||
182 | err = read_blk(info, info->dqi_free_entry, tmpbuf); | ||
183 | if (err < 0) | ||
184 | goto out_buf; | ||
185 | ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = | ||
186 | cpu_to_le32(blk); | ||
187 | err = write_blk(info, info->dqi_free_entry, tmpbuf); | ||
188 | if (err < 0) | ||
189 | goto out_buf; | ||
190 | } | ||
191 | freedqbuf(tmpbuf); | ||
192 | info->dqi_free_entry = blk; | ||
193 | mark_info_dirty(info->dqi_sb, info->dqi_type); | ||
194 | return 0; | ||
195 | out_buf: | ||
196 | freedqbuf(tmpbuf); | ||
197 | return err; | ||
198 | } | ||
199 | |||
200 | /* Is the entry in the block free? */ | ||
201 | int qtree_entry_unused(struct qtree_mem_dqinfo *info, char *disk) | ||
202 | { | ||
203 | int i; | ||
204 | |||
205 | for (i = 0; i < info->dqi_entry_size; i++) | ||
206 | if (disk[i]) | ||
207 | return 0; | ||
208 | return 1; | ||
209 | } | ||
210 | EXPORT_SYMBOL(qtree_entry_unused); | ||
211 | |||
212 | /* Find space for dquot */ | ||
213 | static uint find_free_dqentry(struct qtree_mem_dqinfo *info, | ||
214 | struct dquot *dquot, int *err) | ||
215 | { | ||
216 | uint blk, i; | ||
217 | struct qt_disk_dqdbheader *dh; | ||
218 | dqbuf_t buf = getdqbuf(info->dqi_usable_bs); | ||
219 | char *ddquot; | ||
220 | |||
221 | *err = 0; | ||
222 | if (!buf) { | ||
223 | *err = -ENOMEM; | ||
224 | return 0; | ||
225 | } | ||
226 | dh = (struct qt_disk_dqdbheader *)buf; | ||
227 | if (info->dqi_free_entry) { | ||
228 | blk = info->dqi_free_entry; | ||
229 | *err = read_blk(info, blk, buf); | ||
230 | if (*err < 0) | ||
231 | goto out_buf; | ||
232 | } else { | ||
233 | blk = get_free_dqblk(info); | ||
234 | if ((int)blk < 0) { | ||
235 | *err = blk; | ||
236 | freedqbuf(buf); | ||
237 | return 0; | ||
238 | } | ||
239 | memset(buf, 0, info->dqi_usable_bs); | ||
240 | /* This is enough as block is already zeroed and entry list is empty... */ | ||
241 | info->dqi_free_entry = blk; | ||
242 | mark_info_dirty(dquot->dq_sb, dquot->dq_type); | ||
243 | } | ||
244 | /* Block will be full? */ | ||
245 | if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) { | ||
246 | *err = remove_free_dqentry(info, buf, blk); | ||
247 | if (*err < 0) { | ||
248 | printk(KERN_ERR "VFS: find_free_dqentry(): Can't " | ||
249 | "remove block (%u) from entry free list.\n", | ||
250 | blk); | ||
251 | goto out_buf; | ||
252 | } | ||
253 | } | ||
254 | le16_add_cpu(&dh->dqdh_entries, 1); | ||
255 | /* Find free structure in block */ | ||
256 | for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader); | ||
257 | i < qtree_dqstr_in_blk(info) && !qtree_entry_unused(info, ddquot); | ||
258 | i++, ddquot += info->dqi_entry_size); | ||
259 | #ifdef __QUOTA_QT_PARANOIA | ||
260 | if (i == qtree_dqstr_in_blk(info)) { | ||
261 | printk(KERN_ERR "VFS: find_free_dqentry(): Data block full " | ||
262 | "but it shouldn't.\n"); | ||
263 | *err = -EIO; | ||
264 | goto out_buf; | ||
265 | } | ||
266 | #endif | ||
267 | *err = write_blk(info, blk, buf); | ||
268 | if (*err < 0) { | ||
269 | printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota " | ||
270 | "data block %u.\n", blk); | ||
271 | goto out_buf; | ||
272 | } | ||
273 | dquot->dq_off = (blk << info->dqi_blocksize_bits) + | ||
274 | sizeof(struct qt_disk_dqdbheader) + | ||
275 | i * info->dqi_entry_size; | ||
276 | freedqbuf(buf); | ||
277 | return blk; | ||
278 | out_buf: | ||
279 | freedqbuf(buf); | ||
280 | return 0; | ||
281 | } | ||
282 | |||
283 | /* Insert reference to structure into the trie */ | ||
284 | static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, | ||
285 | uint *treeblk, int depth) | ||
286 | { | ||
287 | dqbuf_t buf = getdqbuf(info->dqi_usable_bs); | ||
288 | int ret = 0, newson = 0, newact = 0; | ||
289 | __le32 *ref; | ||
290 | uint newblk; | ||
291 | |||
292 | if (!buf) | ||
293 | return -ENOMEM; | ||
294 | if (!*treeblk) { | ||
295 | ret = get_free_dqblk(info); | ||
296 | if (ret < 0) | ||
297 | goto out_buf; | ||
298 | *treeblk = ret; | ||
299 | memset(buf, 0, info->dqi_usable_bs); | ||
300 | newact = 1; | ||
301 | } else { | ||
302 | ret = read_blk(info, *treeblk, buf); | ||
303 | if (ret < 0) { | ||
304 | printk(KERN_ERR "VFS: Can't read tree quota block " | ||
305 | "%u.\n", *treeblk); | ||
306 | goto out_buf; | ||
307 | } | ||
308 | } | ||
309 | ref = (__le32 *)buf; | ||
310 | newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); | ||
311 | if (!newblk) | ||
312 | newson = 1; | ||
313 | if (depth == info->dqi_qtree_depth - 1) { | ||
314 | #ifdef __QUOTA_QT_PARANOIA | ||
315 | if (newblk) { | ||
316 | printk(KERN_ERR "VFS: Inserting already present quota " | ||
317 | "entry (block %u).\n", | ||
318 | le32_to_cpu(ref[get_index(info, | ||
319 | dquot->dq_id, depth)])); | ||
320 | ret = -EIO; | ||
321 | goto out_buf; | ||
322 | } | ||
323 | #endif | ||
324 | newblk = find_free_dqentry(info, dquot, &ret); | ||
325 | } else { | ||
326 | ret = do_insert_tree(info, dquot, &newblk, depth+1); | ||
327 | } | ||
328 | if (newson && ret >= 0) { | ||
329 | ref[get_index(info, dquot->dq_id, depth)] = | ||
330 | cpu_to_le32(newblk); | ||
331 | ret = write_blk(info, *treeblk, buf); | ||
332 | } else if (newact && ret < 0) { | ||
333 | put_free_dqblk(info, buf, *treeblk); | ||
334 | } | ||
335 | out_buf: | ||
336 | freedqbuf(buf); | ||
337 | return ret; | ||
338 | } | ||
339 | |||
340 | /* Wrapper for inserting quota structure into tree */ | ||
341 | static inline int dq_insert_tree(struct qtree_mem_dqinfo *info, | ||
342 | struct dquot *dquot) | ||
343 | { | ||
344 | int tmp = QT_TREEOFF; | ||
345 | return do_insert_tree(info, dquot, &tmp, 0); | ||
346 | } | ||
347 | |||
348 | /* | ||
349 | * We don't have to be afraid of deadlocks as we never have quotas on quota files... | ||
350 | */ | ||
351 | int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) | ||
352 | { | ||
353 | int type = dquot->dq_type; | ||
354 | struct super_block *sb = dquot->dq_sb; | ||
355 | ssize_t ret; | ||
356 | dqbuf_t ddquot = getdqbuf(info->dqi_entry_size); | ||
357 | |||
358 | if (!ddquot) | ||
359 | return -ENOMEM; | ||
360 | |||
361 | /* dq_off is guarded by dqio_mutex */ | ||
362 | if (!dquot->dq_off) { | ||
363 | ret = dq_insert_tree(info, dquot); | ||
364 | if (ret < 0) { | ||
365 | printk(KERN_ERR "VFS: Error %zd occurred while " | ||
366 | "creating quota.\n", ret); | ||
367 | freedqbuf(ddquot); | ||
368 | return ret; | ||
369 | } | ||
370 | } | ||
371 | spin_lock(&dq_data_lock); | ||
372 | info->dqi_ops->mem2disk_dqblk(ddquot, dquot); | ||
373 | spin_unlock(&dq_data_lock); | ||
374 | ret = sb->s_op->quota_write(sb, type, (char *)ddquot, | ||
375 | info->dqi_entry_size, dquot->dq_off); | ||
376 | if (ret != info->dqi_entry_size) { | ||
377 | printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", | ||
378 | sb->s_id); | ||
379 | if (ret >= 0) | ||
380 | ret = -ENOSPC; | ||
381 | } else { | ||
382 | ret = 0; | ||
383 | } | ||
384 | dqstats.writes++; | ||
385 | freedqbuf(ddquot); | ||
386 | |||
387 | return ret; | ||
388 | } | ||
389 | EXPORT_SYMBOL(qtree_write_dquot); | ||
390 | |||
391 | /* Free dquot entry in data block */ | ||
392 | static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, | ||
393 | uint blk) | ||
394 | { | ||
395 | struct qt_disk_dqdbheader *dh; | ||
396 | dqbuf_t buf = getdqbuf(info->dqi_usable_bs); | ||
397 | int ret = 0; | ||
398 | |||
399 | if (!buf) | ||
400 | return -ENOMEM; | ||
401 | if (dquot->dq_off >> info->dqi_blocksize_bits != blk) { | ||
402 | printk(KERN_ERR "VFS: Quota structure has offset to other " | ||
403 | "block (%u) than it should (%u).\n", blk, | ||
404 | (uint)(dquot->dq_off >> info->dqi_blocksize_bits)); | ||
405 | goto out_buf; | ||
406 | } | ||
407 | ret = read_blk(info, blk, buf); | ||
408 | if (ret < 0) { | ||
409 | printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk); | ||
410 | goto out_buf; | ||
411 | } | ||
412 | dh = (struct qt_disk_dqdbheader *)buf; | ||
413 | le16_add_cpu(&dh->dqdh_entries, -1); | ||
414 | if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */ | ||
415 | ret = remove_free_dqentry(info, buf, blk); | ||
416 | if (ret >= 0) | ||
417 | ret = put_free_dqblk(info, buf, blk); | ||
418 | if (ret < 0) { | ||
419 | printk(KERN_ERR "VFS: Can't move quota data block (%u) " | ||
420 | "to free list.\n", blk); | ||
421 | goto out_buf; | ||
422 | } | ||
423 | } else { | ||
424 | memset(buf + | ||
425 | (dquot->dq_off & ((1 << info->dqi_blocksize_bits) - 1)), | ||
426 | 0, info->dqi_entry_size); | ||
427 | if (le16_to_cpu(dh->dqdh_entries) == | ||
428 | qtree_dqstr_in_blk(info) - 1) { | ||
429 | /* Insert will write block itself */ | ||
430 | ret = insert_free_dqentry(info, buf, blk); | ||
431 | if (ret < 0) { | ||
432 | printk(KERN_ERR "VFS: Can't insert quota data " | ||
433 | "block (%u) to free entry list.\n", blk); | ||
434 | goto out_buf; | ||
435 | } | ||
436 | } else { | ||
437 | ret = write_blk(info, blk, buf); | ||
438 | if (ret < 0) { | ||
439 | printk(KERN_ERR "VFS: Can't write quota data " | ||
440 | "block %u\n", blk); | ||
441 | goto out_buf; | ||
442 | } | ||
443 | } | ||
444 | } | ||
445 | dquot->dq_off = 0; /* Quota is now unattached */ | ||
446 | out_buf: | ||
447 | freedqbuf(buf); | ||
448 | return ret; | ||
449 | } | ||
450 | |||
451 | /* Remove reference to dquot from tree */ | ||
452 | static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, | ||
453 | uint *blk, int depth) | ||
454 | { | ||
455 | dqbuf_t buf = getdqbuf(info->dqi_usable_bs); | ||
456 | int ret = 0; | ||
457 | uint newblk; | ||
458 | __le32 *ref = (__le32 *)buf; | ||
459 | |||
460 | if (!buf) | ||
461 | return -ENOMEM; | ||
462 | ret = read_blk(info, *blk, buf); | ||
463 | if (ret < 0) { | ||
464 | printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk); | ||
465 | goto out_buf; | ||
466 | } | ||
467 | newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); | ||
468 | if (depth == info->dqi_qtree_depth - 1) { | ||
469 | ret = free_dqentry(info, dquot, newblk); | ||
470 | newblk = 0; | ||
471 | } else { | ||
472 | ret = remove_tree(info, dquot, &newblk, depth+1); | ||
473 | } | ||
474 | if (ret >= 0 && !newblk) { | ||
475 | int i; | ||
476 | ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(0); | ||
477 | /* Block got empty? */ | ||
478 | for (i = 0; | ||
479 | i < (info->dqi_usable_bs >> 2) && !ref[i]; | ||
480 | i++); | ||
481 | /* Don't put the root block into the free block list */ | ||
482 | if (i == (info->dqi_usable_bs >> 2) | ||
483 | && *blk != QT_TREEOFF) { | ||
484 | put_free_dqblk(info, buf, *blk); | ||
485 | *blk = 0; | ||
486 | } else { | ||
487 | ret = write_blk(info, *blk, buf); | ||
488 | if (ret < 0) | ||
489 | printk(KERN_ERR "VFS: Can't write quota tree " | ||
490 | "block %u.\n", *blk); | ||
491 | } | ||
492 | } | ||
493 | out_buf: | ||
494 | freedqbuf(buf); | ||
495 | return ret; | ||
496 | } | ||
497 | |||
498 | /* Delete dquot from tree */ | ||
499 | int qtree_delete_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) | ||
500 | { | ||
501 | uint tmp = QT_TREEOFF; | ||
502 | |||
503 | if (!dquot->dq_off) /* Not even allocated? */ | ||
504 | return 0; | ||
505 | return remove_tree(info, dquot, &tmp, 0); | ||
506 | } | ||
507 | EXPORT_SYMBOL(qtree_delete_dquot); | ||
508 | |||
509 | /* Find entry in block */ | ||
510 | static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info, | ||
511 | struct dquot *dquot, uint blk) | ||
512 | { | ||
513 | dqbuf_t buf = getdqbuf(info->dqi_usable_bs); | ||
514 | loff_t ret = 0; | ||
515 | int i; | ||
516 | char *ddquot; | ||
517 | |||
518 | if (!buf) | ||
519 | return -ENOMEM; | ||
520 | ret = read_blk(info, blk, buf); | ||
521 | if (ret < 0) { | ||
522 | printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); | ||
523 | goto out_buf; | ||
524 | } | ||
525 | for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader); | ||
526 | i < qtree_dqstr_in_blk(info) && !info->dqi_ops->is_id(ddquot, dquot); | ||
527 | i++, ddquot += info->dqi_entry_size); | ||
528 | if (i == qtree_dqstr_in_blk(info)) { | ||
529 | printk(KERN_ERR "VFS: Quota for id %u referenced " | ||
530 | "but not present.\n", dquot->dq_id); | ||
531 | ret = -EIO; | ||
532 | goto out_buf; | ||
533 | } else { | ||
534 | ret = (blk << info->dqi_blocksize_bits) + sizeof(struct | ||
535 | qt_disk_dqdbheader) + i * info->dqi_entry_size; | ||
536 | } | ||
537 | out_buf: | ||
538 | freedqbuf(buf); | ||
539 | return ret; | ||
540 | } | ||
541 | |||
542 | /* Find entry for given id in the tree */ | ||
543 | static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info, | ||
544 | struct dquot *dquot, uint blk, int depth) | ||
545 | { | ||
546 | dqbuf_t buf = getdqbuf(info->dqi_usable_bs); | ||
547 | loff_t ret = 0; | ||
548 | __le32 *ref = (__le32 *)buf; | ||
549 | |||
550 | if (!buf) | ||
551 | return -ENOMEM; | ||
552 | ret = read_blk(info, blk, buf); | ||
553 | if (ret < 0) { | ||
554 | printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); | ||
555 | goto out_buf; | ||
556 | } | ||
557 | ret = 0; | ||
558 | blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); | ||
559 | if (!blk) /* No reference? */ | ||
560 | goto out_buf; | ||
561 | if (depth < info->dqi_qtree_depth - 1) | ||
562 | ret = find_tree_dqentry(info, dquot, blk, depth+1); | ||
563 | else | ||
564 | ret = find_block_dqentry(info, dquot, blk); | ||
565 | out_buf: | ||
566 | freedqbuf(buf); | ||
567 | return ret; | ||
568 | } | ||
569 | |||
570 | /* Find entry for given id in the tree - wrapper function */ | ||
571 | static inline loff_t find_dqentry(struct qtree_mem_dqinfo *info, | ||
572 | struct dquot *dquot) | ||
573 | { | ||
574 | return find_tree_dqentry(info, dquot, QT_TREEOFF, 0); | ||
575 | } | ||
576 | |||
577 | int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) | ||
578 | { | ||
579 | int type = dquot->dq_type; | ||
580 | struct super_block *sb = dquot->dq_sb; | ||
581 | loff_t offset; | ||
582 | dqbuf_t ddquot; | ||
583 | int ret = 0; | ||
584 | |||
585 | #ifdef __QUOTA_QT_PARANOIA | ||
586 | /* Invalidated quota? */ | ||
587 | if (!sb_dqopt(dquot->dq_sb)->files[type]) { | ||
588 | printk(KERN_ERR "VFS: Quota invalidated while reading!\n"); | ||
589 | return -EIO; | ||
590 | } | ||
591 | #endif | ||
592 | /* Do we know offset of the dquot entry in the quota file? */ | ||
593 | if (!dquot->dq_off) { | ||
594 | offset = find_dqentry(info, dquot); | ||
595 | if (offset <= 0) { /* Entry not present? */ | ||
596 | if (offset < 0) | ||
597 | printk(KERN_ERR "VFS: Can't read quota " | ||
598 | "structure for id %u.\n", dquot->dq_id); | ||
599 | dquot->dq_off = 0; | ||
600 | set_bit(DQ_FAKE_B, &dquot->dq_flags); | ||
601 | memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); | ||
602 | ret = offset; | ||
603 | goto out; | ||
604 | } | ||
605 | dquot->dq_off = offset; | ||
606 | } | ||
607 | ddquot = getdqbuf(info->dqi_entry_size); | ||
608 | if (!ddquot) | ||
609 | return -ENOMEM; | ||
610 | ret = sb->s_op->quota_read(sb, type, (char *)ddquot, | ||
611 | info->dqi_entry_size, dquot->dq_off); | ||
612 | if (ret != info->dqi_entry_size) { | ||
613 | if (ret >= 0) | ||
614 | ret = -EIO; | ||
615 | printk(KERN_ERR "VFS: Error while reading quota " | ||
616 | "structure for id %u.\n", dquot->dq_id); | ||
617 | set_bit(DQ_FAKE_B, &dquot->dq_flags); | ||
618 | memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); | ||
619 | freedqbuf(ddquot); | ||
620 | goto out; | ||
621 | } | ||
622 | spin_lock(&dq_data_lock); | ||
623 | info->dqi_ops->disk2mem_dqblk(dquot, ddquot); | ||
624 | if (!dquot->dq_dqb.dqb_bhardlimit && | ||
625 | !dquot->dq_dqb.dqb_bsoftlimit && | ||
626 | !dquot->dq_dqb.dqb_ihardlimit && | ||
627 | !dquot->dq_dqb.dqb_isoftlimit) | ||
628 | set_bit(DQ_FAKE_B, &dquot->dq_flags); | ||
629 | spin_unlock(&dq_data_lock); | ||
630 | freedqbuf(ddquot); | ||
631 | out: | ||
632 | dqstats.reads++; | ||
633 | return ret; | ||
634 | } | ||
635 | EXPORT_SYMBOL(qtree_read_dquot); | ||
636 | |||
637 | /* Check whether the dquot should be deleted when released. We know | ||
638 | * we are the only one operating on the dquot (thanks to dq_lock). */ | ||
639 | int qtree_release_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) | ||
640 | { | ||
641 | if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace)) | ||
642 | return qtree_delete_dquot(info, dquot); | ||
643 | return 0; | ||
644 | } | ||
645 | EXPORT_SYMBOL(qtree_release_dquot); | ||
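For reference, the offset arithmetic used by find_free_dqentry() and find_block_dqentry() above reduces to a single expression. The helper below is an illustrative sketch only (the name qtree_entry_off is made up for this note and is not part of the patch); it relies solely on the qtree_mem_dqinfo fields visible in the new code.

	static loff_t qtree_entry_off(struct qtree_mem_dqinfo *info, uint blk, uint i)
	{
		/* Block blk starts at blk << dqi_blocksize_bits in the quota file;
		 * entries follow the struct qt_disk_dqdbheader at the start of
		 * each data block. */
		return ((loff_t)blk << info->dqi_blocksize_bits) +
			sizeof(struct qt_disk_dqdbheader) +
			(loff_t)i * info->dqi_entry_size;
	}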
diff --git a/fs/quota_tree.h b/fs/quota_tree.h new file mode 100644 index 000000000000..a1ab8db81a51 --- /dev/null +++ b/fs/quota_tree.h | |||
@@ -0,0 +1,25 @@ | |||
1 | /* | ||
2 | * Definitions of structures for vfsv0 quota format | ||
3 | */ | ||
4 | |||
5 | #ifndef _LINUX_QUOTA_TREE_H | ||
6 | #define _LINUX_QUOTA_TREE_H | ||
7 | |||
8 | #include <linux/types.h> | ||
9 | #include <linux/quota.h> | ||
10 | |||
11 | /* | ||
12 | * Structure of header of block with quota structures. It is padded to 16 bytes so | ||
13 | * there will be space for exactly 21 quota-entries in a block | ||
14 | */ | ||
15 | struct qt_disk_dqdbheader { | ||
16 | __le32 dqdh_next_free; /* Number of next block with free entry */ | ||
17 | __le32 dqdh_prev_free; /* Number of previous block with free entry */ | ||
18 | __le16 dqdh_entries; /* Number of valid entries in block */ | ||
19 | __le16 dqdh_pad1; | ||
20 | __le32 dqdh_pad2; | ||
21 | }; | ||
22 | |||
23 | #define QT_TREEOFF 1 /* Offset of tree in file in blocks */ | ||
24 | |||
25 | #endif /* _LINUX_QUOTA_TREE_H */ | ||
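As a quick check of the "exactly 21 quota-entries" comment above: with the 1 KiB blocks used by the v2 format (V2_DQBLKSIZE_BITS, defined later in this patch) and the 48-byte struct v2_disk_dqblk, the 16-byte header leaves room for exactly 21 entries. The arithmetic, spelled out for illustration only:

	/* 1 << V2_DQBLKSIZE_BITS            = 1024 bytes per block
	 * sizeof(struct qt_disk_dqdbheader) =   16 bytes
	 * sizeof(struct v2_disk_dqblk)      =   48 bytes
	 * entries per block = (1024 - 16) / 48 = 21
	 */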
diff --git a/fs/quota_v1.c b/fs/quota_v1.c index 5ae15b13eeb0..b4af1c69ad16 100644 --- a/fs/quota_v1.c +++ b/fs/quota_v1.c | |||
@@ -3,25 +3,39 @@ | |||
3 | #include <linux/quota.h> | 3 | #include <linux/quota.h> |
4 | #include <linux/quotaops.h> | 4 | #include <linux/quotaops.h> |
5 | #include <linux/dqblk_v1.h> | 5 | #include <linux/dqblk_v1.h> |
6 | #include <linux/quotaio_v1.h> | ||
7 | #include <linux/kernel.h> | 6 | #include <linux/kernel.h> |
8 | #include <linux/init.h> | 7 | #include <linux/init.h> |
9 | #include <linux/module.h> | 8 | #include <linux/module.h> |
10 | 9 | ||
11 | #include <asm/byteorder.h> | 10 | #include <asm/byteorder.h> |
12 | 11 | ||
12 | #include "quotaio_v1.h" | ||
13 | |||
13 | MODULE_AUTHOR("Jan Kara"); | 14 | MODULE_AUTHOR("Jan Kara"); |
14 | MODULE_DESCRIPTION("Old quota format support"); | 15 | MODULE_DESCRIPTION("Old quota format support"); |
15 | MODULE_LICENSE("GPL"); | 16 | MODULE_LICENSE("GPL"); |
16 | 17 | ||
18 | #define QUOTABLOCK_BITS 10 | ||
19 | #define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS) | ||
20 | |||
21 | static inline qsize_t v1_stoqb(qsize_t space) | ||
22 | { | ||
23 | return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS; | ||
24 | } | ||
25 | |||
26 | static inline qsize_t v1_qbtos(qsize_t blocks) | ||
27 | { | ||
28 | return blocks << QUOTABLOCK_BITS; | ||
29 | } | ||
30 | |||
17 | static void v1_disk2mem_dqblk(struct mem_dqblk *m, struct v1_disk_dqblk *d) | 31 | static void v1_disk2mem_dqblk(struct mem_dqblk *m, struct v1_disk_dqblk *d) |
18 | { | 32 | { |
19 | m->dqb_ihardlimit = d->dqb_ihardlimit; | 33 | m->dqb_ihardlimit = d->dqb_ihardlimit; |
20 | m->dqb_isoftlimit = d->dqb_isoftlimit; | 34 | m->dqb_isoftlimit = d->dqb_isoftlimit; |
21 | m->dqb_curinodes = d->dqb_curinodes; | 35 | m->dqb_curinodes = d->dqb_curinodes; |
22 | m->dqb_bhardlimit = d->dqb_bhardlimit; | 36 | m->dqb_bhardlimit = v1_qbtos(d->dqb_bhardlimit); |
23 | m->dqb_bsoftlimit = d->dqb_bsoftlimit; | 37 | m->dqb_bsoftlimit = v1_qbtos(d->dqb_bsoftlimit); |
24 | m->dqb_curspace = ((qsize_t)d->dqb_curblocks) << QUOTABLOCK_BITS; | 38 | m->dqb_curspace = v1_qbtos(d->dqb_curblocks); |
25 | m->dqb_itime = d->dqb_itime; | 39 | m->dqb_itime = d->dqb_itime; |
26 | m->dqb_btime = d->dqb_btime; | 40 | m->dqb_btime = d->dqb_btime; |
27 | } | 41 | } |
@@ -31,9 +45,9 @@ static void v1_mem2disk_dqblk(struct v1_disk_dqblk *d, struct mem_dqblk *m) | |||
31 | d->dqb_ihardlimit = m->dqb_ihardlimit; | 45 | d->dqb_ihardlimit = m->dqb_ihardlimit; |
32 | d->dqb_isoftlimit = m->dqb_isoftlimit; | 46 | d->dqb_isoftlimit = m->dqb_isoftlimit; |
33 | d->dqb_curinodes = m->dqb_curinodes; | 47 | d->dqb_curinodes = m->dqb_curinodes; |
34 | d->dqb_bhardlimit = m->dqb_bhardlimit; | 48 | d->dqb_bhardlimit = v1_stoqb(m->dqb_bhardlimit); |
35 | d->dqb_bsoftlimit = m->dqb_bsoftlimit; | 49 | d->dqb_bsoftlimit = v1_stoqb(m->dqb_bsoftlimit); |
36 | d->dqb_curblocks = toqb(m->dqb_curspace); | 50 | d->dqb_curblocks = v1_stoqb(m->dqb_curspace); |
37 | d->dqb_itime = m->dqb_itime; | 51 | d->dqb_itime = m->dqb_itime; |
38 | d->dqb_btime = m->dqb_btime; | 52 | d->dqb_btime = m->dqb_btime; |
39 | } | 53 | } |
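The two helpers added here convert between byte counts and the 1 KiB quota blocks stored in the v1 format; v1_stoqb() rounds up, so a partially used block still counts as one. A usage sketch with made-up numbers, for illustration only:

	qsize_t space  = 1500;              /* bytes charged to the user */
	qsize_t blocks = v1_stoqb(space);   /* (1500 + 1023) >> 10 = 2 */
	qsize_t bytes  = v1_qbtos(blocks);  /* 2 << 10 = 2048, rounded up */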
diff --git a/fs/quota_v2.c b/fs/quota_v2.c index b53827dc02d9..b618b563635c 100644 --- a/fs/quota_v2.c +++ b/fs/quota_v2.c | |||
@@ -6,7 +6,6 @@ | |||
6 | #include <linux/fs.h> | 6 | #include <linux/fs.h> |
7 | #include <linux/mount.h> | 7 | #include <linux/mount.h> |
8 | #include <linux/dqblk_v2.h> | 8 | #include <linux/dqblk_v2.h> |
9 | #include <linux/quotaio_v2.h> | ||
10 | #include <linux/kernel.h> | 9 | #include <linux/kernel.h> |
11 | #include <linux/init.h> | 10 | #include <linux/init.h> |
12 | #include <linux/module.h> | 11 | #include <linux/module.h> |
@@ -15,16 +14,37 @@ | |||
15 | 14 | ||
16 | #include <asm/byteorder.h> | 15 | #include <asm/byteorder.h> |
17 | 16 | ||
17 | #include "quota_tree.h" | ||
18 | #include "quotaio_v2.h" | ||
19 | |||
18 | MODULE_AUTHOR("Jan Kara"); | 20 | MODULE_AUTHOR("Jan Kara"); |
19 | MODULE_DESCRIPTION("Quota format v2 support"); | 21 | MODULE_DESCRIPTION("Quota format v2 support"); |
20 | MODULE_LICENSE("GPL"); | 22 | MODULE_LICENSE("GPL"); |
21 | 23 | ||
22 | #define __QUOTA_V2_PARANOIA | 24 | #define __QUOTA_V2_PARANOIA |
23 | 25 | ||
24 | typedef char *dqbuf_t; | 26 | static void v2_mem2diskdqb(void *dp, struct dquot *dquot); |
27 | static void v2_disk2memdqb(struct dquot *dquot, void *dp); | ||
28 | static int v2_is_id(void *dp, struct dquot *dquot); | ||
29 | |||
30 | static struct qtree_fmt_operations v2_qtree_ops = { | ||
31 | .mem2disk_dqblk = v2_mem2diskdqb, | ||
32 | .disk2mem_dqblk = v2_disk2memdqb, | ||
33 | .is_id = v2_is_id, | ||
34 | }; | ||
35 | |||
36 | #define QUOTABLOCK_BITS 10 | ||
37 | #define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS) | ||
25 | 38 | ||
26 | #define GETIDINDEX(id, depth) (((id) >> ((V2_DQTREEDEPTH-(depth)-1)*8)) & 0xff) | 39 | static inline qsize_t v2_stoqb(qsize_t space) |
27 | #define GETENTRIES(buf) ((struct v2_disk_dqblk *)(((char *)buf)+sizeof(struct v2_disk_dqdbheader))) | 40 | { |
41 | return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS; | ||
42 | } | ||
43 | |||
44 | static inline qsize_t v2_qbtos(qsize_t blocks) | ||
45 | { | ||
46 | return blocks << QUOTABLOCK_BITS; | ||
47 | } | ||
28 | 48 | ||
29 | /* Check whether given file is really vfsv0 quotafile */ | 49 | /* Check whether given file is really vfsv0 quotafile */ |
30 | static int v2_check_quota_file(struct super_block *sb, int type) | 50 | static int v2_check_quota_file(struct super_block *sb, int type) |
@@ -50,7 +70,8 @@ static int v2_check_quota_file(struct super_block *sb, int type) | |||
50 | static int v2_read_file_info(struct super_block *sb, int type) | 70 | static int v2_read_file_info(struct super_block *sb, int type) |
51 | { | 71 | { |
52 | struct v2_disk_dqinfo dinfo; | 72 | struct v2_disk_dqinfo dinfo; |
53 | struct mem_dqinfo *info = sb_dqopt(sb)->info+type; | 73 | struct mem_dqinfo *info = sb_dqinfo(sb, type); |
74 | struct qtree_mem_dqinfo *qinfo; | ||
54 | ssize_t size; | 75 | ssize_t size; |
55 | 76 | ||
56 | size = sb->s_op->quota_read(sb, type, (char *)&dinfo, | 77 | size = sb->s_op->quota_read(sb, type, (char *)&dinfo, |
@@ -60,15 +81,29 @@ static int v2_read_file_info(struct super_block *sb, int type) | |||
60 | sb->s_id); | 81 | sb->s_id); |
61 | return -1; | 82 | return -1; |
62 | } | 83 | } |
84 | info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS); | ||
85 | if (!info->dqi_priv) { | ||
86 | printk(KERN_WARNING | ||
87 | "Not enough memory for quota information structure.\n"); | ||
88 | return -1; | ||
89 | } | ||
90 | qinfo = info->dqi_priv; | ||
63 | /* limits are stored as unsigned 32-bit data */ | 91 | /* limits are stored as unsigned 32-bit data */ |
64 | info->dqi_maxblimit = 0xffffffff; | 92 | info->dqi_maxblimit = 0xffffffff; |
65 | info->dqi_maxilimit = 0xffffffff; | 93 | info->dqi_maxilimit = 0xffffffff; |
66 | info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); | 94 | info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); |
67 | info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); | 95 | info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); |
68 | info->dqi_flags = le32_to_cpu(dinfo.dqi_flags); | 96 | info->dqi_flags = le32_to_cpu(dinfo.dqi_flags); |
69 | info->u.v2_i.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); | 97 | qinfo->dqi_sb = sb; |
70 | info->u.v2_i.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk); | 98 | qinfo->dqi_type = type; |
71 | info->u.v2_i.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry); | 99 | qinfo->dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); |
100 | qinfo->dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk); | ||
101 | qinfo->dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry); | ||
102 | qinfo->dqi_blocksize_bits = V2_DQBLKSIZE_BITS; | ||
103 | qinfo->dqi_usable_bs = 1 << V2_DQBLKSIZE_BITS; | ||
104 | qinfo->dqi_qtree_depth = qtree_depth(qinfo); | ||
105 | qinfo->dqi_entry_size = sizeof(struct v2_disk_dqblk); | ||
106 | qinfo->dqi_ops = &v2_qtree_ops; | ||
72 | return 0; | 107 | return 0; |
73 | } | 108 | } |
74 | 109 | ||
@@ -76,7 +111,8 @@ static int v2_read_file_info(struct super_block *sb, int type) | |||
76 | static int v2_write_file_info(struct super_block *sb, int type) | 111 | static int v2_write_file_info(struct super_block *sb, int type) |
77 | { | 112 | { |
78 | struct v2_disk_dqinfo dinfo; | 113 | struct v2_disk_dqinfo dinfo; |
79 | struct mem_dqinfo *info = sb_dqopt(sb)->info+type; | 114 | struct mem_dqinfo *info = sb_dqinfo(sb, type); |
115 | struct qtree_mem_dqinfo *qinfo = info->dqi_priv; | ||
80 | ssize_t size; | 116 | ssize_t size; |
81 | 117 | ||
82 | spin_lock(&dq_data_lock); | 118 | spin_lock(&dq_data_lock); |
@@ -85,9 +121,9 @@ static int v2_write_file_info(struct super_block *sb, int type) | |||
85 | dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace); | 121 | dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace); |
86 | dinfo.dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK); | 122 | dinfo.dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK); |
87 | spin_unlock(&dq_data_lock); | 123 | spin_unlock(&dq_data_lock); |
88 | dinfo.dqi_blocks = cpu_to_le32(info->u.v2_i.dqi_blocks); | 124 | dinfo.dqi_blocks = cpu_to_le32(qinfo->dqi_blocks); |
89 | dinfo.dqi_free_blk = cpu_to_le32(info->u.v2_i.dqi_free_blk); | 125 | dinfo.dqi_free_blk = cpu_to_le32(qinfo->dqi_free_blk); |
90 | dinfo.dqi_free_entry = cpu_to_le32(info->u.v2_i.dqi_free_entry); | 126 | dinfo.dqi_free_entry = cpu_to_le32(qinfo->dqi_free_entry); |
91 | size = sb->s_op->quota_write(sb, type, (char *)&dinfo, | 127 | size = sb->s_op->quota_write(sb, type, (char *)&dinfo, |
92 | sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); | 128 | sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); |
93 | if (size != sizeof(struct v2_disk_dqinfo)) { | 129 | if (size != sizeof(struct v2_disk_dqinfo)) { |
@@ -98,574 +134,75 @@ static int v2_write_file_info(struct super_block *sb, int type) | |||
98 | return 0; | 134 | return 0; |
99 | } | 135 | } |
100 | 136 | ||
101 | static void disk2memdqb(struct mem_dqblk *m, struct v2_disk_dqblk *d) | 137 | static void v2_disk2memdqb(struct dquot *dquot, void *dp) |
102 | { | 138 | { |
139 | struct v2_disk_dqblk *d = dp, empty; | ||
140 | struct mem_dqblk *m = &dquot->dq_dqb; | ||
141 | |||
103 | m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit); | 142 | m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit); |
104 | m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit); | 143 | m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit); |
105 | m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes); | 144 | m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes); |
106 | m->dqb_itime = le64_to_cpu(d->dqb_itime); | 145 | m->dqb_itime = le64_to_cpu(d->dqb_itime); |
107 | m->dqb_bhardlimit = le32_to_cpu(d->dqb_bhardlimit); | 146 | m->dqb_bhardlimit = v2_qbtos(le32_to_cpu(d->dqb_bhardlimit)); |
108 | m->dqb_bsoftlimit = le32_to_cpu(d->dqb_bsoftlimit); | 147 | m->dqb_bsoftlimit = v2_qbtos(le32_to_cpu(d->dqb_bsoftlimit)); |
109 | m->dqb_curspace = le64_to_cpu(d->dqb_curspace); | 148 | m->dqb_curspace = le64_to_cpu(d->dqb_curspace); |
110 | m->dqb_btime = le64_to_cpu(d->dqb_btime); | 149 | m->dqb_btime = le64_to_cpu(d->dqb_btime); |
150 | /* We need to escape back all-zero structure */ | ||
151 | memset(&empty, 0, sizeof(struct v2_disk_dqblk)); | ||
152 | empty.dqb_itime = cpu_to_le64(1); | ||
153 | if (!memcmp(&empty, dp, sizeof(struct v2_disk_dqblk))) | ||
154 | m->dqb_itime = 0; | ||
111 | } | 155 | } |
112 | 156 | ||
113 | static void mem2diskdqb(struct v2_disk_dqblk *d, struct mem_dqblk *m, qid_t id) | 157 | static void v2_mem2diskdqb(void *dp, struct dquot *dquot) |
114 | { | 158 | { |
159 | struct v2_disk_dqblk *d = dp; | ||
160 | struct mem_dqblk *m = &dquot->dq_dqb; | ||
161 | struct qtree_mem_dqinfo *info = | ||
162 | sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; | ||
163 | |||
115 | d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit); | 164 | d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit); |
116 | d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit); | 165 | d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit); |
117 | d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes); | 166 | d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes); |
118 | d->dqb_itime = cpu_to_le64(m->dqb_itime); | 167 | d->dqb_itime = cpu_to_le64(m->dqb_itime); |
119 | d->dqb_bhardlimit = cpu_to_le32(m->dqb_bhardlimit); | 168 | d->dqb_bhardlimit = cpu_to_le32(v2_stoqb(m->dqb_bhardlimit)); |
120 | d->dqb_bsoftlimit = cpu_to_le32(m->dqb_bsoftlimit); | 169 | d->dqb_bsoftlimit = cpu_to_le32(v2_stoqb(m->dqb_bsoftlimit)); |
121 | d->dqb_curspace = cpu_to_le64(m->dqb_curspace); | 170 | d->dqb_curspace = cpu_to_le64(m->dqb_curspace); |
122 | d->dqb_btime = cpu_to_le64(m->dqb_btime); | 171 | d->dqb_btime = cpu_to_le64(m->dqb_btime); |
123 | d->dqb_id = cpu_to_le32(id); | 172 | d->dqb_id = cpu_to_le32(dquot->dq_id); |
124 | } | 173 | if (qtree_entry_unused(info, dp)) |
125 | 174 | d->dqb_itime = cpu_to_le64(1); | |
126 | static dqbuf_t getdqbuf(void) | ||
127 | { | ||
128 | dqbuf_t buf = kmalloc(V2_DQBLKSIZE, GFP_NOFS); | ||
129 | if (!buf) | ||
130 | printk(KERN_WARNING "VFS: Not enough memory for quota buffers.\n"); | ||
131 | return buf; | ||
132 | } | ||
133 | |||
134 | static inline void freedqbuf(dqbuf_t buf) | ||
135 | { | ||
136 | kfree(buf); | ||
137 | } | ||
138 | |||
139 | static inline ssize_t read_blk(struct super_block *sb, int type, uint blk, dqbuf_t buf) | ||
140 | { | ||
141 | memset(buf, 0, V2_DQBLKSIZE); | ||
142 | return sb->s_op->quota_read(sb, type, (char *)buf, | ||
143 | V2_DQBLKSIZE, blk << V2_DQBLKSIZE_BITS); | ||
144 | } | ||
145 | |||
146 | static inline ssize_t write_blk(struct super_block *sb, int type, uint blk, dqbuf_t buf) | ||
147 | { | ||
148 | return sb->s_op->quota_write(sb, type, (char *)buf, | ||
149 | V2_DQBLKSIZE, blk << V2_DQBLKSIZE_BITS); | ||
150 | } | ||
151 | |||
152 | /* Remove empty block from list and return it */ | ||
153 | static int get_free_dqblk(struct super_block *sb, int type) | ||
154 | { | ||
155 | dqbuf_t buf = getdqbuf(); | ||
156 | struct mem_dqinfo *info = sb_dqinfo(sb, type); | ||
157 | struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf; | ||
158 | int ret, blk; | ||
159 | |||
160 | if (!buf) | ||
161 | return -ENOMEM; | ||
162 | if (info->u.v2_i.dqi_free_blk) { | ||
163 | blk = info->u.v2_i.dqi_free_blk; | ||
164 | if ((ret = read_blk(sb, type, blk, buf)) < 0) | ||
165 | goto out_buf; | ||
166 | info->u.v2_i.dqi_free_blk = le32_to_cpu(dh->dqdh_next_free); | ||
167 | } | ||
168 | else { | ||
169 | memset(buf, 0, V2_DQBLKSIZE); | ||
170 | /* Assure block allocation... */ | ||
171 | if ((ret = write_blk(sb, type, info->u.v2_i.dqi_blocks, buf)) < 0) | ||
172 | goto out_buf; | ||
173 | blk = info->u.v2_i.dqi_blocks++; | ||
174 | } | ||
175 | mark_info_dirty(sb, type); | ||
176 | ret = blk; | ||
177 | out_buf: | ||
178 | freedqbuf(buf); | ||
179 | return ret; | ||
180 | } | ||
181 | |||
182 | /* Insert empty block to the list */ | ||
183 | static int put_free_dqblk(struct super_block *sb, int type, dqbuf_t buf, uint blk) | ||
184 | { | ||
185 | struct mem_dqinfo *info = sb_dqinfo(sb, type); | ||
186 | struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf; | ||
187 | int err; | ||
188 | |||
189 | dh->dqdh_next_free = cpu_to_le32(info->u.v2_i.dqi_free_blk); | ||
190 | dh->dqdh_prev_free = cpu_to_le32(0); | ||
191 | dh->dqdh_entries = cpu_to_le16(0); | ||
192 | info->u.v2_i.dqi_free_blk = blk; | ||
193 | mark_info_dirty(sb, type); | ||
194 | /* Some strange block. We had better leave it... */ | ||
195 | if ((err = write_blk(sb, type, blk, buf)) < 0) | ||
196 | return err; | ||
197 | return 0; | ||
198 | } | 175 | } |
199 | 176 | ||
200 | /* Remove given block from the list of blocks with free entries */ | 177 | static int v2_is_id(void *dp, struct dquot *dquot) |
201 | static int remove_free_dqentry(struct super_block *sb, int type, dqbuf_t buf, uint blk) | ||
202 | { | 178 | { |
203 | dqbuf_t tmpbuf = getdqbuf(); | 179 | struct v2_disk_dqblk *d = dp; |
204 | struct mem_dqinfo *info = sb_dqinfo(sb, type); | 180 | struct qtree_mem_dqinfo *info = |
205 | struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf; | 181 | sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; |
206 | uint nextblk = le32_to_cpu(dh->dqdh_next_free), prevblk = le32_to_cpu(dh->dqdh_prev_free); | ||
207 | int err; | ||
208 | 182 | ||
209 | if (!tmpbuf) | 183 | if (qtree_entry_unused(info, dp)) |
210 | return -ENOMEM; | ||
211 | if (nextblk) { | ||
212 | if ((err = read_blk(sb, type, nextblk, tmpbuf)) < 0) | ||
213 | goto out_buf; | ||
214 | ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = dh->dqdh_prev_free; | ||
215 | if ((err = write_blk(sb, type, nextblk, tmpbuf)) < 0) | ||
216 | goto out_buf; | ||
217 | } | ||
218 | if (prevblk) { | ||
219 | if ((err = read_blk(sb, type, prevblk, tmpbuf)) < 0) | ||
220 | goto out_buf; | ||
221 | ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_next_free = dh->dqdh_next_free; | ||
222 | if ((err = write_blk(sb, type, prevblk, tmpbuf)) < 0) | ||
223 | goto out_buf; | ||
224 | } | ||
225 | else { | ||
226 | info->u.v2_i.dqi_free_entry = nextblk; | ||
227 | mark_info_dirty(sb, type); | ||
228 | } | ||
229 | freedqbuf(tmpbuf); | ||
230 | dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0); | ||
231 | /* No matter whether write succeeds block is out of list */ | ||
232 | if (write_blk(sb, type, blk, buf) < 0) | ||
233 | printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk); | ||
234 | return 0; | ||
235 | out_buf: | ||
236 | freedqbuf(tmpbuf); | ||
237 | return err; | ||
238 | } | ||
239 | |||
240 | /* Insert given block to the beginning of list with free entries */ | ||
241 | static int insert_free_dqentry(struct super_block *sb, int type, dqbuf_t buf, uint blk) | ||
242 | { | ||
243 | dqbuf_t tmpbuf = getdqbuf(); | ||
244 | struct mem_dqinfo *info = sb_dqinfo(sb, type); | ||
245 | struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf; | ||
246 | int err; | ||
247 | |||
248 | if (!tmpbuf) | ||
249 | return -ENOMEM; | ||
250 | dh->dqdh_next_free = cpu_to_le32(info->u.v2_i.dqi_free_entry); | ||
251 | dh->dqdh_prev_free = cpu_to_le32(0); | ||
252 | if ((err = write_blk(sb, type, blk, buf)) < 0) | ||
253 | goto out_buf; | ||
254 | if (info->u.v2_i.dqi_free_entry) { | ||
255 | if ((err = read_blk(sb, type, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0) | ||
256 | goto out_buf; | ||
257 | ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = cpu_to_le32(blk); | ||
258 | if ((err = write_blk(sb, type, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0) | ||
259 | goto out_buf; | ||
260 | } | ||
261 | freedqbuf(tmpbuf); | ||
262 | info->u.v2_i.dqi_free_entry = blk; | ||
263 | mark_info_dirty(sb, type); | ||
264 | return 0; | ||
265 | out_buf: | ||
266 | freedqbuf(tmpbuf); | ||
267 | return err; | ||
268 | } | ||
269 | |||
270 | /* Find space for dquot */ | ||
271 | static uint find_free_dqentry(struct dquot *dquot, int *err) | ||
272 | { | ||
273 | struct super_block *sb = dquot->dq_sb; | ||
274 | struct mem_dqinfo *info = sb_dqopt(sb)->info+dquot->dq_type; | ||
275 | uint blk, i; | ||
276 | struct v2_disk_dqdbheader *dh; | ||
277 | struct v2_disk_dqblk *ddquot; | ||
278 | struct v2_disk_dqblk fakedquot; | ||
279 | dqbuf_t buf; | ||
280 | |||
281 | *err = 0; | ||
282 | if (!(buf = getdqbuf())) { | ||
283 | *err = -ENOMEM; | ||
284 | return 0; | 184 | return 0; |
285 | } | 185 | return le32_to_cpu(d->dqb_id) == dquot->dq_id; |
286 | dh = (struct v2_disk_dqdbheader *)buf; | ||
287 | ddquot = GETENTRIES(buf); | ||
288 | if (info->u.v2_i.dqi_free_entry) { | ||
289 | blk = info->u.v2_i.dqi_free_entry; | ||
290 | if ((*err = read_blk(sb, dquot->dq_type, blk, buf)) < 0) | ||
291 | goto out_buf; | ||
292 | } | ||
293 | else { | ||
294 | blk = get_free_dqblk(sb, dquot->dq_type); | ||
295 | if ((int)blk < 0) { | ||
296 | *err = blk; | ||
297 | freedqbuf(buf); | ||
298 | return 0; | ||
299 | } | ||
300 | memset(buf, 0, V2_DQBLKSIZE); | ||
301 | /* This is enough as block is already zeroed and entry list is empty... */ | ||
302 | info->u.v2_i.dqi_free_entry = blk; | ||
303 | mark_info_dirty(sb, dquot->dq_type); | ||
304 | } | ||
305 | if (le16_to_cpu(dh->dqdh_entries)+1 >= V2_DQSTRINBLK) /* Block will be full? */ | ||
306 | if ((*err = remove_free_dqentry(sb, dquot->dq_type, buf, blk)) < 0) { | ||
307 | printk(KERN_ERR "VFS: find_free_dqentry(): Can't remove block (%u) from entry free list.\n", blk); | ||
308 | goto out_buf; | ||
309 | } | ||
310 | le16_add_cpu(&dh->dqdh_entries, 1); | ||
311 | memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk)); | ||
312 | /* Find free structure in block */ | ||
313 | for (i = 0; i < V2_DQSTRINBLK && memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)); i++); | ||
314 | #ifdef __QUOTA_V2_PARANOIA | ||
315 | if (i == V2_DQSTRINBLK) { | ||
316 | printk(KERN_ERR "VFS: find_free_dqentry(): Data block full but it shouldn't.\n"); | ||
317 | *err = -EIO; | ||
318 | goto out_buf; | ||
319 | } | ||
320 | #endif | ||
321 | if ((*err = write_blk(sb, dquot->dq_type, blk, buf)) < 0) { | ||
322 | printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota data block %u.\n", blk); | ||
323 | goto out_buf; | ||
324 | } | ||
325 | dquot->dq_off = (blk<<V2_DQBLKSIZE_BITS)+sizeof(struct v2_disk_dqdbheader)+i*sizeof(struct v2_disk_dqblk); | ||
326 | freedqbuf(buf); | ||
327 | return blk; | ||
328 | out_buf: | ||
329 | freedqbuf(buf); | ||
330 | return 0; | ||
331 | } | ||
332 | |||
333 | /* Insert reference to structure into the trie */ | ||
334 | static int do_insert_tree(struct dquot *dquot, uint *treeblk, int depth) | ||
335 | { | ||
336 | struct super_block *sb = dquot->dq_sb; | ||
337 | dqbuf_t buf; | ||
338 | int ret = 0, newson = 0, newact = 0; | ||
339 | __le32 *ref; | ||
340 | uint newblk; | ||
341 | |||
342 | if (!(buf = getdqbuf())) | ||
343 | return -ENOMEM; | ||
344 | if (!*treeblk) { | ||
345 | ret = get_free_dqblk(sb, dquot->dq_type); | ||
346 | if (ret < 0) | ||
347 | goto out_buf; | ||
348 | *treeblk = ret; | ||
349 | memset(buf, 0, V2_DQBLKSIZE); | ||
350 | newact = 1; | ||
351 | } | ||
352 | else { | ||
353 | if ((ret = read_blk(sb, dquot->dq_type, *treeblk, buf)) < 0) { | ||
354 | printk(KERN_ERR "VFS: Can't read tree quota block %u.\n", *treeblk); | ||
355 | goto out_buf; | ||
356 | } | ||
357 | } | ||
358 | ref = (__le32 *)buf; | ||
359 | newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]); | ||
360 | if (!newblk) | ||
361 | newson = 1; | ||
362 | if (depth == V2_DQTREEDEPTH-1) { | ||
363 | #ifdef __QUOTA_V2_PARANOIA | ||
364 | if (newblk) { | ||
365 | printk(KERN_ERR "VFS: Inserting already present quota entry (block %u).\n", le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)])); | ||
366 | ret = -EIO; | ||
367 | goto out_buf; | ||
368 | } | ||
369 | #endif | ||
370 | newblk = find_free_dqentry(dquot, &ret); | ||
371 | } | ||
372 | else | ||
373 | ret = do_insert_tree(dquot, &newblk, depth+1); | ||
374 | if (newson && ret >= 0) { | ||
375 | ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(newblk); | ||
376 | ret = write_blk(sb, dquot->dq_type, *treeblk, buf); | ||
377 | } | ||
378 | else if (newact && ret < 0) | ||
379 | put_free_dqblk(sb, dquot->dq_type, buf, *treeblk); | ||
380 | out_buf: | ||
381 | freedqbuf(buf); | ||
382 | return ret; | ||
383 | } | 186 | } |
384 | 187 | ||
385 | /* Wrapper for inserting quota structure into tree */ | 188 | static int v2_read_dquot(struct dquot *dquot) |
386 | static inline int dq_insert_tree(struct dquot *dquot) | ||
387 | { | 189 | { |
388 | int tmp = V2_DQTREEOFF; | 190 | return qtree_read_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot); |
389 | return do_insert_tree(dquot, &tmp, 0); | ||
390 | } | 191 | } |
391 | 192 | ||
392 | /* | ||
393 | * We don't have to be afraid of deadlocks as we never have quotas on quota files... | ||
394 | */ | ||
395 | static int v2_write_dquot(struct dquot *dquot) | 193 | static int v2_write_dquot(struct dquot *dquot) |
396 | { | 194 | { |
397 | int type = dquot->dq_type; | 195 | return qtree_write_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot); |
398 | ssize_t ret; | ||
399 | struct v2_disk_dqblk ddquot, empty; | ||
400 | |||
401 | /* dq_off is guarded by dqio_mutex */ | ||
402 | if (!dquot->dq_off) | ||
403 | if ((ret = dq_insert_tree(dquot)) < 0) { | ||
404 | printk(KERN_ERR "VFS: Error %zd occurred while creating quota.\n", ret); | ||
405 | return ret; | ||
406 | } | ||
407 | spin_lock(&dq_data_lock); | ||
408 | mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id); | ||
409 | /* Argh... We may need to write structure full of zeroes but that would be | ||
410 | * treated as an empty place by the rest of the code. Format change would | ||
411 | * be definitely cleaner but the problems probably are not worth it */ | ||
412 | memset(&empty, 0, sizeof(struct v2_disk_dqblk)); | ||
413 | if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk))) | ||
414 | ddquot.dqb_itime = cpu_to_le64(1); | ||
415 | spin_unlock(&dq_data_lock); | ||
416 | ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type, | ||
417 | (char *)&ddquot, sizeof(struct v2_disk_dqblk), dquot->dq_off); | ||
418 | if (ret != sizeof(struct v2_disk_dqblk)) { | ||
419 | printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", dquot->dq_sb->s_id); | ||
420 | if (ret >= 0) | ||
421 | ret = -ENOSPC; | ||
422 | } | ||
423 | else | ||
424 | ret = 0; | ||
425 | dqstats.writes++; | ||
426 | |||
427 | return ret; | ||
428 | } | 196 | } |
429 | 197 | ||
430 | /* Free dquot entry in data block */ | 198 | static int v2_release_dquot(struct dquot *dquot) |
431 | static int free_dqentry(struct dquot *dquot, uint blk) | ||
432 | { | ||
433 | struct super_block *sb = dquot->dq_sb; | ||
434 | int type = dquot->dq_type; | ||
435 | struct v2_disk_dqdbheader *dh; | ||
436 | dqbuf_t buf = getdqbuf(); | ||
437 | int ret = 0; | ||
438 | |||
439 | if (!buf) | ||
440 | return -ENOMEM; | ||
441 | if (dquot->dq_off >> V2_DQBLKSIZE_BITS != blk) { | ||
442 | printk(KERN_ERR "VFS: Quota structure has offset to other " | ||
443 | "block (%u) than it should (%u).\n", blk, | ||
444 | (uint)(dquot->dq_off >> V2_DQBLKSIZE_BITS)); | ||
445 | goto out_buf; | ||
446 | } | ||
447 | if ((ret = read_blk(sb, type, blk, buf)) < 0) { | ||
448 | printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk); | ||
449 | goto out_buf; | ||
450 | } | ||
451 | dh = (struct v2_disk_dqdbheader *)buf; | ||
452 | le16_add_cpu(&dh->dqdh_entries, -1); | ||
453 | if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */ | ||
454 | if ((ret = remove_free_dqentry(sb, type, buf, blk)) < 0 || | ||
455 | (ret = put_free_dqblk(sb, type, buf, blk)) < 0) { | ||
456 | printk(KERN_ERR "VFS: Can't move quota data block (%u) " | ||
457 | "to free list.\n", blk); | ||
458 | goto out_buf; | ||
459 | } | ||
460 | } | ||
461 | else { | ||
462 | memset(buf+(dquot->dq_off & ((1 << V2_DQBLKSIZE_BITS)-1)), 0, | ||
463 | sizeof(struct v2_disk_dqblk)); | ||
464 | if (le16_to_cpu(dh->dqdh_entries) == V2_DQSTRINBLK-1) { | ||
465 | /* Insert will write block itself */ | ||
466 | if ((ret = insert_free_dqentry(sb, type, buf, blk)) < 0) { | ||
467 | printk(KERN_ERR "VFS: Can't insert quota data block (%u) to free entry list.\n", blk); | ||
468 | goto out_buf; | ||
469 | } | ||
470 | } | ||
471 | else | ||
472 | if ((ret = write_blk(sb, type, blk, buf)) < 0) { | ||
473 | printk(KERN_ERR "VFS: Can't write quota data " | ||
474 | "block %u\n", blk); | ||
475 | goto out_buf; | ||
476 | } | ||
477 | } | ||
478 | dquot->dq_off = 0; /* Quota is now unattached */ | ||
479 | out_buf: | ||
480 | freedqbuf(buf); | ||
481 | return ret; | ||
482 | } | ||
483 | |||
484 | /* Remove reference to dquot from tree */ | ||
485 | static int remove_tree(struct dquot *dquot, uint *blk, int depth) | ||
486 | { | ||
487 | struct super_block *sb = dquot->dq_sb; | ||
488 | int type = dquot->dq_type; | ||
489 | dqbuf_t buf = getdqbuf(); | ||
490 | int ret = 0; | ||
491 | uint newblk; | ||
492 | __le32 *ref = (__le32 *)buf; | ||
493 | |||
494 | if (!buf) | ||
495 | return -ENOMEM; | ||
496 | if ((ret = read_blk(sb, type, *blk, buf)) < 0) { | ||
497 | printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk); | ||
498 | goto out_buf; | ||
499 | } | ||
500 | newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]); | ||
501 | if (depth == V2_DQTREEDEPTH-1) { | ||
502 | ret = free_dqentry(dquot, newblk); | ||
503 | newblk = 0; | ||
504 | } | ||
505 | else | ||
506 | ret = remove_tree(dquot, &newblk, depth+1); | ||
507 | if (ret >= 0 && !newblk) { | ||
508 | int i; | ||
509 | ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(0); | ||
510 | for (i = 0; i < V2_DQBLKSIZE && !buf[i]; i++); /* Block got empty? */ | ||
511 | /* Don't put the root block into the free block list */ | ||
512 | if (i == V2_DQBLKSIZE && *blk != V2_DQTREEOFF) { | ||
513 | put_free_dqblk(sb, type, buf, *blk); | ||
514 | *blk = 0; | ||
515 | } | ||
516 | else | ||
517 | if ((ret = write_blk(sb, type, *blk, buf)) < 0) | ||
518 | printk(KERN_ERR "VFS: Can't write quota tree " | ||
519 | "block %u.\n", *blk); | ||
520 | } | ||
521 | out_buf: | ||
522 | freedqbuf(buf); | ||
523 | return ret; | ||
524 | } | ||
525 | |||
526 | /* Delete dquot from tree */ | ||
527 | static int v2_delete_dquot(struct dquot *dquot) | ||
528 | { | ||
529 | uint tmp = V2_DQTREEOFF; | ||
530 | |||
531 | if (!dquot->dq_off) /* Even not allocated? */ | ||
532 | return 0; | ||
533 | return remove_tree(dquot, &tmp, 0); | ||
534 | } | ||
535 | |||
536 | /* Find entry in block */ | ||
537 | static loff_t find_block_dqentry(struct dquot *dquot, uint blk) | ||
538 | { | ||
539 | dqbuf_t buf = getdqbuf(); | ||
540 | loff_t ret = 0; | ||
541 | int i; | ||
542 | struct v2_disk_dqblk *ddquot = GETENTRIES(buf); | ||
543 | |||
544 | if (!buf) | ||
545 | return -ENOMEM; | ||
546 | if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) { | ||
547 | printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); | ||
548 | goto out_buf; | ||
549 | } | ||
550 | if (dquot->dq_id) | ||
551 | for (i = 0; i < V2_DQSTRINBLK && | ||
552 | le32_to_cpu(ddquot[i].dqb_id) != dquot->dq_id; i++); | ||
553 | else { /* ID 0 as a bit more complicated searching... */ | ||
554 | struct v2_disk_dqblk fakedquot; | ||
555 | |||
556 | memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk)); | ||
557 | for (i = 0; i < V2_DQSTRINBLK; i++) | ||
558 | if (!le32_to_cpu(ddquot[i].dqb_id) && | ||
559 | memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk))) | ||
560 | break; | ||
561 | } | ||
562 | if (i == V2_DQSTRINBLK) { | ||
563 | printk(KERN_ERR "VFS: Quota for id %u referenced " | ||
564 | "but not present.\n", dquot->dq_id); | ||
565 | ret = -EIO; | ||
566 | goto out_buf; | ||
567 | } | ||
568 | else | ||
569 | ret = (blk << V2_DQBLKSIZE_BITS) + sizeof(struct | ||
570 | v2_disk_dqdbheader) + i * sizeof(struct v2_disk_dqblk); | ||
571 | out_buf: | ||
572 | freedqbuf(buf); | ||
573 | return ret; | ||
574 | } | ||
575 | |||
576 | /* Find entry for given id in the tree */ | ||
577 | static loff_t find_tree_dqentry(struct dquot *dquot, uint blk, int depth) | ||
578 | { | ||
579 | dqbuf_t buf = getdqbuf(); | ||
580 | loff_t ret = 0; | ||
581 | __le32 *ref = (__le32 *)buf; | ||
582 | |||
583 | if (!buf) | ||
584 | return -ENOMEM; | ||
585 | if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) { | ||
586 | printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); | ||
587 | goto out_buf; | ||
588 | } | ||
589 | ret = 0; | ||
590 | blk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]); | ||
591 | if (!blk) /* No reference? */ | ||
592 | goto out_buf; | ||
593 | if (depth < V2_DQTREEDEPTH-1) | ||
594 | ret = find_tree_dqentry(dquot, blk, depth+1); | ||
595 | else | ||
596 | ret = find_block_dqentry(dquot, blk); | ||
597 | out_buf: | ||
598 | freedqbuf(buf); | ||
599 | return ret; | ||
600 | } | ||
601 | |||
602 | /* Find entry for given id in the tree - wrapper function */ | ||
603 | static inline loff_t find_dqentry(struct dquot *dquot) | ||
604 | { | ||
605 | return find_tree_dqentry(dquot, V2_DQTREEOFF, 0); | ||
606 | } | ||
607 | |||
608 | static int v2_read_dquot(struct dquot *dquot) | ||
609 | { | 199 | { |
610 | int type = dquot->dq_type; | 200 | return qtree_release_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot); |
611 | loff_t offset; | ||
612 | struct v2_disk_dqblk ddquot, empty; | ||
613 | int ret = 0; | ||
614 | |||
615 | #ifdef __QUOTA_V2_PARANOIA | ||
616 | /* Invalidated quota? */ | ||
617 | if (!dquot->dq_sb || !sb_dqopt(dquot->dq_sb)->files[type]) { | ||
618 | printk(KERN_ERR "VFS: Quota invalidated while reading!\n"); | ||
619 | return -EIO; | ||
620 | } | ||
621 | #endif | ||
622 | offset = find_dqentry(dquot); | ||
623 | if (offset <= 0) { /* Entry not present? */ | ||
624 | if (offset < 0) | ||
625 | printk(KERN_ERR "VFS: Can't read quota " | ||
626 | "structure for id %u.\n", dquot->dq_id); | ||
627 | dquot->dq_off = 0; | ||
628 | set_bit(DQ_FAKE_B, &dquot->dq_flags); | ||
629 | memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); | ||
630 | ret = offset; | ||
631 | } | ||
632 | else { | ||
633 | dquot->dq_off = offset; | ||
634 | if ((ret = dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type, | ||
635 | (char *)&ddquot, sizeof(struct v2_disk_dqblk), offset)) | ||
636 | != sizeof(struct v2_disk_dqblk)) { | ||
637 | if (ret >= 0) | ||
638 | ret = -EIO; | ||
639 | printk(KERN_ERR "VFS: Error while reading quota " | ||
640 | "structure for id %u.\n", dquot->dq_id); | ||
641 | memset(&ddquot, 0, sizeof(struct v2_disk_dqblk)); | ||
642 | } | ||
643 | else { | ||
644 | ret = 0; | ||
645 | /* We need to escape back all-zero structure */ | ||
646 | memset(&empty, 0, sizeof(struct v2_disk_dqblk)); | ||
647 | empty.dqb_itime = cpu_to_le64(1); | ||
648 | if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk))) | ||
649 | ddquot.dqb_itime = 0; | ||
650 | } | ||
651 | disk2memdqb(&dquot->dq_dqb, &ddquot); | ||
652 | if (!dquot->dq_dqb.dqb_bhardlimit && | ||
653 | !dquot->dq_dqb.dqb_bsoftlimit && | ||
654 | !dquot->dq_dqb.dqb_ihardlimit && | ||
655 | !dquot->dq_dqb.dqb_isoftlimit) | ||
656 | set_bit(DQ_FAKE_B, &dquot->dq_flags); | ||
657 | } | ||
658 | dqstats.reads++; | ||
659 | |||
660 | return ret; | ||
661 | } | 201 | } |
662 | 202 | ||
663 | /* Check whether dquot should not be deleted. We know we are | 203 | static int v2_free_file_info(struct super_block *sb, int type) |
664 | * the only one operating on dquot (thanks to dq_lock) */ | ||
665 | static int v2_release_dquot(struct dquot *dquot) | ||
666 | { | 204 | { |
667 | if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace)) | 205 | kfree(sb_dqinfo(sb, type)->dqi_priv); |
668 | return v2_delete_dquot(dquot); | ||
669 | return 0; | 206 | return 0; |
670 | } | 207 | } |
671 | 208 | ||
@@ -673,7 +210,7 @@ static struct quota_format_ops v2_format_ops = { | |||
673 | .check_quota_file = v2_check_quota_file, | 210 | .check_quota_file = v2_check_quota_file, |
674 | .read_file_info = v2_read_file_info, | 211 | .read_file_info = v2_read_file_info, |
675 | .write_file_info = v2_write_file_info, | 212 | .write_file_info = v2_write_file_info, |
676 | .free_file_info = NULL, | 213 | .free_file_info = v2_free_file_info, |
677 | .read_dqblk = v2_read_dquot, | 214 | .read_dqblk = v2_read_dquot, |
678 | .commit_dqblk = v2_write_dquot, | 215 | .commit_dqblk = v2_write_dquot, |
679 | .release_dqblk = v2_release_dquot, | 216 | .release_dqblk = v2_release_dquot, |
diff --git a/fs/quotaio_v1.h b/fs/quotaio_v1.h new file mode 100644 index 000000000000..746654b5de70 --- /dev/null +++ b/fs/quotaio_v1.h | |||
@@ -0,0 +1,33 @@ | |||
1 | #ifndef _LINUX_QUOTAIO_V1_H | ||
2 | #define _LINUX_QUOTAIO_V1_H | ||
3 | |||
4 | #include <linux/types.h> | ||
5 | |||
6 | /* | ||
7 | * The following constants define the amount of time given a user | ||
8 | * before the soft limits are treated as hard limits (usually resulting | ||
9 | * in an allocation failure). The timer is started when the user crosses | ||
10 | * their soft limit, it is reset when they go below their soft limit. | ||
11 | */ | ||
12 | #define MAX_IQ_TIME 604800 /* (7*24*60*60) 1 week */ | ||
13 | #define MAX_DQ_TIME 604800 /* (7*24*60*60) 1 week */ | ||
14 | |||
15 | /* | ||
16 | * The following structure defines the format of the disk quota file | ||
17 | * (as it appears on disk) - the file is an array of these structures | ||
18 | * indexed by user or group number. | ||
19 | */ | ||
20 | struct v1_disk_dqblk { | ||
21 | __u32 dqb_bhardlimit; /* absolute limit on disk blks alloc */ | ||
22 | __u32 dqb_bsoftlimit; /* preferred limit on disk blks */ | ||
23 | __u32 dqb_curblocks; /* current block count */ | ||
24 | __u32 dqb_ihardlimit; /* absolute limit on allocated inodes */ | ||
25 | __u32 dqb_isoftlimit; /* preferred inode limit */ | ||
26 | __u32 dqb_curinodes; /* current # allocated inodes */ | ||
27 | time_t dqb_btime; /* time limit for excessive disk use */ | ||
28 | time_t dqb_itime; /* time limit for excessive inode use */ | ||
29 | }; | ||
30 | |||
31 | #define v1_dqoff(UID) ((loff_t)((UID) * sizeof (struct v1_disk_dqblk))) | ||
32 | |||
33 | #endif /* _LINUX_QUOTAIO_V1_H */ | ||
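Unlike the tree-based v2 layout, the v1 quota file is a flat array indexed by id, so v1_dqoff() gives the record position directly. A hedged sketch of how a read would use it (the surrounding sb, type and dquot variables are assumed here, not part of the patch):

	struct v1_disk_dqblk dqblk;
	loff_t off = v1_dqoff(dquot->dq_id);  /* id * sizeof(struct v1_disk_dqblk) */

	sb->s_op->quota_read(sb, type, (char *)&dqblk,
			     sizeof(struct v1_disk_dqblk), off);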
diff --git a/fs/quotaio_v2.h b/fs/quotaio_v2.h new file mode 100644 index 000000000000..530fe580685c --- /dev/null +++ b/fs/quotaio_v2.h | |||
@@ -0,0 +1,60 @@ | |||
1 | /* | ||
2 | * Definitions of structures for vfsv0 quota format | ||
3 | */ | ||
4 | |||
5 | #ifndef _LINUX_QUOTAIO_V2_H | ||
6 | #define _LINUX_QUOTAIO_V2_H | ||
7 | |||
8 | #include <linux/types.h> | ||
9 | #include <linux/quota.h> | ||
10 | |||
11 | /* | ||
12 | * Definitions of magics and versions of current quota files | ||
13 | */ | ||
14 | #define V2_INITQMAGICS {\ | ||
15 | 0xd9c01f11, /* USRQUOTA */\ | ||
16 | 0xd9c01927 /* GRPQUOTA */\ | ||
17 | } | ||
18 | |||
19 | #define V2_INITQVERSIONS {\ | ||
20 | 0, /* USRQUOTA */\ | ||
21 | 0 /* GRPQUOTA */\ | ||
22 | } | ||
23 | |||
24 | /* First generic header */ | ||
25 | struct v2_disk_dqheader { | ||
26 | __le32 dqh_magic; /* Magic number identifying file */ | ||
27 | __le32 dqh_version; /* File version */ | ||
28 | }; | ||
29 | |||
30 | /* | ||
31 | * The following structure defines the format of the disk quota file | ||
32 | * (as it appears on disk) - the file is a radix tree whose leaves point | ||
33 | * to blocks of these structures. | ||
34 | */ | ||
35 | struct v2_disk_dqblk { | ||
36 | __le32 dqb_id; /* id this quota applies to */ | ||
37 | __le32 dqb_ihardlimit; /* absolute limit on allocated inodes */ | ||
38 | __le32 dqb_isoftlimit; /* preferred inode limit */ | ||
39 | __le32 dqb_curinodes; /* current # allocated inodes */ | ||
40 | __le32 dqb_bhardlimit; /* absolute limit on disk space (in QUOTABLOCK_SIZE) */ | ||
41 | __le32 dqb_bsoftlimit; /* preferred limit on disk space (in QUOTABLOCK_SIZE) */ | ||
42 | __le64 dqb_curspace; /* current space occupied (in bytes) */ | ||
43 | __le64 dqb_btime; /* time limit for excessive disk use */ | ||
44 | __le64 dqb_itime; /* time limit for excessive inode use */ | ||
45 | }; | ||
46 | |||
47 | /* Header with type and version specific information */ | ||
48 | struct v2_disk_dqinfo { | ||
49 | __le32 dqi_bgrace; /* Time before block soft limit becomes hard limit */ | ||
50 | __le32 dqi_igrace; /* Time before inode soft limit becomes hard limit */ | ||
51 | __le32 dqi_flags; /* Flags for quotafile (DQF_*) */ | ||
52 | __le32 dqi_blocks; /* Number of blocks in file */ | ||
53 | __le32 dqi_free_blk; /* Number of first free block in the list */ | ||
54 | __le32 dqi_free_entry; /* Number of block with at least one free entry */ | ||
55 | }; | ||
56 | |||
57 | #define V2_DQINFOOFF sizeof(struct v2_disk_dqheader) /* Offset of info header in file */ | ||
58 | #define V2_DQBLKSIZE_BITS 10 /* Size of leaf block in tree */ | ||
59 | |||
60 | #endif /* _LINUX_QUOTAIO_V2_H */ | ||
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index a83a3518ae33..b7e6ac706b87 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c | |||
@@ -57,7 +57,6 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev) | |||
57 | inode->i_mode = mode; | 57 | inode->i_mode = mode; |
58 | inode->i_uid = current_fsuid(); | 58 | inode->i_uid = current_fsuid(); |
59 | inode->i_gid = current_fsgid(); | 59 | inode->i_gid = current_fsgid(); |
60 | inode->i_blocks = 0; | ||
61 | inode->i_mapping->a_ops = &ramfs_aops; | 60 | inode->i_mapping->a_ops = &ramfs_aops; |
62 | inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; | 61 | inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; |
63 | mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); | 62 | mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); |
diff --git a/fs/read_write.c b/fs/read_write.c index 969a6d9c020b..5cc6924eb158 100644 --- a/fs/read_write.c +++ b/fs/read_write.c | |||
@@ -50,6 +50,14 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) | |||
50 | offset += inode->i_size; | 50 | offset += inode->i_size; |
51 | break; | 51 | break; |
52 | case SEEK_CUR: | 52 | case SEEK_CUR: |
53 | /* | ||
54 | * Here we special-case the lseek(fd, 0, SEEK_CUR) | ||
55 | * position-querying operation. Avoid rewriting the "same" | ||
56 | * f_pos value back to the file because a concurrent read(), | ||
57 | * write() or lseek() might have altered it | ||
58 | */ | ||
59 | if (offset == 0) | ||
60 | return file->f_pos; | ||
53 | offset += file->f_pos; | 61 | offset += file->f_pos; |
54 | break; | 62 | break; |
55 | } | 63 | } |
@@ -105,6 +113,10 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin) | |||
105 | offset += i_size_read(file->f_path.dentry->d_inode); | 113 | offset += i_size_read(file->f_path.dentry->d_inode); |
106 | break; | 114 | break; |
107 | case SEEK_CUR: | 115 | case SEEK_CUR: |
116 | if (offset == 0) { | ||
117 | retval = file->f_pos; | ||
118 | goto out; | ||
119 | } | ||
108 | offset += file->f_pos; | 120 | offset += file->f_pos; |
109 | } | 121 | } |
110 | retval = -EINVAL; | 122 | retval = -EINVAL; |
@@ -115,6 +127,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin) | |||
115 | } | 127 | } |
116 | retval = offset; | 128 | retval = offset; |
117 | } | 129 | } |
130 | out: | ||
118 | unlock_kernel(); | 131 | unlock_kernel(); |
119 | return retval; | 132 | return retval; |
120 | } | 133 | } |
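Editorial note: the two hunks above short-circuit the common lseek(fd, 0, SEEK_CUR) call, which only queries the current position, so the kernel no longer writes the "same" f_pos back and races with concurrent readers or writers. A hedged userspace sketch of that idiom (the file path is purely illustrative):

	/* lseek(fd, 0, SEEK_CUR) only reports the current offset; the fast
	 * path added above avoids touching f_pos for this case. */
	#include <stdio.h>
	#include <unistd.h>
	#include <fcntl.h>

	int main(void)
	{
		char buf[16];
		int fd = open("/etc/hostname", O_RDONLY);

		if (fd < 0)
			return 1;
		if (read(fd, buf, sizeof(buf)) < 0) {  /* advances f_pos */
			close(fd);
			return 1;
		}
		off_t pos = lseek(fd, 0, SEEK_CUR);    /* query only, no change */
		printf("current offset: %lld\n", (long long)pos);
		close(fd);
		return 0;
	}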
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index ed04f47007f8..55fce92cdf18 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c | |||
@@ -1782,6 +1782,12 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, | |||
1782 | goto out_bad_inode; | 1782 | goto out_bad_inode; |
1783 | } | 1783 | } |
1784 | args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid); | 1784 | args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid); |
1785 | if (old_format_only(sb)) | ||
1786 | make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET, | ||
1787 | TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT); | ||
1788 | else | ||
1789 | make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET, | ||
1790 | TYPE_STAT_DATA, SD_SIZE, MAX_US_INT); | ||
1785 | memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE); | 1791 | memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE); |
1786 | args.dirid = le32_to_cpu(ih.ih_key.k_dir_id); | 1792 | args.dirid = le32_to_cpu(ih.ih_key.k_dir_id); |
1787 | if (insert_inode_locked4(inode, args.objectid, | 1793 | if (insert_inode_locked4(inode, args.objectid, |
@@ -1834,13 +1840,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, | |||
1834 | reiserfs_init_acl_default(inode); | 1840 | reiserfs_init_acl_default(inode); |
1835 | reiserfs_init_xattr_rwsem(inode); | 1841 | reiserfs_init_xattr_rwsem(inode); |
1836 | 1842 | ||
1837 | if (old_format_only(sb)) | ||
1838 | make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET, | ||
1839 | TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT); | ||
1840 | else | ||
1841 | make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET, | ||
1842 | TYPE_STAT_DATA, SD_SIZE, MAX_US_INT); | ||
1843 | |||
1844 | /* key to search for correct place for new stat data */ | 1843 | /* key to search for correct place for new stat data */ |
1845 | _make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id), | 1844 | _make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id), |
1846 | le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET, | 1845 | le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET, |
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 663a91f5dce8..c55651f1407c 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c | |||
@@ -649,6 +649,8 @@ static struct dquot_operations reiserfs_quota_operations = { | |||
649 | .release_dquot = reiserfs_release_dquot, | 649 | .release_dquot = reiserfs_release_dquot, |
650 | .mark_dirty = reiserfs_mark_dquot_dirty, | 650 | .mark_dirty = reiserfs_mark_dquot_dirty, |
651 | .write_info = reiserfs_write_info, | 651 | .write_info = reiserfs_write_info, |
652 | .alloc_dquot = dquot_alloc, | ||
653 | .destroy_dquot = dquot_destroy, | ||
652 | }; | 654 | }; |
653 | 655 | ||
654 | static struct quotactl_ops reiserfs_qctl_operations = { | 656 | static struct quotactl_ops reiserfs_qctl_operations = { |
@@ -994,8 +996,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin | |||
994 | if (c == 'u' || c == 'g') { | 996 | if (c == 'u' || c == 'g') { |
995 | int qtype = c == 'u' ? USRQUOTA : GRPQUOTA; | 997 | int qtype = c == 'u' ? USRQUOTA : GRPQUOTA; |
996 | 998 | ||
997 | if ((sb_any_quota_enabled(s) || | 999 | if (sb_any_quota_loaded(s) && |
998 | sb_any_quota_suspended(s)) && | ||
999 | (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) { | 1000 | (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) { |
1000 | reiserfs_warning(s, | 1001 | reiserfs_warning(s, |
1001 | "reiserfs_parse_options: cannot change journaled quota options when quota turned on."); | 1002 | "reiserfs_parse_options: cannot change journaled quota options when quota turned on."); |
@@ -1041,8 +1042,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin | |||
1041 | "reiserfs_parse_options: unknown quota format specified."); | 1042 | "reiserfs_parse_options: unknown quota format specified."); |
1042 | return 0; | 1043 | return 0; |
1043 | } | 1044 | } |
1044 | if ((sb_any_quota_enabled(s) || | 1045 | if (sb_any_quota_loaded(s) && |
1045 | sb_any_quota_suspended(s)) && | ||
1046 | *qfmt != REISERFS_SB(s)->s_jquota_fmt) { | 1046 | *qfmt != REISERFS_SB(s)->s_jquota_fmt) { |
1047 | reiserfs_warning(s, | 1047 | reiserfs_warning(s, |
1048 | "reiserfs_parse_options: cannot change journaled quota options when quota turned on."); | 1048 | "reiserfs_parse_options: cannot change journaled quota options when quota turned on."); |
@@ -1067,7 +1067,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin | |||
1067 | } | 1067 | } |
1068 | /* This checking is not precise wrt the quota type but for our purposes it is sufficient */ | 1068 | /* This checking is not precise wrt the quota type but for our purposes it is sufficient */ |
1069 | if (!(*mount_options & (1 << REISERFS_QUOTA)) | 1069 | if (!(*mount_options & (1 << REISERFS_QUOTA)) |
1070 | && sb_any_quota_enabled(s)) { | 1070 | && sb_any_quota_loaded(s)) { |
1071 | reiserfs_warning(s, | 1071 | reiserfs_warning(s, |
1072 | "reiserfs_parse_options: quota options must be present when quota is turned on."); | 1072 | "reiserfs_parse_options: quota options must be present when quota is turned on."); |
1073 | return 0; | 1073 | return 0; |
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c index 60d2f822e87b..98a232f7196b 100644 --- a/fs/romfs/inode.c +++ b/fs/romfs/inode.c | |||
@@ -490,7 +490,7 @@ static mode_t romfs_modemap[] = | |||
490 | static struct inode * | 490 | static struct inode * |
491 | romfs_iget(struct super_block *sb, unsigned long ino) | 491 | romfs_iget(struct super_block *sb, unsigned long ino) |
492 | { | 492 | { |
493 | int nextfh; | 493 | int nextfh, ret; |
494 | struct romfs_inode ri; | 494 | struct romfs_inode ri; |
495 | struct inode *i; | 495 | struct inode *i; |
496 | 496 | ||
@@ -524,14 +524,13 @@ romfs_iget(struct super_block *sb, unsigned long ino) | |||
524 | i->i_size = be32_to_cpu(ri.size); | 524 | i->i_size = be32_to_cpu(ri.size); |
525 | i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0; | 525 | i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0; |
526 | i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0; | 526 | i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0; |
527 | i->i_uid = i->i_gid = 0; | ||
528 | 527 | ||
529 | /* Precalculate the data offset */ | 528 | /* Precalculate the data offset */ |
530 | ino = romfs_strnlen(i, ino+ROMFH_SIZE, ROMFS_MAXFN); | 529 | ret = romfs_strnlen(i, ino + ROMFH_SIZE, ROMFS_MAXFN); |
531 | if (ino >= 0) | 530 | if (ret >= 0) |
532 | ino = ((ROMFH_SIZE+ino+1+ROMFH_PAD)&ROMFH_MASK); | 531 | ino = (ROMFH_SIZE + ret + 1 + ROMFH_PAD) & ROMFH_MASK; |
533 | else | 532 | else |
534 | ino = 0; | 533 | ino = 0; |
535 | 534 | ||
536 | ROMFS_I(i)->i_metasize = ino; | 535 | ROMFS_I(i)->i_metasize = ino; |
537 | ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK); | 536 | ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK); |
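Editorial note: the romfs change routes the romfs_strnlen() result through a signed local before testing it, because ino is an unsigned long and a ">= 0" test on it can never fail. A small stand-alone sketch of that pitfall, using a hypothetical error-returning helper:

	/* Why the signed 'ret' matters: with an unsigned variable the error
	 * check can never trigger, which is the bug the hunk above avoids. */
	#include <stdio.h>

	static long fake_strnlen_err(void) { return -1; }  /* hypothetical error return */

	int main(void)
	{
		unsigned long ino = fake_strnlen_err();  /* -1 wraps to a huge value */
		long ret = fake_strnlen_err();           /* stays negative */

		if (ino >= 0)
			printf("unsigned check: always true, error missed\n");
		if (ret < 0)
			printf("signed check: error detected\n");
		return 0;
	}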
diff --git a/fs/select.c b/fs/select.c index 87df51eadcf2..08b91beed806 100644 --- a/fs/select.c +++ b/fs/select.c | |||
@@ -109,11 +109,11 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, | |||
109 | void poll_initwait(struct poll_wqueues *pwq) | 109 | void poll_initwait(struct poll_wqueues *pwq) |
110 | { | 110 | { |
111 | init_poll_funcptr(&pwq->pt, __pollwait); | 111 | init_poll_funcptr(&pwq->pt, __pollwait); |
112 | pwq->polling_task = current; | ||
112 | pwq->error = 0; | 113 | pwq->error = 0; |
113 | pwq->table = NULL; | 114 | pwq->table = NULL; |
114 | pwq->inline_index = 0; | 115 | pwq->inline_index = 0; |
115 | } | 116 | } |
116 | |||
117 | EXPORT_SYMBOL(poll_initwait); | 117 | EXPORT_SYMBOL(poll_initwait); |
118 | 118 | ||
119 | static void free_poll_entry(struct poll_table_entry *entry) | 119 | static void free_poll_entry(struct poll_table_entry *entry) |
@@ -142,12 +142,10 @@ void poll_freewait(struct poll_wqueues *pwq) | |||
142 | free_page((unsigned long) old); | 142 | free_page((unsigned long) old); |
143 | } | 143 | } |
144 | } | 144 | } |
145 | |||
146 | EXPORT_SYMBOL(poll_freewait); | 145 | EXPORT_SYMBOL(poll_freewait); |
147 | 146 | ||
148 | static struct poll_table_entry *poll_get_entry(poll_table *_p) | 147 | static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) |
149 | { | 148 | { |
150 | struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt); | ||
151 | struct poll_table_page *table = p->table; | 149 | struct poll_table_page *table = p->table; |
152 | 150 | ||
153 | if (p->inline_index < N_INLINE_POLL_ENTRIES) | 151 | if (p->inline_index < N_INLINE_POLL_ENTRIES) |
@@ -159,7 +157,6 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p) | |||
159 | new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL); | 157 | new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL); |
160 | if (!new_table) { | 158 | if (!new_table) { |
161 | p->error = -ENOMEM; | 159 | p->error = -ENOMEM; |
162 | __set_current_state(TASK_RUNNING); | ||
163 | return NULL; | 160 | return NULL; |
164 | } | 161 | } |
165 | new_table->entry = new_table->entries; | 162 | new_table->entry = new_table->entries; |
@@ -171,20 +168,75 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p) | |||
171 | return table->entry++; | 168 | return table->entry++; |
172 | } | 169 | } |
173 | 170 | ||
171 | static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) | ||
172 | { | ||
173 | struct poll_wqueues *pwq = wait->private; | ||
174 | DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task); | ||
175 | |||
176 | /* | ||
177 | * Although this function is called under waitqueue lock, LOCK | ||
178 | * doesn't imply write barrier and the users expect write | ||
179 | * barrier semantics on wakeup functions. The following | ||
180 | * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() | ||
181 | * and is paired with set_mb() in poll_schedule_timeout. | ||
182 | */ | ||
183 | smp_wmb(); | ||
184 | pwq->triggered = 1; | ||
185 | |||
186 | /* | ||
187 | * Perform the default wake up operation using a dummy | ||
188 | * waitqueue. | ||
189 | * | ||
190 | * TODO: This is hacky but there currently is no interface to | ||
191 | * pass in @sync. @sync is scheduled to be removed and once | ||
192 | * that happens, wake_up_process() can be used directly. | ||
193 | */ | ||
194 | return default_wake_function(&dummy_wait, mode, sync, key); | ||
195 | } | ||
196 | |||
174 | /* Add a new entry */ | 197 | /* Add a new entry */ |
175 | static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, | 198 | static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, |
176 | poll_table *p) | 199 | poll_table *p) |
177 | { | 200 | { |
178 | struct poll_table_entry *entry = poll_get_entry(p); | 201 | struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt); |
202 | struct poll_table_entry *entry = poll_get_entry(pwq); | ||
179 | if (!entry) | 203 | if (!entry) |
180 | return; | 204 | return; |
181 | get_file(filp); | 205 | get_file(filp); |
182 | entry->filp = filp; | 206 | entry->filp = filp; |
183 | entry->wait_address = wait_address; | 207 | entry->wait_address = wait_address; |
184 | init_waitqueue_entry(&entry->wait, current); | 208 | init_waitqueue_func_entry(&entry->wait, pollwake); |
209 | entry->wait.private = pwq; | ||
185 | add_wait_queue(wait_address, &entry->wait); | 210 | add_wait_queue(wait_address, &entry->wait); |
186 | } | 211 | } |
187 | 212 | ||
213 | int poll_schedule_timeout(struct poll_wqueues *pwq, int state, | ||
214 | ktime_t *expires, unsigned long slack) | ||
215 | { | ||
216 | int rc = -EINTR; | ||
217 | |||
218 | set_current_state(state); | ||
219 | if (!pwq->triggered) | ||
220 | rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS); | ||
221 | __set_current_state(TASK_RUNNING); | ||
222 | |||
223 | /* | ||
224 | * Prepare for the next iteration. | ||
225 | * | ||
226 | * The following set_mb() serves two purposes. First, it's | ||
227 | * the counterpart rmb of the wmb in pollwake() such that data | ||
228 | * written before wake up is always visible after wake up. | ||
229 | * Second, the full barrier guarantees that triggered clearing | ||
230 | * doesn't pass event check of the next iteration. Note that | ||
231 | * this problem doesn't exist for the first iteration as | ||
232 | * add_wait_queue() has full barrier semantics. | ||
233 | */ | ||
234 | set_mb(pwq->triggered, 0); | ||
235 | |||
236 | return rc; | ||
237 | } | ||
238 | EXPORT_SYMBOL(poll_schedule_timeout); | ||
239 | |||
188 | /** | 240 | /** |
189 | * poll_select_set_timeout - helper function to setup the timeout value | 241 | * poll_select_set_timeout - helper function to setup the timeout value |
190 | * @to: pointer to timespec variable for the final timeout | 242 | * @to: pointer to timespec variable for the final timeout |
@@ -340,8 +392,6 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) | |||
340 | for (;;) { | 392 | for (;;) { |
341 | unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; | 393 | unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; |
342 | 394 | ||
343 | set_current_state(TASK_INTERRUPTIBLE); | ||
344 | |||
345 | inp = fds->in; outp = fds->out; exp = fds->ex; | 395 | inp = fds->in; outp = fds->out; exp = fds->ex; |
346 | rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; | 396 | rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; |
347 | 397 | ||
@@ -411,10 +461,10 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) | |||
411 | to = &expire; | 461 | to = &expire; |
412 | } | 462 | } |
413 | 463 | ||
414 | if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) | 464 | if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, |
465 | to, slack)) | ||
415 | timed_out = 1; | 466 | timed_out = 1; |
416 | } | 467 | } |
417 | __set_current_state(TASK_RUNNING); | ||
418 | 468 | ||
419 | poll_freewait(&table); | 469 | poll_freewait(&table); |
420 | 470 | ||
@@ -666,7 +716,6 @@ static int do_poll(unsigned int nfds, struct poll_list *list, | |||
666 | for (;;) { | 716 | for (;;) { |
667 | struct poll_list *walk; | 717 | struct poll_list *walk; |
668 | 718 | ||
669 | set_current_state(TASK_INTERRUPTIBLE); | ||
670 | for (walk = list; walk != NULL; walk = walk->next) { | 719 | for (walk = list; walk != NULL; walk = walk->next) { |
671 | struct pollfd * pfd, * pfd_end; | 720 | struct pollfd * pfd, * pfd_end; |
672 | 721 | ||
@@ -709,10 +758,9 @@ static int do_poll(unsigned int nfds, struct poll_list *list, | |||
709 | to = &expire; | 758 | to = &expire; |
710 | } | 759 | } |
711 | 760 | ||
712 | if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) | 761 | if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)) |
713 | timed_out = 1; | 762 | timed_out = 1; |
714 | } | 763 | } |
715 | __set_current_state(TASK_RUNNING); | ||
716 | return count; | 764 | return count; |
717 | } | 765 | } |
718 | 766 | ||
diff --git a/fs/splice.c b/fs/splice.c index 1abab5cee4ba..a54b3e3f10a7 100644 --- a/fs/splice.c +++ b/fs/splice.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/file.h> | 21 | #include <linux/file.h> |
22 | #include <linux/pagemap.h> | 22 | #include <linux/pagemap.h> |
23 | #include <linux/splice.h> | 23 | #include <linux/splice.h> |
24 | #include <linux/memcontrol.h> | ||
24 | #include <linux/mm_inline.h> | 25 | #include <linux/mm_inline.h> |
25 | #include <linux/swap.h> | 26 | #include <linux/swap.h> |
26 | #include <linux/writeback.h> | 27 | #include <linux/writeback.h> |
diff --git a/fs/stat.c b/fs/stat.c --- a/fs/stat.c +++ b/fs/stat.c | |||
@@ -305,7 +305,7 @@ asmlinkage long sys_readlinkat(int dfd, const char __user *pathname, | |||
305 | struct inode *inode = path.dentry->d_inode; | 305 | struct inode *inode = path.dentry->d_inode; |
306 | 306 | ||
307 | error = -EINVAL; | 307 | error = -EINVAL; |
308 | if (inode->i_op && inode->i_op->readlink) { | 308 | if (inode->i_op->readlink) { |
309 | error = security_inode_readlink(path.dentry); | 309 | error = security_inode_readlink(path.dentry); |
310 | if (!error) { | 310 | if (!error) { |
311 | touch_atime(path.mnt, path.dentry); | 311 | touch_atime(path.mnt, path.dentry); |
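Editorial note: this hunk drops the NULL check on i_op (inodes now always carry an i_op table) and keeps only the check for a ->readlink method. A sketch of the readlink(2) call that reaches this code; the /proc/self/exe path is just an example:

	/* readlink() returns the symlink target without a terminating NUL,
	 * so it has to be added by the caller. */
	#include <stdio.h>
	#include <unistd.h>
	#include <limits.h>

	int main(void)
	{
		char target[PATH_MAX];
		ssize_t len = readlink("/proc/self/exe", target, sizeof(target) - 1);

		if (len < 0) {
			perror("readlink");
			return 1;
		}
		target[len] = '\0';
		printf("/proc/self/exe -> %s\n", target);
		return 0;
	}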
diff --git a/fs/super.c b/fs/super.c index d5fd4498548a..ed080c417167 100644 --- a/fs/super.c +++ b/fs/super.c | |||
@@ -38,6 +38,7 @@ | |||
38 | #include <linux/kobject.h> | 38 | #include <linux/kobject.h> |
39 | #include <linux/mutex.h> | 39 | #include <linux/mutex.h> |
40 | #include <linux/file.h> | 40 | #include <linux/file.h> |
41 | #include <linux/async.h> | ||
41 | #include <asm/uaccess.h> | 42 | #include <asm/uaccess.h> |
42 | #include "internal.h" | 43 | #include "internal.h" |
43 | 44 | ||
@@ -71,6 +72,7 @@ static struct super_block *alloc_super(struct file_system_type *type) | |||
71 | INIT_HLIST_HEAD(&s->s_anon); | 72 | INIT_HLIST_HEAD(&s->s_anon); |
72 | INIT_LIST_HEAD(&s->s_inodes); | 73 | INIT_LIST_HEAD(&s->s_inodes); |
73 | INIT_LIST_HEAD(&s->s_dentry_lru); | 74 | INIT_LIST_HEAD(&s->s_dentry_lru); |
75 | INIT_LIST_HEAD(&s->s_async_list); | ||
74 | init_rwsem(&s->s_umount); | 76 | init_rwsem(&s->s_umount); |
75 | mutex_init(&s->s_lock); | 77 | mutex_init(&s->s_lock); |
76 | lockdep_set_class(&s->s_umount, &type->s_umount_key); | 78 | lockdep_set_class(&s->s_umount, &type->s_umount_key); |
@@ -289,11 +291,18 @@ void generic_shutdown_super(struct super_block *sb) | |||
289 | { | 291 | { |
290 | const struct super_operations *sop = sb->s_op; | 292 | const struct super_operations *sop = sb->s_op; |
291 | 293 | ||
294 | |||
292 | if (sb->s_root) { | 295 | if (sb->s_root) { |
293 | shrink_dcache_for_umount(sb); | 296 | shrink_dcache_for_umount(sb); |
294 | fsync_super(sb); | 297 | fsync_super(sb); |
295 | lock_super(sb); | 298 | lock_super(sb); |
296 | sb->s_flags &= ~MS_ACTIVE; | 299 | sb->s_flags &= ~MS_ACTIVE; |
300 | |||
301 | /* | ||
302 | * wait for asynchronous fs operations to finish before going further | ||
303 | */ | ||
304 | async_synchronize_full_special(&sb->s_async_list); | ||
305 | |||
297 | /* bad name - it should be evict_inodes() */ | 306 | /* bad name - it should be evict_inodes() */ |
298 | invalidate_inodes(sb); | 307 | invalidate_inodes(sb); |
299 | lock_kernel(); | 308 | lock_kernel(); |
@@ -461,6 +470,7 @@ restart: | |||
461 | sb->s_count++; | 470 | sb->s_count++; |
462 | spin_unlock(&sb_lock); | 471 | spin_unlock(&sb_lock); |
463 | down_read(&sb->s_umount); | 472 | down_read(&sb->s_umount); |
473 | async_synchronize_full_special(&sb->s_async_list); | ||
464 | if (sb->s_root && (wait || sb->s_dirt)) | 474 | if (sb->s_root && (wait || sb->s_dirt)) |
465 | sb->s_op->sync_fs(sb, wait); | 475 | sb->s_op->sync_fs(sb, wait); |
466 | up_read(&sb->s_umount); | 476 | up_read(&sb->s_umount); |
diff --git a/fs/sync.c b/fs/sync.c --- a/fs/sync.c +++ b/fs/sync.c | |||
@@ -75,14 +75,39 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync) | |||
75 | return ret; | 75 | return ret; |
76 | } | 76 | } |
77 | 77 | ||
78 | long do_fsync(struct file *file, int datasync) | 78 | /** |
79 | * vfs_fsync - perform a fsync or fdatasync on a file | ||
80 | * @file: file to sync | ||
81 | * @dentry: dentry of @file | ||
82 | * @datasync: only perform a fdatasync operation | ||
83 | * | ||
84 | * Write back data and metadata for @file to disk. If @datasync is | ||
85 | * set only metadata needed to access modified file data is written. | ||
86 | * | ||
87 | * In case this function is called from nfsd @file may be %NULL and | ||
88 | * only @dentry is set. This can only happen when the filesystem | ||
89 | * implements the export_operations API. | ||
90 | */ | ||
91 | int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) | ||
79 | { | 92 | { |
80 | int ret; | 93 | const struct file_operations *fop; |
81 | int err; | 94 | struct address_space *mapping; |
82 | struct address_space *mapping = file->f_mapping; | 95 | int err, ret; |
96 | |||
97 | /* | ||
98 | * Get mapping and operations from the file in case we have | ||
99 | * a file, or get the default values for them in case we | ||
100 | * don't have a struct file available. Damn nfsd.. | ||
101 | */ | ||
102 | if (file) { | ||
103 | mapping = file->f_mapping; | ||
104 | fop = file->f_op; | ||
105 | } else { | ||
106 | mapping = dentry->d_inode->i_mapping; | ||
107 | fop = dentry->d_inode->i_fop; | ||
108 | } | ||
83 | 109 | ||
84 | if (!file->f_op || !file->f_op->fsync) { | 110 | if (!fop || !fop->fsync) { |
85 | /* Why? We can still call filemap_fdatawrite */ | ||
86 | ret = -EINVAL; | 111 | ret = -EINVAL; |
87 | goto out; | 112 | goto out; |
88 | } | 113 | } |
@@ -94,7 +119,7 @@ long do_fsync(struct file *file, int datasync) | |||
94 | * livelocks in fsync_buffers_list(). | 119 | * livelocks in fsync_buffers_list(). |
95 | */ | 120 | */ |
96 | mutex_lock(&mapping->host->i_mutex); | 121 | mutex_lock(&mapping->host->i_mutex); |
97 | err = file->f_op->fsync(file, file->f_path.dentry, datasync); | 122 | err = fop->fsync(file, dentry, datasync); |
98 | if (!ret) | 123 | if (!ret) |
99 | ret = err; | 124 | ret = err; |
100 | mutex_unlock(&mapping->host->i_mutex); | 125 | mutex_unlock(&mapping->host->i_mutex); |
@@ -104,15 +129,16 @@ long do_fsync(struct file *file, int datasync) | |||
104 | out: | 129 | out: |
105 | return ret; | 130 | return ret; |
106 | } | 131 | } |
132 | EXPORT_SYMBOL(vfs_fsync); | ||
107 | 133 | ||
108 | static long __do_fsync(unsigned int fd, int datasync) | 134 | static int do_fsync(unsigned int fd, int datasync) |
109 | { | 135 | { |
110 | struct file *file; | 136 | struct file *file; |
111 | int ret = -EBADF; | 137 | int ret = -EBADF; |
112 | 138 | ||
113 | file = fget(fd); | 139 | file = fget(fd); |
114 | if (file) { | 140 | if (file) { |
115 | ret = do_fsync(file, datasync); | 141 | ret = vfs_fsync(file, file->f_path.dentry, datasync); |
116 | fput(file); | 142 | fput(file); |
117 | } | 143 | } |
118 | return ret; | 144 | return ret; |
@@ -120,12 +146,12 @@ static long __do_fsync(unsigned int fd, int datasync) | |||
120 | 146 | ||
121 | asmlinkage long sys_fsync(unsigned int fd) | 147 | asmlinkage long sys_fsync(unsigned int fd) |
122 | { | 148 | { |
123 | return __do_fsync(fd, 0); | 149 | return do_fsync(fd, 0); |
124 | } | 150 | } |
125 | 151 | ||
126 | asmlinkage long sys_fdatasync(unsigned int fd) | 152 | asmlinkage long sys_fdatasync(unsigned int fd) |
127 | { | 153 | { |
128 | return __do_fsync(fd, 1); | 154 | return do_fsync(fd, 1); |
129 | } | 155 | } |
130 | 156 | ||
131 | /* | 157 | /* |
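Editorial note: do_fsync() is now a thin wrapper around the new vfs_fsync() helper, which nfsd can also call with a NULL file. From userspace the entry points are still fsync(2) and fdatasync(2); a minimal sketch (the temporary file name is an assumption for the example):

	/* fsync() flushes data and metadata; fdatasync() may skip metadata
	 * that is not needed to read the data back. */
	#include <fcntl.h>
	#include <unistd.h>
	#include <string.h>

	int main(void)
	{
		const char msg[] = "durable bytes\n";
		int fd = open("/tmp/vfs_fsync_demo", O_CREAT | O_WRONLY | O_TRUNC, 0644);

		if (fd < 0)
			return 1;
		if (write(fd, msg, strlen(msg)) != (ssize_t)strlen(msg)) {
			close(fd);
			return 1;
		}
		fdatasync(fd);  /* data (and the size change) reach the disk */
		fsync(fd);      /* also flushes remaining metadata, e.g. timestamps */
		close(fd);
		return 0;
	}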
@@ -269,7 +295,7 @@ int do_sync_mapping_range(struct address_space *mapping, loff_t offset, | |||
269 | 295 | ||
270 | if (flags & SYNC_FILE_RANGE_WRITE) { | 296 | if (flags & SYNC_FILE_RANGE_WRITE) { |
271 | ret = __filemap_fdatawrite_range(mapping, offset, endbyte, | 297 | ret = __filemap_fdatawrite_range(mapping, offset, endbyte, |
272 | WB_SYNC_NONE); | 298 | WB_SYNC_ALL); |
273 | if (ret < 0) | 299 | if (ret < 0) |
274 | goto out; | 300 | goto out; |
275 | } | 301 | } |
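Editorial note: switching the writeout above to WB_SYNC_ALL means the SYNC_FILE_RANGE_WRITE pass no longer skips pages, so the range is reliably queued for writeback. A hedged userspace sketch of the sync_file_range(2) call that do_sync_mapping_range() implements (file name and byte range are illustrative):

	/* Start writeback for a byte range and wait for it to complete. */
	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/tmp/range_demo", O_CREAT | O_WRONLY | O_TRUNC, 0644);

		if (fd < 0)
			return 1;
		if (write(fd, "0123456789", 10) != 10) {
			close(fd);
			return 1;
		}
		sync_file_range(fd, 0, 10,
				SYNC_FILE_RANGE_WAIT_BEFORE |
				SYNC_FILE_RANGE_WRITE |
				SYNC_FILE_RANGE_WAIT_AFTER);
		close(fd);
		return 0;
	}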
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index eb53c632f856..dfa3d94cfc74 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c | |||
@@ -107,8 +107,6 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) | |||
107 | static inline void set_default_inode_attr(struct inode * inode, mode_t mode) | 107 | static inline void set_default_inode_attr(struct inode * inode, mode_t mode) |
108 | { | 108 | { |
109 | inode->i_mode = mode; | 109 | inode->i_mode = mode; |
110 | inode->i_uid = 0; | ||
111 | inode->i_gid = 0; | ||
112 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 110 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
113 | } | 111 | } |
114 | 112 | ||
@@ -149,7 +147,6 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) | |||
149 | { | 147 | { |
150 | struct bin_attribute *bin_attr; | 148 | struct bin_attribute *bin_attr; |
151 | 149 | ||
152 | inode->i_blocks = 0; | ||
153 | inode->i_mapping->a_ops = &sysfs_aops; | 150 | inode->i_mapping->a_ops = &sysfs_aops; |
154 | inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; | 151 | inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; |
155 | inode->i_op = &sysfs_inode_operations; | 152 | inode->i_op = &sysfs_inode_operations; |
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig index 91ceeda7e5bf..e35b54d5059d 100644 --- a/fs/ubifs/Kconfig +++ b/fs/ubifs/Kconfig | |||
@@ -40,7 +40,7 @@ config UBIFS_FS_ZLIB | |||
40 | depends on UBIFS_FS | 40 | depends on UBIFS_FS |
41 | default y | 41 | default y |
42 | help | 42 | help |
43 | Zlib copresses better then LZO but it is slower. Say 'Y' if unsure. | 43 | Zlib compresses better than LZO but it is slower. Say 'Y' if unsure. |
44 | 44 | ||
45 | # Debugging-related stuff | 45 | # Debugging-related stuff |
46 | config UBIFS_FS_DEBUG | 46 | config UBIFS_FS_DEBUG |
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 0e5e54d82924..175f9c590b77 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c | |||
@@ -142,7 +142,7 @@ static long long get_liability(struct ubifs_info *c) | |||
142 | * | 142 | * |
143 | * This function is called when an operation cannot be budgeted because there | 143 | * This function is called when an operation cannot be budgeted because there |
144 | * is supposedly no free space. But in most cases there is some free space: | 144 | * is supposedly no free space. But in most cases there is some free space: |
145 | * o budgeting is pessimistic, so it always budgets more then it is actually | 145 | * o budgeting is pessimistic, so it always budgets more than it is actually |
146 | * needed, so shrinking the liability is one way to make free space - the | 146 | * needed, so shrinking the liability is one way to make free space - the |
147 | * cached data will take less space then it was budgeted for; | 147 | * cached data will take less space then it was budgeted for; |
148 | * o GC may turn some dark space into free space (budgeting treats dark space | 148 | * o GC may turn some dark space into free space (budgeting treats dark space |
@@ -606,7 +606,7 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) | |||
606 | * @c: UBIFS file-system description object | 606 | * @c: UBIFS file-system description object |
607 | * | 607 | * |
608 | * This function converts budget which was allocated for a new page of data to | 608 | * This function converts budget which was allocated for a new page of data to |
609 | * the budget of changing an existing page of data. The latter is smaller then | 609 | * the budget of changing an existing page of data. The latter is smaller than |
610 | * the former, so this function only does simple re-calculation and does not | 610 | * the former, so this function only does simple re-calculation and does not |
611 | * involve any write-back. | 611 | * involve any write-back. |
612 | */ | 612 | */ |
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 0bef6501d58a..9832f9abe28e 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c | |||
@@ -45,7 +45,7 @@ | |||
45 | #define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ | 45 | #define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ |
46 | 46 | ||
47 | /* | 47 | /* |
48 | * GC may need to move more then one LEB to make progress. The below constants | 48 | * GC may need to move more than one LEB to make progress. The below constants |
49 | * define "soft" and "hard" limits on the number of LEBs the garbage collector | 49 | * define "soft" and "hard" limits on the number of LEBs the garbage collector |
50 | * may move. | 50 | * may move. |
51 | */ | 51 | */ |
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 10ae25b7d1db..9b7c54e0cd2a 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c | |||
@@ -191,7 +191,7 @@ again: | |||
191 | if (wbuf->lnum != -1 && avail >= len) { | 191 | if (wbuf->lnum != -1 && avail >= len) { |
192 | /* | 192 | /* |
193 | * Someone else has switched the journal head and we have | 193 | * Someone else has switched the journal head and we have |
194 | * enough space now. This happens when more then one process is | 194 | * enough space now. This happens when more than one process is |
195 | * trying to write to the same journal head at the same time. | 195 | * trying to write to the same journal head at the same time. |
196 | */ | 196 | */ |
197 | dbg_jnl("return LEB %d back, already have LEB %d:%d", | 197 | dbg_jnl("return LEB %d back, already have LEB %d:%d", |
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c index f248533841a2..e7bab52a1410 100644 --- a/fs/ubifs/shrinker.c +++ b/fs/ubifs/shrinker.c | |||
@@ -151,7 +151,7 @@ static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention) | |||
151 | * @contention: if any contention, this is set to %1 | 151 | * @contention: if any contention, this is set to %1 |
152 | * | 152 | * |
153 | * This function walks the list of mounted UBIFS file-systems and frees clean | 153 | * This function walks the list of mounted UBIFS file-systems and frees clean |
154 | * znodes which are older then @age, until at least @nr znodes are freed. | 154 | * znodes which are older than @age, until at least @nr znodes are freed. |
155 | * Returns the number of freed znodes. | 155 | * Returns the number of freed znodes. |
156 | */ | 156 | */ |
157 | static int shrink_tnc_trees(int nr, int age, int *contention) | 157 | static int shrink_tnc_trees(int nr, int age, int *contention) |
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 0d7564b95f8e..89556ee72518 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c | |||
@@ -432,12 +432,19 @@ static int ubifs_sync_fs(struct super_block *sb, int wait) | |||
432 | int i, err; | 432 | int i, err; |
433 | struct ubifs_info *c = sb->s_fs_info; | 433 | struct ubifs_info *c = sb->s_fs_info; |
434 | struct writeback_control wbc = { | 434 | struct writeback_control wbc = { |
435 | .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_HOLD, | 435 | .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, |
436 | .range_start = 0, | 436 | .range_start = 0, |
437 | .range_end = LLONG_MAX, | 437 | .range_end = LLONG_MAX, |
438 | .nr_to_write = LONG_MAX, | 438 | .nr_to_write = LONG_MAX, |
439 | }; | 439 | }; |
440 | 440 | ||
441 | /* | ||
442 | * Note by akpm about WB_SYNC_NONE used above: zero @wait is just an | ||
443 | * advisory thing to help the file system shove lots of data into the | ||
444 | * queues. If some gets missed then it'll be picked up on the second | ||
445 | * '->sync_fs()' call, with non-zero @wait. | ||
446 | */ | ||
447 | |||
441 | if (sb->s_flags & MS_RDONLY) | 448 | if (sb->s_flags & MS_RDONLY) |
442 | return 0; | 449 | return 0; |
443 | 450 | ||
diff --git a/fs/xattr.c b/fs/xattr.c index 468377e66531..237804cd6b56 100644 --- a/fs/xattr.c +++ b/fs/xattr.c | |||
@@ -175,7 +175,7 @@ vfs_listxattr(struct dentry *d, char *list, size_t size) | |||
175 | if (error) | 175 | if (error) |
176 | return error; | 176 | return error; |
177 | error = -EOPNOTSUPP; | 177 | error = -EOPNOTSUPP; |
178 | if (d->d_inode->i_op && d->d_inode->i_op->listxattr) { | 178 | if (d->d_inode->i_op->listxattr) { |
179 | error = d->d_inode->i_op->listxattr(d, list, size); | 179 | error = d->d_inode->i_op->listxattr(d, list, size); |
180 | } else { | 180 | } else { |
181 | error = security_inode_listsecurity(d->d_inode, list, size); | 181 | error = security_inode_listsecurity(d->d_inode, list, size); |
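Editorial note: as with readlink above, vfs_listxattr() can rely on i_op being non-NULL and only tests for a ->listxattr method. A small sketch of the listxattr(2) call it serves (the /tmp path is illustrative); attribute names come back as a NUL-separated list:

	/* Walk the NUL-separated list of extended attribute names. */
	#include <stdio.h>
	#include <string.h>
	#include <sys/xattr.h>

	int main(void)
	{
		char list[1024];
		ssize_t len = listxattr("/tmp", list, sizeof(list));

		if (len < 0) {
			perror("listxattr");
			return 1;
		}
		for (ssize_t off = 0; off < len; off += strlen(list + off) + 1)
			printf("xattr: %s\n", list + off);
		return 0;
	}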
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 36f6cc703ef2..be846d606ae8 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c | |||
@@ -1348,7 +1348,7 @@ xfs_finish_flags( | |||
1348 | { | 1348 | { |
1349 | int ronly = (mp->m_flags & XFS_MOUNT_RDONLY); | 1349 | int ronly = (mp->m_flags & XFS_MOUNT_RDONLY); |
1350 | 1350 | ||
1351 | /* Fail a mount where the logbuf is smaller then the log stripe */ | 1351 | /* Fail a mount where the logbuf is smaller than the log stripe */ |
1352 | if (xfs_sb_version_haslogv2(&mp->m_sb)) { | 1352 | if (xfs_sb_version_haslogv2(&mp->m_sb)) { |
1353 | if (mp->m_logbsize <= 0 && | 1353 | if (mp->m_logbsize <= 0 && |
1354 | mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) { | 1354 | mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) { |