Merge branch 'master' into next

Conflicts: fs/namei.c Manually merged per: diff --cc fs/namei.c index 734f2b5,bbc15c2..0000000 --- a/fs/namei.c +++ b/fs/namei.c @@@ -860,9 -848,8 +849,10 @@@ static int __link_path_walk(const char nd->flags |= LOOKUP_CONTINUE; err = exec_permission_lite(inode); if (err == -EAGAIN) - err = vfs_permission(nd, MAY_EXEC); + err = inode_permission(nd->path.dentry->d_inode, + MAY_EXEC); + if (!err) + err = ima_path_check(&nd->path, MAY_EXEC); if (err) break; @@@ -1525,14 -1506,9 +1509,14 @@@ int may_open(struct path *path, int acc flag &= ~O_TRUNC; } - error = vfs_permission(nd, acc_mode); + error = inode_permission(inode, acc_mode); if (error) return error; + - error = ima_path_check(&nd->path, ++ error = ima_path_check(path, + acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC)); + if (error) + return error; /* * An append-only file must be opened in append mode for writing. */ Signed-off-by: James Morris <jmorris@namei.org>
author: James Morris <jmorris@namei.org> 2009-02-05 19:01:45 -0500
committer: James Morris <jmorris@namei.org> 2009-02-05 19:01:45 -0500
commit: cb5629b10d64a8006622ce3a52bc887d91057d69 (patch)
tree: 7c06d8f30783115e3384721046258ce615b129c5 /fs/ocfs2
parent: 8920d5ad6ba74ae8ab020e90cc4d976980e68701 (diff)
parent: f01d1d546abb2f4028b5299092f529eefb01253a (diff)
52 files changed, 8289 insertions, 2336 deletions
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
new file mode 100644
index 000000000000..701b7a3a872e
--- /dev/null
+++ b/fs/ocfs2/Kconfig
@@ -0,0 +1,85 @@
+config OCFS2_FS
+        tristate "OCFS2 file system support"
+        depends on NET && SYSFS
+        select CONFIGFS_FS
+        select JBD2
+        select CRC32
+        select QUOTA
+        select QUOTA_TREE
+        help
+          OCFS2 is a general purpose extent based shared disk cluster file
+          system with many similarities to ext3. It supports 64 bit inode
+          numbers, and has automatically extending metadata groups which may
+          also make it attractive for non-clustered use.
+          You'll want to install the ocfs2-tools package in order to at least
+          get "mount.ocfs2".
+          Project web page:    http://oss.oracle.com/projects/ocfs2
+          Tools web page:      http://oss.oracle.com/projects/ocfs2-tools
+          OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+          For more information on OCFS2, see the file
+          <file:Documentation/filesystems/ocfs2.txt>.
+config OCFS2_FS_O2CB
+        tristate "O2CB Kernelspace Clustering"
+        depends on OCFS2_FS
+        default y
+        help
+          OCFS2 includes a simple kernelspace clustering package, the OCFS2
+          Cluster Base.  It only requires a very small userspace component
+          to configure it. This comes with the standard ocfs2-tools package.
+          O2CB is limited to maintaining a cluster for OCFS2 file systems.
+          It cannot manage any other cluster applications.
+          It is always safe to say Y here, as the clustering method is
+          run-time selectable.
+config OCFS2_FS_USERSPACE_CLUSTER
+        tristate "OCFS2 Userspace Clustering"
+        depends on OCFS2_FS && DLM
+        default y
+        help
+          This option will allow OCFS2 to use userspace clustering services
+          in conjunction with the DLM in fs/dlm.  If you are using a
+          userspace cluster manager, say Y here.
+          It is safe to say Y, as the clustering method is run-time
+          selectable.
+config OCFS2_FS_STATS
+        bool "OCFS2 statistics"
+        depends on OCFS2_FS
+        default y
+        help
+          This option allows some fs statistics to be captured. Enabling
+          this option may increase the memory consumption.
+config OCFS2_DEBUG_MASKLOG
+        bool "OCFS2 logging support"
+        depends on OCFS2_FS
+        default y
+        help
+          The ocfs2 filesystem has an extensive logging system.  The system
+          allows selection of events to log via files in /sys/o2cb/logmask/.
+          This option will enlarge your kernel, but it allows debugging of
+          ocfs2 filesystem issues.
+config OCFS2_DEBUG_FS
+        bool "OCFS2 expensive checks"
+        depends on OCFS2_FS
+        default n
+        help
+          This option will enable expensive consistency checks. Enable
+          this option for debugging only as it is likely to decrease
+          performance of the filesystem.
+config OCFS2_FS_POSIX_ACL
+        bool "OCFS2 POSIX Access Control Lists"
+        depends on OCFS2_FS
+        select FS_POSIX_ACL
+        default n
+        help
+          POSIX Access Control Lists (ACLs) support permissions for users and
+          groups beyond the owner/group/world scheme.
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 589dcdfdfe3c..01596079dd63 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_OCFS2_FS_USERSPACE_CLUSTER) += ocfs2_stack_user.o
 ocfs2-objs := \
        alloc.o                 \
        aops.o                  \
+        blockcheck.o            \
        buffer_head_io.o        \
        dcache.o                \
        dir.o                   \
@@ -35,8 +36,14 @@ ocfs2-objs := \
        sysfile.o               \
        uptodate.o              \
        ver.o                   \
+        quota_local.o           \
+        quota_global.o          \
        xattr.o
+ifeq ($(CONFIG_OCFS2_FS_POSIX_ACL),y)
+ocfs2-objs += acl.o
+endif
 ocfs2_stackglue-objs := stackglue.o
 ocfs2_stack_o2cb-objs := stack_o2cb.o
 ocfs2_stack_user-objs := stack_user.o
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
new file mode 100644
index 000000000000..12dfb44c22e5
--- /dev/null
+++ b/fs/ocfs2/acl.c
@@ -0,0 +1,479 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * acl.c
+ *
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
+ *
+ * CREDITS:
+ * Lots of code in this file is copied from linux/fs/ext3/acl.c.
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#define MLOG_MASK_PREFIX ML_INODE
+#include <cluster/masklog.h>
+#include "ocfs2.h"
+#include "alloc.h"
+#include "dlmglue.h"
+#include "file.h"
+#include "ocfs2_fs.h"
+#include "xattr.h"
+#include "acl.h"
+/*
+ * Convert an ACL xattr value into an in-memory posix_acl.
+ *
+ * @value: buffer holding an array of struct ocfs2_acl_entry (little-endian
+ *         fields), or NULL
+ * @size:  length of @value in bytes
+ *
+ * Returns NULL when @value is NULL, ERR_PTR(-EINVAL) when @size cannot hold
+ * a single entry (or the entry count does not fit in an int),
+ * ERR_PTR(-ENOMEM) when posix_acl_alloc() fails, and otherwise the newly
+ * allocated ACL.  Callers in this file drop the result with
+ * posix_acl_release().  Trailing bytes that do not make up a whole entry
+ * are ignored.
+ */
+static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size)
+{
+        const struct ocfs2_acl_entry *entry = value;
+        struct posix_acl *acl;
+        int n, count;
+        if (!value)
+                return NULL;
+        /* The buffer holds ocfs2 entries, so measure it with that type. */
+        if (size < sizeof(*entry))
+                return ERR_PTR(-EINVAL);
+        /* size >= sizeof(*entry) already guarantees at least one entry. */
+        count = size / sizeof(*entry);
+        /* Defensive: an absurdly large size could wrap the int. */
+        if (count < 0)
+                return ERR_PTR(-EINVAL);
+        acl = posix_acl_alloc(count, GFP_NOFS);
+        if (!acl)
+                return ERR_PTR(-ENOMEM);
+        for (n = 0; n < count; n++, entry++) {
+                acl->a_entries[n].e_tag  = le16_to_cpu(entry->e_tag);
+                acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
+                acl->a_entries[n].e_id   = le32_to_cpu(entry->e_id);
+        }
+        return acl;
+}
+/*
+ * Convert an in-memory posix_acl into an xattr value.
+ *
+ * @acl:  ACL to convert; must not be NULL (it is dereferenced
+ *        unconditionally)
+ * @size: out parameter, set to the byte length of the returned buffer
+ *
+ * Returns a kmalloc()ed array of struct ocfs2_acl_entry with little-endian
+ * fields, or ERR_PTR(-ENOMEM).  The caller in this file frees it with
+ * kfree().
+ */
+static void *ocfs2_acl_to_xattr(const struct posix_acl *acl, size_t *size)
+{
+        struct ocfs2_acl_entry *entry;
+        size_t n;
+        /* Size the buffer by the entry type that is actually written. */
+        *size = acl->a_count * sizeof(*entry);
+        entry = kmalloc(*size, GFP_NOFS);
+        if (!entry)
+                return ERR_PTR(-ENOMEM);
+        for (n = 0; n < acl->a_count; n++) {
+                entry[n].e_tag  = cpu_to_le16(acl->a_entries[n].e_tag);
+                entry[n].e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
+                entry[n].e_id   = cpu_to_le32(acl->a_entries[n].e_id);
+        }
+        return entry;
+}
+/*
+ * Read the access or default ACL of @inode through the *_nolock xattr
+ * getter, using the caller-supplied @di_bh.
+ *
+ * Returns NULL when the mount lacks OCFS2_MOUNT_POSIX_ACL or no ACL value
+ * is stored, ERR_PTR(-EINVAL) for an unknown @type, ERR_PTR(-ENOMEM) on
+ * allocation failure, ERR_PTR() of any other getter error, and otherwise
+ * the converted ACL.
+ */
+static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode,
+                                              int type,
+                                              struct buffer_head *di_bh)
+{
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct posix_acl *result = NULL;
+        char *buf = NULL;
+        int index;
+        int len;
+        if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+                return NULL;
+        if (type == ACL_TYPE_ACCESS)
+                index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
+        else if (type == ACL_TYPE_DEFAULT)
+                index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+        else
+                return ERR_PTR(-EINVAL);
+        /* Probe with a NULL buffer; a positive result sizes the real read. */
+        len = ocfs2_xattr_get_nolock(inode, di_bh, index, "", NULL, 0);
+        if (len > 0) {
+                buf = kmalloc(len, GFP_NOFS);
+                if (!buf)
+                        return ERR_PTR(-ENOMEM);
+                len = ocfs2_xattr_get_nolock(inode, di_bh, index,
+                                             "", buf, len);
+        }
+        if (len > 0)
+                result = ocfs2_acl_from_xattr(buf, len);
+        else if (len != 0 && len != -ENODATA)
+                result = ERR_PTR(len);
+        /* len == 0 and -ENODATA both leave result NULL: no ACL stored. */
+        kfree(buf);
+        return result;
+}
+/*
+ * Locked wrapper around ocfs2_get_acl_nolock(): takes the inode lock
+ * (third argument 0), reads the ACL of the given type, then unlocks and
+ * releases the inode buffer head.
+ *
+ * Returns NULL when POSIX ACLs are not enabled on the mount, ERR_PTR() of
+ * the locking error if the lock cannot be taken, or whatever
+ * ocfs2_get_acl_nolock() returns.
+ */
+static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
+{
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct buffer_head *inode_bh = NULL;
+        struct posix_acl *acl;
+        int status;
+        if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+                return NULL;
+        status = ocfs2_inode_lock(inode, &inode_bh, 0);
+        if (status < 0) {
+                mlog_errno(status);
+                return ERR_PTR(status);
+        }
+        acl = ocfs2_get_acl_nolock(inode, type, inode_bh);
+        ocfs2_inode_unlock(inode, 0);
+        brelse(inode_bh);
+        return acl;
+}
+/*
+ * Set the access or default ACL of an inode.
+ *
+ * @handle:  running transaction, or NULL.  When non-NULL the xattr is
+ *           written with ocfs2_xattr_set_handle() using @di_bh, @meta_ac
+ *           and @data_ac; when NULL ocfs2_xattr_set() is used and those
+ *           three arguments are ignored.
+ * @type:    ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT
+ * @acl:     ACL to store, or NULL to pass a NULL value to the xattr layer
+ *
+ * Returns 0 or a negative errno: -EOPNOTSUPP for symlinks, -EINVAL for an
+ * unknown @type, -EACCES when a default ACL is given for a non-directory,
+ * or any error from the helpers called below.
+ *
+ * NOTE(review): for ACL_TYPE_ACCESS this updates inode->i_mode in memory;
+ * nothing in this function itself writes the new mode to disk.
+ */
+static int ocfs2_set_acl(handle_t *handle,
+                         struct inode *inode,
+                         struct buffer_head *di_bh,
+                         int type,
+                         struct posix_acl *acl,
+                         struct ocfs2_alloc_context *meta_ac,
+                         struct ocfs2_alloc_context *data_ac)
+{
+        int name_index;
+        void *value = NULL;
+        size_t size = 0;
+        int ret;
+        if (S_ISLNK(inode->i_mode))
+                return -EOPNOTSUPP;
+        switch (type) {
+        case ACL_TYPE_ACCESS:
+                name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
+                if (acl) {
+                        mode_t mode = inode->i_mode;
+                        ret = posix_acl_equiv_mode(acl, &mode);
+                        if (ret < 0)
+                                return ret;
+                        else {
+                                inode->i_mode = mode;
+                                /*
+                                 * ret == 0: drop the ACL and store a NULL
+                                 * value instead (presumably the mode bits
+                                 * alone express it - confirm against
+                                 * posix_acl_equiv_mode()).
+                                 */
+                                if (ret == 0)
+                                        acl = NULL;
+                        }
+                }
+                break;
+        case ACL_TYPE_DEFAULT:
+                name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+                /* Only directories take a default ACL; clearing is a no-op. */
+                if (!S_ISDIR(inode->i_mode))
+                        return acl ? -EACCES : 0;
+                break;
+        default:
+                return -EINVAL;
+        }
+        /* With acl == NULL, value stays NULL and size stays 0. */
+        if (acl) {
+                value = ocfs2_acl_to_xattr(acl, &size);
+                if (IS_ERR(value))
+                        return (int)PTR_ERR(value);
+        }
+        if (handle)
+                ret = ocfs2_xattr_set_handle(handle, inode, di_bh, name_index,
+                                             "", value, size, 0,
+                                             meta_ac, data_ac);
+        else
+                ret = ocfs2_xattr_set(inode, name_index, "", value, size, 0);
+        kfree(value);
+        return ret;
+}
+/*
+ * Check @mask against the access ACL of @inode.
+ *
+ * Returns the error from fetching the ACL, -EAGAIN when the inode has no
+ * access ACL, or otherwise the result of posix_acl_permission().
+ */
+int ocfs2_check_acl(struct inode *inode, int mask)
+{
+        struct posix_acl *acl;
+        int err;
+        acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
+        if (IS_ERR(acl))
+                return PTR_ERR(acl);
+        if (!acl)
+                return -EAGAIN;
+        err = posix_acl_permission(inode, acl, mask);
+        posix_acl_release(acl);
+        return err;
+}
+/*
+ * Re-derive the access ACL of @inode from its current i_mode.
+ *
+ * Returns -EOPNOTSUPP for symlinks, 0 when ACLs are disabled on the mount
+ * or the inode has no access ACL, -ENOMEM when the ACL cannot be cloned,
+ * and otherwise the result of posix_acl_chmod_masq() / ocfs2_set_acl().
+ */
+int ocfs2_acl_chmod(struct inode *inode)
+{
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct posix_acl *orig;
+        struct posix_acl *copy;
+        int status;
+        if (S_ISLNK(inode->i_mode))
+                return -EOPNOTSUPP;
+        if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+                return 0;
+        orig = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
+        if (!orig)
+                return 0;
+        if (IS_ERR(orig))
+                return PTR_ERR(orig);
+        /* Work on a private copy; the fetched ACL is released right away. */
+        copy = posix_acl_clone(orig, GFP_KERNEL);
+        posix_acl_release(orig);
+        if (!copy)
+                return -ENOMEM;
+        status = posix_acl_chmod_masq(copy, inode->i_mode);
+        if (status == 0)
+                status = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS,
+                                       copy, NULL, NULL);
+        posix_acl_release(copy);
+        return status;
+}
+/*
+ * Initialize the ACLs of a new inode. If parent directory has default ACL,
+ * then clone to new inode. Called from ocfs2_mknod.
+ *
+ * @handle:  transaction passed through to ocfs2_set_acl()
+ * @inode:   the new inode; its i_mode may be modified here
+ * @dir:     parent directory whose default ACL is consulted
+ * @di_bh:   buffer head passed to ocfs2_set_acl() for @inode
+ * @dir_bh:  buffer head passed to ocfs2_get_acl_nolock() for @dir
+ * @meta_ac, @data_ac: allocation contexts passed to ocfs2_set_acl()
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int ocfs2_init_acl(handle_t *handle,
+                   struct inode *inode,
+                   struct inode *dir,
+                   struct buffer_head *di_bh,
+                   struct buffer_head *dir_bh,
+                   struct ocfs2_alloc_context *meta_ac,
+                   struct ocfs2_alloc_context *data_ac)
+{
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct posix_acl *acl = NULL;
+        int ret = 0;
+        /* Symlinks get neither an inherited ACL nor the umask applied. */
+        if (!S_ISLNK(inode->i_mode)) {
+                if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
+                        /* Uses the _nolock getter with the supplied dir_bh. */
+                        acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT,
+                                                   dir_bh);
+                        if (IS_ERR(acl))
+                                return PTR_ERR(acl);
+                }
+                /* No default ACL to inherit: fall back to the umask. */
+                if (!acl)
+                        inode->i_mode &= ~current->fs->umask;
+        }
+        if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
+                struct posix_acl *clone;
+                mode_t mode;
+                /* New directories also inherit the default ACL itself. */
+                if (S_ISDIR(inode->i_mode)) {
+                        ret = ocfs2_set_acl(handle, inode, di_bh,
+                                            ACL_TYPE_DEFAULT, acl,
+                                            meta_ac, data_ac);
+                        if (ret)
+                                goto cleanup;
+                }
+                clone = posix_acl_clone(acl, GFP_NOFS);
+                ret = -ENOMEM;
+                if (!clone)
+                        goto cleanup;
+                mode = inode->i_mode;
+                ret = posix_acl_create_masq(clone, &mode);
+                if (ret >= 0) {
+                        inode->i_mode = mode;
+                        /* Only a positive result stores an access ACL. */
+                        if (ret > 0) {
+                                ret = ocfs2_set_acl(handle, inode,
+                                                    di_bh, ACL_TYPE_ACCESS,
+                                                    clone, meta_ac, data_ac);
+                        }
+                }
+                posix_acl_release(clone);
+        }
+cleanup:
+        /* acl may still be NULL when this point is reached. */
+        posix_acl_release(acl);
+        return ret;
+}
+/*
+ * .list callback for the access-ACL handler: copies the
+ * POSIX_ACL_XATTR_ACCESS name (including its NUL) into @list when it fits.
+ * Returns the number of bytes the name needs, or 0 when ACLs are disabled
+ * on the mount.  @name and @name_len are unused.
+ */
+static size_t ocfs2_xattr_list_acl_access(struct inode *inode,
+                                          char *list,
+                                          size_t list_len,
+                                          const char *name,
+                                          size_t name_len)
+{
+        const size_t needed = sizeof(POSIX_ACL_XATTR_ACCESS);
+        if (!(OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+                return 0;
+        if (list != NULL && list_len >= needed)
+                memcpy(list, POSIX_ACL_XATTR_ACCESS, needed);
+        return needed;
+}
+/*
+ * .list callback for the default-ACL handler: copies the
+ * POSIX_ACL_XATTR_DEFAULT name (including its NUL) into @list when it fits.
+ * Returns the number of bytes the name needs, or 0 when ACLs are disabled
+ * on the mount.  @name and @name_len are unused.
+ */
+static size_t ocfs2_xattr_list_acl_default(struct inode *inode,
+                                           char *list,
+                                           size_t list_len,
+                                           const char *name,
+                                           size_t name_len)
+{
+        const size_t needed = sizeof(POSIX_ACL_XATTR_DEFAULT);
+        if (!(OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+                return 0;
+        if (list != NULL && list_len >= needed)
+                memcpy(list, POSIX_ACL_XATTR_DEFAULT, needed);
+        return needed;
+}
+/*
+ * Common .get implementation: fetch the ACL of the given @type and encode
+ * it into @buffer with posix_acl_to_xattr().
+ *
+ * Returns -EOPNOTSUPP when ACLs are disabled on the mount, the fetch error
+ * if reading fails, -ENODATA when no ACL exists, and otherwise the result
+ * of posix_acl_to_xattr().
+ */
+static int ocfs2_xattr_get_acl(struct inode *inode,
+                               int type,
+                               void *buffer,
+                               size_t size)
+{
+        struct posix_acl *acl;
+        int written;
+        if (!(OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+                return -EOPNOTSUPP;
+        acl = ocfs2_get_acl(inode, type);
+        if (IS_ERR(acl))
+                return PTR_ERR(acl);
+        if (!acl)
+                return -ENODATA;
+        written = posix_acl_to_xattr(acl, buffer, size);
+        posix_acl_release(acl);
+        return written;
+}
+/*
+ * .get callback for the access ACL.  Only the empty name suffix is
+ * accepted; anything else yields -EINVAL.
+ */
+static int ocfs2_xattr_get_acl_access(struct inode *inode,
+                                      const char *name,
+                                      void *buffer,
+                                      size_t size)
+{
+        if (name[0] != '\0')
+                return -EINVAL;
+        return ocfs2_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
+}
+/*
+ * .get callback for the default ACL.  Only the empty name suffix is
+ * accepted; anything else yields -EINVAL.
+ */
+static int ocfs2_xattr_get_acl_default(struct inode *inode,
+                                       const char *name,
+                                       void *buffer,
+                                       size_t size)
+{
+        if (name[0] != '\0')
+                return -EINVAL;
+        return ocfs2_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
+}
+/*
+ * Common .set implementation: decode @value with posix_acl_from_xattr(),
+ * validate it and store it through ocfs2_set_acl() without a transaction
+ * handle.  A NULL @value stores a NULL ACL.
+ *
+ * Returns -EOPNOTSUPP when ACLs are disabled on the mount, -EPERM when the
+ * caller fails is_owner_or_cap(), the decode or validation error, or the
+ * result of ocfs2_set_acl().
+ */
+static int ocfs2_xattr_set_acl(struct inode *inode,
+                               int type,
+                               const void *value,
+                               size_t size)
+{
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct posix_acl *acl = NULL;
+        int status = 0;
+        if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+                return -EOPNOTSUPP;
+        if (!is_owner_or_cap(inode))
+                return -EPERM;
+        if (value) {
+                acl = posix_acl_from_xattr(value, size);
+                if (IS_ERR(acl))
+                        return PTR_ERR(acl);
+                /* A NULL result from the decoder skips validation. */
+                if (acl)
+                        status = posix_acl_valid(acl);
+        }
+        if (!status)
+                status = ocfs2_set_acl(NULL, inode, NULL, type, acl,
+                                       NULL, NULL);
+        posix_acl_release(acl);
+        return status;
+}
+/*
+ * .set callback for the access ACL.  Only the empty name suffix is
+ * accepted; @flags is not used.
+ */
+static int ocfs2_xattr_set_acl_access(struct inode *inode,
+                                      const char *name,
+                                      const void *value,
+                                      size_t size,
+                                      int flags)
+{
+        if (name[0] != '\0')
+                return -EINVAL;
+        return ocfs2_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
+}
+/*
+ * .set callback for the default ACL.  Only the empty name suffix is
+ * accepted; @flags is not used.
+ */
+static int ocfs2_xattr_set_acl_default(struct inode *inode,
+                                       const char *name,
+                                       const void *value,
+                                       size_t size,
+                                       int flags)
+{
+        if (name[0] != '\0')
+                return -EINVAL;
+        return ocfs2_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
+}
+/*
+ * xattr handler for the POSIX_ACL_XATTR_ACCESS name; wires up the
+ * access-ACL list/get/set callbacks from this file.
+ */
+struct xattr_handler ocfs2_xattr_acl_access_handler = {
+        .prefix = POSIX_ACL_XATTR_ACCESS,
+        .list   = ocfs2_xattr_list_acl_access,
+        .get    = ocfs2_xattr_get_acl_access,
+        .set    = ocfs2_xattr_set_acl_access,
+};
+/*
+ * xattr handler for the POSIX_ACL_XATTR_DEFAULT name; wires up the
+ * default-ACL list/get/set callbacks from this file.
+ */
+struct xattr_handler ocfs2_xattr_acl_default_handler = {
+        .prefix = POSIX_ACL_XATTR_DEFAULT,
+        .list   = ocfs2_xattr_list_acl_default,
+        .get    = ocfs2_xattr_get_acl_default,
+        .set    = ocfs2_xattr_set_acl_default,
+};
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
new file mode 100644
index 000000000000..8f6389ed4da5
--- /dev/null
+++ b/fs/ocfs2/acl.h
@@ -0,0 +1,58 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * acl.h
+ *
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#ifndef OCFS2_ACL_H
+#define OCFS2_ACL_H
+#include <linux/posix_acl_xattr.h>
+/*
+ * One ACL entry with fixed-width little-endian fields.  acl.c converts
+ * between arrays of these (stored as the xattr value) and the entries of
+ * an in-memory struct posix_acl.
+ */
+struct ocfs2_acl_entry {
+        __le16 e_tag;   /* copied to/from posix_acl_entry.e_tag */
+        __le16 e_perm;  /* copied to/from posix_acl_entry.e_perm */
+        __le32 e_id;    /* copied to/from posix_acl_entry.e_id */
+};
+/*
+ * ACL entry points.  With CONFIG_OCFS2_FS_POSIX_ACL they are implemented
+ * in acl.c; without it they collapse to no-op stubs so callers need no
+ * #ifdefs of their own.
+ */
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+extern int ocfs2_check_acl(struct inode *, int);
+extern int ocfs2_acl_chmod(struct inode *);
+extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
+                          struct buffer_head *, struct buffer_head *,
+                          struct ocfs2_alloc_context *,
+                          struct ocfs2_alloc_context *);
+#else /* CONFIG_OCFS2_FS_POSIX_ACL*/
+/*
+ * NOTE(review): defined as NULL rather than stubbed, presumably so it can
+ * be passed where a check_acl function pointer is expected - confirm
+ * against the callers.
+ */
+#define ocfs2_check_acl NULL
+/* Without ACL support there is nothing to adjust on chmod. */
+static inline int ocfs2_acl_chmod(struct inode *inode)
+{
+        return 0;
+}
+/* Without ACL support a new inode gets no ACLs; always succeeds. */
+static inline int ocfs2_init_acl(handle_t *handle,
+                                 struct inode *inode,
+                                 struct inode *dir,
+                                 struct buffer_head *di_bh,
+                                 struct buffer_head *dir_bh,
+                                 struct ocfs2_alloc_context *meta_ac,
+                                 struct ocfs2_alloc_context *data_ac)
+{
+        return 0;
+}
+#endif /* CONFIG_OCFS2_FS_POSIX_ACL*/
+#endif /* OCFS2_ACL_H */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 0cc2deb9394c..60fe74035db5 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -28,6 +28,7 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/swap.h>
+#include <linux/quotaops.h>
 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
 #include <cluster/masklog.h>
@@ -36,6 +37,7 @@
 #include "alloc.h"
 #include "aops.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "inode.h"
@@ -46,6 +48,7 @@
 #include "file.h"
 #include "super.h"
 #include "uptodate.h"
+#include "xattr.h"
 #include "buffer_head_io.h"
@@ -187,20 +190,12 @@ static int ocfs2_dinode_insert_check(struct inode *inode,
 static int ocfs2_dinode_sanity_check(struct inode *inode,
                                     struct ocfs2_extent_tree *et)
 {
-        int ret = 0;
+        struct ocfs2_dinode *di = et->et_object;
-        struct ocfs2_dinode *di;
        BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
+        BUG_ON(!OCFS2_IS_VALID_DINODE(di));
-        di = et->et_object;
+        return 0;
-        if (!OCFS2_IS_VALID_DINODE(di)) {
-                ret = -EIO;
-                ocfs2_error(inode->i_sb,
-                        "Inode %llu has invalid path root",
-                        (unsigned long long)OCFS2_I(inode)->ip_blkno);
-        }
-        return ret;
 }
 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
@@ -213,36 +208,33 @@ static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
 static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
 {
-        struct ocfs2_xattr_value_root *xv = et->et_object;
+        struct ocfs2_xattr_value_buf *vb = et->et_object;
-        et->et_root_el = &xv->xr_list;
+        et->et_root_el = &vb->vb_xv->xr_list;
 }
 static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
                                              u64 blkno)
 {
-        struct ocfs2_xattr_value_root *xv =
+        struct ocfs2_xattr_value_buf *vb = et->et_object;
-                (struct ocfs2_xattr_value_root *)et->et_object;
-        xv->xr_last_eb_blk = cpu_to_le64(blkno);
+        vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno);
 }
 static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
 {
-        struct ocfs2_xattr_value_root *xv =
+        struct ocfs2_xattr_value_buf *vb = et->et_object;
-                (struct ocfs2_xattr_value_root *) et->et_object;
-        return le64_to_cpu(xv->xr_last_eb_blk);
+        return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
 }
 static void ocfs2_xattr_value_update_clusters(struct inode *inode,
                                              struct ocfs2_extent_tree *et,
                                              u32 clusters)
 {
-        struct ocfs2_xattr_value_root *xv =
+        struct ocfs2_xattr_value_buf *vb = et->et_object;
-                (struct ocfs2_xattr_value_root *)et->et_object;
-        le32_add_cpu(&xv->xr_clusters, clusters);
+        le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
 }
 static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
@@ -304,11 +296,13 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
 static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
                                     struct inode *inode,
                                     struct buffer_head *bh,
+                                     ocfs2_journal_access_func access,
                                     void *obj,
                                     struct ocfs2_extent_tree_operations *ops)
 {
        et->et_ops = ops;
        et->et_root_bh = bh;
+        et->et_root_journal_access = access;
        if (!obj)
                obj = (void *)bh->b_data;
        et->et_object = obj;
@@ -324,23 +318,23 @@ void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
                                   struct inode *inode,
                                   struct buffer_head *bh)
 {
-        __ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops);
+        __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_di,
+                                 NULL, &ocfs2_dinode_et_ops);
 }
 void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
                                       struct inode *inode,
                                       struct buffer_head *bh)
 {
-        __ocfs2_init_extent_tree(et, inode, bh, NULL,
+        __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_xb,
-                                 &ocfs2_xattr_tree_et_ops);
+                                 NULL, &ocfs2_xattr_tree_et_ops);
 }
 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
                                        struct inode *inode,
-                                        struct buffer_head *bh,
+                                        struct ocfs2_xattr_value_buf *vb)
-                                        struct ocfs2_xattr_value_root *xv)
 {
-        __ocfs2_init_extent_tree(et, inode, bh, xv,
+        __ocfs2_init_extent_tree(et, inode, vb->vb_bh, vb->vb_access, vb,
                                 &ocfs2_xattr_value_et_ops);
 }
@@ -362,6 +356,15 @@ static inline void ocfs2_et_update_clusters(struct inode *inode,
        et->et_ops->eo_update_clusters(inode, et, clusters);
 }
+/*
+ * Journal the root buffer of extent tree @et by calling the access
+ * callback recorded in et->et_root_journal_access.
+ */
+static inline int ocfs2_et_root_journal_access(handle_t *handle,
+                                               struct inode *inode,
+                                               struct ocfs2_extent_tree *et,
+                                               int type)
+{
+        struct buffer_head *root_bh = et->et_root_bh;
+        return et->et_root_journal_access(handle, inode, root_bh, type);
+}
 static inline int ocfs2_et_insert_check(struct inode *inode,
                                        struct ocfs2_extent_tree *et,
                                        struct ocfs2_extent_rec *rec)
@@ -402,12 +405,14 @@ struct ocfs2_path_item {
 #define OCFS2_MAX_PATH_DEPTH    5
 struct ocfs2_path {
-        int                     p_tree_depth;
+        int                             p_tree_depth;
-        struct ocfs2_path_item  p_node[OCFS2_MAX_PATH_DEPTH];
+        ocfs2_journal_access_func       p_root_access;
+        struct ocfs2_path_item          p_node[OCFS2_MAX_PATH_DEPTH];
 };
 #define path_root_bh(_path) ((_path)->p_node[0].bh)
 #define path_root_el(_path) ((_path)->p_node[0].el)
+#define path_root_access(_path)((_path)->p_root_access)
 #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
 #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
 #define path_num_items(_path) ((_path)->p_tree_depth + 1)
@@ -440,6 +445,8 @@ static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
         */
        if (keep_root)
                depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
+        else
+                path_root_access(path) = NULL;
        path->p_tree_depth = depth;
 }
@@ -465,6 +472,7 @@ static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
        BUG_ON(path_root_bh(dest) != path_root_bh(src));
        BUG_ON(path_root_el(dest) != path_root_el(src));
+        BUG_ON(path_root_access(dest) != path_root_access(src));
        ocfs2_reinit_path(dest, 1);
@@ -486,6 +494,7 @@ static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
        int i;
        BUG_ON(path_root_bh(dest) != path_root_bh(src));
+        BUG_ON(path_root_access(dest) != path_root_access(src));
        for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
                brelse(dest->p_node[i].bh);
@@ -521,7 +530,8 @@ static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
 }
 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
-                                         struct ocfs2_extent_list *root_el)
+                                         struct ocfs2_extent_list *root_el,
+                                         ocfs2_journal_access_func access)
 {
        struct ocfs2_path *path;
@@ -533,11 +543,48 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
                get_bh(root_bh);
                path_root_bh(path) = root_bh;
                path_root_el(path) = root_el;
+                path_root_access(path) = access;
        }
        return path;
 }
+static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
+{ /*
+   * Create a new path by handing ocfs2_new_path() the root buffer head,
+   * root extent list and root access function of an existing path.
+   * Returns whatever ocfs2_new_path() returns.
+   */
+        return ocfs2_new_path(path_root_bh(path), path_root_el(path),
+                              path_root_access(path));
+}
+static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
+{ /*
+   * Create a new path from an extent tree: the tree's et_root_bh,
+   * et_root_el and et_root_journal_access become the arguments to
+   * ocfs2_new_path().  Returns whatever ocfs2_new_path() returns.
+   */
+        return ocfs2_new_path(et->et_root_bh, et->et_root_el,
+                              et->et_root_journal_access);
+}
+/*
+ * Journal the buffer at depth idx.  All idx>0 are extent_blocks,
+ * otherwise it's the root_access function.
+ *
+ * The buffer is path->p_node[idx].bh and it is always journaled with
+ * OCFS2_JOURNAL_ACCESS_WRITE.  For idx == 0, a path whose root access
+ * function is NULL falls back to ocfs2_journal_access().  The return
+ * value is that of the access function that was selected.
+ *
+ * I don't like the way this function's name looks next to
+ * ocfs2_journal_access_path(), but I don't have a better one.
+ */
+static int ocfs2_path_bh_journal_access(handle_t *handle,
+                                        struct inode *inode,
+                                        struct ocfs2_path *path,
+                                        int idx)
+{
+        ocfs2_journal_access_func access = path_root_access(path);
+        if (!access)                    /* no root access function recorded in the path */
+                access = ocfs2_journal_access;
+        if (idx)                        /* not the root: overrides the choice made above */
+                access = ocfs2_journal_access_eb;
+        return access(handle, inode, path->p_node[idx].bh,
+                      OCFS2_JOURNAL_ACCESS_WRITE);
+}
 /*
 * Convenience function to journal all components in a path.
 */
@@ -550,8 +597,7 @@ static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
                goto out;
        for(i = 0; i < path_num_items(path); i++) {
-                ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
+                ret = ocfs2_path_bh_journal_access(handle, inode, path, i);
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
                if (ret < 0) {
                        mlog_errno(ret);
                        goto out;
@@ -686,6 +732,80 @@ struct ocfs2_merge_ctxt {
        int                     c_split_covers_rec;
 };
+static int ocfs2_validate_extent_block(struct super_block *sb,
+                                       struct buffer_head *bh)
+{ /*
+   * Sanity-check the extent block held in bh->b_data.  Checks, in order:
+   * the metadata ecc, the signature, that h_blkno matches bh->b_blocknr,
+   * and that h_fs_generation matches the superblock's fs_generation.
+   * Returns 0 when every check passes, the ecc error code when the ecc
+   * check fails, and -EINVAL for any of the later failures.
+   */
+        int rc;
+        struct ocfs2_extent_block *eb =
+                (struct ocfs2_extent_block *)bh->b_data;
+        mlog(0, "Validating extent block %llu\n",
+             (unsigned long long)bh->b_blocknr);
+        BUG_ON(!buffer_uptodate(bh));   /* the buffer must already be up to date here */
+        /*
+         * If the ecc fails, we return the error but otherwise
+         * leave the filesystem running.  We know any error is
+         * local to this block.  This failure is only logged with
+         * mlog(ML_ERROR); ocfs2_error() is not called for it.
+         */
+        rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check);
+        if (rc) {
+                mlog(ML_ERROR, "Checksum failed for extent block %llu\n",
+                     (unsigned long long)bh->b_blocknr);
+                return rc;
+        }
+        /*
+         * Errors after here are fatal.  Each failing check below is
+         * reported through ocfs2_error() and returns -EINVAL.
+         */
+        if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+                ocfs2_error(sb,
+                            "Extent block #%llu has bad signature %.*s",
+                            (unsigned long long)bh->b_blocknr, 7,       /* 7 is the precision for %.*s */
+                            eb->h_signature);
+                return -EINVAL;
+        }
+        if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {       /* block must record its own block number */
+                ocfs2_error(sb,
+                            "Extent block #%llu has an invalid h_blkno "
+                            "of %llu",
+                            (unsigned long long)bh->b_blocknr,
+                            (unsigned long long)le64_to_cpu(eb->h_blkno));
+                return -EINVAL;
+        }
+        if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) { /* generation must match this superblock's */
+                ocfs2_error(sb,
+                            "Extent block #%llu has an invalid "
+                            "h_fs_generation of #%u",
+                            (unsigned long long)bh->b_blocknr,
+                            le32_to_cpu(eb->h_fs_generation));
+                return -EINVAL;
+        }
+        return 0;
+}
+int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
+                            struct buffer_head **bh)
+{ /*
+   * Read block eb_blkno through ocfs2_read_block(), passing
+   * ocfs2_validate_extent_block as its validation function.
+   * Returns the result of ocfs2_read_block().
+   */
+        int rc;
+        struct buffer_head *tmp = *bh;  /* work on a copy so the caller's pointer is untouched on error */
+        rc = ocfs2_read_block(inode, eb_blkno, &tmp,
+                              ocfs2_validate_extent_block);
+        /*
+         * If ocfs2_read_block() got us a new bh, pass it up.  *bh is
+         * written only on success and only when the caller passed in
+         * a NULL *bh; a non-NULL *bh supplied by the caller is never
+         * overwritten here.
+         */
+        if (!rc && !*bh)
+                *bh = tmp;
+        return rc;
+}
 /*
 * How many free extents have we got before we need more meta data?
 */
@@ -705,8 +825,7 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
        last_eb_blk = ocfs2_et_get_last_eb_blk(et);
        if (last_eb_blk) {
-                retval = ocfs2_read_block(inode, last_eb_blk,
+                retval = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
-                                          &eb_bh);
                if (retval < 0) {
                        mlog_errno(retval);
                        goto bail;
@@ -768,8 +887,8 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
                        }
                        ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
-                        status = ocfs2_journal_access(handle, inode, bhs[i],
+                        status = ocfs2_journal_access_eb(handle, inode, bhs[i],
-                                                      OCFS2_JOURNAL_ACCESS_CREATE);
+                                                         OCFS2_JOURNAL_ACCESS_CREATE);
                        if (status < 0) {
                                mlog_errno(status);
                                goto bail;
@@ -908,15 +1027,12 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
        for(i = 0; i < new_blocks; i++) {
                bh = new_eb_bhs[i];
                eb = (struct ocfs2_extent_block *) bh->b_data;
-                if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+                /* ocfs2_create_new_meta_bhs() should create it right! */
-                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+                BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
-                        status = -EIO;
-                        goto bail;
-                }
                eb_el = &eb->h_list;
-                status = ocfs2_journal_access(handle, inode, bh,
+                status = ocfs2_journal_access_eb(handle, inode, bh,
-                                              OCFS2_JOURNAL_ACCESS_CREATE);
+                                                 OCFS2_JOURNAL_ACCESS_CREATE);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
@@ -955,21 +1071,21 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
         * journal_dirty erroring as it won't unless we've aborted the
         * handle (in which case we would never be here) so reserving
         * the write with journal_access is all we need to do. */
-        status = ocfs2_journal_access(handle, inode, *last_eb_bh,
+        status = ocfs2_journal_access_eb(handle, inode, *last_eb_bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
-        status = ocfs2_journal_access(handle, inode, et->et_root_bh,
+        status = ocfs2_et_root_journal_access(handle, inode, et,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                              OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
        if (eb_bh) {
-                status = ocfs2_journal_access(handle, inode, eb_bh,
+                status = ocfs2_journal_access_eb(handle, inode, eb_bh,
-                                              OCFS2_JOURNAL_ACCESS_WRITE);
+                                                 OCFS2_JOURNAL_ACCESS_WRITE);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
@@ -1052,17 +1168,14 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
        }
        eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
-        if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+        /* ocfs2_create_new_meta_bhs() should create it right! */
-                OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+        BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
-                status = -EIO;
-                goto bail;
-        }
        eb_el = &eb->h_list;
        root_el = et->et_root_el;
-        status = ocfs2_journal_access(handle, inode, new_eb_bh,
+        status = ocfs2_journal_access_eb(handle, inode, new_eb_bh,
-                                      OCFS2_JOURNAL_ACCESS_CREATE);
+                                         OCFS2_JOURNAL_ACCESS_CREATE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -1080,8 +1193,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
                goto bail;
        }
-        status = ocfs2_journal_access(handle, inode, et->et_root_bh,
+        status = ocfs2_et_root_journal_access(handle, inode, et,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                              OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -1176,18 +1289,13 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
                brelse(bh);
                bh = NULL;
-                status = ocfs2_read_block(inode, blkno, &bh);
+                status = ocfs2_read_extent_block(inode, blkno, &bh);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
                }
                eb = (struct ocfs2_extent_block *) bh->b_data;
-                if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-                        status = -EIO;
-                        goto bail;
-                }
                el = &eb->h_list;
                if (le16_to_cpu(el->l_next_free_rec) <
@@ -1540,7 +1648,7 @@ static int __ocfs2_find_path(struct inode *inode,
                brelse(bh);
                bh = NULL;
-                ret = ocfs2_read_block(inode, blkno, &bh);
+                ret = ocfs2_read_extent_block(inode, blkno, &bh);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -1548,11 +1656,6 @@ static int __ocfs2_find_path(struct inode *inode,
                eb = (struct ocfs2_extent_block *) bh->b_data;
                el = &eb->h_list;
-                if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-                        ret = -EIO;
-                        goto out;
-                }
                if (le16_to_cpu(el->l_next_free_rec) >
                    le16_to_cpu(el->l_count)) {
@@ -1860,25 +1963,23 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
        root_bh = left_path->p_node[subtree_index].bh;
        BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
-        ret = ocfs2_journal_access(handle, inode, root_bh,
+        ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                           subtree_index);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }
        for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
-                ret = ocfs2_journal_access(handle, inode,
+                ret = ocfs2_path_bh_journal_access(handle, inode,
-                                           right_path->p_node[i].bh,
+                                                   right_path, i);
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
-                ret = ocfs2_journal_access(handle, inode,
+                ret = ocfs2_path_bh_journal_access(handle, inode,
-                                           left_path->p_node[i].bh,
+                                                   left_path, i);
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -2102,8 +2203,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
        *ret_left_path = NULL;
-        left_path = ocfs2_new_path(path_root_bh(right_path),
+        left_path = ocfs2_new_path_from_path(right_path);
-                                   path_root_el(right_path));
        if (!left_path) {
                ret = -ENOMEM;
                mlog_errno(ret);
@@ -2398,9 +2498,9 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
                        return -EAGAIN;
                if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
-                        ret = ocfs2_journal_access(handle, inode,
+                        ret = ocfs2_journal_access_eb(handle, inode,
-                                                   path_leaf_bh(right_path),
+                                                      path_leaf_bh(right_path),
-                                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                                      OCFS2_JOURNAL_ACCESS_WRITE);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
@@ -2417,8 +2517,8 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
                 * We have to update i_last_eb_blk during the meta
                 * data delete.
                 */
-                ret = ocfs2_journal_access(handle, inode, et_root_bh,
+                ret = ocfs2_et_root_journal_access(handle, inode, et,
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
+                                                   OCFS2_JOURNAL_ACCESS_WRITE);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -2433,25 +2533,23 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
         */
        BUG_ON(right_has_empty && !del_right_subtree);
-        ret = ocfs2_journal_access(handle, inode, root_bh,
+        ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                           subtree_index);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }
        for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
-                ret = ocfs2_journal_access(handle, inode,
+                ret = ocfs2_path_bh_journal_access(handle, inode,
-                                           right_path->p_node[i].bh,
+                                                   right_path, i);
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
-                ret = ocfs2_journal_access(handle, inode,
+                ret = ocfs2_path_bh_journal_access(handle, inode,
-                                           left_path->p_node[i].bh,
+                                                   left_path, i);
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -2596,16 +2694,17 @@ out:
 static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
                                            handle_t *handle,
-                                            struct buffer_head *bh,
+                                            struct ocfs2_path *path)
-                                            struct ocfs2_extent_list *el)
 {
        int ret;
+        struct buffer_head *bh = path_leaf_bh(path);
+        struct ocfs2_extent_list *el = path_leaf_el(path);
        if (!ocfs2_is_empty_extent(&el->l_recs[0]))
                return 0;
-        ret = ocfs2_journal_access(handle, inode, bh,
+        ret = ocfs2_path_bh_journal_access(handle, inode, path,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                           path_num_items(path) - 1);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -2644,8 +2743,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
                goto out;
        }
-        left_path = ocfs2_new_path(path_root_bh(path),
+        left_path = ocfs2_new_path_from_path(path);
-                                   path_root_el(path));
        if (!left_path) {
                ret = -ENOMEM;
                mlog_errno(ret);
@@ -2654,8 +2752,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
        ocfs2_cp_path(left_path, path);
-        right_path = ocfs2_new_path(path_root_bh(path),
+        right_path = ocfs2_new_path_from_path(path);
-                                    path_root_el(path));
        if (!right_path) {
                ret = -ENOMEM;
                mlog_errno(ret);
@@ -2689,9 +2786,8 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
                 * Caller might still want to make changes to the
                 * tree root, so re-add it to the journal here.
                 */
-                ret = ocfs2_journal_access(handle, inode,
+                ret = ocfs2_path_bh_journal_access(handle, inode,
-                                           path_root_bh(left_path),
+                                                   left_path, 0);
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -2785,8 +2881,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
                 * We have a path to the left of this one - it needs
                 * an update too.
                 */
-                left_path = ocfs2_new_path(path_root_bh(path),
+                left_path = ocfs2_new_path_from_path(path);
-                                           path_root_el(path));
                if (!left_path) {
                        ret = -ENOMEM;
                        mlog_errno(ret);
@@ -2875,8 +2970,7 @@ rightmost_no_delete:
                 * it up front.
                 */
                ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
-                                                       path_leaf_bh(path),
+                                                       path);
-                                                       path_leaf_el(path));
                if (ret)
                        mlog_errno(ret);
                goto out;
@@ -3027,8 +3121,7 @@ static int ocfs2_get_right_path(struct inode *inode,
        /* This function shouldn't be called for the rightmost leaf. */
        BUG_ON(right_cpos == 0);
-        right_path = ocfs2_new_path(path_root_bh(left_path),
+        right_path = ocfs2_new_path_from_path(left_path);
-                                    path_root_el(left_path));
        if (!right_path) {
                ret = -ENOMEM;
                mlog_errno(ret);
@@ -3111,8 +3204,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
                root_bh = left_path->p_node[subtree_index].bh;
                BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
-                ret = ocfs2_journal_access(handle, inode, root_bh,
+                ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
+                                                   subtree_index);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -3120,17 +3213,15 @@ static int ocfs2_merge_rec_right(struct inode *inode,
                for (i = subtree_index + 1;
                     i < path_num_items(right_path); i++) {
-                        ret = ocfs2_journal_access(handle, inode,
+                        ret = ocfs2_path_bh_journal_access(handle, inode,
-                                                   right_path->p_node[i].bh,
+                                                           right_path, i);
-                                                   OCFS2_JOURNAL_ACCESS_WRITE);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
                        }
-                        ret = ocfs2_journal_access(handle, inode,
+                        ret = ocfs2_path_bh_journal_access(handle, inode,
-                                                   left_path->p_node[i].bh,
+                                                           left_path, i);
-                                                   OCFS2_JOURNAL_ACCESS_WRITE);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
@@ -3142,8 +3233,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
                right_rec = &el->l_recs[index + 1];
        }
-        ret = ocfs2_journal_access(handle, inode, bh,
+        ret = ocfs2_path_bh_journal_access(handle, inode, left_path,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                           path_num_items(left_path) - 1);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -3199,8 +3290,7 @@ static int ocfs2_get_left_path(struct inode *inode,
        /* This function shouldn't be called for the leftmost leaf. */
        BUG_ON(left_cpos == 0);
-        left_path = ocfs2_new_path(path_root_bh(right_path),
+        left_path = ocfs2_new_path_from_path(right_path);
-                                   path_root_el(right_path));
        if (!left_path) {
                ret = -ENOMEM;
                mlog_errno(ret);
@@ -3283,8 +3373,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
                root_bh = left_path->p_node[subtree_index].bh;
                BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
-                ret = ocfs2_journal_access(handle, inode, root_bh,
+                ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
+                                                   subtree_index);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -3292,17 +3382,15 @@ static int ocfs2_merge_rec_left(struct inode *inode,
                for (i = subtree_index + 1;
                     i < path_num_items(right_path); i++) {
-                        ret = ocfs2_journal_access(handle, inode,
+                        ret = ocfs2_path_bh_journal_access(handle, inode,
-                                                   right_path->p_node[i].bh,
+                                                           right_path, i);
-                                                   OCFS2_JOURNAL_ACCESS_WRITE);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
                        }
-                        ret = ocfs2_journal_access(handle, inode,
+                        ret = ocfs2_path_bh_journal_access(handle, inode,
-                                                   left_path->p_node[i].bh,
+                                                           left_path, i);
-                                                   OCFS2_JOURNAL_ACCESS_WRITE);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
@@ -3314,8 +3402,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
                        has_empty_extent = 1;
        }
-        ret = ocfs2_journal_access(handle, inode, bh,
+        ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                           path_num_items(right_path) - 1);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -3732,8 +3820,7 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
                 * leftmost leaf.
                 */
                if (left_cpos) {
-                        left_path = ocfs2_new_path(path_root_bh(right_path),
+                        left_path = ocfs2_new_path_from_path(right_path);
-                                                   path_root_el(right_path));
                        if (!left_path) {
                                ret = -ENOMEM;
                                mlog_errno(ret);
@@ -3781,7 +3868,7 @@ static void ocfs2_split_record(struct inode *inode,
        struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
        struct ocfs2_extent_rec *rec, *tmprec;
-        right_el = path_leaf_el(right_path);;
+        right_el = path_leaf_el(right_path);
        if (left_path)
                left_el = path_leaf_el(left_path);
@@ -3958,8 +4045,8 @@ static int ocfs2_do_insert_extent(struct inode *inode,
        el = et->et_root_el;
-        ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
+        ret = ocfs2_et_root_journal_access(handle, inode, et,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                           OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -3970,7 +4057,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
                goto out_update_clusters;
        }
-        right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
+        right_path = ocfs2_new_path_from_et(et);
        if (!right_path) {
                ret = -ENOMEM;
                mlog_errno(ret);
@@ -4020,8 +4107,8 @@ static int ocfs2_do_insert_extent(struct inode *inode,
                 * ocfs2_rotate_tree_right() might have extended the
                 * transaction without re-journaling our tree root.
                 */
-                ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
+                ret = ocfs2_et_root_journal_access(handle, inode, et,
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
+                                                   OCFS2_JOURNAL_ACCESS_WRITE);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -4082,8 +4169,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
                        goto out;
                if (left_cpos != 0) {
-                        left_path = ocfs2_new_path(path_root_bh(path),
+                        left_path = ocfs2_new_path_from_path(path);
-                                                   path_root_el(path));
                        if (!left_path)
                                goto out;
@@ -4097,8 +4183,15 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
                            le16_to_cpu(new_el->l_count)) {
                                bh = path_leaf_bh(left_path);
                                eb = (struct ocfs2_extent_block *)bh->b_data;
-                                OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
+                                ocfs2_error(inode->i_sb,
-                                                                 eb);
+                                            "Extent block #%llu has an "
+                                            "invalid l_next_free_rec of "
+                                            "%d.  It should have "
+                                            "matched the l_count of %d",
+                                            (unsigned long long)le64_to_cpu(eb->h_blkno),
+                                            le16_to_cpu(new_el->l_next_free_rec),
+                                            le16_to_cpu(new_el->l_count));
+                                status = -EINVAL;
                                goto out;
                        }
                        rec = &new_el->l_recs[
@@ -4132,8 +4225,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
                if (right_cpos == 0)
                        goto out;
-                right_path = ocfs2_new_path(path_root_bh(path),
+                right_path = ocfs2_new_path_from_path(path);
-                                            path_root_el(path));
                if (!right_path)
                        goto out;
@@ -4147,8 +4239,12 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
                        if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
                                bh = path_leaf_bh(right_path);
                                eb = (struct ocfs2_extent_block *)bh->b_data;
-                                OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
+                                ocfs2_error(inode->i_sb,
-                                                                 eb);
+                                            "Extent block #%llu has an "
+                                            "invalid l_next_free_rec of %d",
+                                            (unsigned long long)le64_to_cpu(eb->h_blkno),
+                                            le16_to_cpu(new_el->l_next_free_rec));
+                                status = -EINVAL;
                                goto out;
                        }
                        rec = &new_el->l_recs[1];
@@ -4294,7 +4390,9 @@ static int ocfs2_figure_insert_type(struct inode *inode,
                 * ocfs2_figure_insert_type() and ocfs2_add_branch()
                 * may want it later.
                 */
-                ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), &bh);
+                ret = ocfs2_read_extent_block(inode,
+                                              ocfs2_et_get_last_eb_blk(et),
+                                              &bh);
                if (ret) {
                        mlog_exit(ret);
                        goto out;
@@ -4320,7 +4418,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
                return 0;
        }
-        path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
+        path = ocfs2_new_path_from_et(et);
        if (!path) {
                ret = -ENOMEM;
                mlog_errno(ret);
@@ -4531,9 +4629,9 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
        BUG_ON(num_bits > clusters_to_add);
-        /* reserve our write early -- insert_extent may update the inode */
+        /* reserve our write early -- insert_extent may update the tree root */
-        status = ocfs2_journal_access(handle, inode, et->et_root_bh,
+        status = ocfs2_et_root_journal_access(handle, inode, et,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                              OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -4760,20 +4858,15 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
        if (path->p_tree_depth) {
                struct ocfs2_extent_block *eb;
-                ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
+                ret = ocfs2_read_extent_block(inode,
-                                       &last_eb_bh);
+                                              ocfs2_et_get_last_eb_blk(et),
+                                              &last_eb_bh);
                if (ret) {
                        mlog_exit(ret);
                        goto out;
                }
                eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
-                if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-                        ret = -EROFS;
-                        goto out;
-                }
                rightmost_el = &eb->h_list;
        } else
                rightmost_el = path_root_el(path);
@@ -4854,7 +4947,7 @@ int ocfs2_mark_extent_written(struct inode *inode,
        if (et->et_ops == &ocfs2_dinode_et_ops)
                ocfs2_extent_map_trunc(inode, 0);
-        left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
+        left_path = ocfs2_new_path_from_et(et);
        if (!left_path) {
                ret = -ENOMEM;
                mlog_errno(ret);
@@ -4918,8 +5011,9 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
        depth = path->p_tree_depth;
        if (depth > 0) {
-                ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
+                ret = ocfs2_read_extent_block(inode,
-                                       &last_eb_bh);
+                                              ocfs2_et_get_last_eb_blk(et),
+                                              &last_eb_bh);
                if (ret < 0) {
                        mlog_errno(ret);
                        goto out;
@@ -5025,8 +5119,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
                }
                if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
-                        left_path = ocfs2_new_path(path_root_bh(path),
+                        left_path = ocfs2_new_path_from_path(path);
-                                                   path_root_el(path));
                        if (!left_path) {
                                ret = -ENOMEM;
                                mlog_errno(ret);
@@ -5135,7 +5228,7 @@ int ocfs2_remove_extent(struct inode *inode,
        ocfs2_extent_map_trunc(inode, 0);
-        path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
+        path = ocfs2_new_path_from_et(et);
        if (!path) {
                ret = -ENOMEM;
                mlog_errno(ret);
@@ -5255,6 +5348,81 @@ out:
        return ret;
 }
+/*
+ * Remove len clusters starting at logical cluster cpos from the extent
+ * tree described by et, and append the freed physical range to the
+ * truncate log.
+ *
+ * @inode:     inode the tree belongs to; quota space is released on it.
+ * @et:        extent tree to remove the range from.
+ * @cpos:      first logical cluster of the range.
+ * @phys_cpos: physical cluster backing cpos; converted to a block number
+ *             for the truncate log record.
+ * @len:       number of clusters to remove.
+ * @dealloc:   dealloc context passed through to ocfs2_remove_extent().
+ *
+ * The truncate log inode's i_mutex is held for the whole operation and
+ * all tree updates run inside one transaction.  Returns 0 on success or
+ * a negative error code.
+ */
+int ocfs2_remove_btree_range(struct inode *inode,
+                             struct ocfs2_extent_tree *et,
+                             u32 cpos, u32 phys_cpos, u32 len,
+                             struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+        int ret;
+        u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct inode *tl_inode = osb->osb_tl_inode;
+        handle_t *handle;
+        struct ocfs2_alloc_context *meta_ac = NULL;
+        ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac);
+        if (ret) {
+                mlog_errno(ret);
+                return ret;
+        }
+        mutex_lock(&tl_inode->i_mutex);
+        /* Make room in the truncate log before starting the transaction. */
+        if (ocfs2_truncate_log_needs_flush(osb)) {
+                ret = __ocfs2_flush_truncate_log(osb);
+                if (ret < 0) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+        }
+        handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
+        if (IS_ERR(handle)) {
+                ret = PTR_ERR(handle);
+                mlog_errno(ret);
+                goto out;
+        }
+        /*
+         * The transaction is running from here on, so every failure must
+         * leave through out_commit; jumping to out would leak the handle.
+         */
+        ret = ocfs2_et_root_journal_access(handle, inode, et,
+                                           OCFS2_JOURNAL_ACCESS_WRITE);
+        if (ret) {
+                mlog_errno(ret);
+                goto out_commit;
+        }
+        vfs_dq_free_space_nodirty(inode,
+                                  ocfs2_clusters_to_bytes(inode->i_sb, len));
+        ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac,
+                                  dealloc);
+        if (ret) {
+                mlog_errno(ret);
+                goto out_commit;
+        }
+        ocfs2_et_update_clusters(inode, et, -len);
+        ret = ocfs2_journal_dirty(handle, et->et_root_bh);
+        if (ret) {
+                mlog_errno(ret);
+                goto out_commit;
+        }
+        ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
+        if (ret)
+                mlog_errno(ret);
+out_commit:
+        ocfs2_commit_trans(osb, handle);
+out:
+        mutex_unlock(&tl_inode->i_mutex);
+        if (meta_ac)
+                ocfs2_free_alloc_context(meta_ac);
+        return ret;
+}
 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
 {
        struct buffer_head *tl_bh = osb->osb_tl_bh;
@@ -5308,13 +5476,13 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
        start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
        di = (struct ocfs2_dinode *) tl_bh->b_data;
-        tl = &di->id2.i_dealloc;
-        if (!OCFS2_IS_VALID_DINODE(di)) {
-                OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
-                status = -EIO;
-                goto bail;
-        }
+        /* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
+         * by the underlying call to ocfs2_read_inode_block(), so any
+         * corruption is a code bug */
+        BUG_ON(!OCFS2_IS_VALID_DINODE(di));
+        tl = &di->id2.i_dealloc;
        tl_count = le16_to_cpu(tl->tl_count);
        mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
                        tl_count == 0,
@@ -5332,8 +5500,8 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
                goto bail;
        }
-        status = ocfs2_journal_access(handle, tl_inode, tl_bh,
+        status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -5394,8 +5562,8 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
        while (i >= 0) {
                /* Caller has given us at least enough credits to
                 * update the truncate log dinode */
-                status = ocfs2_journal_access(handle, tl_inode, tl_bh,
+                status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
-                                              OCFS2_JOURNAL_ACCESS_WRITE);
+                                                 OCFS2_JOURNAL_ACCESS_WRITE);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
@@ -5464,13 +5632,13 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
        BUG_ON(mutex_trylock(&tl_inode->i_mutex));
        di = (struct ocfs2_dinode *) tl_bh->b_data;
-        tl = &di->id2.i_dealloc;
-        if (!OCFS2_IS_VALID_DINODE(di)) {
-                OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
-                status = -EIO;
-                goto out;
-        }
+        /* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
+         * by the underlying call to ocfs2_read_inode_block(), so any
+         * corruption is a code bug */
+        BUG_ON(!OCFS2_IS_VALID_DINODE(di));
+        tl = &di->id2.i_dealloc;
        num_to_flush = le16_to_cpu(tl->tl_used);
        mlog(0, "Flush %u records from truncate log #%llu\n",
             num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
@@ -5586,7 +5754,7 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
                goto bail;
        }
-        status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
+        status = ocfs2_read_inode_block(inode, &bh);
        if (status < 0) {
                iput(inode);
                mlog_errno(status);
@@ -5625,13 +5793,13 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
        }
        di = (struct ocfs2_dinode *) tl_bh->b_data;
-        tl = &di->id2.i_dealloc;
-        if (!OCFS2_IS_VALID_DINODE(di)) {
-                OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
-                status = -EIO;
-                goto bail;
-        }
+        /* tl_bh is loaded from ocfs2_get_truncate_log_info().  It's
+         * validated by the underlying call to ocfs2_read_inode_block(),
+         * so any corruption is a code bug */
+        BUG_ON(!OCFS2_IS_VALID_DINODE(di));
+        tl = &di->id2.i_dealloc;
        if (le16_to_cpu(tl->tl_used)) {
                mlog(0, "We'll have %u logs to recover\n",
                     le16_to_cpu(tl->tl_used));
@@ -5651,6 +5819,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
                 * tl_used. */
                tl->tl_used = 0;
+                ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
                status = ocfs2_write_block(osb, tl_bh, tl_inode);
                if (status < 0) {
                        mlog_errno(status);
@@ -5800,7 +5969,10 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
 */
 /*
- * Describes a single block free from a suballocator
+ * Describe a single bit freed from a suballocator.  For the block
+ * suballocators, it represents one block.  For the global cluster
+ * allocator, it represents a run of clusters and free_bit holds the
+ * number of clusters.
 */
 struct ocfs2_cached_block_free {
        struct ocfs2_cached_block_free          *free_next;
@@ -5815,10 +5987,10 @@ struct ocfs2_per_slot_free_list {
        struct ocfs2_cached_block_free          *f_first;
 };
-static int ocfs2_free_cached_items(struct ocfs2_super *osb,
+static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
-                                   int sysfile_type,
+                                    int sysfile_type,
-                                   int slot,
+                                    int slot,
-                                   struct ocfs2_cached_block_free *head)
+                                    struct ocfs2_cached_block_free *head)
 {
        int ret;
        u64 bg_blkno;
@@ -5893,6 +6065,82 @@ out:
        return ret;
 }
+/*
+ * Record a cluster range on ctxt's global-allocator list so that it can
+ * be released later by ocfs2_run_deallocs().
+ *
+ * @ctxt:  dealloc context that collects the entry.
+ * @blkno: stored in the entry's free_blk field.
+ * @bit:   stored in the entry's free_bit field.
+ *
+ * Returns 0 on success, or -ENOMEM if the entry cannot be allocated.
+ */
+int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+                                u64 blkno, unsigned int bit)
+{
+        struct ocfs2_cached_block_free *entry;
+        entry = kmalloc(sizeof(*entry), GFP_NOFS);
+        if (!entry) {
+                mlog_errno(-ENOMEM);
+                return -ENOMEM;
+        }
+        mlog(0, "Insert clusters: (bit %u, blk %llu)\n",
+             bit, (unsigned long long)blkno);
+        /* Push the new entry onto the head of the singly linked list. */
+        entry->free_next = ctxt->c_global_allocator;
+        entry->free_blk = blkno;
+        entry->free_bit = bit;
+        ctxt->c_global_allocator = entry;
+        return 0;
+}
+/*
+ * Move every entry on the list starting at head into the truncate log,
+ * freeing each list entry as it is consumed.
+ *
+ * @osb:  superblock info; supplies the truncate log inode.
+ * @head: first entry of the cached cluster list (may be NULL).
+ *
+ * The truncate log inode's i_mutex is held while entries are appended.
+ * Each entry gets its own transaction.  Processing stops at the first
+ * error; the remaining entries are then freed without being logged.
+ * Returns 0 on success or the negative error that stopped the loop.
+ */
+static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
+                                      struct ocfs2_cached_block_free *head)
+{
+        struct ocfs2_cached_block_free *tmp;
+        struct inode *tl_inode = osb->osb_tl_inode;
+        handle_t *handle;
+        int ret = 0;
+        mutex_lock(&tl_inode->i_mutex);
+        while (head) {
+                /* Flush the truncate log first if it reports needing it. */
+                if (ocfs2_truncate_log_needs_flush(osb)) {
+                        ret = __ocfs2_flush_truncate_log(osb);
+                        if (ret < 0) {
+                                mlog_errno(ret);
+                                break;
+                        }
+                }
+                handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
+                if (IS_ERR(handle)) {
+                        ret = PTR_ERR(handle);
+                        mlog_errno(ret);
+                        break;
+                }
+                ret = ocfs2_truncate_log_append(osb, handle, head->free_blk,
+                                                head->free_bit);
+                /* Commit and drop the entry whether or not the append worked;
+                 * ret is only examined afterwards. */
+                ocfs2_commit_trans(osb, handle);
+                tmp = head;
+                head = head->free_next;
+                kfree(tmp);
+                if (ret < 0) {
+                        mlog_errno(ret);
+                        break;
+                }
+        }
+        mutex_unlock(&tl_inode->i_mutex);
+        while (head) {
+                /* Premature exit may have left some dangling items. */
+                tmp = head;
+                head = head->free_next;
+                kfree(tmp);
+        }
+        return ret;
+}
 int ocfs2_run_deallocs(struct ocfs2_super *osb,
                       struct ocfs2_cached_dealloc_ctxt *ctxt)
 {
@@ -5908,8 +6156,10 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
                if (fl->f_first) {
                        mlog(0, "Free items: (type %u, slot %d)\n",
                             fl->f_inode_type, fl->f_slot);
-                        ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type,
+                        ret2 = ocfs2_free_cached_blocks(osb,
-                                                       fl->f_slot, fl->f_first);
+                                                        fl->f_inode_type,
+                                                        fl->f_slot,
+                                                        fl->f_first);
                        if (ret2)
                                mlog_errno(ret2);
                        if (!ret)
@@ -5920,6 +6170,17 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
                kfree(fl);
        }
+        if (ctxt->c_global_allocator) {
+                ret2 = ocfs2_free_cached_clusters(osb,
+                                                  ctxt->c_global_allocator);
+                if (ret2)
+                        mlog_errno(ret2);
+                if (!ret)
+                        ret = ret2;
+                ctxt->c_global_allocator = NULL;
+        }
        return ret;
 }
@@ -6075,11 +6336,10 @@ static int ocfs2_find_new_last_ext_blk(struct inode *inode,
        eb = (struct ocfs2_extent_block *) bh->b_data;
        el = &eb->h_list;
-        if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-                OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+        /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
-                ret = -EROFS;
+         * Any corruption is a code bug. */
-                goto out;
+        BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
-        }
        *new_last_eb = bh;
        get_bh(*new_last_eb);
@@ -6326,8 +6586,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
        }
        if (last_eb_bh) {
-                status = ocfs2_journal_access(handle, inode, last_eb_bh,
+                status = ocfs2_journal_access_eb(handle, inode, last_eb_bh,
-                                              OCFS2_JOURNAL_ACCESS_WRITE);
+                                                 OCFS2_JOURNAL_ACCESS_WRITE);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
@@ -6350,6 +6610,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
                goto bail;
        }
+        vfs_dq_free_space_nodirty(inode,
+                        ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
        spin_lock(&OCFS2_I(inode)->ip_lock);
        OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
                                      clusters_to_del;
@@ -6436,11 +6698,6 @@ static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
                mlog_errno(ret);
        else if (ocfs2_should_order_data(inode)) {
                ret = ocfs2_jbd2_file_inode(handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-                ret = walk_page_buffers(handle, page_buffers(page),
-                                        from, to, &partial,
-                                        ocfs2_journal_dirty_data);
-#endif
                if (ret < 0)
                        mlog_errno(ret);
        }
@@ -6663,6 +6920,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
        struct page **pages = NULL;
        loff_t end = osb->s_clustersize;
        struct ocfs2_extent_tree et;
+        int did_quota = 0;
        has_data = i_size_read(inode) ? 1 : 0;
@@ -6682,15 +6940,16 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
                }
        }
-        handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS);
+        handle = ocfs2_start_trans(osb,
+                                   ocfs2_inline_to_extents_credits(osb->sb));
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                mlog_errno(ret);
                goto out_unlock;
        }
-        ret = ocfs2_journal_access(handle, inode, di_bh,
+        ret = ocfs2_journal_access_di(handle, inode, di_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out_commit;
@@ -6701,6 +6960,13 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
                unsigned int page_end;
                u64 phys;
+                if (vfs_dq_alloc_space_nodirty(inode,
+                                       ocfs2_clusters_to_bytes(osb->sb, 1))) {
+                        ret = -EDQUOT;
+                        goto out_commit;
+                }
+                did_quota = 1;
                ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
                                           &num);
                if (ret) {
@@ -6774,6 +7040,10 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
        }
 out_commit:
+        if (ret < 0 && did_quota)
+                vfs_dq_free_space_nodirty(inode,
+                                          ocfs2_clusters_to_bytes(osb->sb, 1));
        ocfs2_commit_trans(osb, handle);
 out_unlock:
@@ -6813,7 +7083,8 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
        new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
                                                     i_size_read(inode));
-        path = ocfs2_new_path(fe_bh, &di->id2.i_list);
+        path = ocfs2_new_path(fe_bh, &di->id2.i_list,
+                              ocfs2_journal_access_di);
        if (!path) {
                status = -ENOMEM;
                mlog_errno(status);
@@ -6984,20 +7255,14 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
        ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
        if (fe->id2.i_list.l_tree_depth) {
-                status = ocfs2_read_block(inode, le64_to_cpu(fe->i_last_eb_blk),
+                status = ocfs2_read_extent_block(inode,
-                                          &last_eb_bh);
+                                                 le64_to_cpu(fe->i_last_eb_blk),
+                                                 &last_eb_bh);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
                }
                eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
-                if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-                        brelse(last_eb_bh);
-                        status = -EIO;
-                        goto bail;
-                }
        }
        (*tc)->tc_last_eb_bh = last_eb_bh;
@@ -7052,8 +7317,8 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
                goto out;
        }
-        ret = ocfs2_journal_access(handle, inode, di_bh,
+        ret = ocfs2_journal_access_di(handle, inode, di_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out_commit;
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 70257c84cfbe..cceff5c37f47 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -45,7 +45,9 @@
 *
 * ocfs2_extent_tree contains info for the root of the b-tree, it must have a
 * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
- * functions.
+ * functions.  With metadata ecc, we now call different journal_access
+ * functions for each type of metadata, so it must have the
+ * root_journal_access function.
 * ocfs2_extent_tree_operations abstract the normal operations we do for
 * the root of extent b-tree.
 */
@@ -54,6 +56,7 @@ struct ocfs2_extent_tree {
        struct ocfs2_extent_tree_operations     *et_ops;
        struct buffer_head                      *et_root_bh;
        struct ocfs2_extent_list                *et_root_el;
+        ocfs2_journal_access_func               et_root_journal_access;
        void                                    *et_object;
        unsigned int                            et_max_leaf_clusters;
 };
@@ -68,10 +71,18 @@ void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
 void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
                                       struct inode *inode,
                                       struct buffer_head *bh);
+struct ocfs2_xattr_value_buf;
 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
                                        struct inode *inode,
-                                        struct buffer_head *bh,
+                                        struct ocfs2_xattr_value_buf *vb);
-                                        struct ocfs2_xattr_value_root *xv);
+/*
+ * Read an extent block into *bh.  If *bh is NULL, a bh will be
+ * allocated.  This is a cached read.  The extent block will be validated
+ * with ocfs2_validate_extent_block().
+ */
+int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
+                            struct buffer_head **bh);
 struct ocfs2_alloc_context;
 int ocfs2_insert_extent(struct ocfs2_super *osb,
@@ -110,6 +121,11 @@ int ocfs2_remove_extent(struct inode *inode,
                        u32 cpos, u32 len, handle_t *handle,
                        struct ocfs2_alloc_context *meta_ac,
                        struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_remove_btree_range(struct inode *inode,
+                             struct ocfs2_extent_tree *et,
+                             u32 cpos, u32 phys_cpos, u32 len,
+                             struct ocfs2_cached_dealloc_ctxt *dealloc);
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
                           struct inode *inode,
                           struct ocfs2_extent_tree *et);
@@ -167,10 +183,18 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb);
 */
 struct ocfs2_cached_dealloc_ctxt {
        struct ocfs2_per_slot_free_list         *c_first_suballocator;
+        struct ocfs2_cached_block_free          *c_global_allocator;
 };
+/* Reset a dealloc context so that both of its cached free lists are empty. */
 static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
 {
         c->c_first_suballocator = NULL;
+        c->c_global_allocator = NULL;
+}
+int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+                                u64 blkno, unsigned int bit);
+/* Return non-zero if any entries are queued on c's global-allocator list. */
+static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
+{
+        return c->c_global_allocator != NULL;
 }
 int ocfs2_run_deallocs(struct ocfs2_super *osb,
                       struct ocfs2_cached_dealloc_ctxt *ctxt);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index c22543b33420..a067a6cffb01 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -27,6 +27,7 @@
 #include <linux/swap.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/mpage.h>
+#include <linux/quotaops.h>
 #define MLOG_MASK_PREFIX ML_FILE_IO
 #include <cluster/masklog.h>
@@ -68,20 +69,13 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
                goto bail;
        }
-        status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
+        status = ocfs2_read_inode_block(inode, &bh);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
        fe = (struct ocfs2_dinode *) bh->b_data;
-        if (!OCFS2_IS_VALID_DINODE(fe)) {
-                mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
-                     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
-                     fe->i_signature);
-                goto bail;
-        }
        if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
                                                    le32_to_cpu(fe->i_clusters))) {
                mlog(ML_ERROR, "block offset is outside the allocated size: "
@@ -262,7 +256,7 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
        BUG_ON(!PageLocked(page));
        BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
-        ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
+        ret = ocfs2_read_inode_block(inode, &di_bh);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -481,12 +475,6 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
        if (ocfs2_should_order_data(inode)) {
                ret = ocfs2_jbd2_file_inode(handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-                ret = walk_page_buffers(handle,
-                                        page_buffers(page),
-                                        from, to, NULL,
-                                        ocfs2_journal_dirty_data);
-#endif
                if (ret < 0)
                        mlog_errno(ret);
        }
@@ -1072,15 +1060,8 @@ static void ocfs2_write_failure(struct inode *inode,
                tmppage = wc->w_pages[i];
                if (page_has_buffers(tmppage)) {
-                        if (ocfs2_should_order_data(inode)) {
+                        if (ocfs2_should_order_data(inode))
                                ocfs2_jbd2_file_inode(wc->w_handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-                                walk_page_buffers(wc->w_handle,
-                                                  page_buffers(tmppage),
-                                                  from, to, NULL,
-                                                  ocfs2_journal_dirty_data);
-#endif
-                        }
                        block_commit_write(tmppage, from, to);
                }
@@ -1531,8 +1512,8 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
                goto out;
        }
-        ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
+        ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                ocfs2_commit_trans(osb, handle);
@@ -1750,15 +1731,20 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
        wc->w_handle = handle;
+        if (clusters_to_alloc && vfs_dq_alloc_space_nodirty(inode,
+                        ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc))) {
+                ret = -EDQUOT;
+                goto out_commit;
+        }
        /*
         * We don't want this to fail in ocfs2_write_end(), so do it
         * here.
         */
-        ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
+        ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
-                goto out_commit;
+                goto out_quota;
        }
        /*
@@ -1771,14 +1757,14 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
                                         mmap_page);
        if (ret) {
                mlog_errno(ret);
-                goto out_commit;
+                goto out_quota;
        }
        ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
                                          len);
        if (ret) {
                mlog_errno(ret);
-                goto out_commit;
+                goto out_quota;
        }
        if (data_ac)
@@ -1790,6 +1776,10 @@ success:
        *pagep = wc->w_target_page;
        *fsdata = wc;
        return 0;
+out_quota:
+        if (clusters_to_alloc)
+                vfs_dq_free_space(inode,
+                          ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
 out_commit:
        ocfs2_commit_trans(osb, handle);
@@ -1919,15 +1909,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
                }
                if (page_has_buffers(tmppage)) {
-                        if (ocfs2_should_order_data(inode)) {
+                        if (ocfs2_should_order_data(inode))
                                ocfs2_jbd2_file_inode(wc->w_handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-                                walk_page_buffers(wc->w_handle,
-                                                  page_buffers(tmppage),
-                                                  from, to, NULL,
-                                                  ocfs2_journal_dirty_data);
-#endif
-                        }
                        block_commit_write(tmppage, from, to);
                }
        }
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
new file mode 100644
index 000000000000..2a947c44e594
--- /dev/null
+++ b/fs/ocfs2/blockcheck.c
@@ -0,0 +1,477 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * blockcheck.c
+ *
+ * Checksum and ECC codes for the OCFS2 userspace library.
+ *
+ * Copyright (C) 2006, 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/crc32.h>
+#include <linux/buffer_head.h>
+#include <linux/bitops.h>
+#include <asm/byteorder.h>
+#include <cluster/masklog.h>
+#include "ocfs2.h"
+#include "blockcheck.h"
+/*
+ * We use the following conventions:
+ *
+ * d = # data bits
+ * p = # parity bits
+ * c = # total code bits (d + p)
+ */
+/*
+ * Calculate the bit offset in the hamming code buffer based on the bit's
+ * offset in the data buffer.  Since the hamming code reserves all
+ * power-of-two bits for parity, the data bit number and the code bit
+ * number are offset by all the parity bits beforehand.
+ *
+ * Recall that bit numbers in hamming code are 1-based.  This function
+ * takes the 0-based data bit from the caller.
+ *
+ * An example.  Take the first data bit (0-based bit 0, i.e. 1-based bit 1).
+ * Code bit 1 is a power of two (2^0), so it's a parity bit.  Code bit 2 is
+ * a power of two (2^1), so it's a parity bit.  3 is not a power of two.  So
+ * the first bit of the data buffer ends up as bit 3 in the code buffer.
+ *
+ * The caller can pass in p_cache if it wants to keep track of the most recent
+ * number of parity bits added.  This allows the function to start the
+ * calculation at the last place.
+ */
+static unsigned int calc_code_bit(unsigned int i, unsigned int *p_cache)
+{
+        unsigned int b, p = 0;
+        /*
+         * Data bits are 0-based, but we're talking code bits, which
+         * are 1-based.
+         */
+        b = i + 1;
+        /* Use the cache if it is there */
+        if (p_cache)
+                p = *p_cache;
+        b += p;
+        /*
+         * For every power of two below our bit number, bump our bit.
+         *
+         * We compare with (b + 1) because we have to compare with what b
+         * would be _if_ it were bumped up by the parity bit.  Capice?
+         *
+         * p is set above.
+         */
+        for (; (1 << p) < (b + 1); p++)
+                b++;
+        if (p_cache)
+                *p_cache = p;
+        return b;
+}
+/*
+ * This is the low level encoder function.  It can be called across
+ * multiple hunks just like the crc32 code.  'd' is the number of bits
+ * _in_this_hunk_.  nr is the bit offset of this hunk.  So, if you had
+ * two 512B buffers, you would do it like so:
+ *
+ * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0);
+ * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8);
+ *
+ * If you just have one buffer, use ocfs2_hamming_encode_block().
+ */
+u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, unsigned int nr)
+{
+        unsigned int i, b, p = 0;
+        BUG_ON(!d);
+        /*
+         * b is the hamming code bit number.  Hamming code specifies a
+         * 1-based array, but C uses 0-based.  So 'i' is for C, and 'b' is
+         * for the algorithm.
+         *
+         * The i++ in the for loop is so that the start offset passed
+         * to ocfs2_find_next_bit() is one greater than the previously
+         * found bit.
+         */
+        for (i = 0; (i = ocfs2_find_next_bit(data, d, i)) < d; i++)
+        {
+                /*
+                 * i is the offset in this hunk, nr + i is the total bit
+                 * offset.
+                 */
+                b = calc_code_bit(nr + i, &p);
+                /*
+                 * Data bits in the resultant code are checked by
+                 * parity bits that are part of the bit number
+                 * representation.  Huh?
+                 *
+                 * <wikipedia href="http://en.wikipedia.org/wiki/Hamming_code">
+                 * In other words, the parity bit at position 2^k
+                 * checks bits in positions having bit k set in
+                 * their binary representation.  Conversely, for
+                 * instance, bit 13, i.e. 1101(2), is checked by
+                 * bits 1000(2) = 8, 0100(2)=4 and 0001(2) = 1.
+                 * </wikipedia>
+                 *
+                 * Note that 'k' is the _code_ bit number.  'b' in
+                 * our loop.
+                 */
+                parity ^= b;
+        }
+        /* While the data buffer was treated as little endian, the
+         * return value is in host endian. */
+        return parity;
+}
+u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize)
+{
+        return ocfs2_hamming_encode(0, data, blocksize * 8, 0);
+}
+/*
+ * Like ocfs2_hamming_encode(), this can handle hunks.  nr is the bit
+ * offset of the current hunk.  If the bit to be fixed is not part of the
+ * current hunk, this does nothing.
+ *
+ * If you only have one hunk, use ocfs2_hamming_fix_block().
+ */
+void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
+                       unsigned int fix)
+{
+        unsigned int i, b;
+        BUG_ON(!d);
+        /*
+         * If the bit to fix has an hweight of 1, it's a parity bit.  One
+         * busted parity bit is its own error.  Nothing to do here.
+         */
+        if (hweight32(fix) == 1)
+                return;
+        /*
+         * nr + d is the bit right past the data hunk we're looking at.
+         * If fix is at or after that, nothing to do
+         */
+        if (fix >= calc_code_bit(nr + d, NULL))
+                return;
+        /*
+         * nr is the offset in the data hunk we're starting at.  Let's
+         * start b at the offset in the code buffer.  See hamming_encode()
+         * for a more detailed description of 'b'.
+         */
+        b = calc_code_bit(nr, NULL);
+        /* If the fix is before this hunk, nothing to do */
+        if (fix < b)
+                return;
+        for (i = 0; i < d; i++, b++)
+        {
+                /* Skip past parity bits */
+                while (hweight32(b) == 1)
+                        b++;
+                /*
+                 * i is the offset in this data hunk.
+                 * nr + i is the offset in the total data buffer.
+                 * b is the offset in the total code buffer.
+                 *
+                 * Thus, when b == fix, bit i in the current hunk needs
+                 * fixing.
+                 */
+                if (b == fix)
+                {
+                        if (ocfs2_test_bit(i, data))
+                                ocfs2_clear_bit(i, data);
+                        else
+                                ocfs2_set_bit(i, data);
+                        break;
+                }
+        }
+}
+void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
+                             unsigned int fix)
+{
+        ocfs2_hamming_fix(data, blocksize * 8, 0, fix);
+}
+/*
+ * This function generates check information for a block.
+ * data is the block to be checked.  bc is a pointer to the
+ * ocfs2_block_check structure describing the crc32 and the ecc.
+ *
+ * bc should be a pointer inside data, as the function will
+ * take care of zeroing it before calculating the check information.  If
+ * bc does not point inside data, the caller must make sure any inline
+ * ocfs2_block_check structures are zeroed.
+ *
+ * The data buffer must be in on-disk endian (little endian for ocfs2).
+ * bc will be filled with little-endian values and will be ready to go to
+ * disk.
+ */
+void ocfs2_block_check_compute(void *data, size_t blocksize,
+                               struct ocfs2_block_check *bc)
+{
+        u32 crc;
+        u32 ecc;
+        memset(bc, 0, sizeof(struct ocfs2_block_check));
+        crc = crc32_le(~0, data, blocksize);
+        ecc = ocfs2_hamming_encode_block(data, blocksize);
+        /*
+         * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
+         * larger than 16 bits.
+         */
+        BUG_ON(ecc > USHORT_MAX);
+        bc->bc_crc32e = cpu_to_le32(crc);
+        bc->bc_ecc = cpu_to_le16((u16)ecc);
+}
+/*
+ * This function validates existing check information.  Like _compute,
+ * the function will take care of zeroing bc before calculating check codes.
+ * If bc is not a pointer inside data, the caller must have zeroed any
+ * inline ocfs2_block_check structures.
+ *
+ * Again, the data passed in should be the on-disk endian.
+ */
+int ocfs2_block_check_validate(void *data, size_t blocksize,
+                               struct ocfs2_block_check *bc)
+{
+        int rc = 0;
+        struct ocfs2_block_check check;
+        u32 crc, ecc;
+        check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
+        check.bc_ecc = le16_to_cpu(bc->bc_ecc);
+        memset(bc, 0, sizeof(struct ocfs2_block_check));
+        /* Fast path - if the crc32 validates, we're good to go */
+        crc = crc32_le(~0, data, blocksize);
+        if (crc == check.bc_crc32e)
+                goto out;
+        mlog(ML_ERROR,
+             "CRC32 failed: stored: %u, computed %u.  Applying ECC.\n",
+             (unsigned int)check.bc_crc32e, (unsigned int)crc);
+        /* Ok, try ECC fixups */
+        ecc = ocfs2_hamming_encode_block(data, blocksize);
+        ocfs2_hamming_fix_block(data, blocksize, ecc ^ check.bc_ecc);
+        /* And check the crc32 again */
+        crc = crc32_le(~0, data, blocksize);
+        if (crc == check.bc_crc32e)
+                goto out;
+        mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
+             (unsigned int)check.bc_crc32e, (unsigned int)crc);
+        rc = -EIO;
+out:
+        bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
+        bc->bc_ecc = cpu_to_le16(check.bc_ecc);
+        return rc;
+}
+/*
+ * This function generates check information for a list of buffer_heads.
+ * bhs is the blocks to be checked.  bc is a pointer to the
+ * ocfs2_block_check structure describing the crc32 and the ecc.
+ *
+ * bc should be a pointer inside one of the buffers in bhs, as the function
+ * will take care of zeroing it before calculating the check information.  If
+ * bc does not point inside bhs, the caller must make sure any inline
+ * ocfs2_block_check structures are zeroed.
+ *
+ * The data buffer must be in on-disk endian (little endian for ocfs2).
+ * bc will be filled with little-endian values and will be ready to go to
+ * disk.
+ */
+void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
+                                   struct ocfs2_block_check *bc)
+{
+        int i;
+        u32 crc, ecc;
+        BUG_ON(nr < 0);
+        if (!nr)
+                return;
+        memset(bc, 0, sizeof(struct ocfs2_block_check));
+        for (i = 0, crc = ~0, ecc = 0; i < nr; i++) {
+                crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
+                /*
+                 * The number of bits in a buffer is obviously b_size*8.
+                 * The offset of this buffer is b_size*i, so the bit offset
+                 * of this buffer is b_size*8*i.
+                 */
+                ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data,
+                                                bhs[i]->b_size * 8,
+                                                bhs[i]->b_size * 8 * i);
+        }
+        /*
+         * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
+         * larger than 16 bits.
+         */
+        BUG_ON(ecc > USHORT_MAX);
+        bc->bc_crc32e = cpu_to_le32(crc);
+        bc->bc_ecc = cpu_to_le16((u16)ecc);
+}
+/*
+ * This function validates existing check information on a list of
+ * buffer_heads.  Like _compute_bhs, the function will take care of
+ * zeroing bc before calculating check codes.  If bc is not a pointer
+ * inside one of the buffers, the caller must have zeroed any inline
+ * ocfs2_block_check structures.
+ *
+ * Again, the data passed in should be the on-disk endian.
+ */
+int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
+                                   struct ocfs2_block_check *bc)
+{
+        int i, rc = 0;
+        struct ocfs2_block_check check;
+        u32 crc, ecc, fix;
+        BUG_ON(nr < 0);
+        if (!nr)
+                return 0;
+        check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
+        check.bc_ecc = le16_to_cpu(bc->bc_ecc);
+        memset(bc, 0, sizeof(struct ocfs2_block_check));
+        /* Fast path - if the crc32 validates, we're good to go */
+        for (i = 0, crc = ~0; i < nr; i++)
+                crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
+        if (crc == check.bc_crc32e)
+                goto out;
+        mlog(ML_ERROR,
+             "CRC32 failed: stored: %u, computed %u.  Applying ECC.\n",
+             (unsigned int)check.bc_crc32e, (unsigned int)crc);
+        /* Ok, try ECC fixups */
+        for (i = 0, ecc = 0; i < nr; i++) {
+                /*
+                 * The number of bits in a buffer is obviously b_size*8.
+                 * The offset of this buffer is b_size*i, so the bit offset
+                 * of this buffer is b_size*8*i.
+                 */
+                ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data,
+                                                bhs[i]->b_size * 8,
+                                                bhs[i]->b_size * 8 * i);
+        }
+        fix = ecc ^ check.bc_ecc;
+        for (i = 0; i < nr; i++) {
+                /*
+                 * Try the fix against each buffer.  It will only affect
+                 * one of them.
+                 */
+                ocfs2_hamming_fix(bhs[i]->b_data, bhs[i]->b_size * 8,
+                                  bhs[i]->b_size * 8 * i, fix);
+        }
+        /* And check the crc32 again */
+        for (i = 0, crc = ~0; i < nr; i++)
+                crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
+        if (crc == check.bc_crc32e)
+                goto out;
+        mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
+             (unsigned int)check.bc_crc32e, (unsigned int)crc);
+        rc = -EIO;
+out:
+        bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
+        bc->bc_ecc = cpu_to_le16(check.bc_ecc);
+        return rc;
+}
+/*
+ * These are the main API.  They check the superblock flag before
+ * calling the underlying operations.
+ *
+ * They expect the buffer(s) to be in disk format.
+ */
+void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
+                            struct ocfs2_block_check *bc)
+{
+        if (ocfs2_meta_ecc(OCFS2_SB(sb)))
+                ocfs2_block_check_compute(data, sb->s_blocksize, bc);
+}
+int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
+                            struct ocfs2_block_check *bc)
+{
+        int rc = 0;
+        if (ocfs2_meta_ecc(OCFS2_SB(sb)))
+                rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc);
+        return rc;
+}
+void ocfs2_compute_meta_ecc_bhs(struct super_block *sb,
+                                struct buffer_head **bhs, int nr,
+                                struct ocfs2_block_check *bc)
+{
+        if (ocfs2_meta_ecc(OCFS2_SB(sb)))
+                ocfs2_block_check_compute_bhs(bhs, nr, bc);
+}
+int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
+                                struct buffer_head **bhs, int nr,
+                                struct ocfs2_block_check *bc)
+{
+        int rc = 0;
+        if (ocfs2_meta_ecc(OCFS2_SB(sb)))
+                rc = ocfs2_block_check_validate_bhs(bhs, nr, bc);
+        return rc;
+}
diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h
new file mode 100644
index 000000000000..70ec3feda32f
--- /dev/null
+++ b/fs/ocfs2/blockcheck.h
@@ -0,0 +1,82 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * blockcheck.h
+ *
+ * Checksum and ECC codes for the OCFS2 userspace library.
+ *
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#ifndef OCFS2_BLOCKCHECK_H
+#define OCFS2_BLOCKCHECK_H
+/* High level block API */
+void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
+                            struct ocfs2_block_check *bc);
+int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
+                            struct ocfs2_block_check *bc);
+void ocfs2_compute_meta_ecc_bhs(struct super_block *sb,
+                                struct buffer_head **bhs, int nr,
+                                struct ocfs2_block_check *bc);
+int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
+                                struct buffer_head **bhs, int nr,
+                                struct ocfs2_block_check *bc);
+/* Lower level API */
+void ocfs2_block_check_compute(void *data, size_t blocksize,
+                               struct ocfs2_block_check *bc);
+int ocfs2_block_check_validate(void *data, size_t blocksize,
+                               struct ocfs2_block_check *bc);
+void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
+                                   struct ocfs2_block_check *bc);
+int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
+                                   struct ocfs2_block_check *bc);
+/*
+ * Hamming code functions
+ */
+/*
+ * Encoding hamming code parity bits for a buffer.
+ *
+ * This is the low level encoder function.  It can be called across
+ * multiple hunks just like the crc32 code.  'd' is the number of bits
+ * _in_this_hunk_.  nr is the bit offset of this hunk.  So, if you had
+ * two 512B buffers, you would do it like so:
+ *
+ * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0);
+ * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8);
+ *
+ * If you just have one buffer, use ocfs2_hamming_encode_block().
+ */
+u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d,
+                         unsigned int nr);
+/*
+ * Fix a buffer with a bit error.  The 'fix' is the original parity
+ * xor'd with the parity calculated now.
+ *
+ * Like ocfs2_hamming_encode(), this can handle hunks.  nr is the bit
+ * offset of the current hunk.  If the bit to be fixed is not part of the
+ * current hunk, this does nothing.
+ *
+ * If you only have one buffer, use ocfs2_hamming_fix_block().
+ */
+void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
+                       unsigned int fix);
+/* Convenience wrappers for a single buffer of data */
+extern u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize);
+extern void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
+                                    unsigned int fix);
+#endif
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 3a178ec48d7c..15c8e6deee2e 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -39,6 +39,18 @@
 #include "buffer_head_io.h"
+/*
+ * Bits on bh->b_state used by ocfs2.
+ *
+ * These MUST be after the JBD2 bits.  Hence, we use BH_JBDPrivateStart.
+ */
+enum ocfs2_state_bits {
+        BH_NeedsValidate = BH_JBDPrivateStart,
+};
+/* Expand the magic b_state functions */
+BUFFER_FNS(NeedsValidate, needs_validate);
 int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
                      struct inode *inode)
 {
@@ -166,7 +178,9 @@ bail:
 }
 int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
-                      struct buffer_head *bhs[], int flags)
+                      struct buffer_head *bhs[], int flags,
+                      int (*validate)(struct super_block *sb,
+                                      struct buffer_head *bh))
 {
        int status = 0;
        int i, ignore_cache = 0;
@@ -298,6 +312,8 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
                        clear_buffer_uptodate(bh);
                        get_bh(bh); /* for end_buffer_read_sync() */
+                        if (validate)
+                                set_buffer_needs_validate(bh);
                        bh->b_end_io = end_buffer_read_sync;
                        submit_bh(READ, bh);
                        continue;
@@ -328,6 +344,20 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
                                bhs[i] = NULL;
                                continue;
                        }
+                        if (buffer_needs_validate(bh)) {
+                                /* We never set NeedsValidate if the
+                                 * buffer was held by the journal, so
+                                 * that better not have changed */
+                                BUG_ON(buffer_jbd(bh));
+                                clear_buffer_needs_validate(bh);
+                                status = validate(inode->i_sb, bh);
+                                if (status) {
+                                        put_bh(bh);
+                                        bhs[i] = NULL;
+                                        continue;
+                                }
+                        }
                }
                /* Always set the buffer in the cache, even if it was
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index 75e1dcb1ade7..c75d682dadd8 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -31,21 +31,24 @@
 void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
                             int uptodate);
-static inline int ocfs2_read_block(struct inode        *inode,
-                                   u64                  off,
-                                   struct buffer_head **bh);
 int ocfs2_write_block(struct ocfs2_super          *osb,
                      struct buffer_head  *bh,
                      struct inode        *inode);
-int ocfs2_read_blocks(struct inode        *inode,
-                      u64                  block,
-                      int                  nr,
-                      struct buffer_head  *bhs[],
-                      int                  flags);
 int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
                           unsigned int nr, struct buffer_head *bhs[]);
+/*
+ * If not NULL, validate() will be called on a buffer that is freshly
+ * read from disk.  It will not be called if the buffer was in cache.
+ * Note that if validate() is being used for this buffer, it needs to
+ * be set even for a READAHEAD call, as it marks the buffer for later
+ * validation.
+ */
+int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
+                      struct buffer_head *bhs[], int flags,
+                      int (*validate)(struct super_block *sb,
+                                      struct buffer_head *bh));
 int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
                                struct buffer_head *bh);
@@ -53,7 +56,9 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
 #define OCFS2_BH_READAHEAD         8
 static inline int ocfs2_read_block(struct inode *inode, u64 off,
-                                   struct buffer_head **bh)
+                                   struct buffer_head **bh,
+                                   int (*validate)(struct super_block *sb,
+                                                   struct buffer_head *bh))
 {
        int status = 0;
@@ -63,7 +68,7 @@ static inline int ocfs2_read_block(struct inode *inode, u64 off,
                goto bail;
        }
-        status = ocfs2_read_blocks(inode, off, 1, bh, 0);
+        status = ocfs2_read_blocks(inode, off, 1, bh, 0, validate);
 bail:
        return status;
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 6ebaa58e2c03..04697ba7f73e 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -854,7 +854,7 @@ static int o2hb_thread(void *data)
        while (!kthread_should_stop() && !reg->hr_unclean_stop) {
                /* We track the time spent inside
-                 * o2hb_do_disk_heartbeat so that we avoid more then
+                 * o2hb_do_disk_heartbeat so that we avoid more than
                 * hr_timeout_ms between disk writes. On busy systems
                 * this should result in a heartbeat which is less
                 * likely to time itself out. */
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index d8a0cb92cef6..96df5416993e 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -110,6 +110,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
        define_mask(QUORUM),
        define_mask(EXPORT),
        define_mask(XATTR),
+        define_mask(QUOTA),
        define_mask(ERROR),
        define_mask(NOTICE),
        define_mask(KTHREAD),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 57670c680471..7e72a81bc2d4 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -113,6 +113,7 @@
 #define ML_QUORUM       0x0000000008000000ULL /* net connection quorum */
 #define ML_EXPORT       0x0000000010000000ULL /* ocfs2 export operations */
 #define ML_XATTR        0x0000000020000000ULL /* ocfs2 extended attributes */
+#define ML_QUOTA        0x0000000040000000ULL /* ocfs2 quota operations */
 /* bits that are infrequently given and frequently matched in the high word */
 #define ML_ERROR        0x0000000100000000ULL /* sent to KERN_ERR */
 #define ML_NOTICE       0x0000000200000000ULL /* setn to KERN_NOTICE */
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index b1cc7c381e88..e9d7c2038c0f 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -38,6 +38,7 @@
 #include "dlmglue.h"
 #include "file.h"
 #include "inode.h"
+#include "super.h"
 static int ocfs2_dentry_revalidate(struct dentry *dentry,
@@ -294,6 +295,34 @@ out_attach:
        return ret;
 }
+static DEFINE_SPINLOCK(dentry_list_lock);
+/* We limit the number of dentry locks to drop in one go. We have
+ * this limit so that we don't starve other users of ocfs2_wq. */
+#define DL_INODE_DROP_COUNT 64
+/* Drop inode references from dentry locks */
+void ocfs2_drop_dl_inodes(struct work_struct *work)
+{
+        struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
+                                               dentry_lock_work);
+        struct ocfs2_dentry_lock *dl;
+        int drop_count = DL_INODE_DROP_COUNT;
+        spin_lock(&dentry_list_lock);
+        while (osb->dentry_lock_list && drop_count--) {
+                dl = osb->dentry_lock_list;
+                osb->dentry_lock_list = dl->dl_next;
+                spin_unlock(&dentry_list_lock);
+                iput(dl->dl_inode);
+                kfree(dl);
+                spin_lock(&dentry_list_lock);
+        }
+        if (osb->dentry_lock_list)
+                queue_work(ocfs2_wq, &osb->dentry_lock_work);
+        spin_unlock(&dentry_list_lock);
+}
 /*
 * ocfs2_dentry_iput() and friends.
 *
@@ -318,16 +347,23 @@ out_attach:
 static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
                                   struct ocfs2_dentry_lock *dl)
 {
-        iput(dl->dl_inode);
        ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
        ocfs2_lock_res_free(&dl->dl_lockres);
-        kfree(dl);
+        /* We leave dropping of inode reference to ocfs2_wq as that can
+         * possibly lead to inode deletion which gets tricky */
+        spin_lock(&dentry_list_lock);
+        if (!osb->dentry_lock_list)
+                queue_work(ocfs2_wq, &osb->dentry_lock_work);
+        dl->dl_next = osb->dentry_lock_list;
+        osb->dentry_lock_list = dl;
+        spin_unlock(&dentry_list_lock);
 }
 void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
                           struct ocfs2_dentry_lock *dl)
 {
-        int unlock = 0;
+        int unlock;
        BUG_ON(dl->dl_count == 0);
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index c091c34d9883..d06e16c06640 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -29,8 +29,13 @@
 extern struct dentry_operations ocfs2_dentry_ops;
 struct ocfs2_dentry_lock {
+        /* Use count of dentry lock */
        unsigned int            dl_count;
-        u64                     dl_parent_blkno;
+        union {
+                /* Linked list of dentry locks to release */
+                struct ocfs2_dentry_lock *dl_next;
+                u64                     dl_parent_blkno;
+        };
        /*
         * The ocfs2_dentry_lock keeps an inode reference until
@@ -47,6 +52,8 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode,
 void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
                           struct ocfs2_dentry_lock *dl);
+void ocfs2_drop_dl_inodes(struct work_struct *work);
 struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno,
                                      int skip_unhashed);
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 026e6eb85187..f2c4098cf337 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -40,6 +40,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
+#include <linux/quotaops.h>
 #define MLOG_MASK_PREFIX ML_NAMEI
 #include <cluster/masklog.h>
@@ -47,6 +48,7 @@
 #include "ocfs2.h"
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dir.h"
 #include "dlmglue.h"
 #include "extent_map.h"
@@ -82,47 +84,72 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
                               struct ocfs2_alloc_context *meta_ac,
                               struct buffer_head **new_bh);
-static struct buffer_head *ocfs2_bread(struct inode *inode,
+/*
-                                       int block, int *err, int reada)
+ * These are distinct checks because future versions of the file system will
+ * want to have a trailing dirent structure independent of indexing.
+ */
+static int ocfs2_dir_has_trailer(struct inode *dir)
 {
-        struct buffer_head *bh = NULL;
+        if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
-        int tmperr;
+                return 0;
-        u64 p_blkno;
-        int readflags = 0;
-        if (reada)
+        return ocfs2_meta_ecc(OCFS2_SB(dir->i_sb));
-                readflags |= OCFS2_BH_READAHEAD;
+}
-        if (((u64)block << inode->i_sb->s_blocksize_bits) >=
+static int ocfs2_supports_dir_trailer(struct ocfs2_super *osb)
-            i_size_read(inode)) {
+{
-                BUG_ON(!reada);
+        return ocfs2_meta_ecc(osb);
-                return NULL;
+}
-        }
-        down_read(&OCFS2_I(inode)->ip_alloc_sem);
+static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb)
-        tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
+{
-                                             NULL);
+        return sb->s_blocksize - sizeof(struct ocfs2_dir_block_trailer);
-        up_read(&OCFS2_I(inode)->ip_alloc_sem);
+}
-        if (tmperr < 0) {
-                mlog_errno(tmperr);
-                goto fail;
-        }
-        tmperr = ocfs2_read_blocks(inode, p_blkno, 1, &bh, readflags);
+#define ocfs2_trailer_from_bh(_bh, _sb) ((struct ocfs2_dir_block_trailer *) ((_bh)->b_data + ocfs2_dir_trailer_blk_off((_sb))))
-        if (tmperr < 0)
-                goto fail;
-        tmperr = 0;
+/* XXX ocfs2_block_dqtrailer() is similar but not quite - can we make
+ * them more consistent? */
+struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
+                                                            void *data)
+{
+        char *p = data;
-        *err = 0;
+        p += blocksize - sizeof(struct ocfs2_dir_block_trailer);
-        return bh;
+        return (struct ocfs2_dir_block_trailer *)p;
+}
-fail:
+/*
-        brelse(bh);
+ * XXX: This is executed once on every dirent. We should consider optimizing
-        bh = NULL;
+ * it.
+ */
+static int ocfs2_skip_dir_trailer(struct inode *dir,
+                                  struct ocfs2_dir_entry *de,
+                                  unsigned long offset,
+                                  unsigned long blklen)
+{
+        unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer);
-        *err = -EIO;
+        if (!ocfs2_dir_has_trailer(dir))
-        return NULL;
+                return 0;
+        if (offset != toff)
+                return 0;
+        return 1;
+}
+/*
+ * Initialize the trailer at the end of directory block @bh, which belongs
+ * to directory @inode.
+ *
+ * The trailer is located with ocfs2_trailer_from_bh(), i.e. it occupies the
+ * last sizeof(struct ocfs2_dir_block_trailer) bytes of the block.  All
+ * multi-byte fields are stored little-endian.  The caller is expected to
+ * have journal access to @bh already; nothing is journaled or dirtied here.
+ */
+static void ocfs2_init_dir_trailer(struct inode *inode,
+                                   struct buffer_head *bh)
+{
+        struct ocfs2_dir_block_trailer *trailer;
+        trailer = ocfs2_trailer_from_bh(bh, inode->i_sb);
+        /* NOTE(review): strcpy() assumes db_signature has room for the
+         * signature plus its NUL terminator - confirm against the struct
+         * definition. */
+        strcpy(trailer->db_signature, OCFS2_DIR_TRAILER_SIGNATURE);
+        /* Presumably lets code that walks the block as plain dirents step
+         * over the trailer as one record - TODO confirm. */
+        trailer->db_compat_rec_len =
+                        cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer));
+        /* Back-pointers checked later by ocfs2_read_dir_block(). */
+        trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
+        trailer->db_blkno = cpu_to_le64(bh->b_blocknr);
 }
 /*
@@ -231,7 +258,7 @@ static struct buffer_head *ocfs2_find_entry_id(const char *name,
        struct ocfs2_dinode *di;
        struct ocfs2_inline_data *data;
-        ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
+        ret = ocfs2_read_inode_block(dir, &di_bh);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -250,6 +277,108 @@ out:
        return NULL;
 }
+/*
+ * Validation callback handed to ocfs2_read_virt_blocks() for directory
+ * blocks.
+ *
+ * @sb: super block the buffer belongs to
+ * @bh: directory block to validate; must already be uptodate
+ *
+ * Only the metadata checksum stored in the block's trailer is verified
+ * here.  The trailer signature and back-pointers are checked by
+ * ocfs2_read_dir_block(), which has the inode at hand.
+ *
+ * Returns 0 on success or the error from ocfs2_validate_meta_ecc().
+ */
+static int ocfs2_validate_dir_block(struct super_block *sb,
+                                    struct buffer_head *bh)
+{
+        int rc;
+        struct ocfs2_dir_block_trailer *trailer =
+                ocfs2_trailer_from_bh(bh, sb);
+        /*
+         * We don't validate dirents here, that's handled
+         * in-place when the code walks them.
+         */
+        mlog(0, "Validating dirblock %llu\n",
+             (unsigned long long)bh->b_blocknr);
+        BUG_ON(!buffer_uptodate(bh));
+        /*
+         * If the ecc fails, we return the error but otherwise
+         * leave the filesystem running.  We know any error is
+         * local to this block.
+         *
+         * Note that we are safe to call this even if the directory
+         * doesn't have a trailer.  Filesystems without metaecc will do
+         * nothing, and filesystems with it will have one.
+         */
+        rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &trailer->db_check);
+        if (rc)
+                /* The number printed is the directory block's, not a dinode's */
+                mlog(ML_ERROR, "Checksum failed for dir block %llu\n",
+                     (unsigned long long)bh->b_blocknr);
+        return rc;
+}
+/*
+ * Read one directory block of @inode and sanity-check its trailer.
+ *
+ * @inode:   directory being read
+ * @v_block: index of the block within the directory
+ * @bh:      in/out buffer head.  If *bh is NULL, the buffer obtained from
+ *           ocfs2_read_virt_blocks() is stored in *bh on success only; if
+ *           a trailer check fails that buffer is released here, so the
+ *           caller never has to clean up after an error.  If *bh is
+ *           non-NULL it is used for the read and is left untouched by a
+ *           trailer-check failure.
+ * @flags:   OCFS2_BH_* flags passed to ocfs2_read_virt_blocks().  The
+ *           trailer checks are skipped for OCFS2_BH_READAHEAD
+ *           (presumably because the contents may not have arrived yet -
+ *           TODO confirm).
+ *
+ * This function forces all errors to -EIO for consistency with its
+ * predecessor, ocfs2_bread().  We haven't audited what returning the
+ * real error codes would do to callers.  We log the real codes with
+ * mlog_errno() before we squash them.
+ *
+ * Returns 0 on success, -EIO on any failure.
+ */
+static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
+                                struct buffer_head **bh, int flags)
+{
+        int rc = 0;
+        struct buffer_head *tmp = *bh;
+        struct ocfs2_dir_block_trailer *trailer;
+        rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags,
+                                    ocfs2_validate_dir_block);
+        if (rc) {
+                mlog_errno(rc);
+                goto out;
+        }
+        /*
+         * We check the trailer here rather than in
+         * ocfs2_validate_dir_block() because that function doesn't have
+         * the inode to test.
+         */
+        if (!(flags & OCFS2_BH_READAHEAD) &&
+            ocfs2_dir_has_trailer(inode)) {
+                trailer = ocfs2_trailer_from_bh(tmp, inode->i_sb);
+                if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
+                        rc = -EINVAL;
+                        ocfs2_error(inode->i_sb,
+                                    "Invalid dirblock #%llu: "
+                                    "signature = %.*s\n",
+                                    (unsigned long long)tmp->b_blocknr, 7,
+                                    trailer->db_signature);
+                        goto bad_trailer;
+                }
+                if (le64_to_cpu(trailer->db_blkno) != tmp->b_blocknr) {
+                        rc = -EINVAL;
+                        ocfs2_error(inode->i_sb,
+                                    "Directory block #%llu has an invalid "
+                                    "db_blkno of %llu",
+                                    (unsigned long long)tmp->b_blocknr,
+                                    (unsigned long long)le64_to_cpu(trailer->db_blkno));
+                        goto bad_trailer;
+                }
+                if (le64_to_cpu(trailer->db_parent_dinode) !=
+                    OCFS2_I(inode)->ip_blkno) {
+                        rc = -EINVAL;
+                        /* Report the field that actually failed the check */
+                        ocfs2_error(inode->i_sb,
+                                    "Directory block #%llu on dinode "
+                                    "#%llu has an invalid parent_dinode "
+                                    "of %llu",
+                                    (unsigned long long)tmp->b_blocknr,
+                                    (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                                    (unsigned long long)le64_to_cpu(trailer->db_parent_dinode));
+                        goto bad_trailer;
+                }
+        }
+        /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
+        if (!*bh)
+                *bh = tmp;
+        return 0;
+bad_trailer:
+        mlog_errno(rc);
+        /*
+         * A buffer we obtained ourselves is not handed to the caller on
+         * failure, so drop our reference or it would leak.
+         */
+        if (!*bh)
+                brelse(tmp);
+out:
+        return rc ? -EIO : 0;
+}
 static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
                                               struct inode *dir,
                                               struct ocfs2_dir_entry **res_dir)
@@ -296,15 +425,17 @@ restart:
                                }
                                num++;
-                                bh = ocfs2_bread(dir, b++, &err, 1);
+                                bh = NULL;
+                                err = ocfs2_read_dir_block(dir, b++, &bh,
+                                                           OCFS2_BH_READAHEAD);
                                bh_use[ra_max] = bh;
                        }
                }
                if ((bh = bh_use[ra_ptr++]) == NULL)
                        goto next;
-                if (ocfs2_read_block(dir, block, &bh)) {
+                if (ocfs2_read_dir_block(dir, block, &bh, 0)) {
                        /* read error, skip block & hope for the best.
-                         * ocfs2_read_block() has released the bh. */
+                         * ocfs2_read_dir_block() has released the bh. */
                        ocfs2_error(dir->i_sb, "reading directory %llu, "
                                    "offset %lu\n",
                                    (unsigned long long)OCFS2_I(dir)->ip_blkno,
@@ -381,14 +512,18 @@ int ocfs2_update_entry(struct inode *dir, handle_t *handle,
                       struct inode *new_entry_inode)
 {
        int ret;
+        ocfs2_journal_access_func access = ocfs2_journal_access_db;
        /*
         * The same code works fine for both inline-data and extent
-         * based directories, so no need to split this up.
+         * based directories, so no need to split this up.  The only
+         * difference is the journal_access function.
         */
-        ret = ocfs2_journal_access(handle, dir, de_bh,
+        if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                access = ocfs2_journal_access_di;
+        ret = access(handle, dir, de_bh, OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -410,9 +545,13 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
 {
        struct ocfs2_dir_entry *de, *pde;
        int i, status = -ENOENT;
+        ocfs2_journal_access_func access = ocfs2_journal_access_db;
        mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
+        if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+                access = ocfs2_journal_access_di;
        i = 0;
        pde = NULL;
        de = (struct ocfs2_dir_entry *) first_de;
@@ -423,8 +562,8 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
                        goto bail;
                }
                if (de == de_del)  {
-                        status = ocfs2_journal_access(handle, dir, bh,
+                        status = access(handle, dir, bh,
-                                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                        OCFS2_JOURNAL_ACCESS_WRITE);
                        if (status < 0) {
                                status = -EIO;
                                mlog_errno(status);
@@ -458,7 +597,7 @@ static inline int ocfs2_delete_entry_id(handle_t *handle,
        struct ocfs2_dinode *di;
        struct ocfs2_inline_data *data;
-        ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
+        ret = ocfs2_read_inode_block(dir, &di_bh);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -576,6 +715,16 @@ int __ocfs2_add_entry(handle_t *handle,
                        goto bail;
                }
+                /* We're guaranteed that we should have space, so we
+                 * can't possibly have hit the trailer...right? */
+                mlog_bug_on_msg(ocfs2_skip_dir_trailer(dir, de, offset, size),
+                                "Hit dir trailer trying to insert %.*s "
+                                "(namelen %d) into directory %llu.  "
+                                "offset is %lu, trailer offset is %d\n",
+                                namelen, name, namelen,
+                                (unsigned long long)parent_fe_bh->b_blocknr,
+                                offset, ocfs2_dir_trailer_blk_off(dir->i_sb));
                if (ocfs2_dirent_would_fit(de, rec_len)) {
                        dir->i_mtime = dir->i_ctime = CURRENT_TIME;
                        retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
@@ -584,8 +733,14 @@ int __ocfs2_add_entry(handle_t *handle,
                                goto bail;
                        }
-                        status = ocfs2_journal_access(handle, dir, insert_bh,
+                        if (insert_bh == parent_fe_bh)
-                                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                status = ocfs2_journal_access_di(handle, dir,
+                                                                 insert_bh,
+                                                                 OCFS2_JOURNAL_ACCESS_WRITE);
+                        else
+                                status = ocfs2_journal_access_db(handle, dir,
+                                                                 insert_bh,
+                                                                 OCFS2_JOURNAL_ACCESS_WRITE);
                        /* By now the buffer is marked for journaling */
                        offset += le16_to_cpu(de->rec_len);
                        if (le64_to_cpu(de->inode)) {
@@ -611,6 +766,7 @@ int __ocfs2_add_entry(handle_t *handle,
                        retval = 0;
                        goto bail;
                }
                offset += le16_to_cpu(de->rec_len);
                de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
        }
@@ -636,7 +792,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
        struct ocfs2_inline_data *data;
        struct ocfs2_dir_entry *de;
-        ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
+        ret = ocfs2_read_inode_block(inode, &di_bh);
        if (ret) {
                mlog(ML_ERROR, "Unable to read inode block for dir %llu\n",
                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -724,7 +880,6 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
        int i, stored;
        struct buffer_head * bh, * tmp;
        struct ocfs2_dir_entry * de;
-        int err;
        struct super_block * sb = inode->i_sb;
        unsigned int ra_sectors = 16;
@@ -735,12 +890,8 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
        while (!error && !stored && *f_pos < i_size_read(inode)) {
                blk = (*f_pos) >> sb->s_blocksize_bits;
-                bh = ocfs2_bread(inode, blk, &err, 0);
+                if (ocfs2_read_dir_block(inode, blk, &bh, 0)) {
-                if (!bh) {
+                        /* Skip the corrupt dirblock and keep trying */
-                        mlog(ML_ERROR,
-                             "directory #%llu contains a hole at offset %lld\n",
-                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
-                             *f_pos);
                        *f_pos += sb->s_blocksize - offset;
                        continue;
                }
@@ -754,8 +905,10 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
                    || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) {
                        for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
                             i > 0; i--) {
-                                tmp = ocfs2_bread(inode, ++blk, &err, 1);
+                                tmp = NULL;
-                                brelse(tmp);
+                                if (!ocfs2_read_dir_block(inode, ++blk, &tmp,
+                                                          OCFS2_BH_READAHEAD))
+                                        brelse(tmp);
                        }
                        last_ra_blk = blk;
                        ra_sectors = 8;
@@ -828,6 +981,7 @@ revalidate:
                }
                offset = 0;
                brelse(bh);
+                bh = NULL;
        }
        stored = 0;
@@ -1050,9 +1204,15 @@ int ocfs2_empty_dir(struct inode *inode)
        return !priv.seen_other;
 }
-static void ocfs2_fill_initial_dirents(struct inode *inode,
+/*
-                                       struct inode *parent,
+ * Fills "." and ".." dirents in a new directory block. Returns dirent for
-                                       char *start, unsigned int size)
+ * "..", which might be used during creation of a directory with a trailing
+ * header. It is otherwise safe to ignore the return code.
+ */
+static struct ocfs2_dir_entry *ocfs2_fill_initial_dirents(struct inode *inode,
+                                                          struct inode *parent,
+                                                          char *start,
+                                                          unsigned int size)
 {
        struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start;
@@ -1069,6 +1229,8 @@ static void ocfs2_fill_initial_dirents(struct inode *inode,
        de->name_len = 2;
        strcpy(de->name, "..");
        ocfs2_set_de_type(de, S_IFDIR);
+        return de;
 }
 /*
@@ -1086,8 +1248,8 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
        struct ocfs2_inline_data *data = &di->id2.i_data;
        unsigned int size = le16_to_cpu(data->id_count);
-        ret = ocfs2_journal_access(handle, inode, di_bh,
+        ret = ocfs2_journal_access_di(handle, inode, di_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -1121,10 +1283,15 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
                                 struct ocfs2_alloc_context *data_ac)
 {
        int status;
+        unsigned int size = osb->sb->s_blocksize;
        struct buffer_head *new_bh = NULL;
+        struct ocfs2_dir_entry *de;
        mlog_entry_void();
+        if (ocfs2_supports_dir_trailer(osb))
+                size = ocfs2_dir_trailer_blk_off(parent->i_sb);
        status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
                                     data_ac, NULL, &new_bh);
        if (status < 0) {
@@ -1134,16 +1301,17 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
        ocfs2_set_new_buffer_uptodate(inode, new_bh);
-        status = ocfs2_journal_access(handle, inode, new_bh,
+        status = ocfs2_journal_access_db(handle, inode, new_bh,
-                                      OCFS2_JOURNAL_ACCESS_CREATE);
+                                         OCFS2_JOURNAL_ACCESS_CREATE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
        memset(new_bh->b_data, 0, osb->sb->s_blocksize);
-        ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data,
+        de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size);
-                                   osb->sb->s_blocksize);
+        if (ocfs2_supports_dir_trailer(osb))
+                ocfs2_init_dir_trailer(inode, new_bh);
        status = ocfs2_journal_dirty(handle, new_bh);
        if (status < 0) {
@@ -1184,13 +1352,27 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
                                     data_ac);
 }
+/*
+ * Expand rec_len of the rightmost dirent in a directory block so that it
+ * contains the end of our valid space for dirents. We do this during
+ * expansion from an inline directory to one with extents. The first dir block
+ * in that case is taken from the inline data portion of the inode block.
+ *
+ * We add the dir trailer if this filesystem wants it.
+ */
 static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
-                                     unsigned int new_size)
+                                     struct super_block *sb)
 {
        struct ocfs2_dir_entry *de;
        struct ocfs2_dir_entry *prev_de;
        char *de_buf, *limit;
-        unsigned int bytes = new_size - old_size;
+        unsigned int new_size = sb->s_blocksize;
+        unsigned int bytes;
+        if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
+                new_size = ocfs2_dir_trailer_blk_off(sb);
+        bytes = new_size - old_size;
        limit = start + old_size;
        de_buf = start;
@@ -1216,9 +1398,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
                                   unsigned int blocks_wanted,
                                   struct buffer_head **first_block_bh)
 {
-        int ret, credits = OCFS2_INLINE_TO_EXTENTS_CREDITS;
        u32 alloc, bit_off, len;
        struct super_block *sb = dir->i_sb;
+        int ret, credits = ocfs2_inline_to_extents_credits(sb);
        u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits;
        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
        struct ocfs2_inode_info *oi = OCFS2_I(dir);
@@ -1227,6 +1409,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
        handle_t *handle;
        struct ocfs2_extent_tree et;
+        int did_quota = 0;
        ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
@@ -1264,6 +1447,12 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
                goto out_sem;
        }
+        if (vfs_dq_alloc_space_nodirty(dir,
+                                ocfs2_clusters_to_bytes(osb->sb, alloc))) {
+                ret = -EDQUOT;
+                goto out_commit;
+        }
+        did_quota = 1;
        /*
         * Try to claim as many clusters as the bitmap can give though
         * if we only get one now, that's enough to continue. The rest
@@ -1290,8 +1479,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
        ocfs2_set_new_buffer_uptodate(dir, dirdata_bh);
-        ret = ocfs2_journal_access(handle, dir, dirdata_bh,
+        ret = ocfs2_journal_access_db(handle, dir, dirdata_bh,
-                                   OCFS2_JOURNAL_ACCESS_CREATE);
+                                      OCFS2_JOURNAL_ACCESS_CREATE);
        if (ret) {
                mlog_errno(ret);
                goto out_commit;
@@ -1300,8 +1489,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
        memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
        memset(dirdata_bh->b_data + i_size_read(dir), 0,
               sb->s_blocksize - i_size_read(dir));
-        ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir),
+        ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), sb);
-                                 sb->s_blocksize);
+        if (ocfs2_supports_dir_trailer(osb))
+                ocfs2_init_dir_trailer(dir, dirdata_bh);
        ret = ocfs2_journal_dirty(handle, dirdata_bh);
        if (ret) {
@@ -1317,8 +1507,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
         * We let the later dirent insert modify c/mtime - to the user
         * the data hasn't changed.
         */
-        ret = ocfs2_journal_access(handle, dir, di_bh,
+        ret = ocfs2_journal_access_di(handle, dir, di_bh,
-                                   OCFS2_JOURNAL_ACCESS_CREATE);
+                                      OCFS2_JOURNAL_ACCESS_CREATE);
        if (ret) {
                mlog_errno(ret);
                goto out_commit;
@@ -1386,6 +1576,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
        dirdata_bh = NULL;
 out_commit:
+        if (ret < 0 && did_quota)
+                vfs_dq_free_space_nodirty(dir,
+                        ocfs2_clusters_to_bytes(osb->sb, 2));
        ocfs2_commit_trans(osb, handle);
 out_sem:
@@ -1410,7 +1603,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
                               struct buffer_head **new_bh)
 {
        int status;
-        int extend;
+        int extend, did_quota = 0;
        u64 p_blkno, v_blkno;
        spin_lock(&OCFS2_I(dir)->ip_lock);
@@ -1420,6 +1613,13 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
        if (extend) {
                u32 offset = OCFS2_I(dir)->ip_clusters;
+                if (vfs_dq_alloc_space_nodirty(dir,
+                                        ocfs2_clusters_to_bytes(sb, 1))) {
+                        status = -EDQUOT;
+                        goto bail;
+                }
+                did_quota = 1;
                status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
                                              1, 0, parent_fe_bh, handle,
                                              data_ac, meta_ac, NULL);
@@ -1445,6 +1645,8 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
        }
        status = 0;
 bail:
+        if (did_quota && status < 0)
+                vfs_dq_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
        mlog_exit(status);
        return status;
 }
@@ -1569,16 +1771,22 @@ do_extend:
        ocfs2_set_new_buffer_uptodate(dir, new_bh);
-        status = ocfs2_journal_access(handle, dir, new_bh,
+        status = ocfs2_journal_access_db(handle, dir, new_bh,
-                                      OCFS2_JOURNAL_ACCESS_CREATE);
+                                         OCFS2_JOURNAL_ACCESS_CREATE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
        memset(new_bh->b_data, 0, sb->s_blocksize);
        de = (struct ocfs2_dir_entry *) new_bh->b_data;
        de->inode = 0;
-        de->rec_len = cpu_to_le16(sb->s_blocksize);
+        if (ocfs2_dir_has_trailer(dir)) {
+                de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb));
+                ocfs2_init_dir_trailer(dir, new_bh);
+        } else {
+                de->rec_len = cpu_to_le16(sb->s_blocksize);
+        }
        status = ocfs2_journal_dirty(handle, new_bh);
        if (status < 0) {
                mlog_errno(status);
@@ -1620,11 +1828,21 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
                                   unsigned int *blocks_wanted)
 {
        int ret;
+        struct super_block *sb = dir->i_sb;
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
        struct ocfs2_dir_entry *de, *last_de = NULL;
        char *de_buf, *limit;
        unsigned long offset = 0;
-        unsigned int rec_len, new_rec_len;
+        unsigned int rec_len, new_rec_len, free_space = dir->i_sb->s_blocksize;
+        /*
+         * This calculates how many free bytes we'd have in block zero, should
+         * this function force expansion to an extent tree.
+         */
+        if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
+                free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir);
+        else
+                free_space = dir->i_sb->s_blocksize - i_size_read(dir);
        de_buf = di->id2.i_data.id_data;
        limit = de_buf + i_size_read(dir);
@@ -1641,6 +1859,11 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
                        ret = -EEXIST;
                        goto out;
                }
+                /*
+                 * No need to check for a trailing dirent record here as
+                 * they're not used for inline dirs.
+                 */
                if (ocfs2_dirent_would_fit(de, rec_len)) {
                        /* Ok, we found a spot. Return this bh and let
                         * the caller actually fill it in. */
@@ -1661,7 +1884,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
         * dirent can be found.
         */
        *blocks_wanted = 1;
-        new_rec_len = le16_to_cpu(last_de->rec_len) + (dir->i_sb->s_blocksize - i_size_read(dir));
+        new_rec_len = le16_to_cpu(last_de->rec_len) + free_space;
        if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len)))
                *blocks_wanted = 2;
@@ -1679,9 +1902,10 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
        struct ocfs2_dir_entry *de;
        struct super_block *sb = dir->i_sb;
        int status;
+        int blocksize = dir->i_sb->s_blocksize;
-        bh = ocfs2_bread(dir, 0, &status, 0);
+        status = ocfs2_read_dir_block(dir, 0, &bh, 0);
-        if (!bh) {
+        if (status) {
                mlog_errno(status);
                goto bail;
        }
@@ -1702,11 +1926,10 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
                                status = -ENOSPC;
                                goto bail;
                        }
-                        bh = ocfs2_bread(dir,
+                        status = ocfs2_read_dir_block(dir,
-                                         offset >> sb->s_blocksize_bits,
+                                             offset >> sb->s_blocksize_bits,
-                                         &status,
+                                             &bh, 0);
-                                         0);
+                        if (status) {
-                        if (!bh) {
                                mlog_errno(status);
                                goto bail;
                        }
@@ -1721,6 +1944,11 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
                        status = -EEXIST;
                        goto bail;
                }
+                if (ocfs2_skip_dir_trailer(dir, de, offset % blocksize,
+                                           blocksize))
+                        goto next;
                if (ocfs2_dirent_would_fit(de, rec_len)) {
                        /* Ok, we found a spot. Return this bh and let
                         * the caller actually fill it in. */
@@ -1729,6 +1957,7 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
                        status = 0;
                        goto bail;
                }
+next:
                offset += le16_to_cpu(de->rec_len);
                de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
        }
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index ce48b9080d87..c511e2e18e9f 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -83,4 +83,6 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
                       struct buffer_head *fe_bh,
                       struct ocfs2_alloc_context *data_ac);
+struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
+                                                            void *data);
 #endif /* OCFS2_DIR_H */
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 644bee55d8ba..d07ddbe4b283 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -275,6 +275,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
        struct list_head *iter, *head=NULL;
        u64 cookie;
        u32 flags;
+        u8 node;
        if (!dlm_grab(dlm)) {
                dlm_error(DLM_REJECTED);
@@ -286,18 +287,21 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
        name = past->name;
        locklen = past->namelen;
-        cookie = be64_to_cpu(past->cookie);
+        cookie = past->cookie;
        flags = be32_to_cpu(past->flags);
+        node = past->node_idx;
        if (locklen > DLM_LOCKID_NAME_MAX) {
                ret = DLM_IVBUFLEN;
-                mlog(ML_ERROR, "Invalid name length in proxy ast handler!\n");
+                mlog(ML_ERROR, "Invalid name length (%d) in proxy ast "
+                     "handler!\n", locklen);
                goto leave;
        }
        if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
             (LKM_PUT_LVB|LKM_GET_LVB)) {
-                mlog(ML_ERROR, "both PUT and GET lvb specified\n");
+                mlog(ML_ERROR, "Both PUT and GET lvb specified, (0x%x)\n",
+                     flags);
                ret = DLM_BADARGS;
                goto leave;
        }
@@ -310,22 +314,21 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
        if (past->type != DLM_AST &&
            past->type != DLM_BAST) {
                mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu"
-                     "name=%.*s\n", past->type, 
+                     "name=%.*s, node=%u\n", past->type,
-                     dlm_get_lock_cookie_node(cookie),
+                     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
-                     dlm_get_lock_cookie_seq(cookie),
+                     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
-                     locklen, name);
+                     locklen, name, node);
                ret = DLM_IVLOCKID;
                goto leave;
        }
        res = dlm_lookup_lockres(dlm, name, locklen);
        if (!res) {
-                mlog(0, "got %sast for unknown lockres! "
+                mlog(0, "Got %sast for unknown lockres! cookie=%u:%llu, "
-                     "cookie=%u:%llu, name=%.*s, namelen=%u\n",
+                     "name=%.*s, node=%u\n", (past->type == DLM_AST ? "" : "b"),
-                     past->type == DLM_AST ? "" : "b",
+                     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
-                     dlm_get_lock_cookie_node(cookie),
+                     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
-                     dlm_get_lock_cookie_seq(cookie),
+                     locklen, name, node);
-                     locklen, name, locklen);
                ret = DLM_IVLOCKID;
                goto leave;
        }
@@ -337,12 +340,12 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
        spin_lock(&res->spinlock);
        if (res->state & DLM_LOCK_RES_RECOVERING) {
-                mlog(0, "responding with DLM_RECOVERING!\n");
+                mlog(0, "Responding with DLM_RECOVERING!\n");
                ret = DLM_RECOVERING;
                goto unlock_out;
        }
        if (res->state & DLM_LOCK_RES_MIGRATING) {
-                mlog(0, "responding with DLM_MIGRATING!\n");
+                mlog(0, "Responding with DLM_MIGRATING!\n");
                ret = DLM_MIGRATING;
                goto unlock_out;
        }
@@ -351,7 +354,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
        lock = NULL;
        list_for_each(iter, head) {
                lock = list_entry (iter, struct dlm_lock, list);
-                if (be64_to_cpu(lock->ml.cookie) == cookie)
+                if (lock->ml.cookie == cookie)
                        goto do_ast;
        }
@@ -363,15 +366,15 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
        list_for_each(iter, head) {
                lock = list_entry (iter, struct dlm_lock, list);
-                if (be64_to_cpu(lock->ml.cookie) == cookie)
+                if (lock->ml.cookie == cookie)
                        goto do_ast;
        }
-        mlog(0, "got %sast for unknown lock!  cookie=%u:%llu, "
+        mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, "
-             "name=%.*s, namelen=%u\n", past->type == DLM_AST ? "" : "b", 
+             "node=%u\n", past->type == DLM_AST ? "" : "b",
-             dlm_get_lock_cookie_node(cookie),
+             dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
-             dlm_get_lock_cookie_seq(cookie),
+             dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
-             locklen, name, locklen);
+             locklen, name, node);
        ret = DLM_NORMAL;
 unlock_out:
@@ -383,8 +386,8 @@ do_ast:
        if (past->type == DLM_AST) {
                /* do not alter lock refcount.  switching lists. */
                list_move_tail(&lock->list, &res->granted);
-                mlog(0, "ast: adding to granted list... type=%d, "
+                mlog(0, "ast: Adding to granted list... type=%d, "
-                          "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
+                     "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
                if (lock->ml.convert_type != LKM_IVMODE) {
                        lock->ml.type = lock->ml.convert_type;
                        lock->ml.convert_type = LKM_IVMODE;
@@ -408,7 +411,6 @@ do_ast:
                dlm_do_local_bast(dlm, res, lock, past->blocked_type);
 leave:
        if (res)
                dlm_lockres_put(res);
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index d5a86fb81a49..bb53714813ab 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -140,6 +140,7 @@ struct dlm_ctxt
        unsigned int purge_count;
        spinlock_t spinlock;
        spinlock_t ast_lock;
+        spinlock_t track_lock;
        char *name;
        u8 node_num;
        u32 key;
@@ -316,6 +317,8 @@ struct dlm_lock_resource
         * put on a list for the dlm thread to run. */
        unsigned long    last_used;
+        struct dlm_ctxt *dlm;
        unsigned migration_pending:1;
        atomic_t asts_reserved;
        spinlock_t spinlock;
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 1b81dcba175d..b32f60a5acfb 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -630,43 +630,38 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
 {
        struct debug_lockres *dl = m->private;
        struct dlm_ctxt *dlm = dl->dl_ctxt;
+        struct dlm_lock_resource *oldres = dl->dl_res;
        struct dlm_lock_resource *res = NULL;
+        struct list_head *track_list;
-        spin_lock(&dlm->spinlock);
+        spin_lock(&dlm->track_lock);
+        if (oldres)
+                track_list = &oldres->tracking;
+        else
+                track_list = &dlm->tracking_list;
-        if (dl->dl_res) {
+        list_for_each_entry(res, track_list, tracking) {
-                list_for_each_entry(res, &dl->dl_res->tracking, tracking) {
+                if (&res->tracking == &dlm->tracking_list)
-                        if (dl->dl_res) {
+                        res = NULL;
-                                dlm_lockres_put(dl->dl_res);
+                else
-                                dl->dl_res = NULL;
-                        }
-                        if (&res->tracking == &dlm->tracking_list) {
-                                mlog(0, "End of list found, %p\n", res);
-                                dl = NULL;
-                                break;
-                        }
                        dlm_lockres_get(res);
-                        dl->dl_res = res;
+                break;
-                        break;
-                }
-        } else {
-                if (!list_empty(&dlm->tracking_list)) {
-                        list_for_each_entry(res, &dlm->tracking_list, tracking)
-                                break;
-                        dlm_lockres_get(res);
-                        dl->dl_res = res;
-                } else
-                        dl = NULL;
        }
+        spin_unlock(&dlm->track_lock);
-        if (dl) {
+        if (oldres)
-                spin_lock(&dl->dl_res->spinlock);
+                dlm_lockres_put(oldres);
-                dump_lockres(dl->dl_res, dl->dl_buf, dl->dl_len - 1);
-                spin_unlock(&dl->dl_res->spinlock);
-        }
-        spin_unlock(&dlm->spinlock);
+        dl->dl_res = res;
+        if (res) {
+                spin_lock(&res->spinlock);
+                dump_lockres(res, dl->dl_buf, dl->dl_len - 1);
+                spin_unlock(&res->spinlock);
+        } else
+                dl = NULL;
+        /* passed to seq_show */
        return dl;
 }
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 63f8125824e8..d8d578f45613 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1550,6 +1550,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
        spin_lock_init(&dlm->spinlock);
        spin_lock_init(&dlm->master_lock);
        spin_lock_init(&dlm->ast_lock);
+        spin_lock_init(&dlm->track_lock);
        INIT_LIST_HEAD(&dlm->list);
        INIT_LIST_HEAD(&dlm->dirty_list);
        INIT_LIST_HEAD(&dlm->reco.resources);
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 6f7a77d54020..1c9efb406a96 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -341,7 +341,6 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
                inode->i_mode = mode;
                inode->i_uid = current_fsuid();
                inode->i_gid = current_fsgid();
-                inode->i_blocks = 0;
                inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
                inc_nlink(inode);
@@ -367,7 +366,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
        inode->i_mode = mode;
        inode->i_uid = current_fsuid();
        inode->i_gid = current_fsgid();
-        inode->i_blocks = 0;
        inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
        inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 44f87caf3683..54e182a27caf 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -505,8 +505,10 @@ void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
 static void dlm_lockres_release(struct kref *kref)
 {
        struct dlm_lock_resource *res;
+        struct dlm_ctxt *dlm;
        res = container_of(kref, struct dlm_lock_resource, refs);
+        dlm = res->dlm;
        /* This should not happen -- all lockres' have a name
         * associated with them at init time. */
@@ -515,6 +517,7 @@ static void dlm_lockres_release(struct kref *kref)
        mlog(0, "destroying lockres %.*s\n", res->lockname.len,
             res->lockname.name);
+        spin_lock(&dlm->track_lock);
        if (!list_empty(&res->tracking))
                list_del_init(&res->tracking);
        else {
@@ -522,6 +525,9 @@ static void dlm_lockres_release(struct kref *kref)
                     res->lockname.len, res->lockname.name);
                dlm_print_one_lock_resource(res);
        }
+        spin_unlock(&dlm->track_lock);
+        dlm_put(dlm);
        if (!hlist_unhashed(&res->hash_node) ||
            !list_empty(&res->granted) ||
@@ -595,6 +601,10 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
        res->migration_pending = 0;
        res->inflight_locks = 0;
+        /* put in dlm_lockres_release */
+        dlm_grab(dlm);
+        res->dlm = dlm;
        kref_init(&res->refs);
        /* just for consistency */
@@ -722,14 +732,21 @@ lookup:
        if (tmpres) {
                int dropping_ref = 0;
+                spin_unlock(&dlm->spinlock);
                spin_lock(&tmpres->spinlock);
+                /* We wait for the other thread that is mastering the resource */
+                if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
+                        __dlm_wait_on_lockres(tmpres);
+                        BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
+                }
                if (tmpres->owner == dlm->node_num) {
                        BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF);
                        dlm_lockres_grab_inflight_ref(dlm, tmpres);
                } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF)
                        dropping_ref = 1;
                spin_unlock(&tmpres->spinlock);
-                spin_unlock(&dlm->spinlock);
                /* wait until done messaging the master, drop our ref to allow
                 * the lockres to be purged, start over. */
@@ -2949,7 +2966,7 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
                                  struct dlm_node_iter *iter)
 {
        struct dlm_migrate_request migrate;
-        int ret, status = 0;
+        int ret, skip, status = 0;
        int nodenum;
        memset(&migrate, 0, sizeof(migrate));
@@ -2966,12 +2983,27 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
                    nodenum == new_master)
                        continue;
+                /* May race with domain exit. Skip nodes that have exited. */
+                spin_lock(&dlm->spinlock);
+                skip = (!test_bit(nodenum, dlm->domain_map));
+                spin_unlock(&dlm->spinlock);
+                if (skip) {
+                        clear_bit(nodenum, iter->node_map);
+                        continue;
+                }
                ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key,
                                         &migrate, sizeof(migrate), nodenum,
                                         &status);
-                if (ret < 0)
+                if (ret < 0) {
-                        mlog_errno(ret);
+                        mlog(0, "migrate_request returned %d!\n", ret);
-                else if (status < 0) {
+                        if (!dlm_is_host_down(ret)) {
+                                mlog(ML_ERROR, "unhandled error=%d!\n", ret);
+                                BUG();
+                        }
+                        clear_bit(nodenum, iter->node_map);
+                        ret = 0;
+                } else if (status < 0) {
                        mlog(0, "migrate request (node %u) returned %d!\n",
                             nodenum, status);
                        ret = status;
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 4060bb328bc8..d1295203029f 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -181,7 +181,8 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
                spin_lock(&res->spinlock);
                /* This ensures that clear refmap is sent after the set */
-                __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
+                __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_SETREF_INPROG |
+                                                  DLM_LOCK_RES_MIGRATING));
                spin_unlock(&res->spinlock);
                /* clear our bit from the master's refmap, ignore errors */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 6e6cc0a2e5f7..206a2370876a 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -32,6 +32,7 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 #include <linux/time.h>
+#include <linux/quotaops.h>
 #define MLOG_MASK_PREFIX ML_DLM_GLUE
 #include <cluster/masklog.h>
@@ -51,6 +52,7 @@
 #include "slot_map.h"
 #include "super.h"
 #include "uptodate.h"
+#include "quota.h"
 #include "buffer_head_io.h"
@@ -68,6 +70,7 @@ struct ocfs2_mask_waiter {
 static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
 static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres);
 static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres);
+static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres);
 /*
 * Return value from ->downconvert_worker functions.
@@ -102,6 +105,7 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
 static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
                                     struct ocfs2_lock_res *lockres);
+static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres);
 #define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
@@ -111,8 +115,7 @@ static void ocfs2_dump_meta_lvb_info(u64 level,
                                     unsigned int line,
                                     struct ocfs2_lock_res *lockres)
 {
-        struct ocfs2_meta_lvb *lvb =
+        struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
-                (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
        mlog(level, "LVB information for %s (called from %s:%u):\n",
             lockres->l_name, function, line);
@@ -258,6 +261,12 @@ static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
        .flags          = 0,
 };
+static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = {
+        .set_lvb        = ocfs2_set_qinfo_lvb,
+        .get_osb        = ocfs2_get_qinfo_osb,
+        .flags          = LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB,
+};
 static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
 {
        return lockres->l_type == OCFS2_LOCK_TYPE_META ||
@@ -279,6 +288,13 @@ static inline struct ocfs2_dentry_lock *ocfs2_lock_res_dl(struct ocfs2_lock_res
        return (struct ocfs2_dentry_lock *)lockres->l_priv;
 }
+static inline struct ocfs2_mem_dqinfo *ocfs2_lock_res_qinfo(struct ocfs2_lock_res *lockres)
+{
+        BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_QINFO);
+        return (struct ocfs2_mem_dqinfo *)lockres->l_priv;
+}
 static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres)
 {
        if (lockres->l_ops->get_osb)
@@ -507,6 +523,13 @@ static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres)
        return OCFS2_SB(inode->i_sb);
 }
+static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres)
+{
+        struct ocfs2_mem_dqinfo *info = lockres->l_priv;
+        return OCFS2_SB(info->dqi_gi.dqi_sb);
+}
 static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres)
 {
        struct ocfs2_file_private *fp = lockres->l_priv;
@@ -609,6 +632,17 @@ void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
        lockres->l_flags |= OCFS2_LOCK_NOCACHE;
 }
+void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
+                               struct ocfs2_mem_dqinfo *info)
+{
+        ocfs2_lock_res_init_once(lockres);
+        ocfs2_build_lock_name(OCFS2_LOCK_TYPE_QINFO, info->dqi_gi.dqi_type,
+                              0, lockres->l_name);
+        ocfs2_lock_res_init_common(OCFS2_SB(info->dqi_gi.dqi_sb), lockres,
+                                   OCFS2_LOCK_TYPE_QINFO, &ocfs2_qinfo_lops,
+                                   info);
+}
 void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
 {
        mlog_entry_void();
@@ -1290,7 +1324,7 @@ again:
                        goto out;
                }
-                mlog(0, "lock %s, successfull return from ocfs2_dlm_lock\n",
+                mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n",
                     lockres->l_name);
                /* At this point we've gone inside the dlm and need to
@@ -1829,7 +1863,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
        mlog_entry_void();
-        lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
+        lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
        /*
         * Invalidate the LVB of a deleted inode - this way other
@@ -1881,7 +1915,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
        mlog_meta_lvb(0, lockres);
-        lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
+        lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
        /* We're safe here without the lockres lock... */
        spin_lock(&oi->ip_lock);
@@ -1916,8 +1950,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
 static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
                                              struct ocfs2_lock_res *lockres)
 {
-        struct ocfs2_meta_lvb *lvb =
+        struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
-                (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
        if (lvb->lvb_version == OCFS2_LVB_VERSION
            && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
@@ -2024,7 +2057,7 @@ static int ocfs2_inode_lock_update(struct inode *inode,
        } else {
                /* Boo, we have to go to disk. */
                /* read bh, cast, ocfs2_refresh_inode */
-                status = ocfs2_read_block(inode, oi->ip_blkno, bh);
+                status = ocfs2_read_inode_block(inode, bh);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail_refresh;
@@ -2032,18 +2065,14 @@ static int ocfs2_inode_lock_update(struct inode *inode,
                fe = (struct ocfs2_dinode *) (*bh)->b_data;
                /* This is a good chance to make sure we're not
-                 * locking an invalid object.
+                 * locking an invalid object.  ocfs2_read_inode_block()
+                 * already checked that the inode block is sane.
                 *
                 * We bug on a stale inode here because we checked
                 * above whether it was wiped from disk. The wiping
                 * node provides a guarantee that we receive that
                 * message and can mark the inode before dropping any
                 * locks associated with it. */
-                if (!OCFS2_IS_VALID_DINODE(fe)) {
-                        OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
-                        status = -EIO;
-                        goto bail_refresh;
-                }
                mlog_bug_on_msg(inode->i_generation !=
                                le32_to_cpu(fe->i_generation),
                                "Invalid dinode %llu disk generation: %u "
@@ -2085,7 +2114,7 @@ static int ocfs2_assign_bh(struct inode *inode,
                return 0;
        }
-        status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, ret_bh);
+        status = ocfs2_read_inode_block(inode, ret_bh);
        if (status < 0)
                mlog_errno(status);
@@ -2831,6 +2860,10 @@ static void ocfs2_unlock_ast(void *opaque, int error)
        case OCFS2_UNLOCK_CANCEL_CONVERT:
                mlog(0, "Cancel convert success for %s\n", lockres->l_name);
                lockres->l_action = OCFS2_AST_INVALID;
+                /* The downconvert thread may have requeued this lock, so
+                 * we need to wake the thread. */
+                if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
+                        ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
                break;
        case OCFS2_UNLOCK_DROP_LOCK:
                lockres->l_level = DLM_LOCK_IV;
@@ -2922,7 +2955,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
                ocfs2_dlm_dump_lksb(&lockres->l_lksb);
                BUG();
        }
-        mlog(0, "lock %s, successfull return from ocfs2_dlm_unlock\n",
+        mlog(0, "lock %s, successful return from ocfs2_dlm_unlock\n",
             lockres->l_name);
        ocfs2_wait_on_busy_lock(lockres);
@@ -3449,6 +3482,117 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
        return UNBLOCK_CONTINUE_POST;
 }
+static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
+{
+        struct ocfs2_qinfo_lvb *lvb;
+        struct ocfs2_mem_dqinfo *oinfo = ocfs2_lock_res_qinfo(lockres);
+        struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
+                                            oinfo->dqi_gi.dqi_type);
+        mlog_entry_void();
+        lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+        lvb->lvb_version = OCFS2_QINFO_LVB_VERSION;
+        lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace);
+        lvb->lvb_igrace = cpu_to_be32(info->dqi_igrace);
+        lvb->lvb_syncms = cpu_to_be32(oinfo->dqi_syncms);
+        lvb->lvb_blocks = cpu_to_be32(oinfo->dqi_gi.dqi_blocks);
+        lvb->lvb_free_blk = cpu_to_be32(oinfo->dqi_gi.dqi_free_blk);
+        lvb->lvb_free_entry = cpu_to_be32(oinfo->dqi_gi.dqi_free_entry);
+        mlog_exit_void();
+}
+void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+        struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
+        struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
+        int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+        mlog_entry_void();
+        if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
+                ocfs2_cluster_unlock(osb, lockres, level);
+        mlog_exit_void();
+}
+static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
+{
+        struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
+                                            oinfo->dqi_gi.dqi_type);
+        struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
+        struct ocfs2_qinfo_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+        struct buffer_head *bh = NULL;
+        struct ocfs2_global_disk_dqinfo *gdinfo;
+        int status = 0;
+        if (lvb->lvb_version == OCFS2_QINFO_LVB_VERSION) {
+                info->dqi_bgrace = be32_to_cpu(lvb->lvb_bgrace);
+                info->dqi_igrace = be32_to_cpu(lvb->lvb_igrace);
+                oinfo->dqi_syncms = be32_to_cpu(lvb->lvb_syncms);
+                oinfo->dqi_gi.dqi_blocks = be32_to_cpu(lvb->lvb_blocks);
+                oinfo->dqi_gi.dqi_free_blk = be32_to_cpu(lvb->lvb_free_blk);
+                oinfo->dqi_gi.dqi_free_entry =
+                                        be32_to_cpu(lvb->lvb_free_entry);
+        } else {
+                status = ocfs2_read_quota_block(oinfo->dqi_gqinode, 0, &bh);
+                if (status) {
+                        mlog_errno(status);
+                        goto bail;
+                }
+                gdinfo = (struct ocfs2_global_disk_dqinfo *)
+                                        (bh->b_data + OCFS2_GLOBAL_INFO_OFF);
+                info->dqi_bgrace = le32_to_cpu(gdinfo->dqi_bgrace);
+                info->dqi_igrace = le32_to_cpu(gdinfo->dqi_igrace);
+                oinfo->dqi_syncms = le32_to_cpu(gdinfo->dqi_syncms);
+                oinfo->dqi_gi.dqi_blocks = le32_to_cpu(gdinfo->dqi_blocks);
+                oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(gdinfo->dqi_free_blk);
+                oinfo->dqi_gi.dqi_free_entry =
+                                        le32_to_cpu(gdinfo->dqi_free_entry);
+                brelse(bh);
+                ocfs2_track_lock_refresh(lockres);
+        }
+bail:
+        return status;
+}
+/* Lock quota info. This function expects at least a shared lock on the quota
+ * file to be held, so that we can safely refresh quota info from disk. */
+int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+        struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
+        struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
+        int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+        int status = 0;
+        mlog_entry_void();
+        /* On RO devices, locking really isn't needed... */
+        if (ocfs2_is_hard_readonly(osb)) {
+                if (ex)
+                        status = -EROFS;
+                goto bail;
+        }
+        if (ocfs2_mount_local(osb))
+                goto bail;
+        status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
+        if (status < 0) {
+                mlog_errno(status);
+                goto bail;
+        }
+        if (!ocfs2_should_refresh_lock_res(lockres))
+                goto bail;
+        /* OK, we have the lock but we need to refresh the quota info */
+        status = ocfs2_refresh_qinfo(oinfo);
+        if (status)
+                ocfs2_qinfo_unlock(oinfo, ex);
+        ocfs2_complete_lock_res_refresh(lockres, status);
+bail:
+        mlog_exit(status);
+        return status;
+}
 /*
 * This is the filesystem locking protocol.  It provides the lock handling
 * hooks for the underlying DLM.  It has a maximum version number.
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 2bb01f09c1b1..3f8d9986b8e0 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -49,6 +49,19 @@ struct ocfs2_meta_lvb {
        __be32       lvb_reserved2;
 };
+#define OCFS2_QINFO_LVB_VERSION 1
+struct ocfs2_qinfo_lvb {
+        __u8    lvb_version;
+        __u8    lvb_reserved[3];
+        __be32  lvb_bgrace;
+        __be32  lvb_igrace;
+        __be32  lvb_syncms;
+        __be32  lvb_blocks;
+        __be32  lvb_free_blk;
+        __be32  lvb_free_entry;
+};
 /* ocfs2_inode_lock_full() 'arg_flags' flags */
 /* don't wait on recovery. */
 #define OCFS2_META_LOCK_RECOVERY        (0x01)
@@ -69,6 +82,9 @@ void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
 struct ocfs2_file_private;
 void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
                              struct ocfs2_file_private *fp);
+struct ocfs2_mem_dqinfo;
+void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
+                               struct ocfs2_mem_dqinfo *info);
 void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
 int ocfs2_create_new_inode_locks(struct inode *inode);
 int ocfs2_drop_inode_locks(struct inode *inode);
@@ -103,6 +119,9 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex);
 void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
 int ocfs2_file_lock(struct file *file, int ex, int trylock);
 void ocfs2_file_unlock(struct file *file);
+int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex);
+void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex);
 void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
 void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 2baedac58234..f2bb1a04d253 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -293,7 +293,7 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
        struct ocfs2_extent_block *eb;
        struct ocfs2_extent_list *el;
-        ret = ocfs2_read_block(inode, last_eb_blk, &eb_bh);
+        ret = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -302,12 +302,6 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
        eb = (struct ocfs2_extent_block *) eb_bh->b_data;
        el = &eb->h_list;
-        if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-                ret = -EROFS;
-                OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-                goto out;
-        }
        if (el->l_tree_depth) {
                ocfs2_error(inode->i_sb,
                            "Inode %lu has non zero tree depth in "
@@ -381,23 +375,16 @@ static int ocfs2_figure_hole_clusters(struct inode *inode,
                if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
                        goto no_more_extents;
-                ret = ocfs2_read_block(inode,
+                ret = ocfs2_read_extent_block(inode,
-                                       le64_to_cpu(eb->h_next_leaf_blk),
+                                              le64_to_cpu(eb->h_next_leaf_blk),
-                                       &next_eb_bh);
+                                              &next_eb_bh);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
-                next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
-                if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) {
-                        ret = -EROFS;
-                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb);
-                        goto out;
-                }
+                next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
                el = &next_eb->h_list;
                i = ocfs2_search_for_hole_index(el, v_cluster);
        }
@@ -630,7 +617,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
        if (ret == 0)
                goto out;
-        ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
+        ret = ocfs2_read_inode_block(inode, &di_bh);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -819,3 +806,74 @@ out:
        return ret;
 }
+/*
+ * Read nr blocks of an inode, starting at virtual block v_block, into bhs[].
+ *
+ * Each pass of the loop maps the next unread virtual block to a physical
+ * block with ocfs2_extent_map_get_blocks() and hands that run to
+ * ocfs2_read_blocks(), passing flags and validate through unchanged.
+ *
+ * If the last requested block starts at or beyond i_size, nothing is read
+ * and 0 is returned; that is only permitted when OCFS2_BH_READAHEAD is set
+ * in flags (anything else trips the BUG_ON).  A virtual block that maps to
+ * physical block 0 is treated as a hole and fails with -EIO.  Errors from
+ * the mapping or the read are logged and returned as-is.
+ */
+int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
+                           struct buffer_head *bhs[], int flags,
+                           int (*validate)(struct super_block *sb,
+                                           struct buffer_head *bh))
+{
+        int rc = 0;
+        u64 p_block, p_count;
+        int i, count, done = 0;
+        mlog_entry("(inode = %p, v_block = %llu, nr = %d, bhs = %p, "
+                   "flags = %x, validate = %p)\n",
+                   inode, (unsigned long long)v_block, nr, bhs, flags,
+                   validate);
+        /*
+         * The request reaches to or past i_size.  Only a readahead caller
+         * may ask for that, and it gets rc == 0 with nothing read.
+         */
+        if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >=
+            i_size_read(inode)) {
+                BUG_ON(!(flags & OCFS2_BH_READAHEAD));
+                goto out;
+        }
+        while (done < nr) {
+                /* ip_alloc_sem is held for read only around the lookup. */
+                down_read(&OCFS2_I(inode)->ip_alloc_sem);
+                rc = ocfs2_extent_map_get_blocks(inode, v_block + done,
+                                                 &p_block, &p_count, NULL);
+                up_read(&OCFS2_I(inode)->ip_alloc_sem);
+                if (rc) {
+                        mlog_errno(rc);
+                        break;
+                }
+                if (!p_block) {
+                        rc = -EIO;
+                        mlog(ML_ERROR,
+                             "Inode #%llu contains a hole at offset %llu\n",
+                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                             (unsigned long long)(v_block + done) <<
+                             inode->i_sb->s_blocksize_bits);
+                        break;
+                }
+                /*
+                 * Clamp this pass to what is left of the request and to
+                 * p_count (presumably the number of contiguous blocks
+                 * mapped from p_block -- confirm against
+                 * ocfs2_extent_map_get_blocks()).
+                 */
+                count = nr - done;
+                if (p_count < count)
+                        count = p_count;
+                /*
+                 * If the caller passed us bhs, they should have come
+                 * from a previous readahead call to this function.  Thus,
+                 * they should have the right b_blocknr.
+                 */
+                for (i = 0; i < count; i++) {
+                        if (!bhs[done + i])
+                                continue;
+                        BUG_ON(bhs[done + i]->b_blocknr != (p_block + i));
+                }
+                rc = ocfs2_read_blocks(inode, p_block, count, bhs + done,
+                                       flags, validate);
+                if (rc) {
+                        mlog_errno(rc);
+                        break;
+                }
+                done += count;
+        }
+out:
+        mlog_exit(rc);
+        return rc;
+}
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index 1c4aa8b06f34..b7dd9731b462 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -57,4 +57,28 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
                             u32 *p_cluster, u32 *num_clusters,
                             struct ocfs2_extent_list *el);
+int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
+                           struct buffer_head *bhs[], int flags,
+                           int (*validate)(struct super_block *sb,
+                                           struct buffer_head *bh));
+/*
+ * Single-block convenience wrapper: calls ocfs2_read_virt_blocks() with
+ * nr = 1 and flags = 0.  Returns -EINVAL (after a printk) when bh is NULL,
+ * otherwise whatever ocfs2_read_virt_blocks() returns.
+ */
+static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block,
+                                        struct buffer_head **bh,
+                                        int (*validate)(struct super_block *sb,
+                                                        struct buffer_head *bh))
+{
+        int status = 0;
+        if (bh == NULL) {
+                printk("ocfs2: bh == NULL\n");
+                status = -EINVAL;
+                goto bail;
+        }
+        status = ocfs2_read_virt_blocks(inode, v_block, 1, bh, 0, validate);
+bail:
+        return status;
+}
 #endif  /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index e2570a3bc2b2..a5887df2cd8a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -35,6 +35,7 @@
 #include <linux/mount.h>
 #include <linux/writeback.h>
 #include <linux/falloc.h>
+#include <linux/quotaops.h>
 #define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
@@ -56,6 +57,8 @@
 #include "suballoc.h"
 #include "super.h"
 #include "xattr.h"
+#include "acl.h"
+#include "quota.h"
 #include "buffer_head_io.h"
@@ -253,8 +256,8 @@ int ocfs2_update_inode_atime(struct inode *inode,
                goto out;
        }
-        ret = ocfs2_journal_access(handle, inode, bh,
+        ret = ocfs2_journal_access_di(handle, inode, bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out_commit;
@@ -303,9 +306,9 @@ bail:
        return status;
 }
-static int ocfs2_simple_size_update(struct inode *inode,
+int ocfs2_simple_size_update(struct inode *inode,
-                                    struct buffer_head *di_bh,
+                             struct buffer_head *di_bh,
-                                    u64 new_i_size)
+                             u64 new_i_size)
 {
        int ret;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -350,8 +353,8 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
                goto out;
        }
-        status = ocfs2_journal_access(handle, inode, fe_bh,
+        status = ocfs2_journal_access_di(handle, inode, fe_bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto out_commit;
@@ -401,12 +404,9 @@ static int ocfs2_truncate_file(struct inode *inode,
                   (unsigned long long)OCFS2_I(inode)->ip_blkno,
                   (unsigned long long)new_i_size);
+        /* We trust di_bh because it comes from ocfs2_inode_lock(), which
+         * already validated it */
        fe = (struct ocfs2_dinode *) di_bh->b_data;
-        if (!OCFS2_IS_VALID_DINODE(fe)) {
-                OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
-                status = -EIO;
-                goto bail;
-        }
        mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
                        "Inode %llu, inode i_size = %lld != di "
@@ -536,6 +536,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
        enum ocfs2_alloc_restarted why;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct ocfs2_extent_tree et;
+        int did_quota = 0;
        mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
@@ -545,18 +546,12 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
         */
        BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
-        status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
+        status = ocfs2_read_inode_block(inode, &bh);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
        }
        fe = (struct ocfs2_dinode *) bh->b_data;
-        if (!OCFS2_IS_VALID_DINODE(fe)) {
-                OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
-                status = -EIO;
-                goto leave;
-        }
 restart_all:
        BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
@@ -585,11 +580,18 @@ restart_all:
        }
 restarted_transaction:
+        if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb,
+            clusters_to_add))) {
+                status = -EDQUOT;
+                goto leave;
+        }
+        did_quota = 1;
        /* reserve a write to the file entry early on - that we if we
         * run out of credits in the allocation path, we can still
         * update i_size. */
-        status = ocfs2_journal_access(handle, inode, bh,
+        status = ocfs2_journal_access_di(handle, inode, bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -622,6 +624,10 @@ restarted_transaction:
        spin_lock(&OCFS2_I(inode)->ip_lock);
        clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
        spin_unlock(&OCFS2_I(inode)->ip_lock);
+        /* Release unused quota reservation */
+        vfs_dq_free_space(inode,
+                        ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
+        did_quota = 0;
        if (why != RESTART_NONE && clusters_to_add) {
                if (why == RESTART_META) {
@@ -654,6 +660,9 @@ restarted_transaction:
             OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));
 leave:
+        if (status < 0 && did_quota)
+                vfs_dq_free_space(inode,
+                        ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
        if (handle) {
                ocfs2_commit_trans(osb, handle);
                handle = NULL;
@@ -885,6 +894,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
        struct ocfs2_super *osb = OCFS2_SB(sb);
        struct buffer_head *bh = NULL;
        handle_t *handle = NULL;
+        int locked[MAXQUOTAS] = {0, 0};
+        int credits, qtype;
+        struct ocfs2_mem_dqinfo *oinfo;
        mlog_entry("(0x%p, '%.*s')\n", dentry,
                   dentry->d_name.len, dentry->d_name.name);
@@ -955,11 +967,47 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
                }
        }
-        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+        if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
-        if (IS_ERR(handle)) {
+            (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
-                status = PTR_ERR(handle);
+                credits = OCFS2_INODE_UPDATE_CREDITS;
-                mlog_errno(status);
+                if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
-                goto bail_unlock;
+                    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+                    OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
+                        oinfo = sb_dqinfo(sb, USRQUOTA)->dqi_priv;
+                        status = ocfs2_lock_global_qf(oinfo, 1);
+                        if (status < 0)
+                                goto bail_unlock;
+                        credits += ocfs2_calc_qinit_credits(sb, USRQUOTA) +
+                                ocfs2_calc_qdel_credits(sb, USRQUOTA);
+                        locked[USRQUOTA] = 1;
+                }
+                if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid
+                    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+                    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
+                        oinfo = sb_dqinfo(sb, GRPQUOTA)->dqi_priv;
+                        status = ocfs2_lock_global_qf(oinfo, 1);
+                        if (status < 0)
+                                goto bail_unlock;
+                        credits += ocfs2_calc_qinit_credits(sb, GRPQUOTA) +
+                                   ocfs2_calc_qdel_credits(sb, GRPQUOTA);
+                        locked[GRPQUOTA] = 1;
+                }
+                handle = ocfs2_start_trans(osb, credits);
+                if (IS_ERR(handle)) {
+                        status = PTR_ERR(handle);
+                        mlog_errno(status);
+                        goto bail_unlock;
+                }
+                status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
+                if (status < 0)
+                        goto bail_commit;
+        } else {
+                handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+                if (IS_ERR(handle)) {
+                        status = PTR_ERR(handle);
+                        mlog_errno(status);
+                        goto bail_unlock;
+                }
        }
        /*
@@ -982,6 +1030,12 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 bail_commit:
        ocfs2_commit_trans(osb, handle);
 bail_unlock:
+        for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
+                if (!locked[qtype])
+                        continue;
+                oinfo = sb_dqinfo(sb, qtype)->dqi_priv;
+                ocfs2_unlock_global_qf(oinfo, 1);
+        }
        ocfs2_inode_unlock(inode, 1);
 bail_unlock_rw:
        if (size_change)
@@ -989,6 +1043,12 @@ bail_unlock_rw:
 bail:
        brelse(bh);
+        if (!status && attr->ia_valid & ATTR_MODE) {
+                status = ocfs2_acl_chmod(inode);
+                if (status < 0)
+                        mlog_errno(status);
+        }
        mlog_exit(status);
        return status;
 }
@@ -1035,7 +1095,7 @@ int ocfs2_permission(struct inode *inode, int mask)
                goto out;
        }
-        ret = generic_permission(inode, mask, NULL);
+        ret = generic_permission(inode, mask, ocfs2_check_acl);
        ocfs2_inode_unlock(inode, 0);
 out:
@@ -1061,8 +1121,8 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
                goto out;
        }
-        ret = ocfs2_journal_access(handle, inode, bh,
+        ret = ocfs2_journal_access_di(handle, inode, bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret < 0) {
                mlog_errno(ret);
                goto out_trans;
@@ -1128,9 +1188,8 @@ static int ocfs2_write_remove_suid(struct inode *inode)
 {
        int ret;
        struct buffer_head *bh = NULL;
-        struct ocfs2_inode_info *oi = OCFS2_I(inode);
-        ret = ocfs2_read_block(inode, oi->ip_blkno, &bh);
+        ret = ocfs2_read_inode_block(inode, &bh);
        if (ret < 0) {
                mlog_errno(ret);
                goto out;
@@ -1156,8 +1215,7 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode,
        struct buffer_head *di_bh = NULL;
        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
-                ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno,
+                ret = ocfs2_read_inode_block(inode, &di_bh);
-                                       &di_bh);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -1226,83 +1284,6 @@ out:
        return ret;
 }
-static int __ocfs2_remove_inode_range(struct inode *inode,
-                                      struct buffer_head *di_bh,
-                                      u32 cpos, u32 phys_cpos, u32 len,
-                                      struct ocfs2_cached_dealloc_ctxt *dealloc)
-{
-        int ret;
-        u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-        struct inode *tl_inode = osb->osb_tl_inode;
-        handle_t *handle;
-        struct ocfs2_alloc_context *meta_ac = NULL;
-        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
-        struct ocfs2_extent_tree et;
-        ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
-        ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
-        if (ret) {
-                mlog_errno(ret);
-                return ret;
-        }
-        mutex_lock(&tl_inode->i_mutex);
-        if (ocfs2_truncate_log_needs_flush(osb)) {
-                ret = __ocfs2_flush_truncate_log(osb);
-                if (ret < 0) {
-                        mlog_errno(ret);
-                        goto out;
-                }
-        }
-        handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
-        if (IS_ERR(handle)) {
-                ret = PTR_ERR(handle);
-                mlog_errno(ret);
-                goto out;
-        }
-        ret = ocfs2_journal_access(handle, inode, di_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
-        if (ret) {
-                mlog_errno(ret);
-                goto out;
-        }
-        ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
-                                  dealloc);
-        if (ret) {
-                mlog_errno(ret);
-                goto out_commit;
-        }
-        OCFS2_I(inode)->ip_clusters -= len;
-        di->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
-        ret = ocfs2_journal_dirty(handle, di_bh);
-        if (ret) {
-                mlog_errno(ret);
-                goto out_commit;
-        }
-        ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
-        if (ret)
-                mlog_errno(ret);
-out_commit:
-        ocfs2_commit_trans(osb, handle);
-out:
-        mutex_unlock(&tl_inode->i_mutex);
-        if (meta_ac)
-                ocfs2_free_alloc_context(meta_ac);
-        return ret;
-}
 /*
 * Truncate a byte range, avoiding pages within partial clusters. This
 * preserves those pages for the zeroing code to write to.
@@ -1402,7 +1383,9 @@ static int ocfs2_remove_inode_range(struct inode *inode,
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct ocfs2_cached_dealloc_ctxt dealloc;
        struct address_space *mapping = inode->i_mapping;
+        struct ocfs2_extent_tree et;
+        ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
        ocfs2_init_dealloc_ctxt(&dealloc);
        if (byte_len == 0)
@@ -1458,9 +1441,9 @@ static int ocfs2_remove_inode_range(struct inode *inode,
                /* Only do work for non-holes */
                if (phys_cpos != 0) {
-                        ret = __ocfs2_remove_inode_range(inode, di_bh, cpos,
+                        ret = ocfs2_remove_btree_range(inode, &et, cpos,
-                                                         phys_cpos, alloc_size,
+                                                       phys_cpos, alloc_size,
-                                                         &dealloc);
+                                                       &dealloc);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
@@ -1622,7 +1605,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
                            struct ocfs2_space_resv *sr)
 {
        struct inode *inode = file->f_path.dentry->d_inode;
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
            !ocfs2_writes_unwritten_extents(osb))
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index e92382cbca5f..172f9fbc9fc7 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -51,6 +51,9 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
                         struct ocfs2_alloc_context *data_ac,
                         struct ocfs2_alloc_context *meta_ac,
                         enum ocfs2_alloc_restarted *reason_ret);
+int ocfs2_simple_size_update(struct inode *inode,
+                             struct buffer_head *di_bh,
+                             u64 new_i_size);
 int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
                          u64 zero_to);
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 7aa00d511874..229e707bc050 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -28,6 +28,7 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/quotaops.h>
 #include <asm/byteorder.h>
@@ -37,6 +38,7 @@
 #include "ocfs2.h"
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "file.h"
@@ -214,12 +216,11 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
        return 0;
 }
-int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
+void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
-                         int create_ino)
+                          int create_ino)
 {
        struct super_block *sb;
        struct ocfs2_super *osb;
-        int status = -EINVAL;
        int use_plocks = 1;
        mlog_entry("(0x%p, size:%llu)\n", inode,
@@ -232,25 +233,17 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
            ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks())
                use_plocks = 0;
-        /* this means that read_inode cannot create a superblock inode
+        /*
-         * today.  change if needed. */
+         * These have all been checked by ocfs2_read_inode_block() or set
-        if (!OCFS2_IS_VALID_DINODE(fe) ||
+         * by ocfs2_mknod_locked(), so a failure is a code bug.
-            !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
+         */
-                mlog(0, "Invalid dinode: i_ino=%lu, i_blkno=%llu, "
+        BUG_ON(!OCFS2_IS_VALID_DINODE(fe));  /* This means that read_inode
-                     "signature = %.*s, flags = 0x%x\n",
+                                                cannot create a superblock
-                     inode->i_ino,
+                                                inode today.  change if
-                     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
+                                                that is needed. */
-                     fe->i_signature, le32_to_cpu(fe->i_flags));
+        BUG_ON(!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)));
-                goto bail;
+        BUG_ON(le32_to_cpu(fe->i_fs_generation) != osb->fs_generation);
-        }
-        if (le32_to_cpu(fe->i_fs_generation) != osb->fs_generation) {
-                mlog(ML_ERROR, "file entry generation does not match "
-                     "superblock! osb->fs_generation=%x, "
-                     "fe->i_fs_generation=%x\n",
-                     osb->fs_generation, le32_to_cpu(fe->i_fs_generation));
-                goto bail;
-        }
        OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
        OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
@@ -284,14 +277,18 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
        inode->i_nlink = le16_to_cpu(fe->i_links_count);
-        if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL))
+        if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) {
                OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
+                inode->i_flags |= S_NOQUOTA;
+        }
        if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) {
                OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
                mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino);
        } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) {
                OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
+        } else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) {
+                inode->i_flags |= S_NOQUOTA;
        } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) {
                mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino);
                /* we can't actually hit this as read_inode can't
@@ -354,10 +351,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
        ocfs2_set_inode_flags(inode);
-        status = 0;
+        mlog_exit_void();
-bail:
-        mlog_exit(status);
-        return status;
 }
 static int ocfs2_read_locked_inode(struct inode *inode,
@@ -460,11 +454,14 @@ static int ocfs2_read_locked_inode(struct inode *inode,
                }
        }
-        if (can_lock)
+        if (can_lock) {
-                status = ocfs2_read_blocks(inode, args->fi_blkno, 1, &bh,
+                status = ocfs2_read_inode_block_full(inode, &bh,
-                                           OCFS2_BH_IGNORE_CACHE);
+                                                     OCFS2_BH_IGNORE_CACHE);
-        else
+        } else {
                status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
+                if (!status)
+                        status = ocfs2_validate_inode_block(osb->sb, bh);
+        }
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -472,12 +469,6 @@ static int ocfs2_read_locked_inode(struct inode *inode,
        status = -EINVAL;
        fe = (struct ocfs2_dinode *) bh->b_data;
-        if (!OCFS2_IS_VALID_DINODE(fe)) {
-                mlog(0, "Invalid dinode #%llu: signature = %.*s\n",
-                     (unsigned long long)args->fi_blkno, 7,
-                     fe->i_signature);
-                goto bail;
-        }
        /*
         * This is a code bug. Right now the caller needs to
@@ -491,10 +482,9 @@ static int ocfs2_read_locked_inode(struct inode *inode,
        if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
            S_ISBLK(le16_to_cpu(fe->i_mode)))
-                inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
+                inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
-        if (ocfs2_populate_inode(inode, fe, 0) < 0)
+        ocfs2_populate_inode(inode, fe, 0);
-                goto bail;
        BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
@@ -547,8 +537,8 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
                        goto out;
                }
-                status = ocfs2_journal_access(handle, inode, fe_bh,
+                status = ocfs2_journal_access_di(handle, inode, fe_bh,
-                                              OCFS2_JOURNAL_ACCESS_WRITE);
+                                                 OCFS2_JOURNAL_ACCESS_WRITE);
                if (status < 0) {
                        mlog_errno(status);
                        goto out;
@@ -615,7 +605,8 @@ static int ocfs2_remove_inode(struct inode *inode,
                goto bail;
        }
-        handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS);
+        handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS +
+                                        ocfs2_quota_trans_credits(inode->i_sb));
        if (IS_ERR(handle)) {
                status = PTR_ERR(handle);
                mlog_errno(status);
@@ -630,8 +621,8 @@ static int ocfs2_remove_inode(struct inode *inode,
        }
        /* set the inodes dtime */
-        status = ocfs2_journal_access(handle, inode, di_bh,
+        status = ocfs2_journal_access_di(handle, inode, di_bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail_commit;
@@ -647,6 +638,7 @@ static int ocfs2_remove_inode(struct inode *inode,
        }
        ocfs2_remove_from_cache(inode, di_bh);
+        vfs_dq_free_inode(inode);
        status = ocfs2_free_dinode(handle, inode_alloc_inode,
                                   inode_alloc_bh, di);
@@ -929,7 +921,10 @@ void ocfs2_delete_inode(struct inode *inode)
        mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
-        if (is_bad_inode(inode)) {
+        /* When we fail in read_inode() we mark inode as bad. The second test
+         * catches the case when inode allocation fails before allocating
+         * a block for inode. */
+        if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) {
                mlog(0, "Skipping delete of bad inode\n");
                goto bail;
        }
@@ -1195,8 +1190,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
        mlog_entry("(inode %llu)\n",
                   (unsigned long long)OCFS2_I(inode)->ip_blkno);
-        status = ocfs2_journal_access(handle, inode, bh,
+        status = ocfs2_journal_access_di(handle, inode, bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -1264,3 +1259,89 @@ void ocfs2_refresh_inode(struct inode *inode,
        spin_unlock(&OCFS2_I(inode)->ip_lock);
 }
+/*
+ * Check that bh holds a valid on-disk inode (struct ocfs2_dinode).
+ *
+ * The buffer must already be uptodate (BUG_ON otherwise).  Checks, in
+ * order: metadata ecc via ocfs2_validate_meta_ecc(), the dinode signature,
+ * i_blkno matching bh->b_blocknr, OCFS2_VALID_FL being set, and
+ * i_fs_generation matching the superblock's fs_generation.
+ *
+ * Returns 0 when every check passes, the ocfs2_validate_meta_ecc() error
+ * when the checksum fails, and -EINVAL (reported through ocfs2_error())
+ * when any of the later checks fails.
+ */
+int ocfs2_validate_inode_block(struct super_block *sb,
+                               struct buffer_head *bh)
+{
+        int rc;
+        struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
+        mlog(0, "Validating dinode %llu\n",
+             (unsigned long long)bh->b_blocknr);
+        BUG_ON(!buffer_uptodate(bh));
+        /*
+         * If the ecc fails, we return the error but otherwise
+         * leave the filesystem running.  We know any error is
+         * local to this block.
+         */
+        rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check);
+        if (rc) {
+                mlog(ML_ERROR, "Checksum failed for dinode %llu\n",
+                     (unsigned long long)bh->b_blocknr);
+                goto bail;
+        }
+        /*
+         * Errors after here are fatal.
+         */
+        /* Preset the failure code shared by all the checks below. */
+        rc = -EINVAL;
+        if (!OCFS2_IS_VALID_DINODE(di)) {
+                ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
+                            (unsigned long long)bh->b_blocknr, 7,
+                            di->i_signature);
+                goto bail;
+        }
+        /* The inode must record the block number it was read from. */
+        if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
+                ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
+                            (unsigned long long)bh->b_blocknr,
+                            (unsigned long long)le64_to_cpu(di->i_blkno));
+                goto bail;
+        }
+        if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
+                ocfs2_error(sb,
+                            "Invalid dinode #%llu: OCFS2_VALID_FL not set\n",
+                            (unsigned long long)bh->b_blocknr);
+                goto bail;
+        }
+        if (le32_to_cpu(di->i_fs_generation) !=
+            OCFS2_SB(sb)->fs_generation) {
+                ocfs2_error(sb,
+                            "Invalid dinode #%llu: fs_generation is %u\n",
+                            (unsigned long long)bh->b_blocknr,
+                            le32_to_cpu(di->i_fs_generation));
+                goto bail;
+        }
+        rc = 0;
+bail:
+        return rc;
+}
+/*
+ * Read this inode's own block (OCFS2_I(inode)->ip_blkno) through
+ * ocfs2_read_blocks(), with ocfs2_validate_inode_block() as the validate
+ * callback.  flags is passed straight through to ocfs2_read_blocks().
+ *
+ * *bh is written only on success, and only when the caller passed in
+ * *bh == NULL; a caller-supplied buffer head is handed to
+ * ocfs2_read_blocks() as-is.  Returns the ocfs2_read_blocks() result.
+ */
+int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
+                                int flags)
+{
+        int rc;
+        /* Work on a copy so *bh is untouched if the read fails. */
+        struct buffer_head *tmp = *bh;
+        rc = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, &tmp,
+                               flags, ocfs2_validate_inode_block);
+        /* If ocfs2_read_blocks() got us a new bh, pass it up. */
+        if (!rc && !*bh)
+                *bh = tmp;
+        return rc;
+}
+/*
+ * Read and validate this inode's block with no extra read flags; simply
+ * calls ocfs2_read_inode_block_full() with flags = 0.
+ */
+int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh)
+{
+        return ocfs2_read_inode_block_full(inode, bh, 0);
+}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 2f37af9bcc4a..eb3c302b38d3 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -128,8 +128,8 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
                         int sysfile_type);
 int ocfs2_inode_init_private(struct inode *inode);
 int ocfs2_inode_revalidate(struct dentry *dentry);
-int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
+void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
-                         int create_ino);
+                          int create_ino);
 void ocfs2_read_inode(struct inode *inode);
 void ocfs2_read_inode2(struct inode *inode, void *opaque);
 ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf,
@@ -142,6 +142,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
                           struct buffer_head *bh);
 int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
 int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
+struct buffer_head *ocfs2_bread(struct inode *inode,
+                                int block, int *err, int reada);
 void ocfs2_set_inode_flags(struct inode *inode);
 void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi);
@@ -153,4 +155,16 @@ static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
        return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits);
 }
+/* Validate that a bh contains a valid inode */
+int ocfs2_validate_inode_block(struct super_block *sb,
+                               struct buffer_head *bh);
+/*
+ * Read an inode block into *bh.  If *bh is NULL, a bh will be allocated.
+ * This is a cached read.  The inode will be validated with
+ * ocfs2_validate_inode_block().
+ */
+int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh);
+/* The same, but can be passed OCFS2_BH_* flags */
+int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
+                                int flags);
 #endif /* OCFS2_INODE_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 99fe9d584f3c..57d7d25a2b9a 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -35,6 +35,7 @@
 #include "ocfs2.h"
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dir.h"
 #include "dlmglue.h"
 #include "extent_map.h"
@@ -45,6 +46,7 @@
 #include "slot_map.h"
 #include "super.h"
 #include "sysfile.h"
+#include "quota.h"
 #include "buffer_head_io.h"
@@ -52,10 +54,10 @@ DEFINE_SPINLOCK(trans_inc_lock);
 static int ocfs2_force_read_journal(struct inode *inode);
 static int ocfs2_recover_node(struct ocfs2_super *osb,
-                              int node_num);
+                              int node_num, int slot_num);
 static int __ocfs2_recovery_thread(void *arg);
 static int ocfs2_commit_cache(struct ocfs2_super *osb);
-static int ocfs2_wait_on_mount(struct ocfs2_super *osb);
+static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota);
 static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
                                      int dirty, int replayed);
 static int ocfs2_trylock_journal(struct ocfs2_super *osb,
@@ -64,6 +66,17 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
                                 int slot);
 static int ocfs2_commit_thread(void *arg);
+/* Calls __ocfs2_wait_on_mount() with its quota argument cleared (0). */
+static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
+{
+        const int wait_for_quotas = 0;
+        return __ocfs2_wait_on_mount(osb, wait_for_quotas);
+}
+/* Calls __ocfs2_wait_on_mount() with its quota argument set (1). */
+static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb)
+{
+        const int wait_for_quotas = 1;
+        return __ocfs2_wait_on_mount(osb, wait_for_quotas);
+}
 /*
 * The recovery_list is a simple linked list of node numbers to recover.
@@ -256,11 +269,9 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
        BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE);
        BUG_ON(max_buffs <= 0);
-        /* JBD might support this, but our journalling code doesn't yet. */
+        /* Nested transaction? Just return the handle... */
-        if (journal_current_handle()) {
+        if (journal_current_handle())
-                mlog(ML_ERROR, "Recursive transaction attempted!\n");
+                return jbd2_journal_start(journal, max_buffs);
-                BUG();
-        }
        down_read(&osb->journal->j_trans_barrier);
@@ -285,16 +296,18 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
 int ocfs2_commit_trans(struct ocfs2_super *osb,
                       handle_t *handle)
 {
-        int ret;
+        int ret, nested;
        struct ocfs2_journal *journal = osb->journal;
        BUG_ON(!handle);
+        nested = handle->h_ref > 1;
        ret = jbd2_journal_stop(handle);
        if (ret < 0)
                mlog_errno(ret);
-        up_read(&journal->j_trans_barrier);
+        if (!nested)
+                up_read(&journal->j_trans_barrier);
        return ret;
 }
@@ -357,10 +370,137 @@ bail:
        return status;
 }
-int ocfs2_journal_access(handle_t *handle,
+struct ocfs2_triggers {
-                         struct inode *inode,
+        struct jbd2_buffer_trigger_type ot_triggers;
-                         struct buffer_head *bh,
+        int                             ot_offset;
-                         int type)
+};
+/* Map an embedded ot_triggers member back to its enclosing ocfs2_triggers. */
+static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger_type *triggers)
+{
+        struct ocfs2_triggers *owner;
+        owner = container_of(triggers, struct ocfs2_triggers, ot_triggers);
+        return owner;
+}
+/*
+ * Commit trigger for blocks whose check structure sits at a fixed byte
+ * offset (ot_offset) from the start of the block data.
+ */
+static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+                                 struct buffer_head *bh,
+                                 void *data, size_t size)
+{
+        int check_offset = to_ocfs2_trigger(triggers)->ot_offset;
+        /*
+         * The superblock may not be reachable from here, so the ecc
+         * data is always computed.  __ocfs2_journal_access() only
+         * installs the triggers when metaecc is enabled.
+         */
+        ocfs2_block_check_compute(data, size, data + check_offset);
+}
+/*
+ * Commit trigger for quota blocks.  The struct ocfs2_block_check offset
+ * depends on the blocksize, so the trailer is looked up from the block
+ * size instead of a fixed ot_offset.
+ */
+static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+                                    struct buffer_head *bh,
+                                    void *data, size_t size)
+{
+        struct ocfs2_disk_dqtrailer *trailer;
+        trailer = ocfs2_block_dqtrailer(size, data);
+        /*
+         * The superblock may not be reachable from here, so the ecc
+         * data is always computed.  __ocfs2_journal_access() only
+         * installs the triggers when metaecc is enabled.
+         */
+        ocfs2_block_check_compute(data, size, &trailer->dq_check);
+}
+/*
+ * Commit trigger for directory blocks.  As with quota blocks, the
+ * struct ocfs2_block_check offset depends on the blocksize, so the
+ * trailer is looked up from the block size.
+ */
+static void ocfs2_db_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+                                    struct buffer_head *bh,
+                                    void *data, size_t size)
+{
+        struct ocfs2_dir_block_trailer *dir_trailer;
+        dir_trailer = ocfs2_dir_trailer_from_size(size, data);
+        /*
+         * The superblock may not be reachable from here, so the ecc
+         * data is always computed.  __ocfs2_journal_access() only
+         * installs the triggers when metaecc is enabled.
+         */
+        ocfs2_block_check_compute(data, size, &dir_trailer->db_check);
+}
+/*
+ * Abort trigger: log the affected buffer, then report a filesystem error
+ * on the superblock reached through bh->b_assoc_map->host.
+ */
+static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
+                                struct buffer_head *bh)
+{
+        struct super_block *sb;
+        mlog(ML_ERROR,
+             "ocfs2_abort_trigger called by JBD2.  bh = 0x%lx, "
+             "bh->b_blocknr = %llu\n",
+             (unsigned long)bh,
+             (unsigned long long)bh->b_blocknr);
+        /*
+         * The superblock is not guaranteed to be reachable here; if it
+         * is not, this dereference chain simply crashes.  It is done
+         * only after the message above has been logged.
+         */
+        sb = bh->b_assoc_map->host->i_sb;
+        ocfs2_error(sb,
+                    "JBD2 has aborted our journal, ocfs2 cannot continue\n");
+}
+/*
+ * Per-block-type trigger tables handed to __ocfs2_journal_access().
+ * Tables using ocfs2_commit_trigger record in ot_offset where the check
+ * field lives inside the block's on-disk structure.
+ */
+/* struct ocfs2_dinode */
+static struct ocfs2_triggers di_triggers = {
+        .ot_offset      = offsetof(struct ocfs2_dinode, i_check),
+        .ot_triggers = {
+                .t_abort = ocfs2_abort_trigger,
+                .t_commit = ocfs2_commit_trigger,
+        },
+};
+/* struct ocfs2_extent_block */
+static struct ocfs2_triggers eb_triggers = {
+        .ot_offset      = offsetof(struct ocfs2_extent_block, h_check),
+        .ot_triggers = {
+                .t_abort = ocfs2_abort_trigger,
+                .t_commit = ocfs2_commit_trigger,
+        },
+};
+/* struct ocfs2_group_desc */
+static struct ocfs2_triggers gd_triggers = {
+        .ot_offset      = offsetof(struct ocfs2_group_desc, bg_check),
+        .ot_triggers = {
+                .t_abort = ocfs2_abort_trigger,
+                .t_commit = ocfs2_commit_trigger,
+        },
+};
+/* Directory blocks: no ot_offset, the commit trigger finds the trailer. */
+static struct ocfs2_triggers db_triggers = {
+        .ot_triggers = {
+                .t_abort = ocfs2_abort_trigger,
+                .t_commit = ocfs2_db_commit_trigger,
+        },
+};
+/* struct ocfs2_xattr_block */
+static struct ocfs2_triggers xb_triggers = {
+        .ot_offset      = offsetof(struct ocfs2_xattr_block, xb_check),
+        .ot_triggers = {
+                .t_abort = ocfs2_abort_trigger,
+                .t_commit = ocfs2_commit_trigger,
+        },
+};
+/* Quota blocks: no ot_offset, the commit trigger finds the trailer. */
+static struct ocfs2_triggers dq_triggers = {
+        .ot_triggers = {
+                .t_abort = ocfs2_abort_trigger,
+                .t_commit = ocfs2_dq_commit_trigger,
+        },
+};
+static int __ocfs2_journal_access(handle_t *handle,
+                                  struct inode *inode,
+                                  struct buffer_head *bh,
+                                  struct ocfs2_triggers *triggers,
+                                  int type)
 {
        int status;
@@ -406,6 +546,8 @@ int ocfs2_journal_access(handle_t *handle,
                status = -EINVAL;
                mlog(ML_ERROR, "Uknown access type!\n");
        }
+        if (!status && ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)) && triggers)
+                jbd2_journal_set_triggers(bh, &triggers->ot_triggers);
        mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
        if (status < 0)
@@ -416,6 +558,54 @@ int ocfs2_journal_access(handle_t *handle,
        return status;
 }
+/* Journal access for an inode block; passes di_triggers down. */
+int ocfs2_journal_access_di(handle_t *handle, struct inode *inode,
+                            struct buffer_head *bh, int type)
+{
+        struct ocfs2_triggers *ecc_triggers = &di_triggers;
+        return __ocfs2_journal_access(handle, inode, bh, ecc_triggers, type);
+}
+/* Journal access for an extent block; passes eb_triggers down. */
+int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode,
+                            struct buffer_head *bh, int type)
+{
+        struct ocfs2_triggers *ecc_triggers = &eb_triggers;
+        return __ocfs2_journal_access(handle, inode, bh, ecc_triggers, type);
+}
+/* Journal access for a group descriptor; passes gd_triggers down. */
+int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode,
+                            struct buffer_head *bh, int type)
+{
+        struct ocfs2_triggers *ecc_triggers = &gd_triggers;
+        return __ocfs2_journal_access(handle, inode, bh, ecc_triggers, type);
+}
+/* Journal access for a directory block; passes db_triggers down. */
+int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
+                            struct buffer_head *bh, int type)
+{
+        struct ocfs2_triggers *ecc_triggers = &db_triggers;
+        return __ocfs2_journal_access(handle, inode, bh, ecc_triggers, type);
+}
+/* Journal access for an xattr block; passes xb_triggers down. */
+int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode,
+                            struct buffer_head *bh, int type)
+{
+        struct ocfs2_triggers *ecc_triggers = &xb_triggers;
+        return __ocfs2_journal_access(handle, inode, bh, ecc_triggers, type);
+}
+/* Journal access for a quota block; passes dq_triggers down. */
+int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
+                            struct buffer_head *bh, int type)
+{
+        struct ocfs2_triggers *ecc_triggers = &dq_triggers;
+        return __ocfs2_journal_access(handle, inode, bh, ecc_triggers, type);
+}
+/*
+ * Journal access with no trigger table: a NULL triggers pointer is passed
+ * to __ocfs2_journal_access(), so no commit triggers are installed on bh.
+ */
+int ocfs2_journal_access(handle_t *handle, struct inode *inode,
+                         struct buffer_head *bh, int type)
+{
+        struct ocfs2_triggers *no_triggers = NULL;
+        return __ocfs2_journal_access(handle, inode, bh, no_triggers, type);
+}
 int ocfs2_journal_dirty(handle_t *handle,
                        struct buffer_head *bh)
 {
@@ -434,20 +624,6 @@ int ocfs2_journal_dirty(handle_t *handle,
        return status;
 }
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-int ocfs2_journal_dirty_data(handle_t *handle,
-                             struct buffer_head *bh)
-{
-        int err = journal_dirty_data(handle, bh);
-        if (err)
-                mlog_errno(err);
-        /* TODO: When we can handle it, abort the handle and go RO on
-         * error here. */
-        return err;
-}
-#endif
 #define OCFS2_DEFAULT_COMMIT_INTERVAL   (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
 void ocfs2_set_journal_params(struct ocfs2_super *osb)
@@ -587,17 +763,11 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
        mlog_entry_void();
        fe = (struct ocfs2_dinode *)bh->b_data;
-        if (!OCFS2_IS_VALID_DINODE(fe)) {
-                /* This is called from startup/shutdown which will
+        /* The journal bh on the osb always comes from ocfs2_journal_init()
-                 * handle the errors in a specific manner, so no need
+         * and was validated there inside ocfs2_inode_lock_full().  It's a
-                 * to call ocfs2_error() here. */
+         * code bug if we mess it up. */
-                mlog(ML_ERROR, "Journal dinode %llu  has invalid "
+        BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
-                     "signature: %.*s",
-                     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
-                     fe->i_signature);
-                status = -EIO;
-                goto out;
-        }
        flags = le32_to_cpu(fe->id1.journal1.ij_flags);
        if (dirty)
@@ -609,11 +779,11 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
        if (replayed)
                ocfs2_bump_recovery_generation(fe);
+        ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
        status = ocfs2_write_block(osb, bh, journal->j_inode);
        if (status < 0)
                mlog_errno(status);
-out:
        mlog_exit(status);
        return status;
 }
@@ -878,6 +1048,7 @@ struct ocfs2_la_recovery_item {
        int                     lri_slot;
        struct ocfs2_dinode     *lri_la_dinode;
        struct ocfs2_dinode     *lri_tl_dinode;
+        struct ocfs2_quota_recovery *lri_qrec;
 };
 /* Does the second half of the recovery process. By this point, the
@@ -898,6 +1069,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
        struct ocfs2_super *osb = journal->j_osb;
        struct ocfs2_dinode *la_dinode, *tl_dinode;
        struct ocfs2_la_recovery_item *item, *n;
+        struct ocfs2_quota_recovery *qrec;
        LIST_HEAD(tmp_la_list);
        mlog_entry_void();
@@ -913,6 +1085,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
                mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
+                ocfs2_wait_on_quotas(osb);
                la_dinode = item->lri_la_dinode;
                if (la_dinode) {
                        mlog(0, "Clean up local alloc %llu\n",
@@ -943,6 +1117,16 @@ void ocfs2_complete_recovery(struct work_struct *work)
                if (ret < 0)
                        mlog_errno(ret);
+                qrec = item->lri_qrec;
+                if (qrec) {
+                        mlog(0, "Recovering quota files");
+                        ret = ocfs2_finish_quota_recovery(osb, qrec,
+                                                          item->lri_slot);
+                        if (ret < 0)
+                                mlog_errno(ret);
+                        /* Recovery info is already freed now */
+                }
                kfree(item);
        }
@@ -956,7 +1140,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
 static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
                                            int slot_num,
                                            struct ocfs2_dinode *la_dinode,
-                                            struct ocfs2_dinode *tl_dinode)
+                                            struct ocfs2_dinode *tl_dinode,
+                                            struct ocfs2_quota_recovery *qrec)
 {
        struct ocfs2_la_recovery_item *item;
@@ -971,6 +1156,9 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
                if (tl_dinode)
                        kfree(tl_dinode);
+                if (qrec)
+                        ocfs2_free_quota_recovery(qrec);
                mlog_errno(-ENOMEM);
                return;
        }
@@ -979,6 +1167,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
        item->lri_la_dinode = la_dinode;
        item->lri_slot = slot_num;
        item->lri_tl_dinode = tl_dinode;
+        item->lri_qrec = qrec;
        spin_lock(&journal->j_lock);
        list_add_tail(&item->lri_list, &journal->j_la_cleanups);
@@ -998,6 +1187,7 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
                ocfs2_queue_recovery_completion(journal,
                                                osb->slot_num,
                                                osb->local_alloc_copy,
+                                                NULL,
                                                NULL);
                ocfs2_schedule_truncate_log_flush(osb, 0);
@@ -1006,11 +1196,26 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
        }
 }
+/*
+ * If osb->quota_rec is set, queue it for recovery completion on our own
+ * slot and clear the pointer on the osb.  Does nothing otherwise.
+ */
+void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
+{
+        if (!osb->quota_rec)
+                return;
+        ocfs2_queue_recovery_completion(osb->journal, osb->slot_num,
+                                        NULL, NULL, osb->quota_rec);
+        osb->quota_rec = NULL;
+}
 static int __ocfs2_recovery_thread(void *arg)
 {
-        int status, node_num;
+        int status, node_num, slot_num;
        struct ocfs2_super *osb = arg;
        struct ocfs2_recovery_map *rm = osb->recovery_map;
+        int *rm_quota = NULL;
+        int rm_quota_used = 0, i;
+        struct ocfs2_quota_recovery *qrec;
        mlog_entry_void();
@@ -1019,6 +1224,11 @@ static int __ocfs2_recovery_thread(void *arg)
                goto bail;
        }
+        rm_quota = kzalloc(osb->max_slots * sizeof(int), GFP_NOFS);
+        if (!rm_quota) {
+                status = -ENOMEM;
+                goto bail;
+        }
 restart:
        status = ocfs2_super_lock(osb, 1);
        if (status < 0) {
@@ -1032,8 +1242,28 @@ restart:
                 * clear it until ocfs2_recover_node() has succeeded. */
                node_num = rm->rm_entries[0];
                spin_unlock(&osb->osb_lock);
+                mlog(0, "checking node %d\n", node_num);
-                status = ocfs2_recover_node(osb, node_num);
+                slot_num = ocfs2_node_num_to_slot(osb, node_num);
+                if (slot_num == -ENOENT) {
+                        status = 0;
+                        mlog(0, "no slot for this node, so no recovery"
+                             "required.\n");
+                        goto skip_recovery;
+                }
+                mlog(0, "node %d was using slot %d\n", node_num, slot_num);
+                /* It is a bit subtle with quota recovery. We cannot do it
+                 * immediately because we have to obtain cluster locks from
+                 * quota files and we also don't want to just skip it because
+                 * then quota usage would be out of sync until some node takes
+                 * the slot. So we remember which nodes need quota recovery
+                 * and when everything else is done, we recover quotas. */
+                for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++);
+                if (i == rm_quota_used)
+                        rm_quota[rm_quota_used++] = slot_num;
+                status = ocfs2_recover_node(osb, node_num, slot_num);
+skip_recovery:
                if (!status) {
                        ocfs2_recovery_map_clear(osb, node_num);
                } else {
@@ -1055,13 +1285,27 @@ restart:
        if (status < 0)
                mlog_errno(status);
+        /* Now it is right time to recover quotas... We have to do this under
+         * superblock lock so that no one can start using the slot (and crash)
+         * before we recover it */
+        for (i = 0; i < rm_quota_used; i++) {
+                qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]);
+                if (IS_ERR(qrec)) {
+                        status = PTR_ERR(qrec);
+                        mlog_errno(status);
+                        continue;
+                }
+                ocfs2_queue_recovery_completion(osb->journal, rm_quota[i],
+                                                NULL, NULL, qrec);
+        }
        ocfs2_super_unlock(osb, 1);
        /* We always run recovery on our own orphan dir - the dead
         * node(s) may have disallowd a previos inode delete. Re-processing
         * is therefore required. */
        ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
-                                        NULL);
+                                        NULL, NULL);
 bail:
        mutex_lock(&osb->recovery_lock);
@@ -1076,6 +1320,9 @@ bail:
        mutex_unlock(&osb->recovery_lock);
+        if (rm_quota)
+                kfree(rm_quota);
        mlog_exit(status);
        /* no one is callint kthread_stop() for us so the kthread() api
         * requires that we call do_exit().  And it isn't exported, but
@@ -1135,8 +1382,7 @@ static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
        }
        SET_INODE_JOURNAL(inode);
-        status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, bh,
+        status = ocfs2_read_inode_block_full(inode, bh, OCFS2_BH_IGNORE_CACHE);
-                                   OCFS2_BH_IGNORE_CACHE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -1268,6 +1514,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
        osb->slot_recovery_generations[slot_num] =
                                        ocfs2_get_recovery_generation(fe);
+        ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
        status = ocfs2_write_block(osb, bh, inode);
        if (status < 0)
                mlog_errno(status);
@@ -1304,31 +1551,19 @@ done:
 * far less concerning.
 */
 static int ocfs2_recover_node(struct ocfs2_super *osb,
-                              int node_num)
+                              int node_num, int slot_num)
 {
        int status = 0;
-        int slot_num;
        struct ocfs2_dinode *la_copy = NULL;
        struct ocfs2_dinode *tl_copy = NULL;
-        mlog_entry("(node_num=%d, osb->node_num = %d)\n",
+        mlog_entry("(node_num=%d, slot_num=%d, osb->node_num = %d)\n",
-                   node_num, osb->node_num);
+                   node_num, slot_num, osb->node_num);
-        mlog(0, "checking node %d\n", node_num);
        /* Should not ever be called to recover ourselves -- in that
         * case we should've called ocfs2_journal_load instead. */
        BUG_ON(osb->node_num == node_num);
-        slot_num = ocfs2_node_num_to_slot(osb, node_num);
-        if (slot_num == -ENOENT) {
-                status = 0;
-                mlog(0, "no slot for this node, so no recovery required.\n");
-                goto done;
-        }
-        mlog(0, "node %d was using slot %d\n", node_num, slot_num);
        status = ocfs2_replay_journal(osb, node_num, slot_num);
        if (status < 0) {
                if (status == -EBUSY) {
@@ -1364,7 +1599,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
        /* This will kfree the memory pointed to by la_copy and tl_copy */
        ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
-                                        tl_copy);
+                                        tl_copy, NULL);
        status = 0;
 done:
@@ -1659,13 +1894,14 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
        return ret;
 }
-static int ocfs2_wait_on_mount(struct ocfs2_super *osb)
+static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota)
 {
        /* This check is good because ocfs2 will wait on our recovery
         * thread before changing it to something other than MOUNTED
         * or DISABLED. */
        wait_event(osb->osb_mount_event,
-                   atomic_read(&osb->vol_state) == VOLUME_MOUNTED ||
+                  (!quota && atomic_read(&osb->vol_state) == VOLUME_MOUNTED) ||
+                   atomic_read(&osb->vol_state) == VOLUME_MOUNTED_QUOTAS ||
                   atomic_read(&osb->vol_state) == VOLUME_DISABLED);
        /* If there's an error on mount, then we may never get to the
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index d4d14e9a3cea..3c3532e1307c 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -27,12 +27,7 @@
 #define OCFS2_JOURNAL_H
 #include <linux/fs.h>
-#ifndef CONFIG_OCFS2_COMPAT_JBD
+#include <linux/jbd2.h>
-# include <linux/jbd2.h>
-#else
-# include <linux/jbd.h>
-# include "ocfs2_jbd_compat.h"
-#endif
 enum ocfs2_journal_state {
        OCFS2_JOURNAL_FREE = 0,
@@ -173,6 +168,7 @@ void   ocfs2_recovery_thread(struct ocfs2_super *osb,
                             int node_num);
 int    ocfs2_mark_dead_nodes(struct ocfs2_super *osb);
 void   ocfs2_complete_mount_recovery(struct ocfs2_super *osb);
+void ocfs2_complete_quota_recovery(struct ocfs2_super *osb);
 static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
 {
@@ -216,9 +212,12 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
 *  ocfs2_extend_trans     - Extend a handle by nblocks credits. This may
 *                          commit the handle to disk in the process, but will
 *                          not release any locks taken during the transaction.
- *  ocfs2_journal_access   - Notify the handle that we want to journal this
+ *  ocfs2_journal_access* - Notify the handle that we want to journal this
 *                          buffer. Will have to call ocfs2_journal_dirty once
 *                          we've actually dirtied it. Type is one of . or .
+ *                          Always call the specific flavor of
+ *                          ocfs2_journal_access_*() unless you intend to
+ *                          manage the checksum by hand.
 *  ocfs2_journal_dirty    - Mark a journalled buffer as having dirty data.
 *  ocfs2_jbd2_file_inode  - Mark an inode so that its data goes out before
 *                           the current handle commits.
@@ -248,10 +247,29 @@ int			     ocfs2_extend_trans(handle_t *handle, int nblocks);
 #define OCFS2_JOURNAL_ACCESS_WRITE  1
 #define OCFS2_JOURNAL_ACCESS_UNDO   2
-int                  ocfs2_journal_access(handle_t *handle,
-                                          struct inode *inode,
+/* ocfs2_inode */
-                                          struct buffer_head *bh,
+int ocfs2_journal_access_di(handle_t *handle, struct inode *inode,
-                                          int type);
+                            struct buffer_head *bh, int type);
+/* ocfs2_extent_block */
+int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode,
+                            struct buffer_head *bh, int type);
+/* ocfs2_group_desc */
+int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode,
+                            struct buffer_head *bh, int type);
+/* ocfs2_xattr_block */
+int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode,
+                            struct buffer_head *bh, int type);
+/* quota blocks */
+int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
+                            struct buffer_head *bh, int type);
+/* dirblock */
+int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
+                            struct buffer_head *bh, int type);
+/* Anything that has no ecc */
+int ocfs2_journal_access(handle_t *handle, struct inode *inode,
+                         struct buffer_head *bh, int type);
 /*
 * A word about the journal_access/journal_dirty "dance". It is
 * entirely legal to journal_access a buffer more than once (as long
@@ -273,10 +291,6 @@ int                  ocfs2_journal_access(handle_t *handle,
 */
 int                  ocfs2_journal_dirty(handle_t *handle,
                                         struct buffer_head *bh);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-int                  ocfs2_journal_dirty_data(handle_t *handle,
-                                              struct buffer_head *bh);
-#endif
 /*
 *  Credit Macros:
@@ -293,6 +307,37 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
 /* extended attribute block update */
 #define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1
+/* global quotafile inode update, data block */
+#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+/*
+ * The two writes below can accidentally see global info dirty due
+ * to set_info() quotactl so make them prepared for the writes.
+ */
+/* quota data block, global info */
+/* Write to local quota file */
+#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + 1)
+/* global quota data block, local quota data block, global quota inode,
+ * global quota info */
+#define OCFS2_QSYNC_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 3)
+/*
+ * Quota credits for a transaction: OCFS2_QWRITE_CREDITS for each of the
+ * user and group quota RO-compat features set on sb, 0 if neither is set.
+ */
+static inline int ocfs2_quota_trans_credits(struct super_block *sb)
+{
+        int usr_credits = 0, grp_credits = 0;
+        if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA))
+                usr_credits = OCFS2_QWRITE_CREDITS;
+        if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA))
+                grp_credits = OCFS2_QWRITE_CREDITS;
+        return usr_credits + grp_credits;
+}
+/* Number of credits needed for removing quota structure from file */
+int ocfs2_calc_qdel_credits(struct super_block *sb, int type);
+/* Number of credits needed for initialization of new quota structure */
+int ocfs2_calc_qinit_credits(struct super_block *sb, int type);
 /* group extend. inode update and last group update. */
 #define OCFS2_GROUP_EXTEND_CREDITS      (OCFS2_INODE_UPDATE_CREDITS + 1)
@@ -303,8 +348,11 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
 * prev. group desc. if we relink. */
 #define OCFS2_SUBALLOC_ALLOC (3)
-#define OCFS2_INLINE_TO_EXTENTS_CREDITS (OCFS2_SUBALLOC_ALLOC           \
+static inline int ocfs2_inline_to_extents_credits(struct super_block *sb)
-                                         + OCFS2_INODE_UPDATE_CREDITS)
+{
+        return OCFS2_SUBALLOC_ALLOC + OCFS2_INODE_UPDATE_CREDITS +
+               ocfs2_quota_trans_credits(sb);
+}
 /* dinode + group descriptor update. We don't relink on free yet. */
 #define OCFS2_SUBALLOC_FREE  (2)
@@ -313,16 +361,23 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
 #define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE                 \
                                         + OCFS2_TRUNCATE_LOG_UPDATE)
-#define OCFS2_REMOVE_EXTENT_CREDITS (OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS)
+/* Truncate log update + inode update + quota credits for this sb. */
+static inline int ocfs2_remove_extent_credits(struct super_block *sb)
+{
+        int credits = OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS;
+        credits += ocfs2_quota_trans_credits(sb);
+        return credits;
+}
 /* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
 * bitmap block for the new bit) */
 #define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
 /* parent fe, parent block, new file entry, inode alloc fe, inode alloc
- * group descriptor + mkdir/symlink blocks */
+ * group descriptor + mkdir/symlink blocks + quota update */
-#define OCFS2_MKNOD_CREDITS (3 + OCFS2_SUBALLOC_ALLOC                         \
+static inline int ocfs2_mknod_credits(struct super_block *sb)
-                            + OCFS2_DIR_LINK_ADDITIONAL_CREDITS)
+{
+        return 3 + OCFS2_SUBALLOC_ALLOC + OCFS2_DIR_LINK_ADDITIONAL_CREDITS +
+               ocfs2_quota_trans_credits(sb);
+}
 /* local alloc metadata change + main bitmap updates */
 #define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS                 \
@@ -332,13 +387,21 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
 * for the dinode, one for the new block. */
 #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
-/* file update (nlink, etc) + directory mtime/ctime + dir entry block */
+/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
-#define OCFS2_LINK_CREDITS  (2*OCFS2_INODE_UPDATE_CREDITS + 1)
+ * update on dir */
+static inline int ocfs2_link_credits(struct super_block *sb)
+{
+        return 2*OCFS2_INODE_UPDATE_CREDITS + 1 +
+               ocfs2_quota_trans_credits(sb);
+}
 /* inode + dir inode (if we unlink a dir), + dir entry block + orphan
 * dir inode link */
-#define OCFS2_UNLINK_CREDITS  (2 * OCFS2_INODE_UPDATE_CREDITS + 1             \
+static inline int ocfs2_unlink_credits(struct super_block *sb)
-                              + OCFS2_LINK_CREDITS)
+{
+        /* The quota update counted by ocfs2_link_credits() is unused here. */
+        return 2 * OCFS2_INODE_UPDATE_CREDITS + 1 + ocfs2_link_credits(sb);
+}
 /* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
 * inode alloc group descriptor */
@@ -347,8 +410,10 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
 /* dinode update, old dir dinode update, new dir dinode update, old
 * dir dir entry, new dir dir entry, dir entry update for renaming
 * directory + target unlink */
-#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3              \
+static inline int ocfs2_rename_credits(struct super_block *sb)
-                             + OCFS2_UNLINK_CREDITS)
+{
+        return 3 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_unlink_credits(sb);
+}
 /* global bitmap dinode, group desc., relinked group,
 * suballocator dinode, group desc., relinked group,
@@ -386,18 +451,19 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
         * credit for the dinode there. */
        extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
-        return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks;
+        return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks +
+               ocfs2_quota_trans_credits(sb);
 }
 static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
 {
-        int blocks = OCFS2_MKNOD_CREDITS;
+        int blocks = ocfs2_mknod_credits(sb);
        /* links can be longer than one block so we may update many
         * within our single allocated extent. */
        blocks += ocfs2_clusters_to_blocks(sb, 1);
-        return blocks;
+        return blocks + ocfs2_quota_trans_credits(sb);
 }
 static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
@@ -434,6 +500,8 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
        /* update to the truncate log. */
        credits += OCFS2_TRUNCATE_LOG_UPDATE;
+        credits += ocfs2_quota_trans_credits(sb);
        return credits;
 }
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 687b28713c32..ec70cdbe77fc 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -36,6 +36,7 @@
 #include "ocfs2.h"
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "inode.h"
 #include "journal.h"
@@ -248,8 +249,8 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
                goto bail;
        }
-        status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
+        status = ocfs2_read_inode_block_full(inode, &alloc_bh,
-                                   &alloc_bh, OCFS2_BH_IGNORE_CACHE);
+                                             OCFS2_BH_IGNORE_CACHE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -382,8 +383,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
        }
        memcpy(alloc_copy, alloc, bh->b_size);
-        status = ocfs2_journal_access(handle, local_alloc_inode, bh,
+        status = ocfs2_journal_access_di(handle, local_alloc_inode, bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto out_commit;
@@ -459,8 +460,8 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
        mutex_lock(&inode->i_mutex);
-        status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
+        status = ocfs2_read_inode_block_full(inode, &alloc_bh,
-                                   &alloc_bh, OCFS2_BH_IGNORE_CACHE);
+                                             OCFS2_BH_IGNORE_CACHE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -476,6 +477,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
        alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
        ocfs2_clear_local_alloc(alloc);
+        ocfs2_compute_meta_ecc(osb->sb, alloc_bh->b_data, &alloc->i_check);
        status = ocfs2_write_block(osb, alloc_bh, inode);
        if (status < 0)
                mlog_errno(status);
@@ -762,9 +764,9 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
         * delete bits from it! */
        *num_bits = bits_wanted;
-        status = ocfs2_journal_access(handle, local_alloc_inode,
+        status = ocfs2_journal_access_di(handle, local_alloc_inode,
-                                      osb->local_alloc_bh,
+                                         osb->local_alloc_bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -1240,9 +1242,9 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
        }
        memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
-        status = ocfs2_journal_access(handle, local_alloc_inode,
+        status = ocfs2_journal_access_di(handle, local_alloc_inode,
-                                      osb->local_alloc_bh,
+                                         osb->local_alloc_bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 2545e7402efe..084aba86c3b2 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -40,6 +40,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
+#include <linux/quotaops.h>
 #define MLOG_MASK_PREFIX ML_NAMEI
 #include <cluster/masklog.h>
@@ -61,17 +62,18 @@
 #include "sysfile.h"
 #include "uptodate.h"
 #include "xattr.h"
+#include "acl.h"
 #include "buffer_head_io.h"
 static int ocfs2_mknod_locked(struct ocfs2_super *osb,
                              struct inode *dir,
-                              struct dentry *dentry, int mode,
+                              struct inode *inode,
+                              struct dentry *dentry,
                              dev_t dev,
                              struct buffer_head **new_fe_bh,
                              struct buffer_head *parent_fe_bh,
                              handle_t *handle,
-                              struct inode **ret_inode,
                              struct ocfs2_alloc_context *inode_ac);
 static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
@@ -186,6 +188,35 @@ bail:
        return ret;
 }
+/*
+ * Allocate a new in-memory inode on dir's superblock and fill in its
+ * link count, owner uid/gid and mode, then call vfs_dq_init() on it.
+ *
+ * Returns the new inode, or NULL if new_inode() fails (the failure is
+ * logged here).
+ */
+static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
+{
+        struct inode *inode;
+        inode = new_inode(dir->i_sb);
+        if (!inode) {
+                mlog(ML_ERROR, "new_inode failed!\n");
+                return NULL;
+        }
+        /* populate as many fields early on as possible - many of
+         * these are used by the support functions here and in
+         * callers. */
+        if (S_ISDIR(mode))
+                inode->i_nlink = 2;
+        else
+                inode->i_nlink = 1;
+        inode->i_uid = current_fsuid();
+        /* A setgid parent directory hands its group to the new inode; a
+         * new subdirectory additionally gets S_ISGID set in its mode. */
+        if (dir->i_mode & S_ISGID) {
+                inode->i_gid = dir->i_gid;
+                if (S_ISDIR(mode))
+                        mode |= S_ISGID;
+        } else
+                inode->i_gid = current_fsgid();
+        inode->i_mode = mode;
+        vfs_dq_init(inode);
+        return inode;
+}
 static int ocfs2_mknod(struct inode *dir,
                       struct dentry *dentry,
                       int mode,
@@ -201,6 +232,13 @@ static int ocfs2_mknod(struct inode *dir,
        struct inode *inode = NULL;
        struct ocfs2_alloc_context *inode_ac = NULL;
        struct ocfs2_alloc_context *data_ac = NULL;
+        struct ocfs2_alloc_context *xattr_ac = NULL;
+        int want_clusters = 0;
+        int xattr_credits = 0;
+        struct ocfs2_security_xattr_info si = {
+                .enable = 1,
+        };
+        int did_quota_inode = 0;
        mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
                   (unsigned long)dev, dentry->d_name.len,
@@ -250,17 +288,46 @@ static int ocfs2_mknod(struct inode *dir,
                goto leave;
        }
-        /* Reserve a cluster if creating an extent based directory. */
+        inode = ocfs2_get_init_inode(dir, mode);
-        if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) {
+        if (!inode) {
-                status = ocfs2_reserve_clusters(osb, 1, &data_ac);
+                status = -ENOMEM;
-                if (status < 0) {
+                mlog_errno(status);
-                        if (status != -ENOSPC)
+                goto leave;
-                                mlog_errno(status);
+        }
+        /* get security xattr */
+        status = ocfs2_init_security_get(inode, dir, &si);
+        if (status) {
+                if (status == -EOPNOTSUPP)
+                        si.enable = 0;
+                else {
+                        mlog_errno(status);
                        goto leave;
                }
        }
-        handle = ocfs2_start_trans(osb, OCFS2_MKNOD_CREDITS);
+        /* calculate meta data/clusters for setting security and acl xattr */
+        status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode,
+                                        &si, &want_clusters,
+                                        &xattr_credits, &xattr_ac);
+        if (status < 0) {
+                mlog_errno(status);
+                goto leave;
+        }
+        /* Reserve a cluster if creating an extent based directory. */
+        if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb))
+                want_clusters += 1;
+        status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
+        if (status < 0) {
+                if (status != -ENOSPC)
+                        mlog_errno(status);
+                goto leave;
+        }
+        handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb) +
+                                   xattr_credits);
        if (IS_ERR(handle)) {
                status = PTR_ERR(handle);
                handle = NULL;
@@ -268,10 +335,19 @@ static int ocfs2_mknod(struct inode *dir,
                goto leave;
        }
+        /* We don't use the standard VFS wrapper because we don't want
+         * vfs_dq_init() to be called. */
+        if (sb_any_quota_active(osb->sb) &&
+            osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
+                status = -EDQUOT;
+                goto leave;
+        }
+        did_quota_inode = 1;
        /* do the real work now. */
-        status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev,
+        status = ocfs2_mknod_locked(osb, dir, inode, dentry, dev,
                                    &new_fe_bh, parent_fe_bh, handle,
-                                    &inode, inode_ac);
+                                    inode_ac);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -285,8 +361,8 @@ static int ocfs2_mknod(struct inode *dir,
                        goto leave;
                }
-                status = ocfs2_journal_access(handle, dir, parent_fe_bh,
+                status = ocfs2_journal_access_di(handle, dir, parent_fe_bh,
-                                              OCFS2_JOURNAL_ACCESS_WRITE);
+                                                 OCFS2_JOURNAL_ACCESS_WRITE);
                if (status < 0) {
                        mlog_errno(status);
                        goto leave;
@@ -300,6 +376,22 @@ static int ocfs2_mknod(struct inode *dir,
                inc_nlink(dir);
        }
+        status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
+                                xattr_ac, data_ac);
+        if (status < 0) {
+                mlog_errno(status);
+                goto leave;
+        }
+        if (si.enable) {
+                status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
+                                                 xattr_ac, data_ac);
+                if (status < 0) {
+                        mlog_errno(status);
+                        goto leave;
+                }
+        }
        status = ocfs2_add_entry(handle, dentry, inode,
                                 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
                                 de_bh);
@@ -320,6 +412,8 @@ static int ocfs2_mknod(struct inode *dir,
        d_instantiate(dentry, inode);
        status = 0;
 leave:
+        if (status < 0 && did_quota_inode)
+                vfs_dq_free_inode(inode);
        if (handle)
                ocfs2_commit_trans(osb, handle);
@@ -331,9 +425,13 @@ leave:
        brelse(new_fe_bh);
        brelse(de_bh);
        brelse(parent_fe_bh);
+        kfree(si.name);
+        kfree(si.value);
-        if ((status < 0) && inode)
+        if ((status < 0) && inode) {
+                clear_nlink(inode);
                iput(inode);
+        }
        if (inode_ac)
                ocfs2_free_alloc_context(inode_ac);
@@ -341,6 +439,9 @@ leave:
        if (data_ac)
                ocfs2_free_alloc_context(data_ac);
+        if (xattr_ac)
+                ocfs2_free_alloc_context(xattr_ac);
        mlog_exit(status);
        return status;
@@ -348,12 +449,12 @@ leave:
 static int ocfs2_mknod_locked(struct ocfs2_super *osb,
                              struct inode *dir,
-                              struct dentry *dentry, int mode,
+                              struct inode *inode,
+                              struct dentry *dentry,
                              dev_t dev,
                              struct buffer_head **new_fe_bh,
                              struct buffer_head *parent_fe_bh,
                              handle_t *handle,
-                              struct inode **ret_inode,
                              struct ocfs2_alloc_context *inode_ac)
 {
        int status = 0;
@@ -361,14 +462,12 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
        struct ocfs2_extent_list *fel;
        u64 fe_blkno = 0;
        u16 suballoc_bit;
-        struct inode *inode = NULL;
-        mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
+        mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
-                   (unsigned long)dev, dentry->d_name.len,
+                   inode->i_mode, (unsigned long)dev, dentry->d_name.len,
                   dentry->d_name.name);
        *new_fe_bh = NULL;
-        *ret_inode = NULL;
        status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit,
                                       &fe_blkno);
@@ -377,23 +476,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
                goto leave;
        }
-        inode = new_inode(dir->i_sb);
-        if (!inode) {
-                status = -ENOMEM;
-                mlog(ML_ERROR, "new_inode failed!\n");
-                goto leave;
-        }
        /* populate as many fields early on as possible - many of
         * these are used by the support functions here and in
         * callers. */
        inode->i_ino = ino_from_blkno(osb->sb, fe_blkno);
        OCFS2_I(inode)->ip_blkno = fe_blkno;
-        if (S_ISDIR(mode))
-                inode->i_nlink = 2;
-        else
-                inode->i_nlink = 1;
-        inode->i_mode = mode;
        spin_lock(&osb->osb_lock);
        inode->i_generation = osb->s_next_generation++;
        spin_unlock(&osb->osb_lock);
@@ -406,8 +493,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
        }
        ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh);
-        status = ocfs2_journal_access(handle, inode, *new_fe_bh,
+        status = ocfs2_journal_access_di(handle, inode, *new_fe_bh,
-                                      OCFS2_JOURNAL_ACCESS_CREATE);
+                                         OCFS2_JOURNAL_ACCESS_CREATE);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -421,17 +508,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
        fe->i_blkno = cpu_to_le64(fe_blkno);
        fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
        fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
-        fe->i_uid = cpu_to_le32(current_fsuid());
+        fe->i_uid = cpu_to_le32(inode->i_uid);
-        if (dir->i_mode & S_ISGID) {
+        fe->i_gid = cpu_to_le32(inode->i_gid);
-                fe->i_gid = cpu_to_le32(dir->i_gid);
+        fe->i_mode = cpu_to_le16(inode->i_mode);
-                if (S_ISDIR(mode))
+        if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
-                        mode |= S_ISGID;
-        } else
-                fe->i_gid = cpu_to_le32(current_fsgid());
-        fe->i_mode = cpu_to_le16(mode);
-        if (S_ISCHR(mode) || S_ISBLK(mode))
                fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
        fe->i_links_count = cpu_to_le16(inode->i_nlink);
        fe->i_last_eb_blk = 0;
@@ -446,7 +527,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
        /*
         * If supported, directories start with inline data.
         */
-        if (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) {
+        if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) {
                u16 feat = le16_to_cpu(fe->i_dyn_features);
                fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
@@ -465,15 +546,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
                goto leave;
        }
-        if (ocfs2_populate_inode(inode, fe, 1) < 0) {
+        ocfs2_populate_inode(inode, fe, 1);
-                mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, "
-                     "i_blkno=%llu, i_ino=%lu\n",
-                     (unsigned long long)(*new_fe_bh)->b_blocknr,
-                     (unsigned long long)le64_to_cpu(fe->i_blkno),
-                     inode->i_ino);
-                BUG();
-        }
        ocfs2_inode_set_new(osb, inode);
        if (!ocfs2_mount_local(osb)) {
                status = ocfs2_create_new_inode_locks(inode);
@@ -484,17 +557,12 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
        status = 0; /* error in ocfs2_create_new_inode_locks is not
                     * critical */
-        *ret_inode = inode;
 leave:
        if (status < 0) {
                if (*new_fe_bh) {
                        brelse(*new_fe_bh);
                        *new_fe_bh = NULL;
                }
-                if (inode) {
-                        clear_nlink(inode);
-                        iput(inode);
-                }
        }
        mlog_exit(status);
@@ -588,7 +656,7 @@ static int ocfs2_link(struct dentry *old_dentry,
                goto out_unlock_inode;
        }
-        handle = ocfs2_start_trans(osb, OCFS2_LINK_CREDITS);
+        handle = ocfs2_start_trans(osb, ocfs2_link_credits(osb->sb));
        if (IS_ERR(handle)) {
                err = PTR_ERR(handle);
                handle = NULL;
@@ -596,8 +664,8 @@ static int ocfs2_link(struct dentry *old_dentry,
                goto out_unlock_inode;
        }
-        err = ocfs2_journal_access(handle, inode, fe_bh,
+        err = ocfs2_journal_access_di(handle, inode, fe_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (err < 0) {
                mlog_errno(err);
                goto out_commit;
@@ -775,7 +843,7 @@ static int ocfs2_unlink(struct inode *dir,
                }
        }
-        handle = ocfs2_start_trans(osb, OCFS2_UNLINK_CREDITS);
+        handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb));
        if (IS_ERR(handle)) {
                status = PTR_ERR(handle);
                handle = NULL;
@@ -783,8 +851,8 @@ static int ocfs2_unlink(struct inode *dir,
                goto leave;
        }
-        status = ocfs2_journal_access(handle, inode, fe_bh,
+        status = ocfs2_journal_access_di(handle, inode, fe_bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -1181,7 +1249,7 @@ static int ocfs2_rename(struct inode *old_dir,
                }
        }
-        handle = ocfs2_start_trans(osb, OCFS2_RENAME_CREDITS);
+        handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb));
        if (IS_ERR(handle)) {
                status = PTR_ERR(handle);
                handle = NULL;
@@ -1197,8 +1265,8 @@ static int ocfs2_rename(struct inode *old_dir,
                                goto bail;
                        }
                }
-                status = ocfs2_journal_access(handle, new_inode, newfe_bh,
+                status = ocfs2_journal_access_di(handle, new_inode, newfe_bh,
-                                              OCFS2_JOURNAL_ACCESS_WRITE);
+                                                 OCFS2_JOURNAL_ACCESS_WRITE);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
@@ -1244,8 +1312,8 @@ static int ocfs2_rename(struct inode *old_dir,
        old_inode->i_ctime = CURRENT_TIME;
        mark_inode_dirty(old_inode);
-        status = ocfs2_journal_access(handle, old_inode, old_inode_bh,
+        status = ocfs2_journal_access_di(handle, old_inode, old_inode_bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status >= 0) {
                old_di = (struct ocfs2_dinode *) old_inode_bh->b_data;
@@ -1321,9 +1389,9 @@ static int ocfs2_rename(struct inode *old_dir,
                             (int)old_dir_nlink, old_dir->i_nlink);
                } else {
                        struct ocfs2_dinode *fe;
-                        status = ocfs2_journal_access(handle, old_dir,
+                        status = ocfs2_journal_access_di(handle, old_dir,
-                                                      old_dir_bh,
+                                                         old_dir_bh,
-                                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                                         OCFS2_JOURNAL_ACCESS_WRITE);
                        fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
                        fe->i_links_count = cpu_to_le16(old_dir->i_nlink);
                        status = ocfs2_journal_dirty(handle, old_dir_bh);
@@ -1496,6 +1564,13 @@ static int ocfs2_symlink(struct inode *dir,
        handle_t *handle = NULL;
        struct ocfs2_alloc_context *inode_ac = NULL;
        struct ocfs2_alloc_context *data_ac = NULL;
+        struct ocfs2_alloc_context *xattr_ac = NULL;
+        int want_clusters = 0;
+        int xattr_credits = 0;
+        struct ocfs2_security_xattr_info si = {
+                .enable = 1,
+        };
+        int did_quota = 0, did_quota_inode = 0;
        mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
                   dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1542,17 +1617,46 @@ static int ocfs2_symlink(struct inode *dir,
                goto bail;
        }
-        /* don't reserve bitmap space for fast symlinks. */
+        inode = ocfs2_get_init_inode(dir, S_IFLNK | S_IRWXUGO);
-        if (l > ocfs2_fast_symlink_chars(sb)) {
+        if (!inode) {
-                status = ocfs2_reserve_clusters(osb, 1, &data_ac);
+                status = -ENOMEM;
+                mlog_errno(status);
+                goto bail;
+        }
+        /* get security xattr */
+        status = ocfs2_init_security_get(inode, dir, &si);
+        if (status) {
+                if (status == -EOPNOTSUPP)
+                        si.enable = 0;
+                else {
+                        mlog_errno(status);
+                        goto bail;
+                }
+        }
+        /* calculate meta data/clusters for setting security xattr */
+        if (si.enable) {
+                status = ocfs2_calc_security_init(dir, &si, &want_clusters,
+                                                  &xattr_credits, &xattr_ac);
                if (status < 0) {
-                        if (status != -ENOSPC)
+                        mlog_errno(status);
-                                mlog_errno(status);
                        goto bail;
                }
        }
-        handle = ocfs2_start_trans(osb, credits);
+        /* don't reserve bitmap space for fast symlinks. */
+        if (l > ocfs2_fast_symlink_chars(sb))
+                want_clusters += 1;
+        status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
+        if (status < 0) {
+                if (status != -ENOSPC)
+                        mlog_errno(status);
+                goto bail;
+        }
+        handle = ocfs2_start_trans(osb, credits + xattr_credits);
        if (IS_ERR(handle)) {
                status = PTR_ERR(handle);
                handle = NULL;
@@ -1560,10 +1664,18 @@ static int ocfs2_symlink(struct inode *dir,
                goto bail;
        }
-        status = ocfs2_mknod_locked(osb, dir, dentry,
+        /* We don't use the standard VFS wrapper because we don't want
-                                    S_IFLNK | S_IRWXUGO, 0,
+         * vfs_dq_init() to be called. */
-                                    &new_fe_bh, parent_fe_bh, handle,
+        if (sb_any_quota_active(osb->sb) &&
-                                    &inode, inode_ac);
+            osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
+                status = -EDQUOT;
+                goto bail;
+        }
+        did_quota_inode = 1;
+        status = ocfs2_mknod_locked(osb, dir, inode, dentry,
+                                    0, &new_fe_bh, parent_fe_bh, handle,
+                                    inode_ac);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -1576,6 +1688,12 @@ static int ocfs2_symlink(struct inode *dir,
                u32 offset = 0;
                inode->i_op = &ocfs2_symlink_inode_operations;
+                if (vfs_dq_alloc_space_nodirty(inode,
+                    ocfs2_clusters_to_bytes(osb->sb, 1))) {
+                        status = -EDQUOT;
+                        goto bail;
+                }
+                did_quota = 1;
                status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
                                              new_fe_bh,
                                              handle, data_ac, NULL,
@@ -1614,6 +1732,15 @@ static int ocfs2_symlink(struct inode *dir,
                }
        }
+        if (si.enable) {
+                status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
+                                                 xattr_ac, data_ac);
+                if (status < 0) {
+                        mlog_errno(status);
+                        goto bail;
+                }
+        }
        status = ocfs2_add_entry(handle, dentry, inode,
                                 le64_to_cpu(fe->i_blkno), parent_fe_bh,
                                 de_bh);
@@ -1632,6 +1759,11 @@ static int ocfs2_symlink(struct inode *dir,
        dentry->d_op = &ocfs2_dentry_ops;
        d_instantiate(dentry, inode);
 bail:
+        if (status < 0 && did_quota)
+                vfs_dq_free_space_nodirty(inode,
+                                        ocfs2_clusters_to_bytes(osb->sb, 1));
+        if (status < 0 && did_quota_inode)
+                vfs_dq_free_inode(inode);
        if (handle)
                ocfs2_commit_trans(osb, handle);
@@ -1640,12 +1772,18 @@ bail:
        brelse(new_fe_bh);
        brelse(parent_fe_bh);
        brelse(de_bh);
+        kfree(si.name);
+        kfree(si.value);
        if (inode_ac)
                ocfs2_free_alloc_context(inode_ac);
        if (data_ac)
                ocfs2_free_alloc_context(data_ac);
-        if ((status < 0) && inode)
+        if (xattr_ac)
+                ocfs2_free_alloc_context(xattr_ac);
+        if ((status < 0) && inode) {
+                clear_nlink(inode);
                iput(inode);
+        }
        mlog_exit(status);
@@ -1754,16 +1892,14 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
        mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
-        status = ocfs2_read_block(orphan_dir_inode,
+        status = ocfs2_read_inode_block(orphan_dir_inode, &orphan_dir_bh);
-                                  OCFS2_I(orphan_dir_inode)->ip_blkno,
-                                  &orphan_dir_bh);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
        }
-        status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh,
+        status = ocfs2_journal_access_di(handle, orphan_dir_inode, orphan_dir_bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -1850,8 +1986,8 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
                goto leave;
        }
-        status = ocfs2_journal_access(handle,orphan_dir_inode,  orphan_dir_bh,
+        status = ocfs2_journal_access_di(handle,orphan_dir_inode,  orphan_dir_bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 3fed9e3d8992..077384135f4e 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -161,6 +161,7 @@ enum ocfs2_vol_state
 {
        VOLUME_INIT = 0,
        VOLUME_MOUNTED,
+        VOLUME_MOUNTED_QUOTAS,
        VOLUME_DISMOUNTED,
        VOLUME_DISABLED
 };
@@ -195,6 +196,9 @@ enum ocfs2_mount_options
        OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
        OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
        OCFS2_MOUNT_INODE64 = 1 << 7,   /* Allow inode numbers > 2^32 */
+        OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* POSIX access control lists */
+        OCFS2_MOUNT_USRQUOTA = 1 << 9, /* We support user quotas */
+        OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */
 };
 #define OCFS2_OSB_SOFT_RO       0x0001
@@ -205,6 +209,8 @@ enum ocfs2_mount_options
 struct ocfs2_journal;
 struct ocfs2_slot_info;
 struct ocfs2_recovery_map;
+struct ocfs2_quota_recovery;
+struct ocfs2_dentry_lock;
 struct ocfs2_super
 {
        struct task_struct *commit_task;
@@ -286,10 +292,11 @@ struct ocfs2_super
        char *local_alloc_debug_buf;
 #endif
-        /* Next two fields are for local node slot recovery during
+        /* Next three fields are for local node slot recovery during
         * mount. */
        int dirty;
        struct ocfs2_dinode *local_alloc_copy;
+        struct ocfs2_quota_recovery *quota_rec;
        struct ocfs2_alloc_stats alloc_stats;
        char dev_str[20];               /* "major,minor" of the device */
@@ -319,6 +326,11 @@ struct ocfs2_super
        struct list_head blocked_lock_list;
        unsigned long blocked_lock_count;
+        /* List of dentry locks to release. Anyone can add locks to
+         * the list, ocfs2_wq processes the list  */
+        struct ocfs2_dentry_lock *dentry_lock_list;
+        struct work_struct dentry_lock_work;
        wait_queue_head_t               osb_mount_event;
        /* Truncate log info */
@@ -333,6 +345,10 @@ struct ocfs2_super
 #define OCFS2_SB(sb)        ((struct ocfs2_super *)(sb)->s_fs_info)
+/*
+ * Useful typedef for passing around journal access functions: a pointer
+ * to a function that takes (handle, inode, bh, type) and returns int.
+ */
+typedef int (*ocfs2_journal_access_func)(handle_t *handle, struct inode *inode,
+                                         struct buffer_head *bh, int type);
 static inline int ocfs2_should_order_data(struct inode *inode)
 {
        if (!S_ISREG(inode->i_mode))
@@ -376,6 +392,13 @@ static inline int ocfs2_supports_xattr(struct ocfs2_super *osb)
        return 0;
 }
+/* Report whether the metadata ECC feature is enabled on this volume:
+ * returns 1 when OCFS2_FEATURE_INCOMPAT_META_ECC is set in
+ * osb->s_feature_incompat, 0 otherwise. */
+static inline int ocfs2_meta_ecc(struct ocfs2_super *osb)
+{
+        return (osb->s_feature_incompat &
+                OCFS2_FEATURE_INCOMPAT_META_ECC) ? 1 : 0;
+}
 /* set / clear functions because cluster events can make these happen
 * in parallel so we want the transitions to be atomic. this also
 * means that any future flags osb_flags must be protected by spinlock
@@ -443,39 +466,19 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
 #define OCFS2_IS_VALID_DINODE(ptr)                                      \
        (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
-#define OCFS2_RO_ON_INVALID_DINODE(__sb, __di)  do {                    \
-        typeof(__di) ____di = (__di);                                   \
-        ocfs2_error((__sb),                                             \
-                "Dinode # %llu has bad signature %.*s",                 \
-                (unsigned long long)le64_to_cpu((____di)->i_blkno), 7,  \
-                (____di)->i_signature);                                 \
-} while (0)
 #define OCFS2_IS_VALID_EXTENT_BLOCK(ptr)                                \
        (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE))
-#define OCFS2_RO_ON_INVALID_EXTENT_BLOCK(__sb, __eb)    do {            \
-        typeof(__eb) ____eb = (__eb);                                   \
-        ocfs2_error((__sb),                                             \
-                "Extent Block # %llu has bad signature %.*s",           \
-                (unsigned long long)le64_to_cpu((____eb)->h_blkno), 7,  \
-                (____eb)->h_signature);                                 \
-} while (0)
 #define OCFS2_IS_VALID_GROUP_DESC(ptr)                                  \
        (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))
-#define OCFS2_RO_ON_INVALID_GROUP_DESC(__sb, __gd)      do {            \
-        typeof(__gd) ____gd = (__gd);                                   \
-                ocfs2_error((__sb),                                     \
-                "Group Descriptor # %llu has bad signature %.*s",       \
-                (unsigned long long)le64_to_cpu((____gd)->bg_blkno), 7, \
-                (____gd)->bg_signature);                                \
-} while (0)
 #define OCFS2_IS_VALID_XATTR_BLOCK(ptr)                                 \
        (!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE))
+/* Non-zero when (ptr)->db_signature compares equal (via strcmp) to
+ * OCFS2_DIR_TRAILER_SIGNATURE. */
+#define OCFS2_IS_VALID_DIR_TRAILER(ptr)                                 \
+        (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE))
 static inline unsigned long ino_from_blkno(struct super_block *sb,
                                           u64 blkno)
 {
@@ -632,5 +635,6 @@ static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
 #define ocfs2_clear_bit ext2_clear_bit
 #define ocfs2_test_bit ext2_test_bit
 #define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
+#define ocfs2_find_next_bit ext2_find_next_bit
 #endif  /* OCFS2_H */
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 5e0c0d0aef7d..c7ae45aaa36c 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -65,6 +65,7 @@
 #define OCFS2_EXTENT_BLOCK_SIGNATURE    "EXBLK01"
 #define OCFS2_GROUP_DESC_SIGNATURE      "GROUP01"
 #define OCFS2_XATTR_BLOCK_SIGNATURE     "XATTR01"
+#define OCFS2_DIR_TRAILER_SIGNATURE     "DIRTRL1"
 /* Compatibility flags */
 #define OCFS2_HAS_COMPAT_FEATURE(sb,mask)                       \
@@ -93,8 +94,11 @@
                                         | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
                                         | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
                                         | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
-                                         | OCFS2_FEATURE_INCOMPAT_XATTR)
+                                         | OCFS2_FEATURE_INCOMPAT_XATTR \
-#define OCFS2_FEATURE_RO_COMPAT_SUPP    OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
+                                         | OCFS2_FEATURE_INCOMPAT_META_ECC)
+#define OCFS2_FEATURE_RO_COMPAT_SUPP    (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
+                                         | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
+                                         | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
 /*
 * Heartbeat-only devices are missing journals and other files.  The
@@ -147,6 +151,9 @@
 /* Support for extended attributes */
 #define OCFS2_FEATURE_INCOMPAT_XATTR            0x0200
+/* Metadata checksum and error correction */
+#define OCFS2_FEATURE_INCOMPAT_META_ECC         0x0800
 /*
 * backup superblock flag is used to indicate that this volume
 * has backup superblocks.
@@ -163,6 +170,12 @@
 */
 #define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN       0x0001
+/*
+ * Maintain quota information for this filesystem
+ */
+#define OCFS2_FEATURE_RO_COMPAT_USRQUOTA        0x0002
+#define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA        0x0004
 /* The byte offset of the first backup block will be 1G.
 * The following will be 4G, 16G, 64G, 256G and 1T.
 */
@@ -192,6 +205,7 @@
 #define OCFS2_HEARTBEAT_FL      (0x00000200)    /* Heartbeat area */
 #define OCFS2_CHAIN_FL          (0x00000400)    /* Chain allocator */
 #define OCFS2_DEALLOC_FL        (0x00000800)    /* Truncate log */
+#define OCFS2_QUOTA_FL          (0x00001000)    /* Quota file */
 /*
 * Flags on ocfs2_dinode.i_dyn_features
@@ -329,13 +343,17 @@ enum {
 #define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
        HEARTBEAT_SYSTEM_INODE,
        GLOBAL_BITMAP_SYSTEM_INODE,
-#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GLOBAL_BITMAP_SYSTEM_INODE
+        USER_QUOTA_SYSTEM_INODE,
+        GROUP_QUOTA_SYSTEM_INODE,
+#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE
        ORPHAN_DIR_SYSTEM_INODE,
        EXTENT_ALLOC_SYSTEM_INODE,
        INODE_ALLOC_SYSTEM_INODE,
        JOURNAL_SYSTEM_INODE,
        LOCAL_ALLOC_SYSTEM_INODE,
        TRUNCATE_LOG_SYSTEM_INODE,
+        LOCAL_USER_QUOTA_SYSTEM_INODE,
+        LOCAL_GROUP_QUOTA_SYSTEM_INODE,
        NUM_SYSTEM_INODES
 };
@@ -349,6 +367,8 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
        [SLOT_MAP_SYSTEM_INODE]                 = { "slot_map", 0, S_IFREG | 0644 },
        [HEARTBEAT_SYSTEM_INODE]                = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 },
        [GLOBAL_BITMAP_SYSTEM_INODE]            = { "global_bitmap", 0, S_IFREG | 0644 },
+        [USER_QUOTA_SYSTEM_INODE]               = { "aquota.user", OCFS2_QUOTA_FL, S_IFREG | 0644 },
+        [GROUP_QUOTA_SYSTEM_INODE]              = { "aquota.group", OCFS2_QUOTA_FL, S_IFREG | 0644 },
        /* Slot-specific system inodes (one copy per slot) */
        [ORPHAN_DIR_SYSTEM_INODE]               = { "orphan_dir:%04d", 0, S_IFDIR | 0755 },
@@ -356,7 +376,9 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
        [INODE_ALLOC_SYSTEM_INODE]              = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
        [JOURNAL_SYSTEM_INODE]                  = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 },
        [LOCAL_ALLOC_SYSTEM_INODE]              = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 },
-        [TRUNCATE_LOG_SYSTEM_INODE]             = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 }
+        [TRUNCATE_LOG_SYSTEM_INODE]             = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 },
+        [LOCAL_USER_QUOTA_SYSTEM_INODE]         = { "aquota.user:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 },
+        [LOCAL_GROUP_QUOTA_SYSTEM_INODE]        = { "aquota.group:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 },
 };
 /* Parameter passed from mount.ocfs2 to module */
@@ -410,6 +432,22 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
 #define OCFS2_RAW_SB(dinode)            (&((dinode)->id2.i_super))
 /*
+ * Block checking structure.  This is used in metadata to validate the
+ * contents.  If OCFS2_FEATURE_INCOMPAT_META_ECC is not set, it is all
+ * zeros.
+ *
+ * The structure is 8 bytes long (see the offset markers below).
+ */
+struct ocfs2_block_check {
+/*00*/  __le32 bc_crc32e;       /* 802.3 Ethernet II CRC32 */
+        __le16 bc_ecc;          /* Single-error-correction parity vector.
+                                   This is a simple Hamming code dependent
+                                   on the blocksize.  OCFS2's maximum
+                                   blocksize, 4K, requires 16 parity bits,
+                                   so we fit in __le16. */
+        __le16 bc_reserved1;    /* Reserved; fills the structure out to
+                                   8 bytes */
+/*08*/
+};
+/*
 * On disk extent record for OCFS2
 * It describes a range of clusters on disk.
 *
@@ -496,7 +534,7 @@ struct ocfs2_truncate_log {
 struct ocfs2_extent_block
 {
 /*00*/  __u8 h_signature[8];            /* Signature for verification */
-        __le64 h_reserved1;
+        struct ocfs2_block_check h_check;       /* Error checking */
 /*10*/  __le16 h_suballoc_slot;         /* Slot suballocator this
                                           extent_header belongs to */
        __le16 h_suballoc_bit;          /* Bit offset in suballocator
@@ -666,7 +704,8 @@ struct ocfs2_dinode {
                                           was set in i_flags */
        __le16 i_dyn_features;
        __le64 i_xattr_loc;
-/*80*/  __le64 i_reserved2[7];
+/*80*/  struct ocfs2_block_check i_check;       /* Error checking */
+/*88*/  __le64 i_reserved2[6];
 /*B8*/  union {
                __le64 i_pad1;          /* Generic way to refer to this
                                           64bit union */
@@ -715,6 +754,34 @@ struct ocfs2_dir_entry {
 } __attribute__ ((packed));
 /*
+ * Per-block record for the unindexed directory btree. This is carefully
+ * crafted so that the rec_len and name_len records of an ocfs2_dir_entry are
+ * mirrored. That way, the directory manipulation code needs a minimal amount
+ * of update.
+ *
+ * The offset markers below follow from the field sizes: three 64-bit
+ * fields sit between db_signature (0x10) and db_parent_dinode (0x30), and
+ * the structure ends at 0x40.
+ *
+ * NOTE: Keep this structure aligned to a multiple of 4 bytes.
+ */
+struct ocfs2_dir_block_trailer {
+/*00*/  __le64          db_compat_inode;        /* Always zero. Was inode */
+        __le16          db_compat_rec_len;      /* Backwards compatible with
+                                                 * ocfs2_dir_entry. */
+        __u8            db_compat_name_len;     /* Always zero. Was name_len */
+        __u8            db_reserved0;
+        __le16          db_reserved1;
+        __le16          db_free_rec_len;        /* Size of largest empty hole
+                                                 * in this block. (unused) */
+/*10*/  __u8            db_signature[8];        /* Signature for verification */
+        __le64          db_reserved2;
+/*20*/  __le64          db_free_next;           /* Next block in list (unused) */
+        __le64          db_blkno;               /* Offset on disk, in blocks */
+/*30*/  __le64          db_parent_dinode;       /* dinode which owns me, in
+                                                   blocks */
+        struct ocfs2_block_check db_check;      /* Error checking */
+/*40*/
+};
+/*
 * On disk allocator group structure for OCFS2
 */
 struct ocfs2_group_desc
@@ -733,7 +800,8 @@ struct ocfs2_group_desc
 /*20*/  __le64   bg_parent_dinode;       /* dinode which owns me, in
                                           blocks */
        __le64   bg_blkno;               /* Offset on disk, in blocks */
-/*30*/  __le64   bg_reserved2[2];
+/*30*/  struct ocfs2_block_check bg_check;      /* Error checking */
+        __le64   bg_reserved2;
 /*40*/  __u8    bg_bitmap[0];
 };
@@ -776,7 +844,12 @@ struct ocfs2_xattr_header {
                                                   in this extent record,
                                                   only valid in the first
                                                   bucket. */
-        __le64  xh_csum;
+        struct ocfs2_block_check xh_check;      /* Error checking
+                                                   (Note, this is only
+                                                    used for xattr
+                                                    buckets.  A block uses
+                                                    xb_check and sets
+                                                    this field to zero.) */
        struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */
 };
@@ -827,7 +900,7 @@ struct ocfs2_xattr_block {
                                        block group */
        __le32  xb_fs_generation;    /* Must match super block */
 /*10*/  __le64  xb_blkno;            /* Offset on disk, in blocks */
-        __le64  xb_csum;
+        struct ocfs2_block_check xb_check;      /* Error checking */
 /*20*/  __le16  xb_flags;            /* Indicates whether this block contains
                                        real xattr or a xattr tree. */
        __le16  xb_reserved0;
@@ -868,6 +941,128 @@ static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe)
        return xe->xe_type & OCFS2_XATTR_TYPE_MASK;
 }
+/*
+ *  On disk structures for global quota file
+ */
+/* Magic numbers and known versions for global quota files */
+#define OCFS2_GLOBAL_QMAGICS {\
+        0x0cf52470, /* USRQUOTA */ \
+        0x0cf52471  /* GRPQUOTA */ \
+}
+#define OCFS2_GLOBAL_QVERSIONS {\
+        0, \
+        0, \
+}
+/* Each block of each quota file has a certain fixed number of bytes reserved
+ * for OCFS2 internal use at its end. OCFS2 can use it for things like
+ * checksums, etc. */
+#define OCFS2_QBLK_RESERVED_SPACE 8
+/* Generic header of all quota files.  OCFS2_GLOBAL_INFO_OFF and
+ * OCFS2_LOCAL_INFO_OFF are both defined as sizeof() of this structure,
+ * so the format-specific info header starts right after it. */
+struct ocfs2_disk_dqheader {
+        __le32 dqh_magic;       /* Magic number identifying file */
+        __le32 dqh_version;     /* Quota format version */
+};
+#define OCFS2_GLOBAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader))
+/* Information header of global quota file (immediately follows the generic
+ * header, i.e. it starts at byte offset OCFS2_GLOBAL_INFO_OFF) */
+struct ocfs2_global_disk_dqinfo {
+/*00*/  __le32 dqi_bgrace;      /* Grace time for space softlimit excess */
+        __le32 dqi_igrace;      /* Grace time for inode softlimit excess */
+        __le32 dqi_syncms;      /* Time after which we sync local changes to
+                                 * global quota file (presumably in
+                                 * milliseconds, going by the name) */
+        __le32 dqi_blocks;      /* Number of blocks in quota file */
+/*10*/  __le32 dqi_free_blk;    /* First free block in quota file */
+        __le32 dqi_free_entry;  /* First block with free dquot entry in quota
+                                 * file */
+/*18*/
+};
+/* Structure with global user / group information. We reserve some space
+ * for future use (the dqb_pad* fields at the end). */
+struct ocfs2_global_disk_dqblk {
+/*00*/  __le32 dqb_id;          /* ID the structure belongs to */
+        __le32 dqb_use_count;   /* Number of nodes having reference to this structure */
+        __le64 dqb_ihardlimit;  /* absolute limit on allocated inodes */
+/*10*/  __le64 dqb_isoftlimit;  /* preferred inode limit */
+        __le64 dqb_curinodes;   /* current # allocated inodes */
+/*20*/  __le64 dqb_bhardlimit;  /* absolute limit on disk space */
+        __le64 dqb_bsoftlimit;  /* preferred limit on disk space */
+/*30*/  __le64 dqb_curspace;    /* current space occupied */
+        __le64 dqb_btime;       /* time limit for excessive disk use */
+/*40*/  __le64 dqb_itime;       /* time limit for excessive inode use */
+        __le64 dqb_pad1;        /* reserved for future use */
+/*50*/  __le64 dqb_pad2;        /* reserved for future use */
+/*58*/
+};
+/*
+ *  On-disk structures for local quota file
+ */
+/* Magic numbers and known versions for local quota files */
+#define OCFS2_LOCAL_QMAGICS {\
+        0x0cf524c0, /* USRQUOTA */ \
+        0x0cf524c1  /* GRPQUOTA */ \
+}
+#define OCFS2_LOCAL_QVERSIONS {\
+        0, \
+        0, \
+}
+/* Quota flags in dqinfo header */
+#define OLQF_CLEAN      0x0001  /* Quota file is empty (this should be after\
+                                 * quota has been cleanly turned off) */
+#define OCFS2_LOCAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader))
+/* Information header of local quota file (immediately follows the generic
+ * header, i.e. it starts at byte offset OCFS2_LOCAL_INFO_OFF) */
+struct ocfs2_local_disk_dqinfo {
+        __le32 dqi_flags;       /* Flags for quota file (OLQF_* values) */
+        __le32 dqi_chunks;      /* Number of chunks of quota structures
+                                 * with a bitmap */
+        __le32 dqi_blocks;      /* Number of blocks allocated for quota file */
+};
+/* Header of one chunk of a quota file.  The bitmap is a variable-length
+ * trailing array; it is declared with __u8, like the other byte arrays in
+ * the on-disk structures of this header, rather than the kernel-internal
+ * u8 type. */
+struct ocfs2_local_disk_chunk {
+        __le32 dqc_free;        /* Number of free entries in the bitmap */
+        __u8 dqc_bitmap[0];     /* Bitmap of entries in the corresponding
+                                 * chunk of quota file */
+};
+/* One entry in local quota file (three 64-bit fields, 0x18 bytes) */
+struct ocfs2_local_disk_dqblk {
+/*00*/  __le64 dqb_id;          /* id this quota applies to */
+        __le64 dqb_spacemod;    /* Change in the amount of used space */
+/*10*/  __le64 dqb_inodemod;    /* Change in the amount of used inodes */
+/*18*/
+};
+/*
+ * The quota trailer lives at the end of each quota block, in the last
+ * OCFS2_QBLK_RESERVED_SPACE bytes (see ocfs2_block_dqtrailer()).
+ */
+struct ocfs2_disk_dqtrailer {
+/*00*/  struct ocfs2_block_check dq_check;      /* Error checking */
+/*08*/  /* Cannot be larger than OCFS2_QBLK_RESERVED_SPACE */
+};
+/* Return a pointer to the quota trailer of the block held in buf.  The
+ * trailer starts OCFS2_QBLK_RESERVED_SPACE bytes before the end of a
+ * block of blocksize bytes. */
+static inline struct ocfs2_disk_dqtrailer *ocfs2_block_dqtrailer(int blocksize,
+                                                                 void *buf)
+{
+        char *base = buf;
+        int trailer_off = blocksize - OCFS2_QBLK_RESERVED_SPACE;
+        return (struct ocfs2_disk_dqtrailer *)(base + trailer_off);
+}
 #ifdef __KERNEL__
 static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
 {
diff --git a/fs/ocfs2/ocfs2_jbd_compat.h b/fs/ocfs2/ocfs2_jbd_compat.h
deleted file mode 100644
index b91c78f8f558..000000000000
--- a/fs/ocfs2/ocfs2_jbd_compat.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ocfs2_jbd_compat.h
- *
- * Compatibility defines for JBD.
- *
- * Copyright (C) 2008 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- */
-#ifndef OCFS2_JBD_COMPAT_H
-#define OCFS2_JBD_COMPAT_H
-#ifndef CONFIG_OCFS2_COMPAT_JBD
-# error Should not have been included
-#endif
-struct jbd2_inode {
-        unsigned int dummy;
-};
-#define JBD2_BARRIER                    JFS_BARRIER
-#define JBD2_DEFAULT_MAX_COMMIT_AGE     JBD_DEFAULT_MAX_COMMIT_AGE
-#define jbd2_journal_ack_err                    journal_ack_err
-#define jbd2_journal_clear_err                  journal_clear_err
-#define jbd2_journal_destroy                    journal_destroy
-#define jbd2_journal_dirty_metadata             journal_dirty_metadata
-#define jbd2_journal_errno                      journal_errno
-#define jbd2_journal_extend                     journal_extend
-#define jbd2_journal_flush                      journal_flush
-#define jbd2_journal_force_commit               journal_force_commit
-#define jbd2_journal_get_write_access           journal_get_write_access
-#define jbd2_journal_get_undo_access            journal_get_undo_access
-#define jbd2_journal_init_inode                 journal_init_inode
-#define jbd2_journal_invalidatepage             journal_invalidatepage
-#define jbd2_journal_load                       journal_load
-#define jbd2_journal_lock_updates               journal_lock_updates
-#define jbd2_journal_restart                    journal_restart
-#define jbd2_journal_start                      journal_start
-#define jbd2_journal_start_commit               journal_start_commit
-#define jbd2_journal_stop                       journal_stop
-#define jbd2_journal_try_to_free_buffers        journal_try_to_free_buffers
-#define jbd2_journal_unlock_updates             journal_unlock_updates
-#define jbd2_journal_wipe                       journal_wipe
-#define jbd2_log_wait_commit                    log_wait_commit
-static inline int jbd2_journal_file_inode(handle_t *handle,
-                                          struct jbd2_inode *inode)
-{
-        return 0;
-}
-static inline int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
-                                                      loff_t new_size)
-{
-        return 0;
-}
-static inline void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode,
-                                               struct inode *inode)
-{
-        return;
-}
-static inline void jbd2_journal_release_jbd_inode(journal_t *journal,
-                                                  struct jbd2_inode *jinode)
-{
-        return;
-}
-#endif  /* OCFS2_JBD_COMPAT_H */
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 82c200f7a8f1..eb6f50c9ceca 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -46,6 +46,7 @@ enum ocfs2_lock_type {
        OCFS2_LOCK_TYPE_DENTRY,
        OCFS2_LOCK_TYPE_OPEN,
        OCFS2_LOCK_TYPE_FLOCK,
+        OCFS2_LOCK_TYPE_QINFO,
        OCFS2_NUM_LOCK_TYPES
 };
@@ -77,6 +78,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
                case OCFS2_LOCK_TYPE_FLOCK:
                        c = 'F';
                        break;
+                case OCFS2_LOCK_TYPE_QINFO:
+                        c = 'Q';
+                        break;
                default:
                        c = '\0';
        }
@@ -95,6 +99,7 @@ static char *ocfs2_lock_type_strings[] = {
        [OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
        [OCFS2_LOCK_TYPE_OPEN] = "Open",
        [OCFS2_LOCK_TYPE_FLOCK] = "Flock",
+        [OCFS2_LOCK_TYPE_QINFO] = "Quota",
 };
 static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
new file mode 100644
index 000000000000..7365e2e08706
--- /dev/null
+++ b/fs/ocfs2/quota.h
@@ -0,0 +1,119 @@
+/*
+ * quota.h for OCFS2
+ *
+ * On disk quota structures for local and global quota file, in-memory
+ * structures.
+ *
+ */
+#ifndef _OCFS2_QUOTA_H
+#define _OCFS2_QUOTA_H
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/quota.h>
+#include <linux/list.h>
+#include <linux/dqblk_qtree.h>
+#include "ocfs2.h"
+/* Common stuff */
+/* id number of quota format */
+#define QFMT_OCFS2 3
+/*
+ * In-memory structures
+ */
+struct ocfs2_dquot {
+        struct dquot dq_dquot;  /* Generic VFS dquot; OCFS2_DQUOT() maps a
+                                 * pointer to it back to this structure */
+        loff_t dq_local_off;    /* Offset in the local quota file */
+        struct ocfs2_quota_chunk *dq_chunk;     /* Chunk dquot is in */
+        unsigned int dq_use_count;      /* Number of nodes having reference
+                                         * to this entry in global quota
+                                         * file */
+        s64 dq_origspace;       /* Last globally synced space usage */
+        s64 dq_originodes;      /* Last globally synced inode usage */
+};
+/* Description of one chunk to recover in memory */
+struct ocfs2_recovery_chunk {
+        struct list_head rc_list;       /* List of chunks */
+        int rc_chunk;                   /* Chunk number */
+        unsigned long *rc_bitmap;       /* Bitmap of entries to recover */
+};
+struct ocfs2_quota_recovery {
+        struct list_head r_list[MAXQUOTAS];     /* List of chunks to recover */
+};
+/* In-memory structure with quota header information */
+struct ocfs2_mem_dqinfo {
+        unsigned int dqi_type;          /* Quota type this structure describes */
+        unsigned int dqi_chunks;        /* Number of chunks in local quota file */
+        unsigned int dqi_blocks;        /* Number of blocks allocated for
+                                         * local quota file */
+        unsigned int dqi_syncms;        /* How often should we sync with other nodes */
+        unsigned int dqi_syncjiff;      /* Precomputed dqi_syncms in jiffies */
+        struct list_head dqi_chunk;     /* List of chunks */
+        struct inode *dqi_gqinode;      /* Global quota file inode */
+        struct ocfs2_lock_res dqi_gqlock;       /* Lock protecting quota
+                                                 * information structure */
+        struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file
+                                         * inode - set only if inode lock is
+                                         * obtained */
+        int dqi_gqi_count;              /* Number of holders of dqi_gqi_bh */
+        struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */
+        struct buffer_head *dqi_ibh;    /* Buffer with information header */
+        struct qtree_mem_dqinfo dqi_gi; /* Info about global file */
+        struct delayed_work dqi_sync_work;      /* Work for syncing dquots */
+        struct ocfs2_quota_recovery *dqi_rec;   /* Pointer to recovery
+                                                 * information, in case we
+                                                 * enable quotas on file
+                                                 * needing it */
+};
+/* Map a generic VFS dquot to the ocfs2_dquot that embeds it as dq_dquot. */
+static inline struct ocfs2_dquot *OCFS2_DQUOT(struct dquot *dquot)
+{
+        struct ocfs2_dquot *odquot;
+        odquot = container_of(dquot, struct ocfs2_dquot, dq_dquot);
+        return odquot;
+}
+struct ocfs2_quota_chunk {
+        struct list_head qc_chunk;      /* List of quotafile chunks */
+        int qc_num;                     /* Number of quota chunk */
+        struct buffer_head *qc_headerbh;        /* Buffer head with chunk header */
+};
+extern struct kmem_cache *ocfs2_dquot_cachep;
+extern struct kmem_cache *ocfs2_qf_chunk_cachep;
+extern struct qtree_fmt_operations ocfs2_global_ops;
+struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
+                                struct ocfs2_super *osb, int slot_num);
+int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
+                                struct ocfs2_quota_recovery *rec,
+                                int slot_num);
+void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec);
+ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
+                         size_t len, loff_t off);
+ssize_t ocfs2_quota_write(struct super_block *sb, int type,
+                          const char *data, size_t len, loff_t off);
+int ocfs2_global_read_info(struct super_block *sb, int type);
+int ocfs2_global_write_info(struct super_block *sb, int type);
+int ocfs2_global_read_dquot(struct dquot *dquot);
+int __ocfs2_sync_dquot(struct dquot *dquot, int freeing);
+/* Thin wrapper: calls __ocfs2_sync_dquot() with freeing == 0 and returns
+ * its result. */
+static inline int ocfs2_sync_dquot(struct dquot *dquot)
+{
+        return __ocfs2_sync_dquot(dquot, 0);
+}
+/* Thin wrapper: calls __ocfs2_sync_dquot() with freeing == 1 and returns
+ * its result. */
+static inline int ocfs2_global_release_dquot(struct dquot *dquot)
+{
+        return __ocfs2_sync_dquot(dquot, 1);
+}
+int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
+void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
+int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
+                           struct buffer_head **bh);
+extern struct dquot_operations ocfs2_quota_operations;
+extern struct quota_format_type ocfs2_quota_format;
+int ocfs2_quota_setup(void);
+void ocfs2_quota_shutdown(void);
+#endif /* _OCFS2_QUOTA_H */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
new file mode 100644
index 000000000000..1ed0f7c86869
--- /dev/null
+++ b/fs/ocfs2/quota_global.c
@@ -0,0 +1,862 @@
+/*
+ *  Implementation of operations over global quota file
+ */
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/quota.h>
+#include <linux/quotaops.h>
+#include <linux/dqblk_qtree.h>
+#include <linux/jiffies.h>
+#include <linux/writeback.h>
+#include <linux/workqueue.h>
+#define MLOG_MASK_PREFIX ML_QUOTA
+#include <cluster/masklog.h>
+#include "ocfs2_fs.h"
+#include "ocfs2.h"
+#include "alloc.h"
+#include "blockcheck.h"
+#include "inode.h"
+#include "journal.h"
+#include "file.h"
+#include "sysfile.h"
+#include "dlmglue.h"
+#include "uptodate.h"
+#include "quota.h"
+static struct workqueue_struct *ocfs2_quota_wq = NULL;
+static void qsync_work_fn(struct work_struct *work);
+/*
+ * Copy the on-disk global quota entry @dp into the in-memory @dquot.
+ * A group of fields is skipped when its DQ_LASTSET_B bit is set in dq_flags,
+ * so entries set by the admin are not overwritten.  The use count is always
+ * taken from the disk entry.
+ */
+static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp)
+{
+        struct mem_dqblk *mem = &dquot->dq_dqb;
+        struct ocfs2_global_disk_dqblk *disk = dp;
+        /* Inode limits */
+        if (!test_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags)) {
+                mem->dqb_ihardlimit = le64_to_cpu(disk->dqb_ihardlimit);
+                mem->dqb_isoftlimit = le64_to_cpu(disk->dqb_isoftlimit);
+        }
+        /* Inode usage */
+        if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
+                mem->dqb_curinodes = le64_to_cpu(disk->dqb_curinodes);
+        /* Block limits */
+        if (!test_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags)) {
+                mem->dqb_bhardlimit = le64_to_cpu(disk->dqb_bhardlimit);
+                mem->dqb_bsoftlimit = le64_to_cpu(disk->dqb_bsoftlimit);
+        }
+        /* Space usage */
+        if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
+                mem->dqb_curspace = le64_to_cpu(disk->dqb_curspace);
+        /* Grace times */
+        if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags))
+                mem->dqb_btime = le64_to_cpu(disk->dqb_btime);
+        if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags))
+                mem->dqb_itime = le64_to_cpu(disk->dqb_itime);
+        OCFS2_DQUOT(dquot)->dq_use_count = le32_to_cpu(disk->dqb_use_count);
+}
+/*
+ * Fill the on-disk global quota entry @dp from the in-memory @dquot,
+ * converting every field to little endian.
+ */
+static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot)
+{
+        struct mem_dqblk *mem = &dquot->dq_dqb;
+        struct ocfs2_global_disk_dqblk *disk = dp;
+        /* Identity and use count */
+        disk->dqb_id = cpu_to_le32(dquot->dq_id);
+        disk->dqb_use_count = cpu_to_le32(OCFS2_DQUOT(dquot)->dq_use_count);
+        /* Inode limits and usage */
+        disk->dqb_ihardlimit = cpu_to_le64(mem->dqb_ihardlimit);
+        disk->dqb_isoftlimit = cpu_to_le64(mem->dqb_isoftlimit);
+        disk->dqb_curinodes = cpu_to_le64(mem->dqb_curinodes);
+        /* Block limits and space usage */
+        disk->dqb_bhardlimit = cpu_to_le64(mem->dqb_bhardlimit);
+        disk->dqb_bsoftlimit = cpu_to_le64(mem->dqb_bsoftlimit);
+        disk->dqb_curspace = cpu_to_le64(mem->dqb_curspace);
+        /* Grace times */
+        disk->dqb_btime = cpu_to_le64(mem->dqb_btime);
+        disk->dqb_itime = cpu_to_le64(mem->dqb_itime);
+}
+/*
+ * Tell whether the on-disk entry @dp belongs to @dquot: returns 0 for an
+ * unused entry, otherwise non-zero iff the stored id equals dquot->dq_id.
+ */
+static int ocfs2_global_is_id(void *dp, struct dquot *dquot)
+{
+        struct mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_type);
+        struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+        struct ocfs2_global_disk_dqblk *entry = dp;
+        if (qtree_entry_unused(&oinfo->dqi_gi, dp))
+                return 0;
+        return le32_to_cpu(entry->dqb_id) == dquot->dq_id;
+}
+struct qtree_fmt_operations ocfs2_global_ops = {        /* installed as dqi_gi.dqi_ops in ocfs2_global_read_info() */
+        .mem2disk_dqblk = ocfs2_global_mem2diskdqb,     /* in-memory dquot -> on-disk entry */
+        .disk2mem_dqblk = ocfs2_global_disk2memdqb,     /* on-disk entry -> in-memory dquot */
+        .is_id = ocfs2_global_is_id,                    /* does an entry belong to a dquot? */
+};
+/*
+ * Validation callback handed to ocfs2_read_virt_blocks(): checks the ECC
+ * stored in the trailer of a freshly read quota block.
+ */
+static int ocfs2_validate_quota_block(struct super_block *sb,
+                                      struct buffer_head *bh)
+{
+        struct ocfs2_disk_dqtrailer *trailer =
+                ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data);
+        mlog(0, "Validating quota block %llu\n",
+             (unsigned long long)bh->b_blocknr);
+        /* The buffer must already hold the block's data. */
+        BUG_ON(!buffer_uptodate(bh));
+        /*
+         * An ECC failure is reported to the caller, but the filesystem
+         * keeps running: the damage is known to be confined to this
+         * one block.
+         */
+        return ocfs2_validate_meta_ecc(sb, bh->b_data, &trailer->dq_check);
+}
+/*
+ * Read and validate virtual block @v_block of quota file @inode.
+ * If *@bh is NULL on entry it is set to the buffer that was read; a buffer
+ * supplied by the caller is passed through to ocfs2_read_virt_blocks().
+ * Returns 0 on success or the error from ocfs2_read_virt_blocks().
+ */
+int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
+                           struct buffer_head **bh)
+{
+        struct buffer_head *blk_bh = *bh;
+        int ret;
+        ret = ocfs2_read_virt_blocks(inode, v_block, 1, &blk_bh, 0,
+                                     ocfs2_validate_quota_block);
+        if (ret) {
+                mlog_errno(ret);
+                return ret;
+        }
+        /* Hand a newly obtained buffer head up to the caller. */
+        if (!*bh)
+                *bh = blk_bh;
+        return 0;
+}
+/*
+ * Get a buffer head for logical block @block of quota file @inode without
+ * reading it from disk (sb_getblk()).  The logical block is mapped to a
+ * physical one under ip_alloc_sem.
+ * On success *@bh holds the buffer and 0 is returned; otherwise the mapping
+ * error, or -EIO when no buffer could be obtained, is returned.
+ */
+static int ocfs2_get_quota_block(struct inode *inode, int block,
+                                 struct buffer_head **bh)
+{
+        u64 pblock, pcount;
+        int err;
+        down_read(&OCFS2_I(inode)->ip_alloc_sem);
+        err = ocfs2_extent_map_get_blocks(inode, block, &pblock, &pcount, NULL);
+        up_read(&OCFS2_I(inode)->ip_alloc_sem);
+        if (err) {
+                mlog_errno(err);
+                return err;
+        }
+        *bh = sb_getblk(inode->i_sb, pblock);
+        if (!*bh) {
+                err = -EIO;
+                mlog_errno(err);
+        }
+        return err;
+}
+/* Read data from global quotafile - avoid pagecache and such because we cannot
+ * afford acquiring the locks... We use quota cluster lock to serialize
+ * operations. Caller is responsible for acquiring it.
+ *
+ * Copies up to @len bytes starting at byte offset @off of the global quota
+ * file of @type into @data.  The request is clamped to the current file size
+ * and a start offset beyond the end of the file reads nothing.
+ *
+ * Returns the number of bytes copied, or the error reported by
+ * ocfs2_read_quota_block() if a block cannot be read. */
+ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
+                         size_t len, loff_t off)
+{
+        struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+        struct inode *gqinode = oinfo->dqi_gqinode;
+        loff_t i_size = i_size_read(gqinode);
+        int offset = off & (sb->s_blocksize - 1);       /* offset inside the first block */
+        sector_t blk = off >> sb->s_blocksize_bits;     /* first block to read */
+        int err = 0;
+        struct buffer_head *bh;
+        size_t toread, tocopy;
+        if (off > i_size)
+                return 0;
+        if (off + len > i_size)
+                len = i_size - off;     /* do not read past the end of the file */
+        toread = len;
+        while (toread > 0) {
+                tocopy = min_t(size_t, (sb->s_blocksize - offset), toread);
+                bh = NULL;      /* ask ocfs2_read_quota_block() for a fresh buffer */
+                err = ocfs2_read_quota_block(gqinode, blk, &bh);
+                if (err) {
+                        mlog_errno(err);
+                        return err;
+                }
+                memcpy(data, bh->b_data + offset, tocopy);
+                brelse(bh);
+                offset = 0;     /* later blocks are copied from their start */
+                toread -= tocopy;
+                data += tocopy;
+                blk++;
+        }
+        return len;
+}
+/*
+ * Write to quotafile (we know the transaction is already started and has
+ * enough credits).
+ *
+ * Copies @len bytes from @data to byte offset @off of the global quota file
+ * of @type.  Without a current journal handle the write is refused with
+ * -EIO.  A write must fit into the usable part of one block (block size
+ * minus OCFS2_QBLK_RESERVED_SPACE); a longer request triggers a warning and
+ * is truncated.  The file is extended first when the write ends beyond
+ * i_size.
+ *
+ * Returns the number of bytes written or an error code.  The quota inode's
+ * i_mutex is released on every path taken after it has been acquired.
+ */
+ssize_t ocfs2_quota_write(struct super_block *sb, int type,
+                          const char *data, size_t len, loff_t off)
+{
+        struct mem_dqinfo *info = sb_dqinfo(sb, type);
+        struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+        struct inode *gqinode = oinfo->dqi_gqinode;
+        int offset = off & (sb->s_blocksize - 1);
+        sector_t blk = off >> sb->s_blocksize_bits;
+        int err = 0, new = 0, ja_type;
+        struct buffer_head *bh = NULL;
+        handle_t *handle = journal_current_handle();
+        if (!handle) {
+                mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled "
+                     "because transaction was not started.\n",
+                     (unsigned long long)off, (unsigned long long)len);
+                return -EIO;
+        }
+        if (len > sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset) {
+                WARN_ON(1);
+                len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset;
+        }
+        mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA);
+        if (gqinode->i_size < off + len) {
+                down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
+                err = ocfs2_extend_no_holes(gqinode, off + len, off);
+                up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
+                if (err < 0)
+                        goto out;
+                err = ocfs2_simple_size_update(gqinode,
+                                               oinfo->dqi_gqi_bh,
+                                               off + len);
+                if (err < 0)
+                        goto out;
+                new = 1;
+        }
+        /* Not rewriting whole block? */
+        if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) &&
+            !new) {
+                err = ocfs2_read_quota_block(gqinode, blk, &bh);
+                ja_type = OCFS2_JOURNAL_ACCESS_WRITE;
+        } else {
+                err = ocfs2_get_quota_block(gqinode, blk, &bh);
+                ja_type = OCFS2_JOURNAL_ACCESS_CREATE;
+        }
+        /* i_mutex is held here, so fail through "out" which drops it (and
+         * logs the error) instead of returning directly. */
+        if (err)
+                goto out;
+        lock_buffer(bh);
+        if (new)
+                memset(bh->b_data, 0, sb->s_blocksize);
+        memcpy(bh->b_data + offset, data, len);
+        flush_dcache_page(bh->b_page);
+        set_buffer_uptodate(bh);
+        unlock_buffer(bh);
+        ocfs2_set_buffer_uptodate(gqinode, bh);
+        err = ocfs2_journal_access_dq(handle, gqinode, bh, ja_type);
+        if (err < 0) {
+                brelse(bh);
+                goto out;
+        }
+        err = ocfs2_journal_dirty(handle, bh);
+        brelse(bh);
+        if (err < 0)
+                goto out;
+out:
+        if (err) {
+                mutex_unlock(&gqinode->i_mutex);
+                mlog_errno(err);
+                return err;
+        }
+        gqinode->i_version++;
+        ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh);
+        mutex_unlock(&gqinode->i_mutex);
+        return len;
+}
+/*
+ * Take the inode lock of the global quota file (exclusive when @ex is set)
+ * and count the holder.  The first holder stores the inode buffer head in
+ * dqi_gqi_bh; later holders are expected to have been given the same one.
+ * Returns 0 on success or the error from ocfs2_inode_lock().
+ */
+int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+        struct buffer_head *inode_bh = NULL;
+        int ret;
+        ret = ocfs2_inode_lock(oinfo->dqi_gqinode, &inode_bh, ex);
+        if (ret < 0)
+                return ret;
+        spin_lock(&dq_data_lock);
+        if (oinfo->dqi_gqi_count++ == 0)
+                oinfo->dqi_gqi_bh = inode_bh;
+        else
+                WARN_ON(inode_bh != oinfo->dqi_gqi_bh);
+        spin_unlock(&dq_data_lock);
+        return 0;
+}
+/*
+ * Undo ocfs2_lock_global_qf(): drop the inode lock and one reference to the
+ * inode buffer head, and forget the buffer head once the last holder is
+ * gone.
+ */
+void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+        ocfs2_inode_unlock(oinfo->dqi_gqinode, ex);
+        brelse(oinfo->dqi_gqi_bh);
+        spin_lock(&dq_data_lock);
+        if (--oinfo->dqi_gqi_count == 0)
+                oinfo->dqi_gqi_bh = NULL;
+        spin_unlock(&dq_data_lock);
+}
+/* Read information header from global quota file.
+ *
+ * Looks up the global quota system inode for @type, initializes the global
+ * part of the ocfs2_mem_dqinfo hanging off sb_dqinfo(sb, type)->dqi_priv,
+ * reads the on-disk header under a shared global quota file lock and queues
+ * the periodic sync work.
+ *
+ * Returns the byte count of the header read on success, -EINVAL when the
+ * system inode cannot be obtained, -EIO on a short read, or another negative
+ * error.  NOTE(review): the inode reference is not dropped on the error
+ * paths here - presumably the caller cleans up dqi_gqinode; confirm. */
+int ocfs2_global_read_info(struct super_block *sb, int type)
+{
+        struct inode *gqinode = NULL;
+        unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
+                                        GROUP_QUOTA_SYSTEM_INODE };
+        struct ocfs2_global_disk_dqinfo dinfo;
+        struct mem_dqinfo *info = sb_dqinfo(sb, type);
+        struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+        int status;
+        mlog_entry_void();
+        /* Read global header */
+        gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
+                        OCFS2_INVALID_SLOT);
+        if (!gqinode) {
+                mlog(ML_ERROR, "failed to get global quota inode (type=%d)\n",
+                        type);
+                status = -EINVAL;
+                goto out_err;
+        }
+        oinfo->dqi_gi.dqi_sb = sb;
+        oinfo->dqi_gi.dqi_type = type;
+        ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo);
+        oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk);
+        oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops;
+        oinfo->dqi_gqi_bh = NULL;
+        oinfo->dqi_gqi_count = 0;
+        oinfo->dqi_gqinode = gqinode;   /* needed by the lock and read below */
+        status = ocfs2_lock_global_qf(oinfo, 0);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out_err;
+        }
+        status = sb->s_op->quota_read(sb, type, (char *)&dinfo,
+                                      sizeof(struct ocfs2_global_disk_dqinfo),
+                                      OCFS2_GLOBAL_INFO_OFF);
+        ocfs2_unlock_global_qf(oinfo, 0);
+        if (status != sizeof(struct ocfs2_global_disk_dqinfo)) {
+                mlog(ML_ERROR, "Cannot read global quota info (%d).\n",
+                     status);
+                if (status >= 0)        /* short read */
+                        status = -EIO;
+                mlog_errno(status);
+                goto out_err;
+        }
+        info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
+        info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
+        oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms);
+        oinfo->dqi_syncjiff = msecs_to_jiffies(oinfo->dqi_syncms);
+        oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
+        oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
+        oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
+        oinfo->dqi_gi.dqi_blocksize_bits = sb->s_blocksize_bits;
+        oinfo->dqi_gi.dqi_usable_bs = sb->s_blocksize -
+                                                OCFS2_QBLK_RESERVED_SPACE;
+        oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi);
+        INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn);
+        queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
+                           oinfo->dqi_syncjiff);
+out_err:
+        mlog_exit(status);
+        return status;
+}
+/* Write information to global quota file. Expects exclusive lock on quota
+ * file inode and quota info.
+ *
+ * Clears DQF_INFO_DIRTY, packs the grace times, sync interval and tree
+ * bookkeeping into a little-endian header and writes it at
+ * OCFS2_GLOBAL_INFO_OFF.  Returns 0 on success, -EIO on a short write, or
+ * the error returned by the quota_write operation. */
+static int __ocfs2_global_write_info(struct super_block *sb, int type)
+{
+        struct mem_dqinfo *info = sb_dqinfo(sb, type);
+        struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+        struct ocfs2_global_disk_dqinfo dinfo;
+        ssize_t size;
+        spin_lock(&dq_data_lock);       /* covers dqi_flags and the grace times */
+        info->dqi_flags &= ~DQF_INFO_DIRTY;
+        dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace);
+        dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
+        spin_unlock(&dq_data_lock);
+        dinfo.dqi_syncms = cpu_to_le32(oinfo->dqi_syncms);
+        dinfo.dqi_blocks = cpu_to_le32(oinfo->dqi_gi.dqi_blocks);
+        dinfo.dqi_free_blk = cpu_to_le32(oinfo->dqi_gi.dqi_free_blk);
+        dinfo.dqi_free_entry = cpu_to_le32(oinfo->dqi_gi.dqi_free_entry);
+        size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
+                                     sizeof(struct ocfs2_global_disk_dqinfo),
+                                     OCFS2_GLOBAL_INFO_OFF);
+        if (size != sizeof(struct ocfs2_global_disk_dqinfo)) {
+                mlog(ML_ERROR, "Cannot write global quota info structure\n");
+                if (size >= 0)  /* short write */
+                        size = -EIO;
+                return size;
+        }
+        return 0;
+}
+/*
+ * Write the global quota info header for @type while holding the quota info
+ * lock exclusively.  Returns the lock error or the result of
+ * __ocfs2_global_write_info().
+ */
+int ocfs2_global_write_info(struct super_block *sb, int type)
+{
+        struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+        int ret;
+        ret = ocfs2_qinfo_lock(oinfo, 1);
+        if (ret < 0)
+                return ret;
+        ret = __ocfs2_global_write_info(sb, type);
+        ocfs2_qinfo_unlock(oinfo, 1);
+        return ret;
+}
+/* Read in information from global quota file and acquire a reference to it.
+ * dquot_acquire() has already started the transaction and locked quota file.
+ *
+ * The entry is read under a shared quota info lock, its use count is bumped
+ * and the current usage is remembered in dq_origspace / dq_originodes.  The
+ * entry is then written back; when it has no on-disk location yet
+ * (dq_off == 0) the exclusive quota info lock is taken on top of the shared
+ * one first, and the info header is rewritten if it became dirty.
+ * Returns a negative error on failure. */
+int ocfs2_global_read_dquot(struct dquot *dquot)
+{
+        int err, err2, ex = 0;
+        struct ocfs2_mem_dqinfo *info =
+                        sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+        err = ocfs2_qinfo_lock(info, 0);
+        if (err < 0)
+                goto out;
+        err = qtree_read_dquot(&info->dqi_gi, dquot);
+        if (err < 0)
+                goto out_qlock;
+        OCFS2_DQUOT(dquot)->dq_use_count++;
+        OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
+        OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
+        if (!dquot->dq_off) {   /* No real quota entry? */
+                /* Upgrade to exclusive lock for allocation */
+                err = ocfs2_qinfo_lock(info, 1);
+                if (err < 0)
+                        goto out_qlock;
+                ex = 1;
+        }
+        err = qtree_write_dquot(&info->dqi_gi, dquot);
+        if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) {
+                err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type);
+                if (!err)       /* keep the first error */
+                        err = err2;
+        }
+out_qlock:
+        if (ex)
+                ocfs2_qinfo_unlock(info, 1);
+        ocfs2_qinfo_unlock(info, 0);
+out:
+        if (err < 0)
+                mlog_errno(err);
+        return err;
+}
+/* Sync local information about quota modifications with global quota file.
+ * Caller must have started the transaction and obtained exclusive lock for
+ * global quota file inode.
+ *
+ * The on-disk entry is re-read, the usage change accumulated since the last
+ * sync (current value minus dq_origspace / dq_originodes) is applied on top
+ * of it, grace times are merged and the result is written back.  With
+ * @freeing set the use count is decremented and the entry is released from
+ * the tree once the count reaches zero.
+ * Returns a negative error on failure. */
+int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
+{
+        int err, err2;
+        struct super_block *sb = dquot->dq_sb;
+        int type = dquot->dq_type;
+        struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
+        struct ocfs2_global_disk_dqblk dqblk;
+        s64 spacechange, inodechange;
+        time_t olditime, oldbtime;
+        err = sb->s_op->quota_read(sb, type, (char *)&dqblk,
+                                   sizeof(struct ocfs2_global_disk_dqblk),
+                                   dquot->dq_off);
+        if (err != sizeof(struct ocfs2_global_disk_dqblk)) {
+                if (err >= 0) {
+                        mlog(ML_ERROR, "Short read from global quota file "
+                                       "(%u read)\n", err);
+                        err = -EIO;
+                }
+                goto out;
+        }
+        /* Update space and inode usage. Get also other information from
+         * global quota file so that we don't overwrite any changes there.
+         * The in-memory copy is merged while holding dq_data_lock. */
+        spin_lock(&dq_data_lock);
+        spacechange = dquot->dq_dqb.dqb_curspace -
+                                        OCFS2_DQUOT(dquot)->dq_origspace;
+        inodechange = dquot->dq_dqb.dqb_curinodes -
+                                        OCFS2_DQUOT(dquot)->dq_originodes;
+        olditime = dquot->dq_dqb.dqb_itime;
+        oldbtime = dquot->dq_dqb.dqb_btime;
+        ocfs2_global_disk2memdqb(dquot, &dqblk);
+        mlog(0, "Syncing global dquot %u space %lld+%lld, inodes %lld+%lld\n",
+             dquot->dq_id, dquot->dq_dqb.dqb_curspace, (long long)spacechange,
+             dquot->dq_dqb.dqb_curinodes, (long long)inodechange);
+        if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
+                dquot->dq_dqb.dqb_curspace += spacechange;
+        if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
+                dquot->dq_dqb.dqb_curinodes += inodechange;
+        /* Set properly space grace time... */
+        if (dquot->dq_dqb.dqb_bsoftlimit &&
+            dquot->dq_dqb.dqb_curspace > dquot->dq_dqb.dqb_bsoftlimit) {
+                if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags) &&
+                    oldbtime > 0) {
+                        if (dquot->dq_dqb.dqb_btime > 0)
+                                dquot->dq_dqb.dqb_btime =
+                                        min(dquot->dq_dqb.dqb_btime, oldbtime);
+                        else
+                                dquot->dq_dqb.dqb_btime = oldbtime;
+                }
+        } else {
+                dquot->dq_dqb.dqb_btime = 0;
+                clear_bit(DQ_BLKS_B, &dquot->dq_flags);
+        }
+        /* Set properly inode grace time... */
+        if (dquot->dq_dqb.dqb_isoftlimit &&
+            dquot->dq_dqb.dqb_curinodes > dquot->dq_dqb.dqb_isoftlimit) {
+                if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags) &&
+                    olditime > 0) {
+                        if (dquot->dq_dqb.dqb_itime > 0)
+                                dquot->dq_dqb.dqb_itime =
+                                        min(dquot->dq_dqb.dqb_itime, olditime);
+                        else
+                                dquot->dq_dqb.dqb_itime = olditime;
+                }
+        } else {
+                dquot->dq_dqb.dqb_itime = 0;
+                clear_bit(DQ_INODES_B, &dquot->dq_flags);
+        }
+        /* All information is properly updated, clear the flags */
+        __clear_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
+        __clear_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
+        __clear_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
+        __clear_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
+        __clear_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
+        __clear_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
+        OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
+        OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
+        spin_unlock(&dq_data_lock);
+        err = ocfs2_qinfo_lock(info, freeing);  /* exclusive only when freeing */
+        if (err < 0) {
+                mlog(ML_ERROR, "Failed to lock quota info, loosing quota write"
+                               " (type=%d, id=%u)\n", dquot->dq_type,
+                               (unsigned)dquot->dq_id);
+                goto out;
+        }
+        if (freeing)
+                OCFS2_DQUOT(dquot)->dq_use_count--;
+        err = qtree_write_dquot(&info->dqi_gi, dquot);
+        if (err < 0)
+                goto out_qlock;
+        if (freeing && !OCFS2_DQUOT(dquot)->dq_use_count) {
+                err = qtree_release_dquot(&info->dqi_gi, dquot);
+                if (info_dirty(sb_dqinfo(sb, type))) {
+                        err2 = __ocfs2_global_write_info(sb, type);
+                        if (!err)       /* keep the first error */
+                                err = err2;
+                }
+        }
+out_qlock:
+        ocfs2_qinfo_unlock(info, freeing);
+out:
+        if (err < 0)
+                mlog_errno(err);
+        return err;
+}
+/*
+ *  Functions for periodic syncing of dquots with global file
+ */
+/*
+ * Callback for dquot_scan_active(): sync one active dquot of quota type
+ * @type with the global quota file and then commit the local structure.
+ * Dquots of another type are ignored.  The value returned is the result of
+ * the local commit; a failed global sync is only logged.
+ */
+static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
+{
+        struct super_block *sb = dquot->dq_sb;
+        struct ocfs2_super *osb = OCFS2_SB(sb);
+        struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+        handle_t *handle;
+        int ret = 0;
+        mlog_entry("id=%u qtype=%u type=%lu device=%s\n", dquot->dq_id,
+                   dquot->dq_type, type, sb->s_id);
+        if (type != dquot->dq_type)
+                goto bail;
+        ret = ocfs2_lock_global_qf(oinfo, 1);
+        if (ret < 0)
+                goto bail;
+        handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS);
+        if (IS_ERR(handle)) {
+                ret = PTR_ERR(handle);
+                mlog_errno(ret);
+                goto unlock_qf;
+        }
+        /* Global sync runs under dqio_mutex */
+        mutex_lock(&sb_dqopt(sb)->dqio_mutex);
+        ret = ocfs2_sync_dquot(dquot);
+        mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
+        if (ret < 0)
+                mlog_errno(ret);
+        /* The local structure has to be written out too */
+        dquot_mark_dquot_dirty(dquot);
+        ret = dquot_commit(dquot);
+        if (ret < 0)
+                mlog_errno(ret);
+        ocfs2_commit_trans(osb, handle);
+unlock_qf:
+        ocfs2_unlock_global_qf(oinfo, 1);
+bail:
+        mlog_exit(ret);
+        return ret;
+}
+/*
+ * Delayed work handler: sync every active dquot of this info's quota type
+ * and re-arm the work to run again after dqi_syncjiff jiffies.
+ */
+static void qsync_work_fn(struct work_struct *work)
+{
+        struct ocfs2_mem_dqinfo *oinfo;
+        struct super_block *sb;
+        oinfo = container_of(work, struct ocfs2_mem_dqinfo,
+                             dqi_sync_work.work);
+        sb = oinfo->dqi_gqinode->i_sb;
+        dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
+        queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
+                           oinfo->dqi_syncjiff);
+}
+/*
+ *  Wrappers for generic quota functions
+ */
+/*
+ * ->write_dquot: commit @dquot via dquot_commit() inside its own
+ * transaction of OCFS2_QWRITE_CREDITS credits.
+ */
+static int ocfs2_write_dquot(struct dquot *dquot)
+{
+        struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
+        handle_t *handle;
+        int ret = 0;
+        mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
+        handle = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS);
+        if (IS_ERR(handle)) {
+                ret = PTR_ERR(handle);
+                mlog_errno(ret);
+                goto bail;
+        }
+        ret = dquot_commit(dquot);
+        ocfs2_commit_trans(osb, handle);
+bail:
+        mlog_exit(ret);
+        return ret;
+}
+/*
+ * Journal credits needed to delete a dquot of @type.  Returns 0 when the
+ * matching quota feature is not enabled on @sb.
+ */
+int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
+{
+        int ro_features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+                                       OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
+        struct ocfs2_mem_dqinfo *oinfo;
+        if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, ro_features[type]))
+                return 0;
+        oinfo = sb_dqinfo(sb, type)->dqi_priv;
+        /* Blocks touched: the tree, a leaf block, the global info, the
+         * local chunk header, plus the global and the local inode */
+        return oinfo->dqi_gi.dqi_qtree_depth + 2 + 1 +
+               2 * OCFS2_INODE_UPDATE_CREDITS;
+}
+/*
+ * ->release_dquot: run dquot_release() under the exclusive global quota file
+ * lock, in a transaction sized by ocfs2_calc_qdel_credits().
+ */
+static int ocfs2_release_dquot(struct dquot *dquot)
+{
+        struct super_block *sb = dquot->dq_sb;
+        struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
+        struct ocfs2_mem_dqinfo *oinfo =
+                        sb_dqinfo(sb, dquot->dq_type)->dqi_priv;
+        handle_t *handle;
+        int ret = 0;
+        mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
+        ret = ocfs2_lock_global_qf(oinfo, 1);
+        if (ret < 0)
+                goto bail;
+        handle = ocfs2_start_trans(osb,
+                ocfs2_calc_qdel_credits(sb, dquot->dq_type));
+        if (IS_ERR(handle)) {
+                ret = PTR_ERR(handle);
+                mlog_errno(ret);
+                goto unlock_qf;
+        }
+        ret = dquot_release(dquot);
+        ocfs2_commit_trans(osb, handle);
+unlock_qf:
+        ocfs2_unlock_global_qf(oinfo, 1);
+bail:
+        mlog_exit(ret);
+        return ret;
+}
+/*
+ * Journal credits needed to initialize a dquot of @type.  Returns 0 when
+ * the matching quota feature is not enabled on @sb.  Reads the inode blocks
+ * cached in dqi_gqi_bh and dqi_lqi_bh.
+ */
+int ocfs2_calc_qinit_credits(struct super_block *sb, int type)
+{
+        int ro_features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+                                       OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
+        struct ocfs2_dinode *local_di, *global_di;
+        struct ocfs2_mem_dqinfo *oinfo;
+        if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, ro_features[type]))
+                return 0;
+        oinfo = sb_dqinfo(sb, type)->dqi_priv;
+        global_di = (struct ocfs2_dinode *)oinfo->dqi_gqi_bh->b_data;
+        local_di = (struct ocfs2_dinode *)oinfo->dqi_lqi_bh->b_data;
+        /* Both the local and the global file may be extended.  The local
+         * file may see changes to its info, a chunk header block and a
+         * dquot block; the global file to its info, tree and leaf block */
+        return ocfs2_calc_extend_credits(sb, &local_di->id2.i_list, 0) +
+               ocfs2_calc_extend_credits(sb, &global_di->id2.i_list, 0) +
+               3 + oinfo->dqi_gi.dqi_qtree_depth + 2;
+}
+/*
+ * ->acquire_dquot: run dquot_acquire() under the exclusive global quota file
+ * lock, in a transaction sized by ocfs2_calc_qinit_credits().
+ */
+static int ocfs2_acquire_dquot(struct dquot *dquot)
+{
+        struct super_block *sb = dquot->dq_sb;
+        struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
+        struct ocfs2_mem_dqinfo *oinfo =
+                        sb_dqinfo(sb, dquot->dq_type)->dqi_priv;
+        handle_t *handle;
+        int ret = 0;
+        mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
+        /* The lock must be exclusive: the use count is updated and a new
+         * dquot structure may have to be instantiated */
+        ret = ocfs2_lock_global_qf(oinfo, 1);
+        if (ret < 0)
+                goto bail;
+        handle = ocfs2_start_trans(osb,
+                ocfs2_calc_qinit_credits(sb, dquot->dq_type));
+        if (IS_ERR(handle)) {
+                ret = PTR_ERR(handle);
+                mlog_errno(ret);
+                goto unlock_qf;
+        }
+        ret = dquot_acquire(dquot);
+        ocfs2_commit_trans(osb, handle);
+unlock_qf:
+        ocfs2_unlock_global_qf(oinfo, 1);
+bail:
+        mlog_exit(ret);
+        return ret;
+}
+/*
+ * ->mark_dirty: mark @dquot dirty and write it out.  When any DQ_LASTSET_B
+ * bit is set and no transaction is running, the dquot is synced with the
+ * global quota file right away; otherwise only the local structure is
+ * written through ocfs2_write_dquot().
+ */
+static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
+{
+        unsigned long lastset_mask = (1 << (DQ_LASTSET_B + QIF_ILIMITS_B)) |
+                                     (1 << (DQ_LASTSET_B + QIF_BLIMITS_B)) |
+                                     (1 << (DQ_LASTSET_B + QIF_INODES_B)) |
+                                     (1 << (DQ_LASTSET_B + QIF_SPACE_B)) |
+                                     (1 << (DQ_LASTSET_B + QIF_BTIME_B)) |
+                                     (1 << (DQ_LASTSET_B + QIF_ITIME_B));
+        struct super_block *sb = dquot->dq_sb;
+        struct ocfs2_super *osb = OCFS2_SB(sb);
+        int type = dquot->dq_type;
+        struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+        handle_t *handle;
+        int need_sync = 0;
+        int ret;
+        mlog_entry("id=%u, type=%d", dquot->dq_id, type);
+        dquot_mark_dquot_dirty(dquot);
+        /* If the user set some limits, push the dquot to the global quota
+         * file immediately so the information propagates quicker */
+        spin_lock(&dq_data_lock);
+        if (dquot->dq_flags & lastset_mask)
+                need_sync = 1;
+        spin_unlock(&dq_data_lock);
+        /* Slight hack: the global quota lock cannot be taken once a
+         * transaction is already running. */
+        if (!need_sync || journal_current_handle()) {
+                ret = ocfs2_write_dquot(dquot);
+                goto bail;
+        }
+        ret = ocfs2_lock_global_qf(oinfo, 1);
+        if (ret < 0)
+                goto bail;
+        handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS);
+        if (IS_ERR(handle)) {
+                ret = PTR_ERR(handle);
+                mlog_errno(ret);
+                goto unlock_qf;
+        }
+        ret = ocfs2_sync_dquot(dquot);
+        if (ret < 0) {
+                mlog_errno(ret);
+                goto commit;
+        }
+        /* Then write the updated local dquot structure */
+        ret = dquot_commit(dquot);
+commit:
+        ocfs2_commit_trans(osb, handle);
+unlock_qf:
+        ocfs2_unlock_global_qf(oinfo, 1);
+bail:
+        mlog_exit(ret);
+        return ret;
+}
+/*
+ * ->write_info: commit the quota info of @type with dquot_commit_info()
+ * under the exclusive global quota file lock, in a transaction of
+ * OCFS2_QINFO_WRITE_CREDITS credits.
+ * This should happen only after set_dqinfo().
+ */
+static int ocfs2_write_info(struct super_block *sb, int type)
+{
+        struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+        handle_t *handle;
+        int ret = 0;
+        mlog_entry_void();
+        ret = ocfs2_lock_global_qf(oinfo, 1);
+        if (ret < 0)
+                goto bail;
+        handle = ocfs2_start_trans(OCFS2_SB(sb), OCFS2_QINFO_WRITE_CREDITS);
+        if (IS_ERR(handle)) {
+                ret = PTR_ERR(handle);
+                mlog_errno(ret);
+                goto unlock_qf;
+        }
+        ret = dquot_commit_info(sb, type);
+        ocfs2_commit_trans(OCFS2_SB(sb), handle);
+unlock_qf:
+        ocfs2_unlock_global_qf(oinfo, 1);
+bail:
+        mlog_exit(ret);
+        return ret;
+}
+/*
+ * ->alloc_dquot: allocate a zeroed struct ocfs2_dquot from
+ * ocfs2_dquot_cachep and return its embedded generic dquot, or NULL when
+ * the allocation fails.
+ */
+static struct dquot *ocfs2_alloc_dquot(struct super_block *sb, int type)
+{
+        struct ocfs2_dquot *odquot;
+        odquot = kmem_cache_zalloc(ocfs2_dquot_cachep, GFP_NOFS);
+        return odquot ? &odquot->dq_dquot : NULL;
+}
+/*
+ * ->destroy_dquot: free a dquot obtained from ocfs2_alloc_dquot().
+ * The object that was allocated is the enclosing struct ocfs2_dquot, so
+ * convert back to it with OCFS2_DQUOT() rather than relying on dq_dquot
+ * being located at offset 0 of that structure.
+ */
+static void ocfs2_destroy_dquot(struct dquot *dquot)
+{
+        kmem_cache_free(ocfs2_dquot_cachep, OCFS2_DQUOT(dquot));
+}
+struct dquot_operations ocfs2_quota_operations = {      /* generic dquot_* helpers plus the ocfs2 wrappers above */
+        .initialize     = dquot_initialize,
+        .drop           = dquot_drop,
+        .alloc_space    = dquot_alloc_space,
+        .alloc_inode    = dquot_alloc_inode,
+        .free_space     = dquot_free_space,
+        .free_inode     = dquot_free_inode,
+        .transfer       = dquot_transfer,
+        .write_dquot    = ocfs2_write_dquot,            /* dquot_commit() in its own transaction */
+        .acquire_dquot  = ocfs2_acquire_dquot,          /* dquot_acquire() under the global quota file lock */
+        .release_dquot  = ocfs2_release_dquot,          /* dquot_release() under the global quota file lock */
+        .mark_dirty     = ocfs2_mark_dquot_dirty,       /* may sync with the global file immediately */
+        .write_info     = ocfs2_write_info,             /* dquot_commit_info() under the global quota file lock */
+        .alloc_dquot    = ocfs2_alloc_dquot,            /* allocates from ocfs2_dquot_cachep */
+        .destroy_dquot  = ocfs2_destroy_dquot,          /* frees back to ocfs2_dquot_cachep */
+};
+/*
+ * Create the "o2quot" workqueue used for the periodic quota sync work.
+ * Returns 0 on success or -ENOMEM when the workqueue cannot be created.
+ */
+int ocfs2_quota_setup(void)
+{
+        ocfs2_quota_wq = create_workqueue("o2quot");
+        return ocfs2_quota_wq ? 0 : -ENOMEM;
+}
+/*
+ * Flush and destroy the quota workqueue, if one was created, and clear the
+ * pointer so a repeated call does nothing.
+ */
+void ocfs2_quota_shutdown(void)
+{
+        if (!ocfs2_quota_wq)
+                return;
+        flush_workqueue(ocfs2_quota_wq);
+        destroy_workqueue(ocfs2_quota_wq);
+        ocfs2_quota_wq = NULL;
+}
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
new file mode 100644
index 000000000000..07deec5e9721
--- /dev/null
+++ b/fs/ocfs2/quota_local.c
@@ -0,0 +1,1253 @@
+/*
+ *  Implementation of operations over local quota file
+ */
+#include <linux/fs.h>
+#include <linux/quota.h>
+#include <linux/quotaops.h>
+#include <linux/module.h>
+#define MLOG_MASK_PREFIX ML_QUOTA
+#include <cluster/masklog.h>
+#include "ocfs2_fs.h"
+#include "ocfs2.h"
+#include "inode.h"
+#include "alloc.h"
+#include "file.h"
+#include "buffer_head_io.h"
+#include "journal.h"
+#include "sysfile.h"
+#include "dlmglue.h"
+#include "quota.h"
+/*
+ * How many local quota entries (struct ocfs2_local_disk_dqblk) fit into one
+ * block once OCFS2_QBLK_RESERVED_SPACE bytes are set aside.
+ */
+static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
+{
+        const size_t entry_size = sizeof(struct ocfs2_local_disk_dqblk);
+        return (sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) / entry_size;
+}
+/*
+ * Number of entry blocks in one chunk: the bytes left in a block after the
+ * chunk header and the reserved space, counted in bits, divided by the
+ * number of entries per block.
+ */
+static inline unsigned int ol_chunk_blocks(struct super_block *sb)
+{
+        unsigned int epb = ol_quota_entries_per_block(sb);
+        return ((sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
+                 OCFS2_QBLK_RESERVED_SPACE) * 8) / epb;
+}
+/* Total number of quota entries in one chunk (blocks * entries per block) */
+static unsigned int ol_chunk_entries(struct super_block *sb)
+{
+        unsigned int blocks = ol_chunk_blocks(sb);
+        unsigned int per_block = ol_quota_entries_per_block(sb);
+        return blocks * per_block;
+}
+/* Block number, within the quota file, of the header block of chunk 'c' */
+static unsigned int ol_quota_chunk_block(struct super_block *sb, int c)
+{
+        /* Every chunk occupies its entry blocks plus one chunk info block */
+        unsigned int chunk_span = ol_chunk_blocks(sb) + 1;
+        /* The leading 1 skips the block holding the local quota file info */
+        return chunk_span * c + 1;
+}
+/* Block number, within the quota file, holding entry 'off' of chunk 'c' */
+static unsigned int ol_dqblk_block(struct super_block *sb, int c, int off)
+{
+        int entries_per_block = ol_quota_entries_per_block(sb);
+        /* Entry blocks start right after the chunk's header block */
+        unsigned int first_entry_block = ol_quota_chunk_block(sb, c) + 1;
+        return first_entry_block + off / entries_per_block;
+}
+/* Byte offset of entry 'off' inside its block; 'c' is not used here */
+static unsigned int ol_dqblk_block_off(struct super_block *sb, int c, int off)
+{
+        int slot = off % (int)ol_quota_entries_per_block(sb);
+        return slot * sizeof(struct ocfs2_local_disk_dqblk);
+}
+/*
+ * Byte offset, within the quota file, of entry 'off' of chunk 'c'.
+ * The block number is widened to loff_t before it is shifted so that the
+ * conversion to a byte offset is not carried out (and truncated) in
+ * unsigned int arithmetic.
+ */
+static loff_t ol_dqblk_off(struct super_block *sb, int c, int off)
+{
+        return ((loff_t)ol_dqblk_block(sb, c, off) << sb->s_blocksize_bits) +
+               ol_dqblk_block_off(sb, c, off);
+}
+/* Block number containing the given byte offset of the quota file */
+static inline unsigned int ol_dqblk_file_block(struct super_block *sb, loff_t off)
+{
+        loff_t block = off >> sb->s_blocksize_bits;
+        return block;
+}
+/* Position of the given byte offset within its block */
+static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off)
+{
+        int in_block_mask = (1 << sb->s_blocksize_bits) - 1;
+        return off & in_block_mask;
+}
+/*
+ * Index, within chunk 'c', of the entry stored at byte offset 'off' of the
+ * quota file.
+ */
+static int ol_dqblk_chunk_off(struct super_block *sb, int c, loff_t off)
+{
+        int epb = ol_quota_entries_per_block(sb);
+        /* Entry blocks preceding this one in the chunk (header block skipped) */
+        loff_t blk_in_chunk = (off >> sb->s_blocksize_bits) -
+                              ol_quota_chunk_block(sb, c) - 1;
+        /* Byte position inside the block */
+        unsigned int byte_in_blk = off & ((1 << sb->s_blocksize_bits) - 1);
+        return blk_in_chunk * epb +
+               byte_in_blk / sizeof(struct ocfs2_local_disk_dqblk);
+}
+/*
+ * Apply 'modify' to buffer 'bh' inside a one-credit transaction: start the
+ * transaction, get journal write access, run the callback under the buffer
+ * lock, mark the buffer dirty in the journal and commit.
+ * Returns 0 on success or the negative status of the step that failed.
+ */
+static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
+                void (*modify)(struct buffer_head *, void *), void *private)
+{
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        handle_t *handle;
+        int status;
+        handle = ocfs2_start_trans(osb, 1);
+        if (IS_ERR(handle)) {
+                status = PTR_ERR(handle);
+                mlog_errno(status);
+                return status;
+        }
+        status = ocfs2_journal_access_dq(handle, inode, bh,
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out_commit;
+        }
+        lock_buffer(bh);
+        modify(bh, private);
+        unlock_buffer(bh);
+        status = ocfs2_journal_dirty(handle, bh);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out_commit;
+        }
+        status = ocfs2_commit_trans(osb, handle);
+        if (status < 0) {
+                mlog_errno(status);
+                return status;
+        }
+        return 0;
+out_commit:
+        /* Close the transaction; report the earlier failure, not this result */
+        ocfs2_commit_trans(osb, handle);
+        return status;
+}
+/*
+ * Check whether we understand format of quota files.
+ * Reads block 0 of the local quota file and then of the global quota file
+ * for 'type' and compares the magic and version found in each header with
+ * the expected values.
+ * Returns 1 when both headers match, 0 on any read failure or mismatch.
+ */
+static int ocfs2_local_check_quota_file(struct super_block *sb, int type)
+{
+        unsigned int lmagics[MAXQUOTAS] = OCFS2_LOCAL_QMAGICS;
+        unsigned int lversions[MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS;
+        unsigned int gmagics[MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS;
+        unsigned int gversions[MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS;
+        unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
+                                        GROUP_QUOTA_SYSTEM_INODE };
+        struct buffer_head *bh = NULL;
+        struct inode *linode = sb_dqopt(sb)->files[type];
+        struct inode *ginode = NULL;
+        struct ocfs2_disk_dqheader *dqhead;
+        int status, ret = 0;
+        /* First check whether we understand local quota file */
+        status = ocfs2_read_quota_block(linode, 0, &bh);
+        if (status) {
+                mlog_errno(status);
+                mlog(ML_ERROR, "failed to read quota file header (type=%d)\n",
+                        type);
+                goto out_err;
+        }
+        dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data);
+        if (le32_to_cpu(dqhead->dqh_magic) != lmagics[type]) {
+                mlog(ML_ERROR, "quota file magic does not match (%u != %u),"
+                        " type=%d\n", le32_to_cpu(dqhead->dqh_magic),
+                        lmagics[type], type);
+                goto out_err;
+        }
+        if (le32_to_cpu(dqhead->dqh_version) != lversions[type]) {
+                mlog(ML_ERROR, "quota file version does not match (%u != %u),"
+                        " type=%d\n", le32_to_cpu(dqhead->dqh_version),
+                        lversions[type], type);
+                goto out_err;
+        }
+        /* Drop the local header; bh is reused for the global header below */
+        brelse(bh);
+        bh = NULL;
+        /* Next check whether we understand global quota file */
+        ginode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
+                                                OCFS2_INVALID_SLOT);
+        if (!ginode) {
+                mlog(ML_ERROR, "cannot get global quota file inode "
+                                "(type=%d)\n", type);
+                goto out_err;
+        }
+        /* Since the header is read only, we don't care about locking */
+        status = ocfs2_read_quota_block(ginode, 0, &bh);
+        if (status) {
+                mlog_errno(status);
+                mlog(ML_ERROR, "failed to read global quota file header "
+                                "(type=%d)\n", type);
+                goto out_err;
+        }
+        dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data);
+        if (le32_to_cpu(dqhead->dqh_magic) != gmagics[type]) {
+                mlog(ML_ERROR, "global quota file magic does not match "
+                        "(%u != %u), type=%d\n",
+                        le32_to_cpu(dqhead->dqh_magic), gmagics[type], type);
+                goto out_err;
+        }
+        if (le32_to_cpu(dqhead->dqh_version) != gversions[type]) {
+                mlog(ML_ERROR, "global quota file version does not match "
+                        "(%u != %u), type=%d\n",
+                        le32_to_cpu(dqhead->dqh_version), gversions[type],
+                        type);
+                goto out_err;
+        }
+        ret = 1;
+out_err:
+        /* Shared exit for success and failure; bh and ginode may be NULL */
+        brelse(bh);
+        iput(ginode);
+        return ret;
+}
+/*
+ * Empty the given list of quota file chunks: unlink each chunk, release its
+ * header buffer and return the chunk to ocfs2_qf_chunk_cachep.
+ */
+static void ocfs2_release_local_quota_bitmaps(struct list_head *head)
+{
+        struct ocfs2_quota_chunk *chunk;
+        while (!list_empty(head)) {
+                chunk = list_entry(head->next, struct ocfs2_quota_chunk,
+                                   qc_chunk);
+                list_del(&chunk->qc_chunk);
+                brelse(chunk->qc_headerbh);
+                kmem_cache_free(ocfs2_qf_chunk_cachep, chunk);
+        }
+}
+/*
+ * Build, on 'head', one ocfs2_quota_chunk per chunk recorded in 'ldinfo',
+ * each holding the header block of that chunk.
+ * Returns 0 on success. On failure everything added so far is released and
+ * -ENOMEM or the block read status is returned.
+ */
+static int ocfs2_load_local_quota_bitmaps(struct inode *inode,
+                        struct ocfs2_local_disk_dqinfo *ldinfo,
+                        struct list_head *head)
+{
+        struct ocfs2_quota_chunk *chunk;
+        int idx, status;
+        INIT_LIST_HEAD(head);
+        for (idx = 0; idx < le32_to_cpu(ldinfo->dqi_chunks); idx++) {
+                chunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS);
+                if (!chunk) {
+                        status = -ENOMEM;
+                        goto out_release;
+                }
+                chunk->qc_num = idx;
+                chunk->qc_headerbh = NULL;
+                status = ocfs2_read_quota_block(inode,
+                                ol_quota_chunk_block(inode->i_sb, idx),
+                                &chunk->qc_headerbh);
+                if (status) {
+                        mlog_errno(status);
+                        /* Not on the list yet, so free it by hand */
+                        kmem_cache_free(ocfs2_qf_chunk_cachep, chunk);
+                        goto out_release;
+                }
+                list_add_tail(&chunk->qc_chunk, head);
+        }
+        return 0;
+out_release:
+        ocfs2_release_local_quota_bitmaps(head);
+        return status;
+}
+/*
+ * Buffer modification callback: copy the in-memory flags (masked with
+ * DQF_MASK), chunk count and block count into the on-disk info structure
+ * located at OCFS2_LOCAL_INFO_OFF in 'bh'. 'private' is the mem_dqinfo.
+ */
+static void olq_update_info(struct buffer_head *bh, void *private)
+{
+        struct mem_dqinfo *minfo = private;
+        struct ocfs2_mem_dqinfo *ominfo = minfo->dqi_priv;
+        struct ocfs2_local_disk_dqinfo *disk;
+        disk = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+                                                OCFS2_LOCAL_INFO_OFF);
+        spin_lock(&dq_data_lock);
+        disk->dqi_flags = cpu_to_le32(minfo->dqi_flags & DQF_MASK);
+        disk->dqi_chunks = cpu_to_le32(ominfo->dqi_chunks);
+        disk->dqi_blocks = cpu_to_le32(ominfo->dqi_blocks);
+        spin_unlock(&dq_data_lock);
+}
+/*
+ * Remember chunk number 'chunk' for recovery: allocate a recovery record
+ * with a block-sized bitmap buffer, copy the chunk's on-disk bitmap into it
+ * and append the record to 'head'.
+ * Returns 0 on success or -ENOMEM.
+ */
+static int ocfs2_add_recovery_chunk(struct super_block *sb,
+                                    struct ocfs2_local_disk_chunk *dchunk,
+                                    int chunk,
+                                    struct list_head *head)
+{
+        struct ocfs2_recovery_chunk *rchunk;
+        /* Bytes needed to hold one bit per entry of a chunk */
+        size_t bitmap_bytes = (ol_chunk_entries(sb) + 7) >> 3;
+        rchunk = kmalloc(sizeof(*rchunk), GFP_NOFS);
+        if (!rchunk)
+                return -ENOMEM;
+        rchunk->rc_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
+        if (!rchunk->rc_bitmap) {
+                kfree(rchunk);
+                return -ENOMEM;
+        }
+        rchunk->rc_chunk = chunk;
+        memcpy(rchunk->rc_bitmap, dchunk->dqc_bitmap, bitmap_bytes);
+        list_add_tail(&rchunk->rc_list, head);
+        return 0;
+}
+/* Unlink and free every recovery chunk (and its bitmap) on the list */
+static void free_recovery_list(struct list_head *head)
+{
+        struct ocfs2_recovery_chunk *rc;
+        while (!list_empty(head)) {
+                rc = list_entry(head->next, struct ocfs2_recovery_chunk,
+                                rc_list);
+                list_del(&rc->rc_list);
+                kfree(rc->rc_bitmap);
+                kfree(rc);
+        }
+}
+/* Free the recovery lists of all quota types and then 'rec' itself */
+void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec)
+{
+        int qtype;
+        for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
+                struct list_head *chunks = &rec->r_list[qtype];
+                free_recovery_list(chunks);
+        }
+        kfree(rec);
+}
+/*
+ * Load the entries of our quota file that we have to recover.
+ * Walks all chunks recorded in 'ldinfo' and, for every chunk whose free
+ * count is below the chunk capacity, adds a recovery record to 'head'.
+ * Returns the last status; when it is negative 'head' has been emptied.
+ */
+static int ocfs2_recovery_load_quota(struct inode *lqinode,
+                                     struct ocfs2_local_disk_dqinfo *ldinfo,
+                                     int type,
+                                     struct list_head *head)
+{
+        struct super_block *sb = lqinode->i_sb;
+        int nchunks = le32_to_cpu(ldinfo->dqi_chunks);
+        int status = 0;
+        int idx;
+        for (idx = 0; idx < nchunks; idx++) {
+                struct buffer_head *header_bh = NULL;
+                struct ocfs2_local_disk_chunk *disk_chunk;
+                status = ocfs2_read_quota_block(lqinode,
+                                                ol_quota_chunk_block(sb, idx),
+                                                &header_bh);
+                if (status) {
+                        mlog_errno(status);
+                        break;
+                }
+                disk_chunk = (struct ocfs2_local_disk_chunk *)header_bh->b_data;
+                /* Fewer free entries than the capacity: some are in use */
+                if (le32_to_cpu(disk_chunk->dqc_free) < ol_chunk_entries(sb))
+                        status = ocfs2_add_recovery_chunk(sb, disk_chunk, idx,
+                                                          head);
+                brelse(header_bh);
+                if (status < 0)
+                        break;
+        }
+        if (status < 0)
+                free_recovery_list(head);
+        return status;
+}
+/*
+ * Allocate a quota recovery structure with an empty chunk list for every
+ * quota type. Returns NULL if the allocation fails.
+ */
+static struct ocfs2_quota_recovery *ocfs2_alloc_quota_recovery(void)
+{
+        struct ocfs2_quota_recovery *rec = kmalloc(sizeof(*rec), GFP_NOFS);
+        int qtype;
+        if (rec) {
+                for (qtype = 0; qtype < MAXQUOTAS; qtype++)
+                        INIT_LIST_HEAD(&rec->r_list[qtype]);
+        }
+        return rec;
+}
+/*
+ * Load information we need for quota recovery into memory.
+ * For every quota type whose RO-compat feature is set, the local quota file
+ * of slot 'slot_num' is locked, its info header is read and the chunks with
+ * used entries are recorded in the returned recovery structure.
+ * Returns the recovery structure, or an ERR_PTR() on failure (in which case
+ * everything collected so far has been freed).
+ */
+struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
+                                                struct ocfs2_super *osb,
+                                                int slot_num)
+{
+        unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+                                            OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+        unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+                                        LOCAL_GROUP_QUOTA_SYSTEM_INODE };
+        struct super_block *sb = osb->sb;
+        struct ocfs2_local_disk_dqinfo *ldinfo;
+        struct inode *lqinode;
+        struct buffer_head *bh;
+        int type;
+        int status = 0;
+        struct ocfs2_quota_recovery *rec;
+        mlog(ML_NOTICE, "Beginning quota recovery in slot %u\n", slot_num);
+        rec = ocfs2_alloc_quota_recovery();
+        if (!rec)
+                return ERR_PTR(-ENOMEM);
+        /* First init... */
+        for (type = 0; type < MAXQUOTAS; type++) {
+                if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+                        continue;
+                /* At this point, journal of the slot is already replayed so
+                 * we can trust metadata and data of the quota file */
+                lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
+                if (!lqinode) {
+                        status = -ENOENT;
+                        goto out;
+                }
+                status = ocfs2_inode_lock_full(lqinode, NULL, 1,
+                                               OCFS2_META_LOCK_RECOVERY);
+                if (status < 0) {
+                        mlog_errno(status);
+                        goto out_put;
+                }
+                /* Now read local header */
+                bh = NULL;
+                status = ocfs2_read_quota_block(lqinode, 0, &bh);
+                if (status) {
+                        mlog_errno(status);
+                        mlog(ML_ERROR, "failed to read quota file info header "
+                                "(slot=%d type=%d)\n", slot_num, type);
+                        goto out_lock;
+                }
+                ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+                                                        OCFS2_LOCAL_INFO_OFF);
+                status = ocfs2_recovery_load_quota(lqinode, ldinfo, type,
+                                                   &rec->r_list[type]);
+                brelse(bh);
+                /* The labels below sit inside the loop body: they undo the
+                 * per-type lock and inode reference on every iteration */
+out_lock:
+                ocfs2_inode_unlock(lqinode, 1);
+out_put:
+                iput(lqinode);
+                if (status < 0)
+                        break;
+        }
+out:
+        if (status < 0) {
+                ocfs2_free_quota_recovery(rec);
+                rec = ERR_PTR(status);
+        }
+        return rec;
+}
+/* Sync changes in local quota file into global quota file and
+ * reinitialize local quota file.
+ * The function expects local quota file to be already locked and
+ * dqonoff_mutex locked.
+ * Recovery chunks of 'type' are removed from 'rec' and freed as they are
+ * processed; on failure the rest of that list is freed as well.
+ * Returns 0 on success or a negative status. */
+static int ocfs2_recover_local_quota_file(struct inode *lqinode,
+                                          int type,
+                                          struct ocfs2_quota_recovery *rec)
+{
+        struct super_block *sb = lqinode->i_sb;
+        struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+        struct ocfs2_local_disk_chunk *dchunk;
+        struct ocfs2_local_disk_dqblk *dqblk;
+        struct dquot *dquot;
+        handle_t *handle;
+        struct buffer_head *hbh = NULL, *qbh = NULL;
+        int status = 0;
+        int bit, chunk;
+        struct ocfs2_recovery_chunk *rchunk, *next;
+        qsize_t spacechange, inodechange;
+        mlog_entry("ino=%lu type=%u", (unsigned long)lqinode->i_ino, type);
+        status = ocfs2_lock_global_qf(oinfo, 1);
+        if (status < 0)
+                goto out;
+        list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) {
+                chunk = rchunk->rc_chunk;
+                /* Read the header block of the chunk being recovered */
+                hbh = NULL;
+                status = ocfs2_read_quota_block(lqinode,
+                                                ol_quota_chunk_block(sb, chunk),
+                                                &hbh);
+                if (status) {
+                        mlog_errno(status);
+                        break;
+                }
+                dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
+                /* Visit every entry marked used in the saved bitmap */
+                for_each_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) {
+                        qbh = NULL;
+                        status = ocfs2_read_quota_block(lqinode,
+                                                ol_dqblk_block(sb, chunk, bit),
+                                                &qbh);
+                        if (status) {
+                                mlog_errno(status);
+                                break;
+                        }
+                        dqblk = (struct ocfs2_local_disk_dqblk *)(qbh->b_data +
+                                ol_dqblk_block_off(sb, chunk, bit));
+                        dquot = dqget(sb, le64_to_cpu(dqblk->dqb_id), type);
+                        if (!dquot) {
+                                status = -EIO;
+                                mlog(ML_ERROR, "Failed to get quota structure "
+                                     "for id %u, type %d. Cannot finish quota "
+                                     "file recovery.\n",
+                                     (unsigned)le64_to_cpu(dqblk->dqb_id),
+                                     type);
+                                goto out_put_bh;
+                        }
+                        handle = ocfs2_start_trans(OCFS2_SB(sb),
+                                                   OCFS2_QSYNC_CREDITS);
+                        if (IS_ERR(handle)) {
+                                status = PTR_ERR(handle);
+                                mlog_errno(status);
+                                goto out_put_dquot;
+                        }
+                        mutex_lock(&sb_dqopt(sb)->dqio_mutex);
+                        spin_lock(&dq_data_lock);
+                        /* Add usage from quota entry into quota changes
+                         * of our node. Auxiliary variables are important
+                         * due to signedness */
+                        spacechange = le64_to_cpu(dqblk->dqb_spacemod);
+                        inodechange = le64_to_cpu(dqblk->dqb_inodemod);
+                        dquot->dq_dqb.dqb_curspace += spacechange;
+                        dquot->dq_dqb.dqb_curinodes += inodechange;
+                        spin_unlock(&dq_data_lock);
+                        /* We want to drop reference held by the crashed
+                         * node. Since we have our own reference we know
+                         * global structure actually won't be freed. */
+                        status = ocfs2_global_release_dquot(dquot);
+                        if (status < 0) {
+                                mlog_errno(status);
+                                goto out_commit;
+                        }
+                        /* Release local quota file entry */
+                        /* NOTE(review): dchunk points into hbh, yet journal
+                         * access, the buffer lock and the journal dirty call
+                         * below all use qbh - confirm this is intended. */
+                        status = ocfs2_journal_access_dq(handle, lqinode,
+                                        qbh, OCFS2_JOURNAL_ACCESS_WRITE);
+                        if (status < 0) {
+                                mlog_errno(status);
+                                goto out_commit;
+                        }
+                        lock_buffer(qbh);
+                        WARN_ON(!ocfs2_test_bit(bit, dchunk->dqc_bitmap));
+                        ocfs2_clear_bit(bit, dchunk->dqc_bitmap);
+                        le32_add_cpu(&dchunk->dqc_free, 1);
+                        unlock_buffer(qbh);
+                        status = ocfs2_journal_dirty(handle, qbh);
+                        if (status < 0)
+                                mlog_errno(status);
+                        /* The labels below are inside the per-entry loop and
+                         * are also reached on the success path */
+out_commit:
+                        mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
+                        ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out_put_dquot:
+                        dqput(dquot);
+out_put_bh:
+                        brelse(qbh);
+                        if (status < 0)
+                                break;
+                }
+                brelse(hbh);
+                /* This chunk is done with, whether or not it succeeded */
+                list_del(&rchunk->rc_list);
+                kfree(rchunk->rc_bitmap);
+                kfree(rchunk);
+                if (status < 0)
+                        break;
+        }
+        ocfs2_unlock_global_qf(oinfo, 1);
+out:
+        if (status < 0)
+                free_recovery_list(&(rec->r_list[type]));
+        mlog_exit(status);
+        return status;
+}
+/*
+ * Recover local quota files of slot 'slot_num' from the data gathered in
+ * 'rec'.
+ * For every quota type with a non-empty recovery list the local quota file
+ * is locked without queueing (a busy lock means the file is skipped), its
+ * info header is read, recovery is run unless the file is already marked
+ * clean, and - when the slot is not our own - the file is marked clean.
+ * 'rec' is always consumed: it and any chunks still attached to it are freed
+ * before returning.
+ * Returns 0 on success or a negative status.
+ */
+int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
+                                struct ocfs2_quota_recovery *rec,
+                                int slot_num)
+{
+        unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+                                        LOCAL_GROUP_QUOTA_SYSTEM_INODE };
+        struct super_block *sb = osb->sb;
+        struct ocfs2_local_disk_dqinfo *ldinfo;
+        struct buffer_head *bh;
+        handle_t *handle;
+        int type;
+        int status = 0;
+        struct inode *lqinode;
+        unsigned int flags;
+        mlog(ML_NOTICE, "Finishing quota recovery in slot %u\n", slot_num);
+        mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
+        for (type = 0; type < MAXQUOTAS; type++) {
+                if (list_empty(&(rec->r_list[type])))
+                        continue;
+                mlog(0, "Recovering quota in slot %d\n", slot_num);
+                lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
+                if (!lqinode) {
+                        status = -ENOENT;
+                        goto out;
+                }
+                status = ocfs2_inode_lock_full(lqinode, NULL, 1,
+                                                       OCFS2_META_LOCK_NOQUEUE);
+                /* Someone else is holding the lock? Then he must be
+                 * doing the recovery. Just skip the file... */
+                if (status == -EAGAIN) {
+                        mlog(ML_NOTICE, "skipping quota recovery for slot %d "
+                             "because quota file is locked.\n", slot_num);
+                        status = 0;
+                        goto out_put;
+                } else if (status < 0) {
+                        mlog_errno(status);
+                        goto out_put;
+                }
+                /* Now read local header */
+                bh = NULL;
+                status = ocfs2_read_quota_block(lqinode, 0, &bh);
+                if (status) {
+                        mlog_errno(status);
+                        mlog(ML_ERROR, "failed to read quota file info header "
+                                "(slot=%d type=%d)\n", slot_num, type);
+                        goto out_lock;
+                }
+                ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+                                                        OCFS2_LOCAL_INFO_OFF);
+                /* Is recovery still needed? */
+                flags = le32_to_cpu(ldinfo->dqi_flags);
+                if (!(flags & OLQF_CLEAN))
+                        status = ocfs2_recover_local_quota_file(lqinode,
+                                                                type,
+                                                                rec);
+                /* We don't want to mark file as clean when it is actually
+                 * active */
+                if (slot_num == osb->slot_num)
+                        goto out_bh;
+                /* Mark quota file as clean if we are recovering quota file of
+                 * some other node. */
+                /* NOTE(review): a failure status from the recovery above is
+                 * overwritten below and the file is still marked clean -
+                 * confirm that this is intended. */
+                handle = ocfs2_start_trans(osb, 1);
+                if (IS_ERR(handle)) {
+                        status = PTR_ERR(handle);
+                        mlog_errno(status);
+                        goto out_bh;
+                }
+                status = ocfs2_journal_access_dq(handle, lqinode, bh,
+                                                 OCFS2_JOURNAL_ACCESS_WRITE);
+                if (status < 0) {
+                        mlog_errno(status);
+                        goto out_trans;
+                }
+                lock_buffer(bh);
+                ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN);
+                unlock_buffer(bh);
+                status = ocfs2_journal_dirty(handle, bh);
+                if (status < 0)
+                        mlog_errno(status);
+out_trans:
+                ocfs2_commit_trans(osb, handle);
+out_bh:
+                brelse(bh);
+out_lock:
+                ocfs2_inode_unlock(lqinode, 1);
+out_put:
+                iput(lqinode);
+                if (status < 0)
+                        break;
+        }
+out:
+        mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+        /* Recovered chunks were already unlinked from rec, but a type that
+         * was skipped (file locked, already clean, header unreadable) or
+         * never reached because of an earlier error still has chunks on its
+         * list. Free those together with rec instead of leaking them. */
+        ocfs2_free_quota_recovery(rec);
+        return status;
+}
+/*
+ * Read information header from the local quota file and set up the
+ * in-memory ocfs2 quota info for 'type'.
+ * Allocates the ocfs2_mem_dqinfo, reads the global info, locks the local
+ * quota file, reads its info header, records chunks needing recovery if the
+ * file was not marked clean, loads the chunk bitmaps and finally clears the
+ * clean flag on disk to mark the file as in use.
+ * On success the inode lock stays held and the header buffer is kept in
+ * dqi_ibh; ocfs2_local_free_info() drops both.
+ * Returns 0 on success and -1 on any failure.
+ */
+static int ocfs2_local_read_info(struct super_block *sb, int type)
+{
+        struct ocfs2_local_disk_dqinfo *ldinfo;
+        struct mem_dqinfo *info = sb_dqinfo(sb, type);
+        struct ocfs2_mem_dqinfo *oinfo;
+        struct inode *lqinode = sb_dqopt(sb)->files[type];
+        int status;
+        struct buffer_head *bh = NULL;
+        struct ocfs2_quota_recovery *rec;
+        int locked = 0;
+        info->dqi_maxblimit = 0x7fffffffffffffffLL;
+        info->dqi_maxilimit = 0x7fffffffffffffffLL;
+        oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS);
+        if (!oinfo) {
+                mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota"
+                               " info.\n");
+                goto out_err;
+        }
+        info->dqi_priv = oinfo;
+        oinfo->dqi_type = type;
+        INIT_LIST_HEAD(&oinfo->dqi_chunk);
+        oinfo->dqi_rec = NULL;
+        oinfo->dqi_lqi_bh = NULL;
+        oinfo->dqi_ibh = NULL;
+        status = ocfs2_global_read_info(sb, type);
+        if (status < 0)
+                goto out_err;
+        status = ocfs2_inode_lock(lqinode, &oinfo->dqi_lqi_bh, 1);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out_err;
+        }
+        locked = 1;
+        /* Now read local header */
+        status = ocfs2_read_quota_block(lqinode, 0, &bh);
+        if (status) {
+                mlog_errno(status);
+                mlog(ML_ERROR, "failed to read quota file info header "
+                        "(type=%d)\n", type);
+                goto out_err;
+        }
+        ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+                                                OCFS2_LOCAL_INFO_OFF);
+        info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
+        oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks);
+        oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks);
+        oinfo->dqi_ibh = bh;
+        /* We crashed when using local quota file? */
+        if (!(info->dqi_flags & OLQF_CLEAN)) {
+                rec = OCFS2_SB(sb)->quota_rec;
+                if (!rec) {
+                        rec = ocfs2_alloc_quota_recovery();
+                        if (!rec) {
+                                status = -ENOMEM;
+                                mlog_errno(status);
+                                goto out_err;
+                        }
+                        OCFS2_SB(sb)->quota_rec = rec;
+                }
+                status = ocfs2_recovery_load_quota(lqinode, ldinfo, type,
+                                                   &rec->r_list[type]);
+                if (status < 0) {
+                        mlog_errno(status);
+                        goto out_err;
+                }
+        }
+        status = ocfs2_load_local_quota_bitmaps(lqinode,
+                                                ldinfo,
+                                                &oinfo->dqi_chunk);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out_err;
+        }
+        /* Now mark quota file as used */
+        info->dqi_flags &= ~OLQF_CLEAN;
+        status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out_err;
+        }
+        return 0;
+out_err:
+        if (oinfo) {
+                iput(oinfo->dqi_gqinode);
+                ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
+                ocfs2_lock_res_free(&oinfo->dqi_gqlock);
+                brelse(oinfo->dqi_lqi_bh);
+                if (locked)
+                        ocfs2_inode_unlock(lqinode, 1);
+                ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
+                kfree(oinfo);
+                /* Do not leave a pointer to the freed structure behind */
+                info->dqi_priv = NULL;
+        }
+        /* bh is the same buffer as oinfo->dqi_ibh, so it is released once */
+        brelse(bh);
+        return -1;
+}
+/*
+ * Write the in-memory local quota info of 'type' into the info header block
+ * of the local quota file. Returns 0 on success, -1 on failure.
+ */
+static int ocfs2_local_write_info(struct super_block *sb, int type)
+{
+        struct mem_dqinfo *info = sb_dqinfo(sb, type);
+        struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+        struct buffer_head *info_bh = oinfo->dqi_ibh;
+        int status;
+        status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], info_bh,
+                                 olq_update_info, info);
+        if (status >= 0)
+                return 0;
+        mlog_errno(status);
+        return -1;
+}
+/*
+ * Release info from memory.
+ * Cancels the pending sync work, drops the global quota file inode and its
+ * lock resource, verifies that every chunk has all of its entries free,
+ * releases the chunk list and any recovery data, marks the local file clean
+ * when that is safe, and finally unlocks the local quota file and frees the
+ * ocfs2 info structure.
+ * Always returns 0; a failure to write the clean flag is only logged.
+ */
+static int ocfs2_local_free_info(struct super_block *sb, int type)
+{
+        struct mem_dqinfo *info = sb_dqinfo(sb, type);
+        struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+        struct ocfs2_quota_chunk *chunk;
+        struct ocfs2_local_disk_chunk *dchunk;
+        int mark_clean = 1, len;
+        int status;
+        /* At this point we know there are no more dquots and thus
+         * even if there's some sync in the pdflush queue, it won't
+         * find any dquots and return without doing anything */
+        cancel_delayed_work_sync(&oinfo->dqi_sync_work);
+        iput(oinfo->dqi_gqinode);
+        ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
+        ocfs2_lock_res_free(&oinfo->dqi_gqlock);
+        list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) {
+                dchunk = (struct ocfs2_local_disk_chunk *)
+                                        (chunk->qc_headerbh->b_data);
+                /* Every chunk but the last is full-sized; the last one holds
+                 * only the entries of the blocks up to dqi_blocks */
+                if (chunk->qc_num < oinfo->dqi_chunks - 1) {
+                        len = ol_chunk_entries(sb);
+                } else {
+                        len = (oinfo->dqi_blocks -
+                               ol_quota_chunk_block(sb, chunk->qc_num) - 1)
+                              * ol_quota_entries_per_block(sb);
+                }
+                /* Not all entries free? Bug! */
+                if (le32_to_cpu(dchunk->dqc_free) != len) {
+                        mlog(ML_ERROR, "releasing quota file with used "
+                                        "entries (type=%d)\n", type);
+                        mark_clean = 0;
+                }
+        }
+        ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
+        /* dqonoff_mutex protects us against racing with recovery thread... */
+        if (oinfo->dqi_rec) {
+                ocfs2_free_quota_recovery(oinfo->dqi_rec);
+                mark_clean = 0;
+        }
+        if (!mark_clean)
+                goto out;
+        /* Mark local file as clean */
+        info->dqi_flags |= OLQF_CLEAN;
+        status = ocfs2_modify_bh(sb_dqopt(sb)->files[type],
+                                 oinfo->dqi_ibh,
+                                 olq_update_info,
+                                 info);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out;
+        }
+out:
+        /* Drop the inode lock and the two buffers kept in oinfo */
+        ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1);
+        brelse(oinfo->dqi_ibh);
+        brelse(oinfo->dqi_lqi_bh);
+        kfree(oinfo);
+        return 0;
+}
+/*
+ * ocfs2_modify_bh() callback: fill the on-disk local entry of the dquot
+ * passed in @private.  The entry lives in @bh at the block offset derived
+ * from dq_local_off and receives the dquot id plus the difference between
+ * the current and the originally recorded space / inode usage.
+ */
+static void olq_set_dquot(struct buffer_head *bh, void *private)
+{
+        struct ocfs2_dquot *od = private;
+        struct super_block *sb = od->dq_dquot.dq_sb;
+        struct ocfs2_local_disk_dqblk *entry;
+        entry = (struct ocfs2_local_disk_dqblk *)
+                (bh->b_data + ol_dqblk_block_offset(sb, od->dq_local_off));
+        entry->dqb_id = cpu_to_le64(od->dq_dquot.dq_id);
+        /* Both usage counters are sampled under dq_data_lock */
+        spin_lock(&dq_data_lock);
+        entry->dqb_spacemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curspace -
+                                          od->dq_origspace);
+        entry->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes -
+                                          od->dq_originodes);
+        spin_unlock(&dq_data_lock);
+        mlog(0, "Writing local dquot %u space %lld inodes %lld\n",
+             od->dq_dquot.dq_id, (long long)le64_to_cpu(entry->dqb_spacemod),
+             (long long)le64_to_cpu(entry->dqb_inodemod));
+}
+/*
+ * Write dquot to local quota file.
+ *
+ * Reads the block holding the dquot's local entry and lets olq_set_dquot()
+ * update it through ocfs2_modify_bh().  Returns the status of the first
+ * step that failed, or of ocfs2_modify_bh() otherwise.
+ */
+static int ocfs2_local_write_dquot(struct dquot *dquot)
+{
+        struct super_block *sb = dquot->dq_sb;
+        struct inode *lqinode = sb_dqopt(sb)->files[dquot->dq_type];
+        struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
+        struct buffer_head *bh = NULL;
+        int status;
+        status = ocfs2_read_quota_block(lqinode,
+                                    ol_dqblk_file_block(sb, od->dq_local_off),
+                                    &bh);
+        if (status) {
+                mlog_errno(status);
+        } else {
+                status = ocfs2_modify_bh(lqinode, bh, olq_set_dquot, od);
+                if (status < 0)
+                        mlog_errno(status);
+        }
+        /* bh is still NULL here if the read did not provide one */
+        brelse(bh);
+        return status;
+}
+/*
+ * Find free entry in local quota file.
+ *
+ * Picks the first chunk whose header reports free entries and searches its
+ * bitmap for a zero bit.  Returns the chunk and stores the bit index in
+ * *offset, NULL when no chunk reports free entries, or ERR_PTR(-EIO) when
+ * the header claims free entries but the bitmap has none.
+ */
+static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb,
+                                                       int type,
+                                                       int *offset)
+{
+        struct mem_dqinfo *info = sb_dqinfo(sb, type);
+        struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+        struct ocfs2_quota_chunk *chunk;
+        struct ocfs2_quota_chunk *hit = NULL;
+        struct ocfs2_local_disk_chunk *dchunk;
+        int nr_entries;
+        int bit;
+        list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) {
+                dchunk = (struct ocfs2_local_disk_chunk *)
+                                                chunk->qc_headerbh->b_data;
+                if (le32_to_cpu(dchunk->dqc_free) > 0) {
+                        hit = chunk;
+                        break;
+                }
+        }
+        if (!hit)
+                return NULL;
+        /* dchunk still refers to the header of the chunk we stopped at.
+         * The last chunk may cover fewer entries than a full one. */
+        if (hit->qc_num >= oinfo->dqi_chunks - 1)
+                nr_entries = (oinfo->dqi_blocks -
+                       ol_quota_chunk_block(sb, hit->qc_num) - 1)
+                      * ol_quota_entries_per_block(sb);
+        else
+                nr_entries = ol_chunk_entries(sb);
+        bit = ocfs2_find_next_zero_bit(dchunk->dqc_bitmap, nr_entries, 0);
+        /* Header and bitmap disagree */
+        if (bit == nr_entries) {
+                mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u"
+                     " entries free (type=%d)\n", hit->qc_num,
+                     le32_to_cpu(dchunk->dqc_free), type);
+                return ERR_PTR(-EIO);
+        }
+        *offset = bit;
+        return hit;
+}
+/*
+ * Add new chunk to the local quota file.
+ *
+ * Grows the file by two blocks, initialises the block at the old end of the
+ * file as an empty chunk header inside a journalled transaction, bumps the
+ * in-memory block and chunk counts and writes them to the info block, and
+ * finally links a new in-memory chunk descriptor at the tail of the list.
+ *
+ * On success returns the new chunk and stores 0 in *offset.  On failure
+ * returns ERR_PTR() carrying a negative error code.
+ */
+static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
+                                                        struct super_block *sb,
+                                                        int type,
+                                                        int *offset)
+{
+        struct mem_dqinfo *info = sb_dqinfo(sb, type);
+        struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+        struct inode *lqinode = sb_dqopt(sb)->files[type];
+        struct ocfs2_quota_chunk *chunk = NULL;
+        struct ocfs2_local_disk_chunk *dchunk;
+        int status;
+        handle_t *handle;
+        struct buffer_head *bh = NULL;
+        u64 p_blkno;
+        /* We are protected by dqio_sem so no locking needed */
+        status = ocfs2_extend_no_holes(lqinode,
+                                       lqinode->i_size + 2 * sb->s_blocksize,
+                                       lqinode->i_size);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out;
+        }
+        status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
+                                          lqinode->i_size + 2 * sb->s_blocksize);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out;
+        }
+        chunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS);
+        if (!chunk) {
+                status = -ENOMEM;
+                mlog_errno(status);
+                goto out;
+        }
+        /* Map the first of the new blocks; it becomes the chunk header */
+        down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+        status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
+                                             &p_blkno, NULL, NULL);
+        up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out;
+        }
+        bh = sb_getblk(sb, p_blkno);
+        if (!bh) {
+                status = -ENOMEM;
+                mlog_errno(status);
+                goto out;
+        }
+        dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
+        handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
+        if (IS_ERR(handle)) {
+                status = PTR_ERR(handle);
+                mlog_errno(status);
+                goto out;
+        }
+        status = ocfs2_journal_access_dq(handle, lqinode, bh,
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out_trans;
+        }
+        /* Fresh header: one block worth of entries, all of them free */
+        lock_buffer(bh);
+        dchunk->dqc_free = cpu_to_le32(ol_quota_entries_per_block(sb));
+        memset(dchunk->dqc_bitmap, 0,
+               sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
+               OCFS2_QBLK_RESERVED_SPACE);
+        set_buffer_uptodate(bh);
+        unlock_buffer(bh);
+        status = ocfs2_journal_dirty(handle, bh);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out_trans;
+        }
+        oinfo->dqi_blocks += 2;
+        oinfo->dqi_chunks++;
+        status = ocfs2_local_write_info(sb, type);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out_trans;
+        }
+        status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out;
+        }
+        /* Number the chunk from the current tail before linking it in.
+         * With an empty list there is no tail entry: list_entry() on the
+         * list head does not yield a chunk, so its qc_num must not be
+         * read.  Start at 0 in that case - assumes chunks are numbered
+         * from 0; TODO confirm against the code that loads the chunks. */
+        if (list_empty(&oinfo->dqi_chunk))
+                chunk->qc_num = 0;
+        else
+                chunk->qc_num = list_entry(oinfo->dqi_chunk.prev,
+                                           struct ocfs2_quota_chunk,
+                                           qc_chunk)->qc_num + 1;
+        list_add_tail(&chunk->qc_chunk, &oinfo->dqi_chunk);
+        chunk->qc_headerbh = bh;
+        *offset = 0;
+        return chunk;
+out_trans:
+        ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out:
+        brelse(bh);
+        /* The failures before the allocation get here with chunk still
+         * NULL, which must not be passed to kmem_cache_free() */
+        if (chunk)
+                kmem_cache_free(ocfs2_qf_chunk_cachep, chunk);
+        return ERR_PTR(status);
+}
+/*
+ * Make room for more entries in the local quota file.
+ *
+ * When the chunk list is empty, or the last chunk already spans
+ * ol_chunk_blocks() blocks, a whole new chunk is added through
+ * ocfs2_local_quota_add_chunk().  Otherwise the file grows by one block
+ * and the free count in the last chunk's header is raised by one block
+ * worth of entries, all inside a journalled transaction.
+ *
+ * Returns the chunk that received the space and stores the index of the
+ * first new entry in *offset, or ERR_PTR() on failure.
+ */
+static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
+                                                       struct super_block *sb,
+                                                       int type,
+                                                       int *offset)
+{
+        struct mem_dqinfo *info = sb_dqinfo(sb, type);
+        struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+        struct ocfs2_quota_chunk *last;
+        struct inode *lqinode = sb_dqopt(sb)->files[type];
+        struct ocfs2_local_disk_chunk *hdr;
+        struct buffer_head *hdr_bh;
+        int epb = ol_quota_entries_per_block(sb);
+        unsigned int used_blocks;
+        int status;
+        handle_t *handle;
+        if (list_empty(&oinfo->dqi_chunk))
+                return ocfs2_local_quota_add_chunk(sb, type, offset);
+        /* Is the last chunk full? */
+        last = list_entry(oinfo->dqi_chunk.prev,
+                        struct ocfs2_quota_chunk, qc_chunk);
+        used_blocks = oinfo->dqi_blocks -
+                        ol_quota_chunk_block(sb, last->qc_num) - 1;
+        if (ol_chunk_blocks(sb) == used_blocks)
+                return ocfs2_local_quota_add_chunk(sb, type, offset);
+        hdr_bh = last->qc_headerbh;
+        hdr = (struct ocfs2_local_disk_chunk *)hdr_bh->b_data;
+        /* We are protected by dqio_sem so no locking needed */
+        status = ocfs2_extend_no_holes(lqinode,
+                                       lqinode->i_size + sb->s_blocksize,
+                                       lqinode->i_size);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out;
+        }
+        status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
+                                          lqinode->i_size + sb->s_blocksize);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out;
+        }
+        handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
+        if (IS_ERR(handle)) {
+                status = PTR_ERR(handle);
+                mlog_errno(status);
+                goto out;
+        }
+        status = ocfs2_journal_access_dq(handle, lqinode, hdr_bh,
+                                 OCFS2_JOURNAL_ACCESS_WRITE);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out_trans;
+        }
+        /* Account the entries of the new block in the chunk header */
+        lock_buffer(hdr_bh);
+        le32_add_cpu(&hdr->dqc_free, ol_quota_entries_per_block(sb));
+        unlock_buffer(hdr_bh);
+        status = ocfs2_journal_dirty(handle, hdr_bh);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out_trans;
+        }
+        oinfo->dqi_blocks++;
+        status = ocfs2_local_write_info(sb, type);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out_trans;
+        }
+        status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out;
+        }
+        /* First entry of the block just appended to the chunk */
+        *offset = used_blocks * epb;
+        return last;
+out_trans:
+        ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out:
+        return ERR_PTR(status);
+}
+/*
+ * ocfs2_modify_bh() callback: mark the entry whose index @private points
+ * to as used in the chunk header held in @bh - set its bitmap bit and
+ * decrement the header's free count.
+ */
+static void olq_alloc_dquot(struct buffer_head *bh, void *private)
+{
+        struct ocfs2_local_disk_chunk *hdr =
+                        (struct ocfs2_local_disk_chunk *)bh->b_data;
+        int *entry = private;
+        ocfs2_set_bit(*entry, hdr->dqc_bitmap);
+        le32_add_cpu(&hdr->dqc_free, -1);
+}
+/*
+ * Create dquot in the local file for given id.
+ *
+ * Takes a free entry from an existing chunk, or extends the file when no
+ * chunk reports one, records the entry's location in the ocfs2 dquot,
+ * writes the initial on-disk entry and then marks the entry as allocated
+ * in the chunk header.  Returns 0 or a negative error code.
+ */
+static int ocfs2_create_local_dquot(struct dquot *dquot)
+{
+        struct super_block *sb = dquot->dq_sb;
+        int type = dquot->dq_type;
+        struct inode *lqinode = sb_dqopt(sb)->files[type];
+        struct ocfs2_quota_chunk *chunk;
+        struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
+        int offset;
+        int status;
+        chunk = ocfs2_find_free_entry(sb, type, &offset);
+        /* NULL means no chunk reports a free entry: grow the file */
+        if (!chunk)
+                chunk = ocfs2_extend_local_quota_file(sb, type, &offset);
+        if (IS_ERR(chunk))
+                return PTR_ERR(chunk);
+        od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset);
+        od->dq_chunk = chunk;
+        /* Initialize dquot structure on disk */
+        status = ocfs2_local_write_dquot(dquot);
+        if (status < 0) {
+                mlog_errno(status);
+                return status;
+        }
+        /* Mark structure as allocated */
+        status = ocfs2_modify_bh(lqinode, chunk->qc_headerbh, olq_alloc_dquot,
+                                 &offset);
+        if (status < 0)
+                mlog_errno(status);
+        return status;
+}
+/*
+ * Create entry in local file for dquot, load data from the global file.
+ *
+ * Returns 0 when both the global read and the local entry creation
+ * succeed, otherwise the negative status of the step that failed.
+ */
+static int ocfs2_local_read_dquot(struct dquot *dquot)
+{
+        int status;
+        mlog_entry("id=%u, type=%d\n", dquot->dq_id, dquot->dq_type);
+        status = ocfs2_global_read_dquot(dquot);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out;
+        }
+        /* Now create entry in the local quota file */
+        status = ocfs2_create_local_dquot(dquot);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out;
+        }
+        /* Anything non-negative is reported as plain success */
+        status = 0;
+out:
+        mlog_exit(status);
+        return status;
+}
+/*
+ * Release dquot structure from local quota file. ocfs2_release_dquot() has
+ * already started a transaction and obtained exclusive lock for global
+ * quota file.
+ *
+ * Pushes the dquot to the global file, then clears the dquot's bit in its
+ * chunk header and raises the header's free count under the running
+ * transaction.  DQ_READ_B is cleared on every path.  Returns 0 or a
+ * negative error code.
+ */
+static int ocfs2_local_release_dquot(struct dquot *dquot)
+{
+        int status;
+        int type = dquot->dq_type;
+        struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
+        struct super_block *sb = dquot->dq_sb;
+        struct ocfs2_local_disk_chunk *hdr;
+        struct buffer_head *hdr_bh;
+        int bit;
+        handle_t *handle = journal_current_handle();
+        BUG_ON(!handle);
+        /* First write all local changes to global file */
+        status = ocfs2_global_release_dquot(dquot);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out;
+        }
+        hdr_bh = od->dq_chunk->qc_headerbh;
+        status = ocfs2_journal_access_dq(handle, sb_dqopt(sb)->files[type],
+                        hdr_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out;
+        }
+        bit = ol_dqblk_chunk_off(sb, od->dq_chunk->qc_num,
+                                 od->dq_local_off);
+        hdr = (struct ocfs2_local_disk_chunk *)hdr_bh->b_data;
+        /* Mark structure as freed */
+        lock_buffer(hdr_bh);
+        ocfs2_clear_bit(bit, hdr->dqc_bitmap);
+        le32_add_cpu(&hdr->dqc_free, 1);
+        unlock_buffer(hdr_bh);
+        status = ocfs2_journal_dirty(handle, hdr_bh);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out;
+        }
+        status = 0;
+out:
+        /* Clear the read bit so that next time someone uses this
+         * dquot he reads fresh info from disk and allocates local
+         * dquot structure */
+        clear_bit(DQ_READ_B, &dquot->dq_flags);
+        return status;
+}
+/*
+ * Quota format operations for ocfs2.  Every hook is served by a local
+ * quota file routine except write_file_info, which is wired to
+ * ocfs2_global_write_info().
+ */
+static struct quota_format_ops ocfs2_format_ops = {
+        .check_quota_file       = ocfs2_local_check_quota_file,
+        .read_file_info         = ocfs2_local_read_info,
+        .write_file_info        = ocfs2_global_write_info,
+        .free_file_info         = ocfs2_local_free_info,
+        .read_dqblk             = ocfs2_local_read_dquot,
+        .commit_dqblk           = ocfs2_local_write_dquot,
+        .release_dqblk          = ocfs2_local_release_dquot,
+};
+/*
+ * Quota format descriptor tying the QFMT_OCFS2 format id to
+ * ocfs2_format_ops; owned by this module.
+ */
+struct quota_format_type ocfs2_quota_format = {
+        .qf_fmt_id = QFMT_OCFS2,
+        .qf_ops = &ocfs2_format_ops,
+        .qf_owner = THIS_MODULE
+};
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index ffd48db229a7..424adaa5f900 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -106,8 +106,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
        mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n",
                   new_clusters, first_new_cluster);
-        ret = ocfs2_journal_access(handle, bm_inode, group_bh,
+        ret = ocfs2_journal_access_gd(handle, bm_inode, group_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret < 0) {
                mlog_errno(ret);
                goto out;
@@ -141,8 +141,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
        }
        /* update the inode accordingly. */
-        ret = ocfs2_journal_access(handle, bm_inode, bm_bh,
+        ret = ocfs2_journal_access_di(handle, bm_inode, bm_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret < 0) {
                mlog_errno(ret);
                goto out_rollback;
@@ -314,6 +314,10 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
        fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
+        /* main_bm_bh is validated by inode read inside ocfs2_inode_lock(),
+         * so any corruption is a code bug. */
+        BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
        if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
                                 ocfs2_group_bitmap_size(osb->sb) * 8) {
                mlog(ML_ERROR, "The disk is too old and small. "
@@ -322,30 +326,18 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
                goto out_unlock;
        }
-        if (!OCFS2_IS_VALID_DINODE(fe)) {
-                OCFS2_RO_ON_INVALID_DINODE(main_bm_inode->i_sb, fe);
-                ret = -EIO;
-                goto out_unlock;
-        }
        first_new_cluster = le32_to_cpu(fe->i_clusters);
        lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
                                              first_new_cluster - 1);
-        ret = ocfs2_read_block(main_bm_inode, lgd_blkno, &group_bh);
+        ret = ocfs2_read_group_descriptor(main_bm_inode, fe, lgd_blkno,
+                                          &group_bh);
        if (ret < 0) {
                mlog_errno(ret);
                goto out_unlock;
        }
        group = (struct ocfs2_group_desc *)group_bh->b_data;
-        ret = ocfs2_check_group_descriptor(inode->i_sb, fe, group);
-        if (ret) {
-                mlog_errno(ret);
-                goto out_unlock;
-        }
        cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
        if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters >
                le16_to_cpu(fe->id2.i_chain.cl_cpg)) {
@@ -398,41 +390,16 @@ static int ocfs2_check_new_group(struct inode *inode,
                                 struct buffer_head *group_bh)
 {
        int ret;
-        struct ocfs2_group_desc *gd;
+        struct ocfs2_group_desc *gd =
+                (struct ocfs2_group_desc *)group_bh->b_data;
        u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc);
-        unsigned int max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) *
-                                le16_to_cpu(di->id2.i_chain.cl_bpc);
-        gd = (struct ocfs2_group_desc *)group_bh->b_data;
+        ret = ocfs2_check_group_descriptor(inode->i_sb, di, group_bh);
+        if (ret)
+                goto out;
-        ret = -EIO;
+        ret = -EINVAL;
-        if (!OCFS2_IS_VALID_GROUP_DESC(gd))
+        if (le16_to_cpu(gd->bg_chain) != input->chain)
-                mlog(ML_ERROR, "Group descriptor # %llu isn't valid.\n",
-                     (unsigned long long)le64_to_cpu(gd->bg_blkno));
-        else if (di->i_blkno != gd->bg_parent_dinode)
-                mlog(ML_ERROR, "Group descriptor # %llu has bad parent "
-                     "pointer (%llu, expected %llu)\n",
-                     (unsigned long long)le64_to_cpu(gd->bg_blkno),
-                     (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
-                     (unsigned long long)le64_to_cpu(di->i_blkno));
-        else if (le16_to_cpu(gd->bg_bits) > max_bits)
-                mlog(ML_ERROR, "Group descriptor # %llu has bit count of %u\n",
-                     (unsigned long long)le64_to_cpu(gd->bg_blkno),
-                     le16_to_cpu(gd->bg_bits));
-        else if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits))
-                mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
-                     "claims that %u are free\n",
-                     (unsigned long long)le64_to_cpu(gd->bg_blkno),
-                     le16_to_cpu(gd->bg_bits),
-                     le16_to_cpu(gd->bg_free_bits_count));
-        else if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size)))
-                mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
-                     "max bitmap bits of %u\n",
-                     (unsigned long long)le64_to_cpu(gd->bg_blkno),
-                     le16_to_cpu(gd->bg_bits),
-                     8 * le16_to_cpu(gd->bg_size));
-        else if (le16_to_cpu(gd->bg_chain) != input->chain)
                mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u "
                     "while input has %u set.\n",
                     (unsigned long long)le64_to_cpu(gd->bg_blkno),
@@ -451,6 +418,7 @@ static int ocfs2_check_new_group(struct inode *inode,
        else
                ret = 0;
+out:
        return ret;
 }
@@ -568,8 +536,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
        cl = &fe->id2.i_chain;
        cr = &cl->cl_recs[input->chain];
-        ret = ocfs2_journal_access(handle, main_bm_inode, group_bh,
+        ret = ocfs2_journal_access_gd(handle, main_bm_inode, group_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret < 0) {
                mlog_errno(ret);
                goto out_commit;
@@ -584,8 +552,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
                goto out_commit;
        }
-        ret = ocfs2_journal_access(handle, main_bm_inode, main_bm_bh,
+        ret = ocfs2_journal_access_di(handle, main_bm_inode, main_bm_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret < 0) {
                mlog_errno(ret);
                goto out_commit;
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index bdda2d8f8508..40661e7824e9 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -151,7 +151,7 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
         * this is not true, the read of -1 (UINT64_MAX) will fail.
         */
        ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh,
-                                OCFS2_BH_IGNORE_CACHE);
+                                OCFS2_BH_IGNORE_CACHE, NULL);
        if (ret == 0) {
                spin_lock(&osb->osb_lock);
                ocfs2_update_slot_info(si);
@@ -405,7 +405,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
                bh = NULL;  /* Acquire a fresh bh */
                status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh,
-                                           OCFS2_BH_IGNORE_CACHE);
+                                           OCFS2_BH_IGNORE_CACHE, NULL);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index c5ff18b46b57..a69628603e18 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -35,6 +35,7 @@
 #include "ocfs2.h"
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "inode.h"
 #include "journal.h"
@@ -145,62 +146,183 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
        return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
 }
-/* somewhat more expensive than our other checks, so use sparingly. */
+#define do_error(fmt, ...)                                              \
-int ocfs2_check_group_descriptor(struct super_block *sb,
+        do{                                                             \
-                                 struct ocfs2_dinode *di,
+                if (clean_error)                                        \
-                                 struct ocfs2_group_desc *gd)
+                        mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__);        \
+                else                                                    \
+                        ocfs2_error(sb, fmt, ##__VA_ARGS__);            \
+        } while (0)
+static int ocfs2_validate_gd_self(struct super_block *sb,
+                                  struct buffer_head *bh,
+                                  int clean_error)
 {
-        unsigned int max_bits;
+        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
        if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
-                OCFS2_RO_ON_INVALID_GROUP_DESC(sb, gd);
+                do_error("Group descriptor #%llu has bad signature %.*s",
-                return -EIO;
+                         (unsigned long long)bh->b_blocknr, 7,
+                         gd->bg_signature);
+                return -EINVAL;
        }
+        if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
+                do_error("Group descriptor #%llu has an invalid bg_blkno "
+                         "of %llu",
+                         (unsigned long long)bh->b_blocknr,
+                         (unsigned long long)le64_to_cpu(gd->bg_blkno));
+                return -EINVAL;
+        }
+        if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
+                do_error("Group descriptor #%llu has an invalid "
+                         "fs_generation of #%u",
+                         (unsigned long long)bh->b_blocknr,
+                         le32_to_cpu(gd->bg_generation));
+                return -EINVAL;
+        }
+        if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
+                do_error("Group descriptor #%llu has bit count %u but "
+                         "claims that %u are free",
+                         (unsigned long long)bh->b_blocknr,
+                         le16_to_cpu(gd->bg_bits),
+                         le16_to_cpu(gd->bg_free_bits_count));
+                return -EINVAL;
+        }
+        if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
+                do_error("Group descriptor #%llu has bit count %u but "
+                         "max bitmap bits of %u",
+                         (unsigned long long)bh->b_blocknr,
+                         le16_to_cpu(gd->bg_bits),
+                         8 * le16_to_cpu(gd->bg_size));
+                return -EINVAL;
+        }
+        return 0;
+}
+static int ocfs2_validate_gd_parent(struct super_block *sb,
+                                    struct ocfs2_dinode *di,
+                                    struct buffer_head *bh,
+                                    int clean_error)
+{
+        unsigned int max_bits;
+        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
        if (di->i_blkno != gd->bg_parent_dinode) {
-                ocfs2_error(sb, "Group descriptor # %llu has bad parent "
+                do_error("Group descriptor #%llu has bad parent "
-                            "pointer (%llu, expected %llu)",
+                         "pointer (%llu, expected %llu)",
-                            (unsigned long long)le64_to_cpu(gd->bg_blkno),
+                         (unsigned long long)bh->b_blocknr,
-                            (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
+                         (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
-                            (unsigned long long)le64_to_cpu(di->i_blkno));
+                         (unsigned long long)le64_to_cpu(di->i_blkno));
-                return -EIO;
+                return -EINVAL;
        }
        max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
        if (le16_to_cpu(gd->bg_bits) > max_bits) {
-                ocfs2_error(sb, "Group descriptor # %llu has bit count of %u",
+                do_error("Group descriptor #%llu has bit count of %u",
-                            (unsigned long long)le64_to_cpu(gd->bg_blkno),
+                         (unsigned long long)bh->b_blocknr,
-                            le16_to_cpu(gd->bg_bits));
+                         le16_to_cpu(gd->bg_bits));
-                return -EIO;
+                return -EINVAL;
        }
        if (le16_to_cpu(gd->bg_chain) >=
            le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
-                ocfs2_error(sb, "Group descriptor # %llu has bad chain %u",
+                do_error("Group descriptor #%llu has bad chain %u",
-                            (unsigned long long)le64_to_cpu(gd->bg_blkno),
+                         (unsigned long long)bh->b_blocknr,
-                            le16_to_cpu(gd->bg_chain));
+                         le16_to_cpu(gd->bg_chain));
-                return -EIO;
+                return -EINVAL;
        }
-        if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
+        return 0;
-                ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
+}
-                            "claims that %u are free",
-                            (unsigned long long)le64_to_cpu(gd->bg_blkno),
-                            le16_to_cpu(gd->bg_bits),
-                            le16_to_cpu(gd->bg_free_bits_count));
-                return -EIO;
-        }
-        if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
+#undef do_error
-                ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
-                            "max bitmap bits of %u",
+/*
-                            (unsigned long long)le64_to_cpu(gd->bg_blkno),
+ * This version only prints errors.  It does not fail the filesystem, and
-                            le16_to_cpu(gd->bg_bits),
+ * exists only for resize.
-                            8 * le16_to_cpu(gd->bg_size));
+ */
-                return -EIO;
+int ocfs2_check_group_descriptor(struct super_block *sb,
+                                 struct ocfs2_dinode *di,
+                                 struct buffer_head *bh)
+{
+        int rc;
+        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+        BUG_ON(!buffer_uptodate(bh));
+        /*
+         * If the ecc fails, we return the error but otherwise
+         * leave the filesystem running.  We know any error is
+         * local to this block.
+         */
+        rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
+        if (rc) {
+                mlog(ML_ERROR,
+                     "Checksum failed for group descriptor %llu\n",
+                     (unsigned long long)bh->b_blocknr);
+        } else
+                rc = ocfs2_validate_gd_self(sb, bh, 1);
+        if (!rc)
+                rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
+        return rc;
+}
+static int ocfs2_validate_group_descriptor(struct super_block *sb,
+                                           struct buffer_head *bh)
+{
+        int rc;
+        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+        mlog(0, "Validating group descriptor %llu\n",
+             (unsigned long long)bh->b_blocknr);
+        BUG_ON(!buffer_uptodate(bh));
+        /*
+         * If the ecc fails, we return the error but otherwise
+         * leave the filesystem running.  We know any error is
+         * local to this block.
+         */
+        rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
+        if (rc)
+                return rc;
+        /*
+         * Errors after here are fatal.
+         */
+        return ocfs2_validate_gd_self(sb, bh, 0);
+}
+int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
+                                u64 gd_blkno, struct buffer_head **bh)
+{
+        int rc;
+        struct buffer_head *tmp = *bh;
+        rc = ocfs2_read_block(inode, gd_blkno, &tmp,
+                              ocfs2_validate_group_descriptor);
+        if (rc)
+                goto out;
+        rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
+        if (rc) {
+                brelse(tmp);
+                goto out;
        }
-        return 0;
+        /* If ocfs2_read_block() got us a new bh, pass it up. */
+        if (!*bh)
+                *bh = tmp;
+out:
+        return rc;
 }
 static int ocfs2_block_group_fill(handle_t *handle,
@@ -225,10 +347,10 @@ static int ocfs2_block_group_fill(handle_t *handle,
                goto bail;
        }
-        status = ocfs2_journal_access(handle,
+        status = ocfs2_journal_access_gd(handle,
-                                      alloc_inode,
+                                         alloc_inode,
-                                      bg_bh,
+                                         bg_bh,
-                                      OCFS2_JOURNAL_ACCESS_CREATE);
+                                         OCFS2_JOURNAL_ACCESS_CREATE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -358,8 +480,8 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
        bg = (struct ocfs2_group_desc *) bg_bh->b_data;
-        status = ocfs2_journal_access(handle, alloc_inode,
+        status = ocfs2_journal_access_di(handle, alloc_inode,
-                                      bh, OCFS2_JOURNAL_ACCESS_WRITE);
+                                         bh, OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -441,11 +563,11 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
        ac->ac_alloc_slot = slot;
        fe = (struct ocfs2_dinode *) bh->b_data;
-        if (!OCFS2_IS_VALID_DINODE(fe)) {
-                OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
+        /* The bh was validated by the inode read inside
-                status = -EIO;
+         * ocfs2_inode_lock().  Any corruption is a code bug. */
-                goto bail;
+        BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
-        }
        if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
                ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
                            (unsigned long long)le64_to_cpu(fe->i_blkno));
@@ -790,10 +912,9 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
        int offset, start, found, status = 0;
        struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
-        if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
+        /* Callers got this descriptor from
-                OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg);
+         * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
-                return -EIO;
+        BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
-        }
        found = start = best_offset = best_size = 0;
        bitmap = bg->bg_bitmap;
@@ -858,11 +979,9 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
        mlog_entry_void();
-        if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
+        /* All callers get the descriptor via
-                OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
+         * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
-                status = -EIO;
+        BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
-                goto bail;
-        }
        BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
        mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
@@ -871,10 +990,10 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
        if (ocfs2_is_cluster_bitmap(alloc_inode))
                journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
-        status = ocfs2_journal_access(handle,
+        status = ocfs2_journal_access_gd(handle,
-                                      alloc_inode,
+                                         alloc_inode,
-                                      group_bh,
+                                         group_bh,
-                                      journal_type);
+                                         journal_type);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -931,21 +1050,10 @@ static int ocfs2_relink_block_group(handle_t *handle,
        struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
        struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
-        if (!OCFS2_IS_VALID_DINODE(fe)) {
+        /* The caller got these descriptors from
-                OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
+         * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
-                status = -EIO;
+        BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
-                goto out;
+        BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
-        }
-        if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
-                OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
-                status = -EIO;
-                goto out;
-        }
-        if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) {
-                OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg);
-                status = -EIO;
-                goto out;
-        }
        mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
             (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
@@ -956,8 +1064,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
        bg_ptr = le64_to_cpu(bg->bg_next_group);
        prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
-        status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh,
+        status = ocfs2_journal_access_gd(handle, alloc_inode, prev_bg_bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto out_rollback;
@@ -971,8 +1079,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
                goto out_rollback;
        }
-        status = ocfs2_journal_access(handle, alloc_inode, bg_bh,
+        status = ocfs2_journal_access_gd(handle, alloc_inode, bg_bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto out_rollback;
@@ -986,8 +1094,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
                goto out_rollback;
        }
-        status = ocfs2_journal_access(handle, alloc_inode, fe_bh,
+        status = ocfs2_journal_access_di(handle, alloc_inode, fe_bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto out_rollback;
@@ -1008,7 +1116,7 @@ out_rollback:
                bg->bg_next_group = cpu_to_le64(bg_ptr);
                prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
        }
-out:
        mlog_exit(status);
        return status;
 }
@@ -1138,8 +1246,8 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
        struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
        struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
-        ret = ocfs2_journal_access(handle, inode, di_bh,
+        ret = ocfs2_journal_access_di(handle, inode, di_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret < 0) {
                mlog_errno(ret);
                goto out;
@@ -1170,21 +1278,17 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
        u16 found;
        struct buffer_head *group_bh = NULL;
        struct ocfs2_group_desc *gd;
+        struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
        struct inode *alloc_inode = ac->ac_inode;
-        ret = ocfs2_read_block(alloc_inode, gd_blkno, &group_bh);
+        ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno,
+                                          &group_bh);
        if (ret < 0) {
                mlog_errno(ret);
                return ret;
        }
        gd = (struct ocfs2_group_desc *) group_bh->b_data;
-        if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
-                OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, gd);
-                ret = -EIO;
-                goto out;
-        }
        ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
                                  ac->ac_max_block, bit_off, &found);
        if (ret < 0) {
@@ -1241,19 +1345,14 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
             bits_wanted, chain,
             (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
-        status = ocfs2_read_block(alloc_inode,
+        status = ocfs2_read_group_descriptor(alloc_inode, fe,
-                                  le64_to_cpu(cl->cl_recs[chain].c_blkno),
+                                             le64_to_cpu(cl->cl_recs[chain].c_blkno),
-                                  &group_bh);
+                                             &group_bh);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
        bg = (struct ocfs2_group_desc *) group_bh->b_data;
-        status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
-        if (status) {
-                mlog_errno(status);
-                goto bail;
-        }
        status = -ENOSPC;
        /* for now, the chain search is a bit simplistic. We just use
@@ -1271,18 +1370,13 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
                next_group = le64_to_cpu(bg->bg_next_group);
                prev_group_bh = group_bh;
                group_bh = NULL;
-                status = ocfs2_read_block(alloc_inode,
+                status = ocfs2_read_group_descriptor(alloc_inode, fe,
-                                          next_group, &group_bh);
+                                                     next_group, &group_bh);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
                }
                bg = (struct ocfs2_group_desc *) group_bh->b_data;
-                status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
-                if (status) {
-                        mlog_errno(status);
-                        goto bail;
-                }
        }
        if (status < 0) {
                if (status != -ENOSPC)
@@ -1324,10 +1418,10 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
        /* Ok, claim our bits now: set the info on dinode, chainlist
         * and then the group */
-        status = ocfs2_journal_access(handle,
+        status = ocfs2_journal_access_di(handle,
-                                      alloc_inode,
+                                         alloc_inode,
-                                      ac->ac_bh,
+                                         ac->ac_bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -1392,11 +1486,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
        BUG_ON(!ac->ac_bh);
        fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
-        if (!OCFS2_IS_VALID_DINODE(fe)) {
-                OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe);
+        /* The bh was validated by the inode read during
-                status = -EIO;
+         * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
-                goto bail;
+        BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
-        }
        if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
            le32_to_cpu(fe->id1.bitmap1.i_total)) {
                ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
@@ -1725,19 +1819,17 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
        mlog_entry_void();
-        if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
+        /* The caller got this descriptor from
-                OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
+         * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
-                status = -EIO;
+        BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
-                goto bail;
-        }
        mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
        if (ocfs2_is_cluster_bitmap(alloc_inode))
                journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
-        status = ocfs2_journal_access(handle, alloc_inode, group_bh,
+        status = ocfs2_journal_access_gd(handle, alloc_inode, group_bh,
-                                      journal_type);
+                                         journal_type);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -1782,29 +1874,26 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
        mlog_entry_void();
-        if (!OCFS2_IS_VALID_DINODE(fe)) {
+        /* The alloc_bh comes from ocfs2_free_dinode() or
-                OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
+         * ocfs2_free_clusters().  The callers have all locked the
-                status = -EIO;
+         * allocator and gotten alloc_bh from the lock call.  This
-                goto bail;
+         * validates the dinode buffer.  Any corruption that has happened
-        }
+         * is a code bug. */
+        BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
        BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
        mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
             (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
             (unsigned long long)bg_blkno, start_bit);
-        status = ocfs2_read_block(alloc_inode, bg_blkno, &group_bh);
+        status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
+                                             &group_bh);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
        group = (struct ocfs2_group_desc *) group_bh->b_data;
-        status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, group);
-        if (status) {
-                mlog_errno(status);
-                goto bail;
-        }
        BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
        status = ocfs2_block_group_clear_bits(handle, alloc_inode,
@@ -1815,8 +1904,8 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
                goto bail;
        }
-        status = ocfs2_journal_access(handle, alloc_inode, alloc_bh,
+        status = ocfs2_journal_access_di(handle, alloc_inode, alloc_bh,
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 4df159d8f450..e3c13c77f9e8 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -164,10 +164,24 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac);
 * and return that block offset. */
 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
-/* somewhat more expensive than our other checks, so use sparingly. */
+/*
+ * By default, ocfs2_read_group_descriptor() calls ocfs2_error() when it
+ * finds a problem.  A caller that wants to check a group descriptor
+ * without going readonly should read the block with ocfs2_read_block[s]()
+ * and then check it with this function.  This is only resize, really.
+ * Everyone else should be using ocfs2_read_group_descriptor().
+ */
 int ocfs2_check_group_descriptor(struct super_block *sb,
                                 struct ocfs2_dinode *di,
-                                 struct ocfs2_group_desc *gd);
+                                 struct buffer_head *bh);
+/*
+ * Read a group descriptor block into *bh.  If *bh is NULL, a bh will be
+ * allocated.  This is a cached read.  The descriptor will be validated with
+ * ocfs2_validate_group_descriptor().
+ */
+int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
+                                u64 gd_blkno, struct buffer_head **bh);
 int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
                          u32 clusters_to_add, u32 extents_to_split,
                          struct ocfs2_alloc_context **data_ac,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 304b63ac78cf..b1cb38fbe807 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -41,6 +41,7 @@
 #include <linux/debugfs.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
+#include <linux/quotaops.h>
 #define MLOG_MASK_PREFIX ML_SUPER
 #include <cluster/masklog.h>
@@ -51,6 +52,7 @@
 #include "ocfs1_fs_compat.h"
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "export.h"
 #include "extent_map.h"
@@ -65,10 +67,13 @@
 #include "uptodate.h"
 #include "ver.h"
 #include "xattr.h"
+#include "quota.h"
 #include "buffer_head_io.h"
 static struct kmem_cache *ocfs2_inode_cachep = NULL;
+struct kmem_cache *ocfs2_dquot_cachep;
+struct kmem_cache *ocfs2_qf_chunk_cachep;
 /* OCFS2 needs to schedule several differnt types of work which
 * require cluster locking, disk I/O, recovery waits, etc. Since these
@@ -124,6 +129,9 @@ static int ocfs2_get_sector(struct super_block *sb,
 static void ocfs2_write_super(struct super_block *sb);
 static struct inode *ocfs2_alloc_inode(struct super_block *sb);
 static void ocfs2_destroy_inode(struct inode *inode);
+static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend);
+static int ocfs2_enable_quotas(struct ocfs2_super *osb);
+static void ocfs2_disable_quotas(struct ocfs2_super *osb);
 static const struct super_operations ocfs2_sops = {
        .statfs         = ocfs2_statfs,
@@ -137,6 +145,8 @@ static const struct super_operations ocfs2_sops = {
        .put_super      = ocfs2_put_super,
        .remount_fs     = ocfs2_remount,
        .show_options   = ocfs2_show_options,
+        .quota_read     = ocfs2_quota_read,
+        .quota_write    = ocfs2_quota_write,
 };
 enum {
@@ -158,6 +168,10 @@ enum {
        Opt_user_xattr,
        Opt_nouser_xattr,
        Opt_inode64,
+        Opt_acl,
+        Opt_noacl,
+        Opt_usrquota,
+        Opt_grpquota,
        Opt_err,
 };
@@ -180,6 +194,10 @@ static const match_table_t tokens = {
        {Opt_user_xattr, "user_xattr"},
        {Opt_nouser_xattr, "nouser_xattr"},
        {Opt_inode64, "inode64"},
+        {Opt_acl, "acl"},
+        {Opt_noacl, "noacl"},
+        {Opt_usrquota, "usrquota"},
+        {Opt_grpquota, "grpquota"},
        {Opt_err, NULL}
 };
@@ -221,6 +239,19 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait)
        return 0;
 }
+/*
+ * Tell whether system inode 'ino' has to be looked up for this
+ * filesystem.
+ *
+ * The global and local user quota inodes are not needed when the
+ * USRQUOTA RO-compat feature is absent, and likewise the group quota
+ * inodes when GRPQUOTA is absent.  Every other system inode is always
+ * needed.
+ *
+ * Returns 1 if the inode is needed, 0 if it can be skipped.
+ */
+static int ocfs2_need_system_inode(struct ocfs2_super *osb, int ino)
+{
+        /* User quota files only exist with the user quota feature. */
+        if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)
+            && (ino == USER_QUOTA_SYSTEM_INODE
+                || ino == LOCAL_USER_QUOTA_SYSTEM_INODE))
+                return 0;
+        /* Same rule for the group quota files. */
+        if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
+            && (ino == GROUP_QUOTA_SYSTEM_INODE
+                || ino == LOCAL_GROUP_QUOTA_SYSTEM_INODE))
+                return 0;
+        return 1;
+}
 static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
 {
        struct inode *new = NULL;
@@ -247,6 +278,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
        for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE;
             i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) {
+                if (!ocfs2_need_system_inode(osb, i))
+                        continue;
                new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
                if (!new) {
                        ocfs2_release_system_inodes(osb);
@@ -277,6 +310,8 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
        for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1;
             i < NUM_SYSTEM_INODES;
             i++) {
+                if (!ocfs2_need_system_inode(osb, i))
+                        continue;
                new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
                if (!new) {
                        ocfs2_release_system_inodes(osb);
@@ -426,6 +461,12 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
        /* We're going to/from readonly mode. */
        if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
+                /* Disable quota accounting before remounting RO */
+                if (*flags & MS_RDONLY) {
+                        ret = ocfs2_susp_quotas(osb, 0);
+                        if (ret < 0)
+                                goto out;
+                }
                /* Lock here so the check of HARD_RO and the potential
                 * setting of SOFT_RO is atomic. */
                spin_lock(&osb->osb_lock);
@@ -461,11 +502,28 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
                }
 unlock_osb:
                spin_unlock(&osb->osb_lock);
+                /* Enable quota accounting after remounting RW */
+                if (!ret && !(*flags & MS_RDONLY)) {
+                        if (sb_any_quota_suspended(sb))
+                                ret = ocfs2_susp_quotas(osb, 1);
+                        else
+                                ret = ocfs2_enable_quotas(osb);
+                        if (ret < 0) {
+                                /* Return back changes... */
+                                spin_lock(&osb->osb_lock);
+                                sb->s_flags |= MS_RDONLY;
+                                osb->osb_flags |= OCFS2_OSB_SOFT_RO;
+                                spin_unlock(&osb->osb_lock);
+                                goto out;
+                        }
+                }
        }
        if (!ret) {
                /* Only save off the new mount options in case of a successful
                 * remount. */
+                if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR))
+                        parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
                osb->s_mount_opt = parsed_options.mount_opt;
                osb->s_atime_quantum = parsed_options.atime_quantum;
                osb->preferred_slot = parsed_options.slot;
@@ -619,6 +677,131 @@ static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb,
        return 0;
 }
+/*
+ * Suspend or resume quotas across a remount.
+ *
+ * For every quota type whose RO-compat feature is set on the superblock:
+ * when 'unsuspend' is zero the type is disabled with DQUOT_SUSPENDED,
+ * otherwise it is re-enabled with DQUOT_SUSPENDED on the quota file
+ * already recorded in sb_dqopt(sb)->files[type].
+ *
+ * Stops at the first failure.  Returns 0 on success or the negative
+ * status of the failing call, which is also logged.
+ */
+static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
+{
+        int type;
+        struct super_block *sb = osb->sb;
+        /* Feature bit to test for each quota type; indexed by type. */
+        unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+                                             OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+        int status = 0;
+        for (type = 0; type < MAXQUOTAS; type++) {
+                /* Types without the feature are left untouched. */
+                if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+                        continue;
+                if (unsuspend)
+                        status = vfs_quota_enable(
+                                        sb_dqopt(sb)->files[type],
+                                        type, QFMT_OCFS2,
+                                        DQUOT_SUSPENDED);
+                else
+                        status = vfs_quota_disable(sb, type,
+                                                   DQUOT_SUSPENDED);
+                if (status < 0)
+                        break;
+        }
+        if (status < 0)
+                mlog(ML_ERROR, "Failed to suspend/unsuspend quotas on "
+                     "remount (error = %d).\n", status);
+        return status;
+}
+/*
+ * Turn on quota usage tracking for every quota type whose RO-compat
+ * feature is set on the superblock.
+ *
+ * For each such type the local quota system file inode for
+ * osb->slot_num is looked up and passed to vfs_quota_enable() with
+ * DQUOT_USAGE_ENABLED.
+ *
+ * Returns 0 on success.  If an inode cannot be found (-ENOENT) or
+ * enabling fails, quotas are turned off again through
+ * ocfs2_disable_quotas() and the error is returned.
+ */
+static int ocfs2_enable_quotas(struct ocfs2_super *osb)
+{
+        struct inode *inode[MAXQUOTAS] = { NULL, NULL };
+        struct super_block *sb = osb->sb;
+        /* Per-type feature bit and local quota file; indexed by type. */
+        unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+                                             OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+        unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+                                        LOCAL_GROUP_QUOTA_SYSTEM_INODE };
+        int status;
+        int type;
+        sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE;
+        for (type = 0; type < MAXQUOTAS; type++) {
+                if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+                        continue;
+                inode[type] = ocfs2_get_system_file_inode(osb, ino[type],
+                                                        osb->slot_num);
+                if (!inode[type]) {
+                        status = -ENOENT;
+                        goto out_quota_off;
+                }
+                status = vfs_quota_enable(inode[type], type, QFMT_OCFS2,
+                                                DQUOT_USAGE_ENABLED);
+                if (status < 0)
+                        goto out_quota_off;
+        }
+        /*
+         * Drop the references taken by the lookups above.  Entries for
+         * types that were skipped are still NULL; this relies on iput()
+         * accepting a NULL inode.
+         */
+        for (type = 0; type < MAXQUOTAS; type++)
+                iput(inode[type]);
+        return 0;
+out_quota_off:
+        /* Undo whatever was enabled before the failure, then drop refs. */
+        ocfs2_disable_quotas(osb);
+        for (type = 0; type < MAXQUOTAS; type++)
+                iput(inode[type]);
+        mlog_errno(status);
+        return status;
+}
+/*
+ * Turn off usage tracking and limit enforcement for every quota type
+ * that currently has quota loaded on the superblock.  Failures are not
+ * reported to the caller.
+ */
+static void ocfs2_disable_quotas(struct ocfs2_super *osb)
+{
+        int type;
+        struct inode *inode;
+        struct super_block *sb = osb->sb;
+        /* We mostly ignore errors in this function because there's not much
+         * we can do when we see them */
+        for (type = 0; type < MAXQUOTAS; type++) {
+                if (!sb_has_quota_loaded(sb, type))
+                        continue;
+                /*
+                 * NOTE(review): the extra reference presumably keeps the
+                 * quota file inode alive across vfs_quota_disable() -
+                 * confirm.
+                 */
+                inode = igrab(sb->s_dquot.files[type]);
+                /* Turn off quotas. This will remove all dquot structures from
+                 * memory and so they will be automatically synced to global
+                 * quota files */
+                vfs_quota_disable(sb, type, DQUOT_USAGE_ENABLED |
+                                            DQUOT_LIMITS_ENABLED);
+                /* igrab() gave us no reference, so there is nothing to put. */
+                if (!inode)
+                        continue;
+                iput(inode);
+        }
+}
+/*
+ * Handle quota on quotactl.
+ *
+ * Enables limit enforcement (DQUOT_LIMITS_ENABLED) for 'type' on the
+ * quota file already recorded in sb_dqopt(sb)->files[type]; 'path' is
+ * not used.
+ *
+ * Returns -EINVAL if the filesystem lacks the RO-compat feature for
+ * 'type', 0 without doing anything when called for a remount, otherwise
+ * the result of vfs_quota_enable().
+ */
+static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
+                          char *path, int remount)
+{
+        /* Feature bit required for each quota type; indexed by type. */
+        unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+                                             OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+        if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+                return -EINVAL;
+        if (remount)
+                return 0;       /* Nothing to do here: remount is handled
+                                 * in ocfs2_remount() */
+        return vfs_quota_enable(sb_dqopt(sb)->files[type], type,
+                                    format_id, DQUOT_LIMITS_ENABLED);
+}
+/*
+ * Handle quota off quotactl.
+ *
+ * Disables limit enforcement (DQUOT_LIMITS_ENABLED) for 'type'.  Returns
+ * 0 without doing anything when called for a remount, otherwise the
+ * result of vfs_quota_disable().
+ */
+static int ocfs2_quota_off(struct super_block *sb, int type, int remount)
+{
+        if (remount)
+                return 0;       /* Ignore now and handle later in
+                                 * ocfs2_remount() */
+        return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED);
+}
+/*
+ * quotactl operations for ocfs2: quota on/off use the ocfs2 handlers
+ * defined in this file, every other operation uses the vfs_* helper
+ * directly.
+ */
+static struct quotactl_ops ocfs2_quotactl_ops = {
+        .quota_on       = ocfs2_quota_on,
+        .quota_off      = ocfs2_quota_off,
+        .quota_sync     = vfs_quota_sync,
+        .get_info       = vfs_get_dqinfo,
+        .set_info       = vfs_set_dqinfo,
+        .get_dqblk      = vfs_get_dqblk,
+        .set_dqblk      = vfs_set_dqblk,
+};
 static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 {
        struct dentry *root;
@@ -651,12 +834,32 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
        }
        brelse(bh);
        bh = NULL;
+        if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR))
+                parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
        osb->s_mount_opt = parsed_options.mount_opt;
        osb->s_atime_quantum = parsed_options.atime_quantum;
        osb->preferred_slot = parsed_options.slot;
        osb->osb_commit_interval = parsed_options.commit_interval;
        osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
        osb->local_alloc_bits = osb->local_alloc_default_bits;
+        if (osb->s_mount_opt & OCFS2_MOUNT_USRQUOTA &&
+            !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+                                         OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
+                status = -EINVAL;
+                mlog(ML_ERROR, "User quotas were requested, but this "
+                     "filesystem does not have the feature enabled.\n");
+                goto read_super_error;
+        }
+        if (osb->s_mount_opt & OCFS2_MOUNT_GRPQUOTA &&
+            !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+                                         OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
+                status = -EINVAL;
+                mlog(ML_ERROR, "Group quotas were requested, but this "
+                     "filesystem does not have the feature enabled.\n");
+                goto read_super_error;
+        }
        status = ocfs2_verify_userspace_stack(osb, &parsed_options);
        if (status)
@@ -664,6 +867,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
        sb->s_magic = OCFS2_SUPER_MAGIC;
+        sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+                ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
        /* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
         * heartbeat=none */
        if (bdev_read_only(sb->s_bdev)) {
@@ -758,6 +964,28 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
        atomic_set(&osb->vol_state, VOLUME_MOUNTED);
        wake_up(&osb->osb_mount_event);
+        /* Now we can initialize quotas because we can afford to wait
+         * for cluster locks recovery now. That also means that truncation
+         * log recovery can happen but that waits for proper quota setup */
+        if (!(sb->s_flags & MS_RDONLY)) {
+                status = ocfs2_enable_quotas(osb);
+                if (status < 0) {
+                        /* We have to err-out specially here because
+                         * s_root is already set */
+                        mlog_errno(status);
+                        atomic_set(&osb->vol_state, VOLUME_DISABLED);
+                        wake_up(&osb->osb_mount_event);
+                        mlog_exit(status);
+                        return status;
+                }
+        }
+        ocfs2_complete_quota_recovery(osb);
+        /* Now we wake up again for processes waiting for quotas */
+        atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS);
+        wake_up(&osb->osb_mount_event);
        mlog_exit(status);
        return status;
@@ -945,6 +1173,41 @@ static int ocfs2_parse_options(struct super_block *sb,
                case Opt_inode64:
                        mopt->mount_opt |= OCFS2_MOUNT_INODE64;
                        break;
+                case Opt_usrquota:
+                        /* We check only on remount, otherwise features
+                         * aren't yet initialized. */
+                        if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+                            OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
+                                mlog(ML_ERROR, "User quota requested but "
+                                     "filesystem feature is not set\n");
+                                status = 0;
+                                goto bail;
+                        }
+                        mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA;
+                        break;
+                case Opt_grpquota:
+                        if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+                            OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
+                                mlog(ML_ERROR, "Group quota requested but "
+                                     "filesystem feature is not set\n");
+                                status = 0;
+                                goto bail;
+                        }
+                        mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
+                        break;
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+                case Opt_acl:
+                        mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
+                        break;
+                case Opt_noacl:
+                        mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
+                        break;
+#else
+                case Opt_acl:
+                case Opt_noacl:
+                        printk(KERN_INFO "ocfs2 (no)acl options not supported\n");
+                        break;
+#endif
                default:
                        mlog(ML_ERROR,
                             "Unrecognized mount option \"%s\" "
@@ -1008,6 +1271,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
        if (osb->osb_cluster_stack[0])
                seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
                           osb->osb_cluster_stack);
+        if (opts & OCFS2_MOUNT_USRQUOTA)
+                seq_printf(s, ",usrquota");
+        if (opts & OCFS2_MOUNT_GRPQUOTA)
+                seq_printf(s, ",grpquota");
        if (opts & OCFS2_MOUNT_NOUSERXATTR)
                seq_printf(s, ",nouser_xattr");
@@ -1017,6 +1284,13 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
        if (opts & OCFS2_MOUNT_INODE64)
                seq_printf(s, ",inode64");
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+        if (opts & OCFS2_MOUNT_POSIX_ACL)
+                seq_printf(s, ",acl");
+        else
+                seq_printf(s, ",noacl");
+#endif
        return 0;
 }
@@ -1052,10 +1326,16 @@ static int __init ocfs2_init(void)
                mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
        }
+        status = ocfs2_quota_setup();
+        if (status)
+                goto leave;
        ocfs2_set_locking_protocol();
+        status = register_quota_format(&ocfs2_quota_format);
 leave:
        if (status < 0) {
+                ocfs2_quota_shutdown();
                ocfs2_free_mem_caches();
                exit_ocfs2_uptodate_cache();
        }
@@ -1072,11 +1352,15 @@ static void __exit ocfs2_exit(void)
 {
        mlog_entry_void();
+        ocfs2_quota_shutdown();
        if (ocfs2_wq) {
                flush_workqueue(ocfs2_wq);
                destroy_workqueue(ocfs2_wq);
        }
+        unregister_quota_format(&ocfs2_quota_format);
        debugfs_remove(ocfs2_debugfs_root);
        ocfs2_free_mem_caches();
@@ -1192,8 +1476,27 @@ static int ocfs2_initialize_mem_caches(void)
                                       (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
                                                SLAB_MEM_SPREAD),
                                       ocfs2_inode_init_once);
-        if (!ocfs2_inode_cachep)
+        ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache",
+                                        sizeof(struct ocfs2_dquot),
+                                        0,
+                                        (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
+                                                SLAB_MEM_SPREAD),
+                                        NULL);
+        ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache",
+                                        sizeof(struct ocfs2_quota_chunk),
+                                        0,
+                                        (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+                                        NULL);
+        if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep ||
+            !ocfs2_qf_chunk_cachep) {
+                if (ocfs2_inode_cachep)
+                        kmem_cache_destroy(ocfs2_inode_cachep);
+                if (ocfs2_dquot_cachep)
+                        kmem_cache_destroy(ocfs2_dquot_cachep);
+                if (ocfs2_qf_chunk_cachep)
+                        kmem_cache_destroy(ocfs2_qf_chunk_cachep);
                return -ENOMEM;
+        }
        return 0;
 }
@@ -1202,8 +1505,15 @@ static void ocfs2_free_mem_caches(void)
 {
        if (ocfs2_inode_cachep)
                kmem_cache_destroy(ocfs2_inode_cachep);
        ocfs2_inode_cachep = NULL;
+        if (ocfs2_dquot_cachep)
+                kmem_cache_destroy(ocfs2_dquot_cachep);
+        ocfs2_dquot_cachep = NULL;
+        if (ocfs2_qf_chunk_cachep)
+                kmem_cache_destroy(ocfs2_qf_chunk_cachep);
+        ocfs2_qf_chunk_cachep = NULL;
 }
 static int ocfs2_get_sector(struct super_block *sb,
@@ -1303,6 +1613,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
        osb = OCFS2_SB(sb);
        BUG_ON(!osb);
+        ocfs2_disable_quotas(osb);
        ocfs2_shutdown_local_alloc(osb);
        ocfs2_truncate_log_shutdown(osb);
@@ -1413,6 +1725,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
        sb->s_fs_info = osb;
        sb->s_op = &ocfs2_sops;
        sb->s_export_op = &ocfs2_export_ops;
+        sb->s_qcop = &ocfs2_quotactl_ops;
+        sb->dq_op = &ocfs2_quota_operations;
        sb->s_xattr = ocfs2_xattr_handlers;
        sb->s_time_gran = 1;
        sb->s_flags |= MS_NOATIME;
@@ -1573,6 +1887,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
        INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
        journal->j_state = OCFS2_JOURNAL_FREE;
+        INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes);
+        osb->dentry_lock_list = NULL;
        /* get some pseudo constants for clustersize bits */
        osb->s_clustersize_bits =
                le32_to_cpu(di->id2.i_super.s_clustersize_bits);
@@ -1676,6 +1993,15 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
        if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
                   strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
+                /* We have to do a raw check of the feature here */
+                if (le32_to_cpu(di->id2.i_super.s_feature_incompat) &
+                    OCFS2_FEATURE_INCOMPAT_META_ECC) {
+                        status = ocfs2_block_check_validate(bh->b_data,
+                                                            bh->b_size,
+                                                            &di->i_check);
+                        if (status)
+                                goto out;
+                }
                status = -EINVAL;
                if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) {
                        mlog(ML_ERROR, "found superblock with incorrect block "
@@ -1717,6 +2043,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
                }
        }
+out:
        mlog_exit(status);
        return status;
 }
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index cbd03dfdc7b9..ed0a0cfd68d2 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -84,7 +84,7 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode,
        mlog_entry_void();
-        status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, bh);
+        status = ocfs2_read_inode_block(inode, bh);
        if (status < 0) {
                mlog_errno(status);
                link = ERR_PTR(status);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 74d7367ade13..915039fffe6e 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -35,12 +35,14 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/string.h>
+#include <linux/security.h>
 #define MLOG_MASK_PREFIX ML_XATTR
 #include <cluster/masklog.h>
 #include "ocfs2.h"
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "file.h"
 #include "symlink.h"
@@ -61,12 +63,32 @@ struct ocfs2_xattr_def_value_root {
 };
 struct ocfs2_xattr_bucket {
-        struct buffer_head *bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET];
+        /* The inode these xattrs are associated with */
-        struct ocfs2_xattr_header *xh;
+        struct inode *bu_inode;
+        /* The actual buffers that make up the bucket */
+        struct buffer_head *bu_bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET];
+        /* How many blocks make up one bucket for this filesystem */
+        int bu_blocks;
+};
+/*
+ * State shared by the helpers that modify an xattr value: the journal
+ * handle the change runs under, the allocation contexts handed to the
+ * extent tree code, and the deferred deallocation list.
+ */
+struct ocfs2_xattr_set_ctxt {
+        /* Journal handle used for the journal access/dirty calls */
+        handle_t *handle;
+        /* Metadata allocator passed to the extent add/remove calls */
+        struct ocfs2_alloc_context *meta_ac;
+        /* Data allocator passed to ocfs2_add_clusters_in_btree() */
+        struct ocfs2_alloc_context *data_ac;
+        /* Filled by ocfs2_remove_extent()/ocfs2_cache_cluster_dealloc() */
+        struct ocfs2_cached_dealloc_ctxt dealloc;
 };
 #define OCFS2_XATTR_ROOT_SIZE   (sizeof(struct ocfs2_xattr_def_value_root))
 #define OCFS2_XATTR_INLINE_SIZE 80
+/*
+ * OCFS2_MIN_XATTR_INLINE_SIZE less one xattr header and one __u32.
+ * NOTE(review): presumably the bytes left for entries in the smallest
+ * in-inode xattr area -- confirm what the __u32 accounts for.
+ */
+#define OCFS2_XATTR_FREE_IN_IBODY       (OCFS2_MIN_XATTR_INLINE_SIZE \
+                                         - sizeof(struct ocfs2_xattr_header) \
+                                         - sizeof(__u32))
+/*
+ * One filesystem block (taken from ptr->i_sb, so ptr is an inode) less the
+ * xattr block structure, one xattr header and one __u32.
+ */
+#define OCFS2_XATTR_FREE_IN_BLOCK(ptr)  ((ptr)->i_sb->s_blocksize \
+                                         - sizeof(struct ocfs2_xattr_block) \
+                                         - sizeof(struct ocfs2_xattr_header) \
+                                         - sizeof(__u32))
 static struct ocfs2_xattr_def_value_root def_xv = {
        .xv.xr_list.l_count = cpu_to_le16(1),
@@ -74,13 +96,25 @@ static struct ocfs2_xattr_def_value_root def_xv = {
 struct xattr_handler *ocfs2_xattr_handlers[] = {
        &ocfs2_xattr_user_handler,
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+        &ocfs2_xattr_acl_access_handler,
+        &ocfs2_xattr_acl_default_handler,
+#endif
        &ocfs2_xattr_trusted_handler,
+        &ocfs2_xattr_security_handler,
        NULL
 };
 static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
        [OCFS2_XATTR_INDEX_USER]        = &ocfs2_xattr_user_handler,
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+        [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
+                                        = &ocfs2_xattr_acl_access_handler,
+        [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT]
+                                        = &ocfs2_xattr_acl_default_handler,
+#endif
        [OCFS2_XATTR_INDEX_TRUSTED]     = &ocfs2_xattr_trusted_handler,
+        [OCFS2_XATTR_INDEX_SECURITY]    = &ocfs2_xattr_security_handler,
 };
 struct ocfs2_xattr_info {
@@ -98,7 +132,7 @@ struct ocfs2_xattr_search {
         */
        struct buffer_head *xattr_bh;
        struct ocfs2_xattr_header *header;
-        struct ocfs2_xattr_bucket bucket;
+        struct ocfs2_xattr_bucket *bucket;
        void *base;
        void *end;
        struct ocfs2_xattr_entry *here;
@@ -127,14 +161,20 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
                                        size_t buffer_size);
 static int ocfs2_xattr_create_index_block(struct inode *inode,
-                                          struct ocfs2_xattr_search *xs);
+                                          struct ocfs2_xattr_search *xs,
+                                          struct ocfs2_xattr_set_ctxt *ctxt);
 static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
                                             struct ocfs2_xattr_info *xi,
-                                             struct ocfs2_xattr_search *xs);
+                                             struct ocfs2_xattr_search *xs,
+                                             struct ocfs2_xattr_set_ctxt *ctxt);
 static int ocfs2_delete_xattr_index_block(struct inode *inode,
                                          struct buffer_head *xb_bh);
+static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
+                                  u64 src_blk, u64 last_blk, u64 to_blk,
+                                  unsigned int start_bucket,
+                                  u32 *first_hash);
 static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
 {
@@ -154,6 +194,216 @@ static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
        return len / sizeof(struct ocfs2_xattr_entry);
 }
+/* Block number of the first buffer in bucket _b */
+#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr)
+/* Data pointer of the _n'th buffer in bucket _b */
+#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data)
+/* The first block's data, viewed as the bucket's xattr header */
+#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0))
+/*
+ * Allocate a zeroed bucket descriptor for @inode.  Only bu_inode and
+ * bu_blocks are filled in; the buffer_head slots stay NULL until the
+ * bucket is initialized or read.  Returns NULL if the allocation fails.
+ */
+static struct ocfs2_xattr_bucket *ocfs2_xattr_bucket_new(struct inode *inode)
+{
+        int nr_blocks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+        struct ocfs2_xattr_bucket *new_bucket;
+        /* The bu_bhs[] array cannot hold more than this many blocks */
+        BUG_ON(nr_blocks > OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET);
+        new_bucket = kzalloc(sizeof(*new_bucket), GFP_NOFS);
+        if (!new_bucket)
+                return NULL;
+        new_bucket->bu_inode = inode;
+        new_bucket->bu_blocks = nr_blocks;
+        return new_bucket;
+}
+/*
+ * Release every buffer_head held by the bucket and clear the slots.
+ * The descriptor itself is left allocated.
+ */
+static void ocfs2_xattr_bucket_relse(struct ocfs2_xattr_bucket *bucket)
+{
+        int blk = 0;
+        while (blk < bucket->bu_blocks) {
+                struct buffer_head **slot = &bucket->bu_bhs[blk++];
+                brelse(*slot);
+                *slot = NULL;
+        }
+}
+/* Release the bucket's buffers and free the descriptor; NULL is a no-op. */
+static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket)
+{
+        if (!bucket)
+                return;
+        ocfs2_xattr_bucket_relse(bucket);
+        bucket->bu_inode = NULL;
+        kfree(bucket);
+}
+/*
+ * Set up the buffer_heads of a bucket that has never been written to
+ * disk.  Nothing is read: each block is obtained with sb_getblk() and,
+ * unless the inode already tracks it as uptodate, registered as a new
+ * uptodate buffer.  Buckets that already exist on disk must be loaded
+ * with ocfs2_read_xattr_bucket() instead, which initializes them fully.
+ *
+ * Returns 0 on success.  If a buffer cannot be obtained, every buffer
+ * taken so far is released and -EIO is returned.
+ */
+static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
+                                   u64 xb_blkno)
+{
+        int blk, rc;
+        struct inode *inode = bucket->bu_inode;
+        struct buffer_head *bh;
+        for (blk = 0; blk < bucket->bu_blocks; blk++) {
+                bh = sb_getblk(inode->i_sb, xb_blkno + blk);
+                bucket->bu_bhs[blk] = bh;
+                if (!bh) {
+                        rc = -EIO;
+                        mlog_errno(rc);
+                        ocfs2_xattr_bucket_relse(bucket);
+                        return rc;
+                }
+                if (!ocfs2_buffer_uptodate(inode, bh))
+                        ocfs2_set_new_buffer_uptodate(inode, bh);
+        }
+        return 0;
+}
+/*
+ * Read the on-disk bucket starting at xb_blkno into bucket->bu_bhs and
+ * validate its metadata ECC against the xh_check field of the bucket
+ * header.  On any failure the buffers are released and the error is
+ * returned; 0 means the bucket is loaded and valid.
+ */
+static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
+                                   u64 xb_blkno)
+{
+        int status;
+        status = ocfs2_read_blocks(bucket->bu_inode, xb_blkno,
+                                   bucket->bu_blocks, bucket->bu_bhs, 0,
+                                   NULL);
+        if (status)
+                goto release;
+        status = ocfs2_validate_meta_ecc_bhs(bucket->bu_inode->i_sb,
+                                             bucket->bu_bhs,
+                                             bucket->bu_blocks,
+                                             &bucket_xh(bucket)->xh_check);
+        if (!status)
+                return 0;
+        mlog_errno(status);
+release:
+        ocfs2_xattr_bucket_relse(bucket);
+        return status;
+}
+/*
+ * Request journal access of the given type for every block of the
+ * bucket.  Stops at, logs and returns the first error; returns 0 when
+ * all blocks were accepted.
+ */
+static int ocfs2_xattr_bucket_journal_access(handle_t *handle,
+                                             struct ocfs2_xattr_bucket *bucket,
+                                             int type)
+{
+        int blk;
+        for (blk = 0; blk < bucket->bu_blocks; blk++) {
+                int err = ocfs2_journal_access(handle, bucket->bu_inode,
+                                               bucket->bu_bhs[blk], type);
+                if (err) {
+                        mlog_errno(err);
+                        return err;
+                }
+        }
+        return 0;
+}
+/*
+ * Recompute the bucket's metadata ECC into the header's xh_check field,
+ * then mark each of the bucket's blocks dirty in the journal.
+ */
+static void ocfs2_xattr_bucket_journal_dirty(handle_t *handle,
+                                             struct ocfs2_xattr_bucket *bucket)
+{
+        int blk = 0;
+        struct super_block *sb = bucket->bu_inode->i_sb;
+        ocfs2_compute_meta_ecc_bhs(sb, bucket->bu_bhs, bucket->bu_blocks,
+                                   &bucket_xh(bucket)->xh_check);
+        while (blk < bucket->bu_blocks) {
+                ocfs2_journal_dirty(handle, bucket->bu_bhs[blk]);
+                blk++;
+        }
+}
+/*
+ * Copy the contents of every block of @src into the matching block of
+ * @dest.  Both buckets must belong to the same inode and have the same
+ * number of blocks; anything else is a BUG.
+ */
+static void ocfs2_xattr_bucket_copy_data(struct ocfs2_xattr_bucket *dest,
+                                         struct ocfs2_xattr_bucket *src)
+{
+        int blk = 0;
+        int nbytes = src->bu_inode->i_sb->s_blocksize;
+        BUG_ON(src->bu_blocks != dest->bu_blocks);
+        BUG_ON(src->bu_inode != dest->bu_inode);
+        while (blk < src->bu_blocks) {
+                memcpy(bucket_block(dest, blk), bucket_block(src, blk),
+                       nbytes);
+                blk++;
+        }
+}
+/*
+ * Validation callback that ocfs2_read_xattr_block() hands to
+ * ocfs2_read_block().  Checks, in order: the metadata ECC, the block
+ * signature, that xb_blkno matches the block actually read, and that
+ * xb_fs_generation matches the mounted filesystem.
+ *
+ * Returns 0 if every check passes, the ECC error if that check fails,
+ * or -EINVAL (after reporting through ocfs2_error()) for the others.
+ */
+static int ocfs2_validate_xattr_block(struct super_block *sb,
+                                      struct buffer_head *bh)
+{
+        int rc;
+        struct ocfs2_xattr_block *xb =
+                (struct ocfs2_xattr_block *)bh->b_data;
+        mlog(0, "Validating xattr block %llu\n",
+             (unsigned long long)bh->b_blocknr);
+        /* The caller must hand us a buffer whose contents are valid */
+        BUG_ON(!buffer_uptodate(bh));
+        /*
+         * If the ecc fails, we return the error but otherwise
+         * leave the filesystem running.  We know any error is
+         * local to this block.
+         */
+        rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &xb->xb_check);
+        if (rc)
+                return rc;
+        /*
+         * Errors after here are fatal
+         */
+        if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
+                ocfs2_error(sb,
+                            "Extended attribute block #%llu has bad "
+                            "signature %.*s",
+                            (unsigned long long)bh->b_blocknr, 7,
+                            xb->xb_signature);
+                return -EINVAL;
+        }
+        /* The block must record the same location it was read from */
+        if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) {
+                ocfs2_error(sb,
+                            "Extended attribute block #%llu has an "
+                            "invalid xb_blkno of %llu",
+                            (unsigned long long)bh->b_blocknr,
+                            (unsigned long long)le64_to_cpu(xb->xb_blkno));
+                return -EINVAL;
+        }
+        /* The block's generation must match this filesystem's generation */
+        if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) {
+                ocfs2_error(sb,
+                            "Extended attribute block #%llu has an invalid "
+                            "xb_fs_generation of #%u",
+                            (unsigned long long)bh->b_blocknr,
+                            le32_to_cpu(xb->xb_fs_generation));
+                return -EINVAL;
+        }
+        return 0;
+}
+/*
+ * Read the xattr block at xb_blkno and validate it with
+ * ocfs2_validate_xattr_block().  If the caller passed *bh == NULL, the
+ * buffer obtained by a successful read is stored in *bh; a buffer the
+ * caller supplied is left in place.  Returns 0 or the read/validation
+ * error.
+ */
+static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno,
+                                  struct buffer_head **bh)
+{
+        struct buffer_head *read_bh = *bh;
+        int status = ocfs2_read_block(inode, xb_blkno, &read_bh,
+                                      ocfs2_validate_xattr_block);
+        if (status)
+                return status;
+        /* Only hand the buffer up when the caller did not bring one */
+        if (*bh == NULL)
+                *bh = read_bh;
+        return 0;
+}
 static inline const char *ocfs2_xattr_prefix(int name_index)
 {
        struct xattr_handler *handler = NULL;
@@ -200,54 +450,163 @@ static void ocfs2_xattr_hash_entry(struct inode *inode,
        return;
 }
+/*
+ * Bytes taken by one xattr: the entry structure, OCFS2_XATTR_SIZE() of
+ * the name, and either OCFS2_XATTR_SIZE() of the value (when the value
+ * is at most OCFS2_XATTR_INLINE_SIZE bytes) or OCFS2_XATTR_ROOT_SIZE
+ * for a larger value.
+ */
+static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len)
+{
+        int total = OCFS2_XATTR_SIZE(name_len);
+        if (value_len > OCFS2_XATTR_INLINE_SIZE)
+                total += OCFS2_XATTR_ROOT_SIZE;
+        else
+                total += OCFS2_XATTR_SIZE(value_len);
+        total += sizeof(struct ocfs2_xattr_entry);
+        return total;
+}
+/*
+ * Accumulate the reservations needed to store the security xattr
+ * described by @si on the filesystem @dir lives on.
+ *
+ * A metadata block is reserved through @xattr_ac (and
+ * OCFS2_XATTR_BLOCK_CREATE_CREDITS added to *xattr_credits) when the
+ * block size is OCFS2_MIN_BLOCKSIZE or the entry exceeds
+ * OCFS2_XATTR_FREE_IN_IBODY.  A value larger than
+ * OCFS2_XATTR_INLINE_SIZE additionally adds its clusters to
+ * *want_clusters and the matching block count to *xattr_credits.
+ *
+ * Returns 0, or the error from the metadata reservation.
+ */
+int ocfs2_calc_security_init(struct inode *dir,
+                             struct ocfs2_security_xattr_info *si,
+                             int *want_clusters,
+                             int *xattr_credits,
+                             struct ocfs2_alloc_context **xattr_ac)
+{
+        struct super_block *sb = dir->i_sb;
+        struct ocfs2_super *osb = OCFS2_SB(sb);
+        int entry_size = ocfs2_xattr_entry_real_size(strlen(si->name),
+                                                     si->value_len);
+        int status;
+        /*
+         * The max space of security xattr taken inline is
+         * 256(name) + 80(value) + 16(entry) = 352 bytes,
+         * so reserving one metadata block for it is enough.
+         */
+        if (sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
+            entry_size > OCFS2_XATTR_FREE_IN_IBODY) {
+                status = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac);
+                if (status) {
+                        mlog_errno(status);
+                        return status;
+                }
+                *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
+        }
+        /* Large values are stored in a B tree and need their own clusters */
+        if (si->value_len > OCFS2_XATTR_INLINE_SIZE) {
+                int value_clusters = ocfs2_clusters_for_bytes(sb,
+                                                              si->value_len);
+                *xattr_credits += ocfs2_clusters_to_blocks(sb,
+                                                           value_clusters);
+                *want_clusters += value_clusters;
+        }
+        return 0;
+}
+/*
+ * Accumulate the reservations needed for the initial xattrs: the
+ * security xattr described by @si (when si->enable is set) and, on
+ * mounts with OCFS2_MOUNT_POSIX_ACL, ACLs sized from the length of
+ * @dir's default ACL xattr.
+ *
+ * @dir:           directory inode whose superblock and default ACL are used
+ * @dir_bh:        passed with @dir to ocfs2_xattr_get_nolock()
+ * @mode:          only S_ISDIR(mode) is examined; a directory needs room
+ *                 for two ACLs (DEFAULT and ACCESS)
+ * @si:            security xattr description (enable, name, value_len)
+ * @want_clusters: incremented by the number of clusters required
+ * @xattr_credits: incremented by the journal credits required
+ * @xattr_ac:      receives the metadata reservation, if one is made
+ *
+ * Returns 0 on success, or a negative error from the default ACL lookup
+ * (anything other than -ENODATA) or from the metadata reservation.
+ */
+int ocfs2_calc_xattr_init(struct inode *dir,
+                          struct buffer_head *dir_bh,
+                          int mode,
+                          struct ocfs2_security_xattr_info *si,
+                          int *want_clusters,
+                          int *xattr_credits,
+                          struct ocfs2_alloc_context **xattr_ac)
+{
+        int ret = 0;
+        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+        int s_size = 0, a_size = 0, acl_len = 0, new_clusters;
+        if (si->enable)
+                s_size = ocfs2_xattr_entry_real_size(strlen(si->name),
+                                                     si->value_len);
+        if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
+                /* A NULL buffer of size 0 asks only for the value length */
+                acl_len = ocfs2_xattr_get_nolock(dir, dir_bh,
+                                        OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT,
+                                        "", NULL, 0);
+                if (acl_len > 0) {
+                        a_size = ocfs2_xattr_entry_real_size(0, acl_len);
+                        if (S_ISDIR(mode))
+                                a_size <<= 1;
+                } else if (acl_len != 0 && acl_len != -ENODATA) {
+                        /*
+                         * A real lookup failure: report it to the caller
+                         * instead of returning the still-zero ret.
+                         */
+                        ret = acl_len;
+                        mlog_errno(ret);
+                        return ret;
+                }
+        }
+        /* Nothing to store, so nothing to reserve */
+        if (!(s_size + a_size))
+                return ret;
+        /*
+         * The max space of security xattr taken inline is
+         * 256(name) + 80(value) + 16(entry) = 352 bytes,
+         * The max space of acl xattr taken inline is
+         * (80(value) + 16(entry)) * 2(if directory) = 192 bytes,
+         * when blocksize = 512, may reserve one more cluster for
+         * xattr bucket, otherwise reserve one metadata block
+         * for them is ok.
+         */
+        if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
+            (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) {
+                ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac);
+                if (ret) {
+                        mlog_errno(ret);
+                        return ret;
+                }
+                *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
+        }
+        if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE &&
+            (s_size + a_size) > OCFS2_XATTR_FREE_IN_BLOCK(dir)) {
+                *want_clusters += 1;
+                *xattr_credits += ocfs2_blocks_per_xattr_bucket(dir->i_sb);
+        }
+        /*
+         * reserve credits and clusters for xattrs which has large value
+         * and have to be set outside
+         */
+        if (si->enable && si->value_len > OCFS2_XATTR_INLINE_SIZE) {
+                new_clusters = ocfs2_clusters_for_bytes(dir->i_sb,
+                                                        si->value_len);
+                *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
+                                                           new_clusters);
+                *want_clusters += new_clusters;
+        }
+        if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) &&
+            acl_len > OCFS2_XATTR_INLINE_SIZE) {
+                /* for directory, it has DEFAULT and ACCESS two types of acls */
+                new_clusters = (S_ISDIR(mode) ? 2 : 1) *
+                                ocfs2_clusters_for_bytes(dir->i_sb, acl_len);
+                *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
+                                                           new_clusters);
+                *want_clusters += new_clusters;
+        }
+        return ret;
+}
 static int ocfs2_xattr_extend_allocation(struct inode *inode,
                                         u32 clusters_to_add,
-                                         struct buffer_head *xattr_bh,
+                                         struct ocfs2_xattr_value_buf *vb,
-                                         struct ocfs2_xattr_value_root *xv)
+                                         struct ocfs2_xattr_set_ctxt *ctxt)
 {
        int status = 0;
-        int restart_func = 0;
+        handle_t *handle = ctxt->handle;
-        int credits = 0;
-        handle_t *handle = NULL;
-        struct ocfs2_alloc_context *data_ac = NULL;
-        struct ocfs2_alloc_context *meta_ac = NULL;
        enum ocfs2_alloc_restarted why;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-        u32 prev_clusters, logical_start = le32_to_cpu(xv->xr_clusters);
+        u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters);
        struct ocfs2_extent_tree et;
        mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
-        ocfs2_init_xattr_value_extent_tree(&et, inode, xattr_bh, xv);
+        ocfs2_init_xattr_value_extent_tree(&et, inode, vb);
-restart_all:
-        status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
-                                       &data_ac, &meta_ac);
-        if (status) {
-                mlog_errno(status);
-                goto leave;
-        }
-        credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
-                                            clusters_to_add);
-        handle = ocfs2_start_trans(osb, credits);
-        if (IS_ERR(handle)) {
-                status = PTR_ERR(handle);
-                handle = NULL;
-                mlog_errno(status);
-                goto leave;
-        }
-restarted_transaction:
+        status = vb->vb_access(handle, inode, vb->vb_bh,
-        status = ocfs2_journal_access(handle, inode, xattr_bh,
+                              OCFS2_JOURNAL_ACCESS_WRITE);
-                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
        }
-        prev_clusters = le32_to_cpu(xv->xr_clusters);
+        prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
        status = ocfs2_add_clusters_in_btree(osb,
                                             inode,
                                             &logical_start,
@@ -255,157 +614,84 @@ restarted_transaction:
                                             0,
                                             &et,
                                             handle,
-                                             data_ac,
+                                             ctxt->data_ac,
-                                             meta_ac,
+                                             ctxt->meta_ac,
                                             &why);
-        if ((status < 0) && (status != -EAGAIN)) {
+        if (status < 0) {
-                if (status != -ENOSPC)
+                mlog_errno(status);
-                        mlog_errno(status);
                goto leave;
        }
-        status = ocfs2_journal_dirty(handle, xattr_bh);
+        status = ocfs2_journal_dirty(handle, vb->vb_bh);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
        }
-        clusters_to_add -= le32_to_cpu(xv->xr_clusters) - prev_clusters;
+        clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters;
-        if (why != RESTART_NONE && clusters_to_add) {
+        /*
-                if (why == RESTART_META) {
+         * We should have already allocated enough space before the transaction,
-                        mlog(0, "restarting function.\n");
+         * so no need to restart.
-                        restart_func = 1;
+         */
-                } else {
+        BUG_ON(why != RESTART_NONE || clusters_to_add);
-                        BUG_ON(why != RESTART_TRANS);
-                        mlog(0, "restarting transaction.\n");
-                        /* TODO: This can be more intelligent. */
-                        credits = ocfs2_calc_extend_credits(osb->sb,
-                                                            et.et_root_el,
-                                                            clusters_to_add);
-                        status = ocfs2_extend_trans(handle, credits);
-                        if (status < 0) {
-                                /* handle still has to be committed at
-                                 * this point. */
-                                status = -ENOMEM;
-                                mlog_errno(status);
-                                goto leave;
-                        }
-                        goto restarted_transaction;
-                }
-        }
 leave:
-        if (handle) {
-                ocfs2_commit_trans(osb, handle);
-                handle = NULL;
-        }
-        if (data_ac) {
-                ocfs2_free_alloc_context(data_ac);
-                data_ac = NULL;
-        }
-        if (meta_ac) {
-                ocfs2_free_alloc_context(meta_ac);
-                meta_ac = NULL;
-        }
-        if ((!status) && restart_func) {
-                restart_func = 0;
-                goto restart_all;
-        }
        return status;
 }
 static int __ocfs2_remove_xattr_range(struct inode *inode,
-                                      struct buffer_head *root_bh,
+                                      struct ocfs2_xattr_value_buf *vb,
-                                      struct ocfs2_xattr_value_root *xv,
                                      u32 cpos, u32 phys_cpos, u32 len,
-                                      struct ocfs2_cached_dealloc_ctxt *dealloc)
+                                      struct ocfs2_xattr_set_ctxt *ctxt)
 {
        int ret;
        u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        handle_t *handle = ctxt->handle;
-        struct inode *tl_inode = osb->osb_tl_inode;
-        handle_t *handle;
-        struct ocfs2_alloc_context *meta_ac = NULL;
        struct ocfs2_extent_tree et;
-        ocfs2_init_xattr_value_extent_tree(&et, inode, root_bh, xv);
+        ocfs2_init_xattr_value_extent_tree(&et, inode, vb);
-        ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
+        ret = vb->vb_access(handle, inode, vb->vb_bh,
+                            OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
-                return ret;
-        }
-        mutex_lock(&tl_inode->i_mutex);
-        if (ocfs2_truncate_log_needs_flush(osb)) {
-                ret = __ocfs2_flush_truncate_log(osb);
-                if (ret < 0) {
-                        mlog_errno(ret);
-                        goto out;
-                }
-        }
-        handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
-        if (IS_ERR(handle)) {
-                ret = PTR_ERR(handle);
-                mlog_errno(ret);
                goto out;
        }
-        ret = ocfs2_journal_access(handle, inode, root_bh,
+        ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, ctxt->meta_ac,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                  &ctxt->dealloc);
-        if (ret) {
-                mlog_errno(ret);
-                goto out_commit;
-        }
-        ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
-                                  dealloc);
        if (ret) {
                mlog_errno(ret);
-                goto out_commit;
+                goto out;
        }
-        le32_add_cpu(&xv->xr_clusters, -len);
+        le32_add_cpu(&vb->vb_xv->xr_clusters, -len);
-        ret = ocfs2_journal_dirty(handle, root_bh);
+        ret = ocfs2_journal_dirty(handle, vb->vb_bh);
        if (ret) {
                mlog_errno(ret);
-                goto out_commit;
+                goto out;
        }
-        ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
+        ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc, phys_blkno, len);
        if (ret)
                mlog_errno(ret);
-out_commit:
-        ocfs2_commit_trans(osb, handle);
 out:
-        mutex_unlock(&tl_inode->i_mutex);
-        if (meta_ac)
-                ocfs2_free_alloc_context(meta_ac);
        return ret;
 }
 static int ocfs2_xattr_shrink_size(struct inode *inode,
                                   u32 old_clusters,
                                   u32 new_clusters,
-                                   struct buffer_head *root_bh,
+                                   struct ocfs2_xattr_value_buf *vb,
-                                   struct ocfs2_xattr_value_root *xv)
+                                   struct ocfs2_xattr_set_ctxt *ctxt)
 {
        int ret = 0;
        u32 trunc_len, cpos, phys_cpos, alloc_size;
        u64 block;
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-        struct ocfs2_cached_dealloc_ctxt dealloc;
-        ocfs2_init_dealloc_ctxt(&dealloc);
        if (old_clusters <= new_clusters)
                return 0;
@@ -414,7 +700,8 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
        trunc_len = old_clusters - new_clusters;
        while (trunc_len) {
                ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos,
-                                               &alloc_size, &xv->xr_list);
+                                               &alloc_size,
+                                               &vb->vb_xv->xr_list);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -423,9 +710,9 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
                if (alloc_size > trunc_len)
                        alloc_size = trunc_len;
-                ret = __ocfs2_remove_xattr_range(inode, root_bh, xv, cpos,
+                ret = __ocfs2_remove_xattr_range(inode, vb, cpos,
                                                 phys_cpos, alloc_size,
-                                                 &dealloc);
+                                                 ctxt);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -439,20 +726,17 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
        }
 out:
-        ocfs2_schedule_truncate_log_flush(osb, 1);
-        ocfs2_run_deallocs(osb, &dealloc);
        return ret;
 }
 static int ocfs2_xattr_value_truncate(struct inode *inode,
-                                      struct buffer_head *root_bh,
+                                      struct ocfs2_xattr_value_buf *vb,
-                                      struct ocfs2_xattr_value_root *xv,
+                                      int len,
-                                      int len)
+                                      struct ocfs2_xattr_set_ctxt *ctxt)
 {
        int ret;
        u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len);
-        u32 old_clusters = le32_to_cpu(xv->xr_clusters);
+        u32 old_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
        if (new_clusters == old_clusters)
                return 0;
@@ -460,11 +744,11 @@ static int ocfs2_xattr_value_truncate(struct inode *inode,
        if (new_clusters > old_clusters)
                ret = ocfs2_xattr_extend_allocation(inode,
                                                    new_clusters - old_clusters,
-                                                    root_bh, xv);
+                                                    vb, ctxt);
        else
                ret = ocfs2_xattr_shrink_size(inode,
                                              old_clusters, new_clusters,
-                                              root_bh, xv);
+                                              vb, ctxt);
        return ret;
 }
@@ -554,18 +838,14 @@ static int ocfs2_xattr_block_list(struct inode *inode,
        if (!di->i_xattr_loc)
                return ret;
-        ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
+        ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
+                                     &blk_bh);
        if (ret < 0) {
                mlog_errno(ret);
                return ret;
        }
        xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
-        if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
-                ret = -EIO;
-                goto cleanup;
-        }
        if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
                struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
                ret = ocfs2_xattr_list_entries(inode, header,
@@ -575,7 +855,7 @@ static int ocfs2_xattr_block_list(struct inode *inode,
                ret = ocfs2_xattr_tree_list_index_block(inode, xt,
                                                   buffer, buffer_size);
        }
-cleanup:
        brelse(blk_bh);
        return ret;
@@ -685,7 +965,7 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode,
                blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
                /* Copy ocfs2_xattr_value */
                for (i = 0; i < num_clusters * bpc; i++, blkno++) {
-                        ret = ocfs2_read_block(inode, blkno, &bh);
+                        ret = ocfs2_read_block(inode, blkno, &bh, NULL);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
@@ -769,7 +1049,12 @@ static int ocfs2_xattr_block_get(struct inode *inode,
        size_t size;
        int ret = -ENODATA, name_offset, name_len, block_off, i;
-        memset(&xs->bucket, 0, sizeof(xs->bucket));
+        xs->bucket = ocfs2_xattr_bucket_new(inode);
+        if (!xs->bucket) {
+                ret = -ENOMEM;
+                mlog_errno(ret);
+                goto cleanup;
+        }
        ret = ocfs2_xattr_block_find(inode, name_index, name, xs);
        if (ret) {
@@ -795,11 +1080,11 @@ static int ocfs2_xattr_block_get(struct inode *inode,
                if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
                        ret = ocfs2_xattr_bucket_get_name_value(inode,
-                                                                xs->bucket.xh,
+                                                                bucket_xh(xs->bucket),
                                                                i,
                                                                &block_off,
                                                                &name_offset);
-                        xs->base = xs->bucket.bhs[block_off]->b_data;
+                        xs->base = bucket_block(xs->bucket, block_off);
                }
                if (ocfs2_xattr_is_local(xs->here)) {
                        memcpy(buffer, (void *)xs->base +
@@ -817,21 +1102,15 @@ static int ocfs2_xattr_block_get(struct inode *inode,
        }
        ret = size;
 cleanup:
-        for (i = 0; i < OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET; i++)
+        ocfs2_xattr_bucket_free(xs->bucket);
-                brelse(xs->bucket.bhs[i]);
-        memset(&xs->bucket, 0, sizeof(xs->bucket));
        brelse(xs->xattr_bh);
        xs->xattr_bh = NULL;
        return ret;
 }
-/* ocfs2_xattr_get()
+int ocfs2_xattr_get_nolock(struct inode *inode,
- *
+                           struct buffer_head *di_bh,
- * Copy an extended attribute into the buffer provided.
- * Buffer is NULL to compute the size of buffer required.
- */
-static int ocfs2_xattr_get(struct inode *inode,
                           int name_index,
                           const char *name,
                           void *buffer,
@@ -839,7 +1118,6 @@ static int ocfs2_xattr_get(struct inode *inode,
 {
        int ret;
        struct ocfs2_dinode *di = NULL;
-        struct buffer_head *di_bh = NULL;
        struct ocfs2_inode_info *oi = OCFS2_I(inode);
        struct ocfs2_xattr_search xis = {
                .not_found = -ENODATA,
@@ -854,11 +1132,6 @@ static int ocfs2_xattr_get(struct inode *inode,
        if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
                ret = -ENODATA;
-        ret = ocfs2_inode_lock(inode, &di_bh, 0);
-        if (ret < 0) {
-                mlog_errno(ret);
-                return ret;
-        }
        xis.inode_bh = xbs.inode_bh = di_bh;
        di = (struct ocfs2_dinode *)di_bh->b_data;
@@ -869,6 +1142,32 @@ static int ocfs2_xattr_get(struct inode *inode,
                ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
                                            buffer_size, &xbs);
        up_read(&oi->ip_xattr_sem);
+        return ret;
+}
+/* ocfs2_xattr_get()
+ *
+ * Copy an extended attribute into the buffer provided.
+ * Buffer is NULL to compute the size of buffer required.
+ */
+static int ocfs2_xattr_get(struct inode *inode,
+                           int name_index,
+                           const char *name,
+                           void *buffer,
+                           size_t buffer_size)
+{
+        int ret;
+        struct buffer_head *di_bh = NULL;
+        ret = ocfs2_inode_lock(inode, &di_bh, 0);
+        if (ret < 0) {
+                mlog_errno(ret);
+                return ret;
+        }
+        ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
+                                     name, buffer, buffer_size);
        ocfs2_inode_unlock(inode, 0);
        brelse(di_bh);
@@ -877,44 +1176,36 @@ static int ocfs2_xattr_get(struct inode *inode,
 }
 static int __ocfs2_xattr_set_value_outside(struct inode *inode,
+                                           handle_t *handle,
                                           struct ocfs2_xattr_value_root *xv,
                                           const void *value,
                                           int value_len)
 {
-        int ret = 0, i, cp_len, credits;
+        int ret = 0, i, cp_len;
        u16 blocksize = inode->i_sb->s_blocksize;
        u32 p_cluster, num_clusters;
        u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
        u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len);
        u64 blkno;
        struct buffer_head *bh = NULL;
-        handle_t *handle;
        BUG_ON(clusters > le32_to_cpu(xv->xr_clusters));
-        credits = clusters * bpc;
-        handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), credits);
-        if (IS_ERR(handle)) {
-                ret = PTR_ERR(handle);
-                mlog_errno(ret);
-                goto out;
-        }
        while (cpos < clusters) {
                ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
                                               &num_clusters, &xv->xr_list);
                if (ret) {
                        mlog_errno(ret);
-                        goto out_commit;
+                        goto out;
                }
                blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
                for (i = 0; i < num_clusters * bpc; i++, blkno++) {
-                        ret = ocfs2_read_block(inode, blkno, &bh);
+                        ret = ocfs2_read_block(inode, blkno, &bh, NULL);
                        if (ret) {
                                mlog_errno(ret);
-                                goto out_commit;
+                                goto out;
                        }
                        ret = ocfs2_journal_access(handle,
@@ -923,7 +1214,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
                                                   OCFS2_JOURNAL_ACCESS_WRITE);
                        if (ret < 0) {
                                mlog_errno(ret);
-                                goto out_commit;
+                                goto out;
                        }
                        cp_len = value_len > blocksize ? blocksize : value_len;
@@ -937,7 +1228,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
                        ret = ocfs2_journal_dirty(handle, bh);
                        if (ret < 0) {
                                mlog_errno(ret);
-                                goto out_commit;
+                                goto out;
                        }
                        brelse(bh);
                        bh = NULL;
@@ -951,8 +1242,6 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
                }
                cpos += num_clusters;
        }
-out_commit:
-        ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
        brelse(bh);
@@ -960,28 +1249,22 @@ out:
 }
 static int ocfs2_xattr_cleanup(struct inode *inode,
+                               handle_t *handle,
                               struct ocfs2_xattr_info *xi,
                               struct ocfs2_xattr_search *xs,
+                               struct ocfs2_xattr_value_buf *vb,
                               size_t offs)
 {
-        handle_t *handle = NULL;
        int ret = 0;
        size_t name_len = strlen(xi->name);
        void *val = xs->base + offs;
        size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
-        handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
+        ret = vb->vb_access(handle, inode, vb->vb_bh,
-                                   OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
+                            OCFS2_JOURNAL_ACCESS_WRITE);
-        if (IS_ERR(handle)) {
-                ret = PTR_ERR(handle);
-                mlog_errno(ret);
-                goto out;
-        }
-        ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
-                goto out_commit;
+                goto out;
        }
        /* Decrease xattr count */
        le16_add_cpu(&xs->header->xh_count, -1);
@@ -989,35 +1272,27 @@ static int ocfs2_xattr_cleanup(struct inode *inode,
        memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry));
        memset(val, 0, size);
-        ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
+        ret = ocfs2_journal_dirty(handle, vb->vb_bh);
        if (ret < 0)
                mlog_errno(ret);
-out_commit:
-        ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
        return ret;
 }
 static int ocfs2_xattr_update_entry(struct inode *inode,
+                                    handle_t *handle,
                                    struct ocfs2_xattr_info *xi,
                                    struct ocfs2_xattr_search *xs,
+                                    struct ocfs2_xattr_value_buf *vb,
                                    size_t offs)
 {
-        handle_t *handle = NULL;
+        int ret;
-        int ret = 0;
-        handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
+        ret = vb->vb_access(handle, inode, vb->vb_bh,
-                                   OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
+                            OCFS2_JOURNAL_ACCESS_WRITE);
-        if (IS_ERR(handle)) {
-                ret = PTR_ERR(handle);
-                mlog_errno(ret);
-                goto out;
-        }
-        ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
-                goto out_commit;
+                goto out;
        }
        xs->here->xe_name_offset = cpu_to_le16(offs);
@@ -1028,11 +1303,9 @@ static int ocfs2_xattr_update_entry(struct inode *inode,
                ocfs2_xattr_set_local(xs->here, 0);
        ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
-        ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
+        ret = ocfs2_journal_dirty(handle, vb->vb_bh);
        if (ret < 0)
                mlog_errno(ret);
-out_commit:
-        ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
        return ret;
 }
@@ -1045,6 +1318,8 @@ out:
 static int ocfs2_xattr_set_value_outside(struct inode *inode,
                                         struct ocfs2_xattr_info *xi,
                                         struct ocfs2_xattr_search *xs,
+                                         struct ocfs2_xattr_set_ctxt *ctxt,
+                                         struct ocfs2_xattr_value_buf *vb,
                                         size_t offs)
 {
        size_t name_len = strlen(xi->name);
@@ -1062,20 +1337,20 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode,
        xv->xr_list.l_tree_depth = 0;
        xv->xr_list.l_count = cpu_to_le16(1);
        xv->xr_list.l_next_free_rec = 0;
+        vb->vb_xv = xv;
-        ret = ocfs2_xattr_value_truncate(inode, xs->xattr_bh, xv,
+        ret = ocfs2_xattr_value_truncate(inode, vb, xi->value_len, ctxt);
-                                         xi->value_len);
        if (ret < 0) {
                mlog_errno(ret);
                return ret;
        }
-        ret = __ocfs2_xattr_set_value_outside(inode, xv, xi->value,
+        ret = ocfs2_xattr_update_entry(inode, ctxt->handle, xi, xs, vb, offs);
-                                              xi->value_len);
        if (ret < 0) {
                mlog_errno(ret);
                return ret;
        }
-        ret = ocfs2_xattr_update_entry(inode, xi, xs, offs);
+        ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb->vb_xv,
+                                              xi->value, xi->value_len);
        if (ret < 0)
                mlog_errno(ret);
@@ -1195,6 +1470,7 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode,
 static int ocfs2_xattr_set_entry(struct inode *inode,
                                 struct ocfs2_xattr_info *xi,
                                 struct ocfs2_xattr_search *xs,
+                                 struct ocfs2_xattr_set_ctxt *ctxt,
                                 int flag)
 {
        struct ocfs2_xattr_entry *last;
@@ -1202,7 +1478,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
        size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name);
        size_t size_l = 0;
-        handle_t *handle = NULL;
+        handle_t *handle = ctxt->handle;
        int free, i, ret;
        struct ocfs2_xattr_info xi_l = {
                .name_index = xi->name_index,
@@ -1210,6 +1486,16 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
                .value = xi->value,
                .value_len = xi->value_len,
        };
+        struct ocfs2_xattr_value_buf vb = {
+                .vb_bh = xs->xattr_bh,
+                .vb_access = ocfs2_journal_access_di,
+        };
+        if (!(flag & OCFS2_INLINE_XATTR_FL)) {
+                BUG_ON(xs->xattr_bh == xs->inode_bh);
+                vb.vb_access = ocfs2_journal_access_xb;
+        } else
+                BUG_ON(xs->xattr_bh != xs->inode_bh);
        /* Compute min_offs, last and free space. */
        last = xs->header->xh_entries;
@@ -1265,15 +1551,14 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
                if (ocfs2_xattr_is_local(xs->here) && size == size_l) {
                        /* Replace existing local xattr with tree root */
                        ret = ocfs2_xattr_set_value_outside(inode, xi, xs,
-                                                            offs);
+                                                            ctxt, &vb, offs);
                        if (ret < 0)
                                mlog_errno(ret);
                        goto out;
                } else if (!ocfs2_xattr_is_local(xs->here)) {
                        /* For existing xattr which has value outside */
-                        struct ocfs2_xattr_value_root *xv = NULL;
+                        vb.vb_xv = (struct ocfs2_xattr_value_root *)
-                        xv = (struct ocfs2_xattr_value_root *)(val +
+                                (val + OCFS2_XATTR_SIZE(name_len));
-                                OCFS2_XATTR_SIZE(name_len));
                        if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
                                /*
@@ -1282,27 +1567,30 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
                                 * then set new value with set_value_outside().
                                 */
                                ret = ocfs2_xattr_value_truncate(inode,
-                                                                 xs->xattr_bh,
+                                                                 &vb,
-                                                                 xv,
+                                                                 xi->value_len,
-                                                                 xi->value_len);
+                                                                 ctxt);
                                if (ret < 0) {
                                        mlog_errno(ret);
                                        goto out;
                                }
-                                ret = __ocfs2_xattr_set_value_outside(inode,
+                                ret = ocfs2_xattr_update_entry(inode,
-                                                                xv,
+                                                               handle,
-                                                                xi->value,
+                                                               xi,
-                                                                xi->value_len);
+                                                               xs,
+                                                               &vb,
+                                                               offs);
                                if (ret < 0) {
                                        mlog_errno(ret);
                                        goto out;
                                }
-                                ret = ocfs2_xattr_update_entry(inode,
+                                ret = __ocfs2_xattr_set_value_outside(inode,
-                                                               xi,
+                                                                handle,
-                                                               xs,
+                                                                vb.vb_xv,
-                                                               offs);
+                                                                xi->value,
+                                                                xi->value_len);
                                if (ret < 0)
                                        mlog_errno(ret);
                                goto out;
@@ -1312,44 +1600,28 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
                                 * just trucate old value to zero.
                                 */
                                 ret = ocfs2_xattr_value_truncate(inode,
-                                                                 xs->xattr_bh,
+                                                                  &vb,
-                                                                 xv,
+                                                                  0,
-                                                                 0);
+                                                                  ctxt);
                                if (ret < 0)
                                        mlog_errno(ret);
                        }
                }
        }
-        handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
+        ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh,
-                                   OCFS2_INODE_UPDATE_CREDITS);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
-        if (IS_ERR(handle)) {
-                ret = PTR_ERR(handle);
-                mlog_errno(ret);
-                goto out;
-        }
-        ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
-                goto out_commit;
+                goto out;
        }
        if (!(flag & OCFS2_INLINE_XATTR_FL)) {
-                /* set extended attribute in external block. */
+                ret = vb.vb_access(handle, inode, vb.vb_bh,
-                ret = ocfs2_extend_trans(handle,
+                                   OCFS2_JOURNAL_ACCESS_WRITE);
-                                         OCFS2_INODE_UPDATE_CREDITS +
-                                         OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
-                if (ret) {
-                        mlog_errno(ret);
-                        goto out_commit;
-                }
-                ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
                if (ret) {
                        mlog_errno(ret);
-                        goto out_commit;
+                        goto out;
                }
        }
@@ -1363,7 +1635,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
                ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
                if (ret < 0) {
                        mlog_errno(ret);
-                        goto out_commit;
+                        goto out;
                }
        }
@@ -1391,25 +1663,19 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
        oi->ip_dyn_features |= flag;
        di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
        spin_unlock(&oi->ip_lock);
-        /* Update inode ctime */
-        inode->i_ctime = CURRENT_TIME;
-        di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
-        di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
        ret = ocfs2_journal_dirty(handle, xs->inode_bh);
        if (ret < 0)
                mlog_errno(ret);
-out_commit:
-        ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
        if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
                /*
                 * Set value outside in B tree.
                 * This is the second step for value size > INLINE_SIZE.
                 */
                size_t offs = le16_to_cpu(xs->here->xe_name_offset);
-                ret = ocfs2_xattr_set_value_outside(inode, xi, xs, offs);
+                ret = ocfs2_xattr_set_value_outside(inode, xi, xs, ctxt,
+                                                    &vb, offs);
                if (ret < 0) {
                        int ret2;
@@ -1418,41 +1684,56 @@ out_commit:
                         * If set value outside failed, we have to clean
                         * the junk tree root we have already set in local.
                         */
-                        ret2 = ocfs2_xattr_cleanup(inode, xi, xs, offs);
+                        ret2 = ocfs2_xattr_cleanup(inode, ctxt->handle,
+                                                   xi, xs, &vb, offs);
                        if (ret2 < 0)
                                mlog_errno(ret2);
                }
        }
 out:
        return ret;
 }
 static int ocfs2_remove_value_outside(struct inode*inode,
-                                      struct buffer_head *bh,
+                                      struct ocfs2_xattr_value_buf *vb,
                                      struct ocfs2_xattr_header *header)
 {
        int ret = 0, i;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
+        ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
+        ctxt.handle = ocfs2_start_trans(osb,
+                                        ocfs2_remove_extent_credits(osb->sb));
+        if (IS_ERR(ctxt.handle)) {
+                ret = PTR_ERR(ctxt.handle);
+                mlog_errno(ret);
+                goto out;
+        }
        for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
                struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
                if (!ocfs2_xattr_is_local(entry)) {
-                        struct ocfs2_xattr_value_root *xv;
                        void *val;
                        val = (void *)header +
                                le16_to_cpu(entry->xe_name_offset);
-                        xv = (struct ocfs2_xattr_value_root *)
+                        vb->vb_xv = (struct ocfs2_xattr_value_root *)
                                (val + OCFS2_XATTR_SIZE(entry->xe_name_len));
-                        ret = ocfs2_xattr_value_truncate(inode, bh, xv, 0);
+                        ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
                        if (ret < 0) {
                                mlog_errno(ret);
-                                return ret;
+                                break;
                        }
                }
        }
+        ocfs2_commit_trans(osb, ctxt.handle);
+        ocfs2_schedule_truncate_log_flush(osb, 1);
+        ocfs2_run_deallocs(osb, &ctxt.dealloc);
+out:
        return ret;
 }
@@ -1463,12 +1744,16 @@ static int ocfs2_xattr_ibody_remove(struct inode *inode,
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
        struct ocfs2_xattr_header *header;
        int ret;
+        struct ocfs2_xattr_value_buf vb = {
+                .vb_bh = di_bh,
+                .vb_access = ocfs2_journal_access_di,
+        };
        header = (struct ocfs2_xattr_header *)
                 ((void *)di + inode->i_sb->s_blocksize -
                 le16_to_cpu(di->i_xattr_inline_size));
-        ret = ocfs2_remove_value_outside(inode, di_bh, header);
+        ret = ocfs2_remove_value_outside(inode, &vb, header);
        return ret;
 }
@@ -1478,11 +1763,15 @@ static int ocfs2_xattr_block_remove(struct inode *inode,
 {
        struct ocfs2_xattr_block *xb;
        int ret = 0;
+        struct ocfs2_xattr_value_buf vb = {
+                .vb_bh = blk_bh,
+                .vb_access = ocfs2_journal_access_xb,
+        };
        xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
        if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
                struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header);
-                ret = ocfs2_remove_value_outside(inode, blk_bh, header);
+                ret = ocfs2_remove_value_outside(inode, &vb, header);
        } else
                ret = ocfs2_delete_xattr_index_block(inode, blk_bh);
@@ -1502,24 +1791,19 @@ static int ocfs2_xattr_free_block(struct inode *inode,
        u64 blk, bg_blkno;
        u16 bit;
-        ret = ocfs2_read_block(inode, block, &blk_bh);
+        ret = ocfs2_read_xattr_block(inode, block, &blk_bh);
        if (ret < 0) {
                mlog_errno(ret);
                goto out;
        }
-        xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
-        if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
-                ret = -EIO;
-                goto out;
-        }
        ret = ocfs2_xattr_block_remove(inode, blk_bh);
        if (ret < 0) {
                mlog_errno(ret);
                goto out;
        }
+        xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
        blk = le64_to_cpu(xb->xb_blkno);
        bit = le16_to_cpu(xb->xb_suballoc_bit);
        bg_blkno = ocfs2_which_suballoc_group(blk, bit);
@@ -1606,8 +1890,8 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
                mlog_errno(ret);
                goto out;
        }
-        ret = ocfs2_journal_access(handle, inode, di_bh,
+        ret = ocfs2_journal_access_di(handle, inode, di_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out_commit;
@@ -1714,7 +1998,8 @@ static int ocfs2_xattr_ibody_find(struct inode *inode,
 */
 static int ocfs2_xattr_ibody_set(struct inode *inode,
                                 struct ocfs2_xattr_info *xi,
-                                 struct ocfs2_xattr_search *xs)
+                                 struct ocfs2_xattr_search *xs,
+                                 struct ocfs2_xattr_set_ctxt *ctxt)
 {
        struct ocfs2_inode_info *oi = OCFS2_I(inode);
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
@@ -1731,7 +2016,7 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
                }
        }
-        ret = ocfs2_xattr_set_entry(inode, xi, xs,
+        ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt,
                                (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL));
 out:
        up_write(&oi->ip_alloc_sem);
@@ -1758,19 +2043,15 @@ static int ocfs2_xattr_block_find(struct inode *inode,
        if (!di->i_xattr_loc)
                return ret;
-        ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
+        ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
+                                     &blk_bh);
        if (ret < 0) {
                mlog_errno(ret);
                return ret;
        }
-        xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
-        if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
-                ret = -EIO;
-                goto cleanup;
-        }
        xs->xattr_bh = blk_bh;
+        xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
        if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
                xs->header = &xb->xb_attrs.xb_header;
@@ -1804,13 +2085,13 @@ cleanup:
 */
 static int ocfs2_xattr_block_set(struct inode *inode,
                                 struct ocfs2_xattr_info *xi,
-                                 struct ocfs2_xattr_search *xs)
+                                 struct ocfs2_xattr_search *xs,
+                                 struct ocfs2_xattr_set_ctxt *ctxt)
 {
        struct buffer_head *new_bh = NULL;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct ocfs2_dinode *di =  (struct ocfs2_dinode *)xs->inode_bh->b_data;
-        struct ocfs2_alloc_context *meta_ac = NULL;
+        handle_t *handle = ctxt->handle;
-        handle_t *handle = NULL;
        struct ocfs2_xattr_block *xblk = NULL;
        u16 suballoc_bit_start;
        u32 num_got;
@@ -1818,45 +2099,29 @@ static int ocfs2_xattr_block_set(struct inode *inode,
        int ret;
        if (!xs->xattr_bh) {
-                /*
+                ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh,
-                 * Alloc one external block for extended attribute
+                                              OCFS2_JOURNAL_ACCESS_CREATE);
-                 * outside of inode.
-                 */
-                ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
                if (ret < 0) {
                        mlog_errno(ret);
-                        goto out;
+                        goto end;
-                }
-                handle = ocfs2_start_trans(osb,
-                                           OCFS2_XATTR_BLOCK_CREATE_CREDITS);
-                if (IS_ERR(handle)) {
-                        ret = PTR_ERR(handle);
-                        mlog_errno(ret);
-                        goto out;
-                }
-                ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
-                                           OCFS2_JOURNAL_ACCESS_CREATE);
-                if (ret < 0) {
-                        mlog_errno(ret);
-                        goto out_commit;
                }
-                ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
+                ret = ocfs2_claim_metadata(osb, handle, ctxt->meta_ac, 1,
                                           &suballoc_bit_start, &num_got,
                                           &first_blkno);
                if (ret < 0) {
                        mlog_errno(ret);
-                        goto out_commit;
+                        goto end;
                }
                new_bh = sb_getblk(inode->i_sb, first_blkno);
                ocfs2_set_new_buffer_uptodate(inode, new_bh);
-                ret = ocfs2_journal_access(handle, inode, new_bh,
+                ret = ocfs2_journal_access_xb(handle, inode, new_bh,
-                                           OCFS2_JOURNAL_ACCESS_CREATE);
+                                              OCFS2_JOURNAL_ACCESS_CREATE);
                if (ret < 0) {
                        mlog_errno(ret);
-                        goto out_commit;
+                        goto end;
                }
                /* Initialize ocfs2_xattr_block */
@@ -1874,44 +2139,555 @@ static int ocfs2_xattr_block_set(struct inode *inode,
                xs->end = (void *)xblk + inode->i_sb->s_blocksize;
                xs->here = xs->header->xh_entries;
                ret = ocfs2_journal_dirty(handle, new_bh);
                if (ret < 0) {
                        mlog_errno(ret);
-                        goto out_commit;
+                        goto end;
                }
                di->i_xattr_loc = cpu_to_le64(first_blkno);
-                ret = ocfs2_journal_dirty(handle, xs->inode_bh);
+                ocfs2_journal_dirty(handle, xs->inode_bh);
-                if (ret < 0)
-                        mlog_errno(ret);
-out_commit:
-                ocfs2_commit_trans(osb, handle);
-out:
-                if (meta_ac)
-                        ocfs2_free_alloc_context(meta_ac);
-                if (ret < 0)
-                        return ret;
        } else
                xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
        if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) {
                /* Set extended attribute into external block */
-                ret = ocfs2_xattr_set_entry(inode, xi, xs, OCFS2_HAS_XATTR_FL);
+                ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt,
+                                            OCFS2_HAS_XATTR_FL);
                if (!ret || ret != -ENOSPC)
                        goto end;
-                ret = ocfs2_xattr_create_index_block(inode, xs);
+                ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
                if (ret)
                        goto end;
        }
-        ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs);
+        ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt);
 end:
        return ret;
 }
+/*
+ * Check whether the new xattr can be inserted into the inode.
+ *
+ * @inode: not referenced by this function.
+ * @xi:    the xattr to be inserted; its name and value_len are read.
+ * @xs:    search result for the inode's inline xattr area.  The caller
+ *         must only pass a search in which the xattr was NOT found
+ *         (xs->not_found non-zero), otherwise BUG_ON() fires.
+ *
+ * Returns 1 if the free space in the area described by @xs can hold a new
+ * entry plus the padded name and the value (or a value root when the value
+ * is larger than OCFS2_XATTR_INLINE_SIZE), 0 otherwise.  Also returns 0
+ * when xs->header is NULL.
+ */
+static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
+                                       struct ocfs2_xattr_info *xi,
+                                       struct ocfs2_xattr_search *xs)
+{
+        u64 value_size;
+        struct ocfs2_xattr_entry *last;
+        int free, i;
+        /* Start from the size of the whole area; lowered in the loop below. */
+        size_t min_offs = xs->end - xs->base;
+        if (!xs->header)
+                return 0;
+        /*
+         * Walk every entry to find the lowest name offset in use.  When the
+         * loop ends, 'last' points just past the final entry.
+         */
+        last = xs->header->xh_entries;
+        for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) {
+                size_t offs = le16_to_cpu(last->xe_name_offset);
+                if (offs < min_offs)
+                        min_offs = offs;
+                last += 1;
+        }
+        /*
+         * Free bytes between the end of the entry array and the lowest
+         * name offset, less sizeof(__u32).
+         * NOTE(review): the __u32 is presumably a reserved gap between the
+         * entries and the name/value area - confirm.
+         */
+        free = min_offs - ((void *)last - xs->base) - sizeof(__u32);
+        if (free < 0)
+                return 0;
+        BUG_ON(!xs->not_found);
+        /* Large values only keep a value root in this area. */
+        if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
+                value_size = OCFS2_XATTR_ROOT_SIZE;
+        else
+                value_size = OCFS2_XATTR_SIZE(xi->value_len);
+        if (free >= sizeof(struct ocfs2_xattr_entry) +
+                   OCFS2_XATTR_SIZE(strlen(xi->name)) + value_size)
+                return 1;
+        return 0;
+}
+/*
+ * Estimate the resources needed to set (or remove) the xattr described
+ * by @xi.
+ *
+ * @inode:         inode the xattr belongs to.
+ * @di:            on-disk inode; only di->i_xattr_loc is read here.
+ * @xi:            the xattr being set; xi->value == NULL means removal.
+ * @xis:           search result for the inode body.
+ * @xbs:           search result for the external xattr block/bucket.
+ * @clusters_need: out, clusters to reserve (may be NULL).
+ * @meta_need:     out, metadata blocks to reserve (may be NULL).
+ * @credits_need:  out, journal credits (may be NULL).
+ *
+ * The three outputs are written on every path through the 'out' label,
+ * including error paths, with whatever had been accumulated so far.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int ocfs2_calc_xattr_set_need(struct inode *inode,
+                                     struct ocfs2_dinode *di,
+                                     struct ocfs2_xattr_info *xi,
+                                     struct ocfs2_xattr_search *xis,
+                                     struct ocfs2_xattr_search *xbs,
+                                     int *clusters_need,
+                                     int *meta_need,
+                                     int *credits_need)
+{
+        int ret = 0, old_in_xb = 0;
+        int clusters_add = 0, meta_add = 0, credits = 0;
+        struct buffer_head *bh = NULL;
+        struct ocfs2_xattr_block *xb = NULL;
+        struct ocfs2_xattr_entry *xe = NULL;
+        struct ocfs2_xattr_value_root *xv = NULL;
+        char *base = NULL;
+        int name_offset, name_len = 0;
+        u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
+                                                    xi->value_len);
+        u64 value_size;
+        /*
+         * Calculate the clusters we need to write.
+         * No matter whether we replace an old one or add a new one,
+         * we need this for writing.
+         */
+        if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
+                credits += new_clusters *
+                           ocfs2_clusters_to_blocks(inode->i_sb, 1);
+        /* Brand new xattr: found neither in the inode nor in the block. */
+        if (xis->not_found && xbs->not_found) {
+                credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+                if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+                        clusters_add += new_clusters;
+                        credits += ocfs2_calc_extend_credits(inode->i_sb,
+                                                        &def_xv.xv.xr_list,
+                                                        new_clusters);
+                }
+                goto meta_guess;
+        }
+        /*
+         * An old copy exists.  Locate its entry (xe) and the base address
+         * its name offset is relative to, and charge the credits for
+         * updating whichever structure holds it.
+         */
+        if (!xis->not_found) {
+                xe = xis->here;
+                name_offset = le16_to_cpu(xe->xe_name_offset);
+                name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+                base = xis->base;
+                credits += OCFS2_INODE_UPDATE_CREDITS;
+        } else {
+                int i, block_off = 0;
+                xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
+                xe = xbs->here;
+                name_offset = le16_to_cpu(xe->xe_name_offset);
+                name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+                i = xbs->here - xbs->header->xh_entries;
+                old_in_xb = 1;
+                if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
+                        /*
+                         * NOTE(review): ret is assigned here but not
+                         * checked before block_off is used below; it is
+                         * only returned later if nothing overwrites it.
+                         * Confirm whether a failure can happen here.
+                         */
+                        ret = ocfs2_xattr_bucket_get_name_value(inode,
+                                                        bucket_xh(xbs->bucket),
+                                                        i, &block_off,
+                                                        &name_offset);
+                        base = bucket_block(xbs->bucket, block_off);
+                        credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+                } else {
+                        base = xbs->base;
+                        credits += OCFS2_XATTR_BLOCK_UPDATE_CREDITS;
+                }
+        }
+        /*
+         * Deleting an xattr doesn't need metadata or cluster allocation,
+         * so just calculate the credits and return.
+         *
+         * The credits for removing the value tree will be extended
+         * by ocfs2_remove_extent itself.
+         */
+        if (!xi->value) {
+                if (!ocfs2_xattr_is_local(xe))
+                        credits += ocfs2_remove_extent_credits(inode->i_sb);
+                goto out;
+        }
+        /* do cluster allocation guess first. */
+        value_size = le64_to_cpu(xe->xe_value_size);
+        if (old_in_xb) {
+                /*
+                 * In xattr set, we always try to set the xe in inode first,
+                 * so if it can be inserted into inode successfully, the old
+                 * one will be removed from the xattr block, and this xattr
+                 * will be inserted into inode as a new xattr in inode.
+                 */
+                if (ocfs2_xattr_can_be_in_inode(inode, xi, xis)) {
+                        clusters_add += new_clusters;
+                        credits += ocfs2_remove_extent_credits(inode->i_sb) +
+                                    OCFS2_INODE_UPDATE_CREDITS;
+                        if (!ocfs2_xattr_is_local(xe))
+                                credits += ocfs2_calc_extend_credits(
+                                                        inode->i_sb,
+                                                        &def_xv.xv.xr_list,
+                                                        new_clusters);
+                        goto out;
+                }
+        }
+        if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+                /* the new values will be stored outside. */
+                u32 old_clusters = 0;
+                if (!ocfs2_xattr_is_local(xe)) {
+                        old_clusters =  ocfs2_clusters_for_bytes(inode->i_sb,
+                                                                 value_size);
+                        xv = (struct ocfs2_xattr_value_root *)
+                             (base + name_offset + name_len);
+                        value_size = OCFS2_XATTR_ROOT_SIZE;
+                } else
+                        xv = &def_xv.xv;
+                /* Shrinking or same size: only removal credits are needed. */
+                if (old_clusters >= new_clusters) {
+                        credits += ocfs2_remove_extent_credits(inode->i_sb);
+                        goto out;
+                } else {
+                        meta_add += ocfs2_extend_meta_needed(&xv->xr_list);
+                        clusters_add += new_clusters - old_clusters;
+                        credits += ocfs2_calc_extend_credits(inode->i_sb,
+                                                             &xv->xr_list,
+                                                             new_clusters -
+                                                             old_clusters);
+                        if (value_size >= OCFS2_XATTR_ROOT_SIZE)
+                                goto out;
+                }
+        } else {
+                /*
+                 * Now the new value will be stored inside. So if the new
+                 * value is smaller than the size of value root or the old
+                 * value, we don't need any allocation, otherwise we have
+                 * to guess metadata allocation.
+                 */
+                if ((ocfs2_xattr_is_local(xe) && value_size >= xi->value_len) ||
+                    (!ocfs2_xattr_is_local(xe) &&
+                     OCFS2_XATTR_ROOT_SIZE >= xi->value_len))
+                        goto out;
+        }
+meta_guess:
+        /* calculate metadata allocation. */
+        if (di->i_xattr_loc) {
+                /*
+                 * Use the xattr block already held by the search if there
+                 * is one; otherwise read it into the local 'bh', which is
+                 * released at 'out'.
+                 */
+                if (!xbs->xattr_bh) {
+                        ret = ocfs2_read_xattr_block(inode,
+                                                     le64_to_cpu(di->i_xattr_loc),
+                                                     &bh);
+                        if (ret) {
+                                mlog_errno(ret);
+                                goto out;
+                        }
+                        xb = (struct ocfs2_xattr_block *)bh->b_data;
+                } else
+                        xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
+                /*
+                 * If there is already an xattr tree, good, we can calculate
+                 * like other b-trees. Otherwise we may have the chance of
+                 * create a tree, the credit calculation is borrowed from
+                 * ocfs2_calc_extend_credits with root_el = NULL. And the
+                 * new tree will be cluster based, so no meta is needed.
+                 */
+                if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
+                        struct ocfs2_extent_list *el =
+                                 &xb->xb_attrs.xb_root.xt_list;
+                        meta_add += ocfs2_extend_meta_needed(el);
+                        credits += ocfs2_calc_extend_credits(inode->i_sb,
+                                                             el, 1);
+                } else
+                        credits += OCFS2_SUBALLOC_ALLOC + 1;
+                /*
+                 * This cluster will be used either for new bucket or for
+                 * new xattr block.
+                 * If the cluster size is the same as the bucket size, one
+                 * more is needed since we may need to extend the bucket
+                 * also.
+                 */
+                clusters_add += 1;
+                credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+                if (OCFS2_XATTR_BUCKET_SIZE ==
+                        OCFS2_SB(inode->i_sb)->s_clustersize) {
+                        credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+                        clusters_add += 1;
+                }
+        } else {
+                /* No xattr block yet: one metadata block to create it. */
+                meta_add += 1;
+                credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
+        }
+out:
+        if (clusters_need)
+                *clusters_need = clusters_add;
+        if (meta_need)
+                *meta_need = meta_add;
+        if (credits_need)
+                *credits_need = credits;
+        brelse(bh);
+        return ret;
+}
+/*
+ * Initialise @ctxt for an xattr set and reserve the allocations it needs.
+ *
+ * @ctxt is zeroed first (this also clears ctxt->handle, so the caller must
+ * assign the transaction handle afterwards) and its dealloc context is
+ * initialised.  The needs are then computed by ocfs2_calc_xattr_set_need();
+ * metadata blocks and clusters are reserved into ctxt->meta_ac and
+ * ctxt->data_ac only when the corresponding count is non-zero.
+ *
+ * @credits receives the journal credits computed for the operation.
+ *
+ * Returns 0 on success or a negative error code.  On failure no allocation
+ * context is left in @ctxt.
+ */
+static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
+                                     struct ocfs2_dinode *di,
+                                     struct ocfs2_xattr_info *xi,
+                                     struct ocfs2_xattr_search *xis,
+                                     struct ocfs2_xattr_search *xbs,
+                                     struct ocfs2_xattr_set_ctxt *ctxt,
+                                     int *credits)
+{
+        int clusters_add, meta_add, ret;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        memset(ctxt, 0, sizeof(struct ocfs2_xattr_set_ctxt));
+        ocfs2_init_dealloc_ctxt(&ctxt->dealloc);
+        ret = ocfs2_calc_xattr_set_need(inode, di, xi, xis, xbs,
+                                        &clusters_add, &meta_add, credits);
+        if (ret) {
+                mlog_errno(ret);
+                return ret;
+        }
+        mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, "
+             "credits = %d\n", xi->name, meta_add, clusters_add, *credits);
+        if (meta_add) {
+                ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add,
+                                                        &ctxt->meta_ac);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+        }
+        if (clusters_add) {
+                ret = ocfs2_reserve_clusters(osb, clusters_add, &ctxt->data_ac);
+                if (ret)
+                        mlog_errno(ret);
+        }
+out:
+        if (ret) {
+                /* Undo the metadata reservation if a later step failed. */
+                if (ctxt->meta_ac) {
+                        ocfs2_free_alloc_context(ctxt->meta_ac);
+                        ctxt->meta_ac = NULL;
+                }
+                /*
+                 * We cannot have an error and a non null ctxt->data_ac:
+                 * the cluster reservation is the last step that can fail.
+                 */
+        }
+        return ret;
+}
+/*
+ * Set, replace or remove the xattr described by @xi inside the running
+ * transaction ctxt->handle.
+ *
+ * @inode: inode the xattr belongs to.
+ * @di:    on-disk inode held in xis->inode_bh.
+ * @xi:    the xattr; xi->value == NULL requests removal.  Note that
+ *         xi->value and xi->value_len are cleared by this function when
+ *         it has to remove a stale copy after moving the xattr between
+ *         the inode and the external block.
+ * @xis:   search result for the inode body.
+ * @xbs:   search result for the external xattr block.  xbs->not_found is
+ *         overwritten (and not restored) on the path that removes the old
+ *         in-inode copy.
+ * @ctxt:  transaction handle plus allocation/dealloc contexts.
+ *
+ * Removal deletes the xattr from wherever it was found.  Otherwise the
+ * inode body is always tried first; on -ENOSPC the external block is used
+ * instead.  Whenever the xattr ends up in a different place than before,
+ * the old copy is removed, after the extra credits for that removal have
+ * been computed and the transaction extended.
+ *
+ * On success the inode's ctime is updated and the inode buffer dirtied.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int __ocfs2_xattr_set_handle(struct inode *inode,
+                                    struct ocfs2_dinode *di,
+                                    struct ocfs2_xattr_info *xi,
+                                    struct ocfs2_xattr_search *xis,
+                                    struct ocfs2_xattr_search *xbs,
+                                    struct ocfs2_xattr_set_ctxt *ctxt)
+{
+        int ret = 0, credits, old_found;
+        if (!xi->value) {
+                /* Remove existing extended attribute */
+                if (!xis->not_found)
+                        ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
+                else if (!xbs->not_found)
+                        ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
+        } else {
+                /* We always try to set extended attribute into inode first*/
+                ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
+                if (!ret && !xbs->not_found) {
+                        /*
+                         * If succeed and that extended attribute existing in
+                         * external block, then we will remove it.
+                         */
+                        xi->value = NULL;
+                        xi->value_len = 0;
+                        /*
+                         * Temporarily mark the inode search as "not found"
+                         * so the estimate is made against the block copy.
+                         */
+                        old_found = xis->not_found;
+                        xis->not_found = -ENODATA;
+                        ret = ocfs2_calc_xattr_set_need(inode,
+                                                        di,
+                                                        xi,
+                                                        xis,
+                                                        xbs,
+                                                        NULL,
+                                                        NULL,
+                                                        &credits);
+                        xis->not_found = old_found;
+                        if (ret) {
+                                mlog_errno(ret);
+                                goto out;
+                        }
+                        ret = ocfs2_extend_trans(ctxt->handle, credits +
+                                        ctxt->handle->h_buffer_credits);
+                        if (ret) {
+                                mlog_errno(ret);
+                                goto out;
+                        }
+                        ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
+                } else if (ret == -ENOSPC) {
+                        /*
+                         * The xattr block exists but has not been searched
+                         * yet: search it now and make sure the transaction
+                         * has enough credits for setting into it.
+                         */
+                        if (di->i_xattr_loc && !xbs->xattr_bh) {
+                                ret = ocfs2_xattr_block_find(inode,
+                                                             xi->name_index,
+                                                             xi->name, xbs);
+                                if (ret)
+                                        goto out;
+                                old_found = xis->not_found;
+                                xis->not_found = -ENODATA;
+                                ret = ocfs2_calc_xattr_set_need(inode,
+                                                                di,
+                                                                xi,
+                                                                xis,
+                                                                xbs,
+                                                                NULL,
+                                                                NULL,
+                                                                &credits);
+                                xis->not_found = old_found;
+                                if (ret) {
+                                        mlog_errno(ret);
+                                        goto out;
+                                }
+                                ret = ocfs2_extend_trans(ctxt->handle, credits +
+                                        ctxt->handle->h_buffer_credits);
+                                if (ret) {
+                                        mlog_errno(ret);
+                                        goto out;
+                                }
+                        }
+                        /*
+                         * If no space in inode, we will set extended attribute
+                         * into external block.
+                         */
+                        ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
+                        if (ret)
+                                goto out;
+                        if (!xis->not_found) {
+                                /*
+                                 * If succeed and that extended attribute
+                                 * existing in inode, we will remove it.
+                                 */
+                                xi->value = NULL;
+                                xi->value_len = 0;
+                                xbs->not_found = -ENODATA;
+                                ret = ocfs2_calc_xattr_set_need(inode,
+                                                                di,
+                                                                xi,
+                                                                xis,
+                                                                xbs,
+                                                                NULL,
+                                                                NULL,
+                                                                &credits);
+                                if (ret) {
+                                        mlog_errno(ret);
+                                        goto out;
+                                }
+                                ret = ocfs2_extend_trans(ctxt->handle, credits +
+                                                ctxt->handle->h_buffer_credits);
+                                if (ret) {
+                                        mlog_errno(ret);
+                                        goto out;
+                                }
+                                ret = ocfs2_xattr_ibody_set(inode, xi,
+                                                            xis, ctxt);
+                        }
+                }
+        }
+        if (!ret) {
+                /*
+                 * Update inode ctime.  xis->inode_bh is the dinode buffer,
+                 * so use the inode-specific journal access helper, as the
+                 * other inode buffer accesses in this file do.
+                 */
+                ret = ocfs2_journal_access_di(ctxt->handle, inode,
+                                              xis->inode_bh,
+                                              OCFS2_JOURNAL_ACCESS_WRITE);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
+                inode->i_ctime = CURRENT_TIME;
+                di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+                di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+                ocfs2_journal_dirty(ctxt->handle, xis->inode_bh);
+        }
+out:
+        return ret;
+}
+/*
+ * This function is only called during inode creation, to initialise
+ * the security/acl xattrs of the new inode.
+ * All transaction credits have been reserved in mknod.
+ *
+ * The caller supplies the running transaction @handle, the inode buffer
+ * @di_bh and the allocation contexts @meta_ac / @data_ac; none of them is
+ * released here.  @flags is not referenced by this function.
+ *
+ * Takes ip_xattr_sem for writing around the lookup and the set.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if the volume does not support xattrs,
+ * -ENOMEM if the bucket cannot be allocated, or the error returned by the
+ * lookup or set helpers.
+ */
+int ocfs2_xattr_set_handle(handle_t *handle,
+                           struct inode *inode,
+                           struct buffer_head *di_bh,
+                           int name_index,
+                           const char *name,
+                           const void *value,
+                           size_t value_len,
+                           int flags,
+                           struct ocfs2_alloc_context *meta_ac,
+                           struct ocfs2_alloc_context *data_ac)
+{
+        struct ocfs2_dinode *di;
+        int ret;
+        struct ocfs2_xattr_info xi = {
+                .name_index = name_index,
+                .name = name,
+                .value = value,
+                .value_len = value_len,
+        };
+        struct ocfs2_xattr_search xis = {
+                .not_found = -ENODATA,
+        };
+        struct ocfs2_xattr_search xbs = {
+                .not_found = -ENODATA,
+        };
+        struct ocfs2_xattr_set_ctxt ctxt = {
+                .handle = handle,
+                .meta_ac = meta_ac,
+                .data_ac = data_ac,
+        };
+        if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+                return -EOPNOTSUPP;
+        /*
+         * In extreme situation, may need xattr bucket when
+         * block size is too small. And we have already reserved
+         * the credits for bucket in mknod.
+         */
+        if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) {
+                xbs.bucket = ocfs2_xattr_bucket_new(inode);
+                if (!xbs.bucket) {
+                        mlog_errno(-ENOMEM);
+                        return -ENOMEM;
+                }
+        }
+        /* Both searches work on the same inode buffer. */
+        xis.inode_bh = xbs.inode_bh = di_bh;
+        di = (struct ocfs2_dinode *)di_bh->b_data;
+        down_write(&OCFS2_I(inode)->ip_xattr_sem);
+        ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis);
+        if (ret)
+                goto cleanup;
+        /* Only look in the external block if the inode body lacks it. */
+        if (xis.not_found) {
+                ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs);
+                if (ret)
+                        goto cleanup;
+        }
+        ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
+cleanup:
+        up_write(&OCFS2_I(inode)->ip_xattr_sem);
+        brelse(xbs.xattr_bh);
+        /*
+         * NOTE(review): xbs.bucket is NULL unless the block size is
+         * OCFS2_MIN_BLOCKSIZE; presumably ocfs2_xattr_bucket_free()
+         * tolerates NULL - confirm.
+         */
+        ocfs2_xattr_bucket_free(xbs.bucket);
+        return ret;
+}
 /*
 * ocfs2_xattr_set()
 *
@@ -1928,8 +2704,10 @@ int ocfs2_xattr_set(struct inode *inode,
 {
        struct buffer_head *di_bh = NULL;
        struct ocfs2_dinode *di;
-        int ret;
+        int ret, credits;
-        u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct inode *tl_inode = osb->osb_tl_inode;
+        struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
        struct ocfs2_xattr_info xi = {
                .name_index = name_index,
@@ -1949,10 +2727,20 @@ int ocfs2_xattr_set(struct inode *inode,
        if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
                return -EOPNOTSUPP;
+        /*
+         * Only xbs will be used on indexed trees.  xis doesn't need a
+         * bucket.
+         */
+        xbs.bucket = ocfs2_xattr_bucket_new(inode);
+        if (!xbs.bucket) {
+                mlog_errno(-ENOMEM);
+                return -ENOMEM;
+        }
        ret = ocfs2_inode_lock(inode, &di_bh, 1);
        if (ret < 0) {
                mlog_errno(ret);
-                return ret;
+                goto cleanup_nolock;
        }
        xis.inode_bh = xbs.inode_bh = di_bh;
        di = (struct ocfs2_dinode *)di_bh->b_data;
@@ -1984,55 +2772,53 @@ int ocfs2_xattr_set(struct inode *inode,
                        goto cleanup;
        }
-        if (!value) {
-                /* Remove existing extended attribute */
+        mutex_lock(&tl_inode->i_mutex);
-                if (!xis.not_found)
-                        ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
+        if (ocfs2_truncate_log_needs_flush(osb)) {
-                else if (!xbs.not_found)
+                ret = __ocfs2_flush_truncate_log(osb);
-                        ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
+                if (ret < 0) {
-        } else {
+                        mutex_unlock(&tl_inode->i_mutex);
-                /* We always try to set extended attribute into inode first*/
+                        mlog_errno(ret);
-                ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
+                        goto cleanup;
-                if (!ret && !xbs.not_found) {
-                        /*
-                         * If succeed and that extended attribute existing in
-                         * external block, then we will remove it.
-                         */
-                        xi.value = NULL;
-                        xi.value_len = 0;
-                        ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
-                } else if (ret == -ENOSPC) {
-                        if (di->i_xattr_loc && !xbs.xattr_bh) {
-                                ret = ocfs2_xattr_block_find(inode, name_index,
-                                                             name, &xbs);
-                                if (ret)
-                                        goto cleanup;
-                        }
-                        /*
-                         * If no space in inode, we will set extended attribute
-                         * into external block.
-                         */
-                        ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
-                        if (ret)
-                                goto cleanup;
-                        if (!xis.not_found) {
-                                /*
-                                 * If succeed and that extended attribute
-                                 * existing in inode, we will remove it.
-                                 */
-                                xi.value = NULL;
-                                xi.value_len = 0;
-                                ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
-                        }
                }
        }
+        mutex_unlock(&tl_inode->i_mutex);
+        ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis,
+                                        &xbs, &ctxt, &credits);
+        if (ret) {
+                mlog_errno(ret);
+                goto cleanup;
+        }
+        /* we need to update inode's ctime field, so add credit for it. */
+        credits += OCFS2_INODE_UPDATE_CREDITS;
+        ctxt.handle = ocfs2_start_trans(osb, credits);
+        if (IS_ERR(ctxt.handle)) {
+                ret = PTR_ERR(ctxt.handle);
+                mlog_errno(ret);
+                goto cleanup;
+        }
+        ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
+        ocfs2_commit_trans(osb, ctxt.handle);
+        if (ctxt.data_ac)
+                ocfs2_free_alloc_context(ctxt.data_ac);
+        if (ctxt.meta_ac)
+                ocfs2_free_alloc_context(ctxt.meta_ac);
+        if (ocfs2_dealloc_has_cluster(&ctxt.dealloc))
+                ocfs2_schedule_truncate_log_flush(osb, 1);
+        ocfs2_run_deallocs(osb, &ctxt.dealloc);
 cleanup:
        up_write(&OCFS2_I(inode)->ip_xattr_sem);
        ocfs2_inode_unlock(inode, 1);
+cleanup_nolock:
        brelse(di_bh);
        brelse(xbs.xattr_bh);
-        for (i = 0; i < blk_per_bucket; i++)
+        ocfs2_xattr_bucket_free(xbs.bucket);
-                brelse(xbs.bucket.bhs[i]);
        return ret;
 }
@@ -2107,7 +2893,7 @@ typedef int (xattr_bucket_func)(struct inode *inode,
                                void *para);
 static int ocfs2_find_xe_in_bucket(struct inode *inode,
-                                   struct buffer_head *header_bh,
+                                   struct ocfs2_xattr_bucket *bucket,
                                   int name_index,
                                   const char *name,
                                   u32 name_hash,
@@ -2115,11 +2901,9 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode,
                                   int *found)
 {
        int i, ret = 0, cmp = 1, block_off, new_offset;
-        struct ocfs2_xattr_header *xh =
+        struct ocfs2_xattr_header *xh = bucket_xh(bucket);
-                        (struct ocfs2_xattr_header *)header_bh->b_data;
        size_t name_len = strlen(name);
        struct ocfs2_xattr_entry *xe = NULL;
-        struct buffer_head *name_bh = NULL;
        char *xe_name;
        /*
@@ -2150,19 +2934,9 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode,
                        break;
                }
-                ret = ocfs2_read_block(inode, header_bh->b_blocknr + block_off,
-                                       &name_bh);
-                if (ret) {
-                        mlog_errno(ret);
-                        break;
-                }
-                xe_name = name_bh->b_data + new_offset;
-                cmp = memcmp(name, xe_name, name_len);
-                brelse(name_bh);
-                name_bh = NULL;
-                if (cmp == 0) {
+                xe_name = bucket_block(bucket, block_off) + new_offset;
+                if (!memcmp(name, xe_name, name_len)) {
                        *xe_index = i;
                        *found = 1;
                        ret = 0;
@@ -2192,39 +2966,42 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
                                   struct ocfs2_xattr_search *xs)
 {
        int ret, found = 0;
-        struct buffer_head *bh = NULL;
-        struct buffer_head *lower_bh = NULL;
        struct ocfs2_xattr_header *xh = NULL;
        struct ocfs2_xattr_entry *xe = NULL;
        u16 index = 0;
        u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
        int low_bucket = 0, bucket, high_bucket;
+        struct ocfs2_xattr_bucket *search;
        u32 last_hash;
-        u64 blkno;
+        u64 blkno, lower_blkno = 0;
+        search = ocfs2_xattr_bucket_new(inode);
+        if (!search) {
+                ret = -ENOMEM;
+                mlog_errno(ret);
+                goto out;
+        }
-        ret = ocfs2_read_block(inode, p_blkno, &bh);
+        ret = ocfs2_read_xattr_bucket(search, p_blkno);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }
-        xh = (struct ocfs2_xattr_header *)bh->b_data;
+        xh = bucket_xh(search);
        high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1;
        while (low_bucket <= high_bucket) {
-                brelse(bh);
+                ocfs2_xattr_bucket_relse(search);
-                bh = NULL;
-                bucket = (low_bucket + high_bucket) / 2;
+                bucket = (low_bucket + high_bucket) / 2;
                blkno = p_blkno + bucket * blk_per_bucket;
+                ret = ocfs2_read_xattr_bucket(search, blkno);
-                ret = ocfs2_read_block(inode, blkno, &bh);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
-                xh = (struct ocfs2_xattr_header *)bh->b_data;
+                xh = bucket_xh(search);
                xe = &xh->xh_entries[0];
                if (name_hash < le32_to_cpu(xe->xe_name_hash)) {
                        high_bucket = bucket - 1;
@@ -2241,10 +3018,8 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
                last_hash = le32_to_cpu(xe->xe_name_hash);
-                /* record lower_bh which may be the insert place. */
+                /* record lower_blkno which may be the insert place. */
-                brelse(lower_bh);
+                lower_blkno = blkno;
-                lower_bh = bh;
-                bh = NULL;
                if (name_hash > le32_to_cpu(xe->xe_name_hash)) {
                        low_bucket = bucket + 1;
@@ -2252,7 +3027,7 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
                }
                /* the searched xattr should reside in this bucket if exists. */
-                ret = ocfs2_find_xe_in_bucket(inode, lower_bh,
+                ret = ocfs2_find_xe_in_bucket(inode, search,
                                              name_index, name, name_hash,
                                              &index, &found);
                if (ret) {
@@ -2267,46 +3042,29 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
         * When the xattr's hash value is in the gap of 2 buckets, we will
         * always set it to the previous bucket.
         */
-        if (!lower_bh) {
+        if (!lower_blkno)
-                /*
+                lower_blkno = p_blkno;
-                 * We can't find any bucket whose first name_hash is less
-                 * than the find name_hash.
+        /* This should be in cache - we just read it during the search */
-                 */
+        ret = ocfs2_read_xattr_bucket(xs->bucket, lower_blkno);
-                BUG_ON(bh->b_blocknr != p_blkno);
+        if (ret) {
-                lower_bh = bh;
+                mlog_errno(ret);
-                bh = NULL;
+                goto out;
        }
-        xs->bucket.bhs[0] = lower_bh;
-        xs->bucket.xh = (struct ocfs2_xattr_header *)
-                                        xs->bucket.bhs[0]->b_data;
-        lower_bh = NULL;
-        xs->header = xs->bucket.xh;
+        xs->header = bucket_xh(xs->bucket);
-        xs->base = xs->bucket.bhs[0]->b_data;
+        xs->base = bucket_block(xs->bucket, 0);
        xs->end = xs->base + inode->i_sb->s_blocksize;
        if (found) {
-                /*
-                 * If we have found the xattr enty, read all the blocks in
-                 * this bucket.
-                 */
-                ret = ocfs2_read_blocks(inode, xs->bucket.bhs[0]->b_blocknr + 1,
-                                        blk_per_bucket - 1, &xs->bucket.bhs[1],
-                                        0);
-                if (ret) {
-                        mlog_errno(ret);
-                        goto out;
-                }
                xs->here = &xs->header->xh_entries[index];
                mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name,
-                     (unsigned long long)xs->bucket.bhs[0]->b_blocknr, index);
+                     (unsigned long long)bucket_blkno(xs->bucket), index);
        } else
                ret = -ENODATA;
 out:
-        brelse(bh);
+        ocfs2_xattr_bucket_free(search);
-        brelse(lower_bh);
        return ret;
 }
@@ -2357,53 +3115,50 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
                                       xattr_bucket_func *func,
                                       void *para)
 {
-        int i, j, ret = 0;
+        int i, ret = 0;
-        int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
        u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
        u32 num_buckets = clusters * bpc;
-        struct ocfs2_xattr_bucket bucket;
+        struct ocfs2_xattr_bucket *bucket;
-        memset(&bucket, 0, sizeof(bucket));
+        bucket = ocfs2_xattr_bucket_new(inode);
+        if (!bucket) {
+                mlog_errno(-ENOMEM);
+                return -ENOMEM;
+        }
        mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n",
             clusters, (unsigned long long)blkno);
-        for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) {
+        for (i = 0; i < num_buckets; i++, blkno += bucket->bu_blocks) {
-                ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket,
+                ret = ocfs2_read_xattr_bucket(bucket, blkno);
-                                        bucket.bhs, 0);
                if (ret) {
                        mlog_errno(ret);
-                        goto out;
+                        break;
                }
-                bucket.xh = (struct ocfs2_xattr_header *)bucket.bhs[0]->b_data;
                /*
                 * The real bucket num in this series of blocks is stored
                 * in the 1st bucket.
                 */
                if (i == 0)
-                        num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets);
+                        num_buckets = le16_to_cpu(bucket_xh(bucket)->xh_num_buckets);
                mlog(0, "iterating xattr bucket %llu, first hash %u\n",
                     (unsigned long long)blkno,
-                     le32_to_cpu(bucket.xh->xh_entries[0].xe_name_hash));
+                     le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash));
                if (func) {
-                        ret = func(inode, &bucket, para);
+                        ret = func(inode, bucket, para);
-                        if (ret) {
+                        if (ret)
                                mlog_errno(ret);
-                                break;
+                        /* Fall through to bucket_relse() */
-                        }
                }
-                for (j = 0; j < blk_per_bucket; j++)
+                ocfs2_xattr_bucket_relse(bucket);
-                        brelse(bucket.bhs[j]);
+                if (ret)
-                memset(&bucket, 0, sizeof(bucket));
+                        break;
        }
-out:
+        ocfs2_xattr_bucket_free(bucket);
-        for (j = 0; j < blk_per_bucket; j++)
-                brelse(bucket.bhs[j]);
        return ret;
 }
@@ -2441,21 +3196,21 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
        int i, block_off, new_offset;
        const char *prefix, *name;
-        for (i = 0 ; i < le16_to_cpu(bucket->xh->xh_count); i++) {
+        for (i = 0 ; i < le16_to_cpu(bucket_xh(bucket)->xh_count); i++) {
-                struct ocfs2_xattr_entry *entry = &bucket->xh->xh_entries[i];
+                struct ocfs2_xattr_entry *entry = &bucket_xh(bucket)->xh_entries[i];
                type = ocfs2_xattr_get_type(entry);
                prefix = ocfs2_xattr_prefix(type);
                if (prefix) {
                        ret = ocfs2_xattr_bucket_get_name_value(inode,
-                                                                bucket->xh,
+                                                                bucket_xh(bucket),
                                                                i,
                                                                &block_off,
                                                                &new_offset);
                        if (ret)
                                break;
-                        name = (const char *)bucket->bhs[block_off]->b_data +
+                        name = (const char *)bucket_block(bucket, block_off) +
                                new_offset;
                        ret = ocfs2_xattr_list_entry(xl->buffer,
                                                     xl->buffer_size,
@@ -2540,32 +3295,34 @@ static void swap_xe(void *a, void *b, int size)
 /*
 * When the ocfs2_xattr_block is filled up, new bucket will be created
 * and all the xattr entries will be moved to the new bucket.
+ * The header goes at the start of the bucket, and the names+values are
+ * filled from the end.  This is why *target starts as the last buffer.
 * Note: we need to sort the entries since they are not saved in order
 * in the ocfs2_xattr_block.
 */
 static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
                                           struct buffer_head *xb_bh,
-                                           struct buffer_head *xh_bh,
+                                           struct ocfs2_xattr_bucket *bucket)
-                                           struct buffer_head *data_bh)
 {
        int i, blocksize = inode->i_sb->s_blocksize;
+        int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
        u16 offset, size, off_change;
        struct ocfs2_xattr_entry *xe;
        struct ocfs2_xattr_block *xb =
                                (struct ocfs2_xattr_block *)xb_bh->b_data;
        struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header;
-        struct ocfs2_xattr_header *xh =
+        struct ocfs2_xattr_header *xh = bucket_xh(bucket);
-                                (struct ocfs2_xattr_header *)xh_bh->b_data;
        u16 count = le16_to_cpu(xb_xh->xh_count);
-        char *target = xh_bh->b_data, *src = xb_bh->b_data;
+        char *src = xb_bh->b_data;
+        char *target = bucket_block(bucket, blks - 1);
        mlog(0, "cp xattr from block %llu to bucket %llu\n",
             (unsigned long long)xb_bh->b_blocknr,
-             (unsigned long long)xh_bh->b_blocknr);
+             (unsigned long long)bucket_blkno(bucket));
+        for (i = 0; i < blks; i++)
+                memset(bucket_block(bucket, i), 0, blocksize);
-        memset(xh_bh->b_data, 0, blocksize);
-        if (data_bh)
-                memset(data_bh->b_data, 0, blocksize);
        /*
         * Since the xe_name_offset is based on ocfs2_xattr_header,
         * there is a offset change corresponding to the change of
@@ -2577,8 +3334,6 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
        size = blocksize - offset;
        /* copy all the names and values. */
-        if (data_bh)
-                target = data_bh->b_data;
        memcpy(target + offset, src + offset, size);
        /* Init new header now. */
@@ -2588,7 +3343,7 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
        xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size);
        /* copy all the entries. */
-        target = xh_bh->b_data;
+        target = bucket_block(bucket, 0);
        offset = offsetof(struct ocfs2_xattr_header, xh_entries);
        size = count * sizeof(struct ocfs2_xattr_entry);
        memcpy(target + offset, (char *)xb_xh + offset, size);
@@ -2614,73 +3369,47 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
 * While if the entry is in index b-tree, "bucket" indicates the
 * real place of the xattr.
 */
-static int ocfs2_xattr_update_xattr_search(struct inode *inode,
+static void ocfs2_xattr_update_xattr_search(struct inode *inode,
-                                           struct ocfs2_xattr_search *xs,
+                                            struct ocfs2_xattr_search *xs,
-                                           struct buffer_head *old_bh,
+                                            struct buffer_head *old_bh)
-                                           struct buffer_head *new_bh)
 {
-        int ret = 0;
        char *buf = old_bh->b_data;
        struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf;
        struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header;
-        int i, blocksize = inode->i_sb->s_blocksize;
+        int i;
-        u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-        xs->bucket.bhs[0] = new_bh;
-        get_bh(new_bh);
-        xs->bucket.xh = (struct ocfs2_xattr_header *)xs->bucket.bhs[0]->b_data;
-        xs->header = xs->bucket.xh;
-        xs->base = new_bh->b_data;
+        xs->header = bucket_xh(xs->bucket);
+        xs->base = bucket_block(xs->bucket, 0);
        xs->end = xs->base + inode->i_sb->s_blocksize;
-        if (!xs->not_found) {
+        if (xs->not_found)
-                if (OCFS2_XATTR_BUCKET_SIZE != blocksize) {
+                return;
-                        ret = ocfs2_read_blocks(inode,
-                                        xs->bucket.bhs[0]->b_blocknr + 1,
-                                        blk_per_bucket - 1, &xs->bucket.bhs[1],
-                                        0);
-                        if (ret) {
-                                mlog_errno(ret);
-                                return ret;
-                        }
-                }
-                i = xs->here - old_xh->xh_entries;
-                xs->here = &xs->header->xh_entries[i];
-        }
-        return ret;
+        i = xs->here - old_xh->xh_entries;
+        xs->here = &xs->header->xh_entries[i];
 }
 static int ocfs2_xattr_create_index_block(struct inode *inode,
-                                          struct ocfs2_xattr_search *xs)
+                                          struct ocfs2_xattr_search *xs,
+                                          struct ocfs2_xattr_set_ctxt *ctxt)
 {
-        int ret, credits = OCFS2_SUBALLOC_ALLOC;
+        int ret;
        u32 bit_off, len;
        u64 blkno;
-        handle_t *handle;
+        handle_t *handle = ctxt->handle;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct ocfs2_inode_info *oi = OCFS2_I(inode);
-        struct ocfs2_alloc_context *data_ac;
-        struct buffer_head *xh_bh = NULL, *data_bh = NULL;
        struct buffer_head *xb_bh = xs->xattr_bh;
        struct ocfs2_xattr_block *xb =
                        (struct ocfs2_xattr_block *)xb_bh->b_data;
        struct ocfs2_xattr_tree_root *xr;
        u16 xb_flags = le16_to_cpu(xb->xb_flags);
-        u16 bpb = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
        mlog(0, "create xattr index block for %llu\n",
             (unsigned long long)xb_bh->b_blocknr);
        BUG_ON(xb_flags & OCFS2_XATTR_INDEXED);
+        BUG_ON(!xs->bucket);
-        ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
-        if (ret) {
-                mlog_errno(ret);
-                goto out;
-        }
        /*
         * XXX:
@@ -2689,29 +3418,18 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
         */
        down_write(&oi->ip_alloc_sem);
-        /*
+        ret = ocfs2_journal_access_xb(handle, inode, xb_bh,
-         * 3 more credits, one for xattr block update, one for the 1st block
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
-         * of the new xattr bucket and one for the value/data.
-         */
-        credits += 3;
-        handle = ocfs2_start_trans(osb, credits);
-        if (IS_ERR(handle)) {
-                ret = PTR_ERR(handle);
-                mlog_errno(ret);
-                goto out_sem;
-        }
-        ret = ocfs2_journal_access(handle, inode, xb_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
-                goto out_commit;
+                goto out;
        }
-        ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len);
+        ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac,
+                                     1, 1, &bit_off, &len);
        if (ret) {
                mlog_errno(ret);
-                goto out_commit;
+                goto out;
        }
        /*
@@ -2724,51 +3442,23 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
        mlog(0, "allocate 1 cluster from %llu to xattr block\n",
             (unsigned long long)blkno);
-        xh_bh = sb_getblk(inode->i_sb, blkno);
+        ret = ocfs2_init_xattr_bucket(xs->bucket, blkno);
-        if (!xh_bh) {
+        if (ret) {
-                ret = -EIO;
                mlog_errno(ret);
-                goto out_commit;
+                goto out;
        }
-        ocfs2_set_new_buffer_uptodate(inode, xh_bh);
+        ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
+                                                OCFS2_JOURNAL_ACCESS_CREATE);
-        ret = ocfs2_journal_access(handle, inode, xh_bh,
-                                   OCFS2_JOURNAL_ACCESS_CREATE);
        if (ret) {
                mlog_errno(ret);
-                goto out_commit;
+                goto out;
-        }
-        if (bpb > 1) {
-                data_bh = sb_getblk(inode->i_sb, blkno + bpb - 1);
-                if (!data_bh) {
-                        ret = -EIO;
-                        mlog_errno(ret);
-                        goto out_commit;
-                }
-                ocfs2_set_new_buffer_uptodate(inode, data_bh);
-                ret = ocfs2_journal_access(handle, inode, data_bh,
-                                           OCFS2_JOURNAL_ACCESS_CREATE);
-                if (ret) {
-                        mlog_errno(ret);
-                        goto out_commit;
-                }
        }
-        ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xh_bh, data_bh);
+        ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xs->bucket);
+        ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
-        ocfs2_journal_dirty(handle, xh_bh);
-        if (data_bh)
-                ocfs2_journal_dirty(handle, data_bh);
-        ret = ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh);
+        ocfs2_xattr_update_xattr_search(inode, xs, xb_bh);
-        if (ret) {
-                mlog_errno(ret);
-                goto out_commit;
-        }
        /* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */
        memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
@@ -2787,24 +3477,10 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
        xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED);
-        ret = ocfs2_journal_dirty(handle, xb_bh);
+        ocfs2_journal_dirty(handle, xb_bh);
-        if (ret) {
-                mlog_errno(ret);
-                goto out_commit;
-        }
-out_commit:
-        ocfs2_commit_trans(osb, handle);
-out_sem:
-        up_write(&oi->ip_alloc_sem);
 out:
-        if (data_ac)
+        up_write(&oi->ip_alloc_sem);
-                ocfs2_free_alloc_context(data_ac);
-        brelse(xh_bh);
-        brelse(data_bh);
        return ret;
 }
@@ -2829,29 +3505,18 @@ static int cmp_xe_offset(const void *a, const void *b)
 * so that we can spare some space for insertion.
 */
 static int ocfs2_defrag_xattr_bucket(struct inode *inode,
+                                     handle_t *handle,
                                     struct ocfs2_xattr_bucket *bucket)
 {
        int ret, i;
        size_t end, offset, len, value_len;
        struct ocfs2_xattr_header *xh;
        char *entries, *buf, *bucket_buf = NULL;
-        u64 blkno = bucket->bhs[0]->b_blocknr;
+        u64 blkno = bucket_blkno(bucket);
-        u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
        u16 xh_free_start;
        size_t blocksize = inode->i_sb->s_blocksize;
-        handle_t *handle;
-        struct buffer_head **bhs;
        struct ocfs2_xattr_entry *xe;
-        bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
-                        GFP_NOFS);
-        if (!bhs)
-                return -ENOMEM;
-        ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, bhs, 0);
-        if (ret)
-                goto out;
        /*
         * In order to make the operation more efficient and generic,
         * we copy all the blocks into a contiguous memory and do the
@@ -2865,26 +3530,16 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
        }
        buf = bucket_buf;
-        for (i = 0; i < blk_per_bucket; i++, buf += blocksize)
+        for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize)
-                memcpy(buf, bhs[i]->b_data, blocksize);
+                memcpy(buf, bucket_block(bucket, i), blocksize);
-        handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), blk_per_bucket);
+        ret = ocfs2_xattr_bucket_journal_access(handle, bucket,
-        if (IS_ERR(handle)) {
+                                                OCFS2_JOURNAL_ACCESS_WRITE);
-                ret = PTR_ERR(handle);
+        if (ret < 0) {
-                handle = NULL;
                mlog_errno(ret);
                goto out;
        }
-        for (i = 0; i < blk_per_bucket; i++) {
-                ret = ocfs2_journal_access(handle, inode, bhs[i],
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
-                if (ret < 0) {
-                        mlog_errno(ret);
-                        goto commit;
-                }
-        }
        xh = (struct ocfs2_xattr_header *)bucket_buf;
        entries = (char *)xh->xh_entries;
        xh_free_start = le16_to_cpu(xh->xh_free_start);
@@ -2940,7 +3595,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
                        "bucket %llu\n", (unsigned long long)blkno);
        if (xh_free_start == end)
-                goto commit;
+                goto out;
        memset(bucket_buf + xh_free_start, 0, end - xh_free_start);
        xh->xh_free_start = cpu_to_le16(end);
@@ -2951,169 +3606,94 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
             cmp_xe, swap_xe);
        buf = bucket_buf;
-        for (i = 0; i < blk_per_bucket; i++, buf += blocksize) {
+        for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize)
-                memcpy(bhs[i]->b_data, buf, blocksize);
+                memcpy(bucket_block(bucket, i), buf, blocksize);
-                ocfs2_journal_dirty(handle, bhs[i]);
+        ocfs2_xattr_bucket_journal_dirty(handle, bucket);
-        }
-commit:
-        ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
-        if (bhs) {
-                for (i = 0; i < blk_per_bucket; i++)
-                        brelse(bhs[i]);
-        }
-        kfree(bhs);
        kfree(bucket_buf);
        return ret;
 }
 /*
- * Move half nums of the xattr bucket in the previous cluster to this new
+ * prev_blkno points to the start of an existing extent.  new_blkno
- * cluster. We only touch the last cluster of the previous extend record.
+ * points to a newly allocated extent.  Because we know each of our
+ * clusters contains more than one bucket, we can easily split one cluster
+ * at a bucket boundary.  So we take the last cluster of the existing
+ * extent and split it down the middle.  We move the last half of the
+ * buckets in the last cluster of the existing extent over to the new
+ * extent.
+ *
+ * first is the bucket at the start of the existing extent so we can
+ * update that extent's bucket count.  target is the bucket where we were
+ * hoping to insert our xattr.  If the bucket move places the target in
+ * the new extent, we'll update first and target after modifying the old
+ * extent.
 *
- * first_bh is the first buffer_head of a series of bucket in the same
+ * first_hash will be set as the 1st xe's name_hash in the new extent.
- * extent rec and header_bh is the header of one bucket in this cluster.
- * They will be updated if we move the data header_bh contains to the new
- * cluster. first_hash will be set as the 1st xe's name_hash of the new cluster.
 */
 static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
                                               handle_t *handle,
-                                               struct buffer_head **first_bh,
+                                               struct ocfs2_xattr_bucket *first,
-                                               struct buffer_head **header_bh,
+                                               struct ocfs2_xattr_bucket *target,
                                               u64 new_blkno,
-                                               u64 prev_blkno,
                                               u32 num_clusters,
                                               u32 *first_hash)
 {
-        int i, ret, credits;
+        int ret;
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct super_block *sb = inode->i_sb;
-        int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+        int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(sb);
-        int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
+        int num_buckets = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb));
-        int blocksize = inode->i_sb->s_blocksize;
+        int to_move = num_buckets / 2;
-        struct buffer_head *old_bh, *new_bh, *prev_bh, *new_first_bh = NULL;
+        u64 src_blkno;
-        struct ocfs2_xattr_header *new_xh;
+        u64 last_cluster_blkno = bucket_blkno(first) +
-        struct ocfs2_xattr_header *xh =
+                ((num_clusters - 1) * ocfs2_clusters_to_blocks(sb, 1));
-                        (struct ocfs2_xattr_header *)((*first_bh)->b_data);
-        BUG_ON(le16_to_cpu(xh->xh_num_buckets) < num_buckets);
-        BUG_ON(OCFS2_XATTR_BUCKET_SIZE == osb->s_clustersize);
-        prev_bh = *first_bh;
-        get_bh(prev_bh);
-        xh = (struct ocfs2_xattr_header *)prev_bh->b_data;
-        prev_blkno += (num_clusters - 1) * bpc + bpc / 2;
+        BUG_ON(le16_to_cpu(bucket_xh(first)->xh_num_buckets) < num_buckets);
+        BUG_ON(OCFS2_XATTR_BUCKET_SIZE == OCFS2_SB(sb)->s_clustersize);
        mlog(0, "move half of xattrs in cluster %llu to %llu\n",
-             (unsigned long long)prev_blkno, (unsigned long long)new_blkno);
+             (unsigned long long)last_cluster_blkno, (unsigned long long)new_blkno);
-        /*
+        ret = ocfs2_mv_xattr_buckets(inode, handle, bucket_blkno(first),
-         * We need to update the 1st half of the new cluster and
+                                     last_cluster_blkno, new_blkno,
-         * 1 more for the update of the 1st bucket of the previous
+                                     to_move, first_hash);
-         * extent record.
-         */
-        credits = bpc / 2 + 1;
-        ret = ocfs2_extend_trans(handle, credits);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }
-        ret = ocfs2_journal_access(handle, inode, prev_bh,
+        /* This is the first bucket that got moved */
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+        src_blkno = last_cluster_blkno + (to_move * blks_per_bucket);
-        if (ret) {
-                mlog_errno(ret);
-                goto out;
-        }
-        for (i = 0; i < bpc / 2; i++, prev_blkno++, new_blkno++) {
+        /*
-                old_bh = new_bh = NULL;
+         * If the target bucket was part of the moved buckets, we need to
-                new_bh = sb_getblk(inode->i_sb, new_blkno);
+         * update first and target.
-                if (!new_bh) {
+         */
-                        ret = -EIO;
+        if (bucket_blkno(target) >= src_blkno) {
-                        mlog_errno(ret);
+                /* Find the block for the new target bucket */
-                        goto out;
+                src_blkno = new_blkno +
-                }
+                        (bucket_blkno(target) - src_blkno);
-                ocfs2_set_new_buffer_uptodate(inode, new_bh);
+                ocfs2_xattr_bucket_relse(first);
+                ocfs2_xattr_bucket_relse(target);
-                ret = ocfs2_journal_access(handle, inode, new_bh,
+                /*
-                                           OCFS2_JOURNAL_ACCESS_CREATE);
+                 * These shouldn't fail - the buffers are in the
-                if (ret < 0) {
+                 * journal from ocfs2_cp_xattr_bucket().
+                 */
+                ret = ocfs2_read_xattr_bucket(first, new_blkno);
+                if (ret) {
                        mlog_errno(ret);
-                        brelse(new_bh);
                        goto out;
                }
+                ret = ocfs2_read_xattr_bucket(target, src_blkno);
-                ret = ocfs2_read_block(inode, prev_blkno, &old_bh);
+                if (ret)
-                if (ret < 0) {
                        mlog_errno(ret);
-                        brelse(new_bh);
-                        goto out;
-                }
-                memcpy(new_bh->b_data, old_bh->b_data, blocksize);
-                if (i == 0) {
-                        new_xh = (struct ocfs2_xattr_header *)new_bh->b_data;
-                        new_xh->xh_num_buckets = cpu_to_le16(num_buckets / 2);
-                        if (first_hash)
-                                *first_hash = le32_to_cpu(
-                                        new_xh->xh_entries[0].xe_name_hash);
-                        new_first_bh = new_bh;
-                        get_bh(new_first_bh);
-                }
-                ocfs2_journal_dirty(handle, new_bh);
-                if (*header_bh == old_bh) {
-                        brelse(*header_bh);
-                        *header_bh = new_bh;
-                        get_bh(*header_bh);
-                        brelse(*first_bh);
-                        *first_bh = new_first_bh;
-                        get_bh(*first_bh);
-                }
-                brelse(new_bh);
-                brelse(old_bh);
        }
-        le16_add_cpu(&xh->xh_num_buckets, -(num_buckets / 2));
-        ocfs2_journal_dirty(handle, prev_bh);
 out:
-        brelse(prev_bh);
-        brelse(new_first_bh);
-        return ret;
-}
-static int ocfs2_read_xattr_bucket(struct inode *inode,
-                                   u64 blkno,
-                                   struct buffer_head **bhs,
-                                   int new)
-{
-        int ret = 0;
-        u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-        if (!new)
-                return ocfs2_read_blocks(inode, blkno,
-                                         blk_per_bucket, bhs, 0);
-        for (i = 0; i < blk_per_bucket; i++) {
-                bhs[i] = sb_getblk(inode->i_sb, blkno + i);
-                if (bhs[i] == NULL) {
-                        ret = -EIO;
-                        mlog_errno(ret);
-                        break;
-                }
-                ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
-        }
        return ret;
 }
@@ -3178,8 +3758,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
 {
        int ret, i;
        int count, start, len, name_value_len = 0, xe_len, name_offset = 0;
-        u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+        struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
-        struct buffer_head **s_bhs, **t_bhs = NULL;
        struct ocfs2_xattr_header *xh;
        struct ocfs2_xattr_entry *xe;
        int blocksize = inode->i_sb->s_blocksize;
@@ -3187,47 +3766,52 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
        mlog(0, "move some of xattrs from bucket %llu to %llu\n",
             (unsigned long long)blk, (unsigned long long)new_blk);
-        s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
+        s_bucket = ocfs2_xattr_bucket_new(inode);
-        if (!s_bhs)
+        t_bucket = ocfs2_xattr_bucket_new(inode);
-                return -ENOMEM;
+        if (!s_bucket || !t_bucket) {
+                ret = -ENOMEM;
-        ret = ocfs2_read_xattr_bucket(inode, blk, s_bhs, 0);
-        if (ret) {
                mlog_errno(ret);
                goto out;
        }
-        ret = ocfs2_journal_access(handle, inode, s_bhs[0],
+        ret = ocfs2_read_xattr_bucket(s_bucket, blk);
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }
-        t_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
+        ret = ocfs2_xattr_bucket_journal_access(handle, s_bucket,
-        if (!t_bhs) {
+                                                OCFS2_JOURNAL_ACCESS_WRITE);
-                ret = -ENOMEM;
+        if (ret) {
+                mlog_errno(ret);
                goto out;
        }
-        ret = ocfs2_read_xattr_bucket(inode, new_blk, t_bhs, new_bucket_head);
+        /*
+         * Even if !new_bucket_head, we're overwriting t_bucket.  Thus,
+         * there's no need to read it.
+         */
+        ret = ocfs2_init_xattr_bucket(t_bucket, new_blk);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }
-        for (i = 0; i < blk_per_bucket; i++) {
+        /*
-                ret = ocfs2_journal_access(handle, inode, t_bhs[i],
+         * Hey, if we're overwriting t_bucket, what difference does
-                                           new_bucket_head ?
+         * ACCESS_CREATE vs ACCESS_WRITE make?  See the comment in the
-                                           OCFS2_JOURNAL_ACCESS_CREATE :
+         * same part of ocfs2_cp_xattr_bucket().
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
+         */
-                if (ret) {
+        ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket,
-                        mlog_errno(ret);
+                                                new_bucket_head ?
-                        goto out;
+                                                OCFS2_JOURNAL_ACCESS_CREATE :
-                }
+                                                OCFS2_JOURNAL_ACCESS_WRITE);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
        }
-        xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data;
+        xh = bucket_xh(s_bucket);
        count = le16_to_cpu(xh->xh_count);
        start = ocfs2_xattr_find_divide_pos(xh);
@@ -3239,10 +3823,10 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
                 * The hash value is set as one larger than
                 * that of the last entry in the previous bucket.
                 */
-                for (i = 0; i < blk_per_bucket; i++)
+                for (i = 0; i < t_bucket->bu_blocks; i++)
-                        memset(t_bhs[i]->b_data, 0, blocksize);
+                        memset(bucket_block(t_bucket, i), 0, blocksize);
-                xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data;
+                xh = bucket_xh(t_bucket);
                xh->xh_free_start = cpu_to_le16(blocksize);
                xh->xh_entries[0].xe_name_hash = xe->xe_name_hash;
                le32_add_cpu(&xh->xh_entries[0].xe_name_hash, 1);
@@ -3251,11 +3835,10 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
        }
        /* copy the whole bucket to the new first. */
-        for (i = 0; i < blk_per_bucket; i++)
+        ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket);
-                memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
        /* update the new bucket. */
-        xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data;
+        xh = bucket_xh(t_bucket);
        /*
         * Calculate the total name/value len and xh_free_start for
@@ -3319,11 +3902,7 @@ set_num_buckets:
        else
                xh->xh_num_buckets = 0;
-        for (i = 0; i < blk_per_bucket; i++) {
+        ocfs2_xattr_bucket_journal_dirty(handle, t_bucket);
-                ocfs2_journal_dirty(handle, t_bhs[i]);
-                if (ret)
-                        mlog_errno(ret);
-        }
        /* store the first_hash of the new bucket. */
        if (first_hash)
@@ -3337,29 +3916,18 @@ set_num_buckets:
        if (start == count)
                goto out;
-        xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data;
+        xh = bucket_xh(s_bucket);
        memset(&xh->xh_entries[start], 0,
               sizeof(struct ocfs2_xattr_entry) * (count - start));
        xh->xh_count = cpu_to_le16(start);
        xh->xh_free_start = cpu_to_le16(name_offset);
        xh->xh_name_value_len = cpu_to_le16(name_value_len);
-        ocfs2_journal_dirty(handle, s_bhs[0]);
+        ocfs2_xattr_bucket_journal_dirty(handle, s_bucket);
-        if (ret)
-                mlog_errno(ret);
 out:
-        if (s_bhs) {
+        ocfs2_xattr_bucket_free(s_bucket);
-                for (i = 0; i < blk_per_bucket; i++)
+        ocfs2_xattr_bucket_free(t_bucket);
-                        brelse(s_bhs[i]);
-        }
-        kfree(s_bhs);
-        if (t_bhs) {
-                for (i = 0; i < blk_per_bucket; i++)
-                        brelse(t_bhs[i]);
-        }
-        kfree(t_bhs);
        return ret;
 }
@@ -3376,10 +3944,8 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
                                 u64 t_blkno,
                                 int t_is_new)
 {
-        int ret, i;
+        int ret;
-        int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+        struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
-        int blocksize = inode->i_sb->s_blocksize;
-        struct buffer_head **s_bhs, **t_bhs = NULL;
        BUG_ON(s_blkno == t_blkno);
@@ -3387,92 +3953,115 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
             (unsigned long long)s_blkno, (unsigned long long)t_blkno,
             t_is_new);
-        s_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
+        s_bucket = ocfs2_xattr_bucket_new(inode);
-                        GFP_NOFS);
+        t_bucket = ocfs2_xattr_bucket_new(inode);
-        if (!s_bhs)
+        if (!s_bucket || !t_bucket) {
-                return -ENOMEM;
+                ret = -ENOMEM;
+                mlog_errno(ret);
+                goto out;
+        }
-        ret = ocfs2_read_xattr_bucket(inode, s_blkno, s_bhs, 0);
+        ret = ocfs2_read_xattr_bucket(s_bucket, s_blkno);
        if (ret)
                goto out;
-        t_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
+        /*
-                        GFP_NOFS);
+         * Even if !t_is_new, we're overwriting t_bucket.  Thus,
-        if (!t_bhs) {
+         * there's no need to read it.
-                ret = -ENOMEM;
+         */
+        ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno);
+        if (ret)
                goto out;
-        }
-        ret = ocfs2_read_xattr_bucket(inode, t_blkno, t_bhs, t_is_new);
+        /*
+         * Hey, if we're overwriting t_bucket, what difference does
+         * ACCESS_CREATE vs ACCESS_WRITE make?  Well, if we allocated a new
+         * cluster to fill, we came here from
+         * ocfs2_mv_xattr_buckets(), and it is really new -
+         * ACCESS_CREATE is required.  But we also might have moved data
+         * out of t_bucket before extending back into it.
+         * ocfs2_add_new_xattr_bucket() can do this - its call to
+         * ocfs2_add_new_xattr_cluster() may have created a new extent
+         * and copied out the end of the old extent.  Then it re-extends
+         * the old extent back to create space for new xattrs.  That's
+         * how we get here, and the bucket isn't really new.
+         */
+        ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket,
+                                                t_is_new ?
+                                                OCFS2_JOURNAL_ACCESS_CREATE :
+                                                OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret)
                goto out;
-        for (i = 0; i < blk_per_bucket; i++) {
+        ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket);
-                ret = ocfs2_journal_access(handle, inode, t_bhs[i],
+        ocfs2_xattr_bucket_journal_dirty(handle, t_bucket);
-                                           t_is_new ?
-                                           OCFS2_JOURNAL_ACCESS_CREATE :
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
-                if (ret)
-                        goto out;
-        }
-        for (i = 0; i < blk_per_bucket; i++) {
-                memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
-                ocfs2_journal_dirty(handle, t_bhs[i]);
-        }
 out:
-        if (s_bhs) {
+        ocfs2_xattr_bucket_free(t_bucket);
-                for (i = 0; i < blk_per_bucket; i++)
+        ocfs2_xattr_bucket_free(s_bucket);
-                        brelse(s_bhs[i]);
-        }
-        kfree(s_bhs);
-        if (t_bhs) {
-                for (i = 0; i < blk_per_bucket; i++)
-                        brelse(t_bhs[i]);
-        }
-        kfree(t_bhs);
        return ret;
 }
 /*
- * Copy one xattr cluster from src_blk to to_blk.
+ * src_blk points to the start of an existing extent.  last_blk points to
- * The to_blk will become the first bucket header of the cluster, so its
+ * the last cluster in that extent.  to_blk points to a newly allocated
- * xh_num_buckets will be initialized as the bucket num in the cluster.
+ * extent.  We copy the buckets from the cluster at last_blk to the new
+ * extent.  If start_bucket is non-zero, we skip that many buckets before
+ * we start copying.  The new extent's xh_num_buckets gets set to the
+ * number of buckets we copied.  The old extent's xh_num_buckets shrinks
+ * by the same amount.
 */
-static int ocfs2_cp_xattr_cluster(struct inode *inode,
+static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
-                                  handle_t *handle,
+                                  u64 src_blk, u64 last_blk, u64 to_blk,
-                                  struct buffer_head *first_bh,
+                                  unsigned int start_bucket,
-                                  u64 src_blk,
-                                  u64 to_blk,
                                  u32 *first_hash)
 {
        int i, ret, credits;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-        int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+        int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
        int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
-        struct buffer_head *bh = NULL;
+        struct ocfs2_xattr_bucket *old_first, *new_first;
-        struct ocfs2_xattr_header *xh;
-        u64 to_blk_start = to_blk;
+        mlog(0, "mv xattrs from cluster %llu to %llu\n",
+             (unsigned long long)last_blk, (unsigned long long)to_blk);
+        BUG_ON(start_bucket >= num_buckets);
+        if (start_bucket) {
+                num_buckets -= start_bucket;
+                last_blk += (start_bucket * blks_per_bucket);
+        }
+        /* The first bucket of the original extent */
+        old_first = ocfs2_xattr_bucket_new(inode);
+        /* The first bucket of the new extent */
+        new_first = ocfs2_xattr_bucket_new(inode);
+        if (!old_first || !new_first) {
+                ret = -ENOMEM;
+                mlog_errno(ret);
+                goto out;
+        }
-        mlog(0, "cp xattrs from cluster %llu to %llu\n",
+        ret = ocfs2_read_xattr_bucket(old_first, src_blk);
-             (unsigned long long)src_blk, (unsigned long long)to_blk);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
        /*
-         * We need to update the new cluster and 1 more for the update of
+         * We need to update the first bucket of the old extent and all
-         * the 1st bucket of the previous extent rec.
+         * the buckets going to the new extent.
         */
-        credits = bpc + 1;
+        credits = ((num_buckets + 1) * blks_per_bucket) +
+                handle->h_buffer_credits;
        ret = ocfs2_extend_trans(handle, credits);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }
-        ret = ocfs2_journal_access(handle, inode, first_bh,
+        ret = ocfs2_xattr_bucket_journal_access(handle, old_first,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                                OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -3480,45 +4069,45 @@ static int ocfs2_cp_xattr_cluster(struct inode *inode,
        for (i = 0; i < num_buckets; i++) {
                ret = ocfs2_cp_xattr_bucket(inode, handle,
-                                            src_blk, to_blk, 1);
+                                            last_blk + (i * blks_per_bucket),
+                                            to_blk + (i * blks_per_bucket),
+                                            1);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
-                src_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-                to_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
        }
-        /* update the old bucket header. */
+        /*
-        xh = (struct ocfs2_xattr_header *)first_bh->b_data;
+         * Get the new bucket ready before we dirty anything
-        le16_add_cpu(&xh->xh_num_buckets, -num_buckets);
+         * (This actually shouldn't fail, because we already dirtied
+         * it once in ocfs2_cp_xattr_bucket()).
-        ocfs2_journal_dirty(handle, first_bh);
+         */
+        ret = ocfs2_read_xattr_bucket(new_first, to_blk);
-        /* update the new bucket header. */
+        if (ret) {
-        ret = ocfs2_read_block(inode, to_blk_start, &bh);
-        if (ret < 0) {
                mlog_errno(ret);
                goto out;
        }
+        ret = ocfs2_xattr_bucket_journal_access(handle, new_first,
-        ret = ocfs2_journal_access(handle, inode, bh,
+                                                OCFS2_JOURNAL_ACCESS_WRITE);
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }
-        xh = (struct ocfs2_xattr_header *)bh->b_data;
+        /* Now update the headers */
-        xh->xh_num_buckets = cpu_to_le16(num_buckets);
+        le16_add_cpu(&bucket_xh(old_first)->xh_num_buckets, -num_buckets);
+        ocfs2_xattr_bucket_journal_dirty(handle, old_first);
-        ocfs2_journal_dirty(handle, bh);
+        bucket_xh(new_first)->xh_num_buckets = cpu_to_le16(num_buckets);
+        ocfs2_xattr_bucket_journal_dirty(handle, new_first);
        if (first_hash)
-                *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
+                *first_hash = le32_to_cpu(bucket_xh(new_first)->xh_entries[0].xe_name_hash);
 out:
-        brelse(bh);
+        ocfs2_xattr_bucket_free(new_first);
+        ocfs2_xattr_bucket_free(old_first);
        return ret;
 }
@@ -3534,7 +4123,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
                                      u32 *first_hash)
 {
        u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-        int ret, credits = 2 * blk_per_bucket;
+        int ret, credits = 2 * blk_per_bucket + handle->h_buffer_credits;
        BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
@@ -3577,43 +4166,49 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
 */
 static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
                                            handle_t *handle,
-                                            struct buffer_head **first_bh,
+                                            struct ocfs2_xattr_bucket *first,
-                                            struct buffer_head **header_bh,
+                                            struct ocfs2_xattr_bucket *target,
                                            u64 new_blk,
-                                            u64 prev_blk,
                                            u32 prev_clusters,
                                            u32 *v_start,
                                            int *extend)
 {
-        int ret = 0;
+        int ret;
-        int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
        mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n",
-             (unsigned long long)prev_blk, prev_clusters,
+             (unsigned long long)bucket_blkno(first), prev_clusters,
             (unsigned long long)new_blk);
-        if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1)
+        if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) {
                ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
                                                          handle,
-                                                          first_bh,
+                                                          first, target,
-                                                          header_bh,
                                                          new_blk,
-                                                          prev_blk,
                                                          prev_clusters,
                                                          v_start);
-        else {
+                if (ret)
-                u64 last_blk = prev_blk + bpc * (prev_clusters - 1);
+                        mlog_errno(ret);
+        } else {
-                if (prev_clusters > 1 && (*header_bh)->b_blocknr != last_blk)
+                /* The start of the last cluster in the first extent */
-                        ret = ocfs2_cp_xattr_cluster(inode, handle, *first_bh,
+                u64 last_blk = bucket_blkno(first) +
-                                                     last_blk, new_blk,
+                        ((prev_clusters - 1) *
+                         ocfs2_clusters_to_blocks(inode->i_sb, 1));
+                if (prev_clusters > 1 && bucket_blkno(target) != last_blk) {
+                        ret = ocfs2_mv_xattr_buckets(inode, handle,
+                                                     bucket_blkno(first),
+                                                     last_blk, new_blk, 0,
                                                     v_start);
-                else {
+                        if (ret)
+                                mlog_errno(ret);
+                } else {
                        ret = ocfs2_divide_xattr_cluster(inode, handle,
                                                         last_blk, new_blk,
                                                         v_start);
+                        if (ret)
+                                mlog_errno(ret);
-                        if ((*header_bh)->b_blocknr == last_blk && extend)
+                        if ((bucket_blkno(target) == last_blk) && extend)
                                *extend = 0;
                }
        }
@@ -3639,56 +4234,37 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
 */
 static int ocfs2_add_new_xattr_cluster(struct inode *inode,
                                       struct buffer_head *root_bh,
-                                       struct buffer_head **first_bh,
+                                       struct ocfs2_xattr_bucket *first,
-                                       struct buffer_head **header_bh,
+                                       struct ocfs2_xattr_bucket *target,
                                       u32 *num_clusters,
                                       u32 prev_cpos,
-                                       u64 prev_blkno,
+                                       int *extend,
-                                       int *extend)
+                                       struct ocfs2_xattr_set_ctxt *ctxt)
 {
-        int ret, credits;
+        int ret;
        u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
        u32 prev_clusters = *num_clusters;
        u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0;
        u64 block;
-        handle_t *handle = NULL;
+        handle_t *handle = ctxt->handle;
-        struct ocfs2_alloc_context *data_ac = NULL;
-        struct ocfs2_alloc_context *meta_ac = NULL;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct ocfs2_extent_tree et;
        mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, "
             "previous xattr blkno = %llu\n",
             (unsigned long long)OCFS2_I(inode)->ip_blkno,
-             prev_cpos, (unsigned long long)prev_blkno);
+             prev_cpos, (unsigned long long)bucket_blkno(first));
        ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
-        ret = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
+        ret = ocfs2_journal_access_xb(handle, inode, root_bh,
-                                    &data_ac, &meta_ac);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
-        if (ret) {
-                mlog_errno(ret);
-                goto leave;
-        }
-        credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
-                                            clusters_to_add);
-        handle = ocfs2_start_trans(osb, credits);
-        if (IS_ERR(handle)) {
-                ret = PTR_ERR(handle);
-                handle = NULL;
-                mlog_errno(ret);
-                goto leave;
-        }
-        ret = ocfs2_journal_access(handle, inode, root_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret < 0) {
                mlog_errno(ret);
                goto leave;
        }
-        ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
+        ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 1,
                                     clusters_to_add, &bit_off, &num_bits);
        if (ret < 0) {
                if (ret != -ENOSPC)
@@ -3702,7 +4278,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
        mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n",
             num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
-        if (prev_blkno + prev_clusters * bpc == block &&
+        if (bucket_blkno(first) + (prev_clusters * bpc) == block &&
            (prev_clusters + num_bits) << osb->s_clustersize_bits <=
             OCFS2_MAX_XATTR_TREE_LEAF_SIZE) {
                /*
@@ -3721,10 +4297,9 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
        } else {
                ret = ocfs2_adjust_xattr_cross_cluster(inode,
                                                       handle,
-                                                       first_bh,
+                                                       first,
-                                                       header_bh,
+                                                       target,
                                                       block,
-                                                       prev_blkno,
                                                       prev_clusters,
                                                       &v_start,
                                                       extend);
@@ -3734,149 +4309,137 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
                }
        }
-        if (handle->h_buffer_credits < credits) {
-                /*
-                 * The journal has been restarted before, and don't
-                 * have enough space for the insertion, so extend it
-                 * here.
-                 */
-                ret = ocfs2_extend_trans(handle, credits);
-                if (ret) {
-                        mlog_errno(ret);
-                        goto leave;
-                }
-        }
        mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
             num_bits, (unsigned long long)block, v_start);
        ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block,
-                                  num_bits, 0, meta_ac);
+                                  num_bits, 0, ctxt->meta_ac);
        if (ret < 0) {
                mlog_errno(ret);
                goto leave;
        }
        ret = ocfs2_journal_dirty(handle, root_bh);
-        if (ret < 0) {
+        if (ret < 0)
                mlog_errno(ret);
-                goto leave;
-        }
 leave:
-        if (handle)
-                ocfs2_commit_trans(osb, handle);
-        if (data_ac)
-                ocfs2_free_alloc_context(data_ac);
-        if (meta_ac)
-                ocfs2_free_alloc_context(meta_ac);
        return ret;
 }
 /*
- * Extend a new xattr bucket and move xattrs to the end one by one until
+ * We are given an extent.  'first' is the bucket at the very front of
- * We meet with start_bh. Only move half of the xattrs to the bucket after it.
+ * the extent.  The extent has space for an additional bucket past
+ * bucket_xh(first)->xh_num_buckets.  'target_blk' is the block number
+ * of the target bucket.  We wish to shift every bucket past the target
+ * down one, filling in that additional space.  When we get back to the
+ * target, we split the target between itself and the now-empty bucket
+ * at target+1 (aka, target_blk + blk_per_bucket).
 */
 static int ocfs2_extend_xattr_bucket(struct inode *inode,
-                                     struct buffer_head *first_bh,
+                                     handle_t *handle,
-                                     struct buffer_head *start_bh,
+                                     struct ocfs2_xattr_bucket *first,
+                                     u64 target_blk,
                                     u32 num_clusters)
 {
        int ret, credits;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-        u64 start_blk = start_bh->b_blocknr, end_blk;
+        u64 end_blk;
-        u32 num_buckets = num_clusters * ocfs2_xattr_buckets_per_cluster(osb);
+        u16 new_bucket = le16_to_cpu(bucket_xh(first)->xh_num_buckets);
-        handle_t *handle;
-        struct ocfs2_xattr_header *first_xh =
-                                (struct ocfs2_xattr_header *)first_bh->b_data;
-        u16 bucket = le16_to_cpu(first_xh->xh_num_buckets);
        mlog(0, "extend xattr bucket in %llu, xattr extend rec starting "
-             "from %llu, len = %u\n", (unsigned long long)start_blk,
+             "from %llu, len = %u\n", (unsigned long long)target_blk,
-             (unsigned long long)first_bh->b_blocknr, num_clusters);
+             (unsigned long long)bucket_blkno(first), num_clusters);
-        BUG_ON(bucket >= num_buckets);
+        /* The extent must have room for an additional bucket */
+        BUG_ON(new_bucket >=
+               (num_clusters * ocfs2_xattr_buckets_per_cluster(osb)));
-        end_blk = first_bh->b_blocknr + (bucket - 1) * blk_per_bucket;
+        /* end_blk points to the last existing bucket */
+        end_blk = bucket_blkno(first) + ((new_bucket - 1) * blk_per_bucket);
        /*
-         * We will touch all the buckets after the start_bh(include it).
+         * end_blk is the start of the last existing bucket.
-         * Add one more bucket and modify the first_bh.
+         * Thus, (end_blk - target_blk) covers the target bucket and
+         * every bucket after it up to, but not including, the last
+         * existing bucket.  Then we add the last existing bucket, the
+         * new bucket, and the first bucket (3 * blk_per_bucket).
         */
-        credits = end_blk - start_blk + 2 * blk_per_bucket + 1;
+        credits = (end_blk - target_blk) + (3 * blk_per_bucket) +
-        handle = ocfs2_start_trans(osb, credits);
+                  handle->h_buffer_credits;
-        if (IS_ERR(handle)) {
+        ret = ocfs2_extend_trans(handle, credits);
-                ret = PTR_ERR(handle);
+        if (ret) {
-                handle = NULL;
                mlog_errno(ret);
                goto out;
        }
-        ret = ocfs2_journal_access(handle, inode, first_bh,
+        ret = ocfs2_xattr_bucket_journal_access(handle, first,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                                OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
-                goto commit;
+                goto out;
        }
-        while (end_blk != start_blk) {
+        while (end_blk != target_blk) {
                ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk,
                                            end_blk + blk_per_bucket, 0);
                if (ret)
-                        goto commit;
+                        goto out;
                end_blk -= blk_per_bucket;
        }
-        /* Move half of the xattr in start_blk to the next bucket. */
+        /* Move half of the xattrs in target_blk to the next bucket. */
-        ret = ocfs2_divide_xattr_bucket(inode, handle, start_blk,
+        ret = ocfs2_divide_xattr_bucket(inode, handle, target_blk,
-                                        start_blk + blk_per_bucket, NULL, 0);
+                                        target_blk + blk_per_bucket, NULL, 0);
-        le16_add_cpu(&first_xh->xh_num_buckets, 1);
+        le16_add_cpu(&bucket_xh(first)->xh_num_buckets, 1);
-        ocfs2_journal_dirty(handle, first_bh);
+        ocfs2_xattr_bucket_journal_dirty(handle, first);
-commit:
-        ocfs2_commit_trans(osb, handle);
 out:
        return ret;
 }
 /*
- * Add new xattr bucket in an extent record and adjust the buckets accordingly.
+ * Add new xattr bucket in an extent record and adjust the buckets
- * xb_bh is the ocfs2_xattr_block.
+ * accordingly.  xb_bh is the ocfs2_xattr_block, and target is the
- * We will move all the buckets starting from header_bh to the next place. As
+ * bucket we want to insert into.
- * for this one, half num of its xattrs will be moved to the next one.
+ *
+ * In the easy case, we will move all the buckets after target down by
+ * one. Half of target's xattrs will be moved to the next bucket.
 *
- * We will allocate a new cluster if current cluster is full and adjust
+ * If current cluster is full, we'll allocate a new one.  This may not
- * header_bh and first_bh if the insert place is moved to the new cluster.
+ * be contiguous.  The underlying calls will make sure that there is
+ * space for the insert, shifting buckets around if necessary.
+ * 'target' may be moved by those calls.
 */
 static int ocfs2_add_new_xattr_bucket(struct inode *inode,
                                      struct buffer_head *xb_bh,
-                                      struct buffer_head *header_bh)
+                                      struct ocfs2_xattr_bucket *target,
+                                      struct ocfs2_xattr_set_ctxt *ctxt)
 {
-        struct ocfs2_xattr_header *first_xh = NULL;
-        struct buffer_head *first_bh = NULL;
        struct ocfs2_xattr_block *xb =
                        (struct ocfs2_xattr_block *)xb_bh->b_data;
        struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
        struct ocfs2_extent_list *el = &xb_root->xt_list;
-        struct ocfs2_xattr_header *xh =
+        u32 name_hash =
-                        (struct ocfs2_xattr_header *)header_bh->b_data;
+                le32_to_cpu(bucket_xh(target)->xh_entries[0].xe_name_hash);
-        u32 name_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-        struct super_block *sb = inode->i_sb;
-        struct ocfs2_super *osb = OCFS2_SB(sb);
        int ret, num_buckets, extend = 1;
        u64 p_blkno;
        u32 e_cpos, num_clusters;
+        /* The bucket at the front of the extent */
+        struct ocfs2_xattr_bucket *first;
-        mlog(0, "Add new xattr bucket starting form %llu\n",
+        mlog(0, "Add new xattr bucket starting from %llu\n",
-             (unsigned long long)header_bh->b_blocknr);
+             (unsigned long long)bucket_blkno(target));
-        /*
+        /* The first bucket of the original extent */
-         * Add refrence for header_bh here because it may be
+        first = ocfs2_xattr_bucket_new(inode);
-         * changed in ocfs2_add_new_xattr_cluster and we need
+        if (!first) {
-         * to free it in the end.
+                ret = -ENOMEM;
-         */
+                mlog_errno(ret);
-        get_bh(header_bh);
+                goto out;
+        }
        ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos,
                                  &num_clusters, el);
@@ -3885,40 +4448,45 @@ static int ocfs2_add_new_xattr_bucket(struct inode *inode,
                goto out;
        }
-        ret = ocfs2_read_block(inode, p_blkno, &first_bh);
+        ret = ocfs2_read_xattr_bucket(first, p_blkno);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }
        num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters;
-        first_xh = (struct ocfs2_xattr_header *)first_bh->b_data;
+        if (num_buckets == le16_to_cpu(bucket_xh(first)->xh_num_buckets)) {
+                /*
-        if (num_buckets == le16_to_cpu(first_xh->xh_num_buckets)) {
+                 * This can move first+target if the target bucket moves
+                 * to the new extent.
+                 */
                ret = ocfs2_add_new_xattr_cluster(inode,
                                                  xb_bh,
-                                                  &first_bh,
+                                                  first,
-                                                  &header_bh,
+                                                  target,
                                                  &num_clusters,
                                                  e_cpos,
-                                                  p_blkno,
+                                                  &extend,
-                                                  &extend);
+                                                  ctxt);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
        }
-        if (extend)
+        if (extend) {
                ret = ocfs2_extend_xattr_bucket(inode,
-                                                first_bh,
+                                                ctxt->handle,
-                                                header_bh,
+                                                first,
+                                                bucket_blkno(target),
                                                num_clusters);
-        if (ret)
+                if (ret)
-                mlog_errno(ret);
+                        mlog_errno(ret);
+        }
 out:
-        brelse(first_bh);
+        ocfs2_xattr_bucket_free(first);
-        brelse(header_bh);
        return ret;
 }
@@ -3929,7 +4497,7 @@ static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
        int block_off = offs >> inode->i_sb->s_blocksize_bits;
        offs = offs % inode->i_sb->s_blocksize;
-        return bucket->bhs[block_off]->b_data + offs;
+        return bucket_block(bucket, block_off) + offs;
 }
 /*
@@ -3984,7 +4552,7 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode,
                                xe->xe_value_size = 0;
                        val = ocfs2_xattr_bucket_get_val(inode,
-                                                         &xs->bucket, offs);
+                                                         xs->bucket, offs);
                        memset(val + OCFS2_XATTR_SIZE(name_len), 0,
                               size - OCFS2_XATTR_SIZE(name_len));
                        if (OCFS2_XATTR_SIZE(xi->value_len) > 0)
@@ -4062,8 +4630,7 @@ set_new_name_value:
                xh->xh_free_start = cpu_to_le16(offs);
        }
-        val = ocfs2_xattr_bucket_get_val(inode,
+        val = ocfs2_xattr_bucket_get_val(inode, xs->bucket, offs - size);
-                                         &xs->bucket, offs - size);
        xe->xe_name_offset = cpu_to_le16(offs - size);
        memset(val, 0, size);
@@ -4079,125 +4646,45 @@ set_new_name_value:
        return;
 }
-static int ocfs2_xattr_bucket_handle_journal(struct inode *inode,
-                                             handle_t *handle,
-                                             struct ocfs2_xattr_search *xs,
-                                             struct buffer_head **bhs,
-                                             u16 bh_num)
-{
-        int ret = 0, off, block_off;
-        struct ocfs2_xattr_entry *xe = xs->here;
-        /*
-         * First calculate all the blocks we should journal_access
-         * and journal_dirty. The first block should always be touched.
-         */
-        ret = ocfs2_journal_dirty(handle, bhs[0]);
-        if (ret)
-                mlog_errno(ret);
-        /* calc the data. */
-        off = le16_to_cpu(xe->xe_name_offset);
-        block_off = off >> inode->i_sb->s_blocksize_bits;
-        ret = ocfs2_journal_dirty(handle, bhs[block_off]);
-        if (ret)
-                mlog_errno(ret);
-        return ret;
-}
 /*
 * Set the xattr entry in the specified bucket.
 * The bucket is indicated by xs->bucket and it should have enough
 * space for the xattr insertion.
 */
 static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
+                                           handle_t *handle,
                                           struct ocfs2_xattr_info *xi,
                                           struct ocfs2_xattr_search *xs,
                                           u32 name_hash,
                                           int local)
 {
-        int i, ret;
+        int ret;
-        handle_t *handle = NULL;
+        u64 blkno;
-        u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
             (unsigned long)xi->value_len, xi->name_index,
-             (unsigned long long)xs->bucket.bhs[0]->b_blocknr);
+             (unsigned long long)bucket_blkno(xs->bucket));
-        if (!xs->bucket.bhs[1]) {
+        if (!xs->bucket->bu_bhs[1]) {
-                ret = ocfs2_read_blocks(inode,
+                blkno = bucket_blkno(xs->bucket);
-                                        xs->bucket.bhs[0]->b_blocknr + 1,
+                ocfs2_xattr_bucket_relse(xs->bucket);
-                                        blk_per_bucket - 1, &xs->bucket.bhs[1],
+                ret = ocfs2_read_xattr_bucket(xs->bucket, blkno);
-                                        0);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
        }
-        handle = ocfs2_start_trans(osb, blk_per_bucket);
+        ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
-        if (IS_ERR(handle)) {
+                                                OCFS2_JOURNAL_ACCESS_WRITE);
-                ret = PTR_ERR(handle);
+        if (ret < 0) {
-                handle = NULL;
                mlog_errno(ret);
                goto out;
        }
-        for (i = 0; i < blk_per_bucket; i++) {
-                ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[i],
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
-                if (ret < 0) {
-                        mlog_errno(ret);
-                        goto out;
-                }
-        }
        ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local);
+        ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
-        /*Only dirty the blocks we have touched in set xattr. */
-        ret = ocfs2_xattr_bucket_handle_journal(inode, handle, xs,
-                                                xs->bucket.bhs, blk_per_bucket);
-        if (ret)
-                mlog_errno(ret);
-out:
-        ocfs2_commit_trans(osb, handle);
-        return ret;
-}
-static int ocfs2_xattr_value_update_size(struct inode *inode,
-                                         struct buffer_head *xe_bh,
-                                         struct ocfs2_xattr_entry *xe,
-                                         u64 new_size)
-{
-        int ret;
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-        handle_t *handle = NULL;
-        handle = ocfs2_start_trans(osb, 1);
-        if (IS_ERR(handle)) {
-                ret = -ENOMEM;
-                mlog_errno(ret);
-                goto out;
-        }
-        ret = ocfs2_journal_access(handle, inode, xe_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
-        if (ret < 0) {
-                mlog_errno(ret);
-                goto out_commit;
-        }
-        xe->xe_value_size = cpu_to_le64(new_size);
-        ret = ocfs2_journal_dirty(handle, xe_bh);
-        if (ret < 0)
-                mlog_errno(ret);
-out_commit:
-        ocfs2_commit_trans(osb, handle);
 out:
        return ret;
 }
@@ -4210,18 +4697,19 @@ out:
 * Copy the new updated xe and xe_value_root to new_xe and new_xv if needed.
 */
 static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
-                                             struct buffer_head *header_bh,
+                                             struct ocfs2_xattr_bucket *bucket,
                                             int xe_off,
-                                             int len)
+                                             int len,
+                                             struct ocfs2_xattr_set_ctxt *ctxt)
 {
        int ret, offset;
        u64 value_blk;
-        struct buffer_head *value_bh = NULL;
-        struct ocfs2_xattr_value_root *xv;
        struct ocfs2_xattr_entry *xe;
-        struct ocfs2_xattr_header *xh =
+        struct ocfs2_xattr_header *xh = bucket_xh(bucket);
-                        (struct ocfs2_xattr_header *)header_bh->b_data;
        size_t blocksize = inode->i_sb->s_blocksize;
+        struct ocfs2_xattr_value_buf vb = {
+                .vb_access = ocfs2_journal_access,
+        };
        xe = &xh->xh_entries[xe_off];
@@ -4234,49 +4722,57 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
        /* We don't allow ocfs2_xattr_value to be stored across different blocks. */
        BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize);
-        value_blk += header_bh->b_blocknr;
-        ret = ocfs2_read_block(inode, value_blk, &value_bh);
+        vb.vb_bh = bucket->bu_bhs[value_blk];
-        if (ret) {
+        BUG_ON(!vb.vb_bh);
-                mlog_errno(ret);
-                goto out;
-        }
-        xv = (struct ocfs2_xattr_value_root *)
+        vb.vb_xv = (struct ocfs2_xattr_value_root *)
-                (value_bh->b_data + offset % blocksize);
+                (vb.vb_bh->b_data + offset % blocksize);
+        /*
+         * From here on out we have to dirty the bucket.  The generic
+         * value calls only modify one of the bucket's bhs, but we need
+         * to send the bucket at once.  So if they error, they *could* have
+         * modified something.  We have to assume they did, and dirty
+         * the whole bucket.  This leaves us in a consistent state.
+         */
        mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n",
-             xe_off, (unsigned long long)header_bh->b_blocknr, len);
+             xe_off, (unsigned long long)bucket_blkno(bucket), len);
-        ret = ocfs2_xattr_value_truncate(inode, value_bh, xv, len);
+        ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }
-        ret = ocfs2_xattr_value_update_size(inode, header_bh, xe, len);
+        ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket,
+                                                OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }
+        xe->xe_value_size = cpu_to_le64(len);
+        ocfs2_xattr_bucket_journal_dirty(ctxt->handle, bucket);
 out:
-        brelse(value_bh);
        return ret;
 }
 static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
-                                                struct ocfs2_xattr_search *xs,
+                                        struct ocfs2_xattr_search *xs,
-                                                int len)
+                                        int len,
+                                        struct ocfs2_xattr_set_ctxt *ctxt)
 {
        int ret, offset;
        struct ocfs2_xattr_entry *xe = xs->here;
        struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base;
-        BUG_ON(!xs->bucket.bhs[0] || !xe || ocfs2_xattr_is_local(xe));
+        BUG_ON(!xs->bucket->bu_bhs[0] || !xe || ocfs2_xattr_is_local(xe));
        offset = xe - xh->xh_entries;
-        ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket.bhs[0],
+        ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket,
-                                                offset, len);
+                                                offset, len, ctxt);
        if (ret)
                mlog_errno(ret);
@@ -4284,6 +4780,7 @@ static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
 }
 static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
+                                                handle_t *handle,
                                                struct ocfs2_xattr_search *xs,
                                                char *val,
                                                int value_len)
@@ -4299,7 +4796,8 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
        xv = (struct ocfs2_xattr_value_root *)(xs->base + offset);
-        return __ocfs2_xattr_set_value_outside(inode, xv, val, value_len);
+        return __ocfs2_xattr_set_value_outside(inode, handle,
+                                               xv, val, value_len);
 }
 static int ocfs2_rm_xattr_cluster(struct inode *inode,
@@ -4343,15 +4841,15 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
                }
        }
-        handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
+        handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
        if (IS_ERR(handle)) {
                ret = -ENOMEM;
                mlog_errno(ret);
                goto out;
        }
-        ret = ocfs2_journal_access(handle, inode, root_bh,
+        ret = ocfs2_journal_access_xb(handle, inode, root_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out_commit;
@@ -4392,26 +4890,19 @@ out:
 }
 static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
+                                         handle_t *handle,
                                         struct ocfs2_xattr_search *xs)
 {
-        handle_t *handle = NULL;
+        struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
-        struct ocfs2_xattr_header *xh = xs->bucket.xh;
        struct ocfs2_xattr_entry *last = &xh->xh_entries[
                                                le16_to_cpu(xh->xh_count) - 1];
        int ret = 0;
-        handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1);
+        ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
-        if (IS_ERR(handle)) {
+                                                OCFS2_JOURNAL_ACCESS_WRITE);
-                ret = PTR_ERR(handle);
-                mlog_errno(ret);
-                return;
-        }
-        ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[0],
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
-                goto out_commit;
+                return;
        }
        /* Remove the old entry. */
@@ -4420,11 +4911,7 @@ static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
        memset(last, 0, sizeof(struct ocfs2_xattr_entry));
        le16_add_cpu(&xh->xh_count, -1);
-        ret = ocfs2_journal_dirty(handle, xs->bucket.bhs[0]);
+        ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
-        if (ret < 0)
-                mlog_errno(ret);
-out_commit:
-        ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 }
 /*
@@ -4440,7 +4927,8 @@ out_commit:
 */
 static int ocfs2_xattr_set_in_bucket(struct inode *inode,
                                     struct ocfs2_xattr_info *xi,
-                                     struct ocfs2_xattr_search *xs)
+                                     struct ocfs2_xattr_search *xs,
+                                     struct ocfs2_xattr_set_ctxt *ctxt)
 {
        int ret, local = 1;
        size_t value_len;
@@ -4468,7 +4956,8 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
                        value_len = 0;
                ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
-                                                           value_len);
+                                                           value_len,
+                                                           ctxt);
                if (ret)
                        goto out;
@@ -4488,7 +4977,8 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
                xi->value_len = OCFS2_XATTR_ROOT_SIZE;
        }
-        ret = ocfs2_xattr_set_entry_in_bucket(inode, xi, xs, name_hash, local);
+        ret = ocfs2_xattr_set_entry_in_bucket(inode, ctxt->handle, xi, xs,
+                                              name_hash, local);
        if (ret) {
                mlog_errno(ret);
                goto out;
@@ -4499,7 +4989,7 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
        /* allocate the space now for the outside block storage. */
        ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
-                                                   value_len);
+                                                   value_len, ctxt);
        if (ret) {
                mlog_errno(ret);
@@ -4509,13 +4999,14 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
                         * storage and we have allocated xattr already,
                         * so need to remove it.
                         */
-                        ocfs2_xattr_bucket_remove_xs(inode, xs);
+                        ocfs2_xattr_bucket_remove_xs(inode, ctxt->handle, xs);
                }
                goto out;
        }
 set_value_outside:
-        ret = ocfs2_xattr_bucket_set_value_outside(inode, xs, val, value_len);
+        ret = ocfs2_xattr_bucket_set_value_outside(inode, ctxt->handle,
+                                                   xs, val, value_len);
 out:
        return ret;
 }
@@ -4530,7 +5021,7 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
                                              struct ocfs2_xattr_bucket *bucket,
                                              const char *name)
 {
-        struct ocfs2_xattr_header *xh = bucket->xh;
+        struct ocfs2_xattr_header *xh = bucket_xh(bucket);
        u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name));
        if (name_hash != le32_to_cpu(xh->xh_entries[0].xe_name_hash))
@@ -4540,7 +5031,7 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
            xh->xh_entries[0].xe_name_hash) {
                mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, "
                     "hash = %u\n",
-                     (unsigned long long)bucket->bhs[0]->b_blocknr,
+                     (unsigned long long)bucket_blkno(bucket),
                     le32_to_cpu(xh->xh_entries[0].xe_name_hash));
                return -ENOSPC;
        }
@@ -4550,16 +5041,16 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
 static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
                                             struct ocfs2_xattr_info *xi,
-                                             struct ocfs2_xattr_search *xs)
+                                             struct ocfs2_xattr_search *xs,
+                                             struct ocfs2_xattr_set_ctxt *ctxt)
 {
        struct ocfs2_xattr_header *xh;
        struct ocfs2_xattr_entry *xe;
        u16 count, header_size, xh_free_start;
-        int i, free, max_free, need, old;
+        int free, max_free, need, old;
        size_t value_size = 0, name_len = strlen(xi->name);
        size_t blocksize = inode->i_sb->s_blocksize;
        int ret, allocation = 0;
-        u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
        mlog_entry("Set xattr %s in xattr index block\n", xi->name);
@@ -4574,7 +5065,7 @@ try_again:
        mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
                        "of %u which exceed block size\n",
-                        (unsigned long long)xs->bucket.bhs[0]->b_blocknr,
+                        (unsigned long long)bucket_blkno(xs->bucket),
                        header_size);
        if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE)
@@ -4614,11 +5105,13 @@ try_again:
        mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, "
             "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len ="
             " %u\n", xs->not_found,
-             (unsigned long long)xs->bucket.bhs[0]->b_blocknr,
+             (unsigned long long)bucket_blkno(xs->bucket),
             free, need, max_free, le16_to_cpu(xh->xh_free_start),
             le16_to_cpu(xh->xh_name_value_len));
-        if (free < need || count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
+        if (free < need ||
+            (xs->not_found &&
+             count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb))) {
                if (need <= max_free &&
                    count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
                        /*
@@ -4626,7 +5119,8 @@ try_again:
                         * name/value will be moved, the xe shouldn't be changed
                         * in xs.
                         */
-                        ret = ocfs2_defrag_xattr_bucket(inode, &xs->bucket);
+                        ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
+                                                        xs->bucket);
                        if (ret) {
                                mlog_errno(ret);
                                goto out;
@@ -4658,7 +5152,7 @@ try_again:
                 * add a new bucket for the insert.
                 */
                ret = ocfs2_check_xattr_bucket_collision(inode,
-                                                         &xs->bucket,
+                                                         xs->bucket,
                                                         xi->name);
                if (ret) {
                        mlog_errno(ret);
@@ -4667,17 +5161,21 @@ try_again:
                ret = ocfs2_add_new_xattr_bucket(inode,
                                                 xs->xattr_bh,
-                                                 xs->bucket.bhs[0]);
+                                                 xs->bucket,
+                                                 ctxt);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
-                for (i = 0; i < blk_per_bucket; i++)
+                /*
-                        brelse(xs->bucket.bhs[i]);
+                 * ocfs2_add_new_xattr_bucket() will have updated
+                 * xs->bucket if it moved, but it will not have updated
-                memset(&xs->bucket, 0, sizeof(xs->bucket));
+                 * any of the other search fields.  Thus, we drop it and
+                 * re-search.  Everything should be cached, so it'll be
+                 * quick.
+                 */
+                ocfs2_xattr_bucket_relse(xs->bucket);
                ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
                                                   xi->name_index,
                                                   xi->name, xs);
@@ -4689,7 +5187,7 @@ try_again:
        }
 xattr_set:
-        ret = ocfs2_xattr_set_in_bucket(inode, xi, xs);
+        ret = ocfs2_xattr_set_in_bucket(inode, xi, xs, ctxt);
 out:
        mlog_exit(ret);
        return ret;
@@ -4700,24 +5198,41 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
                                        void *para)
 {
        int ret = 0;
-        struct ocfs2_xattr_header *xh = bucket->xh;
+        struct ocfs2_xattr_header *xh = bucket_xh(bucket);
        u16 i;
        struct ocfs2_xattr_entry *xe;
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,};
+        int credits = ocfs2_remove_extent_credits(osb->sb) +
+                ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+        ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
        for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
                xe = &xh->xh_entries[i];
                if (ocfs2_xattr_is_local(xe))
                        continue;
-                ret = ocfs2_xattr_bucket_value_truncate(inode,
+                ctxt.handle = ocfs2_start_trans(osb, credits);
-                                                        bucket->bhs[0],
+                if (IS_ERR(ctxt.handle)) {
-                                                        i, 0);
+                        ret = PTR_ERR(ctxt.handle);
+                        mlog_errno(ret);
+                        break;
+                }
+                ret = ocfs2_xattr_bucket_value_truncate(inode, bucket,
+                                                        i, 0, &ctxt);
+                ocfs2_commit_trans(osb, ctxt.handle);
                if (ret) {
                        mlog_errno(ret);
                        break;
                }
        }
+        ocfs2_schedule_truncate_log_flush(osb, 1);
+        ocfs2_run_deallocs(osb, &ctxt.dealloc);
        return ret;
 }
@@ -4768,6 +5283,74 @@ out:
 }
 /*
+ * 'security' attributes support
+ */
+static size_t ocfs2_xattr_security_list(struct inode *inode, char *list,
+                                        size_t list_size, const char *name,
+                                        size_t name_len)
+{
+        const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
+        const size_t total_len = prefix_len + name_len + 1;
+        if (list && total_len <= list_size) {
+                memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
+                memcpy(list + prefix_len, name, name_len);
+                list[prefix_len + name_len] = '\0';
+        }
+        return total_len;
+}
+static int ocfs2_xattr_security_get(struct inode *inode, const char *name,
+                                    void *buffer, size_t size)
+{
+        if (strcmp(name, "") == 0)
+                return -EINVAL;
+        return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_SECURITY, name,
+                               buffer, size);
+}
+static int ocfs2_xattr_security_set(struct inode *inode, const char *name,
+                                    const void *value, size_t size, int flags)
+{
+        if (strcmp(name, "") == 0)
+                return -EINVAL;
+        return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, name, value,
+                               size, flags);
+}
+int ocfs2_init_security_get(struct inode *inode,
+                            struct inode *dir,
+                            struct ocfs2_security_xattr_info *si)
+{
+        /* check whether ocfs2 supports the xattr feature */
+        if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb)))
+                return -EOPNOTSUPP;
+        return security_inode_init_security(inode, dir, &si->name, &si->value,
+                                            &si->value_len);
+}
+int ocfs2_init_security_set(handle_t *handle,
+                            struct inode *inode,
+                            struct buffer_head *di_bh,
+                            struct ocfs2_security_xattr_info *si,
+                            struct ocfs2_alloc_context *xattr_ac,
+                            struct ocfs2_alloc_context *data_ac)
+{
+        return ocfs2_xattr_set_handle(handle, inode, di_bh,
+                                     OCFS2_XATTR_INDEX_SECURITY,
+                                     si->name, si->value, si->value_len, 0,
+                                     xattr_ac, data_ac);
+}
+struct xattr_handler ocfs2_xattr_security_handler = {
+        .prefix = XATTR_SECURITY_PREFIX,
+        .list   = ocfs2_xattr_security_list,
+        .get    = ocfs2_xattr_security_get,
+        .set    = ocfs2_xattr_security_set,
+};
+/*
 * 'trusted' attributes support
 */
 static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 1d8314c7656d..5a1ebc789f7e 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -30,13 +30,58 @@ enum ocfs2_xattr_type {
        OCFS2_XATTR_MAX
 };
+struct ocfs2_security_xattr_info {
+        int enable;
+        char *name;
+        void *value;
+        size_t value_len;
+};
 extern struct xattr_handler ocfs2_xattr_user_handler;
 extern struct xattr_handler ocfs2_xattr_trusted_handler;
+extern struct xattr_handler ocfs2_xattr_security_handler;
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+extern struct xattr_handler ocfs2_xattr_acl_access_handler;
+extern struct xattr_handler ocfs2_xattr_acl_default_handler;
+#endif
 extern struct xattr_handler *ocfs2_xattr_handlers[];
 ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
+int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int,
+                           const char *, void *, size_t);
 int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
                    size_t, int);
+int ocfs2_xattr_set_handle(handle_t *, struct inode *, struct buffer_head *,
+                           int, const char *, const void *, size_t, int,
+                           struct ocfs2_alloc_context *,
+                           struct ocfs2_alloc_context *);
 int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
+int ocfs2_init_security_get(struct inode *, struct inode *,
+                            struct ocfs2_security_xattr_info *);
+int ocfs2_init_security_set(handle_t *, struct inode *,
+                            struct buffer_head *,
+                            struct ocfs2_security_xattr_info *,
+                            struct ocfs2_alloc_context *,
+                            struct ocfs2_alloc_context *);
+int ocfs2_calc_security_init(struct inode *,
+                             struct ocfs2_security_xattr_info *,
+                             int *, int *, struct ocfs2_alloc_context **);
+int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *,
+                          int, struct ocfs2_security_xattr_info *,
+                          int *, int *, struct ocfs2_alloc_context **);
+/*
+ * xattrs can live inside an inode, as part of an external xattr block,
+ * or inside an xattr bucket, which is the leaf of a tree rooted in an
+ * xattr block.  Some of the xattr calls, especially the value setting
+ * functions, want to treat each of these locations as equal.  Let's wrap
+ * them in a structure that we can pass around instead of raw buffer_heads.
+ */
+struct ocfs2_xattr_value_buf {
+        struct buffer_head              *vb_bh;
+        ocfs2_journal_access_func       vb_access;
+        struct ocfs2_xattr_value_root   *vb_xv;
+};
 #endif /* OCFS2_XATTR_H */
author	James Morris <jmorris@namei.org>	2009-02-05 19:01:45 -0500
committer	James Morris <jmorris@namei.org>	2009-02-05 19:01:45 -0500
commit	cb5629b10d64a8006622ce3a52bc887d91057d69 (patch)
tree	7c06d8f30783115e3384721046258ce615b129c5 /fs/ocfs2
parent	8920d5ad6ba74ae8ab020e90cc4d976980e68701 (diff)
parent	f01d1d546abb2f4028b5299092f529eefb01253a (diff)