Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/v9fs.c | 2
-rw-r--r--  fs/9p/vfs_dir.c | 1
-rw-r--r--  fs/9p/vfs_inode.c | 3
-rw-r--r--  fs/Kconfig | 165
-rw-r--r--  fs/Kconfig.binfmt | 6
-rw-r--r--  fs/Makefile | 5
-rw-r--r--  fs/adfs/dir.c | 1
-rw-r--r--  fs/adfs/super.c | 2
-rw-r--r--  fs/affs/dir.c | 1
-rw-r--r--  fs/affs/super.c | 2
-rw-r--r--  fs/afs/super.c | 2
-rw-r--r--  fs/autofs/inode.c | 2
-rw-r--r--  fs/autofs4/inode.c | 2
-rw-r--r--  fs/autofs4/root.c | 2
-rw-r--r--  fs/befs/linuxvfs.c | 3
-rw-r--r--  fs/bfs/dir.c | 2
-rw-r--r--  fs/binfmt_flat.c | 4
-rw-r--r--  fs/binfmt_misc.c | 4
-rw-r--r--  fs/bio-integrity.c | 29
-rw-r--r--  fs/bio.c | 307
-rw-r--r--  fs/block_dev.c | 182
-rw-r--r--  fs/buffer.c | 13
-rw-r--r--  fs/cifs/CHANGES | 10
-rw-r--r--  fs/cifs/README | 44
-rw-r--r--  fs/cifs/asn1.c | 11
-rw-r--r--  fs/cifs/cifs_spnego.c | 39
-rw-r--r--  fs/cifs/cifs_spnego.h | 2
-rw-r--r--  fs/cifs/cifsencrypt.c | 1
-rw-r--r--  fs/cifs/cifsfs.c | 2
-rw-r--r--  fs/cifs/cifsfs.h | 2
-rw-r--r--  fs/cifs/cifsglob.h | 4
-rw-r--r--  fs/cifs/cifsproto.h | 4
-rw-r--r--  fs/cifs/cifssmb.c | 84
-rw-r--r--  fs/cifs/connect.c | 33
-rw-r--r--  fs/cifs/dns_resolve.c | 77
-rw-r--r--  fs/cifs/file.c | 134
-rw-r--r--  fs/cifs/inode.c | 648
-rw-r--r--  fs/cifs/misc.c | 8
-rw-r--r--  fs/cifs/readdir.c | 128
-rw-r--r--  fs/cifs/sess.c | 17
-rw-r--r--  fs/cifs/transport.c | 3
-rw-r--r--  fs/compat.c | 8
-rw-r--r--  fs/configfs/dir.c | 17
-rw-r--r--  fs/cramfs/inode.c | 84
-rw-r--r--  fs/dcache.c | 12
-rw-r--r--  fs/debugfs/inode.c | 3
-rw-r--r--  fs/devpts/inode.c | 68
-rw-r--r--  fs/dlm/config.c | 240
-rw-r--r--  fs/dlm/dlm_internal.h | 7
-rw-r--r--  fs/dlm/lockspace.c | 158
-rw-r--r--  fs/dlm/lockspace.h | 1
-rw-r--r--  fs/dlm/user.c | 134
-rw-r--r--  fs/dlm/user.h | 4
-rw-r--r--  fs/dquot.c | 6
-rw-r--r--  fs/ecryptfs/main.c | 2
-rw-r--r--  fs/efs/namei.c | 3
-rw-r--r--  fs/efs/super.c | 2
-rw-r--r--  fs/eventpoll.c | 5
-rw-r--r--  fs/exec.c | 2
-rw-r--r--  fs/ext2/ext2.h | 2
-rw-r--r--  fs/ext2/file.c | 1
-rw-r--r--  fs/ext2/inode.c | 8
-rw-r--r--  fs/ext2/super.c | 2
-rw-r--r--  fs/ext3/file.c | 1
-rw-r--r--  fs/ext3/inode.c | 8
-rw-r--r--  fs/ext3/super.c | 2
-rw-r--r--  fs/ext4/Makefile | 10
-rw-r--r--  fs/ext4/acl.h | 12
-rw-r--r--  fs/ext4/balloc.c | 1458
-rw-r--r--  fs/ext4/bitmap.c | 6
-rw-r--r--  fs/ext4/dir.c | 84
-rw-r--r--  fs/ext4/ext4.h | 137
-rw-r--r--  fs/ext4/ext4_extents.h | 19
-rw-r--r--  fs/ext4/ext4_i.h | 39
-rw-r--r--  fs/ext4/ext4_jbd2.h | 8
-rw-r--r--  fs/ext4/ext4_sb.h | 25
-rw-r--r--  fs/ext4/extents.c | 392
-rw-r--r--  fs/ext4/file.c | 10
-rw-r--r--  fs/ext4/fsync.c | 7
-rw-r--r--  fs/ext4/hash.c | 8
-rw-r--r--  fs/ext4/ialloc.c | 73
-rw-r--r--  fs/ext4/inode.c | 1064
-rw-r--r--  fs/ext4/ioctl.c | 96
-rw-r--r--  fs/ext4/mballoc.c | 273
-rw-r--r--  fs/ext4/mballoc.h | 1
-rw-r--r--  fs/ext4/migrate.c | 13
-rw-r--r--  fs/ext4/namei.c | 402
-rw-r--r--  fs/ext4/resize.c | 36
-rw-r--r--  fs/ext4/super.c | 318
-rw-r--r--  fs/ext4/symlink.c | 8
-rw-r--r--  fs/ext4/xattr.c | 14
-rw-r--r--  fs/ext4/xattr.h | 12
-rw-r--r--  fs/fat/fatent.c | 14
-rw-r--r--  fs/fat/inode.c | 16
-rw-r--r--  fs/fuse/inode.c | 2
-rw-r--r--  fs/gfs2/glock.c | 15
-rw-r--r--  fs/gfs2/glock.h | 1
-rw-r--r--  fs/gfs2/incore.h | 38
-rw-r--r--  fs/gfs2/inode.c | 159
-rw-r--r--  fs/gfs2/inode.h | 2
-rw-r--r--  fs/gfs2/locking/dlm/mount.c | 3
-rw-r--r--  fs/gfs2/log.c | 21
-rw-r--r--  fs/gfs2/mount.c | 9
-rw-r--r--  fs/gfs2/ops_address.c | 18
-rw-r--r--  fs/gfs2/ops_file.c | 16
-rw-r--r--  fs/gfs2/ops_fstype.c | 578
-rw-r--r--  fs/gfs2/ops_inode.c | 127
-rw-r--r--  fs/gfs2/ops_super.c | 108
-rw-r--r--  fs/gfs2/super.c | 340
-rw-r--r--  fs/gfs2/super.h | 6
-rw-r--r--  fs/gfs2/sys.c | 11
-rw-r--r--  fs/hfs/super.c | 2
-rw-r--r--  fs/hfsplus/options.c | 2
-rw-r--r--  fs/hpfs/super.c | 2
-rw-r--r--  fs/hugetlbfs/inode.c | 2
-rw-r--r--  fs/inode.c | 1
-rw-r--r--  fs/inotify_user.c | 27
-rw-r--r--  fs/ioctl.c | 277
-rw-r--r--  fs/ioprio.c | 8
-rw-r--r--  fs/isofs/inode.c | 2
-rw-r--r--  fs/jbd/transaction.c | 4
-rw-r--r--  fs/jbd2/checkpoint.c | 71
-rw-r--r--  fs/jbd2/commit.c | 32
-rw-r--r--  fs/jbd2/journal.c | 103
-rw-r--r--  fs/jbd2/recovery.c | 7
-rw-r--r--  fs/jbd2/transaction.c | 4
-rw-r--r--  fs/jffs2/jffs2_fs_i.h | 1
-rw-r--r--  fs/jfs/super.c | 2
-rw-r--r--  fs/lockd/Makefile | 2
-rw-r--r--  fs/lockd/clntlock.c | 13
-rw-r--r--  fs/lockd/grace.c | 59
-rw-r--r--  fs/lockd/host.c | 350
-rw-r--r--  fs/lockd/mon.c | 2
-rw-r--r--  fs/lockd/svc.c | 88
-rw-r--r--  fs/lockd/svc4proc.c | 35
-rw-r--r--  fs/lockd/svclock.c | 18
-rw-r--r--  fs/lockd/svcproc.c | 35
-rw-r--r--  fs/lockd/svcsubs.c | 2
-rw-r--r--  fs/lockd/xdr.c | 2
-rw-r--r--  fs/lockd/xdr4.c | 2
-rw-r--r--  fs/nfs/callback.c | 3
-rw-r--r--  fs/nfs/nfsroot.c | 2
-rw-r--r--  fs/nfs/super.c | 12
-rw-r--r--  fs/nfsd/export.c | 6
-rw-r--r--  fs/nfsd/lockd.c | 1
-rw-r--r--  fs/nfsd/nfs3proc.c | 8
-rw-r--r--  fs/nfsd/nfs4acl.c | 2
-rw-r--r--  fs/nfsd/nfs4callback.c | 7
-rw-r--r--  fs/nfsd/nfs4proc.c | 20
-rw-r--r--  fs/nfsd/nfs4state.c | 34
-rw-r--r--  fs/nfsd/nfs4xdr.c | 171
-rw-r--r--  fs/nfsd/nfsctl.c | 5
-rw-r--r--  fs/nfsd/nfsfh.c | 30
-rw-r--r--  fs/nfsd/nfsproc.c | 6
-rw-r--r--  fs/nfsd/nfssvc.c | 20
-rw-r--r--  fs/nfsd/vfs.c | 63
-rw-r--r--  fs/ntfs/namei.c | 89
-rw-r--r--  fs/ntfs/usnjrnl.h | 4
-rw-r--r--  fs/ocfs2/Makefile | 3
-rw-r--r--  fs/ocfs2/alloc.c | 922
-rw-r--r--  fs/ocfs2/alloc.h | 95
-rw-r--r--  fs/ocfs2/aops.c | 62
-rw-r--r--  fs/ocfs2/buffer_head_io.c | 134
-rw-r--r--  fs/ocfs2/buffer_head_io.h | 23
-rw-r--r--  fs/ocfs2/cluster/masklog.c | 1
-rw-r--r--  fs/ocfs2/cluster/masklog.h | 1
-rw-r--r--  fs/ocfs2/cluster/netdebug.c | 26
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 44
-rw-r--r--  fs/ocfs2/cluster/tcp_internal.h | 32
-rw-r--r--  fs/ocfs2/dir.c | 120
-rw-r--r--  fs/ocfs2/dlmglue.c | 9
-rw-r--r--  fs/ocfs2/extent_map.c | 386
-rw-r--r--  fs/ocfs2/extent_map.h | 7
-rw-r--r--  fs/ocfs2/file.c | 334
-rw-r--r--  fs/ocfs2/file.h | 32
-rw-r--r--  fs/ocfs2/inode.c | 87
-rw-r--r--  fs/ocfs2/inode.h | 6
-rw-r--r--  fs/ocfs2/ioctl.c | 3
-rw-r--r--  fs/ocfs2/journal.c | 112
-rw-r--r--  fs/ocfs2/journal.h | 52
-rw-r--r--  fs/ocfs2/localalloc.c | 384
-rw-r--r--  fs/ocfs2/localalloc.h | 4
-rw-r--r--  fs/ocfs2/locks.c | 15
-rw-r--r--  fs/ocfs2/locks.h | 1
-rw-r--r--  fs/ocfs2/namei.c | 101
-rw-r--r--  fs/ocfs2/ocfs2.h | 56
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 220
-rw-r--r--  fs/ocfs2/ocfs2_jbd_compat.h | 82
-rw-r--r--  fs/ocfs2/resize.c | 11
-rw-r--r--  fs/ocfs2/slot_map.c | 7
-rw-r--r--  fs/ocfs2/stack_user.c | 33
-rw-r--r--  fs/ocfs2/stackglue.c | 27
-rw-r--r--  fs/ocfs2/stackglue.h | 19
-rw-r--r--  fs/ocfs2/suballoc.c | 248
-rw-r--r--  fs/ocfs2/suballoc.h | 26
-rw-r--r--  fs/ocfs2/super.c | 64
-rw-r--r--  fs/ocfs2/symlink.c | 18
-rw-r--r--  fs/ocfs2/uptodate.c | 38
-rw-r--r--  fs/ocfs2/uptodate.h | 3
-rw-r--r--  fs/ocfs2/xattr.c | 4834
-rw-r--r--  fs/ocfs2/xattr.h | 68
-rw-r--r--  fs/omfs/bitmap.c | 5
-rw-r--r--  fs/omfs/file.c | 33
-rw-r--r--  fs/omfs/inode.c | 5
-rw-r--r--  fs/open.c | 3
-rw-r--r--  fs/partitions/check.c | 272
-rw-r--r--  fs/partitions/check.h | 4
-rw-r--r--  fs/proc/Kconfig | 10
-rw-r--r--  fs/proc/array.c | 66
-rw-r--r--  fs/proc/base.c | 21
-rw-r--r--  fs/proc/generic.c | 5
-rw-r--r--  fs/proc/inode.c | 2
-rw-r--r--  fs/proc/internal.h | 2
-rw-r--r--  fs/proc/nommu.c | 4
-rw-r--r--  fs/proc/proc_misc.c | 12
-rw-r--r--  fs/proc/proc_sysctl.c | 6
-rw-r--r--  fs/proc/task_mmu.c | 20
-rw-r--r--  fs/proc/task_nommu.c | 5
-rw-r--r--  fs/proc/vmcore.c | 6
-rw-r--r--  fs/ramfs/file-nommu.c | 2
-rw-r--r--  fs/readdir.c | 8
-rw-r--r--  fs/reiserfs/super.c | 1
-rw-r--r--  fs/seq_file.c | 25
-rw-r--r--  fs/splice.c | 3
-rw-r--r--  fs/ubifs/budget.c | 147
-rw-r--r--  fs/ubifs/commit.c | 3
-rw-r--r--  fs/ubifs/debug.c | 29
-rw-r--r--  fs/ubifs/debug.h | 143
-rw-r--r--  fs/ubifs/dir.c | 27
-rw-r--r--  fs/ubifs/file.c | 28
-rw-r--r--  fs/ubifs/find.c | 28
-rw-r--r--  fs/ubifs/gc.c | 20
-rw-r--r--  fs/ubifs/io.c | 14
-rw-r--r--  fs/ubifs/journal.c | 110
-rw-r--r--  fs/ubifs/log.c | 4
-rw-r--r--  fs/ubifs/misc.h | 63
-rw-r--r--  fs/ubifs/orphan.c | 4
-rw-r--r--  fs/ubifs/super.c | 75
-rw-r--r--  fs/ubifs/tnc.c | 116
-rw-r--r--  fs/ubifs/tnc_commit.c | 37
-rw-r--r--  fs/ubifs/ubifs-media.h | 6
-rw-r--r--  fs/ubifs/ubifs.h | 47
-rw-r--r--  fs/ubifs/xattr.c | 54
-rw-r--r--  fs/udf/file.c | 1
-rw-r--r--  fs/udf/ialloc.c | 44
-rw-r--r--  fs/udf/super.c | 2
-rw-r--r--  fs/ufs/super.c | 4
-rw-r--r--  fs/xfs/linux-2.6/sema.h | 52
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 7
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 19
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.h | 12
-rw-r--r--  fs/xfs/linux-2.6/xfs_export.c | 10
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_fs_subr.c | 6
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c | 4
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c | 192
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.h | 15
-rw-r--r--  fs/xfs/linux-2.6/xfs_linux.h | 6
-rw-r--r--  fs/xfs/linux-2.6/xfs_lrw.c | 6
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 211
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.h | 3
-rw-r--r--  fs/xfs/linux-2.6/xfs_vnode.c | 22
-rw-r--r--  fs/xfs/linux-2.6/xfs_vnode.h | 65
-rw-r--r--  fs/xfs/quota/xfs_dquot.c | 38
-rw-r--r--  fs/xfs/quota/xfs_dquot.h | 29
-rw-r--r--  fs/xfs/quota/xfs_dquot_item.c | 8
-rw-r--r--  fs/xfs/quota/xfs_qm.c | 14
-rw-r--r--  fs/xfs/quota/xfs_qm.h | 2
-rw-r--r--  fs/xfs/quota/xfs_qm_bhv.c | 7
-rw-r--r--  fs/xfs/quota/xfs_qm_syscalls.c | 4
-rw-r--r--  fs/xfs/xfs_acl.c | 52
-rw-r--r--  fs/xfs/xfs_acl.h | 14
-rw-r--r--  fs/xfs/xfs_arch.h | 68
-rw-r--r--  fs/xfs/xfs_attr.c | 110
-rw-r--r--  fs/xfs/xfs_attr.h | 1
-rw-r--r--  fs/xfs/xfs_attr_leaf.c | 75
-rw-r--r--  fs/xfs/xfs_attr_leaf.h | 2
-rw-r--r--  fs/xfs/xfs_bit.c | 103
-rw-r--r--  fs/xfs/xfs_bit.h | 34
-rw-r--r--  fs/xfs/xfs_bmap.c | 34
-rw-r--r--  fs/xfs/xfs_btree.c | 105
-rw-r--r--  fs/xfs/xfs_btree.h | 8
-rw-r--r--  fs/xfs/xfs_buf_item.c | 48
-rw-r--r--  fs/xfs/xfs_dfrag.c | 40
-rw-r--r--  fs/xfs/xfs_dmapi.h | 1
-rw-r--r--  fs/xfs/xfs_error.c | 5
-rw-r--r--  fs/xfs/xfs_error.h | 12
-rw-r--r--  fs/xfs/xfs_filestream.c | 2
-rw-r--r--  fs/xfs/xfs_ialloc_btree.c | 30
-rw-r--r--  fs/xfs/xfs_iget.c | 48
-rw-r--r--  fs/xfs/xfs_inode.c | 164
-rw-r--r--  fs/xfs/xfs_inode.h | 46
-rw-r--r--  fs/xfs/xfs_inode_item.c | 11
-rw-r--r--  fs/xfs/xfs_itable.c | 4
-rw-r--r--  fs/xfs/xfs_log.c | 151
-rw-r--r--  fs/xfs/xfs_log.h | 2
-rw-r--r--  fs/xfs/xfs_log_priv.h | 15
-rw-r--r--  fs/xfs/xfs_log_recover.c | 7
-rw-r--r--  fs/xfs/xfs_mount.c | 82
-rw-r--r--  fs/xfs/xfs_mount.h | 17
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 19
-rw-r--r--  fs/xfs/xfs_rw.c | 2
-rw-r--r--  fs/xfs/xfs_trans.c | 75
-rw-r--r--  fs/xfs/xfs_trans.h | 12
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 12
-rw-r--r--  fs/xfs/xfs_trans_item.c | 66
-rw-r--r--  fs/xfs/xfs_utils.c | 4
-rw-r--r--  fs/xfs/xfs_utils.h | 3
-rw-r--r--  fs/xfs/xfs_vfsops.c | 13
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 224
310 files changed, 14956 insertions, 8192 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 047c791427aa..c061c3f18e7c 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -55,7 +55,7 @@ enum {
 	Opt_err
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_debug, "debug=%x"},
 	{Opt_dfltuid, "dfltuid=%u"},
 	{Opt_dfltgid, "dfltgid=%u"},
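For context, match_token() treats its table argument as read-only, which is what allows these tables to become const. A minimal sketch of the option-parsing loop such tables feed, using the <linux/parser.h> helpers (v9fs_parse_one is a hypothetical name, not code from this patch):

	static int v9fs_parse_one(char *p, int *debug_level)
	{
		substring_t args[MAX_OPT_ARGS];
		int option;

		switch (match_token(p, tokens, args)) {
		case Opt_debug:
			/* "debug=%x" captured its hex argument in args[0] */
			if (match_hex(&args[0], &option))
				return -EINVAL;
			*debug_level = option;
			break;
		default:
			break;
		}
		return 0;
	}

The same constification is applied to every match_table_t further down (adfs, affs, afs, autofs, autofs4, befs, ...).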
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 88e3787c6ea9..e298fe194093 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -119,6 +119,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
 
 const struct file_operations v9fs_dir_operations = {
 	.read = generic_read_dir,
+	.llseek = generic_file_llseek,
 	.readdir = v9fs_dir_readdir,
 	.open = v9fs_file_open,
 	.release = v9fs_dir_release,
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index c95295c65045..e83aa5ebe861 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -626,8 +626,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	return NULL;
 
 error:
-	if (fid)
-		p9_client_clunk(fid);
+	p9_client_clunk(fid);
 
 	return ERR_PTR(result);
 }
diff --git a/fs/Kconfig b/fs/Kconfig
index d3873583360b..9e9d70c02a07 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -136,37 +136,51 @@ config EXT3_FS_SECURITY
 	  If you are not using a security module that requires using
 	  extended attributes for file security labels, say N.
 
-config EXT4DEV_FS
-	tristate "Ext4dev/ext4 extended fs support development (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+config EXT4_FS
+	tristate "The Extended 4 (ext4) filesystem"
 	select JBD2
 	select CRC16
 	help
-	  Ext4dev is a predecessor filesystem of the next generation
-	  extended fs ext4, based on ext3 filesystem code. It will be
-	  renamed ext4 fs later, once ext4dev is mature and stabilized.
+	  This is the next generation of the ext3 filesystem.
 
 	  Unlike the change from ext2 filesystem to ext3 filesystem,
-	  the on-disk format of ext4dev is not the same as ext3 any more:
-	  it is based on extent maps and it supports 48-bit physical block
-	  numbers. These combined on-disk format changes will allow
-	  ext4dev/ext4 to handle more than 16 TB filesystem volumes --
-	  a hard limit that ext3 cannot overcome without changing the
-	  on-disk format.
-
-	  Other than extent maps and 48-bit block numbers, ext4dev also is
-	  likely to have other new features such as persistent preallocation,
-	  high resolution time stamps, and larger file support etc. These
-	  features will be added to ext4dev gradually.
+	  the on-disk format of ext4 is not forwards compatible with
+	  ext3; it is based on extent maps and it supports 48-bit
+	  physical block numbers. The ext4 filesystem also supports delayed
+	  allocation, persistent preallocation, high resolution time stamps,
+	  and a number of other features to improve performance and speed
+	  up fsck time. For more information, please see the web pages at
+	  http://ext4.wiki.kernel.org.
+
+	  The ext4 filesystem will support mounting an ext3
+	  filesystem; while there will be some performance gains from
+	  the delayed allocation and inode table readahead, the best
+	  performance gains will require enabling ext4 features in the
+	  filesystem, or formating a new filesystem as an ext4
+	  filesystem initially.
 
 	  To compile this file system support as a module, choose M here. The
 	  module will be called ext4dev.
 
 	  If unsure, say N.
 
-config EXT4DEV_FS_XATTR
-	bool "Ext4dev extended attributes"
-	depends on EXT4DEV_FS
+config EXT4DEV_COMPAT
+	bool "Enable ext4dev compatibility"
+	depends on EXT4_FS
+	help
+	  Starting with 2.6.28, the name of the ext4 filesystem was
+	  renamed from ext4dev to ext4. Unfortunately there are some
+	  legacy userspace programs (such as klibc's fstype) have
+	  "ext4dev" hardcoded.
+
+	  To enable backwards compatibility so that systems that are
+	  still expecting to mount ext4 filesystems using ext4dev,
+	  chose Y here. This feature will go away by 2.6.31, so
+	  please arrange to get your userspace programs fixed!
+
+config EXT4_FS_XATTR
+	bool "Ext4 extended attributes"
+	depends on EXT4_FS
 	default y
 	help
 	  Extended attributes are name:value pairs associated with inodes by
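On the code side, EXT4DEV_COMPAT implies the ext4 module also registers the legacy "ext4dev" filesystem name so old userspace keeps mounting. A hedged sketch of how such an alias is typically wired up; the shared get_sb helper name is an assumption, not taken from this patch:

	#ifdef CONFIG_EXT4DEV_COMPAT
	static struct file_system_type ext4dev_fs_type = {
		.owner		= THIS_MODULE,
		.name		= "ext4dev",	/* legacy name */
		.get_sb		= ext4_get_sb,	/* assumed: same mount path as "ext4" */
		.kill_sb	= kill_block_super,
		.fs_flags	= FS_REQUIRES_DEV,
	};
	MODULE_ALIAS("ext4dev");
	#endif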
@@ -175,11 +189,11 @@ config EXT4DEV_FS_XATTR
 
 	  If unsure, say N.
 
-	  You need this for POSIX ACL support on ext4dev/ext4.
+	  You need this for POSIX ACL support on ext4.
 
-config EXT4DEV_FS_POSIX_ACL
-	bool "Ext4dev POSIX Access Control Lists"
-	depends on EXT4DEV_FS_XATTR
+config EXT4_FS_POSIX_ACL
+	bool "Ext4 POSIX Access Control Lists"
+	depends on EXT4_FS_XATTR
 	select FS_POSIX_ACL
 	help
 	  POSIX Access Control Lists (ACLs) support permissions for users and
@@ -190,14 +204,14 @@ config EXT4DEV_FS_POSIX_ACL
 
 	  If you don't know what Access Control Lists are, say N
 
-config EXT4DEV_FS_SECURITY
-	bool "Ext4dev Security Labels"
-	depends on EXT4DEV_FS_XATTR
+config EXT4_FS_SECURITY
+	bool "Ext4 Security Labels"
+	depends on EXT4_FS_XATTR
 	help
 	  Security labels support alternative access control models
 	  implemented by security modules like SELinux. This option
 	  enables an extended attribute handler for file security
-	  labels in the ext4dev/ext4 filesystem.
+	  labels in the ext4 filesystem.
 
 	  If you are not using a security module that requires using
 	  extended attributes for file security labels, say N.
@@ -206,17 +220,16 @@ config JBD
 	tristate
 	help
 	  This is a generic journalling layer for block devices. It is
-	  currently used by the ext3 and OCFS2 file systems, but it could
-	  also be used to add journal support to other file systems or block
+	  currently used by the ext3 file system, but it could also be
+	  used to add journal support to other file systems or block
 	  devices such as RAID or LVM.
 
-	  If you are using the ext3 or OCFS2 file systems, you need to
-	  say Y here. If you are not using ext3 OCFS2 then you will probably
-	  want to say N.
+	  If you are using the ext3 file system, you need to say Y here.
+	  If you are not using ext3 then you will probably want to say N.
 
 	  To compile this device as a module, choose M here: the module will be
-	  called jbd. If you are compiling ext3 or OCFS2 into the kernel,
-	  you cannot compile this code as a module.
+	  called jbd. If you are compiling ext3 into the kernel, you
+	  cannot compile this code as a module.
 
 config JBD_DEBUG
 	bool "JBD (ext3) debugging support"
@@ -240,22 +253,23 @@ config JBD2
 	help
 	  This is a generic journaling layer for block devices that support
 	  both 32-bit and 64-bit block numbers. It is currently used by
-	  the ext4dev/ext4 filesystem, but it could also be used to add
+	  the ext4 and OCFS2 filesystems, but it could also be used to add
 	  journal support to other file systems or block devices such
 	  as RAID or LVM.
 
-	  If you are using ext4dev/ext4, you need to say Y here. If you are not
-	  using ext4dev/ext4 then you will probably want to say N.
+	  If you are using ext4 or OCFS2, you need to say Y here.
+	  If you are not using ext4 or OCFS2 then you will
+	  probably want to say N.
 
 	  To compile this device as a module, choose M here. The module will be
-	  called jbd2. If you are compiling ext4dev/ext4 into the kernel,
+	  called jbd2. If you are compiling ext4 or OCFS2 into the kernel,
 	  you cannot compile this code as a module.
 
 config JBD2_DEBUG
-	bool "JBD2 (ext4dev/ext4) debugging support"
+	bool "JBD2 (ext4) debugging support"
 	depends on JBD2 && DEBUG_FS
 	help
-	  If you are using the ext4dev/ext4 journaled file system (or
+	  If you are using the ext4 journaled file system (or
 	  potentially any other filesystem/device using JBD2), this option
 	  allows you to enable debugging output while the system is running,
 	  in order to help track down any problems you are having.
@@ -270,9 +284,9 @@ config JBD2_DEBUG
 config FS_MBCACHE
 # Meta block cache for Extended Attributes (ext2/ext3/ext4)
 	tristate
-	depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4DEV_FS_XATTR
-	default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y
-	default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m
+	depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR
+	default y if EXT2_FS=y || EXT3_FS=y || EXT4_FS=y
+	default m if EXT2_FS=m || EXT3_FS=m || EXT4_FS=m
 
 config REISERFS_FS
 	tristate "Reiserfs support"
@@ -419,6 +433,14 @@ config FS_POSIX_ACL
 	bool
 	default n
 
+config FILE_LOCKING
+	bool "Enable POSIX file locking API" if EMBEDDED
+	default y
+	help
+	  This option enables standard file locking support, required
+	  for filesystems like NFS and for the flock() system
+	  call. Disabling this option saves about 11k.
+
 source "fs/xfs/Kconfig"
 source "fs/gfs2/Kconfig"
 
@@ -426,7 +448,7 @@ config OCFS2_FS
 	tristate "OCFS2 file system support"
 	depends on NET && SYSFS
 	select CONFIGFS_FS
-	select JBD
+	select JBD2
 	select CRC32
 	help
 	  OCFS2 is a general purpose extent based shared disk cluster file
@@ -497,6 +519,16 @@ config OCFS2_DEBUG_FS
 	  this option for debugging only as it is likely to decrease
 	  performance of the filesystem.
 
+config OCFS2_COMPAT_JBD
+	bool "Use JBD for compatibility"
+	depends on OCFS2_FS
+	default n
+	select JBD
+	help
+	  The ocfs2 filesystem now uses JBD2 for its journalling. JBD2
+	  is backwards compatible with JBD. It is safe to say N here.
+	  However, if you really want to use the original JBD, say Y here.
+
 endif # BLOCK
 
 config DNOTIFY
@@ -1765,6 +1797,28 @@ config SUNRPC_XPRT_RDMA
 
 	  If unsure, say N.
 
+config SUNRPC_REGISTER_V4
+	bool "Register local RPC services via rpcbind v4 (EXPERIMENTAL)"
+	depends on SUNRPC && EXPERIMENTAL
+	default n
+	help
+	  Sun added support for registering RPC services at an IPv6
+	  address by creating two new versions of the rpcbind protocol
+	  (RFC 1833).
+
+	  This option enables support in the kernel RPC server for
+	  registering kernel RPC services via version 4 of the rpcbind
+	  protocol. If you enable this option, you must run a portmapper
+	  daemon that supports rpcbind protocol version 4.
+
+	  Serving NFS over IPv6 from knfsd (the kernel's NFS server)
+	  requires that you enable this option and use a portmapper that
+	  supports rpcbind version 4.
+
+	  If unsure, say N to get traditional behavior (register kernel
+	  RPC services using only rpcbind version 2). Distributions
+	  using the legacy Linux portmapper daemon must say N here.
+
 config RPCSEC_GSS_KRB5
 	tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)"
 	depends on SUNRPC && EXPERIMENTAL
@@ -1930,6 +1984,16 @@ config CIFS_WEAK_PW_HASH
 
 	  If unsure, say N.
 
+config CIFS_UPCALL
+	bool "Kerberos/SPNEGO advanced session setup"
+	depends on CIFS && KEYS
+	help
+	  Enables an upcall mechanism for CIFS which accesses
+	  userspace helper utilities to provide SPNEGO packaged (RFC 4178)
+	  Kerberos tickets which are needed to mount to certain secure servers
+	  (for which more secure Kerberos authentication is required). If
+	  unsure, say N.
+
 config CIFS_XATTR
 	bool "CIFS extended attributes"
 	depends on CIFS
@@ -1982,17 +2046,6 @@ config CIFS_EXPERIMENTAL
 	  (which is disabled by default). See the file fs/cifs/README
 	  for more details. If unsure, say N.
 
-config CIFS_UPCALL
-	bool "Kerberos/SPNEGO advanced session setup (EXPERIMENTAL)"
-	depends on CIFS_EXPERIMENTAL
-	depends on KEYS
-	help
-	  Enables an upcall mechanism for CIFS which accesses
-	  userspace helper utilities to provide SPNEGO packaged (RFC 4178)
-	  Kerberos tickets which are needed to mount to certain secure servers
-	  (for which more secure Kerberos authentication is required). If
-	  unsure, say N.
-
 config CIFS_DFS_UPCALL
 	bool "DFS feature support (EXPERIMENTAL)"
 	depends on CIFS_EXPERIMENTAL
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 4a551af6f3fc..17c9c5ec14c5 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -59,10 +59,12 @@ config BINFMT_SHARED_FLAT
 	help
 	  Support FLAT shared libraries
 
+config HAVE_AOUT
+	def_bool n
+
 config BINFMT_AOUT
 	tristate "Kernel support for a.out and ECOFF binaries"
-	depends on ARCH_SUPPORTS_AOUT && \
-		(X86_32 || ALPHA || ARM || M68K)
+	depends on HAVE_AOUT
 	---help---
 	  A.out (Assembler.OUTput) is a set of formats for libraries and
 	  executables used in the earliest versions of UNIX. Linux used
diff --git a/fs/Makefile b/fs/Makefile
index a1482a5eff15..b6f27dc26b72 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -7,7 +7,7 @@
 
 obj-y := open.o read_write.o file_table.o super.o \
 	char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
-	ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
+	ioctl.o readdir.o select.o fifo.o dcache.o inode.o \
 	attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
 	seq_file.o xattr.o libfs.o fs-writeback.o \
 	pnode.o drop_caches.o splice.o sync.o utimes.o \
@@ -27,6 +27,7 @@ obj-$(CONFIG_ANON_INODES) += anon_inodes.o
 obj-$(CONFIG_SIGNALFD) += signalfd.o
 obj-$(CONFIG_TIMERFD) += timerfd.o
 obj-$(CONFIG_EVENTFD) += eventfd.o
+obj-$(CONFIG_FILE_LOCKING) += locks.o
 obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
 
 nfsd-$(CONFIG_NFSD) := nfsctl.o
@@ -69,7 +70,7 @@ obj-$(CONFIG_DLM) += dlm/
 # Do not add any filesystems before this line
 obj-$(CONFIG_REISERFS_FS) += reiserfs/
 obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
-obj-$(CONFIG_EXT4DEV_FS) += ext4/ # Before ext2 so root fs can be ext4dev
+obj-$(CONFIG_EXT4_FS) += ext4/ # Before ext2 so root fs can be ext4dev
 obj-$(CONFIG_JBD) += jbd/
 obj-$(CONFIG_JBD2) += jbd2/
 obj-$(CONFIG_EXT2_FS) += ext2/
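Building locks.o conditionally only works because the locking entry points collapse to inline stubs when CONFIG_FILE_LOCKING is off. A sketch of that header pattern with one representative function (the stub's return value is an assumption):

	#ifdef CONFIG_FILE_LOCKING
	extern int vfs_lock_file(struct file *, unsigned int,
				 struct file_lock *, struct file_lock *);
	#else
	static inline int vfs_lock_file(struct file *filp, unsigned int cmd,
					struct file_lock *fl,
					struct file_lock *conf)
	{
		return -ENOLCK;	/* assumed stub: locking compiled out */
	}
	#endif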
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index fc1a8dc64d78..85a30e929800 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -197,6 +197,7 @@ out:
 
 const struct file_operations adfs_dir_operations = {
 	.read = generic_read_dir,
+	.llseek = generic_file_llseek,
 	.readdir = adfs_readdir,
 	.fsync = file_fsync,
 };
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 26f3b43726bb..7f83a46f2b7e 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -157,7 +157,7 @@ static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
 
 enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_err};
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_uid, "uid=%u"},
 	{Opt_gid, "gid=%u"},
 	{Opt_ownmask, "ownmask=%o"},
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index 6e3f282424b0..7b36904dbeac 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -19,6 +19,7 @@ static int affs_readdir(struct file *, void *, filldir_t);
 
 const struct file_operations affs_dir_operations = {
 	.read = generic_read_dir,
+	.llseek = generic_file_llseek,
 	.readdir = affs_readdir,
 	.fsync = file_fsync,
 };
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 3a89094f93d0..8989c93193ed 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -135,7 +135,7 @@ enum {
 	Opt_verbose, Opt_volume, Opt_ignore, Opt_err,
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_bs, "bs=%u"},
 	{Opt_mode, "mode=%o"},
 	{Opt_mufs, "mufs"},
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 250d8c4d66e4..aee239a048cb 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -64,7 +64,7 @@ enum {
 	afs_opt_vol,
 };
 
-static match_table_t afs_options_list = {
+static const match_table_t afs_options_list = {
 	{ afs_opt_cell, "cell=%s" },
 	{ afs_opt_rwpath, "rwpath" },
 	{ afs_opt_vol, "vol=%s" },
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index dda510d31f84..b70eea1e8c59 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -59,7 +59,7 @@ static const struct super_operations autofs_sops = {
 
 enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto};
 
-static match_table_t autofs_tokens = {
+static const match_table_t autofs_tokens = {
 	{Opt_fd, "fd=%u"},
 	{Opt_uid, "uid=%u"},
 	{Opt_gid, "gid=%u"},
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 7bb3e5ba0537..45d55819203d 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -213,7 +213,7 @@ static const struct super_operations autofs4_sops = {
 enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto,
 	Opt_indirect, Opt_direct, Opt_offset};
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_fd, "fd=%u"},
 	{Opt_uid, "uid=%u"},
 	{Opt_gid, "gid=%u"},
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index bcfb2dc0a61b..2a41c2a7fc52 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -36,6 +36,7 @@ const struct file_operations autofs4_root_operations = {
 	.release = dcache_dir_close,
 	.read = generic_read_dir,
 	.readdir = dcache_readdir,
+	.llseek = dcache_dir_lseek,
 	.ioctl = autofs4_root_ioctl,
 };
 
@@ -44,6 +45,7 @@ const struct file_operations autofs4_dir_operations = {
 	.release = dcache_dir_close,
 	.read = generic_read_dir,
 	.readdir = dcache_readdir,
+	.llseek = dcache_dir_lseek,
 };
 
 const struct inode_operations autofs4_indirect_root_inode_operations = {
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 02c6e62b72f8..9286b2af893a 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -66,6 +66,7 @@ static struct kmem_cache *befs_inode_cachep;
 static const struct file_operations befs_dir_operations = {
 	.read = generic_read_dir,
 	.readdir = befs_readdir,
+	.llseek = generic_file_llseek,
 };
 
 static const struct inode_operations befs_dir_inode_operations = {
@@ -649,7 +650,7 @@ enum {
 	Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err,
 };
 
-static match_table_t befs_tokens = {
+static const match_table_t befs_tokens = {
 	{Opt_uid, "uid=%d"},
 	{Opt_gid, "gid=%d"},
 	{Opt_charset, "iocharset=%s"},
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 87ee5ccee348..ed8feb052df9 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -125,8 +125,8 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
 			inode->i_ino);
 	if (err) {
 		inode_dec_link_count(inode);
-		iput(inode);
 		mutex_unlock(&info->bfs_lock);
+		iput(inode);
 		return err;
 	}
 	mutex_unlock(&info->bfs_lock);
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 56372ecf1690..dfc0197905ca 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -914,7 +914,9 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	/* Stash our initial stack pointer into the mm structure */
 	current->mm->start_stack = (unsigned long )sp;
 
-
+#ifdef FLAT_PLAT_INIT
+	FLAT_PLAT_INIT(regs);
+#endif
 	DBG_FLT("start_thread(regs=0x%x, entry=0x%x, start_stack=0x%x)\n",
 		(int)regs, (int)start_addr, (int)current->mm->start_stack);
 
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 756205314c24..8d7e88e02e0f 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -120,8 +120,6 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	if (bprm->misc_bang)
 		goto _ret;
 
-	bprm->misc_bang = 1;
-
 	/* to keep locking time low, we copy the interpreter string */
 	read_lock(&entries_lock);
 	fmt = check_file(bprm);
@@ -199,6 +197,8 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	if (retval < 0)
 		goto _error;
 
+	bprm->misc_bang = 1;
+
 	retval = search_binary_handler (bprm, regs);
 	if (retval < 0)
 		goto _error;
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index c3e174b35fe6..19caf7c962ac 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -107,7 +107,8 @@ void bio_integrity_free(struct bio *bio, struct bio_set *bs)
 	BUG_ON(bip == NULL);
 
 	/* A cloned bio doesn't own the integrity metadata */
-	if (!bio_flagged(bio, BIO_CLONED) && bip->bip_buf != NULL)
+	if (!bio_flagged(bio, BIO_CLONED) && !bio_flagged(bio, BIO_FS_INTEGRITY)
+	    && bip->bip_buf != NULL)
 		kfree(bip->bip_buf);
 
 	mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]);
@@ -150,6 +151,24 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 }
 EXPORT_SYMBOL(bio_integrity_add_page);
 
+static int bdev_integrity_enabled(struct block_device *bdev, int rw)
+{
+	struct blk_integrity *bi = bdev_get_integrity(bdev);
+
+	if (bi == NULL)
+		return 0;
+
+	if (rw == READ && bi->verify_fn != NULL &&
+	    (bi->flags & INTEGRITY_FLAG_READ))
+		return 1;
+
+	if (rw == WRITE && bi->generate_fn != NULL &&
+	    (bi->flags & INTEGRITY_FLAG_WRITE))
+		return 1;
+
+	return 0;
+}
+
 /**
  * bio_integrity_enabled - Check whether integrity can be passed
  * @bio: bio to check
@@ -313,6 +332,14 @@ static void bio_integrity_generate(struct bio *bio)
 	}
 }
 
+static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi)
+{
+	if (bi)
+		return bi->tuple_size;
+
+	return 0;
+}
+
 /**
  * bio_integrity_prep - Prepare bio for integrity I/O
  * @bio: bio to prepare
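Both helpers back the submit path's integrity hook. A sketch of that call site, shaped like the check in generic_make_request() (the nonzero-on-failure convention for bio_integrity_prep() is inferred from its callers):

	/* Attach/verify integrity metadata only when the target device
	 * advertises support for this I/O direction. */
	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
		bio_endio(bio, -EIO);
		return;
	}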
diff --git a/fs/bio.c b/fs/bio.c
index 8000e2fa16cb..77a55bcceedb 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -30,7 +30,7 @@
 
 static struct kmem_cache *bio_slab __read_mostly;
 
-mempool_t *bio_split_pool __read_mostly;
+static mempool_t *bio_split_pool __read_mostly;
 
 /*
  * if you change this list, also change bvec_alloc or things will
@@ -60,25 +60,46 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct
 	struct bio_vec *bvl;
 
 	/*
-	 * see comment near bvec_array define!
+	 * If 'bs' is given, lookup the pool and do the mempool alloc.
+	 * If not, this is a bio_kmalloc() allocation and just do a
+	 * kzalloc() for the exact number of vecs right away.
 	 */
-	switch (nr) {
-		case   1        : *idx = 0; break;
-		case   2 ...   4: *idx = 1; break;
-		case   5 ...  16: *idx = 2; break;
-		case  17 ...  64: *idx = 3; break;
-		case  65 ... 128: *idx = 4; break;
-		case 129 ... BIO_MAX_PAGES: *idx = 5; break;
+	if (bs) {
+		/*
+		 * see comment near bvec_array define!
+		 */
+		switch (nr) {
+		case 1:
+			*idx = 0;
+			break;
+		case 2 ... 4:
+			*idx = 1;
+			break;
+		case 5 ... 16:
+			*idx = 2;
+			break;
+		case 17 ... 64:
+			*idx = 3;
+			break;
+		case 65 ... 128:
+			*idx = 4;
+			break;
+		case 129 ... BIO_MAX_PAGES:
+			*idx = 5;
+			break;
 		default:
 			return NULL;
-	}
-	/*
-	 * idx now points to the pool we want to allocate from
-	 */
+		}
 
-	bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask);
-	if (bvl)
-		memset(bvl, 0, bvec_nr_vecs(*idx) * sizeof(struct bio_vec));
+		/*
+		 * idx now points to the pool we want to allocate from
+		 */
+		bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask);
+		if (bvl)
+			memset(bvl, 0,
+				bvec_nr_vecs(*idx) * sizeof(struct bio_vec));
+	} else
+		bvl = kzalloc(nr * sizeof(struct bio_vec), gfp_mask);
 
 	return bvl;
 }
@@ -107,10 +128,17 @@ static void bio_fs_destructor(struct bio *bio)
 	bio_free(bio, fs_bio_set);
 }
 
+static void bio_kmalloc_destructor(struct bio *bio)
+{
+	kfree(bio->bi_io_vec);
+	kfree(bio);
+}
+
 void bio_init(struct bio *bio)
 {
 	memset(bio, 0, sizeof(*bio));
 	bio->bi_flags = 1 << BIO_UPTODATE;
+	bio->bi_comp_cpu = -1;
 	atomic_set(&bio->bi_cnt, 1);
 }
 
@@ -118,19 +146,25 @@ void bio_init(struct bio *bio)
  * bio_alloc_bioset - allocate a bio for I/O
  * @gfp_mask: the GFP_ mask given to the slab allocator
  * @nr_iovecs: number of iovecs to pre-allocate
- * @bs: the bio_set to allocate from
+ * @bs: the bio_set to allocate from. If %NULL, just use kmalloc
  *
  * Description:
- *   bio_alloc_bioset will first try it's on mempool to satisfy the allocation.
+ *   bio_alloc_bioset will first try its own mempool to satisfy the allocation.
  *   If %__GFP_WAIT is set then we will block on the internal pool waiting
- *   for a &struct bio to become free.
+ *   for a &struct bio to become free. If a %NULL @bs is passed in, we will
+ *   fall back to just using @kmalloc to allocate the required memory.
  *
  *   allocate bio and iovecs from the memory pools specified by the
- *   bio_set structure.
+ *   bio_set structure, or @kmalloc if none given.
  **/
 struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 {
-	struct bio *bio = mempool_alloc(bs->bio_pool, gfp_mask);
+	struct bio *bio;
+
+	if (bs)
+		bio = mempool_alloc(bs->bio_pool, gfp_mask);
+	else
+		bio = kmalloc(sizeof(*bio), gfp_mask);
 
 	if (likely(bio)) {
 		struct bio_vec *bvl = NULL;
@@ -141,7 +175,10 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 
 		bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
 		if (unlikely(!bvl)) {
-			mempool_free(bio, bs->bio_pool);
+			if (bs)
+				mempool_free(bio, bs->bio_pool);
+			else
+				kfree(bio);
 			bio = NULL;
 			goto out;
 		}
@@ -164,6 +201,23 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
 	return bio;
 }
 
+/*
+ * Like bio_alloc(), but doesn't use a mempool backing. This means that
+ * it CAN fail, but while bio_alloc() can only be used for allocations
+ * that have a short (finite) life span, bio_kmalloc() should be used
+ * for more permanent bio allocations (like allocating some bio's for
+ * initalization or setup purposes).
+ */
+struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
+{
+	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
+
+	if (bio)
+		bio->bi_destructor = bio_kmalloc_destructor;
+
+	return bio;
+}
+
 void zero_fill_bio(struct bio *bio)
 {
 	unsigned long flags;
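A usage sketch for the new allocator (setup_bio is a hypothetical caller): unlike bio_alloc(), bio_kmalloc() has no mempool behind it and can return NULL even for blocking gfp masks, so the error check is mandatory; the destructor installed above then frees both the bio and its vec array on the final bio_put():

	static struct bio *setup_bio(struct block_device *bdev, sector_t sector)
	{
		struct bio *bio = bio_kmalloc(GFP_KERNEL, 1);

		if (!bio)
			return NULL;	/* no mempool to fall back on */
		bio->bi_bdev = bdev;
		bio->bi_sector = sector;
		return bio;	/* caller adds pages, submits, bio_put()s */
	}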
@@ -208,14 +262,6 @@ inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
 	return bio->bi_phys_segments;
 }
 
-inline int bio_hw_segments(struct request_queue *q, struct bio *bio)
-{
-	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
-		blk_recount_segments(q, bio);
-
-	return bio->bi_hw_segments;
-}
-
 /**
  * __bio_clone - clone a bio
  * @bio: destination bio
@@ -350,8 +396,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 		 */
 
 		while (bio->bi_phys_segments >= q->max_phys_segments
-		       || bio->bi_hw_segments >= q->max_hw_segments
-		       || BIOVEC_VIRT_OVERSIZE(bio->bi_size)) {
+		       || bio->bi_phys_segments >= q->max_hw_segments) {
 
 			if (retried_segments)
 				return 0;
@@ -395,13 +440,11 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 	}
 
 	/* If we may be able to merge these biovecs, force a recount */
-	if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec) ||
-	    BIOVEC_VIRT_MERGEABLE(bvec-1, bvec)))
+	if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
 		bio->bi_flags &= ~(1 << BIO_SEG_VALID);
 
 	bio->bi_vcnt++;
 	bio->bi_phys_segments++;
-	bio->bi_hw_segments++;
  done:
 	bio->bi_size += len;
 	return len;
@@ -449,16 +492,19 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
 
 struct bio_map_data {
 	struct bio_vec *iovecs;
-	int nr_sgvecs;
 	struct sg_iovec *sgvecs;
+	int nr_sgvecs;
+	int is_our_pages;
 };
 
 static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
-			     struct sg_iovec *iov, int iov_count)
+			     struct sg_iovec *iov, int iov_count,
+			     int is_our_pages)
 {
 	memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt);
 	memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
 	bmd->nr_sgvecs = iov_count;
+	bmd->is_our_pages = is_our_pages;
 	bio->bi_private = bmd;
 }
 
@@ -469,20 +515,21 @@ static void bio_free_map_data(struct bio_map_data *bmd)
 	kfree(bmd);
 }
 
-static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count)
+static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
+					       gfp_t gfp_mask)
 {
-	struct bio_map_data *bmd = kmalloc(sizeof(*bmd), GFP_KERNEL);
+	struct bio_map_data *bmd = kmalloc(sizeof(*bmd), gfp_mask);
 
 	if (!bmd)
 		return NULL;
 
-	bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, GFP_KERNEL);
+	bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, gfp_mask);
 	if (!bmd->iovecs) {
 		kfree(bmd);
 		return NULL;
 	}
 
-	bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, GFP_KERNEL);
+	bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, gfp_mask);
 	if (bmd->sgvecs)
 		return bmd;
 
@@ -491,8 +538,9 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count)
 	return NULL;
 }
 
-static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count,
-			  int uncopy)
+static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
+			  struct sg_iovec *iov, int iov_count, int uncopy,
+			  int do_free_page)
 {
 	int ret = 0, i;
 	struct bio_vec *bvec;
@@ -502,7 +550,7 @@ static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count,
 
 	__bio_for_each_segment(bvec, bio, i, 0) {
 		char *bv_addr = page_address(bvec->bv_page);
-		unsigned int bv_len = bvec->bv_len;
+		unsigned int bv_len = iovecs[i].bv_len;
 
 		while (bv_len && iov_idx < iov_count) {
 			unsigned int bytes;
@@ -535,7 +583,7 @@ static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count,
 			}
 		}
 
-		if (uncopy)
+		if (do_free_page)
 			__free_page(bvec->bv_page);
 	}
 
@@ -552,10 +600,11 @@ static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count,
 int bio_uncopy_user(struct bio *bio)
 {
 	struct bio_map_data *bmd = bio->bi_private;
-	int ret;
-
-	ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs, 1);
+	int ret = 0;
 
+	if (!bio_flagged(bio, BIO_NULL_MAPPED))
+		ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs,
+				     bmd->nr_sgvecs, 1, bmd->is_our_pages);
 	bio_free_map_data(bmd);
 	bio_put(bio);
 	return ret;
@@ -564,16 +613,20 @@ int bio_uncopy_user(struct bio *bio)
 /**
  * bio_copy_user_iov - copy user data to bio
  * @q: destination block queue
+ * @map_data: pointer to the rq_map_data holding pages (if necessary)
  * @iov: the iovec.
  * @iov_count: number of elements in the iovec
  * @write_to_vm: bool indicating writing to pages or not
+ * @gfp_mask: memory allocation flags
  *
  * Prepares and returns a bio for indirect user io, bouncing data
  * to/from kernel pages as necessary. Must be paired with
  * call bio_uncopy_user() on io completion.
  */
-struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
-			      int iov_count, int write_to_vm)
+struct bio *bio_copy_user_iov(struct request_queue *q,
+			      struct rq_map_data *map_data,
+			      struct sg_iovec *iov, int iov_count,
+			      int write_to_vm, gfp_t gfp_mask)
 {
 	struct bio_map_data *bmd;
 	struct bio_vec *bvec;
@@ -596,25 +649,38 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
 		len += iov[i].iov_len;
 	}
 
-	bmd = bio_alloc_map_data(nr_pages, iov_count);
+	bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask);
 	if (!bmd)
 		return ERR_PTR(-ENOMEM);
 
 	ret = -ENOMEM;
-	bio = bio_alloc(GFP_KERNEL, nr_pages);
+	bio = bio_alloc(gfp_mask, nr_pages);
 	if (!bio)
 		goto out_bmd;
 
 	bio->bi_rw |= (!write_to_vm << BIO_RW);
 
 	ret = 0;
+	i = 0;
 	while (len) {
-		unsigned int bytes = PAGE_SIZE;
+		unsigned int bytes;
+
+		if (map_data)
+			bytes = 1U << (PAGE_SHIFT + map_data->page_order);
+		else
+			bytes = PAGE_SIZE;
 
 		if (bytes > len)
 			bytes = len;
 
-		page = alloc_page(q->bounce_gfp | GFP_KERNEL);
+		if (map_data) {
+			if (i == map_data->nr_entries) {
+				ret = -ENOMEM;
+				break;
+			}
+			page = map_data->pages[i++];
+		} else
+			page = alloc_page(q->bounce_gfp | gfp_mask);
 		if (!page) {
 			ret = -ENOMEM;
 			break;
@@ -633,16 +699,17 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
 	 * success
 	 */
 	if (!write_to_vm) {
-		ret = __bio_copy_iov(bio, iov, iov_count, 0);
+		ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0);
 		if (ret)
 			goto cleanup;
 	}
 
-	bio_set_map_data(bmd, bio, iov, iov_count);
+	bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1);
 	return bio;
 cleanup:
-	bio_for_each_segment(bvec, bio, i)
-		__free_page(bvec->bv_page);
+	if (!map_data)
+		bio_for_each_segment(bvec, bio, i)
+			__free_page(bvec->bv_page);
 
 	bio_put(bio);
 out_bmd:
@@ -653,29 +720,32 @@ out_bmd:
 /**
  * bio_copy_user - copy user data to bio
  * @q: destination block queue
+ * @map_data: pointer to the rq_map_data holding pages (if necessary)
  * @uaddr: start of user address
  * @len: length in bytes
  * @write_to_vm: bool indicating writing to pages or not
+ * @gfp_mask: memory allocation flags
  *
  * Prepares and returns a bio for indirect user io, bouncing data
  * to/from kernel pages as necessary. Must be paired with
  * call bio_uncopy_user() on io completion.
  */
-struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr,
-			  unsigned int len, int write_to_vm)
+struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data,
+			  unsigned long uaddr, unsigned int len,
+			  int write_to_vm, gfp_t gfp_mask)
 {
 	struct sg_iovec iov;
 
 	iov.iov_base = (void __user *)uaddr;
 	iov.iov_len = len;
 
-	return bio_copy_user_iov(q, &iov, 1, write_to_vm);
+	return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask);
 }
 
 static struct bio *__bio_map_user_iov(struct request_queue *q,
 				      struct block_device *bdev,
 				      struct sg_iovec *iov, int iov_count,
-				      int write_to_vm)
+				      int write_to_vm, gfp_t gfp_mask)
 {
 	int i, j;
 	int nr_pages = 0;
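Callers migrate mechanically to the widened signature, mirroring the bio_copy_kern() conversion at the end of this patch: a NULL rq_map_data keeps the old behavior of allocating bounce pages internally, and the gfp mask becomes the caller's choice instead of a hardwired GFP_KERNEL. A sketch (uaddr, len and reading are assumed locals of the caller):

	bio = bio_copy_user(q, NULL, uaddr, len, reading, GFP_KERNEL);
	if (IS_ERR(bio))
		return PTR_ERR(bio);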
@@ -701,12 +771,12 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
 	if (!nr_pages)
 		return ERR_PTR(-EINVAL);
 
-	bio = bio_alloc(GFP_KERNEL, nr_pages);
+	bio = bio_alloc(gfp_mask, nr_pages);
 	if (!bio)
 		return ERR_PTR(-ENOMEM);
 
 	ret = -ENOMEM;
-	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
+	pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask);
 	if (!pages)
 		goto out;
 
@@ -785,19 +855,21 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
  * @uaddr: start of user address
  * @len: length in bytes
  * @write_to_vm: bool indicating writing to pages or not
+ * @gfp_mask: memory allocation flags
  *
  * Map the user space address into a bio suitable for io to a block
  * device. Returns an error pointer in case of error.
  */
 struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
-			 unsigned long uaddr, unsigned int len, int write_to_vm)
+			 unsigned long uaddr, unsigned int len, int write_to_vm,
+			 gfp_t gfp_mask)
 {
 	struct sg_iovec iov;
 
 	iov.iov_base = (void __user *)uaddr;
 	iov.iov_len = len;
 
-	return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm);
+	return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask);
 }
 
 /**
@@ -807,18 +879,19 @@ struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
807 * @iov: the iovec. 879 * @iov: the iovec.
808 * @iov_count: number of elements in the iovec 880 * @iov_count: number of elements in the iovec
809 * @write_to_vm: bool indicating writing to pages or not 881 * @write_to_vm: bool indicating writing to pages or not
882 * @gfp_mask: memory allocation flags
810 * 883 *
811 * Map the user space address into a bio suitable for io to a block 884 * Map the user space address into a bio suitable for io to a block
812 * device. Returns an error pointer in case of error. 885 * device. Returns an error pointer in case of error.
813 */ 886 */
814struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev, 887struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev,
815 struct sg_iovec *iov, int iov_count, 888 struct sg_iovec *iov, int iov_count,
816 int write_to_vm) 889 int write_to_vm, gfp_t gfp_mask)
817{ 890{
818 struct bio *bio; 891 struct bio *bio;
819 892
820 bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm); 893 bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm,
821 894 gfp_mask);
822 if (IS_ERR(bio)) 895 if (IS_ERR(bio))
823 return bio; 896 return bio;
824 897
@@ -942,19 +1015,22 @@ static void bio_copy_kern_endio(struct bio *bio, int err)
942{ 1015{
943 struct bio_vec *bvec; 1016 struct bio_vec *bvec;
944 const int read = bio_data_dir(bio) == READ; 1017 const int read = bio_data_dir(bio) == READ;
945 char *p = bio->bi_private; 1018 struct bio_map_data *bmd = bio->bi_private;
946 int i; 1019 int i;
1020 char *p = bmd->sgvecs[0].iov_base;
947 1021
948 __bio_for_each_segment(bvec, bio, i, 0) { 1022 __bio_for_each_segment(bvec, bio, i, 0) {
949 char *addr = page_address(bvec->bv_page); 1023 char *addr = page_address(bvec->bv_page);
1024 int len = bmd->iovecs[i].bv_len;
950 1025
951 if (read && !err) 1026 if (read && !err)
952 memcpy(p, addr, bvec->bv_len); 1027 memcpy(p, addr, len);
953 1028
954 __free_page(bvec->bv_page); 1029 __free_page(bvec->bv_page);
955 p += bvec->bv_len; 1030 p += len;
956 } 1031 }
957 1032
1033 bio_free_map_data(bmd);
958 bio_put(bio); 1034 bio_put(bio);
959} 1035}
960 1036
@@ -972,38 +1048,13 @@ static void bio_copy_kern_endio(struct bio *bio, int err)
972struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, 1048struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
973 gfp_t gfp_mask, int reading) 1049 gfp_t gfp_mask, int reading)
974{ 1050{
975 unsigned long kaddr = (unsigned long)data;
976 unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
977 unsigned long start = kaddr >> PAGE_SHIFT;
978 const int nr_pages = end - start;
979 struct bio *bio; 1051 struct bio *bio;
980 struct bio_vec *bvec; 1052 struct bio_vec *bvec;
981 int i, ret; 1053 int i;
982
983 bio = bio_alloc(gfp_mask, nr_pages);
984 if (!bio)
985 return ERR_PTR(-ENOMEM);
986
987 while (len) {
988 struct page *page;
989 unsigned int bytes = PAGE_SIZE;
990
991 if (bytes > len)
992 bytes = len;
993
994 page = alloc_page(q->bounce_gfp | gfp_mask);
995 if (!page) {
996 ret = -ENOMEM;
997 goto cleanup;
998 }
999
1000 if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) {
1001 ret = -EINVAL;
1002 goto cleanup;
1003 }
1004 1054
1005 len -= bytes; 1055 bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask);
1006 } 1056 if (IS_ERR(bio))
1057 return bio;
1007 1058
1008 if (!reading) { 1059 if (!reading) {
1009 void *p = data; 1060 void *p = data;
@@ -1016,16 +1067,9 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
1016 } 1067 }
1017 } 1068 }
1018 1069
1019 bio->bi_private = data;
1020 bio->bi_end_io = bio_copy_kern_endio; 1070 bio->bi_end_io = bio_copy_kern_endio;
1021 return bio;
1022cleanup:
1023 bio_for_each_segment(bvec, bio, i)
1024 __free_page(bvec->bv_page);
1025
1026 bio_put(bio);
1027 1071
1028 return ERR_PTR(ret); 1072 return bio;
1029} 1073}
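
Since bio_copy_kern() now simply bounces through bio_copy_user() with a NULL map_data, its calling convention is unchanged; a hedged caller fragment (q, buf, len and reading are assumed context):

	struct bio *bio;

	bio = bio_copy_kern(q, buf, len, GFP_KERNEL, reading);
	if (IS_ERR(bio))
		return PTR_ERR(bio);
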
1030 1074
1031/* 1075/*
@@ -1212,9 +1256,9 @@ static void bio_pair_end_2(struct bio *bi, int err)
1212 * split a bio - only worry about a bio with a single page 1256 * split a bio - only worry about a bio with a single page
 1213 * in its iovec 1257 * in its iovec
1214 */ 1258 */
1215struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors) 1259struct bio_pair *bio_split(struct bio *bi, int first_sectors)
1216{ 1260{
1217 struct bio_pair *bp = mempool_alloc(pool, GFP_NOIO); 1261 struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO);
1218 1262
1219 if (!bp) 1263 if (!bp)
1220 return bp; 1264 return bp;
@@ -1248,7 +1292,7 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
1248 bp->bio2.bi_end_io = bio_pair_end_2; 1292 bp->bio2.bi_end_io = bio_pair_end_2;
1249 1293
1250 bp->bio1.bi_private = bi; 1294 bp->bio1.bi_private = bi;
1251 bp->bio2.bi_private = pool; 1295 bp->bio2.bi_private = bio_split_pool;
1252 1296
1253 if (bio_integrity(bi)) 1297 if (bio_integrity(bi))
1254 bio_integrity_split(bi, bp, first_sectors); 1298 bio_integrity_split(bi, bp, first_sectors);
@@ -1256,6 +1300,42 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
1256 return bp; 1300 return bp;
1257} 1301}
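
With the mempool argument dropped, callers split against the global bio_split_pool; a hedged sketch of the updated call site, loosely following the md/raid0 pattern (first_sectors is an assumed split point):

	struct bio_pair *bp;

	/* bio1 covers the first first_sectors sectors, bio2 the rest */
	bp = bio_split(bio, first_sectors);
	if (bp) {
		generic_make_request(&bp->bio1);
		generic_make_request(&bp->bio2);
		bio_pair_release(bp);
	}
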
1258 1302
1303/**
1304 * bio_sector_offset - Find hardware sector offset in bio
1305 * @bio: bio to inspect
1306 * @index: bio_vec index
1307 * @offset: offset in bv_page
1308 *
 1309 * Return the number of hardware sectors between the beginning of a bio
1310 * and an end point indicated by a bio_vec index and an offset
1311 * within that vector's page.
1312 */
1313sector_t bio_sector_offset(struct bio *bio, unsigned short index,
1314 unsigned int offset)
1315{
1316 unsigned int sector_sz = queue_hardsect_size(bio->bi_bdev->bd_disk->queue);
1317 struct bio_vec *bv;
1318 sector_t sectors;
1319 int i;
1320
1321 sectors = 0;
1322
1323 if (index >= bio->bi_idx)
1324 index = bio->bi_vcnt - 1;
1325
1326 __bio_for_each_segment(bv, bio, i, 0) {
1327 if (i == index) {
1328 if (offset > bv->bv_offset)
1329 sectors += (offset - bv->bv_offset) / sector_sz;
1330 break;
1331 }
1332
1333 sectors += bv->bv_len / sector_sz;
1334 }
1335
1336 return sectors;
1337}
1338EXPORT_SYMBOL(bio_sector_offset);
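
A hedged usage fragment (idx and byte_off are hypothetical coordinates inside the bio):

	/* hardware sectors between the start of the bio and byte_off
	 * within bio_vec idx, e.g. when splitting integrity metadata */
	sector_t off = bio_sector_offset(bio, idx, byte_off);
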
1259 1339
1260/* 1340/*
1261 * create memory pools for biovec's in a bio_set. 1341 * create memory pools for biovec's in a bio_set.
@@ -1358,6 +1438,7 @@ static int __init init_bio(void)
1358subsys_initcall(init_bio); 1438subsys_initcall(init_bio);
1359 1439
1360EXPORT_SYMBOL(bio_alloc); 1440EXPORT_SYMBOL(bio_alloc);
1441EXPORT_SYMBOL(bio_kmalloc);
1361EXPORT_SYMBOL(bio_put); 1442EXPORT_SYMBOL(bio_put);
1362EXPORT_SYMBOL(bio_free); 1443EXPORT_SYMBOL(bio_free);
1363EXPORT_SYMBOL(bio_endio); 1444EXPORT_SYMBOL(bio_endio);
@@ -1365,7 +1446,6 @@ EXPORT_SYMBOL(bio_init);
1365EXPORT_SYMBOL(__bio_clone); 1446EXPORT_SYMBOL(__bio_clone);
1366EXPORT_SYMBOL(bio_clone); 1447EXPORT_SYMBOL(bio_clone);
1367EXPORT_SYMBOL(bio_phys_segments); 1448EXPORT_SYMBOL(bio_phys_segments);
1368EXPORT_SYMBOL(bio_hw_segments);
1369EXPORT_SYMBOL(bio_add_page); 1449EXPORT_SYMBOL(bio_add_page);
1370EXPORT_SYMBOL(bio_add_pc_page); 1450EXPORT_SYMBOL(bio_add_pc_page);
1371EXPORT_SYMBOL(bio_get_nr_vecs); 1451EXPORT_SYMBOL(bio_get_nr_vecs);
@@ -1375,7 +1455,6 @@ EXPORT_SYMBOL(bio_map_kern);
1375EXPORT_SYMBOL(bio_copy_kern); 1455EXPORT_SYMBOL(bio_copy_kern);
1376EXPORT_SYMBOL(bio_pair_release); 1456EXPORT_SYMBOL(bio_pair_release);
1377EXPORT_SYMBOL(bio_split); 1457EXPORT_SYMBOL(bio_split);
1378EXPORT_SYMBOL(bio_split_pool);
1379EXPORT_SYMBOL(bio_copy_user); 1458EXPORT_SYMBOL(bio_copy_user);
1380EXPORT_SYMBOL(bio_uncopy_user); 1459EXPORT_SYMBOL(bio_uncopy_user);
1381EXPORT_SYMBOL(bioset_create); 1460EXPORT_SYMBOL(bioset_create);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index aff54219e049..d84f0469a016 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -540,22 +540,6 @@ EXPORT_SYMBOL(bd_release);
540 * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 540 * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
541 */ 541 */
542 542
543static struct kobject *bdev_get_kobj(struct block_device *bdev)
544{
545 if (bdev->bd_contains != bdev)
546 return kobject_get(&bdev->bd_part->dev.kobj);
547 else
548 return kobject_get(&bdev->bd_disk->dev.kobj);
549}
550
551static struct kobject *bdev_get_holder(struct block_device *bdev)
552{
553 if (bdev->bd_contains != bdev)
554 return kobject_get(bdev->bd_part->holder_dir);
555 else
556 return kobject_get(bdev->bd_disk->holder_dir);
557}
558
559static int add_symlink(struct kobject *from, struct kobject *to) 543static int add_symlink(struct kobject *from, struct kobject *to)
560{ 544{
561 if (!from || !to) 545 if (!from || !to)
@@ -604,11 +588,11 @@ static int bd_holder_grab_dirs(struct block_device *bdev,
604 if (!bo->hdev) 588 if (!bo->hdev)
605 goto fail_put_sdir; 589 goto fail_put_sdir;
606 590
607 bo->sdev = bdev_get_kobj(bdev); 591 bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj);
608 if (!bo->sdev) 592 if (!bo->sdev)
609 goto fail_put_hdev; 593 goto fail_put_hdev;
610 594
611 bo->hdir = bdev_get_holder(bdev); 595 bo->hdir = kobject_get(bdev->bd_part->holder_dir);
612 if (!bo->hdir) 596 if (!bo->hdir)
613 goto fail_put_sdev; 597 goto fail_put_sdev;
614 598
@@ -868,6 +852,87 @@ struct block_device *open_by_devnum(dev_t dev, unsigned mode)
868 852
869EXPORT_SYMBOL(open_by_devnum); 853EXPORT_SYMBOL(open_by_devnum);
870 854
855/**
856 * flush_disk - invalidates all buffer-cache entries on a disk
857 *
858 * @bdev: struct block device to be flushed
859 *
860 * Invalidates all buffer-cache entries on a disk. It should be called
861 * when a disk has been changed -- either by a media change or online
862 * resize.
863 */
864static void flush_disk(struct block_device *bdev)
865{
866 if (__invalidate_device(bdev)) {
867 char name[BDEVNAME_SIZE] = "";
868
869 if (bdev->bd_disk)
870 disk_name(bdev->bd_disk, 0, name);
871 printk(KERN_WARNING "VFS: busy inodes on changed media or "
872 "resized disk %s\n", name);
873 }
874
875 if (!bdev->bd_disk)
876 return;
877 if (disk_partitionable(bdev->bd_disk))
878 bdev->bd_invalidated = 1;
879}
880
881/**
882 * check_disk_size_change - checks for disk size change and adjusts bdev size.
883 * @disk: struct gendisk to check
884 * @bdev: struct bdev to adjust.
885 *
 886 * This routine checks whether the bdev size matches the disk size
 887 * and adjusts the bdev size if they differ.
888 */
889void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
890{
891 loff_t disk_size, bdev_size;
892
893 disk_size = (loff_t)get_capacity(disk) << 9;
894 bdev_size = i_size_read(bdev->bd_inode);
895 if (disk_size != bdev_size) {
896 char name[BDEVNAME_SIZE];
897
898 disk_name(disk, 0, name);
899 printk(KERN_INFO
900 "%s: detected capacity change from %lld to %lld\n",
901 name, bdev_size, disk_size);
902 i_size_write(bdev->bd_inode, disk_size);
903 flush_disk(bdev);
904 }
905}
906EXPORT_SYMBOL(check_disk_size_change);
907
908/**
909 * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
910 * @disk: struct gendisk to be revalidated
911 *
 912 * This routine is a wrapper for lower-level drivers' revalidate_disk
 913 * call-backs. It is used to do the common pre- and post-operations needed
914 * for all revalidate_disk operations.
915 */
916int revalidate_disk(struct gendisk *disk)
917{
918 struct block_device *bdev;
919 int ret = 0;
920
921 if (disk->fops->revalidate_disk)
922 ret = disk->fops->revalidate_disk(disk);
923
924 bdev = bdget_disk(disk, 0);
925 if (!bdev)
926 return ret;
927
928 mutex_lock(&bdev->bd_mutex);
929 check_disk_size_change(disk, bdev);
930 mutex_unlock(&bdev->bd_mutex);
931 bdput(bdev);
932 return ret;
933}
934EXPORT_SYMBOL(revalidate_disk);
935
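
A hedged driver-side sketch of the intended sequence (new_sectors is an assumed value reported by the device):

	/*
	 * After the device reports a new capacity, update the gendisk;
	 * revalidate_disk() then resyncs the bdev size via
	 * check_disk_size_change().
	 */
	set_capacity(disk, new_sectors);
	revalidate_disk(disk);
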
871/* 936/*
872 * This routine checks whether a removable media has been changed, 937 * This routine checks whether a removable media has been changed,
873 * and invalidates all buffer-cache-entries in that case. This 938 * and invalidates all buffer-cache-entries in that case. This
@@ -887,13 +952,9 @@ int check_disk_change(struct block_device *bdev)
887 if (!bdops->media_changed(bdev->bd_disk)) 952 if (!bdops->media_changed(bdev->bd_disk))
888 return 0; 953 return 0;
889 954
890 if (__invalidate_device(bdev)) 955 flush_disk(bdev);
891 printk("VFS: busy inodes on changed media.\n");
892
893 if (bdops->revalidate_disk) 956 if (bdops->revalidate_disk)
894 bdops->revalidate_disk(bdev->bd_disk); 957 bdops->revalidate_disk(bdev->bd_disk);
895 if (bdev->bd_disk->minors > 1)
896 bdev->bd_invalidated = 1;
897 return 1; 958 return 1;
898} 959}
899 960
@@ -927,10 +988,10 @@ static int __blkdev_put(struct block_device *bdev, int for_part);
927 988
928static int do_open(struct block_device *bdev, struct file *file, int for_part) 989static int do_open(struct block_device *bdev, struct file *file, int for_part)
929{ 990{
930 struct module *owner = NULL;
931 struct gendisk *disk; 991 struct gendisk *disk;
992 struct hd_struct *part = NULL;
932 int ret; 993 int ret;
933 int part; 994 int partno;
934 int perm = 0; 995 int perm = 0;
935 996
936 if (file->f_mode & FMODE_READ) 997 if (file->f_mode & FMODE_READ)
@@ -948,25 +1009,27 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
948 1009
949 ret = -ENXIO; 1010 ret = -ENXIO;
950 file->f_mapping = bdev->bd_inode->i_mapping; 1011 file->f_mapping = bdev->bd_inode->i_mapping;
1012
951 lock_kernel(); 1013 lock_kernel();
952 disk = get_gendisk(bdev->bd_dev, &part); 1014
953 if (!disk) { 1015 disk = get_gendisk(bdev->bd_dev, &partno);
954 unlock_kernel(); 1016 if (!disk)
955 bdput(bdev); 1017 goto out_unlock_kernel;
956 return ret; 1018 part = disk_get_part(disk, partno);
957 } 1019 if (!part)
958 owner = disk->fops->owner; 1020 goto out_unlock_kernel;
959 1021
960 mutex_lock_nested(&bdev->bd_mutex, for_part); 1022 mutex_lock_nested(&bdev->bd_mutex, for_part);
961 if (!bdev->bd_openers) { 1023 if (!bdev->bd_openers) {
962 bdev->bd_disk = disk; 1024 bdev->bd_disk = disk;
1025 bdev->bd_part = part;
963 bdev->bd_contains = bdev; 1026 bdev->bd_contains = bdev;
964 if (!part) { 1027 if (!partno) {
965 struct backing_dev_info *bdi; 1028 struct backing_dev_info *bdi;
966 if (disk->fops->open) { 1029 if (disk->fops->open) {
967 ret = disk->fops->open(bdev->bd_inode, file); 1030 ret = disk->fops->open(bdev->bd_inode, file);
968 if (ret) 1031 if (ret)
969 goto out_first; 1032 goto out_clear;
970 } 1033 }
971 if (!bdev->bd_openers) { 1034 if (!bdev->bd_openers) {
972 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); 1035 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
@@ -978,36 +1041,36 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
978 if (bdev->bd_invalidated) 1041 if (bdev->bd_invalidated)
979 rescan_partitions(disk, bdev); 1042 rescan_partitions(disk, bdev);
980 } else { 1043 } else {
981 struct hd_struct *p;
982 struct block_device *whole; 1044 struct block_device *whole;
983 whole = bdget_disk(disk, 0); 1045 whole = bdget_disk(disk, 0);
984 ret = -ENOMEM; 1046 ret = -ENOMEM;
985 if (!whole) 1047 if (!whole)
986 goto out_first; 1048 goto out_clear;
987 BUG_ON(for_part); 1049 BUG_ON(for_part);
988 ret = __blkdev_get(whole, file->f_mode, file->f_flags, 1); 1050 ret = __blkdev_get(whole, file->f_mode, file->f_flags, 1);
989 if (ret) 1051 if (ret)
990 goto out_first; 1052 goto out_clear;
991 bdev->bd_contains = whole; 1053 bdev->bd_contains = whole;
992 p = disk->part[part - 1];
993 bdev->bd_inode->i_data.backing_dev_info = 1054 bdev->bd_inode->i_data.backing_dev_info =
994 whole->bd_inode->i_data.backing_dev_info; 1055 whole->bd_inode->i_data.backing_dev_info;
995 if (!(disk->flags & GENHD_FL_UP) || !p || !p->nr_sects) { 1056 if (!(disk->flags & GENHD_FL_UP) ||
1057 !part || !part->nr_sects) {
996 ret = -ENXIO; 1058 ret = -ENXIO;
997 goto out_first; 1059 goto out_clear;
998 } 1060 }
999 kobject_get(&p->dev.kobj); 1061 bd_set_size(bdev, (loff_t)part->nr_sects << 9);
1000 bdev->bd_part = p;
1001 bd_set_size(bdev, (loff_t) p->nr_sects << 9);
1002 } 1062 }
1003 } else { 1063 } else {
1064 disk_put_part(part);
1004 put_disk(disk); 1065 put_disk(disk);
1005 module_put(owner); 1066 module_put(disk->fops->owner);
1067 part = NULL;
1068 disk = NULL;
1006 if (bdev->bd_contains == bdev) { 1069 if (bdev->bd_contains == bdev) {
1007 if (bdev->bd_disk->fops->open) { 1070 if (bdev->bd_disk->fops->open) {
1008 ret = bdev->bd_disk->fops->open(bdev->bd_inode, file); 1071 ret = bdev->bd_disk->fops->open(bdev->bd_inode, file);
1009 if (ret) 1072 if (ret)
1010 goto out; 1073 goto out_unlock_bdev;
1011 } 1074 }
1012 if (bdev->bd_invalidated) 1075 if (bdev->bd_invalidated)
1013 rescan_partitions(bdev->bd_disk, bdev); 1076 rescan_partitions(bdev->bd_disk, bdev);
@@ -1020,19 +1083,24 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
1020 unlock_kernel(); 1083 unlock_kernel();
1021 return 0; 1084 return 0;
1022 1085
1023out_first: 1086 out_clear:
1024 bdev->bd_disk = NULL; 1087 bdev->bd_disk = NULL;
1088 bdev->bd_part = NULL;
1025 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; 1089 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
1026 if (bdev != bdev->bd_contains) 1090 if (bdev != bdev->bd_contains)
1027 __blkdev_put(bdev->bd_contains, 1); 1091 __blkdev_put(bdev->bd_contains, 1);
1028 bdev->bd_contains = NULL; 1092 bdev->bd_contains = NULL;
1029 put_disk(disk); 1093 out_unlock_bdev:
1030 module_put(owner);
1031out:
1032 mutex_unlock(&bdev->bd_mutex); 1094 mutex_unlock(&bdev->bd_mutex);
1095 out_unlock_kernel:
1033 unlock_kernel(); 1096 unlock_kernel();
1034 if (ret) 1097
1035 bdput(bdev); 1098 disk_put_part(part);
1099 if (disk)
1100 module_put(disk->fops->owner);
1101 put_disk(disk);
1102 bdput(bdev);
1103
1036 return ret; 1104 return ret;
1037} 1105}
1038 1106
@@ -1117,11 +1185,8 @@ static int __blkdev_put(struct block_device *bdev, int for_part)
1117 1185
1118 put_disk(disk); 1186 put_disk(disk);
1119 module_put(owner); 1187 module_put(owner);
1120 1188 disk_put_part(bdev->bd_part);
1121 if (bdev->bd_contains != bdev) { 1189 bdev->bd_part = NULL;
1122 kobject_put(&bdev->bd_part->dev.kobj);
1123 bdev->bd_part = NULL;
1124 }
1125 bdev->bd_disk = NULL; 1190 bdev->bd_disk = NULL;
1126 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; 1191 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
1127 if (bdev != bdev->bd_contains) 1192 if (bdev != bdev->bd_contains)
@@ -1197,10 +1262,9 @@ EXPORT_SYMBOL(ioctl_by_bdev);
1197 1262
1198/** 1263/**
1199 * lookup_bdev - lookup a struct block_device by name 1264 * lookup_bdev - lookup a struct block_device by name
1265 * @pathname: special file representing the block device
1200 * 1266 *
1201 * @path: special file representing the block device 1267 * Get a reference to the blockdevice at @pathname in the current
1202 *
1203 * Get a reference to the blockdevice at @path in the current
1204 * namespace if possible and return it. Return ERR_PTR(error) 1268 * namespace if possible and return it. Return ERR_PTR(error)
1205 * otherwise. 1269 * otherwise.
1206 */ 1270 */
diff --git a/fs/buffer.c b/fs/buffer.c
index 38653e36e225..ac78d4c19b3b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2926,14 +2926,17 @@ int submit_bh(int rw, struct buffer_head * bh)
2926 BUG_ON(!buffer_mapped(bh)); 2926 BUG_ON(!buffer_mapped(bh));
2927 BUG_ON(!bh->b_end_io); 2927 BUG_ON(!bh->b_end_io);
2928 2928
2929 if (buffer_ordered(bh) && (rw == WRITE)) 2929 /*
2930 rw = WRITE_BARRIER; 2930 * Mask in barrier bit for a write (could be either a WRITE or a
 2931 * WRITE_SYNC)
2932 */
2933 if (buffer_ordered(bh) && (rw & WRITE))
2934 rw |= WRITE_BARRIER;
2931 2935
2932 /* 2936 /*
2933 * Only clear out a write error when rewriting, should this 2937 * Only clear out a write error when rewriting
2934 * include WRITE_SYNC as well?
2935 */ 2938 */
2936 if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER)) 2939 if (test_set_buffer_req(bh) && (rw & WRITE))
2937 clear_buffer_write_io_error(bh); 2940 clear_buffer_write_io_error(bh);
2938 2941
2939 /* 2942 /*
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index f5d0083e09fa..06e521a945c3 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -4,7 +4,15 @@ Fix premature write failure on congested networks (we would give up
4on EAGAIN from the socket too quickly on large writes). 4on EAGAIN from the socket too quickly on large writes).
5Cifs_mkdir and cifs_create now respect the setgid bit on parent dir. 5Cifs_mkdir and cifs_create now respect the setgid bit on parent dir.
6Fix endian problems in acl (mode from/to cifs acl) on bigendian 6Fix endian problems in acl (mode from/to cifs acl) on bigendian
7architectures. 7architectures. Fix problems with preserving timestamps on copying open
8files (e.g. "cp -a") to Windows servers. For mkdir and create honor setgid bit
9on parent directory when server supports Unix Extensions but not POSIX
10create. Update cifs.upcall version to handle new Kerberos sec flags
11(this requires an update of the cifs.upcall program from Samba). Fix memory leak
12on dns_upcall (resolving DFS referrals). Fix plain text password
13authentication (requires setting SecurityFlags to 0x30030 to enable
14lanman and plain text though). Fix writes to be at correct offset when
15file is open with O_APPEND and file is on a directio (forcedirectio) mount.
8 16
9Version 1.53 17Version 1.53
10------------ 18------------
diff --git a/fs/cifs/README b/fs/cifs/README
index 2bd6fe556f88..bd2343d4c6a6 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -542,10 +542,20 @@ SecurityFlags Flags which control security negotiation and
542 hashing mechanisms (as "must use") on the other hand 542 hashing mechanisms (as "must use") on the other hand
543 does not make much sense. Default flags are 543 does not make much sense. Default flags are
544 0x07007 544 0x07007
545 (NTLM, NTLMv2 and packet signing allowed). Maximum 545 (NTLM, NTLMv2 and packet signing allowed). The maximum
546 allowable flags if you want to allow mounts to servers 546 allowable flags if you want to allow mounts to servers
547 using weaker password hashes is 0x37037 (lanman, 547 using weaker password hashes is 0x37037 (lanman,
548 plaintext, ntlm, ntlmv2, signing allowed): 548 plaintext, ntlm, ntlmv2, signing allowed). Some
549 SecurityFlags require the corresponding menuconfig
550 options to be enabled (lanman and plaintext require
551 CONFIG_CIFS_WEAK_PW_HASH for example). Enabling
552          plaintext authentication currently also requires
553          enabling lanman authentication in the security flags
554          because the cifs module only supports sending
555          plaintext passwords using the older lanman dialect
556 form of the session setup SMB. (e.g. for authentication
557 using plain text passwords, set the SecurityFlags
558 to 0x30030):
549 559
550 may use packet signing 0x00001 560 may use packet signing 0x00001
551 must use packet signing 0x01001 561 must use packet signing 0x01001
@@ -642,8 +652,30 @@ The statistics for the number of total SMBs and oplock breaks are different in
642that they represent all for that share, not just those for which the server 652that they represent all for that share, not just those for which the server
643returned success. 653returned success.
644 654
645Also note that "cat /proc/fs/cifs/DebugData" will display information about 655Also note that "cat /proc/fs/cifs/DebugData" will display information about
646the active sessions and the shares that are mounted. 656the active sessions and the shares that are mounted.
647Enabling Kerberos (extended security) works when CONFIG_CIFS_EXPERIMENTAL is 657
648on but requires a user space helper (from the Samba project). NTLM and NTLMv2 and 658Enabling Kerberos (extended security) works but requires version 1.2 or later
649LANMAN support do not require this helper. 659of the helper program cifs.upcall to be present and to be configured in the
660/etc/request-key.conf file. The cifs.upcall helper program is from the Samba
661project (http://www.samba.org). NTLM, NTLMv2, and LANMAN support do not
662require this helper. Note that NTLMv2 security (which does not require the
663cifs.upcall helper program), instead of using Kerberos, is sufficient for
664some use cases.
665
666Enabling DFS support (used to access shares transparently in an MS-DFS
667global name space) requires that CONFIG_CIFS_EXPERIMENTAL be enabled. In
668addition, DFS support for target shares specified as UNC names that
669begin with host names (rather than IP addresses) requires a user space
670helper (such as cifs.upcall) to be present in order to translate host
671names to IP addresses, and the user space helper must also be
672configured in the file /etc/request-key.conf.
673
674To use cifs Kerberos and DFS support, the Linux keyutils package should be
675installed and something like the following lines should be added to the
676/etc/request-key.conf file:
677
678create cifs.spnego * * /usr/local/sbin/cifs.upcall %k
679create dns_resolver * * /usr/local/sbin/cifs.upcall %k
680
681
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index 5fabd2caf93c..1b09f1670061 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -476,6 +476,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
476 unsigned int cls, con, tag, oidlen, rc; 476 unsigned int cls, con, tag, oidlen, rc;
477 bool use_ntlmssp = false; 477 bool use_ntlmssp = false;
478 bool use_kerberos = false; 478 bool use_kerberos = false;
479 bool use_mskerberos = false;
479 480
480 *secType = NTLM; /* BB eventually make Kerberos or NLTMSSP the default*/ 481 *secType = NTLM; /* BB eventually make Kerberos or NLTMSSP the default*/
481 482
@@ -574,10 +575,12 @@ decode_negTokenInit(unsigned char *security_blob, int length,
574 *(oid + 1), *(oid + 2), *(oid + 3))); 575 *(oid + 1), *(oid + 2), *(oid + 3)));
575 576
576 if (compare_oid(oid, oidlen, MSKRB5_OID, 577 if (compare_oid(oid, oidlen, MSKRB5_OID,
577 MSKRB5_OID_LEN)) 578 MSKRB5_OID_LEN) &&
578 use_kerberos = true; 579 !use_kerberos)
580 use_mskerberos = true;
579 else if (compare_oid(oid, oidlen, KRB5_OID, 581 else if (compare_oid(oid, oidlen, KRB5_OID,
580 KRB5_OID_LEN)) 582 KRB5_OID_LEN) &&
583 !use_mskerberos)
581 use_kerberos = true; 584 use_kerberos = true;
582 else if (compare_oid(oid, oidlen, NTLMSSP_OID, 585 else if (compare_oid(oid, oidlen, NTLMSSP_OID,
583 NTLMSSP_OID_LEN)) 586 NTLMSSP_OID_LEN))
@@ -630,6 +633,8 @@ decode_negTokenInit(unsigned char *security_blob, int length,
630 633
631 if (use_kerberos) 634 if (use_kerberos)
632 *secType = Kerberos; 635 *secType = Kerberos;
636 else if (use_mskerberos)
637 *secType = MSKerberos;
633 else if (use_ntlmssp) 638 else if (use_ntlmssp)
634 *secType = NTLMSSP; 639 *secType = NTLMSSP;
635 640
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 2434ab0e8791..fcee9298b620 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -66,11 +66,28 @@ struct key_type cifs_spnego_key_type = {
66 .describe = user_describe, 66 .describe = user_describe,
67}; 67};
68 68
69#define MAX_VER_STR_LEN 8 /* length of longest version string e.g. 69/* length of longest version string e.g. strlen("ver=0xFF") */
70 strlen("ver=0xFF") */ 70#define MAX_VER_STR_LEN 8
71#define MAX_MECH_STR_LEN 13 /* length of longest security mechanism name, eg 71
72 in future could have strlen(";sec=ntlmsspi") */ 72/* length of longest security mechanism name, eg in future could have
73#define MAX_IPV6_ADDR_LEN 42 /* eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/60 */ 73 * strlen(";sec=ntlmsspi") */
74#define MAX_MECH_STR_LEN 13
75
76/* max possible addr len eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/60 */
77#define MAX_IPV6_ADDR_LEN 42
78
79/* strlen of "host=" */
80#define HOST_KEY_LEN 5
81
82/* strlen of ";ip4=" or ";ip6=" */
83#define IP_KEY_LEN 5
84
85/* strlen of ";uid=0x" */
86#define UID_KEY_LEN 7
87
88/* strlen of ";user=" */
89#define USER_KEY_LEN 6
90
74/* get a key struct with a SPNEGO security blob, suitable for session setup */ 91/* get a key struct with a SPNEGO security blob, suitable for session setup */
75struct key * 92struct key *
76cifs_get_spnego_key(struct cifsSesInfo *sesInfo) 93cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
@@ -84,11 +101,11 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
84 /* length of fields (with semicolons): ver=0xyz ip4=ipaddress 101 /* length of fields (with semicolons): ver=0xyz ip4=ipaddress
85 host=hostname sec=mechanism uid=0xFF user=username */ 102 host=hostname sec=mechanism uid=0xFF user=username */
86 desc_len = MAX_VER_STR_LEN + 103 desc_len = MAX_VER_STR_LEN +
87 6 /* len of "host=" */ + strlen(hostname) + 104 HOST_KEY_LEN + strlen(hostname) +
88 5 /* len of ";ipv4=" */ + MAX_IPV6_ADDR_LEN + 105 IP_KEY_LEN + MAX_IPV6_ADDR_LEN +
89 MAX_MECH_STR_LEN + 106 MAX_MECH_STR_LEN +
90 7 /* len of ";uid=0x" */ + (sizeof(uid_t) * 2) + 107 UID_KEY_LEN + (sizeof(uid_t) * 2) +
91 6 /* len of ";user=" */ + strlen(sesInfo->userName) + 1; 108 USER_KEY_LEN + strlen(sesInfo->userName) + 1;
92 109
93 spnego_key = ERR_PTR(-ENOMEM); 110 spnego_key = ERR_PTR(-ENOMEM);
94 description = kzalloc(desc_len, GFP_KERNEL); 111 description = kzalloc(desc_len, GFP_KERNEL);
@@ -114,9 +131,11 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
114 131
115 dp = description + strlen(description); 132 dp = description + strlen(description);
116 133
117 /* for now, only sec=krb5 is valid */ 134 /* for now, only sec=krb5 and sec=mskrb5 are valid */
118 if (server->secType == Kerberos) 135 if (server->secType == Kerberos)
119 sprintf(dp, ";sec=krb5"); 136 sprintf(dp, ";sec=krb5");
137 else if (server->secType == MSKerberos)
138 sprintf(dp, ";sec=mskrb5");
120 else 139 else
121 goto out; 140 goto out;
122 141
diff --git a/fs/cifs/cifs_spnego.h b/fs/cifs/cifs_spnego.h
index 05a34b17a1ab..e4041ec4d712 100644
--- a/fs/cifs/cifs_spnego.h
+++ b/fs/cifs/cifs_spnego.h
@@ -23,7 +23,7 @@
23#ifndef _CIFS_SPNEGO_H 23#ifndef _CIFS_SPNEGO_H
24#define _CIFS_SPNEGO_H 24#define _CIFS_SPNEGO_H
25 25
26#define CIFS_SPNEGO_UPCALL_VERSION 1 26#define CIFS_SPNEGO_UPCALL_VERSION 2
27 27
28/* 28/*
29 * The version field should always be set to CIFS_SPNEGO_UPCALL_VERSION. 29 * The version field should always be set to CIFS_SPNEGO_UPCALL_VERSION.
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 83fd40dc1ef0..bd5f13d38450 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -294,6 +294,7 @@ void calc_lanman_hash(struct cifsSesInfo *ses, char *lnm_session_key)
294 294
295 if ((ses->server->secMode & SECMODE_PW_ENCRYPT) == 0) 295 if ((ses->server->secMode & SECMODE_PW_ENCRYPT) == 0)
296 if (extended_security & CIFSSEC_MAY_PLNTXT) { 296 if (extended_security & CIFSSEC_MAY_PLNTXT) {
297 memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE);
297 memcpy(lnm_session_key, password_with_pad, 298 memcpy(lnm_session_key, password_with_pad,
298 CIFS_ENCPWD_SIZE); 299 CIFS_ENCPWD_SIZE);
299 return; 300 return;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index e8da4ee761b5..25ecbd5b0404 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -175,6 +175,8 @@ out_no_root:
175 if (inode) 175 if (inode)
176 iput(inode); 176 iput(inode);
177 177
178 cifs_umount(sb, cifs_sb);
179
178out_mount_failed: 180out_mount_failed:
179 if (cifs_sb) { 181 if (cifs_sb) {
180#ifdef CONFIG_CIFS_DFS_UPCALL 182#ifdef CONFIG_CIFS_DFS_UPCALL
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 135c965c4137..f7b4a5cd837b 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -41,7 +41,7 @@ extern int cifs_create(struct inode *, struct dentry *, int,
41 struct nameidata *); 41 struct nameidata *);
42extern struct dentry *cifs_lookup(struct inode *, struct dentry *, 42extern struct dentry *cifs_lookup(struct inode *, struct dentry *,
43 struct nameidata *); 43 struct nameidata *);
44extern int cifs_unlink(struct inode *, struct dentry *); 44extern int cifs_unlink(struct inode *dir, struct dentry *dentry);
45extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *); 45extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *);
46extern int cifs_mknod(struct inode *, struct dentry *, int, dev_t); 46extern int cifs_mknod(struct inode *, struct dentry *, int, dev_t);
47extern int cifs_mkdir(struct inode *, struct dentry *, int); 47extern int cifs_mkdir(struct inode *, struct dentry *, int);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 7e1cf262effe..0d22479d99b7 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -80,7 +80,8 @@ enum securityEnum {
80 NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */ 80 NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */
81 RawNTLMSSP, /* NTLMSSP without SPNEGO */ 81 RawNTLMSSP, /* NTLMSSP without SPNEGO */
82 NTLMSSP, /* NTLMSSP via SPNEGO */ 82 NTLMSSP, /* NTLMSSP via SPNEGO */
83 Kerberos /* Kerberos via SPNEGO */ 83 Kerberos, /* Kerberos via SPNEGO */
84 MSKerberos, /* MS Kerberos via SPNEGO */
84}; 85};
85 86
86enum protocolEnum { 87enum protocolEnum {
@@ -308,6 +309,7 @@ struct cifs_search_info {
308 __u32 resume_key; 309 __u32 resume_key;
309 char *ntwrk_buf_start; 310 char *ntwrk_buf_start;
310 char *srch_entries_start; 311 char *srch_entries_start;
312 char *last_entry;
311 char *presume_name; 313 char *presume_name;
312 unsigned int resume_name_len; 314 unsigned int resume_name_len;
313 bool endOfSearch:1; 315 bool endOfSearch:1;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index a729d083e6f4..0cff7fe986e8 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -179,6 +179,8 @@ extern int CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon,
179extern int CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon, 179extern int CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
180 const FILE_BASIC_INFO *data, __u16 fid, 180 const FILE_BASIC_INFO *data, __u16 fid,
181 __u32 pid_of_opener); 181 __u32 pid_of_opener);
182extern int CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
183 bool delete_file, __u16 fid, __u32 pid_of_opener);
182#if 0 184#if 0
183extern int CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon, 185extern int CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon,
184 char *fileName, __u16 dos_attributes, 186 char *fileName, __u16 dos_attributes,
@@ -229,7 +231,7 @@ extern int CIFSSMBRename(const int xid, struct cifsTconInfo *tcon,
229 const struct nls_table *nls_codepage, 231 const struct nls_table *nls_codepage,
230 int remap_special_chars); 232 int remap_special_chars);
231extern int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon, 233extern int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
232 int netfid, char *target_name, 234 int netfid, const char *target_name,
233 const struct nls_table *nls_codepage, 235 const struct nls_table *nls_codepage,
234 int remap_special_chars); 236 int remap_special_chars);
235extern int CIFSCreateHardLink(const int xid, 237extern int CIFSCreateHardLink(const int xid,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 994de7c90474..6f4ffe15d68d 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -2017,7 +2017,7 @@ renameRetry:
2017} 2017}
2018 2018
2019int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon, 2019int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
2020 int netfid, char *target_name, 2020 int netfid, const char *target_name,
2021 const struct nls_table *nls_codepage, int remap) 2021 const struct nls_table *nls_codepage, int remap)
2022{ 2022{
2023 struct smb_com_transaction2_sfi_req *pSMB = NULL; 2023 struct smb_com_transaction2_sfi_req *pSMB = NULL;
@@ -2071,7 +2071,7 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
2071 remap); 2071 remap);
2072 } 2072 }
2073 rename_info->target_name_len = cpu_to_le32(2 * len_of_str); 2073 rename_info->target_name_len = cpu_to_le32(2 * len_of_str);
2074 count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str) + 2; 2074 count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str);
2075 byte_count += count; 2075 byte_count += count;
2076 pSMB->DataCount = cpu_to_le16(count); 2076 pSMB->DataCount = cpu_to_le16(count);
2077 pSMB->TotalDataCount = pSMB->DataCount; 2077 pSMB->TotalDataCount = pSMB->DataCount;
@@ -3614,6 +3614,8 @@ findFirstRetry:
3614 /* BB remember to free buffer if error BB */ 3614 /* BB remember to free buffer if error BB */
3615 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3615 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3616 if (rc == 0) { 3616 if (rc == 0) {
3617 unsigned int lnoff;
3618
3617 if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) 3619 if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE)
3618 psrch_inf->unicode = true; 3620 psrch_inf->unicode = true;
3619 else 3621 else
@@ -3636,6 +3638,17 @@ findFirstRetry:
3636 le16_to_cpu(parms->SearchCount); 3638 le16_to_cpu(parms->SearchCount);
3637 psrch_inf->index_of_last_entry = 2 /* skip . and .. */ + 3639 psrch_inf->index_of_last_entry = 2 /* skip . and .. */ +
3638 psrch_inf->entries_in_buffer; 3640 psrch_inf->entries_in_buffer;
3641 lnoff = le16_to_cpu(parms->LastNameOffset);
3642 if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
3643 lnoff) {
3644 cERROR(1, ("ignoring corrupt resume name"));
3645 psrch_inf->last_entry = NULL;
3646 return rc;
3647 }
3648
3649 psrch_inf->last_entry = psrch_inf->srch_entries_start +
3650 lnoff;
3651
3639 *pnetfid = parms->SearchHandle; 3652 *pnetfid = parms->SearchHandle;
3640 } else { 3653 } else {
3641 cifs_buf_release(pSMB); 3654 cifs_buf_release(pSMB);
@@ -3725,6 +3738,8 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
3725 rc = validate_t2((struct smb_t2_rsp *)pSMBr); 3738 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3726 3739
3727 if (rc == 0) { 3740 if (rc == 0) {
3741 unsigned int lnoff;
3742
3728 /* BB fixme add lock for file (srch_info) struct here */ 3743 /* BB fixme add lock for file (srch_info) struct here */
3729 if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) 3744 if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE)
3730 psrch_inf->unicode = true; 3745 psrch_inf->unicode = true;
@@ -3751,6 +3766,16 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
3751 le16_to_cpu(parms->SearchCount); 3766 le16_to_cpu(parms->SearchCount);
3752 psrch_inf->index_of_last_entry += 3767 psrch_inf->index_of_last_entry +=
3753 psrch_inf->entries_in_buffer; 3768 psrch_inf->entries_in_buffer;
3769 lnoff = le16_to_cpu(parms->LastNameOffset);
3770 if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
3771 lnoff) {
3772 cERROR(1, ("ignoring corrupt resume name"));
3773 psrch_inf->last_entry = NULL;
3774 return rc;
3775 } else
3776 psrch_inf->last_entry =
3777 psrch_inf->srch_entries_start + lnoff;
3778
3754/* cFYI(1,("fnxt2 entries in buf %d index_of_last %d", 3779/* cFYI(1,("fnxt2 entries in buf %d index_of_last %d",
3755 psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry)); */ 3780 psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry)); */
3756 3781
@@ -4876,6 +4901,61 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
4876 return rc; 4901 return rc;
4877} 4902}
4878 4903
4904int
4905CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
4906 bool delete_file, __u16 fid, __u32 pid_of_opener)
4907{
4908 struct smb_com_transaction2_sfi_req *pSMB = NULL;
4909 char *data_offset;
4910 int rc = 0;
4911 __u16 params, param_offset, offset, byte_count, count;
4912
4913 cFYI(1, ("Set File Disposition (via SetFileInfo)"));
4914 rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
4915
4916 if (rc)
4917 return rc;
4918
4919 pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener);
4920 pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16));
4921
4922 params = 6;
4923 pSMB->MaxSetupCount = 0;
4924 pSMB->Reserved = 0;
4925 pSMB->Flags = 0;
4926 pSMB->Timeout = 0;
4927 pSMB->Reserved2 = 0;
4928 param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
4929 offset = param_offset + params;
4930
4931 data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
4932
4933 count = 1;
4934 pSMB->MaxParameterCount = cpu_to_le16(2);
4935 /* BB find max SMB PDU from sess */
4936 pSMB->MaxDataCount = cpu_to_le16(1000);
4937 pSMB->SetupCount = 1;
4938 pSMB->Reserved3 = 0;
4939 pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION);
4940 byte_count = 3 /* pad */ + params + count;
4941 pSMB->DataCount = cpu_to_le16(count);
4942 pSMB->ParameterCount = cpu_to_le16(params);
4943 pSMB->TotalDataCount = pSMB->DataCount;
4944 pSMB->TotalParameterCount = pSMB->ParameterCount;
4945 pSMB->ParameterOffset = cpu_to_le16(param_offset);
4946 pSMB->DataOffset = cpu_to_le16(offset);
4947 pSMB->Fid = fid;
4948 pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_DISPOSITION_INFO);
4949 pSMB->Reserved4 = 0;
4950 pSMB->hdr.smb_buf_length += byte_count;
4951 pSMB->ByteCount = cpu_to_le16(byte_count);
4952 *data_offset = delete_file ? 1 : 0;
4953 rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
4954 if (rc)
4955 cFYI(1, ("Send error in SetFileDisposition = %d", rc));
4956
4957 return rc;
4958}
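
A hedged caller fragment for the new helper (netfid is an assumed open file handle; tagging the request with the opener's pid follows the other SetFileInfo callers in this file):

	/* mark the open file delete-on-close */
	rc = CIFSSMBSetFileDisposition(xid, tcon, true, netfid,
				       current->tgid);
	if (rc)
		cFYI(1, ("SetFileDisposition failed rc = %d", rc));
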
4879 4959
4880int 4960int
4881CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon, 4961CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon,
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 0711db65afe8..4c13bcdb92a5 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3598,19 +3598,21 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
3598 char ntlm_session_key[CIFS_SESS_KEY_SIZE]; 3598 char ntlm_session_key[CIFS_SESS_KEY_SIZE];
3599 bool ntlmv2_flag = false; 3599 bool ntlmv2_flag = false;
3600 int first_time = 0; 3600 int first_time = 0;
3601 struct TCP_Server_Info *server = pSesInfo->server;
3601 3602
3602 /* what if server changes its buffer size after dropping the session? */ 3603 /* what if server changes its buffer size after dropping the session? */
3603 if (pSesInfo->server->maxBuf == 0) /* no need to send on reconnect */ { 3604 if (server->maxBuf == 0) /* no need to send on reconnect */ {
3604 rc = CIFSSMBNegotiate(xid, pSesInfo); 3605 rc = CIFSSMBNegotiate(xid, pSesInfo);
3605 if (rc == -EAGAIN) /* retry only once on 1st time connection */ { 3606 if (rc == -EAGAIN) {
3607 /* retry only once on 1st time connection */
3606 rc = CIFSSMBNegotiate(xid, pSesInfo); 3608 rc = CIFSSMBNegotiate(xid, pSesInfo);
3607 if (rc == -EAGAIN) 3609 if (rc == -EAGAIN)
3608 rc = -EHOSTDOWN; 3610 rc = -EHOSTDOWN;
3609 } 3611 }
3610 if (rc == 0) { 3612 if (rc == 0) {
3611 spin_lock(&GlobalMid_Lock); 3613 spin_lock(&GlobalMid_Lock);
3612 if (pSesInfo->server->tcpStatus != CifsExiting) 3614 if (server->tcpStatus != CifsExiting)
3613 pSesInfo->server->tcpStatus = CifsGood; 3615 server->tcpStatus = CifsGood;
3614 else 3616 else
3615 rc = -EHOSTDOWN; 3617 rc = -EHOSTDOWN;
3616 spin_unlock(&GlobalMid_Lock); 3618 spin_unlock(&GlobalMid_Lock);
@@ -3623,23 +3625,22 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
3623 goto ss_err_exit; 3625 goto ss_err_exit;
3624 3626
3625 pSesInfo->flags = 0; 3627 pSesInfo->flags = 0;
3626 pSesInfo->capabilities = pSesInfo->server->capabilities; 3628 pSesInfo->capabilities = server->capabilities;
3627 if (linuxExtEnabled == 0) 3629 if (linuxExtEnabled == 0)
3628 pSesInfo->capabilities &= (~CAP_UNIX); 3630 pSesInfo->capabilities &= (~CAP_UNIX);
3629 /* pSesInfo->sequence_number = 0;*/ 3631 /* pSesInfo->sequence_number = 0;*/
3630 cFYI(1, ("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d", 3632 cFYI(1, ("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d",
3631 pSesInfo->server->secMode, 3633 server->secMode, server->capabilities, server->timeAdj));
3632 pSesInfo->server->capabilities, 3634
3633 pSesInfo->server->timeAdj));
3634 if (experimEnabled < 2) 3635 if (experimEnabled < 2)
3635 rc = CIFS_SessSetup(xid, pSesInfo, first_time, nls_info); 3636 rc = CIFS_SessSetup(xid, pSesInfo, first_time, nls_info);
3636 else if (extended_security 3637 else if (extended_security
3637 && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) 3638 && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY)
3638 && (pSesInfo->server->secType == NTLMSSP)) { 3639 && (server->secType == NTLMSSP)) {
3639 rc = -EOPNOTSUPP; 3640 rc = -EOPNOTSUPP;
3640 } else if (extended_security 3641 } else if (extended_security
3641 && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) 3642 && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY)
3642 && (pSesInfo->server->secType == RawNTLMSSP)) { 3643 && (server->secType == RawNTLMSSP)) {
3643 cFYI(1, ("NTLMSSP sesssetup")); 3644 cFYI(1, ("NTLMSSP sesssetup"));
3644 rc = CIFSNTLMSSPNegotiateSessSetup(xid, pSesInfo, &ntlmv2_flag, 3645 rc = CIFSNTLMSSPNegotiateSessSetup(xid, pSesInfo, &ntlmv2_flag,
3645 nls_info); 3646 nls_info);
@@ -3668,12 +3669,12 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
3668 3669
3669 } else { 3670 } else {
3670 SMBNTencrypt(pSesInfo->password, 3671 SMBNTencrypt(pSesInfo->password,
3671 pSesInfo->server->cryptKey, 3672 server->cryptKey,
3672 ntlm_session_key); 3673 ntlm_session_key);
3673 3674
3674 if (first_time) 3675 if (first_time)
3675 cifs_calculate_mac_key( 3676 cifs_calculate_mac_key(
3676 &pSesInfo->server->mac_signing_key, 3677 &server->mac_signing_key,
3677 ntlm_session_key, 3678 ntlm_session_key,
3678 pSesInfo->password); 3679 pSesInfo->password);
3679 } 3680 }
@@ -3686,13 +3687,13 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
3686 nls_info); 3687 nls_info);
3687 } 3688 }
3688 } else { /* old style NTLM 0.12 session setup */ 3689 } else { /* old style NTLM 0.12 session setup */
3689 SMBNTencrypt(pSesInfo->password, pSesInfo->server->cryptKey, 3690 SMBNTencrypt(pSesInfo->password, server->cryptKey,
3690 ntlm_session_key); 3691 ntlm_session_key);
3691 3692
3692 if (first_time) 3693 if (first_time)
3693 cifs_calculate_mac_key( 3694 cifs_calculate_mac_key(&server->mac_signing_key,
3694 &pSesInfo->server->mac_signing_key, 3695 ntlm_session_key,
3695 ntlm_session_key, pSesInfo->password); 3696 pSesInfo->password);
3696 3697
3697 rc = CIFSSessSetup(xid, pSesInfo, ntlm_session_key, nls_info); 3698 rc = CIFSSessSetup(xid, pSesInfo, ntlm_session_key, nls_info);
3698 } 3699 }
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index f730ef35499e..1e0c1bd8f2e4 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -29,38 +29,13 @@
29#include "cifsproto.h" 29#include "cifsproto.h"
30#include "cifs_debug.h" 30#include "cifs_debug.h"
31 31
32static int dns_resolver_instantiate(struct key *key, const void *data,
33 size_t datalen)
34{
35 int rc = 0;
36 char *ip;
37
38 ip = kmalloc(datalen+1, GFP_KERNEL);
39 if (!ip)
40 return -ENOMEM;
41
42 memcpy(ip, data, datalen);
43 ip[datalen] = '\0';
44
45 rcu_assign_pointer(key->payload.data, ip);
46
47 return rc;
48}
49
50struct key_type key_type_dns_resolver = {
51 .name = "dns_resolver",
52 .def_datalen = sizeof(struct in_addr),
53 .describe = user_describe,
54 .instantiate = dns_resolver_instantiate,
55 .match = user_match,
56};
57
58/* Checks if supplied name is IP address 32/* Checks if supplied name is IP address
59 * returns: 33 * returns:
60 * 1 - name is IP 34 * 1 - name is IP
61 * 0 - name is not IP 35 * 0 - name is not IP
62 */ 36 */
63static int is_ip(const char *name) 37static int
38is_ip(const char *name)
64{ 39{
65 int rc; 40 int rc;
66 struct sockaddr_in sin_server; 41 struct sockaddr_in sin_server;
@@ -82,6 +57,47 @@ static int is_ip(const char *name)
82 return 0; 57 return 0;
83} 58}
84 59
60static int
61dns_resolver_instantiate(struct key *key, const void *data,
62 size_t datalen)
63{
64 int rc = 0;
65 char *ip;
66
67 ip = kmalloc(datalen + 1, GFP_KERNEL);
68 if (!ip)
69 return -ENOMEM;
70
71 memcpy(ip, data, datalen);
72 ip[datalen] = '\0';
73
74 /* make sure this looks like an address */
75 if (!is_ip((const char *) ip)) {
76 kfree(ip);
77 return -EINVAL;
78 }
79
80 key->type_data.x[0] = datalen;
81 rcu_assign_pointer(key->payload.data, ip);
82
83 return rc;
84}
85
86static void
87dns_resolver_destroy(struct key *key)
88{
89 kfree(key->payload.data);
90}
91
92struct key_type key_type_dns_resolver = {
93 .name = "dns_resolver",
94 .def_datalen = sizeof(struct in_addr),
95 .describe = user_describe,
96 .instantiate = dns_resolver_instantiate,
97 .destroy = dns_resolver_destroy,
98 .match = user_match,
99};
100
85/* Resolves server name to ip address. 101/* Resolves server name to ip address.
86 * input: 102 * input:
87 * unc - server UNC 103 * unc - server UNC
@@ -133,6 +149,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
133 149
134 rkey = request_key(&key_type_dns_resolver, name, ""); 150 rkey = request_key(&key_type_dns_resolver, name, "");
135 if (!IS_ERR(rkey)) { 151 if (!IS_ERR(rkey)) {
152 len = rkey->type_data.x[0];
136 data = rkey->payload.data; 153 data = rkey->payload.data;
137 } else { 154 } else {
138 cERROR(1, ("%s: unable to resolve: %s", __func__, name)); 155 cERROR(1, ("%s: unable to resolve: %s", __func__, name));
@@ -141,11 +158,9 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
141 158
142skip_upcall: 159skip_upcall:
143 if (data) { 160 if (data) {
144 len = strlen(data); 161 *ip_addr = kmalloc(len + 1, GFP_KERNEL);
145 *ip_addr = kmalloc(len+1, GFP_KERNEL);
146 if (*ip_addr) { 162 if (*ip_addr) {
147 memcpy(*ip_addr, data, len); 163 memcpy(*ip_addr, data, len + 1);
148 (*ip_addr)[len] = '\0';
149 if (!IS_ERR(rkey)) 164 if (!IS_ERR(rkey))
150 cFYI(1, ("%s: resolved: %s to %s", __func__, 165 cFYI(1, ("%s: resolved: %s to %s", __func__,
151 name, 166 name,
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index ff14d14903a0..c4a8a0605125 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -107,7 +107,7 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
107 107
108 /* want handles we can use to read with first 108 /* want handles we can use to read with first
109 in the list so we do not have to walk the 109 in the list so we do not have to walk the
110 list to search for one in prepare_write */ 110 list to search for one in write_begin */
111 if ((file->f_flags & O_ACCMODE) == O_WRONLY) { 111 if ((file->f_flags & O_ACCMODE) == O_WRONLY) {
112 list_add_tail(&pCifsFile->flist, 112 list_add_tail(&pCifsFile->flist,
113 &pCifsInode->openFileList); 113 &pCifsInode->openFileList);
@@ -833,6 +833,10 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
833 return -EBADF; 833 return -EBADF;
834 open_file = (struct cifsFileInfo *) file->private_data; 834 open_file = (struct cifsFileInfo *) file->private_data;
835 835
836 rc = generic_write_checks(file, poffset, &write_size, 0);
837 if (rc)
838 return rc;
839
836 xid = GetXid(); 840 xid = GetXid();
837 841
838 if (*poffset > file->f_path.dentry->d_inode->i_size) 842 if (*poffset > file->f_path.dentry->d_inode->i_size)
@@ -911,7 +915,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
911} 915}
912 916
913static ssize_t cifs_write(struct file *file, const char *write_data, 917static ssize_t cifs_write(struct file *file, const char *write_data,
914 size_t write_size, loff_t *poffset) 918 size_t write_size, loff_t *poffset)
915{ 919{
916 int rc = 0; 920 int rc = 0;
917 unsigned int bytes_written = 0; 921 unsigned int bytes_written = 0;
@@ -1061,6 +1065,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode)
1061struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode) 1065struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
1062{ 1066{
1063 struct cifsFileInfo *open_file; 1067 struct cifsFileInfo *open_file;
1068 bool any_available = false;
1064 int rc; 1069 int rc;
1065 1070
1066 /* Having a null inode here (because mapping->host was set to zero by 1071 /* Having a null inode here (because mapping->host was set to zero by
@@ -1076,8 +1081,10 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
1076 read_lock(&GlobalSMBSeslock); 1081 read_lock(&GlobalSMBSeslock);
1077refind_writable: 1082refind_writable:
1078 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { 1083 list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
1079 if (open_file->closePend) 1084 if (open_file->closePend ||
1085 (!any_available && open_file->pid != current->tgid))
1080 continue; 1086 continue;
1087
1081 if (open_file->pfile && 1088 if (open_file->pfile &&
1082 ((open_file->pfile->f_flags & O_RDWR) || 1089 ((open_file->pfile->f_flags & O_RDWR) ||
1083 (open_file->pfile->f_flags & O_WRONLY))) { 1090 (open_file->pfile->f_flags & O_WRONLY))) {
@@ -1127,6 +1134,11 @@ refind_writable:
1127 of the loop here. */ 1134 of the loop here. */
1128 } 1135 }
1129 } 1136 }
 1137	/* couldn't find usable FH with same pid, try any available */
1138 if (!any_available) {
1139 any_available = true;
1140 goto refind_writable;
1141 }
1130 read_unlock(&GlobalSMBSeslock); 1142 read_unlock(&GlobalSMBSeslock);
1131 return NULL; 1143 return NULL;
1132} 1144}
@@ -1443,49 +1455,52 @@ static int cifs_writepage(struct page *page, struct writeback_control *wbc)
1443 return rc; 1455 return rc;
1444} 1456}
1445 1457
1446static int cifs_commit_write(struct file *file, struct page *page, 1458static int cifs_write_end(struct file *file, struct address_space *mapping,
1447 unsigned offset, unsigned to) 1459 loff_t pos, unsigned len, unsigned copied,
1460 struct page *page, void *fsdata)
1448{ 1461{
1449 int xid; 1462 int rc;
1450 int rc = 0; 1463 struct inode *inode = mapping->host;
1451 struct inode *inode = page->mapping->host;
1452 loff_t position = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1453 char *page_data;
1454 1464
1455 xid = GetXid(); 1465 cFYI(1, ("write_end for page %p from pos %lld with %d bytes",
1456 cFYI(1, ("commit write for page %p up to position %lld for %d", 1466 page, pos, copied));
1457 page, position, to)); 1467
1458 spin_lock(&inode->i_lock); 1468 if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE)
1459 if (position > inode->i_size) 1469 SetPageUptodate(page);
1460 i_size_write(inode, position);
1461 1470
1462 spin_unlock(&inode->i_lock);
1463 if (!PageUptodate(page)) { 1471 if (!PageUptodate(page)) {
1464 position = ((loff_t)page->index << PAGE_CACHE_SHIFT) + offset; 1472 char *page_data;
1465 /* can not rely on (or let) writepage write this data */ 1473 unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
1466 if (to < offset) { 1474 int xid;
1467 cFYI(1, ("Illegal offsets, can not copy from %d to %d", 1475
1468 offset, to)); 1476 xid = GetXid();
1469 FreeXid(xid);
1470 return rc;
1471 }
1472 /* this is probably better than directly calling 1477 /* this is probably better than directly calling
1473 partialpage_write since in this function the file handle is 1478 partialpage_write since in this function the file handle is
1474 known which we might as well leverage */ 1479 known which we might as well leverage */
1475 /* BB check if anything else missing out of ppw 1480 /* BB check if anything else missing out of ppw
1476 such as updating last write time */ 1481 such as updating last write time */
1477 page_data = kmap(page); 1482 page_data = kmap(page);
1478 rc = cifs_write(file, page_data + offset, to-offset, 1483 rc = cifs_write(file, page_data + offset, copied, &pos);
1479 &position); 1484 /* if (rc < 0) should we set writebehind rc? */
1480 if (rc > 0)
1481 rc = 0;
1482 /* else if (rc < 0) should we set writebehind rc? */
1483 kunmap(page); 1485 kunmap(page);
1486
1487 FreeXid(xid);
1484 } else { 1488 } else {
1489 rc = copied;
1490 pos += copied;
1485 set_page_dirty(page); 1491 set_page_dirty(page);
1486 } 1492 }
1487 1493
1488 FreeXid(xid); 1494 if (rc > 0) {
1495 spin_lock(&inode->i_lock);
1496 if (pos > inode->i_size)
1497 i_size_write(inode, pos);
1498 spin_unlock(&inode->i_lock);
1499 }
1500
1501 unlock_page(page);
1502 page_cache_release(page);
1503
1489 return rc; 1504 return rc;
1490} 1505}
1491 1506
@@ -2031,49 +2046,44 @@ bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file)
 	return true;
 }
 
-static int cifs_prepare_write(struct file *file, struct page *page,
-	unsigned from, unsigned to)
+static int cifs_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata)
 {
-	int rc = 0;
-	loff_t i_size;
-	loff_t offset;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	loff_t offset = pos & (PAGE_CACHE_SIZE - 1);
+
+	cFYI(1, ("write_begin from %lld len %d", (long long)pos, len));
 
-	cFYI(1, ("prepare write for page %p from %d to %d", page, from, to));
-	if (PageUptodate(page))
+	*pagep = __grab_cache_page(mapping, index);
+	if (!*pagep)
+		return -ENOMEM;
+
+	if (PageUptodate(*pagep))
 		return 0;
 
 	/* If we are writing a full page it will be up to date,
 	   no need to read from the server */
-	if ((to == PAGE_CACHE_SIZE) && (from == 0)) {
-		SetPageUptodate(page);
+	if (len == PAGE_CACHE_SIZE && flags & AOP_FLAG_UNINTERRUPTIBLE)
 		return 0;
-	}
 
-	offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
-	i_size = i_size_read(page->mapping->host);
+	if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
+		int rc;
 
-	if ((offset >= i_size) ||
-	    ((from == 0) && (offset + to) >= i_size)) {
-		/*
-		 * We don't need to read data beyond the end of the file.
-		 * zero it, and set the page uptodate
-		 */
-		simple_prepare_write(file, page, from, to);
-		SetPageUptodate(page);
-	} else if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
 		/* might as well read a page, it is fast enough */
-		rc = cifs_readpage_worker(file, page, &offset);
+		rc = cifs_readpage_worker(file, *pagep, &offset);
+
+		/* we do not need to pass errors back
+		   e.g. if we do not have read access to the file
+		   because cifs_write_end will attempt synchronous writes
+		   -- shaggy */
 	} else {
 		/* we could try using another file handle if there is one -
 		   but how would we lock it to prevent close of that handle
 		   racing with this read? In any case
-		   this will be written out by commit_write so is fine */
+		   this will be written out by write_end so is fine */
 	}
 
-	/* we do not need to pass errors back
-	   e.g. if we do not have read access to the file
-	   because cifs_commit_write will do the right thing. -- shaggy */
-
 	return 0;
 }
 
@@ -2082,8 +2092,8 @@ const struct address_space_operations cifs_addr_ops = {
 	.readpages = cifs_readpages,
 	.writepage = cifs_writepage,
 	.writepages = cifs_writepages,
-	.prepare_write = cifs_prepare_write,
-	.commit_write = cifs_commit_write,
+	.write_begin = cifs_write_begin,
+	.write_end = cifs_write_end,
 	.set_page_dirty = __set_page_dirty_nobuffers,
 	/* .sync_page = cifs_sync_page, */
 	/* .direct_IO = */
@@ -2098,8 +2108,8 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
 	.readpage = cifs_readpage,
 	.writepage = cifs_writepage,
 	.writepages = cifs_writepages,
-	.prepare_write = cifs_prepare_write,
-	.commit_write = cifs_commit_write,
+	.write_begin = cifs_write_begin,
+	.write_end = cifs_write_end,
 	.set_page_dirty = __set_page_dirty_nobuffers,
 	/* .sync_page = cifs_sync_page, */
 	/* .direct_IO = */
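
For reference: in the write_begin/write_end scheme adopted above, the filesystem (not the VFS) grabs and locks the pagecache page, and must unlock and release it again in write_end, which also returns the number of bytes accepted. A minimal sketch of that contract against the same 2.6.27-era calls used in this patch ("myfs" is hypothetical; this is an outline, not a buildable module):

static int myfs_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len, unsigned flags,
			    struct page **pagep, void **fsdata)
{
	pgoff_t index = pos >> PAGE_CACHE_SHIFT;

	/* write_begin hands back a locked, referenced pagecache page */
	*pagep = __grab_cache_page(mapping, index);
	if (!*pagep)
		return -ENOMEM;
	return 0;
}

static int myfs_write_end(struct file *file, struct address_space *mapping,
			  loff_t pos, unsigned len, unsigned copied,
			  struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;

	/* sketch assumes full-page, uptodate writes; cifs_write_end above
	   shows the extra work a partial page needs */
	if (copied == PAGE_CACHE_SIZE)
		SetPageUptodate(page);
	set_page_dirty(page);

	/* i_size is only extended after the copy has succeeded */
	spin_lock(&inode->i_lock);
	if (pos + copied > inode->i_size)
		i_size_write(inode, pos + copied);
	spin_unlock(&inode->i_lock);

	/* write_end owns the unlock and the reference drop */
	unlock_page(page);
	page_cache_release(page);
	return copied;
}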
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 28a22092d450..a8c833345fc9 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -546,7 +546,8 @@ int cifs_get_inode_info(struct inode **pinode,
 			if ((inode->i_mode & S_IWUGO) == 0 &&
 			    (attr & ATTR_READONLY) == 0)
 				inode->i_mode |= (S_IWUGO & default_mode);
-			inode->i_mode &= ~S_IFMT;
+
+		inode->i_mode &= ~S_IFMT;
 		}
 		/* clear write bits if ATTR_READONLY is set */
 		if (attr & ATTR_READONLY)
@@ -649,6 +650,7 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
 		inode->i_fop = &simple_dir_operations;
 		inode->i_uid = cifs_sb->mnt_uid;
 		inode->i_gid = cifs_sb->mnt_gid;
+	} else if (rc) {
 		_FreeXid(xid);
 		iget_failed(inode);
 		return ERR_PTR(rc);
@@ -663,40 +665,201 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
 	return inode;
 }
 
-int cifs_unlink(struct inode *inode, struct dentry *direntry)
+static int
+cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
+		   char *full_path, __u32 dosattr)
+{
+	int rc;
+	int oplock = 0;
+	__u16 netfid;
+	__u32 netpid;
+	bool set_time = false;
+	struct cifsFileInfo *open_file;
+	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+	FILE_BASIC_INFO	info_buf;
+
+	if (attrs->ia_valid & ATTR_ATIME) {
+		set_time = true;
+		info_buf.LastAccessTime =
+			cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_atime));
+	} else
+		info_buf.LastAccessTime = 0;
+
+	if (attrs->ia_valid & ATTR_MTIME) {
+		set_time = true;
+		info_buf.LastWriteTime =
+			cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_mtime));
+	} else
+		info_buf.LastWriteTime = 0;
+
+	/*
+	 * Samba throws this field away, but windows may actually use it.
+	 * Do not set ctime unless other time stamps are changed explicitly
+	 * (i.e. by utimes()) since we would then have a mix of client and
+	 * server times.
+	 */
+	if (set_time && (attrs->ia_valid & ATTR_CTIME)) {
+		cFYI(1, ("CIFS - CTIME changed"));
+		info_buf.ChangeTime =
+			cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime));
+	} else
+		info_buf.ChangeTime = 0;
+
+	info_buf.CreationTime = 0;	/* don't change */
+	info_buf.Attributes = cpu_to_le32(dosattr);
+
+	/*
+	 * If the file is already open for write, just use that fileid
+	 */
+	open_file = find_writable_file(cifsInode);
+	if (open_file) {
+		netfid = open_file->netfid;
+		netpid = open_file->pid;
+		goto set_via_filehandle;
+	}
+
+	/*
+	 * NT4 apparently returns success on this call, but it doesn't
+	 * really work.
+	 */
+	if (!(pTcon->ses->flags & CIFS_SES_NT4)) {
+		rc = CIFSSMBSetPathInfo(xid, pTcon, full_path,
+				     &info_buf, cifs_sb->local_nls,
+				     cifs_sb->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
+		if (rc == 0) {
+			cifsInode->cifsAttrs = dosattr;
+			goto out;
+		} else if (rc != -EOPNOTSUPP && rc != -EINVAL)
+			goto out;
+	}
+
+	cFYI(1, ("calling SetFileInfo since SetPathInfo for "
+		 "times not supported by this server"));
+	rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
+			 SYNCHRONIZE | FILE_WRITE_ATTRIBUTES,
+			 CREATE_NOT_DIR, &netfid, &oplock,
+			 NULL, cifs_sb->local_nls,
+			 cifs_sb->mnt_cifs_flags &
+				CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+	if (rc != 0) {
+		if (rc == -EIO)
+			rc = -EINVAL;
+		goto out;
+	}
+
+	netpid = current->tgid;
+
+set_via_filehandle:
+	rc = CIFSSMBSetFileInfo(xid, pTcon, &info_buf, netfid, netpid);
+	if (!rc)
+		cifsInode->cifsAttrs = dosattr;
+
+	if (open_file == NULL)
+		CIFSSMBClose(xid, pTcon, netfid);
+	else
+		atomic_dec(&open_file->wrtPending);
+out:
+	return rc;
+}
+
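The function above encodes a fallback ladder that recurs throughout this patch: reuse an already-open writable filehandle if one exists, try the cheaper path-based SetPathInfo next, and only then pay for a temporary open. A self-contained userspace sketch of that ladder (every function here is an illustrative stub, not a CIFS API):

#include <errno.h>
#include <stdio.h>

static int have_open_handle(void) { return 0; }	/* stub: no open handle */
static int set_by_handle(int fid)
{
	printf("set via handle %d\n", fid);
	return 0;
}
static int set_by_path(const char *p)
{
	(void)p;
	return -EOPNOTSUPP;	/* stub: pretend an NT4-style server */
}
static int open_tmp_handle(const char *p) { (void)p; return 7; }
static void close_handle(int fid) { (void)fid; }

static int set_file_info(const char *path)
{
	int rc, fid;

	if (have_open_handle())
		return set_by_handle(3);	/* 3: the existing fid */

	rc = set_by_path(path);
	if (rc != -EOPNOTSUPP)
		return rc;

	/* path-based call unsupported: fall back to a short-lived open */
	fid = open_tmp_handle(path);
	if (fid < 0)
		return fid;
	rc = set_by_handle(fid);
	close_handle(fid);
	return rc;
}

int main(void) { return set_file_info("/share/file") ? 1 : 0; }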
+/*
+ * open the given file (if it isn't already), set the DELETE_ON_CLOSE bit
+ * and rename it to a random name that hopefully won't conflict with
+ * anything else.
+ */
+static int
+cifs_rename_pending_delete(char *full_path, struct inode *inode, int xid)
+{
+	int oplock = 0;
+	int rc;
+	__u16 netfid;
+	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	__u32 dosattr;
+	FILE_BASIC_INFO *info_buf;
+
+	rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN,
+			 DELETE|FILE_WRITE_ATTRIBUTES,
+			 CREATE_NOT_DIR|CREATE_DELETE_ON_CLOSE,
+			 &netfid, &oplock, NULL, cifs_sb->local_nls,
+			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	if (rc != 0)
+		goto out;
+
+	/* set ATTR_HIDDEN and clear ATTR_READONLY */
+	cifsInode = CIFS_I(inode);
+	dosattr = cifsInode->cifsAttrs & ~ATTR_READONLY;
+	if (dosattr == 0)
+		dosattr |= ATTR_NORMAL;
+	dosattr |= ATTR_HIDDEN;
+
+	info_buf = kzalloc(sizeof(*info_buf), GFP_KERNEL);
+	if (info_buf == NULL) {
+		rc = -ENOMEM;
+		goto out_close;
+	}
+	info_buf->Attributes = cpu_to_le32(dosattr);
+	rc = CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid, current->tgid);
+	kfree(info_buf);
+	if (rc != 0)
+		goto out_close;
+	cifsInode->cifsAttrs = dosattr;
+
+	/* silly-rename the file */
+	CIFSSMBRenameOpenFile(xid, tcon, netfid, NULL, cifs_sb->local_nls,
+			      cifs_sb->mnt_cifs_flags &
+				CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+	/* set DELETE_ON_CLOSE */
+	rc = CIFSSMBSetFileDisposition(xid, tcon, true, netfid, current->tgid);
+
+	/*
+	 * some samba versions return -ENOENT when we try to set the file
+	 * disposition here. Likely a samba bug, but work around it for now
+	 */
+	if (rc == -ENOENT)
+		rc = 0;
+
+out_close:
+	CIFSSMBClose(xid, tcon, netfid);
+out:
+	return rc;
+}
+
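The attribute fix-up above reappears in cifs_unlink below and is plain bit arithmetic: clear ATTR_READONLY, substitute ATTR_NORMAL if nothing remains (an all-zero Attributes field would mean "leave attributes unchanged"), then add ATTR_HIDDEN. A self-contained demo using the standard DOS attribute values:

#include <assert.h>

#define ATTR_READONLY	0x0001
#define ATTR_HIDDEN	0x0002
#define ATTR_ARCHIVE	0x0020
#define ATTR_NORMAL	0x0080

static unsigned int silly_rename_attrs(unsigned int cifsAttrs)
{
	unsigned int dosattr = cifsAttrs & ~ATTR_READONLY;

	if (dosattr == 0)
		dosattr |= ATTR_NORMAL;	/* 0 would mean "no change" */
	dosattr |= ATTR_HIDDEN;		/* hide the renamed-away file */
	return dosattr;
}

int main(void)
{
	/* a read-only file ends up hidden and writable, not read-only */
	assert(silly_rename_attrs(ATTR_READONLY) ==
	       (ATTR_NORMAL | ATTR_HIDDEN));
	/* attributes other than read-only are preserved */
	assert(silly_rename_attrs(ATTR_ARCHIVE) ==
	       (ATTR_ARCHIVE | ATTR_HIDDEN));
	return 0;
}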
+int cifs_unlink(struct inode *dir, struct dentry *dentry)
 {
 	int rc = 0;
 	int xid;
-	struct cifs_sb_info *cifs_sb;
-	struct cifsTconInfo *pTcon;
 	char *full_path = NULL;
-	struct cifsInodeInfo *cifsInode;
-	FILE_BASIC_INFO *pinfo_buf;
+	struct inode *inode = dentry->d_inode;
+	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
+	struct super_block *sb = dir->i_sb;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct iattr *attrs = NULL;
+	__u32 dosattr = 0, origattr = 0;
 
-	cFYI(1, ("cifs_unlink, inode = 0x%p", inode));
+	cFYI(1, ("cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry));
 
 	xid = GetXid();
 
-	if (inode)
-		cifs_sb = CIFS_SB(inode->i_sb);
-	else
-		cifs_sb = CIFS_SB(direntry->d_sb);
-	pTcon = cifs_sb->tcon;
-
-	/* Unlink can be called from rename so we can not grab the sem here
-	   since we deadlock otherwise */
-/*	mutex_lock(&direntry->d_sb->s_vfs_rename_mutex);*/
-	full_path = build_path_from_dentry(direntry);
-/*	mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex);*/
+	/* Unlink can be called from rename so we can not take the
+	 * sb->s_vfs_rename_mutex here */
+	full_path = build_path_from_dentry(dentry);
 	if (full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
 	}
 
-	if ((pTcon->ses->capabilities & CAP_UNIX) &&
+	if ((tcon->ses->capabilities & CAP_UNIX) &&
 	    (CIFS_UNIX_POSIX_PATH_OPS_CAP &
-			le64_to_cpu(pTcon->fsUnixInfo.Capability))) {
-		rc = CIFSPOSIXDelFile(xid, pTcon, full_path,
+			le64_to_cpu(tcon->fsUnixInfo.Capability))) {
+		rc = CIFSPOSIXDelFile(xid, tcon, full_path,
 			SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 		cFYI(1, ("posix del rc %d", rc));
@@ -704,125 +867,60 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry)
 			goto psx_del_no_retry;
 	}
 
-	rc = CIFSSMBDelFile(xid, pTcon, full_path, cifs_sb->local_nls,
+retry_std_delete:
+	rc = CIFSSMBDelFile(xid, tcon, full_path, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+
 psx_del_no_retry:
 	if (!rc) {
-		if (direntry->d_inode)
-			drop_nlink(direntry->d_inode);
+		if (inode)
+			drop_nlink(inode);
 	} else if (rc == -ENOENT) {
-		d_drop(direntry);
+		d_drop(dentry);
 	} else if (rc == -ETXTBSY) {
-		int oplock = 0;
-		__u16 netfid;
-
-		rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, DELETE,
-				 CREATE_NOT_DIR | CREATE_DELETE_ON_CLOSE,
-				 &netfid, &oplock, NULL, cifs_sb->local_nls,
-				 cifs_sb->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
-		if (rc == 0) {
-			CIFSSMBRenameOpenFile(xid, pTcon, netfid, NULL,
-					      cifs_sb->local_nls,
-					      cifs_sb->mnt_cifs_flags &
-						CIFS_MOUNT_MAP_SPECIAL_CHR);
-			CIFSSMBClose(xid, pTcon, netfid);
-			if (direntry->d_inode)
-				drop_nlink(direntry->d_inode);
+		rc = cifs_rename_pending_delete(full_path, inode, xid);
+		if (rc == 0)
+			drop_nlink(inode);
+	} else if (rc == -EACCES && dosattr == 0) {
+		attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
+		if (attrs == NULL) {
+			rc = -ENOMEM;
+			goto out_reval;
 		}
-	} else if (rc == -EACCES) {
-		/* try only if r/o attribute set in local lookup data? */
-		pinfo_buf = kzalloc(sizeof(FILE_BASIC_INFO), GFP_KERNEL);
-		if (pinfo_buf) {
-			/* ATTRS set to normal clears r/o bit */
-			pinfo_buf->Attributes = cpu_to_le32(ATTR_NORMAL);
-			if (!(pTcon->ses->flags & CIFS_SES_NT4))
-				rc = CIFSSMBSetPathInfo(xid, pTcon, full_path,
-						pinfo_buf,
-						cifs_sb->local_nls,
-						cifs_sb->mnt_cifs_flags &
-						    CIFS_MOUNT_MAP_SPECIAL_CHR);
-			else
-				rc = -EOPNOTSUPP;
 
-			if (rc == -EOPNOTSUPP) {
-				int oplock = 0;
-				__u16 netfid;
-			/*	rc = CIFSSMBSetAttrLegacy(xid, pTcon,
-							  full_path,
-							  (__u16)ATTR_NORMAL,
-							  cifs_sb->local_nls);
-			   For some strange reason it seems that NT4 eats the
-			   old setattr call without actually setting the
-			   attributes so on to the third attempted workaround
-			   */
-
-			/* BB could scan to see if we already have it open
-			   and pass in pid of opener to function */
-				rc = CIFSSMBOpen(xid, pTcon, full_path,
-						 FILE_OPEN, SYNCHRONIZE |
-						 FILE_WRITE_ATTRIBUTES, 0,
-						 &netfid, &oplock, NULL,
-						 cifs_sb->local_nls,
-						 cifs_sb->mnt_cifs_flags &
-						    CIFS_MOUNT_MAP_SPECIAL_CHR);
-				if (rc == 0) {
-					rc = CIFSSMBSetFileInfo(xid, pTcon,
-								pinfo_buf,
-								netfid,
-								current->tgid);
-					CIFSSMBClose(xid, pTcon, netfid);
-				}
-			}
-			kfree(pinfo_buf);
-		}
-		if (rc == 0) {
-			rc = CIFSSMBDelFile(xid, pTcon, full_path,
-					    cifs_sb->local_nls,
-					    cifs_sb->mnt_cifs_flags &
-						CIFS_MOUNT_MAP_SPECIAL_CHR);
-			if (!rc) {
-				if (direntry->d_inode)
-					drop_nlink(direntry->d_inode);
-			} else if (rc == -ETXTBSY) {
-				int oplock = 0;
-				__u16 netfid;
-
-				rc = CIFSSMBOpen(xid, pTcon, full_path,
-						 FILE_OPEN, DELETE,
-						 CREATE_NOT_DIR |
-						 CREATE_DELETE_ON_CLOSE,
-						 &netfid, &oplock, NULL,
-						 cifs_sb->local_nls,
-						 cifs_sb->mnt_cifs_flags &
-						    CIFS_MOUNT_MAP_SPECIAL_CHR);
-				if (rc == 0) {
-					CIFSSMBRenameOpenFile(xid, pTcon,
-						netfid, NULL,
-						cifs_sb->local_nls,
-						cifs_sb->mnt_cifs_flags &
-						CIFS_MOUNT_MAP_SPECIAL_CHR);
-					CIFSSMBClose(xid, pTcon, netfid);
-					if (direntry->d_inode)
-						drop_nlink(direntry->d_inode);
-				}
-			/* BB if rc = -ETXTBUSY goto the rename logic BB */
-			}
-		}
-	}
-	if (direntry->d_inode) {
-		cifsInode = CIFS_I(direntry->d_inode);
-		cifsInode->time = 0;	/* will force revalidate to get info
-					   when needed */
-		direntry->d_inode->i_ctime = current_fs_time(inode->i_sb);
+		/* try to reset dos attributes */
+		origattr = cifsInode->cifsAttrs;
+		if (origattr == 0)
+			origattr |= ATTR_NORMAL;
+		dosattr = origattr & ~ATTR_READONLY;
+		if (dosattr == 0)
+			dosattr |= ATTR_NORMAL;
+		dosattr |= ATTR_HIDDEN;
+
+		rc = cifs_set_file_info(inode, attrs, xid, full_path, dosattr);
+		if (rc != 0)
+			goto out_reval;
+
+		goto retry_std_delete;
 	}
+
+	/* undo the setattr if we errored out and it's needed */
+	if (rc != 0 && dosattr != 0)
+		cifs_set_file_info(inode, attrs, xid, full_path, origattr);
+
+out_reval:
 	if (inode) {
-		inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
 		cifsInode = CIFS_I(inode);
-		cifsInode->time = 0;	/* force revalidate of dir as well */
+		cifsInode->time = 0;	/* will force revalidate to get info
+					   when needed */
+		inode->i_ctime = current_fs_time(sb);
 	}
+	dir->i_ctime = dir->i_mtime = current_fs_time(sb);
+	cifsInode = CIFS_I(dir);
+	CIFS_I(dir)->time = 0;	/* force revalidate of dir as well */
 
 	kfree(full_path);
+	kfree(attrs);
 	FreeXid(xid);
 	return rc;
 }
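
cifs_unlink now retries at most once: on the first -EACCES it strips ATTR_READONLY (via cifs_set_file_info) and jumps back to retry_std_delete, and if the retry still fails it restores the original attributes. The same control flow as a self-contained sketch with stubbed-out operations (delete_file/set_attrs are illustrative stands-ins, and the hex values stand in for ATTR_NORMAL | ATTR_HIDDEN):

#include <errno.h>

/* stub: fail with EACCES on the first attempt only */
static int delete_file(const char *p, int attempt)
{
	(void)p;
	return attempt ? 0 : -EACCES;
}
static int set_attrs(const char *p, unsigned int a)
{
	(void)p; (void)a;
	return 0;
}

static int unlink_with_retry(const char *path, unsigned int origattr)
{
	unsigned int dosattr = 0;
	int attempt = 0;
	int rc;

retry:
	rc = delete_file(path, attempt);
	if (rc == -EACCES && dosattr == 0) {
		/* first failure only: clear read-only and retry once */
		dosattr = 0x80 | 0x02;	/* ATTR_NORMAL | ATTR_HIDDEN */
		rc = set_attrs(path, dosattr);
		if (rc)
			return rc;
		attempt++;
		goto retry;
	}
	if (rc && dosattr)	/* retry failed: put attributes back */
		set_attrs(path, origattr);
	return rc;
}

int main(void) { return unlink_with_retry("file", 0x01); }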
@@ -867,7 +965,7 @@ static void posix_fill_in_inode(struct inode *tmp_inode,
 
 int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 {
-	int rc = 0;
+	int rc = 0, tmprc;
 	int xid;
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *pTcon;
@@ -929,6 +1027,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 			kfree(pInfo);
 			goto mkdir_get_info;
 		}
+
 		/* Is an i_ino of zero legal? */
 		/* Are there sanity checks we can use to ensure that
 		   the server is really filling in that field? */
@@ -1017,12 +1116,20 @@ mkdir_get_info:
 		if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) &&
 		    (mode & S_IWUGO) == 0) {
 			FILE_BASIC_INFO pInfo;
+			struct cifsInodeInfo *cifsInode;
+			u32 dosattrs;
+
 			memset(&pInfo, 0, sizeof(pInfo));
-			pInfo.Attributes = cpu_to_le32(ATTR_READONLY);
-			CIFSSMBSetPathInfo(xid, pTcon, full_path,
-					   &pInfo, cifs_sb->local_nls,
+			cifsInode = CIFS_I(newinode);
+			dosattrs = cifsInode->cifsAttrs|ATTR_READONLY;
+			pInfo.Attributes = cpu_to_le32(dosattrs);
+			tmprc = CIFSSMBSetPathInfo(xid, pTcon,
+					full_path, &pInfo,
+					cifs_sb->local_nls,
 					cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
+			if (tmprc == 0)
+				cifsInode->cifsAttrs = dosattrs;
 		}
 		if (direntry->d_inode) {
 			if (cifs_sb->mnt_cifs_flags &
@@ -1094,117 +1201,141 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 	return rc;
 }
 
+static int
+cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
+	       struct dentry *to_dentry, const char *toPath)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb);
+	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+	__u16 srcfid;
+	int oplock, rc;
+
+	/* try path-based rename first */
+	rc = CIFSSMBRename(xid, pTcon, fromPath, toPath, cifs_sb->local_nls,
+			   cifs_sb->mnt_cifs_flags &
+				CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+	/*
+	 * don't bother with rename by filehandle unless file is busy and
+	 * source Note that cross directory moves do not work with
+	 * rename by filehandle to various Windows servers.
+	 */
+	if (rc == 0 || rc != -ETXTBSY)
+		return rc;
+
+	/* open the file to be renamed -- we need DELETE perms */
+	rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE,
+			 CREATE_NOT_DIR, &srcfid, &oplock, NULL,
+			 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+				CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+	if (rc == 0) {
+		rc = CIFSSMBRenameOpenFile(xid, pTcon, srcfid,
+				(const char *) to_dentry->d_name.name,
+				cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+		CIFSSMBClose(xid, pTcon, srcfid);
+	}
+
+	return rc;
+}
+
 int cifs_rename(struct inode *source_inode, struct dentry *source_direntry,
 	struct inode *target_inode, struct dentry *target_direntry)
 {
-	char *fromName;
-	char *toName;
+	char *fromName = NULL;
+	char *toName = NULL;
 	struct cifs_sb_info *cifs_sb_source;
 	struct cifs_sb_info *cifs_sb_target;
 	struct cifsTconInfo *pTcon;
+	FILE_UNIX_BASIC_INFO *info_buf_source = NULL;
+	FILE_UNIX_BASIC_INFO *info_buf_target;
 	int xid;
-	int rc = 0;
-
-	xid = GetXid();
+	int rc;
 
 	cifs_sb_target = CIFS_SB(target_inode->i_sb);
 	cifs_sb_source = CIFS_SB(source_inode->i_sb);
 	pTcon = cifs_sb_source->tcon;
 
+	xid = GetXid();
+
+	/*
+	 * BB: this might be allowed if same server, but different share.
+	 * Consider adding support for this
+	 */
 	if (pTcon != cifs_sb_target->tcon) {
-		FreeXid(xid);
-		return -EXDEV;	/* BB actually could be allowed if same server,
-				   but different share.
-				   Might eventually add support for this */
+		rc = -EXDEV;
+		goto cifs_rename_exit;
 	}
 
-	/* we already have the rename sem so we do not need to grab it again
-	   here to protect the path integrity */
+	/*
+	 * we already have the rename sem so we do not need to
+	 * grab it again here to protect the path integrity
+	 */
 	fromName = build_path_from_dentry(source_direntry);
+	if (fromName == NULL) {
+		rc = -ENOMEM;
+		goto cifs_rename_exit;
+	}
+
 	toName = build_path_from_dentry(target_direntry);
-	if ((fromName == NULL) || (toName == NULL)) {
+	if (toName == NULL) {
 		rc = -ENOMEM;
 		goto cifs_rename_exit;
 	}
 
-	rc = CIFSSMBRename(xid, pTcon, fromName, toName,
-			   cifs_sb_source->local_nls,
-			   cifs_sb_source->mnt_cifs_flags &
-				CIFS_MOUNT_MAP_SPECIAL_CHR);
+	rc = cifs_do_rename(xid, source_direntry, fromName,
+			    target_direntry, toName);
+
 	if (rc == -EEXIST) {
-		/* check if they are the same file because rename of hardlinked
-		   files is a noop */
-		FILE_UNIX_BASIC_INFO *info_buf_source;
-		FILE_UNIX_BASIC_INFO *info_buf_target;
-
-		info_buf_source =
-			kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
-		if (info_buf_source != NULL) {
+		if (pTcon->unix_ext) {
+			/*
+			 * Are src and dst hardlinks of same inode? We can
+			 * only tell with unix extensions enabled
+			 */
+			info_buf_source =
+				kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO),
+					GFP_KERNEL);
+			if (info_buf_source == NULL)
+				goto unlink_target;
+
 			info_buf_target = info_buf_source + 1;
-			if (pTcon->unix_ext)
-				rc = CIFSSMBUnixQPathInfo(xid, pTcon, fromName,
-					info_buf_source,
-					cifs_sb_source->local_nls,
-					cifs_sb_source->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
-			/* else rc is still EEXIST so will fall through to
-			   unlink the target and retry rename */
-			if (rc == 0) {
-				rc = CIFSSMBUnixQPathInfo(xid, pTcon, toName,
-						info_buf_target,
-						cifs_sb_target->local_nls,
-						/* remap based on source sb */
-						cifs_sb_source->mnt_cifs_flags &
-						CIFS_MOUNT_MAP_SPECIAL_CHR);
-			}
-			if ((rc == 0) &&
-			    (info_buf_source->UniqueId ==
-			     info_buf_target->UniqueId)) {
-				/* do not rename since the files are hardlinked
-				   which is a noop */
-			} else {
-				/* we either can not tell the files are
-				   hardlinked (as with Windows servers) or
-				   files are not hardlinked so delete the
-				   target manually before renaming to follow
-				   POSIX rather than Windows semantics */
-				cifs_unlink(target_inode, target_direntry);
-				rc = CIFSSMBRename(xid, pTcon, fromName,
-						   toName,
-						   cifs_sb_source->local_nls,
-						   cifs_sb_source->mnt_cifs_flags
-						   & CIFS_MOUNT_MAP_SPECIAL_CHR);
-			}
-			kfree(info_buf_source);
-		} /* if we can not get memory just leave rc as EEXIST */
-	}
-
-	if (rc)
-		cFYI(1, ("rename rc %d", rc));
-
-	if ((rc == -EIO) || (rc == -EEXIST)) {
-		int oplock = 0;
-		__u16 netfid;
-
-		/* BB FIXME Is Generic Read correct for rename? */
-		/* if renaming directory - we should not say CREATE_NOT_DIR,
-		   need to test renaming open directory, also GENERIC_READ
-		   might not right be right access to request */
-		rc = CIFSSMBOpen(xid, pTcon, fromName, FILE_OPEN, GENERIC_READ,
-				 CREATE_NOT_DIR, &netfid, &oplock, NULL,
-				 cifs_sb_source->local_nls,
-				 cifs_sb_source->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
-		if (rc == 0) {
-			rc = CIFSSMBRenameOpenFile(xid, pTcon, netfid, toName,
-					cifs_sb_source->local_nls,
-					cifs_sb_source->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
-			CIFSSMBClose(xid, pTcon, netfid);
-		}
+			rc = CIFSSMBUnixQPathInfo(xid, pTcon, fromName,
+					info_buf_source,
+					cifs_sb_source->local_nls,
+					cifs_sb_source->mnt_cifs_flags &
+						CIFS_MOUNT_MAP_SPECIAL_CHR);
+			if (rc != 0)
+				goto unlink_target;
+
+			rc = CIFSSMBUnixQPathInfo(xid, pTcon,
+					toName, info_buf_target,
+					cifs_sb_target->local_nls,
+					/* remap based on source sb */
+					cifs_sb_source->mnt_cifs_flags &
+						CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+			if (rc == 0 && (info_buf_source->UniqueId ==
+					info_buf_target->UniqueId))
+				/* same file, POSIX says that this is a noop */
+				goto cifs_rename_exit;
+		} /* else ... BB we could add the same check for Windows by
+		     checking the UniqueId via FILE_INTERNAL_INFO */
+unlink_target:
+		/*
+		 * we either can not tell the files are hardlinked (as with
+		 * Windows servers) or files are not hardlinked. Delete the
+		 * target manually before renaming to follow POSIX rather than
+		 * Windows semantics
+		 */
+		cifs_unlink(target_inode, target_direntry);
+		rc = cifs_do_rename(xid, source_direntry, fromName,
+				    target_direntry, toName);
 	}
 
 cifs_rename_exit:
+	kfree(info_buf_source);
 	kfree(fromName);
 	kfree(toName);
 	FreeXid(xid);
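
The UniqueId comparison above exists because POSIX requires rename() of one hardlink over another to be a no-op, and only the unix extensions expose an inode identity the client can check. The userspace equivalent of that test compares (st_dev, st_ino):

#include <stdio.h>
#include <sys/stat.h>

static int same_file(const char *a, const char *b)
{
	struct stat sa, sb;

	if (stat(a, &sa) != 0 || stat(b, &sb) != 0)
		return 0;	/* can't tell; treat as different */
	return sa.st_dev == sb.st_dev && sa.st_ino == sb.st_ino;
}

int main(int argc, char **argv)
{
	if (argc == 3 && same_file(argv[1], argv[2]))
		printf("%s and %s are hardlinks of the same inode; "
		       "rename would be a no-op\n", argv[1], argv[2]);
	return 0;
}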
@@ -1505,101 +1636,6 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 }
 
 static int
-cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
-		   char *full_path, __u32 dosattr)
-{
-	int rc;
-	int oplock = 0;
-	__u16 netfid;
-	__u32 netpid;
-	bool set_time = false;
-	struct cifsFileInfo *open_file;
-	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
-	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct cifsTconInfo *pTcon = cifs_sb->tcon;
-	FILE_BASIC_INFO	info_buf;
-
-	if (attrs->ia_valid & ATTR_ATIME) {
-		set_time = true;
-		info_buf.LastAccessTime =
-			cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_atime));
-	} else
-		info_buf.LastAccessTime = 0;
-
-	if (attrs->ia_valid & ATTR_MTIME) {
-		set_time = true;
-		info_buf.LastWriteTime =
-			cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_mtime));
-	} else
-		info_buf.LastWriteTime = 0;
-
-	/*
-	 * Samba throws this field away, but windows may actually use it.
-	 * Do not set ctime unless other time stamps are changed explicitly
-	 * (i.e. by utimes()) since we would then have a mix of client and
-	 * server times.
-	 */
-	if (set_time && (attrs->ia_valid & ATTR_CTIME)) {
-		cFYI(1, ("CIFS - CTIME changed"));
-		info_buf.ChangeTime =
-			cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime));
-	} else
-		info_buf.ChangeTime = 0;
-
-	info_buf.CreationTime = 0;	/* don't change */
-	info_buf.Attributes = cpu_to_le32(dosattr);
-
-	/*
-	 * If the file is already open for write, just use that fileid
-	 */
-	open_file = find_writable_file(cifsInode);
-	if (open_file) {
-		netfid = open_file->netfid;
-		netpid = open_file->pid;
-		goto set_via_filehandle;
-	}
-
-	/*
-	 * NT4 apparently returns success on this call, but it doesn't
-	 * really work.
-	 */
-	if (!(pTcon->ses->flags & CIFS_SES_NT4)) {
-		rc = CIFSSMBSetPathInfo(xid, pTcon, full_path,
-				     &info_buf, cifs_sb->local_nls,
-				     cifs_sb->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
-		if (rc != -EOPNOTSUPP && rc != -EINVAL)
-			goto out;
-	}
-
-	cFYI(1, ("calling SetFileInfo since SetPathInfo for "
-		 "times not supported by this server"));
-	rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
-			 SYNCHRONIZE | FILE_WRITE_ATTRIBUTES,
-			 CREATE_NOT_DIR, &netfid, &oplock,
-			 NULL, cifs_sb->local_nls,
-			 cifs_sb->mnt_cifs_flags &
-				CIFS_MOUNT_MAP_SPECIAL_CHR);
-
-	if (rc != 0) {
-		if (rc == -EIO)
-			rc = -EINVAL;
-		goto out;
-	}
-
-	netpid = current->tgid;
-
-set_via_filehandle:
-	rc = CIFSSMBSetFileInfo(xid, pTcon, &info_buf, netfid, netpid);
-	if (open_file == NULL)
-		CIFSSMBClose(xid, pTcon, netfid);
-	else
-		atomic_dec(&open_file->wrtPending);
-out:
-	return rc;
-}
-
-static int
 cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 {
 	int rc;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 4b17f8fe3157..88786ba02d27 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -150,8 +150,7 @@ cifs_buf_get(void)
 	   but it may be more efficient to always alloc same size
 	   albeit slightly larger than necessary and maxbuffersize
 	   defaults to this and can not be bigger */
-	ret_buf = (struct smb_hdr *) mempool_alloc(cifs_req_poolp,
-						   GFP_KERNEL | GFP_NOFS);
+	ret_buf = mempool_alloc(cifs_req_poolp, GFP_NOFS);
 
 	/* clear the first few header bytes */
 	/* for most paths, more is cleared in header_assemble */
@@ -188,8 +187,7 @@ cifs_small_buf_get(void)
 	   but it may be more efficient to always alloc same size
 	   albeit slightly larger than necessary and maxbuffersize
 	   defaults to this and can not be bigger */
-	ret_buf = (struct smb_hdr *) mempool_alloc(cifs_sm_req_poolp,
-						   GFP_KERNEL | GFP_NOFS);
+	ret_buf = mempool_alloc(cifs_sm_req_poolp, GFP_NOFS);
 	if (ret_buf) {
 	/* No need to clear memory here, cleared in header assemble */
 	/*	memset(ret_buf, 0, sizeof(struct smb_hdr) + 27);*/
@@ -313,8 +311,6 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
 	buffer->Flags2 = SMBFLG2_KNOWS_LONG_NAMES;
 	buffer->Pid = cpu_to_le16((__u16)current->tgid);
 	buffer->PidHigh = cpu_to_le16((__u16)(current->tgid >> 16));
-	spin_lock(&GlobalMid_Lock);
-	spin_unlock(&GlobalMid_Lock);
 	if (treeCon) {
 		buffer->Tid = treeCon->tid;
 		if (treeCon->ses) {
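
The mempool_alloc fixes here (and the identical one in transport.c below) are more than cosmetic: GFP_NOFS is GFP_KERNEL with __GFP_FS removed, so OR-ing the two puts __GFP_FS back and silently re-enables filesystem reclaim from inside the filesystem. A self-contained demo of the flag arithmetic (bit values mirror the 2.6.27-era gfp.h and are shown for illustration):

#include <assert.h>

#define __GFP_WAIT	0x10u
#define __GFP_IO	0x40u
#define __GFP_FS	0x80u

#define GFP_KERNEL	(__GFP_WAIT | __GFP_IO | __GFP_FS)
#define GFP_NOFS	(__GFP_WAIT | __GFP_IO)

int main(void)
{
	/* the OR re-adds __GFP_FS, defeating the NOFS restriction */
	assert((GFP_KERNEL | GFP_NOFS) == GFP_KERNEL);
	assert(((GFP_KERNEL | GFP_NOFS) & __GFP_FS) != 0);
	/* GFP_NOFS alone is what the callers actually wanted */
	assert((GFP_NOFS & __GFP_FS) == 0);
	return 0;
}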
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 5f40ed3473f5..765adf12d54f 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -640,6 +640,70 @@ static int is_dir_changed(struct file *file)
 
 }
 
+static int cifs_save_resume_key(const char *current_entry,
+	struct cifsFileInfo *cifsFile)
+{
+	int rc = 0;
+	unsigned int len = 0;
+	__u16 level;
+	char *filename;
+
+	if ((cifsFile == NULL) || (current_entry == NULL))
+		return -EINVAL;
+
+	level = cifsFile->srch_inf.info_level;
+
+	if (level == SMB_FIND_FILE_UNIX) {
+		FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry;
+
+		filename = &pFindData->FileName[0];
+		if (cifsFile->srch_inf.unicode) {
+			len = cifs_unicode_bytelen(filename);
+		} else {
+			/* BB should we make this strnlen of PATH_MAX? */
+			len = strnlen(filename, PATH_MAX);
+		}
+		cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
+	} else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
+		FILE_DIRECTORY_INFO *pFindData =
+			(FILE_DIRECTORY_INFO *)current_entry;
+		filename = &pFindData->FileName[0];
+		len = le32_to_cpu(pFindData->FileNameLength);
+		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
+	} else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) {
+		FILE_FULL_DIRECTORY_INFO *pFindData =
+			(FILE_FULL_DIRECTORY_INFO *)current_entry;
+		filename = &pFindData->FileName[0];
+		len = le32_to_cpu(pFindData->FileNameLength);
+		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
+	} else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) {
+		SEARCH_ID_FULL_DIR_INFO *pFindData =
+			(SEARCH_ID_FULL_DIR_INFO *)current_entry;
+		filename = &pFindData->FileName[0];
+		len = le32_to_cpu(pFindData->FileNameLength);
+		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
+	} else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
+		FILE_BOTH_DIRECTORY_INFO *pFindData =
+			(FILE_BOTH_DIRECTORY_INFO *)current_entry;
+		filename = &pFindData->FileName[0];
+		len = le32_to_cpu(pFindData->FileNameLength);
+		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
+	} else if (level == SMB_FIND_FILE_INFO_STANDARD) {
+		FIND_FILE_STANDARD_INFO *pFindData =
+			(FIND_FILE_STANDARD_INFO *)current_entry;
+		filename = &pFindData->FileName[0];
+		/* one byte length, no name conversion */
+		len = (unsigned int)pFindData->FileNameLength;
+		cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
+	} else {
+		cFYI(1, ("Unknown findfirst level %d", level));
+		return -EINVAL;
+	}
+	cifsFile->srch_inf.resume_name_len = len;
+	cifsFile->srch_inf.presume_name = filename;
+	return rc;
+}
+
 /* find the corresponding entry in the search */
 /* Note that the SMB server returns search entries for . and .. which
    complicates logic here if we choose to parse for them and we do not
@@ -703,6 +767,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 	while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) &&
 	       (rc == 0) && !cifsFile->srch_inf.endOfSearch) {
 		cFYI(1, ("calling findnext2"));
+		cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
 		rc = CIFSFindNext(xid, pTcon, cifsFile->netfid,
 				  &cifsFile->srch_inf);
 		if (rc)
@@ -919,69 +984,6 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
 	return rc;
 }
 
-static int cifs_save_resume_key(const char *current_entry,
-	struct cifsFileInfo *cifsFile)
-{
-	int rc = 0;
-	unsigned int len = 0;
-	__u16 level;
-	char *filename;
-
-	if ((cifsFile == NULL) || (current_entry == NULL))
-		return -EINVAL;
-
-	level = cifsFile->srch_inf.info_level;
-
-	if (level == SMB_FIND_FILE_UNIX) {
-		FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry;
-
-		filename = &pFindData->FileName[0];
-		if (cifsFile->srch_inf.unicode) {
-			len = cifs_unicode_bytelen(filename);
-		} else {
-			/* BB should we make this strnlen of PATH_MAX? */
-			len = strnlen(filename, PATH_MAX);
-		}
-		cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
-	} else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
-		FILE_DIRECTORY_INFO *pFindData =
-			(FILE_DIRECTORY_INFO *)current_entry;
-		filename = &pFindData->FileName[0];
-		len = le32_to_cpu(pFindData->FileNameLength);
-		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
-	} else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) {
-		FILE_FULL_DIRECTORY_INFO *pFindData =
-			(FILE_FULL_DIRECTORY_INFO *)current_entry;
-		filename = &pFindData->FileName[0];
-		len = le32_to_cpu(pFindData->FileNameLength);
-		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
-	} else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) {
-		SEARCH_ID_FULL_DIR_INFO *pFindData =
-			(SEARCH_ID_FULL_DIR_INFO *)current_entry;
-		filename = &pFindData->FileName[0];
-		len = le32_to_cpu(pFindData->FileNameLength);
-		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
-	} else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
-		FILE_BOTH_DIRECTORY_INFO *pFindData =
-			(FILE_BOTH_DIRECTORY_INFO *)current_entry;
-		filename = &pFindData->FileName[0];
-		len = le32_to_cpu(pFindData->FileNameLength);
-		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
-	} else if (level == SMB_FIND_FILE_INFO_STANDARD) {
-		FIND_FILE_STANDARD_INFO *pFindData =
-			(FIND_FILE_STANDARD_INFO *)current_entry;
-		filename = &pFindData->FileName[0];
-		/* one byte length, no name conversion */
-		len = (unsigned int)pFindData->FileNameLength;
-		cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
-	} else {
-		cFYI(1, ("Unknown findfirst level %d", level));
-		return -EINVAL;
-	}
-	cifsFile->srch_inf.resume_name_len = len;
-	cifsFile->srch_inf.presume_name = filename;
-	return rc;
-}
 
 int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 {
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index ed150efbe27c..2851d5da0c8c 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -409,6 +409,8 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 		char lnm_session_key[CIFS_SESS_KEY_SIZE];
 
+		pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;
+
 		/* no capabilities flags in old lanman negotiation */
 
 		pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE);
@@ -505,7 +507,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 			unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
 		} else
 			ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
-	} else if (type == Kerberos) {
+	} else if (type == Kerberos || type == MSKerberos) {
 #ifdef CONFIG_CIFS_UPCALL
 		struct cifs_spnego_msg *msg;
 		spnego_key = cifs_get_spnego_key(ses);
@@ -516,6 +518,15 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 		}
 
 		msg = spnego_key->payload.data;
+		/* check version field to make sure that cifs.upcall is
+		   sending us a response in an expected form */
+		if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) {
+			cERROR(1, ("incorrect version of cifs.upcall (expected"
+				   " %d but got %d)",
+				   CIFS_SPNEGO_UPCALL_VERSION, msg->version));
+			rc = -EKEYREJECTED;
+			goto ssetup_exit;
+		}
 		/* bail out if key is too long */
 		if (msg->sesskey_len >
 		    sizeof(ses->server->mac_signing_key.data.krb5)) {
@@ -613,8 +624,10 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 					ses, nls_cp);
 
 ssetup_exit:
-	if (spnego_key)
+	if (spnego_key) {
+		key_revoke(spnego_key);
 		key_put(spnego_key);
+	}
 	kfree(str_area);
 	if (resp_buf_type == CIFS_SMALL_BUFFER) {
 		cFYI(1, ("ssetup freeing small buf %p", iov[0].iov_base));
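
The added version check is the standard guard for a fixed-layout message crossing the kernel/userspace boundary: reject anything whose version field is unexpected before trusting the rest of the struct. A self-contained sketch (the struct layout and EXPECTED_VERSION are illustrative, not the real cifs_spnego_msg):

#include <stdio.h>

#define EXPECTED_VERSION 2
#define EKEYREJECTED	 129	/* Linux errno value, shown for illustration */

struct upcall_msg {
	unsigned int version;
	unsigned int sesskey_len;
	unsigned char data[64];
};

static int check_upcall(const struct upcall_msg *msg)
{
	if (msg->version != EXPECTED_VERSION) {
		fprintf(stderr,
			"incorrect upcall version (expected %u, got %u)\n",
			(unsigned)EXPECTED_VERSION, msg->version);
		return -EKEYREJECTED;
	}
	return 0;
}

int main(void)
{
	struct upcall_msg stale = { .version = 1 };
	return check_upcall(&stale) ? 0 : 1;	/* expect rejection */
}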
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index e286db9f5ee2..bf0e6d8e382a 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -50,8 +50,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct cifsSesInfo *ses)
 		return NULL;
 	}
 
-	temp = (struct mid_q_entry *) mempool_alloc(cifs_mid_poolp,
-						    GFP_KERNEL | GFP_NOFS);
+	temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS);
 	if (temp == NULL)
 		return temp;
 	else {
diff --git a/fs/compat.c b/fs/compat.c
index c9d1472e65c5..075d0509970d 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -792,8 +792,10 @@ static int compat_fillonedir(void *__buf, const char *name, int namlen,
 	if (buf->result)
 		return -EINVAL;
 	d_ino = ino;
-	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino)
+	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
+		buf->result = -EOVERFLOW;
 		return -EOVERFLOW;
+	}
 	buf->result++;
 	dirent = buf->dirent;
 	if (!access_ok(VERIFY_WRITE, dirent,
@@ -862,8 +864,10 @@ static int compat_filldir(void *__buf, const char *name, int namlen,
 	if (reclen > buf->count)
 		return -EINVAL;
 	d_ino = ino;
-	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino)
+	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
+		buf->error = -EOVERFLOW;
 		return -EOVERFLOW;
+	}
 	dirent = buf->previous;
 	if (dirent) {
 		if (__put_user(offset, &dirent->d_off))
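
The test in both hunks detects inode numbers that do not survive narrowing into the compat dirent's smaller d_ino; recording -EOVERFLOW in the buffer lets the syscall report the error instead of silently dropping the entry. The idiom itself, in plain C:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t ino = 0x100000001ULL;	/* needs more than 32 bits */
	uint32_t d_ino = (uint32_t)ino;	/* what a 32-bit dirent stores */

	/* truncation happened, so the copy no longer compares equal */
	assert(sizeof(d_ino) < sizeof(ino) && d_ino != ino);
	return 0;
}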
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 7a8db78a91d2..8e93341f3e82 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1311,16 +1311,18 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
 	 * Ensure that no racing symlink() will make detach_prep() fail while
 	 * the new link is temporarily attached
 	 */
-	mutex_lock(&configfs_symlink_mutex);
-	spin_lock(&configfs_dirent_lock);
 	do {
 		struct mutex *wait_mutex;
 
+		mutex_lock(&configfs_symlink_mutex);
+		spin_lock(&configfs_dirent_lock);
 		ret = configfs_detach_prep(dentry, &wait_mutex);
-		if (ret) {
+		if (ret)
 			configfs_detach_rollback(dentry);
 		spin_unlock(&configfs_dirent_lock);
 		mutex_unlock(&configfs_symlink_mutex);
+
+		if (ret) {
 			if (ret != -EAGAIN) {
 				config_item_put(parent_item);
 				return ret;
@@ -1329,13 +1331,8 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
 			/* Wait until the racing operation terminates */
 			mutex_lock(wait_mutex);
 			mutex_unlock(wait_mutex);
-
-			mutex_lock(&configfs_symlink_mutex);
-			spin_lock(&configfs_dirent_lock);
 		}
 	} while (ret == -EAGAIN);
-	spin_unlock(&configfs_dirent_lock);
-	mutex_unlock(&configfs_symlink_mutex);
 
 	/* Get a working ref for the duration of this function */
 	item = configfs_get_config_item(dentry);
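
The configfs change moves lock acquisition inside the retry loop so that both locks are provably dropped before the task sleeps on wait_mutex. The shape of the corrected pattern, as a self-contained pthread sketch (try_detach is a stub that pretends one racing operation is in flight; build with -lpthread):

#include <pthread.h>

static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t other_op   = PTHREAD_MUTEX_INITIALIZER;
static int attempts;

/* stub: "fail" once, handing back the mutex of the racing operation */
static int try_detach(pthread_mutex_t **wait_on)
{
	if (attempts++ < 1) {
		*wait_on = &other_op;
		return -1;	/* stands in for -EAGAIN */
	}
	return 0;
}

static int detach_with_retry(void)
{
	pthread_mutex_t *wait_on;
	int ret;

	do {
		/* take the locks fresh on every pass ... */
		pthread_mutex_lock(&state_lock);
		ret = try_detach(&wait_on);
		/* ... and always drop them before sleeping */
		pthread_mutex_unlock(&state_lock);

		if (ret == -1) {
			/* wait for the racing operation to terminate */
			pthread_mutex_lock(wait_on);
			pthread_mutex_unlock(wait_on);
		}
	} while (ret == -1);
	return ret;
}

int main(void) { return detach_with_retry(); }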
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 0c3b618c15b3..f40423eb1a14 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -43,58 +43,13 @@ static DEFINE_MUTEX(read_mutex);
 static int cramfs_iget5_test(struct inode *inode, void *opaque)
 {
 	struct cramfs_inode *cramfs_inode = opaque;
-
-	if (inode->i_ino != CRAMINO(cramfs_inode))
-		return 0; /* does not match */
-
-	if (inode->i_ino != 1)
-		return 1;
-
-	/* all empty directories, char, block, pipe, and sock, share inode #1 */
-
-	if ((inode->i_mode != cramfs_inode->mode) ||
-	    (inode->i_gid != cramfs_inode->gid) ||
-	    (inode->i_uid != cramfs_inode->uid))
-		return 0; /* does not match */
-
-	if ((S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) &&
-	    (inode->i_rdev != old_decode_dev(cramfs_inode->size)))
-		return 0; /* does not match */
-
-	return 1; /* matches */
+	return inode->i_ino == CRAMINO(cramfs_inode) && inode->i_ino != 1;
 }
 
 static int cramfs_iget5_set(struct inode *inode, void *opaque)
 {
-	static struct timespec zerotime;
 	struct cramfs_inode *cramfs_inode = opaque;
-	inode->i_mode = cramfs_inode->mode;
-	inode->i_uid = cramfs_inode->uid;
-	inode->i_size = cramfs_inode->size;
-	inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
-	inode->i_gid = cramfs_inode->gid;
-	/* Struct copy intentional */
-	inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
 	inode->i_ino = CRAMINO(cramfs_inode);
-	/* inode->i_nlink is left 1 - arguably wrong for directories,
-	   but it's the best we can do without reading the directory
-	   contents.  1 yields the right result in GNU find, even
-	   without -noleaf option. */
-	if (S_ISREG(inode->i_mode)) {
-		inode->i_fop = &generic_ro_fops;
-		inode->i_data.a_ops = &cramfs_aops;
-	} else if (S_ISDIR(inode->i_mode)) {
-		inode->i_op = &cramfs_dir_inode_operations;
-		inode->i_fop = &cramfs_directory_operations;
-	} else if (S_ISLNK(inode->i_mode)) {
-		inode->i_op = &page_symlink_inode_operations;
-		inode->i_data.a_ops = &cramfs_aops;
-	} else {
-		inode->i_size = 0;
-		inode->i_blocks = 0;
-		init_special_inode(inode, inode->i_mode,
-			old_decode_dev(cramfs_inode->size));
-	}
 	return 0;
 }
 
@@ -104,12 +59,48 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
 	struct inode *inode = iget5_locked(sb, CRAMINO(cramfs_inode),
 					    cramfs_iget5_test, cramfs_iget5_set,
 					    cramfs_inode);
+	static struct timespec zerotime;
+
 	if (inode && (inode->i_state & I_NEW)) {
+		inode->i_mode = cramfs_inode->mode;
+		inode->i_uid = cramfs_inode->uid;
+		inode->i_size = cramfs_inode->size;
+		inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
+		inode->i_gid = cramfs_inode->gid;
+		/* Struct copy intentional */
+		inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
+		/* inode->i_nlink is left 1 - arguably wrong for directories,
+		   but it's the best we can do without reading the directory
+		   contents.  1 yields the right result in GNU find, even
+		   without -noleaf option. */
+		if (S_ISREG(inode->i_mode)) {
+			inode->i_fop = &generic_ro_fops;
+			inode->i_data.a_ops = &cramfs_aops;
+		} else if (S_ISDIR(inode->i_mode)) {
+			inode->i_op = &cramfs_dir_inode_operations;
+			inode->i_fop = &cramfs_directory_operations;
+		} else if (S_ISLNK(inode->i_mode)) {
+			inode->i_op = &page_symlink_inode_operations;
+			inode->i_data.a_ops = &cramfs_aops;
+		} else {
+			inode->i_size = 0;
+			inode->i_blocks = 0;
+			init_special_inode(inode, inode->i_mode,
+				old_decode_dev(cramfs_inode->size));
+		}
 		unlock_new_inode(inode);
 	}
 	return inode;
 }
 
+static void cramfs_drop_inode(struct inode *inode)
+{
+	if (inode->i_ino == 1)
+		generic_delete_inode(inode);
+	else
+		generic_drop_inode(inode);
+}
+
 /*
  * We have our own block cache: don't fill up the buffer cache
  * with the rom-image, because the way the filesystem is set
@@ -534,6 +525,7 @@ static const struct super_operations cramfs_ops = {
 	.put_super	= cramfs_put_super,
 	.remount_fs	= cramfs_remount,
 	.statfs		= cramfs_statfs,
+	.drop_inode	= cramfs_drop_inode,
 };
 
 static int cramfs_get_sb(struct file_system_type *fs_type,
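
In cramfs every empty directory and special file shares inode number 1, so an icache hit on ino 1 could hand one file's mode/uid/gid to another. The rewritten iget5 test therefore never matches ino 1, and the new drop_inode op deletes such inodes on last put instead of caching them. The predicate, as a self-contained check:

#include <assert.h>

static int cramfs_matches(unsigned long cached_ino, unsigned long wanted_ino)
{
	/* match on inode number, except the shared ino 1, which never matches
	   (forcing a fresh inode to be set up on every lookup) */
	return cached_ino == wanted_ino && cached_ino != 1;
}

int main(void)
{
	assert(cramfs_matches(42, 42));	/* ordinary inode: cache hit is fine */
	assert(!cramfs_matches(1, 1));	/* shared ino 1: never reuse */
	return 0;
}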
diff --git a/fs/dcache.c b/fs/dcache.c
index 101663d15e9f..e7a1a99b7464 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1236,7 +1236,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
  * If no entry exists with the exact case name, allocate new dentry with
  * the exact case, and return the spliced entry.
  */
-struct dentry *d_add_ci(struct inode *inode, struct dentry *dentry,
+struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
 			struct qstr *name)
 {
 	int error;
@@ -1395,6 +1395,10 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
 		if (dentry->d_parent != parent)
 			goto next;
 
+		/* non-existing due to RCU? */
+		if (d_unhashed(dentry))
+			goto next;
+
 		/*
 		 * It is safe to compare names since d_move() cannot
 		 * change the qstr (protected by d_lock).
@@ -1410,10 +1414,8 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
 				goto next;
 		}
 
-		if (!d_unhashed(dentry)) {
-			atomic_inc(&dentry->d_count);
-			found = dentry;
-		}
+		atomic_inc(&dentry->d_count);
+		found = dentry;
 		spin_unlock(&dentry->d_lock);
 		break;
 next:
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 08e28c9bb416..3dbe2169cf36 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -26,8 +26,7 @@
 #include <linux/debugfs.h>
 #include <linux/fsnotify.h>
 #include <linux/string.h>
-
-#define DEBUGFS_MAGIC	0x64626720
+#include <linux/magic.h>
 
 static struct vfsmount *debugfs_mount;
 static int debugfs_mount_count;
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 488eb424f662..4a714f6c1bed 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -27,6 +27,7 @@
27#define DEVPTS_SUPER_MAGIC 0x1cd1 27#define DEVPTS_SUPER_MAGIC 0x1cd1
28 28
29#define DEVPTS_DEFAULT_MODE 0600 29#define DEVPTS_DEFAULT_MODE 0600
30#define PTMX_MINOR 2
30 31
31extern int pty_limit; /* Config limit on Unix98 ptys */ 32extern int pty_limit; /* Config limit on Unix98 ptys */
 static DEFINE_IDA(allocated_ptys);
@@ -48,7 +49,7 @@ enum {
 	Opt_err
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_uid, "uid=%u"},
 	{Opt_gid, "gid=%u"},
 	{Opt_mode, "mode=%o"},
@@ -169,15 +170,7 @@ static struct file_system_type devpts_fs_type = {
  * to the System V naming convention
  */
 
-static struct dentry *get_node(int num)
-{
-	char s[12];
-	struct dentry *root = devpts_root;
-	mutex_lock(&root->d_inode->i_mutex);
-	return lookup_one_len(s, root, sprintf(s, "%d", num));
-}
-
-int devpts_new_index(void)
+int devpts_new_index(struct inode *ptmx_inode)
 {
 	int index;
 	int ida_ret;
@@ -205,20 +198,21 @@ retry:
 	return index;
 }
 
-void devpts_kill_index(int idx)
+void devpts_kill_index(struct inode *ptmx_inode, int idx)
 {
 	mutex_lock(&allocated_ptys_lock);
 	ida_remove(&allocated_ptys, idx);
 	mutex_unlock(&allocated_ptys_lock);
 }
 
-int devpts_pty_new(struct tty_struct *tty)
+int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
 {
 	int number = tty->index; /* tty layer puts index from devpts_new_index() in here */
 	struct tty_driver *driver = tty->driver;
 	dev_t device = MKDEV(driver->major, driver->minor_start+number);
 	struct dentry *dentry;
 	struct inode *inode = new_inode(devpts_mnt->mnt_sb);
+	char s[12];
 
 	/* We're supposed to be given the slave end of a pty */
 	BUG_ON(driver->type != TTY_DRIVER_TYPE_PTY);
@@ -233,10 +227,15 @@ int devpts_pty_new(struct tty_struct *tty)
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	init_special_inode(inode, S_IFCHR|config.mode, device);
 	inode->i_private = tty;
+	tty->driver_data = inode;
 
-	dentry = get_node(number);
-	if (!IS_ERR(dentry) && !dentry->d_inode) {
-		d_instantiate(dentry, inode);
+	sprintf(s, "%d", number);
+
+	mutex_lock(&devpts_root->d_inode->i_mutex);
+
+	dentry = d_alloc_name(devpts_root, s);
+	if (!IS_ERR(dentry)) {
+		d_add(dentry, inode);
 		fsnotify_create(devpts_root->d_inode, dentry);
 	}
 
@@ -245,36 +244,31 @@ int devpts_pty_new(struct tty_struct *tty)
 	return 0;
 }
 
-struct tty_struct *devpts_get_tty(int number)
+struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number)
 {
-	struct dentry *dentry = get_node(number);
-	struct tty_struct *tty;
-
-	tty = NULL;
-	if (!IS_ERR(dentry)) {
-		if (dentry->d_inode)
-			tty = dentry->d_inode->i_private;
-		dput(dentry);
-	}
+	BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
 
-	mutex_unlock(&devpts_root->d_inode->i_mutex);
-
-	return tty;
+	if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
+		return (struct tty_struct *)pts_inode->i_private;
+	return NULL;
 }
 
-void devpts_pty_kill(int number)
+void devpts_pty_kill(struct tty_struct *tty)
 {
-	struct dentry *dentry = get_node(number);
+	struct inode *inode = tty->driver_data;
+	struct dentry *dentry;
 
-	if (!IS_ERR(dentry)) {
-		struct inode *inode = dentry->d_inode;
-		if (inode) {
-			inode->i_nlink--;
-			d_delete(dentry);
-			dput(dentry);
-		}
+	BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
+
+	mutex_lock(&devpts_root->d_inode->i_mutex);
+
+	dentry = d_find_alias(inode);
+	if (dentry && !IS_ERR(dentry)) {
+		inode->i_nlink--;
+		d_delete(dentry);
 		dput(dentry);
 	}
+
 	mutex_unlock(&devpts_root->d_inode->i_mutex);
 }
 
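The devpts change above drops the per-call directory lookup: the pts inode keeps the tty in i_private and the tty keeps the inode in driver_data, so lookup and teardown become pointer dereferences. A minimal userspace sketch of that back-pointer pattern follows; the toy_inode/toy_tty types are stand-ins invented for illustration, not the kernel structures.

#include <stdio.h>

/* Toy stand-ins for the kernel's inode and tty_struct. */
struct toy_inode {
	void *i_private;        /* points at the owning tty */
};

struct toy_tty {
	int index;
	void *driver_data;      /* points back at the inode */
};

static void pty_new(struct toy_inode *inode, struct toy_tty *tty)
{
	inode->i_private = tty;   /* inode -> tty */
	tty->driver_data = inode; /* tty -> inode */
}

static struct toy_tty *pty_get_tty(struct toy_inode *inode)
{
	return inode->i_private;  /* no directory walk needed */
}

int main(void)
{
	struct toy_inode inode = { 0 };
	struct toy_tty tty = { .index = 3 };

	pty_new(&inode, &tty);
	printf("tty index via inode: %d\n", pty_get_tty(&inode)->index);
	return 0;
}
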
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index c4e7d721bd8d..fd9859f92fad 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -14,6 +14,9 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/configfs.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <net/ipv6.h>
 #include <net/sock.h>
 
 #include "config.h"
@@ -30,16 +33,16 @@
 
 static struct config_group *space_list;
 static struct config_group *comm_list;
-static struct comm *local_comm;
+static struct dlm_comm *local_comm;
 
-struct clusters;
-struct cluster;
-struct spaces;
-struct space;
-struct comms;
-struct comm;
-struct nodes;
-struct node;
+struct dlm_clusters;
+struct dlm_cluster;
+struct dlm_spaces;
+struct dlm_space;
+struct dlm_comms;
+struct dlm_comm;
+struct dlm_nodes;
+struct dlm_node;
 
 static struct config_group *make_cluster(struct config_group *, const char *);
 static void drop_cluster(struct config_group *, struct config_item *);
@@ -68,17 +71,22 @@ static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
 static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
			  const char *buf, size_t len);
 
-static ssize_t comm_nodeid_read(struct comm *cm, char *buf);
-static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len);
-static ssize_t comm_local_read(struct comm *cm, char *buf);
-static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len);
-static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len);
-static ssize_t node_nodeid_read(struct node *nd, char *buf);
-static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len);
-static ssize_t node_weight_read(struct node *nd, char *buf);
-static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len);
-
-struct cluster {
+static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf);
+static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf,
				 size_t len);
+static ssize_t comm_local_read(struct dlm_comm *cm, char *buf);
+static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf,
				size_t len);
+static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf,
			       size_t len);
+static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf);
+static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf,
				 size_t len);
+static ssize_t node_weight_read(struct dlm_node *nd, char *buf);
+static ssize_t node_weight_write(struct dlm_node *nd, const char *buf,
				 size_t len);
+
+struct dlm_cluster {
 	struct config_group group;
 	unsigned int cl_tcp_port;
 	unsigned int cl_buffer_size;
@@ -109,11 +117,11 @@ enum {
 
 struct cluster_attribute {
 	struct configfs_attribute attr;
-	ssize_t (*show)(struct cluster *, char *);
-	ssize_t (*store)(struct cluster *, const char *, size_t);
+	ssize_t (*show)(struct dlm_cluster *, char *);
+	ssize_t (*store)(struct dlm_cluster *, const char *, size_t);
 };
 
-static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field,
+static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
			   int *info_field, int check_zero,
			   const char *buf, size_t len)
 {
@@ -134,12 +142,12 @@ static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field,
 }
 
 #define CLUSTER_ATTR(name, check_zero) \
-static ssize_t name##_write(struct cluster *cl, const char *buf, size_t len) \
+static ssize_t name##_write(struct dlm_cluster *cl, const char *buf, size_t len) \
 { \
	return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name, \
			   check_zero, buf, len); \
 } \
-static ssize_t name##_read(struct cluster *cl, char *buf) \
+static ssize_t name##_read(struct dlm_cluster *cl, char *buf) \
 { \
	return snprintf(buf, PAGE_SIZE, "%u\n", cl->cl_##name); \
 } \
@@ -181,8 +189,8 @@ enum {
 
 struct comm_attribute {
 	struct configfs_attribute attr;
-	ssize_t (*show)(struct comm *, char *);
-	ssize_t (*store)(struct comm *, const char *, size_t);
+	ssize_t (*show)(struct dlm_comm *, char *);
+	ssize_t (*store)(struct dlm_comm *, const char *, size_t);
 };
 
 static struct comm_attribute comm_attr_nodeid = {
@@ -222,8 +230,8 @@ enum {
 
 struct node_attribute {
 	struct configfs_attribute attr;
-	ssize_t (*show)(struct node *, char *);
-	ssize_t (*store)(struct node *, const char *, size_t);
+	ssize_t (*show)(struct dlm_node *, char *);
+	ssize_t (*store)(struct dlm_node *, const char *, size_t);
 };
 
 static struct node_attribute node_attr_nodeid = {
@@ -248,26 +256,26 @@ static struct configfs_attribute *node_attrs[] = {
 	NULL,
 };
 
-struct clusters {
+struct dlm_clusters {
 	struct configfs_subsystem subsys;
 };
 
-struct spaces {
+struct dlm_spaces {
 	struct config_group ss_group;
 };
 
-struct space {
+struct dlm_space {
 	struct config_group group;
 	struct list_head members;
 	struct mutex members_lock;
 	int members_count;
 };
 
-struct comms {
+struct dlm_comms {
 	struct config_group cs_group;
 };
 
-struct comm {
+struct dlm_comm {
 	struct config_item item;
 	int nodeid;
 	int local;
@@ -275,11 +283,11 @@ struct comm {
 	struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
 };
 
-struct nodes {
+struct dlm_nodes {
 	struct config_group ns_group;
 };
 
-struct node {
+struct dlm_node {
 	struct config_item item;
 	struct list_head list; /* space->members */
 	int nodeid;
@@ -372,38 +380,40 @@ static struct config_item_type node_type = {
 	.ct_owner = THIS_MODULE,
 };
 
-static struct cluster *to_cluster(struct config_item *i)
+static struct dlm_cluster *config_item_to_cluster(struct config_item *i)
 {
-	return i ? container_of(to_config_group(i), struct cluster, group):NULL;
+	return i ? container_of(to_config_group(i), struct dlm_cluster, group) :
		   NULL;
 }
 
-static struct space *to_space(struct config_item *i)
+static struct dlm_space *config_item_to_space(struct config_item *i)
 {
-	return i ? container_of(to_config_group(i), struct space, group) : NULL;
+	return i ? container_of(to_config_group(i), struct dlm_space, group) :
		   NULL;
 }
 
-static struct comm *to_comm(struct config_item *i)
+static struct dlm_comm *config_item_to_comm(struct config_item *i)
 {
-	return i ? container_of(i, struct comm, item) : NULL;
+	return i ? container_of(i, struct dlm_comm, item) : NULL;
 }
 
-static struct node *to_node(struct config_item *i)
+static struct dlm_node *config_item_to_node(struct config_item *i)
 {
-	return i ? container_of(i, struct node, item) : NULL;
+	return i ? container_of(i, struct dlm_node, item) : NULL;
 }
 
 static struct config_group *make_cluster(struct config_group *g,
					 const char *name)
 {
-	struct cluster *cl = NULL;
-	struct spaces *sps = NULL;
-	struct comms *cms = NULL;
+	struct dlm_cluster *cl = NULL;
+	struct dlm_spaces *sps = NULL;
+	struct dlm_comms *cms = NULL;
 	void *gps = NULL;
 
-	cl = kzalloc(sizeof(struct cluster), GFP_KERNEL);
+	cl = kzalloc(sizeof(struct dlm_cluster), GFP_KERNEL);
 	gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
-	sps = kzalloc(sizeof(struct spaces), GFP_KERNEL);
-	cms = kzalloc(sizeof(struct comms), GFP_KERNEL);
+	sps = kzalloc(sizeof(struct dlm_spaces), GFP_KERNEL);
+	cms = kzalloc(sizeof(struct dlm_comms), GFP_KERNEL);
 
 	if (!cl || !gps || !sps || !cms)
		goto fail;
@@ -443,7 +453,7 @@ static struct config_group *make_cluster(struct config_group *g,
 
 static void drop_cluster(struct config_group *g, struct config_item *i)
 {
-	struct cluster *cl = to_cluster(i);
+	struct dlm_cluster *cl = config_item_to_cluster(i);
 	struct config_item *tmp;
 	int j;
 
@@ -461,20 +471,20 @@ static void drop_cluster(struct config_group *g, struct config_item *i)
 
 static void release_cluster(struct config_item *i)
 {
-	struct cluster *cl = to_cluster(i);
+	struct dlm_cluster *cl = config_item_to_cluster(i);
 	kfree(cl->group.default_groups);
 	kfree(cl);
 }
 
 static struct config_group *make_space(struct config_group *g, const char *name)
 {
-	struct space *sp = NULL;
-	struct nodes *nds = NULL;
+	struct dlm_space *sp = NULL;
+	struct dlm_nodes *nds = NULL;
 	void *gps = NULL;
 
-	sp = kzalloc(sizeof(struct space), GFP_KERNEL);
+	sp = kzalloc(sizeof(struct dlm_space), GFP_KERNEL);
 	gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL);
-	nds = kzalloc(sizeof(struct nodes), GFP_KERNEL);
+	nds = kzalloc(sizeof(struct dlm_nodes), GFP_KERNEL);
 
 	if (!sp || !gps || !nds)
		goto fail;
@@ -500,7 +510,7 @@ static struct config_group *make_space(struct config_group *g, const char *name)
 
 static void drop_space(struct config_group *g, struct config_item *i)
 {
-	struct space *sp = to_space(i);
+	struct dlm_space *sp = config_item_to_space(i);
 	struct config_item *tmp;
 	int j;
 
@@ -517,16 +527,16 @@ static void drop_space(struct config_group *g, struct config_item *i)
 
 static void release_space(struct config_item *i)
 {
-	struct space *sp = to_space(i);
+	struct dlm_space *sp = config_item_to_space(i);
 	kfree(sp->group.default_groups);
 	kfree(sp);
 }
 
 static struct config_item *make_comm(struct config_group *g, const char *name)
 {
-	struct comm *cm;
+	struct dlm_comm *cm;
 
-	cm = kzalloc(sizeof(struct comm), GFP_KERNEL);
+	cm = kzalloc(sizeof(struct dlm_comm), GFP_KERNEL);
 	if (!cm)
		return ERR_PTR(-ENOMEM);
 
@@ -539,7 +549,7 @@ static struct config_item *make_comm(struct config_group *g, const char *name)
 
 static void drop_comm(struct config_group *g, struct config_item *i)
 {
-	struct comm *cm = to_comm(i);
+	struct dlm_comm *cm = config_item_to_comm(i);
 	if (local_comm == cm)
		local_comm = NULL;
	dlm_lowcomms_close(cm->nodeid);
@@ -550,16 +560,16 @@ static void drop_comm(struct config_group *g, struct config_item *i)
 
 static void release_comm(struct config_item *i)
 {
-	struct comm *cm = to_comm(i);
+	struct dlm_comm *cm = config_item_to_comm(i);
 	kfree(cm);
 }
 
 static struct config_item *make_node(struct config_group *g, const char *name)
 {
-	struct space *sp = to_space(g->cg_item.ci_parent);
-	struct node *nd;
+	struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent);
+	struct dlm_node *nd;
 
-	nd = kzalloc(sizeof(struct node), GFP_KERNEL);
+	nd = kzalloc(sizeof(struct dlm_node), GFP_KERNEL);
 	if (!nd)
		return ERR_PTR(-ENOMEM);
 
@@ -578,8 +588,8 @@ static struct config_item *make_node(struct config_group *g, const char *name)
 
 static void drop_node(struct config_group *g, struct config_item *i)
 {
-	struct space *sp = to_space(g->cg_item.ci_parent);
-	struct node *nd = to_node(i);
+	struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent);
+	struct dlm_node *nd = config_item_to_node(i);
 
	mutex_lock(&sp->members_lock);
	list_del(&nd->list);
@@ -591,11 +601,11 @@ static void drop_node(struct config_group *g, struct config_item *i)
 
 static void release_node(struct config_item *i)
 {
-	struct node *nd = to_node(i);
+	struct dlm_node *nd = config_item_to_node(i);
	kfree(nd);
 }
 
-static struct clusters clusters_root = {
+static struct dlm_clusters clusters_root = {
	.subsys = {
		.su_group = {
			.cg_item = {
@@ -625,7 +635,7 @@ void dlm_config_exit(void)
 static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a,
			    char *buf)
 {
-	struct cluster *cl = to_cluster(i);
+	struct dlm_cluster *cl = config_item_to_cluster(i);
	struct cluster_attribute *cla =
		container_of(a, struct cluster_attribute, attr);
	return cla->show ? cla->show(cl, buf) : 0;
@@ -635,7 +645,7 @@ static ssize_t store_cluster(struct config_item *i,
			     struct configfs_attribute *a,
			     const char *buf, size_t len)
 {
-	struct cluster *cl = to_cluster(i);
+	struct dlm_cluster *cl = config_item_to_cluster(i);
	struct cluster_attribute *cla =
		container_of(a, struct cluster_attribute, attr);
	return cla->store ? cla->store(cl, buf, len) : -EINVAL;
@@ -644,7 +654,7 @@ static ssize_t store_cluster(struct config_item *i,
 static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
			 char *buf)
 {
-	struct comm *cm = to_comm(i);
+	struct dlm_comm *cm = config_item_to_comm(i);
	struct comm_attribute *cma =
		container_of(a, struct comm_attribute, attr);
	return cma->show ? cma->show(cm, buf) : 0;
@@ -653,29 +663,31 @@ static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
 static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
			  const char *buf, size_t len)
 {
-	struct comm *cm = to_comm(i);
+	struct dlm_comm *cm = config_item_to_comm(i);
	struct comm_attribute *cma =
		container_of(a, struct comm_attribute, attr);
	return cma->store ? cma->store(cm, buf, len) : -EINVAL;
 }
 
-static ssize_t comm_nodeid_read(struct comm *cm, char *buf)
+static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf)
 {
	return sprintf(buf, "%d\n", cm->nodeid);
 }
 
-static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len)
+static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf,
+				 size_t len)
 {
	cm->nodeid = simple_strtol(buf, NULL, 0);
	return len;
 }
 
-static ssize_t comm_local_read(struct comm *cm, char *buf)
+static ssize_t comm_local_read(struct dlm_comm *cm, char *buf)
 {
	return sprintf(buf, "%d\n", cm->local);
 }
 
-static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len)
+static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf,
+				size_t len)
 {
	cm->local= simple_strtol(buf, NULL, 0);
	if (cm->local && !local_comm)
@@ -683,7 +695,7 @@ static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len)
	return len;
 }
 
-static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len)
+static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len)
 {
	struct sockaddr_storage *addr;
 
@@ -705,7 +717,7 @@ static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len)
 static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
			 char *buf)
 {
-	struct node *nd = to_node(i);
+	struct dlm_node *nd = config_item_to_node(i);
	struct node_attribute *nda =
		container_of(a, struct node_attribute, attr);
	return nda->show ? nda->show(nd, buf) : 0;
@@ -714,29 +726,31 @@ static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
 static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
			  const char *buf, size_t len)
 {
-	struct node *nd = to_node(i);
+	struct dlm_node *nd = config_item_to_node(i);
	struct node_attribute *nda =
		container_of(a, struct node_attribute, attr);
	return nda->store ? nda->store(nd, buf, len) : -EINVAL;
 }
 
-static ssize_t node_nodeid_read(struct node *nd, char *buf)
+static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf)
 {
	return sprintf(buf, "%d\n", nd->nodeid);
 }
 
-static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len)
+static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf,
+				 size_t len)
 {
	nd->nodeid = simple_strtol(buf, NULL, 0);
	return len;
 }
 
-static ssize_t node_weight_read(struct node *nd, char *buf)
+static ssize_t node_weight_read(struct dlm_node *nd, char *buf)
 {
	return sprintf(buf, "%d\n", nd->weight);
 }
 
-static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len)
+static ssize_t node_weight_write(struct dlm_node *nd, const char *buf,
				 size_t len)
 {
	nd->weight = simple_strtol(buf, NULL, 0);
	return len;
@@ -746,7 +760,7 @@ static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len)
  * Functions for the dlm to get the info that's been configured
  */
 
-static struct space *get_space(char *name)
+static struct dlm_space *get_space(char *name)
 {
	struct config_item *i;
 
@@ -757,18 +771,45 @@ static struct space *get_space(char *name)
	i = config_group_find_item(space_list, name);
	mutex_unlock(&space_list->cg_subsys->su_mutex);
 
-	return to_space(i);
+	return config_item_to_space(i);
 }
 
-static void put_space(struct space *sp)
+static void put_space(struct dlm_space *sp)
 {
	config_item_put(&sp->group.cg_item);
 }
 
-static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
+static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y)
+{
+	switch (x->ss_family) {
+	case AF_INET: {
+		struct sockaddr_in *sinx = (struct sockaddr_in *)x;
+		struct sockaddr_in *siny = (struct sockaddr_in *)y;
+		if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr)
+			return 0;
+		if (sinx->sin_port != siny->sin_port)
+			return 0;
+		break;
+	}
+	case AF_INET6: {
+		struct sockaddr_in6 *sinx = (struct sockaddr_in6 *)x;
+		struct sockaddr_in6 *siny = (struct sockaddr_in6 *)y;
+		if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr))
+			return 0;
+		if (sinx->sin6_port != siny->sin6_port)
+			return 0;
+		break;
+	}
+	default:
+		return 0;
+	}
+	return 1;
+}
+
+static struct dlm_comm *get_comm(int nodeid, struct sockaddr_storage *addr)
 {
	struct config_item *i;
-	struct comm *cm = NULL;
+	struct dlm_comm *cm = NULL;
	int found = 0;
 
	if (!comm_list)
@@ -777,7 +818,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
	mutex_lock(&clusters_root.subsys.su_mutex);
 
	list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
-		cm = to_comm(i);
+		cm = config_item_to_comm(i);
 
		if (nodeid) {
			if (cm->nodeid != nodeid)
@@ -786,8 +827,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
			config_item_get(i);
			break;
		} else {
-			if (!cm->addr_count ||
-			    memcmp(cm->addr[0], addr, sizeof(*addr)))
+			if (!cm->addr_count || !addr_compare(cm->addr[0], addr))
				continue;
			found = 1;
			config_item_get(i);
@@ -801,7 +841,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
	return cm;
 }
 
-static void put_comm(struct comm *cm)
+static void put_comm(struct dlm_comm *cm)
 {
	config_item_put(&cm->item);
 }
@@ -810,8 +850,8 @@ static void put_comm(struct comm *cm)
 int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
		    int **new_out, int *new_count_out)
 {
-	struct space *sp;
-	struct node *nd;
+	struct dlm_space *sp;
+	struct dlm_node *nd;
	int i = 0, rv = 0, ids_count = 0, new_count = 0;
	int *ids, *new;
 
@@ -874,8 +914,8 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
 
 int dlm_node_weight(char *lsname, int nodeid)
 {
-	struct space *sp;
-	struct node *nd;
+	struct dlm_space *sp;
+	struct dlm_node *nd;
	int w = -EEXIST;
 
	sp = get_space(lsname);
@@ -897,7 +937,7 @@ int dlm_node_weight(char *lsname, int nodeid)
 
 int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
 {
-	struct comm *cm = get_comm(nodeid, NULL);
+	struct dlm_comm *cm = get_comm(nodeid, NULL);
	if (!cm)
		return -EEXIST;
	if (!cm->addr_count)
@@ -909,7 +949,7 @@ int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
 
 int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
 {
-	struct comm *cm = get_comm(0, addr);
+	struct dlm_comm *cm = get_comm(0, addr);
	if (!cm)
		return -EEXIST;
	*nodeid = cm->nodeid;
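The addr_compare() helper added above replaces a memcmp() over whole sockaddr_storage values, which can report a mismatch for identical endpoints because the trailing bytes of the storage are unspecified padding. A runnable userspace demonstration of the failure mode, in the spirit of addr_compare() but not the kernel code (same_endpoint is an invented name; the IPv6 arm is omitted for brevity):

#include <stdio.h>
#include <string.h>
#include <netinet/in.h>

/* Field-wise comparison: only family, address and port matter. */
static int same_endpoint(const struct sockaddr_storage *x,
			 const struct sockaddr_storage *y)
{
	if (x->ss_family != y->ss_family)
		return 0;
	if (x->ss_family == AF_INET) {
		const struct sockaddr_in *a = (const struct sockaddr_in *)x;
		const struct sockaddr_in *b = (const struct sockaddr_in *)y;
		return a->sin_addr.s_addr == b->sin_addr.s_addr &&
		       a->sin_port == b->sin_port;
	}
	return 0; /* AF_INET6 would use IN6_ARE_ADDR_EQUAL analogously */
}

int main(void)
{
	struct sockaddr_storage x, y;
	struct sockaddr_in *a = (struct sockaddr_in *)&x;
	struct sockaddr_in *b = (struct sockaddr_in *)&y;

	memset(&x, 0x00, sizeof(x));
	memset(&y, 0xff, sizeof(y));    /* different garbage in the padding */

	a->sin_family = b->sin_family = AF_INET;
	a->sin_port   = b->sin_port   = htons(21064);
	a->sin_addr.s_addr = b->sin_addr.s_addr = htonl(INADDR_LOOPBACK);

	printf("memcmp says equal:     %d\n", memcmp(&x, &y, sizeof(x)) == 0);
	printf("field-wise says equal: %d\n", same_endpoint(&x, &y));
	return 0;
}
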
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 5a7ac33b629c..868e4c9ef127 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -441,8 +441,11 @@ struct dlm_ls {
	uint32_t		ls_global_id;	/* global unique lockspace ID */
	uint32_t		ls_exflags;
	int			ls_lvblen;
-	int			ls_count;	/* reference count */
+	int			ls_count;	/* refcount of processes in
						   the dlm using this ls */
+	int			ls_create_count; /* create/release refcount */
	unsigned long		ls_flags;	/* LSFL_ */
+	unsigned long		ls_scan_time;
	struct kobject		ls_kobj;
 
	struct dlm_rsbtable	*ls_rsbtbl;
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 499e16759e96..d910501de6d2 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -23,6 +23,7 @@
 #include "lock.h"
 #include "recover.h"
 #include "requestqueue.h"
+#include "user.h"
 
 static int ls_count;
 static struct mutex ls_lock;
@@ -211,19 +212,41 @@ void dlm_lockspace_exit(void)
	kset_unregister(dlm_kset);
 }
 
+static struct dlm_ls *find_ls_to_scan(void)
+{
+	struct dlm_ls *ls;
+
+	spin_lock(&lslist_lock);
+	list_for_each_entry(ls, &lslist, ls_list) {
+		if (time_after_eq(jiffies, ls->ls_scan_time +
					   dlm_config.ci_scan_secs * HZ)) {
+			spin_unlock(&lslist_lock);
+			return ls;
+		}
+	}
+	spin_unlock(&lslist_lock);
+	return NULL;
+}
+
 static int dlm_scand(void *data)
 {
	struct dlm_ls *ls;
+	int timeout_jiffies = dlm_config.ci_scan_secs * HZ;
 
	while (!kthread_should_stop()) {
-		list_for_each_entry(ls, &lslist, ls_list) {
+		ls = find_ls_to_scan();
+		if (ls) {
			if (dlm_lock_recovery_try(ls)) {
+				ls->ls_scan_time = jiffies;
				dlm_scan_rsbs(ls);
				dlm_scan_timeout(ls);
				dlm_unlock_recovery(ls);
+			} else {
+				ls->ls_scan_time += HZ;
			}
+		} else {
+			schedule_timeout_interruptible(timeout_jiffies);
		}
-		schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ);
	}
	return 0;
 }
@@ -246,23 +269,6 @@ static void dlm_scand_stop(void)
	kthread_stop(scand_task);
 }
 
-static struct dlm_ls *dlm_find_lockspace_name(char *name, int namelen)
-{
-	struct dlm_ls *ls;
-
-	spin_lock(&lslist_lock);
-
-	list_for_each_entry(ls, &lslist, ls_list) {
-		if (ls->ls_namelen == namelen &&
-		    memcmp(ls->ls_name, name, namelen) == 0)
-			goto out;
-	}
-	ls = NULL;
- out:
-	spin_unlock(&lslist_lock);
-	return ls;
-}
-
 struct dlm_ls *dlm_find_lockspace_global(uint32_t id)
 {
	struct dlm_ls *ls;
@@ -327,6 +333,7 @@ static void remove_lockspace(struct dlm_ls *ls)
	for (;;) {
		spin_lock(&lslist_lock);
		if (ls->ls_count == 0) {
+			WARN_ON(ls->ls_create_count != 0);
			list_del(&ls->ls_list);
			spin_unlock(&lslist_lock);
			return;
@@ -381,7 +388,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
			 uint32_t flags, int lvblen)
 {
	struct dlm_ls *ls;
-	int i, size, error = -ENOMEM;
+	int i, size, error;
	int do_unreg = 0;
 
	if (namelen > DLM_LOCKSPACE_LEN)
@@ -393,12 +400,37 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
	if (!try_module_get(THIS_MODULE))
		return -EINVAL;
 
-	ls = dlm_find_lockspace_name(name, namelen);
-	if (ls) {
-		*lockspace = ls;
+	if (!dlm_user_daemon_available()) {
+		module_put(THIS_MODULE);
+		return -EUNATCH;
+	}
+
+	error = 0;
+
+	spin_lock(&lslist_lock);
+	list_for_each_entry(ls, &lslist, ls_list) {
+		WARN_ON(ls->ls_create_count <= 0);
+		if (ls->ls_namelen != namelen)
+			continue;
+		if (memcmp(ls->ls_name, name, namelen))
+			continue;
+		if (flags & DLM_LSFL_NEWEXCL) {
+			error = -EEXIST;
+			break;
+		}
+		ls->ls_create_count++;
		module_put(THIS_MODULE);
-		return -EEXIST;
+		error = 1; /* not an error, return 0 */
+		break;
	}
+	spin_unlock(&lslist_lock);
+
+	if (error < 0)
+		goto out;
+	if (error)
+		goto ret_zero;
+
+	error = -ENOMEM;
 
	ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
	if (!ls)
@@ -408,6 +440,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
	ls->ls_lvblen = lvblen;
	ls->ls_count = 0;
	ls->ls_flags = 0;
+	ls->ls_scan_time = jiffies;
 
	if (flags & DLM_LSFL_TIMEWARN)
		set_bit(LSFL_TIMEWARN, &ls->ls_flags);
@@ -418,8 +451,9 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
	ls->ls_allocation = GFP_KERNEL;
 
	/* ls_exflags are forced to match among nodes, and we don't
-	   need to require all nodes to have TIMEWARN or FS set */
-	ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS));
+	   need to require all nodes to have some flags set */
+	ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS |
+				    DLM_LSFL_NEWEXCL));
 
	size = dlm_config.ci_rsbtbl_size;
	ls->ls_rsbtbl_size = size;
@@ -510,6 +544,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
	down_write(&ls->ls_in_recovery);
 
	spin_lock(&lslist_lock);
+	ls->ls_create_count = 1;
	list_add(&ls->ls_list, &lslist);
	spin_unlock(&lslist_lock);
 
@@ -548,7 +583,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
	dlm_create_debug_file(ls);
 
	log_debug(ls, "join complete");
-
+ ret_zero:
	*lockspace = ls;
	return 0;
 
@@ -635,13 +670,34 @@ static int release_lockspace(struct dlm_ls *ls, int force)
	struct dlm_lkb *lkb;
	struct dlm_rsb *rsb;
	struct list_head *head;
-	int i;
-	int busy = lockspace_busy(ls);
+	int i, busy, rv;
+
+	busy = lockspace_busy(ls);
+
+	spin_lock(&lslist_lock);
+	if (ls->ls_create_count == 1) {
+		if (busy > force)
+			rv = -EBUSY;
+		else {
+			/* remove_lockspace takes ls off lslist */
+			ls->ls_create_count = 0;
+			rv = 0;
+		}
+	} else if (ls->ls_create_count > 1) {
+		rv = --ls->ls_create_count;
+	} else {
+		rv = -EINVAL;
+	}
+	spin_unlock(&lslist_lock);
 
-	if (busy > force)
-		return -EBUSY;
+	if (rv) {
+		log_debug(ls, "release_lockspace no remove %d", rv);
+		return rv;
+	}
+
+	dlm_device_deregister(ls);
 
-	if (force < 3)
+	if (force < 3 && dlm_user_daemon_available())
		do_uevent(ls, 0);
 
	dlm_recoverd_stop(ls);
@@ -720,15 +776,10 @@ static int release_lockspace(struct dlm_ls *ls, int force)
	dlm_clear_members(ls);
	dlm_clear_members_gone(ls);
	kfree(ls->ls_node_array);
+	log_debug(ls, "release_lockspace final free");
	kobject_put(&ls->ls_kobj);
	/* The ls structure will be freed when the kobject is done with */
 
-	mutex_lock(&ls_lock);
-	ls_count--;
-	if (!ls_count)
-		threads_stop();
-	mutex_unlock(&ls_lock);
-
	module_put(THIS_MODULE);
	return 0;
 }
@@ -750,11 +801,38 @@ static int release_lockspace(struct dlm_ls *ls, int force)
 int dlm_release_lockspace(void *lockspace, int force)
 {
	struct dlm_ls *ls;
+	int error;
 
	ls = dlm_find_lockspace_local(lockspace);
	if (!ls)
		return -EINVAL;
	dlm_put_lockspace(ls);
-	return release_lockspace(ls, force);
+
+	mutex_lock(&ls_lock);
+	error = release_lockspace(ls, force);
+	if (!error)
+		ls_count--;
+	else if (!ls_count)
+		threads_stop();
+	mutex_unlock(&ls_lock);
+
+	return error;
+}
+
+void dlm_stop_lockspaces(void)
+{
+	struct dlm_ls *ls;
+
+ restart:
+	spin_lock(&lslist_lock);
+	list_for_each_entry(ls, &lslist, ls_list) {
+		if (!test_bit(LSFL_RUNNING, &ls->ls_flags))
+			continue;
+		spin_unlock(&lslist_lock);
+		log_error(ls, "no userland control daemon, stopping lockspace");
+		dlm_ls_stop(ls);
+		goto restart;
+	}
+	spin_unlock(&lslist_lock);
 }
 
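The lockspace change above lets the same named lockspace be "created" several times: a second create just bumps ls_create_count, and only the release that drops the count to zero actually tears the lockspace down. A small userspace sketch of that lookup-or-create counting pattern follows; the toy_ls type, list handling, and names are invented for illustration, and the kernel version additionally handles busy and force cases under lslist_lock.

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <pthread.h>

struct toy_ls {
	char name[32];
	int create_count;
	struct toy_ls *next;
};

static struct toy_ls *lslist;
static pthread_mutex_t lslist_lock = PTHREAD_MUTEX_INITIALIZER;

static struct toy_ls *ls_create(const char *name)
{
	struct toy_ls *ls;

	pthread_mutex_lock(&lslist_lock);
	for (ls = lslist; ls; ls = ls->next) {
		if (!strcmp(ls->name, name)) {
			ls->create_count++;       /* existing: just count */
			pthread_mutex_unlock(&lslist_lock);
			return ls;
		}
	}
	ls = calloc(1, sizeof(*ls));
	snprintf(ls->name, sizeof(ls->name), "%s", name);
	ls->create_count = 1;
	ls->next = lslist;
	lslist = ls;
	pthread_mutex_unlock(&lslist_lock);
	return ls;
}

static int ls_release(struct toy_ls *ls)
{
	int remaining;

	pthread_mutex_lock(&lslist_lock);
	remaining = --ls->create_count;
	if (!remaining) {
		struct toy_ls **p;
		for (p = &lslist; *p; p = &(*p)->next)
			if (*p == ls) { *p = ls->next; break; }
		free(ls);                         /* final release frees */
	}
	pthread_mutex_unlock(&lslist_lock);
	return remaining;
}

int main(void)
{
	struct toy_ls *a = ls_create("clvmd");
	struct toy_ls *b = ls_create("clvmd");   /* same object, count = 2 */

	printf("same object: %d\n", a == b);
	printf("after first release, remaining: %d\n", ls_release(b));
	printf("after final release, remaining: %d\n", ls_release(a));
	return 0;
}
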
diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h
index 891eabbdd021..f879f87901f8 100644
--- a/fs/dlm/lockspace.h
+++ b/fs/dlm/lockspace.h
@@ -20,6 +20,7 @@ struct dlm_ls *dlm_find_lockspace_global(uint32_t id);
 struct dlm_ls *dlm_find_lockspace_local(void *id);
 struct dlm_ls *dlm_find_lockspace_device(int minor);
 void dlm_put_lockspace(struct dlm_ls *ls);
+void dlm_stop_lockspaces(void);
 
 #endif				/* __LOCKSPACE_DOT_H__ */
 
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 929e48ae7591..b3832c67194a 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2006-2007 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2006-2008 Red Hat, Inc.  All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
@@ -15,7 +15,6 @@
 #include <linux/poll.h>
 #include <linux/signal.h>
 #include <linux/spinlock.h>
-#include <linux/smp_lock.h>
 #include <linux/dlm.h>
 #include <linux/dlm_device.h>
 
@@ -27,6 +26,8 @@
 
 static const char name_prefix[] = "dlm";
 static const struct file_operations device_fops;
+static atomic_t dlm_monitor_opened;
+static int dlm_monitor_unused = 1;
 
 #ifdef CONFIG_COMPAT
 
@@ -340,10 +341,15 @@ static int device_user_deadlock(struct dlm_user_proc *proc,
	return error;
 }
 
-static int create_misc_device(struct dlm_ls *ls, char *name)
+static int dlm_device_register(struct dlm_ls *ls, char *name)
 {
	int error, len;
 
+	/* The device is already registered.  This happens when the
+	   lockspace is created multiple times from userspace. */
+	if (ls->ls_device.name)
+		return 0;
+
	error = -ENOMEM;
	len = strlen(name) + strlen(name_prefix) + 2;
	ls->ls_device.name = kzalloc(len, GFP_KERNEL);
@@ -363,6 +369,22 @@ fail:
	return error;
 }
 
+int dlm_device_deregister(struct dlm_ls *ls)
+{
+	int error;
+
+	/* The device is not registered.  This happens when the lockspace
+	   was never used from userspace, or when device_create_lockspace()
+	   calls dlm_release_lockspace() after the register fails. */
+	if (!ls->ls_device.name)
+		return 0;
+
+	error = misc_deregister(&ls->ls_device);
+	if (!error)
+		kfree(ls->ls_device.name);
+	return error;
+}
+
 static int device_user_purge(struct dlm_user_proc *proc,
			     struct dlm_purge_params *params)
 {
@@ -397,7 +419,7 @@ static int device_create_lockspace(struct dlm_lspace_params *params)
	if (!ls)
		return -ENOENT;
 
-	error = create_misc_device(ls, params->name);
+	error = dlm_device_register(ls, params->name);
	dlm_put_lockspace(ls);
 
	if (error)
@@ -421,31 +443,22 @@ static int device_remove_lockspace(struct dlm_lspace_params *params)
	if (!ls)
		return -ENOENT;
 
-	/* Deregister the misc device first, so we don't have
-	 * a device that's not attached to a lockspace. If
-	 * dlm_release_lockspace fails then we can recreate it
-	 */
-	error = misc_deregister(&ls->ls_device);
-	if (error) {
-		dlm_put_lockspace(ls);
-		goto out;
-	}
-	kfree(ls->ls_device.name);
-
	if (params->flags & DLM_USER_LSFLG_FORCEFREE)
		force = 2;
 
	lockspace = ls->ls_local_handle;
+	dlm_put_lockspace(ls);
 
-	/* dlm_release_lockspace waits for references to go to zero,
-	   so all processes will need to close their device for the ls
-	   before the release will procede */
+	/* The final dlm_release_lockspace waits for references to go to
+	   zero, so all processes will need to close their device for the
+	   ls before the release will proceed.  release also calls the
+	   device_deregister above.  Converting a positive return value
+	   from release to zero means that userspace won't know when its
+	   release was the final one, but it shouldn't need to know. */
 
-	dlm_put_lockspace(ls);
	error = dlm_release_lockspace(lockspace, force);
-	if (error)
-		create_misc_device(ls, ls->ls_name);
- out:
+	if (error > 0)
+		error = 0;
	return error;
 }
 
@@ -527,8 +540,10 @@ static ssize_t device_write(struct file *file, const char __user *buf,
		k32buf = (struct dlm_write_request32 *)kbuf;
		kbuf = kmalloc(count + 1 + (sizeof(struct dlm_write_request) -
			       sizeof(struct dlm_write_request32)), GFP_KERNEL);
-		if (!kbuf)
+		if (!kbuf) {
+			kfree(k32buf);
			return -ENOMEM;
+		}
 
		if (proc)
			set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags);
@@ -539,8 +554,10 @@ static ssize_t device_write(struct file *file, const char __user *buf,
 
	/* do we really need this? can a write happen after a close? */
	if ((kbuf->cmd == DLM_USER_LOCK || kbuf->cmd == DLM_USER_UNLOCK) &&
-	    (proc && test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags)))
-		return -EINVAL;
+	    (proc && test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))) {
+		error = -EINVAL;
+		goto out_free;
+	}
 
	sigfillset(&allsigs);
	sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
@@ -619,17 +636,13 @@ static int device_open(struct inode *inode, struct file *file)
	struct dlm_user_proc *proc;
	struct dlm_ls *ls;
 
-	lock_kernel();
	ls = dlm_find_lockspace_device(iminor(inode));
-	if (!ls) {
-		unlock_kernel();
+	if (!ls)
		return -ENOENT;
-	}
 
	proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL);
	if (!proc) {
		dlm_put_lockspace(ls);
-		unlock_kernel();
		return -ENOMEM;
	}
 
@@ -641,7 +654,6 @@ static int device_open(struct inode *inode, struct file *file)
	spin_lock_init(&proc->locks_spin);
	init_waitqueue_head(&proc->wait);
	file->private_data = proc;
-	unlock_kernel();
 
	return 0;
 }
@@ -874,9 +886,28 @@ static unsigned int device_poll(struct file *file, poll_table *wait)
	return 0;
 }
 
+int dlm_user_daemon_available(void)
+{
+	/* dlm_controld hasn't started (or, has started, but not
+	   properly populated configfs) */
+
+	if (!dlm_our_nodeid())
+		return 0;
+
+	/* This is to deal with versions of dlm_controld that don't
+	   know about the monitor device.  We assume that if the
+	   dlm_controld was started (above), but the monitor device
+	   was never opened, that it's an old version.  dlm_controld
+	   should open the monitor device before populating configfs. */
+
+	if (dlm_monitor_unused)
+		return 1;
+
+	return atomic_read(&dlm_monitor_opened) ? 1 : 0;
+}
+
 static int ctl_device_open(struct inode *inode, struct file *file)
 {
-	cycle_kernel_lock();
	file->private_data = NULL;
	return 0;
 }
@@ -886,6 +917,20 @@ static int ctl_device_close(struct inode *inode, struct file *file)
	return 0;
 }
 
+static int monitor_device_open(struct inode *inode, struct file *file)
+{
+	atomic_inc(&dlm_monitor_opened);
+	dlm_monitor_unused = 0;
+	return 0;
+}
+
+static int monitor_device_close(struct inode *inode, struct file *file)
+{
+	if (atomic_dec_and_test(&dlm_monitor_opened))
+		dlm_stop_lockspaces();
+	return 0;
+}
+
 static const struct file_operations device_fops = {
	.open    = device_open,
	.release = device_close,
@@ -909,19 +954,42 @@ static struct miscdevice ctl_device = {
	.minor = MISC_DYNAMIC_MINOR,
 };
 
+static const struct file_operations monitor_device_fops = {
+	.open    = monitor_device_open,
+	.release = monitor_device_close,
+	.owner   = THIS_MODULE,
+};
+
+static struct miscdevice monitor_device = {
+	.name  = "dlm-monitor",
+	.fops  = &monitor_device_fops,
+	.minor = MISC_DYNAMIC_MINOR,
+};
+
 int __init dlm_user_init(void)
 {
	int error;
 
+	atomic_set(&dlm_monitor_opened, 0);
+
	error = misc_register(&ctl_device);
-	if (error)
+	if (error) {
		log_print("misc_register failed for control device");
+		goto out;
+	}
 
+	error = misc_register(&monitor_device);
+	if (error) {
+		log_print("misc_register failed for monitor device");
+		misc_deregister(&ctl_device);
+	}
+ out:
	return error;
 }
 
 void dlm_user_exit(void)
 {
	misc_deregister(&ctl_device);
+	misc_deregister(&monitor_device);
 }
 
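The new dlm-monitor device above gives the kernel a liveness signal for dlm_controld: the daemon opens the device and holds it; when the last opener closes it, dlm_stop_lockspaces() runs. A userspace sketch of that last-close pattern using C11 atomics; stop_everything() is a placeholder standing in for dlm_stop_lockspaces(), not a real API.

#include <stdio.h>
#include <stdatomic.h>

static atomic_int monitor_opened;

static void stop_everything(void)
{
	printf("last close: stopping lockspaces\n");
}

static void monitor_open(void)
{
	atomic_fetch_add(&monitor_opened, 1);
}

static void monitor_close(void)
{
	/* fetch_sub returns the old value; 1 means we were the last opener */
	if (atomic_fetch_sub(&monitor_opened, 1) == 1)
		stop_everything();
}

int main(void)
{
	monitor_open();          /* daemon starts and opens the device */
	monitor_open();          /* e.g. a second instance */
	monitor_close();         /* nothing happens yet */
	monitor_close();         /* count hits zero -> shutdown path */
	return 0;
}
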
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index d38e9f3e4151..35eb6a13d616 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2006 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2006-2008 Red Hat, Inc.  All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
@@ -12,5 +12,7 @@
 void dlm_user_add_ast(struct dlm_lkb *lkb, int type);
 int dlm_user_init(void);
 void dlm_user_exit(void);
+int dlm_device_deregister(struct dlm_ls *ls);
+int dlm_user_daemon_available(void);
 
 #endif
diff --git a/fs/dquot.c b/fs/dquot.c
index 8ec4d6cc7633..ad7e59003e04 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -895,10 +895,9 @@ static void print_warning(struct dquot *dquot, const int warntype)
	    warntype == QUOTA_NL_BSOFTBELOW || !need_print_warning(dquot))
		return;
 
-	mutex_lock(&tty_mutex);
	tty = get_current_tty();
	if (!tty)
-		goto out_lock;
+		return;
	tty_write_message(tty, dquot->dq_sb->s_id);
	if (warntype == QUOTA_NL_ISOFTWARN || warntype == QUOTA_NL_BSOFTWARN)
		tty_write_message(tty, ": warning, ");
@@ -926,8 +925,7 @@ static void print_warning(struct dquot *dquot, const int warntype)
		break;
	}
	tty_write_message(tty, msg);
-out_lock:
-	mutex_unlock(&tty_mutex);
+	tty_kref_put(tty);
 }
 #endif
 
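The dquot change above swaps a global tty_mutex held across the whole warning for reference counting: get_current_tty() now returns a counted reference that the caller pairs with tty_kref_put(). A userspace sketch of that get/put discipline; toy_tty and its helpers are invented stand-ins, and the kernel's kref_put frees through a release callback rather than printing.

#include <stdio.h>
#include <stdatomic.h>

struct toy_tty {
	atomic_int refcount;
	const char *name;
};

static struct toy_tty *tty_get(struct toy_tty *tty)
{
	if (tty)
		atomic_fetch_add(&tty->refcount, 1);
	return tty;
}

static void tty_put(struct toy_tty *tty)
{
	/* old value 1 means this put dropped the last reference */
	if (tty && atomic_fetch_sub(&tty->refcount, 1) == 1)
		printf("%s: released\n", tty->name);
}

static void print_warning(struct toy_tty *current_tty)
{
	struct toy_tty *tty = tty_get(current_tty);  /* get_current_tty() */

	if (!tty)
		return;
	printf("%s: quota exceeded\n", tty->name);
	tty_put(tty);                                /* tty_kref_put() */
}

int main(void)
{
	struct toy_tty tty = { .refcount = 1, .name = "pts/0" };

	print_warning(&tty);
	tty_put(&tty);   /* drop the original reference */
	return 0;
}
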
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 448dfd597b5f..8ebe9a5d1d99 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -211,7 +211,7 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
       ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata,
       ecryptfs_opt_encrypted_view, ecryptfs_opt_err };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
	{ecryptfs_opt_sig, "sig=%s"},
	{ecryptfs_opt_ecryptfs_sig, "ecryptfs_sig=%s"},
	{ecryptfs_opt_cipher, "cipher=%s"},
diff --git a/fs/efs/namei.c b/fs/efs/namei.c
index 3a404e7fad53..291abb11e20e 100644
--- a/fs/efs/namei.c
+++ b/fs/efs/namei.c
@@ -74,8 +74,7 @@ struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, struct namei
	}
	unlock_kernel();
 
-	d_add(dentry, inode);
-	return NULL;
+	return d_splice_alias(inode, dentry);
 }
 
 static struct inode *efs_nfs_get_inode(struct super_block *sb, u64 ino,
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 567b134fa1f1..73b19cfc91fc 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -341,8 +341,6 @@ static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) {
		   sb->inode_blocks *
		   (EFS_BLOCKSIZE / sizeof(struct efs_dinode));
	buf->f_ffree   = sb->inode_free;	/* free inodes */
-	buf->f_fsid.val[0] = (sb->fs_magic >> 16) & 0xffff;	/* fs ID */
-	buf->f_fsid.val[1] =  sb->fs_magic        & 0xffff;	/* fs ID */
	buf->f_namelen = EFS_MAXNAMELEN;	/* max filename length */
 
	return 0;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 0c87474f7917..7cc0eb756b55 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1041,10 +1041,7 @@ retry:
 }
 
 /*
- * It opens an eventpoll file descriptor. The "size" parameter is there
- * for historical reasons, when epoll was using an hash instead of an
- * RB tree. With the current implementation, the "size" parameter is ignored
- * (besides sanity checks).
+ * Open an eventpoll file descriptor.
 */
 asmlinkage long sys_epoll_create1(int flags)
 {
diff --git a/fs/exec.c b/fs/exec.c
index 32993beecbe9..cecee501ce78 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -752,11 +752,11 @@ static int exec_mmap(struct mm_struct *mm)
	tsk->active_mm = mm;
	activate_mm(active_mm, mm);
	task_unlock(tsk);
-	mm_update_next_owner(old_mm);
	arch_pick_mmap_layout(mm);
	if (old_mm) {
		up_read(&old_mm->mmap_sem);
		BUG_ON(active_mm != old_mm);
+		mm_update_next_owner(old_mm);
		mmput(old_mm);
		return 0;
	}
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 47d88da2d33b..bae998c1e44e 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -133,6 +133,8 @@ extern void ext2_truncate (struct inode *);
 extern int ext2_setattr (struct dentry *, struct iattr *);
 extern void ext2_set_inode_flags(struct inode *inode);
 extern void ext2_get_inode_flags(struct ext2_inode_info *);
+extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		       u64 start, u64 len);
 int __ext2_write_begin(struct file *file, struct address_space *mapping,
		loff_t pos, unsigned len, unsigned flags,
		struct page **pagep, void **fsdata);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 5f2fa9c36293..45ed07122182 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -86,4 +86,5 @@ const struct inode_operations ext2_file_inode_operations = {
 #endif
 	.setattr	= ext2_setattr,
 	.permission	= ext2_permission,
+	.fiemap		= ext2_fiemap,
 };
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 991d6dfeb51f..7658b33e2653 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -31,6 +31,7 @@
 #include <linux/writeback.h>
 #include <linux/buffer_head.h>
 #include <linux/mpage.h>
+#include <linux/fiemap.h>
 #include "ext2.h"
 #include "acl.h"
 #include "xip.h"
@@ -704,6 +705,13 @@ int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_
 
 }
 
+int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		u64 start, u64 len)
+{
+	return generic_block_fiemap(inode, fieinfo, start, len,
+				    ext2_get_block);
+}
+
 static int ext2_writepage(struct page *page, struct writeback_control *wbc)
 {
 	return block_write_full_page(page, ext2_get_block, wbc);
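
Note: generic_block_fiemap() walks the file with the filesystem's get_block callback and fills in the caller's extent array, so wiring up ->fiemap amounts to the few lines above. A userspace sketch of calling the resulting ioctl; error handling is minimal and the extent count of 16 is arbitrary:

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	struct fiemap *fm;
	int fd = open(argv[1], O_RDONLY);

	fm = calloc(1, sizeof(*fm) + 16 * sizeof(struct fiemap_extent));
	fm->fm_length = ~0ULL;		/* map the whole file */
	fm->fm_extent_count = 16;	/* room for 16 extents */
	if (fd >= 0 && fm && ioctl(fd, FS_IOC_FIEMAP, fm) == 0)
		printf("%u extents mapped\n", fm->fm_mapped_extents);
	return 0;
}
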
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index fd88c7b43e66..647cd888ac87 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -393,7 +393,7 @@ enum {
 	Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_bsd_df, "bsddf"},
 	{Opt_minix_df, "minixdf"},
 	{Opt_grpid, "grpid"},
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index acc4913d3019..3be1e0689c9a 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -134,5 +134,6 @@ const struct inode_operations ext3_file_inode_operations = {
 	.removexattr	= generic_removexattr,
 #endif
 	.permission	= ext3_permission,
+	.fiemap		= ext3_fiemap,
 };
 
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 507d8689b111..ebfec4d0148e 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -36,6 +36,7 @@
 #include <linux/mpage.h>
 #include <linux/uio.h>
 #include <linux/bio.h>
+#include <linux/fiemap.h>
 #include "xattr.h"
 #include "acl.h"
 
@@ -981,6 +982,13 @@ out:
 	return ret;
 }
 
+int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		u64 start, u64 len)
+{
+	return generic_block_fiemap(inode, fieinfo, start, len,
+				    ext3_get_block);
+}
+
 /*
  * `handle' can be NULL if create is zero
  */
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index f38a5afc39a1..399a96a6c556 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -760,7 +760,7 @@ enum {
 	Opt_grpquota
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_bsd_df, "bsddf"},
 	{Opt_minix_df, "minixdf"},
 	{Opt_grpid, "grpid"},
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index ac6fa8ca0a2f..a8ff003a00f7 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -2,12 +2,12 @@
 # Makefile for the linux ext4-filesystem routines.
 #
 
-obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o
+obj-$(CONFIG_EXT4_FS) += ext4.o
 
-ext4dev-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
 		   ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
 		   ext4_jbd2.o migrate.o mballoc.o
 
-ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
-ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o
-ext4dev-$(CONFIG_EXT4DEV_FS_SECURITY) += xattr_security.o
+ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
+ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
+ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index cd2b855a07d6..cb45257a246e 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -51,18 +51,18 @@ static inline int ext4_acl_count(size_t size)
 	}
 }
 
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 
 /* Value for inode->u.ext4_i.i_acl and inode->u.ext4_i.i_default_acl
    if the ACL has not been cached */
 #define EXT4_ACL_NOT_CACHED ((void *)-1)
 
 /* acl.c */
-extern int ext4_permission (struct inode *, int);
-extern int ext4_acl_chmod (struct inode *);
-extern int ext4_init_acl (handle_t *, struct inode *, struct inode *);
+extern int ext4_permission(struct inode *, int);
+extern int ext4_acl_chmod(struct inode *);
+extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
 
-#else  /* CONFIG_EXT4DEV_FS_POSIX_ACL */
+#else  /* CONFIG_EXT4_FS_POSIX_ACL */
 #include <linux/sched.h>
 #define ext4_permission NULL
 
@@ -77,5 +77,5 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
 {
 	return 0;
 }
-#endif  /* CONFIG_EXT4DEV_FS_POSIX_ACL */
+#endif  /* CONFIG_EXT4_FS_POSIX_ACL */
 
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1ae5004e93fc..bd2ece228827 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -83,6 +83,7 @@ static int ext4_group_used_meta_blocks(struct super_block *sb,
 	}
 	return used_blocks;
 }
+
 /* Initializes an uninitialized block bitmap if given, and returns the
  * number of blocks free in the group. */
 unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
@@ -132,7 +133,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 	 */
 		group_blocks = ext4_blocks_count(sbi->s_es) -
 			le32_to_cpu(sbi->s_es->s_first_data_block) -
-			(EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count -1));
+			(EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count - 1));
 	} else {
 		group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
 	}
@@ -200,20 +201,20 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
  * @bh:			pointer to the buffer head to store the block
  *			group descriptor
  */
-struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
+struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 					     ext4_group_t block_group,
-					     struct buffer_head ** bh)
+					     struct buffer_head **bh)
 {
 	unsigned long group_desc;
 	unsigned long offset;
-	struct ext4_group_desc * desc;
+	struct ext4_group_desc *desc;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 	if (block_group >= sbi->s_groups_count) {
-		ext4_error (sb, "ext4_get_group_desc",
+		ext4_error(sb, "ext4_get_group_desc",
 			    "block_group >= groups_count - "
 			    "block_group = %lu, groups_count = %lu",
 			    block_group, sbi->s_groups_count);
 
 		return NULL;
 	}
@@ -222,10 +223,10 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
 	group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
 	offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
 	if (!sbi->s_group_desc[group_desc]) {
-		ext4_error (sb, "ext4_get_group_desc",
+		ext4_error(sb, "ext4_get_group_desc",
 			    "Group descriptor not loaded - "
 			    "block_group = %lu, group_desc = %lu, desc = %lu",
 			     block_group, group_desc, offset);
 		return NULL;
 	}
 
@@ -302,8 +303,8 @@ err_out:
 struct buffer_head *
 ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 {
-	struct ext4_group_desc * desc;
-	struct buffer_head * bh = NULL;
+	struct ext4_group_desc *desc;
+	struct buffer_head *bh = NULL;
 	ext4_fsblk_t bitmap_blk;
 
 	desc = ext4_get_group_desc(sb, block_group, NULL);
@@ -318,9 +319,11 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 			    block_group, bitmap_blk);
 		return NULL;
 	}
-	if (bh_uptodate_or_lock(bh))
+	if (buffer_uptodate(bh) &&
+	    !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
 		return bh;
 
+	lock_buffer(bh);
 	spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
 	if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 		ext4_init_block_bitmap(sb, bh, block_group, desc);
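
Note: the dropped bh_uptodate_or_lock() tests only the buffer's uptodate bit, which is no longer sufficient once a group can carry BLOCK_UNINIT: an uptodate buffer may still need its bitmap synthesized in memory. For reference, the helper as it reads in fs/buffer.c around this kernel version (quoted from memory, so treat as approximate):

int bh_uptodate_or_lock(struct buffer_head *bh)
{
	if (!buffer_uptodate(bh)) {
		lock_buffer(bh);
		if (!buffer_uptodate(bh))
			return 0;	/* caller must read and unlock */
		unlock_buffer(bh);
	}
	return 1;			/* already valid, not locked */
}
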
@@ -345,301 +348,6 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
  */
 	return bh;
 }
-/*
- * The reservation window structure operations
- * --------------------------------------------
- * Operations include:
- * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
- *
- * We use a red-black tree to represent per-filesystem reservation
- * windows.
- *
- */
-
-/**
- * __rsv_window_dump() -- Dump the filesystem block allocation reservation map
- * @rb_root:		root of per-filesystem reservation rb tree
- * @verbose:		verbose mode
- * @fn:			function which wishes to dump the reservation map
- *
- * If verbose is turned on, it will print the whole block reservation
- * windows(start, end). Otherwise, it will only print out the "bad" windows,
- * those windows that overlap with their immediate neighbors.
- */
-#if 1
-static void __rsv_window_dump(struct rb_root *root, int verbose,
-			      const char *fn)
-{
-	struct rb_node *n;
-	struct ext4_reserve_window_node *rsv, *prev;
-	int bad;
-
-restart:
-	n = rb_first(root);
-	bad = 0;
-	prev = NULL;
-
-	printk("Block Allocation Reservation Windows Map (%s):\n", fn);
-	while (n) {
-		rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
-		if (verbose)
-			printk("reservation window 0x%p "
-			       "start: %llu, end: %llu\n",
-			       rsv, rsv->rsv_start, rsv->rsv_end);
-		if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
-			printk("Bad reservation %p (start >= end)\n",
-			       rsv);
-			bad = 1;
-		}
-		if (prev && prev->rsv_end >= rsv->rsv_start) {
-			printk("Bad reservation %p (prev->end >= start)\n",
-			       rsv);
-			bad = 1;
-		}
-		if (bad) {
-			if (!verbose) {
-				printk("Restarting reservation walk in verbose mode\n");
-				verbose = 1;
-				goto restart;
-			}
-		}
-		n = rb_next(n);
-		prev = rsv;
-	}
-	printk("Window map complete.\n");
-	BUG_ON(bad);
-}
-#define rsv_window_dump(root, verbose) \
-	__rsv_window_dump((root), (verbose), __func__)
-#else
-#define rsv_window_dump(root, verbose) do {} while (0)
-#endif
-
-/**
- * goal_in_my_reservation()
- * @rsv:		inode's reservation window
- * @grp_goal:		given goal block relative to the allocation block group
- * @group:		the current allocation block group
- * @sb:			filesystem super block
- *
- * Test if the given goal block (group relative) is within the file's
- * own block reservation window range.
- *
- * If the reservation window is outside the goal allocation group, return 0;
- * grp_goal (given goal block) could be -1, which means no specific
- * goal block. In this case, always return 1.
- * If the goal block is within the reservation window, return 1;
- * otherwise, return 0;
- */
-static int
-goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal,
-			ext4_group_t group, struct super_block *sb)
-{
-	ext4_fsblk_t group_first_block, group_last_block;
-
-	group_first_block = ext4_group_first_block_no(sb, group);
-	group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
-
-	if ((rsv->_rsv_start > group_last_block) ||
-	    (rsv->_rsv_end < group_first_block))
-		return 0;
-	if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
-		|| (grp_goal + group_first_block > rsv->_rsv_end)))
-		return 0;
-	return 1;
-}
-
-/**
- * search_reserve_window()
- * @rb_root:		root of reservation tree
- * @goal:		target allocation block
- *
- * Find the reserved window which includes the goal, or the previous one
- * if the goal is not in any window.
- * Returns NULL if there are no windows or if all windows start after the goal.
- */
-static struct ext4_reserve_window_node *
-search_reserve_window(struct rb_root *root, ext4_fsblk_t goal)
-{
-	struct rb_node *n = root->rb_node;
-	struct ext4_reserve_window_node *rsv;
-
-	if (!n)
-		return NULL;
-
-	do {
-		rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
-
-		if (goal < rsv->rsv_start)
-			n = n->rb_left;
-		else if (goal > rsv->rsv_end)
-			n = n->rb_right;
-		else
-			return rsv;
-	} while (n);
-	/*
-	 * We've fallen off the end of the tree: the goal wasn't inside
-	 * any particular node.  OK, the previous node must be to one
-	 * side of the interval containing the goal.  If it's the RHS,
-	 * we need to back up one.
-	 */
-	if (rsv->rsv_start > goal) {
-		n = rb_prev(&rsv->rsv_node);
-		rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
-	}
-	return rsv;
-}
-
-/**
- * ext4_rsv_window_add() -- Insert a window to the block reservation rb tree.
- * @sb:			super block
- * @rsv:		reservation window to add
- *
- * Must be called with rsv_lock hold.
- */
-void ext4_rsv_window_add(struct super_block *sb,
-		    struct ext4_reserve_window_node *rsv)
-{
-	struct rb_root *root = &EXT4_SB(sb)->s_rsv_window_root;
-	struct rb_node *node = &rsv->rsv_node;
-	ext4_fsblk_t start = rsv->rsv_start;
-
-	struct rb_node ** p = &root->rb_node;
-	struct rb_node * parent = NULL;
-	struct ext4_reserve_window_node *this;
-
-	while (*p)
-	{
-		parent = *p;
-		this = rb_entry(parent, struct ext4_reserve_window_node, rsv_node);
-
-		if (start < this->rsv_start)
-			p = &(*p)->rb_left;
-		else if (start > this->rsv_end)
-			p = &(*p)->rb_right;
-		else {
-			rsv_window_dump(root, 1);
-			BUG();
-		}
-	}
-
-	rb_link_node(node, parent, p);
-	rb_insert_color(node, root);
-}
-
-/**
- * ext4_rsv_window_remove() -- unlink a window from the reservation rb tree
- * @sb:			super block
- * @rsv:		reservation window to remove
- *
- * Mark the block reservation window as not allocated, and unlink it
- * from the filesystem reservation window rb tree. Must be called with
- * rsv_lock hold.
- */
-static void rsv_window_remove(struct super_block *sb,
-			      struct ext4_reserve_window_node *rsv)
-{
-	rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-	rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-	rsv->rsv_alloc_hit = 0;
-	rb_erase(&rsv->rsv_node, &EXT4_SB(sb)->s_rsv_window_root);
-}
-
-/*
- * rsv_is_empty() -- Check if the reservation window is allocated.
- * @rsv:		given reservation window to check
- *
- * returns 1 if the end block is EXT4_RESERVE_WINDOW_NOT_ALLOCATED.
- */
-static inline int rsv_is_empty(struct ext4_reserve_window *rsv)
-{
-	/* a valid reservation end block could not be 0 */
-	return rsv->_rsv_end == EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-}
-
-/**
- * ext4_init_block_alloc_info()
- * @inode:		file inode structure
- *
- * Allocate and initialize the	reservation window structure, and
- * link the window to the ext4 inode structure at last
- *
- * The reservation window structure is only dynamically allocated
- * and linked to ext4 inode the	first time the open file
- * needs a new block. So, before every ext4_new_block(s) call, for
- * regular files, we should check whether the reservation window
- * structure exists or not. In the latter case, this function is called.
- * Fail to do so will result in block reservation being turned off for that
- * open file.
- *
- * This function is called from ext4_get_blocks_handle(), also called
- * when setting the reservation window size through ioctl before the file
- * is open for write (needs block allocation).
- *
- * Needs down_write(i_data_sem) protection prior to call this function.
- */
-void ext4_init_block_alloc_info(struct inode *inode)
-{
-	struct ext4_inode_info *ei = EXT4_I(inode);
-	struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
-	struct super_block *sb = inode->i_sb;
-
-	block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
-	if (block_i) {
-		struct ext4_reserve_window_node *rsv = &block_i->rsv_window_node;
-
-		rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-		rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-
-		/*
-		 * if filesystem is mounted with NORESERVATION, the goal
-		 * reservation window size is set to zero to indicate
-		 * block reservation is off
-		 */
-		if (!test_opt(sb, RESERVATION))
-			rsv->rsv_goal_size = 0;
-		else
-			rsv->rsv_goal_size = EXT4_DEFAULT_RESERVE_BLOCKS;
-		rsv->rsv_alloc_hit = 0;
-		block_i->last_alloc_logical_block = 0;
-		block_i->last_alloc_physical_block = 0;
-	}
-	ei->i_block_alloc_info = block_i;
-}
-
-/**
- * ext4_discard_reservation()
- * @inode:		inode
- *
- * Discard(free) block reservation window on last file close, or truncate
- * or at last iput().
- *
- * It is being called in three cases:
- *	ext4_release_file(): last writer close the file
- *	ext4_clear_inode(): last iput(), when nobody link to this file.
- *	ext4_truncate(): when the block indirect map is about to change.
- *
- */
-void ext4_discard_reservation(struct inode *inode)
-{
-	struct ext4_inode_info *ei = EXT4_I(inode);
-	struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
-	struct ext4_reserve_window_node *rsv;
-	spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock;
-
-	ext4_mb_discard_inode_preallocations(inode);
-
-	if (!block_i)
-		return;
-
-	rsv = &block_i->rsv_window_node;
-	if (!rsv_is_empty(&rsv->rsv_window)) {
-		spin_lock(rsv_lock);
-		if (!rsv_is_empty(&rsv->rsv_window))
-			rsv_window_remove(inode->i_sb, rsv);
-		spin_unlock(rsv_lock);
-	}
-}
 
 /**
  * ext4_free_blocks_sb() -- Free given blocks and update quota
@@ -648,6 +356,13 @@ void ext4_discard_reservation(struct inode *inode)
  * @block:		start physcial block to free
  * @count:		number of blocks to free
  * @pdquot_freed_blocks:	pointer to quota
+ *
+ * XXX This function is only used by the on-line resizing code, which
+ * should probably be fixed up to call the mballoc variant.  There
+ * this needs to be cleaned up later; in fact, I'm not convinced this
+ * is 100% correct in the face of the mballoc code.  The online resizing
+ * code needs to be fixed up to more tightly (and correctly) interlock
+ * with the mballoc code.
  */
 void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
 			 ext4_fsblk_t block, unsigned long count,
@@ -659,8 +374,8 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
 	ext4_grpblk_t bit;
 	unsigned long i;
 	unsigned long overflow;
-	struct ext4_group_desc * desc;
-	struct ext4_super_block * es;
+	struct ext4_group_desc *desc;
+	struct ext4_super_block *es;
 	struct ext4_sb_info *sbi;
 	int err = 0, ret;
 	ext4_grpblk_t group_freed;
@@ -671,13 +386,13 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
 	if (block < le32_to_cpu(es->s_first_data_block) ||
 	    block + count < block ||
 	    block + count > ext4_blocks_count(es)) {
-		ext4_error (sb, "ext4_free_blocks",
+		ext4_error(sb, "ext4_free_blocks",
 			    "Freeing blocks not in datazone - "
 			    "block = %llu, count = %lu", block, count);
 		goto error_return;
 	}
 
-	ext4_debug ("freeing block(s) %llu-%llu\n", block, block + count - 1);
+	ext4_debug("freeing block(s) %llu-%llu\n", block, block + count - 1);
 
 do_more:
 	overflow = 0;
@@ -694,7 +409,7 @@ do_more:
 	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
 	if (!bitmap_bh)
 		goto error_return;
-	desc = ext4_get_group_desc (sb, block_group, &gd_bh);
+	desc = ext4_get_group_desc(sb, block_group, &gd_bh);
 	if (!desc)
 		goto error_return;
 
@@ -703,10 +418,10 @@ do_more:
 	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
 	    in_range(block + count - 1, ext4_inode_table(sb, desc),
 		     sbi->s_itb_per_group)) {
-		ext4_error (sb, "ext4_free_blocks",
+		ext4_error(sb, "ext4_free_blocks",
 			    "Freeing blocks in system zones - "
 			    "Block = %llu, count = %lu",
 			    block, count);
 		goto error_return;
 	}
 
@@ -848,7 +563,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 		      ext4_fsblk_t block, unsigned long count,
 		      int metadata)
 {
-	struct super_block * sb;
+	struct super_block *sb;
 	unsigned long dquot_freed_blocks;
 
 	/* this isn't the right place to decide whether block is metadata
@@ -859,748 +574,52 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 
 	sb = inode->i_sb;
 
-	if (!test_opt(sb, MBALLOC) || !EXT4_SB(sb)->s_group_info)
-		ext4_free_blocks_sb(handle, sb, block, count,
-					&dquot_freed_blocks);
-	else
-		ext4_mb_free_blocks(handle, inode, block, count,
-				    metadata, &dquot_freed_blocks);
+	ext4_mb_free_blocks(handle, inode, block, count,
+			    metadata, &dquot_freed_blocks);
 	if (dquot_freed_blocks)
 		DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
 	return;
 }
 
-/**
- * ext4_test_allocatable()
- * @nr:			given allocation block group
- * @bh:			bufferhead contains the bitmap of the given block group
- *
- * For ext4 allocations, we must not reuse any blocks which are
- * allocated in the bitmap buffer's "last committed data" copy.  This
- * prevents deletes from freeing up the page for reuse until we have
- * committed the delete transaction.
- *
- * If we didn't do this, then deleting something and reallocating it as
- * data would allow the old block to be overwritten before the
- * transaction committed (because we force data to disk before commit).
- * This would lead to corruption if we crashed between overwriting the
- * data and committing the delete.
- *
- * @@@ We may want to make this allocation behaviour conditional on
- * data-writes at some point, and disable it for metadata allocations or
- * sync-data inodes.
- */
-static int ext4_test_allocatable(ext4_grpblk_t nr, struct buffer_head *bh)
-{
-	int ret;
-	struct journal_head *jh = bh2jh(bh);
-
-	if (ext4_test_bit(nr, bh->b_data))
-		return 0;
-
-	jbd_lock_bh_state(bh);
-	if (!jh->b_committed_data)
-		ret = 1;
-	else
-		ret = !ext4_test_bit(nr, jh->b_committed_data);
-	jbd_unlock_bh_state(bh);
-	return ret;
-}
-
-/**
- * bitmap_search_next_usable_block()
- * @start:		the starting block (group relative) of the search
- * @bh:			bufferhead contains the block group bitmap
- * @maxblocks:		the ending block (group relative) of the reservation
- *
- * The bitmap search --- search forward alternately through the actual
- * bitmap on disk and the last-committed copy in journal, until we find a
- * bit free in both bitmaps.
- */
-static ext4_grpblk_t
-bitmap_search_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
-			ext4_grpblk_t maxblocks)
-{
-	ext4_grpblk_t next;
-	struct journal_head *jh = bh2jh(bh);
-
-	while (start < maxblocks) {
-		next = ext4_find_next_zero_bit(bh->b_data, maxblocks, start);
-		if (next >= maxblocks)
-			return -1;
-		if (ext4_test_allocatable(next, bh))
-			return next;
-		jbd_lock_bh_state(bh);
-		if (jh->b_committed_data)
-			start = ext4_find_next_zero_bit(jh->b_committed_data,
-							maxblocks, next);
-		jbd_unlock_bh_state(bh);
-	}
-	return -1;
-}
-
-/**
- * find_next_usable_block()
- * @start:		the starting block (group relative) to find next
- *			allocatable block in bitmap.
- * @bh:			bufferhead contains the block group bitmap
- * @maxblocks:		the ending block (group relative) for the search
- *
- * Find an allocatable block in a bitmap.  We honor both the bitmap and
- * its last-committed copy (if that exists), and perform the "most
- * appropriate allocation" algorithm of looking for a free block near
- * the initial goal; then for a free byte somewhere in the bitmap; then
- * for any free bit in the bitmap.
- */
-static ext4_grpblk_t
-find_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
-			ext4_grpblk_t maxblocks)
-{
-	ext4_grpblk_t here, next;
-	char *p, *r;
-
-	if (start > 0) {
-		/*
-		 * The goal was occupied; search forward for a free
-		 * block within the next XX blocks.
-		 *
-		 * end_goal is more or less random, but it has to be
-		 * less than EXT4_BLOCKS_PER_GROUP. Aligning up to the
-		 * next 64-bit boundary is simple..
-		 */
-		ext4_grpblk_t end_goal = (start + 63) & ~63;
-		if (end_goal > maxblocks)
-			end_goal = maxblocks;
-		here = ext4_find_next_zero_bit(bh->b_data, end_goal, start);
-		if (here < end_goal && ext4_test_allocatable(here, bh))
-			return here;
-		ext4_debug("Bit not found near goal\n");
-	}
-
-	here = start;
-	if (here < 0)
-		here = 0;
-
-	p = ((char *)bh->b_data) + (here >> 3);
-	r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
-	next = (r - ((char *)bh->b_data)) << 3;
-
-	if (next < maxblocks && next >= start && ext4_test_allocatable(next, bh))
-		return next;
-
-	/*
-	 * The bitmap search --- search forward alternately through the actual
-	 * bitmap and the last-committed copy until we find a bit free in
-	 * both
-	 */
-	here = bitmap_search_next_usable_block(here, bh, maxblocks);
-	return here;
-}
-
-/**
- * claim_block()
- * @block:		the free block (group relative) to allocate
- * @bh:			the bufferhead containts the block group bitmap
- *
- * We think we can allocate this block in this bitmap.  Try to set the bit.
- * If that succeeds then check that nobody has allocated and then freed the
- * block since we saw that is was not marked in b_committed_data.  If it _was_
- * allocated and freed then clear the bit in the bitmap again and return
- * zero (failure).
- */
-static inline int
-claim_block(spinlock_t *lock, ext4_grpblk_t block, struct buffer_head *bh)
-{
-	struct journal_head *jh = bh2jh(bh);
-	int ret;
-
-	if (ext4_set_bit_atomic(lock, block, bh->b_data))
-		return 0;
-	jbd_lock_bh_state(bh);
-	if (jh->b_committed_data && ext4_test_bit(block,jh->b_committed_data)) {
-		ext4_clear_bit_atomic(lock, block, bh->b_data);
-		ret = 0;
-	} else {
-		ret = 1;
-	}
-	jbd_unlock_bh_state(bh);
-	return ret;
-}
-
-/**
- * ext4_try_to_allocate()
- * @sb:			superblock
- * @handle:		handle to this transaction
- * @group:		given allocation block group
- * @bitmap_bh:		bufferhead holds the block bitmap
- * @grp_goal:		given target block within the group
- * @count:		target number of blocks to allocate
- * @my_rsv:		reservation window
- *
- * Attempt to allocate blocks within a give range. Set the range of allocation
- * first, then find the first free bit(s) from the bitmap (within the range),
- * and at last, allocate the blocks by claiming the found free bit as allocated.
- *
- * To set the range of this allocation:
- *	if there is a reservation window, only try to allocate block(s) from the
- *	file's own reservation window;
- *	Otherwise, the allocation range starts from the give goal block, ends at
- *	the block group's last block.
- *
- * If we failed to allocate the desired block then we may end up crossing to a
- * new bitmap.  In that case we must release write access to the old one via
- * ext4_journal_release_buffer(), else we'll run out of credits.
- */
-static ext4_grpblk_t
-ext4_try_to_allocate(struct super_block *sb, handle_t *handle,
-			ext4_group_t group, struct buffer_head *bitmap_bh,
-			ext4_grpblk_t grp_goal, unsigned long *count,
-			struct ext4_reserve_window *my_rsv)
-{
-	ext4_fsblk_t group_first_block;
-	ext4_grpblk_t start, end;
-	unsigned long num = 0;
-
-	/* we do allocation within the reservation window if we have a window */
-	if (my_rsv) {
-		group_first_block = ext4_group_first_block_no(sb, group);
-		if (my_rsv->_rsv_start >= group_first_block)
-			start = my_rsv->_rsv_start - group_first_block;
-		else
-			/* reservation window cross group boundary */
-			start = 0;
-		end = my_rsv->_rsv_end - group_first_block + 1;
-		if (end > EXT4_BLOCKS_PER_GROUP(sb))
-			/* reservation window crosses group boundary */
-			end = EXT4_BLOCKS_PER_GROUP(sb);
-		if ((start <= grp_goal) && (grp_goal < end))
-			start = grp_goal;
-		else
-			grp_goal = -1;
-	} else {
-		if (grp_goal > 0)
-			start = grp_goal;
-		else
-			start = 0;
-		end = EXT4_BLOCKS_PER_GROUP(sb);
-	}
-
-	BUG_ON(start > EXT4_BLOCKS_PER_GROUP(sb));
-
-repeat:
-	if (grp_goal < 0 || !ext4_test_allocatable(grp_goal, bitmap_bh)) {
-		grp_goal = find_next_usable_block(start, bitmap_bh, end);
-		if (grp_goal < 0)
-			goto fail_access;
-		if (!my_rsv) {
-			int i;
-
-			for (i = 0; i < 7 && grp_goal > start &&
-				ext4_test_allocatable(grp_goal - 1,
-							bitmap_bh);
-				i++, grp_goal--)
-				;
-		}
-	}
-	start = grp_goal;
-
-	if (!claim_block(sb_bgl_lock(EXT4_SB(sb), group),
-		grp_goal, bitmap_bh)) {
-		/*
-		 * The block was allocated by another thread, or it was
-		 * allocated and then freed by another thread
-		 */
-		start++;
-		grp_goal++;
-		if (start >= end)
-			goto fail_access;
-		goto repeat;
-	}
-	num++;
-	grp_goal++;
-	while (num < *count && grp_goal < end
-		&& ext4_test_allocatable(grp_goal, bitmap_bh)
-		&& claim_block(sb_bgl_lock(EXT4_SB(sb), group),
-				grp_goal, bitmap_bh)) {
-		num++;
-		grp_goal++;
-	}
-	*count = num;
-	return grp_goal - num;
-fail_access:
-	*count = num;
-	return -1;
-}
-
-/**
- * find_next_reservable_window():
- *	find a reservable space within the given range.
- *	It does not allocate the reservation window for now:
- *	alloc_new_reservation() will do the work later.
- *
- *	@search_head: the head of the searching list;
- *		This is not necessarily the list head of the whole filesystem
- *
- *		We have both head and start_block to assist the search
- *		for the reservable space. The list starts from head,
- *		but we will shift to the place where start_block is,
- *		then start from there, when looking for a reservable space.
- *
- *	@size: the target new reservation window size
- *
- *	@group_first_block: the first block we consider to start
- *	the real search from
- *
- *	@last_block:
- *		the maximum block number that our goal reservable space
- *		could start from. This is normally the last block in this
- *		group. The search will end when we found the start of next
- *		possible reservable space is out of this boundary.
- *		This could handle the cross boundary reservation window
- *		request.
- *
- *	basically we search from the given range, rather than the whole
- *	reservation double linked list, (start_block, last_block)
- *	to find a free region that is of my size and has not
- *	been reserved.
- *
- */
-static int find_next_reservable_window(
-				struct ext4_reserve_window_node *search_head,
-				struct ext4_reserve_window_node *my_rsv,
-				struct super_block * sb,
-				ext4_fsblk_t start_block,
-				ext4_fsblk_t last_block)
-{
-	struct rb_node *next;
-	struct ext4_reserve_window_node *rsv, *prev;
-	ext4_fsblk_t cur;
-	int size = my_rsv->rsv_goal_size;
-
-	/* TODO: make the start of the reservation window byte-aligned */
-	/* cur = *start_block & ~7;*/
-	cur = start_block;
-	rsv = search_head;
-	if (!rsv)
-		return -1;
-
-	while (1) {
-		if (cur <= rsv->rsv_end)
-			cur = rsv->rsv_end + 1;
-
-		/* TODO?
-		 * in the case we could not find a reservable space
-		 * that is what is expected, during the re-search, we could
-		 * remember what's the largest reservable space we could have
-		 * and return that one.
-		 *
-		 * For now it will fail if we could not find the reservable
-		 * space with expected-size (or more)...
-		 */
-		if (cur > last_block)
-			return -1;		/* fail */
-
-		prev = rsv;
-		next = rb_next(&rsv->rsv_node);
-		rsv = rb_entry(next,struct ext4_reserve_window_node,rsv_node);
-
-		/*
-		 * Reached the last reservation, we can just append to the
-		 * previous one.
-		 */
-		if (!next)
-			break;
-
-		if (cur + size <= rsv->rsv_start) {
-			/*
-			 * Found a reserveable space big enough.  We could
-			 * have a reservation across the group boundary here
-			 */
-			break;
-		}
-	}
-	/*
-	 * we come here either :
-	 * when we reach the end of the whole list,
-	 * and there is empty reservable space after last entry in the list.
-	 * append it to the end of the list.
-	 *
-	 * or we found one reservable space in the middle of the list,
-	 * return the reservation window that we could append to.
-	 * succeed.
-	 */
-
-	if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window)))
-		rsv_window_remove(sb, my_rsv);
-
-	/*
-	 * Let's book the whole avaliable window for now.  We will check the
-	 * disk bitmap later and then, if there are free blocks then we adjust
-	 * the window size if it's larger than requested.
-	 * Otherwise, we will remove this node from the tree next time
-	 * call find_next_reservable_window.
-	 */
-	my_rsv->rsv_start = cur;
-	my_rsv->rsv_end = cur + size - 1;
-	my_rsv->rsv_alloc_hit = 0;
-
-	if (prev != my_rsv)
-		ext4_rsv_window_add(sb, my_rsv);
-
-	return 0;
-}
+int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
+			   s64 nblocks)
+{
+	s64 free_blocks, dirty_blocks;
+	s64 root_blocks = 0;
+	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
+	struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
+
+	free_blocks = percpu_counter_read_positive(fbc);
+	dirty_blocks = percpu_counter_read_positive(dbc);
+
+	if (!capable(CAP_SYS_RESOURCE) &&
+	    sbi->s_resuid != current->fsuid &&
+	    (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
+		root_blocks = ext4_r_blocks_count(sbi->s_es);
+
+	if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
+						EXT4_FREEBLOCKS_WATERMARK) {
+		free_blocks = percpu_counter_sum(fbc);
+		dirty_blocks = percpu_counter_sum(dbc);
+		if (dirty_blocks < 0) {
+			printk(KERN_CRIT "Dirty block accounting "
+					"went wrong %lld\n",
+					dirty_blocks);
+		}
+	}
+	/* Check whether we have space after
+	 *  accounting for current dirty blocks
+	 */
+	if (free_blocks < ((root_blocks + nblocks) + dirty_blocks))
+		/* we don't have free space */
+		return -ENOSPC;
+
+	/* Add the blocks to nblocks */
+	percpu_counter_add(dbc, nblocks);
+	return 0;
+}
 
 /**
- * alloc_new_reservation()--allocate a new reservation window
- *
- * To make a new reservation, we search part of the filesystem
- * reservation list (the list that inside the group). We try to
- * allocate a new reservation window near the allocation goal,
- * or the beginning of the group, if there is no goal.
- *
- * We first find a reservable space after the goal, then from
- * there, we check the bitmap for the first free block after
- * it. If there is no free block until the end of group, then the
- * whole group is full, we failed. Otherwise, check if the free
- * block is inside the expected reservable space, if so, we
- * succeed.
- * If the first free block is outside the reservable space, then
- * start from the first free block, we search for next available
- * space, and go on.
- *
- * on succeed, a new reservation will be found and inserted into the list
- * It contains at least one free block, and it does not overlap with other
- * reservation windows.
- *
- * failed: we failed to find a reservation window in this group
- *
- * @rsv: the reservation
- *
- * @grp_goal: The goal (group-relative).  It is where the search for a
- *	free reservable space should start from.
- *	if we have a grp_goal(grp_goal >0 ), then start from there,
- *	no grp_goal(grp_goal = -1), we start from the first block
- *	of the group.
- *
- * @sb: the super block
- * @group: the group we are trying to allocate in
- * @bitmap_bh: the block group block bitmap
- *
- */
-static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv,
-		ext4_grpblk_t grp_goal, struct super_block *sb,
-		ext4_group_t group, struct buffer_head *bitmap_bh)
-{
-	struct ext4_reserve_window_node *search_head;
-	ext4_fsblk_t group_first_block, group_end_block, start_block;
-	ext4_grpblk_t first_free_block;
-	struct rb_root *fs_rsv_root = &EXT4_SB(sb)->s_rsv_window_root;
-	unsigned long size;
-	int ret;
-	spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
-
-	group_first_block = ext4_group_first_block_no(sb, group);
-	group_end_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
-
-	if (grp_goal < 0)
-		start_block = group_first_block;
-	else
-		start_block = grp_goal + group_first_block;
-
-	size = my_rsv->rsv_goal_size;
-
-	if (!rsv_is_empty(&my_rsv->rsv_window)) {
-		/*
-		 * if the old reservation is cross group boundary
-		 * and if the goal is inside the old reservation window,
-		 * we will come here when we just failed to allocate from
-		 * the first part of the window. We still have another part
-		 * that belongs to the next group. In this case, there is no
-		 * point to discard our window and try to allocate a new one
-		 * in this group(which will fail). we should
-		 * keep the reservation window, just simply move on.
-		 *
-		 * Maybe we could shift the start block of the reservation
-		 * window to the first block of next group.
-		 */
-
-		if ((my_rsv->rsv_start <= group_end_block) &&
-				(my_rsv->rsv_end > group_end_block) &&
-				(start_block >= my_rsv->rsv_start))
-			return -1;
-
-		if ((my_rsv->rsv_alloc_hit >
-		     (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
-			/*
-			 * if the previously allocation hit ratio is
-			 * greater than 1/2, then we double the size of
-			 * the reservation window the next time,
-			 * otherwise we keep the same size window
-			 */
-			size = size * 2;
-			if (size > EXT4_MAX_RESERVE_BLOCKS)
-				size = EXT4_MAX_RESERVE_BLOCKS;
-			my_rsv->rsv_goal_size= size;
-		}
-	}
-
-	spin_lock(rsv_lock);
-	/*
-	 * shift the search start to the window near the goal block
-	 */
-	search_head = search_reserve_window(fs_rsv_root, start_block);
-
-	/*
-	 * find_next_reservable_window() simply finds a reservable window
-	 * inside the given range(start_block, group_end_block).
-	 *
-	 * To make sure the reservation window has a free bit inside it, we
-	 * need to check the bitmap after we found a reservable window.
-	 */
-retry:
-	ret = find_next_reservable_window(search_head, my_rsv, sb,
-						start_block, group_end_block);
-
-	if (ret == -1) {
-		if (!rsv_is_empty(&my_rsv->rsv_window))
-			rsv_window_remove(sb, my_rsv);
-		spin_unlock(rsv_lock);
-		return -1;
-	}
-
-	/*
-	 * On success, find_next_reservable_window() returns the
-	 * reservation window where there is a reservable space after it.
-	 * Before we reserve this reservable space, we need
-	 * to make sure there is at least a free block inside this region.
-	 *
-	 * searching the first free bit on the block bitmap and copy of
-	 * last committed bitmap alternatively, until we found a allocatable
-	 * block. Search start from the start block of the reservable space
-	 * we just found.
-	 */
-	spin_unlock(rsv_lock);
-	first_free_block = bitmap_search_next_usable_block(
-			my_rsv->rsv_start - group_first_block,
-			bitmap_bh, group_end_block - group_first_block + 1);
-
-	if (first_free_block < 0) {
-		/*
-		 * no free block left on the bitmap, no point
-		 * to reserve the space. return failed.
-		 */
-		spin_lock(rsv_lock);
-		if (!rsv_is_empty(&my_rsv->rsv_window))
-			rsv_window_remove(sb, my_rsv);
-		spin_unlock(rsv_lock);
-		return -1;		/* failed */
-	}
-
-	start_block = first_free_block + group_first_block;
-	/*
-	 * check if the first free block is within the
-	 * free space we just reserved
-	 */
-	if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end)
-		return 0;		/* success */
-	/*
-	 * if the first free bit we found is out of the reservable space
-	 * continue search for next reservable space,
-	 * start from where the free block is,
-	 * we also shift the list head to where we stopped last time
-	 */
-	search_head = my_rsv;
-	spin_lock(rsv_lock);
-	goto retry;
-}
-
-/**
- * try_to_extend_reservation()
- * @my_rsv:		given reservation window
- * @sb:			super block
- * @size:		the delta to extend
- *
- * Attempt to expand the reservation window large enough to have
- * required number of free blocks
- *
- * Since ext4_try_to_allocate() will always allocate blocks within
- * the reservation window range, if the window size is too small,
- * multiple blocks allocation has to stop at the end of the reservation
- * window. To make this more efficient, given the total number of
- * blocks needed and the current size of the window, we try to
- * expand the reservation window size if necessary on a best-effort
- * basis before ext4_new_blocks() tries to allocate blocks,
- */
-static void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv,
-			struct super_block *sb, int size)
-{
-	struct ext4_reserve_window_node *next_rsv;
-	struct rb_node *next;
-	spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
-
-	if (!spin_trylock(rsv_lock))
-		return;
-
-	next = rb_next(&my_rsv->rsv_node);
-
-	if (!next)
-		my_rsv->rsv_end += size;
-	else {
-		next_rsv = rb_entry(next, struct ext4_reserve_window_node, rsv_node);
-
-		if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size)
-			my_rsv->rsv_end += size;
-		else
-			my_rsv->rsv_end = next_rsv->rsv_start - 1;
-	}
-	spin_unlock(rsv_lock);
-}
-
-/**
- * ext4_try_to_allocate_with_rsv()
- * @sb:			superblock
- * @handle:		handle to this transaction
- * @group:		given allocation block group
- * @bitmap_bh:		bufferhead holds the block bitmap
- * @grp_goal:		given target block within the group
- * @count:		target number of blocks to allocate
- * @my_rsv:		reservation window
- * @errp:		pointer to store the error code
- *
- * This is the main function used to allocate a new block and its reservation
- * window.
- *
- * Each time when a new block allocation is need, first try to allocate from
- * its own reservation.  If it does not have a reservation window, instead of
- * looking for a free bit on bitmap first, then look up the reservation list to
- * see if it is inside somebody else's reservation window, we try to allocate a
- * reservation window for it starting from the goal first. Then do the block
- * allocation within the reservation window.
- *
- * This will avoid keeping on searching the reservation list again and
- * again when somebody is looking for a free block (without
- * reservation), and there are lots of free blocks, but they are all
- * being reserved.
- *
- * We use a red-black tree for the per-filesystem reservation list.
- *
- */
-static ext4_grpblk_t
-ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
-			ext4_group_t group, struct buffer_head *bitmap_bh,
-			ext4_grpblk_t grp_goal,
-			struct ext4_reserve_window_node * my_rsv,
-			unsigned long *count, int *errp)
-{
-	ext4_fsblk_t group_first_block, group_last_block;
-	ext4_grpblk_t ret = 0;
-	int fatal;
-	unsigned long num = *count;
-
-	*errp = 0;
-
-	/*
-	 * Make sure we use undo access for the bitmap, because it is critical
-	 * that we do the frozen_data COW on bitmap buffers in all cases even
-	 * if the buffer is in BJ_Forget state in the committing transaction.
-	 */
-	BUFFER_TRACE(bitmap_bh, "get undo access for new block");
-	fatal = ext4_journal_get_undo_access(handle, bitmap_bh);
-	if (fatal) {
-		*errp = fatal;
-		return -1;
-	}
-
-	/*
-	 * we don't deal with reservation when
-	 * filesystem is mounted without reservation
-	 * or the file is not a regular file
-	 * or last attempt to allocate a block with reservation turned on failed
-	 */
-	if (my_rsv == NULL ) {
-		ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
-						grp_goal, count, NULL);
-		goto out;
-	}
-	/*
-	 * grp_goal is a group relative block number (if there is a goal)
-	 * 0 <= grp_goal < EXT4_BLOCKS_PER_GROUP(sb)
-	 * first block is a filesystem wide block number
-	 * first block is the block number of the first block in this group
-	 */
-	group_first_block = ext4_group_first_block_no(sb, group);
-	group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
-
-	/*
-	 * Basically we will allocate a new block from inode's reservation
-	 * window.
-	 *
-	 * We need to allocate a new reservation window, if:
-	 * a) inode does not have a reservation window; or
-	 * b) last attempt to allocate a block from existing reservation
-	 *    failed; or
-	 * c) we come here with a goal and with a reservation window
-	 *
-	 * We do not need to allocate a new reservation window if we come here
-	 * at the beginning with a goal and the goal is inside the window, or
-	 * we don't have a goal but already have a reservation window.
-	 * then we could go to allocate from the reservation window directly.
-	 */
-	while (1) {
-		if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
-			!goal_in_my_reservation(&my_rsv->rsv_window,
-						grp_goal, group, sb)) {
-			if (my_rsv->rsv_goal_size < *count)
-				my_rsv->rsv_goal_size = *count;
-			ret = alloc_new_reservation(my_rsv, grp_goal, sb,
-							group, bitmap_bh);
-			if (ret < 0)
-				break;			/* failed */
-
-			if (!goal_in_my_reservation(&my_rsv->rsv_window,
-							grp_goal, group, sb))
-				grp_goal = -1;
-		} else if (grp_goal >= 0) {
-			int curr = my_rsv->rsv_end -
-				   (grp_goal + group_first_block) + 1;
-
-			if (curr < *count)
-				try_to_extend_reservation(my_rsv, sb,
-							  *count - curr);
-		}
-
-		if ((my_rsv->rsv_start > group_last_block) ||
-		    (my_rsv->rsv_end < group_first_block)) {
-			rsv_window_dump(&EXT4_SB(sb)->s_rsv_window_root, 1);
-			BUG();
-		}
-		ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
-					   grp_goal, &num, &my_rsv->rsv_window);
-		if (ret >= 0) {
-			my_rsv->rsv_alloc_hit += num;
-			*count = num;
-			break;			/* succeed */
-		}
-		num = *count;
-	}
-out:
-	if (ret >= 0) {
-		BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
-					"bitmap block");
-		fatal = ext4_journal_dirty_metadata(handle, bitmap_bh);
-		if (fatal) {
-			*errp = fatal;
-			return -1;
-		}
-		return ret;
-	}
-
-	BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
-	ext4_journal_release_buffer(handle, bitmap_bh);
-	return ret;
-}
-
-/**
  * ext4_has_free_blocks()
  * @sbi:	in-core super block structure.
  * @nblocks:	number of neeed blocks
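
Note: ext4_claim_free_blocks(), added in the hunk above, is the delayed-allocation claim step; callers reserve nblocks into s_dirtyblocks_counter up front and only convert them to real allocations later. A condensed sketch of that pattern with invented names (watermark plays the role of EXT4_FREEBLOCKS_WATERMARK); illustrative only:

#include <linux/percpu_counter.h>
#include <linux/errno.h>

/* illustrative only */
static int sketch_claim(struct percpu_counter *free_ctr,
			struct percpu_counter *dirty_ctr,
			s64 wanted, s64 watermark)
{
	s64 avail = percpu_counter_read_positive(free_ctr) -
		    percpu_counter_read_positive(dirty_ctr);

	if (avail - wanted < watermark)	/* near the edge: pay for exact sums */
		avail = percpu_counter_sum(free_ctr) -
			percpu_counter_sum(dirty_ctr);
	if (avail < wanted)
		return -ENOSPC;
	percpu_counter_add(dirty_ctr, wanted);	/* claim the blocks */
	return 0;
}
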
@@ -1610,26 +629,34 @@ out:
  * On success, return nblocks
  */
 ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
-			ext4_fsblk_t nblocks)
+			s64 nblocks)
 {
-	ext4_fsblk_t free_blocks;
-	ext4_fsblk_t root_blocks = 0;
+	s64 free_blocks, dirty_blocks;
+	s64 root_blocks = 0;
+	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
+	struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
 
-	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+	free_blocks = percpu_counter_read_positive(fbc);
+	dirty_blocks = percpu_counter_read_positive(dbc);
 
 	if (!capable(CAP_SYS_RESOURCE) &&
 	    sbi->s_resuid != current->fsuid &&
 	    (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
 		root_blocks = ext4_r_blocks_count(sbi->s_es);
-#ifdef CONFIG_SMP
-	if (free_blocks - root_blocks < FBC_BATCH)
-		free_blocks =
-			percpu_counter_sum_and_set(&sbi->s_freeblocks_counter);
-#endif
-	if (free_blocks - root_blocks < nblocks)
-		return free_blocks - root_blocks;
+
+	if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
+						EXT4_FREEBLOCKS_WATERMARK) {
+		free_blocks = percpu_counter_sum(fbc);
+		dirty_blocks = percpu_counter_sum(dbc);
+	}
+	if (free_blocks <= (root_blocks + dirty_blocks))
+		/* we don't have free space */
+		return 0;
+
+	if (free_blocks - (root_blocks + dirty_blocks) < nblocks)
+		return free_blocks - (root_blocks + dirty_blocks);
 	return nblocks;
- }
+}
 
 
 /**
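
Note on the hunk above: an approximate per-CPU counter read can be stale, so the rewrite falls back to the exact (and expensive) percpu_counter_sum() only when the cheap answer is within EXT4_FREEBLOCKS_WATERMARK of exhaustion; the old code did the same dance with FBC_BATCH under CONFIG_SMP. A back-of-envelope sketch of the drift bound the watermark has to cover (illustrative, not from this commit):

#include <linux/percpu_counter.h>

/* With a per-CPU batch of B and N online CPUs, a plain counter read
 * can be off by up to about N * B; any safety margin must cover that. */
static inline s64 sketch_worst_case_drift(unsigned int nr_cpus, s64 batch)
{
	return (s64)nr_cpus * batch;
}
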
@@ -1654,303 +681,6 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
1654 return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); 681 return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
1655} 682}
1656 683
1657/**
1658 * ext4_old_new_blocks() -- core block bitmap based block allocation function
1659 *
1660 * @handle: handle to this transaction
1661 * @inode: file inode
1662 * @goal: given target block(filesystem wide)
1663 * @count: target number of blocks to allocate
1664 * @errp: error code
1665 *
1666 * ext4_old_new_blocks uses a goal block to assist allocation and look up
1667 * the block bitmap directly to do block allocation. It tries to
1668 * allocate block(s) from the block group contains the goal block first. If
1669 * that fails, it will try to allocate block(s) from other block groups
1670 * without any specific goal block.
1671 *
1672 * This function is called when -o nomballoc mount option is enabled
1673 *
1674 */
1675ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
1676 ext4_fsblk_t goal, unsigned long *count, int *errp)
1677{
1678 struct buffer_head *bitmap_bh = NULL;
1679 struct buffer_head *gdp_bh;
1680 ext4_group_t group_no;
1681 ext4_group_t goal_group;
1682 ext4_grpblk_t grp_target_blk; /* blockgroup relative goal block */
1683 ext4_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/
1684 ext4_fsblk_t ret_block; /* filesyetem-wide allocated block */
1685 ext4_group_t bgi; /* blockgroup iteration index */
1686 int fatal = 0, err;
1687 int performed_allocation = 0;
1688 ext4_grpblk_t free_blocks; /* number of free blocks in a group */
1689 struct super_block *sb;
1690 struct ext4_group_desc *gdp;
1691 struct ext4_super_block *es;
1692 struct ext4_sb_info *sbi;
1693 struct ext4_reserve_window_node *my_rsv = NULL;
1694 struct ext4_block_alloc_info *block_i;
1695 unsigned short windowsz = 0;
1696 ext4_group_t ngroups;
1697 unsigned long num = *count;
1698
1699 sb = inode->i_sb;
1700 if (!sb) {
1701 *errp = -ENODEV;
1702 printk("ext4_new_block: nonexistent device");
1703 return 0;
1704 }
1705
1706 sbi = EXT4_SB(sb);
1707 if (!EXT4_I(inode)->i_delalloc_reserved_flag) {
1708 /*
1709 * With delalloc we already reserved the blocks
1710 */
1711 *count = ext4_has_free_blocks(sbi, *count);
1712 }
1713 if (*count == 0) {
1714 *errp = -ENOSPC;
1715 return 0; /*return with ENOSPC error */
1716 }
1717 num = *count;
1718
1719 /*
1720 * Check quota for allocation of this block.
1721 */
1722 if (DQUOT_ALLOC_BLOCK(inode, num)) {
1723 *errp = -EDQUOT;
1724 return 0;
1725 }
1726
1727 sbi = EXT4_SB(sb);
1728 es = EXT4_SB(sb)->s_es;
1729 ext4_debug("goal=%llu.\n", goal);
1730 /*
1731 * Allocate a block from reservation only when
1732 * filesystem is mounted with reservation(default,-o reservation), and
1733 * it's a regular file, and
1734 * the desired window size is greater than 0 (One could use ioctl
1735 * command EXT4_IOC_SETRSVSZ to set the window size to 0 to turn off
1736 * reservation on that particular file)
1737 */
1738 block_i = EXT4_I(inode)->i_block_alloc_info;
1739 if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
1740 my_rsv = &block_i->rsv_window_node;
1741
1742 /*
1743 * First, test whether the goal block is free.
1744 */
1745 if (goal < le32_to_cpu(es->s_first_data_block) ||
1746 goal >= ext4_blocks_count(es))
1747 goal = le32_to_cpu(es->s_first_data_block);
1748 ext4_get_group_no_and_offset(sb, goal, &group_no, &grp_target_blk);
1749 goal_group = group_no;
1750retry_alloc:
1751 gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
1752 if (!gdp)
1753 goto io_error;
1754
1755 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1756 /*
1757 * if there are not enough free blocks to make a new reservation,
1758 * turn off reservation for this allocation
1759 */
1760 if (my_rsv && (free_blocks < windowsz)
1761 && (rsv_is_empty(&my_rsv->rsv_window)))
1762 my_rsv = NULL;
1763
1764 if (free_blocks > 0) {
1765 bitmap_bh = ext4_read_block_bitmap(sb, group_no);
1766 if (!bitmap_bh)
1767 goto io_error;
1768 grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
1769 group_no, bitmap_bh, grp_target_blk,
1770 my_rsv, &num, &fatal);
1771 if (fatal)
1772 goto out;
1773 if (grp_alloc_blk >= 0)
1774 goto allocated;
1775 }
1776
1777 ngroups = EXT4_SB(sb)->s_groups_count;
1778 smp_rmb();
1779
1780 /*
1781 * Now search the rest of the groups. We assume that
1782 * group_no and gdp correctly point to the last group visited.
1783 */
1784 for (bgi = 0; bgi < ngroups; bgi++) {
1785 group_no++;
1786 if (group_no >= ngroups)
1787 group_no = 0;
1788 gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
1789 if (!gdp)
1790 goto io_error;
1791 free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1792 /*
1793 * skip this group if the number of
1794 * free blocks is less than half of the reservation
1795 * window size.
1796 */
1797 if (free_blocks <= (windowsz/2))
1798 continue;
1799
1800 brelse(bitmap_bh);
1801 bitmap_bh = ext4_read_block_bitmap(sb, group_no);
1802 if (!bitmap_bh)
1803 goto io_error;
1804 /*
1805 * try to allocate block(s) from this group, without a goal (-1).
1806 */
1807 grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
1808 group_no, bitmap_bh, -1, my_rsv,
1809 &num, &fatal);
1810 if (fatal)
1811 goto out;
1812 if (grp_alloc_blk >= 0)
1813 goto allocated;
1814 }
1815 /*
1816 * We may end up with a bogus earlier ENOSPC error because the
1817 * filesystem is "full" of reservations, while
1818 * there may indeed be free blocks available on disk.
1819 * In this case, we just forget about the reservations and
1820 * do block allocation as if there were no reservations.
1821 */
1822 if (my_rsv) {
1823 my_rsv = NULL;
1824 windowsz = 0;
1825 group_no = goal_group;
1826 goto retry_alloc;
1827 }
1828 /* No space left on the device */
1829 *errp = -ENOSPC;
1830 goto out;
1831
1832allocated:
1833
1834 ext4_debug("using block group %lu(%d)\n",
1835 group_no, gdp->bg_free_blocks_count);
1836
1837 BUFFER_TRACE(gdp_bh, "get_write_access");
1838 fatal = ext4_journal_get_write_access(handle, gdp_bh);
1839 if (fatal)
1840 goto out;
1841
1842 ret_block = grp_alloc_blk + ext4_group_first_block_no(sb, group_no);
1843
1844 if (in_range(ext4_block_bitmap(sb, gdp), ret_block, num) ||
1845 in_range(ext4_inode_bitmap(sb, gdp), ret_block, num) ||
1846 in_range(ret_block, ext4_inode_table(sb, gdp),
1847 EXT4_SB(sb)->s_itb_per_group) ||
1848 in_range(ret_block + num - 1, ext4_inode_table(sb, gdp),
1849 EXT4_SB(sb)->s_itb_per_group)) {
1850 ext4_error(sb, "ext4_new_block",
1851 "Allocating block in system zone - "
1852 "blocks from %llu, length %lu",
1853 ret_block, num);
1854 /*
1855 * claim_block marked the blocks we allocated
1856 * as in use. So we may want to selectively
1857 * mark some of the blocks as free
1858 */
1859 goto retry_alloc;
1860 }
1861
1862 performed_allocation = 1;
1863
1864#ifdef CONFIG_JBD2_DEBUG
1865 {
1866 struct buffer_head *debug_bh;
1867
1868 /* Record bitmap buffer state in the newly allocated block */
1869 debug_bh = sb_find_get_block(sb, ret_block);
1870 if (debug_bh) {
1871 BUFFER_TRACE(debug_bh, "state when allocated");
1872 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
1873 brelse(debug_bh);
1874 }
1875 }
1876 jbd_lock_bh_state(bitmap_bh);
1877 spin_lock(sb_bgl_lock(sbi, group_no));
1878 if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
1879 int i;
1880
1881 for (i = 0; i < num; i++) {
1882 if (ext4_test_bit(grp_alloc_blk+i,
1883 bh2jh(bitmap_bh)->b_committed_data)) {
1884 printk("%s: block was unexpectedly set in "
1885 "b_committed_data\n", __func__);
1886 }
1887 }
1888 }
1889 ext4_debug("found bit %d\n", grp_alloc_blk);
1890 spin_unlock(sb_bgl_lock(sbi, group_no));
1891 jbd_unlock_bh_state(bitmap_bh);
1892#endif
1893
1894 if (ret_block + num - 1 >= ext4_blocks_count(es)) {
1895 ext4_error(sb, "ext4_new_block",
1896 "block(%llu) >= blocks count(%llu) - "
1897 "block_group = %lu, es == %p ", ret_block,
1898 ext4_blocks_count(es), group_no, es);
1899 goto out;
1900 }
1901
1902 /*
1903 * It is up to the caller to add the new buffer to a journal
1904 * list of some description. We don't know in advance whether
1905 * the caller wants to use it as metadata or data.
1906 */
1907 spin_lock(sb_bgl_lock(sbi, group_no));
1908 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
1909 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
1910 le16_add_cpu(&gdp->bg_free_blocks_count, -num);
1911 gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
1912 spin_unlock(sb_bgl_lock(sbi, group_no));
1913 if (!EXT4_I(inode)->i_delalloc_reserved_flag)
1914 percpu_counter_sub(&sbi->s_freeblocks_counter, num);
1915
1916 if (sbi->s_log_groups_per_flex) {
1917 ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
1918 spin_lock(sb_bgl_lock(sbi, flex_group));
1919 sbi->s_flex_groups[flex_group].free_blocks -= num;
1920 spin_unlock(sb_bgl_lock(sbi, flex_group));
1921 }
1922
1923 BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
1924 err = ext4_journal_dirty_metadata(handle, gdp_bh);
1925 if (!fatal)
1926 fatal = err;
1927
1928 sb->s_dirt = 1;
1929 if (fatal)
1930 goto out;
1931
1932 *errp = 0;
1933 brelse(bitmap_bh);
1934 DQUOT_FREE_BLOCK(inode, *count-num);
1935 *count = num;
1936 return ret_block;
1937
1938io_error:
1939 *errp = -EIO;
1940out:
1941 if (fatal) {
1942 *errp = fatal;
1943 ext4_std_error(sb, fatal);
1944 }
1945 /*
1946 * Undo the block allocation
1947 */
1948 if (!performed_allocation)
1949 DQUOT_FREE_BLOCK(inode, *count);
1950 brelse(bitmap_bh);
1951 return 0;
1952}
1953
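For reference, the group-scan order the removed allocator used -- goal group first, then every other group with wrap-around -- can be sketched in isolation. try_group() below is a hypothetical stand-in for the bitmap search, not a kernel call:

	/* Standalone sketch of the removed allocator's group scan order. */
	#include <stdio.h>

	#define NGROUPS 8

	static int try_group(unsigned g)
	{
		return g == 5;	/* pretend only group 5 has free blocks */
	}

	int main(void)
	{
		unsigned goal_group = 6, group = goal_group, i;

		for (i = 0; i < NGROUPS; i++) {
			if (try_group(group)) {
				printf("allocated from group %u\n", group);
				return 0;
			}
			if (++group >= NGROUPS)
				group = 0;	/* wrap, like the bgi loop above */
		}
		printf("ENOSPC\n");
		return 1;
	}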
1954#define EXT4_META_BLOCK 0x1 684#define EXT4_META_BLOCK 0x1
1955 685
1956static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode, 686static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
@@ -1960,10 +690,6 @@ static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
1960 struct ext4_allocation_request ar; 690 struct ext4_allocation_request ar;
1961 ext4_fsblk_t ret; 691 ext4_fsblk_t ret;
1962 692
1963 if (!test_opt(inode->i_sb, MBALLOC)) {
1964 return ext4_old_new_blocks(handle, inode, goal, count, errp);
1965 }
1966
1967 memset(&ar, 0, sizeof(ar)); 693 memset(&ar, 0, sizeof(ar));
1968 /* Fill with neighbour allocated blocks */ 694 /* Fill with neighbour allocated blocks */
1969 695
@@ -2005,7 +731,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
2005 /* 731 /*
2006 * Account for the allocated meta blocks 732 * Account for the allocated meta blocks
2007 */ 733 */
2008 if (!(*errp)) { 734 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
2009 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 735 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2010 EXT4_I(inode)->i_allocated_meta_blocks += *count; 736 EXT4_I(inode)->i_allocated_meta_blocks += *count;
2011 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 737 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2090,10 +816,9 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
2090 bitmap_count += x; 816 bitmap_count += x;
2091 } 817 }
2092 brelse(bitmap_bh); 818 brelse(bitmap_bh);
2093 printk("ext4_count_free_blocks: stored = %llu" 819 printk(KERN_DEBUG "ext4_count_free_blocks: stored = %llu"
2094 ", computed = %llu, %llu\n", 820 ", computed = %llu, %llu\n", ext4_free_blocks_count(es),
2095 ext4_free_blocks_count(es), 821 desc_count, bitmap_count);
2096 desc_count, bitmap_count);
2097 return bitmap_count; 822 return bitmap_count;
2098#else 823#else
2099 desc_count = 0; 824 desc_count = 0;
@@ -2180,8 +905,9 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
2180 905
2181 if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) || 906 if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) ||
2182 metagroup < first_meta_bg) 907 metagroup < first_meta_bg)
2183 return ext4_bg_num_gdb_nometa(sb,group); 908 return ext4_bg_num_gdb_nometa(sb, group);
2184 909
2185 return ext4_bg_num_gdb_meta(sb,group); 910 return ext4_bg_num_gdb_meta(sb,group);
2186 911
2187} 912}
913
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index d37ea6750454..0a7a6663c190 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -15,17 +15,17 @@
15 15
16static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; 16static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
17 17
18unsigned long ext4_count_free (struct buffer_head * map, unsigned int numchars) 18unsigned long ext4_count_free(struct buffer_head *map, unsigned int numchars)
19{ 19{
20 unsigned int i; 20 unsigned int i;
21 unsigned long sum = 0; 21 unsigned long sum = 0;
22 22
23 if (!map) 23 if (!map)
24 return (0); 24 return 0;
25 for (i = 0; i < numchars; i++) 25 for (i = 0; i < numchars; i++)
26 sum += nibblemap[map->b_data[i] & 0xf] + 26 sum += nibblemap[map->b_data[i] & 0xf] +
27 nibblemap[(map->b_data[i] >> 4) & 0xf]; 27 nibblemap[(map->b_data[i] >> 4) & 0xf];
28 return (sum); 28 return sum;
29} 29}
30 30
31#endif /* EXT4FS_DEBUG */ 31#endif /* EXT4FS_DEBUG */
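The nibblemap trick above counts free (zero) bits two nibbles at a time: nibblemap[n] is the number of zero bits in the 4-bit value n. A self-contained userspace version of the same technique, for illustration:

	/* Standalone nibble-table count of ZERO bits, as used above. */
	#include <stdio.h>

	static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};

	static unsigned long count_free(const unsigned char *map, unsigned int numchars)
	{
		unsigned long sum = 0;
		unsigned int i;

		for (i = 0; i < numchars; i++)
			sum += nibblemap[map[i] & 0xf] + nibblemap[(map[i] >> 4) & 0xf];
		return sum;
	}

	int main(void)
	{
		unsigned char bitmap[] = { 0xff, 0x0f, 0x00 };	/* 0 + 4 + 8 free bits */
		printf("%lu free bits\n", count_free(bitmap, sizeof(bitmap)));	/* 12 */
		return 0;
	}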
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index d3d23d73c08b..3ca6a2b7632d 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -33,10 +33,10 @@ static unsigned char ext4_filetype_table[] = {
33}; 33};
34 34
35static int ext4_readdir(struct file *, void *, filldir_t); 35static int ext4_readdir(struct file *, void *, filldir_t);
36static int ext4_dx_readdir(struct file * filp, 36static int ext4_dx_readdir(struct file *filp,
37 void * dirent, filldir_t filldir); 37 void *dirent, filldir_t filldir);
38static int ext4_release_dir (struct inode * inode, 38static int ext4_release_dir(struct inode *inode,
39 struct file * filp); 39 struct file *filp);
40 40
41const struct file_operations ext4_dir_operations = { 41const struct file_operations ext4_dir_operations = {
42 .llseek = generic_file_llseek, 42 .llseek = generic_file_llseek,
@@ -61,12 +61,12 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
61} 61}
62 62
63 63
64int ext4_check_dir_entry (const char * function, struct inode * dir, 64int ext4_check_dir_entry(const char *function, struct inode *dir,
65 struct ext4_dir_entry_2 * de, 65 struct ext4_dir_entry_2 *de,
66 struct buffer_head * bh, 66 struct buffer_head *bh,
67 unsigned long offset) 67 unsigned long offset)
68{ 68{
69 const char * error_msg = NULL; 69 const char *error_msg = NULL;
70 const int rlen = ext4_rec_len_from_disk(de->rec_len); 70 const int rlen = ext4_rec_len_from_disk(de->rec_len);
71 71
72 if (rlen < EXT4_DIR_REC_LEN(1)) 72 if (rlen < EXT4_DIR_REC_LEN(1))
@@ -82,7 +82,7 @@ int ext4_check_dir_entry (const char * function, struct inode * dir,
82 error_msg = "inode out of bounds"; 82 error_msg = "inode out of bounds";
83 83
84 if (error_msg != NULL) 84 if (error_msg != NULL)
85 ext4_error (dir->i_sb, function, 85 ext4_error(dir->i_sb, function,
86 "bad entry in directory #%lu: %s - " 86 "bad entry in directory #%lu: %s - "
87 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", 87 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
88 dir->i_ino, error_msg, offset, 88 dir->i_ino, error_msg, offset,
@@ -91,8 +91,8 @@ int ext4_check_dir_entry (const char * function, struct inode * dir,
91 return error_msg == NULL ? 1 : 0; 91 return error_msg == NULL ? 1 : 0;
92} 92}
93 93
94static int ext4_readdir(struct file * filp, 94static int ext4_readdir(struct file *filp,
95 void * dirent, filldir_t filldir) 95 void *dirent, filldir_t filldir)
96{ 96{
97 int error = 0; 97 int error = 0;
98 unsigned long offset; 98 unsigned long offset;
@@ -102,6 +102,7 @@ static int ext4_readdir(struct file * filp,
102 int err; 102 int err;
103 struct inode *inode = filp->f_path.dentry->d_inode; 103 struct inode *inode = filp->f_path.dentry->d_inode;
104 int ret = 0; 104 int ret = 0;
105 int dir_has_error = 0;
105 106
106 sb = inode->i_sb; 107 sb = inode->i_sb;
107 108
@@ -148,9 +149,13 @@ static int ext4_readdir(struct file * filp,
148 * of recovering data when there's a bad sector 149 * of recovering data when there's a bad sector
149 */ 150 */
150 if (!bh) { 151 if (!bh) {
151 ext4_error (sb, "ext4_readdir", 152 if (!dir_has_error) {
152 "directory #%lu contains a hole at offset %lu", 153 ext4_error(sb, __func__, "directory #%lu "
153 inode->i_ino, (unsigned long)filp->f_pos); 154 "contains a hole at offset %Lu",
155 inode->i_ino,
156 (unsigned long long) filp->f_pos);
157 dir_has_error = 1;
158 }
154 /* corrupt size? Maybe no more blocks to read */ 159 /* corrupt size? Maybe no more blocks to read */
155 if (filp->f_pos > inode->i_blocks << 9) 160 if (filp->f_pos > inode->i_blocks << 9)
156 break; 161 break;
@@ -187,14 +192,14 @@ revalidate:
187 while (!error && filp->f_pos < inode->i_size 192 while (!error && filp->f_pos < inode->i_size
188 && offset < sb->s_blocksize) { 193 && offset < sb->s_blocksize) {
189 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); 194 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
190 if (!ext4_check_dir_entry ("ext4_readdir", inode, de, 195 if (!ext4_check_dir_entry("ext4_readdir", inode, de,
191 bh, offset)) { 196 bh, offset)) {
192 /* 197 /*
193 * On error, skip the f_pos to the next block 198 * On error, skip the f_pos to the next block
194 */ 199 */
195 filp->f_pos = (filp->f_pos | 200 filp->f_pos = (filp->f_pos |
196 (sb->s_blocksize - 1)) + 1; 201 (sb->s_blocksize - 1)) + 1;
197 brelse (bh); 202 brelse(bh);
198 ret = stored; 203 ret = stored;
199 goto out; 204 goto out;
200 } 205 }
@@ -218,12 +223,12 @@ revalidate:
218 break; 223 break;
219 if (version != filp->f_version) 224 if (version != filp->f_version)
220 goto revalidate; 225 goto revalidate;
221 stored ++; 226 stored++;
222 } 227 }
223 filp->f_pos += ext4_rec_len_from_disk(de->rec_len); 228 filp->f_pos += ext4_rec_len_from_disk(de->rec_len);
224 } 229 }
225 offset = 0; 230 offset = 0;
226 brelse (bh); 231 brelse(bh);
227 } 232 }
228out: 233out:
229 return ret; 234 return ret;
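The recovery path above advances f_pos to the start of the next block with (f_pos | (blocksize - 1)) + 1, which works for power-of-two block sizes: ORing in blocksize - 1 lands on the last byte of the current block, and adding one crosses into the next. For blocksize 4096 and f_pos 5000, 5000 | 4095 = 8191, so the new position is 8192. A tiny sketch of the trick:

	/* Power-of-two round-up to the NEXT block, as in the readdir code. */
	#include <stdio.h>

	static unsigned long next_block(unsigned long pos, unsigned long bs)
	{
		return (pos | (bs - 1)) + 1;	/* bs must be a power of two */
	}

	int main(void)
	{
		printf("%lu\n", next_block(5000, 4096));	/* 8192 */
		printf("%lu\n", next_block(8192, 4096));	/* 12288: aligned input still advances */
		return 0;
	}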
@@ -290,9 +295,9 @@ static void free_rb_tree_fname(struct rb_root *root)
290 parent = rb_parent(n); 295 parent = rb_parent(n);
291 fname = rb_entry(n, struct fname, rb_hash); 296 fname = rb_entry(n, struct fname, rb_hash);
292 while (fname) { 297 while (fname) {
293 struct fname * old = fname; 298 struct fname *old = fname;
294 fname = fname->next; 299 fname = fname->next;
295 kfree (old); 300 kfree(old);
296 } 301 }
297 if (!parent) 302 if (!parent)
298 root->rb_node = NULL; 303 root->rb_node = NULL;
@@ -331,7 +336,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
331 struct ext4_dir_entry_2 *dirent) 336 struct ext4_dir_entry_2 *dirent)
332{ 337{
333 struct rb_node **p, *parent = NULL; 338 struct rb_node **p, *parent = NULL;
334 struct fname * fname, *new_fn; 339 struct fname *fname, *new_fn;
335 struct dir_private_info *info; 340 struct dir_private_info *info;
336 int len; 341 int len;
337 342
@@ -388,19 +393,20 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
388 * for all entries on the fname linked list. (Normally there is only 393 * for all entries on the fname linked list. (Normally there is only
389 * one entry on the linked list, unless there are 62-bit hash collisions.) 394 * one entry on the linked list, unless there are 62-bit hash collisions.)
390 */ 395 */
391static int call_filldir(struct file * filp, void * dirent, 396static int call_filldir(struct file *filp, void *dirent,
392 filldir_t filldir, struct fname *fname) 397 filldir_t filldir, struct fname *fname)
393{ 398{
394 struct dir_private_info *info = filp->private_data; 399 struct dir_private_info *info = filp->private_data;
395 loff_t curr_pos; 400 loff_t curr_pos;
396 struct inode *inode = filp->f_path.dentry->d_inode; 401 struct inode *inode = filp->f_path.dentry->d_inode;
397 struct super_block * sb; 402 struct super_block *sb;
398 int error; 403 int error;
399 404
400 sb = inode->i_sb; 405 sb = inode->i_sb;
401 406
402 if (!fname) { 407 if (!fname) {
403 printk("call_filldir: called with null fname?!?\n"); 408 printk(KERN_ERR "ext4: call_filldir: called with "
409 "null fname?!?\n");
404 return 0; 410 return 0;
405 } 411 }
406 curr_pos = hash2pos(fname->hash, fname->minor_hash); 412 curr_pos = hash2pos(fname->hash, fname->minor_hash);
@@ -411,7 +417,7 @@ static int call_filldir(struct file * filp, void * dirent,
411 get_dtype(sb, fname->file_type)); 417 get_dtype(sb, fname->file_type));
412 if (error) { 418 if (error) {
413 filp->f_pos = curr_pos; 419 filp->f_pos = curr_pos;
414 info->extra_fname = fname->next; 420 info->extra_fname = fname;
415 return error; 421 return error;
416 } 422 }
417 fname = fname->next; 423 fname = fname->next;
@@ -419,8 +425,8 @@ static int call_filldir(struct file * filp, void * dirent,
419 return 0; 425 return 0;
420} 426}
421 427
422static int ext4_dx_readdir(struct file * filp, 428static int ext4_dx_readdir(struct file *filp,
423 void * dirent, filldir_t filldir) 429 void *dirent, filldir_t filldir)
424{ 430{
425 struct dir_private_info *info = filp->private_data; 431 struct dir_private_info *info = filp->private_data;
426 struct inode *inode = filp->f_path.dentry->d_inode; 432 struct inode *inode = filp->f_path.dentry->d_inode;
@@ -450,11 +456,21 @@ static int ext4_dx_readdir(struct file * filp,
450 * If there are any leftover names on the hash collision 456 * If there are any leftover names on the hash collision
451 * chain, return them first. 457 * chain, return them first.
452 */ 458 */
453 if (info->extra_fname && 459 if (info->extra_fname) {
454 call_filldir(filp, dirent, filldir, info->extra_fname)) 460 if (call_filldir(filp, dirent, filldir, info->extra_fname))
455 goto finished; 461 goto finished;
456 462
457 if (!info->curr_node) 463 info->extra_fname = NULL;
464 info->curr_node = rb_next(info->curr_node);
465 if (!info->curr_node) {
466 if (info->next_hash == ~0) {
467 filp->f_pos = EXT4_HTREE_EOF;
468 goto finished;
469 }
470 info->curr_hash = info->next_hash;
471 info->curr_minor_hash = 0;
472 }
473 } else if (!info->curr_node)
458 info->curr_node = rb_first(&info->root); 474 info->curr_node = rb_first(&info->root);
459 475
460 while (1) { 476 while (1) {
@@ -501,7 +517,7 @@ finished:
501 return 0; 517 return 0;
502} 518}
503 519
504static int ext4_release_dir (struct inode * inode, struct file * filp) 520static int ext4_release_dir(struct inode *inode, struct file *filp)
505{ 521{
506 if (filp->private_data) 522 if (filp->private_data)
507 ext4_htree_free_dir_info(filp->private_data); 523 ext4_htree_free_dir_info(filp->private_data);
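The one-line call_filldir change (saving fname rather than fname->next in info->extra_fname) matters because the entry the filldir callback rejected was never delivered; resuming from its successor would silently drop it. A minimal userspace model of that resume rule, with all names illustrative:

	/* Sketch of the resume rule behind the call_filldir fix: on rejection,
	 * remember THAT entry, not its successor, or it is lost on retry. */
	#include <stdio.h>
	#include <stddef.h>

	struct fname { int val; struct fname *next; };

	static int budget = 2;		/* consumer accepts two entries per pass */

	static int filldir(int val)
	{
		if (budget-- <= 0)
			return -1;	/* "buffer full", caller must retry later */
		printf("emitted %d\n", val);
		return 0;
	}

	static struct fname *emit_chain(struct fname *fname)
	{
		while (fname) {
			if (filldir(fname->val))
				return fname;	/* resume from the rejected entry */
			fname = fname->next;
		}
		return NULL;
	}

	int main(void)
	{
		struct fname c = {3, NULL}, b = {2, &c}, a = {1, &b};
		struct fname *extra = emit_chain(&a);	/* emits 1, 2; returns &c */

		budget = 2;
		emit_chain(extra);			/* second pass emits 3, loses nothing */
		return 0;
	}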
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6c7924d9e358..6690a41cdd9f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -44,9 +44,9 @@
44#ifdef EXT4FS_DEBUG 44#ifdef EXT4FS_DEBUG
45#define ext4_debug(f, a...) \ 45#define ext4_debug(f, a...) \
46 do { \ 46 do { \
47 printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ 47 printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \
48 __FILE__, __LINE__, __func__); \ 48 __FILE__, __LINE__, __func__); \
49 printk (KERN_DEBUG f, ## a); \ 49 printk(KERN_DEBUG f, ## a); \
50 } while (0) 50 } while (0)
51#else 51#else
52#define ext4_debug(f, a...) do {} while (0) 52#define ext4_debug(f, a...) do {} while (0)
@@ -128,7 +128,7 @@ struct ext4_allocation_request {
128#else 128#else
129# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) 129# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size)
130#endif 130#endif
131#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof (__u32)) 131#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32))
132#ifdef __KERNEL__ 132#ifdef __KERNEL__
133# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) 133# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits)
134#else 134#else
@@ -245,7 +245,7 @@ struct flex_groups {
245#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ 245#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
246 246
247#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ 247#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
248#define EXT4_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ 248#define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */
249 249
250/* 250/*
251 * Inode dynamic state flags 251 * Inode dynamic state flags
@@ -291,8 +291,6 @@ struct ext4_new_group_data {
291#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS 291#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS
292#define EXT4_IOC_GETVERSION _IOR('f', 3, long) 292#define EXT4_IOC_GETVERSION _IOR('f', 3, long)
293#define EXT4_IOC_SETVERSION _IOW('f', 4, long) 293#define EXT4_IOC_SETVERSION _IOW('f', 4, long)
294#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
295#define EXT4_IOC_GROUP_ADD _IOW('f', 8,struct ext4_new_group_input)
296#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION 294#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION
297#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION 295#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION
298#ifdef CONFIG_JBD2_DEBUG 296#ifdef CONFIG_JBD2_DEBUG
@@ -300,7 +298,10 @@ struct ext4_new_group_data {
300#endif 298#endif
301#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) 299#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long)
302#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) 300#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long)
303#define EXT4_IOC_MIGRATE _IO('f', 7) 301#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
302#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input)
303#define EXT4_IOC_MIGRATE _IO('f', 9)
304 /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
304 305
305/* 306/*
306 * ioctl commands in 32 bit emulation 307 * ioctl commands in 32 bit emulation
@@ -538,8 +539,9 @@ do { \
538#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ 539#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
539#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ 540#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
540#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ 541#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
541#define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */
542#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ 542#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
543#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
544
543/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ 545/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
544#ifndef _LINUX_EXT2_FS_H 546#ifndef _LINUX_EXT2_FS_H
545#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt 547#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
@@ -667,7 +669,7 @@ struct ext4_super_block {
667}; 669};
668 670
669#ifdef __KERNEL__ 671#ifdef __KERNEL__
670static inline struct ext4_sb_info * EXT4_SB(struct super_block *sb) 672static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
671{ 673{
672 return sb->s_fs_info; 674 return sb->s_fs_info;
673} 675}
@@ -725,11 +727,11 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
725 */ 727 */
726 728
727#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ 729#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \
728 ( EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) ) 730 (EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask))
729#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ 731#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \
730 ( EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) ) 732 (EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask))
731#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ 733#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \
732 ( EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) ) 734 (EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask))
733#define EXT4_SET_COMPAT_FEATURE(sb,mask) \ 735#define EXT4_SET_COMPAT_FEATURE(sb,mask) \
734 EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) 736 EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
735#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ 737#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \
@@ -789,6 +791,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
789#define EXT4_DEF_RESUID 0 791#define EXT4_DEF_RESUID 0
790#define EXT4_DEF_RESGID 0 792#define EXT4_DEF_RESGID 0
791 793
794#define EXT4_DEF_INODE_READAHEAD_BLKS 32
795
792/* 796/*
793 * Default mount options 797 * Default mount options
794 */ 798 */
@@ -954,6 +958,24 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
954void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, 958void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
955 unsigned long *blockgrpp, ext4_grpblk_t *offsetp); 959 unsigned long *blockgrpp, ext4_grpblk_t *offsetp);
956 960
961extern struct proc_dir_entry *ext4_proc_root;
962
963#ifdef CONFIG_PROC_FS
964extern const struct file_operations ext4_ui_proc_fops;
965
966#define EXT4_PROC_HANDLER(name, var) \
967do { \
968 proc = proc_create_data(name, mode, sbi->s_proc, \
969 &ext4_ui_proc_fops, &sbi->s_##var); \
970 if (proc == NULL) { \
971 printk(KERN_ERR "EXT4-fs: can't create %s\n", name); \
972 goto err_out; \
973 } \
974} while (0)
975#else
976#define EXT4_PROC_HANDLER(name, var)
977#endif
978
957/* 979/*
958 * Function prototypes 980 * Function prototypes
959 */ 981 */
@@ -981,23 +1003,20 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
981extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, 1003extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
982 ext4_lblk_t iblock, ext4_fsblk_t goal, 1004 ext4_lblk_t iblock, ext4_fsblk_t goal,
983 unsigned long *count, int *errp); 1005 unsigned long *count, int *errp);
984extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode, 1006extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
985 ext4_fsblk_t goal, unsigned long *count, int *errp);
986extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, 1007extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
987 ext4_fsblk_t nblocks); 1008 s64 nblocks);
988extern void ext4_free_blocks (handle_t *handle, struct inode *inode, 1009extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
989 ext4_fsblk_t block, unsigned long count, int metadata); 1010 ext4_fsblk_t block, unsigned long count, int metadata);
990extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb, 1011extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
991 ext4_fsblk_t block, unsigned long count, 1012 ext4_fsblk_t block, unsigned long count,
992 unsigned long *pdquot_freed_blocks); 1013 unsigned long *pdquot_freed_blocks);
993extern ext4_fsblk_t ext4_count_free_blocks (struct super_block *); 1014extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
994extern void ext4_check_blocks_bitmap (struct super_block *); 1015extern void ext4_check_blocks_bitmap(struct super_block *);
995extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, 1016extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
996 ext4_group_t block_group, 1017 ext4_group_t block_group,
997 struct buffer_head ** bh); 1018 struct buffer_head ** bh);
998extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); 1019extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
999extern void ext4_init_block_alloc_info(struct inode *);
1000extern void ext4_rsv_window_add(struct super_block *sb, struct ext4_reserve_window_node *rsv);
1001 1020
1002/* dir.c */ 1021/* dir.c */
1003extern int ext4_check_dir_entry(const char *, struct inode *, 1022extern int ext4_check_dir_entry(const char *, struct inode *,
@@ -1009,20 +1028,20 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
1009extern void ext4_htree_free_dir_info(struct dir_private_info *p); 1028extern void ext4_htree_free_dir_info(struct dir_private_info *p);
1010 1029
1011/* fsync.c */ 1030/* fsync.c */
1012extern int ext4_sync_file (struct file *, struct dentry *, int); 1031extern int ext4_sync_file(struct file *, struct dentry *, int);
1013 1032
1014/* hash.c */ 1033/* hash.c */
1015extern int ext4fs_dirhash(const char *name, int len, struct 1034extern int ext4fs_dirhash(const char *name, int len, struct
1016 dx_hash_info *hinfo); 1035 dx_hash_info *hinfo);
1017 1036
1018/* ialloc.c */ 1037/* ialloc.c */
1019extern struct inode * ext4_new_inode (handle_t *, struct inode *, int); 1038extern struct inode * ext4_new_inode(handle_t *, struct inode *, int);
1020extern void ext4_free_inode (handle_t *, struct inode *); 1039extern void ext4_free_inode(handle_t *, struct inode *);
1021extern struct inode * ext4_orphan_get (struct super_block *, unsigned long); 1040extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
1022extern unsigned long ext4_count_free_inodes (struct super_block *); 1041extern unsigned long ext4_count_free_inodes(struct super_block *);
1023extern unsigned long ext4_count_dirs (struct super_block *); 1042extern unsigned long ext4_count_dirs(struct super_block *);
1024extern void ext4_check_inodes_bitmap (struct super_block *); 1043extern void ext4_check_inodes_bitmap(struct super_block *);
1025extern unsigned long ext4_count_free (struct buffer_head *, unsigned); 1044extern unsigned long ext4_count_free(struct buffer_head *, unsigned);
1026 1045
1027/* mballoc.c */ 1046/* mballoc.c */
1028extern long ext4_mb_stats; 1047extern long ext4_mb_stats;
@@ -1032,7 +1051,7 @@ extern int ext4_mb_release(struct super_block *);
1032extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, 1051extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
1033 struct ext4_allocation_request *, int *); 1052 struct ext4_allocation_request *, int *);
1034extern int ext4_mb_reserve_blocks(struct super_block *, int); 1053extern int ext4_mb_reserve_blocks(struct super_block *, int);
1035extern void ext4_mb_discard_inode_preallocations(struct inode *); 1054extern void ext4_discard_preallocations(struct inode *);
1036extern int __init init_ext4_mballoc(void); 1055extern int __init init_ext4_mballoc(void);
1037extern void exit_ext4_mballoc(void); 1056extern void exit_ext4_mballoc(void);
1038extern void ext4_mb_free_blocks(handle_t *, struct inode *, 1057extern void ext4_mb_free_blocks(handle_t *, struct inode *,
@@ -1050,39 +1069,41 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *,
1050 ext4_lblk_t, int, int *); 1069 ext4_lblk_t, int, int *);
1051struct buffer_head *ext4_bread(handle_t *, struct inode *, 1070struct buffer_head *ext4_bread(handle_t *, struct inode *,
1052 ext4_lblk_t, int, int *); 1071 ext4_lblk_t, int, int *);
1072int ext4_get_block(struct inode *inode, sector_t iblock,
1073 struct buffer_head *bh_result, int create);
1053int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, 1074int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
1054 ext4_lblk_t iblock, unsigned long maxblocks, 1075 ext4_lblk_t iblock, unsigned long maxblocks,
1055 struct buffer_head *bh_result, 1076 struct buffer_head *bh_result,
1056 int create, int extend_disksize); 1077 int create, int extend_disksize);
1057 1078
1058extern struct inode *ext4_iget(struct super_block *, unsigned long); 1079extern struct inode *ext4_iget(struct super_block *, unsigned long);
1059extern int ext4_write_inode (struct inode *, int); 1080extern int ext4_write_inode(struct inode *, int);
1060extern int ext4_setattr (struct dentry *, struct iattr *); 1081extern int ext4_setattr(struct dentry *, struct iattr *);
1061extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, 1082extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
1062 struct kstat *stat); 1083 struct kstat *stat);
1063extern void ext4_delete_inode (struct inode *); 1084extern void ext4_delete_inode(struct inode *);
1064extern int ext4_sync_inode (handle_t *, struct inode *); 1085extern int ext4_sync_inode(handle_t *, struct inode *);
1065extern void ext4_discard_reservation (struct inode *);
1066extern void ext4_dirty_inode(struct inode *); 1086extern void ext4_dirty_inode(struct inode *);
1067extern int ext4_change_inode_journal_flag(struct inode *, int); 1087extern int ext4_change_inode_journal_flag(struct inode *, int);
1068extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); 1088extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
1069extern int ext4_can_truncate(struct inode *inode); 1089extern int ext4_can_truncate(struct inode *inode);
1070extern void ext4_truncate (struct inode *); 1090extern void ext4_truncate(struct inode *);
1071extern void ext4_set_inode_flags(struct inode *); 1091extern void ext4_set_inode_flags(struct inode *);
1072extern void ext4_get_inode_flags(struct ext4_inode_info *); 1092extern void ext4_get_inode_flags(struct ext4_inode_info *);
1073extern void ext4_set_aops(struct inode *inode); 1093extern void ext4_set_aops(struct inode *inode);
1074extern int ext4_writepage_trans_blocks(struct inode *); 1094extern int ext4_writepage_trans_blocks(struct inode *);
1095extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
1096extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
1075extern int ext4_block_truncate_page(handle_t *handle, 1097extern int ext4_block_truncate_page(handle_t *handle,
1076 struct address_space *mapping, loff_t from); 1098 struct address_space *mapping, loff_t from);
1077extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); 1099extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
1078 1100
1079/* ioctl.c */ 1101/* ioctl.c */
1080extern long ext4_ioctl(struct file *, unsigned int, unsigned long); 1102extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
1081extern long ext4_compat_ioctl (struct file *, unsigned int, unsigned long); 1103extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
1082 1104
1083/* migrate.c */ 1105/* migrate.c */
1084extern int ext4_ext_migrate(struct inode *, struct file *, unsigned int, 1106extern int ext4_ext_migrate(struct inode *);
1085 unsigned long);
1086/* namei.c */ 1107/* namei.c */
1087extern int ext4_orphan_add(handle_t *, struct inode *); 1108extern int ext4_orphan_add(handle_t *, struct inode *);
1088extern int ext4_orphan_del(handle_t *, struct inode *); 1109extern int ext4_orphan_del(handle_t *, struct inode *);
@@ -1097,14 +1118,14 @@ extern int ext4_group_extend(struct super_block *sb,
1097 ext4_fsblk_t n_blocks_count); 1118 ext4_fsblk_t n_blocks_count);
1098 1119
1099/* super.c */ 1120/* super.c */
1100extern void ext4_error (struct super_block *, const char *, const char *, ...) 1121extern void ext4_error(struct super_block *, const char *, const char *, ...)
1101 __attribute__ ((format (printf, 3, 4))); 1122 __attribute__ ((format (printf, 3, 4)));
1102extern void __ext4_std_error (struct super_block *, const char *, int); 1123extern void __ext4_std_error(struct super_block *, const char *, int);
1103extern void ext4_abort (struct super_block *, const char *, const char *, ...) 1124extern void ext4_abort(struct super_block *, const char *, const char *, ...)
1104 __attribute__ ((format (printf, 3, 4))); 1125 __attribute__ ((format (printf, 3, 4)));
1105extern void ext4_warning (struct super_block *, const char *, const char *, ...) 1126extern void ext4_warning(struct super_block *, const char *, const char *, ...)
1106 __attribute__ ((format (printf, 3, 4))); 1127 __attribute__ ((format (printf, 3, 4)));
1107extern void ext4_update_dynamic_rev (struct super_block *sb); 1128extern void ext4_update_dynamic_rev(struct super_block *sb);
1108extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, 1129extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
1109 __u32 compat); 1130 __u32 compat);
1110extern int ext4_update_rocompat_feature(handle_t *handle, 1131extern int ext4_update_rocompat_feature(handle_t *handle,
@@ -1177,7 +1198,7 @@ static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
1177 1198
1178static inline 1199static inline
1179struct ext4_group_info *ext4_get_group_info(struct super_block *sb, 1200struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
1180 ext4_group_t group) 1201 ext4_group_t group)
1181{ 1202{
1182 struct ext4_group_info ***grp_info; 1203 struct ext4_group_info ***grp_info;
1183 long indexv, indexh; 1204 long indexv, indexh;
@@ -1205,6 +1226,28 @@ do { \
1205 __ext4_std_error((sb), __func__, (errno)); \ 1226 __ext4_std_error((sb), __func__, (errno)); \
1206} while (0) 1227} while (0)
1207 1228
1229#ifdef CONFIG_SMP
1230/* Each CPU can accumulate up to FBC_BATCH blocks in its local
1231 * counter. So we need to make sure we have more free blocks
1232 * than FBC_BATCH * nr_cpu_ids, plus a safety window of 4 times that.
1233 */
1234#define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids))
1235#else
1236#define EXT4_FREEBLOCKS_WATERMARK 0
1237#endif
1238
1239static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
1240{
1241 /*
1242 * XXX: replace with spinlock if seen contended -bzzz
1243 */
1244 down_write(&EXT4_I(inode)->i_data_sem);
1245 if (newsize > EXT4_I(inode)->i_disksize)
1246 EXT4_I(inode)->i_disksize = newsize;
1247 up_write(&EXT4_I(inode)->i_data_sem);
1248 return;
1249}
1250
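ext4_update_i_disksize above only ever grows i_disksize, and does so under the write side of i_data_sem. A userspace analogue of the same grow-only-under-lock pattern, with a pthread rwlock standing in for the rw_semaphore:

	/* Userspace analogue of the grow-only i_disksize update above. */
	#include <pthread.h>
	#include <stdio.h>

	static pthread_rwlock_t data_sem = PTHREAD_RWLOCK_INITIALIZER;
	static long long disksize;

	static void update_disksize(long long newsize)
	{
		pthread_rwlock_wrlock(&data_sem);
		if (newsize > disksize)		/* never shrink here */
			disksize = newsize;
		pthread_rwlock_unlock(&data_sem);
	}

	int main(void)
	{
		update_disksize(4096);
		update_disksize(1024);		/* ignored: smaller than current */
		printf("%lld\n", disksize);	/* 4096 */
		return 0;
	}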
1208/* 1251/*
1209 * Inodes and files operations 1252 * Inodes and files operations
1210 */ 1253 */
@@ -1227,6 +1270,8 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations;
1227/* extents.c */ 1270/* extents.c */
1228extern int ext4_ext_tree_init(handle_t *handle, struct inode *); 1271extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
1229extern int ext4_ext_writepage_trans_blocks(struct inode *, int); 1272extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
1273extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
1274 int chunk);
1230extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 1275extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1231 ext4_lblk_t iblock, 1276 ext4_lblk_t iblock,
1232 unsigned long max_blocks, struct buffer_head *bh_result, 1277 unsigned long max_blocks, struct buffer_head *bh_result,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 6c166c0a54b7..bec7ce59fc0d 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -124,6 +124,19 @@ struct ext4_ext_path {
124#define EXT4_EXT_CACHE_GAP 1 124#define EXT4_EXT_CACHE_GAP 1
125#define EXT4_EXT_CACHE_EXTENT 2 125#define EXT4_EXT_CACHE_EXTENT 2
126 126
127/*
128 * to be called by ext4_ext_walk_space()
129 * negative retcode - error
130 * positive retcode - signal for ext4_ext_walk_space(), see below
131 * callback must return valid extent (passed or newly created)
132 */
133typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
134 struct ext4_ext_cache *,
135 struct ext4_extent *, void *);
136
137#define EXT_CONTINUE 0
138#define EXT_BREAK 1
139#define EXT_REPEAT 2
127 140
128#define EXT_MAX_BLOCK 0xffffffff 141#define EXT_MAX_BLOCK 0xffffffff
129 142
@@ -216,12 +229,16 @@ extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
216extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); 229extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
217extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); 230extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
218extern int ext4_extent_tree_init(handle_t *, struct inode *); 231extern int ext4_extent_tree_init(handle_t *, struct inode *);
219extern int ext4_ext_calc_credits_for_insert(struct inode *, struct ext4_ext_path *); 232extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
233 int num,
234 struct ext4_ext_path *path);
220extern int ext4_ext_try_to_merge(struct inode *inode, 235extern int ext4_ext_try_to_merge(struct inode *inode,
221 struct ext4_ext_path *path, 236 struct ext4_ext_path *path,
222 struct ext4_extent *); 237 struct ext4_extent *);
223extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *); 238extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
224extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *); 239extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
240extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
241 ext_prepare_callback, void *);
225extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, 242extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
226 struct ext4_ext_path *); 243 struct ext4_ext_path *);
227extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *, 244extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index ef7409f0e7e4..5c124c0ac6d3 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -33,38 +33,6 @@ typedef __u32 ext4_lblk_t;
33/* data type for block group number */ 33/* data type for block group number */
34typedef unsigned long ext4_group_t; 34typedef unsigned long ext4_group_t;
35 35
36struct ext4_reserve_window {
37 ext4_fsblk_t _rsv_start; /* First byte reserved */
38 ext4_fsblk_t _rsv_end; /* Last byte reserved or 0 */
39};
40
41struct ext4_reserve_window_node {
42 struct rb_node rsv_node;
43 __u32 rsv_goal_size;
44 __u32 rsv_alloc_hit;
45 struct ext4_reserve_window rsv_window;
46};
47
48struct ext4_block_alloc_info {
49 /* information about reservation window */
50 struct ext4_reserve_window_node rsv_window_node;
51 /*
52 * was i_next_alloc_block in ext4_inode_info
53 * is the logical (file-relative) number of the
54 * most-recently-allocated block in this file.
55 * We use this for detecting linearly ascending allocation requests.
56 */
57 ext4_lblk_t last_alloc_logical_block;
58 /*
59 * Was i_next_alloc_goal in ext4_inode_info
60 * is the *physical* companion to i_next_alloc_block.
61 * it is the physical block number of the block which was most recently
62 * allocated to this file. This gives us the goal (target) for the next
63 * allocation when we detect linearly ascending requests.
64 */
65 ext4_fsblk_t last_alloc_physical_block;
66};
67
68#define rsv_start rsv_window._rsv_start 36#define rsv_start rsv_window._rsv_start
69#define rsv_end rsv_window._rsv_end 37#define rsv_end rsv_window._rsv_end
70 38
@@ -97,11 +65,8 @@ struct ext4_inode_info {
97 ext4_group_t i_block_group; 65 ext4_group_t i_block_group;
98 __u32 i_state; /* Dynamic state flags for ext4 */ 66 __u32 i_state; /* Dynamic state flags for ext4 */
99 67
100 /* block reservation info */
101 struct ext4_block_alloc_info *i_block_alloc_info;
102
103 ext4_lblk_t i_dir_start_lookup; 68 ext4_lblk_t i_dir_start_lookup;
104#ifdef CONFIG_EXT4DEV_FS_XATTR 69#ifdef CONFIG_EXT4_FS_XATTR
105 /* 70 /*
106 * Extended attributes can be read independently of the main file 71 * Extended attributes can be read independently of the main file
107 * data. Taking i_mutex even when reading would cause contention 72 * data. Taking i_mutex even when reading would cause contention
@@ -111,7 +76,7 @@ struct ext4_inode_info {
111 */ 76 */
112 struct rw_semaphore xattr_sem; 77 struct rw_semaphore xattr_sem;
113#endif 78#endif
114#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 79#ifdef CONFIG_EXT4_FS_POSIX_ACL
115 struct posix_acl *i_acl; 80 struct posix_acl *i_acl;
116 struct posix_acl *i_default_acl; 81 struct posix_acl *i_default_acl;
117#endif 82#endif
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index eb8bc3afe6e9..b455c685a98b 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -51,6 +51,14 @@
51 EXT4_XATTR_TRANS_BLOCKS - 2 + \ 51 EXT4_XATTR_TRANS_BLOCKS - 2 + \
52 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) 52 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
53 53
54/*
55 * Define the number of metadata blocks we need to account for when modifying data.
56 *
57 * This includes the super block, inode block, quota blocks and xattr blocks.
58 */
59#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \
60 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
61
54/* Delete operations potentially hit one directory's namespace plus an 62/* Delete operations potentially hit one directory's namespace plus an
55 * entire inode, plus arbitrary amounts of bitmap/indirection data. Be 63 * entire inode, plus arbitrary amounts of bitmap/indirection data. Be
56 * generous. We can grow the delete transaction later if necessary. */ 64 * generous. We can grow the delete transaction later if necessary. */
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 6300226d5531..6a0b40d43264 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -40,8 +40,8 @@ struct ext4_sb_info {
40 unsigned long s_blocks_last; /* Last seen block count */ 40 unsigned long s_blocks_last; /* Last seen block count */
41 loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ 41 loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
42 struct buffer_head * s_sbh; /* Buffer containing the super block */ 42 struct buffer_head * s_sbh; /* Buffer containing the super block */
43 struct ext4_super_block * s_es; /* Pointer to the super block in the buffer */ 43 struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */
44 struct buffer_head ** s_group_desc; 44 struct buffer_head **s_group_desc;
45 unsigned long s_mount_opt; 45 unsigned long s_mount_opt;
46 ext4_fsblk_t s_sb_block; 46 ext4_fsblk_t s_sb_block;
47 uid_t s_resuid; 47 uid_t s_resuid;
@@ -52,6 +52,7 @@ struct ext4_sb_info {
52 int s_desc_per_block_bits; 52 int s_desc_per_block_bits;
53 int s_inode_size; 53 int s_inode_size;
54 int s_first_ino; 54 int s_first_ino;
55 unsigned int s_inode_readahead_blks;
55 spinlock_t s_next_gen_lock; 56 spinlock_t s_next_gen_lock;
56 u32 s_next_generation; 57 u32 s_next_generation;
57 u32 s_hash_seed[4]; 58 u32 s_hash_seed[4];
@@ -59,16 +60,17 @@ struct ext4_sb_info {
59 struct percpu_counter s_freeblocks_counter; 60 struct percpu_counter s_freeblocks_counter;
60 struct percpu_counter s_freeinodes_counter; 61 struct percpu_counter s_freeinodes_counter;
61 struct percpu_counter s_dirs_counter; 62 struct percpu_counter s_dirs_counter;
63 struct percpu_counter s_dirtyblocks_counter;
62 struct blockgroup_lock s_blockgroup_lock; 64 struct blockgroup_lock s_blockgroup_lock;
65 struct proc_dir_entry *s_proc;
63 66
64 /* root of the per fs reservation window tree */ 67 /* root of the per fs reservation window tree */
65 spinlock_t s_rsv_window_lock; 68 spinlock_t s_rsv_window_lock;
66 struct rb_root s_rsv_window_root; 69 struct rb_root s_rsv_window_root;
67 struct ext4_reserve_window_node s_rsv_window_head;
68 70
69 /* Journaling */ 71 /* Journaling */
70 struct inode * s_journal_inode; 72 struct inode *s_journal_inode;
71 struct journal_s * s_journal; 73 struct journal_s *s_journal;
72 struct list_head s_orphan; 74 struct list_head s_orphan;
73 unsigned long s_commit_interval; 75 unsigned long s_commit_interval;
74 struct block_device *journal_bdev; 76 struct block_device *journal_bdev;
@@ -106,12 +108,12 @@ struct ext4_sb_info {
106 108
107 /* tunables */ 109 /* tunables */
108 unsigned long s_stripe; 110 unsigned long s_stripe;
109 unsigned long s_mb_stream_request; 111 unsigned int s_mb_stream_request;
110 unsigned long s_mb_max_to_scan; 112 unsigned int s_mb_max_to_scan;
111 unsigned long s_mb_min_to_scan; 113 unsigned int s_mb_min_to_scan;
112 unsigned long s_mb_stats; 114 unsigned int s_mb_stats;
113 unsigned long s_mb_order2_reqs; 115 unsigned int s_mb_order2_reqs;
114 unsigned long s_mb_group_prealloc; 116 unsigned int s_mb_group_prealloc;
115 /* where last allocation was done - for stream allocation */ 117 /* where last allocation was done - for stream allocation */
116 unsigned long s_mb_last_group; 118 unsigned long s_mb_last_group;
117 unsigned long s_mb_last_start; 119 unsigned long s_mb_last_start;
@@ -121,7 +123,6 @@ struct ext4_sb_info {
121 int s_mb_history_cur; 123 int s_mb_history_cur;
122 int s_mb_history_max; 124 int s_mb_history_max;
123 int s_mb_history_num; 125 int s_mb_history_num;
124 struct proc_dir_entry *s_mb_proc;
125 spinlock_t s_mb_history_lock; 126 spinlock_t s_mb_history_lock;
126 int s_mb_history_filter; 127 int s_mb_history_filter;
127 128
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 612c3d2c3824..ea2ce3c0ae66 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -40,6 +40,7 @@
40#include <linux/slab.h> 40#include <linux/slab.h>
41#include <linux/falloc.h> 41#include <linux/falloc.h>
42#include <asm/uaccess.h> 42#include <asm/uaccess.h>
43#include <linux/fiemap.h>
43#include "ext4_jbd2.h" 44#include "ext4_jbd2.h"
44#include "ext4_extents.h" 45#include "ext4_extents.h"
45 46
@@ -383,8 +384,8 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
383 ext_debug("\n"); 384 ext_debug("\n");
384} 385}
385#else 386#else
386#define ext4_ext_show_path(inode,path) 387#define ext4_ext_show_path(inode, path)
387#define ext4_ext_show_leaf(inode,path) 388#define ext4_ext_show_leaf(inode, path)
388#endif 389#endif
389 390
390void ext4_ext_drop_refs(struct ext4_ext_path *path) 391void ext4_ext_drop_refs(struct ext4_ext_path *path)
@@ -440,9 +441,10 @@ ext4_ext_binsearch_idx(struct inode *inode,
440 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { 441 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
441 if (k != 0 && 442 if (k != 0 &&
442 le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) { 443 le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
443 printk("k=%d, ix=0x%p, first=0x%p\n", k, 444 printk(KERN_DEBUG "k=%d, ix=0x%p, "
444 ix, EXT_FIRST_INDEX(eh)); 445 "first=0x%p\n", k,
445 printk("%u <= %u\n", 446 ix, EXT_FIRST_INDEX(eh));
447 printk(KERN_DEBUG "%u <= %u\n",
446 le32_to_cpu(ix->ei_block), 448 le32_to_cpu(ix->ei_block),
447 le32_to_cpu(ix[-1].ei_block)); 449 le32_to_cpu(ix[-1].ei_block));
448 } 450 }
@@ -1475,7 +1477,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1475 struct ext4_ext_path *path, 1477 struct ext4_ext_path *path,
1476 struct ext4_extent *newext) 1478 struct ext4_extent *newext)
1477{ 1479{
1478 struct ext4_extent_header * eh; 1480 struct ext4_extent_header *eh;
1479 struct ext4_extent *ex, *fex; 1481 struct ext4_extent *ex, *fex;
1480 struct ext4_extent *nearex; /* nearest extent */ 1482 struct ext4_extent *nearex; /* nearest extent */
1481 struct ext4_ext_path *npath = NULL; 1483 struct ext4_ext_path *npath = NULL;
@@ -1625,6 +1627,113 @@ cleanup:
1625 return err; 1627 return err;
1626} 1628}
1627 1629
1630int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1631 ext4_lblk_t num, ext_prepare_callback func,
1632 void *cbdata)
1633{
1634 struct ext4_ext_path *path = NULL;
1635 struct ext4_ext_cache cbex;
1636 struct ext4_extent *ex;
1637 ext4_lblk_t next, start = 0, end = 0;
1638 ext4_lblk_t last = block + num;
1639 int depth, exists, err = 0;
1640
1641 BUG_ON(func == NULL);
1642 BUG_ON(inode == NULL);
1643
1644 while (block < last && block != EXT_MAX_BLOCK) {
1645 num = last - block;
1646 /* find extent for this block */
1647 path = ext4_ext_find_extent(inode, block, path);
1648 if (IS_ERR(path)) {
1649 err = PTR_ERR(path);
1650 path = NULL;
1651 break;
1652 }
1653
1654 depth = ext_depth(inode);
1655 BUG_ON(path[depth].p_hdr == NULL);
1656 ex = path[depth].p_ext;
1657 next = ext4_ext_next_allocated_block(path);
1658
1659 exists = 0;
1660 if (!ex) {
1661 /* there is no extent yet, so try to allocate
1662 * all requested space */
1663 start = block;
1664 end = block + num;
1665 } else if (le32_to_cpu(ex->ee_block) > block) {
1666 /* need to allocate space before found extent */
1667 start = block;
1668 end = le32_to_cpu(ex->ee_block);
1669 if (block + num < end)
1670 end = block + num;
1671 } else if (block >= le32_to_cpu(ex->ee_block)
1672 + ext4_ext_get_actual_len(ex)) {
1673 /* need to allocate space after found extent */
1674 start = block;
1675 end = block + num;
1676 if (end >= next)
1677 end = next;
1678 } else if (block >= le32_to_cpu(ex->ee_block)) {
1679 /*
1680 * some part of requested space is covered
1681 * by found extent
1682 */
1683 start = block;
1684 end = le32_to_cpu(ex->ee_block)
1685 + ext4_ext_get_actual_len(ex);
1686 if (block + num < end)
1687 end = block + num;
1688 exists = 1;
1689 } else {
1690 BUG();
1691 }
1692 BUG_ON(end <= start);
1693
1694 if (!exists) {
1695 cbex.ec_block = start;
1696 cbex.ec_len = end - start;
1697 cbex.ec_start = 0;
1698 cbex.ec_type = EXT4_EXT_CACHE_GAP;
1699 } else {
1700 cbex.ec_block = le32_to_cpu(ex->ee_block);
1701 cbex.ec_len = ext4_ext_get_actual_len(ex);
1702 cbex.ec_start = ext_pblock(ex);
1703 cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
1704 }
1705
1706 BUG_ON(cbex.ec_len == 0);
1707 err = func(inode, path, &cbex, ex, cbdata);
1708 ext4_ext_drop_refs(path);
1709
1710 if (err < 0)
1711 break;
1712
1713 if (err == EXT_REPEAT)
1714 continue;
1715 else if (err == EXT_BREAK) {
1716 err = 0;
1717 break;
1718 }
1719
1720 if (ext_depth(inode) != depth) {
1721 /* depth was changed. we have to realloc path */
1722 kfree(path);
1723 path = NULL;
1724 }
1725
1726 block = cbex.ec_block + cbex.ec_len;
1727 }
1728
1729 if (path) {
1730 ext4_ext_drop_refs(path);
1731 kfree(path);
1732 }
1733
1734 return err;
1735}
1736
1628static void 1737static void
1629ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, 1738ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
1630 __u32 len, ext4_fsblk_t start, int type) 1739 __u32 len, ext4_fsblk_t start, int type)
@@ -1747,54 +1856,61 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1747} 1856}
1748 1857
1749/* 1858/*
1750 * ext4_ext_calc_credits_for_insert: 1859 * ext4_ext_calc_credits_for_single_extent:
1751 * This routine returns max. credits that the extent tree can consume. 1860 * This routine returns the max. credits needed to insert an extent
1752 * It should be OK for low-performance paths like ->writepage() 1861 * to the extent tree.
1753 * To allow many writing processes to fit into a single transaction, 1862 * When passing the actual path, the caller should calculate credits
1754 * the caller should calculate credits under i_data_sem and 1863 * under i_data_sem.
1755 * pass the actual path.
1756 */ 1864 */
1757int ext4_ext_calc_credits_for_insert(struct inode *inode, 1865int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
1758 struct ext4_ext_path *path) 1866 struct ext4_ext_path *path)
1759{ 1867{
1760 int depth, needed;
1761
1762 if (path) { 1868 if (path) {
1869 int depth = ext_depth(inode);
1870 int ret = 0;
1871
1763 /* probably there is space in leaf? */ 1872 /* probably there is space in leaf? */
1764 depth = ext_depth(inode);
1765 if (le16_to_cpu(path[depth].p_hdr->eh_entries) 1873 if (le16_to_cpu(path[depth].p_hdr->eh_entries)
1766 < le16_to_cpu(path[depth].p_hdr->eh_max)) 1874 < le16_to_cpu(path[depth].p_hdr->eh_max)) {
1767 return 1;
1768 }
1769
1770 /*
1771 * given 32-bit logical block (4294967296 blocks), max. tree
1772 * can be 4 levels in depth -- 4 * 340^4 == 53453440000.
1773 * Let's also add one more level for imbalance.
1774 */
1775 depth = 5;
1776 1875
1777 /* allocation of new data block(s) */ 1876 /*
1778 needed = 2; 1877 * There is some space in the leaf, so no
1878 * need to account for a leaf block credit.
1879 *
1880 * Bitmaps, block group descriptor blocks
1881 * and other metadata blocks still need to be
1882 * accounted for.
1883 */
1884 /* 1 bitmap, 1 block group descriptor */
1885 ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
1886 }
1887 }
1779 1888
1780 /* 1889 return ext4_chunk_trans_blocks(inode, nrblocks);
1781 * tree can be full, so it would need to grow in depth: 1890}
1782 * we need one credit to modify old root, credits for
1783 * new root will be added in split accounting
1784 */
1785 needed += 1;
1786 1891
1787 /* 1892/*
1788 * Index split can happen, we would need: 1893 * How many index/leaf blocks need to be changed/allocated to modify nrblocks?
1789 * allocate intermediate indexes (bitmap + group) 1894 *
1790 * + change two blocks at each level, but root (already included) 1895 * If nrblocks fit in a single extent (chunk flag is 1), then
1791 */ 1896 * in the worst case each tree level's index/leaf needs to be
1792 needed += (depth * 2) + (depth * 2); 1897 * changed; if the tree splits due to inserting a new extent,
1898 * then the old index/leaf blocks need to be updated too.
1899 *
1900 * If the nrblocks are discontiguous, they could cause
1901 * the whole tree to split more than once, but this is really rare.
1902 */
1903int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
1904{
1905 int index;
1906 int depth = ext_depth(inode);
1793 1907
1794 /* any allocation modifies superblock */ 1908 if (chunk)
1795 needed += 1; 1909 index = depth * 2;
1910 else
1911 index = depth * 3;
1796 1912
1797 return needed; 1913 return index;
1798} 1914}
1799 1915
1800static int ext4_remove_blocks(handle_t *handle, struct inode *inode, 1916static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
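The new budgeting splits cleanly: ext4_ext_calc_credits_for_single_extent() handles the leaf fast path plus metadata, while ext4_ext_index_trans_blocks() bounds the index/leaf blocks dirtied per tree level, two per level for a single contiguous chunk and three when the blocks are discontiguous (per the comment, a split can dirty both the old and the new path, and discontiguous blocks may split the tree more than once). That arithmetic restated as a self-contained userspace demo, illustrative only:

    #include <stdio.h>

    /* mirrors ext4_ext_index_trans_blocks()'s worst-case estimate */
    static int index_trans_blocks(int depth, int chunk)
    {
            return chunk ? depth * 2 : depth * 3;
    }

    int main(void)
    {
            int depth;

            for (depth = 1; depth <= 4; depth++)
                    printf("depth %d: contiguous=%d discontiguous=%d\n",
                           depth, index_trans_blocks(depth, 1),
                           index_trans_blocks(depth, 0));
            return 0;
    }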
@@ -1921,9 +2037,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
1921 correct_index = 1; 2037 correct_index = 1;
1922 credits += (ext_depth(inode)) + 1; 2038 credits += (ext_depth(inode)) + 1;
1923 } 2039 }
1924#ifdef CONFIG_QUOTA
1925 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); 2040 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
1926#endif
1927 2041
1928 err = ext4_ext_journal_restart(handle, credits); 2042 err = ext4_ext_journal_restart(handle, credits);
1929 if (err) 2043 if (err)
@@ -2137,7 +2251,7 @@ void ext4_ext_init(struct super_block *sb)
2137 */ 2251 */
2138 2252
2139 if (test_opt(sb, EXTENTS)) { 2253 if (test_opt(sb, EXTENTS)) {
2140 printk("EXT4-fs: file extents enabled"); 2254 printk(KERN_INFO "EXT4-fs: file extents enabled");
2141#ifdef AGGRESSIVE_TEST 2255#ifdef AGGRESSIVE_TEST
2142 printk(", aggressive tests"); 2256 printk(", aggressive tests");
2143#endif 2257#endif
@@ -2691,11 +2805,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2691 goto out2; 2805 goto out2;
2692 } 2806 }
2693 /* 2807 /*
2694 * Okay, we need to do block allocation. Lazily initialize the block 2808 * Okay, we need to do block allocation.
2695 * allocation info here if necessary.
2696 */ 2809 */
2697 if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info))
2698 ext4_init_block_alloc_info(inode);
2699 2810
2700 /* find neighbour allocated blocks */ 2811 /* find neighbour allocated blocks */
2701 ar.lleft = iblock; 2812 ar.lleft = iblock;
@@ -2755,7 +2866,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2755 /* free data blocks we just allocated */ 2866 /* free data blocks we just allocated */
2756 /* not a good idea to call discard here directly, 2867 /* not a good idea to call discard here directly,
2757 * but otherwise we'd need to call it every free() */ 2868 * but otherwise we'd need to call it every free() */
2758 ext4_mb_discard_inode_preallocations(inode); 2869 ext4_discard_preallocations(inode);
2759 ext4_free_blocks(handle, inode, ext_pblock(&newex), 2870 ext4_free_blocks(handle, inode, ext_pblock(&newex),
2760 ext4_ext_get_actual_len(&newex), 0); 2871 ext4_ext_get_actual_len(&newex), 0);
2761 goto out2; 2872 goto out2;
@@ -2805,7 +2916,7 @@ void ext4_ext_truncate(struct inode *inode)
2805 /* 2916 /*
2806 * probably first extent we're gonna free will be last in block 2917 * probably first extent we're gonna free will be last in block
2807 */ 2918 */
2808 err = ext4_writepage_trans_blocks(inode) + 3; 2919 err = ext4_writepage_trans_blocks(inode);
2809 handle = ext4_journal_start(inode, err); 2920 handle = ext4_journal_start(inode, err);
2810 if (IS_ERR(handle)) 2921 if (IS_ERR(handle))
2811 return; 2922 return;
@@ -2819,7 +2930,7 @@ void ext4_ext_truncate(struct inode *inode)
2819 down_write(&EXT4_I(inode)->i_data_sem); 2930 down_write(&EXT4_I(inode)->i_data_sem);
2820 ext4_ext_invalidate_cache(inode); 2931 ext4_ext_invalidate_cache(inode);
2821 2932
2822 ext4_mb_discard_inode_preallocations(inode); 2933 ext4_discard_preallocations(inode);
2823 2934
2824 /* 2935 /*
2825 * TODO: optimization is possible here. 2936 * TODO: optimization is possible here.
@@ -2858,27 +2969,6 @@ out_stop:
2858 ext4_journal_stop(handle); 2969 ext4_journal_stop(handle);
2859} 2970}
2860 2971
2861/*
2862 * ext4_ext_writepage_trans_blocks:
2863 * calculate max number of blocks we could modify
2864 * in order to allocate new block for an inode
2865 */
2866int ext4_ext_writepage_trans_blocks(struct inode *inode, int num)
2867{
2868 int needed;
2869
2870 needed = ext4_ext_calc_credits_for_insert(inode, NULL);
2871
2872 /* caller wants to allocate num blocks, but note it includes sb */
2873 needed = needed * num - (num - 1);
2874
2875#ifdef CONFIG_QUOTA
2876 needed += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
2877#endif
2878
2879 return needed;
2880}
2881
2882static void ext4_falloc_update_inode(struct inode *inode, 2972static void ext4_falloc_update_inode(struct inode *inode,
2883 int mode, loff_t new_size, int update_ctime) 2973 int mode, loff_t new_size, int update_ctime)
2884{ 2974{
@@ -2893,10 +2983,11 @@ static void ext4_falloc_update_inode(struct inode *inode,
2893 * Update only when preallocation was requested beyond 2983 * Update only when preallocation was requested beyond
2894 * the file size. 2984 * the file size.
2895 */ 2985 */
2896 if (!(mode & FALLOC_FL_KEEP_SIZE) && 2986 if (!(mode & FALLOC_FL_KEEP_SIZE)) {
2897 new_size > i_size_read(inode)) { 2987 if (new_size > i_size_read(inode))
2898 i_size_write(inode, new_size); 2988 i_size_write(inode, new_size);
2899 EXT4_I(inode)->i_disksize = new_size; 2989 if (new_size > EXT4_I(inode)->i_disksize)
2990 ext4_update_i_disksize(inode, new_size);
2900 } 2991 }
2901 2992
2902} 2993}
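What this buys userspace: without FALLOC_FL_KEEP_SIZE a preallocation past EOF grows i_size (and now i_disksize via ext4_update_i_disksize()); with the flag, blocks are reserved but the visible size is untouched. A minimal exerciser, assuming a kernel and libc that expose fallocate(2):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/stat.h>
    #include <linux/falloc.h>

    int main(void)
    {
            struct stat st;
            int fd = open("testfile", O_CREAT | O_RDWR, 0644);

            /* reserve 1 MiB without changing the visible file size */
            if (fd < 0 || fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20) < 0) {
                    perror("fallocate");
                    return 1;
            }
            fstat(fd, &st);
            /* st_size stays 0; st_blocks reflects the preallocation */
            printf("size=%lld blocks=%lld\n",
                   (long long)st.st_size, (long long)st.st_blocks);
            return 0;
    }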
@@ -2939,10 +3030,9 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
2939 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 3030 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
2940 - block; 3031 - block;
2941 /* 3032 /*
2942 * credits to insert 1 extent into extent tree + buffers to be able to 3033 * credits to insert 1 extent into extent tree
2943 * modify 1 super block, 1 block bitmap and 1 group descriptor.
2944 */ 3034 */
2945 credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3; 3035 credits = ext4_chunk_trans_blocks(inode, max_blocks);
2946 mutex_lock(&inode->i_mutex); 3036 mutex_lock(&inode->i_mutex);
2947retry: 3037retry:
2948 while (ret >= 0 && ret < max_blocks) { 3038 while (ret >= 0 && ret < max_blocks) {
@@ -2989,3 +3079,143 @@ retry:
2989 mutex_unlock(&inode->i_mutex); 3079 mutex_unlock(&inode->i_mutex);
2990 return ret > 0 ? ret2 : ret; 3080 return ret > 0 ? ret2 : ret;
2991} 3081}
3082
3083/*
3084 * Callback function called for each extent to gather FIEMAP information.
3085 */
3086int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3087 struct ext4_ext_cache *newex, struct ext4_extent *ex,
3088 void *data)
3089{
3090 struct fiemap_extent_info *fieinfo = data;
3091 unsigned long blksize_bits = inode->i_sb->s_blocksize_bits;
3092 __u64 logical;
3093 __u64 physical;
3094 __u64 length;
3095 __u32 flags = 0;
3096 int error;
3097
3098 logical = (__u64)newex->ec_block << blksize_bits;
3099
3100 if (newex->ec_type == EXT4_EXT_CACHE_GAP) {
3101 pgoff_t offset;
3102 struct page *page;
3103 struct buffer_head *bh = NULL;
3104
3105 offset = logical >> PAGE_SHIFT;
3106 page = find_get_page(inode->i_mapping, offset);
3107 if (!page || !page_has_buffers(page))
3108 return EXT_CONTINUE;
3109
3110 bh = page_buffers(page);
3111
3112 if (!bh)
3113 return EXT_CONTINUE;
3114
3115 if (buffer_delay(bh)) {
3116 flags |= FIEMAP_EXTENT_DELALLOC;
3117 page_cache_release(page);
3118 } else {
3119 page_cache_release(page);
3120 return EXT_CONTINUE;
3121 }
3122 }
3123
3124 physical = (__u64)newex->ec_start << blksize_bits;
3125 length = (__u64)newex->ec_len << blksize_bits;
3126
3127 if (ex && ext4_ext_is_uninitialized(ex))
3128 flags |= FIEMAP_EXTENT_UNWRITTEN;
3129
3130 /*
3131 * If this extent reaches EXT_MAX_BLOCK, it must be last.
3132 *
3133 * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK,
3134 * this also indicates no more allocated blocks.
3135 *
3136 * XXX this might miss a single-block extent at EXT_MAX_BLOCK
3137 */
3138 if (logical + length - 1 == EXT_MAX_BLOCK ||
3139 ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK)
3140 flags |= FIEMAP_EXTENT_LAST;
3141
3142 error = fiemap_fill_next_extent(fieinfo, logical, physical,
3143 length, flags);
3144 if (error < 0)
3145 return error;
3146 if (error == 1)
3147 return EXT_BREAK;
3148
3149 return EXT_CONTINUE;
3150}
3151
3152/* fiemap flags we can handle specified here */
3153#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
3154
3155int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo)
3156{
3157 __u64 physical = 0;
3158 __u64 length;
3159 __u32 flags = FIEMAP_EXTENT_LAST;
3160 int blockbits = inode->i_sb->s_blocksize_bits;
3161 int error = 0;
3162
3163 /* in-inode? */
3164 if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
3165 struct ext4_iloc iloc;
3166 int offset; /* offset of xattr in inode */
3167
3168 error = ext4_get_inode_loc(inode, &iloc);
3169 if (error)
3170 return error;
3171 physical = iloc.bh->b_blocknr << blockbits;
3172 offset = EXT4_GOOD_OLD_INODE_SIZE +
3173 EXT4_I(inode)->i_extra_isize;
3174 physical += offset;
3175 length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
3176 flags |= FIEMAP_EXTENT_DATA_INLINE;
3177 } else { /* external block */
3178 physical = EXT4_I(inode)->i_file_acl << blockbits;
3179 length = inode->i_sb->s_blocksize;
3180 }
3181
3182 if (physical)
3183 error = fiemap_fill_next_extent(fieinfo, 0, physical,
3184 length, flags);
3185 return (error < 0 ? error : 0);
3186}
3187
3188int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3189 __u64 start, __u64 len)
3190{
3191 ext4_lblk_t start_blk;
3192 ext4_lblk_t len_blks;
3193 int error = 0;
3194
3195 /* fallback to generic here if not in extents fmt */
3196 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
3197 return generic_block_fiemap(inode, fieinfo, start, len,
3198 ext4_get_block);
3199
3200 if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
3201 return -EBADR;
3202
3203 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
3204 error = ext4_xattr_fiemap(inode, fieinfo);
3205 } else {
3206 start_blk = start >> inode->i_sb->s_blocksize_bits;
3207 len_blks = len >> inode->i_sb->s_blocksize_bits;
3208
3209 /*
3210 * Walk the extent tree gathering extent information.
3211 * ext4_ext_fiemap_cb will push extents back to user.
3212 */
3213 down_write(&EXT4_I(inode)->i_data_sem);
3214 error = ext4_ext_walk_space(inode, start_blk, len_blks,
3215 ext4_ext_fiemap_cb, fieinfo);
3216 up_write(&EXT4_I(inode)->i_data_sem);
3217 }
3218
3219 return error;
3220}
3221
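The new entry point is wired into ext4_file_operations below (fs/ext4/file.c), so userspace reaches ext4_fiemap() through the FS_IOC_FIEMAP ioctl. A minimal caller, error handling trimmed:

    #include <stdio.h>
    #include <stdlib.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>
    #include <linux/fiemap.h>

    int main(int argc, char **argv)
    {
            struct fiemap *fm;
            unsigned int i;
            int fd;

            if (argc != 2 || (fd = open(argv[1], O_RDONLY)) < 0)
                    return 1;

            /* header plus room for 32 extents */
            fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
            fm->fm_length = ~0ULL;                  /* whole file */
            fm->fm_flags = FIEMAP_FLAG_SYNC;        /* one of EXT4_FIEMAP_FLAGS */
            fm->fm_extent_count = 32;

            if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
                    perror("FS_IOC_FIEMAP");
                    return 1;
            }
            for (i = 0; i < fm->fm_mapped_extents; i++)
                    printf("logical %llu physical %llu length %llu flags %#x\n",
                           (unsigned long long)fm->fm_extents[i].fe_logical,
                           (unsigned long long)fm->fm_extents[i].fe_physical,
                           (unsigned long long)fm->fm_extents[i].fe_length,
                           fm->fm_extents[i].fe_flags);
            return 0;
    }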
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 430eb7978db4..6bd11fba71f7 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -31,14 +31,14 @@
31 * from ext4_file_open: open gets called at every open, but release 31 * from ext4_file_open: open gets called at every open, but release
32 * gets called only when /all/ the files are closed. 32 * gets called only when /all/ the files are closed.
33 */ 33 */
34static int ext4_release_file (struct inode * inode, struct file * filp) 34static int ext4_release_file(struct inode *inode, struct file *filp)
35{ 35{
36 /* if we are the last writer on the inode, drop the block reservation */ 36 /* if we are the last writer on the inode, drop the block reservation */
37 if ((filp->f_mode & FMODE_WRITE) && 37 if ((filp->f_mode & FMODE_WRITE) &&
38 (atomic_read(&inode->i_writecount) == 1)) 38 (atomic_read(&inode->i_writecount) == 1))
39 { 39 {
40 down_write(&EXT4_I(inode)->i_data_sem); 40 down_write(&EXT4_I(inode)->i_data_sem);
41 ext4_discard_reservation(inode); 41 ext4_discard_preallocations(inode);
42 up_write(&EXT4_I(inode)->i_data_sem); 42 up_write(&EXT4_I(inode)->i_data_sem);
43 } 43 }
44 if (is_dx(inode) && filp->private_data) 44 if (is_dx(inode) && filp->private_data)
@@ -140,6 +140,9 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
140 return 0; 140 return 0;
141} 141}
142 142
143extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
144 __u64 start, __u64 len);
145
143const struct file_operations ext4_file_operations = { 146const struct file_operations ext4_file_operations = {
144 .llseek = generic_file_llseek, 147 .llseek = generic_file_llseek,
145 .read = do_sync_read, 148 .read = do_sync_read,
@@ -162,7 +165,7 @@ const struct inode_operations ext4_file_inode_operations = {
162 .truncate = ext4_truncate, 165 .truncate = ext4_truncate,
163 .setattr = ext4_setattr, 166 .setattr = ext4_setattr,
164 .getattr = ext4_getattr, 167 .getattr = ext4_getattr,
165#ifdef CONFIG_EXT4DEV_FS_XATTR 168#ifdef CONFIG_EXT4_FS_XATTR
166 .setxattr = generic_setxattr, 169 .setxattr = generic_setxattr,
167 .getxattr = generic_getxattr, 170 .getxattr = generic_getxattr,
168 .listxattr = ext4_listxattr, 171 .listxattr = ext4_listxattr,
@@ -170,5 +173,6 @@ const struct inode_operations ext4_file_inode_operations = {
170#endif 173#endif
171 .permission = ext4_permission, 174 .permission = ext4_permission,
172 .fallocate = ext4_fallocate, 175 .fallocate = ext4_fallocate,
176 .fiemap = ext4_fiemap,
173}; 177};
174 178
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index a45c3737ad31..5afe4370840b 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -28,6 +28,7 @@
28#include <linux/writeback.h> 28#include <linux/writeback.h>
29#include <linux/jbd2.h> 29#include <linux/jbd2.h>
30#include <linux/blkdev.h> 30#include <linux/blkdev.h>
31#include <linux/marker.h>
31#include "ext4.h" 32#include "ext4.h"
32#include "ext4_jbd2.h" 33#include "ext4_jbd2.h"
33 34
@@ -43,7 +44,7 @@
43 * inode to disk. 44 * inode to disk.
44 */ 45 */
45 46
46int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync) 47int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
47{ 48{
48 struct inode *inode = dentry->d_inode; 49 struct inode *inode = dentry->d_inode;
49 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 50 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
@@ -51,6 +52,10 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
51 52
52 J_ASSERT(ext4_journal_current_handle() == NULL); 53 J_ASSERT(ext4_journal_current_handle() == NULL);
53 54
55 trace_mark(ext4_sync_file, "dev %s datasync %d ino %ld parent %ld",
56 inode->i_sb->s_id, datasync, inode->i_ino,
57 dentry->d_parent->d_inode->i_ino);
58
54 /* 59 /*
55 * data=writeback: 60 * data=writeback:
56 * The caller's filemap_fdatawrite()/wait will sync the data. 61 * The caller's filemap_fdatawrite()/wait will sync the data.
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 1d6329dbe390..556ca8eba3db 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -27,7 +27,7 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
27 sum += DELTA; 27 sum += DELTA;
28 b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); 28 b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
29 b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); 29 b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
30 } while(--n); 30 } while (--n);
31 31
32 buf[0] += b0; 32 buf[0] += b0;
33 buf[1] += b1; 33 buf[1] += b1;
@@ -35,7 +35,7 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
35 35
36 36
37/* The old legacy hash */ 37/* The old legacy hash */
38static __u32 dx_hack_hash (const char *name, int len) 38static __u32 dx_hack_hash(const char *name, int len)
39{ 39{
40 __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; 40 __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
41 while (len--) { 41 while (len--) {
@@ -59,7 +59,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
59 val = pad; 59 val = pad;
60 if (len > num*4) 60 if (len > num*4)
61 len = num * 4; 61 len = num * 4;
62 for (i=0; i < len; i++) { 62 for (i = 0; i < len; i++) {
63 if ((i % 4) == 0) 63 if ((i % 4) == 0)
64 val = pad; 64 val = pad;
65 val = msg[i] + (val << 8); 65 val = msg[i] + (val << 8);
@@ -104,7 +104,7 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
104 104
105 /* Check to see if the seed is all zero's */ 105 /* Check to see if the seed is all zero's */
106 if (hinfo->seed) { 106 if (hinfo->seed) {
107 for (i=0; i < 4; i++) { 107 for (i = 0; i < 4; i++) {
108 if (hinfo->seed[i]) 108 if (hinfo->seed[i])
109 break; 109 break;
110 } 110 }
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 655e760212b8..fe34d74cfb19 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -115,9 +115,11 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
115 block_group, bitmap_blk); 115 block_group, bitmap_blk);
116 return NULL; 116 return NULL;
117 } 117 }
118 if (bh_uptodate_or_lock(bh)) 118 if (buffer_uptodate(bh) &&
119 !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
119 return bh; 120 return bh;
120 121
122 lock_buffer(bh);
121 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); 123 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
122 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { 124 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
123 ext4_init_inode_bitmap(sb, bh, block_group, desc); 125 ext4_init_inode_bitmap(sb, bh, block_group, desc);
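The open-coded fast path replaces bh_uptodate_or_lock(): an uptodate buffer is trusted only once the group's EXT4_BG_INODE_UNINIT flag is clear, since an uptodate-but-uninitialized bitmap must still be built under the buffer lock. The same double-checked pattern as a self-contained userspace sketch, with a pthread mutex standing in for lock_buffer() and all names illustrative:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    static pthread_mutex_t buf_lock = PTHREAD_MUTEX_INITIALIZER;
    static atomic_bool uptodate;                /* buffer_uptodate(bh) stand-in */
    static atomic_bool group_uninit = true;     /* EXT4_BG_INODE_UNINIT stand-in */

    static void read_inode_bitmap(void)
    {
            /* fast path: trust the buffer only once the group is initialized */
            if (atomic_load(&uptodate) && !atomic_load(&group_uninit))
                    return;

            pthread_mutex_lock(&buf_lock);      /* lock_buffer(bh) analogue */
            if (atomic_load(&group_uninit)) {
                    /* ext4_init_inode_bitmap() would run here, exactly once */
                    atomic_store(&group_uninit, false);
            }
            atomic_store(&uptodate, true);
            pthread_mutex_unlock(&buf_lock);
    }

    int main(void)
    {
            read_inode_bitmap();    /* first call initializes under the lock */
            read_inode_bitmap();    /* second call takes the lock-free fast path */
            return 0;
    }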
@@ -154,39 +156,40 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
154 * though), and then we'd have two inodes sharing the 156 * though), and then we'd have two inodes sharing the
155 * same inode number and space on the harddisk. 157 * same inode number and space on the harddisk.
156 */ 158 */
157void ext4_free_inode (handle_t *handle, struct inode * inode) 159void ext4_free_inode(handle_t *handle, struct inode *inode)
158{ 160{
159 struct super_block * sb = inode->i_sb; 161 struct super_block *sb = inode->i_sb;
160 int is_directory; 162 int is_directory;
161 unsigned long ino; 163 unsigned long ino;
162 struct buffer_head *bitmap_bh = NULL; 164 struct buffer_head *bitmap_bh = NULL;
163 struct buffer_head *bh2; 165 struct buffer_head *bh2;
164 ext4_group_t block_group; 166 ext4_group_t block_group;
165 unsigned long bit; 167 unsigned long bit;
166 struct ext4_group_desc * gdp; 168 struct ext4_group_desc *gdp;
167 struct ext4_super_block * es; 169 struct ext4_super_block *es;
168 struct ext4_sb_info *sbi; 170 struct ext4_sb_info *sbi;
169 int fatal = 0, err; 171 int fatal = 0, err;
170 ext4_group_t flex_group; 172 ext4_group_t flex_group;
171 173
172 if (atomic_read(&inode->i_count) > 1) { 174 if (atomic_read(&inode->i_count) > 1) {
173 printk ("ext4_free_inode: inode has count=%d\n", 175 printk(KERN_ERR "ext4_free_inode: inode has count=%d\n",
174 atomic_read(&inode->i_count)); 176 atomic_read(&inode->i_count));
175 return; 177 return;
176 } 178 }
177 if (inode->i_nlink) { 179 if (inode->i_nlink) {
178 printk ("ext4_free_inode: inode has nlink=%d\n", 180 printk(KERN_ERR "ext4_free_inode: inode has nlink=%d\n",
179 inode->i_nlink); 181 inode->i_nlink);
180 return; 182 return;
181 } 183 }
182 if (!sb) { 184 if (!sb) {
183 printk("ext4_free_inode: inode on nonexistent device\n"); 185 printk(KERN_ERR "ext4_free_inode: inode on "
186 "nonexistent device\n");
184 return; 187 return;
185 } 188 }
186 sbi = EXT4_SB(sb); 189 sbi = EXT4_SB(sb);
187 190
188 ino = inode->i_ino; 191 ino = inode->i_ino;
189 ext4_debug ("freeing inode %lu\n", ino); 192 ext4_debug("freeing inode %lu\n", ino);
190 193
191 /* 194 /*
192 * Note: we must free any quota before locking the superblock, 195 * Note: we must free any quota before locking the superblock,
@@ -200,12 +203,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
200 is_directory = S_ISDIR(inode->i_mode); 203 is_directory = S_ISDIR(inode->i_mode);
201 204
202 /* Do this BEFORE marking the inode not in use or returning an error */ 205 /* Do this BEFORE marking the inode not in use or returning an error */
203 clear_inode (inode); 206 clear_inode(inode);
204 207
205 es = EXT4_SB(sb)->s_es; 208 es = EXT4_SB(sb)->s_es;
206 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { 209 if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
207 ext4_error (sb, "ext4_free_inode", 210 ext4_error(sb, "ext4_free_inode",
208 "reserved or nonexistent inode %lu", ino); 211 "reserved or nonexistent inode %lu", ino);
209 goto error_return; 212 goto error_return;
210 } 213 }
211 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); 214 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
@@ -222,10 +225,10 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
222 /* Ok, now we can actually update the inode bitmaps.. */ 225 /* Ok, now we can actually update the inode bitmaps.. */
223 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), 226 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
224 bit, bitmap_bh->b_data)) 227 bit, bitmap_bh->b_data))
225 ext4_error (sb, "ext4_free_inode", 228 ext4_error(sb, "ext4_free_inode",
226 "bit already cleared for inode %lu", ino); 229 "bit already cleared for inode %lu", ino);
227 else { 230 else {
228 gdp = ext4_get_group_desc (sb, block_group, &bh2); 231 gdp = ext4_get_group_desc(sb, block_group, &bh2);
229 232
230 BUFFER_TRACE(bh2, "get_write_access"); 233 BUFFER_TRACE(bh2, "get_write_access");
231 fatal = ext4_journal_get_write_access(handle, bh2); 234 fatal = ext4_journal_get_write_access(handle, bh2);
@@ -287,7 +290,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
287 avefreei = freei / ngroups; 290 avefreei = freei / ngroups;
288 291
289 for (group = 0; group < ngroups; group++) { 292 for (group = 0; group < ngroups; group++) {
290 desc = ext4_get_group_desc (sb, group, NULL); 293 desc = ext4_get_group_desc(sb, group, NULL);
291 if (!desc || !desc->bg_free_inodes_count) 294 if (!desc || !desc->bg_free_inodes_count)
292 continue; 295 continue;
293 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) 296 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
@@ -351,7 +354,7 @@ find_close_to_parent:
351 goto found_flexbg; 354 goto found_flexbg;
352 } 355 }
353 356
354 if (best_flex < 0 || 357 if (flex_group[best_flex].free_inodes == 0 ||
355 (flex_group[i].free_blocks > 358 (flex_group[i].free_blocks >
356 flex_group[best_flex].free_blocks && 359 flex_group[best_flex].free_blocks &&
357 flex_group[i].free_inodes)) 360 flex_group[i].free_inodes))
@@ -576,16 +579,16 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
576 * For other inodes, search forward from the parent directory's block 579 * For other inodes, search forward from the parent directory's block
577 * group to find a free inode. 580 * group to find a free inode.
578 */ 581 */
579struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) 582struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
580{ 583{
581 struct super_block *sb; 584 struct super_block *sb;
582 struct buffer_head *bitmap_bh = NULL; 585 struct buffer_head *bitmap_bh = NULL;
583 struct buffer_head *bh2; 586 struct buffer_head *bh2;
584 ext4_group_t group = 0; 587 ext4_group_t group = 0;
585 unsigned long ino = 0; 588 unsigned long ino = 0;
586 struct inode * inode; 589 struct inode *inode;
587 struct ext4_group_desc * gdp = NULL; 590 struct ext4_group_desc *gdp = NULL;
588 struct ext4_super_block * es; 591 struct ext4_super_block *es;
589 struct ext4_inode_info *ei; 592 struct ext4_inode_info *ei;
590 struct ext4_sb_info *sbi; 593 struct ext4_sb_info *sbi;
591 int ret2, err = 0; 594 int ret2, err = 0;
@@ -613,7 +616,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
613 } 616 }
614 617
615 if (S_ISDIR(mode)) { 618 if (S_ISDIR(mode)) {
616 if (test_opt (sb, OLDALLOC)) 619 if (test_opt(sb, OLDALLOC))
617 ret2 = find_group_dir(sb, dir, &group); 620 ret2 = find_group_dir(sb, dir, &group);
618 else 621 else
619 ret2 = find_group_orlov(sb, dir, &group); 622 ret2 = find_group_orlov(sb, dir, &group);
@@ -783,7 +786,7 @@ got:
783 } 786 }
784 787
785 inode->i_uid = current->fsuid; 788 inode->i_uid = current->fsuid;
786 if (test_opt (sb, GRPID)) 789 if (test_opt(sb, GRPID))
787 inode->i_gid = dir->i_gid; 790 inode->i_gid = dir->i_gid;
788 else if (dir->i_mode & S_ISGID) { 791 else if (dir->i_mode & S_ISGID) {
789 inode->i_gid = dir->i_gid; 792 inode->i_gid = dir->i_gid;
@@ -816,7 +819,6 @@ got:
816 ei->i_flags &= ~EXT4_DIRSYNC_FL; 819 ei->i_flags &= ~EXT4_DIRSYNC_FL;
817 ei->i_file_acl = 0; 820 ei->i_file_acl = 0;
818 ei->i_dtime = 0; 821 ei->i_dtime = 0;
819 ei->i_block_alloc_info = NULL;
820 ei->i_block_group = group; 822 ei->i_block_group = group;
821 823
822 ext4_set_inode_flags(inode); 824 ext4_set_inode_flags(inode);
@@ -832,7 +834,7 @@ got:
832 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; 834 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
833 835
834 ret = inode; 836 ret = inode;
835 if(DQUOT_ALLOC_INODE(inode)) { 837 if (DQUOT_ALLOC_INODE(inode)) {
836 err = -EDQUOT; 838 err = -EDQUOT;
837 goto fail_drop; 839 goto fail_drop;
838 } 840 }
@@ -841,7 +843,7 @@ got:
841 if (err) 843 if (err)
842 goto fail_free_drop; 844 goto fail_free_drop;
843 845
844 err = ext4_init_security(handle,inode, dir); 846 err = ext4_init_security(handle, inode, dir);
845 if (err) 847 if (err)
846 goto fail_free_drop; 848 goto fail_free_drop;
847 849
@@ -959,7 +961,7 @@ error:
959 return ERR_PTR(err); 961 return ERR_PTR(err);
960} 962}
961 963
962unsigned long ext4_count_free_inodes (struct super_block * sb) 964unsigned long ext4_count_free_inodes(struct super_block *sb)
963{ 965{
964 unsigned long desc_count; 966 unsigned long desc_count;
965 struct ext4_group_desc *gdp; 967 struct ext4_group_desc *gdp;
@@ -974,7 +976,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
974 bitmap_count = 0; 976 bitmap_count = 0;
975 gdp = NULL; 977 gdp = NULL;
976 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { 978 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
977 gdp = ext4_get_group_desc (sb, i, NULL); 979 gdp = ext4_get_group_desc(sb, i, NULL);
978 if (!gdp) 980 if (!gdp)
979 continue; 981 continue;
980 desc_count += le16_to_cpu(gdp->bg_free_inodes_count); 982 desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
@@ -989,13 +991,14 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
989 bitmap_count += x; 991 bitmap_count += x;
990 } 992 }
991 brelse(bitmap_bh); 993 brelse(bitmap_bh);
992 printk("ext4_count_free_inodes: stored = %u, computed = %lu, %lu\n", 994 printk(KERN_DEBUG "ext4_count_free_inodes: "
993 le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); 995 "stored = %u, computed = %lu, %lu\n",
996 le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
994 return desc_count; 997 return desc_count;
995#else 998#else
996 desc_count = 0; 999 desc_count = 0;
997 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { 1000 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
998 gdp = ext4_get_group_desc (sb, i, NULL); 1001 gdp = ext4_get_group_desc(sb, i, NULL);
999 if (!gdp) 1002 if (!gdp)
1000 continue; 1003 continue;
1001 desc_count += le16_to_cpu(gdp->bg_free_inodes_count); 1004 desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
@@ -1006,13 +1009,13 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
1006} 1009}
1007 1010
1008/* Called at mount-time, super-block is locked */ 1011/* Called at mount-time, super-block is locked */
1009unsigned long ext4_count_dirs (struct super_block * sb) 1012unsigned long ext4_count_dirs(struct super_block * sb)
1010{ 1013{
1011 unsigned long count = 0; 1014 unsigned long count = 0;
1012 ext4_group_t i; 1015 ext4_group_t i;
1013 1016
1014 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { 1017 for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
1015 struct ext4_group_desc *gdp = ext4_get_group_desc (sb, i, NULL); 1018 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
1016 if (!gdp) 1019 if (!gdp)
1017 continue; 1020 continue;
1018 count += le16_to_cpu(gdp->bg_used_dirs_count); 1021 count += le16_to_cpu(gdp->bg_used_dirs_count);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 59fbbe899acc..9b4ec9decfd1 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -41,6 +41,8 @@
41#include "acl.h" 41#include "acl.h"
42#include "ext4_extents.h" 42#include "ext4_extents.h"
43 43
44#define MPAGE_DA_EXTENT_TAIL 0x01
45
44static inline int ext4_begin_ordered_truncate(struct inode *inode, 46static inline int ext4_begin_ordered_truncate(struct inode *inode,
45 loff_t new_size) 47 loff_t new_size)
46{ 48{
@@ -188,7 +190,7 @@ static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
188/* 190/*
189 * Called at the last iput() if i_nlink is zero. 191 * Called at the last iput() if i_nlink is zero.
190 */ 192 */
191void ext4_delete_inode (struct inode * inode) 193void ext4_delete_inode(struct inode *inode)
192{ 194{
193 handle_t *handle; 195 handle_t *handle;
194 int err; 196 int err;
@@ -328,11 +330,11 @@ static int ext4_block_to_path(struct inode *inode,
328 int final = 0; 330 int final = 0;
329 331
330 if (i_block < 0) { 332 if (i_block < 0) {
331 ext4_warning (inode->i_sb, "ext4_block_to_path", "block < 0"); 333 ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0");
332 } else if (i_block < direct_blocks) { 334 } else if (i_block < direct_blocks) {
333 offsets[n++] = i_block; 335 offsets[n++] = i_block;
334 final = direct_blocks; 336 final = direct_blocks;
335 } else if ( (i_block -= direct_blocks) < indirect_blocks) { 337 } else if ((i_block -= direct_blocks) < indirect_blocks) {
336 offsets[n++] = EXT4_IND_BLOCK; 338 offsets[n++] = EXT4_IND_BLOCK;
337 offsets[n++] = i_block; 339 offsets[n++] = i_block;
338 final = ptrs; 340 final = ptrs;
@@ -398,14 +400,14 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
398 400
399 *err = 0; 401 *err = 0;
400 /* i_data is not going away, no lock needed */ 402 /* i_data is not going away, no lock needed */
401 add_chain (chain, NULL, EXT4_I(inode)->i_data + *offsets); 403 add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
402 if (!p->key) 404 if (!p->key)
403 goto no_block; 405 goto no_block;
404 while (--depth) { 406 while (--depth) {
405 bh = sb_bread(sb, le32_to_cpu(p->key)); 407 bh = sb_bread(sb, le32_to_cpu(p->key));
406 if (!bh) 408 if (!bh)
407 goto failure; 409 goto failure;
408 add_chain(++p, bh, (__le32*)bh->b_data + *++offsets); 410 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
409 /* Reader: end */ 411 /* Reader: end */
410 if (!p->key) 412 if (!p->key)
411 goto no_block; 413 goto no_block;
@@ -441,7 +443,7 @@ no_block:
441static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) 443static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
442{ 444{
443 struct ext4_inode_info *ei = EXT4_I(inode); 445 struct ext4_inode_info *ei = EXT4_I(inode);
444 __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data; 446 __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
445 __le32 *p; 447 __le32 *p;
446 ext4_fsblk_t bg_start; 448 ext4_fsblk_t bg_start;
447 ext4_fsblk_t last_block; 449 ext4_fsblk_t last_block;
@@ -484,18 +486,9 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
484static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, 486static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
485 Indirect *partial) 487 Indirect *partial)
486{ 488{
487 struct ext4_block_alloc_info *block_i;
488
489 block_i = EXT4_I(inode)->i_block_alloc_info;
490
491 /* 489 /*
492 * try the heuristic for sequential allocation, 490 * XXX need to get goal block from mballoc's data structures
493 * failing that at least try to get decent locality.
494 */ 491 */
495 if (block_i && (block == block_i->last_alloc_logical_block + 1)
496 && (block_i->last_alloc_physical_block != 0)) {
497 return block_i->last_alloc_physical_block + 1;
498 }
499 492
500 return ext4_find_near(inode, partial); 493 return ext4_find_near(inode, partial);
501} 494}
@@ -628,7 +621,7 @@ allocated:
628 *err = 0; 621 *err = 0;
629 return ret; 622 return ret;
630failed_out: 623failed_out:
631 for (i = 0; i <index; i++) 624 for (i = 0; i < index; i++)
632 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); 625 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
633 return ret; 626 return ret;
634} 627}
@@ -701,7 +694,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
701 branch[n].p = (__le32 *) bh->b_data + offsets[n]; 694 branch[n].p = (__le32 *) bh->b_data + offsets[n];
702 branch[n].key = cpu_to_le32(new_blocks[n]); 695 branch[n].key = cpu_to_le32(new_blocks[n]);
703 *branch[n].p = branch[n].key; 696 *branch[n].p = branch[n].key;
704 if ( n == indirect_blks) { 697 if (n == indirect_blks) {
705 current_block = new_blocks[n]; 698 current_block = new_blocks[n];
706 /* 699 /*
707 * End of chain, update the last new metablock of 700 * End of chain, update the last new metablock of
@@ -728,7 +721,7 @@ failed:
728 BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget"); 721 BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
729 ext4_journal_forget(handle, branch[i].bh); 722 ext4_journal_forget(handle, branch[i].bh);
730 } 723 }
731 for (i = 0; i <indirect_blks; i++) 724 for (i = 0; i < indirect_blks; i++)
732 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); 725 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
733 726
734 ext4_free_blocks(handle, inode, new_blocks[i], num, 0); 727 ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
@@ -755,10 +748,8 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
755{ 748{
756 int i; 749 int i;
757 int err = 0; 750 int err = 0;
758 struct ext4_block_alloc_info *block_i;
759 ext4_fsblk_t current_block; 751 ext4_fsblk_t current_block;
760 752
761 block_i = EXT4_I(inode)->i_block_alloc_info;
762 /* 753 /*
763 * If we're splicing into a [td]indirect block (as opposed to the 754 * If we're splicing into a [td]indirect block (as opposed to the
764 * inode) then we need to get write access to the [td]indirect block 755 * inode) then we need to get write access to the [td]indirect block
@@ -781,18 +772,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
781 if (num == 0 && blks > 1) { 772 if (num == 0 && blks > 1) {
782 current_block = le32_to_cpu(where->key) + 1; 773 current_block = le32_to_cpu(where->key) + 1;
783 for (i = 1; i < blks; i++) 774 for (i = 1; i < blks; i++)
784 *(where->p + i ) = cpu_to_le32(current_block++); 775 *(where->p + i) = cpu_to_le32(current_block++);
785 }
786
787 /*
788 * update the most recently allocated logical & physical block
789 * in i_block_alloc_info, to assist find the proper goal block for next
790 * allocation
791 */
792 if (block_i) {
793 block_i->last_alloc_logical_block = block + blks - 1;
794 block_i->last_alloc_physical_block =
795 le32_to_cpu(where[num].key) + blks - 1;
796 } 776 }
797 777
798 /* We are done with atomic stuff, now do the rest of housekeeping */ 778 /* We are done with atomic stuff, now do the rest of housekeeping */
@@ -912,12 +892,8 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
912 goto cleanup; 892 goto cleanup;
913 893
914 /* 894 /*
915 * Okay, we need to do block allocation. Lazily initialize the block 895 * Okay, we need to do block allocation.
916 * allocation info here if necessary
917 */ 896 */
918 if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
919 ext4_init_block_alloc_info(inode);
920
921 goal = ext4_find_goal(inode, iblock, partial); 897 goal = ext4_find_goal(inode, iblock, partial);
922 898
923 /* the number of blocks need to allocate for [d,t]indirect blocks */ 899 /* the number of blocks need to allocate for [d,t]indirect blocks */
@@ -1005,6 +981,9 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
1005 */ 981 */
1006static int ext4_calc_metadata_amount(struct inode *inode, int blocks) 982static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
1007{ 983{
984 if (!blocks)
985 return 0;
986
1008 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 987 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
1009 return ext4_ext_calc_metadata_amount(inode, blocks); 988 return ext4_ext_calc_metadata_amount(inode, blocks);
1010 989
@@ -1025,34 +1004,23 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
1025 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); 1004 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
1026 mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; 1005 mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
1027 1006
1028 /* Account for allocated meta_blocks */ 1007 if (mdb_free) {
1029 mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; 1008 /* Account for allocated meta_blocks */
1009 mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
1030 1010
1031 /* update fs free blocks counter for truncate case */ 1011 /* update fs dirty blocks counter */
1032 percpu_counter_add(&sbi->s_freeblocks_counter, mdb_free); 1012 percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
1013 EXT4_I(inode)->i_allocated_meta_blocks = 0;
1014 EXT4_I(inode)->i_reserved_meta_blocks = mdb;
1015 }
1033 1016
1034 /* update per-inode reservations */ 1017 /* update per-inode reservations */
1035 BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks); 1018 BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
1036 EXT4_I(inode)->i_reserved_data_blocks -= used; 1019 EXT4_I(inode)->i_reserved_data_blocks -= used;
1037 1020
1038 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
1039 EXT4_I(inode)->i_reserved_meta_blocks = mdb;
1040 EXT4_I(inode)->i_allocated_meta_blocks = 0;
1041 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1021 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1042} 1022}
1043 1023
1044/* Maximum number of blocks we map for direct IO at once. */
1045#define DIO_MAX_BLOCKS 4096
1046/*
1047 * Number of credits we need for writing DIO_MAX_BLOCKS:
1048 * We need sb + group descriptor + bitmap + inode -> 4
1049 * For B blocks with A block pointers per block we need:
1050 * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
1051 * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
1052 */
1053#define DIO_CREDITS 25
1054
1055
1056/* 1024/*
1057 * The ext4_get_blocks_wrap() function try to look up the requested blocks, 1025 * The ext4_get_blocks_wrap() function try to look up the requested blocks,
1058 * and returns if the blocks are already mapped. 1026 * and returns if the blocks are already mapped.
@@ -1164,19 +1132,23 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
1164 return retval; 1132 return retval;
1165} 1133}
1166 1134
1167static int ext4_get_block(struct inode *inode, sector_t iblock, 1135/* Maximum number of blocks we map for direct IO at once. */
1168 struct buffer_head *bh_result, int create) 1136#define DIO_MAX_BLOCKS 4096
1137
1138int ext4_get_block(struct inode *inode, sector_t iblock,
1139 struct buffer_head *bh_result, int create)
1169{ 1140{
1170 handle_t *handle = ext4_journal_current_handle(); 1141 handle_t *handle = ext4_journal_current_handle();
1171 int ret = 0, started = 0; 1142 int ret = 0, started = 0;
1172 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 1143 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
1144 int dio_credits;
1173 1145
1174 if (create && !handle) { 1146 if (create && !handle) {
1175 /* Direct IO write... */ 1147 /* Direct IO write... */
1176 if (max_blocks > DIO_MAX_BLOCKS) 1148 if (max_blocks > DIO_MAX_BLOCKS)
1177 max_blocks = DIO_MAX_BLOCKS; 1149 max_blocks = DIO_MAX_BLOCKS;
1178 handle = ext4_journal_start(inode, DIO_CREDITS + 1150 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
1179 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb)); 1151 handle = ext4_journal_start(inode, dio_credits);
1180 if (IS_ERR(handle)) { 1152 if (IS_ERR(handle)) {
1181 ret = PTR_ERR(handle); 1153 ret = PTR_ERR(handle);
1182 goto out; 1154 goto out;
@@ -1244,7 +1216,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1244 BUFFER_TRACE(bh, "call get_create_access"); 1216 BUFFER_TRACE(bh, "call get_create_access");
1245 fatal = ext4_journal_get_create_access(handle, bh); 1217 fatal = ext4_journal_get_create_access(handle, bh);
1246 if (!fatal && !buffer_uptodate(bh)) { 1218 if (!fatal && !buffer_uptodate(bh)) {
1247 memset(bh->b_data,0,inode->i_sb->s_blocksize); 1219 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1248 set_buffer_uptodate(bh); 1220 set_buffer_uptodate(bh);
1249 } 1221 }
1250 unlock_buffer(bh); 1222 unlock_buffer(bh);
@@ -1269,7 +1241,7 @@ err:
1269struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, 1241struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
1270 ext4_lblk_t block, int create, int *err) 1242 ext4_lblk_t block, int create, int *err)
1271{ 1243{
1272 struct buffer_head * bh; 1244 struct buffer_head *bh;
1273 1245
1274 bh = ext4_getblk(handle, inode, block, create, err); 1246 bh = ext4_getblk(handle, inode, block, create, err);
1275 if (!bh) 1247 if (!bh)
@@ -1285,13 +1257,13 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
1285 return NULL; 1257 return NULL;
1286} 1258}
1287 1259
1288static int walk_page_buffers( handle_t *handle, 1260static int walk_page_buffers(handle_t *handle,
1289 struct buffer_head *head, 1261 struct buffer_head *head,
1290 unsigned from, 1262 unsigned from,
1291 unsigned to, 1263 unsigned to,
1292 int *partial, 1264 int *partial,
1293 int (*fn)( handle_t *handle, 1265 int (*fn)(handle_t *handle,
1294 struct buffer_head *bh)) 1266 struct buffer_head *bh))
1295{ 1267{
1296 struct buffer_head *bh; 1268 struct buffer_head *bh;
1297 unsigned block_start, block_end; 1269 unsigned block_start, block_end;
@@ -1299,9 +1271,9 @@ static int walk_page_buffers( handle_t *handle,
1299 int err, ret = 0; 1271 int err, ret = 0;
1300 struct buffer_head *next; 1272 struct buffer_head *next;
1301 1273
1302 for ( bh = head, block_start = 0; 1274 for (bh = head, block_start = 0;
1303 ret == 0 && (bh != head || !block_start); 1275 ret == 0 && (bh != head || !block_start);
1304 block_start = block_end, bh = next) 1276 block_start = block_end, bh = next)
1305 { 1277 {
1306 next = bh->b_this_page; 1278 next = bh->b_this_page;
1307 block_end = block_start + blocksize; 1279 block_end = block_start + blocksize;
@@ -1354,23 +1326,23 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
1354 loff_t pos, unsigned len, unsigned flags, 1326 loff_t pos, unsigned len, unsigned flags,
1355 struct page **pagep, void **fsdata) 1327 struct page **pagep, void **fsdata)
1356{ 1328{
1357 struct inode *inode = mapping->host; 1329 struct inode *inode = mapping->host;
1358 int ret, needed_blocks = ext4_writepage_trans_blocks(inode); 1330 int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
1359 handle_t *handle; 1331 handle_t *handle;
1360 int retries = 0; 1332 int retries = 0;
1361 struct page *page; 1333 struct page *page;
1362 pgoff_t index; 1334 pgoff_t index;
1363 unsigned from, to; 1335 unsigned from, to;
1364 1336
1365 index = pos >> PAGE_CACHE_SHIFT; 1337 index = pos >> PAGE_CACHE_SHIFT;
1366 from = pos & (PAGE_CACHE_SIZE - 1); 1338 from = pos & (PAGE_CACHE_SIZE - 1);
1367 to = from + len; 1339 to = from + len;
1368 1340
1369retry: 1341retry:
1370 handle = ext4_journal_start(inode, needed_blocks); 1342 handle = ext4_journal_start(inode, needed_blocks);
1371 if (IS_ERR(handle)) { 1343 if (IS_ERR(handle)) {
1372 ret = PTR_ERR(handle); 1344 ret = PTR_ERR(handle);
1373 goto out; 1345 goto out;
1374 } 1346 }
1375 1347
1376 page = __grab_cache_page(mapping, index); 1348 page = __grab_cache_page(mapping, index);
@@ -1390,9 +1362,16 @@ retry:
1390 } 1362 }
1391 1363
1392 if (ret) { 1364 if (ret) {
1393 unlock_page(page); 1365 unlock_page(page);
1394 ext4_journal_stop(handle); 1366 ext4_journal_stop(handle);
1395 page_cache_release(page); 1367 page_cache_release(page);
1368 /*
1369 * block_write_begin may have instantiated a few blocks
1370 * outside i_size. Trim these off again. Don't need
1371 * i_size_read because we hold i_mutex.
1372 */
1373 if (pos + len > inode->i_size)
1374 vmtruncate(inode, inode->i_size);
1396 } 1375 }
1397 1376
1398 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 1377 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -1429,16 +1408,18 @@ static int ext4_ordered_write_end(struct file *file,
1429 ret = ext4_jbd2_file_inode(handle, inode); 1408 ret = ext4_jbd2_file_inode(handle, inode);
1430 1409
1431 if (ret == 0) { 1410 if (ret == 0) {
1432 /*
1433 * generic_write_end() will run mark_inode_dirty() if i_size
1434 * changes. So let's piggyback the i_disksize mark_inode_dirty
1435 * into that.
1436 */
1437 loff_t new_i_size; 1411 loff_t new_i_size;
1438 1412
1439 new_i_size = pos + copied; 1413 new_i_size = pos + copied;
1440 if (new_i_size > EXT4_I(inode)->i_disksize) 1414 if (new_i_size > EXT4_I(inode)->i_disksize) {
1441 EXT4_I(inode)->i_disksize = new_i_size; 1415 ext4_update_i_disksize(inode, new_i_size);
1416 /* We need to mark inode dirty even if
1417 * new_i_size is less that inode->i_size
1418 * bu greater than i_disksize.(hint delalloc)
1419 */
1420 ext4_mark_inode_dirty(handle, inode);
1421 }
1422
1442 ret2 = generic_write_end(file, mapping, pos, len, copied, 1423 ret2 = generic_write_end(file, mapping, pos, len, copied,
1443 page, fsdata); 1424 page, fsdata);
1444 copied = ret2; 1425 copied = ret2;
@@ -1463,8 +1444,14 @@ static int ext4_writeback_write_end(struct file *file,
1463 loff_t new_i_size; 1444 loff_t new_i_size;
1464 1445
1465 new_i_size = pos + copied; 1446 new_i_size = pos + copied;
1466 if (new_i_size > EXT4_I(inode)->i_disksize) 1447 if (new_i_size > EXT4_I(inode)->i_disksize) {
1467 EXT4_I(inode)->i_disksize = new_i_size; 1448 ext4_update_i_disksize(inode, new_i_size);
1449 /* We need to mark inode dirty even if
1450 * new_i_size is less than inode->i_size
1451 * but greater than i_disksize (hint: delalloc)
1452 */
1453 ext4_mark_inode_dirty(handle, inode);
1454 }
1468 1455
1469 ret2 = generic_write_end(file, mapping, pos, len, copied, 1456 ret2 = generic_write_end(file, mapping, pos, len, copied,
1470 page, fsdata); 1457 page, fsdata);
@@ -1489,6 +1476,7 @@ static int ext4_journalled_write_end(struct file *file,
1489 int ret = 0, ret2; 1476 int ret = 0, ret2;
1490 int partial = 0; 1477 int partial = 0;
1491 unsigned from, to; 1478 unsigned from, to;
1479 loff_t new_i_size;
1492 1480
1493 from = pos & (PAGE_CACHE_SIZE - 1); 1481 from = pos & (PAGE_CACHE_SIZE - 1);
1494 to = from + len; 1482 to = from + len;
@@ -1503,11 +1491,12 @@ static int ext4_journalled_write_end(struct file *file,
1503 to, &partial, write_end_fn); 1491 to, &partial, write_end_fn);
1504 if (!partial) 1492 if (!partial)
1505 SetPageUptodate(page); 1493 SetPageUptodate(page);
1506 if (pos+copied > inode->i_size) 1494 new_i_size = pos + copied;
1495 if (new_i_size > inode->i_size)
1507 i_size_write(inode, pos+copied); 1496 i_size_write(inode, pos+copied);
1508 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 1497 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
1509 if (inode->i_size > EXT4_I(inode)->i_disksize) { 1498 if (new_i_size > EXT4_I(inode)->i_disksize) {
1510 EXT4_I(inode)->i_disksize = inode->i_size; 1499 ext4_update_i_disksize(inode, new_i_size);
1511 ret2 = ext4_mark_inode_dirty(handle, inode); 1500 ret2 = ext4_mark_inode_dirty(handle, inode);
1512 if (!ret) 1501 if (!ret)
1513 ret = ret2; 1502 ret = ret2;
@@ -1524,6 +1513,7 @@ static int ext4_journalled_write_end(struct file *file,
1524 1513
1525static int ext4_da_reserve_space(struct inode *inode, int nrblocks) 1514static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
1526{ 1515{
1516 int retries = 0;
1527 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1517 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1528 unsigned long md_needed, mdblocks, total = 0; 1518 unsigned long md_needed, mdblocks, total = 0;
1529 1519
@@ -1532,6 +1522,7 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
1532 * in order to allocate nrblocks 1522 * in order to allocate nrblocks
1533 * worse case is one extent per block 1523 * worse case is one extent per block
1534 */ 1524 */
1525repeat:
1535 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1526 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1536 total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; 1527 total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
1537 mdblocks = ext4_calc_metadata_amount(inode, total); 1528 mdblocks = ext4_calc_metadata_amount(inode, total);
@@ -1540,13 +1531,14 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
1540 md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; 1531 md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
1541 total = md_needed + nrblocks; 1532 total = md_needed + nrblocks;
1542 1533
1543 if (ext4_has_free_blocks(sbi, total) < total) { 1534 if (ext4_claim_free_blocks(sbi, total)) {
1544 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1535 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1536 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1537 yield();
1538 goto repeat;
1539 }
1545 return -ENOSPC; 1540 return -ENOSPC;
1546 } 1541 }
1547 /* reduce fs free blocks counter */
1548 percpu_counter_sub(&sbi->s_freeblocks_counter, total);
1549
1550 EXT4_I(inode)->i_reserved_data_blocks += nrblocks; 1542 EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
1551 EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; 1543 EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
1552 1544
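ext4_claim_free_blocks() folds the availability check and the subtraction into one atomic claim, so a failure means real exhaustion rather than a stale read of s_freeblocks_counter, and the caller now backs off and retries in case a journal commit frees blocks. A self-contained sketch of that claim-then-retry shape (C11 atomics stand in for the percpu counter; the retry bound is illustrative):

    #include <sched.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_long free_blocks = 1000;  /* s_freeblocks_counter stand-in */

    /* atomically claim 'n' blocks or fail; no check-then-subtract race */
    static bool claim_free_blocks(long n)
    {
            long cur = atomic_load(&free_blocks);

            while (cur >= n)
                    if (atomic_compare_exchange_weak(&free_blocks, &cur, cur - n))
                            return true;
            return false;
    }

    static int da_reserve_space(long nrblocks, long md_needed)
    {
            int retries = 0;

            while (!claim_free_blocks(nrblocks + md_needed)) {
                    if (retries++ >= 3)     /* illustrative bound */
                            return -1;      /* -ENOSPC */
                    sched_yield();          /* let writeback/commit free blocks */
            }
            return 0;
    }

    int main(void)
    {
            printf("reserve 10+2 -> %d, free now %ld\n",
                   da_reserve_space(10, 2), atomic_load(&free_blocks));
            return 0;
    }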
@@ -1559,7 +1551,25 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1559 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1551 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1560 int total, mdb, mdb_free, release; 1552 int total, mdb, mdb_free, release;
1561 1553
1554 if (!to_free)
1555 return; /* Nothing to release, exit */
1556
1562 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1557 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1558
1559 if (!EXT4_I(inode)->i_reserved_data_blocks) {
1560 /*
1561 * If there are no reserved blocks but we try to free
1562 * some, then the counter is messed up somewhere.
1563 * Since this function is called from invalidatepage,
1564 * it's harmless to return without any action.
1565 */
1566 printk(KERN_INFO "ext4 delalloc trying to release %d reserved "
1567 "blocks for inode %lu, but there are no reserved "
1568 "data blocks\n", to_free, inode->i_ino);
1569 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1570 return;
1571 }
1572
1563 /* recalculate the number of metadata blocks that still need to be reserved */ 1573 /* recalculate the number of metadata blocks that still need to be reserved */
1564 total = EXT4_I(inode)->i_reserved_data_blocks - to_free; 1574 total = EXT4_I(inode)->i_reserved_data_blocks - to_free;
1565 mdb = ext4_calc_metadata_amount(inode, total); 1575 mdb = ext4_calc_metadata_amount(inode, total);
@@ -1570,8 +1580,8 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1570 1580
1571 release = to_free + mdb_free; 1581 release = to_free + mdb_free;
1572 1582
1573 /* update fs free blocks counter for truncate case */ 1583 /* update fs dirty blocks counter for truncate case */
1574 percpu_counter_add(&sbi->s_freeblocks_counter, release); 1584 percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
1575 1585
1576 /* update per-inode reservations */ 1586 /* update per-inode reservations */
1577 BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks); 1587 BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
@@ -1613,11 +1623,14 @@ struct mpage_da_data {
1613 unsigned long first_page, next_page; /* extent of pages */ 1623 unsigned long first_page, next_page; /* extent of pages */
1614 get_block_t *get_block; 1624 get_block_t *get_block;
1615 struct writeback_control *wbc; 1625 struct writeback_control *wbc;
1626 int io_done;
1627 long pages_written;
1628 int retval;
1616}; 1629};
1617 1630
1618/* 1631/*
1619 * mpage_da_submit_io - walks through extent of pages and try to write 1632 * mpage_da_submit_io - walks through extent of pages and try to write
1620 * them with __mpage_writepage() 1633 * them with writepage() call back
1621 * 1634 *
1622 * @mpd->inode: inode 1635 * @mpd->inode: inode
1623 * @mpd->first_page: first page of the extent 1636 * @mpd->first_page: first page of the extent
@@ -1632,18 +1645,11 @@ struct mpage_da_data {
1632static int mpage_da_submit_io(struct mpage_da_data *mpd) 1645static int mpage_da_submit_io(struct mpage_da_data *mpd)
1633{ 1646{
1634 struct address_space *mapping = mpd->inode->i_mapping; 1647 struct address_space *mapping = mpd->inode->i_mapping;
1635 struct mpage_data mpd_pp = {
1636 .bio = NULL,
1637 .last_block_in_bio = 0,
1638 .get_block = mpd->get_block,
1639 .use_writepage = 1,
1640 };
1641 int ret = 0, err, nr_pages, i; 1648 int ret = 0, err, nr_pages, i;
1642 unsigned long index, end; 1649 unsigned long index, end;
1643 struct pagevec pvec; 1650 struct pagevec pvec;
1644 1651
1645 BUG_ON(mpd->next_page <= mpd->first_page); 1652 BUG_ON(mpd->next_page <= mpd->first_page);
1646
1647 pagevec_init(&pvec, 0); 1653 pagevec_init(&pvec, 0);
1648 index = mpd->first_page; 1654 index = mpd->first_page;
1649 end = mpd->next_page - 1; 1655 end = mpd->next_page - 1;
@@ -1661,8 +1667,9 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
1661 break; 1667 break;
1662 index++; 1668 index++;
1663 1669
1664 err = __mpage_writepage(page, mpd->wbc, &mpd_pp); 1670 err = mapping->a_ops->writepage(page, mpd->wbc);
1665 1671 if (!err)
1672 mpd->pages_written++;
1666 /* 1673 /*
1667 * In error case, we have to continue because 1674 * In error case, we have to continue because
1668 * remaining pages are still locked 1675 * remaining pages are still locked
@@ -1673,9 +1680,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
1673 } 1680 }
1674 pagevec_release(&pvec); 1681 pagevec_release(&pvec);
1675 } 1682 }
1676 if (mpd_pp.bio)
1677 mpage_bio_submit(WRITE, mpd_pp.bio);
1678
1679 return ret; 1683 return ret;
1680} 1684}
1681 1685
@@ -1698,7 +1702,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
1698 int blocks = exbh->b_size >> inode->i_blkbits; 1702 int blocks = exbh->b_size >> inode->i_blkbits;
1699 sector_t pblock = exbh->b_blocknr, cur_logical; 1703 sector_t pblock = exbh->b_blocknr, cur_logical;
1700 struct buffer_head *head, *bh; 1704 struct buffer_head *head, *bh;
1701 unsigned long index, end; 1705 pgoff_t index, end;
1702 struct pagevec pvec; 1706 struct pagevec pvec;
1703 int nr_pages, i; 1707 int nr_pages, i;
1704 1708
@@ -1741,6 +1745,13 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
1741 if (buffer_delay(bh)) { 1745 if (buffer_delay(bh)) {
1742 bh->b_blocknr = pblock; 1746 bh->b_blocknr = pblock;
1743 clear_buffer_delay(bh); 1747 clear_buffer_delay(bh);
1748 bh->b_bdev = inode->i_sb->s_bdev;
1749 } else if (buffer_unwritten(bh)) {
1750 bh->b_blocknr = pblock;
1751 clear_buffer_unwritten(bh);
1752 set_buffer_mapped(bh);
1753 set_buffer_new(bh);
1754 bh->b_bdev = inode->i_sb->s_bdev;
1744 } else if (buffer_mapped(bh)) 1755 } else if (buffer_mapped(bh))
1745 BUG_ON(bh->b_blocknr != pblock); 1756 BUG_ON(bh->b_blocknr != pblock);
1746 1757
@@ -1768,6 +1779,57 @@ static inline void __unmap_underlying_blocks(struct inode *inode,
1768 unmap_underlying_metadata(bdev, bh->b_blocknr + i); 1779 unmap_underlying_metadata(bdev, bh->b_blocknr + i);
1769} 1780}
1770 1781
1782static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
1783 sector_t logical, long blk_cnt)
1784{
1785 int nr_pages, i;
1786 pgoff_t index, end;
1787 struct pagevec pvec;
1788 struct inode *inode = mpd->inode;
1789 struct address_space *mapping = inode->i_mapping;
1790
1791 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
1792 end = (logical + blk_cnt - 1) >>
1793 (PAGE_CACHE_SHIFT - inode->i_blkbits);
1794 while (index <= end) {
1795 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1796 if (nr_pages == 0)
1797 break;
1798 for (i = 0; i < nr_pages; i++) {
1799 struct page *page = pvec.pages[i];
1800 index = page->index;
1801 if (index > end)
1802 break;
1803 index++;
1804
1805 BUG_ON(!PageLocked(page));
1806 BUG_ON(PageWriteback(page));
1807 block_invalidatepage(page, 0);
1808 ClearPageUptodate(page);
1809 unlock_page(page);
1810 }
1811 }
1812 return;
1813}
1814
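ext4_da_block_invalidatepages() maps a block range onto a page-index range with one shift, since PAGE_CACHE_SHIFT - i_blkbits is log2 of the blocks per page. A standalone sketch of that conversion, assuming 4 KiB pages and 1 KiB blocks for illustration:

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12     /* 4 KiB pages */

/* page-index range covered by blk_cnt blocks starting at 'logical' */
static void blocks_to_page_range(unsigned long long logical, long blk_cnt,
                                 int blkbits,
                                 unsigned long *index, unsigned long *end)
{
        int blocks_per_page_shift = PAGE_CACHE_SHIFT - blkbits;

        *index = logical >> blocks_per_page_shift;
        *end   = (logical + blk_cnt - 1) >> blocks_per_page_shift;
}

int main(void)
{
        unsigned long index, end;

        /* 1 KiB blocks: 4 blocks per page, so blocks 5..12 span pages 1..3 */
        blocks_to_page_range(5, 8, 10, &index, &end);
        printf("pages %lu..%lu\n", index, end);
        return 0;
}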
1815static void ext4_print_free_blocks(struct inode *inode)
1816{
1817 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1818 printk(KERN_EMERG "Total free blocks count %lld\n",
1819 ext4_count_free_blocks(inode->i_sb));
1820 printk(KERN_EMERG "Free/Dirty block details\n");
1821 printk(KERN_EMERG "free_blocks=%lld\n",
1822 percpu_counter_sum(&sbi->s_freeblocks_counter));
1823 printk(KERN_EMERG "dirty_blocks=%lld\n",
1824 percpu_counter_sum(&sbi->s_dirtyblocks_counter));
1825 printk(KERN_EMERG "Block reservation details\n");
1826 printk(KERN_EMERG "i_reserved_data_blocks=%lu\n",
1827 EXT4_I(inode)->i_reserved_data_blocks);
1828 printk(KERN_EMERG "i_reserved_meta_blocks=%lu\n",
1829 EXT4_I(inode)->i_reserved_meta_blocks);
1830 return;
1831}
1832
1771/* 1833/*
1772 * mpage_da_map_blocks - go through given space 1834 * mpage_da_map_blocks - go through given space
1773 * 1835 *
@@ -1776,54 +1838,87 @@ static inline void __unmap_underlying_blocks(struct inode *inode,
1776 * 1838 *
1777 * The function skips space we know is already mapped to disk blocks. 1839 * The function skips space we know is already mapped to disk blocks.
1778 * 1840 *
1779 * The function ignores errors ->get_block() returns, thus real
1780 * error handling is postponed to __mpage_writepage()
1781 */ 1841 */
1782static void mpage_da_map_blocks(struct mpage_da_data *mpd) 1842static int mpage_da_map_blocks(struct mpage_da_data *mpd)
1783{ 1843{
1784 struct buffer_head *lbh = &mpd->lbh; 1844 int err = 0;
1785 int err = 0, remain = lbh->b_size;
1786 sector_t next = lbh->b_blocknr;
1787 struct buffer_head new; 1845 struct buffer_head new;
1846 struct buffer_head *lbh = &mpd->lbh;
1847 sector_t next;
1788 1848
1789 /* 1849 /*
1790 * We consider only non-mapped and non-allocated blocks 1850 * We consider only non-mapped and non-allocated blocks
1791 */ 1851 */
1792 if (buffer_mapped(lbh) && !buffer_delay(lbh)) 1852 if (buffer_mapped(lbh) && !buffer_delay(lbh))
1793 return; 1853 return 0;
1854 new.b_state = lbh->b_state;
1855 new.b_blocknr = 0;
1856 new.b_size = lbh->b_size;
1857 next = lbh->b_blocknr;
1858 /*
1859 * If we didn't accumulate anything
1860 * to write, simply return
1861 */
1862 if (!new.b_size)
1863 return 0;
1864 err = mpd->get_block(mpd->inode, next, &new, 1);
1865 if (err) {
1794 1866
1795 while (remain) { 1867 /* If get_block returns an error,
1796 new.b_state = lbh->b_state; 1868 * we simply return. Later writepage
1797 new.b_blocknr = 0; 1869 * will redirty the page and writepages
1798 new.b_size = remain; 1870 * will find the dirty page again
1799 err = mpd->get_block(mpd->inode, next, &new, 1); 1871 */
1800 if (err) { 1872 if (err == -EAGAIN)
1801 /* 1873 return 0;
1802 * Rather than implement own error handling
1803 * here, we just leave remaining blocks
1804 * unallocated and try again with ->writepage()
1805 */
1806 break;
1807 }
1808 BUG_ON(new.b_size == 0);
1809 1874
1810 if (buffer_new(&new)) 1875 if (err == -ENOSPC &&
1811 __unmap_underlying_blocks(mpd->inode, &new); 1876 ext4_count_free_blocks(mpd->inode->i_sb)) {
1877 mpd->retval = err;
1878 return 0;
1879 }
1812 1880
1813 /* 1881 /*
1814 * If blocks are delayed marked, we need to 1882 * A get_block failure will cause us
1815 * put actual blocknr and drop delayed bit 1883 * to loop in writepages, because
1884 * a_ops->writepage won't be able to
1885 * make progress. The page will be redirtied
1886 * by writepage, and writepages will again
1887 * try to write the same page.
1816 */ 1888 */
1817 if (buffer_delay(lbh)) 1889 printk(KERN_EMERG "%s block allocation failed for inode %lu "
1818 mpage_put_bnr_to_bhs(mpd, next, &new); 1890 "at logical offset %llu with max blocks "
1819 1891 "%zd with error %d\n",
1820 /* go for the remaining blocks */ 1892 __func__, mpd->inode->i_ino,
1821 next += new.b_size >> mpd->inode->i_blkbits; 1893 (unsigned long long)next,
1822 remain -= new.b_size; 1894 lbh->b_size >> mpd->inode->i_blkbits, err);
1895 printk(KERN_EMERG "This should not happen!! "
1896 "Data will be lost\n");
1897 if (err == -ENOSPC) {
1898 ext4_print_free_blocks(mpd->inode);
1899 }
1900 /* invalidate all the pages */
1901 ext4_da_block_invalidatepages(mpd, next,
1902 lbh->b_size >> mpd->inode->i_blkbits);
1903 return err;
1823 } 1904 }
1905 BUG_ON(new.b_size == 0);
1906
1907 if (buffer_new(&new))
1908 __unmap_underlying_blocks(mpd->inode, &new);
1909
1910 /*
1911 * If blocks are delayed marked, we need to
1912 * put actual blocknr and drop delayed bit
1913 */
1914 if (buffer_delay(lbh) || buffer_unwritten(lbh))
1915 mpage_put_bnr_to_bhs(mpd, next, &new);
1916
1917 return 0;
1824} 1918}
1825 1919
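The rewritten mpage_da_map_blocks() now sorts get_block failures into three outcomes: -EAGAIN returns 0 so writepage can simply redirty, -ENOSPC while the filesystem still reports free blocks is stashed in mpd->retval so the caller can force a journal commit, and anything else invalidates the accumulated pages. A small sketch of that triage; classify() and the enum are hypothetical helpers, not kernel code:

#include <errno.h>
#include <stdio.h>

/* outcome of the allocation attempt, mirroring the three paths above */
enum map_action { MAP_RETRY_LATER, MAP_FORCE_COMMIT, MAP_INVALIDATE };

static enum map_action classify(int err, long long free_blocks, int *retval)
{
        if (err == -EAGAIN)
                return MAP_RETRY_LATER;        /* writepage will redirty */
        if (err == -ENOSPC && free_blocks) {
                *retval = err;                 /* journal commit may help */
                return MAP_FORCE_COMMIT;
        }
        return MAP_INVALIDATE;                 /* hard failure: drop pages */
}

int main(void)
{
        int retval = 0;
        printf("%d %d %d\n",
               classify(-EAGAIN, 100, &retval),
               classify(-ENOSPC, 100, &retval),
               classify(-ENOSPC, 0, &retval));
        return 0;
}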
1826#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay)) 1920#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
1921 (1 << BH_Delay) | (1 << BH_Unwritten))
1827 1922
1828/* 1923/*
1829 * mpage_add_bh_to_extent - try to add one more block to extent of blocks 1924 * mpage_add_bh_to_extent - try to add one more block to extent of blocks
@@ -1837,41 +1932,61 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
1837static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, 1932static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
1838 sector_t logical, struct buffer_head *bh) 1933 sector_t logical, struct buffer_head *bh)
1839{ 1934{
1840 struct buffer_head *lbh = &mpd->lbh;
1841 sector_t next; 1935 sector_t next;
1936 size_t b_size = bh->b_size;
1937 struct buffer_head *lbh = &mpd->lbh;
1938 int nrblocks = lbh->b_size >> mpd->inode->i_blkbits;
1842 1939
1843 next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits); 1940 /* check if the reserved journal credits might overflow */
1844 1941 if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
1942 if (nrblocks >= EXT4_MAX_TRANS_DATA) {
1943 /*
1944 * With non-extent format we are limited by the journal
1945 * credits available. The total credits needed to insert
1946 * nrblocks contiguous blocks depend on
1947 * nrblocks. So limit nrblocks.
1948 */
1949 goto flush_it;
1950 } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
1951 EXT4_MAX_TRANS_DATA) {
1952 /*
1953 * Adding the new buffer_head would make it cross the
1954 * allowed limit for which we have journal credit
1955 * reserved. So limit the new bh->b_size
1956 */
1957 b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
1958 mpd->inode->i_blkbits;
1959 /* we will do mpage_da_submit_io in the next loop */
1960 }
1961 }
1845 /* 1962 /*
1846 * First block in the extent 1963 * First block in the extent
1847 */ 1964 */
1848 if (lbh->b_size == 0) { 1965 if (lbh->b_size == 0) {
1849 lbh->b_blocknr = logical; 1966 lbh->b_blocknr = logical;
1850 lbh->b_size = bh->b_size; 1967 lbh->b_size = b_size;
1851 lbh->b_state = bh->b_state & BH_FLAGS; 1968 lbh->b_state = bh->b_state & BH_FLAGS;
1852 return; 1969 return;
1853 } 1970 }
1854 1971
1972 next = lbh->b_blocknr + nrblocks;
1855 /* 1973 /*
1856 * Can we merge the block to our big extent? 1974 * Can we merge the block to our big extent?
1857 */ 1975 */
1858 if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { 1976 if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
1859 lbh->b_size += bh->b_size; 1977 lbh->b_size += b_size;
1860 return; 1978 return;
1861 } 1979 }
1862 1980
1981flush_it:
1863 /* 1982 /*
1864 * We couldn't merge the block to our extent, so we 1983 * We couldn't merge the block to our extent, so we
1865 * need to flush current extent and start new one 1984 * need to flush current extent and start new one
1866 */ 1985 */
1867 mpage_da_map_blocks(mpd); 1986 if (mpage_da_map_blocks(mpd) == 0)
1868 1987 mpage_da_submit_io(mpd);
1869 /* 1988 mpd->io_done = 1;
1870 * Now start a new extent 1989 return;
1871 */
1872 lbh->b_size = bh->b_size;
1873 lbh->b_state = bh->b_state & BH_FLAGS;
1874 lbh->b_blocknr = logical;
1875} 1990}
1876 1991
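For non-extent inodes, mpage_add_bh_to_extent() bounds the extent so the journal credits reserved up front cannot overflow: an extent already at EXT4_MAX_TRANS_DATA is flushed, and an almost-full one accepts only enough blocks to reach the limit. A user-space sketch of the clamping arithmetic, with an illustrative MAX_TRANS_DATA value:

#include <stdio.h>
#include <stddef.h>

#define MAX_TRANS_DATA 64       /* stand-in for EXT4_MAX_TRANS_DATA */

/*
 * Decide how many of 'add' blocks may join an extent that already
 * holds 'nrblocks' blocks without overflowing the reserved credits.
 * Returns the blocks to add; 0 means "flush the current extent first".
 */
static size_t clamp_extent(size_t nrblocks, size_t add)
{
        if (nrblocks >= MAX_TRANS_DATA)
                return 0;                        /* flush_it path */
        if (nrblocks + add > MAX_TRANS_DATA)
                add = MAX_TRANS_DATA - nrblocks; /* partial add */
        return add;
}

int main(void)
{
        printf("%zu %zu %zu\n",
               clamp_extent(10, 16),    /* fits: 16 */
               clamp_extent(60, 16),    /* clamped: 4 */
               clamp_extent(70, 16));   /* flush first: 0 */
        return 0;
}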
1877/* 1992/*
@@ -1891,17 +2006,35 @@ static int __mpage_da_writepage(struct page *page,
1891 struct buffer_head *bh, *head, fake; 2006 struct buffer_head *bh, *head, fake;
1892 sector_t logical; 2007 sector_t logical;
1893 2008
2009 if (mpd->io_done) {
2010 /*
2011 * Rest of the page in the page_vec
2012 * redirty then and skip then. We will
2013 * try to to write them again after
2014 * starting a new transaction
2015 */
2016 redirty_page_for_writepage(wbc, page);
2017 unlock_page(page);
2018 return MPAGE_DA_EXTENT_TAIL;
2019 }
1894 /* 2020 /*
1895 * Can we merge this page to current extent? 2021 * Can we merge this page to current extent?
1896 */ 2022 */
1897 if (mpd->next_page != page->index) { 2023 if (mpd->next_page != page->index) {
1898 /* 2024 /*
1899 * Nope, we can't. So, we map non-allocated blocks 2025 * Nope, we can't. So, we map non-allocated blocks
1900 * and start IO on them using __mpage_writepage() 2026 * and start IO on them using writepage()
1901 */ 2027 */
1902 if (mpd->next_page != mpd->first_page) { 2028 if (mpd->next_page != mpd->first_page) {
1903 mpage_da_map_blocks(mpd); 2029 if (mpage_da_map_blocks(mpd) == 0)
1904 mpage_da_submit_io(mpd); 2030 mpage_da_submit_io(mpd);
2031 /*
2032 * skip rest of the page in the page_vec
2033 */
2034 mpd->io_done = 1;
2035 redirty_page_for_writepage(wbc, page);
2036 unlock_page(page);
2037 return MPAGE_DA_EXTENT_TAIL;
1905 } 2038 }
1906 2039
1907 /* 2040 /*
@@ -1932,6 +2065,8 @@ static int __mpage_da_writepage(struct page *page,
1932 set_buffer_dirty(bh); 2065 set_buffer_dirty(bh);
1933 set_buffer_uptodate(bh); 2066 set_buffer_uptodate(bh);
1934 mpage_add_bh_to_extent(mpd, logical, bh); 2067 mpage_add_bh_to_extent(mpd, logical, bh);
2068 if (mpd->io_done)
2069 return MPAGE_DA_EXTENT_TAIL;
1935 } else { 2070 } else {
1936 /* 2071 /*
1937 * Page with regular buffer heads, just add all dirty ones 2072 * Page with regular buffer heads, just add all dirty ones
@@ -1940,8 +2075,12 @@ static int __mpage_da_writepage(struct page *page,
1940 bh = head; 2075 bh = head;
1941 do { 2076 do {
1942 BUG_ON(buffer_locked(bh)); 2077 BUG_ON(buffer_locked(bh));
1943 if (buffer_dirty(bh)) 2078 if (buffer_dirty(bh) &&
2079 (!buffer_mapped(bh) || buffer_delay(bh))) {
1944 mpage_add_bh_to_extent(mpd, logical, bh); 2080 mpage_add_bh_to_extent(mpd, logical, bh);
2081 if (mpd->io_done)
2082 return MPAGE_DA_EXTENT_TAIL;
2083 }
1945 logical++; 2084 logical++;
1946 } while ((bh = bh->b_this_page) != head); 2085 } while ((bh = bh->b_this_page) != head);
1947 } 2086 }
@@ -1960,46 +2099,39 @@ static int __mpage_da_writepage(struct page *page,
1960 * 2099 *
1961 * This is a library function, which implements the writepages() 2100 * This is a library function, which implements the writepages()
1962 * address_space_operation. 2101 * address_space_operation.
1963 *
1964 * In order to avoid duplication of logic that deals with partial pages,
1965 * multiple bio per page, etc, we find non-allocated blocks, allocate
1966 * them with minimal calls to ->get_block() and re-use __mpage_writepage()
1967 *
1968 * It's important that we call __mpage_writepage() only once for each
1969 * involved page, otherwise we'd have to implement more complicated logic
1970 * to deal with pages w/o PG_lock or w/ PG_writeback and so on.
1971 *
1972 * See comments to mpage_writepages()
1973 */ 2102 */
1974static int mpage_da_writepages(struct address_space *mapping, 2103static int mpage_da_writepages(struct address_space *mapping,
1975 struct writeback_control *wbc, 2104 struct writeback_control *wbc,
1976 get_block_t get_block) 2105 struct mpage_da_data *mpd)
1977{ 2106{
1978 struct mpage_da_data mpd; 2107 long to_write;
1979 int ret; 2108 int ret;
1980 2109
1981 if (!get_block) 2110 if (!mpd->get_block)
1982 return generic_writepages(mapping, wbc); 2111 return generic_writepages(mapping, wbc);
1983 2112
1984 mpd.wbc = wbc; 2113 mpd->lbh.b_size = 0;
1985 mpd.inode = mapping->host; 2114 mpd->lbh.b_state = 0;
1986 mpd.lbh.b_size = 0; 2115 mpd->lbh.b_blocknr = 0;
1987 mpd.lbh.b_state = 0; 2116 mpd->first_page = 0;
1988 mpd.lbh.b_blocknr = 0; 2117 mpd->next_page = 0;
1989 mpd.first_page = 0; 2118 mpd->io_done = 0;
1990 mpd.next_page = 0; 2119 mpd->pages_written = 0;
1991 mpd.get_block = get_block; 2120 mpd->retval = 0;
2121
2122 to_write = wbc->nr_to_write;
1992 2123
1993 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd); 2124 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
1994 2125
1995 /* 2126 /*
1996 * Handle last extent of pages 2127 * Handle last extent of pages
1997 */ 2128 */
1998 if (mpd.next_page != mpd.first_page) { 2129 if (!mpd->io_done && mpd->next_page != mpd->first_page) {
1999 mpage_da_map_blocks(&mpd); 2130 if (mpage_da_map_blocks(mpd) == 0)
2000 mpage_da_submit_io(&mpd); 2131 mpage_da_submit_io(mpd);
2001 } 2132 }
2002 2133
2134 wbc->nr_to_write = to_write - mpd->pages_written;
2003 return ret; 2135 return ret;
2004} 2136}
2005 2137
@@ -2052,18 +2184,24 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
2052 handle_t *handle = NULL; 2184 handle_t *handle = NULL;
2053 2185
2054 handle = ext4_journal_current_handle(); 2186 handle = ext4_journal_current_handle();
2055 if (!handle) { 2187 BUG_ON(!handle);
2056 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, 2188 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
2057 bh_result, 0, 0, 0); 2189 bh_result, create, 0, EXT4_DELALLOC_RSVED);
2058 BUG_ON(!ret);
2059 } else {
2060 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
2061 bh_result, create, 0, EXT4_DELALLOC_RSVED);
2062 }
2063
2064 if (ret > 0) { 2190 if (ret > 0) {
2191
2065 bh_result->b_size = (ret << inode->i_blkbits); 2192 bh_result->b_size = (ret << inode->i_blkbits);
2066 2193
2194 if (ext4_should_order_data(inode)) {
2195 int retval;
2196 retval = ext4_jbd2_file_inode(handle, inode);
2197 if (retval)
2198 /*
2199 * Failed to add inode for ordered
2200 * mode. Don't update file size
2201 */
2202 return retval;
2203 }
2204
2067 /* 2205 /*
2068 * Update on-disk size along with block allocation 2206 * Update on-disk size along with block allocation
2069 * we don't use 'extend_disksize' as size may change 2207 * we don't use 'extend_disksize' as size may change
@@ -2073,18 +2211,9 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
2073 if (disksize > i_size_read(inode)) 2211 if (disksize > i_size_read(inode))
2074 disksize = i_size_read(inode); 2212 disksize = i_size_read(inode);
2075 if (disksize > EXT4_I(inode)->i_disksize) { 2213 if (disksize > EXT4_I(inode)->i_disksize) {
2076 /* 2214 ext4_update_i_disksize(inode, disksize);
2077 * XXX: replace with spinlock if seen contended -bzzz 2215 ret = ext4_mark_inode_dirty(handle, inode);
2078 */ 2216 return ret;
2079 down_write(&EXT4_I(inode)->i_data_sem);
2080 if (disksize > EXT4_I(inode)->i_disksize)
2081 EXT4_I(inode)->i_disksize = disksize;
2082 up_write(&EXT4_I(inode)->i_data_sem);
2083
2084 if (EXT4_I(inode)->i_disksize == disksize) {
2085 ret = ext4_mark_inode_dirty(handle, inode);
2086 return ret;
2087 }
2088 } 2217 }
2089 ret = 0; 2218 ret = 0;
2090 } 2219 }
@@ -2204,84 +2333,114 @@ static int ext4_da_writepage(struct page *page,
2204} 2333}
2205 2334
2206/* 2335/*
2207 * For now just follow the DIO way to estimate the max credits 2336 * This is called via ext4_da_writepages() to
2208 * needed to write out EXT4_MAX_WRITEBACK_PAGES. 2337 * calculate the total number of credits to reserve to fit
2209 * todo: need to calculate the max credits need for 2338 * a single extent allocation into a single transaction;
2210 * extent based files, currently the DIO credits is based on 2339 * ext4_da_writepages() will loop calling this before
2211 * indirect-blocks mapping way. 2340 * the block allocation.
2212 *
2213 * Probably should have a generic way to calculate credits
2214 * for DIO, writepages, and truncate
2215 */ 2341 */
2216#define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS 2342
2217#define EXT4_MAX_WRITEBACK_CREDITS DIO_CREDITS 2343static int ext4_da_writepages_trans_blocks(struct inode *inode)
2344{
2345 int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
2346
2347 /*
2348 * With non-extent format the journal credits needed to
2349 * insert nrblocks contiguous blocks depend on the
2350 * number of contiguous blocks. So we will limit the
2351 * number of contiguous blocks to a sane value
2352 */
2353 if (!(inode->i_flags & EXT4_EXTENTS_FL) &&
2354 (max_blocks > EXT4_MAX_TRANS_DATA))
2355 max_blocks = EXT4_MAX_TRANS_DATA;
2356
2357 return ext4_chunk_trans_blocks(inode, max_blocks);
2358}
2218 2359
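ext4_da_writepages_trans_blocks() budgets for the reserved data blocks but, on a non-extent inode, first caps them at EXT4_MAX_TRANS_DATA before asking ext4_chunk_trans_blocks() (added later in this patch) for credits. A sketch of the cap feeding a stubbed credit formula; both the constant and the formula are illustrative:

#include <stdio.h>

#define MAX_TRANS_DATA 64               /* stand-in for EXT4_MAX_TRANS_DATA */

/* stub: pretend each chunk costs a base plus one credit per 16 blocks */
static int chunk_trans_blocks(int nrblocks)
{
        return 8 + nrblocks / 16;
}

static int writepages_trans_blocks(int reserved_data_blocks, int extent_based)
{
        int max_blocks = reserved_data_blocks;

        if (!extent_based && max_blocks > MAX_TRANS_DATA)
                max_blocks = MAX_TRANS_DATA;    /* keep credits bounded */

        return chunk_trans_blocks(max_blocks);
}

int main(void)
{
        printf("%d %d\n",
               writepages_trans_blocks(1000, 0),   /* clamped to 64 */
               writepages_trans_blocks(1000, 1));  /* extents: full 1000 */
        return 0;
}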
2219static int ext4_da_writepages(struct address_space *mapping, 2360static int ext4_da_writepages(struct address_space *mapping,
2220 struct writeback_control *wbc) 2361 struct writeback_control *wbc)
2221{ 2362{
2222 struct inode *inode = mapping->host;
2223 handle_t *handle = NULL; 2363 handle_t *handle = NULL;
2224 int needed_blocks;
2225 int ret = 0;
2226 long to_write;
2227 loff_t range_start = 0; 2364 loff_t range_start = 0;
2365 struct mpage_da_data mpd;
2366 struct inode *inode = mapping->host;
2367 int needed_blocks, ret = 0, nr_to_writebump = 0;
2368 long to_write, pages_skipped = 0;
2369 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2228 2370
2229 /* 2371 /*
2230 * No pages to write? This is mainly a kludge to avoid starting 2372 * No pages to write? This is mainly a kludge to avoid starting
2231 * a transaction for special inodes like journal inode on last iput() 2373 * a transaction for special inodes like journal inode on last iput()
2232 * because that could violate lock ordering on umount 2374 * because that could violate lock ordering on umount
2233 */ 2375 */
2234 if (!mapping->nrpages) 2376 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
2235 return 0; 2377 return 0;
2236
2237 /* 2378 /*
2238 * Estimate the worse case needed credits to write out 2379 * Make sure nr_to_write is >= sbi->s_mb_stream_request
2239 * EXT4_MAX_BUF_BLOCKS pages 2380 * This makes sure blocks for small files are allocated in
2381 * a single attempt. This ensures that small files
2382 * get less fragmented.
2240 */ 2383 */
2241 needed_blocks = EXT4_MAX_WRITEBACK_CREDITS; 2384 if (wbc->nr_to_write < sbi->s_mb_stream_request) {
2385 nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
2386 wbc->nr_to_write = sbi->s_mb_stream_request;
2387 }
2242 2388
2243 to_write = wbc->nr_to_write; 2389 if (!wbc->range_cyclic)
2244 if (!wbc->range_cyclic) {
2245 /* 2390 /*
2246 * If range_cyclic is not set force range_cont 2391 * If range_cyclic is not set force range_cont
2247 * and save the old writeback_index 2392 * and save the old writeback_index
2248 */ 2393 */
2249 wbc->range_cont = 1; 2394 wbc->range_cont = 1;
2250 range_start = wbc->range_start;
2251 }
2252 2395
2253 while (!ret && to_write) { 2396 range_start = wbc->range_start;
2397 pages_skipped = wbc->pages_skipped;
2398
2399 mpd.wbc = wbc;
2400 mpd.inode = mapping->host;
2401
2402restart_loop:
2403 to_write = wbc->nr_to_write;
2404 while (!ret && to_write > 0) {
2405
2406 /*
2407 * We insert one extent at a time, so we need
2408 * the credits for a single extent allocation.
2409 * Journalled mode is currently not supported
2410 * by delalloc.
2411 */
2412 BUG_ON(ext4_should_journal_data(inode));
2413 needed_blocks = ext4_da_writepages_trans_blocks(inode);
2414
2254 /* start a new transaction*/ 2415 /* start a new transaction*/
2255 handle = ext4_journal_start(inode, needed_blocks); 2416 handle = ext4_journal_start(inode, needed_blocks);
2256 if (IS_ERR(handle)) { 2417 if (IS_ERR(handle)) {
2257 ret = PTR_ERR(handle); 2418 ret = PTR_ERR(handle);
2419 printk(KERN_EMERG "%s: jbd2_start: "
2420 "%ld pages, ino %lu; err %d\n", __func__,
2421 wbc->nr_to_write, inode->i_ino, ret);
2422 dump_stack();
2258 goto out_writepages; 2423 goto out_writepages;
2259 } 2424 }
2260 if (ext4_should_order_data(inode)) { 2425 to_write -= wbc->nr_to_write;
2261 /*
2262 * With ordered mode we need to add
2263 * the inode to the journal handle
2264 * when we do block allocation.
2265 */
2266 ret = ext4_jbd2_file_inode(handle, inode);
2267 if (ret) {
2268 ext4_journal_stop(handle);
2269 goto out_writepages;
2270 }
2271 2426
2272 } 2427 mpd.get_block = ext4_da_get_block_write;
2273 /* 2428 ret = mpage_da_writepages(mapping, wbc, &mpd);
2274 * set the max dirty pages could be write at a time
2275 * to fit into the reserved transaction credits
2276 */
2277 if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
2278 wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
2279 2429
2280 to_write -= wbc->nr_to_write;
2281 ret = mpage_da_writepages(mapping, wbc,
2282 ext4_da_get_block_write);
2283 ext4_journal_stop(handle); 2430 ext4_journal_stop(handle);
2284 if (wbc->nr_to_write) { 2431
2432 if (mpd.retval == -ENOSPC)
2433 jbd2_journal_force_commit_nested(sbi->s_journal);
2434
2435 /* reset the retry count */
2436 if (ret == MPAGE_DA_EXTENT_TAIL) {
2437 /*
2438 * got one extent; now try with the
2439 * rest of the pages
2440 */
2441 to_write += wbc->nr_to_write;
2442 ret = 0;
2443 } else if (wbc->nr_to_write) {
2285 /* 2444 /*
2286 * There is no more writeout needed 2445 * There is no more writeout needed
2287 * or we requested a non-blocking writeout 2446
@@ -2293,13 +2452,48 @@ static int ext4_da_writepages(struct address_space *mapping,
2293 wbc->nr_to_write = to_write; 2452 wbc->nr_to_write = to_write;
2294 } 2453 }
2295 2454
2296out_writepages: 2455 if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) {
2297 wbc->nr_to_write = to_write; 2456 /* We skipped pages in this loop */
2298 if (range_start)
2299 wbc->range_start = range_start; 2457 wbc->range_start = range_start;
2458 wbc->nr_to_write = to_write +
2459 wbc->pages_skipped - pages_skipped;
2460 wbc->pages_skipped = pages_skipped;
2461 goto restart_loop;
2462 }
2463
2464out_writepages:
2465 wbc->nr_to_write = to_write - nr_to_writebump;
2466 wbc->range_start = range_start;
2300 return ret; 2467 return ret;
2301} 2468}
2302 2469
2470#define FALL_BACK_TO_NONDELALLOC 1
2471static int ext4_nonda_switch(struct super_block *sb)
2472{
2473 s64 free_blocks, dirty_blocks;
2474 struct ext4_sb_info *sbi = EXT4_SB(sb);
2475
2476 /*
2477 * Switch to non-delalloc mode if we are running low
2478 * on free blocks. The free block accounting via percpu
2479 * counters can get slightly wrong with FBC_BATCH getting
2480 * accumulated on each CPU without updating global counters.
2481 * Delalloc needs accurate free block accounting, so switch
2482 * to non-delalloc when we are near the error range.
2483 */
2484 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
2485 dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);
2486 if (2 * free_blocks < 3 * dirty_blocks ||
2487 free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
2488 /*
2489 * free block count is less than 150% of dirty blocks,
2490 * or free blocks are less than the watermark
2491 */
2492 return 1;
2493 }
2494 return 0;
2495}
2496
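ext4_nonda_switch() makes its decision from two cheap percpu-counter reads: fall back when free blocks drop under 150% of dirty blocks (written as 2 * free < 3 * dirty to avoid division) or under dirty plus a fixed watermark. A standalone sketch of the same test; the watermark value below is illustrative, the real EXT4_FREEBLOCKS_WATERMARK is defined elsewhere:

#include <stdio.h>

#define FREEBLOCKS_WATERMARK 1024   /* illustrative, not the real value */

/* return 1 when delalloc should be switched off */
static int nonda_switch(long long free_blocks, long long dirty_blocks)
{
        /* free < 1.5 * dirty, written multiplication-only */
        if (2 * free_blocks < 3 * dirty_blocks)
                return 1;
        /* free below dirty plus safety watermark */
        if (free_blocks < dirty_blocks + FREEBLOCKS_WATERMARK)
                return 1;
        return 0;
}

int main(void)
{
        printf("%d %d %d\n",
               nonda_switch(10000, 100),   /* plenty free: 0 */
               nonda_switch(1400, 1000),   /* below 150% of dirty: 1 */
               nonda_switch(1600, 1000));  /* above 150% but under watermark: 1 */
        return 0;
}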
2303static int ext4_da_write_begin(struct file *file, struct address_space *mapping, 2497static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2304 loff_t pos, unsigned len, unsigned flags, 2498 loff_t pos, unsigned len, unsigned flags,
2305 struct page **pagep, void **fsdata) 2499 struct page **pagep, void **fsdata)
@@ -2315,6 +2509,12 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2315 from = pos & (PAGE_CACHE_SIZE - 1); 2509 from = pos & (PAGE_CACHE_SIZE - 1);
2316 to = from + len; 2510 to = from + len;
2317 2511
2512 if (ext4_nonda_switch(inode->i_sb)) {
2513 *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
2514 return ext4_write_begin(file, mapping, pos,
2515 len, flags, pagep, fsdata);
2516 }
2517 *fsdata = (void *)0;
2318retry: 2518retry:
2319 /* 2519 /*
2320 * With delayed allocation, we don't log the i_disksize update 2520 * With delayed allocation, we don't log the i_disksize update
@@ -2342,6 +2542,13 @@ retry:
2342 unlock_page(page); 2542 unlock_page(page);
2343 ext4_journal_stop(handle); 2543 ext4_journal_stop(handle);
2344 page_cache_release(page); 2544 page_cache_release(page);
2545 /*
2546 * block_write_begin may have instantiated a few blocks
2547 * outside i_size. Trim these off again. Don't need
2548 * i_size_read because we hold i_mutex.
2549 */
2550 if (pos + len > inode->i_size)
2551 vmtruncate(inode, inode->i_size);
2345 } 2552 }
2346 2553
2347 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 2554 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -2365,7 +2572,7 @@ static int ext4_da_should_update_i_disksize(struct page *page,
2365 bh = page_buffers(page); 2572 bh = page_buffers(page);
2366 idx = offset >> inode->i_blkbits; 2573 idx = offset >> inode->i_blkbits;
2367 2574
2368 for (i=0; i < idx; i++) 2575 for (i = 0; i < idx; i++)
2369 bh = bh->b_this_page; 2576 bh = bh->b_this_page;
2370 2577
2371 if (!buffer_mapped(bh) || (buffer_delay(bh))) 2578 if (!buffer_mapped(bh) || (buffer_delay(bh)))
@@ -2383,9 +2590,22 @@ static int ext4_da_write_end(struct file *file,
2383 handle_t *handle = ext4_journal_current_handle(); 2590 handle_t *handle = ext4_journal_current_handle();
2384 loff_t new_i_size; 2591 loff_t new_i_size;
2385 unsigned long start, end; 2592 unsigned long start, end;
2593 int write_mode = (int)(unsigned long)fsdata;
2594
2595 if (write_mode == FALL_BACK_TO_NONDELALLOC) {
2596 if (ext4_should_order_data(inode)) {
2597 return ext4_ordered_write_end(file, mapping, pos,
2598 len, copied, page, fsdata);
2599 } else if (ext4_should_writeback_data(inode)) {
2600 return ext4_writeback_write_end(file, mapping, pos,
2601 len, copied, page, fsdata);
2602 } else {
2603 BUG();
2604 }
2605 }
2386 2606
2387 start = pos & (PAGE_CACHE_SIZE - 1); 2607 start = pos & (PAGE_CACHE_SIZE - 1);
2388 end = start + copied -1; 2608 end = start + copied - 1;
2389 2609
2390 /* 2610 /*
2391 * generic_write_end() will run mark_inode_dirty() if i_size 2611 * generic_write_end() will run mark_inode_dirty() if i_size
@@ -2409,6 +2629,11 @@ static int ext4_da_write_end(struct file *file,
2409 EXT4_I(inode)->i_disksize = new_i_size; 2629 EXT4_I(inode)->i_disksize = new_i_size;
2410 } 2630 }
2411 up_write(&EXT4_I(inode)->i_data_sem); 2631 up_write(&EXT4_I(inode)->i_data_sem);
2632 /* We need to mark inode dirty even if
2633 * new_i_size is less than inode->i_size
2634 * but greater than i_disksize (hint: delalloc)
2635 */
2636 ext4_mark_inode_dirty(handle, inode);
2412 } 2637 }
2413 } 2638 }
2414 ret2 = generic_write_end(file, mapping, pos, len, copied, 2639 ret2 = generic_write_end(file, mapping, pos, len, copied,
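The fallback path smuggles a mode flag through the otherwise-opaque fsdata cookie: ext4_da_write_begin() stores FALL_BACK_TO_NONDELALLOC as a cast pointer and ext4_da_write_end() casts it back to choose the ordered or writeback completion path. A tiny sketch of that encode/decode round trip; the helper names are made up:

#include <stdio.h>

#define FALL_BACK_TO_NONDELALLOC 1

/* write_begin side: record the chosen mode in the opaque cookie */
static void choose_mode(int low_on_space, void **fsdata)
{
        *fsdata = low_on_space ? (void *)FALL_BACK_TO_NONDELALLOC : (void *)0;
}

/* write_end side: recover the mode from the same cookie */
static int decode_mode(void *fsdata)
{
        return (int)(unsigned long)fsdata;
}

int main(void)
{
        void *fsdata;

        choose_mode(1, &fsdata);
        if (decode_mode(fsdata) == FALL_BACK_TO_NONDELALLOC)
                printf("falling back to non-delalloc write path\n");
        return 0;
}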
@@ -2500,7 +2725,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
2500 return 0; 2725 return 0;
2501 } 2726 }
2502 2727
2503 return generic_block_bmap(mapping,block,ext4_get_block); 2728 return generic_block_bmap(mapping, block, ext4_get_block);
2504} 2729}
2505 2730
2506static int bget_one(handle_t *handle, struct buffer_head *bh) 2731static int bget_one(handle_t *handle, struct buffer_head *bh)
@@ -3106,7 +3331,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth,
3106 if (!partial->key && *partial->p) 3331 if (!partial->key && *partial->p)
3107 /* Writer: end */ 3332 /* Writer: end */
3108 goto no_top; 3333 goto no_top;
3109 for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--) 3334 for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
3110 ; 3335 ;
3111 /* 3336 /*
3112 * OK, we've found the last block that must survive. The rest of our 3337 * OK, we've found the last block that must survive. The rest of our
@@ -3125,7 +3350,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth,
3125 } 3350 }
3126 /* Writer: end */ 3351 /* Writer: end */
3127 3352
3128 while(partial > p) { 3353 while (partial > p) {
3129 brelse(partial->bh); 3354 brelse(partial->bh);
3130 partial--; 3355 partial--;
3131 } 3356 }
@@ -3317,9 +3542,9 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
3317 /* This zaps the entire block. Bottom up. */ 3542 /* This zaps the entire block. Bottom up. */
3318 BUFFER_TRACE(bh, "free child branches"); 3543 BUFFER_TRACE(bh, "free child branches");
3319 ext4_free_branches(handle, inode, bh, 3544 ext4_free_branches(handle, inode, bh,
3320 (__le32*)bh->b_data, 3545 (__le32 *) bh->b_data,
3321 (__le32*)bh->b_data + addr_per_block, 3546 (__le32 *) bh->b_data + addr_per_block,
3322 depth); 3547 depth);
3323 3548
3324 /* 3549 /*
3325 * We've probably journalled the indirect block several 3550 * We've probably journalled the indirect block several
@@ -3486,6 +3711,9 @@ void ext4_truncate(struct inode *inode)
3486 * modify the block allocation tree. 3711 * modify the block allocation tree.
3487 */ 3712 */
3488 down_write(&ei->i_data_sem); 3713 down_write(&ei->i_data_sem);
3714
3715 ext4_discard_preallocations(inode);
3716
3489 /* 3717 /*
3490 * The orphan list entry will now protect us from any crash which 3718 * The orphan list entry will now protect us from any crash which
3491 * occurs before the truncate completes, so it is now safe to propagate 3719 * occurs before the truncate completes, so it is now safe to propagate
@@ -3555,8 +3783,6 @@ do_indirects:
3555 ; 3783 ;
3556 } 3784 }
3557 3785
3558 ext4_discard_reservation(inode);
3559
3560 up_write(&ei->i_data_sem); 3786 up_write(&ei->i_data_sem);
3561 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 3787 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
3562 ext4_mark_inode_dirty(handle, inode); 3788 ext4_mark_inode_dirty(handle, inode);
@@ -3581,41 +3807,6 @@ out_stop:
3581 ext4_journal_stop(handle); 3807 ext4_journal_stop(handle);
3582} 3808}
3583 3809
3584static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
3585 unsigned long ino, struct ext4_iloc *iloc)
3586{
3587 ext4_group_t block_group;
3588 unsigned long offset;
3589 ext4_fsblk_t block;
3590 struct ext4_group_desc *gdp;
3591
3592 if (!ext4_valid_inum(sb, ino)) {
3593 /*
3594 * This error is already checked for in namei.c unless we are
3595 * looking at an NFS filehandle, in which case no error
3596 * report is needed
3597 */
3598 return 0;
3599 }
3600
3601 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
3602 gdp = ext4_get_group_desc(sb, block_group, NULL);
3603 if (!gdp)
3604 return 0;
3605
3606 /*
3607 * Figure out the offset within the block group inode table
3608 */
3609 offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) *
3610 EXT4_INODE_SIZE(sb);
3611 block = ext4_inode_table(sb, gdp) +
3612 (offset >> EXT4_BLOCK_SIZE_BITS(sb));
3613
3614 iloc->block_group = block_group;
3615 iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1);
3616 return block;
3617}
3618
3619/* 3810/*
3620 * ext4_get_inode_loc returns with an extra refcount against the inode's 3811 * ext4_get_inode_loc returns with an extra refcount against the inode's
3621 * underlying buffer_head on success. If 'in_mem' is true, we have all 3812 * underlying buffer_head on success. If 'in_mem' is true, we have all
@@ -3625,19 +3816,35 @@ static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
3625static int __ext4_get_inode_loc(struct inode *inode, 3816static int __ext4_get_inode_loc(struct inode *inode,
3626 struct ext4_iloc *iloc, int in_mem) 3817 struct ext4_iloc *iloc, int in_mem)
3627{ 3818{
3628 ext4_fsblk_t block; 3819 struct ext4_group_desc *gdp;
3629 struct buffer_head *bh; 3820 struct buffer_head *bh;
3821 struct super_block *sb = inode->i_sb;
3822 ext4_fsblk_t block;
3823 int inodes_per_block, inode_offset;
3824
3825 iloc->bh = 0;
3826 if (!ext4_valid_inum(sb, inode->i_ino))
3827 return -EIO;
3630 3828
3631 block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc); 3829 iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
3632 if (!block) 3830 gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
3831 if (!gdp)
3633 return -EIO; 3832 return -EIO;
3634 3833
3635 bh = sb_getblk(inode->i_sb, block); 3834 /*
3835 * Figure out the offset within the block group inode table
3836 */
3837 inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb));
3838 inode_offset = ((inode->i_ino - 1) %
3839 EXT4_INODES_PER_GROUP(sb));
3840 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
3841 iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
3842
3843 bh = sb_getblk(sb, block);
3636 if (!bh) { 3844 if (!bh) {
3637 ext4_error (inode->i_sb, "ext4_get_inode_loc", 3845 ext4_error(sb, "ext4_get_inode_loc", "unable to read "
3638 "unable to read inode block - " 3846 "inode block - inode=%lu, block=%llu",
3639 "inode=%lu, block=%llu", 3847 inode->i_ino, block);
3640 inode->i_ino, block);
3641 return -EIO; 3848 return -EIO;
3642 } 3849 }
3643 if (!buffer_uptodate(bh)) { 3850 if (!buffer_uptodate(bh)) {
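The rewritten __ext4_get_inode_loc() locates an inode on disk with nothing but integer division and modulo: group from the inode number, block from the offset within the group's inode table, and byte offset from the remainder within that block. A user-space sketch of the arithmetic with illustrative geometry (4 KiB blocks, 256-byte inodes, 8192 inodes per group):

#include <stdio.h>

#define BLOCK_SIZE        4096
#define INODE_SIZE         256
#define INODES_PER_GROUP  8192

struct iloc { unsigned long block_group, block, offset; };

/* inode number -> (group, inode-table block, offset in block) */
static void inode_loc(unsigned long ino, unsigned long itable_start,
                      struct iloc *loc)
{
        unsigned long inodes_per_block = BLOCK_SIZE / INODE_SIZE;
        unsigned long inode_offset = (ino - 1) % INODES_PER_GROUP;

        loc->block_group = (ino - 1) / INODES_PER_GROUP;
        loc->block  = itable_start + inode_offset / inodes_per_block;
        loc->offset = (inode_offset % inodes_per_block) * INODE_SIZE;
}

int main(void)
{
        struct iloc loc;

        /* 18th inode of group 1; itable_start is a hypothetical table block */
        inode_loc(8210, 1000, &loc);
        printf("group=%lu block=%lu offset=%lu\n",
               loc.block_group, loc.block, loc.offset);
        return 0;
}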
@@ -3665,28 +3872,12 @@ static int __ext4_get_inode_loc(struct inode *inode,
3665 */ 3872 */
3666 if (in_mem) { 3873 if (in_mem) {
3667 struct buffer_head *bitmap_bh; 3874 struct buffer_head *bitmap_bh;
3668 struct ext4_group_desc *desc; 3875 int i, start;
3669 int inodes_per_buffer;
3670 int inode_offset, i;
3671 ext4_group_t block_group;
3672 int start;
3673
3674 block_group = (inode->i_ino - 1) /
3675 EXT4_INODES_PER_GROUP(inode->i_sb);
3676 inodes_per_buffer = bh->b_size /
3677 EXT4_INODE_SIZE(inode->i_sb);
3678 inode_offset = ((inode->i_ino - 1) %
3679 EXT4_INODES_PER_GROUP(inode->i_sb));
3680 start = inode_offset & ~(inodes_per_buffer - 1);
3681 3876
3682 /* Is the inode bitmap in cache? */ 3877 start = inode_offset & ~(inodes_per_block - 1);
3683 desc = ext4_get_group_desc(inode->i_sb,
3684 block_group, NULL);
3685 if (!desc)
3686 goto make_io;
3687 3878
3688 bitmap_bh = sb_getblk(inode->i_sb, 3879 /* Is the inode bitmap in cache? */
3689 ext4_inode_bitmap(inode->i_sb, desc)); 3880 bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
3690 if (!bitmap_bh) 3881 if (!bitmap_bh)
3691 goto make_io; 3882 goto make_io;
3692 3883
@@ -3699,14 +3890,14 @@ static int __ext4_get_inode_loc(struct inode *inode,
3699 brelse(bitmap_bh); 3890 brelse(bitmap_bh);
3700 goto make_io; 3891 goto make_io;
3701 } 3892 }
3702 for (i = start; i < start + inodes_per_buffer; i++) { 3893 for (i = start; i < start + inodes_per_block; i++) {
3703 if (i == inode_offset) 3894 if (i == inode_offset)
3704 continue; 3895 continue;
3705 if (ext4_test_bit(i, bitmap_bh->b_data)) 3896 if (ext4_test_bit(i, bitmap_bh->b_data))
3706 break; 3897 break;
3707 } 3898 }
3708 brelse(bitmap_bh); 3899 brelse(bitmap_bh);
3709 if (i == start + inodes_per_buffer) { 3900 if (i == start + inodes_per_block) {
3710 /* all other inodes are free, so skip I/O */ 3901 /* all other inodes are free, so skip I/O */
3711 memset(bh->b_data, 0, bh->b_size); 3902 memset(bh->b_data, 0, bh->b_size);
3712 set_buffer_uptodate(bh); 3903 set_buffer_uptodate(bh);
@@ -3717,6 +3908,36 @@ static int __ext4_get_inode_loc(struct inode *inode,
3717 3908
3718make_io: 3909make_io:
3719 /* 3910 /*
3911 * If we need to do any I/O, try to pre-readahead extra
3912 * blocks from the inode table.
3913 */
3914 if (EXT4_SB(sb)->s_inode_readahead_blks) {
3915 ext4_fsblk_t b, end, table;
3916 unsigned num;
3917
3918 table = ext4_inode_table(sb, gdp);
3919 /* Make sure s_inode_readahead_blks is a power of 2 */
3920 while (EXT4_SB(sb)->s_inode_readahead_blks &
3921 (EXT4_SB(sb)->s_inode_readahead_blks-1))
3922 EXT4_SB(sb)->s_inode_readahead_blks =
3923 (EXT4_SB(sb)->s_inode_readahead_blks &
3924 (EXT4_SB(sb)->s_inode_readahead_blks-1));
3925 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
3926 if (table > b)
3927 b = table;
3928 end = b + EXT4_SB(sb)->s_inode_readahead_blks;
3929 num = EXT4_INODES_PER_GROUP(sb);
3930 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3931 EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
3932 num -= le16_to_cpu(gdp->bg_itable_unused);
3933 table += num / inodes_per_block;
3934 if (end > table)
3935 end = table;
3936 while (b <= end)
3937 sb_breadahead(sb, b++);
3938 }
3939
3940 /*
3720 * There are other valid inodes in the buffer, this inode 3941 * There are other valid inodes in the buffer, this inode
3721 * has in-inode xattrs, or we don't have this inode in memory. 3942 * has in-inode xattrs, or we don't have this inode in memory.
3722 * Read the block from disk. 3943 * Read the block from disk.
@@ -3726,10 +3947,9 @@ make_io:
3726 submit_bh(READ_META, bh); 3947 submit_bh(READ_META, bh);
3727 wait_on_buffer(bh); 3948 wait_on_buffer(bh);
3728 if (!buffer_uptodate(bh)) { 3949 if (!buffer_uptodate(bh)) {
3729 ext4_error(inode->i_sb, "ext4_get_inode_loc", 3950 ext4_error(sb, __func__,
3730 "unable to read inode block - " 3951 "unable to read inode block - inode=%lu, "
3731 "inode=%lu, block=%llu", 3952 "block=%llu", inode->i_ino, block);
3732 inode->i_ino, block);
3733 brelse(bh); 3953 brelse(bh);
3734 return -EIO; 3954 return -EIO;
3735 } 3955 }
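The readahead hunk first forces s_inode_readahead_blks down to a power of two by repeatedly clearing the lowest set bit (x & (x - 1)) until only the highest bit survives, then aligns the target block down to that size to pick the window start. A standalone sketch of both steps:

#include <stdio.h>

/* round down to the highest power of two <= x, as the loop above does */
static unsigned round_down_pow2(unsigned x)
{
        while (x & (x - 1))
                x &= x - 1;     /* clear lowest set bit */
        return x;
}

int main(void)
{
        unsigned blks = 12;                      /* not a power of two */
        unsigned long long block = 4501;
        unsigned long long start;

        blks = round_down_pow2(blks);            /* -> 8 */
        /* align the readahead window start down to 'blks' */
        start = block & ~(unsigned long long)(blks - 1);

        printf("blks=%u window starts at %llu\n", blks, start);
        return 0;
}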
@@ -3821,11 +4041,10 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3821 return inode; 4041 return inode;
3822 4042
3823 ei = EXT4_I(inode); 4043 ei = EXT4_I(inode);
3824#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 4044#ifdef CONFIG_EXT4_FS_POSIX_ACL
3825 ei->i_acl = EXT4_ACL_NOT_CACHED; 4045 ei->i_acl = EXT4_ACL_NOT_CACHED;
3826 ei->i_default_acl = EXT4_ACL_NOT_CACHED; 4046 ei->i_default_acl = EXT4_ACL_NOT_CACHED;
3827#endif 4047#endif
3828 ei->i_block_alloc_info = NULL;
3829 4048
3830 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4049 ret = __ext4_get_inode_loc(inode, &iloc, 0);
3831 if (ret < 0) 4050 if (ret < 0)
@@ -3835,7 +4054,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3835 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 4054 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
3836 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 4055 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
3837 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); 4056 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
3838 if(!(test_opt (inode->i_sb, NO_UID32))) { 4057 if (!(test_opt(inode->i_sb, NO_UID32))) {
3839 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 4058 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
3840 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 4059 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
3841 } 4060 }
@@ -3853,7 +4072,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3853 if (inode->i_mode == 0 || 4072 if (inode->i_mode == 0 ||
3854 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { 4073 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
3855 /* this inode is deleted */ 4074 /* this inode is deleted */
3856 brelse (bh); 4075 brelse(bh);
3857 ret = -ESTALE; 4076 ret = -ESTALE;
3858 goto bad_inode; 4077 goto bad_inode;
3859 } 4078 }
@@ -3886,7 +4105,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3886 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 4105 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
3887 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 4106 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
3888 EXT4_INODE_SIZE(inode->i_sb)) { 4107 EXT4_INODE_SIZE(inode->i_sb)) {
3889 brelse (bh); 4108 brelse(bh);
3890 ret = -EIO; 4109 ret = -EIO;
3891 goto bad_inode; 4110 goto bad_inode;
3892 } 4111 }
@@ -3939,7 +4158,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3939 init_special_inode(inode, inode->i_mode, 4158 init_special_inode(inode, inode->i_mode,
3940 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 4159 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
3941 } 4160 }
3942 brelse (iloc.bh); 4161 brelse(iloc.bh);
3943 ext4_set_inode_flags(inode); 4162 ext4_set_inode_flags(inode);
3944 unlock_new_inode(inode); 4163 unlock_new_inode(inode);
3945 return inode; 4164 return inode;
@@ -4021,14 +4240,14 @@ static int ext4_do_update_inode(handle_t *handle,
4021 4240
4022 ext4_get_inode_flags(ei); 4241 ext4_get_inode_flags(ei);
4023 raw_inode->i_mode = cpu_to_le16(inode->i_mode); 4242 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
4024 if(!(test_opt(inode->i_sb, NO_UID32))) { 4243 if (!(test_opt(inode->i_sb, NO_UID32))) {
4025 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); 4244 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
4026 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); 4245 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
4027/* 4246/*
4028 * Fix up interoperability with old kernels. Otherwise, old inodes get 4247 * Fix up interoperability with old kernels. Otherwise, old inodes get
4029 * re-used with the upper 16 bits of the uid/gid intact 4248 * re-used with the upper 16 bits of the uid/gid intact
4030 */ 4249 */
4031 if(!ei->i_dtime) { 4250 if (!ei->i_dtime) {
4032 raw_inode->i_uid_high = 4251 raw_inode->i_uid_high =
4033 cpu_to_le16(high_16_bits(inode->i_uid)); 4252 cpu_to_le16(high_16_bits(inode->i_uid));
4034 raw_inode->i_gid_high = 4253 raw_inode->i_gid_high =
@@ -4116,7 +4335,7 @@ static int ext4_do_update_inode(handle_t *handle,
4116 ei->i_state &= ~EXT4_STATE_NEW; 4335 ei->i_state &= ~EXT4_STATE_NEW;
4117 4336
4118out_brelse: 4337out_brelse:
4119 brelse (bh); 4338 brelse(bh);
4120 ext4_std_error(inode->i_sb, err); 4339 ext4_std_error(inode->i_sb, err);
4121 return err; 4340 return err;
4122} 4341}
@@ -4324,57 +4543,129 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
4324 return 0; 4543 return 0;
4325} 4544}
4326 4545
4546static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
4547 int chunk)
4548{
4549 int indirects;
4550
4551 /* if nrblocks are contiguous */
4552 if (chunk) {
4553 /*
4554 * With N contiguous data blocks, it needs at most
4555 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks
4556 * 2 dindirect blocks
4557 * 1 tindirect block
4558 */
4559 indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb);
4560 return indirects + 3;
4561 }
4562 /*
4563 * if nrblocks are not contiguous, worst case, each block touches
4564 * an indirect block, and each indirect block touches a double indirect
4565 * block, plus a triple indirect block
4566 */
4567 indirects = nrblocks * 2 + 1;
4568 return indirects;
4569}
4570
4571static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4572{
4573 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
4574 return ext4_indirect_trans_blocks(inode, nrblocks, 0);
4575 return ext4_ext_index_trans_blocks(inode, nrblocks, 0);
4576}
4327/* 4577/*
4328 * How many blocks doth make a writepage()? 4578 * Account for index blocks, block groups bitmaps and block group
4329 * 4579 * descriptor blocks if modify datablocks and index blocks
4330 * With N blocks per page, it may be: 4580 * worse case, the indexs blocks spread over different block groups
4331 * N data blocks
4332 * 2 indirect block
4333 * 2 dindirect
4334 * 1 tindirect
4335 * N+5 bitmap blocks (from the above)
4336 * N+5 group descriptor summary blocks
4337 * 1 inode block
4338 * 1 superblock.
4339 * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quote files
4340 * 4581 *
4341 * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS 4582 * If datablocks are discontiguous, they are possible to spread over
4583 * different block groups too. If they are contiugous, with flexbg,
4584 * they could still across block group boundary.
4342 * 4585 *
4343 * With ordered or writeback data it's the same, less the N data blocks. 4586 * Also account for superblock, inode, quota and xattr blocks
4587 */
4588int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4589{
4590 int groups, gdpblocks;
4591 int idxblocks;
4592 int ret = 0;
4593
4594 /*
4595 * How many index blocks do we need to touch to modify nrblocks?
4596 * The "Chunk" flag indicates whether the nrblocks are
4597 * physically contiguous on disk.
4598 *
4599 * Direct IO and fallocate call get_block to allocate
4600 * one single extent at a time, so they can set the "Chunk" flag
4601 */
4602 idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
4603
4604 ret = idxblocks;
4605
4606 /*
4607 * Now let's see how many group bitmaps and group descriptors need
4608 * to account
4609 */
4610 groups = idxblocks;
4611 if (chunk)
4612 groups += 1;
4613 else
4614 groups += nrblocks;
4615
4616 gdpblocks = groups;
4617 if (groups > EXT4_SB(inode->i_sb)->s_groups_count)
4618 groups = EXT4_SB(inode->i_sb)->s_groups_count;
4619 if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
4620 gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
4621
4622 /* bitmaps and block group descriptor blocks */
4623 ret += groups + gdpblocks;
4624
4625 /* Blocks for super block, inode, quota and xattr blocks */
4626 ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
4627
4628 return ret;
4629}
4630
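ext4_meta_trans_blocks() charges the index blocks themselves, then one bitmap and one descriptor block per group they might touch, clamped by the filesystem's group and descriptor-block totals. A user-space sketch of the accounting; the index cost and the META constant are passed in or stubbed with illustrative values:

#include <stdio.h>

#define META_TRANS_BLOCKS 12    /* stand-in: sb + inode + quota + xattr */

static int meta_trans_blocks(int nrblocks, int chunk,
                             int groups_count, int gdb_count,
                             int idxblocks)
{
        int ret = idxblocks;

        /* one bitmap per touched group: 1 for a chunk, else per block */
        int groups = idxblocks + (chunk ? 1 : nrblocks);
        int gdpblocks = groups;

        if (groups > groups_count)
                groups = groups_count;          /* can't exceed fs totals */
        if (gdpblocks > gdb_count)
                gdpblocks = gdb_count;

        ret += groups + gdpblocks;
        ret += META_TRANS_BLOCKS;
        return ret;
}

int main(void)
{
        /* contiguous chunk of 128 blocks, 3 index blocks touched */
        printf("%d\n", meta_trans_blocks(128, 1, 100, 2, 3));
        /* discontiguous 8 blocks: every block may hit its own group */
        printf("%d\n", meta_trans_blocks(8, 0, 100, 2, 17));
        return 0;
}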
4631/*
4632 * Calculate the total number of credits to reserve to fit
4633 * the modification of a single page into a single transaction,
4634 * which may include multiple chunks of block allocations.
4344 * 4635 *
4345 * If the inode's direct blocks can hold an integral number of pages then a 4636 * This could be called via ext4_write_begin()
4346 * page cannot straddle two indirect blocks, and we can only touch one indirect
4347 * and dindirect block, and the "5" above becomes "3".
4348 * 4637 *
4349 * This still overestimates under most circumstances. If we were to pass the 4638 * We need to consider the worst case, when
4350 * start and end offsets in here as well we could do block_to_path() on each 4639 * there is one new block per extent.
4351 * block and work out the exact number of indirects which are touched. Pah.
4352 */ 4640 */
4353
4354int ext4_writepage_trans_blocks(struct inode *inode) 4641int ext4_writepage_trans_blocks(struct inode *inode)
4355{ 4642{
4356 int bpp = ext4_journal_blocks_per_page(inode); 4643 int bpp = ext4_journal_blocks_per_page(inode);
4357 int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3;
4358 int ret; 4644 int ret;
4359 4645
4360 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 4646 ret = ext4_meta_trans_blocks(inode, bpp, 0);
4361 return ext4_ext_writepage_trans_blocks(inode, bpp);
4362 4647
4648 /* Account for data blocks for journalled mode */
4363 if (ext4_should_journal_data(inode)) 4649 if (ext4_should_journal_data(inode))
4364 ret = 3 * (bpp + indirects) + 2; 4650 ret += bpp;
4365 else
4366 ret = 2 * (bpp + indirects) + 2;
4367
4368#ifdef CONFIG_QUOTA
4369 /* We know that structure was already allocated during DQUOT_INIT so
4370 * we will be updating only the data blocks + inodes */
4371 ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
4372#endif
4373
4374 return ret; 4651 return ret;
4375} 4652}
4376 4653
4377/* 4654/*
4655 * Calculate the journal credits for a chunk of data modification.
4656 *
4657 * This is called from DIO, fallocate or whoever calls
4658 * ext4_get_blocks_wrap() to map/allocate a chunk of contiguous disk blocks.
4659 *
4660 * journal buffers for data blocks are not included here, as DIO
4661 * and fallocate do not need to journal data buffers.
4662 */
4663int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
4664{
4665 return ext4_meta_trans_blocks(inode, nrblocks, 1);
4666}
4667
4668/*
4378 * The caller must have previously called ext4_reserve_inode_write(). 4669 * The caller must have previously called ext4_reserve_inode_write().
4379 * Given this, we know that the caller already has write access to iloc->bh. 4670
4380 */ 4671 */
@@ -4647,6 +4938,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4647 loff_t size; 4938 loff_t size;
4648 unsigned long len; 4939 unsigned long len;
4649 int ret = -EINVAL; 4940 int ret = -EINVAL;
4941 void *fsdata;
4650 struct file *file = vma->vm_file; 4942 struct file *file = vma->vm_file;
4651 struct inode *inode = file->f_path.dentry->d_inode; 4943 struct inode *inode = file->f_path.dentry->d_inode;
4652 struct address_space *mapping = inode->i_mapping; 4944 struct address_space *mapping = inode->i_mapping;
@@ -4685,11 +4977,11 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4685 * on the same page though 4977 * on the same page though
4686 */ 4978 */
4687 ret = mapping->a_ops->write_begin(file, mapping, page_offset(page), 4979 ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
4688 len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); 4980 len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
4689 if (ret < 0) 4981 if (ret < 0)
4690 goto out_unlock; 4982 goto out_unlock;
4691 ret = mapping->a_ops->write_end(file, mapping, page_offset(page), 4983 ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
4692 len, len, page, NULL); 4984 len, len, page, fsdata);
4693 if (ret < 0) 4985 if (ret < 0)
4694 goto out_unlock; 4986 goto out_unlock;
4695 ret = 0; 4987 ret = 0;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7a6c2f1faba6..dc99b4776d58 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -23,9 +23,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
23 struct inode *inode = filp->f_dentry->d_inode; 23 struct inode *inode = filp->f_dentry->d_inode;
24 struct ext4_inode_info *ei = EXT4_I(inode); 24 struct ext4_inode_info *ei = EXT4_I(inode);
25 unsigned int flags; 25 unsigned int flags;
26 unsigned short rsv_window_size;
27 26
28 ext4_debug ("cmd = %u, arg = %lu\n", cmd, arg); 27 ext4_debug("cmd = %u, arg = %lu\n", cmd, arg);
29 28
30 switch (cmd) { 29 switch (cmd) {
31 case EXT4_IOC_GETFLAGS: 30 case EXT4_IOC_GETFLAGS:
@@ -34,7 +33,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
34 return put_user(flags, (int __user *) arg); 33 return put_user(flags, (int __user *) arg);
35 case EXT4_IOC_SETFLAGS: { 34 case EXT4_IOC_SETFLAGS: {
36 handle_t *handle = NULL; 35 handle_t *handle = NULL;
37 int err; 36 int err, migrate = 0;
38 struct ext4_iloc iloc; 37 struct ext4_iloc iloc;
39 unsigned int oldflags; 38 unsigned int oldflags;
40 unsigned int jflag; 39 unsigned int jflag;
@@ -82,6 +81,17 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
82 if (!capable(CAP_SYS_RESOURCE)) 81 if (!capable(CAP_SYS_RESOURCE))
83 goto flags_out; 82 goto flags_out;
84 } 83 }
84 if (oldflags & EXT4_EXTENTS_FL) {
85 /* We don't support clearing extent flags */
86 if (!(flags & EXT4_EXTENTS_FL)) {
87 err = -EOPNOTSUPP;
88 goto flags_out;
89 }
90 } else if (flags & EXT4_EXTENTS_FL) {
91 /* migrate the file */
92 migrate = 1;
93 flags &= ~EXT4_EXTENTS_FL;
94 }
85 95
86 handle = ext4_journal_start(inode, 1); 96 handle = ext4_journal_start(inode, 1);
87 if (IS_ERR(handle)) { 97 if (IS_ERR(handle)) {
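The new SETFLAGS logic encodes a one-way rule for EXT4_EXTENTS_FL: it may be set (which schedules a migration, with the bit cleared here because the migration itself sets it) but never cleared. A small sketch of that transition check; the flag value is illustrative:

#include <stdio.h>
#include <errno.h>

#define EXTENTS_FL 0x80000      /* illustrative flag bit */

/* returns 0 and sets *migrate, or -EOPNOTSUPP for a forbidden clear */
static int check_extents_transition(unsigned oldflags, unsigned *flags,
                                    int *migrate)
{
        *migrate = 0;
        if (oldflags & EXTENTS_FL) {
                if (!(*flags & EXTENTS_FL))
                        return -EOPNOTSUPP;    /* clearing not supported */
        } else if (*flags & EXTENTS_FL) {
                *migrate = 1;                  /* set requested: migrate */
                *flags &= ~EXTENTS_FL;         /* migration sets it itself */
        }
        return 0;
}

int main(void)
{
        unsigned flags = EXTENTS_FL;
        int migrate, err;

        err = check_extents_transition(0, &flags, &migrate);
        printf("err=%d migrate=%d flags=%#x\n", err, migrate, flags);
        return 0;
}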
@@ -109,6 +119,10 @@ flags_err:
109 119
110 if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) 120 if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
111 err = ext4_change_inode_journal_flag(inode, jflag); 121 err = ext4_change_inode_journal_flag(inode, jflag);
122 if (err)
123 goto flags_out;
124 if (migrate)
125 err = ext4_ext_migrate(inode);
112flags_out: 126flags_out:
113 mutex_unlock(&inode->i_mutex); 127 mutex_unlock(&inode->i_mutex);
114 mnt_drop_write(filp->f_path.mnt); 128 mnt_drop_write(filp->f_path.mnt);
@@ -175,53 +189,10 @@ setversion_out:
175 return ret; 189 return ret;
176 } 190 }
177#endif 191#endif
178 case EXT4_IOC_GETRSVSZ:
179 if (test_opt(inode->i_sb, RESERVATION)
180 && S_ISREG(inode->i_mode)
181 && ei->i_block_alloc_info) {
182 rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size;
183 return put_user(rsv_window_size, (int __user *)arg);
184 }
185 return -ENOTTY;
186 case EXT4_IOC_SETRSVSZ: {
187 int err;
188
189 if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
190 return -ENOTTY;
191
192 if (!is_owner_or_cap(inode))
193 return -EACCES;
194
195 if (get_user(rsv_window_size, (int __user *)arg))
196 return -EFAULT;
197
198 err = mnt_want_write(filp->f_path.mnt);
199 if (err)
200 return err;
201
202 if (rsv_window_size > EXT4_MAX_RESERVE_BLOCKS)
203 rsv_window_size = EXT4_MAX_RESERVE_BLOCKS;
204
205 /*
206 * need to allocate reservation structure for this inode
207 * before set the window size
208 */
209 down_write(&ei->i_data_sem);
210 if (!ei->i_block_alloc_info)
211 ext4_init_block_alloc_info(inode);
212
213 if (ei->i_block_alloc_info){
214 struct ext4_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
215 rsv->rsv_goal_size = rsv_window_size;
216 }
217 up_write(&ei->i_data_sem);
218 mnt_drop_write(filp->f_path.mnt);
219 return 0;
220 }
221 case EXT4_IOC_GROUP_EXTEND: { 192 case EXT4_IOC_GROUP_EXTEND: {
222 ext4_fsblk_t n_blocks_count; 193 ext4_fsblk_t n_blocks_count;
223 struct super_block *sb = inode->i_sb; 194 struct super_block *sb = inode->i_sb;
224 int err; 195 int err, err2;
225 196
226 if (!capable(CAP_SYS_RESOURCE)) 197 if (!capable(CAP_SYS_RESOURCE))
227 return -EPERM; 198 return -EPERM;
@@ -235,8 +206,10 @@ setversion_out:
235 206
236 err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); 207 err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
237 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 208 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
238 jbd2_journal_flush(EXT4_SB(sb)->s_journal); 209 err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
239 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 210 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
211 if (err == 0)
212 err = err2;
240 mnt_drop_write(filp->f_path.mnt); 213 mnt_drop_write(filp->f_path.mnt);
241 214
242 return err; 215 return err;
@@ -244,7 +217,7 @@ setversion_out:
244 case EXT4_IOC_GROUP_ADD: { 217 case EXT4_IOC_GROUP_ADD: {
245 struct ext4_new_group_data input; 218 struct ext4_new_group_data input;
246 struct super_block *sb = inode->i_sb; 219 struct super_block *sb = inode->i_sb;
247 int err; 220 int err, err2;
248 221
249 if (!capable(CAP_SYS_RESOURCE)) 222 if (!capable(CAP_SYS_RESOURCE))
250 return -EPERM; 223 return -EPERM;
@@ -259,15 +232,36 @@ setversion_out:
259 232
260 err = ext4_group_add(sb, &input); 233 err = ext4_group_add(sb, &input);
261 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 234 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
262 jbd2_journal_flush(EXT4_SB(sb)->s_journal); 235 err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
263 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 236 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
237 if (err == 0)
238 err = err2;
264 mnt_drop_write(filp->f_path.mnt); 239 mnt_drop_write(filp->f_path.mnt);
265 240
266 return err; 241 return err;
267 } 242 }
268 243
269 case EXT4_IOC_MIGRATE: 244 case EXT4_IOC_MIGRATE:
270 return ext4_ext_migrate(inode, filp, cmd, arg); 245 {
246 int err;
247 if (!is_owner_or_cap(inode))
248 return -EACCES;
249
250 err = mnt_want_write(filp->f_path.mnt);
251 if (err)
252 return err;
253 /*
254 * inode_mutex prevents write and truncate on the file.
255 * Reads still go through. We take i_data_sem in
256 * ext4_ext_swap_inode_data before we switch the
257 * inode format, to prevent reads.
258 */
259 mutex_lock(&(inode->i_mutex));
260 err = ext4_ext_migrate(inode);
261 mutex_unlock(&(inode->i_mutex));
262 mnt_drop_write(filp->f_path.mnt);
263 return err;
264 }
271 265
272 default: 266 default:
273 return -ENOTTY; 267 return -ENOTTY;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 865e9ddb44d4..b580714f0d85 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -477,9 +477,10 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
477 b2 = (unsigned char *) bitmap; 477 b2 = (unsigned char *) bitmap;
478 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { 478 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
479 if (b1[i] != b2[i]) { 479 if (b1[i] != b2[i]) {
480 printk("corruption in group %lu at byte %u(%u):" 480 printk(KERN_ERR "corruption in group %lu "
481 " %x in copy != %x on disk/prealloc\n", 481 "at byte %u(%u): %x in copy != %x "
482 e4b->bd_group, i, i * 8, b1[i], b2[i]); 482 "on disk/prealloc\n",
483 e4b->bd_group, i, i * 8, b1[i], b2[i]);
483 BUG(); 484 BUG();
484 } 485 }
485 } 486 }
@@ -533,9 +534,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
533 void *buddy; 534 void *buddy;
534 void *buddy2; 535 void *buddy2;
535 536
536 if (!test_opt(sb, MBALLOC))
537 return 0;
538
539 { 537 {
540 static int mb_check_counter; 538 static int mb_check_counter;
541 if (mb_check_counter++ % 100 != 0) 539 if (mb_check_counter++ % 100 != 0)
@@ -784,9 +782,11 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
784 if (bh[i] == NULL) 782 if (bh[i] == NULL)
785 goto out; 783 goto out;
786 784
787 if (bh_uptodate_or_lock(bh[i])) 785 if (buffer_uptodate(bh[i]) &&
786 !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
788 continue; 787 continue;
789 788
789 lock_buffer(bh[i]);
790 spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 790 spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
791 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 791 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
792 ext4_init_block_bitmap(sb, bh[i], 792 ext4_init_block_bitmap(sb, bh[i],
@@ -2169,9 +2169,10 @@ static void ext4_mb_history_release(struct super_block *sb)
2169{ 2169{
2170 struct ext4_sb_info *sbi = EXT4_SB(sb); 2170 struct ext4_sb_info *sbi = EXT4_SB(sb);
2171 2171
2172 remove_proc_entry("mb_groups", sbi->s_mb_proc); 2172 if (sbi->s_proc != NULL) {
2173 remove_proc_entry("mb_history", sbi->s_mb_proc); 2173 remove_proc_entry("mb_groups", sbi->s_proc);
2174 2174 remove_proc_entry("mb_history", sbi->s_proc);
2175 }
2175 kfree(sbi->s_mb_history); 2176 kfree(sbi->s_mb_history);
2176} 2177}
2177 2178
@@ -2180,10 +2181,10 @@ static void ext4_mb_history_init(struct super_block *sb)
2180 struct ext4_sb_info *sbi = EXT4_SB(sb); 2181 struct ext4_sb_info *sbi = EXT4_SB(sb);
2181 int i; 2182 int i;
2182 2183
2183 if (sbi->s_mb_proc != NULL) { 2184 if (sbi->s_proc != NULL) {
2184 proc_create_data("mb_history", S_IRUGO, sbi->s_mb_proc, 2185 proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
2185 &ext4_mb_seq_history_fops, sb); 2186 &ext4_mb_seq_history_fops, sb);
2186 proc_create_data("mb_groups", S_IRUGO, sbi->s_mb_proc, 2187 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2187 &ext4_mb_seq_groups_fops, sb); 2188 &ext4_mb_seq_groups_fops, sb);
2188 } 2189 }
2189 2190
@@ -2485,19 +2486,14 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2485 unsigned max; 2486 unsigned max;
2486 int ret; 2487 int ret;
2487 2488
2488 if (!test_opt(sb, MBALLOC))
2489 return 0;
2490
2491 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); 2489 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
2492 2490
2493 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); 2491 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
2494 if (sbi->s_mb_offsets == NULL) { 2492 if (sbi->s_mb_offsets == NULL) {
2495 clear_opt(sbi->s_mount_opt, MBALLOC);
2496 return -ENOMEM; 2493 return -ENOMEM;
2497 } 2494 }
2498 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2495 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2499 if (sbi->s_mb_maxs == NULL) { 2496 if (sbi->s_mb_maxs == NULL) {
2500 clear_opt(sbi->s_mount_opt, MBALLOC);
2501 kfree(sbi->s_mb_maxs); 2497 kfree(sbi->s_mb_maxs);
2502 return -ENOMEM; 2498 return -ENOMEM;
2503 } 2499 }
@@ -2520,7 +2516,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2520 /* init file for buddy data */ 2516 /* init file for buddy data */
2521 ret = ext4_mb_init_backend(sb); 2517 ret = ext4_mb_init_backend(sb);
2522 if (ret != 0) { 2518 if (ret != 0) {
2523 clear_opt(sbi->s_mount_opt, MBALLOC);
2524 kfree(sbi->s_mb_offsets); 2519 kfree(sbi->s_mb_offsets);
2525 kfree(sbi->s_mb_maxs); 2520 kfree(sbi->s_mb_maxs);
2526 return ret; 2521 return ret;
@@ -2540,17 +2535,15 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2540 sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT; 2535 sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
2541 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; 2536 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
2542 2537
2543 i = sizeof(struct ext4_locality_group) * nr_cpu_ids; 2538 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
2544 sbi->s_locality_groups = kmalloc(i, GFP_KERNEL);
2545 if (sbi->s_locality_groups == NULL) { 2539 if (sbi->s_locality_groups == NULL) {
2546 clear_opt(sbi->s_mount_opt, MBALLOC);
2547 kfree(sbi->s_mb_offsets); 2540 kfree(sbi->s_mb_offsets);
2548 kfree(sbi->s_mb_maxs); 2541 kfree(sbi->s_mb_maxs);
2549 return -ENOMEM; 2542 return -ENOMEM;
2550 } 2543 }
2551 for (i = 0; i < nr_cpu_ids; i++) { 2544 for_each_possible_cpu(i) {
2552 struct ext4_locality_group *lg; 2545 struct ext4_locality_group *lg;
2553 lg = &sbi->s_locality_groups[i]; 2546 lg = per_cpu_ptr(sbi->s_locality_groups, i);
2554 mutex_init(&lg->lg_mutex); 2547 mutex_init(&lg->lg_mutex);
2555 for (j = 0; j < PREALLOC_TB_SIZE; j++) 2548 for (j = 0; j < PREALLOC_TB_SIZE; j++)
2556 INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); 2549 INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
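
The locality groups move from a kmalloc'ed array sized by nr_cpu_ids to a real per-cpu allocation: memory is only set up for possible CPUs and lands on the right NUMA node, and teardown becomes a single free_percpu(). A kernel-style sketch of the pattern (not a standalone program; struct lg is a placeholder for struct ext4_locality_group):

    #include <linux/percpu.h>
    #include <linux/cpumask.h>
    #include <linux/mutex.h>
    #include <linux/errno.h>

    struct lg { struct mutex lock; };
    static struct lg __percpu *groups;

    static int init_groups(void)
    {
            int cpu;

            groups = alloc_percpu(struct lg);  /* possible CPUs only */
            if (groups == NULL)
                    return -ENOMEM;
            for_each_possible_cpu(cpu)
                    mutex_init(&per_cpu_ptr(groups, cpu)->lock);
            return 0;
    }

    static void exit_groups(void)
    {
            free_percpu(groups);    /* one call frees every per-cpu copy */
    }
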
@@ -2560,7 +2553,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2560 ext4_mb_init_per_dev_proc(sb); 2553 ext4_mb_init_per_dev_proc(sb);
2561 ext4_mb_history_init(sb); 2554 ext4_mb_history_init(sb);
2562 2555
2563 printk("EXT4-fs: mballoc enabled\n"); 2556 printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
2564 return 0; 2557 return 0;
2565} 2558}
2566 2559
@@ -2589,9 +2582,6 @@ int ext4_mb_release(struct super_block *sb)
2589 struct ext4_group_info *grinfo; 2582 struct ext4_group_info *grinfo;
2590 struct ext4_sb_info *sbi = EXT4_SB(sb); 2583 struct ext4_sb_info *sbi = EXT4_SB(sb);
2591 2584
2592 if (!test_opt(sb, MBALLOC))
2593 return 0;
2594
2595 /* release freed, non-committed blocks */ 2585 /* release freed, non-committed blocks */
2596 spin_lock(&sbi->s_md_lock); 2586 spin_lock(&sbi->s_md_lock);
2597 list_splice_init(&sbi->s_closed_transaction, 2587 list_splice_init(&sbi->s_closed_transaction,
@@ -2647,8 +2637,7 @@ int ext4_mb_release(struct super_block *sb)
2647 atomic_read(&sbi->s_mb_discarded)); 2637 atomic_read(&sbi->s_mb_discarded));
2648 } 2638 }
2649 2639
2650 kfree(sbi->s_locality_groups); 2640 free_percpu(sbi->s_locality_groups);
2651
2652 ext4_mb_history_release(sb); 2641 ext4_mb_history_release(sb);
2653 ext4_mb_destroy_per_dev_proc(sb); 2642 ext4_mb_destroy_per_dev_proc(sb);
2654 2643
@@ -2721,118 +2710,46 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
2721#define EXT4_MB_STREAM_REQ "stream_req" 2710#define EXT4_MB_STREAM_REQ "stream_req"
2722#define EXT4_MB_GROUP_PREALLOC "group_prealloc" 2711#define EXT4_MB_GROUP_PREALLOC "group_prealloc"
2723 2712
2724
2725
2726#define MB_PROC_FOPS(name) \
2727static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v) \
2728{ \
2729 struct ext4_sb_info *sbi = m->private; \
2730 \
2731 seq_printf(m, "%ld\n", sbi->s_mb_##name); \
2732 return 0; \
2733} \
2734 \
2735static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\
2736{ \
2737 return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\
2738} \
2739 \
2740static ssize_t ext4_mb_##name##_proc_write(struct file *file, \
2741 const char __user *buf, size_t cnt, loff_t *ppos) \
2742{ \
2743 struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\
2744 char str[32]; \
2745 long value; \
2746 if (cnt >= sizeof(str)) \
2747 return -EINVAL; \
2748 if (copy_from_user(str, buf, cnt)) \
2749 return -EFAULT; \
2750 value = simple_strtol(str, NULL, 0); \
2751 if (value <= 0) \
2752 return -ERANGE; \
2753 sbi->s_mb_##name = value; \
2754 return cnt; \
2755} \
2756 \
2757static const struct file_operations ext4_mb_##name##_proc_fops = { \
2758 .owner = THIS_MODULE, \
2759 .open = ext4_mb_##name##_proc_open, \
2760 .read = seq_read, \
2761 .llseek = seq_lseek, \
2762 .release = single_release, \
2763 .write = ext4_mb_##name##_proc_write, \
2764};
2765
2766MB_PROC_FOPS(stats);
2767MB_PROC_FOPS(max_to_scan);
2768MB_PROC_FOPS(min_to_scan);
2769MB_PROC_FOPS(order2_reqs);
2770MB_PROC_FOPS(stream_request);
2771MB_PROC_FOPS(group_prealloc);
2772
2773#define MB_PROC_HANDLER(name, var) \
2774do { \
2775 proc = proc_create_data(name, mode, sbi->s_mb_proc, \
2776 &ext4_mb_##var##_proc_fops, sbi); \
2777 if (proc == NULL) { \
2778 printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \
2779 goto err_out; \
2780 } \
2781} while (0)
2782
2783static int ext4_mb_init_per_dev_proc(struct super_block *sb) 2713static int ext4_mb_init_per_dev_proc(struct super_block *sb)
2784{ 2714{
2785 mode_t mode = S_IFREG | S_IRUGO | S_IWUSR; 2715 mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
2786 struct ext4_sb_info *sbi = EXT4_SB(sb); 2716 struct ext4_sb_info *sbi = EXT4_SB(sb);
2787 struct proc_dir_entry *proc; 2717 struct proc_dir_entry *proc;
2788 char devname[64];
2789 2718
2790 if (proc_root_ext4 == NULL) { 2719 if (sbi->s_proc == NULL)
2791 sbi->s_mb_proc = NULL;
2792 return -EINVAL; 2720 return -EINVAL;
2793 }
2794 bdevname(sb->s_bdev, devname);
2795 sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
2796
2797 MB_PROC_HANDLER(EXT4_MB_STATS_NAME, stats);
2798 MB_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, max_to_scan);
2799 MB_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, min_to_scan);
2800 MB_PROC_HANDLER(EXT4_MB_ORDER2_REQ, order2_reqs);
2801 MB_PROC_HANDLER(EXT4_MB_STREAM_REQ, stream_request);
2802 MB_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, group_prealloc);
2803 2721
2722 EXT4_PROC_HANDLER(EXT4_MB_STATS_NAME, mb_stats);
2723 EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan);
2724 EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan);
2725 EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs);
2726 EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request);
2727 EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc);
2804 return 0; 2728 return 0;
2805 2729
2806err_out: 2730err_out:
2807 printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname); 2731 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
2808 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc); 2732 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
2809 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc); 2733 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
2810 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc); 2734 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
2811 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); 2735 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
2812 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); 2736 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
2813 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
2814 remove_proc_entry(devname, proc_root_ext4);
2815 sbi->s_mb_proc = NULL;
2816
2817 return -ENOMEM; 2737 return -ENOMEM;
2818} 2738}
2819 2739
2820static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) 2740static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
2821{ 2741{
2822 struct ext4_sb_info *sbi = EXT4_SB(sb); 2742 struct ext4_sb_info *sbi = EXT4_SB(sb);
2823 char devname[64];
2824 2743
2825 if (sbi->s_mb_proc == NULL) 2744 if (sbi->s_proc == NULL)
2826 return -EINVAL; 2745 return -EINVAL;
2827 2746
2828 bdevname(sb->s_bdev, devname); 2747 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
2829 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc); 2748 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
2830 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc); 2749 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
2831 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc); 2750 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
2832 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); 2751 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
2833 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); 2752 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
2834 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
2835 remove_proc_entry(devname, proc_root_ext4);
2836 2753
2837 return 0; 2754 return 0;
2838} 2755}
@@ -2854,11 +2771,6 @@ int __init init_ext4_mballoc(void)
2854 kmem_cache_destroy(ext4_pspace_cachep); 2771 kmem_cache_destroy(ext4_pspace_cachep);
2855 return -ENOMEM; 2772 return -ENOMEM;
2856 } 2773 }
2857#ifdef CONFIG_PROC_FS
2858 proc_root_ext4 = proc_mkdir("fs/ext4", NULL);
2859 if (proc_root_ext4 == NULL)
2860 printk(KERN_ERR "EXT4-fs: Unable to create fs/ext4\n");
2861#endif
2862 return 0; 2774 return 0;
2863} 2775}
2864 2776
@@ -2867,9 +2779,6 @@ void exit_ext4_mballoc(void)
2867 /* XXX: synchronize_rcu(); */ 2779 /* XXX: synchronize_rcu(); */
2868 kmem_cache_destroy(ext4_pspace_cachep); 2780 kmem_cache_destroy(ext4_pspace_cachep);
2869 kmem_cache_destroy(ext4_ac_cachep); 2781 kmem_cache_destroy(ext4_ac_cachep);
2870#ifdef CONFIG_PROC_FS
2871 remove_proc_entry("fs/ext4", NULL);
2872#endif
2873} 2782}
2874 2783
2875 2784
@@ -2879,7 +2788,7 @@ void exit_ext4_mballoc(void)
2879 */ 2788 */
2880static noinline_for_stack int 2789static noinline_for_stack int
2881ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, 2790ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2882 handle_t *handle) 2791 handle_t *handle, unsigned long reserv_blks)
2883{ 2792{
2884 struct buffer_head *bitmap_bh = NULL; 2793 struct buffer_head *bitmap_bh = NULL;
2885 struct ext4_super_block *es; 2794 struct ext4_super_block *es;
@@ -2968,15 +2877,16 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2968 le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); 2877 le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
2969 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); 2878 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
2970 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); 2879 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
2971 2880 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
2972 /* 2881 /*
2973 * free blocks account has already be reduced/reserved 2882 * Now reduce the dirty block count also. Should not go negative
2974 * at write_begin() time for delayed allocation
2975 * do not double accounting
2976 */ 2883 */
2977 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) 2884 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
2978 percpu_counter_sub(&sbi->s_freeblocks_counter, 2885 /* release all the reserved blocks if non delalloc */
2979 ac->ac_b_ex.fe_len); 2886 percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
2887 else
2888 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
2889 ac->ac_b_ex.fe_len);
2980 2890
2981 if (sbi->s_log_groups_per_flex) { 2891 if (sbi->s_log_groups_per_flex) {
2982 ext4_group_t flex_group = ext4_flex_group(sbi, 2892 ext4_group_t flex_group = ext4_flex_group(sbi,
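
The accounting in this hunk separates the two counters: the free-blocks counter is now always debited by the blocks actually allocated, while the dirty-blocks counter gives back the reservation — per block for delalloc (reserved earlier at write_begin() time), or the whole reserv_blks claim otherwise. A toy model of the bookkeeping (plain longs standing in for the percpu counters; all values assumed):

    #include <stdbool.h>
    #include <stdio.h>

    static long free_blocks  = 1000;
    static long dirty_blocks;        /* claimed but not yet allocated */

    /* toy ext4_claim_free_blocks(): move len into the dirty pool */
    static int claim(long len)
    {
            if (free_blocks - dirty_blocks < len)
                    return -1;
            dirty_blocks += len;
            return 0;
    }

    /* toy version of the mark_diskspace_used() accounting above */
    static void mark_used(long allocated, long reserved, bool delalloc)
    {
            free_blocks -= allocated;
            /* release the reservation: per block for delalloc,
             * in full for a normal allocation */
            dirty_blocks -= delalloc ? allocated : reserved;
    }

    int main(void)
    {
            if (claim(10) == 0)
                    mark_used(8, 10, false);   /* got fewer than claimed */
            printf("free=%ld dirty=%ld\n", free_blocks, dirty_blocks);
            return 0;
    }
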
@@ -3282,6 +3192,35 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
3282} 3192}
3283 3193
3284/* 3194/*
 3195 * Return the prealloc space that has the minimal
 3196 * distance from the goal block. @cpa is the prealloc
 3197 * space with the currently known minimal distance
 3198 * from the goal block.
3199 */
3200static struct ext4_prealloc_space *
3201ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
3202 struct ext4_prealloc_space *pa,
3203 struct ext4_prealloc_space *cpa)
3204{
3205 ext4_fsblk_t cur_distance, new_distance;
3206
3207 if (cpa == NULL) {
3208 atomic_inc(&pa->pa_count);
3209 return pa;
3210 }
3211 cur_distance = abs(goal_block - cpa->pa_pstart);
3212 new_distance = abs(goal_block - pa->pa_pstart);
3213
3214 if (cur_distance < new_distance)
3215 return cpa;
3216
3217 /* drop the previous reference */
3218 atomic_dec(&cpa->pa_count);
3219 atomic_inc(&pa->pa_count);
3220 return pa;
3221}
3222
3223/*
3285 * search goal blocks in preallocated space 3224 * search goal blocks in preallocated space
3286 */ 3225 */
3287static noinline_for_stack int 3226static noinline_for_stack int
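
ext4_mb_check_group_pa() keeps exactly one candidate pinned while scanning: the first candidate is taken outright, later ones replace it only if strictly closer to the goal, and the reference counts are handed off so the loser is always released. A compilable toy of the distance rule alone (refcounting omitted):

    #include <stdio.h>
    #include <stdlib.h>

    /* pick the start block closest to the goal, as the helper above
     * does with pa_pstart; llabs() mirrors the abs() of the distances */
    static long long pick(long long goal, const long long *starts, int n)
    {
            long long best = starts[0];
            int i;

            for (i = 1; i < n; i++)
                    if (llabs(goal - starts[i]) < llabs(goal - best))
                            best = starts[i];
            return best;
    }

    int main(void)
    {
            long long starts[] = { 400, 1100, 5000 };

            /* distances from goal 1000 are 600, 100, 4000 -> 1100 wins */
            printf("%lld\n", pick(1000, starts, 3));
            return 0;
    }
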
@@ -3290,7 +3229,8 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3290 int order, i; 3229 int order, i;
3291 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 3230 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3292 struct ext4_locality_group *lg; 3231 struct ext4_locality_group *lg;
3293 struct ext4_prealloc_space *pa; 3232 struct ext4_prealloc_space *pa, *cpa = NULL;
3233 ext4_fsblk_t goal_block;
3294 3234
3295 /* only data can be preallocated */ 3235 /* only data can be preallocated */
3296 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 3236 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
@@ -3333,6 +3273,13 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3333 /* The max size of hash table is PREALLOC_TB_SIZE */ 3273 /* The max size of hash table is PREALLOC_TB_SIZE */
3334 order = PREALLOC_TB_SIZE - 1; 3274 order = PREALLOC_TB_SIZE - 1;
3335 3275
3276 goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) +
3277 ac->ac_g_ex.fe_start +
3278 le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block);
3279 /*
 3280 * search for the prealloc space that has the
 3281 * minimal distance from the goal block.
3282 */
3336 for (i = order; i < PREALLOC_TB_SIZE; i++) { 3283 for (i = order; i < PREALLOC_TB_SIZE; i++) {
3337 rcu_read_lock(); 3284 rcu_read_lock();
3338 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i], 3285 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
@@ -3340,17 +3287,19 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3340 spin_lock(&pa->pa_lock); 3287 spin_lock(&pa->pa_lock);
3341 if (pa->pa_deleted == 0 && 3288 if (pa->pa_deleted == 0 &&
3342 pa->pa_free >= ac->ac_o_ex.fe_len) { 3289 pa->pa_free >= ac->ac_o_ex.fe_len) {
3343 atomic_inc(&pa->pa_count); 3290
3344 ext4_mb_use_group_pa(ac, pa); 3291 cpa = ext4_mb_check_group_pa(goal_block,
3345 spin_unlock(&pa->pa_lock); 3292 pa, cpa);
3346 ac->ac_criteria = 20;
3347 rcu_read_unlock();
3348 return 1;
3349 } 3293 }
3350 spin_unlock(&pa->pa_lock); 3294 spin_unlock(&pa->pa_lock);
3351 } 3295 }
3352 rcu_read_unlock(); 3296 rcu_read_unlock();
3353 } 3297 }
3298 if (cpa) {
3299 ext4_mb_use_group_pa(ac, cpa);
3300 ac->ac_criteria = 20;
3301 return 1;
3302 }
3354 return 0; 3303 return 0;
3355} 3304}
3356 3305
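
The goal_block computed above converts the group-relative hint into an absolute block number: group times blocks-per-group, plus the offset within the group, plus the filesystem's first data block. A worked example with assumed values (32768 blocks per group, first data block 1 as on a 1k-blocksize filesystem):

    #include <stdio.h>

    int main(void)
    {
            /* group 5, offset 100 within the group */
            unsigned long long goal = 5ULL * 32768 + 100 + 1;

            printf("%llu\n", goal);    /* 163941 */
            return 0;
    }
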
@@ -3845,7 +3794,7 @@ out:
3845 * 3794 *
3846 * FIXME!! Make sure it is valid at all the call sites 3795 * FIXME!! Make sure it is valid at all the call sites
3847 */ 3796 */
3848void ext4_mb_discard_inode_preallocations(struct inode *inode) 3797void ext4_discard_preallocations(struct inode *inode)
3849{ 3798{
3850 struct ext4_inode_info *ei = EXT4_I(inode); 3799 struct ext4_inode_info *ei = EXT4_I(inode);
3851 struct super_block *sb = inode->i_sb; 3800 struct super_block *sb = inode->i_sb;
@@ -3857,7 +3806,7 @@ void ext4_mb_discard_inode_preallocations(struct inode *inode)
3857 struct ext4_buddy e4b; 3806 struct ext4_buddy e4b;
3858 int err; 3807 int err;
3859 3808
3860 if (!test_opt(sb, MBALLOC) || !S_ISREG(inode->i_mode)) { 3809 if (!S_ISREG(inode->i_mode)) {
3861 /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/ 3810 /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
3862 return; 3811 return;
3863 } 3812 }
@@ -4055,8 +4004,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4055 * per cpu locality group is to reduce the contention between block 4004 * per cpu locality group is to reduce the contention between block
4056 * request from multiple CPUs. 4005 * request from multiple CPUs.
4057 */ 4006 */
4058 ac->ac_lg = &sbi->s_locality_groups[get_cpu()]; 4007 ac->ac_lg = per_cpu_ptr(sbi->s_locality_groups, raw_smp_processor_id());
4059 put_cpu();
4060 4008
4061 /* we're going to use group allocation */ 4009 /* we're going to use group allocation */
4062 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; 4010 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
@@ -4330,33 +4278,32 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
4330ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, 4278ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4331 struct ext4_allocation_request *ar, int *errp) 4279 struct ext4_allocation_request *ar, int *errp)
4332{ 4280{
4281 int freed;
4333 struct ext4_allocation_context *ac = NULL; 4282 struct ext4_allocation_context *ac = NULL;
4334 struct ext4_sb_info *sbi; 4283 struct ext4_sb_info *sbi;
4335 struct super_block *sb; 4284 struct super_block *sb;
4336 ext4_fsblk_t block = 0; 4285 ext4_fsblk_t block = 0;
4337 int freed; 4286 unsigned long inquota;
4338 int inquota; 4287 unsigned long reserv_blks = 0;
4339 4288
4340 sb = ar->inode->i_sb; 4289 sb = ar->inode->i_sb;
4341 sbi = EXT4_SB(sb); 4290 sbi = EXT4_SB(sb);
4342 4291
4343 if (!test_opt(sb, MBALLOC)) {
4344 block = ext4_old_new_blocks(handle, ar->inode, ar->goal,
4345 &(ar->len), errp);
4346 return block;
4347 }
4348 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) { 4292 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
4349 /* 4293 /*
4350 * With delalloc we already reserved the blocks 4294 * With delalloc we already reserved the blocks
4351 */ 4295 */
4352 ar->len = ext4_has_free_blocks(sbi, ar->len); 4296 while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) {
4353 } 4297 /* let others free the space */
4354 4298 yield();
4355 if (ar->len == 0) { 4299 ar->len = ar->len >> 1;
4356 *errp = -ENOSPC; 4300 }
4357 return 0; 4301 if (!ar->len) {
4302 *errp = -ENOSPC;
4303 return 0;
4304 }
4305 reserv_blks = ar->len;
4358 } 4306 }
4359
4360 while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) { 4307 while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
4361 ar->flags |= EXT4_MB_HINT_NOPREALLOC; 4308 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4362 ar->len--; 4309 ar->len--;
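
Instead of failing outright when the claim does not fit, ext4_mb_new_blocks() now halves the request and retries after yielding, trading allocation size for forward progress under pressure. A toy model (claim_free_blocks() is a hypothetical stand-in that admits at most 4 blocks):

    #include <sched.h>
    #include <stdio.h>

    static int claim_free_blocks(unsigned long len)
    {
            return len > 4;    /* nonzero means the claim failed */
    }

    int main(void)
    {
            unsigned long len = 32;

            /* halve until the claim fits or nothing is left, yielding
             * so other tasks get a chance to free space in between */
            while (len && claim_free_blocks(len)) {
                    sched_yield();
                    len >>= 1;
            }
            if (!len)
                    printf("ENOSPC\n");
            else
                    printf("claimed %lu blocks\n", len);   /* 4 */
            return 0;
    }
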
@@ -4402,7 +4349,7 @@ repeat:
4402 } 4349 }
4403 4350
4404 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 4351 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4405 *errp = ext4_mb_mark_diskspace_used(ac, handle); 4352 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
4406 if (*errp == -EAGAIN) { 4353 if (*errp == -EAGAIN) {
4407 ac->ac_b_ex.fe_group = 0; 4354 ac->ac_b_ex.fe_group = 0;
4408 ac->ac_b_ex.fe_start = 0; 4355 ac->ac_b_ex.fe_start = 0;
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index c7c9906c2a75..b3b4828f8b89 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -257,7 +257,6 @@ static void ext4_mb_store_history(struct ext4_allocation_context *ac);
257 257
258#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 258#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
259 259
260static struct proc_dir_entry *proc_root_ext4;
261struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); 260struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
262 261
263static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 262static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index b9e077ba07e9..f2a9cf498ecd 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -53,7 +53,8 @@ static int finish_range(handle_t *handle, struct inode *inode,
53 * credit. But below we try not to accumulate too much 53 * credit. But below we try not to accumulate too much
54 * of them by restarting the journal. 54 * of them by restarting the journal.
55 */ 55 */
56 needed = ext4_ext_calc_credits_for_insert(inode, path); 56 needed = ext4_ext_calc_credits_for_single_extent(inode,
57 lb->last_block - lb->first_block + 1, path);
57 58
58 /* 59 /*
59 * Make sure the credit we accumulated is not really high 60 * Make sure the credit we accumulated is not really high
@@ -446,8 +447,7 @@ static int free_ext_block(handle_t *handle, struct inode *inode)
446 447
447} 448}
448 449
449int ext4_ext_migrate(struct inode *inode, struct file *filp, 450int ext4_ext_migrate(struct inode *inode)
450 unsigned int cmd, unsigned long arg)
451{ 451{
452 handle_t *handle; 452 handle_t *handle;
453 int retval = 0, i; 453 int retval = 0, i;
@@ -515,12 +515,6 @@ int ext4_ext_migrate(struct inode *inode, struct file *filp,
515 * when we add extents we extend the journal 515 * when we add extents we extend the journal
516 */ 516 */
517 /* 517 /*
518 * inode_mutex prevent write and truncate on the file. Read still goes
519 * through. We take i_data_sem in ext4_ext_swap_inode_data before we
520 * switch the inode format to prevent read.
521 */
522 mutex_lock(&(inode->i_mutex));
523 /*
524 * Even though we take i_mutex we can still cause block allocation 518 * Even though we take i_mutex we can still cause block allocation
525 * via mmap write to holes. If we have allocated new blocks we fail 519 * via mmap write to holes. If we have allocated new blocks we fail
526 * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag. 520 * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag.
@@ -622,7 +616,6 @@ err_out:
622 tmp_inode->i_nlink = 0; 616 tmp_inode->i_nlink = 0;
623 617
624 ext4_journal_stop(handle); 618 ext4_journal_stop(handle);
625 mutex_unlock(&(inode->i_mutex));
626 619
627 if (tmp_inode) 620 if (tmp_inode)
628 iput(tmp_inode); 621 iput(tmp_inode);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 387ad98350c3..92db9e945147 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -151,34 +151,36 @@ struct dx_map_entry
151 151
152static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); 152static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
153static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); 153static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
154static inline unsigned dx_get_hash (struct dx_entry *entry); 154static inline unsigned dx_get_hash(struct dx_entry *entry);
155static void dx_set_hash (struct dx_entry *entry, unsigned value); 155static void dx_set_hash(struct dx_entry *entry, unsigned value);
156static unsigned dx_get_count (struct dx_entry *entries); 156static unsigned dx_get_count(struct dx_entry *entries);
157static unsigned dx_get_limit (struct dx_entry *entries); 157static unsigned dx_get_limit(struct dx_entry *entries);
158static void dx_set_count (struct dx_entry *entries, unsigned value); 158static void dx_set_count(struct dx_entry *entries, unsigned value);
159static void dx_set_limit (struct dx_entry *entries, unsigned value); 159static void dx_set_limit(struct dx_entry *entries, unsigned value);
160static unsigned dx_root_limit (struct inode *dir, unsigned infosize); 160static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
161static unsigned dx_node_limit (struct inode *dir); 161static unsigned dx_node_limit(struct inode *dir);
162static struct dx_frame *dx_probe(struct dentry *dentry, 162static struct dx_frame *dx_probe(const struct qstr *d_name,
163 struct inode *dir, 163 struct inode *dir,
164 struct dx_hash_info *hinfo, 164 struct dx_hash_info *hinfo,
165 struct dx_frame *frame, 165 struct dx_frame *frame,
166 int *err); 166 int *err);
167static void dx_release (struct dx_frame *frames); 167static void dx_release(struct dx_frame *frames);
168static int dx_make_map (struct ext4_dir_entry_2 *de, int size, 168static int dx_make_map(struct ext4_dir_entry_2 *de, int size,
169 struct dx_hash_info *hinfo, struct dx_map_entry map[]); 169 struct dx_hash_info *hinfo, struct dx_map_entry map[]);
170static void dx_sort_map(struct dx_map_entry *map, unsigned count); 170static void dx_sort_map(struct dx_map_entry *map, unsigned count);
171static struct ext4_dir_entry_2 *dx_move_dirents (char *from, char *to, 171static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
172 struct dx_map_entry *offsets, int count); 172 struct dx_map_entry *offsets, int count);
173static struct ext4_dir_entry_2* dx_pack_dirents (char *base, int size); 173static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size);
174static void dx_insert_block(struct dx_frame *frame, 174static void dx_insert_block(struct dx_frame *frame,
175 u32 hash, ext4_lblk_t block); 175 u32 hash, ext4_lblk_t block);
176static int ext4_htree_next_block(struct inode *dir, __u32 hash, 176static int ext4_htree_next_block(struct inode *dir, __u32 hash,
177 struct dx_frame *frame, 177 struct dx_frame *frame,
178 struct dx_frame *frames, 178 struct dx_frame *frames,
179 __u32 *start_hash); 179 __u32 *start_hash);
180static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, 180static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
181 struct ext4_dir_entry_2 **res_dir, int *err); 181 const struct qstr *d_name,
182 struct ext4_dir_entry_2 **res_dir,
183 int *err);
182static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, 184static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
183 struct inode *inode); 185 struct inode *inode);
184 186
@@ -207,44 +209,44 @@ static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
207 entry->block = cpu_to_le32(value); 209 entry->block = cpu_to_le32(value);
208} 210}
209 211
210static inline unsigned dx_get_hash (struct dx_entry *entry) 212static inline unsigned dx_get_hash(struct dx_entry *entry)
211{ 213{
212 return le32_to_cpu(entry->hash); 214 return le32_to_cpu(entry->hash);
213} 215}
214 216
215static inline void dx_set_hash (struct dx_entry *entry, unsigned value) 217static inline void dx_set_hash(struct dx_entry *entry, unsigned value)
216{ 218{
217 entry->hash = cpu_to_le32(value); 219 entry->hash = cpu_to_le32(value);
218} 220}
219 221
220static inline unsigned dx_get_count (struct dx_entry *entries) 222static inline unsigned dx_get_count(struct dx_entry *entries)
221{ 223{
222 return le16_to_cpu(((struct dx_countlimit *) entries)->count); 224 return le16_to_cpu(((struct dx_countlimit *) entries)->count);
223} 225}
224 226
225static inline unsigned dx_get_limit (struct dx_entry *entries) 227static inline unsigned dx_get_limit(struct dx_entry *entries)
226{ 228{
227 return le16_to_cpu(((struct dx_countlimit *) entries)->limit); 229 return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
228} 230}
229 231
230static inline void dx_set_count (struct dx_entry *entries, unsigned value) 232static inline void dx_set_count(struct dx_entry *entries, unsigned value)
231{ 233{
232 ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); 234 ((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
233} 235}
234 236
235static inline void dx_set_limit (struct dx_entry *entries, unsigned value) 237static inline void dx_set_limit(struct dx_entry *entries, unsigned value)
236{ 238{
237 ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); 239 ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
238} 240}
239 241
240static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) 242static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
241{ 243{
242 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - 244 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
243 EXT4_DIR_REC_LEN(2) - infosize; 245 EXT4_DIR_REC_LEN(2) - infosize;
244 return entry_space / sizeof(struct dx_entry); 246 return entry_space / sizeof(struct dx_entry);
245} 247}
246 248
247static inline unsigned dx_node_limit (struct inode *dir) 249static inline unsigned dx_node_limit(struct inode *dir)
248{ 250{
249 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); 251 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
250 return entry_space / sizeof(struct dx_entry); 252 return entry_space / sizeof(struct dx_entry);
@@ -254,12 +256,12 @@ static inline unsigned dx_node_limit (struct inode *dir)
254 * Debug 256 * Debug
255 */ 257 */
256#ifdef DX_DEBUG 258#ifdef DX_DEBUG
257static void dx_show_index (char * label, struct dx_entry *entries) 259static void dx_show_index(char * label, struct dx_entry *entries)
258{ 260{
259 int i, n = dx_get_count (entries); 261 int i, n = dx_get_count (entries);
260 printk("%s index ", label); 262 printk(KERN_DEBUG "%s index ", label);
261 for (i = 0; i < n; i++) { 263 for (i = 0; i < n; i++) {
262 printk("%x->%lu ", i? dx_get_hash(entries + i) : 264 printk("%x->%lu ", i ? dx_get_hash(entries + i) :
263 0, (unsigned long)dx_get_block(entries + i)); 265 0, (unsigned long)dx_get_block(entries + i));
264 } 266 }
265 printk("\n"); 267 printk("\n");
@@ -306,7 +308,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
306 struct dx_entry *entries, int levels) 308 struct dx_entry *entries, int levels)
307{ 309{
308 unsigned blocksize = dir->i_sb->s_blocksize; 310 unsigned blocksize = dir->i_sb->s_blocksize;
309 unsigned count = dx_get_count (entries), names = 0, space = 0, i; 311 unsigned count = dx_get_count(entries), names = 0, space = 0, i;
310 unsigned bcount = 0; 312 unsigned bcount = 0;
311 struct buffer_head *bh; 313 struct buffer_head *bh;
312 int err; 314 int err;
@@ -325,11 +327,12 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
325 names += stats.names; 327 names += stats.names;
326 space += stats.space; 328 space += stats.space;
327 bcount += stats.bcount; 329 bcount += stats.bcount;
328 brelse (bh); 330 brelse(bh);
329 } 331 }
330 if (bcount) 332 if (bcount)
331 printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ", 333 printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
332 names, space/bcount,(space/bcount)*100/blocksize); 334 levels ? "" : " ", names, space/bcount,
335 (space/bcount)*100/blocksize);
333 return (struct stats) { names, space, bcount}; 336 return (struct stats) { names, space, bcount};
334} 337}
335#endif /* DX_DEBUG */ 338#endif /* DX_DEBUG */
@@ -344,7 +347,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
344 * back to userspace. 347 * back to userspace.
345 */ 348 */
346static struct dx_frame * 349static struct dx_frame *
347dx_probe(struct dentry *dentry, struct inode *dir, 350dx_probe(const struct qstr *d_name, struct inode *dir,
348 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) 351 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
349{ 352{
350 unsigned count, indirect; 353 unsigned count, indirect;
@@ -355,8 +358,6 @@ dx_probe(struct dentry *dentry, struct inode *dir,
355 u32 hash; 358 u32 hash;
356 359
357 frame->bh = NULL; 360 frame->bh = NULL;
358 if (dentry)
359 dir = dentry->d_parent->d_inode;
360 if (!(bh = ext4_bread (NULL,dir, 0, 0, err))) 361 if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
361 goto fail; 362 goto fail;
362 root = (struct dx_root *) bh->b_data; 363 root = (struct dx_root *) bh->b_data;
@@ -372,8 +373,8 @@ dx_probe(struct dentry *dentry, struct inode *dir,
372 } 373 }
373 hinfo->hash_version = root->info.hash_version; 374 hinfo->hash_version = root->info.hash_version;
374 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; 375 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
375 if (dentry) 376 if (d_name)
376 ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); 377 ext4fs_dirhash(d_name->name, d_name->len, hinfo);
377 hash = hinfo->hash; 378 hash = hinfo->hash;
378 379
379 if (root->info.unused_flags & 1) { 380 if (root->info.unused_flags & 1) {
@@ -406,7 +407,7 @@ dx_probe(struct dentry *dentry, struct inode *dir,
406 goto fail; 407 goto fail;
407 } 408 }
408 409
409 dxtrace (printk("Look up %x", hash)); 410 dxtrace(printk("Look up %x", hash));
410 while (1) 411 while (1)
411 { 412 {
412 count = dx_get_count(entries); 413 count = dx_get_count(entries);
@@ -555,7 +556,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
555 0, &err))) 556 0, &err)))
556 return err; /* Failure */ 557 return err; /* Failure */
557 p++; 558 p++;
558 brelse (p->bh); 559 brelse(p->bh);
559 p->bh = bh; 560 p->bh = bh;
560 p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; 561 p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
561 } 562 }
@@ -593,7 +594,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
593 /* On error, skip the f_pos to the next block. */ 594 /* On error, skip the f_pos to the next block. */
594 dir_file->f_pos = (dir_file->f_pos | 595 dir_file->f_pos = (dir_file->f_pos |
595 (dir->i_sb->s_blocksize - 1)) + 1; 596 (dir->i_sb->s_blocksize - 1)) + 1;
596 brelse (bh); 597 brelse(bh);
597 return count; 598 return count;
598 } 599 }
599 ext4fs_dirhash(de->name, de->name_len, hinfo); 600 ext4fs_dirhash(de->name, de->name_len, hinfo);
@@ -635,8 +636,8 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
635 int ret, err; 636 int ret, err;
636 __u32 hashval; 637 __u32 hashval;
637 638
638 dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, 639 dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
639 start_minor_hash)); 640 start_hash, start_minor_hash));
640 dir = dir_file->f_path.dentry->d_inode; 641 dir = dir_file->f_path.dentry->d_inode;
641 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { 642 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
642 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 643 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
@@ -648,7 +649,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
648 } 649 }
649 hinfo.hash = start_hash; 650 hinfo.hash = start_hash;
650 hinfo.minor_hash = 0; 651 hinfo.minor_hash = 0;
651 frame = dx_probe(NULL, dir_file->f_path.dentry->d_inode, &hinfo, frames, &err); 652 frame = dx_probe(NULL, dir, &hinfo, frames, &err);
652 if (!frame) 653 if (!frame)
653 return err; 654 return err;
654 655
@@ -694,8 +695,8 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
694 break; 695 break;
695 } 696 }
696 dx_release(frames); 697 dx_release(frames);
697 dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", 698 dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, "
698 count, *next_hash)); 699 "next hash: %x\n", count, *next_hash));
699 return count; 700 return count;
700errout: 701errout:
701 dx_release(frames); 702 dx_release(frames);
@@ -802,17 +803,17 @@ static inline int ext4_match (int len, const char * const name,
802/* 803/*
803 * Returns 0 if not found, -1 on failure, and 1 on success 804 * Returns 0 if not found, -1 on failure, and 1 on success
804 */ 805 */
805static inline int search_dirblock(struct buffer_head * bh, 806static inline int search_dirblock(struct buffer_head *bh,
806 struct inode *dir, 807 struct inode *dir,
807 struct dentry *dentry, 808 const struct qstr *d_name,
808 unsigned long offset, 809 unsigned long offset,
809 struct ext4_dir_entry_2 ** res_dir) 810 struct ext4_dir_entry_2 ** res_dir)
810{ 811{
811 struct ext4_dir_entry_2 * de; 812 struct ext4_dir_entry_2 * de;
812 char * dlimit; 813 char * dlimit;
813 int de_len; 814 int de_len;
814 const char *name = dentry->d_name.name; 815 const char *name = d_name->name;
815 int namelen = dentry->d_name.len; 816 int namelen = d_name->len;
816 817
817 de = (struct ext4_dir_entry_2 *) bh->b_data; 818 de = (struct ext4_dir_entry_2 *) bh->b_data;
818 dlimit = bh->b_data + dir->i_sb->s_blocksize; 819 dlimit = bh->b_data + dir->i_sb->s_blocksize;
@@ -851,12 +852,13 @@ static inline int search_dirblock(struct buffer_head * bh,
851 * The returned buffer_head has ->b_count elevated. The caller is expected 852 * The returned buffer_head has ->b_count elevated. The caller is expected
852 * to brelse() it when appropriate. 853 * to brelse() it when appropriate.
853 */ 854 */
854static struct buffer_head * ext4_find_entry (struct dentry *dentry, 855static struct buffer_head * ext4_find_entry (struct inode *dir,
856 const struct qstr *d_name,
855 struct ext4_dir_entry_2 ** res_dir) 857 struct ext4_dir_entry_2 ** res_dir)
856{ 858{
857 struct super_block * sb; 859 struct super_block *sb;
858 struct buffer_head * bh_use[NAMEI_RA_SIZE]; 860 struct buffer_head *bh_use[NAMEI_RA_SIZE];
859 struct buffer_head * bh, *ret = NULL; 861 struct buffer_head *bh, *ret = NULL;
860 ext4_lblk_t start, block, b; 862 ext4_lblk_t start, block, b;
861 int ra_max = 0; /* Number of bh's in the readahead 863 int ra_max = 0; /* Number of bh's in the readahead
862 buffer, bh_use[] */ 864 buffer, bh_use[] */
@@ -865,16 +867,15 @@ static struct buffer_head * ext4_find_entry (struct dentry *dentry,
865 int num = 0; 867 int num = 0;
866 ext4_lblk_t nblocks; 868 ext4_lblk_t nblocks;
867 int i, err; 869 int i, err;
868 struct inode *dir = dentry->d_parent->d_inode;
869 int namelen; 870 int namelen;
870 871
871 *res_dir = NULL; 872 *res_dir = NULL;
872 sb = dir->i_sb; 873 sb = dir->i_sb;
873 namelen = dentry->d_name.len; 874 namelen = d_name->len;
874 if (namelen > EXT4_NAME_LEN) 875 if (namelen > EXT4_NAME_LEN)
875 return NULL; 876 return NULL;
876 if (is_dx(dir)) { 877 if (is_dx(dir)) {
877 bh = ext4_dx_find_entry(dentry, res_dir, &err); 878 bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
878 /* 879 /*
879 * On success, or if the error was file not found, 880 * On success, or if the error was file not found,
880 * return. Otherwise, fall back to doing a search the 881 * return. Otherwise, fall back to doing a search the
@@ -882,7 +883,8 @@ static struct buffer_head * ext4_find_entry (struct dentry *dentry,
882 */ 883 */
883 if (bh || (err != ERR_BAD_DX_DIR)) 884 if (bh || (err != ERR_BAD_DX_DIR))
884 return bh; 885 return bh;
885 dxtrace(printk("ext4_find_entry: dx failed, falling back\n")); 886 dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
887 "falling back\n"));
886 } 888 }
887 nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); 889 nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
888 start = EXT4_I(dir)->i_dir_start_lookup; 890 start = EXT4_I(dir)->i_dir_start_lookup;
@@ -926,7 +928,7 @@ restart:
926 brelse(bh); 928 brelse(bh);
927 goto next; 929 goto next;
928 } 930 }
929 i = search_dirblock(bh, dir, dentry, 931 i = search_dirblock(bh, dir, d_name,
930 block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); 932 block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
931 if (i == 1) { 933 if (i == 1) {
932 EXT4_I(dir)->i_dir_start_lookup = block; 934 EXT4_I(dir)->i_dir_start_lookup = block;
@@ -956,11 +958,11 @@ restart:
956cleanup_and_exit: 958cleanup_and_exit:
957 /* Clean up the read-ahead blocks */ 959 /* Clean up the read-ahead blocks */
958 for (; ra_ptr < ra_max; ra_ptr++) 960 for (; ra_ptr < ra_max; ra_ptr++)
959 brelse (bh_use[ra_ptr]); 961 brelse(bh_use[ra_ptr]);
960 return ret; 962 return ret;
961} 963}
962 964
963static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, 965static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
964 struct ext4_dir_entry_2 **res_dir, int *err) 966 struct ext4_dir_entry_2 **res_dir, int *err)
965{ 967{
966 struct super_block * sb; 968 struct super_block * sb;
@@ -971,14 +973,13 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
971 struct buffer_head *bh; 973 struct buffer_head *bh;
972 ext4_lblk_t block; 974 ext4_lblk_t block;
973 int retval; 975 int retval;
974 int namelen = dentry->d_name.len; 976 int namelen = d_name->len;
975 const u8 *name = dentry->d_name.name; 977 const u8 *name = d_name->name;
976 struct inode *dir = dentry->d_parent->d_inode;
977 978
978 sb = dir->i_sb; 979 sb = dir->i_sb;
979 /* NFS may look up ".." - look at dx_root directory block */ 980 /* NFS may look up ".." - look at dx_root directory block */
980 if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){ 981 if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
981 if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err))) 982 if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
982 return NULL; 983 return NULL;
983 } else { 984 } else {
984 frame = frames; 985 frame = frames;
@@ -1010,7 +1011,7 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
1010 return bh; 1011 return bh;
1011 } 1012 }
1012 } 1013 }
1013 brelse (bh); 1014 brelse(bh);
1014 /* Check to see if we should continue to search */ 1015 /* Check to see if we should continue to search */
1015 retval = ext4_htree_next_block(dir, hash, frame, 1016 retval = ext4_htree_next_block(dir, hash, frame,
1016 frames, NULL); 1017 frames, NULL);
@@ -1025,25 +1026,25 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
1025 1026
1026 *err = -ENOENT; 1027 *err = -ENOENT;
1027errout: 1028errout:
1028 dxtrace(printk("%s not found\n", name)); 1029 dxtrace(printk(KERN_DEBUG "%s not found\n", name));
1029 dx_release (frames); 1030 dx_release (frames);
1030 return NULL; 1031 return NULL;
1031} 1032}
1032 1033
1033static struct dentry *ext4_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) 1034static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
1034{ 1035{
1035 struct inode * inode; 1036 struct inode *inode;
1036 struct ext4_dir_entry_2 * de; 1037 struct ext4_dir_entry_2 *de;
1037 struct buffer_head * bh; 1038 struct buffer_head *bh;
1038 1039
1039 if (dentry->d_name.len > EXT4_NAME_LEN) 1040 if (dentry->d_name.len > EXT4_NAME_LEN)
1040 return ERR_PTR(-ENAMETOOLONG); 1041 return ERR_PTR(-ENAMETOOLONG);
1041 1042
1042 bh = ext4_find_entry(dentry, &de); 1043 bh = ext4_find_entry(dir, &dentry->d_name, &de);
1043 inode = NULL; 1044 inode = NULL;
1044 if (bh) { 1045 if (bh) {
1045 unsigned long ino = le32_to_cpu(de->inode); 1046 unsigned long ino = le32_to_cpu(de->inode);
1046 brelse (bh); 1047 brelse(bh);
1047 if (!ext4_valid_inum(dir->i_sb, ino)) { 1048 if (!ext4_valid_inum(dir->i_sb, ino)) {
1048 ext4_error(dir->i_sb, "ext4_lookup", 1049 ext4_error(dir->i_sb, "ext4_lookup",
1049 "bad inode number: %lu", ino); 1050 "bad inode number: %lu", ino);
@@ -1062,15 +1063,14 @@ struct dentry *ext4_get_parent(struct dentry *child)
1062 unsigned long ino; 1063 unsigned long ino;
1063 struct dentry *parent; 1064 struct dentry *parent;
1064 struct inode *inode; 1065 struct inode *inode;
1065 struct dentry dotdot; 1066 static const struct qstr dotdot = {
1067 .name = "..",
1068 .len = 2,
1069 };
1066 struct ext4_dir_entry_2 * de; 1070 struct ext4_dir_entry_2 * de;
1067 struct buffer_head *bh; 1071 struct buffer_head *bh;
1068 1072
1069 dotdot.d_name.name = ".."; 1073 bh = ext4_find_entry(child->d_inode, &dotdot, &de);
1070 dotdot.d_name.len = 2;
1071 dotdot.d_parent = child; /* confusing, isn't it! */
1072
1073 bh = ext4_find_entry(&dotdot, &de);
1074 inode = NULL; 1074 inode = NULL;
1075 if (!bh) 1075 if (!bh)
1076 return ERR_PTR(-ENOENT); 1076 return ERR_PTR(-ENOENT);
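
With ext4_find_entry() taking the directory inode and a qstr explicitly, ext4_get_parent() no longer has to fabricate a dentry whose only job was to carry the name ".." and a d_parent back-pointer. A small standalone illustration of the qstr-based call shape (local struct qstr model; the kernel's also carries a hash field):

    #include <stdio.h>

    struct qstr {
            const char   *name;
            unsigned int  len;
    };

    /* stand-in for ext4_find_entry(dir, d_name, ...): inode and name
     * are independent arguments, so ".." needs no fake dentry */
    static void find_entry(int dir_ino, const struct qstr *d_name)
    {
            printf("lookup '%.*s' in inode %d\n",
                   (int)d_name->len, d_name->name, dir_ino);
    }

    int main(void)
    {
            static const struct qstr dotdot = { .name = "..", .len = 2 };

            find_entry(2, &dotdot);
            return 0;
    }
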
@@ -1201,10 +1201,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1201 1201
1202 /* create map in the end of data2 block */ 1202 /* create map in the end of data2 block */
1203 map = (struct dx_map_entry *) (data2 + blocksize); 1203 map = (struct dx_map_entry *) (data2 + blocksize);
1204 count = dx_make_map ((struct ext4_dir_entry_2 *) data1, 1204 count = dx_make_map((struct ext4_dir_entry_2 *) data1,
1205 blocksize, hinfo, map); 1205 blocksize, hinfo, map);
1206 map -= count; 1206 map -= count;
1207 dx_sort_map (map, count); 1207 dx_sort_map(map, count);
1208 /* Split the existing block in the middle, size-wise */ 1208 /* Split the existing block in the middle, size-wise */
1209 size = 0; 1209 size = 0;
1210 move = 0; 1210 move = 0;
@@ -1225,7 +1225,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1225 1225
1226 /* Fancy dance to stay within two buffers */ 1226 /* Fancy dance to stay within two buffers */
1227 de2 = dx_move_dirents(data1, data2, map + split, count - split); 1227 de2 = dx_move_dirents(data1, data2, map + split, count - split);
1228 de = dx_pack_dirents(data1,blocksize); 1228 de = dx_pack_dirents(data1, blocksize);
1229 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); 1229 de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
1230 de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2); 1230 de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2);
1231 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); 1231 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
@@ -1237,15 +1237,15 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1237 swap(*bh, bh2); 1237 swap(*bh, bh2);
1238 de = de2; 1238 de = de2;
1239 } 1239 }
1240 dx_insert_block (frame, hash2 + continued, newblock); 1240 dx_insert_block(frame, hash2 + continued, newblock);
1241 err = ext4_journal_dirty_metadata (handle, bh2); 1241 err = ext4_journal_dirty_metadata(handle, bh2);
1242 if (err) 1242 if (err)
1243 goto journal_error; 1243 goto journal_error;
1244 err = ext4_journal_dirty_metadata (handle, frame->bh); 1244 err = ext4_journal_dirty_metadata(handle, frame->bh);
1245 if (err) 1245 if (err)
1246 goto journal_error; 1246 goto journal_error;
1247 brelse (bh2); 1247 brelse(bh2);
1248 dxtrace(dx_show_index ("frame", frame->entries)); 1248 dxtrace(dx_show_index("frame", frame->entries));
1249 return de; 1249 return de;
1250 1250
1251journal_error: 1251journal_error:
@@ -1271,7 +1271,7 @@ errout:
1271 */ 1271 */
1272static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, 1272static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1273 struct inode *inode, struct ext4_dir_entry_2 *de, 1273 struct inode *inode, struct ext4_dir_entry_2 *de,
1274 struct buffer_head * bh) 1274 struct buffer_head *bh)
1275{ 1275{
1276 struct inode *dir = dentry->d_parent->d_inode; 1276 struct inode *dir = dentry->d_parent->d_inode;
1277 const char *name = dentry->d_name.name; 1277 const char *name = dentry->d_name.name;
@@ -1288,11 +1288,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1288 while ((char *) de <= top) { 1288 while ((char *) de <= top) {
1289 if (!ext4_check_dir_entry("ext4_add_entry", dir, de, 1289 if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
1290 bh, offset)) { 1290 bh, offset)) {
1291 brelse (bh); 1291 brelse(bh);
1292 return -EIO; 1292 return -EIO;
1293 } 1293 }
1294 if (ext4_match (namelen, name, de)) { 1294 if (ext4_match(namelen, name, de)) {
1295 brelse (bh); 1295 brelse(bh);
1296 return -EEXIST; 1296 return -EEXIST;
1297 } 1297 }
1298 nlen = EXT4_DIR_REC_LEN(de->name_len); 1298 nlen = EXT4_DIR_REC_LEN(de->name_len);
@@ -1329,7 +1329,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1329 } else 1329 } else
1330 de->inode = 0; 1330 de->inode = 0;
1331 de->name_len = namelen; 1331 de->name_len = namelen;
1332 memcpy (de->name, name, namelen); 1332 memcpy(de->name, name, namelen);
1333 /* 1333 /*
1334 * XXX shouldn't update any times until successful 1334 * XXX shouldn't update any times until successful
1335 * completion of syscall, but too many callers depend 1335 * completion of syscall, but too many callers depend
@@ -1377,7 +1377,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1377 struct fake_dirent *fde; 1377 struct fake_dirent *fde;
1378 1378
1379 blocksize = dir->i_sb->s_blocksize; 1379 blocksize = dir->i_sb->s_blocksize;
1380 dxtrace(printk("Creating index\n")); 1380 dxtrace(printk(KERN_DEBUG "Creating index\n"));
1381 retval = ext4_journal_get_write_access(handle, bh); 1381 retval = ext4_journal_get_write_access(handle, bh);
1382 if (retval) { 1382 if (retval) {
1383 ext4_std_error(dir->i_sb, retval); 1383 ext4_std_error(dir->i_sb, retval);
@@ -1386,7 +1386,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1386 } 1386 }
1387 root = (struct dx_root *) bh->b_data; 1387 root = (struct dx_root *) bh->b_data;
1388 1388
1389 bh2 = ext4_append (handle, dir, &block, &retval); 1389 bh2 = ext4_append(handle, dir, &block, &retval);
1390 if (!(bh2)) { 1390 if (!(bh2)) {
1391 brelse(bh); 1391 brelse(bh);
1392 return retval; 1392 return retval;
@@ -1412,9 +1412,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1412 root->info.info_length = sizeof(root->info); 1412 root->info.info_length = sizeof(root->info);
1413 root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 1413 root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
1414 entries = root->entries; 1414 entries = root->entries;
1415 dx_set_block (entries, 1); 1415 dx_set_block(entries, 1);
1416 dx_set_count (entries, 1); 1416 dx_set_count(entries, 1);
1417 dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info))); 1417 dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info)));
1418 1418
1419 /* Initialize as for dx_probe */ 1419 /* Initialize as for dx_probe */
1420 hinfo.hash_version = root->info.hash_version; 1420 hinfo.hash_version = root->info.hash_version;
@@ -1443,14 +1443,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1443 * may not sleep between calling this and putting something into 1443 * may not sleep between calling this and putting something into
1444 * the entry, as someone else might have used it while you slept. 1444 * the entry, as someone else might have used it while you slept.
1445 */ 1445 */
1446static int ext4_add_entry (handle_t *handle, struct dentry *dentry, 1446static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1447 struct inode *inode) 1447 struct inode *inode)
1448{ 1448{
 	struct inode *dir = dentry->d_parent->d_inode;
 	unsigned long offset;
-	struct buffer_head * bh;
+	struct buffer_head *bh;
 	struct ext4_dir_entry_2 *de;
-	struct super_block * sb;
+	struct super_block *sb;
 	int retval;
 	int dx_fallback=0;
 	unsigned blocksize;
@@ -1500,13 +1500,13 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 	struct dx_frame frames[2], *frame;
 	struct dx_entry *entries, *at;
 	struct dx_hash_info hinfo;
-	struct buffer_head * bh;
+	struct buffer_head *bh;
 	struct inode *dir = dentry->d_parent->d_inode;
-	struct super_block * sb = dir->i_sb;
+	struct super_block *sb = dir->i_sb;
 	struct ext4_dir_entry_2 *de;
 	int err;
 
-	frame = dx_probe(dentry, NULL, &hinfo, frames, &err);
+	frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
 	if (!frame)
 		return err;
 	entries = frame->entries;
@@ -1527,7 +1527,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 	}
 
 	/* Block full, should compress but for now just split */
-	dxtrace(printk("using %u of %u node entries\n",
+	dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
 		       dx_get_count(entries), dx_get_limit(entries)));
 	/* Need to split index? */
 	if (dx_get_count(entries) == dx_get_limit(entries)) {
@@ -1559,7 +1559,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 		if (levels) {
 			unsigned icount1 = icount/2, icount2 = icount - icount1;
 			unsigned hash2 = dx_get_hash(entries + icount1);
-			dxtrace(printk("Split index %i/%i\n", icount1, icount2));
+			dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
+				       icount1, icount2));
 
 			BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
 			err = ext4_journal_get_write_access(handle,
@@ -1567,11 +1568,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			if (err)
 				goto journal_error;
 
-			memcpy ((char *) entries2, (char *) (entries + icount1),
+			memcpy((char *) entries2, (char *) (entries + icount1),
 				icount2 * sizeof(struct dx_entry));
-			dx_set_count (entries, icount1);
-			dx_set_count (entries2, icount2);
-			dx_set_limit (entries2, dx_node_limit(dir));
+			dx_set_count(entries, icount1);
+			dx_set_count(entries2, icount2);
+			dx_set_limit(entries2, dx_node_limit(dir));
 
 			/* Which index block gets the new entry? */
 			if (at - entries >= icount1) {
@@ -1579,16 +1580,17 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 				frame->entries = entries = entries2;
 				swap(frame->bh, bh2);
 			}
-			dx_insert_block (frames + 0, hash2, newblock);
-			dxtrace(dx_show_index ("node", frames[1].entries));
-			dxtrace(dx_show_index ("node",
+			dx_insert_block(frames + 0, hash2, newblock);
+			dxtrace(dx_show_index("node", frames[1].entries));
+			dxtrace(dx_show_index("node",
 			       ((struct dx_node *) bh2->b_data)->entries));
 			err = ext4_journal_dirty_metadata(handle, bh2);
 			if (err)
 				goto journal_error;
 			brelse (bh2);
 		} else {
-			dxtrace(printk("Creating second level index...\n"));
+			dxtrace(printk(KERN_DEBUG
+				       "Creating second level index...\n"));
 			memcpy((char *) entries2, (char *) entries,
 			       icount * sizeof(struct dx_entry));
 			dx_set_limit(entries2, dx_node_limit(dir));
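
The split above moves the upper half of a full index node into a fresh block and keys the parent on the first hash of the new node. A minimal userspace sketch of that arithmetic follows; dx_entry and the fixed-size node here are simplified stand-ins for illustration, not the kernel structures.

#include <stdio.h>
#include <string.h>

struct dx_entry { unsigned hash; unsigned block; };

/* Split src[0..*count) in half into dst; returns the pivot hash the
 * parent must insert for dst's block. */
static unsigned dx_split(struct dx_entry *src, unsigned *count,
			 struct dx_entry *dst, unsigned *dst_count)
{
	unsigned icount1 = *count / 2, icount2 = *count - icount1;

	memcpy(dst, src + icount1, icount2 * sizeof(struct dx_entry));
	*count = icount1;
	*dst_count = icount2;
	return dst[0].hash;	/* == dx_get_hash(entries + icount1) */
}

int main(void)
{
	struct dx_entry node[8] = {
		{10, 1}, {20, 2}, {30, 3}, {40, 4},
		{50, 5}, {60, 6}, {70, 7}, {80, 8},
	};
	struct dx_entry node2[8];
	unsigned count = 8, count2, hash2;

	hash2 = dx_split(node, &count, node2, &count2);
	printf("split %u/%u, parent inserts hash %u\n", count, count2, hash2);
	return 0;
}
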
@@ -1630,12 +1632,12 @@ cleanup:
  * ext4_delete_entry deletes a directory entry by merging it with the
  * previous entry
  */
-static int ext4_delete_entry (handle_t *handle,
-			      struct inode * dir,
-			      struct ext4_dir_entry_2 * de_del,
-			      struct buffer_head * bh)
+static int ext4_delete_entry(handle_t *handle,
+			     struct inode *dir,
+			     struct ext4_dir_entry_2 *de_del,
+			     struct buffer_head *bh)
 {
-	struct ext4_dir_entry_2 * de, * pde;
+	struct ext4_dir_entry_2 *de, *pde;
 	int i;
 
 	i = 0;
@@ -1716,11 +1718,11 @@ static int ext4_add_nondir(handle_t *handle,
  * If the create succeeds, we fill in the inode information
  * with d_instantiate().
  */
-static int ext4_create (struct inode * dir, struct dentry * dentry, int mode,
-			struct nameidata *nd)
+static int ext4_create(struct inode *dir, struct dentry *dentry, int mode,
+		       struct nameidata *nd)
 {
 	handle_t *handle;
-	struct inode * inode;
+	struct inode *inode;
 	int err, retries = 0;
 
 retry:
@@ -1747,8 +1749,8 @@ retry:
 	return err;
 }
 
-static int ext4_mknod (struct inode * dir, struct dentry *dentry,
-		       int mode, dev_t rdev)
+static int ext4_mknod(struct inode *dir, struct dentry *dentry,
		       int mode, dev_t rdev)
 {
 	handle_t *handle;
 	struct inode *inode;
@@ -1767,11 +1769,11 @@ retry:
 	if (IS_DIRSYNC(dir))
 		handle->h_sync = 1;
 
-	inode = ext4_new_inode (handle, dir, mode);
+	inode = ext4_new_inode(handle, dir, mode);
 	err = PTR_ERR(inode);
 	if (!IS_ERR(inode)) {
 		init_special_inode(inode, inode->i_mode, rdev);
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 		inode->i_op = &ext4_special_inode_operations;
 #endif
 		err = ext4_add_nondir(handle, dentry, inode);
@@ -1782,12 +1784,12 @@ retry:
 	return err;
 }
 
-static int ext4_mkdir(struct inode * dir, struct dentry * dentry, int mode)
+static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
 	handle_t *handle;
-	struct inode * inode;
-	struct buffer_head * dir_block;
-	struct ext4_dir_entry_2 * de;
+	struct inode *inode;
+	struct buffer_head *dir_block;
+	struct ext4_dir_entry_2 *de;
 	int err, retries = 0;
 
 	if (EXT4_DIR_LINK_MAX(dir))
@@ -1803,7 +1805,7 @@ retry:
 	if (IS_DIRSYNC(dir))
 		handle->h_sync = 1;
 
-	inode = ext4_new_inode (handle, dir, S_IFDIR | mode);
+	inode = ext4_new_inode(handle, dir, S_IFDIR | mode);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_stop;
@@ -1811,7 +1813,7 @@ retry:
 	inode->i_op = &ext4_dir_inode_operations;
 	inode->i_fop = &ext4_dir_operations;
 	inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
-	dir_block = ext4_bread (handle, inode, 0, 1, &err);
+	dir_block = ext4_bread(handle, inode, 0, 1, &err);
 	if (!dir_block)
 		goto out_clear_inode;
 	BUFFER_TRACE(dir_block, "get_write_access");
@@ -1820,26 +1822,26 @@ retry:
 	de->inode = cpu_to_le32(inode->i_ino);
 	de->name_len = 1;
 	de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len));
-	strcpy (de->name, ".");
+	strcpy(de->name, ".");
 	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
 	de = ext4_next_entry(de);
 	de->inode = cpu_to_le32(dir->i_ino);
 	de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize -
 					   EXT4_DIR_REC_LEN(1));
 	de->name_len = 2;
-	strcpy (de->name, "..");
+	strcpy(de->name, "..");
 	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
 	inode->i_nlink = 2;
 	BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata");
 	ext4_journal_dirty_metadata(handle, dir_block);
-	brelse (dir_block);
+	brelse(dir_block);
 	ext4_mark_inode_dirty(handle, inode);
-	err = ext4_add_entry (handle, dentry, inode);
+	err = ext4_add_entry(handle, dentry, inode);
 	if (err) {
 out_clear_inode:
 		clear_nlink(inode);
 		ext4_mark_inode_dirty(handle, inode);
-		iput (inode);
+		iput(inode);
 		goto out_stop;
 	}
 	ext4_inc_count(handle, dir);
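
For reference, the rec_len values written for "." and ".." above follow from EXT4_DIR_REC_LEN, which pads the 8-byte record header plus the name up to a 4-byte boundary; the ".." record then absorbs the rest of the block via its rec_len. A standalone recreation of the arithmetic (plain C, no kernel headers):

#include <stdio.h>

/* same rounding as EXT4_DIR_REC_LEN: 8 header bytes + name, 4-aligned */
#define DIR_REC_LEN(name_len) (((name_len) + 8 + 3) & ~3)

int main(void)
{
	unsigned blocksize = 4096;
	unsigned dot = DIR_REC_LEN(1);		/* "." */
	unsigned dotdot = blocksize - dot;	/* ".." claims the rest */

	printf("\".\"  rec_len = %u\n", dot);		/* 12 */
	printf("\"..\" rec_len = %u\n", dotdot);	/* 4084 for 4k blocks */
	printf("sum = %u (one full block)\n", dot + dotdot);
	return 0;
}
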
@@ -1856,17 +1858,17 @@ out_stop:
 /*
  * routine to check that the specified directory is empty (for rmdir)
  */
-static int empty_dir (struct inode * inode)
+static int empty_dir(struct inode *inode)
 {
 	unsigned long offset;
-	struct buffer_head * bh;
-	struct ext4_dir_entry_2 * de, * de1;
-	struct super_block * sb;
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de, *de1;
+	struct super_block *sb;
 	int err = 0;
 
 	sb = inode->i_sb;
 	if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
-	    !(bh = ext4_bread (NULL, inode, 0, 0, &err))) {
+	    !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
 		if (err)
 			ext4_error(inode->i_sb, __func__,
 				   "error %d reading directory #%lu offset 0",
@@ -1881,23 +1883,23 @@ static int empty_dir (struct inode * inode)
 	de1 = ext4_next_entry(de);
 	if (le32_to_cpu(de->inode) != inode->i_ino ||
 			!le32_to_cpu(de1->inode) ||
-			strcmp (".", de->name) ||
-			strcmp ("..", de1->name)) {
-		ext4_warning (inode->i_sb, "empty_dir",
+			strcmp(".", de->name) ||
+			strcmp("..", de1->name)) {
+		ext4_warning(inode->i_sb, "empty_dir",
 			     "bad directory (dir #%lu) - no `.' or `..'",
 			     inode->i_ino);
-		brelse (bh);
+		brelse(bh);
 		return 1;
 	}
 	offset = ext4_rec_len_from_disk(de->rec_len) +
 		 ext4_rec_len_from_disk(de1->rec_len);
 	de = ext4_next_entry(de1);
-	while (offset < inode->i_size ) {
+	while (offset < inode->i_size) {
 		if (!bh ||
 		    (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
 			err = 0;
-			brelse (bh);
-			bh = ext4_bread (NULL, inode,
+			brelse(bh);
+			bh = ext4_bread(NULL, inode,
 				offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
 			if (!bh) {
 				if (err)
@@ -1917,13 +1919,13 @@ static int empty_dir (struct inode * inode)
 			continue;
 		}
 		if (le32_to_cpu(de->inode)) {
-			brelse (bh);
+			brelse(bh);
 			return 0;
 		}
 		offset += ext4_rec_len_from_disk(de->rec_len);
 		de = ext4_next_entry(de);
 	}
-	brelse (bh);
+	brelse(bh);
 	return 1;
 }
 
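
empty_dir() above walks a directory block entry by entry using each record's rec_len. The sketch below replays that walk in userspace over a hand-built block; the flat single-block layout, toy struct, and inode numbers are assumptions for illustration only.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct dirent_disk {		/* toy cousin of ext4_dir_entry_2 */
	uint32_t inode;
	uint16_t rec_len;
	uint8_t  name_len;
	uint8_t  file_type;
	char     name[];
};

static int block_is_empty(const uint8_t *blk, unsigned blocksize)
{
	unsigned offset = 0, live = 0;

	while (offset < blocksize) {
		const struct dirent_disk *de =
			(const struct dirent_disk *)(blk + offset);

		if (de->rec_len == 0)
			break;			/* corrupt record chain */
		if (de->inode && ++live > 2)
			return 0;		/* more than "." and ".." */
		offset += de->rec_len;		/* ext4_next_entry() analogue */
	}
	return 1;
}

int main(void)
{
	uint64_t storage[8] = { 0 };		/* one aligned 64-byte "block" */
	uint8_t *blk = (uint8_t *)storage;
	struct dirent_disk *de = (struct dirent_disk *)blk;

	de->inode = 11; de->name_len = 1; de->rec_len = 12;
	memcpy(de->name, ".", 1);
	de = (struct dirent_disk *)(blk + 12);
	de->inode = 2; de->name_len = 2; de->rec_len = 64 - 12;
	memcpy(de->name, "..", 2);

	printf("empty: %d\n", block_is_empty(blk, 64));	/* empty: 1 */
	return 0;
}
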
@@ -1954,8 +1956,8 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
 	 * ->i_nlink. For, say it, character device. Not a regular file,
 	 * not a directory, not a symlink and ->i_nlink > 0.
 	 */
-	J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+	J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
 		  S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
 
 	BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
 	err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
@@ -2069,12 +2071,12 @@ out_brelse:
 	goto out_err;
 }
 
-static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
+static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
 {
 	int retval;
-	struct inode * inode;
-	struct buffer_head * bh;
-	struct ext4_dir_entry_2 * de;
+	struct inode *inode;
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de;
 	handle_t *handle;
 
 	/* Initialize quotas before so that eventual writes go in
@@ -2085,7 +2087,7 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
 		return PTR_ERR(handle);
 
 	retval = -ENOENT;
-	bh = ext4_find_entry (dentry, &de);
+	bh = ext4_find_entry(dir, &dentry->d_name, &de);
 	if (!bh)
 		goto end_rmdir;
 
@@ -2099,16 +2101,16 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
 		goto end_rmdir;
 
 	retval = -ENOTEMPTY;
-	if (!empty_dir (inode))
+	if (!empty_dir(inode))
 		goto end_rmdir;
 
 	retval = ext4_delete_entry(handle, dir, de, bh);
 	if (retval)
 		goto end_rmdir;
 	if (!EXT4_DIR_LINK_EMPTY(inode))
-		ext4_warning (inode->i_sb, "ext4_rmdir",
+		ext4_warning(inode->i_sb, "ext4_rmdir",
 			     "empty directory has too many links (%d)",
 			     inode->i_nlink);
 	inode->i_version++;
 	clear_nlink(inode);
 	/* There's no need to set i_disksize: the fact that i_nlink is
@@ -2124,16 +2126,16 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
 
 end_rmdir:
 	ext4_journal_stop(handle);
-	brelse (bh);
+	brelse(bh);
 	return retval;
 }
 
-static int ext4_unlink(struct inode * dir, struct dentry *dentry)
+static int ext4_unlink(struct inode *dir, struct dentry *dentry)
 {
 	int retval;
-	struct inode * inode;
-	struct buffer_head * bh;
-	struct ext4_dir_entry_2 * de;
+	struct inode *inode;
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de;
 	handle_t *handle;
 
 	/* Initialize quotas before so that eventual writes go
@@ -2147,7 +2149,7 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry)
 		handle->h_sync = 1;
 
 	retval = -ENOENT;
-	bh = ext4_find_entry (dentry, &de);
+	bh = ext4_find_entry(dir, &dentry->d_name, &de);
 	if (!bh)
 		goto end_unlink;
 
@@ -2158,9 +2160,9 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry)
 		goto end_unlink;
 
 	if (!inode->i_nlink) {
-		ext4_warning (inode->i_sb, "ext4_unlink",
+		ext4_warning(inode->i_sb, "ext4_unlink",
 			     "Deleting nonexistent file (%lu), %d",
 			     inode->i_ino, inode->i_nlink);
 		inode->i_nlink = 1;
 	}
 	retval = ext4_delete_entry(handle, dir, de, bh);
@@ -2178,15 +2180,15 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry)
 
 end_unlink:
 	ext4_journal_stop(handle);
-	brelse (bh);
+	brelse(bh);
 	return retval;
 }
 
-static int ext4_symlink (struct inode * dir,
-			 struct dentry *dentry, const char * symname)
+static int ext4_symlink(struct inode *dir,
+			struct dentry *dentry, const char *symname)
 {
 	handle_t *handle;
-	struct inode * inode;
+	struct inode *inode;
 	int l, err, retries = 0;
 
 	l = strlen(symname)+1;
@@ -2203,12 +2205,12 @@ retry:
 	if (IS_DIRSYNC(dir))
 		handle->h_sync = 1;
 
-	inode = ext4_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
+	inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_stop;
 
-	if (l > sizeof (EXT4_I(inode)->i_data)) {
+	if (l > sizeof(EXT4_I(inode)->i_data)) {
 		inode->i_op = &ext4_symlink_inode_operations;
 		ext4_set_aops(inode);
 		/*
@@ -2221,14 +2223,14 @@ retry:
 		if (err) {
 			clear_nlink(inode);
 			ext4_mark_inode_dirty(handle, inode);
-			iput (inode);
+			iput(inode);
 			goto out_stop;
 		}
 	} else {
 		/* clear the extent format for fast symlink */
 		EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL;
 		inode->i_op = &ext4_fast_symlink_inode_operations;
-		memcpy((char*)&EXT4_I(inode)->i_data,symname,l);
+		memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
 		inode->i_size = l-1;
 	}
 	EXT4_I(inode)->i_disksize = inode->i_size;
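
The branch above implements fast symlinks: if the target string (NUL included) fits in the inode's i_data area, it is stored inline and no data block is needed; otherwise the regular block path is taken. A toy version of that decision, with a stand-in inode structure and the 60-byte size as an assumption matching ext4's 15 32-bit block slots:

#include <stdio.h>
#include <string.h>

#define I_DATA_SIZE 60	/* 15 x 32-bit block pointers in the ext4 inode */

struct toy_inode {
	char i_data[I_DATA_SIZE];
	unsigned long i_size;
};

/* returns 0 for a fast symlink, 1 when a data block would be needed */
static int make_symlink(struct toy_inode *inode, const char *symname)
{
	size_t l = strlen(symname) + 1;

	if (l > sizeof(inode->i_data))
		return 1;			/* slow path */
	memcpy(inode->i_data, symname, l);	/* fast symlink, stored inline */
	inode->i_size = l - 1;
	return 0;
}

int main(void)
{
	struct toy_inode inode;

	if (!make_symlink(&inode, "/etc/passwd"))
		printf("fast symlink, i_size=%lu target=%s\n",
		       inode.i_size, inode.i_data);
	return 0;
}
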
@@ -2240,8 +2242,8 @@ out_stop:
 	return err;
 }
 
-static int ext4_link (struct dentry * old_dentry,
-		      struct inode * dir, struct dentry *dentry)
+static int ext4_link(struct dentry *old_dentry,
+		     struct inode *dir, struct dentry *dentry)
 {
 	handle_t *handle;
 	struct inode *inode = old_dentry->d_inode;
@@ -2284,13 +2286,13 @@ retry:
  * Anybody can rename anything with this: the permission checks are left to the
  * higher-level routines.
  */
-static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
-			struct inode * new_dir,struct dentry *new_dentry)
+static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
+		       struct inode *new_dir, struct dentry *new_dentry)
 {
 	handle_t *handle;
-	struct inode * old_inode, * new_inode;
-	struct buffer_head * old_bh, * new_bh, * dir_bh;
-	struct ext4_dir_entry_2 * old_de, * new_de;
+	struct inode *old_inode, *new_inode;
+	struct buffer_head *old_bh, *new_bh, *dir_bh;
+	struct ext4_dir_entry_2 *old_de, *new_de;
 	int retval;
 
 	old_bh = new_bh = dir_bh = NULL;
@@ -2308,7 +2310,7 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
 	if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
 		handle->h_sync = 1;
 
-	old_bh = ext4_find_entry (old_dentry, &old_de);
+	old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
 	/*
 	 * Check for inode number is _not_ due to possible IO errors.
 	 * We might rmdir the source, keep it as pwd of some process
@@ -2321,32 +2323,32 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
 		goto end_rename;
 
 	new_inode = new_dentry->d_inode;
-	new_bh = ext4_find_entry (new_dentry, &new_de);
+	new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de);
 	if (new_bh) {
 		if (!new_inode) {
-			brelse (new_bh);
+			brelse(new_bh);
 			new_bh = NULL;
 		}
 	}
 	if (S_ISDIR(old_inode->i_mode)) {
 		if (new_inode) {
 			retval = -ENOTEMPTY;
-			if (!empty_dir (new_inode))
+			if (!empty_dir(new_inode))
 				goto end_rename;
 		}
 		retval = -EIO;
-		dir_bh = ext4_bread (handle, old_inode, 0, 0, &retval);
+		dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
 		if (!dir_bh)
 			goto end_rename;
 		if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
 			goto end_rename;
 		retval = -EMLINK;
-		if (!new_inode && new_dir!=old_dir &&
+		if (!new_inode && new_dir != old_dir &&
 		    new_dir->i_nlink >= EXT4_LINK_MAX)
 			goto end_rename;
 	}
 	if (!new_bh) {
-		retval = ext4_add_entry (handle, new_dentry, old_inode);
+		retval = ext4_add_entry(handle, new_dentry, old_inode);
 		if (retval)
 			goto end_rename;
 	} else {
@@ -2388,7 +2390,7 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
 		struct buffer_head *old_bh2;
 		struct ext4_dir_entry_2 *old_de2;
 
-		old_bh2 = ext4_find_entry(old_dentry, &old_de2);
+		old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2);
 		if (old_bh2) {
 			retval = ext4_delete_entry(handle, old_dir,
 						   old_de2, old_bh2);
@@ -2433,9 +2435,9 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
 	retval = 0;
 
 end_rename:
-	brelse (dir_bh);
-	brelse (old_bh);
-	brelse (new_bh);
+	brelse(dir_bh);
+	brelse(old_bh);
+	brelse(new_bh);
 	ext4_journal_stop(handle);
 	return retval;
 }
@@ -2454,7 +2456,7 @@ const struct inode_operations ext4_dir_inode_operations = {
 	.mknod		= ext4_mknod,
 	.rename		= ext4_rename,
 	.setattr	= ext4_setattr,
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
 	.listxattr	= ext4_listxattr,
@@ -2465,7 +2467,7 @@ const struct inode_operations ext4_dir_inode_operations = {
 
 const struct inode_operations ext4_special_inode_operations = {
 	.setattr	= ext4_setattr,
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
 	.listxattr	= ext4_listxattr,
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 0a9265164265..b6ec1843a015 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -416,8 +416,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	       "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n",
 	       gdb_num);
 
 	/*
 	 * If we are not using the primary superblock/GDT copy don't resize,
 	 * because the user tools have no way of handling this. Probably a
 	 * bad time to do it anyways.
 	 */
@@ -773,7 +773,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 
 	if (reserved_gdb || gdb_off == 0) {
 		if (!EXT4_HAS_COMPAT_FEATURE(sb,
-					     EXT4_FEATURE_COMPAT_RESIZE_INODE)){
+					     EXT4_FEATURE_COMPAT_RESIZE_INODE)
+		    || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
 			ext4_warning(sb, __func__,
 				     "No reserved GDT blocks, can't resize");
 			return -EPERM;
@@ -869,11 +870,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	 * We can allocate memory for mb_alloc based on the new group
 	 * descriptor
 	 */
-	if (test_opt(sb, MBALLOC)) {
-		err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
-		if (err)
-			goto exit_journal;
-	}
+	err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
+	if (err)
+		goto exit_journal;
+
 	/*
 	 * Make the new blocks and inodes valid next. We do this before
 	 * increasing the group count so that once the group is enabled,
@@ -928,6 +928,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	percpu_counter_add(&sbi->s_freeinodes_counter,
 			   EXT4_INODES_PER_GROUP(sb));
 
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+		ext4_group_t flex_group;
+		flex_group = ext4_flex_group(sbi, input->group);
+		sbi->s_flex_groups[flex_group].free_blocks +=
+			input->free_blocks_count;
+		sbi->s_flex_groups[flex_group].free_inodes +=
+			EXT4_INODES_PER_GROUP(sb);
+	}
+
 	ext4_journal_dirty_metadata(handle, sbi->s_sbh);
 	sb->s_dirt = 1;
 
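
The new hunk rolls the added group's free counts up into its flex group, which is simply the block-group number shifted down by s_log_groups_per_flex. A self-contained sketch of that bookkeeping, with invented counts and a plain array standing in for s_flex_groups:

#include <stdio.h>

struct flex_counts { unsigned long free_blocks, free_inodes; };

int main(void)
{
	unsigned log_groups_per_flex = 4;	/* 16 groups per flex group */
	struct flex_counts flex[8] = { {0, 0} };
	unsigned new_group = 37;
	unsigned long free_blocks_count = 32254, inodes_per_group = 8192;

	/* ext4_flex_group() analogue: group number >> log_groups_per_flex */
	unsigned flex_group = new_group >> log_groups_per_flex;

	flex[flex_group].free_blocks += free_blocks_count;
	flex[flex_group].free_inodes += inodes_per_group;

	printf("group %u -> flex group %u: %lu blocks, %lu inodes free\n",
	       new_group, flex_group, flex[flex_group].free_blocks,
	       flex[flex_group].free_inodes);
	return 0;
}
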
@@ -963,7 +972,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 	ext4_group_t o_groups_count;
 	ext4_grpblk_t last;
 	ext4_grpblk_t add;
-	struct buffer_head * bh;
+	struct buffer_head *bh;
 	handle_t *handle;
 	int err;
 	unsigned long freed_blocks;
@@ -1076,8 +1085,15 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 	/*
 	 * Mark mballoc pages as not up to date so that they will be updated
 	 * next time they are loaded by ext4_mb_load_buddy.
+	 *
+	 * XXX Bad, Bad, BAD!!!  We should not be overloading the
+	 * Uptodate flag, particularly on the bitmap bh, as a way of
+	 * hinting to ext4_mb_load_buddy() that it needs to be
+	 * reloaded.  A user could take a LVM snapshot, then do an
+	 * on-line fsck, and clear the uptodate flag, and this would
+	 * not be a bug in userspace, but a bug in the kernel.  FIXME!!!
 	 */
-	if (test_opt(sb, MBALLOC)) {
+	{
 		struct ext4_sb_info *sbi = EXT4_SB(sb);
 		struct inode *inode = sbi->s_buddy_cache;
 		int blocks_per_page;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d5d77958b861..dea8f13c2fd9 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -34,6 +34,8 @@
 #include <linux/namei.h>
 #include <linux/quotaops.h>
 #include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/marker.h>
 #include <linux/log2.h>
 #include <linux/crc16.h>
 #include <asm/uaccess.h>
@@ -45,6 +47,8 @@
 #include "namei.h"
 #include "group.h"
 
+struct proc_dir_entry *ext4_proc_root;
+
 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
 			     unsigned long journal_devnum);
 static int ext4_create_journal(struct super_block *, struct ext4_super_block *,
@@ -503,15 +507,18 @@ static void ext4_put_super(struct super_block *sb)
 	ext4_mb_release(sb);
 	ext4_ext_release(sb);
 	ext4_xattr_put_super(sb);
-	jbd2_journal_destroy(sbi->s_journal);
+	if (jbd2_journal_destroy(sbi->s_journal) < 0)
+		ext4_abort(sb, __func__, "Couldn't clean up the journal");
 	sbi->s_journal = NULL;
 	if (!(sb->s_flags & MS_RDONLY)) {
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
-		BUFFER_TRACE(sbi->s_sbh, "marking dirty");
-		mark_buffer_dirty(sbi->s_sbh);
 		ext4_commit_super(sb, es, 1);
 	}
+	if (sbi->s_proc) {
+		remove_proc_entry("inode_readahead_blks", sbi->s_proc);
+		remove_proc_entry(sb->s_id, ext4_proc_root);
+	}
 
 	for (i = 0; i < sbi->s_gdb_count; i++)
 		brelse(sbi->s_group_desc[i]);
@@ -520,6 +527,7 @@ static void ext4_put_super(struct super_block *sb)
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
+	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 	brelse(sbi->s_sbh);
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < MAXQUOTAS; i++)
@@ -562,12 +570,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	ei->i_acl = EXT4_ACL_NOT_CACHED;
 	ei->i_default_acl = EXT4_ACL_NOT_CACHED;
 #endif
-	ei->i_block_alloc_info = NULL;
 	ei->vfs_inode.i_version = 1;
+	ei->vfs_inode.i_data.writeback_index = 0;
 	memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
 	INIT_LIST_HEAD(&ei->i_prealloc_list);
 	spin_lock_init(&ei->i_prealloc_lock);
@@ -598,7 +606,7 @@ static void init_once(void *foo)
 	struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
 
 	INIT_LIST_HEAD(&ei->i_orphan);
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	init_rwsem(&ei->xattr_sem);
 #endif
 	init_rwsem(&ei->i_data_sem);
@@ -624,8 +632,7 @@ static void destroy_inodecache(void)
 
 static void ext4_clear_inode(struct inode *inode)
 {
-	struct ext4_block_alloc_info *rsv = EXT4_I(inode)->i_block_alloc_info;
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	if (EXT4_I(inode)->i_acl &&
 			EXT4_I(inode)->i_acl != EXT4_ACL_NOT_CACHED) {
 		posix_acl_release(EXT4_I(inode)->i_acl);
@@ -637,10 +644,7 @@ static void ext4_clear_inode(struct inode *inode)
 		EXT4_I(inode)->i_default_acl = EXT4_ACL_NOT_CACHED;
 	}
 #endif
-	ext4_discard_reservation(inode);
-	EXT4_I(inode)->i_block_alloc_info = NULL;
-	if (unlikely(rsv))
-		kfree(rsv);
+	ext4_discard_preallocations(inode);
 	jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
 				       &EXT4_I(inode)->jinode);
 }
@@ -653,7 +657,7 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
 
 	if (sbi->s_jquota_fmt)
 		seq_printf(seq, ",jqfmt=%s",
-			(sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0");
+			(sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold" : "vfsv0");
 
 	if (sbi->s_qf_names[USRQUOTA])
 		seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
@@ -717,7 +721,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",debug");
 	if (test_opt(sb, OLDALLOC))
 		seq_puts(seq, ",oldalloc");
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	if (test_opt(sb, XATTR_USER) &&
 		!(def_mount_opts & EXT4_DEFM_XATTR_USER))
 		seq_puts(seq, ",user_xattr");
@@ -726,7 +730,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",nouser_xattr");
 	}
 #endif
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
 		seq_puts(seq, ",acl");
 	if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
@@ -751,8 +755,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",nobh");
 	if (!test_opt(sb, EXTENTS))
 		seq_puts(seq, ",noextents");
-	if (!test_opt(sb, MBALLOC))
-		seq_puts(seq, ",nomballoc");
 	if (test_opt(sb, I_VERSION))
 		seq_puts(seq, ",i_version");
 	if (!test_opt(sb, DELALLOC))
@@ -772,6 +774,13 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
 		seq_puts(seq, ",data=writeback");
 
+	if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
+		seq_printf(seq, ",inode_readahead_blks=%u",
+			   sbi->s_inode_readahead_blks);
+
+	if (test_opt(sb, DATA_ERR_ABORT))
+		seq_puts(seq, ",data_err=abort");
+
 	ext4_show_quota_options(seq, sb);
 	return 0;
 }
@@ -821,7 +830,7 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
 }
 
 #ifdef CONFIG_QUOTA
-#define QTYPE2NAME(t) ((t) == USRQUOTA?"user":"group")
+#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
 #define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
 
 static int ext4_dquot_initialize(struct inode *inode, int type);
@@ -901,14 +910,16 @@ enum {
 	Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
 	Opt_journal_checksum, Opt_journal_async_commit,
 	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+	Opt_data_err_abort, Opt_data_err_ignore,
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
 	Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
 	Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
+	Opt_inode_readahead_blks
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_bsd_df, "bsddf"},
 	{Opt_minix_df, "minixdf"},
 	{Opt_grpid, "grpid"},
@@ -946,6 +957,8 @@ static match_table_t tokens = {
 	{Opt_data_journal, "data=journal"},
 	{Opt_data_ordered, "data=ordered"},
 	{Opt_data_writeback, "data=writeback"},
+	{Opt_data_err_abort, "data_err=abort"},
+	{Opt_data_err_ignore, "data_err=ignore"},
 	{Opt_offusrjquota, "usrjquota="},
 	{Opt_usrjquota, "usrjquota=%s"},
 	{Opt_offgrpjquota, "grpjquota="},
@@ -966,6 +979,7 @@ static match_table_t tokens = {
 	{Opt_resize, "resize"},
 	{Opt_delalloc, "delalloc"},
 	{Opt_nodelalloc, "nodelalloc"},
+	{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
 	{Opt_err, NULL},
 };
 
@@ -980,7 +994,7 @@ static ext4_fsblk_t get_sb_block(void **data)
 	/*todo: use simple_strtoll with >32bit ext4 */
 	sb_block = simple_strtoul(options, &options, 0);
 	if (*options && *options != ',') {
-		printk("EXT4-fs: Invalid sb specification: %s\n",
+		printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
 		       (char *) *data);
 		return 1;
 	}
@@ -1071,7 +1085,7 @@ static int parse_options(char *options, struct super_block *sb,
 		case Opt_orlov:
 			clear_opt(sbi->s_mount_opt, OLDALLOC);
 			break;
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 		case Opt_user_xattr:
 			set_opt(sbi->s_mount_opt, XATTR_USER);
 			break;
@@ -1081,10 +1095,11 @@ static int parse_options(char *options, struct super_block *sb,
 #else
 		case Opt_user_xattr:
 		case Opt_nouser_xattr:
-			printk("EXT4 (no)user_xattr options not supported\n");
+			printk(KERN_ERR "EXT4 (no)user_xattr options "
+			       "not supported\n");
 			break;
 #endif
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 		case Opt_acl:
 			set_opt(sbi->s_mount_opt, POSIX_ACL);
 			break;
@@ -1094,7 +1109,8 @@ static int parse_options(char *options, struct super_block *sb,
 #else
 		case Opt_acl:
 		case Opt_noacl:
-			printk("EXT4 (no)acl options not supported\n");
+			printk(KERN_ERR "EXT4 (no)acl options "
+			       "not supported\n");
 			break;
 #endif
 		case Opt_reservation:
@@ -1177,6 +1193,12 @@ static int parse_options(char *options, struct super_block *sb,
 				sbi->s_mount_opt |= data_opt;
 			}
 			break;
+		case Opt_data_err_abort:
+			set_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+			break;
+		case Opt_data_err_ignore:
+			clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+			break;
 #ifdef CONFIG_QUOTA
 		case Opt_usrjquota:
 			qtype = USRQUOTA;
@@ -1188,8 +1210,8 @@ set_qf_name:
 				sb_any_quota_suspended(sb)) &&
 				!sbi->s_qf_names[qtype]) {
 				printk(KERN_ERR
 					"EXT4-fs: Cannot change journaled "
 					"quota options when quota turned on.\n");
 				return 0;
 			}
 			qname = match_strdup(&args[0]);
@@ -1356,12 +1378,6 @@ set_qf_format:
 		case Opt_nodelalloc:
 			clear_opt(sbi->s_mount_opt, DELALLOC);
 			break;
-		case Opt_mballoc:
-			set_opt(sbi->s_mount_opt, MBALLOC);
-			break;
-		case Opt_nomballoc:
-			clear_opt(sbi->s_mount_opt, MBALLOC);
-			break;
 		case Opt_stripe:
 			if (match_int(&args[0], &option))
 				return 0;
@@ -1372,6 +1388,13 @@ set_qf_format:
 		case Opt_delalloc:
 			set_opt(sbi->s_mount_opt, DELALLOC);
 			break;
+		case Opt_inode_readahead_blks:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option < 0 || option > (1 << 30))
+				return 0;
+			sbi->s_inode_readahead_blks = option;
+			break;
 		default:
 			printk(KERN_ERR
 				"EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1472,15 +1495,9 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 		EXT4_INODES_PER_GROUP(sb),
 		sbi->s_mount_opt);
 
-	printk(KERN_INFO "EXT4 FS on %s, ", sb->s_id);
-	if (EXT4_SB(sb)->s_journal->j_inode == NULL) {
-		char b[BDEVNAME_SIZE];
-
-		printk("external journal on %s\n",
-		       bdevname(EXT4_SB(sb)->s_journal->j_dev, b));
-	} else {
-		printk("internal journal\n");
-	}
+	printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
+	       sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
+	       "external", EXT4_SB(sb)->s_journal->j_devname);
 	return res;
 }
 
@@ -1503,8 +1520,11 @@ static int ext4_fill_flex_info(struct super_block *sb)
 	sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
 	groups_per_flex = 1 << sbi->s_log_groups_per_flex;
 
-	flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) /
-		groups_per_flex;
+	/* We allocate both existing and potentially added groups */
+	flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
+			((sbi->s_es->s_reserved_gdt_blocks + 1) <<
+			  EXT4_DESC_PER_BLOCK_BITS(sb))) /
+			   groups_per_flex;
 	sbi->s_flex_groups = kzalloc(flex_group_count *
 				     sizeof(struct flex_groups), GFP_KERNEL);
 	if (sbi->s_flex_groups == NULL) {
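
The reworked computation sizes s_flex_groups not just for the current groups but for every group that the reserved GDT blocks could ever add through online resize. The arithmetic, reproduced standalone with made-up filesystem parameters:

#include <stdio.h>

int main(void)
{
	unsigned long groups_count = 100;	/* current block groups */
	unsigned reserved_gdt_blocks = 256;	/* from the superblock */
	unsigned desc_per_block_bits = 7;	/* 4k blocks, 32-byte descriptors */
	unsigned log_groups_per_flex = 4;
	unsigned long groups_per_flex = 1UL << log_groups_per_flex;

	/* existing groups, rounded up, plus resize headroom */
	unsigned long flex_group_count =
		((groups_count + groups_per_flex - 1) +
		 ((reserved_gdt_blocks + 1UL) << desc_per_block_bits)) /
		groups_per_flex;

	printf("flex_group_count = %lu\n", flex_group_count);
	return 0;
}
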
@@ -1583,7 +1603,7 @@ static int ext4_check_descriptors(struct super_block *sb)
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
 		flexbg_flag = 1;
 
-	ext4_debug ("Checking group descriptors");
+	ext4_debug("Checking group descriptors");
 
 	for (i = 0; i < sbi->s_groups_count; i++) {
 		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
@@ -1622,8 +1642,10 @@ static int ext4_check_descriptors(struct super_block *sb)
 			       "Checksum for group %lu failed (%u!=%u)\n",
 			       i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
 			       gdp)), le16_to_cpu(gdp->bg_checksum));
-			if (!(sb->s_flags & MS_RDONLY))
+			if (!(sb->s_flags & MS_RDONLY)) {
+				spin_unlock(sb_bgl_lock(sbi, i));
 				return 0;
+			}
 		}
 		spin_unlock(sb_bgl_lock(sbi, i));
 		if (!flexbg_flag)
@@ -1713,9 +1735,9 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 			DQUOT_INIT(inode);
 			if (inode->i_nlink) {
 				printk(KERN_DEBUG
-					"%s: truncating inode %lu to %Ld bytes\n",
+					"%s: truncating inode %lu to %lld bytes\n",
 					__func__, inode->i_ino, inode->i_size);
-				jbd_debug(2, "truncating inode %lu to %Ld bytes\n",
+				jbd_debug(2, "truncating inode %lu to %lld bytes\n",
 					  inode->i_ino, inode->i_size);
 				ext4_truncate(inode);
 				nr_truncates++;
@@ -1913,6 +1935,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	unsigned long journal_devnum = 0;
 	unsigned long def_mount_opts;
 	struct inode *root;
+	char *cp;
 	int ret = -EINVAL;
 	int blocksize;
 	int db_count;
@@ -1929,10 +1952,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_mount_opt = 0;
 	sbi->s_resuid = EXT4_DEF_RESUID;
 	sbi->s_resgid = EXT4_DEF_RESGID;
+	sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
 	sbi->s_sb_block = sb_block;
 
 	unlock_kernel();
 
+	/* Cleanup superblock name */
+	for (cp = sb->s_id; (cp = strchr(cp, '/'));)
+		*cp = '!';
+
 	blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
 	if (!blocksize) {
 		printk(KERN_ERR "EXT4-fs: unable to set blocksize\n");
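
The new loop rewrites the device name held in sb->s_id because names such as cciss/c0d0p1 contain '/', which cannot appear in the procfs directory entry created later in this patch. The loop runs unchanged in userspace:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char s_id[32] = "cciss/c0d0p1";	/* example device name */
	char *cp;

	for (cp = s_id; (cp = strchr(cp, '/'));)
		*cp = '!';

	printf("%s\n", s_id);	/* prints: cciss!c0d0p1 */
	return 0;
}
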
@@ -1972,11 +2000,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		set_opt(sbi->s_mount_opt, GRPID);
 	if (def_mount_opts & EXT4_DEFM_UID16)
 		set_opt(sbi->s_mount_opt, NO_UID32);
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	if (def_mount_opts & EXT4_DEFM_XATTR_USER)
 		set_opt(sbi->s_mount_opt, XATTR_USER);
 #endif
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	if (def_mount_opts & EXT4_DEFM_ACL)
 		set_opt(sbi->s_mount_opt, POSIX_ACL);
 #endif
@@ -2011,11 +2039,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		ext4_warning(sb, __func__,
 			"extents feature not enabled on this filesystem, "
 			"use tune2fs.\n");
-	/*
-	 * turn on mballoc code by default in ext4 filesystem
-	 * Use -o nomballoc to turn it off
-	 */
-	set_opt(sbi->s_mount_opt, MBALLOC);
 
 	/*
 	 * enable delayed allocation by default
@@ -2040,16 +2063,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		       "running e2fsck is recommended\n");
 
 	/*
-	 * Since ext4 is still considered development code, we require
-	 * that the TEST_FILESYS flag in s->flags be set.
-	 */
-	if (!(le32_to_cpu(es->s_flags) & EXT2_FLAGS_TEST_FILESYS)) {
-		printk(KERN_WARNING "EXT4-fs: %s: not marked "
-		       "OK to use with test code.\n", sb->s_id);
-		goto failed_mount;
-	}
-
-	/*
 	 * Check feature flags regardless of the revision level, since we
 	 * previously didn't change the revision level when setting the flags,
 	 * so there is a chance incompat flags are set on a rev 0 filesystem.
@@ -2218,6 +2231,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
+#ifdef CONFIG_PROC_FS
+	if (ext4_proc_root)
+		sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
+
+	if (sbi->s_proc)
+		proc_create_data("inode_readahead_blks", 0644, sbi->s_proc,
+				 &ext4_ui_proc_fops,
+				 &sbi->s_inode_readahead_blks);
+#endif
+
 	bgl_lock_init(&sbi->s_blockgroup_lock);
 
 	for (i = 0; i < db_count; i++) {
@@ -2256,24 +2279,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		err = percpu_counter_init(&sbi->s_dirs_counter,
 				ext4_count_dirs(sb));
 	}
+	if (!err) {
+		err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+	}
 	if (err) {
 		printk(KERN_ERR "EXT4-fs: insufficient memory\n");
 		goto failed_mount3;
 	}
 
-	/* per fileystem reservation list head & lock */
-	spin_lock_init(&sbi->s_rsv_window_lock);
-	sbi->s_rsv_window_root = RB_ROOT;
-	/* Add a single, static dummy reservation to the start of the
-	 * reservation window list --- it gives us a placeholder for
-	 * append-at-start-of-list which makes the allocation logic
-	 * _much_ simpler. */
-	sbi->s_rsv_window_head.rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-	sbi->s_rsv_window_head.rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-	sbi->s_rsv_window_head.rsv_alloc_hit = 0;
-	sbi->s_rsv_window_head.rsv_goal_size = 0;
-	ext4_rsv_window_add(sb, &sbi->s_rsv_window_head);
-
 	sbi->s_stripe = ext4_get_stripe_size(sbi);
 
 	/*
@@ -2470,7 +2483,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
 
 	ext4_ext_init(sb);
-	ext4_mb_init(sb, needs_recovery);
+	err = ext4_mb_init(sb, needs_recovery);
+	if (err) {
+		printk(KERN_ERR "EXT4-fs: failed to initialize mballoc (%d)\n",
+		       err);
+		goto failed_mount4;
+	}
 
 	lock_kernel();
 	return 0;
@@ -2488,11 +2506,16 @@ failed_mount3:
2488 percpu_counter_destroy(&sbi->s_freeblocks_counter); 2506 percpu_counter_destroy(&sbi->s_freeblocks_counter);
2489 percpu_counter_destroy(&sbi->s_freeinodes_counter); 2507 percpu_counter_destroy(&sbi->s_freeinodes_counter);
2490 percpu_counter_destroy(&sbi->s_dirs_counter); 2508 percpu_counter_destroy(&sbi->s_dirs_counter);
2509 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
2491failed_mount2: 2510failed_mount2:
2492 for (i = 0; i < db_count; i++) 2511 for (i = 0; i < db_count; i++)
2493 brelse(sbi->s_group_desc[i]); 2512 brelse(sbi->s_group_desc[i]);
2494 kfree(sbi->s_group_desc); 2513 kfree(sbi->s_group_desc);
2495failed_mount: 2514failed_mount:
2515 if (sbi->s_proc) {
2516 remove_proc_entry("inode_readahead_blks", sbi->s_proc);
2517 remove_proc_entry(sb->s_id, ext4_proc_root);
2518 }
2496#ifdef CONFIG_QUOTA 2519#ifdef CONFIG_QUOTA
2497 for (i = 0; i < MAXQUOTAS; i++) 2520 for (i = 0; i < MAXQUOTAS; i++)
2498 kfree(sbi->s_qf_names[i]); 2521 kfree(sbi->s_qf_names[i]);
@@ -2526,6 +2549,10 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
2526 journal->j_flags |= JBD2_BARRIER; 2549 journal->j_flags |= JBD2_BARRIER;
2527 else 2550 else
2528 journal->j_flags &= ~JBD2_BARRIER; 2551 journal->j_flags &= ~JBD2_BARRIER;
2552 if (test_opt(sb, DATA_ERR_ABORT))
2553 journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
2554 else
2555 journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
2529 spin_unlock(&journal->j_state_lock); 2556 spin_unlock(&journal->j_state_lock);
2530} 2557}
2531 2558
@@ -2551,7 +2578,7 @@ static journal_t *ext4_get_journal(struct super_block *sb,
2551 return NULL; 2578 return NULL;
2552 } 2579 }
2553 2580
2554 jbd_debug(2, "Journal inode found at %p: %Ld bytes\n", 2581 jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
2555 journal_inode, journal_inode->i_size); 2582 journal_inode, journal_inode->i_size);
2556 if (!S_ISREG(journal_inode->i_mode)) { 2583 if (!S_ISREG(journal_inode->i_mode)) {
2557 printk(KERN_ERR "EXT4-fs: invalid journal inode.\n"); 2584 printk(KERN_ERR "EXT4-fs: invalid journal inode.\n");
@@ -2714,6 +2741,11 @@ static int ext4_load_journal(struct super_block *sb,
2714 return -EINVAL; 2741 return -EINVAL;
2715 } 2742 }
2716 2743
2744 if (journal->j_flags & JBD2_BARRIER)
2745 printk(KERN_INFO "EXT4-fs: barriers enabled\n");
2746 else
2747 printk(KERN_INFO "EXT4-fs: barriers disabled\n");
2748
2717 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { 2749 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
2718 err = jbd2_journal_update_format(journal); 2750 err = jbd2_journal_update_format(journal);
2719 if (err) { 2751 if (err) {
@@ -2798,13 +2830,34 @@ static void ext4_commit_super(struct super_block *sb,
2798 2830
2799 if (!sbh) 2831 if (!sbh)
2800 return; 2832 return;
2833 if (buffer_write_io_error(sbh)) {
2834 /*
2835 * Oh, dear. A previous attempt to write the
2836 * superblock failed. This could happen because the
2837 * USB device was yanked out. Or it could happen to
2838 * be a transient write error and maybe the block will
2839 * be remapped. Nothing we can do but to retry the
2840 * write and hope for the best.
2841 */
2842 printk(KERN_ERR "ext4: previous I/O error to "
2843 "superblock detected for %s.\n", sb->s_id);
2844 clear_buffer_write_io_error(sbh);
2845 set_buffer_uptodate(sbh);
2846 }
2801 es->s_wtime = cpu_to_le32(get_seconds()); 2847 es->s_wtime = cpu_to_le32(get_seconds());
2802 ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb)); 2848 ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb));
2803 es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb)); 2849 es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
2804 BUFFER_TRACE(sbh, "marking dirty"); 2850 BUFFER_TRACE(sbh, "marking dirty");
2805 mark_buffer_dirty(sbh); 2851 mark_buffer_dirty(sbh);
2806 if (sync) 2852 if (sync) {
2807 sync_dirty_buffer(sbh); 2853 sync_dirty_buffer(sbh);
2854 if (buffer_write_io_error(sbh)) {
2855 printk(KERN_ERR "ext4: I/O error while writing "
2856 "superblock for %s.\n", sb->s_id);
2857 clear_buffer_write_io_error(sbh);
2858 set_buffer_uptodate(sbh);
2859 }
2860 }
2808} 2861}
2809 2862
2810 2863
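
The hunk above has ext4_commit_super() recover from an earlier failed superblock write before reusing the buffer. A minimal sketch of that pattern, assuming <linux/buffer_head.h>; retry_sb_write() is a hypothetical helper, not a kernel function. A failed write leaves the buffer with its write_io_error bit set and uptodate cleared, so both must be reset before the buffer can be redirtied and rewritten.

	#include <linux/buffer_head.h>

	/* Hypothetical helper: rewrite a buffer whose previous write failed. */
	static void retry_sb_write(struct buffer_head *sbh, const char *dev)
	{
		if (buffer_write_io_error(sbh)) {
			/* reset the error state so the retry is accepted */
			clear_buffer_write_io_error(sbh);
			set_buffer_uptodate(sbh);
		}
		mark_buffer_dirty(sbh);
		if (sync_dirty_buffer(sbh) || buffer_write_io_error(sbh))
			printk(KERN_ERR "I/O error writing superblock for %s\n",
			       dev);
	}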
@@ -2819,7 +2872,9 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
2819 journal_t *journal = EXT4_SB(sb)->s_journal; 2872 journal_t *journal = EXT4_SB(sb)->s_journal;
2820 2873
2821 jbd2_journal_lock_updates(journal); 2874 jbd2_journal_lock_updates(journal);
2822 jbd2_journal_flush(journal); 2875 if (jbd2_journal_flush(journal) < 0)
2876 goto out;
2877
2823 lock_super(sb); 2878 lock_super(sb);
2824 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && 2879 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
2825 sb->s_flags & MS_RDONLY) { 2880 sb->s_flags & MS_RDONLY) {
@@ -2828,6 +2883,8 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
2828 ext4_commit_super(sb, es, 1); 2883 ext4_commit_super(sb, es, 1);
2829 } 2884 }
2830 unlock_super(sb); 2885 unlock_super(sb);
2886
2887out:
2831 jbd2_journal_unlock_updates(journal); 2888 jbd2_journal_unlock_updates(journal);
2832} 2889}
2833 2890
@@ -2906,6 +2963,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
2906{ 2963{
2907 tid_t target; 2964 tid_t target;
2908 2965
2966 trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
2909 sb->s_dirt = 0; 2967 sb->s_dirt = 0;
2910 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { 2968 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
2911 if (wait) 2969 if (wait)
@@ -2927,7 +2985,13 @@ static void ext4_write_super_lockfs(struct super_block *sb)
2927 2985
2928 /* Now we set up the journal barrier. */ 2986 /* Now we set up the journal barrier. */
2929 jbd2_journal_lock_updates(journal); 2987 jbd2_journal_lock_updates(journal);
2930 jbd2_journal_flush(journal); 2988
2989 /*
2990 * We don't want to clear needs_recovery flag when we failed
2991 * to flush the journal.
2992 */
2993 if (jbd2_journal_flush(journal) < 0)
2994 return;
2931 2995
2932 /* Journal blocked and flushed, clear needs_recovery flag. */ 2996 /* Journal blocked and flushed, clear needs_recovery flag. */
2933 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 2997 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
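
A sketch of the flush-before-clear ordering introduced above: the recovery flag is cleared only after jbd2_journal_flush() reports success, so a failed flush leaves the journal marked as still needing replay. ext4_freeze_sketch() is a hypothetical condensation of the lockfs path; on success, updates stay blocked until the matching unlockfs, as in the real code.

	/* Hypothetical sketch of the freeze path's flush handling. */
	static void ext4_freeze_sketch(struct super_block *sb, journal_t *journal)
	{
		jbd2_journal_lock_updates(journal);	/* block new transactions */
		if (jbd2_journal_flush(journal) < 0)
			return;	/* flush failed: keep needs_recovery set */
		/* journal is clean on disk; safe to claim no recovery needed */
		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
	}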
@@ -3161,7 +3225,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
3161 buf->f_type = EXT4_SUPER_MAGIC; 3225 buf->f_type = EXT4_SUPER_MAGIC;
3162 buf->f_bsize = sb->s_blocksize; 3226 buf->f_bsize = sb->s_blocksize;
3163 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; 3227 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
3164 buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter); 3228 buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
3229 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
3165 ext4_free_blocks_count_set(es, buf->f_bfree); 3230 ext4_free_blocks_count_set(es, buf->f_bfree);
3166 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); 3231 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
3167 if (buf->f_bfree < ext4_r_blocks_count(es)) 3232 if (buf->f_bfree < ext4_r_blocks_count(es))
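
The subtraction above keeps delayed-allocation reservations ("dirty" blocks that are promised but not yet allocated on disk) from being reported as free space. A sketch of the resulting accounting, with ext4_statfs_free() as a hypothetical helper:

	/* Hypothetical helper: blocks statfs may report as free/available. */
	static void ext4_statfs_free(struct ext4_sb_info *sbi, u64 reserved,
				     u64 *bfree, u64 *bavail)
	{
		*bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
			 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
		/* root-reserved blocks are free but not available */
		*bavail = (*bfree > reserved) ? *bfree - reserved : 0;
	}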
@@ -3366,8 +3431,12 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
3366 * otherwise be livelocked... 3431 * otherwise be livelocked...
3367 */ 3432 */
3368 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 3433 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
3369 jbd2_journal_flush(EXT4_SB(sb)->s_journal); 3434 err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
3370 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 3435 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
3436 if (err) {
3437 path_put(&nd.path);
3438 return err;
3439 }
3371 } 3440 }
3372 3441
3373 err = vfs_quota_on_path(sb, type, format_id, &nd.path); 3442 err = vfs_quota_on_path(sb, type, format_id, &nd.path);
@@ -3431,7 +3500,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3431 handle_t *handle = journal_current_handle(); 3500 handle_t *handle = journal_current_handle();
3432 3501
3433 if (!handle) { 3502 if (!handle) {
3434 printk(KERN_WARNING "EXT4-fs: Quota write (off=%Lu, len=%Lu)" 3503 printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)"
3435 " cancelled because transaction is not started.\n", 3504 " cancelled because transaction is not started.\n",
3436 (unsigned long long)off, (unsigned long long)len); 3505 (unsigned long long)off, (unsigned long long)len);
3437 return -EIO; 3506 return -EIO;
@@ -3492,18 +3561,82 @@ static int ext4_get_sb(struct file_system_type *fs_type,
3492 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); 3561 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
3493} 3562}
3494 3563
3564#ifdef CONFIG_PROC_FS
3565static int ext4_ui_proc_show(struct seq_file *m, void *v)
3566{
3567 unsigned int *p = m->private;
3568
3569 seq_printf(m, "%u\n", *p);
3570 return 0;
3571}
3572
3573static int ext4_ui_proc_open(struct inode *inode, struct file *file)
3574{
3575 return single_open(file, ext4_ui_proc_show, PDE(inode)->data);
3576}
3577
3578static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
3579 size_t cnt, loff_t *ppos)
3580{
3581 unsigned int *p = PDE(file->f_path.dentry->d_inode)->data;
3582	char str[32] = "";	/* zero-fill so the copied text is NUL-terminated */
3583	long value;		/* signed, so the range check below can fire */
3584
3585 if (cnt >= sizeof(str))
3586 return -EINVAL;
3587 if (copy_from_user(str, buf, cnt))
3588 return -EFAULT;
3589 value = simple_strtol(str, NULL, 0);
3590 if (value < 0)
3591 return -ERANGE;
3592 *p = value;
3593 return cnt;
3594}
3595
3596const struct file_operations ext4_ui_proc_fops = {
3597 .owner = THIS_MODULE,
3598 .open = ext4_ui_proc_open,
3599 .read = seq_read,
3600 .llseek = seq_lseek,
3601 .release = single_release,
3602 .write = ext4_ui_proc_write,
3603};
3604#endif
3605
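
A usage sketch for the fops above, assuming the proc_create_data() API of this era: the data pointer handed to proc_create_data() is what ext4_ui_proc_show() and ext4_ui_proc_write() retrieve through PDE(inode)->data, so a single fops serves any unsigned int tunable.

	/* Assumed registration, mirroring how the per-sb entries are created. */
	sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
	if (sbi->s_proc)
		proc_create_data("inode_readahead_blks", 0644, sbi->s_proc,
				 &ext4_ui_proc_fops,
				 &sbi->s_inode_readahead_blks);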
3606static struct file_system_type ext4_fs_type = {
3607 .owner = THIS_MODULE,
3608 .name = "ext4",
3609 .get_sb = ext4_get_sb,
3610 .kill_sb = kill_block_super,
3611 .fs_flags = FS_REQUIRES_DEV,
3612};
3613
3614#ifdef CONFIG_EXT4DEV_COMPAT
3615static int ext4dev_get_sb(struct file_system_type *fs_type,
3616 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
3617{
3618 printk(KERN_WARNING "EXT4-fs: Update your userspace programs "
3619 "to mount using ext4\n");
3620 printk(KERN_WARNING "EXT4-fs: ext4dev backwards compatibility "
3621 "will go away by 2.6.31\n");
3622 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
3623}
3624
3495static struct file_system_type ext4dev_fs_type = { 3625static struct file_system_type ext4dev_fs_type = {
3496 .owner = THIS_MODULE, 3626 .owner = THIS_MODULE,
3497 .name = "ext4dev", 3627 .name = "ext4dev",
3498 .get_sb = ext4_get_sb, 3628 .get_sb = ext4dev_get_sb,
3499 .kill_sb = kill_block_super, 3629 .kill_sb = kill_block_super,
3500 .fs_flags = FS_REQUIRES_DEV, 3630 .fs_flags = FS_REQUIRES_DEV,
3501}; 3631};
3632MODULE_ALIAS("ext4dev");
3633#endif
3502 3634
3503static int __init init_ext4_fs(void) 3635static int __init init_ext4_fs(void)
3504{ 3636{
3505 int err; 3637 int err;
3506 3638
3639 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
3507 err = init_ext4_mballoc(); 3640 err = init_ext4_mballoc();
3508 if (err) 3641 if (err)
3509 return err; 3642 return err;
@@ -3514,9 +3647,16 @@ static int __init init_ext4_fs(void)
3514 err = init_inodecache(); 3647 err = init_inodecache();
3515 if (err) 3648 if (err)
3516 goto out1; 3649 goto out1;
3517 err = register_filesystem(&ext4dev_fs_type); 3650 err = register_filesystem(&ext4_fs_type);
3518 if (err) 3651 if (err)
3519 goto out; 3652 goto out;
3653#ifdef CONFIG_EXT4DEV_COMPAT
3654 err = register_filesystem(&ext4dev_fs_type);
3655 if (err) {
3656 unregister_filesystem(&ext4_fs_type);
3657 goto out;
3658 }
3659#endif
3520 return 0; 3660 return 0;
3521out: 3661out:
3522 destroy_inodecache(); 3662 destroy_inodecache();
@@ -3529,10 +3669,14 @@ out2:
3529 3669
3530static void __exit exit_ext4_fs(void) 3670static void __exit exit_ext4_fs(void)
3531{ 3671{
3672 unregister_filesystem(&ext4_fs_type);
3673#ifdef CONFIG_EXT4DEV_COMPAT
3532 unregister_filesystem(&ext4dev_fs_type); 3674 unregister_filesystem(&ext4dev_fs_type);
3675#endif
3533 destroy_inodecache(); 3676 destroy_inodecache();
3534 exit_ext4_xattr(); 3677 exit_ext4_xattr();
3535 exit_ext4_mballoc(); 3678 exit_ext4_mballoc();
3679 remove_proc_entry("fs/ext4", NULL);
3536} 3680}
3537 3681
3538MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 3682MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
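
The init/exit hunks above follow the usual unwind discipline: each failure undoes exactly what has been set up so far, in reverse order, so the module never leaves a half-registered state. A condensed sketch; init_minimal_sketch() is hypothetical.

	static int __init init_minimal_sketch(void)
	{
		int err;

		err = init_ext4_mballoc();		/* first resource */
		if (err)
			return err;
		err = register_filesystem(&ext4_fs_type);
		if (err)
			goto out_mballoc;		/* undo in reverse order */
		return 0;
	out_mballoc:
		exit_ext4_mballoc();
		return err;
	}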
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index e9178643dc01..00740cb32be3 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -23,10 +23,10 @@
23#include "ext4.h" 23#include "ext4.h"
24#include "xattr.h" 24#include "xattr.h"
25 25
26static void * ext4_follow_link(struct dentry *dentry, struct nameidata *nd) 26static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
27{ 27{
28 struct ext4_inode_info *ei = EXT4_I(dentry->d_inode); 28 struct ext4_inode_info *ei = EXT4_I(dentry->d_inode);
29 nd_set_link(nd, (char*)ei->i_data); 29 nd_set_link(nd, (char *) ei->i_data);
30 return NULL; 30 return NULL;
31} 31}
32 32
@@ -34,7 +34,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
34 .readlink = generic_readlink, 34 .readlink = generic_readlink,
35 .follow_link = page_follow_link_light, 35 .follow_link = page_follow_link_light,
36 .put_link = page_put_link, 36 .put_link = page_put_link,
37#ifdef CONFIG_EXT4DEV_FS_XATTR 37#ifdef CONFIG_EXT4_FS_XATTR
38 .setxattr = generic_setxattr, 38 .setxattr = generic_setxattr,
39 .getxattr = generic_getxattr, 39 .getxattr = generic_getxattr,
40 .listxattr = ext4_listxattr, 40 .listxattr = ext4_listxattr,
@@ -45,7 +45,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
45const struct inode_operations ext4_fast_symlink_inode_operations = { 45const struct inode_operations ext4_fast_symlink_inode_operations = {
46 .readlink = generic_readlink, 46 .readlink = generic_readlink,
47 .follow_link = ext4_follow_link, 47 .follow_link = ext4_follow_link,
48#ifdef CONFIG_EXT4DEV_FS_XATTR 48#ifdef CONFIG_EXT4_FS_XATTR
49 .setxattr = generic_setxattr, 49 .setxattr = generic_setxattr,
50 .getxattr = generic_getxattr, 50 .getxattr = generic_getxattr,
51 .listxattr = ext4_listxattr, 51 .listxattr = ext4_listxattr,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 8954208b4893..80626d516fee 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -99,12 +99,12 @@ static struct mb_cache *ext4_xattr_cache;
99 99
100static struct xattr_handler *ext4_xattr_handler_map[] = { 100static struct xattr_handler *ext4_xattr_handler_map[] = {
101 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, 101 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
102#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 102#ifdef CONFIG_EXT4_FS_POSIX_ACL
103 [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler, 103 [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler,
104 [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler, 104 [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler,
105#endif 105#endif
106 [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler, 106 [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler,
107#ifdef CONFIG_EXT4DEV_FS_SECURITY 107#ifdef CONFIG_EXT4_FS_SECURITY
108 [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler, 108 [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler,
109#endif 109#endif
110}; 110};
@@ -112,11 +112,11 @@ static struct xattr_handler *ext4_xattr_handler_map[] = {
112struct xattr_handler *ext4_xattr_handlers[] = { 112struct xattr_handler *ext4_xattr_handlers[] = {
113 &ext4_xattr_user_handler, 113 &ext4_xattr_user_handler,
114 &ext4_xattr_trusted_handler, 114 &ext4_xattr_trusted_handler,
115#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL 115#ifdef CONFIG_EXT4_FS_POSIX_ACL
116 &ext4_xattr_acl_access_handler, 116 &ext4_xattr_acl_access_handler,
117 &ext4_xattr_acl_default_handler, 117 &ext4_xattr_acl_default_handler,
118#endif 118#endif
119#ifdef CONFIG_EXT4DEV_FS_SECURITY 119#ifdef CONFIG_EXT4_FS_SECURITY
120 &ext4_xattr_security_handler, 120 &ext4_xattr_security_handler,
121#endif 121#endif
122 NULL 122 NULL
@@ -959,6 +959,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
959 struct ext4_xattr_block_find bs = { 959 struct ext4_xattr_block_find bs = {
960 .s = { .not_found = -ENODATA, }, 960 .s = { .not_found = -ENODATA, },
961 }; 961 };
962 unsigned long no_expand;
962 int error; 963 int error;
963 964
964 if (!name) 965 if (!name)
@@ -966,6 +967,9 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
966 if (strlen(name) > 255) 967 if (strlen(name) > 255)
967 return -ERANGE; 968 return -ERANGE;
968 down_write(&EXT4_I(inode)->xattr_sem); 969 down_write(&EXT4_I(inode)->xattr_sem);
970 no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND;
971 EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
972
969 error = ext4_get_inode_loc(inode, &is.iloc); 973 error = ext4_get_inode_loc(inode, &is.iloc);
970 if (error) 974 if (error)
971 goto cleanup; 975 goto cleanup;
@@ -1042,6 +1046,8 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
1042cleanup: 1046cleanup:
1043 brelse(is.iloc.bh); 1047 brelse(is.iloc.bh);
1044 brelse(bs.bh); 1048 brelse(bs.bh);
1049 if (no_expand == 0)
1050 EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
1045 up_write(&EXT4_I(inode)->xattr_sem); 1051 up_write(&EXT4_I(inode)->xattr_sem);
1046 return error; 1052 return error;
1047} 1053}
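
The EXT4_STATE_NO_EXPAND dance added above, in isolation: the bit is set for the duration of the xattr update so that marking the inode dirty cannot recurse into in-inode extra-isize expansion while xattr_sem is held, and it is cleared on the way out only if this caller was the one to set it.

	down_write(&EXT4_I(inode)->xattr_sem);
	no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND;
	EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;	/* block expansion */

	/* ... modify the xattr under xattr_sem ... */

	if (no_expand == 0)				/* we set it, we clear it */
		EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
	up_write(&EXT4_I(inode)->xattr_sem);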
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 5992fe979bb9..8ede88b18c29 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -51,8 +51,8 @@ struct ext4_xattr_entry {
51 (((name_len) + EXT4_XATTR_ROUND + \ 51 (((name_len) + EXT4_XATTR_ROUND + \
52 sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND) 52 sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND)
53#define EXT4_XATTR_NEXT(entry) \ 53#define EXT4_XATTR_NEXT(entry) \
54 ( (struct ext4_xattr_entry *)( \ 54 ((struct ext4_xattr_entry *)( \
55 (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)) ) 55 (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)))
56#define EXT4_XATTR_SIZE(size) \ 56#define EXT4_XATTR_SIZE(size) \
57 (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND) 57 (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND)
58 58
@@ -63,7 +63,7 @@ struct ext4_xattr_entry {
63 EXT4_I(inode)->i_extra_isize)) 63 EXT4_I(inode)->i_extra_isize))
64#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) 64#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
65 65
66# ifdef CONFIG_EXT4DEV_FS_XATTR 66# ifdef CONFIG_EXT4_FS_XATTR
67 67
68extern struct xattr_handler ext4_xattr_user_handler; 68extern struct xattr_handler ext4_xattr_user_handler;
69extern struct xattr_handler ext4_xattr_trusted_handler; 69extern struct xattr_handler ext4_xattr_trusted_handler;
@@ -88,7 +88,7 @@ extern void exit_ext4_xattr(void);
88 88
89extern struct xattr_handler *ext4_xattr_handlers[]; 89extern struct xattr_handler *ext4_xattr_handlers[];
90 90
91# else /* CONFIG_EXT4DEV_FS_XATTR */ 91# else /* CONFIG_EXT4_FS_XATTR */
92 92
93static inline int 93static inline int
94ext4_xattr_get(struct inode *inode, int name_index, const char *name, 94ext4_xattr_get(struct inode *inode, int name_index, const char *name,
@@ -141,9 +141,9 @@ ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
141 141
142#define ext4_xattr_handlers NULL 142#define ext4_xattr_handlers NULL
143 143
144# endif /* CONFIG_EXT4DEV_FS_XATTR */ 144# endif /* CONFIG_EXT4_FS_XATTR */
145 145
146#ifdef CONFIG_EXT4DEV_FS_SECURITY 146#ifdef CONFIG_EXT4_FS_SECURITY
147extern int ext4_init_security(handle_t *handle, struct inode *inode, 147extern int ext4_init_security(handle_t *handle, struct inode *inode,
148 struct inode *dir); 148 struct inode *dir);
149#else 149#else
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 302e95c4af7e..fb98b3d847ed 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -6,6 +6,7 @@
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/fs.h> 7#include <linux/fs.h>
8#include <linux/msdos_fs.h> 8#include <linux/msdos_fs.h>
9#include <linux/blkdev.h>
9 10
10struct fatent_operations { 11struct fatent_operations {
11 void (*ent_blocknr)(struct super_block *, int, int *, sector_t *); 12 void (*ent_blocknr)(struct super_block *, int, int *, sector_t *);
@@ -535,6 +536,7 @@ int fat_free_clusters(struct inode *inode, int cluster)
535 struct fat_entry fatent; 536 struct fat_entry fatent;
536 struct buffer_head *bhs[MAX_BUF_PER_PAGE]; 537 struct buffer_head *bhs[MAX_BUF_PER_PAGE];
537 int i, err, nr_bhs; 538 int i, err, nr_bhs;
539 int first_cl = cluster;
538 540
539 nr_bhs = 0; 541 nr_bhs = 0;
540 fatent_init(&fatent); 542 fatent_init(&fatent);
@@ -551,6 +553,18 @@ int fat_free_clusters(struct inode *inode, int cluster)
551 goto error; 553 goto error;
552 } 554 }
553 555
556 /*
557 * Issue discard for the sectors we no longer care about,
558 * batching contiguous clusters into one request
559 */
560 if (cluster != fatent.entry + 1) {
561 int nr_clus = fatent.entry - first_cl + 1;
562
563 sb_issue_discard(sb, fat_clus_to_blknr(sbi, first_cl),
564 nr_clus * sbi->sec_per_clus);
565 first_cl = cluster;
566 }
567
554 ops->ent_put(&fatent, FAT_ENT_FREE); 568 ops->ent_put(&fatent, FAT_ENT_FREE);
555 if (sbi->free_clusters != -1) { 569 if (sbi->free_clusters != -1) {
556 sbi->free_clusters++; 570 sbi->free_clusters++;
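
A sketch of the run batching described in the comment above, with next_free_cluster() as a hypothetical stand-in for the fat_ent_read()/fat_ent_next() walk: contiguous freed clusters accumulate into one discard request, and a break in contiguity flushes the run.

	int first = cluster, prev = cluster, cur;

	while ((cur = next_free_cluster(inode, prev)) != FAT_ENT_EOF) {
		if (cur != prev + 1) {
			/* run [first, prev] ended: one discard for the lot */
			sb_issue_discard(sb, fat_clus_to_blknr(sbi, first),
					 (prev - first + 1) * sbi->sec_per_clus);
			first = cur;
		}
		prev = cur;
	}
	/* flush the final run */
	sb_issue_discard(sb, fat_clus_to_blknr(sbi, first),
			 (prev - first + 1) * sbi->sec_per_clus);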
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 6d266d793e2c..d12cdf2a0406 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -562,26 +562,23 @@ static int fat_write_inode(struct inode *inode, int wait)
562 struct buffer_head *bh; 562 struct buffer_head *bh;
563 struct msdos_dir_entry *raw_entry; 563 struct msdos_dir_entry *raw_entry;
564 loff_t i_pos; 564 loff_t i_pos;
565 int err = 0; 565 int err;
566 566
567retry: 567retry:
568 i_pos = MSDOS_I(inode)->i_pos; 568 i_pos = MSDOS_I(inode)->i_pos;
569 if (inode->i_ino == MSDOS_ROOT_INO || !i_pos) 569 if (inode->i_ino == MSDOS_ROOT_INO || !i_pos)
570 return 0; 570 return 0;
571 571
572 lock_super(sb);
573 bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits); 572 bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits);
574 if (!bh) { 573 if (!bh) {
575 printk(KERN_ERR "FAT: unable to read inode block " 574 printk(KERN_ERR "FAT: unable to read inode block "
576 "for updating (i_pos %lld)\n", i_pos); 575 "for updating (i_pos %lld)\n", i_pos);
577 err = -EIO; 576 return -EIO;
578 goto out;
579 } 577 }
580 spin_lock(&sbi->inode_hash_lock); 578 spin_lock(&sbi->inode_hash_lock);
581 if (i_pos != MSDOS_I(inode)->i_pos) { 579 if (i_pos != MSDOS_I(inode)->i_pos) {
582 spin_unlock(&sbi->inode_hash_lock); 580 spin_unlock(&sbi->inode_hash_lock);
583 brelse(bh); 581 brelse(bh);
584 unlock_super(sb);
585 goto retry; 582 goto retry;
586 } 583 }
587 584
@@ -607,11 +604,10 @@ retry:
607 } 604 }
608 spin_unlock(&sbi->inode_hash_lock); 605 spin_unlock(&sbi->inode_hash_lock);
609 mark_buffer_dirty(bh); 606 mark_buffer_dirty(bh);
607 err = 0;
610 if (wait) 608 if (wait)
611 err = sync_dirty_buffer(bh); 609 err = sync_dirty_buffer(bh);
612 brelse(bh); 610 brelse(bh);
613out:
614 unlock_super(sb);
615 return err; 611 return err;
616} 612}
617 613
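
The hunk above drops lock_super() from fat_write_inode(); a sketch of the revalidation it relies on instead: sample i_pos, sleep in sb_bread(), then recheck under inode_hash_lock and loop if the entry moved in the meantime.

	retry:
		i_pos = MSDOS_I(inode)->i_pos;		/* sample before sleeping */
		bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits);
		if (!bh)
			return -EIO;
		spin_lock(&sbi->inode_hash_lock);
		if (i_pos != MSDOS_I(inode)->i_pos) {	/* entry moved meanwhile */
			spin_unlock(&sbi->inode_hash_lock);
			brelse(bh);
			goto retry;
		}
		/* ... update the raw directory entry under the lock ... */
		spin_unlock(&sbi->inode_hash_lock);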
@@ -859,7 +855,7 @@ enum {
859 Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_err, 855 Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_err,
860}; 856};
861 857
862static match_table_t fat_tokens = { 858static const match_table_t fat_tokens = {
863 {Opt_check_r, "check=relaxed"}, 859 {Opt_check_r, "check=relaxed"},
864 {Opt_check_s, "check=strict"}, 860 {Opt_check_s, "check=strict"},
865 {Opt_check_n, "check=normal"}, 861 {Opt_check_n, "check=normal"},
@@ -894,14 +890,14 @@ static match_table_t fat_tokens = {
894 {Opt_tz_utc, "tz=UTC"}, 890 {Opt_tz_utc, "tz=UTC"},
895 {Opt_err, NULL}, 891 {Opt_err, NULL},
896}; 892};
897static match_table_t msdos_tokens = { 893static const match_table_t msdos_tokens = {
898 {Opt_nodots, "nodots"}, 894 {Opt_nodots, "nodots"},
899 {Opt_nodots, "dotsOK=no"}, 895 {Opt_nodots, "dotsOK=no"},
900 {Opt_dots, "dots"}, 896 {Opt_dots, "dots"},
901 {Opt_dots, "dotsOK=yes"}, 897 {Opt_dots, "dotsOK=yes"},
902 {Opt_err, NULL} 898 {Opt_err, NULL}
903}; 899};
904static match_table_t vfat_tokens = { 900static const match_table_t vfat_tokens = {
905 {Opt_charset, "iocharset=%s"}, 901 {Opt_charset, "iocharset=%s"},
906 {Opt_shortname_lower, "shortname=lower"}, 902 {Opt_shortname_lower, "shortname=lower"},
907 {Opt_shortname_win95, "shortname=win95"}, 903 {Opt_shortname_win95, "shortname=win95"},
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index d2249f174e20..6a84388cacff 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -354,7 +354,7 @@ enum {
354 OPT_ERR 354 OPT_ERR
355}; 355};
356 356
357static match_table_t tokens = { 357static const match_table_t tokens = {
358 {OPT_FD, "fd=%u"}, 358 {OPT_FD, "fd=%u"},
359 {OPT_ROOTMODE, "rootmode=%o"}, 359 {OPT_ROOTMODE, "rootmode=%o"},
360 {OPT_USER_ID, "user_id=%u"}, 360 {OPT_USER_ID, "user_id=%u"},
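
A const match_table_t works unchanged with the lib/parser.c helpers. A parsing sketch using the fuse tokens above; opt_string is a hypothetical mutable copy of the mount options.

	substring_t args[MAX_OPT_ARGS];
	char *p;
	int token, fd;

	while ((p = strsep(&opt_string, ",")) != NULL) {
		if (!*p)
			continue;
		token = match_token(p, tokens, args);	/* table may be const */
		switch (token) {
		case OPT_FD:
			if (match_int(&args[0], &fd))
				return 0;		/* malformed number */
			break;
		/* ... remaining options ... */
		}
	}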
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 13391e546616..c962283d4e7f 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1265,6 +1265,8 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
1265 holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; 1265 holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
1266 if (time_before(now, holdtime)) 1266 if (time_before(now, holdtime))
1267 delay = holdtime - now; 1267 delay = holdtime - now;
1268 if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
1269 delay = gl->gl_ops->go_min_hold_time;
1268 1270
1269 spin_lock(&gl->gl_spin); 1271 spin_lock(&gl->gl_spin);
1270 handle_callback(gl, state, 1, delay); 1272 handle_callback(gl, state, 1, delay);
@@ -1578,8 +1580,6 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
1578 *p++ = 'a'; 1580 *p++ = 'a';
1579 if (flags & GL_EXACT) 1581 if (flags & GL_EXACT)
1580 *p++ = 'E'; 1582 *p++ = 'E';
1581 if (flags & GL_ATIME)
1582 *p++ = 'a';
1583 if (flags & GL_NOCACHE) 1583 if (flags & GL_NOCACHE)
1584 *p++ = 'c'; 1584 *p++ = 'c';
1585 if (test_bit(HIF_HOLDER, &iflags)) 1585 if (test_bit(HIF_HOLDER, &iflags))
@@ -1816,15 +1816,17 @@ restart:
1816 if (gl) { 1816 if (gl) {
1817 gi->gl = hlist_entry(gl->gl_list.next, 1817 gi->gl = hlist_entry(gl->gl_list.next,
1818 struct gfs2_glock, gl_list); 1818 struct gfs2_glock, gl_list);
1819 if (gi->gl) 1819 } else {
1820 gfs2_glock_hold(gi->gl); 1820 gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first,
1821 struct gfs2_glock, gl_list);
1821 } 1822 }
1823 if (gi->gl)
1824 gfs2_glock_hold(gi->gl);
1822 read_unlock(gl_lock_addr(gi->hash)); 1825 read_unlock(gl_lock_addr(gi->hash));
1823 if (gl) 1826 if (gl)
1824 gfs2_glock_put(gl); 1827 gfs2_glock_put(gl);
1825 if (gl && gi->gl == NULL)
1826 gi->hash++;
1827 while (gi->gl == NULL) { 1828 while (gi->gl == NULL) {
1829 gi->hash++;
1828 if (gi->hash >= GFS2_GL_HASH_SIZE) 1830 if (gi->hash >= GFS2_GL_HASH_SIZE)
1829 return 1; 1831 return 1;
1830 read_lock(gl_lock_addr(gi->hash)); 1832 read_lock(gl_lock_addr(gi->hash));
@@ -1833,7 +1835,6 @@ restart:
1833 if (gi->gl) 1835 if (gi->gl)
1834 gfs2_glock_hold(gi->gl); 1836 gfs2_glock_hold(gi->gl);
1835 read_unlock(gl_lock_addr(gi->hash)); 1837 read_unlock(gl_lock_addr(gi->hash));
1836 gi->hash++;
1837 } 1838 }
1838 1839
1839 if (gi->sdp != gi->gl->gl_sbd) 1840 if (gi->sdp != gi->gl->gl_sbd)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 971d92af70fc..695c6b193611 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -24,7 +24,6 @@
24#define GL_ASYNC 0x00000040 24#define GL_ASYNC 0x00000040
25#define GL_EXACT 0x00000080 25#define GL_EXACT 0x00000080
26#define GL_SKIP 0x00000100 26#define GL_SKIP 0x00000100
27#define GL_ATIME 0x00000200
28#define GL_NOCACHE 0x00000400 27#define GL_NOCACHE 0x00000400
29 28
30#define GLR_TRYFAILED 13 29#define GLR_TRYFAILED 13
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 448697a5c462..f566ec1b4e8e 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -386,20 +386,21 @@ struct gfs2_statfs_change_host {
386#define GFS2_DATA_ORDERED 2 386#define GFS2_DATA_ORDERED 2
387 387
388struct gfs2_args { 388struct gfs2_args {
389 char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */ 389 char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */
390 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ 390 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
391 char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */ 391 char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */
392 int ar_spectator; /* Don't get a journal because we're always RO */ 392 unsigned int ar_spectator:1; /* Don't get a journal */
393 int ar_ignore_local_fs; /* Don't optimize even if local_fs is 1 */ 393 unsigned int ar_ignore_local_fs:1; /* Ignore optimisations */
394 int ar_localflocks; /* Let the VFS do flock|fcntl locks for us */ 394 unsigned int ar_localflocks:1; /* Let the VFS do flock|fcntl */
395 int ar_localcaching; /* Local-style caching (dangerous on multihost) */ 395 unsigned int ar_localcaching:1; /* Local caching */
396 int ar_debug; /* Oops on errors instead of trying to be graceful */ 396 unsigned int ar_debug:1; /* Oops on errors */
397 int ar_upgrade; /* Upgrade ondisk/multihost format */ 397 unsigned int ar_upgrade:1; /* Upgrade ondisk format */
398 unsigned int ar_num_glockd; /* Number of glockd threads */ 398 unsigned int ar_posix_acl:1; /* Enable posix acls */
399 int ar_posix_acl; /* Enable posix acls */ 399 unsigned int ar_quota:2; /* off/account/on */
400 int ar_quota; /* off/account/on */ 400 unsigned int ar_suiddir:1; /* suiddir support */
401 int ar_suiddir; /* suiddir support */ 401 unsigned int ar_data:2; /* ordered/writeback */
402 int ar_data; /* ordered/writeback */ 402 unsigned int ar_meta:1; /* mount metafs */
403 unsigned int ar_num_glockd; /* Number of glockd threads */
403}; 404};
404 405
405struct gfs2_tune { 406struct gfs2_tune {
@@ -419,7 +420,6 @@ struct gfs2_tune {
419 unsigned int gt_quota_scale_den; /* Denominator */ 420 unsigned int gt_quota_scale_den; /* Denominator */
420 unsigned int gt_quota_cache_secs; 421 unsigned int gt_quota_cache_secs;
421 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */ 422 unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
422 unsigned int gt_atime_quantum; /* Min secs between atime updates */
423 unsigned int gt_new_files_jdata; 423 unsigned int gt_new_files_jdata;
424 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ 424 unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
425 unsigned int gt_stall_secs; /* Detects trouble! */ 425 unsigned int gt_stall_secs; /* Detects trouble! */
@@ -432,7 +432,7 @@ enum {
432 SDF_JOURNAL_CHECKED = 0, 432 SDF_JOURNAL_CHECKED = 0,
433 SDF_JOURNAL_LIVE = 1, 433 SDF_JOURNAL_LIVE = 1,
434 SDF_SHUTDOWN = 2, 434 SDF_SHUTDOWN = 2,
435 SDF_NOATIME = 3, 435 SDF_NOBARRIERS = 3,
436}; 436};
437 437
438#define GFS2_FSNAME_LEN 256 438#define GFS2_FSNAME_LEN 256
@@ -461,7 +461,6 @@ struct gfs2_sb_host {
461 461
462struct gfs2_sbd { 462struct gfs2_sbd {
463 struct super_block *sd_vfs; 463 struct super_block *sd_vfs;
464 struct super_block *sd_vfs_meta;
465 struct kobject sd_kobj; 464 struct kobject sd_kobj;
466 unsigned long sd_flags; /* SDF_... */ 465 unsigned long sd_flags; /* SDF_... */
467 struct gfs2_sb_host sd_sb; 466 struct gfs2_sb_host sd_sb;
@@ -499,7 +498,9 @@ struct gfs2_sbd {
499 498
500 /* Inode Stuff */ 499 /* Inode Stuff */
501 500
502 struct inode *sd_master_dir; 501 struct dentry *sd_master_dir;
502 struct dentry *sd_root_dir;
503
503 struct inode *sd_jindex; 504 struct inode *sd_jindex;
504 struct inode *sd_inum_inode; 505 struct inode *sd_inum_inode;
505 struct inode *sd_statfs_inode; 506 struct inode *sd_statfs_inode;
@@ -634,7 +635,6 @@ struct gfs2_sbd {
634 /* Debugging crud */ 635 /* Debugging crud */
635 636
636 unsigned long sd_last_warning; 637 unsigned long sd_last_warning;
637 struct vfsmount *sd_gfs2mnt;
638 struct dentry *debugfs_dir; /* debugfs directory */ 638 struct dentry *debugfs_dir; /* debugfs directory */
639 struct dentry *debugfs_dentry_glocks; /* for debugfs */ 639 struct dentry *debugfs_dentry_glocks; /* for debugfs */
640}; 640};
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 8b0806a32948..7cee695fa441 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -18,6 +18,7 @@
18#include <linux/crc32.h> 18#include <linux/crc32.h>
19#include <linux/lm_interface.h> 19#include <linux/lm_interface.h>
20#include <linux/security.h> 20#include <linux/security.h>
21#include <linux/time.h>
21 22
22#include "gfs2.h" 23#include "gfs2.h"
23#include "incore.h" 24#include "incore.h"
@@ -249,6 +250,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
249{ 250{
250 struct gfs2_dinode_host *di = &ip->i_di; 251 struct gfs2_dinode_host *di = &ip->i_di;
251 const struct gfs2_dinode *str = buf; 252 const struct gfs2_dinode *str = buf;
253 struct timespec atime;
252 u16 height, depth; 254 u16 height, depth;
253 255
254 if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr))) 256 if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)))
@@ -275,8 +277,10 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
275 di->di_size = be64_to_cpu(str->di_size); 277 di->di_size = be64_to_cpu(str->di_size);
276 i_size_write(&ip->i_inode, di->di_size); 278 i_size_write(&ip->i_inode, di->di_size);
277 gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); 279 gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
278 ip->i_inode.i_atime.tv_sec = be64_to_cpu(str->di_atime); 280 atime.tv_sec = be64_to_cpu(str->di_atime);
279 ip->i_inode.i_atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); 281 atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
282 if (timespec_compare(&ip->i_inode.i_atime, &atime) < 0)
283 ip->i_inode.i_atime = atime;
280 ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime); 284 ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
281 ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec); 285 ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec);
282 ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime); 286 ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime);
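
With atime now maintained by the VFS, reading the dinode must not clobber a newer in-core value; the comparison above only lets the on-disk timestamp win when it is strictly newer. The pattern in isolation:

	struct timespec atime;

	atime.tv_sec = be64_to_cpu(str->di_atime);
	atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
	/* timespec_compare() < 0 means the in-core atime is older */
	if (timespec_compare(&ip->i_inode.i_atime, &atime) < 0)
		ip->i_inode.i_atime = atime;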
@@ -1033,13 +1037,11 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
1033 1037
1034 if (bh) 1038 if (bh)
1035 brelse(bh); 1039 brelse(bh);
1036 if (!inode)
1037 return ERR_PTR(-ENOMEM);
1038 return inode; 1040 return inode;
1039 1041
1040fail_gunlock2: 1042fail_gunlock2:
1041 gfs2_glock_dq_uninit(ghs + 1); 1043 gfs2_glock_dq_uninit(ghs + 1);
1042 if (inode) 1044 if (inode && !IS_ERR(inode))
1043 iput(inode); 1045 iput(inode);
1044fail_gunlock: 1046fail_gunlock:
1045 gfs2_glock_dq(ghs); 1047 gfs2_glock_dq(ghs);
@@ -1140,54 +1142,6 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
1140 return 0; 1142 return 0;
1141} 1143}
1142 1144
1143/*
1144 * gfs2_ok_to_move - check if it's ok to move a directory to another directory
1145 * @this: move this
1146 * @to: to here
1147 *
1148 * Follow @to back to the root and make sure we don't encounter @this
1149 * Assumes we already hold the rename lock.
1150 *
1151 * Returns: errno
1152 */
1153
1154int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
1155{
1156 struct inode *dir = &to->i_inode;
1157 struct super_block *sb = dir->i_sb;
1158 struct inode *tmp;
1159 struct qstr dotdot;
1160 int error = 0;
1161
1162 gfs2_str2qstr(&dotdot, "..");
1163
1164 igrab(dir);
1165
1166 for (;;) {
1167 if (dir == &this->i_inode) {
1168 error = -EINVAL;
1169 break;
1170 }
1171 if (dir == sb->s_root->d_inode) {
1172 error = 0;
1173 break;
1174 }
1175
1176 tmp = gfs2_lookupi(dir, &dotdot, 1);
1177 if (IS_ERR(tmp)) {
1178 error = PTR_ERR(tmp);
1179 break;
1180 }
1181
1182 iput(dir);
1183 dir = tmp;
1184 }
1185
1186 iput(dir);
1187
1188 return error;
1189}
1190
1191/** 1145/**
1192 * gfs2_readlinki - return the contents of a symlink 1146 * gfs2_readlinki - return the contents of a symlink
1193 * @ip: the symlink's inode 1147 * @ip: the symlink's inode
@@ -1207,8 +1161,8 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
1207 unsigned int x; 1161 unsigned int x;
1208 int error; 1162 int error;
1209 1163
1210 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh); 1164 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
1211 error = gfs2_glock_nq_atime(&i_gh); 1165 error = gfs2_glock_nq(&i_gh);
1212 if (error) { 1166 if (error) {
1213 gfs2_holder_uninit(&i_gh); 1167 gfs2_holder_uninit(&i_gh);
1214 return error; 1168 return error;
@@ -1243,101 +1197,6 @@ out:
1243 return error; 1197 return error;
1244} 1198}
1245 1199
1246/**
1247 * gfs2_glock_nq_atime - Acquire a hold on an inode's glock, and
1248 * conditionally update the inode's atime
1249 * @gh: the holder to acquire
1250 *
1251 * Tests atime (access time) for gfs2_read, gfs2_readdir and gfs2_mmap
1252 * Update if the difference between the current time and the inode's current
1253 * atime is greater than an interval specified at mount.
1254 *
1255 * Returns: errno
1256 */
1257
1258int gfs2_glock_nq_atime(struct gfs2_holder *gh)
1259{
1260 struct gfs2_glock *gl = gh->gh_gl;
1261 struct gfs2_sbd *sdp = gl->gl_sbd;
1262 struct gfs2_inode *ip = gl->gl_object;
1263 s64 quantum = gfs2_tune_get(sdp, gt_atime_quantum);
1264 unsigned int state;
1265 int flags;
1266 int error;
1267 struct timespec tv = CURRENT_TIME;
1268
1269 if (gfs2_assert_warn(sdp, gh->gh_flags & GL_ATIME) ||
1270 gfs2_assert_warn(sdp, !(gh->gh_flags & GL_ASYNC)) ||
1271 gfs2_assert_warn(sdp, gl->gl_ops == &gfs2_inode_glops))
1272 return -EINVAL;
1273
1274 state = gh->gh_state;
1275 flags = gh->gh_flags;
1276
1277 error = gfs2_glock_nq(gh);
1278 if (error)
1279 return error;
1280
1281 if (test_bit(SDF_NOATIME, &sdp->sd_flags) ||
1282 (sdp->sd_vfs->s_flags & MS_RDONLY))
1283 return 0;
1284
1285 if (tv.tv_sec - ip->i_inode.i_atime.tv_sec >= quantum) {
1286 gfs2_glock_dq(gh);
1287 gfs2_holder_reinit(LM_ST_EXCLUSIVE, gh->gh_flags & ~LM_FLAG_ANY,
1288 gh);
1289 error = gfs2_glock_nq(gh);
1290 if (error)
1291 return error;
1292
1293 /* Verify that atime hasn't been updated while we were
1294 trying to get exclusive lock. */
1295
1296 tv = CURRENT_TIME;
1297 if (tv.tv_sec - ip->i_inode.i_atime.tv_sec >= quantum) {
1298 struct buffer_head *dibh;
1299 struct gfs2_dinode *di;
1300
1301 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1302 if (error == -EROFS)
1303 return 0;
1304 if (error)
1305 goto fail;
1306
1307 error = gfs2_meta_inode_buffer(ip, &dibh);
1308 if (error)
1309 goto fail_end_trans;
1310
1311 ip->i_inode.i_atime = tv;
1312
1313 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1314 di = (struct gfs2_dinode *)dibh->b_data;
1315 di->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
1316 di->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec);
1317 brelse(dibh);
1318
1319 gfs2_trans_end(sdp);
1320 }
1321
1322 /* If someone else has asked for the glock,
1323 unlock and let them have it. Then reacquire
1324 in the original state. */
1325 if (gfs2_glock_is_blocking(gl)) {
1326 gfs2_glock_dq(gh);
1327 gfs2_holder_reinit(state, flags, gh);
1328 return gfs2_glock_nq(gh);
1329 }
1330 }
1331
1332 return 0;
1333
1334fail_end_trans:
1335 gfs2_trans_end(sdp);
1336fail:
1337 gfs2_glock_dq(gh);
1338 return error;
1339}
1340
1341static int 1200static int
1342__gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) 1201__gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1343{ 1202{
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 58f9607d6a86..2d43f69610a0 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -91,9 +91,7 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
91int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, 91int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
92 const struct gfs2_inode *ip); 92 const struct gfs2_inode *ip);
93int gfs2_permission(struct inode *inode, int mask); 93int gfs2_permission(struct inode *inode, int mask);
94int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to);
95int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len); 94int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
96int gfs2_glock_nq_atime(struct gfs2_holder *gh);
97int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); 95int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
98struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); 96struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
99void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); 97void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index 09d78c216f48..0c4cbe6c8285 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -144,7 +144,8 @@ static int gdlm_mount(char *table_name, char *host_data,
144 144
145 error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname), 145 error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname),
146 &ls->dlm_lockspace, 146 &ls->dlm_lockspace,
147 DLM_LSFL_FS | (nodir ? DLM_LSFL_NODIR : 0), 147 DLM_LSFL_FS | DLM_LSFL_NEWEXCL |
148 (nodir ? DLM_LSFL_NODIR : 0),
148 GDLM_LVB_SIZE); 149 GDLM_LVB_SIZE);
149 if (error) { 150 if (error) {
150 log_error("dlm_new_lockspace error %d", error); 151 log_error("dlm_new_lockspace error %d", error);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 6c6af9f5e3ab..ad305854bdc6 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -18,6 +18,7 @@
18#include <linux/delay.h> 18#include <linux/delay.h>
19#include <linux/kthread.h> 19#include <linux/kthread.h>
20#include <linux/freezer.h> 20#include <linux/freezer.h>
21#include <linux/bio.h>
21 22
22#include "gfs2.h" 23#include "gfs2.h"
23#include "incore.h" 24#include "incore.h"
@@ -584,7 +585,6 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
584 memset(bh->b_data, 0, bh->b_size); 585 memset(bh->b_data, 0, bh->b_size);
585 set_buffer_uptodate(bh); 586 set_buffer_uptodate(bh);
586 clear_buffer_dirty(bh); 587 clear_buffer_dirty(bh);
587 unlock_buffer(bh);
588 588
589 gfs2_ail1_empty(sdp, 0); 589 gfs2_ail1_empty(sdp, 0);
590 tail = current_tail(sdp); 590 tail = current_tail(sdp);
@@ -601,8 +601,23 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
601 hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header)); 601 hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header));
602 lh->lh_hash = cpu_to_be32(hash); 602 lh->lh_hash = cpu_to_be32(hash);
603 603
604 set_buffer_dirty(bh); 604 bh->b_end_io = end_buffer_write_sync;
605 if (sync_dirty_buffer(bh)) 605 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
606 goto skip_barrier;
607 get_bh(bh);
608 submit_bh(WRITE_BARRIER | (1 << BIO_RW_META), bh);
609 wait_on_buffer(bh);
610 if (buffer_eopnotsupp(bh)) {
611 clear_buffer_eopnotsupp(bh);
612 set_buffer_uptodate(bh);
613 set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
614 lock_buffer(bh);
615skip_barrier:
616 get_bh(bh);
617 submit_bh(WRITE_SYNC | (1 << BIO_RW_META), bh);
618 wait_on_buffer(bh);
619 }
620 if (!buffer_uptodate(bh))
606 gfs2_io_error_bh(sdp, bh); 621 gfs2_io_error_bh(sdp, bh);
607 brelse(bh); 622 brelse(bh);
608 623
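
The log-header write above probes for barrier support at run time. In isolation the fallback looks like this (same buffer_head API as the hunk; SDF_NOBARRIERS is latched so later log writes skip the probe entirely):

	bh->b_end_io = end_buffer_write_sync;
	get_bh(bh);
	submit_bh(WRITE_BARRIER | (1 << BIO_RW_META), bh);
	wait_on_buffer(bh);
	if (buffer_eopnotsupp(bh)) {
		/* device refused the barrier: latch and reissue plainly */
		clear_buffer_eopnotsupp(bh);
		set_buffer_uptodate(bh);
		set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
		lock_buffer(bh);
		get_bh(bh);
		submit_bh(WRITE_SYNC | (1 << BIO_RW_META), bh);
		wait_on_buffer(bh);
	}
	if (!buffer_uptodate(bh))
		gfs2_io_error_bh(sdp, bh);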
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index b941f9f9f958..f96eb90a2cfa 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -42,10 +42,11 @@ enum {
42 Opt_nosuiddir, 42 Opt_nosuiddir,
43 Opt_data_writeback, 43 Opt_data_writeback,
44 Opt_data_ordered, 44 Opt_data_ordered,
45 Opt_meta,
45 Opt_err, 46 Opt_err,
46}; 47};
47 48
48static match_table_t tokens = { 49static const match_table_t tokens = {
49 {Opt_lockproto, "lockproto=%s"}, 50 {Opt_lockproto, "lockproto=%s"},
50 {Opt_locktable, "locktable=%s"}, 51 {Opt_locktable, "locktable=%s"},
51 {Opt_hostdata, "hostdata=%s"}, 52 {Opt_hostdata, "hostdata=%s"},
@@ -66,6 +67,7 @@ static match_table_t tokens = {
66 {Opt_nosuiddir, "nosuiddir"}, 67 {Opt_nosuiddir, "nosuiddir"},
67 {Opt_data_writeback, "data=writeback"}, 68 {Opt_data_writeback, "data=writeback"},
68 {Opt_data_ordered, "data=ordered"}, 69 {Opt_data_ordered, "data=ordered"},
70 {Opt_meta, "meta"},
69 {Opt_err, NULL} 71 {Opt_err, NULL}
70}; 72};
71 73
@@ -239,6 +241,11 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
239 case Opt_data_ordered: 241 case Opt_data_ordered:
240 args->ar_data = GFS2_DATA_ORDERED; 242 args->ar_data = GFS2_DATA_ORDERED;
241 break; 243 break;
244 case Opt_meta:
245 if (remount && args->ar_meta != 1)
246 goto cant_remount;
247 args->ar_meta = 1;
248 break;
242 case Opt_err: 249 case Opt_err:
243 default: 250 default:
244 fs_info(sdp, "unknown option: %s\n", o); 251 fs_info(sdp, "unknown option: %s\n", o);
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index e64a1b04117a..27563816e1c5 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -512,8 +512,8 @@ static int gfs2_readpage(struct file *file, struct page *page)
512 int error; 512 int error;
513 513
514 unlock_page(page); 514 unlock_page(page);
515 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); 515 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
516 error = gfs2_glock_nq_atime(&gh); 516 error = gfs2_glock_nq(&gh);
517 if (unlikely(error)) 517 if (unlikely(error))
518 goto out; 518 goto out;
519 error = AOP_TRUNCATED_PAGE; 519 error = AOP_TRUNCATED_PAGE;
@@ -594,8 +594,8 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping,
594 struct gfs2_holder gh; 594 struct gfs2_holder gh;
595 int ret; 595 int ret;
596 596
597 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); 597 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
598 ret = gfs2_glock_nq_atime(&gh); 598 ret = gfs2_glock_nq(&gh);
599 if (unlikely(ret)) 599 if (unlikely(ret))
600 goto out_uninit; 600 goto out_uninit;
601 if (!gfs2_is_stuffed(ip)) 601 if (!gfs2_is_stuffed(ip))
@@ -636,8 +636,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
636 unsigned to = from + len; 636 unsigned to = from + len;
637 struct page *page; 637 struct page *page;
638 638
639 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &ip->i_gh); 639 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
640 error = gfs2_glock_nq_atime(&ip->i_gh); 640 error = gfs2_glock_nq(&ip->i_gh);
641 if (unlikely(error)) 641 if (unlikely(error))
642 goto out_uninit; 642 goto out_uninit;
643 643
@@ -975,7 +975,7 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset)
975 if (gfs2_is_stuffed(ip)) 975 if (gfs2_is_stuffed(ip))
976 return 0; 976 return 0;
977 977
978 if (offset > i_size_read(&ip->i_inode)) 978 if (offset >= i_size_read(&ip->i_inode))
979 return 0; 979 return 0;
980 return 1; 980 return 1;
981} 981}
@@ -1000,8 +1000,8 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
1000 * unfortunately have the option of only flushing a range like 1000 * unfortunately have the option of only flushing a range like
1001 * the VFS does. 1001 * the VFS does.
1002 */ 1002 */
1003 gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, GL_ATIME, &gh); 1003 gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh);
1004 rv = gfs2_glock_nq_atime(&gh); 1004 rv = gfs2_glock_nq(&gh);
1005 if (rv) 1005 if (rv)
1006 return rv; 1006 return rv;
1007 rv = gfs2_ok_for_dio(ip, rw, offset); 1007 rv = gfs2_ok_for_dio(ip, rw, offset);
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index e9a366d4411c..3a747f8e2188 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -89,8 +89,8 @@ static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
89 u64 offset = file->f_pos; 89 u64 offset = file->f_pos;
90 int error; 90 int error;
91 91
92 gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh); 92 gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
93 error = gfs2_glock_nq_atime(&d_gh); 93 error = gfs2_glock_nq(&d_gh);
94 if (error) { 94 if (error) {
95 gfs2_holder_uninit(&d_gh); 95 gfs2_holder_uninit(&d_gh);
96 return error; 96 return error;
@@ -153,8 +153,8 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
153 int error; 153 int error;
154 u32 fsflags; 154 u32 fsflags;
155 155
156 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); 156 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
157 error = gfs2_glock_nq_atime(&gh); 157 error = gfs2_glock_nq(&gh);
158 if (error) 158 if (error)
159 return error; 159 return error;
160 160
@@ -351,8 +351,8 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
351 struct gfs2_alloc *al; 351 struct gfs2_alloc *al;
352 int ret; 352 int ret;
353 353
354 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &gh); 354 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
355 ret = gfs2_glock_nq_atime(&gh); 355 ret = gfs2_glock_nq(&gh);
356 if (ret) 356 if (ret)
357 goto out; 357 goto out;
358 358
@@ -434,8 +434,8 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
434 struct gfs2_holder i_gh; 434 struct gfs2_holder i_gh;
435 int error; 435 int error;
436 436
437 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh); 437 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
438 error = gfs2_glock_nq_atime(&i_gh); 438 error = gfs2_glock_nq(&i_gh);
439 if (error) { 439 if (error) {
440 gfs2_holder_uninit(&i_gh); 440 gfs2_holder_uninit(&i_gh);
441 return error; 441 return error;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b4d1d6490633..b117fcf2c4f5 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -40,6 +40,44 @@
40#define DO 0 40#define DO 0
41#define UNDO 1 41#define UNDO 1
42 42
43static const u32 gfs2_old_fs_formats[] = {
44 0
45};
46
47static const u32 gfs2_old_multihost_formats[] = {
48 0
49};
50
51/**
52 * gfs2_tune_init - Fill a gfs2_tune structure with default values
53 * @gt: tune
54 *
55 */
56
57static void gfs2_tune_init(struct gfs2_tune *gt)
58{
59 spin_lock_init(&gt->gt_spin);
60
61 gt->gt_demote_secs = 300;
62 gt->gt_incore_log_blocks = 1024;
63 gt->gt_log_flush_secs = 60;
64 gt->gt_recoverd_secs = 60;
65 gt->gt_logd_secs = 1;
66 gt->gt_quotad_secs = 5;
67 gt->gt_quota_simul_sync = 64;
68 gt->gt_quota_warn_period = 10;
69 gt->gt_quota_scale_num = 1;
70 gt->gt_quota_scale_den = 1;
71 gt->gt_quota_cache_secs = 300;
72 gt->gt_quota_quantum = 60;
73 gt->gt_new_files_jdata = 0;
74 gt->gt_max_readahead = 1 << 18;
75 gt->gt_stall_secs = 600;
76 gt->gt_complain_secs = 10;
77 gt->gt_statfs_quantum = 30;
78 gt->gt_statfs_slow = 0;
79}
80
43static struct gfs2_sbd *init_sbd(struct super_block *sb) 81static struct gfs2_sbd *init_sbd(struct super_block *sb)
44{ 82{
45 struct gfs2_sbd *sdp; 83 struct gfs2_sbd *sdp;
@@ -96,21 +134,271 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
96 return sdp; 134 return sdp;
97} 135}
98 136
99static void init_vfs(struct super_block *sb, unsigned noatime) 137
138/**
139 * gfs2_check_sb - Check superblock
140 * @sdp: the filesystem
141 * @sb: The superblock
142 * @silent: Don't print a message if the check fails
143 *
144 * Checks the version code of the FS is one that we understand how to
145 * read and that the sizes of the various on-disk structures have not
146 * changed.
147 */
148
149static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
100{ 150{
101 struct gfs2_sbd *sdp = sb->s_fs_info; 151 unsigned int x;
102 152
103 sb->s_magic = GFS2_MAGIC; 153 if (sb->sb_magic != GFS2_MAGIC ||
104 sb->s_op = &gfs2_super_ops; 154 sb->sb_type != GFS2_METATYPE_SB) {
105 sb->s_export_op = &gfs2_export_ops; 155 if (!silent)
106 sb->s_time_gran = 1; 156 printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n");
107 sb->s_maxbytes = MAX_LFS_FILESIZE; 157 return -EINVAL;
158 }
159
160 /* If format numbers match exactly, we're done. */
161
162 if (sb->sb_fs_format == GFS2_FORMAT_FS &&
163 sb->sb_multihost_format == GFS2_FORMAT_MULTI)
164 return 0;
165
166 if (sb->sb_fs_format != GFS2_FORMAT_FS) {
167 for (x = 0; gfs2_old_fs_formats[x]; x++)
168 if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
169 break;
170
171 if (!gfs2_old_fs_formats[x]) {
172 printk(KERN_WARNING
173 "GFS2: code version (%u, %u) is incompatible "
174 "with ondisk format (%u, %u)\n",
175 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
176 sb->sb_fs_format, sb->sb_multihost_format);
177 printk(KERN_WARNING
178 "GFS2: I don't know how to upgrade this FS\n");
179 return -EINVAL;
180 }
181 }
182
183 if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
184 for (x = 0; gfs2_old_multihost_formats[x]; x++)
185 if (gfs2_old_multihost_formats[x] ==
186 sb->sb_multihost_format)
187 break;
188
189 if (!gfs2_old_multihost_formats[x]) {
190 printk(KERN_WARNING
191 "GFS2: code version (%u, %u) is incompatible "
192 "with ondisk format (%u, %u)\n",
193 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
194 sb->sb_fs_format, sb->sb_multihost_format);
195 printk(KERN_WARNING
196 "GFS2: I don't know how to upgrade this FS\n");
197 return -EINVAL;
198 }
199 }
200
201 if (!sdp->sd_args.ar_upgrade) {
202 printk(KERN_WARNING
203 "GFS2: code version (%u, %u) is incompatible "
204 "with ondisk format (%u, %u)\n",
205 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
206 sb->sb_fs_format, sb->sb_multihost_format);
207 printk(KERN_INFO
208 "GFS2: Use the \"upgrade\" mount option to upgrade "
209 "the FS\n");
210 printk(KERN_INFO "GFS2: See the manual for more details\n");
211 return -EINVAL;
212 }
213
214 return 0;
215}
216
217static void end_bio_io_page(struct bio *bio, int error)
218{
219 struct page *page = bio->bi_private;
108 220
109 if (sb->s_flags & (MS_NOATIME | MS_NODIRATIME)) 221 if (!error)
110 set_bit(noatime, &sdp->sd_flags); 222 SetPageUptodate(page);
223 else
224 printk(KERN_WARNING "gfs2: error %d reading superblock\n", error);
225 unlock_page(page);
226}
227
228static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
229{
230 const struct gfs2_sb *str = buf;
231
232 sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic);
233 sb->sb_type = be32_to_cpu(str->sb_header.mh_type);
234 sb->sb_format = be32_to_cpu(str->sb_header.mh_format);
235 sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
236 sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
237 sb->sb_bsize = be32_to_cpu(str->sb_bsize);
238 sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);
239 sb->sb_master_dir.no_addr = be64_to_cpu(str->sb_master_dir.no_addr);
240 sb->sb_master_dir.no_formal_ino = be64_to_cpu(str->sb_master_dir.no_formal_ino);
241 sb->sb_root_dir.no_addr = be64_to_cpu(str->sb_root_dir.no_addr);
242 sb->sb_root_dir.no_formal_ino = be64_to_cpu(str->sb_root_dir.no_formal_ino);
243
244 memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
245 memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
246}
247
248/**
249 * gfs2_read_super - Read the gfs2 super block from disk
250 * @sdp: The GFS2 super block
251 * @sector: The location of the super block
252 * @error: The error code to return
253 *
254 * This uses the bio functions to read the super block from disk
255 * because we want to be 100% sure that we never read cached data.
256 * A super block is read only twice during each GFS2 mount and is
257 * never written to by the filesystem. The first time it is read, no
258 * locks are held, and the only details which are looked at are those
259 * relating to the locking protocol. Once locking is up and working,
260 * the sb is read again under the lock to establish the location of
261 * the master directory (contains pointers to journals etc) and the
262 * root directory.
263 *
264 * Returns: 0 on success or error
265 */
266
267static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
268{
269 struct super_block *sb = sdp->sd_vfs;
270 struct gfs2_sb *p;
271 struct page *page;
272 struct bio *bio;
273
274 page = alloc_page(GFP_NOFS);
275 if (unlikely(!page))
276 return -ENOBUFS;
277
278 ClearPageUptodate(page);
279 ClearPageDirty(page);
280 lock_page(page);
281
282 bio = bio_alloc(GFP_NOFS, 1);
283 if (unlikely(!bio)) {
284 __free_page(page);
285 return -ENOBUFS;
286 }
111 287
112 /* Don't let the VFS update atimes. GFS2 handles this itself. */ 288 bio->bi_sector = sector * (sb->s_blocksize >> 9);
113 sb->s_flags |= MS_NOATIME | MS_NODIRATIME; 289 bio->bi_bdev = sb->s_bdev;
290 bio_add_page(bio, page, PAGE_SIZE, 0);
291
292 bio->bi_end_io = end_bio_io_page;
293 bio->bi_private = page;
294 submit_bio(READ_SYNC | (1 << BIO_RW_META), bio);
295 wait_on_page_locked(page);
296 bio_put(bio);
297 if (!PageUptodate(page)) {
298 __free_page(page);
299 return -EIO;
300 }
301 p = kmap(page);
302 gfs2_sb_in(&sdp->sd_sb, p);
303 kunmap(page);
304 __free_page(page);
305 return 0;
306}
307/**
308 * gfs2_read_sb - Read super block
309 * @sdp: The GFS2 superblock
310 * @gl: the glock for the superblock (assumed to be held)
311 * @silent: Don't print message if mount fails
312 *
313 */
314
315static int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
316{
317 u32 hash_blocks, ind_blocks, leaf_blocks;
318 u32 tmp_blocks;
319 unsigned int x;
320 int error;
321
322 error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
323 if (error) {
324 if (!silent)
325 fs_err(sdp, "can't read superblock\n");
326 return error;
327 }
328
329 error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
330 if (error)
331 return error;
332
333 sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
334 GFS2_BASIC_BLOCK_SHIFT;
335 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
336 sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
337 sizeof(struct gfs2_dinode)) / sizeof(u64);
338 sdp->sd_inptrs = (sdp->sd_sb.sb_bsize -
339 sizeof(struct gfs2_meta_header)) / sizeof(u64);
340 sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header);
341 sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
342 sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
343 sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64);
344 sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
345 sizeof(struct gfs2_meta_header)) /
346 sizeof(struct gfs2_quota_change);
347
348	/* Compute maximum reservation required to add an entry to a directory */
349
350 hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH),
351 sdp->sd_jbsize);
352
353 ind_blocks = 0;
354 for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
355 tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs);
356 ind_blocks += tmp_blocks;
357 }
358
359 leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH;
360
361 sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
362
363 sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize -
364 sizeof(struct gfs2_dinode);
365 sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
366 for (x = 2;; x++) {
367 u64 space, d;
368 u32 m;
369
370 space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
371 d = space;
372 m = do_div(d, sdp->sd_inptrs);
373
374 if (d != sdp->sd_heightsize[x - 1] || m)
375 break;
376 sdp->sd_heightsize[x] = space;
377 }
378 sdp->sd_max_height = x;
379 sdp->sd_heightsize[x] = ~0;
380 gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
381
382 sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
383 sizeof(struct gfs2_dinode);
384 sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
385 for (x = 2;; x++) {
386 u64 space, d;
387 u32 m;
388
389 space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
390 d = space;
391 m = do_div(d, sdp->sd_inptrs);
392
393 if (d != sdp->sd_jheightsize[x - 1] || m)
394 break;
395 sdp->sd_jheightsize[x] = space;
396 }
397 sdp->sd_max_jheight = x;
398 sdp->sd_jheightsize[x] = ~0;
399 gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
400
401 return 0;
114} 402}
115 403
116static int init_names(struct gfs2_sbd *sdp, int silent) 404static int init_names(struct gfs2_sbd *sdp, int silent)
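
The geometry loops in gfs2_read_sb() above detect 64-bit multiply overflow by dividing the product back down by the same factor. A sketch of that check, with prev_size and inptrs as hypothetical names for sd_heightsize[x - 1] and sd_inptrs; note that do_div() divides its first argument in place and returns the remainder.

	u64 space = prev_size * inptrs;	/* may wrap at 64 bits */
	u64 d = space;
	u32 m = do_div(d, inptrs);	/* d = space / inptrs, m = remainder */

	if (d != prev_size || m)
		break;			/* wrapped: terminate the height table */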
@@ -224,51 +512,59 @@ fail:
224 return error; 512 return error;
225} 513}
226 514
227static inline struct inode *gfs2_lookup_root(struct super_block *sb, 515static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
228 u64 no_addr) 516 u64 no_addr, const char *name)
229{ 517{
230 return gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0); 518 struct gfs2_sbd *sdp = sb->s_fs_info;
519 struct dentry *dentry;
520 struct inode *inode;
521
522 inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0);
523 if (IS_ERR(inode)) {
524 fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode));
525 return PTR_ERR(inode);
526 }
527 dentry = d_alloc_root(inode);
528 if (!dentry) {
529 fs_err(sdp, "can't alloc %s dentry\n", name);
530 iput(inode);
531 return -ENOMEM;
532 }
533 dentry->d_op = &gfs2_dops;
534 *dptr = dentry;
535 return 0;
231} 536}
232 537
233static int init_sb(struct gfs2_sbd *sdp, int silent, int undo) 538static int init_sb(struct gfs2_sbd *sdp, int silent)
234{ 539{
235 struct super_block *sb = sdp->sd_vfs; 540 struct super_block *sb = sdp->sd_vfs;
236 struct gfs2_holder sb_gh; 541 struct gfs2_holder sb_gh;
237 u64 no_addr; 542 u64 no_addr;
238 struct inode *inode; 543 int ret;
239 int error = 0;
240 544
241 if (undo) { 545 ret = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops,
242 if (sb->s_root) { 546 LM_ST_SHARED, 0, &sb_gh);
243 dput(sb->s_root); 547 if (ret) {
244 sb->s_root = NULL; 548 fs_err(sdp, "can't acquire superblock glock: %d\n", ret);
245 } 549 return ret;
246 return 0;
247 } 550 }
248 551
249 error = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops, 552 ret = gfs2_read_sb(sdp, sb_gh.gh_gl, silent);
250 LM_ST_SHARED, 0, &sb_gh); 553 if (ret) {
251 if (error) { 554 fs_err(sdp, "can't read superblock: %d\n", ret);
252 fs_err(sdp, "can't acquire superblock glock: %d\n", error);
253 return error;
254 }
255
256 error = gfs2_read_sb(sdp, sb_gh.gh_gl, silent);
257 if (error) {
258 fs_err(sdp, "can't read superblock: %d\n", error);
259 goto out; 555 goto out;
260 } 556 }
261 557
262 /* Set up the buffer cache and SB for real */ 558 /* Set up the buffer cache and SB for real */
263 if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) { 559 if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) {
264 error = -EINVAL; 560 ret = -EINVAL;
265 fs_err(sdp, "FS block size (%u) is too small for device " 561 fs_err(sdp, "FS block size (%u) is too small for device "
266 "block size (%u)\n", 562 "block size (%u)\n",
267 sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev)); 563 sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev));
268 goto out; 564 goto out;
269 } 565 }
270 if (sdp->sd_sb.sb_bsize > PAGE_SIZE) { 566 if (sdp->sd_sb.sb_bsize > PAGE_SIZE) {
271 error = -EINVAL; 567 ret = -EINVAL;
272 fs_err(sdp, "FS block size (%u) is too big for machine " 568 fs_err(sdp, "FS block size (%u) is too big for machine "
273 "page size (%u)\n", 569 "page size (%u)\n",
274 sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE); 570 sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE);
@@ -278,26 +574,21 @@ static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
278 574
279 /* Get the root inode */ 575 /* Get the root inode */
280 no_addr = sdp->sd_sb.sb_root_dir.no_addr; 576 no_addr = sdp->sd_sb.sb_root_dir.no_addr;
281 if (sb->s_type == &gfs2meta_fs_type) 577 ret = gfs2_lookup_root(sb, &sdp->sd_root_dir, no_addr, "root");
282 no_addr = sdp->sd_sb.sb_master_dir.no_addr; 578 if (ret)
283 inode = gfs2_lookup_root(sb, no_addr);
284 if (IS_ERR(inode)) {
285 error = PTR_ERR(inode);
286 fs_err(sdp, "can't read in root inode: %d\n", error);
287 goto out; 579 goto out;
288 }
289 580
290 sb->s_root = d_alloc_root(inode); 581 /* Get the master inode */
291 if (!sb->s_root) { 582 no_addr = sdp->sd_sb.sb_master_dir.no_addr;
292 fs_err(sdp, "can't get root dentry\n"); 583 ret = gfs2_lookup_root(sb, &sdp->sd_master_dir, no_addr, "master");
293 error = -ENOMEM; 584 if (ret) {
294 iput(inode); 585 dput(sdp->sd_root_dir);
295 } else 586 goto out;
296 sb->s_root->d_op = &gfs2_dops; 587 }
297 588 sb->s_root = dget(sdp->sd_args.ar_meta ? sdp->sd_master_dir : sdp->sd_root_dir);
298out: 589out:
299 gfs2_glock_dq_uninit(&sb_gh); 590 gfs2_glock_dq_uninit(&sb_gh);
300 return error; 591 return ret;
301} 592}
302 593
303/** 594/**
@@ -372,6 +663,7 @@ static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
372 663
373static int init_journal(struct gfs2_sbd *sdp, int undo) 664static int init_journal(struct gfs2_sbd *sdp, int undo)
374{ 665{
666 struct inode *master = sdp->sd_master_dir->d_inode;
375 struct gfs2_holder ji_gh; 667 struct gfs2_holder ji_gh;
376 struct task_struct *p; 668 struct task_struct *p;
377 struct gfs2_inode *ip; 669 struct gfs2_inode *ip;
@@ -383,7 +675,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
383 goto fail_recoverd; 675 goto fail_recoverd;
384 } 676 }
385 677
386 sdp->sd_jindex = gfs2_lookup_simple(sdp->sd_master_dir, "jindex"); 678 sdp->sd_jindex = gfs2_lookup_simple(master, "jindex");
387 if (IS_ERR(sdp->sd_jindex)) { 679 if (IS_ERR(sdp->sd_jindex)) {
388 fs_err(sdp, "can't lookup journal index: %d\n", error); 680 fs_err(sdp, "can't lookup journal index: %d\n", error);
389 return PTR_ERR(sdp->sd_jindex); 681 return PTR_ERR(sdp->sd_jindex);
@@ -506,25 +798,17 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
506{ 798{
507 int error = 0; 799 int error = 0;
508 struct gfs2_inode *ip; 800 struct gfs2_inode *ip;
509 struct inode *inode; 801 struct inode *master = sdp->sd_master_dir->d_inode;
510 802
511 if (undo) 803 if (undo)
512 goto fail_qinode; 804 goto fail_qinode;
513 805
514 inode = gfs2_lookup_root(sdp->sd_vfs, sdp->sd_sb.sb_master_dir.no_addr);
515 if (IS_ERR(inode)) {
516 error = PTR_ERR(inode);
517 fs_err(sdp, "can't read in master directory: %d\n", error);
518 goto fail;
519 }
520 sdp->sd_master_dir = inode;
521
522 error = init_journal(sdp, undo); 806 error = init_journal(sdp, undo);
523 if (error) 807 if (error)
524 goto fail_master; 808 goto fail;
525 809
526 /* Read in the master inode number inode */ 810 /* Read in the master inode number inode */
527 sdp->sd_inum_inode = gfs2_lookup_simple(sdp->sd_master_dir, "inum"); 811 sdp->sd_inum_inode = gfs2_lookup_simple(master, "inum");
528 if (IS_ERR(sdp->sd_inum_inode)) { 812 if (IS_ERR(sdp->sd_inum_inode)) {
529 error = PTR_ERR(sdp->sd_inum_inode); 813 error = PTR_ERR(sdp->sd_inum_inode);
530 fs_err(sdp, "can't read in inum inode: %d\n", error); 814 fs_err(sdp, "can't read in inum inode: %d\n", error);
@@ -533,7 +817,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
533 817
534 818
535 /* Read in the master statfs inode */ 819 /* Read in the master statfs inode */
536 sdp->sd_statfs_inode = gfs2_lookup_simple(sdp->sd_master_dir, "statfs"); 820 sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs");
537 if (IS_ERR(sdp->sd_statfs_inode)) { 821 if (IS_ERR(sdp->sd_statfs_inode)) {
538 error = PTR_ERR(sdp->sd_statfs_inode); 822 error = PTR_ERR(sdp->sd_statfs_inode);
539 fs_err(sdp, "can't read in statfs inode: %d\n", error); 823 fs_err(sdp, "can't read in statfs inode: %d\n", error);
@@ -541,7 +825,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
541 } 825 }
542 826
543 /* Read in the resource index inode */ 827 /* Read in the resource index inode */
544 sdp->sd_rindex = gfs2_lookup_simple(sdp->sd_master_dir, "rindex"); 828 sdp->sd_rindex = gfs2_lookup_simple(master, "rindex");
545 if (IS_ERR(sdp->sd_rindex)) { 829 if (IS_ERR(sdp->sd_rindex)) {
546 error = PTR_ERR(sdp->sd_rindex); 830 error = PTR_ERR(sdp->sd_rindex);
547 fs_err(sdp, "can't get resource index inode: %d\n", error); 831 fs_err(sdp, "can't get resource index inode: %d\n", error);
@@ -552,7 +836,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
552 sdp->sd_rindex_uptodate = 0; 836 sdp->sd_rindex_uptodate = 0;
553 837
554 /* Read in the quota inode */ 838 /* Read in the quota inode */
555 sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota"); 839 sdp->sd_quota_inode = gfs2_lookup_simple(master, "quota");
556 if (IS_ERR(sdp->sd_quota_inode)) { 840 if (IS_ERR(sdp->sd_quota_inode)) {
557 error = PTR_ERR(sdp->sd_quota_inode); 841 error = PTR_ERR(sdp->sd_quota_inode);
558 fs_err(sdp, "can't get quota file inode: %d\n", error); 842 fs_err(sdp, "can't get quota file inode: %d\n", error);
@@ -571,8 +855,6 @@ fail_inum:
571 iput(sdp->sd_inum_inode); 855 iput(sdp->sd_inum_inode);
572fail_journal: 856fail_journal:
573 init_journal(sdp, UNDO); 857 init_journal(sdp, UNDO);
574fail_master:
575 iput(sdp->sd_master_dir);
576fail: 858fail:
577 return error; 859 return error;
578} 860}
@@ -583,6 +865,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
583 char buf[30]; 865 char buf[30];
584 int error = 0; 866 int error = 0;
585 struct gfs2_inode *ip; 867 struct gfs2_inode *ip;
868 struct inode *master = sdp->sd_master_dir->d_inode;
586 869
587 if (sdp->sd_args.ar_spectator) 870 if (sdp->sd_args.ar_spectator)
588 return 0; 871 return 0;
@@ -590,7 +873,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
590 if (undo) 873 if (undo)
591 goto fail_qc_gh; 874 goto fail_qc_gh;
592 875
593 pn = gfs2_lookup_simple(sdp->sd_master_dir, "per_node"); 876 pn = gfs2_lookup_simple(master, "per_node");
594 if (IS_ERR(pn)) { 877 if (IS_ERR(pn)) {
595 error = PTR_ERR(pn); 878 error = PTR_ERR(pn);
596 fs_err(sdp, "can't find per_node directory: %d\n", error); 879 fs_err(sdp, "can't find per_node directory: %d\n", error);
@@ -800,7 +1083,11 @@ static int fill_super(struct super_block *sb, void *data, int silent)
800 goto fail; 1083 goto fail;
801 } 1084 }
802 1085
803 init_vfs(sb, SDF_NOATIME); 1086 sb->s_magic = GFS2_MAGIC;
1087 sb->s_op = &gfs2_super_ops;
1088 sb->s_export_op = &gfs2_export_ops;
1089 sb->s_time_gran = 1;
1090 sb->s_maxbytes = MAX_LFS_FILESIZE;
804 1091
805 /* Set up the buffer cache and fill in some fake block size values 1092 /* Set up the buffer cache and fill in some fake block size values
806 to allow us to read-in the on-disk superblock. */ 1093 to allow us to read-in the on-disk superblock. */
@@ -828,7 +1115,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
828 if (error) 1115 if (error)
829 goto fail_lm; 1116 goto fail_lm;
830 1117
831 error = init_sb(sdp, silent, DO); 1118 error = init_sb(sdp, silent);
832 if (error) 1119 if (error)
833 goto fail_locking; 1120 goto fail_locking;
834 1121
@@ -869,7 +1156,11 @@ fail_per_node:
869fail_inodes: 1156fail_inodes:
870 init_inodes(sdp, UNDO); 1157 init_inodes(sdp, UNDO);
871fail_sb: 1158fail_sb:
872 init_sb(sdp, 0, UNDO); 1159 if (sdp->sd_root_dir)
1160 dput(sdp->sd_root_dir);
1161 if (sdp->sd_master_dir)
1162 dput(sdp->sd_master_dir);
1163 sb->s_root = NULL;
873fail_locking: 1164fail_locking:
874 init_locking(sdp, &mount_gh, UNDO); 1165 init_locking(sdp, &mount_gh, UNDO);
875fail_lm: 1166fail_lm:
@@ -887,151 +1178,63 @@ fail:
887} 1178}
888 1179
889static int gfs2_get_sb(struct file_system_type *fs_type, int flags, 1180static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
890 const char *dev_name, void *data, struct vfsmount *mnt) 1181 const char *dev_name, void *data, struct vfsmount *mnt)
891{ 1182{
892 struct super_block *sb; 1183 return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
893 struct gfs2_sbd *sdp;
894 int error = get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
895 if (error)
896 goto out;
897 sb = mnt->mnt_sb;
898 sdp = sb->s_fs_info;
899 sdp->sd_gfs2mnt = mnt;
900out:
901 return error;
902} 1184}
903 1185
904static int fill_super_meta(struct super_block *sb, struct super_block *new, 1186static struct super_block *get_gfs2_sb(const char *dev_name)
905 void *data, int silent)
906{ 1187{
907 struct gfs2_sbd *sdp = sb->s_fs_info; 1188 struct super_block *sb;
908 struct inode *inode;
909 int error = 0;
910
911 new->s_fs_info = sdp;
912 sdp->sd_vfs_meta = sb;
913
914 init_vfs(new, SDF_NOATIME);
915
916 /* Get the master inode */
917 inode = igrab(sdp->sd_master_dir);
918
919 new->s_root = d_alloc_root(inode);
920 if (!new->s_root) {
921 fs_err(sdp, "can't get root dentry\n");
922 error = -ENOMEM;
923 iput(inode);
924 } else
925 new->s_root->d_op = &gfs2_dops;
926
927 return error;
928}
929
930static int set_bdev_super(struct super_block *s, void *data)
931{
932 s->s_bdev = data;
933 s->s_dev = s->s_bdev->bd_dev;
934 return 0;
935}
936
937static int test_bdev_super(struct super_block *s, void *data)
938{
939 return s->s_bdev == data;
940}
941
942static struct super_block* get_gfs2_sb(const char *dev_name)
943{
944 struct kstat stat;
945 struct nameidata nd; 1189 struct nameidata nd;
946 struct super_block *sb = NULL, *s;
947 int error; 1190 int error;
948 1191
949 error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd); 1192 error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
950 if (error) { 1193 if (error) {
951 printk(KERN_WARNING "GFS2: path_lookup on %s returned error\n", 1194 printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
952 dev_name); 1195 dev_name, error);
953 goto out; 1196 return NULL;
954 }
955 error = vfs_getattr(nd.path.mnt, nd.path.dentry, &stat);
956
957 list_for_each_entry(s, &gfs2_fs_type.fs_supers, s_instances) {
958 if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) ||
959 (S_ISDIR(stat.mode) &&
960 s == nd.path.dentry->d_inode->i_sb)) {
961 sb = s;
962 goto free_nd;
963 }
964 } 1197 }
965 1198 sb = nd.path.dentry->d_inode->i_sb;
966 printk(KERN_WARNING "GFS2: Unrecognized block device or " 1199 if (sb && (sb->s_type == &gfs2_fs_type))
967 "mount point %s\n", dev_name); 1200 atomic_inc(&sb->s_active);
968 1201 else
969free_nd: 1202 sb = NULL;
970 path_put(&nd.path); 1203 path_put(&nd.path);
971out:
972 return sb; 1204 return sb;
973} 1205}
974 1206
975static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, 1207static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
976 const char *dev_name, void *data, struct vfsmount *mnt) 1208 const char *dev_name, void *data, struct vfsmount *mnt)
977{ 1209{
978 int error = 0; 1210 struct super_block *sb = NULL;
979 struct super_block *sb = NULL, *new;
980 struct gfs2_sbd *sdp; 1211 struct gfs2_sbd *sdp;
981 1212
982 sb = get_gfs2_sb(dev_name); 1213 sb = get_gfs2_sb(dev_name);
983 if (!sb) { 1214 if (!sb) {
984 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); 1215 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
985 error = -ENOENT; 1216 return -ENOENT;
986 goto error;
987 } 1217 }
988 sdp = sb->s_fs_info; 1218 sdp = sb->s_fs_info;
989 if (sdp->sd_vfs_meta) { 1219 mnt->mnt_sb = sb;
990 printk(KERN_WARNING "GFS2: gfs2meta mount already exists\n"); 1220 mnt->mnt_root = dget(sdp->sd_master_dir);
991 error = -EBUSY; 1221 return 0;
992 goto error;
993 }
994 down(&sb->s_bdev->bd_mount_sem);
995 new = sget(fs_type, test_bdev_super, set_bdev_super, sb->s_bdev);
996 up(&sb->s_bdev->bd_mount_sem);
997 if (IS_ERR(new)) {
998 error = PTR_ERR(new);
999 goto error;
1000 }
1001 new->s_flags = flags;
1002 strlcpy(new->s_id, sb->s_id, sizeof(new->s_id));
1003 sb_set_blocksize(new, sb->s_blocksize);
1004 error = fill_super_meta(sb, new, data, flags & MS_SILENT ? 1 : 0);
1005 if (error) {
1006 up_write(&new->s_umount);
1007 deactivate_super(new);
1008 goto error;
1009 }
1010
1011 new->s_flags |= MS_ACTIVE;
1012
1013 /* Grab a reference to the gfs2 mount point */
1014 atomic_inc(&sdp->sd_gfs2mnt->mnt_count);
1015 return simple_set_mnt(mnt, new);
1016error:
1017 return error;
1018} 1222}
1019 1223
1020static void gfs2_kill_sb(struct super_block *sb) 1224static void gfs2_kill_sb(struct super_block *sb)
1021{ 1225{
1022 if (sb->s_fs_info) { 1226 struct gfs2_sbd *sdp = sb->s_fs_info;
1023 gfs2_delete_debugfs_file(sb->s_fs_info); 1227 if (sdp) {
1024 gfs2_meta_syncfs(sb->s_fs_info); 1228 gfs2_meta_syncfs(sdp);
1229 dput(sdp->sd_root_dir);
1230 dput(sdp->sd_master_dir);
1231 sdp->sd_root_dir = NULL;
1232 sdp->sd_master_dir = NULL;
1025 } 1233 }
1234 shrink_dcache_sb(sb);
1026 kill_block_super(sb); 1235 kill_block_super(sb);
1027} 1236 if (sdp)
1028 1237 gfs2_delete_debugfs_file(sdp);
1029static void gfs2_kill_sb_meta(struct super_block *sb)
1030{
1031 struct gfs2_sbd *sdp = sb->s_fs_info;
1032 generic_shutdown_super(sb);
1033 sdp->sd_vfs_meta = NULL;
1034 atomic_dec(&sdp->sd_gfs2mnt->mnt_count);
1035} 1238}
1036 1239
1037struct file_system_type gfs2_fs_type = { 1240struct file_system_type gfs2_fs_type = {
@@ -1046,7 +1249,6 @@ struct file_system_type gfs2meta_fs_type = {
1046 .name = "gfs2meta", 1249 .name = "gfs2meta",
1047 .fs_flags = FS_REQUIRES_DEV, 1250 .fs_flags = FS_REQUIRES_DEV,
1048 .get_sb = gfs2_get_sb_meta, 1251 .get_sb = gfs2_get_sb_meta,
1049 .kill_sb = gfs2_kill_sb_meta,
1050 .owner = THIS_MODULE, 1252 .owner = THIS_MODULE,
1051}; 1253};
1052 1254
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index e2c62f73a778..534e1e2c65ca 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -159,9 +159,13 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
159 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); 159 gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
160 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); 160 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
161 161
162 error = gfs2_glock_nq_m(2, ghs); 162 error = gfs2_glock_nq(ghs); /* parent */
163 if (error) 163 if (error)
164 goto out; 164 goto out_parent;
165
166 error = gfs2_glock_nq(ghs + 1); /* child */
167 if (error)
168 goto out_child;
165 169
166 error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC); 170 error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC);
167 if (error) 171 if (error)
@@ -245,8 +249,10 @@ out_alloc:
245 if (alloc_required) 249 if (alloc_required)
246 gfs2_alloc_put(dip); 250 gfs2_alloc_put(dip);
247out_gunlock: 251out_gunlock:
248 gfs2_glock_dq_m(2, ghs); 252 gfs2_glock_dq(ghs + 1);
249out: 253out_child:
254 gfs2_glock_dq(ghs);
255out_parent:
250 gfs2_holder_uninit(ghs); 256 gfs2_holder_uninit(ghs);
251 gfs2_holder_uninit(ghs + 1); 257 gfs2_holder_uninit(ghs + 1);
252 if (!error) { 258 if (!error) {
@@ -302,7 +308,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
302 308
303 error = gfs2_unlink_ok(dip, &dentry->d_name, ip); 309 error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
304 if (error) 310 if (error)
305 goto out_rgrp; 311 goto out_gunlock;
306 312
307 error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0); 313 error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
308 if (error) 314 if (error)
@@ -316,6 +322,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
316 322
317out_end_trans: 323out_end_trans:
318 gfs2_trans_end(sdp); 324 gfs2_trans_end(sdp);
325out_gunlock:
319 gfs2_glock_dq(ghs + 2); 326 gfs2_glock_dq(ghs + 2);
320out_rgrp: 327out_rgrp:
321 gfs2_holder_uninit(ghs + 2); 328 gfs2_holder_uninit(ghs + 2);
@@ -485,7 +492,6 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
485 struct gfs2_holder ri_gh; 492 struct gfs2_holder ri_gh;
486 int error; 493 int error;
487 494
488
489 error = gfs2_rindex_hold(sdp, &ri_gh); 495 error = gfs2_rindex_hold(sdp, &ri_gh);
490 if (error) 496 if (error)
491 return error; 497 return error;
@@ -495,9 +501,17 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
495 rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); 501 rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
496 gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); 502 gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
497 503
498 error = gfs2_glock_nq_m(3, ghs); 504 error = gfs2_glock_nq(ghs); /* parent */
499 if (error) 505 if (error)
500 goto out; 506 goto out_parent;
507
508 error = gfs2_glock_nq(ghs + 1); /* child */
509 if (error)
510 goto out_child;
511
512 error = gfs2_glock_nq(ghs + 2); /* rgrp */
513 if (error)
514 goto out_rgrp;
501 515
502 error = gfs2_unlink_ok(dip, &dentry->d_name, ip); 516 error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
503 if (error) 517 if (error)
@@ -523,11 +537,15 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
523 gfs2_trans_end(sdp); 537 gfs2_trans_end(sdp);
524 538
525out_gunlock: 539out_gunlock:
526 gfs2_glock_dq_m(3, ghs); 540 gfs2_glock_dq(ghs + 2);
527out: 541out_rgrp:
528 gfs2_holder_uninit(ghs);
529 gfs2_holder_uninit(ghs + 1);
530 gfs2_holder_uninit(ghs + 2); 542 gfs2_holder_uninit(ghs + 2);
543 gfs2_glock_dq(ghs + 1);
544out_child:
545 gfs2_holder_uninit(ghs + 1);
546 gfs2_glock_dq(ghs);
547out_parent:
548 gfs2_holder_uninit(ghs);
531 gfs2_glock_dq_uninit(&ri_gh); 549 gfs2_glock_dq_uninit(&ri_gh);
532 return error; 550 return error;
533} 551}
@@ -571,6 +589,54 @@ static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
571 return 0; 589 return 0;
572} 590}
573 591
592/*
593 * gfs2_ok_to_move - check if it's ok to move a directory to another directory
594 * @this: move this
595 * @to: to here
596 *
597 * Follow @to back to the root and make sure we don't encounter @this
598 * Assumes we already hold the rename lock.
599 *
600 * Returns: errno
601 */
602
603static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
604{
605 struct inode *dir = &to->i_inode;
606 struct super_block *sb = dir->i_sb;
607 struct inode *tmp;
608 struct qstr dotdot;
609 int error = 0;
610
611 gfs2_str2qstr(&dotdot, "..");
612
613 igrab(dir);
614
615 for (;;) {
616 if (dir == &this->i_inode) {
617 error = -EINVAL;
618 break;
619 }
620 if (dir == sb->s_root->d_inode) {
621 error = 0;
622 break;
623 }
624
625 tmp = gfs2_lookupi(dir, &dotdot, 1);
626 if (IS_ERR(tmp)) {
627 error = PTR_ERR(tmp);
628 break;
629 }
630
631 iput(dir);
632 dir = tmp;
633 }
634
635 iput(dir);
636
637 return error;
638}
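
The walk in gfs2_ok_to_move() has a direct userspace counterpart: chase ".." upwards from @to and compare (st_dev, st_ino) pairs until the root is reached. A minimal sketch, assuming plain POSIX stat() — purely illustrative, with its own return convention (1 = move allowed, 0 = refused, -1 = error):

#include <stdio.h>
#include <string.h>
#include <sys/stat.h>

static int ok_to_move(const char *this, const char *to)
{
	struct stat s_this, s_dir, s_parent;
	char path[4096];

	if (stat(this, &s_this) || stat(to, &s_dir))
		return -1;
	snprintf(path, sizeof(path), "%s", to);
	for (;;) {
		if (s_dir.st_dev == s_this.st_dev &&
		    s_dir.st_ino == s_this.st_ino)
			return 0;	/* 'to' lies inside 'this': refuse */
		strncat(path, "/..", sizeof(path) - strlen(path) - 1);
		if (stat(path, &s_parent))
			return -1;
		if (s_parent.st_dev == s_dir.st_dev &&
		    s_parent.st_ino == s_dir.st_ino)
			return 1;	/* reached the root: allowed */
		s_dir = s_parent;	/* keep climbing */
	}
}

int main(int argc, char **argv)
{
	if (argc != 3)
		return 2;
	printf("%d\n", ok_to_move(argv[1], argv[2]));
	return 0;
}
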
639
574/** 640/**
575 * gfs2_rename - Rename a file 641 * gfs2_rename - Rename a file
576 * @odir: Parent directory of old file name 642 * @odir: Parent directory of old file name
@@ -589,7 +655,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
589 struct gfs2_inode *ip = GFS2_I(odentry->d_inode); 655 struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
590 struct gfs2_inode *nip = NULL; 656 struct gfs2_inode *nip = NULL;
591 struct gfs2_sbd *sdp = GFS2_SB(odir); 657 struct gfs2_sbd *sdp = GFS2_SB(odir);
592 struct gfs2_holder ghs[5], r_gh; 658 struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, };
593 struct gfs2_rgrpd *nrgd; 659 struct gfs2_rgrpd *nrgd;
594 unsigned int num_gh; 660 unsigned int num_gh;
595 int dir_rename = 0; 661 int dir_rename = 0;
@@ -603,19 +669,20 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
603 return 0; 669 return 0;
604 } 670 }
605 671
606 /* Make sure we aren't trying to move a directory into its subdir */
607
608 if (S_ISDIR(ip->i_inode.i_mode) && odip != ndip) {
609 dir_rename = 1;
610 672
611 error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, 0, 673 if (odip != ndip) {
612 &r_gh); 674 error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
675 0, &r_gh);
613 if (error) 676 if (error)
614 goto out; 677 goto out;
615 678
616 error = gfs2_ok_to_move(ip, ndip); 679 if (S_ISDIR(ip->i_inode.i_mode)) {
617 if (error) 680 dir_rename = 1;
618 goto out_gunlock_r; 681 /* don't move a dirctory into it's subdir */
682 error = gfs2_ok_to_move(ip, ndip);
683 if (error)
684 goto out_gunlock_r;
685 }
619 } 686 }
620 687
621 num_gh = 1; 688 num_gh = 1;
@@ -639,9 +706,11 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
639 gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++); 706 gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++);
640 } 707 }
641 708
642 error = gfs2_glock_nq_m(num_gh, ghs); 709 for (x = 0; x < num_gh; x++) {
643 if (error) 710 error = gfs2_glock_nq(ghs + x);
644 goto out_uninit; 711 if (error)
712 goto out_gunlock;
713 }
645 714
646 /* Check out the old directory */ 715 /* Check out the old directory */
647 716
@@ -804,12 +873,12 @@ out_alloc:
804 if (alloc_required) 873 if (alloc_required)
805 gfs2_alloc_put(ndip); 874 gfs2_alloc_put(ndip);
806out_gunlock: 875out_gunlock:
807 gfs2_glock_dq_m(num_gh, ghs); 876 while (x--) {
808out_uninit: 877 gfs2_glock_dq(ghs + x);
809 for (x = 0; x < num_gh; x++)
810 gfs2_holder_uninit(ghs + x); 878 gfs2_holder_uninit(ghs + x);
879 }
811out_gunlock_r: 880out_gunlock_r:
812 if (dir_rename) 881 if (r_gh.gh_gl)
813 gfs2_glock_dq_uninit(&r_gh); 882 gfs2_glock_dq_uninit(&r_gh);
814out: 883out:
815 return error; 884 return error;
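
The rename path now takes its glocks one at a time and, on failure, releases only those already acquired via the `while (x--)` unwind. The same acquire/partial-unwind pattern, sketched with POSIX mutexes rather than glocks:

#include <pthread.h>
#include <stdio.h>

/* Take locks in order; on failure release, in reverse order, only the
 * ones already held — x points just past the last successful lock. */
static int lock_all(pthread_mutex_t **locks, unsigned int num)
{
	unsigned int x;
	int error;

	for (x = 0; x < num; x++) {
		error = pthread_mutex_trylock(locks[x]);
		if (error)
			goto out_unlock;
	}
	return 0;
out_unlock:
	while (x--)
		pthread_mutex_unlock(locks[x]);
	return error;
}

int main(void)
{
	pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t *locks[] = { &a, &b };

	printf("lock_all: %d\n", lock_all(locks, 2));
	return 0;
}
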
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index f66ea0f7a356..d5355d9b5926 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -20,6 +20,7 @@
20#include <linux/gfs2_ondisk.h> 20#include <linux/gfs2_ondisk.h>
21#include <linux/crc32.h> 21#include <linux/crc32.h>
22#include <linux/lm_interface.h> 22#include <linux/lm_interface.h>
23#include <linux/time.h>
23 24
24#include "gfs2.h" 25#include "gfs2.h"
25#include "incore.h" 26#include "incore.h"
@@ -38,6 +39,7 @@
38#include "dir.h" 39#include "dir.h"
39#include "eattr.h" 40#include "eattr.h"
40#include "bmap.h" 41#include "bmap.h"
42#include "meta_io.h"
41 43
42/** 44/**
43 * gfs2_write_inode - Make sure the inode is stable on the disk 45 * gfs2_write_inode - Make sure the inode is stable on the disk
@@ -50,16 +52,74 @@
50static int gfs2_write_inode(struct inode *inode, int sync) 52static int gfs2_write_inode(struct inode *inode, int sync)
51{ 53{
52 struct gfs2_inode *ip = GFS2_I(inode); 54 struct gfs2_inode *ip = GFS2_I(inode);
53 55 struct gfs2_sbd *sdp = GFS2_SB(inode);
54 /* Check this is a "normal" inode */ 56 struct gfs2_holder gh;
55 if (test_bit(GIF_USER, &ip->i_flags)) { 57 struct buffer_head *bh;
56 if (current->flags & PF_MEMALLOC) 58 struct timespec atime;
57 return 0; 59 struct gfs2_dinode *di;
58 if (sync) 60 int ret = 0;
59 gfs2_log_flush(GFS2_SB(inode), ip->i_gl); 61
62 /* Check this is a "normal" inode, etc */
63 if (!test_bit(GIF_USER, &ip->i_flags) ||
64 (current->flags & PF_MEMALLOC))
65 return 0;
66 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
67 if (ret)
68 goto do_flush;
69 ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
70 if (ret)
71 goto do_unlock;
72 ret = gfs2_meta_inode_buffer(ip, &bh);
73 if (ret == 0) {
74 di = (struct gfs2_dinode *)bh->b_data;
75 atime.tv_sec = be64_to_cpu(di->di_atime);
76 atime.tv_nsec = be32_to_cpu(di->di_atime_nsec);
77 if (timespec_compare(&inode->i_atime, &atime) > 0) {
78 gfs2_trans_add_bh(ip->i_gl, bh, 1);
79 gfs2_dinode_out(ip, bh->b_data);
80 }
81 brelse(bh);
60 } 82 }
83 gfs2_trans_end(sdp);
84do_unlock:
85 gfs2_glock_dq_uninit(&gh);
86do_flush:
87 if (sync != 0)
88 gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
89 return ret;
90}
61 91
62 return 0; 92/**
93 * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
94 * @sdp: the filesystem
95 *
96 * Returns: errno
97 */
98
99static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
100{
101 struct gfs2_holder t_gh;
102 int error;
103
104 gfs2_quota_sync(sdp);
105 gfs2_statfs_sync(sdp);
106
107 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
108 &t_gh);
109 if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
110 return error;
111
112 gfs2_meta_syncfs(sdp);
113 gfs2_log_shutdown(sdp);
114
115 clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
116
117 if (t_gh.gh_gl)
118 gfs2_glock_dq_uninit(&t_gh);
119
120 gfs2_quota_cleanup(sdp);
121
122 return error;
63} 123}
64 124
65/** 125/**
@@ -73,12 +133,6 @@ static void gfs2_put_super(struct super_block *sb)
73 struct gfs2_sbd *sdp = sb->s_fs_info; 133 struct gfs2_sbd *sdp = sb->s_fs_info;
74 int error; 134 int error;
75 135
76 if (!sdp)
77 return;
78
79 if (!strncmp(sb->s_type->name, "gfs2meta", 8))
80 return; /* Nothing to do */
81
82 /* Unfreeze the filesystem, if we need to */ 136 /* Unfreeze the filesystem, if we need to */
83 137
84 mutex_lock(&sdp->sd_freeze_lock); 138 mutex_lock(&sdp->sd_freeze_lock);
@@ -101,7 +155,6 @@ static void gfs2_put_super(struct super_block *sb)
101 155
102 /* Release stuff */ 156 /* Release stuff */
103 157
104 iput(sdp->sd_master_dir);
105 iput(sdp->sd_jindex); 158 iput(sdp->sd_jindex);
106 iput(sdp->sd_inum_inode); 159 iput(sdp->sd_inum_inode);
107 iput(sdp->sd_statfs_inode); 160 iput(sdp->sd_statfs_inode);
@@ -152,6 +205,7 @@ static void gfs2_write_super(struct super_block *sb)
152 * 205 *
153 * Flushes the log to disk. 206 * Flushes the log to disk.
154 */ 207 */
208
155static int gfs2_sync_fs(struct super_block *sb, int wait) 209static int gfs2_sync_fs(struct super_block *sb, int wait)
156{ 210{
157 sb->s_dirt = 0; 211 sb->s_dirt = 0;
@@ -270,14 +324,6 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
270 } 324 }
271 } 325 }
272 326
273 if (*flags & (MS_NOATIME | MS_NODIRATIME))
274 set_bit(SDF_NOATIME, &sdp->sd_flags);
275 else
276 clear_bit(SDF_NOATIME, &sdp->sd_flags);
277
278 /* Don't let the VFS update atimes. GFS2 handles this itself. */
279 *flags |= MS_NOATIME | MS_NODIRATIME;
280
281 return error; 327 return error;
282} 328}
283 329
@@ -295,6 +341,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
295 * inode's blocks, or alternatively pass the baton on to another 341 * inode's blocks, or alternatively pass the baton on to another
296 * node for later deallocation. 342 * node for later deallocation.
297 */ 343 */
344
298static void gfs2_drop_inode(struct inode *inode) 345static void gfs2_drop_inode(struct inode *inode)
299{ 346{
300 struct gfs2_inode *ip = GFS2_I(inode); 347 struct gfs2_inode *ip = GFS2_I(inode);
@@ -333,6 +380,16 @@ static void gfs2_clear_inode(struct inode *inode)
333 } 380 }
334} 381}
335 382
383static int is_ancestor(const struct dentry *d1, const struct dentry *d2)
384{
385 do {
386 if (d1 == d2)
387 return 1;
388 d1 = d1->d_parent;
389 } while (!IS_ROOT(d1));
390 return 0;
391}
392
336/** 393/**
337 * gfs2_show_options - Show mount options for /proc/mounts 394 * gfs2_show_options - Show mount options for /proc/mounts
338 * @s: seq_file structure 395 * @s: seq_file structure
@@ -346,6 +403,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
346 struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info; 403 struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
347 struct gfs2_args *args = &sdp->sd_args; 404 struct gfs2_args *args = &sdp->sd_args;
348 405
406 if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir))
407 seq_printf(s, ",meta");
349 if (args->ar_lockproto[0]) 408 if (args->ar_lockproto[0])
350 seq_printf(s, ",lockproto=%s", args->ar_lockproto); 409 seq_printf(s, ",lockproto=%s", args->ar_lockproto);
351 if (args->ar_locktable[0]) 410 if (args->ar_locktable[0])
@@ -414,6 +473,7 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
414 * conversion on the iopen lock, but we can change that later. This 473 * conversion on the iopen lock, but we can change that later. This
415 * is safe, just less efficient. 474 * is safe, just less efficient.
416 */ 475 */
476
417static void gfs2_delete_inode(struct inode *inode) 477static void gfs2_delete_inode(struct inode *inode)
418{ 478{
419 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; 479 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
@@ -478,8 +538,6 @@ out:
478 clear_inode(inode); 538 clear_inode(inode);
479} 539}
480 540
481
482
483static struct inode *gfs2_alloc_inode(struct super_block *sb) 541static struct inode *gfs2_alloc_inode(struct super_block *sb)
484{ 542{
485 struct gfs2_inode *ip; 543 struct gfs2_inode *ip;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index ca831991cbc2..c3ba3d9d0aac 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -33,313 +33,6 @@
33#include "trans.h" 33#include "trans.h"
34#include "util.h" 34#include "util.h"
35 35
36static const u32 gfs2_old_fs_formats[] = {
37 0
38};
39
40static const u32 gfs2_old_multihost_formats[] = {
41 0
42};
43
44/**
45 * gfs2_tune_init - Fill a gfs2_tune structure with default values
46 * @gt: tune
47 *
48 */
49
50void gfs2_tune_init(struct gfs2_tune *gt)
51{
52 spin_lock_init(&gt->gt_spin);
53
54 gt->gt_demote_secs = 300;
55 gt->gt_incore_log_blocks = 1024;
56 gt->gt_log_flush_secs = 60;
57 gt->gt_recoverd_secs = 60;
58 gt->gt_logd_secs = 1;
59 gt->gt_quotad_secs = 5;
60 gt->gt_quota_simul_sync = 64;
61 gt->gt_quota_warn_period = 10;
62 gt->gt_quota_scale_num = 1;
63 gt->gt_quota_scale_den = 1;
64 gt->gt_quota_cache_secs = 300;
65 gt->gt_quota_quantum = 60;
66 gt->gt_atime_quantum = 3600;
67 gt->gt_new_files_jdata = 0;
68 gt->gt_max_readahead = 1 << 18;
69 gt->gt_stall_secs = 600;
70 gt->gt_complain_secs = 10;
71 gt->gt_statfs_quantum = 30;
72 gt->gt_statfs_slow = 0;
73}
74
75/**
76 * gfs2_check_sb - Check superblock
77 * @sdp: the filesystem
78 * @sb: The superblock
79 * @silent: Don't print a message if the check fails
80 *
81 * Checks the version code of the FS is one that we understand how to
82 * read and that the sizes of the various on-disk structures have not
83 * changed.
84 */
85
86int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
87{
88 unsigned int x;
89
90 if (sb->sb_magic != GFS2_MAGIC ||
91 sb->sb_type != GFS2_METATYPE_SB) {
92 if (!silent)
93 printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n");
94 return -EINVAL;
95 }
96
97 /* If format numbers match exactly, we're done. */
98
99 if (sb->sb_fs_format == GFS2_FORMAT_FS &&
100 sb->sb_multihost_format == GFS2_FORMAT_MULTI)
101 return 0;
102
103 if (sb->sb_fs_format != GFS2_FORMAT_FS) {
104 for (x = 0; gfs2_old_fs_formats[x]; x++)
105 if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
106 break;
107
108 if (!gfs2_old_fs_formats[x]) {
109 printk(KERN_WARNING
110 "GFS2: code version (%u, %u) is incompatible "
111 "with ondisk format (%u, %u)\n",
112 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
113 sb->sb_fs_format, sb->sb_multihost_format);
114 printk(KERN_WARNING
115 "GFS2: I don't know how to upgrade this FS\n");
116 return -EINVAL;
117 }
118 }
119
120 if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
121 for (x = 0; gfs2_old_multihost_formats[x]; x++)
122 if (gfs2_old_multihost_formats[x] ==
123 sb->sb_multihost_format)
124 break;
125
126 if (!gfs2_old_multihost_formats[x]) {
127 printk(KERN_WARNING
128 "GFS2: code version (%u, %u) is incompatible "
129 "with ondisk format (%u, %u)\n",
130 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
131 sb->sb_fs_format, sb->sb_multihost_format);
132 printk(KERN_WARNING
133 "GFS2: I don't know how to upgrade this FS\n");
134 return -EINVAL;
135 }
136 }
137
138 if (!sdp->sd_args.ar_upgrade) {
139 printk(KERN_WARNING
140 "GFS2: code version (%u, %u) is incompatible "
141 "with ondisk format (%u, %u)\n",
142 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
143 sb->sb_fs_format, sb->sb_multihost_format);
144 printk(KERN_INFO
145 "GFS2: Use the \"upgrade\" mount option to upgrade "
146 "the FS\n");
147 printk(KERN_INFO "GFS2: See the manual for more details\n");
148 return -EINVAL;
149 }
150
151 return 0;
152}
153
154
155static void end_bio_io_page(struct bio *bio, int error)
156{
157 struct page *page = bio->bi_private;
158
159 if (!error)
160 SetPageUptodate(page);
161 else
162 printk(KERN_WARNING "gfs2: error %d reading superblock\n", error);
163 unlock_page(page);
164}
165
166static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
167{
168 const struct gfs2_sb *str = buf;
169
170 sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic);
171 sb->sb_type = be32_to_cpu(str->sb_header.mh_type);
172 sb->sb_format = be32_to_cpu(str->sb_header.mh_format);
173 sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
174 sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
175 sb->sb_bsize = be32_to_cpu(str->sb_bsize);
176 sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);
177 sb->sb_master_dir.no_addr = be64_to_cpu(str->sb_master_dir.no_addr);
178 sb->sb_master_dir.no_formal_ino = be64_to_cpu(str->sb_master_dir.no_formal_ino);
179 sb->sb_root_dir.no_addr = be64_to_cpu(str->sb_root_dir.no_addr);
180 sb->sb_root_dir.no_formal_ino = be64_to_cpu(str->sb_root_dir.no_formal_ino);
181
182 memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
183 memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
184}
185
186/**
187 * gfs2_read_super - Read the gfs2 super block from disk
188 * @sdp: The GFS2 super block
189 * @sector: The location of the super block
190 * @error: The error code to return
191 *
192 * This uses the bio functions to read the super block from disk
193 * because we want to be 100% sure that we never read cached data.
194 * A super block is read only twice during each GFS2 mount and is
195 * never written to by the filesystem. The first time it's read, no
196 * locks are held, and the only details which are looked at are those
197 * relating to the locking protocol. Once locking is up and working,
198 * the sb is read again under the lock to establish the location of
199 * the master directory (contains pointers to journals etc) and the
200 * root directory.
201 *
202 * Returns: 0 on success or error
203 */
204
205int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
206{
207 struct super_block *sb = sdp->sd_vfs;
208 struct gfs2_sb *p;
209 struct page *page;
210 struct bio *bio;
211
212 page = alloc_page(GFP_NOFS);
213 if (unlikely(!page))
214 return -ENOBUFS;
215
216 ClearPageUptodate(page);
217 ClearPageDirty(page);
218 lock_page(page);
219
220 bio = bio_alloc(GFP_NOFS, 1);
221 if (unlikely(!bio)) {
222 __free_page(page);
223 return -ENOBUFS;
224 }
225
226 bio->bi_sector = sector * (sb->s_blocksize >> 9);
227 bio->bi_bdev = sb->s_bdev;
228 bio_add_page(bio, page, PAGE_SIZE, 0);
229
230 bio->bi_end_io = end_bio_io_page;
231 bio->bi_private = page;
232 submit_bio(READ_SYNC | (1 << BIO_RW_META), bio);
233 wait_on_page_locked(page);
234 bio_put(bio);
235 if (!PageUptodate(page)) {
236 __free_page(page);
237 return -EIO;
238 }
239 p = kmap(page);
240 gfs2_sb_in(&sdp->sd_sb, p);
241 kunmap(page);
242 __free_page(page);
243 return 0;
244}
245
246/**
247 * gfs2_read_sb - Read super block
248 * @sdp: The GFS2 superblock
249 * @gl: the glock for the superblock (assumed to be held)
250 * @silent: Don't print message if mount fails
251 *
252 */
253
254int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
255{
256 u32 hash_blocks, ind_blocks, leaf_blocks;
257 u32 tmp_blocks;
258 unsigned int x;
259 int error;
260
261 error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
262 if (error) {
263 if (!silent)
264 fs_err(sdp, "can't read superblock\n");
265 return error;
266 }
267
268 error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
269 if (error)
270 return error;
271
272 sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
273 GFS2_BASIC_BLOCK_SHIFT;
274 sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
275 sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
276 sizeof(struct gfs2_dinode)) / sizeof(u64);
277 sdp->sd_inptrs = (sdp->sd_sb.sb_bsize -
278 sizeof(struct gfs2_meta_header)) / sizeof(u64);
279 sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header);
280 sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
281 sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
282 sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64);
283 sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
284 sizeof(struct gfs2_meta_header)) /
285 sizeof(struct gfs2_quota_change);
286
287 /* Compute maximum reservation required to add an entry to a directory */
288
289 hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH),
290 sdp->sd_jbsize);
291
292 ind_blocks = 0;
293 for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
294 tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs);
295 ind_blocks += tmp_blocks;
296 }
297
298 leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH;
299
300 sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
301
302 sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize -
303 sizeof(struct gfs2_dinode);
304 sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
305 for (x = 2;; x++) {
306 u64 space, d;
307 u32 m;
308
309 space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
310 d = space;
311 m = do_div(d, sdp->sd_inptrs);
312
313 if (d != sdp->sd_heightsize[x - 1] || m)
314 break;
315 sdp->sd_heightsize[x] = space;
316 }
317 sdp->sd_max_height = x;
318 sdp->sd_heightsize[x] = ~0;
319 gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
320
321 sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
322 sizeof(struct gfs2_dinode);
323 sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
324 for (x = 2;; x++) {
325 u64 space, d;
326 u32 m;
327
328 space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
329 d = space;
330 m = do_div(d, sdp->sd_inptrs);
331
332 if (d != sdp->sd_jheightsize[x - 1] || m)
333 break;
334 sdp->sd_jheightsize[x] = space;
335 }
336 sdp->sd_max_jheight = x;
337 sdp->sd_jheightsize[x] = ~0;
338 gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
339
340 return 0;
341}
342
343/** 36/**
344 * gfs2_jindex_hold - Grab a lock on the jindex 37 * gfs2_jindex_hold - Grab a lock on the jindex
345 * @sdp: The GFS2 superblock 38 * @sdp: The GFS2 superblock
@@ -581,39 +274,6 @@ fail:
581 return error; 274 return error;
582} 275}
583 276
584/**
585 * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
586 * @sdp: the filesystem
587 *
588 * Returns: errno
589 */
590
591int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
592{
593 struct gfs2_holder t_gh;
594 int error;
595
596 gfs2_quota_sync(sdp);
597 gfs2_statfs_sync(sdp);
598
599 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
600 &t_gh);
601 if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
602 return error;
603
604 gfs2_meta_syncfs(sdp);
605 gfs2_log_shutdown(sdp);
606
607 clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
608
609 if (t_gh.gh_gl)
610 gfs2_glock_dq_uninit(&t_gh);
611
612 gfs2_quota_cleanup(sdp);
613
614 return error;
615}
616
617static void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf) 277static void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf)
618{ 278{
619 const struct gfs2_statfs_change *str = buf; 279 const struct gfs2_statfs_change *str = buf;
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 44361ecc44f7..50a4c9b1215e 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -12,11 +12,6 @@
12 12
13#include "incore.h" 13#include "incore.h"
14 14
15void gfs2_tune_init(struct gfs2_tune *gt);
16
17int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent);
18int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent);
19int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector);
20void gfs2_lm_unmount(struct gfs2_sbd *sdp); 15void gfs2_lm_unmount(struct gfs2_sbd *sdp);
21 16
22static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) 17static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
@@ -40,7 +35,6 @@ int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
40 struct gfs2_inode **ipp); 35 struct gfs2_inode **ipp);
41 36
42int gfs2_make_fs_rw(struct gfs2_sbd *sdp); 37int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
43int gfs2_make_fs_ro(struct gfs2_sbd *sdp);
44 38
45int gfs2_statfs_init(struct gfs2_sbd *sdp); 39int gfs2_statfs_init(struct gfs2_sbd *sdp);
46void gfs2_statfs_change(struct gfs2_sbd *sdp, 40void gfs2_statfs_change(struct gfs2_sbd *sdp,
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 74846559fc3f..7e1879f1a02c 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -269,14 +269,6 @@ ARGS_ATTR(quota, "%u\n");
269ARGS_ATTR(suiddir, "%d\n"); 269ARGS_ATTR(suiddir, "%d\n");
270ARGS_ATTR(data, "%d\n"); 270ARGS_ATTR(data, "%d\n");
271 271
272/* one oddball doesn't fit the macro mold */
273static ssize_t noatime_show(struct gfs2_sbd *sdp, char *buf)
274{
275 return snprintf(buf, PAGE_SIZE, "%d\n",
276 !!test_bit(SDF_NOATIME, &sdp->sd_flags));
277}
278static struct args_attr args_attr_noatime = __ATTR_RO(noatime);
279
280static struct attribute *args_attrs[] = { 272static struct attribute *args_attrs[] = {
281 &args_attr_lockproto.attr, 273 &args_attr_lockproto.attr,
282 &args_attr_locktable.attr, 274 &args_attr_locktable.attr,
@@ -292,7 +284,6 @@ static struct attribute *args_attrs[] = {
292 &args_attr_quota.attr, 284 &args_attr_quota.attr,
293 &args_attr_suiddir.attr, 285 &args_attr_suiddir.attr,
294 &args_attr_data.attr, 286 &args_attr_data.attr,
295 &args_attr_noatime.attr,
296 NULL, 287 NULL,
297}; 288};
298 289
@@ -407,7 +398,6 @@ TUNE_ATTR(incore_log_blocks, 0);
407TUNE_ATTR(log_flush_secs, 0); 398TUNE_ATTR(log_flush_secs, 0);
408TUNE_ATTR(quota_warn_period, 0); 399TUNE_ATTR(quota_warn_period, 0);
409TUNE_ATTR(quota_quantum, 0); 400TUNE_ATTR(quota_quantum, 0);
410TUNE_ATTR(atime_quantum, 0);
411TUNE_ATTR(max_readahead, 0); 401TUNE_ATTR(max_readahead, 0);
412TUNE_ATTR(complain_secs, 0); 402TUNE_ATTR(complain_secs, 0);
413TUNE_ATTR(statfs_slow, 0); 403TUNE_ATTR(statfs_slow, 0);
@@ -427,7 +417,6 @@ static struct attribute *tune_attrs[] = {
427 &tune_attr_log_flush_secs.attr, 417 &tune_attr_log_flush_secs.attr,
428 &tune_attr_quota_warn_period.attr, 418 &tune_attr_quota_warn_period.attr,
429 &tune_attr_quota_quantum.attr, 419 &tune_attr_quota_quantum.attr,
430 &tune_attr_atime_quantum.attr,
431 &tune_attr_max_readahead.attr, 420 &tune_attr_max_readahead.attr,
432 &tune_attr_complain_secs.attr, 421 &tune_attr_complain_secs.attr,
433 &tune_attr_statfs_slow.attr, 422 &tune_attr_statfs_slow.attr,
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 4abb1047c689..3c7c7637719c 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -173,7 +173,7 @@ enum {
173 opt_err 173 opt_err
174}; 174};
175 175
176static match_table_t tokens = { 176static const match_table_t tokens = {
177 { opt_uid, "uid=%u" }, 177 { opt_uid, "uid=%u" },
178 { opt_gid, "gid=%u" }, 178 { opt_gid, "gid=%u" },
179 { opt_umask, "umask=%o" }, 179 { opt_umask, "umask=%o" },
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 9997cbf8beb5..9699c56d323f 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -25,7 +25,7 @@ enum {
25 opt_force, opt_err 25 opt_force, opt_err
26}; 26};
27 27
28static match_table_t tokens = { 28static const match_table_t tokens = {
29 { opt_creator, "creator=%s" }, 29 { opt_creator, "creator=%s" },
30 { opt_type, "type=%s" }, 30 { opt_type, "type=%s" },
31 { opt_umask, "umask=%o" }, 31 { opt_umask, "umask=%o" },
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index b8ae9c90ada0..29ad461d568f 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -215,7 +215,7 @@ enum {
215 Opt_timeshift, Opt_err, 215 Opt_timeshift, Opt_err,
216}; 216};
217 217
218static match_table_t tokens = { 218static const match_table_t tokens = {
219 {Opt_help, "help"}, 219 {Opt_help, "help"},
220 {Opt_uid, "uid=%u"}, 220 {Opt_uid, "uid=%u"},
221 {Opt_gid, "gid=%u"}, 221 {Opt_gid, "gid=%u"},
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 3f58923fb39b..61edc701b0e6 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -57,7 +57,7 @@ enum {
57 Opt_err, 57 Opt_err,
58}; 58};
59 59
60static match_table_t tokens = { 60static const match_table_t tokens = {
61 {Opt_size, "size=%s"}, 61 {Opt_size, "size=%s"},
62 {Opt_nr_inodes, "nr_inodes=%s"}, 62 {Opt_nr_inodes, "nr_inodes=%s"},
63 {Opt_mode, "mode=%o"}, 63 {Opt_mode, "mode=%o"},
diff --git a/fs/inode.c b/fs/inode.c
index b6726f644530..0487ddba1397 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -166,6 +166,7 @@ static struct inode *alloc_inode(struct super_block *sb)
166 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE); 166 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);
167 mapping->assoc_mapping = NULL; 167 mapping->assoc_mapping = NULL;
168 mapping->backing_dev_info = &default_backing_dev_info; 168 mapping->backing_dev_info = &default_backing_dev_info;
169 mapping->writeback_index = 0;
169 170
170 /* 171 /*
171 * If the block_device provides a backing_dev_info for client 172 * If the block_device provides a backing_dev_info for client
diff --git a/fs/inotify_user.c b/fs/inotify_user.c
index 60249429a253..d85c7d931cdf 100644
--- a/fs/inotify_user.c
+++ b/fs/inotify_user.c
@@ -323,7 +323,7 @@ out:
323} 323}
324 324
325/* 325/*
326 * remove_kevent - cleans up and ultimately frees the given kevent 326 * remove_kevent - cleans up the given kevent
327 * 327 *
328 * Caller must hold dev->ev_mutex. 328 * Caller must hold dev->ev_mutex.
329 */ 329 */
@@ -334,7 +334,13 @@ static void remove_kevent(struct inotify_device *dev,
334 334
335 dev->event_count--; 335 dev->event_count--;
336 dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len; 336 dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len;
337}
337 338
339/*
340 * free_kevent - frees the given kevent.
341 */
342static void free_kevent(struct inotify_kernel_event *kevent)
343{
338 kfree(kevent->name); 344 kfree(kevent->name);
339 kmem_cache_free(event_cachep, kevent); 345 kmem_cache_free(event_cachep, kevent);
340} 346}
@@ -350,6 +356,7 @@ static void inotify_dev_event_dequeue(struct inotify_device *dev)
350 struct inotify_kernel_event *kevent; 356 struct inotify_kernel_event *kevent;
351 kevent = inotify_dev_get_event(dev); 357 kevent = inotify_dev_get_event(dev);
352 remove_kevent(dev, kevent); 358 remove_kevent(dev, kevent);
359 free_kevent(kevent);
353 } 360 }
354} 361}
355 362
@@ -433,17 +440,15 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
433 dev = file->private_data; 440 dev = file->private_data;
434 441
435 while (1) { 442 while (1) {
436 int events;
437 443
438 prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE); 444 prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
439 445
440 mutex_lock(&dev->ev_mutex); 446 mutex_lock(&dev->ev_mutex);
441 events = !list_empty(&dev->events); 447 if (!list_empty(&dev->events)) {
442 mutex_unlock(&dev->ev_mutex);
443 if (events) {
444 ret = 0; 448 ret = 0;
445 break; 449 break;
446 } 450 }
451 mutex_unlock(&dev->ev_mutex);
447 452
448 if (file->f_flags & O_NONBLOCK) { 453 if (file->f_flags & O_NONBLOCK) {
449 ret = -EAGAIN; 454 ret = -EAGAIN;
@@ -462,7 +467,6 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
462 if (ret) 467 if (ret)
463 return ret; 468 return ret;
464 469
465 mutex_lock(&dev->ev_mutex);
466 while (1) { 470 while (1) {
467 struct inotify_kernel_event *kevent; 471 struct inotify_kernel_event *kevent;
468 472
@@ -481,6 +485,13 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
481 } 485 }
482 break; 486 break;
483 } 487 }
488 remove_kevent(dev, kevent);
489
490 /*
491 * Must perform the copy_to_user outside the mutex in order
492 * to avoid a lock order reversal with mmap_sem.
493 */
494 mutex_unlock(&dev->ev_mutex);
484 495
485 if (copy_to_user(buf, &kevent->event, event_size)) { 496 if (copy_to_user(buf, &kevent->event, event_size)) {
486 ret = -EFAULT; 497 ret = -EFAULT;
@@ -498,7 +509,9 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
498 count -= kevent->event.len; 509 count -= kevent->event.len;
499 } 510 }
500 511
501 remove_kevent(dev, kevent); 512 free_kevent(kevent);
513
514 mutex_lock(&dev->ev_mutex);
502 } 515 }
503 mutex_unlock(&dev->ev_mutex); 516 mutex_unlock(&dev->ev_mutex);
504 517
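
The reordering above exists because copy_to_user() can fault and take mmap_sem, so it must not run under ev_mutex; events are therefore detached while the lock is held and copied (then freed) after it is dropped. A simplified userspace sketch of that detach-then-copy discipline — the queue and event layout are stand-ins, not the inotify structures:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct event { struct event *next; char payload[32]; };

static pthread_mutex_t ev_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct event *queue;

static int read_one(char *buf, size_t len)
{
	struct event *ev;

	pthread_mutex_lock(&ev_mutex);
	ev = queue;
	if (ev)
		queue = ev->next;	/* detach under the lock */
	pthread_mutex_unlock(&ev_mutex);
	if (!ev)
		return 0;
	/* the copy runs with no queue lock held, mirroring how
	 * copy_to_user is kept outside ev_mutex above */
	strncpy(buf, ev->payload, len);
	free(ev);
	return 1;
}

int main(void)
{
	struct event *ev = calloc(1, sizeof(*ev));
	char buf[32];

	strcpy(ev->payload, "hello");
	queue = ev;
	if (read_one(buf, sizeof(buf)))
		printf("%s\n", buf);
	return 0;
}
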
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 7db32b3382d3..d152856c371b 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -13,9 +13,14 @@
13#include <linux/security.h> 13#include <linux/security.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/uaccess.h> 15#include <linux/uaccess.h>
16#include <linux/writeback.h>
17#include <linux/buffer_head.h>
16 18
17#include <asm/ioctls.h> 19#include <asm/ioctls.h>
18 20
21/* So that the fiemap access checks can't overflow on 32 bit machines. */
22#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent))
23
19/** 24/**
20 * vfs_ioctl - call filesystem specific ioctl methods 25 * vfs_ioctl - call filesystem specific ioctl methods
21 * @filp: open file to invoke ioctl method on 26 * @filp: open file to invoke ioctl method on
@@ -71,6 +76,276 @@ static int ioctl_fibmap(struct file *filp, int __user *p)
71 return put_user(res, p); 76 return put_user(res, p);
72} 77}
73 78
79/**
80 * fiemap_fill_next_extent - Fiemap helper function
81 * @fieinfo: Fiemap context passed into ->fiemap
82 * @logical: Extent logical start offset, in bytes
83 * @phys: Extent physical start offset, in bytes
84 * @len: Extent length, in bytes
85 * @flags: FIEMAP_EXTENT flags that describe this extent
86 *
87 * Called from file system ->fiemap callback. Will populate extent
88 * info as passed in via arguments and copy to user memory. On
89 * success, extent count on fieinfo is incremented.
90 *
91 * Returns 0 on success, -errno on error, 1 if this was the last
92 * extent that will fit in user array.
93 */
94#define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC)
95#define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED)
96#define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE)
97int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical,
98 u64 phys, u64 len, u32 flags)
99{
100 struct fiemap_extent extent;
101 struct fiemap_extent *dest = fieinfo->fi_extents_start;
102
103 /* only count the extents */
104 if (fieinfo->fi_extents_max == 0) {
105 fieinfo->fi_extents_mapped++;
106 return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0;
107 }
108
109 if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max)
110 return 1;
111
112 if (flags & SET_UNKNOWN_FLAGS)
113 flags |= FIEMAP_EXTENT_UNKNOWN;
114 if (flags & SET_NO_UNMOUNTED_IO_FLAGS)
115 flags |= FIEMAP_EXTENT_ENCODED;
116 if (flags & SET_NOT_ALIGNED_FLAGS)
117 flags |= FIEMAP_EXTENT_NOT_ALIGNED;
118
119 memset(&extent, 0, sizeof(extent));
120 extent.fe_logical = logical;
121 extent.fe_physical = phys;
122 extent.fe_length = len;
123 extent.fe_flags = flags;
124
125 dest += fieinfo->fi_extents_mapped;
126 if (copy_to_user(dest, &extent, sizeof(extent)))
127 return -EFAULT;
128
129 fieinfo->fi_extents_mapped++;
130 if (fieinfo->fi_extents_mapped == fieinfo->fi_extents_max)
131 return 1;
132 return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0;
133}
134EXPORT_SYMBOL(fiemap_fill_next_extent);
135
136/**
137 * fiemap_check_flags - check validity of requested flags for fiemap
138 * @fieinfo: Fiemap context passed into ->fiemap
139 * @fs_flags: Set of fiemap flags that the file system understands
140 *
141 * Called from file system ->fiemap callback. This will compute the
142 * intersection of valid fiemap flags and those that the fs supports. That
143 * value is then compared against the user supplied flags. In case of bad user
144 * flags, the invalid values will be written into the fieinfo structure, and
145 * -EBADR is returned, which tells ioctl_fiemap() to return those values to
146 * userspace. For this reason, a return code of -EBADR should be preserved.
147 *
148 * Returns 0 on success, -EBADR on bad flags.
149 */
150int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags)
151{
152 u32 incompat_flags;
153
154 incompat_flags = fieinfo->fi_flags & ~(FIEMAP_FLAGS_COMPAT & fs_flags);
155 if (incompat_flags) {
156 fieinfo->fi_flags = incompat_flags;
157 return -EBADR;
158 }
159 return 0;
160}
161EXPORT_SYMBOL(fiemap_check_flags);
162
163static int fiemap_check_ranges(struct super_block *sb,
164 u64 start, u64 len, u64 *new_len)
165{
166 *new_len = len;
167
168 if (len == 0)
169 return -EINVAL;
170
171 if (start > sb->s_maxbytes)
172 return -EFBIG;
173
174 /*
175 * Shrink request scope to what the fs can actually handle.
176 */
177 if ((len > sb->s_maxbytes) ||
178 (sb->s_maxbytes - len) < start)
179 *new_len = sb->s_maxbytes - start;
180
181 return 0;
182}
183
184static int ioctl_fiemap(struct file *filp, unsigned long arg)
185{
186 struct fiemap fiemap;
187 struct fiemap_extent_info fieinfo = { 0, };
188 struct inode *inode = filp->f_path.dentry->d_inode;
189 struct super_block *sb = inode->i_sb;
190 u64 len;
191 int error;
192
193 if (!inode->i_op->fiemap)
194 return -EOPNOTSUPP;
195
196 if (copy_from_user(&fiemap, (struct fiemap __user *)arg,
197 sizeof(struct fiemap)))
198 return -EFAULT;
199
200 if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
201 return -EINVAL;
202
203 error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length,
204 &len);
205 if (error)
206 return error;
207
208 fieinfo.fi_flags = fiemap.fm_flags;
209 fieinfo.fi_extents_max = fiemap.fm_extent_count;
210 fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap));
211
212 if (fiemap.fm_extent_count != 0 &&
213 !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start,
214 fieinfo.fi_extents_max * sizeof(struct fiemap_extent)))
215 return -EFAULT;
216
217 if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
218 filemap_write_and_wait(inode->i_mapping);
219
220 error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len);
221 fiemap.fm_flags = fieinfo.fi_flags;
222 fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
223 if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap)))
224 error = -EFAULT;
225
226 return error;
227}
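From userspace the call mirrors this layout: allocate a struct fiemap with room for fm_extent_count extents directly behind it, fill in the request window, and read fm_mapped_extents back. A hedged sketch (error handling trimmed; the extent budget of 32 is an arbitrary choice, and the uapi headers must be new enough to carry linux/fiemap.h):

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

static int dump_extents(int fd)
{
	unsigned int i, count = 32;		/* arbitrary extent budget */
	struct fiemap *fm;

	/* the extent array lives immediately after the header */
	fm = calloc(1, sizeof(*fm) + count * sizeof(struct fiemap_extent));
	if (!fm)
		return -1;
	fm->fm_start = 0;
	fm->fm_length = FIEMAP_MAX_OFFSET;	/* map the whole file */
	fm->fm_flags = FIEMAP_FLAG_SYNC;	/* flush dirty data first */
	fm->fm_extent_count = count;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
		free(fm);
		return -1;
	}
	for (i = 0; i < fm->fm_mapped_extents; i++)
		printf("logical %llu phys %llu len %llu flags 0x%x\n",
		       (unsigned long long)fm->fm_extents[i].fe_logical,
		       (unsigned long long)fm->fm_extents[i].fe_physical,
		       (unsigned long long)fm->fm_extents[i].fe_length,
		       fm->fm_extents[i].fe_flags);
	free(fm);
	return 0;
}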
228
229#ifdef CONFIG_BLOCK
230
231#define blk_to_logical(inode, blk) ((blk) << (inode)->i_blkbits)
232#define logical_to_blk(inode, offset) ((offset) >> (inode)->i_blkbits)
233
234/*
235 * @inode - the inode to map
236 * @arg - the pointer to userspace where we copy everything to
237 * @get_block - the fs's get_block function
238 *
239 * This does FIEMAP for block-based inodes. Basically it will just loop
240 * through get_block until we hit the number of extents we want to map, or we
241 * go past the end of the file and hit a hole.
242 *
243 * If it is possible to have data blocks beyond a hole past @inode->i_size,
244 * then please do not use this function; it will stop at the first unmapped
245 * block beyond i_size.
246 */
247int generic_block_fiemap(struct inode *inode,
248 struct fiemap_extent_info *fieinfo, u64 start,
249 u64 len, get_block_t *get_block)
250{
251 struct buffer_head tmp;
252 unsigned int start_blk;
253 long long length = 0, map_len = 0;
254 u64 logical = 0, phys = 0, size = 0;
255 u32 flags = FIEMAP_EXTENT_MERGED;
256 int ret = 0;
257
258 if ((ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC)))
259 return ret;
260
261 start_blk = logical_to_blk(inode, start);
262
263 /* guard against change */
264 mutex_lock(&inode->i_mutex);
265
266 length = (long long)min_t(u64, len, i_size_read(inode));
267 map_len = length;
268
269 do {
270 /*
271 * we set b_size to the total size we want so it will map as
272 * many contiguous blocks as possible at once
273 */
274 memset(&tmp, 0, sizeof(struct buffer_head));
275 tmp.b_size = map_len;
276
277 ret = get_block(inode, start_blk, &tmp, 0);
278 if (ret)
279 break;
280
281 /* HOLE */
282 if (!buffer_mapped(&tmp)) {
283 /*
284 * first hole after going past the EOF, this is our
285 * last extent
286 */
287 if (length <= 0) {
288 flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST;
289 ret = fiemap_fill_next_extent(fieinfo, logical,
290 phys, size,
291 flags);
292 break;
293 }
294
295 length -= blk_to_logical(inode, 1);
296
297 /* if we have holes up to/past EOF then we're done */
298 if (length <= 0)
299 break;
300
301 start_blk++;
302 } else {
303 if (length <= 0 && size) {
304 ret = fiemap_fill_next_extent(fieinfo, logical,
305 phys, size,
306 flags);
307 if (ret)
308 break;
309 }
310
311 logical = blk_to_logical(inode, start_blk);
312 phys = blk_to_logical(inode, tmp.b_blocknr);
313 size = tmp.b_size;
314 flags = FIEMAP_EXTENT_MERGED;
315
316 length -= tmp.b_size;
317 start_blk += logical_to_blk(inode, size);
318
319 /*
320 * if we are past the EOF we need to loop again to see
321 * if there is a hole so we can mark this extent as the
322 * last one, and if not keep mapping things until we
323 * find a hole, or we run out of slots in the extent
324 * array
325 */
326 if (length <= 0)
327 continue;
328
329 ret = fiemap_fill_next_extent(fieinfo, logical, phys,
330 size, flags);
331 if (ret)
332 break;
333 }
334 cond_resched();
335 } while (1);
336
337 mutex_unlock(&inode->i_mutex);
338
339 /* if ret is 1 then we just hit the end of the extent array */
340 if (ret == 1)
341 ret = 0;
342
343 return ret;
344}
345EXPORT_SYMBOL(generic_block_fiemap);
346
347#endif /* CONFIG_BLOCK */
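A filesystem with a conventional get_block routine can then implement ->fiemap as a thin wrapper (the ext2/ext3 conversions of this era do essentially this); the "myfs" names below are placeholders:

static int myfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		       u64 start, u64 len)
{
	return generic_block_fiemap(inode, fieinfo, start, len, myfs_get_block);
}

const struct inode_operations myfs_file_inode_operations = {
	.fiemap		= myfs_fiemap,
};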
348
74static int file_ioctl(struct file *filp, unsigned int cmd, 349static int file_ioctl(struct file *filp, unsigned int cmd,
75 unsigned long arg) 350 unsigned long arg)
76{ 351{
@@ -80,6 +355,8 @@ static int file_ioctl(struct file *filp, unsigned int cmd,
80 switch (cmd) { 355 switch (cmd) {
81 case FIBMAP: 356 case FIBMAP:
82 return ioctl_fibmap(filp, p); 357 return ioctl_fibmap(filp, p);
358 case FS_IOC_FIEMAP:
359 return ioctl_fiemap(filp, arg);
83 case FIGETBSZ: 360 case FIGETBSZ:
84 return put_user(inode->i_sb->s_blocksize, p); 361 return put_user(inode->i_sb->s_blocksize, p);
85 case FIONREAD: 362 case FIONREAD:
diff --git a/fs/ioprio.c b/fs/ioprio.c
index c4a1c3c65aac..da3cc460d4df 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -115,11 +115,11 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
115 pgrp = task_pgrp(current); 115 pgrp = task_pgrp(current);
116 else 116 else
117 pgrp = find_vpid(who); 117 pgrp = find_vpid(who);
118 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 118 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
119 ret = set_task_ioprio(p, ioprio); 119 ret = set_task_ioprio(p, ioprio);
120 if (ret) 120 if (ret)
121 break; 121 break;
122 } while_each_pid_task(pgrp, PIDTYPE_PGID, p); 122 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
123 break; 123 break;
124 case IOPRIO_WHO_USER: 124 case IOPRIO_WHO_USER:
125 if (!who) 125 if (!who)
@@ -204,7 +204,7 @@ asmlinkage long sys_ioprio_get(int which, int who)
204 pgrp = task_pgrp(current); 204 pgrp = task_pgrp(current);
205 else 205 else
206 pgrp = find_vpid(who); 206 pgrp = find_vpid(who);
207 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 207 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
208 tmpio = get_task_ioprio(p); 208 tmpio = get_task_ioprio(p);
209 if (tmpio < 0) 209 if (tmpio < 0)
210 continue; 210 continue;
@@ -212,7 +212,7 @@ asmlinkage long sys_ioprio_get(int which, int who)
212 ret = tmpio; 212 ret = tmpio;
213 else 213 else
214 ret = ioprio_best(ret, tmpio); 214 ret = ioprio_best(ret, tmpio);
215 } while_each_pid_task(pgrp, PIDTYPE_PGID, p); 215 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
216 break; 216 break;
217 case IOPRIO_WHO_USER: 217 case IOPRIO_WHO_USER:
218 if (!who) 218 if (!who)
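With do_each_pid_thread() the IOPRIO_WHO_PGRP case now covers every thread in the process group, not just each group leader. For reference, this path is driven from userspace through the raw syscalls (glibc ships no wrapper in this era); a hedged sketch with the ABI constants spelled out:

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#define IOPRIO_CLASS_SHIFT	13
#define IOPRIO_PRIO_VALUE(class, data) \
	(((class) << IOPRIO_CLASS_SHIFT) | (data))
#define IOPRIO_WHO_PGRP		2	/* from include/linux/ioprio.h */
#define IOPRIO_CLASS_BE		2	/* best-effort scheduling class */

int main(void)
{
	/* best-effort class, priority 4, for our own process group */
	if (syscall(SYS_ioprio_set, IOPRIO_WHO_PGRP, 0,
		    IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4)) < 0)
		perror("ioprio_set");
	return 0;
}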
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 26948a6033b6..3f8af0f1505b 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -310,7 +310,7 @@ enum {
310 Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode, 310 Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode,
311}; 311};
312 312
313static match_table_t tokens = { 313static const match_table_t tokens = {
314 {Opt_norock, "norock"}, 314 {Opt_norock, "norock"},
315 {Opt_nojoliet, "nojoliet"}, 315 {Opt_nojoliet, "nojoliet"},
316 {Opt_unhide, "unhide"}, 316 {Opt_unhide, "unhide"},
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 8dee32007500..0540ca27a446 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -291,7 +291,7 @@ handle_t *journal_start(journal_t *journal, int nblocks)
291 goto out; 291 goto out;
292 } 292 }
293 293
294 lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_); 294 lock_map_acquire(&handle->h_lockdep_map);
295 295
296out: 296out:
297 return handle; 297 return handle;
@@ -1448,7 +1448,7 @@ int journal_stop(handle_t *handle)
1448 spin_unlock(&journal->j_state_lock); 1448 spin_unlock(&journal->j_state_lock);
1449 } 1449 }
1450 1450
1451 lock_release(&handle->h_lockdep_map, 1, _THIS_IP_); 1451 lock_map_release(&handle->h_lockdep_map);
1452 1452
1453 jbd_free_handle(handle); 1453 jbd_free_handle(handle);
1454 return err; 1454 return err;
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 91389c8aee8a..9203c3332f17 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -20,6 +20,7 @@
20#include <linux/time.h> 20#include <linux/time.h>
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/jbd2.h> 22#include <linux/jbd2.h>
23#include <linux/marker.h>
23#include <linux/errno.h> 24#include <linux/errno.h>
24#include <linux/slab.h> 25#include <linux/slab.h>
25 26
@@ -93,7 +94,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
93 int ret = 0; 94 int ret = 0;
94 struct buffer_head *bh = jh2bh(jh); 95 struct buffer_head *bh = jh2bh(jh);
95 96
96 if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { 97 if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
98 !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
97 JBUFFER_TRACE(jh, "remove from checkpoint list"); 99 JBUFFER_TRACE(jh, "remove from checkpoint list");
98 ret = __jbd2_journal_remove_checkpoint(jh) + 1; 100 ret = __jbd2_journal_remove_checkpoint(jh) + 1;
99 jbd_unlock_bh_state(bh); 101 jbd_unlock_bh_state(bh);
@@ -126,14 +128,29 @@ void __jbd2_log_wait_for_space(journal_t *journal)
126 128
127 /* 129 /*
128 * Test again, another process may have checkpointed while we 130 * Test again, another process may have checkpointed while we
129 * were waiting for the checkpoint lock 131 * were waiting for the checkpoint lock. If there are no
132 * outstanding transactions there is nothing to checkpoint and
133 * we can't make progress. Abort the journal in this case.
130 */ 134 */
131 spin_lock(&journal->j_state_lock); 135 spin_lock(&journal->j_state_lock);
136 spin_lock(&journal->j_list_lock);
132 nblocks = jbd_space_needed(journal); 137 nblocks = jbd_space_needed(journal);
133 if (__jbd2_log_space_left(journal) < nblocks) { 138 if (__jbd2_log_space_left(journal) < nblocks) {
139 int chkpt = journal->j_checkpoint_transactions != NULL;
140
141 spin_unlock(&journal->j_list_lock);
134 spin_unlock(&journal->j_state_lock); 142 spin_unlock(&journal->j_state_lock);
135 jbd2_log_do_checkpoint(journal); 143 if (chkpt) {
144 jbd2_log_do_checkpoint(journal);
145 } else {
146 printk(KERN_ERR "%s: no transactions\n",
147 __func__);
148 jbd2_journal_abort(journal, 0);
149 }
150
136 spin_lock(&journal->j_state_lock); 151 spin_lock(&journal->j_state_lock);
152 } else {
153 spin_unlock(&journal->j_list_lock);
137 } 154 }
138 mutex_unlock(&journal->j_checkpoint_mutex); 155 mutex_unlock(&journal->j_checkpoint_mutex);
139 } 156 }
@@ -160,21 +177,25 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
160 * buffers. Note that we take the buffers in the opposite ordering 177 * buffers. Note that we take the buffers in the opposite ordering
161 * from the one in which they were submitted for IO. 178 * from the one in which they were submitted for IO.
162 * 179 *
180 * Return 0 on success, and return <0 if some buffers have failed
181 * to be written out.
182 *
163 * Called with j_list_lock held. 183 * Called with j_list_lock held.
164 */ 184 */
165static void __wait_cp_io(journal_t *journal, transaction_t *transaction) 185static int __wait_cp_io(journal_t *journal, transaction_t *transaction)
166{ 186{
167 struct journal_head *jh; 187 struct journal_head *jh;
168 struct buffer_head *bh; 188 struct buffer_head *bh;
169 tid_t this_tid; 189 tid_t this_tid;
170 int released = 0; 190 int released = 0;
191 int ret = 0;
171 192
172 this_tid = transaction->t_tid; 193 this_tid = transaction->t_tid;
173restart: 194restart:
174 /* Did somebody clean up the transaction in the meanwhile? */ 195 /* Did somebody clean up the transaction in the meanwhile? */
175 if (journal->j_checkpoint_transactions != transaction || 196 if (journal->j_checkpoint_transactions != transaction ||
176 transaction->t_tid != this_tid) 197 transaction->t_tid != this_tid)
177 return; 198 return ret;
178 while (!released && transaction->t_checkpoint_io_list) { 199 while (!released && transaction->t_checkpoint_io_list) {
179 jh = transaction->t_checkpoint_io_list; 200 jh = transaction->t_checkpoint_io_list;
180 bh = jh2bh(jh); 201 bh = jh2bh(jh);
@@ -194,6 +215,9 @@ restart:
194 spin_lock(&journal->j_list_lock); 215 spin_lock(&journal->j_list_lock);
195 goto restart; 216 goto restart;
196 } 217 }
218 if (unlikely(buffer_write_io_error(bh)))
219 ret = -EIO;
220
197 /* 221 /*
198 * Now in whatever state the buffer currently is, we know that 222 * Now in whatever state the buffer currently is, we know that
199 * it has been written out and so we can drop it from the list 223 * it has been written out and so we can drop it from the list
@@ -203,6 +227,8 @@ restart:
203 jbd2_journal_remove_journal_head(bh); 227 jbd2_journal_remove_journal_head(bh);
204 __brelse(bh); 228 __brelse(bh);
205 } 229 }
230
231 return ret;
206} 232}
207 233
208#define NR_BATCH 64 234#define NR_BATCH 64
@@ -226,7 +252,8 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
226 * Try to flush one buffer from the checkpoint list to disk. 252 * Try to flush one buffer from the checkpoint list to disk.
227 * 253 *
228 * Return 1 if something happened which requires us to abort the current 254 * Return 1 if something happened which requires us to abort the current
229 * scan of the checkpoint list. 255 * scan of the checkpoint list. Return <0 if the buffer has failed to
256 * be written out.
230 * 257 *
231 * Called with j_list_lock held and drops it if 1 is returned 258 * Called with j_list_lock held and drops it if 1 is returned
232 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 259 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
@@ -258,6 +285,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
258 jbd2_log_wait_commit(journal, tid); 285 jbd2_log_wait_commit(journal, tid);
259 ret = 1; 286 ret = 1;
260 } else if (!buffer_dirty(bh)) { 287 } else if (!buffer_dirty(bh)) {
288 ret = 1;
289 if (unlikely(buffer_write_io_error(bh)))
290 ret = -EIO;
261 J_ASSERT_JH(jh, !buffer_jbddirty(bh)); 291 J_ASSERT_JH(jh, !buffer_jbddirty(bh));
262 BUFFER_TRACE(bh, "remove from checkpoint"); 292 BUFFER_TRACE(bh, "remove from checkpoint");
263 __jbd2_journal_remove_checkpoint(jh); 293 __jbd2_journal_remove_checkpoint(jh);
@@ -265,7 +295,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
265 jbd_unlock_bh_state(bh); 295 jbd_unlock_bh_state(bh);
266 jbd2_journal_remove_journal_head(bh); 296 jbd2_journal_remove_journal_head(bh);
267 __brelse(bh); 297 __brelse(bh);
268 ret = 1;
269 } else { 298 } else {
270 /* 299 /*
271 * Important: we are about to write the buffer, and 300 * Important: we are about to write the buffer, and
@@ -298,6 +327,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
298 * to disk. We submit larger chunks of data at once. 327 * to disk. We submit larger chunks of data at once.
299 * 328 *
300 * The journal should be locked before calling this function. 329 * The journal should be locked before calling this function.
330 * Called with j_checkpoint_mutex held.
301 */ 331 */
302int jbd2_log_do_checkpoint(journal_t *journal) 332int jbd2_log_do_checkpoint(journal_t *journal)
303{ 333{
@@ -313,6 +343,8 @@ int jbd2_log_do_checkpoint(journal_t *journal)
313 * journal straight away. 343 * journal straight away.
314 */ 344 */
315 result = jbd2_cleanup_journal_tail(journal); 345 result = jbd2_cleanup_journal_tail(journal);
346 trace_mark(jbd2_checkpoint, "dev %s need_checkpoint %d",
347 journal->j_devname, result);
316 jbd_debug(1, "cleanup_journal_tail returned %d\n", result); 348 jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
317 if (result <= 0) 349 if (result <= 0)
318 return result; 350 return result;
@@ -321,6 +353,7 @@ int jbd2_log_do_checkpoint(journal_t *journal)
321 * OK, we need to start writing disk blocks. Take one transaction 353 * OK, we need to start writing disk blocks. Take one transaction
322 * and write it. 354 * and write it.
323 */ 355 */
356 result = 0;
324 spin_lock(&journal->j_list_lock); 357 spin_lock(&journal->j_list_lock);
325 if (!journal->j_checkpoint_transactions) 358 if (!journal->j_checkpoint_transactions)
326 goto out; 359 goto out;
@@ -339,7 +372,7 @@ restart:
339 int batch_count = 0; 372 int batch_count = 0;
340 struct buffer_head *bhs[NR_BATCH]; 373 struct buffer_head *bhs[NR_BATCH];
341 struct journal_head *jh; 374 struct journal_head *jh;
342 int retry = 0; 375 int retry = 0, err;
343 376
344 while (!retry && transaction->t_checkpoint_list) { 377 while (!retry && transaction->t_checkpoint_list) {
345 struct buffer_head *bh; 378 struct buffer_head *bh;
@@ -353,6 +386,8 @@ restart:
353 } 386 }
354 retry = __process_buffer(journal, jh, bhs, &batch_count, 387 retry = __process_buffer(journal, jh, bhs, &batch_count,
355 transaction); 388 transaction);
389 if (retry < 0 && !result)
390 result = retry;
356 if (!retry && (need_resched() || 391 if (!retry && (need_resched() ||
357 spin_needbreak(&journal->j_list_lock))) { 392 spin_needbreak(&journal->j_list_lock))) {
358 spin_unlock(&journal->j_list_lock); 393 spin_unlock(&journal->j_list_lock);
@@ -377,14 +412,18 @@ restart:
377 * Now we have cleaned up the first transaction's checkpoint 412 * Now we have cleaned up the first transaction's checkpoint
378 * list. Let's clean up the second one 413 * list. Let's clean up the second one
379 */ 414 */
380 __wait_cp_io(journal, transaction); 415 err = __wait_cp_io(journal, transaction);
416 if (!result)
417 result = err;
381 } 418 }
382out: 419out:
383 spin_unlock(&journal->j_list_lock); 420 spin_unlock(&journal->j_list_lock);
384 result = jbd2_cleanup_journal_tail(journal);
385 if (result < 0) 421 if (result < 0)
386 return result; 422 jbd2_journal_abort(journal, result);
387 return 0; 423 else
424 result = jbd2_cleanup_journal_tail(journal);
425
426 return (result < 0) ? result : 0;
388} 427}
389 428
390/* 429/*
@@ -400,8 +439,9 @@ out:
400 * This is the only part of the journaling code which really needs to be 439 * This is the only part of the journaling code which really needs to be
401 * aware of transaction aborts. Checkpointing involves writing to the 440 * aware of transaction aborts. Checkpointing involves writing to the
402 * main filesystem area rather than to the journal, so it can proceed 441 * main filesystem area rather than to the journal, so it can proceed
403 * even in abort state, but we must not update the journal superblock if 442 * even in abort state, but we must not update the super block if
404 * we have an abort error outstanding. 443 * checkpointing may have failed. Otherwise, we would lose some metadata
444 * buffers which should be written back to the filesystem.
405 */ 445 */
406 446
407int jbd2_cleanup_journal_tail(journal_t *journal) 447int jbd2_cleanup_journal_tail(journal_t *journal)
@@ -410,6 +450,9 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
410 tid_t first_tid; 450 tid_t first_tid;
411 unsigned long blocknr, freed; 451 unsigned long blocknr, freed;
412 452
453 if (is_journal_aborted(journal))
454 return 1;
455
413 /* OK, work out the oldest transaction remaining in the log, and 456 /* OK, work out the oldest transaction remaining in the log, and
414 * the log block it starts at. 457 * the log block it starts at.
415 * 458 *
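Note the error plumbing these checkpoint hunks introduce: the first failure is latched into result and later successes are not allowed to overwrite it. A hedged distillation of the idiom, with hypothetical placeholder step functions:

/* illustration only; the step functions are placeholders */
static int do_step_one(void) { return -5; /* pretend -EIO */ }
static int do_step_two(void) { return 0; }

static int run_steps(void)
{
	int result = 0, err;

	err = do_step_one();
	if (err < 0 && !result)
		result = err;	/* remember the first failure */

	err = do_step_two();	/* still runs, but cannot mask it */
	if (err < 0 && !result)
		result = err;

	return result;		/* -EIO, not 0 */
}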
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index f2ad061e95ec..0abe02c4242a 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -16,6 +16,7 @@
16#include <linux/time.h> 16#include <linux/time.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/jbd2.h> 18#include <linux/jbd2.h>
19#include <linux/marker.h>
19#include <linux/errno.h> 20#include <linux/errno.h>
20#include <linux/slab.h> 21#include <linux/slab.h>
21#include <linux/mm.h> 22#include <linux/mm.h>
@@ -126,8 +127,7 @@ static int journal_submit_commit_record(journal_t *journal,
126 127
127 JBUFFER_TRACE(descriptor, "submit commit block"); 128 JBUFFER_TRACE(descriptor, "submit commit block");
128 lock_buffer(bh); 129 lock_buffer(bh);
129 get_bh(bh); 130 clear_buffer_dirty(bh);
130 set_buffer_dirty(bh);
131 set_buffer_uptodate(bh); 131 set_buffer_uptodate(bh);
132 bh->b_end_io = journal_end_buffer_io_sync; 132 bh->b_end_io = journal_end_buffer_io_sync;
133 133
@@ -147,12 +147,9 @@ static int journal_submit_commit_record(journal_t *journal,
147 * to remember if we sent a barrier request 147 * to remember if we sent a barrier request
148 */ 148 */
149 if (ret == -EOPNOTSUPP && barrier_done) { 149 if (ret == -EOPNOTSUPP && barrier_done) {
150 char b[BDEVNAME_SIZE];
151
152 printk(KERN_WARNING 150 printk(KERN_WARNING
153 "JBD: barrier-based sync failed on %s - " 151 "JBD: barrier-based sync failed on %s - "
154 "disabling barriers\n", 152 "disabling barriers\n", journal->j_devname);
155 bdevname(journal->j_dev, b));
156 spin_lock(&journal->j_state_lock); 153 spin_lock(&journal->j_state_lock);
157 journal->j_flags &= ~JBD2_BARRIER; 154 journal->j_flags &= ~JBD2_BARRIER;
158 spin_unlock(&journal->j_state_lock); 155 spin_unlock(&journal->j_state_lock);
@@ -160,7 +157,7 @@ static int journal_submit_commit_record(journal_t *journal,
160 /* And try again, without the barrier */ 157 /* And try again, without the barrier */
161 lock_buffer(bh); 158 lock_buffer(bh);
162 set_buffer_uptodate(bh); 159 set_buffer_uptodate(bh);
163 set_buffer_dirty(bh); 160 clear_buffer_dirty(bh);
164 ret = submit_bh(WRITE, bh); 161 ret = submit_bh(WRITE, bh);
165 } 162 }
166 *cbh = bh; 163 *cbh = bh;
@@ -371,6 +368,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
371 commit_transaction = journal->j_running_transaction; 368 commit_transaction = journal->j_running_transaction;
372 J_ASSERT(commit_transaction->t_state == T_RUNNING); 369 J_ASSERT(commit_transaction->t_state == T_RUNNING);
373 370
371 trace_mark(jbd2_start_commit, "dev %s transaction %d",
372 journal->j_devname, commit_transaction->t_tid);
374 jbd_debug(1, "JBD: starting commit of transaction %d\n", 373 jbd_debug(1, "JBD: starting commit of transaction %d\n",
375 commit_transaction->t_tid); 374 commit_transaction->t_tid);
376 375
@@ -505,9 +504,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
505 jh = commit_transaction->t_buffers; 504 jh = commit_transaction->t_buffers;
506 505
507 /* If we're in abort mode, we just un-journal the buffer and 506 /* If we're in abort mode, we just un-journal the buffer and
508 release it for background writing. */ 507 release it. */
509 508
510 if (is_journal_aborted(journal)) { 509 if (is_journal_aborted(journal)) {
510 clear_buffer_jbddirty(jh2bh(jh));
511 JBUFFER_TRACE(jh, "journal is aborting: refile"); 511 JBUFFER_TRACE(jh, "journal is aborting: refile");
512 jbd2_journal_refile_buffer(journal, jh); 512 jbd2_journal_refile_buffer(journal, jh);
513 /* If that was the last one, we need to clean up 513 /* If that was the last one, we need to clean up
@@ -681,11 +681,11 @@ start_journal_io:
681 */ 681 */
682 err = journal_finish_inode_data_buffers(journal, commit_transaction); 682 err = journal_finish_inode_data_buffers(journal, commit_transaction);
683 if (err) { 683 if (err) {
684 char b[BDEVNAME_SIZE];
685
686 printk(KERN_WARNING 684 printk(KERN_WARNING
687 "JBD2: Detected IO errors while flushing file data " 685 "JBD2: Detected IO errors while flushing file data "
688 "on %s\n", bdevname(journal->j_fs_dev, b)); 686 "on %s\n", journal->j_devname);
687 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
688 jbd2_journal_abort(journal, err);
689 err = 0; 689 err = 0;
690 } 690 }
691 691
@@ -786,6 +786,9 @@ wait_for_iobuf:
786 /* AKPM: bforget here */ 786 /* AKPM: bforget here */
787 } 787 }
788 788
789 if (err)
790 jbd2_journal_abort(journal, err);
791
789 jbd_debug(3, "JBD: commit phase 5\n"); 792 jbd_debug(3, "JBD: commit phase 5\n");
790 793
791 if (!JBD2_HAS_INCOMPAT_FEATURE(journal, 794 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -884,6 +887,8 @@ restart_loop:
884 if (buffer_jbddirty(bh)) { 887 if (buffer_jbddirty(bh)) {
885 JBUFFER_TRACE(jh, "add to new checkpointing trans"); 888 JBUFFER_TRACE(jh, "add to new checkpointing trans");
886 __jbd2_journal_insert_checkpoint(jh, commit_transaction); 889 __jbd2_journal_insert_checkpoint(jh, commit_transaction);
890 if (is_journal_aborted(journal))
891 clear_buffer_jbddirty(bh);
887 JBUFFER_TRACE(jh, "refile for checkpoint writeback"); 892 JBUFFER_TRACE(jh, "refile for checkpoint writeback");
888 __jbd2_journal_refile_buffer(jh); 893 __jbd2_journal_refile_buffer(jh);
889 jbd_unlock_bh_state(bh); 894 jbd_unlock_bh_state(bh);
@@ -990,6 +995,9 @@ restart_loop:
990 } 995 }
991 spin_unlock(&journal->j_list_lock); 996 spin_unlock(&journal->j_list_lock);
992 997
998 trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
999 journal->j_devname, commit_transaction->t_tid,
1000 journal->j_tail_sequence);
993 jbd_debug(1, "JBD: commit %d complete, head %d\n", 1001 jbd_debug(1, "JBD: commit %d complete, head %d\n",
994 journal->j_commit_sequence, journal->j_tail_sequence); 1002 journal->j_commit_sequence, journal->j_tail_sequence);
995 1003
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 8207a01c4edb..783de118de92 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -597,13 +597,9 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
597 if (ret) 597 if (ret)
598 *retp = ret; 598 *retp = ret;
599 else { 599 else {
600 char b[BDEVNAME_SIZE];
601
602 printk(KERN_ALERT "%s: journal block not found " 600 printk(KERN_ALERT "%s: journal block not found "
603 "at offset %lu on %s\n", 601 "at offset %lu on %s\n",
604 __func__, 602 __func__, blocknr, journal->j_devname);
605 blocknr,
606 bdevname(journal->j_dev, b));
607 err = -EIO; 603 err = -EIO;
608 __journal_abort_soft(journal, err); 604 __journal_abort_soft(journal, err);
609 } 605 }
@@ -901,10 +897,7 @@ static struct proc_dir_entry *proc_jbd2_stats;
901 897
902static void jbd2_stats_proc_init(journal_t *journal) 898static void jbd2_stats_proc_init(journal_t *journal)
903{ 899{
904 char name[BDEVNAME_SIZE]; 900 journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats);
905
906 bdevname(journal->j_dev, name);
907 journal->j_proc_entry = proc_mkdir(name, proc_jbd2_stats);
908 if (journal->j_proc_entry) { 901 if (journal->j_proc_entry) {
909 proc_create_data("history", S_IRUGO, journal->j_proc_entry, 902 proc_create_data("history", S_IRUGO, journal->j_proc_entry,
910 &jbd2_seq_history_fops, journal); 903 &jbd2_seq_history_fops, journal);
@@ -915,12 +908,9 @@ static void jbd2_stats_proc_init(journal_t *journal)
915 908
916static void jbd2_stats_proc_exit(journal_t *journal) 909static void jbd2_stats_proc_exit(journal_t *journal)
917{ 910{
918 char name[BDEVNAME_SIZE];
919
920 bdevname(journal->j_dev, name);
921 remove_proc_entry("info", journal->j_proc_entry); 911 remove_proc_entry("info", journal->j_proc_entry);
922 remove_proc_entry("history", journal->j_proc_entry); 912 remove_proc_entry("history", journal->j_proc_entry);
923 remove_proc_entry(name, proc_jbd2_stats); 913 remove_proc_entry(journal->j_devname, proc_jbd2_stats);
924} 914}
925 915
926static void journal_init_stats(journal_t *journal) 916static void journal_init_stats(journal_t *journal)
@@ -1018,6 +1008,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
1018{ 1008{
1019 journal_t *journal = journal_init_common(); 1009 journal_t *journal = journal_init_common();
1020 struct buffer_head *bh; 1010 struct buffer_head *bh;
1011 char *p;
1021 int n; 1012 int n;
1022 1013
1023 if (!journal) 1014 if (!journal)
@@ -1039,6 +1030,10 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
1039 journal->j_fs_dev = fs_dev; 1030 journal->j_fs_dev = fs_dev;
1040 journal->j_blk_offset = start; 1031 journal->j_blk_offset = start;
1041 journal->j_maxlen = len; 1032 journal->j_maxlen = len;
1033 bdevname(journal->j_dev, journal->j_devname);
1034 p = journal->j_devname;
1035 while ((p = strchr(p, '/')))
1036 *p = '!';
1042 jbd2_stats_proc_init(journal); 1037 jbd2_stats_proc_init(journal);
1043 1038
1044 bh = __getblk(journal->j_dev, start, journal->j_blocksize); 1039 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
@@ -1061,6 +1056,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
1061{ 1056{
1062 struct buffer_head *bh; 1057 struct buffer_head *bh;
1063 journal_t *journal = journal_init_common(); 1058 journal_t *journal = journal_init_common();
1059 char *p;
1064 int err; 1060 int err;
1065 int n; 1061 int n;
1066 unsigned long long blocknr; 1062 unsigned long long blocknr;
@@ -1070,6 +1066,12 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
1070 1066
1071 journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev; 1067 journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
1072 journal->j_inode = inode; 1068 journal->j_inode = inode;
1069 bdevname(journal->j_dev, journal->j_devname);
1070 p = journal->j_devname;
1071 while ((p = strchr(p, '/')))
1072 *p = '!';
1073 p = journal->j_devname + strlen(journal->j_devname);
1074 sprintf(p, ":%lu", journal->j_inode->i_ino);
1073 jbd_debug(1, 1075 jbd_debug(1,
1074 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", 1076 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
1075 journal, inode->i_sb->s_id, inode->i_ino, 1077 journal, inode->i_sb->s_id, inode->i_ino,
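The '/'-to-'!' rewrite exists because j_devname is handed straight to proc_mkdir(), and a procfs entry name cannot contain a slash. A hedged userspace illustration of the transform:

#include <stdio.h>
#include <string.h>

/* mirrors the j_devname sanitization above */
static void sanitize_devname(char *name)
{
	char *p = name;

	while ((p = strchr(p, '/')))
		*p = '!';
}

int main(void)
{
	char name[32] = "cciss/c0d0p1";	/* example device name */

	sanitize_devname(name);
	printf("%s\n", name);		/* prints "cciss!c0d0p1" */
	return 0;
}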
@@ -1253,6 +1255,22 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
1253 goto out; 1255 goto out;
1254 } 1256 }
1255 1257
1258 if (buffer_write_io_error(bh)) {
1259 /*
1260 * Oh, dear. A previous attempt to write the journal
1261 * superblock failed. This could happen because the
1262 * USB device was yanked out. Or it could happen to
1263 * be a transient write error and maybe the block will
1264 * be remapped. Nothing we can do but to retry the
1265 * write and hope for the best.
1266 */
1267 printk(KERN_ERR "JBD2: previous I/O error detected "
1268 "for journal superblock update for %s.\n",
1269 journal->j_devname);
1270 clear_buffer_write_io_error(bh);
1271 set_buffer_uptodate(bh);
1272 }
1273
1256 spin_lock(&journal->j_state_lock); 1274 spin_lock(&journal->j_state_lock);
1257 jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", 1275 jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
1258 journal->j_tail, journal->j_tail_sequence, journal->j_errno); 1276 journal->j_tail, journal->j_tail_sequence, journal->j_errno);
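The superblock path above follows the buffer_head convention for retrying a failed write: the failure latches the write_io_error flag and clears the uptodate bit, so both must be reset before rewriting. A hedged distillation (rewrite_block is an illustrative helper, not kernel API; bh is assumed to still hold valid in-memory contents):

static int rewrite_block(struct buffer_head *bh)
{
	if (buffer_write_io_error(bh)) {
		clear_buffer_write_io_error(bh);	/* drop the latched error */
		set_buffer_uptodate(bh);		/* in-memory copy is still good */
	}
	mark_buffer_dirty(bh);
	sync_dirty_buffer(bh);				/* retry the write */
	if (buffer_write_io_error(bh))
		return -EIO;				/* it failed again */
	return 0;
}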
@@ -1264,9 +1282,16 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
1264 1282
1265 BUFFER_TRACE(bh, "marking dirty"); 1283 BUFFER_TRACE(bh, "marking dirty");
1266 mark_buffer_dirty(bh); 1284 mark_buffer_dirty(bh);
1267 if (wait) 1285 if (wait) {
1268 sync_dirty_buffer(bh); 1286 sync_dirty_buffer(bh);
1269 else 1287 if (buffer_write_io_error(bh)) {
1288 printk(KERN_ERR "JBD2: I/O error detected "
1289 "when updating journal superblock for %s.\n",
1290 journal->j_devname);
1291 clear_buffer_write_io_error(bh);
1292 set_buffer_uptodate(bh);
1293 }
1294 } else
1270 ll_rw_block(SWRITE, 1, &bh); 1295 ll_rw_block(SWRITE, 1, &bh);
1271 1296
1272out: 1297out:
@@ -1426,9 +1451,12 @@ recovery_error:
1426 * 1451 *
1427 * Release a journal_t structure once it is no longer in use by the 1452 * Release a journal_t structure once it is no longer in use by the
1428 * journaled object. 1453 * journaled object.
1454 * Return <0 if we couldn't clean up the journal.
1429 */ 1455 */
1430void jbd2_journal_destroy(journal_t *journal) 1456int jbd2_journal_destroy(journal_t *journal)
1431{ 1457{
1458 int err = 0;
1459
1432 /* Wait for the commit thread to wake up and die. */ 1460 /* Wait for the commit thread to wake up and die. */
1433 journal_kill_thread(journal); 1461 journal_kill_thread(journal);
1434 1462
@@ -1451,11 +1479,16 @@ void jbd2_journal_destroy(journal_t *journal)
1451 J_ASSERT(journal->j_checkpoint_transactions == NULL); 1479 J_ASSERT(journal->j_checkpoint_transactions == NULL);
1452 spin_unlock(&journal->j_list_lock); 1480 spin_unlock(&journal->j_list_lock);
1453 1481
1454 /* We can now mark the journal as empty. */
1455 journal->j_tail = 0;
1456 journal->j_tail_sequence = ++journal->j_transaction_sequence;
1457 if (journal->j_sb_buffer) { 1482 if (journal->j_sb_buffer) {
1458 jbd2_journal_update_superblock(journal, 1); 1483 if (!is_journal_aborted(journal)) {
1484 /* We can now mark the journal as empty. */
1485 journal->j_tail = 0;
1486 journal->j_tail_sequence =
1487 ++journal->j_transaction_sequence;
1488 jbd2_journal_update_superblock(journal, 1);
1489 } else {
1490 err = -EIO;
1491 }
1459 brelse(journal->j_sb_buffer); 1492 brelse(journal->j_sb_buffer);
1460 } 1493 }
1461 1494
@@ -1467,6 +1500,8 @@ void jbd2_journal_destroy(journal_t *journal)
1467 jbd2_journal_destroy_revoke(journal); 1500 jbd2_journal_destroy_revoke(journal);
1468 kfree(journal->j_wbuf); 1501 kfree(journal->j_wbuf);
1469 kfree(journal); 1502 kfree(journal);
1503
1504 return err;
1470} 1505}
1471 1506
1472 1507
@@ -1692,10 +1727,16 @@ int jbd2_journal_flush(journal_t *journal)
1692 spin_lock(&journal->j_list_lock); 1727 spin_lock(&journal->j_list_lock);
1693 while (!err && journal->j_checkpoint_transactions != NULL) { 1728 while (!err && journal->j_checkpoint_transactions != NULL) {
1694 spin_unlock(&journal->j_list_lock); 1729 spin_unlock(&journal->j_list_lock);
1730 mutex_lock(&journal->j_checkpoint_mutex);
1695 err = jbd2_log_do_checkpoint(journal); 1731 err = jbd2_log_do_checkpoint(journal);
1732 mutex_unlock(&journal->j_checkpoint_mutex);
1696 spin_lock(&journal->j_list_lock); 1733 spin_lock(&journal->j_list_lock);
1697 } 1734 }
1698 spin_unlock(&journal->j_list_lock); 1735 spin_unlock(&journal->j_list_lock);
1736
1737 if (is_journal_aborted(journal))
1738 return -EIO;
1739
1699 jbd2_cleanup_journal_tail(journal); 1740 jbd2_cleanup_journal_tail(journal);
1700 1741
1701 /* Finally, mark the journal as really needing no recovery. 1742 /* Finally, mark the journal as really needing no recovery.
@@ -1717,7 +1758,7 @@ int jbd2_journal_flush(journal_t *journal)
1717 J_ASSERT(journal->j_head == journal->j_tail); 1758 J_ASSERT(journal->j_head == journal->j_tail);
1718 J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); 1759 J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
1719 spin_unlock(&journal->j_state_lock); 1760 spin_unlock(&journal->j_state_lock);
1720 return err; 1761 return 0;
1721} 1762}
1722 1763
1723/** 1764/**
@@ -1761,23 +1802,6 @@ int jbd2_journal_wipe(journal_t *journal, int write)
1761} 1802}
1762 1803
1763/* 1804/*
1764 * journal_dev_name: format a character string to describe on what
1765 * device this journal is present.
1766 */
1767
1768static const char *journal_dev_name(journal_t *journal, char *buffer)
1769{
1770 struct block_device *bdev;
1771
1772 if (journal->j_inode)
1773 bdev = journal->j_inode->i_sb->s_bdev;
1774 else
1775 bdev = journal->j_dev;
1776
1777 return bdevname(bdev, buffer);
1778}
1779
1780/*
1781 * Journal abort has very specific semantics, which we describe 1805 * Journal abort has very specific semantics, which we describe
1782 * for journal abort. 1806 * for journal abort.
1783 * 1807 *
@@ -1793,13 +1817,12 @@ static const char *journal_dev_name(journal_t *journal, char *buffer)
1793void __jbd2_journal_abort_hard(journal_t *journal) 1817void __jbd2_journal_abort_hard(journal_t *journal)
1794{ 1818{
1795 transaction_t *transaction; 1819 transaction_t *transaction;
1796 char b[BDEVNAME_SIZE];
1797 1820
1798 if (journal->j_flags & JBD2_ABORT) 1821 if (journal->j_flags & JBD2_ABORT)
1799 return; 1822 return;
1800 1823
1801 printk(KERN_ERR "Aborting journal on device %s.\n", 1824 printk(KERN_ERR "Aborting journal on device %s.\n",
1802 journal_dev_name(journal, b)); 1825 journal->j_devname);
1803 1826
1804 spin_lock(&journal->j_state_lock); 1827 spin_lock(&journal->j_state_lock);
1805 journal->j_flags |= JBD2_ABORT; 1828 journal->j_flags |= JBD2_ABORT;
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 058f50f65b76..73063285b13f 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -225,7 +225,7 @@ do { \
225 */ 225 */
226int jbd2_journal_recover(journal_t *journal) 226int jbd2_journal_recover(journal_t *journal)
227{ 227{
228 int err; 228 int err, err2;
229 journal_superblock_t * sb; 229 journal_superblock_t * sb;
230 230
231 struct recovery_info info; 231 struct recovery_info info;
@@ -263,7 +263,10 @@ int jbd2_journal_recover(journal_t *journal)
263 journal->j_transaction_sequence = ++info.end_transaction; 263 journal->j_transaction_sequence = ++info.end_transaction;
264 264
265 jbd2_journal_clear_revoke(journal); 265 jbd2_journal_clear_revoke(journal);
266 sync_blockdev(journal->j_fs_dev); 266 err2 = sync_blockdev(journal->j_fs_dev);
267 if (!err)
268 err = err2;
269
267 return err; 270 return err;
268} 271}
269 272
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 4f7cadbb19fa..e5d540588fa9 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -301,7 +301,7 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
301 goto out; 301 goto out;
302 } 302 }
303 303
304 lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_); 304 lock_map_acquire(&handle->h_lockdep_map);
305out: 305out:
306 return handle; 306 return handle;
307} 307}
@@ -1279,7 +1279,7 @@ int jbd2_journal_stop(handle_t *handle)
1279 spin_unlock(&journal->j_state_lock); 1279 spin_unlock(&journal->j_state_lock);
1280 } 1280 }
1281 1281
1282 lock_release(&handle->h_lockdep_map, 1, _THIS_IP_); 1282 lock_map_release(&handle->h_lockdep_map);
1283 1283
1284 jbd2_free_handle(handle); 1284 jbd2_free_handle(handle);
1285 return err; 1285 return err;
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h
index 31559f45fdde..4c41db91eaa4 100644
--- a/fs/jffs2/jffs2_fs_i.h
+++ b/fs/jffs2/jffs2_fs_i.h
@@ -12,7 +12,6 @@
12#ifndef _JFFS2_FS_I 12#ifndef _JFFS2_FS_I
13#define _JFFS2_FS_I 13#define _JFFS2_FS_I
14 14
15#include <linux/version.h>
16#include <linux/rbtree.h> 15#include <linux/rbtree.h>
17#include <linux/posix_acl.h> 16#include <linux/posix_acl.h>
18#include <linux/mutex.h> 17#include <linux/mutex.h>
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 3630718be395..0dae345e481b 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -199,7 +199,7 @@ enum {
199 Opt_usrquota, Opt_grpquota, Opt_uid, Opt_gid, Opt_umask 199 Opt_usrquota, Opt_grpquota, Opt_uid, Opt_gid, Opt_umask
200}; 200};
201 201
202static match_table_t tokens = { 202static const match_table_t tokens = {
203 {Opt_integrity, "integrity"}, 203 {Opt_integrity, "integrity"},
204 {Opt_nointegrity, "nointegrity"}, 204 {Opt_nointegrity, "nointegrity"},
205 {Opt_iocharset, "iocharset=%s"}, 205 {Opt_iocharset, "iocharset=%s"},
diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile
index 7725a0a9a555..97f6073ab339 100644
--- a/fs/lockd/Makefile
+++ b/fs/lockd/Makefile
@@ -5,6 +5,6 @@
5obj-$(CONFIG_LOCKD) += lockd.o 5obj-$(CONFIG_LOCKD) += lockd.o
6 6
7lockd-objs-y := clntlock.o clntproc.o host.o svc.o svclock.o svcshare.o \ 7lockd-objs-y := clntlock.o clntproc.o host.o svc.o svclock.o svcshare.o \
8 svcproc.o svcsubs.o mon.o xdr.o 8 svcproc.o svcsubs.o mon.o xdr.o grace.o
9lockd-objs-$(CONFIG_LOCKD_V4) += xdr4.o svc4proc.o 9lockd-objs-$(CONFIG_LOCKD_V4) += xdr4.o svc4proc.o
10lockd-objs := $(lockd-objs-y) 10lockd-objs := $(lockd-objs-y)
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 0b45fd3a4bfd..8307dd64bf46 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -54,14 +54,13 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init)
54 u32 nlm_version = (nlm_init->nfs_version == 2) ? 1 : 4; 54 u32 nlm_version = (nlm_init->nfs_version == 2) ? 1 : 4;
55 int status; 55 int status;
56 56
57 status = lockd_up(nlm_init->protocol); 57 status = lockd_up();
58 if (status < 0) 58 if (status < 0)
59 return ERR_PTR(status); 59 return ERR_PTR(status);
60 60
61 host = nlmclnt_lookup_host((struct sockaddr_in *)nlm_init->address, 61 host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen,
62 nlm_init->protocol, nlm_version, 62 nlm_init->protocol, nlm_version,
63 nlm_init->hostname, 63 nlm_init->hostname);
64 strlen(nlm_init->hostname));
65 if (host == NULL) { 64 if (host == NULL) {
66 lockd_down(); 65 lockd_down();
67 return ERR_PTR(-ENOLCK); 66 return ERR_PTR(-ENOLCK);
@@ -142,7 +141,7 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
142/* 141/*
143 * The server lockd has called us back to tell us the lock was granted 142 * The server lockd has called us back to tell us the lock was granted
144 */ 143 */
145__be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock) 144__be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
146{ 145{
147 const struct file_lock *fl = &lock->fl; 146 const struct file_lock *fl = &lock->fl;
148 const struct nfs_fh *fh = &lock->fh; 147 const struct nfs_fh *fh = &lock->fh;
@@ -166,7 +165,7 @@ __be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock
166 */ 165 */
167 if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) 166 if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid)
168 continue; 167 continue;
169 if (!nlm_cmp_addr(&block->b_host->h_addr, addr)) 168 if (!nlm_cmp_addr(nlm_addr(block->b_host), addr))
170 continue; 169 continue;
171 if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) 170 if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0)
172 continue; 171 continue;
@@ -216,7 +215,7 @@ reclaimer(void *ptr)
216 /* This one ensures that our parent doesn't terminate while the 215 /* This one ensures that our parent doesn't terminate while the
217 * reclaim is in progress */ 216 * reclaim is in progress */
218 lock_kernel(); 217 lock_kernel();
219 lockd_up(0); /* note: this cannot fail as lockd is already running */ 218 lockd_up(); /* note: this cannot fail as lockd is already running */
220 219
221 dprintk("lockd: reclaiming locks for host %s\n", host->h_name); 220 dprintk("lockd: reclaiming locks for host %s\n", host->h_name);
222 221
diff --git a/fs/lockd/grace.c b/fs/lockd/grace.c
new file mode 100644
index 000000000000..183cc1f0af1c
--- /dev/null
+++ b/fs/lockd/grace.c
@@ -0,0 +1,59 @@
1/*
2 * Common code for control of lockd and nfsv4 grace periods.
3 */
4
5#include <linux/module.h>
6#include <linux/lockd/bind.h>
7
8static LIST_HEAD(grace_list);
9static DEFINE_SPINLOCK(grace_lock);
10
11/**
12 * locks_start_grace
13 * @lm: who this grace period is for
14 *
15 * A grace period is a period during which locks should not be given
16 * out. Currently grace periods are only enforced by the two lock
17 * managers (lockd and nfsd), using the locks_in_grace() function to
18 * check when they are in a grace period.
19 *
20 * This function is called to start a grace period.
21 */
22void locks_start_grace(struct lock_manager *lm)
23{
24 spin_lock(&grace_lock);
25 list_add(&lm->list, &grace_list);
26 spin_unlock(&grace_lock);
27}
28EXPORT_SYMBOL_GPL(locks_start_grace);
29
30/**
31 * locks_end_grace
32 * @lm: who this grace period is for
33 *
34 * Call this function to state that the given lock manager is ready to
35 * resume regular locking. The grace period will not end until all lock
36 * managers that called locks_start_grace() also call locks_end_grace().
37 * Note that callers count on it being safe to call this more than once,
38 * and the second call should be a no-op.
39 */
40void locks_end_grace(struct lock_manager *lm)
41{
42 spin_lock(&grace_lock);
43 list_del_init(&lm->list);
44 spin_unlock(&grace_lock);
45}
46EXPORT_SYMBOL_GPL(locks_end_grace);
47
48/**
49 * locks_in_grace
50 *
51 * Lock managers call this function to determine when it is OK for them
52 * to answer ordinary lock requests, and when they should accept only
53 * lock reclaims.
54 */
55int locks_in_grace(void)
56{
57 return !list_empty(&grace_list);
58}
59EXPORT_SYMBOL_GPL(locks_in_grace);
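To make the intended usage concrete, a hedged sketch of how a lock manager brackets its grace period with this API (the "my_" names are illustrative; the lockd and nfsd callers elsewhere in this series follow this shape):

static struct lock_manager my_manager;	/* one per lock manager instance */

static void my_startup(void)
{
	locks_start_grace(&my_manager);	/* reclaim-only window begins */
}

static __be32 my_lock_request(int reclaim)
{
	/* while any manager is in grace, only reclaims may be granted */
	if (locks_in_grace() && !reclaim)
		return nlm_lck_denied_grace_period;
	/* ... grant the lock ... */
	return nlm_granted;
}

static void my_grace_timeout(void)
{
	locks_end_grace(&my_manager);	/* safe to call more than once */
}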
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index a17664c7eacc..9fd8889097b7 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -11,16 +11,17 @@
11#include <linux/types.h> 11#include <linux/types.h>
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/in.h> 13#include <linux/in.h>
14#include <linux/in6.h>
14#include <linux/sunrpc/clnt.h> 15#include <linux/sunrpc/clnt.h>
15#include <linux/sunrpc/svc.h> 16#include <linux/sunrpc/svc.h>
16#include <linux/lockd/lockd.h> 17#include <linux/lockd/lockd.h>
17#include <linux/lockd/sm_inter.h> 18#include <linux/lockd/sm_inter.h>
18#include <linux/mutex.h> 19#include <linux/mutex.h>
19 20
21#include <net/ipv6.h>
20 22
21#define NLMDBG_FACILITY NLMDBG_HOSTCACHE 23#define NLMDBG_FACILITY NLMDBG_HOSTCACHE
22#define NLM_HOST_NRHASH 32 24#define NLM_HOST_NRHASH 32
23#define NLM_ADDRHASH(addr) (ntohl(addr) & (NLM_HOST_NRHASH-1))
24#define NLM_HOST_REBIND (60 * HZ) 25#define NLM_HOST_REBIND (60 * HZ)
25#define NLM_HOST_EXPIRE (300 * HZ) 26#define NLM_HOST_EXPIRE (300 * HZ)
26#define NLM_HOST_COLLECT (120 * HZ) 27#define NLM_HOST_COLLECT (120 * HZ)
@@ -30,42 +31,115 @@ static unsigned long next_gc;
30static int nrhosts; 31static int nrhosts;
31static DEFINE_MUTEX(nlm_host_mutex); 32static DEFINE_MUTEX(nlm_host_mutex);
32 33
33
34static void nlm_gc_hosts(void); 34static void nlm_gc_hosts(void);
35static struct nsm_handle * __nsm_find(const struct sockaddr_in *, 35static struct nsm_handle *nsm_find(const struct sockaddr *sap,
36 const char *, unsigned int, int); 36 const size_t salen,
37static struct nsm_handle * nsm_find(const struct sockaddr_in *sin, 37 const char *hostname,
38 const char *hostname, 38 const size_t hostname_len,
39 unsigned int hostname_len); 39 const int create);
40
41struct nlm_lookup_host_info {
42 const int server; /* search for server|client */
43 const struct sockaddr *sap; /* address to search for */
44 const size_t salen; /* its length */
45 const unsigned short protocol; /* transport to search for */
46 const u32 version; /* NLM version to search for */
47 const char *hostname; /* remote's hostname */
48 const size_t hostname_len; /* its length */
49 const struct sockaddr *src_sap; /* our address (optional) */
50 const size_t src_len; /* its length */
51};
52
53/*
54 * Hash function must work well on big- and little-endian platforms
55 */
56static unsigned int __nlm_hash32(const __be32 n)
57{
58 unsigned int hash = (__force u32)n ^ ((__force u32)n >> 16);
59 return hash ^ (hash >> 8);
60}
61
62static unsigned int __nlm_hash_addr4(const struct sockaddr *sap)
63{
64 const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
65 return __nlm_hash32(sin->sin_addr.s_addr);
66}
67
68static unsigned int __nlm_hash_addr6(const struct sockaddr *sap)
69{
70 const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
71 const struct in6_addr addr = sin6->sin6_addr;
72 return __nlm_hash32(addr.s6_addr32[0]) ^
73 __nlm_hash32(addr.s6_addr32[1]) ^
74 __nlm_hash32(addr.s6_addr32[2]) ^
75 __nlm_hash32(addr.s6_addr32[3]);
76}
77
78static unsigned int nlm_hash_address(const struct sockaddr *sap)
79{
80 unsigned int hash;
81
82 switch (sap->sa_family) {
83 case AF_INET:
84 hash = __nlm_hash_addr4(sap);
85 break;
86 case AF_INET6:
87 hash = __nlm_hash_addr6(sap);
88 break;
89 default:
90 hash = 0;
91 }
92 return hash & (NLM_HOST_NRHASH - 1);
93}
94
95static void nlm_clear_port(struct sockaddr *sap)
96{
97 switch (sap->sa_family) {
98 case AF_INET:
99 ((struct sockaddr_in *)sap)->sin_port = 0;
100 break;
101 case AF_INET6:
102 ((struct sockaddr_in6 *)sap)->sin6_port = 0;
103 break;
104 }
105}
106
107static void nlm_display_address(const struct sockaddr *sap,
108 char *buf, const size_t len)
109{
110 const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
111 const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
112
113 switch (sap->sa_family) {
114 case AF_UNSPEC:
115 snprintf(buf, len, "unspecified");
116 break;
117 case AF_INET:
118 snprintf(buf, len, NIPQUAD_FMT, NIPQUAD(sin->sin_addr.s_addr));
119 break;
120 case AF_INET6:
121 if (ipv6_addr_v4mapped(&sin6->sin6_addr))
122 snprintf(buf, len, NIPQUAD_FMT,
123 NIPQUAD(sin6->sin6_addr.s6_addr32[3]));
124 else
125 snprintf(buf, len, NIP6_FMT, NIP6(sin6->sin6_addr));
126 break;
127 default:
128 snprintf(buf, len, "unsupported address family");
129 break;
130 }
131}
40 132
41/* 133/*
42 * Common host lookup routine for server & client 134 * Common host lookup routine for server & client
43 */ 135 */
44static struct nlm_host *nlm_lookup_host(int server, 136static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
45 const struct sockaddr_in *sin,
46 int proto, u32 version,
47 const char *hostname,
48 unsigned int hostname_len,
49 const struct sockaddr_in *ssin)
50{ 137{
51 struct hlist_head *chain; 138 struct hlist_head *chain;
52 struct hlist_node *pos; 139 struct hlist_node *pos;
53 struct nlm_host *host; 140 struct nlm_host *host;
54 struct nsm_handle *nsm = NULL; 141 struct nsm_handle *nsm = NULL;
55 int hash;
56
57 dprintk("lockd: nlm_lookup_host("NIPQUAD_FMT"->"NIPQUAD_FMT
58 ", p=%d, v=%u, my role=%s, name=%.*s)\n",
59 NIPQUAD(ssin->sin_addr.s_addr),
60 NIPQUAD(sin->sin_addr.s_addr), proto, version,
61 server? "server" : "client",
62 hostname_len,
63 hostname? hostname : "<none>");
64 142
65
66 hash = NLM_ADDRHASH(sin->sin_addr.s_addr);
67
68 /* Lock hash table */
69 mutex_lock(&nlm_host_mutex); 143 mutex_lock(&nlm_host_mutex);
70 144
71 if (time_after_eq(jiffies, next_gc)) 145 if (time_after_eq(jiffies, next_gc))
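The XOR-fold in __nlm_hash32() is what makes the bucket choice stable across byte orders: the low eight bits of the folded value are the XOR of all four bytes of the word, so masking with NLM_HOST_NRHASH-1 (31) yields the same bucket however the network-order address was loaded. A hedged userspace illustration (assumes 32 buckets; the constant is 192.168.1.10 in its two byte orders):

#include <stdint.h>
#include <stdio.h>

#define NRHASH 32	/* mirrors NLM_HOST_NRHASH */

/* mirrors __nlm_hash32() plus the bucket mask */
static unsigned int fold32(uint32_t n)
{
	unsigned int hash = n ^ (n >> 16);

	return (hash ^ (hash >> 8)) & (NRHASH - 1);
}

int main(void)
{
	/* both prints give bucket 3: XOR folding is byte-order blind */
	printf("bucket %u\n", fold32(0xc0a8010aU));
	printf("bucket %u\n", fold32(0x0a01a8c0U));	/* byte-swapped */
	return 0;
}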
@@ -78,22 +152,22 @@ static struct nlm_host *nlm_lookup_host(int server,
78 * different NLM rpc_clients into one single nlm_host object. 152 * different NLM rpc_clients into one single nlm_host object.
79 * This would allow us to have one nlm_host per address. 153 * This would allow us to have one nlm_host per address.
80 */ 154 */
81 chain = &nlm_hosts[hash]; 155 chain = &nlm_hosts[nlm_hash_address(ni->sap)];
82 hlist_for_each_entry(host, pos, chain, h_hash) { 156 hlist_for_each_entry(host, pos, chain, h_hash) {
83 if (!nlm_cmp_addr(&host->h_addr, sin)) 157 if (!nlm_cmp_addr(nlm_addr(host), ni->sap))
84 continue; 158 continue;
85 159
86 /* See if we have an NSM handle for this client */ 160 /* See if we have an NSM handle for this client */
87 if (!nsm) 161 if (!nsm)
88 nsm = host->h_nsmhandle; 162 nsm = host->h_nsmhandle;
89 163
90 if (host->h_proto != proto) 164 if (host->h_proto != ni->protocol)
91 continue; 165 continue;
92 if (host->h_version != version) 166 if (host->h_version != ni->version)
93 continue; 167 continue;
94 if (host->h_server != server) 168 if (host->h_server != ni->server)
95 continue; 169 continue;
96 if (!nlm_cmp_addr(&host->h_saddr, ssin)) 170 if (!nlm_cmp_addr(nlm_srcaddr(host), ni->src_sap))
97 continue; 171 continue;
98 172
99 /* Move to head of hash chain. */ 173 /* Move to head of hash chain. */
@@ -101,30 +175,41 @@ static struct nlm_host *nlm_lookup_host(int server,
101 hlist_add_head(&host->h_hash, chain); 175 hlist_add_head(&host->h_hash, chain);
102 176
103 nlm_get_host(host); 177 nlm_get_host(host);
178 dprintk("lockd: nlm_lookup_host found host %s (%s)\n",
179 host->h_name, host->h_addrbuf);
104 goto out; 180 goto out;
105 } 181 }
106 if (nsm)
107 atomic_inc(&nsm->sm_count);
108
109 host = NULL;
110 182
111 /* Sadly, the host isn't in our hash table yet. See if 183 /*
112 * we have an NSM handle for it. If not, create one. 184 * The host wasn't in our hash table. If we don't
185 * have an NSM handle for it yet, create one.
113 */ 186 */
114 if (!nsm && !(nsm = nsm_find(sin, hostname, hostname_len))) 187 if (nsm)
115 goto out; 188 atomic_inc(&nsm->sm_count);
189 else {
190 host = NULL;
191 nsm = nsm_find(ni->sap, ni->salen,
192 ni->hostname, ni->hostname_len, 1);
193 if (!nsm) {
194 dprintk("lockd: nlm_lookup_host failed; "
195 "no nsm handle\n");
196 goto out;
197 }
198 }
116 199
117 host = kzalloc(sizeof(*host), GFP_KERNEL); 200 host = kzalloc(sizeof(*host), GFP_KERNEL);
118 if (!host) { 201 if (!host) {
119 nsm_release(nsm); 202 nsm_release(nsm);
203 dprintk("lockd: nlm_lookup_host failed; no memory\n");
120 goto out; 204 goto out;
121 } 205 }
122 host->h_name = nsm->sm_name; 206 host->h_name = nsm->sm_name;
123 host->h_addr = *sin; 207 memcpy(nlm_addr(host), ni->sap, ni->salen);
124 host->h_addr.sin_port = 0; /* ouch! */ 208 host->h_addrlen = ni->salen;
125 host->h_saddr = *ssin; 209 nlm_clear_port(nlm_addr(host));
126 host->h_version = version; 210 memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len);
127 host->h_proto = proto; 211 host->h_version = ni->version;
212 host->h_proto = ni->protocol;
128 host->h_rpcclnt = NULL; 213 host->h_rpcclnt = NULL;
129 mutex_init(&host->h_mutex); 214 mutex_init(&host->h_mutex);
130 host->h_nextrebind = jiffies + NLM_HOST_REBIND; 215 host->h_nextrebind = jiffies + NLM_HOST_REBIND;
@@ -135,7 +220,7 @@ static struct nlm_host *nlm_lookup_host(int server,
135 host->h_state = 0; /* pseudo NSM state */ 220 host->h_state = 0; /* pseudo NSM state */
136 host->h_nsmstate = 0; /* real NSM state */ 221 host->h_nsmstate = 0; /* real NSM state */
137 host->h_nsmhandle = nsm; 222 host->h_nsmhandle = nsm;
138 host->h_server = server; 223 host->h_server = ni->server;
139 hlist_add_head(&host->h_hash, chain); 224 hlist_add_head(&host->h_hash, chain);
140 INIT_LIST_HEAD(&host->h_lockowners); 225 INIT_LIST_HEAD(&host->h_lockowners);
141 spin_lock_init(&host->h_lock); 226 spin_lock_init(&host->h_lock);
@@ -143,6 +228,15 @@ static struct nlm_host *nlm_lookup_host(int server,
143 INIT_LIST_HEAD(&host->h_reclaim); 228 INIT_LIST_HEAD(&host->h_reclaim);
144 229
145 nrhosts++; 230 nrhosts++;
231
232 nlm_display_address((struct sockaddr *)&host->h_addr,
233 host->h_addrbuf, sizeof(host->h_addrbuf));
234 nlm_display_address((struct sockaddr *)&host->h_srcaddr,
235 host->h_srcaddrbuf, sizeof(host->h_srcaddrbuf));
236
237 dprintk("lockd: nlm_lookup_host created host %s\n",
238 host->h_name);
239
146out: 240out:
147 mutex_unlock(&nlm_host_mutex); 241 mutex_unlock(&nlm_host_mutex);
148 return host; 242 return host;
@@ -170,33 +264,103 @@ nlm_destroy_host(struct nlm_host *host)
170 kfree(host); 264 kfree(host);
171} 265}
172 266
173/* 267/**
174 * Find an NLM server handle in the cache. If there is none, create it. 268 * nlmclnt_lookup_host - Find an NLM host handle matching a remote server
269 * @sap: network address of server
270 * @salen: length of server address
271 * @protocol: transport protocol to use
272 * @version: NLM protocol version
273 * @hostname: '\0'-terminated hostname of server
274 *
275 * Returns an nlm_host structure that matches the passed-in
276 * [server address, transport protocol, NLM version, server hostname].
277 * If one doesn't already exist in the host cache, a new handle is
278 * created and returned.
175 */ 279 */
176struct nlm_host *nlmclnt_lookup_host(const struct sockaddr_in *sin, 280struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
177 int proto, u32 version, 281 const size_t salen,
178 const char *hostname, 282 const unsigned short protocol,
179 unsigned int hostname_len) 283 const u32 version, const char *hostname)
180{ 284{
181 struct sockaddr_in ssin = {0}; 285 const struct sockaddr source = {
182 286 .sa_family = AF_UNSPEC,
183 return nlm_lookup_host(0, sin, proto, version, 287 };
184 hostname, hostname_len, &ssin); 288 struct nlm_lookup_host_info ni = {
289 .server = 0,
290 .sap = sap,
291 .salen = salen,
292 .protocol = protocol,
293 .version = version,
294 .hostname = hostname,
295 .hostname_len = strlen(hostname),
296 .src_sap = &source,
297 .src_len = sizeof(source),
298 };
299
300 dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__,
301 (hostname ? hostname : "<none>"), version,
302 (protocol == IPPROTO_UDP ? "udp" : "tcp"));
303
304 return nlm_lookup_host(&ni);
185} 305}
186 306
187/* 307/**
188 * Find an NLM client handle in the cache. If there is none, create it. 308 * nlmsvc_lookup_host - Find an NLM host handle matching a remote client
309 * @rqstp: incoming NLM request
310 * @hostname: name of client host
311 * @hostname_len: length of client hostname
312 *
313 * Returns an nlm_host structure that matches the [client address,
314 * transport protocol, NLM version, client hostname] of the passed-in
315 * NLM request. If one doesn't already exist in the host cache, a
316 * new handle is created and returned.
317 *
318 * Before possibly creating a new nlm_host, construct a sockaddr
319 * for a specific source address in case the local system has
320 * multiple network addresses. The family of the address in
321 * rq_daddr is guaranteed to be the same as the family of the
322 * address in rq_addr, so it's safe to use the same family for
323 * the source address.
189 */ 324 */
190struct nlm_host * 325struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
191nlmsvc_lookup_host(struct svc_rqst *rqstp, 326 const char *hostname,
192 const char *hostname, unsigned int hostname_len) 327 const size_t hostname_len)
193{ 328{
194 struct sockaddr_in ssin = {0}; 329 struct sockaddr_in sin = {
330 .sin_family = AF_INET,
331 };
332 struct sockaddr_in6 sin6 = {
333 .sin6_family = AF_INET6,
334 };
335 struct nlm_lookup_host_info ni = {
336 .server = 1,
337 .sap = svc_addr(rqstp),
338 .salen = rqstp->rq_addrlen,
339 .protocol = rqstp->rq_prot,
340 .version = rqstp->rq_vers,
341 .hostname = hostname,
342 .hostname_len = hostname_len,
343 .src_len = rqstp->rq_addrlen,
344 };
345
346 dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__,
347 (int)hostname_len, hostname, rqstp->rq_vers,
348 (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp"));
349
350 switch (ni.sap->sa_family) {
351 case AF_INET:
352 sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr;
353 ni.src_sap = (struct sockaddr *)&sin;
354 break;
355 case AF_INET6:
356 ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6);
357 ni.src_sap = (struct sockaddr *)&sin6;
358 break;
359 default:
360 return NULL;
361 }
195 362
196 ssin.sin_addr = rqstp->rq_daddr.addr; 363 return nlm_lookup_host(&ni);
197 return nlm_lookup_host(1, svc_addr_in(rqstp),
198 rqstp->rq_prot, rqstp->rq_vers,
199 hostname, hostname_len, &ssin);
200} 364}
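
On the server side the request itself carries everything the lookup needs, so a hypothetical NLM procedure resolves its peer in one call. Sketch (assuming struct nlm_lock's caller/len fields hold the client hostname as sent on the wire):

        static struct nlm_host *example_peer(struct svc_rqst *rqstp,
                                             const struct nlm_lock *lock)
        {
                /* caller/len: client-supplied hostname and its length */
                return nlmsvc_lookup_host(rqstp, lock->caller, lock->len);
        }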
201 365
202/* 366/*
@@ -207,9 +371,8 @@ nlm_bind_host(struct nlm_host *host)
207{ 371{
208 struct rpc_clnt *clnt; 372 struct rpc_clnt *clnt;
209 373
210 dprintk("lockd: nlm_bind_host("NIPQUAD_FMT"->"NIPQUAD_FMT")\n", 374 dprintk("lockd: nlm_bind_host %s (%s), my addr=%s\n",
211 NIPQUAD(host->h_saddr.sin_addr), 375 host->h_name, host->h_addrbuf, host->h_srcaddrbuf);
212 NIPQUAD(host->h_addr.sin_addr));
213 376
214 /* Lock host handle */ 377 /* Lock host handle */
215 mutex_lock(&host->h_mutex); 378 mutex_lock(&host->h_mutex);
@@ -221,7 +384,7 @@ nlm_bind_host(struct nlm_host *host)
221 if (time_after_eq(jiffies, host->h_nextrebind)) { 384 if (time_after_eq(jiffies, host->h_nextrebind)) {
222 rpc_force_rebind(clnt); 385 rpc_force_rebind(clnt);
223 host->h_nextrebind = jiffies + NLM_HOST_REBIND; 386 host->h_nextrebind = jiffies + NLM_HOST_REBIND;
224 dprintk("lockd: next rebind in %ld jiffies\n", 387 dprintk("lockd: next rebind in %lu jiffies\n",
225 host->h_nextrebind - jiffies); 388 host->h_nextrebind - jiffies);
226 } 389 }
227 } else { 390 } else {
@@ -234,9 +397,9 @@ nlm_bind_host(struct nlm_host *host)
234 }; 397 };
235 struct rpc_create_args args = { 398 struct rpc_create_args args = {
236 .protocol = host->h_proto, 399 .protocol = host->h_proto,
237 .address = (struct sockaddr *)&host->h_addr, 400 .address = nlm_addr(host),
238 .addrsize = sizeof(host->h_addr), 401 .addrsize = host->h_addrlen,
239 .saddress = (struct sockaddr *)&host->h_saddr, 402 .saddress = nlm_srcaddr(host),
240 .timeout = &timeparms, 403 .timeout = &timeparms,
241 .servername = host->h_name, 404 .servername = host->h_name,
242 .program = &nlm_program, 405 .program = &nlm_program,
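
The nlm_addr() and nlm_srcaddr() accessors are new and not shown in this diff; presumably they just cast the host's generic address storage back to struct sockaddr, roughly as follows (an assumption -- the h_addr/h_srcaddr field names and the sockaddr_storage type are guesses, not confirmed by this patch):

        static inline struct sockaddr *nlm_addr(struct nlm_host *host)
        {
                /* assumed: h_addr is now a struct sockaddr_storage */
                return (struct sockaddr *)&host->h_addr;
        }

        static inline struct sockaddr *nlm_srcaddr(struct nlm_host *host)
        {
                /* assumed: the source address got the same treatment */
                return (struct sockaddr *)&host->h_srcaddr;
        }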
@@ -324,12 +487,16 @@ void nlm_host_rebooted(const struct sockaddr_in *sin,
324 struct nsm_handle *nsm; 487 struct nsm_handle *nsm;
325 struct nlm_host *host; 488 struct nlm_host *host;
326 489
327 dprintk("lockd: nlm_host_rebooted(%s, %u.%u.%u.%u)\n", 490 nsm = nsm_find((struct sockaddr *)sin, sizeof(*sin),
328 hostname, NIPQUAD(sin->sin_addr)); 491 hostname, hostname_len, 0);
329 492 if (nsm == NULL) {
330 /* Find the NSM handle for this peer */ 493 dprintk("lockd: never saw rebooted peer '%.*s' before\n",
331 if (!(nsm = __nsm_find(sin, hostname, hostname_len, 0))) 494 hostname_len, hostname);
332 return; 495 return;
496 }
497
498 dprintk("lockd: nlm_host_rebooted(%.*s, %s)\n",
499 hostname_len, hostname, nsm->sm_addrbuf);
333 500
334 /* When reclaiming locks on this peer, make sure that 501 /* When reclaiming locks on this peer, make sure that
335 * we set up a new notification */ 502 * we set up a new notification */
@@ -461,22 +628,23 @@ nlm_gc_hosts(void)
461static LIST_HEAD(nsm_handles); 628static LIST_HEAD(nsm_handles);
462static DEFINE_SPINLOCK(nsm_lock); 629static DEFINE_SPINLOCK(nsm_lock);
463 630
464static struct nsm_handle * 631static struct nsm_handle *nsm_find(const struct sockaddr *sap,
465__nsm_find(const struct sockaddr_in *sin, 632 const size_t salen,
466 const char *hostname, unsigned int hostname_len, 633 const char *hostname,
467 int create) 634 const size_t hostname_len,
635 const int create)
468{ 636{
469 struct nsm_handle *nsm = NULL; 637 struct nsm_handle *nsm = NULL;
470 struct nsm_handle *pos; 638 struct nsm_handle *pos;
471 639
472 if (!sin) 640 if (!sap)
473 return NULL; 641 return NULL;
474 642
475 if (hostname && memchr(hostname, '/', hostname_len) != NULL) { 643 if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
476 if (printk_ratelimit()) { 644 if (printk_ratelimit()) {
477 printk(KERN_WARNING "Invalid hostname \"%.*s\" " 645 printk(KERN_WARNING "Invalid hostname \"%.*s\" "
478 "in NFS lock request\n", 646 "in NFS lock request\n",
479 hostname_len, hostname); 647 (int)hostname_len, hostname);
480 } 648 }
481 return NULL; 649 return NULL;
482 } 650 }
@@ -489,7 +657,7 @@ retry:
489 if (strlen(pos->sm_name) != hostname_len 657 if (strlen(pos->sm_name) != hostname_len
490 || memcmp(pos->sm_name, hostname, hostname_len)) 658 || memcmp(pos->sm_name, hostname, hostname_len))
491 continue; 659 continue;
492 } else if (!nlm_cmp_addr(&pos->sm_addr, sin)) 660 } else if (!nlm_cmp_addr(nsm_addr(pos), sap))
493 continue; 661 continue;
494 atomic_inc(&pos->sm_count); 662 atomic_inc(&pos->sm_count);
495 kfree(nsm); 663 kfree(nsm);
@@ -509,10 +677,13 @@ retry:
509 if (nsm == NULL) 677 if (nsm == NULL)
510 return NULL; 678 return NULL;
511 679
512 nsm->sm_addr = *sin; 680 memcpy(nsm_addr(nsm), sap, salen);
681 nsm->sm_addrlen = salen;
513 nsm->sm_name = (char *) (nsm + 1); 682 nsm->sm_name = (char *) (nsm + 1);
514 memcpy(nsm->sm_name, hostname, hostname_len); 683 memcpy(nsm->sm_name, hostname, hostname_len);
515 nsm->sm_name[hostname_len] = '\0'; 684 nsm->sm_name[hostname_len] = '\0';
685 nlm_display_address((struct sockaddr *)&nsm->sm_addr,
686 nsm->sm_addrbuf, sizeof(nsm->sm_addrbuf));
516 atomic_set(&nsm->sm_count, 1); 687 atomic_set(&nsm->sm_count, 1);
517 goto retry; 688 goto retry;
518 689
@@ -521,13 +692,6 @@ found:
521 return nsm; 692 return nsm;
522} 693}
523 694
524static struct nsm_handle *
525nsm_find(const struct sockaddr_in *sin, const char *hostname,
526 unsigned int hostname_len)
527{
528 return __nsm_find(sin, hostname, hostname_len, 1);
529}
530
531/* 695/*
532 * Release an NSM handle 696 * Release an NSM handle
533 */ 697 */
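
The rewritten nsm_find() keeps lockd's optimistic-allocation idiom: search under nsm_lock; if the handle is missing, drop the lock, allocate, and retry the search, so a concurrent creator wins and the loser's allocation is freed. The same pattern in miniature (illustrative, with a made-up item type):

        struct example_item {
                struct list_head link;
                atomic_t count;
                int key;
        };

        static LIST_HEAD(example_items);
        static DEFINE_SPINLOCK(example_lock);

        static struct example_item *example_find(int key)
        {
                struct example_item *new = NULL;
                struct example_item *pos;

        retry:
                spin_lock(&example_lock);
                list_for_each_entry(pos, &example_items, link) {
                        if (pos->key != key)
                                continue;
                        atomic_inc(&pos->count);  /* found: take a reference */
                        kfree(new);               /* racing copy, if any */
                        spin_unlock(&example_lock);
                        return pos;
                }
                if (new != NULL) {
                        /* second pass: nobody beat us, insert our copy */
                        list_add(&new->link, &example_items);
                        spin_unlock(&example_lock);
                        return new;
                }
                spin_unlock(&example_lock);

                new = kzalloc(sizeof(*new), GFP_KERNEL); /* allocate unlocked */
                if (new == NULL)
                        return NULL;
                new->key = key;
                atomic_set(&new->count, 1);
                goto retry;
        }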
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index e4d563543b11..4e7e958e8f67 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -51,7 +51,7 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
51 51
52 memset(&args, 0, sizeof(args)); 52 memset(&args, 0, sizeof(args));
53 args.mon_name = nsm->sm_name; 53 args.mon_name = nsm->sm_name;
54 args.addr = nsm->sm_addr.sin_addr.s_addr; 54 args.addr = nsm_addr_in(nsm)->sin_addr.s_addr;
55 args.prog = NLM_PROGRAM; 55 args.prog = NLM_PROGRAM;
56 args.vers = 3; 56 args.vers = 3;
57 args.proc = NLMPROC_NSM_NOTIFY; 57 args.proc = NLMPROC_NSM_NOTIFY;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 5bd9bf0fa9df..c631a83931ce 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -51,7 +51,6 @@ static DEFINE_MUTEX(nlmsvc_mutex);
51static unsigned int nlmsvc_users; 51static unsigned int nlmsvc_users;
52static struct task_struct *nlmsvc_task; 52static struct task_struct *nlmsvc_task;
53static struct svc_rqst *nlmsvc_rqst; 53static struct svc_rqst *nlmsvc_rqst;
54int nlmsvc_grace_period;
55unsigned long nlmsvc_timeout; 54unsigned long nlmsvc_timeout;
56 55
57/* 56/*
@@ -85,27 +84,23 @@ static unsigned long get_lockd_grace_period(void)
85 return nlm_timeout * 5 * HZ; 84 return nlm_timeout * 5 * HZ;
86} 85}
87 86
88unsigned long get_nfs_grace_period(void) 87static struct lock_manager lockd_manager = {
89{ 88};
90 unsigned long lockdgrace = get_lockd_grace_period();
91 unsigned long nfsdgrace = 0;
92
93 if (nlmsvc_ops)
94 nfsdgrace = nlmsvc_ops->get_grace_period();
95
96 return max(lockdgrace, nfsdgrace);
97}
98EXPORT_SYMBOL(get_nfs_grace_period);
99 89
100static unsigned long set_grace_period(void) 90static void grace_ender(struct work_struct *not_used)
101{ 91{
102 nlmsvc_grace_period = 1; 92 locks_end_grace(&lockd_manager);
103 return get_nfs_grace_period() + jiffies;
104} 93}
105 94
106static inline void clear_grace_period(void) 95static DECLARE_DELAYED_WORK(grace_period_end, grace_ender);
96
97static void set_grace_period(void)
107{ 98{
108 nlmsvc_grace_period = 0; 99 unsigned long grace_period = get_lockd_grace_period();
100
101 locks_start_grace(&lockd_manager);
102 cancel_delayed_work_sync(&grace_period_end);
103 schedule_delayed_work(&grace_period_end, grace_period);
109} 104}
110 105
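
Grace-period expiry is now event-driven instead of being polled in lockd's main loop: set_grace_period() arms a delayed work item, and grace_ender() runs when it fires. The same pattern in isolation (sketch with illustrative names):

        static void example_timeout(struct work_struct *unused)
        {
                /* runs in process context once the delay elapses */
        }

        static DECLARE_DELAYED_WORK(example_work, example_timeout);

        static void example_arm(unsigned long delay_jiffies)
        {
                /* cancel_delayed_work_sync() makes re-arming safe: any
                 * previously queued instance has finished before the new
                 * one is queued */
                cancel_delayed_work_sync(&example_work);
                schedule_delayed_work(&example_work, delay_jiffies);
        }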
111/* 106/*
@@ -116,7 +111,6 @@ lockd(void *vrqstp)
116{ 111{
117 int err = 0, preverr = 0; 112 int err = 0, preverr = 0;
118 struct svc_rqst *rqstp = vrqstp; 113 struct svc_rqst *rqstp = vrqstp;
119 unsigned long grace_period_expire;
120 114
121 /* try_to_freeze() is called from svc_recv() */ 115 /* try_to_freeze() is called from svc_recv() */
122 set_freezable(); 116 set_freezable();
@@ -139,7 +133,7 @@ lockd(void *vrqstp)
139 nlm_timeout = LOCKD_DFLT_TIMEO; 133 nlm_timeout = LOCKD_DFLT_TIMEO;
140 nlmsvc_timeout = nlm_timeout * HZ; 134 nlmsvc_timeout = nlm_timeout * HZ;
141 135
142 grace_period_expire = set_grace_period(); 136 set_grace_period();
143 137
144 /* 138 /*
145 * The main request loop. We don't terminate until the last 139 * The main request loop. We don't terminate until the last
@@ -153,21 +147,12 @@ lockd(void *vrqstp)
153 flush_signals(current); 147 flush_signals(current);
154 if (nlmsvc_ops) { 148 if (nlmsvc_ops) {
155 nlmsvc_invalidate_all(); 149 nlmsvc_invalidate_all();
156 grace_period_expire = set_grace_period(); 150 set_grace_period();
157 } 151 }
158 continue; 152 continue;
159 } 153 }
160 154
161 /* 155 timeout = nlmsvc_retry_blocked();
162 * Retry any blocked locks that have been notified by
163 * the VFS. Don't do this during grace period.
164 * (Theoretically, there shouldn't even be blocked locks
165 * during grace period).
166 */
167 if (!nlmsvc_grace_period) {
168 timeout = nlmsvc_retry_blocked();
169 } else if (time_before(grace_period_expire, jiffies))
170 clear_grace_period();
171 156
172 /* 157 /*
173 * Find a socket with data available and call its 158 * Find a socket with data available and call its
@@ -195,6 +180,7 @@ lockd(void *vrqstp)
195 svc_process(rqstp); 180 svc_process(rqstp);
196 } 181 }
197 flush_signals(current); 182 flush_signals(current);
183 cancel_delayed_work_sync(&grace_period_end);
198 if (nlmsvc_ops) 184 if (nlmsvc_ops)
199 nlmsvc_invalidate_all(); 185 nlmsvc_invalidate_all();
200 nlm_shutdown_hosts(); 186 nlm_shutdown_hosts();
@@ -203,25 +189,28 @@ lockd(void *vrqstp)
203} 189}
204 190
205/* 191/*
206 * Make any sockets that are needed but not present. 192 * Ensure there are active UDP and TCP listeners for lockd.
207 * If nlm_udpport or nlm_tcpport were set as module 193 *
208 * options, make those sockets unconditionally 194 * Even if we have only TCP NFS mounts and/or TCP NFSDs, some
195 * local services (such as rpc.statd) still require UDP, and
196 * some NFS servers do not yet support NLM over TCP.
197 *
198 * Returns zero if all listeners are available; otherwise a
199 * negative errno value is returned.
209 */ 200 */
210static int make_socks(struct svc_serv *serv, int proto) 201static int make_socks(struct svc_serv *serv)
211{ 202{
212 static int warned; 203 static int warned;
213 struct svc_xprt *xprt; 204 struct svc_xprt *xprt;
214 int err = 0; 205 int err = 0;
215 206
216 if (proto == IPPROTO_UDP || nlm_udpport) { 207 xprt = svc_find_xprt(serv, "udp", 0, 0);
217 xprt = svc_find_xprt(serv, "udp", 0, 0); 208 if (!xprt)
218 if (!xprt) 209 err = svc_create_xprt(serv, "udp", nlm_udpport,
219 err = svc_create_xprt(serv, "udp", nlm_udpport, 210 SVC_SOCK_DEFAULTS);
220 SVC_SOCK_DEFAULTS); 211 else
221 else 212 svc_xprt_put(xprt);
222 svc_xprt_put(xprt); 213 if (err >= 0) {
223 }
224 if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport)) {
225 xprt = svc_find_xprt(serv, "tcp", 0, 0); 214 xprt = svc_find_xprt(serv, "tcp", 0, 0);
226 if (!xprt) 215 if (!xprt)
227 err = svc_create_xprt(serv, "tcp", nlm_tcpport, 216 err = svc_create_xprt(serv, "tcp", nlm_tcpport,
@@ -241,8 +230,7 @@ static int make_socks(struct svc_serv *serv, int proto)
241/* 230/*
242 * Bring up the lockd process if it's not already up. 231 * Bring up the lockd process if it's not already up.
243 */ 232 */
244int 233int lockd_up(void)
245lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */
246{ 234{
247 struct svc_serv *serv; 235 struct svc_serv *serv;
248 int error = 0; 236 int error = 0;
@@ -251,11 +239,8 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */
251 /* 239 /*
252 * Check whether we're already up and running. 240 * Check whether we're already up and running.
253 */ 241 */
254 if (nlmsvc_rqst) { 242 if (nlmsvc_rqst)
255 if (proto)
256 error = make_socks(nlmsvc_rqst->rq_server, proto);
257 goto out; 243 goto out;
258 }
259 244
260 /* 245 /*
261 * Sanity check: if there's no pid, 246 * Sanity check: if there's no pid,
@@ -266,13 +251,14 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */
266 "lockd_up: no pid, %d users??\n", nlmsvc_users); 251 "lockd_up: no pid, %d users??\n", nlmsvc_users);
267 252
268 error = -ENOMEM; 253 error = -ENOMEM;
269 serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL); 254 serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, AF_INET, NULL);
270 if (!serv) { 255 if (!serv) {
271 printk(KERN_WARNING "lockd_up: create service failed\n"); 256 printk(KERN_WARNING "lockd_up: create service failed\n");
272 goto out; 257 goto out;
273 } 258 }
274 259
275 if ((error = make_socks(serv, proto)) < 0) 260 error = make_socks(serv);
261 if (error < 0)
276 goto destroy_and_out; 262 goto destroy_and_out;
277 263
278 /* 264 /*
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 399444639337..014f6ce48172 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -83,17 +83,11 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
83{ 83{
84 struct nlm_host *host; 84 struct nlm_host *host;
85 struct nlm_file *file; 85 struct nlm_file *file;
86 int rc = rpc_success; 86 __be32 rc = rpc_success;
87 87
88 dprintk("lockd: TEST4 called\n"); 88 dprintk("lockd: TEST4 called\n");
89 resp->cookie = argp->cookie; 89 resp->cookie = argp->cookie;
90 90
91 /* Don't accept test requests during grace period */
92 if (nlmsvc_grace_period) {
93 resp->status = nlm_lck_denied_grace_period;
94 return rc;
95 }
96
97 /* Obtain client and file */ 91 /* Obtain client and file */
98 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) 92 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
99 return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; 93 return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
@@ -116,18 +110,12 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
116{ 110{
117 struct nlm_host *host; 111 struct nlm_host *host;
118 struct nlm_file *file; 112 struct nlm_file *file;
119 int rc = rpc_success; 113 __be32 rc = rpc_success;
120 114
121 dprintk("lockd: LOCK called\n"); 115 dprintk("lockd: LOCK called\n");
122 116
123 resp->cookie = argp->cookie; 117 resp->cookie = argp->cookie;
124 118
125 /* Don't accept new lock requests during grace period */
126 if (nlmsvc_grace_period && !argp->reclaim) {
127 resp->status = nlm_lck_denied_grace_period;
128 return rc;
129 }
130
131 /* Obtain client and file */ 119 /* Obtain client and file */
132 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) 120 if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
133 return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; 121 return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
@@ -146,7 +134,8 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
146 134
147 /* Now try to lock the file */ 135 /* Now try to lock the file */
148 resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock, 136 resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock,
149 argp->block, &argp->cookie); 137 argp->block, &argp->cookie,
138 argp->reclaim);
150 if (resp->status == nlm_drop_reply) 139 if (resp->status == nlm_drop_reply)
151 rc = rpc_drop_reply; 140 rc = rpc_drop_reply;
152 else 141 else
@@ -169,7 +158,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
169 resp->cookie = argp->cookie; 158 resp->cookie = argp->cookie;
170 159
171 /* Don't accept requests during grace period */ 160 /* Don't accept requests during grace period */
172 if (nlmsvc_grace_period) { 161 if (locks_in_grace()) {
173 resp->status = nlm_lck_denied_grace_period; 162 resp->status = nlm_lck_denied_grace_period;
174 return rpc_success; 163 return rpc_success;
175 } 164 }
@@ -202,7 +191,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
202 resp->cookie = argp->cookie; 191 resp->cookie = argp->cookie;
203 192
204 /* Don't accept new lock requests during grace period */ 193 /* Don't accept new lock requests during grace period */
205 if (nlmsvc_grace_period) { 194 if (locks_in_grace()) {
206 resp->status = nlm_lck_denied_grace_period; 195 resp->status = nlm_lck_denied_grace_period;
207 return rpc_success; 196 return rpc_success;
208 } 197 }
@@ -231,7 +220,7 @@ nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
231 resp->cookie = argp->cookie; 220 resp->cookie = argp->cookie;
232 221
233 dprintk("lockd: GRANTED called\n"); 222 dprintk("lockd: GRANTED called\n");
234 resp->status = nlmclnt_grant(svc_addr_in(rqstp), &argp->lock); 223 resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock);
235 dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); 224 dprintk("lockd: GRANTED status %d\n", ntohl(resp->status));
236 return rpc_success; 225 return rpc_success;
237} 226}
@@ -341,7 +330,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
341 resp->cookie = argp->cookie; 330 resp->cookie = argp->cookie;
342 331
343 /* Don't accept new lock requests during grace period */ 332 /* Don't accept new lock requests during grace period */
344 if (nlmsvc_grace_period && !argp->reclaim) { 333 if (locks_in_grace() && !argp->reclaim) {
345 resp->status = nlm_lck_denied_grace_period; 334 resp->status = nlm_lck_denied_grace_period;
346 return rpc_success; 335 return rpc_success;
347 } 336 }
@@ -374,7 +363,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
374 resp->cookie = argp->cookie; 363 resp->cookie = argp->cookie;
375 364
376 /* Don't accept requests during grace period */ 365 /* Don't accept requests during grace period */
377 if (nlmsvc_grace_period) { 366 if (locks_in_grace()) {
378 resp->status = nlm_lck_denied_grace_period; 367 resp->status = nlm_lck_denied_grace_period;
379 return rpc_success; 368 return rpc_success;
380 } 369 }
@@ -432,11 +421,9 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
432{ 421{
433 struct sockaddr_in saddr; 422 struct sockaddr_in saddr;
434 423
435 memcpy(&saddr, svc_addr_in(rqstp), sizeof(saddr));
436
437 dprintk("lockd: SM_NOTIFY called\n"); 424 dprintk("lockd: SM_NOTIFY called\n");
438 if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK) 425
439 || ntohs(saddr.sin_port) >= 1024) { 426 if (!nlm_privileged_requester(rqstp)) {
440 char buf[RPC_MAX_ADDRBUFLEN]; 427 char buf[RPC_MAX_ADDRBUFLEN];
441 printk(KERN_WARNING "lockd: rejected NSM callback from %s\n", 428 printk(KERN_WARNING "lockd: rejected NSM callback from %s\n",
442 svc_print_addr(rqstp, buf, sizeof(buf))); 429 svc_print_addr(rqstp, buf, sizeof(buf)));
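
nlm_privileged_requester() is new and not part of this diff; presumably it generalizes the old IPv4-only test (loopback source, privileged port) to both address families, along these lines (assumed implementation, not confirmed by this patch):

        /* Assumed shape; PROT_SOCK is the privileged-port bound (1024) */
        static inline int nlm_privileged_requester(const struct svc_rqst *rqstp)
        {
                const struct sockaddr *sap = svc_addr(rqstp);

                switch (sap->sa_family) {
                case AF_INET: {
                        const struct sockaddr_in *sin =
                                        (const struct sockaddr_in *)sap;
                        if (ntohs(sin->sin_port) >= PROT_SOCK)
                                return 0;
                        return ipv4_is_loopback(sin->sin_addr.s_addr);
                }
                case AF_INET6: {
                        const struct sockaddr_in6 *sin6 =
                                        (const struct sockaddr_in6 *)sap;
                        if (ntohs(sin6->sin6_port) >= PROT_SOCK)
                                return 0;
                        return ipv6_addr_type(&sin6->sin6_addr) &
                                        IPV6_ADDR_LOOPBACK;
                }
                default:
                        return 0;
                }
        }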
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index cf0d5c2c318d..6063a8e4b9f3 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -360,7 +360,7 @@ nlmsvc_defer_lock_rqst(struct svc_rqst *rqstp, struct nlm_block *block)
360__be32 360__be32
361nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, 361nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
362 struct nlm_host *host, struct nlm_lock *lock, int wait, 362 struct nlm_host *host, struct nlm_lock *lock, int wait,
363 struct nlm_cookie *cookie) 363 struct nlm_cookie *cookie, int reclaim)
364{ 364{
365 struct nlm_block *block = NULL; 365 struct nlm_block *block = NULL;
366 int error; 366 int error;
@@ -406,6 +406,15 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
406 goto out; 406 goto out;
407 } 407 }
408 408
409 if (locks_in_grace() && !reclaim) {
410 ret = nlm_lck_denied_grace_period;
411 goto out;
412 }
413 if (reclaim && !locks_in_grace()) {
414 ret = nlm_lck_denied_grace_period;
415 goto out;
416 }
417
409 if (!wait) 418 if (!wait)
410 lock->fl.fl_flags &= ~FL_SLEEP; 419 lock->fl.fl_flags &= ~FL_SLEEP;
411 error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); 420 error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL);
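
The two new checks in nlmsvc_lock() encode a single rule: the reclaim flag must agree with the grace state. Ordinary requests are refused during grace, and reclaims are refused once it ends. Equivalently (illustrative helper, not in the patch):

        /* A lock request is acceptable iff "reclaim" matches the grace state */
        static inline int example_grace_permits(int reclaim)
        {
                if (locks_in_grace())
                        return reclaim;
                return !reclaim;
        }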
@@ -502,6 +511,10 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
502 goto out; 511 goto out;
503 } 512 }
504 513
514 if (locks_in_grace()) {
515 ret = nlm_lck_denied_grace_period;
516 goto out;
517 }
505 error = vfs_test_lock(file->f_file, &lock->fl); 518 error = vfs_test_lock(file->f_file, &lock->fl);
506 if (error == FILE_LOCK_DEFERRED) { 519 if (error == FILE_LOCK_DEFERRED) {
507 ret = nlmsvc_defer_lock_rqst(rqstp, block); 520 ret = nlmsvc_defer_lock_rqst(rqstp, block);
@@ -582,6 +595,9 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock)
582 (long long)lock->fl.fl_start, 595 (long long)lock->fl.fl_start,
583 (long long)lock->fl.fl_end); 596 (long long)lock->fl.fl_end);
584 597
598 if (locks_in_grace())
599 return nlm_lck_denied_grace_period;
600
585 mutex_lock(&file->f_mutex); 601 mutex_lock(&file->f_mutex);
586 block = nlmsvc_lookup_block(file, lock); 602 block = nlmsvc_lookup_block(file, lock);
587 mutex_unlock(&file->f_mutex); 603 mutex_unlock(&file->f_mutex);
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 76019d2ff72d..548b0bb2b84d 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -112,17 +112,11 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
112{ 112{
113 struct nlm_host *host; 113 struct nlm_host *host;
114 struct nlm_file *file; 114 struct nlm_file *file;
115 int rc = rpc_success; 115 __be32 rc = rpc_success;
116 116
117 dprintk("lockd: TEST called\n"); 117 dprintk("lockd: TEST called\n");
118 resp->cookie = argp->cookie; 118 resp->cookie = argp->cookie;
119 119
120 /* Don't accept test requests during grace period */
121 if (nlmsvc_grace_period) {
122 resp->status = nlm_lck_denied_grace_period;
123 return rc;
124 }
125
126 /* Obtain client and file */ 120 /* Obtain client and file */
127 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) 121 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
128 return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; 122 return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
@@ -146,18 +140,12 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
146{ 140{
147 struct nlm_host *host; 141 struct nlm_host *host;
148 struct nlm_file *file; 142 struct nlm_file *file;
149 int rc = rpc_success; 143 __be32 rc = rpc_success;
150 144
151 dprintk("lockd: LOCK called\n"); 145 dprintk("lockd: LOCK called\n");
152 146
153 resp->cookie = argp->cookie; 147 resp->cookie = argp->cookie;
154 148
155 /* Don't accept new lock requests during grace period */
156 if (nlmsvc_grace_period && !argp->reclaim) {
157 resp->status = nlm_lck_denied_grace_period;
158 return rc;
159 }
160
161 /* Obtain client and file */ 149 /* Obtain client and file */
162 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) 150 if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
163 return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; 151 return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
@@ -176,7 +164,8 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
176 164
177 /* Now try to lock the file */ 165 /* Now try to lock the file */
178 resp->status = cast_status(nlmsvc_lock(rqstp, file, host, &argp->lock, 166 resp->status = cast_status(nlmsvc_lock(rqstp, file, host, &argp->lock,
179 argp->block, &argp->cookie)); 167 argp->block, &argp->cookie,
168 argp->reclaim));
180 if (resp->status == nlm_drop_reply) 169 if (resp->status == nlm_drop_reply)
181 rc = rpc_drop_reply; 170 rc = rpc_drop_reply;
182 else 171 else
@@ -199,7 +188,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
199 resp->cookie = argp->cookie; 188 resp->cookie = argp->cookie;
200 189
201 /* Don't accept requests during grace period */ 190 /* Don't accept requests during grace period */
202 if (nlmsvc_grace_period) { 191 if (locks_in_grace()) {
203 resp->status = nlm_lck_denied_grace_period; 192 resp->status = nlm_lck_denied_grace_period;
204 return rpc_success; 193 return rpc_success;
205 } 194 }
@@ -232,7 +221,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
232 resp->cookie = argp->cookie; 221 resp->cookie = argp->cookie;
233 222
234 /* Don't accept new lock requests during grace period */ 223 /* Don't accept new lock requests during grace period */
235 if (nlmsvc_grace_period) { 224 if (locks_in_grace()) {
236 resp->status = nlm_lck_denied_grace_period; 225 resp->status = nlm_lck_denied_grace_period;
237 return rpc_success; 226 return rpc_success;
238 } 227 }
@@ -261,7 +250,7 @@ nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
261 resp->cookie = argp->cookie; 250 resp->cookie = argp->cookie;
262 251
263 dprintk("lockd: GRANTED called\n"); 252 dprintk("lockd: GRANTED called\n");
264 resp->status = nlmclnt_grant(svc_addr_in(rqstp), &argp->lock); 253 resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock);
265 dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); 254 dprintk("lockd: GRANTED status %d\n", ntohl(resp->status));
266 return rpc_success; 255 return rpc_success;
267} 256}
@@ -373,7 +362,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
373 resp->cookie = argp->cookie; 362 resp->cookie = argp->cookie;
374 363
375 /* Don't accept new lock requests during grace period */ 364 /* Don't accept new lock requests during grace period */
376 if (nlmsvc_grace_period && !argp->reclaim) { 365 if (locks_in_grace() && !argp->reclaim) {
377 resp->status = nlm_lck_denied_grace_period; 366 resp->status = nlm_lck_denied_grace_period;
378 return rpc_success; 367 return rpc_success;
379 } 368 }
@@ -406,7 +395,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
406 resp->cookie = argp->cookie; 395 resp->cookie = argp->cookie;
407 396
408 /* Don't accept requests during grace period */ 397 /* Don't accept requests during grace period */
409 if (nlmsvc_grace_period) { 398 if (locks_in_grace()) {
410 resp->status = nlm_lck_denied_grace_period; 399 resp->status = nlm_lck_denied_grace_period;
411 return rpc_success; 400 return rpc_success;
412 } 401 }
@@ -464,11 +453,9 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
464{ 453{
465 struct sockaddr_in saddr; 454 struct sockaddr_in saddr;
466 455
467 memcpy(&saddr, svc_addr_in(rqstp), sizeof(saddr));
468
469 dprintk("lockd: SM_NOTIFY called\n"); 456 dprintk("lockd: SM_NOTIFY called\n");
470 if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK) 457
471 || ntohs(saddr.sin_port) >= 1024) { 458 if (!nlm_privileged_requester(rqstp)) {
472 char buf[RPC_MAX_ADDRBUFLEN]; 459 char buf[RPC_MAX_ADDRBUFLEN];
473 printk(KERN_WARNING "lockd: rejected NSM callback from %s\n", 460 printk(KERN_WARNING "lockd: rejected NSM callback from %s\n",
474 svc_print_addr(rqstp, buf, sizeof(buf))); 461 svc_print_addr(rqstp, buf, sizeof(buf)));
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 198b4e55b373..34c2766e27c7 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -418,7 +418,7 @@ EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb);
418static int 418static int
419nlmsvc_match_ip(void *datap, struct nlm_host *host) 419nlmsvc_match_ip(void *datap, struct nlm_host *host)
420{ 420{
421 return nlm_cmp_addr(&host->h_saddr, datap); 421 return nlm_cmp_addr(nlm_srcaddr(host), datap);
422} 422}
423 423
424/** 424/**
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 3e459e18cc31..1f226290c67c 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -351,8 +351,6 @@ nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp)
351 argp->state = ntohl(*p++); 351 argp->state = ntohl(*p++);
352 /* Preserve the address in network byte order */ 352 /* Preserve the address in network byte order */
353 argp->addr = *p++; 353 argp->addr = *p++;
354 argp->vers = *p++;
355 argp->proto = *p++;
356 return xdr_argsize_check(rqstp, p); 354 return xdr_argsize_check(rqstp, p);
357} 355}
358 356
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 43ff9397e6c6..50c493a8ad8e 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -358,8 +358,6 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp
358 argp->state = ntohl(*p++); 358 argp->state = ntohl(*p++);
359 /* Preserve the address in network byte order */ 359 /* Preserve the address in network byte order */
360 argp->addr = *p++; 360 argp->addr = *p++;
361 argp->vers = *p++;
362 argp->proto = *p++;
363 return xdr_argsize_check(rqstp, p); 361 return xdr_argsize_check(rqstp, p);
364} 362}
365 363
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index f447f4b4476c..6a09760c5960 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -105,7 +105,8 @@ int nfs_callback_up(void)
105 mutex_lock(&nfs_callback_mutex); 105 mutex_lock(&nfs_callback_mutex);
106 if (nfs_callback_info.users++ || nfs_callback_info.task != NULL) 106 if (nfs_callback_info.users++ || nfs_callback_info.task != NULL)
107 goto out; 107 goto out;
108 serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL); 108 serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE,
109 AF_INET, NULL);
109 ret = -ENOMEM; 110 ret = -ENOMEM;
110 if (!serv) 111 if (!serv)
111 goto out_err; 112 goto out_err;
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 46763d1cd397..8478fc25daee 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -127,7 +127,7 @@ enum {
127 Opt_err 127 Opt_err
128}; 128};
129 129
130static match_table_t __initdata tokens = { 130static match_table_t __initconst tokens = {
131 {Opt_port, "port=%u"}, 131 {Opt_port, "port=%u"},
132 {Opt_rsize, "rsize=%u"}, 132 {Opt_rsize, "rsize=%u"},
133 {Opt_wsize, "wsize=%u"}, 133 {Opt_wsize, "wsize=%u"},
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 9abcd2b329f7..ffb697416cb1 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -98,7 +98,7 @@ enum {
98 Opt_err 98 Opt_err
99}; 99};
100 100
101static match_table_t nfs_mount_option_tokens = { 101static const match_table_t nfs_mount_option_tokens = {
102 { Opt_userspace, "bg" }, 102 { Opt_userspace, "bg" },
103 { Opt_userspace, "fg" }, 103 { Opt_userspace, "fg" },
104 { Opt_userspace, "retry=%s" }, 104 { Opt_userspace, "retry=%s" },
@@ -163,7 +163,7 @@ enum {
163 Opt_xprt_err 163 Opt_xprt_err
164}; 164};
165 165
166static match_table_t nfs_xprt_protocol_tokens = { 166static const match_table_t nfs_xprt_protocol_tokens = {
167 { Opt_xprt_udp, "udp" }, 167 { Opt_xprt_udp, "udp" },
168 { Opt_xprt_tcp, "tcp" }, 168 { Opt_xprt_tcp, "tcp" },
169 { Opt_xprt_rdma, "rdma" }, 169 { Opt_xprt_rdma, "rdma" },
@@ -180,7 +180,7 @@ enum {
180 Opt_sec_err 180 Opt_sec_err
181}; 181};
182 182
183static match_table_t nfs_secflavor_tokens = { 183static const match_table_t nfs_secflavor_tokens = {
184 { Opt_sec_none, "none" }, 184 { Opt_sec_none, "none" },
185 { Opt_sec_none, "null" }, 185 { Opt_sec_none, "null" },
186 { Opt_sec_sys, "sys" }, 186 { Opt_sec_sys, "sys" },
@@ -1279,6 +1279,12 @@ static int nfs_parse_mount_options(char *raw,
1279 } 1279 }
1280 } 1280 }
1281 1281
1282 if (errors > 0) {
1283 dfprintk(MOUNT, "NFS: parsing encountered %d error%s\n",
1284 errors, (errors == 1 ? "" : "s"));
1285 if (!sloppy)
1286 return 0;
1287 }
1282 return 1; 1288 return 1;
1283 1289
1284out_nomem: 1290out_nomem:
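
The new error accounting gives "sloppy" its documented meaning: bad options are tolerated only when the user asked for that. In effect (sketch):

        /* errors counts invalid or unrecognized options seen while parsing */
        static int example_options_ok(int errors, int sloppy)
        {
                if (errors > 0 && !sloppy)
                        return 0;       /* reject the mount outright */
                return 1;               /* proceed, ignoring bad options */
        }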
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 33bfcf09db46..9dc036f18356 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1023,7 +1023,7 @@ exp_export(struct nfsctl_export *nxp)
1023 /* Look up the dentry */ 1023 /* Look up the dentry */
1024 err = path_lookup(nxp->ex_path, 0, &nd); 1024 err = path_lookup(nxp->ex_path, 0, &nd);
1025 if (err) 1025 if (err)
1026 goto out_unlock; 1026 goto out_put_clp;
1027 err = -EINVAL; 1027 err = -EINVAL;
1028 1028
1029 exp = exp_get_by_name(clp, nd.path.mnt, nd.path.dentry, NULL); 1029 exp = exp_get_by_name(clp, nd.path.mnt, nd.path.dentry, NULL);
@@ -1090,9 +1090,9 @@ finish:
1090 exp_put(exp); 1090 exp_put(exp);
1091 if (fsid_key && !IS_ERR(fsid_key)) 1091 if (fsid_key && !IS_ERR(fsid_key))
1092 cache_put(&fsid_key->h, &svc_expkey_cache); 1092 cache_put(&fsid_key->h, &svc_expkey_cache);
1093 if (clp)
1094 auth_domain_put(clp);
1095 path_put(&nd.path); 1093 path_put(&nd.path);
1094out_put_clp:
1095 auth_domain_put(clp);
1096out_unlock: 1096out_unlock:
1097 exp_writeunlock(); 1097 exp_writeunlock();
1098out: 1098out:
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index 15c6faeec77c..b2786a5f9afe 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -70,7 +70,6 @@ nlm_fclose(struct file *filp)
70static struct nlmsvc_binding nfsd_nlm_ops = { 70static struct nlmsvc_binding nfsd_nlm_ops = {
71 .fopen = nlm_fopen, /* open file for locking */ 71 .fopen = nlm_fopen, /* open file for locking */
72 .fclose = nlm_fclose, /* close file */ 72 .fclose = nlm_fclose, /* close file */
73 .get_grace_period = get_nfs4_grace_period,
74}; 73};
75 74
76void 75void
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 4d617ea28cfc..9dbd2eb91281 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -63,7 +63,8 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp,
63 SVCFH_fmt(&argp->fh)); 63 SVCFH_fmt(&argp->fh));
64 64
65 fh_copy(&resp->fh, &argp->fh); 65 fh_copy(&resp->fh, &argp->fh);
66 nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); 66 nfserr = fh_verify(rqstp, &resp->fh, 0,
67 NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
67 if (nfserr) 68 if (nfserr)
68 RETURN_STATUS(nfserr); 69 RETURN_STATUS(nfserr);
69 70
@@ -530,7 +531,7 @@ nfsd3_proc_fsstat(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
530 dprintk("nfsd: FSSTAT(3) %s\n", 531 dprintk("nfsd: FSSTAT(3) %s\n",
531 SVCFH_fmt(&argp->fh)); 532 SVCFH_fmt(&argp->fh));
532 533
533 nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats); 534 nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0);
534 fh_put(&argp->fh); 535 fh_put(&argp->fh);
535 RETURN_STATUS(nfserr); 536 RETURN_STATUS(nfserr);
536} 537}
@@ -558,7 +559,8 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
558 resp->f_maxfilesize = ~(u32) 0; 559 resp->f_maxfilesize = ~(u32) 0;
559 resp->f_properties = NFS3_FSF_DEFAULT; 560 resp->f_properties = NFS3_FSF_DEFAULT;
560 561
561 nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP); 562 nfserr = fh_verify(rqstp, &argp->fh, 0,
563 NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
562 564
563 /* Check special features of the file system. May request 565 /* Check special features of the file system. May request
564 * different read/write sizes for file systems known to have 566 * different read/write sizes for file systems known to have
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index b6ed38380ab8..54b8b4140c8f 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -443,7 +443,7 @@ init_state(struct posix_acl_state *state, int cnt)
443 * enough space for either: 443 * enough space for either:
444 */ 444 */
445 alloc = sizeof(struct posix_ace_state_array) 445 alloc = sizeof(struct posix_ace_state_array)
446 + cnt*sizeof(struct posix_ace_state); 446 + cnt*sizeof(struct posix_user_ace_state);
447 state->users = kzalloc(alloc, GFP_KERNEL); 447 state->users = kzalloc(alloc, GFP_KERNEL);
448 if (!state->users) 448 if (!state->users)
449 return -ENOMEM; 449 return -ENOMEM;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 702fa577aa6e..094747a1227c 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -225,7 +225,8 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
225 225
226 RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len); 226 RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len);
227 WRITE32(OP_CB_RECALL); 227 WRITE32(OP_CB_RECALL);
228 WRITEMEM(&cb_rec->cbr_stateid, sizeof(stateid_t)); 228 WRITE32(cb_rec->cbr_stateid.si_generation);
229 WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t));
229 WRITE32(cb_rec->cbr_trunc); 230 WRITE32(cb_rec->cbr_trunc);
230 WRITE32(len); 231 WRITE32(len);
231 WRITEMEM(cb_rec->cbr_fhval, len); 232 WRITEMEM(cb_rec->cbr_fhval, len);
@@ -379,6 +380,7 @@ static int do_probe_callback(void *data)
379 .addrsize = sizeof(addr), 380 .addrsize = sizeof(addr),
380 .timeout = &timeparms, 381 .timeout = &timeparms,
381 .program = &cb_program, 382 .program = &cb_program,
383 .prognumber = cb->cb_prog,
382 .version = nfs_cb_version[1]->number, 384 .version = nfs_cb_version[1]->number,
383 .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */ 385 .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */
384 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), 386 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
@@ -396,9 +398,6 @@ static int do_probe_callback(void *data)
396 addr.sin_port = htons(cb->cb_port); 398 addr.sin_port = htons(cb->cb_port);
397 addr.sin_addr.s_addr = htonl(cb->cb_addr); 399 addr.sin_addr.s_addr = htonl(cb->cb_addr);
398 400
399 /* Initialize rpc_stat */
400 memset(args.program->stats, 0, sizeof(struct rpc_stat));
401
402 /* Create RPC client */ 401 /* Create RPC client */
403 client = rpc_create(&args); 402 client = rpc_create(&args);
404 if (IS_ERR(client)) { 403 if (IS_ERR(client)) {
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 2e51adac65de..669461e291ae 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -201,10 +201,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
201 /* Openowner is now set, so sequence id will get bumped. Now we need 201 /* Openowner is now set, so sequence id will get bumped. Now we need
202 * these checks before we do any creates: */ 202 * these checks before we do any creates: */
203 status = nfserr_grace; 203 status = nfserr_grace;
204 if (nfs4_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) 204 if (locks_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
205 goto out; 205 goto out;
206 status = nfserr_no_grace; 206 status = nfserr_no_grace;
207 if (!nfs4_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) 207 if (!locks_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
208 goto out; 208 goto out;
209 209
210 switch (open->op_claim_type) { 210 switch (open->op_claim_type) {
@@ -575,7 +575,7 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
575{ 575{
576 __be32 status; 576 __be32 status;
577 577
578 if (nfs4_in_grace()) 578 if (locks_in_grace())
579 return nfserr_grace; 579 return nfserr_grace;
580 status = nfsd_unlink(rqstp, &cstate->current_fh, 0, 580 status = nfsd_unlink(rqstp, &cstate->current_fh, 0,
581 remove->rm_name, remove->rm_namelen); 581 remove->rm_name, remove->rm_namelen);
@@ -596,7 +596,7 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
596 596
597 if (!cstate->save_fh.fh_dentry) 597 if (!cstate->save_fh.fh_dentry)
598 return status; 598 return status;
599 if (nfs4_in_grace() && !(cstate->save_fh.fh_export->ex_flags 599 if (locks_in_grace() && !(cstate->save_fh.fh_export->ex_flags
600 & NFSEXP_NOSUBTREECHECK)) 600 & NFSEXP_NOSUBTREECHECK))
601 return nfserr_grace; 601 return nfserr_grace;
602 status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname, 602 status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname,
@@ -867,11 +867,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
867 int slack_bytes; 867 int slack_bytes;
868 __be32 status; 868 __be32 status;
869 869
870 status = nfserr_resource;
871 cstate = cstate_alloc();
872 if (cstate == NULL)
873 goto out;
874
875 resp->xbuf = &rqstp->rq_res; 870 resp->xbuf = &rqstp->rq_res;
876 resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len; 871 resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len;
877 resp->tagp = resp->p; 872 resp->tagp = resp->p;
@@ -890,6 +885,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
890 if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION) 885 if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION)
891 goto out; 886 goto out;
892 887
888 status = nfserr_resource;
889 cstate = cstate_alloc();
890 if (cstate == NULL)
891 goto out;
892
893 status = nfs_ok; 893 status = nfs_ok;
894 while (!status && resp->opcnt < args->opcnt) { 894 while (!status && resp->opcnt < args->opcnt) {
895 op = &args->ops[resp->opcnt++]; 895 op = &args->ops[resp->opcnt++];
@@ -957,9 +957,9 @@ encode_op:
957 nfsd4_increment_op_stats(op->opnum); 957 nfsd4_increment_op_stats(op->opnum);
958 } 958 }
959 959
960 cstate_free(cstate);
960out: 961out:
961 nfsd4_release_compoundargs(args); 962 nfsd4_release_compoundargs(args);
962 cstate_free(cstate);
963 dprintk("nfsv4 compound returned %d\n", ntohl(status)); 963 dprintk("nfsv4 compound returned %d\n", ntohl(status));
964 return status; 964 return status;
965} 965}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 1578d7a2667e..0cc7ff5d5ab5 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -61,7 +61,6 @@
61static time_t lease_time = 90; /* default lease time */ 61static time_t lease_time = 90; /* default lease time */
62static time_t user_lease_time = 90; 62static time_t user_lease_time = 90;
63static time_t boot_time; 63static time_t boot_time;
64static int in_grace = 1;
65static u32 current_ownerid = 1; 64static u32 current_ownerid = 1;
66static u32 current_fileid = 1; 65static u32 current_fileid = 1;
67static u32 current_delegid = 1; 66static u32 current_delegid = 1;
@@ -1640,7 +1639,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
1640 case NFS4_OPEN_CLAIM_NULL: 1639 case NFS4_OPEN_CLAIM_NULL:
1641 /* Let's not give out any delegations till everyone's 1640 /* Let's not give out any delegations till everyone's
1642 * had the chance to reclaim theirs.... */ 1641 * had the chance to reclaim theirs.... */
1643 if (nfs4_in_grace()) 1642 if (locks_in_grace())
1644 goto out; 1643 goto out;
1645 if (!atomic_read(&cb->cb_set) || !sop->so_confirmed) 1644 if (!atomic_read(&cb->cb_set) || !sop->so_confirmed)
1646 goto out; 1645 goto out;
@@ -1816,12 +1815,15 @@ out:
1816 return status; 1815 return status;
1817} 1816}
1818 1817
1818struct lock_manager nfsd4_manager = {
1819};
1820
1819static void 1821static void
1820end_grace(void) 1822nfsd4_end_grace(void)
1821{ 1823{
1822 dprintk("NFSD: end of grace period\n"); 1824 dprintk("NFSD: end of grace period\n");
1823 nfsd4_recdir_purge_old(); 1825 nfsd4_recdir_purge_old();
1824 in_grace = 0; 1826 locks_end_grace(&nfsd4_manager);
1825} 1827}
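
struct lock_manager and locks_start_grace()/locks_end_grace() come from the generic file-locking code: each lock service registers its own manager, and locks_in_grace() reports true while any registered manager's grace period is still open. Minimal usage sketch (illustrative names):

        static struct lock_manager example_manager;

        static void example_begin_grace(void)
        {
                locks_start_grace(&example_manager); /* locks_in_grace() -> true */
        }

        static void example_finish_grace(void)
        {
                locks_end_grace(&example_manager);
        }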
1826 1828
1827static time_t 1829static time_t
@@ -1838,8 +1840,8 @@ nfs4_laundromat(void)
1838 nfs4_lock_state(); 1840 nfs4_lock_state();
1839 1841
1840 dprintk("NFSD: laundromat service - starting\n"); 1842 dprintk("NFSD: laundromat service - starting\n");
1841 if (in_grace) 1843 if (locks_in_grace())
1842 end_grace(); 1844 nfsd4_end_grace();
1843 list_for_each_safe(pos, next, &client_lru) { 1845 list_for_each_safe(pos, next, &client_lru) {
1844 clp = list_entry(pos, struct nfs4_client, cl_lru); 1846 clp = list_entry(pos, struct nfs4_client, cl_lru);
1845 if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) { 1847 if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) {
@@ -1974,7 +1976,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
1974 return nfserr_bad_stateid; 1976 return nfserr_bad_stateid;
1975 else if (ONE_STATEID(stateid) && (flags & RD_STATE)) 1977 else if (ONE_STATEID(stateid) && (flags & RD_STATE))
1976 return nfs_ok; 1978 return nfs_ok;
1977 else if (nfs4_in_grace()) { 1979 else if (locks_in_grace()) {
1978 /* Answer in remaining cases depends on existence of 1980
1979 * conflicting state; so we must wait out the grace period. */ 1981 * conflicting state; so we must wait out the grace period. */
1980 return nfserr_grace; 1982 return nfserr_grace;
@@ -1993,7 +1995,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
1993static inline int 1995static inline int
1994io_during_grace_disallowed(struct inode *inode, int flags) 1996io_during_grace_disallowed(struct inode *inode, int flags)
1995{ 1997{
1996 return nfs4_in_grace() && (flags & (RD_STATE | WR_STATE)) 1998 return locks_in_grace() && (flags & (RD_STATE | WR_STATE))
1997 && mandatory_lock(inode); 1999 && mandatory_lock(inode);
1998} 2000}
1999 2001
@@ -2693,10 +2695,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2693 filp = lock_stp->st_vfs_file; 2695 filp = lock_stp->st_vfs_file;
2694 2696
2695 status = nfserr_grace; 2697 status = nfserr_grace;
2696 if (nfs4_in_grace() && !lock->lk_reclaim) 2698 if (locks_in_grace() && !lock->lk_reclaim)
2697 goto out; 2699 goto out;
2698 status = nfserr_no_grace; 2700 status = nfserr_no_grace;
2699 if (!nfs4_in_grace() && lock->lk_reclaim) 2701 if (!locks_in_grace() && lock->lk_reclaim)
2700 goto out; 2702 goto out;
2701 2703
2702 locks_init_lock(&file_lock); 2704 locks_init_lock(&file_lock);
@@ -2779,7 +2781,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2779 int error; 2781 int error;
2780 __be32 status; 2782 __be32 status;
2781 2783
2782 if (nfs4_in_grace()) 2784 if (locks_in_grace())
2783 return nfserr_grace; 2785 return nfserr_grace;
2784 2786
2785 if (check_lock_length(lockt->lt_offset, lockt->lt_length)) 2787 if (check_lock_length(lockt->lt_offset, lockt->lt_length))
@@ -3192,9 +3194,9 @@ __nfs4_state_start(void)
3192 unsigned long grace_time; 3194 unsigned long grace_time;
3193 3195
3194 boot_time = get_seconds(); 3196 boot_time = get_seconds();
3195 grace_time = get_nfs_grace_period(); 3197 grace_time = get_nfs4_grace_period();
3196 lease_time = user_lease_time; 3198 lease_time = user_lease_time;
3197 in_grace = 1; 3199 locks_start_grace(&nfsd4_manager);
3198 printk(KERN_INFO "NFSD: starting %ld-second grace period\n", 3200 printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
3199 grace_time/HZ); 3201 grace_time/HZ);
3200 laundry_wq = create_singlethread_workqueue("nfsd4"); 3202 laundry_wq = create_singlethread_workqueue("nfsd4");
@@ -3213,12 +3215,6 @@ nfs4_state_start(void)
3213 return; 3215 return;
3214} 3216}
3215 3217
3216int
3217nfs4_in_grace(void)
3218{
3219 return in_grace;
3220}
3221
3222time_t 3218time_t
3223nfs4_lease_time(void) 3219nfs4_lease_time(void)
3224{ 3220{
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 14ba4d9b2859..afcdf4b76843 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -413,6 +413,18 @@ out_nfserr:
413} 413}
414 414
415static __be32 415static __be32
416nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid)
417{
418 DECODE_HEAD;
419
420 READ_BUF(sizeof(stateid_t));
421 READ32(sid->si_generation);
422 COPYMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
423
424 DECODE_TAIL;
425}
426
427static __be32
416nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access) 428nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access)
417{ 429{
418 DECODE_HEAD; 430 DECODE_HEAD;
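
nfsd4_decode_stateid() fixes the generation-then-opaque split in one place; the per-op decoders converted below call it and then READ_BUF only the bytes that remain. A hypothetical new operation would follow the same shape (struct nfsd4_frob and fr_flags are made up for illustration; DECODE_HEAD/READ_BUF/READ32/DECODE_TAIL are this file's local XDR macros):

        static __be32
        nfsd4_decode_frob(struct nfsd4_compoundargs *argp,
                          struct nfsd4_frob *frob)
        {
                DECODE_HEAD;

                status = nfsd4_decode_stateid(argp, &frob->fr_stateid);
                if (status)
                        return status;
                READ_BUF(4);                    /* only the non-stateid bytes */
                READ32(frob->fr_flags);

                DECODE_TAIL;
        }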
@@ -429,10 +441,9 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
429 DECODE_HEAD; 441 DECODE_HEAD;
430 442
431 close->cl_stateowner = NULL; 443 close->cl_stateowner = NULL;
432 READ_BUF(4 + sizeof(stateid_t)); 444 READ_BUF(4);
433 READ32(close->cl_seqid); 445 READ32(close->cl_seqid);
434 READ32(close->cl_stateid.si_generation); 446 return nfsd4_decode_stateid(argp, &close->cl_stateid);
435 COPYMEM(&close->cl_stateid.si_opaque, sizeof(stateid_opaque_t));
436 447
437 DECODE_TAIL; 448 DECODE_TAIL;
438} 449}
@@ -493,13 +504,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
493static inline __be32 504static inline __be32
494nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr) 505nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr)
495{ 506{
496 DECODE_HEAD; 507 return nfsd4_decode_stateid(argp, &dr->dr_stateid);
497
498 READ_BUF(sizeof(stateid_t));
499 READ32(dr->dr_stateid.si_generation);
500 COPYMEM(&dr->dr_stateid.si_opaque, sizeof(stateid_opaque_t));
501
502 DECODE_TAIL;
503} 508}
504 509
505static inline __be32 510static inline __be32
@@ -542,20 +547,22 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
542 READ32(lock->lk_is_new); 547 READ32(lock->lk_is_new);
543 548
544 if (lock->lk_is_new) { 549 if (lock->lk_is_new) {
545 READ_BUF(36); 550 READ_BUF(4);
546 READ32(lock->lk_new_open_seqid); 551 READ32(lock->lk_new_open_seqid);
547 READ32(lock->lk_new_open_stateid.si_generation); 552 status = nfsd4_decode_stateid(argp, &lock->lk_new_open_stateid);
548 553 if (status)
549 COPYMEM(&lock->lk_new_open_stateid.si_opaque, sizeof(stateid_opaque_t)); 554 return status;
555 READ_BUF(8 + sizeof(clientid_t));
550 READ32(lock->lk_new_lock_seqid); 556 READ32(lock->lk_new_lock_seqid);
551 COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t)); 557 COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t));
552 READ32(lock->lk_new_owner.len); 558 READ32(lock->lk_new_owner.len);
553 READ_BUF(lock->lk_new_owner.len); 559 READ_BUF(lock->lk_new_owner.len);
554 READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len); 560 READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len);
555 } else { 561 } else {
556 READ_BUF(20); 562 status = nfsd4_decode_stateid(argp, &lock->lk_old_lock_stateid);
557 READ32(lock->lk_old_lock_stateid.si_generation); 563 if (status)
558 COPYMEM(&lock->lk_old_lock_stateid.si_opaque, sizeof(stateid_opaque_t)); 564 return status;
565 READ_BUF(4);
559 READ32(lock->lk_old_lock_seqid); 566 READ32(lock->lk_old_lock_seqid);
560 } 567 }
561 568
@@ -587,13 +594,15 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
587 DECODE_HEAD; 594 DECODE_HEAD;
588 595
589 locku->lu_stateowner = NULL; 596 locku->lu_stateowner = NULL;
590 READ_BUF(24 + sizeof(stateid_t)); 597 READ_BUF(8);
591 READ32(locku->lu_type); 598 READ32(locku->lu_type);
592 if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT)) 599 if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT))
593 goto xdr_error; 600 goto xdr_error;
594 READ32(locku->lu_seqid); 601 READ32(locku->lu_seqid);
595 READ32(locku->lu_stateid.si_generation); 602 status = nfsd4_decode_stateid(argp, &locku->lu_stateid);
596 COPYMEM(&locku->lu_stateid.si_opaque, sizeof(stateid_opaque_t)); 603 if (status)
604 return status;
605 READ_BUF(16);
597 READ64(locku->lu_offset); 606 READ64(locku->lu_offset);
598 READ64(locku->lu_length); 607 READ64(locku->lu_length);
599 608
@@ -678,8 +687,10 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
678 READ32(open->op_delegate_type); 687 READ32(open->op_delegate_type);
679 break; 688 break;
680 case NFS4_OPEN_CLAIM_DELEGATE_CUR: 689 case NFS4_OPEN_CLAIM_DELEGATE_CUR:
681 READ_BUF(sizeof(stateid_t) + 4); 690 status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid);
682 COPYMEM(&open->op_delegate_stateid, sizeof(stateid_t)); 691 if (status)
692 return status;
693 READ_BUF(4);
683 READ32(open->op_fname.len); 694 READ32(open->op_fname.len);
684 READ_BUF(open->op_fname.len); 695 READ_BUF(open->op_fname.len);
685 SAVEMEM(open->op_fname.data, open->op_fname.len); 696 SAVEMEM(open->op_fname.data, open->op_fname.len);
@@ -699,9 +710,10 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con
699 DECODE_HEAD; 710 DECODE_HEAD;
700 711
701 open_conf->oc_stateowner = NULL; 712 open_conf->oc_stateowner = NULL;
702 READ_BUF(4 + sizeof(stateid_t)); 713 status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid);
703 READ32(open_conf->oc_req_stateid.si_generation); 714 if (status)
704 COPYMEM(&open_conf->oc_req_stateid.si_opaque, sizeof(stateid_opaque_t)); 715 return status;
716 READ_BUF(4);
705 READ32(open_conf->oc_seqid); 717 READ32(open_conf->oc_seqid);
706 718
707 DECODE_TAIL; 719 DECODE_TAIL;
@@ -713,9 +725,10 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d
713 DECODE_HEAD; 725 DECODE_HEAD;
714 726
715 open_down->od_stateowner = NULL; 727 open_down->od_stateowner = NULL;
716 READ_BUF(12 + sizeof(stateid_t)); 728 status = nfsd4_decode_stateid(argp, &open_down->od_stateid);
717 READ32(open_down->od_stateid.si_generation); 729 if (status)
718 COPYMEM(&open_down->od_stateid.si_opaque, sizeof(stateid_opaque_t)); 730 return status;
731 READ_BUF(12);
719 READ32(open_down->od_seqid); 732 READ32(open_down->od_seqid);
720 READ32(open_down->od_share_access); 733 READ32(open_down->od_share_access);
721 READ32(open_down->od_share_deny); 734 READ32(open_down->od_share_deny);
@@ -743,9 +756,10 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read)
743{ 756{
744 DECODE_HEAD; 757 DECODE_HEAD;
745 758
746 READ_BUF(sizeof(stateid_t) + 12); 759 status = nfsd4_decode_stateid(argp, &read->rd_stateid);
747 READ32(read->rd_stateid.si_generation); 760 if (status)
748 COPYMEM(&read->rd_stateid.si_opaque, sizeof(stateid_opaque_t)); 761 return status;
762 READ_BUF(12);
749 READ64(read->rd_offset); 763 READ64(read->rd_offset);
750 READ32(read->rd_length); 764 READ32(read->rd_length);
751 765
@@ -834,15 +848,13 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
834static __be32 848static __be32
835nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) 849nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr)
836{ 850{
837 DECODE_HEAD; 851 __be32 status;
838
839 READ_BUF(sizeof(stateid_t));
840 READ32(setattr->sa_stateid.si_generation);
841 COPYMEM(&setattr->sa_stateid.si_opaque, sizeof(stateid_opaque_t));
842 if ((status = nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr, &setattr->sa_acl)))
843 goto out;
844 852
845 DECODE_TAIL; 853 status = nfsd4_decode_stateid(argp, &setattr->sa_stateid);
854 if (status)
855 return status;
856 return nfsd4_decode_fattr(argp, setattr->sa_bmval,
857 &setattr->sa_iattr, &setattr->sa_acl);
846} 858}
847 859
848static __be32 860static __be32
@@ -927,9 +939,10 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
927 int len; 939 int len;
928 DECODE_HEAD; 940 DECODE_HEAD;
929 941
930 READ_BUF(sizeof(stateid_opaque_t) + 20); 942 status = nfsd4_decode_stateid(argp, &write->wr_stateid);
931 READ32(write->wr_stateid.si_generation); 943 if (status)
932 COPYMEM(&write->wr_stateid.si_opaque, sizeof(stateid_opaque_t)); 944 return status;
945 READ_BUF(16);
933 READ64(write->wr_offset); 946 READ64(write->wr_offset);
934 READ32(write->wr_stable_how); 947 READ32(write->wr_stable_how);
935 if (write->wr_stable_how > 2) 948 if (write->wr_stable_how > 2)
@@ -1183,7 +1196,6 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1183 * Header routine to setup seqid operation replay cache 1196 * Header routine to setup seqid operation replay cache
1184 */ 1197 */
1185#define ENCODE_SEQID_OP_HEAD \ 1198#define ENCODE_SEQID_OP_HEAD \
1186 __be32 *p; \
1187 __be32 *save; \ 1199 __be32 *save; \
1188 \ 1200 \
1189 save = resp->p; 1201 save = resp->p;
@@ -1950,6 +1962,17 @@ fail:
1950 return -EINVAL; 1962 return -EINVAL;
1951} 1963}
1952 1964
1965static void
1966nfsd4_encode_stateid(struct nfsd4_compoundres *resp, stateid_t *sid)
1967{
1968 ENCODE_HEAD;
1969
1970 RESERVE_SPACE(sizeof(stateid_t));
1971 WRITE32(sid->si_generation);
1972 WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
1973 ADJUST_ARGS();
1974}
1975
1953static __be32 1976static __be32
1954nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access) 1977nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access)
1955{ 1978{
@@ -1969,12 +1992,9 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c
1969{ 1992{
1970 ENCODE_SEQID_OP_HEAD; 1993 ENCODE_SEQID_OP_HEAD;
1971 1994
1972 if (!nfserr) { 1995 if (!nfserr)
1973 RESERVE_SPACE(sizeof(stateid_t)); 1996 nfsd4_encode_stateid(resp, &close->cl_stateid);
1974 WRITE32(close->cl_stateid.si_generation); 1997
1975 WRITEMEM(&close->cl_stateid.si_opaque, sizeof(stateid_opaque_t));
1976 ADJUST_ARGS();
1977 }
1978 ENCODE_SEQID_OP_TAIL(close->cl_stateowner); 1998 ENCODE_SEQID_OP_TAIL(close->cl_stateowner);
1979 return nfserr; 1999 return nfserr;
1980} 2000}
@@ -2074,12 +2094,9 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo
2074{ 2094{
2075 ENCODE_SEQID_OP_HEAD; 2095 ENCODE_SEQID_OP_HEAD;
2076 2096
2077 if (!nfserr) { 2097 if (!nfserr)
2078 RESERVE_SPACE(4 + sizeof(stateid_t)); 2098 nfsd4_encode_stateid(resp, &lock->lk_resp_stateid);
2079 WRITE32(lock->lk_resp_stateid.si_generation); 2099 else if (nfserr == nfserr_denied)
2080 WRITEMEM(&lock->lk_resp_stateid.si_opaque, sizeof(stateid_opaque_t));
2081 ADJUST_ARGS();
2082 } else if (nfserr == nfserr_denied)
2083 nfsd4_encode_lock_denied(resp, &lock->lk_denied); 2100 nfsd4_encode_lock_denied(resp, &lock->lk_denied);
2084 2101
2085 ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner); 2102 ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner);
@@ -2099,13 +2116,9 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l
2099{ 2116{
2100 ENCODE_SEQID_OP_HEAD; 2117 ENCODE_SEQID_OP_HEAD;
2101 2118
2102 if (!nfserr) { 2119 if (!nfserr)
2103 RESERVE_SPACE(sizeof(stateid_t)); 2120 nfsd4_encode_stateid(resp, &locku->lu_stateid);
2104 WRITE32(locku->lu_stateid.si_generation); 2121
2105 WRITEMEM(&locku->lu_stateid.si_opaque, sizeof(stateid_opaque_t));
2106 ADJUST_ARGS();
2107 }
2108
2109 ENCODE_SEQID_OP_TAIL(locku->lu_stateowner); 2122 ENCODE_SEQID_OP_TAIL(locku->lu_stateowner);
2110 return nfserr; 2123 return nfserr;
2111} 2124}
@@ -2128,14 +2141,14 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li
2128static __be32 2141static __be32
2129nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open) 2142nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open)
2130{ 2143{
2144 ENCODE_HEAD;
2131 ENCODE_SEQID_OP_HEAD; 2145 ENCODE_SEQID_OP_HEAD;
2132 2146
2133 if (nfserr) 2147 if (nfserr)
2134 goto out; 2148 goto out;
2135 2149
2136 RESERVE_SPACE(36 + sizeof(stateid_t)); 2150 nfsd4_encode_stateid(resp, &open->op_stateid);
2137 WRITE32(open->op_stateid.si_generation); 2151 RESERVE_SPACE(40);
2138 WRITEMEM(&open->op_stateid.si_opaque, sizeof(stateid_opaque_t));
2139 WRITECINFO(open->op_cinfo); 2152 WRITECINFO(open->op_cinfo);
2140 WRITE32(open->op_rflags); 2153 WRITE32(open->op_rflags);
2141 WRITE32(2); 2154 WRITE32(2);
@@ -2148,8 +2161,8 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
2148 case NFS4_OPEN_DELEGATE_NONE: 2161 case NFS4_OPEN_DELEGATE_NONE:
2149 break; 2162 break;
2150 case NFS4_OPEN_DELEGATE_READ: 2163 case NFS4_OPEN_DELEGATE_READ:
2151 RESERVE_SPACE(20 + sizeof(stateid_t)); 2164 nfsd4_encode_stateid(resp, &open->op_delegate_stateid);
2152 WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t)); 2165 RESERVE_SPACE(20);
2153 WRITE32(open->op_recall); 2166 WRITE32(open->op_recall);
2154 2167
2155 /* 2168 /*
@@ -2162,8 +2175,8 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
2162 ADJUST_ARGS(); 2175 ADJUST_ARGS();
2163 break; 2176 break;
2164 case NFS4_OPEN_DELEGATE_WRITE: 2177 case NFS4_OPEN_DELEGATE_WRITE:
2165 RESERVE_SPACE(32 + sizeof(stateid_t)); 2178 nfsd4_encode_stateid(resp, &open->op_delegate_stateid);
2166 WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t)); 2179 RESERVE_SPACE(32);
2167 WRITE32(0); 2180 WRITE32(0);
2168 2181
2169 /* 2182 /*
@@ -2195,13 +2208,9 @@ static __be32
2195nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc) 2208nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc)
2196{ 2209{
2197 ENCODE_SEQID_OP_HEAD; 2210 ENCODE_SEQID_OP_HEAD;
2198 2211
2199 if (!nfserr) { 2212 if (!nfserr)
2200 RESERVE_SPACE(sizeof(stateid_t)); 2213 nfsd4_encode_stateid(resp, &oc->oc_resp_stateid);
2201 WRITE32(oc->oc_resp_stateid.si_generation);
2202 WRITEMEM(&oc->oc_resp_stateid.si_opaque, sizeof(stateid_opaque_t));
2203 ADJUST_ARGS();
2204 }
2205 2214
2206 ENCODE_SEQID_OP_TAIL(oc->oc_stateowner); 2215 ENCODE_SEQID_OP_TAIL(oc->oc_stateowner);
2207 return nfserr; 2216 return nfserr;
@@ -2211,13 +2220,9 @@ static __be32
2211nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od) 2220nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od)
2212{ 2221{
2213 ENCODE_SEQID_OP_HEAD; 2222 ENCODE_SEQID_OP_HEAD;
2214 2223
2215 if (!nfserr) { 2224 if (!nfserr)
2216 RESERVE_SPACE(sizeof(stateid_t)); 2225 nfsd4_encode_stateid(resp, &od->od_stateid);
2217 WRITE32(od->od_stateid.si_generation);
2218 WRITEMEM(&od->od_stateid.si_opaque, sizeof(stateid_opaque_t));
2219 ADJUST_ARGS();
2220 }
2221 2226
2222 ENCODE_SEQID_OP_TAIL(od->od_stateowner); 2227 ENCODE_SEQID_OP_TAIL(od->od_stateowner);
2223 return nfserr; 2228 return nfserr;
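The new nfsd4_decode_stateid() helper itself falls outside these hunks; judging from its call sites and the open-coded sequence it replaces (see the removed lines in nfsd4_decode_setattr above), it presumably wraps the same three steps:

    static __be32
    nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid)
    {
    	DECODE_HEAD;

    	READ_BUF(sizeof(stateid_t));
    	READ32(sid->si_generation);
    	COPYMEM(&sid->si_opaque, sizeof(stateid_opaque_t));

    	DECODE_TAIL;
    }

With decode and encode both funneled through one helper apiece, the open-coded READ_BUF/RESERVE_SPACE size arithmetic disappears from a dozen operations.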
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index c53e65f8f3a2..97543df58242 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -614,10 +614,9 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
614 return -EINVAL; 614 return -EINVAL;
615 err = nfsd_create_serv(); 615 err = nfsd_create_serv();
616 if (!err) { 616 if (!err) {
617 int proto = 0; 617 err = svc_addsock(nfsd_serv, fd, buf);
618 err = svc_addsock(nfsd_serv, fd, buf, &proto);
619 if (err >= 0) { 618 if (err >= 0) {
620 err = lockd_up(proto); 619 err = lockd_up();
621 if (err < 0) 620 if (err < 0)
622 svc_sock_names(buf+strlen(buf)+1, nfsd_serv, buf); 621 svc_sock_names(buf+strlen(buf)+1, nfsd_serv, buf);
623 } 622 }
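The interface change in this hunk, with the prototypes sketched from the call sites (assumed, not quoted from headers): svc_addsock() no longer reports the transport protocol back through an out-parameter, because lockd_up() no longer needs one; lockd now brings up its listeners for both protocols rather than per-protocol:

    int svc_addsock(struct svc_serv *serv, int fd, char *name_return);  /* was: ..., int *proto */
    int lockd_up(void);                                                 /* was: int lockd_up(int proto) */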
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index ea37c96f0445..cd25d91895a1 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -302,17 +302,27 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
302 if (error) 302 if (error)
303 goto out; 303 goto out;
304 304
305 if (!(access & NFSD_MAY_LOCK)) { 305 /*
306 /* 306 * pseudoflavor restrictions are not enforced on NLM,
307 * pseudoflavor restrictions are not enforced on NLM, 307 * which clients virtually always use auth_sys for,
308 * which clients virtually always use auth_sys for, 308 * even while using RPCSEC_GSS for NFS.
309 * even while using RPCSEC_GSS for NFS. 309 */
310 */ 310 if (access & NFSD_MAY_LOCK)
311 error = check_nfsd_access(exp, rqstp); 311 goto skip_pseudoflavor_check;
312 if (error) 312 /*
313 goto out; 313 * Clients may expect to be able to use auth_sys during mount,
314 } 314 * even if they use gss for everything else; see section 2.3.2
315 * of rfc 2623.
316 */
317 if (access & NFSD_MAY_BYPASS_GSS_ON_ROOT
318 && exp->ex_path.dentry == dentry)
319 goto skip_pseudoflavor_check;
320
321 error = check_nfsd_access(exp, rqstp);
322 if (error)
323 goto out;
315 324
325skip_pseudoflavor_check:
316 /* Finally, check access permissions. */ 326 /* Finally, check access permissions. */
317 error = nfsd_permission(rqstp, exp, dentry, access); 327 error = nfsd_permission(rqstp, exp, dentry, access);
318 328
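Flattened, the new fh_verify() ordering reads roughly like this (a condensed sketch of the goto ladder above, not the literal source):

    /* 1. NLM requests skip the pseudoflavor check entirely.
     * 2. Requests on the export root may skip it when the caller passed
     *    NFSD_MAY_BYPASS_GSS_ON_ROOT (mounts may use auth_sys even on
     *    gss-protected exports; rfc 2623, section 2.3.2).
     * 3. Everything else goes through check_nfsd_access(). */
    if (!(access & NFSD_MAY_LOCK) &&
        !((access & NFSD_MAY_BYPASS_GSS_ON_ROOT) &&
          exp->ex_path.dentry == dentry)) {
    	error = check_nfsd_access(exp, rqstp);
    	if (error)
    		goto out;
    }

The nfsproc.c hunks that follow are the callers that actually set the new bypass flag: GETATTR and STATFS, the two operations a client issues while mounting.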
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 0766f95d236a..5cffeca7acef 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -65,7 +65,8 @@ nfsd_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp,
65 dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh)); 65 dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh));
66 66
67 fh_copy(&resp->fh, &argp->fh); 67 fh_copy(&resp->fh, &argp->fh);
68 nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); 68 nfserr = fh_verify(rqstp, &resp->fh, 0,
69 NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
69 return nfsd_return_attrs(nfserr, resp); 70 return nfsd_return_attrs(nfserr, resp);
70} 71}
71 72
@@ -521,7 +522,8 @@ nfsd_proc_statfs(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
521 522
522 dprintk("nfsd: STATFS %s\n", SVCFH_fmt(&argp->fh)); 523 dprintk("nfsd: STATFS %s\n", SVCFH_fmt(&argp->fh));
523 524
524 nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats); 525 nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats,
526 NFSD_MAY_BYPASS_GSS_ON_ROOT);
525 fh_put(&argp->fh); 527 fh_put(&argp->fh);
526 return nfserr; 528 return nfserr;
527} 529}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 80292ff5e924..59eeb46f82c5 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -229,6 +229,7 @@ int nfsd_create_serv(void)
229 229
230 atomic_set(&nfsd_busy, 0); 230 atomic_set(&nfsd_busy, 0);
231 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, 231 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
232 AF_INET,
232 nfsd_last_thread, nfsd, THIS_MODULE); 233 nfsd_last_thread, nfsd, THIS_MODULE);
233 if (nfsd_serv == NULL) 234 if (nfsd_serv == NULL)
234 err = -ENOMEM; 235 err = -ENOMEM;
@@ -243,25 +244,20 @@ static int nfsd_init_socks(int port)
243 if (!list_empty(&nfsd_serv->sv_permsocks)) 244 if (!list_empty(&nfsd_serv->sv_permsocks))
244 return 0; 245 return 0;
245 246
246 error = lockd_up(IPPROTO_UDP); 247 error = svc_create_xprt(nfsd_serv, "udp", port,
247 if (error >= 0) {
248 error = svc_create_xprt(nfsd_serv, "udp", port,
249 SVC_SOCK_DEFAULTS); 248 SVC_SOCK_DEFAULTS);
250 if (error < 0)
251 lockd_down();
252 }
253 if (error < 0) 249 if (error < 0)
254 return error; 250 return error;
255 251
256 error = lockd_up(IPPROTO_TCP); 252 error = svc_create_xprt(nfsd_serv, "tcp", port,
257 if (error >= 0) {
258 error = svc_create_xprt(nfsd_serv, "tcp", port,
259 SVC_SOCK_DEFAULTS); 253 SVC_SOCK_DEFAULTS);
260 if (error < 0)
261 lockd_down();
262 }
263 if (error < 0) 254 if (error < 0)
264 return error; 255 return error;
256
257 error = lockd_up();
258 if (error < 0)
259 return error;
260
265 return 0; 261 return 0;
266} 262}
267 263
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 18060bed5267..aa1d0d6489a1 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -83,7 +83,6 @@ struct raparm_hbucket {
83 spinlock_t pb_lock; 83 spinlock_t pb_lock;
84} ____cacheline_aligned_in_smp; 84} ____cacheline_aligned_in_smp;
85 85
86static struct raparms * raparml;
87#define RAPARM_HASH_BITS 4 86#define RAPARM_HASH_BITS 4
88#define RAPARM_HASH_SIZE (1<<RAPARM_HASH_BITS) 87#define RAPARM_HASH_SIZE (1<<RAPARM_HASH_BITS)
89#define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1) 88#define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1)
@@ -1866,9 +1865,9 @@ out:
1866 * N.B. After this call fhp needs an fh_put 1865 * N.B. After this call fhp needs an fh_put
1867 */ 1866 */
1868__be32 1867__be32
1869nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat) 1868nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access)
1870{ 1869{
1871 __be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP); 1870 __be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
1872 if (!err && vfs_statfs(fhp->fh_dentry,stat)) 1871 if (!err && vfs_statfs(fhp->fh_dentry,stat))
1873 err = nfserr_io; 1872 err = nfserr_io;
1874 return err; 1873 return err;
@@ -1966,11 +1965,20 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
1966void 1965void
1967nfsd_racache_shutdown(void) 1966nfsd_racache_shutdown(void)
1968{ 1967{
1969 if (!raparml) 1968 struct raparms *raparm, *last_raparm;
1970 return; 1969 unsigned int i;
1970
1971 dprintk("nfsd: freeing readahead buffers.\n"); 1971 dprintk("nfsd: freeing readahead buffers.\n");
1972 kfree(raparml); 1972
1973 raparml = NULL; 1973 for (i = 0; i < RAPARM_HASH_SIZE; i++) {
1974 raparm = raparm_hash[i].pb_head;
1975 while(raparm) {
1976 last_raparm = raparm;
1977 raparm = raparm->p_next;
1978 kfree(last_raparm);
1979 }
1980 raparm_hash[i].pb_head = NULL;
1981 }
1974} 1982}
1975/* 1983/*
1976 * Initialize readahead param cache 1984 * Initialize readahead param cache
@@ -1981,35 +1989,38 @@ nfsd_racache_init(int cache_size)
1981 int i; 1989 int i;
1982 int j = 0; 1990 int j = 0;
1983 int nperbucket; 1991 int nperbucket;
1992 struct raparms **raparm = NULL;
1984 1993
1985 1994
1986 if (raparml) 1995 if (raparm_hash[0].pb_head)
1987 return 0; 1996 return 0;
1988 if (cache_size < 2*RAPARM_HASH_SIZE) 1997 nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE);
1989 cache_size = 2*RAPARM_HASH_SIZE; 1998 if (nperbucket < 2)
1990 raparml = kcalloc(cache_size, sizeof(struct raparms), GFP_KERNEL); 1999 nperbucket = 2;
1991 2000 cache_size = nperbucket * RAPARM_HASH_SIZE;
1992 if (!raparml) {
1993 printk(KERN_WARNING
1994 "nfsd: Could not allocate memory read-ahead cache.\n");
1995 return -ENOMEM;
1996 }
1997 2001
1998 dprintk("nfsd: allocating %d readahead buffers.\n", cache_size); 2002 dprintk("nfsd: allocating %d readahead buffers.\n", cache_size);
1999 for (i = 0 ; i < RAPARM_HASH_SIZE ; i++) { 2003
2000 raparm_hash[i].pb_head = NULL; 2004 for (i = 0; i < RAPARM_HASH_SIZE; i++) {
2001 spin_lock_init(&raparm_hash[i].pb_lock); 2005 spin_lock_init(&raparm_hash[i].pb_lock);
2002 } 2006
2003 nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE); 2007 raparm = &raparm_hash[i].pb_head;
2004 for (i = 0; i < cache_size - 1; i++) { 2008 for (j = 0; j < nperbucket; j++) {
2005 if (i % nperbucket == 0) 2009 *raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL);
2006 raparm_hash[j++].pb_head = raparml + i; 2010 if (!*raparm)
2007 if (i % nperbucket < nperbucket-1) 2011 goto out_nomem;
2008 raparml[i].p_next = raparml + i + 1; 2012 raparm = &(*raparm)->p_next;
2013 }
2014 *raparm = NULL;
2009 } 2015 }
2010 2016
2011 nfsdstats.ra_size = cache_size; 2017 nfsdstats.ra_size = cache_size;
2012 return 0; 2018 return 0;
2019
2020out_nomem:
2021 	dprintk("nfsd: kzalloc failed, freeing readahead buffers\n");
2022 nfsd_racache_shutdown();
2023 return -ENOMEM;
2013} 2024}
2014 2025
2015#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) 2026#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
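Two details of the readahead-cache rewrite are worth spelling out. First, the sizing arithmetic: with RAPARM_HASH_SIZE at 16 (RAPARM_HASH_BITS = 4), a requested cache_size of 50 gives nperbucket = DIV_ROUND_UP(50, 16) = 4, so the effective cache_size is rounded up to 64; the old 2*RAPARM_HASH_SIZE floor survives as the nperbucket < 2 clamp. Second, each bucket list is built through a pointer-to-pointer tail cursor, which is what lets out_nomem hand a half-built table straight to nfsd_racache_shutdown(). A userspace sketch of the same idiom, with a hypothetical node type standing in for struct raparms:

    #include <stdlib.h>

    struct node { struct node *next; };

    static struct node *build_list(int n)
    {
    	struct node *head = NULL, **tail = &head;

    	for (int i = 0; i < n; i++) {
    		*tail = calloc(1, sizeof(**tail));  /* kzalloc() in the kernel code */
    		if (!*tail)
    			return head;                /* partial list; caller frees it */
    		tail = &(*tail)->next;              /* advance cursor to the new tail */
    	}
    	return head;
    }

Because the cursor always points at the link to fill next, there is no special case for the empty list and no trailing-node bookkeeping.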
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index e1781c8b1650..9e8a95be7a1e 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -174,7 +174,6 @@ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent,
174 // TODO: Consider moving this lot to a separate function! (AIA) 174 // TODO: Consider moving this lot to a separate function! (AIA)
175handle_name: 175handle_name:
176 { 176 {
177 struct dentry *real_dent, *new_dent;
178 MFT_RECORD *m; 177 MFT_RECORD *m;
179 ntfs_attr_search_ctx *ctx; 178 ntfs_attr_search_ctx *ctx;
180 ntfs_inode *ni = NTFS_I(dent_inode); 179 ntfs_inode *ni = NTFS_I(dent_inode);
@@ -255,93 +254,9 @@ handle_name:
255 } 254 }
256 nls_name.hash = full_name_hash(nls_name.name, nls_name.len); 255 nls_name.hash = full_name_hash(nls_name.name, nls_name.len);
257 256
258 /* 257 dent = d_add_ci(dent, dent_inode, &nls_name);
259 * Note: No need for dent->d_lock lock as i_mutex is held on the
260 * parent inode.
261 */
262
263 /* Does a dentry matching the nls_name exist already? */
264 real_dent = d_lookup(dent->d_parent, &nls_name);
265 /* If not, create it now. */
266 if (!real_dent) {
267 real_dent = d_alloc(dent->d_parent, &nls_name);
268 kfree(nls_name.name);
269 if (!real_dent) {
270 err = -ENOMEM;
271 goto err_out;
272 }
273 new_dent = d_splice_alias(dent_inode, real_dent);
274 if (new_dent)
275 dput(real_dent);
276 else
277 new_dent = real_dent;
278 ntfs_debug("Done. (Created new dentry.)");
279 return new_dent;
280 }
281 kfree(nls_name.name); 258 kfree(nls_name.name);
282 /* Matching dentry exists, check if it is negative. */ 259 return dent;
283 if (real_dent->d_inode) {
284 if (unlikely(real_dent->d_inode != dent_inode)) {
285 /* This can happen because bad inodes are unhashed. */
286 BUG_ON(!is_bad_inode(dent_inode));
287 BUG_ON(!is_bad_inode(real_dent->d_inode));
288 }
289 /*
290 * Already have the inode and the dentry attached, decrement
291 * the reference count to balance the ntfs_iget() we did
292 * earlier on. We found the dentry using d_lookup() so it
293 * cannot be disconnected and thus we do not need to worry
294 * about any NFS/disconnectedness issues here.
295 */
296 iput(dent_inode);
297 ntfs_debug("Done. (Already had inode and dentry.)");
298 return real_dent;
299 }
300 /*
301 * Negative dentry: instantiate it unless the inode is a directory and
302 * has a 'disconnected' dentry (i.e. IS_ROOT and DCACHE_DISCONNECTED),
303 * in which case d_move() that in place of the found dentry.
304 */
305 if (!S_ISDIR(dent_inode->i_mode)) {
306 /* Not a directory; everything is easy. */
307 d_instantiate(real_dent, dent_inode);
308 ntfs_debug("Done. (Already had negative file dentry.)");
309 return real_dent;
310 }
311 spin_lock(&dcache_lock);
312 if (list_empty(&dent_inode->i_dentry)) {
313 /*
314 * Directory without a 'disconnected' dentry; we need to do
315 * d_instantiate() by hand because it takes dcache_lock which
316 * we already hold.
317 */
318 list_add(&real_dent->d_alias, &dent_inode->i_dentry);
319 real_dent->d_inode = dent_inode;
320 spin_unlock(&dcache_lock);
321 security_d_instantiate(real_dent, dent_inode);
322 ntfs_debug("Done. (Already had negative directory dentry.)");
323 return real_dent;
324 }
325 /*
326 * Directory with a 'disconnected' dentry; get a reference to the
327 * 'disconnected' dentry.
328 */
329 new_dent = list_entry(dent_inode->i_dentry.next, struct dentry,
330 d_alias);
331 dget_locked(new_dent);
332 spin_unlock(&dcache_lock);
333 /* Do security voodoo. */
334 security_d_instantiate(real_dent, dent_inode);
335 /* Move new_dent in place of real_dent. */
336 d_move(new_dent, real_dent);
337 /* Balance the ntfs_iget() we did above. */
338 iput(dent_inode);
339 /* Throw away real_dent. */
340 dput(real_dent);
341 /* Use new_dent as the actual dentry. */
342 ntfs_debug("Done. (Already had negative, disconnected directory "
343 "dentry.)");
344 return new_dent;
345 260
346eio_err_out: 261eio_err_out:
347 ntfs_error(vol->sb, "Illegal file name attribute. Run chkdsk."); 262 ntfs_error(vol->sb, "Illegal file name attribute. Run chkdsk.");
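The roughly ninety lines removed above are the open-coded version of what d_add_ci() now does generically: look up an existing dentry for the case-corrected name, allocate one if absent, splice the alias, and handle the disconnected-directory d_move() case. The caller contract, inferred from the call site rather than from the d_add_ci() definition itself:

    nls_name.hash = full_name_hash(nls_name.name, nls_name.len);
    dent = d_add_ci(dent, dent_inode, &nls_name); /* takes over the inode reference;
                                                   * returns the dentry to use, or
                                                   * an ERR_PTR() on failure */
    kfree(nls_name.name);                         /* the name is copied into the
                                                   * dcache, so the caller's buffer
                                                   * can be freed immediately */
    return dent;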
diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h
index 3a8af75351e8..4087fbdac327 100644
--- a/fs/ntfs/usnjrnl.h
+++ b/fs/ntfs/usnjrnl.h
@@ -113,7 +113,7 @@ typedef struct {
113 * Reason flags (32-bit). Cumulative flags describing the change(s) to the 113 * Reason flags (32-bit). Cumulative flags describing the change(s) to the
114 * file since it was last opened. I think the names speak for themselves but 114 * file since it was last opened. I think the names speak for themselves but
115 * if you disagree check out the descriptions in the Linux NTFS project NTFS 115 * if you disagree check out the descriptions in the Linux NTFS project NTFS
116 * documentation: http://linux-ntfs.sourceforge.net/ntfs/files/usnjrnl.html 116 * documentation: http://www.linux-ntfs.org/
117 */ 117 */
118enum { 118enum {
119 USN_REASON_DATA_OVERWRITE = const_cpu_to_le32(0x00000001), 119 USN_REASON_DATA_OVERWRITE = const_cpu_to_le32(0x00000001),
@@ -145,7 +145,7 @@ typedef le32 USN_REASON_FLAGS;
145 * Source info flags (32-bit). Information about the source of the change(s) 145 * Source info flags (32-bit). Information about the source of the change(s)
146 * to the file. For detailed descriptions of what these mean, see the Linux 146 * to the file. For detailed descriptions of what these mean, see the Linux
147 * NTFS project NTFS documentation: 147 * NTFS project NTFS documentation:
148 * http://linux-ntfs.sourceforge.net/ntfs/files/usnjrnl.html 148 * http://www.linux-ntfs.org/
149 */ 149 */
150enum { 150enum {
151 USN_SOURCE_DATA_MANAGEMENT = const_cpu_to_le32(0x00000001), 151 USN_SOURCE_DATA_MANAGEMENT = const_cpu_to_le32(0x00000001),
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index f6956de56fdb..589dcdfdfe3c 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -34,7 +34,8 @@ ocfs2-objs := \
34 symlink.o \ 34 symlink.o \
35 sysfile.o \ 35 sysfile.o \
36 uptodate.o \ 36 uptodate.o \
37 ver.o 37 ver.o \
38 xattr.o
38 39
39ocfs2_stackglue-objs := stackglue.o 40ocfs2_stackglue-objs := stackglue.o
40ocfs2_stack_o2cb-objs := stack_o2cb.o 41ocfs2_stack_o2cb-objs := stack_o2cb.o
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 10bfb466e068..0cc2deb9394c 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -49,6 +49,340 @@
49 49
50#include "buffer_head_io.h" 50#include "buffer_head_io.h"
51 51
52
53/*
54 * Operations for a specific extent tree type.
55 *
56 * To implement an on-disk btree (extent tree) type in ocfs2, add
57 * an ocfs2_extent_tree_operations structure and the matching
58 * ocfs2_init_<thingy>_extent_tree() function. That's pretty much it
59 * for the allocation portion of the extent tree.
60 */
61struct ocfs2_extent_tree_operations {
62 /*
63 * last_eb_blk is the block number of the right most leaf extent
64 * block. Most on-disk structures containing an extent tree store
65 * this value for fast access. The ->eo_set_last_eb_blk() and
66 * ->eo_get_last_eb_blk() operations access this value. They are
67 * both required.
68 */
69 void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
70 u64 blkno);
71 u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);
72
73 /*
74 * The on-disk structure usually keeps track of how many total
75 * clusters are stored in this extent tree. This function updates
76 * that value. new_clusters is the delta, and must be
77 * added to the total. Required.
78 */
79 void (*eo_update_clusters)(struct inode *inode,
80 struct ocfs2_extent_tree *et,
81 u32 new_clusters);
82
83 /*
84 * If ->eo_insert_check() exists, it is called before rec is
85 * inserted into the extent tree. It is optional.
86 */
87 int (*eo_insert_check)(struct inode *inode,
88 struct ocfs2_extent_tree *et,
89 struct ocfs2_extent_rec *rec);
90 int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et);
91
92 /*
93 * --------------------------------------------------------------
94 * The remaining are internal to ocfs2_extent_tree and don't have
95 * accessor functions
96 */
97
98 /*
99 * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el.
100 * It is required.
101 */
102 void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);
103
104 /*
105 * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if
106 * it exists. If it does not, et->et_max_leaf_clusters is set
107 * to 0 (unlimited). Optional.
108 */
109 void (*eo_fill_max_leaf_clusters)(struct inode *inode,
110 struct ocfs2_extent_tree *et);
111};
112
113
114/*
115 * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check
116 * in the methods.
117 */
118static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
119static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
120 u64 blkno);
121static void ocfs2_dinode_update_clusters(struct inode *inode,
122 struct ocfs2_extent_tree *et,
123 u32 clusters);
124static int ocfs2_dinode_insert_check(struct inode *inode,
125 struct ocfs2_extent_tree *et,
126 struct ocfs2_extent_rec *rec);
127static int ocfs2_dinode_sanity_check(struct inode *inode,
128 struct ocfs2_extent_tree *et);
129static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
130static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
131 .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk,
132 .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk,
133 .eo_update_clusters = ocfs2_dinode_update_clusters,
134 .eo_insert_check = ocfs2_dinode_insert_check,
135 .eo_sanity_check = ocfs2_dinode_sanity_check,
136 .eo_fill_root_el = ocfs2_dinode_fill_root_el,
137};
138
139static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
140 u64 blkno)
141{
142 struct ocfs2_dinode *di = et->et_object;
143
144 BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
145 di->i_last_eb_blk = cpu_to_le64(blkno);
146}
147
148static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
149{
150 struct ocfs2_dinode *di = et->et_object;
151
152 BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
153 return le64_to_cpu(di->i_last_eb_blk);
154}
155
156static void ocfs2_dinode_update_clusters(struct inode *inode,
157 struct ocfs2_extent_tree *et,
158 u32 clusters)
159{
160 struct ocfs2_dinode *di = et->et_object;
161
162 le32_add_cpu(&di->i_clusters, clusters);
163 spin_lock(&OCFS2_I(inode)->ip_lock);
164 OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
165 spin_unlock(&OCFS2_I(inode)->ip_lock);
166}
167
168static int ocfs2_dinode_insert_check(struct inode *inode,
169 struct ocfs2_extent_tree *et,
170 struct ocfs2_extent_rec *rec)
171{
172 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
173
174 BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
175 mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
176 (OCFS2_I(inode)->ip_clusters != rec->e_cpos),
177 "Device %s, asking for sparse allocation: inode %llu, "
178 "cpos %u, clusters %u\n",
179 osb->dev_str,
180 (unsigned long long)OCFS2_I(inode)->ip_blkno,
181 rec->e_cpos,
182 OCFS2_I(inode)->ip_clusters);
183
184 return 0;
185}
186
187static int ocfs2_dinode_sanity_check(struct inode *inode,
188 struct ocfs2_extent_tree *et)
189{
190 int ret = 0;
191 struct ocfs2_dinode *di;
192
193 BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
194
195 di = et->et_object;
196 if (!OCFS2_IS_VALID_DINODE(di)) {
197 ret = -EIO;
198 ocfs2_error(inode->i_sb,
199 "Inode %llu has invalid path root",
200 (unsigned long long)OCFS2_I(inode)->ip_blkno);
201 }
202
203 return ret;
204}
205
206static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
207{
208 struct ocfs2_dinode *di = et->et_object;
209
210 et->et_root_el = &di->id2.i_list;
211}
212
213
214static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
215{
216 struct ocfs2_xattr_value_root *xv = et->et_object;
217
218 et->et_root_el = &xv->xr_list;
219}
220
221static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
222 u64 blkno)
223{
224 struct ocfs2_xattr_value_root *xv =
225 (struct ocfs2_xattr_value_root *)et->et_object;
226
227 xv->xr_last_eb_blk = cpu_to_le64(blkno);
228}
229
230static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
231{
232 struct ocfs2_xattr_value_root *xv =
233 (struct ocfs2_xattr_value_root *) et->et_object;
234
235 return le64_to_cpu(xv->xr_last_eb_blk);
236}
237
238static void ocfs2_xattr_value_update_clusters(struct inode *inode,
239 struct ocfs2_extent_tree *et,
240 u32 clusters)
241{
242 struct ocfs2_xattr_value_root *xv =
243 (struct ocfs2_xattr_value_root *)et->et_object;
244
245 le32_add_cpu(&xv->xr_clusters, clusters);
246}
247
248static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
249 .eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk,
250 .eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk,
251 .eo_update_clusters = ocfs2_xattr_value_update_clusters,
252 .eo_fill_root_el = ocfs2_xattr_value_fill_root_el,
253};
254
255static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
256{
257 struct ocfs2_xattr_block *xb = et->et_object;
258
259 et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
260}
261
262static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode,
263 struct ocfs2_extent_tree *et)
264{
265 et->et_max_leaf_clusters =
266 ocfs2_clusters_for_bytes(inode->i_sb,
267 OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
268}
269
270static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
271 u64 blkno)
272{
273 struct ocfs2_xattr_block *xb = et->et_object;
274 struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
275
276 xt->xt_last_eb_blk = cpu_to_le64(blkno);
277}
278
279static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
280{
281 struct ocfs2_xattr_block *xb = et->et_object;
282 struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
283
284 return le64_to_cpu(xt->xt_last_eb_blk);
285}
286
287static void ocfs2_xattr_tree_update_clusters(struct inode *inode,
288 struct ocfs2_extent_tree *et,
289 u32 clusters)
290{
291 struct ocfs2_xattr_block *xb = et->et_object;
292
293 le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
294}
295
296static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
297 .eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk,
298 .eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk,
299 .eo_update_clusters = ocfs2_xattr_tree_update_clusters,
300 .eo_fill_root_el = ocfs2_xattr_tree_fill_root_el,
301 .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
302};
303
304static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
305 struct inode *inode,
306 struct buffer_head *bh,
307 void *obj,
308 struct ocfs2_extent_tree_operations *ops)
309{
310 et->et_ops = ops;
311 et->et_root_bh = bh;
312 if (!obj)
313 obj = (void *)bh->b_data;
314 et->et_object = obj;
315
316 et->et_ops->eo_fill_root_el(et);
317 if (!et->et_ops->eo_fill_max_leaf_clusters)
318 et->et_max_leaf_clusters = 0;
319 else
320 et->et_ops->eo_fill_max_leaf_clusters(inode, et);
321}
322
323void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
324 struct inode *inode,
325 struct buffer_head *bh)
326{
327 __ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops);
328}
329
330void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
331 struct inode *inode,
332 struct buffer_head *bh)
333{
334 __ocfs2_init_extent_tree(et, inode, bh, NULL,
335 &ocfs2_xattr_tree_et_ops);
336}
337
338void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
339 struct inode *inode,
340 struct buffer_head *bh,
341 struct ocfs2_xattr_value_root *xv)
342{
343 __ocfs2_init_extent_tree(et, inode, bh, xv,
344 &ocfs2_xattr_value_et_ops);
345}
346
347static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
348 u64 new_last_eb_blk)
349{
350 et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
351}
352
353static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
354{
355 return et->et_ops->eo_get_last_eb_blk(et);
356}
357
358static inline void ocfs2_et_update_clusters(struct inode *inode,
359 struct ocfs2_extent_tree *et,
360 u32 clusters)
361{
362 et->et_ops->eo_update_clusters(inode, et, clusters);
363}
364
365static inline int ocfs2_et_insert_check(struct inode *inode,
366 struct ocfs2_extent_tree *et,
367 struct ocfs2_extent_rec *rec)
368{
369 int ret = 0;
370
371 if (et->et_ops->eo_insert_check)
372 ret = et->et_ops->eo_insert_check(inode, et, rec);
373 return ret;
374}
375
376static inline int ocfs2_et_sanity_check(struct inode *inode,
377 struct ocfs2_extent_tree *et)
378{
379 int ret = 0;
380
381 if (et->et_ops->eo_sanity_check)
382 ret = et->et_ops->eo_sanity_check(inode, et);
383 return ret;
384}
385
52static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); 386static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
53static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, 387static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
54 struct ocfs2_extent_block *eb); 388 struct ocfs2_extent_block *eb);
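A hypothetical caller, to make the dispatch concrete (none of this appears in the patch; it just exercises the new entry points): initialize an ocfs2_extent_tree over a dinode buffer, then go through the accessors instead of touching di->i_last_eb_blk directly:

    struct ocfs2_extent_tree et;
    u64 last_blk;

    ocfs2_init_dinode_extent_tree(&et, inode, di_bh);  /* et_object = di_bh->b_data,
                                                        * et_ops = ocfs2_dinode_et_ops,
                                                        * root_el = &di->id2.i_list */
    last_blk = ocfs2_et_get_last_eb_blk(&et);          /* -> le64_to_cpu(di->i_last_eb_blk) */
    ocfs2_et_update_clusters(inode, &et, delta);       /* adjusts di->i_clusters and the
                                                        * in-memory ip_clusters */

The same caller code works unchanged against an xattr value root or xattr tree root; only the init function differs. That is the point of the abstraction, and it is what lets the rest of alloc.c below swap struct buffer_head *fe_bh parameters for struct ocfs2_extent_tree *et.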
@@ -205,17 +539,6 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
205} 539}
206 540
207/* 541/*
208 * Allocate and initialize a new path based on a disk inode tree.
209 */
210static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh)
211{
212 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
213 struct ocfs2_extent_list *el = &di->id2.i_list;
214
215 return ocfs2_new_path(di_bh, el);
216}
217
218/*
219 * Convenience function to journal all components in a path. 542 * Convenience function to journal all components in a path.
220 */ 543 */
221static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle, 544static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
@@ -368,39 +691,35 @@ struct ocfs2_merge_ctxt {
368 */ 691 */
369int ocfs2_num_free_extents(struct ocfs2_super *osb, 692int ocfs2_num_free_extents(struct ocfs2_super *osb,
370 struct inode *inode, 693 struct inode *inode,
371 struct ocfs2_dinode *fe) 694 struct ocfs2_extent_tree *et)
372{ 695{
373 int retval; 696 int retval;
374 struct ocfs2_extent_list *el; 697 struct ocfs2_extent_list *el = NULL;
375 struct ocfs2_extent_block *eb; 698 struct ocfs2_extent_block *eb;
376 struct buffer_head *eb_bh = NULL; 699 struct buffer_head *eb_bh = NULL;
700 u64 last_eb_blk = 0;
377 701
378 mlog_entry_void(); 702 mlog_entry_void();
379 703
380 if (!OCFS2_IS_VALID_DINODE(fe)) { 704 el = et->et_root_el;
381 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); 705 last_eb_blk = ocfs2_et_get_last_eb_blk(et);
382 retval = -EIO;
383 goto bail;
384 }
385 706
386 if (fe->i_last_eb_blk) { 707 if (last_eb_blk) {
387 retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), 708 retval = ocfs2_read_block(inode, last_eb_blk,
388 &eb_bh, OCFS2_BH_CACHED, inode); 709 &eb_bh);
389 if (retval < 0) { 710 if (retval < 0) {
390 mlog_errno(retval); 711 mlog_errno(retval);
391 goto bail; 712 goto bail;
392 } 713 }
393 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 714 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
394 el = &eb->h_list; 715 el = &eb->h_list;
395 } else 716 }
396 el = &fe->id2.i_list;
397 717
398 BUG_ON(el->l_tree_depth != 0); 718 BUG_ON(el->l_tree_depth != 0);
399 719
400 retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec); 720 retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
401bail: 721bail:
402 if (eb_bh) 722 brelse(eb_bh);
403 brelse(eb_bh);
404 723
405 mlog_exit(retval); 724 mlog_exit(retval);
406 return retval; 725 return retval;
@@ -486,8 +805,7 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
486bail: 805bail:
487 if (status < 0) { 806 if (status < 0) {
488 for(i = 0; i < wanted; i++) { 807 for(i = 0; i < wanted; i++) {
489 if (bhs[i]) 808 brelse(bhs[i]);
490 brelse(bhs[i]);
491 bhs[i] = NULL; 809 bhs[i] = NULL;
492 } 810 }
493 } 811 }
@@ -531,7 +849,7 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el)
531static int ocfs2_add_branch(struct ocfs2_super *osb, 849static int ocfs2_add_branch(struct ocfs2_super *osb,
532 handle_t *handle, 850 handle_t *handle,
533 struct inode *inode, 851 struct inode *inode,
534 struct buffer_head *fe_bh, 852 struct ocfs2_extent_tree *et,
535 struct buffer_head *eb_bh, 853 struct buffer_head *eb_bh,
536 struct buffer_head **last_eb_bh, 854 struct buffer_head **last_eb_bh,
537 struct ocfs2_alloc_context *meta_ac) 855 struct ocfs2_alloc_context *meta_ac)
@@ -540,7 +858,6 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
540 u64 next_blkno, new_last_eb_blk; 858 u64 next_blkno, new_last_eb_blk;
541 struct buffer_head *bh; 859 struct buffer_head *bh;
542 struct buffer_head **new_eb_bhs = NULL; 860 struct buffer_head **new_eb_bhs = NULL;
543 struct ocfs2_dinode *fe;
544 struct ocfs2_extent_block *eb; 861 struct ocfs2_extent_block *eb;
545 struct ocfs2_extent_list *eb_el; 862 struct ocfs2_extent_list *eb_el;
546 struct ocfs2_extent_list *el; 863 struct ocfs2_extent_list *el;
@@ -550,13 +867,11 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
550 867
551 BUG_ON(!last_eb_bh || !*last_eb_bh); 868 BUG_ON(!last_eb_bh || !*last_eb_bh);
552 869
553 fe = (struct ocfs2_dinode *) fe_bh->b_data;
554
555 if (eb_bh) { 870 if (eb_bh) {
556 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 871 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
557 el = &eb->h_list; 872 el = &eb->h_list;
558 } else 873 } else
559 el = &fe->id2.i_list; 874 el = et->et_root_el;
560 875
561 /* we never add a branch to a leaf. */ 876 /* we never add a branch to a leaf. */
562 BUG_ON(!el->l_tree_depth); 877 BUG_ON(!el->l_tree_depth);
@@ -646,7 +961,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
646 mlog_errno(status); 961 mlog_errno(status);
647 goto bail; 962 goto bail;
648 } 963 }
649 status = ocfs2_journal_access(handle, inode, fe_bh, 964 status = ocfs2_journal_access(handle, inode, et->et_root_bh,
650 OCFS2_JOURNAL_ACCESS_WRITE); 965 OCFS2_JOURNAL_ACCESS_WRITE);
651 if (status < 0) { 966 if (status < 0) {
652 mlog_errno(status); 967 mlog_errno(status);
@@ -662,7 +977,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
662 } 977 }
663 978
664 /* Link the new branch into the rest of the tree (el will 979 /* Link the new branch into the rest of the tree (el will
665 * either be on the fe, or the extent block passed in. */ 980 * either be on the root_bh, or the extent block passed in. */
666 i = le16_to_cpu(el->l_next_free_rec); 981 i = le16_to_cpu(el->l_next_free_rec);
667 el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); 982 el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
668 el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); 983 el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
@@ -671,7 +986,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
671 986
672 /* fe needs a new last extent block pointer, as does the 987 /* fe needs a new last extent block pointer, as does the
673 * next_leaf on the previously last-extent-block. */ 988 * next_leaf on the previously last-extent-block. */
674 fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk); 989 ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);
675 990
676 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; 991 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
677 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); 992 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
@@ -679,7 +994,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
679 status = ocfs2_journal_dirty(handle, *last_eb_bh); 994 status = ocfs2_journal_dirty(handle, *last_eb_bh);
680 if (status < 0) 995 if (status < 0)
681 mlog_errno(status); 996 mlog_errno(status);
682 status = ocfs2_journal_dirty(handle, fe_bh); 997 status = ocfs2_journal_dirty(handle, et->et_root_bh);
683 if (status < 0) 998 if (status < 0)
684 mlog_errno(status); 999 mlog_errno(status);
685 if (eb_bh) { 1000 if (eb_bh) {
@@ -700,8 +1015,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
700bail: 1015bail:
701 if (new_eb_bhs) { 1016 if (new_eb_bhs) {
702 for (i = 0; i < new_blocks; i++) 1017 for (i = 0; i < new_blocks; i++)
703 if (new_eb_bhs[i]) 1018 brelse(new_eb_bhs[i]);
704 brelse(new_eb_bhs[i]);
705 kfree(new_eb_bhs); 1019 kfree(new_eb_bhs);
706 } 1020 }
707 1021
@@ -717,16 +1031,15 @@ bail:
717static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, 1031static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
718 handle_t *handle, 1032 handle_t *handle,
719 struct inode *inode, 1033 struct inode *inode,
720 struct buffer_head *fe_bh, 1034 struct ocfs2_extent_tree *et,
721 struct ocfs2_alloc_context *meta_ac, 1035 struct ocfs2_alloc_context *meta_ac,
722 struct buffer_head **ret_new_eb_bh) 1036 struct buffer_head **ret_new_eb_bh)
723{ 1037{
724 int status, i; 1038 int status, i;
725 u32 new_clusters; 1039 u32 new_clusters;
726 struct buffer_head *new_eb_bh = NULL; 1040 struct buffer_head *new_eb_bh = NULL;
727 struct ocfs2_dinode *fe;
728 struct ocfs2_extent_block *eb; 1041 struct ocfs2_extent_block *eb;
729 struct ocfs2_extent_list *fe_el; 1042 struct ocfs2_extent_list *root_el;
730 struct ocfs2_extent_list *eb_el; 1043 struct ocfs2_extent_list *eb_el;
731 1044
732 mlog_entry_void(); 1045 mlog_entry_void();
@@ -746,8 +1059,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
746 } 1059 }
747 1060
748 eb_el = &eb->h_list; 1061 eb_el = &eb->h_list;
749 fe = (struct ocfs2_dinode *) fe_bh->b_data; 1062 root_el = et->et_root_el;
750 fe_el = &fe->id2.i_list;
751 1063
752 status = ocfs2_journal_access(handle, inode, new_eb_bh, 1064 status = ocfs2_journal_access(handle, inode, new_eb_bh,
753 OCFS2_JOURNAL_ACCESS_CREATE); 1065 OCFS2_JOURNAL_ACCESS_CREATE);
@@ -756,11 +1068,11 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
756 goto bail; 1068 goto bail;
757 } 1069 }
758 1070
759 /* copy the fe data into the new extent block */ 1071 /* copy the root extent list data into the new extent block */
760 eb_el->l_tree_depth = fe_el->l_tree_depth; 1072 eb_el->l_tree_depth = root_el->l_tree_depth;
761 eb_el->l_next_free_rec = fe_el->l_next_free_rec; 1073 eb_el->l_next_free_rec = root_el->l_next_free_rec;
762 for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) 1074 for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
763 eb_el->l_recs[i] = fe_el->l_recs[i]; 1075 eb_el->l_recs[i] = root_el->l_recs[i];
764 1076
765 status = ocfs2_journal_dirty(handle, new_eb_bh); 1077 status = ocfs2_journal_dirty(handle, new_eb_bh);
766 if (status < 0) { 1078 if (status < 0) {
@@ -768,7 +1080,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
768 goto bail; 1080 goto bail;
769 } 1081 }
770 1082
771 status = ocfs2_journal_access(handle, inode, fe_bh, 1083 status = ocfs2_journal_access(handle, inode, et->et_root_bh,
772 OCFS2_JOURNAL_ACCESS_WRITE); 1084 OCFS2_JOURNAL_ACCESS_WRITE);
773 if (status < 0) { 1085 if (status < 0) {
774 mlog_errno(status); 1086 mlog_errno(status);
@@ -777,21 +1089,21 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
777 1089
778 new_clusters = ocfs2_sum_rightmost_rec(eb_el); 1090 new_clusters = ocfs2_sum_rightmost_rec(eb_el);
779 1091
780 /* update fe now */ 1092 /* update root_bh now */
781 le16_add_cpu(&fe_el->l_tree_depth, 1); 1093 le16_add_cpu(&root_el->l_tree_depth, 1);
782 fe_el->l_recs[0].e_cpos = 0; 1094 root_el->l_recs[0].e_cpos = 0;
783 fe_el->l_recs[0].e_blkno = eb->h_blkno; 1095 root_el->l_recs[0].e_blkno = eb->h_blkno;
784 fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters); 1096 root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
785 for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) 1097 for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
786 memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); 1098 memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
787 fe_el->l_next_free_rec = cpu_to_le16(1); 1099 root_el->l_next_free_rec = cpu_to_le16(1);
788 1100
789 /* If this is our 1st tree depth shift, then last_eb_blk 1101 /* If this is our 1st tree depth shift, then last_eb_blk
790 * becomes the allocated extent block */ 1102 * becomes the allocated extent block */
791 if (fe_el->l_tree_depth == cpu_to_le16(1)) 1103 if (root_el->l_tree_depth == cpu_to_le16(1))
792 fe->i_last_eb_blk = eb->h_blkno; 1104 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
793 1105
794 status = ocfs2_journal_dirty(handle, fe_bh); 1106 status = ocfs2_journal_dirty(handle, et->et_root_bh);
795 if (status < 0) { 1107 if (status < 0) {
796 mlog_errno(status); 1108 mlog_errno(status);
797 goto bail; 1109 goto bail;
@@ -801,8 +1113,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
801 new_eb_bh = NULL; 1113 new_eb_bh = NULL;
802 status = 0; 1114 status = 0;
803bail: 1115bail:
804 if (new_eb_bh) 1116 brelse(new_eb_bh);
805 brelse(new_eb_bh);
806 1117
807 mlog_exit(status); 1118 mlog_exit(status);
808 return status; 1119 return status;
@@ -817,22 +1128,21 @@ bail:
817 * 1) a lowest extent block is found, then we pass it back in 1128 * 1) a lowest extent block is found, then we pass it back in
818 * *lowest_eb_bh and return '0' 1129 * *lowest_eb_bh and return '0'
819 * 1130 *
820 * 2) the search fails to find anything, but the dinode has room. We 1131 * 2) the search fails to find anything, but the root_el has room. We
821 * pass NULL back in *lowest_eb_bh, but still return '0' 1132 * pass NULL back in *lowest_eb_bh, but still return '0'
822 * 1133 *
823 * 3) the search fails to find anything AND the dinode is full, in 1134 * 3) the search fails to find anything AND the root_el is full, in
824 * which case we return > 0 1135 * which case we return > 0
825 * 1136 *
826 * return status < 0 indicates an error. 1137 * return status < 0 indicates an error.
827 */ 1138 */
828static int ocfs2_find_branch_target(struct ocfs2_super *osb, 1139static int ocfs2_find_branch_target(struct ocfs2_super *osb,
829 struct inode *inode, 1140 struct inode *inode,
830 struct buffer_head *fe_bh, 1141 struct ocfs2_extent_tree *et,
831 struct buffer_head **target_bh) 1142 struct buffer_head **target_bh)
832{ 1143{
833 int status = 0, i; 1144 int status = 0, i;
834 u64 blkno; 1145 u64 blkno;
835 struct ocfs2_dinode *fe;
836 struct ocfs2_extent_block *eb; 1146 struct ocfs2_extent_block *eb;
837 struct ocfs2_extent_list *el; 1147 struct ocfs2_extent_list *el;
838 struct buffer_head *bh = NULL; 1148 struct buffer_head *bh = NULL;
@@ -842,8 +1152,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
842 1152
843 *target_bh = NULL; 1153 *target_bh = NULL;
844 1154
845 fe = (struct ocfs2_dinode *) fe_bh->b_data; 1155 el = et->et_root_el;
846 el = &fe->id2.i_list;
847 1156
848 while(le16_to_cpu(el->l_tree_depth) > 1) { 1157 while(le16_to_cpu(el->l_tree_depth) > 1) {
849 if (le16_to_cpu(el->l_next_free_rec) == 0) { 1158 if (le16_to_cpu(el->l_next_free_rec) == 0) {
@@ -864,13 +1173,10 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
864 goto bail; 1173 goto bail;
865 } 1174 }
866 1175
867 if (bh) { 1176 brelse(bh);
868 brelse(bh); 1177 bh = NULL;
869 bh = NULL;
870 }
871 1178
872 status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED, 1179 status = ocfs2_read_block(inode, blkno, &bh);
873 inode);
874 if (status < 0) { 1180 if (status < 0) {
875 mlog_errno(status); 1181 mlog_errno(status);
876 goto bail; 1182 goto bail;
@@ -886,8 +1192,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
886 1192
887 if (le16_to_cpu(el->l_next_free_rec) < 1193 if (le16_to_cpu(el->l_next_free_rec) <
888 le16_to_cpu(el->l_count)) { 1194 le16_to_cpu(el->l_count)) {
889 if (lowest_bh) 1195 brelse(lowest_bh);
890 brelse(lowest_bh);
891 lowest_bh = bh; 1196 lowest_bh = bh;
892 get_bh(lowest_bh); 1197 get_bh(lowest_bh);
893 } 1198 }
@@ -895,14 +1200,13 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
895 1200
896 /* If we didn't find one and the fe doesn't have any room, 1201 /* If we didn't find one and the fe doesn't have any room,
897 * then return '1' */ 1202 * then return '1' */
898 if (!lowest_bh 1203 el = et->et_root_el;
899 && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count)) 1204 if (!lowest_bh && (el->l_next_free_rec == el->l_count))
900 status = 1; 1205 status = 1;
901 1206
902 *target_bh = lowest_bh; 1207 *target_bh = lowest_bh;
903bail: 1208bail:
904 if (bh) 1209 brelse(bh);
905 brelse(bh);
906 1210
907 mlog_exit(status); 1211 mlog_exit(status);
908 return status; 1212 return status;
@@ -919,19 +1223,19 @@ bail:
919 * *last_eb_bh will be updated by ocfs2_add_branch(). 1223 * *last_eb_bh will be updated by ocfs2_add_branch().
920 */ 1224 */
921static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, 1225static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
922 struct buffer_head *di_bh, int *final_depth, 1226 struct ocfs2_extent_tree *et, int *final_depth,
923 struct buffer_head **last_eb_bh, 1227 struct buffer_head **last_eb_bh,
924 struct ocfs2_alloc_context *meta_ac) 1228 struct ocfs2_alloc_context *meta_ac)
925{ 1229{
926 int ret, shift; 1230 int ret, shift;
927 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 1231 struct ocfs2_extent_list *el = et->et_root_el;
928 int depth = le16_to_cpu(di->id2.i_list.l_tree_depth); 1232 int depth = le16_to_cpu(el->l_tree_depth);
929 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1233 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
930 struct buffer_head *bh = NULL; 1234 struct buffer_head *bh = NULL;
931 1235
932 BUG_ON(meta_ac == NULL); 1236 BUG_ON(meta_ac == NULL);
933 1237
934 shift = ocfs2_find_branch_target(osb, inode, di_bh, &bh); 1238 shift = ocfs2_find_branch_target(osb, inode, et, &bh);
935 if (shift < 0) { 1239 if (shift < 0) {
936 ret = shift; 1240 ret = shift;
937 mlog_errno(ret); 1241 mlog_errno(ret);
@@ -948,7 +1252,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
948 /* ocfs2_shift_tree_depth will return us a buffer with 1252 /* ocfs2_shift_tree_depth will return us a buffer with
949 * the new extent block (so we can pass that to 1253 * the new extent block (so we can pass that to
950 * ocfs2_add_branch). */ 1254 * ocfs2_add_branch). */
951 ret = ocfs2_shift_tree_depth(osb, handle, inode, di_bh, 1255 ret = ocfs2_shift_tree_depth(osb, handle, inode, et,
952 meta_ac, &bh); 1256 meta_ac, &bh);
953 if (ret < 0) { 1257 if (ret < 0) {
954 mlog_errno(ret); 1258 mlog_errno(ret);
@@ -975,7 +1279,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
975 /* call ocfs2_add_branch to add the final part of the tree with 1279 /* call ocfs2_add_branch to add the final part of the tree with
976 * the new data. */ 1280 * the new data. */
977 mlog(0, "add branch. bh = %p\n", bh); 1281 mlog(0, "add branch. bh = %p\n", bh);
978 ret = ocfs2_add_branch(osb, handle, inode, di_bh, bh, last_eb_bh, 1282 ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh,
979 meta_ac); 1283 meta_ac);
980 if (ret < 0) { 1284 if (ret < 0) {
981 mlog_errno(ret); 1285 mlog_errno(ret);
@@ -990,15 +1294,6 @@ out:
990} 1294}
991 1295
992/* 1296/*
993 * This is only valid for leaf nodes, which are the only ones that can
994 * have empty extents anyway.
995 */
996static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
997{
998 return !rec->e_leaf_clusters;
999}
1000
1001/*
1002 * This function will discard the rightmost extent record. 1297 * This function will discard the rightmost extent record.
1003 */ 1298 */
1004static void ocfs2_shift_records_right(struct ocfs2_extent_list *el) 1299static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
@@ -1245,8 +1540,7 @@ static int __ocfs2_find_path(struct inode *inode,
1245 1540
1246 brelse(bh); 1541 brelse(bh);
1247 bh = NULL; 1542 bh = NULL;
1248 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, 1543 ret = ocfs2_read_block(inode, blkno, &bh);
1249 &bh, OCFS2_BH_CACHED, inode);
1250 if (ret) { 1544 if (ret) {
1251 mlog_errno(ret); 1545 mlog_errno(ret);
1252 goto out; 1546 goto out;
@@ -2067,11 +2361,11 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2067 struct ocfs2_path *right_path, 2361 struct ocfs2_path *right_path,
2068 int subtree_index, 2362 int subtree_index,
2069 struct ocfs2_cached_dealloc_ctxt *dealloc, 2363 struct ocfs2_cached_dealloc_ctxt *dealloc,
2070 int *deleted) 2364 int *deleted,
2365 struct ocfs2_extent_tree *et)
2071{ 2366{
2072 int ret, i, del_right_subtree = 0, right_has_empty = 0; 2367 int ret, i, del_right_subtree = 0, right_has_empty = 0;
2073 struct buffer_head *root_bh, *di_bh = path_root_bh(right_path); 2368 struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
2074 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2075 struct ocfs2_extent_list *right_leaf_el, *left_leaf_el; 2369 struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2076 struct ocfs2_extent_block *eb; 2370 struct ocfs2_extent_block *eb;
2077 2371
@@ -2123,7 +2417,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2123 * We have to update i_last_eb_blk during the meta 2417 * We have to update i_last_eb_blk during the meta
2124 * data delete. 2418 * data delete.
2125 */ 2419 */
2126 ret = ocfs2_journal_access(handle, inode, di_bh, 2420 ret = ocfs2_journal_access(handle, inode, et_root_bh,
2127 OCFS2_JOURNAL_ACCESS_WRITE); 2421 OCFS2_JOURNAL_ACCESS_WRITE);
2128 if (ret) { 2422 if (ret) {
2129 mlog_errno(ret); 2423 mlog_errno(ret);
@@ -2198,7 +2492,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2198 ocfs2_update_edge_lengths(inode, handle, left_path); 2492 ocfs2_update_edge_lengths(inode, handle, left_path);
2199 2493
2200 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; 2494 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2201 di->i_last_eb_blk = eb->h_blkno; 2495 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2202 2496
2203 /* 2497 /*
2204 * Removal of the extent in the left leaf was skipped 2498 * Removal of the extent in the left leaf was skipped
@@ -2208,7 +2502,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2208 if (right_has_empty) 2502 if (right_has_empty)
2209 ocfs2_remove_empty_extent(left_leaf_el); 2503 ocfs2_remove_empty_extent(left_leaf_el);
2210 2504
2211 ret = ocfs2_journal_dirty(handle, di_bh); 2505 ret = ocfs2_journal_dirty(handle, et_root_bh);
2212 if (ret) 2506 if (ret)
2213 mlog_errno(ret); 2507 mlog_errno(ret);
2214 2508
@@ -2331,7 +2625,8 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
2331 handle_t *handle, int orig_credits, 2625 handle_t *handle, int orig_credits,
2332 struct ocfs2_path *path, 2626 struct ocfs2_path *path,
2333 struct ocfs2_cached_dealloc_ctxt *dealloc, 2627 struct ocfs2_cached_dealloc_ctxt *dealloc,
2334 struct ocfs2_path **empty_extent_path) 2628 struct ocfs2_path **empty_extent_path,
2629 struct ocfs2_extent_tree *et)
2335{ 2630{
2336 int ret, subtree_root, deleted; 2631 int ret, subtree_root, deleted;
2337 u32 right_cpos; 2632 u32 right_cpos;
@@ -2404,7 +2699,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
2404 2699
2405 ret = ocfs2_rotate_subtree_left(inode, handle, left_path, 2700 ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
2406 right_path, subtree_root, 2701 right_path, subtree_root,
2407 dealloc, &deleted); 2702 dealloc, &deleted, et);
2408 if (ret == -EAGAIN) { 2703 if (ret == -EAGAIN) {
2409 /* 2704 /*
2410 * The rotation has to temporarily stop due to 2705 * The rotation has to temporarily stop due to
@@ -2447,29 +2742,20 @@ out:
2447} 2742}
2448 2743
2449static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, 2744static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2450 struct ocfs2_path *path, 2745 struct ocfs2_path *path,
2451 struct ocfs2_cached_dealloc_ctxt *dealloc) 2746 struct ocfs2_cached_dealloc_ctxt *dealloc,
2747 struct ocfs2_extent_tree *et)
2452{ 2748{
2453 int ret, subtree_index; 2749 int ret, subtree_index;
2454 u32 cpos; 2750 u32 cpos;
2455 struct ocfs2_path *left_path = NULL; 2751 struct ocfs2_path *left_path = NULL;
2456 struct ocfs2_dinode *di;
2457 struct ocfs2_extent_block *eb; 2752 struct ocfs2_extent_block *eb;
2458 struct ocfs2_extent_list *el; 2753 struct ocfs2_extent_list *el;
2459 2754
2460 /*
2461 * XXX: This code assumes that the root is an inode, which is
2462 * true for now but may change as tree code gets generic.
2463 */
2464 di = (struct ocfs2_dinode *)path_root_bh(path)->b_data;
2465 if (!OCFS2_IS_VALID_DINODE(di)) {
2466 ret = -EIO;
2467 ocfs2_error(inode->i_sb,
2468 "Inode %llu has invalid path root",
2469 (unsigned long long)OCFS2_I(inode)->ip_blkno);
2470 goto out;
2471 }
2472 2755
2756 ret = ocfs2_et_sanity_check(inode, et);
2757 if (ret)
2758 goto out;
2473 /* 2759 /*
2474 * There's two ways we handle this depending on 2760 * There's two ways we handle this depending on
2475 * whether path is the only existing one. 2761 * whether path is the only existing one.
@@ -2526,7 +2812,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2526 ocfs2_update_edge_lengths(inode, handle, left_path); 2812 ocfs2_update_edge_lengths(inode, handle, left_path);
2527 2813
2528 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; 2814 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2529 di->i_last_eb_blk = eb->h_blkno; 2815 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2530 } else { 2816 } else {
2531 /* 2817 /*
2532 * 'path' is also the leftmost path which 2818 * 'path' is also the leftmost path which
@@ -2537,12 +2823,12 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2537 */ 2823 */
2538 ocfs2_unlink_path(inode, handle, dealloc, path, 1); 2824 ocfs2_unlink_path(inode, handle, dealloc, path, 1);
2539 2825
2540 el = &di->id2.i_list; 2826 el = et->et_root_el;
2541 el->l_tree_depth = 0; 2827 el->l_tree_depth = 0;
2542 el->l_next_free_rec = 0; 2828 el->l_next_free_rec = 0;
2543 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 2829 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2544 2830
2545 di->i_last_eb_blk = 0; 2831 ocfs2_et_set_last_eb_blk(et, 0);
2546 } 2832 }
2547 2833
2548 ocfs2_journal_dirty(handle, path_root_bh(path)); 2834 ocfs2_journal_dirty(handle, path_root_bh(path));
@@ -2570,7 +2856,8 @@ out:
2570 */ 2856 */
2571static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle, 2857static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
2572 struct ocfs2_path *path, 2858 struct ocfs2_path *path,
2573 struct ocfs2_cached_dealloc_ctxt *dealloc) 2859 struct ocfs2_cached_dealloc_ctxt *dealloc,
2860 struct ocfs2_extent_tree *et)
2574{ 2861{
2575 int ret, orig_credits = handle->h_buffer_credits; 2862 int ret, orig_credits = handle->h_buffer_credits;
2576 struct ocfs2_path *tmp_path = NULL, *restart_path = NULL; 2863 struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
@@ -2584,7 +2871,7 @@ static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
2584 if (path->p_tree_depth == 0) { 2871 if (path->p_tree_depth == 0) {
2585rightmost_no_delete: 2872rightmost_no_delete:
2586 /* 2873 /*
2587 * In-inode extents. This is trivially handled, so do 2874 * Inline extents. This is trivially handled, so do
2588 * it up front. 2875 * it up front.
2589 */ 2876 */
2590 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle, 2877 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
@@ -2638,7 +2925,7 @@ rightmost_no_delete:
2638 */ 2925 */
2639 2926
2640 ret = ocfs2_remove_rightmost_path(inode, handle, path, 2927 ret = ocfs2_remove_rightmost_path(inode, handle, path,
2641 dealloc); 2928 dealloc, et);
2642 if (ret) 2929 if (ret)
2643 mlog_errno(ret); 2930 mlog_errno(ret);
2644 goto out; 2931 goto out;
@@ -2650,7 +2937,7 @@ rightmost_no_delete:
2650 */ 2937 */
2651try_rotate: 2938try_rotate:
2652 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path, 2939 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
2653 dealloc, &restart_path); 2940 dealloc, &restart_path, et);
2654 if (ret && ret != -EAGAIN) { 2941 if (ret && ret != -EAGAIN) {
2655 mlog_errno(ret); 2942 mlog_errno(ret);
2656 goto out; 2943 goto out;
@@ -2662,7 +2949,7 @@ try_rotate:
2662 2949
2663 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, 2950 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
2664 tmp_path, dealloc, 2951 tmp_path, dealloc,
2665 &restart_path); 2952 &restart_path, et);
2666 if (ret && ret != -EAGAIN) { 2953 if (ret && ret != -EAGAIN) {
2667 mlog_errno(ret); 2954 mlog_errno(ret);
2668 goto out; 2955 goto out;
@@ -2948,6 +3235,7 @@ static int ocfs2_merge_rec_left(struct inode *inode,
2948 handle_t *handle, 3235 handle_t *handle,
2949 struct ocfs2_extent_rec *split_rec, 3236 struct ocfs2_extent_rec *split_rec,
2950 struct ocfs2_cached_dealloc_ctxt *dealloc, 3237 struct ocfs2_cached_dealloc_ctxt *dealloc,
3238 struct ocfs2_extent_tree *et,
2951 int index) 3239 int index)
2952{ 3240{
2953 int ret, i, subtree_index = 0, has_empty_extent = 0; 3241 int ret, i, subtree_index = 0, has_empty_extent = 0;
@@ -3068,7 +3356,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
3068 le16_to_cpu(el->l_next_free_rec) == 1) { 3356 le16_to_cpu(el->l_next_free_rec) == 1) {
3069 3357
3070 ret = ocfs2_remove_rightmost_path(inode, handle, 3358 ret = ocfs2_remove_rightmost_path(inode, handle,
3071 right_path, dealloc); 3359 right_path,
3360 dealloc, et);
3072 if (ret) { 3361 if (ret) {
3073 mlog_errno(ret); 3362 mlog_errno(ret);
3074 goto out; 3363 goto out;
@@ -3095,7 +3384,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3095 int split_index, 3384 int split_index,
3096 struct ocfs2_extent_rec *split_rec, 3385 struct ocfs2_extent_rec *split_rec,
3097 struct ocfs2_cached_dealloc_ctxt *dealloc, 3386 struct ocfs2_cached_dealloc_ctxt *dealloc,
3098 struct ocfs2_merge_ctxt *ctxt) 3387 struct ocfs2_merge_ctxt *ctxt,
3388 struct ocfs2_extent_tree *et)
3099 3389
3100{ 3390{
3101 int ret = 0; 3391 int ret = 0;
@@ -3113,7 +3403,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3113 * illegal. 3403 * illegal.
3114 */ 3404 */
3115 ret = ocfs2_rotate_tree_left(inode, handle, path, 3405 ret = ocfs2_rotate_tree_left(inode, handle, path,
3116 dealloc); 3406 dealloc, et);
3117 if (ret) { 3407 if (ret) {
3118 mlog_errno(ret); 3408 mlog_errno(ret);
3119 goto out; 3409 goto out;
@@ -3156,7 +3446,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3156 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); 3446 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
3157 3447
3158 /* The merge left us with an empty extent, remove it. */ 3448 /* The merge left us with an empty extent, remove it. */
3159 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); 3449 ret = ocfs2_rotate_tree_left(inode, handle, path,
3450 dealloc, et);
3160 if (ret) { 3451 if (ret) {
3161 mlog_errno(ret); 3452 mlog_errno(ret);
3162 goto out; 3453 goto out;
@@ -3170,7 +3461,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3170 */ 3461 */
3171 ret = ocfs2_merge_rec_left(inode, path, 3462 ret = ocfs2_merge_rec_left(inode, path,
3172 handle, rec, 3463 handle, rec,
3173 dealloc, 3464 dealloc, et,
3174 split_index); 3465 split_index);
3175 3466
3176 if (ret) { 3467 if (ret) {
@@ -3179,7 +3470,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3179 } 3470 }
3180 3471
3181 ret = ocfs2_rotate_tree_left(inode, handle, path, 3472 ret = ocfs2_rotate_tree_left(inode, handle, path,
3182 dealloc); 3473 dealloc, et);
3183 /* 3474 /*
3184 * Error from this last rotate is not critical, so 3475 * Error from this last rotate is not critical, so
3185 * print but don't bubble it up. 3476 * print but don't bubble it up.
@@ -3199,7 +3490,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3199 ret = ocfs2_merge_rec_left(inode, 3490 ret = ocfs2_merge_rec_left(inode,
3200 path, 3491 path,
3201 handle, split_rec, 3492 handle, split_rec,
3202 dealloc, 3493 dealloc, et,
3203 split_index); 3494 split_index);
3204 if (ret) { 3495 if (ret) {
3205 mlog_errno(ret); 3496 mlog_errno(ret);
@@ -3222,7 +3513,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
3222 * our leaf. Try to rotate it away. 3513 * our leaf. Try to rotate it away.
3223 */ 3514 */
3224 ret = ocfs2_rotate_tree_left(inode, handle, path, 3515 ret = ocfs2_rotate_tree_left(inode, handle, path,
3225 dealloc); 3516 dealloc, et);
3226 if (ret) 3517 if (ret)
3227 mlog_errno(ret); 3518 mlog_errno(ret);
3228 ret = 0; 3519 ret = 0;
@@ -3356,16 +3647,6 @@ rotate:
3356 ocfs2_rotate_leaf(el, insert_rec); 3647 ocfs2_rotate_leaf(el, insert_rec);
3357} 3648}
3358 3649
3359static inline void ocfs2_update_dinode_clusters(struct inode *inode,
3360 struct ocfs2_dinode *di,
3361 u32 clusters)
3362{
3363 le32_add_cpu(&di->i_clusters, clusters);
3364 spin_lock(&OCFS2_I(inode)->ip_lock);
3365 OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
3366 spin_unlock(&OCFS2_I(inode)->ip_lock);
3367}
3368
3369static void ocfs2_adjust_rightmost_records(struct inode *inode, 3650static void ocfs2_adjust_rightmost_records(struct inode *inode,
3370 handle_t *handle, 3651 handle_t *handle,
3371 struct ocfs2_path *path, 3652 struct ocfs2_path *path,
@@ -3567,8 +3848,8 @@ static void ocfs2_split_record(struct inode *inode,
3567} 3848}
3568 3849
3569/* 3850/*
3570 * This function only does inserts on an allocation b-tree. For dinode 3851 * This function only does inserts on an allocation b-tree. For tree
3571 * lists, ocfs2_insert_at_leaf() is called directly. 3852 * depth = 0, ocfs2_insert_at_leaf() is called directly.
3572 * 3853 *
3573 * right_path is the path we want to do the actual insert 3854 * right_path is the path we want to do the actual insert
3574 * in. left_path should only be passed in if we need to update that 3855 * in. left_path should only be passed in if we need to update that
@@ -3665,7 +3946,7 @@ out:
3665 3946
3666static int ocfs2_do_insert_extent(struct inode *inode, 3947static int ocfs2_do_insert_extent(struct inode *inode,
3667 handle_t *handle, 3948 handle_t *handle,
3668 struct buffer_head *di_bh, 3949 struct ocfs2_extent_tree *et,
3669 struct ocfs2_extent_rec *insert_rec, 3950 struct ocfs2_extent_rec *insert_rec,
3670 struct ocfs2_insert_type *type) 3951 struct ocfs2_insert_type *type)
3671{ 3952{
@@ -3673,13 +3954,11 @@ static int ocfs2_do_insert_extent(struct inode *inode,
3673 u32 cpos; 3954 u32 cpos;
3674 struct ocfs2_path *right_path = NULL; 3955 struct ocfs2_path *right_path = NULL;
3675 struct ocfs2_path *left_path = NULL; 3956 struct ocfs2_path *left_path = NULL;
3676 struct ocfs2_dinode *di;
3677 struct ocfs2_extent_list *el; 3957 struct ocfs2_extent_list *el;
3678 3958
3679 di = (struct ocfs2_dinode *) di_bh->b_data; 3959 el = et->et_root_el;
3680 el = &di->id2.i_list;
3681 3960
3682 ret = ocfs2_journal_access(handle, inode, di_bh, 3961 ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
3683 OCFS2_JOURNAL_ACCESS_WRITE); 3962 OCFS2_JOURNAL_ACCESS_WRITE);
3684 if (ret) { 3963 if (ret) {
3685 mlog_errno(ret); 3964 mlog_errno(ret);
@@ -3691,7 +3970,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
3691 goto out_update_clusters; 3970 goto out_update_clusters;
3692 } 3971 }
3693 3972
3694 right_path = ocfs2_new_inode_path(di_bh); 3973 right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
3695 if (!right_path) { 3974 if (!right_path) {
3696 ret = -ENOMEM; 3975 ret = -ENOMEM;
3697 mlog_errno(ret); 3976 mlog_errno(ret);
@@ -3741,7 +4020,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
3741 * ocfs2_rotate_tree_right() might have extended the 4020 * ocfs2_rotate_tree_right() might have extended the
3742 * transaction without re-journaling our tree root. 4021 * transaction without re-journaling our tree root.
3743 */ 4022 */
3744 ret = ocfs2_journal_access(handle, inode, di_bh, 4023 ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
3745 OCFS2_JOURNAL_ACCESS_WRITE); 4024 OCFS2_JOURNAL_ACCESS_WRITE);
3746 if (ret) { 4025 if (ret) {
3747 mlog_errno(ret); 4026 mlog_errno(ret);
@@ -3766,10 +4045,10 @@ static int ocfs2_do_insert_extent(struct inode *inode,
3766 4045
3767out_update_clusters: 4046out_update_clusters:
3768 if (type->ins_split == SPLIT_NONE) 4047 if (type->ins_split == SPLIT_NONE)
3769 ocfs2_update_dinode_clusters(inode, di, 4048 ocfs2_et_update_clusters(inode, et,
3770 le16_to_cpu(insert_rec->e_leaf_clusters)); 4049 le16_to_cpu(insert_rec->e_leaf_clusters));
3771 4050
3772 ret = ocfs2_journal_dirty(handle, di_bh); 4051 ret = ocfs2_journal_dirty(handle, et->et_root_bh);
3773 if (ret) 4052 if (ret)
3774 mlog_errno(ret); 4053 mlog_errno(ret);
3775 4054
@@ -3899,7 +4178,8 @@ out:
3899static void ocfs2_figure_contig_type(struct inode *inode, 4178static void ocfs2_figure_contig_type(struct inode *inode,
3900 struct ocfs2_insert_type *insert, 4179 struct ocfs2_insert_type *insert,
3901 struct ocfs2_extent_list *el, 4180 struct ocfs2_extent_list *el,
3902 struct ocfs2_extent_rec *insert_rec) 4181 struct ocfs2_extent_rec *insert_rec,
4182 struct ocfs2_extent_tree *et)
3903{ 4183{
3904 int i; 4184 int i;
3905 enum ocfs2_contig_type contig_type = CONTIG_NONE; 4185 enum ocfs2_contig_type contig_type = CONTIG_NONE;
@@ -3915,6 +4195,21 @@ static void ocfs2_figure_contig_type(struct inode *inode,
3915 } 4195 }
3916 } 4196 }
3917 insert->ins_contig = contig_type; 4197 insert->ins_contig = contig_type;
4198
4199 if (insert->ins_contig != CONTIG_NONE) {
4200 struct ocfs2_extent_rec *rec =
4201 &el->l_recs[insert->ins_contig_index];
4202 unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
4203 le16_to_cpu(insert_rec->e_leaf_clusters);
4204
4205 /*
4206 * The caller might want us to limit the size of extents; don't
4207 * calculate contiguousness if we might exceed that limit.
4208 */
4209 if (et->et_max_leaf_clusters &&
4210 (len > et->et_max_leaf_clusters))
4211 insert->ins_contig = CONTIG_NONE;
4212 }
3918} 4213}
3919 4214
3920/* 4215/*
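
The new et_max_leaf_clusters guard only matters for trees that set a cap (the xattr tree caps its leaves at OCFS2_MAX_XATTR_TREE_LEAF_SIZE; dinode trees leave it zero). Below is a standalone sketch of the check; the cluster size and record lengths are hypothetical, not taken from the patch:

    #include <stdio.h>

    int main(void)
    {
            /* Hypothetical: 64K xattr leaf limit, 4K clusters -> 16 clusters. */
            unsigned int max_leaf_clusters = 65536 / 4096;
            unsigned int rec_clusters = 12;    /* existing contiguous record */
            unsigned int insert_clusters = 8;  /* record being inserted */
            unsigned int len = rec_clusters + insert_clusters;

            /* Mirrors: if (et->et_max_leaf_clusters && len > et->et_max_leaf_clusters) */
            if (max_leaf_clusters && len > max_leaf_clusters)
                    printf("%u > %u: merge rejected, ins_contig = CONTIG_NONE\n",
                           len, max_leaf_clusters);
            else
                    printf("records remain merge candidates\n");
            return 0;
    }
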
@@ -3923,8 +4218,8 @@ static void ocfs2_figure_contig_type(struct inode *inode,
3923 * ocfs2_figure_appending_type() will figure out whether we'll have to 4218 * ocfs2_figure_appending_type() will figure out whether we'll have to
3924 * insert at the tail of the rightmost leaf. 4219 * insert at the tail of the rightmost leaf.
3925 * 4220 *
3926 * This should also work against the dinode list for trees with 0 4221 * This should also work against the root extent list for trees with 0
3927 * depth. If we consider the dinode list to be the rightmost leaf node 4222 * depth. If we consider the root extent list to be the rightmost leaf node
3928 * then the logic here makes sense. 4223 * then the logic here makes sense.
3929 */ 4224 */
3930static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert, 4225static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
@@ -3975,14 +4270,13 @@ set_tail_append:
3975 * structure. 4270 * structure.
3976 */ 4271 */
3977static int ocfs2_figure_insert_type(struct inode *inode, 4272static int ocfs2_figure_insert_type(struct inode *inode,
3978 struct buffer_head *di_bh, 4273 struct ocfs2_extent_tree *et,
3979 struct buffer_head **last_eb_bh, 4274 struct buffer_head **last_eb_bh,
3980 struct ocfs2_extent_rec *insert_rec, 4275 struct ocfs2_extent_rec *insert_rec,
3981 int *free_records, 4276 int *free_records,
3982 struct ocfs2_insert_type *insert) 4277 struct ocfs2_insert_type *insert)
3983{ 4278{
3984 int ret; 4279 int ret;
3985 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3986 struct ocfs2_extent_block *eb; 4280 struct ocfs2_extent_block *eb;
3987 struct ocfs2_extent_list *el; 4281 struct ocfs2_extent_list *el;
3988 struct ocfs2_path *path = NULL; 4282 struct ocfs2_path *path = NULL;
@@ -3990,7 +4284,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
3990 4284
3991 insert->ins_split = SPLIT_NONE; 4285 insert->ins_split = SPLIT_NONE;
3992 4286
3993 el = &di->id2.i_list; 4287 el = et->et_root_el;
3994 insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); 4288 insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
3995 4289
3996 if (el->l_tree_depth) { 4290 if (el->l_tree_depth) {
@@ -4000,9 +4294,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4000 * ocfs2_figure_insert_type() and ocfs2_add_branch() 4294 * ocfs2_figure_insert_type() and ocfs2_add_branch()
4001 * may want it later. 4295 * may want it later.
4002 */ 4296 */
4003 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), 4297 ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), &bh);
4004 le64_to_cpu(di->i_last_eb_blk), &bh,
4005 OCFS2_BH_CACHED, inode);
4006 if (ret) { 4298 if (ret) {
4007 mlog_exit(ret); 4299 mlog_exit(ret);
4008 goto out; 4300 goto out;
@@ -4023,12 +4315,12 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4023 le16_to_cpu(el->l_next_free_rec); 4315 le16_to_cpu(el->l_next_free_rec);
4024 4316
4025 if (!insert->ins_tree_depth) { 4317 if (!insert->ins_tree_depth) {
4026 ocfs2_figure_contig_type(inode, insert, el, insert_rec); 4318 ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
4027 ocfs2_figure_appending_type(insert, el, insert_rec); 4319 ocfs2_figure_appending_type(insert, el, insert_rec);
4028 return 0; 4320 return 0;
4029 } 4321 }
4030 4322
4031 path = ocfs2_new_inode_path(di_bh); 4323 path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
4032 if (!path) { 4324 if (!path) {
4033 ret = -ENOMEM; 4325 ret = -ENOMEM;
4034 mlog_errno(ret); 4326 mlog_errno(ret);
@@ -4057,7 +4349,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4057 * into two types of appends: simple record append, or a 4349 * into two types of appends: simple record append, or a
4058 * rotate inside the tail leaf. 4350 * rotate inside the tail leaf.
4059 */ 4351 */
4060 ocfs2_figure_contig_type(inode, insert, el, insert_rec); 4352 ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
4061 4353
4062 /* 4354 /*
4063 * The insert code isn't quite ready to deal with all cases of 4355 * The insert code isn't quite ready to deal with all cases of
@@ -4078,7 +4370,8 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4078 * the case that we're doing a tail append, so maybe we can 4370 * the case that we're doing a tail append, so maybe we can
4079 * take advantage of that information somehow. 4371 * take advantage of that information somehow.
4080 */ 4372 */
4081 if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) { 4373 if (ocfs2_et_get_last_eb_blk(et) ==
4374 path_leaf_bh(path)->b_blocknr) {
4082 /* 4375 /*
4083 * Ok, ocfs2_find_path() returned us the rightmost 4376 * Ok, ocfs2_find_path() returned us the rightmost
4084 * tree path. This might be an appending insert. There are 4377 * tree path. This might be an appending insert. There are
@@ -4108,7 +4401,7 @@ out:
4108int ocfs2_insert_extent(struct ocfs2_super *osb, 4401int ocfs2_insert_extent(struct ocfs2_super *osb,
4109 handle_t *handle, 4402 handle_t *handle,
4110 struct inode *inode, 4403 struct inode *inode,
4111 struct buffer_head *fe_bh, 4404 struct ocfs2_extent_tree *et,
4112 u32 cpos, 4405 u32 cpos,
4113 u64 start_blk, 4406 u64 start_blk,
4114 u32 new_clusters, 4407 u32 new_clusters,
@@ -4121,26 +4414,21 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
4121 struct ocfs2_insert_type insert = {0, }; 4414 struct ocfs2_insert_type insert = {0, };
4122 struct ocfs2_extent_rec rec; 4415 struct ocfs2_extent_rec rec;
4123 4416
4124 BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
4125
4126 mlog(0, "add %u clusters at position %u to inode %llu\n", 4417 mlog(0, "add %u clusters at position %u to inode %llu\n",
4127 new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno); 4418 new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4128 4419
4129 mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
4130 (OCFS2_I(inode)->ip_clusters != cpos),
4131 "Device %s, asking for sparse allocation: inode %llu, "
4132 "cpos %u, clusters %u\n",
4133 osb->dev_str,
4134 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
4135 OCFS2_I(inode)->ip_clusters);
4136
4137 memset(&rec, 0, sizeof(rec)); 4420 memset(&rec, 0, sizeof(rec));
4138 rec.e_cpos = cpu_to_le32(cpos); 4421 rec.e_cpos = cpu_to_le32(cpos);
4139 rec.e_blkno = cpu_to_le64(start_blk); 4422 rec.e_blkno = cpu_to_le64(start_blk);
4140 rec.e_leaf_clusters = cpu_to_le16(new_clusters); 4423 rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4141 rec.e_flags = flags; 4424 rec.e_flags = flags;
4425 status = ocfs2_et_insert_check(inode, et, &rec);
4426 if (status) {
4427 mlog_errno(status);
4428 goto bail;
4429 }
4142 4430
4143 status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec, 4431 status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec,
4144 &free_records, &insert); 4432 &free_records, &insert);
4145 if (status < 0) { 4433 if (status < 0) {
4146 mlog_errno(status); 4434 mlog_errno(status);
@@ -4154,7 +4442,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
4154 free_records, insert.ins_tree_depth); 4442 free_records, insert.ins_tree_depth);
4155 4443
4156 if (insert.ins_contig == CONTIG_NONE && free_records == 0) { 4444 if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4157 status = ocfs2_grow_tree(inode, handle, fe_bh, 4445 status = ocfs2_grow_tree(inode, handle, et,
4158 &insert.ins_tree_depth, &last_eb_bh, 4446 &insert.ins_tree_depth, &last_eb_bh,
4159 meta_ac); 4447 meta_ac);
4160 if (status) { 4448 if (status) {
@@ -4164,17 +4452,124 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
4164 } 4452 }
4165 4453
4166 /* Finally, we can add clusters. This might rotate the tree for us. */ 4454 /* Finally, we can add clusters. This might rotate the tree for us. */
4167 status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert); 4455 status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert);
4168 if (status < 0) 4456 if (status < 0)
4169 mlog_errno(status); 4457 mlog_errno(status);
4170 else 4458 else if (et->et_ops == &ocfs2_dinode_et_ops)
4171 ocfs2_extent_map_insert_rec(inode, &rec); 4459 ocfs2_extent_map_insert_rec(inode, &rec);
4172 4460
4173bail: 4461bail:
4174 if (last_eb_bh) 4462 brelse(last_eb_bh);
4175 brelse(last_eb_bh); 4463
4464 mlog_exit(status);
4465 return status;
4466}
4467
4468/*
4469 * Allocate and add clusters into the extent b-tree.
4470 * The new clusters (clusters_to_add) will be inserted at logical_offset.
4471 * The extent b-tree's root is specified by et, and
4472 * it is not limited to file storage. Any extent tree can use this
4473 * function if it implements the proper ocfs2_extent_tree.
4474 */
4475int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
4476 struct inode *inode,
4477 u32 *logical_offset,
4478 u32 clusters_to_add,
4479 int mark_unwritten,
4480 struct ocfs2_extent_tree *et,
4481 handle_t *handle,
4482 struct ocfs2_alloc_context *data_ac,
4483 struct ocfs2_alloc_context *meta_ac,
4484 enum ocfs2_alloc_restarted *reason_ret)
4485{
4486 int status = 0;
4487 int free_extents;
4488 enum ocfs2_alloc_restarted reason = RESTART_NONE;
4489 u32 bit_off, num_bits;
4490 u64 block;
4491 u8 flags = 0;
4492
4493 BUG_ON(!clusters_to_add);
4494
4495 if (mark_unwritten)
4496 flags = OCFS2_EXT_UNWRITTEN;
4497
4498 free_extents = ocfs2_num_free_extents(osb, inode, et);
4499 if (free_extents < 0) {
4500 status = free_extents;
4501 mlog_errno(status);
4502 goto leave;
4503 }
4504
4505 /* There are two cases which can cause us to return -EAGAIN in the
4506 * we-need-more-metadata case:
4507 * 1) we haven't reserved *any*
4508 * 2) we are so fragmented, we've needed to add metadata too
4509 * many times. */
4510 if (!free_extents && !meta_ac) {
4511 mlog(0, "we haven't reserved any metadata!\n");
4512 status = -EAGAIN;
4513 reason = RESTART_META;
4514 goto leave;
4515 } else if ((!free_extents)
4516 && (ocfs2_alloc_context_bits_left(meta_ac)
4517 < ocfs2_extend_meta_needed(et->et_root_el))) {
4518 mlog(0, "filesystem is really fragmented...\n");
4519 status = -EAGAIN;
4520 reason = RESTART_META;
4521 goto leave;
4522 }
4523
4524 status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
4525 clusters_to_add, &bit_off, &num_bits);
4526 if (status < 0) {
4527 if (status != -ENOSPC)
4528 mlog_errno(status);
4529 goto leave;
4530 }
4176 4531
4532 BUG_ON(num_bits > clusters_to_add);
4533
4534 /* reserve our write early -- insert_extent may update the inode */
4535 status = ocfs2_journal_access(handle, inode, et->et_root_bh,
4536 OCFS2_JOURNAL_ACCESS_WRITE);
4537 if (status < 0) {
4538 mlog_errno(status);
4539 goto leave;
4540 }
4541
4542 block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
4543 mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
4544 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4545 status = ocfs2_insert_extent(osb, handle, inode, et,
4546 *logical_offset, block,
4547 num_bits, flags, meta_ac);
4548 if (status < 0) {
4549 mlog_errno(status);
4550 goto leave;
4551 }
4552
4553 status = ocfs2_journal_dirty(handle, et->et_root_bh);
4554 if (status < 0) {
4555 mlog_errno(status);
4556 goto leave;
4557 }
4558
4559 clusters_to_add -= num_bits;
4560 *logical_offset += num_bits;
4561
4562 if (clusters_to_add) {
4563 mlog(0, "need to alloc once more, wanted = %u\n",
4564 clusters_to_add);
4565 status = -EAGAIN;
4566 reason = RESTART_TRANS;
4567 }
4568
4569leave:
4177 mlog_exit(status); 4570 mlog_exit(status);
4571 if (reason_ret)
4572 *reason_ret = reason;
4178 return status; 4573 return status;
4179} 4574}
4180 4575
@@ -4201,7 +4596,7 @@ static void ocfs2_make_right_split_rec(struct super_block *sb,
4201static int ocfs2_split_and_insert(struct inode *inode, 4596static int ocfs2_split_and_insert(struct inode *inode,
4202 handle_t *handle, 4597 handle_t *handle,
4203 struct ocfs2_path *path, 4598 struct ocfs2_path *path,
4204 struct buffer_head *di_bh, 4599 struct ocfs2_extent_tree *et,
4205 struct buffer_head **last_eb_bh, 4600 struct buffer_head **last_eb_bh,
4206 int split_index, 4601 int split_index,
4207 struct ocfs2_extent_rec *orig_split_rec, 4602 struct ocfs2_extent_rec *orig_split_rec,
@@ -4215,7 +4610,6 @@ static int ocfs2_split_and_insert(struct inode *inode,
4215 struct ocfs2_extent_rec split_rec = *orig_split_rec; 4610 struct ocfs2_extent_rec split_rec = *orig_split_rec;
4216 struct ocfs2_insert_type insert; 4611 struct ocfs2_insert_type insert;
4217 struct ocfs2_extent_block *eb; 4612 struct ocfs2_extent_block *eb;
4218 struct ocfs2_dinode *di;
4219 4613
4220leftright: 4614leftright:
4221 /* 4615 /*
@@ -4224,8 +4618,7 @@ leftright:
4224 */ 4618 */
4225 rec = path_leaf_el(path)->l_recs[split_index]; 4619 rec = path_leaf_el(path)->l_recs[split_index];
4226 4620
4227 di = (struct ocfs2_dinode *)di_bh->b_data; 4621 rightmost_el = et->et_root_el;
4228 rightmost_el = &di->id2.i_list;
4229 4622
4230 depth = le16_to_cpu(rightmost_el->l_tree_depth); 4623 depth = le16_to_cpu(rightmost_el->l_tree_depth);
4231 if (depth) { 4624 if (depth) {
@@ -4236,8 +4629,8 @@ leftright:
4236 4629
4237 if (le16_to_cpu(rightmost_el->l_next_free_rec) == 4630 if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4238 le16_to_cpu(rightmost_el->l_count)) { 4631 le16_to_cpu(rightmost_el->l_count)) {
4239 ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh, 4632 ret = ocfs2_grow_tree(inode, handle, et,
4240 meta_ac); 4633 &depth, last_eb_bh, meta_ac);
4241 if (ret) { 4634 if (ret) {
4242 mlog_errno(ret); 4635 mlog_errno(ret);
4243 goto out; 4636 goto out;
@@ -4274,8 +4667,7 @@ leftright:
4274 do_leftright = 1; 4667 do_leftright = 1;
4275 } 4668 }
4276 4669
4277 ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, 4670 ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
4278 &insert);
4279 if (ret) { 4671 if (ret) {
4280 mlog_errno(ret); 4672 mlog_errno(ret);
4281 goto out; 4673 goto out;
@@ -4317,8 +4709,9 @@ out:
4317 * of the tree is required. All other cases will degrade into a less 4709 * of the tree is required. All other cases will degrade into a less
4318 * optimal tree layout. 4710 * optimal tree layout.
4319 * 4711 *
4320 * last_eb_bh should be the rightmost leaf block for any inode with a 4712 * last_eb_bh should be the rightmost leaf block for any extent
4321 * btree. Since a split may grow the tree or a merge might shrink it, the caller cannot trust the contents of that buffer after this call. 4713 * btree. Since a split may grow the tree or a merge might shrink it,
4714 * the caller cannot trust the contents of that buffer after this call.
4322 * 4715 *
4323 * This code is optimized for readability - several passes might be 4716 * This code is optimized for readability - several passes might be
4324 * made over certain portions of the tree. All of those blocks will 4717 * made over certain portions of the tree. All of those blocks will
@@ -4326,7 +4719,7 @@ out:
4326 * extra overhead is not expressed in terms of disk reads. 4719 * extra overhead is not expressed in terms of disk reads.
4327 */ 4720 */
4328static int __ocfs2_mark_extent_written(struct inode *inode, 4721static int __ocfs2_mark_extent_written(struct inode *inode,
4329 struct buffer_head *di_bh, 4722 struct ocfs2_extent_tree *et,
4330 handle_t *handle, 4723 handle_t *handle,
4331 struct ocfs2_path *path, 4724 struct ocfs2_path *path,
4332 int split_index, 4725 int split_index,
@@ -4366,11 +4759,9 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
4366 */ 4759 */
4367 if (path->p_tree_depth) { 4760 if (path->p_tree_depth) {
4368 struct ocfs2_extent_block *eb; 4761 struct ocfs2_extent_block *eb;
4369 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4370 4762
4371 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), 4763 ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
4372 le64_to_cpu(di->i_last_eb_blk), 4764 &last_eb_bh);
4373 &last_eb_bh, OCFS2_BH_CACHED, inode);
4374 if (ret) { 4765 if (ret) {
4375 mlog_exit(ret); 4766 mlog_exit(ret);
4376 goto out; 4767 goto out;
@@ -4403,7 +4794,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
4403 if (ctxt.c_split_covers_rec) 4794 if (ctxt.c_split_covers_rec)
4404 el->l_recs[split_index] = *split_rec; 4795 el->l_recs[split_index] = *split_rec;
4405 else 4796 else
4406 ret = ocfs2_split_and_insert(inode, handle, path, di_bh, 4797 ret = ocfs2_split_and_insert(inode, handle, path, et,
4407 &last_eb_bh, split_index, 4798 &last_eb_bh, split_index,
4408 split_rec, meta_ac); 4799 split_rec, meta_ac);
4409 if (ret) 4800 if (ret)
@@ -4411,7 +4802,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
4411 } else { 4802 } else {
4412 ret = ocfs2_try_to_merge_extent(inode, handle, path, 4803 ret = ocfs2_try_to_merge_extent(inode, handle, path,
4413 split_index, split_rec, 4804 split_index, split_rec,
4414 dealloc, &ctxt); 4805 dealloc, &ctxt, et);
4415 if (ret) 4806 if (ret)
4416 mlog_errno(ret); 4807 mlog_errno(ret);
4417 } 4808 }
@@ -4429,7 +4820,8 @@ out:
4429 * 4820 *
4430 * The caller is responsible for passing down meta_ac if we'll need it. 4821 * The caller is responsible for passing down meta_ac if we'll need it.
4431 */ 4822 */
4432int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh, 4823int ocfs2_mark_extent_written(struct inode *inode,
4824 struct ocfs2_extent_tree *et,
4433 handle_t *handle, u32 cpos, u32 len, u32 phys, 4825 handle_t *handle, u32 cpos, u32 len, u32 phys,
4434 struct ocfs2_alloc_context *meta_ac, 4826 struct ocfs2_alloc_context *meta_ac,
4435 struct ocfs2_cached_dealloc_ctxt *dealloc) 4827 struct ocfs2_cached_dealloc_ctxt *dealloc)
@@ -4455,10 +4847,14 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
4455 /* 4847 /*
4456 * XXX: This should be fixed up so that we just re-insert the 4848 * XXX: This should be fixed up so that we just re-insert the
4457 * next extent records. 4849 * next extent records.
4850 *
4851 * XXX: This is a hack on the extent tree, maybe it should be
4852 * an op?
4458 */ 4853 */
4459 ocfs2_extent_map_trunc(inode, 0); 4854 if (et->et_ops == &ocfs2_dinode_et_ops)
4855 ocfs2_extent_map_trunc(inode, 0);
4460 4856
4461 left_path = ocfs2_new_inode_path(di_bh); 4857 left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
4462 if (!left_path) { 4858 if (!left_path) {
4463 ret = -ENOMEM; 4859 ret = -ENOMEM;
4464 mlog_errno(ret); 4860 mlog_errno(ret);
@@ -4489,8 +4885,9 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
4489 split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags; 4885 split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
4490 split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN; 4886 split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
4491 4887
4492 ret = __ocfs2_mark_extent_written(inode, di_bh, handle, left_path, 4888 ret = __ocfs2_mark_extent_written(inode, et, handle, left_path,
4493 index, &split_rec, meta_ac, dealloc); 4889 index, &split_rec, meta_ac,
4890 dealloc);
4494 if (ret) 4891 if (ret)
4495 mlog_errno(ret); 4892 mlog_errno(ret);
4496 4893
@@ -4499,13 +4896,12 @@ out:
4499 return ret; 4896 return ret;
4500} 4897}
4501 4898
4502static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh, 4899static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
4503 handle_t *handle, struct ocfs2_path *path, 4900 handle_t *handle, struct ocfs2_path *path,
4504 int index, u32 new_range, 4901 int index, u32 new_range,
4505 struct ocfs2_alloc_context *meta_ac) 4902 struct ocfs2_alloc_context *meta_ac)
4506{ 4903{
4507 int ret, depth, credits = handle->h_buffer_credits; 4904 int ret, depth, credits = handle->h_buffer_credits;
4508 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4509 struct buffer_head *last_eb_bh = NULL; 4905 struct buffer_head *last_eb_bh = NULL;
4510 struct ocfs2_extent_block *eb; 4906 struct ocfs2_extent_block *eb;
4511 struct ocfs2_extent_list *rightmost_el, *el; 4907 struct ocfs2_extent_list *rightmost_el, *el;
@@ -4522,9 +4918,8 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
4522 4918
4523 depth = path->p_tree_depth; 4919 depth = path->p_tree_depth;
4524 if (depth > 0) { 4920 if (depth > 0) {
4525 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), 4921 ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
4526 le64_to_cpu(di->i_last_eb_blk), 4922 &last_eb_bh);
4527 &last_eb_bh, OCFS2_BH_CACHED, inode);
4528 if (ret < 0) { 4923 if (ret < 0) {
4529 mlog_errno(ret); 4924 mlog_errno(ret);
4530 goto out; 4925 goto out;
@@ -4535,7 +4930,8 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
4535 } else 4930 } else
4536 rightmost_el = path_leaf_el(path); 4931 rightmost_el = path_leaf_el(path);
4537 4932
4538 credits += path->p_tree_depth + ocfs2_extend_meta_needed(di); 4933 credits += path->p_tree_depth +
4934 ocfs2_extend_meta_needed(et->et_root_el);
4539 ret = ocfs2_extend_trans(handle, credits); 4935 ret = ocfs2_extend_trans(handle, credits);
4540 if (ret) { 4936 if (ret) {
4541 mlog_errno(ret); 4937 mlog_errno(ret);
@@ -4544,7 +4940,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
4544 4940
4545 if (le16_to_cpu(rightmost_el->l_next_free_rec) == 4941 if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4546 le16_to_cpu(rightmost_el->l_count)) { 4942 le16_to_cpu(rightmost_el->l_count)) {
4547 ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh, 4943 ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh,
4548 meta_ac); 4944 meta_ac);
4549 if (ret) { 4945 if (ret) {
4550 mlog_errno(ret); 4946 mlog_errno(ret);
@@ -4558,7 +4954,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
4558 insert.ins_split = SPLIT_RIGHT; 4954 insert.ins_split = SPLIT_RIGHT;
4559 insert.ins_tree_depth = depth; 4955 insert.ins_tree_depth = depth;
4560 4956
4561 ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert); 4957 ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
4562 if (ret) 4958 if (ret)
4563 mlog_errno(ret); 4959 mlog_errno(ret);
4564 4960
@@ -4570,7 +4966,8 @@ out:
4570static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, 4966static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
4571 struct ocfs2_path *path, int index, 4967 struct ocfs2_path *path, int index,
4572 struct ocfs2_cached_dealloc_ctxt *dealloc, 4968 struct ocfs2_cached_dealloc_ctxt *dealloc,
4573 u32 cpos, u32 len) 4969 u32 cpos, u32 len,
4970 struct ocfs2_extent_tree *et)
4574{ 4971{
4575 int ret; 4972 int ret;
4576 u32 left_cpos, rec_range, trunc_range; 4973 u32 left_cpos, rec_range, trunc_range;
@@ -4582,7 +4979,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
4582 struct ocfs2_extent_block *eb; 4979 struct ocfs2_extent_block *eb;
4583 4980
4584 if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { 4981 if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
4585 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); 4982 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
4586 if (ret) { 4983 if (ret) {
4587 mlog_errno(ret); 4984 mlog_errno(ret);
4588 goto out; 4985 goto out;
@@ -4713,7 +5110,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
4713 5110
4714 ocfs2_journal_dirty(handle, path_leaf_bh(path)); 5111 ocfs2_journal_dirty(handle, path_leaf_bh(path));
4715 5112
4716 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); 5113 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
4717 if (ret) { 5114 if (ret) {
4718 mlog_errno(ret); 5115 mlog_errno(ret);
4719 goto out; 5116 goto out;
@@ -4724,7 +5121,8 @@ out:
4724 return ret; 5121 return ret;
4725} 5122}
4726 5123
4727int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, 5124int ocfs2_remove_extent(struct inode *inode,
5125 struct ocfs2_extent_tree *et,
4728 u32 cpos, u32 len, handle_t *handle, 5126 u32 cpos, u32 len, handle_t *handle,
4729 struct ocfs2_alloc_context *meta_ac, 5127 struct ocfs2_alloc_context *meta_ac,
4730 struct ocfs2_cached_dealloc_ctxt *dealloc) 5128 struct ocfs2_cached_dealloc_ctxt *dealloc)
@@ -4733,11 +5131,11 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
4733 u32 rec_range, trunc_range; 5131 u32 rec_range, trunc_range;
4734 struct ocfs2_extent_rec *rec; 5132 struct ocfs2_extent_rec *rec;
4735 struct ocfs2_extent_list *el; 5133 struct ocfs2_extent_list *el;
4736 struct ocfs2_path *path; 5134 struct ocfs2_path *path = NULL;
4737 5135
4738 ocfs2_extent_map_trunc(inode, 0); 5136 ocfs2_extent_map_trunc(inode, 0);
4739 5137
4740 path = ocfs2_new_inode_path(di_bh); 5138 path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
4741 if (!path) { 5139 if (!path) {
4742 ret = -ENOMEM; 5140 ret = -ENOMEM;
4743 mlog_errno(ret); 5141 mlog_errno(ret);
@@ -4790,13 +5188,13 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
4790 5188
4791 if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) { 5189 if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
4792 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, 5190 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
4793 cpos, len); 5191 cpos, len, et);
4794 if (ret) { 5192 if (ret) {
4795 mlog_errno(ret); 5193 mlog_errno(ret);
4796 goto out; 5194 goto out;
4797 } 5195 }
4798 } else { 5196 } else {
4799 ret = ocfs2_split_tree(inode, di_bh, handle, path, index, 5197 ret = ocfs2_split_tree(inode, et, handle, path, index,
4800 trunc_range, meta_ac); 5198 trunc_range, meta_ac);
4801 if (ret) { 5199 if (ret) {
4802 mlog_errno(ret); 5200 mlog_errno(ret);
@@ -4845,7 +5243,7 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
4845 } 5243 }
4846 5244
4847 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, 5245 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
4848 cpos, len); 5246 cpos, len, et);
4849 if (ret) { 5247 if (ret) {
4850 mlog_errno(ret); 5248 mlog_errno(ret);
4851 goto out; 5249 goto out;
@@ -5188,8 +5586,7 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
5188 goto bail; 5586 goto bail;
5189 } 5587 }
5190 5588
5191 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, 5589 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
5192 OCFS2_BH_CACHED, inode);
5193 if (status < 0) { 5590 if (status < 0) {
5194 iput(inode); 5591 iput(inode);
5195 mlog_errno(status); 5592 mlog_errno(status);
@@ -5264,8 +5661,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
5264bail: 5661bail:
5265 if (tl_inode) 5662 if (tl_inode)
5266 iput(tl_inode); 5663 iput(tl_inode);
5267 if (tl_bh) 5664 brelse(tl_bh);
5268 brelse(tl_bh);
5269 5665
5270 if (status < 0 && (*tl_copy)) { 5666 if (status < 0 && (*tl_copy)) {
5271 kfree(*tl_copy); 5667 kfree(*tl_copy);
@@ -6008,20 +6404,13 @@ bail:
6008 return status; 6404 return status;
6009} 6405}
6010 6406
6011static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh) 6407static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
6012{ 6408{
6013 set_buffer_uptodate(bh); 6409 set_buffer_uptodate(bh);
6014 mark_buffer_dirty(bh); 6410 mark_buffer_dirty(bh);
6015 return 0; 6411 return 0;
6016} 6412}
6017 6413
6018static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
6019{
6020 set_buffer_uptodate(bh);
6021 mark_buffer_dirty(bh);
6022 return ocfs2_journal_dirty_data(handle, bh);
6023}
6024
6025static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, 6414static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6026 unsigned int from, unsigned int to, 6415 unsigned int from, unsigned int to,
6027 struct page *page, int zero, u64 *phys) 6416 struct page *page, int zero, u64 *phys)
@@ -6040,17 +6429,18 @@ static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6040 * here if they aren't - ocfs2_map_page_blocks() 6429 * here if they aren't - ocfs2_map_page_blocks()
6041 * might've skipped some 6430 * might've skipped some
6042 */ 6431 */
6043 if (ocfs2_should_order_data(inode)) { 6432 ret = walk_page_buffers(handle, page_buffers(page),
6044 ret = walk_page_buffers(handle, 6433 from, to, &partial,
6045 page_buffers(page), 6434 ocfs2_zero_func);
6046 from, to, &partial, 6435 if (ret < 0)
6047 ocfs2_ordered_zero_func); 6436 mlog_errno(ret);
6048 if (ret < 0) 6437 else if (ocfs2_should_order_data(inode)) {
6049 mlog_errno(ret); 6438 ret = ocfs2_jbd2_file_inode(handle, inode);
6050 } else { 6439#ifdef CONFIG_OCFS2_COMPAT_JBD
6051 ret = walk_page_buffers(handle, page_buffers(page), 6440 ret = walk_page_buffers(handle, page_buffers(page),
6052 from, to, &partial, 6441 from, to, &partial,
6053 ocfs2_writeback_zero_func); 6442 ocfs2_journal_dirty_data);
6443#endif
6054 if (ret < 0) 6444 if (ret < 0)
6055 mlog_errno(ret); 6445 mlog_errno(ret);
6056 } 6446 }
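
With jbd2, ordered data is handled by filing the inode with the journal once per transaction instead of walking and dirtying each buffer; the per-buffer walk survives only under CONFIG_OCFS2_COMPAT_JBD. The helper is not defined in this hunk; assuming it wraps jbd2's inode-filing API, it presumably has roughly this shape:

    /* Hypothetical shape of the helper used above -- its real definition
     * is outside this hunk.  jbd2 tracks ordered-mode inodes via a
     * struct jbd2_inode embedded in the filesystem's inode. */
    static inline int ocfs2_jbd2_file_inode(handle_t *handle,
                                            struct inode *inode)
    {
            return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode);
    }
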
@@ -6215,20 +6605,29 @@ out:
6215 return ret; 6605 return ret;
6216} 6606}
6217 6607
6218static void ocfs2_zero_dinode_id2(struct inode *inode, struct ocfs2_dinode *di) 6608static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
6609 struct ocfs2_dinode *di)
6219{ 6610{
6220 unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits; 6611 unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
6612 unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
6221 6613
6222 memset(&di->id2, 0, blocksize - offsetof(struct ocfs2_dinode, id2)); 6614 if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
6615 memset(&di->id2, 0, blocksize -
6616 offsetof(struct ocfs2_dinode, id2) -
6617 xattrsize);
6618 else
6619 memset(&di->id2, 0, blocksize -
6620 offsetof(struct ocfs2_dinode, id2));
6223} 6621}
6224 6622
6225void ocfs2_dinode_new_extent_list(struct inode *inode, 6623void ocfs2_dinode_new_extent_list(struct inode *inode,
6226 struct ocfs2_dinode *di) 6624 struct ocfs2_dinode *di)
6227{ 6625{
6228 ocfs2_zero_dinode_id2(inode, di); 6626 ocfs2_zero_dinode_id2_with_xattr(inode, di);
6229 di->id2.i_list.l_tree_depth = 0; 6627 di->id2.i_list.l_tree_depth = 0;
6230 di->id2.i_list.l_next_free_rec = 0; 6628 di->id2.i_list.l_next_free_rec = 0;
6231 di->id2.i_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(inode->i_sb)); 6629 di->id2.i_list.l_count = cpu_to_le16(
6630 ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
6232} 6631}
6233 6632
6234void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di) 6633void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
@@ -6245,9 +6644,10 @@ void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
6245 * We clear the entire i_data structure here so that all 6644 * We clear the entire i_data structure here so that all
6246 * fields can be properly initialized. 6645 * fields can be properly initialized.
6247 */ 6646 */
6248 ocfs2_zero_dinode_id2(inode, di); 6647 ocfs2_zero_dinode_id2_with_xattr(inode, di);
6249 6648
6250 idata->id_count = cpu_to_le16(ocfs2_max_inline_data(inode->i_sb)); 6649 idata->id_count = cpu_to_le16(
6650 ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
6251} 6651}
6252 6652
6253int ocfs2_convert_inline_data_to_extents(struct inode *inode, 6653int ocfs2_convert_inline_data_to_extents(struct inode *inode,
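
The xattr-aware zeroing simply shortens the memset() by the inline xattr area at the end of the block. A quick worked example with hypothetical numbers (the offsetof value and sizes below are illustrative only):

    #include <stdio.h>

    int main(void)
    {
            unsigned int blocksize = 4096; /* hypothetical block size */
            unsigned int id2_off   = 176;  /* hypothetical offsetof(struct ocfs2_dinode, id2) */
            unsigned int xattrsize = 256;  /* hypothetical i_xattr_inline_size */

            /* Without inline xattrs: zero through the end of the block. */
            printf("plain:        memset %u bytes\n", blocksize - id2_off);
            /* With OCFS2_INLINE_XATTR_FL set: stop short of the xattr area. */
            printf("inline xattr: memset %u bytes\n",
                   blocksize - id2_off - xattrsize);
            return 0;
    }
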
@@ -6262,6 +6662,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6262 struct ocfs2_alloc_context *data_ac = NULL; 6662 struct ocfs2_alloc_context *data_ac = NULL;
6263 struct page **pages = NULL; 6663 struct page **pages = NULL;
6264 loff_t end = osb->s_clustersize; 6664 loff_t end = osb->s_clustersize;
6665 struct ocfs2_extent_tree et;
6265 6666
6266 has_data = i_size_read(inode) ? 1 : 0; 6667 has_data = i_size_read(inode) ? 1 : 0;
6267 6668
@@ -6361,7 +6762,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6361 * this proves to be false, we could always re-build 6762 * this proves to be false, we could always re-build
6362 * the in-inode data from our pages. 6763 * the in-inode data from our pages.
6363 */ 6764 */
6364 ret = ocfs2_insert_extent(osb, handle, inode, di_bh, 6765 ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
6766 ret = ocfs2_insert_extent(osb, handle, inode, &et,
6365 0, block, 1, 0, NULL); 6767 0, block, 1, 0, NULL);
6366 if (ret) { 6768 if (ret) {
6367 mlog_errno(ret); 6769 mlog_errno(ret);
@@ -6404,13 +6806,14 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
6404 handle_t *handle = NULL; 6806 handle_t *handle = NULL;
6405 struct inode *tl_inode = osb->osb_tl_inode; 6807 struct inode *tl_inode = osb->osb_tl_inode;
6406 struct ocfs2_path *path = NULL; 6808 struct ocfs2_path *path = NULL;
6809 struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
6407 6810
6408 mlog_entry_void(); 6811 mlog_entry_void();
6409 6812
6410 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, 6813 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
6411 i_size_read(inode)); 6814 i_size_read(inode));
6412 6815
6413 path = ocfs2_new_inode_path(fe_bh); 6816 path = ocfs2_new_path(fe_bh, &di->id2.i_list);
6414 if (!path) { 6817 if (!path) {
6415 status = -ENOMEM; 6818 status = -ENOMEM;
6416 mlog_errno(status); 6819 mlog_errno(status);
@@ -6581,8 +6984,8 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
6581 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc); 6984 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
6582 6985
6583 if (fe->id2.i_list.l_tree_depth) { 6986 if (fe->id2.i_list.l_tree_depth) {
6584 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), 6987 status = ocfs2_read_block(inode, le64_to_cpu(fe->i_last_eb_blk),
6585 &last_eb_bh, OCFS2_BH_CACHED, inode); 6988 &last_eb_bh);
6586 if (status < 0) { 6989 if (status < 0) {
6587 mlog_errno(status); 6990 mlog_errno(status);
6588 goto bail; 6991 goto bail;
@@ -6695,8 +7098,7 @@ static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
6695 mlog(ML_NOTICE, 7098 mlog(ML_NOTICE,
6696 "Truncate completion has non-empty dealloc context\n"); 7099 "Truncate completion has non-empty dealloc context\n");
6697 7100
6698 if (tc->tc_last_eb_bh) 7101 brelse(tc->tc_last_eb_bh);
6699 brelse(tc->tc_last_eb_bh);
6700 7102
6701 kfree(tc); 7103 kfree(tc);
6702} 7104}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 42ff94bd8011..70257c84cfbe 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -26,30 +26,102 @@
26#ifndef OCFS2_ALLOC_H 26#ifndef OCFS2_ALLOC_H
27#define OCFS2_ALLOC_H 27#define OCFS2_ALLOC_H
28 28
29
30/*
31 * For an xattr tree leaf, we limit the leaf byte size to 64K.
32 */
33#define OCFS2_MAX_XATTR_TREE_LEAF_SIZE 65536
34
35/*
36 * ocfs2_extent_tree and ocfs2_extent_tree_operations are used to abstract
37 * the b-tree operations in ocfs2. The b-tree operations are no longer
38 * limited to ocfs2_dinode: any data that needs to allocate clusters for
39 * storage can use a b-tree, provided it implements its own
40 * ocfs2_extent_tree and operations.
41 *
42 * ocfs2_extent_tree becomes the first-class object for extent tree
43 * manipulation. Callers of the alloc.c code need to fill it via one of
44 * the ocfs2_init_*_extent_tree() operations below.
45 *
46 * ocfs2_extent_tree contains info for the root of the b-tree; it must have
47 * a root ocfs2_extent_list and a root_bh so that they can be used in the
48 * b-tree functions.
49 * ocfs2_extent_tree_operations abstracts the normal operations we do on
50 * the root of the extent b-tree.
51 */
52struct ocfs2_extent_tree_operations;
53struct ocfs2_extent_tree {
54 struct ocfs2_extent_tree_operations *et_ops;
55 struct buffer_head *et_root_bh;
56 struct ocfs2_extent_list *et_root_el;
57 void *et_object;
58 unsigned int et_max_leaf_clusters;
59};
60
61/*
62 * ocfs2_init_*_extent_tree() will fill an ocfs2_extent_tree from the
63 * specified object buffer.
64 */
65void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
66 struct inode *inode,
67 struct buffer_head *bh);
68void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
69 struct inode *inode,
70 struct buffer_head *bh);
71void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
72 struct inode *inode,
73 struct buffer_head *bh,
74 struct ocfs2_xattr_value_root *xv);
75
29struct ocfs2_alloc_context; 76struct ocfs2_alloc_context;
30int ocfs2_insert_extent(struct ocfs2_super *osb, 77int ocfs2_insert_extent(struct ocfs2_super *osb,
31 handle_t *handle, 78 handle_t *handle,
32 struct inode *inode, 79 struct inode *inode,
33 struct buffer_head *fe_bh, 80 struct ocfs2_extent_tree *et,
34 u32 cpos, 81 u32 cpos,
35 u64 start_blk, 82 u64 start_blk,
36 u32 new_clusters, 83 u32 new_clusters,
37 u8 flags, 84 u8 flags,
38 struct ocfs2_alloc_context *meta_ac); 85 struct ocfs2_alloc_context *meta_ac);
86
87enum ocfs2_alloc_restarted {
88 RESTART_NONE = 0,
89 RESTART_TRANS,
90 RESTART_META
91};
92int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
93 struct inode *inode,
94 u32 *logical_offset,
95 u32 clusters_to_add,
96 int mark_unwritten,
97 struct ocfs2_extent_tree *et,
98 handle_t *handle,
99 struct ocfs2_alloc_context *data_ac,
100 struct ocfs2_alloc_context *meta_ac,
101 enum ocfs2_alloc_restarted *reason_ret);
39struct ocfs2_cached_dealloc_ctxt; 102struct ocfs2_cached_dealloc_ctxt;
40int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh, 103int ocfs2_mark_extent_written(struct inode *inode,
104 struct ocfs2_extent_tree *et,
41 handle_t *handle, u32 cpos, u32 len, u32 phys, 105 handle_t *handle, u32 cpos, u32 len, u32 phys,
42 struct ocfs2_alloc_context *meta_ac, 106 struct ocfs2_alloc_context *meta_ac,
43 struct ocfs2_cached_dealloc_ctxt *dealloc); 107 struct ocfs2_cached_dealloc_ctxt *dealloc);
44int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, 108int ocfs2_remove_extent(struct inode *inode,
109 struct ocfs2_extent_tree *et,
45 u32 cpos, u32 len, handle_t *handle, 110 u32 cpos, u32 len, handle_t *handle,
46 struct ocfs2_alloc_context *meta_ac, 111 struct ocfs2_alloc_context *meta_ac,
47 struct ocfs2_cached_dealloc_ctxt *dealloc); 112 struct ocfs2_cached_dealloc_ctxt *dealloc);
48int ocfs2_num_free_extents(struct ocfs2_super *osb, 113int ocfs2_num_free_extents(struct ocfs2_super *osb,
49 struct inode *inode, 114 struct inode *inode,
50 struct ocfs2_dinode *fe); 115 struct ocfs2_extent_tree *et);
51/* how many new metadata chunks would an allocation need at maximum? */ 116
52static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe) 117/*
118 * how many new metadata chunks would an allocation need at maximum?
119 *
120 * Please note that the caller must make sure that root_el is the root
121 * of the extent tree. So for an inode, it should be &fe->id2.i_list;
122 * otherwise the result may be wrong.
123 */
124static inline int ocfs2_extend_meta_needed(struct ocfs2_extent_list *root_el)
53{ 125{
54 /* 126 /*
55 * Rather than do all the work of determining how much we need 127 * Rather than do all the work of determining how much we need
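
Taken together, the new calling convention is: fill a stack-allocated ocfs2_extent_tree from the object's buffer, then hand it to the alloc.c entry points. A minimal sketch mirroring the converted call sites elsewhere in this patch (example_insert itself is illustrative):

    static int example_insert(struct ocfs2_super *osb, handle_t *handle,
                              struct inode *inode, struct buffer_head *di_bh,
                              u32 cpos, u64 start_blk, u32 clusters,
                              struct ocfs2_alloc_context *meta_ac)
    {
            struct ocfs2_extent_tree et;

            /* Fill the tree from the dinode's root extent list... */
            ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
            /* ...then every alloc.c operation takes the et, not di_bh. */
            return ocfs2_insert_extent(osb, handle, inode, &et, cpos,
                                       start_blk, clusters, 0, meta_ac);
    }
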
@@ -59,7 +131,7 @@ static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
59 * new tree_depth==0 extent_block, and one block at the new 131 * new tree_depth==0 extent_block, and one block at the new
60 * top-of-the tree. 132 * top-of-the tree.
61 */ 133 */
62 return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2; 134 return le16_to_cpu(root_el->l_tree_depth) + 2;
63} 135}
64 136
65void ocfs2_dinode_new_extent_list(struct inode *inode, struct ocfs2_dinode *di); 137void ocfs2_dinode_new_extent_list(struct inode *inode, struct ocfs2_dinode *di);
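
For example, a root_el with l_tree_depth == 2 reserves at most 2 + 2 = 4 blocks: one new extent block per existing tree level, plus the new tree_depth == 0 leaf and the one block at the new top of the tree.
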
@@ -146,4 +218,13 @@ static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el,
146 return le16_to_cpu(rec->e_leaf_clusters); 218 return le16_to_cpu(rec->e_leaf_clusters);
147} 219}
148 220
221/*
222 * This is only valid for leaf nodes, which are the only ones that can
223 * have empty extents anyway.
224 */
225static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
226{
227 return !rec->e_leaf_clusters;
228}
229
149#endif /* OCFS2_ALLOC_H */ 230#endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 506c24fb5078..c22543b33420 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -68,9 +68,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
68 goto bail; 68 goto bail;
69 } 69 }
70 70
71 status = ocfs2_read_block(OCFS2_SB(inode->i_sb), 71 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
72 OCFS2_I(inode)->ip_blkno,
73 &bh, OCFS2_BH_CACHED, inode);
74 if (status < 0) { 72 if (status < 0) {
75 mlog_errno(status); 73 mlog_errno(status);
76 goto bail; 74 goto bail;
@@ -128,8 +126,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
128 err = 0; 126 err = 0;
129 127
130bail: 128bail:
131 if (bh) 129 brelse(bh);
132 brelse(bh);
133 130
134 mlog_exit(err); 131 mlog_exit(err);
135 return err; 132 return err;
@@ -261,13 +258,11 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
261{ 258{
262 int ret; 259 int ret;
263 struct buffer_head *di_bh = NULL; 260 struct buffer_head *di_bh = NULL;
264 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
265 261
266 BUG_ON(!PageLocked(page)); 262 BUG_ON(!PageLocked(page));
267 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); 263 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
268 264
269 ret = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &di_bh, 265 ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
270 OCFS2_BH_CACHED, inode);
271 if (ret) { 266 if (ret) {
272 mlog_errno(ret); 267 mlog_errno(ret);
273 goto out; 268 goto out;
@@ -485,11 +480,14 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
485 } 480 }
486 481
487 if (ocfs2_should_order_data(inode)) { 482 if (ocfs2_should_order_data(inode)) {
483 ret = ocfs2_jbd2_file_inode(handle, inode);
484#ifdef CONFIG_OCFS2_COMPAT_JBD
488 ret = walk_page_buffers(handle, 485 ret = walk_page_buffers(handle,
489 page_buffers(page), 486 page_buffers(page),
490 from, to, NULL, 487 from, to, NULL,
491 ocfs2_journal_dirty_data); 488 ocfs2_journal_dirty_data);
492 if (ret < 0) 489#endif
490 if (ret < 0)
493 mlog_errno(ret); 491 mlog_errno(ret);
494 } 492 }
495out: 493out:
@@ -594,7 +592,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
594 goto bail; 592 goto bail;
595 } 593 }
596 594
597 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) { 595 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) {
598 ocfs2_error(inode->i_sb, 596 ocfs2_error(inode->i_sb,
599 "Inode %llu has a hole at block %llu\n", 597 "Inode %llu has a hole at block %llu\n",
600 (unsigned long long)OCFS2_I(inode)->ip_blkno, 598 (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -669,7 +667,7 @@ static void ocfs2_invalidatepage(struct page *page, unsigned long offset)
669{ 667{
670 journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; 668 journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
671 669
672 journal_invalidatepage(journal, page, offset); 670 jbd2_journal_invalidatepage(journal, page, offset);
673} 671}
674 672
675static int ocfs2_releasepage(struct page *page, gfp_t wait) 673static int ocfs2_releasepage(struct page *page, gfp_t wait)
@@ -678,7 +676,7 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait)
678 676
679 if (!page_has_buffers(page)) 677 if (!page_has_buffers(page))
680 return 0; 678 return 0;
681 return journal_try_to_free_buffers(journal, page, wait); 679 return jbd2_journal_try_to_free_buffers(journal, page, wait);
682} 680}
683 681
684static ssize_t ocfs2_direct_IO(int rw, 682static ssize_t ocfs2_direct_IO(int rw,
@@ -1074,11 +1072,15 @@ static void ocfs2_write_failure(struct inode *inode,
1074 tmppage = wc->w_pages[i]; 1072 tmppage = wc->w_pages[i];
1075 1073
1076 if (page_has_buffers(tmppage)) { 1074 if (page_has_buffers(tmppage)) {
1077 if (ocfs2_should_order_data(inode)) 1075 if (ocfs2_should_order_data(inode)) {
1076 ocfs2_jbd2_file_inode(wc->w_handle, inode);
1077#ifdef CONFIG_OCFS2_COMPAT_JBD
1078 walk_page_buffers(wc->w_handle, 1078 walk_page_buffers(wc->w_handle,
1079 page_buffers(tmppage), 1079 page_buffers(tmppage),
1080 from, to, NULL, 1080 from, to, NULL,
1081 ocfs2_journal_dirty_data); 1081 ocfs2_journal_dirty_data);
1082#endif
1083 }
1082 1084
1083 block_commit_write(tmppage, from, to); 1085 block_commit_write(tmppage, from, to);
1084 } 1086 }
@@ -1242,6 +1244,7 @@ static int ocfs2_write_cluster(struct address_space *mapping,
1242 int ret, i, new, should_zero = 0; 1244 int ret, i, new, should_zero = 0;
1243 u64 v_blkno, p_blkno; 1245 u64 v_blkno, p_blkno;
1244 struct inode *inode = mapping->host; 1246 struct inode *inode = mapping->host;
1247 struct ocfs2_extent_tree et;
1245 1248
1246 new = phys == 0 ? 1 : 0; 1249 new = phys == 0 ? 1 : 0;
1247 if (new || unwritten) 1250 if (new || unwritten)
@@ -1255,10 +1258,10 @@ static int ocfs2_write_cluster(struct address_space *mapping,
1255 * any additional semaphores or cluster locks. 1258 * any additional semaphores or cluster locks.
1256 */ 1259 */
1257 tmp_pos = cpos; 1260 tmp_pos = cpos;
1258 ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, 1261 ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
1259 &tmp_pos, 1, 0, wc->w_di_bh, 1262 &tmp_pos, 1, 0, wc->w_di_bh,
1260 wc->w_handle, data_ac, 1263 wc->w_handle, data_ac,
1261 meta_ac, NULL); 1264 meta_ac, NULL);
1262 /* 1265 /*
1263 * This shouldn't happen because we must have already 1266 * This shouldn't happen because we must have already
1264 * calculated the correct meta data allocation required. The 1267 * calculated the correct meta data allocation required. The
@@ -1276,7 +1279,8 @@ static int ocfs2_write_cluster(struct address_space *mapping,
1276 goto out; 1279 goto out;
1277 } 1280 }
1278 } else if (unwritten) { 1281 } else if (unwritten) {
1279 ret = ocfs2_mark_extent_written(inode, wc->w_di_bh, 1282 ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh);
1283 ret = ocfs2_mark_extent_written(inode, &et,
1280 wc->w_handle, cpos, 1, phys, 1284 wc->w_handle, cpos, 1, phys,
1281 meta_ac, &wc->w_dealloc); 1285 meta_ac, &wc->w_dealloc);
1282 if (ret < 0) { 1286 if (ret < 0) {
@@ -1665,6 +1669,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1665 struct ocfs2_alloc_context *data_ac = NULL; 1669 struct ocfs2_alloc_context *data_ac = NULL;
1666 struct ocfs2_alloc_context *meta_ac = NULL; 1670 struct ocfs2_alloc_context *meta_ac = NULL;
1667 handle_t *handle; 1671 handle_t *handle;
1672 struct ocfs2_extent_tree et;
1668 1673
1669 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); 1674 ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
1670 if (ret) { 1675 if (ret) {
@@ -1712,14 +1717,23 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1712 * ocfs2_lock_allocators(). It greatly over-estimates 1717 * ocfs2_lock_allocators(). It greatly over-estimates
1713 * the work to be done. 1718 * the work to be done.
1714 */ 1719 */
1715 ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc, 1720 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u,"
1716 extents_to_split, &data_ac, &meta_ac); 1721 " clusters_to_add = %u, extents_to_split = %u\n",
1722 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1723 (long long)i_size_read(inode), le32_to_cpu(di->i_clusters),
1724 clusters_to_alloc, extents_to_split);
1725
1726 ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh);
1727 ret = ocfs2_lock_allocators(inode, &et,
1728 clusters_to_alloc, extents_to_split,
1729 &data_ac, &meta_ac);
1717 if (ret) { 1730 if (ret) {
1718 mlog_errno(ret); 1731 mlog_errno(ret);
1719 goto out; 1732 goto out;
1720 } 1733 }
1721 1734
1722 credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1735 credits = ocfs2_calc_extend_credits(inode->i_sb,
1736 &di->id2.i_list,
1723 clusters_to_alloc); 1737 clusters_to_alloc);
1724 1738
1725 } 1739 }
@@ -1905,11 +1919,15 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
1905 } 1919 }
1906 1920
1907 if (page_has_buffers(tmppage)) { 1921 if (page_has_buffers(tmppage)) {
1908 if (ocfs2_should_order_data(inode)) 1922 if (ocfs2_should_order_data(inode)) {
1923 ocfs2_jbd2_file_inode(wc->w_handle, inode);
1924#ifdef CONFIG_OCFS2_COMPAT_JBD
1909 walk_page_buffers(wc->w_handle, 1925 walk_page_buffers(wc->w_handle,
1910 page_buffers(tmppage), 1926 page_buffers(tmppage),
1911 from, to, NULL, 1927 from, to, NULL,
1912 ocfs2_journal_dirty_data); 1928 ocfs2_journal_dirty_data);
1929#endif
1930 }
1913 block_commit_write(tmppage, from, to); 1931 block_commit_write(tmppage, from, to);
1914 } 1932 }
1915 } 1933 }
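
The aops.c hunks above move the ordered-data write path from JBD's per-buffer journal_dirty_data() to JBD2's per-inode ordering: the inode is attached to the running transaction once, and the old buffer walk survives only under CONFIG_OCFS2_COMPAT_JBD. A minimal sketch of the resulting shape, assuming ocfs2_jbd2_file_inode() is the thin compatibility wrapper this series introduces (the helper name ocfs2_order_page_data is illustrative; error handling elided):

static void ocfs2_order_page_data(handle_t *handle, struct inode *inode,
                                  struct page *page, unsigned from,
                                  unsigned to)
{
        if (!page_has_buffers(page))
                return;

        if (ocfs2_should_order_data(inode)) {
                /* JBD2: file the whole inode on the transaction; its
                 * dirty data is written out before the commit record. */
                ocfs2_jbd2_file_inode(handle, inode);
#ifdef CONFIG_OCFS2_COMPAT_JBD
                /* Legacy JBD: order each buffer individually. */
                walk_page_buffers(handle, page_buffers(page),
                                  from, to, NULL,
                                  ocfs2_journal_dirty_data);
#endif
        }
        block_commit_write(page, from, to);
}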
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index f136639f5b41..7e947c672469 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -66,7 +66,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
66 /* remove from dirty list before I/O. */ 66 /* remove from dirty list before I/O. */
67 clear_buffer_dirty(bh); 67 clear_buffer_dirty(bh);
68 68
69 get_bh(bh); /* for end_buffer_write_sync() */ 69 get_bh(bh); /* for end_buffer_write_sync() */
70 bh->b_end_io = end_buffer_write_sync; 70 bh->b_end_io = end_buffer_write_sync;
71 submit_bh(WRITE, bh); 71 submit_bh(WRITE, bh);
72 72
@@ -88,22 +88,103 @@ out:
88 return ret; 88 return ret;
89} 89}
90 90
91int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, 91int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
92 struct buffer_head *bhs[], int flags, 92 unsigned int nr, struct buffer_head *bhs[])
93 struct inode *inode) 93{
94 int status = 0;
95 unsigned int i;
96 struct buffer_head *bh;
97
98 if (!nr) {
99 mlog(ML_BH_IO, "No buffers will be read!\n");
100 goto bail;
101 }
102
103 for (i = 0 ; i < nr ; i++) {
104 if (bhs[i] == NULL) {
105 bhs[i] = sb_getblk(osb->sb, block++);
106 if (bhs[i] == NULL) {
107 status = -EIO;
108 mlog_errno(status);
109 goto bail;
110 }
111 }
112 bh = bhs[i];
113
114 if (buffer_jbd(bh)) {
115 mlog(ML_ERROR,
116 "trying to sync read a jbd "
117 "managed bh (blocknr = %llu), skipping\n",
118 (unsigned long long)bh->b_blocknr);
119 continue;
120 }
121
122 if (buffer_dirty(bh)) {
123 /* This should probably be a BUG, or
124 * at least return an error. */
125 mlog(ML_ERROR,
126 "trying to sync read a dirty "
127 "buffer! (blocknr = %llu), skipping\n",
128 (unsigned long long)bh->b_blocknr);
129 continue;
130 }
131
132 lock_buffer(bh);
133 if (buffer_jbd(bh)) {
134 mlog(ML_ERROR,
135 "block %llu had the JBD bit set "
136 "while I was in lock_buffer!",
137 (unsigned long long)bh->b_blocknr);
138 BUG();
139 }
140
141 clear_buffer_uptodate(bh);
142 get_bh(bh); /* for end_buffer_read_sync() */
143 bh->b_end_io = end_buffer_read_sync;
144 submit_bh(READ, bh);
145 }
146
147 for (i = nr; i > 0; i--) {
148 bh = bhs[i - 1];
149
150 if (buffer_jbd(bh)) {
151 mlog(ML_ERROR,
152 "the journal got the buffer while it was "
153 "locked for io! (blocknr = %llu)\n",
154 (unsigned long long)bh->b_blocknr);
155 BUG();
156 }
157
158 wait_on_buffer(bh);
159 if (!buffer_uptodate(bh)) {
160 /* Status won't be cleared from here on out,
161 * so we can safely record this and loop back
162 * to cleanup the other buffers. */
163 status = -EIO;
164 put_bh(bh);
165 bhs[i - 1] = NULL;
166 }
167 }
168
169bail:
170 return status;
171}
172
173int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
174 struct buffer_head *bhs[], int flags)
94{ 175{
95 int status = 0; 176 int status = 0;
96 struct super_block *sb;
97 int i, ignore_cache = 0; 177 int i, ignore_cache = 0;
98 struct buffer_head *bh; 178 struct buffer_head *bh;
99 179
100 mlog_entry("(block=(%llu), nr=(%d), flags=%d, inode=%p)\n", 180 mlog_entry("(inode=%p, block=(%llu), nr=(%d), flags=%d)\n",
101 (unsigned long long)block, nr, flags, inode); 181 inode, (unsigned long long)block, nr, flags);
102 182
183 BUG_ON(!inode);
103 BUG_ON((flags & OCFS2_BH_READAHEAD) && 184 BUG_ON((flags & OCFS2_BH_READAHEAD) &&
104 (!inode || !(flags & OCFS2_BH_CACHED))); 185 (flags & OCFS2_BH_IGNORE_CACHE));
105 186
106 if (osb == NULL || osb->sb == NULL || bhs == NULL) { 187 if (bhs == NULL) {
107 status = -EINVAL; 188 status = -EINVAL;
108 mlog_errno(status); 189 mlog_errno(status);
109 goto bail; 190 goto bail;
@@ -122,26 +203,19 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
122 goto bail; 203 goto bail;
123 } 204 }
124 205
125 sb = osb->sb; 206 mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
126
127 if (flags & OCFS2_BH_CACHED && !inode)
128 flags &= ~OCFS2_BH_CACHED;
129
130 if (inode)
131 mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
132 for (i = 0 ; i < nr ; i++) { 207 for (i = 0 ; i < nr ; i++) {
133 if (bhs[i] == NULL) { 208 if (bhs[i] == NULL) {
134 bhs[i] = sb_getblk(sb, block++); 209 bhs[i] = sb_getblk(inode->i_sb, block++);
135 if (bhs[i] == NULL) { 210 if (bhs[i] == NULL) {
136 if (inode) 211 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
137 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
138 status = -EIO; 212 status = -EIO;
139 mlog_errno(status); 213 mlog_errno(status);
140 goto bail; 214 goto bail;
141 } 215 }
142 } 216 }
143 bh = bhs[i]; 217 bh = bhs[i];
144 ignore_cache = 0; 218 ignore_cache = (flags & OCFS2_BH_IGNORE_CACHE);
145 219
146 /* There are three read-ahead cases here which we need to 220 /* There are three read-ahead cases here which we need to
147 * be concerned with. All three assume a buffer has 221 * be concerned with. All three assume a buffer has
@@ -167,26 +241,27 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
167 * before our is-it-in-flight check. 241 * before our is-it-in-flight check.
168 */ 242 */
169 243
170 if (flags & OCFS2_BH_CACHED && 244 if (!ignore_cache && !ocfs2_buffer_uptodate(inode, bh)) {
171 !ocfs2_buffer_uptodate(inode, bh)) {
172 mlog(ML_UPTODATE, 245 mlog(ML_UPTODATE,
173 "bh (%llu), inode %llu not uptodate\n", 246 "bh (%llu), inode %llu not uptodate\n",
174 (unsigned long long)bh->b_blocknr, 247 (unsigned long long)bh->b_blocknr,
175 (unsigned long long)OCFS2_I(inode)->ip_blkno); 248 (unsigned long long)OCFS2_I(inode)->ip_blkno);
249 /* We're using ignore_cache here to say
250 * "go to disk" */
176 ignore_cache = 1; 251 ignore_cache = 1;
177 } 252 }
178 253
179 /* XXX: Can we ever get this and *not* have the cached 254 /* XXX: Can we ever get this and *not* have the cached
180 * flag set? */ 255 * flag set? */
181 if (buffer_jbd(bh)) { 256 if (buffer_jbd(bh)) {
182 if (!(flags & OCFS2_BH_CACHED) || ignore_cache) 257 if (ignore_cache)
183 mlog(ML_BH_IO, "trying to sync read a jbd " 258 mlog(ML_BH_IO, "trying to sync read a jbd "
184 "managed bh (blocknr = %llu)\n", 259 "managed bh (blocknr = %llu)\n",
185 (unsigned long long)bh->b_blocknr); 260 (unsigned long long)bh->b_blocknr);
186 continue; 261 continue;
187 } 262 }
188 263
189 if (!(flags & OCFS2_BH_CACHED) || ignore_cache) { 264 if (ignore_cache) {
190 if (buffer_dirty(bh)) { 265 if (buffer_dirty(bh)) {
191 /* This should probably be a BUG, or 266 /* This should probably be a BUG, or
192 * at least return an error. */ 267 * at least return an error. */
@@ -221,7 +296,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
221 * previously read-ahead buffer may have 296 * previously read-ahead buffer may have
222 * completed I/O while we were waiting for the 297 * completed I/O while we were waiting for the
223 * buffer lock. */ 298 * buffer lock. */
224 if ((flags & OCFS2_BH_CACHED) 299 if (!(flags & OCFS2_BH_IGNORE_CACHE)
225 && !(flags & OCFS2_BH_READAHEAD) 300 && !(flags & OCFS2_BH_READAHEAD)
226 && ocfs2_buffer_uptodate(inode, bh)) { 301 && ocfs2_buffer_uptodate(inode, bh)) {
227 unlock_buffer(bh); 302 unlock_buffer(bh);
@@ -265,15 +340,14 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
265 /* Always set the buffer in the cache, even if it was 340 /* Always set the buffer in the cache, even if it was
266 * a forced read, or read-ahead which hasn't yet 341 * a forced read, or read-ahead which hasn't yet
267 * completed. */ 342 * completed. */
268 if (inode) 343 ocfs2_set_buffer_uptodate(inode, bh);
269 ocfs2_set_buffer_uptodate(inode, bh);
270 } 344 }
271 if (inode) 345 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
272 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
273 346
274 mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", 347 mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n",
275 (unsigned long long)block, nr, 348 (unsigned long long)block, nr,
276 (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes", flags); 349 ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes",
350 flags);
277 351
278bail: 352bail:
279 353
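
The new ocfs2_read_blocks_sync() above batches its I/O: every buffer is locked and submitted before the function sleeps on any of them, and the waits then run in reverse submission order. A reduced sketch of that submit-all-then-wait pattern, as a hypothetical helper (the dirty-buffer and JBD sanity checks, plus error unwinding, live in the real function):

static int read_blocks_batched(struct super_block *sb, u64 block,
                               unsigned int nr, struct buffer_head *bhs[])
{
        unsigned int i;
        int status = 0;

        for (i = 0; i < nr; i++) {
                struct buffer_head *bh = bhs[i] ?: sb_getblk(sb, block + i);

                if (!bh)
                        return -EIO;
                bhs[i] = bh;
                lock_buffer(bh);
                clear_buffer_uptodate(bh);
                get_bh(bh);                     /* for end_buffer_read_sync() */
                bh->b_end_io = end_buffer_read_sync;
                submit_bh(READ, bh);            /* queue it, don't wait yet */
        }

        for (i = nr; i > 0; i--) {              /* wait only after all submits */
                wait_on_buffer(bhs[i - 1]);
                if (!buffer_uptodate(bhs[i - 1]))
                        status = -EIO;
        }
        return status;
}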
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index c2e78614c3e5..75e1dcb1ade7 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -31,31 +31,29 @@
31void ocfs2_end_buffer_io_sync(struct buffer_head *bh, 31void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
32 int uptodate); 32 int uptodate);
33 33
34static inline int ocfs2_read_block(struct ocfs2_super *osb, 34static inline int ocfs2_read_block(struct inode *inode,
35 u64 off, 35 u64 off,
36 struct buffer_head **bh, 36 struct buffer_head **bh);
37 int flags,
38 struct inode *inode);
39 37
40int ocfs2_write_block(struct ocfs2_super *osb, 38int ocfs2_write_block(struct ocfs2_super *osb,
41 struct buffer_head *bh, 39 struct buffer_head *bh,
42 struct inode *inode); 40 struct inode *inode);
43int ocfs2_read_blocks(struct ocfs2_super *osb, 41int ocfs2_read_blocks(struct inode *inode,
44 u64 block, 42 u64 block,
45 int nr, 43 int nr,
46 struct buffer_head *bhs[], 44 struct buffer_head *bhs[],
47 int flags, 45 int flags);
48 struct inode *inode); 46int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
47 unsigned int nr, struct buffer_head *bhs[]);
49 48
50int ocfs2_write_super_or_backup(struct ocfs2_super *osb, 49int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
51 struct buffer_head *bh); 50 struct buffer_head *bh);
52 51
53#define OCFS2_BH_CACHED 1 52#define OCFS2_BH_IGNORE_CACHE 1
54#define OCFS2_BH_READAHEAD 8 53#define OCFS2_BH_READAHEAD 8
55 54
56static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off, 55static inline int ocfs2_read_block(struct inode *inode, u64 off,
57 struct buffer_head **bh, int flags, 56 struct buffer_head **bh)
58 struct inode *inode)
59{ 57{
60 int status = 0; 58 int status = 0;
61 59
@@ -65,8 +63,7 @@ static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
65 goto bail; 63 goto bail;
66 } 64 }
67 65
68 status = ocfs2_read_blocks(osb, off, 1, bh, 66 status = ocfs2_read_blocks(inode, off, 1, bh, 0);
69 flags, inode);
70 67
71bail: 68bail:
72 return status; 69 return status;
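
With this header change the cache flag flips polarity: caching is now the default and OCFS2_BH_IGNORE_CACHE opts out, where OCFS2_BH_CACHED used to opt in. Call sites reduce to three shapes (sketch; error handling elided, and blkno, nr, bhs assumed in scope):

/* 1. Cached single-block read -- the common case: */
ret = ocfs2_read_block(inode, blkno, &bh);

/* 2. Multi-block read that must bypass the uptodate cache: */
ret = ocfs2_read_blocks(inode, blkno, nr, bhs, OCFS2_BH_IGNORE_CACHE);

/* 3. Uncached synchronous read with no inode context (e.g. early in
 *    mount, before inodes exist): */
ret = ocfs2_read_blocks_sync(osb, blkno, nr, bhs);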
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 23c732f27529..d8a0cb92cef6 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -109,6 +109,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
109 define_mask(CONN), 109 define_mask(CONN),
110 define_mask(QUORUM), 110 define_mask(QUORUM),
111 define_mask(EXPORT), 111 define_mask(EXPORT),
112 define_mask(XATTR),
112 define_mask(ERROR), 113 define_mask(ERROR),
113 define_mask(NOTICE), 114 define_mask(NOTICE),
114 define_mask(KTHREAD), 115 define_mask(KTHREAD),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 597e064bb94f..57670c680471 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -112,6 +112,7 @@
112#define ML_CONN 0x0000000004000000ULL /* net connection management */ 112#define ML_CONN 0x0000000004000000ULL /* net connection management */
113#define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */ 113#define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */
114#define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */ 114#define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */
115#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */
115/* bits that are infrequently given and frequently matched in the high word */ 116/* bits that are infrequently given and frequently matched in the high word */
116#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ 117#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
113#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */ 117#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */
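
ML_XATTR claims the next free low-word bit ahead of the extended-attribute code. A mask bit only does something once a source file selects it as its logging prefix; a hedged sketch of how a future xattr.c might opt in (the mlog text is illustrative):

#define MLOG_MASK_PREFIX ML_XATTR
#include <cluster/masklog.h>

/* Every mlog(0, ...) in this file is now gated by ML_XATTR and can be
 * toggled at runtime through the masklog sysfs knobs registered via
 * mlog_attrs[] above. */
mlog(0, "looking up xattr on inode %llu\n",
     (unsigned long long)OCFS2_I(inode)->ip_blkno);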
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index d8bfa0eb41b2..52276c02f710 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -138,20 +138,20 @@ static int nst_seq_show(struct seq_file *seq, void *v)
138 " message id: %d\n" 138 " message id: %d\n"
139 " message type: %u\n" 139 " message type: %u\n"
140 " message key: 0x%08x\n" 140 " message key: 0x%08x\n"
141 " sock acquiry: %lu.%lu\n" 141 " sock acquiry: %lu.%ld\n"
142 " send start: %lu.%lu\n" 142 " send start: %lu.%ld\n"
143 " wait start: %lu.%lu\n", 143 " wait start: %lu.%ld\n",
144 nst, (unsigned long)nst->st_task->pid, 144 nst, (unsigned long)nst->st_task->pid,
145 (unsigned long)nst->st_task->tgid, 145 (unsigned long)nst->st_task->tgid,
146 nst->st_task->comm, nst->st_node, 146 nst->st_task->comm, nst->st_node,
147 nst->st_sc, nst->st_id, nst->st_msg_type, 147 nst->st_sc, nst->st_id, nst->st_msg_type,
148 nst->st_msg_key, 148 nst->st_msg_key,
149 nst->st_sock_time.tv_sec, 149 nst->st_sock_time.tv_sec,
150 (unsigned long)nst->st_sock_time.tv_usec, 150 (long)nst->st_sock_time.tv_usec,
151 nst->st_send_time.tv_sec, 151 nst->st_send_time.tv_sec,
152 (unsigned long)nst->st_send_time.tv_usec, 152 (long)nst->st_send_time.tv_usec,
153 nst->st_status_time.tv_sec, 153 nst->st_status_time.tv_sec,
154 nst->st_status_time.tv_usec); 154 (long)nst->st_status_time.tv_usec);
155 } 155 }
156 156
157 spin_unlock(&o2net_debug_lock); 157 spin_unlock(&o2net_debug_lock);
@@ -276,7 +276,7 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
276 return sc; /* unused, just needs to be null when done */ 276 return sc; /* unused, just needs to be null when done */
277} 277}
278 278
279#define TV_SEC_USEC(TV) TV.tv_sec, (unsigned long)TV.tv_usec 279#define TV_SEC_USEC(TV) TV.tv_sec, (long)TV.tv_usec
280 280
281static int sc_seq_show(struct seq_file *seq, void *v) 281static int sc_seq_show(struct seq_file *seq, void *v)
282{ 282{
@@ -309,12 +309,12 @@ static int sc_seq_show(struct seq_file *seq, void *v)
309 " remote node: %s\n" 309 " remote node: %s\n"
310 " page off: %zu\n" 310 " page off: %zu\n"
311 " handshake ok: %u\n" 311 " handshake ok: %u\n"
312 " timer: %lu.%lu\n" 312 " timer: %lu.%ld\n"
313 " data ready: %lu.%lu\n" 313 " data ready: %lu.%ld\n"
314 " advance start: %lu.%lu\n" 314 " advance start: %lu.%ld\n"
315 " advance stop: %lu.%lu\n" 315 " advance stop: %lu.%ld\n"
316 " func start: %lu.%lu\n" 316 " func start: %lu.%ld\n"
317 " func stop: %lu.%lu\n" 317 " func stop: %lu.%ld\n"
318 " func key: %u\n" 318 " func key: %u\n"
319 " func type: %u\n", 319 " func type: %u\n",
320 sc, 320 sc,
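
The format-string churn above is a signedness fix: tv_usec is a suseconds_t, which is a signed long, so printing it as %lu behind an unsigned cast drew printk format warnings; the fields now print with %ld after a (long) cast. A runnable userspace analogue:

#include <stdio.h>
#include <sys/time.h>

int main(void)
{
        struct timeval tv;

        gettimeofday(&tv, NULL);
        /* tv_sec prints with %lu, but tv_usec is signed and takes %ld,
         * mirroring the TV_SEC_USEC() change above. */
        printf("now: %lu.%06ld\n",
               (unsigned long)tv.tv_sec, (long)tv.tv_usec);
        return 0;
}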
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index a27d61581bd6..2bcf706d9dd3 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -143,8 +143,8 @@ static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
143static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); 143static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc);
144 144
145#ifdef CONFIG_DEBUG_FS 145#ifdef CONFIG_DEBUG_FS
146void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, 146static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
147 u32 msgkey, struct task_struct *task, u8 node) 147 u32 msgkey, struct task_struct *task, u8 node)
148{ 148{
149 INIT_LIST_HEAD(&nst->st_net_debug_item); 149 INIT_LIST_HEAD(&nst->st_net_debug_item);
150 nst->st_task = task; 150 nst->st_task = task;
@@ -153,31 +153,61 @@ void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
153 nst->st_node = node; 153 nst->st_node = node;
154} 154}
155 155
156void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) 156static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
157{ 157{
158 do_gettimeofday(&nst->st_sock_time); 158 do_gettimeofday(&nst->st_sock_time);
159} 159}
160 160
161void o2net_set_nst_send_time(struct o2net_send_tracking *nst) 161static void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
162{ 162{
163 do_gettimeofday(&nst->st_send_time); 163 do_gettimeofday(&nst->st_send_time);
164} 164}
165 165
166void o2net_set_nst_status_time(struct o2net_send_tracking *nst) 166static void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
167{ 167{
168 do_gettimeofday(&nst->st_status_time); 168 do_gettimeofday(&nst->st_status_time);
169} 169}
170 170
171void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, 171static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
172 struct o2net_sock_container *sc) 172 struct o2net_sock_container *sc)
173{ 173{
174 nst->st_sc = sc; 174 nst->st_sc = sc;
175} 175}
176 176
177void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id) 177static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id)
178{ 178{
179 nst->st_id = msg_id; 179 nst->st_id = msg_id;
180} 180}
181
182#else /* CONFIG_DEBUG_FS */
183
184static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
185 u32 msgkey, struct task_struct *task, u8 node)
186{
187}
188
189static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
190{
191}
192
193static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
194{
195}
196
197static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
198{
199}
200
201static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
202 struct o2net_sock_container *sc)
203{
204}
205
206static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
207 u32 msg_id)
208{
209}
210
181#endif /* CONFIG_DEBUG_FS */ 211#endif /* CONFIG_DEBUG_FS */
182 212
183static inline int o2net_reconnect_delay(void) 213static inline int o2net_reconnect_delay(void)
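
Making the nst helpers static means their !CONFIG_DEBUG_FS stubs must live next to the definitions in tcp.c rather than in tcp_internal.h (see the header hunk below), since a static function cannot be stubbed from a shared header. The idiom, reduced to one function with an illustrative name:

/* Sketch of the stub-next-to-definition idiom: keep instrumentation
 * helpers private to one .c file and compile them away when the
 * config option is off. */
#ifdef CONFIG_DEBUG_FS
static void trace_send_start(struct o2net_send_tracking *nst)
{
        do_gettimeofday(&nst->st_send_time);
}
#else
static inline void trace_send_start(struct o2net_send_tracking *nst)
{
        /* no-op; the compiler discards the calls entirely */
}
#endif /* CONFIG_DEBUG_FS */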
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 18307ff81b77..8d58cfe410b1 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -224,42 +224,10 @@ struct o2net_send_tracking {
224 struct timeval st_send_time; 224 struct timeval st_send_time;
225 struct timeval st_status_time; 225 struct timeval st_status_time;
226}; 226};
227
228void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
229 u32 msgkey, struct task_struct *task, u8 node);
230void o2net_set_nst_sock_time(struct o2net_send_tracking *nst);
231void o2net_set_nst_send_time(struct o2net_send_tracking *nst);
232void o2net_set_nst_status_time(struct o2net_send_tracking *nst);
233void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
234 struct o2net_sock_container *sc);
235void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id);
236
237#else 227#else
238struct o2net_send_tracking { 228struct o2net_send_tracking {
239 u32 dummy; 229 u32 dummy;
240}; 230};
241
242static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
243 u32 msgkey, struct task_struct *task, u8 node)
244{
245}
246static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
247{
248}
249static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
250{
251}
252static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
253{
254}
255static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
256 struct o2net_sock_container *sc)
257{
258}
259static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
260 u32 msg_id)
261{
262}
263#endif /* CONFIG_DEBUG_FS */ 231#endif /* CONFIG_DEBUG_FS */
264 232
265#endif /* O2CLUSTER_TCP_INTERNAL_H */ 233#endif /* O2CLUSTER_TCP_INTERNAL_H */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 8a1875848080..026e6eb85187 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -82,6 +82,49 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
82 struct ocfs2_alloc_context *meta_ac, 82 struct ocfs2_alloc_context *meta_ac,
83 struct buffer_head **new_bh); 83 struct buffer_head **new_bh);
84 84
85static struct buffer_head *ocfs2_bread(struct inode *inode,
86 int block, int *err, int reada)
87{
88 struct buffer_head *bh = NULL;
89 int tmperr;
90 u64 p_blkno;
91 int readflags = 0;
92
93 if (reada)
94 readflags |= OCFS2_BH_READAHEAD;
95
96 if (((u64)block << inode->i_sb->s_blocksize_bits) >=
97 i_size_read(inode)) {
98 BUG_ON(!reada);
99 return NULL;
100 }
101
102 down_read(&OCFS2_I(inode)->ip_alloc_sem);
103 tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
104 NULL);
105 up_read(&OCFS2_I(inode)->ip_alloc_sem);
106 if (tmperr < 0) {
107 mlog_errno(tmperr);
108 goto fail;
109 }
110
111 tmperr = ocfs2_read_blocks(inode, p_blkno, 1, &bh, readflags);
112 if (tmperr < 0)
113 goto fail;
114
115 tmperr = 0;
116
117 *err = 0;
118 return bh;
119
120fail:
121 brelse(bh);
122 bh = NULL;
123
124 *err = -EIO;
125 return NULL;
126}
127
85/* 128/*
86 * bh passed here can be an inode block or a dir data block, depending 129 * bh passed here can be an inode block or a dir data block, depending
87 * on the inode inline data flag. 130 * on the inode inline data flag.
@@ -188,8 +231,7 @@ static struct buffer_head *ocfs2_find_entry_id(const char *name,
188 struct ocfs2_dinode *di; 231 struct ocfs2_dinode *di;
189 struct ocfs2_inline_data *data; 232 struct ocfs2_inline_data *data;
190 233
191 ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno, 234 ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
192 &di_bh, OCFS2_BH_CACHED, dir);
193 if (ret) { 235 if (ret) {
194 mlog_errno(ret); 236 mlog_errno(ret);
195 goto out; 237 goto out;
@@ -260,14 +302,13 @@ restart:
260 } 302 }
261 if ((bh = bh_use[ra_ptr++]) == NULL) 303 if ((bh = bh_use[ra_ptr++]) == NULL)
262 goto next; 304 goto next;
263 wait_on_buffer(bh); 305 if (ocfs2_read_block(dir, block, &bh)) {
264 if (!buffer_uptodate(bh)) { 306 /* read error, skip block & hope for the best.
265 /* read error, skip block & hope for the best */ 307 * ocfs2_read_block() has released the bh. */
266 ocfs2_error(dir->i_sb, "reading directory %llu, " 308 ocfs2_error(dir->i_sb, "reading directory %llu, "
267 "offset %lu\n", 309 "offset %lu\n",
268 (unsigned long long)OCFS2_I(dir)->ip_blkno, 310 (unsigned long long)OCFS2_I(dir)->ip_blkno,
269 block); 311 block);
270 brelse(bh);
271 goto next; 312 goto next;
272 } 313 }
273 i = ocfs2_search_dirblock(bh, dir, name, namelen, 314 i = ocfs2_search_dirblock(bh, dir, name, namelen,
@@ -417,8 +458,7 @@ static inline int ocfs2_delete_entry_id(handle_t *handle,
417 struct ocfs2_dinode *di; 458 struct ocfs2_dinode *di;
418 struct ocfs2_inline_data *data; 459 struct ocfs2_inline_data *data;
419 460
420 ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno, 461 ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
421 &di_bh, OCFS2_BH_CACHED, dir);
422 if (ret) { 462 if (ret) {
423 mlog_errno(ret); 463 mlog_errno(ret);
424 goto out; 464 goto out;
@@ -596,8 +636,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
596 struct ocfs2_inline_data *data; 636 struct ocfs2_inline_data *data;
597 struct ocfs2_dir_entry *de; 637 struct ocfs2_dir_entry *de;
598 638
599 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno, 639 ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
600 &di_bh, OCFS2_BH_CACHED, inode);
601 if (ret) { 640 if (ret) {
602 mlog(ML_ERROR, "Unable to read inode block for dir %llu\n", 641 mlog(ML_ERROR, "Unable to read inode block for dir %llu\n",
603 (unsigned long long)OCFS2_I(inode)->ip_blkno); 642 (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -716,8 +755,7 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
716 for (i = ra_sectors >> (sb->s_blocksize_bits - 9); 755 for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
717 i > 0; i--) { 756 i > 0; i--) {
718 tmp = ocfs2_bread(inode, ++blk, &err, 1); 757 tmp = ocfs2_bread(inode, ++blk, &err, 1);
719 if (tmp) 758 brelse(tmp);
720 brelse(tmp);
721 } 759 }
722 last_ra_blk = blk; 760 last_ra_blk = blk;
723 ra_sectors = 8; 761 ra_sectors = 8;
@@ -899,10 +937,8 @@ int ocfs2_find_files_on_disk(const char *name,
899leave: 937leave:
900 if (status < 0) { 938 if (status < 0) {
901 *dirent = NULL; 939 *dirent = NULL;
902 if (*dirent_bh) { 940 brelse(*dirent_bh);
903 brelse(*dirent_bh); 941 *dirent_bh = NULL;
904 *dirent_bh = NULL;
905 }
906 } 942 }
907 943
908 mlog_exit(status); 944 mlog_exit(status);
@@ -951,8 +987,7 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
951 987
952 ret = 0; 988 ret = 0;
953bail: 989bail:
954 if (dirent_bh) 990 brelse(dirent_bh);
955 brelse(dirent_bh);
956 991
957 mlog_exit(ret); 992 mlog_exit(ret);
958 return ret; 993 return ret;
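
Several hunks in this file are pure cleanup: brelse() is NULL-safe, so the guarded "if (bh) brelse(bh);" form collapses to a bare call. A runnable userspace demonstration of the same release contract:

#include <stdlib.h>

struct buffer { char *data; };

/* NULL-safe release, mirroring brelse(): callers never need to test
 * the pointer before calling. */
static void buffer_release(struct buffer *b)
{
        if (!b)
                return;
        free(b->data);
        free(b);
}

int main(void)
{
        struct buffer *bh = NULL;

        buffer_release(bh);     /* fine: no caller-side NULL check */
        return 0;
}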
@@ -1127,8 +1162,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
1127 1162
1128 status = 0; 1163 status = 0;
1129bail: 1164bail:
1130 if (new_bh) 1165 brelse(new_bh);
1131 brelse(new_bh);
1132 1166
1133 mlog_exit(status); 1167 mlog_exit(status);
1134 return status; 1168 return status;
@@ -1192,6 +1226,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1192 struct buffer_head *dirdata_bh = NULL; 1226 struct buffer_head *dirdata_bh = NULL;
1193 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 1227 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1194 handle_t *handle; 1228 handle_t *handle;
1229 struct ocfs2_extent_tree et;
1230
1231 ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
1195 1232
1196 alloc = ocfs2_clusters_for_bytes(sb, bytes); 1233 alloc = ocfs2_clusters_for_bytes(sb, bytes);
1197 1234
@@ -1300,19 +1337,24 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1300 di->i_size = cpu_to_le64(sb->s_blocksize); 1337 di->i_size = cpu_to_le64(sb->s_blocksize);
1301 di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec); 1338 di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec);
1302 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec); 1339 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec);
1303 dir->i_blocks = ocfs2_inode_sector_count(dir);
1304 1340
1305 /* 1341 /*
1306 * This should never fail as our extent list is empty and all 1342 * This should never fail as our extent list is empty and all
1307 * related blocks have been journaled already. 1343 * related blocks have been journaled already.
1308 */ 1344 */
1309 ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 0, blkno, len, 0, 1345 ret = ocfs2_insert_extent(osb, handle, dir, &et, 0, blkno, len,
1310 NULL); 1346 0, NULL);
1311 if (ret) { 1347 if (ret) {
1312 mlog_errno(ret); 1348 mlog_errno(ret);
1313 goto out; 1349 goto out_commit;
1314 } 1350 }
1315 1351
1352 /*
1353 * Set i_blocks after the extent insert for the most up to
1354 * date ip_clusters value.
1355 */
1356 dir->i_blocks = ocfs2_inode_sector_count(dir);
1357
1316 ret = ocfs2_journal_dirty(handle, di_bh); 1358 ret = ocfs2_journal_dirty(handle, di_bh);
1317 if (ret) { 1359 if (ret) {
1318 mlog_errno(ret); 1360 mlog_errno(ret);
@@ -1332,11 +1374,11 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
1332 } 1374 }
1333 blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off); 1375 blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
1334 1376
1335 ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 1, blkno, 1377 ret = ocfs2_insert_extent(osb, handle, dir, &et, 1,
1336 len, 0, NULL); 1378 blkno, len, 0, NULL);
1337 if (ret) { 1379 if (ret) {
1338 mlog_errno(ret); 1380 mlog_errno(ret);
1339 goto out; 1381 goto out_commit;
1340 } 1382 }
1341 } 1383 }
1342 1384
@@ -1378,9 +1420,9 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
1378 if (extend) { 1420 if (extend) {
1379 u32 offset = OCFS2_I(dir)->ip_clusters; 1421 u32 offset = OCFS2_I(dir)->ip_clusters;
1380 1422
1381 status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset, 1423 status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
1382 1, 0, parent_fe_bh, handle, 1424 1, 0, parent_fe_bh, handle,
1383 data_ac, meta_ac, NULL); 1425 data_ac, meta_ac, NULL);
1384 BUG_ON(status == -EAGAIN); 1426 BUG_ON(status == -EAGAIN);
1385 if (status < 0) { 1427 if (status < 0) {
1386 mlog_errno(status); 1428 mlog_errno(status);
@@ -1425,12 +1467,14 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
1425 int credits, num_free_extents, drop_alloc_sem = 0; 1467 int credits, num_free_extents, drop_alloc_sem = 0;
1426 loff_t dir_i_size; 1468 loff_t dir_i_size;
1427 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data; 1469 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
1470 struct ocfs2_extent_list *el = &fe->id2.i_list;
1428 struct ocfs2_alloc_context *data_ac = NULL; 1471 struct ocfs2_alloc_context *data_ac = NULL;
1429 struct ocfs2_alloc_context *meta_ac = NULL; 1472 struct ocfs2_alloc_context *meta_ac = NULL;
1430 handle_t *handle = NULL; 1473 handle_t *handle = NULL;
1431 struct buffer_head *new_bh = NULL; 1474 struct buffer_head *new_bh = NULL;
1432 struct ocfs2_dir_entry * de; 1475 struct ocfs2_dir_entry * de;
1433 struct super_block *sb = osb->sb; 1476 struct super_block *sb = osb->sb;
1477 struct ocfs2_extent_tree et;
1434 1478
1435 mlog_entry_void(); 1479 mlog_entry_void();
1436 1480
@@ -1474,7 +1518,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
1474 spin_lock(&OCFS2_I(dir)->ip_lock); 1518 spin_lock(&OCFS2_I(dir)->ip_lock);
1475 if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) { 1519 if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
1476 spin_unlock(&OCFS2_I(dir)->ip_lock); 1520 spin_unlock(&OCFS2_I(dir)->ip_lock);
1477 num_free_extents = ocfs2_num_free_extents(osb, dir, fe); 1521 ocfs2_init_dinode_extent_tree(&et, dir, parent_fe_bh);
1522 num_free_extents = ocfs2_num_free_extents(osb, dir, &et);
1478 if (num_free_extents < 0) { 1523 if (num_free_extents < 0) {
1479 status = num_free_extents; 1524 status = num_free_extents;
1480 mlog_errno(status); 1525 mlog_errno(status);
@@ -1482,7 +1527,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
1482 } 1527 }
1483 1528
1484 if (!num_free_extents) { 1529 if (!num_free_extents) {
1485 status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac); 1530 status = ocfs2_reserve_new_metadata(osb, el, &meta_ac);
1486 if (status < 0) { 1531 if (status < 0) {
1487 if (status != -ENOSPC) 1532 if (status != -ENOSPC)
1488 mlog_errno(status); 1533 mlog_errno(status);
@@ -1497,7 +1542,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
1497 goto bail; 1542 goto bail;
1498 } 1543 }
1499 1544
1500 credits = ocfs2_calc_extend_credits(sb, fe, 1); 1545 credits = ocfs2_calc_extend_credits(sb, el, 1);
1501 } else { 1546 } else {
1502 spin_unlock(&OCFS2_I(dir)->ip_lock); 1547 spin_unlock(&OCFS2_I(dir)->ip_lock);
1503 credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; 1548 credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
@@ -1563,8 +1608,7 @@ bail:
1563 if (meta_ac) 1608 if (meta_ac)
1564 ocfs2_free_alloc_context(meta_ac); 1609 ocfs2_free_alloc_context(meta_ac);
1565 1610
1566 if (new_bh) 1611 brelse(new_bh);
1567 brelse(new_bh);
1568 1612
1569 mlog_exit(status); 1613 mlog_exit(status);
1570 return status; 1614 return status;
@@ -1691,8 +1735,7 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
1691 1735
1692 status = 0; 1736 status = 0;
1693bail: 1737bail:
1694 if (bh) 1738 brelse(bh);
1695 brelse(bh);
1696 1739
1697 mlog_exit(status); 1740 mlog_exit(status);
1698 return status; 1741 return status;
@@ -1751,7 +1794,6 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
1751 *ret_de_bh = bh; 1794 *ret_de_bh = bh;
1752 bh = NULL; 1795 bh = NULL;
1753out: 1796out:
1754 if (bh) 1797 brelse(bh);
1755 brelse(bh);
1756 return ret; 1798 return ret;
1757} 1799}
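
ocfs2_bread() is now a private dir.c helper: it maps a logical directory block to its physical block and either issues readahead or goes through the cached read path. A sketch of how the directory walker above drives it, with variable names taken from the surrounding hunks:

/* Readahead pass (reada = 1): prime the cache, tolerate NULL returns,
 * and rely on NULL-safe brelse(). */
for (i = ra_blocks; i > 0; i--) {
        tmp = ocfs2_bread(dir, ++blk, &err, 1);
        brelse(tmp);
}

/* Blocking read: ocfs2_read_block() returns nonzero on error and has
 * already released the bh, so the caller just skips the block. */
if (ocfs2_read_block(dir, block, &bh))
        goto next;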
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index eae3d643a5e4..ec684426034b 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2024,8 +2024,7 @@ static int ocfs2_inode_lock_update(struct inode *inode,
2024 } else { 2024 } else {
2025 /* Boo, we have to go to disk. */ 2025 /* Boo, we have to go to disk. */
2026 /* read bh, cast, ocfs2_refresh_inode */ 2026 /* read bh, cast, ocfs2_refresh_inode */
2027 status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno, 2027 status = ocfs2_read_block(inode, oi->ip_blkno, bh);
2028 bh, OCFS2_BH_CACHED, inode);
2029 if (status < 0) { 2028 if (status < 0) {
2030 mlog_errno(status); 2029 mlog_errno(status);
2031 goto bail_refresh; 2030 goto bail_refresh;
@@ -2086,11 +2085,7 @@ static int ocfs2_assign_bh(struct inode *inode,
2086 return 0; 2085 return 0;
2087 } 2086 }
2088 2087
2089 status = ocfs2_read_block(OCFS2_SB(inode->i_sb), 2088 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, ret_bh);
2090 OCFS2_I(inode)->ip_blkno,
2091 ret_bh,
2092 OCFS2_BH_CACHED,
2093 inode);
2094 if (status < 0) 2089 if (status < 0)
2095 mlog_errno(status); 2090 mlog_errno(status);
2096 2091
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index c58668a326fe..2baedac58234 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -25,6 +25,7 @@
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/fiemap.h>
28 29
29#define MLOG_MASK_PREFIX ML_EXTENT_MAP 30#define MLOG_MASK_PREFIX ML_EXTENT_MAP
30#include <cluster/masklog.h> 31#include <cluster/masklog.h>
@@ -32,6 +33,7 @@
32#include "ocfs2.h" 33#include "ocfs2.h"
33 34
34#include "alloc.h" 35#include "alloc.h"
36#include "dlmglue.h"
35#include "extent_map.h" 37#include "extent_map.h"
36#include "inode.h" 38#include "inode.h"
37#include "super.h" 39#include "super.h"
@@ -282,6 +284,50 @@ out:
282 kfree(new_emi); 284 kfree(new_emi);
283} 285}
284 286
287static int ocfs2_last_eb_is_empty(struct inode *inode,
288 struct ocfs2_dinode *di)
289{
290 int ret, next_free;
291 u64 last_eb_blk = le64_to_cpu(di->i_last_eb_blk);
292 struct buffer_head *eb_bh = NULL;
293 struct ocfs2_extent_block *eb;
294 struct ocfs2_extent_list *el;
295
296 ret = ocfs2_read_block(inode, last_eb_blk, &eb_bh);
297 if (ret) {
298 mlog_errno(ret);
299 goto out;
300 }
301
302 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
303 el = &eb->h_list;
304
305 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
306 ret = -EROFS;
307 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
308 goto out;
309 }
310
311 if (el->l_tree_depth) {
312 ocfs2_error(inode->i_sb,
313 "Inode %lu has non zero tree depth in "
314 "leaf block %llu\n", inode->i_ino,
315 (unsigned long long)eb_bh->b_blocknr);
316 ret = -EROFS;
317 goto out;
318 }
319
320 next_free = le16_to_cpu(el->l_next_free_rec);
321
322 if (next_free == 0 ||
323 (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0])))
324 ret = 1;
325
326out:
327 brelse(eb_bh);
328 return ret;
329}
330
285/* 331/*
286 * Return the 1st index within el which contains an extent start 332 * Return the 1st index within el which contains an extent start
287 * larger than v_cluster. 333 * larger than v_cluster.
@@ -335,9 +381,9 @@ static int ocfs2_figure_hole_clusters(struct inode *inode,
335 if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) 381 if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
336 goto no_more_extents; 382 goto no_more_extents;
337 383
338 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), 384 ret = ocfs2_read_block(inode,
339 le64_to_cpu(eb->h_next_leaf_blk), 385 le64_to_cpu(eb->h_next_leaf_blk),
340 &next_eb_bh, OCFS2_BH_CACHED, inode); 386 &next_eb_bh);
341 if (ret) { 387 if (ret) {
342 mlog_errno(ret); 388 mlog_errno(ret);
343 goto out; 389 goto out;
@@ -373,42 +419,28 @@ out:
373 return ret; 419 return ret;
374} 420}
375 421
376int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, 422static int ocfs2_get_clusters_nocache(struct inode *inode,
377 u32 *p_cluster, u32 *num_clusters, 423 struct buffer_head *di_bh,
378 unsigned int *extent_flags) 424 u32 v_cluster, unsigned int *hole_len,
425 struct ocfs2_extent_rec *ret_rec,
426 unsigned int *is_last)
379{ 427{
380 int ret, i; 428 int i, ret, tree_height, len;
381 unsigned int flags = 0;
382 struct buffer_head *di_bh = NULL;
383 struct buffer_head *eb_bh = NULL;
384 struct ocfs2_dinode *di; 429 struct ocfs2_dinode *di;
385 struct ocfs2_extent_block *eb; 430 struct ocfs2_extent_block *uninitialized_var(eb);
386 struct ocfs2_extent_list *el; 431 struct ocfs2_extent_list *el;
387 struct ocfs2_extent_rec *rec; 432 struct ocfs2_extent_rec *rec;
388 u32 coff; 433 struct buffer_head *eb_bh = NULL;
389
390 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
391 ret = -ERANGE;
392 mlog_errno(ret);
393 goto out;
394 }
395
396 ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
397 num_clusters, extent_flags);
398 if (ret == 0)
399 goto out;
400 434
401 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno, 435 memset(ret_rec, 0, sizeof(*ret_rec));
402 &di_bh, OCFS2_BH_CACHED, inode); 436 if (is_last)
403 if (ret) { 437 *is_last = 0;
404 mlog_errno(ret);
405 goto out;
406 }
407 438
408 di = (struct ocfs2_dinode *) di_bh->b_data; 439 di = (struct ocfs2_dinode *) di_bh->b_data;
409 el = &di->id2.i_list; 440 el = &di->id2.i_list;
441 tree_height = le16_to_cpu(el->l_tree_depth);
410 442
411 if (el->l_tree_depth) { 443 if (tree_height > 0) {
412 ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh); 444 ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
413 if (ret) { 445 if (ret) {
414 mlog_errno(ret); 446 mlog_errno(ret);
@@ -431,46 +463,202 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
431 i = ocfs2_search_extent_list(el, v_cluster); 463 i = ocfs2_search_extent_list(el, v_cluster);
432 if (i == -1) { 464 if (i == -1) {
433 /* 465 /*
434 * A hole was found. Return some canned values that 466 * Holes can be larger than the maximum size of an
435 * callers can key on. If asked for, num_clusters will 467 * extent, so we return their lengths in a separate
436 * be populated with the size of the hole. 468 * field.
437 */ 469 */
438 *p_cluster = 0; 470 if (hole_len) {
439 if (num_clusters) {
440 ret = ocfs2_figure_hole_clusters(inode, el, eb_bh, 471 ret = ocfs2_figure_hole_clusters(inode, el, eb_bh,
441 v_cluster, 472 v_cluster, &len);
442 num_clusters);
443 if (ret) { 473 if (ret) {
444 mlog_errno(ret); 474 mlog_errno(ret);
445 goto out; 475 goto out;
446 } 476 }
477
478 *hole_len = len;
479 }
480 goto out_hole;
481 }
482
483 rec = &el->l_recs[i];
484
485 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
486
487 if (!rec->e_blkno) {
488 ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
489 "record (%u, %u, 0)", inode->i_ino,
490 le32_to_cpu(rec->e_cpos),
491 ocfs2_rec_clusters(el, rec));
492 ret = -EROFS;
493 goto out;
494 }
495
496 *ret_rec = *rec;
497
498 /*
499 * Checking for last extent is potentially expensive - we
500 * might have to look at the next leaf over to see if it's
501 * empty.
502 *
503 * The first two checks are to see whether the caller even
504 * cares for this information, and if the extent is at least
505 * the last in its list.
506 *
507 * If those hold true, then the extent is last if any of the
508 * additional conditions hold true:
509 * - Extent list is in-inode
510 * - Extent list is right-most
511 * - Extent list is 2nd to rightmost, with empty right-most
512 */
513 if (is_last) {
514 if (i == (le16_to_cpu(el->l_next_free_rec) - 1)) {
515 if (tree_height == 0)
516 *is_last = 1;
517 else if (eb->h_blkno == di->i_last_eb_blk)
518 *is_last = 1;
519 else if (eb->h_next_leaf_blk == di->i_last_eb_blk) {
520 ret = ocfs2_last_eb_is_empty(inode, di);
521 if (ret < 0) {
522 mlog_errno(ret);
523 goto out;
524 }
525 if (ret == 1)
526 *is_last = 1;
527 }
528 }
529 }
530
531out_hole:
532 ret = 0;
533out:
534 brelse(eb_bh);
535 return ret;
536}
537
538static void ocfs2_relative_extent_offsets(struct super_block *sb,
539 u32 v_cluster,
540 struct ocfs2_extent_rec *rec,
541 u32 *p_cluster, u32 *num_clusters)
542
543{
544 u32 coff = v_cluster - le32_to_cpu(rec->e_cpos);
545
546 *p_cluster = ocfs2_blocks_to_clusters(sb, le64_to_cpu(rec->e_blkno));
547 *p_cluster = *p_cluster + coff;
548
549 if (num_clusters)
550 *num_clusters = le16_to_cpu(rec->e_leaf_clusters) - coff;
551}
552
553int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
554 u32 *p_cluster, u32 *num_clusters,
555 struct ocfs2_extent_list *el)
556{
557 int ret = 0, i;
558 struct buffer_head *eb_bh = NULL;
559 struct ocfs2_extent_block *eb;
560 struct ocfs2_extent_rec *rec;
561 u32 coff;
562
563 if (el->l_tree_depth) {
564 ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
565 if (ret) {
566 mlog_errno(ret);
567 goto out;
447 } 568 }
569
570 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
571 el = &eb->h_list;
572
573 if (el->l_tree_depth) {
574 ocfs2_error(inode->i_sb,
575 "Inode %lu has non zero tree depth in "
576 "xattr leaf block %llu\n", inode->i_ino,
577 (unsigned long long)eb_bh->b_blocknr);
578 ret = -EROFS;
579 goto out;
580 }
581 }
582
583 i = ocfs2_search_extent_list(el, v_cluster);
584 if (i == -1) {
585 ret = -EROFS;
586 mlog_errno(ret);
587 goto out;
448 } else { 588 } else {
449 rec = &el->l_recs[i]; 589 rec = &el->l_recs[i];
450
451 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); 590 BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
452 591
453 if (!rec->e_blkno) { 592 if (!rec->e_blkno) {
454 ocfs2_error(inode->i_sb, "Inode %lu has bad extent " 593 ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
455 "record (%u, %u, 0)", inode->i_ino, 594 "record (%u, %u, 0) in xattr", inode->i_ino,
456 le32_to_cpu(rec->e_cpos), 595 le32_to_cpu(rec->e_cpos),
457 ocfs2_rec_clusters(el, rec)); 596 ocfs2_rec_clusters(el, rec));
458 ret = -EROFS; 597 ret = -EROFS;
459 goto out; 598 goto out;
460 } 599 }
461
462 coff = v_cluster - le32_to_cpu(rec->e_cpos); 600 coff = v_cluster - le32_to_cpu(rec->e_cpos);
463
464 *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb, 601 *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
465 le64_to_cpu(rec->e_blkno)); 602 le64_to_cpu(rec->e_blkno));
466 *p_cluster = *p_cluster + coff; 603 *p_cluster = *p_cluster + coff;
467
468 if (num_clusters) 604 if (num_clusters)
469 *num_clusters = ocfs2_rec_clusters(el, rec) - coff; 605 *num_clusters = ocfs2_rec_clusters(el, rec) - coff;
606 }
607out:
608 if (eb_bh)
609 brelse(eb_bh);
610 return ret;
611}
612
613int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
614 u32 *p_cluster, u32 *num_clusters,
615 unsigned int *extent_flags)
616{
617 int ret;
618 unsigned int uninitialized_var(hole_len), flags = 0;
619 struct buffer_head *di_bh = NULL;
620 struct ocfs2_extent_rec rec;
621
622 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
623 ret = -ERANGE;
624 mlog_errno(ret);
625 goto out;
626 }
627
628 ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
629 num_clusters, extent_flags);
630 if (ret == 0)
631 goto out;
632
633 ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
634 if (ret) {
635 mlog_errno(ret);
636 goto out;
637 }
470 638
471 flags = rec->e_flags; 639 ret = ocfs2_get_clusters_nocache(inode, di_bh, v_cluster, &hole_len,
640 &rec, NULL);
641 if (ret) {
642 mlog_errno(ret);
643 goto out;
644 }
645
646 if (rec.e_blkno == 0ULL) {
647 /*
648 * A hole was found. Return some canned values that
649 * callers can key on. If asked for, num_clusters will
650 * be populated with the size of the hole.
651 */
652 *p_cluster = 0;
653 if (num_clusters) {
654 *num_clusters = hole_len;
655 }
656 } else {
657 ocfs2_relative_extent_offsets(inode->i_sb, v_cluster, &rec,
658 p_cluster, num_clusters);
659 flags = rec.e_flags;
472 660
473 ocfs2_extent_map_insert_rec(inode, rec); 661 ocfs2_extent_map_insert_rec(inode, &rec);
474 } 662 }
475 663
476 if (extent_flags) 664 if (extent_flags)
@@ -478,7 +666,6 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
478 666
479out: 667out:
480 brelse(di_bh); 668 brelse(di_bh);
481 brelse(eb_bh);
482 return ret; 669 return ret;
483} 670}
484 671
@@ -521,3 +708,114 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
521out: 708out:
522 return ret; 709 return ret;
523} 710}
711
712static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
713 struct fiemap_extent_info *fieinfo,
714 u64 map_start)
715{
716 int ret;
717 unsigned int id_count;
718 struct ocfs2_dinode *di;
719 u64 phys;
720 u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST;
721 struct ocfs2_inode_info *oi = OCFS2_I(inode);
722
723 di = (struct ocfs2_dinode *)di_bh->b_data;
724 id_count = le16_to_cpu(di->id2.i_data.id_count);
725
726 if (map_start < id_count) {
727 phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits;
728 phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data);
729
730 ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count,
731 flags);
732 if (ret < 0)
733 return ret;
734 }
735
736 return 0;
737}
738
739#define OCFS2_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
740
741int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
742 u64 map_start, u64 map_len)
743{
744 int ret, is_last;
745 u32 mapping_end, cpos;
746 unsigned int hole_size;
747 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
748 u64 len_bytes, phys_bytes, virt_bytes;
749 struct buffer_head *di_bh = NULL;
750 struct ocfs2_extent_rec rec;
751
752 ret = fiemap_check_flags(fieinfo, OCFS2_FIEMAP_FLAGS);
753 if (ret)
754 return ret;
755
756 ret = ocfs2_inode_lock(inode, &di_bh, 0);
757 if (ret) {
758 mlog_errno(ret);
759 goto out;
760 }
761
762 down_read(&OCFS2_I(inode)->ip_alloc_sem);
763
764 /*
765 * Handle inline-data separately.
766 */
767 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
768 ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start);
769 goto out_unlock;
770 }
771
772 cpos = map_start >> osb->s_clustersize_bits;
773 mapping_end = ocfs2_clusters_for_bytes(inode->i_sb,
774 map_start + map_len);
775 mapping_end -= cpos;
776 is_last = 0;
777 while (cpos < mapping_end && !is_last) {
778 u32 fe_flags;
779
780 ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos,
781 &hole_size, &rec, &is_last);
782 if (ret) {
783 mlog_errno(ret);
784 goto out;
785 }
786
787 if (rec.e_blkno == 0ULL) {
788 cpos += hole_size;
789 continue;
790 }
791
792 fe_flags = 0;
793 if (rec.e_flags & OCFS2_EXT_UNWRITTEN)
794 fe_flags |= FIEMAP_EXTENT_UNWRITTEN;
795 if (is_last)
796 fe_flags |= FIEMAP_EXTENT_LAST;
797 len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits;
798 phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits;
799 virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits;
800
801 ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes,
802 len_bytes, fe_flags);
803 if (ret)
804 break;
805
806 cpos = le32_to_cpu(rec.e_cpos) + le16_to_cpu(rec.e_leaf_clusters);
807 }
808
809 if (ret > 0)
810 ret = 0;
811
812out_unlock:
813 brelse(di_bh);
814
815 up_read(&OCFS2_I(inode)->ip_alloc_sem);
816
817 ocfs2_inode_unlock(inode, 0);
818out:
819
820 return ret;
821}
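
The while loop in ocfs2_fiemap() walks the tree record by record via ocfs2_get_clusters_nocache(), skipping holes by their reported length and stopping at the extent flagged last. Assuming the handler is wired into the inode operations elsewhere in this series, a minimal userspace exerciser can drive it through the standard FS_IOC_FIEMAP ioctl:

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
        struct fiemap *fm;
        unsigned int i;
        int fd;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <file>\n", argv[0]);
                return 1;
        }

        fd = open(argv[1], O_RDONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        /* Room for the header plus 8 extent records. */
        fm = calloc(1, sizeof(*fm) + 8 * sizeof(struct fiemap_extent));
        if (!fm)
                return 1;
        fm->fm_start = 0;
        fm->fm_length = ~0ULL;            /* map the whole file */
        fm->fm_flags = FIEMAP_FLAG_SYNC;  /* the only flag in OCFS2_FIEMAP_FLAGS */
        fm->fm_extent_count = 8;

        if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
                perror("FS_IOC_FIEMAP");
                return 1;
        }

        for (i = 0; i < fm->fm_mapped_extents; i++)
                printf("logical=%llu physical=%llu len=%llu flags=0x%x\n",
                       (unsigned long long)fm->fm_extents[i].fe_logical,
                       (unsigned long long)fm->fm_extents[i].fe_physical,
                       (unsigned long long)fm->fm_extents[i].fe_length,
                       fm->fm_extents[i].fe_flags);

        free(fm);
        close(fd);
        return 0;
}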
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index de91e3e41a22..1c4aa8b06f34 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -50,4 +50,11 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster,
50int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, 50int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
51 u64 *ret_count, unsigned int *extent_flags); 51 u64 *ret_count, unsigned int *extent_flags);
52 52
53int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
54 u64 map_start, u64 map_len);
55
56int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
57 u32 *p_cluster, u32 *num_clusters,
58 struct ocfs2_extent_list *el);
59
53#endif /* _EXTENT_MAP_H */ 60#endif /* _EXTENT_MAP_H */
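
ocfs2_xattr_get_clusters() is exported here for the upcoming xattr code: xattr value trees hang off structures other than the dinode, so the caller passes the root extent list in explicitly, and nothing is cached (the extent map cache is keyed to the inode's data tree). A hypothetical call, assuming el points at an xattr value tree's root extent list:

u32 p_cluster, num_clusters;
int ret;

ret = ocfs2_xattr_get_clusters(inode, v_cluster, &p_cluster,
                               &num_clusters, el);
if (!ret)
        mlog(0, "xattr cluster %u maps to %u (+%u)\n",
             v_cluster, p_cluster, num_clusters);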
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ec2ed15c3daa..8d3225a78073 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -55,6 +55,7 @@
55#include "mmap.h" 55#include "mmap.h"
56#include "suballoc.h" 56#include "suballoc.h"
57#include "super.h" 57#include "super.h"
58#include "xattr.h"
58 59
59#include "buffer_head_io.h" 60#include "buffer_head_io.h"
60 61
@@ -184,7 +185,7 @@ static int ocfs2_sync_file(struct file *file,
184 goto bail; 185 goto bail;
185 186
186 journal = osb->journal->j_journal; 187 journal = osb->journal->j_journal;
187 err = journal_force_commit(journal); 188 err = jbd2_journal_force_commit(journal);
188 189
189bail: 190bail:
190 mlog_exit(err); 191 mlog_exit(err);
@@ -488,7 +489,7 @@ bail:
488} 489}
489 490
490/* 491/*
491 * extend allocation only here. 492 * extend file allocation only here.
492 * we'll update all the disk stuff, and oip->alloc_size 493 * we'll update all the disk stuff, and oip->alloc_size
493 * 494 *
494 * expect stuff to be locked, a transaction started and enough data / 495 * expect stuff to be locked, a transaction started and enough data /
@@ -497,189 +498,25 @@ bail:
497 * Will return -EAGAIN, and a reason if a restart is needed. 498 * Will return -EAGAIN, and a reason if a restart is needed.
498 * If passed in, *reason will always be set, even in error. 499 * If passed in, *reason will always be set, even in error.
499 */ 500 */
500int ocfs2_do_extend_allocation(struct ocfs2_super *osb, 501int ocfs2_add_inode_data(struct ocfs2_super *osb,
501 struct inode *inode, 502 struct inode *inode,
502 u32 *logical_offset, 503 u32 *logical_offset,
503 u32 clusters_to_add, 504 u32 clusters_to_add,
504 int mark_unwritten, 505 int mark_unwritten,
505 struct buffer_head *fe_bh, 506 struct buffer_head *fe_bh,
506 handle_t *handle, 507 handle_t *handle,
507 struct ocfs2_alloc_context *data_ac, 508 struct ocfs2_alloc_context *data_ac,
508 struct ocfs2_alloc_context *meta_ac, 509 struct ocfs2_alloc_context *meta_ac,
509 enum ocfs2_alloc_restarted *reason_ret) 510 enum ocfs2_alloc_restarted *reason_ret)
510{ 511{
511 int status = 0; 512 int ret;
512 int free_extents; 513 struct ocfs2_extent_tree et;
513 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
514 enum ocfs2_alloc_restarted reason = RESTART_NONE;
515 u32 bit_off, num_bits;
516 u64 block;
517 u8 flags = 0;
518
519 BUG_ON(!clusters_to_add);
520
521 if (mark_unwritten)
522 flags = OCFS2_EXT_UNWRITTEN;
523
524 free_extents = ocfs2_num_free_extents(osb, inode, fe);
525 if (free_extents < 0) {
526 status = free_extents;
527 mlog_errno(status);
528 goto leave;
529 }
530
531 /* there are two cases which could cause us to EAGAIN in the
532 * we-need-more-metadata case:
533 * 1) we haven't reserved *any*
534 * 2) we are so fragmented, we've needed to add metadata too
535 * many times. */
536 if (!free_extents && !meta_ac) {
537 mlog(0, "we haven't reserved any metadata!\n");
538 status = -EAGAIN;
539 reason = RESTART_META;
540 goto leave;
541 } else if ((!free_extents)
542 && (ocfs2_alloc_context_bits_left(meta_ac)
543 < ocfs2_extend_meta_needed(fe))) {
544 mlog(0, "filesystem is really fragmented...\n");
545 status = -EAGAIN;
546 reason = RESTART_META;
547 goto leave;
548 }
549
550 status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
551 clusters_to_add, &bit_off, &num_bits);
552 if (status < 0) {
553 if (status != -ENOSPC)
554 mlog_errno(status);
555 goto leave;
556 }
557
558 BUG_ON(num_bits > clusters_to_add);
559
560 /* reserve our write early -- insert_extent may update the inode */
561 status = ocfs2_journal_access(handle, inode, fe_bh,
562 OCFS2_JOURNAL_ACCESS_WRITE);
563 if (status < 0) {
564 mlog_errno(status);
565 goto leave;
566 }
567
568 block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
569 mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
570 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
571 status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
572 *logical_offset, block, num_bits,
573 flags, meta_ac);
574 if (status < 0) {
575 mlog_errno(status);
576 goto leave;
577 }
578
579 status = ocfs2_journal_dirty(handle, fe_bh);
580 if (status < 0) {
581 mlog_errno(status);
582 goto leave;
583 }
584
585 clusters_to_add -= num_bits;
586 *logical_offset += num_bits;
587
588 if (clusters_to_add) {
589 mlog(0, "need to alloc once more, clusters = %u, wanted = "
590 "%u\n", fe->i_clusters, clusters_to_add);
591 status = -EAGAIN;
592 reason = RESTART_TRANS;
593 }
594
595leave:
596 mlog_exit(status);
597 if (reason_ret)
598 *reason_ret = reason;
599 return status;
600}
601
602/*
603 * For a given allocation, determine which allocators will need to be
604 * accessed, and lock them, reserving the appropriate number of bits.
605 *
606 * Sparse file systems call this from ocfs2_write_begin_nolock()
607 * and ocfs2_allocate_unwritten_extents().
608 *
609 * File systems which don't support holes call this from
610 * ocfs2_extend_allocation().
611 */
612int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
613 u32 clusters_to_add, u32 extents_to_split,
614 struct ocfs2_alloc_context **data_ac,
615 struct ocfs2_alloc_context **meta_ac)
616{
617 int ret = 0, num_free_extents;
618 unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
619 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
620
621 *meta_ac = NULL;
622 if (data_ac)
623 *data_ac = NULL;
624
625 BUG_ON(clusters_to_add != 0 && data_ac == NULL);
626
627 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
628 "clusters_to_add = %u, extents_to_split = %u\n",
629 (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode),
630 le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
631
632 num_free_extents = ocfs2_num_free_extents(osb, inode, di);
633 if (num_free_extents < 0) {
634 ret = num_free_extents;
635 mlog_errno(ret);
636 goto out;
637 }
638
639 /*
640 * Sparse allocation file systems need to be more conservative
641 * with reserving room for expansion - the actual allocation
642 * happens while we've got a journal handle open so re-taking
643 * a cluster lock (because we ran out of room for another
644 * extent) will violate ordering rules.
645 *
646 * Most of the time we'll only be seeing this 1 cluster at a time
647 * anyway.
648 *
649 * Always lock for any unwritten extents - we might want to
650 * add blocks during a split.
651 */
652 if (!num_free_extents ||
653 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
654 ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
655 if (ret < 0) {
656 if (ret != -ENOSPC)
657 mlog_errno(ret);
658 goto out;
659 }
660 }
661
662 if (clusters_to_add == 0)
663 goto out;
664
665 ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
666 if (ret < 0) {
667 if (ret != -ENOSPC)
668 mlog_errno(ret);
669 goto out;
670 }
671
672out:
673 if (ret) {
674 if (*meta_ac) {
675 ocfs2_free_alloc_context(*meta_ac);
676 *meta_ac = NULL;
677 }
678 	514
679 	/*
680 	 * We cannot have an error and a non null *data_ac.
681 	 */
682 }
 515 	ocfs2_init_dinode_extent_tree(&et, inode, fe_bh);
 516 	ret = ocfs2_add_clusters_in_btree(osb, inode, logical_offset,
 517 					  clusters_to_add, mark_unwritten,
 518 					  &et, handle,
 519 					  data_ac, meta_ac, reason_ret);
683 	520
684 	return ret; 521 	return ret;
685} 522}
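The removed ocfs2_lock_allocators() above sizes its metadata reservation as clusters_to_add + 2 * extents_to_split. A minimal userspace sketch of that worst-case record budget (helper name and sample values are illustrative, not from the patch; the factor of two presumably covers one record splitting into left, middle and right pieces):

    #include <assert.h>

    static unsigned int max_recs_needed(unsigned int clusters_to_add,
                                        unsigned int extents_to_split)
    {
            /* worst case: every added cluster lands in its own extent
             * record, and every split turns one record into three */
            return clusters_to_add + 2 * extents_to_split;
    }

    int main(void)
    {
            /* one unwritten extent split in half: up to 2 extra records */
            assert(max_recs_needed(0, 1) == 2);
            /* extend by 8 clusters, no splits: up to 8 new records */
            assert(max_recs_needed(8, 0) == 8);
            return 0;
    }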
@@ -698,6 +535,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
698 struct ocfs2_alloc_context *meta_ac = NULL; 535 struct ocfs2_alloc_context *meta_ac = NULL;
699 enum ocfs2_alloc_restarted why; 536 enum ocfs2_alloc_restarted why;
700 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 537 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
538 struct ocfs2_extent_tree et;
701 539
702 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); 540 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
703 541
@@ -707,8 +545,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
707 */ 545 */
708 BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); 546 BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
709 547
710 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, 548 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
711 OCFS2_BH_CACHED, inode);
712 if (status < 0) { 549 if (status < 0) {
713 mlog_errno(status); 550 mlog_errno(status);
714 goto leave; 551 goto leave;
@@ -724,14 +561,21 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
724restart_all: 561restart_all:
725 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 562 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
726 563
727 status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0, &data_ac, 564 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
728 &meta_ac); 565 "clusters_to_add = %u\n",
566 (unsigned long long)OCFS2_I(inode)->ip_blkno,
567 (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters),
568 clusters_to_add);
569 ocfs2_init_dinode_extent_tree(&et, inode, bh);
570 status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
571 &data_ac, &meta_ac);
729 if (status) { 572 if (status) {
730 mlog_errno(status); 573 mlog_errno(status);
731 goto leave; 574 goto leave;
732 } 575 }
733 576
734 credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); 577 credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list,
578 clusters_to_add);
735 handle = ocfs2_start_trans(osb, credits); 579 handle = ocfs2_start_trans(osb, credits);
736 if (IS_ERR(handle)) { 580 if (IS_ERR(handle)) {
737 status = PTR_ERR(handle); 581 status = PTR_ERR(handle);
@@ -753,16 +597,16 @@ restarted_transaction:
753 597
754 prev_clusters = OCFS2_I(inode)->ip_clusters; 598 prev_clusters = OCFS2_I(inode)->ip_clusters;
755 599
756 status = ocfs2_do_extend_allocation(osb, 600 status = ocfs2_add_inode_data(osb,
757 inode, 601 inode,
758 &logical_start, 602 &logical_start,
759 clusters_to_add, 603 clusters_to_add,
760 mark_unwritten, 604 mark_unwritten,
761 bh, 605 bh,
762 handle, 606 handle,
763 data_ac, 607 data_ac,
764 meta_ac, 608 meta_ac,
765 &why); 609 &why);
766 if ((status < 0) && (status != -EAGAIN)) { 610 if ((status < 0) && (status != -EAGAIN)) {
767 if (status != -ENOSPC) 611 if (status != -ENOSPC)
768 mlog_errno(status); 612 mlog_errno(status);
@@ -789,7 +633,7 @@ restarted_transaction:
789 mlog(0, "restarting transaction.\n"); 633 mlog(0, "restarting transaction.\n");
790 /* TODO: This can be more intelligent. */ 634 /* TODO: This can be more intelligent. */
791 credits = ocfs2_calc_extend_credits(osb->sb, 635 credits = ocfs2_calc_extend_credits(osb->sb,
792 fe, 636 &fe->id2.i_list,
793 clusters_to_add); 637 clusters_to_add);
794 status = ocfs2_extend_trans(handle, credits); 638 status = ocfs2_extend_trans(handle, credits);
795 if (status < 0) { 639 if (status < 0) {
@@ -826,10 +670,8 @@ leave:
826 restart_func = 0; 670 restart_func = 0;
827 goto restart_all; 671 goto restart_all;
828 } 672 }
829 if (bh) { 673 brelse(bh);
830 brelse(bh); 674 bh = NULL;
831 bh = NULL;
832 }
833 675
834 mlog_exit(status); 676 mlog_exit(status);
835 return status; 677 return status;
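The hunks above keep __ocfs2_extend_allocation()'s restart protocol intact: the allocation step returns -EAGAIN plus a reason, and the caller either extends the running transaction (RESTART_TRANS, handled at restarted_transaction) or drops everything and relocks (RESTART_META, via restart_all). A self-contained model of the retry loop, assuming -EAGAIN is -11 as on Linux; the stub allocator is invented for illustration:

    #include <stdio.h>

    enum alloc_restarted { RESTART_NONE = 0, RESTART_TRANS, RESTART_META };

    /* stub allocator: pretend each transaction has room for 2 clusters */
    static int add_clusters(unsigned int *added, unsigned int wanted,
                            enum alloc_restarted *why)
    {
            unsigned int room = wanted - *added;
            unsigned int n = room > 2 ? 2 : room;

            *added += n;
            if (*added < wanted) {
                    *why = RESTART_TRANS;   /* caller should extend + retry */
                    return -11;             /* -EAGAIN on Linux */
            }
            *why = RESTART_NONE;
            return 0;
    }

    int main(void)
    {
            unsigned int added = 0, wanted = 7;
            enum alloc_restarted why = RESTART_NONE;
            int ret;

            do {
                    ret = add_clusters(&added, wanted, &why);
                    if (ret == -11 && why == RESTART_TRANS)
                            printf("extending transaction, %u/%u done\n",
                                   added, wanted);
            } while (ret == -11 && why == RESTART_TRANS);

            printf("done: ret=%d, allocated %u clusters\n", ret, added);
            return 0;
    }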
@@ -1096,9 +938,15 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1096 goto bail_unlock; 938 goto bail_unlock;
1097 } 939 }
1098 940
1099 if (i_size_read(inode) > attr->ia_size) 941 if (i_size_read(inode) > attr->ia_size) {
942 if (ocfs2_should_order_data(inode)) {
943 status = ocfs2_begin_ordered_truncate(inode,
944 attr->ia_size);
945 if (status)
946 goto bail_unlock;
947 }
1100 status = ocfs2_truncate_file(inode, bh, attr->ia_size); 948 status = ocfs2_truncate_file(inode, bh, attr->ia_size);
1101 else 949 } else
1102 status = ocfs2_extend_file(inode, bh, attr->ia_size); 950 status = ocfs2_extend_file(inode, bh, attr->ia_size);
1103 if (status < 0) { 951 if (status < 0) {
1104 if (status != -ENOSPC) 952 if (status != -ENOSPC)
@@ -1140,8 +988,7 @@ bail_unlock_rw:
1140 if (size_change) 988 if (size_change)
1141 ocfs2_rw_unlock(inode, 1); 989 ocfs2_rw_unlock(inode, 1);
1142bail: 990bail:
1143 if (bh) 991 brelse(bh);
1144 brelse(bh);
1145 992
1146 mlog_exit(status); 993 mlog_exit(status);
1147 return status; 994 return status;
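Several hunks in this patch, including the one above, collapse the old two-line pattern "if (bh) brelse(bh);" into a bare brelse(bh): like kfree(), brelse() is a no-op on NULL, so unconditional puts are safe on error paths where the buffer was never read. A toy model of that NULL-tolerant release (refcount field and helper name invented):

    #include <stdlib.h>

    struct buffer_head { int refcount; };

    /* NULL-tolerant put, mirroring brelse()'s behaviour */
    static void brelse_model(struct buffer_head *bh)
    {
            if (bh == NULL)
                    return;
            if (--bh->refcount == 0)
                    free(bh);
    }

    int main(void)
    {
            struct buffer_head *bh = NULL;

            /* error path taken before the buffer was ever read: still safe */
            brelse_model(bh);

            bh = malloc(sizeof(*bh));
            if (bh) {
                    bh->refcount = 1;
                    brelse_model(bh);       /* last ref: freed */
            }
            return 0;
    }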
@@ -1284,8 +1131,7 @@ static int ocfs2_write_remove_suid(struct inode *inode)
1284 struct buffer_head *bh = NULL; 1131 struct buffer_head *bh = NULL;
1285 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1132 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1286 1133
1287 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), 1134 ret = ocfs2_read_block(inode, oi->ip_blkno, &bh);
1288 oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
1289 if (ret < 0) { 1135 if (ret < 0) {
1290 mlog_errno(ret); 1136 mlog_errno(ret);
1291 goto out; 1137 goto out;
@@ -1311,9 +1157,8 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode,
1311 struct buffer_head *di_bh = NULL; 1157 struct buffer_head *di_bh = NULL;
1312 1158
1313 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1159 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1314 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), 1160 ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno,
1315 OCFS2_I(inode)->ip_blkno, &di_bh, 1161 &di_bh);
1316 OCFS2_BH_CACHED, inode);
1317 if (ret) { 1162 if (ret) {
1318 mlog_errno(ret); 1163 mlog_errno(ret);
1319 goto out; 1164 goto out;
@@ -1394,8 +1239,11 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
1394 handle_t *handle; 1239 handle_t *handle;
1395 struct ocfs2_alloc_context *meta_ac = NULL; 1240 struct ocfs2_alloc_context *meta_ac = NULL;
1396 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 1241 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1242 struct ocfs2_extent_tree et;
1397 1243
1398 ret = ocfs2_lock_allocators(inode, di, 0, 1, NULL, &meta_ac); 1244 ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
1245
1246 ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
1399 if (ret) { 1247 if (ret) {
1400 mlog_errno(ret); 1248 mlog_errno(ret);
1401 return ret; 1249 return ret;
@@ -1425,7 +1273,7 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
1425 goto out; 1273 goto out;
1426 } 1274 }
1427 1275
1428 ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle, meta_ac, 1276 ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
1429 dealloc); 1277 dealloc);
1430 if (ret) { 1278 if (ret) {
1431 mlog_errno(ret); 1279 mlog_errno(ret);
@@ -2040,7 +1888,7 @@ out_dio:
2040 */ 1888 */
2041 if (old_size != i_size_read(inode) || 1889 if (old_size != i_size_read(inode) ||
2042 old_clusters != OCFS2_I(inode)->ip_clusters) { 1890 old_clusters != OCFS2_I(inode)->ip_clusters) {
2043 ret = journal_force_commit(osb->journal->j_journal); 1891 ret = jbd2_journal_force_commit(osb->journal->j_journal);
2044 if (ret < 0) 1892 if (ret < 0)
2045 written = ret; 1893 written = ret;
2046 } 1894 }
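The hunk above switches the post-write commit to jbd2_journal_force_commit() while keeping the same trigger: only force a commit when the O_DIRECT write actually changed i_size or the cluster count. A small model of that decision (struct and field names invented for the sketch):

    #include <stdio.h>
    #include <stdint.h>

    struct inode_model { uint64_t size; uint32_t clusters; };

    /* commit only when the write changed i_size or the cluster count */
    static void maybe_force_commit(const struct inode_model *before,
                                   const struct inode_model *after)
    {
            if (before->size != after->size ||
                before->clusters != after->clusters)
                    printf("forcing journal commit\n");
            else
                    printf("no commit needed\n");
    }

    int main(void)
    {
            struct inode_model old = { 4096, 1 }, cur = { 8192, 2 };

            maybe_force_commit(&old, &cur); /* allocating/extending write */
            maybe_force_commit(&cur, &cur); /* in-place overwrite */
            return 0;
    }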
@@ -2227,7 +2075,12 @@ const struct inode_operations ocfs2_file_iops = {
2227 .setattr = ocfs2_setattr, 2075 .setattr = ocfs2_setattr,
2228 .getattr = ocfs2_getattr, 2076 .getattr = ocfs2_getattr,
2229 .permission = ocfs2_permission, 2077 .permission = ocfs2_permission,
2078 .setxattr = generic_setxattr,
2079 .getxattr = generic_getxattr,
2080 .listxattr = ocfs2_listxattr,
2081 .removexattr = generic_removexattr,
2230 .fallocate = ocfs2_fallocate, 2082 .fallocate = ocfs2_fallocate,
2083 .fiemap = ocfs2_fiemap,
2231}; 2084};
2232 2085
2233const struct inode_operations ocfs2_special_file_iops = { 2086const struct inode_operations ocfs2_special_file_iops = {
@@ -2236,6 +2089,10 @@ const struct inode_operations ocfs2_special_file_iops = {
2236 .permission = ocfs2_permission, 2089 .permission = ocfs2_permission,
2237}; 2090};
2238 2091
2092/*
2093 * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
2094 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
2095 */
2239const struct file_operations ocfs2_fops = { 2096const struct file_operations ocfs2_fops = {
2240 .llseek = generic_file_llseek, 2097 .llseek = generic_file_llseek,
2241 .read = do_sync_read, 2098 .read = do_sync_read,
@@ -2250,6 +2107,7 @@ const struct file_operations ocfs2_fops = {
2250#ifdef CONFIG_COMPAT 2107#ifdef CONFIG_COMPAT
2251 .compat_ioctl = ocfs2_compat_ioctl, 2108 .compat_ioctl = ocfs2_compat_ioctl,
2252#endif 2109#endif
2110 .lock = ocfs2_lock,
2253 .flock = ocfs2_flock, 2111 .flock = ocfs2_flock,
2254 .splice_read = ocfs2_file_splice_read, 2112 .splice_read = ocfs2_file_splice_read,
2255 .splice_write = ocfs2_file_splice_write, 2113 .splice_write = ocfs2_file_splice_write,
@@ -2266,5 +2124,51 @@ const struct file_operations ocfs2_dops = {
2266#ifdef CONFIG_COMPAT 2124#ifdef CONFIG_COMPAT
2267 .compat_ioctl = ocfs2_compat_ioctl, 2125 .compat_ioctl = ocfs2_compat_ioctl,
2268#endif 2126#endif
2127 .lock = ocfs2_lock,
2128 .flock = ocfs2_flock,
2129};
2130
2131/*
2132 * POSIX-lockless variants of our file_operations.
2133 *
2134 * These will be used if the underlying cluster stack does not support
2135 * posix file locking, if the user passes the "localflocks" mount
2136 * option, or if we have a local-only fs.
2137 *
2138 * ocfs2_flock is in here because all stacks handle UNIX file locks,
2139 * so we still want it in the case of no stack support for
2140 * plocks. Internally, it will do the right thing when asked to ignore
2141 * the cluster.
2142 */
2143const struct file_operations ocfs2_fops_no_plocks = {
2144 .llseek = generic_file_llseek,
2145 .read = do_sync_read,
2146 .write = do_sync_write,
2147 .mmap = ocfs2_mmap,
2148 .fsync = ocfs2_sync_file,
2149 .release = ocfs2_file_release,
2150 .open = ocfs2_file_open,
2151 .aio_read = ocfs2_file_aio_read,
2152 .aio_write = ocfs2_file_aio_write,
2153 .unlocked_ioctl = ocfs2_ioctl,
2154#ifdef CONFIG_COMPAT
2155 .compat_ioctl = ocfs2_compat_ioctl,
2156#endif
2157 .flock = ocfs2_flock,
2158 .splice_read = ocfs2_file_splice_read,
2159 .splice_write = ocfs2_file_splice_write,
2160};
2161
2162const struct file_operations ocfs2_dops_no_plocks = {
2163 .llseek = generic_file_llseek,
2164 .read = generic_read_dir,
2165 .readdir = ocfs2_readdir,
2166 .fsync = ocfs2_sync_file,
2167 .release = ocfs2_dir_release,
2168 .open = ocfs2_dir_open,
2169 .unlocked_ioctl = ocfs2_ioctl,
2170#ifdef CONFIG_COMPAT
2171 .compat_ioctl = ocfs2_compat_ioctl,
2172#endif
2269 .flock = ocfs2_flock, 2173 .flock = ocfs2_flock,
2270}; 2174};
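With two file_operations variants per object type, some caller has to pick between them at inode setup; the ocfs2_populate_inode() hunk further down does exactly that. A compact model of the selection predicate, assuming the three inputs shown there (localflocks mount option, local mount, stack plock support):

    #include <stdio.h>

    struct file_operations { const char *label; };

    static const struct file_operations fops_plocks = { "plocks" };
    static const struct file_operations fops_no_plocks = { "no-plocks" };

    /* mirrors the use_plocks decision in ocfs2_populate_inode() */
    static const struct file_operations *
    pick_fops(int localflocks_opt, int mount_local, int stack_plocks)
    {
            if (localflocks_opt || mount_local || !stack_plocks)
                    return &fops_no_plocks;
            return &fops_plocks;
    }

    int main(void)
    {
            printf("%s\n", pick_fops(0, 0, 1)->label);      /* plocks */
            printf("%s\n", pick_fops(1, 0, 1)->label);      /* no-plocks */
            printf("%s\n", pick_fops(0, 0, 0)->label);      /* no-plocks */
            return 0;
    }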
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 1e27b4d017ea..e92382cbca5f 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -28,9 +28,12 @@
28 28
29extern const struct file_operations ocfs2_fops; 29extern const struct file_operations ocfs2_fops;
30extern const struct file_operations ocfs2_dops; 30extern const struct file_operations ocfs2_dops;
31extern const struct file_operations ocfs2_fops_no_plocks;
32extern const struct file_operations ocfs2_dops_no_plocks;
31extern const struct inode_operations ocfs2_file_iops; 33extern const struct inode_operations ocfs2_file_iops;
32extern const struct inode_operations ocfs2_special_file_iops; 34extern const struct inode_operations ocfs2_special_file_iops;
33struct ocfs2_alloc_context; 35struct ocfs2_alloc_context;
36enum ocfs2_alloc_restarted;
34 37
35struct ocfs2_file_private { 38struct ocfs2_file_private {
36 struct file *fp_file; 39 struct file *fp_file;
@@ -38,27 +41,18 @@ struct ocfs2_file_private {
38 struct ocfs2_lock_res fp_flock; 41 struct ocfs2_lock_res fp_flock;
39}; 42};
40 43
41enum ocfs2_alloc_restarted { 44int ocfs2_add_inode_data(struct ocfs2_super *osb,
42 RESTART_NONE = 0, 45 struct inode *inode,
43 RESTART_TRANS, 46 u32 *logical_offset,
44 RESTART_META 47 u32 clusters_to_add,
45}; 48 int mark_unwritten,
46int ocfs2_do_extend_allocation(struct ocfs2_super *osb, 49 struct buffer_head *fe_bh,
47 struct inode *inode, 50 handle_t *handle,
48 u32 *logical_offset, 51 struct ocfs2_alloc_context *data_ac,
49 u32 clusters_to_add, 52 struct ocfs2_alloc_context *meta_ac,
50 int mark_unwritten, 53 enum ocfs2_alloc_restarted *reason_ret);
51 struct buffer_head *fe_bh,
52 handle_t *handle,
53 struct ocfs2_alloc_context *data_ac,
54 struct ocfs2_alloc_context *meta_ac,
55 enum ocfs2_alloc_restarted *reason_ret);
56int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, 54int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
57 u64 zero_to); 55 u64 zero_to);
58int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
59 u32 clusters_to_add, u32 extents_to_split,
60 struct ocfs2_alloc_context **data_ac,
61 struct ocfs2_alloc_context **meta_ac);
62int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); 56int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
63int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, 57int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
64 struct kstat *stat); 58 struct kstat *stat);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 7e9e4c79aec7..4903688f72a9 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -49,6 +49,7 @@
49#include "symlink.h" 49#include "symlink.h"
50#include "sysfile.h" 50#include "sysfile.h"
51#include "uptodate.h" 51#include "uptodate.h"
52#include "xattr.h"
52 53
53#include "buffer_head_io.h" 54#include "buffer_head_io.h"
54 55
@@ -219,6 +220,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
219 struct super_block *sb; 220 struct super_block *sb;
220 struct ocfs2_super *osb; 221 struct ocfs2_super *osb;
221 int status = -EINVAL; 222 int status = -EINVAL;
223 int use_plocks = 1;
222 224
223 mlog_entry("(0x%p, size:%llu)\n", inode, 225 mlog_entry("(0x%p, size:%llu)\n", inode,
224 (unsigned long long)le64_to_cpu(fe->i_size)); 226 (unsigned long long)le64_to_cpu(fe->i_size));
@@ -226,6 +228,10 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
226 sb = inode->i_sb; 228 sb = inode->i_sb;
227 osb = OCFS2_SB(sb); 229 osb = OCFS2_SB(sb);
228 230
231 if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
232 ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks())
233 use_plocks = 0;
234
229 /* this means that read_inode cannot create a superblock inode 235 /* this means that read_inode cannot create a superblock inode
230 * today. change if needed. */ 236 * today. change if needed. */
231 if (!OCFS2_IS_VALID_DINODE(fe) || 237 if (!OCFS2_IS_VALID_DINODE(fe) ||
@@ -295,13 +301,19 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
295 301
296 switch (inode->i_mode & S_IFMT) { 302 switch (inode->i_mode & S_IFMT) {
297 case S_IFREG: 303 case S_IFREG:
298 inode->i_fop = &ocfs2_fops; 304 if (use_plocks)
305 inode->i_fop = &ocfs2_fops;
306 else
307 inode->i_fop = &ocfs2_fops_no_plocks;
299 inode->i_op = &ocfs2_file_iops; 308 inode->i_op = &ocfs2_file_iops;
300 i_size_write(inode, le64_to_cpu(fe->i_size)); 309 i_size_write(inode, le64_to_cpu(fe->i_size));
301 break; 310 break;
302 case S_IFDIR: 311 case S_IFDIR:
303 inode->i_op = &ocfs2_dir_iops; 312 inode->i_op = &ocfs2_dir_iops;
304 inode->i_fop = &ocfs2_dops; 313 if (use_plocks)
314 inode->i_fop = &ocfs2_dops;
315 else
316 inode->i_fop = &ocfs2_dops_no_plocks;
305 i_size_write(inode, le64_to_cpu(fe->i_size)); 317 i_size_write(inode, le64_to_cpu(fe->i_size));
306 break; 318 break;
307 case S_IFLNK: 319 case S_IFLNK:
@@ -448,8 +460,11 @@ static int ocfs2_read_locked_inode(struct inode *inode,
448 } 460 }
449 } 461 }
450 462
451 status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, 463 if (can_lock)
452 can_lock ? inode : NULL); 464 status = ocfs2_read_blocks(inode, args->fi_blkno, 1, &bh,
465 OCFS2_BH_IGNORE_CACHE);
466 else
467 status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
453 if (status < 0) { 468 if (status < 0) {
454 mlog_errno(status); 469 mlog_errno(status);
455 goto bail; 470 goto bail;
@@ -522,6 +537,9 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
522 * data and fast symlinks. 537 * data and fast symlinks.
523 */ 538 */
524 if (fe->i_clusters) { 539 if (fe->i_clusters) {
540 if (ocfs2_should_order_data(inode))
541 ocfs2_begin_ordered_truncate(inode, 0);
542
525 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 543 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
526 if (IS_ERR(handle)) { 544 if (IS_ERR(handle)) {
527 status = PTR_ERR(handle); 545 status = PTR_ERR(handle);
@@ -730,6 +748,13 @@ static int ocfs2_wipe_inode(struct inode *inode,
730 goto bail_unlock_dir; 748 goto bail_unlock_dir;
731 } 749 }
732 750
751 /*Free extended attribute resources associated with this inode.*/
752 status = ocfs2_xattr_remove(inode, di_bh);
753 if (status < 0) {
754 mlog_errno(status);
755 goto bail_unlock_dir;
756 }
757
733 status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode, 758 status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode,
734 orphan_dir_bh); 759 orphan_dir_bh);
735 if (status < 0) 760 if (status < 0)
@@ -1081,6 +1106,8 @@ void ocfs2_clear_inode(struct inode *inode)
1081 oi->ip_last_trans = 0; 1106 oi->ip_last_trans = 0;
1082 oi->ip_dir_start_lookup = 0; 1107 oi->ip_dir_start_lookup = 0;
1083 oi->ip_blkno = 0ULL; 1108 oi->ip_blkno = 0ULL;
1109 jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal,
1110 &oi->ip_jinode);
1084 1111
1085bail: 1112bail:
1086 mlog_exit_void(); 1113 mlog_exit_void();
@@ -1107,58 +1134,6 @@ void ocfs2_drop_inode(struct inode *inode)
1107} 1134}
1108 1135
1109/* 1136/*
1110 * TODO: this should probably be merged into ocfs2_get_block
1111 *
1112 * However, you now need to pay attention to the cont_prepare_write()
1113 * stuff in ocfs2_get_block (that is, ocfs2_get_block pretty much
1114 * expects never to extend).
1115 */
1116struct buffer_head *ocfs2_bread(struct inode *inode,
1117 int block, int *err, int reada)
1118{
1119 struct buffer_head *bh = NULL;
1120 int tmperr;
1121 u64 p_blkno;
1122 int readflags = OCFS2_BH_CACHED;
1123
1124 if (reada)
1125 readflags |= OCFS2_BH_READAHEAD;
1126
1127 if (((u64)block << inode->i_sb->s_blocksize_bits) >=
1128 i_size_read(inode)) {
1129 BUG_ON(!reada);
1130 return NULL;
1131 }
1132
1133 down_read(&OCFS2_I(inode)->ip_alloc_sem);
1134 tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
1135 NULL);
1136 up_read(&OCFS2_I(inode)->ip_alloc_sem);
1137 if (tmperr < 0) {
1138 mlog_errno(tmperr);
1139 goto fail;
1140 }
1141
1142 tmperr = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno, &bh,
1143 readflags, inode);
1144 if (tmperr < 0)
1145 goto fail;
1146
1147 tmperr = 0;
1148
1149 *err = 0;
1150 return bh;
1151
1152fail:
1153 if (bh) {
1154 brelse(bh);
1155 bh = NULL;
1156 }
1157 *err = -EIO;
1158 return NULL;
1159}
1160
1161/*
1162 * This is called from our getattr. 1137 * This is called from our getattr.
1163 */ 1138 */
1164int ocfs2_inode_revalidate(struct dentry *dentry) 1139int ocfs2_inode_revalidate(struct dentry *dentry)
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 390a85596aa0..2f37af9bcc4a 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -40,6 +40,9 @@ struct ocfs2_inode_info
40 /* protects allocation changes on this inode. */ 40 /* protects allocation changes on this inode. */
41 struct rw_semaphore ip_alloc_sem; 41 struct rw_semaphore ip_alloc_sem;
42 42
43 /* protects extended attribute changes on this inode */
44 struct rw_semaphore ip_xattr_sem;
45
43 /* These fields are protected by ip_lock */ 46 /* These fields are protected by ip_lock */
44 spinlock_t ip_lock; 47 spinlock_t ip_lock;
45 u32 ip_open_count; 48 u32 ip_open_count;
@@ -68,6 +71,7 @@ struct ocfs2_inode_info
68 struct ocfs2_extent_map ip_extent_map; 71 struct ocfs2_extent_map ip_extent_map;
69 72
70 struct inode vfs_inode; 73 struct inode vfs_inode;
74 struct jbd2_inode ip_jinode;
71}; 75};
72 76
73/* 77/*
@@ -113,8 +117,6 @@ extern struct kmem_cache *ocfs2_inode_cache;
113 117
114extern const struct address_space_operations ocfs2_aops; 118extern const struct address_space_operations ocfs2_aops;
115 119
116struct buffer_head *ocfs2_bread(struct inode *inode, int block,
117 int *err, int reada);
118void ocfs2_clear_inode(struct inode *inode); 120void ocfs2_clear_inode(struct inode *inode);
119void ocfs2_delete_inode(struct inode *inode); 121void ocfs2_delete_inode(struct inode *inode);
120void ocfs2_drop_inode(struct inode *inode); 122void ocfs2_drop_inode(struct inode *inode);
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 7b142f0ce995..9fcd36dcc9a0 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -102,8 +102,7 @@ bail_unlock:
102bail: 102bail:
103 mutex_unlock(&inode->i_mutex); 103 mutex_unlock(&inode->i_mutex);
104 104
105 if (bh) 105 brelse(bh);
106 brelse(bh);
107 106
108 mlog_exit(status); 107 mlog_exit(status);
109 return status; 108 return status;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 7a37240f7a31..81e40677eecb 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -215,9 +215,9 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
215 goto finally; 215 goto finally;
216 } 216 }
217 217
218 journal_lock_updates(journal->j_journal); 218 jbd2_journal_lock_updates(journal->j_journal);
219 status = journal_flush(journal->j_journal); 219 status = jbd2_journal_flush(journal->j_journal);
220 journal_unlock_updates(journal->j_journal); 220 jbd2_journal_unlock_updates(journal->j_journal);
221 if (status < 0) { 221 if (status < 0) {
222 up_write(&journal->j_trans_barrier); 222 up_write(&journal->j_trans_barrier);
223 mlog_errno(status); 223 mlog_errno(status);
@@ -264,7 +264,7 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
264 264
265 down_read(&osb->journal->j_trans_barrier); 265 down_read(&osb->journal->j_trans_barrier);
266 266
267 handle = journal_start(journal, max_buffs); 267 handle = jbd2_journal_start(journal, max_buffs);
268 if (IS_ERR(handle)) { 268 if (IS_ERR(handle)) {
269 up_read(&osb->journal->j_trans_barrier); 269 up_read(&osb->journal->j_trans_barrier);
270 270
@@ -290,7 +290,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
290 290
291 BUG_ON(!handle); 291 BUG_ON(!handle);
292 292
293 ret = journal_stop(handle); 293 ret = jbd2_journal_stop(handle);
294 if (ret < 0) 294 if (ret < 0)
295 mlog_errno(ret); 295 mlog_errno(ret);
296 296
@@ -304,7 +304,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
304 * transaction. extend_trans will either extend the current handle by 304 * transaction. extend_trans will either extend the current handle by
305 * nblocks, or commit it and start a new one with nblocks credits. 305 * nblocks, or commit it and start a new one with nblocks credits.
306 * 306 *
307 * This might call journal_restart() which will commit dirty buffers 307 * This might call jbd2_journal_restart() which will commit dirty buffers
308 * and then restart the transaction. Before calling 308 * and then restart the transaction. Before calling
309 * ocfs2_extend_trans(), any changed blocks should have been 309 * ocfs2_extend_trans(), any changed blocks should have been
310 * dirtied. After calling it, all blocks which need to be changed must 310 * dirtied. After calling it, all blocks which need to be changed must
@@ -332,7 +332,7 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
332#ifdef CONFIG_OCFS2_DEBUG_FS 332#ifdef CONFIG_OCFS2_DEBUG_FS
333 status = 1; 333 status = 1;
334#else 334#else
335 status = journal_extend(handle, nblocks); 335 status = jbd2_journal_extend(handle, nblocks);
336 if (status < 0) { 336 if (status < 0) {
337 mlog_errno(status); 337 mlog_errno(status);
338 goto bail; 338 goto bail;
@@ -340,8 +340,10 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
340#endif 340#endif
341 341
342 if (status > 0) { 342 if (status > 0) {
343 mlog(0, "journal_extend failed, trying journal_restart\n"); 343 mlog(0,
344 status = journal_restart(handle, nblocks); 344 "jbd2_journal_extend failed, trying "
345 "jbd2_journal_restart\n");
346 status = jbd2_journal_restart(handle, nblocks);
345 if (status < 0) { 347 if (status < 0) {
346 mlog_errno(status); 348 mlog_errno(status);
347 goto bail; 349 goto bail;
@@ -393,11 +395,11 @@ int ocfs2_journal_access(handle_t *handle,
393 switch (type) { 395 switch (type) {
394 case OCFS2_JOURNAL_ACCESS_CREATE: 396 case OCFS2_JOURNAL_ACCESS_CREATE:
395 case OCFS2_JOURNAL_ACCESS_WRITE: 397 case OCFS2_JOURNAL_ACCESS_WRITE:
396 status = journal_get_write_access(handle, bh); 398 status = jbd2_journal_get_write_access(handle, bh);
397 break; 399 break;
398 400
399 case OCFS2_JOURNAL_ACCESS_UNDO: 401 case OCFS2_JOURNAL_ACCESS_UNDO:
400 status = journal_get_undo_access(handle, bh); 402 status = jbd2_journal_get_undo_access(handle, bh);
401 break; 403 break;
402 404
403 default: 405 default:
@@ -422,7 +424,7 @@ int ocfs2_journal_dirty(handle_t *handle,
422 mlog_entry("(bh->b_blocknr=%llu)\n", 424 mlog_entry("(bh->b_blocknr=%llu)\n",
423 (unsigned long long)bh->b_blocknr); 425 (unsigned long long)bh->b_blocknr);
424 426
425 status = journal_dirty_metadata(handle, bh); 427 status = jbd2_journal_dirty_metadata(handle, bh);
426 if (status < 0) 428 if (status < 0)
427 mlog(ML_ERROR, "Could not dirty metadata buffer. " 429 mlog(ML_ERROR, "Could not dirty metadata buffer. "
428 "(bh->b_blocknr=%llu)\n", 430 "(bh->b_blocknr=%llu)\n",
@@ -432,6 +434,7 @@ int ocfs2_journal_dirty(handle_t *handle,
432 return status; 434 return status;
433} 435}
434 436
437#ifdef CONFIG_OCFS2_COMPAT_JBD
435int ocfs2_journal_dirty_data(handle_t *handle, 438int ocfs2_journal_dirty_data(handle_t *handle,
436 struct buffer_head *bh) 439 struct buffer_head *bh)
437{ 440{
@@ -443,8 +446,9 @@ int ocfs2_journal_dirty_data(handle_t *handle,
443 446
444 return err; 447 return err;
445} 448}
449#endif
446 450
447#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD_DEFAULT_MAX_COMMIT_AGE) 451#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
448 452
449void ocfs2_set_journal_params(struct ocfs2_super *osb) 453void ocfs2_set_journal_params(struct ocfs2_super *osb)
450{ 454{
@@ -457,9 +461,9 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb)
457 spin_lock(&journal->j_state_lock); 461 spin_lock(&journal->j_state_lock);
458 journal->j_commit_interval = commit_interval; 462 journal->j_commit_interval = commit_interval;
459 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) 463 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
460 journal->j_flags |= JFS_BARRIER; 464 journal->j_flags |= JBD2_BARRIER;
461 else 465 else
462 journal->j_flags &= ~JFS_BARRIER; 466 journal->j_flags &= ~JBD2_BARRIER;
463 spin_unlock(&journal->j_state_lock); 467 spin_unlock(&journal->j_state_lock);
464} 468}
465 469
@@ -524,14 +528,14 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
524 mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters); 528 mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters);
525 529
526 /* call the kernels journal init function now */ 530 /* call the kernels journal init function now */
527 j_journal = journal_init_inode(inode); 531 j_journal = jbd2_journal_init_inode(inode);
528 if (j_journal == NULL) { 532 if (j_journal == NULL) {
529 mlog(ML_ERROR, "Linux journal layer error\n"); 533 mlog(ML_ERROR, "Linux journal layer error\n");
530 status = -EINVAL; 534 status = -EINVAL;
531 goto done; 535 goto done;
532 } 536 }
533 537
534 mlog(0, "Returned from journal_init_inode\n"); 538 mlog(0, "Returned from jbd2_journal_init_inode\n");
535 mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen); 539 mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen);
536 540
537 *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) & 541 *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
@@ -550,8 +554,7 @@ done:
550 if (status < 0) { 554 if (status < 0) {
551 if (inode_lock) 555 if (inode_lock)
552 ocfs2_inode_unlock(inode, 1); 556 ocfs2_inode_unlock(inode, 1);
553 if (bh != NULL) 557 brelse(bh);
554 brelse(bh);
555 if (inode) { 558 if (inode) {
556 OCFS2_I(inode)->ip_open_count--; 559 OCFS2_I(inode)->ip_open_count--;
557 iput(inode); 560 iput(inode);
@@ -639,7 +642,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
639 if (journal->j_state != OCFS2_JOURNAL_LOADED) 642 if (journal->j_state != OCFS2_JOURNAL_LOADED)
640 goto done; 643 goto done;
641 644
642 /* need to inc inode use count as journal_destroy will iput. */ 645 /* need to inc inode use count - jbd2_journal_destroy will iput. */
643 if (!igrab(inode)) 646 if (!igrab(inode))
644 BUG(); 647 BUG();
645 648
@@ -668,9 +671,9 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
668 BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0); 671 BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0);
669 672
670 if (ocfs2_mount_local(osb)) { 673 if (ocfs2_mount_local(osb)) {
671 journal_lock_updates(journal->j_journal); 674 jbd2_journal_lock_updates(journal->j_journal);
672 status = journal_flush(journal->j_journal); 675 status = jbd2_journal_flush(journal->j_journal);
673 journal_unlock_updates(journal->j_journal); 676 jbd2_journal_unlock_updates(journal->j_journal);
674 if (status < 0) 677 if (status < 0)
675 mlog_errno(status); 678 mlog_errno(status);
676 } 679 }
@@ -686,7 +689,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
686 } 689 }
687 690
688 /* Shutdown the kernel journal system */ 691 /* Shutdown the kernel journal system */
689 journal_destroy(journal->j_journal); 692 jbd2_journal_destroy(journal->j_journal);
690 693
691 OCFS2_I(inode)->ip_open_count--; 694 OCFS2_I(inode)->ip_open_count--;
692 695
@@ -711,15 +714,15 @@ static void ocfs2_clear_journal_error(struct super_block *sb,
711{ 714{
712 int olderr; 715 int olderr;
713 716
714 olderr = journal_errno(journal); 717 olderr = jbd2_journal_errno(journal);
715 if (olderr) { 718 if (olderr) {
716 mlog(ML_ERROR, "File system error %d recorded in " 719 mlog(ML_ERROR, "File system error %d recorded in "
717 "journal %u.\n", olderr, slot); 720 "journal %u.\n", olderr, slot);
718 mlog(ML_ERROR, "File system on device %s needs checking.\n", 721 mlog(ML_ERROR, "File system on device %s needs checking.\n",
719 sb->s_id); 722 sb->s_id);
720 723
721 journal_ack_err(journal); 724 jbd2_journal_ack_err(journal);
722 journal_clear_err(journal); 725 jbd2_journal_clear_err(journal);
723 } 726 }
724} 727}
725 728
@@ -734,7 +737,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
734 737
735 osb = journal->j_osb; 738 osb = journal->j_osb;
736 739
737 status = journal_load(journal->j_journal); 740 status = jbd2_journal_load(journal->j_journal);
738 if (status < 0) { 741 if (status < 0) {
739 mlog(ML_ERROR, "Failed to load journal!\n"); 742 mlog(ML_ERROR, "Failed to load journal!\n");
740 goto done; 743 goto done;
@@ -778,7 +781,7 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
778 781
779 BUG_ON(!journal); 782 BUG_ON(!journal);
780 783
781 status = journal_wipe(journal->j_journal, full); 784 status = jbd2_journal_wipe(journal->j_journal, full);
782 if (status < 0) { 785 if (status < 0) {
783 mlog_errno(status); 786 mlog_errno(status);
784 goto bail; 787 goto bail;
@@ -847,9 +850,8 @@ static int ocfs2_force_read_journal(struct inode *inode)
847 850
848 /* We are reading journal data which should not 851 /* We are reading journal data which should not
849 * be put in the uptodate cache */ 852 * be put in the uptodate cache */
850 status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb), 853 status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb),
851 p_blkno, p_blocks, bhs, 0, 854 p_blkno, p_blocks, bhs);
852 NULL);
853 if (status < 0) { 855 if (status < 0) {
854 mlog_errno(status); 856 mlog_errno(status);
855 goto bail; 857 goto bail;
@@ -865,8 +867,7 @@ static int ocfs2_force_read_journal(struct inode *inode)
865 867
866bail: 868bail:
867 for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++) 869 for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
868 if (bhs[i]) 870 brelse(bhs[i]);
869 brelse(bhs[i]);
870 mlog_exit(status); 871 mlog_exit(status);
871 return status; 872 return status;
872} 873}
@@ -1133,7 +1134,8 @@ static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
1133 } 1134 }
1134 SET_INODE_JOURNAL(inode); 1135 SET_INODE_JOURNAL(inode);
1135 1136
1136 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, bh, 0, inode); 1137 status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, bh,
1138 OCFS2_BH_IGNORE_CACHE);
1137 if (status < 0) { 1139 if (status < 0) {
1138 mlog_errno(status); 1140 mlog_errno(status);
1139 goto bail; 1141 goto bail;
@@ -1229,19 +1231,19 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1229 } 1231 }
1230 1232
1231 mlog(0, "calling journal_init_inode\n"); 1233 mlog(0, "calling journal_init_inode\n");
1232 journal = journal_init_inode(inode); 1234 journal = jbd2_journal_init_inode(inode);
1233 if (journal == NULL) { 1235 if (journal == NULL) {
1234 mlog(ML_ERROR, "Linux journal layer error\n"); 1236 mlog(ML_ERROR, "Linux journal layer error\n");
1235 status = -EIO; 1237 status = -EIO;
1236 goto done; 1238 goto done;
1237 } 1239 }
1238 1240
1239 status = journal_load(journal); 1241 status = jbd2_journal_load(journal);
1240 if (status < 0) { 1242 if (status < 0) {
1241 mlog_errno(status); 1243 mlog_errno(status);
1242 if (!igrab(inode)) 1244 if (!igrab(inode))
1243 BUG(); 1245 BUG();
1244 journal_destroy(journal); 1246 jbd2_journal_destroy(journal);
1245 goto done; 1247 goto done;
1246 } 1248 }
1247 1249
@@ -1249,9 +1251,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1249 1251
1250 /* wipe the journal */ 1252 /* wipe the journal */
1251 mlog(0, "flushing the journal.\n"); 1253 mlog(0, "flushing the journal.\n");
1252 journal_lock_updates(journal); 1254 jbd2_journal_lock_updates(journal);
1253 status = journal_flush(journal); 1255 status = jbd2_journal_flush(journal);
1254 journal_unlock_updates(journal); 1256 jbd2_journal_unlock_updates(journal);
1255 if (status < 0) 1257 if (status < 0)
1256 mlog_errno(status); 1258 mlog_errno(status);
1257 1259
@@ -1272,7 +1274,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1272 if (!igrab(inode)) 1274 if (!igrab(inode))
1273 BUG(); 1275 BUG();
1274 1276
1275 journal_destroy(journal); 1277 jbd2_journal_destroy(journal);
1276 1278
1277done: 1279done:
1278 /* drop the lock on this nodes journal */ 1280 /* drop the lock on this nodes journal */
@@ -1282,8 +1284,7 @@ done:
1282 if (inode) 1284 if (inode)
1283 iput(inode); 1285 iput(inode);
1284 1286
1285 if (bh) 1287 brelse(bh);
1286 brelse(bh);
1287 1288
1288 mlog_exit(status); 1289 mlog_exit(status);
1289 return status; 1290 return status;
@@ -1418,13 +1419,13 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
1418{ 1419{
1419 unsigned int node_num; 1420 unsigned int node_num;
1420 int status, i; 1421 int status, i;
1422 u32 gen;
1421 struct buffer_head *bh = NULL; 1423 struct buffer_head *bh = NULL;
1422 struct ocfs2_dinode *di; 1424 struct ocfs2_dinode *di;
1423 1425
1424 /* This is called with the super block cluster lock, so we 1426 /* This is called with the super block cluster lock, so we
1425 * know that the slot map can't change underneath us. */ 1427 * know that the slot map can't change underneath us. */
1426 1428
1427 spin_lock(&osb->osb_lock);
1428 for (i = 0; i < osb->max_slots; i++) { 1429 for (i = 0; i < osb->max_slots; i++) {
1429 /* Read journal inode to get the recovery generation */ 1430 /* Read journal inode to get the recovery generation */
1430 status = ocfs2_read_journal_inode(osb, i, &bh, NULL); 1431 status = ocfs2_read_journal_inode(osb, i, &bh, NULL);
@@ -1433,23 +1434,31 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
1433 goto bail; 1434 goto bail;
1434 } 1435 }
1435 di = (struct ocfs2_dinode *)bh->b_data; 1436 di = (struct ocfs2_dinode *)bh->b_data;
1436 osb->slot_recovery_generations[i] = 1437 gen = ocfs2_get_recovery_generation(di);
1437 ocfs2_get_recovery_generation(di);
1438 brelse(bh); 1438 brelse(bh);
1439 bh = NULL; 1439 bh = NULL;
1440 1440
1441 spin_lock(&osb->osb_lock);
1442 osb->slot_recovery_generations[i] = gen;
1443
1441 mlog(0, "Slot %u recovery generation is %u\n", i, 1444 mlog(0, "Slot %u recovery generation is %u\n", i,
1442 osb->slot_recovery_generations[i]); 1445 osb->slot_recovery_generations[i]);
1443 1446
1444 if (i == osb->slot_num) 1447 if (i == osb->slot_num) {
1448 spin_unlock(&osb->osb_lock);
1445 continue; 1449 continue;
1450 }
1446 1451
1447 status = ocfs2_slot_to_node_num_locked(osb, i, &node_num); 1452 status = ocfs2_slot_to_node_num_locked(osb, i, &node_num);
1448 if (status == -ENOENT) 1453 if (status == -ENOENT) {
1454 spin_unlock(&osb->osb_lock);
1449 continue; 1455 continue;
1456 }
1450 1457
1451 if (__ocfs2_recovery_map_test(osb, node_num)) 1458 if (__ocfs2_recovery_map_test(osb, node_num)) {
1459 spin_unlock(&osb->osb_lock);
1452 continue; 1460 continue;
1461 }
1453 spin_unlock(&osb->osb_lock); 1462 spin_unlock(&osb->osb_lock);
1454 1463
1455 /* Ok, we have a slot occupied by another node which 1464 /* Ok, we have a slot occupied by another node which
@@ -1465,10 +1474,7 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
1465 mlog_errno(status); 1474 mlog_errno(status);
1466 goto bail; 1475 goto bail;
1467 } 1476 }
1468
1469 spin_lock(&osb->osb_lock);
1470 } 1477 }
1471 spin_unlock(&osb->osb_lock);
1472 1478
1473 status = 0; 1479 status = 0;
1474bail: 1480bail:
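The ocfs2_mark_dead_nodes() hunk above narrows the osb_lock critical section: instead of holding the spinlock across ocfs2_read_journal_inode(), which can block on I/O, the lock is now taken per slot just around the shared-state update. A userspace model of that narrowing (names and the fake disk read are invented):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_spinlock_t osb_lock;
    static unsigned int recovery_gen[4];

    /* stand-in for the journal-inode read, which can block */
    static unsigned int read_gen_from_disk(int slot)
    {
            return 100u + slot;
    }

    int main(void)
    {
            pthread_spin_init(&osb_lock, PTHREAD_PROCESS_PRIVATE);

            for (int slot = 0; slot < 4; slot++) {
                    /* do the blocking work with the lock dropped... */
                    unsigned int gen = read_gen_from_disk(slot);

                    /* ...then take it only around the shared update */
                    pthread_spin_lock(&osb_lock);
                    recovery_gen[slot] = gen;
                    pthread_spin_unlock(&osb_lock);

                    printf("slot %d: generation %u\n", slot, gen);
            }
            pthread_spin_destroy(&osb_lock);
            return 0;
    }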
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 2178ebffa05f..d4d14e9a3cea 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -27,7 +27,12 @@
27#define OCFS2_JOURNAL_H 27#define OCFS2_JOURNAL_H
28 28
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/jbd.h> 30#ifndef CONFIG_OCFS2_COMPAT_JBD
31# include <linux/jbd2.h>
32#else
33# include <linux/jbd.h>
34# include "ocfs2_jbd_compat.h"
35#endif
31 36
32enum ocfs2_journal_state { 37enum ocfs2_journal_state {
33 OCFS2_JOURNAL_FREE = 0, 38 OCFS2_JOURNAL_FREE = 0,
@@ -215,8 +220,8 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
215 * buffer. Will have to call ocfs2_journal_dirty once 220 * buffer. Will have to call ocfs2_journal_dirty once
216 * we've actually dirtied it. Type is one of . or . 221 * we've actually dirtied it. Type is one of . or .
217 * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. 222 * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data.
218 * ocfs2_journal_dirty_data - Indicate that a data buffer should go out before 223 * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before
219 * the current handle commits. 224 * the current handle commits.
220 */ 225 */
221 226
222/* You must always start_trans with a number of buffs > 0, but it's 227/* You must always start_trans with a number of buffs > 0, but it's
@@ -268,8 +273,10 @@ int ocfs2_journal_access(handle_t *handle,
268 */ 273 */
269int ocfs2_journal_dirty(handle_t *handle, 274int ocfs2_journal_dirty(handle_t *handle,
270 struct buffer_head *bh); 275 struct buffer_head *bh);
276#ifdef CONFIG_OCFS2_COMPAT_JBD
271int ocfs2_journal_dirty_data(handle_t *handle, 277int ocfs2_journal_dirty_data(handle_t *handle,
272 struct buffer_head *bh); 278 struct buffer_head *bh);
279#endif
273 280
274/* 281/*
275 * Credit Macros: 282 * Credit Macros:
@@ -283,6 +290,9 @@ int ocfs2_journal_dirty_data(handle_t *handle,
283/* simple file updates like chmod, etc. */ 290/* simple file updates like chmod, etc. */
284#define OCFS2_INODE_UPDATE_CREDITS 1 291#define OCFS2_INODE_UPDATE_CREDITS 1
285 292
293/* extended attribute block update */
294#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1
295
286/* group extend. inode update and last group update. */ 296/* group extend. inode update and last group update. */
287#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) 297#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
288 298
@@ -340,11 +350,23 @@ int ocfs2_journal_dirty_data(handle_t *handle,
340#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \ 350#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \
341 + OCFS2_UNLINK_CREDITS) 351 + OCFS2_UNLINK_CREDITS)
342 352
353/* global bitmap dinode, group desc., relinked group,
354 * suballocator dinode, group desc., relinked group,
355 * dinode, xattr block */
356#define OCFS2_XATTR_BLOCK_CREATE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + \
357 + OCFS2_INODE_UPDATE_CREDITS \
358 + OCFS2_XATTR_BLOCK_UPDATE_CREDITS)
359
360/*
361 * Please note that the caller must make sure that root_el is the root
362 * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
363 * the result may be wrong.
364 */
343static inline int ocfs2_calc_extend_credits(struct super_block *sb, 365static inline int ocfs2_calc_extend_credits(struct super_block *sb,
344 struct ocfs2_dinode *fe, 366 struct ocfs2_extent_list *root_el,
345 u32 bits_wanted) 367 u32 bits_wanted)
346{ 368{
347 int bitmap_blocks, sysfile_bitmap_blocks, dinode_blocks; 369 int bitmap_blocks, sysfile_bitmap_blocks, extent_blocks;
348 370
349 /* bitmap dinode, group desc. + relinked group. */ 371 /* bitmap dinode, group desc. + relinked group. */
350 bitmap_blocks = OCFS2_SUBALLOC_ALLOC; 372 bitmap_blocks = OCFS2_SUBALLOC_ALLOC;
@@ -355,16 +377,16 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
355 * however many metadata chunks needed * a remaining suballoc 377 * however many metadata chunks needed * a remaining suballoc
356 * alloc. */ 378 * alloc. */
357 sysfile_bitmap_blocks = 1 + 379 sysfile_bitmap_blocks = 1 +
358 (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(fe); 380 (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(root_el);
359 381
360 /* this does not include *new* metadata blocks, which are 382 /* this does not include *new* metadata blocks, which are
361 * accounted for in sysfile_bitmap_blocks. fe + 383 * accounted for in sysfile_bitmap_blocks. root_el +
362 * prev. last_eb_blk + blocks along edge of tree. 384 * prev. last_eb_blk + blocks along edge of tree.
363 * calc_symlink_credits passes because we just need 1 385 * calc_symlink_credits passes because we just need 1
364 * credit for the dinode there. */ 386 * credit for the dinode there. */
365 dinode_blocks = 1 + 1 + le16_to_cpu(fe->id2.i_list.l_tree_depth); 387 extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
366 388
367 return bitmap_blocks + sysfile_bitmap_blocks + dinode_blocks; 389 return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks;
368} 390}
369 391
370static inline int ocfs2_calc_symlink_credits(struct super_block *sb) 392static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
@@ -415,4 +437,16 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
415 return credits; 437 return credits;
416} 438}
417 439
440static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
441{
442 return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode);
443}
444
445static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
446 loff_t new_size)
447{
448 return jbd2_journal_begin_ordered_truncate(&OCFS2_I(inode)->ip_jinode,
449 new_size);
450}
451
418#endif /* OCFS2_JOURNAL_H */ 452#endif /* OCFS2_JOURNAL_H */
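ocfs2_calc_extend_credits() now takes the extent tree's root extent list instead of the whole dinode, but the arithmetic is unchanged: suballocator credits, sysfile bitmap credits scaled by how many metadata chunks may be needed, and one credit per block along the tree edge. A standalone sketch of the formula (OCFS2_SUBALLOC_ALLOC's value and the sample inputs are illustrative; meta_chunks_needed stands in for ocfs2_extend_meta_needed(root_el)):

    #include <stdio.h>

    #define OCFS2_SUBALLOC_ALLOC 3  /* illustrative value, not from this diff */

    /* tree_depth models le16_to_cpu(root_el->l_tree_depth) */
    static int calc_extend_credits(int tree_depth, int meta_chunks_needed)
    {
            int bitmap_blocks = OCFS2_SUBALLOC_ALLOC;
            int sysfile_bitmap_blocks =
                    1 + (OCFS2_SUBALLOC_ALLOC - 1) * meta_chunks_needed;
            int extent_blocks = 1 + 1 + tree_depth;

            return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks;
    }

    int main(void)
    {
            printf("depth 0, no new meta: %d credits\n",
                   calc_extend_credits(0, 0));
            printf("depth 2, one meta chunk: %d credits\n",
                   calc_extend_credits(2, 1));
            return 0;
    }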
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 28e492e4ec88..687b28713c32 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -28,6 +28,7 @@
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/bitops.h> 30#include <linux/bitops.h>
31#include <linux/debugfs.h>
31 32
32#define MLOG_MASK_PREFIX ML_DISK_ALLOC 33#define MLOG_MASK_PREFIX ML_DISK_ALLOC
33#include <cluster/masklog.h> 34#include <cluster/masklog.h>
@@ -47,8 +48,6 @@
47 48
48#define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab)) 49#define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab))
49 50
50static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb);
51
52static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc); 51static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
53 52
54static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, 53static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
@@ -75,24 +74,129 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
75static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, 74static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
76 struct inode *local_alloc_inode); 75 struct inode *local_alloc_inode);
77 76
78static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb) 77#ifdef CONFIG_OCFS2_FS_STATS
78
79static int ocfs2_la_debug_open(struct inode *inode, struct file *file)
80{
81 file->private_data = inode->i_private;
82 return 0;
83}
84
85#define LA_DEBUG_BUF_SZ PAGE_CACHE_SIZE
86#define LA_DEBUG_VER 1
87static ssize_t ocfs2_la_debug_read(struct file *file, char __user *userbuf,
88 size_t count, loff_t *ppos)
89{
90 static DEFINE_MUTEX(la_debug_mutex);
91 struct ocfs2_super *osb = file->private_data;
92 int written, ret;
93 char *buf = osb->local_alloc_debug_buf;
94
95 mutex_lock(&la_debug_mutex);
96 memset(buf, 0, LA_DEBUG_BUF_SZ);
97
98 written = snprintf(buf, LA_DEBUG_BUF_SZ,
99 "0x%x\t0x%llx\t%u\t%u\t0x%x\n",
100 LA_DEBUG_VER,
101 (unsigned long long)osb->la_last_gd,
102 osb->local_alloc_default_bits,
103 osb->local_alloc_bits, osb->local_alloc_state);
104
105 ret = simple_read_from_buffer(userbuf, count, ppos, buf, written);
106
107 mutex_unlock(&la_debug_mutex);
108 return ret;
109}
110
111static const struct file_operations ocfs2_la_debug_fops = {
112 .open = ocfs2_la_debug_open,
113 .read = ocfs2_la_debug_read,
114};
115
116static void ocfs2_init_la_debug(struct ocfs2_super *osb)
117{
118 osb->local_alloc_debug_buf = kmalloc(LA_DEBUG_BUF_SZ, GFP_NOFS);
119 if (!osb->local_alloc_debug_buf)
120 return;
121
122 osb->local_alloc_debug = debugfs_create_file("local_alloc_stats",
123 S_IFREG|S_IRUSR,
124 osb->osb_debug_root,
125 osb,
126 &ocfs2_la_debug_fops);
127 if (!osb->local_alloc_debug) {
128 kfree(osb->local_alloc_debug_buf);
129 osb->local_alloc_debug_buf = NULL;
130 }
131}
132
133static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
134{
135 if (osb->local_alloc_debug)
136 debugfs_remove(osb->local_alloc_debug);
137
138 if (osb->local_alloc_debug_buf)
139 kfree(osb->local_alloc_debug_buf);
140
141 osb->local_alloc_debug_buf = NULL;
142 osb->local_alloc_debug = NULL;
143}
144#else /* CONFIG_OCFS2_FS_STATS */
145static void ocfs2_init_la_debug(struct ocfs2_super *osb)
146{
147 return;
148}
149static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
150{
151 return;
152}
153#endif
154
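When CONFIG_OCFS2_FS_STATS is enabled, reading the new local_alloc_stats debugfs file returns a single record in the snprintf format above: version, la_last_gd, default window bits, current window bits, and state, tab-separated. A hypothetical read (the path under osb_debug_root and all field values are invented for illustration) might return:

    0x1	0x0	1024	512	0x2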
155static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
79{ 156{
80 BUG_ON(osb->s_clustersize_bits > 20); 157 return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
158 osb->local_alloc_state == OCFS2_LA_ENABLED);
159}
81 160
82 /* Size local alloc windows by the megabyte */ 161void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
83 return osb->local_alloc_size << (20 - osb->s_clustersize_bits); 162 unsigned int num_clusters)
163{
164 spin_lock(&osb->osb_lock);
165 if (osb->local_alloc_state == OCFS2_LA_DISABLED ||
166 osb->local_alloc_state == OCFS2_LA_THROTTLED)
167 if (num_clusters >= osb->local_alloc_default_bits) {
168 cancel_delayed_work(&osb->la_enable_wq);
169 osb->local_alloc_state = OCFS2_LA_ENABLED;
170 }
171 spin_unlock(&osb->osb_lock);
172}
173
174void ocfs2_la_enable_worker(struct work_struct *work)
175{
176 struct ocfs2_super *osb =
177 container_of(work, struct ocfs2_super,
178 la_enable_wq.work);
179 spin_lock(&osb->osb_lock);
180 osb->local_alloc_state = OCFS2_LA_ENABLED;
181 spin_unlock(&osb->osb_lock);
84} 182}
85 183
86/* 184/*
87 * Tell us whether a given allocation should use the local alloc 185 * Tell us whether a given allocation should use the local alloc
88 * file. Otherwise, it has to go to the main bitmap. 186 * file. Otherwise, it has to go to the main bitmap.
187 *
188 * This function does semi-dirty reads of local alloc size and state!
189 * This is ok however, as the values are re-checked once under mutex.
89 */ 190 */
90int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits) 191int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
91{ 192{
92 int la_bits = ocfs2_local_alloc_window_bits(osb);
93 int ret = 0; 193 int ret = 0;
194 int la_bits;
195
196 spin_lock(&osb->osb_lock);
197 la_bits = osb->local_alloc_bits;
94 198
95 if (osb->local_alloc_state != OCFS2_LA_ENABLED) 199 if (!ocfs2_la_state_enabled(osb))
96 goto bail; 200 goto bail;
97 201
98 /* la_bits should be at least twice the size (in clusters) of 202 /* la_bits should be at least twice the size (in clusters) of
@@ -106,6 +210,7 @@ int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
106bail: 210bail:
107 mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n", 211 mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n",
108 osb->local_alloc_state, (unsigned long long)bits, la_bits, ret); 212 osb->local_alloc_state, (unsigned long long)bits, la_bits, ret);
213 spin_unlock(&osb->osb_lock);
109 return ret; 214 return ret;
110} 215}
111 216
@@ -120,14 +225,18 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
120 225
121 mlog_entry_void(); 226 mlog_entry_void();
122 227
123 if (osb->local_alloc_size == 0) 228 ocfs2_init_la_debug(osb);
229
230 if (osb->local_alloc_bits == 0)
124 goto bail; 231 goto bail;
125 232
126 if (ocfs2_local_alloc_window_bits(osb) >= osb->bitmap_cpg) { 233 if (osb->local_alloc_bits >= osb->bitmap_cpg) {
127 mlog(ML_NOTICE, "Requested local alloc window %d is larger " 234 mlog(ML_NOTICE, "Requested local alloc window %d is larger "
128 "than max possible %u. Using defaults.\n", 235 "than max possible %u. Using defaults.\n",
129 ocfs2_local_alloc_window_bits(osb), (osb->bitmap_cpg - 1)); 236 osb->local_alloc_bits, (osb->bitmap_cpg - 1));
130 osb->local_alloc_size = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE; 237 osb->local_alloc_bits =
238 ocfs2_megabytes_to_clusters(osb->sb,
239 OCFS2_DEFAULT_LOCAL_ALLOC_SIZE);
131 } 240 }
132 241
133 /* read the alloc off disk */ 242 /* read the alloc off disk */
@@ -139,8 +248,8 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
139 goto bail; 248 goto bail;
140 } 249 }
141 250
142 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, 251 status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
143 &alloc_bh, 0, inode); 252 &alloc_bh, OCFS2_BH_IGNORE_CACHE);
144 if (status < 0) { 253 if (status < 0) {
145 mlog_errno(status); 254 mlog_errno(status);
146 goto bail; 255 goto bail;
@@ -185,13 +294,14 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
185 294
186bail: 295bail:
187 if (status < 0) 296 if (status < 0)
188 if (alloc_bh) 297 brelse(alloc_bh);
189 brelse(alloc_bh);
190 if (inode) 298 if (inode)
191 iput(inode); 299 iput(inode);
192 300
193 mlog(0, "Local alloc window bits = %d\n", 301 if (status < 0)
194 ocfs2_local_alloc_window_bits(osb)); 302 ocfs2_shutdown_la_debug(osb);
303
304 mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits);
195 305
196 mlog_exit(status); 306 mlog_exit(status);
197 return status; 307 return status;
@@ -217,6 +327,11 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
217 327
218 mlog_entry_void(); 328 mlog_entry_void();
219 329
330 cancel_delayed_work(&osb->la_enable_wq);
331 flush_workqueue(ocfs2_wq);
332
333 ocfs2_shutdown_la_debug(osb);
334
220 if (osb->local_alloc_state == OCFS2_LA_UNUSED) 335 if (osb->local_alloc_state == OCFS2_LA_UNUSED)
221 goto out; 336 goto out;
222 337
@@ -295,8 +410,7 @@ out_commit:
295 ocfs2_commit_trans(osb, handle); 410 ocfs2_commit_trans(osb, handle);
296 411
297out_unlock: 412out_unlock:
298 if (main_bm_bh) 413 brelse(main_bm_bh);
299 brelse(main_bm_bh);
300 414
301 ocfs2_inode_unlock(main_bm_inode, 1); 415 ocfs2_inode_unlock(main_bm_inode, 1);
302 416
@@ -345,8 +459,8 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
345 459
346 mutex_lock(&inode->i_mutex); 460 mutex_lock(&inode->i_mutex);
347 461
348 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, 462 status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
349 &alloc_bh, 0, inode); 463 &alloc_bh, OCFS2_BH_IGNORE_CACHE);
350 if (status < 0) { 464 if (status < 0) {
351 mlog_errno(status); 465 mlog_errno(status);
352 goto bail; 466 goto bail;
@@ -372,8 +486,7 @@ bail:
372 *alloc_copy = NULL; 486 *alloc_copy = NULL;
373 } 487 }
374 488
375 if (alloc_bh) 489 brelse(alloc_bh);
376 brelse(alloc_bh);
377 490
378 if (inode) { 491 if (inode) {
379 mutex_unlock(&inode->i_mutex); 492 mutex_unlock(&inode->i_mutex);
@@ -441,8 +554,7 @@ out_unlock:
441out_mutex: 554out_mutex:
442 mutex_unlock(&main_bm_inode->i_mutex); 555 mutex_unlock(&main_bm_inode->i_mutex);
443 556
444 if (main_bm_bh) 557 brelse(main_bm_bh);
445 brelse(main_bm_bh);
446 558
447 iput(main_bm_inode); 559 iput(main_bm_inode);
448 560
@@ -453,8 +565,48 @@ out:
453 return status; 565 return status;
454} 566}
455 567
568/* Check to see if the local alloc window is within ac->ac_max_block */
569static int ocfs2_local_alloc_in_range(struct inode *inode,
570 struct ocfs2_alloc_context *ac,
571 u32 bits_wanted)
572{
573 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
574 struct ocfs2_dinode *alloc;
575 struct ocfs2_local_alloc *la;
576 int start;
577 u64 block_off;
578
579 if (!ac->ac_max_block)
580 return 1;
581
582 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
583 la = OCFS2_LOCAL_ALLOC(alloc);
584
585 start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
586 if (start == -1) {
587 mlog_errno(-ENOSPC);
588 return 0;
589 }
590
591 /*
592 * Converting (bm_off + start + bits_wanted) to blocks gives us
593 * the blkno just past our actual allocation. This is perfect
594 * to compare with ac_max_block.
595 */
596 block_off = ocfs2_clusters_to_blocks(inode->i_sb,
597 le32_to_cpu(la->la_bm_off) +
598 start + bits_wanted);
599 mlog(0, "Checking %llu against %llu\n",
600 (unsigned long long)block_off,
601 (unsigned long long)ac->ac_max_block);
602 if (block_off > ac->ac_max_block)
603 return 0;
604
605 return 1;
606}
607
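ocfs2_local_alloc_in_range() above leans on a closed-form check: converting bm_off + start + bits_wanted to blocks yields the first block past the would-be allocation, which must not exceed ac_max_block. A standalone model of that comparison (the 8-blocks-per-cluster ratio and sample numbers are invented):

    #include <stdio.h>
    #include <stdint.h>

    /* model: 8 blocks per cluster (invented ratio) */
    static uint64_t clusters_to_blocks(uint32_t clusters)
    {
            return (uint64_t)clusters << 3;
    }

    /* the first block *past* the would-be allocation must not
     * exceed max_block; max_block == 0 means "no limit" */
    static int in_range(uint32_t bm_off, uint32_t start, uint32_t wanted,
                        uint64_t max_block)
    {
            uint64_t past_end = clusters_to_blocks(bm_off + start + wanted);

            if (!max_block)
                    return 1;
            return past_end <= max_block;
    }

    int main(void)
    {
            printf("%d\n", in_range(100, 4, 8, 1000));      /* 1: fits */
            printf("%d\n", in_range(100, 4, 8, 800));       /* 0: past limit */
            return 0;
    }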
456/* 608/*
457 * make sure we've got at least bitswanted contiguous bits in the 609 * make sure we've got at least bits_wanted contiguous bits in the
458 * local alloc. You lose them when you drop i_mutex. 610 * local alloc. You lose them when you drop i_mutex.
459 * 611 *
460 * We will add ourselves to the transaction passed in, but may start 612 * We will add ourselves to the transaction passed in, but may start
@@ -485,16 +637,18 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
485 637
486 mutex_lock(&local_alloc_inode->i_mutex); 638 mutex_lock(&local_alloc_inode->i_mutex);
487 639
488 if (osb->local_alloc_state != OCFS2_LA_ENABLED) { 640 /*
489 status = -ENOSPC; 641 * We must double check state and allocator bits because
490 goto bail; 642 * another process may have changed them while holding i_mutex.
491 } 643 */
492 644 spin_lock(&osb->osb_lock);
493 if (bits_wanted > ocfs2_local_alloc_window_bits(osb)) { 645 if (!ocfs2_la_state_enabled(osb) ||
494 mlog(0, "Asking for more than my max window size!\n"); 646 (bits_wanted > osb->local_alloc_bits)) {
647 spin_unlock(&osb->osb_lock);
495 status = -ENOSPC; 648 status = -ENOSPC;
496 goto bail; 649 goto bail;
497 } 650 }
651 spin_unlock(&osb->osb_lock);
498 652
499 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; 653 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
500 654
@@ -522,6 +676,36 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
522 mlog_errno(status); 676 mlog_errno(status);
523 goto bail; 677 goto bail;
524 } 678 }
679
680 /*
681 * Under certain conditions, the window slide code
682 * might have reduced the number of bits available or
 683 * disabled the local alloc entirely. Re-check
684 * here and return -ENOSPC if necessary.
685 */
686 status = -ENOSPC;
687 if (!ocfs2_la_state_enabled(osb))
688 goto bail;
689
690 free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) -
691 le32_to_cpu(alloc->id1.bitmap1.i_used);
692 if (bits_wanted > free_bits)
693 goto bail;
694 }
695
696 if (ac->ac_max_block)
697 mlog(0, "Calling in_range for max block %llu\n",
698 (unsigned long long)ac->ac_max_block);
699
700 if (!ocfs2_local_alloc_in_range(local_alloc_inode, ac,
701 bits_wanted)) {
702 /*
703 * The window is outside ac->ac_max_block.
704 * This errno tells the caller to keep localalloc enabled
705 * but to get the allocation from the main bitmap.
706 */
707 status = -EFBIG;
708 goto bail;
525 } 709 }
526 710
527 ac->ac_inode = local_alloc_inode; 711 ac->ac_inode = local_alloc_inode;
@@ -789,6 +973,85 @@ bail:
789 return status; 973 return status;
790} 974}
791 975
976enum ocfs2_la_event {
977 OCFS2_LA_EVENT_SLIDE, /* Normal window slide. */
978 OCFS2_LA_EVENT_FRAGMENTED, /* The global bitmap has
979 * enough bits theoretically
980 * free, but a contiguous
981 * allocation could not be
982 * found. */
983 OCFS2_LA_EVENT_ENOSPC, /* Global bitmap doesn't have
984 * enough bits free to satisfy
985 * our request. */
986};
987#define OCFS2_LA_ENABLE_INTERVAL (30 * HZ)
988/*
989 * Given an event, calculate the size of our next local alloc window.
990 *
991 * This should always be called under i_mutex of the local alloc inode
992 * so that local alloc disabling doesn't race with processes trying to
993 * use the allocator.
994 *
995 * Returns the state which the local alloc was left in. This value can
996 * be ignored by some paths.
997 */
998static int ocfs2_recalc_la_window(struct ocfs2_super *osb,
999 enum ocfs2_la_event event)
1000{
1001 unsigned int bits;
1002 int state;
1003
1004 spin_lock(&osb->osb_lock);
1005 if (osb->local_alloc_state == OCFS2_LA_DISABLED) {
1006 WARN_ON_ONCE(osb->local_alloc_state == OCFS2_LA_DISABLED);
1007 goto out_unlock;
1008 }
1009
1010 /*
1011 * ENOSPC and fragmentation are treated similarly for now.
1012 */
1013 if (event == OCFS2_LA_EVENT_ENOSPC ||
1014 event == OCFS2_LA_EVENT_FRAGMENTED) {
1015 /*
1016 * We ran out of contiguous space in the primary
1017 * bitmap. Drastically reduce the number of bits used
1018 * by local alloc until we have to disable it.
1019 */
1020 bits = osb->local_alloc_bits >> 1;
1021 if (bits > ocfs2_megabytes_to_clusters(osb->sb, 1)) {
1022 /*
1023 * By setting state to THROTTLED, we'll keep
1024 * the number of local alloc bits used down
1025 * until an event occurs which would give us
1026 * reason to assume the bitmap situation might
1027 * have changed.
1028 */
1029 osb->local_alloc_state = OCFS2_LA_THROTTLED;
1030 osb->local_alloc_bits = bits;
1031 } else {
1032 osb->local_alloc_state = OCFS2_LA_DISABLED;
1033 }
1034 queue_delayed_work(ocfs2_wq, &osb->la_enable_wq,
1035 OCFS2_LA_ENABLE_INTERVAL);
1036 goto out_unlock;
1037 }
1038
1039 /*
1040 * Don't increase the size of the local alloc window until we
1041 * know we might be able to fulfill the request. Otherwise, we
1042 * risk bouncing around the global bitmap during periods of
1043 * low space.
1044 */
1045 if (osb->local_alloc_state != OCFS2_LA_THROTTLED)
1046 osb->local_alloc_bits = osb->local_alloc_default_bits;
1047
1048out_unlock:
1049 state = osb->local_alloc_state;
1050 spin_unlock(&osb->osb_lock);
1051
1052 return state;
1053}
1054
792static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, 1055static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
793 struct ocfs2_alloc_context **ac, 1056 struct ocfs2_alloc_context **ac,
794 struct inode **bitmap_inode, 1057 struct inode **bitmap_inode,
@@ -803,12 +1066,21 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
803 goto bail; 1066 goto bail;
804 } 1067 }
805 1068
806 (*ac)->ac_bits_wanted = ocfs2_local_alloc_window_bits(osb); 1069retry_enospc:
1070 (*ac)->ac_bits_wanted = osb->local_alloc_bits;
807 1071
808 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); 1072 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
1073 if (status == -ENOSPC) {
1074 if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
1075 OCFS2_LA_DISABLED)
1076 goto bail;
1077
1078 ocfs2_free_ac_resource(*ac);
1079 memset(*ac, 0, sizeof(struct ocfs2_alloc_context));
1080 goto retry_enospc;
1081 }
809 if (status < 0) { 1082 if (status < 0) {
810 if (status != -ENOSPC) 1083 mlog_errno(status);
811 mlog_errno(status);
812 goto bail; 1084 goto bail;
813 } 1085 }
814 1086
@@ -849,7 +1121,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
849 "one\n"); 1121 "one\n");
850 1122
851 mlog(0, "Allocating %u clusters for a new window.\n", 1123 mlog(0, "Allocating %u clusters for a new window.\n",
852 ocfs2_local_alloc_window_bits(osb)); 1124 osb->local_alloc_bits);
853 1125
854 /* Instruct the allocation code to try the most recently used 1126 /* Instruct the allocation code to try the most recently used
855 * cluster group. We'll re-record the group used this pass 1127 * cluster group. We'll re-record the group used this pass
@@ -859,9 +1131,36 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
859 /* we used the generic suballoc reserve function, but we set 1131 /* we used the generic suballoc reserve function, but we set
860 * everything up nicely, so there's no reason why we can't use 1132 * everything up nicely, so there's no reason why we can't use
861 * the more specific cluster api to claim bits. */ 1133 * the more specific cluster api to claim bits. */
862 status = ocfs2_claim_clusters(osb, handle, ac, 1134 status = ocfs2_claim_clusters(osb, handle, ac, osb->local_alloc_bits,
863 ocfs2_local_alloc_window_bits(osb),
864 &cluster_off, &cluster_count); 1135 &cluster_off, &cluster_count);
1136 if (status == -ENOSPC) {
1137retry_enospc:
1138 /*
1139 * Note: We could also try syncing the journal here to
1140 * allow use of any free bits which the current
1141 * transaction can't give us access to. --Mark
1142 */
1143 if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_FRAGMENTED) ==
1144 OCFS2_LA_DISABLED)
1145 goto bail;
1146
1147 status = ocfs2_claim_clusters(osb, handle, ac,
1148 osb->local_alloc_bits,
1149 &cluster_off,
1150 &cluster_count);
1151 if (status == -ENOSPC)
1152 goto retry_enospc;
1153 /*
 1154 * We only shrunk the *minimum* number of bits in our
1155 * request - it's entirely possible that the allocator
1156 * might give us more than we asked for.
1157 */
1158 if (status == 0) {
1159 spin_lock(&osb->osb_lock);
1160 osb->local_alloc_bits = cluster_count;
1161 spin_unlock(&osb->osb_lock);
1162 }
1163 }
865 if (status < 0) { 1164 if (status < 0) {
866 if (status != -ENOSPC) 1165 if (status != -ENOSPC)
867 mlog_errno(status); 1166 mlog_errno(status);
@@ -905,6 +1204,8 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
905 1204
906 mlog_entry_void(); 1205 mlog_entry_void();
907 1206
1207 ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_SLIDE);
1208
908 /* This will lock the main bitmap for us. */ 1209 /* This will lock the main bitmap for us. */
909 status = ocfs2_local_alloc_reserve_for_window(osb, 1210 status = ocfs2_local_alloc_reserve_for_window(osb,
910 &ac, 1211 &ac,
@@ -976,8 +1277,7 @@ bail:
976 if (handle) 1277 if (handle)
977 ocfs2_commit_trans(osb, handle); 1278 ocfs2_commit_trans(osb, handle);
978 1279
979 if (main_bm_bh) 1280 brelse(main_bm_bh);
980 brelse(main_bm_bh);
981 1281
982 if (main_bm_inode) 1282 if (main_bm_inode)
983 iput(main_bm_inode); 1283 iput(main_bm_inode);
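
The hunks above implement local alloc throttling: on ENOSPC or fragmentation the window is halved until it falls below roughly a megabyte's worth of clusters, at which point the allocator is disabled and a delayed work item is queued to re-enable it later. A minimal sketch of that policy, using simplified stand-in types rather than the kernel structures:

/*
 * Sketch of the window-sizing policy from ocfs2_recalc_la_window()
 * above. All names here are simplified stand-ins, not kernel types.
 */
enum la_state { LA_ENABLED, LA_THROTTLED, LA_DISABLED };

struct la {
	enum la_state state;
	unsigned int bits;         /* current window, in clusters */
	unsigned int default_bits; /* configured window size */
	unsigned int min_bits;     /* e.g. one megabyte of clusters */
};

static enum la_state recalc(struct la *la, int enospc_or_fragmented)
{
	if (la->state == LA_DISABLED)
		return la->state;

	if (enospc_or_fragmented) {
		unsigned int bits = la->bits >> 1; /* halve the window */

		if (bits > la->min_bits) {
			la->state = LA_THROTTLED;
			la->bits = bits;
		} else {
			la->state = LA_DISABLED; /* give up for now */
		}
		/* the real code queues delayed work to re-enable here */
	} else if (la->state != LA_THROTTLED) {
		la->bits = la->default_bits; /* normal slide */
	}
	return la->state;
}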
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index 3f76631e110c..ac5ea9f86653 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -52,4 +52,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
52 u32 *bit_off, 52 u32 *bit_off,
53 u32 *num_bits); 53 u32 *num_bits);
54 54
55void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
56 unsigned int num_clusters);
57void ocfs2_la_enable_worker(struct work_struct *work);
58
55#endif /* OCFS2_LOCALALLOC_H */ 59#endif /* OCFS2_LOCALALLOC_H */
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 203f87143877..544ac6245175 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -24,6 +24,7 @@
24 */ 24 */
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/fcntl.h>
27 28
28#define MLOG_MASK_PREFIX ML_INODE 29#define MLOG_MASK_PREFIX ML_INODE
29#include <cluster/masklog.h> 30#include <cluster/masklog.h>
@@ -32,6 +33,7 @@
32 33
33#include "dlmglue.h" 34#include "dlmglue.h"
34#include "file.h" 35#include "file.h"
36#include "inode.h"
35#include "locks.h" 37#include "locks.h"
36 38
37static int ocfs2_do_flock(struct file *file, struct inode *inode, 39static int ocfs2_do_flock(struct file *file, struct inode *inode,
@@ -123,3 +125,16 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
123 else 125 else
124 return ocfs2_do_flock(file, inode, cmd, fl); 126 return ocfs2_do_flock(file, inode, cmd, fl);
125} 127}
128
129int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
130{
131 struct inode *inode = file->f_mapping->host;
132 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
133
134 if (!(fl->fl_flags & FL_POSIX))
135 return -ENOLCK;
136 if (__mandatory_lock(inode))
137 return -ENOLCK;
138
139 return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
140}
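
ocfs2_lock() above is shaped as a ->lock method, the entry point the VFS calls for fcntl(2) POSIX locks, so file operations can route those requests through the cluster stack. A sketch of what the wiring might look like; the initializer below is illustrative only and is not part of this diff:

/* Illustrative only -- the real table lives in fs/ocfs2/file.c. */
const struct file_operations ocfs2_fops_example = {
	.llseek = generic_file_llseek,
	.flock  = ocfs2_flock, /* BSD flock(2), already present */
	.lock   = ocfs2_lock,  /* POSIX fcntl(2) locks, added here */
};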
diff --git a/fs/ocfs2/locks.h b/fs/ocfs2/locks.h
index 9743ef2324ec..496d488b271f 100644
--- a/fs/ocfs2/locks.h
+++ b/fs/ocfs2/locks.h
@@ -27,5 +27,6 @@
27#define OCFS2_LOCKS_H 27#define OCFS2_LOCKS_H
28 28
29int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl); 29int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl);
30int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl);
30 31
31#endif /* OCFS2_LOCKS_H */ 32#endif /* OCFS2_LOCKS_H */
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index d5d808fe0140..485a6aa0ad39 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -60,6 +60,7 @@
60#include "symlink.h" 60#include "symlink.h"
61#include "sysfile.h" 61#include "sysfile.h"
62#include "uptodate.h" 62#include "uptodate.h"
63#include "xattr.h"
63 64
64#include "buffer_head_io.h" 65#include "buffer_head_io.h"
65 66
@@ -327,14 +328,9 @@ leave:
327 if (status == -ENOSPC) 328 if (status == -ENOSPC)
328 mlog(0, "Disk is full\n"); 329 mlog(0, "Disk is full\n");
329 330
330 if (new_fe_bh) 331 brelse(new_fe_bh);
331 brelse(new_fe_bh); 332 brelse(de_bh);
332 333 brelse(parent_fe_bh);
333 if (de_bh)
334 brelse(de_bh);
335
336 if (parent_fe_bh)
337 brelse(parent_fe_bh);
338 334
339 if ((status < 0) && inode) 335 if ((status < 0) && inode)
340 iput(inode); 336 iput(inode);
@@ -647,12 +643,9 @@ out_unlock_inode:
647out: 643out:
648 ocfs2_inode_unlock(dir, 1); 644 ocfs2_inode_unlock(dir, 1);
649 645
650 if (de_bh) 646 brelse(de_bh);
651 brelse(de_bh); 647 brelse(fe_bh);
652 if (fe_bh) 648 brelse(parent_fe_bh);
653 brelse(fe_bh);
654 if (parent_fe_bh)
655 brelse(parent_fe_bh);
656 649
657 mlog_exit(err); 650 mlog_exit(err);
658 651
@@ -851,17 +844,10 @@ leave:
851 iput(orphan_dir); 844 iput(orphan_dir);
852 } 845 }
853 846
854 if (fe_bh) 847 brelse(fe_bh);
855 brelse(fe_bh); 848 brelse(dirent_bh);
856 849 brelse(parent_node_bh);
857 if (dirent_bh) 850 brelse(orphan_entry_bh);
858 brelse(dirent_bh);
859
860 if (parent_node_bh)
861 brelse(parent_node_bh);
862
863 if (orphan_entry_bh)
864 brelse(orphan_entry_bh);
865 851
866 mlog_exit(status); 852 mlog_exit(status);
867 853
@@ -1372,24 +1358,15 @@ bail:
1372 1358
1373 if (new_inode) 1359 if (new_inode)
1374 iput(new_inode); 1360 iput(new_inode);
1375 if (newfe_bh) 1361 brelse(newfe_bh);
1376 brelse(newfe_bh); 1362 brelse(old_inode_bh);
1377 if (old_inode_bh) 1363 brelse(old_dir_bh);
1378 brelse(old_inode_bh); 1364 brelse(new_dir_bh);
1379 if (old_dir_bh) 1365 brelse(new_de_bh);
1380 brelse(old_dir_bh); 1366 brelse(old_de_bh);
1381 if (new_dir_bh) 1367 brelse(old_inode_de_bh);
1382 brelse(new_dir_bh); 1368 brelse(orphan_entry_bh);
1383 if (new_de_bh) 1369 brelse(insert_entry_bh);
1384 brelse(new_de_bh);
1385 if (old_de_bh)
1386 brelse(old_de_bh);
1387 if (old_inode_de_bh)
1388 brelse(old_inode_de_bh);
1389 if (orphan_entry_bh)
1390 brelse(orphan_entry_bh);
1391 if (insert_entry_bh)
1392 brelse(insert_entry_bh);
1393 1370
1394 mlog_exit(status); 1371 mlog_exit(status);
1395 1372
@@ -1492,8 +1469,7 @@ bail:
1492 1469
1493 if (bhs) { 1470 if (bhs) {
1494 for(i = 0; i < blocks; i++) 1471 for(i = 0; i < blocks; i++)
1495 if (bhs[i]) 1472 brelse(bhs[i]);
1496 brelse(bhs[i]);
1497 kfree(bhs); 1473 kfree(bhs);
1498 } 1474 }
1499 1475
@@ -1598,10 +1574,10 @@ static int ocfs2_symlink(struct inode *dir,
1598 u32 offset = 0; 1574 u32 offset = 0;
1599 1575
1600 inode->i_op = &ocfs2_symlink_inode_operations; 1576 inode->i_op = &ocfs2_symlink_inode_operations;
1601 status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, 0, 1577 status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
1602 new_fe_bh, 1578 new_fe_bh,
1603 handle, data_ac, NULL, 1579 handle, data_ac, NULL,
1604 NULL); 1580 NULL);
1605 if (status < 0) { 1581 if (status < 0) {
1606 if (status != -ENOSPC && status != -EINTR) { 1582 if (status != -ENOSPC && status != -EINTR) {
1607 mlog(ML_ERROR, 1583 mlog(ML_ERROR,
@@ -1659,12 +1635,9 @@ bail:
1659 1635
1660 ocfs2_inode_unlock(dir, 1); 1636 ocfs2_inode_unlock(dir, 1);
1661 1637
1662 if (new_fe_bh) 1638 brelse(new_fe_bh);
1663 brelse(new_fe_bh); 1639 brelse(parent_fe_bh);
1664 if (parent_fe_bh) 1640 brelse(de_bh);
1665 brelse(parent_fe_bh);
1666 if (de_bh)
1667 brelse(de_bh);
1668 if (inode_ac) 1641 if (inode_ac)
1669 ocfs2_free_alloc_context(inode_ac); 1642 ocfs2_free_alloc_context(inode_ac);
1670 if (data_ac) 1643 if (data_ac)
@@ -1759,8 +1732,7 @@ leave:
1759 iput(orphan_dir_inode); 1732 iput(orphan_dir_inode);
1760 } 1733 }
1761 1734
1762 if (orphan_dir_bh) 1735 brelse(orphan_dir_bh);
1763 brelse(orphan_dir_bh);
1764 1736
1765 mlog_exit(status); 1737 mlog_exit(status);
1766 return status; 1738 return status;
@@ -1780,10 +1752,9 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1780 1752
1781 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 1753 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
1782 1754
1783 status = ocfs2_read_block(osb, 1755 status = ocfs2_read_block(orphan_dir_inode,
1784 OCFS2_I(orphan_dir_inode)->ip_blkno, 1756 OCFS2_I(orphan_dir_inode)->ip_blkno,
1785 &orphan_dir_bh, OCFS2_BH_CACHED, 1757 &orphan_dir_bh);
1786 orphan_dir_inode);
1787 if (status < 0) { 1758 if (status < 0) {
1788 mlog_errno(status); 1759 mlog_errno(status);
1789 goto leave; 1760 goto leave;
@@ -1829,8 +1800,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1829 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); 1800 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
1830 1801
1831leave: 1802leave:
1832 if (orphan_dir_bh) 1803 brelse(orphan_dir_bh);
1833 brelse(orphan_dir_bh);
1834 1804
1835 mlog_exit(status); 1805 mlog_exit(status);
1836 return status; 1806 return status;
@@ -1898,8 +1868,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
1898 } 1868 }
1899 1869
1900leave: 1870leave:
1901 if (target_de_bh) 1871 brelse(target_de_bh);
1902 brelse(target_de_bh);
1903 1872
1904 mlog_exit(status); 1873 mlog_exit(status);
1905 return status; 1874 return status;
@@ -1918,4 +1887,8 @@ const struct inode_operations ocfs2_dir_iops = {
1918 .setattr = ocfs2_setattr, 1887 .setattr = ocfs2_setattr,
1919 .getattr = ocfs2_getattr, 1888 .getattr = ocfs2_getattr,
1920 .permission = ocfs2_permission, 1889 .permission = ocfs2_permission,
1890 .setxattr = generic_setxattr,
1891 .getxattr = generic_getxattr,
1892 .listxattr = ocfs2_listxattr,
1893 .removexattr = generic_removexattr,
1921}; 1894};
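
Every cleanup hunk in namei.c above leans on the same fact: brelse() is a no-op when handed a NULL buffer head (it only calls __brelse() for a non-NULL pointer), so the guarding NULL checks were dead weight. The idiom in miniature:

/* Before: NULL check duplicated at every exit path. */
if (bh)
	brelse(bh);

/* After: brelse() already ignores NULL buffer heads,
 * so cleanup paths may call it unconditionally. */
brelse(bh);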
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 7f625f2b1117..a21a465490c4 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -34,7 +34,12 @@
34#include <linux/workqueue.h> 34#include <linux/workqueue.h>
35#include <linux/kref.h> 35#include <linux/kref.h>
36#include <linux/mutex.h> 36#include <linux/mutex.h>
37#include <linux/jbd.h> 37#ifndef CONFIG_OCFS2_COMPAT_JBD
38# include <linux/jbd2.h>
39#else
40# include <linux/jbd.h>
41# include "ocfs2_jbd_compat.h"
42#endif
38 43
39/* For union ocfs2_dlm_lksb */ 44/* For union ocfs2_dlm_lksb */
40#include "stackglue.h" 45#include "stackglue.h"
@@ -171,9 +176,13 @@ struct ocfs2_alloc_stats
171 176
172enum ocfs2_local_alloc_state 177enum ocfs2_local_alloc_state
173{ 178{
174 OCFS2_LA_UNUSED = 0, 179 OCFS2_LA_UNUSED = 0, /* Local alloc will never be used for
175 OCFS2_LA_ENABLED, 180 * this mountpoint. */
176 OCFS2_LA_DISABLED 181 OCFS2_LA_ENABLED, /* Local alloc is in use. */
182 OCFS2_LA_THROTTLED, /* Local alloc is in use, but number
183 * of bits has been reduced. */
184 OCFS2_LA_DISABLED /* Local alloc has temporarily been
185 * disabled. */
177}; 186};
178 187
179enum ocfs2_mount_options 188enum ocfs2_mount_options
@@ -184,6 +193,8 @@ enum ocfs2_mount_options
184 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ 193 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
185 OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */ 194 OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
186 OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */ 195 OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
196 OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
197 OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */
187}; 198};
188 199
189#define OCFS2_OSB_SOFT_RO 0x0001 200#define OCFS2_OSB_SOFT_RO 0x0001
@@ -214,6 +225,7 @@ struct ocfs2_super
214 u32 bitmap_cpg; 225 u32 bitmap_cpg;
215 u8 *uuid; 226 u8 *uuid;
216 char *uuid_str; 227 char *uuid_str;
228 u32 uuid_hash;
217 u8 *vol_label; 229 u8 *vol_label;
218 u64 first_cluster_group_blkno; 230 u64 first_cluster_group_blkno;
219 u32 fs_generation; 231 u32 fs_generation;
@@ -241,6 +253,7 @@ struct ocfs2_super
241 int s_sectsize_bits; 253 int s_sectsize_bits;
242 int s_clustersize; 254 int s_clustersize;
243 int s_clustersize_bits; 255 int s_clustersize_bits;
256 unsigned int s_xattr_inline_size;
244 257
245 atomic_t vol_state; 258 atomic_t vol_state;
246 struct mutex recovery_lock; 259 struct mutex recovery_lock;
@@ -252,11 +265,27 @@ struct ocfs2_super
252 struct ocfs2_journal *journal; 265 struct ocfs2_journal *journal;
253 unsigned long osb_commit_interval; 266 unsigned long osb_commit_interval;
254 267
255 int local_alloc_size; 268 struct delayed_work la_enable_wq;
256 enum ocfs2_local_alloc_state local_alloc_state; 269
270 /*
271 * Must hold local alloc i_mutex and osb->osb_lock to change
272 * local_alloc_bits. Reads can be done under either lock.
273 */
274 unsigned int local_alloc_bits;
275 unsigned int local_alloc_default_bits;
276
277 enum ocfs2_local_alloc_state local_alloc_state; /* protected
278 * by osb_lock */
279
257 struct buffer_head *local_alloc_bh; 280 struct buffer_head *local_alloc_bh;
281
258 u64 la_last_gd; 282 u64 la_last_gd;
259 283
284#ifdef CONFIG_OCFS2_FS_STATS
285 struct dentry *local_alloc_debug;
286 char *local_alloc_debug_buf;
287#endif
288
260 /* Next two fields are for local node slot recovery during 289 /* Next two fields are for local node slot recovery during
261 * mount. */ 290 * mount. */
262 int dirty; 291 int dirty;
@@ -340,6 +369,13 @@ static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb)
340 return 0; 369 return 0;
341} 370}
342 371
372static inline int ocfs2_supports_xattr(struct ocfs2_super *osb)
373{
374 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR)
375 return 1;
376 return 0;
377}
378
343/* set / clear functions because cluster events can make these happen 379/* set / clear functions because cluster events can make these happen
344 * in parallel so we want the transitions to be atomic. this also 380 * in parallel so we want the transitions to be atomic. this also
345 * means that any future flags osb_flags must be protected by spinlock 381 * means that any future flags osb_flags must be protected by spinlock
@@ -554,6 +590,14 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
554 return pages_per_cluster; 590 return pages_per_cluster;
555} 591}
556 592
593static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
594 unsigned int megs)
595{
596 BUILD_BUG_ON(OCFS2_MAX_CLUSTERSIZE > 1048576);
597
598 return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
599}
600
557static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb) 601static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
558{ 602{
559 spin_lock(&osb->osb_lock); 603 spin_lock(&osb->osb_lock);
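
The comment added to struct ocfs2_super spells out the rule for local_alloc_bits: hold both the local alloc inode's i_mutex and osb->osb_lock to change it, and either lock alone to read it. A sketch of that discipline; both helper names below are hypothetical:

/* Hypothetical helpers illustrating the documented locking rule. */
static void example_set_la_bits(struct ocfs2_super *osb,
				struct inode *la_inode,
				unsigned int bits)
{
	mutex_lock(&la_inode->i_mutex);  /* writers take both locks */
	spin_lock(&osb->osb_lock);
	osb->local_alloc_bits = bits;
	spin_unlock(&osb->osb_lock);
	mutex_unlock(&la_inode->i_mutex);
}

static unsigned int example_get_la_bits(struct ocfs2_super *osb)
{
	unsigned int bits;

	spin_lock(&osb->osb_lock); /* either lock suffices for readers */
	bits = osb->local_alloc_bits;
	spin_unlock(&osb->osb_lock);

	return bits;
}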
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 4f619850ccf7..f24ce3d3f956 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -64,6 +64,7 @@
64#define OCFS2_INODE_SIGNATURE "INODE01" 64#define OCFS2_INODE_SIGNATURE "INODE01"
65#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01" 65#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01"
66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" 66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01"
67#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01"
67 68
68/* Compatibility flags */ 69/* Compatibility flags */
69#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ 70#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
@@ -90,7 +91,8 @@
90 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \ 91 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \
91 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \ 92 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
92 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ 93 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
93 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK) 94 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
95 | OCFS2_FEATURE_INCOMPAT_XATTR)
94#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 96#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
95 97
96/* 98/*
@@ -127,10 +129,6 @@
127/* Support for data packed into inode blocks */ 129/* Support for data packed into inode blocks */
128#define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040 130#define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040
129 131
130/* Support for the extended slot map */
131#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100
132
133
134/* 132/*
135 * Support for alternate, userspace cluster stacks. If set, the superblock 133 * Support for alternate, userspace cluster stacks. If set, the superblock
136 * field s_cluster_info contains a tag for the alternate stack in use as 134 * field s_cluster_info contains a tag for the alternate stack in use as
@@ -142,6 +140,12 @@
142 */ 140 */
143#define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080 141#define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080
144 142
143/* Support for the extended slot map */
144#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100
145
146/* Support for extended attributes */
147#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200
148
145/* 149/*
146 * backup superblock flag is used to indicate that this volume 150 * backup superblock flag is used to indicate that this volume
147 * has backup superblocks. 151 * has backup superblocks.
@@ -299,6 +303,12 @@ struct ocfs2_new_group_input {
299 */ 303 */
300#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8 304#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8
301 305
306/*
307 * Inline extended attribute size (in bytes)
308 * The value chosen should be aligned to 16 byte boundaries.
309 */
310#define OCFS2_MIN_XATTR_INLINE_SIZE 256
311
302struct ocfs2_system_inode_info { 312struct ocfs2_system_inode_info {
303 char *si_name; 313 char *si_name;
304 int si_iflags; 314 int si_iflags;
@@ -563,7 +573,7 @@ struct ocfs2_super_block {
563/*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts 573/*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts
564 before tunefs required */ 574 before tunefs required */
565 __le16 s_tunefs_flag; 575 __le16 s_tunefs_flag;
566 __le32 s_reserved1; 576 __le32 s_uuid_hash; /* hash value of uuid */
567 __le64 s_first_cluster_group; /* Block offset of 1st cluster 577 __le64 s_first_cluster_group; /* Block offset of 1st cluster
568 * group header */ 578 * group header */
569/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ 579/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */
@@ -571,7 +581,11 @@ struct ocfs2_super_block {
571/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace 581/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace
572 stack. Only valid 582 stack. Only valid
573 with INCOMPAT flag. */ 583 with INCOMPAT flag. */
574/*B8*/ __le64 s_reserved2[17]; /* Fill out superblock */ 584/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size
585 for this fs*/
586 __le16 s_reserved0;
587 __le32 s_reserved1;
588/*C0*/ __le64 s_reserved2[16]; /* Fill out superblock */
575/*140*/ 589/*140*/
576 590
577 /* 591 /*
@@ -621,7 +635,8 @@ struct ocfs2_dinode {
621 belongs to */ 635 belongs to */
622 __le16 i_suballoc_bit; /* Bit offset in suballocator 636 __le16 i_suballoc_bit; /* Bit offset in suballocator
623 block group */ 637 block group */
624/*10*/ __le32 i_reserved0; 638/*10*/ __le16 i_reserved0;
639 __le16 i_xattr_inline_size;
625 __le32 i_clusters; /* Cluster count */ 640 __le32 i_clusters; /* Cluster count */
626 __le32 i_uid; /* Owner UID */ 641 __le32 i_uid; /* Owner UID */
627 __le32 i_gid; /* Owning GID */ 642 __le32 i_gid; /* Owning GID */
@@ -640,11 +655,12 @@ struct ocfs2_dinode {
640 __le32 i_atime_nsec; 655 __le32 i_atime_nsec;
641 __le32 i_ctime_nsec; 656 __le32 i_ctime_nsec;
642 __le32 i_mtime_nsec; 657 __le32 i_mtime_nsec;
643 __le32 i_attr; 658/*70*/ __le32 i_attr;
644 __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL 659 __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL
645 was set in i_flags */ 660 was set in i_flags */
646 __le16 i_dyn_features; 661 __le16 i_dyn_features;
647/*70*/ __le64 i_reserved2[8]; 662 __le64 i_xattr_loc;
663/*80*/ __le64 i_reserved2[7];
648/*B8*/ union { 664/*B8*/ union {
649 __le64 i_pad1; /* Generic way to refer to this 665 __le64 i_pad1; /* Generic way to refer to this
650 64bit union */ 666 64bit union */
@@ -715,6 +731,136 @@ struct ocfs2_group_desc
715/*40*/ __u8 bg_bitmap[0]; 731/*40*/ __u8 bg_bitmap[0];
716}; 732};
717 733
734/*
735 * On disk extended attribute structure for OCFS2.
736 */
737
738/*
 739 * ocfs2_xattr_entry describes one extended attribute.
 740 *
 741 * Note that it can be stored in the inode, one block, or one xattr bucket.
742 */
743struct ocfs2_xattr_entry {
744 __le32 xe_name_hash; /* hash value of xattr prefix+suffix. */
 745 __le16 xe_name_offset; /* byte offset from the 1st entry in the
 746 local xattr storage (inode, xattr block or
747 xattr bucket). */
 748 __u8 xe_name_len; /* xattr name len, doesn't include the prefix. */
 749 __u8 xe_type; /* the low 7 bits indicate the name prefix's
 750 * type and the highest bit indicates whether
 751 * the EA is stored in local storage. */
752 __le64 xe_value_size; /* real xattr value length. */
753};
754
755/*
756 * On disk structure for xattr header.
757 *
 758 * One ocfs2_xattr_header describes how many ocfs2_xattr_entry records are in
759 * the local xattr storage.
760 */
761struct ocfs2_xattr_header {
762 __le16 xh_count; /* contains the count of how
763 many records are in the
764 local xattr storage. */
765 __le16 xh_free_start; /* current offset for storing
766 xattr. */
 767 __le16 xh_name_value_len; /* total length of name/value
 768 pairs in this bucket. */
 769 __le16 xh_num_buckets; /* number of buckets in one
 770 extent record, only valid
 771 in the first bucket. */
772 __le64 xh_csum;
773 struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */
774};
775
776/*
777 * On disk structure for xattr value root.
778 *
 779 * It is used when an extended attribute's value is too large, and we save it
 780 * in an outside cluster. It will be stored in a b-tree, like file content.
781 */
782struct ocfs2_xattr_value_root {
783/*00*/ __le32 xr_clusters; /* clusters covered by xattr value. */
784 __le32 xr_reserved0;
785 __le64 xr_last_eb_blk; /* Pointer to last extent block */
786/*10*/ struct ocfs2_extent_list xr_list; /* Extent record list */
787};
788
789/*
790 * On disk structure for xattr tree root.
791 *
792 * It is used when there are too many extended attributes for one file. These
793 * attributes will be organized and stored in an indexed-btree.
794 */
795struct ocfs2_xattr_tree_root {
796/*00*/ __le32 xt_clusters; /* clusters covered by xattr. */
797 __le32 xt_reserved0;
798 __le64 xt_last_eb_blk; /* Pointer to last extent block */
799/*10*/ struct ocfs2_extent_list xt_list; /* Extent record list */
800};
801
802#define OCFS2_XATTR_INDEXED 0x1
803#define OCFS2_HASH_SHIFT 5
804#define OCFS2_XATTR_ROUND 3
805#define OCFS2_XATTR_SIZE(size) (((size) + OCFS2_XATTR_ROUND) & \
806 ~(OCFS2_XATTR_ROUND))
807
808#define OCFS2_XATTR_BUCKET_SIZE 4096
809#define OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET (OCFS2_XATTR_BUCKET_SIZE \
810 / OCFS2_MIN_BLOCKSIZE)
811
812/*
813 * On disk structure for xattr block.
814 */
815struct ocfs2_xattr_block {
816/*00*/ __u8 xb_signature[8]; /* Signature for verification */
817 __le16 xb_suballoc_slot; /* Slot suballocator this
818 block belongs to. */
819 __le16 xb_suballoc_bit; /* Bit offset in suballocator
820 block group */
821 __le32 xb_fs_generation; /* Must match super block */
822/*10*/ __le64 xb_blkno; /* Offset on disk, in blocks */
823 __le64 xb_csum;
824/*20*/ __le16 xb_flags; /* Indicates whether this block contains
825 real xattr or a xattr tree. */
826 __le16 xb_reserved0;
827 __le32 xb_reserved1;
828 __le64 xb_reserved2;
829/*30*/ union {
830 struct ocfs2_xattr_header xb_header; /* xattr header if this
831 block contains xattr */
832 struct ocfs2_xattr_tree_root xb_root;/* xattr tree root if this
 833 block contains xattr
834 tree. */
835 } xb_attrs;
836};
837
838#define OCFS2_XATTR_ENTRY_LOCAL 0x80
839#define OCFS2_XATTR_TYPE_MASK 0x7F
840static inline void ocfs2_xattr_set_local(struct ocfs2_xattr_entry *xe,
841 int local)
842{
843 if (local)
844 xe->xe_type |= OCFS2_XATTR_ENTRY_LOCAL;
845 else
846 xe->xe_type &= ~OCFS2_XATTR_ENTRY_LOCAL;
847}
848
849static inline int ocfs2_xattr_is_local(struct ocfs2_xattr_entry *xe)
850{
851 return xe->xe_type & OCFS2_XATTR_ENTRY_LOCAL;
852}
853
854static inline void ocfs2_xattr_set_type(struct ocfs2_xattr_entry *xe, int type)
855{
856 xe->xe_type |= type & OCFS2_XATTR_TYPE_MASK;
857}
858
859static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe)
860{
861 return xe->xe_type & OCFS2_XATTR_TYPE_MASK;
862}
863
718#ifdef __KERNEL__ 864#ifdef __KERNEL__
719static inline int ocfs2_fast_symlink_chars(struct super_block *sb) 865static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
720{ 866{
@@ -728,6 +874,20 @@ static inline int ocfs2_max_inline_data(struct super_block *sb)
728 offsetof(struct ocfs2_dinode, id2.i_data.id_data); 874 offsetof(struct ocfs2_dinode, id2.i_data.id_data);
729} 875}
730 876
877static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb,
878 struct ocfs2_dinode *di)
879{
880 unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
881
882 if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
883 return sb->s_blocksize -
884 offsetof(struct ocfs2_dinode, id2.i_data.id_data) -
885 xattrsize;
886 else
887 return sb->s_blocksize -
888 offsetof(struct ocfs2_dinode, id2.i_data.id_data);
889}
890
731static inline int ocfs2_extent_recs_per_inode(struct super_block *sb) 891static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
732{ 892{
733 int size; 893 int size;
@@ -738,6 +898,24 @@ static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
738 return size / sizeof(struct ocfs2_extent_rec); 898 return size / sizeof(struct ocfs2_extent_rec);
739} 899}
740 900
901static inline int ocfs2_extent_recs_per_inode_with_xattr(
902 struct super_block *sb,
903 struct ocfs2_dinode *di)
904{
905 int size;
906 unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
907
908 if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
909 size = sb->s_blocksize -
910 offsetof(struct ocfs2_dinode, id2.i_list.l_recs) -
911 xattrsize;
912 else
913 size = sb->s_blocksize -
914 offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
915
916 return size / sizeof(struct ocfs2_extent_rec);
917}
918
741static inline int ocfs2_chain_recs_per_inode(struct super_block *sb) 919static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
742{ 920{
743 int size; 921 int size;
@@ -801,6 +979,17 @@ static inline u64 ocfs2_backup_super_blkno(struct super_block *sb, int index)
801 return 0; 979 return 0;
802 980
803} 981}
982
983static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb)
984{
985 int size;
986
987 size = sb->s_blocksize -
988 offsetof(struct ocfs2_xattr_block,
989 xb_attrs.xb_root.xt_list.l_recs);
990
991 return size / sizeof(struct ocfs2_extent_rec);
992}
804#else 993#else
805static inline int ocfs2_fast_symlink_chars(int blocksize) 994static inline int ocfs2_fast_symlink_chars(int blocksize)
806{ 995{
@@ -884,6 +1073,17 @@ static inline uint64_t ocfs2_backup_super_blkno(int blocksize, int index)
884 1073
885 return 0; 1074 return 0;
886} 1075}
1076
1077static inline int ocfs2_xattr_recs_per_xb(int blocksize)
1078{
1079 int size;
1080
1081 size = blocksize -
1082 offsetof(struct ocfs2_xattr_block,
1083 xb_attrs.xb_root.xt_list.l_recs);
1084
1085 return size / sizeof(struct ocfs2_extent_rec);
1086}
887#endif /* __KERNEL__ */ 1087#endif /* __KERNEL__ */
888 1088
889 1089
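
Two of the helpers above are easiest to see with concrete values: OCFS2_XATTR_SIZE() rounds a name or value length up to the next 4-byte boundary, and xe_type packs the prefix index into the low 7 bits with the "stored locally" flag in the high bit. An illustrative fragment (the prefix index value is an arbitrary example):

	struct ocfs2_xattr_entry xe = { 0 };

	ocfs2_xattr_set_type(&xe, 1);  /* example prefix index */
	ocfs2_xattr_set_local(&xe, 1); /* value stored with the entry */

	/* Both fields now live in xe_type: */
	BUG_ON(ocfs2_xattr_get_type(&xe) != 1);
	BUG_ON(!ocfs2_xattr_is_local(&xe));

	/* OCFS2_XATTR_SIZE() rounds up to a multiple of 4: */
	BUG_ON(OCFS2_XATTR_SIZE(5) != 8); /* (5 + 3) & ~3 */
	BUG_ON(OCFS2_XATTR_SIZE(8) != 8);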
diff --git a/fs/ocfs2/ocfs2_jbd_compat.h b/fs/ocfs2/ocfs2_jbd_compat.h
new file mode 100644
index 000000000000..b91c78f8f558
--- /dev/null
+++ b/fs/ocfs2/ocfs2_jbd_compat.h
@@ -0,0 +1,82 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2_jbd_compat.h
5 *
6 * Compatibility defines for JBD.
7 *
8 * Copyright (C) 2008 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License version 2 as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#ifndef OCFS2_JBD_COMPAT_H
21#define OCFS2_JBD_COMPAT_H
22
23#ifndef CONFIG_OCFS2_COMPAT_JBD
24# error Should not have been included
25#endif
26
27struct jbd2_inode {
28 unsigned int dummy;
29};
30
31#define JBD2_BARRIER JFS_BARRIER
32#define JBD2_DEFAULT_MAX_COMMIT_AGE JBD_DEFAULT_MAX_COMMIT_AGE
33
34#define jbd2_journal_ack_err journal_ack_err
35#define jbd2_journal_clear_err journal_clear_err
36#define jbd2_journal_destroy journal_destroy
37#define jbd2_journal_dirty_metadata journal_dirty_metadata
38#define jbd2_journal_errno journal_errno
39#define jbd2_journal_extend journal_extend
40#define jbd2_journal_flush journal_flush
41#define jbd2_journal_force_commit journal_force_commit
42#define jbd2_journal_get_write_access journal_get_write_access
43#define jbd2_journal_get_undo_access journal_get_undo_access
44#define jbd2_journal_init_inode journal_init_inode
45#define jbd2_journal_invalidatepage journal_invalidatepage
46#define jbd2_journal_load journal_load
47#define jbd2_journal_lock_updates journal_lock_updates
48#define jbd2_journal_restart journal_restart
49#define jbd2_journal_start journal_start
50#define jbd2_journal_start_commit journal_start_commit
51#define jbd2_journal_stop journal_stop
52#define jbd2_journal_try_to_free_buffers journal_try_to_free_buffers
53#define jbd2_journal_unlock_updates journal_unlock_updates
54#define jbd2_journal_wipe journal_wipe
55#define jbd2_log_wait_commit log_wait_commit
56
57static inline int jbd2_journal_file_inode(handle_t *handle,
58 struct jbd2_inode *inode)
59{
60 return 0;
61}
62
63static inline int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
64 loff_t new_size)
65{
66 return 0;
67}
68
69static inline void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode,
70 struct inode *inode)
71{
72 return;
73}
74
75static inline void jbd2_journal_release_jbd_inode(journal_t *journal,
76 struct jbd2_inode *jinode)
77{
78 return;
79}
80
81
82#endif /* OCFS2_JBD_COMPAT_H */
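
The compat header lets the rest of ocfs2 use the jbd2_* names unconditionally; with CONFIG_OCFS2_COMPAT_JBD set, the defines above simply expand them to the classic JBD calls. An illustrative fragment of the calling pattern this enables ('journal', 'credits' and 'bh' are assumed to be set up by the caller):

	handle_t *handle;
	int ret;

	handle = jbd2_journal_start(journal, credits); /* journal_start() on JBD */
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	ret = jbd2_journal_get_write_access(handle, bh);
	if (!ret)
		ret = jbd2_journal_dirty_metadata(handle, bh);

	jbd2_journal_stop(handle); /* journal_stop() on JBD */
	return ret;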
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 8166968e9015..ffd48db229a7 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -200,7 +200,7 @@ static int update_backups(struct inode * inode, u32 clusters, char *data)
200 if (cluster > clusters) 200 if (cluster > clusters)
201 break; 201 break;
202 202
203 ret = ocfs2_read_block(osb, blkno, &backup, 0, NULL); 203 ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup);
204 if (ret < 0) { 204 if (ret < 0) {
205 mlog_errno(ret); 205 mlog_errno(ret);
206 break; 206 break;
@@ -236,8 +236,8 @@ static void ocfs2_update_super_and_backups(struct inode *inode,
236 * update the superblock last. 236 * update the superblock last.
237 * It doesn't matter if the write failed. 237 * It doesn't matter if the write failed.
238 */ 238 */
239 ret = ocfs2_read_block(osb, OCFS2_SUPER_BLOCK_BLKNO, 239 ret = ocfs2_read_blocks_sync(osb, OCFS2_SUPER_BLOCK_BLKNO, 1,
240 &super_bh, 0, NULL); 240 &super_bh);
241 if (ret < 0) { 241 if (ret < 0) {
242 mlog_errno(ret); 242 mlog_errno(ret);
243 goto out; 243 goto out;
@@ -332,8 +332,7 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
332 lgd_blkno = ocfs2_which_cluster_group(main_bm_inode, 332 lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
333 first_new_cluster - 1); 333 first_new_cluster - 1);
334 334
335 ret = ocfs2_read_block(osb, lgd_blkno, &group_bh, OCFS2_BH_CACHED, 335 ret = ocfs2_read_block(main_bm_inode, lgd_blkno, &group_bh);
336 main_bm_inode);
337 if (ret < 0) { 336 if (ret < 0) {
338 mlog_errno(ret); 337 mlog_errno(ret);
339 goto out_unlock; 338 goto out_unlock;
@@ -540,7 +539,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
540 goto out_unlock; 539 goto out_unlock;
541 } 540 }
542 541
543 ret = ocfs2_read_block(osb, input->group, &group_bh, 0, NULL); 542 ret = ocfs2_read_blocks_sync(osb, input->group, 1, &group_bh);
544 if (ret < 0) { 543 if (ret < 0) {
545 mlog(ML_ERROR, "Can't read the group descriptor # %llu " 544 mlog(ML_ERROR, "Can't read the group descriptor # %llu "
546 "from the device.", (unsigned long long)input->group); 545 "from the device.", (unsigned long long)input->group);
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index bb5ff8939bf1..bdda2d8f8508 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -150,8 +150,8 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
150 * be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If 150 * be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If
151 * this is not true, the read of -1 (UINT64_MAX) will fail. 151 * this is not true, the read of -1 (UINT64_MAX) will fail.
152 */ 152 */
153 ret = ocfs2_read_blocks(osb, -1, si->si_blocks, si->si_bh, 0, 153 ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh,
154 si->si_inode); 154 OCFS2_BH_IGNORE_CACHE);
155 if (ret == 0) { 155 if (ret == 0) {
156 spin_lock(&osb->osb_lock); 156 spin_lock(&osb->osb_lock);
157 ocfs2_update_slot_info(si); 157 ocfs2_update_slot_info(si);
@@ -404,7 +404,8 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
404 (unsigned long long)blkno); 404 (unsigned long long)blkno);
405 405
406 bh = NULL; /* Acquire a fresh bh */ 406 bh = NULL; /* Acquire a fresh bh */
407 status = ocfs2_read_block(osb, blkno, &bh, 0, si->si_inode); 407 status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh,
408 OCFS2_BH_IGNORE_CACHE);
408 if (status < 0) { 409 if (status < 0) {
409 mlog_errno(status); 410 mlog_errno(status);
410 goto bail; 411 goto bail;
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 353fc35c6748..faec2d879357 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -28,6 +28,7 @@
28#include "ocfs2.h" /* For struct ocfs2_lock_res */ 28#include "ocfs2.h" /* For struct ocfs2_lock_res */
29#include "stackglue.h" 29#include "stackglue.h"
30 30
31#include <linux/dlm_plock.h>
31 32
32/* 33/*
33 * The control protocol starts with a handshake. Until the handshake 34 * The control protocol starts with a handshake. Until the handshake
@@ -746,6 +747,37 @@ static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
746{ 747{
747} 748}
748 749
750static int user_plock(struct ocfs2_cluster_connection *conn,
751 u64 ino,
752 struct file *file,
753 int cmd,
754 struct file_lock *fl)
755{
756 /*
757 * This more or less just demuxes the plock request into any
758 * one of three dlm calls.
759 *
760 * Internally, fs/dlm will pass these to a misc device, which
761 * a userspace daemon will read and write to.
762 *
 763 * For now, cancel requests (which happen internally only)
 764 * are turned into unlocks. Most of this function is taken from
765 * gfs2_lock.
766 */
767
768 if (cmd == F_CANCELLK) {
769 cmd = F_SETLK;
770 fl->fl_type = F_UNLCK;
771 }
772
773 if (IS_GETLK(cmd))
774 return dlm_posix_get(conn->cc_lockspace, ino, file, fl);
775 else if (fl->fl_type == F_UNLCK)
776 return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl);
777 else
778 return dlm_posix_lock(conn->cc_lockspace, ino, file, cmd, fl);
779}
780
749/* 781/*
750 * Compare a requested locking protocol version against the current one. 782 * Compare a requested locking protocol version against the current one.
751 * 783 *
@@ -839,6 +871,7 @@ static struct ocfs2_stack_operations ocfs2_user_plugin_ops = {
839 .dlm_unlock = user_dlm_unlock, 871 .dlm_unlock = user_dlm_unlock,
840 .lock_status = user_dlm_lock_status, 872 .lock_status = user_dlm_lock_status,
841 .lock_lvb = user_dlm_lvb, 873 .lock_lvb = user_dlm_lvb,
874 .plock = user_plock,
842 .dump_lksb = user_dlm_dump_lksb, 875 .dump_lksb = user_dlm_dump_lksb,
843}; 876};
844 877
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 10e149ae5e3a..68b668b0e60a 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -97,13 +97,14 @@ static int ocfs2_stack_driver_request(const char *stack_name,
97 goto out; 97 goto out;
98 } 98 }
99 99
100 /* Ok, the stack is pinned */
101 p->sp_count++;
102 active_stack = p; 100 active_stack = p;
103
104 rc = 0; 101 rc = 0;
105 102
106out: 103out:
104 /* If we found it, pin it */
105 if (!rc)
106 active_stack->sp_count++;
107
107 spin_unlock(&ocfs2_stack_lock); 108 spin_unlock(&ocfs2_stack_lock);
108 return rc; 109 return rc;
109} 110}
@@ -287,6 +288,26 @@ void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
287} 288}
288EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb); 289EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb);
289 290
291int ocfs2_stack_supports_plocks(void)
292{
293 return active_stack && active_stack->sp_ops->plock;
294}
295EXPORT_SYMBOL_GPL(ocfs2_stack_supports_plocks);
296
297/*
298 * ocfs2_plock() can only be safely called if
299 * ocfs2_stack_supports_plocks() returned true
300 */
301int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
302 struct file *file, int cmd, struct file_lock *fl)
303{
304 WARN_ON_ONCE(active_stack->sp_ops->plock == NULL);
305 if (active_stack->sp_ops->plock)
306 return active_stack->sp_ops->plock(conn, ino, file, cmd, fl);
307 return -EOPNOTSUPP;
308}
309EXPORT_SYMBOL_GPL(ocfs2_plock);
310
290int ocfs2_cluster_connect(const char *stack_name, 311int ocfs2_cluster_connect(const char *stack_name,
291 const char *group, 312 const char *group,
292 int grouplen, 313 int grouplen,
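
As the comment above warns, ocfs2_plock() is only safe after ocfs2_stack_supports_plocks() has returned true. A sketch of a guarded caller; the guard placement is illustrative (in practice the check can equally happen once, e.g. at mount time):

	if (!ocfs2_stack_supports_plocks())
		return -ENOLCK; /* no cluster-aware posix locks available */

	return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno,
			   file, cmd, fl);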
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index db56281dd1be..c571af375ef8 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -28,6 +28,10 @@
28#include "dlm/dlmapi.h" 28#include "dlm/dlmapi.h"
29#include <linux/dlm.h> 29#include <linux/dlm.h>
30 30
31/* Needed for plock-related prototypes */
32struct file;
33struct file_lock;
34
31/* 35/*
32 * dlmconstants.h does not have a LOCAL flag. We hope to remove it 36 * dlmconstants.h does not have a LOCAL flag. We hope to remove it
33 * some day, but right now we need it. Let's fake it. This value is larger 37 * some day, but right now we need it. Let's fake it. This value is larger
@@ -187,6 +191,17 @@ struct ocfs2_stack_operations {
187 void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb); 191 void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb);
188 192
189 /* 193 /*
194 * Cluster-aware posix locks
195 *
196 * This is NULL for stacks which do not support posix locks.
197 */
198 int (*plock)(struct ocfs2_cluster_connection *conn,
199 u64 ino,
200 struct file *file,
201 int cmd,
202 struct file_lock *fl);
203
204 /*
 190 * This is an optional debugging hook. If provided, the 205
191 * stack can dump debugging information about this lock. 206 * stack can dump debugging information about this lock.
192 */ 207 */
@@ -240,6 +255,10 @@ int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb);
240void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb); 255void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb);
241void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb); 256void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb);
242 257
258int ocfs2_stack_supports_plocks(void);
259int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
260 struct file *file, int cmd, struct file_lock *fl);
261
243void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto); 262void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto);
244 263
245 264
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index d2d278fb9819..c5ff18b46b57 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -62,15 +62,18 @@ static int ocfs2_block_group_fill(handle_t *handle,
62 struct ocfs2_chain_list *cl); 62 struct ocfs2_chain_list *cl);
63static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 63static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
64 struct inode *alloc_inode, 64 struct inode *alloc_inode,
65 struct buffer_head *bh); 65 struct buffer_head *bh,
66 u64 max_block);
66 67
67static int ocfs2_cluster_group_search(struct inode *inode, 68static int ocfs2_cluster_group_search(struct inode *inode,
68 struct buffer_head *group_bh, 69 struct buffer_head *group_bh,
69 u32 bits_wanted, u32 min_bits, 70 u32 bits_wanted, u32 min_bits,
71 u64 max_block,
70 u16 *bit_off, u16 *bits_found); 72 u16 *bit_off, u16 *bits_found);
71static int ocfs2_block_group_search(struct inode *inode, 73static int ocfs2_block_group_search(struct inode *inode,
72 struct buffer_head *group_bh, 74 struct buffer_head *group_bh,
73 u32 bits_wanted, u32 min_bits, 75 u32 bits_wanted, u32 min_bits,
76 u64 max_block,
74 u16 *bit_off, u16 *bits_found); 77 u16 *bit_off, u16 *bits_found);
75static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, 78static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
76 struct ocfs2_alloc_context *ac, 79 struct ocfs2_alloc_context *ac,
@@ -110,8 +113,11 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
110 u64 data_blkno, 113 u64 data_blkno,
111 u64 *bg_blkno, 114 u64 *bg_blkno,
112 u16 *bg_bit_off); 115 u16 *bg_bit_off);
116static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
117 u32 bits_wanted, u64 max_block,
118 struct ocfs2_alloc_context **ac);
113 119
114static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) 120void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
115{ 121{
116 struct inode *inode = ac->ac_inode; 122 struct inode *inode = ac->ac_inode;
117 123
@@ -124,10 +130,8 @@ static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
124 iput(inode); 130 iput(inode);
125 ac->ac_inode = NULL; 131 ac->ac_inode = NULL;
126 } 132 }
127 if (ac->ac_bh) { 133 brelse(ac->ac_bh);
128 brelse(ac->ac_bh); 134 ac->ac_bh = NULL;
129 ac->ac_bh = NULL;
130 }
131} 135}
132 136
133void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) 137void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -276,7 +280,8 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
276 */ 280 */
277static int ocfs2_block_group_alloc(struct ocfs2_super *osb, 281static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
278 struct inode *alloc_inode, 282 struct inode *alloc_inode,
279 struct buffer_head *bh) 283 struct buffer_head *bh,
284 u64 max_block)
280{ 285{
281 int status, credits; 286 int status, credits;
282 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; 287 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
@@ -294,9 +299,9 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
294 mlog_entry_void(); 299 mlog_entry_void();
295 300
296 cl = &fe->id2.i_chain; 301 cl = &fe->id2.i_chain;
297 status = ocfs2_reserve_clusters(osb, 302 status = ocfs2_reserve_clusters_with_limit(osb,
298 le16_to_cpu(cl->cl_cpg), 303 le16_to_cpu(cl->cl_cpg),
299 &ac); 304 max_block, &ac);
300 if (status < 0) { 305 if (status < 0) {
301 if (status != -ENOSPC) 306 if (status != -ENOSPC)
302 mlog_errno(status); 307 mlog_errno(status);
@@ -394,8 +399,7 @@ bail:
394 if (ac) 399 if (ac)
395 ocfs2_free_alloc_context(ac); 400 ocfs2_free_alloc_context(ac);
396 401
397 if (bg_bh) 402 brelse(bg_bh);
398 brelse(bg_bh);
399 403
400 mlog_exit(status); 404 mlog_exit(status);
401 return status; 405 return status;
@@ -469,7 +473,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
469 goto bail; 473 goto bail;
470 } 474 }
471 475
472 status = ocfs2_block_group_alloc(osb, alloc_inode, bh); 476 status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
477 ac->ac_max_block);
473 if (status < 0) { 478 if (status < 0) {
474 if (status != -ENOSPC) 479 if (status != -ENOSPC)
475 mlog_errno(status); 480 mlog_errno(status);
@@ -486,16 +491,15 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
486 get_bh(bh); 491 get_bh(bh);
487 ac->ac_bh = bh; 492 ac->ac_bh = bh;
488bail: 493bail:
489 if (bh) 494 brelse(bh);
490 brelse(bh);
491 495
492 mlog_exit(status); 496 mlog_exit(status);
493 return status; 497 return status;
494} 498}
495 499
496int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, 500int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
497 struct ocfs2_dinode *fe, 501 int blocks,
498 struct ocfs2_alloc_context **ac) 502 struct ocfs2_alloc_context **ac)
499{ 503{
500 int status; 504 int status;
501 u32 slot; 505 u32 slot;
@@ -507,7 +511,7 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
507 goto bail; 511 goto bail;
508 } 512 }
509 513
510 (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe); 514 (*ac)->ac_bits_wanted = blocks;
511 (*ac)->ac_which = OCFS2_AC_USE_META; 515 (*ac)->ac_which = OCFS2_AC_USE_META;
512 slot = osb->slot_num; 516 slot = osb->slot_num;
513 (*ac)->ac_group_search = ocfs2_block_group_search; 517 (*ac)->ac_group_search = ocfs2_block_group_search;
@@ -532,6 +536,15 @@ bail:
532 return status; 536 return status;
533} 537}
534 538
539int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
540 struct ocfs2_extent_list *root_el,
541 struct ocfs2_alloc_context **ac)
542{
543 return ocfs2_reserve_new_metadata_blocks(osb,
544 ocfs2_extend_meta_needed(root_el),
545 ac);
546}
547
535static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb, 548static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
536 struct ocfs2_alloc_context *ac) 549 struct ocfs2_alloc_context *ac)
537{ 550{
@@ -582,6 +595,14 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
582 (*ac)->ac_group_search = ocfs2_block_group_search; 595 (*ac)->ac_group_search = ocfs2_block_group_search;
583 596
584 /* 597 /*
598 * stat(2) can't handle i_ino > 32bits, so we tell the
599 * lower levels not to allocate us a block group past that
600 * limit. The 'inode64' mount option avoids this behavior.
601 */
602 if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
603 (*ac)->ac_max_block = (u32)~0U;
604
605 /*
585 * slot is set when we successfully steal inode from other nodes. 606 * slot is set when we successfully steal inode from other nodes.
586 * It is reset in 3 places: 607 * It is reset in 3 places:
587 * 1. when we flush the truncate log 608 * 1. when we flush the truncate log
@@ -661,9 +682,9 @@ bail:
661/* Callers don't need to care which bitmap (local alloc or main) to 682/* Callers don't need to care which bitmap (local alloc or main) to
662 * use so we figure it out for them, but unfortunately this clutters 683 * use so we figure it out for them, but unfortunately this clutters
663 * things a bit. */ 684 * things a bit. */
664int ocfs2_reserve_clusters(struct ocfs2_super *osb, 685static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
665 u32 bits_wanted, 686 u32 bits_wanted, u64 max_block,
666 struct ocfs2_alloc_context **ac) 687 struct ocfs2_alloc_context **ac)
667{ 688{
668 int status; 689 int status;
669 690
@@ -677,24 +698,20 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
677 } 698 }
678 699
679 (*ac)->ac_bits_wanted = bits_wanted; 700 (*ac)->ac_bits_wanted = bits_wanted;
701 (*ac)->ac_max_block = max_block;
680 702
681 status = -ENOSPC; 703 status = -ENOSPC;
682 if (ocfs2_alloc_should_use_local(osb, bits_wanted)) { 704 if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
683 status = ocfs2_reserve_local_alloc_bits(osb, 705 status = ocfs2_reserve_local_alloc_bits(osb,
684 bits_wanted, 706 bits_wanted,
685 *ac); 707 *ac);
686 if ((status < 0) && (status != -ENOSPC)) { 708 if (status == -EFBIG) {
709 /* The local alloc window is outside ac_max_block.
710 * use the main bitmap. */
711 status = -ENOSPC;
712 } else if ((status < 0) && (status != -ENOSPC)) {
687 mlog_errno(status); 713 mlog_errno(status);
688 goto bail; 714 goto bail;
689 } else if (status == -ENOSPC) {
690 /* reserve_local_bits will return enospc with
691 * the local alloc inode still locked, so we
692 * can change this safely here. */
693 mlog(0, "Disabling local alloc\n");
694 /* We set to OCFS2_LA_DISABLED so that umount
695 * can clean up what's left of the local
696 * allocation */
697 osb->local_alloc_state = OCFS2_LA_DISABLED;
698 } 715 }
699 } 716 }
700 717
@@ -718,6 +735,13 @@ bail:
718 return status; 735 return status;
719} 736}
720 737
738int ocfs2_reserve_clusters(struct ocfs2_super *osb,
739 u32 bits_wanted,
740 struct ocfs2_alloc_context **ac)
741{
742 return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, ac);
743}
744
721/* 745/*
722 * More or less lifted from ext3. I'll leave their description below: 746 * More or less lifted from ext3. I'll leave their description below:
723 * 747 *
@@ -1000,11 +1024,14 @@ static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg
1000static int ocfs2_cluster_group_search(struct inode *inode, 1024static int ocfs2_cluster_group_search(struct inode *inode,
1001 struct buffer_head *group_bh, 1025 struct buffer_head *group_bh,
1002 u32 bits_wanted, u32 min_bits, 1026 u32 bits_wanted, u32 min_bits,
1027 u64 max_block,
1003 u16 *bit_off, u16 *bits_found) 1028 u16 *bit_off, u16 *bits_found)
1004{ 1029{
1005 int search = -ENOSPC; 1030 int search = -ENOSPC;
1006 int ret; 1031 int ret;
1032 u64 blkoff;
1007 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data; 1033 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1034 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1008 u16 tmp_off, tmp_found; 1035 u16 tmp_off, tmp_found;
1009 unsigned int max_bits, gd_cluster_off; 1036 unsigned int max_bits, gd_cluster_off;
1010 1037
@@ -1037,6 +1064,17 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1037 if (ret) 1064 if (ret)
1038 return ret; 1065 return ret;
1039 1066
1067 if (max_block) {
1068 blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1069 gd_cluster_off +
1070 tmp_off + tmp_found);
1071 mlog(0, "Checking %llu against %llu\n",
1072 (unsigned long long)blkoff,
1073 (unsigned long long)max_block);
1074 if (blkoff > max_block)
1075 return -ENOSPC;
1076 }
1077
1040 /* ocfs2_block_group_find_clear_bits() might 1078 /* ocfs2_block_group_find_clear_bits() might
1041 * return success, but we still want to return 1079 * return success, but we still want to return
1042 * -ENOSPC unless it found the minimum number 1080 * -ENOSPC unless it found the minimum number
@@ -1045,6 +1083,12 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1045 *bit_off = tmp_off; 1083 *bit_off = tmp_off;
1046 *bits_found = tmp_found; 1084 *bits_found = tmp_found;
1047 search = 0; /* success */ 1085 search = 0; /* success */
1086 } else if (tmp_found) {
1087 /*
1088 * Don't show bits which we'll be returning
1089 * for allocation to the local alloc bitmap.
1090 */
1091 ocfs2_local_alloc_seen_free_bits(osb, tmp_found);
1048 } 1092 }
1049 } 1093 }
1050 1094
@@ -1054,19 +1098,31 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1054static int ocfs2_block_group_search(struct inode *inode, 1098static int ocfs2_block_group_search(struct inode *inode,
1055 struct buffer_head *group_bh, 1099 struct buffer_head *group_bh,
1056 u32 bits_wanted, u32 min_bits, 1100 u32 bits_wanted, u32 min_bits,
1101 u64 max_block,
1057 u16 *bit_off, u16 *bits_found) 1102 u16 *bit_off, u16 *bits_found)
1058{ 1103{
1059 int ret = -ENOSPC; 1104 int ret = -ENOSPC;
1105 u64 blkoff;
1060 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data; 1106 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
1061 1107
1062 BUG_ON(min_bits != 1); 1108 BUG_ON(min_bits != 1);
1063 BUG_ON(ocfs2_is_cluster_bitmap(inode)); 1109 BUG_ON(ocfs2_is_cluster_bitmap(inode));
1064 1110
1065 if (bg->bg_free_bits_count)
1111 if (bg->bg_free_bits_count) {
1066 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), 1112 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1067 group_bh, bits_wanted, 1113 group_bh, bits_wanted,
1068 le16_to_cpu(bg->bg_bits), 1114 le16_to_cpu(bg->bg_bits),
1069 bit_off, bits_found); 1115 bit_off, bits_found);
1116 if (!ret && max_block) {
1117 blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off +
1118 *bits_found;
1119 mlog(0, "Checking %llu against %llu\n",
1120 (unsigned long long)blkoff,
1121 (unsigned long long)max_block);
1122 if (blkoff > max_block)
1123 ret = -ENOSPC;
1124 }
1125 }
1070 1126
1071 return ret; 1127 return ret;
1072} 1128}
@@ -1116,8 +1172,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1116 struct ocfs2_group_desc *gd; 1172 struct ocfs2_group_desc *gd;
1117 struct inode *alloc_inode = ac->ac_inode; 1173 struct inode *alloc_inode = ac->ac_inode;
1118 1174
1119 ret = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), gd_blkno,
1120 &group_bh, OCFS2_BH_CACHED, alloc_inode);
1175 ret = ocfs2_read_block(alloc_inode, gd_blkno, &group_bh);
1121 if (ret < 0) { 1176 if (ret < 0) {
1122 mlog_errno(ret); 1177 mlog_errno(ret);
1123 return ret; 1178 return ret;
@@ -1131,7 +1186,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1131 } 1186 }
1132 1187
1133 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, 1188 ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1134 bit_off, &found);
1189 ac->ac_max_block, bit_off, &found);
1135 if (ret < 0) { 1190 if (ret < 0) {
1136 if (ret != -ENOSPC) 1191 if (ret != -ENOSPC)
1137 mlog_errno(ret); 1192 mlog_errno(ret);
@@ -1186,9 +1241,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1186 bits_wanted, chain, 1241 bits_wanted, chain,
1187 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno); 1242 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
1188 1243
1189 status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
1190 le64_to_cpu(cl->cl_recs[chain].c_blkno),
1191 &group_bh, OCFS2_BH_CACHED, alloc_inode);
1244 status = ocfs2_read_block(alloc_inode,
1245 le64_to_cpu(cl->cl_recs[chain].c_blkno),
1246 &group_bh);
1192 if (status < 0) { 1247 if (status < 0) {
1193 mlog_errno(status); 1248 mlog_errno(status);
1194 goto bail; 1249 goto bail;
@@ -1204,21 +1259,20 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1204 /* for now, the chain search is a bit simplistic. We just use 1259 /* for now, the chain search is a bit simplistic. We just use
1205 * the 1st group with any empty bits. */ 1260 * the 1st group with any empty bits. */
1206 while ((status = ac->ac_group_search(alloc_inode, group_bh, 1261 while ((status = ac->ac_group_search(alloc_inode, group_bh,
1207 bits_wanted, min_bits, bit_off,
1262 bits_wanted, min_bits,
1263 ac->ac_max_block, bit_off,
1208 &tmp_bits)) == -ENOSPC) { 1264 &tmp_bits)) == -ENOSPC) {
1209 if (!bg->bg_next_group) 1265 if (!bg->bg_next_group)
1210 break; 1266 break;
1211 1267
1212 if (prev_group_bh) {
1213 brelse(prev_group_bh);
1214 prev_group_bh = NULL;
1215 }
1268 brelse(prev_group_bh);
1269 prev_group_bh = NULL;
1270
1216 next_group = le64_to_cpu(bg->bg_next_group); 1271 next_group = le64_to_cpu(bg->bg_next_group);
1217 prev_group_bh = group_bh; 1272 prev_group_bh = group_bh;
1218 group_bh = NULL; 1273 group_bh = NULL;
1219 status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
1220 next_group, &group_bh,
1221 OCFS2_BH_CACHED, alloc_inode);
1274 status = ocfs2_read_block(alloc_inode,
1275 next_group, &group_bh);
1222 if (status < 0) { 1276 if (status < 0) {
1223 mlog_errno(status); 1277 mlog_errno(status);
1224 goto bail; 1278 goto bail;
@@ -1307,10 +1361,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1307 *bg_blkno = le64_to_cpu(bg->bg_blkno); 1361 *bg_blkno = le64_to_cpu(bg->bg_blkno);
1308 *bits_left = le16_to_cpu(bg->bg_free_bits_count); 1362 *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1309bail: 1363bail:
1310 if (group_bh)
1311 brelse(group_bh);
1312 if (prev_group_bh)
1313 brelse(prev_group_bh);
1364 brelse(group_bh);
1365 brelse(prev_group_bh);
1314 1366
1315 mlog_exit(status); 1367 mlog_exit(status);
1316 return status; 1368 return status;
@@ -1723,7 +1775,6 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
1723{ 1775{
1724 int status = 0; 1776 int status = 0;
1725 u32 tmp_used; 1777 u32 tmp_used;
1726 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
1727 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data; 1778 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
1728 struct ocfs2_chain_list *cl = &fe->id2.i_chain; 1779 struct ocfs2_chain_list *cl = &fe->id2.i_chain;
1729 struct buffer_head *group_bh = NULL; 1780 struct buffer_head *group_bh = NULL;
@@ -1742,8 +1793,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
1742 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count, 1793 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
1743 (unsigned long long)bg_blkno, start_bit); 1794 (unsigned long long)bg_blkno, start_bit);
1744 1795
1745 status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED,
1746 alloc_inode);
1796 status = ocfs2_read_block(alloc_inode, bg_blkno, &group_bh);
1747 if (status < 0) { 1797 if (status < 0) {
1748 mlog_errno(status); 1798 mlog_errno(status);
1749 goto bail; 1799 goto bail;
@@ -1784,8 +1834,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
1784 } 1834 }
1785 1835
1786bail: 1836bail:
1787 if (group_bh)
1788 brelse(group_bh);
1837 brelse(group_bh);
1789 1838
1790 mlog_exit(status); 1839 mlog_exit(status);
1791 return status; 1840 return status;
@@ -1838,9 +1887,15 @@ int ocfs2_free_clusters(handle_t *handle,
1838 status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, 1887 status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
1839 bg_start_bit, bg_blkno, 1888 bg_start_bit, bg_blkno,
1840 num_clusters); 1889 num_clusters);
1841 if (status < 0)
1890 if (status < 0) {
1842 mlog_errno(status); 1891 mlog_errno(status);
1892 goto out;
1893 }
1843 1894
1895 ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
1896 num_clusters);
1897
1898out:
1844 mlog_exit(status); 1899 mlog_exit(status);
1845 return status; 1900 return status;
1846} 1901}
@@ -1891,3 +1946,84 @@ static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
1891 (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno); 1946 (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
1892 } 1947 }
1893} 1948}
1949
1950/*
1951 * For a given allocation, determine which allocators will need to be
1952 * accessed, and lock them, reserving the appropriate number of bits.
1953 *
1954 * Sparse file systems call this from ocfs2_write_begin_nolock()
1955 * and ocfs2_allocate_unwritten_extents().
1956 *
1957 * File systems which don't support holes call this from
1958 * ocfs2_extend_allocation().
1959 */
1960int ocfs2_lock_allocators(struct inode *inode,
1961 struct ocfs2_extent_tree *et,
1962 u32 clusters_to_add, u32 extents_to_split,
1963 struct ocfs2_alloc_context **data_ac,
1964 struct ocfs2_alloc_context **meta_ac)
1965{
1966 int ret = 0, num_free_extents;
1967 unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
1968 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1969
1970 *meta_ac = NULL;
1971 if (data_ac)
1972 *data_ac = NULL;
1973
1974 BUG_ON(clusters_to_add != 0 && data_ac == NULL);
1975
1976 num_free_extents = ocfs2_num_free_extents(osb, inode, et);
1977 if (num_free_extents < 0) {
1978 ret = num_free_extents;
1979 mlog_errno(ret);
1980 goto out;
1981 }
1982
1983 /*
1984 * Sparse allocation file systems need to be more conservative
1985 * with reserving room for expansion - the actual allocation
1986 * happens while we've got a journal handle open so re-taking
1987 * a cluster lock (because we ran out of room for another
1988 * extent) will violate ordering rules.
1989 *
1990 * Most of the time we'll only be seeing this 1 cluster at a time
1991 * anyway.
1992 *
1993 * Always lock for any unwritten extents - we might want to
1994 * add blocks during a split.
1995 */
1996 if (!num_free_extents ||
1997 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
1998 ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
1999 if (ret < 0) {
2000 if (ret != -ENOSPC)
2001 mlog_errno(ret);
2002 goto out;
2003 }
2004 }
2005
2006 if (clusters_to_add == 0)
2007 goto out;
2008
2009 ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
2010 if (ret < 0) {
2011 if (ret != -ENOSPC)
2012 mlog_errno(ret);
2013 goto out;
2014 }
2015
2016out:
2017 if (ret) {
2018 if (*meta_ac) {
2019 ocfs2_free_alloc_context(*meta_ac);
2020 *meta_ac = NULL;
2021 }
2022
2023 /*
2024 * We cannot have an error and a non null *data_ac.
2025 */
2026 }
2027
2028 return ret;
2029}
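
A minimal caller sketch for ocfs2_lock_allocators(), mirroring ocfs2_xattr_extend_allocation() later in this patch (inode, et and clusters_to_add are assumed to be set up by the caller):

	struct ocfs2_alloc_context *data_ac = NULL, *meta_ac = NULL;
	int ret;

	/* Reserve both allocators before opening a journal handle; on
	 * failure, *meta_ac is freed and *data_ac is left NULL, per the
	 * cleanup path above. */
	ret = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
				    &data_ac, &meta_ac);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}
	/* ... ocfs2_start_trans(), ocfs2_add_clusters_in_btree(), ... */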
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 544c600662bd..4df159d8f450 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -28,10 +28,11 @@
28 28
29typedef int (group_search_t)(struct inode *, 29typedef int (group_search_t)(struct inode *,
30 struct buffer_head *, 30 struct buffer_head *,
31 u32,
32 u32,
33 u16 *,
34 u16 *);
31 u32, /* bits_wanted */
32 u32, /* min_bits */
33 u64, /* max_block */
34 u16 *, /* *bit_off */
35 u16 *); /* *bits_found */
35 36
36struct ocfs2_alloc_context { 37struct ocfs2_alloc_context {
37 struct inode *ac_inode; /* which bitmap are we allocating from? */ 38 struct inode *ac_inode; /* which bitmap are we allocating from? */
@@ -51,6 +52,8 @@ struct ocfs2_alloc_context {
51 group_search_t *ac_group_search; 52 group_search_t *ac_group_search;
52 53
53 u64 ac_last_group; 54 u64 ac_last_group;
55 u64 ac_max_block; /* Highest block number to allocate. 0 is
56 the same as ~0 - unlimited */
54}; 57};
55 58
56void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac); 59void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
@@ -59,9 +62,17 @@ static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
59 return ac->ac_bits_wanted - ac->ac_bits_given; 62 return ac->ac_bits_wanted - ac->ac_bits_given;
60} 63}
61 64
65/*
66 * Please note that the caller must make sure that root_el is the root
67 * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
68 * the result may be wrong.
69 */
62int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, 70int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
63 struct ocfs2_dinode *fe,
71 struct ocfs2_extent_list *root_el,
64 struct ocfs2_alloc_context **ac); 72 struct ocfs2_alloc_context **ac);
73int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
74 int blocks,
75 struct ocfs2_alloc_context **ac);
65int ocfs2_reserve_new_inode(struct ocfs2_super *osb, 76int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
66 struct ocfs2_alloc_context **ac); 77 struct ocfs2_alloc_context **ac);
67int ocfs2_reserve_clusters(struct ocfs2_super *osb, 78int ocfs2_reserve_clusters(struct ocfs2_super *osb,
@@ -147,6 +158,7 @@ static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
147 * apis above. */ 158 * apis above. */
148int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, 159int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
149 struct ocfs2_alloc_context *ac); 160 struct ocfs2_alloc_context *ac);
161void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac);
150 162
151/* given a cluster offset, calculate which block group it belongs to 163/* given a cluster offset, calculate which block group it belongs to
152 * and return that block offset. */ 164 * and return that block offset. */
@@ -156,4 +168,8 @@ u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
156int ocfs2_check_group_descriptor(struct super_block *sb, 168int ocfs2_check_group_descriptor(struct super_block *sb,
157 struct ocfs2_dinode *di, 169 struct ocfs2_dinode *di,
158 struct ocfs2_group_desc *gd); 170 struct ocfs2_group_desc *gd);
171int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
172 u32 clusters_to_add, u32 extents_to_split,
173 struct ocfs2_alloc_context **data_ac,
174 struct ocfs2_alloc_context **meta_ac);
159#endif /* _CHAINALLOC_H_ */ 175#endif /* _CHAINALLOC_H_ */
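
Both group-search callbacks in this patch open-code the ac_max_block test, where 0 means "no limit". A hypothetical helper (not part of the patch) that captures the convention:

	/* Hypothetical: ac_max_block == 0 behaves like ~0 (unlimited). */
	static inline int ocfs2_blkoff_within_max(u64 blkoff, u64 max_block)
	{
		return !max_block || blkoff <= max_block;
	}

The searches return -ENOSPC when the candidate range ends past the limit.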
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 88255d3f52b4..304b63ac78cf 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -64,6 +64,7 @@
64#include "sysfile.h" 64#include "sysfile.h"
65#include "uptodate.h" 65#include "uptodate.h"
66#include "ver.h" 66#include "ver.h"
67#include "xattr.h"
67 68
68#include "buffer_head_io.h" 69#include "buffer_head_io.h"
69 70
@@ -154,10 +155,13 @@ enum {
154 Opt_localalloc, 155 Opt_localalloc,
155 Opt_localflocks, 156 Opt_localflocks,
156 Opt_stack, 157 Opt_stack,
158 Opt_user_xattr,
159 Opt_nouser_xattr,
160 Opt_inode64,
157 Opt_err, 161 Opt_err,
158}; 162};
159 163
160static match_table_t tokens = {
164static const match_table_t tokens = {
161 {Opt_barrier, "barrier=%u"}, 165 {Opt_barrier, "barrier=%u"},
162 {Opt_err_panic, "errors=panic"}, 166 {Opt_err_panic, "errors=panic"},
163 {Opt_err_ro, "errors=remount-ro"}, 167 {Opt_err_ro, "errors=remount-ro"},
@@ -173,6 +177,9 @@ static match_table_t tokens = {
173 {Opt_localalloc, "localalloc=%d"}, 177 {Opt_localalloc, "localalloc=%d"},
174 {Opt_localflocks, "localflocks"}, 178 {Opt_localflocks, "localflocks"},
175 {Opt_stack, "cluster_stack=%s"}, 179 {Opt_stack, "cluster_stack=%s"},
180 {Opt_user_xattr, "user_xattr"},
181 {Opt_nouser_xattr, "nouser_xattr"},
182 {Opt_inode64, "inode64"},
176 {Opt_err, NULL} 183 {Opt_err, NULL}
177}; 184};
178 185
@@ -205,10 +212,11 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait)
205 ocfs2_schedule_truncate_log_flush(osb, 0); 212 ocfs2_schedule_truncate_log_flush(osb, 0);
206 } 213 }
207 214
208 if (journal_start_commit(OCFS2_SB(sb)->journal->j_journal, &target)) {
215 if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal,
216 &target)) {
209 if (wait) 217 if (wait)
210 log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
218 jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
211 target); 219 target);
212 } 220 }
213 return 0; 221 return 0;
214} 222}
@@ -325,6 +333,7 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
325 if (!oi) 333 if (!oi)
326 return NULL; 334 return NULL;
327 335
336 jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode);
328 return &oi->vfs_inode; 337 return &oi->vfs_inode;
329} 338}
330 339
@@ -406,6 +415,15 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
406 goto out; 415 goto out;
407 } 416 }
408 417
418 /* Probably don't want this on remount; it might
419 * mess with other nodes */
420 if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64) &&
421 (parsed_options.mount_opt & OCFS2_MOUNT_INODE64)) {
422 ret = -EINVAL;
423 mlog(ML_ERROR, "Cannot enable inode64 on remount\n");
424 goto out;
425 }
426
409 /* We're going to/from readonly mode. */ 427 /* We're going to/from readonly mode. */
410 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { 428 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
411 /* Lock here so the check of HARD_RO and the potential 429 /* Lock here so the check of HARD_RO and the potential
@@ -637,7 +655,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
637 osb->s_atime_quantum = parsed_options.atime_quantum; 655 osb->s_atime_quantum = parsed_options.atime_quantum;
638 osb->preferred_slot = parsed_options.slot; 656 osb->preferred_slot = parsed_options.slot;
639 osb->osb_commit_interval = parsed_options.commit_interval; 657 osb->osb_commit_interval = parsed_options.commit_interval;
640 osb->local_alloc_size = parsed_options.localalloc_opt;
658 osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
659 osb->local_alloc_bits = osb->local_alloc_default_bits;
641 660
642 status = ocfs2_verify_userspace_stack(osb, &parsed_options); 661 status = ocfs2_verify_userspace_stack(osb, &parsed_options);
643 if (status) 662 if (status)
@@ -743,8 +762,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
743 return status; 762 return status;
744 763
745read_super_error: 764read_super_error:
746 if (bh != NULL)
747 brelse(bh);
765 brelse(bh);
748 766
749 if (inode) 767 if (inode)
750 iput(inode); 768 iput(inode);
@@ -847,6 +865,12 @@ static int ocfs2_parse_options(struct super_block *sb,
847 case Opt_data_writeback: 865 case Opt_data_writeback:
848 mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK; 866 mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK;
849 break; 867 break;
868 case Opt_user_xattr:
869 mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR;
870 break;
871 case Opt_nouser_xattr:
872 mopt->mount_opt |= OCFS2_MOUNT_NOUSERXATTR;
873 break;
850 case Opt_atime_quantum: 874 case Opt_atime_quantum:
851 if (match_int(&args[0], &option)) { 875 if (match_int(&args[0], &option)) {
852 status = 0; 876 status = 0;
@@ -873,7 +897,7 @@ static int ocfs2_parse_options(struct super_block *sb,
873 if (option < 0) 897 if (option < 0)
874 return 0; 898 return 0;
875 if (option == 0) 899 if (option == 0)
876 option = JBD_DEFAULT_MAX_COMMIT_AGE;
900 option = JBD2_DEFAULT_MAX_COMMIT_AGE;
877 mopt->commit_interval = HZ * option; 901 mopt->commit_interval = HZ * option;
878 break; 902 break;
879 case Opt_localalloc: 903 case Opt_localalloc:
@@ -918,6 +942,9 @@ static int ocfs2_parse_options(struct super_block *sb,
918 OCFS2_STACK_LABEL_LEN); 942 OCFS2_STACK_LABEL_LEN);
919 mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; 943 mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
920 break; 944 break;
945 case Opt_inode64:
946 mopt->mount_opt |= OCFS2_MOUNT_INODE64;
947 break;
921 default: 948 default:
922 mlog(ML_ERROR, 949 mlog(ML_ERROR,
923 "Unrecognized mount option \"%s\" " 950 "Unrecognized mount option \"%s\" "
@@ -938,6 +965,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
938{ 965{
939 struct ocfs2_super *osb = OCFS2_SB(mnt->mnt_sb); 966 struct ocfs2_super *osb = OCFS2_SB(mnt->mnt_sb);
940 unsigned long opts = osb->s_mount_opt; 967 unsigned long opts = osb->s_mount_opt;
968 unsigned int local_alloc_megs;
941 969
942 if (opts & OCFS2_MOUNT_HB_LOCAL) 970 if (opts & OCFS2_MOUNT_HB_LOCAL)
943 seq_printf(s, ",_netdev,heartbeat=local"); 971 seq_printf(s, ",_netdev,heartbeat=local");
@@ -970,8 +998,9 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
970 seq_printf(s, ",commit=%u", 998 seq_printf(s, ",commit=%u",
971 (unsigned) (osb->osb_commit_interval / HZ)); 999 (unsigned) (osb->osb_commit_interval / HZ));
972 1000
973 if (osb->local_alloc_size != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
974 seq_printf(s, ",localalloc=%d", osb->local_alloc_size);
1001 local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits);
1002 if (local_alloc_megs != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
1003 seq_printf(s, ",localalloc=%d", local_alloc_megs);
975 1004
976 if (opts & OCFS2_MOUNT_LOCALFLOCKS) 1005 if (opts & OCFS2_MOUNT_LOCALFLOCKS)
977 seq_printf(s, ",localflocks,"); 1006 seq_printf(s, ",localflocks,");
@@ -980,6 +1009,14 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
980 seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, 1009 seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
981 osb->osb_cluster_stack); 1010 osb->osb_cluster_stack);
982 1011
1012 if (opts & OCFS2_MOUNT_NOUSERXATTR)
1013 seq_printf(s, ",nouser_xattr");
1014 else
1015 seq_printf(s, ",user_xattr");
1016
1017 if (opts & OCFS2_MOUNT_INODE64)
1018 seq_printf(s, ",inode64");
1019
983 return 0; 1020 return 0;
984} 1021}
985 1022
@@ -1132,6 +1169,7 @@ static void ocfs2_inode_init_once(void *data)
1132 oi->ip_dir_start_lookup = 0; 1169 oi->ip_dir_start_lookup = 0;
1133 1170
1134 init_rwsem(&oi->ip_alloc_sem); 1171 init_rwsem(&oi->ip_alloc_sem);
1172 init_rwsem(&oi->ip_xattr_sem);
1135 mutex_init(&oi->ip_io_mutex); 1173 mutex_init(&oi->ip_io_mutex);
1136 1174
1137 oi->ip_blkno = 0ULL; 1175 oi->ip_blkno = 0ULL;
@@ -1375,6 +1413,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
1375 sb->s_fs_info = osb; 1413 sb->s_fs_info = osb;
1376 sb->s_op = &ocfs2_sops; 1414 sb->s_op = &ocfs2_sops;
1377 sb->s_export_op = &ocfs2_export_ops; 1415 sb->s_export_op = &ocfs2_export_ops;
1416 sb->s_xattr = ocfs2_xattr_handlers;
1378 sb->s_time_gran = 1; 1417 sb->s_time_gran = 1;
1379 sb->s_flags |= MS_NOATIME; 1418 sb->s_flags |= MS_NOATIME;
1380 /* this is needed to support O_LARGEFILE */ 1419 /* this is needed to support O_LARGEFILE */
@@ -1421,8 +1460,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
1421 1460
1422 osb->slot_num = OCFS2_INVALID_SLOT; 1461 osb->slot_num = OCFS2_INVALID_SLOT;
1423 1462
1463 osb->s_xattr_inline_size = le16_to_cpu(
1464 di->id2.i_super.s_xattr_inline_size);
1465
1424 osb->local_alloc_state = OCFS2_LA_UNUSED; 1466 osb->local_alloc_state = OCFS2_LA_UNUSED;
1425 osb->local_alloc_bh = NULL; 1467 osb->local_alloc_bh = NULL;
1468 INIT_DELAYED_WORK(&osb->la_enable_wq, ocfs2_la_enable_worker);
1426 1469
1427 init_waitqueue_head(&osb->osb_mount_event); 1470 init_waitqueue_head(&osb->osb_mount_event);
1428 1471
@@ -1568,6 +1611,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
1568 osb->first_cluster_group_blkno = 1611 osb->first_cluster_group_blkno =
1569 le64_to_cpu(di->id2.i_super.s_first_cluster_group); 1612 le64_to_cpu(di->id2.i_super.s_first_cluster_group);
1570 osb->fs_generation = le32_to_cpu(di->i_fs_generation); 1613 osb->fs_generation = le32_to_cpu(di->i_fs_generation);
1614 osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash);
1571 mlog(0, "vol_label: %s\n", osb->vol_label); 1615 mlog(0, "vol_label: %s\n", osb->vol_label);
1572 mlog(0, "uuid: %s\n", osb->uuid_str); 1616 mlog(0, "uuid: %s\n", osb->uuid_str);
1573 mlog(0, "root_blkno=%llu, system_dir_blkno=%llu\n", 1617 mlog(0, "root_blkno=%llu, system_dir_blkno=%llu\n",
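
The localalloc option is now stored internally in clusters and converted back to megabytes only for ocfs2_show_options(). A worked example of the round trip, assuming 4KB clusters (s_clustersize_bits == 12), localalloc=8, and that ocfs2_megabytes_to_clusters() is the inverse left shift:

	/* mount path: megabytes -> clusters */
	local_alloc_default_bits = 8 << (20 - 12);	/* 2048 clusters */
	/* show_options path: clusters -> megabytes */
	local_alloc_megs = 2048 >> (20 - 12);		/* 8 MB again */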
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index ba9dbb51d25b..cbd03dfdc7b9 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -50,6 +50,7 @@
50#include "inode.h" 50#include "inode.h"
51#include "journal.h" 51#include "journal.h"
52#include "symlink.h" 52#include "symlink.h"
53#include "xattr.h"
53 54
54#include "buffer_head_io.h" 55#include "buffer_head_io.h"
55 56
@@ -83,11 +84,7 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode,
83 84
84 mlog_entry_void(); 85 mlog_entry_void();
85 86
86 status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
87 OCFS2_I(inode)->ip_blkno,
88 bh,
89 OCFS2_BH_CACHED,
90 inode);
87 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, bh);
91 if (status < 0) { 88 if (status < 0) {
92 mlog_errno(status); 89 mlog_errno(status);
93 link = ERR_PTR(status); 90 link = ERR_PTR(status);
@@ -157,8 +154,7 @@ bail:
157 kunmap(page); 154 kunmap(page);
158 page_cache_release(page); 155 page_cache_release(page);
159 } 156 }
160 if (bh)
161 brelse(bh);
157 brelse(bh);
162 158
163 return ERR_PTR(status); 159 return ERR_PTR(status);
164} 160}
@@ -168,10 +164,18 @@ const struct inode_operations ocfs2_symlink_inode_operations = {
168 .follow_link = ocfs2_follow_link, 164 .follow_link = ocfs2_follow_link,
169 .getattr = ocfs2_getattr, 165 .getattr = ocfs2_getattr,
170 .setattr = ocfs2_setattr, 166 .setattr = ocfs2_setattr,
167 .setxattr = generic_setxattr,
168 .getxattr = generic_getxattr,
169 .listxattr = ocfs2_listxattr,
170 .removexattr = generic_removexattr,
171}; 171};
172const struct inode_operations ocfs2_fast_symlink_inode_operations = { 172const struct inode_operations ocfs2_fast_symlink_inode_operations = {
173 .readlink = ocfs2_readlink, 173 .readlink = ocfs2_readlink,
174 .follow_link = ocfs2_follow_link, 174 .follow_link = ocfs2_follow_link,
175 .getattr = ocfs2_getattr, 175 .getattr = ocfs2_getattr,
176 .setattr = ocfs2_setattr, 176 .setattr = ocfs2_setattr,
177 .setxattr = generic_setxattr,
178 .getxattr = generic_getxattr,
179 .listxattr = ocfs2_listxattr,
180 .removexattr = generic_removexattr,
177}; 181};
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 4da8851f2b23..187b99ff0368 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -53,7 +53,11 @@
53#include <linux/highmem.h> 53#include <linux/highmem.h>
54#include <linux/buffer_head.h> 54#include <linux/buffer_head.h>
55#include <linux/rbtree.h> 55#include <linux/rbtree.h>
56#include <linux/jbd.h>
56#ifndef CONFIG_OCFS2_COMPAT_JBD
57# include <linux/jbd2.h>
58#else
59# include <linux/jbd.h>
60#endif
57 61
58#define MLOG_MASK_PREFIX ML_UPTODATE 62#define MLOG_MASK_PREFIX ML_UPTODATE
59 63
@@ -511,14 +515,10 @@ static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
511 ci->ci_num_cached--; 515 ci->ci_num_cached--;
512} 516}
513 517
514/* Called when we remove a chunk of metadata from an inode. We don't
515 * bother reverting things to an inlined array in the case of a remove
516 * which moves us back under the limit. */
517void ocfs2_remove_from_cache(struct inode *inode,
518 struct buffer_head *bh)
518static void ocfs2_remove_block_from_cache(struct inode *inode,
519 sector_t block)
519{ 520{
520 int index; 521 int index;
521 sector_t block = bh->b_blocknr;
522 struct ocfs2_meta_cache_item *item = NULL; 522 struct ocfs2_meta_cache_item *item = NULL;
523 struct ocfs2_inode_info *oi = OCFS2_I(inode); 523 struct ocfs2_inode_info *oi = OCFS2_I(inode);
524 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; 524 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
@@ -544,6 +544,30 @@ void ocfs2_remove_from_cache(struct inode *inode,
544 kmem_cache_free(ocfs2_uptodate_cachep, item); 544 kmem_cache_free(ocfs2_uptodate_cachep, item);
545} 545}
546 546
547/*
548 * Called when we remove a chunk of metadata from an inode. We don't
549 * bother reverting things to an inlined array in the case of a remove
550 * which moves us back under the limit.
551 */
552void ocfs2_remove_from_cache(struct inode *inode,
553 struct buffer_head *bh)
554{
555 sector_t block = bh->b_blocknr;
556
557 ocfs2_remove_block_from_cache(inode, block);
558}
559
560/* Called when we remove xattr clusters from an inode. */
561void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode,
562 sector_t block,
563 u32 c_len)
564{
565 unsigned int i, b_len = ocfs2_clusters_to_blocks(inode->i_sb, 1) * c_len;
566
567 for (i = 0; i < b_len; i++, block++)
568 ocfs2_remove_block_from_cache(inode, block);
569}
570
547int __init init_ocfs2_uptodate_cache(void) 571int __init init_ocfs2_uptodate_cache(void)
548{ 572{
549 ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate", 573 ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate",
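
The new ocfs2_remove_xattr_clusters_from_cache() walks a cluster run block by block. Its only caller in this patch, ocfs2_xattr_shrink_size() in xattr.c below, uses it like this:

	/* From the xattr truncate path later in this patch: map the
	 * freed run's first cluster to a block number, then purge every
	 * block of the run from the metadata cache. */
	block = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
	ocfs2_remove_xattr_clusters_from_cache(inode, block, alloc_size);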
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h
index 2e73206059a8..531b4b3a0c47 100644
--- a/fs/ocfs2/uptodate.h
+++ b/fs/ocfs2/uptodate.h
@@ -40,6 +40,9 @@ void ocfs2_set_new_buffer_uptodate(struct inode *inode,
40 struct buffer_head *bh); 40 struct buffer_head *bh);
41void ocfs2_remove_from_cache(struct inode *inode, 41void ocfs2_remove_from_cache(struct inode *inode,
42 struct buffer_head *bh); 42 struct buffer_head *bh);
43void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode,
44 sector_t block,
45 u32 c_len);
43int ocfs2_buffer_read_ahead(struct inode *inode, 46int ocfs2_buffer_read_ahead(struct inode *inode,
44 struct buffer_head *bh); 47 struct buffer_head *bh);
45 48
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
new file mode 100644
index 000000000000..c25780a70dfd
--- /dev/null
+++ b/fs/ocfs2/xattr.c
@@ -0,0 +1,4834 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * xattr.c
5 *
6 * Copyright (C) 2008 Oracle. All rights reserved.
7 *
8 * CREDITS:
9 * Lots of code in this file is taken from ext3.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public
22 * License along with this program; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 * Boston, MA 021110-1307, USA.
25 */
26
27#include <linux/capability.h>
28#include <linux/fs.h>
29#include <linux/types.h>
30#include <linux/slab.h>
31#include <linux/highmem.h>
32#include <linux/pagemap.h>
33#include <linux/uio.h>
34#include <linux/sched.h>
35#include <linux/splice.h>
36#include <linux/mount.h>
37#include <linux/writeback.h>
38#include <linux/falloc.h>
39#include <linux/sort.h>
40#include <linux/init.h>
41#include <linux/module.h>
42#include <linux/string.h>
43
44#define MLOG_MASK_PREFIX ML_XATTR
45#include <cluster/masklog.h>
46
47#include "ocfs2.h"
48#include "alloc.h"
49#include "dlmglue.h"
50#include "file.h"
51#include "symlink.h"
52#include "sysfile.h"
53#include "inode.h"
54#include "journal.h"
55#include "ocfs2_fs.h"
56#include "suballoc.h"
57#include "uptodate.h"
58#include "buffer_head_io.h"
59#include "super.h"
60#include "xattr.h"
61
62
63struct ocfs2_xattr_def_value_root {
64 struct ocfs2_xattr_value_root xv;
65 struct ocfs2_extent_rec er;
66};
67
68struct ocfs2_xattr_bucket {
69 struct buffer_head *bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET];
70 struct ocfs2_xattr_header *xh;
71};
72
73#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root))
74#define OCFS2_XATTR_INLINE_SIZE 80
75
76static struct ocfs2_xattr_def_value_root def_xv = {
77 .xv.xr_list.l_count = cpu_to_le16(1),
78};
79
80struct xattr_handler *ocfs2_xattr_handlers[] = {
81 &ocfs2_xattr_user_handler,
82 &ocfs2_xattr_trusted_handler,
83 NULL
84};
85
86static struct xattr_handler *ocfs2_xattr_handler_map[] = {
87 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
88 [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler,
89};
90
91struct ocfs2_xattr_info {
92 int name_index;
93 const char *name;
94 const void *value;
95 size_t value_len;
96};
97
98struct ocfs2_xattr_search {
99 struct buffer_head *inode_bh;
100 /*
101 * xattr_bh point to the block buffer head which has extended attribute
102 * when extended attribute in inode, xattr_bh is equal to inode_bh.
103 */
104 struct buffer_head *xattr_bh;
105 struct ocfs2_xattr_header *header;
106 struct ocfs2_xattr_bucket bucket;
107 void *base;
108 void *end;
109 struct ocfs2_xattr_entry *here;
110 int not_found;
111};
112
113static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
114 struct ocfs2_xattr_header *xh,
115 int index,
116 int *block_off,
117 int *new_offset);
118
119static int ocfs2_xattr_index_block_find(struct inode *inode,
120 struct buffer_head *root_bh,
121 int name_index,
122 const char *name,
123 struct ocfs2_xattr_search *xs);
124
125static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
126 struct ocfs2_xattr_tree_root *xt,
127 char *buffer,
128 size_t buffer_size);
129
130static int ocfs2_xattr_create_index_block(struct inode *inode,
131 struct ocfs2_xattr_search *xs);
132
133static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
134 struct ocfs2_xattr_info *xi,
135 struct ocfs2_xattr_search *xs);
136
137static int ocfs2_delete_xattr_index_block(struct inode *inode,
138 struct buffer_head *xb_bh);
139
140static inline const char *ocfs2_xattr_prefix(int name_index)
141{
142 struct xattr_handler *handler = NULL;
143
144 if (name_index > 0 && name_index < OCFS2_XATTR_MAX)
145 handler = ocfs2_xattr_handler_map[name_index];
146
147 return handler ? handler->prefix : NULL;
148}
149
150static u32 ocfs2_xattr_name_hash(struct inode *inode,
151 const char *name,
152 int name_len)
153{
154 /* Get hash value of uuid from super block */
155 u32 hash = OCFS2_SB(inode->i_sb)->uuid_hash;
156 int i;
157
158 /* hash extended attribute name */
159 for (i = 0; i < name_len; i++) {
160 hash = (hash << OCFS2_HASH_SHIFT) ^
161 (hash >> (8*sizeof(hash) - OCFS2_HASH_SHIFT)) ^
162 *name++;
163 }
164
165 return hash;
166}
167
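
The pair of shifts above amounts to a 32-bit left rotation XOR-folded with each name byte. An equivalent per-byte step using the kernel's rol32() from <linux/bitops.h> (a sketch, assuming 0 < OCFS2_HASH_SHIFT < 32):

	/* Rotate the running hash, then fold in the next name byte. */
	hash = rol32(hash, OCFS2_HASH_SHIFT) ^ *name++;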
168/*
169 * ocfs2_xattr_hash_entry()
170 *
171 * Compute the hash of an extended attribute.
172 */
173static void ocfs2_xattr_hash_entry(struct inode *inode,
174 struct ocfs2_xattr_header *header,
175 struct ocfs2_xattr_entry *entry)
176{
177 u32 hash = 0;
178 char *name = (char *)header + le16_to_cpu(entry->xe_name_offset);
179
180 hash = ocfs2_xattr_name_hash(inode, name, entry->xe_name_len);
181 entry->xe_name_hash = cpu_to_le32(hash);
182
183 return;
184}
185
186static int ocfs2_xattr_extend_allocation(struct inode *inode,
187 u32 clusters_to_add,
188 struct buffer_head *xattr_bh,
189 struct ocfs2_xattr_value_root *xv)
190{
191 int status = 0;
192 int restart_func = 0;
193 int credits = 0;
194 handle_t *handle = NULL;
195 struct ocfs2_alloc_context *data_ac = NULL;
196 struct ocfs2_alloc_context *meta_ac = NULL;
197 enum ocfs2_alloc_restarted why;
198 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
199 u32 prev_clusters, logical_start = le32_to_cpu(xv->xr_clusters);
200 struct ocfs2_extent_tree et;
201
202 mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
203
204 ocfs2_init_xattr_value_extent_tree(&et, inode, xattr_bh, xv);
205
206restart_all:
207
208 status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
209 &data_ac, &meta_ac);
210 if (status) {
211 mlog_errno(status);
212 goto leave;
213 }
214
215 credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
216 clusters_to_add);
217 handle = ocfs2_start_trans(osb, credits);
218 if (IS_ERR(handle)) {
219 status = PTR_ERR(handle);
220 handle = NULL;
221 mlog_errno(status);
222 goto leave;
223 }
224
225restarted_transaction:
226 status = ocfs2_journal_access(handle, inode, xattr_bh,
227 OCFS2_JOURNAL_ACCESS_WRITE);
228 if (status < 0) {
229 mlog_errno(status);
230 goto leave;
231 }
232
233 prev_clusters = le32_to_cpu(xv->xr_clusters);
234 status = ocfs2_add_clusters_in_btree(osb,
235 inode,
236 &logical_start,
237 clusters_to_add,
238 0,
239 &et,
240 handle,
241 data_ac,
242 meta_ac,
243 &why);
244 if ((status < 0) && (status != -EAGAIN)) {
245 if (status != -ENOSPC)
246 mlog_errno(status);
247 goto leave;
248 }
249
250 status = ocfs2_journal_dirty(handle, xattr_bh);
251 if (status < 0) {
252 mlog_errno(status);
253 goto leave;
254 }
255
256 clusters_to_add -= le32_to_cpu(xv->xr_clusters) - prev_clusters;
257
258 if (why != RESTART_NONE && clusters_to_add) {
259 if (why == RESTART_META) {
260 mlog(0, "restarting function.\n");
261 restart_func = 1;
262 } else {
263 BUG_ON(why != RESTART_TRANS);
264
265 mlog(0, "restarting transaction.\n");
266 /* TODO: This can be more intelligent. */
267 credits = ocfs2_calc_extend_credits(osb->sb,
268 et.et_root_el,
269 clusters_to_add);
270 status = ocfs2_extend_trans(handle, credits);
271 if (status < 0) {
272 /* handle still has to be committed at
273 * this point. */
274 status = -ENOMEM;
275 mlog_errno(status);
276 goto leave;
277 }
278 goto restarted_transaction;
279 }
280 }
281
282leave:
283 if (handle) {
284 ocfs2_commit_trans(osb, handle);
285 handle = NULL;
286 }
287 if (data_ac) {
288 ocfs2_free_alloc_context(data_ac);
289 data_ac = NULL;
290 }
291 if (meta_ac) {
292 ocfs2_free_alloc_context(meta_ac);
293 meta_ac = NULL;
294 }
295 if ((!status) && restart_func) {
296 restart_func = 0;
297 goto restart_all;
298 }
299
300 return status;
301}
302
303static int __ocfs2_remove_xattr_range(struct inode *inode,
304 struct buffer_head *root_bh,
305 struct ocfs2_xattr_value_root *xv,
306 u32 cpos, u32 phys_cpos, u32 len,
307 struct ocfs2_cached_dealloc_ctxt *dealloc)
308{
309 int ret;
310 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
311 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
312 struct inode *tl_inode = osb->osb_tl_inode;
313 handle_t *handle;
314 struct ocfs2_alloc_context *meta_ac = NULL;
315 struct ocfs2_extent_tree et;
316
317 ocfs2_init_xattr_value_extent_tree(&et, inode, root_bh, xv);
318
319 ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
320 if (ret) {
321 mlog_errno(ret);
322 return ret;
323 }
324
325 mutex_lock(&tl_inode->i_mutex);
326
327 if (ocfs2_truncate_log_needs_flush(osb)) {
328 ret = __ocfs2_flush_truncate_log(osb);
329 if (ret < 0) {
330 mlog_errno(ret);
331 goto out;
332 }
333 }
334
335 handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
336 if (IS_ERR(handle)) {
337 ret = PTR_ERR(handle);
338 mlog_errno(ret);
339 goto out;
340 }
341
342 ret = ocfs2_journal_access(handle, inode, root_bh,
343 OCFS2_JOURNAL_ACCESS_WRITE);
344 if (ret) {
345 mlog_errno(ret);
346 goto out_commit;
347 }
348
349 ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
350 dealloc);
351 if (ret) {
352 mlog_errno(ret);
353 goto out_commit;
354 }
355
356 le32_add_cpu(&xv->xr_clusters, -len);
357
358 ret = ocfs2_journal_dirty(handle, root_bh);
359 if (ret) {
360 mlog_errno(ret);
361 goto out_commit;
362 }
363
364 ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
365 if (ret)
366 mlog_errno(ret);
367
368out_commit:
369 ocfs2_commit_trans(osb, handle);
370out:
371 mutex_unlock(&tl_inode->i_mutex);
372
373 if (meta_ac)
374 ocfs2_free_alloc_context(meta_ac);
375
376 return ret;
377}
378
379static int ocfs2_xattr_shrink_size(struct inode *inode,
380 u32 old_clusters,
381 u32 new_clusters,
382 struct buffer_head *root_bh,
383 struct ocfs2_xattr_value_root *xv)
384{
385 int ret = 0;
386 u32 trunc_len, cpos, phys_cpos, alloc_size;
387 u64 block;
388 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
389 struct ocfs2_cached_dealloc_ctxt dealloc;
390
391 ocfs2_init_dealloc_ctxt(&dealloc);
392
393 if (old_clusters <= new_clusters)
394 return 0;
395
396 cpos = new_clusters;
397 trunc_len = old_clusters - new_clusters;
398 while (trunc_len) {
399 ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos,
400 &alloc_size, &xv->xr_list);
401 if (ret) {
402 mlog_errno(ret);
403 goto out;
404 }
405
406 if (alloc_size > trunc_len)
407 alloc_size = trunc_len;
408
409 ret = __ocfs2_remove_xattr_range(inode, root_bh, xv, cpos,
410 phys_cpos, alloc_size,
411 &dealloc);
412 if (ret) {
413 mlog_errno(ret);
414 goto out;
415 }
416
417 block = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
418 ocfs2_remove_xattr_clusters_from_cache(inode, block,
419 alloc_size);
420 cpos += alloc_size;
421 trunc_len -= alloc_size;
422 }
423
424out:
425 ocfs2_schedule_truncate_log_flush(osb, 1);
426 ocfs2_run_deallocs(osb, &dealloc);
427
428 return ret;
429}
430
431static int ocfs2_xattr_value_truncate(struct inode *inode,
432 struct buffer_head *root_bh,
433 struct ocfs2_xattr_value_root *xv,
434 int len)
435{
436 int ret;
437 u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len);
438 u32 old_clusters = le32_to_cpu(xv->xr_clusters);
439
440 if (new_clusters == old_clusters)
441 return 0;
442
443 if (new_clusters > old_clusters)
444 ret = ocfs2_xattr_extend_allocation(inode,
445 new_clusters - old_clusters,
446 root_bh, xv);
447 else
448 ret = ocfs2_xattr_shrink_size(inode,
449 old_clusters, new_clusters,
450 root_bh, xv);
451
452 return ret;
453}
454
455static int ocfs2_xattr_list_entry(char *buffer, size_t size,
456 size_t *result, const char *prefix,
457 const char *name, int name_len)
458{
459 char *p = buffer + *result;
460 int prefix_len = strlen(prefix);
461 int total_len = prefix_len + name_len + 1;
462
463 *result += total_len;
464
465 /* we are just looking for how big our buffer needs to be */
466 if (!size)
467 return 0;
468
469 if (*result > size)
470 return -ERANGE;
471
472 memcpy(p, prefix, prefix_len);
473 memcpy(p + prefix_len, name, name_len);
474 p[prefix_len + name_len] = '\0';
475
476 return 0;
477}
478
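The size == 0 probe above is what lets callers size a buffer before fetching the list. At the syscall level this becomes the usual two-pass pattern (userspace sketch, error handling elided):

	#include <sys/xattr.h>
	#include <stdlib.h>

	/* Pass 1: a NULL buffer returns the required length. */
	ssize_t len = listxattr(path, NULL, 0);
	char *names = malloc(len);
	/* Pass 2: fill the buffer with NUL-separated attribute names. */
	len = listxattr(path, names, len);
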
479static int ocfs2_xattr_list_entries(struct inode *inode,
480 struct ocfs2_xattr_header *header,
481 char *buffer, size_t buffer_size)
482{
483 size_t result = 0;
484 int i, type, ret;
485 const char *prefix, *name;
486
487 for (i = 0 ; i < le16_to_cpu(header->xh_count); i++) {
488 struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
489 type = ocfs2_xattr_get_type(entry);
490 prefix = ocfs2_xattr_prefix(type);
491
492 if (prefix) {
493 name = (const char *)header +
494 le16_to_cpu(entry->xe_name_offset);
495
496 ret = ocfs2_xattr_list_entry(buffer, buffer_size,
497 &result, prefix, name,
498 entry->xe_name_len);
499 if (ret)
500 return ret;
501 }
502 }
503
504 return result;
505}
506
507static int ocfs2_xattr_ibody_list(struct inode *inode,
508 struct ocfs2_dinode *di,
509 char *buffer,
510 size_t buffer_size)
511{
512 struct ocfs2_xattr_header *header = NULL;
513 struct ocfs2_inode_info *oi = OCFS2_I(inode);
514 int ret = 0;
515
516 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL))
517 return ret;
518
519 header = (struct ocfs2_xattr_header *)
520 ((void *)di + inode->i_sb->s_blocksize -
521 le16_to_cpu(di->i_xattr_inline_size));
522
523 ret = ocfs2_xattr_list_entries(inode, header, buffer, buffer_size);
524
525 return ret;
526}
527
528static int ocfs2_xattr_block_list(struct inode *inode,
529 struct ocfs2_dinode *di,
530 char *buffer,
531 size_t buffer_size)
532{
533 struct buffer_head *blk_bh = NULL;
534 struct ocfs2_xattr_block *xb;
535 int ret = 0;
536
537 if (!di->i_xattr_loc)
538 return ret;
539
540 ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
541 if (ret < 0) {
542 mlog_errno(ret);
543 return ret;
544 }
545 /* Verify the signature of the xattr block. */
546 if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
547 strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
548 ret = -EFAULT;
549 goto cleanup;
550 }
551
552 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
553
554 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
555 struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
556 ret = ocfs2_xattr_list_entries(inode, header,
557 buffer, buffer_size);
558 } else {
559 struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
560 ret = ocfs2_xattr_tree_list_index_block(inode, xt,
561 buffer, buffer_size);
562 }
563cleanup:
564 brelse(blk_bh);
565
566 return ret;
567}
568
569ssize_t ocfs2_listxattr(struct dentry *dentry,
570 char *buffer,
571 size_t size)
572{
573 int ret = 0, i_ret = 0, b_ret = 0;
574 struct buffer_head *di_bh = NULL;
575 struct ocfs2_dinode *di = NULL;
576 struct ocfs2_inode_info *oi = OCFS2_I(dentry->d_inode);
577
578 if (!ocfs2_supports_xattr(OCFS2_SB(dentry->d_sb)))
579 return -EOPNOTSUPP;
580
581 if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
582 return ret;
583
584 ret = ocfs2_inode_lock(dentry->d_inode, &di_bh, 0);
585 if (ret < 0) {
586 mlog_errno(ret);
587 return ret;
588 }
589
590 di = (struct ocfs2_dinode *)di_bh->b_data;
591
592 down_read(&oi->ip_xattr_sem);
593 i_ret = ocfs2_xattr_ibody_list(dentry->d_inode, di, buffer, size);
594 if (i_ret < 0)
595 b_ret = 0;
596 else {
597 if (buffer) {
598 buffer += i_ret;
599 size -= i_ret;
600 }
601 b_ret = ocfs2_xattr_block_list(dentry->d_inode, di,
602 buffer, size);
603 if (b_ret < 0)
604 i_ret = 0;
605 }
606 up_read(&oi->ip_xattr_sem);
607 ocfs2_inode_unlock(dentry->d_inode, 0);
608
609 brelse(di_bh);
610
611 return i_ret + b_ret;
612}
613
614static int ocfs2_xattr_find_entry(int name_index,
615 const char *name,
616 struct ocfs2_xattr_search *xs)
617{
618 struct ocfs2_xattr_entry *entry;
619 size_t name_len;
620 int i, cmp = 1;
621
622 if (name == NULL)
623 return -EINVAL;
624
625 name_len = strlen(name);
626 entry = xs->here;
627 for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) {
628 cmp = name_index - ocfs2_xattr_get_type(entry);
629 if (!cmp)
630 cmp = name_len - entry->xe_name_len;
631 if (!cmp)
632 cmp = memcmp(name, (xs->base +
633 le16_to_cpu(entry->xe_name_offset)),
634 name_len);
635 if (cmp == 0)
636 break;
637 entry += 1;
638 }
639 xs->here = entry;
640
641 return cmp ? -ENODATA : 0;
642}
643
644static int ocfs2_xattr_get_value_outside(struct inode *inode,
645 struct ocfs2_xattr_value_root *xv,
646 void *buffer,
647 size_t len)
648{
649 u32 cpos, p_cluster, num_clusters, bpc, clusters;
650 u64 blkno;
651 int i, ret = 0;
652 size_t cplen, blocksize;
653 struct buffer_head *bh = NULL;
654 struct ocfs2_extent_list *el;
655
656 el = &xv->xr_list;
657 clusters = le32_to_cpu(xv->xr_clusters);
658 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
659 blocksize = inode->i_sb->s_blocksize;
660
661 cpos = 0;
662 while (cpos < clusters) {
663 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
664 &num_clusters, el);
665 if (ret) {
666 mlog_errno(ret);
667 goto out;
668 }
669
670 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
671 /* Copy ocfs2_xattr_value */
672 for (i = 0; i < num_clusters * bpc; i++, blkno++) {
673 ret = ocfs2_read_block(inode, blkno, &bh);
674 if (ret) {
675 mlog_errno(ret);
676 goto out;
677 }
678
679 cplen = len >= blocksize ? blocksize : len;
680 memcpy(buffer, bh->b_data, cplen);
681 len -= cplen;
682 buffer += cplen;
683
684 brelse(bh);
685 bh = NULL;
686 if (len == 0)
687 break;
688 }
689 cpos += num_clusters;
690 }
691out:
692 return ret;
693}
694
695static int ocfs2_xattr_ibody_get(struct inode *inode,
696 int name_index,
697 const char *name,
698 void *buffer,
699 size_t buffer_size,
700 struct ocfs2_xattr_search *xs)
701{
702 struct ocfs2_inode_info *oi = OCFS2_I(inode);
703 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
704 struct ocfs2_xattr_value_root *xv;
705 size_t size;
706 int ret = 0;
707
708 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL))
709 return -ENODATA;
710
711 xs->end = (void *)di + inode->i_sb->s_blocksize;
712 xs->header = (struct ocfs2_xattr_header *)
713 (xs->end - le16_to_cpu(di->i_xattr_inline_size));
714 xs->base = (void *)xs->header;
715 xs->here = xs->header->xh_entries;
716
717 ret = ocfs2_xattr_find_entry(name_index, name, xs);
718 if (ret)
719 return ret;
720 size = le64_to_cpu(xs->here->xe_value_size);
721 if (buffer) {
722 if (size > buffer_size)
723 return -ERANGE;
724 if (ocfs2_xattr_is_local(xs->here)) {
725 memcpy(buffer, (void *)xs->base +
726 le16_to_cpu(xs->here->xe_name_offset) +
727 OCFS2_XATTR_SIZE(xs->here->xe_name_len), size);
728 } else {
729 xv = (struct ocfs2_xattr_value_root *)
730 (xs->base + le16_to_cpu(
731 xs->here->xe_name_offset) +
732 OCFS2_XATTR_SIZE(xs->here->xe_name_len));
733 ret = ocfs2_xattr_get_value_outside(inode, xv,
734 buffer, size);
735 if (ret < 0) {
736 mlog_errno(ret);
737 return ret;
738 }
739 }
740 }
741
742 return size;
743}
744
745static int ocfs2_xattr_block_get(struct inode *inode,
746 int name_index,
747 const char *name,
748 void *buffer,
749 size_t buffer_size,
750 struct ocfs2_xattr_search *xs)
751{
752 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
753 struct buffer_head *blk_bh = NULL;
754 struct ocfs2_xattr_block *xb;
755 struct ocfs2_xattr_value_root *xv;
756 size_t size;
757 int ret = -ENODATA, name_offset, name_len, block_off, i;
758
759 if (!di->i_xattr_loc)
760 return ret;
761
762 memset(&xs->bucket, 0, sizeof(xs->bucket));
763
764 ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
765 if (ret < 0) {
766 mlog_errno(ret);
767 return ret;
768 }
769 /* Verify the signature of the xattr block. */
770 if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
771 strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
772 ret = -EFAULT;
773 goto cleanup;
774 }
775
776 xs->xattr_bh = blk_bh;
777 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
778
779 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
780 xs->header = &xb->xb_attrs.xb_header;
781 xs->base = (void *)xs->header;
782 xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size;
783 xs->here = xs->header->xh_entries;
784
785 ret = ocfs2_xattr_find_entry(name_index, name, xs);
786 } else
787 ret = ocfs2_xattr_index_block_find(inode, blk_bh,
788 name_index,
789 name, xs);
790
791 if (ret)
792 goto cleanup;
793 size = le64_to_cpu(xs->here->xe_value_size);
794 if (buffer) {
795 ret = -ERANGE;
796 if (size > buffer_size)
797 goto cleanup;
798
799 name_offset = le16_to_cpu(xs->here->xe_name_offset);
800 name_len = OCFS2_XATTR_SIZE(xs->here->xe_name_len);
801 i = xs->here - xs->header->xh_entries;
802
803 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
804 ret = ocfs2_xattr_bucket_get_name_value(inode,
805 xs->bucket.xh,
806 i,
807 &block_off,
808 &name_offset);
809 xs->base = xs->bucket.bhs[block_off]->b_data;
810 }
811 if (ocfs2_xattr_is_local(xs->here)) {
812 memcpy(buffer, (void *)xs->base +
813 name_offset + name_len, size);
814 } else {
815 xv = (struct ocfs2_xattr_value_root *)
816 (xs->base + name_offset + name_len);
817 ret = ocfs2_xattr_get_value_outside(inode, xv,
818 buffer, size);
819 if (ret < 0) {
820 mlog_errno(ret);
821 goto cleanup;
822 }
823 }
824 }
825 ret = size;
826cleanup:
827 for (i = 0; i < OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET; i++)
828 brelse(xs->bucket.bhs[i]);
829 memset(&xs->bucket, 0, sizeof(xs->bucket));
830
831 brelse(blk_bh);
832 return ret;
833}
834
835/* ocfs2_xattr_get()
836 *
837 * Copy an extended attribute into the buffer provided.
838 * Buffer is NULL to compute the size of buffer required.
839 */
840int ocfs2_xattr_get(struct inode *inode,
841 int name_index,
842 const char *name,
843 void *buffer,
844 size_t buffer_size)
845{
846 int ret;
847 struct ocfs2_dinode *di = NULL;
848 struct buffer_head *di_bh = NULL;
849 struct ocfs2_inode_info *oi = OCFS2_I(inode);
850 struct ocfs2_xattr_search xis = {
851 .not_found = -ENODATA,
852 };
853 struct ocfs2_xattr_search xbs = {
854 .not_found = -ENODATA,
855 };
856
857 if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
858 return -EOPNOTSUPP;
859
860 if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
861 return -ENODATA;
862
863 ret = ocfs2_inode_lock(inode, &di_bh, 0);
864 if (ret < 0) {
865 mlog_errno(ret);
866 return ret;
867 }
868 xis.inode_bh = xbs.inode_bh = di_bh;
869 di = (struct ocfs2_dinode *)di_bh->b_data;
870
871 down_read(&oi->ip_xattr_sem);
872 ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer,
873 buffer_size, &xis);
874 if (ret == -ENODATA)
875 ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
876 buffer_size, &xbs);
877 up_read(&oi->ip_xattr_sem);
878 ocfs2_inode_unlock(inode, 0);
879
880 brelse(di_bh);
881
882 return ret;
883}
884
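Per the header comment, a NULL buffer sizes the value first. An in-kernel caller sketch (the attribute name and GFP flags are illustrative):

	/* Pass 1: NULL buffer returns the value size. */
	int size = ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_USER,
				   "foo", NULL, 0);
	if (size >= 0) {
		void *val = kmalloc(size, GFP_NOFS);

		/* Pass 2: copy the value into the sized buffer. */
		if (val)
			size = ocfs2_xattr_get(inode,
					       OCFS2_XATTR_INDEX_USER,
					       "foo", val, size);
	}
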
885static int __ocfs2_xattr_set_value_outside(struct inode *inode,
886 struct ocfs2_xattr_value_root *xv,
887 const void *value,
888 int value_len)
889{
890 int ret = 0, i, cp_len, credits;
891 u16 blocksize = inode->i_sb->s_blocksize;
892 u32 p_cluster, num_clusters;
893 u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
894 u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len);
895 u64 blkno;
896 struct buffer_head *bh = NULL;
897 handle_t *handle;
898
899 BUG_ON(clusters > le32_to_cpu(xv->xr_clusters));
900
901 credits = clusters * bpc;
902 handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), credits);
903 if (IS_ERR(handle)) {
904 ret = PTR_ERR(handle);
905 mlog_errno(ret);
906 goto out;
907 }
908
909 while (cpos < clusters) {
910 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
911 &num_clusters, &xv->xr_list);
912 if (ret) {
913 mlog_errno(ret);
914 goto out_commit;
915 }
916
917 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
918
919 for (i = 0; i < num_clusters * bpc; i++, blkno++) {
920 ret = ocfs2_read_block(inode, blkno, &bh);
921 if (ret) {
922 mlog_errno(ret);
923 goto out_commit;
924 }
925
926 ret = ocfs2_journal_access(handle,
927 inode,
928 bh,
929 OCFS2_JOURNAL_ACCESS_WRITE);
930 if (ret < 0) {
931 mlog_errno(ret);
932 goto out_commit;
933 }
934
935 cp_len = value_len > blocksize ? blocksize : value_len;
936 memcpy(bh->b_data, value, cp_len);
937 value_len -= cp_len;
938 value += cp_len;
939 if (cp_len < blocksize)
940 memset(bh->b_data + cp_len, 0,
941 blocksize - cp_len);
942
943 ret = ocfs2_journal_dirty(handle, bh);
944 if (ret < 0) {
945 mlog_errno(ret);
946 goto out_commit;
947 }
948 brelse(bh);
949 bh = NULL;
950
951 /*
952 * XXX: do we need to empty all the following
953 * blocks in this cluster?
954 */
955 if (!value_len)
956 break;
957 }
958 cpos += num_clusters;
959 }
960out_commit:
961 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
962out:
963 brelse(bh);
964
965 return ret;
966}
967
968static int ocfs2_xattr_cleanup(struct inode *inode,
969 struct ocfs2_xattr_info *xi,
970 struct ocfs2_xattr_search *xs,
971 size_t offs)
972{
973 handle_t *handle = NULL;
974 int ret = 0;
975 size_t name_len = strlen(xi->name);
976 void *val = xs->base + offs;
977 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
978
979 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
980 OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
981 if (IS_ERR(handle)) {
982 ret = PTR_ERR(handle);
983 mlog_errno(ret);
984 goto out;
985 }
986 ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
987 OCFS2_JOURNAL_ACCESS_WRITE);
988 if (ret) {
989 mlog_errno(ret);
990 goto out_commit;
991 }
992 /* Decrease xattr count */
993 le16_add_cpu(&xs->header->xh_count, -1);
994 /* Remove the xattr entry and the tree root which have already been set. */
995 memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry));
996 memset(val, 0, size);
997
998 ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
999 if (ret < 0)
1000 mlog_errno(ret);
1001out_commit:
1002 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
1003out:
1004 return ret;
1005}
1006
1007static int ocfs2_xattr_update_entry(struct inode *inode,
1008 struct ocfs2_xattr_info *xi,
1009 struct ocfs2_xattr_search *xs,
1010 size_t offs)
1011{
1012 handle_t *handle = NULL;
1013 int ret = 0;
1014
1015 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
1016 OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
1017 if (IS_ERR(handle)) {
1018 ret = PTR_ERR(handle);
1019 mlog_errno(ret);
1020 goto out;
1021 }
1022 ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
1023 OCFS2_JOURNAL_ACCESS_WRITE);
1024 if (ret) {
1025 mlog_errno(ret);
1026 goto out_commit;
1027 }
1028
1029 xs->here->xe_name_offset = cpu_to_le16(offs);
1030 xs->here->xe_value_size = cpu_to_le64(xi->value_len);
1031 if (xi->value_len <= OCFS2_XATTR_INLINE_SIZE)
1032 ocfs2_xattr_set_local(xs->here, 1);
1033 else
1034 ocfs2_xattr_set_local(xs->here, 0);
1035 ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
1036
1037 ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
1038 if (ret < 0)
1039 mlog_errno(ret);
1040out_commit:
1041 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
1042out:
1043 return ret;
1044}
1045
1046/*
1047 * ocfs2_xattr_set_value_outside()
1048 *
1049 * Set a large-size value in the B-tree.
1050 */
1051static int ocfs2_xattr_set_value_outside(struct inode *inode,
1052 struct ocfs2_xattr_info *xi,
1053 struct ocfs2_xattr_search *xs,
1054 size_t offs)
1055{
1056 size_t name_len = strlen(xi->name);
1057 void *val = xs->base + offs;
1058 struct ocfs2_xattr_value_root *xv = NULL;
1059 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
1060 int ret = 0;
1061
1062 memset(val, 0, size);
1063 memcpy(val, xi->name, name_len);
1064 xv = (struct ocfs2_xattr_value_root *)
1065 (val + OCFS2_XATTR_SIZE(name_len));
1066 xv->xr_clusters = 0;
1067 xv->xr_last_eb_blk = 0;
1068 xv->xr_list.l_tree_depth = 0;
1069 xv->xr_list.l_count = cpu_to_le16(1);
1070 xv->xr_list.l_next_free_rec = 0;
1071
1072 ret = ocfs2_xattr_value_truncate(inode, xs->xattr_bh, xv,
1073 xi->value_len);
1074 if (ret < 0) {
1075 mlog_errno(ret);
1076 return ret;
1077 }
1078 ret = __ocfs2_xattr_set_value_outside(inode, xv, xi->value,
1079 xi->value_len);
1080 if (ret < 0) {
1081 mlog_errno(ret);
1082 return ret;
1083 }
1084 ret = ocfs2_xattr_update_entry(inode, xi, xs, offs);
1085 if (ret < 0)
1086 mlog_errno(ret);
1087
1088 return ret;
1089}
1090
1091/*
1092 * ocfs2_xattr_set_entry_local()
1093 *
1094 * Set, replace or remove an extended attribute stored locally (inline).
1095 */
1096static void ocfs2_xattr_set_entry_local(struct inode *inode,
1097 struct ocfs2_xattr_info *xi,
1098 struct ocfs2_xattr_search *xs,
1099 struct ocfs2_xattr_entry *last,
1100 size_t min_offs)
1101{
1102 size_t name_len = strlen(xi->name);
1103 int i;
1104
1105 if (xi->value && xs->not_found) {
1106 /* Insert the new xattr entry. */
1107 le16_add_cpu(&xs->header->xh_count, 1);
1108 ocfs2_xattr_set_type(last, xi->name_index);
1109 ocfs2_xattr_set_local(last, 1);
1110 last->xe_name_len = name_len;
1111 } else {
1112 void *first_val;
1113 void *val;
1114 size_t offs, size;
1115
1116 first_val = xs->base + min_offs;
1117 offs = le16_to_cpu(xs->here->xe_name_offset);
1118 val = xs->base + offs;
1119
1120 if (le64_to_cpu(xs->here->xe_value_size) >
1121 OCFS2_XATTR_INLINE_SIZE)
1122 size = OCFS2_XATTR_SIZE(name_len) +
1123 OCFS2_XATTR_ROOT_SIZE;
1124 else
1125 size = OCFS2_XATTR_SIZE(name_len) +
1126 OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
1127
1128 if (xi->value && size == OCFS2_XATTR_SIZE(name_len) +
1129 OCFS2_XATTR_SIZE(xi->value_len)) {
1130 /* The old and the new value have the
1131 same size. Just replace the value. */
1132 ocfs2_xattr_set_local(xs->here, 1);
1133 xs->here->xe_value_size = cpu_to_le64(xi->value_len);
1134 /* Clear value bytes. */
1135 memset(val + OCFS2_XATTR_SIZE(name_len),
1136 0,
1137 OCFS2_XATTR_SIZE(xi->value_len));
1138 memcpy(val + OCFS2_XATTR_SIZE(name_len),
1139 xi->value,
1140 xi->value_len);
1141 return;
1142 }
1143 /* Remove the old name+value. */
1144 memmove(first_val + size, first_val, val - first_val);
1145 memset(first_val, 0, size);
1146 xs->here->xe_name_hash = 0;
1147 xs->here->xe_name_offset = 0;
1148 ocfs2_xattr_set_local(xs->here, 1);
1149 xs->here->xe_value_size = 0;
1150
1151 min_offs += size;
1152
1153 /* Adjust all value offsets. */
1154 last = xs->header->xh_entries;
1155 for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) {
1156 size_t o = le16_to_cpu(last->xe_name_offset);
1157
1158 if (o < offs)
1159 last->xe_name_offset = cpu_to_le16(o + size);
1160 last += 1;
1161 }
1162
1163 if (!xi->value) {
1164 /* Remove the old entry. */
1165 last -= 1;
1166 memmove(xs->here, xs->here + 1,
1167 (void *)last - (void *)xs->here);
1168 memset(last, 0, sizeof(struct ocfs2_xattr_entry));
1169 le16_add_cpu(&xs->header->xh_count, -1);
1170 }
1171 }
1172 if (xi->value) {
1173 /* Insert the new name+value. */
1174 size_t size = OCFS2_XATTR_SIZE(name_len) +
1175 OCFS2_XATTR_SIZE(xi->value_len);
1176 void *val = xs->base + min_offs - size;
1177
1178 xs->here->xe_name_offset = cpu_to_le16(min_offs - size);
1179 memset(val, 0, size);
1180 memcpy(val, xi->name, name_len);
1181 memcpy(val + OCFS2_XATTR_SIZE(name_len),
1182 xi->value,
1183 xi->value_len);
1184 xs->here->xe_value_size = cpu_to_le64(xi->value_len);
1185 ocfs2_xattr_set_local(xs->here, 1);
1186 ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
1187 }
1188
1189 return;
1190}
1191
1192/*
1193 * ocfs2_xattr_set_entry()
1194 *
1195 * Set extended attribute entry into inode or block.
1196 *
1197 * If the extended attribute value size > OCFS2_XATTR_INLINE_SIZE,
1198 * we first insert a tree root (ocfs2_xattr_value_root) with set_entry_local(),
1199 * then set the value in the B-tree with set_value_outside().
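 *
 * If the second step fails, ocfs2_xattr_cleanup() removes the stale
 * tree root left behind by the first step.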
1200 */
1201static int ocfs2_xattr_set_entry(struct inode *inode,
1202 struct ocfs2_xattr_info *xi,
1203 struct ocfs2_xattr_search *xs,
1204 int flag)
1205{
1206 struct ocfs2_xattr_entry *last;
1207 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1208 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
1209 size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name);
1210 size_t size_l = 0;
1211 handle_t *handle = NULL;
1212 int free, i, ret;
1213 struct ocfs2_xattr_info xi_l = {
1214 .name_index = xi->name_index,
1215 .name = xi->name,
1216 .value = xi->value,
1217 .value_len = xi->value_len,
1218 };
1219
1220 /* Compute min_offs, last and free space. */
1221 last = xs->header->xh_entries;
1222
1223 for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) {
1224 size_t offs = le16_to_cpu(last->xe_name_offset);
1225 if (offs < min_offs)
1226 min_offs = offs;
1227 last += 1;
1228 }
1229
1230 free = min_offs - ((void *)last - xs->base) - sizeof(__u32);
1231 if (free < 0)
1232 return -EFAULT;
1233
1234 if (!xs->not_found) {
1235 size_t size = 0;
1236 if (ocfs2_xattr_is_local(xs->here))
1237 size = OCFS2_XATTR_SIZE(name_len) +
1238 OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
1239 else
1240 size = OCFS2_XATTR_SIZE(name_len) +
1241 OCFS2_XATTR_ROOT_SIZE;
1242 free += (size + sizeof(struct ocfs2_xattr_entry));
1243 }
1244 /* Check free space in inode or block */
1245 if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
1246 if (free < sizeof(struct ocfs2_xattr_entry) +
1247 OCFS2_XATTR_SIZE(name_len) +
1248 OCFS2_XATTR_ROOT_SIZE) {
1249 ret = -ENOSPC;
1250 goto out;
1251 }
1252 size_l = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
1253 xi_l.value = (void *)&def_xv;
1254 xi_l.value_len = OCFS2_XATTR_ROOT_SIZE;
1255 } else if (xi->value) {
1256 if (free < sizeof(struct ocfs2_xattr_entry) +
1257 OCFS2_XATTR_SIZE(name_len) +
1258 OCFS2_XATTR_SIZE(xi->value_len)) {
1259 ret = -ENOSPC;
1260 goto out;
1261 }
1262 }
1263
1264 if (!xs->not_found) {
1265 /* For existing extended attribute */
1266 size_t size = OCFS2_XATTR_SIZE(name_len) +
1267 OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
1268 size_t offs = le16_to_cpu(xs->here->xe_name_offset);
1269 void *val = xs->base + offs;
1270
1271 if (ocfs2_xattr_is_local(xs->here) && size == size_l) {
1272 /* Replace existing local xattr with tree root */
1273 ret = ocfs2_xattr_set_value_outside(inode, xi, xs,
1274 offs);
1275 if (ret < 0)
1276 mlog_errno(ret);
1277 goto out;
1278 } else if (!ocfs2_xattr_is_local(xs->here)) {
1279	/* For an existing xattr whose value is stored outside */
1280 struct ocfs2_xattr_value_root *xv = NULL;
1281 xv = (struct ocfs2_xattr_value_root *)(val +
1282 OCFS2_XATTR_SIZE(name_len));
1283
1284 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
1285 /*
1286	 * If the new value also needs to be stored outside,
1287	 * first truncate the old value to the new length,
1288	 * then write the new value with set_value_outside().
1289 */
1290 ret = ocfs2_xattr_value_truncate(inode,
1291 xs->xattr_bh,
1292 xv,
1293 xi->value_len);
1294 if (ret < 0) {
1295 mlog_errno(ret);
1296 goto out;
1297 }
1298
1299 ret = __ocfs2_xattr_set_value_outside(inode,
1300 xv,
1301 xi->value,
1302 xi->value_len);
1303 if (ret < 0) {
1304 mlog_errno(ret);
1305 goto out;
1306 }
1307
1308 ret = ocfs2_xattr_update_entry(inode,
1309 xi,
1310 xs,
1311 offs);
1312 if (ret < 0)
1313 mlog_errno(ret);
1314 goto out;
1315 } else {
1316 /*
1317	 * If the new value will be stored locally,
1318	 * just truncate the old value to zero.
1319 */
1320 ret = ocfs2_xattr_value_truncate(inode,
1321 xs->xattr_bh,
1322 xv,
1323 0);
1324 if (ret < 0)
1325 mlog_errno(ret);
1326 }
1327 }
1328 }
1329
1330 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
1331 OCFS2_INODE_UPDATE_CREDITS);
1332 if (IS_ERR(handle)) {
1333 ret = PTR_ERR(handle);
1334 mlog_errno(ret);
1335 goto out;
1336 }
1337
1338 ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
1339 OCFS2_JOURNAL_ACCESS_WRITE);
1340 if (ret) {
1341 mlog_errno(ret);
1342 goto out_commit;
1343 }
1344
1345 if (!(flag & OCFS2_INLINE_XATTR_FL)) {
1346 /* set extended attribute in external block. */
1347 ret = ocfs2_extend_trans(handle,
1348 OCFS2_INODE_UPDATE_CREDITS +
1349 OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
1350 if (ret) {
1351 mlog_errno(ret);
1352 goto out_commit;
1353 }
1354 ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
1355 OCFS2_JOURNAL_ACCESS_WRITE);
1356 if (ret) {
1357 mlog_errno(ret);
1358 goto out_commit;
1359 }
1360 }
1361
1362 /*
1363	 * Set the value locally; for a large value this stores the tree
1364	 * root locally and is the first step for value size > INLINE_SIZE.
1365 */
1366 ocfs2_xattr_set_entry_local(inode, &xi_l, xs, last, min_offs);
1367
1368 if (!(flag & OCFS2_INLINE_XATTR_FL)) {
1369 ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
1370 if (ret < 0) {
1371 mlog_errno(ret);
1372 goto out_commit;
1373 }
1374 }
1375
1376 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) &&
1377 (flag & OCFS2_INLINE_XATTR_FL)) {
1378 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1379 unsigned int xattrsize = osb->s_xattr_inline_size;
1380
1381 /*
1382 * Adjust extent record count or inline data size
1383 * to reserve space for extended attribute.
1384 */
1385 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1386 struct ocfs2_inline_data *idata = &di->id2.i_data;
1387 le16_add_cpu(&idata->id_count, -xattrsize);
1388 } else if (!(ocfs2_inode_is_fast_symlink(inode))) {
1389 struct ocfs2_extent_list *el = &di->id2.i_list;
1390 le16_add_cpu(&el->l_count, -(xattrsize /
1391 sizeof(struct ocfs2_extent_rec)));
1392 }
1393 di->i_xattr_inline_size = cpu_to_le16(xattrsize);
1394 }
1395 /* Update xattr flag */
1396 spin_lock(&oi->ip_lock);
1397 oi->ip_dyn_features |= flag;
1398 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
1399 spin_unlock(&oi->ip_lock);
1400 /* Update inode ctime */
1401 inode->i_ctime = CURRENT_TIME;
1402 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
1403 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
1404
1405 ret = ocfs2_journal_dirty(handle, xs->inode_bh);
1406 if (ret < 0)
1407 mlog_errno(ret);
1408
1409out_commit:
1410 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
1411
1412 if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
1413 /*
1414	 * Set the value outside, in the B-tree.
1415 * This is the second step for value size > INLINE_SIZE.
1416 */
1417 size_t offs = le16_to_cpu(xs->here->xe_name_offset);
1418 ret = ocfs2_xattr_set_value_outside(inode, xi, xs, offs);
1419 if (ret < 0) {
1420 int ret2;
1421
1422 mlog_errno(ret);
1423 /*
1424	 * If setting the value outside failed, we have to clean
1425	 * up the junk tree root we already set locally.
1426 */
1427 ret2 = ocfs2_xattr_cleanup(inode, xi, xs, offs);
1428 if (ret2 < 0)
1429 mlog_errno(ret2);
1430 }
1431 }
1432out:
1433 return ret;
1434
1435}
1436
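/*
 * Walk every entry in @header and truncate to zero length any value
 * stored outside (in its own ocfs2_xattr_value_root), so that the
 * clusters it occupies are released before the container goes away.
 */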
1437static int ocfs2_remove_value_outside(struct inode *inode,
1438 struct buffer_head *bh,
1439 struct ocfs2_xattr_header *header)
1440{
1441 int ret = 0, i;
1442
1443 for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
1444 struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
1445
1446 if (!ocfs2_xattr_is_local(entry)) {
1447 struct ocfs2_xattr_value_root *xv;
1448 void *val;
1449
1450 val = (void *)header +
1451 le16_to_cpu(entry->xe_name_offset);
1452 xv = (struct ocfs2_xattr_value_root *)
1453 (val + OCFS2_XATTR_SIZE(entry->xe_name_len));
1454 ret = ocfs2_xattr_value_truncate(inode, bh, xv, 0);
1455 if (ret < 0) {
1456 mlog_errno(ret);
1457 return ret;
1458 }
1459 }
1460 }
1461
1462 return ret;
1463}
1464
1465static int ocfs2_xattr_ibody_remove(struct inode *inode,
1466 struct buffer_head *di_bh)
1467{
1468
1469 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1470 struct ocfs2_xattr_header *header;
1471 int ret;
1472
1473 header = (struct ocfs2_xattr_header *)
1474 ((void *)di + inode->i_sb->s_blocksize -
1475 le16_to_cpu(di->i_xattr_inline_size));
1476
1477 ret = ocfs2_remove_value_outside(inode, di_bh, header);
1478
1479 return ret;
1480}
1481
1482static int ocfs2_xattr_block_remove(struct inode *inode,
1483 struct buffer_head *blk_bh)
1484{
1485 struct ocfs2_xattr_block *xb;
1486 int ret = 0;
1487
1488 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1489 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
1490 struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header);
1491 ret = ocfs2_remove_value_outside(inode, blk_bh, header);
1492 } else
1493 ret = ocfs2_delete_xattr_index_block(inode, blk_bh);
1494
1495 return ret;
1496}
1497
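/*
 * Tear down the external xattr block at @block: truncate any values
 * stored outside of it, then give its bit back to the suballocator so
 * that the block itself is freed.
 */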
1498static int ocfs2_xattr_free_block(struct inode *inode,
1499 u64 block)
1500{
1501 struct inode *xb_alloc_inode;
1502 struct buffer_head *xb_alloc_bh = NULL;
1503 struct buffer_head *blk_bh = NULL;
1504 struct ocfs2_xattr_block *xb;
1505 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1506 handle_t *handle;
1507 int ret = 0;
1508 u64 blk, bg_blkno;
1509 u16 bit;
1510
1511 ret = ocfs2_read_block(inode, block, &blk_bh);
1512 if (ret < 0) {
1513 mlog_errno(ret);
1514 goto out;
1515 }
1516
1517	/* Verify the signature of the xattr block */
1518 if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
1519 strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
1520 ret = -EFAULT;
1521 goto out;
1522 }
1523
1524 ret = ocfs2_xattr_block_remove(inode, blk_bh);
1525 if (ret < 0) {
1526 mlog_errno(ret);
1527 goto out;
1528 }
1529
1530 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1531 blk = le64_to_cpu(xb->xb_blkno);
1532 bit = le16_to_cpu(xb->xb_suballoc_bit);
1533 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1534
1535 xb_alloc_inode = ocfs2_get_system_file_inode(osb,
1536 EXTENT_ALLOC_SYSTEM_INODE,
1537 le16_to_cpu(xb->xb_suballoc_slot));
1538 if (!xb_alloc_inode) {
1539 ret = -ENOMEM;
1540 mlog_errno(ret);
1541 goto out;
1542 }
1543 mutex_lock(&xb_alloc_inode->i_mutex);
1544
1545 ret = ocfs2_inode_lock(xb_alloc_inode, &xb_alloc_bh, 1);
1546 if (ret < 0) {
1547 mlog_errno(ret);
1548 goto out_mutex;
1549 }
1550
1551 handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
1552 if (IS_ERR(handle)) {
1553 ret = PTR_ERR(handle);
1554 mlog_errno(ret);
1555 goto out_unlock;
1556 }
1557
1558 ret = ocfs2_free_suballoc_bits(handle, xb_alloc_inode, xb_alloc_bh,
1559 bit, bg_blkno, 1);
1560 if (ret < 0)
1561 mlog_errno(ret);
1562
1563 ocfs2_commit_trans(osb, handle);
1564out_unlock:
1565 ocfs2_inode_unlock(xb_alloc_inode, 1);
1566 brelse(xb_alloc_bh);
1567out_mutex:
1568 mutex_unlock(&xb_alloc_inode->i_mutex);
1569 iput(xb_alloc_inode);
1570out:
1571 brelse(blk_bh);
1572 return ret;
1573}
1574
1575/*
1576 * ocfs2_xattr_remove()
1577 *
1578 * Free extended attribute resources associated with this inode.
1579 */
1580int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
1581{
1582 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1583 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1584 handle_t *handle;
1585 int ret;
1586
1587 if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
1588 return 0;
1589
1590 if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
1591 return 0;
1592
1593 if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
1594 ret = ocfs2_xattr_ibody_remove(inode, di_bh);
1595 if (ret < 0) {
1596 mlog_errno(ret);
1597 goto out;
1598 }
1599 }
1600
1601 if (di->i_xattr_loc) {
1602 ret = ocfs2_xattr_free_block(inode,
1603 le64_to_cpu(di->i_xattr_loc));
1604 if (ret < 0) {
1605 mlog_errno(ret);
1606 goto out;
1607 }
1608 }
1609
1610 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
1611 OCFS2_INODE_UPDATE_CREDITS);
1612 if (IS_ERR(handle)) {
1613 ret = PTR_ERR(handle);
1614 mlog_errno(ret);
1615 goto out;
1616 }
1617 ret = ocfs2_journal_access(handle, inode, di_bh,
1618 OCFS2_JOURNAL_ACCESS_WRITE);
1619 if (ret) {
1620 mlog_errno(ret);
1621 goto out_commit;
1622 }
1623
1624 di->i_xattr_loc = 0;
1625
1626 spin_lock(&oi->ip_lock);
1627 oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL);
1628 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
1629 spin_unlock(&oi->ip_lock);
1630
1631 ret = ocfs2_journal_dirty(handle, di_bh);
1632 if (ret < 0)
1633 mlog_errno(ret);
1634out_commit:
1635 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
1636out:
1637 return ret;
1638}
1639
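/*
 * Return 1 if s_xattr_inline_size bytes can still be carved out of the
 * inode's id2 area (inline data, fast symlink target or extent list)
 * for an inline xattr region, 0 otherwise.
 */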
1640static int ocfs2_xattr_has_space_inline(struct inode *inode,
1641 struct ocfs2_dinode *di)
1642{
1643 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1644 unsigned int xattrsize = OCFS2_SB(inode->i_sb)->s_xattr_inline_size;
1645 int free;
1646
1647 if (xattrsize < OCFS2_MIN_XATTR_INLINE_SIZE)
1648 return 0;
1649
1650 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1651 struct ocfs2_inline_data *idata = &di->id2.i_data;
1652 free = le16_to_cpu(idata->id_count) - le64_to_cpu(di->i_size);
1653 } else if (ocfs2_inode_is_fast_symlink(inode)) {
1654 free = ocfs2_fast_symlink_chars(inode->i_sb) -
1655 le64_to_cpu(di->i_size);
1656 } else {
1657 struct ocfs2_extent_list *el = &di->id2.i_list;
1658 free = (le16_to_cpu(el->l_count) -
1659 le16_to_cpu(el->l_next_free_rec)) *
1660 sizeof(struct ocfs2_extent_rec);
1661 }
1662 if (free >= xattrsize)
1663 return 1;
1664
1665 return 0;
1666}
1667
1668/*
1669 * ocfs2_xattr_ibody_find()
1670 *
1671 * Find extended attribute in inode block and
1672 * fill search info into struct ocfs2_xattr_search.
1673 */
1674static int ocfs2_xattr_ibody_find(struct inode *inode,
1675 int name_index,
1676 const char *name,
1677 struct ocfs2_xattr_search *xs)
1678{
1679 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1680 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
1681 int ret;
1682 int has_space = 0;
1683
1684 if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
1685 return 0;
1686
1687 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
1688 down_read(&oi->ip_alloc_sem);
1689 has_space = ocfs2_xattr_has_space_inline(inode, di);
1690 up_read(&oi->ip_alloc_sem);
1691 if (!has_space)
1692 return 0;
1693 }
1694
1695 xs->xattr_bh = xs->inode_bh;
1696 xs->end = (void *)di + inode->i_sb->s_blocksize;
1697 if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)
1698 xs->header = (struct ocfs2_xattr_header *)
1699 (xs->end - le16_to_cpu(di->i_xattr_inline_size));
1700 else
1701 xs->header = (struct ocfs2_xattr_header *)
1702 (xs->end - OCFS2_SB(inode->i_sb)->s_xattr_inline_size);
1703 xs->base = (void *)xs->header;
1704 xs->here = xs->header->xh_entries;
1705
1706 /* Find the named attribute. */
1707 if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
1708 ret = ocfs2_xattr_find_entry(name_index, name, xs);
1709 if (ret && ret != -ENODATA)
1710 return ret;
1711 xs->not_found = ret;
1712 }
1713
1714 return 0;
1715}
1716
1717/*
1718 * ocfs2_xattr_ibody_set()
1719 *
1720 * Set, replace or remove an extended attribute in the inode block.
1721 *
1722 */
1723static int ocfs2_xattr_ibody_set(struct inode *inode,
1724 struct ocfs2_xattr_info *xi,
1725 struct ocfs2_xattr_search *xs)
1726{
1727 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1728 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
1729 int ret;
1730
1731 if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
1732 return -ENOSPC;
1733
1734 down_write(&oi->ip_alloc_sem);
1735 if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
1736 if (!ocfs2_xattr_has_space_inline(inode, di)) {
1737 ret = -ENOSPC;
1738 goto out;
1739 }
1740 }
1741
1742 ret = ocfs2_xattr_set_entry(inode, xi, xs,
1743 (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL));
1744out:
1745 up_write(&oi->ip_alloc_sem);
1746
1747 return ret;
1748}
1749
1750/*
1751 * ocfs2_xattr_block_find()
1752 *
1753 * Find extended attribute in external block and
1754 * fill search info into struct ocfs2_xattr_search.
1755 */
1756static int ocfs2_xattr_block_find(struct inode *inode,
1757 int name_index,
1758 const char *name,
1759 struct ocfs2_xattr_search *xs)
1760{
1761 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
1762 struct buffer_head *blk_bh = NULL;
1763 struct ocfs2_xattr_block *xb;
1764 int ret = 0;
1765
1766 if (!di->i_xattr_loc)
1767 return ret;
1768
1769 ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
1770 if (ret < 0) {
1771 mlog_errno(ret);
1772 return ret;
1773 }
1774	/* Verify the signature of the xattr block */
1775 if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
1776 strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
1777 ret = -EFAULT;
1778 goto cleanup;
1779 }
1780
1781 xs->xattr_bh = blk_bh;
1782 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1783
1784 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
1785 xs->header = &xb->xb_attrs.xb_header;
1786 xs->base = (void *)xs->header;
1787 xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size;
1788 xs->here = xs->header->xh_entries;
1789
1790 ret = ocfs2_xattr_find_entry(name_index, name, xs);
1791 } else
1792 ret = ocfs2_xattr_index_block_find(inode, blk_bh,
1793 name_index,
1794 name, xs);
1795
1796 if (ret && ret != -ENODATA) {
1797 xs->xattr_bh = NULL;
1798 goto cleanup;
1799 }
1800 xs->not_found = ret;
1801 return 0;
1802cleanup:
1803 brelse(blk_bh);
1804
1805 return ret;
1806}
1807
1808/*
1809 * When all the xattrs have been deleted from the index btree, the
1810 * ocfs2_xattr_tree will be erased and the ocfs2_xattr_block will have
1811 * its ocfs2_xattr_header re-initialized.
1812 */
1813static int ocfs2_restore_xattr_block(struct inode *inode,
1814 struct ocfs2_xattr_search *xs)
1815{
1816 int ret;
1817 handle_t *handle;
1818 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1819 struct ocfs2_xattr_block *xb =
1820 (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
1821 struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
1822 u16 xb_flags = le16_to_cpu(xb->xb_flags);
1823
1824 BUG_ON(!(xb_flags & OCFS2_XATTR_INDEXED) ||
1825 le16_to_cpu(el->l_next_free_rec) != 0);
1826
1827 handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
1828 if (IS_ERR(handle)) {
1829 ret = PTR_ERR(handle);
1830		mlog_errno(ret);
1831 goto out;
1832 }
1833
1834 ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
1835 OCFS2_JOURNAL_ACCESS_WRITE);
1836 if (ret < 0) {
1837 mlog_errno(ret);
1838 goto out_commit;
1839 }
1840
1841 memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
1842 offsetof(struct ocfs2_xattr_block, xb_attrs));
1843
1844 xb->xb_flags = cpu_to_le16(xb_flags & ~OCFS2_XATTR_INDEXED);
1845
1846 ocfs2_journal_dirty(handle, xs->xattr_bh);
1847
1848out_commit:
1849 ocfs2_commit_trans(osb, handle);
1850out:
1851 return ret;
1852}
1853
1854/*
1855 * ocfs2_xattr_block_set()
1856 *
1857 * Set, replace or remove an extended attribute into external block.
1858 *
1859 */
1860static int ocfs2_xattr_block_set(struct inode *inode,
1861 struct ocfs2_xattr_info *xi,
1862 struct ocfs2_xattr_search *xs)
1863{
1864 struct buffer_head *new_bh = NULL;
1865 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1866 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
1867 struct ocfs2_alloc_context *meta_ac = NULL;
1868 handle_t *handle = NULL;
1869 struct ocfs2_xattr_block *xblk = NULL;
1870 u16 suballoc_bit_start;
1871 u32 num_got;
1872 u64 first_blkno;
1873 int ret;
1874
1875 if (!xs->xattr_bh) {
1876 /*
1877		 * Allocate one external block for extended attributes
1878		 * outside of the inode.
1879 */
1880 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
1881 if (ret < 0) {
1882 mlog_errno(ret);
1883 goto out;
1884 }
1885 handle = ocfs2_start_trans(osb,
1886 OCFS2_XATTR_BLOCK_CREATE_CREDITS);
1887 if (IS_ERR(handle)) {
1888 ret = PTR_ERR(handle);
1889 mlog_errno(ret);
1890 goto out;
1891 }
1892 ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
1893 OCFS2_JOURNAL_ACCESS_CREATE);
1894 if (ret < 0) {
1895 mlog_errno(ret);
1896 goto out_commit;
1897 }
1898
1899 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
1900 &suballoc_bit_start, &num_got,
1901 &first_blkno);
1902 if (ret < 0) {
1903 mlog_errno(ret);
1904 goto out_commit;
1905 }
1906
1907		new_bh = sb_getblk(inode->i_sb, first_blkno);
		/* sb_getblk() may fail; bail out like the other allocation paths */
		if (!new_bh) {
			ret = -EIO;
			mlog_errno(ret);
			goto out_commit;
		}
1908		ocfs2_set_new_buffer_uptodate(inode, new_bh);
1909
1910 ret = ocfs2_journal_access(handle, inode, new_bh,
1911 OCFS2_JOURNAL_ACCESS_CREATE);
1912 if (ret < 0) {
1913 mlog_errno(ret);
1914 goto out_commit;
1915 }
1916
1917 /* Initialize ocfs2_xattr_block */
1918 xs->xattr_bh = new_bh;
1919 xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
1920 memset(xblk, 0, inode->i_sb->s_blocksize);
1921 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
1922 xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num);
1923 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1924 xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
1925 xblk->xb_blkno = cpu_to_le64(first_blkno);
1926
1927 xs->header = &xblk->xb_attrs.xb_header;
1928 xs->base = (void *)xs->header;
1929 xs->end = (void *)xblk + inode->i_sb->s_blocksize;
1930 xs->here = xs->header->xh_entries;
1931
1932
1933 ret = ocfs2_journal_dirty(handle, new_bh);
1934 if (ret < 0) {
1935 mlog_errno(ret);
1936 goto out_commit;
1937 }
1938 di->i_xattr_loc = cpu_to_le64(first_blkno);
1939 ret = ocfs2_journal_dirty(handle, xs->inode_bh);
1940 if (ret < 0)
1941 mlog_errno(ret);
1942out_commit:
1943 ocfs2_commit_trans(osb, handle);
1944out:
1945 if (meta_ac)
1946 ocfs2_free_alloc_context(meta_ac);
1947 if (ret < 0)
1948 return ret;
1949 } else
1950 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
1951
1952 if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) {
1953 /* Set extended attribute into external block */
1954 ret = ocfs2_xattr_set_entry(inode, xi, xs, OCFS2_HAS_XATTR_FL);
1955 if (!ret || ret != -ENOSPC)
1956 goto end;
1957
1958 ret = ocfs2_xattr_create_index_block(inode, xs);
1959 if (ret)
1960 goto end;
1961 }
1962
1963 ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs);
1964 if (!ret && xblk->xb_attrs.xb_root.xt_list.l_next_free_rec == 0)
1965 ret = ocfs2_restore_xattr_block(inode, xs);
1966
1967end:
1968
1969 return ret;
1970}
1971
1972/*
1973 * ocfs2_xattr_set()
1974 *
1975 * Set, replace or remove an extended attribute for this inode.
1976 * Pass a NULL value to remove an existing extended attribute;
1977 * otherwise the attribute is created or replaced.
1978 */
1979int ocfs2_xattr_set(struct inode *inode,
1980 int name_index,
1981 const char *name,
1982 const void *value,
1983 size_t value_len,
1984 int flags)
1985{
1986 struct buffer_head *di_bh = NULL;
1987 struct ocfs2_dinode *di;
1988 int ret;
1989 u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
1990
1991 struct ocfs2_xattr_info xi = {
1992 .name_index = name_index,
1993 .name = name,
1994 .value = value,
1995 .value_len = value_len,
1996 };
1997
1998 struct ocfs2_xattr_search xis = {
1999 .not_found = -ENODATA,
2000 };
2001
2002 struct ocfs2_xattr_search xbs = {
2003 .not_found = -ENODATA,
2004 };
2005
2006 if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
2007 return -EOPNOTSUPP;
2008
2009 ret = ocfs2_inode_lock(inode, &di_bh, 1);
2010 if (ret < 0) {
2011 mlog_errno(ret);
2012 return ret;
2013 }
2014 xis.inode_bh = xbs.inode_bh = di_bh;
2015 di = (struct ocfs2_dinode *)di_bh->b_data;
2016
2017 down_write(&OCFS2_I(inode)->ip_xattr_sem);
2018 /*
2019	 * Scan the inode and the external block for an extended attribute
2020	 * with the same name, and collect the search information.
2021 */
2022 ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis);
2023 if (ret)
2024 goto cleanup;
2025 if (xis.not_found) {
2026 ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs);
2027 if (ret)
2028 goto cleanup;
2029 }
2030
2031 if (xis.not_found && xbs.not_found) {
2032 ret = -ENODATA;
2033 if (flags & XATTR_REPLACE)
2034 goto cleanup;
2035 ret = 0;
2036 if (!value)
2037 goto cleanup;
2038 } else {
2039 ret = -EEXIST;
2040 if (flags & XATTR_CREATE)
2041 goto cleanup;
2042 }
2043
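	/*
	 * Removal is issued against wherever the attribute currently lives.
	 * For a set, the inode body is always tried first; on -ENOSPC we
	 * fall back to the external block, and whichever old copy becomes
	 * redundant is then removed by re-issuing the set with a NULL value.
	 */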
2044 if (!value) {
2045 /* Remove existing extended attribute */
2046 if (!xis.not_found)
2047 ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
2048 else if (!xbs.not_found)
2049 ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
2050 } else {
2051		/* We always try to set the extended attribute in the inode first */
2052 ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
2053 if (!ret && !xbs.not_found) {
2054 /*
2055			 * If that succeeded and the extended attribute also
2056			 * exists in the external block, remove it from there.
2057 */
2058 xi.value = NULL;
2059 xi.value_len = 0;
2060 ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
2061 } else if (ret == -ENOSPC) {
2062 if (di->i_xattr_loc && !xbs.xattr_bh) {
2063 ret = ocfs2_xattr_block_find(inode, name_index,
2064 name, &xbs);
2065 if (ret)
2066 goto cleanup;
2067 }
2068 /*
2069			 * If there is no space in the inode, we set the
2070			 * extended attribute in the external block.
2071 */
2072 ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
2073 if (ret)
2074 goto cleanup;
2075 if (!xis.not_found) {
2076 /*
2077				 * If that succeeded and the extended attribute
2078				 * also exists in the inode, remove it from there.
2079 */
2080 xi.value = NULL;
2081 xi.value_len = 0;
2082 ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
2083 }
2084 }
2085 }
2086cleanup:
2087 up_write(&OCFS2_I(inode)->ip_xattr_sem);
2088 ocfs2_inode_unlock(inode, 1);
2089 brelse(di_bh);
2090 brelse(xbs.xattr_bh);
2091 for (i = 0; i < blk_per_bucket; i++)
2092 brelse(xbs.bucket.bhs[i]);
2093
2094 return ret;
2095}
2096
2097/*
2098 * Find the xattr extent rec which may contain name_hash.
2099 * e_cpos will be the first name hash of the xattr rec.
2100 * el must be the ocfs2_xattr_block.xb_attrs.xb_root.xt_list.
2101 */
2102static int ocfs2_xattr_get_rec(struct inode *inode,
2103 u32 name_hash,
2104 u64 *p_blkno,
2105 u32 *e_cpos,
2106 u32 *num_clusters,
2107 struct ocfs2_extent_list *el)
2108{
2109 int ret = 0, i;
2110 struct buffer_head *eb_bh = NULL;
2111 struct ocfs2_extent_block *eb;
2112 struct ocfs2_extent_rec *rec = NULL;
2113 u64 e_blkno = 0;
2114
2115 if (el->l_tree_depth) {
2116 ret = ocfs2_find_leaf(inode, el, name_hash, &eb_bh);
2117 if (ret) {
2118 mlog_errno(ret);
2119 goto out;
2120 }
2121
2122 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
2123 el = &eb->h_list;
2124
2125 if (el->l_tree_depth) {
2126 ocfs2_error(inode->i_sb,
2127 "Inode %lu has non zero tree depth in "
2128 "xattr tree block %llu\n", inode->i_ino,
2129 (unsigned long long)eb_bh->b_blocknr);
2130 ret = -EROFS;
2131 goto out;
2132 }
2133 }
2134
2135 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
2136 rec = &el->l_recs[i];
2137
2138 if (le32_to_cpu(rec->e_cpos) <= name_hash) {
2139 e_blkno = le64_to_cpu(rec->e_blkno);
2140 break;
2141 }
2142 }
2143
2144 if (!e_blkno) {
2145 ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
2146 "record (%u, %u, 0) in xattr", inode->i_ino,
2147 le32_to_cpu(rec->e_cpos),
2148 ocfs2_rec_clusters(el, rec));
2149 ret = -EROFS;
2150 goto out;
2151 }
2152
2153 *p_blkno = le64_to_cpu(rec->e_blkno);
2154 *num_clusters = le16_to_cpu(rec->e_leaf_clusters);
2155 if (e_cpos)
2156 *e_cpos = le32_to_cpu(rec->e_cpos);
2157out:
2158 brelse(eb_bh);
2159 return ret;
2160}
2161
2162typedef int (xattr_bucket_func)(struct inode *inode,
2163 struct ocfs2_xattr_bucket *bucket,
2164 void *para);
2165
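/*
 * Scan one bucket for the entry matching (name_index, name). Entries
 * are sorted by xe_name_hash, so we walk forward until the hashes
 * match and then compare the type, the name length and finally the
 * name bytes themselves, read from the block the name lives in.
 */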
2166static int ocfs2_find_xe_in_bucket(struct inode *inode,
2167 struct buffer_head *header_bh,
2168 int name_index,
2169 const char *name,
2170 u32 name_hash,
2171 u16 *xe_index,
2172 int *found)
2173{
2174 int i, ret = 0, cmp = 1, block_off, new_offset;
2175 struct ocfs2_xattr_header *xh =
2176 (struct ocfs2_xattr_header *)header_bh->b_data;
2177 size_t name_len = strlen(name);
2178 struct ocfs2_xattr_entry *xe = NULL;
2179 struct buffer_head *name_bh = NULL;
2180 char *xe_name;
2181
2182 /*
2183 * We don't use binary search in the bucket because there
2184 * may be multiple entries with the same name hash.
2185 */
2186 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
2187 xe = &xh->xh_entries[i];
2188
2189 if (name_hash > le32_to_cpu(xe->xe_name_hash))
2190 continue;
2191 else if (name_hash < le32_to_cpu(xe->xe_name_hash))
2192 break;
2193
2194 cmp = name_index - ocfs2_xattr_get_type(xe);
2195 if (!cmp)
2196 cmp = name_len - xe->xe_name_len;
2197 if (cmp)
2198 continue;
2199
2200 ret = ocfs2_xattr_bucket_get_name_value(inode,
2201 xh,
2202 i,
2203 &block_off,
2204 &new_offset);
2205 if (ret) {
2206 mlog_errno(ret);
2207 break;
2208 }
2209
2210 ret = ocfs2_read_block(inode, header_bh->b_blocknr + block_off,
2211 &name_bh);
2212 if (ret) {
2213 mlog_errno(ret);
2214 break;
2215 }
2216 xe_name = name_bh->b_data + new_offset;
2217
2218 cmp = memcmp(name, xe_name, name_len);
2219 brelse(name_bh);
2220 name_bh = NULL;
2221
2222 if (cmp == 0) {
2223 *xe_index = i;
2224 *found = 1;
2225 ret = 0;
2226 break;
2227 }
2228 }
2229
2230 return ret;
2231}
2232
2233/*
2234 * Find the specified xattr entry in a series of buckets.
2235 * The series starts at p_blkno and lasts for num_clusters.
2236 * The ocfs2_xattr_header.xh_num_buckets of the first bucket contains
2237 * the number of valid buckets.
2238 *
2239 * Return the buffer_head this xattr should reside in. If the xattr's
2240 * hash falls in the gap between two buckets, return the lower bucket.
2241 */
2242static int ocfs2_xattr_bucket_find(struct inode *inode,
2243 int name_index,
2244 const char *name,
2245 u32 name_hash,
2246 u64 p_blkno,
2247 u32 first_hash,
2248 u32 num_clusters,
2249 struct ocfs2_xattr_search *xs)
2250{
2251 int ret, found = 0;
2252 struct buffer_head *bh = NULL;
2253 struct buffer_head *lower_bh = NULL;
2254 struct ocfs2_xattr_header *xh = NULL;
2255 struct ocfs2_xattr_entry *xe = NULL;
2256 u16 index = 0;
2257 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2258 int low_bucket = 0, bucket, high_bucket;
2259 u32 last_hash;
2260 u64 blkno;
2261
2262 ret = ocfs2_read_block(inode, p_blkno, &bh);
2263 if (ret) {
2264 mlog_errno(ret);
2265 goto out;
2266 }
2267
2268 xh = (struct ocfs2_xattr_header *)bh->b_data;
2269 high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1;
2270
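	/*
	 * Binary search over the buckets, keyed by each bucket's first
	 * name_hash. lower_bh tracks the last bucket whose first hash is
	 * <= the one we want, so it doubles as the insertion point when
	 * no exact bucket is found.
	 */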
2271 while (low_bucket <= high_bucket) {
2272 brelse(bh);
2273 bh = NULL;
2274 bucket = (low_bucket + high_bucket) / 2;
2275
2276 blkno = p_blkno + bucket * blk_per_bucket;
2277
2278 ret = ocfs2_read_block(inode, blkno, &bh);
2279 if (ret) {
2280 mlog_errno(ret);
2281 goto out;
2282 }
2283
2284 xh = (struct ocfs2_xattr_header *)bh->b_data;
2285 xe = &xh->xh_entries[0];
2286 if (name_hash < le32_to_cpu(xe->xe_name_hash)) {
2287 high_bucket = bucket - 1;
2288 continue;
2289 }
2290
2291 /*
2292		 * Check whether the hash of the last entry in our
2293		 * bucket is larger than the one we are searching for.
2294		 * For an empty bucket, the last one is also the first one.
2295 */
2296 if (xh->xh_count)
2297 xe = &xh->xh_entries[le16_to_cpu(xh->xh_count) - 1];
2298
2299 last_hash = le32_to_cpu(xe->xe_name_hash);
2300
2301		/* Record lower_bh, which may be the insertion place. */
2302 brelse(lower_bh);
2303 lower_bh = bh;
2304 bh = NULL;
2305
2306 if (name_hash > le32_to_cpu(xe->xe_name_hash)) {
2307 low_bucket = bucket + 1;
2308 continue;
2309 }
2310
2311		/* The xattr we search for should reside in this bucket, if it exists. */
2312 ret = ocfs2_find_xe_in_bucket(inode, lower_bh,
2313 name_index, name, name_hash,
2314 &index, &found);
2315 if (ret) {
2316 mlog_errno(ret);
2317 goto out;
2318 }
2319 break;
2320 }
2321
2322 /*
2323 * Record the bucket we have found.
2324 * When the xattr's hash value is in the gap of 2 buckets, we will
2325 * always set it to the previous bucket.
2326 */
2327 if (!lower_bh) {
2328 /*
2329		 * We can't find any bucket whose first name_hash is less
2330		 * than the name_hash we are searching for.
2331 */
2332 BUG_ON(bh->b_blocknr != p_blkno);
2333 lower_bh = bh;
2334 bh = NULL;
2335 }
2336 xs->bucket.bhs[0] = lower_bh;
2337 xs->bucket.xh = (struct ocfs2_xattr_header *)
2338 xs->bucket.bhs[0]->b_data;
2339 lower_bh = NULL;
2340
2341 xs->header = xs->bucket.xh;
2342 xs->base = xs->bucket.bhs[0]->b_data;
2343 xs->end = xs->base + inode->i_sb->s_blocksize;
2344
2345 if (found) {
2346 /*
2347		 * If we have found the xattr entry, read all the blocks in
2348		 * this bucket.
2349 */
2350 ret = ocfs2_read_blocks(inode, xs->bucket.bhs[0]->b_blocknr + 1,
2351 blk_per_bucket - 1, &xs->bucket.bhs[1],
2352 OCFS2_BH_CACHED);
2353 if (ret) {
2354 mlog_errno(ret);
2355 goto out;
2356 }
2357
2358 xs->here = &xs->header->xh_entries[index];
2359 mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name,
2360 (unsigned long long)xs->bucket.bhs[0]->b_blocknr, index);
2361 } else
2362 ret = -ENODATA;
2363
2364out:
2365 brelse(bh);
2366 brelse(lower_bh);
2367 return ret;
2368}
2369
2370static int ocfs2_xattr_index_block_find(struct inode *inode,
2371 struct buffer_head *root_bh,
2372 int name_index,
2373 const char *name,
2374 struct ocfs2_xattr_search *xs)
2375{
2376 int ret;
2377 struct ocfs2_xattr_block *xb =
2378 (struct ocfs2_xattr_block *)root_bh->b_data;
2379 struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
2380 struct ocfs2_extent_list *el = &xb_root->xt_list;
2381 u64 p_blkno = 0;
2382 u32 first_hash, num_clusters = 0;
2383 u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name));
2384
2385 if (le16_to_cpu(el->l_next_free_rec) == 0)
2386 return -ENODATA;
2387
2388 mlog(0, "find xattr %s, hash = %u, index = %d in xattr tree\n",
2389 name, name_hash, name_index);
2390
2391 ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &first_hash,
2392 &num_clusters, el);
2393 if (ret) {
2394 mlog_errno(ret);
2395 goto out;
2396 }
2397
2398 BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash);
2399
2400 mlog(0, "find xattr extent rec %u clusters from %llu, the first hash "
2401 "in the rec is %u\n", num_clusters, p_blkno, first_hash);
2402
2403 ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash,
2404 p_blkno, first_hash, num_clusters, xs);
2405
2406out:
2407 return ret;
2408}
2409
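/*
 * Walk every bucket in the extent starting at @blkno, calling @func on
 * each one. Only the first bucket's header knows the true bucket
 * count, so the loop bound is corrected after bucket 0 is read.
 */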
2410static int ocfs2_iterate_xattr_buckets(struct inode *inode,
2411 u64 blkno,
2412 u32 clusters,
2413 xattr_bucket_func *func,
2414 void *para)
2415{
2416 int i, j, ret = 0;
2417 int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2418 u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
2419 u32 num_buckets = clusters * bpc;
2420 struct ocfs2_xattr_bucket bucket;
2421
2422 memset(&bucket, 0, sizeof(bucket));
2423
2424 mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n",
2425 clusters, blkno);
2426
2427 for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) {
2428 ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket,
2429 bucket.bhs, OCFS2_BH_CACHED);
2430 if (ret) {
2431 mlog_errno(ret);
2432 goto out;
2433 }
2434
2435 bucket.xh = (struct ocfs2_xattr_header *)bucket.bhs[0]->b_data;
2436 /*
2437		 * The real number of buckets in this series of blocks is
2438		 * stored in the 1st bucket.
2439 */
2440 if (i == 0)
2441 num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets);
2442
2443 mlog(0, "iterating xattr bucket %llu, first hash %u\n", blkno,
2444 le32_to_cpu(bucket.xh->xh_entries[0].xe_name_hash));
2445 if (func) {
2446 ret = func(inode, &bucket, para);
2447 if (ret) {
2448 mlog_errno(ret);
2449 break;
2450 }
2451 }
2452
2453 for (j = 0; j < blk_per_bucket; j++)
2454 brelse(bucket.bhs[j]);
2455 memset(&bucket, 0, sizeof(bucket));
2456 }
2457
2458out:
2459 for (j = 0; j < blk_per_bucket; j++)
2460 brelse(bucket.bhs[j]);
2461
2462 return ret;
2463}
2464
2465struct ocfs2_xattr_tree_list {
2466 char *buffer;
2467 size_t buffer_size;
2468 size_t result;
2469};
2470
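/*
 * Translate an entry's xe_name_offset, which is relative to the start
 * of the bucket, into a block index within the bucket plus the offset
 * inside that block.
 */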
2471static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
2472 struct ocfs2_xattr_header *xh,
2473 int index,
2474 int *block_off,
2475 int *new_offset)
2476{
2477 u16 name_offset;
2478
2479 if (index < 0 || index >= le16_to_cpu(xh->xh_count))
2480 return -EINVAL;
2481
2482 name_offset = le16_to_cpu(xh->xh_entries[index].xe_name_offset);
2483
2484 *block_off = name_offset >> inode->i_sb->s_blocksize_bits;
2485 *new_offset = name_offset % inode->i_sb->s_blocksize;
2486
2487 return 0;
2488}
2489
2490static int ocfs2_list_xattr_bucket(struct inode *inode,
2491 struct ocfs2_xattr_bucket *bucket,
2492 void *para)
2493{
2494 int ret = 0, type;
2495 struct ocfs2_xattr_tree_list *xl = (struct ocfs2_xattr_tree_list *)para;
2496 int i, block_off, new_offset;
2497 const char *prefix, *name;
2498
2499 for (i = 0 ; i < le16_to_cpu(bucket->xh->xh_count); i++) {
2500 struct ocfs2_xattr_entry *entry = &bucket->xh->xh_entries[i];
2501 type = ocfs2_xattr_get_type(entry);
2502 prefix = ocfs2_xattr_prefix(type);
2503
2504 if (prefix) {
2505 ret = ocfs2_xattr_bucket_get_name_value(inode,
2506 bucket->xh,
2507 i,
2508 &block_off,
2509 &new_offset);
2510 if (ret)
2511 break;
2512
2513 name = (const char *)bucket->bhs[block_off]->b_data +
2514 new_offset;
2515 ret = ocfs2_xattr_list_entry(xl->buffer,
2516 xl->buffer_size,
2517 &xl->result,
2518 prefix, name,
2519 entry->xe_name_len);
2520 if (ret)
2521 break;
2522 }
2523 }
2524
2525 return ret;
2526}
2527
2528static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
2529 struct ocfs2_xattr_tree_root *xt,
2530 char *buffer,
2531 size_t buffer_size)
2532{
2533 struct ocfs2_extent_list *el = &xt->xt_list;
2534 int ret = 0;
2535 u32 name_hash = UINT_MAX, e_cpos = 0, num_clusters = 0;
2536 u64 p_blkno = 0;
2537 struct ocfs2_xattr_tree_list xl = {
2538 .buffer = buffer,
2539 .buffer_size = buffer_size,
2540 .result = 0,
2541 };
2542
2543 if (le16_to_cpu(el->l_next_free_rec) == 0)
2544 return 0;
2545
2546 while (name_hash > 0) {
2547 ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno,
2548 &e_cpos, &num_clusters, el);
2549 if (ret) {
2550 mlog_errno(ret);
2551 goto out;
2552 }
2553
2554 ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters,
2555 ocfs2_list_xattr_bucket,
2556 &xl);
2557 if (ret) {
2558 mlog_errno(ret);
2559 goto out;
2560 }
2561
2562 if (e_cpos == 0)
2563 break;
2564
2565 name_hash = e_cpos - 1;
2566 }
2567
2568 ret = xl.result;
2569out:
2570 return ret;
2571}
2572
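/*
 * sort() callbacks: order xattr entries by ascending name hash.
 * swap_xe() copies whole entries, since they are fixed-size structs.
 */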
2573static int cmp_xe(const void *a, const void *b)
2574{
2575 const struct ocfs2_xattr_entry *l = a, *r = b;
2576 u32 l_hash = le32_to_cpu(l->xe_name_hash);
2577 u32 r_hash = le32_to_cpu(r->xe_name_hash);
2578
2579 if (l_hash > r_hash)
2580 return 1;
2581 if (l_hash < r_hash)
2582 return -1;
2583 return 0;
2584}
2585
2586static void swap_xe(void *a, void *b, int size)
2587{
2588 struct ocfs2_xattr_entry *l = a, *r = b, tmp;
2589
2590 tmp = *l;
2591 memcpy(l, r, sizeof(struct ocfs2_xattr_entry));
2592 memcpy(r, &tmp, sizeof(struct ocfs2_xattr_entry));
2593}
2594
2595/*
2596 * When the ocfs2_xattr_block is filled up, a new bucket will be created
2597 * and all the xattr entries will be moved to the new bucket.
2598 * Note: we need to sort the entries since they are not saved in order
2599 * in the ocfs2_xattr_block.
2600 */
2601static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
2602 struct buffer_head *xb_bh,
2603 struct buffer_head *xh_bh,
2604 struct buffer_head *data_bh)
2605{
2606 int i, blocksize = inode->i_sb->s_blocksize;
2607 u16 offset, size, off_change;
2608 struct ocfs2_xattr_entry *xe;
2609 struct ocfs2_xattr_block *xb =
2610 (struct ocfs2_xattr_block *)xb_bh->b_data;
2611 struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header;
2612 struct ocfs2_xattr_header *xh =
2613 (struct ocfs2_xattr_header *)xh_bh->b_data;
2614 u16 count = le16_to_cpu(xb_xh->xh_count);
2615 char *target = xh_bh->b_data, *src = xb_bh->b_data;
2616
2617 mlog(0, "cp xattr from block %llu to bucket %llu\n",
2618 (unsigned long long)xb_bh->b_blocknr,
2619 (unsigned long long)xh_bh->b_blocknr);
2620
2621 memset(xh_bh->b_data, 0, blocksize);
2622 if (data_bh)
2623 memset(data_bh->b_data, 0, blocksize);
2624 /*
2625 * Since the xe_name_offset is based on ocfs2_xattr_header,
2626	 * there is an offset change corresponding to the change of
2627 * ocfs2_xattr_header's position.
2628 */
2629 off_change = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
2630 xe = &xb_xh->xh_entries[count - 1];
2631 offset = le16_to_cpu(xe->xe_name_offset) + off_change;
2632 size = blocksize - offset;
2633
2634 /* copy all the names and values. */
2635 if (data_bh)
2636 target = data_bh->b_data;
2637 memcpy(target + offset, src + offset, size);
2638
2639 /* Init new header now. */
2640 xh->xh_count = xb_xh->xh_count;
2641 xh->xh_num_buckets = cpu_to_le16(1);
2642 xh->xh_name_value_len = cpu_to_le16(size);
2643 xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size);
2644
2645 /* copy all the entries. */
2646 target = xh_bh->b_data;
2647 offset = offsetof(struct ocfs2_xattr_header, xh_entries);
2648 size = count * sizeof(struct ocfs2_xattr_entry);
2649 memcpy(target + offset, (char *)xb_xh + offset, size);
2650
2651 /* Change the xe offset for all the xe because of the move. */
2652 off_change = OCFS2_XATTR_BUCKET_SIZE - blocksize +
2653 offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
2654 for (i = 0; i < count; i++)
2655 le16_add_cpu(&xh->xh_entries[i].xe_name_offset, off_change);
2656
2657 mlog(0, "copy entry: start = %u, size = %u, offset_change = %u\n",
2658 offset, size, off_change);
2659
2660 sort(target + offset, count, sizeof(struct ocfs2_xattr_entry),
2661 cmp_xe, swap_xe);
2662}
2663
2664/*
2665 * After we move xattrs from the block to the index btree, we have to
2666 * update ocfs2_xattr_search to the new xe and base.
2667 *
2668 * When the entry is in the xattr block, xattr_bh indicates the storage
2669 * place; when the entry is in the index b-tree, "bucket" indicates the
2670 * real place of the xattr.
2671 */
2672static int ocfs2_xattr_update_xattr_search(struct inode *inode,
2673 struct ocfs2_xattr_search *xs,
2674 struct buffer_head *old_bh,
2675 struct buffer_head *new_bh)
2676{
2677 int ret = 0;
2678 char *buf = old_bh->b_data;
2679 struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf;
2680 struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header;
2681 int i, blocksize = inode->i_sb->s_blocksize;
2682 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2683
2684 xs->bucket.bhs[0] = new_bh;
2685 get_bh(new_bh);
2686 xs->bucket.xh = (struct ocfs2_xattr_header *)xs->bucket.bhs[0]->b_data;
2687 xs->header = xs->bucket.xh;
2688
2689 xs->base = new_bh->b_data;
2690 xs->end = xs->base + inode->i_sb->s_blocksize;
2691
2692 if (!xs->not_found) {
2693 if (OCFS2_XATTR_BUCKET_SIZE != blocksize) {
2694 ret = ocfs2_read_blocks(inode,
2695 xs->bucket.bhs[0]->b_blocknr + 1,
2696 blk_per_bucket - 1, &xs->bucket.bhs[1],
2697 OCFS2_BH_CACHED);
2698 if (ret) {
2699 mlog_errno(ret);
2700 return ret;
2701 }
2702
2703 i = xs->here - old_xh->xh_entries;
2704 xs->here = &xs->header->xh_entries[i];
2705 }
2706 }
2707
2708 return ret;
2709}
2710
2711static int ocfs2_xattr_create_index_block(struct inode *inode,
2712 struct ocfs2_xattr_search *xs)
2713{
2714 int ret, credits = OCFS2_SUBALLOC_ALLOC;
2715 u32 bit_off, len;
2716 u64 blkno;
2717 handle_t *handle;
2718 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2719 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2720 struct ocfs2_alloc_context *data_ac;
2721 struct buffer_head *xh_bh = NULL, *data_bh = NULL;
2722 struct buffer_head *xb_bh = xs->xattr_bh;
2723 struct ocfs2_xattr_block *xb =
2724 (struct ocfs2_xattr_block *)xb_bh->b_data;
2725 struct ocfs2_xattr_tree_root *xr;
2726 u16 xb_flags = le16_to_cpu(xb->xb_flags);
2727 u16 bpb = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2728
2729 mlog(0, "create xattr index block for %llu\n",
2730 (unsigned long long)xb_bh->b_blocknr);
2731
2732 BUG_ON(xb_flags & OCFS2_XATTR_INDEXED);
2733
2734 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
2735 if (ret) {
2736 mlog_errno(ret);
2737 goto out;
2738 }
2739
2740 /*
2741 * XXX:
2742 * We can use this lock for now, and maybe move to a dedicated mutex
2743 * if performance becomes a problem later.
2744 */
2745 down_write(&oi->ip_alloc_sem);
2746
2747 /*
2748 * 3 more credits, one for xattr block update, one for the 1st block
2749 * of the new xattr bucket and one for the value/data.
2750 */
2751 credits += 3;
2752 handle = ocfs2_start_trans(osb, credits);
2753 if (IS_ERR(handle)) {
2754 ret = PTR_ERR(handle);
2755 mlog_errno(ret);
2756 goto out_sem;
2757 }
2758
2759 ret = ocfs2_journal_access(handle, inode, xb_bh,
2760 OCFS2_JOURNAL_ACCESS_WRITE);
2761 if (ret) {
2762 mlog_errno(ret);
2763 goto out_commit;
2764 }
2765
2766 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len);
2767 if (ret) {
2768 mlog_errno(ret);
2769 goto out_commit;
2770 }
2771
2772 /*
2773	 * The bucket may span many blocks, and we will only touch
2774	 * the 1st block and the last block of the whole bucket
2775	 * (one for the entries and one for the name/value data).
2776 */
2777 blkno = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
2778
2779 mlog(0, "allocate 1 cluster from %llu to xattr block\n", blkno);
2780
2781 xh_bh = sb_getblk(inode->i_sb, blkno);
2782 if (!xh_bh) {
2783 ret = -EIO;
2784 mlog_errno(ret);
2785 goto out_commit;
2786 }
2787
2788 ocfs2_set_new_buffer_uptodate(inode, xh_bh);
2789
2790 ret = ocfs2_journal_access(handle, inode, xh_bh,
2791 OCFS2_JOURNAL_ACCESS_CREATE);
2792 if (ret) {
2793 mlog_errno(ret);
2794 goto out_commit;
2795 }
2796
2797 if (bpb > 1) {
2798 data_bh = sb_getblk(inode->i_sb, blkno + bpb - 1);
2799 if (!data_bh) {
2800 ret = -EIO;
2801 mlog_errno(ret);
2802 goto out_commit;
2803 }
2804
2805 ocfs2_set_new_buffer_uptodate(inode, data_bh);
2806
2807 ret = ocfs2_journal_access(handle, inode, data_bh,
2808 OCFS2_JOURNAL_ACCESS_CREATE);
2809 if (ret) {
2810 mlog_errno(ret);
2811 goto out_commit;
2812 }
2813 }
2814
2815 ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xh_bh, data_bh);
2816
2817 ocfs2_journal_dirty(handle, xh_bh);
2818 if (data_bh)
2819 ocfs2_journal_dirty(handle, data_bh);
2820
2821 ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh);
2822
2823 /* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */
2824 memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
2825 offsetof(struct ocfs2_xattr_block, xb_attrs));
2826
2827 xr = &xb->xb_attrs.xb_root;
2828 xr->xt_clusters = cpu_to_le32(1);
2829 xr->xt_last_eb_blk = 0;
2830 xr->xt_list.l_tree_depth = 0;
2831 xr->xt_list.l_count = cpu_to_le16(ocfs2_xattr_recs_per_xb(inode->i_sb));
2832 xr->xt_list.l_next_free_rec = cpu_to_le16(1);
2833
2834 xr->xt_list.l_recs[0].e_cpos = 0;
2835 xr->xt_list.l_recs[0].e_blkno = cpu_to_le64(blkno);
2836 xr->xt_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1);
2837
2838 xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED);
2839
2840 ret = ocfs2_journal_dirty(handle, xb_bh);
2841 if (ret) {
2842 mlog_errno(ret);
2843 goto out_commit;
2844 }
2845
2846out_commit:
2847 ocfs2_commit_trans(osb, handle);
2848
2849out_sem:
2850 up_write(&oi->ip_alloc_sem);
2851
2852out:
2853 if (data_ac)
2854 ocfs2_free_alloc_context(data_ac);
2855
2856 brelse(xh_bh);
2857 brelse(data_bh);
2858
2859 return ret;
2860}
2861
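/*
 * sort() callback ordering entries by descending name offset, so that
 * during defragmentation the pair closest to the end of the bucket is
 * moved first.
 */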
2862static int cmp_xe_offset(const void *a, const void *b)
2863{
2864 const struct ocfs2_xattr_entry *l = a, *r = b;
2865 u32 l_name_offset = le16_to_cpu(l->xe_name_offset);
2866 u32 r_name_offset = le16_to_cpu(r->xe_name_offset);
2867
2868 if (l_name_offset < r_name_offset)
2869 return 1;
2870 if (l_name_offset > r_name_offset)
2871 return -1;
2872 return 0;
2873}
2874
2875/*
2876 * Defragment an xattr bucket if we find that the bucket has some
2877 * holes between name/value pairs.
2878 * We will move all the name/value pairs to the end of the bucket
2879 * so that we can free up some space for insertion.
2880 */
2881static int ocfs2_defrag_xattr_bucket(struct inode *inode,
2882 struct ocfs2_xattr_bucket *bucket)
2883{
2884 int ret, i;
2885 size_t end, offset, len, value_len;
2886 struct ocfs2_xattr_header *xh;
2887 char *entries, *buf, *bucket_buf = NULL;
2888 u64 blkno = bucket->bhs[0]->b_blocknr;
2889 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2890 u16 xh_free_start;
2891 size_t blocksize = inode->i_sb->s_blocksize;
2892 handle_t *handle;
2893 struct buffer_head **bhs;
2894 struct ocfs2_xattr_entry *xe;
2895
2896 bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
2897 GFP_NOFS);
2898 if (!bhs)
2899 return -ENOMEM;
2900
2901 ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, bhs,
2902 OCFS2_BH_CACHED);
2903 if (ret)
2904 goto out;
2905
2906 /*
2907	 * In order to make the operation more efficient and generic,
2908	 * we copy all the blocks into one contiguous buffer and do the
2909	 * defragmentation there, so that if anything goes wrong we will
2910	 * not have touched the real blocks.
2911 */
2912 bucket_buf = kmalloc(OCFS2_XATTR_BUCKET_SIZE, GFP_NOFS);
2913 if (!bucket_buf) {
2914		ret = -ENOMEM;
2915 goto out;
2916 }
2917
2918 buf = bucket_buf;
2919 for (i = 0; i < blk_per_bucket; i++, buf += blocksize)
2920 memcpy(buf, bhs[i]->b_data, blocksize);
2921
2922 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), blk_per_bucket);
2923 if (IS_ERR(handle)) {
2924 ret = PTR_ERR(handle);
2925 handle = NULL;
2926 mlog_errno(ret);
2927 goto out;
2928 }
2929
2930 for (i = 0; i < blk_per_bucket; i++) {
2931 ret = ocfs2_journal_access(handle, inode, bhs[i],
2932 OCFS2_JOURNAL_ACCESS_WRITE);
2933 if (ret < 0) {
2934 mlog_errno(ret);
2935 goto commit;
2936 }
2937 }
2938
2939 xh = (struct ocfs2_xattr_header *)bucket_buf;
2940 entries = (char *)xh->xh_entries;
2941 xh_free_start = le16_to_cpu(xh->xh_free_start);
2942
2943 mlog(0, "adjust xattr bucket in %llu, count = %u, "
2944 "xh_free_start = %u, xh_name_value_len = %u.\n",
2945 blkno, le16_to_cpu(xh->xh_count), xh_free_start,
2946 le16_to_cpu(xh->xh_name_value_len));
2947
2948 /*
2949	 * Sort all the entries by their offset: the largest offset
2950	 * comes first, so that we can move the pairs to the end
2951	 * one by one.
2952 */
2953 sort(entries, le16_to_cpu(xh->xh_count),
2954 sizeof(struct ocfs2_xattr_entry),
2955 cmp_xe_offset, swap_xe);
2956
2957 /* Move all name/values to the end of the bucket. */
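	/*
	 * E.g. (illustrative): pairs at scattered offsets with holes
	 * between them end up packed back to back against
	 * OCFS2_XATTR_BUCKET_SIZE, leaving one contiguous free region
	 * starting at the new xh_free_start.
	 */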
2958 xe = xh->xh_entries;
2959 end = OCFS2_XATTR_BUCKET_SIZE;
2960 for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) {
2961 offset = le16_to_cpu(xe->xe_name_offset);
2962 if (ocfs2_xattr_is_local(xe))
2963 value_len = OCFS2_XATTR_SIZE(
2964 le64_to_cpu(xe->xe_value_size));
2965 else
2966 value_len = OCFS2_XATTR_ROOT_SIZE;
2967 len = OCFS2_XATTR_SIZE(xe->xe_name_len) + value_len;
2968
2969 /*
2970		 * We must make sure that a name/value pair stays
2971		 * within a single block, so adjust end down to the
2972		 * previous block boundary if needed.
2973 */
2974 if (((end - len) / blocksize !=
2975 (end - 1) / blocksize))
2976 end = end - end % blocksize;
2977
2978 if (end > offset + len) {
2979 memmove(bucket_buf + end - len,
2980 bucket_buf + offset, len);
2981 xe->xe_name_offset = cpu_to_le16(end - len);
2982 }
2983
2984 mlog_bug_on_msg(end < offset + len, "Defrag check failed for "
2985 "bucket %llu\n", (unsigned long long)blkno);
2986
2987 end -= len;
2988 }
2989
2990 mlog_bug_on_msg(xh_free_start > end, "Defrag check failed for "
2991 "bucket %llu\n", (unsigned long long)blkno);
2992
2993 if (xh_free_start == end)
2994 goto commit;
2995
2996 memset(bucket_buf + xh_free_start, 0, end - xh_free_start);
2997 xh->xh_free_start = cpu_to_le16(end);
2998
2999	/* Sort the entries back by their name_hash. */
3000 sort(entries, le16_to_cpu(xh->xh_count),
3001 sizeof(struct ocfs2_xattr_entry),
3002 cmp_xe, swap_xe);
3003
3004 buf = bucket_buf;
3005 for (i = 0; i < blk_per_bucket; i++, buf += blocksize) {
3006 memcpy(bhs[i]->b_data, buf, blocksize);
3007 ocfs2_journal_dirty(handle, bhs[i]);
3008 }
3009
3010commit:
3011 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
3012out:
3013
3014 if (bhs) {
3015 for (i = 0; i < blk_per_bucket; i++)
3016 brelse(bhs[i]);
3017 }
3018 kfree(bhs);
3019
3020 kfree(bucket_buf);
3021 return ret;
3022}
3023
3024/*
3025 * Move half of the xattr buckets in the previous cluster to this new
3026 * cluster. We only touch the last cluster of the previous extent record.
3027 *
3028 * first_bh is the first buffer_head of a series of buckets in the same
3029 * extent rec and header_bh is the header of one bucket in this cluster.
3030 * They will be updated if we move the data header_bh contains to the new
3031 * cluster. first_hash will be set to the 1st xe's name_hash of the new cluster.
3032 */
3033static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
3034 handle_t *handle,
3035 struct buffer_head **first_bh,
3036 struct buffer_head **header_bh,
3037 u64 new_blkno,
3038 u64 prev_blkno,
3039 u32 num_clusters,
3040 u32 *first_hash)
3041{
3042 int i, ret, credits;
3043 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3044 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
3045 int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
3046 int blocksize = inode->i_sb->s_blocksize;
3047 struct buffer_head *old_bh, *new_bh, *prev_bh, *new_first_bh = NULL;
3048 struct ocfs2_xattr_header *new_xh;
3049 struct ocfs2_xattr_header *xh =
3050 (struct ocfs2_xattr_header *)((*first_bh)->b_data);
3051
3052 BUG_ON(le16_to_cpu(xh->xh_num_buckets) < num_buckets);
3053 BUG_ON(OCFS2_XATTR_BUCKET_SIZE == osb->s_clustersize);
3054
3055 prev_bh = *first_bh;
3056 get_bh(prev_bh);
3057 xh = (struct ocfs2_xattr_header *)prev_bh->b_data;
3058
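	/* Step to the midpoint of the last cluster of the old extent. */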
3059 prev_blkno += (num_clusters - 1) * bpc + bpc / 2;
3060
3061 mlog(0, "move half of xattrs in cluster %llu to %llu\n",
3062 prev_blkno, new_blkno);
3063
3064 /*
3065 * We need to update the 1st half of the new cluster and
3066 * 1 more for the update of the 1st bucket of the previous
3067 * extent record.
3068 */
3069 credits = bpc / 2 + 1;
3070 ret = ocfs2_extend_trans(handle, credits);
3071 if (ret) {
3072 mlog_errno(ret);
3073 goto out;
3074 }
3075
3076 ret = ocfs2_journal_access(handle, inode, prev_bh,
3077 OCFS2_JOURNAL_ACCESS_WRITE);
3078 if (ret) {
3079 mlog_errno(ret);
3080 goto out;
3081 }
3082
3083 for (i = 0; i < bpc / 2; i++, prev_blkno++, new_blkno++) {
3084 old_bh = new_bh = NULL;
3085 new_bh = sb_getblk(inode->i_sb, new_blkno);
3086 if (!new_bh) {
3087 ret = -EIO;
3088 mlog_errno(ret);
3089 goto out;
3090 }
3091
3092 ocfs2_set_new_buffer_uptodate(inode, new_bh);
3093
3094 ret = ocfs2_journal_access(handle, inode, new_bh,
3095 OCFS2_JOURNAL_ACCESS_CREATE);
3096 if (ret < 0) {
3097 mlog_errno(ret);
3098 brelse(new_bh);
3099 goto out;
3100 }
3101
3102 ret = ocfs2_read_block(inode, prev_blkno, &old_bh);
3103 if (ret < 0) {
3104 mlog_errno(ret);
3105 brelse(new_bh);
3106 goto out;
3107 }
3108
3109 memcpy(new_bh->b_data, old_bh->b_data, blocksize);
3110
3111 if (i == 0) {
3112 new_xh = (struct ocfs2_xattr_header *)new_bh->b_data;
3113 new_xh->xh_num_buckets = cpu_to_le16(num_buckets / 2);
3114
3115 if (first_hash)
3116 *first_hash = le32_to_cpu(
3117 new_xh->xh_entries[0].xe_name_hash);
3118 new_first_bh = new_bh;
3119 get_bh(new_first_bh);
3120 }
3121
3122 ocfs2_journal_dirty(handle, new_bh);
3123
3124 if (*header_bh == old_bh) {
3125 brelse(*header_bh);
3126 *header_bh = new_bh;
3127 get_bh(*header_bh);
3128
3129 brelse(*first_bh);
3130 *first_bh = new_first_bh;
3131 get_bh(*first_bh);
3132 }
3133 brelse(new_bh);
3134 brelse(old_bh);
3135 }
3136
3137 le16_add_cpu(&xh->xh_num_buckets, -(num_buckets / 2));
3138
3139 ocfs2_journal_dirty(handle, prev_bh);
3140out:
3141 brelse(prev_bh);
3142 brelse(new_first_bh);
3143 return ret;
3144}
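
/*
 * Editor's note: an illustrative user-space sketch (not kernel code) of the
 * split arithmetic used above: prev_blkno is advanced past num_clusters - 1
 * whole clusters and then half a cluster, so the copy loop starts at the
 * second half of the last cluster. The names and sizes are assumptions for
 * the example (4K blocks, 32K clusters, so bpc = 8).
 */
#include <stdio.h>

static unsigned long long xattr_split_point(unsigned long long prev_blkno,
					    unsigned int num_clusters,
					    unsigned int bpc)
{
	/* Skip to the last cluster, then to its second half. */
	return prev_blkno + (unsigned long long)(num_clusters - 1) * bpc
	       + bpc / 2;
}

int main(void)
{
	/* prev_blkno = 1000, 3 clusters of 8 blocks: split at block 1020. */
	printf("%llu\n", xattr_split_point(1000, 3, 8));
	return 0;
}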
3145
3146static int ocfs2_read_xattr_bucket(struct inode *inode,
3147 u64 blkno,
3148 struct buffer_head **bhs,
3149 int new)
3150{
3151 int ret = 0;
3152 u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3153
3154 if (!new)
3155 return ocfs2_read_blocks(inode, blkno,
3156 blk_per_bucket, bhs,
3157 OCFS2_BH_CACHED);
3158
3159 for (i = 0; i < blk_per_bucket; i++) {
3160 bhs[i] = sb_getblk(inode->i_sb, blkno + i);
3161 if (bhs[i] == NULL) {
3162 ret = -EIO;
3163 mlog_errno(ret);
3164 break;
3165 }
3166 ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
3167 }
3168
3169 return ret;
3170}
3171
3172/*
3173 * Move half of the xattrs in the old bucket (blk) to the new bucket (new_blk).
3174 * first_hash will record the 1st hash of the new bucket.
3175 */
3176static int ocfs2_half_xattr_bucket(struct inode *inode,
3177 handle_t *handle,
3178 u64 blk,
3179 u64 new_blk,
3180 u32 *first_hash,
3181 int new_bucket_head)
3182{
3183 int ret, i;
3184 u16 count, start, len, name_value_len, xe_len, name_offset;
3185 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3186 struct buffer_head **s_bhs, **t_bhs = NULL;
3187 struct ocfs2_xattr_header *xh;
3188 struct ocfs2_xattr_entry *xe;
3189 int blocksize = inode->i_sb->s_blocksize;
3190
3191 mlog(0, "move half of xattrs from bucket %llu to %llu\n",
3192 blk, new_blk);
3193
3194 s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
3195 if (!s_bhs)
3196 return -ENOMEM;
3197
3198 ret = ocfs2_read_xattr_bucket(inode, blk, s_bhs, 0);
3199 if (ret) {
3200 mlog_errno(ret);
3201 goto out;
3202 }
3203
3204 ret = ocfs2_journal_access(handle, inode, s_bhs[0],
3205 OCFS2_JOURNAL_ACCESS_WRITE);
3206 if (ret) {
3207 mlog_errno(ret);
3208 goto out;
3209 }
3210
3211 t_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
3212 if (!t_bhs) {
3213 ret = -ENOMEM;
3214 goto out;
3215 }
3216
3217 ret = ocfs2_read_xattr_bucket(inode, new_blk, t_bhs, new_bucket_head);
3218 if (ret) {
3219 mlog_errno(ret);
3220 goto out;
3221 }
3222
3223 for (i = 0; i < blk_per_bucket; i++) {
3224 ret = ocfs2_journal_access(handle, inode, t_bhs[i],
3225 OCFS2_JOURNAL_ACCESS_CREATE);
3226 if (ret) {
3227 mlog_errno(ret);
3228 goto out;
3229 }
3230 }
3231
3232 /* Copy the whole bucket to the new one first. */
3233 for (i = 0; i < blk_per_bucket; i++)
3234 memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
3235
3236 /* update the new bucket. */
3237 xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data;
3238 count = le16_to_cpu(xh->xh_count);
3239 start = count / 2;
3240
3241 /*
3242 * Calculate the total name/value len and xh_free_start for
3243 * the old bucket first.
3244 */
3245 name_offset = OCFS2_XATTR_BUCKET_SIZE;
3246 name_value_len = 0;
3247 for (i = 0; i < start; i++) {
3248 xe = &xh->xh_entries[i];
3249 xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
3250 if (ocfs2_xattr_is_local(xe))
3251 xe_len +=
3252 OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
3253 else
3254 xe_len += OCFS2_XATTR_ROOT_SIZE;
3255 name_value_len += xe_len;
3256 if (le16_to_cpu(xe->xe_name_offset) < name_offset)
3257 name_offset = le16_to_cpu(xe->xe_name_offset);
3258 }
3259
3260 /*
3261 * Now begin the modification to the new bucket.
3262 *
3263 * In the new bucket, we just move the xattr entries to the beginning
3264 * and don't touch the names/values. So there will be some holes in the
3265 * bucket, and they will be removed when ocfs2_defrag_xattr_bucket is
3266 * called.
3267 */
3268 xe = &xh->xh_entries[start];
3269 len = sizeof(struct ocfs2_xattr_entry) * (count - start);
3270 mlog(0, "mv xattr entry len %d from %d to %d\n", len,
3271 (int)((char *)xe - (char *)xh),
3272 (int)((char *)xh->xh_entries - (char *)xh));
3273 memmove((char *)xh->xh_entries, (char *)xe, len);
3274 xe = &xh->xh_entries[count - start];
3275 len = sizeof(struct ocfs2_xattr_entry) * start;
3276 memset((char *)xe, 0, len);
3277
3278 le16_add_cpu(&xh->xh_count, -start);
3279 le16_add_cpu(&xh->xh_name_value_len, -name_value_len);
3280
3281 /* Calculate xh_free_start for the new bucket. */
3282 xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
3283 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
3284 xe = &xh->xh_entries[i];
3285 xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
3286 if (ocfs2_xattr_is_local(xe))
3287 xe_len +=
3288 OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
3289 else
3290 xe_len += OCFS2_XATTR_ROOT_SIZE;
3291 if (le16_to_cpu(xe->xe_name_offset) <
3292 le16_to_cpu(xh->xh_free_start))
3293 xh->xh_free_start = xe->xe_name_offset;
3294 }
3295
3296 /* set xh->xh_num_buckets for the new xh. */
3297 if (new_bucket_head)
3298 xh->xh_num_buckets = cpu_to_le16(1);
3299 else
3300 xh->xh_num_buckets = 0;
3301
3302 for (i = 0; i < blk_per_bucket; i++) {
3303 ret = ocfs2_journal_dirty(handle, t_bhs[i]);
3304 if (ret)
3305 mlog_errno(ret);
3306 }
3307
3308 /* store the first_hash of the new bucket. */
3309 if (first_hash)
3310 *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
3311
3312 /*
3313 * Now only update the 1st block of the old bucket.
3314 * Note that the entries have already been sorted above.
3315 */
3316 xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data;
3317 memset(&xh->xh_entries[start], 0,
3318 sizeof(struct ocfs2_xattr_entry) * (count - start));
3319 xh->xh_count = cpu_to_le16(start);
3320 xh->xh_free_start = cpu_to_le16(name_offset);
3321 xh->xh_name_value_len = cpu_to_le16(name_value_len);
3322
3323 ret = ocfs2_journal_dirty(handle, s_bhs[0]);
3324 if (ret)
3325 mlog_errno(ret);
3326
3327out:
3328 if (s_bhs) {
3329 for (i = 0; i < blk_per_bucket; i++)
3330 brelse(s_bhs[i]);
3331 }
3332 kfree(s_bhs);
3333
3334 if (t_bhs) {
3335 for (i = 0; i < blk_per_bucket; i++)
3336 brelse(t_bhs[i]);
3337 }
3338 kfree(t_bhs);
3339
3340 return ret;
3341}
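
/*
 * Editor's note: a minimal sketch of the xh_free_start recomputation in
 * ocfs2_half_xattr_bucket above. Free space in a bucket starts at the lowest
 * name_offset among the remaining entries. The toy types are hypothetical,
 * not the on-disk ocfs2 structures.
 */
#include <stdint.h>

struct toy_xattr_entry {
	uint16_t name_offset;
};

static uint16_t recompute_free_start(const struct toy_xattr_entry *e,
				     int count, uint16_t bucket_size)
{
	uint16_t free_start = bucket_size;
	int i;

	for (i = 0; i < count; i++)
		if (e[i].name_offset < free_start)
			free_start = e[i].name_offset;
	return free_start;
}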
3342
3343/*
3344 * Copy xattr from one bucket to another bucket.
3345 *
3346 * The caller must make sure that the journal transaction
3347 * has enough space for journaling.
3348 */
3349static int ocfs2_cp_xattr_bucket(struct inode *inode,
3350 handle_t *handle,
3351 u64 s_blkno,
3352 u64 t_blkno,
3353 int t_is_new)
3354{
3355 int ret, i;
3356 int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3357 int blocksize = inode->i_sb->s_blocksize;
3358 struct buffer_head **s_bhs, **t_bhs = NULL;
3359
3360 BUG_ON(s_blkno == t_blkno);
3361
3362 mlog(0, "cp bucket %llu to %llu, target is %d\n",
3363 s_blkno, t_blkno, t_is_new);
3364
3365 s_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
3366 GFP_NOFS);
3367 if (!s_bhs)
3368 return -ENOMEM;
3369
3370 ret = ocfs2_read_xattr_bucket(inode, s_blkno, s_bhs, 0);
3371 if (ret)
3372 goto out;
3373
3374 t_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
3375 GFP_NOFS);
3376 if (!t_bhs) {
3377 ret = -ENOMEM;
3378 goto out;
3379 }
3380
3381 ret = ocfs2_read_xattr_bucket(inode, t_blkno, t_bhs, t_is_new);
3382 if (ret)
3383 goto out;
3384
3385 for (i = 0; i < blk_per_bucket; i++) {
3386 ret = ocfs2_journal_access(handle, inode, t_bhs[i],
3387 OCFS2_JOURNAL_ACCESS_WRITE);
3388 if (ret)
3389 goto out;
3390 }
3391
3392 for (i = 0; i < blk_per_bucket; i++) {
3393 memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
3394 ocfs2_journal_dirty(handle, t_bhs[i]);
3395 }
3396
3397out:
3398 if (s_bhs) {
3399 for (i = 0; i < blk_per_bucket; i++)
3400 brelse(s_bhs[i]);
3401 }
3402 kfree(s_bhs);
3403
3404 if (t_bhs) {
3405 for (i = 0; i < blk_per_bucket; i++)
3406 brelse(t_bhs[i]);
3407 }
3408 kfree(t_bhs);
3409
3410 return ret;
3411}
3412
3413/*
3414 * Copy one xattr cluster from src_blk to to_blk.
3415 * The bucket at to_blk will become the first bucket header of the cluster, so
3416 * its xh_num_buckets will be initialized to the number of buckets in the cluster.
3417 */
3418static int ocfs2_cp_xattr_cluster(struct inode *inode,
3419 handle_t *handle,
3420 struct buffer_head *first_bh,
3421 u64 src_blk,
3422 u64 to_blk,
3423 u32 *first_hash)
3424{
3425 int i, ret, credits;
3426 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3427 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
3428 int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
3429 struct buffer_head *bh = NULL;
3430 struct ocfs2_xattr_header *xh;
3431 u64 to_blk_start = to_blk;
3432
3433 mlog(0, "cp xattrs from cluster %llu to %llu\n", src_blk, to_blk);
3434
3435 /*
3436 * We need to update the new cluster and 1 more for the update of
3437 * the 1st bucket of the previous extent rec.
3438 */
3439 credits = bpc + 1;
3440 ret = ocfs2_extend_trans(handle, credits);
3441 if (ret) {
3442 mlog_errno(ret);
3443 goto out;
3444 }
3445
3446 ret = ocfs2_journal_access(handle, inode, first_bh,
3447 OCFS2_JOURNAL_ACCESS_WRITE);
3448 if (ret) {
3449 mlog_errno(ret);
3450 goto out;
3451 }
3452
3453 for (i = 0; i < num_buckets; i++) {
3454 ret = ocfs2_cp_xattr_bucket(inode, handle,
3455 src_blk, to_blk, 1);
3456 if (ret) {
3457 mlog_errno(ret);
3458 goto out;
3459 }
3460
3461 src_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3462 to_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3463 }
3464
3465 /* update the old bucket header. */
3466 xh = (struct ocfs2_xattr_header *)first_bh->b_data;
3467 le16_add_cpu(&xh->xh_num_buckets, -num_buckets);
3468
3469 ocfs2_journal_dirty(handle, first_bh);
3470
3471 /* update the new bucket header. */
3472 ret = ocfs2_read_block(inode, to_blk_start, &bh);
3473 if (ret < 0) {
3474 mlog_errno(ret);
3475 goto out;
3476 }
3477
3478 ret = ocfs2_journal_access(handle, inode, bh,
3479 OCFS2_JOURNAL_ACCESS_WRITE);
3480 if (ret) {
3481 mlog_errno(ret);
3482 goto out;
3483 }
3484
3485 xh = (struct ocfs2_xattr_header *)bh->b_data;
3486 xh->xh_num_buckets = cpu_to_le16(num_buckets);
3487
3488 ocfs2_journal_dirty(handle, bh);
3489
3490 if (first_hash)
3491 *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
3492out:
3493 brelse(bh);
3494 return ret;
3495}
3496
3497/*
3498 * Move half of the xattrs in this cluster to the new cluster.
3499 * This function should only be called when bucket size == cluster size.
3500 * Otherwise ocfs2_mv_xattr_bucket_cross_cluster should be used instead.
3501 */
3502static int ocfs2_half_xattr_cluster(struct inode *inode,
3503 handle_t *handle,
3504 u64 prev_blk,
3505 u64 new_blk,
3506 u32 *first_hash)
3507{
3508 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3509 int ret, credits = 2 * blk_per_bucket;
3510
3511 BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
3512
3513 ret = ocfs2_extend_trans(handle, credits);
3514 if (ret) {
3515 mlog_errno(ret);
3516 return ret;
3517 }
3518
3519 /* Move half of the xattrs in prev_blk to the new bucket. */
3520 return ocfs2_half_xattr_bucket(inode, handle, prev_blk,
3521 new_blk, first_hash, 1);
3522}
3523
3524/*
3525 * Move some xattrs from the old cluster to the new one since they are not
3526 * contiguous in the ocfs2 xattr tree.
3527 *
3528 * new_blk starts a new separate cluster, and we will move some xattrs from
3529 * prev_blk to it. v_start will be set to the first name hash value in this
3530 * new cluster so that it can be used as e_cpos during tree insertion and
3531 * doesn't collide with our original b-tree operations. first_bh and header_bh
3532 * will also be updated since they will be used in ocfs2_extend_xattr_bucket
3533 * to extend the insert bucket.
3534 *
3535 * The questions are how many xattrs we should move to the new cluster and
3536 * when we should update first_bh and header_bh:
3537 * 1. If cluster size > bucket size, the previous cluster has more than one
3538 *    bucket, so just move half of the buckets into the new cluster and
3539 *    update first_bh and header_bh if the insert bucket has been moved
3540 *    to the new cluster.
3541 * 2. If cluster_size == bucket_size:
3542 *    a) If the previous extent rec has more than one cluster and the insert
3543 *       place isn't in the last cluster, copy the entire last cluster to the
3544 *       new one. In this case, we don't need to update first_bh and header_bh
3545 *       since they will not be moved into the new cluster.
3546 *    b) Otherwise, move the bottom half of the xattrs in the last cluster into
3547 *       the new one. We also set the extend flag to zero if the insert place
3548 *       is moved into the newly allocated cluster since no extend is needed.
3549 */
3550static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
3551 handle_t *handle,
3552 struct buffer_head **first_bh,
3553 struct buffer_head **header_bh,
3554 u64 new_blk,
3555 u64 prev_blk,
3556 u32 prev_clusters,
3557 u32 *v_start,
3558 int *extend)
3559{
3560 int ret = 0;
3561 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
3562
3563 mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n",
3564 prev_blk, prev_clusters, new_blk);
3565
3566 if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1)
3567 ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
3568 handle,
3569 first_bh,
3570 header_bh,
3571 new_blk,
3572 prev_blk,
3573 prev_clusters,
3574 v_start);
3575 else {
3576 u64 last_blk = prev_blk + bpc * (prev_clusters - 1);
3577
3578 if (prev_clusters > 1 && (*header_bh)->b_blocknr != last_blk)
3579 ret = ocfs2_cp_xattr_cluster(inode, handle, *first_bh,
3580 last_blk, new_blk,
3581 v_start);
3582 else {
3583 ret = ocfs2_half_xattr_cluster(inode, handle,
3584 last_blk, new_blk,
3585 v_start);
3586
3587 if ((*header_bh)->b_blocknr == last_blk && extend)
3588 *extend = 0;
3589 }
3590 }
3591
3592 return ret;
3593}
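
/*
 * Editor's note: the case analysis from the comment before
 * ocfs2_adjust_xattr_cross_cluster, distilled into a side-effect-free
 * sketch. The enum and names are hypothetical; the real function operates
 * on buffer_heads and the journal.
 */
enum xattr_split_strategy {
	MV_HALF_BUCKETS,	/* case 1: cluster holds several buckets */
	CP_LAST_CLUSTER,	/* case 2a: copy the whole last cluster */
	HALF_LAST_CLUSTER,	/* case 2b: split the last cluster */
};

static enum xattr_split_strategy pick_split_strategy(int buckets_per_cluster,
						     unsigned int prev_clusters,
						     int insert_in_last_cluster)
{
	if (buckets_per_cluster > 1)
		return MV_HALF_BUCKETS;
	if (prev_clusters > 1 && !insert_in_last_cluster)
		return CP_LAST_CLUSTER;
	return HALF_LAST_CLUSTER;
}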
3594
3595/*
3596 * Add a new cluster for xattr storage.
3597 *
3598 * If the new cluster is contiguous with the previous one, it will be
3599 * appended to the same extent record, and num_clusters will be updated.
3600 * If not, we will insert a new extent for it and move some xattrs in
3601 * the last cluster into the new allocated one.
3602 * We also need to limit the maximum size of a btree leaf, otherwise we'll
3603 * lose the benefits of hashing because we'll have to search large leaves.
3604 * So the maximum size is OCFS2_MAX_XATTR_TREE_LEAF_SIZE (or the cluster
3605 * size, if that is bigger).
3606 *
3607 * first_bh is the first block of the previous extent rec and header_bh
3608 * indicates the bucket into which we will insert the new xattrs. Both are
3609 * updated when header_bh is moved into the new cluster.
3610 */
3611static int ocfs2_add_new_xattr_cluster(struct inode *inode,
3612 struct buffer_head *root_bh,
3613 struct buffer_head **first_bh,
3614 struct buffer_head **header_bh,
3615 u32 *num_clusters,
3616 u32 prev_cpos,
3617 u64 prev_blkno,
3618 int *extend)
3619{
3620 int ret, credits;
3621 u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
3622 u32 prev_clusters = *num_clusters;
3623 u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0;
3624 u64 block;
3625 handle_t *handle = NULL;
3626 struct ocfs2_alloc_context *data_ac = NULL;
3627 struct ocfs2_alloc_context *meta_ac = NULL;
3628 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3629 struct ocfs2_extent_tree et;
3630
3631 mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, "
3632 "previous xattr blkno = %llu\n",
3633 (unsigned long long)OCFS2_I(inode)->ip_blkno,
3634 prev_cpos, prev_blkno);
3635
3636 ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
3637
3638 ret = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
3639 &data_ac, &meta_ac);
3640 if (ret) {
3641 mlog_errno(ret);
3642 goto leave;
3643 }
3644
3645 credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
3646 clusters_to_add);
3647 handle = ocfs2_start_trans(osb, credits);
3648 if (IS_ERR(handle)) {
3649 ret = PTR_ERR(handle);
3650 handle = NULL;
3651 mlog_errno(ret);
3652 goto leave;
3653 }
3654
3655 ret = ocfs2_journal_access(handle, inode, root_bh,
3656 OCFS2_JOURNAL_ACCESS_WRITE);
3657 if (ret < 0) {
3658 mlog_errno(ret);
3659 goto leave;
3660 }
3661
3662 ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
3663 clusters_to_add, &bit_off, &num_bits);
3664 if (ret < 0) {
3665 if (ret != -ENOSPC)
3666 mlog_errno(ret);
3667 goto leave;
3668 }
3669
3670 BUG_ON(num_bits > clusters_to_add);
3671
3672 block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
3673 mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n",
3674 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
3675
3676 if (prev_blkno + prev_clusters * bpc == block &&
3677 (prev_clusters + num_bits) << osb->s_clustersize_bits <=
3678 OCFS2_MAX_XATTR_TREE_LEAF_SIZE) {
3679 /*
3680 * If this cluster is contiguous with the old one and
3681 * adding it doesn't push us past the limit of
3682 * OCFS2_MAX_XATTR_TREE_LEAF_SIZE, it can be
3683 * initialized and used like the other buckets in the
3684 * previous cluster.
3685 * So add it as a contiguous one. The caller will handle
3686 * its init process.
3687 */
3688 v_start = prev_cpos + prev_clusters;
3689 *num_clusters = prev_clusters + num_bits;
3690 mlog(0, "Add contiguous %u clusters to previous extent rec.\n",
3691 num_bits);
3692 } else {
3693 ret = ocfs2_adjust_xattr_cross_cluster(inode,
3694 handle,
3695 first_bh,
3696 header_bh,
3697 block,
3698 prev_blkno,
3699 prev_clusters,
3700 &v_start,
3701 extend);
3702 if (ret) {
3703 mlog_errno(ret);
3704 goto leave;
3705 }
3706 }
3707
3708 if (handle->h_buffer_credits < credits) {
3709 /*
3710 * The journal has been restarted before and doesn't
3711 * have enough space for the insertion, so extend it
3712 * here.
3713 */
3714 ret = ocfs2_extend_trans(handle, credits);
3715 if (ret) {
3716 mlog_errno(ret);
3717 goto leave;
3718 }
3719 }
3720 mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
3721 num_bits, block, v_start);
3722 ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block,
3723 num_bits, 0, meta_ac);
3724 if (ret < 0) {
3725 mlog_errno(ret);
3726 goto leave;
3727 }
3728
3729 ret = ocfs2_journal_dirty(handle, root_bh);
3730 if (ret < 0) {
3731 mlog_errno(ret);
3732 goto leave;
3733 }
3734
3735leave:
3736 if (handle)
3737 ocfs2_commit_trans(osb, handle);
3738 if (data_ac)
3739 ocfs2_free_alloc_context(data_ac);
3740 if (meta_ac)
3741 ocfs2_free_alloc_context(meta_ac);
3742
3743 return ret;
3744}
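
/*
 * Editor's note: a hedged sketch of the contiguity test above. A freshly
 * allocated range is appended to the previous extent record only when it is
 * physically adjacent to it and the grown leaf would still fit within
 * OCFS2_MAX_XATTR_TREE_LEAF_SIZE. Hypothetical helper, not kernel code.
 */
static int xattr_extent_can_grow(unsigned long long prev_blkno,
				 unsigned int prev_clusters,
				 unsigned int bpc,
				 unsigned long long new_block,
				 unsigned int new_bits,
				 unsigned int clustersize_bits,
				 unsigned long long max_leaf_size)
{
	return prev_blkno + (unsigned long long)prev_clusters * bpc == new_block &&
	       ((unsigned long long)(prev_clusters + new_bits)
			<< clustersize_bits) <= max_leaf_size;
}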
3745
3746/*
3747 * Extend a new xattr bucket and move xattrs to the end one by one until
3748 * we reach start_bh. Only half of its xattrs are moved to the bucket after it.
3749 */
3750static int ocfs2_extend_xattr_bucket(struct inode *inode,
3751 struct buffer_head *first_bh,
3752 struct buffer_head *start_bh,
3753 u32 num_clusters)
3754{
3755 int ret, credits;
3756 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3757 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3758 u64 start_blk = start_bh->b_blocknr, end_blk;
3759 u32 num_buckets = num_clusters * ocfs2_xattr_buckets_per_cluster(osb);
3760 handle_t *handle;
3761 struct ocfs2_xattr_header *first_xh =
3762 (struct ocfs2_xattr_header *)first_bh->b_data;
3763 u16 bucket = le16_to_cpu(first_xh->xh_num_buckets);
3764
3765 mlog(0, "extend xattr bucket in %llu, xattr extent rec starting "
3766 "from %llu, len = %u\n", start_blk,
3767 (unsigned long long)first_bh->b_blocknr, num_clusters);
3768
3769 BUG_ON(bucket >= num_buckets);
3770
3771 end_blk = first_bh->b_blocknr + (bucket - 1) * blk_per_bucket;
3772
3773 /*
3774 * We will touch all the buckets after start_bh (including it).
3775 * Add one more bucket and modify first_bh.
3776 */
3777 credits = end_blk - start_blk + 2 * blk_per_bucket + 1;
3778 handle = ocfs2_start_trans(osb, credits);
3779 if (IS_ERR(handle)) {
3780 ret = PTR_ERR(handle);
3781 handle = NULL;
3782 mlog_errno(ret);
3783 goto out;
3784 }
3785
3786 ret = ocfs2_journal_access(handle, inode, first_bh,
3787 OCFS2_JOURNAL_ACCESS_WRITE);
3788 if (ret) {
3789 mlog_errno(ret);
3790 goto commit;
3791 }
3792
3793 while (end_blk != start_blk) {
3794 ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk,
3795 end_blk + blk_per_bucket, 0);
3796 if (ret)
3797 goto commit;
3798 end_blk -= blk_per_bucket;
3799 }
3800
3801 /* Move half of the xattr in start_blk to the next bucket. */
3802 ret = ocfs2_half_xattr_bucket(inode, handle, start_blk,
3803 start_blk + blk_per_bucket, NULL, 0);
3804
3805 le16_add_cpu(&first_xh->xh_num_buckets, 1);
3806 ocfs2_journal_dirty(handle, first_bh);
3807
3808commit:
3809 ocfs2_commit_trans(osb, handle);
3810out:
3811 return ret;
3812}
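
/*
 * Editor's note: the bucket-shift loop above, restated over a flat array as
 * a sketch. Buckets start..last each move one slot toward the end, working
 * backwards so no source bucket is overwritten before it is copied; the
 * real code performs the same walk with journaled block copies.
 */
#include <string.h>

static void shift_buckets_right(char *area, size_t bucket_size,
				int start, int last)
{
	int i;

	for (i = last; i >= start; i--)
		memcpy(area + (size_t)(i + 1) * bucket_size,
		       area + (size_t)i * bucket_size, bucket_size);
}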
3813
3814/*
3815 * Add new xattr bucket in an extent record and adjust the buckets accordingly.
3816 * xb_bh is the ocfs2_xattr_block.
3817 * We will move all the buckets starting from header_bh one place forward. As
3818 * for header_bh itself, half of its xattrs will be moved to the next bucket.
3819 *
3820 * We will allocate a new cluster if current cluster is full and adjust
3821 * header_bh and first_bh if the insert place is moved to the new cluster.
3822 */
3823static int ocfs2_add_new_xattr_bucket(struct inode *inode,
3824 struct buffer_head *xb_bh,
3825 struct buffer_head *header_bh)
3826{
3827 struct ocfs2_xattr_header *first_xh = NULL;
3828 struct buffer_head *first_bh = NULL;
3829 struct ocfs2_xattr_block *xb =
3830 (struct ocfs2_xattr_block *)xb_bh->b_data;
3831 struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
3832 struct ocfs2_extent_list *el = &xb_root->xt_list;
3833 struct ocfs2_xattr_header *xh =
3834 (struct ocfs2_xattr_header *)header_bh->b_data;
3835 u32 name_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
3836 struct super_block *sb = inode->i_sb;
3837 struct ocfs2_super *osb = OCFS2_SB(sb);
3838 int ret, num_buckets, extend = 1;
3839 u64 p_blkno;
3840 u32 e_cpos, num_clusters;
3841
3842 mlog(0, "Add new xattr bucket starting from %llu\n",
3843 (unsigned long long)header_bh->b_blocknr);
3844
3845 /*
3846 * Add a reference for header_bh here because it may be
3847 * changed in ocfs2_add_new_xattr_cluster and we need
3848 * to free it in the end.
3849 */
3850 get_bh(header_bh);
3851
3852 ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos,
3853 &num_clusters, el);
3854 if (ret) {
3855 mlog_errno(ret);
3856 goto out;
3857 }
3858
3859 ret = ocfs2_read_block(inode, p_blkno, &first_bh);
3860 if (ret) {
3861 mlog_errno(ret);
3862 goto out;
3863 }
3864
3865 num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters;
3866 first_xh = (struct ocfs2_xattr_header *)first_bh->b_data;
3867
3868 if (num_buckets == le16_to_cpu(first_xh->xh_num_buckets)) {
3869 ret = ocfs2_add_new_xattr_cluster(inode,
3870 xb_bh,
3871 &first_bh,
3872 &header_bh,
3873 &num_clusters,
3874 e_cpos,
3875 p_blkno,
3876 &extend);
3877 if (ret) {
3878 mlog_errno(ret);
3879 goto out;
3880 }
3881 }
3882
3883 if (extend)
3884 ret = ocfs2_extend_xattr_bucket(inode,
3885 first_bh,
3886 header_bh,
3887 num_clusters);
3888 if (ret)
3889 mlog_errno(ret);
3890out:
3891 brelse(first_bh);
3892 brelse(header_bh);
3893 return ret;
3894}
3895
3896static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
3897 struct ocfs2_xattr_bucket *bucket,
3898 int offs)
3899{
3900 int block_off = offs >> inode->i_sb->s_blocksize_bits;
3901
3902 offs = offs % inode->i_sb->s_blocksize;
3903 return bucket->bhs[block_off]->b_data + offs;
3904}
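
/*
 * Editor's note: a worked example of the offset split above, assuming a 4K
 * block size (s_blocksize_bits == 12). An in-bucket offset of 5000 lands in
 * block 1 of the bucket, at byte 904 within that block.
 */
static void bucket_offset_example(void)
{
	int offs = 5000, bits = 12, blocksize = 1 << bits;
	int block_off = offs >> bits;		/* 5000 / 4096 = 1 */
	int in_block = offs % blocksize;	/* 5000 - 4096 = 904 */

	(void)block_off;
	(void)in_block;
}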
3905
3906/*
3907 * Handle the normal xattr set, including replace, delete and new.
3908 *
3909 * Note: "local" indicates where the real data lives. So we can't
3910 * judge whether it is local to the bucket just by its length.
3911 */
3912static void ocfs2_xattr_set_entry_normal(struct inode *inode,
3913 struct ocfs2_xattr_info *xi,
3914 struct ocfs2_xattr_search *xs,
3915 u32 name_hash,
3916 int local)
3917{
3918 struct ocfs2_xattr_entry *last, *xe;
3919 int name_len = strlen(xi->name);
3920 struct ocfs2_xattr_header *xh = xs->header;
3921 u16 count = le16_to_cpu(xh->xh_count), start;
3922 size_t blocksize = inode->i_sb->s_blocksize;
3923 char *val;
3924 size_t offs, size, new_size;
3925
3926 last = &xh->xh_entries[count];
3927 if (!xs->not_found) {
3928 xe = xs->here;
3929 offs = le16_to_cpu(xe->xe_name_offset);
3930 if (ocfs2_xattr_is_local(xe))
3931 size = OCFS2_XATTR_SIZE(name_len) +
3932 OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
3933 else
3934 size = OCFS2_XATTR_SIZE(name_len) +
3935 OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
3936
3937 /*
3938 * If the new value will be stored outside, xi->value has been
3939 * initialized as an empty ocfs2_xattr_value_root, and the same
3940 * goes with xi->value_len, so we can set new_size safely here.
3941 * See ocfs2_xattr_set_in_bucket.
3942 */
3943 new_size = OCFS2_XATTR_SIZE(name_len) +
3944 OCFS2_XATTR_SIZE(xi->value_len);
3945
3946 le16_add_cpu(&xh->xh_name_value_len, -size);
3947 if (xi->value) {
3948 if (new_size > size)
3949 goto set_new_name_value;
3950
3951 /* Now replace the old value with new one. */
3952 if (local)
3953 xe->xe_value_size = cpu_to_le64(xi->value_len);
3954 else
3955 xe->xe_value_size = 0;
3956
3957 val = ocfs2_xattr_bucket_get_val(inode,
3958 &xs->bucket, offs);
3959 memset(val + OCFS2_XATTR_SIZE(name_len), 0,
3960 size - OCFS2_XATTR_SIZE(name_len));
3961 if (OCFS2_XATTR_SIZE(xi->value_len) > 0)
3962 memcpy(val + OCFS2_XATTR_SIZE(name_len),
3963 xi->value, xi->value_len);
3964
3965 le16_add_cpu(&xh->xh_name_value_len, new_size);
3966 ocfs2_xattr_set_local(xe, local);
3967 return;
3968 } else {
3969 /*
3970 * Remove the old entry if there is more than one.
3971 * We don't remove the last entry so that we can
3972 * use it to indicate the hash value of the empty
3973 * bucket.
3974 */
3975 last -= 1;
3976 le16_add_cpu(&xh->xh_count, -1);
3977 if (xh->xh_count) {
3978 memmove(xe, xe + 1,
3979 (void *)last - (void *)xe);
3980 memset(last, 0,
3981 sizeof(struct ocfs2_xattr_entry));
3982 } else
3983 xh->xh_free_start =
3984 cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
3985
3986 return;
3987 }
3988 } else {
3989 /* find a new entry for insert. */
3990 int low = 0, high = count - 1, tmp;
3991 struct ocfs2_xattr_entry *tmp_xe;
3992
3993 while (low <= high && count) {
3994 tmp = (low + high) / 2;
3995 tmp_xe = &xh->xh_entries[tmp];
3996
3997 if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash))
3998 low = tmp + 1;
3999 else if (name_hash <
4000 le32_to_cpu(tmp_xe->xe_name_hash))
4001 high = tmp - 1;
4002 else {
4003 low = tmp;
4004 break;
4005 }
4006 }
4007
4008 xe = &xh->xh_entries[low];
4009 if (low != count)
4010 memmove(xe + 1, xe, (void *)last - (void *)xe);
4011
4012 le16_add_cpu(&xh->xh_count, 1);
4013 memset(xe, 0, sizeof(struct ocfs2_xattr_entry));
4014 xe->xe_name_hash = cpu_to_le32(name_hash);
4015 xe->xe_name_len = name_len;
4016 ocfs2_xattr_set_type(xe, xi->name_index);
4017 }
4018
4019set_new_name_value:
4020 /* Insert the new name+value. */
4021 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(xi->value_len);
4022
4023 /*
4024 * We must make sure that the name/value pair
4025 * is stored within a single block.
4026 */
4027 offs = le16_to_cpu(xh->xh_free_start);
4028 start = offs - size;
4029
4030 if (start >> inode->i_sb->s_blocksize_bits !=
4031 (offs - 1) >> inode->i_sb->s_blocksize_bits) {
4032 offs = offs - offs % blocksize;
4033 xh->xh_free_start = cpu_to_le16(offs);
4034 }
4035
4036 val = ocfs2_xattr_bucket_get_val(inode,
4037 &xs->bucket, offs - size);
4038 xe->xe_name_offset = cpu_to_le16(offs - size);
4039
4040 memset(val, 0, size);
4041 memcpy(val, xi->name, name_len);
4042 memcpy(val + OCFS2_XATTR_SIZE(name_len), xi->value, xi->value_len);
4043
4044 xe->xe_value_size = cpu_to_le64(xi->value_len);
4045 ocfs2_xattr_set_local(xe, local);
4046 xs->here = xe;
4047 le16_add_cpu(&xh->xh_free_start, -size);
4048 le16_add_cpu(&xh->xh_name_value_len, size);
4049
4050 return;
4051}
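
/*
 * Editor's note: the insertion-point search in ocfs2_xattr_set_entry_normal
 * above is a binary search on xe_name_hash; here it is restated over a plain
 * sorted u32 array as a sketch.
 */
#include <stdint.h>

static int find_insert_slot(const uint32_t *hashes, int count, uint32_t hash)
{
	int low = 0, high = count - 1, mid;

	while (low <= high && count) {
		mid = (low + high) / 2;
		if (hash > hashes[mid])
			low = mid + 1;
		else if (hash < hashes[mid])
			high = mid - 1;
		else
			return mid;	/* equal hash: insert at this slot */
	}
	return low;			/* first slot with a larger hash */
}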
4052
4053static int ocfs2_xattr_bucket_handle_journal(struct inode *inode,
4054 handle_t *handle,
4055 struct ocfs2_xattr_search *xs,
4056 struct buffer_head **bhs,
4057 u16 bh_num)
4058{
4059 int ret = 0, off, block_off;
4060 struct ocfs2_xattr_entry *xe = xs->here;
4061
4062 /*
4063 * First calculate all the blocks we should journal_access
4064 * and journal_dirty. The first block should always be touched.
4065 */
4066 ret = ocfs2_journal_dirty(handle, bhs[0]);
4067 if (ret)
4068 mlog_errno(ret);
4069
4070 /* calc the data. */
4071 off = le16_to_cpu(xe->xe_name_offset);
4072 block_off = off >> inode->i_sb->s_blocksize_bits;
4073 ret = ocfs2_journal_dirty(handle, bhs[block_off]);
4074 if (ret)
4075 mlog_errno(ret);
4076
4077 return ret;
4078}
4079
4080/*
4081 * Set the xattr entry in the specified bucket.
4082 * The bucket is indicated by xs->bucket and it should have enough
4083 * space for the xattr insertion.
4084 */
4085static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
4086 struct ocfs2_xattr_info *xi,
4087 struct ocfs2_xattr_search *xs,
4088 u32 name_hash,
4089 int local)
4090{
4091 int i, ret;
4092 handle_t *handle = NULL;
4093 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
4094 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
4095
4096 mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
4097 (unsigned long)xi->value_len, xi->name_index,
4098 (unsigned long long)xs->bucket.bhs[0]->b_blocknr);
4099
4100 if (!xs->bucket.bhs[1]) {
4101 ret = ocfs2_read_blocks(inode,
4102 xs->bucket.bhs[0]->b_blocknr + 1,
4103 blk_per_bucket - 1, &xs->bucket.bhs[1],
4104 OCFS2_BH_CACHED);
4105 if (ret) {
4106 mlog_errno(ret);
4107 goto out;
4108 }
4109 }
4110
4111 handle = ocfs2_start_trans(osb, blk_per_bucket);
4112 if (IS_ERR(handle)) {
4113 ret = PTR_ERR(handle);
4114 handle = NULL;
4115 mlog_errno(ret);
4116 goto out;
4117 }
4118
4119 for (i = 0; i < blk_per_bucket; i++) {
4120 ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[i],
4121 OCFS2_JOURNAL_ACCESS_WRITE);
4122 if (ret < 0) {
4123 mlog_errno(ret);
4124 goto out;
4125 }
4126 }
4127
4128 ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local);
4129
4130 /* Only dirty the blocks we have touched in set xattr. */
4131 ret = ocfs2_xattr_bucket_handle_journal(inode, handle, xs,
4132 xs->bucket.bhs, blk_per_bucket);
4133 if (ret)
4134 mlog_errno(ret);
4135out:
4136 if (handle)
4137  ocfs2_commit_trans(osb, handle);
4137
4138 return ret;
4139}
4140
4141static int ocfs2_xattr_value_update_size(struct inode *inode,
4142 struct buffer_head *xe_bh,
4143 struct ocfs2_xattr_entry *xe,
4144 u64 new_size)
4145{
4146 int ret;
4147 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
4148 handle_t *handle = NULL;
4149
4150 handle = ocfs2_start_trans(osb, 1);
4151 if (IS_ERR(handle)) {
4152  ret = PTR_ERR(handle);
4153 mlog_errno(ret);
4154 goto out;
4155 }
4156
4157 ret = ocfs2_journal_access(handle, inode, xe_bh,
4158 OCFS2_JOURNAL_ACCESS_WRITE);
4159 if (ret < 0) {
4160 mlog_errno(ret);
4161 goto out_commit;
4162 }
4163
4164 xe->xe_value_size = cpu_to_le64(new_size);
4165
4166 ret = ocfs2_journal_dirty(handle, xe_bh);
4167 if (ret < 0)
4168 mlog_errno(ret);
4169
4170out_commit:
4171 ocfs2_commit_trans(osb, handle);
4172out:
4173 return ret;
4174}
4175
4176/*
4177 * Truncate the value of the entry at index xe_off in the xattr bucket
4178 * indicated by header_bh; len is the new length.
4179 * Both the ocfs2_xattr_value_root and the entry will be updated here.
4182 */
4183static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
4184 struct buffer_head *header_bh,
4185 int xe_off,
4186 int len)
4187{
4188 int ret, offset;
4189 u64 value_blk;
4190 struct buffer_head *value_bh = NULL;
4191 struct ocfs2_xattr_value_root *xv;
4192 struct ocfs2_xattr_entry *xe;
4193 struct ocfs2_xattr_header *xh =
4194 (struct ocfs2_xattr_header *)header_bh->b_data;
4195 size_t blocksize = inode->i_sb->s_blocksize;
4196
4197 xe = &xh->xh_entries[xe_off];
4198
4199 BUG_ON(!xe || ocfs2_xattr_is_local(xe));
4200
4201 offset = le16_to_cpu(xe->xe_name_offset) +
4202 OCFS2_XATTR_SIZE(xe->xe_name_len);
4203
4204 value_blk = offset / blocksize;
4205
4206 /* We don't allow an ocfs2_xattr_value_root to be split across blocks. */
4207 BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize);
4208 value_blk += header_bh->b_blocknr;
4209
4210 ret = ocfs2_read_block(inode, value_blk, &value_bh);
4211 if (ret) {
4212 mlog_errno(ret);
4213 goto out;
4214 }
4215
4216 xv = (struct ocfs2_xattr_value_root *)
4217 (value_bh->b_data + offset % blocksize);
4218
4219 mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n",
4220 xe_off, (unsigned long long)header_bh->b_blocknr, len);
4221 ret = ocfs2_xattr_value_truncate(inode, value_bh, xv, len);
4222 if (ret) {
4223 mlog_errno(ret);
4224 goto out;
4225 }
4226
4227 ret = ocfs2_xattr_value_update_size(inode, header_bh, xe, len);
4228 if (ret) {
4229 mlog_errno(ret);
4230 goto out;
4231 }
4232
4233out:
4234 brelse(value_bh);
4235 return ret;
4236}
4237
4238static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
4239 struct ocfs2_xattr_search *xs,
4240 int len)
4241{
4242 int ret, offset;
4243 struct ocfs2_xattr_entry *xe = xs->here;
4244 struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base;
4245
4246 BUG_ON(!xs->bucket.bhs[0] || !xe || ocfs2_xattr_is_local(xe));
4247
4248 offset = xe - xh->xh_entries;
4249 ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket.bhs[0],
4250 offset, len);
4251 if (ret)
4252 mlog_errno(ret);
4253
4254 return ret;
4255}
4256
4257static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
4258 struct ocfs2_xattr_search *xs,
4259 char *val,
4260 int value_len)
4261{
4262 int offset;
4263 struct ocfs2_xattr_value_root *xv;
4264 struct ocfs2_xattr_entry *xe = xs->here;
4265
4266 BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe));
4267
4268 offset = le16_to_cpu(xe->xe_name_offset) +
4269 OCFS2_XATTR_SIZE(xe->xe_name_len);
4270
4271 xv = (struct ocfs2_xattr_value_root *)(xs->base + offset);
4272
4273 return __ocfs2_xattr_set_value_outside(inode, xv, val, value_len);
4274}
4275
4276static int ocfs2_rm_xattr_cluster(struct inode *inode,
4277 struct buffer_head *root_bh,
4278 u64 blkno,
4279 u32 cpos,
4280 u32 len)
4281{
4282 int ret;
4283 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
4284 struct inode *tl_inode = osb->osb_tl_inode;
4285 handle_t *handle;
4286 struct ocfs2_xattr_block *xb =
4287 (struct ocfs2_xattr_block *)root_bh->b_data;
4288 struct ocfs2_alloc_context *meta_ac = NULL;
4289 struct ocfs2_cached_dealloc_ctxt dealloc;
4290 struct ocfs2_extent_tree et;
4291
4292 ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
4293
4294 ocfs2_init_dealloc_ctxt(&dealloc);
4295
4296 mlog(0, "rm xattr extent rec at %u len = %u, start from %llu\n",
4297 cpos, len, (unsigned long long)blkno);
4298
4299 ocfs2_remove_xattr_clusters_from_cache(inode, blkno, len);
4300
4301 ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
4302 if (ret) {
4303 mlog_errno(ret);
4304 return ret;
4305 }
4306
4307 mutex_lock(&tl_inode->i_mutex);
4308
4309 if (ocfs2_truncate_log_needs_flush(osb)) {
4310 ret = __ocfs2_flush_truncate_log(osb);
4311 if (ret < 0) {
4312 mlog_errno(ret);
4313 goto out;
4314 }
4315 }
4316
4317 handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
4318 if (IS_ERR(handle)) {
4319  ret = PTR_ERR(handle);
4320 mlog_errno(ret);
4321 goto out;
4322 }
4323
4324 ret = ocfs2_journal_access(handle, inode, root_bh,
4325 OCFS2_JOURNAL_ACCESS_WRITE);
4326 if (ret) {
4327 mlog_errno(ret);
4328 goto out_commit;
4329 }
4330
4331 ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
4332 &dealloc);
4333 if (ret) {
4334 mlog_errno(ret);
4335 goto out_commit;
4336 }
4337
4338 le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len);
4339
4340 ret = ocfs2_journal_dirty(handle, root_bh);
4341 if (ret) {
4342 mlog_errno(ret);
4343 goto out_commit;
4344 }
4345
4346 ret = ocfs2_truncate_log_append(osb, handle, blkno, len);
4347 if (ret)
4348 mlog_errno(ret);
4349
4350out_commit:
4351 ocfs2_commit_trans(osb, handle);
4352out:
4353 ocfs2_schedule_truncate_log_flush(osb, 1);
4354
4355 mutex_unlock(&tl_inode->i_mutex);
4356
4357 if (meta_ac)
4358 ocfs2_free_alloc_context(meta_ac);
4359
4360 ocfs2_run_deallocs(osb, &dealloc);
4361
4362 return ret;
4363}
4364
4365static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
4366 struct ocfs2_xattr_search *xs)
4367{
4368 handle_t *handle = NULL;
4369 struct ocfs2_xattr_header *xh = xs->bucket.xh;
4370 struct ocfs2_xattr_entry *last = &xh->xh_entries[
4371 le16_to_cpu(xh->xh_count) - 1];
4372 int ret = 0;
4373
4374 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1);
4375 if (IS_ERR(handle)) {
4376 ret = PTR_ERR(handle);
4377 mlog_errno(ret);
4378 return;
4379 }
4380
4381 ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[0],
4382 OCFS2_JOURNAL_ACCESS_WRITE);
4383 if (ret) {
4384 mlog_errno(ret);
4385 goto out_commit;
4386 }
4387
4388 /* Remove the old entry. */
4389 memmove(xs->here, xs->here + 1,
4390 (void *)last - (void *)xs->here);
4391 memset(last, 0, sizeof(struct ocfs2_xattr_entry));
4392 le16_add_cpu(&xh->xh_count, -1);
4393
4394 ret = ocfs2_journal_dirty(handle, xs->bucket.bhs[0]);
4395 if (ret < 0)
4396 mlog_errno(ret);
4397out_commit:
4398 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
4399}
4400
4401/*
4402 * Set the xattr name/value in the bucket specified in xs.
4403 *
4404 * As the new value in xi may be stored in the bucket or in an outside cluster,
4405 * we divide the whole process into 4 steps:
4406 * 1. Insert the name/value in the bucket (ocfs2_xattr_set_entry_in_bucket).
4407 * 2. Truncate the outside cluster (ocfs2_xattr_bucket_value_truncate_xs).
4408 * 3. Set the value in the outside cluster (ocfs2_xattr_bucket_set_value_outside).
4409 * 4. If the clusters for the new outside value can't be allocated, free the
4410 *    xattr we inserted in step 1.
4411 */
4412static int ocfs2_xattr_set_in_bucket(struct inode *inode,
4413 struct ocfs2_xattr_info *xi,
4414 struct ocfs2_xattr_search *xs)
4415{
4416 int ret, local = 1;
4417 size_t value_len;
4418 char *val = (char *)xi->value;
4419 struct ocfs2_xattr_entry *xe = xs->here;
4420 u32 name_hash = ocfs2_xattr_name_hash(inode, xi->name,
4421 strlen(xi->name));
4422
4423 if (!xs->not_found && !ocfs2_xattr_is_local(xe)) {
4424 /*
4425 * We need to truncate the xattr storage first.
4426 *
4427 * If both the old and new value are stored
4428 * outside the block, we only need to truncate
4429 * the storage and then set the value outside.
4430 *
4431 * If the new value will be stored within the block,
4432 * we free all the outside blocks first; the
4433 * modification to the xattr block is done by the
4434 * following steps.
4435 */
4436 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
4437 value_len = xi->value_len;
4438 else
4439 value_len = 0;
4440
4441 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
4442 value_len);
4443 if (ret)
4444 goto out;
4445
4446 if (value_len)
4447 goto set_value_outside;
4448 }
4449
4450 value_len = xi->value_len;
4451 /* So we have to handle the inside block change now. */
4452 if (value_len > OCFS2_XATTR_INLINE_SIZE) {
4453 /*
4454 * If the new value will be stored outside of block,
4455 * initialize a new empty value root and insert it first.
4456 */
4457 local = 0;
4458 xi->value = &def_xv;
4459 xi->value_len = OCFS2_XATTR_ROOT_SIZE;
4460 }
4461
4462 ret = ocfs2_xattr_set_entry_in_bucket(inode, xi, xs, name_hash, local);
4463 if (ret) {
4464 mlog_errno(ret);
4465 goto out;
4466 }
4467
4468 if (value_len <= OCFS2_XATTR_INLINE_SIZE)
4469 goto out;
4470
4471 /* allocate the space now for the outside block storage. */
4472 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
4473 value_len);
4474 if (ret) {
4475 mlog_errno(ret);
4476
4477 if (xs->not_found) {
4478 /*
4479 * We can't allocate enough clusters for outside
4480 * storage and we have allocated xattr already,
4481 * so need to remove it.
4482 */
4483 ocfs2_xattr_bucket_remove_xs(inode, xs);
4484 }
4485 goto out;
4486 }
4487
4488set_value_outside:
4489 ret = ocfs2_xattr_bucket_set_value_outside(inode, xs, val, value_len);
4490out:
4491 return ret;
4492}
4493
4494/* Check whether the xattr bucket is filled entirely with entries of the same hash value. */
4495static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
4496 struct ocfs2_xattr_bucket *bucket)
4497{
4498 struct ocfs2_xattr_header *xh = bucket->xh;
4499
4500 if (xh->xh_entries[le16_to_cpu(xh->xh_count) - 1].xe_name_hash ==
4501 xh->xh_entries[0].xe_name_hash) {
4502 mlog(ML_ERROR, "Too many hash collisions in xattr bucket %llu, "
4503 "hash = %u\n",
4504 (unsigned long long)bucket->bhs[0]->b_blocknr,
4505 le32_to_cpu(xh->xh_entries[0].xe_name_hash));
4506 return -ENOSPC;
4507 }
4508
4509 return 0;
4510}
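
/*
 * Editor's note: the collision test above in miniature. Entries in a bucket
 * are sorted by hash, so the bucket cannot usefully be split once its first
 * and last entries share the same hash value.
 */
#include <stdint.h>

static int bucket_is_all_one_hash(const uint32_t *hashes, int count)
{
	return count > 0 && hashes[0] == hashes[count - 1];
}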
4511
4512static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
4513 struct ocfs2_xattr_info *xi,
4514 struct ocfs2_xattr_search *xs)
4515{
4516 struct ocfs2_xattr_header *xh;
4517 struct ocfs2_xattr_entry *xe;
4518 u16 count, header_size, xh_free_start;
4519 int i, free, max_free, need, old;
4520 size_t value_size = 0, name_len = strlen(xi->name);
4521 size_t blocksize = inode->i_sb->s_blocksize;
4522 int ret, allocation = 0;
4523 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
4524
4525 mlog_entry("Set xattr %s in xattr index block\n", xi->name);
4526
4527try_again:
4528 xh = xs->header;
4529 count = le16_to_cpu(xh->xh_count);
4530 xh_free_start = le16_to_cpu(xh->xh_free_start);
4531 header_size = sizeof(struct ocfs2_xattr_header) +
4532 count * sizeof(struct ocfs2_xattr_entry);
4533 max_free = OCFS2_XATTR_BUCKET_SIZE -
4534 le16_to_cpu(xh->xh_name_value_len) - header_size;
4535
4536 mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
4537 "of %u which exceed block size\n",
4538 (unsigned long long)xs->bucket.bhs[0]->b_blocknr,
4539 header_size);
4540
4541 if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE)
4542 value_size = OCFS2_XATTR_ROOT_SIZE;
4543 else if (xi->value)
4544 value_size = OCFS2_XATTR_SIZE(xi->value_len);
4545
4546 if (xs->not_found)
4547 need = sizeof(struct ocfs2_xattr_entry) +
4548 OCFS2_XATTR_SIZE(name_len) + value_size;
4549 else {
4550 need = value_size + OCFS2_XATTR_SIZE(name_len);
4551
4552 /*
4553 * We only replace the old value if the new length is smaller
4554 * than the old one. Otherwise we will allocate new space in the
4555 * bucket to store it.
4556 */
4557 xe = xs->here;
4558 if (ocfs2_xattr_is_local(xe))
4559 old = OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
4560 else
4561 old = OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
4562
4563 if (old >= value_size)
4564 need = 0;
4565 }
4566
4567 free = xh_free_start - header_size;
4568 /*
4569 * We need to make sure the new name/value pair
4570 * fits within a single block.
4571 */
4572 if (xh_free_start % blocksize < need)
4573 free -= xh_free_start % blocksize;
4574
4575 mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, "
4576 "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len ="
4577 " %u\n", xs->not_found,
4578 (unsigned long long)xs->bucket.bhs[0]->b_blocknr,
4579 free, need, max_free, le16_to_cpu(xh->xh_free_start),
4580 le16_to_cpu(xh->xh_name_value_len));
4581
4582 if (free < need || count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
4583 if (need <= max_free &&
4584 count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
4585 /*
4586 * We can create the space by defragmenting. Since only
4587 * the name/value pairs will be moved, the xe in xs
4588 * shouldn't be changed.
4589 */
4590 ret = ocfs2_defrag_xattr_bucket(inode, &xs->bucket);
4591 if (ret) {
4592 mlog_errno(ret);
4593 goto out;
4594 }
4595
4596 xh_free_start = le16_to_cpu(xh->xh_free_start);
4597 free = xh_free_start - header_size;
4598 if (xh_free_start % blocksize < need)
4599 free -= xh_free_start % blocksize;
4600
4601 if (free >= need)
4602 goto xattr_set;
4603
4604 mlog(0, "Can't get enough space for xattr insert by "
4605      "defragmentation. Need %u bytes, but we have %d, so "
4606 "allocate new bucket for it.\n", need, free);
4607 }
4608
4609 /*
4610 * We have to add new buckets or clusters and one
4611 * allocation should leave us enough space for insert.
4612 */
4613 BUG_ON(allocation);
4614
4615 /*
4616 * We do not allow hash ranges to overlap between buckets, so
4617 * the maximum number of collisions we can absorb is one
4618 * bucket's worth. Check here whether we need to add a new
4619 * bucket for the insert.
4620 */
4621 ret = ocfs2_check_xattr_bucket_collision(inode, &xs->bucket);
4622 if (ret) {
4623 mlog_errno(ret);
4624 goto out;
4625 }
4626
4627 ret = ocfs2_add_new_xattr_bucket(inode,
4628 xs->xattr_bh,
4629 xs->bucket.bhs[0]);
4630 if (ret) {
4631 mlog_errno(ret);
4632 goto out;
4633 }
4634
4635 for (i = 0; i < blk_per_bucket; i++)
4636 brelse(xs->bucket.bhs[i]);
4637
4638 memset(&xs->bucket, 0, sizeof(xs->bucket));
4639
4640 ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
4641 xi->name_index,
4642 xi->name, xs);
4643 if (ret && ret != -ENODATA)
4644 goto out;
4645 xs->not_found = ret;
4646 allocation = 1;
4647 goto try_again;
4648 }
4649
4650xattr_set:
4651 ret = ocfs2_xattr_set_in_bucket(inode, xi, xs);
4652out:
4653 mlog_exit(ret);
4654 return ret;
4655}
4656
4657static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
4658 struct ocfs2_xattr_bucket *bucket,
4659 void *para)
4660{
4661 int ret = 0;
4662 struct ocfs2_xattr_header *xh = bucket->xh;
4663 u16 i;
4664 struct ocfs2_xattr_entry *xe;
4665
4666 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
4667 xe = &xh->xh_entries[i];
4668 if (ocfs2_xattr_is_local(xe))
4669 continue;
4670
4671 ret = ocfs2_xattr_bucket_value_truncate(inode,
4672 bucket->bhs[0],
4673 i, 0);
4674 if (ret) {
4675 mlog_errno(ret);
4676 break;
4677 }
4678 }
4679
4680 return ret;
4681}
4682
4683static int ocfs2_delete_xattr_index_block(struct inode *inode,
4684 struct buffer_head *xb_bh)
4685{
4686 struct ocfs2_xattr_block *xb =
4687 (struct ocfs2_xattr_block *)xb_bh->b_data;
4688 struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
4689 int ret = 0;
4690 u32 name_hash = UINT_MAX, e_cpos, num_clusters;
4691 u64 p_blkno;
4692
4693 if (le16_to_cpu(el->l_next_free_rec) == 0)
4694 return 0;
4695
4696 while (name_hash > 0) {
4697 ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno,
4698 &e_cpos, &num_clusters, el);
4699 if (ret) {
4700 mlog_errno(ret);
4701 goto out;
4702 }
4703
4704 ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters,
4705 ocfs2_delete_xattr_in_bucket,
4706 NULL);
4707 if (ret) {
4708 mlog_errno(ret);
4709 goto out;
4710 }
4711
4712 ret = ocfs2_rm_xattr_cluster(inode, xb_bh,
4713 p_blkno, e_cpos, num_clusters);
4714 if (ret) {
4715 mlog_errno(ret);
4716 break;
4717 }
4718
4719 if (e_cpos == 0)
4720 break;
4721
4722 name_hash = e_cpos - 1;
4723 }
4724
4725out:
4726 return ret;
4727}
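
/*
 * Editor's note: a sketch of the reverse walk above. Extent records are
 * visited from the highest possible hash downwards; after removing the
 * record that starts at e_cpos, the next lookup key is e_cpos - 1, and the
 * walk stops once the record at cpos 0 is gone. lookup_start is a
 * hypothetical stand-in for ocfs2_xattr_get_rec.
 */
static void reverse_walk(unsigned int (*lookup_start)(unsigned int key))
{
	unsigned int name_hash = 0xffffffffu;	/* UINT_MAX */

	while (name_hash > 0) {
		unsigned int e_cpos = lookup_start(name_hash);

		/* ... remove the record that starts at e_cpos ... */
		if (e_cpos == 0)
			break;
		name_hash = e_cpos - 1;
	}
}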
4728
4729/*
4730 * 'trusted' attributes support
4731 */
4732
4733#define XATTR_TRUSTED_PREFIX "trusted."
4734
4735static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list,
4736 size_t list_size, const char *name,
4737 size_t name_len)
4738{
4739 const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX) - 1;
4740 const size_t total_len = prefix_len + name_len + 1;
4741
4742 if (list && total_len <= list_size) {
4743 memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
4744 memcpy(list + prefix_len, name, name_len);
4745 list[prefix_len + name_len] = '\0';
4746 }
4747 return total_len;
4748}
4749
4750static int ocfs2_xattr_trusted_get(struct inode *inode, const char *name,
4751 void *buffer, size_t size)
4752{
4753 if (strcmp(name, "") == 0)
4754 return -EINVAL;
4755 return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_TRUSTED, name,
4756 buffer, size);
4757}
4758
4759static int ocfs2_xattr_trusted_set(struct inode *inode, const char *name,
4760 const void *value, size_t size, int flags)
4761{
4762 if (strcmp(name, "") == 0)
4763 return -EINVAL;
4764
4765 return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED, name, value,
4766 size, flags);
4767}
4768
4769struct xattr_handler ocfs2_xattr_trusted_handler = {
4770 .prefix = XATTR_TRUSTED_PREFIX,
4771 .list = ocfs2_xattr_trusted_list,
4772 .get = ocfs2_xattr_trusted_get,
4773 .set = ocfs2_xattr_trusted_set,
4774};
4775
4776
4777/*
4778 * 'user' attributes support
4779 */
4780
4781#define XATTR_USER_PREFIX "user."
4782
4783static size_t ocfs2_xattr_user_list(struct inode *inode, char *list,
4784 size_t list_size, const char *name,
4785 size_t name_len)
4786{
4787 const size_t prefix_len = sizeof(XATTR_USER_PREFIX) - 1;
4788 const size_t total_len = prefix_len + name_len + 1;
4789 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
4790
4791 if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
4792 return 0;
4793
4794 if (list && total_len <= list_size) {
4795 memcpy(list, XATTR_USER_PREFIX, prefix_len);
4796 memcpy(list + prefix_len, name, name_len);
4797 list[prefix_len + name_len] = '\0';
4798 }
4799 return total_len;
4800}
4801
4802static int ocfs2_xattr_user_get(struct inode *inode, const char *name,
4803 void *buffer, size_t size)
4804{
4805 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
4806
4807 if (strcmp(name, "") == 0)
4808 return -EINVAL;
4809 if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
4810 return -EOPNOTSUPP;
4811 return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_USER, name,
4812 buffer, size);
4813}
4814
4815static int ocfs2_xattr_user_set(struct inode *inode, const char *name,
4816 const void *value, size_t size, int flags)
4817{
4818 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
4819
4820 if (strcmp(name, "") == 0)
4821 return -EINVAL;
4822 if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
4823 return -EOPNOTSUPP;
4824
4825 return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER, name, value,
4826 size, flags);
4827}
4828
4829struct xattr_handler ocfs2_xattr_user_handler = {
4830 .prefix = XATTR_USER_PREFIX,
4831 .list = ocfs2_xattr_user_list,
4832 .get = ocfs2_xattr_user_get,
4833 .set = ocfs2_xattr_user_set,
4834};
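
/*
 * Editor's note: the two ->list callbacks above share one pattern; a generic
 * user-space sketch, with hypothetical names:
 */
#include <string.h>

static size_t emit_prefixed_name(char *list, size_t list_size,
				 const char *prefix, size_t prefix_len,
				 const char *name, size_t name_len)
{
	size_t total_len = prefix_len + name_len + 1;

	/* Always report the length; copy only when the buffer fits. */
	if (list && total_len <= list_size) {
		memcpy(list, prefix, prefix_len);
		memcpy(list + prefix_len, name, name_len);
		list[prefix_len + name_len] = '\0';
	}
	return total_len;
}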
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
new file mode 100644
index 000000000000..c25c7c62a059
--- /dev/null
+++ b/fs/ocfs2/xattr.h
@@ -0,0 +1,68 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * xattr.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2008 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_XATTR_H
27#define OCFS2_XATTR_H
28
29#include <linux/init.h>
30#include <linux/xattr.h>
31
32enum ocfs2_xattr_type {
33 OCFS2_XATTR_INDEX_USER = 1,
34 OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS,
35 OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT,
36 OCFS2_XATTR_INDEX_TRUSTED,
37 OCFS2_XATTR_INDEX_SECURITY,
38 OCFS2_XATTR_MAX
39};
40
41extern struct xattr_handler ocfs2_xattr_user_handler;
42extern struct xattr_handler ocfs2_xattr_trusted_handler;
43
44extern ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
45extern int ocfs2_xattr_get(struct inode *, int, const char *, void *, size_t);
46extern int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
47 size_t, int);
48extern int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh);
49extern struct xattr_handler *ocfs2_xattr_handlers[];
50
51static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
52{
53 return (1 << osb->s_clustersize_bits) / OCFS2_XATTR_BUCKET_SIZE;
54}
55
56static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
57{
58 return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits);
59}
60
61static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
62{
63 u16 len = sb->s_blocksize -
64 offsetof(struct ocfs2_xattr_header, xh_entries);
65
66 return len / sizeof(struct ocfs2_xattr_entry);
67}
68#endif /* OCFS2_XATTR_H */
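
/*
 * Editor's note: worked numbers for the two inline helpers above, as a
 * user-space sketch under assumed sizes (4K blocks, 32K clusters, 4K
 * buckets): 32K / 4K = 8 buckets per cluster, 4K / 4K = 1 block per bucket.
 */
#include <stdio.h>

static void bucket_geometry_example(void)
{
	unsigned int clustersize = 32768, blocksize = 4096, bucket = 4096;

	printf("buckets per cluster = %u\n", clustersize / bucket); /* 8 */
	printf("blocks per bucket   = %u\n", bucket / blocksize);   /* 1 */
}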
diff --git a/fs/omfs/bitmap.c b/fs/omfs/bitmap.c
index 697663b01bae..e1c0ec0ae989 100644
--- a/fs/omfs/bitmap.c
+++ b/fs/omfs/bitmap.c
@@ -92,7 +92,7 @@ int omfs_allocate_block(struct super_block *sb, u64 block)
 	struct buffer_head *bh;
 	struct omfs_sb_info *sbi = OMFS_SB(sb);
 	int bits_per_entry = 8 * sb->s_blocksize;
-	int map, bit;
+	unsigned int map, bit;
 	int ret = 0;
 	u64 tmp;
 
@@ -176,7 +176,8 @@ int omfs_clear_range(struct super_block *sb, u64 block, int count)
 	struct omfs_sb_info *sbi = OMFS_SB(sb);
 	int bits_per_entry = 8 * sb->s_blocksize;
 	u64 tmp;
-	int map, bit, ret;
+	unsigned int map, bit;
+	int ret;
 
 	tmp = block;
 	bit = do_div(tmp, bits_per_entry);
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 7e2499053e4d..834b2331f6b3 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -26,6 +26,13 @@ static int omfs_sync_file(struct file *file, struct dentry *dentry,
 	return err ? -EIO : 0;
 }
 
+static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset)
+{
+	return (sbi->s_sys_blocksize - offset -
+		sizeof(struct omfs_extent)) /
+		sizeof(struct omfs_extent_entry) + 1;
+}
+
 void omfs_make_empty_table(struct buffer_head *bh, int offset)
 {
 	struct omfs_extent *oe = (struct omfs_extent *) &bh->b_data[offset];
@@ -45,6 +52,7 @@ int omfs_shrink_inode(struct inode *inode)
 	struct buffer_head *bh;
 	u64 next, last;
 	u32 extent_count;
+	u32 max_extents;
 	int ret;
 
 	/* traverse extent table, freeing each entry that is greater
@@ -62,15 +70,18 @@ int omfs_shrink_inode(struct inode *inode)
 		goto out;
 
 	oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]);
+	max_extents = omfs_max_extents(sbi, OMFS_EXTENT_START);
 
 	for (;;) {
 
-		if (omfs_is_bad(sbi, (struct omfs_header *) bh->b_data, next)) {
-			brelse(bh);
-			goto out;
-		}
+		if (omfs_is_bad(sbi, (struct omfs_header *) bh->b_data, next))
+			goto out_brelse;
 
 		extent_count = be32_to_cpu(oe->e_extent_count);
+
+		if (extent_count > max_extents)
+			goto out_brelse;
+
 		last = next;
 		next = be64_to_cpu(oe->e_next);
 		entry = &oe->e_entry;
@@ -98,10 +109,14 @@ int omfs_shrink_inode(struct inode *inode)
 		if (!bh)
 			goto out;
 		oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]);
+		max_extents = omfs_max_extents(sbi, OMFS_EXTENT_CONT);
 	}
 	ret = 0;
 out:
 	return ret;
+out_brelse:
+	brelse(bh);
+	return ret;
 }
 
 static void omfs_truncate(struct inode *inode)
@@ -154,9 +169,7 @@ static int omfs_grow_extent(struct inode *inode, struct omfs_extent *oe,
 			goto out;
 		}
 	}
-	max_count = (sbi->s_sys_blocksize - OMFS_EXTENT_START -
-		sizeof(struct omfs_extent)) /
-		sizeof(struct omfs_extent_entry) + 1;
+	max_count = omfs_max_extents(sbi, OMFS_EXTENT_START);
 
 	/* TODO: add a continuation block here */
 	if (be32_to_cpu(oe->e_extent_count) > max_count-1)
@@ -225,6 +238,7 @@ static int omfs_get_block(struct inode *inode, sector_t block,
225 sector_t next, offset; 238 sector_t next, offset;
226 int ret; 239 int ret;
227 u64 new_block; 240 u64 new_block;
241 u32 max_extents;
228 int extent_count; 242 int extent_count;
229 struct omfs_extent *oe; 243 struct omfs_extent *oe;
230 struct omfs_extent_entry *entry; 244 struct omfs_extent_entry *entry;
@@ -238,6 +252,7 @@ static int omfs_get_block(struct inode *inode, sector_t block,
238 goto out; 252 goto out;
239 253
240 oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]); 254 oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]);
255 max_extents = omfs_max_extents(sbi, OMFS_EXTENT_START);
241 next = inode->i_ino; 256 next = inode->i_ino;
242 257
243 for (;;) { 258 for (;;) {
@@ -249,6 +264,9 @@ static int omfs_get_block(struct inode *inode, sector_t block,
249 next = be64_to_cpu(oe->e_next); 264 next = be64_to_cpu(oe->e_next);
250 entry = &oe->e_entry; 265 entry = &oe->e_entry;
251 266
267 if (extent_count > max_extents)
268 goto out_brelse;
269
252 offset = find_block(inode, entry, block, extent_count, &remain); 270 offset = find_block(inode, entry, block, extent_count, &remain);
253 if (offset > 0) { 271 if (offset > 0) {
254 ret = 0; 272 ret = 0;
@@ -266,6 +284,7 @@ static int omfs_get_block(struct inode *inode, sector_t block,
266 if (!bh) 284 if (!bh)
267 goto out; 285 goto out;
268 oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]); 286 oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]);
287 max_extents = omfs_max_extents(sbi, OMFS_EXTENT_CONT);
269 } 288 }
270 if (create) { 289 if (create) {
271 ret = omfs_grow_extent(inode, oe, &new_block); 290 ret = omfs_grow_extent(inode, oe, &new_block);
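The new omfs_max_extents() helper caps how many extent entries a system block can legally hold: everything after the header offset, minus the extent header itself, divided by the entry size, plus one for the terminator. Checking the on-disk e_extent_count against this bound keeps the entry walks in omfs_shrink_inode() and omfs_get_block() from running past the end of the buffer when the count is corrupt. A user-space sketch of the arithmetic, with stand-in struct layouts, a made-up header offset, and a 2048-byte system block (all assumptions; the kernel uses the real struct omfs_extent definitions, OMFS_EXTENT_START, and sbi->s_sys_blocksize):

#include <stdio.h>

/* Stand-ins for the on-disk structures; field layout is illustrative only. */
struct omfs_extent_entry { unsigned long long e_cluster, e_blocks; };
struct omfs_extent {
	unsigned long long e_next;
	unsigned int e_extent_count;
	struct omfs_extent_entry e_entry;	/* first entry, inline */
};

static unsigned int max_extents(unsigned int sys_blocksize, unsigned int offset)
{
	return (sys_blocksize - offset - sizeof(struct omfs_extent)) /
		sizeof(struct omfs_extent_entry) + 1;
}

int main(void)
{
	/* an on-disk extent count above this bound indicates corruption */
	printf("at most %u extent entries per block\n", max_extents(2048, 128));
	return 0;
}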
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index a95fe5984f4b..cbf047a847c5 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -232,8 +232,7 @@ struct inode *omfs_iget(struct super_block *sb, ino_t ino)
232 inode->i_mode = S_IFDIR | (S_IRWXUGO & ~sbi->s_dmask); 232 inode->i_mode = S_IFDIR | (S_IRWXUGO & ~sbi->s_dmask);
233 inode->i_op = &omfs_dir_inops; 233 inode->i_op = &omfs_dir_inops;
234 inode->i_fop = &omfs_dir_operations; 234 inode->i_fop = &omfs_dir_operations;
235 inode->i_size = be32_to_cpu(oi->i_head.h_body_size) + 235 inode->i_size = sbi->s_sys_blocksize;
236 sizeof(struct omfs_header);
237 inc_nlink(inode); 236 inc_nlink(inode);
238 break; 237 break;
239 case OMFS_FILE: 238 case OMFS_FILE:
@@ -347,7 +346,7 @@ enum {
347 Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask 346 Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask
348}; 347};
349 348
350static match_table_t tokens = { 349static const match_table_t tokens = {
351 {Opt_uid, "uid=%u"}, 350 {Opt_uid, "uid=%u"},
352 {Opt_gid, "gid=%u"}, 351 {Opt_gid, "gid=%u"},
353 {Opt_umask, "umask=%o"}, 352 {Opt_umask, "umask=%o"},
diff --git a/fs/open.c b/fs/open.c
index 07da9359481c..5596049863bf 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1141,8 +1141,7 @@ EXPORT_SYMBOL(sys_close);
1141asmlinkage long sys_vhangup(void) 1141asmlinkage long sys_vhangup(void)
1142{ 1142{
1143 if (capable(CAP_SYS_TTY_CONFIG)) { 1143 if (capable(CAP_SYS_TTY_CONFIG)) {
1144 /* XXX: this needs locking */ 1144 tty_vhangup_self();
1145 tty_vhangup(current->signal->tty);
1146 return 0; 1145 return 0;
1147 } 1146 }
1148 return -EPERM; 1147 return -EPERM;
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 7d6b34e201db..7408227c49c9 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -120,22 +120,21 @@ static int (*check_part[])(struct parsed_partitions *, struct block_device *) =
120 * a pointer to that same buffer (for convenience). 120 * a pointer to that same buffer (for convenience).
121 */ 121 */
122 122
123char *disk_name(struct gendisk *hd, int part, char *buf) 123char *disk_name(struct gendisk *hd, int partno, char *buf)
124{ 124{
125 if (!part) 125 if (!partno)
126 snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name); 126 snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name);
127 else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) 127 else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1]))
128 snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, part); 128 snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno);
129 else 129 else
130 snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, part); 130 snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno);
131 131
132 return buf; 132 return buf;
133} 133}
134 134
135const char *bdevname(struct block_device *bdev, char *buf) 135const char *bdevname(struct block_device *bdev, char *buf)
136{ 136{
137 int part = MINOR(bdev->bd_dev) - bdev->bd_disk->first_minor; 137 return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf);
138 return disk_name(bdev->bd_disk, part, buf);
139} 138}
140 139
141EXPORT_SYMBOL(bdevname); 140EXPORT_SYMBOL(bdevname);
@@ -169,7 +168,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
169 if (isdigit(state->name[strlen(state->name)-1])) 168 if (isdigit(state->name[strlen(state->name)-1]))
170 sprintf(state->name, "p"); 169 sprintf(state->name, "p");
171 170
172 state->limit = hd->minors; 171 state->limit = disk_max_parts(hd);
173 i = res = err = 0; 172 i = res = err = 0;
174 while (!res && check_part[i]) { 173 while (!res && check_part[i]) {
175 memset(&state->parts, 0, sizeof(state->parts)); 174 memset(&state->parts, 0, sizeof(state->parts));
@@ -204,21 +203,22 @@ static ssize_t part_start_show(struct device *dev,
204 return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect); 203 return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect);
205} 204}
206 205
207static ssize_t part_size_show(struct device *dev, 206ssize_t part_size_show(struct device *dev,
208 struct device_attribute *attr, char *buf) 207 struct device_attribute *attr, char *buf)
209{ 208{
210 struct hd_struct *p = dev_to_part(dev); 209 struct hd_struct *p = dev_to_part(dev);
211 return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); 210 return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
212} 211}
213 212
214static ssize_t part_stat_show(struct device *dev, 213ssize_t part_stat_show(struct device *dev,
215 struct device_attribute *attr, char *buf) 214 struct device_attribute *attr, char *buf)
216{ 215{
217 struct hd_struct *p = dev_to_part(dev); 216 struct hd_struct *p = dev_to_part(dev);
217 int cpu;
218 218
219 preempt_disable(); 219 cpu = part_stat_lock();
220 part_round_stats(p); 220 part_round_stats(cpu, p);
221 preempt_enable(); 221 part_stat_unlock();
222 return sprintf(buf, 222 return sprintf(buf,
223 "%8lu %8lu %8llu %8u " 223 "%8lu %8lu %8llu %8u "
224 "%8lu %8lu %8llu %8u " 224 "%8lu %8lu %8llu %8u "
@@ -238,17 +238,17 @@ static ssize_t part_stat_show(struct device *dev,
238} 238}
239 239
240#ifdef CONFIG_FAIL_MAKE_REQUEST 240#ifdef CONFIG_FAIL_MAKE_REQUEST
241static ssize_t part_fail_show(struct device *dev, 241ssize_t part_fail_show(struct device *dev,
242 struct device_attribute *attr, char *buf) 242 struct device_attribute *attr, char *buf)
243{ 243{
244 struct hd_struct *p = dev_to_part(dev); 244 struct hd_struct *p = dev_to_part(dev);
245 245
246 return sprintf(buf, "%d\n", p->make_it_fail); 246 return sprintf(buf, "%d\n", p->make_it_fail);
247} 247}
248 248
249static ssize_t part_fail_store(struct device *dev, 249ssize_t part_fail_store(struct device *dev,
250 struct device_attribute *attr, 250 struct device_attribute *attr,
251 const char *buf, size_t count) 251 const char *buf, size_t count)
252{ 252{
253 struct hd_struct *p = dev_to_part(dev); 253 struct hd_struct *p = dev_to_part(dev);
254 int i; 254 int i;
@@ -300,40 +300,34 @@ struct device_type part_type = {
300 .release = part_release, 300 .release = part_release,
301}; 301};
302 302
303static inline void partition_sysfs_add_subdir(struct hd_struct *p) 303static void delete_partition_rcu_cb(struct rcu_head *head)
304{
305 struct kobject *k;
306
307 k = kobject_get(&p->dev.kobj);
308 p->holder_dir = kobject_create_and_add("holders", k);
309 kobject_put(k);
310}
311
312static inline void disk_sysfs_add_subdirs(struct gendisk *disk)
313{ 304{
314 struct kobject *k; 305 struct hd_struct *part = container_of(head, struct hd_struct, rcu_head);
315 306
316 k = kobject_get(&disk->dev.kobj); 307 part->start_sect = 0;
317 disk->holder_dir = kobject_create_and_add("holders", k); 308 part->nr_sects = 0;
318 disk->slave_dir = kobject_create_and_add("slaves", k); 309 part_stat_set_all(part, 0);
319 kobject_put(k); 310 put_device(part_to_dev(part));
320} 311}
321 312
322void delete_partition(struct gendisk *disk, int part) 313void delete_partition(struct gendisk *disk, int partno)
323{ 314{
324 struct hd_struct *p = disk->part[part-1]; 315 struct disk_part_tbl *ptbl = disk->part_tbl;
316 struct hd_struct *part;
325 317
326 if (!p) 318 if (partno >= ptbl->len)
327 return; 319 return;
328 if (!p->nr_sects) 320
321 part = ptbl->part[partno];
322 if (!part)
329 return; 323 return;
330 disk->part[part-1] = NULL; 324
331 p->start_sect = 0; 325 blk_free_devt(part_devt(part));
332 p->nr_sects = 0; 326 rcu_assign_pointer(ptbl->part[partno], NULL);
333 part_stat_set_all(p, 0); 327 kobject_put(part->holder_dir);
334 kobject_put(p->holder_dir); 328 device_del(part_to_dev(part));
335 device_del(&p->dev); 329
336 put_device(&p->dev); 330 call_rcu(&part->rcu_head, delete_partition_rcu_cb);
337} 331}
338 332
339static ssize_t whole_disk_show(struct device *dev, 333static ssize_t whole_disk_show(struct device *dev,
@@ -344,102 +338,132 @@ static ssize_t whole_disk_show(struct device *dev,
344static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH, 338static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
345 whole_disk_show, NULL); 339 whole_disk_show, NULL);
346 340
347int add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, int flags) 341int add_partition(struct gendisk *disk, int partno,
342 sector_t start, sector_t len, int flags)
348{ 343{
349 struct hd_struct *p; 344 struct hd_struct *p;
345 dev_t devt = MKDEV(0, 0);
346 struct device *ddev = disk_to_dev(disk);
347 struct device *pdev;
348 struct disk_part_tbl *ptbl;
349 const char *dname;
350 int err; 350 int err;
351 351
352 err = disk_expand_part_tbl(disk, partno);
353 if (err)
354 return err;
355 ptbl = disk->part_tbl;
356
357 if (ptbl->part[partno])
358 return -EBUSY;
359
352 p = kzalloc(sizeof(*p), GFP_KERNEL); 360 p = kzalloc(sizeof(*p), GFP_KERNEL);
353 if (!p) 361 if (!p)
354 return -ENOMEM; 362 return -ENOMEM;
355 363
356 if (!init_part_stats(p)) { 364 if (!init_part_stats(p)) {
357 err = -ENOMEM; 365 err = -ENOMEM;
358 goto out0; 366 goto out_free;
359 } 367 }
368 pdev = part_to_dev(p);
369
360 p->start_sect = start; 370 p->start_sect = start;
361 p->nr_sects = len; 371 p->nr_sects = len;
362 p->partno = part; 372 p->partno = partno;
363 p->policy = disk->policy; 373 p->policy = get_disk_ro(disk);
364 374
365 if (isdigit(disk->dev.bus_id[strlen(disk->dev.bus_id)-1])) 375 dname = dev_name(ddev);
366 snprintf(p->dev.bus_id, BUS_ID_SIZE, 376 if (isdigit(dname[strlen(dname) - 1]))
367 "%sp%d", disk->dev.bus_id, part); 377 snprintf(pdev->bus_id, BUS_ID_SIZE, "%sp%d", dname, partno);
368 else 378 else
369 snprintf(p->dev.bus_id, BUS_ID_SIZE, 379 snprintf(pdev->bus_id, BUS_ID_SIZE, "%s%d", dname, partno);
370 "%s%d", disk->dev.bus_id, part); 380
381 device_initialize(pdev);
382 pdev->class = &block_class;
383 pdev->type = &part_type;
384 pdev->parent = ddev;
371 385
372 device_initialize(&p->dev); 386 err = blk_alloc_devt(p, &devt);
373 p->dev.devt = MKDEV(disk->major, disk->first_minor + part); 387 if (err)
374 p->dev.class = &block_class; 388 goto out_free;
375 p->dev.type = &part_type; 389 pdev->devt = devt;
376 p->dev.parent = &disk->dev;
377 disk->part[part-1] = p;
378 390
379 /* delay uevent until 'holders' subdir is created */ 391 /* delay uevent until 'holders' subdir is created */
380 p->dev.uevent_suppress = 1; 392 pdev->uevent_suppress = 1;
381 err = device_add(&p->dev); 393 err = device_add(pdev);
382 if (err) 394 if (err)
383 goto out1; 395 goto out_put;
384 partition_sysfs_add_subdir(p); 396
385 p->dev.uevent_suppress = 0; 397 err = -ENOMEM;
398 p->holder_dir = kobject_create_and_add("holders", &pdev->kobj);
399 if (!p->holder_dir)
400 goto out_del;
401
402 pdev->uevent_suppress = 0;
386 if (flags & ADDPART_FLAG_WHOLEDISK) { 403 if (flags & ADDPART_FLAG_WHOLEDISK) {
387 err = device_create_file(&p->dev, &dev_attr_whole_disk); 404 err = device_create_file(pdev, &dev_attr_whole_disk);
388 if (err) 405 if (err)
389 goto out2; 406 goto out_del;
390 } 407 }
391 408
409 /* everything is up and running, commence */
410 INIT_RCU_HEAD(&p->rcu_head);
411 rcu_assign_pointer(ptbl->part[partno], p);
412
392 /* suppress uevent if the disk suppresses it */ 413 /* suppress uevent if the disk suppresses it */
393 if (!disk->dev.uevent_suppress) 414 if (!ddev->uevent_suppress)
394 kobject_uevent(&p->dev.kobj, KOBJ_ADD); 415 kobject_uevent(&pdev->kobj, KOBJ_ADD);
395 416
396 return 0; 417 return 0;
397 418
398out2: 419out_free:
399 device_del(&p->dev);
400out1:
401 put_device(&p->dev);
402 free_part_stats(p);
403out0:
404 kfree(p); 420 kfree(p);
405 return err; 421 return err;
422out_del:
423 kobject_put(p->holder_dir);
424 device_del(pdev);
425out_put:
426 put_device(pdev);
427 blk_free_devt(devt);
428 return err;
406} 429}
407 430
408/* Not exported, helper to add_disk(). */ 431/* Not exported, helper to add_disk(). */
409void register_disk(struct gendisk *disk) 432void register_disk(struct gendisk *disk)
410{ 433{
434 struct device *ddev = disk_to_dev(disk);
411 struct block_device *bdev; 435 struct block_device *bdev;
436 struct disk_part_iter piter;
437 struct hd_struct *part;
412 char *s; 438 char *s;
413 int i;
414 struct hd_struct *p;
415 int err; 439 int err;
416 440
417 disk->dev.parent = disk->driverfs_dev; 441 ddev->parent = disk->driverfs_dev;
418 disk->dev.devt = MKDEV(disk->major, disk->first_minor);
419 442
420 strlcpy(disk->dev.bus_id, disk->disk_name, BUS_ID_SIZE); 443 strlcpy(ddev->bus_id, disk->disk_name, BUS_ID_SIZE);
421 /* ewww... some of these buggers have / in the name... */ 444 /* ewww... some of these buggers have / in the name... */
422 s = strchr(disk->dev.bus_id, '/'); 445 s = strchr(ddev->bus_id, '/');
423 if (s) 446 if (s)
424 *s = '!'; 447 *s = '!';
425 448
426 /* delay uevents until the partition table has been scanned */ 449 /* delay uevents until the partition table has been scanned */
427 disk->dev.uevent_suppress = 1; 450 ddev->uevent_suppress = 1;
428 451
429 if (device_add(&disk->dev)) 452 if (device_add(ddev))
430 return; 453 return;
431#ifndef CONFIG_SYSFS_DEPRECATED 454#ifndef CONFIG_SYSFS_DEPRECATED
432 err = sysfs_create_link(block_depr, &disk->dev.kobj, 455 err = sysfs_create_link(block_depr, &ddev->kobj,
433 kobject_name(&disk->dev.kobj)); 456 kobject_name(&ddev->kobj));
434 if (err) { 457 if (err) {
435 device_del(&disk->dev); 458 device_del(ddev);
436 return; 459 return;
437 } 460 }
438#endif 461#endif
439 disk_sysfs_add_subdirs(disk); 462 disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
463 disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
440 464
441 /* No minors to use for partitions */ 465 /* No minors to use for partitions */
442 if (disk->minors == 1) 466 if (!disk_partitionable(disk))
443 goto exit; 467 goto exit;
444 468
445 /* No such device (e.g., media were just removed) */ 469 /* No such device (e.g., media were just removed) */
@@ -458,50 +482,66 @@ void register_disk(struct gendisk *disk)
458 482
459exit: 483exit:
460 /* announce disk after possible partitions are created */ 484 /* announce disk after possible partitions are created */
461 disk->dev.uevent_suppress = 0; 485 ddev->uevent_suppress = 0;
462 kobject_uevent(&disk->dev.kobj, KOBJ_ADD); 486 kobject_uevent(&ddev->kobj, KOBJ_ADD);
463 487
464 /* announce possible partitions */ 488 /* announce possible partitions */
465 for (i = 1; i < disk->minors; i++) { 489 disk_part_iter_init(&piter, disk, 0);
466 p = disk->part[i-1]; 490 while ((part = disk_part_iter_next(&piter)))
467 if (!p || !p->nr_sects) 491 kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
468 continue; 492 disk_part_iter_exit(&piter);
469 kobject_uevent(&p->dev.kobj, KOBJ_ADD);
470 }
471} 493}
472 494
473int rescan_partitions(struct gendisk *disk, struct block_device *bdev) 495int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
474{ 496{
497 struct disk_part_iter piter;
498 struct hd_struct *part;
475 struct parsed_partitions *state; 499 struct parsed_partitions *state;
476 int p, res; 500 int p, highest, res;
477 501
478 if (bdev->bd_part_count) 502 if (bdev->bd_part_count)
479 return -EBUSY; 503 return -EBUSY;
480 res = invalidate_partition(disk, 0); 504 res = invalidate_partition(disk, 0);
481 if (res) 505 if (res)
482 return res; 506 return res;
483 bdev->bd_invalidated = 0; 507
484 for (p = 1; p < disk->minors; p++) 508 disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
485 delete_partition(disk, p); 509 while ((part = disk_part_iter_next(&piter)))
510 delete_partition(disk, part->partno);
511 disk_part_iter_exit(&piter);
512
486 if (disk->fops->revalidate_disk) 513 if (disk->fops->revalidate_disk)
487 disk->fops->revalidate_disk(disk); 514 disk->fops->revalidate_disk(disk);
515 check_disk_size_change(disk, bdev);
516 bdev->bd_invalidated = 0;
488 if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) 517 if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
489 return 0; 518 return 0;
490 if (IS_ERR(state)) /* I/O error reading the partition table */ 519 if (IS_ERR(state)) /* I/O error reading the partition table */
491 return -EIO; 520 return -EIO;
492 521
493 /* tell userspace that the media / partition table may have changed */ 522 /* tell userspace that the media / partition table may have changed */
494 kobject_uevent(&disk->dev.kobj, KOBJ_CHANGE); 523 kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
495 524
525 /* Detect the highest partition number and preallocate
526 * disk->part_tbl. This is an optimization and not strictly
527 * necessary.
528 */
529 for (p = 1, highest = 0; p < state->limit; p++)
530 if (state->parts[p].size)
531 highest = p;
532
533 disk_expand_part_tbl(disk, highest);
534
535 /* add partitions */
496 for (p = 1; p < state->limit; p++) { 536 for (p = 1; p < state->limit; p++) {
497 sector_t size = state->parts[p].size; 537 sector_t size = state->parts[p].size;
498 sector_t from = state->parts[p].from; 538 sector_t from = state->parts[p].from;
499 if (!size) 539 if (!size)
500 continue; 540 continue;
501 if (from + size > get_capacity(disk)) { 541 if (from + size > get_capacity(disk)) {
502 printk(KERN_ERR " %s: p%d exceeds device capacity\n", 542 printk(KERN_WARNING
543 "%s: p%d exceeds device capacity\n",
503 disk->disk_name, p); 544 disk->disk_name, p);
504 continue;
505 } 545 }
506 res = add_partition(disk, p, from, size, state->parts[p].flags); 546 res = add_partition(disk, p, from, size, state->parts[p].flags);
507 if (res) { 547 if (res) {
@@ -541,25 +581,31 @@ EXPORT_SYMBOL(read_dev_sector);
541 581
542void del_gendisk(struct gendisk *disk) 582void del_gendisk(struct gendisk *disk)
543{ 583{
544 int p; 584 struct disk_part_iter piter;
585 struct hd_struct *part;
545 586
546 /* invalidate stuff */ 587 /* invalidate stuff */
547 for (p = disk->minors - 1; p > 0; p--) { 588 disk_part_iter_init(&piter, disk,
548 invalidate_partition(disk, p); 589 DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
549 delete_partition(disk, p); 590 while ((part = disk_part_iter_next(&piter))) {
591 invalidate_partition(disk, part->partno);
592 delete_partition(disk, part->partno);
550 } 593 }
594 disk_part_iter_exit(&piter);
595
551 invalidate_partition(disk, 0); 596 invalidate_partition(disk, 0);
552 disk->capacity = 0; 597 blk_free_devt(disk_to_dev(disk)->devt);
598 set_capacity(disk, 0);
553 disk->flags &= ~GENHD_FL_UP; 599 disk->flags &= ~GENHD_FL_UP;
554 unlink_gendisk(disk); 600 unlink_gendisk(disk);
555 disk_stat_set_all(disk, 0); 601 part_stat_set_all(&disk->part0, 0);
556 disk->stamp = 0; 602 disk->part0.stamp = 0;
557 603
558 kobject_put(disk->holder_dir); 604 kobject_put(disk->part0.holder_dir);
559 kobject_put(disk->slave_dir); 605 kobject_put(disk->slave_dir);
560 disk->driverfs_dev = NULL; 606 disk->driverfs_dev = NULL;
561#ifndef CONFIG_SYSFS_DEPRECATED 607#ifndef CONFIG_SYSFS_DEPRECATED
562 sysfs_remove_link(block_depr, disk->dev.bus_id); 608 sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
563#endif 609#endif
564 device_del(&disk->dev); 610 device_del(disk_to_dev(disk));
565} 611}
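register_disk(), rescan_partitions() and del_gendisk() all switch from looping over minor numbers to the new disk_part_iter interface, which walks the RCU-protected partition table directly. The same idiom sketched in isolation, assuming the genhd helpers this patch series introduces:

#include <linux/genhd.h>
#include <linux/kernel.h>

static void print_partitions(struct gendisk *disk)
{
	struct disk_part_iter piter;
	struct hd_struct *part;

	/* DISK_PITER_INCL_EMPTY also visits zero-length partitions;
	 * DISK_PITER_REVERSE would walk the table backwards. */
	disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
	while ((part = disk_part_iter_next(&piter)))
		pr_info("p%d: %llu sectors\n", part->partno,
			(unsigned long long)part->nr_sects);
	disk_part_iter_exit(&piter);
}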
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 17ae8ecd9e8b..98dbe1a84528 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -5,15 +5,13 @@
5 * add_gd_partition adds a partitions details to the devices partition 5 * add_gd_partition adds a partitions details to the devices partition
6 * description. 6 * description.
7 */ 7 */
8enum { MAX_PART = 256 };
9
10struct parsed_partitions { 8struct parsed_partitions {
11 char name[BDEVNAME_SIZE]; 9 char name[BDEVNAME_SIZE];
12 struct { 10 struct {
13 sector_t from; 11 sector_t from;
14 sector_t size; 12 sector_t size;
15 int flags; 13 int flags;
16 } parts[MAX_PART]; 14 } parts[DISK_MAX_PARTS];
17 int next; 15 int next;
18 int limit; 16 int limit;
19}; 17};
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 73cd7a418f06..50f8f0600f06 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -57,3 +57,13 @@ config PROC_SYSCTL
57 As it is generally a good thing, you should say Y here unless 57 As it is generally a good thing, you should say Y here unless
58 building a kernel for install/rescue disks or your system is very 58 building a kernel for install/rescue disks or your system is very
59 limited in memory. 59 limited in memory.
60
61config PROC_PAGE_MONITOR
62 default y
63 depends on PROC_FS && MMU
64 bool "Enable /proc page monitoring" if EMBEDDED
65 help
66 Various /proc files exist to monitor process memory utilization:
67 /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
68 /proc/kpagecount, and /proc/kpageflags. Disabling these
69 interfaces will reduce the size of the kernel by approximately 4kb.
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 0d6eb33597c6..f4bc0e789539 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -86,11 +86,6 @@
86#include <asm/processor.h> 86#include <asm/processor.h>
87#include "internal.h" 87#include "internal.h"
88 88
89/* Gcc optimizes away "strlen(x)" for constant x */
90#define ADDBUF(buffer, string) \
91do { memcpy(buffer, string, strlen(string)); \
92 buffer += strlen(string); } while (0)
93
94static inline void task_name(struct seq_file *m, struct task_struct *p) 89static inline void task_name(struct seq_file *m, struct task_struct *p)
95{ 90{
96 int i; 91 int i;
@@ -261,7 +256,6 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
261 sigemptyset(&ignored); 256 sigemptyset(&ignored);
262 sigemptyset(&caught); 257 sigemptyset(&caught);
263 258
264 rcu_read_lock();
265 if (lock_task_sighand(p, &flags)) { 259 if (lock_task_sighand(p, &flags)) {
266 pending = p->pending.signal; 260 pending = p->pending.signal;
267 shpending = p->signal->shared_pending.signal; 261 shpending = p->signal->shared_pending.signal;
@@ -272,7 +266,6 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
272 qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur; 266 qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur;
273 unlock_task_sighand(p, &flags); 267 unlock_task_sighand(p, &flags);
274 } 268 }
275 rcu_read_unlock();
276 269
277 seq_printf(m, "Threads:\t%d\n", num_threads); 270 seq_printf(m, "Threads:\t%d\n", num_threads);
278 seq_printf(m, "SigQ:\t%lu/%lu\n", qsize, qlim); 271 seq_printf(m, "SigQ:\t%lu/%lu\n", qsize, qlim);
@@ -337,65 +330,6 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
337 return 0; 330 return 0;
338} 331}
339 332
340/*
341 * Use precise platform statistics if available:
342 */
343#ifdef CONFIG_VIRT_CPU_ACCOUNTING
344static cputime_t task_utime(struct task_struct *p)
345{
346 return p->utime;
347}
348
349static cputime_t task_stime(struct task_struct *p)
350{
351 return p->stime;
352}
353#else
354static cputime_t task_utime(struct task_struct *p)
355{
356 clock_t utime = cputime_to_clock_t(p->utime),
357 total = utime + cputime_to_clock_t(p->stime);
358 u64 temp;
359
360 /*
361 * Use CFS's precise accounting:
362 */
363 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
364
365 if (total) {
366 temp *= utime;
367 do_div(temp, total);
368 }
369 utime = (clock_t)temp;
370
371 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
372 return p->prev_utime;
373}
374
375static cputime_t task_stime(struct task_struct *p)
376{
377 clock_t stime;
378
379 /*
380 * Use CFS's precise accounting. (we subtract utime from
381 * the total, to make sure the total observed by userspace
382 * grows monotonically - apps rely on that):
383 */
384 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
385 cputime_to_clock_t(task_utime(p));
386
387 if (stime >= 0)
388 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
389
390 return p->prev_stime;
391}
392#endif
393
394static cputime_t task_gtime(struct task_struct *p)
395{
396 return p->gtime;
397}
398
399static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, 333static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
400 struct pid *pid, struct task_struct *task, int whole) 334 struct pid *pid, struct task_struct *task, int whole)
401{ 335{
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a28840b11b89..b5918ae8ca79 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -148,9 +148,6 @@ static unsigned int pid_entry_count_dirs(const struct pid_entry *entries,
148 return count; 148 return count;
149} 149}
150 150
151int maps_protect;
152EXPORT_SYMBOL(maps_protect);
153
154static struct fs_struct *get_fs_struct(struct task_struct *task) 151static struct fs_struct *get_fs_struct(struct task_struct *task)
155{ 152{
156 struct fs_struct *fs; 153 struct fs_struct *fs;
@@ -164,7 +161,6 @@ static struct fs_struct *get_fs_struct(struct task_struct *task)
164 161
165static int get_nr_threads(struct task_struct *tsk) 162static int get_nr_threads(struct task_struct *tsk)
166{ 163{
167 /* Must be called with the rcu_read_lock held */
168 unsigned long flags; 164 unsigned long flags;
169 int count = 0; 165 int count = 0;
170 166
@@ -471,14 +467,10 @@ static int proc_pid_limits(struct task_struct *task, char *buffer)
471 467
472 struct rlimit rlim[RLIM_NLIMITS]; 468 struct rlimit rlim[RLIM_NLIMITS];
473 469
474 rcu_read_lock(); 470 if (!lock_task_sighand(task, &flags))
475 if (!lock_task_sighand(task,&flags)) {
476 rcu_read_unlock();
477 return 0; 471 return 0;
478 }
479 memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS); 472 memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS);
480 unlock_task_sighand(task, &flags); 473 unlock_task_sighand(task, &flags);
481 rcu_read_unlock();
482 474
483 /* 475 /*
484 * print the file header 476 * print the file header
@@ -2443,6 +2435,13 @@ static int proc_tgid_io_accounting(struct task_struct *task, char *buffer)
2443} 2435}
2444#endif /* CONFIG_TASK_IO_ACCOUNTING */ 2436#endif /* CONFIG_TASK_IO_ACCOUNTING */
2445 2437
2438static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
2439 struct pid *pid, struct task_struct *task)
2440{
2441 seq_printf(m, "%08x\n", task->personality);
2442 return 0;
2443}
2444
2446/* 2445/*
2447 * Thread groups 2446 * Thread groups
2448 */ 2447 */
@@ -2459,6 +2458,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2459 REG("environ", S_IRUSR, environ), 2458 REG("environ", S_IRUSR, environ),
2460 INF("auxv", S_IRUSR, pid_auxv), 2459 INF("auxv", S_IRUSR, pid_auxv),
2461 ONE("status", S_IRUGO, pid_status), 2460 ONE("status", S_IRUGO, pid_status),
2461 ONE("personality", S_IRUSR, pid_personality),
2462 INF("limits", S_IRUSR, pid_limits), 2462 INF("limits", S_IRUSR, pid_limits),
2463#ifdef CONFIG_SCHED_DEBUG 2463#ifdef CONFIG_SCHED_DEBUG
2464 REG("sched", S_IRUGO|S_IWUSR, pid_sched), 2464 REG("sched", S_IRUGO|S_IWUSR, pid_sched),
@@ -2794,6 +2794,7 @@ static const struct pid_entry tid_base_stuff[] = {
2794 REG("environ", S_IRUSR, environ), 2794 REG("environ", S_IRUSR, environ),
2795 INF("auxv", S_IRUSR, pid_auxv), 2795 INF("auxv", S_IRUSR, pid_auxv),
2796 ONE("status", S_IRUGO, pid_status), 2796 ONE("status", S_IRUGO, pid_status),
2797 ONE("personality", S_IRUSR, pid_personality),
2797 INF("limits", S_IRUSR, pid_limits), 2798 INF("limits", S_IRUSR, pid_limits),
2798#ifdef CONFIG_SCHED_DEBUG 2799#ifdef CONFIG_SCHED_DEBUG
2799 REG("sched", S_IRUGO|S_IWUSR, pid_sched), 2800 REG("sched", S_IRUGO|S_IWUSR, pid_sched),
@@ -3088,9 +3089,7 @@ static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
3088 generic_fillattr(inode, stat); 3089 generic_fillattr(inode, stat);
3089 3090
3090 if (p) { 3091 if (p) {
3091 rcu_read_lock();
3092 stat->nlink += get_nr_threads(p); 3092 stat->nlink += get_nr_threads(p);
3093 rcu_read_unlock();
3094 put_task_struct(p); 3093 put_task_struct(p);
3095 } 3094 }
3096 3095
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 4fb81e9c94e3..7821589a17d5 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -330,6 +330,7 @@ retry:
330 spin_lock(&proc_inum_lock); 330 spin_lock(&proc_inum_lock);
331 ida_remove(&proc_inum_ida, i); 331 ida_remove(&proc_inum_ida, i);
332 spin_unlock(&proc_inum_lock); 332 spin_unlock(&proc_inum_lock);
333 return 0;
333 } 334 }
334 return PROC_DYNAMIC_FIRST + i; 335 return PROC_DYNAMIC_FIRST + i;
335} 336}
@@ -546,8 +547,8 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
546 547
547 for (tmp = dir->subdir; tmp; tmp = tmp->next) 548 for (tmp = dir->subdir; tmp; tmp = tmp->next)
548 if (strcmp(tmp->name, dp->name) == 0) { 549 if (strcmp(tmp->name, dp->name) == 0) {
549 printk(KERN_WARNING "proc_dir_entry '%s' already " 550 printk(KERN_WARNING "proc_dir_entry '%s/%s' already registered\n",
550 "registered\n", dp->name); 551 dir->name, dp->name);
551 dump_stack(); 552 dump_stack();
552 break; 553 break;
553 } 554 }
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 8bb03f056c28..c6b4fa7e3b49 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -342,7 +342,7 @@ static int proc_reg_open(struct inode *inode, struct file *file)
342 if (!pde->proc_fops) { 342 if (!pde->proc_fops) {
343 spin_unlock(&pde->pde_unload_lock); 343 spin_unlock(&pde->pde_unload_lock);
344 kfree(pdeo); 344 kfree(pdeo);
345 return rv; 345 return -EINVAL;
346 } 346 }
347 pde->pde_users++; 347 pde->pde_users++;
348 open = pde->proc_fops->open; 348 open = pde->proc_fops->open;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 442202314d53..3bfb7b8747b3 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -45,8 +45,6 @@ do { \
45extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *); 45extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *);
46#endif 46#endif
47 47
48extern int maps_protect;
49
50extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, 48extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns,
51 struct pid *pid, struct task_struct *task); 49 struct pid *pid, struct task_struct *task);
52extern int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, 50extern int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns,
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 79ecd281d2cb..3f87d2632947 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -52,14 +52,14 @@ int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
52 } 52 }
53 53
54 seq_printf(m, 54 seq_printf(m,
55 "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", 55 "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
56 vma->vm_start, 56 vma->vm_start,
57 vma->vm_end, 57 vma->vm_end,
58 flags & VM_READ ? 'r' : '-', 58 flags & VM_READ ? 'r' : '-',
59 flags & VM_WRITE ? 'w' : '-', 59 flags & VM_WRITE ? 'w' : '-',
60 flags & VM_EXEC ? 'x' : '-', 60 flags & VM_EXEC ? 'x' : '-',
61 flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', 61 flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
62 vma->vm_pgoff << PAGE_SHIFT, 62 ((loff_t)vma->vm_pgoff) << PAGE_SHIFT,
63 MAJOR(dev), MINOR(dev), ino, &len); 63 MAJOR(dev), MINOR(dev), ino, &len);
64 64
65 if (file) { 65 if (file) {
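The %08llx / (loff_t) change here is a 32-bit overflow fix: vm_pgoff is an unsigned long, so on 32-bit kernels shifting it left by PAGE_SHIFT wraps for file offsets at or beyond 4 GiB; widening to 64 bits before the shift preserves the value. A user-space demonstration (assuming 4 KiB pages, so PAGE_SHIFT is 12):

#include <stdio.h>

int main(void)
{
	unsigned int pgoff = 0x00200000;	/* page offset => byte offset 8 GiB */

	unsigned int wrong = pgoff << 12;	/* wraps to 0 in 32 bits */
	unsigned long long right = (unsigned long long)pgoff << 12;

	printf("wrong %u, right %llu\n", wrong, right);
	return 0;
}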
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index ded969862960..b675a49c1823 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -24,6 +24,7 @@
24#include <linux/tty.h> 24#include <linux/tty.h>
25#include <linux/string.h> 25#include <linux/string.h>
26#include <linux/mman.h> 26#include <linux/mman.h>
27#include <linux/quicklist.h>
27#include <linux/proc_fs.h> 28#include <linux/proc_fs.h>
28#include <linux/ioport.h> 29#include <linux/ioport.h>
29#include <linux/mm.h> 30#include <linux/mm.h>
@@ -67,7 +68,6 @@
67extern int get_hardware_list(char *); 68extern int get_hardware_list(char *);
68extern int get_stram_list(char *); 69extern int get_stram_list(char *);
69extern int get_exec_domain_list(char *); 70extern int get_exec_domain_list(char *);
70extern int get_dma_list(char *);
71 71
72static int proc_calc_metrics(char *page, char **start, off_t off, 72static int proc_calc_metrics(char *page, char **start, off_t off,
73 int count, int *eof, int len) 73 int count, int *eof, int len)
@@ -182,6 +182,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
182 "SReclaimable: %8lu kB\n" 182 "SReclaimable: %8lu kB\n"
183 "SUnreclaim: %8lu kB\n" 183 "SUnreclaim: %8lu kB\n"
184 "PageTables: %8lu kB\n" 184 "PageTables: %8lu kB\n"
185#ifdef CONFIG_QUICKLIST
186 "Quicklists: %8lu kB\n"
187#endif
185 "NFS_Unstable: %8lu kB\n" 188 "NFS_Unstable: %8lu kB\n"
186 "Bounce: %8lu kB\n" 189 "Bounce: %8lu kB\n"
187 "WritebackTmp: %8lu kB\n" 190 "WritebackTmp: %8lu kB\n"
@@ -214,6 +217,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
214 K(global_page_state(NR_SLAB_RECLAIMABLE)), 217 K(global_page_state(NR_SLAB_RECLAIMABLE)),
215 K(global_page_state(NR_SLAB_UNRECLAIMABLE)), 218 K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
216 K(global_page_state(NR_PAGETABLE)), 219 K(global_page_state(NR_PAGETABLE)),
220#ifdef CONFIG_QUICKLIST
221 K(quicklist_total_size()),
222#endif
217 K(global_page_state(NR_UNSTABLE_NFS)), 223 K(global_page_state(NR_UNSTABLE_NFS)),
218 K(global_page_state(NR_BOUNCE)), 224 K(global_page_state(NR_BOUNCE)),
219 K(global_page_state(NR_WRITEBACK_TEMP)), 225 K(global_page_state(NR_WRITEBACK_TEMP)),
@@ -677,6 +683,7 @@ static int cmdline_read_proc(char *page, char **start, off_t off,
677 return proc_calc_metrics(page, start, off, count, eof, len); 683 return proc_calc_metrics(page, start, off, count, eof, len);
678} 684}
679 685
686#ifdef CONFIG_FILE_LOCKING
680static int locks_open(struct inode *inode, struct file *filp) 687static int locks_open(struct inode *inode, struct file *filp)
681{ 688{
682 return seq_open(filp, &locks_seq_operations); 689 return seq_open(filp, &locks_seq_operations);
@@ -688,6 +695,7 @@ static const struct file_operations proc_locks_operations = {
688 .llseek = seq_lseek, 695 .llseek = seq_lseek,
689 .release = seq_release, 696 .release = seq_release,
690}; 697};
698#endif /* CONFIG_FILE_LOCKING */
691 699
692static int execdomains_read_proc(char *page, char **start, off_t off, 700static int execdomains_read_proc(char *page, char **start, off_t off,
693 int count, int *eof, void *data) 701 int count, int *eof, void *data)
@@ -881,7 +889,9 @@ void __init proc_misc_init(void)
881#ifdef CONFIG_PRINTK 889#ifdef CONFIG_PRINTK
882 proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations); 890 proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations);
883#endif 891#endif
892#ifdef CONFIG_FILE_LOCKING
884 proc_create("locks", 0, NULL, &proc_locks_operations); 893 proc_create("locks", 0, NULL, &proc_locks_operations);
894#endif
885 proc_create("devices", 0, NULL, &proc_devinfo_operations); 895 proc_create("devices", 0, NULL, &proc_devinfo_operations);
886 proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations); 896 proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations);
887#ifdef CONFIG_BLOCK 897#ifdef CONFIG_BLOCK
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index f9a8b892718f..945a81043ba2 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -66,7 +66,7 @@ static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name)
66 return NULL; 66 return NULL;
67} 67}
68 68
69struct ctl_table_header *grab_header(struct inode *inode) 69static struct ctl_table_header *grab_header(struct inode *inode)
70{ 70{
71 if (PROC_I(inode)->sysctl) 71 if (PROC_I(inode)->sysctl)
72 return sysctl_head_grab(PROC_I(inode)->sysctl); 72 return sysctl_head_grab(PROC_I(inode)->sysctl);
@@ -395,10 +395,10 @@ static struct dentry_operations proc_sys_dentry_operations = {
395 .d_compare = proc_sys_compare, 395 .d_compare = proc_sys_compare,
396}; 396};
397 397
398static struct proc_dir_entry *proc_sys_root;
399
400int proc_sys_init(void) 398int proc_sys_init(void)
401{ 399{
400 struct proc_dir_entry *proc_sys_root;
401
402 proc_sys_root = proc_mkdir("sys", NULL); 402 proc_sys_root = proc_mkdir("sys", NULL);
403 proc_sys_root->proc_iops = &proc_sys_dir_operations; 403 proc_sys_root->proc_iops = &proc_sys_dir_operations;
404 proc_sys_root->proc_fops = &proc_sys_dir_file_operations; 404 proc_sys_root->proc_fops = &proc_sys_dir_file_operations;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 7546a918f790..4806830ea2a1 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -210,23 +210,20 @@ static int show_map(struct seq_file *m, void *v)
210 dev_t dev = 0; 210 dev_t dev = 0;
211 int len; 211 int len;
212 212
213 if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
214 return -EACCES;
215
216 if (file) { 213 if (file) {
217 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 214 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
218 dev = inode->i_sb->s_dev; 215 dev = inode->i_sb->s_dev;
219 ino = inode->i_ino; 216 ino = inode->i_ino;
220 } 217 }
221 218
222 seq_printf(m, "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", 219 seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
223 vma->vm_start, 220 vma->vm_start,
224 vma->vm_end, 221 vma->vm_end,
225 flags & VM_READ ? 'r' : '-', 222 flags & VM_READ ? 'r' : '-',
226 flags & VM_WRITE ? 'w' : '-', 223 flags & VM_WRITE ? 'w' : '-',
227 flags & VM_EXEC ? 'x' : '-', 224 flags & VM_EXEC ? 'x' : '-',
228 flags & VM_MAYSHARE ? 's' : 'p', 225 flags & VM_MAYSHARE ? 's' : 'p',
229 vma->vm_pgoff << PAGE_SHIFT, 226 ((loff_t)vma->vm_pgoff) << PAGE_SHIFT,
230 MAJOR(dev), MINOR(dev), ino, &len); 227 MAJOR(dev), MINOR(dev), ino, &len);
231 228
232 /* 229 /*
@@ -742,22 +739,11 @@ const struct file_operations proc_pagemap_operations = {
742#ifdef CONFIG_NUMA 739#ifdef CONFIG_NUMA
743extern int show_numa_map(struct seq_file *m, void *v); 740extern int show_numa_map(struct seq_file *m, void *v);
744 741
745static int show_numa_map_checked(struct seq_file *m, void *v)
746{
747 struct proc_maps_private *priv = m->private;
748 struct task_struct *task = priv->task;
749
750 if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
751 return -EACCES;
752
753 return show_numa_map(m, v);
754}
755
756static const struct seq_operations proc_pid_numa_maps_op = { 742static const struct seq_operations proc_pid_numa_maps_op = {
757 .start = m_start, 743 .start = m_start,
758 .next = m_next, 744 .next = m_next,
759 .stop = m_stop, 745 .stop = m_stop,
760 .show = show_numa_map_checked 746 .show = show_numa_map,
761}; 747};
762 748
763static int numa_maps_open(struct inode *inode, struct file *file) 749static int numa_maps_open(struct inode *inode, struct file *file)
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 5d84e7121df8..219bd79ea894 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -110,11 +110,6 @@ int task_statm(struct mm_struct *mm, int *shared, int *text,
110static int show_map(struct seq_file *m, void *_vml) 110static int show_map(struct seq_file *m, void *_vml)
111{ 111{
112 struct vm_list_struct *vml = _vml; 112 struct vm_list_struct *vml = _vml;
113 struct proc_maps_private *priv = m->private;
114 struct task_struct *task = priv->task;
115
116 if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
117 return -EACCES;
118 113
119 return nommu_vma_show(m, vml->vma); 114 return nommu_vma_show(m, vml->vma);
120} 115}
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 9ac0f5e064e0..841368b87a29 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -165,14 +165,8 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
165 return acc; 165 return acc;
166} 166}
167 167
168static int open_vmcore(struct inode *inode, struct file *filp)
169{
170 return 0;
171}
172
173const struct file_operations proc_vmcore_operations = { 168const struct file_operations proc_vmcore_operations = {
174 .read = read_vmcore, 169 .read = read_vmcore,
175 .open = open_vmcore,
176}; 170};
177 171
178static struct vmcore* __init get_new_element(void) 172static struct vmcore* __init get_new_element(void)
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 52312ec93ff4..5145cb9125af 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -58,7 +58,7 @@ const struct inode_operations ramfs_file_inode_operations = {
58 * size 0 on the assumption that it's going to be used for an mmap of shared 58 * size 0 on the assumption that it's going to be used for an mmap of shared
59 * memory 59 * memory
60 */ 60 */
61static int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) 61int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
62{ 62{
63 struct pagevec lru_pvec; 63 struct pagevec lru_pvec;
64 unsigned long npages, xpages, loop, limit; 64 unsigned long npages, xpages, loop, limit;
diff --git a/fs/readdir.c b/fs/readdir.c
index 4e026e5407fb..93a7559bbfd8 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -80,8 +80,10 @@ static int fillonedir(void * __buf, const char * name, int namlen, loff_t offset
80 if (buf->result) 80 if (buf->result)
81 return -EINVAL; 81 return -EINVAL;
82 d_ino = ino; 82 d_ino = ino;
83 if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) 83 if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
84 buf->result = -EOVERFLOW;
84 return -EOVERFLOW; 85 return -EOVERFLOW;
86 }
85 buf->result++; 87 buf->result++;
86 dirent = buf->dirent; 88 dirent = buf->dirent;
87 if (!access_ok(VERIFY_WRITE, dirent, 89 if (!access_ok(VERIFY_WRITE, dirent,
@@ -155,8 +157,10 @@ static int filldir(void * __buf, const char * name, int namlen, loff_t offset,
155 if (reclen > buf->count) 157 if (reclen > buf->count)
156 return -EINVAL; 158 return -EINVAL;
157 d_ino = ino; 159 d_ino = ino;
158 if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) 160 if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
161 buf->error = -EOVERFLOW;
159 return -EOVERFLOW; 162 return -EOVERFLOW;
163 }
160 dirent = buf->previous; 164 dirent = buf->previous;
161 if (dirent) { 165 if (dirent) {
162 if (__put_user(offset, &dirent->d_off)) 166 if (__put_user(offset, &dirent->d_off))
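The readdir fix records -EOVERFLOW in the callback's result field so the syscall actually fails when a 64-bit inode number cannot be represented in the legacy dirent's narrower d_ino, instead of the error being silently dropped. The narrowing check itself, sketched in user space (models the case where d_ino is 32-bit, as on old ABIs):

#include <stdio.h>

int main(void)
{
	unsigned long long ino = 0x100000001ULL;	/* 33-bit inode number */
	unsigned int d_ino = ino;			/* stand-in for a 32-bit d_ino */

	/* mirrors: if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) */
	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino)
		puts("would return -EOVERFLOW");
	return 0;
}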
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 282a13596c70..d318c7e663fa 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -27,7 +27,6 @@
27#include <linux/mnt_namespace.h> 27#include <linux/mnt_namespace.h>
28#include <linux/mount.h> 28#include <linux/mount.h>
29#include <linux/namei.h> 29#include <linux/namei.h>
30#include <linux/quotaops.h>
31 30
32struct file_system_type reiserfs_fs_type; 31struct file_system_type reiserfs_fs_type;
33 32
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 3f54dbd6c49b..bd20f7f5a933 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -108,9 +108,9 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
108 goto Done; 108 goto Done;
109 } 109 }
110 /* we need at least one record in buffer */ 110 /* we need at least one record in buffer */
111 pos = m->index;
112 p = m->op->start(m, &pos);
111 while (1) { 113 while (1) {
112 pos = m->index;
113 p = m->op->start(m, &pos);
114 err = PTR_ERR(p); 114 err = PTR_ERR(p);
115 if (!p || IS_ERR(p)) 115 if (!p || IS_ERR(p))
116 break; 116 break;
@@ -119,6 +119,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
119 break; 119 break;
120 if (unlikely(err)) 120 if (unlikely(err))
121 m->count = 0; 121 m->count = 0;
122 if (unlikely(!m->count)) {
123 p = m->op->next(m, p, &pos);
124 m->index = pos;
125 continue;
126 }
122 if (m->count < m->size) 127 if (m->count < m->size)
123 goto Fill; 128 goto Fill;
124 m->op->stop(m, p); 129 m->op->stop(m, p);
@@ -128,6 +133,8 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
128 goto Enomem; 133 goto Enomem;
129 m->count = 0; 134 m->count = 0;
130 m->version = 0; 135 m->version = 0;
136 pos = m->index;
137 p = m->op->start(m, &pos);
131 } 138 }
132 m->op->stop(m, p); 139 m->op->stop(m, p);
133 m->count = 0; 140 m->count = 0;
@@ -443,6 +450,20 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc)
443 return -1; 450 return -1;
444} 451}
445 452
453int seq_bitmap(struct seq_file *m, unsigned long *bits, unsigned int nr_bits)
454{
455 size_t len = bitmap_scnprintf_len(nr_bits);
456
457 if (m->count + len < m->size) {
458 bitmap_scnprintf(m->buf + m->count, m->size - m->count,
459 bits, nr_bits);
460 m->count += len;
461 return 0;
462 }
463 m->count = m->size;
464 return -1;
465}
466
446static void *single_start(struct seq_file *p, loff_t *pos) 467static void *single_start(struct seq_file *p, loff_t *pos)
447{ 468{
448 return NULL + (*pos == 0); 469 return NULL + (*pos == 0);
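The new seq_bitmap() helper follows the standard seq_file contract: print into the buffer when the formatted length fits, otherwise mark the buffer full and return -1 so seq_read() retries with a larger allocation. A sketch of a show() callback using it (the 64-bit mask is illustrative):

#include <linux/seq_file.h>
#include <linux/bitmap.h>

static DECLARE_BITMAP(demo_mask, 64);

static int demo_show(struct seq_file *m, void *v)
{
	if (seq_bitmap(m, demo_mask, 64) < 0)
		return -1;	/* buffer too small; seq_read() will retry */
	seq_putc(m, '\n');
	return 0;
}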
diff --git a/fs/splice.c b/fs/splice.c
index 1bbc6f4bb09c..a1e701c27156 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -898,6 +898,9 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
898 if (unlikely(!(out->f_mode & FMODE_WRITE))) 898 if (unlikely(!(out->f_mode & FMODE_WRITE)))
899 return -EBADF; 899 return -EBADF;
900 900
901 if (unlikely(out->f_flags & O_APPEND))
902 return -EINVAL;
903
901 ret = rw_verify_area(WRITE, out, ppos, len); 904 ret = rw_verify_area(WRITE, out, ppos, len);
902 if (unlikely(ret < 0)) 905 if (unlikely(ret < 0))
903 return ret; 906 return ret;
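The new check makes the pipe-to-file direction of splice() fail fast on O_APPEND descriptors, since splice writes at an explicit (or the current) offset and cannot honor atomic append semantics. Seen from user space (a sketch with error handling trimmed):

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <errno.h>
#include <string.h>

int main(void)
{
	int pipefd[2];
	int fd = open("out.log", O_WRONLY | O_CREAT | O_APPEND, 0644);

	if (fd < 0 || pipe(pipefd) < 0)
		return 1;
	write(pipefd[1], "hi\n", 3);
	if (splice(pipefd[0], NULL, fd, NULL, 3, 0) < 0)
		printf("splice: %s\n", strerror(errno));	/* EINVAL after this patch */
	return 0;
}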
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index d81fb9ed2b8e..73db464cd08b 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -263,8 +263,8 @@ int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
263 263
264 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; 264 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
265 265
266 /* And make sure we have twice the index size of space reserved */ 266 /* And make sure we have thrice the index size of space reserved */
267 idx_size <<= 1; 267 idx_size = idx_size + (idx_size << 1);
268 268
269 /* 269 /*
270 * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes' 270 * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes'
@@ -302,18 +302,6 @@ long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs)
302 int subtract_lebs; 302 int subtract_lebs;
303 long long available; 303 long long available;
304 304
305 /*
306 * Force the amount available to the total size reported if the used
307 * space is zero.
308 */
309 if (c->lst.total_used <= UBIFS_INO_NODE_SZ &&
310 c->budg_data_growth + c->budg_dd_growth == 0) {
311 /* Do the same calculation as for c->block_cnt */
312 available = c->main_lebs - 2;
313 available *= c->leb_size - c->dark_wm;
314 return available;
315 }
316
317 available = c->main_bytes - c->lst.total_used; 305 available = c->main_bytes - c->lst.total_used;
318 306
319 /* 307 /*
@@ -388,11 +376,11 @@ static int can_use_rp(struct ubifs_info *c)
388 * This function makes sure UBIFS has enough free eraseblocks for index growth 376 * This function makes sure UBIFS has enough free eraseblocks for index growth
389 * and data. 377 * and data.
390 * 378 *
391 * When budgeting index space, UBIFS reserves twice as more LEBs as the index 379 * When budgeting index space, UBIFS reserves thrice as many LEBs as the index
392 * would take if it was consolidated and written to the flash. This guarantees 380 * would take if it was consolidated and written to the flash. This guarantees
393 * that the "in-the-gaps" commit method always succeeds and UBIFS will always 381 * that the "in-the-gaps" commit method always succeeds and UBIFS will always
394 * be able to commit the dirty index. So this function basically adds the amount of 382 * be able to commit the dirty index. So this function basically adds the amount of
395 * budgeted index space to the size of the current index, multiplies this by 2, 383 * budgeted index space to the size of the current index, multiplies this by 3,
396 * and makes sure this does not exceed the amount of free eraseblocks. 384 * and makes sure this does not exceed the amount of free eraseblocks.
397 * 385 *
398 * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables: 386 * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables:
@@ -543,8 +531,16 @@ int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
543 int err, idx_growth, data_growth, dd_growth; 531 int err, idx_growth, data_growth, dd_growth;
544 struct retries_info ri; 532 struct retries_info ri;
545 533
534 ubifs_assert(req->new_page <= 1);
535 ubifs_assert(req->dirtied_page <= 1);
536 ubifs_assert(req->new_dent <= 1);
537 ubifs_assert(req->mod_dent <= 1);
538 ubifs_assert(req->new_ino <= 1);
539 ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA);
546 ubifs_assert(req->dirtied_ino <= 4); 540 ubifs_assert(req->dirtied_ino <= 4);
547 ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4); 541 ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4);
542 ubifs_assert(!(req->new_ino_d & 7));
543 ubifs_assert(!(req->dirtied_ino_d & 7));
548 544
549 data_growth = calc_data_growth(c, req); 545 data_growth = calc_data_growth(c, req);
550 dd_growth = calc_dd_growth(c, req); 546 dd_growth = calc_dd_growth(c, req);
@@ -618,8 +614,16 @@ again:
618 */ 614 */
619void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) 615void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
620{ 616{
617 ubifs_assert(req->new_page <= 1);
618 ubifs_assert(req->dirtied_page <= 1);
619 ubifs_assert(req->new_dent <= 1);
620 ubifs_assert(req->mod_dent <= 1);
621 ubifs_assert(req->new_ino <= 1);
622 ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA);
621 ubifs_assert(req->dirtied_ino <= 4); 623 ubifs_assert(req->dirtied_ino <= 4);
622 ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4); 624 ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4);
625 ubifs_assert(!(req->new_ino_d & 7));
626 ubifs_assert(!(req->dirtied_ino_d & 7));
623 if (!req->recalculate) { 627 if (!req->recalculate) {
624 ubifs_assert(req->idx_growth >= 0); 628 ubifs_assert(req->idx_growth >= 0);
625 ubifs_assert(req->data_growth >= 0); 629 ubifs_assert(req->data_growth >= 0);
@@ -647,7 +651,11 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
647 651
648 ubifs_assert(c->budg_idx_growth >= 0); 652 ubifs_assert(c->budg_idx_growth >= 0);
649 ubifs_assert(c->budg_data_growth >= 0); 653 ubifs_assert(c->budg_data_growth >= 0);
654 ubifs_assert(c->budg_dd_growth >= 0);
650 ubifs_assert(c->min_idx_lebs < c->main_lebs); 655 ubifs_assert(c->min_idx_lebs < c->main_lebs);
656 ubifs_assert(!(c->budg_idx_growth & 7));
657 ubifs_assert(!(c->budg_data_growth & 7));
658 ubifs_assert(!(c->budg_dd_growth & 7));
651 spin_unlock(&c->space_lock); 659 spin_unlock(&c->space_lock);
652} 660}
653 661
@@ -686,41 +694,114 @@ void ubifs_convert_page_budget(struct ubifs_info *c)
686void ubifs_release_dirty_inode_budget(struct ubifs_info *c, 694void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
687 struct ubifs_inode *ui) 695 struct ubifs_inode *ui)
688{ 696{
689 struct ubifs_budget_req req = {.dd_growth = c->inode_budget, 697 struct ubifs_budget_req req;
690 .dirtied_ino_d = ui->data_len};
691 698
699 memset(&req, 0, sizeof(struct ubifs_budget_req));
700 req.dd_growth = c->inode_budget + ALIGN(ui->data_len, 8);
692 ubifs_release_budget(c, &req); 701 ubifs_release_budget(c, &req);
693} 702}
694 703
695/** 704/**
696 * ubifs_budg_get_free_space - return amount of free space. 705 * ubifs_reported_space - calculate reported free space.
706 * @c: the UBIFS file-system description object
707 * @free: amount of free space
708 *
709 * This function calculates the amount of free space that will be reported to
710 * user-space. User-space applications tend to expect that if the file-system
711 * (e.g., via the 'statfs()' call) reports that it has N bytes available, they
712 * are able to write a file of size N. UBIFS attaches node headers to each data
713 * node and it has to write indexing nodes as well. This introduces additional
714 * overhead, and UBIFS has to report slightly less free space to meet the
715 * above expectation.
716 *
717 * This function assumes free space is made up of uncompressed data nodes and
718 * full index nodes (one per data node, tripled because we always allow enough
719 * space to write the index thrice).
720 *
721 * Note, the calculation is pessimistic, which means that most of the time
722 * UBIFS reports less space than it actually has.
723 */
724long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free)
725{
726 int divisor, factor, f;
727
728 /*
729 * Reported space size is @free * X, where X is UBIFS block size
730 * divided by UBIFS block size + all overhead one data block
731 * introduces. The overhead is the node header + indexing overhead.
732 *
733 * Indexing overhead calculations are based on the following formula:
734 * I = N/(f - 1) + 1, where I - number of indexing nodes, N - number
735 * of data nodes, f - fanout. Because effective UBIFS fanout is twice
736 * as less than maximum fanout, we assume that each data node
737 * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes.
738 * Note, the multiplier 3 is because UBIFS reseves thrice as more space
739 * for the index.
740 */
741 f = c->fanout > 3 ? c->fanout >> 1 : 2;
742 factor = UBIFS_BLOCK_SIZE;
743 divisor = UBIFS_MAX_DATA_NODE_SZ;
744 divisor += (c->max_idx_node_sz * 3) / (f - 1);
745 free *= factor;
746 do_div(free, divisor);
747 return free;
748}
749
750/**
751 * ubifs_get_free_space - return amount of free space.
697 * @c: UBIFS file-system description object 752 * @c: UBIFS file-system description object
698 * 753 *
699 * This function returns amount of free space on the file-system. 754 * This function calculates the amount of free space to report to user-space.
755 *
756 * Because UBIFS may introduce substantial overhead (the index, node headers,
757 * alignment, wastage at the end of eraseblocks, etc), it cannot report the real
758 * amount of free flash space it has (well, because not all dirty space is
759 * reclaimable, UBIFS does not actually know the real amount). If UBIFS did so,
760 * it would break user expectations about what free space is. Users seem
761 * accustomed to assuming that if the file-system reports N bytes of free space,
762 * they would be able to fit a file of N bytes on the FS. This almost works for
763 * traditional file-systems, because they have way less overhead than UBIFS.
764 * So, to keep users happy, UBIFS tries to take the overhead into account.
700 */ 765 */
701long long ubifs_budg_get_free_space(struct ubifs_info *c) 766long long ubifs_get_free_space(struct ubifs_info *c)
702{ 767{
703 int min_idx_lebs, rsvd_idx_lebs; 768 int min_idx_lebs, rsvd_idx_lebs, lebs;
704 long long available, outstanding, free; 769 long long available, outstanding, free;
705 770
706 /* Do exactly the same calculations as in 'do_budget_space()' */
707 spin_lock(&c->space_lock); 771 spin_lock(&c->space_lock);
708 min_idx_lebs = ubifs_calc_min_idx_lebs(c); 772 min_idx_lebs = ubifs_calc_min_idx_lebs(c);
773 outstanding = c->budg_data_growth + c->budg_dd_growth;
709 774
710 if (min_idx_lebs > c->lst.idx_lebs) 775 /*
711 rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs; 776 * Force the amount available to the total size reported if the used
712 else 777 * space is zero.
713 rsvd_idx_lebs = 0; 778 */
714 779 if (c->lst.total_used <= UBIFS_INO_NODE_SZ && !outstanding) {
715 if (rsvd_idx_lebs > c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt
716 - c->lst.taken_empty_lebs) {
717 spin_unlock(&c->space_lock); 780 spin_unlock(&c->space_lock);
718 return 0; 781 return (long long)c->block_cnt << UBIFS_BLOCK_SHIFT;
719 } 782 }
720 783
721 available = ubifs_calc_available(c, min_idx_lebs); 784 available = ubifs_calc_available(c, min_idx_lebs);
722 outstanding = c->budg_data_growth + c->budg_dd_growth; 785
723 c->min_idx_lebs = min_idx_lebs; 786 /*
787 * When reporting free space to user-space, UBIFS guarantees that it is
788 * possible to write a file of free space size. This means that for
789 * empty LEBs we may use more precise calculations than
790 * 'ubifs_calc_available()' is using. Namely, we know that in empty
791 * LEBs we would waste only @c->leb_overhead bytes, not @c->dark_wm.
792 * Thus, amend the available space.
793 *
794 * Note, the calculations below are similar to what we have in
795 * 'do_budget_space()', so refer there for comments.
796 */
797 if (min_idx_lebs > c->lst.idx_lebs)
798 rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs;
799 else
800 rsvd_idx_lebs = 0;
801 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
802 c->lst.taken_empty_lebs;
803 lebs -= rsvd_idx_lebs;
804 available += lebs * (c->dark_wm - c->leb_overhead);
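	/*
	 * Illustrative arithmetic (assumed numbers, not from this patch):
	 * 'ubifs_calc_available()' charged each of these LEBs @c->dark_wm
	 * bytes of waste, while an empty LEB really wastes only
	 * @c->leb_overhead bytes; e.g. with dark_wm = 8192 and
	 * leb_overhead = 2608, each LEB gets 5584 bytes credited back here.
	 */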
724 spin_unlock(&c->space_lock); 805 spin_unlock(&c->space_lock);
725 806
726 if (available > outstanding) 807 if (available > outstanding)
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 3b516316c9b3..0a6aa2cc78f0 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -74,6 +74,7 @@ static int do_commit(struct ubifs_info *c)
74 goto out_up; 74 goto out_up;
75 } 75 }
76 76
77 c->cmt_no += 1;
77 err = ubifs_gc_start_commit(c); 78 err = ubifs_gc_start_commit(c);
78 if (err) 79 if (err)
79 goto out_up; 80 goto out_up;
@@ -115,7 +116,7 @@ static int do_commit(struct ubifs_info *c)
115 goto out; 116 goto out;
116 117
117 mutex_lock(&c->mst_mutex); 118 mutex_lock(&c->mst_mutex);
118 c->mst_node->cmt_no = cpu_to_le64(++c->cmt_no); 119 c->mst_node->cmt_no = cpu_to_le64(c->cmt_no);
119 c->mst_node->log_lnum = cpu_to_le32(new_ltail_lnum); 120 c->mst_node->log_lnum = cpu_to_le32(new_ltail_lnum);
120 c->mst_node->root_lnum = cpu_to_le32(zroot.lnum); 121 c->mst_node->root_lnum = cpu_to_le32(zroot.lnum);
121 c->mst_node->root_offs = cpu_to_le32(zroot.offs); 122 c->mst_node->root_offs = cpu_to_le32(zroot.offs);
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 4e3aaeba4eca..d7f7645779f2 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -538,7 +538,7 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
538 printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n); 538 printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n);
539 for (i = 0; i < n; i++) 539 for (i = 0; i < n; i++)
540 printk(KERN_DEBUG "\t ino %llu\n", 540 printk(KERN_DEBUG "\t ino %llu\n",
541 le64_to_cpu(orph->inos[i])); 541 (unsigned long long)le64_to_cpu(orph->inos[i]));
542 break; 542 break;
543 } 543 }
544 default: 544 default:
@@ -568,8 +568,8 @@ void dbg_dump_budget_req(const struct ubifs_budget_req *req)
568void dbg_dump_lstats(const struct ubifs_lp_stats *lst) 568void dbg_dump_lstats(const struct ubifs_lp_stats *lst)
569{ 569{
570 spin_lock(&dbg_lock); 570 spin_lock(&dbg_lock);
571 printk(KERN_DEBUG "Lprops statistics: empty_lebs %d, idx_lebs %d\n", 571 printk(KERN_DEBUG "(pid %d) Lprops statistics: empty_lebs %d, "
572 lst->empty_lebs, lst->idx_lebs); 572 "idx_lebs %d\n", current->pid, lst->empty_lebs, lst->idx_lebs);
573 printk(KERN_DEBUG "\ttaken_empty_lebs %d, total_free %lld, " 573 printk(KERN_DEBUG "\ttaken_empty_lebs %d, total_free %lld, "
574 "total_dirty %lld\n", lst->taken_empty_lebs, lst->total_free, 574 "total_dirty %lld\n", lst->taken_empty_lebs, lst->total_free,
575 lst->total_dirty); 575 lst->total_dirty);
@@ -587,8 +587,8 @@ void dbg_dump_budg(struct ubifs_info *c)
587 struct ubifs_gced_idx_leb *idx_gc; 587 struct ubifs_gced_idx_leb *idx_gc;
588 588
589 spin_lock(&dbg_lock); 589 spin_lock(&dbg_lock);
590 printk(KERN_DEBUG "Budgeting info: budg_data_growth %lld, " 590 printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, "
591 "budg_dd_growth %lld, budg_idx_growth %lld\n", 591 "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid,
592 c->budg_data_growth, c->budg_dd_growth, c->budg_idx_growth); 592 c->budg_data_growth, c->budg_dd_growth, c->budg_idx_growth);
593 printk(KERN_DEBUG "\tdata budget sum %lld, total budget sum %lld, " 593 printk(KERN_DEBUG "\tdata budget sum %lld, total budget sum %lld, "
594 "freeable_cnt %d\n", c->budg_data_growth + c->budg_dd_growth, 594 "freeable_cnt %d\n", c->budg_data_growth + c->budg_dd_growth,
@@ -634,7 +634,7 @@ void dbg_dump_lprops(struct ubifs_info *c)
634 struct ubifs_lprops lp; 634 struct ubifs_lprops lp;
635 struct ubifs_lp_stats lst; 635 struct ubifs_lp_stats lst;
636 636
637 printk(KERN_DEBUG "Dumping LEB properties\n"); 637 printk(KERN_DEBUG "(pid %d) Dumping LEB properties\n", current->pid);
638 ubifs_get_lp_stats(c, &lst); 638 ubifs_get_lp_stats(c, &lst);
639 dbg_dump_lstats(&lst); 639 dbg_dump_lstats(&lst);
640 640
@@ -655,7 +655,7 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
655 if (dbg_failure_mode) 655 if (dbg_failure_mode)
656 return; 656 return;
657 657
658 printk(KERN_DEBUG "Dumping LEB %d\n", lnum); 658 printk(KERN_DEBUG "(pid %d) Dumping LEB %d\n", current->pid, lnum);
659 659
660 sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); 660 sleb = ubifs_scan(c, lnum, 0, c->dbg_buf);
661 if (IS_ERR(sleb)) { 661 if (IS_ERR(sleb)) {
@@ -720,8 +720,8 @@ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
720{ 720{
721 int i; 721 int i;
722 722
723 printk(KERN_DEBUG "Dumping heap cat %d (%d elements)\n", 723 printk(KERN_DEBUG "(pid %d) Dumping heap cat %d (%d elements)\n",
724 cat, heap->cnt); 724 current->pid, cat, heap->cnt);
725 for (i = 0; i < heap->cnt; i++) { 725 for (i = 0; i < heap->cnt; i++) {
726 struct ubifs_lprops *lprops = heap->arr[i]; 726 struct ubifs_lprops *lprops = heap->arr[i];
727 727
@@ -736,7 +736,7 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
736{ 736{
737 int i; 737 int i;
738 738
739 printk(KERN_DEBUG "Dumping pnode:\n"); 739 printk(KERN_DEBUG "(pid %d) Dumping pnode:\n", current->pid);
740 printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n", 740 printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n",
741 (size_t)pnode, (size_t)parent, (size_t)pnode->cnext); 741 (size_t)pnode, (size_t)parent, (size_t)pnode->cnext);
742 printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n", 742 printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n",
@@ -755,7 +755,7 @@ void dbg_dump_tnc(struct ubifs_info *c)
755 int level; 755 int level;
756 756
757 printk(KERN_DEBUG "\n"); 757 printk(KERN_DEBUG "\n");
758 printk(KERN_DEBUG "Dumping the TNC tree\n"); 758 printk(KERN_DEBUG "(pid %d) Dumping the TNC tree\n", current->pid);
759 znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); 759 znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL);
760 level = znode->level; 760 level = znode->level;
761 printk(KERN_DEBUG "== Level %d ==\n", level); 761 printk(KERN_DEBUG "== Level %d ==\n", level);
@@ -2208,16 +2208,17 @@ int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
2208int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, 2208int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
2209 int offset, int len, int dtype) 2209 int offset, int len, int dtype)
2210{ 2210{
2211 int err; 2211 int err, failing;
2212 2212
2213 if (in_failure_mode(desc)) 2213 if (in_failure_mode(desc))
2214 return -EIO; 2214 return -EIO;
2215 if (do_fail(desc, lnum, 1)) 2215 failing = do_fail(desc, lnum, 1);
2216 if (failing)
2216 cut_data(buf, len); 2217 cut_data(buf, len);
2217 err = ubi_leb_write(desc, lnum, buf, offset, len, dtype); 2218 err = ubi_leb_write(desc, lnum, buf, offset, len, dtype);
2218 if (err) 2219 if (err)
2219 return err; 2220 return err;
2220 if (in_failure_mode(desc)) 2221 if (failing)
2221 return -EIO; 2222 return -EIO;
2222 return 0; 2223 return 0;
2223} 2224}
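/*
 * A note on 'dbg_leb_write()' above: @failing is sampled before the write,
 * so -EIO is returned exactly when the buffer was deliberately corrupted by
 * 'cut_data()', not merely because failure mode was entered some time after
 * a clean write.
 */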
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 3c4f1e93c9e0..50315fc57185 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -27,7 +27,7 @@
27 27
28#define UBIFS_DBG(op) op 28#define UBIFS_DBG(op) op
29 29
30#define ubifs_assert(expr) do { \ 30#define ubifs_assert(expr) do { \
31 if (unlikely(!(expr))) { \ 31 if (unlikely(!(expr))) { \
32 printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \ 32 printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \
33 __func__, __LINE__, current->pid); \ 33 __func__, __LINE__, current->pid); \
@@ -73,50 +73,50 @@ const char *dbg_key_str1(const struct ubifs_info *c,
73 const union ubifs_key *key); 73 const union ubifs_key *key);
74 74
75/* 75/*
76 * DBGKEY macros require dbg_lock to be held, which it is in the dbg message 76 * DBGKEY macros require @dbg_lock to be held, which it is in the dbg message
77 * macros. 77 * macros.
78 */ 78 */
79#define DBGKEY(key) dbg_key_str0(c, (key)) 79#define DBGKEY(key) dbg_key_str0(c, (key))
80#define DBGKEY1(key) dbg_key_str1(c, (key)) 80#define DBGKEY1(key) dbg_key_str1(c, (key))
81 81
82/* General messages */ 82/* General messages */
83#define dbg_gen(fmt, ...) dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__) 83#define dbg_gen(fmt, ...) dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__)
84 84
85/* Additional journal messages */ 85/* Additional journal messages */
86#define dbg_jnl(fmt, ...) dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__) 86#define dbg_jnl(fmt, ...) dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__)
87 87
88/* Additional TNC messages */ 88/* Additional TNC messages */
89#define dbg_tnc(fmt, ...) dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__) 89#define dbg_tnc(fmt, ...) dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__)
90 90
91/* Additional lprops messages */ 91/* Additional lprops messages */
92#define dbg_lp(fmt, ...) dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__) 92#define dbg_lp(fmt, ...) dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__)
93 93
94/* Additional LEB find messages */ 94/* Additional LEB find messages */
95#define dbg_find(fmt, ...) dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__) 95#define dbg_find(fmt, ...) dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__)
96 96
97/* Additional mount messages */ 97/* Additional mount messages */
98#define dbg_mnt(fmt, ...) dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__) 98#define dbg_mnt(fmt, ...) dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__)
99 99
100/* Additional I/O messages */ 100/* Additional I/O messages */
101#define dbg_io(fmt, ...) dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__) 101#define dbg_io(fmt, ...) dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__)
102 102
103/* Additional commit messages */ 103/* Additional commit messages */
104#define dbg_cmt(fmt, ...) dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__) 104#define dbg_cmt(fmt, ...) dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__)
105 105
106/* Additional budgeting messages */ 106/* Additional budgeting messages */
107#define dbg_budg(fmt, ...) dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__) 107#define dbg_budg(fmt, ...) dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__)
108 108
109/* Additional log messages */ 109/* Additional log messages */
110#define dbg_log(fmt, ...) dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__) 110#define dbg_log(fmt, ...) dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__)
111 111
112/* Additional gc messages */ 112/* Additional gc messages */
113#define dbg_gc(fmt, ...) dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__) 113#define dbg_gc(fmt, ...) dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__)
114 114
115/* Additional scan messages */ 115/* Additional scan messages */
116#define dbg_scan(fmt, ...) dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__) 116#define dbg_scan(fmt, ...) dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__)
117 117
118/* Additional recovery messages */ 118/* Additional recovery messages */
119#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__) 119#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__)
120 120
121/* 121/*
122 * Debugging message type flags (must match msg_type_names in debug.c). 122 * Debugging message type flags (must match msg_type_names in debug.c).
@@ -239,34 +239,23 @@ typedef int (*dbg_leaf_callback)(struct ubifs_info *c,
239 struct ubifs_zbranch *zbr, void *priv); 239 struct ubifs_zbranch *zbr, void *priv);
240typedef int (*dbg_znode_callback)(struct ubifs_info *c, 240typedef int (*dbg_znode_callback)(struct ubifs_info *c,
241 struct ubifs_znode *znode, void *priv); 241 struct ubifs_znode *znode, void *priv);
242
243int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, 242int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
244 dbg_znode_callback znode_cb, void *priv); 243 dbg_znode_callback znode_cb, void *priv);
245 244
246/* Checking functions */ 245/* Checking functions */
247 246
248int dbg_check_lprops(struct ubifs_info *c); 247int dbg_check_lprops(struct ubifs_info *c);
249
250int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot); 248int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot);
251int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot); 249int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot);
252
253int dbg_check_cats(struct ubifs_info *c); 250int dbg_check_cats(struct ubifs_info *c);
254
255int dbg_check_ltab(struct ubifs_info *c); 251int dbg_check_ltab(struct ubifs_info *c);
256
257int dbg_check_synced_i_size(struct inode *inode); 252int dbg_check_synced_i_size(struct inode *inode);
258
259int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir); 253int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir);
260
261int dbg_check_tnc(struct ubifs_info *c, int extra); 254int dbg_check_tnc(struct ubifs_info *c, int extra);
262
263int dbg_check_idx_size(struct ubifs_info *c, long long idx_size); 255int dbg_check_idx_size(struct ubifs_info *c, long long idx_size);
264
265int dbg_check_filesystem(struct ubifs_info *c); 256int dbg_check_filesystem(struct ubifs_info *c);
266
267void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, 257void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat,
268 int add_pos); 258 int add_pos);
269
270int dbg_check_lprops(struct ubifs_info *c); 259int dbg_check_lprops(struct ubifs_info *c);
271int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, 260int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
272 int row, int col); 261 int row, int col);
@@ -329,71 +318,77 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
329#else /* !CONFIG_UBIFS_FS_DEBUG */ 318#else /* !CONFIG_UBIFS_FS_DEBUG */
330 319
331#define UBIFS_DBG(op) 320#define UBIFS_DBG(op)
332#define ubifs_assert(expr) ({}) 321
333#define ubifs_assert_cmt_locked(c) 322/* Use "if (0)" to make compiler check arguments even if debugging is off */
323#define ubifs_assert(expr) do { \
324 if (0 && (expr)) \
325 printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \
326 __func__, __LINE__, current->pid); \
327} while (0)
328
329#define dbg_err(fmt, ...) do { \
330 if (0) \
331 ubifs_err(fmt, ##__VA_ARGS__); \
332} while (0)
333
334#define dbg_msg(fmt, ...) do { \
335 if (0) \
336 printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", \
337 current->pid, __func__, ##__VA_ARGS__); \
338} while (0)
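/*
 * A note on the "if (0)" stubs above: the format string and arguments stay
 * visible to the compiler, so e.g. dbg_msg("%d", some_pointer) still triggers
 * a printf-format warning in non-debug builds, which the old empty '({})'
 * stubs could not catch.
 */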
339
334#define dbg_dump_stack() 340#define dbg_dump_stack()
335#define dbg_err(fmt, ...) ({}) 341#define ubifs_assert_cmt_locked(c)
336#define dbg_msg(fmt, ...) ({})
337#define dbg_key(c, key, fmt, ...) ({})
338
339#define dbg_gen(fmt, ...) ({})
340#define dbg_jnl(fmt, ...) ({})
341#define dbg_tnc(fmt, ...) ({})
342#define dbg_lp(fmt, ...) ({})
343#define dbg_find(fmt, ...) ({})
344#define dbg_mnt(fmt, ...) ({})
345#define dbg_io(fmt, ...) ({})
346#define dbg_cmt(fmt, ...) ({})
347#define dbg_budg(fmt, ...) ({})
348#define dbg_log(fmt, ...) ({})
349#define dbg_gc(fmt, ...) ({})
350#define dbg_scan(fmt, ...) ({})
351#define dbg_rcvry(fmt, ...) ({})
352
353#define dbg_ntype(type) ""
354#define dbg_cstate(cmt_state) ""
355#define dbg_get_key_dump(c, key) ({})
356#define dbg_dump_inode(c, inode) ({})
357#define dbg_dump_node(c, node) ({})
358#define dbg_dump_budget_req(req) ({})
359#define dbg_dump_lstats(lst) ({})
360#define dbg_dump_budg(c) ({})
361#define dbg_dump_lprop(c, lp) ({})
362#define dbg_dump_lprops(c) ({})
363#define dbg_dump_leb(c, lnum) ({})
364#define dbg_dump_znode(c, znode) ({})
365#define dbg_dump_heap(c, heap, cat) ({})
366#define dbg_dump_pnode(c, pnode, parent, iip) ({})
367#define dbg_dump_tnc(c) ({})
368#define dbg_dump_index(c) ({})
369 342
370#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0 343#define dbg_gen(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
344#define dbg_jnl(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
345#define dbg_tnc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
346#define dbg_lp(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
347#define dbg_find(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
348#define dbg_mnt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
349#define dbg_io(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
350#define dbg_cmt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
351#define dbg_budg(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
352#define dbg_log(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
353#define dbg_gc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
354#define dbg_scan(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
355#define dbg_rcvry(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
356
357#define DBGKEY(key) ((char *)(key))
358#define DBGKEY1(key) ((char *)(key))
359
360#define dbg_ntype(type) ""
361#define dbg_cstate(cmt_state) ""
362#define dbg_get_key_dump(c, key) ({})
363#define dbg_dump_inode(c, inode) ({})
364#define dbg_dump_node(c, node) ({})
365#define dbg_dump_budget_req(req) ({})
366#define dbg_dump_lstats(lst) ({})
367#define dbg_dump_budg(c) ({})
368#define dbg_dump_lprop(c, lp) ({})
369#define dbg_dump_lprops(c) ({})
370#define dbg_dump_leb(c, lnum) ({})
371#define dbg_dump_znode(c, znode) ({})
372#define dbg_dump_heap(c, heap, cat) ({})
373#define dbg_dump_pnode(c, pnode, parent, iip) ({})
374#define dbg_dump_tnc(c) ({})
375#define dbg_dump_index(c) ({})
371 376
377#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0
372#define dbg_old_index_check_init(c, zroot) 0 378#define dbg_old_index_check_init(c, zroot) 0
373#define dbg_check_old_index(c, zroot) 0 379#define dbg_check_old_index(c, zroot) 0
374
375#define dbg_check_cats(c) 0 380#define dbg_check_cats(c) 0
376
377#define dbg_check_ltab(c) 0 381#define dbg_check_ltab(c) 0
378
379#define dbg_check_synced_i_size(inode) 0 382#define dbg_check_synced_i_size(inode) 0
380
381#define dbg_check_dir_size(c, dir) 0 383#define dbg_check_dir_size(c, dir) 0
382
383#define dbg_check_tnc(c, x) 0 384#define dbg_check_tnc(c, x) 0
384
385#define dbg_check_idx_size(c, idx_size) 0 385#define dbg_check_idx_size(c, idx_size) 0
386
387#define dbg_check_filesystem(c) 0 386#define dbg_check_filesystem(c) 0
388
389#define dbg_check_heap(c, heap, cat, add_pos) ({}) 387#define dbg_check_heap(c, heap, cat, add_pos) ({})
390
391#define dbg_check_lprops(c) 0 388#define dbg_check_lprops(c) 0
392#define dbg_check_lpt_nodes(c, cnode, row, col) 0 389#define dbg_check_lpt_nodes(c, cnode, row, col) 0
393
394#define dbg_force_in_the_gaps_enabled 0 390#define dbg_force_in_the_gaps_enabled 0
395#define dbg_force_in_the_gaps() 0 391#define dbg_force_in_the_gaps() 0
396
397#define dbg_failure_mode 0 392#define dbg_failure_mode 0
398#define dbg_failure_mode_registration(c) ({}) 393#define dbg_failure_mode_registration(c) ({})
399#define dbg_failure_mode_deregistration(c) ({}) 394#define dbg_failure_mode_deregistration(c) ({})
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index e90374be7d3b..526c01ec8003 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -165,7 +165,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
165 } 165 }
166 166
167 inode->i_ino = ++c->highest_inum; 167 inode->i_ino = ++c->highest_inum;
168 inode->i_generation = ++c->vfs_gen;
169 /* 168 /*
170 * The creation sequence number remains with this inode for its 169 * The creation sequence number remains with this inode for its
171 * lifetime. All nodes for this inode have a greater sequence number, 170 * lifetime. All nodes for this inode have a greater sequence number,
@@ -220,15 +219,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
220 219
221 err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name); 220 err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name);
222 if (err) { 221 if (err) {
223 /* 222 if (err == -ENOENT) {
224 * Do not hash the direntry if parent 'i_nlink' is zero, because
225 * this has side-effects - '->delete_inode()' call will not be
226 * called for the parent orphan inode, because 'd_count' of its
227 * direntry will stay 1 (it'll be negative direntry I guess)
228 * and prevent 'iput_final()' until the dentry is destroyed due
229 * to unmount or memory pressure.
230 */
231 if (err == -ENOENT && dir->i_nlink != 0) {
232 dbg_gen("not found"); 223 dbg_gen("not found");
233 goto done; 224 goto done;
234 } 225 }
@@ -435,7 +426,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
435 426
436 while (1) { 427 while (1) {
437 dbg_gen("feed '%s', ino %llu, new f_pos %#x", 428 dbg_gen("feed '%s', ino %llu, new f_pos %#x",
438 dent->name, le64_to_cpu(dent->inum), 429 dent->name, (unsigned long long)le64_to_cpu(dent->inum),
439 key_hash_flash(c, &dent->key)); 430 key_hash_flash(c, &dent->key));
440 ubifs_assert(dent->ch.sqnum > ubifs_inode(dir)->creat_sqnum); 431 ubifs_assert(dent->ch.sqnum > ubifs_inode(dir)->creat_sqnum);
441 432
@@ -525,7 +516,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
525 struct ubifs_inode *dir_ui = ubifs_inode(dir); 516 struct ubifs_inode *dir_ui = ubifs_inode(dir);
526 int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); 517 int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
527 struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2, 518 struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2,
528 .dirtied_ino_d = ui->data_len }; 519 .dirtied_ino_d = ALIGN(ui->data_len, 8) };
529 520
530 /* 521 /*
531 * Budget request settings: new direntry, changing the target inode, 522 * Budget request settings: new direntry, changing the target inode,
@@ -596,7 +587,6 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
596 if (err) { 587 if (err) {
597 if (err != -ENOSPC) 588 if (err != -ENOSPC)
598 return err; 589 return err;
599 err = 0;
600 budgeted = 0; 590 budgeted = 0;
601 } 591 }
602 592
@@ -727,8 +717,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
727 struct ubifs_inode *dir_ui = ubifs_inode(dir); 717 struct ubifs_inode *dir_ui = ubifs_inode(dir);
728 struct ubifs_info *c = dir->i_sb->s_fs_info; 718 struct ubifs_info *c = dir->i_sb->s_fs_info;
729 int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); 719 int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
730 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, 720 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 };
731 .dirtied_ino_d = 1 };
732 721
733 /* 722 /*
734 * Budget request settings: new inode, new direntry and changing parent 723 * Budget request settings: new inode, new direntry and changing parent
@@ -789,7 +778,8 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
789 int sz_change = CALC_DENT_SIZE(dentry->d_name.len); 778 int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
790 int err, devlen = 0; 779 int err, devlen = 0;
791 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, 780 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
792 .new_ino_d = devlen, .dirtied_ino = 1 }; 781 .new_ino_d = ALIGN(devlen, 8),
782 .dirtied_ino = 1 };
793 783
794 /* 784 /*
795 * Budget request settings: new inode, new direntry and changing parent 785 * Budget request settings: new inode, new direntry and changing parent
@@ -863,7 +853,8 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
863 int err, len = strlen(symname); 853 int err, len = strlen(symname);
864 int sz_change = CALC_DENT_SIZE(dentry->d_name.len); 854 int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
865 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, 855 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
866 .new_ino_d = len, .dirtied_ino = 1 }; 856 .new_ino_d = ALIGN(len, 8),
857 .dirtied_ino = 1 };
867 858
868 /* 859 /*
869 * Budget request settings: new inode, new direntry and changing parent 860 * Budget request settings: new inode, new direntry and changing parent
@@ -1012,7 +1003,7 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
1012 struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1, 1003 struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1,
1013 .dirtied_ino = 3 }; 1004 .dirtied_ino = 3 };
1014 struct ubifs_budget_req ino_req = { .dirtied_ino = 1, 1005 struct ubifs_budget_req ino_req = { .dirtied_ino = 1,
1015 .dirtied_ino_d = old_inode_ui->data_len }; 1006 .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
1016 struct timespec time; 1007 struct timespec time;
1017 1008
1018 /* 1009 /*
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 8565e586e533..3d698e2022b1 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -793,7 +793,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
793 int err; 793 int err;
794 struct ubifs_budget_req req; 794 struct ubifs_budget_req req;
795 loff_t old_size = inode->i_size, new_size = attr->ia_size; 795 loff_t old_size = inode->i_size, new_size = attr->ia_size;
796 int offset = new_size & (UBIFS_BLOCK_SIZE - 1); 796 int offset = new_size & (UBIFS_BLOCK_SIZE - 1), budgeted = 1;
797 struct ubifs_inode *ui = ubifs_inode(inode); 797 struct ubifs_inode *ui = ubifs_inode(inode);
798 798
799 dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size); 799 dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size);
@@ -811,8 +811,15 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
811 /* A funny way to budget for truncation node */ 811 /* A funny way to budget for truncation node */
812 req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ; 812 req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ;
813 err = ubifs_budget_space(c, &req); 813 err = ubifs_budget_space(c, &req);
814 if (err) 814 if (err) {
815 return err; 815 /*
816 * Treat truncations to zero as deletion and always allow them,
817 * just like we do for '->unlink()'.
818 */
819 if (new_size || err != -ENOSPC)
820 return err;
821 budgeted = 0;
822 }
816 823
817 err = vmtruncate(inode, new_size); 824 err = vmtruncate(inode, new_size);
818 if (err) 825 if (err)
@@ -869,7 +876,12 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
869 err = ubifs_jnl_truncate(c, inode, old_size, new_size); 876 err = ubifs_jnl_truncate(c, inode, old_size, new_size);
870 mutex_unlock(&ui->ui_mutex); 877 mutex_unlock(&ui->ui_mutex);
871out_budg: 878out_budg:
872 ubifs_release_budget(c, &req); 879 if (budgeted)
880 ubifs_release_budget(c, &req);
881 else {
882 c->nospace = c->nospace_rp = 0;
883 smp_wmb();
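		/*
		 * The two assignments above clear the cached "out of space"
		 * flags: the unbudgeted truncation has freed space, so
		 * budgeting should retry rather than fail fast (assumed
		 * rationale, mirroring the unbudgeted '->unlink()' path).
		 */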
884 }
873 return err; 885 return err;
874} 886}
875 887
@@ -890,7 +902,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
890 loff_t new_size = attr->ia_size; 902 loff_t new_size = attr->ia_size;
891 struct ubifs_inode *ui = ubifs_inode(inode); 903 struct ubifs_inode *ui = ubifs_inode(inode);
892 struct ubifs_budget_req req = { .dirtied_ino = 1, 904 struct ubifs_budget_req req = { .dirtied_ino = 1,
893 .dirtied_ino_d = ui->data_len }; 905 .dirtied_ino_d = ALIGN(ui->data_len, 8) };
894 906
895 err = ubifs_budget_space(c, &req); 907 err = ubifs_budget_space(c, &req);
896 if (err) 908 if (err)
@@ -941,7 +953,8 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr)
941 struct inode *inode = dentry->d_inode; 953 struct inode *inode = dentry->d_inode;
942 struct ubifs_info *c = inode->i_sb->s_fs_info; 954 struct ubifs_info *c = inode->i_sb->s_fs_info;
943 955
944 dbg_gen("ino %lu, ia_valid %#x", inode->i_ino, attr->ia_valid); 956 dbg_gen("ino %lu, mode %#x, ia_valid %#x",
957 inode->i_ino, inode->i_mode, attr->ia_valid);
945 err = inode_change_ok(inode, attr); 958 err = inode_change_ok(inode, attr);
946 if (err) 959 if (err)
947 return err; 960 return err;
@@ -1051,7 +1064,7 @@ static int update_mctime(struct ubifs_info *c, struct inode *inode)
1051 if (mctime_update_needed(inode, &now)) { 1064 if (mctime_update_needed(inode, &now)) {
1052 int err, release; 1065 int err, release;
1053 struct ubifs_budget_req req = { .dirtied_ino = 1, 1066 struct ubifs_budget_req req = { .dirtied_ino = 1,
1054 .dirtied_ino_d = ui->data_len }; 1067 .dirtied_ino_d = ALIGN(ui->data_len, 8) };
1055 1068
1056 err = ubifs_budget_space(c, &req); 1069 err = ubifs_budget_space(c, &req);
1057 if (err) 1070 if (err)
@@ -1270,6 +1283,7 @@ struct file_operations ubifs_file_operations = {
1270 .fsync = ubifs_fsync, 1283 .fsync = ubifs_fsync,
1271 .unlocked_ioctl = ubifs_ioctl, 1284 .unlocked_ioctl = ubifs_ioctl,
1272 .splice_read = generic_file_splice_read, 1285 .splice_read = generic_file_splice_read,
1286 .splice_write = generic_file_splice_write,
1273#ifdef CONFIG_COMPAT 1287#ifdef CONFIG_COMPAT
1274 .compat_ioctl = ubifs_compat_ioctl, 1288 .compat_ioctl = ubifs_compat_ioctl,
1275#endif 1289#endif
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 10394c548367..47814cde2407 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -211,14 +211,8 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c,
211 * dirty index heap, and it falls-back to LPT scanning if the heaps are empty 211 * dirty index heap, and it falls-back to LPT scanning if the heaps are empty
212 * or do not have an LEB which satisfies the @min_space criteria. 212 * or do not have an LEB which satisfies the @min_space criteria.
213 * 213 *
214 * Note: 214 * Note, LEBs which have less than the dead watermark of free + dirty space are
215 * o LEBs which have less than the dead watermark of dirty space are never picked 215 * never picked by this function.
216 * by this function;
217 *
218 * Returns zero and the LEB properties of
219 * found dirty LEB in case of success, %-ENOSPC if no dirty LEB was found and a
220 * negative error code in case of other failures. The returned LEB is marked as
221 * "taken".
222 * 216 *
223 * The additional @pick_free argument controls if this function has to return a 217 * The additional @pick_free argument controls if this function has to return a
224 * free or freeable LEB if one is present. For example, GC must set it to %1, 218 * free or freeable LEB if one is present. For example, GC must set it to %1,
@@ -231,6 +225,10 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c,
231 * 225 *
232 * In addition @pick_free is set to %2 by the recovery process in order to 226 * In addition @pick_free is set to %2 by the recovery process in order to
233 * recover gc_lnum in which case an index LEB must not be returned. 227 * recover gc_lnum in which case an index LEB must not be returned.
228 *
229 * This function returns zero and the LEB properties of found dirty LEB in case
230 * of success, %-ENOSPC if no dirty LEB was found and a negative error code in
231 * case of other failures. The returned LEB is marked as "taken".
234 */ 232 */
235int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, 233int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
236 int min_space, int pick_free) 234 int min_space, int pick_free)
@@ -245,7 +243,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
245 int lebs, rsvd_idx_lebs = 0; 243 int lebs, rsvd_idx_lebs = 0;
246 244
247 spin_lock(&c->space_lock); 245 spin_lock(&c->space_lock);
248 lebs = c->lst.empty_lebs; 246 lebs = c->lst.empty_lebs + c->idx_gc_cnt;
249 lebs += c->freeable_cnt - c->lst.taken_empty_lebs; 247 lebs += c->freeable_cnt - c->lst.taken_empty_lebs;
250 248
251 /* 249 /*
@@ -290,9 +288,14 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
290 idx_lp = idx_heap->arr[0]; 288 idx_lp = idx_heap->arr[0];
291 sum = idx_lp->free + idx_lp->dirty; 289 sum = idx_lp->free + idx_lp->dirty;
292 /* 290 /*
293 * Since we reserve twice as much space for the index as it 291 * Since we reserve thrice as much space for the index as it
294 * actually takes, it does not make sense to pick indexing LEBs 292 * actually takes, it does not make sense to pick indexing LEBs
295 * with less than half LEB of dirty space. 293 * with less than, say, half LEB of dirty space. Maybe half is
294 * not the optimal boundary - this should be tested and
295 * checked. This boundary should determine how much we use
296 * in-the-gaps to consolidate the index compared to how much
297 * we use the garbage collector to consolidate it. The "half"
298 * criterion just feels fine.
296 */ 299 */
297 if (sum < min_space || sum < c->half_leb_size) 300 if (sum < min_space || sum < c->half_leb_size)
298 idx_lp = NULL; 301 idx_lp = NULL;
@@ -312,7 +315,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
312 lp = idx_lp; 315 lp = idx_lp;
313 316
314 if (lp) { 317 if (lp) {
315 ubifs_assert(lp->dirty >= c->dead_wm); 318 ubifs_assert(lp->free + lp->dirty >= c->dead_wm);
316 goto found; 319 goto found;
317 } 320 }
318 321
@@ -504,7 +507,6 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
504 rsvd_idx_lebs = 0; 507 rsvd_idx_lebs = 0;
505 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - 508 lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
506 c->lst.taken_empty_lebs; 509 c->lst.taken_empty_lebs;
507 ubifs_assert(lebs + c->lst.idx_lebs >= c->min_idx_lebs);
508 if (rsvd_idx_lebs < lebs) 510 if (rsvd_idx_lebs < lebs)
509 /* 511 /*
510 * OK to allocate an empty LEB, but we still don't want to go 512 * OK to allocate an empty LEB, but we still don't want to go
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index d0f3dac29081..02aba36fe3d4 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -334,15 +334,21 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
334 334
335 err = move_nodes(c, sleb); 335 err = move_nodes(c, sleb);
336 if (err) 336 if (err)
337 goto out; 337 goto out_inc_seq;
338 338
339 err = gc_sync_wbufs(c); 339 err = gc_sync_wbufs(c);
340 if (err) 340 if (err)
341 goto out; 341 goto out_inc_seq;
342 342
343 err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0); 343 err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0);
344 if (err) 344 if (err)
345 goto out; 345 goto out_inc_seq;
346
347 /* Allow for races with TNC */
348 c->gced_lnum = lnum;
349 smp_wmb();
350 c->gc_seq += 1;
351 smp_wmb();
346 352
347 if (c->gc_lnum == -1) { 353 if (c->gc_lnum == -1) {
348 c->gc_lnum = lnum; 354 c->gc_lnum = lnum;
@@ -363,6 +369,14 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
363out: 369out:
364 ubifs_scan_destroy(sleb); 370 ubifs_scan_destroy(sleb);
365 return err; 371 return err;
372
373out_inc_seq:
374 /* We may have moved at least some nodes so allow for races with TNC */
375 c->gced_lnum = lnum;
376 smp_wmb();
377 c->gc_seq += 1;
378 smp_wmb();
379 goto out;
366} 380}
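/*
 * A sketch of the intended consumer of the @gced_lnum/@gc_seq publication
 * above (assumed shape; the real reader lives in the TNC lookup path, which
 * is not part of this hunk). The writer orders gced_lnum before gc_seq with
 * smp_wmb(), so a reader that sees an unchanged gc_seq may trust the node it
 * read from @lnum:
 */
static int maybe_moved_by_gc(const struct ubifs_info *c, int lnum,
			     int start_seq)
{
	int gced_lnum = c->gced_lnum;
	int seq;

	smp_rmb();
	seq = c->gc_seq;
	if (seq == start_seq)
		return 0;		/* no GC completed meanwhile */
	if (seq == start_seq + 1 && gced_lnum != lnum)
		return 0;		/* exactly one GC, of another LEB */
	return 1;			/* be conservative: retry the lookup */
}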
367 381
368/** 382/**
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 3374f91b6709..054363f2b207 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -54,6 +54,20 @@
54#include "ubifs.h" 54#include "ubifs.h"
55 55
56/** 56/**
 57 * ubifs_ro_mode - switch UBIFS to read-only mode.
 58 * @c: UBIFS file-system description object
 59 * @err: error code which is the reason for switching to R/O mode
60 */
61void ubifs_ro_mode(struct ubifs_info *c, int err)
62{
63 if (!c->ro_media) {
64 c->ro_media = 1;
65 ubifs_warn("switched to read-only mode, error %d", err);
66 dbg_dump_stack();
67 }
68}
69
70/**
57 * ubifs_check_node - check node. 71 * ubifs_check_node - check node.
58 * @c: UBIFS file-system description object 72 * @c: UBIFS file-system description object
59 * @buf: node to check 73 * @buf: node to check
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 283155abe5f5..22993f867d19 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -447,13 +447,11 @@ static int get_dent_type(int mode)
447 * @ino: buffer in which to pack inode node 447 * @ino: buffer in which to pack inode node
448 * @inode: inode to pack 448 * @inode: inode to pack
449 * @last: indicates the last node of the group 449 * @last: indicates the last node of the group
450 * @last_reference: non-zero if this is a deletion inode
451 */ 450 */
452static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino, 451static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino,
453 const struct inode *inode, int last, 452 const struct inode *inode, int last)
454 int last_reference)
455{ 453{
456 int data_len = 0; 454 int data_len = 0, last_reference = !inode->i_nlink;
457 struct ubifs_inode *ui = ubifs_inode(inode); 455 struct ubifs_inode *ui = ubifs_inode(inode);
458 456
459 ino->ch.node_type = UBIFS_INO_NODE; 457 ino->ch.node_type = UBIFS_INO_NODE;
@@ -596,9 +594,9 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
596 ubifs_prep_grp_node(c, dent, dlen, 0); 594 ubifs_prep_grp_node(c, dent, dlen, 0);
597 595
598 ino = (void *)dent + aligned_dlen; 596 ino = (void *)dent + aligned_dlen;
599 pack_inode(c, ino, inode, 0, last_reference); 597 pack_inode(c, ino, inode, 0);
600 ino = (void *)ino + aligned_ilen; 598 ino = (void *)ino + aligned_ilen;
601 pack_inode(c, ino, dir, 1, 0); 599 pack_inode(c, ino, dir, 1);
602 600
603 if (last_reference) { 601 if (last_reference) {
604 err = ubifs_add_orphan(c, inode->i_ino); 602 err = ubifs_add_orphan(c, inode->i_ino);
@@ -606,6 +604,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
606 release_head(c, BASEHD); 604 release_head(c, BASEHD);
607 goto out_finish; 605 goto out_finish;
608 } 606 }
607 ui->del_cmtno = c->cmt_no;
609 } 608 }
610 609
611 err = write_head(c, BASEHD, dent, len, &lnum, &dent_offs, sync); 610 err = write_head(c, BASEHD, dent, len, &lnum, &dent_offs, sync);
@@ -750,30 +749,25 @@ out_free:
750 * ubifs_jnl_write_inode - flush inode to the journal. 749 * ubifs_jnl_write_inode - flush inode to the journal.
751 * @c: UBIFS file-system description object 750 * @c: UBIFS file-system description object
752 * @inode: inode to flush 751 * @inode: inode to flush
753 * @deletion: inode has been deleted
754 * 752 *
755 * This function writes inode @inode to the journal. If the inode is 753 * This function writes inode @inode to the journal. If the inode is
756 * synchronous, it also synchronizes the write-buffer. Returns zero in case of 754 * synchronous, it also synchronizes the write-buffer. Returns zero in case of
757 * success and a negative error code in case of failure. 755 * success and a negative error code in case of failure.
758 */ 756 */
759int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode, 757int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode)
760 int deletion)
761{ 758{
762 int err, len, lnum, offs, sync = 0; 759 int err, lnum, offs;
763 struct ubifs_ino_node *ino; 760 struct ubifs_ino_node *ino;
764 struct ubifs_inode *ui = ubifs_inode(inode); 761 struct ubifs_inode *ui = ubifs_inode(inode);
762 int sync = 0, len = UBIFS_INO_NODE_SZ, last_reference = !inode->i_nlink;
765 763
766 dbg_jnl("ino %lu%s", inode->i_ino, 764 dbg_jnl("ino %lu, nlink %u", inode->i_ino, inode->i_nlink);
767 deletion ? " (last reference)" : "");
768 if (deletion)
769 ubifs_assert(inode->i_nlink == 0);
770 765
771 len = UBIFS_INO_NODE_SZ;
772 /* 766 /*
773 * If the inode is being deleted, do not write the attached data. No 767 * If the inode is being deleted, do not write the attached data. No
774 * need to synchronize the write-buffer either. 768 * need to synchronize the write-buffer either.
775 */ 769 */
776 if (!deletion) { 770 if (!last_reference) {
777 len += ui->data_len; 771 len += ui->data_len;
778 sync = IS_SYNC(inode); 772 sync = IS_SYNC(inode);
779 } 773 }
@@ -786,7 +780,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode,
786 if (err) 780 if (err)
787 goto out_free; 781 goto out_free;
788 782
789 pack_inode(c, ino, inode, 1, deletion); 783 pack_inode(c, ino, inode, 1);
790 err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync); 784 err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync);
791 if (err) 785 if (err)
792 goto out_release; 786 goto out_release;
@@ -795,7 +789,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode,
795 inode->i_ino); 789 inode->i_ino);
796 release_head(c, BASEHD); 790 release_head(c, BASEHD);
797 791
798 if (deletion) { 792 if (last_reference) {
799 err = ubifs_tnc_remove_ino(c, inode->i_ino); 793 err = ubifs_tnc_remove_ino(c, inode->i_ino);
800 if (err) 794 if (err)
801 goto out_ro; 795 goto out_ro;
@@ -828,6 +822,65 @@ out_free:
828} 822}
829 823
830/** 824/**
825 * ubifs_jnl_delete_inode - delete an inode.
826 * @c: UBIFS file-system description object
827 * @inode: inode to delete
828 *
829 * This function deletes inode @inode which includes removing it from orphans,
830 * deleting it from TNC and, in some cases, writing a deletion inode to the
831 * journal.
832 *
833 * When regular file inodes are unlinked or a directory inode is removed, the
834 * 'ubifs_jnl_update()' function writes a corresponding deletion inode and
835 * direntry to the media, and adds the inode to orphans. After this, when the
836 * last reference to this inode has been dropped, this function is called. In
837 * general, it has to write one more deletion inode to the media, because if
838 * a commit happened between 'ubifs_jnl_update()' and
839 * 'ubifs_jnl_delete_inode()', the deletion inode is not in the journal
840 * anymore, and in fact it might not be on the flash anymore, because it might
841 * have been garbage-collected already. And for optimization reasons UBIFS does
842 * not read the orphan area if it has been unmounted cleanly, so it would have
843 * no indication in the journal that there is a deleted inode which has to be
844 * removed from TNC.
845 *
846 * However, if there was no commit between 'ubifs_jnl_update()' and
847 * 'ubifs_jnl_delete_inode()', then there is no need to write the deletion
848 * inode to the media for the second time. And this is quite a typical case.
849 *
850 * This function returns zero in case of success and a negative error code in
851 * case of failure.
852 */
853int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode)
854{
855 int err;
856 struct ubifs_inode *ui = ubifs_inode(inode);
857
858 ubifs_assert(inode->i_nlink == 0);
859
860 if (ui->del_cmtno != c->cmt_no)
861 /* A commit happened for sure */
862 return ubifs_jnl_write_inode(c, inode);
863
864 down_read(&c->commit_sem);
865 /*
866 * Check commit number again, because the first test has been done
867 * without @c->commit_sem, so a commit might have happened.
868 */
869 if (ui->del_cmtno != c->cmt_no) {
870 up_read(&c->commit_sem);
871 return ubifs_jnl_write_inode(c, inode);
872 }
873
874 err = ubifs_tnc_remove_ino(c, inode->i_ino);
875 if (err)
876 ubifs_ro_mode(c, err);
877 else
878 ubifs_delete_orphan(c, inode->i_ino);
879 up_read(&c->commit_sem);
880 return err;
881}
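/*
 * A minimal stand-alone illustration of the "check, lock, re-check" pattern
 * used by 'ubifs_jnl_delete_inode()' above, with assumed names and pthreads
 * standing in for @c->commit_sem (the commit path would take the write lock
 * while advancing the commit number):
 */
#include <pthread.h>

static pthread_rwlock_t commit_sem = PTHREAD_RWLOCK_INITIALIZER;
static unsigned long long cmt_no;	/* advanced by the "commit" path */

static int write_deletion_inode(void) { return 0; }	/* slow path stub */
static int remove_from_index(void) { return 0; }	/* TNC removal stub */

int delete_inode_like(unsigned long long del_cmtno)
{
	int err;

	if (del_cmtno != cmt_no)		/* unlocked test */
		return write_deletion_inode();	/* a commit happened for sure */

	pthread_rwlock_rdlock(&commit_sem);
	if (del_cmtno != cmt_no) {		/* re-test under the lock */
		pthread_rwlock_unlock(&commit_sem);
		return write_deletion_inode();
	}
	err = remove_from_index();	/* no commit can run while we hold it */
	pthread_rwlock_unlock(&commit_sem);
	return err;
}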
882
883/**
831 * ubifs_jnl_rename - rename a directory entry. 884 * ubifs_jnl_rename - rename a directory entry.
832 * @c: UBIFS file-system description object 885 * @c: UBIFS file-system description object
833 * @old_dir: parent inode of directory entry to rename 886 * @old_dir: parent inode of directory entry to rename
@@ -917,16 +970,16 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
917 970
918 p = (void *)dent2 + aligned_dlen2; 971 p = (void *)dent2 + aligned_dlen2;
919 if (new_inode) { 972 if (new_inode) {
920 pack_inode(c, p, new_inode, 0, last_reference); 973 pack_inode(c, p, new_inode, 0);
921 p += ALIGN(ilen, 8); 974 p += ALIGN(ilen, 8);
922 } 975 }
923 976
924 if (!move) 977 if (!move)
925 pack_inode(c, p, old_dir, 1, 0); 978 pack_inode(c, p, old_dir, 1);
926 else { 979 else {
927 pack_inode(c, p, old_dir, 0, 0); 980 pack_inode(c, p, old_dir, 0);
928 p += ALIGN(plen, 8); 981 p += ALIGN(plen, 8);
929 pack_inode(c, p, new_dir, 1, 0); 982 pack_inode(c, p, new_dir, 1);
930 } 983 }
931 984
932 if (last_reference) { 985 if (last_reference) {
@@ -935,6 +988,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
935 release_head(c, BASEHD); 988 release_head(c, BASEHD);
936 goto out_finish; 989 goto out_finish;
937 } 990 }
991 new_ui->del_cmtno = c->cmt_no;
938 } 992 }
939 993
940 err = write_head(c, BASEHD, dent, len, &lnum, &offs, sync); 994 err = write_head(c, BASEHD, dent, len, &lnum, &offs, sync);
@@ -1131,7 +1185,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
1131 if (err) 1185 if (err)
1132 goto out_free; 1186 goto out_free;
1133 1187
1134 pack_inode(c, ino, inode, 0, 0); 1188 pack_inode(c, ino, inode, 0);
1135 ubifs_prep_grp_node(c, trun, UBIFS_TRUN_NODE_SZ, dlen ? 0 : 1); 1189 ubifs_prep_grp_node(c, trun, UBIFS_TRUN_NODE_SZ, dlen ? 0 : 1);
1136 if (dlen) 1190 if (dlen)
1137 ubifs_prep_grp_node(c, dn, dlen, 1); 1191 ubifs_prep_grp_node(c, dn, dlen, 1);
@@ -1251,9 +1305,9 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
1251 ubifs_prep_grp_node(c, xent, xlen, 0); 1305 ubifs_prep_grp_node(c, xent, xlen, 0);
1252 1306
1253 ino = (void *)xent + aligned_xlen; 1307 ino = (void *)xent + aligned_xlen;
1254 pack_inode(c, ino, inode, 0, 1); 1308 pack_inode(c, ino, inode, 0);
1255 ino = (void *)ino + UBIFS_INO_NODE_SZ; 1309 ino = (void *)ino + UBIFS_INO_NODE_SZ;
1256 pack_inode(c, ino, host, 1, 0); 1310 pack_inode(c, ino, host, 1);
1257 1311
1258 err = write_head(c, BASEHD, xent, len, &lnum, &xent_offs, sync); 1312 err = write_head(c, BASEHD, xent, len, &lnum, &xent_offs, sync);
1259 if (!sync && !err) 1313 if (!sync && !err)
@@ -1320,7 +1374,7 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode,
1320 const struct inode *host) 1374 const struct inode *host)
1321{ 1375{
1322 int err, len1, len2, aligned_len, aligned_len1, lnum, offs; 1376 int err, len1, len2, aligned_len, aligned_len1, lnum, offs;
1323 struct ubifs_inode *host_ui = ubifs_inode(inode); 1377 struct ubifs_inode *host_ui = ubifs_inode(host);
1324 struct ubifs_ino_node *ino; 1378 struct ubifs_ino_node *ino;
1325 union ubifs_key key; 1379 union ubifs_key key;
1326 int sync = IS_DIRSYNC(host); 1380 int sync = IS_DIRSYNC(host);
@@ -1344,8 +1398,8 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode,
1344 if (err) 1398 if (err)
1345 goto out_free; 1399 goto out_free;
1346 1400
1347 pack_inode(c, ino, host, 0, 0); 1401 pack_inode(c, ino, host, 0);
1348 pack_inode(c, (void *)ino + aligned_len1, inode, 1, 0); 1402 pack_inode(c, (void *)ino + aligned_len1, inode, 1);
1349 1403
1350 err = write_head(c, BASEHD, ino, aligned_len, &lnum, &offs, 0); 1404 err = write_head(c, BASEHD, ino, aligned_len, &lnum, &offs, 0);
1351 if (!sync && !err) { 1405 if (!sync && !err) {
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 36857b9ed59e..3e0aa7367556 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -317,6 +317,8 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
317 return 0; 317 return 0;
318 318
319out_unlock: 319out_unlock:
320 if (err != -EAGAIN)
321 ubifs_ro_mode(c, err);
320 mutex_unlock(&c->log_mutex); 322 mutex_unlock(&c->log_mutex);
321 kfree(ref); 323 kfree(ref);
322 kfree(bud); 324 kfree(bud);
@@ -410,7 +412,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
410 return -ENOMEM; 412 return -ENOMEM;
411 413
412 cs->ch.node_type = UBIFS_CS_NODE; 414 cs->ch.node_type = UBIFS_CS_NODE;
413 cs->cmt_no = cpu_to_le64(c->cmt_no + 1); 415 cs->cmt_no = cpu_to_le64(c->cmt_no);
414 ubifs_prepare_node(c, cs, UBIFS_CS_NODE_SZ, 0); 416 ubifs_prepare_node(c, cs, UBIFS_CS_NODE_SZ, 0);
415 417
416 /* 418 /*
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index 4beccfc256d2..4c12a9215d7f 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -80,20 +80,6 @@ static inline struct ubifs_inode *ubifs_inode(const struct inode *inode)
80} 80}
81 81
82/** 82/**
 83 * ubifs_ro_mode - switch UBIFS to read-only mode.
 84 * @c: UBIFS file-system description object
 85 * @err: error code which is the reason for switching to R/O mode
86 */
87static inline void ubifs_ro_mode(struct ubifs_info *c, int err)
88{
89 if (!c->ro_media) {
90 c->ro_media = 1;
91 ubifs_warn("switched to read-only mode, error %d", err);
92 dbg_dump_stack();
93 }
94}
95
96/**
97 * ubifs_compr_present - check if compressor was compiled in. 83 * ubifs_compr_present - check if compressor was compiled in.
98 * @compr_type: compressor type to check 84 * @compr_type: compressor type to check
99 * 85 *
@@ -298,38 +284,6 @@ static inline void *ubifs_idx_key(const struct ubifs_info *c,
298} 284}
299 285
300/** 286/**
301 * ubifs_reported_space - calculate reported free space.
302 * @c: the UBIFS file-system description object
303 * @free: amount of free space
304 *
 305 * This function calculates the amount of free space which will be reported to
 306 * user-space. User-space applications tend to expect that if the file-system
 307 * (e.g., via the 'statfs()' call) reports that it has N bytes available, they
 308 * are able to write a file of size N. UBIFS attaches node headers to each data
 309 * node and it has to write indexing nodes as well. This introduces additional
 310 * overhead, so UBIFS has to report slightly less free space to meet the
 311 * above expectation.
312 *
313 * This function assumes free space is made up of uncompressed data nodes and
314 * full index nodes (one per data node, doubled because we always allow enough
315 * space to write the index twice).
316 *
317 * Note, the calculation is pessimistic, which means that most of the time
318 * UBIFS reports less space than it actually has.
319 */
320static inline long long ubifs_reported_space(const struct ubifs_info *c,
321 uint64_t free)
322{
323 int divisor, factor;
324
325 divisor = UBIFS_MAX_DATA_NODE_SZ + (c->max_idx_node_sz << 1);
326 factor = UBIFS_MAX_DATA_NODE_SZ - UBIFS_DATA_NODE_SZ;
327 do_div(free, divisor);
328
329 return free * factor;
330}
331
332/**
333 * ubifs_current_time - round current time to time granularity. 287 * ubifs_current_time - round current time to time granularity.
334 * @inode: inode 288 * @inode: inode
335 */ 289 */
@@ -339,4 +293,21 @@ static inline struct timespec ubifs_current_time(struct inode *inode)
339 current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; 293 current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
340} 294}
341 295
296/**
297 * ubifs_tnc_lookup - look up a file-system node.
298 * @c: UBIFS file-system description object
299 * @key: node key to lookup
300 * @node: the node is returned here
301 *
 302 * This function looks up and reads the node with key @key. The caller has to make
303 * sure the @node buffer is large enough to fit the node. Returns zero in case
304 * of success, %-ENOENT if the node was not found, and a negative error code in
305 * case of failure.
306 */
307static inline int ubifs_tnc_lookup(struct ubifs_info *c,
308 const union ubifs_key *key, void *node)
309{
310 return ubifs_tnc_locate(c, key, node, NULL, NULL);
311}
312
342#endif /* __UBIFS_MISC_H__ */ 313#endif /* __UBIFS_MISC_H__ */
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 3afeb9242c6a..02d3462f4d3e 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -310,10 +310,10 @@ static int write_orph_node(struct ubifs_info *c, int atomic)
310 c->cmt_orphans -= cnt; 310 c->cmt_orphans -= cnt;
311 spin_unlock(&c->orphan_lock); 311 spin_unlock(&c->orphan_lock);
312 if (c->cmt_orphans) 312 if (c->cmt_orphans)
313 orph->cmt_no = cpu_to_le64(c->cmt_no + 1); 313 orph->cmt_no = cpu_to_le64(c->cmt_no);
314 else 314 else
315 /* Mark the last node of the commit */ 315 /* Mark the last node of the commit */
316 orph->cmt_no = cpu_to_le64((c->cmt_no + 1) | (1ULL << 63)); 316 orph->cmt_no = cpu_to_le64((c->cmt_no) | (1ULL << 63));
317 ubifs_assert(c->ohead_offs + len <= c->leb_size); 317 ubifs_assert(c->ohead_offs + len <= c->leb_size);
318 ubifs_assert(c->ohead_lnum >= c->orph_first); 318 ubifs_assert(c->ohead_lnum >= c->orph_first);
319 ubifs_assert(c->ohead_lnum <= c->orph_last); 319 ubifs_assert(c->ohead_lnum <= c->orph_last);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index ca1e2d4e03cc..9a9220333b3b 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -30,7 +30,6 @@
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/module.h> 31#include <linux/module.h>
32#include <linux/ctype.h> 32#include <linux/ctype.h>
33#include <linux/random.h>
34#include <linux/kthread.h> 33#include <linux/kthread.h>
35#include <linux/parser.h> 34#include <linux/parser.h>
36#include <linux/seq_file.h> 35#include <linux/seq_file.h>
@@ -149,7 +148,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
149 if (err) 148 if (err)
150 goto out_invalid; 149 goto out_invalid;
151 150
152 /* Disable readahead */ 151 /* Disable read-ahead */
153 inode->i_mapping->backing_dev_info = &c->bdi; 152 inode->i_mapping->backing_dev_info = &c->bdi;
154 153
155 switch (inode->i_mode & S_IFMT) { 154 switch (inode->i_mode & S_IFMT) {
@@ -278,7 +277,7 @@ static void ubifs_destroy_inode(struct inode *inode)
278 */ 277 */
279static int ubifs_write_inode(struct inode *inode, int wait) 278static int ubifs_write_inode(struct inode *inode, int wait)
280{ 279{
281 int err; 280 int err = 0;
282 struct ubifs_info *c = inode->i_sb->s_fs_info; 281 struct ubifs_info *c = inode->i_sb->s_fs_info;
283 struct ubifs_inode *ui = ubifs_inode(inode); 282 struct ubifs_inode *ui = ubifs_inode(inode);
284 283
@@ -299,10 +298,18 @@ static int ubifs_write_inode(struct inode *inode, int wait)
299 return 0; 298 return 0;
300 } 299 }
301 300
302 dbg_gen("inode %lu", inode->i_ino); 301 /*
303 err = ubifs_jnl_write_inode(c, inode, 0); 302 * As an optimization, do not write orphan inodes to the media,
304 if (err) 303 * because this is not needed.
305 ubifs_err("can't write inode %lu, error %d", inode->i_ino, err); 304 */
305 dbg_gen("inode %lu, mode %#x, nlink %u",
306 inode->i_ino, (int)inode->i_mode, inode->i_nlink);
307 if (inode->i_nlink) {
308 err = ubifs_jnl_write_inode(c, inode);
309 if (err)
310 ubifs_err("can't write inode %lu, error %d",
311 inode->i_ino, err);
312 }
306 313
307 ui->dirty = 0; 314 ui->dirty = 0;
308 mutex_unlock(&ui->ui_mutex); 315 mutex_unlock(&ui->ui_mutex);
@@ -314,8 +321,9 @@ static void ubifs_delete_inode(struct inode *inode)
314{ 321{
315 int err; 322 int err;
316 struct ubifs_info *c = inode->i_sb->s_fs_info; 323 struct ubifs_info *c = inode->i_sb->s_fs_info;
324 struct ubifs_inode *ui = ubifs_inode(inode);
317 325
318 if (ubifs_inode(inode)->xattr) 326 if (ui->xattr)
319 /* 327 /*
320 * Extended attribute inode deletions are fully handled in 328 * Extended attribute inode deletions are fully handled in
321 * 'ubifs_removexattr()'. These inodes are special and have 329 * 'ubifs_removexattr()'. These inodes are special and have
@@ -323,7 +331,7 @@ static void ubifs_delete_inode(struct inode *inode)
323 */ 331 */
324 goto out; 332 goto out;
325 333
326 dbg_gen("inode %lu", inode->i_ino); 334 dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode);
327 ubifs_assert(!atomic_read(&inode->i_count)); 335 ubifs_assert(!atomic_read(&inode->i_count));
328 ubifs_assert(inode->i_nlink == 0); 336 ubifs_assert(inode->i_nlink == 0);
329 337
@@ -331,15 +339,19 @@ static void ubifs_delete_inode(struct inode *inode)
331 if (is_bad_inode(inode)) 339 if (is_bad_inode(inode))
332 goto out; 340 goto out;
333 341
334 ubifs_inode(inode)->ui_size = inode->i_size = 0; 342 ui->ui_size = inode->i_size = 0;
335 err = ubifs_jnl_write_inode(c, inode, 1); 343 err = ubifs_jnl_delete_inode(c, inode);
336 if (err) 344 if (err)
337 /* 345 /*
338 * Worst case we have a lost orphan inode wasting space, so a 346 * Worst case we have a lost orphan inode wasting space, so a
339 * simple error message is ok here. 347 * simple error message is OK here.
340 */ 348 */
341 ubifs_err("can't write inode %lu, error %d", inode->i_ino, err); 349 ubifs_err("can't delete inode %lu, error %d",
350 inode->i_ino, err);
351
342out: 352out:
353 if (ui->dirty)
354 ubifs_release_dirty_inode_budget(c, ui);
343 clear_inode(inode); 355 clear_inode(inode);
344} 356}
345 357
@@ -358,8 +370,9 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)
358{ 370{
359 struct ubifs_info *c = dentry->d_sb->s_fs_info; 371 struct ubifs_info *c = dentry->d_sb->s_fs_info;
360 unsigned long long free; 372 unsigned long long free;
373 __le32 *uuid = (__le32 *)c->uuid;
361 374
362 free = ubifs_budg_get_free_space(c); 375 free = ubifs_get_free_space(c);
363 dbg_gen("free space %lld bytes (%lld blocks)", 376 dbg_gen("free space %lld bytes (%lld blocks)",
364 free, free >> UBIFS_BLOCK_SHIFT); 377 free, free >> UBIFS_BLOCK_SHIFT);
365 378
@@ -374,7 +387,8 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)
374 buf->f_files = 0; 387 buf->f_files = 0;
375 buf->f_ffree = 0; 388 buf->f_ffree = 0;
376 buf->f_namelen = UBIFS_MAX_NLEN; 389 buf->f_namelen = UBIFS_MAX_NLEN;
377 390 buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]);
391 buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]);
378 return 0; 392 return 0;
379} 393}
380 394
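
The two added lines derive the 64-bit statfs f_fsid by folding the 16-byte volume UUID: little-endian word 0 is XOR-ed with word 2 and word 1 with word 3. A minimal user-space sketch of the same folding (the UUID bytes are made up, and le32_to_cpu() is dropped by assuming a little-endian host):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            /* arbitrary example UUID, 16 bytes */
            uint8_t uuid[16] = {
                    0x12, 0x34, 0x56, 0x78, 0x9a, 0xbc, 0xde, 0xf0,
                    0x0f, 0xed, 0xcb, 0xa9, 0x87, 0x65, 0x43, 0x21,
            };
            uint32_t w[4];

            memcpy(w, uuid, sizeof(w));
            /* same folding as the hunk above: 128 bits down to 64 */
            printf("f_fsid = %08x:%08x\n",
                   (unsigned)(w[0] ^ w[2]), (unsigned)(w[1] ^ w[3]));
            return 0;
    }
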
@@ -518,6 +532,12 @@ static int init_constants_early(struct ubifs_info *c)
518 c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size); 532 c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size);
519 c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size); 533 c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size);
520 534
535 /*
536 * Calculate how many bytes would be wasted at the end of LEB if it was
537 * fully filled with data nodes of maximum size. This is used in
538 * calculations when reporting free space.
539 */
540 c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ;
521 return 0; 541 return 0;
522} 542}
523 543
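
For a sense of scale (all figures assumed, not taken from this diff): on common large-page NAND a LEB holds 126976 bytes, and a maximum-size data node is 4096 bytes of data plus a 48-byte header. The modulo above then leaves a tail that no data node can fill:

    #include <stdio.h>

    int main(void)
    {
            int leb_size = 126976;            /* assumed LEB size */
            int max_data_node_sz = 4096 + 48; /* assumed UBIFS_MAX_DATA_NODE_SZ */

            /* mirrors: c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ */
            printf("leb_overhead = %d bytes\n", leb_size % max_data_node_sz);
            return 0;
    }

which prints "leb_overhead = 2656 bytes" for these inputs.
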
@@ -635,13 +655,11 @@ static int init_constants_late(struct ubifs_info *c)
635 * internally because it does not make much sense for UBIFS, but it is 655 * internally because it does not make much sense for UBIFS, but it is
636 * necessary to report something for the 'statfs()' call. 656 * necessary to report something for the 'statfs()' call.
637 * 657 *
638 * Subtract the LEB reserved for GC and the LEB which is reserved for 658 * Subtract the LEB reserved for GC, the LEB which is reserved for
639 * deletions. 659 * deletions, and assume only one journal head is available.
640 *
641 * Review 'ubifs_calc_available()' if changing this calculation.
642 */ 660 */
643 tmp64 = c->main_lebs - 2; 661 tmp64 = c->main_lebs - 2 - c->jhead_cnt + 1;
644 tmp64 *= (uint64_t)c->leb_size - c->dark_wm; 662 tmp64 *= (uint64_t)c->leb_size - c->leb_overhead;
645 tmp64 = ubifs_reported_space(c, tmp64); 663 tmp64 = ubifs_reported_space(c, tmp64);
646 c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT; 664 c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT;
647 665
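
Continuing with the same assumed figures, plus a guessed journal-head count of 3, the raw byte count fed into 'ubifs_reported_space()' (which trims it further for per-node overhead) would come out as:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            int main_lebs = 100, jhead_cnt = 3;          /* assumed */
            int leb_size = 126976, leb_overhead = 2656;  /* assumed, see above */
            uint64_t tmp64 = (uint64_t)(main_lebs - 2 - jhead_cnt + 1)
                             * (uint64_t)(leb_size - leb_overhead);

            printf("%llu bytes, %llu 4KiB blocks\n",
                   (unsigned long long)tmp64,
                   (unsigned long long)(tmp64 >> 12)); /* UBIFS_BLOCK_SHIFT */
            return 0;
    }
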
@@ -830,7 +848,7 @@ enum {
830 Opt_err, 848 Opt_err,
831}; 849};
832 850
833static match_table_t tokens = { 851static const match_table_t tokens = {
834 {Opt_fast_unmount, "fast_unmount"}, 852 {Opt_fast_unmount, "fast_unmount"},
835 {Opt_norm_unmount, "norm_unmount"}, 853 {Opt_norm_unmount, "norm_unmount"},
836 {Opt_err, NULL}, 854 {Opt_err, NULL},
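
Const-ifying the table works because lib/parser.c only ever reads it. For context, the consuming side of such a table looks roughly like this (a sketch: 'options' and 'c' come from the surrounding mount code, and the unmount_mode encodings are assumptions):

    char *p;
    substring_t args[MAX_OPT_ARGS];

    while ((p = strsep(&options, ",")) != NULL) {
            if (!*p)
                    continue;
            switch (match_token(p, tokens, args)) {
            case Opt_fast_unmount:
                    c->mount_opts.unmount_mode = 2; /* assumed encoding */
                    break;
            case Opt_norm_unmount:
                    c->mount_opts.unmount_mode = 1; /* assumed encoding */
                    break;
            default:
                    return -EINVAL;
            }
    }
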
@@ -1006,14 +1024,13 @@ static int mount_ubifs(struct ubifs_info *c)
1006 goto out_dereg; 1024 goto out_dereg;
1007 } 1025 }
1008 1026
1027 sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id);
1009 if (!mounted_read_only) { 1028 if (!mounted_read_only) {
1010 err = alloc_wbufs(c); 1029 err = alloc_wbufs(c);
1011 if (err) 1030 if (err)
1012 goto out_cbuf; 1031 goto out_cbuf;
1013 1032
1014 /* Create background thread */ 1033 /* Create background thread */
1015 sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num,
1016 c->vi.vol_id);
1017 c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); 1034 c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name);
1018 if (!c->bgt) 1035 if (!c->bgt)
1019 c->bgt = ERR_PTR(-EINVAL); 1036 c->bgt = ERR_PTR(-EINVAL);
@@ -1122,8 +1139,8 @@ static int mount_ubifs(struct ubifs_info *c)
1122 if (err) 1139 if (err)
1123 goto out_infos; 1140 goto out_infos;
1124 1141
1125 ubifs_msg("mounted UBI device %d, volume %d", c->vi.ubi_num, 1142 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"",
1126 c->vi.vol_id); 1143 c->vi.ubi_num, c->vi.vol_id, c->vi.name);
1127 if (mounted_read_only) 1144 if (mounted_read_only)
1128 ubifs_msg("mounted read-only"); 1145 ubifs_msg("mounted read-only");
1129 x = (long long)c->main_lebs * c->leb_size; 1146 x = (long long)c->main_lebs * c->leb_size;
@@ -1469,6 +1486,7 @@ static void ubifs_put_super(struct super_block *sb)
1469 */ 1486 */
1470 ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0); 1487 ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0);
1471 ubifs_assert(c->budg_idx_growth == 0); 1488 ubifs_assert(c->budg_idx_growth == 0);
1489 ubifs_assert(c->budg_dd_growth == 0);
1472 ubifs_assert(c->budg_data_growth == 0); 1490 ubifs_assert(c->budg_data_growth == 0);
1473 1491
1474 /* 1492 /*
@@ -1657,7 +1675,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1657 INIT_LIST_HEAD(&c->orph_new); 1675 INIT_LIST_HEAD(&c->orph_new);
1658 1676
1659 c->highest_inum = UBIFS_FIRST_INO; 1677 c->highest_inum = UBIFS_FIRST_INO;
1660 get_random_bytes(&c->vfs_gen, sizeof(int));
1661 c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM; 1678 c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM;
1662 1679
1663 ubi_get_volume_info(ubi, &c->vi); 1680 ubi_get_volume_info(ubi, &c->vi);
@@ -1671,10 +1688,10 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1671 } 1688 }
1672 1689
1673 /* 1690 /*
1674 * UBIFS provids 'backing_dev_info' in order to disable readahead. For 1691 * UBIFS provides 'backing_dev_info' in order to disable read-ahead. For
1675 * UBIFS, I/O is not deferred, it is done immediately in readpage, 1692 * UBIFS, I/O is not deferred, it is done immediately in readpage,
1676 * which means the user would have to wait not just for their own I/O 1693 * which means the user would have to wait not just for their own I/O
1677 * but the readahead I/O as well i.e. completely pointless. 1694 * but the read-ahead I/O as well i.e. completely pointless.
1678 * 1695 *
1679 * Read-ahead will be disabled because @c->bdi.ra_pages is 0. 1696 * Read-ahead will be disabled because @c->bdi.ra_pages is 0.
1680 */ 1697 */
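
In kernels of this era the code following that comment plausibly continues along these lines (a sketch, not part of the hunk; the capability flag, unplug hook, and error label are assumptions):

    c->bdi.capabilities = BDI_CAP_MAP_COPY;     /* RAM-backed, no writeback */
    c->bdi.unplug_io_fn = default_unplug_io_fn;
    err = bdi_init(&c->bdi);  /* leaves ra_pages at 0: read-ahead disabled */
    if (err)
            goto out_close;   /* assumed unwind label */
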
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index e909f4a96443..7634c5970887 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -506,7 +506,7 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
506 if (keys_cmp(c, key, &node_key) != 0) 506 if (keys_cmp(c, key, &node_key) != 0)
507 ret = 0; 507 ret = 0;
508 } 508 }
509 if (ret == 0) 509 if (ret == 0 && c->replaying)
510 dbg_mnt("dangling branch LEB %d:%d len %d, key %s", 510 dbg_mnt("dangling branch LEB %d:%d len %d, key %s",
511 zbr->lnum, zbr->offs, zbr->len, DBGKEY(key)); 511 zbr->lnum, zbr->offs, zbr->len, DBGKEY(key));
512 return ret; 512 return ret;
@@ -1382,50 +1382,39 @@ static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key,
1382} 1382}
1383 1383
1384/** 1384/**
1385 * ubifs_tnc_lookup - look up a file-system node. 1385 * maybe_leb_gced - determine if a LEB may have been garbage collected.
1386 * @c: UBIFS file-system description object 1386 * @c: UBIFS file-system description object
1387 * @key: node key to lookup 1387 * @lnum: LEB number
1388 * @node: the node is returned here 1388 * @gc_seq1: garbage collection sequence number
1389 * 1389 *
1390 * This function look up and reads node with key @key. The caller has to make 1390 * This function determines if @lnum may have been garbage collected since
1391 * sure the @node buffer is large enough to fit the node. Returns zero in case 1391 * sequence number @gc_seq1. If it may have been then %1 is returned, otherwise
1392 * of success, %-ENOENT if the node was not found, and a negative error code in 1392 * %0 is returned.
1393 * case of failure.
1394 */ 1393 */
1395int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key, 1394static int maybe_leb_gced(struct ubifs_info *c, int lnum, int gc_seq1)
1396 void *node)
1397{ 1395{
1398 int found, n, err; 1396 int gc_seq2, gced_lnum;
1399 struct ubifs_znode *znode;
1400 struct ubifs_zbranch zbr, *zt;
1401 1397
1402 mutex_lock(&c->tnc_mutex); 1398 gced_lnum = c->gced_lnum;
1403 found = ubifs_lookup_level0(c, key, &znode, &n); 1399 smp_rmb();
1404 if (!found) { 1400 gc_seq2 = c->gc_seq;
1405 err = -ENOENT; 1401 /* Same seq means no GC */
1406 goto out; 1402 if (gc_seq1 == gc_seq2)
1407 } else if (found < 0) { 1403 return 0;
1408 err = found; 1404 /* Different by more than 1 means we don't know */
1409 goto out; 1405 if (gc_seq1 + 1 != gc_seq2)
1410 } 1406 return 1;
1411 zt = &znode->zbranch[n]; 1407 /*
1412 if (is_hash_key(c, key)) { 1408 * We have seen the sequence number has increased by 1. Now we need to
1413 /* 1409 * be sure we read the right LEB number, so read it again.
1414 * In this case the leaf node cache gets used, so we pass the 1410 */
1415 * address of the zbranch and keep the mutex locked 1411 smp_rmb();
1416 */ 1412 if (gced_lnum != c->gced_lnum)
1417 err = tnc_read_node_nm(c, zt, node); 1413 return 1;
1418 goto out; 1414 /* Finally we can check lnum */
1419 } 1415 if (gced_lnum == lnum)
1420 zbr = znode->zbranch[n]; 1416 return 1;
1421 mutex_unlock(&c->tnc_mutex); 1417 return 0;
1422
1423 err = ubifs_tnc_read_node(c, &zbr, node);
1424 return err;
1425
1426out:
1427 mutex_unlock(&c->tnc_mutex);
1428 return err;
1429} 1418}
1430 1419
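
The double read of @c->gced_lnum around the sequence check only works if the GC side publishes in the opposite order: LEB number first, write barrier, then the sequence increment. A runnable user-space analogue of that pairing, with C11 fences standing in for smp_wmb()/smp_rmb() (all names illustrative, not the real UBIFS GC code):

    #include <stdatomic.h>
    #include <stdio.h>

    static int gced_lnum;      /* last LEB moved by "GC" */
    static atomic_int gc_seq;  /* bumped after every move */

    static void gc_moved_leb(int lnum)
    {
            gced_lnum = lnum;
            atomic_thread_fence(memory_order_release); /* ~ smp_wmb() */
            atomic_fetch_add(&gc_seq, 1);
    }

    static int maybe_gced(int lnum, int seq1)
    {
            int lnum1 = gced_lnum;
            atomic_thread_fence(memory_order_acquire); /* ~ smp_rmb() */
            int seq2 = atomic_load(&gc_seq);

            if (seq1 == seq2)
                    return 0;  /* nothing was collected meanwhile */
            if (seq1 + 1 != seq2)
                    return 1;  /* several moves: assume the worst */
            atomic_thread_fence(memory_order_acquire);
            if (lnum1 != gced_lnum)
                    return 1;  /* moved again while we were looking */
            return lnum1 == lnum;
    }

    int main(void)
    {
            int seq1 = atomic_load(&gc_seq);

            gc_moved_leb(7);
            printf("%d %d\n", maybe_gced(7, seq1), maybe_gced(8, seq1));
            return 0;
    }
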
1431/** 1420/**
@@ -1436,16 +1425,19 @@ out:
1436 * @lnum: LEB number is returned here 1425 * @lnum: LEB number is returned here
1437 * @offs: offset is returned here 1426 * @offs: offset is returned here
1438 * 1427 *
 1439 * This function is the same as 'ubifs_tnc_lookup()' but it returns the node 1428 * This function looks up and reads the node with key @key. The caller has to make
1440 * location also. See 'ubifs_tnc_lookup()'. 1429 * sure the @node buffer is large enough to fit the node. Returns zero in case
1430 * of success, %-ENOENT if the node was not found, and a negative error code in
1431 * case of failure. The node location can be returned in @lnum and @offs.
1441 */ 1432 */
1442int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, 1433int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
1443 void *node, int *lnum, int *offs) 1434 void *node, int *lnum, int *offs)
1444{ 1435{
1445 int found, n, err; 1436 int found, n, err, safely = 0, gc_seq1;
1446 struct ubifs_znode *znode; 1437 struct ubifs_znode *znode;
1447 struct ubifs_zbranch zbr, *zt; 1438 struct ubifs_zbranch zbr, *zt;
1448 1439
1440again:
1449 mutex_lock(&c->tnc_mutex); 1441 mutex_lock(&c->tnc_mutex);
1450 found = ubifs_lookup_level0(c, key, &znode, &n); 1442 found = ubifs_lookup_level0(c, key, &znode, &n);
1451 if (!found) { 1443 if (!found) {
@@ -1456,24 +1448,43 @@ int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
1456 goto out; 1448 goto out;
1457 } 1449 }
1458 zt = &znode->zbranch[n]; 1450 zt = &znode->zbranch[n];
1451 if (lnum) {
1452 *lnum = zt->lnum;
1453 *offs = zt->offs;
1454 }
1459 if (is_hash_key(c, key)) { 1455 if (is_hash_key(c, key)) {
1460 /* 1456 /*
1461 * In this case the leaf node cache gets used, so we pass the 1457 * In this case the leaf node cache gets used, so we pass the
1462 * address of the zbranch and keep the mutex locked 1458 * address of the zbranch and keep the mutex locked
1463 */ 1459 */
1464 *lnum = zt->lnum;
1465 *offs = zt->offs;
1466 err = tnc_read_node_nm(c, zt, node); 1460 err = tnc_read_node_nm(c, zt, node);
1467 goto out; 1461 goto out;
1468 } 1462 }
1463 if (safely) {
1464 err = ubifs_tnc_read_node(c, zt, node);
1465 goto out;
1466 }
1467 /* Drop the TNC mutex prematurely and race with garbage collection */
1469 zbr = znode->zbranch[n]; 1468 zbr = znode->zbranch[n];
1469 gc_seq1 = c->gc_seq;
1470 mutex_unlock(&c->tnc_mutex); 1470 mutex_unlock(&c->tnc_mutex);
1471 1471
1472 *lnum = zbr.lnum; 1472 if (ubifs_get_wbuf(c, zbr.lnum)) {
1473 *offs = zbr.offs; 1473 /* We do not GC journal heads */
1474 err = ubifs_tnc_read_node(c, &zbr, node);
1475 return err;
1476 }
1474 1477
1475 err = ubifs_tnc_read_node(c, &zbr, node); 1478 err = fallible_read_node(c, key, &zbr, node);
1476 return err; 1479 if (err <= 0 || maybe_leb_gced(c, zbr.lnum, gc_seq1)) {
1480 /*
1481 * The node may have been GC'ed out from under us so try again
1482 * while keeping the TNC mutex locked.
1483 */
1484 safely = 1;
1485 goto again;
1486 }
1487 return 0;
1477 1488
1478out: 1489out:
1479 mutex_unlock(&c->tnc_mutex); 1490 mutex_unlock(&c->tnc_mutex);
@@ -1498,7 +1509,6 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
1498{ 1509{
1499 int found, n, err; 1510 int found, n, err;
1500 struct ubifs_znode *znode; 1511 struct ubifs_znode *znode;
1501 struct ubifs_zbranch zbr;
1502 1512
1503 dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key)); 1513 dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key));
1504 mutex_lock(&c->tnc_mutex); 1514 mutex_lock(&c->tnc_mutex);
@@ -1522,11 +1532,7 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
1522 goto out_unlock; 1532 goto out_unlock;
1523 } 1533 }
1524 1534
1525 zbr = znode->zbranch[n]; 1535 err = tnc_read_node_nm(c, &znode->zbranch[n], node);
1526 mutex_unlock(&c->tnc_mutex);
1527
1528 err = tnc_read_node_nm(c, &zbr, node);
1529 return err;
1530 1536
1531out_unlock: 1537out_unlock:
1532 mutex_unlock(&c->tnc_mutex); 1538 mutex_unlock(&c->tnc_mutex);
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 8117e65ba2e9..8ac76b1c2d55 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -372,26 +372,25 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt)
372 written = layout_leb_in_gaps(c, p); 372 written = layout_leb_in_gaps(c, p);
373 if (written < 0) { 373 if (written < 0) {
374 err = written; 374 err = written;
375 if (err == -ENOSPC) { 375 if (err != -ENOSPC) {
376 if (!dbg_force_in_the_gaps_enabled) { 376 kfree(c->gap_lebs);
377 /* 377 c->gap_lebs = NULL;
378 * Do not print scary warnings if the 378 return err;
379 * debugging option which forces
380 * in-the-gaps is enabled.
381 */
382 ubifs_err("out of space");
383 spin_lock(&c->space_lock);
384 dbg_dump_budg(c);
385 spin_unlock(&c->space_lock);
386 dbg_dump_lprops(c);
387 }
388 /* Try to commit anyway */
389 err = 0;
390 break;
391 } 379 }
392 kfree(c->gap_lebs); 380 if (!dbg_force_in_the_gaps_enabled) {
393 c->gap_lebs = NULL; 381 /*
394 return err; 382 * Do not print scary warnings if the debugging
383 * option which forces in-the-gaps is enabled.
384 */
385 ubifs_err("out of space");
386 spin_lock(&c->space_lock);
387 dbg_dump_budg(c);
388 spin_unlock(&c->space_lock);
389 dbg_dump_lprops(c);
390 }
391 /* Try to commit anyway */
392 err = 0;
393 break;
395 } 394 }
396 p++; 395 p++;
397 cnt -= written; 396 cnt -= written;
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index 0cc7da9bed47..a9ecbd9af20d 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -87,7 +87,7 @@
87#define UBIFS_SK_LEN 8 87#define UBIFS_SK_LEN 8
88 88
89/* Minimum index tree fanout */ 89/* Minimum index tree fanout */
90#define UBIFS_MIN_FANOUT 2 90#define UBIFS_MIN_FANOUT 3
91 91
92/* Maximum number of levels in UBIFS indexing B-tree */ 92/* Maximum number of levels in UBIFS indexing B-tree */
93#define UBIFS_MAX_LEVELS 512 93#define UBIFS_MAX_LEVELS 512
@@ -228,10 +228,10 @@ enum {
228/* Minimum number of orphan area logical eraseblocks */ 228/* Minimum number of orphan area logical eraseblocks */
229#define UBIFS_MIN_ORPH_LEBS 1 229#define UBIFS_MIN_ORPH_LEBS 1
230/* 230/*
231 * Minimum number of main area logical eraseblocks (buds, 2 for the index, 1 231 * Minimum number of main area logical eraseblocks (buds, 3 for the index, 1
232 * for GC, 1 for deletions, and at least 1 for committed data). 232 * for GC, 1 for deletions, and at least 1 for committed data).
233 */ 233 */
234#define UBIFS_MIN_MAIN_LEBS (UBIFS_MIN_BUD_LEBS + 5) 234#define UBIFS_MIN_MAIN_LEBS (UBIFS_MIN_BUD_LEBS + 6)
235 235
236/* Minimum number of logical eraseblocks */ 236/* Minimum number of logical eraseblocks */
237#define UBIFS_MIN_LEB_CNT (UBIFS_SB_LEBS + UBIFS_MST_LEBS + \ 237#define UBIFS_MIN_LEB_CNT (UBIFS_SB_LEBS + UBIFS_MST_LEBS + \
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index e4f89f271827..17c620b93eec 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -20,8 +20,6 @@
20 * Adrian Hunter 20 * Adrian Hunter
21 */ 21 */
22 22
23/* Implementation version 0.7 */
24
25#ifndef __UBIFS_H__ 23#ifndef __UBIFS_H__
26#define __UBIFS_H__ 24#define __UBIFS_H__
27 25
@@ -322,6 +320,8 @@ struct ubifs_gced_idx_leb {
322 * struct ubifs_inode - UBIFS in-memory inode description. 320 * struct ubifs_inode - UBIFS in-memory inode description.
323 * @vfs_inode: VFS inode description object 321 * @vfs_inode: VFS inode description object
324 * @creat_sqnum: sequence number at time of creation 322 * @creat_sqnum: sequence number at time of creation
323 * @del_cmtno: commit number corresponding to the time the inode was deleted,
324 * protected by @c->commit_sem;
325 * @xattr_size: summarized size of all extended attributes in bytes 325 * @xattr_size: summarized size of all extended attributes in bytes
326 * @xattr_cnt: count of extended attributes this inode has 326 * @xattr_cnt: count of extended attributes this inode has
327 * @xattr_names: sum of lengths of all extended attribute names belonging to 327 * @xattr_names: sum of lengths of all extended attribute names belonging to
@@ -373,6 +373,7 @@ struct ubifs_gced_idx_leb {
373struct ubifs_inode { 373struct ubifs_inode {
374 struct inode vfs_inode; 374 struct inode vfs_inode;
375 unsigned long long creat_sqnum; 375 unsigned long long creat_sqnum;
376 unsigned long long del_cmtno;
376 unsigned int xattr_size; 377 unsigned int xattr_size;
377 unsigned int xattr_cnt; 378 unsigned int xattr_cnt;
378 unsigned int xattr_names; 379 unsigned int xattr_names;
@@ -779,7 +780,7 @@ struct ubifs_compressor {
779/** 780/**
780 * struct ubifs_budget_req - budget requirements of an operation. 781 * struct ubifs_budget_req - budget requirements of an operation.
781 * 782 *
782 * @fast: non-zero if the budgeting should try to aquire budget quickly and 783 * @fast: non-zero if the budgeting should try to acquire budget quickly and
783 * should not try to call write-back 784 * should not try to call write-back
784 * @recalculate: non-zero if @idx_growth, @data_growth, and @dd_growth fields 785 * @recalculate: non-zero if @idx_growth, @data_growth, and @dd_growth fields
785 * have to be re-calculated 786 * have to be re-calculated
@@ -805,21 +806,31 @@ struct ubifs_compressor {
 805 * An inode may contain 4KiB of data at max., thus the width of @new_ino_d 806 * An inode may contain 4KiB of data at max., thus the width of @new_ino_d
806 * is 13 bits, and @dirtied_ino_d - 15, because up to 4 inodes may be made 807 * is 13 bits, and @dirtied_ino_d - 15, because up to 4 inodes may be made
807 * dirty by the re-name operation. 808 * dirty by the re-name operation.
809 *
 810 * Note, UBIFS aligns node lengths to an 8-byte boundary, so the requester has to
 811 * make sure the amount of inode data that contributes to the @new_ino_d and
 812 * @dirtied_ino_d fields is aligned.
808 */ 813 */
809struct ubifs_budget_req { 814struct ubifs_budget_req {
810 unsigned int fast:1; 815 unsigned int fast:1;
811 unsigned int recalculate:1; 816 unsigned int recalculate:1;
817#ifndef UBIFS_DEBUG
812 unsigned int new_page:1; 818 unsigned int new_page:1;
813 unsigned int dirtied_page:1; 819 unsigned int dirtied_page:1;
814 unsigned int new_dent:1; 820 unsigned int new_dent:1;
815 unsigned int mod_dent:1; 821 unsigned int mod_dent:1;
816 unsigned int new_ino:1; 822 unsigned int new_ino:1;
817 unsigned int new_ino_d:13; 823 unsigned int new_ino_d:13;
818#ifndef UBIFS_DEBUG
819 unsigned int dirtied_ino:4; 824 unsigned int dirtied_ino:4;
820 unsigned int dirtied_ino_d:15; 825 unsigned int dirtied_ino_d:15;
821#else 826#else
822 /* Not bit-fields to check for overflows */ 827 /* Not bit-fields to check for overflows */
828 unsigned int new_page;
829 unsigned int dirtied_page;
830 unsigned int new_dent;
831 unsigned int mod_dent;
832 unsigned int new_ino;
833 unsigned int new_ino_d;
823 unsigned int dirtied_ino; 834 unsigned int dirtied_ino;
824 unsigned int dirtied_ino_d; 835 unsigned int dirtied_ino_d;
825#endif 836#endif
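
The new alignment note refers to the usual kernel ALIGN() round-up to a power-of-two boundary; standalone, the values a requester would budget look like this:

    #include <stdio.h>

    #define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

    int main(void)
    {
            /* e.g. a 13-byte xattr value is budgeted as 16 bytes of inode data */
            printf("%d %d %d\n", ALIGN(13, 8), ALIGN(16, 8), ALIGN(1, 8));
            return 0; /* prints: 16 16 8 */
    }
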
@@ -860,13 +871,13 @@ struct ubifs_mount_opts {
860 * struct ubifs_info - UBIFS file-system description data structure 871 * struct ubifs_info - UBIFS file-system description data structure
861 * (per-superblock). 872 * (per-superblock).
862 * @vfs_sb: VFS @struct super_block object 873 * @vfs_sb: VFS @struct super_block object
863 * @bdi: backing device info object to make VFS happy and disable readahead 874 * @bdi: backing device info object to make VFS happy and disable read-ahead
864 * 875 *
865 * @highest_inum: highest used inode number 876 * @highest_inum: highest used inode number
866 * @vfs_gen: VFS inode generation counter
867 * @max_sqnum: current global sequence number 877 * @max_sqnum: current global sequence number
868 * @cmt_no: commit number (last successfully completed commit) 878 * @cmt_no: commit number of the last successfully completed commit, protected
869 * @cnt_lock: protects @highest_inum, @vfs_gen, and @max_sqnum counters 879 * by @commit_sem
880 * @cnt_lock: protects @highest_inum and @max_sqnum counters
870 * @fmt_version: UBIFS on-flash format version 881 * @fmt_version: UBIFS on-flash format version
871 * @uuid: UUID from super block 882 * @uuid: UUID from super block
872 * 883 *
@@ -984,6 +995,9 @@ struct ubifs_mount_opts {
984 * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary 995 * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary
985 * @max_inode_sz: maximum possible inode size in bytes 996 * @max_inode_sz: maximum possible inode size in bytes
986 * @max_znode_sz: size of znode in bytes 997 * @max_znode_sz: size of znode in bytes
998 *
999 * @leb_overhead: how many bytes are wasted in an LEB when it is filled with
1000 * data nodes of maximum size - used in free space reporting
987 * @dead_wm: LEB dead space watermark 1001 * @dead_wm: LEB dead space watermark
988 * @dark_wm: LEB dark space watermark 1002 * @dark_wm: LEB dark space watermark
989 * @block_cnt: count of 4KiB blocks on the FS 1003 * @block_cnt: count of 4KiB blocks on the FS
@@ -1017,6 +1031,8 @@ struct ubifs_mount_opts {
1017 * @sbuf: a buffer of LEB size used by GC and replay for scanning 1031 * @sbuf: a buffer of LEB size used by GC and replay for scanning
1018 * @idx_gc: list of index LEBs that have been garbage collected 1032 * @idx_gc: list of index LEBs that have been garbage collected
1019 * @idx_gc_cnt: number of elements on the idx_gc list 1033 * @idx_gc_cnt: number of elements on the idx_gc list
1034 * @gc_seq: incremented for every non-index LEB garbage collected
1035 * @gced_lnum: last non-index LEB that was garbage collected
1020 * 1036 *
1021 * @infos_list: links all 'ubifs_info' objects 1037 * @infos_list: links all 'ubifs_info' objects
1022 * @umount_mutex: serializes shrinker and un-mount 1038 * @umount_mutex: serializes shrinker and un-mount
@@ -1103,7 +1119,6 @@ struct ubifs_info {
1103 struct backing_dev_info bdi; 1119 struct backing_dev_info bdi;
1104 1120
1105 ino_t highest_inum; 1121 ino_t highest_inum;
1106 unsigned int vfs_gen;
1107 unsigned long long max_sqnum; 1122 unsigned long long max_sqnum;
1108 unsigned long long cmt_no; 1123 unsigned long long cmt_no;
1109 spinlock_t cnt_lock; 1124 spinlock_t cnt_lock;
@@ -1214,6 +1229,8 @@ struct ubifs_info {
1214 int max_idx_node_sz; 1229 int max_idx_node_sz;
1215 long long max_inode_sz; 1230 long long max_inode_sz;
1216 int max_znode_sz; 1231 int max_znode_sz;
1232
1233 int leb_overhead;
1217 int dead_wm; 1234 int dead_wm;
1218 int dark_wm; 1235 int dark_wm;
1219 int block_cnt; 1236 int block_cnt;
@@ -1247,6 +1264,8 @@ struct ubifs_info {
1247 void *sbuf; 1264 void *sbuf;
1248 struct list_head idx_gc; 1265 struct list_head idx_gc;
1249 int idx_gc_cnt; 1266 int idx_gc_cnt;
1267 volatile int gc_seq;
1268 volatile int gced_lnum;
1250 1269
1251 struct list_head infos_list; 1270 struct list_head infos_list;
1252 struct mutex umount_mutex; 1271 struct mutex umount_mutex;
@@ -1346,6 +1365,7 @@ extern struct backing_dev_info ubifs_backing_dev_info;
1346extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; 1365extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
1347 1366
1348/* io.c */ 1367/* io.c */
1368void ubifs_ro_mode(struct ubifs_info *c, int err);
1349int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len); 1369int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len);
1350int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, 1370int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
1351 int dtype); 1371 int dtype);
@@ -1399,8 +1419,8 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
1399 int deletion, int xent); 1419 int deletion, int xent);
1400int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, 1420int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
1401 const union ubifs_key *key, const void *buf, int len); 1421 const union ubifs_key *key, const void *buf, int len);
1402int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode, 1422int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode);
1403 int last_reference); 1423int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode);
1404int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, 1424int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
1405 const struct dentry *old_dentry, 1425 const struct dentry *old_dentry,
1406 const struct inode *new_dir, 1426 const struct inode *new_dir,
@@ -1423,9 +1443,10 @@ void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode,
1423 struct ubifs_budget_req *req); 1443 struct ubifs_budget_req *req);
1424void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode, 1444void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode,
1425 struct ubifs_budget_req *req); 1445 struct ubifs_budget_req *req);
1426long long ubifs_budg_get_free_space(struct ubifs_info *c); 1446long long ubifs_get_free_space(struct ubifs_info *c);
1427int ubifs_calc_min_idx_lebs(struct ubifs_info *c); 1447int ubifs_calc_min_idx_lebs(struct ubifs_info *c);
1428void ubifs_convert_page_budget(struct ubifs_info *c); 1448void ubifs_convert_page_budget(struct ubifs_info *c);
1449long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free);
1429long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); 1450long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);
1430 1451
1431/* find.c */ 1452/* find.c */
@@ -1440,8 +1461,6 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c);
1440/* tnc.c */ 1461/* tnc.c */
1441int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, 1462int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
1442 struct ubifs_znode **zn, int *n); 1463 struct ubifs_znode **zn, int *n);
1443int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key,
1444 void *node);
1445int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, 1464int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
1446 void *node, const struct qstr *nm); 1465 void *node, const struct qstr *nm);
1447int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, 1466int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 1388a078e1a9..649bec78b645 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -61,7 +61,7 @@
61 61
62/* 62/*
63 * Limit the number of extended attributes per inode so that the total size 63 * Limit the number of extended attributes per inode so that the total size
 64 * (xattr_size) is guaranteed to fit in an 'unsigned int'. 64 * (@xattr_size) is guaranteed to fit in an 'unsigned int'.
65 */ 65 */
66#define MAX_XATTRS_PER_INODE 65535 66#define MAX_XATTRS_PER_INODE 65535
67 67
@@ -103,14 +103,14 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
103 struct inode *inode; 103 struct inode *inode;
104 struct ubifs_inode *ui, *host_ui = ubifs_inode(host); 104 struct ubifs_inode *ui, *host_ui = ubifs_inode(host);
105 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, 105 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
106 .new_ino_d = size, .dirtied_ino = 1, 106 .new_ino_d = ALIGN(size, 8), .dirtied_ino = 1,
107 .dirtied_ino_d = host_ui->data_len}; 107 .dirtied_ino_d = ALIGN(host_ui->data_len, 8) };
108 108
109 if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE) 109 if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE)
110 return -ENOSPC; 110 return -ENOSPC;
111 /* 111 /*
112 * Linux limits the maximum size of the extended attribute names list 112 * Linux limits the maximum size of the extended attribute names list
113 * to %XATTR_LIST_MAX. This means we should not allow creating more* 113 * to %XATTR_LIST_MAX. This means we should not allow creating more
114 * extended attributes if the name list becomes larger. This limitation 114 * extended attributes if the name list becomes larger. This limitation
115 * is artificial for UBIFS, though. 115 * is artificial for UBIFS, though.
116 */ 116 */
@@ -128,7 +128,6 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
128 goto out_budg; 128 goto out_budg;
129 } 129 }
130 130
131 mutex_lock(&host_ui->ui_mutex);
132 /* Re-define all operations to be "nothing" */ 131 /* Re-define all operations to be "nothing" */
133 inode->i_mapping->a_ops = &none_address_operations; 132 inode->i_mapping->a_ops = &none_address_operations;
134 inode->i_op = &none_inode_operations; 133 inode->i_op = &none_inode_operations;
@@ -141,23 +140,19 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
141 ui->data = kmalloc(size, GFP_NOFS); 140 ui->data = kmalloc(size, GFP_NOFS);
142 if (!ui->data) { 141 if (!ui->data) {
143 err = -ENOMEM; 142 err = -ENOMEM;
144 goto out_unlock; 143 goto out_free;
145 } 144 }
146
147 memcpy(ui->data, value, size); 145 memcpy(ui->data, value, size);
146 inode->i_size = ui->ui_size = size;
147 ui->data_len = size;
148
149 mutex_lock(&host_ui->ui_mutex);
148 host->i_ctime = ubifs_current_time(host); 150 host->i_ctime = ubifs_current_time(host);
149 host_ui->xattr_cnt += 1; 151 host_ui->xattr_cnt += 1;
150 host_ui->xattr_size += CALC_DENT_SIZE(nm->len); 152 host_ui->xattr_size += CALC_DENT_SIZE(nm->len);
151 host_ui->xattr_size += CALC_XATTR_BYTES(size); 153 host_ui->xattr_size += CALC_XATTR_BYTES(size);
152 host_ui->xattr_names += nm->len; 154 host_ui->xattr_names += nm->len;
153 155
154 /*
155 * We do not use i_size_write() because nobody can race with us as we
156 * are holding host @host->i_mutex - every xattr operation for this
157 * inode is serialized by it.
158 */
159 inode->i_size = ui->ui_size = size;
160 ui->data_len = size;
161 err = ubifs_jnl_update(c, host, nm, inode, 0, 1); 156 err = ubifs_jnl_update(c, host, nm, inode, 0, 1);
162 if (err) 157 if (err)
163 goto out_cancel; 158 goto out_cancel;
@@ -172,8 +167,8 @@ out_cancel:
172 host_ui->xattr_cnt -= 1; 167 host_ui->xattr_cnt -= 1;
173 host_ui->xattr_size -= CALC_DENT_SIZE(nm->len); 168 host_ui->xattr_size -= CALC_DENT_SIZE(nm->len);
174 host_ui->xattr_size -= CALC_XATTR_BYTES(size); 169 host_ui->xattr_size -= CALC_XATTR_BYTES(size);
175out_unlock:
176 mutex_unlock(&host_ui->ui_mutex); 170 mutex_unlock(&host_ui->ui_mutex);
171out_free:
177 make_bad_inode(inode); 172 make_bad_inode(inode);
178 iput(inode); 173 iput(inode);
179out_budg: 174out_budg:
@@ -200,29 +195,28 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
200 struct ubifs_inode *host_ui = ubifs_inode(host); 195 struct ubifs_inode *host_ui = ubifs_inode(host);
201 struct ubifs_inode *ui = ubifs_inode(inode); 196 struct ubifs_inode *ui = ubifs_inode(inode);
202 struct ubifs_budget_req req = { .dirtied_ino = 2, 197 struct ubifs_budget_req req = { .dirtied_ino = 2,
203 .dirtied_ino_d = size + host_ui->data_len }; 198 .dirtied_ino_d = ALIGN(size, 8) + ALIGN(host_ui->data_len, 8) };
204 199
205 ubifs_assert(ui->data_len == inode->i_size); 200 ubifs_assert(ui->data_len == inode->i_size);
206 err = ubifs_budget_space(c, &req); 201 err = ubifs_budget_space(c, &req);
207 if (err) 202 if (err)
208 return err; 203 return err;
209 204
210 mutex_lock(&host_ui->ui_mutex);
211 host->i_ctime = ubifs_current_time(host);
212 host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len);
213 host_ui->xattr_size += CALC_XATTR_BYTES(size);
214
215 kfree(ui->data); 205 kfree(ui->data);
216 ui->data = kmalloc(size, GFP_NOFS); 206 ui->data = kmalloc(size, GFP_NOFS);
217 if (!ui->data) { 207 if (!ui->data) {
218 err = -ENOMEM; 208 err = -ENOMEM;
219 goto out_unlock; 209 goto out_free;
220 } 210 }
221
222 memcpy(ui->data, value, size); 211 memcpy(ui->data, value, size);
223 inode->i_size = ui->ui_size = size; 212 inode->i_size = ui->ui_size = size;
224 ui->data_len = size; 213 ui->data_len = size;
225 214
215 mutex_lock(&host_ui->ui_mutex);
216 host->i_ctime = ubifs_current_time(host);
217 host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len);
218 host_ui->xattr_size += CALC_XATTR_BYTES(size);
219
226 /* 220 /*
227 * It is important to write the host inode after the xattr inode 221 * It is important to write the host inode after the xattr inode
228 * because if the host inode gets synchronized (via 'fsync()'), then 222 * because if the host inode gets synchronized (via 'fsync()'), then
@@ -240,9 +234,9 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
240out_cancel: 234out_cancel:
241 host_ui->xattr_size -= CALC_XATTR_BYTES(size); 235 host_ui->xattr_size -= CALC_XATTR_BYTES(size);
242 host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len); 236 host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len);
243 make_bad_inode(inode);
244out_unlock:
245 mutex_unlock(&host_ui->ui_mutex); 237 mutex_unlock(&host_ui->ui_mutex);
238 make_bad_inode(inode);
239out_free:
246 ubifs_release_budget(c, &req); 240 ubifs_release_budget(c, &req);
247 return err; 241 return err;
248} 242}
@@ -312,6 +306,7 @@ int ubifs_setxattr(struct dentry *dentry, const char *name,
312 306
313 dbg_gen("xattr '%s', host ino %lu ('%.*s'), size %zd", name, 307 dbg_gen("xattr '%s', host ino %lu ('%.*s'), size %zd", name,
314 host->i_ino, dentry->d_name.len, dentry->d_name.name, size); 308 host->i_ino, dentry->d_name.len, dentry->d_name.name, size);
309 ubifs_assert(mutex_is_locked(&host->i_mutex));
315 310
316 if (size > UBIFS_MAX_INO_DATA) 311 if (size > UBIFS_MAX_INO_DATA)
317 return -ERANGE; 312 return -ERANGE;
@@ -384,7 +379,6 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
384 if (!xent) 379 if (!xent)
385 return -ENOMEM; 380 return -ENOMEM;
386 381
387 mutex_lock(&host->i_mutex);
388 xent_key_init(c, &key, host->i_ino, &nm); 382 xent_key_init(c, &key, host->i_ino, &nm);
389 err = ubifs_tnc_lookup_nm(c, &key, xent, &nm); 383 err = ubifs_tnc_lookup_nm(c, &key, xent, &nm);
390 if (err) { 384 if (err) {
@@ -419,7 +413,6 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
419out_iput: 413out_iput:
420 iput(inode); 414 iput(inode);
421out_unlock: 415out_unlock:
422 mutex_unlock(&host->i_mutex);
423 kfree(xent); 416 kfree(xent);
424 return err; 417 return err;
425} 418}
@@ -449,8 +442,6 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
449 return -ERANGE; 442 return -ERANGE;
450 443
451 lowest_xent_key(c, &key, host->i_ino); 444 lowest_xent_key(c, &key, host->i_ino);
452
453 mutex_lock(&host->i_mutex);
454 while (1) { 445 while (1) {
455 int type; 446 int type;
456 447
@@ -479,7 +470,6 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
479 pxent = xent; 470 pxent = xent;
480 key_read(c, &xent->key, &key); 471 key_read(c, &xent->key, &key);
481 } 472 }
482 mutex_unlock(&host->i_mutex);
483 473
484 kfree(pxent); 474 kfree(pxent);
485 if (err != -ENOENT) { 475 if (err != -ENOENT) {
@@ -497,8 +487,8 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host,
497 int err; 487 int err;
498 struct ubifs_inode *host_ui = ubifs_inode(host); 488 struct ubifs_inode *host_ui = ubifs_inode(host);
499 struct ubifs_inode *ui = ubifs_inode(inode); 489 struct ubifs_inode *ui = ubifs_inode(inode);
500 struct ubifs_budget_req req = { .dirtied_ino = 1, .mod_dent = 1, 490 struct ubifs_budget_req req = { .dirtied_ino = 2, .mod_dent = 1,
501 .dirtied_ino_d = host_ui->data_len }; 491 .dirtied_ino_d = ALIGN(host_ui->data_len, 8) };
502 492
503 ubifs_assert(ui->data_len == inode->i_size); 493 ubifs_assert(ui->data_len == inode->i_size);
504 494
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 0ed6e146a0d9..eb91f3b70320 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -211,6 +211,7 @@ const struct file_operations udf_file_operations = {
211 .release = udf_release_file, 211 .release = udf_release_file,
212 .fsync = udf_fsync_file, 212 .fsync = udf_fsync_file,
213 .splice_read = generic_file_splice_read, 213 .splice_read = generic_file_splice_read,
214 .llseek = generic_file_llseek,
214}; 215};
215 216
216const struct inode_operations udf_file_inode_operations = { 217const struct inode_operations udf_file_inode_operations = {
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index eb9cfa23dc3d..a4f2b3ce45b0 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -76,11 +76,24 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
76 *err = -ENOSPC; 76 *err = -ENOSPC;
77 77
78 iinfo = UDF_I(inode); 78 iinfo = UDF_I(inode);
79 iinfo->i_unique = 0; 79 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) {
80 iinfo->i_lenExtents = 0; 80 iinfo->i_efe = 1;
81 iinfo->i_next_alloc_block = 0; 81 if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev)
82 iinfo->i_next_alloc_goal = 0; 82 sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE;
83 iinfo->i_strat4096 = 0; 83 iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
84 sizeof(struct extendedFileEntry),
85 GFP_KERNEL);
86 } else {
87 iinfo->i_efe = 0;
88 iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
89 sizeof(struct fileEntry),
90 GFP_KERNEL);
91 }
92 if (!iinfo->i_ext.i_data) {
93 iput(inode);
94 *err = -ENOMEM;
95 return NULL;
96 }
84 97
85 block = udf_new_block(dir->i_sb, NULL, 98 block = udf_new_block(dir->i_sb, NULL,
86 dinfo->i_location.partitionReferenceNum, 99 dinfo->i_location.partitionReferenceNum,
@@ -111,6 +124,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
111 lvhd->uniqueID = cpu_to_le64(uniqueID); 124 lvhd->uniqueID = cpu_to_le64(uniqueID);
112 mark_buffer_dirty(sbi->s_lvid_bh); 125 mark_buffer_dirty(sbi->s_lvid_bh);
113 } 126 }
127 mutex_unlock(&sbi->s_alloc_mutex);
114 inode->i_mode = mode; 128 inode->i_mode = mode;
115 inode->i_uid = current->fsuid; 129 inode->i_uid = current->fsuid;
116 if (dir->i_mode & S_ISGID) { 130 if (dir->i_mode & S_ISGID) {
@@ -129,25 +143,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
129 iinfo->i_lenEAttr = 0; 143 iinfo->i_lenEAttr = 0;
130 iinfo->i_lenAlloc = 0; 144 iinfo->i_lenAlloc = 0;
131 iinfo->i_use = 0; 145 iinfo->i_use = 0;
132 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) {
133 iinfo->i_efe = 1;
134 if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev)
135 sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE;
136 iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
137 sizeof(struct extendedFileEntry),
138 GFP_KERNEL);
139 } else {
140 iinfo->i_efe = 0;
141 iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
142 sizeof(struct fileEntry),
143 GFP_KERNEL);
144 }
145 if (!iinfo->i_ext.i_data) {
146 iput(inode);
147 *err = -ENOMEM;
148 mutex_unlock(&sbi->s_alloc_mutex);
149 return NULL;
150 }
151 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB)) 146 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB))
152 iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; 147 iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
153 else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) 148 else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
@@ -158,7 +153,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
158 iinfo->i_crtime = current_fs_time(inode->i_sb); 153 iinfo->i_crtime = current_fs_time(inode->i_sb);
159 insert_inode_hash(inode); 154 insert_inode_hash(inode);
160 mark_inode_dirty(inode); 155 mark_inode_dirty(inode);
161 mutex_unlock(&sbi->s_alloc_mutex);
162 156
163 if (DQUOT_ALLOC_INODE(inode)) { 157 if (DQUOT_ALLOC_INODE(inode)) {
164 DQUOT_DROP(inode); 158 DQUOT_DROP(inode);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 5698bbf83bbf..e25e7010627b 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -369,7 +369,7 @@ enum {
369 Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore 369 Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore
370}; 370};
371 371
372static match_table_t tokens = { 372static const match_table_t tokens = {
373 {Opt_novrs, "novrs"}, 373 {Opt_novrs, "novrs"},
374 {Opt_nostrict, "nostrict"}, 374 {Opt_nostrict, "nostrict"},
375 {Opt_bs, "bs=%u"}, 375 {Opt_bs, "bs=%u"},
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 3141969b456d..e65212dfb60e 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -309,7 +309,7 @@ enum {
309 Opt_err 309 Opt_err
310}; 310};
311 311
312static match_table_t tokens = { 312static const match_table_t tokens = {
313 {Opt_type_old, "ufstype=old"}, 313 {Opt_type_old, "ufstype=old"},
314 {Opt_type_sunx86, "ufstype=sunx86"}, 314 {Opt_type_sunx86, "ufstype=sunx86"},
315 {Opt_type_sun, "ufstype=sun"}, 315 {Opt_type_sun, "ufstype=sun"},
@@ -1233,7 +1233,7 @@ static int ufs_show_options(struct seq_file *seq, struct vfsmount *vfs)
1233{ 1233{
1234 struct ufs_sb_info *sbi = UFS_SB(vfs->mnt_sb); 1234 struct ufs_sb_info *sbi = UFS_SB(vfs->mnt_sb);
1235 unsigned mval = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE; 1235 unsigned mval = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE;
1236 struct match_token *tp = tokens; 1236 const struct match_token *tp = tokens;
1237 1237
1238 while (tp->token != Opt_onerror_panic && tp->token != mval) 1238 while (tp->token != Opt_onerror_panic && tp->token != mval)
1239 ++tp; 1239 ++tp;
diff --git a/fs/xfs/linux-2.6/sema.h b/fs/xfs/linux-2.6/sema.h
deleted file mode 100644
index 3abe7e9ceb33..000000000000
--- a/fs/xfs/linux-2.6/sema.h
+++ /dev/null
@@ -1,52 +0,0 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_SUPPORT_SEMA_H__
19#define __XFS_SUPPORT_SEMA_H__
20
21#include <linux/time.h>
22#include <linux/wait.h>
23#include <linux/semaphore.h>
24#include <asm/atomic.h>
25
26/*
27 * sema_t structure just maps to struct semaphore in Linux kernel.
28 */
29
30typedef struct semaphore sema_t;
31
32#define initnsema(sp, val, name) sema_init(sp, val)
33#define psema(sp, b) down(sp)
34#define vsema(sp) up(sp)
35#define freesema(sema) do { } while (0)
36
37static inline int issemalocked(sema_t *sp)
38{
39 return down_trylock(sp) || (up(sp), 0);
40}
41
42/*
43 * Map cpsema (try to get the sema) to down_trylock. We need to switch
44 * the return values since cpsema returns 1 (acquired) 0 (failed) and
45 * down_trylock returns the reverse 0 (acquired) 1 (failed).
46 */
47static inline int cpsema(sema_t *sp)
48{
49 return down_trylock(sp) ? 0 : 1;
50}
51
52#endif /* __XFS_SUPPORT_SEMA_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index fa47e43b8b41..a44d68eb50b5 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -73,7 +73,6 @@ xfs_page_trace(
73 unsigned long pgoff) 73 unsigned long pgoff)
74{ 74{
75 xfs_inode_t *ip; 75 xfs_inode_t *ip;
76 bhv_vnode_t *vp = vn_from_inode(inode);
77 loff_t isize = i_size_read(inode); 76 loff_t isize = i_size_read(inode);
78 loff_t offset = page_offset(page); 77 loff_t offset = page_offset(page);
79 int delalloc = -1, unmapped = -1, unwritten = -1; 78 int delalloc = -1, unmapped = -1, unwritten = -1;
@@ -81,7 +80,7 @@ xfs_page_trace(
81 if (page_has_buffers(page)) 80 if (page_has_buffers(page))
82 xfs_count_page_state(page, &delalloc, &unmapped, &unwritten); 81 xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
83 82
84 ip = xfs_vtoi(vp); 83 ip = XFS_I(inode);
85 if (!ip->i_rwtrace) 84 if (!ip->i_rwtrace)
86 return; 85 return;
87 86
@@ -1339,6 +1338,10 @@ __xfs_get_blocks(
1339 offset = (xfs_off_t)iblock << inode->i_blkbits; 1338 offset = (xfs_off_t)iblock << inode->i_blkbits;
1340 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); 1339 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
1341 size = bh_result->b_size; 1340 size = bh_result->b_size;
1341
1342 if (!create && direct && offset >= i_size_read(inode))
1343 return 0;
1344
1342 error = xfs_iomap(XFS_I(inode), offset, size, 1345 error = xfs_iomap(XFS_I(inode), offset, size,
1343 create ? flags : BMAPI_READ, &iomap, &niomap); 1346 create ? flags : BMAPI_READ, &iomap, &niomap);
1344 if (error) 1347 if (error)
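
The added early return only affects direct reads at or past EOF, where mapping blocks would be wasted work; the offset it compares comes from the shift a few lines up. A toy version of the test with assumed numbers:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            int blkbits = 12;                    /* 4KiB blocks, assumed */
            int64_t iblock = 10, i_size = 40000; /* assumed file size */
            int64_t offset = iblock << blkbits;  /* = 40960 */

            /* mirrors: !create && direct && offset >= i_size_read(inode) */
            printf("offset %lld is %spast EOF\n", (long long)offset,
                   offset >= i_size ? "" : "not ");
            return 0;
    }
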
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 9cc8f0213095..36d5fcd3f593 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -58,7 +58,7 @@ xfs_buf_trace(
58 bp, id, 58 bp, id,
59 (void *)(unsigned long)bp->b_flags, 59 (void *)(unsigned long)bp->b_flags,
60 (void *)(unsigned long)bp->b_hold.counter, 60 (void *)(unsigned long)bp->b_hold.counter,
61 (void *)(unsigned long)bp->b_sema.count.counter, 61 (void *)(unsigned long)bp->b_sema.count,
62 (void *)current, 62 (void *)current,
63 data, ra, 63 data, ra,
64 (void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff), 64 (void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff),
@@ -253,7 +253,7 @@ _xfs_buf_initialize(
253 253
254 memset(bp, 0, sizeof(xfs_buf_t)); 254 memset(bp, 0, sizeof(xfs_buf_t));
255 atomic_set(&bp->b_hold, 1); 255 atomic_set(&bp->b_hold, 1);
256 init_MUTEX_LOCKED(&bp->b_iodonesema); 256 init_completion(&bp->b_iowait);
257 INIT_LIST_HEAD(&bp->b_list); 257 INIT_LIST_HEAD(&bp->b_list);
258 INIT_LIST_HEAD(&bp->b_hash_list); 258 INIT_LIST_HEAD(&bp->b_hash_list);
259 init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */ 259 init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */
@@ -838,6 +838,7 @@ xfs_buf_rele(
838 return; 838 return;
839 } 839 }
840 840
841 ASSERT(atomic_read(&bp->b_hold) > 0);
841 if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) { 842 if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) {
842 if (bp->b_relse) { 843 if (bp->b_relse) {
843 atomic_inc(&bp->b_hold); 844 atomic_inc(&bp->b_hold);
@@ -851,11 +852,6 @@ xfs_buf_rele(
851 spin_unlock(&hash->bh_lock); 852 spin_unlock(&hash->bh_lock);
852 xfs_buf_free(bp); 853 xfs_buf_free(bp);
853 } 854 }
854 } else {
855 /*
856 * Catch reference count leaks
857 */
858 ASSERT(atomic_read(&bp->b_hold) >= 0);
859 } 855 }
860} 856}
861 857
@@ -1005,12 +1001,13 @@ xfs_buf_iodone_work(
1005 * We can get an EOPNOTSUPP to ordered writes. Here we clear the 1001 * We can get an EOPNOTSUPP to ordered writes. Here we clear the
1006 * ordered flag and reissue them. Because we can't tell the higher 1002 * ordered flag and reissue them. Because we can't tell the higher
1007 * layers directly that they should not issue ordered I/O anymore, they 1003 * layers directly that they should not issue ordered I/O anymore, they
1008 * need to check if the ordered flag was cleared during I/O completion. 1004 * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion.
1009 */ 1005 */
1010 if ((bp->b_error == EOPNOTSUPP) && 1006 if ((bp->b_error == EOPNOTSUPP) &&
1011 (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) { 1007 (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
1012 XB_TRACE(bp, "ordered_retry", bp->b_iodone); 1008 XB_TRACE(bp, "ordered_retry", bp->b_iodone);
1013 bp->b_flags &= ~XBF_ORDERED; 1009 bp->b_flags &= ~XBF_ORDERED;
1010 bp->b_flags |= _XFS_BARRIER_FAILED;
1014 xfs_buf_iorequest(bp); 1011 xfs_buf_iorequest(bp);
1015 } else if (bp->b_iodone) 1012 } else if (bp->b_iodone)
1016 (*(bp->b_iodone))(bp); 1013 (*(bp->b_iodone))(bp);
@@ -1037,7 +1034,7 @@ xfs_buf_ioend(
1037 xfs_buf_iodone_work(&bp->b_iodone_work); 1034 xfs_buf_iodone_work(&bp->b_iodone_work);
1038 } 1035 }
1039 } else { 1036 } else {
1040 up(&bp->b_iodonesema); 1037 complete(&bp->b_iowait);
1041 } 1038 }
1042} 1039}
1043 1040
@@ -1275,7 +1272,7 @@ xfs_buf_iowait(
1275 XB_TRACE(bp, "iowait", 0); 1272 XB_TRACE(bp, "iowait", 0);
1276 if (atomic_read(&bp->b_io_remaining)) 1273 if (atomic_read(&bp->b_io_remaining))
1277 blk_run_address_space(bp->b_target->bt_mapping); 1274 blk_run_address_space(bp->b_target->bt_mapping);
1278 down(&bp->b_iodonesema); 1275 wait_for_completion(&bp->b_iowait);
1279 XB_TRACE(bp, "iowaited", (long)bp->b_error); 1276 XB_TRACE(bp, "iowaited", (long)bp->b_error);
1280 return bp->b_error; 1277 return bp->b_error;
1281} 1278}
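
Together these hunks are a mechanical migration from a locked semaphore to the completion API, the idiomatic primitive for "wait until I/O is done" since it cannot be mistaken for a lock. The mapping, as a sketch:

    /* before (semaphore)                     after (completion)
     * ------------------                     ------------------
     * struct semaphore b_iodonesema;         struct completion b_iowait;
     * init_MUTEX_LOCKED(&bp->b_iodonesema);  init_completion(&bp->b_iowait);
     * up(&bp->b_iodonesema);                 complete(&bp->b_iowait);
     * down(&bp->b_iodonesema);               wait_for_completion(&bp->b_iowait);
     */
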
@@ -1799,7 +1796,7 @@ int __init
1799xfs_buf_init(void) 1796xfs_buf_init(void)
1800{ 1797{
1801#ifdef XFS_BUF_TRACE 1798#ifdef XFS_BUF_TRACE
1802 xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_SLEEP); 1799 xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_NOFS);
1803#endif 1800#endif
1804 1801
1805 xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf", 1802 xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 29d1d4adc078..456519a088c7 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -85,6 +85,14 @@ typedef enum {
85 * modifications being lost. 85 * modifications being lost.
86 */ 86 */
87 _XBF_PAGE_LOCKED = (1 << 22), 87 _XBF_PAGE_LOCKED = (1 << 22),
88
89 /*
90 * If we try a barrier write, but it fails we have to communicate
91 * this to the upper layers. Unfortunately b_error gets overwritten
92 * when the buffer is re-issued so we have to add another flag to
93 * keep this information.
94 */
95 _XFS_BARRIER_FAILED = (1 << 23),
88} xfs_buf_flags_t; 96} xfs_buf_flags_t;
89 97
90typedef enum { 98typedef enum {
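
A plausible consumer of the new flag (a sketch, not from this patch): an I/O-done handler that notices the failed barrier and stops requesting barriers for the whole mount. The back-pointer through b_fspriv3 is an assumption here; XFS_MOUNT_BARRIER is the real mount flag:

    /* sketch only */
    STATIC void
    example_log_iodone(xfs_buf_t *bp)
    {
            xfs_mount_t *mp = bp->b_fspriv3;  /* assumed back-pointer */

            if (bp->b_flags & _XFS_BARRIER_FAILED) {
                    bp->b_flags &= ~_XFS_BARRIER_FAILED;
                    mp->m_flags &= ~XFS_MOUNT_BARRIER; /* no more barriers */
            }
            XFS_BUF_FINISH_IOWAIT(bp);
    }
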
@@ -157,7 +165,7 @@ typedef struct xfs_buf {
157 xfs_buf_iodone_t b_iodone; /* I/O completion function */ 165 xfs_buf_iodone_t b_iodone; /* I/O completion function */
158 xfs_buf_relse_t b_relse; /* releasing function */ 166 xfs_buf_relse_t b_relse; /* releasing function */
159 xfs_buf_bdstrat_t b_strat; /* pre-write function */ 167 xfs_buf_bdstrat_t b_strat; /* pre-write function */
160 struct semaphore b_iodonesema; /* Semaphore for I/O waiters */ 168 struct completion b_iowait; /* queue for I/O waiters */
161 void *b_fspriv; 169 void *b_fspriv;
162 void *b_fspriv2; 170 void *b_fspriv2;
163 void *b_fspriv3; 171 void *b_fspriv3;
@@ -352,7 +360,7 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
352#define XFS_BUF_CPSEMA(bp) (xfs_buf_cond_lock(bp) == 0) 360#define XFS_BUF_CPSEMA(bp) (xfs_buf_cond_lock(bp) == 0)
353#define XFS_BUF_VSEMA(bp) xfs_buf_unlock(bp) 361#define XFS_BUF_VSEMA(bp) xfs_buf_unlock(bp)
354#define XFS_BUF_PSEMA(bp,x) xfs_buf_lock(bp) 362#define XFS_BUF_PSEMA(bp,x) xfs_buf_lock(bp)
355#define XFS_BUF_V_IODONESEMA(bp) up(&bp->b_iodonesema); 363#define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait);
356 364
357#define XFS_BUF_SET_TARGET(bp, target) ((bp)->b_target = (target)) 365#define XFS_BUF_SET_TARGET(bp, target) ((bp)->b_target = (target))
358#define XFS_BUF_TARGET(bp) ((bp)->b_target) 366#define XFS_BUF_TARGET(bp) ((bp)->b_target)
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 987fe84f7b13..24fd598af846 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -139,7 +139,7 @@ xfs_nfs_get_inode(
139 } 139 }
140 140
141 xfs_iunlock(ip, XFS_ILOCK_SHARED); 141 xfs_iunlock(ip, XFS_ILOCK_SHARED);
142 return ip->i_vnode; 142 return VFS_I(ip);
143} 143}
144 144
145STATIC struct dentry * 145STATIC struct dentry *
@@ -167,7 +167,7 @@ xfs_fs_fh_to_dentry(struct super_block *sb, struct fid *fid,
167 if (!inode) 167 if (!inode)
168 return NULL; 168 return NULL;
169 if (IS_ERR(inode)) 169 if (IS_ERR(inode))
170 return ERR_PTR(PTR_ERR(inode)); 170 return ERR_CAST(inode);
171 result = d_alloc_anon(inode); 171 result = d_alloc_anon(inode);
172 if (!result) { 172 if (!result) {
173 iput(inode); 173 iput(inode);
@@ -198,7 +198,7 @@ xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid,
198 if (!inode) 198 if (!inode)
199 return NULL; 199 return NULL;
200 if (IS_ERR(inode)) 200 if (IS_ERR(inode))
201 return ERR_PTR(PTR_ERR(inode)); 201 return ERR_CAST(inode);
202 result = d_alloc_anon(inode); 202 result = d_alloc_anon(inode);
203 if (!result) { 203 if (!result) {
204 iput(inode); 204 iput(inode);
@@ -219,9 +219,9 @@ xfs_fs_get_parent(
219 if (unlikely(error)) 219 if (unlikely(error))
220 return ERR_PTR(-error); 220 return ERR_PTR(-error);
221 221
222 parent = d_alloc_anon(cip->i_vnode); 222 parent = d_alloc_anon(VFS_I(cip));
223 if (unlikely(!parent)) { 223 if (unlikely(!parent)) {
224 iput(cip->i_vnode); 224 iput(VFS_I(cip));
225 return ERR_PTR(-ENOMEM); 225 return ERR_PTR(-ENOMEM);
226 } 226 }
227 return parent; 227 return parent;
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 5f60363b9343..5311c1acdd40 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -475,6 +475,7 @@ const struct file_operations xfs_invis_file_operations = {
475const struct file_operations xfs_dir_file_operations = { 475const struct file_operations xfs_dir_file_operations = {
476 .read = generic_read_dir, 476 .read = generic_read_dir,
477 .readdir = xfs_file_readdir, 477 .readdir = xfs_file_readdir,
478 .llseek = generic_file_llseek,
478 .unlocked_ioctl = xfs_file_ioctl, 479 .unlocked_ioctl = xfs_file_ioctl,
479#ifdef CONFIG_COMPAT 480#ifdef CONFIG_COMPAT
480 .compat_ioctl = xfs_file_compat_ioctl, 481 .compat_ioctl = xfs_file_compat_ioctl,
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 1eefe61f0e10..36caa6d957df 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -31,7 +31,7 @@ xfs_tosspages(
31 xfs_off_t last, 31 xfs_off_t last,
32 int fiopt) 32 int fiopt)
33{ 33{
34 struct address_space *mapping = ip->i_vnode->i_mapping; 34 struct address_space *mapping = VFS_I(ip)->i_mapping;
35 35
36 if (mapping->nrpages) 36 if (mapping->nrpages)
37 truncate_inode_pages(mapping, first); 37 truncate_inode_pages(mapping, first);
@@ -44,7 +44,7 @@ xfs_flushinval_pages(
44 xfs_off_t last, 44 xfs_off_t last,
45 int fiopt) 45 int fiopt)
46{ 46{
47 struct address_space *mapping = ip->i_vnode->i_mapping; 47 struct address_space *mapping = VFS_I(ip)->i_mapping;
48 int ret = 0; 48 int ret = 0;
49 49
50 if (mapping->nrpages) { 50 if (mapping->nrpages) {
@@ -64,7 +64,7 @@ xfs_flush_pages(
64 uint64_t flags, 64 uint64_t flags,
65 int fiopt) 65 int fiopt)
66{ 66{
67 struct address_space *mapping = ip->i_vnode->i_mapping; 67 struct address_space *mapping = VFS_I(ip)->i_mapping;
68 int ret = 0; 68 int ret = 0;
69 int ret2; 69 int ret2;
70 70
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index acb978d9d085..48799ba7e3e6 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -245,7 +245,7 @@ xfs_vget_fsop_handlereq(
245 245
246 xfs_iunlock(ip, XFS_ILOCK_SHARED); 246 xfs_iunlock(ip, XFS_ILOCK_SHARED);
247 247
248 *inode = XFS_ITOV(ip); 248 *inode = VFS_I(ip);
249 return 0; 249 return 0;
250} 250}
251 251
@@ -927,7 +927,7 @@ STATIC void
927xfs_diflags_to_linux( 927xfs_diflags_to_linux(
928 struct xfs_inode *ip) 928 struct xfs_inode *ip)
929{ 929{
930 struct inode *inode = XFS_ITOV(ip); 930 struct inode *inode = VFS_I(ip);
931 unsigned int xflags = xfs_ip2xflags(ip); 931 unsigned int xflags = xfs_ip2xflags(ip);
932 932
933 if (xflags & XFS_XFLAG_IMMUTABLE) 933 if (xflags & XFS_XFLAG_IMMUTABLE)
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index e88f51028086..095d271f3434 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -62,7 +62,7 @@ void
62xfs_synchronize_atime( 62xfs_synchronize_atime(
63 xfs_inode_t *ip) 63 xfs_inode_t *ip)
64{ 64{
65 struct inode *inode = ip->i_vnode; 65 struct inode *inode = VFS_I(ip);
66 66
67 if (inode) { 67 if (inode) {
68 ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; 68 ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
@@ -79,7 +79,7 @@ void
79xfs_mark_inode_dirty_sync( 79xfs_mark_inode_dirty_sync(
80 xfs_inode_t *ip) 80 xfs_inode_t *ip)
81{ 81{
82 struct inode *inode = ip->i_vnode; 82 struct inode *inode = VFS_I(ip);
83 83
84 if (inode) 84 if (inode)
85 mark_inode_dirty_sync(inode); 85 mark_inode_dirty_sync(inode);
@@ -89,36 +89,31 @@ xfs_mark_inode_dirty_sync(
89 * Change the requested timestamp in the given inode. 89 * Change the requested timestamp in the given inode.
90 * We don't lock across timestamp updates, and we don't log them but 90 * We don't lock across timestamp updates, and we don't log them but
91 * we do record the fact that there is dirty information in core. 91 * we do record the fact that there is dirty information in core.
92 *
93 * NOTE -- callers MUST combine XFS_ICHGTIME_MOD or XFS_ICHGTIME_CHG
94 * with XFS_ICHGTIME_ACC to be sure that access time
95 * update will take. Calling first with XFS_ICHGTIME_ACC
96 * and then XFS_ICHGTIME_MOD may fail to modify the access
97 * timestamp if the filesystem is mounted noacctm.
98 */ 92 */
99void 93void
100xfs_ichgtime( 94xfs_ichgtime(
101 xfs_inode_t *ip, 95 xfs_inode_t *ip,
102 int flags) 96 int flags)
103{ 97{
104 struct inode *inode = vn_to_inode(XFS_ITOV(ip)); 98 struct inode *inode = VFS_I(ip);
105 timespec_t tv; 99 timespec_t tv;
100 int sync_it = 0;
101
102 tv = current_fs_time(inode->i_sb);
106 103
107 nanotime(&tv); 104 if ((flags & XFS_ICHGTIME_MOD) &&
108 if (flags & XFS_ICHGTIME_MOD) { 105 !timespec_equal(&inode->i_mtime, &tv)) {
109 inode->i_mtime = tv; 106 inode->i_mtime = tv;
110 ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; 107 ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
111 ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; 108 ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
109 sync_it = 1;
112 } 110 }
113 if (flags & XFS_ICHGTIME_ACC) { 111 if ((flags & XFS_ICHGTIME_CHG) &&
114 inode->i_atime = tv; 112 !timespec_equal(&inode->i_ctime, &tv)) {
115 ip->i_d.di_atime.t_sec = (__int32_t)tv.tv_sec;
116 ip->i_d.di_atime.t_nsec = (__int32_t)tv.tv_nsec;
117 }
118 if (flags & XFS_ICHGTIME_CHG) {
119 inode->i_ctime = tv; 113 inode->i_ctime = tv;
120 ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec; 114 ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec;
121 ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec; 115 ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec;
116 sync_it = 1;
122 } 117 }
123 118
124 /* 119 /*
@@ -130,55 +125,11 @@ xfs_ichgtime(
130 * ensure that the compiler does not reorder the update 125 * ensure that the compiler does not reorder the update
131 * of i_update_core above the timestamp updates above. 126 * of i_update_core above the timestamp updates above.
132 */ 127 */
133 SYNCHRONIZE(); 128 if (sync_it) {
134 ip->i_update_core = 1; 129 SYNCHRONIZE();
135 if (!(inode->i_state & I_NEW)) 130 ip->i_update_core = 1;
136 mark_inode_dirty_sync(inode); 131 mark_inode_dirty_sync(inode);
137}
138
139/*
140 * Variant on the above which avoids querying the system clock
141 * in situations where we know the Linux inode timestamps have
142 * just been updated (and so we can update our inode cheaply).
143 */
144void
145xfs_ichgtime_fast(
146 xfs_inode_t *ip,
147 struct inode *inode,
148 int flags)
149{
150 timespec_t *tvp;
151
152 /*
153 * Atime updates for read() & friends are handled lazily now, and
154 * explicit updates must go through xfs_ichgtime()
155 */
156 ASSERT((flags & XFS_ICHGTIME_ACC) == 0);
157
158 if (flags & XFS_ICHGTIME_MOD) {
159 tvp = &inode->i_mtime;
160 ip->i_d.di_mtime.t_sec = (__int32_t)tvp->tv_sec;
161 ip->i_d.di_mtime.t_nsec = (__int32_t)tvp->tv_nsec;
162 } 132 }
163 if (flags & XFS_ICHGTIME_CHG) {
164 tvp = &inode->i_ctime;
165 ip->i_d.di_ctime.t_sec = (__int32_t)tvp->tv_sec;
166 ip->i_d.di_ctime.t_nsec = (__int32_t)tvp->tv_nsec;
167 }
168
169 /*
170 * We update the i_update_core field _after_ changing
171 * the timestamps in order to coordinate properly with
172 * xfs_iflush() so that we don't lose timestamp updates.
173 * This keeps us from having to hold the inode lock
174 * while doing this. We use the SYNCHRONIZE macro to
175 * ensure that the compiler does not reorder the update
176 * of i_update_core above the timestamp updates above.
177 */
178 SYNCHRONIZE();
179 ip->i_update_core = 1;
180 if (!(inode->i_state & I_NEW))
181 mark_inode_dirty_sync(inode);
182} 133}
183 134
184/* 135/*
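
Two behavioural changes hide in the xfs_ichgtime() rewrite: the timestamp now comes from current_fs_time() rather than a raw nanotime(), and the inode is only marked dirty when a timestamp actually changed (the sync_it flag). Since current_fs_time() truncates to the superblock's s_time_gran, back-to-back updates within one granule compare equal via timespec_equal() and skip mark_inode_dirty_sync() entirely. The XFS_ICHGTIME_ACC branch disappears because atime is handled lazily by the VFS and copied back through xfs_synchronize_atime(). The mainline helper of this era was roughly:

/* Sketch of kernel/time.c:current_fs_time() circa 2.6.x: round "now"
 * down to the filesystem's declared timestamp granularity. */
struct timespec current_fs_time(struct super_block *sb)
{
	struct timespec now = current_kernel_time();

	return timespec_trunc(now, sb->s_time_gran);
}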
@@ -299,7 +250,7 @@ xfs_vn_mknod(
299 if (unlikely(error)) 250 if (unlikely(error))
300 goto out_free_acl; 251 goto out_free_acl;
301 252
302 inode = ip->i_vnode; 253 inode = VFS_I(ip);
303 254
304 error = xfs_init_security(inode, dir); 255 error = xfs_init_security(inode, dir);
305 if (unlikely(error)) 256 if (unlikely(error))
@@ -366,7 +317,7 @@ xfs_vn_lookup(
366 return NULL; 317 return NULL;
367 } 318 }
368 319
369 return d_splice_alias(cip->i_vnode, dentry); 320 return d_splice_alias(VFS_I(cip), dentry);
370} 321}
371 322
372STATIC struct dentry * 323STATIC struct dentry *
@@ -399,12 +350,12 @@ xfs_vn_ci_lookup(
399 350
400 /* if exact match, just splice and exit */ 351 /* if exact match, just splice and exit */
401 if (!ci_name.name) 352 if (!ci_name.name)
402 return d_splice_alias(ip->i_vnode, dentry); 353 return d_splice_alias(VFS_I(ip), dentry);
403 354
404 /* else case-insensitive match... */ 355 /* else case-insensitive match... */
405 dname.name = ci_name.name; 356 dname.name = ci_name.name;
406 dname.len = ci_name.len; 357 dname.len = ci_name.len;
407 dentry = d_add_ci(ip->i_vnode, dentry, &dname); 358 dentry = d_add_ci(dentry, VFS_I(ip), &dname);
408 kmem_free(ci_name.name); 359 kmem_free(ci_name.name);
409 return dentry; 360 return dentry;
410} 361}
@@ -478,7 +429,7 @@ xfs_vn_symlink(
478 if (unlikely(error)) 429 if (unlikely(error))
479 goto out; 430 goto out;
480 431
481 inode = cip->i_vnode; 432 inode = VFS_I(cip);
482 433
483 error = xfs_init_security(inode, dir); 434 error = xfs_init_security(inode, dir);
484 if (unlikely(error)) 435 if (unlikely(error))
@@ -710,7 +661,7 @@ out_error:
710 return error; 661 return error;
711} 662}
712 663
713const struct inode_operations xfs_inode_operations = { 664static const struct inode_operations xfs_inode_operations = {
714 .permission = xfs_vn_permission, 665 .permission = xfs_vn_permission,
715 .truncate = xfs_vn_truncate, 666 .truncate = xfs_vn_truncate,
716 .getattr = xfs_vn_getattr, 667 .getattr = xfs_vn_getattr,
@@ -722,7 +673,7 @@ const struct inode_operations xfs_inode_operations = {
722 .fallocate = xfs_vn_fallocate, 673 .fallocate = xfs_vn_fallocate,
723}; 674};
724 675
725const struct inode_operations xfs_dir_inode_operations = { 676static const struct inode_operations xfs_dir_inode_operations = {
726 .create = xfs_vn_create, 677 .create = xfs_vn_create,
727 .lookup = xfs_vn_lookup, 678 .lookup = xfs_vn_lookup,
728 .link = xfs_vn_link, 679 .link = xfs_vn_link,
@@ -747,7 +698,7 @@ const struct inode_operations xfs_dir_inode_operations = {
747 .listxattr = xfs_vn_listxattr, 698 .listxattr = xfs_vn_listxattr,
748}; 699};
749 700
750const struct inode_operations xfs_dir_ci_inode_operations = { 701static const struct inode_operations xfs_dir_ci_inode_operations = {
751 .create = xfs_vn_create, 702 .create = xfs_vn_create,
752 .lookup = xfs_vn_ci_lookup, 703 .lookup = xfs_vn_ci_lookup,
753 .link = xfs_vn_link, 704 .link = xfs_vn_link,
@@ -772,7 +723,7 @@ const struct inode_operations xfs_dir_ci_inode_operations = {
772 .listxattr = xfs_vn_listxattr, 723 .listxattr = xfs_vn_listxattr,
773}; 724};
774 725
775const struct inode_operations xfs_symlink_inode_operations = { 726static const struct inode_operations xfs_symlink_inode_operations = {
776 .readlink = generic_readlink, 727 .readlink = generic_readlink,
777 .follow_link = xfs_vn_follow_link, 728 .follow_link = xfs_vn_follow_link,
778 .put_link = xfs_vn_put_link, 729 .put_link = xfs_vn_put_link,
@@ -784,3 +735,98 @@ const struct inode_operations xfs_symlink_inode_operations = {
784 .removexattr = generic_removexattr, 735 .removexattr = generic_removexattr,
785 .listxattr = xfs_vn_listxattr, 736 .listxattr = xfs_vn_listxattr,
786}; 737};
738
739STATIC void
740xfs_diflags_to_iflags(
741 struct inode *inode,
742 struct xfs_inode *ip)
743{
744 if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
745 inode->i_flags |= S_IMMUTABLE;
746 else
747 inode->i_flags &= ~S_IMMUTABLE;
748 if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
749 inode->i_flags |= S_APPEND;
750 else
751 inode->i_flags &= ~S_APPEND;
752 if (ip->i_d.di_flags & XFS_DIFLAG_SYNC)
753 inode->i_flags |= S_SYNC;
754 else
755 inode->i_flags &= ~S_SYNC;
756 if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)
757 inode->i_flags |= S_NOATIME;
758 else
759 inode->i_flags &= ~S_NOATIME;
760}
761
762/*
763 * Initialize the Linux inode, set up the operation vectors and
764 * unlock the inode.
765 *
766 * When reading existing inodes from disk this is called directly
767 * from xfs_iget, when creating a new inode it is called from
768 * xfs_ialloc after setting up the inode.
769 */
770void
771xfs_setup_inode(
772 struct xfs_inode *ip)
773{
774 struct inode *inode = ip->i_vnode;
775
776 inode->i_mode = ip->i_d.di_mode;
777 inode->i_nlink = ip->i_d.di_nlink;
778 inode->i_uid = ip->i_d.di_uid;
779 inode->i_gid = ip->i_d.di_gid;
780
781 switch (inode->i_mode & S_IFMT) {
782 case S_IFBLK:
783 case S_IFCHR:
784 inode->i_rdev =
785 MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff,
786 sysv_minor(ip->i_df.if_u2.if_rdev));
787 break;
788 default:
789 inode->i_rdev = 0;
790 break;
791 }
792
793 inode->i_generation = ip->i_d.di_gen;
794 i_size_write(inode, ip->i_d.di_size);
795 inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec;
796 inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec;
797 inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
798 inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
799 inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
800 inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
801 xfs_diflags_to_iflags(inode, ip);
802 xfs_iflags_clear(ip, XFS_IMODIFIED);
803
804 switch (inode->i_mode & S_IFMT) {
805 case S_IFREG:
806 inode->i_op = &xfs_inode_operations;
807 inode->i_fop = &xfs_file_operations;
808 inode->i_mapping->a_ops = &xfs_address_space_operations;
809 break;
810 case S_IFDIR:
811 if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
812 inode->i_op = &xfs_dir_ci_inode_operations;
813 else
814 inode->i_op = &xfs_dir_inode_operations;
815 inode->i_fop = &xfs_dir_file_operations;
816 break;
817 case S_IFLNK:
818 inode->i_op = &xfs_symlink_inode_operations;
819 if (!(ip->i_df.if_flags & XFS_IFINLINE))
820 inode->i_mapping->a_ops = &xfs_address_space_operations;
821 break;
822 default:
823 inode->i_op = &xfs_inode_operations;
824 init_special_inode(inode, inode->i_mode, inode->i_rdev);
825 break;
826 }
827
828 xfs_iflags_clear(ip, XFS_INEW);
829 barrier();
830
831 unlock_new_inode(inode);
832}
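
The new xfs_setup_inode() consolidates what xfs_revalidate_inode() and xfs_set_inodeops() used to do (both removed from xfs_super.c later in this patch), and it ends with the standard new-inode publication handshake: concurrent lookups block on I_NEW until unlock_new_inode() clears it, and the barrier() keeps the XFS_INEW clear from being reordered past the Linux-inode setup. XFS drives this from its own inode cache rather than iget_locked(), but the generic protocol looks like this (fill_inode() is a hypothetical stand-in for the disk read):

/* Generic I_NEW protocol sketch; fill_inode() is hypothetical. */
static struct inode *lookup_inode(struct super_block *sb, unsigned long ino)
{
	struct inode *inode = iget_locked(sb, ino);

	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (inode->i_state & I_NEW) {
		fill_inode(inode);		/* read disk data, set i_op/i_fop */
		unlock_new_inode(inode);	/* clear I_NEW, wake waiters */
	}
	return inode;
}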
diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/linux-2.6/xfs_iops.h
index d97ba934a2ac..8b1a1e31dc21 100644
--- a/fs/xfs/linux-2.6/xfs_iops.h
+++ b/fs/xfs/linux-2.6/xfs_iops.h
@@ -18,10 +18,7 @@
18#ifndef __XFS_IOPS_H__ 18#ifndef __XFS_IOPS_H__
19#define __XFS_IOPS_H__ 19#define __XFS_IOPS_H__
20 20
21extern const struct inode_operations xfs_inode_operations; 21struct xfs_inode;
22extern const struct inode_operations xfs_dir_inode_operations;
23extern const struct inode_operations xfs_dir_ci_inode_operations;
24extern const struct inode_operations xfs_symlink_inode_operations;
25 22
26extern const struct file_operations xfs_file_operations; 23extern const struct file_operations xfs_file_operations;
27extern const struct file_operations xfs_dir_file_operations; 24extern const struct file_operations xfs_dir_file_operations;
@@ -29,14 +26,6 @@ extern const struct file_operations xfs_invis_file_operations;
29 26
30extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); 27extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
31 28
32struct xfs_inode; 29extern void xfs_setup_inode(struct xfs_inode *);
33extern void xfs_ichgtime(struct xfs_inode *, int);
34extern void xfs_ichgtime_fast(struct xfs_inode *, struct inode *, int);
35
36#define xfs_vtoi(vp) \
37 ((struct xfs_inode *)vn_to_inode(vp)->i_private)
38
39#define XFS_I(inode) \
40 ((struct xfs_inode *)(inode)->i_private)
41 30
42#endif /* __XFS_IOPS_H__ */ 31#endif /* __XFS_IOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 4d45d9351a6c..cc0f7b3a9795 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -45,13 +45,13 @@
45#include <mrlock.h> 45#include <mrlock.h>
46#include <sv.h> 46#include <sv.h>
47#include <mutex.h> 47#include <mutex.h>
48#include <sema.h>
49#include <time.h> 48#include <time.h>
50 49
51#include <support/ktrace.h> 50#include <support/ktrace.h>
52#include <support/debug.h> 51#include <support/debug.h>
53#include <support/uuid.h> 52#include <support/uuid.h>
54 53
54#include <linux/semaphore.h>
55#include <linux/mm.h> 55#include <linux/mm.h>
56#include <linux/kernel.h> 56#include <linux/kernel.h>
57#include <linux/blkdev.h> 57#include <linux/blkdev.h>
@@ -126,8 +126,6 @@
126 126
127#define current_cpu() (raw_smp_processor_id()) 127#define current_cpu() (raw_smp_processor_id())
128#define current_pid() (current->pid) 128#define current_pid() (current->pid)
129#define current_fsuid(cred) (current->fsuid)
130#define current_fsgid(cred) (current->fsgid)
131#define current_test_flags(f) (current->flags & (f)) 129#define current_test_flags(f) (current->flags & (f))
132#define current_set_flags_nested(sp, f) \ 130#define current_set_flags_nested(sp, f) \
133 (*(sp) = current->flags, current->flags |= (f)) 131 (*(sp) = current->flags, current->flags |= (f))
@@ -180,7 +178,7 @@
180#define xfs_sort(a,n,s,fn) sort(a,n,s,fn,NULL) 178#define xfs_sort(a,n,s,fn) sort(a,n,s,fn,NULL)
181#define xfs_stack_trace() dump_stack() 179#define xfs_stack_trace() dump_stack()
182#define xfs_itruncate_data(ip, off) \ 180#define xfs_itruncate_data(ip, off) \
183 (-vmtruncate(vn_to_inode(XFS_ITOV(ip)), (off))) 181 (-vmtruncate(VFS_I(ip), (off)))
184 182
185 183
186/* Move the kernel do_div definition off to one side */ 184/* Move the kernel do_div definition off to one side */
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 82333b3e118e..1957e5357d04 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -137,7 +137,7 @@ xfs_iozero(
137 struct address_space *mapping; 137 struct address_space *mapping;
138 int status; 138 int status;
139 139
140 mapping = ip->i_vnode->i_mapping; 140 mapping = VFS_I(ip)->i_mapping;
141 do { 141 do {
142 unsigned offset, bytes; 142 unsigned offset, bytes;
143 void *fsdata; 143 void *fsdata;
@@ -674,9 +674,7 @@ start:
674 */ 674 */
675 if (likely(!(ioflags & IO_INVIS) && 675 if (likely(!(ioflags & IO_INVIS) &&
676 !mnt_want_write(file->f_path.mnt))) { 676 !mnt_want_write(file->f_path.mnt))) {
677 file_update_time(file); 677 xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
678 xfs_ichgtime_fast(xip, inode,
679 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
680 mnt_drop_write(file->f_path.mnt); 678 mnt_drop_write(file->f_path.mnt);
681 } 679 }
682 680
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 30ae96397e31..7227b2efef22 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -158,7 +158,7 @@ enum {
158 Opt_barrier, Opt_nobarrier, Opt_err 158 Opt_barrier, Opt_nobarrier, Opt_err
159}; 159};
160 160
161static match_table_t tokens = { 161static const match_table_t tokens = {
162 {Opt_barrier, "barrier"}, 162 {Opt_barrier, "barrier"},
163 {Opt_nobarrier, "nobarrier"}, 163 {Opt_nobarrier, "nobarrier"},
164 {Opt_err, NULL} 164 {Opt_err, NULL}
@@ -581,118 +581,6 @@ xfs_max_file_offset(
581 return (((__uint64_t)pagefactor) << bitshift) - 1; 581 return (((__uint64_t)pagefactor) << bitshift) - 1;
582} 582}
583 583
584STATIC_INLINE void
585xfs_set_inodeops(
586 struct inode *inode)
587{
588 switch (inode->i_mode & S_IFMT) {
589 case S_IFREG:
590 inode->i_op = &xfs_inode_operations;
591 inode->i_fop = &xfs_file_operations;
592 inode->i_mapping->a_ops = &xfs_address_space_operations;
593 break;
594 case S_IFDIR:
595 if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
596 inode->i_op = &xfs_dir_ci_inode_operations;
597 else
598 inode->i_op = &xfs_dir_inode_operations;
599 inode->i_fop = &xfs_dir_file_operations;
600 break;
601 case S_IFLNK:
602 inode->i_op = &xfs_symlink_inode_operations;
603 if (!(XFS_I(inode)->i_df.if_flags & XFS_IFINLINE))
604 inode->i_mapping->a_ops = &xfs_address_space_operations;
605 break;
606 default:
607 inode->i_op = &xfs_inode_operations;
608 init_special_inode(inode, inode->i_mode, inode->i_rdev);
609 break;
610 }
611}
612
613STATIC_INLINE void
614xfs_revalidate_inode(
615 xfs_mount_t *mp,
616 bhv_vnode_t *vp,
617 xfs_inode_t *ip)
618{
619 struct inode *inode = vn_to_inode(vp);
620
621 inode->i_mode = ip->i_d.di_mode;
622 inode->i_nlink = ip->i_d.di_nlink;
623 inode->i_uid = ip->i_d.di_uid;
624 inode->i_gid = ip->i_d.di_gid;
625
626 switch (inode->i_mode & S_IFMT) {
627 case S_IFBLK:
628 case S_IFCHR:
629 inode->i_rdev =
630 MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff,
631 sysv_minor(ip->i_df.if_u2.if_rdev));
632 break;
633 default:
634 inode->i_rdev = 0;
635 break;
636 }
637
638 inode->i_generation = ip->i_d.di_gen;
639 i_size_write(inode, ip->i_d.di_size);
640 inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec;
641 inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec;
642 inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
643 inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
644 inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
645 inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
646 if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
647 inode->i_flags |= S_IMMUTABLE;
648 else
649 inode->i_flags &= ~S_IMMUTABLE;
650 if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
651 inode->i_flags |= S_APPEND;
652 else
653 inode->i_flags &= ~S_APPEND;
654 if (ip->i_d.di_flags & XFS_DIFLAG_SYNC)
655 inode->i_flags |= S_SYNC;
656 else
657 inode->i_flags &= ~S_SYNC;
658 if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)
659 inode->i_flags |= S_NOATIME;
660 else
661 inode->i_flags &= ~S_NOATIME;
662 xfs_iflags_clear(ip, XFS_IMODIFIED);
663}
664
665void
666xfs_initialize_vnode(
667 struct xfs_mount *mp,
668 bhv_vnode_t *vp,
669 struct xfs_inode *ip)
670{
671 struct inode *inode = vn_to_inode(vp);
672
673 if (!ip->i_vnode) {
674 ip->i_vnode = vp;
675 inode->i_private = ip;
676 }
677
678 /*
679 * We need to set the ops vectors, and unlock the inode, but if
680 * we have been called during the new inode create process, it is
681 * too early to fill in the Linux inode. We will get called a
682 * second time once the inode is properly set up, and then we can
683 * finish our work.
684 */
685 if (ip->i_d.di_mode != 0 && (inode->i_state & I_NEW)) {
686 xfs_revalidate_inode(mp, vp, ip);
687 xfs_set_inodeops(inode);
688
689 xfs_iflags_clear(ip, XFS_INEW);
690 barrier();
691
692 unlock_new_inode(inode);
693 }
694}
695
696int 584int
697xfs_blkdev_get( 585xfs_blkdev_get(
698 xfs_mount_t *mp, 586 xfs_mount_t *mp,
@@ -982,26 +870,21 @@ STATIC struct inode *
982xfs_fs_alloc_inode( 870xfs_fs_alloc_inode(
983 struct super_block *sb) 871 struct super_block *sb)
984{ 872{
985 bhv_vnode_t *vp; 873 return kmem_zone_alloc(xfs_vnode_zone, KM_SLEEP);
986
987 vp = kmem_zone_alloc(xfs_vnode_zone, KM_SLEEP);
988 if (unlikely(!vp))
989 return NULL;
990 return vn_to_inode(vp);
991} 874}
992 875
993STATIC void 876STATIC void
994xfs_fs_destroy_inode( 877xfs_fs_destroy_inode(
995 struct inode *inode) 878 struct inode *inode)
996{ 879{
997 kmem_zone_free(xfs_vnode_zone, vn_from_inode(inode)); 880 kmem_zone_free(xfs_vnode_zone, inode);
998} 881}
999 882
1000STATIC void 883STATIC void
1001xfs_fs_inode_init_once( 884xfs_fs_inode_init_once(
1002 void *vnode) 885 void *vnode)
1003{ 886{
1004 inode_init_once(vn_to_inode((bhv_vnode_t *)vnode)); 887 inode_init_once((struct inode *)vnode);
1005} 888}
1006 889
1007/* 890/*
@@ -1106,7 +989,7 @@ void
1106xfs_flush_inode( 989xfs_flush_inode(
1107 xfs_inode_t *ip) 990 xfs_inode_t *ip)
1108{ 991{
1109 struct inode *inode = ip->i_vnode; 992 struct inode *inode = VFS_I(ip);
1110 993
1111 igrab(inode); 994 igrab(inode);
1112 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work); 995 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
@@ -1131,7 +1014,7 @@ void
1131xfs_flush_device( 1014xfs_flush_device(
1132 xfs_inode_t *ip) 1015 xfs_inode_t *ip)
1133{ 1016{
1134 struct inode *inode = vn_to_inode(XFS_ITOV(ip)); 1017 struct inode *inode = VFS_I(ip);
1135 1018
1136 igrab(inode); 1019 igrab(inode);
1137 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work); 1020 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
@@ -1201,6 +1084,15 @@ xfssyncd(
1201} 1084}
1202 1085
1203STATIC void 1086STATIC void
1087xfs_free_fsname(
1088 struct xfs_mount *mp)
1089{
1090 kfree(mp->m_fsname);
1091 kfree(mp->m_rtname);
1092 kfree(mp->m_logname);
1093}
1094
1095STATIC void
1204xfs_fs_put_super( 1096xfs_fs_put_super(
1205 struct super_block *sb) 1097 struct super_block *sb)
1206{ 1098{
@@ -1239,8 +1131,6 @@ xfs_fs_put_super(
1239 error = xfs_unmount_flush(mp, 0); 1131 error = xfs_unmount_flush(mp, 0);
1240 WARN_ON(error); 1132 WARN_ON(error);
1241 1133
1242 IRELE(rip);
1243
1244 /* 1134 /*
1245 * If we're forcing a shutdown, typically because of a media error, 1135 * If we're forcing a shutdown, typically because of a media error,
1246 * we want to make sure we invalidate dirty pages that belong to 1136 * we want to make sure we invalidate dirty pages that belong to
@@ -1257,10 +1147,12 @@ xfs_fs_put_super(
1257 } 1147 }
1258 1148
1259 xfs_unmountfs(mp); 1149 xfs_unmountfs(mp);
1150 xfs_freesb(mp);
1260 xfs_icsb_destroy_counters(mp); 1151 xfs_icsb_destroy_counters(mp);
1261 xfs_close_devices(mp); 1152 xfs_close_devices(mp);
1262 xfs_qmops_put(mp); 1153 xfs_qmops_put(mp);
1263 xfs_dmops_put(mp); 1154 xfs_dmops_put(mp);
1155 xfs_free_fsname(mp);
1264 kfree(mp); 1156 kfree(mp);
1265} 1157}
1266 1158
@@ -1410,9 +1302,29 @@ xfs_fs_remount(
1410 mp->m_flags &= ~XFS_MOUNT_BARRIER; 1302 mp->m_flags &= ~XFS_MOUNT_BARRIER;
1411 break; 1303 break;
1412 default: 1304 default:
1305 /*
1306 * Logically we would return an error here to prevent
1307 * users from believing they might have changed
1308 * mount options using remount which can't be changed.
1309 *
1310 * But unfortunately mount(8) adds all options from
1311 * mtab and fstab to the mount arguments in some cases
1312 * so we can't blindly reject options, but have to
1313 * check for each specified option if it actually
1314 * differs from the currently set option and only
1315 * reject it if that's the case.
1316 *
1317 * Until that is implemented we return success for
1318 * every remount request, and silently ignore all
1319 * options that we can't actually change.
1320 */
1321#if 0
1413 printk(KERN_INFO 1322 printk(KERN_INFO
1414 "XFS: mount option \"%s\" not supported for remount\n", p); 1323 "XFS: mount option \"%s\" not supported for remount\n", p);
1415 return -EINVAL; 1324 return -EINVAL;
1325#else
1326 return 0;
1327#endif
1416 } 1328 }
1417 } 1329 }
1418 1330
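
The #if 0 block above records a compromise rather than accidentally dead code: mount(8) replays existing options from mtab/fstab on remount, so rejecting every unrecognized token breaks otherwise valid remounts. The per-option comparison the comment calls for might look like this hypothetical helper (not part of this patch): an option matching the current state is harmless echo, and only a genuine change deserves -EINVAL.

/* Hypothetical "reject only real changes" check. */
static int xfs_remount_opt_is_noop(struct xfs_mount *mp, int token)
{
	switch (token) {
	case Opt_barrier:
		return (mp->m_flags & XFS_MOUNT_BARRIER) != 0;
	case Opt_nobarrier:
		return (mp->m_flags & XFS_MOUNT_BARRIER) == 0;
	default:
		return 0;	/* unknown token: treat as a real change */
	}
}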
@@ -1517,6 +1429,8 @@ xfs_start_flags(
1517 struct xfs_mount_args *ap, 1429 struct xfs_mount_args *ap,
1518 struct xfs_mount *mp) 1430 struct xfs_mount *mp)
1519{ 1431{
1432 int error;
1433
1520 /* Values are in BBs */ 1434 /* Values are in BBs */
1521 if ((ap->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) { 1435 if ((ap->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) {
1522 /* 1436 /*
@@ -1549,17 +1463,27 @@ xfs_start_flags(
1549 ap->logbufsize); 1463 ap->logbufsize);
1550 return XFS_ERROR(EINVAL); 1464 return XFS_ERROR(EINVAL);
1551 } 1465 }
1466
1467 error = ENOMEM;
1468
1552 mp->m_logbsize = ap->logbufsize; 1469 mp->m_logbsize = ap->logbufsize;
1553 mp->m_fsname_len = strlen(ap->fsname) + 1; 1470 mp->m_fsname_len = strlen(ap->fsname) + 1;
1554 mp->m_fsname = kmem_alloc(mp->m_fsname_len, KM_SLEEP); 1471
1555 strcpy(mp->m_fsname, ap->fsname); 1472 mp->m_fsname = kstrdup(ap->fsname, GFP_KERNEL);
1473 if (!mp->m_fsname)
1474 goto out;
1475
1556 if (ap->rtname[0]) { 1476 if (ap->rtname[0]) {
1557 mp->m_rtname = kmem_alloc(strlen(ap->rtname) + 1, KM_SLEEP); 1477 mp->m_rtname = kstrdup(ap->rtname, GFP_KERNEL);
1558 strcpy(mp->m_rtname, ap->rtname); 1478 if (!mp->m_rtname)
1479 goto out_free_fsname;
1480
1559 } 1481 }
1482
1560 if (ap->logname[0]) { 1483 if (ap->logname[0]) {
1561 mp->m_logname = kmem_alloc(strlen(ap->logname) + 1, KM_SLEEP); 1484 mp->m_logname = kstrdup(ap->logname, GFP_KERNEL);
1562 strcpy(mp->m_logname, ap->logname); 1485 if (!mp->m_logname)
1486 goto out_free_rtname;
1563 } 1487 }
1564 1488
1565 if (ap->flags & XFSMNT_WSYNC) 1489 if (ap->flags & XFSMNT_WSYNC)
@@ -1632,6 +1556,14 @@ xfs_start_flags(
1632 if (ap->flags & XFSMNT_DMAPI) 1556 if (ap->flags & XFSMNT_DMAPI)
1633 mp->m_flags |= XFS_MOUNT_DMAPI; 1557 mp->m_flags |= XFS_MOUNT_DMAPI;
1634 return 0; 1558 return 0;
1559
1560
1561 out_free_rtname:
1562 kfree(mp->m_rtname);
1563 out_free_fsname:
1564 kfree(mp->m_fsname);
1565 out:
1566 return error;
1635} 1567}
1636 1568
1637/* 1569/*
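
xfs_start_flags() moves from kmem_alloc(..., KM_SLEEP) plus strcpy(), which could not fail, to kstrdup(..., GFP_KERNEL), which can; hence the new int error and the unwind labels that free in reverse allocation order. The pattern in isolation (struct and field names hypothetical):

/* Canonical allocate-and-unwind shape. */
struct cfg { char *fsname; char *rtname; };

static int copy_names(struct cfg *c, const char *fsname, const char *rtname)
{
	c->fsname = kstrdup(fsname, GFP_KERNEL);
	if (!c->fsname)
		goto out;
	c->rtname = kstrdup(rtname, GFP_KERNEL);
	if (!c->rtname)
		goto out_free_fsname;
	return 0;

 out_free_fsname:
	kfree(c->fsname);
 out:
	return -ENOMEM;
}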
@@ -1792,10 +1724,10 @@ xfs_fs_fill_super(
1792 */ 1724 */
1793 error = xfs_start_flags(args, mp); 1725 error = xfs_start_flags(args, mp);
1794 if (error) 1726 if (error)
1795 goto out_destroy_counters; 1727 goto out_free_fsname;
1796 error = xfs_readsb(mp, flags); 1728 error = xfs_readsb(mp, flags);
1797 if (error) 1729 if (error)
1798 goto out_destroy_counters; 1730 goto out_free_fsname;
1799 error = xfs_finish_flags(args, mp); 1731 error = xfs_finish_flags(args, mp);
1800 if (error) 1732 if (error)
1801 goto out_free_sb; 1733 goto out_free_sb;
@@ -1811,7 +1743,7 @@ xfs_fs_fill_super(
1811 if (error) 1743 if (error)
1812 goto out_free_sb; 1744 goto out_free_sb;
1813 1745
1814 error = xfs_mountfs(mp, flags); 1746 error = xfs_mountfs(mp);
1815 if (error) 1747 if (error)
1816 goto out_filestream_unmount; 1748 goto out_filestream_unmount;
1817 1749
@@ -1825,7 +1757,7 @@ xfs_fs_fill_super(
1825 sb->s_time_gran = 1; 1757 sb->s_time_gran = 1;
1826 set_posix_acl_flag(sb); 1758 set_posix_acl_flag(sb);
1827 1759
1828 root = igrab(mp->m_rootip->i_vnode); 1760 root = igrab(VFS_I(mp->m_rootip));
1829 if (!root) { 1761 if (!root) {
1830 error = ENOENT; 1762 error = ENOENT;
1831 goto fail_unmount; 1763 goto fail_unmount;
@@ -1857,7 +1789,8 @@ xfs_fs_fill_super(
1857 xfs_filestream_unmount(mp); 1789 xfs_filestream_unmount(mp);
1858 out_free_sb: 1790 out_free_sb:
1859 xfs_freesb(mp); 1791 xfs_freesb(mp);
1860 out_destroy_counters: 1792 out_free_fsname:
1793 xfs_free_fsname(mp);
1861 xfs_icsb_destroy_counters(mp); 1794 xfs_icsb_destroy_counters(mp);
1862 xfs_close_devices(mp); 1795 xfs_close_devices(mp);
1863 out_put_qmops: 1796 out_put_qmops:
@@ -1890,10 +1823,8 @@ xfs_fs_fill_super(
1890 error = xfs_unmount_flush(mp, 0); 1823 error = xfs_unmount_flush(mp, 0);
1891 WARN_ON(error); 1824 WARN_ON(error);
1892 1825
1893 IRELE(mp->m_rootip);
1894
1895 xfs_unmountfs(mp); 1826 xfs_unmountfs(mp);
1896 goto out_destroy_counters; 1827 goto out_free_sb;
1897} 1828}
1898 1829
1899STATIC int 1830STATIC int
@@ -2014,7 +1945,7 @@ xfs_free_trace_bufs(void)
2014STATIC int __init 1945STATIC int __init
2015xfs_init_zones(void) 1946xfs_init_zones(void)
2016{ 1947{
2017 xfs_vnode_zone = kmem_zone_init_flags(sizeof(bhv_vnode_t), "xfs_vnode", 1948 xfs_vnode_zone = kmem_zone_init_flags(sizeof(struct inode), "xfs_vnode",
2018 KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | 1949 KM_ZONE_HWALIGN | KM_ZONE_RECLAIM |
2019 KM_ZONE_SPREAD, 1950 KM_ZONE_SPREAD,
2020 xfs_fs_inode_init_once); 1951 xfs_fs_inode_init_once);
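
With the bhv_vnode_t typedef gone (see the xfs_vnode.h hunks below), the private slab now holds plain struct inode objects and the alloc/destroy pair collapses to one-liners. The constructor matters: slab init-once callbacks run when an object's backing storage is created, not on every allocation, so only state that must survive free/realloc cycles (locks, list heads) belongs there, which is exactly what inode_init_once() sets up. A generic sketch of the trio (names hypothetical):

static struct kmem_cache *my_inode_cache;

static struct inode *my_alloc_inode(struct super_block *sb)
{
	return kmem_cache_alloc(my_inode_cache, GFP_KERNEL);
}

static void my_destroy_inode(struct inode *inode)
{
	kmem_cache_free(my_inode_cache, inode);
}

static void my_inode_init_once(void *p)	/* per-object, not per-alloc */
{
	inode_init_once(p);
}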
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index b7d13da01bd6..fe2ef4e6a0f9 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -101,9 +101,6 @@ struct block_device;
101 101
102extern __uint64_t xfs_max_file_offset(unsigned int); 102extern __uint64_t xfs_max_file_offset(unsigned int);
103 103
104extern void xfs_initialize_vnode(struct xfs_mount *mp, bhv_vnode_t *vp,
105 struct xfs_inode *ip);
106
107extern void xfs_flush_inode(struct xfs_inode *); 104extern void xfs_flush_inode(struct xfs_inode *);
108extern void xfs_flush_device(struct xfs_inode *); 105extern void xfs_flush_device(struct xfs_inode *);
109 106
diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c
index 25488b6d9881..b52528bbbfff 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.c
+++ b/fs/xfs/linux-2.6/xfs_vnode.c
@@ -33,7 +33,7 @@
33 33
34 34
35/* 35/*
36 * Dedicated vnode inactive/reclaim sync semaphores. 36 * Dedicated vnode inactive/reclaim sync wait queues.
37 * Prime number of hash buckets since address is used as the key. 37 * Prime number of hash buckets since address is used as the key.
38 */ 38 */
39#define NVSYNC 37 39#define NVSYNC 37
@@ -82,24 +82,6 @@ vn_ioerror(
82 xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ, f, l); 82 xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ, f, l);
83} 83}
84 84
85
86/*
87 * Add a reference to a referenced vnode.
88 */
89bhv_vnode_t *
90vn_hold(
91 bhv_vnode_t *vp)
92{
93 struct inode *inode;
94
95 XFS_STATS_INC(vn_hold);
96
97 inode = igrab(vn_to_inode(vp));
98 ASSERT(inode);
99
100 return vp;
101}
102
103#ifdef XFS_INODE_TRACE 85#ifdef XFS_INODE_TRACE
104 86
105/* 87/*
@@ -108,7 +90,7 @@ vn_hold(
108 */ 90 */
109static inline int xfs_icount(struct xfs_inode *ip) 91static inline int xfs_icount(struct xfs_inode *ip)
110{ 92{
111 bhv_vnode_t *vp = XFS_ITOV_NULL(ip); 93 struct inode *vp = VFS_I(ip);
112 94
113 if (vp) 95 if (vp)
114 return vn_count(vp); 96 return vn_count(vp);
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index 41ca2cec5d31..683ce16210ff 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -22,20 +22,6 @@ struct file;
22struct xfs_iomap; 22struct xfs_iomap;
23struct attrlist_cursor_kern; 23struct attrlist_cursor_kern;
24 24
25typedef struct inode bhv_vnode_t;
26
27/*
28 * Vnode to Linux inode mapping.
29 */
30static inline bhv_vnode_t *vn_from_inode(struct inode *inode)
31{
32 return inode;
33}
34static inline struct inode *vn_to_inode(bhv_vnode_t *vnode)
35{
36 return vnode;
37}
38
39/* 25/*
40 * Return values for xfs_inactive. A return value of 26 * Return values for xfs_inactive. A return value of
41 * VN_INACTIVE_NOCACHE implies that the file system behavior 27 * VN_INACTIVE_NOCACHE implies that the file system behavior
@@ -76,57 +62,52 @@ extern void vn_iowait(struct xfs_inode *ip);
76extern void vn_iowake(struct xfs_inode *ip); 62extern void vn_iowake(struct xfs_inode *ip);
77extern void vn_ioerror(struct xfs_inode *ip, int error, char *f, int l); 63extern void vn_ioerror(struct xfs_inode *ip, int error, char *f, int l);
78 64
79static inline int vn_count(bhv_vnode_t *vp) 65static inline int vn_count(struct inode *vp)
80{ 66{
81 return atomic_read(&vn_to_inode(vp)->i_count); 67 return atomic_read(&vp->i_count);
82} 68}
83 69
84/* 70#define IHOLD(ip) \
85 * Vnode reference counting functions (and macros for compatibility). 71do { \
86 */ 72 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
87extern bhv_vnode_t *vn_hold(bhv_vnode_t *); 73 atomic_inc(&(VFS_I(ip)->i_count)); \
74 xfs_itrace_hold((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
75} while (0)
88 76
89#if defined(XFS_INODE_TRACE) 77#define IRELE(ip) \
90#define VN_HOLD(vp) \ 78do { \
91 ((void)vn_hold(vp), \ 79 xfs_itrace_rele((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
92 xfs_itrace_hold(xfs_vtoi(vp), __FILE__, __LINE__, (inst_t *)__return_address)) 80 iput(VFS_I(ip)); \
93#define VN_RELE(vp) \ 81} while (0)
94 (xfs_itrace_rele(xfs_vtoi(vp), __FILE__, __LINE__, (inst_t *)__return_address), \
95 iput(vn_to_inode(vp)))
96#else
97#define VN_HOLD(vp) ((void)vn_hold(vp))
98#define VN_RELE(vp) (iput(vn_to_inode(vp)))
99#endif
100 82
101static inline bhv_vnode_t *vn_grab(bhv_vnode_t *vp) 83static inline struct inode *vn_grab(struct inode *vp)
102{ 84{
103 struct inode *inode = igrab(vn_to_inode(vp)); 85 return igrab(vp);
104 return inode ? vn_from_inode(inode) : NULL;
105} 86}
106 87
107/* 88/*
108 * Dealing with bad inodes 89 * Dealing with bad inodes
109 */ 90 */
110static inline int VN_BAD(bhv_vnode_t *vp) 91static inline int VN_BAD(struct inode *vp)
111{ 92{
112 return is_bad_inode(vn_to_inode(vp)); 93 return is_bad_inode(vp);
113} 94}
114 95
115/* 96/*
116 * Extracting atime values in various formats 97 * Extracting atime values in various formats
117 */ 98 */
118static inline void vn_atime_to_bstime(bhv_vnode_t *vp, xfs_bstime_t *bs_atime) 99static inline void vn_atime_to_bstime(struct inode *vp, xfs_bstime_t *bs_atime)
119{ 100{
120 bs_atime->tv_sec = vp->i_atime.tv_sec; 101 bs_atime->tv_sec = vp->i_atime.tv_sec;
121 bs_atime->tv_nsec = vp->i_atime.tv_nsec; 102 bs_atime->tv_nsec = vp->i_atime.tv_nsec;
122} 103}
123 104
124static inline void vn_atime_to_timespec(bhv_vnode_t *vp, struct timespec *ts) 105static inline void vn_atime_to_timespec(struct inode *vp, struct timespec *ts)
125{ 106{
126 *ts = vp->i_atime; 107 *ts = vp->i_atime;
127} 108}
128 109
129static inline void vn_atime_to_time_t(bhv_vnode_t *vp, time_t *tt) 110static inline void vn_atime_to_time_t(struct inode *vp, time_t *tt)
130{ 111{
131 *tt = vp->i_atime.tv_sec; 112 *tt = vp->i_atime.tv_sec;
132} 113}
@@ -134,9 +115,9 @@ static inline void vn_atime_to_time_t(bhv_vnode_t *vp, time_t *tt)
134/* 115/*
135 * Some useful predicates. 116 * Some useful predicates.
136 */ 117 */
137#define VN_MAPPED(vp) mapping_mapped(vn_to_inode(vp)->i_mapping) 118#define VN_MAPPED(vp) mapping_mapped(vp->i_mapping)
138#define VN_CACHED(vp) (vn_to_inode(vp)->i_mapping->nrpages) 119#define VN_CACHED(vp) (vp->i_mapping->nrpages)
139#define VN_DIRTY(vp) mapping_tagged(vn_to_inode(vp)->i_mapping, \ 120#define VN_DIRTY(vp) mapping_tagged(vp->i_mapping, \
140 PAGECACHE_TAG_DIRTY) 121 PAGECACHE_TAG_DIRTY)
141 122
142 123
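
vn_hold(), deleted from xfs_vnode.c above, went through igrab(), which takes the global inode lock and refuses inodes that are being freed. The replacement IHOLD() is only legal when the caller already owns a reference, so it can assert that and do a bare atomic_inc() on i_count; IRELE() stays a traced iput(). Usage sketch (the worker functions are hypothetical):

static void hand_off_inode(struct xfs_inode *ip)
{
	IHOLD(ip);			/* extra ref for the async worker */
	queue_inode_work(ip);		/* hypothetical consumer ... */
}

static void inode_work_done(struct xfs_inode *ip)
{
	IRELE(ip);			/* ... which drops it via iput() */
}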
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index fc9f3fb39b7b..f2705f2fd43c 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -101,11 +101,18 @@ xfs_qm_dqinit(
101 if (brandnewdquot) { 101 if (brandnewdquot) {
102 dqp->dq_flnext = dqp->dq_flprev = dqp; 102 dqp->dq_flnext = dqp->dq_flprev = dqp;
103 mutex_init(&dqp->q_qlock); 103 mutex_init(&dqp->q_qlock);
104 initnsema(&dqp->q_flock, 1, "fdq");
105 sv_init(&dqp->q_pinwait, SV_DEFAULT, "pdq"); 104 sv_init(&dqp->q_pinwait, SV_DEFAULT, "pdq");
106 105
106 /*
107 * Because we want to use a counting completion, complete
108 * the flush completion once to allow a single access to
109 * the flush completion without blocking.
110 */
111 init_completion(&dqp->q_flush);
112 complete(&dqp->q_flush);
113
107#ifdef XFS_DQUOT_TRACE 114#ifdef XFS_DQUOT_TRACE
108 dqp->q_trace = ktrace_alloc(DQUOT_TRACE_SIZE, KM_SLEEP); 115 dqp->q_trace = ktrace_alloc(DQUOT_TRACE_SIZE, KM_NOFS);
109 xfs_dqtrace_entry(dqp, "DQINIT"); 116 xfs_dqtrace_entry(dqp, "DQINIT");
110#endif 117#endif
111 } else { 118 } else {
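
Alongside the flush-lock conversion, note the quieter change to the trace buffer allocation: KM_SLEEP becomes KM_NOFS. XFS's KM_* flags are translated to page-allocator GFP_* flags, and GFP_NOFS forbids direct reclaim from re-entering filesystem code, which matters at an allocation site that may already hold fs locks. The idea of the mapping, assuming the usual kmem_flags_convert() behaviour:

/* Sketch only; the real translation is kmem_flags_convert() in
 * XFS's kmem wrapper. */
static gfp_t km_to_gfp(unsigned int km_flags)
{
	if (km_flags & KM_NOFS)
		return GFP_NOFS;	/* reclaim must not re-enter the fs */
	return GFP_KERNEL;		/* KM_SLEEP: full reclaim allowed */
}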
@@ -150,7 +157,6 @@ xfs_qm_dqdestroy(
150 ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp)); 157 ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp));
151 158
152 mutex_destroy(&dqp->q_qlock); 159 mutex_destroy(&dqp->q_qlock);
153 freesema(&dqp->q_flock);
154 sv_destroy(&dqp->q_pinwait); 160 sv_destroy(&dqp->q_pinwait);
155 161
156#ifdef XFS_DQUOT_TRACE 162#ifdef XFS_DQUOT_TRACE
@@ -431,7 +437,7 @@ xfs_qm_dqalloc(
431 * when it unlocks the inode. Since we want to keep the quota 437 * when it unlocks the inode. Since we want to keep the quota
432 * inode around, we bump the vnode ref count now. 438 * inode around, we bump the vnode ref count now.
433 */ 439 */
434 VN_HOLD(XFS_ITOV(quotip)); 440 IHOLD(quotip);
435 441
436 xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); 442 xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
437 nmaps = 1; 443 nmaps = 1;
@@ -1211,7 +1217,7 @@ xfs_qm_dqflush(
1211 int error; 1217 int error;
1212 1218
1213 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 1219 ASSERT(XFS_DQ_IS_LOCKED(dqp));
1214 ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp)); 1220 ASSERT(!completion_done(&dqp->q_flush));
1215 xfs_dqtrace_entry(dqp, "DQFLUSH"); 1221 xfs_dqtrace_entry(dqp, "DQFLUSH");
1216 1222
1217 /* 1223 /*
@@ -1348,34 +1354,18 @@ xfs_qm_dqflush_done(
1348 xfs_dqfunlock(dqp); 1354 xfs_dqfunlock(dqp);
1349} 1355}
1350 1356
1351
1352int
1353xfs_qm_dqflock_nowait(
1354 xfs_dquot_t *dqp)
1355{
1356 int locked;
1357
1358 locked = cpsema(&((dqp)->q_flock));
1359
1360 /* XXX ifdef these out */
1361 if (locked)
1362 (dqp)->dq_flags |= XFS_DQ_FLOCKED;
1363 return (locked);
1364}
1365
1366
1367int 1357int
1368xfs_qm_dqlock_nowait( 1358xfs_qm_dqlock_nowait(
1369 xfs_dquot_t *dqp) 1359 xfs_dquot_t *dqp)
1370{ 1360{
1371 return (mutex_trylock(&((dqp)->q_qlock))); 1361 return mutex_trylock(&dqp->q_qlock);
1372} 1362}
1373 1363
1374void 1364void
1375xfs_dqlock( 1365xfs_dqlock(
1376 xfs_dquot_t *dqp) 1366 xfs_dquot_t *dqp)
1377{ 1367{
1378 mutex_lock(&(dqp->q_qlock)); 1368 mutex_lock(&dqp->q_qlock);
1379} 1369}
1380 1370
1381void 1371void
@@ -1468,7 +1458,7 @@ xfs_qm_dqpurge(
1468 * if we're turning off quotas. Basically, we need this flush 1458 * if we're turning off quotas. Basically, we need this flush
1469 * lock, and are willing to block on it. 1459 * lock, and are willing to block on it.
1470 */ 1460 */
1471 if (! xfs_qm_dqflock_nowait(dqp)) { 1461 if (!xfs_dqflock_nowait(dqp)) {
1472 /* 1462 /*
1473 * Block on the flush lock after nudging dquot buffer, 1463 * Block on the flush lock after nudging dquot buffer,
1474 * if it is incore. 1464 * if it is incore.
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index f7393bba4e95..8958d0faf8d3 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -82,7 +82,7 @@ typedef struct xfs_dquot {
82 xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */ 82 xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */
83 xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */ 83 xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */
84 mutex_t q_qlock; /* quota lock */ 84 mutex_t q_qlock; /* quota lock */
85 sema_t q_flock; /* flush lock */ 85 struct completion q_flush; /* flush completion queue */
86 uint q_pincount; /* pin count for this dquot */ 86 uint q_pincount; /* pin count for this dquot */
87 sv_t q_pinwait; /* sync var for pinning */ 87 sv_t q_pinwait; /* sync var for pinning */
88#ifdef XFS_DQUOT_TRACE 88#ifdef XFS_DQUOT_TRACE
@@ -113,17 +113,25 @@ XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp)
113 113
114 114
115/* 115/*
116 * The following three routines simply manage the q_flock 116 * Manage the q_flush completion queue embedded in the dquot. This completion
117 * semaphore embedded in the dquot. This semaphore synchronizes 117 * queue synchronizes processes attempting to flush the in-core dquot back to
118 * processes attempting to flush the in-core dquot back to disk. 118 * disk.
119 */ 119 */
120#define xfs_dqflock(dqp) { psema(&((dqp)->q_flock), PINOD | PRECALC);\ 120static inline void xfs_dqflock(xfs_dquot_t *dqp)
121 (dqp)->dq_flags |= XFS_DQ_FLOCKED; } 121{
122#define xfs_dqfunlock(dqp) { ASSERT(issemalocked(&((dqp)->q_flock))); \ 122 wait_for_completion(&dqp->q_flush);
123 vsema(&((dqp)->q_flock)); \ 123}
124 (dqp)->dq_flags &= ~(XFS_DQ_FLOCKED); } 124
125static inline int xfs_dqflock_nowait(xfs_dquot_t *dqp)
126{
127 return try_wait_for_completion(&dqp->q_flush);
128}
129
130static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
131{
132 complete(&dqp->q_flush);
133}
125 134
126#define XFS_DQ_IS_FLUSH_LOCKED(dqp) (issemalocked(&((dqp)->q_flock)))
127#define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp)) 135#define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp))
128#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) 136#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY)
129#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) 137#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
@@ -167,7 +175,6 @@ extern int xfs_qm_dqflush(xfs_dquot_t *, uint);
167extern int xfs_qm_dqpurge(xfs_dquot_t *); 175extern int xfs_qm_dqpurge(xfs_dquot_t *);
168extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); 176extern void xfs_qm_dqunpin_wait(xfs_dquot_t *);
169extern int xfs_qm_dqlock_nowait(xfs_dquot_t *); 177extern int xfs_qm_dqlock_nowait(xfs_dquot_t *);
170extern int xfs_qm_dqflock_nowait(xfs_dquot_t *);
171extern void xfs_qm_dqflock_pushbuf_wait(xfs_dquot_t *dqp); 178extern void xfs_qm_dqflock_pushbuf_wait(xfs_dquot_t *dqp);
172extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, 179extern void xfs_qm_adjust_dqtimers(xfs_mount_t *,
173 xfs_disk_dquot_t *); 180 xfs_disk_dquot_t *);
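
A completion that is initialized and then completed once behaves like a binary semaphore, which is exactly how q_flush replaces the old sema_t: wait_for_completion() is "lock", try_wait_for_completion() is "trylock" (added to mainline around this time), complete() is "unlock", and !completion_done() answers "is it held?", as the ASSERT changes elsewhere in this patch show. Distilled:

#include <linux/completion.h>

static struct completion flush;

static void flush_lock_init(void)
{
	init_completion(&flush);
	complete(&flush);		/* start "unlocked": done count = 1 */
}

static void flush_lock(void)		/* done 1 -> 0, or sleep */
{
	wait_for_completion(&flush);
}

static int flush_trylock(void)		/* nonzero on success */
{
	return try_wait_for_completion(&flush);
}

static void flush_unlock(void)		/* done 0 -> 1, wake one waiter */
{
	complete(&flush);
}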
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 08d2fc89e6a1..f028644caa5e 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -151,7 +151,7 @@ xfs_qm_dquot_logitem_push(
151 dqp = logitem->qli_dquot; 151 dqp = logitem->qli_dquot;
152 152
153 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 153 ASSERT(XFS_DQ_IS_LOCKED(dqp));
154 ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp)); 154 ASSERT(!completion_done(&dqp->q_flush));
155 155
156 /* 156 /*
157 * Since we were able to lock the dquot's flush lock and 157 * Since we were able to lock the dquot's flush lock and
@@ -245,7 +245,7 @@ xfs_qm_dquot_logitem_pushbuf(
245 * inode flush completed and the inode was taken off the AIL. 245 * inode flush completed and the inode was taken off the AIL.
246 * So, just get out. 246 * So, just get out.
247 */ 247 */
248 if (!issemalocked(&(dqp->q_flock)) || 248 if (completion_done(&dqp->q_flush) ||
249 ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) { 249 ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) {
250 qip->qli_pushbuf_flag = 0; 250 qip->qli_pushbuf_flag = 0;
251 xfs_dqunlock(dqp); 251 xfs_dqunlock(dqp);
@@ -258,7 +258,7 @@ xfs_qm_dquot_logitem_pushbuf(
258 if (bp != NULL) { 258 if (bp != NULL) {
259 if (XFS_BUF_ISDELAYWRITE(bp)) { 259 if (XFS_BUF_ISDELAYWRITE(bp)) {
260 dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) && 260 dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
261 issemalocked(&(dqp->q_flock))); 261 !completion_done(&dqp->q_flush));
262 qip->qli_pushbuf_flag = 0; 262 qip->qli_pushbuf_flag = 0;
263 xfs_dqunlock(dqp); 263 xfs_dqunlock(dqp);
264 264
@@ -317,7 +317,7 @@ xfs_qm_dquot_logitem_trylock(
317 return (XFS_ITEM_LOCKED); 317 return (XFS_ITEM_LOCKED);
318 318
319 retval = XFS_ITEM_SUCCESS; 319 retval = XFS_ITEM_SUCCESS;
320 if (! xfs_qm_dqflock_nowait(dqp)) { 320 if (!xfs_dqflock_nowait(dqp)) {
321 /* 321 /*
322 * The dquot is already being flushed. It may have been 322 * The dquot is already being flushed. It may have been
323 * flushed delayed write, however, and we don't want to 323 * flushed delayed write, however, and we don't want to
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 021934a3d456..df0ffef9775a 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -310,8 +310,7 @@ xfs_qm_unmount_quotadestroy(
310 */ 310 */
311void 311void
312xfs_qm_mount_quotas( 312xfs_qm_mount_quotas(
313 xfs_mount_t *mp, 313 xfs_mount_t *mp)
314 int mfsi_flags)
315{ 314{
316 int error = 0; 315 int error = 0;
317 uint sbf; 316 uint sbf;
@@ -346,8 +345,7 @@ xfs_qm_mount_quotas(
346 /* 345 /*
347 * If any of the quotas are not consistent, do a quotacheck. 346 * If any of the quotas are not consistent, do a quotacheck.
348 */ 347 */
349 if (XFS_QM_NEED_QUOTACHECK(mp) && 348 if (XFS_QM_NEED_QUOTACHECK(mp)) {
350 !(mfsi_flags & XFS_MFSI_NO_QUOTACHECK)) {
351 error = xfs_qm_quotacheck(mp); 349 error = xfs_qm_quotacheck(mp);
352 if (error) { 350 if (error) {
353 /* Quotacheck failed and disabled quotas. */ 351 /* Quotacheck failed and disabled quotas. */
@@ -484,7 +482,7 @@ again:
484 xfs_dqtrace_entry(dqp, "FLUSHALL: DQDIRTY"); 482 xfs_dqtrace_entry(dqp, "FLUSHALL: DQDIRTY");
485 /* XXX a sentinel would be better */ 483 /* XXX a sentinel would be better */
486 recl = XFS_QI_MPLRECLAIMS(mp); 484 recl = XFS_QI_MPLRECLAIMS(mp);
487 if (! xfs_qm_dqflock_nowait(dqp)) { 485 if (!xfs_dqflock_nowait(dqp)) {
488 /* 486 /*
489 * If we can't grab the flush lock then check 487 * If we can't grab the flush lock then check
490 * to see if the dquot has been flushed delayed 488 * to see if the dquot has been flushed delayed
@@ -1062,7 +1060,7 @@ xfs_qm_sync(
1062 1060
1063 /* XXX a sentinel would be better */ 1061 /* XXX a sentinel would be better */
1064 recl = XFS_QI_MPLRECLAIMS(mp); 1062 recl = XFS_QI_MPLRECLAIMS(mp);
1065 if (! xfs_qm_dqflock_nowait(dqp)) { 1063 if (!xfs_dqflock_nowait(dqp)) {
1066 if (nowait) { 1064 if (nowait) {
1067 xfs_dqunlock(dqp); 1065 xfs_dqunlock(dqp);
1068 continue; 1066 continue;
@@ -2079,7 +2077,7 @@ xfs_qm_shake_freelist(
2079 * Try to grab the flush lock. If this dquot is in the process of 2077 * Try to grab the flush lock. If this dquot is in the process of
2080 * getting flushed to disk, we don't want to reclaim it. 2078 * getting flushed to disk, we don't want to reclaim it.
2081 */ 2079 */
2082 if (! xfs_qm_dqflock_nowait(dqp)) { 2080 if (!xfs_dqflock_nowait(dqp)) {
2083 xfs_dqunlock(dqp); 2081 xfs_dqunlock(dqp);
2084 dqp = dqp->dq_flnext; 2082 dqp = dqp->dq_flnext;
2085 continue; 2083 continue;
@@ -2257,7 +2255,7 @@ xfs_qm_dqreclaim_one(void)
2257 * Try to grab the flush lock. If this dquot is in the process of 2255 * Try to grab the flush lock. If this dquot is in the process of
2258 * getting flushed to disk, we don't want to reclaim it. 2256 * getting flushed to disk, we don't want to reclaim it.
2259 */ 2257 */
2260 if (! xfs_qm_dqflock_nowait(dqp)) { 2258 if (!xfs_dqflock_nowait(dqp)) {
2261 xfs_dqunlock(dqp); 2259 xfs_dqunlock(dqp);
2262 continue; 2260 continue;
2263 } 2261 }
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index cd2300e374af..44f25349e478 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -165,7 +165,7 @@ typedef struct xfs_dquot_acct {
165#define XFS_QM_RELE(xqm) ((xqm)->qm_nrefs--) 165#define XFS_QM_RELE(xqm) ((xqm)->qm_nrefs--)
166 166
167extern void xfs_qm_destroy_quotainfo(xfs_mount_t *); 167extern void xfs_qm_destroy_quotainfo(xfs_mount_t *);
168extern void xfs_qm_mount_quotas(xfs_mount_t *, int); 168extern void xfs_qm_mount_quotas(xfs_mount_t *);
169extern int xfs_qm_quotacheck(xfs_mount_t *); 169extern int xfs_qm_quotacheck(xfs_mount_t *);
170extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *); 170extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *);
171extern int xfs_qm_unmount_quotas(xfs_mount_t *); 171extern int xfs_qm_unmount_quotas(xfs_mount_t *);
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index f4f6c4c861d7..eea2e60b456b 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -162,7 +162,7 @@ xfs_qm_newmount(
162 * mounting, and get on with the boring life 162 * mounting, and get on with the boring life
163 * without disk quotas. 163 * without disk quotas.
164 */ 164 */
165 xfs_qm_mount_quotas(mp, 0); 165 xfs_qm_mount_quotas(mp);
166 } else { 166 } else {
167 /* 167 /*
168 * Clear the quota flags, but remember them. This 168 * Clear the quota flags, but remember them. This
@@ -184,13 +184,12 @@ STATIC int
184xfs_qm_endmount( 184xfs_qm_endmount(
185 xfs_mount_t *mp, 185 xfs_mount_t *mp,
186 uint needquotamount, 186 uint needquotamount,
187 uint quotaflags, 187 uint quotaflags)
188 int mfsi_flags)
189{ 188{
190 if (needquotamount) { 189 if (needquotamount) {
191 ASSERT(mp->m_qflags == 0); 190 ASSERT(mp->m_qflags == 0);
192 mp->m_qflags = quotaflags; 191 mp->m_qflags = quotaflags;
193 xfs_qm_mount_quotas(mp, mfsi_flags); 192 xfs_qm_mount_quotas(mp);
194 } 193 }
195 194
196#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) 195#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index adfb8723f65a..1a3b803dfa55 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -1034,7 +1034,7 @@ xfs_qm_dqrele_all_inodes(
1034{ 1034{
1035 xfs_inode_t *ip, *topino; 1035 xfs_inode_t *ip, *topino;
1036 uint ireclaims; 1036 uint ireclaims;
1037 bhv_vnode_t *vp; 1037 struct inode *vp;
1038 boolean_t vnode_refd; 1038 boolean_t vnode_refd;
1039 1039
1040 ASSERT(mp->m_quotainfo); 1040 ASSERT(mp->m_quotainfo);
@@ -1059,7 +1059,7 @@ again:
1059 ip = ip->i_mnext; 1059 ip = ip->i_mnext;
1060 continue; 1060 continue;
1061 } 1061 }
1062 vp = XFS_ITOV_NULL(ip); 1062 vp = VFS_I(ip);
1063 if (!vp) { 1063 if (!vp) {
1064 ASSERT(ip->i_udquot == NULL); 1064 ASSERT(ip->i_udquot == NULL);
1065 ASSERT(ip->i_gdquot == NULL); 1065 ASSERT(ip->i_gdquot == NULL);
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 3e4648ad9cfc..b2f639a1416f 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -37,15 +37,15 @@
37#include <linux/capability.h> 37#include <linux/capability.h>
38#include <linux/posix_acl_xattr.h> 38#include <linux/posix_acl_xattr.h>
39 39
40STATIC int xfs_acl_setmode(bhv_vnode_t *, xfs_acl_t *, int *); 40STATIC int xfs_acl_setmode(struct inode *, xfs_acl_t *, int *);
41STATIC void xfs_acl_filter_mode(mode_t, xfs_acl_t *); 41STATIC void xfs_acl_filter_mode(mode_t, xfs_acl_t *);
42STATIC void xfs_acl_get_endian(xfs_acl_t *); 42STATIC void xfs_acl_get_endian(xfs_acl_t *);
43STATIC int xfs_acl_access(uid_t, gid_t, xfs_acl_t *, mode_t, cred_t *); 43STATIC int xfs_acl_access(uid_t, gid_t, xfs_acl_t *, mode_t, cred_t *);
44STATIC int xfs_acl_invalid(xfs_acl_t *); 44STATIC int xfs_acl_invalid(xfs_acl_t *);
45STATIC void xfs_acl_sync_mode(mode_t, xfs_acl_t *); 45STATIC void xfs_acl_sync_mode(mode_t, xfs_acl_t *);
46STATIC void xfs_acl_get_attr(bhv_vnode_t *, xfs_acl_t *, int, int, int *); 46STATIC void xfs_acl_get_attr(struct inode *, xfs_acl_t *, int, int, int *);
47STATIC void xfs_acl_set_attr(bhv_vnode_t *, xfs_acl_t *, int, int *); 47STATIC void xfs_acl_set_attr(struct inode *, xfs_acl_t *, int, int *);
48STATIC int xfs_acl_allow_set(bhv_vnode_t *, int); 48STATIC int xfs_acl_allow_set(struct inode *, int);
49 49
50kmem_zone_t *xfs_acl_zone; 50kmem_zone_t *xfs_acl_zone;
51 51
@@ -55,7 +55,7 @@ kmem_zone_t *xfs_acl_zone;
55 */ 55 */
56int 56int
57xfs_acl_vhasacl_access( 57xfs_acl_vhasacl_access(
58 bhv_vnode_t *vp) 58 struct inode *vp)
59{ 59{
60 int error; 60 int error;
61 61
@@ -68,7 +68,7 @@ xfs_acl_vhasacl_access(
68 */ 68 */
69int 69int
70xfs_acl_vhasacl_default( 70xfs_acl_vhasacl_default(
71 bhv_vnode_t *vp) 71 struct inode *vp)
72{ 72{
73 int error; 73 int error;
74 74
@@ -207,7 +207,7 @@ posix_acl_xfs_to_xattr(
207 207
208int 208int
209xfs_acl_vget( 209xfs_acl_vget(
210 bhv_vnode_t *vp, 210 struct inode *vp,
211 void *acl, 211 void *acl,
212 size_t size, 212 size_t size,
213 int kind) 213 int kind)
@@ -217,7 +217,6 @@ xfs_acl_vget(
217 posix_acl_xattr_header *ext_acl = acl; 217 posix_acl_xattr_header *ext_acl = acl;
218 int flags = 0; 218 int flags = 0;
219 219
220 VN_HOLD(vp);
221 if(size) { 220 if(size) {
222 if (!(_ACL_ALLOC(xfs_acl))) { 221 if (!(_ACL_ALLOC(xfs_acl))) {
223 error = ENOMEM; 222 error = ENOMEM;
@@ -239,11 +238,10 @@ xfs_acl_vget(
239 goto out; 238 goto out;
240 } 239 }
241 if (kind == _ACL_TYPE_ACCESS) 240 if (kind == _ACL_TYPE_ACCESS)
242 xfs_acl_sync_mode(xfs_vtoi(vp)->i_d.di_mode, xfs_acl); 241 xfs_acl_sync_mode(XFS_I(vp)->i_d.di_mode, xfs_acl);
243 error = -posix_acl_xfs_to_xattr(xfs_acl, ext_acl, size); 242 error = -posix_acl_xfs_to_xattr(xfs_acl, ext_acl, size);
244 } 243 }
245out: 244out:
246 VN_RELE(vp);
247 if(xfs_acl) 245 if(xfs_acl)
248 _ACL_FREE(xfs_acl); 246 _ACL_FREE(xfs_acl);
249 return -error; 247 return -error;
@@ -251,28 +249,26 @@ out:
251 249
252int 250int
253xfs_acl_vremove( 251xfs_acl_vremove(
254 bhv_vnode_t *vp, 252 struct inode *vp,
255 int kind) 253 int kind)
256{ 254{
257 int error; 255 int error;
258 256
259 VN_HOLD(vp);
260 error = xfs_acl_allow_set(vp, kind); 257 error = xfs_acl_allow_set(vp, kind);
261 if (!error) { 258 if (!error) {
262 error = xfs_attr_remove(xfs_vtoi(vp), 259 error = xfs_attr_remove(XFS_I(vp),
263 kind == _ACL_TYPE_DEFAULT? 260 kind == _ACL_TYPE_DEFAULT?
264 SGI_ACL_DEFAULT: SGI_ACL_FILE, 261 SGI_ACL_DEFAULT: SGI_ACL_FILE,
265 ATTR_ROOT); 262 ATTR_ROOT);
266 if (error == ENOATTR) 263 if (error == ENOATTR)
267 error = 0; /* 'scool */ 264 error = 0; /* 'scool */
268 } 265 }
269 VN_RELE(vp);
270 return -error; 266 return -error;
271} 267}
272 268
273int 269int
274xfs_acl_vset( 270xfs_acl_vset(
275 bhv_vnode_t *vp, 271 struct inode *vp,
276 void *acl, 272 void *acl,
277 size_t size, 273 size_t size,
278 int kind) 274 int kind)
@@ -298,7 +294,6 @@ xfs_acl_vset(
298 return 0; 294 return 0;
299 } 295 }
300 296
301 VN_HOLD(vp);
302 error = xfs_acl_allow_set(vp, kind); 297 error = xfs_acl_allow_set(vp, kind);
303 298
304 /* Incoming ACL exists, set file mode based on its value */ 299 /* Incoming ACL exists, set file mode based on its value */
@@ -321,7 +316,6 @@ xfs_acl_vset(
321 } 316 }
322 317
323out: 318out:
324 VN_RELE(vp);
325 _ACL_FREE(xfs_acl); 319 _ACL_FREE(xfs_acl);
326 return -error; 320 return -error;
327} 321}
@@ -363,7 +357,7 @@ xfs_acl_iaccess(
363 357
364STATIC int 358STATIC int
365xfs_acl_allow_set( 359xfs_acl_allow_set(
366 bhv_vnode_t *vp, 360 struct inode *vp,
367 int kind) 361 int kind)
368{ 362{
369 if (vp->i_flags & (S_IMMUTABLE|S_APPEND)) 363 if (vp->i_flags & (S_IMMUTABLE|S_APPEND))
@@ -372,7 +366,7 @@ xfs_acl_allow_set(
372 return ENOTDIR; 366 return ENOTDIR;
373 if (vp->i_sb->s_flags & MS_RDONLY) 367 if (vp->i_sb->s_flags & MS_RDONLY)
374 return EROFS; 368 return EROFS;
375 if (xfs_vtoi(vp)->i_d.di_uid != current->fsuid && !capable(CAP_FOWNER)) 369 if (XFS_I(vp)->i_d.di_uid != current->fsuid && !capable(CAP_FOWNER))
376 return EPERM; 370 return EPERM;
377 return 0; 371 return 0;
378} 372}
@@ -566,7 +560,7 @@ xfs_acl_get_endian(
566 */ 560 */
567STATIC void 561STATIC void
568xfs_acl_get_attr( 562xfs_acl_get_attr(
569 bhv_vnode_t *vp, 563 struct inode *vp,
570 xfs_acl_t *aclp, 564 xfs_acl_t *aclp,
571 int kind, 565 int kind,
572 int flags, 566 int flags,
@@ -576,7 +570,7 @@ xfs_acl_get_attr(
576 570
577 ASSERT((flags & ATTR_KERNOVAL) ? (aclp == NULL) : 1); 571 ASSERT((flags & ATTR_KERNOVAL) ? (aclp == NULL) : 1);
578 flags |= ATTR_ROOT; 572 flags |= ATTR_ROOT;
579 *error = xfs_attr_get(xfs_vtoi(vp), 573 *error = xfs_attr_get(XFS_I(vp),
580 kind == _ACL_TYPE_ACCESS ? 574 kind == _ACL_TYPE_ACCESS ?
581 SGI_ACL_FILE : SGI_ACL_DEFAULT, 575 SGI_ACL_FILE : SGI_ACL_DEFAULT,
582 (char *)aclp, &len, flags); 576 (char *)aclp, &len, flags);
@@ -590,7 +584,7 @@ xfs_acl_get_attr(
590 */ 584 */
591STATIC void 585STATIC void
592xfs_acl_set_attr( 586xfs_acl_set_attr(
593 bhv_vnode_t *vp, 587 struct inode *vp,
594 xfs_acl_t *aclp, 588 xfs_acl_t *aclp,
595 int kind, 589 int kind,
596 int *error) 590 int *error)
@@ -615,7 +609,7 @@ xfs_acl_set_attr(
615 INT_SET(newace->ae_perm, ARCH_CONVERT, ace->ae_perm); 609 INT_SET(newace->ae_perm, ARCH_CONVERT, ace->ae_perm);
616 } 610 }
617 INT_SET(newacl->acl_cnt, ARCH_CONVERT, aclp->acl_cnt); 611 INT_SET(newacl->acl_cnt, ARCH_CONVERT, aclp->acl_cnt);
618 *error = xfs_attr_set(xfs_vtoi(vp), 612 *error = xfs_attr_set(XFS_I(vp),
619 kind == _ACL_TYPE_ACCESS ? 613 kind == _ACL_TYPE_ACCESS ?
620 SGI_ACL_FILE: SGI_ACL_DEFAULT, 614 SGI_ACL_FILE: SGI_ACL_DEFAULT,
621 (char *)newacl, len, ATTR_ROOT); 615 (char *)newacl, len, ATTR_ROOT);
@@ -624,7 +618,7 @@ xfs_acl_set_attr(
624 618
625int 619int
626xfs_acl_vtoacl( 620xfs_acl_vtoacl(
627 bhv_vnode_t *vp, 621 struct inode *vp,
628 xfs_acl_t *access_acl, 622 xfs_acl_t *access_acl,
629 xfs_acl_t *default_acl) 623 xfs_acl_t *default_acl)
630{ 624{
@@ -639,7 +633,7 @@ xfs_acl_vtoacl(
639 if (error) 633 if (error)
640 access_acl->acl_cnt = XFS_ACL_NOT_PRESENT; 634 access_acl->acl_cnt = XFS_ACL_NOT_PRESENT;
641 else /* We have a good ACL and the file mode, synchronize. */ 635 else /* We have a good ACL and the file mode, synchronize. */
642 xfs_acl_sync_mode(xfs_vtoi(vp)->i_d.di_mode, access_acl); 636 xfs_acl_sync_mode(XFS_I(vp)->i_d.di_mode, access_acl);
643 } 637 }
644 638
645 if (default_acl) { 639 if (default_acl) {
@@ -656,7 +650,7 @@ xfs_acl_vtoacl(
656 */ 650 */
657int 651int
658xfs_acl_inherit( 652xfs_acl_inherit(
659 bhv_vnode_t *vp, 653 struct inode *vp,
660 mode_t mode, 654 mode_t mode,
661 xfs_acl_t *pdaclp) 655 xfs_acl_t *pdaclp)
662{ 656{
@@ -715,7 +709,7 @@ out_error:
715 */ 709 */
716STATIC int 710STATIC int
717xfs_acl_setmode( 711xfs_acl_setmode(
718 bhv_vnode_t *vp, 712 struct inode *vp,
719 xfs_acl_t *acl, 713 xfs_acl_t *acl,
720 int *basicperms) 714 int *basicperms)
721{ 715{
@@ -734,7 +728,7 @@ xfs_acl_setmode(
734 * mode. The m:: bits take precedence over the g:: bits. 728 * mode. The m:: bits take precedence over the g:: bits.
735 */ 729 */
736 iattr.ia_valid = ATTR_MODE; 730 iattr.ia_valid = ATTR_MODE;
737 iattr.ia_mode = xfs_vtoi(vp)->i_d.di_mode; 731 iattr.ia_mode = XFS_I(vp)->i_d.di_mode;
738 iattr.ia_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO); 732 iattr.ia_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO);
739 ap = acl->acl_entry; 733 ap = acl->acl_entry;
740 for (i = 0; i < acl->acl_cnt; ++i) { 734 for (i = 0; i < acl->acl_cnt; ++i) {
@@ -764,7 +758,7 @@ xfs_acl_setmode(
764 if (gap && nomask) 758 if (gap && nomask)
765 iattr.ia_mode |= gap->ae_perm << 3; 759 iattr.ia_mode |= gap->ae_perm << 3;
766 760
767 return xfs_setattr(xfs_vtoi(vp), &iattr, 0, sys_cred); 761 return xfs_setattr(XFS_I(vp), &iattr, 0, sys_cred);
768} 762}
769 763
770/* 764/*
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 323ee94cf831..a4e293b93efa 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -59,14 +59,14 @@ extern struct kmem_zone *xfs_acl_zone;
59 (zone) = kmem_zone_init(sizeof(xfs_acl_t), (name)) 59 (zone) = kmem_zone_init(sizeof(xfs_acl_t), (name))
60#define xfs_acl_zone_destroy(zone) kmem_zone_destroy(zone) 60#define xfs_acl_zone_destroy(zone) kmem_zone_destroy(zone)
61 61
62extern int xfs_acl_inherit(bhv_vnode_t *, mode_t mode, xfs_acl_t *); 62extern int xfs_acl_inherit(struct inode *, mode_t mode, xfs_acl_t *);
63extern int xfs_acl_iaccess(struct xfs_inode *, mode_t, cred_t *); 63extern int xfs_acl_iaccess(struct xfs_inode *, mode_t, cred_t *);
64extern int xfs_acl_vtoacl(bhv_vnode_t *, xfs_acl_t *, xfs_acl_t *); 64extern int xfs_acl_vtoacl(struct inode *, xfs_acl_t *, xfs_acl_t *);
65extern int xfs_acl_vhasacl_access(bhv_vnode_t *); 65extern int xfs_acl_vhasacl_access(struct inode *);
66extern int xfs_acl_vhasacl_default(bhv_vnode_t *); 66extern int xfs_acl_vhasacl_default(struct inode *);
67extern int xfs_acl_vset(bhv_vnode_t *, void *, size_t, int); 67extern int xfs_acl_vset(struct inode *, void *, size_t, int);
68extern int xfs_acl_vget(bhv_vnode_t *, void *, size_t, int); 68extern int xfs_acl_vget(struct inode *, void *, size_t, int);
69extern int xfs_acl_vremove(bhv_vnode_t *, int); 69extern int xfs_acl_vremove(struct inode *, int);
70 70
71#define _ACL_PERM_INVALID(perm) ((perm) & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE)) 71#define _ACL_PERM_INVALID(perm) ((perm) & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE))
72 72
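
The bhv_vnode_t to struct inode conversion above works because the XFS inode is reachable straight from the Linux inode. A minimal sketch of the XFS_I() mapping the new prototypes rely on, assuming the i_private linkage set up in the xfs_iget.c hunk further down (illustrative, not the patch itself):

	static inline struct xfs_inode *
	XFS_I(struct inode *inode)
	{
		/* the Linux inode carries the XFS inode in i_private */
		return inode->i_private;
	}

With that in place, every xfs_vtoi(vp) caller becomes an XFS_I(vp) caller with no behavioural change.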
diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h
index f9472a2076d4..0b3b5efe848c 100644
--- a/fs/xfs/xfs_arch.h
+++ b/fs/xfs/xfs_arch.h
@@ -92,16 +92,6 @@
92 ((__u8*)(pointer))[1] = (((value) ) & 0xff); \ 92 ((__u8*)(pointer))[1] = (((value) ) & 0xff); \
93 } 93 }
94 94
95/* define generic INT_ macros */
96
97#define INT_GET(reference,arch) \
98 (((arch) == ARCH_NOCONVERT) \
99 ? \
100 (reference) \
101 : \
102 INT_SWAP((reference),(reference)) \
103 )
104
105/* does not return a value */ 95/* does not return a value */
106#define INT_SET(reference,arch,valueref) \ 96#define INT_SET(reference,arch,valueref) \
107 (__builtin_constant_p(valueref) ? \ 97 (__builtin_constant_p(valueref) ? \
@@ -112,64 +102,6 @@
112 ) \ 102 ) \
113 ) 103 )
114 104
115/* does not return a value */
116#define INT_MOD_EXPR(reference,arch,code) \
117 (((arch) == ARCH_NOCONVERT) \
118 ? \
119 (void)((reference) code) \
120 : \
121 (void)( \
122 (reference) = INT_GET((reference),arch) , \
123 ((reference) code), \
124 INT_SET(reference, arch, reference) \
125 ) \
126 )
127
128/* does not return a value */
129#define INT_MOD(reference,arch,delta) \
130 (void)( \
131 INT_MOD_EXPR(reference,arch,+=(delta)) \
132 )
133
134/*
135 * INT_COPY - copy a value between two locations with the
136 * _same architecture_ but _potentially different sizes_
137 *
138 * if the types of the two parameters are equal or they are
139 * in native architecture, a simple copy is done
140 *
141 * otherwise, architecture conversions are done
142 *
143 */
144
145/* does not return a value */
146#define INT_COPY(dst,src,arch) \
147 ( \
148 ((sizeof(dst) == sizeof(src)) || ((arch) == ARCH_NOCONVERT)) \
149 ? \
150 (void)((dst) = (src)) \
151 : \
152 INT_SET(dst, arch, INT_GET(src, arch)) \
153 )
154
155/*
156 * INT_XLATE - copy a value in either direction between two locations
157 * with different architectures
158 *
159 * dir < 0 - copy from memory to buffer (native to arch)
160 * dir > 0 - copy from buffer to memory (arch to native)
161 */
162
163/* does not return a value */
164#define INT_XLATE(buf,mem,dir,arch) {\
165 ASSERT(dir); \
166 if (dir>0) { \
167 (mem)=INT_GET(buf, arch); \
168 } else { \
169 INT_SET(buf, arch, mem); \
170 } \
171}
172
173/* 105/*
174 * In directories inode numbers are stored as unaligned arrays of unsigned 106 * In directories inode numbers are stored as unaligned arrays of unsigned
175 * 8bit integers on disk. 107 * 8bit integers on disk.
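
The generic INT_GET/INT_MOD/INT_COPY/INT_XLATE macros removed above had no remaining users; INT_SET stays because the ACL code earlier in this series still calls it. New code spells the same conversions with the be*_to_cpu helpers, as the other hunks here do. A rough sketch of the equivalence for a big-endian on-disk field (hypothetical helper names, not from the patch):

	#include <asm/byteorder.h>

	/* what INT_GET(x, ARCH_CONVERT) boiled down to on disk-order data */
	static inline __u32 int_get_be32(const __be32 *p)
	{
		return be32_to_cpu(*p);
	}

	/* and the INT_SET direction */
	static inline void int_set_be32(__be32 *p, __u32 v)
	{
		*p = cpu_to_be32(v);
	}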
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 78de80e3caa2..f7cdc28aff41 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -194,6 +194,46 @@ xfs_attr_get(
194 return(error); 194 return(error);
195} 195}
196 196
197/*
 198 * Calculate how many blocks we need for the new attribute.
199 */
200int
201xfs_attr_calc_size(
202 struct xfs_inode *ip,
203 int namelen,
204 int valuelen,
205 int *local)
206{
207 struct xfs_mount *mp = ip->i_mount;
208 int size;
209 int nblks;
210
211 /*
212 * Determine space new attribute will use, and if it would be
213 * "local" or "remote" (note: local != inline).
214 */
215 size = xfs_attr_leaf_newentsize(namelen, valuelen,
216 mp->m_sb.sb_blocksize, local);
217
218 nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
219 if (*local) {
220 if (size > (mp->m_sb.sb_blocksize >> 1)) {
221 /* Double split possible */
222 nblks *= 2;
223 }
224 } else {
225 /*
226 * Out of line attribute, cannot double split, but
227 * make room for the attribute value itself.
228 */
229 uint dblocks = XFS_B_TO_FSB(mp, valuelen);
230 nblks += dblocks;
231 nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
232 }
233
234 return nblks;
235}
236
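
The helper added above feeds the transaction reservation directly. A sketch of the intended call pattern, using the names from the xfs_attr_set_int() hunks below (fragment, not a complete function):

	int local;

	/* blocks needed for the new attribute, local (in-leaf) or remote */
	args.total = xfs_attr_calc_size(dp, name->len, valuelen, &local);

	error = xfs_trans_reserve(args.trans, args.total,
				  XFS_ATTRSET_LOG_RES(mp, args.total), 0,
				  XFS_TRANS_PERM_LOG_RES, XFS_ATTRSET_LOG_COUNT);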
197STATIC int 237STATIC int
198xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name, 238xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
199 char *value, int valuelen, int flags) 239 char *value, int valuelen, int flags)
@@ -202,10 +242,9 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
202 xfs_fsblock_t firstblock; 242 xfs_fsblock_t firstblock;
203 xfs_bmap_free_t flist; 243 xfs_bmap_free_t flist;
204 int error, err2, committed; 244 int error, err2, committed;
205 int local, size;
206 uint nblks;
207 xfs_mount_t *mp = dp->i_mount; 245 xfs_mount_t *mp = dp->i_mount;
208 int rsvd = (flags & ATTR_ROOT) != 0; 246 int rsvd = (flags & ATTR_ROOT) != 0;
247 int local;
209 248
210 /* 249 /*
211 * Attach the dquots to the inode. 250 * Attach the dquots to the inode.
@@ -241,30 +280,8 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
241 args.whichfork = XFS_ATTR_FORK; 280 args.whichfork = XFS_ATTR_FORK;
242 args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; 281 args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
243 282
244 /*
245 * Determine space new attribute will use, and if it would be
246 * "local" or "remote" (note: local != inline).
247 */
248 size = xfs_attr_leaf_newentsize(name->len, valuelen,
249 mp->m_sb.sb_blocksize, &local);
250
251 nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
252 if (local) {
253 if (size > (mp->m_sb.sb_blocksize >> 1)) {
254 /* Double split possible */
255 nblks <<= 1;
256 }
257 } else {
258 uint dblocks = XFS_B_TO_FSB(mp, valuelen);
259 /* Out of line attribute, cannot double split, but make
260 * room for the attribute value itself.
261 */
262 nblks += dblocks;
263 nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
264 }
265
266 /* Size is now blocks for attribute data */ 283 /* Size is now blocks for attribute data */
267 args.total = nblks; 284 args.total = xfs_attr_calc_size(dp, name->len, valuelen, &local);
268 285
269 /* 286 /*
270 * Start our first transaction of the day. 287 * Start our first transaction of the day.
@@ -286,18 +303,17 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
286 if (rsvd) 303 if (rsvd)
287 args.trans->t_flags |= XFS_TRANS_RESERVE; 304 args.trans->t_flags |= XFS_TRANS_RESERVE;
288 305
289 if ((error = xfs_trans_reserve(args.trans, (uint) nblks, 306 if ((error = xfs_trans_reserve(args.trans, args.total,
290 XFS_ATTRSET_LOG_RES(mp, nblks), 307 XFS_ATTRSET_LOG_RES(mp, args.total), 0,
291 0, XFS_TRANS_PERM_LOG_RES, 308 XFS_TRANS_PERM_LOG_RES, XFS_ATTRSET_LOG_COUNT))) {
292 XFS_ATTRSET_LOG_COUNT))) {
293 xfs_trans_cancel(args.trans, 0); 309 xfs_trans_cancel(args.trans, 0);
294 return(error); 310 return(error);
295 } 311 }
296 xfs_ilock(dp, XFS_ILOCK_EXCL); 312 xfs_ilock(dp, XFS_ILOCK_EXCL);
297 313
298 error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, args.trans, dp, nblks, 0, 314 error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, args.trans, dp, args.total, 0,
299 rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : 315 rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
300 XFS_QMOPT_RES_REGBLKS); 316 XFS_QMOPT_RES_REGBLKS);
301 if (error) { 317 if (error) {
302 xfs_iunlock(dp, XFS_ILOCK_EXCL); 318 xfs_iunlock(dp, XFS_ILOCK_EXCL);
303 xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES); 319 xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES);
@@ -384,7 +400,9 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
384 * Commit the leaf transformation. We'll need another (linked) 400 * Commit the leaf transformation. We'll need another (linked)
385 * transaction to add the new attribute to the leaf. 401 * transaction to add the new attribute to the leaf.
386 */ 402 */
387 if ((error = xfs_attr_rolltrans(&args.trans, dp))) 403
404 error = xfs_trans_roll(&args.trans, dp);
405 if (error)
388 goto out; 406 goto out;
389 407
390 } 408 }
@@ -964,7 +982,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
964 * Commit the current trans (including the inode) and start 982 * Commit the current trans (including the inode) and start
965 * a new one. 983 * a new one.
966 */ 984 */
967 if ((error = xfs_attr_rolltrans(&args->trans, dp))) 985 error = xfs_trans_roll(&args->trans, dp);
986 if (error)
968 return (error); 987 return (error);
969 988
970 /* 989 /*
@@ -978,7 +997,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
978 * Commit the transaction that added the attr name so that 997 * Commit the transaction that added the attr name so that
979 * later routines can manage their own transactions. 998 * later routines can manage their own transactions.
980 */ 999 */
981 if ((error = xfs_attr_rolltrans(&args->trans, dp))) 1000 error = xfs_trans_roll(&args->trans, dp);
1001 if (error)
982 return (error); 1002 return (error);
983 1003
984 /* 1004 /*
@@ -1067,7 +1087,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
1067 /* 1087 /*
1068 * Commit the remove and start the next trans in series. 1088 * Commit the remove and start the next trans in series.
1069 */ 1089 */
1070 error = xfs_attr_rolltrans(&args->trans, dp); 1090 error = xfs_trans_roll(&args->trans, dp);
1071 1091
1072 } else if (args->rmtblkno > 0) { 1092 } else if (args->rmtblkno > 0) {
1073 /* 1093 /*
@@ -1298,7 +1318,8 @@ restart:
1298 * Commit the node conversion and start the next 1318 * Commit the node conversion and start the next
1299 * trans in the chain. 1319 * trans in the chain.
1300 */ 1320 */
1301 if ((error = xfs_attr_rolltrans(&args->trans, dp))) 1321 error = xfs_trans_roll(&args->trans, dp);
1322 if (error)
1302 goto out; 1323 goto out;
1303 1324
1304 goto restart; 1325 goto restart;
@@ -1349,7 +1370,8 @@ restart:
1349 * Commit the leaf addition or btree split and start the next 1370 * Commit the leaf addition or btree split and start the next
1350 * trans in the chain. 1371 * trans in the chain.
1351 */ 1372 */
1352 if ((error = xfs_attr_rolltrans(&args->trans, dp))) 1373 error = xfs_trans_roll(&args->trans, dp);
1374 if (error)
1353 goto out; 1375 goto out;
1354 1376
1355 /* 1377 /*
@@ -1449,7 +1471,8 @@ restart:
1449 /* 1471 /*
1450 * Commit and start the next trans in the chain. 1472 * Commit and start the next trans in the chain.
1451 */ 1473 */
1452 if ((error = xfs_attr_rolltrans(&args->trans, dp))) 1474 error = xfs_trans_roll(&args->trans, dp);
1475 if (error)
1453 goto out; 1476 goto out;
1454 1477
1455 } else if (args->rmtblkno > 0) { 1478 } else if (args->rmtblkno > 0) {
@@ -1581,7 +1604,8 @@ xfs_attr_node_removename(xfs_da_args_t *args)
1581 /* 1604 /*
1582 * Commit the Btree join operation and start a new trans. 1605 * Commit the Btree join operation and start a new trans.
1583 */ 1606 */
1584 if ((error = xfs_attr_rolltrans(&args->trans, dp))) 1607 error = xfs_trans_roll(&args->trans, dp);
1608 if (error)
1585 goto out; 1609 goto out;
1586 } 1610 }
1587 1611
@@ -2082,7 +2106,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2082 /* 2106 /*
2083 * Start the next trans in the chain. 2107 * Start the next trans in the chain.
2084 */ 2108 */
2085 if ((error = xfs_attr_rolltrans(&args->trans, dp))) 2109 error = xfs_trans_roll(&args->trans, dp);
2110 if (error)
2086 return (error); 2111 return (error);
2087 } 2112 }
2088 2113
@@ -2232,7 +2257,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
2232 /* 2257 /*
2233 * Close out trans and start the next one in the chain. 2258 * Close out trans and start the next one in the chain.
2234 */ 2259 */
2235 if ((error = xfs_attr_rolltrans(&args->trans, args->dp))) 2260 error = xfs_trans_roll(&args->trans, args->dp);
2261 if (error)
2236 return (error); 2262 return (error);
2237 } 2263 }
2238 return(0); 2264 return(0);
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index 8b2d31c19e4d..fb3b2a68b9b9 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -129,6 +129,7 @@ typedef struct xfs_attr_list_context {
129/* 129/*
130 * Overall external interface routines. 130 * Overall external interface routines.
131 */ 131 */
132int xfs_attr_calc_size(struct xfs_inode *, int, int, int *);
132int xfs_attr_inactive(struct xfs_inode *dp); 133int xfs_attr_inactive(struct xfs_inode *dp);
133int xfs_attr_fetch(struct xfs_inode *, struct xfs_name *, char *, int *, int); 134int xfs_attr_fetch(struct xfs_inode *, struct xfs_name *, char *, int *, int);
134int xfs_attr_rmtval_get(struct xfs_da_args *args); 135int xfs_attr_rmtval_get(struct xfs_da_args *args);
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 23ef5d7c87e1..79da6b2ea99e 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -2498,9 +2498,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
2498 /* 2498 /*
2499 * Commit the flag value change and start the next trans in series. 2499 * Commit the flag value change and start the next trans in series.
2500 */ 2500 */
2501 error = xfs_attr_rolltrans(&args->trans, args->dp); 2501 return xfs_trans_roll(&args->trans, args->dp);
2502
2503 return(error);
2504} 2502}
2505 2503
2506/* 2504/*
@@ -2547,9 +2545,7 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args)
2547 /* 2545 /*
2548 * Commit the flag value change and start the next trans in series. 2546 * Commit the flag value change and start the next trans in series.
2549 */ 2547 */
2550 error = xfs_attr_rolltrans(&args->trans, args->dp); 2548 return xfs_trans_roll(&args->trans, args->dp);
2551
2552 return(error);
2553} 2549}
2554 2550
2555/* 2551/*
@@ -2665,7 +2661,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
2665 /* 2661 /*
2666 * Commit the flag value change and start the next trans in series. 2662 * Commit the flag value change and start the next trans in series.
2667 */ 2663 */
2668 error = xfs_attr_rolltrans(&args->trans, args->dp); 2664 error = xfs_trans_roll(&args->trans, args->dp);
2669 2665
2670 return(error); 2666 return(error);
2671} 2667}
@@ -2723,7 +2719,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
2723 /* 2719 /*
2724 * Commit the invalidate and start the next transaction. 2720 * Commit the invalidate and start the next transaction.
2725 */ 2721 */
2726 error = xfs_attr_rolltrans(trans, dp); 2722 error = xfs_trans_roll(trans, dp);
2727 2723
2728 return (error); 2724 return (error);
2729} 2725}
@@ -2825,7 +2821,8 @@ xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp,
2825 /* 2821 /*
2826 * Atomically commit the whole invalidate stuff. 2822 * Atomically commit the whole invalidate stuff.
2827 */ 2823 */
2828 if ((error = xfs_attr_rolltrans(trans, dp))) 2824 error = xfs_trans_roll(trans, dp);
2825 if (error)
2829 return (error); 2826 return (error);
2830 } 2827 }
2831 2828
@@ -2964,7 +2961,8 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
2964 /* 2961 /*
2965 * Roll to next transaction. 2962 * Roll to next transaction.
2966 */ 2963 */
2967 if ((error = xfs_attr_rolltrans(trans, dp))) 2964 error = xfs_trans_roll(trans, dp);
2965 if (error)
2968 return (error); 2966 return (error);
2969 } 2967 }
2970 2968
@@ -2974,60 +2972,3 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
2974 2972
2975 return(0); 2973 return(0);
2976} 2974}
2977
2978
2979/*
2980 * Roll from one trans in the sequence of PERMANENT transactions to the next.
2981 */
2982int
2983xfs_attr_rolltrans(xfs_trans_t **transp, xfs_inode_t *dp)
2984{
2985 xfs_trans_t *trans;
2986 unsigned int logres, count;
2987 int error;
2988
2989 /*
2990 * Ensure that the inode is always logged.
2991 */
2992 trans = *transp;
2993 xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
2994
2995 /*
2996 * Copy the critical parameters from one trans to the next.
2997 */
2998 logres = trans->t_log_res;
2999 count = trans->t_log_count;
3000 *transp = xfs_trans_dup(trans);
3001
3002 /*
3003 * Commit the current transaction.
3004 * If this commit failed, then it'd just unlock those items that
3005 * are not marked ihold. That also means that a filesystem shutdown
3006 * is in progress. The caller takes the responsibility to cancel
3007 * the duplicate transaction that gets returned.
3008 */
3009 if ((error = xfs_trans_commit(trans, 0)))
3010 return (error);
3011
3012 trans = *transp;
3013
3014 /*
3015 * Reserve space in the log for the next transaction.
3016 * This also pushes items in the "AIL", the list of logged items,
3017 * out to disk if they are taking up space at the tail of the log
3018 * that we want to use. This requires that either nothing be locked
3019 * across this call, or that anything that is locked be logged in
3020 * the prior and the next transactions.
3021 */
3022 error = xfs_trans_reserve(trans, 0, logres, 0,
3023 XFS_TRANS_PERM_LOG_RES, count);
3024 /*
3025 * Ensure that the inode is in the new transaction and locked.
3026 */
3027 if (!error) {
3028 xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
3029 xfs_trans_ihold(trans, dp);
3030 }
3031 return (error);
3032
3033}
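
Every xfs_attr_rolltrans() call site in this file now goes through xfs_trans_roll(), and the removed body above is the reference for what the generic helper must do. A condensed sketch, assuming xfs_trans_roll() keeps the same duplicate-commit-reserve cycle (illustrative, not the actual implementation):

	int
	xfs_trans_roll_sketch(struct xfs_trans **tpp, struct xfs_inode *dp)
	{
		struct xfs_trans	*trans = *tpp;
		unsigned int		logres = trans->t_log_res;
		unsigned int		count = trans->t_log_count;
		int			error;

		/* keep the inode logged, duplicate, then commit the old trans */
		xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
		*tpp = xfs_trans_dup(trans);
		error = xfs_trans_commit(trans, 0);
		if (error)
			return error;

		/* reserve log space for the duplicate and rejoin the inode */
		error = xfs_trans_reserve(*tpp, 0, logres, 0,
					  XFS_TRANS_PERM_LOG_RES, count);
		if (!error) {
			xfs_trans_ijoin(*tpp, dp, XFS_ILOCK_EXCL);
			xfs_trans_ihold(*tpp, dp);
		}
		return error;
	}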
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index 5ecf437b7825..83e9af417ca2 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -274,6 +274,4 @@ int xfs_attr_leaf_order(struct xfs_dabuf *leaf1_bp,
274 struct xfs_dabuf *leaf2_bp); 274 struct xfs_dabuf *leaf2_bp);
275int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, 275int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize,
276 int *local); 276 int *local);
277int xfs_attr_rolltrans(struct xfs_trans **transp, struct xfs_inode *dp);
278
279#endif /* __XFS_ATTR_LEAF_H__ */ 277#endif /* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/xfs_bit.c
index fab0b6d5a41b..48228848f5ae 100644
--- a/fs/xfs/xfs_bit.c
+++ b/fs/xfs/xfs_bit.c
@@ -25,109 +25,6 @@
25 * XFS bit manipulation routines, used in non-realtime code. 25 * XFS bit manipulation routines, used in non-realtime code.
26 */ 26 */
27 27
28#ifndef HAVE_ARCH_HIGHBIT
29/*
30 * Index of high bit number in byte, -1 for none set, 0..7 otherwise.
31 */
32static const char xfs_highbit[256] = {
33 -1, 0, 1, 1, 2, 2, 2, 2, /* 00 .. 07 */
34 3, 3, 3, 3, 3, 3, 3, 3, /* 08 .. 0f */
35 4, 4, 4, 4, 4, 4, 4, 4, /* 10 .. 17 */
36 4, 4, 4, 4, 4, 4, 4, 4, /* 18 .. 1f */
37 5, 5, 5, 5, 5, 5, 5, 5, /* 20 .. 27 */
38 5, 5, 5, 5, 5, 5, 5, 5, /* 28 .. 2f */
39 5, 5, 5, 5, 5, 5, 5, 5, /* 30 .. 37 */
40 5, 5, 5, 5, 5, 5, 5, 5, /* 38 .. 3f */
41 6, 6, 6, 6, 6, 6, 6, 6, /* 40 .. 47 */
42 6, 6, 6, 6, 6, 6, 6, 6, /* 48 .. 4f */
43 6, 6, 6, 6, 6, 6, 6, 6, /* 50 .. 57 */
44 6, 6, 6, 6, 6, 6, 6, 6, /* 58 .. 5f */
45 6, 6, 6, 6, 6, 6, 6, 6, /* 60 .. 67 */
46 6, 6, 6, 6, 6, 6, 6, 6, /* 68 .. 6f */
47 6, 6, 6, 6, 6, 6, 6, 6, /* 70 .. 77 */
48 6, 6, 6, 6, 6, 6, 6, 6, /* 78 .. 7f */
49 7, 7, 7, 7, 7, 7, 7, 7, /* 80 .. 87 */
50 7, 7, 7, 7, 7, 7, 7, 7, /* 88 .. 8f */
51 7, 7, 7, 7, 7, 7, 7, 7, /* 90 .. 97 */
52 7, 7, 7, 7, 7, 7, 7, 7, /* 98 .. 9f */
53 7, 7, 7, 7, 7, 7, 7, 7, /* a0 .. a7 */
54 7, 7, 7, 7, 7, 7, 7, 7, /* a8 .. af */
55 7, 7, 7, 7, 7, 7, 7, 7, /* b0 .. b7 */
56 7, 7, 7, 7, 7, 7, 7, 7, /* b8 .. bf */
57 7, 7, 7, 7, 7, 7, 7, 7, /* c0 .. c7 */
58 7, 7, 7, 7, 7, 7, 7, 7, /* c8 .. cf */
59 7, 7, 7, 7, 7, 7, 7, 7, /* d0 .. d7 */
60 7, 7, 7, 7, 7, 7, 7, 7, /* d8 .. df */
61 7, 7, 7, 7, 7, 7, 7, 7, /* e0 .. e7 */
62 7, 7, 7, 7, 7, 7, 7, 7, /* e8 .. ef */
63 7, 7, 7, 7, 7, 7, 7, 7, /* f0 .. f7 */
64 7, 7, 7, 7, 7, 7, 7, 7, /* f8 .. ff */
65};
66#endif
67
68/*
69 * xfs_highbit32: get high bit set out of 32-bit argument, -1 if none set.
70 */
71inline int
72xfs_highbit32(
73 __uint32_t v)
74{
75#ifdef HAVE_ARCH_HIGHBIT
76 return highbit32(v);
77#else
78 int i;
79
80 if (v & 0xffff0000)
81 if (v & 0xff000000)
82 i = 24;
83 else
84 i = 16;
85 else if (v & 0x0000ffff)
86 if (v & 0x0000ff00)
87 i = 8;
88 else
89 i = 0;
90 else
91 return -1;
92 return i + xfs_highbit[(v >> i) & 0xff];
93#endif
94}
95
96/*
97 * xfs_lowbit64: get low bit set out of 64-bit argument, -1 if none set.
98 */
99int
100xfs_lowbit64(
101 __uint64_t v)
102{
103 __uint32_t w = (__uint32_t)v;
104 int n = 0;
105
106 if (w) { /* lower bits */
107 n = ffs(w);
108 } else { /* upper bits */
109 w = (__uint32_t)(v >> 32);
110 if (w && (n = ffs(w)))
111 n += 32;
112 }
113 return n - 1;
114}
115
116/*
117 * xfs_highbit64: get high bit set out of 64-bit argument, -1 if none set.
118 */
119int
120xfs_highbit64(
121 __uint64_t v)
122{
123 __uint32_t h = (__uint32_t)(v >> 32);
124
125 if (h)
126 return xfs_highbit32(h) + 32;
127 return xfs_highbit32((__uint32_t)v);
128}
129
130
131/* 28/*
132 * Return whether bitmap is empty. 29 * Return whether bitmap is empty.
133 * Size is number of words in the bitmap, which is padded to word boundary 30 * Size is number of words in the bitmap, which is padded to word boundary
diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h
index 082641a9782c..8e0e463dae2d 100644
--- a/fs/xfs/xfs_bit.h
+++ b/fs/xfs/xfs_bit.h
@@ -47,13 +47,39 @@ static inline __uint64_t xfs_mask64lo(int n)
47} 47}
48 48
49/* Get high bit set out of 32-bit argument, -1 if none set */ 49/* Get high bit set out of 32-bit argument, -1 if none set */
50extern int xfs_highbit32(__uint32_t v); 50static inline int xfs_highbit32(__uint32_t v)
51{
52 return fls(v) - 1;
53}
54
55/* Get high bit set out of 64-bit argument, -1 if none set */
56static inline int xfs_highbit64(__uint64_t v)
57{
58 return fls64(v) - 1;
59}
60
61/* Get low bit set out of 32-bit argument, -1 if none set */
62static inline int xfs_lowbit32(__uint32_t v)
63{
64 unsigned long t = v;
65 return (v) ? find_first_bit(&t, 32) : -1;
66}
51 67
52/* Get low bit set out of 64-bit argument, -1 if none set */ 68/* Get low bit set out of 64-bit argument, -1 if none set */
53extern int xfs_lowbit64(__uint64_t v); 69static inline int xfs_lowbit64(__uint64_t v)
70{
71 __uint32_t w = (__uint32_t)v;
72 int n = 0;
54 73
55/* Get high bit set out of 64-bit argument, -1 if none set */ 74 if (w) { /* lower bits */
56extern int xfs_highbit64(__uint64_t); 75 n = ffs(w);
76 } else { /* upper bits */
77 w = (__uint32_t)(v >> 32);
78 if (w && (n = ffs(w)))
79 n += 32;
80 }
81 return n - 1;
82}
57 83
58/* Return whether bitmap is empty (1 == empty) */ 84/* Return whether bitmap is empty (1 == empty) */
59extern int xfs_bitmap_empty(uint *map, uint size); 85extern int xfs_bitmap_empty(uint *map, uint size);
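
The table-driven xfs_highbit32() gives way to the kernel's fls()/fls64()/ffs() primitives, with -1 returned for zero input because fls(0) == 0. A few worked values for the new inlines (illustrative, userspace-style checks):

	#include <assert.h>

	assert(xfs_highbit32(0) == -1);           /* fls(0) == 0 */
	assert(xfs_highbit32(0x10) == 4);         /* fls(0x10) == 5 */
	assert(xfs_highbit64(1ULL << 40) == 40);  /* fls64 on the high word */
	assert(xfs_lowbit64(0) == -1);            /* ffs(0) == 0 */
	assert(xfs_lowbit64(1ULL << 32) == 32);   /* low word empty, upper ffs */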
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 3c4beb3a4326..a1aab9275d5a 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -384,14 +384,14 @@ xfs_bmap_count_tree(
384 int levelin, 384 int levelin,
385 int *count); 385 int *count);
386 386
387STATIC int 387STATIC void
388xfs_bmap_count_leaves( 388xfs_bmap_count_leaves(
389 xfs_ifork_t *ifp, 389 xfs_ifork_t *ifp,
390 xfs_extnum_t idx, 390 xfs_extnum_t idx,
391 int numrecs, 391 int numrecs,
392 int *count); 392 int *count);
393 393
394STATIC int 394STATIC void
395xfs_bmap_disk_count_leaves( 395xfs_bmap_disk_count_leaves(
396 xfs_extnum_t idx, 396 xfs_extnum_t idx,
397 xfs_bmbt_block_t *block, 397 xfs_bmbt_block_t *block,
@@ -4000,7 +4000,7 @@ xfs_bmap_add_attrfork(
4000 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; 4000 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
4001 } 4001 }
4002 ASSERT(ip->i_d.di_anextents == 0); 4002 ASSERT(ip->i_d.di_anextents == 0);
4003 VN_HOLD(XFS_ITOV(ip)); 4003 IHOLD(ip);
4004 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 4004 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4005 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 4005 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
4006 switch (ip->i_d.di_format) { 4006 switch (ip->i_d.di_format) {
@@ -6096,7 +6096,7 @@ xfs_bmap_get_bp(
6096 tp = cur->bc_tp; 6096 tp = cur->bc_tp;
6097 licp = &tp->t_items; 6097 licp = &tp->t_items;
6098 while (!bp && licp != NULL) { 6098 while (!bp && licp != NULL) {
6099 if (XFS_LIC_ARE_ALL_FREE(licp)) { 6099 if (xfs_lic_are_all_free(licp)) {
6100 licp = licp->lic_next; 6100 licp = licp->lic_next;
6101 continue; 6101 continue;
6102 } 6102 }
@@ -6106,11 +6106,11 @@ xfs_bmap_get_bp(
6106 xfs_buf_log_item_t *bip; 6106 xfs_buf_log_item_t *bip;
6107 xfs_buf_t *lbp; 6107 xfs_buf_t *lbp;
6108 6108
6109 if (XFS_LIC_ISFREE(licp, i)) { 6109 if (xfs_lic_isfree(licp, i)) {
6110 continue; 6110 continue;
6111 } 6111 }
6112 6112
6113 lidp = XFS_LIC_SLOT(licp, i); 6113 lidp = xfs_lic_slot(licp, i);
6114 lip = lidp->lid_item; 6114 lip = lidp->lid_item;
6115 if (lip->li_type != XFS_LI_BUF) 6115 if (lip->li_type != XFS_LI_BUF)
6116 continue; 6116 continue;
@@ -6367,13 +6367,9 @@ xfs_bmap_count_blocks(
6367 mp = ip->i_mount; 6367 mp = ip->i_mount;
6368 ifp = XFS_IFORK_PTR(ip, whichfork); 6368 ifp = XFS_IFORK_PTR(ip, whichfork);
6369 if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) { 6369 if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
6370 if (unlikely(xfs_bmap_count_leaves(ifp, 0, 6370 xfs_bmap_count_leaves(ifp, 0,
6371 ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t), 6371 ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
6372 count) < 0)) { 6372 count);
6373 XFS_ERROR_REPORT("xfs_bmap_count_blocks(1)",
6374 XFS_ERRLEVEL_LOW, mp);
6375 return XFS_ERROR(EFSCORRUPTED);
6376 }
6377 return 0; 6373 return 0;
6378 } 6374 }
6379 6375
@@ -6454,13 +6450,7 @@ xfs_bmap_count_tree(
6454 for (;;) { 6450 for (;;) {
6455 nextbno = be64_to_cpu(block->bb_rightsib); 6451 nextbno = be64_to_cpu(block->bb_rightsib);
6456 numrecs = be16_to_cpu(block->bb_numrecs); 6452 numrecs = be16_to_cpu(block->bb_numrecs);
6457 if (unlikely(xfs_bmap_disk_count_leaves(0, 6453 xfs_bmap_disk_count_leaves(0, block, numrecs, count);
6458 block, numrecs, count) < 0)) {
6459 xfs_trans_brelse(tp, bp);
6460 XFS_ERROR_REPORT("xfs_bmap_count_tree(2)",
6461 XFS_ERRLEVEL_LOW, mp);
6462 return XFS_ERROR(EFSCORRUPTED);
6463 }
6464 xfs_trans_brelse(tp, bp); 6454 xfs_trans_brelse(tp, bp);
6465 if (nextbno == NULLFSBLOCK) 6455 if (nextbno == NULLFSBLOCK)
6466 break; 6456 break;
@@ -6478,7 +6468,7 @@ xfs_bmap_count_tree(
6478/* 6468/*
6479 * Count leaf blocks given a range of extent records. 6469 * Count leaf blocks given a range of extent records.
6480 */ 6470 */
6481STATIC int 6471STATIC void
6482xfs_bmap_count_leaves( 6472xfs_bmap_count_leaves(
6483 xfs_ifork_t *ifp, 6473 xfs_ifork_t *ifp,
6484 xfs_extnum_t idx, 6474 xfs_extnum_t idx,
@@ -6491,14 +6481,13 @@ xfs_bmap_count_leaves(
6491 xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b); 6481 xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b);
6492 *count += xfs_bmbt_get_blockcount(frp); 6482 *count += xfs_bmbt_get_blockcount(frp);
6493 } 6483 }
6494 return 0;
6495} 6484}
6496 6485
6497/* 6486/*
6498 * Count leaf blocks given a range of extent records originally 6487 * Count leaf blocks given a range of extent records originally
6499 * in btree format. 6488 * in btree format.
6500 */ 6489 */
6501STATIC int 6490STATIC void
6502xfs_bmap_disk_count_leaves( 6491xfs_bmap_disk_count_leaves(
6503 xfs_extnum_t idx, 6492 xfs_extnum_t idx,
6504 xfs_bmbt_block_t *block, 6493 xfs_bmbt_block_t *block,
@@ -6512,5 +6501,4 @@ xfs_bmap_disk_count_leaves(
6512 frp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, idx + b); 6501 frp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, idx + b);
6513 *count += xfs_bmbt_disk_get_blockcount(frp); 6502 *count += xfs_bmbt_disk_get_blockcount(frp);
6514 } 6503 }
6515 return 0;
6516} 6504}
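
With the leaf counters no longer able to fail, the EFSCORRUPTED reporting at both call sites above disappears. Assembled whole, the new void helper reads roughly as below (the loop header is implied by the hunk rather than shown in it):

	STATIC void
	xfs_bmap_count_leaves(
		xfs_ifork_t		*ifp,
		xfs_extnum_t		idx,
		int			numrecs,
		int			*count)
	{
		int			b;

		/* sum the block counts of all extent records in the range */
		for (b = 0; b < numrecs; b++) {
			xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b);
			*count += xfs_bmbt_get_blockcount(frp);
		}
	}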
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index aeb87ca69fcc..cc593a84c345 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -46,38 +46,11 @@ kmem_zone_t *xfs_btree_cur_zone;
46/* 46/*
47 * Btree magic numbers. 47 * Btree magic numbers.
48 */ 48 */
49const __uint32_t xfs_magics[XFS_BTNUM_MAX] = 49const __uint32_t xfs_magics[XFS_BTNUM_MAX] = {
50{
51 XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC 50 XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC
52}; 51};
53 52
54/* 53/*
55 * Prototypes for internal routines.
56 */
57
58/*
59 * Checking routine: return maxrecs for the block.
60 */
61STATIC int /* number of records fitting in block */
62xfs_btree_maxrecs(
63 xfs_btree_cur_t *cur, /* btree cursor */
64 xfs_btree_block_t *block);/* generic btree block pointer */
65
66/*
67 * Internal routines.
68 */
69
70/*
71 * Retrieve the block pointer from the cursor at the given level.
72 * This may be a bmap btree root or from a buffer.
73 */
74STATIC xfs_btree_block_t * /* generic btree block pointer */
75xfs_btree_get_block(
76 xfs_btree_cur_t *cur, /* btree cursor */
77 int level, /* level in btree */
78 struct xfs_buf **bpp); /* buffer containing the block */
79
80/*
81 * Checking routine: return maxrecs for the block. 54 * Checking routine: return maxrecs for the block.
82 */ 55 */
83STATIC int /* number of records fitting in block */ 56STATIC int /* number of records fitting in block */
@@ -457,35 +430,6 @@ xfs_btree_dup_cursor(
457} 430}
458 431
459/* 432/*
460 * Change the cursor to point to the first record at the given level.
461 * Other levels are unaffected.
462 */
463int /* success=1, failure=0 */
464xfs_btree_firstrec(
465 xfs_btree_cur_t *cur, /* btree cursor */
466 int level) /* level to change */
467{
468 xfs_btree_block_t *block; /* generic btree block pointer */
469 xfs_buf_t *bp; /* buffer containing block */
470
471 /*
472 * Get the block pointer for this level.
473 */
474 block = xfs_btree_get_block(cur, level, &bp);
475 xfs_btree_check_block(cur, block, level, bp);
476 /*
477 * It's empty, there is no such record.
478 */
479 if (!block->bb_h.bb_numrecs)
480 return 0;
481 /*
482 * Set the ptr value to 1, that's the first record/key.
483 */
484 cur->bc_ptrs[level] = 1;
485 return 1;
486}
487
488/*
489 * Retrieve the block pointer from the cursor at the given level. 433 * Retrieve the block pointer from the cursor at the given level.
490 * This may be a bmap btree root or from a buffer. 434 * This may be a bmap btree root or from a buffer.
491 */ 435 */
@@ -626,6 +570,13 @@ xfs_btree_init_cursor(
626 cur->bc_private.a.agbp = agbp; 570 cur->bc_private.a.agbp = agbp;
627 cur->bc_private.a.agno = agno; 571 cur->bc_private.a.agno = agno;
628 break; 572 break;
573 case XFS_BTNUM_INO:
574 /*
575 * Inode allocation btree fields.
576 */
577 cur->bc_private.a.agbp = agbp;
578 cur->bc_private.a.agno = agno;
579 break;
629 case XFS_BTNUM_BMAP: 580 case XFS_BTNUM_BMAP:
630 /* 581 /*
631 * Bmap btree fields. 582 * Bmap btree fields.
@@ -638,13 +589,6 @@ xfs_btree_init_cursor(
638 cur->bc_private.b.flags = 0; 589 cur->bc_private.b.flags = 0;
639 cur->bc_private.b.whichfork = whichfork; 590 cur->bc_private.b.whichfork = whichfork;
640 break; 591 break;
641 case XFS_BTNUM_INO:
642 /*
643 * Inode allocation btree fields.
644 */
645 cur->bc_private.i.agbp = agbp;
646 cur->bc_private.i.agno = agno;
647 break;
648 default: 592 default:
649 ASSERT(0); 593 ASSERT(0);
650 } 594 }
@@ -671,6 +615,35 @@ xfs_btree_islastblock(
671} 615}
672 616
673/* 617/*
618 * Change the cursor to point to the first record at the given level.
619 * Other levels are unaffected.
620 */
621int /* success=1, failure=0 */
622xfs_btree_firstrec(
623 xfs_btree_cur_t *cur, /* btree cursor */
624 int level) /* level to change */
625{
626 xfs_btree_block_t *block; /* generic btree block pointer */
627 xfs_buf_t *bp; /* buffer containing block */
628
629 /*
630 * Get the block pointer for this level.
631 */
632 block = xfs_btree_get_block(cur, level, &bp);
633 xfs_btree_check_block(cur, block, level, bp);
634 /*
635 * It's empty, there is no such record.
636 */
637 if (!block->bb_h.bb_numrecs)
638 return 0;
639 /*
640 * Set the ptr value to 1, that's the first record/key.
641 */
642 cur->bc_ptrs[level] = 1;
643 return 1;
644}
645
646/*
674 * Change the cursor to point to the last record in the current block 647 * Change the cursor to point to the last record in the current block
675 * at the given level. Other levels are unaffected. 648 * at the given level. Other levels are unaffected.
676 */ 649 */
@@ -890,12 +863,12 @@ xfs_btree_readahead_core(
890 case XFS_BTNUM_INO: 863 case XFS_BTNUM_INO:
891 i = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]); 864 i = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]);
892 if ((lr & XFS_BTCUR_LEFTRA) && be32_to_cpu(i->bb_leftsib) != NULLAGBLOCK) { 865 if ((lr & XFS_BTCUR_LEFTRA) && be32_to_cpu(i->bb_leftsib) != NULLAGBLOCK) {
893 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno, 866 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
894 be32_to_cpu(i->bb_leftsib), 1); 867 be32_to_cpu(i->bb_leftsib), 1);
895 rval++; 868 rval++;
896 } 869 }
897 if ((lr & XFS_BTCUR_RIGHTRA) && be32_to_cpu(i->bb_rightsib) != NULLAGBLOCK) { 870 if ((lr & XFS_BTCUR_RIGHTRA) && be32_to_cpu(i->bb_rightsib) != NULLAGBLOCK) {
898 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno, 871 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
899 be32_to_cpu(i->bb_rightsib), 1); 872 be32_to_cpu(i->bb_rightsib), 1);
900 rval++; 873 rval++;
901 } 874 }
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 7440b78f9cec..1f528a2a3754 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -158,8 +158,8 @@ typedef struct xfs_btree_cur
158 __uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */ 158 __uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */
159 xfs_btnum_t bc_btnum; /* identifies which btree type */ 159 xfs_btnum_t bc_btnum; /* identifies which btree type */
160 union { 160 union {
161 struct { /* needed for BNO, CNT */ 161 struct { /* needed for BNO, CNT, INO */
162 struct xfs_buf *agbp; /* agf buffer pointer */ 162 struct xfs_buf *agbp; /* agf/agi buffer pointer */
163 xfs_agnumber_t agno; /* ag number */ 163 xfs_agnumber_t agno; /* ag number */
164 } a; 164 } a;
165 struct { /* needed for BMAP */ 165 struct { /* needed for BMAP */
@@ -172,10 +172,6 @@ typedef struct xfs_btree_cur
172 char flags; /* flags */ 172 char flags; /* flags */
173#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */ 173#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */
174 } b; 174 } b;
175 struct { /* needed for INO */
176 struct xfs_buf *agbp; /* agi buffer pointer */
177 xfs_agnumber_t agno; /* ag number */
178 } i;
179 } bc_private; /* per-btree type data */ 175 } bc_private; /* per-btree type data */
180} xfs_btree_cur_t; 176} xfs_btree_cur_t;
181 177
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index d86ca2c03a70..002fc2617c8e 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -732,12 +732,13 @@ xfs_buf_item_init(
732 bip->bli_item.li_ops = &xfs_buf_item_ops; 732 bip->bli_item.li_ops = &xfs_buf_item_ops;
733 bip->bli_item.li_mountp = mp; 733 bip->bli_item.li_mountp = mp;
734 bip->bli_buf = bp; 734 bip->bli_buf = bp;
735 xfs_buf_hold(bp);
735 bip->bli_format.blf_type = XFS_LI_BUF; 736 bip->bli_format.blf_type = XFS_LI_BUF;
736 bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); 737 bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp);
737 bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp)); 738 bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp));
738 bip->bli_format.blf_map_size = map_size; 739 bip->bli_format.blf_map_size = map_size;
739#ifdef XFS_BLI_TRACE 740#ifdef XFS_BLI_TRACE
740 bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_SLEEP); 741 bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_NOFS);
741#endif 742#endif
742 743
743#ifdef XFS_TRANS_DEBUG 744#ifdef XFS_TRANS_DEBUG
@@ -867,6 +868,21 @@ xfs_buf_item_dirty(
867 return (bip->bli_flags & XFS_BLI_DIRTY); 868 return (bip->bli_flags & XFS_BLI_DIRTY);
868} 869}
869 870
871STATIC void
872xfs_buf_item_free(
873 xfs_buf_log_item_t *bip)
874{
875#ifdef XFS_TRANS_DEBUG
876 kmem_free(bip->bli_orig);
877 kmem_free(bip->bli_logged);
878#endif /* XFS_TRANS_DEBUG */
879
880#ifdef XFS_BLI_TRACE
881 ktrace_free(bip->bli_trace);
882#endif
883 kmem_zone_free(xfs_buf_item_zone, bip);
884}
885
870/* 886/*
871 * This is called when the buf log item is no longer needed. It should 887 * This is called when the buf log item is no longer needed. It should
872 * free the buf log item associated with the given buffer and clear 888 * free the buf log item associated with the given buffer and clear
@@ -887,18 +903,8 @@ xfs_buf_item_relse(
887 (XFS_BUF_IODONE_FUNC(bp) != NULL)) { 903 (XFS_BUF_IODONE_FUNC(bp) != NULL)) {
888 XFS_BUF_CLR_IODONE_FUNC(bp); 904 XFS_BUF_CLR_IODONE_FUNC(bp);
889 } 905 }
890 906 xfs_buf_rele(bp);
891#ifdef XFS_TRANS_DEBUG 907 xfs_buf_item_free(bip);
892 kmem_free(bip->bli_orig);
893 bip->bli_orig = NULL;
894 kmem_free(bip->bli_logged);
895 bip->bli_logged = NULL;
896#endif /* XFS_TRANS_DEBUG */
897
898#ifdef XFS_BLI_TRACE
899 ktrace_free(bip->bli_trace);
900#endif
901 kmem_zone_free(xfs_buf_item_zone, bip);
902} 908}
903 909
904 910
@@ -1056,7 +1062,7 @@ xfs_buf_iodone_callbacks(
1056 anyway. */ 1062 anyway. */
1057 XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse); 1063 XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse);
1058 XFS_BUF_DONE(bp); 1064 XFS_BUF_DONE(bp);
1059 XFS_BUF_V_IODONESEMA(bp); 1065 XFS_BUF_FINISH_IOWAIT(bp);
1060 } 1066 }
1061 return; 1067 return;
1062 } 1068 }
@@ -1120,6 +1126,7 @@ xfs_buf_iodone(
1120 1126
1121 ASSERT(bip->bli_buf == bp); 1127 ASSERT(bip->bli_buf == bp);
1122 1128
1129 xfs_buf_rele(bp);
1123 mp = bip->bli_item.li_mountp; 1130 mp = bip->bli_item.li_mountp;
1124 1131
1125 /* 1132 /*
@@ -1136,18 +1143,7 @@ xfs_buf_iodone(
1136 * xfs_trans_delete_ail() drops the AIL lock. 1143 * xfs_trans_delete_ail() drops the AIL lock.
1137 */ 1144 */
1138 xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip); 1145 xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip);
1139 1146 xfs_buf_item_free(bip);
1140#ifdef XFS_TRANS_DEBUG
1141 kmem_free(bip->bli_orig);
1142 bip->bli_orig = NULL;
1143 kmem_free(bip->bli_logged);
1144 bip->bli_logged = NULL;
1145#endif /* XFS_TRANS_DEBUG */
1146
1147#ifdef XFS_BLI_TRACE
1148 ktrace_free(bip->bli_trace);
1149#endif
1150 kmem_zone_free(xfs_buf_item_zone, bip);
1151} 1147}
1152 1148
1153#if defined(XFS_BLI_TRACE) 1149#if defined(XFS_BLI_TRACE)
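
The new xfs_buf_hold() in xfs_buf_item_init() pairs with an xfs_buf_rele() on both teardown paths, and the previously duplicated free code is collected into xfs_buf_item_free(). A sketch of the resulting lifetime, assuming the call order shown in the hunks above:

	void
	buf_item_lifetime_sketch(struct xfs_buf *bp, struct xfs_mount *mp)
	{
		xfs_buf_item_init(bp, mp);	/* takes a hold on bp */

		/* ... the buffer is logged and committed via a transaction ... */

		xfs_buf_item_relse(bp);		/* xfs_buf_rele() + xfs_buf_item_free() */
	}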
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 2211e885ef24..75b0cd4da0ea 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -128,10 +128,8 @@ xfs_swap_extents(
128 xfs_swapext_t *sxp) 128 xfs_swapext_t *sxp)
129{ 129{
130 xfs_mount_t *mp; 130 xfs_mount_t *mp;
131 xfs_inode_t *ips[2];
132 xfs_trans_t *tp; 131 xfs_trans_t *tp;
133 xfs_bstat_t *sbp = &sxp->sx_stat; 132 xfs_bstat_t *sbp = &sxp->sx_stat;
134 bhv_vnode_t *vp, *tvp;
135 xfs_ifork_t *tempifp, *ifp, *tifp; 133 xfs_ifork_t *tempifp, *ifp, *tifp;
136 int ilf_fields, tilf_fields; 134 int ilf_fields, tilf_fields;
137 static uint lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL; 135 static uint lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL;
@@ -150,19 +148,15 @@ xfs_swap_extents(
150 } 148 }
151 149
152 sbp = &sxp->sx_stat; 150 sbp = &sxp->sx_stat;
153 vp = XFS_ITOV(ip);
154 tvp = XFS_ITOV(tip);
155
156 /* Lock in i_ino order */
157 if (ip->i_ino < tip->i_ino) {
158 ips[0] = ip;
159 ips[1] = tip;
160 } else {
161 ips[0] = tip;
162 ips[1] = ip;
163 }
164 151
165 xfs_lock_inodes(ips, 2, lock_flags); 152 /*
153 * we have to do two separate lock calls here to keep lockdep
 154 * happy. If we try to get all the locks in one call, lockdep will
155 * report false positives when we drop the ILOCK and regain them
156 * below.
157 */
158 xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
159 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
166 locked = 1; 160 locked = 1;
167 161
168 /* Verify that both files have the same format */ 162 /* Verify that both files have the same format */
@@ -184,7 +178,7 @@ xfs_swap_extents(
184 goto error0; 178 goto error0;
185 } 179 }
186 180
187 if (VN_CACHED(tvp) != 0) { 181 if (VN_CACHED(VFS_I(tip)) != 0) {
188 xfs_inval_cached_trace(tip, 0, -1, 0, -1); 182 xfs_inval_cached_trace(tip, 0, -1, 0, -1);
189 error = xfs_flushinval_pages(tip, 0, -1, 183 error = xfs_flushinval_pages(tip, 0, -1,
190 FI_REMAPF_LOCKED); 184 FI_REMAPF_LOCKED);
@@ -193,7 +187,7 @@ xfs_swap_extents(
193 } 187 }
194 188
195 /* Verify O_DIRECT for ftmp */ 189 /* Verify O_DIRECT for ftmp */
196 if (VN_CACHED(tvp) != 0) { 190 if (VN_CACHED(VFS_I(tip)) != 0) {
197 error = XFS_ERROR(EINVAL); 191 error = XFS_ERROR(EINVAL);
198 goto error0; 192 goto error0;
199 } 193 }
@@ -237,7 +231,7 @@ xfs_swap_extents(
237 * vop_read (or write in the case of autogrow) they block on the iolock 231 * vop_read (or write in the case of autogrow) they block on the iolock
238 * until we have switched the extents. 232 * until we have switched the extents.
239 */ 233 */
240 if (VN_MAPPED(vp)) { 234 if (VN_MAPPED(VFS_I(ip))) {
241 error = XFS_ERROR(EBUSY); 235 error = XFS_ERROR(EBUSY);
242 goto error0; 236 goto error0;
243 } 237 }
@@ -265,7 +259,7 @@ xfs_swap_extents(
265 locked = 0; 259 locked = 0;
266 goto error0; 260 goto error0;
267 } 261 }
268 xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL); 262 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
269 263
270 /* 264 /*
271 * Count the number of extended attribute blocks 265 * Count the number of extended attribute blocks
@@ -350,15 +344,11 @@ xfs_swap_extents(
350 break; 344 break;
351 } 345 }
352 346
353 /*
354 * Increment vnode ref counts since xfs_trans_commit &
355 * xfs_trans_cancel will both unlock the inodes and
356 * decrement the associated ref counts.
357 */
358 VN_HOLD(vp);
359 VN_HOLD(tvp);
360 347
348 IHOLD(ip);
361 xfs_trans_ijoin(tp, ip, lock_flags); 349 xfs_trans_ijoin(tp, ip, lock_flags);
350
351 IHOLD(tip);
362 xfs_trans_ijoin(tp, tip, lock_flags); 352 xfs_trans_ijoin(tp, tip, lock_flags);
363 353
364 xfs_trans_log_inode(tp, ip, ilf_fields); 354 xfs_trans_log_inode(tp, ip, ilf_fields);
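
The swap path now acquires the two lock classes in two passes instead of one xfs_lock_inodes() call over an array. A sketch of the resulting order, using the calls from the hunks above:

	/* both iolocks first, then both ilocks: two distinct lockdep classes */
	xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);

	/* ... ILOCKs are dropped across the transaction reservation ... */

	/* and retaken in a single call once the transaction exists */
	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);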
diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h
index cdc2d3464a1a..2813cdd72375 100644
--- a/fs/xfs/xfs_dmapi.h
+++ b/fs/xfs/xfs_dmapi.h
@@ -18,7 +18,6 @@
18#ifndef __XFS_DMAPI_H__ 18#ifndef __XFS_DMAPI_H__
19#define __XFS_DMAPI_H__ 19#define __XFS_DMAPI_H__
20 20
21#include <linux/version.h>
22/* Values used to define the on-disk version of dm_attrname_t. All 21/* Values used to define the on-disk version of dm_attrname_t. All
23 * on-disk attribute names start with the 8-byte string "SGI_DMI_". 22 * on-disk attribute names start with the 8-byte string "SGI_DMI_".
24 * 23 *
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index f66756cfb5e8..f227ecd1a294 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -58,9 +58,6 @@ xfs_error_trap(int e)
58 } 58 }
59 return e; 59 return e;
60} 60}
61#endif
62
63#if (defined(DEBUG) || defined(INDUCE_IO_ERROR))
64 61
65int xfs_etest[XFS_NUM_INJECT_ERROR]; 62int xfs_etest[XFS_NUM_INJECT_ERROR];
66int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR]; 63int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR];
@@ -154,7 +151,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
154 151
155 return 0; 152 return 0;
156} 153}
157#endif /* DEBUG || INDUCE_IO_ERROR */ 154#endif /* DEBUG */
158 155
159static void 156static void
160xfs_fs_vcmn_err(int level, xfs_mount_t *mp, char *fmt, va_list ap) 157xfs_fs_vcmn_err(int level, xfs_mount_t *mp, char *fmt, va_list ap)
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index d8559d132efa..11543f10b0c6 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -125,22 +125,14 @@ extern void xfs_corruption_error(char *tag, int level, struct xfs_mount *mp,
125#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10) 125#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10)
126#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT 126#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT
127 127
128#if (defined(DEBUG) || defined(INDUCE_IO_ERROR)) 128#ifdef DEBUG
129extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); 129extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
130 130
131#define XFS_NUM_INJECT_ERROR 10 131#define XFS_NUM_INJECT_ERROR 10
132
133#ifdef __ANSI_CPP__
134#define XFS_TEST_ERROR(expr, mp, tag, rf) \
135 ((expr) || \
136 xfs_error_test((tag), (mp)->m_fixedfsid, #expr, __LINE__, __FILE__, \
137 (rf)))
138#else
139#define XFS_TEST_ERROR(expr, mp, tag, rf) \ 132#define XFS_TEST_ERROR(expr, mp, tag, rf) \
140 ((expr) || \ 133 ((expr) || \
141 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ 134 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
142 (rf))) 135 (rf)))
143#endif /* __ANSI_CPP__ */
144 136
145extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); 137extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp);
146extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); 138extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
@@ -148,7 +140,7 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
148#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) 140#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr)
149#define xfs_errortag_add(tag, mp) (ENOSYS) 141#define xfs_errortag_add(tag, mp) (ENOSYS)
150#define xfs_errortag_clearall(mp, loud) (ENOSYS) 142#define xfs_errortag_clearall(mp, loud) (ENOSYS)
151#endif /* (DEBUG || INDUCE_IO_ERROR) */ 143#endif /* DEBUG */
152 144
153/* 145/*
154 * XFS panic tags -- allow a call to xfs_cmn_err() be turned into 146 * XFS panic tags -- allow a call to xfs_cmn_err() be turned into
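
With the __ANSI_CPP__ branch gone, a single XFS_TEST_ERROR definition remains for DEBUG builds. An illustrative use, borrowing the tag/random pair defined in this header and the error idiom seen in the xfs_bmap.c hunks earlier (the condition is a hypothetical check site, not from the patch):

	if (XFS_TEST_ERROR(nextents > maxrecs, mp,
			XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))
		return XFS_ERROR(EFSCORRUPTED);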
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index c38fd14fca29..f3bb75da384e 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -400,7 +400,7 @@ xfs_filestream_init(void)
400 if (!item_zone) 400 if (!item_zone)
401 return -ENOMEM; 401 return -ENOMEM;
402#ifdef XFS_FILESTREAMS_TRACE 402#ifdef XFS_FILESTREAMS_TRACE
403 xfs_filestreams_trace_buf = ktrace_alloc(XFS_FSTRM_KTRACE_SIZE, KM_SLEEP); 403 xfs_filestreams_trace_buf = ktrace_alloc(XFS_FSTRM_KTRACE_SIZE, KM_NOFS);
404#endif 404#endif
405 return 0; 405 return 0;
406} 406}
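
The KM_SLEEP to KM_NOFS switch here, and in the other ktrace_alloc() calls in this series, keeps trace-buffer allocation from recursing into the filesystem during reclaim. Assuming the usual mapping in the XFS kmem wrappers of this era (kmem_flags_convert()):

	/*
	 * KM_SLEEP -> GFP_KERNEL: reclaim may re-enter the filesystem
	 * KM_NOFS  -> GFP_NOFS:   reclaim must not re-enter filesystems
	 */
	xfs_filestreams_trace_buf = ktrace_alloc(XFS_FSTRM_KTRACE_SIZE, KM_NOFS);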
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index e5310c90e50f..83502f3edef0 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -181,7 +181,7 @@ xfs_inobt_delrec(
181 * then we can get rid of this level. 181 * then we can get rid of this level.
182 */ 182 */
183 if (numrecs == 1 && level > 0) { 183 if (numrecs == 1 && level > 0) {
184 agbp = cur->bc_private.i.agbp; 184 agbp = cur->bc_private.a.agbp;
185 agi = XFS_BUF_TO_AGI(agbp); 185 agi = XFS_BUF_TO_AGI(agbp);
186 /* 186 /*
187 * pp is still set to the first pointer in the block. 187 * pp is still set to the first pointer in the block.
@@ -194,7 +194,7 @@ xfs_inobt_delrec(
194 * Free the block. 194 * Free the block.
195 */ 195 */
196 if ((error = xfs_free_extent(cur->bc_tp, 196 if ((error = xfs_free_extent(cur->bc_tp,
197 XFS_AGB_TO_FSB(mp, cur->bc_private.i.agno, bno), 1))) 197 XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno, bno), 1)))
198 return error; 198 return error;
199 xfs_trans_binval(cur->bc_tp, bp); 199 xfs_trans_binval(cur->bc_tp, bp);
200 xfs_ialloc_log_agi(cur->bc_tp, agbp, 200 xfs_ialloc_log_agi(cur->bc_tp, agbp,
@@ -379,7 +379,7 @@ xfs_inobt_delrec(
379 rrecs = be16_to_cpu(right->bb_numrecs); 379 rrecs = be16_to_cpu(right->bb_numrecs);
380 rbp = bp; 380 rbp = bp;
381 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, 381 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
382 cur->bc_private.i.agno, lbno, 0, &lbp, 382 cur->bc_private.a.agno, lbno, 0, &lbp,
383 XFS_INO_BTREE_REF))) 383 XFS_INO_BTREE_REF)))
384 return error; 384 return error;
385 left = XFS_BUF_TO_INOBT_BLOCK(lbp); 385 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
@@ -401,7 +401,7 @@ xfs_inobt_delrec(
401 lrecs = be16_to_cpu(left->bb_numrecs); 401 lrecs = be16_to_cpu(left->bb_numrecs);
402 lbp = bp; 402 lbp = bp;
403 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, 403 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
404 cur->bc_private.i.agno, rbno, 0, &rbp, 404 cur->bc_private.a.agno, rbno, 0, &rbp,
405 XFS_INO_BTREE_REF))) 405 XFS_INO_BTREE_REF)))
406 return error; 406 return error;
407 right = XFS_BUF_TO_INOBT_BLOCK(rbp); 407 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
@@ -484,7 +484,7 @@ xfs_inobt_delrec(
484 xfs_buf_t *rrbp; 484 xfs_buf_t *rrbp;
485 485
486 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, 486 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
487 cur->bc_private.i.agno, be32_to_cpu(left->bb_rightsib), 0, 487 cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), 0,
488 &rrbp, XFS_INO_BTREE_REF))) 488 &rrbp, XFS_INO_BTREE_REF)))
489 return error; 489 return error;
490 rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp); 490 rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp);
@@ -497,7 +497,7 @@ xfs_inobt_delrec(
497 * Free the deleting block. 497 * Free the deleting block.
498 */ 498 */
499 if ((error = xfs_free_extent(cur->bc_tp, XFS_AGB_TO_FSB(mp, 499 if ((error = xfs_free_extent(cur->bc_tp, XFS_AGB_TO_FSB(mp,
500 cur->bc_private.i.agno, rbno), 1))) 500 cur->bc_private.a.agno, rbno), 1)))
501 return error; 501 return error;
502 xfs_trans_binval(cur->bc_tp, rbp); 502 xfs_trans_binval(cur->bc_tp, rbp);
503 /* 503 /*
@@ -854,7 +854,7 @@ xfs_inobt_lookup(
854 { 854 {
855 xfs_agi_t *agi; /* a.g. inode header */ 855 xfs_agi_t *agi; /* a.g. inode header */
856 856
857 agi = XFS_BUF_TO_AGI(cur->bc_private.i.agbp); 857 agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
858 agno = be32_to_cpu(agi->agi_seqno); 858 agno = be32_to_cpu(agi->agi_seqno);
859 agbno = be32_to_cpu(agi->agi_root); 859 agbno = be32_to_cpu(agi->agi_root);
860 } 860 }
@@ -1089,7 +1089,7 @@ xfs_inobt_lshift(
1089 * Set up the left neighbor as "left". 1089 * Set up the left neighbor as "left".
1090 */ 1090 */
1091 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, 1091 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1092 cur->bc_private.i.agno, be32_to_cpu(right->bb_leftsib), 1092 cur->bc_private.a.agno, be32_to_cpu(right->bb_leftsib),
1093 0, &lbp, XFS_INO_BTREE_REF))) 1093 0, &lbp, XFS_INO_BTREE_REF)))
1094 return error; 1094 return error;
1095 left = XFS_BUF_TO_INOBT_BLOCK(lbp); 1095 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
@@ -1207,10 +1207,10 @@ xfs_inobt_newroot(
1207 /* 1207 /*
1208 * Get a block & a buffer. 1208 * Get a block & a buffer.
1209 */ 1209 */
1210 agi = XFS_BUF_TO_AGI(cur->bc_private.i.agbp); 1210 agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
1211 args.tp = cur->bc_tp; 1211 args.tp = cur->bc_tp;
1212 args.mp = cur->bc_mp; 1212 args.mp = cur->bc_mp;
1213 args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.i.agno, 1213 args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno,
1214 be32_to_cpu(agi->agi_root)); 1214 be32_to_cpu(agi->agi_root));
1215 args.mod = args.minleft = args.alignment = args.total = args.wasdel = 1215 args.mod = args.minleft = args.alignment = args.total = args.wasdel =
1216 args.isfl = args.userdata = args.minalignslop = 0; 1216 args.isfl = args.userdata = args.minalignslop = 0;
@@ -1233,7 +1233,7 @@ xfs_inobt_newroot(
1233 */ 1233 */
1234 agi->agi_root = cpu_to_be32(args.agbno); 1234 agi->agi_root = cpu_to_be32(args.agbno);
1235 be32_add_cpu(&agi->agi_level, 1); 1235 be32_add_cpu(&agi->agi_level, 1);
1236 xfs_ialloc_log_agi(args.tp, cur->bc_private.i.agbp, 1236 xfs_ialloc_log_agi(args.tp, cur->bc_private.a.agbp,
1237 XFS_AGI_ROOT | XFS_AGI_LEVEL); 1237 XFS_AGI_ROOT | XFS_AGI_LEVEL);
1238 /* 1238 /*
1239 * At the previous root level there are now two blocks: the old 1239 * At the previous root level there are now two blocks: the old
@@ -1376,7 +1376,7 @@ xfs_inobt_rshift(
1376 * Set up the right neighbor as "right". 1376 * Set up the right neighbor as "right".
1377 */ 1377 */
1378 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, 1378 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1379 cur->bc_private.i.agno, be32_to_cpu(left->bb_rightsib), 1379 cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib),
1380 0, &rbp, XFS_INO_BTREE_REF))) 1380 0, &rbp, XFS_INO_BTREE_REF)))
1381 return error; 1381 return error;
1382 right = XFS_BUF_TO_INOBT_BLOCK(rbp); 1382 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
@@ -1492,7 +1492,7 @@ xfs_inobt_split(
1492 * Allocate the new block. 1492 * Allocate the new block.
1493 * If we can't do it, we're toast. Give up. 1493 * If we can't do it, we're toast. Give up.
1494 */ 1494 */
1495 args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.i.agno, lbno); 1495 args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, lbno);
1496 args.mod = args.minleft = args.alignment = args.total = args.wasdel = 1496 args.mod = args.minleft = args.alignment = args.total = args.wasdel =
1497 args.isfl = args.userdata = args.minalignslop = 0; 1497 args.isfl = args.userdata = args.minalignslop = 0;
1498 args.minlen = args.maxlen = args.prod = 1; 1498 args.minlen = args.maxlen = args.prod = 1;
@@ -1725,7 +1725,7 @@ xfs_inobt_decrement(
1725 1725
1726 agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur)); 1726 agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
1727 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, 1727 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1728 cur->bc_private.i.agno, agbno, 0, &bp, 1728 cur->bc_private.a.agno, agbno, 0, &bp,
1729 XFS_INO_BTREE_REF))) 1729 XFS_INO_BTREE_REF)))
1730 return error; 1730 return error;
1731 lev--; 1731 lev--;
@@ -1897,7 +1897,7 @@ xfs_inobt_increment(
1897 1897
1898 agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur)); 1898 agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
1899 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, 1899 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1900 cur->bc_private.i.agno, agbno, 0, &bp, 1900 cur->bc_private.a.agno, agbno, 0, &bp,
1901 XFS_INO_BTREE_REF))) 1901 XFS_INO_BTREE_REF)))
1902 return error; 1902 return error;
1903 lev--; 1903 lev--;
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index b07604b94d9f..e229e9e001c2 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -216,7 +216,14 @@ finish_inode:
216 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 216 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
217 init_waitqueue_head(&ip->i_ipin_wait); 217 init_waitqueue_head(&ip->i_ipin_wait);
218 atomic_set(&ip->i_pincount, 0); 218 atomic_set(&ip->i_pincount, 0);
219 initnsema(&ip->i_flock, 1, "xfsfino"); 219
220 /*
221 * Because we want to use a counting completion, complete
222 * the flush completion once to allow a single access to
223 * the flush completion without blocking.
224 */
225 init_completion(&ip->i_flush);
226 complete(&ip->i_flush);
220 227
221 if (lock_flags) 228 if (lock_flags)
222 xfs_ilock(ip, lock_flags); 229 xfs_ilock(ip, lock_flags);
@@ -288,10 +295,17 @@ finish_inode:
288 *ipp = ip; 295 *ipp = ip;
289 296
290 /* 297 /*
 298 * Link the Linux inode to the XFS inode.
299 */
300 ip->i_vnode = inode;
301 inode->i_private = ip;
302
303 /*
291 * If we have a real type for an on-disk inode, we can set ops(&unlock) 304 * If we have a real type for an on-disk inode, we can set ops(&unlock)
292 * now. If it's a new inode being created, xfs_ialloc will handle it. 305 * now. If it's a new inode being created, xfs_ialloc will handle it.
293 */ 306 */
294 xfs_initialize_vnode(mp, inode, ip); 307 if (ip->i_d.di_mode != 0)
308 xfs_setup_inode(ip);
295 return 0; 309 return 0;
296} 310}
297 311
@@ -411,10 +425,11 @@ xfs_iput(xfs_inode_t *ip,
411 * Special iput for brand-new inodes that are still locked 425 * Special iput for brand-new inodes that are still locked
412 */ 426 */
413void 427void
414xfs_iput_new(xfs_inode_t *ip, 428xfs_iput_new(
415 uint lock_flags) 429 xfs_inode_t *ip,
430 uint lock_flags)
416{ 431{
417 struct inode *inode = ip->i_vnode; 432 struct inode *inode = VFS_I(ip);
418 433
419 xfs_itrace_entry(ip); 434 xfs_itrace_entry(ip);
420 435
@@ -775,26 +790,3 @@ xfs_isilocked(
775} 790}
776#endif 791#endif
777 792
778/*
779 * The following three routines simply manage the i_flock
780 * semaphore embedded in the inode. This semaphore synchronizes
781 * processes attempting to flush the in-core inode back to disk.
782 */
783void
784xfs_iflock(xfs_inode_t *ip)
785{
786 psema(&(ip->i_flock), PINOD|PLTWAIT);
787}
788
789int
790xfs_iflock_nowait(xfs_inode_t *ip)
791{
792 return (cpsema(&(ip->i_flock)));
793}
794
795void
796xfs_ifunlock(xfs_inode_t *ip)
797{
798 ASSERT(issemalocked(&(ip->i_flock)));
799 vsema(&(ip->i_flock));
800}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index bedc66163176..dbd9cef852ec 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -580,8 +580,8 @@ xfs_iformat_extents(
580 xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip)); 580 xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
581 for (i = 0; i < nex; i++, dp++) { 581 for (i = 0; i < nex; i++, dp++) {
582 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); 582 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
583 ep->l0 = be64_to_cpu(get_unaligned(&dp->l0)); 583 ep->l0 = get_unaligned_be64(&dp->l0);
584 ep->l1 = be64_to_cpu(get_unaligned(&dp->l1)); 584 ep->l1 = get_unaligned_be64(&dp->l1);
585 } 585 }
586 XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork); 586 XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
587 if (whichfork != XFS_DATA_FORK || 587 if (whichfork != XFS_DATA_FORK ||
@@ -835,22 +835,22 @@ xfs_iread(
835 * Do this before xfs_iformat in case it adds entries. 835 * Do this before xfs_iformat in case it adds entries.
836 */ 836 */
837#ifdef XFS_INODE_TRACE 837#ifdef XFS_INODE_TRACE
838 ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_SLEEP); 838 ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_NOFS);
839#endif 839#endif
840#ifdef XFS_BMAP_TRACE 840#ifdef XFS_BMAP_TRACE
841 ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_SLEEP); 841 ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_NOFS);
842#endif 842#endif
843#ifdef XFS_BMBT_TRACE 843#ifdef XFS_BMBT_TRACE
844 ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_SLEEP); 844 ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_NOFS);
845#endif 845#endif
846#ifdef XFS_RW_TRACE 846#ifdef XFS_RW_TRACE
847 ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_SLEEP); 847 ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_NOFS);
848#endif 848#endif
849#ifdef XFS_ILOCK_TRACE 849#ifdef XFS_ILOCK_TRACE
850 ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_SLEEP); 850 ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_NOFS);
851#endif 851#endif
852#ifdef XFS_DIR2_TRACE 852#ifdef XFS_DIR2_TRACE
853 ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_SLEEP); 853 ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
854#endif 854#endif
855 855
856 /* 856 /*
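
The KM_SLEEP to KM_NOFS switches in this hunk (and the similar ones further down) matter because these allocations run while inode state is held for reading or flushing: a KM_SLEEP allocation may trigger memory reclaim that re-enters the filesystem and deadlocks. A compilable sketch of the flag translation, loosely modelled on XFS's kmem_flags_convert() in fs/xfs/kmem.h (the gfp bit values below are stand-ins, not the kernel's):

    #include <stdio.h>

    /* Stand-in gfp bits (illustrative values only). */
    #define __GFP_WAIT 0x1u
    #define __GFP_IO   0x2u
    #define __GFP_FS   0x4u
    #define GFP_ATOMIC 0x0u
    #define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS)

    #define KM_SLEEP   0x0001u
    #define KM_NOSLEEP 0x0002u
    #define KM_NOFS    0x0004u

    static unsigned int kmem_flags_convert(unsigned int km_flags)
    {
        if (km_flags & KM_NOSLEEP)
            return GFP_ATOMIC;              /* never blocks, may fail */

        if (km_flags & KM_NOFS)
            return GFP_KERNEL & ~__GFP_FS;  /* reclaim must not re-enter the FS */

        return GFP_KERNEL;                  /* KM_SLEEP: full reclaim allowed */
    }

    int main(void)
    {
        printf("KM_SLEEP -> %#x\n", kmem_flags_convert(KM_SLEEP));
        printf("KM_NOFS  -> %#x\n", kmem_flags_convert(KM_NOFS));
        return 0;
    }
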
@@ -1046,9 +1046,9 @@ xfs_ialloc(
1046{ 1046{
1047 xfs_ino_t ino; 1047 xfs_ino_t ino;
1048 xfs_inode_t *ip; 1048 xfs_inode_t *ip;
1049 bhv_vnode_t *vp;
1050 uint flags; 1049 uint flags;
1051 int error; 1050 int error;
1051 timespec_t tv;
1052 1052
1053 /* 1053 /*
1054 * Call the space management code to pick 1054 * Call the space management code to pick
@@ -1077,13 +1077,12 @@ xfs_ialloc(
1077 } 1077 }
1078 ASSERT(ip != NULL); 1078 ASSERT(ip != NULL);
1079 1079
1080 vp = XFS_ITOV(ip);
1081 ip->i_d.di_mode = (__uint16_t)mode; 1080 ip->i_d.di_mode = (__uint16_t)mode;
1082 ip->i_d.di_onlink = 0; 1081 ip->i_d.di_onlink = 0;
1083 ip->i_d.di_nlink = nlink; 1082 ip->i_d.di_nlink = nlink;
1084 ASSERT(ip->i_d.di_nlink == nlink); 1083 ASSERT(ip->i_d.di_nlink == nlink);
1085 ip->i_d.di_uid = current_fsuid(cr); 1084 ip->i_d.di_uid = current_fsuid();
1086 ip->i_d.di_gid = current_fsgid(cr); 1085 ip->i_d.di_gid = current_fsgid();
1087 ip->i_d.di_projid = prid; 1086 ip->i_d.di_projid = prid;
1088 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 1087 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
1089 1088
@@ -1130,7 +1129,13 @@ xfs_ialloc(
1130 ip->i_size = 0; 1129 ip->i_size = 0;
1131 ip->i_d.di_nextents = 0; 1130 ip->i_d.di_nextents = 0;
1132 ASSERT(ip->i_d.di_nblocks == 0); 1131 ASSERT(ip->i_d.di_nblocks == 0);
1133 xfs_ichgtime(ip, XFS_ICHGTIME_CHG|XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD); 1132
1133 nanotime(&tv);
1134 ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
1135 ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
1136 ip->i_d.di_atime = ip->i_d.di_mtime;
1137 ip->i_d.di_ctime = ip->i_d.di_mtime;
1138
1134 /* 1139 /*
1135 * di_gen will have been taken care of in xfs_iread. 1140 * di_gen will have been taken care of in xfs_iread.
1136 */ 1141 */
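
Replacing the xfs_ichgtime(CHG|ACC|MOD) call with a single nanotime() sample stamps mtime, atime, and ctime of the freshly allocated inode from one clock reading, so all three are guaranteed identical. A userspace sketch of the same stamping pattern (clock_gettime() standing in for the kernel's nanotime(); the 32-bit fields mirror the casts in the diff):

    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    /* Mirrors xfs_timestamp_t: 32-bit seconds and nanoseconds on disk. */
    struct xfs_ts { int32_t t_sec; int32_t t_nsec; };

    int main(void)
    {
        struct timespec tv;
        struct xfs_ts mtime, atime, ctim;

        clock_gettime(CLOCK_REALTIME, &tv);   /* stand-in for nanotime() */
        mtime.t_sec  = (int32_t)tv.tv_sec;    /* same casts as the diff  */
        mtime.t_nsec = (int32_t)tv.tv_nsec;
        atime = mtime;                        /* one sample stamps all   */
        ctim  = mtime;                        /* three timestamps        */

        printf("created %d.%09d\n", mtime.t_sec, mtime.t_nsec);
        return (atime.t_sec == ctim.t_sec) ? 0 : 1;
    }
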
@@ -1220,7 +1225,7 @@ xfs_ialloc(
1220 xfs_trans_log_inode(tp, ip, flags); 1225 xfs_trans_log_inode(tp, ip, flags);
1221 1226
1222 /* now that we have an i_mode we can setup inode ops and unlock */ 1227 /* now that we have an i_mode we can setup inode ops and unlock */
1223 xfs_initialize_vnode(tp->t_mountp, vp, ip); 1228 xfs_setup_inode(ip);
1224 1229
1225 *ipp = ip; 1230 *ipp = ip;
1226 return 0; 1231 return 0;
@@ -1399,7 +1404,6 @@ xfs_itruncate_start(
1399 xfs_fsize_t last_byte; 1404 xfs_fsize_t last_byte;
1400 xfs_off_t toss_start; 1405 xfs_off_t toss_start;
1401 xfs_mount_t *mp; 1406 xfs_mount_t *mp;
1402 bhv_vnode_t *vp;
1403 int error = 0; 1407 int error = 0;
1404 1408
1405 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); 1409 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
@@ -1408,7 +1412,6 @@ xfs_itruncate_start(
1408 (flags == XFS_ITRUNC_MAYBE)); 1412 (flags == XFS_ITRUNC_MAYBE));
1409 1413
1410 mp = ip->i_mount; 1414 mp = ip->i_mount;
1411 vp = XFS_ITOV(ip);
1412 1415
1413 /* wait for the completion of any pending DIOs */ 1416 /* wait for the completion of any pending DIOs */
1414 if (new_size < ip->i_size) 1417 if (new_size < ip->i_size)
@@ -1457,7 +1460,7 @@ xfs_itruncate_start(
1457 1460
1458#ifdef DEBUG 1461#ifdef DEBUG
1459 if (new_size == 0) { 1462 if (new_size == 0) {
1460 ASSERT(VN_CACHED(vp) == 0); 1463 ASSERT(VN_CACHED(VFS_I(ip)) == 0);
1461 } 1464 }
1462#endif 1465#endif
1463 return error; 1466 return error;
@@ -2630,7 +2633,6 @@ xfs_idestroy(
2630 xfs_idestroy_fork(ip, XFS_ATTR_FORK); 2633 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
2631 mrfree(&ip->i_lock); 2634 mrfree(&ip->i_lock);
2632 mrfree(&ip->i_iolock); 2635 mrfree(&ip->i_iolock);
2633 freesema(&ip->i_flock);
2634 2636
2635#ifdef XFS_INODE_TRACE 2637#ifdef XFS_INODE_TRACE
2636 ktrace_free(ip->i_trace); 2638 ktrace_free(ip->i_trace);
@@ -3048,10 +3050,10 @@ cluster_corrupt_out:
3048/* 3050/*
3049 * xfs_iflush() will write a modified inode's changes out to the 3051 * xfs_iflush() will write a modified inode's changes out to the
3050 * inode's on disk home. The caller must have the inode lock held 3052 * inode's on disk home. The caller must have the inode lock held
3051 * in at least shared mode and the inode flush semaphore must be 3053 * in at least shared mode and the inode flush completion must be
3052 * held as well. The inode lock will still be held upon return from 3054 * active as well. The inode lock will still be held upon return from
3053 * the call and the caller is free to unlock it. 3055 * the call and the caller is free to unlock it.
3054 * The inode flush lock will be unlocked when the inode reaches the disk. 3056 * The inode flush will be completed when the inode reaches the disk.
3055 * The flags indicate how the inode's buffer should be written out. 3057 * The flags indicate how the inode's buffer should be written out.
3056 */ 3058 */
3057int 3059int
@@ -3070,7 +3072,7 @@ xfs_iflush(
3070 XFS_STATS_INC(xs_iflush_count); 3072 XFS_STATS_INC(xs_iflush_count);
3071 3073
3072 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 3074 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
3073 ASSERT(issemalocked(&(ip->i_flock))); 3075 ASSERT(!completion_done(&ip->i_flush));
3074 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 3076 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
3075 ip->i_d.di_nextents > ip->i_df.if_ext_max); 3077 ip->i_d.di_nextents > ip->i_df.if_ext_max);
3076 3078
@@ -3233,7 +3235,7 @@ xfs_iflush_int(
3233#endif 3235#endif
3234 3236
3235 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 3237 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
3236 ASSERT(issemalocked(&(ip->i_flock))); 3238 ASSERT(!completion_done(&ip->i_flush));
3237 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 3239 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
3238 ip->i_d.di_nextents > ip->i_df.if_ext_max); 3240 ip->i_d.di_nextents > ip->i_df.if_ext_max);
3239 3241
@@ -3465,7 +3467,6 @@ xfs_iflush_all(
3465 xfs_mount_t *mp) 3467 xfs_mount_t *mp)
3466{ 3468{
3467 xfs_inode_t *ip; 3469 xfs_inode_t *ip;
3468 bhv_vnode_t *vp;
3469 3470
3470 again: 3471 again:
3471 XFS_MOUNT_ILOCK(mp); 3472 XFS_MOUNT_ILOCK(mp);
@@ -3480,14 +3481,13 @@ xfs_iflush_all(
3480 continue; 3481 continue;
3481 } 3482 }
3482 3483
3483 vp = XFS_ITOV_NULL(ip); 3484 if (!VFS_I(ip)) {
3484 if (!vp) {
3485 XFS_MOUNT_IUNLOCK(mp); 3485 XFS_MOUNT_IUNLOCK(mp);
3486 xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC); 3486 xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC);
3487 goto again; 3487 goto again;
3488 } 3488 }
3489 3489
3490 ASSERT(vn_count(vp) == 0); 3490 ASSERT(vn_count(VFS_I(ip)) == 0);
3491 3491
3492 ip = ip->i_mnext; 3492 ip = ip->i_mnext;
3493 } while (ip != mp->m_inodes); 3493 } while (ip != mp->m_inodes);
@@ -3707,7 +3707,7 @@ xfs_iext_add_indirect_multi(
3707 * (all extents past */ 3707 * (all extents past */
3708 if (nex2) { 3708 if (nex2) {
3709 byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); 3709 byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
3710 nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_SLEEP); 3710 nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
3711 memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff); 3711 memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
3712 erp->er_extcount -= nex2; 3712 erp->er_extcount -= nex2;
3713 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2); 3713 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
@@ -4007,8 +4007,7 @@ xfs_iext_realloc_direct(
4007 ifp->if_u1.if_extents = 4007 ifp->if_u1.if_extents =
4008 kmem_realloc(ifp->if_u1.if_extents, 4008 kmem_realloc(ifp->if_u1.if_extents,
4009 rnew_size, 4009 rnew_size,
4010 ifp->if_real_bytes, 4010 ifp->if_real_bytes, KM_NOFS);
4011 KM_SLEEP);
4012 } 4011 }
4013 if (rnew_size > ifp->if_real_bytes) { 4012 if (rnew_size > ifp->if_real_bytes) {
4014 memset(&ifp->if_u1.if_extents[ifp->if_bytes / 4013 memset(&ifp->if_u1.if_extents[ifp->if_bytes /
@@ -4067,7 +4066,7 @@ xfs_iext_inline_to_direct(
4067 xfs_ifork_t *ifp, /* inode fork pointer */ 4066 xfs_ifork_t *ifp, /* inode fork pointer */
4068 int new_size) /* number of extents in file */ 4067 int new_size) /* number of extents in file */
4069{ 4068{
4070 ifp->if_u1.if_extents = kmem_alloc(new_size, KM_SLEEP); 4069 ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
4071 memset(ifp->if_u1.if_extents, 0, new_size); 4070 memset(ifp->if_u1.if_extents, 0, new_size);
4072 if (ifp->if_bytes) { 4071 if (ifp->if_bytes) {
4073 memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext, 4072 memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
@@ -4099,7 +4098,7 @@ xfs_iext_realloc_indirect(
4099 } else { 4098 } else {
4100 ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *) 4099 ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
4101 kmem_realloc(ifp->if_u1.if_ext_irec, 4100 kmem_realloc(ifp->if_u1.if_ext_irec,
4102 new_size, size, KM_SLEEP); 4101 new_size, size, KM_NOFS);
4103 } 4102 }
4104} 4103}
4105 4104
@@ -4119,7 +4118,7 @@ xfs_iext_indirect_to_direct(
4119 ASSERT(nextents <= XFS_LINEAR_EXTS); 4118 ASSERT(nextents <= XFS_LINEAR_EXTS);
4120 size = nextents * sizeof(xfs_bmbt_rec_t); 4119 size = nextents * sizeof(xfs_bmbt_rec_t);
4121 4120
4122 xfs_iext_irec_compact_full(ifp); 4121 xfs_iext_irec_compact_pages(ifp);
4123 ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ); 4122 ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
4124 4123
4125 ep = ifp->if_u1.if_ext_irec->er_extbuf; 4124 ep = ifp->if_u1.if_ext_irec->er_extbuf;
@@ -4341,11 +4340,10 @@ xfs_iext_irec_init(
4341 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 4340 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
4342 ASSERT(nextents <= XFS_LINEAR_EXTS); 4341 ASSERT(nextents <= XFS_LINEAR_EXTS);
4343 4342
4344 erp = (xfs_ext_irec_t *) 4343 erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
4345 kmem_alloc(sizeof(xfs_ext_irec_t), KM_SLEEP);
4346 4344
4347 if (nextents == 0) { 4345 if (nextents == 0) {
4348 ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_SLEEP); 4346 ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
4349 } else if (!ifp->if_real_bytes) { 4347 } else if (!ifp->if_real_bytes) {
4350 xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ); 4348 xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
4351 } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) { 4349 } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
@@ -4393,7 +4391,7 @@ xfs_iext_irec_new(
4393 4391
4394 /* Initialize new extent record */ 4392 /* Initialize new extent record */
4395 erp = ifp->if_u1.if_ext_irec; 4393 erp = ifp->if_u1.if_ext_irec;
4396 erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_SLEEP); 4394 erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
4397 ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; 4395 ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
4398 memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ); 4396 memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
4399 erp[erp_idx].er_extcount = 0; 4397 erp[erp_idx].er_extcount = 0;
@@ -4451,8 +4449,7 @@ xfs_iext_irec_remove(
4451 * compaction policy is as follows: 4449 * compaction policy is as follows:
4452 * 4450 *
4453 * Full Compaction: Extents fit into a single page (or inline buffer) 4451 * Full Compaction: Extents fit into a single page (or inline buffer)
4454 * Full Compaction: Extents occupy less than 10% of allocated space 4452 * Partial Compaction: Extents occupy less than 50% of allocated space
4455 * Partial Compaction: Extents occupy > 10% and < 50% of allocated space
4456 * No Compaction: Extents occupy at least 50% of allocated space 4453 * No Compaction: Extents occupy at least 50% of allocated space
4457 */ 4454 */
4458void 4455void
@@ -4473,8 +4470,6 @@ xfs_iext_irec_compact(
4473 xfs_iext_direct_to_inline(ifp, nextents); 4470 xfs_iext_direct_to_inline(ifp, nextents);
4474 } else if (nextents <= XFS_LINEAR_EXTS) { 4471 } else if (nextents <= XFS_LINEAR_EXTS) {
4475 xfs_iext_indirect_to_direct(ifp); 4472 xfs_iext_indirect_to_direct(ifp);
4476 } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 3) {
4477 xfs_iext_irec_compact_full(ifp);
4478 } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) { 4473 } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
4479 xfs_iext_irec_compact_pages(ifp); 4474 xfs_iext_irec_compact_pages(ifp);
4480 } 4475 }
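
With the 10% tier removed, xfs_iext_irec_compact() has only three outcomes, and the decision reduces to the thresholds visible above. A small self-contained model of that policy (threshold arithmetic copied from the diff; the XFS_LINEAR_EXTS value is illustrative, the real one is derived from the page size):

    #include <stdio.h>

    #define XFS_LINEAR_EXTS 256   /* extents per indirection page (stand-in) */

    static const char *compact_policy(int nextents, int nlists)
    {
        if (nextents <= XFS_LINEAR_EXTS)
            return "full: collapse indirection to a single buffer";
        if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1)
            return "partial: merge adjacent pages (compact_pages)";
        return "none: at least 50% of allocated space in use";
    }

    int main(void)
    {
        printf("%4d extents / %d pages: %s\n", 200, 4, compact_policy(200, 4));
        printf("%4d extents / %d pages: %s\n", 400, 4, compact_policy(400, 4));
        printf("%4d extents / %d pages: %s\n", 600, 4, compact_policy(600, 4));
        return 0;
    }

The memmove to memcpy change in the page-merge path below is consistent with this: each er_extbuf is a separate allocation, so the source and destination of a merge can never overlap.
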
@@ -4498,7 +4493,7 @@ xfs_iext_irec_compact_pages(
4498 erp_next = erp + 1; 4493 erp_next = erp + 1;
4499 if (erp_next->er_extcount <= 4494 if (erp_next->er_extcount <=
4500 (XFS_LINEAR_EXTS - erp->er_extcount)) { 4495 (XFS_LINEAR_EXTS - erp->er_extcount)) {
4501 memmove(&erp->er_extbuf[erp->er_extcount], 4496 memcpy(&erp->er_extbuf[erp->er_extcount],
4502 erp_next->er_extbuf, erp_next->er_extcount * 4497 erp_next->er_extbuf, erp_next->er_extcount *
4503 sizeof(xfs_bmbt_rec_t)); 4498 sizeof(xfs_bmbt_rec_t));
4504 erp->er_extcount += erp_next->er_extcount; 4499 erp->er_extcount += erp_next->er_extcount;
@@ -4518,91 +4513,6 @@ xfs_iext_irec_compact_pages(
4518} 4513}
4519 4514
4520/* 4515/*
4521 * Fully compact the extent records managed by the indirection array.
4522 */
4523void
4524xfs_iext_irec_compact_full(
4525 xfs_ifork_t *ifp) /* inode fork pointer */
4526{
4527 xfs_bmbt_rec_host_t *ep, *ep_next; /* extent record pointers */
4528 xfs_ext_irec_t *erp, *erp_next; /* extent irec pointers */
4529 int erp_idx = 0; /* extent irec index */
4530 int ext_avail; /* empty entries in ex list */
4531 int ext_diff; /* number of exts to add */
4532 int nlists; /* number of irec's (ex lists) */
4533
4534 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
4535
4536 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4537 erp = ifp->if_u1.if_ext_irec;
4538 ep = &erp->er_extbuf[erp->er_extcount];
4539 erp_next = erp + 1;
4540 ep_next = erp_next->er_extbuf;
4541
4542 while (erp_idx < nlists - 1) {
4543 /*
4544 * Check how many extent records are available in this irec.
4545 * If there is none skip the whole exercise.
4546 */
4547 ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
4548 if (ext_avail) {
4549
4550 /*
4551 * Copy over as many as possible extent records into
4552 * the previous page.
4553 */
4554 ext_diff = MIN(ext_avail, erp_next->er_extcount);
4555 memcpy(ep, ep_next, ext_diff * sizeof(xfs_bmbt_rec_t));
4556 erp->er_extcount += ext_diff;
4557 erp_next->er_extcount -= ext_diff;
4558
4559 /*
4560 * If the next irec is empty now we can simply
4561 * remove it.
4562 */
4563 if (erp_next->er_extcount == 0) {
4564 /*
4565 * Free page before removing extent record
4566 * so er_extoffs don't get modified in
4567 * xfs_iext_irec_remove.
4568 */
4569 kmem_free(erp_next->er_extbuf);
4570 erp_next->er_extbuf = NULL;
4571 xfs_iext_irec_remove(ifp, erp_idx + 1);
4572 erp = &ifp->if_u1.if_ext_irec[erp_idx];
4573 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4574
4575 /*
4576 * If the next irec is not empty move up the content
4577 * that has not been copied to the previous page to
4578 * the beggining of this one.
4579 */
4580 } else {
4581 memmove(erp_next->er_extbuf, &ep_next[ext_diff],
4582 erp_next->er_extcount *
4583 sizeof(xfs_bmbt_rec_t));
4584 ep_next = erp_next->er_extbuf;
4585 memset(&ep_next[erp_next->er_extcount], 0,
4586 (XFS_LINEAR_EXTS -
4587 erp_next->er_extcount) *
4588 sizeof(xfs_bmbt_rec_t));
4589 }
4590 }
4591
4592 if (erp->er_extcount == XFS_LINEAR_EXTS) {
4593 erp_idx++;
4594 if (erp_idx < nlists)
4595 erp = &ifp->if_u1.if_ext_irec[erp_idx];
4596 else
4597 break;
4598 }
4599 ep = &erp->er_extbuf[erp->er_extcount];
4600 erp_next = erp + 1;
4601 ep_next = erp_next->er_extbuf;
4602 }
4603}
4604
4605/*
4606 * This is called to update the er_extoff field in the indirection 4516 * This is called to update the er_extoff field in the indirection
4607 * array when extents have been added or removed from one of the 4517 * array when extents have been added or removed from one of the
4608 * extent lists. erp_idx contains the irec index to begin updating 4518 * extent lists. erp_idx contains the irec index to begin updating
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 17a04b6321ed..1420c49674d7 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -87,8 +87,7 @@ typedef struct xfs_ifork {
87 * Flags for xfs_ichgtime(). 87 * Flags for xfs_ichgtime().
88 */ 88 */
89#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ 89#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */
90#define XFS_ICHGTIME_ACC 0x2 /* data fork access timestamp */ 90#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */
91#define XFS_ICHGTIME_CHG 0x4 /* inode field change timestamp */
92 91
93/* 92/*
94 * Per-fork incore inode flags. 93 * Per-fork incore inode flags.
@@ -204,7 +203,7 @@ typedef struct xfs_inode {
204 struct xfs_inode *i_mprev; /* ptr to prev inode */ 203 struct xfs_inode *i_mprev; /* ptr to prev inode */
205 struct xfs_mount *i_mount; /* fs mount struct ptr */ 204 struct xfs_mount *i_mount; /* fs mount struct ptr */
206 struct list_head i_reclaim; /* reclaim list */ 205 struct list_head i_reclaim; /* reclaim list */
207 bhv_vnode_t *i_vnode; /* vnode backpointer */ 206 struct inode *i_vnode; /* vnode backpointer */
208 struct xfs_dquot *i_udquot; /* user dquot */ 207 struct xfs_dquot *i_udquot; /* user dquot */
209 struct xfs_dquot *i_gdquot; /* group dquot */ 208 struct xfs_dquot *i_gdquot; /* group dquot */
210 209
@@ -223,7 +222,7 @@ typedef struct xfs_inode {
223 struct xfs_inode_log_item *i_itemp; /* logging information */ 222 struct xfs_inode_log_item *i_itemp; /* logging information */
224 mrlock_t i_lock; /* inode lock */ 223 mrlock_t i_lock; /* inode lock */
225 mrlock_t i_iolock; /* inode IO lock */ 224 mrlock_t i_iolock; /* inode IO lock */
226 sema_t i_flock; /* inode flush lock */ 225 struct completion i_flush; /* inode flush completion q */
227 atomic_t i_pincount; /* inode pin count */ 226 atomic_t i_pincount; /* inode pin count */
228 wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */ 227 wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */
229 spinlock_t i_flags_lock; /* inode i_flags lock */ 228 spinlock_t i_flags_lock; /* inode i_flags lock */
@@ -263,6 +262,18 @@ typedef struct xfs_inode {
263#define XFS_ISIZE(ip) (((ip)->i_d.di_mode & S_IFMT) == S_IFREG) ? \ 262#define XFS_ISIZE(ip) (((ip)->i_d.di_mode & S_IFMT) == S_IFREG) ? \
264 (ip)->i_size : (ip)->i_d.di_size; 263 (ip)->i_size : (ip)->i_d.di_size;
265 264
265/* Convert from vfs inode to xfs inode */
266static inline struct xfs_inode *XFS_I(struct inode *inode)
267{
268 return (struct xfs_inode *)inode->i_private;
269}
270
271/* convert from xfs inode to vfs inode */
272static inline struct inode *VFS_I(struct xfs_inode *ip)
273{
274 return (struct inode *)ip->i_vnode;
275}
276
266/* 277/*
267 * i_flags helper functions 278 * i_flags helper functions
268 */ 279 */
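
XFS_I()/VFS_I() replace the XFS_ITOV()/XFS_ITOV_NULL() macros removed below and rely on the two-way linking done in xfs_iget.c above (inode->i_private pointing at the xfs_inode, ip->i_vnode pointing back). A minimal standalone model of the round trip, with stub structs in place of the kernel types:

    #include <assert.h>

    struct inode     { void *i_private; };          /* stub VFS inode  */
    struct xfs_inode { struct inode *i_vnode; };    /* stub XFS inode  */

    static inline struct xfs_inode *XFS_I(struct inode *inode)
    {
        return (struct xfs_inode *)inode->i_private;
    }

    static inline struct inode *VFS_I(struct xfs_inode *ip)
    {
        return ip->i_vnode;
    }

    int main(void)
    {
        struct inode vfs_ino;
        struct xfs_inode xfs_ino;

        /* mirrors the linking done in xfs_iget.c */
        xfs_ino.i_vnode = &vfs_ino;
        vfs_ino.i_private = &xfs_ino;

        assert(XFS_I(&vfs_ino) == &xfs_ino);
        assert(VFS_I(&xfs_ino) == &vfs_ino);
        return 0;
    }
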
@@ -439,9 +450,6 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
439#define XFS_ITRUNC_DEFINITE 0x1 450#define XFS_ITRUNC_DEFINITE 0x1
440#define XFS_ITRUNC_MAYBE 0x2 451#define XFS_ITRUNC_MAYBE 0x2
441 452
442#define XFS_ITOV(ip) ((ip)->i_vnode)
443#define XFS_ITOV_NULL(ip) ((ip)->i_vnode)
444
445/* 453/*
446 * For multiple groups support: if S_ISGID bit is set in the parent 454 * For multiple groups support: if S_ISGID bit is set in the parent
447 * directory, group of new file is set to that of the parent, and 455 * directory, group of new file is set to that of the parent, and
@@ -473,11 +481,8 @@ int xfs_ilock_nowait(xfs_inode_t *, uint);
473void xfs_iunlock(xfs_inode_t *, uint); 481void xfs_iunlock(xfs_inode_t *, uint);
474void xfs_ilock_demote(xfs_inode_t *, uint); 482void xfs_ilock_demote(xfs_inode_t *, uint);
475int xfs_isilocked(xfs_inode_t *, uint); 483int xfs_isilocked(xfs_inode_t *, uint);
476void xfs_iflock(xfs_inode_t *);
477int xfs_iflock_nowait(xfs_inode_t *);
478uint xfs_ilock_map_shared(xfs_inode_t *); 484uint xfs_ilock_map_shared(xfs_inode_t *);
479void xfs_iunlock_map_shared(xfs_inode_t *, uint); 485void xfs_iunlock_map_shared(xfs_inode_t *, uint);
480void xfs_ifunlock(xfs_inode_t *);
481void xfs_ireclaim(xfs_inode_t *); 486void xfs_ireclaim(xfs_inode_t *);
482int xfs_finish_reclaim(xfs_inode_t *, int, int); 487int xfs_finish_reclaim(xfs_inode_t *, int, int);
483int xfs_finish_reclaim_all(struct xfs_mount *, int); 488int xfs_finish_reclaim_all(struct xfs_mount *, int);
@@ -522,6 +527,7 @@ void xfs_iflush_all(struct xfs_mount *);
522void xfs_ichgtime(xfs_inode_t *, int); 527void xfs_ichgtime(xfs_inode_t *, int);
523xfs_fsize_t xfs_file_last_byte(xfs_inode_t *); 528xfs_fsize_t xfs_file_last_byte(xfs_inode_t *);
524void xfs_lock_inodes(xfs_inode_t **, int, uint); 529void xfs_lock_inodes(xfs_inode_t **, int, uint);
530void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
525 531
526void xfs_synchronize_atime(xfs_inode_t *); 532void xfs_synchronize_atime(xfs_inode_t *);
527void xfs_mark_inode_dirty_sync(xfs_inode_t *); 533void xfs_mark_inode_dirty_sync(xfs_inode_t *);
@@ -570,6 +576,26 @@ extern struct kmem_zone *xfs_ifork_zone;
570extern struct kmem_zone *xfs_inode_zone; 576extern struct kmem_zone *xfs_inode_zone;
571extern struct kmem_zone *xfs_ili_zone; 577extern struct kmem_zone *xfs_ili_zone;
572 578
579/*
580 * Manage the i_flush queue embedded in the inode. This completion
581 * queue synchronizes processes attempting to flush the in-core
582 * inode back to disk.
583 */
584static inline void xfs_iflock(xfs_inode_t *ip)
585{
586 wait_for_completion(&ip->i_flush);
587}
588
589static inline int xfs_iflock_nowait(xfs_inode_t *ip)
590{
591 return try_wait_for_completion(&ip->i_flush);
592}
593
594static inline void xfs_ifunlock(xfs_inode_t *ip)
595{
596 complete(&ip->i_flush);
597}
598
573#endif /* __KERNEL__ */ 599#endif /* __KERNEL__ */
574 600
575#endif /* __XFS_INODE_H__ */ 601#endif /* __XFS_INODE_H__ */
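
The flush "lock" is now a completion: init_completion() starts it at zero (held), and the one-off complete() in xfs_iget.c raises the count to one so the first xfs_iflock() does not block. wait_for_completion() then consumes the count and complete() returns it. A userspace analogue of the same protocol, sketched with a POSIX semaphore rather than the kernel completion API (the xfs_* names are kept only for orientation):

    #include <assert.h>
    #include <semaphore.h>
    #include <stdio.h>

    /* Analogue of ip->i_flush: a count of 1 means "flush lock free". */
    static sem_t i_flush;

    static void iflock(void)        { sem_wait(&i_flush); }               /* xfs_iflock        */
    static int  iflock_nowait(void) { return sem_trywait(&i_flush) == 0; } /* xfs_iflock_nowait */
    static void ifunlock(void)      { sem_post(&i_flush); }               /* xfs_ifunlock      */

    int main(void)
    {
        /* init_completion() plus the one-off complete() ~ starting at 1 */
        sem_init(&i_flush, 0, 1);

        iflock();                  /* first flush attempt does not block */
        assert(!iflock_nowait()); /* flush in progress: try-lock fails  */
        ifunlock();                /* flush reached disk                 */
        assert(iflock_nowait());  /* lock is free again                 */
        ifunlock();

        puts("flush-lock protocol ok");
        return 0;
    }
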
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 0eee08a32c26..97c7452e2620 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -779,11 +779,10 @@ xfs_inode_item_pushbuf(
779 ASSERT(iip->ili_push_owner == current_pid()); 779 ASSERT(iip->ili_push_owner == current_pid());
780 780
781 /* 781 /*
782 * If flushlock isn't locked anymore, chances are that the 782 * If a flush is not in progress anymore, chances are that the
783 * inode flush completed and the inode was taken off the AIL. 783 * inode was taken off the AIL. So, just get out.
784 * So, just get out.
785 */ 784 */
786 if (!issemalocked(&(ip->i_flock)) || 785 if (completion_done(&ip->i_flush) ||
787 ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) { 786 ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
788 iip->ili_pushbuf_flag = 0; 787 iip->ili_pushbuf_flag = 0;
789 xfs_iunlock(ip, XFS_ILOCK_SHARED); 788 xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -805,7 +804,7 @@ xfs_inode_item_pushbuf(
805 * If not, we can flush it async. 804 * If not, we can flush it async.
806 */ 805 */
807 dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) && 806 dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) &&
808 issemalocked(&(ip->i_flock))); 807 !completion_done(&ip->i_flush));
809 iip->ili_pushbuf_flag = 0; 808 iip->ili_pushbuf_flag = 0;
810 xfs_iunlock(ip, XFS_ILOCK_SHARED); 809 xfs_iunlock(ip, XFS_ILOCK_SHARED);
811 xfs_buftrace("INODE ITEM PUSH", bp); 810 xfs_buftrace("INODE ITEM PUSH", bp);
@@ -858,7 +857,7 @@ xfs_inode_item_push(
858 ip = iip->ili_inode; 857 ip = iip->ili_inode;
859 858
860 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); 859 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
861 ASSERT(issemalocked(&(ip->i_flock))); 860 ASSERT(!completion_done(&ip->i_flush));
862 /* 861 /*
863 * Since we were able to lock the inode's flush lock and 862 * Since we were able to lock the inode's flush lock and
864 * we found it on the AIL, the inode must be dirty. This 863 * we found it on the AIL, the inode must be dirty. This
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 9a3ef9dcaeb9..cf6754a3c5b3 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -59,7 +59,6 @@ xfs_bulkstat_one_iget(
59{ 59{
60 xfs_icdinode_t *dic; /* dinode core info pointer */ 60 xfs_icdinode_t *dic; /* dinode core info pointer */
61 xfs_inode_t *ip; /* incore inode pointer */ 61 xfs_inode_t *ip; /* incore inode pointer */
62 bhv_vnode_t *vp;
63 int error; 62 int error;
64 63
65 error = xfs_iget(mp, NULL, ino, 64 error = xfs_iget(mp, NULL, ino,
@@ -72,7 +71,6 @@ xfs_bulkstat_one_iget(
72 ASSERT(ip != NULL); 71 ASSERT(ip != NULL);
73 ASSERT(ip->i_blkno != (xfs_daddr_t)0); 72 ASSERT(ip->i_blkno != (xfs_daddr_t)0);
74 73
75 vp = XFS_ITOV(ip);
76 dic = &ip->i_d; 74 dic = &ip->i_d;
77 75
78 /* xfs_iget returns the following without needing 76 /* xfs_iget returns the following without needing
@@ -85,7 +83,7 @@ xfs_bulkstat_one_iget(
85 buf->bs_uid = dic->di_uid; 83 buf->bs_uid = dic->di_uid;
86 buf->bs_gid = dic->di_gid; 84 buf->bs_gid = dic->di_gid;
87 buf->bs_size = dic->di_size; 85 buf->bs_size = dic->di_size;
88 vn_atime_to_bstime(vp, &buf->bs_atime); 86 vn_atime_to_bstime(VFS_I(ip), &buf->bs_atime);
89 buf->bs_mtime.tv_sec = dic->di_mtime.t_sec; 87 buf->bs_mtime.tv_sec = dic->di_mtime.t_sec;
90 buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec; 88 buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec;
91 buf->bs_ctime.tv_sec = dic->di_ctime.t_sec; 89 buf->bs_ctime.tv_sec = dic->di_ctime.t_sec;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 91b00a5686cd..0b02c6443551 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -124,16 +124,27 @@ STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
124STATIC int xlog_iclogs_empty(xlog_t *log); 124STATIC int xlog_iclogs_empty(xlog_t *log);
125 125
126#if defined(XFS_LOG_TRACE) 126#if defined(XFS_LOG_TRACE)
127
128#define XLOG_TRACE_LOGGRANT_SIZE 2048
129#define XLOG_TRACE_ICLOG_SIZE 256
130
131void
132xlog_trace_loggrant_alloc(xlog_t *log)
133{
134 log->l_grant_trace = ktrace_alloc(XLOG_TRACE_LOGGRANT_SIZE, KM_NOFS);
135}
136
137void
138xlog_trace_loggrant_dealloc(xlog_t *log)
139{
140 ktrace_free(log->l_grant_trace);
141}
142
127void 143void
128xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string) 144xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string)
129{ 145{
130 unsigned long cnts; 146 unsigned long cnts;
131 147
132 if (!log->l_grant_trace) {
133 log->l_grant_trace = ktrace_alloc(2048, KM_NOSLEEP);
134 if (!log->l_grant_trace)
135 return;
136 }
137 /* ticket counts are 1 byte each */ 148 /* ticket counts are 1 byte each */
138 cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8; 149 cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8;
139 150
@@ -157,10 +168,20 @@ xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string)
157} 168}
158 169
159void 170void
171xlog_trace_iclog_alloc(xlog_in_core_t *iclog)
172{
173 iclog->ic_trace = ktrace_alloc(XLOG_TRACE_ICLOG_SIZE, KM_NOFS);
174}
175
176void
177xlog_trace_iclog_dealloc(xlog_in_core_t *iclog)
178{
179 ktrace_free(iclog->ic_trace);
180}
181
182void
160xlog_trace_iclog(xlog_in_core_t *iclog, uint state) 183xlog_trace_iclog(xlog_in_core_t *iclog, uint state)
161{ 184{
162 if (!iclog->ic_trace)
163 iclog->ic_trace = ktrace_alloc(256, KM_SLEEP);
164 ktrace_enter(iclog->ic_trace, 185 ktrace_enter(iclog->ic_trace,
165 (void *)((unsigned long)state), 186 (void *)((unsigned long)state),
166 (void *)((unsigned long)current_pid()), 187 (void *)((unsigned long)current_pid()),
@@ -170,8 +191,15 @@ xlog_trace_iclog(xlog_in_core_t *iclog, uint state)
170 (void *)NULL, (void *)NULL); 191 (void *)NULL, (void *)NULL);
171} 192}
172#else 193#else
194
195#define xlog_trace_loggrant_alloc(log)
196#define xlog_trace_loggrant_dealloc(log)
173#define xlog_trace_loggrant(log,tic,string) 197#define xlog_trace_loggrant(log,tic,string)
198
199#define xlog_trace_iclog_alloc(iclog)
200#define xlog_trace_iclog_dealloc(iclog)
174#define xlog_trace_iclog(iclog,state) 201#define xlog_trace_iclog(iclog,state)
202
175#endif /* XFS_LOG_TRACE */ 203#endif /* XFS_LOG_TRACE */
176 204
177 205
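
Hoisting the ktrace_alloc() calls into the *_alloc() helpers above means the trace buffers are created once, at log/iclog setup time and with KM_NOFS, instead of lazily inside the trace points, where an allocation was either failure-prone (KM_NOSLEEP) or could recurse into the filesystem (KM_SLEEP). A stand-in model of the eager pattern (the ktrace struct here is a stub, not the kernel's):

    #include <stdio.h>
    #include <stdlib.h>

    struct ktrace { int kt_nentries; };        /* stub, not the kernel's */

    static struct ktrace *ktrace_alloc(int n)  /* ~ ktrace_alloc(n, KM_NOFS) */
    {
        struct ktrace *kt = calloc(1, sizeof(*kt));
        if (kt)
            kt->kt_nentries = n;
        return kt;
    }

    struct xlog { struct ktrace *l_grant_trace; };

    #define XLOG_TRACE_LOGGRANT_SIZE 2048

    static void xlog_trace_loggrant_alloc(struct xlog *log)
    {
        log->l_grant_trace = ktrace_alloc(XLOG_TRACE_LOGGRANT_SIZE);
    }

    static void xlog_trace_loggrant(struct xlog *log, const char *string)
    {
        /* No allocation at trace time any more; the buffer either
         * exists already or tracing is simply skipped. */
        if (log->l_grant_trace)
            printf("trace[%d]: %s\n", log->l_grant_trace->kt_nentries, string);
    }

    int main(void)
    {
        struct xlog log = { 0 };

        xlog_trace_loggrant_alloc(&log);   /* done once, in xlog_alloc_log() */
        xlog_trace_loggrant(&log, "xfs_log_done: (non-switched)");
        free(log.l_grant_trace);           /* ~ xlog_trace_loggrant_dealloc() */
        return 0;
    }
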
@@ -336,15 +364,12 @@ xfs_log_done(xfs_mount_t *mp,
336 } else { 364 } else {
337 xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)"); 365 xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)");
338 xlog_regrant_reserve_log_space(log, ticket); 366 xlog_regrant_reserve_log_space(log, ticket);
339 } 367 /* If this ticket was a permanent reservation and we aren't
340 368 * trying to release it, reset the inited flags; so next time
341 /* If this ticket was a permanent reservation and we aren't 369 * we write, a start record will be written out.
342 * trying to release it, reset the inited flags; so next time 370 */
343 * we write, a start record will be written out.
344 */
345 if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) &&
346 (flags & XFS_LOG_REL_PERM_RESERV) == 0)
347 ticket->t_flags |= XLOG_TIC_INITED; 371 ticket->t_flags |= XLOG_TIC_INITED;
372 }
348 373
349 return lsn; 374 return lsn;
350} /* xfs_log_done */ 375} /* xfs_log_done */
@@ -357,11 +382,11 @@ xfs_log_done(xfs_mount_t *mp,
357 * Asynchronous forces are implemented by setting the WANT_SYNC 382 * Asynchronous forces are implemented by setting the WANT_SYNC
358 * bit in the appropriate in-core log and then returning. 383 * bit in the appropriate in-core log and then returning.
359 * 384 *
360 * Synchronous forces are implemented with a semaphore. All callers 385 * Synchronous forces are implemented with a signal variable. All callers
 361 * to force a given lsn to disk will wait on a semaphore attached to the 386 * to force a given lsn to disk will wait on the sv attached to the
 362 * specific in-core log. When the given in-core log finally completes its 387 * specific in-core log. When the given in-core log finally completes its
363 * write to disk, that thread will wake up all threads waiting on the 388 * write to disk, that thread will wake up all threads waiting on the
364 * semaphore. 389 * sv.
365 */ 390 */
366int 391int
367_xfs_log_force( 392_xfs_log_force(
@@ -588,12 +613,12 @@ error:
588 * mp - ubiquitous xfs mount point structure 613 * mp - ubiquitous xfs mount point structure
589 */ 614 */
590int 615int
591xfs_log_mount_finish(xfs_mount_t *mp, int mfsi_flags) 616xfs_log_mount_finish(xfs_mount_t *mp)
592{ 617{
593 int error; 618 int error;
594 619
595 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) 620 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
596 error = xlog_recover_finish(mp->m_log, mfsi_flags); 621 error = xlog_recover_finish(mp->m_log);
597 else { 622 else {
598 error = 0; 623 error = 0;
599 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 624 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
@@ -707,7 +732,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
707 if (!(iclog->ic_state == XLOG_STATE_ACTIVE || 732 if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
708 iclog->ic_state == XLOG_STATE_DIRTY)) { 733 iclog->ic_state == XLOG_STATE_DIRTY)) {
709 if (!XLOG_FORCED_SHUTDOWN(log)) { 734 if (!XLOG_FORCED_SHUTDOWN(log)) {
710 sv_wait(&iclog->ic_forcesema, PMEM, 735 sv_wait(&iclog->ic_force_wait, PMEM,
711 &log->l_icloglock, s); 736 &log->l_icloglock, s);
712 } else { 737 } else {
713 spin_unlock(&log->l_icloglock); 738 spin_unlock(&log->l_icloglock);
@@ -748,7 +773,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
748 || iclog->ic_state == XLOG_STATE_DIRTY 773 || iclog->ic_state == XLOG_STATE_DIRTY
749 || iclog->ic_state == XLOG_STATE_IOERROR) ) { 774 || iclog->ic_state == XLOG_STATE_IOERROR) ) {
750 775
751 sv_wait(&iclog->ic_forcesema, PMEM, 776 sv_wait(&iclog->ic_force_wait, PMEM,
752 &log->l_icloglock, s); 777 &log->l_icloglock, s);
753 } else { 778 } else {
754 spin_unlock(&log->l_icloglock); 779 spin_unlock(&log->l_icloglock);
@@ -838,7 +863,7 @@ xfs_log_move_tail(xfs_mount_t *mp,
838 break; 863 break;
839 tail_lsn = 0; 864 tail_lsn = 0;
840 free_bytes -= tic->t_unit_res; 865 free_bytes -= tic->t_unit_res;
841 sv_signal(&tic->t_sema); 866 sv_signal(&tic->t_wait);
842 tic = tic->t_next; 867 tic = tic->t_next;
843 } while (tic != log->l_write_headq); 868 } while (tic != log->l_write_headq);
844 } 869 }
@@ -859,7 +884,7 @@ xfs_log_move_tail(xfs_mount_t *mp,
859 break; 884 break;
860 tail_lsn = 0; 885 tail_lsn = 0;
861 free_bytes -= need_bytes; 886 free_bytes -= need_bytes;
862 sv_signal(&tic->t_sema); 887 sv_signal(&tic->t_wait);
863 tic = tic->t_next; 888 tic = tic->t_next;
864 } while (tic != log->l_reserve_headq); 889 } while (tic != log->l_reserve_headq);
865 } 890 }
@@ -1008,11 +1033,12 @@ xlog_iodone(xfs_buf_t *bp)
1008 l = iclog->ic_log; 1033 l = iclog->ic_log;
1009 1034
1010 /* 1035 /*
1011 * If the ordered flag has been removed by a lower 1036 * If the _XFS_BARRIER_FAILED flag was set by a lower
1012 * layer, it means the underlyin device no longer supports 1037 * layer, it means the underlying device no longer supports
1013 * barrier I/O. Warn loudly and turn off barriers. 1038 * barrier I/O. Warn loudly and turn off barriers.
1014 */ 1039 */
1015 if ((l->l_mp->m_flags & XFS_MOUNT_BARRIER) && !XFS_BUF_ORDERED(bp)) { 1040 if (bp->b_flags & _XFS_BARRIER_FAILED) {
1041 bp->b_flags &= ~_XFS_BARRIER_FAILED;
1016 l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER; 1042 l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER;
1017 xfs_fs_cmn_err(CE_WARN, l->l_mp, 1043 xfs_fs_cmn_err(CE_WARN, l->l_mp,
1018 "xlog_iodone: Barriers are no longer supported" 1044 "xlog_iodone: Barriers are no longer supported"
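
Checking an explicit _XFS_BARRIER_FAILED flag replaces inferring failure from the ORDERED flag having been stripped, an indirect signal that could be ambiguous. A compilable model of the new handshake (flag values are stand-ins; the control flow follows the diff):

    #include <assert.h>
    #include <stdio.h>

    #define XBF_ORDERED          0x1u   /* stand-in values */
    #define _XFS_BARRIER_FAILED  0x2u
    #define XFS_MOUNT_BARRIER    0x4u

    struct xfs_buf { unsigned int b_flags; };

    /* Shape of the new xlog_iodone() check from the diff. */
    static void xlog_iodone(struct xfs_buf *bp, unsigned int *m_flags)
    {
        if (bp->b_flags & _XFS_BARRIER_FAILED) {
            bp->b_flags &= ~_XFS_BARRIER_FAILED;  /* consume the signal    */
            *m_flags &= ~XFS_MOUNT_BARRIER;       /* stop issuing barriers */
            fprintf(stderr, "barriers no longer supported, turning off\n");
        }
    }

    int main(void)
    {
        unsigned int m_flags = XFS_MOUNT_BARRIER;
        struct xfs_buf bp = { .b_flags = XBF_ORDERED | _XFS_BARRIER_FAILED };

        xlog_iodone(&bp, &m_flags);
        assert(!(m_flags & XFS_MOUNT_BARRIER));      /* barriers disabled */
        assert(!(bp.b_flags & _XFS_BARRIER_FAILED)); /* flag consumed     */
        return 0;
    }
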
@@ -1234,6 +1260,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1234 spin_lock_init(&log->l_grant_lock); 1260 spin_lock_init(&log->l_grant_lock);
1235 sv_init(&log->l_flush_wait, 0, "flush_wait"); 1261 sv_init(&log->l_flush_wait, 0, "flush_wait");
1236 1262
1263 xlog_trace_loggrant_alloc(log);
1237 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ 1264 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
1238 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); 1265 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
1239 1266
@@ -1285,8 +1312,10 @@ xlog_alloc_log(xfs_mount_t *mp,
1285 1312
1286 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); 1313 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
1287 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); 1314 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
1288 sv_init(&iclog->ic_forcesema, SV_DEFAULT, "iclog-force"); 1315 sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force");
1289 sv_init(&iclog->ic_writesema, SV_DEFAULT, "iclog-write"); 1316 sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write");
1317
1318 xlog_trace_iclog_alloc(iclog);
1290 1319
1291 iclogp = &iclog->ic_next; 1320 iclogp = &iclog->ic_next;
1292 } 1321 }
@@ -1565,14 +1594,10 @@ xlog_dealloc_log(xlog_t *log)
1565 1594
1566 iclog = log->l_iclog; 1595 iclog = log->l_iclog;
1567 for (i=0; i<log->l_iclog_bufs; i++) { 1596 for (i=0; i<log->l_iclog_bufs; i++) {
1568 sv_destroy(&iclog->ic_forcesema); 1597 sv_destroy(&iclog->ic_force_wait);
1569 sv_destroy(&iclog->ic_writesema); 1598 sv_destroy(&iclog->ic_write_wait);
1570 xfs_buf_free(iclog->ic_bp); 1599 xfs_buf_free(iclog->ic_bp);
1571#ifdef XFS_LOG_TRACE 1600 xlog_trace_iclog_dealloc(iclog);
1572 if (iclog->ic_trace != NULL) {
1573 ktrace_free(iclog->ic_trace);
1574 }
1575#endif
1576 next_iclog = iclog->ic_next; 1601 next_iclog = iclog->ic_next;
1577 kmem_free(iclog); 1602 kmem_free(iclog);
1578 iclog = next_iclog; 1603 iclog = next_iclog;
@@ -1581,14 +1606,7 @@ xlog_dealloc_log(xlog_t *log)
1581 spinlock_destroy(&log->l_grant_lock); 1606 spinlock_destroy(&log->l_grant_lock);
1582 1607
1583 xfs_buf_free(log->l_xbuf); 1608 xfs_buf_free(log->l_xbuf);
1584#ifdef XFS_LOG_TRACE 1609 xlog_trace_loggrant_dealloc(log);
1585 if (log->l_trace != NULL) {
1586 ktrace_free(log->l_trace);
1587 }
1588 if (log->l_grant_trace != NULL) {
1589 ktrace_free(log->l_grant_trace);
1590 }
1591#endif
1592 log->l_mp->m_log = NULL; 1610 log->l_mp->m_log = NULL;
1593 kmem_free(log); 1611 kmem_free(log);
1594} /* xlog_dealloc_log */ 1612} /* xlog_dealloc_log */
@@ -1976,7 +1994,7 @@ xlog_write(xfs_mount_t * mp,
1976/* Clean iclogs starting from the head. This ordering must be 1994/* Clean iclogs starting from the head. This ordering must be
1977 * maintained, so an iclog doesn't become ACTIVE beyond one that 1995 * maintained, so an iclog doesn't become ACTIVE beyond one that
1978 * is SYNCING. This is also required to maintain the notion that we use 1996 * is SYNCING. This is also required to maintain the notion that we use
 1979 * a counting semaphore to hold off would be writers to the log when every 1997 * an ordered wait queue to hold off would-be writers to the log when every
1980 * iclog is trying to sync to disk. 1998 * iclog is trying to sync to disk.
1981 * 1999 *
1982 * State Change: DIRTY -> ACTIVE 2000 * State Change: DIRTY -> ACTIVE
@@ -2240,7 +2258,7 @@ xlog_state_do_callback(
2240 xlog_state_clean_log(log); 2258 xlog_state_clean_log(log);
2241 2259
2242 /* wake up threads waiting in xfs_log_force() */ 2260 /* wake up threads waiting in xfs_log_force() */
2243 sv_broadcast(&iclog->ic_forcesema); 2261 sv_broadcast(&iclog->ic_force_wait);
2244 2262
2245 iclog = iclog->ic_next; 2263 iclog = iclog->ic_next;
2246 } while (first_iclog != iclog); 2264 } while (first_iclog != iclog);
@@ -2302,8 +2320,7 @@ xlog_state_do_callback(
2302 * the second completion goes through. 2320 * the second completion goes through.
2303 * 2321 *
2304 * Callbacks could take time, so they are done outside the scope of the 2322 * Callbacks could take time, so they are done outside the scope of the
2305 * global state machine log lock. Assume that the calls to cvsema won't 2323 * global state machine log lock.
2306 * take a long time. At least we know it won't sleep.
2307 */ 2324 */
2308STATIC void 2325STATIC void
2309xlog_state_done_syncing( 2326xlog_state_done_syncing(
@@ -2339,7 +2356,7 @@ xlog_state_done_syncing(
2339 * iclog buffer, we wake them all, one will get to do the 2356 * iclog buffer, we wake them all, one will get to do the
2340 * I/O, the others get to wait for the result. 2357 * I/O, the others get to wait for the result.
2341 */ 2358 */
2342 sv_broadcast(&iclog->ic_writesema); 2359 sv_broadcast(&iclog->ic_write_wait);
2343 spin_unlock(&log->l_icloglock); 2360 spin_unlock(&log->l_icloglock);
2344 xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ 2361 xlog_state_do_callback(log, aborted, iclog); /* also cleans log */
2345} /* xlog_state_done_syncing */ 2362} /* xlog_state_done_syncing */
@@ -2347,11 +2364,9 @@ xlog_state_done_syncing(
2347 2364
2348/* 2365/*
2349 * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must 2366 * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must
2350 * sleep. The flush semaphore is set to the number of in-core buffers and 2367 * sleep. We wait on the flush queue on the head iclog as that should be
2351 * decremented around disk syncing. Therefore, if all buffers are syncing, 2368 * the first iclog to complete flushing. Hence if all iclogs are syncing,
2352 * this semaphore will cause new writes to sleep until a sync completes. 2369 * we will wait here and all new writes will sleep until a sync completes.
2353 * Otherwise, this code just does p() followed by v(). This approximates
2354 * a sleep/wakeup except we can't race.
2355 * 2370 *
2356 * The in-core logs are used in a circular fashion. They are not used 2371 * The in-core logs are used in a circular fashion. They are not used
2357 * out-of-order even when an iclog past the head is free. 2372 * out-of-order even when an iclog past the head is free.
@@ -2508,7 +2523,7 @@ xlog_grant_log_space(xlog_t *log,
2508 goto error_return; 2523 goto error_return;
2509 2524
2510 XFS_STATS_INC(xs_sleep_logspace); 2525 XFS_STATS_INC(xs_sleep_logspace);
2511 sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s); 2526 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
2512 /* 2527 /*
2513 * If we got an error, and the filesystem is shutting down, 2528 * If we got an error, and the filesystem is shutting down,
2514 * we'll catch it down below. So just continue... 2529 * we'll catch it down below. So just continue...
@@ -2534,7 +2549,7 @@ redo:
2534 xlog_trace_loggrant(log, tic, 2549 xlog_trace_loggrant(log, tic,
2535 "xlog_grant_log_space: sleep 2"); 2550 "xlog_grant_log_space: sleep 2");
2536 XFS_STATS_INC(xs_sleep_logspace); 2551 XFS_STATS_INC(xs_sleep_logspace);
2537 sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s); 2552 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
2538 2553
2539 if (XLOG_FORCED_SHUTDOWN(log)) { 2554 if (XLOG_FORCED_SHUTDOWN(log)) {
2540 spin_lock(&log->l_grant_lock); 2555 spin_lock(&log->l_grant_lock);
@@ -2633,7 +2648,7 @@ xlog_regrant_write_log_space(xlog_t *log,
2633 if (free_bytes < ntic->t_unit_res) 2648 if (free_bytes < ntic->t_unit_res)
2634 break; 2649 break;
2635 free_bytes -= ntic->t_unit_res; 2650 free_bytes -= ntic->t_unit_res;
2636 sv_signal(&ntic->t_sema); 2651 sv_signal(&ntic->t_wait);
2637 ntic = ntic->t_next; 2652 ntic = ntic->t_next;
2638 } while (ntic != log->l_write_headq); 2653 } while (ntic != log->l_write_headq);
2639 2654
@@ -2644,7 +2659,7 @@ xlog_regrant_write_log_space(xlog_t *log,
2644 xlog_trace_loggrant(log, tic, 2659 xlog_trace_loggrant(log, tic,
2645 "xlog_regrant_write_log_space: sleep 1"); 2660 "xlog_regrant_write_log_space: sleep 1");
2646 XFS_STATS_INC(xs_sleep_logspace); 2661 XFS_STATS_INC(xs_sleep_logspace);
2647 sv_wait(&tic->t_sema, PINOD|PLTWAIT, 2662 sv_wait(&tic->t_wait, PINOD|PLTWAIT,
2648 &log->l_grant_lock, s); 2663 &log->l_grant_lock, s);
2649 2664
2650 /* If we're shutting down, this tic is already 2665 /* If we're shutting down, this tic is already
@@ -2673,7 +2688,7 @@ redo:
2673 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) 2688 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
2674 xlog_ins_ticketq(&log->l_write_headq, tic); 2689 xlog_ins_ticketq(&log->l_write_headq, tic);
2675 XFS_STATS_INC(xs_sleep_logspace); 2690 XFS_STATS_INC(xs_sleep_logspace);
2676 sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s); 2691 sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
2677 2692
2678 /* If we're shutting down, this tic is already off the queue */ 2693 /* If we're shutting down, this tic is already off the queue */
2679 if (XLOG_FORCED_SHUTDOWN(log)) { 2694 if (XLOG_FORCED_SHUTDOWN(log)) {
@@ -2916,7 +2931,7 @@ xlog_state_switch_iclogs(xlog_t *log,
 2916 * 2. the current iclog is dirty, and the previous iclog is in the 2931 * 2. the current iclog is dirty, and the previous iclog is in the
2917 * active or dirty state. 2932 * active or dirty state.
2918 * 2933 *
2919 * We may sleep (call psema) if: 2934 * We may sleep if:
2920 * 2935 *
2921 * 1. the current iclog is not in the active nor dirty state. 2936 * 1. the current iclog is not in the active nor dirty state.
 2922 * 2. the current iclog is dirty, and the previous iclog is not in the 2937 * 2. the current iclog is dirty, and the previous iclog is not in the
@@ -3013,7 +3028,7 @@ maybe_sleep:
3013 return XFS_ERROR(EIO); 3028 return XFS_ERROR(EIO);
3014 } 3029 }
3015 XFS_STATS_INC(xs_log_force_sleep); 3030 XFS_STATS_INC(xs_log_force_sleep);
3016 sv_wait(&iclog->ic_forcesema, PINOD, &log->l_icloglock, s); 3031 sv_wait(&iclog->ic_force_wait, PINOD, &log->l_icloglock, s);
3017 /* 3032 /*
3018 * No need to grab the log lock here since we're 3033 * No need to grab the log lock here since we're
3019 * only deciding whether or not to return EIO 3034 * only deciding whether or not to return EIO
@@ -3096,7 +3111,7 @@ try_again:
3096 XLOG_STATE_SYNCING))) { 3111 XLOG_STATE_SYNCING))) {
3097 ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); 3112 ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
3098 XFS_STATS_INC(xs_log_force_sleep); 3113 XFS_STATS_INC(xs_log_force_sleep);
3099 sv_wait(&iclog->ic_prev->ic_writesema, PSWP, 3114 sv_wait(&iclog->ic_prev->ic_write_wait, PSWP,
3100 &log->l_icloglock, s); 3115 &log->l_icloglock, s);
3101 *log_flushed = 1; 3116 *log_flushed = 1;
3102 already_slept = 1; 3117 already_slept = 1;
@@ -3116,7 +3131,7 @@ try_again:
3116 !(iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) { 3131 !(iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) {
3117 3132
3118 /* 3133 /*
3119 * Don't wait on the forcesema if we know that we've 3134 * Don't wait on completion if we know that we've
3120 * gotten a log write error. 3135 * gotten a log write error.
3121 */ 3136 */
3122 if (iclog->ic_state & XLOG_STATE_IOERROR) { 3137 if (iclog->ic_state & XLOG_STATE_IOERROR) {
@@ -3124,7 +3139,7 @@ try_again:
3124 return XFS_ERROR(EIO); 3139 return XFS_ERROR(EIO);
3125 } 3140 }
3126 XFS_STATS_INC(xs_log_force_sleep); 3141 XFS_STATS_INC(xs_log_force_sleep);
3127 sv_wait(&iclog->ic_forcesema, PSWP, &log->l_icloglock, s); 3142 sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s);
3128 /* 3143 /*
3129 * No need to grab the log lock here since we're 3144 * No need to grab the log lock here since we're
3130 * only deciding whether or not to return EIO 3145 * only deciding whether or not to return EIO
@@ -3180,7 +3195,7 @@ STATIC void
3180xlog_ticket_put(xlog_t *log, 3195xlog_ticket_put(xlog_t *log,
3181 xlog_ticket_t *ticket) 3196 xlog_ticket_t *ticket)
3182{ 3197{
3183 sv_destroy(&ticket->t_sema); 3198 sv_destroy(&ticket->t_wait);
3184 kmem_zone_free(xfs_log_ticket_zone, ticket); 3199 kmem_zone_free(xfs_log_ticket_zone, ticket);
3185} /* xlog_ticket_put */ 3200} /* xlog_ticket_put */
3186 3201
@@ -3270,7 +3285,7 @@ xlog_ticket_get(xlog_t *log,
3270 tic->t_trans_type = 0; 3285 tic->t_trans_type = 0;
3271 if (xflags & XFS_LOG_PERM_RESERV) 3286 if (xflags & XFS_LOG_PERM_RESERV)
3272 tic->t_flags |= XLOG_TIC_PERM_RESERV; 3287 tic->t_flags |= XLOG_TIC_PERM_RESERV;
3273 sv_init(&(tic->t_sema), SV_DEFAULT, "logtick"); 3288 sv_init(&(tic->t_wait), SV_DEFAULT, "logtick");
3274 3289
3275 xlog_tic_reset_res(tic); 3290 xlog_tic_reset_res(tic);
3276 3291
@@ -3557,14 +3572,14 @@ xfs_log_force_umount(
3557 */ 3572 */
3558 if ((tic = log->l_reserve_headq)) { 3573 if ((tic = log->l_reserve_headq)) {
3559 do { 3574 do {
3560 sv_signal(&tic->t_sema); 3575 sv_signal(&tic->t_wait);
3561 tic = tic->t_next; 3576 tic = tic->t_next;
3562 } while (tic != log->l_reserve_headq); 3577 } while (tic != log->l_reserve_headq);
3563 } 3578 }
3564 3579
3565 if ((tic = log->l_write_headq)) { 3580 if ((tic = log->l_write_headq)) {
3566 do { 3581 do {
3567 sv_signal(&tic->t_sema); 3582 sv_signal(&tic->t_wait);
3568 tic = tic->t_next; 3583 tic = tic->t_next;
3569 } while (tic != log->l_write_headq); 3584 } while (tic != log->l_write_headq);
3570 } 3585 }
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index d1d678ecb63e..d47b91f10822 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -149,7 +149,7 @@ int xfs_log_mount(struct xfs_mount *mp,
149 struct xfs_buftarg *log_target, 149 struct xfs_buftarg *log_target,
150 xfs_daddr_t start_block, 150 xfs_daddr_t start_block,
151 int num_bblocks); 151 int num_bblocks);
152int xfs_log_mount_finish(struct xfs_mount *mp, int); 152int xfs_log_mount_finish(struct xfs_mount *mp);
153void xfs_log_move_tail(struct xfs_mount *mp, 153void xfs_log_move_tail(struct xfs_mount *mp,
154 xfs_lsn_t tail_lsn); 154 xfs_lsn_t tail_lsn);
155int xfs_log_notify(struct xfs_mount *mp, 155int xfs_log_notify(struct xfs_mount *mp,
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 6245913196b4..e7d8f84443fa 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -241,7 +241,7 @@ typedef struct xlog_res {
241} xlog_res_t; 241} xlog_res_t;
242 242
243typedef struct xlog_ticket { 243typedef struct xlog_ticket {
244 sv_t t_sema; /* sleep on this semaphore : 20 */ 244 sv_t t_wait; /* ticket wait queue : 20 */
245 struct xlog_ticket *t_next; /* :4|8 */ 245 struct xlog_ticket *t_next; /* :4|8 */
246 struct xlog_ticket *t_prev; /* :4|8 */ 246 struct xlog_ticket *t_prev; /* :4|8 */
247 xlog_tid_t t_tid; /* transaction identifier : 4 */ 247 xlog_tid_t t_tid; /* transaction identifier : 4 */
@@ -314,7 +314,7 @@ typedef struct xlog_rec_ext_header {
314 * xlog_rec_header_t into the reserved space. 314 * xlog_rec_header_t into the reserved space.
315 * - ic_data follows, so a write to disk can start at the beginning of 315 * - ic_data follows, so a write to disk can start at the beginning of
316 * the iclog. 316 * the iclog.
 317 * - ic_forcesema is used to implement synchronous forcing of the iclog to disk. 317 * - ic_force_wait is used to implement synchronous forcing of the iclog to disk.
318 * - ic_next is the pointer to the next iclog in the ring. 318 * - ic_next is the pointer to the next iclog in the ring.
319 * - ic_bp is a pointer to the buffer used to write this incore log to disk. 319 * - ic_bp is a pointer to the buffer used to write this incore log to disk.
320 * - ic_log is a pointer back to the global log structure. 320 * - ic_log is a pointer back to the global log structure.
@@ -339,8 +339,8 @@ typedef struct xlog_rec_ext_header {
339 * and move everything else out to subsequent cachelines. 339 * and move everything else out to subsequent cachelines.
340 */ 340 */
341typedef struct xlog_iclog_fields { 341typedef struct xlog_iclog_fields {
342 sv_t ic_forcesema; 342 sv_t ic_force_wait;
343 sv_t ic_writesema; 343 sv_t ic_write_wait;
344 struct xlog_in_core *ic_next; 344 struct xlog_in_core *ic_next;
345 struct xlog_in_core *ic_prev; 345 struct xlog_in_core *ic_prev;
346 struct xfs_buf *ic_bp; 346 struct xfs_buf *ic_bp;
@@ -377,8 +377,8 @@ typedef struct xlog_in_core {
377/* 377/*
378 * Defines to save our code from this glop. 378 * Defines to save our code from this glop.
379 */ 379 */
380#define ic_forcesema hic_fields.ic_forcesema 380#define ic_force_wait hic_fields.ic_force_wait
381#define ic_writesema hic_fields.ic_writesema 381#define ic_write_wait hic_fields.ic_write_wait
382#define ic_next hic_fields.ic_next 382#define ic_next hic_fields.ic_next
383#define ic_prev hic_fields.ic_prev 383#define ic_prev hic_fields.ic_prev
384#define ic_bp hic_fields.ic_bp 384#define ic_bp hic_fields.ic_bp
@@ -448,7 +448,6 @@ typedef struct log {
448 int l_grant_write_bytes; 448 int l_grant_write_bytes;
449 449
450#ifdef XFS_LOG_TRACE 450#ifdef XFS_LOG_TRACE
451 struct ktrace *l_trace;
452 struct ktrace *l_grant_trace; 451 struct ktrace *l_grant_trace;
453#endif 452#endif
454 453
@@ -468,7 +467,7 @@ extern int xlog_find_tail(xlog_t *log,
468 xfs_daddr_t *head_blk, 467 xfs_daddr_t *head_blk,
469 xfs_daddr_t *tail_blk); 468 xfs_daddr_t *tail_blk);
470extern int xlog_recover(xlog_t *log); 469extern int xlog_recover(xlog_t *log);
471extern int xlog_recover_finish(xlog_t *log, int mfsi_flags); 470extern int xlog_recover_finish(xlog_t *log);
472extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); 471extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
473extern void xlog_recover_process_iunlinks(xlog_t *log); 472extern void xlog_recover_process_iunlinks(xlog_t *log);
474 473
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 9eb722ec744e..82d46ce69d5f 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3940,8 +3940,7 @@ xlog_recover(
3940 */ 3940 */
3941int 3941int
3942xlog_recover_finish( 3942xlog_recover_finish(
3943 xlog_t *log, 3943 xlog_t *log)
3944 int mfsi_flags)
3945{ 3944{
3946 /* 3945 /*
3947 * Now we're ready to do the transactions needed for the 3946 * Now we're ready to do the transactions needed for the
@@ -3969,9 +3968,7 @@ xlog_recover_finish(
3969 xfs_log_force(log->l_mp, (xfs_lsn_t)0, 3968 xfs_log_force(log->l_mp, (xfs_lsn_t)0,
3970 (XFS_LOG_FORCE | XFS_LOG_SYNC)); 3969 (XFS_LOG_FORCE | XFS_LOG_SYNC));
3971 3970
3972 if ( (mfsi_flags & XFS_MFSI_NOUNLINK) == 0 ) { 3971 xlog_recover_process_iunlinks(log);
3973 xlog_recover_process_iunlinks(log);
3974 }
3975 3972
3976 xlog_recover_check_summary(log); 3973 xlog_recover_check_summary(log);
3977 3974
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 6c5d1325e7f6..a4503f5e9497 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -128,7 +128,7 @@ static const struct {
128 * initialized. 128 * initialized.
129 */ 129 */
130STATIC void 130STATIC void
131xfs_mount_free( 131xfs_free_perag(
132 xfs_mount_t *mp) 132 xfs_mount_t *mp)
133{ 133{
134 if (mp->m_perag) { 134 if (mp->m_perag) {
@@ -139,20 +139,6 @@ xfs_mount_free(
139 kmem_free(mp->m_perag[agno].pagb_list); 139 kmem_free(mp->m_perag[agno].pagb_list);
140 kmem_free(mp->m_perag); 140 kmem_free(mp->m_perag);
141 } 141 }
142
143 spinlock_destroy(&mp->m_ail_lock);
144 spinlock_destroy(&mp->m_sb_lock);
145 mutex_destroy(&mp->m_ilock);
146 mutex_destroy(&mp->m_growlock);
147 if (mp->m_quotainfo)
148 XFS_QM_DONE(mp);
149
150 if (mp->m_fsname != NULL)
151 kmem_free(mp->m_fsname);
152 if (mp->m_rtname != NULL)
153 kmem_free(mp->m_rtname);
154 if (mp->m_logname != NULL)
155 kmem_free(mp->m_logname);
156} 142}
157 143
158/* 144/*
@@ -704,11 +690,11 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
704 * Update alignment values based on mount options and sb values 690 * Update alignment values based on mount options and sb values
705 */ 691 */
706STATIC int 692STATIC int
707xfs_update_alignment(xfs_mount_t *mp, int mfsi_flags, __uint64_t *update_flags) 693xfs_update_alignment(xfs_mount_t *mp, __uint64_t *update_flags)
708{ 694{
709 xfs_sb_t *sbp = &(mp->m_sb); 695 xfs_sb_t *sbp = &(mp->m_sb);
710 696
711 if (mp->m_dalign && !(mfsi_flags & XFS_MFSI_SECOND)) { 697 if (mp->m_dalign) {
712 /* 698 /*
713 * If stripe unit and stripe width are not multiples 699 * If stripe unit and stripe width are not multiples
714 * of the fs blocksize turn off alignment. 700 * of the fs blocksize turn off alignment.
@@ -864,7 +850,7 @@ xfs_set_inoalignment(xfs_mount_t *mp)
864 * Check that the data (and log if separate) are an ok size. 850 * Check that the data (and log if separate) are an ok size.
865 */ 851 */
866STATIC int 852STATIC int
867xfs_check_sizes(xfs_mount_t *mp, int mfsi_flags) 853xfs_check_sizes(xfs_mount_t *mp)
868{ 854{
869 xfs_buf_t *bp; 855 xfs_buf_t *bp;
870 xfs_daddr_t d; 856 xfs_daddr_t d;
@@ -887,8 +873,7 @@ xfs_check_sizes(xfs_mount_t *mp, int mfsi_flags)
887 return error; 873 return error;
888 } 874 }
889 875
890 if (((mfsi_flags & XFS_MFSI_CLIENT) == 0) && 876 if (mp->m_logdev_targp != mp->m_ddev_targp) {
891 mp->m_logdev_targp != mp->m_ddev_targp) {
892 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); 877 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
893 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { 878 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
894 cmn_err(CE_WARN, "XFS: size check 3 failed"); 879 cmn_err(CE_WARN, "XFS: size check 3 failed");
@@ -923,15 +908,13 @@ xfs_check_sizes(xfs_mount_t *mp, int mfsi_flags)
923 */ 908 */
924int 909int
925xfs_mountfs( 910xfs_mountfs(
926 xfs_mount_t *mp, 911 xfs_mount_t *mp)
927 int mfsi_flags)
928{ 912{
929 xfs_sb_t *sbp = &(mp->m_sb); 913 xfs_sb_t *sbp = &(mp->m_sb);
930 xfs_inode_t *rip; 914 xfs_inode_t *rip;
931 __uint64_t resblks; 915 __uint64_t resblks;
932 __int64_t update_flags = 0LL; 916 __int64_t update_flags = 0LL;
933 uint quotamount, quotaflags; 917 uint quotamount, quotaflags;
934 int agno;
935 int uuid_mounted = 0; 918 int uuid_mounted = 0;
936 int error = 0; 919 int error = 0;
937 920
@@ -985,7 +968,7 @@ xfs_mountfs(
985 * allocator alignment is within an ag, therefore ag has 968 * allocator alignment is within an ag, therefore ag has
986 * to be aligned at stripe boundary. 969 * to be aligned at stripe boundary.
987 */ 970 */
988 error = xfs_update_alignment(mp, mfsi_flags, &update_flags); 971 error = xfs_update_alignment(mp, &update_flags);
989 if (error) 972 if (error)
990 goto error1; 973 goto error1;
991 974
@@ -1004,8 +987,7 @@ xfs_mountfs(
1004 * since a single partition filesystem is identical to a single 987 * since a single partition filesystem is identical to a single
1005 * partition volume/filesystem. 988 * partition volume/filesystem.
1006 */ 989 */
1007 if ((mfsi_flags & XFS_MFSI_SECOND) == 0 && 990 if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) {
1008 (mp->m_flags & XFS_MOUNT_NOUUID) == 0) {
1009 if (xfs_uuid_mount(mp)) { 991 if (xfs_uuid_mount(mp)) {
1010 error = XFS_ERROR(EINVAL); 992 error = XFS_ERROR(EINVAL);
1011 goto error1; 993 goto error1;
@@ -1033,7 +1015,7 @@ xfs_mountfs(
1033 /* 1015 /*
1034 * Check that the data (and log if separate) are an ok size. 1016 * Check that the data (and log if separate) are an ok size.
1035 */ 1017 */
1036 error = xfs_check_sizes(mp, mfsi_flags); 1018 error = xfs_check_sizes(mp);
1037 if (error) 1019 if (error)
1038 goto error1; 1020 goto error1;
1039 1021
@@ -1047,13 +1029,6 @@ xfs_mountfs(
1047 } 1029 }
1048 1030
1049 /* 1031 /*
1050 * For client case we are done now
1051 */
1052 if (mfsi_flags & XFS_MFSI_CLIENT) {
1053 return 0;
1054 }
1055
1056 /*
1057 * Copies the low order bits of the timestamp and the randomly 1032 * Copies the low order bits of the timestamp and the randomly
1058 * set "sequence" number out of a UUID. 1033 * set "sequence" number out of a UUID.
1059 */ 1034 */
@@ -1077,8 +1052,10 @@ xfs_mountfs(
1077 * Allocate and initialize the per-ag data. 1052 * Allocate and initialize the per-ag data.
1078 */ 1053 */
1079 init_rwsem(&mp->m_peraglock); 1054 init_rwsem(&mp->m_peraglock);
1080 mp->m_perag = 1055 mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t),
1081 kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), KM_SLEEP); 1056 KM_MAYFAIL);
1057 if (!mp->m_perag)
1058 goto error1;
1082 1059
1083 mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount); 1060 mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount);
1084 1061
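The per-ag table allocation above also switches from KM_SLEEP to
KM_MAYFAIL. A corrupt or hostile superblock can advertise a huge
sb_agcount; with KM_SLEEP the allocator keeps retrying and the mount
wedges, while KM_MAYFAIL returns NULL so the mount can fail cleanly.
A hedged sketch of the pattern (the KM_SLEEP retry-forever semantics
are an assumption about the xfs kmem wrappers, not shown in this patch):

	xfs_perag_t	*tbl;

	tbl = kmem_zalloc(count * sizeof(xfs_perag_t), KM_MAYFAIL);
	if (!tbl)
		goto error1;	/* fail the mount instead of looping forever */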
@@ -1190,7 +1167,7 @@ xfs_mountfs(
1190 * delayed until after the root and real-time bitmap inodes 1167 * delayed until after the root and real-time bitmap inodes
1191 * were consistently read in. 1168 * were consistently read in.
1192 */ 1169 */
1193 error = xfs_log_mount_finish(mp, mfsi_flags); 1170 error = xfs_log_mount_finish(mp);
1194 if (error) { 1171 if (error) {
1195 cmn_err(CE_WARN, "XFS: log mount finish failed"); 1172 cmn_err(CE_WARN, "XFS: log mount finish failed");
1196 goto error4; 1173 goto error4;
@@ -1199,7 +1176,7 @@ xfs_mountfs(
1199 /* 1176 /*
1200 * Complete the quota initialisation, post-log-replay component. 1177 * Complete the quota initialisation, post-log-replay component.
1201 */ 1178 */
1202 error = XFS_QM_MOUNT(mp, quotamount, quotaflags, mfsi_flags); 1179 error = XFS_QM_MOUNT(mp, quotamount, quotaflags);
1203 if (error) 1180 if (error)
1204 goto error4; 1181 goto error4;
1205 1182
@@ -1233,12 +1210,7 @@ xfs_mountfs(
1233 error3: 1210 error3:
1234 xfs_log_unmount_dealloc(mp); 1211 xfs_log_unmount_dealloc(mp);
1235 error2: 1212 error2:
1236 for (agno = 0; agno < sbp->sb_agcount; agno++) 1213 xfs_free_perag(mp);
1237 if (mp->m_perag[agno].pagb_list)
1238 kmem_free(mp->m_perag[agno].pagb_list);
1239 kmem_free(mp->m_perag);
1240 mp->m_perag = NULL;
1241 /* FALLTHROUGH */
1242 error1: 1214 error1:
1243 if (uuid_mounted) 1215 if (uuid_mounted)
1244 uuid_table_remove(&mp->m_sb.sb_uuid); 1216 uuid_table_remove(&mp->m_sb.sb_uuid);
@@ -1246,16 +1218,17 @@ xfs_mountfs(
1246} 1218}
1247 1219
1248/* 1220/*
1249 * xfs_unmountfs
1250 *
1251 * This flushes out the inodes, dquots and the superblock, unmounts the 1221 * This flushes out the inodes, dquots and the superblock, unmounts the
1252 * log and makes sure that incore structures are freed. 1222 * log and makes sure that incore structures are freed.
1253 */ 1223 */
1254int 1224void
1255xfs_unmountfs(xfs_mount_t *mp) 1225xfs_unmountfs(
1226 struct xfs_mount *mp)
1256{ 1227{
1257 __uint64_t resblks; 1228 __uint64_t resblks;
1258 int error = 0; 1229 int error;
1230
1231 IRELE(mp->m_rootip);
1259 1232
1260 /* 1233 /*
1261 * We can potentially deadlock here if we have an inode cluster 1234 * We can potentially deadlock here if we have an inode cluster
@@ -1312,8 +1285,6 @@ xfs_unmountfs(xfs_mount_t *mp)
1312 xfs_unmountfs_wait(mp); /* wait for async bufs */ 1285 xfs_unmountfs_wait(mp); /* wait for async bufs */
1313 xfs_log_unmount(mp); /* Done! No more fs ops. */ 1286 xfs_log_unmount(mp); /* Done! No more fs ops. */
1314 1287
1315 xfs_freesb(mp);
1316
1317 /* 1288 /*
1318 * All inodes from this mount point should be freed. 1289 * All inodes from this mount point should be freed.
1319 */ 1290 */
@@ -1322,11 +1293,12 @@ xfs_unmountfs(xfs_mount_t *mp)
1322 if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) 1293 if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0)
1323 uuid_table_remove(&mp->m_sb.sb_uuid); 1294 uuid_table_remove(&mp->m_sb.sb_uuid);
1324 1295
1325#if defined(DEBUG) || defined(INDUCE_IO_ERROR) 1296#if defined(DEBUG)
1326 xfs_errortag_clearall(mp, 0); 1297 xfs_errortag_clearall(mp, 0);
1327#endif 1298#endif
1328 xfs_mount_free(mp); 1299 xfs_free_perag(mp);
1329 return 0; 1300 if (mp->m_quotainfo)
1301 XFS_QM_DONE(mp);
1330} 1302}
1331 1303
1332STATIC void 1304STATIC void
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 5269bd6e3df0..f3c1024b1241 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -114,7 +114,7 @@ struct xfs_dqtrxops;
114struct xfs_quotainfo; 114struct xfs_quotainfo;
115 115
116typedef int (*xfs_qminit_t)(struct xfs_mount *, uint *, uint *); 116typedef int (*xfs_qminit_t)(struct xfs_mount *, uint *, uint *);
117typedef int (*xfs_qmmount_t)(struct xfs_mount *, uint, uint, int); 117typedef int (*xfs_qmmount_t)(struct xfs_mount *, uint, uint);
118typedef int (*xfs_qmunmount_t)(struct xfs_mount *); 118typedef int (*xfs_qmunmount_t)(struct xfs_mount *);
119typedef void (*xfs_qmdone_t)(struct xfs_mount *); 119typedef void (*xfs_qmdone_t)(struct xfs_mount *);
120typedef void (*xfs_dqrele_t)(struct xfs_dquot *); 120typedef void (*xfs_dqrele_t)(struct xfs_dquot *);
@@ -158,8 +158,8 @@ typedef struct xfs_qmops {
158 158
159#define XFS_QM_INIT(mp, mnt, fl) \ 159#define XFS_QM_INIT(mp, mnt, fl) \
160 (*(mp)->m_qm_ops->xfs_qminit)(mp, mnt, fl) 160 (*(mp)->m_qm_ops->xfs_qminit)(mp, mnt, fl)
161#define XFS_QM_MOUNT(mp, mnt, fl, mfsi_flags) \ 161#define XFS_QM_MOUNT(mp, mnt, fl) \
162 (*(mp)->m_qm_ops->xfs_qmmount)(mp, mnt, fl, mfsi_flags) 162 (*(mp)->m_qm_ops->xfs_qmmount)(mp, mnt, fl)
163#define XFS_QM_UNMOUNT(mp) \ 163#define XFS_QM_UNMOUNT(mp) \
164 (*(mp)->m_qm_ops->xfs_qmunmount)(mp) 164 (*(mp)->m_qm_ops->xfs_qmunmount)(mp)
165#define XFS_QM_DONE(mp) \ 165#define XFS_QM_DONE(mp) \
@@ -442,13 +442,6 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
442/* 442/*
443 * Flags for xfs_mountfs 443 * Flags for xfs_mountfs
444 */ 444 */
445#define XFS_MFSI_SECOND 0x01 /* Secondary mount -- skip stuff */
446#define XFS_MFSI_CLIENT 0x02 /* Is a client -- skip lots of stuff */
447/* XFS_MFSI_RRINODES */
448#define XFS_MFSI_NOUNLINK 0x08 /* Skip unlinked inode processing in */
449 /* log recovery */
450#define XFS_MFSI_NO_QUOTACHECK 0x10 /* Skip quotacheck processing */
451/* XFS_MFSI_CONVERT_SUNIT */
452#define XFS_MFSI_QUIET 0x40 /* Be silent if mount errors found */ 445#define XFS_MFSI_QUIET 0x40 /* Be silent if mount errors found */
453 446
454#define XFS_DADDR_TO_AGNO(mp,d) xfs_daddr_to_agno(mp,d) 447#define XFS_DADDR_TO_AGNO(mp,d) xfs_daddr_to_agno(mp,d)
@@ -517,10 +510,10 @@ typedef struct xfs_mod_sb {
517 510
518extern void xfs_mod_sb(xfs_trans_t *, __int64_t); 511extern void xfs_mod_sb(xfs_trans_t *, __int64_t);
519extern int xfs_log_sbcount(xfs_mount_t *, uint); 512extern int xfs_log_sbcount(xfs_mount_t *, uint);
520extern int xfs_mountfs(xfs_mount_t *mp, int); 513extern int xfs_mountfs(xfs_mount_t *mp);
521extern void xfs_mountfs_check_barriers(xfs_mount_t *mp); 514extern void xfs_mountfs_check_barriers(xfs_mount_t *mp);
522 515
523extern int xfs_unmountfs(xfs_mount_t *); 516extern void xfs_unmountfs(xfs_mount_t *);
524extern int xfs_unmountfs_writesb(xfs_mount_t *); 517extern int xfs_unmountfs_writesb(xfs_mount_t *);
525extern int xfs_unmount_flush(xfs_mount_t *, int); 518extern int xfs_unmount_flush(xfs_mount_t *, int);
526extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); 519extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index bf87a5913504..e2f68de16159 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -74,18 +74,6 @@ STATIC int xfs_rtmodify_summary(xfs_mount_t *, xfs_trans_t *, int,
74 */ 74 */
75 75
76/* 76/*
77 * xfs_lowbit32: get low bit set out of 32-bit argument, -1 if none set.
78 */
79STATIC int
80xfs_lowbit32(
81 __uint32_t v)
82{
83 if (v)
84 return ffs(v) - 1;
85 return -1;
86}
87
88/*
89 * Allocate space to the bitmap or summary file, and zero it, for growfs. 77 * Allocate space to the bitmap or summary file, and zero it, for growfs.
90 */ 78 */
91STATIC int /* error */ 79STATIC int /* error */
@@ -450,6 +438,7 @@ xfs_rtallocate_extent_near(
450 } 438 }
451 bbno = XFS_BITTOBLOCK(mp, bno); 439 bbno = XFS_BITTOBLOCK(mp, bno);
452 i = 0; 440 i = 0;
441 ASSERT(minlen != 0);
453 log2len = xfs_highbit32(minlen); 442 log2len = xfs_highbit32(minlen);
454 /* 443 /*
455 * Loop over all bitmap blocks (bbno + i is current block). 444 * Loop over all bitmap blocks (bbno + i is current block).
@@ -618,6 +607,8 @@ xfs_rtallocate_extent_size(
618 xfs_suminfo_t sum; /* summary information for extents */ 607 xfs_suminfo_t sum; /* summary information for extents */
619 608
620 ASSERT(minlen % prod == 0 && maxlen % prod == 0); 609 ASSERT(minlen % prod == 0 && maxlen % prod == 0);
610 ASSERT(maxlen != 0);
611
621 /* 612 /*
622 * Loop over all the levels starting with maxlen. 613 * Loop over all the levels starting with maxlen.
623 * At each level, look at all the bitmap blocks, to see if there 614 * At each level, look at all the bitmap blocks, to see if there
@@ -675,6 +666,9 @@ xfs_rtallocate_extent_size(
675 *rtblock = NULLRTBLOCK; 666 *rtblock = NULLRTBLOCK;
676 return 0; 667 return 0;
677 } 668 }
669 ASSERT(minlen != 0);
670 ASSERT(maxlen != 0);
671
678 /* 672 /*
679 * Loop over sizes, from maxlen down to minlen. 673 * Loop over sizes, from maxlen down to minlen.
680 * This time, when we do the allocations, allow smaller ones 674 * This time, when we do the allocations, allow smaller ones
@@ -1961,6 +1955,7 @@ xfs_growfs_rt(
1961 nsbp->sb_blocksize * nsbp->sb_rextsize); 1955 nsbp->sb_blocksize * nsbp->sb_rextsize);
1962 nsbp->sb_rextents = nsbp->sb_rblocks; 1956 nsbp->sb_rextents = nsbp->sb_rblocks;
1963 do_div(nsbp->sb_rextents, nsbp->sb_rextsize); 1957 do_div(nsbp->sb_rextents, nsbp->sb_rextsize);
1958 ASSERT(nsbp->sb_rextents != 0);
1964 nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents); 1959 nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents);
1965 nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1; 1960 nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1;
1966 nrsumsize = 1961 nrsumsize =
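Every ASSERT added in this file guards a call to xfs_highbit32(), which
-- like the deleted xfs_lowbit32() helper above -- returns -1 when no
bit is set. A zero minlen/maxlen/sb_rextents would turn the log2
computations negative and corrupt the summary-level arithmetic, so the
asserts document the nonzero precondition. Illustration (a sketch of
the invariant, not a new hunk):

	ASSERT(minlen != 0);			/* 0 would give log2len == -1 */
	log2len = xfs_highbit32(minlen);	/* index of the highest set bit */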
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index b0f31c09a76d..3a82576dde9a 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -314,7 +314,7 @@ xfs_bioerror_relse(
314 * ASYNC buffers. 314 * ASYNC buffers.
315 */ 315 */
316 XFS_BUF_ERROR(bp, EIO); 316 XFS_BUF_ERROR(bp, EIO);
317 XFS_BUF_V_IODONESEMA(bp); 317 XFS_BUF_FINISH_IOWAIT(bp);
318 } else { 318 } else {
319 xfs_buf_relse(bp); 319 xfs_buf_relse(bp);
320 } 320 }
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index e4ebddd3c500..4e1c22a23be5 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -43,6 +43,7 @@
43#include "xfs_quota.h" 43#include "xfs_quota.h"
44#include "xfs_trans_priv.h" 44#include "xfs_trans_priv.h"
45#include "xfs_trans_space.h" 45#include "xfs_trans_space.h"
46#include "xfs_inode_item.h"
46 47
47 48
48STATIC void xfs_trans_apply_sb_deltas(xfs_trans_t *); 49STATIC void xfs_trans_apply_sb_deltas(xfs_trans_t *);
@@ -253,7 +254,7 @@ _xfs_trans_alloc(
253 tp->t_mountp = mp; 254 tp->t_mountp = mp;
254 tp->t_items_free = XFS_LIC_NUM_SLOTS; 255 tp->t_items_free = XFS_LIC_NUM_SLOTS;
255 tp->t_busy_free = XFS_LBC_NUM_SLOTS; 256 tp->t_busy_free = XFS_LBC_NUM_SLOTS;
256 XFS_LIC_INIT(&(tp->t_items)); 257 xfs_lic_init(&(tp->t_items));
257 XFS_LBC_INIT(&(tp->t_busy)); 258 XFS_LBC_INIT(&(tp->t_busy));
258 return tp; 259 return tp;
259} 260}
@@ -282,7 +283,7 @@ xfs_trans_dup(
282 ntp->t_mountp = tp->t_mountp; 283 ntp->t_mountp = tp->t_mountp;
283 ntp->t_items_free = XFS_LIC_NUM_SLOTS; 284 ntp->t_items_free = XFS_LIC_NUM_SLOTS;
284 ntp->t_busy_free = XFS_LBC_NUM_SLOTS; 285 ntp->t_busy_free = XFS_LBC_NUM_SLOTS;
285 XFS_LIC_INIT(&(ntp->t_items)); 286 xfs_lic_init(&(ntp->t_items));
286 XFS_LBC_INIT(&(ntp->t_busy)); 287 XFS_LBC_INIT(&(ntp->t_busy));
287 288
288 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 289 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
@@ -1169,7 +1170,7 @@ xfs_trans_cancel(
1169 while (licp != NULL) { 1170 while (licp != NULL) {
1170 lidp = licp->lic_descs; 1171 lidp = licp->lic_descs;
1171 for (i = 0; i < licp->lic_unused; i++, lidp++) { 1172 for (i = 0; i < licp->lic_unused; i++, lidp++) {
1172 if (XFS_LIC_ISFREE(licp, i)) { 1173 if (xfs_lic_isfree(licp, i)) {
1173 continue; 1174 continue;
1174 } 1175 }
1175 1176
@@ -1216,6 +1217,68 @@ xfs_trans_free(
1216 kmem_zone_free(xfs_trans_zone, tp); 1217 kmem_zone_free(xfs_trans_zone, tp);
1217} 1218}
1218 1219
1220/*
1221 * Roll from one trans in the sequence of PERMANENT transactions to
1222 * the next: permanent transactions are only flushed out when
1223 * committed with XFS_TRANS_RELEASE_LOG_RES, but we still want to let
1224 * chunks of it go to the log as soon as possible. So we commit the
1225 * chunk we've been working on and get a new transaction to continue.
1226 */
1227int
1228xfs_trans_roll(
1229 struct xfs_trans **tpp,
1230 struct xfs_inode *dp)
1231{
1232 struct xfs_trans *trans;
1233 unsigned int logres, count;
1234 int error;
1235
1236 /*
1237 * Ensure that the inode is always logged.
1238 */
1239 trans = *tpp;
1240 xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
1241
1242 /*
1243 * Copy the critical parameters from one trans to the next.
1244 */
1245 logres = trans->t_log_res;
1246 count = trans->t_log_count;
1247 *tpp = xfs_trans_dup(trans);
1248
1249 /*
1250 * Commit the current transaction.
1251 * If this commit failed, then it'd just unlock those items that
1252 * are not marked ihold. That also means that a filesystem shutdown
1253 * is in progress. The caller takes the responsibility to cancel
1254 * the duplicate transaction that gets returned.
1255 */
1256 error = xfs_trans_commit(trans, 0);
1257 if (error)
1258 return (error);
1259
1260 trans = *tpp;
1261
1262 /*
1263 * Reserve space in the log for the next transaction.
1264 * This also pushes items in the "AIL", the list of logged items,
1265 * out to disk if they are taking up space at the tail of the log
1266 * that we want to use. This requires that either nothing be locked
1267 * across this call, or that anything that is locked be logged in
1268 * the prior and the next transactions.
1269 */
1270 error = xfs_trans_reserve(trans, 0, logres, 0,
1271 XFS_TRANS_PERM_LOG_RES, count);
1272 /*
1273 * Ensure that the inode is in the new transaction and locked.
1274 */
1275 if (error)
1276 return error;
1277
1278 xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
1279 xfs_trans_ihold(trans, dp);
1280 return 0;
1281}
1219 1282
1220/* 1283/*
1221 * THIS SHOULD BE REWRITTEN TO USE xfs_trans_next_item(). 1284 * THIS SHOULD BE REWRITTEN TO USE xfs_trans_next_item().
@@ -1253,7 +1316,7 @@ xfs_trans_committed(
1253 * Special case the chunk embedded in the transaction. 1316 * Special case the chunk embedded in the transaction.
1254 */ 1317 */
1255 licp = &(tp->t_items); 1318 licp = &(tp->t_items);
1256 if (!(XFS_LIC_ARE_ALL_FREE(licp))) { 1319 if (!(xfs_lic_are_all_free(licp))) {
1257 xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag); 1320 xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
1258 } 1321 }
1259 1322
@@ -1262,7 +1325,7 @@ xfs_trans_committed(
1262 */ 1325 */
1263 licp = licp->lic_next; 1326 licp = licp->lic_next;
1264 while (licp != NULL) { 1327 while (licp != NULL) {
1265 ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); 1328 ASSERT(!xfs_lic_are_all_free(licp));
1266 xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag); 1329 xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
1267 next_licp = licp->lic_next; 1330 next_licp = licp->lic_next;
1268 kmem_free(licp); 1331 kmem_free(licp);
@@ -1325,7 +1388,7 @@ xfs_trans_chunk_committed(
1325 1388
1326 lidp = licp->lic_descs; 1389 lidp = licp->lic_descs;
1327 for (i = 0; i < licp->lic_unused; i++, lidp++) { 1390 for (i = 0; i < licp->lic_unused; i++, lidp++) {
1328 if (XFS_LIC_ISFREE(licp, i)) { 1391 if (xfs_lic_isfree(licp, i)) {
1329 continue; 1392 continue;
1330 } 1393 }
1331 1394
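A hedged usage sketch for the new xfs_trans_roll() helper; the variable
names are illustrative and the caller shown is hypothetical. Note that
the caller owns the duplicated transaction even when the roll fails, so
the error path must still cancel it:

	error = xfs_trans_roll(&tp, ip);
	if (error)
		goto out_cancel;	/* tp is the duplicate; cancel it */
	/*
	 * tp now points at a fresh transaction carrying the same permanent
	 * log reservation, with ip rejoined and held, so a long-running
	 * loop can keep logging the inode while releasing log space.
	 */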
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 0804207c7391..74c80bd2b0ec 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -210,62 +210,52 @@ typedef struct xfs_log_item_chunk {
210 * lic_unused to the right value (0 matches all free). The 210 * lic_unused to the right value (0 matches all free). The
211 * lic_descs.lid_index values are set up as each desc is allocated. 211 * lic_descs.lid_index values are set up as each desc is allocated.
212 */ 212 */
213#define XFS_LIC_INIT(cp) xfs_lic_init(cp)
214static inline void xfs_lic_init(xfs_log_item_chunk_t *cp) 213static inline void xfs_lic_init(xfs_log_item_chunk_t *cp)
215{ 214{
216 cp->lic_free = XFS_LIC_FREEMASK; 215 cp->lic_free = XFS_LIC_FREEMASK;
217} 216}
218 217
219#define XFS_LIC_INIT_SLOT(cp,slot) xfs_lic_init_slot(cp, slot)
220static inline void xfs_lic_init_slot(xfs_log_item_chunk_t *cp, int slot) 218static inline void xfs_lic_init_slot(xfs_log_item_chunk_t *cp, int slot)
221{ 219{
222 cp->lic_descs[slot].lid_index = (unsigned char)(slot); 220 cp->lic_descs[slot].lid_index = (unsigned char)(slot);
223} 221}
224 222
225#define XFS_LIC_VACANCY(cp) xfs_lic_vacancy(cp)
226static inline int xfs_lic_vacancy(xfs_log_item_chunk_t *cp) 223static inline int xfs_lic_vacancy(xfs_log_item_chunk_t *cp)
227{ 224{
228 return cp->lic_free & XFS_LIC_FREEMASK; 225 return cp->lic_free & XFS_LIC_FREEMASK;
229} 226}
230 227
231#define XFS_LIC_ALL_FREE(cp) xfs_lic_all_free(cp)
232static inline void xfs_lic_all_free(xfs_log_item_chunk_t *cp) 228static inline void xfs_lic_all_free(xfs_log_item_chunk_t *cp)
233{ 229{
234 cp->lic_free = XFS_LIC_FREEMASK; 230 cp->lic_free = XFS_LIC_FREEMASK;
235} 231}
236 232
237#define XFS_LIC_ARE_ALL_FREE(cp) xfs_lic_are_all_free(cp)
238static inline int xfs_lic_are_all_free(xfs_log_item_chunk_t *cp) 233static inline int xfs_lic_are_all_free(xfs_log_item_chunk_t *cp)
239{ 234{
240 return ((cp->lic_free & XFS_LIC_FREEMASK) == XFS_LIC_FREEMASK); 235 return ((cp->lic_free & XFS_LIC_FREEMASK) == XFS_LIC_FREEMASK);
241} 236}
242 237
243#define XFS_LIC_ISFREE(cp,slot) xfs_lic_isfree(cp,slot)
244static inline int xfs_lic_isfree(xfs_log_item_chunk_t *cp, int slot) 238static inline int xfs_lic_isfree(xfs_log_item_chunk_t *cp, int slot)
245{ 239{
246 return (cp->lic_free & (1 << slot)); 240 return (cp->lic_free & (1 << slot));
247} 241}
248 242
249#define XFS_LIC_CLAIM(cp,slot) xfs_lic_claim(cp,slot)
250static inline void xfs_lic_claim(xfs_log_item_chunk_t *cp, int slot) 243static inline void xfs_lic_claim(xfs_log_item_chunk_t *cp, int slot)
251{ 244{
252 cp->lic_free &= ~(1 << slot); 245 cp->lic_free &= ~(1 << slot);
253} 246}
254 247
255#define XFS_LIC_RELSE(cp,slot) xfs_lic_relse(cp,slot)
256static inline void xfs_lic_relse(xfs_log_item_chunk_t *cp, int slot) 248static inline void xfs_lic_relse(xfs_log_item_chunk_t *cp, int slot)
257{ 249{
258 cp->lic_free |= 1 << slot; 250 cp->lic_free |= 1 << slot;
259} 251}
260 252
261#define XFS_LIC_SLOT(cp,slot) xfs_lic_slot(cp,slot)
262static inline xfs_log_item_desc_t * 253static inline xfs_log_item_desc_t *
263xfs_lic_slot(xfs_log_item_chunk_t *cp, int slot) 254xfs_lic_slot(xfs_log_item_chunk_t *cp, int slot)
264{ 255{
265 return &(cp->lic_descs[slot]); 256 return &(cp->lic_descs[slot]);
266} 257}
267 258
268#define XFS_LIC_DESC_TO_SLOT(dp) xfs_lic_desc_to_slot(dp)
269static inline int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp) 259static inline int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp)
270{ 260{
271 return (uint)dp->lid_index; 261 return (uint)dp->lid_index;
@@ -278,7 +268,6 @@ static inline int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp)
278 * All of this yields the address of the chunk, which is 268 * All of this yields the address of the chunk, which is
279 * cast to a chunk pointer. 269 * cast to a chunk pointer.
280 */ 270 */
281#define XFS_LIC_DESC_TO_CHUNK(dp) xfs_lic_desc_to_chunk(dp)
282static inline xfs_log_item_chunk_t * 271static inline xfs_log_item_chunk_t *
283xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp) 272xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
284{ 273{
@@ -986,6 +975,7 @@ int _xfs_trans_commit(xfs_trans_t *,
986 int *); 975 int *);
987#define xfs_trans_commit(tp, flags) _xfs_trans_commit(tp, flags, NULL) 976#define xfs_trans_commit(tp, flags) _xfs_trans_commit(tp, flags, NULL)
988void xfs_trans_cancel(xfs_trans_t *, int); 977void xfs_trans_cancel(xfs_trans_t *, int);
978int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
989int xfs_trans_ail_init(struct xfs_mount *); 979int xfs_trans_ail_init(struct xfs_mount *);
990void xfs_trans_ail_destroy(struct xfs_mount *); 980void xfs_trans_ail_destroy(struct xfs_mount *);
991void xfs_trans_push_ail(struct xfs_mount *, xfs_lsn_t); 981void xfs_trans_push_ail(struct xfs_mount *, xfs_lsn_t);
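The change in this header is mechanical: each XFS_LIC_* macro was
already a one-line wrapper around the static inline of the same
lower-case name, so the rest of the series simply converts every call
site and drops the wrappers, gaining type checking at each use.
Representative call-site change (the pattern only, not a specific hunk):

	-	if (XFS_LIC_ISFREE(licp, i))	/* macro wrapper */
	+	if (xfs_lic_isfree(licp, i))	/* direct inline call */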
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index cb0c5839154b..4e855b5ced66 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -1021,16 +1021,16 @@ xfs_trans_buf_item_match(
1021 bp = NULL; 1021 bp = NULL;
1022 len = BBTOB(len); 1022 len = BBTOB(len);
1023 licp = &tp->t_items; 1023 licp = &tp->t_items;
1024 if (!XFS_LIC_ARE_ALL_FREE(licp)) { 1024 if (!xfs_lic_are_all_free(licp)) {
1025 for (i = 0; i < licp->lic_unused; i++) { 1025 for (i = 0; i < licp->lic_unused; i++) {
1026 /* 1026 /*
1027 * Skip unoccupied slots. 1027 * Skip unoccupied slots.
1028 */ 1028 */
1029 if (XFS_LIC_ISFREE(licp, i)) { 1029 if (xfs_lic_isfree(licp, i)) {
1030 continue; 1030 continue;
1031 } 1031 }
1032 1032
1033 lidp = XFS_LIC_SLOT(licp, i); 1033 lidp = xfs_lic_slot(licp, i);
1034 blip = (xfs_buf_log_item_t *)lidp->lid_item; 1034 blip = (xfs_buf_log_item_t *)lidp->lid_item;
1035 if (blip->bli_item.li_type != XFS_LI_BUF) { 1035 if (blip->bli_item.li_type != XFS_LI_BUF) {
1036 continue; 1036 continue;
@@ -1074,7 +1074,7 @@ xfs_trans_buf_item_match_all(
1074 bp = NULL; 1074 bp = NULL;
1075 len = BBTOB(len); 1075 len = BBTOB(len);
1076 for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) { 1076 for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) {
1077 if (XFS_LIC_ARE_ALL_FREE(licp)) { 1077 if (xfs_lic_are_all_free(licp)) {
1078 ASSERT(licp == &tp->t_items); 1078 ASSERT(licp == &tp->t_items);
1079 ASSERT(licp->lic_next == NULL); 1079 ASSERT(licp->lic_next == NULL);
1080 return NULL; 1080 return NULL;
@@ -1083,11 +1083,11 @@ xfs_trans_buf_item_match_all(
1083 /* 1083 /*
1084 * Skip unoccupied slots. 1084 * Skip unoccupied slots.
1085 */ 1085 */
1086 if (XFS_LIC_ISFREE(licp, i)) { 1086 if (xfs_lic_isfree(licp, i)) {
1087 continue; 1087 continue;
1088 } 1088 }
1089 1089
1090 lidp = XFS_LIC_SLOT(licp, i); 1090 lidp = xfs_lic_slot(licp, i);
1091 blip = (xfs_buf_log_item_t *)lidp->lid_item; 1091 blip = (xfs_buf_log_item_t *)lidp->lid_item;
1092 if (blip->bli_item.li_type != XFS_LI_BUF) { 1092 if (blip->bli_item.li_type != XFS_LI_BUF) {
1093 continue; 1093 continue;
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index db5c83595526..3c666e8317f8 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -53,11 +53,11 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
53 * Initialize the chunk, and then 53 * Initialize the chunk, and then
54 * claim the first slot in the newly allocated chunk. 54 * claim the first slot in the newly allocated chunk.
55 */ 55 */
56 XFS_LIC_INIT(licp); 56 xfs_lic_init(licp);
57 XFS_LIC_CLAIM(licp, 0); 57 xfs_lic_claim(licp, 0);
58 licp->lic_unused = 1; 58 licp->lic_unused = 1;
59 XFS_LIC_INIT_SLOT(licp, 0); 59 xfs_lic_init_slot(licp, 0);
60 lidp = XFS_LIC_SLOT(licp, 0); 60 lidp = xfs_lic_slot(licp, 0);
61 61
62 /* 62 /*
63 * Link in the new chunk and update the free count. 63 * Link in the new chunk and update the free count.
@@ -88,14 +88,14 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
88 */ 88 */
89 licp = &tp->t_items; 89 licp = &tp->t_items;
90 while (licp != NULL) { 90 while (licp != NULL) {
91 if (XFS_LIC_VACANCY(licp)) { 91 if (xfs_lic_vacancy(licp)) {
92 if (licp->lic_unused <= XFS_LIC_MAX_SLOT) { 92 if (licp->lic_unused <= XFS_LIC_MAX_SLOT) {
93 i = licp->lic_unused; 93 i = licp->lic_unused;
94 ASSERT(XFS_LIC_ISFREE(licp, i)); 94 ASSERT(xfs_lic_isfree(licp, i));
95 break; 95 break;
96 } 96 }
97 for (i = 0; i <= XFS_LIC_MAX_SLOT; i++) { 97 for (i = 0; i <= XFS_LIC_MAX_SLOT; i++) {
98 if (XFS_LIC_ISFREE(licp, i)) 98 if (xfs_lic_isfree(licp, i))
99 break; 99 break;
100 } 100 }
101 ASSERT(i <= XFS_LIC_MAX_SLOT); 101 ASSERT(i <= XFS_LIC_MAX_SLOT);
@@ -108,12 +108,12 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
108 * If we find a free descriptor, claim it, 108 * If we find a free descriptor, claim it,
109 * initialize it, and return it. 109 * initialize it, and return it.
110 */ 110 */
111 XFS_LIC_CLAIM(licp, i); 111 xfs_lic_claim(licp, i);
112 if (licp->lic_unused <= i) { 112 if (licp->lic_unused <= i) {
113 licp->lic_unused = i + 1; 113 licp->lic_unused = i + 1;
114 XFS_LIC_INIT_SLOT(licp, i); 114 xfs_lic_init_slot(licp, i);
115 } 115 }
116 lidp = XFS_LIC_SLOT(licp, i); 116 lidp = xfs_lic_slot(licp, i);
117 tp->t_items_free--; 117 tp->t_items_free--;
118 lidp->lid_item = lip; 118 lidp->lid_item = lip;
119 lidp->lid_flags = 0; 119 lidp->lid_flags = 0;
@@ -136,9 +136,9 @@ xfs_trans_free_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
136 xfs_log_item_chunk_t *licp; 136 xfs_log_item_chunk_t *licp;
137 xfs_log_item_chunk_t **licpp; 137 xfs_log_item_chunk_t **licpp;
138 138
139 slot = XFS_LIC_DESC_TO_SLOT(lidp); 139 slot = xfs_lic_desc_to_slot(lidp);
140 licp = XFS_LIC_DESC_TO_CHUNK(lidp); 140 licp = xfs_lic_desc_to_chunk(lidp);
141 XFS_LIC_RELSE(licp, slot); 141 xfs_lic_relse(licp, slot);
142 lidp->lid_item->li_desc = NULL; 142 lidp->lid_item->li_desc = NULL;
143 tp->t_items_free++; 143 tp->t_items_free++;
144 144
@@ -154,7 +154,7 @@ xfs_trans_free_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
154 * Also decrement the transaction structure's count of free items 154 * Also decrement the transaction structure's count of free items
155 * by the number in a chunk since we are freeing an empty chunk. 155 * by the number in a chunk since we are freeing an empty chunk.
156 */ 156 */
157 if (XFS_LIC_ARE_ALL_FREE(licp) && (licp != &(tp->t_items))) { 157 if (xfs_lic_are_all_free(licp) && (licp != &(tp->t_items))) {
158 licpp = &(tp->t_items.lic_next); 158 licpp = &(tp->t_items.lic_next);
159 while (*licpp != licp) { 159 while (*licpp != licp) {
160 ASSERT(*licpp != NULL); 160 ASSERT(*licpp != NULL);
@@ -207,20 +207,20 @@ xfs_trans_first_item(xfs_trans_t *tp)
207 /* 207 /*
208 * If it's not in the first chunk, skip to the second. 208 * If it's not in the first chunk, skip to the second.
209 */ 209 */
210 if (XFS_LIC_ARE_ALL_FREE(licp)) { 210 if (xfs_lic_are_all_free(licp)) {
211 licp = licp->lic_next; 211 licp = licp->lic_next;
212 } 212 }
213 213
214 /* 214 /*
215 * Return the first non-free descriptor in the chunk. 215 * Return the first non-free descriptor in the chunk.
216 */ 216 */
217 ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); 217 ASSERT(!xfs_lic_are_all_free(licp));
218 for (i = 0; i < licp->lic_unused; i++) { 218 for (i = 0; i < licp->lic_unused; i++) {
219 if (XFS_LIC_ISFREE(licp, i)) { 219 if (xfs_lic_isfree(licp, i)) {
220 continue; 220 continue;
221 } 221 }
222 222
223 return XFS_LIC_SLOT(licp, i); 223 return xfs_lic_slot(licp, i);
224 } 224 }
225 cmn_err(CE_WARN, "xfs_trans_first_item() -- no first item"); 225 cmn_err(CE_WARN, "xfs_trans_first_item() -- no first item");
226 return NULL; 226 return NULL;
@@ -242,18 +242,18 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
242 xfs_log_item_chunk_t *licp; 242 xfs_log_item_chunk_t *licp;
243 int i; 243 int i;
244 244
245 licp = XFS_LIC_DESC_TO_CHUNK(lidp); 245 licp = xfs_lic_desc_to_chunk(lidp);
246 246
247 /* 247 /*
248 * First search the rest of the chunk. The for loop keeps us 248 * First search the rest of the chunk. The for loop keeps us
249 * from referencing things beyond the end of the chunk. 249 * from referencing things beyond the end of the chunk.
250 */ 250 */
251 for (i = (int)XFS_LIC_DESC_TO_SLOT(lidp) + 1; i < licp->lic_unused; i++) { 251 for (i = (int)xfs_lic_desc_to_slot(lidp) + 1; i < licp->lic_unused; i++) {
252 if (XFS_LIC_ISFREE(licp, i)) { 252 if (xfs_lic_isfree(licp, i)) {
253 continue; 253 continue;
254 } 254 }
255 255
256 return XFS_LIC_SLOT(licp, i); 256 return xfs_lic_slot(licp, i);
257 } 257 }
258 258
259 /* 259 /*
@@ -266,13 +266,13 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
266 } 266 }
267 267
268 licp = licp->lic_next; 268 licp = licp->lic_next;
269 ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); 269 ASSERT(!xfs_lic_are_all_free(licp));
270 for (i = 0; i < licp->lic_unused; i++) { 270 for (i = 0; i < licp->lic_unused; i++) {
271 if (XFS_LIC_ISFREE(licp, i)) { 271 if (xfs_lic_isfree(licp, i)) {
272 continue; 272 continue;
273 } 273 }
274 274
275 return XFS_LIC_SLOT(licp, i); 275 return xfs_lic_slot(licp, i);
276 } 276 }
277 ASSERT(0); 277 ASSERT(0);
278 /* NOTREACHED */ 278 /* NOTREACHED */
@@ -300,9 +300,9 @@ xfs_trans_free_items(
300 /* 300 /*
301 * Special case the embedded chunk so we don't free it below. 301 * Special case the embedded chunk so we don't free it below.
302 */ 302 */
303 if (!XFS_LIC_ARE_ALL_FREE(licp)) { 303 if (!xfs_lic_are_all_free(licp)) {
304 (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); 304 (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
305 XFS_LIC_ALL_FREE(licp); 305 xfs_lic_all_free(licp);
306 licp->lic_unused = 0; 306 licp->lic_unused = 0;
307 } 307 }
308 licp = licp->lic_next; 308 licp = licp->lic_next;
@@ -311,7 +311,7 @@ xfs_trans_free_items(
311 * Unlock each item in each chunk and free the chunks. 311 * Unlock each item in each chunk and free the chunks.
312 */ 312 */
313 while (licp != NULL) { 313 while (licp != NULL) {
314 ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); 314 ASSERT(!xfs_lic_are_all_free(licp));
315 (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); 315 (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
316 next_licp = licp->lic_next; 316 next_licp = licp->lic_next;
317 kmem_free(licp); 317 kmem_free(licp);
@@ -347,7 +347,7 @@ xfs_trans_unlock_items(xfs_trans_t *tp, xfs_lsn_t commit_lsn)
347 /* 347 /*
348 * Special case the embedded chunk so we don't free. 348 * Special case the embedded chunk so we don't free.
349 */ 349 */
350 if (!XFS_LIC_ARE_ALL_FREE(licp)) { 350 if (!xfs_lic_are_all_free(licp)) {
351 freed = xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn); 351 freed = xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn);
352 } 352 }
353 licpp = &(tp->t_items.lic_next); 353 licpp = &(tp->t_items.lic_next);
@@ -358,10 +358,10 @@ xfs_trans_unlock_items(xfs_trans_t *tp, xfs_lsn_t commit_lsn)
358 * and free empty chunks. 358 * and free empty chunks.
359 */ 359 */
360 while (licp != NULL) { 360 while (licp != NULL) {
361 ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); 361 ASSERT(!xfs_lic_are_all_free(licp));
362 freed += xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn); 362 freed += xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn);
363 next_licp = licp->lic_next; 363 next_licp = licp->lic_next;
364 if (XFS_LIC_ARE_ALL_FREE(licp)) { 364 if (xfs_lic_are_all_free(licp)) {
365 *licpp = next_licp; 365 *licpp = next_licp;
366 kmem_free(licp); 366 kmem_free(licp);
367 freed -= XFS_LIC_NUM_SLOTS; 367 freed -= XFS_LIC_NUM_SLOTS;
@@ -402,7 +402,7 @@ xfs_trans_unlock_chunk(
402 freed = 0; 402 freed = 0;
403 lidp = licp->lic_descs; 403 lidp = licp->lic_descs;
404 for (i = 0; i < licp->lic_unused; i++, lidp++) { 404 for (i = 0; i < licp->lic_unused; i++, lidp++) {
405 if (XFS_LIC_ISFREE(licp, i)) { 405 if (xfs_lic_isfree(licp, i)) {
406 continue; 406 continue;
407 } 407 }
408 lip = lidp->lid_item; 408 lip = lidp->lid_item;
@@ -421,7 +421,7 @@ xfs_trans_unlock_chunk(
421 */ 421 */
422 if (!(freeing_chunk) && 422 if (!(freeing_chunk) &&
423 (!(lidp->lid_flags & XFS_LID_DIRTY) || abort)) { 423 (!(lidp->lid_flags & XFS_LID_DIRTY) || abort)) {
424 XFS_LIC_RELSE(licp, i); 424 xfs_lic_relse(licp, i);
425 freed++; 425 freed++;
426 } 426 }
427 } 427 }
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 98e5f110ba5f..35d4d414bcc2 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -237,7 +237,7 @@ xfs_droplink(
237 237
238 ASSERT (ip->i_d.di_nlink > 0); 238 ASSERT (ip->i_d.di_nlink > 0);
239 ip->i_d.di_nlink--; 239 ip->i_d.di_nlink--;
240 drop_nlink(ip->i_vnode); 240 drop_nlink(VFS_I(ip));
241 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 241 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
242 242
243 error = 0; 243 error = 0;
@@ -301,7 +301,7 @@ xfs_bumplink(
301 301
302 ASSERT(ip->i_d.di_nlink > 0); 302 ASSERT(ip->i_d.di_nlink > 0);
303 ip->i_d.di_nlink++; 303 ip->i_d.di_nlink++;
304 inc_nlink(ip->i_vnode); 304 inc_nlink(VFS_I(ip));
305 if ((ip->i_d.di_version == XFS_DINODE_VERSION_1) && 305 if ((ip->i_d.di_version == XFS_DINODE_VERSION_1) &&
306 (ip->i_d.di_nlink > XFS_MAXLINK_1)) { 306 (ip->i_d.di_nlink > XFS_MAXLINK_1)) {
307 /* 307 /*
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index f316cb85d8e2..ef321225d269 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -18,9 +18,6 @@
18#ifndef __XFS_UTILS_H__ 18#ifndef __XFS_UTILS_H__
19#define __XFS_UTILS_H__ 19#define __XFS_UTILS_H__
20 20
21#define IRELE(ip) VN_RELE(XFS_ITOV(ip))
22#define IHOLD(ip) VN_HOLD(XFS_ITOV(ip))
23
24extern int xfs_truncate_file(xfs_mount_t *, xfs_inode_t *); 21extern int xfs_truncate_file(xfs_mount_t *, xfs_inode_t *);
25extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t, 22extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
26 xfs_dev_t, cred_t *, prid_t, int, 23 xfs_dev_t, cred_t *, prid_t, int,
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 4a9a43315a86..439dd3939dda 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -128,7 +128,6 @@ xfs_unmount_flush(
128 xfs_inode_t *rip = mp->m_rootip; 128 xfs_inode_t *rip = mp->m_rootip;
129 xfs_inode_t *rbmip; 129 xfs_inode_t *rbmip;
130 xfs_inode_t *rsumip = NULL; 130 xfs_inode_t *rsumip = NULL;
131 bhv_vnode_t *rvp = XFS_ITOV(rip);
132 int error; 131 int error;
133 132
134 xfs_ilock(rip, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); 133 xfs_ilock(rip, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
@@ -146,7 +145,7 @@ xfs_unmount_flush(
146 if (error == EFSCORRUPTED) 145 if (error == EFSCORRUPTED)
147 goto fscorrupt_out; 146 goto fscorrupt_out;
148 147
149 ASSERT(vn_count(XFS_ITOV(rbmip)) == 1); 148 ASSERT(vn_count(VFS_I(rbmip)) == 1);
150 149
151 rsumip = mp->m_rsumip; 150 rsumip = mp->m_rsumip;
152 xfs_ilock(rsumip, XFS_ILOCK_EXCL); 151 xfs_ilock(rsumip, XFS_ILOCK_EXCL);
@@ -157,7 +156,7 @@ xfs_unmount_flush(
157 if (error == EFSCORRUPTED) 156 if (error == EFSCORRUPTED)
158 goto fscorrupt_out; 157 goto fscorrupt_out;
159 158
160 ASSERT(vn_count(XFS_ITOV(rsumip)) == 1); 159 ASSERT(vn_count(VFS_I(rsumip)) == 1);
161 } 160 }
162 161
163 /* 162 /*
@@ -167,7 +166,7 @@ xfs_unmount_flush(
167 if (error == EFSCORRUPTED) 166 if (error == EFSCORRUPTED)
168 goto fscorrupt_out2; 167 goto fscorrupt_out2;
169 168
170 if (vn_count(rvp) != 1 && !relocation) { 169 if (vn_count(VFS_I(rip)) != 1 && !relocation) {
171 xfs_iunlock(rip, XFS_ILOCK_EXCL); 170 xfs_iunlock(rip, XFS_ILOCK_EXCL);
172 return XFS_ERROR(EBUSY); 171 return XFS_ERROR(EBUSY);
173 } 172 }
@@ -284,7 +283,7 @@ xfs_sync_inodes(
284 int *bypassed) 283 int *bypassed)
285{ 284{
286 xfs_inode_t *ip = NULL; 285 xfs_inode_t *ip = NULL;
287 bhv_vnode_t *vp = NULL; 286 struct inode *vp = NULL;
288 int error; 287 int error;
289 int last_error; 288 int last_error;
290 uint64_t fflag; 289 uint64_t fflag;
@@ -404,7 +403,7 @@ xfs_sync_inodes(
404 continue; 403 continue;
405 } 404 }
406 405
407 vp = XFS_ITOV_NULL(ip); 406 vp = VFS_I(ip);
408 407
409 /* 408 /*
410 * If the vnode is gone then this is being torn down, 409 * If the vnode is gone then this is being torn down,
@@ -479,7 +478,7 @@ xfs_sync_inodes(
479 IPOINTER_INSERT(ip, mp); 478 IPOINTER_INSERT(ip, mp);
480 xfs_ilock(ip, lock_flags); 479 xfs_ilock(ip, lock_flags);
481 480
482 ASSERT(vp == XFS_ITOV(ip)); 481 ASSERT(vp == VFS_I(ip));
483 ASSERT(ip->i_mount == mp); 482 ASSERT(ip->i_mount == mp);
484 483
485 vnode_refed = B_TRUE; 484 vnode_refed = B_TRUE;
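The bhv_vnode_t/XFS_ITOV to VFS_I() conversion here and in the next
file is likewise mechanical. As an assumption (the definition lives in
xfs_inode.h, outside this diff), the accessor at this point in the
series is roughly:

	static inline struct inode *VFS_I(struct xfs_inode *ip)
	{
		return ip->i_vnode;	/* the Linux inode backing this xfs_inode */
	}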
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 76a1166af822..8b6812f66a15 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -83,7 +83,7 @@ xfs_setattr(
83 cred_t *credp) 83 cred_t *credp)
84{ 84{
85 xfs_mount_t *mp = ip->i_mount; 85 xfs_mount_t *mp = ip->i_mount;
86 struct inode *inode = XFS_ITOV(ip); 86 struct inode *inode = VFS_I(ip);
87 int mask = iattr->ia_valid; 87 int mask = iattr->ia_valid;
88 xfs_trans_t *tp; 88 xfs_trans_t *tp;
89 int code; 89 int code;
@@ -182,7 +182,7 @@ xfs_setattr(
182 xfs_ilock(ip, lock_flags); 182 xfs_ilock(ip, lock_flags);
183 183
184 /* boolean: are we the file owner? */ 184 /* boolean: are we the file owner? */
185 file_owner = (current_fsuid(credp) == ip->i_d.di_uid); 185 file_owner = (current_fsuid() == ip->i_d.di_uid);
186 186
187 /* 187 /*
188 * Change various properties of a file. 188 * Change various properties of a file.
@@ -513,7 +513,6 @@ xfs_setattr(
513 ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; 513 ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
514 ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; 514 ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
515 ip->i_update_core = 1; 515 ip->i_update_core = 1;
516 timeflags &= ~XFS_ICHGTIME_ACC;
517 } 516 }
518 if (mask & ATTR_MTIME) { 517 if (mask & ATTR_MTIME) {
519 inode->i_mtime = iattr->ia_mtime; 518 inode->i_mtime = iattr->ia_mtime;
@@ -714,7 +713,7 @@ xfs_fsync(
714 return XFS_ERROR(EIO); 713 return XFS_ERROR(EIO);
715 714
716 /* capture size updates in I/O completion before writing the inode. */ 715 /* capture size updates in I/O completion before writing the inode. */
717 error = filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping); 716 error = filemap_fdatawait(VFS_I(ip)->i_mapping);
718 if (error) 717 if (error)
719 return XFS_ERROR(error); 718 return XFS_ERROR(error);
720 719
@@ -1160,7 +1159,6 @@ int
1160xfs_release( 1159xfs_release(
1161 xfs_inode_t *ip) 1160 xfs_inode_t *ip)
1162{ 1161{
1163 bhv_vnode_t *vp = XFS_ITOV(ip);
1164 xfs_mount_t *mp = ip->i_mount; 1162 xfs_mount_t *mp = ip->i_mount;
1165 int error; 1163 int error;
1166 1164
@@ -1195,13 +1193,13 @@ xfs_release(
1195 * be exposed to that problem. 1193 * be exposed to that problem.
1196 */ 1194 */
1197 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); 1195 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
1198 if (truncated && VN_DIRTY(vp) && ip->i_delayed_blks > 0) 1196 if (truncated && VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0)
1199 xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE); 1197 xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE);
1200 } 1198 }
1201 1199
1202 if (ip->i_d.di_nlink != 0) { 1200 if (ip->i_d.di_nlink != 0) {
1203 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && 1201 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1204 ((ip->i_size > 0) || (VN_CACHED(vp) > 0 || 1202 ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
1205 ip->i_delayed_blks > 0)) && 1203 ip->i_delayed_blks > 0)) &&
1206 (ip->i_df.if_flags & XFS_IFEXTENTS)) && 1204 (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
1207 (!(ip->i_d.di_flags & 1205 (!(ip->i_d.di_flags &
@@ -1227,7 +1225,6 @@ int
1227xfs_inactive( 1225xfs_inactive(
1228 xfs_inode_t *ip) 1226 xfs_inode_t *ip)
1229{ 1227{
1230 bhv_vnode_t *vp = XFS_ITOV(ip);
1231 xfs_bmap_free_t free_list; 1228 xfs_bmap_free_t free_list;
1232 xfs_fsblock_t first_block; 1229 xfs_fsblock_t first_block;
1233 int committed; 1230 int committed;
@@ -1242,7 +1239,7 @@ xfs_inactive(
1242 * If the inode is already free, then there can be nothing 1239 * If the inode is already free, then there can be nothing
1243 * to clean up here. 1240 * to clean up here.
1244 */ 1241 */
1245 if (ip->i_d.di_mode == 0 || VN_BAD(vp)) { 1242 if (ip->i_d.di_mode == 0 || VN_BAD(VFS_I(ip))) {
1246 ASSERT(ip->i_df.if_real_bytes == 0); 1243 ASSERT(ip->i_df.if_real_bytes == 0);
1247 ASSERT(ip->i_df.if_broot_bytes == 0); 1244 ASSERT(ip->i_df.if_broot_bytes == 0);
1248 return VN_INACTIVE_CACHE; 1245 return VN_INACTIVE_CACHE;
@@ -1272,7 +1269,7 @@ xfs_inactive(
1272 1269
1273 if (ip->i_d.di_nlink != 0) { 1270 if (ip->i_d.di_nlink != 0) {
1274 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && 1271 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1275 ((ip->i_size > 0) || (VN_CACHED(vp) > 0 || 1272 ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
1276 ip->i_delayed_blks > 0)) && 1273 ip->i_delayed_blks > 0)) &&
1277 (ip->i_df.if_flags & XFS_IFEXTENTS) && 1274 (ip->i_df.if_flags & XFS_IFEXTENTS) &&
1278 (!(ip->i_d.di_flags & 1275 (!(ip->i_d.di_flags &
@@ -1536,7 +1533,7 @@ xfs_create(
1536 * Make sure that we have allocated dquot(s) on disk. 1533 * Make sure that we have allocated dquot(s) on disk.
1537 */ 1534 */
1538 error = XFS_QM_DQVOPALLOC(mp, dp, 1535 error = XFS_QM_DQVOPALLOC(mp, dp,
1539 current_fsuid(credp), current_fsgid(credp), prid, 1536 current_fsuid(), current_fsgid(), prid,
1540 XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp); 1537 XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
1541 if (error) 1538 if (error)
1542 goto std_return; 1539 goto std_return;
@@ -1708,111 +1705,6 @@ std_return:
1708} 1705}
1709 1706
1710#ifdef DEBUG 1707#ifdef DEBUG
1711/*
1712 * Some counters to see if (and how often) we are hitting some deadlock
1713 * prevention code paths.
1714 */
1715
1716int xfs_rm_locks;
1717int xfs_rm_lock_delays;
1718int xfs_rm_attempts;
1719#endif
1720
1721/*
1722 * The following routine will lock the inodes associated with the
1723 * directory and the named entry in the directory. The locks are
1724 * acquired in increasing inode number.
1725 *
1726 * If the entry is "..", then only the directory is locked. The
1727 * vnode ref count will still include that from the .. entry in
1728 * this case.
1729 *
1730 * There is a deadlock we need to worry about. If the locked directory is
1731 * in the AIL, it might be blocking up the log. The next inode we lock
1732 * could already be locked by another thread waiting for log space (e.g.
1733 * a permanent log reservation with a long running transaction (see
1734 * xfs_itruncate_finish)). To solve this, we must check if the directory
1735 * is in the AIL and use lock_nowait. If we can't lock, we need to
1736 * drop the inode lock on the directory and try again. xfs_iunlock will
1737 * potentially push the tail if we were holding up the log.
1738 */
1739STATIC int
1740xfs_lock_dir_and_entry(
1741 xfs_inode_t *dp,
1742 xfs_inode_t *ip) /* inode of entry 'name' */
1743{
1744 int attempts;
1745 xfs_ino_t e_inum;
1746 xfs_inode_t *ips[2];
1747 xfs_log_item_t *lp;
1748
1749#ifdef DEBUG
1750 xfs_rm_locks++;
1751#endif
1752 attempts = 0;
1753
1754again:
1755 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
1756
1757 e_inum = ip->i_ino;
1758
1759 xfs_itrace_ref(ip);
1760
1761 /*
1762 * We want to lock in increasing inum. Since we've already
1763 * acquired the lock on the directory, we may need to release
1764 * it if the inum of the entry turns out to be less.
1765 */
1766 if (e_inum > dp->i_ino) {
1767 /*
1768 * We are already in the right order, so just
1769 * lock on the inode of the entry.
1770 * We need to use nowait if dp is in the AIL.
1771 */
1772
1773 lp = (xfs_log_item_t *)dp->i_itemp;
1774 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
1775 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
1776 attempts++;
1777#ifdef DEBUG
1778 xfs_rm_attempts++;
1779#endif
1780
1781 /*
1782 * Unlock dp and try again.
1783 * xfs_iunlock will try to push the tail
1784 * if the inode is in the AIL.
1785 */
1786
1787 xfs_iunlock(dp, XFS_ILOCK_EXCL);
1788
1789 if ((attempts % 5) == 0) {
1790 delay(1); /* Don't just spin the CPU */
1791#ifdef DEBUG
1792 xfs_rm_lock_delays++;
1793#endif
1794 }
1795 goto again;
1796 }
1797 } else {
1798 xfs_ilock(ip, XFS_ILOCK_EXCL);
1799 }
1800 } else if (e_inum < dp->i_ino) {
1801 xfs_iunlock(dp, XFS_ILOCK_EXCL);
1802
1803 ips[0] = ip;
1804 ips[1] = dp;
1805 xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL);
1806 }
1807 /* else e_inum == dp->i_ino */
1808 /* This can happen if we're asked to lock /x/..
1809 * the entry is "..", which is also the parent directory.
1810 */
1811
1812 return 0;
1813}
1814
1815#ifdef DEBUG
1816int xfs_locked_n; 1708int xfs_locked_n;
1817int xfs_small_retries; 1709int xfs_small_retries;
1818int xfs_middle_retries; 1710int xfs_middle_retries;
@@ -1946,6 +1838,53 @@ again:
1946#endif 1838#endif
1947} 1839}
1948 1840
1841/*
1842 * xfs_lock_two_inodes() can only be used to lock one type of lock
1843 * at a time - the iolock or the ilock, but not both at once. If
1844 * we lock both at once, lockdep will report false positives saying
1845 * we have violated locking orders.
1846 */
1847void
1848xfs_lock_two_inodes(
1849 xfs_inode_t *ip0,
1850 xfs_inode_t *ip1,
1851 uint lock_mode)
1852{
1853 xfs_inode_t *temp;
1854 int attempts = 0;
1855 xfs_log_item_t *lp;
1856
1857 if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
1858 ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
1859 ASSERT(ip0->i_ino != ip1->i_ino);
1860
1861 if (ip0->i_ino > ip1->i_ino) {
1862 temp = ip0;
1863 ip0 = ip1;
1864 ip1 = temp;
1865 }
1866
1867 again:
1868 xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0));
1869
1870 /*
1871 * If the first lock we have locked is in the AIL, we must TRY to get
1872 * the second lock. If we can't get it, we must release the first one
1873 * and try again.
1874 */
1875 lp = (xfs_log_item_t *)ip0->i_itemp;
1876 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
1877 if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) {
1878 xfs_iunlock(ip0, lock_mode);
1879 if ((++attempts % 5) == 0)
1880 delay(1); /* Don't just spin the CPU */
1881 goto again;
1882 }
1883 } else {
1884 xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1));
1885 }
1886}
1887
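Callers no longer order the pair themselves: the helper sorts by inode
number internally and performs the AIL try-lock/backoff dance that
xfs_lock_dir_and_entry() used to open-code. Typical call, as in the
xfs_remove() and xfs_link() hunks below:

	xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);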
1949int 1888int
1950xfs_remove( 1889xfs_remove(
1951 xfs_inode_t *dp, 1890 xfs_inode_t *dp,
@@ -2018,9 +1957,7 @@ xfs_remove(
2018 goto out_trans_cancel; 1957 goto out_trans_cancel;
2019 } 1958 }
2020 1959
2021 error = xfs_lock_dir_and_entry(dp, ip); 1960 xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
2022 if (error)
2023 goto out_trans_cancel;
2024 1961
2025 /* 1962 /*
2026 * At this point, we've gotten both the directory and the entry 1963 * At this point, we've gotten both the directory and the entry
@@ -2047,9 +1984,6 @@ xfs_remove(
2047 } 1984 }
2048 } 1985 }
2049 1986
2050 /*
2051 * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
2052 */
2053 XFS_BMAP_INIT(&free_list, &first_block); 1987 XFS_BMAP_INIT(&free_list, &first_block);
2054 error = xfs_dir_removename(tp, dp, name, ip->i_ino, 1988 error = xfs_dir_removename(tp, dp, name, ip->i_ino,
2055 &first_block, &free_list, resblks); 1989 &first_block, &free_list, resblks);
@@ -2155,7 +2089,6 @@ xfs_link(
2155{ 2089{
2156 xfs_mount_t *mp = tdp->i_mount; 2090 xfs_mount_t *mp = tdp->i_mount;
2157 xfs_trans_t *tp; 2091 xfs_trans_t *tp;
2158 xfs_inode_t *ips[2];
2159 int error; 2092 int error;
2160 xfs_bmap_free_t free_list; 2093 xfs_bmap_free_t free_list;
2161 xfs_fsblock_t first_block; 2094 xfs_fsblock_t first_block;
@@ -2203,15 +2136,7 @@ xfs_link(
2203 goto error_return; 2136 goto error_return;
2204 } 2137 }
2205 2138
2206 if (sip->i_ino < tdp->i_ino) { 2139 xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
2207 ips[0] = sip;
2208 ips[1] = tdp;
2209 } else {
2210 ips[0] = tdp;
2211 ips[1] = sip;
2212 }
2213
2214 xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL);
2215 2140
2216 /* 2141 /*
2217 * Increment vnode ref counts since xfs_trans_commit & 2142 * Increment vnode ref counts since xfs_trans_commit &
@@ -2352,7 +2277,7 @@ xfs_mkdir(
2352 * Make sure that we have allocated dquot(s) on disk. 2277 * Make sure that we have allocated dquot(s) on disk.
2353 */ 2278 */
2354 error = XFS_QM_DQVOPALLOC(mp, dp, 2279 error = XFS_QM_DQVOPALLOC(mp, dp,
2355 current_fsuid(credp), current_fsgid(credp), prid, 2280 current_fsuid(), current_fsgid(), prid,
2356 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); 2281 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
2357 if (error) 2282 if (error)
2358 goto std_return; 2283 goto std_return;
@@ -2578,7 +2503,7 @@ xfs_symlink(
2578 * Make sure that we have allocated dquot(s) on disk. 2503 * Make sure that we have allocated dquot(s) on disk.
2579 */ 2504 */
2580 error = XFS_QM_DQVOPALLOC(mp, dp, 2505 error = XFS_QM_DQVOPALLOC(mp, dp,
2581 current_fsuid(credp), current_fsgid(credp), prid, 2506 current_fsuid(), current_fsgid(), prid,
2582 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); 2507 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
2583 if (error) 2508 if (error)
2584 goto std_return; 2509 goto std_return;
@@ -2873,14 +2798,13 @@ int
2873xfs_reclaim( 2798xfs_reclaim(
2874 xfs_inode_t *ip) 2799 xfs_inode_t *ip)
2875{ 2800{
2876 bhv_vnode_t *vp = XFS_ITOV(ip);
2877 2801
2878 xfs_itrace_entry(ip); 2802 xfs_itrace_entry(ip);
2879 2803
2880 ASSERT(!VN_MAPPED(vp)); 2804 ASSERT(!VN_MAPPED(VFS_I(ip)));
2881 2805
2882 /* bad inode, get out here ASAP */ 2806 /* bad inode, get out here ASAP */
2883 if (VN_BAD(vp)) { 2807 if (VN_BAD(VFS_I(ip))) {
2884 xfs_ireclaim(ip); 2808 xfs_ireclaim(ip);
2885 return 0; 2809 return 0;
2886 } 2810 }
@@ -2917,7 +2841,7 @@ xfs_reclaim(
2917 XFS_MOUNT_ILOCK(mp); 2841 XFS_MOUNT_ILOCK(mp);
2918 spin_lock(&ip->i_flags_lock); 2842 spin_lock(&ip->i_flags_lock);
2919 __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 2843 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
2920 vn_to_inode(vp)->i_private = NULL; 2844 VFS_I(ip)->i_private = NULL;
2921 ip->i_vnode = NULL; 2845 ip->i_vnode = NULL;
2922 spin_unlock(&ip->i_flags_lock); 2846 spin_unlock(&ip->i_flags_lock);
2923 list_add_tail(&ip->i_reclaim, &mp->m_del_inodes); 2847 list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
@@ -2933,7 +2857,7 @@ xfs_finish_reclaim(
2933 int sync_mode) 2857 int sync_mode)
2934{ 2858{
2935 xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino); 2859 xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
2936 bhv_vnode_t *vp = XFS_ITOV_NULL(ip); 2860 struct inode *vp = VFS_I(ip);
2937 2861
2938 if (vp && VN_BAD(vp)) 2862 if (vp && VN_BAD(vp))
2939 goto reclaim; 2863 goto reclaim;
@@ -3236,6 +3160,13 @@ error1: /* Just cancel transaction */
3236/* 3160/*
3237 * Zero file bytes between startoff and endoff inclusive. 3161 * Zero file bytes between startoff and endoff inclusive.
3238 * The iolock is held exclusive and no blocks are buffered. 3162 * The iolock is held exclusive and no blocks are buffered.
3163 *
3164 * This function is used by xfs_free_file_space() to zero
3165 * partial blocks when the range to free is not block aligned.
3166 * When unreserving space with boundaries that are not block
3167 * aligned we round up the start and round down the end
3168 * boundaries and then use this function to zero the parts of
3169 * the blocks that got dropped during the rounding.
3239 */ 3170 */
3240STATIC int 3171STATIC int
3241xfs_zero_remaining_bytes( 3172xfs_zero_remaining_bytes(
@@ -3252,6 +3183,17 @@ xfs_zero_remaining_bytes(
3252 int nimap; 3183 int nimap;
3253 int error = 0; 3184 int error = 0;
3254 3185
3186 /*
3187 * Avoid doing I/O beyond eof - it's not necessary
3188 * since nothing can read beyond eof. The space will
3189 * be zeroed when the file is extended anyway.
3190 */
3191 if (startoff >= ip->i_size)
3192 return 0;
3193
3194 if (endoff > ip->i_size)
3195 endoff = ip->i_size;
3196
3255 bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize, 3197 bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
3256 XFS_IS_REALTIME_INODE(ip) ? 3198 XFS_IS_REALTIME_INODE(ip) ?
3257 mp->m_rtdev_targp : mp->m_ddev_targp); 3199 mp->m_rtdev_targp : mp->m_ddev_targp);
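A hedged worked example of the rounding the new comment describes,
assuming a 4096-byte block size and illustrative values: freeing bytes
[1000, 9000) rounds the start up to 4096 and the end down to 8192, so
only blocks [4096, 8192) are punched out; this function then zeroes the
dropped edges [1000, 4095] and [8192, 8999]. The eof clamp added above
skips the work entirely when the range lies past i_size, since nothing
can read beyond eof and extending the file re-zeroes anyway.

	xfs_off_t	start = 1000, end = 9000;	/* requested range */
	xfs_off_t	bsize = 4096;			/* assumed block size */
	xfs_off_t	astart = ((start + bsize - 1) / bsize) * bsize;	/* 4096 */
	xfs_off_t	aend = (end / bsize) * bsize;			/* 8192 */
	/* punch [astart, aend); zero [start, astart) and [aend, end) */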
@@ -3321,7 +3263,6 @@ xfs_free_file_space(
3321 xfs_off_t len, 3263 xfs_off_t len,
3322 int attr_flags) 3264 int attr_flags)
3323{ 3265{
3324 bhv_vnode_t *vp;
3325 int committed; 3266 int committed;
3326 int done; 3267 int done;
3327 xfs_off_t end_dmi_offset; 3268 xfs_off_t end_dmi_offset;
@@ -3341,7 +3282,6 @@ xfs_free_file_space(
3341 xfs_trans_t *tp; 3282 xfs_trans_t *tp;
3342 int need_iolock = 1; 3283 int need_iolock = 1;
3343 3284
3344 vp = XFS_ITOV(ip);
3345 mp = ip->i_mount; 3285 mp = ip->i_mount;
3346 3286
3347 xfs_itrace_entry(ip); 3287 xfs_itrace_entry(ip);
@@ -3378,7 +3318,7 @@ xfs_free_file_space(
3378 rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); 3318 rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
3379 ioffset = offset & ~(rounding - 1); 3319 ioffset = offset & ~(rounding - 1);
3380 3320
3381 if (VN_CACHED(vp) != 0) { 3321 if (VN_CACHED(VFS_I(ip)) != 0) {
3382 xfs_inval_cached_trace(ip, ioffset, -1, ioffset, -1); 3322 xfs_inval_cached_trace(ip, ioffset, -1, ioffset, -1);
3383 error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED); 3323 error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED);
3384 if (error) 3324 if (error)