Diffstat (limited to 'fs')
403 files changed, 12404 insertions, 9576 deletions
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 88e3787c6ea9..e298fe194093 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -119,6 +119,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
 
 const struct file_operations v9fs_dir_operations = {
 .read = generic_read_dir,
+.llseek = generic_file_llseek,
 .readdir = v9fs_dir_readdir,
 .open = v9fs_file_open,
 .release = v9fs_dir_release,
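A recurring change in this series is giving directory file_operations an explicit .llseek handler (generic_file_llseek here, dcache_dir_lseek for dcache-backed directories) instead of relying on the kernel's default llseek path. A minimal sketch of the resulting pattern; the "examplefs" names are hypothetical and not taken from this patch:

    /* Sketch only: explicit llseek wiring for a directory, as added
     * throughout this series. "examplefs" is a made-up filesystem. */
    static const struct file_operations examplefs_dir_operations = {
            .read    = generic_read_dir,
            .llseek  = generic_file_llseek, /* explicit, no default fallback */
            .readdir = examplefs_readdir,
    };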
diff --git a/fs/Kconfig b/fs/Kconfig
index 37db79a2ff95..abccb5dab9a8 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -902,65 +902,7 @@ endif # BLOCK
 
 menu "Pseudo filesystems"
 
-config PROC_FS
-bool "/proc file system support" if EMBEDDED
-default y
-help
-This is a virtual file system providing information about the status
-of the system. "Virtual" means that it doesn't take up any space on
-your hard disk: the files are created on the fly by the kernel when
-you try to access them. Also, you cannot read the files with older
-version of the program less: you need to use more or cat.
-
-It's totally cool; for example, "cat /proc/interrupts" gives
-information about what the different IRQs are used for at the moment
-(there is a small number of Interrupt ReQuest lines in your computer
-that are used by the attached devices to gain the CPU's attention --
-often a source of trouble if two devices are mistakenly configured
-to use the same IRQ). The program procinfo to display some
-information about your system gathered from the /proc file system.
-
-Before you can use the /proc file system, it has to be mounted,
-meaning it has to be given a location in the directory hierarchy.
-That location should be /proc. A command such as "mount -t proc proc
-/proc" or the equivalent line in /etc/fstab does the job.
-
-The /proc file system is explained in the file
-<file:Documentation/filesystems/proc.txt> and on the proc(5) manpage
-("man 5 proc").
-
-This option will enlarge your kernel by about 67 KB. Several
-programs depend on this, so everyone should say Y here.
-
-config PROC_KCORE
-bool "/proc/kcore support" if !ARM
-depends on PROC_FS && MMU
-
-config PROC_VMCORE
-bool "/proc/vmcore support (EXPERIMENTAL)"
-depends on PROC_FS && CRASH_DUMP
-default y
-help
-Exports the dump image of crashed kernel in ELF format.
-
-config PROC_SYSCTL
-bool "Sysctl support (/proc/sys)" if EMBEDDED
-depends on PROC_FS
-select SYSCTL
-default y
----help---
-The sysctl interface provides a means of dynamically changing
-certain kernel parameters and variables on the fly without requiring
-a recompile of the kernel or reboot of the system. The primary
-interface is through /proc/sys. If you say Y here a tree of
-modifiable sysctl entries will be generated beneath the
-/proc/sys directory. They are explained in the files
-in <file:Documentation/sysctl/>. Note that enabling this
-option will enlarge the kernel by at least 8 KB.
-
-As it is generally a good thing, you should say Y here unless
-building a kernel for install/rescue disks or your system is very
-limited in memory.
+source "fs/proc/Kconfig"
 
 config SYSFS
 bool "sysfs file system support" if EMBEDDED
@@ -1441,6 +1383,19 @@ config MINIX_FS
 partition (the one containing the directory /) cannot be compiled as
 a module.
 
+config OMFS_FS
+tristate "SonicBlue Optimized MPEG File System support"
+depends on BLOCK
+select CRC_ITU_T
+help
+This is the proprietary file system used by the Rio Karma music
+player and ReplayTV DVR. Despite the name, this filesystem is not
+more efficient than a standard FS for MPEG files, in fact likely
+the opposite is true. Say Y if you have either of these devices
+and wish to mount its disk.
+
+To compile this file system support as a module, choose M here: the
+module will be called omfs. If unsure, say N.
 
 config HPFS_FS
 tristate "OS/2 HPFS file system support"
@@ -1975,6 +1930,16 @@ config CIFS_WEAK_PW_HASH
 
 If unsure, say N.
 
+config CIFS_UPCALL
+bool "Kerberos/SPNEGO advanced session setup"
+depends on CIFS && KEYS
+help
+Enables an upcall mechanism for CIFS which accesses
+userspace helper utilities to provide SPNEGO packaged (RFC 4178)
+Kerberos tickets which are needed to mount to certain secure servers
+(for which more secure Kerberos authentication is required). If
+unsure, say N.
+
 config CIFS_XATTR
 bool "CIFS extended attributes"
 depends on CIFS
@@ -2027,17 +1992,6 @@ config CIFS_EXPERIMENTAL
 (which is disabled by default). See the file fs/cifs/README
 for more details. If unsure, say N.
 
-config CIFS_UPCALL
-bool "Kerberos/SPNEGO advanced session setup (EXPERIMENTAL)"
-depends on CIFS_EXPERIMENTAL
-depends on KEYS
-help
-Enables an upcall mechanism for CIFS which accesses
-userspace helper utilities to provide SPNEGO packaged (RFC 4178)
-Kerberos tickets which are needed to mount to certain secure servers
-(for which more secure Kerberos authentication is required). If
-unsure, say N.
-
 config CIFS_DFS_UPCALL
 bool "DFS feature support (EXPERIMENTAL)"
 depends on CIFS_EXPERIMENTAL
@@ -2093,20 +2047,6 @@ config CODA_FS
 To compile the coda client support as a module, choose M here: the
 module will be called coda.
 
-config CODA_FS_OLD_API
-bool "Use 96-bit Coda file identifiers"
-depends on CODA_FS
-help
-A new kernel-userspace API had to be introduced for Coda v6.0
-to support larger 128-bit file identifiers as needed by the
-new realms implementation.
-
-However this new API is not backward compatible with older
-clients. If you really need to run the old Coda userspace
-cache manager then say Y.
-
-For most cases you probably want to say N.
-
 config AFS_FS
 tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
 depends on INET && EXPERIMENTAL
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 3263084eef9e..4a551af6f3fc 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -30,7 +30,7 @@ config COMPAT_BINFMT_ELF
 config BINFMT_ELF_FDPIC
 bool "Kernel support for FDPIC ELF binaries"
 default y
-depends on (FRV || BLACKFIN)
+depends on (FRV || BLACKFIN || (SUPERH32 && !MMU))
 help
 ELF FDPIC binaries are based on ELF, but allow the individual load
 segments of a binary to be located in memory independently of each
diff --git a/fs/Makefile b/fs/Makefile
index 3b2178b4bb66..a1482a5eff15 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -111,6 +111,7 @@ obj-$(CONFIG_ADFS_FS) += adfs/
 obj-$(CONFIG_FUSE_FS) += fuse/
 obj-$(CONFIG_UDF_FS) += udf/
 obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/
+obj-$(CONFIG_OMFS_FS) += omfs/
 obj-$(CONFIG_JFS_FS) += jfs/
 obj-$(CONFIG_XFS_FS) += xfs/
 obj-$(CONFIG_9P_FS) += 9p/
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index fc1a8dc64d78..85a30e929800 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -197,6 +197,7 @@ out:
 
 const struct file_operations adfs_dir_operations = {
 .read = generic_read_dir,
+.llseek = generic_file_llseek,
 .readdir = adfs_readdir,
 .fsync = file_fsync,
 };
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 9e421eeb672b..26f3b43726bb 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -249,7 +249,7 @@ static void adfs_destroy_inode(struct inode *inode)
 kmem_cache_free(adfs_inode_cachep, ADFS_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 struct adfs_inode_info *ei = (struct adfs_inode_info *) foo;
 
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 223b1917093e..e9ec915f7553 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -2,6 +2,7 @@
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
 #include <linux/amigaffs.h>
+#include <linux/mutex.h>
 
 /* AmigaOS allows file names with up to 30 characters length.
 * Names longer than that will be silently truncated. If you
@@ -98,7 +99,7 @@ struct affs_sb_info {
 gid_t s_gid; /* gid to override */
 umode_t s_mode; /* mode to override */
 struct buffer_head *s_root_bh; /* Cached root block. */
-struct semaphore s_bmlock; /* Protects bitmap access. */
+struct mutex s_bmlock; /* Protects bitmap access. */
 struct affs_bm_info *s_bitmap; /* Bitmap infos. */
 u32 s_bmap_count; /* # of bitmap blocks. */
 u32 s_bmap_bits; /* # of bits in one bitmap blocks */
diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c
index c4a5ad09ddf2..dc5ef14bdc1c 100644
--- a/fs/affs/bitmap.c
+++ b/fs/affs/bitmap.c
@@ -45,14 +45,14 @@ affs_count_free_blocks(struct super_block *sb)
 if (sb->s_flags & MS_RDONLY)
 return 0;
 
-down(&AFFS_SB(sb)->s_bmlock);
+mutex_lock(&AFFS_SB(sb)->s_bmlock);
 
 bm = AFFS_SB(sb)->s_bitmap;
 free = 0;
 for (i = AFFS_SB(sb)->s_bmap_count; i > 0; bm++, i--)
 free += bm->bm_free;
 
-up(&AFFS_SB(sb)->s_bmlock);
+mutex_unlock(&AFFS_SB(sb)->s_bmlock);
 
 return free;
 }
@@ -76,7 +76,7 @@ affs_free_block(struct super_block *sb, u32 block)
 bit = blk % sbi->s_bmap_bits;
 bm = &sbi->s_bitmap[bmap];
 
-down(&sbi->s_bmlock);
+mutex_lock(&sbi->s_bmlock);
 
 bh = sbi->s_bmap_bh;
 if (sbi->s_last_bmap != bmap) {
@@ -105,19 +105,19 @@ affs_free_block(struct super_block *sb, u32 block)
 sb->s_dirt = 1;
 bm->bm_free++;
 
-up(&sbi->s_bmlock);
+mutex_unlock(&sbi->s_bmlock);
 return;
 
 err_free:
 affs_warning(sb,"affs_free_block","Trying to free block %u which is already free", block);
-up(&sbi->s_bmlock);
+mutex_unlock(&sbi->s_bmlock);
 return;
 
 err_bh_read:
 affs_error(sb,"affs_free_block","Cannot read bitmap block %u", bm->bm_key);
 sbi->s_bmap_bh = NULL;
 sbi->s_last_bmap = ~0;
-up(&sbi->s_bmlock);
+mutex_unlock(&sbi->s_bmlock);
 return;
 
 err_range:
@@ -168,7 +168,7 @@ affs_alloc_block(struct inode *inode, u32 goal)
 bmap = blk / sbi->s_bmap_bits;
 bm = &sbi->s_bitmap[bmap];
 
-down(&sbi->s_bmlock);
+mutex_lock(&sbi->s_bmlock);
 
 if (bm->bm_free)
 goto find_bmap_bit;
@@ -249,7 +249,7 @@ find_bit:
 mark_buffer_dirty(bh);
 sb->s_dirt = 1;
 
-up(&sbi->s_bmlock);
+mutex_unlock(&sbi->s_bmlock);
 
 pr_debug("%d\n", blk);
 return blk;
@@ -259,7 +259,7 @@ err_bh_read:
 sbi->s_bmap_bh = NULL;
 sbi->s_last_bmap = ~0;
 err_full:
-up(&sbi->s_bmlock);
+mutex_unlock(&sbi->s_bmlock);
 pr_debug("failed\n");
 return 0;
 }
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index 6e3f282424b0..7b36904dbeac 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -19,6 +19,7 @@ static int affs_readdir(struct file *, void *, filldir_t);
 
 const struct file_operations affs_dir_operations = {
 .read = generic_read_dir,
+.llseek = generic_file_llseek,
 .readdir = affs_readdir,
 .fsync = file_fsync,
 };
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 6eac7bdeec94..1377b1240b6e 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -46,8 +46,6 @@ const struct inode_operations affs_file_inode_operations = {
 static int
 affs_file_open(struct inode *inode, struct file *filp)
 {
-if (atomic_read(&filp->f_count) != 1)
-return 0;
 pr_debug("AFFS: open(%lu,%d)\n",
 inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt));
 atomic_inc(&AFFS_I(inode)->i_opencnt);
@@ -57,8 +55,6 @@ affs_file_open(struct inode *inode, struct file *filp)
 static int
 affs_file_release(struct inode *inode, struct file *filp)
 {
-if (atomic_read(&filp->f_count) != 0)
-return 0;
 pr_debug("AFFS: release(%lu, %d)\n",
 inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt));
 
diff --git a/fs/affs/super.c b/fs/affs/super.c
index d214837d5e42..3a89094f93d0 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -90,7 +90,7 @@ static void affs_destroy_inode(struct inode *inode)
 kmem_cache_free(affs_inode_cachep, AFFS_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 struct affs_inode_info *ei = (struct affs_inode_info *) foo;
 
@@ -290,7 +290,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
 if (!sbi)
 return -ENOMEM;
 sb->s_fs_info = sbi;
-init_MUTEX(&sbi->s_bmlock);
+mutex_init(&sbi->s_bmlock);
 
 if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block,
 &blocksize,&sbi->s_prefix,
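The AFFS hunks above convert the bitmap lock from a semaphore to a mutex: the field type in struct affs_sb_info, its initialisation in affs_fill_super(), and every down()/up() pair in fs/affs/bitmap.c. A minimal sketch of that conversion pattern, using hypothetical names rather than code from the patch:

    #include <linux/mutex.h>

    /* Sketch of the semaphore -> mutex conversion applied above. */
    struct example_sb_info {
            struct mutex lock;              /* was: struct semaphore lock; */
    };

    static void example_fill_super(struct example_sb_info *sbi)
    {
            mutex_init(&sbi->lock);         /* was: init_MUTEX(&sbi->lock); */
    }

    static void example_touch_bitmap(struct example_sb_info *sbi)
    {
            mutex_lock(&sbi->lock);         /* was: down(&sbi->lock); */
            /* ... modify the bitmap ... */
            mutex_unlock(&sbi->lock);       /* was: up(&sbi->lock); */
    }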
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 7102824ba847..3cb6920ff30b 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -469,8 +469,6 @@ extern bool afs_cm_incoming_call(struct afs_call *);
 extern const struct inode_operations afs_dir_inode_operations;
 extern const struct file_operations afs_dir_file_operations;
 
-extern int afs_permission(struct inode *, int, struct nameidata *);
-
 /*
 * file.c
 */
@@ -605,7 +603,7 @@ extern void afs_clear_permits(struct afs_vnode *);
 extern void afs_cache_permit(struct afs_vnode *, struct key *, long);
 extern void afs_zap_permits(struct rcu_head *);
 extern struct key *afs_request_key(struct afs_cell *);
-extern int afs_permission(struct inode *, int, struct nameidata *);
+extern int afs_permission(struct inode *, int);
 
 /*
 * server.c
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 2f5503902c37..78db4953a800 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -232,7 +232,7 @@ static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd)
 }
 
 mntget(newmnt);
-err = do_add_mount(newmnt, nd, MNT_SHRINKABLE, &afs_vfsmounts);
+err = do_add_mount(newmnt, &nd->path, MNT_SHRINKABLE, &afs_vfsmounts);
 switch (err) {
 case 0:
 path_put(&nd->path);
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 3bcbeceba1bb..3ef504370034 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -284,7 +284,7 @@ static int afs_check_permit(struct afs_vnode *vnode, struct key *key,
 * - AFS ACLs are attached to directories only, and a file is controlled by its
 *   parent directory's ACL
 */
-int afs_permission(struct inode *inode, int mask, struct nameidata *nd)
+int afs_permission(struct inode *inode, int mask)
 {
 struct afs_vnode *vnode = AFS_FS_I(inode);
 afs_access_t uninitialized_var(access);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 7e3faeef6818..250d8c4d66e4 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -27,7 +27,7 @@
 
 #define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */
 
-static void afs_i_init_once(struct kmem_cache *cachep, void *foo);
+static void afs_i_init_once(void *foo);
 static int afs_get_sb(struct file_system_type *fs_type,
 int flags, const char *dev_name,
 void *data, struct vfsmount *mnt);
@@ -449,7 +449,7 @@ static void afs_put_super(struct super_block *sb)
 /*
 * initialise an inode cache slab element prior to any use
 */
-static void afs_i_init_once(struct kmem_cache *cachep, void *_vnode)
+static void afs_i_init_once(void *_vnode)
 {
 struct afs_vnode *vnode = _vnode;
 
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 9a849ad3c489..065b4e10681a 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -404,7 +404,7 @@ static int afs_write_back_from_locked_page(struct afs_writeback *wb,
 page = pages[loop];
 if (page->index > wb->last)
 break;
-if (TestSetPageLocked(page))
+if (!trylock_page(page))
 break;
 if (!PageDirty(page) ||
 page_private(page) != (unsigned long) wb) {
diff --git a/fs/aio.c b/fs/aio.c
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -512,8 +512,8 @@ static void aio_fput_routine(struct work_struct *data)
 */
 static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
 {
-dprintk(KERN_DEBUG "aio_put(%p): f_count=%d\n",
-req, atomic_read(&req->ki_filp->f_count));
+dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n",
+req, atomic_long_read(&req->ki_filp->f_count));
 
 assert_spin_locked(&ctx->ctx_lock);
 
@@ -528,7 +528,7 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
 /* Must be done under the lock to serialise against cancellation.
 * Call this aio_fput as it duplicates fput via the fput_work.
 */
-if (unlikely(atomic_dec_and_test(&req->ki_filp->f_count))) {
+if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) {
 get_ioctx(ctx);
 spin_lock(&fput_lock);
 list_add(&req->ki_list, &fput_head);
@@ -586,7 +586,6 @@ static void use_mm(struct mm_struct *mm)
 struct task_struct *tsk = current;
 
 task_lock(tsk);
-tsk->flags |= PF_BORROWED_MM;
 active_mm = tsk->active_mm;
 atomic_inc(&mm->mm_count);
 tsk->mm = mm;
@@ -610,7 +609,6 @@ static void unuse_mm(struct mm_struct *mm)
 struct task_struct *tsk = current;
 
 task_lock(tsk);
-tsk->flags &= ~PF_BORROWED_MM;
 tsk->mm = NULL;
 /* active_mm is still 'mm' */
 enter_lazy_tlb(mm, tsk);
diff --git a/fs/attr.c b/fs/attr.c
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -51,7 +51,7 @@ int inode_change_ok(struct inode *inode, struct iattr *attr)
 }
 
 /* Check for setting the inode time. */
-if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET)) {
+if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) {
 if (!is_owner_or_cap(inode))
 goto error;
 }
@@ -108,6 +108,11 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
 struct timespec now;
 unsigned int ia_valid = attr->ia_valid;
 
+if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) {
+if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+return -EPERM;
+}
+
 now = current_fs_time(inode->i_sb);
 
 attr->ia_ctime = now;
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index bcfb2dc0a61b..2a41c2a7fc52 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -36,6 +36,7 @@ const struct file_operations autofs4_root_operations = {
 .release = dcache_dir_close,
 .read = generic_read_dir,
 .readdir = dcache_readdir,
+.llseek = dcache_dir_lseek,
 .ioctl = autofs4_root_ioctl,
 };
 
@@ -44,6 +45,7 @@ const struct file_operations autofs4_dir_operations = {
 .release = dcache_dir_close,
 .read = generic_read_dir,
 .readdir = dcache_readdir,
+.llseek = dcache_dir_lseek,
 };
 
 const struct inode_operations autofs4_indirect_root_inode_operations = {
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index f1c2ea8342f5..5f1538c03b1b 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -243,8 +243,7 @@ static int bad_inode_readlink(struct dentry *dentry, char __user *buffer,
 return -EIO;
 }
 
-static int bad_inode_permission(struct inode *inode, int mask,
-struct nameidata *nd)
+static int bad_inode_permission(struct inode *inode, int mask)
 {
 return -EIO;
 }
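As in the fs/afs hunks earlier, this reflects the new ->permission() prototype: the struct nameidata argument is dropped, so implementations receive only the inode and the access mask. A hypothetical filesystem would now wire it up as follows (sketch, not taken from this patch):

    /* Sketch: post-change ->permission() signature; names are made up. */
    static int examplefs_permission(struct inode *inode, int mask)
    {
            /* grant or deny based on MAY_READ/MAY_WRITE/MAY_EXEC bits in mask */
            return 0;
    }

    static const struct inode_operations examplefs_dir_inode_operations = {
            .permission = examplefs_permission,
    };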
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index e8717de3bab3..740f53672a8a 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -66,6 +66,7 @@ static struct kmem_cache *befs_inode_cachep;
 static const struct file_operations befs_dir_operations = {
 .read = generic_read_dir,
 .readdir = befs_readdir,
+.llseek = generic_file_llseek,
 };
 
 static const struct inode_operations befs_dir_inode_operations = {
@@ -289,7 +290,7 @@ befs_destroy_inode(struct inode *inode)
 kmem_cache_free(befs_inode_cachep, BEFS_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 struct befs_inode_info *bi = (struct befs_inode_info *) foo;
 
diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h
index 70f5d3a8eede..7109e451abf7 100644
--- a/fs/bfs/bfs.h
+++ b/fs/bfs/bfs.h
@@ -16,8 +16,9 @@ struct bfs_sb_info {
 unsigned long si_freei;
 unsigned long si_lf_eblk;
 unsigned long si_lasti;
-unsigned long * si_imap;
-struct buffer_head * si_sbh; /* buffer header w/superblock */
+unsigned long *si_imap;
+struct buffer_head *si_sbh; /* buffer header w/superblock */
+struct mutex bfs_lock;
 };
 
 /*
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 034950cb3cbe..ed8feb052df9 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -32,16 +32,17 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
 struct inode *dir = f->f_path.dentry->d_inode;
 struct buffer_head *bh;
 struct bfs_dirent *de;
+struct bfs_sb_info *info = BFS_SB(dir->i_sb);
 unsigned int offset;
 int block;
 
-lock_kernel();
+mutex_lock(&info->bfs_lock);
 
 if (f->f_pos & (BFS_DIRENT_SIZE - 1)) {
 printf("Bad f_pos=%08lx for %s:%08lx\n",
 (unsigned long)f->f_pos,
 dir->i_sb->s_id, dir->i_ino);
-unlock_kernel();
+mutex_unlock(&info->bfs_lock);
 return -EBADF;
 }
 
@@ -61,7 +62,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
 le16_to_cpu(de->ino),
 DT_UNKNOWN) < 0) {
 brelse(bh);
-unlock_kernel();
+mutex_unlock(&info->bfs_lock);
 return 0;
 }
 }
@@ -71,7 +72,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
 brelse(bh);
 }
 
-unlock_kernel();
+mutex_unlock(&info->bfs_lock);
 return 0;
 }
 
@@ -95,10 +96,10 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
 inode = new_inode(s);
 if (!inode)
 return -ENOSPC;
-lock_kernel();
+mutex_lock(&info->bfs_lock);
 ino = find_first_zero_bit(info->si_imap, info->si_lasti);
 if (ino > info->si_lasti) {
-unlock_kernel();
+mutex_unlock(&info->bfs_lock);
 iput(inode);
 return -ENOSPC;
 }
@@ -124,11 +125,11 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
 inode->i_ino);
 if (err) {
 inode_dec_link_count(inode);
+mutex_unlock(&info->bfs_lock);
 iput(inode);
-unlock_kernel();
 return err;
 }
-unlock_kernel();
+mutex_unlock(&info->bfs_lock);
 d_instantiate(dentry, inode);
 return 0;
 }
@@ -139,22 +140,23 @@ static struct dentry *bfs_lookup(struct inode *dir, struct dentry *dentry,
 struct inode *inode = NULL;
 struct buffer_head *bh;
 struct bfs_dirent *de;
+struct bfs_sb_info *info = BFS_SB(dir->i_sb);
 
 if (dentry->d_name.len > BFS_NAMELEN)
 return ERR_PTR(-ENAMETOOLONG);
 
-lock_kernel();
+mutex_lock(&info->bfs_lock);
 bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de);
 if (bh) {
 unsigned long ino = (unsigned long)le16_to_cpu(de->ino);
 brelse(bh);
 inode = bfs_iget(dir->i_sb, ino);
 if (IS_ERR(inode)) {
-unlock_kernel();
+mutex_unlock(&info->bfs_lock);
 return ERR_CAST(inode);
 }
 }
-unlock_kernel();
+mutex_unlock(&info->bfs_lock);
 d_add(dentry, inode);
 return NULL;
 }
@@ -163,13 +165,14 @@ static int bfs_link(struct dentry *old, struct inode *dir,
 struct dentry *new)
 {
 struct inode *inode = old->d_inode;
+struct bfs_sb_info *info = BFS_SB(inode->i_sb);
 int err;
 
-lock_kernel();
+mutex_lock(&info->bfs_lock);
 err = bfs_add_entry(dir, new->d_name.name, new->d_name.len,
 inode->i_ino);
 if (err) {
-unlock_kernel();
+mutex_unlock(&info->bfs_lock);
 return err;
 }
 inc_nlink(inode);
@@ -177,19 +180,19 @@ static int bfs_link(struct dentry *old, struct inode *dir,
 mark_inode_dirty(inode);
 atomic_inc(&inode->i_count);
 d_instantiate(new, inode);
-unlock_kernel();
+mutex_unlock(&info->bfs_lock);
 return 0;
 }
 
 static int bfs_unlink(struct inode *dir, struct dentry *dentry)
 {
 int error = -ENOENT;
-struct inode *inode;
+struct inode *inode = dentry->d_inode;
 struct buffer_head *bh;
 struct bfs_dirent *de;
+struct bfs_sb_info *info = BFS_SB(inode->i_sb);
 
-inode = dentry->d_inode;
-lock_kernel();
+mutex_lock(&info->bfs_lock);
 bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de);
 if (!bh || (le16_to_cpu(de->ino) != inode->i_ino))
 goto out_brelse;
@@ -210,7 +213,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry)
 
 out_brelse:
 brelse(bh);
-unlock_kernel();
+mutex_unlock(&info->bfs_lock);
 return error;
 }
 
@@ -220,6 +223,7 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 struct inode *old_inode, *new_inode;
 struct buffer_head *old_bh, *new_bh;
 struct bfs_dirent *old_de, *new_de;
+struct bfs_sb_info *info;
 int error = -ENOENT;
 
 old_bh = new_bh = NULL;
@@ -227,7 +231,9 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 if (S_ISDIR(old_inode->i_mode))
 return -EINVAL;
 
-lock_kernel();
+info = BFS_SB(old_inode->i_sb);
+
+mutex_lock(&info->bfs_lock);
 old_bh = bfs_find_entry(old_dir,
 old_dentry->d_name.name,
 old_dentry->d_name.len, &old_de);
@@ -264,7 +270,7 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 error = 0;
 
 end_rename:
-unlock_kernel();
+mutex_unlock(&info->bfs_lock);
 brelse(old_bh);
 brelse(new_bh);
 return error;
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index b11e63e8fbcd..6a021265f018 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -99,7 +99,7 @@ static int bfs_get_block(struct inode *inode, sector_t block,
 return -ENOSPC;
 
 /* The rest has to be protected against itself. */
-lock_kernel();
+mutex_lock(&info->bfs_lock);
 
 /*
 * If the last data block for this file is the last allocated
@@ -151,7 +151,7 @@ static int bfs_get_block(struct inode *inode, sector_t block,
 mark_buffer_dirty(sbh);
 map_bh(bh_result, sb, phys);
 out:
-unlock_kernel();
+mutex_unlock(&info->bfs_lock);
 return err;
 }
 
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 8db623838b50..0ed57b5ee012 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -104,6 +104,7 @@ static int bfs_write_inode(struct inode *inode, int unused)
 struct bfs_inode *di;
 struct buffer_head *bh;
 int block, off;
+struct bfs_sb_info *info = BFS_SB(inode->i_sb);
 
 dprintf("ino=%08x\n", ino);
 
@@ -112,13 +113,13 @@ static int bfs_write_inode(struct inode *inode, int unused)
 return -EIO;
 }
 
-lock_kernel();
+mutex_lock(&info->bfs_lock);
 block = (ino - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1;
 bh = sb_bread(inode->i_sb, block);
 if (!bh) {
 printf("Unable to read inode %s:%08x\n",
 inode->i_sb->s_id, ino);
-unlock_kernel();
+mutex_unlock(&info->bfs_lock);
 return -EIO;
 }
 
@@ -145,7 +146,7 @@ static int bfs_write_inode(struct inode *inode, int unused)
 
 mark_buffer_dirty(bh);
 brelse(bh);
-unlock_kernel();
+mutex_unlock(&info->bfs_lock);
 return 0;
 }
 
@@ -170,7 +171,7 @@ static void bfs_delete_inode(struct inode *inode)
 
 inode->i_size = 0;
 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
-lock_kernel();
+mutex_lock(&info->bfs_lock);
 mark_inode_dirty(inode);
 
 block = (ino - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1;
@@ -178,7 +179,7 @@ static void bfs_delete_inode(struct inode *inode)
 if (!bh) {
 printf("Unable to read inode %s:%08lx\n",
 inode->i_sb->s_id, ino);
-unlock_kernel();
+mutex_unlock(&info->bfs_lock);
 return;
 }
 off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK;
@@ -204,14 +205,16 @@ static void bfs_delete_inode(struct inode *inode)
 info->si_lf_eblk = bi->i_sblock - 1;
 mark_buffer_dirty(info->si_sbh);
 }
-unlock_kernel();
+mutex_unlock(&info->bfs_lock);
 clear_inode(inode);
 }
 
 static void bfs_put_super(struct super_block *s)
 {
 struct bfs_sb_info *info = BFS_SB(s);
+
 brelse(info->si_sbh);
+mutex_destroy(&info->bfs_lock);
 kfree(info->si_imap);
 kfree(info);
 s->s_fs_info = NULL;
@@ -236,11 +239,13 @@ static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 static void bfs_write_super(struct super_block *s)
 {
-lock_kernel();
+struct bfs_sb_info *info = BFS_SB(s);
+
+mutex_lock(&info->bfs_lock);
 if (!(s->s_flags & MS_RDONLY))
-mark_buffer_dirty(BFS_SB(s)->si_sbh);
+mark_buffer_dirty(info->si_sbh);
 s->s_dirt = 0;
-unlock_kernel();
+mutex_unlock(&info->bfs_lock);
 }
 
 static struct kmem_cache *bfs_inode_cachep;
@@ -259,7 +264,7 @@ static void bfs_destroy_inode(struct inode *inode)
 kmem_cache_free(bfs_inode_cachep, BFS_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 struct bfs_inode_info *bi = foo;
 
@@ -380,7 +385,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 struct bfs_inode *di;
 int block = (i - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1;
 int off = (i - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK;
-unsigned long sblock, eblock;
+unsigned long eblock;
 
 if (!off) {
 brelse(bh);
@@ -399,7 +404,6 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 set_bit(i, info->si_imap);
 info->si_freeb -= BFS_FILEBLOCKS(di);
 
-sblock = le32_to_cpu(di->i_sblock);
 eblock = le32_to_cpu(di->i_eblock);
 if (eblock > info->si_lf_eblk)
 info->si_lf_eblk = eblock;
@@ -410,6 +414,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 s->s_dirt = 1;
 }
 dump_imap("read_super", s);
+mutex_init(&info->bfs_lock);
 return 0;
 
 out:
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index ba4cddb92f1d..204cfd1d7676 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -444,12 +444,6 @@ beyond_if:
 regs->gp = ex.a_gpvalue;
 #endif
 start_thread(regs, ex.a_entry, current->mm->start_stack);
-if (unlikely(current->ptrace & PT_PTRACED)) {
-if (current->ptrace & PT_TRACE_EXEC)
-ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
-else
-send_sig(SIGTRAP, current, 0);
-}
 return 0;
 }
 
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 639d2d8b5710..655ed8d30a86 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -131,6 +131,15 @@ static int padzero(unsigned long elf_bss)
 #define STACK_ALLOC(sp, len) ({ sp -= len ; sp; })
 #endif
 
+#ifndef ELF_BASE_PLATFORM
+/*
+* AT_BASE_PLATFORM indicates the "real" hardware/microarchitecture.
+* If the arch defines ELF_BASE_PLATFORM (in asm/elf.h), the value
+* will be copied to the user stack in the same manner as AT_PLATFORM.
+*/
+#define ELF_BASE_PLATFORM NULL
+#endif
+
 static int
 create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 unsigned long load_addr, unsigned long interp_load_addr)
@@ -142,7 +151,9 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 elf_addr_t __user *envp;
 elf_addr_t __user *sp;
 elf_addr_t __user *u_platform;
+elf_addr_t __user *u_base_platform;
 const char *k_platform = ELF_PLATFORM;
+const char *k_base_platform = ELF_BASE_PLATFORM;
 int items;
 elf_addr_t *elf_info;
 int ei_index = 0;
@@ -172,6 +183,19 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 return -EFAULT;
 }
 
+/*
+* If this architecture has a "base" platform capability
+* string, copy it to userspace.
+*/
+u_base_platform = NULL;
+if (k_base_platform) {
+size_t len = strlen(k_base_platform) + 1;
+
+u_base_platform = (elf_addr_t __user *)STACK_ALLOC(p, len);
+if (__copy_to_user(u_base_platform, k_base_platform, len))
+return -EFAULT;
+}
+
 /* Create the ELF interpreter info */
 elf_info = (elf_addr_t *)current->mm->saved_auxv;
 /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */
@@ -209,6 +233,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 NEW_AUX_ENT(AT_PLATFORM,
 (elf_addr_t)(unsigned long)u_platform);
 }
+if (k_base_platform) {
+NEW_AUX_ENT(AT_BASE_PLATFORM,
+(elf_addr_t)(unsigned long)u_base_platform);
+}
 if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) {
 NEW_AUX_ENT(AT_EXECFD, bprm->interp_data);
 }
@@ -975,12 +1003,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 #endif
 
 start_thread(regs, elf_entry, bprm->p);
-if (unlikely(current->ptrace & PT_PTRACED)) {
-if (current->ptrace & PT_TRACE_EXEC)
-ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
-else
-send_sig(SIGTRAP, current, 0);
-}
 retval = 0;
 out:
 kfree(loc);
@@ -1478,7 +1500,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
 const struct user_regset_view *view = task_user_regset_view(dump_task);
 struct elf_thread_core_info *t;
 struct elf_prpsinfo *psinfo;
-struct task_struct *g, *p;
+struct core_thread *ct;
 unsigned int i;
 
 info->size = 0;
@@ -1517,31 +1539,26 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
 /*
 * Allocate a structure for each thread.
 */
-rcu_read_lock();
-do_each_thread(g, p)
-if (p->mm == dump_task->mm) {
-t = kzalloc(offsetof(struct elf_thread_core_info,
-notes[info->thread_notes]),
-GFP_ATOMIC);
-if (unlikely(!t)) {
-rcu_read_unlock();
-return 0;
-}
-t->task = p;
-if (p == dump_task || !info->thread) {
-t->next = info->thread;
-info->thread = t;
-} else {
-/*
-* Make sure to keep the original task at
-* the head of the list.
-*/
-t->next = info->thread->next;
-info->thread->next = t;
-}
-}
-while_each_thread(g, p);
-rcu_read_unlock();
+for (ct = &dump_task->mm->core_state->dumper; ct; ct = ct->next) {
+t = kzalloc(offsetof(struct elf_thread_core_info,
+notes[info->thread_notes]),
+GFP_KERNEL);
+if (unlikely(!t))
+return 0;
+
+t->task = ct->task;
+if (ct->task == dump_task || !info->thread) {
+t->next = info->thread;
+info->thread = t;
+} else {
+/*
+* Make sure to keep the original task at
+* the head of the list.
+*/
+t->next = info->thread->next;
+info->thread->next = t;
+}
+}
 
 /*
 * Now fill in each thread's information.
@@ -1688,7 +1705,6 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
 {
 #define NUM_NOTES 6
 struct list_head *t;
-struct task_struct *g, *p;
 
 info->notes = NULL;
 info->prstatus = NULL;
@@ -1720,20 +1736,19 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
 
 info->thread_status_size = 0;
 if (signr) {
+struct core_thread *ct;
 struct elf_thread_status *ets;
-rcu_read_lock();
-do_each_thread(g, p)
-if (current->mm == p->mm && current != p) {
-ets = kzalloc(sizeof(*ets), GFP_ATOMIC);
-if (!ets) {
-rcu_read_unlock();
-return 0;
-}
-ets->thread = p;
-list_add(&ets->list, &info->thread_list);
-}
-while_each_thread(g, p);
-rcu_read_unlock();
+
+for (ct = current->mm->core_state->dumper.next;
+ct; ct = ct->next) {
+ets = kzalloc(sizeof(*ets), GFP_KERNEL);
+if (!ets)
+return 0;
+
+ets->thread = ct->task;
+list_add(&ets->list, &info->thread_list);
+}
+
 list_for_each(t, &info->thread_list) {
 int sz;
 
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index d051a32e6270..80c1f952ef78 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c | |||
| @@ -433,13 +433,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, | |||
| 433 | entryaddr = interp_params.entry_addr ?: exec_params.entry_addr; | 433 | entryaddr = interp_params.entry_addr ?: exec_params.entry_addr; |
| 434 | start_thread(regs, entryaddr, current->mm->start_stack); | 434 | start_thread(regs, entryaddr, current->mm->start_stack); |
| 435 | 435 | ||
| 436 | if (unlikely(current->ptrace & PT_PTRACED)) { | ||
| 437 | if (current->ptrace & PT_TRACE_EXEC) | ||
| 438 | ptrace_notify((PTRACE_EVENT_EXEC << 8) | SIGTRAP); | ||
| 439 | else | ||
| 440 | send_sig(SIGTRAP, current, 0); | ||
| 441 | } | ||
| 442 | |||
| 443 | retval = 0; | 436 | retval = 0; |
| 444 | 437 | ||
| 445 | error: | 438 | error: |
| @@ -477,6 +470,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, | |||
| 477 | char __user *u_platform, *p; | 470 | char __user *u_platform, *p; |
| 478 | long hwcap; | 471 | long hwcap; |
| 479 | int loop; | 472 | int loop; |
| 473 | int nr; /* reset for each csp adjustment */ | ||
| 480 | 474 | ||
| 481 | /* we're going to shovel a whole load of stuff onto the stack */ | 475 | /* we're going to shovel a whole load of stuff onto the stack */ |
| 482 | #ifdef CONFIG_MMU | 476 | #ifdef CONFIG_MMU |
| @@ -549,10 +543,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, | |||
| 549 | /* force 16 byte _final_ alignment here for generality */ | 543 | /* force 16 byte _final_ alignment here for generality */ |
| 550 | #define DLINFO_ITEMS 13 | 544 | #define DLINFO_ITEMS 13 |
| 551 | 545 | ||
| 552 | nitems = 1 + DLINFO_ITEMS + (k_platform ? 1 : 0); | 546 | nitems = 1 + DLINFO_ITEMS + (k_platform ? 1 : 0) + AT_VECTOR_SIZE_ARCH; |
| 553 | #ifdef DLINFO_ARCH_ITEMS | ||
| 554 | nitems += DLINFO_ARCH_ITEMS; | ||
| 555 | #endif | ||
| 556 | 547 | ||
| 557 | csp = sp; | 548 | csp = sp; |
| 558 | sp -= nitems * 2 * sizeof(unsigned long); | 549 | sp -= nitems * 2 * sizeof(unsigned long); |
| @@ -564,39 +555,46 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, | |||
| 564 | sp -= sp & 15UL; | 555 | sp -= sp & 15UL; |
| 565 | 556 | ||
| 566 | /* put the ELF interpreter info on the stack */ | 557 | /* put the ELF interpreter info on the stack */ |
| 567 | #define NEW_AUX_ENT(nr, id, val) \ | 558 | #define NEW_AUX_ENT(id, val) \ |
| 568 | do { \ | 559 | do { \ |
| 569 | struct { unsigned long _id, _val; } __user *ent; \ | 560 | struct { unsigned long _id, _val; } __user *ent; \ |
| 570 | \ | 561 | \ |
| 571 | ent = (void __user *) csp; \ | 562 | ent = (void __user *) csp; \ |
| 572 | __put_user((id), &ent[nr]._id); \ | 563 | __put_user((id), &ent[nr]._id); \ |
| 573 | __put_user((val), &ent[nr]._val); \ | 564 | __put_user((val), &ent[nr]._val); \ |
| 565 | nr++; \ | ||
| 574 | } while (0) | 566 | } while (0) |
| 575 | 567 | ||
| 568 | nr = 0; | ||
| 576 | csp -= 2 * sizeof(unsigned long); | 569 | csp -= 2 * sizeof(unsigned long); |
| 577 | NEW_AUX_ENT(0, AT_NULL, 0); | 570 | NEW_AUX_ENT(AT_NULL, 0); |
| 578 | if (k_platform) { | 571 | if (k_platform) { |
| 572 | nr = 0; | ||
| 579 | csp -= 2 * sizeof(unsigned long); | 573 | csp -= 2 * sizeof(unsigned long); |
| 580 | NEW_AUX_ENT(0, AT_PLATFORM, | 574 | NEW_AUX_ENT(AT_PLATFORM, |
| 581 | (elf_addr_t) (unsigned long) u_platform); | 575 | (elf_addr_t) (unsigned long) u_platform); |
| 582 | } | 576 | } |
| 583 | 577 | ||
| 578 | nr = 0; | ||
| 584 | csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long); | 579 | csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long); |
| 585 | NEW_AUX_ENT( 0, AT_HWCAP, hwcap); | 580 | NEW_AUX_ENT(AT_HWCAP, hwcap); |
| 586 | NEW_AUX_ENT( 1, AT_PAGESZ, PAGE_SIZE); | 581 | NEW_AUX_ENT(AT_PAGESZ, PAGE_SIZE); |
| 587 | NEW_AUX_ENT( 2, AT_CLKTCK, CLOCKS_PER_SEC); | 582 | NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC); |
| 588 | NEW_AUX_ENT( 3, AT_PHDR, exec_params->ph_addr); | 583 | NEW_AUX_ENT(AT_PHDR, exec_params->ph_addr); |
| 589 | NEW_AUX_ENT( 4, AT_PHENT, sizeof(struct elf_phdr)); | 584 | NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr)); |
| 590 | NEW_AUX_ENT( 5, AT_PHNUM, exec_params->hdr.e_phnum); | 585 | NEW_AUX_ENT(AT_PHNUM, exec_params->hdr.e_phnum); |
| 591 | NEW_AUX_ENT( 6, AT_BASE, interp_params->elfhdr_addr); | 586 | NEW_AUX_ENT(AT_BASE, interp_params->elfhdr_addr); |
| 592 | NEW_AUX_ENT( 7, AT_FLAGS, 0); | 587 | NEW_AUX_ENT(AT_FLAGS, 0); |
| 593 | NEW_AUX_ENT( 8, AT_ENTRY, exec_params->entry_addr); | 588 | NEW_AUX_ENT(AT_ENTRY, exec_params->entry_addr); |
| 594 | NEW_AUX_ENT( 9, AT_UID, (elf_addr_t) current->uid); | 589 | NEW_AUX_ENT(AT_UID, (elf_addr_t) current->uid); |
| 595 | NEW_AUX_ENT(10, AT_EUID, (elf_addr_t) current->euid); | 590 | NEW_AUX_ENT(AT_EUID, (elf_addr_t) current->euid); |
| 596 | NEW_AUX_ENT(11, AT_GID, (elf_addr_t) current->gid); | 591 | NEW_AUX_ENT(AT_GID, (elf_addr_t) current->gid); |
| 597 | NEW_AUX_ENT(12, AT_EGID, (elf_addr_t) current->egid); | 592 | NEW_AUX_ENT(AT_EGID, (elf_addr_t) current->egid); |
| 598 | 593 | ||
| 599 | #ifdef ARCH_DLINFO | 594 | #ifdef ARCH_DLINFO |
| 595 | nr = 0; | ||
| 596 | csp -= AT_VECTOR_SIZE_ARCH * 2 * sizeof(unsigned long); | ||
| 597 | |||
| 600 | /* ARCH_DLINFO must come last so platform specific code can enforce | 598 | /* ARCH_DLINFO must come last so platform specific code can enforce |
| 601 | * special alignment requirements on the AUXV if necessary (eg. PPC). | 599 | * special alignment requirements on the AUXV if necessary (eg. PPC). |
| 602 | */ | 600 | */ |
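The nitems change above folds the arch-specific auxv entries into the generic sizing via AT_VECTOR_SIZE_ARCH, while the reworked NEW_AUX_ENT keeps its own running index instead of taking one from the caller. A rough sketch of how much stack the vector now reserves, assuming the same DLINFO_ITEMS and AT_VECTOR_SIZE_ARCH definitions used in this function (the helper itself is made up for illustration):

	static unsigned long auxv_stack_bytes(int have_platform)
	{
		unsigned long nitems;

		nitems = 1			/* AT_NULL terminator          */
		       + DLINFO_ITEMS		/* the 13 generic entries      */
		       + (have_platform ? 1 : 0)/* optional AT_PLATFORM        */
		       + AT_VECTOR_SIZE_ARCH;	/* ARCH_DLINFO entries, if any */

		/* each entry is an (id, value) pair of unsigned longs */
		return nitems * 2 * sizeof(unsigned long);
	}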
| @@ -1573,7 +1571,6 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, | |||
| 1573 | struct memelfnote *notes = NULL; | 1571 | struct memelfnote *notes = NULL; |
| 1574 | struct elf_prstatus *prstatus = NULL; /* NT_PRSTATUS */ | 1572 | struct elf_prstatus *prstatus = NULL; /* NT_PRSTATUS */ |
| 1575 | struct elf_prpsinfo *psinfo = NULL; /* NT_PRPSINFO */ | 1573 | struct elf_prpsinfo *psinfo = NULL; /* NT_PRPSINFO */ |
| 1576 | struct task_struct *g, *p; | ||
| 1577 | LIST_HEAD(thread_list); | 1574 | LIST_HEAD(thread_list); |
| 1578 | struct list_head *t; | 1575 | struct list_head *t; |
| 1579 | elf_fpregset_t *fpu = NULL; | 1576 | elf_fpregset_t *fpu = NULL; |
| @@ -1622,20 +1619,19 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, | |||
| 1622 | #endif | 1619 | #endif |
| 1623 | 1620 | ||
| 1624 | if (signr) { | 1621 | if (signr) { |
| 1622 | struct core_thread *ct; | ||
| 1625 | struct elf_thread_status *tmp; | 1623 | struct elf_thread_status *tmp; |
| 1626 | rcu_read_lock(); | 1624 | |
| 1627 | do_each_thread(g,p) | 1625 | for (ct = current->mm->core_state->dumper.next; |
| 1628 | if (current->mm == p->mm && current != p) { | 1626 | ct; ct = ct->next) { |
| 1629 | tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC); | 1627 | tmp = kzalloc(sizeof(*tmp), GFP_KERNEL); |
| 1630 | if (!tmp) { | 1628 | if (!tmp) |
| 1631 | rcu_read_unlock(); | 1629 | goto cleanup; |
| 1632 | goto cleanup; | 1630 | |
| 1633 | } | 1631 | tmp->thread = ct->task; |
| 1634 | tmp->thread = p; | 1632 | list_add(&tmp->list, &thread_list); |
| 1635 | list_add(&tmp->list, &thread_list); | 1633 | } |
| 1636 | } | 1634 | |
| 1637 | while_each_thread(g,p); | ||
| 1638 | rcu_read_unlock(); | ||
| 1639 | list_for_each(t, &thread_list) { | 1635 | list_for_each(t, &thread_list) { |
| 1640 | struct elf_thread_status *tmp; | 1636 | struct elf_thread_status *tmp; |
| 1641 | int sz; | 1637 | int sz; |
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 2cb1acda3a82..dfc0197905ca 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c | |||
| @@ -914,15 +914,14 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs) | |||
| 914 | /* Stash our initial stack pointer into the mm structure */ | 914 | /* Stash our initial stack pointer into the mm structure */ |
| 915 | current->mm->start_stack = (unsigned long )sp; | 915 | current->mm->start_stack = (unsigned long )sp; |
| 916 | 916 | ||
| 917 | 917 | #ifdef FLAT_PLAT_INIT | |
| 918 | FLAT_PLAT_INIT(regs); | ||
| 919 | #endif | ||
| 918 | DBG_FLT("start_thread(regs=0x%x, entry=0x%x, start_stack=0x%x)\n", | 920 | DBG_FLT("start_thread(regs=0x%x, entry=0x%x, start_stack=0x%x)\n", |
| 919 | (int)regs, (int)start_addr, (int)current->mm->start_stack); | 921 | (int)regs, (int)start_addr, (int)current->mm->start_stack); |
| 920 | 922 | ||
| 921 | start_thread(regs, start_addr, current->mm->start_stack); | 923 | start_thread(regs, start_addr, current->mm->start_stack); |
| 922 | 924 | ||
| 923 | if (current->ptrace & PT_PTRACED) | ||
| 924 | send_sig(SIGTRAP, current, 0); | ||
| 925 | |||
| 926 | return 0; | 925 | return 0; |
| 927 | } | 926 | } |
| 928 | 927 | ||
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 756205314c24..8d7e88e02e0f 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c | |||
| @@ -120,8 +120,6 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) | |||
| 120 | if (bprm->misc_bang) | 120 | if (bprm->misc_bang) |
| 121 | goto _ret; | 121 | goto _ret; |
| 122 | 122 | ||
| 123 | bprm->misc_bang = 1; | ||
| 124 | |||
| 125 | /* to keep locking time low, we copy the interpreter string */ | 123 | /* to keep locking time low, we copy the interpreter string */ |
| 126 | read_lock(&entries_lock); | 124 | read_lock(&entries_lock); |
| 127 | fmt = check_file(bprm); | 125 | fmt = check_file(bprm); |
| @@ -199,6 +197,8 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) | |||
| 199 | if (retval < 0) | 197 | if (retval < 0) |
| 200 | goto _error; | 198 | goto _error; |
| 201 | 199 | ||
| 200 | bprm->misc_bang = 1; | ||
| 201 | |||
| 202 | retval = search_binary_handler (bprm, regs); | 202 | retval = search_binary_handler (bprm, regs); |
| 203 | if (retval < 0) | 203 | if (retval < 0) |
| 204 | goto _error; | 204 | goto _error; |
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index fdc36bfd6a7b..68be580ba289 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c | |||
| @@ -274,8 +274,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) | |||
| 274 | map_hpux_gateway_page(current,current->mm); | 274 | map_hpux_gateway_page(current,current->mm); |
| 275 | 275 | ||
| 276 | start_thread_som(regs, som_entry, bprm->p); | 276 | start_thread_som(regs, som_entry, bprm->p); |
| 277 | if (current->ptrace & PT_PTRACED) | ||
| 278 | send_sig(SIGTRAP, current, 0); | ||
| 279 | return 0; | 277 | return 0; |
| 280 | 278 | ||
| 281 | /* error cleanup */ | 279 | /* error cleanup */ |
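binfmt_elf_fdpic, binfmt_flat and binfmt_som all drop their open-coded "send SIGTRAP if traced" blocks in these hunks. The apparent intent is that the exec notification is delivered once, generically, from the exec path rather than per loader; a sketch of that idea, assuming the 2.6.27-era tracehook interface (the wrapper function below is made up, only tracehook_report_exec() is real):

	/*
	 * Not from this diff: roughly what the generic path does for every
	 * binfmt, replacing the per-loader PT_PTRACED/SIGTRAP checks.
	 */
	static int run_loader(struct linux_binfmt *fmt, struct linux_binprm *bprm,
			      struct pt_regs *regs)
	{
		int retval = fmt->load_binary(bprm, regs);

		if (retval >= 0)
			tracehook_report_exec(fmt, bprm, regs);

		return retval;
	}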
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 63e2ee63058d..c3e174b35fe6 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c | |||
| @@ -705,7 +705,6 @@ void __init bio_integrity_init_slab(void) | |||
| 705 | bio_integrity_slab = KMEM_CACHE(bio_integrity_payload, | 705 | bio_integrity_slab = KMEM_CACHE(bio_integrity_payload, |
| 706 | SLAB_HWCACHE_ALIGN|SLAB_PANIC); | 706 | SLAB_HWCACHE_ALIGN|SLAB_PANIC); |
| 707 | } | 707 | } |
| 708 | EXPORT_SYMBOL(bio_integrity_init_slab); | ||
| 709 | 708 | ||
| 710 | static int __init integrity_init(void) | 709 | static int __init integrity_init(void) |
| 711 | { | 710 | { |
| @@ -77,11 +77,8 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct | |||
| 77 | */ | 77 | */ |
| 78 | 78 | ||
| 79 | bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask); | 79 | bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask); |
| 80 | if (bvl) { | 80 | if (bvl) |
| 81 | struct biovec_slab *bp = bvec_slabs + *idx; | 81 | memset(bvl, 0, bvec_nr_vecs(*idx) * sizeof(struct bio_vec)); |
| 82 | |||
| 83 | memset(bvl, 0, bp->nr_vecs * sizeof(struct bio_vec)); | ||
| 84 | } | ||
| 85 | 82 | ||
| 86 | return bvl; | 83 | return bvl; |
| 87 | } | 84 | } |
| @@ -149,7 +146,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) | |||
| 149 | goto out; | 146 | goto out; |
| 150 | } | 147 | } |
| 151 | bio->bi_flags |= idx << BIO_POOL_OFFSET; | 148 | bio->bi_flags |= idx << BIO_POOL_OFFSET; |
| 152 | bio->bi_max_vecs = bvec_slabs[idx].nr_vecs; | 149 | bio->bi_max_vecs = bvec_nr_vecs(idx); |
| 153 | } | 150 | } |
| 154 | bio->bi_io_vec = bvl; | 151 | bio->bi_io_vec = bvl; |
| 155 | } | 152 | } |
| @@ -472,20 +469,21 @@ static void bio_free_map_data(struct bio_map_data *bmd) | |||
| 472 | kfree(bmd); | 469 | kfree(bmd); |
| 473 | } | 470 | } |
| 474 | 471 | ||
| 475 | static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count) | 472 | static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count, |
| 473 | gfp_t gfp_mask) | ||
| 476 | { | 474 | { |
| 477 | struct bio_map_data *bmd = kmalloc(sizeof(*bmd), GFP_KERNEL); | 475 | struct bio_map_data *bmd = kmalloc(sizeof(*bmd), gfp_mask); |
| 478 | 476 | ||
| 479 | if (!bmd) | 477 | if (!bmd) |
| 480 | return NULL; | 478 | return NULL; |
| 481 | 479 | ||
| 482 | bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, GFP_KERNEL); | 480 | bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, gfp_mask); |
| 483 | if (!bmd->iovecs) { | 481 | if (!bmd->iovecs) { |
| 484 | kfree(bmd); | 482 | kfree(bmd); |
| 485 | return NULL; | 483 | return NULL; |
| 486 | } | 484 | } |
| 487 | 485 | ||
| 488 | bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, GFP_KERNEL); | 486 | bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, gfp_mask); |
| 489 | if (bmd->sgvecs) | 487 | if (bmd->sgvecs) |
| 490 | return bmd; | 488 | return bmd; |
| 491 | 489 | ||
| @@ -494,8 +492,8 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count) | |||
| 494 | return NULL; | 492 | return NULL; |
| 495 | } | 493 | } |
| 496 | 494 | ||
| 497 | static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count, | 495 | static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, |
| 498 | int uncopy) | 496 | struct sg_iovec *iov, int iov_count, int uncopy) |
| 499 | { | 497 | { |
| 500 | int ret = 0, i; | 498 | int ret = 0, i; |
| 501 | struct bio_vec *bvec; | 499 | struct bio_vec *bvec; |
| @@ -505,7 +503,7 @@ static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count, | |||
| 505 | 503 | ||
| 506 | __bio_for_each_segment(bvec, bio, i, 0) { | 504 | __bio_for_each_segment(bvec, bio, i, 0) { |
| 507 | char *bv_addr = page_address(bvec->bv_page); | 505 | char *bv_addr = page_address(bvec->bv_page); |
| 508 | unsigned int bv_len = bvec->bv_len; | 506 | unsigned int bv_len = iovecs[i].bv_len; |
| 509 | 507 | ||
| 510 | while (bv_len && iov_idx < iov_count) { | 508 | while (bv_len && iov_idx < iov_count) { |
| 511 | unsigned int bytes; | 509 | unsigned int bytes; |
| @@ -557,7 +555,7 @@ int bio_uncopy_user(struct bio *bio) | |||
| 557 | struct bio_map_data *bmd = bio->bi_private; | 555 | struct bio_map_data *bmd = bio->bi_private; |
| 558 | int ret; | 556 | int ret; |
| 559 | 557 | ||
| 560 | ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs, 1); | 558 | ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, bmd->nr_sgvecs, 1); |
| 561 | 559 | ||
| 562 | bio_free_map_data(bmd); | 560 | bio_free_map_data(bmd); |
| 563 | bio_put(bio); | 561 | bio_put(bio); |
| @@ -599,7 +597,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov, | |||
| 599 | len += iov[i].iov_len; | 597 | len += iov[i].iov_len; |
| 600 | } | 598 | } |
| 601 | 599 | ||
| 602 | bmd = bio_alloc_map_data(nr_pages, iov_count); | 600 | bmd = bio_alloc_map_data(nr_pages, iov_count, GFP_KERNEL); |
| 603 | if (!bmd) | 601 | if (!bmd) |
| 604 | return ERR_PTR(-ENOMEM); | 602 | return ERR_PTR(-ENOMEM); |
| 605 | 603 | ||
| @@ -636,7 +634,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov, | |||
| 636 | * success | 634 | * success |
| 637 | */ | 635 | */ |
| 638 | if (!write_to_vm) { | 636 | if (!write_to_vm) { |
| 639 | ret = __bio_copy_iov(bio, iov, iov_count, 0); | 637 | ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0); |
| 640 | if (ret) | 638 | if (ret) |
| 641 | goto cleanup; | 639 | goto cleanup; |
| 642 | } | 640 | } |
| @@ -721,12 +719,8 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, | |||
| 721 | const int local_nr_pages = end - start; | 719 | const int local_nr_pages = end - start; |
| 722 | const int page_limit = cur_page + local_nr_pages; | 720 | const int page_limit = cur_page + local_nr_pages; |
| 723 | 721 | ||
| 724 | down_read(¤t->mm->mmap_sem); | 722 | ret = get_user_pages_fast(uaddr, local_nr_pages, |
| 725 | ret = get_user_pages(current, current->mm, uaddr, | 723 | write_to_vm, &pages[cur_page]); |
| 726 | local_nr_pages, | ||
| 727 | write_to_vm, 0, &pages[cur_page], NULL); | ||
| 728 | up_read(¤t->mm->mmap_sem); | ||
| 729 | |||
| 730 | if (ret < local_nr_pages) { | 724 | if (ret < local_nr_pages) { |
| 731 | ret = -EFAULT; | 725 | ret = -EFAULT; |
| 732 | goto out_unmap; | 726 | goto out_unmap; |
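The hunk above switches __bio_map_user_iov() to get_user_pages_fast(), which handles mmap_sem itself when it has to fall back to the slow path. A side-by-side sketch of the two calling conventions (the wrapper name is illustrative):

	static int pin_user_buffer(unsigned long uaddr, int nr_pages, int write,
				   struct page **pages)
	{
		/* old style, as in the removed lines:
		 *	down_read(&current->mm->mmap_sem);
		 *	ret = get_user_pages(current, current->mm, uaddr, nr_pages,
		 *			     write, 0, pages, NULL);
		 *	up_read(&current->mm->mmap_sem);
		 */

		/* new style: no mmap_sem bracketing in the caller */
		return get_user_pages_fast(uaddr, nr_pages, write, pages);
	}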
| @@ -949,19 +943,22 @@ static void bio_copy_kern_endio(struct bio *bio, int err) | |||
| 949 | { | 943 | { |
| 950 | struct bio_vec *bvec; | 944 | struct bio_vec *bvec; |
| 951 | const int read = bio_data_dir(bio) == READ; | 945 | const int read = bio_data_dir(bio) == READ; |
| 952 | char *p = bio->bi_private; | 946 | struct bio_map_data *bmd = bio->bi_private; |
| 953 | int i; | 947 | int i; |
| 948 | char *p = bmd->sgvecs[0].iov_base; | ||
| 954 | 949 | ||
| 955 | __bio_for_each_segment(bvec, bio, i, 0) { | 950 | __bio_for_each_segment(bvec, bio, i, 0) { |
| 956 | char *addr = page_address(bvec->bv_page); | 951 | char *addr = page_address(bvec->bv_page); |
| 952 | int len = bmd->iovecs[i].bv_len; | ||
| 957 | 953 | ||
| 958 | if (read && !err) | 954 | if (read && !err) |
| 959 | memcpy(p, addr, bvec->bv_len); | 955 | memcpy(p, addr, len); |
| 960 | 956 | ||
| 961 | __free_page(bvec->bv_page); | 957 | __free_page(bvec->bv_page); |
| 962 | p += bvec->bv_len; | 958 | p += len; |
| 963 | } | 959 | } |
| 964 | 960 | ||
| 961 | bio_free_map_data(bmd); | ||
| 965 | bio_put(bio); | 962 | bio_put(bio); |
| 966 | } | 963 | } |
| 967 | 964 | ||
| @@ -985,11 +982,21 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, | |||
| 985 | const int nr_pages = end - start; | 982 | const int nr_pages = end - start; |
| 986 | struct bio *bio; | 983 | struct bio *bio; |
| 987 | struct bio_vec *bvec; | 984 | struct bio_vec *bvec; |
| 985 | struct bio_map_data *bmd; | ||
| 988 | int i, ret; | 986 | int i, ret; |
| 987 | struct sg_iovec iov; | ||
| 989 | 988 | ||
| 989 | iov.iov_base = data; | ||
| 990 | iov.iov_len = len; | ||
| 991 | |||
| 992 | bmd = bio_alloc_map_data(nr_pages, 1, gfp_mask); | ||
| 993 | if (!bmd) | ||
| 994 | return ERR_PTR(-ENOMEM); | ||
| 995 | |||
| 996 | ret = -ENOMEM; | ||
| 990 | bio = bio_alloc(gfp_mask, nr_pages); | 997 | bio = bio_alloc(gfp_mask, nr_pages); |
| 991 | if (!bio) | 998 | if (!bio) |
| 992 | return ERR_PTR(-ENOMEM); | 999 | goto out_bmd; |
| 993 | 1000 | ||
| 994 | while (len) { | 1001 | while (len) { |
| 995 | struct page *page; | 1002 | struct page *page; |
| @@ -1023,14 +1030,18 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, | |||
| 1023 | } | 1030 | } |
| 1024 | } | 1031 | } |
| 1025 | 1032 | ||
| 1026 | bio->bi_private = data; | 1033 | bio->bi_private = bmd; |
| 1027 | bio->bi_end_io = bio_copy_kern_endio; | 1034 | bio->bi_end_io = bio_copy_kern_endio; |
| 1035 | |||
| 1036 | bio_set_map_data(bmd, bio, &iov, 1); | ||
| 1028 | return bio; | 1037 | return bio; |
| 1029 | cleanup: | 1038 | cleanup: |
| 1030 | bio_for_each_segment(bvec, bio, i) | 1039 | bio_for_each_segment(bvec, bio, i) |
| 1031 | __free_page(bvec->bv_page); | 1040 | __free_page(bvec->bv_page); |
| 1032 | 1041 | ||
| 1033 | bio_put(bio); | 1042 | bio_put(bio); |
| 1043 | out_bmd: | ||
| 1044 | bio_free_map_data(bmd); | ||
| 1034 | 1045 | ||
| 1035 | return ERR_PTR(ret); | 1046 | return ERR_PTR(ret); |
| 1036 | } | 1047 | } |
diff --git a/fs/block_dev.c b/fs/block_dev.c index 10d8a0aa871a..aff54219e049 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c | |||
| @@ -271,7 +271,7 @@ static void bdev_destroy_inode(struct inode *inode) | |||
| 271 | kmem_cache_free(bdev_cachep, bdi); | 271 | kmem_cache_free(bdev_cachep, bdi); |
| 272 | } | 272 | } |
| 273 | 273 | ||
| 274 | static void init_once(struct kmem_cache * cachep, void *foo) | 274 | static void init_once(void *foo) |
| 275 | { | 275 | { |
| 276 | struct bdev_inode *ei = (struct bdev_inode *) foo; | 276 | struct bdev_inode *ei = (struct bdev_inode *) foo; |
| 277 | struct block_device *bdev = &ei->bdev; | 277 | struct block_device *bdev = &ei->bdev; |
| @@ -941,8 +941,10 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part) | |||
| 941 | * hooks: /n/, see "layering violations". | 941 | * hooks: /n/, see "layering violations". |
| 942 | */ | 942 | */ |
| 943 | ret = devcgroup_inode_permission(bdev->bd_inode, perm); | 943 | ret = devcgroup_inode_permission(bdev->bd_inode, perm); |
| 944 | if (ret != 0) | 944 | if (ret != 0) { |
| 945 | bdput(bdev); | ||
| 945 | return ret; | 946 | return ret; |
| 947 | } | ||
| 946 | 948 | ||
| 947 | ret = -ENXIO; | 949 | ret = -ENXIO; |
| 948 | file->f_mapping = bdev->bd_inode->i_mapping; | 950 | file->f_mapping = bdev->bd_inode->i_mapping; |
| @@ -1234,6 +1236,7 @@ fail: | |||
| 1234 | bdev = ERR_PTR(error); | 1236 | bdev = ERR_PTR(error); |
| 1235 | goto out; | 1237 | goto out; |
| 1236 | } | 1238 | } |
| 1239 | EXPORT_SYMBOL(lookup_bdev); | ||
| 1237 | 1240 | ||
| 1238 | /** | 1241 | /** |
| 1239 | * open_bdev_excl - open a block device by name and set it up for use | 1242 | * open_bdev_excl - open a block device by name and set it up for use |
diff --git a/fs/buffer.c b/fs/buffer.c index d48caee12e2a..ac78d4c19b3b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c | |||
| @@ -580,7 +580,7 @@ EXPORT_SYMBOL(mark_buffer_async_write); | |||
| 580 | /* | 580 | /* |
| 581 | * The buffer's backing address_space's private_lock must be held | 581 | * The buffer's backing address_space's private_lock must be held |
| 582 | */ | 582 | */ |
| 583 | static inline void __remove_assoc_queue(struct buffer_head *bh) | 583 | static void __remove_assoc_queue(struct buffer_head *bh) |
| 584 | { | 584 | { |
| 585 | list_del_init(&bh->b_assoc_buffers); | 585 | list_del_init(&bh->b_assoc_buffers); |
| 586 | WARN_ON(!bh->b_assoc_map); | 586 | WARN_ON(!bh->b_assoc_map); |
| @@ -706,7 +706,7 @@ static int __set_page_dirty(struct page *page, | |||
| 706 | if (TestSetPageDirty(page)) | 706 | if (TestSetPageDirty(page)) |
| 707 | return 0; | 707 | return 0; |
| 708 | 708 | ||
| 709 | write_lock_irq(&mapping->tree_lock); | 709 | spin_lock_irq(&mapping->tree_lock); |
| 710 | if (page->mapping) { /* Race with truncate? */ | 710 | if (page->mapping) { /* Race with truncate? */ |
| 711 | WARN_ON_ONCE(warn && !PageUptodate(page)); | 711 | WARN_ON_ONCE(warn && !PageUptodate(page)); |
| 712 | 712 | ||
| @@ -719,7 +719,7 @@ static int __set_page_dirty(struct page *page, | |||
| 719 | radix_tree_tag_set(&mapping->page_tree, | 719 | radix_tree_tag_set(&mapping->page_tree, |
| 720 | page_index(page), PAGECACHE_TAG_DIRTY); | 720 | page_index(page), PAGECACHE_TAG_DIRTY); |
| 721 | } | 721 | } |
| 722 | write_unlock_irq(&mapping->tree_lock); | 722 | spin_unlock_irq(&mapping->tree_lock); |
| 723 | __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); | 723 | __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); |
| 724 | 724 | ||
| 725 | return 1; | 725 | return 1; |
| @@ -1214,8 +1214,7 @@ void __brelse(struct buffer_head * buf) | |||
| 1214 | put_bh(buf); | 1214 | put_bh(buf); |
| 1215 | return; | 1215 | return; |
| 1216 | } | 1216 | } |
| 1217 | printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n"); | 1217 | WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n"); |
| 1218 | WARN_ON(1); | ||
| 1219 | } | 1218 | } |
| 1220 | 1219 | ||
| 1221 | /* | 1220 | /* |
| @@ -1721,7 +1720,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, | |||
| 1721 | */ | 1720 | */ |
| 1722 | if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { | 1721 | if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { |
| 1723 | lock_buffer(bh); | 1722 | lock_buffer(bh); |
| 1724 | } else if (test_set_buffer_locked(bh)) { | 1723 | } else if (!trylock_buffer(bh)) { |
| 1725 | redirty_page_for_writepage(wbc, page); | 1724 | redirty_page_for_writepage(wbc, page); |
| 1726 | continue; | 1725 | continue; |
| 1727 | } | 1726 | } |
| @@ -2097,6 +2096,52 @@ int generic_write_end(struct file *file, struct address_space *mapping, | |||
| 2097 | EXPORT_SYMBOL(generic_write_end); | 2096 | EXPORT_SYMBOL(generic_write_end); |
| 2098 | 2097 | ||
| 2099 | /* | 2098 | /* |
| 2099 | * block_is_partially_uptodate checks whether buffers within a page are | ||
| 2100 | * uptodate or not. | ||
| 2101 | * | ||
| 2102 | * Returns true if all buffers which correspond to a file portion | ||
| 2103 | * we want to read are uptodate. | ||
| 2104 | */ | ||
| 2105 | int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, | ||
| 2106 | unsigned long from) | ||
| 2107 | { | ||
| 2108 | struct inode *inode = page->mapping->host; | ||
| 2109 | unsigned block_start, block_end, blocksize; | ||
| 2110 | unsigned to; | ||
| 2111 | struct buffer_head *bh, *head; | ||
| 2112 | int ret = 1; | ||
| 2113 | |||
| 2114 | if (!page_has_buffers(page)) | ||
| 2115 | return 0; | ||
| 2116 | |||
| 2117 | blocksize = 1 << inode->i_blkbits; | ||
| 2118 | to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count); | ||
| 2119 | to = from + to; | ||
| 2120 | if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize) | ||
| 2121 | return 0; | ||
| 2122 | |||
| 2123 | head = page_buffers(page); | ||
| 2124 | bh = head; | ||
| 2125 | block_start = 0; | ||
| 2126 | do { | ||
| 2127 | block_end = block_start + blocksize; | ||
| 2128 | if (block_end > from && block_start < to) { | ||
| 2129 | if (!buffer_uptodate(bh)) { | ||
| 2130 | ret = 0; | ||
| 2131 | break; | ||
| 2132 | } | ||
| 2133 | if (block_end >= to) | ||
| 2134 | break; | ||
| 2135 | } | ||
| 2136 | block_start = block_end; | ||
| 2137 | bh = bh->b_this_page; | ||
| 2138 | } while (bh != head); | ||
| 2139 | |||
| 2140 | return ret; | ||
| 2141 | } | ||
| 2142 | EXPORT_SYMBOL(block_is_partially_uptodate); | ||
| 2143 | |||
| 2144 | /* | ||
| 2100 | * Generic "read page" function for block devices that have the normal | 2145 | * Generic "read page" function for block devices that have the normal |
| 2101 | * get_block functionality. This is most of the block device filesystems. | 2146 | * get_block functionality. This is most of the block device filesystems. |
| 2102 | * Reads the page asynchronously --- the unlock_buffer() and | 2147 | * Reads the page asynchronously --- the unlock_buffer() and |
| @@ -2881,14 +2926,17 @@ int submit_bh(int rw, struct buffer_head * bh) | |||
| 2881 | BUG_ON(!buffer_mapped(bh)); | 2926 | BUG_ON(!buffer_mapped(bh)); |
| 2882 | BUG_ON(!bh->b_end_io); | 2927 | BUG_ON(!bh->b_end_io); |
| 2883 | 2928 | ||
| 2884 | if (buffer_ordered(bh) && (rw == WRITE)) | 2929 | /* |
| 2885 | rw = WRITE_BARRIER; | 2930 | * Mask in barrier bit for a write (could be either a WRITE or a |
| 2931 | * WRITE_SYNC | ||
| 2932 | */ | ||
| 2933 | if (buffer_ordered(bh) && (rw & WRITE)) | ||
| 2934 | rw |= WRITE_BARRIER; | ||
| 2886 | 2935 | ||
| 2887 | /* | 2936 | /* |
| 2888 | * Only clear out a write error when rewriting, should this | 2937 | * Only clear out a write error when rewriting |
| 2889 | * include WRITE_SYNC as well? | ||
| 2890 | */ | 2938 | */ |
| 2891 | if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER)) | 2939 | if (test_set_buffer_req(bh) && (rw & WRITE)) |
| 2892 | clear_buffer_write_io_error(bh); | 2940 | clear_buffer_write_io_error(bh); |
| 2893 | 2941 | ||
| 2894 | /* | 2942 | /* |
| @@ -2955,7 +3003,7 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) | |||
| 2955 | 3003 | ||
| 2956 | if (rw == SWRITE || rw == SWRITE_SYNC) | 3004 | if (rw == SWRITE || rw == SWRITE_SYNC) |
| 2957 | lock_buffer(bh); | 3005 | lock_buffer(bh); |
| 2958 | else if (test_set_buffer_locked(bh)) | 3006 | else if (!trylock_buffer(bh)) |
| 2959 | continue; | 3007 | continue; |
| 2960 | 3008 | ||
| 2961 | if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) { | 3009 | if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) { |
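Both __block_write_full_page() and ll_rw_block() flip from test_set_buffer_locked() to !trylock_buffer() in this file. A sketch of the semantic difference behind the added negation (the helper below is illustrative only): test_set_buffer_locked() returned the old lock bit, i.e. non-zero when someone else already held the lock, while trylock_buffer() returns non-zero when the caller has just acquired it.

	static int lock_bh_for_io(struct buffer_head *bh, int may_block)
	{
		if (may_block) {
			lock_buffer(bh);	/* sleep until we own it */
			return 1;
		}
		return trylock_buffer(bh);	/* 1 = got it, 0 = skip this buffer */
	}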
| @@ -3272,7 +3320,7 @@ int bh_submit_read(struct buffer_head *bh) | |||
| 3272 | EXPORT_SYMBOL(bh_submit_read); | 3320 | EXPORT_SYMBOL(bh_submit_read); |
| 3273 | 3321 | ||
| 3274 | static void | 3322 | static void |
| 3275 | init_buffer_head(struct kmem_cache *cachep, void *data) | 3323 | init_buffer_head(void *data) |
| 3276 | { | 3324 | { |
| 3277 | struct buffer_head *bh = data; | 3325 | struct buffer_head *bh = data; |
| 3278 | 3326 | ||
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 1f3465201fdf..06e521a945c3 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES | |||
| @@ -1,3 +1,19 @@ | |||
| 1 | Version 1.54 | ||
| 2 | ------------ | ||
| 3 | Fix premature write failure on congested networks (we would give up | ||
| 4 | on EAGAIN from the socket too quickly on large writes). | ||
| 5 | Cifs_mkdir and cifs_create now respect the setgid bit on parent dir. | ||
| 6 | Fix endian problems in acl (mode from/to cifs acl) on bigendian | ||
| 7 | architectures. Fix problems with preserving timestamps on copying open | ||
| 8 | files (e.g. "cp -a") to Windows servers. For mkdir and create honor setgid bit | ||
| 9 | on parent directory when server supports Unix Extensions but not POSIX | ||
| 10 | create. Update cifs.upcall version to handle new Kerberos sec flags | ||
| 11 | (this requires update of cifs.upcall program from Samba). Fix memory leak | ||
| 12 | on dns_upcall (resolving DFS referrals). Fix plain text password | ||
| 13 | authentication (requires setting SecurityFlags to 0x30030 to enable | ||
| 14 | lanman and plain text though). Fix writes to be at correct offset when | ||
| 15 | file is open with O_APPEND and file is on a directio (forcedirectio) mount. | ||
| 16 | |||

| 1 | Version 1.53 | 17 | Version 1.53 |
| 2 | ------------ | 18 | ------------ |
| 3 | DFS support added (Microsoft Distributed File System client support needed | 19 | DFS support added (Microsoft Distributed File System client support needed |
diff --git a/fs/cifs/README b/fs/cifs/README index 2bd6fe556f88..bd2343d4c6a6 100644 --- a/fs/cifs/README +++ b/fs/cifs/README | |||
| @@ -542,10 +542,20 @@ SecurityFlags Flags which control security negotiation and | |||
| 542 | hashing mechanisms (as "must use") on the other hand | 542 | hashing mechanisms (as "must use") on the other hand |
| 543 | does not make much sense. Default flags are | 543 | does not make much sense. Default flags are |
| 544 | 0x07007 | 544 | 0x07007 |
| 545 | (NTLM, NTLMv2 and packet signing allowed). Maximum | 545 | (NTLM, NTLMv2 and packet signing allowed). The maximum |
| 546 | allowable flags if you want to allow mounts to servers | 546 | allowable flags if you want to allow mounts to servers |
| 547 | using weaker password hashes is 0x37037 (lanman, | 547 | using weaker password hashes is 0x37037 (lanman, |
| 548 | plaintext, ntlm, ntlmv2, signing allowed): | 548 | plaintext, ntlm, ntlmv2, signing allowed). Some |
| 549 | SecurityFlags require the corresponding menuconfig | ||
| 550 | options to be enabled (lanman and plaintext require | ||
| 551 | CONFIG_CIFS_WEAK_PW_HASH for example). Enabling | ||
| 552 | plaintext authentication currently requires also | ||
| 553 | enabling lanman authentication in the security flags | ||
| 554 | because the cifs module only supports sending | ||
| 555 | plaintext passwords using the older lanman dialect | ||
| 556 | form of the session setup SMB. (e.g. for authentication | ||
| 557 | using plain text passwords, set the SecurityFlags | ||
| 558 | to 0x30030): | ||
| 549 | 559 | ||
| 550 | may use packet signing 0x00001 | 560 | may use packet signing 0x00001 |
| 551 | must use packet signing 0x01001 | 561 | must use packet signing 0x01001 |
| @@ -642,8 +652,30 @@ The statistics for the number of total SMBs and oplock breaks are different in | |||
| 642 | that they represent all for that share, not just those for which the server | 652 | that they represent all for that share, not just those for which the server |
| 643 | returned success. | 653 | returned success. |
| 644 | 654 | ||
| 645 | Also note that "cat /proc/fs/cifs/DebugData" will display information about | 655 | Also note that "cat /proc/fs/cifs/DebugData" will display information about |
| 646 | the active sessions and the shares that are mounted. | 656 | the active sessions and the shares that are mounted. |
| 647 | Enabling Kerberos (extended security) works when CONFIG_CIFS_EXPERIMENTAL is | 657 | |
| 648 | on but requires a user space helper (from the Samba project). NTLM and NTLMv2 and | 658 | Enabling Kerberos (extended security) works but requires version 1.2 or later |
| 649 | LANMAN support do not require this helper. | 659 | of the helper program cifs.upcall to be present and to be configured in the |
| 660 | /etc/request-key.conf file. The cifs.upcall helper program is from the Samba | ||
| 661 | project(http://www.samba.org). NTLM and NTLMv2 and LANMAN support do not | ||
| 662 | require this helper. Note that NTLMv2 security (which does not require the | ||
| 663 | cifs.upcall helper program), instead of using Kerberos, is sufficient for | ||
| 664 | some use cases. | ||
| 665 | |||
| 666 | Enabling DFS support (used to access shares transparently in an MS-DFS | ||
| 667 | global name space) requires that CONFIG_CIFS_EXPERIMENTAL be enabled. In | ||
| 668 | addition, DFS support for target shares which are specified as UNC | ||
| 669 | names which begin with host names (rather than IP addresses) requires | ||
| 670 | a user space helper (such as cifs.upcall) to be present in order to | ||
| 671 | translate host names to IP addresses, and the user space helper must also | ||
| 672 | be configured in the file /etc/request-key.conf | ||
| 673 | |||
| 674 | To use cifs Kerberos and DFS support, the Linux keyutils package should be | ||
| 675 | installed and something like the following lines should be added to the | ||
| 676 | /etc/request-key.conf file: | ||
| 677 | |||
| 678 | create cifs.spnego * * /usr/local/sbin/cifs.upcall %k | ||
| 679 | create dns_resolver * * /usr/local/sbin/cifs.upcall %k | ||
| 680 | |||
| 681 | |||
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c index f58e41d3ba48..1b09f1670061 100644 --- a/fs/cifs/asn1.c +++ b/fs/cifs/asn1.c | |||
| @@ -400,7 +400,7 @@ asn1_oid_decode(struct asn1_ctx *ctx, | |||
| 400 | size = eoc - ctx->pointer + 1; | 400 | size = eoc - ctx->pointer + 1; |
| 401 | 401 | ||
| 402 | /* first subid actually encodes first two subids */ | 402 | /* first subid actually encodes first two subids */ |
| 403 | if (size < 2 || size > ULONG_MAX/sizeof(unsigned long)) | 403 | if (size < 2 || size > UINT_MAX/sizeof(unsigned long)) |
| 404 | return 0; | 404 | return 0; |
| 405 | 405 | ||
| 406 | *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); | 406 | *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); |
| @@ -476,6 +476,7 @@ decode_negTokenInit(unsigned char *security_blob, int length, | |||
| 476 | unsigned int cls, con, tag, oidlen, rc; | 476 | unsigned int cls, con, tag, oidlen, rc; |
| 477 | bool use_ntlmssp = false; | 477 | bool use_ntlmssp = false; |
| 478 | bool use_kerberos = false; | 478 | bool use_kerberos = false; |
| 479 | bool use_mskerberos = false; | ||
| 479 | 480 | ||
| 480 | *secType = NTLM; /* BB eventually make Kerberos or NLTMSSP the default*/ | 481 | *secType = NTLM; /* BB eventually make Kerberos or NLTMSSP the default*/ |
| 481 | 482 | ||
| @@ -483,6 +484,7 @@ decode_negTokenInit(unsigned char *security_blob, int length, | |||
| 483 | 484 | ||
| 484 | asn1_open(&ctx, security_blob, length); | 485 | asn1_open(&ctx, security_blob, length); |
| 485 | 486 | ||
| 487 | /* GSSAPI header */ | ||
| 486 | if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { | 488 | if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { |
| 487 | cFYI(1, ("Error decoding negTokenInit header")); | 489 | cFYI(1, ("Error decoding negTokenInit header")); |
| 488 | return 0; | 490 | return 0; |
| @@ -490,156 +492,149 @@ decode_negTokenInit(unsigned char *security_blob, int length, | |||
| 490 | || (tag != ASN1_EOC)) { | 492 | || (tag != ASN1_EOC)) { |
| 491 | cFYI(1, ("cls = %d con = %d tag = %d", cls, con, tag)); | 493 | cFYI(1, ("cls = %d con = %d tag = %d", cls, con, tag)); |
| 492 | return 0; | 494 | return 0; |
| 493 | } else { | 495 | } |
| 494 | /* remember to free obj->oid */ | ||
| 495 | rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag); | ||
| 496 | if (rc) { | ||
| 497 | if ((tag == ASN1_OJI) && (cls == ASN1_PRI)) { | ||
| 498 | rc = asn1_oid_decode(&ctx, end, &oid, &oidlen); | ||
| 499 | if (rc) { | ||
| 500 | rc = compare_oid(oid, oidlen, | ||
| 501 | SPNEGO_OID, | ||
| 502 | SPNEGO_OID_LEN); | ||
| 503 | kfree(oid); | ||
| 504 | } | ||
| 505 | } else | ||
| 506 | rc = 0; | ||
| 507 | } | ||
| 508 | 496 | ||
| 509 | if (!rc) { | 497 | /* Check for SPNEGO OID -- remember to free obj->oid */ |
| 510 | cFYI(1, ("Error decoding negTokenInit header")); | 498 | rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag); |
| 511 | return 0; | 499 | if (rc) { |
| 512 | } | 500 | if ((tag == ASN1_OJI) && (con == ASN1_PRI) && |
| 501 | (cls == ASN1_UNI)) { | ||
| 502 | rc = asn1_oid_decode(&ctx, end, &oid, &oidlen); | ||
| 503 | if (rc) { | ||
| 504 | rc = compare_oid(oid, oidlen, SPNEGO_OID, | ||
| 505 | SPNEGO_OID_LEN); | ||
| 506 | kfree(oid); | ||
| 507 | } | ||
| 508 | } else | ||
| 509 | rc = 0; | ||
| 510 | } | ||
| 513 | 511 | ||
| 514 | if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { | 512 | /* SPNEGO OID not present or garbled -- bail out */ |
| 515 | cFYI(1, ("Error decoding negTokenInit")); | 513 | if (!rc) { |
| 516 | return 0; | 514 | cFYI(1, ("Error decoding negTokenInit header")); |
| 517 | } else if ((cls != ASN1_CTX) || (con != ASN1_CON) | 515 | return 0; |
| 518 | || (tag != ASN1_EOC)) { | 516 | } |
| 519 | cFYI(1, | ||
| 520 | ("cls = %d con = %d tag = %d end = %p (%d) exit 0", | ||
| 521 | cls, con, tag, end, *end)); | ||
| 522 | return 0; | ||
| 523 | } | ||
| 524 | 517 | ||
| 525 | if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { | 518 | if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { |
| 526 | cFYI(1, ("Error decoding negTokenInit")); | 519 | cFYI(1, ("Error decoding negTokenInit")); |
| 527 | return 0; | 520 | return 0; |
| 528 | } else if ((cls != ASN1_UNI) || (con != ASN1_CON) | 521 | } else if ((cls != ASN1_CTX) || (con != ASN1_CON) |
| 529 | || (tag != ASN1_SEQ)) { | 522 | || (tag != ASN1_EOC)) { |
| 530 | cFYI(1, | 523 | cFYI(1, |
| 531 | ("cls = %d con = %d tag = %d end = %p (%d) exit 1", | 524 | ("cls = %d con = %d tag = %d end = %p (%d) exit 0", |
| 532 | cls, con, tag, end, *end)); | 525 | cls, con, tag, end, *end)); |
| 533 | return 0; | 526 | return 0; |
| 534 | } | 527 | } |
| 535 | 528 | ||
| 536 | if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { | 529 | if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { |
| 537 | cFYI(1, ("Error decoding 2nd part of negTokenInit")); | 530 | cFYI(1, ("Error decoding negTokenInit")); |
| 538 | return 0; | 531 | return 0; |
| 539 | } else if ((cls != ASN1_CTX) || (con != ASN1_CON) | 532 | } else if ((cls != ASN1_UNI) || (con != ASN1_CON) |
| 540 | || (tag != ASN1_EOC)) { | 533 | || (tag != ASN1_SEQ)) { |
| 541 | cFYI(1, | 534 | cFYI(1, |
| 542 | ("cls = %d con = %d tag = %d end = %p (%d) exit 0", | 535 | ("cls = %d con = %d tag = %d end = %p (%d) exit 1", |
| 543 | cls, con, tag, end, *end)); | 536 | cls, con, tag, end, *end)); |
| 544 | return 0; | 537 | return 0; |
| 545 | } | 538 | } |
| 546 | 539 | ||
| 547 | if (asn1_header_decode | 540 | if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { |
| 548 | (&ctx, &sequence_end, &cls, &con, &tag) == 0) { | 541 | cFYI(1, ("Error decoding 2nd part of negTokenInit")); |
| 549 | cFYI(1, ("Error decoding 2nd part of negTokenInit")); | 542 | return 0; |
| 550 | return 0; | 543 | } else if ((cls != ASN1_CTX) || (con != ASN1_CON) |
| 551 | } else if ((cls != ASN1_UNI) || (con != ASN1_CON) | 544 | || (tag != ASN1_EOC)) { |
| 552 | || (tag != ASN1_SEQ)) { | 545 | cFYI(1, |
| 553 | cFYI(1, | 546 | ("cls = %d con = %d tag = %d end = %p (%d) exit 0", |
| 554 | ("cls = %d con = %d tag = %d end = %p (%d) exit 1", | 547 | cls, con, tag, end, *end)); |
| 555 | cls, con, tag, end, *end)); | 548 | return 0; |
| 556 | return 0; | 549 | } |
| 557 | } | ||
| 558 | 550 | ||
| 559 | while (!asn1_eoc_decode(&ctx, sequence_end)) { | 551 | if (asn1_header_decode |
| 560 | rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag); | 552 | (&ctx, &sequence_end, &cls, &con, &tag) == 0) { |
| 561 | if (!rc) { | 553 | cFYI(1, ("Error decoding 2nd part of negTokenInit")); |
| 562 | cFYI(1, | 554 | return 0; |
| 563 | ("Error decoding negTokenInit hdr exit2")); | 555 | } else if ((cls != ASN1_UNI) || (con != ASN1_CON) |
| 564 | return 0; | 556 | || (tag != ASN1_SEQ)) { |
| 565 | } | 557 | cFYI(1, |
| 566 | if ((tag == ASN1_OJI) && (con == ASN1_PRI)) { | 558 | ("cls = %d con = %d tag = %d end = %p (%d) exit 1", |
| 567 | if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) { | 559 | cls, con, tag, end, *end)); |
| 568 | 560 | return 0; | |
| 569 | cFYI(1, | 561 | } |
| 570 | ("OID len = %d oid = 0x%lx 0x%lx " | ||
| 571 | "0x%lx 0x%lx", | ||
| 572 | oidlen, *oid, *(oid + 1), | ||
| 573 | *(oid + 2), *(oid + 3))); | ||
| 574 | |||
| 575 | if (compare_oid(oid, oidlen, | ||
| 576 | MSKRB5_OID, | ||
| 577 | MSKRB5_OID_LEN)) | ||
| 578 | use_kerberos = true; | ||
| 579 | else if (compare_oid(oid, oidlen, | ||
| 580 | KRB5_OID, | ||
| 581 | KRB5_OID_LEN)) | ||
| 582 | use_kerberos = true; | ||
| 583 | else if (compare_oid(oid, oidlen, | ||
| 584 | NTLMSSP_OID, | ||
| 585 | NTLMSSP_OID_LEN)) | ||
| 586 | use_ntlmssp = true; | ||
| 587 | |||
| 588 | kfree(oid); | ||
| 589 | } | ||
| 590 | } else { | ||
| 591 | cFYI(1, ("Should be an oid what is going on?")); | ||
| 592 | } | ||
| 593 | } | ||
| 594 | 562 | ||
| 595 | if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { | 563 | while (!asn1_eoc_decode(&ctx, sequence_end)) { |
| 596 | cFYI(1, | 564 | rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag); |
| 597 | ("Error decoding last part negTokenInit exit3")); | 565 | if (!rc) { |
| 598 | return 0; | ||
| 599 | } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { | ||
| 600 | /* tag = 3 indicating mechListMIC */ | ||
| 601 | cFYI(1, | 566 | cFYI(1, |
| 602 | ("Exit 4 cls = %d con = %d tag = %d end = %p (%d)", | 567 | ("Error decoding negTokenInit hdr exit2")); |
| 603 | cls, con, tag, end, *end)); | ||
| 604 | return 0; | 568 | return 0; |
| 605 | } | 569 | } |
| 606 | if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { | 570 | if ((tag == ASN1_OJI) && (con == ASN1_PRI)) { |
| 607 | cFYI(1, | 571 | if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) { |
| 608 | ("Error decoding last part negTokenInit exit5")); | 572 | |
| 609 | return 0; | 573 | cFYI(1, ("OID len = %d oid = 0x%lx 0x%lx " |
| 610 | } else if ((cls != ASN1_UNI) || (con != ASN1_CON) | 574 | "0x%lx 0x%lx", oidlen, *oid, |
| 611 | || (tag != ASN1_SEQ)) { | 575 | *(oid + 1), *(oid + 2), *(oid + 3))); |
| 612 | cFYI(1, ("cls = %d con = %d tag = %d end = %p (%d)", | 576 | |
| 613 | cls, con, tag, end, *end)); | 577 | if (compare_oid(oid, oidlen, MSKRB5_OID, |
| 578 | MSKRB5_OID_LEN) && | ||
| 579 | !use_kerberos) | ||
| 580 | use_mskerberos = true; | ||
| 581 | else if (compare_oid(oid, oidlen, KRB5_OID, | ||
| 582 | KRB5_OID_LEN) && | ||
| 583 | !use_mskerberos) | ||
| 584 | use_kerberos = true; | ||
| 585 | else if (compare_oid(oid, oidlen, NTLMSSP_OID, | ||
| 586 | NTLMSSP_OID_LEN)) | ||
| 587 | use_ntlmssp = true; | ||
| 588 | |||
| 589 | kfree(oid); | ||
| 590 | } | ||
| 591 | } else { | ||
| 592 | cFYI(1, ("Should be an oid what is going on?")); | ||
| 614 | } | 593 | } |
| 594 | } | ||
| 615 | 595 | ||
| 616 | if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { | 596 | if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { |
| 617 | cFYI(1, | 597 | cFYI(1, ("Error decoding last part negTokenInit exit3")); |
| 618 | ("Error decoding last part negTokenInit exit 7")); | 598 | return 0; |
| 619 | return 0; | 599 | } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { |
| 620 | } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { | 600 | /* tag = 3 indicating mechListMIC */ |
| 621 | cFYI(1, | 601 | cFYI(1, ("Exit 4 cls = %d con = %d tag = %d end = %p (%d)", |
| 622 | ("Exit 8 cls = %d con = %d tag = %d end = %p (%d)", | 602 | cls, con, tag, end, *end)); |
| 623 | cls, con, tag, end, *end)); | 603 | return 0; |
| 624 | return 0; | 604 | } |
| 625 | } | 605 | if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { |
| 626 | if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { | 606 | cFYI(1, ("Error decoding last part negTokenInit exit5")); |
| 627 | cFYI(1, | 607 | return 0; |
| 628 | ("Error decoding last part negTokenInit exit9")); | 608 | } else if ((cls != ASN1_UNI) || (con != ASN1_CON) |
| 629 | return 0; | 609 | || (tag != ASN1_SEQ)) { |
| 630 | } else if ((cls != ASN1_UNI) || (con != ASN1_PRI) | 610 | cFYI(1, ("cls = %d con = %d tag = %d end = %p (%d)", |
| 631 | || (tag != ASN1_GENSTR)) { | 611 | cls, con, tag, end, *end)); |
| 632 | cFYI(1, | 612 | } |
| 633 | ("Exit10 cls = %d con = %d tag = %d end = %p (%d)", | 613 | |
| 634 | cls, con, tag, end, *end)); | 614 | if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { |
| 635 | return 0; | 615 | cFYI(1, ("Error decoding last part negTokenInit exit 7")); |
| 636 | } | 616 | return 0; |
| 637 | cFYI(1, ("Need to call asn1_octets_decode() function for %s", | 617 | } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { |
| 638 | ctx.pointer)); /* is this UTF-8 or ASCII? */ | 618 | cFYI(1, ("Exit 8 cls = %d con = %d tag = %d end = %p (%d)", |
| 619 | cls, con, tag, end, *end)); | ||
| 620 | return 0; | ||
| 621 | } | ||
| 622 | if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { | ||
| 623 | cFYI(1, ("Error decoding last part negTokenInit exit9")); | ||
| 624 | return 0; | ||
| 625 | } else if ((cls != ASN1_UNI) || (con != ASN1_PRI) | ||
| 626 | || (tag != ASN1_GENSTR)) { | ||
| 627 | cFYI(1, ("Exit10 cls = %d con = %d tag = %d end = %p (%d)", | ||
| 628 | cls, con, tag, end, *end)); | ||
| 629 | return 0; | ||
| 639 | } | 630 | } |
| 631 | cFYI(1, ("Need to call asn1_octets_decode() function for %s", | ||
| 632 | ctx.pointer)); /* is this UTF-8 or ASCII? */ | ||
| 640 | 633 | ||
| 641 | if (use_kerberos) | 634 | if (use_kerberos) |
| 642 | *secType = Kerberos; | 635 | *secType = Kerberos; |
| 636 | else if (use_mskerberos) | ||
| 637 | *secType = MSKerberos; | ||
| 643 | else if (use_ntlmssp) | 638 | else if (use_ntlmssp) |
| 644 | *secType = NTLMSSP; | 639 | *secType = NTLMSSP; |
| 645 | 640 | ||
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index cc950f69e51e..69a12aae91d3 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c | |||
| @@ -79,27 +79,25 @@ void cifs_dump_mids(struct TCP_Server_Info *server) | |||
| 79 | spin_lock(&GlobalMid_Lock); | 79 | spin_lock(&GlobalMid_Lock); |
| 80 | list_for_each(tmp, &server->pending_mid_q) { | 80 | list_for_each(tmp, &server->pending_mid_q) { |
| 81 | mid_entry = list_entry(tmp, struct mid_q_entry, qhead); | 81 | mid_entry = list_entry(tmp, struct mid_q_entry, qhead); |
| 82 | if (mid_entry) { | 82 | cERROR(1, ("State: %d Cmd: %d Pid: %d Tsk: %p Mid %d", |
| 83 | cERROR(1, ("State: %d Cmd: %d Pid: %d Tsk: %p Mid %d", | 83 | mid_entry->midState, |
| 84 | mid_entry->midState, | 84 | (int)mid_entry->command, |
| 85 | (int)mid_entry->command, | 85 | mid_entry->pid, |
| 86 | mid_entry->pid, | 86 | mid_entry->tsk, |
| 87 | mid_entry->tsk, | 87 | mid_entry->mid)); |
| 88 | mid_entry->mid)); | ||
| 89 | #ifdef CONFIG_CIFS_STATS2 | 88 | #ifdef CONFIG_CIFS_STATS2 |
| 90 | cERROR(1, ("IsLarge: %d buf: %p time rcv: %ld now: %ld", | 89 | cERROR(1, ("IsLarge: %d buf: %p time rcv: %ld now: %ld", |
| 91 | mid_entry->largeBuf, | 90 | mid_entry->largeBuf, |
| 92 | mid_entry->resp_buf, | 91 | mid_entry->resp_buf, |
| 93 | mid_entry->when_received, | 92 | mid_entry->when_received, |
| 94 | jiffies)); | 93 | jiffies)); |
| 95 | #endif /* STATS2 */ | 94 | #endif /* STATS2 */ |
| 96 | cERROR(1, ("IsMult: %d IsEnd: %d", mid_entry->multiRsp, | 95 | cERROR(1, ("IsMult: %d IsEnd: %d", mid_entry->multiRsp, |
| 97 | mid_entry->multiEnd)); | 96 | mid_entry->multiEnd)); |
| 98 | if (mid_entry->resp_buf) { | 97 | if (mid_entry->resp_buf) { |
| 99 | cifs_dump_detail(mid_entry->resp_buf); | 98 | cifs_dump_detail(mid_entry->resp_buf); |
| 100 | cifs_dump_mem("existing buf: ", | 99 | cifs_dump_mem("existing buf: ", |
| 101 | mid_entry->resp_buf, 62); | 100 | mid_entry->resp_buf, 62); |
| 102 | } | ||
| 103 | } | 101 | } |
| 104 | } | 102 | } |
| 105 | spin_unlock(&GlobalMid_Lock); | 103 | spin_unlock(&GlobalMid_Lock); |
| @@ -107,9 +105,7 @@ void cifs_dump_mids(struct TCP_Server_Info *server) | |||
| 107 | #endif /* CONFIG_CIFS_DEBUG2 */ | 105 | #endif /* CONFIG_CIFS_DEBUG2 */ |
| 108 | 106 | ||
| 109 | #ifdef CONFIG_PROC_FS | 107 | #ifdef CONFIG_PROC_FS |
| 110 | static int | 108 | static int cifs_debug_data_proc_show(struct seq_file *m, void *v) |
| 111 | cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset, | ||
| 112 | int count, int *eof, void *data) | ||
| 113 | { | 109 | { |
| 114 | struct list_head *tmp; | 110 | struct list_head *tmp; |
| 115 | struct list_head *tmp1; | 111 | struct list_head *tmp1; |
| @@ -117,23 +113,13 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset, | |||
| 117 | struct cifsSesInfo *ses; | 113 | struct cifsSesInfo *ses; |
| 118 | struct cifsTconInfo *tcon; | 114 | struct cifsTconInfo *tcon; |
| 119 | int i; | 115 | int i; |
| 120 | int length = 0; | ||
| 121 | char *original_buf = buf; | ||
| 122 | |||
| 123 | *beginBuffer = buf + offset; | ||
| 124 | 116 | ||
| 125 | length = | 117 | seq_puts(m, |
| 126 | sprintf(buf, | ||
| 127 | "Display Internal CIFS Data Structures for Debugging\n" | 118 | "Display Internal CIFS Data Structures for Debugging\n" |
| 128 | "---------------------------------------------------\n"); | 119 | "---------------------------------------------------\n"); |
| 129 | buf += length; | 120 | seq_printf(m, "CIFS Version %s\n", CIFS_VERSION); |
| 130 | length = sprintf(buf, "CIFS Version %s\n", CIFS_VERSION); | 121 | seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid); |
| 131 | buf += length; | 122 | seq_printf(m, "Servers:"); |
| 132 | length = sprintf(buf, | ||
| 133 | "Active VFS Requests: %d\n", GlobalTotalActiveXid); | ||
| 134 | buf += length; | ||
| 135 | length = sprintf(buf, "Servers:"); | ||
| 136 | buf += length; | ||
| 137 | 123 | ||
| 138 | i = 0; | 124 | i = 0; |
| 139 | read_lock(&GlobalSMBSeslock); | 125 | read_lock(&GlobalSMBSeslock); |
| @@ -142,11 +128,10 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset, | |||
| 142 | ses = list_entry(tmp, struct cifsSesInfo, cifsSessionList); | 128 | ses = list_entry(tmp, struct cifsSesInfo, cifsSessionList); |
| 143 | if ((ses->serverDomain == NULL) || (ses->serverOS == NULL) || | 129 | if ((ses->serverDomain == NULL) || (ses->serverOS == NULL) || |
| 144 | (ses->serverNOS == NULL)) { | 130 | (ses->serverNOS == NULL)) { |
| 145 | buf += sprintf(buf, "\nentry for %s not fully " | 131 | seq_printf(m, "\nentry for %s not fully " |
| 146 | "displayed\n\t", ses->serverName); | 132 | "displayed\n\t", ses->serverName); |
| 147 | } else { | 133 | } else { |
| 148 | length = | 134 | seq_printf(m, |
| 149 | sprintf(buf, | ||
| 150 | "\n%d) Name: %s Domain: %s Mounts: %d OS:" | 135 | "\n%d) Name: %s Domain: %s Mounts: %d OS:" |
| 151 | " %s \n\tNOS: %s\tCapability: 0x%x\n\tSMB" | 136 | " %s \n\tNOS: %s\tCapability: 0x%x\n\tSMB" |
| 152 | " session status: %d\t", | 137 | " session status: %d\t", |
| @@ -154,10 +139,9 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset, | |||
| 154 | atomic_read(&ses->inUse), | 139 | atomic_read(&ses->inUse), |
| 155 | ses->serverOS, ses->serverNOS, | 140 | ses->serverOS, ses->serverNOS, |
| 156 | ses->capabilities, ses->status); | 141 | ses->capabilities, ses->status); |
| 157 | buf += length; | ||
| 158 | } | 142 | } |
| 159 | if (ses->server) { | 143 | if (ses->server) { |
| 160 | buf += sprintf(buf, "TCP status: %d\n\tLocal Users To " | 144 | seq_printf(m, "TCP status: %d\n\tLocal Users To " |
| 161 | "Server: %d SecMode: 0x%x Req On Wire: %d", | 145 | "Server: %d SecMode: 0x%x Req On Wire: %d", |
| 162 | ses->server->tcpStatus, | 146 | ses->server->tcpStatus, |
| 163 | atomic_read(&ses->server->socketUseCount), | 147 | atomic_read(&ses->server->socketUseCount), |
| @@ -165,41 +149,34 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset, | |||
| 165 | atomic_read(&ses->server->inFlight)); | 149 | atomic_read(&ses->server->inFlight)); |
| 166 | 150 | ||
| 167 | #ifdef CONFIG_CIFS_STATS2 | 151 | #ifdef CONFIG_CIFS_STATS2 |
| 168 | buf += sprintf(buf, " In Send: %d In MaxReq Wait: %d", | 152 | seq_printf(m, " In Send: %d In MaxReq Wait: %d", |
| 169 | atomic_read(&ses->server->inSend), | 153 | atomic_read(&ses->server->inSend), |
| 170 | atomic_read(&ses->server->num_waiters)); | 154 | atomic_read(&ses->server->num_waiters)); |
| 171 | #endif | 155 | #endif |
| 172 | 156 | ||
| 173 | length = sprintf(buf, "\nMIDs:\n"); | 157 | seq_puts(m, "\nMIDs:\n"); |
| 174 | buf += length; | ||
| 175 | 158 | ||
| 176 | spin_lock(&GlobalMid_Lock); | 159 | spin_lock(&GlobalMid_Lock); |
| 177 | list_for_each(tmp1, &ses->server->pending_mid_q) { | 160 | list_for_each(tmp1, &ses->server->pending_mid_q) { |
| 178 | mid_entry = list_entry(tmp1, struct | 161 | mid_entry = list_entry(tmp1, struct |
| 179 | mid_q_entry, | 162 | mid_q_entry, |
| 180 | qhead); | 163 | qhead); |
| 181 | if (mid_entry) { | 164 | seq_printf(m, "State: %d com: %d pid:" |
| 182 | length = sprintf(buf, | 165 | " %d tsk: %p mid %d\n", |
| 183 | "State: %d com: %d pid:" | 166 | mid_entry->midState, |
| 184 | " %d tsk: %p mid %d\n", | 167 | (int)mid_entry->command, |
| 185 | mid_entry->midState, | 168 | mid_entry->pid, |
| 186 | (int)mid_entry->command, | 169 | mid_entry->tsk, |
| 187 | mid_entry->pid, | 170 | mid_entry->mid); |
| 188 | mid_entry->tsk, | ||
| 189 | mid_entry->mid); | ||
| 190 | buf += length; | ||
| 191 | } | ||
| 192 | } | 171 | } |
| 193 | spin_unlock(&GlobalMid_Lock); | 172 | spin_unlock(&GlobalMid_Lock); |
| 194 | } | 173 | } |
| 195 | 174 | ||
| 196 | } | 175 | } |
| 197 | read_unlock(&GlobalSMBSeslock); | 176 | read_unlock(&GlobalSMBSeslock); |
| 198 | sprintf(buf, "\n"); | 177 | seq_putc(m, '\n'); |
| 199 | buf++; | ||
| 200 | 178 | ||
| 201 | length = sprintf(buf, "Shares:"); | 179 | seq_puts(m, "Shares:"); |
| 202 | buf += length; | ||
| 203 | 180 | ||
| 204 | i = 0; | 181 | i = 0; |
| 205 | read_lock(&GlobalSMBSeslock); | 182 | read_lock(&GlobalSMBSeslock); |
| @@ -208,62 +185,52 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset, | |||
| 208 | i++; | 185 | i++; |
| 209 | tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList); | 186 | tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList); |
| 210 | dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType); | 187 | dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType); |
| 211 | length = sprintf(buf, "\n%d) %s Uses: %d ", i, | 188 | seq_printf(m, "\n%d) %s Uses: %d ", i, |
| 212 | tcon->treeName, atomic_read(&tcon->useCount)); | 189 | tcon->treeName, atomic_read(&tcon->useCount)); |
| 213 | buf += length; | ||
| 214 | if (tcon->nativeFileSystem) { | 190 | if (tcon->nativeFileSystem) { |
| 215 | length = sprintf(buf, "Type: %s ", | 191 | seq_printf(m, "Type: %s ", |
| 216 | tcon->nativeFileSystem); | 192 | tcon->nativeFileSystem); |
| 217 | buf += length; | ||
| 218 | } | 193 | } |
| 219 | length = sprintf(buf, "DevInfo: 0x%x Attributes: 0x%x" | 194 | seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x" |
| 220 | "\nPathComponentMax: %d Status: %d", | 195 | "\nPathComponentMax: %d Status: %d", |
| 221 | le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics), | 196 | le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics), |
| 222 | le32_to_cpu(tcon->fsAttrInfo.Attributes), | 197 | le32_to_cpu(tcon->fsAttrInfo.Attributes), |
| 223 | le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength), | 198 | le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength), |
| 224 | tcon->tidStatus); | 199 | tcon->tidStatus); |
| 225 | buf += length; | ||
| 226 | if (dev_type == FILE_DEVICE_DISK) | 200 | if (dev_type == FILE_DEVICE_DISK) |
| 227 | length = sprintf(buf, " type: DISK "); | 201 | seq_puts(m, " type: DISK "); |
| 228 | else if (dev_type == FILE_DEVICE_CD_ROM) | 202 | else if (dev_type == FILE_DEVICE_CD_ROM) |
| 229 | length = sprintf(buf, " type: CDROM "); | 203 | seq_puts(m, " type: CDROM "); |
| 230 | else | 204 | else |
| 231 | length = | 205 | seq_printf(m, " type: %d ", dev_type); |
| 232 | sprintf(buf, " type: %d ", dev_type); | 206 | |
| 233 | buf += length; | 207 | if (tcon->tidStatus == CifsNeedReconnect) |
| 234 | if (tcon->tidStatus == CifsNeedReconnect) { | 208 | seq_puts(m, "\tDISCONNECTED "); |
| 235 | buf += sprintf(buf, "\tDISCONNECTED "); | ||
| 236 | length += 14; | ||
| 237 | } | ||
| 238 | } | 209 | } |
| 239 | read_unlock(&GlobalSMBSeslock); | 210 | read_unlock(&GlobalSMBSeslock); |
| 240 | 211 | ||
| 241 | length = sprintf(buf, "\n"); | 212 | seq_putc(m, '\n'); |
| 242 | buf += length; | ||
| 243 | 213 | ||
| 244 | /* BB add code to dump additional info such as TCP session info now */ | 214 | /* BB add code to dump additional info such as TCP session info now */ |
| 245 | /* Now calculate total size of returned data */ | 215 | return 0; |
| 246 | length = buf - original_buf; | 216 | } |
| 247 | |||
| 248 | if (offset + count >= length) | ||
| 249 | *eof = 1; | ||
| 250 | if (length < offset) { | ||
| 251 | *eof = 1; | ||
| 252 | return 0; | ||
| 253 | } else { | ||
| 254 | length = length - offset; | ||
| 255 | } | ||
| 256 | if (length > count) | ||
| 257 | length = count; | ||
| 258 | 217 | ||
| 259 | return length; | 218 | static int cifs_debug_data_proc_open(struct inode *inode, struct file *file) |
| 219 | { | ||
| 220 | return single_open(file, cifs_debug_data_proc_show, NULL); | ||
| 260 | } | 221 | } |
| 261 | 222 | ||
| 262 | #ifdef CONFIG_CIFS_STATS | 223 | static const struct file_operations cifs_debug_data_proc_fops = { |
| 224 | .owner = THIS_MODULE, | ||
| 225 | .open = cifs_debug_data_proc_open, | ||
| 226 | .read = seq_read, | ||
| 227 | .llseek = seq_lseek, | ||
| 228 | .release = single_release, | ||
| 229 | }; | ||
| 263 | 230 | ||
| 264 | static int | 231 | #ifdef CONFIG_CIFS_STATS |
| 265 | cifs_stats_write(struct file *file, const char __user *buffer, | 232 | static ssize_t cifs_stats_proc_write(struct file *file, |
| 266 | unsigned long count, void *data) | 233 | const char __user *buffer, size_t count, loff_t *ppos) |
| 267 | { | 234 | { |
| 268 | char c; | 235 | char c; |
| 269 | int rc; | 236 | int rc; |
| @@ -307,236 +274,132 @@ cifs_stats_write(struct file *file, const char __user *buffer, | |||
| 307 | return count; | 274 | return count; |
| 308 | } | 275 | } |
| 309 | 276 | ||
| 310 | static int | 277 | static int cifs_stats_proc_show(struct seq_file *m, void *v) |
| 311 | cifs_stats_read(char *buf, char **beginBuffer, off_t offset, | ||
| 312 | int count, int *eof, void *data) | ||
| 313 | { | 278 | { |
| 314 | int item_length, i, length; | 279 | int i; |
| 315 | struct list_head *tmp; | 280 | struct list_head *tmp; |
| 316 | struct cifsTconInfo *tcon; | 281 | struct cifsTconInfo *tcon; |
| 317 | 282 | ||
| 318 | *beginBuffer = buf + offset; | 283 | seq_printf(m, |
| 319 | |||
| 320 | length = sprintf(buf, | ||
| 321 | "Resources in use\nCIFS Session: %d\n", | 284 | "Resources in use\nCIFS Session: %d\n", |
| 322 | sesInfoAllocCount.counter); | 285 | sesInfoAllocCount.counter); |
| 323 | buf += length; | 286 | seq_printf(m, "Share (unique mount targets): %d\n", |
| 324 | item_length = | ||
| 325 | sprintf(buf, "Share (unique mount targets): %d\n", | ||
| 326 | tconInfoAllocCount.counter); | 287 | tconInfoAllocCount.counter); |
| 327 | length += item_length; | 288 | seq_printf(m, "SMB Request/Response Buffer: %d Pool size: %d\n", |
| 328 | buf += item_length; | ||
| 329 | item_length = | ||
| 330 | sprintf(buf, "SMB Request/Response Buffer: %d Pool size: %d\n", | ||
| 331 | bufAllocCount.counter, | 289 | bufAllocCount.counter, |
| 332 | cifs_min_rcv + tcpSesAllocCount.counter); | 290 | cifs_min_rcv + tcpSesAllocCount.counter); |
| 333 | length += item_length; | 291 | seq_printf(m, "SMB Small Req/Resp Buffer: %d Pool size: %d\n", |
| 334 | buf += item_length; | ||
| 335 | item_length = | ||
| 336 | sprintf(buf, "SMB Small Req/Resp Buffer: %d Pool size: %d\n", | ||
| 337 | smBufAllocCount.counter, cifs_min_small); | 292 | smBufAllocCount.counter, cifs_min_small); |
| 338 | length += item_length; | ||
| 339 | buf += item_length; | ||
| 340 | #ifdef CONFIG_CIFS_STATS2 | 293 | #ifdef CONFIG_CIFS_STATS2 |
| 341 | item_length = sprintf(buf, "Total Large %d Small %d Allocations\n", | 294 | seq_printf(m, "Total Large %d Small %d Allocations\n", |
| 342 | atomic_read(&totBufAllocCount), | 295 | atomic_read(&totBufAllocCount), |
| 343 | atomic_read(&totSmBufAllocCount)); | 296 | atomic_read(&totSmBufAllocCount)); |
| 344 | length += item_length; | ||
| 345 | buf += item_length; | ||
| 346 | #endif /* CONFIG_CIFS_STATS2 */ | 297 | #endif /* CONFIG_CIFS_STATS2 */ |
| 347 | 298 | ||
| 348 | item_length = | 299 | seq_printf(m, "Operations (MIDs): %d\n", midCount.counter); |
| 349 | sprintf(buf, "Operations (MIDs): %d\n", | 300 | seq_printf(m, |
| 350 | midCount.counter); | ||
| 351 | length += item_length; | ||
| 352 | buf += item_length; | ||
| 353 | item_length = sprintf(buf, | ||
| 354 | "\n%d session %d share reconnects\n", | 301 | "\n%d session %d share reconnects\n", |
| 355 | tcpSesReconnectCount.counter, tconInfoReconnectCount.counter); | 302 | tcpSesReconnectCount.counter, tconInfoReconnectCount.counter); |
| 356 | length += item_length; | ||
| 357 | buf += item_length; | ||
| 358 | 303 | ||
| 359 | item_length = sprintf(buf, | 304 | seq_printf(m, |
| 360 | "Total vfs operations: %d maximum at one time: %d\n", | 305 | "Total vfs operations: %d maximum at one time: %d\n", |
| 361 | GlobalCurrentXid, GlobalMaxActiveXid); | 306 | GlobalCurrentXid, GlobalMaxActiveXid); |
| 362 | length += item_length; | ||
| 363 | buf += item_length; | ||
| 364 | 307 | ||
| 365 | i = 0; | 308 | i = 0; |
| 366 | read_lock(&GlobalSMBSeslock); | 309 | read_lock(&GlobalSMBSeslock); |
| 367 | list_for_each(tmp, &GlobalTreeConnectionList) { | 310 | list_for_each(tmp, &GlobalTreeConnectionList) { |
| 368 | i++; | 311 | i++; |
| 369 | tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList); | 312 | tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList); |
| 370 | item_length = sprintf(buf, "\n%d) %s", i, tcon->treeName); | 313 | seq_printf(m, "\n%d) %s", i, tcon->treeName); |
| 371 | buf += item_length; | 314 | if (tcon->tidStatus == CifsNeedReconnect) |
| 372 | length += item_length; | 315 | seq_puts(m, "\tDISCONNECTED "); |
| 373 | if (tcon->tidStatus == CifsNeedReconnect) { | 316 | seq_printf(m, "\nSMBs: %d Oplock Breaks: %d", |
| 374 | buf += sprintf(buf, "\tDISCONNECTED "); | ||
| 375 | length += 14; | ||
| 376 | } | ||
| 377 | item_length = sprintf(buf, "\nSMBs: %d Oplock Breaks: %d", | ||
| 378 | atomic_read(&tcon->num_smbs_sent), | 317 | atomic_read(&tcon->num_smbs_sent), |
| 379 | atomic_read(&tcon->num_oplock_brks)); | 318 | atomic_read(&tcon->num_oplock_brks)); |
| 380 | buf += item_length; | 319 | seq_printf(m, "\nReads: %d Bytes: %lld", |
| 381 | length += item_length; | ||
| 382 | item_length = sprintf(buf, "\nReads: %d Bytes: %lld", | ||
| 383 | atomic_read(&tcon->num_reads), | 320 | atomic_read(&tcon->num_reads), |
| 384 | (long long)(tcon->bytes_read)); | 321 | (long long)(tcon->bytes_read)); |
| 385 | buf += item_length; | 322 | seq_printf(m, "\nWrites: %d Bytes: %lld", |
| 386 | length += item_length; | ||
| 387 | item_length = sprintf(buf, "\nWrites: %d Bytes: %lld", | ||
| 388 | atomic_read(&tcon->num_writes), | 323 | atomic_read(&tcon->num_writes), |
| 389 | (long long)(tcon->bytes_written)); | 324 | (long long)(tcon->bytes_written)); |
| 390 | buf += item_length; | 325 | seq_printf(m, |
| 391 | length += item_length; | ||
| 392 | item_length = sprintf(buf, | ||
| 393 | "\nLocks: %d HardLinks: %d Symlinks: %d", | 326 | "\nLocks: %d HardLinks: %d Symlinks: %d", |
| 394 | atomic_read(&tcon->num_locks), | 327 | atomic_read(&tcon->num_locks), |
| 395 | atomic_read(&tcon->num_hardlinks), | 328 | atomic_read(&tcon->num_hardlinks), |
| 396 | atomic_read(&tcon->num_symlinks)); | 329 | atomic_read(&tcon->num_symlinks)); |
| 397 | buf += item_length; | ||
| 398 | length += item_length; | ||
| 399 | 330 | ||
| 400 | item_length = sprintf(buf, "\nOpens: %d Closes: %d Deletes: %d", | 331 | seq_printf(m, "\nOpens: %d Closes: %d Deletes: %d", |
| 401 | atomic_read(&tcon->num_opens), | 332 | atomic_read(&tcon->num_opens), |
| 402 | atomic_read(&tcon->num_closes), | 333 | atomic_read(&tcon->num_closes), |
| 403 | atomic_read(&tcon->num_deletes)); | 334 | atomic_read(&tcon->num_deletes)); |
| 404 | buf += item_length; | 335 | seq_printf(m, "\nMkdirs: %d Rmdirs: %d", |
| 405 | length += item_length; | ||
| 406 | item_length = sprintf(buf, "\nMkdirs: %d Rmdirs: %d", | ||
| 407 | atomic_read(&tcon->num_mkdirs), | 336 | atomic_read(&tcon->num_mkdirs), |
| 408 | atomic_read(&tcon->num_rmdirs)); | 337 | atomic_read(&tcon->num_rmdirs)); |
| 409 | buf += item_length; | 338 | seq_printf(m, "\nRenames: %d T2 Renames %d", |
| 410 | length += item_length; | ||
| 411 | item_length = sprintf(buf, "\nRenames: %d T2 Renames %d", | ||
| 412 | atomic_read(&tcon->num_renames), | 339 | atomic_read(&tcon->num_renames), |
| 413 | atomic_read(&tcon->num_t2renames)); | 340 | atomic_read(&tcon->num_t2renames)); |
| 414 | buf += item_length; | 341 | seq_printf(m, "\nFindFirst: %d FNext %d FClose %d", |
| 415 | length += item_length; | ||
| 416 | item_length = sprintf(buf, "\nFindFirst: %d FNext %d FClose %d", | ||
| 417 | atomic_read(&tcon->num_ffirst), | 342 | atomic_read(&tcon->num_ffirst), |
| 418 | atomic_read(&tcon->num_fnext), | 343 | atomic_read(&tcon->num_fnext), |
| 419 | atomic_read(&tcon->num_fclose)); | 344 | atomic_read(&tcon->num_fclose)); |
| 420 | buf += item_length; | ||
| 421 | length += item_length; | ||
| 422 | } | 345 | } |
| 423 | read_unlock(&GlobalSMBSeslock); | 346 | read_unlock(&GlobalSMBSeslock); |
| 424 | 347 | ||
| 425 | buf += sprintf(buf, "\n"); | 348 | seq_putc(m, '\n'); |
| 426 | length++; | 349 | return 0; |
| 427 | 350 | } | |
| 428 | if (offset + count >= length) | ||
| 429 | *eof = 1; | ||
| 430 | if (length < offset) { | ||
| 431 | *eof = 1; | ||
| 432 | return 0; | ||
| 433 | } else { | ||
| 434 | length = length - offset; | ||
| 435 | } | ||
| 436 | if (length > count) | ||
| 437 | length = count; | ||
| 438 | 351 | ||
| 439 | return length; | 352 | static int cifs_stats_proc_open(struct inode *inode, struct file *file) |
| 353 | { | ||
| 354 | return single_open(file, cifs_stats_proc_show, NULL); | ||
| 440 | } | 355 | } |
| 356 | |||
| 357 | static const struct file_operations cifs_stats_proc_fops = { | ||
| 358 | .owner = THIS_MODULE, | ||
| 359 | .open = cifs_stats_proc_open, | ||
| 360 | .read = seq_read, | ||
| 361 | .llseek = seq_lseek, | ||
| 362 | .release = single_release, | ||
| 363 | .write = cifs_stats_proc_write, | ||
| 364 | }; | ||
| 441 | #endif /* STATS */ | 365 | #endif /* STATS */ |
| 442 | 366 | ||
| 443 | static struct proc_dir_entry *proc_fs_cifs; | 367 | static struct proc_dir_entry *proc_fs_cifs; |
| 444 | read_proc_t cifs_txanchor_read; | 368 | static const struct file_operations cifsFYI_proc_fops; |
| 445 | static read_proc_t cifsFYI_read; | 369 | static const struct file_operations cifs_oplock_proc_fops; |
| 446 | static write_proc_t cifsFYI_write; | 370 | static const struct file_operations cifs_lookup_cache_proc_fops; |
| 447 | static read_proc_t oplockEnabled_read; | 371 | static const struct file_operations traceSMB_proc_fops; |
| 448 | static write_proc_t oplockEnabled_write; | 372 | static const struct file_operations cifs_multiuser_mount_proc_fops; |
| 449 | static read_proc_t lookupFlag_read; | 373 | static const struct file_operations cifs_security_flags_proc_fops; |
| 450 | static write_proc_t lookupFlag_write; | 374 | static const struct file_operations cifs_experimental_proc_fops; |
| 451 | static read_proc_t traceSMB_read; | 375 | static const struct file_operations cifs_linux_ext_proc_fops; |
| 452 | static write_proc_t traceSMB_write; | ||
| 453 | static read_proc_t multiuser_mount_read; | ||
| 454 | static write_proc_t multiuser_mount_write; | ||
| 455 | static read_proc_t security_flags_read; | ||
| 456 | static write_proc_t security_flags_write; | ||
| 457 | /* static read_proc_t ntlmv2_enabled_read; | ||
| 458 | static write_proc_t ntlmv2_enabled_write; | ||
| 459 | static read_proc_t packet_signing_enabled_read; | ||
| 460 | static write_proc_t packet_signing_enabled_write;*/ | ||
| 461 | static read_proc_t experimEnabled_read; | ||
| 462 | static write_proc_t experimEnabled_write; | ||
| 463 | static read_proc_t linuxExtensionsEnabled_read; | ||
| 464 | static write_proc_t linuxExtensionsEnabled_write; | ||
| 465 | 376 | ||
| 466 | void | 377 | void |
| 467 | cifs_proc_init(void) | 378 | cifs_proc_init(void) |
| 468 | { | 379 | { |
| 469 | struct proc_dir_entry *pde; | ||
| 470 | |||
| 471 | proc_fs_cifs = proc_mkdir("fs/cifs", NULL); | 380 | proc_fs_cifs = proc_mkdir("fs/cifs", NULL); |
| 472 | if (proc_fs_cifs == NULL) | 381 | if (proc_fs_cifs == NULL) |
| 473 | return; | 382 | return; |
| 474 | 383 | ||
| 475 | proc_fs_cifs->owner = THIS_MODULE; | 384 | proc_fs_cifs->owner = THIS_MODULE; |
| 476 | create_proc_read_entry("DebugData", 0, proc_fs_cifs, | 385 | proc_create("DebugData", 0, proc_fs_cifs, &cifs_debug_data_proc_fops); |
| 477 | cifs_debug_data_read, NULL); | ||
| 478 | 386 | ||
| 479 | #ifdef CONFIG_CIFS_STATS | 387 | #ifdef CONFIG_CIFS_STATS |
| 480 | pde = create_proc_read_entry("Stats", 0, proc_fs_cifs, | 388 | proc_create("Stats", 0, proc_fs_cifs, &cifs_stats_proc_fops); |
| 481 | cifs_stats_read, NULL); | ||
| 482 | if (pde) | ||
| 483 | pde->write_proc = cifs_stats_write; | ||
| 484 | #endif /* STATS */ | 389 | #endif /* STATS */ |
| 485 | pde = create_proc_read_entry("cifsFYI", 0, proc_fs_cifs, | 390 | proc_create("cifsFYI", 0, proc_fs_cifs, &cifsFYI_proc_fops); |
| 486 | cifsFYI_read, NULL); | 391 | proc_create("traceSMB", 0, proc_fs_cifs, &traceSMB_proc_fops); |
| 487 | if (pde) | 392 | proc_create("OplockEnabled", 0, proc_fs_cifs, &cifs_oplock_proc_fops); |
| 488 | pde->write_proc = cifsFYI_write; | 393 | proc_create("Experimental", 0, proc_fs_cifs, |
| 489 | 394 | &cifs_experimental_proc_fops); | |
| 490 | pde = | 395 | proc_create("LinuxExtensionsEnabled", 0, proc_fs_cifs, |
| 491 | create_proc_read_entry("traceSMB", 0, proc_fs_cifs, | 396 | &cifs_linux_ext_proc_fops); |
| 492 | traceSMB_read, NULL); | 397 | proc_create("MultiuserMount", 0, proc_fs_cifs, |
| 493 | if (pde) | 398 | &cifs_multiuser_mount_proc_fops); |
| 494 | pde->write_proc = traceSMB_write; | 399 | proc_create("SecurityFlags", 0, proc_fs_cifs, |
| 495 | 400 | &cifs_security_flags_proc_fops); | |
| 496 | pde = create_proc_read_entry("OplockEnabled", 0, proc_fs_cifs, | 401 | proc_create("LookupCacheEnabled", 0, proc_fs_cifs, |
| 497 | oplockEnabled_read, NULL); | 402 | &cifs_lookup_cache_proc_fops); |
| 498 | if (pde) | ||
| 499 | pde->write_proc = oplockEnabled_write; | ||
| 500 | |||
| 501 | pde = create_proc_read_entry("Experimental", 0, proc_fs_cifs, | ||
| 502 | experimEnabled_read, NULL); | ||
| 503 | if (pde) | ||
| 504 | pde->write_proc = experimEnabled_write; | ||
| 505 | |||
| 506 | pde = create_proc_read_entry("LinuxExtensionsEnabled", 0, proc_fs_cifs, | ||
| 507 | linuxExtensionsEnabled_read, NULL); | ||
| 508 | if (pde) | ||
| 509 | pde->write_proc = linuxExtensionsEnabled_write; | ||
| 510 | |||
| 511 | pde = | ||
| 512 | create_proc_read_entry("MultiuserMount", 0, proc_fs_cifs, | ||
| 513 | multiuser_mount_read, NULL); | ||
| 514 | if (pde) | ||
| 515 | pde->write_proc = multiuser_mount_write; | ||
| 516 | |||
| 517 | pde = | ||
| 518 | create_proc_read_entry("SecurityFlags", 0, proc_fs_cifs, | ||
| 519 | security_flags_read, NULL); | ||
| 520 | if (pde) | ||
| 521 | pde->write_proc = security_flags_write; | ||
| 522 | |||
| 523 | pde = | ||
| 524 | create_proc_read_entry("LookupCacheEnabled", 0, proc_fs_cifs, | ||
| 525 | lookupFlag_read, NULL); | ||
| 526 | if (pde) | ||
| 527 | pde->write_proc = lookupFlag_write; | ||
| 528 | |||
| 529 | /* pde = | ||
| 530 | create_proc_read_entry("NTLMV2Enabled", 0, proc_fs_cifs, | ||
| 531 | ntlmv2_enabled_read, NULL); | ||
| 532 | if (pde) | ||
| 533 | pde->write_proc = ntlmv2_enabled_write; | ||
| 534 | |||
| 535 | pde = | ||
| 536 | create_proc_read_entry("PacketSigningEnabled", 0, proc_fs_cifs, | ||
| 537 | packet_signing_enabled_read, NULL); | ||
| 538 | if (pde) | ||
| 539 | pde->write_proc = packet_signing_enabled_write;*/ | ||
| 540 | } | 403 | } |
| 541 | 404 | ||
| 542 | void | 405 | void |
| @@ -553,39 +416,26 @@ cifs_proc_clean(void) | |||
| 553 | #endif | 416 | #endif |
| 554 | remove_proc_entry("MultiuserMount", proc_fs_cifs); | 417 | remove_proc_entry("MultiuserMount", proc_fs_cifs); |
| 555 | remove_proc_entry("OplockEnabled", proc_fs_cifs); | 418 | remove_proc_entry("OplockEnabled", proc_fs_cifs); |
| 556 | /* remove_proc_entry("NTLMV2Enabled",proc_fs_cifs); */ | ||
| 557 | remove_proc_entry("SecurityFlags", proc_fs_cifs); | 419 | remove_proc_entry("SecurityFlags", proc_fs_cifs); |
| 558 | /* remove_proc_entry("PacketSigningEnabled", proc_fs_cifs); */ | ||
| 559 | remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs); | 420 | remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs); |
| 560 | remove_proc_entry("Experimental", proc_fs_cifs); | 421 | remove_proc_entry("Experimental", proc_fs_cifs); |
| 561 | remove_proc_entry("LookupCacheEnabled", proc_fs_cifs); | 422 | remove_proc_entry("LookupCacheEnabled", proc_fs_cifs); |
| 562 | remove_proc_entry("fs/cifs", NULL); | 423 | remove_proc_entry("fs/cifs", NULL); |
| 563 | } | 424 | } |
| 564 | 425 | ||
| 565 | static int | 426 | static int cifsFYI_proc_show(struct seq_file *m, void *v) |
| 566 | cifsFYI_read(char *page, char **start, off_t off, int count, | ||
| 567 | int *eof, void *data) | ||
| 568 | { | 427 | { |
| 569 | int len; | 428 | seq_printf(m, "%d\n", cifsFYI); |
| 570 | 429 | return 0; | |
| 571 | len = sprintf(page, "%d\n", cifsFYI); | 430 | } |
| 572 | |||
| 573 | len -= off; | ||
| 574 | *start = page + off; | ||
| 575 | |||
| 576 | if (len > count) | ||
| 577 | len = count; | ||
| 578 | else | ||
| 579 | *eof = 1; | ||
| 580 | |||
| 581 | if (len < 0) | ||
| 582 | len = 0; | ||
| 583 | 431 | ||
| 584 | return len; | 432 | static int cifsFYI_proc_open(struct inode *inode, struct file *file) |
| 433 | { | ||
| 434 | return single_open(file, cifsFYI_proc_show, NULL); | ||
| 585 | } | 435 | } |
| 586 | static int | 436 | |
| 587 | cifsFYI_write(struct file *file, const char __user *buffer, | 437 | static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer, |
| 588 | unsigned long count, void *data) | 438 | size_t count, loff_t *ppos) |
| 589 | { | 439 | { |
| 590 | char c; | 440 | char c; |
| 591 | int rc; | 441 | int rc; |
| @@ -603,30 +453,28 @@ cifsFYI_write(struct file *file, const char __user *buffer, | |||
| 603 | return count; | 453 | return count; |
| 604 | } | 454 | } |
| 605 | 455 | ||
| 606 | static int | 456 | static const struct file_operations cifsFYI_proc_fops = { |
| 607 | oplockEnabled_read(char *page, char **start, off_t off, | 457 | .owner = THIS_MODULE, |
| 608 | int count, int *eof, void *data) | 458 | .open = cifsFYI_proc_open, |
| 609 | { | 459 | .read = seq_read, |
| 610 | int len; | 460 | .llseek = seq_lseek, |
| 611 | 461 | .release = single_release, | |
| 612 | len = sprintf(page, "%d\n", oplockEnabled); | 462 | .write = cifsFYI_proc_write, |
| 613 | 463 | }; | |
| 614 | len -= off; | ||
| 615 | *start = page + off; | ||
| 616 | 464 | ||
| 617 | if (len > count) | 465 | static int cifs_oplock_proc_show(struct seq_file *m, void *v) |
| 618 | len = count; | 466 | { |
| 619 | else | 467 | seq_printf(m, "%d\n", oplockEnabled); |
| 620 | *eof = 1; | 468 | return 0; |
| 621 | 469 | } | |
| 622 | if (len < 0) | ||
| 623 | len = 0; | ||
| 624 | 470 | ||
| 625 | return len; | 471 | static int cifs_oplock_proc_open(struct inode *inode, struct file *file) |
| 472 | { | ||
| 473 | return single_open(file, cifs_oplock_proc_show, NULL); | ||
| 626 | } | 474 | } |
| 627 | static int | 475 | |
| 628 | oplockEnabled_write(struct file *file, const char __user *buffer, | 476 | static ssize_t cifs_oplock_proc_write(struct file *file, |
| 629 | unsigned long count, void *data) | 477 | const char __user *buffer, size_t count, loff_t *ppos) |
| 630 | { | 478 | { |
| 631 | char c; | 479 | char c; |
| 632 | int rc; | 480 | int rc; |
| @@ -642,30 +490,28 @@ oplockEnabled_write(struct file *file, const char __user *buffer, | |||
| 642 | return count; | 490 | return count; |
| 643 | } | 491 | } |
| 644 | 492 | ||
| 645 | static int | 493 | static const struct file_operations cifs_oplock_proc_fops = { |
| 646 | experimEnabled_read(char *page, char **start, off_t off, | 494 | .owner = THIS_MODULE, |
| 647 | int count, int *eof, void *data) | 495 | .open = cifs_oplock_proc_open, |
| 648 | { | 496 | .read = seq_read, |
| 649 | int len; | 497 | .llseek = seq_lseek, |
| 650 | 498 | .release = single_release, | |
| 651 | len = sprintf(page, "%d\n", experimEnabled); | 499 | .write = cifs_oplock_proc_write, |
| 500 | }; | ||
| 652 | 501 | ||
| 653 | len -= off; | 502 | static int cifs_experimental_proc_show(struct seq_file *m, void *v) |
| 654 | *start = page + off; | 503 | { |
| 655 | 504 | seq_printf(m, "%d\n", experimEnabled); | |
| 656 | if (len > count) | 505 | return 0; |
| 657 | len = count; | 506 | } |
| 658 | else | ||
| 659 | *eof = 1; | ||
| 660 | |||
| 661 | if (len < 0) | ||
| 662 | len = 0; | ||
| 663 | 507 | ||
| 664 | return len; | 508 | static int cifs_experimental_proc_open(struct inode *inode, struct file *file) |
| 509 | { | ||
| 510 | return single_open(file, cifs_experimental_proc_show, NULL); | ||
| 665 | } | 511 | } |
| 666 | static int | 512 | |
| 667 | experimEnabled_write(struct file *file, const char __user *buffer, | 513 | static ssize_t cifs_experimental_proc_write(struct file *file, |
| 668 | unsigned long count, void *data) | 514 | const char __user *buffer, size_t count, loff_t *ppos) |
| 669 | { | 515 | { |
| 670 | char c; | 516 | char c; |
| 671 | int rc; | 517 | int rc; |
| @@ -683,29 +529,28 @@ experimEnabled_write(struct file *file, const char __user *buffer, | |||
| 683 | return count; | 529 | return count; |
| 684 | } | 530 | } |
| 685 | 531 | ||
| 686 | static int | 532 | static const struct file_operations cifs_experimental_proc_fops = { |
| 687 | linuxExtensionsEnabled_read(char *page, char **start, off_t off, | 533 | .owner = THIS_MODULE, |
| 688 | int count, int *eof, void *data) | 534 | .open = cifs_experimental_proc_open, |
| 689 | { | 535 | .read = seq_read, |
| 690 | int len; | 536 | .llseek = seq_lseek, |
| 691 | 537 | .release = single_release, | |
| 692 | len = sprintf(page, "%d\n", linuxExtEnabled); | 538 | .write = cifs_experimental_proc_write, |
| 693 | len -= off; | 539 | }; |
| 694 | *start = page + off; | ||
| 695 | 540 | ||
| 696 | if (len > count) | 541 | static int cifs_linux_ext_proc_show(struct seq_file *m, void *v) |
| 697 | len = count; | 542 | { |
| 698 | else | 543 | seq_printf(m, "%d\n", linuxExtEnabled); |
| 699 | *eof = 1; | 544 | return 0; |
| 700 | 545 | } | |
| 701 | if (len < 0) | ||
| 702 | len = 0; | ||
| 703 | 546 | ||
| 704 | return len; | 547 | static int cifs_linux_ext_proc_open(struct inode *inode, struct file *file) |
| 548 | { | ||
| 549 | return single_open(file, cifs_linux_ext_proc_show, NULL); | ||
| 705 | } | 550 | } |
| 706 | static int | 551 | |
| 707 | linuxExtensionsEnabled_write(struct file *file, const char __user *buffer, | 552 | static ssize_t cifs_linux_ext_proc_write(struct file *file, |
| 708 | unsigned long count, void *data) | 553 | const char __user *buffer, size_t count, loff_t *ppos) |
| 709 | { | 554 | { |
| 710 | char c; | 555 | char c; |
| 711 | int rc; | 556 | int rc; |
| @@ -721,31 +566,28 @@ linuxExtensionsEnabled_write(struct file *file, const char __user *buffer, | |||
| 721 | return count; | 566 | return count; |
| 722 | } | 567 | } |
| 723 | 568 | ||
| 569 | static const struct file_operations cifs_linux_ext_proc_fops = { | ||
| 570 | .owner = THIS_MODULE, | ||
| 571 | .open = cifs_linux_ext_proc_open, | ||
| 572 | .read = seq_read, | ||
| 573 | .llseek = seq_lseek, | ||
| 574 | .release = single_release, | ||
| 575 | .write = cifs_linux_ext_proc_write, | ||
| 576 | }; | ||
| 724 | 577 | ||
| 725 | static int | 578 | static int cifs_lookup_cache_proc_show(struct seq_file *m, void *v) |
| 726 | lookupFlag_read(char *page, char **start, off_t off, | ||
| 727 | int count, int *eof, void *data) | ||
| 728 | { | 579 | { |
| 729 | int len; | 580 | seq_printf(m, "%d\n", lookupCacheEnabled); |
| 730 | 581 | return 0; | |
| 731 | len = sprintf(page, "%d\n", lookupCacheEnabled); | 582 | } |
| 732 | |||
| 733 | len -= off; | ||
| 734 | *start = page + off; | ||
| 735 | |||
| 736 | if (len > count) | ||
| 737 | len = count; | ||
| 738 | else | ||
| 739 | *eof = 1; | ||
| 740 | |||
| 741 | if (len < 0) | ||
| 742 | len = 0; | ||
| 743 | 583 | ||
| 744 | return len; | 584 | static int cifs_lookup_cache_proc_open(struct inode *inode, struct file *file) |
| 585 | { | ||
| 586 | return single_open(file, cifs_lookup_cache_proc_show, NULL); | ||
| 745 | } | 587 | } |
| 746 | static int | 588 | |
| 747 | lookupFlag_write(struct file *file, const char __user *buffer, | 589 | static ssize_t cifs_lookup_cache_proc_write(struct file *file, |
| 748 | unsigned long count, void *data) | 590 | const char __user *buffer, size_t count, loff_t *ppos) |
| 749 | { | 591 | { |
| 750 | char c; | 592 | char c; |
| 751 | int rc; | 593 | int rc; |
| @@ -760,30 +602,29 @@ lookupFlag_write(struct file *file, const char __user *buffer, | |||
| 760 | 602 | ||
| 761 | return count; | 603 | return count; |
| 762 | } | 604 | } |
| 763 | static int | ||
| 764 | traceSMB_read(char *page, char **start, off_t off, int count, | ||
| 765 | int *eof, void *data) | ||
| 766 | { | ||
| 767 | int len; | ||
| 768 | |||
| 769 | len = sprintf(page, "%d\n", traceSMB); | ||
| 770 | |||
| 771 | len -= off; | ||
| 772 | *start = page + off; | ||
| 773 | 605 | ||
| 774 | if (len > count) | 606 | static const struct file_operations cifs_lookup_cache_proc_fops = { |
| 775 | len = count; | 607 | .owner = THIS_MODULE, |
| 776 | else | 608 | .open = cifs_lookup_cache_proc_open, |
| 777 | *eof = 1; | 609 | .read = seq_read, |
| 610 | .llseek = seq_lseek, | ||
| 611 | .release = single_release, | ||
| 612 | .write = cifs_lookup_cache_proc_write, | ||
| 613 | }; | ||
| 778 | 614 | ||
| 779 | if (len < 0) | 615 | static int traceSMB_proc_show(struct seq_file *m, void *v) |
| 780 | len = 0; | 616 | { |
| 617 | seq_printf(m, "%d\n", traceSMB); | ||
| 618 | return 0; | ||
| 619 | } | ||
| 781 | 620 | ||
| 782 | return len; | 621 | static int traceSMB_proc_open(struct inode *inode, struct file *file) |
| 622 | { | ||
| 623 | return single_open(file, traceSMB_proc_show, NULL); | ||
| 783 | } | 624 | } |
| 784 | static int | 625 | |
| 785 | traceSMB_write(struct file *file, const char __user *buffer, | 626 | static ssize_t traceSMB_proc_write(struct file *file, const char __user *buffer, |
| 786 | unsigned long count, void *data) | 627 | size_t count, loff_t *ppos) |
| 787 | { | 628 | { |
| 788 | char c; | 629 | char c; |
| 789 | int rc; | 630 | int rc; |
| @@ -799,30 +640,28 @@ traceSMB_write(struct file *file, const char __user *buffer, | |||
| 799 | return count; | 640 | return count; |
| 800 | } | 641 | } |
| 801 | 642 | ||
| 802 | static int | 643 | static const struct file_operations traceSMB_proc_fops = { |
| 803 | multiuser_mount_read(char *page, char **start, off_t off, | 644 | .owner = THIS_MODULE, |
| 804 | int count, int *eof, void *data) | 645 | .open = traceSMB_proc_open, |
| 805 | { | 646 | .read = seq_read, |
| 806 | int len; | 647 | .llseek = seq_lseek, |
| 807 | 648 | .release = single_release, | |
| 808 | len = sprintf(page, "%d\n", multiuser_mount); | 649 | .write = traceSMB_proc_write, |
| 809 | 650 | }; | |
| 810 | len -= off; | ||
| 811 | *start = page + off; | ||
| 812 | 651 | ||
| 813 | if (len > count) | 652 | static int cifs_multiuser_mount_proc_show(struct seq_file *m, void *v) |
| 814 | len = count; | 653 | { |
| 815 | else | 654 | seq_printf(m, "%d\n", multiuser_mount); |
| 816 | *eof = 1; | 655 | return 0; |
| 817 | 656 | } | |
| 818 | if (len < 0) | ||
| 819 | len = 0; | ||
| 820 | 657 | ||
| 821 | return len; | 658 | static int cifs_multiuser_mount_proc_open(struct inode *inode, struct file *fh) |
| 659 | { | ||
| 660 | return single_open(fh, cifs_multiuser_mount_proc_show, NULL); | ||
| 822 | } | 661 | } |
| 823 | static int | 662 | |
| 824 | multiuser_mount_write(struct file *file, const char __user *buffer, | 663 | static ssize_t cifs_multiuser_mount_proc_write(struct file *file, |
| 825 | unsigned long count, void *data) | 664 | const char __user *buffer, size_t count, loff_t *ppos) |
| 826 | { | 665 | { |
| 827 | char c; | 666 | char c; |
| 828 | int rc; | 667 | int rc; |
| @@ -838,30 +677,28 @@ multiuser_mount_write(struct file *file, const char __user *buffer, | |||
| 838 | return count; | 677 | return count; |
| 839 | } | 678 | } |
| 840 | 679 | ||
| 841 | static int | 680 | static const struct file_operations cifs_multiuser_mount_proc_fops = { |
| 842 | security_flags_read(char *page, char **start, off_t off, | 681 | .owner = THIS_MODULE, |
| 843 | int count, int *eof, void *data) | 682 | .open = cifs_multiuser_mount_proc_open, |
| 844 | { | 683 | .read = seq_read, |
| 845 | int len; | 684 | .llseek = seq_lseek, |
| 846 | 685 | .release = single_release, | |
| 847 | len = sprintf(page, "0x%x\n", extended_security); | 686 | .write = cifs_multiuser_mount_proc_write, |
| 848 | 687 | }; | |
| 849 | len -= off; | ||
| 850 | *start = page + off; | ||
| 851 | 688 | ||
| 852 | if (len > count) | 689 | static int cifs_security_flags_proc_show(struct seq_file *m, void *v) |
| 853 | len = count; | 690 | { |
| 854 | else | 691 | seq_printf(m, "0x%x\n", extended_security); |
| 855 | *eof = 1; | 692 | return 0; |
| 856 | 693 | } | |
| 857 | if (len < 0) | ||
| 858 | len = 0; | ||
| 859 | 694 | ||
| 860 | return len; | 695 | static int cifs_security_flags_proc_open(struct inode *inode, struct file *file) |
| 696 | { | ||
| 697 | return single_open(file, cifs_security_flags_proc_show, NULL); | ||
| 861 | } | 698 | } |
| 862 | static int | 699 | |
| 863 | security_flags_write(struct file *file, const char __user *buffer, | 700 | static ssize_t cifs_security_flags_proc_write(struct file *file, |
| 864 | unsigned long count, void *data) | 701 | const char __user *buffer, size_t count, loff_t *ppos) |
| 865 | { | 702 | { |
| 866 | unsigned int flags; | 703 | unsigned int flags; |
| 867 | char flags_string[12]; | 704 | char flags_string[12]; |
| @@ -917,6 +754,15 @@ security_flags_write(struct file *file, const char __user *buffer, | |||
| 917 | /* BB should we turn on MAY flags for other MUST options? */ | 754 | /* BB should we turn on MAY flags for other MUST options? */ |
| 918 | return count; | 755 | return count; |
| 919 | } | 756 | } |
| 757 | |||
| 758 | static const struct file_operations cifs_security_flags_proc_fops = { | ||
| 759 | .owner = THIS_MODULE, | ||
| 760 | .open = cifs_security_flags_proc_open, | ||
| 761 | .read = seq_read, | ||
| 762 | .llseek = seq_lseek, | ||
| 763 | .release = single_release, | ||
| 764 | .write = cifs_security_flags_proc_write, | ||
| 765 | }; | ||
| 920 | #else | 766 | #else |
| 921 | inline void cifs_proc_init(void) | 767 | inline void cifs_proc_init(void) |
| 922 | { | 768 | { |
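
The cifs_debug.c hunks above convert the file from the legacy read_proc/write_proc interface, where every handler had to manage the page buffer, offset, count and *eof bookkeeping by hand, to seq_file show routines registered through proc_create(). A minimal sketch of the same pattern for a single integer tunable follows; all names are illustrative, not taken from the patch.

    #include <linux/module.h>
    #include <linux/proc_fs.h>
    #include <linux/seq_file.h>
    #include <linux/uaccess.h>

    static int example_enabled;            /* the tunable being exported */
    static struct proc_dir_entry *example_dir;

    /* seq_file does all offset/eof bookkeeping; the show routine only prints */
    static int example_proc_show(struct seq_file *m, void *v)
    {
            seq_printf(m, "%d\n", example_enabled);
            return 0;
    }

    static int example_proc_open(struct inode *inode, struct file *file)
    {
            return single_open(file, example_proc_show, NULL);
    }

    /* writes keep the same one-character protocol the cifs files use */
    static ssize_t example_proc_write(struct file *file,
                                      const char __user *buffer,
                                      size_t count, loff_t *ppos)
    {
            char c;

            if (get_user(c, buffer))
                    return -EFAULT;
            example_enabled = (c == '1');
            return count;
    }

    static const struct file_operations example_proc_fops = {
            .owner   = THIS_MODULE,
            .open    = example_proc_open,
            .read    = seq_read,
            .llseek  = seq_lseek,
            .release = single_release,
            .write   = example_proc_write,
    };

    static int __init example_proc_init(void)
    {
            example_dir = proc_mkdir("fs/example", NULL);
            if (!example_dir)
                    return -ENOMEM;
            /* one call replaces create_proc_read_entry() plus the
               write_proc assignment of the old interface */
            proc_create("Enabled", 0, example_dir, &example_proc_fops);
            return 0;
    }
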
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index d82374c9e329..d2c8eef84f3c 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c | |||
| @@ -226,7 +226,7 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd, | |||
| 226 | int err; | 226 | int err; |
| 227 | 227 | ||
| 228 | mntget(newmnt); | 228 | mntget(newmnt); |
| 229 | err = do_add_mount(newmnt, nd, nd->path.mnt->mnt_flags, mntlist); | 229 | err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags, mntlist); |
| 230 | switch (err) { | 230 | switch (err) { |
| 231 | case 0: | 231 | case 0: |
| 232 | path_put(&nd->path); | 232 | path_put(&nd->path); |
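
The one-line change above tracks a VFS interface change: do_add_mount() now takes a bare struct path (the vfsmount/dentry pair) instead of the whole nameidata. A condensed sketch of the helper shown in the hunk, with the error-handling switch of the real code elided:

    #include <linux/mount.h>
    #include <linux/namei.h>
    #include <linux/path.h>

    /* nd->path already is the (mount, dentry) pair the callee used to dig
     * out of the nameidata, so the caller now hands it over directly */
    static int graft_referral_mount(struct vfsmount *newmnt,
                                    struct nameidata *nd,
                                    struct list_head *mntlist)
    {
            struct path *mountpoint = &nd->path;

            mntget(newmnt);
            return do_add_mount(newmnt, mountpoint,
                                mountpoint->mnt->mnt_flags, mntlist);
    }
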
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 7013aaff6aed..117ef4bba68e 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c | |||
| @@ -66,8 +66,8 @@ struct key_type cifs_spnego_key_type = { | |||
| 66 | .describe = user_describe, | 66 | .describe = user_describe, |
| 67 | }; | 67 | }; |
| 68 | 68 | ||
| 69 | #define MAX_VER_STR_LEN 9 /* length of longest version string e.g. | 69 | #define MAX_VER_STR_LEN 8 /* length of longest version string e.g. |
| 70 | strlen(";ver=0xFF") */ | 70 | strlen("ver=0xFF") */ |
| 71 | #define MAX_MECH_STR_LEN 13 /* length of longest security mechanism name, eg | 71 | #define MAX_MECH_STR_LEN 13 /* length of longest security mechanism name, eg |
| 72 | in future could have strlen(";sec=ntlmsspi") */ | 72 | in future could have strlen(";sec=ntlmsspi") */ |
| 73 | #define MAX_IPV6_ADDR_LEN 42 /* eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/60 */ | 73 | #define MAX_IPV6_ADDR_LEN 42 /* eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/60 */ |
| @@ -81,11 +81,15 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo) | |||
| 81 | struct key *spnego_key; | 81 | struct key *spnego_key; |
| 82 | const char *hostname = server->hostname; | 82 | const char *hostname = server->hostname; |
| 83 | 83 | ||
| 84 | /* BB: come up with better scheme for determining length */ | 84 | /* length of fields (with semicolons): ver=0xyz ip4=ipaddress |
| 85 | /* length of fields (with semicolons): ver=0xyz ipv4= ipaddress host= | 85 | host=hostname sec=mechanism uid=0xFF user=username */ |
| 86 | hostname sec=mechanism uid=0x uid */ | 86 | desc_len = MAX_VER_STR_LEN + |
| 87 | desc_len = MAX_VER_STR_LEN + 5 + MAX_IPV6_ADDR_LEN + 1 + 6 + | 87 | 6 /* len of "host=" */ + strlen(hostname) + |
| 88 | strlen(hostname) + MAX_MECH_STR_LEN + 8 + (sizeof(uid_t) * 2); | 88 | 5 /* len of ";ipv4=" */ + MAX_IPV6_ADDR_LEN + |
| 89 | MAX_MECH_STR_LEN + | ||
| 90 | 7 /* len of ";uid=0x" */ + (sizeof(uid_t) * 2) + | ||
| 91 | 6 /* len of ";user=" */ + strlen(sesInfo->userName) + 1; | ||
| 92 | |||
| 89 | spnego_key = ERR_PTR(-ENOMEM); | 93 | spnego_key = ERR_PTR(-ENOMEM); |
| 90 | description = kzalloc(desc_len, GFP_KERNEL); | 94 | description = kzalloc(desc_len, GFP_KERNEL); |
| 91 | if (description == NULL) | 95 | if (description == NULL) |
| @@ -110,9 +114,11 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo) | |||
| 110 | 114 | ||
| 111 | dp = description + strlen(description); | 115 | dp = description + strlen(description); |
| 112 | 116 | ||
| 113 | /* for now, only sec=krb5 is valid */ | 117 | /* for now, only sec=krb5 and sec=mskrb5 are valid */ |
| 114 | if (server->secType == Kerberos) | 118 | if (server->secType == Kerberos) |
| 115 | sprintf(dp, ";sec=krb5"); | 119 | sprintf(dp, ";sec=krb5"); |
| 120 | else if (server->secType == MSKerberos) | ||
| 121 | sprintf(dp, ";sec=mskrb5"); | ||
| 116 | else | 122 | else |
| 117 | goto out; | 123 | goto out; |
| 118 | 124 | ||
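
The reworked desc_len computation above sizes the key description from the fields it will actually hold (ver=, ip4=, host=, sec=, uid=, and the newly added user=) rather than the old guesswork. A sketch of how such a description string might be assembled, assuming the field order given in the hunk's comment; the exact upcall format is defined by the rest of cifs_spnego.c, which is not shown here:

    #include <linux/kernel.h>
    #include <linux/slab.h>
    #include <linux/string.h>
    #include <linux/types.h>

    /* illustrative only: field names follow the comment in the hunk */
    static char *build_spnego_desc(const char *hostname, const char *ipaddr,
                                   const char *mech, uid_t uid,
                                   const char *user)
    {
            size_t len;
            char *desc;

            len = strlen("ver=0x") + 2 +                   /* MAX_VER_STR_LEN */
                  strlen(";ip4=") + strlen(ipaddr) +
                  strlen(";host=") + strlen(hostname) +
                  strlen(";sec=") + strlen(mech) +
                  strlen(";uid=0x") + sizeof(uid_t) * 2 +
                  strlen(";user=") + strlen(user) + 1;     /* trailing NUL */

            desc = kzalloc(len, GFP_KERNEL);
            if (!desc)
                    return NULL;

            snprintf(desc, len,
                     "ver=0x%x;ip4=%s;host=%s;sec=%s;uid=0x%x;user=%s",
                     CIFS_SPNEGO_UPCALL_VERSION, ipaddr, hostname, mech,
                     (unsigned int)uid, user);
            return desc;
    }
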
diff --git a/fs/cifs/cifs_spnego.h b/fs/cifs/cifs_spnego.h index 05a34b17a1ab..e4041ec4d712 100644 --- a/fs/cifs/cifs_spnego.h +++ b/fs/cifs/cifs_spnego.h | |||
| @@ -23,7 +23,7 @@ | |||
| 23 | #ifndef _CIFS_SPNEGO_H | 23 | #ifndef _CIFS_SPNEGO_H |
| 24 | #define _CIFS_SPNEGO_H | 24 | #define _CIFS_SPNEGO_H |
| 25 | 25 | ||
| 26 | #define CIFS_SPNEGO_UPCALL_VERSION 1 | 26 | #define CIFS_SPNEGO_UPCALL_VERSION 2 |
| 27 | 27 | ||
| 28 | /* | 28 | /* |
| 29 | * The version field should always be set to CIFS_SPNEGO_UPCALL_VERSION. | 29 | * The version field should always be set to CIFS_SPNEGO_UPCALL_VERSION. |
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 0e9fc2ba90ee..57ecdc83c26f 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c | |||
| @@ -56,7 +56,7 @@ int match_sid(struct cifs_sid *ctsid) | |||
| 56 | struct cifs_sid *cwsid; | 56 | struct cifs_sid *cwsid; |
| 57 | 57 | ||
| 58 | if (!ctsid) | 58 | if (!ctsid) |
| 59 | return (-1); | 59 | return -1; |
| 60 | 60 | ||
| 61 | for (i = 0; i < NUM_WK_SIDS; ++i) { | 61 | for (i = 0; i < NUM_WK_SIDS; ++i) { |
| 62 | cwsid = &(wksidarr[i].cifssid); | 62 | cwsid = &(wksidarr[i].cifssid); |
| @@ -87,11 +87,11 @@ int match_sid(struct cifs_sid *ctsid) | |||
| 87 | } | 87 | } |
| 88 | 88 | ||
| 89 | cFYI(1, ("matching sid: %s\n", wksidarr[i].sidname)); | 89 | cFYI(1, ("matching sid: %s\n", wksidarr[i].sidname)); |
| 90 | return (0); /* sids compare/match */ | 90 | return 0; /* sids compare/match */ |
| 91 | } | 91 | } |
| 92 | 92 | ||
| 93 | cFYI(1, ("No matching sid")); | 93 | cFYI(1, ("No matching sid")); |
| 94 | return (-1); | 94 | return -1; |
| 95 | } | 95 | } |
| 96 | 96 | ||
| 97 | /* if the two SIDs (roughly equivalent to a UUID for a user or group) are | 97 | /* if the two SIDs (roughly equivalent to a UUID for a user or group) are |
| @@ -102,16 +102,16 @@ int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid) | |||
| 102 | int num_subauth, num_sat, num_saw; | 102 | int num_subauth, num_sat, num_saw; |
| 103 | 103 | ||
| 104 | if ((!ctsid) || (!cwsid)) | 104 | if ((!ctsid) || (!cwsid)) |
| 105 | return (0); | 105 | return 0; |
| 106 | 106 | ||
| 107 | /* compare the revision */ | 107 | /* compare the revision */ |
| 108 | if (ctsid->revision != cwsid->revision) | 108 | if (ctsid->revision != cwsid->revision) |
| 109 | return (0); | 109 | return 0; |
| 110 | 110 | ||
| 111 | /* compare all of the six auth values */ | 111 | /* compare all of the six auth values */ |
| 112 | for (i = 0; i < 6; ++i) { | 112 | for (i = 0; i < 6; ++i) { |
| 113 | if (ctsid->authority[i] != cwsid->authority[i]) | 113 | if (ctsid->authority[i] != cwsid->authority[i]) |
| 114 | return (0); | 114 | return 0; |
| 115 | } | 115 | } |
| 116 | 116 | ||
| 117 | /* compare all of the subauth values if any */ | 117 | /* compare all of the subauth values if any */ |
| @@ -121,11 +121,11 @@ int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid) | |||
| 121 | if (num_subauth) { | 121 | if (num_subauth) { |
| 122 | for (i = 0; i < num_subauth; ++i) { | 122 | for (i = 0; i < num_subauth; ++i) { |
| 123 | if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) | 123 | if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) |
| 124 | return (0); | 124 | return 0; |
| 125 | } | 125 | } |
| 126 | } | 126 | } |
| 127 | 127 | ||
| 128 | return (1); /* sids compare/match */ | 128 | return 1; /* sids compare/match */ |
| 129 | } | 129 | } |
| 130 | 130 | ||
| 131 | 131 | ||
| @@ -169,8 +169,7 @@ static void copy_sec_desc(const struct cifs_ntsd *pntsd, | |||
| 169 | for (i = 0; i < 6; i++) | 169 | for (i = 0; i < 6; i++) |
| 170 | ngroup_sid_ptr->authority[i] = group_sid_ptr->authority[i]; | 170 | ngroup_sid_ptr->authority[i] = group_sid_ptr->authority[i]; |
| 171 | for (i = 0; i < 5; i++) | 171 | for (i = 0; i < 5; i++) |
| 172 | ngroup_sid_ptr->sub_auth[i] = | 172 | ngroup_sid_ptr->sub_auth[i] = group_sid_ptr->sub_auth[i]; |
| 173 | cpu_to_le32(group_sid_ptr->sub_auth[i]); | ||
| 174 | 173 | ||
| 175 | return; | 174 | return; |
| 176 | } | 175 | } |
| @@ -285,7 +284,7 @@ static __u16 fill_ace_for_sid(struct cifs_ace *pntace, | |||
| 285 | size = 1 + 1 + 2 + 4 + 1 + 1 + 6 + (psid->num_subauth * 4); | 284 | size = 1 + 1 + 2 + 4 + 1 + 1 + 6 + (psid->num_subauth * 4); |
| 286 | pntace->size = cpu_to_le16(size); | 285 | pntace->size = cpu_to_le16(size); |
| 287 | 286 | ||
| 288 | return (size); | 287 | return size; |
| 289 | } | 288 | } |
| 290 | 289 | ||
| 291 | 290 | ||
| @@ -426,7 +425,7 @@ static int set_chmod_dacl(struct cifs_acl *pndacl, struct cifs_sid *pownersid, | |||
| 426 | pndacl->size = cpu_to_le16(size + sizeof(struct cifs_acl)); | 425 | pndacl->size = cpu_to_le16(size + sizeof(struct cifs_acl)); |
| 427 | pndacl->num_aces = cpu_to_le32(3); | 426 | pndacl->num_aces = cpu_to_le32(3); |
| 428 | 427 | ||
| 429 | return (0); | 428 | return 0; |
| 430 | } | 429 | } |
| 431 | 430 | ||
| 432 | 431 | ||
| @@ -510,7 +509,7 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len, | |||
| 510 | sizeof(struct cifs_sid)); */ | 509 | sizeof(struct cifs_sid)); */ |
| 511 | 510 | ||
| 512 | 511 | ||
| 513 | return (0); | 512 | return 0; |
| 514 | } | 513 | } |
| 515 | 514 | ||
| 516 | 515 | ||
| @@ -527,7 +526,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, | |||
| 527 | struct cifs_acl *ndacl_ptr = NULL; /* no need for SACL ptr */ | 526 | struct cifs_acl *ndacl_ptr = NULL; /* no need for SACL ptr */ |
| 528 | 527 | ||
| 529 | if ((inode == NULL) || (pntsd == NULL) || (pnntsd == NULL)) | 528 | if ((inode == NULL) || (pntsd == NULL) || (pnntsd == NULL)) |
| 530 | return (-EIO); | 529 | return -EIO; |
| 531 | 530 | ||
| 532 | owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + | 531 | owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + |
| 533 | le32_to_cpu(pntsd->osidoffset)); | 532 | le32_to_cpu(pntsd->osidoffset)); |
| @@ -550,7 +549,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, | |||
| 550 | /* copy security descriptor control portion and owner and group sid */ | 549 | /* copy security descriptor control portion and owner and group sid */ |
| 551 | copy_sec_desc(pntsd, pnntsd, sidsoffset); | 550 | copy_sec_desc(pntsd, pnntsd, sidsoffset); |
| 552 | 551 | ||
| 553 | return (rc); | 552 | return rc; |
| 554 | } | 553 | } |
| 555 | 554 | ||
| 556 | 555 | ||
| @@ -629,11 +628,11 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, | |||
| 629 | cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode)); | 628 | cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode)); |
| 630 | 629 | ||
| 631 | if (!inode) | 630 | if (!inode) |
| 632 | return (rc); | 631 | return rc; |
| 633 | 632 | ||
| 634 | sb = inode->i_sb; | 633 | sb = inode->i_sb; |
| 635 | if (sb == NULL) | 634 | if (sb == NULL) |
| 636 | return (rc); | 635 | return rc; |
| 637 | 636 | ||
| 638 | cifs_sb = CIFS_SB(sb); | 637 | cifs_sb = CIFS_SB(sb); |
| 639 | xid = GetXid(); | 638 | xid = GetXid(); |
| @@ -652,7 +651,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, | |||
| 652 | if (rc != 0) { | 651 | if (rc != 0) { |
| 653 | cERROR(1, ("Unable to open file to set ACL")); | 652 | cERROR(1, ("Unable to open file to set ACL")); |
| 654 | FreeXid(xid); | 653 | FreeXid(xid); |
| 655 | return (rc); | 654 | return rc; |
| 656 | } | 655 | } |
| 657 | } | 656 | } |
| 658 | 657 | ||
| @@ -665,7 +664,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, | |||
| 665 | 664 | ||
| 666 | FreeXid(xid); | 665 | FreeXid(xid); |
| 667 | 666 | ||
| 668 | return (rc); | 667 | return rc; |
| 669 | } | 668 | } |
| 670 | 669 | ||
| 671 | /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */ | 670 | /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */ |
| @@ -715,7 +714,7 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode) | |||
| 715 | if (!pnntsd) { | 714 | if (!pnntsd) { |
| 716 | cERROR(1, ("Unable to allocate security descriptor")); | 715 | cERROR(1, ("Unable to allocate security descriptor")); |
| 717 | kfree(pntsd); | 716 | kfree(pntsd); |
| 718 | return (-ENOMEM); | 717 | return -ENOMEM; |
| 719 | } | 718 | } |
| 720 | 719 | ||
| 721 | rc = build_sec_desc(pntsd, pnntsd, inode, nmode); | 720 | rc = build_sec_desc(pntsd, pnntsd, inode, nmode); |
| @@ -732,6 +731,6 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode) | |||
| 732 | kfree(pntsd); | 731 | kfree(pntsd); |
| 733 | } | 732 | } |
| 734 | 733 | ||
| 735 | return (rc); | 734 | return rc; |
| 736 | } | 735 | } |
| 737 | #endif /* CONFIG_CIFS_EXPERIMENTAL */ | 736 | #endif /* CONFIG_CIFS_EXPERIMENTAL */ |
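
Most of the cifsacl.c hunks are style cleanups (dropping the redundant parentheses around return values), but the change in copy_sec_desc is a real endianness fix: the group SID's sub_auth words are already little-endian wire format on both sides of the copy, so running them through cpu_to_le32() again would byte-swap them on big-endian hosts. A tiny sketch of the distinction, with hypothetical type and field names:

    #include <linux/types.h>
    #include <asm/byteorder.h>

    struct wire_sid {
            __le32 sub_auth[5];     /* as transmitted: always little-endian */
    };

    static void copy_wire_sid(struct wire_sid *dst, const struct wire_sid *src)
    {
            int i;

            /* both sides hold wire format, so copy the words verbatim ... */
            for (i = 0; i < 5; i++)
                    dst->sub_auth[i] = src->sub_auth[i];
    }

    static u32 read_sub_auth(const struct wire_sid *sid, int i)
    {
            /* ... and convert only where a value is consumed by the CPU */
            return le32_to_cpu(sid->sub_auth[i]);
    }
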
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 4ff8939c6cc7..bd5f13d38450 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c | |||
| @@ -294,6 +294,7 @@ void calc_lanman_hash(struct cifsSesInfo *ses, char *lnm_session_key) | |||
| 294 | 294 | ||
| 295 | if ((ses->server->secMode & SECMODE_PW_ENCRYPT) == 0) | 295 | if ((ses->server->secMode & SECMODE_PW_ENCRYPT) == 0) |
| 296 | if (extended_security & CIFSSEC_MAY_PLNTXT) { | 296 | if (extended_security & CIFSSEC_MAY_PLNTXT) { |
| 297 | memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE); | ||
| 297 | memcpy(lnm_session_key, password_with_pad, | 298 | memcpy(lnm_session_key, password_with_pad, |
| 298 | CIFS_ENCPWD_SIZE); | 299 | CIFS_ENCPWD_SIZE); |
| 299 | return; | 300 | return; |
| @@ -310,9 +311,8 @@ void calc_lanman_hash(struct cifsSesInfo *ses, char *lnm_session_key) | |||
| 310 | utf8 and other multibyte codepages each need their own strupper | 311 | utf8 and other multibyte codepages each need their own strupper |
| 311 | function since a byte at a time will ont work. */ | 312 | function since a byte at a time will ont work. */ |
| 312 | 313 | ||
| 313 | for (i = 0; i < CIFS_ENCPWD_SIZE; i++) { | 314 | for (i = 0; i < CIFS_ENCPWD_SIZE; i++) |
| 314 | password_with_pad[i] = toupper(password_with_pad[i]); | 315 | password_with_pad[i] = toupper(password_with_pad[i]); |
| 315 | } | ||
| 316 | 316 | ||
| 317 | SMBencrypt(password_with_pad, ses->server->cryptKey, lnm_session_key); | 317 | SMBencrypt(password_with_pad, ses->server->cryptKey, lnm_session_key); |
| 318 | /* clear password before we return/free memory */ | 318 | /* clear password before we return/free memory */ |
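
The added memset in calc_lanman_hash makes the plaintext-password fallback fill the whole key buffer: previously only CIFS_ENCPWD_SIZE bytes were copied in, leaving the tail of lnm_session_key as whatever happened to be in memory. A minimal sketch of the pattern; the constant names are reused from the hunk and their values are whatever cifspdu.h defines:

    #include <linux/string.h>

    /* Fill a fixed-size response buffer from a shorter source: zero the
     * whole destination first so the bytes beyond the copy are never
     * stale memory. */
    static void fill_plaintext_key(char *key, size_t key_len,
                                   const char *pw_padded, size_t pw_len)
    {
            memset(key, 0, key_len);          /* e.g. CIFS_SESS_KEY_SIZE */
            memcpy(key, pw_padded, pw_len);   /* e.g. CIFS_ENCPWD_SIZE */
    }
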
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 22857c639df5..25ecbd5b0404 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c | |||
| @@ -175,6 +175,8 @@ out_no_root: | |||
| 175 | if (inode) | 175 | if (inode) |
| 176 | iput(inode); | 176 | iput(inode); |
| 177 | 177 | ||
| 178 | cifs_umount(sb, cifs_sb); | ||
| 179 | |||
| 178 | out_mount_failed: | 180 | out_mount_failed: |
| 179 | if (cifs_sb) { | 181 | if (cifs_sb) { |
| 180 | #ifdef CONFIG_CIFS_DFS_UPCALL | 182 | #ifdef CONFIG_CIFS_DFS_UPCALL |
| @@ -267,7 +269,7 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
| 267 | return 0; | 269 | return 0; |
| 268 | } | 270 | } |
| 269 | 271 | ||
| 270 | static int cifs_permission(struct inode *inode, int mask, struct nameidata *nd) | 272 | static int cifs_permission(struct inode *inode, int mask) |
| 271 | { | 273 | { |
| 272 | struct cifs_sb_info *cifs_sb; | 274 | struct cifs_sb_info *cifs_sb; |
| 273 | 275 | ||
| @@ -766,7 +768,7 @@ const struct file_operations cifs_dir_ops = { | |||
| 766 | }; | 768 | }; |
| 767 | 769 | ||
| 768 | static void | 770 | static void |
| 769 | cifs_init_once(struct kmem_cache *cachep, void *inode) | 771 | cifs_init_once(void *inode) |
| 770 | { | 772 | { |
| 771 | struct cifsInodeInfo *cifsi = inode; | 773 | struct cifsInodeInfo *cifsi = inode; |
| 772 | 774 | ||
| @@ -930,36 +932,34 @@ static int cifs_oplock_thread(void *dummyarg) | |||
| 930 | schedule_timeout(39*HZ); | 932 | schedule_timeout(39*HZ); |
| 931 | } else { | 933 | } else { |
| 932 | oplock_item = list_entry(GlobalOplock_Q.next, | 934 | oplock_item = list_entry(GlobalOplock_Q.next, |
| 933 | struct oplock_q_entry, qhead); | 935 | struct oplock_q_entry, qhead); |
| 934 | if (oplock_item) { | 936 | cFYI(1, ("found oplock item to write out")); |
| 935 | cFYI(1, ("found oplock item to write out")); | 937 | pTcon = oplock_item->tcon; |
| 936 | pTcon = oplock_item->tcon; | 938 | inode = oplock_item->pinode; |
| 937 | inode = oplock_item->pinode; | 939 | netfid = oplock_item->netfid; |
| 938 | netfid = oplock_item->netfid; | 940 | spin_unlock(&GlobalMid_Lock); |
| 939 | spin_unlock(&GlobalMid_Lock); | 941 | DeleteOplockQEntry(oplock_item); |
| 940 | DeleteOplockQEntry(oplock_item); | 942 | /* can not grab inode sem here since it would |
| 941 | /* can not grab inode sem here since it would | ||
| 942 | deadlock when oplock received on delete | 943 | deadlock when oplock received on delete |
| 943 | since vfs_unlink holds the i_mutex across | 944 | since vfs_unlink holds the i_mutex across |
| 944 | the call */ | 945 | the call */ |
| 945 | /* mutex_lock(&inode->i_mutex);*/ | 946 | /* mutex_lock(&inode->i_mutex);*/ |
| 946 | if (S_ISREG(inode->i_mode)) { | 947 | if (S_ISREG(inode->i_mode)) { |
| 947 | rc = | 948 | rc = filemap_fdatawrite(inode->i_mapping); |
| 948 | filemap_fdatawrite(inode->i_mapping); | 949 | if (CIFS_I(inode)->clientCanCacheRead == 0) { |
| 949 | if (CIFS_I(inode)->clientCanCacheRead | 950 | waitrc = filemap_fdatawait( |
| 950 | == 0) { | 951 | inode->i_mapping); |
| 951 | waitrc = filemap_fdatawait(inode->i_mapping); | 952 | invalidate_remote_inode(inode); |
| 952 | invalidate_remote_inode(inode); | 953 | } |
| 953 | } | 954 | if (rc == 0) |
| 954 | if (rc == 0) | 955 | rc = waitrc; |
| 955 | rc = waitrc; | 956 | } else |
| 956 | } else | 957 | rc = 0; |
| 957 | rc = 0; | 958 | /* mutex_unlock(&inode->i_mutex);*/ |
| 958 | /* mutex_unlock(&inode->i_mutex);*/ | 959 | if (rc) |
| 959 | if (rc) | 960 | CIFS_I(inode)->write_behind_rc = rc; |
| 960 | CIFS_I(inode)->write_behind_rc = rc; | 961 | cFYI(1, ("Oplock flush inode %p rc %d", |
| 961 | cFYI(1, ("Oplock flush inode %p rc %d", | 962 | inode, rc)); |
| 962 | inode, rc)); | ||
| 963 | 963 | ||
| 964 | /* releasing stale oplock after recent reconnect | 964 | /* releasing stale oplock after recent reconnect |
| 965 | of smb session using a now incorrect file | 965 | of smb session using a now incorrect file |
| @@ -967,15 +967,13 @@ static int cifs_oplock_thread(void *dummyarg) | |||
| 967 | not bother sending an oplock release if session | 967 | not bother sending an oplock release if session |
| 968 | to server still is disconnected since oplock | 968 | to server still is disconnected since oplock |
| 969 | already released by the server in that case */ | 969 | already released by the server in that case */ |
| 970 | if (pTcon->tidStatus != CifsNeedReconnect) { | 970 | if (pTcon->tidStatus != CifsNeedReconnect) { |
| 971 | rc = CIFSSMBLock(0, pTcon, netfid, | 971 | rc = CIFSSMBLock(0, pTcon, netfid, |
| 972 | 0 /* len */ , 0 /* offset */, 0, | 972 | 0 /* len */ , 0 /* offset */, 0, |
| 973 | 0, LOCKING_ANDX_OPLOCK_RELEASE, | 973 | 0, LOCKING_ANDX_OPLOCK_RELEASE, |
| 974 | false /* wait flag */); | 974 | false /* wait flag */); |
| 975 | cFYI(1, ("Oplock release rc = %d", rc)); | 975 | cFYI(1, ("Oplock release rc = %d", rc)); |
| 976 | } | 976 | } |
| 977 | } else | ||
| 978 | spin_unlock(&GlobalMid_Lock); | ||
| 979 | set_current_state(TASK_INTERRUPTIBLE); | 977 | set_current_state(TASK_INTERRUPTIBLE); |
| 980 | schedule_timeout(1); /* yield in case q were corrupt */ | 978 | schedule_timeout(1); /* yield in case q were corrupt */ |
| 981 | } | 979 | } |
| @@ -1001,8 +999,7 @@ static int cifs_dnotify_thread(void *dummyarg) | |||
| 1001 | list_for_each(tmp, &GlobalSMBSessionList) { | 999 | list_for_each(tmp, &GlobalSMBSessionList) { |
| 1002 | ses = list_entry(tmp, struct cifsSesInfo, | 1000 | ses = list_entry(tmp, struct cifsSesInfo, |
| 1003 | cifsSessionList); | 1001 | cifsSessionList); |
| 1004 | if (ses && ses->server && | 1002 | if (ses->server && atomic_read(&ses->server->inFlight)) |
| 1005 | atomic_read(&ses->server->inFlight)) | ||
| 1006 | wake_up_all(&ses->server->response_q); | 1003 | wake_up_all(&ses->server->response_q); |
| 1007 | } | 1004 | } |
| 1008 | read_unlock(&GlobalSMBSeslock); | 1005 | read_unlock(&GlobalSMBSeslock); |
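
The cifsfs.c hunks track VFS prototype changes (cifs_permission loses the nameidata argument, cifs_init_once loses the kmem_cache argument) and simplify the oplock and dnotify threads by dropping NULL checks on list_entry(), which never returns NULL for an entry taken from a non-empty list. The flush sequence the oplock thread performs is easier to see in isolation; a condensed sketch, not a drop-in replacement for the thread:

    #include <linux/fs.h>

    /* write back dirty pages, then drop cached data if read caching is lost */
    static int flush_on_oplock_break(struct inode *inode, bool can_cache_read)
    {
            int rc = 0, waitrc = 0;

            if (S_ISREG(inode->i_mode)) {
                    rc = filemap_fdatawrite(inode->i_mapping);
                    if (!can_cache_read) {
                            waitrc = filemap_fdatawait(inode->i_mapping);
                            invalidate_remote_inode(inode);
                    }
                    if (rc == 0)
                            rc = waitrc;
            }
            /* the caller records rc in write_behind_rc and, if the tcon is
               still connected, sends the LOCKING_ANDX oplock release */
            return rc;
    }
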
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 25a6cbd15529..135c965c4137 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h | |||
| @@ -101,5 +101,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); | |||
| 101 | extern const struct export_operations cifs_export_ops; | 101 | extern const struct export_operations cifs_export_ops; |
| 102 | #endif /* EXPERIMENTAL */ | 102 | #endif /* EXPERIMENTAL */ |
| 103 | 103 | ||
| 104 | #define CIFS_VERSION "1.53" | 104 | #define CIFS_VERSION "1.54" |
| 105 | #endif /* _CIFSFS_H */ | 105 | #endif /* _CIFSFS_H */ |
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 9cfcf326ead3..8dfd6f24d488 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h | |||
| @@ -27,7 +27,7 @@ | |||
| 27 | #define MAX_SES_INFO 2 | 27 | #define MAX_SES_INFO 2 |
| 28 | #define MAX_TCON_INFO 4 | 28 | #define MAX_TCON_INFO 4 |
| 29 | 29 | ||
| 30 | #define MAX_TREE_SIZE 2 + MAX_SERVER_SIZE + 1 + MAX_SHARE_SIZE + 1 | 30 | #define MAX_TREE_SIZE (2 + MAX_SERVER_SIZE + 1 + MAX_SHARE_SIZE + 1) |
| 31 | #define MAX_SERVER_SIZE 15 | 31 | #define MAX_SERVER_SIZE 15 |
| 32 | #define MAX_SHARE_SIZE 64 /* used to be 20, this should still be enough */ | 32 | #define MAX_SHARE_SIZE 64 /* used to be 20, this should still be enough */ |
| 33 | #define MAX_USERNAME_SIZE 32 /* 32 is to allow for 15 char names + null | 33 | #define MAX_USERNAME_SIZE 32 /* 32 is to allow for 15 char names + null |
| @@ -80,7 +80,8 @@ enum securityEnum { | |||
| 80 | NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */ | 80 | NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */ |
| 81 | RawNTLMSSP, /* NTLMSSP without SPNEGO */ | 81 | RawNTLMSSP, /* NTLMSSP without SPNEGO */ |
| 82 | NTLMSSP, /* NTLMSSP via SPNEGO */ | 82 | NTLMSSP, /* NTLMSSP via SPNEGO */ |
| 83 | Kerberos /* Kerberos via SPNEGO */ | 83 | Kerberos, /* Kerberos via SPNEGO */ |
| 84 | MSKerberos, /* MS Kerberos via SPNEGO */ | ||
| 84 | }; | 85 | }; |
| 85 | 86 | ||
| 86 | enum protocolEnum { | 87 | enum protocolEnum { |
| @@ -537,8 +538,8 @@ require use of the stronger protocol */ | |||
| 537 | #endif /* WEAK_PW_HASH */ | 538 | #endif /* WEAK_PW_HASH */ |
| 538 | #define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */ | 539 | #define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */ |
| 539 | 540 | ||
| 540 | #define CIFSSEC_DEF CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | 541 | #define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2) |
| 541 | #define CIFSSEC_MAX CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2 | 542 | #define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2) |
| 542 | #define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5) | 543 | #define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5) |
| 543 | /* | 544 | /* |
| 544 | ***************************************************************** | 545 | ***************************************************************** |
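
Adding the outer parentheses to CIFSSEC_DEF and CIFSSEC_MAX (and to MAX_TREE_SIZE above) is not cosmetic: the macros expand textually, and | binds more loosely than most operators a caller might wrap around them. A small stand-alone illustration; the flag names are stand-ins, not the real cifs values:

    #include <stdio.h>

    #define MAY_SIGN   0x1
    #define MAY_LANMAN 0x2
    #define MAY_NTLM   0x4

    #define SEC_DEF_BAD   MAY_SIGN | MAY_NTLM    /* no outer parentheses */
    #define SEC_DEF_GOOD (MAY_SIGN | MAY_NTLM)

    int main(void)
    {
            unsigned int flags = MAY_LANMAN;     /* not in the default set */

            /* & binds tighter than |, so this expands to
             * (flags & 0x1) | 0x4, which is never zero */
            if (flags & SEC_DEF_BAD)
                    printf("bad macro: flag looks allowed\n");

            /* the intended masking: 0x2 & (0x1 | 0x4) == 0 */
            if (!(flags & SEC_DEF_GOOD))
                    printf("good macro: flag correctly rejected\n");

            return 0;
    }
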
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 0f327c224da3..d2a073edd1b8 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h | |||
| @@ -31,7 +31,7 @@ | |||
| 31 | #else | 31 | #else |
| 32 | #define CIFS_PROT 0 | 32 | #define CIFS_PROT 0 |
| 33 | #endif | 33 | #endif |
| 34 | #define POSIX_PROT CIFS_PROT+1 | 34 | #define POSIX_PROT (CIFS_PROT+1) |
| 35 | #define BAD_PROT 0xFFFF | 35 | #define BAD_PROT 0xFFFF |
| 36 | 36 | ||
| 37 | /* SMB command codes */ | 37 | /* SMB command codes */ |
| @@ -262,7 +262,7 @@ | |||
| 262 | */ | 262 | */ |
| 263 | #define CIFS_NO_HANDLE 0xFFFF | 263 | #define CIFS_NO_HANDLE 0xFFFF |
| 264 | 264 | ||
| 265 | #define NO_CHANGE_64 cpu_to_le64(0xFFFFFFFFFFFFFFFFULL) | 265 | #define NO_CHANGE_64 0xFFFFFFFFFFFFFFFFULL |
| 266 | #define NO_CHANGE_32 0xFFFFFFFFUL | 266 | #define NO_CHANGE_32 0xFFFFFFFFUL |
| 267 | 267 | ||
| 268 | /* IPC$ in ASCII */ | 268 | /* IPC$ in ASCII */ |
| @@ -341,7 +341,7 @@ | |||
| 341 | #define CREATE_COMPLETE_IF_OPLK 0x00000100 /* should be zero */ | 341 | #define CREATE_COMPLETE_IF_OPLK 0x00000100 /* should be zero */ |
| 342 | #define CREATE_NO_EA_KNOWLEDGE 0x00000200 | 342 | #define CREATE_NO_EA_KNOWLEDGE 0x00000200 |
| 343 | #define CREATE_EIGHT_DOT_THREE 0x00000400 /* doc says this is obsolete | 343 | #define CREATE_EIGHT_DOT_THREE 0x00000400 /* doc says this is obsolete |
| 344 | "open for recovery" flag - should | 344 | "open for recovery" flag should |
| 345 | be zero in any case */ | 345 | be zero in any case */ |
| 346 | #define CREATE_OPEN_FOR_RECOVERY 0x00000400 | 346 | #define CREATE_OPEN_FOR_RECOVERY 0x00000400 |
| 347 | #define CREATE_RANDOM_ACCESS 0x00000800 | 347 | #define CREATE_RANDOM_ACCESS 0x00000800 |
| @@ -414,8 +414,8 @@ struct smb_hdr { | |||
| 414 | __u8 WordCount; | 414 | __u8 WordCount; |
| 415 | } __attribute__((packed)); | 415 | } __attribute__((packed)); |
| 416 | /* given a pointer to an smb_hdr retrieve the value of byte count */ | 416 | /* given a pointer to an smb_hdr retrieve the value of byte count */ |
| 417 | #define BCC(smb_var) ( *(__u16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount))) | 417 | #define BCC(smb_var) (*(__u16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount))) |
| 418 | #define BCC_LE(smb_var) ( *(__le16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount))) | 418 | #define BCC_LE(smb_var) (*(__le16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount))) |
| 419 | /* given a pointer to an smb_hdr retrieve the pointer to the byte area */ | 419 | /* given a pointer to an smb_hdr retrieve the pointer to the byte area */ |
| 420 | #define pByteArea(smb_var) ((unsigned char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount) + 2) | 420 | #define pByteArea(smb_var) ((unsigned char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount) + 2) |
| 421 | 421 | ||
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index b9f5e935f821..a729d083e6f4 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h | |||
| @@ -172,12 +172,13 @@ extern int CIFSSMBQFSUnixInfo(const int xid, struct cifsTconInfo *tcon); | |||
| 172 | extern int CIFSSMBQFSPosixInfo(const int xid, struct cifsTconInfo *tcon, | 172 | extern int CIFSSMBQFSPosixInfo(const int xid, struct cifsTconInfo *tcon, |
| 173 | struct kstatfs *FSData); | 173 | struct kstatfs *FSData); |
| 174 | 174 | ||
| 175 | extern int CIFSSMBSetTimes(const int xid, struct cifsTconInfo *tcon, | 175 | extern int CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon, |
| 176 | const char *fileName, const FILE_BASIC_INFO *data, | 176 | const char *fileName, const FILE_BASIC_INFO *data, |
| 177 | const struct nls_table *nls_codepage, | 177 | const struct nls_table *nls_codepage, |
| 178 | int remap_special_chars); | 178 | int remap_special_chars); |
| 179 | extern int CIFSSMBSetFileTimes(const int xid, struct cifsTconInfo *tcon, | 179 | extern int CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon, |
| 180 | const FILE_BASIC_INFO *data, __u16 fid); | 180 | const FILE_BASIC_INFO *data, __u16 fid, |
| 181 | __u32 pid_of_opener); | ||
| 181 | #if 0 | 182 | #if 0 |
| 182 | extern int CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon, | 183 | extern int CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon, |
| 183 | char *fileName, __u16 dos_attributes, | 184 | char *fileName, __u16 dos_attributes, |
| @@ -191,9 +192,20 @@ extern int CIFSSMBSetEOF(const int xid, struct cifsTconInfo *tcon, | |||
| 191 | extern int CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, | 192 | extern int CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, |
| 192 | __u64 size, __u16 fileHandle, __u32 opener_pid, | 193 | __u64 size, __u16 fileHandle, __u32 opener_pid, |
| 193 | bool AllocSizeFlag); | 194 | bool AllocSizeFlag); |
| 194 | extern int CIFSSMBUnixSetPerms(const int xid, struct cifsTconInfo *pTcon, | 195 | |
| 195 | char *full_path, __u64 mode, __u64 uid, | 196 | struct cifs_unix_set_info_args { |
| 196 | __u64 gid, dev_t dev, | 197 | __u64 ctime; |
| 198 | __u64 atime; | ||
| 199 | __u64 mtime; | ||
| 200 | __u64 mode; | ||
| 201 | __u64 uid; | ||
| 202 | __u64 gid; | ||
| 203 | dev_t device; | ||
| 204 | }; | ||
| 205 | |||
| 206 | extern int CIFSSMBUnixSetInfo(const int xid, struct cifsTconInfo *pTcon, | ||
| 207 | char *fileName, | ||
| 208 | const struct cifs_unix_set_info_args *args, | ||
| 197 | const struct nls_table *nls_codepage, | 209 | const struct nls_table *nls_codepage, |
| 198 | int remap_special_chars); | 210 | int remap_special_chars); |
| 199 | 211 | ||
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 4511b708f0f3..994de7c90474 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c | |||
| @@ -128,8 +128,7 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon) | |||
| 128 | write_lock(&GlobalSMBSeslock); | 128 | write_lock(&GlobalSMBSeslock); |
| 129 | list_for_each_safe(tmp, tmp1, &pTcon->openFileList) { | 129 | list_for_each_safe(tmp, tmp1, &pTcon->openFileList) { |
| 130 | open_file = list_entry(tmp, struct cifsFileInfo, tlist); | 130 | open_file = list_entry(tmp, struct cifsFileInfo, tlist); |
| 131 | if (open_file) | 131 | open_file->invalidHandle = true; |
| 132 | open_file->invalidHandle = true; | ||
| 133 | } | 132 | } |
| 134 | write_unlock(&GlobalSMBSeslock); | 133 | write_unlock(&GlobalSMBSeslock); |
| 135 | /* BB Add call to invalidate_inodes(sb) for all superblocks mounted | 134 | /* BB Add call to invalidate_inodes(sb) for all superblocks mounted |
| @@ -686,11 +685,10 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) | |||
| 686 | SecurityBlob, | 685 | SecurityBlob, |
| 687 | count - 16, | 686 | count - 16, |
| 688 | &server->secType); | 687 | &server->secType); |
| 689 | if (rc == 1) { | 688 | if (rc == 1) |
| 690 | rc = 0; | 689 | rc = 0; |
| 691 | } else { | 690 | else |
| 692 | rc = -EINVAL; | 691 | rc = -EINVAL; |
| 693 | } | ||
| 694 | } | 692 | } |
| 695 | } else | 693 | } else |
| 696 | server->capabilities &= ~CAP_EXTENDED_SECURITY; | 694 | server->capabilities &= ~CAP_EXTENDED_SECURITY; |
| @@ -3914,7 +3912,10 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, | |||
| 3914 | bool is_unicode; | 3912 | bool is_unicode; |
| 3915 | struct dfs_referral_level_3 *ref; | 3913 | struct dfs_referral_level_3 *ref; |
| 3916 | 3914 | ||
| 3917 | is_unicode = pSMBr->hdr.Flags2 & SMBFLG2_UNICODE; | 3915 | if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) |
| 3916 | is_unicode = true; | ||
| 3917 | else | ||
| 3918 | is_unicode = false; | ||
| 3918 | *num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals); | 3919 | *num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals); |
| 3919 | 3920 | ||
| 3920 | if (*num_of_nodes < 1) { | 3921 | if (*num_of_nodes < 1) { |
| @@ -4814,8 +4815,8 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size, | |||
| 4814 | time and resort to the original setpathinfo level which takes the ancient | 4815 | time and resort to the original setpathinfo level which takes the ancient |
| 4815 | DOS time format with 2 second granularity */ | 4816 | DOS time format with 2 second granularity */ |
| 4816 | int | 4817 | int |
| 4817 | CIFSSMBSetFileTimes(const int xid, struct cifsTconInfo *tcon, | 4818 | CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon, |
| 4818 | const FILE_BASIC_INFO *data, __u16 fid) | 4819 | const FILE_BASIC_INFO *data, __u16 fid, __u32 pid_of_opener) |
| 4819 | { | 4820 | { |
| 4820 | struct smb_com_transaction2_sfi_req *pSMB = NULL; | 4821 | struct smb_com_transaction2_sfi_req *pSMB = NULL; |
| 4821 | char *data_offset; | 4822 | char *data_offset; |
| @@ -4828,11 +4829,8 @@ CIFSSMBSetFileTimes(const int xid, struct cifsTconInfo *tcon, | |||
| 4828 | if (rc) | 4829 | if (rc) |
| 4829 | return rc; | 4830 | return rc; |
| 4830 | 4831 | ||
| 4831 | /* At this point there is no need to override the current pid | 4832 | pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener); |
| 4832 | with the pid of the opener, but that could change if we someday | 4833 | pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16)); |
| 4833 | use an existing handle (rather than opening one on the fly) */ | ||
| 4834 | /* pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener); | ||
| 4835 | pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16));*/ | ||
| 4836 | 4834 | ||
| 4837 | params = 6; | 4835 | params = 6; |
| 4838 | pSMB->MaxSetupCount = 0; | 4836 | pSMB->MaxSetupCount = 0; |
| @@ -4880,9 +4878,9 @@ CIFSSMBSetFileTimes(const int xid, struct cifsTconInfo *tcon, | |||
| 4880 | 4878 | ||
| 4881 | 4879 | ||
| 4882 | int | 4880 | int |
| 4883 | CIFSSMBSetTimes(const int xid, struct cifsTconInfo *tcon, const char *fileName, | 4881 | CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon, |
| 4884 | const FILE_BASIC_INFO *data, | 4882 | const char *fileName, const FILE_BASIC_INFO *data, |
| 4885 | const struct nls_table *nls_codepage, int remap) | 4883 | const struct nls_table *nls_codepage, int remap) |
| 4886 | { | 4884 | { |
| 4887 | TRANSACTION2_SPI_REQ *pSMB = NULL; | 4885 | TRANSACTION2_SPI_REQ *pSMB = NULL; |
| 4888 | TRANSACTION2_SPI_RSP *pSMBr = NULL; | 4886 | TRANSACTION2_SPI_RSP *pSMBr = NULL; |
| @@ -5011,10 +5009,9 @@ SetAttrLgcyRetry: | |||
| 5011 | #endif /* temporarily unneeded SetAttr legacy function */ | 5009 | #endif /* temporarily unneeded SetAttr legacy function */ |
| 5012 | 5010 | ||
| 5013 | int | 5011 | int |
| 5014 | CIFSSMBUnixSetPerms(const int xid, struct cifsTconInfo *tcon, | 5012 | CIFSSMBUnixSetInfo(const int xid, struct cifsTconInfo *tcon, char *fileName, |
| 5015 | char *fileName, __u64 mode, __u64 uid, __u64 gid, | 5013 | const struct cifs_unix_set_info_args *args, |
| 5016 | dev_t device, const struct nls_table *nls_codepage, | 5014 | const struct nls_table *nls_codepage, int remap) |
| 5017 | int remap) | ||
| 5018 | { | 5015 | { |
| 5019 | TRANSACTION2_SPI_REQ *pSMB = NULL; | 5016 | TRANSACTION2_SPI_REQ *pSMB = NULL; |
| 5020 | TRANSACTION2_SPI_RSP *pSMBr = NULL; | 5017 | TRANSACTION2_SPI_RSP *pSMBr = NULL; |
| @@ -5023,6 +5020,7 @@ CIFSSMBUnixSetPerms(const int xid, struct cifsTconInfo *tcon, | |||
| 5023 | int bytes_returned = 0; | 5020 | int bytes_returned = 0; |
| 5024 | FILE_UNIX_BASIC_INFO *data_offset; | 5021 | FILE_UNIX_BASIC_INFO *data_offset; |
| 5025 | __u16 params, param_offset, offset, count, byte_count; | 5022 | __u16 params, param_offset, offset, count, byte_count; |
| 5023 | __u64 mode = args->mode; | ||
| 5026 | 5024 | ||
| 5027 | cFYI(1, ("In SetUID/GID/Mode")); | 5025 | cFYI(1, ("In SetUID/GID/Mode")); |
| 5028 | setPermsRetry: | 5026 | setPermsRetry: |
| @@ -5078,16 +5076,16 @@ setPermsRetry: | |||
| 5078 | set file size and do not want to truncate file size to zero | 5076 | set file size and do not want to truncate file size to zero |
| 5079 | accidently as happened on one Samba server beta by putting | 5077 | accidently as happened on one Samba server beta by putting |
| 5080 | zero instead of -1 here */ | 5078 | zero instead of -1 here */ |
| 5081 | data_offset->EndOfFile = NO_CHANGE_64; | 5079 | data_offset->EndOfFile = cpu_to_le64(NO_CHANGE_64); |
| 5082 | data_offset->NumOfBytes = NO_CHANGE_64; | 5080 | data_offset->NumOfBytes = cpu_to_le64(NO_CHANGE_64); |
| 5083 | data_offset->LastStatusChange = NO_CHANGE_64; | 5081 | data_offset->LastStatusChange = cpu_to_le64(args->ctime); |
| 5084 | data_offset->LastAccessTime = NO_CHANGE_64; | 5082 | data_offset->LastAccessTime = cpu_to_le64(args->atime); |
| 5085 | data_offset->LastModificationTime = NO_CHANGE_64; | 5083 | data_offset->LastModificationTime = cpu_to_le64(args->mtime); |
| 5086 | data_offset->Uid = cpu_to_le64(uid); | 5084 | data_offset->Uid = cpu_to_le64(args->uid); |
| 5087 | data_offset->Gid = cpu_to_le64(gid); | 5085 | data_offset->Gid = cpu_to_le64(args->gid); |
| 5088 | /* better to leave device as zero when it is */ | 5086 | /* better to leave device as zero when it is */ |
| 5089 | data_offset->DevMajor = cpu_to_le64(MAJOR(device)); | 5087 | data_offset->DevMajor = cpu_to_le64(MAJOR(args->device)); |
| 5090 | data_offset->DevMinor = cpu_to_le64(MINOR(device)); | 5088 | data_offset->DevMinor = cpu_to_le64(MINOR(args->device)); |
| 5091 | data_offset->Permissions = cpu_to_le64(mode); | 5089 | data_offset->Permissions = cpu_to_le64(mode); |
| 5092 | 5090 | ||
| 5093 | if (S_ISREG(mode)) | 5091 | if (S_ISREG(mode)) |
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index e8fa46c7cff2..4c13bcdb92a5 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c | |||
| @@ -151,7 +151,7 @@ cifs_reconnect(struct TCP_Server_Info *server) | |||
| 151 | } | 151 | } |
| 152 | list_for_each(tmp, &GlobalTreeConnectionList) { | 152 | list_for_each(tmp, &GlobalTreeConnectionList) { |
| 153 | tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList); | 153 | tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList); |
| 154 | if ((tcon) && (tcon->ses) && (tcon->ses->server == server)) | 154 | if ((tcon->ses) && (tcon->ses->server == server)) |
| 155 | tcon->tidStatus = CifsNeedReconnect; | 155 | tcon->tidStatus = CifsNeedReconnect; |
| 156 | } | 156 | } |
| 157 | read_unlock(&GlobalSMBSeslock); | 157 | read_unlock(&GlobalSMBSeslock); |
| @@ -173,14 +173,12 @@ cifs_reconnect(struct TCP_Server_Info *server) | |||
| 173 | mid_entry = list_entry(tmp, struct | 173 | mid_entry = list_entry(tmp, struct |
| 174 | mid_q_entry, | 174 | mid_q_entry, |
| 175 | qhead); | 175 | qhead); |
| 176 | if (mid_entry) { | 176 | if (mid_entry->midState == MID_REQUEST_SUBMITTED) { |
| 177 | if (mid_entry->midState == MID_REQUEST_SUBMITTED) { | ||
| 178 | /* Mark other intransit requests as needing | 177 | /* Mark other intransit requests as needing |
| 179 | retry so we do not immediately mark the | 178 | retry so we do not immediately mark the |
| 180 | session bad again (ie after we reconnect | 179 | session bad again (ie after we reconnect |
| 181 | below) as they timeout too */ | 180 | below) as they timeout too */ |
| 182 | mid_entry->midState = MID_RETRY_NEEDED; | 181 | mid_entry->midState = MID_RETRY_NEEDED; |
| 183 | } | ||
| 184 | } | 182 | } |
| 185 | } | 183 | } |
| 186 | spin_unlock(&GlobalMid_Lock); | 184 | spin_unlock(&GlobalMid_Lock); |
| @@ -351,11 +349,9 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server) | |||
| 351 | 349 | ||
| 352 | current->flags |= PF_MEMALLOC; | 350 | current->flags |= PF_MEMALLOC; |
| 353 | cFYI(1, ("Demultiplex PID: %d", task_pid_nr(current))); | 351 | cFYI(1, ("Demultiplex PID: %d", task_pid_nr(current))); |
| 354 | write_lock(&GlobalSMBSeslock); | 352 | |
| 355 | atomic_inc(&tcpSesAllocCount); | 353 | length = atomic_inc_return(&tcpSesAllocCount); |
| 356 | length = tcpSesAllocCount.counter; | 354 | if (length > 1) |
| 357 | write_unlock(&GlobalSMBSeslock); | ||
| 358 | if (length > 1) | ||
| 359 | mempool_resize(cifs_req_poolp, length + cifs_min_rcv, | 355 | mempool_resize(cifs_req_poolp, length + cifs_min_rcv, |
| 360 | GFP_KERNEL); | 356 | GFP_KERNEL); |
| 361 | 357 | ||
| @@ -455,7 +451,7 @@ incomplete_rcv: | |||
| 455 | /* Note that FC 1001 length is big endian on the wire, | 451 | /* Note that FC 1001 length is big endian on the wire, |
| 456 | but we convert it here so it is always manipulated | 452 | but we convert it here so it is always manipulated |
| 457 | as host byte order */ | 453 | as host byte order */ |
| 458 | pdu_length = ntohl(smb_buffer->smb_buf_length); | 454 | pdu_length = be32_to_cpu((__force __be32)smb_buffer->smb_buf_length); |
| 459 | smb_buffer->smb_buf_length = pdu_length; | 455 | smb_buffer->smb_buf_length = pdu_length; |
| 460 | 456 | ||
| 461 | cFYI(1, ("rfc1002 length 0x%x", pdu_length+4)); | 457 | cFYI(1, ("rfc1002 length 0x%x", pdu_length+4)); |
| @@ -745,14 +741,11 @@ multi_t2_fnd: | |||
| 745 | coming home not much else we can do but free the memory */ | 741 | coming home not much else we can do but free the memory */ |
| 746 | } | 742 | } |
| 747 | 743 | ||
| 748 | write_lock(&GlobalSMBSeslock); | ||
| 749 | atomic_dec(&tcpSesAllocCount); | ||
| 750 | length = tcpSesAllocCount.counter; | ||
| 751 | |||
| 752 | /* last chance to mark ses pointers invalid | 744 | /* last chance to mark ses pointers invalid |
| 753 | if there are any pointing to this (e.g | 745 | if there are any pointing to this (e.g |
| 754 | if a crazy root user tried to kill cifsd | 746 | if a crazy root user tried to kill cifsd |
| 755 | kernel thread explicitly this might happen) */ | 747 | kernel thread explicitly this might happen) */ |
| 748 | write_lock(&GlobalSMBSeslock); | ||
| 756 | list_for_each(tmp, &GlobalSMBSessionList) { | 749 | list_for_each(tmp, &GlobalSMBSessionList) { |
| 757 | ses = list_entry(tmp, struct cifsSesInfo, | 750 | ses = list_entry(tmp, struct cifsSesInfo, |
| 758 | cifsSessionList); | 751 | cifsSessionList); |
| @@ -763,6 +756,8 @@ multi_t2_fnd: | |||
| 763 | 756 | ||
| 764 | kfree(server->hostname); | 757 | kfree(server->hostname); |
| 765 | kfree(server); | 758 | kfree(server); |
| 759 | |||
| 760 | length = atomic_dec_return(&tcpSesAllocCount); | ||
| 766 | if (length > 0) | 761 | if (length > 0) |
| 767 | mempool_resize(cifs_req_poolp, length + cifs_min_rcv, | 762 | mempool_resize(cifs_req_poolp, length + cifs_min_rcv, |
| 768 | GFP_KERNEL); | 763 | GFP_KERNEL); |
| @@ -1461,6 +1456,39 @@ get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path, | |||
| 1461 | return rc; | 1456 | return rc; |
| 1462 | } | 1457 | } |
| 1463 | 1458 | ||
| 1459 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
| 1460 | static struct lock_class_key cifs_key[2]; | ||
| 1461 | static struct lock_class_key cifs_slock_key[2]; | ||
| 1462 | |||
| 1463 | static inline void | ||
| 1464 | cifs_reclassify_socket4(struct socket *sock) | ||
| 1465 | { | ||
| 1466 | struct sock *sk = sock->sk; | ||
| 1467 | BUG_ON(sock_owned_by_user(sk)); | ||
| 1468 | sock_lock_init_class_and_name(sk, "slock-AF_INET-CIFS", | ||
| 1469 | &cifs_slock_key[0], "sk_lock-AF_INET-CIFS", &cifs_key[0]); | ||
| 1470 | } | ||
| 1471 | |||
| 1472 | static inline void | ||
| 1473 | cifs_reclassify_socket6(struct socket *sock) | ||
| 1474 | { | ||
| 1475 | struct sock *sk = sock->sk; | ||
| 1476 | BUG_ON(sock_owned_by_user(sk)); | ||
| 1477 | sock_lock_init_class_and_name(sk, "slock-AF_INET6-CIFS", | ||
| 1478 | &cifs_slock_key[1], "sk_lock-AF_INET6-CIFS", &cifs_key[1]); | ||
| 1479 | } | ||
| 1480 | #else | ||
| 1481 | static inline void | ||
| 1482 | cifs_reclassify_socket4(struct socket *sock) | ||
| 1483 | { | ||
| 1484 | } | ||
| 1485 | |||
| 1486 | static inline void | ||
| 1487 | cifs_reclassify_socket6(struct socket *sock) | ||
| 1488 | { | ||
| 1489 | } | ||
| 1490 | #endif | ||
| 1491 | |||
| 1464 | /* See RFC1001 section 14 on representation of Netbios names */ | 1492 | /* See RFC1001 section 14 on representation of Netbios names */ |
| 1465 | static void rfc1002mangle(char *target, char *source, unsigned int length) | 1493 | static void rfc1002mangle(char *target, char *source, unsigned int length) |
| 1466 | { | 1494 | { |
| @@ -1495,6 +1523,7 @@ ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket, | |||
| 1495 | /* BB other socket options to set KEEPALIVE, NODELAY? */ | 1523 | /* BB other socket options to set KEEPALIVE, NODELAY? */ |
| 1496 | cFYI(1, ("Socket created")); | 1524 | cFYI(1, ("Socket created")); |
| 1497 | (*csocket)->sk->sk_allocation = GFP_NOFS; | 1525 | (*csocket)->sk->sk_allocation = GFP_NOFS; |
| 1526 | cifs_reclassify_socket4(*csocket); | ||
| 1498 | } | 1527 | } |
| 1499 | } | 1528 | } |
| 1500 | 1529 | ||
| @@ -1627,6 +1656,7 @@ ipv6_connect(struct sockaddr_in6 *psin_server, struct socket **csocket) | |||
| 1627 | /* BB other socket options to set KEEPALIVE, NODELAY? */ | 1656 | /* BB other socket options to set KEEPALIVE, NODELAY? */ |
| 1628 | cFYI(1, ("ipv6 Socket created")); | 1657 | cFYI(1, ("ipv6 Socket created")); |
| 1629 | (*csocket)->sk->sk_allocation = GFP_NOFS; | 1658 | (*csocket)->sk->sk_allocation = GFP_NOFS; |
| 1659 | cifs_reclassify_socket6(*csocket); | ||
| 1630 | } | 1660 | } |
| 1631 | } | 1661 | } |
| 1632 | 1662 | ||
| @@ -3568,19 +3598,21 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, | |||
| 3568 | char ntlm_session_key[CIFS_SESS_KEY_SIZE]; | 3598 | char ntlm_session_key[CIFS_SESS_KEY_SIZE]; |
| 3569 | bool ntlmv2_flag = false; | 3599 | bool ntlmv2_flag = false; |
| 3570 | int first_time = 0; | 3600 | int first_time = 0; |
| 3601 | struct TCP_Server_Info *server = pSesInfo->server; | ||
| 3571 | 3602 | ||
| 3572 | /* what if server changes its buffer size after dropping the session? */ | 3603 | /* what if server changes its buffer size after dropping the session? */ |
| 3573 | if (pSesInfo->server->maxBuf == 0) /* no need to send on reconnect */ { | 3604 | if (server->maxBuf == 0) /* no need to send on reconnect */ { |
| 3574 | rc = CIFSSMBNegotiate(xid, pSesInfo); | 3605 | rc = CIFSSMBNegotiate(xid, pSesInfo); |
| 3575 | if (rc == -EAGAIN) /* retry only once on 1st time connection */ { | 3606 | if (rc == -EAGAIN) { |
| 3607 | /* retry only once on 1st time connection */ | ||
| 3576 | rc = CIFSSMBNegotiate(xid, pSesInfo); | 3608 | rc = CIFSSMBNegotiate(xid, pSesInfo); |
| 3577 | if (rc == -EAGAIN) | 3609 | if (rc == -EAGAIN) |
| 3578 | rc = -EHOSTDOWN; | 3610 | rc = -EHOSTDOWN; |
| 3579 | } | 3611 | } |
| 3580 | if (rc == 0) { | 3612 | if (rc == 0) { |
| 3581 | spin_lock(&GlobalMid_Lock); | 3613 | spin_lock(&GlobalMid_Lock); |
| 3582 | if (pSesInfo->server->tcpStatus != CifsExiting) | 3614 | if (server->tcpStatus != CifsExiting) |
| 3583 | pSesInfo->server->tcpStatus = CifsGood; | 3615 | server->tcpStatus = CifsGood; |
| 3584 | else | 3616 | else |
| 3585 | rc = -EHOSTDOWN; | 3617 | rc = -EHOSTDOWN; |
| 3586 | spin_unlock(&GlobalMid_Lock); | 3618 | spin_unlock(&GlobalMid_Lock); |
| @@ -3588,97 +3620,90 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, | |||
| 3588 | } | 3620 | } |
| 3589 | first_time = 1; | 3621 | first_time = 1; |
| 3590 | } | 3622 | } |
| 3591 | if (!rc) { | 3623 | |
| 3592 | pSesInfo->flags = 0; | 3624 | if (rc) |
| 3593 | pSesInfo->capabilities = pSesInfo->server->capabilities; | 3625 | goto ss_err_exit; |
| 3594 | if (linuxExtEnabled == 0) | 3626 | |
| 3595 | pSesInfo->capabilities &= (~CAP_UNIX); | 3627 | pSesInfo->flags = 0; |
| 3628 | pSesInfo->capabilities = server->capabilities; | ||
| 3629 | if (linuxExtEnabled == 0) | ||
| 3630 | pSesInfo->capabilities &= (~CAP_UNIX); | ||
| 3596 | /* pSesInfo->sequence_number = 0;*/ | 3631 | /* pSesInfo->sequence_number = 0;*/ |
| 3597 | cFYI(1, | 3632 | cFYI(1, ("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d", |
| 3598 | ("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d", | 3633 | server->secMode, server->capabilities, server->timeAdj)); |
| 3599 | pSesInfo->server->secMode, | 3634 | |
| 3600 | pSesInfo->server->capabilities, | 3635 | if (experimEnabled < 2) |
| 3601 | pSesInfo->server->timeAdj)); | 3636 | rc = CIFS_SessSetup(xid, pSesInfo, first_time, nls_info); |
| 3602 | if (experimEnabled < 2) | 3637 | else if (extended_security |
| 3603 | rc = CIFS_SessSetup(xid, pSesInfo, | 3638 | && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) |
| 3604 | first_time, nls_info); | 3639 | && (server->secType == NTLMSSP)) { |
| 3605 | else if (extended_security | 3640 | rc = -EOPNOTSUPP; |
| 3606 | && (pSesInfo->capabilities | 3641 | } else if (extended_security |
| 3607 | & CAP_EXTENDED_SECURITY) | 3642 | && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) |
| 3608 | && (pSesInfo->server->secType == NTLMSSP)) { | 3643 | && (server->secType == RawNTLMSSP)) { |
| 3609 | rc = -EOPNOTSUPP; | 3644 | cFYI(1, ("NTLMSSP sesssetup")); |
| 3610 | } else if (extended_security | 3645 | rc = CIFSNTLMSSPNegotiateSessSetup(xid, pSesInfo, &ntlmv2_flag, |
| 3611 | && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) | 3646 | nls_info); |
| 3612 | && (pSesInfo->server->secType == RawNTLMSSP)) { | 3647 | if (!rc) { |
| 3613 | cFYI(1, ("NTLMSSP sesssetup")); | 3648 | if (ntlmv2_flag) { |
| 3614 | rc = CIFSNTLMSSPNegotiateSessSetup(xid, | 3649 | char *v2_response; |
| 3615 | pSesInfo, | 3650 | cFYI(1, ("more secure NTLM ver2 hash")); |
| 3616 | &ntlmv2_flag, | 3651 | if (CalcNTLMv2_partial_mac_key(pSesInfo, |
| 3617 | nls_info); | 3652 | nls_info)) { |
| 3618 | if (!rc) { | 3653 | rc = -ENOMEM; |
| 3619 | if (ntlmv2_flag) { | 3654 | goto ss_err_exit; |
| 3620 | char *v2_response; | 3655 | } else |
| 3621 | cFYI(1, ("more secure NTLM ver2 hash")); | 3656 | v2_response = kmalloc(16 + 64 /* blob*/, |
| 3622 | if (CalcNTLMv2_partial_mac_key(pSesInfo, | 3657 | GFP_KERNEL); |
| 3623 | nls_info)) { | 3658 | if (v2_response) { |
| 3624 | rc = -ENOMEM; | 3659 | CalcNTLMv2_response(pSesInfo, |
| 3625 | goto ss_err_exit; | 3660 | v2_response); |
| 3626 | } else | 3661 | /* if (first_time) |
| 3627 | v2_response = kmalloc(16 + 64 /* blob */, GFP_KERNEL); | 3662 | cifs_calculate_ntlmv2_mac_key */ |
| 3628 | if (v2_response) { | 3663 | kfree(v2_response); |
| 3629 | CalcNTLMv2_response(pSesInfo, | ||
| 3630 | v2_response); | ||
| 3631 | /* if (first_time) | ||
| 3632 | cifs_calculate_ntlmv2_mac_key( | ||
| 3633 | pSesInfo->server->mac_signing_key, | ||
| 3634 | response, ntlm_session_key,*/ | ||
| 3635 | kfree(v2_response); | ||
| 3636 | /* BB Put dummy sig in SessSetup PDU? */ | 3664 | /* BB Put dummy sig in SessSetup PDU? */ |
| 3637 | } else { | ||
| 3638 | rc = -ENOMEM; | ||
| 3639 | goto ss_err_exit; | ||
| 3640 | } | ||
| 3641 | |||
| 3642 | } else { | 3665 | } else { |
| 3643 | SMBNTencrypt(pSesInfo->password, | 3666 | rc = -ENOMEM; |
| 3644 | pSesInfo->server->cryptKey, | 3667 | goto ss_err_exit; |
| 3645 | ntlm_session_key); | ||
| 3646 | |||
| 3647 | if (first_time) | ||
| 3648 | cifs_calculate_mac_key( | ||
| 3649 | &pSesInfo->server->mac_signing_key, | ||
| 3650 | ntlm_session_key, | ||
| 3651 | pSesInfo->password); | ||
| 3652 | } | 3668 | } |
| 3669 | |||
| 3670 | } else { | ||
| 3671 | SMBNTencrypt(pSesInfo->password, | ||
| 3672 | server->cryptKey, | ||
| 3673 | ntlm_session_key); | ||
| 3674 | |||
| 3675 | if (first_time) | ||
| 3676 | cifs_calculate_mac_key( | ||
| 3677 | &server->mac_signing_key, | ||
| 3678 | ntlm_session_key, | ||
| 3679 | pSesInfo->password); | ||
| 3680 | } | ||
| 3653 | /* for better security the weaker lanman hash not sent | 3681 | /* for better security the weaker lanman hash not sent |
| 3654 | in AuthSessSetup so we no longer calculate it */ | 3682 | in AuthSessSetup so we no longer calculate it */ |
| 3655 | 3683 | ||
| 3656 | rc = CIFSNTLMSSPAuthSessSetup(xid, | 3684 | rc = CIFSNTLMSSPAuthSessSetup(xid, pSesInfo, |
| 3657 | pSesInfo, | 3685 | ntlm_session_key, |
| 3658 | ntlm_session_key, | 3686 | ntlmv2_flag, |
| 3659 | ntlmv2_flag, | 3687 | nls_info); |
| 3660 | nls_info); | 3688 | } |
| 3661 | } | 3689 | } else { /* old style NTLM 0.12 session setup */ |
| 3662 | } else { /* old style NTLM 0.12 session setup */ | 3690 | SMBNTencrypt(pSesInfo->password, server->cryptKey, |
| 3663 | SMBNTencrypt(pSesInfo->password, | 3691 | ntlm_session_key); |
| 3664 | pSesInfo->server->cryptKey, | ||
| 3665 | ntlm_session_key); | ||
| 3666 | 3692 | ||
| 3667 | if (first_time) | 3693 | if (first_time) |
| 3668 | cifs_calculate_mac_key( | 3694 | cifs_calculate_mac_key(&server->mac_signing_key, |
| 3669 | &pSesInfo->server->mac_signing_key, | 3695 | ntlm_session_key, |
| 3670 | ntlm_session_key, pSesInfo->password); | 3696 | pSesInfo->password); |
| 3671 | 3697 | ||
| 3672 | rc = CIFSSessSetup(xid, pSesInfo, | 3698 | rc = CIFSSessSetup(xid, pSesInfo, ntlm_session_key, nls_info); |
| 3673 | ntlm_session_key, nls_info); | 3699 | } |
| 3674 | } | 3700 | if (rc) { |
| 3675 | if (rc) { | 3701 | cERROR(1, ("Send error in SessSetup = %d", rc)); |
| 3676 | cERROR(1, ("Send error in SessSetup = %d", rc)); | 3702 | } else { |
| 3677 | } else { | 3703 | cFYI(1, ("CIFS Session Established successfully")); |
| 3678 | cFYI(1, ("CIFS Session Established successfully")); | ||
| 3679 | pSesInfo->status = CifsGood; | 3704 | pSesInfo->status = CifsGood; |
| 3680 | } | ||
| 3681 | } | 3705 | } |
| 3706 | |||
| 3682 | ss_err_exit: | 3707 | ss_err_exit: |
| 3683 | return rc; | 3708 | return rc; |
| 3684 | } | 3709 | } |
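Two of the connect.c changes above stand apart from the cifs_setup_session() restructuring. First, the tcpSesAllocCount bookkeeping no longer takes GlobalSMBSeslock just to read the counter coherently, because atomic_inc_return()/atomic_dec_return() hand back the new value directly; side by side (both fragments are taken from the hunks above, nothing new is added here):

        /* before: a write lock held only to get a stable read */
        write_lock(&GlobalSMBSeslock);
        atomic_inc(&tcpSesAllocCount);
        length = tcpSesAllocCount.counter;
        write_unlock(&GlobalSMBSeslock);

        /* after: the increment and the read are one atomic operation */
        length = atomic_inc_return(&tcpSesAllocCount);

Second, cifs_reclassify_socket4()/cifs_reclassify_socket6() give the CIFS TCP sockets their own lockdep classes ("slock-AF_INET-CIFS" and so on), so lockdep tracks them separately from ordinary AF_INET/AF_INET6 socket locks; the #else stubs make the calls disappear entirely when CONFIG_DEBUG_LOCK_ALLOC is not set.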
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index fb69c1fa85c9..e962e75e6f7b 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c | |||
| @@ -226,23 +226,28 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, | |||
| 226 | /* If Open reported that we actually created a file | 226 | /* If Open reported that we actually created a file |
| 227 | then we now have to set the mode if possible */ | 227 | then we now have to set the mode if possible */ |
| 228 | if ((pTcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) { | 228 | if ((pTcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) { |
| 229 | struct cifs_unix_set_info_args args = { | ||
| 230 | .mode = mode, | ||
| 231 | .ctime = NO_CHANGE_64, | ||
| 232 | .atime = NO_CHANGE_64, | ||
| 233 | .mtime = NO_CHANGE_64, | ||
| 234 | .device = 0, | ||
| 235 | }; | ||
| 236 | |||
| 229 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { | 237 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { |
| 230 | CIFSSMBUnixSetPerms(xid, pTcon, full_path, mode, | 238 | args.uid = (__u64) current->fsuid; |
| 231 | (__u64)current->fsuid, | 239 | if (inode->i_mode & S_ISGID) |
| 232 | (__u64)current->fsgid, | 240 | args.gid = (__u64) inode->i_gid; |
| 233 | 0 /* dev */, | 241 | else |
| 234 | cifs_sb->local_nls, | 242 | args.gid = (__u64) current->fsgid; |
| 235 | cifs_sb->mnt_cifs_flags & | ||
| 236 | CIFS_MOUNT_MAP_SPECIAL_CHR); | ||
| 237 | } else { | 243 | } else { |
| 238 | CIFSSMBUnixSetPerms(xid, pTcon, full_path, mode, | 244 | args.uid = NO_CHANGE_64; |
| 239 | (__u64)-1, | 245 | args.gid = NO_CHANGE_64; |
| 240 | (__u64)-1, | ||
| 241 | 0 /* dev */, | ||
| 242 | cifs_sb->local_nls, | ||
| 243 | cifs_sb->mnt_cifs_flags & | ||
| 244 | CIFS_MOUNT_MAP_SPECIAL_CHR); | ||
| 245 | } | 246 | } |
| 247 | CIFSSMBUnixSetInfo(xid, pTcon, full_path, &args, | ||
| 248 | cifs_sb->local_nls, | ||
| 249 | cifs_sb->mnt_cifs_flags & | ||
| 250 | CIFS_MOUNT_MAP_SPECIAL_CHR); | ||
| 246 | } else { | 251 | } else { |
| 247 | /* BB implement mode setting via Windows security | 252 | /* BB implement mode setting via Windows security |
| 248 | descriptors e.g. */ | 253 | descriptors e.g. */ |
| @@ -267,7 +272,12 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, | |||
| 267 | (cifs_sb->mnt_cifs_flags & | 272 | (cifs_sb->mnt_cifs_flags & |
| 268 | CIFS_MOUNT_SET_UID)) { | 273 | CIFS_MOUNT_SET_UID)) { |
| 269 | newinode->i_uid = current->fsuid; | 274 | newinode->i_uid = current->fsuid; |
| 270 | newinode->i_gid = current->fsgid; | 275 | if (inode->i_mode & S_ISGID) |
| 276 | newinode->i_gid = | ||
| 277 | inode->i_gid; | ||
| 278 | else | ||
| 279 | newinode->i_gid = | ||
| 280 | current->fsgid; | ||
| 271 | } | 281 | } |
| 272 | } | 282 | } |
| 273 | } | 283 | } |
| @@ -357,21 +367,24 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, | |||
| 357 | if (full_path == NULL) | 367 | if (full_path == NULL) |
| 358 | rc = -ENOMEM; | 368 | rc = -ENOMEM; |
| 359 | else if (pTcon->unix_ext) { | 369 | else if (pTcon->unix_ext) { |
| 360 | mode &= ~current->fs->umask; | 370 | struct cifs_unix_set_info_args args = { |
| 371 | .mode = mode & ~current->fs->umask, | ||
| 372 | .ctime = NO_CHANGE_64, | ||
| 373 | .atime = NO_CHANGE_64, | ||
| 374 | .mtime = NO_CHANGE_64, | ||
| 375 | .device = device_number, | ||
| 376 | }; | ||
| 361 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { | 377 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { |
| 362 | rc = CIFSSMBUnixSetPerms(xid, pTcon, full_path, | 378 | args.uid = (__u64) current->fsuid; |
| 363 | mode, (__u64)current->fsuid, | 379 | args.gid = (__u64) current->fsgid; |
| 364 | (__u64)current->fsgid, | ||
| 365 | device_number, cifs_sb->local_nls, | ||
| 366 | cifs_sb->mnt_cifs_flags & | ||
| 367 | CIFS_MOUNT_MAP_SPECIAL_CHR); | ||
| 368 | } else { | 380 | } else { |
| 369 | rc = CIFSSMBUnixSetPerms(xid, pTcon, | 381 | args.uid = NO_CHANGE_64; |
| 370 | full_path, mode, (__u64)-1, (__u64)-1, | 382 | args.gid = NO_CHANGE_64; |
| 371 | device_number, cifs_sb->local_nls, | ||
| 372 | cifs_sb->mnt_cifs_flags & | ||
| 373 | CIFS_MOUNT_MAP_SPECIAL_CHR); | ||
| 374 | } | 383 | } |
| 384 | rc = CIFSSMBUnixSetInfo(xid, pTcon, full_path, | ||
| 385 | &args, cifs_sb->local_nls, | ||
| 386 | cifs_sb->mnt_cifs_flags & | ||
| 387 | CIFS_MOUNT_MAP_SPECIAL_CHR); | ||
| 375 | 388 | ||
| 376 | if (!rc) { | 389 | if (!rc) { |
| 377 | rc = cifs_get_inode_info_unix(&newinode, full_path, | 390 | rc = cifs_get_inode_info_unix(&newinode, full_path, |
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c index f730ef35499e..a2e0673e1b08 100644 --- a/fs/cifs/dns_resolve.c +++ b/fs/cifs/dns_resolve.c | |||
| @@ -47,11 +47,18 @@ static int dns_resolver_instantiate(struct key *key, const void *data, | |||
| 47 | return rc; | 47 | return rc; |
| 48 | } | 48 | } |
| 49 | 49 | ||
| 50 | static void | ||
| 51 | dns_resolver_destroy(struct key *key) | ||
| 52 | { | ||
| 53 | kfree(key->payload.data); | ||
| 54 | } | ||
| 55 | |||
| 50 | struct key_type key_type_dns_resolver = { | 56 | struct key_type key_type_dns_resolver = { |
| 51 | .name = "dns_resolver", | 57 | .name = "dns_resolver", |
| 52 | .def_datalen = sizeof(struct in_addr), | 58 | .def_datalen = sizeof(struct in_addr), |
| 53 | .describe = user_describe, | 59 | .describe = user_describe, |
| 54 | .instantiate = dns_resolver_instantiate, | 60 | .instantiate = dns_resolver_instantiate, |
| 61 | .destroy = dns_resolver_destroy, | ||
| 55 | .match = user_match, | 62 | .match = user_match, |
| 56 | }; | 63 | }; |
| 57 | 64 | ||
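The dns_resolve.c hunk above is a leak fix: the payload that dns_resolver_instantiate() attaches to the key was never released when the key itself went away, and the new .destroy callback supplies the missing half of that pairing. A hypothetical consumer sketch (the hostname variable and the cFYI message are illustrative, not from the patch) of why the payload must stay allocated until key teardown:

        struct key *rkey;

        rkey = request_key(&key_type_dns_resolver, hostname, NULL);
        if (!IS_ERR(rkey)) {
                /* set by .instantiate; valid while we hold a reference */
                char *addr = rkey->payload.data;
                cFYI(1, ("resolved %s to %s", hostname, addr));
                key_put(rkey);  /* when the key is eventually garbage
                                   collected, .destroy kfree()s the payload */
        }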
diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 0aac824371a5..cbefe1f1f9fe 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c | |||
| @@ -310,18 +310,19 @@ int cifs_open(struct inode *inode, struct file *file) | |||
| 310 | /* time to set mode which we can not set earlier due to | 310 | /* time to set mode which we can not set earlier due to |
| 311 | problems creating new read-only files */ | 311 | problems creating new read-only files */ |
| 312 | if (pTcon->unix_ext) { | 312 | if (pTcon->unix_ext) { |
| 313 | CIFSSMBUnixSetPerms(xid, pTcon, full_path, | 313 | struct cifs_unix_set_info_args args = { |
| 314 | inode->i_mode, | 314 | .mode = inode->i_mode, |
| 315 | (__u64)-1, (__u64)-1, 0 /* dev */, | 315 | .uid = NO_CHANGE_64, |
| 316 | .gid = NO_CHANGE_64, | ||
| 317 | .ctime = NO_CHANGE_64, | ||
| 318 | .atime = NO_CHANGE_64, | ||
| 319 | .mtime = NO_CHANGE_64, | ||
| 320 | .device = 0, | ||
| 321 | }; | ||
| 322 | CIFSSMBUnixSetInfo(xid, pTcon, full_path, &args, | ||
| 316 | cifs_sb->local_nls, | 323 | cifs_sb->local_nls, |
| 317 | cifs_sb->mnt_cifs_flags & | 324 | cifs_sb->mnt_cifs_flags & |
| 318 | CIFS_MOUNT_MAP_SPECIAL_CHR); | 325 | CIFS_MOUNT_MAP_SPECIAL_CHR); |
| 319 | } else { | ||
| 320 | /* BB implement via Windows security descriptors eg | ||
| 321 | CIFSSMBWinSetPerms(xid, pTcon, full_path, mode, | ||
| 322 | -1, -1, local_nls); | ||
| 323 | in the meantime could set r/o dos attribute when | ||
| 324 | perms are eg: mode & 0222 == 0 */ | ||
| 325 | } | 326 | } |
| 326 | } | 327 | } |
| 327 | 328 | ||
| @@ -832,6 +833,10 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, | |||
| 832 | return -EBADF; | 833 | return -EBADF; |
| 833 | open_file = (struct cifsFileInfo *) file->private_data; | 834 | open_file = (struct cifsFileInfo *) file->private_data; |
| 834 | 835 | ||
| 836 | rc = generic_write_checks(file, poffset, &write_size, 0); | ||
| 837 | if (rc) | ||
| 838 | return rc; | ||
| 839 | |||
| 835 | xid = GetXid(); | 840 | xid = GetXid(); |
| 836 | 841 | ||
| 837 | if (*poffset > file->f_path.dentry->d_inode->i_size) | 842 | if (*poffset > file->f_path.dentry->d_inode->i_size) |
| @@ -1280,7 +1285,7 @@ retry: | |||
| 1280 | 1285 | ||
| 1281 | if (first < 0) | 1286 | if (first < 0) |
| 1282 | lock_page(page); | 1287 | lock_page(page); |
| 1283 | else if (TestSetPageLocked(page)) | 1288 | else if (!trylock_page(page)) |
| 1284 | break; | 1289 | break; |
| 1285 | 1290 | ||
| 1286 | if (unlikely(page->mapping != mapping)) { | 1291 | if (unlikely(page->mapping != mapping)) { |
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 2e904bd111c8..9c548f110102 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c | |||
| @@ -546,7 +546,8 @@ int cifs_get_inode_info(struct inode **pinode, | |||
| 546 | if ((inode->i_mode & S_IWUGO) == 0 && | 546 | if ((inode->i_mode & S_IWUGO) == 0 && |
| 547 | (attr & ATTR_READONLY) == 0) | 547 | (attr & ATTR_READONLY) == 0) |
| 548 | inode->i_mode |= (S_IWUGO & default_mode); | 548 | inode->i_mode |= (S_IWUGO & default_mode); |
| 549 | inode->i_mode &= ~S_IFMT; | 549 | |
| 550 | inode->i_mode &= ~S_IFMT; | ||
| 550 | } | 551 | } |
| 551 | /* clear write bits if ATTR_READONLY is set */ | 552 | /* clear write bits if ATTR_READONLY is set */ |
| 552 | if (attr & ATTR_READONLY) | 553 | if (attr & ATTR_READONLY) |
| @@ -649,6 +650,7 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino) | |||
| 649 | inode->i_fop = &simple_dir_operations; | 650 | inode->i_fop = &simple_dir_operations; |
| 650 | inode->i_uid = cifs_sb->mnt_uid; | 651 | inode->i_uid = cifs_sb->mnt_uid; |
| 651 | inode->i_gid = cifs_sb->mnt_gid; | 652 | inode->i_gid = cifs_sb->mnt_gid; |
| 653 | } else if (rc) { | ||
| 652 | _FreeXid(xid); | 654 | _FreeXid(xid); |
| 653 | iget_failed(inode); | 655 | iget_failed(inode); |
| 654 | return ERR_PTR(rc); | 656 | return ERR_PTR(rc); |
| @@ -737,7 +739,7 @@ psx_del_no_retry: | |||
| 737 | /* ATTRS set to normal clears r/o bit */ | 739 | /* ATTRS set to normal clears r/o bit */ |
| 738 | pinfo_buf->Attributes = cpu_to_le32(ATTR_NORMAL); | 740 | pinfo_buf->Attributes = cpu_to_le32(ATTR_NORMAL); |
| 739 | if (!(pTcon->ses->flags & CIFS_SES_NT4)) | 741 | if (!(pTcon->ses->flags & CIFS_SES_NT4)) |
| 740 | rc = CIFSSMBSetTimes(xid, pTcon, full_path, | 742 | rc = CIFSSMBSetPathInfo(xid, pTcon, full_path, |
| 741 | pinfo_buf, | 743 | pinfo_buf, |
| 742 | cifs_sb->local_nls, | 744 | cifs_sb->local_nls, |
| 743 | cifs_sb->mnt_cifs_flags & | 745 | cifs_sb->mnt_cifs_flags & |
| @@ -767,9 +769,10 @@ psx_del_no_retry: | |||
| 767 | cifs_sb->mnt_cifs_flags & | 769 | cifs_sb->mnt_cifs_flags & |
| 768 | CIFS_MOUNT_MAP_SPECIAL_CHR); | 770 | CIFS_MOUNT_MAP_SPECIAL_CHR); |
| 769 | if (rc == 0) { | 771 | if (rc == 0) { |
| 770 | rc = CIFSSMBSetFileTimes(xid, pTcon, | 772 | rc = CIFSSMBSetFileInfo(xid, pTcon, |
| 771 | pinfo_buf, | 773 | pinfo_buf, |
| 772 | netfid); | 774 | netfid, |
| 775 | current->tgid); | ||
| 773 | CIFSSMBClose(xid, pTcon, netfid); | 776 | CIFSSMBClose(xid, pTcon, netfid); |
| 774 | } | 777 | } |
| 775 | } | 778 | } |
| @@ -984,32 +987,41 @@ mkdir_get_info: | |||
| 984 | * failed to get it from the server or was set bogus */ | 987 | * failed to get it from the server or was set bogus */ |
| 985 | if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2)) | 988 | if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2)) |
| 986 | direntry->d_inode->i_nlink = 2; | 989 | direntry->d_inode->i_nlink = 2; |
| 990 | |||
| 987 | mode &= ~current->fs->umask; | 991 | mode &= ~current->fs->umask; |
| 992 | /* must turn on setgid bit if parent dir has it */ | ||
| 993 | if (inode->i_mode & S_ISGID) | ||
| 994 | mode |= S_ISGID; | ||
| 995 | |||
| 988 | if (pTcon->unix_ext) { | 996 | if (pTcon->unix_ext) { |
| 997 | struct cifs_unix_set_info_args args = { | ||
| 998 | .mode = mode, | ||
| 999 | .ctime = NO_CHANGE_64, | ||
| 1000 | .atime = NO_CHANGE_64, | ||
| 1001 | .mtime = NO_CHANGE_64, | ||
| 1002 | .device = 0, | ||
| 1003 | }; | ||
| 989 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { | 1004 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { |
| 990 | CIFSSMBUnixSetPerms(xid, pTcon, full_path, | 1005 | args.uid = (__u64)current->fsuid; |
| 991 | mode, | 1006 | if (inode->i_mode & S_ISGID) |
| 992 | (__u64)current->fsuid, | 1007 | args.gid = (__u64)inode->i_gid; |
| 993 | (__u64)current->fsgid, | 1008 | else |
| 994 | 0 /* dev_t */, | 1009 | args.gid = (__u64)current->fsgid; |
| 995 | cifs_sb->local_nls, | ||
| 996 | cifs_sb->mnt_cifs_flags & | ||
| 997 | CIFS_MOUNT_MAP_SPECIAL_CHR); | ||
| 998 | } else { | 1010 | } else { |
| 999 | CIFSSMBUnixSetPerms(xid, pTcon, full_path, | 1011 | args.uid = NO_CHANGE_64; |
| 1000 | mode, (__u64)-1, | 1012 | args.gid = NO_CHANGE_64; |
| 1001 | (__u64)-1, 0 /* dev_t */, | ||
| 1002 | cifs_sb->local_nls, | ||
| 1003 | cifs_sb->mnt_cifs_flags & | ||
| 1004 | CIFS_MOUNT_MAP_SPECIAL_CHR); | ||
| 1005 | } | 1013 | } |
| 1014 | CIFSSMBUnixSetInfo(xid, pTcon, full_path, &args, | ||
| 1015 | cifs_sb->local_nls, | ||
| 1016 | cifs_sb->mnt_cifs_flags & | ||
| 1017 | CIFS_MOUNT_MAP_SPECIAL_CHR); | ||
| 1006 | } else { | 1018 | } else { |
| 1007 | if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) && | 1019 | if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) && |
| 1008 | (mode & S_IWUGO) == 0) { | 1020 | (mode & S_IWUGO) == 0) { |
| 1009 | FILE_BASIC_INFO pInfo; | 1021 | FILE_BASIC_INFO pInfo; |
| 1010 | memset(&pInfo, 0, sizeof(pInfo)); | 1022 | memset(&pInfo, 0, sizeof(pInfo)); |
| 1011 | pInfo.Attributes = cpu_to_le32(ATTR_READONLY); | 1023 | pInfo.Attributes = cpu_to_le32(ATTR_READONLY); |
| 1012 | CIFSSMBSetTimes(xid, pTcon, full_path, | 1024 | CIFSSMBSetPathInfo(xid, pTcon, full_path, |
| 1013 | &pInfo, cifs_sb->local_nls, | 1025 | &pInfo, cifs_sb->local_nls, |
| 1014 | cifs_sb->mnt_cifs_flags & | 1026 | cifs_sb->mnt_cifs_flags & |
| 1015 | CIFS_MOUNT_MAP_SPECIAL_CHR); | 1027 | CIFS_MOUNT_MAP_SPECIAL_CHR); |
| @@ -1024,8 +1036,12 @@ mkdir_get_info: | |||
| 1024 | CIFS_MOUNT_SET_UID) { | 1036 | CIFS_MOUNT_SET_UID) { |
| 1025 | direntry->d_inode->i_uid = | 1037 | direntry->d_inode->i_uid = |
| 1026 | current->fsuid; | 1038 | current->fsuid; |
| 1027 | direntry->d_inode->i_gid = | 1039 | if (inode->i_mode & S_ISGID) |
| 1028 | current->fsgid; | 1040 | direntry->d_inode->i_gid = |
| 1041 | inode->i_gid; | ||
| 1042 | else | ||
| 1043 | direntry->d_inode->i_gid = | ||
| 1044 | current->fsgid; | ||
| 1029 | } | 1045 | } |
| 1030 | } | 1046 | } |
| 1031 | } | 1047 | } |
| @@ -1310,10 +1326,11 @@ int cifs_revalidate(struct dentry *direntry) | |||
| 1310 | /* if (S_ISDIR(direntry->d_inode->i_mode)) | 1326 | /* if (S_ISDIR(direntry->d_inode->i_mode)) |
| 1311 | shrink_dcache_parent(direntry); */ | 1327 | shrink_dcache_parent(direntry); */ |
| 1312 | if (S_ISREG(direntry->d_inode->i_mode)) { | 1328 | if (S_ISREG(direntry->d_inode->i_mode)) { |
| 1313 | if (direntry->d_inode->i_mapping) | 1329 | if (direntry->d_inode->i_mapping) { |
| 1314 | wbrc = filemap_fdatawait(direntry->d_inode->i_mapping); | 1330 | wbrc = filemap_fdatawait(direntry->d_inode->i_mapping); |
| 1315 | if (wbrc) | 1331 | if (wbrc) |
| 1316 | CIFS_I(direntry->d_inode)->write_behind_rc = wbrc; | 1332 | CIFS_I(direntry->d_inode)->write_behind_rc = wbrc; |
| 1333 | } | ||
| 1317 | /* may eventually have to do this for open files too */ | 1334 | /* may eventually have to do this for open files too */ |
| 1318 | if (list_empty(&(cifsInode->openFileList))) { | 1335 | if (list_empty(&(cifsInode->openFileList))) { |
| 1319 | /* changed on server - flush read ahead pages */ | 1336 | /* changed on server - flush read ahead pages */ |
| @@ -1413,31 +1430,304 @@ out_busy: | |||
| 1413 | return -ETXTBSY; | 1430 | return -ETXTBSY; |
| 1414 | } | 1431 | } |
| 1415 | 1432 | ||
| 1416 | int cifs_setattr(struct dentry *direntry, struct iattr *attrs) | 1433 | static int |
| 1434 | cifs_set_file_size(struct inode *inode, struct iattr *attrs, | ||
| 1435 | int xid, char *full_path) | ||
| 1417 | { | 1436 | { |
| 1437 | int rc; | ||
| 1438 | struct cifsFileInfo *open_file; | ||
| 1439 | struct cifsInodeInfo *cifsInode = CIFS_I(inode); | ||
| 1440 | struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); | ||
| 1441 | struct cifsTconInfo *pTcon = cifs_sb->tcon; | ||
| 1442 | |||
| 1443 | /* | ||
| 1444 | * To avoid spurious oplock breaks from server, in the case of | ||
| 1445 | * inodes that we already have open, avoid doing path based | ||
| 1446 | * setting of file size if we can do it by handle. | ||
| 1447 | * This keeps our caching token (oplock) and avoids timeouts | ||
| 1448 | * when the local oplock break takes longer to flush | ||
| 1449 | * writebehind data than the SMB timeout for the SetPathInfo | ||
| 1450 | * request would allow | ||
| 1451 | */ | ||
| 1452 | open_file = find_writable_file(cifsInode); | ||
| 1453 | if (open_file) { | ||
| 1454 | __u16 nfid = open_file->netfid; | ||
| 1455 | __u32 npid = open_file->pid; | ||
| 1456 | rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid, | ||
| 1457 | npid, false); | ||
| 1458 | atomic_dec(&open_file->wrtPending); | ||
| 1459 | cFYI(1, ("SetFSize for attrs rc = %d", rc)); | ||
| 1460 | if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { | ||
| 1461 | unsigned int bytes_written; | ||
| 1462 | rc = CIFSSMBWrite(xid, pTcon, nfid, 0, attrs->ia_size, | ||
| 1463 | &bytes_written, NULL, NULL, 1); | ||
| 1464 | cFYI(1, ("Wrt seteof rc %d", rc)); | ||
| 1465 | } | ||
| 1466 | } else | ||
| 1467 | rc = -EINVAL; | ||
| 1468 | |||
| 1469 | if (rc != 0) { | ||
| 1470 | /* Set file size by pathname rather than by handle | ||
| 1471 | either because no valid, writeable file handle for | ||
| 1472 | it was found or because there was an error setting | ||
| 1473 | it by handle */ | ||
| 1474 | rc = CIFSSMBSetEOF(xid, pTcon, full_path, attrs->ia_size, | ||
| 1475 | false, cifs_sb->local_nls, | ||
| 1476 | cifs_sb->mnt_cifs_flags & | ||
| 1477 | CIFS_MOUNT_MAP_SPECIAL_CHR); | ||
| 1478 | cFYI(1, ("SetEOF by path (setattrs) rc = %d", rc)); | ||
| 1479 | if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { | ||
| 1480 | __u16 netfid; | ||
| 1481 | int oplock = 0; | ||
| 1482 | |||
| 1483 | rc = SMBLegacyOpen(xid, pTcon, full_path, | ||
| 1484 | FILE_OPEN, GENERIC_WRITE, | ||
| 1485 | CREATE_NOT_DIR, &netfid, &oplock, NULL, | ||
| 1486 | cifs_sb->local_nls, | ||
| 1487 | cifs_sb->mnt_cifs_flags & | ||
| 1488 | CIFS_MOUNT_MAP_SPECIAL_CHR); | ||
| 1489 | if (rc == 0) { | ||
| 1490 | unsigned int bytes_written; | ||
| 1491 | rc = CIFSSMBWrite(xid, pTcon, netfid, 0, | ||
| 1492 | attrs->ia_size, | ||
| 1493 | &bytes_written, NULL, | ||
| 1494 | NULL, 1); | ||
| 1495 | cFYI(1, ("wrt seteof rc %d", rc)); | ||
| 1496 | CIFSSMBClose(xid, pTcon, netfid); | ||
| 1497 | } | ||
| 1498 | } | ||
| 1499 | } | ||
| 1500 | |||
| 1501 | if (rc == 0) { | ||
| 1502 | rc = cifs_vmtruncate(inode, attrs->ia_size); | ||
| 1503 | cifs_truncate_page(inode->i_mapping, inode->i_size); | ||
| 1504 | } | ||
| 1505 | |||
| 1506 | return rc; | ||
| 1507 | } | ||
| 1508 | |||
| 1509 | static int | ||
| 1510 | cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid, | ||
| 1511 | char *full_path, __u32 dosattr) | ||
| 1512 | { | ||
| 1513 | int rc; | ||
| 1514 | int oplock = 0; | ||
| 1515 | __u16 netfid; | ||
| 1516 | __u32 netpid; | ||
| 1517 | bool set_time = false; | ||
| 1518 | struct cifsFileInfo *open_file; | ||
| 1519 | struct cifsInodeInfo *cifsInode = CIFS_I(inode); | ||
| 1520 | struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); | ||
| 1521 | struct cifsTconInfo *pTcon = cifs_sb->tcon; | ||
| 1522 | FILE_BASIC_INFO info_buf; | ||
| 1523 | |||
| 1524 | if (attrs->ia_valid & ATTR_ATIME) { | ||
| 1525 | set_time = true; | ||
| 1526 | info_buf.LastAccessTime = | ||
| 1527 | cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_atime)); | ||
| 1528 | } else | ||
| 1529 | info_buf.LastAccessTime = 0; | ||
| 1530 | |||
| 1531 | if (attrs->ia_valid & ATTR_MTIME) { | ||
| 1532 | set_time = true; | ||
| 1533 | info_buf.LastWriteTime = | ||
| 1534 | cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_mtime)); | ||
| 1535 | } else | ||
| 1536 | info_buf.LastWriteTime = 0; | ||
| 1537 | |||
| 1538 | /* | ||
| 1539 | * Samba throws this field away, but windows may actually use it. | ||
| 1540 | * Do not set ctime unless other time stamps are changed explicitly | ||
| 1541 | * (i.e. by utimes()) since we would then have a mix of client and | ||
| 1542 | * server times. | ||
| 1543 | */ | ||
| 1544 | if (set_time && (attrs->ia_valid & ATTR_CTIME)) { | ||
| 1545 | cFYI(1, ("CIFS - CTIME changed")); | ||
| 1546 | info_buf.ChangeTime = | ||
| 1547 | cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime)); | ||
| 1548 | } else | ||
| 1549 | info_buf.ChangeTime = 0; | ||
| 1550 | |||
| 1551 | info_buf.CreationTime = 0; /* don't change */ | ||
| 1552 | info_buf.Attributes = cpu_to_le32(dosattr); | ||
| 1553 | |||
| 1554 | /* | ||
| 1555 | * If the file is already open for write, just use that fileid | ||
| 1556 | */ | ||
| 1557 | open_file = find_writable_file(cifsInode); | ||
| 1558 | if (open_file) { | ||
| 1559 | netfid = open_file->netfid; | ||
| 1560 | netpid = open_file->pid; | ||
| 1561 | goto set_via_filehandle; | ||
| 1562 | } | ||
| 1563 | |||
| 1564 | /* | ||
| 1565 | * NT4 apparently returns success on this call, but it doesn't | ||
| 1566 | * really work. | ||
| 1567 | */ | ||
| 1568 | if (!(pTcon->ses->flags & CIFS_SES_NT4)) { | ||
| 1569 | rc = CIFSSMBSetPathInfo(xid, pTcon, full_path, | ||
| 1570 | &info_buf, cifs_sb->local_nls, | ||
| 1571 | cifs_sb->mnt_cifs_flags & | ||
| 1572 | CIFS_MOUNT_MAP_SPECIAL_CHR); | ||
| 1573 | if (rc != -EOPNOTSUPP && rc != -EINVAL) | ||
| 1574 | goto out; | ||
| 1575 | } | ||
| 1576 | |||
| 1577 | cFYI(1, ("calling SetFileInfo since SetPathInfo for " | ||
| 1578 | "times not supported by this server")); | ||
| 1579 | rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, | ||
| 1580 | SYNCHRONIZE | FILE_WRITE_ATTRIBUTES, | ||
| 1581 | CREATE_NOT_DIR, &netfid, &oplock, | ||
| 1582 | NULL, cifs_sb->local_nls, | ||
| 1583 | cifs_sb->mnt_cifs_flags & | ||
| 1584 | CIFS_MOUNT_MAP_SPECIAL_CHR); | ||
| 1585 | |||
| 1586 | if (rc != 0) { | ||
| 1587 | if (rc == -EIO) | ||
| 1588 | rc = -EINVAL; | ||
| 1589 | goto out; | ||
| 1590 | } | ||
| 1591 | |||
| 1592 | netpid = current->tgid; | ||
| 1593 | |||
| 1594 | set_via_filehandle: | ||
| 1595 | rc = CIFSSMBSetFileInfo(xid, pTcon, &info_buf, netfid, netpid); | ||
| 1596 | if (open_file == NULL) | ||
| 1597 | CIFSSMBClose(xid, pTcon, netfid); | ||
| 1598 | else | ||
| 1599 | atomic_dec(&open_file->wrtPending); | ||
| 1600 | out: | ||
| 1601 | return rc; | ||
| 1602 | } | ||
| 1603 | |||
| 1604 | static int | ||
| 1605 | cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) | ||
| 1606 | { | ||
| 1607 | int rc; | ||
| 1418 | int xid; | 1608 | int xid; |
| 1419 | struct cifs_sb_info *cifs_sb; | ||
| 1420 | struct cifsTconInfo *pTcon; | ||
| 1421 | char *full_path = NULL; | 1609 | char *full_path = NULL; |
| 1422 | int rc = -EACCES; | ||
| 1423 | struct cifsFileInfo *open_file = NULL; | ||
| 1424 | FILE_BASIC_INFO time_buf; | ||
| 1425 | bool set_time = false; | ||
| 1426 | bool set_dosattr = false; | ||
| 1427 | __u64 mode = 0xFFFFFFFFFFFFFFFFULL; | ||
| 1428 | __u64 uid = 0xFFFFFFFFFFFFFFFFULL; | ||
| 1429 | __u64 gid = 0xFFFFFFFFFFFFFFFFULL; | ||
| 1430 | struct cifsInodeInfo *cifsInode; | ||
| 1431 | struct inode *inode = direntry->d_inode; | 1610 | struct inode *inode = direntry->d_inode; |
| 1611 | struct cifsInodeInfo *cifsInode = CIFS_I(inode); | ||
| 1612 | struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); | ||
| 1613 | struct cifsTconInfo *pTcon = cifs_sb->tcon; | ||
| 1614 | struct cifs_unix_set_info_args *args = NULL; | ||
| 1615 | |||
| 1616 | cFYI(1, ("setattr_unix on file %s attrs->ia_valid=0x%x", | ||
| 1617 | direntry->d_name.name, attrs->ia_valid)); | ||
| 1618 | |||
| 1619 | xid = GetXid(); | ||
| 1620 | |||
| 1621 | if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) == 0) { | ||
| 1622 | /* check if we have permission to change attrs */ | ||
| 1623 | rc = inode_change_ok(inode, attrs); | ||
| 1624 | if (rc < 0) | ||
| 1625 | goto out; | ||
| 1626 | else | ||
| 1627 | rc = 0; | ||
| 1628 | } | ||
| 1629 | |||
| 1630 | full_path = build_path_from_dentry(direntry); | ||
| 1631 | if (full_path == NULL) { | ||
| 1632 | rc = -ENOMEM; | ||
| 1633 | goto out; | ||
| 1634 | } | ||
| 1635 | |||
| 1636 | if ((attrs->ia_valid & ATTR_MTIME) || (attrs->ia_valid & ATTR_SIZE)) { | ||
| 1637 | /* | ||
| 1638 | Flush data before changing file size or changing the last | ||
| 1639 | write time of the file on the server. If the | ||
| 1640 | flush returns error, store it to report later and continue. | ||
| 1641 | BB: This should be smarter. Why bother flushing pages that | ||
| 1642 | will be truncated anyway? Also, should we error out here if | ||
| 1643 | the flush returns error? | ||
| 1644 | */ | ||
| 1645 | rc = filemap_write_and_wait(inode->i_mapping); | ||
| 1646 | if (rc != 0) { | ||
| 1647 | cifsInode->write_behind_rc = rc; | ||
| 1648 | rc = 0; | ||
| 1649 | } | ||
| 1650 | } | ||
| 1651 | |||
| 1652 | if (attrs->ia_valid & ATTR_SIZE) { | ||
| 1653 | rc = cifs_set_file_size(inode, attrs, xid, full_path); | ||
| 1654 | if (rc != 0) | ||
| 1655 | goto out; | ||
| 1656 | } | ||
| 1657 | |||
| 1658 | /* skip mode change if it's just for clearing setuid/setgid */ | ||
| 1659 | if (attrs->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) | ||
| 1660 | attrs->ia_valid &= ~ATTR_MODE; | ||
| 1661 | |||
| 1662 | args = kmalloc(sizeof(*args), GFP_KERNEL); | ||
| 1663 | if (args == NULL) { | ||
| 1664 | rc = -ENOMEM; | ||
| 1665 | goto out; | ||
| 1666 | } | ||
| 1667 | |||
| 1668 | /* set up the struct */ | ||
| 1669 | if (attrs->ia_valid & ATTR_MODE) | ||
| 1670 | args->mode = attrs->ia_mode; | ||
| 1671 | else | ||
| 1672 | args->mode = NO_CHANGE_64; | ||
| 1673 | |||
| 1674 | if (attrs->ia_valid & ATTR_UID) | ||
| 1675 | args->uid = attrs->ia_uid; | ||
| 1676 | else | ||
| 1677 | args->uid = NO_CHANGE_64; | ||
| 1678 | |||
| 1679 | if (attrs->ia_valid & ATTR_GID) | ||
| 1680 | args->gid = attrs->ia_gid; | ||
| 1681 | else | ||
| 1682 | args->gid = NO_CHANGE_64; | ||
| 1683 | |||
| 1684 | if (attrs->ia_valid & ATTR_ATIME) | ||
| 1685 | args->atime = cifs_UnixTimeToNT(attrs->ia_atime); | ||
| 1686 | else | ||
| 1687 | args->atime = NO_CHANGE_64; | ||
| 1688 | |||
| 1689 | if (attrs->ia_valid & ATTR_MTIME) | ||
| 1690 | args->mtime = cifs_UnixTimeToNT(attrs->ia_mtime); | ||
| 1691 | else | ||
| 1692 | args->mtime = NO_CHANGE_64; | ||
| 1693 | |||
| 1694 | if (attrs->ia_valid & ATTR_CTIME) | ||
| 1695 | args->ctime = cifs_UnixTimeToNT(attrs->ia_ctime); | ||
| 1696 | else | ||
| 1697 | args->ctime = NO_CHANGE_64; | ||
| 1698 | |||
| 1699 | args->device = 0; | ||
| 1700 | rc = CIFSSMBUnixSetInfo(xid, pTcon, full_path, args, | ||
| 1701 | cifs_sb->local_nls, | ||
| 1702 | cifs_sb->mnt_cifs_flags & | ||
| 1703 | CIFS_MOUNT_MAP_SPECIAL_CHR); | ||
| 1704 | |||
| 1705 | if (!rc) | ||
| 1706 | rc = inode_setattr(inode, attrs); | ||
| 1707 | out: | ||
| 1708 | kfree(args); | ||
| 1709 | kfree(full_path); | ||
| 1710 | FreeXid(xid); | ||
| 1711 | return rc; | ||
| 1712 | } | ||
| 1713 | |||
| 1714 | static int | ||
| 1715 | cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) | ||
| 1716 | { | ||
| 1717 | int xid; | ||
| 1718 | struct inode *inode = direntry->d_inode; | ||
| 1719 | struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); | ||
| 1720 | struct cifsInodeInfo *cifsInode = CIFS_I(inode); | ||
| 1721 | char *full_path = NULL; | ||
| 1722 | int rc = -EACCES; | ||
| 1723 | __u32 dosattr = 0; | ||
| 1724 | __u64 mode = NO_CHANGE_64; | ||
| 1432 | 1725 | ||
| 1433 | xid = GetXid(); | 1726 | xid = GetXid(); |
| 1434 | 1727 | ||
| 1435 | cFYI(1, ("setattr on file %s attrs->iavalid 0x%x", | 1728 | cFYI(1, ("setattr on file %s attrs->iavalid 0x%x", |
| 1436 | direntry->d_name.name, attrs->ia_valid)); | 1729 | direntry->d_name.name, attrs->ia_valid)); |
| 1437 | 1730 | ||
| 1438 | cifs_sb = CIFS_SB(inode->i_sb); | ||
| 1439 | pTcon = cifs_sb->tcon; | ||
| 1440 | |||
| 1441 | if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) == 0) { | 1731 | if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) == 0) { |
| 1442 | /* check if we have permission to change attrs */ | 1732 | /* check if we have permission to change attrs */ |
| 1443 | rc = inode_change_ok(inode, attrs); | 1733 | rc = inode_change_ok(inode, attrs); |
| @@ -1453,7 +1743,6 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) | |||
| 1453 | FreeXid(xid); | 1743 | FreeXid(xid); |
| 1454 | return -ENOMEM; | 1744 | return -ENOMEM; |
| 1455 | } | 1745 | } |
| 1456 | cifsInode = CIFS_I(inode); | ||
| 1457 | 1746 | ||
| 1458 | if ((attrs->ia_valid & ATTR_MTIME) || (attrs->ia_valid & ATTR_SIZE)) { | 1747 | if ((attrs->ia_valid & ATTR_MTIME) || (attrs->ia_valid & ATTR_SIZE)) { |
| 1459 | /* | 1748 | /* |
| @@ -1472,78 +1761,8 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) | |||
| 1472 | } | 1761 | } |
| 1473 | 1762 | ||
| 1474 | if (attrs->ia_valid & ATTR_SIZE) { | 1763 | if (attrs->ia_valid & ATTR_SIZE) { |
| 1475 | /* To avoid spurious oplock breaks from server, in the case of | 1764 | rc = cifs_set_file_size(inode, attrs, xid, full_path); |
| 1476 | inodes that we already have open, avoid doing path based | 1765 | if (rc != 0) |
| 1477 | setting of file size if we can do it by handle. | ||
| 1478 | This keeps our caching token (oplock) and avoids timeouts | ||
| 1479 | when the local oplock break takes longer to flush | ||
| 1480 | writebehind data than the SMB timeout for the SetPathInfo | ||
| 1481 | request would allow */ | ||
| 1482 | |||
| 1483 | open_file = find_writable_file(cifsInode); | ||
| 1484 | if (open_file) { | ||
| 1485 | __u16 nfid = open_file->netfid; | ||
| 1486 | __u32 npid = open_file->pid; | ||
| 1487 | rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, | ||
| 1488 | nfid, npid, false); | ||
| 1489 | atomic_dec(&open_file->wrtPending); | ||
| 1490 | cFYI(1, ("SetFSize for attrs rc = %d", rc)); | ||
| 1491 | if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { | ||
| 1492 | unsigned int bytes_written; | ||
| 1493 | rc = CIFSSMBWrite(xid, pTcon, | ||
| 1494 | nfid, 0, attrs->ia_size, | ||
| 1495 | &bytes_written, NULL, NULL, | ||
| 1496 | 1 /* 45 seconds */); | ||
| 1497 | cFYI(1, ("Wrt seteof rc %d", rc)); | ||
| 1498 | } | ||
| 1499 | } else | ||
| 1500 | rc = -EINVAL; | ||
| 1501 | |||
| 1502 | if (rc != 0) { | ||
| 1503 | /* Set file size by pathname rather than by handle | ||
| 1504 | either because no valid, writeable file handle for | ||
| 1505 | it was found or because there was an error setting | ||
| 1506 | it by handle */ | ||
| 1507 | rc = CIFSSMBSetEOF(xid, pTcon, full_path, | ||
| 1508 | attrs->ia_size, false, | ||
| 1509 | cifs_sb->local_nls, | ||
| 1510 | cifs_sb->mnt_cifs_flags & | ||
| 1511 | CIFS_MOUNT_MAP_SPECIAL_CHR); | ||
| 1512 | cFYI(1, ("SetEOF by path (setattrs) rc = %d", rc)); | ||
| 1513 | if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { | ||
| 1514 | __u16 netfid; | ||
| 1515 | int oplock = 0; | ||
| 1516 | |||
| 1517 | rc = SMBLegacyOpen(xid, pTcon, full_path, | ||
| 1518 | FILE_OPEN, GENERIC_WRITE, | ||
| 1519 | CREATE_NOT_DIR, &netfid, &oplock, | ||
| 1520 | NULL, cifs_sb->local_nls, | ||
| 1521 | cifs_sb->mnt_cifs_flags & | ||
| 1522 | CIFS_MOUNT_MAP_SPECIAL_CHR); | ||
| 1523 | if (rc == 0) { | ||
| 1524 | unsigned int bytes_written; | ||
| 1525 | rc = CIFSSMBWrite(xid, pTcon, | ||
| 1526 | netfid, 0, | ||
| 1527 | attrs->ia_size, | ||
| 1528 | &bytes_written, NULL, | ||
| 1529 | NULL, 1 /* 45 sec */); | ||
| 1530 | cFYI(1, ("wrt seteof rc %d", rc)); | ||
| 1531 | CIFSSMBClose(xid, pTcon, netfid); | ||
| 1532 | } | ||
| 1533 | |||
| 1534 | } | ||
| 1535 | } | ||
| 1536 | |||
| 1537 | /* Server is ok setting allocation size implicitly - no need | ||
| 1538 | to call: | ||
| 1539 | CIFSSMBSetEOF(xid, pTcon, full_path, attrs->ia_size, true, | ||
| 1540 | cifs_sb->local_nls); | ||
| 1541 | */ | ||
| 1542 | |||
| 1543 | if (rc == 0) { | ||
| 1544 | rc = cifs_vmtruncate(inode, attrs->ia_size); | ||
| 1545 | cifs_truncate_page(inode->i_mapping, inode->i_size); | ||
| 1546 | } else | ||
| 1547 | goto cifs_setattr_exit; | 1766 | goto cifs_setattr_exit; |
| 1548 | } | 1767 | } |
| 1549 | 1768 | ||
| @@ -1554,21 +1773,8 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) | |||
| 1554 | * CIFSACL support + proper Windows to Unix idmapping, we may be | 1773 | * CIFSACL support + proper Windows to Unix idmapping, we may be |
| 1555 | * able to support this in the future. | 1774 | * able to support this in the future. |
| 1556 | */ | 1775 | */ |
| 1557 | if (!pTcon->unix_ext && | 1776 | if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) |
| 1558 | !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) { | ||
| 1559 | attrs->ia_valid &= ~(ATTR_UID | ATTR_GID); | 1777 | attrs->ia_valid &= ~(ATTR_UID | ATTR_GID); |
| 1560 | } else { | ||
| 1561 | if (attrs->ia_valid & ATTR_UID) { | ||
| 1562 | cFYI(1, ("UID changed to %d", attrs->ia_uid)); | ||
| 1563 | uid = attrs->ia_uid; | ||
| 1564 | } | ||
| 1565 | if (attrs->ia_valid & ATTR_GID) { | ||
| 1566 | cFYI(1, ("GID changed to %d", attrs->ia_gid)); | ||
| 1567 | gid = attrs->ia_gid; | ||
| 1568 | } | ||
| 1569 | } | ||
| 1570 | |||
| 1571 | time_buf.Attributes = 0; | ||
| 1572 | 1778 | ||
| 1573 | /* skip mode change if it's just for clearing setuid/setgid */ | 1779 | /* skip mode change if it's just for clearing setuid/setgid */ |
| 1574 | if (attrs->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) | 1780 | if (attrs->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) |
| @@ -1579,13 +1785,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) | |||
| 1579 | mode = attrs->ia_mode; | 1785 | mode = attrs->ia_mode; |
| 1580 | } | 1786 | } |
| 1581 | 1787 | ||
| 1582 | if ((pTcon->unix_ext) | 1788 | if (attrs->ia_valid & ATTR_MODE) { |
| 1583 | && (attrs->ia_valid & (ATTR_MODE | ATTR_GID | ATTR_UID))) | ||
| 1584 | rc = CIFSSMBUnixSetPerms(xid, pTcon, full_path, mode, uid, gid, | ||
| 1585 | 0 /* dev_t */, cifs_sb->local_nls, | ||
| 1586 | cifs_sb->mnt_cifs_flags & | ||
| 1587 | CIFS_MOUNT_MAP_SPECIAL_CHR); | ||
| 1588 | else if (attrs->ia_valid & ATTR_MODE) { | ||
| 1589 | rc = 0; | 1789 | rc = 0; |
| 1590 | #ifdef CONFIG_CIFS_EXPERIMENTAL | 1790 | #ifdef CONFIG_CIFS_EXPERIMENTAL |
| 1591 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) | 1791 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) |
| @@ -1594,24 +1794,19 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) | |||
| 1594 | #endif | 1794 | #endif |
| 1595 | if (((mode & S_IWUGO) == 0) && | 1795 | if (((mode & S_IWUGO) == 0) && |
| 1596 | (cifsInode->cifsAttrs & ATTR_READONLY) == 0) { | 1796 | (cifsInode->cifsAttrs & ATTR_READONLY) == 0) { |
| 1597 | set_dosattr = true; | 1797 | |
| 1598 | time_buf.Attributes = cpu_to_le32(cifsInode->cifsAttrs | | 1798 | dosattr = cifsInode->cifsAttrs | ATTR_READONLY; |
| 1599 | ATTR_READONLY); | 1799 | |
| 1600 | /* fix up mode if we're not using dynperm */ | 1800 | /* fix up mode if we're not using dynperm */ |
| 1601 | if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0) | 1801 | if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0) |
| 1602 | attrs->ia_mode = inode->i_mode & ~S_IWUGO; | 1802 | attrs->ia_mode = inode->i_mode & ~S_IWUGO; |
| 1603 | } else if ((mode & S_IWUGO) && | 1803 | } else if ((mode & S_IWUGO) && |
| 1604 | (cifsInode->cifsAttrs & ATTR_READONLY)) { | 1804 | (cifsInode->cifsAttrs & ATTR_READONLY)) { |
| 1605 | /* If file is readonly on server, we would | 1805 | |
| 1606 | not be able to write to it - so if any write | 1806 | dosattr = cifsInode->cifsAttrs & ~ATTR_READONLY; |
| 1607 | bit is enabled for user or group or other we | 1807 | /* Attributes of 0 are ignored */ |
| 1608 | need to at least try to remove r/o dos attr */ | 1808 | if (dosattr == 0) |
| 1609 | set_dosattr = true; | 1809 | dosattr |= ATTR_NORMAL; |
| 1610 | time_buf.Attributes = cpu_to_le32(cifsInode->cifsAttrs & | ||
| 1611 | (~ATTR_READONLY)); | ||
| 1612 | /* Windows ignores set to zero */ | ||
| 1613 | if (time_buf.Attributes == 0) | ||
| 1614 | time_buf.Attributes |= cpu_to_le32(ATTR_NORMAL); | ||
| 1615 | 1810 | ||
| 1616 | /* reset local inode permissions to normal */ | 1811 | /* reset local inode permissions to normal */ |
| 1617 | if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) { | 1812 | if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) { |
| @@ -1629,82 +1824,18 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) | |||
| 1629 | } | 1824 | } |
| 1630 | } | 1825 | } |
| 1631 | 1826 | ||
| 1632 | if (attrs->ia_valid & ATTR_ATIME) { | 1827 | if (attrs->ia_valid & (ATTR_MTIME|ATTR_ATIME|ATTR_CTIME) || |
| 1633 | set_time = true; | 1828 | ((attrs->ia_valid & ATTR_MODE) && dosattr)) { |
| 1634 | time_buf.LastAccessTime = | 1829 | rc = cifs_set_file_info(inode, attrs, xid, full_path, dosattr); |
| 1635 | cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_atime)); | 1830 | /* BB: check for rc = -EOPNOTSUPP and switch to legacy mode */ |
| 1636 | } else | ||
| 1637 | time_buf.LastAccessTime = 0; | ||
| 1638 | |||
| 1639 | if (attrs->ia_valid & ATTR_MTIME) { | ||
| 1640 | set_time = true; | ||
| 1641 | time_buf.LastWriteTime = | ||
| 1642 | cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_mtime)); | ||
| 1643 | } else | ||
| 1644 | time_buf.LastWriteTime = 0; | ||
| 1645 | /* Do not set ctime explicitly unless other time | ||
| 1646 | stamps are changed explicitly (i.e. by utime() | ||
| 1647 | since we would then have a mix of client and | ||
| 1648 | server times */ | ||
| 1649 | |||
| 1650 | if (set_time && (attrs->ia_valid & ATTR_CTIME)) { | ||
| 1651 | set_time = true; | ||
| 1652 | /* Although Samba throws this field away | ||
| 1653 | it may be useful to Windows - but we do | ||
| 1654 | not want to set ctime unless some other | ||
| 1655 | timestamp is changing */ | ||
| 1656 | cFYI(1, ("CIFS - CTIME changed")); | ||
| 1657 | time_buf.ChangeTime = | ||
| 1658 | cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime)); | ||
| 1659 | } else | ||
| 1660 | time_buf.ChangeTime = 0; | ||
| 1661 | |||
| 1662 | if (set_time || set_dosattr) { | ||
| 1663 | time_buf.CreationTime = 0; /* do not change */ | ||
| 1664 | /* In the future we should experiment - try setting timestamps | ||
| 1665 | via Handle (SetFileInfo) instead of by path */ | ||
| 1666 | if (!(pTcon->ses->flags & CIFS_SES_NT4)) | ||
| 1667 | rc = CIFSSMBSetTimes(xid, pTcon, full_path, &time_buf, | ||
| 1668 | cifs_sb->local_nls, | ||
| 1669 | cifs_sb->mnt_cifs_flags & | ||
| 1670 | CIFS_MOUNT_MAP_SPECIAL_CHR); | ||
| 1671 | else | ||
| 1672 | rc = -EOPNOTSUPP; | ||
| 1673 | 1831 | ||
| 1674 | if (rc == -EOPNOTSUPP) { | ||
| 1675 | int oplock = 0; | ||
| 1676 | __u16 netfid; | ||
| 1677 | |||
| 1678 | cFYI(1, ("calling SetFileInfo since SetPathInfo for " | ||
| 1679 | "times not supported by this server")); | ||
| 1680 | /* BB we could scan to see if we already have it open | ||
| 1681 | and pass in pid of opener to function */ | ||
| 1682 | rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, | ||
| 1683 | SYNCHRONIZE | FILE_WRITE_ATTRIBUTES, | ||
| 1684 | CREATE_NOT_DIR, &netfid, &oplock, | ||
| 1685 | NULL, cifs_sb->local_nls, | ||
| 1686 | cifs_sb->mnt_cifs_flags & | ||
| 1687 | CIFS_MOUNT_MAP_SPECIAL_CHR); | ||
| 1688 | if (rc == 0) { | ||
| 1689 | rc = CIFSSMBSetFileTimes(xid, pTcon, &time_buf, | ||
| 1690 | netfid); | ||
| 1691 | CIFSSMBClose(xid, pTcon, netfid); | ||
| 1692 | } else { | ||
| 1693 | /* BB For even older servers we could convert time_buf | ||
| 1694 | into old DOS style which uses two second | ||
| 1695 | granularity */ | ||
| 1696 | |||
| 1697 | /* rc = CIFSSMBSetTimesLegacy(xid, pTcon, full_path, | ||
| 1698 | &time_buf, cifs_sb->local_nls); */ | ||
| 1699 | } | ||
| 1700 | } | ||
| 1701 | /* Even if error on time set, no sense failing the call if | 1832 | /* Even if error on time set, no sense failing the call if |
| 1702 | the server would set the time to a reasonable value anyway, | 1833 | the server would set the time to a reasonable value anyway, |
| 1703 | and this check ensures that we are not being called from | 1834 | and this check ensures that we are not being called from |
| 1704 | sys_utimes in which case we ought to fail the call back to | 1835 | sys_utimes in which case we ought to fail the call back to |
| 1705 | the user when the server rejects the call */ | 1836 | the user when the server rejects the call */ |
| 1706 | if ((rc) && (attrs->ia_valid & | 1837 | if ((rc) && (attrs->ia_valid & |
| 1707 | (ATTR_MODE | ATTR_GID | ATTR_UID | ATTR_SIZE))) | 1838 | (ATTR_MODE | ATTR_GID | ATTR_UID | ATTR_SIZE))) |
| 1708 | rc = 0; | 1839 | rc = 0; |
| 1709 | } | 1840 | } |
| 1710 | 1841 | ||
| @@ -1718,6 +1849,21 @@ cifs_setattr_exit: | |||
| 1718 | return rc; | 1849 | return rc; |
| 1719 | } | 1850 | } |
| 1720 | 1851 | ||
| 1852 | int | ||
| 1853 | cifs_setattr(struct dentry *direntry, struct iattr *attrs) | ||
| 1854 | { | ||
| 1855 | struct inode *inode = direntry->d_inode; | ||
| 1856 | struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); | ||
| 1857 | struct cifsTconInfo *pTcon = cifs_sb->tcon; | ||
| 1858 | |||
| 1859 | if (pTcon->unix_ext) | ||
| 1860 | return cifs_setattr_unix(direntry, attrs); | ||
| 1861 | |||
| 1862 | return cifs_setattr_nounix(direntry, attrs); | ||
| 1863 | |||
| 1864 | /* BB: add cifs_setattr_legacy for really old servers */ | ||
| 1865 | } | ||
| 1866 | |||
| 1721 | #if 0 | 1867 | #if 0 |
| 1722 | void cifs_delete_inode(struct inode *inode) | 1868 | void cifs_delete_inode(struct inode *inode) |
| 1723 | { | 1869 | { |
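The inode.c hunks above finish splitting the old monolithic setattr path: cifs_setattr() is now a thin dispatcher that calls cifs_setattr_unix() when the tcon advertises unix extensions and cifs_setattr_nounix() otherwise. A minimal standalone C sketch of that dispatch shape follows; the _sketch types and function names are illustrative stand-ins, not the kernel structures.

#include <stdio.h>
#include <stdbool.h>

struct iattr_sketch { unsigned int ia_valid; };   /* stand-in for struct iattr */
struct tcon_sketch  { bool unix_ext; };           /* stand-in for cifsTconInfo */

static int setattr_unix_sketch(struct iattr_sketch *attrs)
{
        puts("unix extensions: one SetInfo call carries mode/uid/gid/times");
        return 0;
}

static int setattr_nounix_sketch(struct iattr_sketch *attrs)
{
        puts("no unix extensions: size, DOS attributes and times set separately");
        return 0;
}

/* Dispatcher: pick the protocol-specific implementation once, up front,
 * instead of testing unix_ext throughout one large function. */
static int setattr_dispatch(struct tcon_sketch *tcon, struct iattr_sketch *attrs)
{
        if (tcon->unix_ext)
                return setattr_unix_sketch(attrs);
        return setattr_nounix_sketch(attrs);
}

int main(void)
{
        struct tcon_sketch tcon = { .unix_ext = true };
        struct iattr_sketch attrs = { .ia_valid = 0 };
        return setattr_dispatch(&tcon, &attrs);
}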
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 83f306954883..5f40ed3473f5 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c | |||
| @@ -690,6 +690,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon, | |||
| 690 | else | 690 | else |
| 691 | cifs_buf_release(cifsFile->srch_inf. | 691 | cifs_buf_release(cifsFile->srch_inf. |
| 692 | ntwrk_buf_start); | 692 | ntwrk_buf_start); |
| 693 | cifsFile->srch_inf.ntwrk_buf_start = NULL; | ||
| 693 | } | 694 | } |
| 694 | rc = initiate_cifs_search(xid, file); | 695 | rc = initiate_cifs_search(xid, file); |
| 695 | if (rc) { | 696 | if (rc) { |
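The one-line readdir.c addition above clears srch_inf.ntwrk_buf_start right after the buffer is released. A small standalone C illustration of the same defensive pattern; the structure name is a stand-in, not the CIFS one.

#include <stdlib.h>

struct search_sketch { void *ntwrk_buf_start; };

/* Clear the cached pointer immediately after freeing the buffer, so any
 * later teardown path sees NULL and skips it instead of freeing it twice. */
static void release_search_buf(struct search_sketch *s)
{
        free(s->ntwrk_buf_start);
        s->ntwrk_buf_start = NULL;
}

int main(void)
{
        struct search_sketch s = { .ntwrk_buf_start = malloc(16) };
        release_search_buf(&s);
        release_search_buf(&s);   /* now harmless: free(NULL) is a no-op */
        return 0;
}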
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index ed150efbe27c..252fdc0567f1 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c | |||
| @@ -409,6 +409,8 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, | |||
| 409 | #ifdef CONFIG_CIFS_WEAK_PW_HASH | 409 | #ifdef CONFIG_CIFS_WEAK_PW_HASH |
| 410 | char lnm_session_key[CIFS_SESS_KEY_SIZE]; | 410 | char lnm_session_key[CIFS_SESS_KEY_SIZE]; |
| 411 | 411 | ||
| 412 | pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; | ||
| 413 | |||
| 412 | /* no capabilities flags in old lanman negotiation */ | 414 | /* no capabilities flags in old lanman negotiation */ |
| 413 | 415 | ||
| 414 | pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE); | 416 | pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE); |
| @@ -505,7 +507,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, | |||
| 505 | unicode_ssetup_strings(&bcc_ptr, ses, nls_cp); | 507 | unicode_ssetup_strings(&bcc_ptr, ses, nls_cp); |
| 506 | } else | 508 | } else |
| 507 | ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); | 509 | ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); |
| 508 | } else if (type == Kerberos) { | 510 | } else if (type == Kerberos || type == MSKerberos) { |
| 509 | #ifdef CONFIG_CIFS_UPCALL | 511 | #ifdef CONFIG_CIFS_UPCALL |
| 510 | struct cifs_spnego_msg *msg; | 512 | struct cifs_spnego_msg *msg; |
| 511 | spnego_key = cifs_get_spnego_key(ses); | 513 | spnego_key = cifs_get_spnego_key(ses); |
| @@ -516,6 +518,15 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, | |||
| 516 | } | 518 | } |
| 517 | 519 | ||
| 518 | msg = spnego_key->payload.data; | 520 | msg = spnego_key->payload.data; |
| 521 | /* check version field to make sure that cifs.upcall is | ||
| 522 | sending us a response in an expected form */ | ||
| 523 | if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { | ||
| 524 | cERROR(1, ("incorrect version of cifs.upcall (expected" | ||
| 525 | " %d but got %d)", | ||
| 526 | CIFS_SPNEGO_UPCALL_VERSION, msg->version)); | ||
| 527 | rc = -EKEYREJECTED; | ||
| 528 | goto ssetup_exit; | ||
| 529 | } | ||
| 519 | /* bail out if key is too long */ | 530 | /* bail out if key is too long */ |
| 520 | if (msg->sesskey_len > | 531 | if (msg->sesskey_len > |
| 521 | sizeof(ses->server->mac_signing_key.data.krb5)) { | 532 | sizeof(ses->server->mac_signing_key.data.krb5)) { |
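The sess.c hunks above treat MSKerberos like Kerberos and reject cifs.upcall replies whose version field does not match CIFS_SPNEGO_UPCALL_VERSION. A compact user-space sketch of the same validate-before-use check; the constant value and message layout here are illustrative, not the real upcall ABI.

#include <errno.h>
#include <stdio.h>

#ifndef EKEYREJECTED
#define EKEYREJECTED 129          /* Linux value; fallback for other libcs */
#endif

#define UPCALL_VERSION 2          /* stand-in for CIFS_SPNEGO_UPCALL_VERSION */

struct upcall_msg_sketch {
        unsigned int version;
        unsigned int sesskey_len;
};

/* Refuse to trust any other field of the reply until the version matches
 * what this client was built against. */
static int check_upcall(const struct upcall_msg_sketch *msg)
{
        if (msg->version != UPCALL_VERSION) {
                fprintf(stderr, "expected upcall version %u, got %u\n",
                        UPCALL_VERSION, msg->version);
                return -EKEYREJECTED;
        }
        return 0;
}

int main(void)
{
        struct upcall_msg_sketch bad = { .version = 1, .sesskey_len = 0 };
        return check_upcall(&bad) ? 1 : 0;
}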
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 000ac509c98a..e286db9f5ee2 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c | |||
| @@ -265,6 +265,7 @@ smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec, | |||
| 265 | cFYI(1, ("Sending smb: total_len %d", total_len)); | 265 | cFYI(1, ("Sending smb: total_len %d", total_len)); |
| 266 | dump_smb(smb_buffer, len); | 266 | dump_smb(smb_buffer, len); |
| 267 | 267 | ||
| 268 | i = 0; | ||
| 268 | while (total_len) { | 269 | while (total_len) { |
| 269 | rc = kernel_sendmsg(ssocket, &smb_msg, &iov[first_vec], | 270 | rc = kernel_sendmsg(ssocket, &smb_msg, &iov[first_vec], |
| 270 | n_vec - first_vec, total_len); | 271 | n_vec - first_vec, total_len); |
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index e1c854890f94..bf4a3fd3c8e3 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c | |||
| @@ -28,11 +28,9 @@ int coda_fake_statfs; | |||
| 28 | char * coda_f2s(struct CodaFid *f) | 28 | char * coda_f2s(struct CodaFid *f) |
| 29 | { | 29 | { |
| 30 | static char s[60]; | 30 | static char s[60]; |
| 31 | #ifdef CONFIG_CODA_FS_OLD_API | 31 | |
| 32 | sprintf(s, "(%08x.%08x.%08x)", f->opaque[0], f->opaque[1], f->opaque[2]); | ||
| 33 | #else | ||
| 34 | sprintf(s, "(%08x.%08x.%08x.%08x)", f->opaque[0], f->opaque[1], f->opaque[2], f->opaque[3]); | 32 | sprintf(s, "(%08x.%08x.%08x.%08x)", f->opaque[0], f->opaque[1], f->opaque[2], f->opaque[3]); |
| 35 | #endif | 33 | |
| 36 | return s; | 34 | return s; |
| 37 | } | 35 | } |
| 38 | 36 | ||
diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 3d2580e00a3e..c5916228243c 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c | |||
| @@ -137,9 +137,11 @@ exit: | |||
| 137 | } | 137 | } |
| 138 | 138 | ||
| 139 | 139 | ||
| 140 | int coda_permission(struct inode *inode, int mask, struct nameidata *nd) | 140 | int coda_permission(struct inode *inode, int mask) |
| 141 | { | 141 | { |
| 142 | int error = 0; | 142 | int error = 0; |
| 143 | |||
| 144 | mask &= MAY_READ | MAY_WRITE | MAY_EXEC; | ||
| 143 | 145 | ||
| 144 | if (!mask) | 146 | if (!mask) |
| 145 | return 0; | 147 | return 0; |
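In the coda_permission() change above the nameidata argument is gone and the incoming mask is filtered down to the rwx bits before anything else is checked. A tiny standalone model of that filtering step; the MAY_* values match the kernel's, but are redefined here only so the sketch compiles on its own.

#define MAY_EXEC  0x01
#define MAY_WRITE 0x02
#define MAY_READ  0x04

/* Keep only the permission bits this filesystem actually checks; any
 * higher "intent" bits passed down by the VFS are simply ignored. */
static int permission_sketch(int mask)
{
        mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
        if (!mask)
                return 0;            /* nothing left to validate */

        /* ... consult the userspace cache manager about the rwx bits ... */
        return 0;
}

int main(void)
{
        return permission_sketch(MAY_READ | 0x20 /* extra intent bit, dropped */);
}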
diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 2f58dfc70083..830f51abb971 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c | |||
| @@ -58,7 +58,7 @@ static void coda_destroy_inode(struct inode *inode) | |||
| 58 | kmem_cache_free(coda_inode_cachep, ITOC(inode)); | 58 | kmem_cache_free(coda_inode_cachep, ITOC(inode)); |
| 59 | } | 59 | } |
| 60 | 60 | ||
| 61 | static void init_once(struct kmem_cache * cachep, void *foo) | 61 | static void init_once(void *foo) |
| 62 | { | 62 | { |
| 63 | struct coda_inode_info *ei = (struct coda_inode_info *) foo; | 63 | struct coda_inode_info *ei = (struct coda_inode_info *) foo; |
| 64 | 64 | ||
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index c21a1f552a63..c51365422aa8 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c | |||
| @@ -24,8 +24,7 @@ | |||
| 24 | #include <linux/coda_psdev.h> | 24 | #include <linux/coda_psdev.h> |
| 25 | 25 | ||
| 26 | /* pioctl ops */ | 26 | /* pioctl ops */ |
| 27 | static int coda_ioctl_permission(struct inode *inode, int mask, | 27 | static int coda_ioctl_permission(struct inode *inode, int mask); |
| 28 | struct nameidata *nd); | ||
| 29 | static int coda_pioctl(struct inode * inode, struct file * filp, | 28 | static int coda_pioctl(struct inode * inode, struct file * filp, |
| 30 | unsigned int cmd, unsigned long user_data); | 29 | unsigned int cmd, unsigned long user_data); |
| 31 | 30 | ||
| @@ -42,8 +41,7 @@ const struct file_operations coda_ioctl_operations = { | |||
| 42 | }; | 41 | }; |
| 43 | 42 | ||
| 44 | /* the coda pioctl inode ops */ | 43 | /* the coda pioctl inode ops */ |
| 45 | static int coda_ioctl_permission(struct inode *inode, int mask, | 44 | static int coda_ioctl_permission(struct inode *inode, int mask) |
| 46 | struct nameidata *nd) | ||
| 47 | { | 45 | { |
| 48 | return 0; | 46 | return 0; |
| 49 | } | 47 | } |
| @@ -51,7 +49,7 @@ static int coda_ioctl_permission(struct inode *inode, int mask, | |||
| 51 | static int coda_pioctl(struct inode * inode, struct file * filp, | 49 | static int coda_pioctl(struct inode * inode, struct file * filp, |
| 52 | unsigned int cmd, unsigned long user_data) | 50 | unsigned int cmd, unsigned long user_data) |
| 53 | { | 51 | { |
| 54 | struct nameidata nd; | 52 | struct path path; |
| 55 | int error; | 53 | int error; |
| 56 | struct PioctlData data; | 54 | struct PioctlData data; |
| 57 | struct inode *target_inode = NULL; | 55 | struct inode *target_inode = NULL; |
| @@ -66,21 +64,21 @@ static int coda_pioctl(struct inode * inode, struct file * filp, | |||
| 66 | * Look up the pathname. Note that the pathname is in | 64 | * Look up the pathname. Note that the pathname is in |
| 67 | * user memory, and namei takes care of this | 65 | * user memory, and namei takes care of this |
| 68 | */ | 66 | */ |
| 69 | if ( data.follow ) { | 67 | if (data.follow) { |
| 70 | error = user_path_walk(data.path, &nd); | 68 | error = user_path(data.path, &path); |
| 71 | } else { | 69 | } else { |
| 72 | error = user_path_walk_link(data.path, &nd); | 70 | error = user_lpath(data.path, &path); |
| 73 | } | 71 | } |
| 74 | 72 | ||
| 75 | if ( error ) { | 73 | if ( error ) { |
| 76 | return error; | 74 | return error; |
| 77 | } else { | 75 | } else { |
| 78 | target_inode = nd.path.dentry->d_inode; | 76 | target_inode = path.dentry->d_inode; |
| 79 | } | 77 | } |
| 80 | 78 | ||
| 81 | /* return if it is not a Coda inode */ | 79 | /* return if it is not a Coda inode */ |
| 82 | if ( target_inode->i_sb != inode->i_sb ) { | 80 | if ( target_inode->i_sb != inode->i_sb ) { |
| 83 | path_put(&nd.path); | 81 | path_put(&path); |
| 84 | return -EINVAL; | 82 | return -EINVAL; |
| 85 | } | 83 | } |
| 86 | 84 | ||
| @@ -89,7 +87,7 @@ static int coda_pioctl(struct inode * inode, struct file * filp, | |||
| 89 | 87 | ||
| 90 | error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data); | 88 | error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data); |
| 91 | 89 | ||
| 92 | path_put(&nd.path); | 90 | path_put(&path); |
| 93 | return error; | 91 | return error; |
| 94 | } | 92 | } |
| 95 | 93 | ||
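The pioctl conversion above drops struct nameidata in favour of struct path: user_path() or user_lpath() resolves the user-supplied name and path_put() drops the reference. A condensed sketch of that lookup/cleanup pairing in the style of the hunk; kernel context is assumed (this is not a standalone program), error handling is trimmed, and lookup_and_inspect() and its follow flag are hypothetical names for illustration.

static int lookup_and_inspect(const char __user *name, int follow)
{
        struct path path;
        int error;

        /* resolve the name, following or not following a trailing symlink */
        error = follow ? user_path(name, &path) : user_lpath(name, &path);
        if (error)
                return error;

        /* ... inspect path.dentry->d_inode here ... */

        path_put(&path);        /* one put pairs with one successful lookup */
        return 0;
}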
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 40c36f7352a6..0d9b80ec689c 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c | |||
| @@ -378,11 +378,7 @@ MODULE_AUTHOR("Jan Harkes, Peter J. Braam"); | |||
| 378 | MODULE_DESCRIPTION("Coda Distributed File System VFS interface"); | 378 | MODULE_DESCRIPTION("Coda Distributed File System VFS interface"); |
| 379 | MODULE_ALIAS_CHARDEV_MAJOR(CODA_PSDEV_MAJOR); | 379 | MODULE_ALIAS_CHARDEV_MAJOR(CODA_PSDEV_MAJOR); |
| 380 | MODULE_LICENSE("GPL"); | 380 | MODULE_LICENSE("GPL"); |
| 381 | #ifdef CONFIG_CODA_FS_OLD_API | ||
| 382 | MODULE_VERSION("5.3.21"); | ||
| 383 | #else | ||
| 384 | MODULE_VERSION("6.6"); | 381 | MODULE_VERSION("6.6"); |
| 385 | #endif | ||
| 386 | 382 | ||
| 387 | static int __init init_coda(void) | 383 | static int __init init_coda(void) |
| 388 | { | 384 | { |
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index 359e531094dd..ce432bca95d1 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c | |||
| @@ -52,12 +52,8 @@ static void *alloc_upcall(int opcode, int size) | |||
| 52 | inp->ih.opcode = opcode; | 52 | inp->ih.opcode = opcode; |
| 53 | inp->ih.pid = current->pid; | 53 | inp->ih.pid = current->pid; |
| 54 | inp->ih.pgid = task_pgrp_nr(current); | 54 | inp->ih.pgid = task_pgrp_nr(current); |
| 55 | #ifdef CONFIG_CODA_FS_OLD_API | ||
| 56 | memset(&inp->ih.cred, 0, sizeof(struct coda_cred)); | ||
| 57 | inp->ih.cred.cr_fsuid = current->fsuid; | ||
| 58 | #else | ||
| 59 | inp->ih.uid = current->fsuid; | 55 | inp->ih.uid = current->fsuid; |
| 60 | #endif | 56 | |
| 61 | return (void*)inp; | 57 | return (void*)inp; |
| 62 | } | 58 | } |
| 63 | 59 | ||
| @@ -166,20 +162,11 @@ int venus_close(struct super_block *sb, struct CodaFid *fid, int flags, | |||
| 166 | union inputArgs *inp; | 162 | union inputArgs *inp; |
| 167 | union outputArgs *outp; | 163 | union outputArgs *outp; |
| 168 | int insize, outsize, error; | 164 | int insize, outsize, error; |
| 169 | #ifdef CONFIG_CODA_FS_OLD_API | ||
| 170 | struct coda_cred cred = { 0, }; | ||
| 171 | cred.cr_fsuid = uid; | ||
| 172 | #endif | ||
| 173 | 165 | ||
| 174 | insize = SIZE(release); | 166 | insize = SIZE(release); |
| 175 | UPARG(CODA_CLOSE); | 167 | UPARG(CODA_CLOSE); |
| 176 | 168 | ||
| 177 | #ifdef CONFIG_CODA_FS_OLD_API | ||
| 178 | memcpy(&(inp->ih.cred), &cred, sizeof(cred)); | ||
| 179 | #else | ||
| 180 | inp->ih.uid = uid; | 169 | inp->ih.uid = uid; |
| 181 | #endif | ||
| 182 | |||
| 183 | inp->coda_close.VFid = *fid; | 170 | inp->coda_close.VFid = *fid; |
| 184 | inp->coda_close.flags = flags; | 171 | inp->coda_close.flags = flags; |
| 185 | 172 | ||
diff --git a/fs/compat.c b/fs/compat.c index 106eba28ec5a..075d0509970d 100644 --- a/fs/compat.c +++ b/fs/compat.c | |||
| @@ -234,18 +234,18 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs * | |||
| 234 | * The following statfs calls are copies of code from fs/open.c and | 234 | * The following statfs calls are copies of code from fs/open.c and |
| 235 | * should be checked against those from time to time | 235 | * should be checked against those from time to time |
| 236 | */ | 236 | */ |
| 237 | asmlinkage long compat_sys_statfs(const char __user *path, struct compat_statfs __user *buf) | 237 | asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf) |
| 238 | { | 238 | { |
| 239 | struct nameidata nd; | 239 | struct path path; |
| 240 | int error; | 240 | int error; |
| 241 | 241 | ||
| 242 | error = user_path_walk(path, &nd); | 242 | error = user_path(pathname, &path); |
| 243 | if (!error) { | 243 | if (!error) { |
| 244 | struct kstatfs tmp; | 244 | struct kstatfs tmp; |
| 245 | error = vfs_statfs(nd.path.dentry, &tmp); | 245 | error = vfs_statfs(path.dentry, &tmp); |
| 246 | if (!error) | 246 | if (!error) |
| 247 | error = put_compat_statfs(buf, &tmp); | 247 | error = put_compat_statfs(buf, &tmp); |
| 248 | path_put(&nd.path); | 248 | path_put(&path); |
| 249 | } | 249 | } |
| 250 | return error; | 250 | return error; |
| 251 | } | 251 | } |
| @@ -299,21 +299,21 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat | |||
| 299 | return 0; | 299 | return 0; |
| 300 | } | 300 | } |
| 301 | 301 | ||
| 302 | asmlinkage long compat_sys_statfs64(const char __user *path, compat_size_t sz, struct compat_statfs64 __user *buf) | 302 | asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf) |
| 303 | { | 303 | { |
| 304 | struct nameidata nd; | 304 | struct path path; |
| 305 | int error; | 305 | int error; |
| 306 | 306 | ||
| 307 | if (sz != sizeof(*buf)) | 307 | if (sz != sizeof(*buf)) |
| 308 | return -EINVAL; | 308 | return -EINVAL; |
| 309 | 309 | ||
| 310 | error = user_path_walk(path, &nd); | 310 | error = user_path(pathname, &path); |
| 311 | if (!error) { | 311 | if (!error) { |
| 312 | struct kstatfs tmp; | 312 | struct kstatfs tmp; |
| 313 | error = vfs_statfs(nd.path.dentry, &tmp); | 313 | error = vfs_statfs(path.dentry, &tmp); |
| 314 | if (!error) | 314 | if (!error) |
| 315 | error = put_compat_statfs64(buf, &tmp); | 315 | error = put_compat_statfs64(buf, &tmp); |
| 316 | path_put(&nd.path); | 316 | path_put(&path); |
| 317 | } | 317 | } |
| 318 | return error; | 318 | return error; |
| 319 | } | 319 | } |
| @@ -792,8 +792,10 @@ static int compat_fillonedir(void *__buf, const char *name, int namlen, | |||
| 792 | if (buf->result) | 792 | if (buf->result) |
| 793 | return -EINVAL; | 793 | return -EINVAL; |
| 794 | d_ino = ino; | 794 | d_ino = ino; |
| 795 | if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) | 795 | if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { |
| 796 | buf->result = -EOVERFLOW; | ||
| 796 | return -EOVERFLOW; | 797 | return -EOVERFLOW; |
| 798 | } | ||
| 797 | buf->result++; | 799 | buf->result++; |
| 798 | dirent = buf->dirent; | 800 | dirent = buf->dirent; |
| 799 | if (!access_ok(VERIFY_WRITE, dirent, | 801 | if (!access_ok(VERIFY_WRITE, dirent, |
| @@ -862,8 +864,10 @@ static int compat_filldir(void *__buf, const char *name, int namlen, | |||
| 862 | if (reclen > buf->count) | 864 | if (reclen > buf->count) |
| 863 | return -EINVAL; | 865 | return -EINVAL; |
| 864 | d_ino = ino; | 866 | d_ino = ino; |
| 865 | if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) | 867 | if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { |
| 868 | buf->error = -EOVERFLOW; | ||
| 866 | return -EOVERFLOW; | 869 | return -EOVERFLOW; |
| 870 | } | ||
| 867 | dirent = buf->previous; | 871 | dirent = buf->previous; |
| 868 | if (dirent) { | 872 | if (dirent) { |
| 869 | if (__put_user(offset, &dirent->d_off)) | 873 | if (__put_user(offset, &dirent->d_off)) |
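Both compat_fillonedir() and compat_filldir() above now record -EOVERFLOW in the callback's buffer state as well as returning it, so the outer getdents path can report the failure to userspace. A standalone C model of that record-and-return pattern; the struct and field names are illustrative.

#include <errno.h>
#include <stdint.h>

struct filldir_sketch { int error; };

/* When the 64-bit inode number does not fit in the narrower compat field,
 * remember the error in the shared state and return it, so a caller that
 * only inspects buf->error still observes the failure. */
static int fill_one(struct filldir_sketch *buf, uint64_t ino)
{
        uint32_t d_ino = (uint32_t)ino;

        if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
                buf->error = -EOVERFLOW;
                return -EOVERFLOW;
        }
        return 0;
}

int main(void)
{
        struct filldir_sketch buf = { 0 };
        return fill_one(&buf, (uint64_t)1 << 40) ? 1 : 0;
}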
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 18e2c548161d..5235c67e7594 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c | |||
| @@ -25,7 +25,6 @@ | |||
| 25 | #include <linux/slab.h> | 25 | #include <linux/slab.h> |
| 26 | #include <linux/raid/md.h> | 26 | #include <linux/raid/md.h> |
| 27 | #include <linux/kd.h> | 27 | #include <linux/kd.h> |
| 28 | #include <linux/dirent.h> | ||
| 29 | #include <linux/route.h> | 28 | #include <linux/route.h> |
| 30 | #include <linux/in6.h> | 29 | #include <linux/in6.h> |
| 31 | #include <linux/ipv6_route.h> | 30 | #include <linux/ipv6_route.h> |
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index da015c12e3ea..762d287123ca 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h | |||
| @@ -49,8 +49,10 @@ struct configfs_dirent { | |||
| 49 | #define CONFIGFS_USET_DEFAULT 0x0080 | 49 | #define CONFIGFS_USET_DEFAULT 0x0080 |
| 50 | #define CONFIGFS_USET_DROPPING 0x0100 | 50 | #define CONFIGFS_USET_DROPPING 0x0100 |
| 51 | #define CONFIGFS_USET_IN_MKDIR 0x0200 | 51 | #define CONFIGFS_USET_IN_MKDIR 0x0200 |
| 52 | #define CONFIGFS_USET_CREATING 0x0400 | ||
| 52 | #define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR) | 53 | #define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR) |
| 53 | 54 | ||
| 55 | extern struct mutex configfs_symlink_mutex; | ||
| 54 | extern spinlock_t configfs_dirent_lock; | 56 | extern spinlock_t configfs_dirent_lock; |
| 55 | 57 | ||
| 56 | extern struct vfsmount * configfs_mount; | 58 | extern struct vfsmount * configfs_mount; |
| @@ -66,6 +68,7 @@ extern void configfs_inode_exit(void); | |||
| 66 | extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); | 68 | extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); |
| 67 | extern int configfs_make_dirent(struct configfs_dirent *, | 69 | extern int configfs_make_dirent(struct configfs_dirent *, |
| 68 | struct dentry *, void *, umode_t, int); | 70 | struct dentry *, void *, umode_t, int); |
| 71 | extern int configfs_dirent_is_ready(struct configfs_dirent *); | ||
| 69 | 72 | ||
| 70 | extern int configfs_add_file(struct dentry *, const struct configfs_attribute *, int); | 73 | extern int configfs_add_file(struct dentry *, const struct configfs_attribute *, int); |
| 71 | extern void configfs_hash_and_remove(struct dentry * dir, const char * name); | 74 | extern void configfs_hash_and_remove(struct dentry * dir, const char * name); |
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 179589be063a..8e93341f3e82 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c | |||
| @@ -185,7 +185,7 @@ static int create_dir(struct config_item * k, struct dentry * p, | |||
| 185 | error = configfs_dirent_exists(p->d_fsdata, d->d_name.name); | 185 | error = configfs_dirent_exists(p->d_fsdata, d->d_name.name); |
| 186 | if (!error) | 186 | if (!error) |
| 187 | error = configfs_make_dirent(p->d_fsdata, d, k, mode, | 187 | error = configfs_make_dirent(p->d_fsdata, d, k, mode, |
| 188 | CONFIGFS_DIR); | 188 | CONFIGFS_DIR | CONFIGFS_USET_CREATING); |
| 189 | if (!error) { | 189 | if (!error) { |
| 190 | error = configfs_create(d, mode, init_dir); | 190 | error = configfs_create(d, mode, init_dir); |
| 191 | if (!error) { | 191 | if (!error) { |
| @@ -209,6 +209,9 @@ static int create_dir(struct config_item * k, struct dentry * p, | |||
| 209 | * configfs_create_dir - create a directory for an config_item. | 209 | * configfs_create_dir - create a directory for an config_item. |
| 210 | * @item: config_itemwe're creating directory for. | 210 | * @item: config_itemwe're creating directory for. |
| 211 | * @dentry: config_item's dentry. | 211 | * @dentry: config_item's dentry. |
| 212 | * | ||
| 213 | * Note: user-created entries won't be allowed under this new directory | ||
| 214 | * until it is validated by configfs_dir_set_ready() | ||
| 212 | */ | 215 | */ |
| 213 | 216 | ||
| 214 | static int configfs_create_dir(struct config_item * item, struct dentry *dentry) | 217 | static int configfs_create_dir(struct config_item * item, struct dentry *dentry) |
| @@ -231,6 +234,44 @@ static int configfs_create_dir(struct config_item * item, struct dentry *dentry) | |||
| 231 | return error; | 234 | return error; |
| 232 | } | 235 | } |
| 233 | 236 | ||
| 237 | /* | ||
| 238 | * Allow userspace to create new entries under a new directory created with | ||
| 239 | * configfs_create_dir(), and under all of its children directories recursively. | ||
| 240 | * @sd configfs_dirent of the new directory to validate | ||
| 241 | * | ||
| 242 | * Caller must hold configfs_dirent_lock. | ||
| 243 | */ | ||
| 244 | static void configfs_dir_set_ready(struct configfs_dirent *sd) | ||
| 245 | { | ||
| 246 | struct configfs_dirent *child_sd; | ||
| 247 | |||
| 248 | sd->s_type &= ~CONFIGFS_USET_CREATING; | ||
| 249 | list_for_each_entry(child_sd, &sd->s_children, s_sibling) | ||
| 250 | if (child_sd->s_type & CONFIGFS_USET_CREATING) | ||
| 251 | configfs_dir_set_ready(child_sd); | ||
| 252 | } | ||
| 253 | |||
| 254 | /* | ||
| 255 | * Check that a directory does not belong to a directory hierarchy being | ||
| 256 | * attached and not validated yet. | ||
| 257 | * @sd configfs_dirent of the directory to check | ||
| 258 | * | ||
| 259 | * @return non-zero iff the directory was validated | ||
| 260 | * | ||
| 261 | * Note: takes configfs_dirent_lock, so the result may change from false to true | ||
| 262 | * in two consecutive calls, but never from true to false. | ||
| 263 | */ | ||
| 264 | int configfs_dirent_is_ready(struct configfs_dirent *sd) | ||
| 265 | { | ||
| 266 | int ret; | ||
| 267 | |||
| 268 | spin_lock(&configfs_dirent_lock); | ||
| 269 | ret = !(sd->s_type & CONFIGFS_USET_CREATING); | ||
| 270 | spin_unlock(&configfs_dirent_lock); | ||
| 271 | |||
| 272 | return ret; | ||
| 273 | } | ||
| 274 | |||
| 234 | int configfs_create_link(struct configfs_symlink *sl, | 275 | int configfs_create_link(struct configfs_symlink *sl, |
| 235 | struct dentry *parent, | 276 | struct dentry *parent, |
| 236 | struct dentry *dentry) | 277 | struct dentry *dentry) |
| @@ -283,6 +324,8 @@ static void remove_dir(struct dentry * d) | |||
| 283 | * The only thing special about this is that we remove any files in | 324 | * The only thing special about this is that we remove any files in |
| 284 | * the directory before we remove the directory, and we've inlined | 325 | * the directory before we remove the directory, and we've inlined |
| 285 | * what used to be configfs_rmdir() below, instead of calling separately. | 326 | * what used to be configfs_rmdir() below, instead of calling separately. |
| 327 | * | ||
| 328 | * Caller holds the mutex of the item's inode | ||
| 286 | */ | 329 | */ |
| 287 | 330 | ||
| 288 | static void configfs_remove_dir(struct config_item * item) | 331 | static void configfs_remove_dir(struct config_item * item) |
| @@ -330,7 +373,19 @@ static struct dentry * configfs_lookup(struct inode *dir, | |||
| 330 | struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata; | 373 | struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata; |
| 331 | struct configfs_dirent * sd; | 374 | struct configfs_dirent * sd; |
| 332 | int found = 0; | 375 | int found = 0; |
| 333 | int err = 0; | 376 | int err; |
| 377 | |||
| 378 | /* | ||
| 379 | * Fake invisibility if dir belongs to a group/default groups hierarchy | ||
| 380 | * being attached | ||
| 381 | * | ||
| 382 | * This forbids userspace to read/write attributes of items which may | ||
| 383 | * not complete their initialization, since the dentries of the | ||
| 384 | * attributes won't be instantiated. | ||
| 385 | */ | ||
| 386 | err = -ENOENT; | ||
| 387 | if (!configfs_dirent_is_ready(parent_sd)) | ||
| 388 | goto out; | ||
| 334 | 389 | ||
| 335 | list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { | 390 | list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { |
| 336 | if (sd->s_type & CONFIGFS_NOT_PINNED) { | 391 | if (sd->s_type & CONFIGFS_NOT_PINNED) { |
| @@ -353,6 +408,7 @@ static struct dentry * configfs_lookup(struct inode *dir, | |||
| 353 | return simple_lookup(dir, dentry, nd); | 408 | return simple_lookup(dir, dentry, nd); |
| 354 | } | 409 | } |
| 355 | 410 | ||
| 411 | out: | ||
| 356 | return ERR_PTR(err); | 412 | return ERR_PTR(err); |
| 357 | } | 413 | } |
| 358 | 414 | ||
| @@ -370,13 +426,17 @@ static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex | |||
| 370 | struct configfs_dirent *sd; | 426 | struct configfs_dirent *sd; |
| 371 | int ret; | 427 | int ret; |
| 372 | 428 | ||
| 429 | /* Mark that we're trying to drop the group */ | ||
| 430 | parent_sd->s_type |= CONFIGFS_USET_DROPPING; | ||
| 431 | |||
| 373 | ret = -EBUSY; | 432 | ret = -EBUSY; |
| 374 | if (!list_empty(&parent_sd->s_links)) | 433 | if (!list_empty(&parent_sd->s_links)) |
| 375 | goto out; | 434 | goto out; |
| 376 | 435 | ||
| 377 | ret = 0; | 436 | ret = 0; |
| 378 | list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { | 437 | list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { |
| 379 | if (sd->s_type & CONFIGFS_NOT_PINNED) | 438 | if (!sd->s_element || |
| 439 | (sd->s_type & CONFIGFS_NOT_PINNED)) | ||
| 380 | continue; | 440 | continue; |
| 381 | if (sd->s_type & CONFIGFS_USET_DEFAULT) { | 441 | if (sd->s_type & CONFIGFS_USET_DEFAULT) { |
| 382 | /* Abort if racing with mkdir() */ | 442 | /* Abort if racing with mkdir() */ |
| @@ -385,8 +445,6 @@ static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex | |||
| 385 | *wait_mutex = &sd->s_dentry->d_inode->i_mutex; | 445 | *wait_mutex = &sd->s_dentry->d_inode->i_mutex; |
| 386 | return -EAGAIN; | 446 | return -EAGAIN; |
| 387 | } | 447 | } |
| 388 | /* Mark that we're trying to drop the group */ | ||
| 389 | sd->s_type |= CONFIGFS_USET_DROPPING; | ||
| 390 | 448 | ||
| 391 | /* | 449 | /* |
| 392 | * Yup, recursive. If there's a problem, blame | 450 | * Yup, recursive. If there's a problem, blame |
| @@ -414,12 +472,11 @@ static void configfs_detach_rollback(struct dentry *dentry) | |||
| 414 | struct configfs_dirent *parent_sd = dentry->d_fsdata; | 472 | struct configfs_dirent *parent_sd = dentry->d_fsdata; |
| 415 | struct configfs_dirent *sd; | 473 | struct configfs_dirent *sd; |
| 416 | 474 | ||
| 417 | list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { | 475 | parent_sd->s_type &= ~CONFIGFS_USET_DROPPING; |
| 418 | if (sd->s_type & CONFIGFS_USET_DEFAULT) { | 476 | |
| 477 | list_for_each_entry(sd, &parent_sd->s_children, s_sibling) | ||
| 478 | if (sd->s_type & CONFIGFS_USET_DEFAULT) | ||
| 419 | configfs_detach_rollback(sd->s_dentry); | 479 | configfs_detach_rollback(sd->s_dentry); |
| 420 | sd->s_type &= ~CONFIGFS_USET_DROPPING; | ||
| 421 | } | ||
| 422 | } | ||
| 423 | } | 480 | } |
| 424 | 481 | ||
| 425 | static void detach_attrs(struct config_item * item) | 482 | static void detach_attrs(struct config_item * item) |
| @@ -558,36 +615,21 @@ static int create_default_group(struct config_group *parent_group, | |||
| 558 | static int populate_groups(struct config_group *group) | 615 | static int populate_groups(struct config_group *group) |
| 559 | { | 616 | { |
| 560 | struct config_group *new_group; | 617 | struct config_group *new_group; |
| 561 | struct dentry *dentry = group->cg_item.ci_dentry; | ||
| 562 | int ret = 0; | 618 | int ret = 0; |
| 563 | int i; | 619 | int i; |
| 564 | 620 | ||
| 565 | if (group->default_groups) { | 621 | if (group->default_groups) { |
| 566 | /* | ||
| 567 | * FYI, we're faking mkdir here | ||
| 568 | * I'm not sure we need this semaphore, as we're called | ||
| 569 | * from our parent's mkdir. That holds our parent's | ||
| 570 | * i_mutex, so afaik lookup cannot continue through our | ||
| 571 | * parent to find us, let alone mess with our tree. | ||
| 572 | * That said, taking our i_mutex is closer to mkdir | ||
| 573 | * emulation, and shouldn't hurt. | ||
| 574 | */ | ||
| 575 | mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); | ||
| 576 | |||
| 577 | for (i = 0; group->default_groups[i]; i++) { | 622 | for (i = 0; group->default_groups[i]; i++) { |
| 578 | new_group = group->default_groups[i]; | 623 | new_group = group->default_groups[i]; |
| 579 | 624 | ||
| 580 | ret = create_default_group(group, new_group); | 625 | ret = create_default_group(group, new_group); |
| 581 | if (ret) | 626 | if (ret) { |
| 627 | detach_groups(group); | ||
| 582 | break; | 628 | break; |
| 629 | } | ||
| 583 | } | 630 | } |
| 584 | |||
| 585 | mutex_unlock(&dentry->d_inode->i_mutex); | ||
| 586 | } | 631 | } |
| 587 | 632 | ||
| 588 | if (ret) | ||
| 589 | detach_groups(group); | ||
| 590 | |||
| 591 | return ret; | 633 | return ret; |
| 592 | } | 634 | } |
| 593 | 635 | ||
| @@ -702,7 +744,15 @@ static int configfs_attach_item(struct config_item *parent_item, | |||
| 702 | if (!ret) { | 744 | if (!ret) { |
| 703 | ret = populate_attrs(item); | 745 | ret = populate_attrs(item); |
| 704 | if (ret) { | 746 | if (ret) { |
| 747 | /* | ||
| 748 | * We are going to remove an inode and its dentry but | ||
| 749 | * the VFS may already have hit and used them. Thus, | ||
| 750 | * we must lock them as rmdir() would. | ||
| 751 | */ | ||
| 752 | mutex_lock(&dentry->d_inode->i_mutex); | ||
| 705 | configfs_remove_dir(item); | 753 | configfs_remove_dir(item); |
| 754 | dentry->d_inode->i_flags |= S_DEAD; | ||
| 755 | mutex_unlock(&dentry->d_inode->i_mutex); | ||
| 706 | d_delete(dentry); | 756 | d_delete(dentry); |
| 707 | } | 757 | } |
| 708 | } | 758 | } |
| @@ -710,6 +760,7 @@ static int configfs_attach_item(struct config_item *parent_item, | |||
| 710 | return ret; | 760 | return ret; |
| 711 | } | 761 | } |
| 712 | 762 | ||
| 763 | /* Caller holds the mutex of the item's inode */ | ||
| 713 | static void configfs_detach_item(struct config_item *item) | 764 | static void configfs_detach_item(struct config_item *item) |
| 714 | { | 765 | { |
| 715 | detach_attrs(item); | 766 | detach_attrs(item); |
| @@ -728,16 +779,30 @@ static int configfs_attach_group(struct config_item *parent_item, | |||
| 728 | sd = dentry->d_fsdata; | 779 | sd = dentry->d_fsdata; |
| 729 | sd->s_type |= CONFIGFS_USET_DIR; | 780 | sd->s_type |= CONFIGFS_USET_DIR; |
| 730 | 781 | ||
| 782 | /* | ||
| 783 | * FYI, we're faking mkdir in populate_groups() | ||
| 784 | * We must lock the group's inode to avoid races with the VFS | ||
| 785 | * which can already hit the inode and try to add/remove entries | ||
| 786 | * under it. | ||
| 787 | * | ||
| 788 | * We must also lock the inode to remove it safely in case of | ||
| 789 | * error, as rmdir() would. | ||
| 790 | */ | ||
| 791 | mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); | ||
| 731 | ret = populate_groups(to_config_group(item)); | 792 | ret = populate_groups(to_config_group(item)); |
| 732 | if (ret) { | 793 | if (ret) { |
| 733 | configfs_detach_item(item); | 794 | configfs_detach_item(item); |
| 734 | d_delete(dentry); | 795 | dentry->d_inode->i_flags |= S_DEAD; |
| 735 | } | 796 | } |
| 797 | mutex_unlock(&dentry->d_inode->i_mutex); | ||
| 798 | if (ret) | ||
| 799 | d_delete(dentry); | ||
| 736 | } | 800 | } |
| 737 | 801 | ||
| 738 | return ret; | 802 | return ret; |
| 739 | } | 803 | } |
| 740 | 804 | ||
| 805 | /* Caller holds the mutex of the group's inode */ | ||
| 741 | static void configfs_detach_group(struct config_item *item) | 806 | static void configfs_detach_group(struct config_item *item) |
| 742 | { | 807 | { |
| 743 | detach_groups(to_config_group(item)); | 808 | detach_groups(to_config_group(item)); |
| @@ -1035,7 +1100,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) | |||
| 1035 | struct configfs_subsystem *subsys; | 1100 | struct configfs_subsystem *subsys; |
| 1036 | struct configfs_dirent *sd; | 1101 | struct configfs_dirent *sd; |
| 1037 | struct config_item_type *type; | 1102 | struct config_item_type *type; |
| 1038 | struct module *owner = NULL; | 1103 | struct module *subsys_owner = NULL, *new_item_owner = NULL; |
| 1039 | char *name; | 1104 | char *name; |
| 1040 | 1105 | ||
| 1041 | if (dentry->d_parent == configfs_sb->s_root) { | 1106 | if (dentry->d_parent == configfs_sb->s_root) { |
| @@ -1044,6 +1109,16 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) | |||
| 1044 | } | 1109 | } |
| 1045 | 1110 | ||
| 1046 | sd = dentry->d_parent->d_fsdata; | 1111 | sd = dentry->d_parent->d_fsdata; |
| 1112 | |||
| 1113 | /* | ||
| 1114 | * Fake invisibility if dir belongs to a group/default groups hierarchy | ||
| 1115 | * being attached | ||
| 1116 | */ | ||
| 1117 | if (!configfs_dirent_is_ready(sd)) { | ||
| 1118 | ret = -ENOENT; | ||
| 1119 | goto out; | ||
| 1120 | } | ||
| 1121 | |||
| 1047 | if (!(sd->s_type & CONFIGFS_USET_DIR)) { | 1122 | if (!(sd->s_type & CONFIGFS_USET_DIR)) { |
| 1048 | ret = -EPERM; | 1123 | ret = -EPERM; |
| 1049 | goto out; | 1124 | goto out; |
| @@ -1062,10 +1137,25 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) | |||
| 1062 | goto out_put; | 1137 | goto out_put; |
| 1063 | } | 1138 | } |
| 1064 | 1139 | ||
| 1140 | /* | ||
| 1141 | * The subsystem may belong to a different module than the item | ||
| 1142 | * being created. We don't want to safely pin the new item but | ||
| 1143 | * fail to pin the subsystem it sits under. | ||
| 1144 | */ | ||
| 1145 | if (!subsys->su_group.cg_item.ci_type) { | ||
| 1146 | ret = -EINVAL; | ||
| 1147 | goto out_put; | ||
| 1148 | } | ||
| 1149 | subsys_owner = subsys->su_group.cg_item.ci_type->ct_owner; | ||
| 1150 | if (!try_module_get(subsys_owner)) { | ||
| 1151 | ret = -EINVAL; | ||
| 1152 | goto out_put; | ||
| 1153 | } | ||
| 1154 | |||
| 1065 | name = kmalloc(dentry->d_name.len + 1, GFP_KERNEL); | 1155 | name = kmalloc(dentry->d_name.len + 1, GFP_KERNEL); |
| 1066 | if (!name) { | 1156 | if (!name) { |
| 1067 | ret = -ENOMEM; | 1157 | ret = -ENOMEM; |
| 1068 | goto out_put; | 1158 | goto out_subsys_put; |
| 1069 | } | 1159 | } |
| 1070 | 1160 | ||
| 1071 | snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name); | 1161 | snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name); |
| @@ -1094,10 +1184,10 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) | |||
| 1094 | kfree(name); | 1184 | kfree(name); |
| 1095 | if (ret) { | 1185 | if (ret) { |
| 1096 | /* | 1186 | /* |
| 1097 | * If item == NULL, then link_obj() was never called. | 1187 | * If ret != 0, then link_obj() was never called. |
| 1098 | * There are no extra references to clean up. | 1188 | * There are no extra references to clean up. |
| 1099 | */ | 1189 | */ |
| 1100 | goto out_put; | 1190 | goto out_subsys_put; |
| 1101 | } | 1191 | } |
| 1102 | 1192 | ||
| 1103 | /* | 1193 | /* |
| @@ -1111,8 +1201,8 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) | |||
| 1111 | goto out_unlink; | 1201 | goto out_unlink; |
| 1112 | } | 1202 | } |
| 1113 | 1203 | ||
| 1114 | owner = type->ct_owner; | 1204 | new_item_owner = type->ct_owner; |
| 1115 | if (!try_module_get(owner)) { | 1205 | if (!try_module_get(new_item_owner)) { |
| 1116 | ret = -EINVAL; | 1206 | ret = -EINVAL; |
| 1117 | goto out_unlink; | 1207 | goto out_unlink; |
| 1118 | } | 1208 | } |
| @@ -1142,6 +1232,8 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) | |||
| 1142 | 1232 | ||
| 1143 | spin_lock(&configfs_dirent_lock); | 1233 | spin_lock(&configfs_dirent_lock); |
| 1144 | sd->s_type &= ~CONFIGFS_USET_IN_MKDIR; | 1234 | sd->s_type &= ~CONFIGFS_USET_IN_MKDIR; |
| 1235 | if (!ret) | ||
| 1236 | configfs_dir_set_ready(dentry->d_fsdata); | ||
| 1145 | spin_unlock(&configfs_dirent_lock); | 1237 | spin_unlock(&configfs_dirent_lock); |
| 1146 | 1238 | ||
| 1147 | out_unlink: | 1239 | out_unlink: |
| @@ -1159,9 +1251,13 @@ out_unlink: | |||
| 1159 | mutex_unlock(&subsys->su_mutex); | 1251 | mutex_unlock(&subsys->su_mutex); |
| 1160 | 1252 | ||
| 1161 | if (module_got) | 1253 | if (module_got) |
| 1162 | module_put(owner); | 1254 | module_put(new_item_owner); |
| 1163 | } | 1255 | } |
| 1164 | 1256 | ||
| 1257 | out_subsys_put: | ||
| 1258 | if (ret) | ||
| 1259 | module_put(subsys_owner); | ||
| 1260 | |||
| 1165 | out_put: | 1261 | out_put: |
| 1166 | /* | 1262 | /* |
| 1167 | * link_obj()/link_group() took a reference from child->parent, | 1263 | * link_obj()/link_group() took a reference from child->parent, |
| @@ -1180,7 +1276,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) | |||
| 1180 | struct config_item *item; | 1276 | struct config_item *item; |
| 1181 | struct configfs_subsystem *subsys; | 1277 | struct configfs_subsystem *subsys; |
| 1182 | struct configfs_dirent *sd; | 1278 | struct configfs_dirent *sd; |
| 1183 | struct module *owner = NULL; | 1279 | struct module *subsys_owner = NULL, *dead_item_owner = NULL; |
| 1184 | int ret; | 1280 | int ret; |
| 1185 | 1281 | ||
| 1186 | if (dentry->d_parent == configfs_sb->s_root) | 1282 | if (dentry->d_parent == configfs_sb->s_root) |
| @@ -1207,14 +1303,26 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) | |||
| 1207 | return -EINVAL; | 1303 | return -EINVAL; |
| 1208 | } | 1304 | } |
| 1209 | 1305 | ||
| 1210 | spin_lock(&configfs_dirent_lock); | 1306 | /* configfs_mkdir() shouldn't have allowed this */ |
| 1307 | BUG_ON(!subsys->su_group.cg_item.ci_type); | ||
| 1308 | subsys_owner = subsys->su_group.cg_item.ci_type->ct_owner; | ||
| 1309 | |||
| 1310 | /* | ||
| 1311 | * Ensure that no racing symlink() will make detach_prep() fail while | ||
| 1312 | * the new link is temporarily attached | ||
| 1313 | */ | ||
| 1211 | do { | 1314 | do { |
| 1212 | struct mutex *wait_mutex; | 1315 | struct mutex *wait_mutex; |
| 1213 | 1316 | ||
| 1317 | mutex_lock(&configfs_symlink_mutex); | ||
| 1318 | spin_lock(&configfs_dirent_lock); | ||
| 1214 | ret = configfs_detach_prep(dentry, &wait_mutex); | 1319 | ret = configfs_detach_prep(dentry, &wait_mutex); |
| 1215 | if (ret) { | 1320 | if (ret) |
| 1216 | configfs_detach_rollback(dentry); | 1321 | configfs_detach_rollback(dentry); |
| 1217 | spin_unlock(&configfs_dirent_lock); | 1322 | spin_unlock(&configfs_dirent_lock); |
| 1323 | mutex_unlock(&configfs_symlink_mutex); | ||
| 1324 | |||
| 1325 | if (ret) { | ||
| 1218 | if (ret != -EAGAIN) { | 1326 | if (ret != -EAGAIN) { |
| 1219 | config_item_put(parent_item); | 1327 | config_item_put(parent_item); |
| 1220 | return ret; | 1328 | return ret; |
| @@ -1223,11 +1331,8 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) | |||
| 1223 | /* Wait until the racing operation terminates */ | 1331 | /* Wait until the racing operation terminates */ |
| 1224 | mutex_lock(wait_mutex); | 1332 | mutex_lock(wait_mutex); |
| 1225 | mutex_unlock(wait_mutex); | 1333 | mutex_unlock(wait_mutex); |
| 1226 | |||
| 1227 | spin_lock(&configfs_dirent_lock); | ||
| 1228 | } | 1334 | } |
| 1229 | } while (ret == -EAGAIN); | 1335 | } while (ret == -EAGAIN); |
| 1230 | spin_unlock(&configfs_dirent_lock); | ||
| 1231 | 1336 | ||
| 1232 | /* Get a working ref for the duration of this function */ | 1337 | /* Get a working ref for the duration of this function */ |
| 1233 | item = configfs_get_config_item(dentry); | 1338 | item = configfs_get_config_item(dentry); |
| @@ -1236,7 +1341,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) | |||
| 1236 | config_item_put(parent_item); | 1341 | config_item_put(parent_item); |
| 1237 | 1342 | ||
| 1238 | if (item->ci_type) | 1343 | if (item->ci_type) |
| 1239 | owner = item->ci_type->ct_owner; | 1344 | dead_item_owner = item->ci_type->ct_owner; |
| 1240 | 1345 | ||
| 1241 | if (sd->s_type & CONFIGFS_USET_DIR) { | 1346 | if (sd->s_type & CONFIGFS_USET_DIR) { |
| 1242 | configfs_detach_group(item); | 1347 | configfs_detach_group(item); |
| @@ -1258,7 +1363,8 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) | |||
| 1258 | /* Drop our reference from above */ | 1363 | /* Drop our reference from above */ |
| 1259 | config_item_put(item); | 1364 | config_item_put(item); |
| 1260 | 1365 | ||
| 1261 | module_put(owner); | 1366 | module_put(dead_item_owner); |
| 1367 | module_put(subsys_owner); | ||
| 1262 | 1368 | ||
| 1263 | return 0; | 1369 | return 0; |
| 1264 | } | 1370 | } |
| @@ -1314,13 +1420,24 @@ static int configfs_dir_open(struct inode *inode, struct file *file) | |||
| 1314 | { | 1420 | { |
| 1315 | struct dentry * dentry = file->f_path.dentry; | 1421 | struct dentry * dentry = file->f_path.dentry; |
| 1316 | struct configfs_dirent * parent_sd = dentry->d_fsdata; | 1422 | struct configfs_dirent * parent_sd = dentry->d_fsdata; |
| 1423 | int err; | ||
| 1317 | 1424 | ||
| 1318 | mutex_lock(&dentry->d_inode->i_mutex); | 1425 | mutex_lock(&dentry->d_inode->i_mutex); |
| 1319 | file->private_data = configfs_new_dirent(parent_sd, NULL); | 1426 | /* |
| 1427 | * Fake invisibility if dir belongs to a group/default groups hierarchy | ||
| 1428 | * being attached | ||
| 1429 | */ | ||
| 1430 | err = -ENOENT; | ||
| 1431 | if (configfs_dirent_is_ready(parent_sd)) { | ||
| 1432 | file->private_data = configfs_new_dirent(parent_sd, NULL); | ||
| 1433 | if (IS_ERR(file->private_data)) | ||
| 1434 | err = PTR_ERR(file->private_data); | ||
| 1435 | else | ||
| 1436 | err = 0; | ||
| 1437 | } | ||
| 1320 | mutex_unlock(&dentry->d_inode->i_mutex); | 1438 | mutex_unlock(&dentry->d_inode->i_mutex); |
| 1321 | 1439 | ||
| 1322 | return IS_ERR(file->private_data) ? PTR_ERR(file->private_data) : 0; | 1440 | return err; |
| 1323 | |||
| 1324 | } | 1441 | } |
| 1325 | 1442 | ||
| 1326 | static int configfs_dir_close(struct inode *inode, struct file *file) | 1443 | static int configfs_dir_close(struct inode *inode, struct file *file) |
| @@ -1491,6 +1608,10 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) | |||
| 1491 | if (err) { | 1608 | if (err) { |
| 1492 | d_delete(dentry); | 1609 | d_delete(dentry); |
| 1493 | dput(dentry); | 1610 | dput(dentry); |
| 1611 | } else { | ||
| 1612 | spin_lock(&configfs_dirent_lock); | ||
| 1613 | configfs_dir_set_ready(dentry->d_fsdata); | ||
| 1614 | spin_unlock(&configfs_dirent_lock); | ||
| 1494 | } | 1615 | } |
| 1495 | } | 1616 | } |
| 1496 | 1617 | ||
| @@ -1517,11 +1638,13 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) | |||
| 1517 | mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex, | 1638 | mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex, |
| 1518 | I_MUTEX_PARENT); | 1639 | I_MUTEX_PARENT); |
| 1519 | mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); | 1640 | mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); |
| 1641 | mutex_lock(&configfs_symlink_mutex); | ||
| 1520 | spin_lock(&configfs_dirent_lock); | 1642 | spin_lock(&configfs_dirent_lock); |
| 1521 | if (configfs_detach_prep(dentry, NULL)) { | 1643 | if (configfs_detach_prep(dentry, NULL)) { |
| 1522 | printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n"); | 1644 | printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n"); |
| 1523 | } | 1645 | } |
| 1524 | spin_unlock(&configfs_dirent_lock); | 1646 | spin_unlock(&configfs_dirent_lock); |
| 1647 | mutex_unlock(&configfs_symlink_mutex); | ||
| 1525 | configfs_detach_group(&group->cg_item); | 1648 | configfs_detach_group(&group->cg_item); |
| 1526 | dentry->d_inode->i_flags |= S_DEAD; | 1649 | dentry->d_inode->i_flags |= S_DEAD; |
| 1527 | mutex_unlock(&dentry->d_inode->i_mutex); | 1650 | mutex_unlock(&dentry->d_inode->i_mutex); |
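The dir.c changes above introduce CONFIGFS_USET_CREATING so that lookup(), mkdir() and dir_open() treat a hierarchy that is still being attached as invisible (-ENOENT) until configfs_dir_set_ready() clears the flag under configfs_dirent_lock. A simplified pthread model of that check-under-lock gating; the flag value mirrors the header, everything else is illustrative, and unlike the real configfs_dir_set_ready() this sketch takes the lock itself and does not recurse over children.

#include <pthread.h>
#include <stdbool.h>

#define USET_CREATING 0x0400          /* mirrors CONFIGFS_USET_CREATING */

struct dirent_sketch { int s_type; };

static pthread_mutex_t dirent_lock = PTHREAD_MUTEX_INITIALIZER;

/* Readers (lookup/mkdir/open in the real code) test the flag under the
 * lock, so visibility flips exactly once and never goes back. */
bool dirent_is_ready(struct dirent_sketch *sd)
{
        bool ready;

        pthread_mutex_lock(&dirent_lock);
        ready = !(sd->s_type & USET_CREATING);
        pthread_mutex_unlock(&dirent_lock);
        return ready;
}

/* The attach path clears the flag only once the whole hierarchy exists. */
void dir_set_ready(struct dirent_sketch *sd)
{
        pthread_mutex_lock(&dirent_lock);
        sd->s_type &= ~USET_CREATING;
        pthread_mutex_unlock(&dirent_lock);
}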
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index 0004d18c40ac..bf74973b0492 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c | |||
| @@ -31,6 +31,9 @@ | |||
| 31 | #include <linux/configfs.h> | 31 | #include <linux/configfs.h> |
| 32 | #include "configfs_internal.h" | 32 | #include "configfs_internal.h" |
| 33 | 33 | ||
| 34 | /* Protects attachments of new symlinks */ | ||
| 35 | DEFINE_MUTEX(configfs_symlink_mutex); | ||
| 36 | |||
| 34 | static int item_depth(struct config_item * item) | 37 | static int item_depth(struct config_item * item) |
| 35 | { | 38 | { |
| 36 | struct config_item * p = item; | 39 | struct config_item * p = item; |
| @@ -73,11 +76,20 @@ static int create_link(struct config_item *parent_item, | |||
| 73 | struct configfs_symlink *sl; | 76 | struct configfs_symlink *sl; |
| 74 | int ret; | 77 | int ret; |
| 75 | 78 | ||
| 79 | ret = -ENOENT; | ||
| 80 | if (!configfs_dirent_is_ready(target_sd)) | ||
| 81 | goto out; | ||
| 76 | ret = -ENOMEM; | 82 | ret = -ENOMEM; |
| 77 | sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL); | 83 | sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL); |
| 78 | if (sl) { | 84 | if (sl) { |
| 79 | sl->sl_target = config_item_get(item); | 85 | sl->sl_target = config_item_get(item); |
| 80 | spin_lock(&configfs_dirent_lock); | 86 | spin_lock(&configfs_dirent_lock); |
| 87 | if (target_sd->s_type & CONFIGFS_USET_DROPPING) { | ||
| 88 | spin_unlock(&configfs_dirent_lock); | ||
| 89 | config_item_put(item); | ||
| 90 | kfree(sl); | ||
| 91 | return -ENOENT; | ||
| 92 | } | ||
| 81 | list_add(&sl->sl_list, &target_sd->s_links); | 93 | list_add(&sl->sl_list, &target_sd->s_links); |
| 82 | spin_unlock(&configfs_dirent_lock); | 94 | spin_unlock(&configfs_dirent_lock); |
| 83 | ret = configfs_create_link(sl, parent_item->ci_dentry, | 95 | ret = configfs_create_link(sl, parent_item->ci_dentry, |
| @@ -91,6 +103,7 @@ static int create_link(struct config_item *parent_item, | |||
| 91 | } | 103 | } |
| 92 | } | 104 | } |
| 93 | 105 | ||
| 106 | out: | ||
| 94 | return ret; | 107 | return ret; |
| 95 | } | 108 | } |
| 96 | 109 | ||
| @@ -120,6 +133,7 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna | |||
| 120 | { | 133 | { |
| 121 | int ret; | 134 | int ret; |
| 122 | struct nameidata nd; | 135 | struct nameidata nd; |
| 136 | struct configfs_dirent *sd; | ||
| 123 | struct config_item *parent_item; | 137 | struct config_item *parent_item; |
| 124 | struct config_item *target_item; | 138 | struct config_item *target_item; |
| 125 | struct config_item_type *type; | 139 | struct config_item_type *type; |
| @@ -128,9 +142,19 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna | |||
| 128 | if (dentry->d_parent == configfs_sb->s_root) | 142 | if (dentry->d_parent == configfs_sb->s_root) |
| 129 | goto out; | 143 | goto out; |
| 130 | 144 | ||
| 145 | sd = dentry->d_parent->d_fsdata; | ||
| 146 | /* | ||
| 147 | * Fake invisibility if dir belongs to a group/default groups hierarchy | ||
| 148 | * being attached | ||
| 149 | */ | ||
| 150 | ret = -ENOENT; | ||
| 151 | if (!configfs_dirent_is_ready(sd)) | ||
| 152 | goto out; | ||
| 153 | |||
| 131 | parent_item = configfs_get_config_item(dentry->d_parent); | 154 | parent_item = configfs_get_config_item(dentry->d_parent); |
| 132 | type = parent_item->ci_type; | 155 | type = parent_item->ci_type; |
| 133 | 156 | ||
| 157 | ret = -EPERM; | ||
| 134 | if (!type || !type->ct_item_ops || | 158 | if (!type || !type->ct_item_ops || |
| 135 | !type->ct_item_ops->allow_link) | 159 | !type->ct_item_ops->allow_link) |
| 136 | goto out_put; | 160 | goto out_put; |
| @@ -141,7 +165,9 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna | |||
| 141 | 165 | ||
| 142 | ret = type->ct_item_ops->allow_link(parent_item, target_item); | 166 | ret = type->ct_item_ops->allow_link(parent_item, target_item); |
| 143 | if (!ret) { | 167 | if (!ret) { |
| 168 | mutex_lock(&configfs_symlink_mutex); | ||
| 144 | ret = create_link(parent_item, target_item, dentry); | 169 | ret = create_link(parent_item, target_item, dentry); |
| 170 | mutex_unlock(&configfs_symlink_mutex); | ||
| 145 | if (ret && type->ct_item_ops->drop_link) | 171 | if (ret && type->ct_item_ops->drop_link) |
| 146 | type->ct_item_ops->drop_link(parent_item, | 172 | type->ct_item_ops->drop_link(parent_item, |
| 147 | target_item); | 173 | target_item); |
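create_link() above uses the check / allocate / re-check idiom: the readiness of the target is tested early, the symlink structure is allocated outside any lock, and the CONFIGFS_USET_DROPPING state is re-checked under configfs_dirent_lock, with the allocation and the extra item reference backed out if the target started going away in the meantime. A hedged sketch of that shape (my_target, my_link and the my_* helpers are hypothetical):

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/errno.h>

struct my_target {
	struct list_head links;
	/* plus a reference count and state flags */
};

struct my_link {
	struct list_head list;
	struct my_target *target;
};

/* hypothetical helpers: reference counting and state tests */
struct my_target *my_get(struct my_target *t);
void my_put(struct my_target *t);
int my_is_ready(struct my_target *t);
int my_is_dying(struct my_target *t);	/* call only under my_lock */

static DEFINE_SPINLOCK(my_lock);

static int my_create_link(struct my_target *target)
{
	struct my_link *lnk;

	if (!my_is_ready(target))		/* cheap early test */
		return -ENOENT;

	lnk = kmalloc(sizeof(*lnk), GFP_KERNEL);
	if (!lnk)
		return -ENOMEM;

	lnk->target = my_get(target);		/* pin the target */

	spin_lock(&my_lock);
	if (my_is_dying(target)) {		/* re-check under the lock */
		spin_unlock(&my_lock);
		my_put(target);			/* undo in reverse order */
		kfree(lnk);
		return -ENOENT;
	}
	list_add(&lnk->list, &target->links);
	spin_unlock(&my_lock);

	return 0;
}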
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 0c3b618c15b3..f40423eb1a14 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c | |||
| @@ -43,58 +43,13 @@ static DEFINE_MUTEX(read_mutex); | |||
| 43 | static int cramfs_iget5_test(struct inode *inode, void *opaque) | 43 | static int cramfs_iget5_test(struct inode *inode, void *opaque) |
| 44 | { | 44 | { |
| 45 | struct cramfs_inode *cramfs_inode = opaque; | 45 | struct cramfs_inode *cramfs_inode = opaque; |
| 46 | 46 | return inode->i_ino == CRAMINO(cramfs_inode) && inode->i_ino != 1; | |
| 47 | if (inode->i_ino != CRAMINO(cramfs_inode)) | ||
| 48 | return 0; /* does not match */ | ||
| 49 | |||
| 50 | if (inode->i_ino != 1) | ||
| 51 | return 1; | ||
| 52 | |||
| 53 | /* all empty directories, char, block, pipe, and sock, share inode #1 */ | ||
| 54 | |||
| 55 | if ((inode->i_mode != cramfs_inode->mode) || | ||
| 56 | (inode->i_gid != cramfs_inode->gid) || | ||
| 57 | (inode->i_uid != cramfs_inode->uid)) | ||
| 58 | return 0; /* does not match */ | ||
| 59 | |||
| 60 | if ((S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) && | ||
| 61 | (inode->i_rdev != old_decode_dev(cramfs_inode->size))) | ||
| 62 | return 0; /* does not match */ | ||
| 63 | |||
| 64 | return 1; /* matches */ | ||
| 65 | } | 47 | } |
| 66 | 48 | ||
| 67 | static int cramfs_iget5_set(struct inode *inode, void *opaque) | 49 | static int cramfs_iget5_set(struct inode *inode, void *opaque) |
| 68 | { | 50 | { |
| 69 | static struct timespec zerotime; | ||
| 70 | struct cramfs_inode *cramfs_inode = opaque; | 51 | struct cramfs_inode *cramfs_inode = opaque; |
| 71 | inode->i_mode = cramfs_inode->mode; | ||
| 72 | inode->i_uid = cramfs_inode->uid; | ||
| 73 | inode->i_size = cramfs_inode->size; | ||
| 74 | inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1; | ||
| 75 | inode->i_gid = cramfs_inode->gid; | ||
| 76 | /* Struct copy intentional */ | ||
| 77 | inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime; | ||
| 78 | inode->i_ino = CRAMINO(cramfs_inode); | 52 | inode->i_ino = CRAMINO(cramfs_inode); |
| 79 | /* inode->i_nlink is left 1 - arguably wrong for directories, | ||
| 80 | but it's the best we can do without reading the directory | ||
| 81 | contents. 1 yields the right result in GNU find, even | ||
| 82 | without -noleaf option. */ | ||
| 83 | if (S_ISREG(inode->i_mode)) { | ||
| 84 | inode->i_fop = &generic_ro_fops; | ||
| 85 | inode->i_data.a_ops = &cramfs_aops; | ||
| 86 | } else if (S_ISDIR(inode->i_mode)) { | ||
| 87 | inode->i_op = &cramfs_dir_inode_operations; | ||
| 88 | inode->i_fop = &cramfs_directory_operations; | ||
| 89 | } else if (S_ISLNK(inode->i_mode)) { | ||
| 90 | inode->i_op = &page_symlink_inode_operations; | ||
| 91 | inode->i_data.a_ops = &cramfs_aops; | ||
| 92 | } else { | ||
| 93 | inode->i_size = 0; | ||
| 94 | inode->i_blocks = 0; | ||
| 95 | init_special_inode(inode, inode->i_mode, | ||
| 96 | old_decode_dev(cramfs_inode->size)); | ||
| 97 | } | ||
| 98 | return 0; | 53 | return 0; |
| 99 | } | 54 | } |
| 100 | 55 | ||
| @@ -104,12 +59,48 @@ static struct inode *get_cramfs_inode(struct super_block *sb, | |||
| 104 | struct inode *inode = iget5_locked(sb, CRAMINO(cramfs_inode), | 59 | struct inode *inode = iget5_locked(sb, CRAMINO(cramfs_inode), |
| 105 | cramfs_iget5_test, cramfs_iget5_set, | 60 | cramfs_iget5_test, cramfs_iget5_set, |
| 106 | cramfs_inode); | 61 | cramfs_inode); |
| 62 | static struct timespec zerotime; | ||
| 63 | |||
| 107 | if (inode && (inode->i_state & I_NEW)) { | 64 | if (inode && (inode->i_state & I_NEW)) { |
| 65 | inode->i_mode = cramfs_inode->mode; | ||
| 66 | inode->i_uid = cramfs_inode->uid; | ||
| 67 | inode->i_size = cramfs_inode->size; | ||
| 68 | inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1; | ||
| 69 | inode->i_gid = cramfs_inode->gid; | ||
| 70 | /* Struct copy intentional */ | ||
| 71 | inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime; | ||
| 72 | /* inode->i_nlink is left 1 - arguably wrong for directories, | ||
| 73 | but it's the best we can do without reading the directory | ||
| 74 | contents. 1 yields the right result in GNU find, even | ||
| 75 | without -noleaf option. */ | ||
| 76 | if (S_ISREG(inode->i_mode)) { | ||
| 77 | inode->i_fop = &generic_ro_fops; | ||
| 78 | inode->i_data.a_ops = &cramfs_aops; | ||
| 79 | } else if (S_ISDIR(inode->i_mode)) { | ||
| 80 | inode->i_op = &cramfs_dir_inode_operations; | ||
| 81 | inode->i_fop = &cramfs_directory_operations; | ||
| 82 | } else if (S_ISLNK(inode->i_mode)) { | ||
| 83 | inode->i_op = &page_symlink_inode_operations; | ||
| 84 | inode->i_data.a_ops = &cramfs_aops; | ||
| 85 | } else { | ||
| 86 | inode->i_size = 0; | ||
| 87 | inode->i_blocks = 0; | ||
| 88 | init_special_inode(inode, inode->i_mode, | ||
| 89 | old_decode_dev(cramfs_inode->size)); | ||
| 90 | } | ||
| 108 | unlock_new_inode(inode); | 91 | unlock_new_inode(inode); |
| 109 | } | 92 | } |
| 110 | return inode; | 93 | return inode; |
| 111 | } | 94 | } |
| 112 | 95 | ||
| 96 | static void cramfs_drop_inode(struct inode *inode) | ||
| 97 | { | ||
| 98 | if (inode->i_ino == 1) | ||
| 99 | generic_delete_inode(inode); | ||
| 100 | else | ||
| 101 | generic_drop_inode(inode); | ||
| 102 | } | ||
| 103 | |||
| 113 | /* | 104 | /* |
| 114 | * We have our own block cache: don't fill up the buffer cache | 105 | * We have our own block cache: don't fill up the buffer cache |
| 115 | * with the rom-image, because the way the filesystem is set | 106 | * with the rom-image, because the way the filesystem is set |
| @@ -534,6 +525,7 @@ static const struct super_operations cramfs_ops = { | |||
| 534 | .put_super = cramfs_put_super, | 525 | .put_super = cramfs_put_super, |
| 535 | .remount_fs = cramfs_remount, | 526 | .remount_fs = cramfs_remount, |
| 536 | .statfs = cramfs_statfs, | 527 | .statfs = cramfs_statfs, |
| 528 | .drop_inode = cramfs_drop_inode, | ||
| 537 | }; | 529 | }; |
| 538 | 530 | ||
| 539 | static int cramfs_get_sb(struct file_system_type *fs_type, | 531 | static int cramfs_get_sb(struct file_system_type *fs_type, |
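The cramfs hunk moves all of the inode initialisation out of the iget5 'set' callback and into the I_NEW branch of get_cramfs_inode(), which is the usual shape for iget5_locked() callers: the test/set callbacks only establish identity, and the caller fills in the contents once it knows the inode is freshly allocated. Roughly (my_hash, my_test, my_set and my_fill_inode are hypothetical placeholders for the filesystem-specific parts):

#include <linux/fs.h>

static struct inode *my_iget(struct super_block *sb, void *key)
{
	struct inode *inode;

	inode = iget5_locked(sb, my_hash(key), my_test, my_set, key);
	if (!inode)
		return NULL;

	if (inode->i_state & I_NEW) {
		/* fresh inode: set mode, size, i_op/i_fop, etc. */
		my_fill_inode(inode, key);
		unlock_new_inode(inode);
	}
	return inode;
}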
diff --git a/fs/dcache.c b/fs/dcache.c index 3818d6ab76ca..80e93956aced 100644 --- a/fs/dcache.c +++ b/fs/dcache.c | |||
| @@ -487,6 +487,7 @@ restart: | |||
| 487 | if (!cnt) | 487 | if (!cnt) |
| 488 | break; | 488 | break; |
| 489 | } | 489 | } |
| 490 | cond_resched_lock(&dcache_lock); | ||
| 490 | } | 491 | } |
| 491 | } | 492 | } |
| 492 | while (!list_empty(&tmp)) { | 493 | while (!list_empty(&tmp)) { |
| @@ -1219,6 +1220,107 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) | |||
| 1219 | return new; | 1220 | return new; |
| 1220 | } | 1221 | } |
| 1221 | 1222 | ||
| 1223 | /** | ||
| 1224 | * d_add_ci - lookup or allocate new dentry with case-exact name | ||
| 1225 | * @inode: the inode case-insensitive lookup has found | ||
| 1226 | * @dentry: the negative dentry that was passed to the parent's lookup func | ||
| 1227 | * @name: the case-exact name to be associated with the returned dentry | ||
| 1228 | * | ||
| 1229 | * This is to avoid filling the dcache with case-insensitive names to the | ||
| 1230 | * same inode; only the actual correct case is stored in the dcache for | ||
| 1231 | * case-insensitive filesystems. | ||
| 1232 | * | ||
| 1233 | * For a case-insensitive lookup match and if the case-exact dentry | ||
| 1234 | * already exists in the dcache, use it and return it. | ||
| 1235 | * | ||
| 1236 | * If no entry exists with the exact case name, allocate new dentry with | ||
| 1237 | * the exact case, and return the spliced entry. | ||
| 1238 | */ | ||
| 1239 | struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, | ||
| 1240 | struct qstr *name) | ||
| 1241 | { | ||
| 1242 | int error; | ||
| 1243 | struct dentry *found; | ||
| 1244 | struct dentry *new; | ||
| 1245 | |||
| 1246 | /* Does a dentry matching the name exist already? */ | ||
| 1247 | found = d_hash_and_lookup(dentry->d_parent, name); | ||
| 1248 | /* If not, create it now and return */ | ||
| 1249 | if (!found) { | ||
| 1250 | new = d_alloc(dentry->d_parent, name); | ||
| 1251 | if (!new) { | ||
| 1252 | error = -ENOMEM; | ||
| 1253 | goto err_out; | ||
| 1254 | } | ||
| 1255 | found = d_splice_alias(inode, new); | ||
| 1256 | if (found) { | ||
| 1257 | dput(new); | ||
| 1258 | return found; | ||
| 1259 | } | ||
| 1260 | return new; | ||
| 1261 | } | ||
| 1262 | /* Matching dentry exists, check if it is negative. */ | ||
| 1263 | if (found->d_inode) { | ||
| 1264 | if (unlikely(found->d_inode != inode)) { | ||
| 1265 | /* This can't happen because bad inodes are unhashed. */ | ||
| 1266 | BUG_ON(!is_bad_inode(inode)); | ||
| 1267 | BUG_ON(!is_bad_inode(found->d_inode)); | ||
| 1268 | } | ||
| 1269 | /* | ||
| 1270 | * Already have the inode and the dentry attached, decrement | ||
| 1271 | * the reference count to balance the iget() done | ||
| 1272 | * earlier on. We found the dentry using d_lookup() so it | ||
| 1273 | * cannot be disconnected and thus we do not need to worry | ||
| 1274 | * about any NFS/disconnectedness issues here. | ||
| 1275 | */ | ||
| 1276 | iput(inode); | ||
| 1277 | return found; | ||
| 1278 | } | ||
| 1279 | /* | ||
| 1280 | * Negative dentry: instantiate it unless the inode is a directory and | ||
| 1281 | * has a 'disconnected' dentry (i.e. IS_ROOT and DCACHE_DISCONNECTED), | ||
| 1283 | * in which case we d_move() that one in place of the found dentry. | ||
| 1283 | */ | ||
| 1284 | if (!S_ISDIR(inode->i_mode)) { | ||
| 1285 | /* Not a directory; everything is easy. */ | ||
| 1286 | d_instantiate(found, inode); | ||
| 1287 | return found; | ||
| 1288 | } | ||
| 1289 | spin_lock(&dcache_lock); | ||
| 1290 | if (list_empty(&inode->i_dentry)) { | ||
| 1291 | /* | ||
| 1292 | * Directory without a 'disconnected' dentry; we need to do | ||
| 1293 | * d_instantiate() by hand because it takes dcache_lock which | ||
| 1294 | * we already hold. | ||
| 1295 | */ | ||
| 1296 | list_add(&found->d_alias, &inode->i_dentry); | ||
| 1297 | found->d_inode = inode; | ||
| 1298 | spin_unlock(&dcache_lock); | ||
| 1299 | security_d_instantiate(found, inode); | ||
| 1300 | return found; | ||
| 1301 | } | ||
| 1302 | /* | ||
| 1303 | * Directory with a 'disconnected' dentry; get a reference to the | ||
| 1304 | * 'disconnected' dentry. | ||
| 1305 | */ | ||
| 1306 | new = list_entry(inode->i_dentry.next, struct dentry, d_alias); | ||
| 1307 | dget_locked(new); | ||
| 1308 | spin_unlock(&dcache_lock); | ||
| 1309 | /* Do security voodoo. */ | ||
| 1310 | security_d_instantiate(found, inode); | ||
| 1311 | /* Move new in place of found. */ | ||
| 1312 | d_move(new, found); | ||
| 1313 | /* Balance the iget() we did above. */ | ||
| 1314 | iput(inode); | ||
| 1315 | /* Throw away found. */ | ||
| 1316 | dput(found); | ||
| 1317 | /* Use new as the actual dentry. */ | ||
| 1318 | return new; | ||
| 1319 | |||
| 1320 | err_out: | ||
| 1321 | iput(inode); | ||
| 1322 | return ERR_PTR(error); | ||
| 1323 | } | ||
| 1222 | 1324 | ||
| 1223 | /** | 1325 | /** |
| 1224 | * d_lookup - search for a dentry | 1326 | * d_lookup - search for a dentry |
| @@ -2253,6 +2355,7 @@ EXPORT_SYMBOL(d_path); | |||
| 2253 | EXPORT_SYMBOL(d_prune_aliases); | 2355 | EXPORT_SYMBOL(d_prune_aliases); |
| 2254 | EXPORT_SYMBOL(d_rehash); | 2356 | EXPORT_SYMBOL(d_rehash); |
| 2255 | EXPORT_SYMBOL(d_splice_alias); | 2357 | EXPORT_SYMBOL(d_splice_alias); |
| 2358 | EXPORT_SYMBOL(d_add_ci); | ||
| 2256 | EXPORT_SYMBOL(d_validate); | 2359 | EXPORT_SYMBOL(d_validate); |
| 2257 | EXPORT_SYMBOL(dget_locked); | 2360 | EXPORT_SYMBOL(dget_locked); |
| 2258 | EXPORT_SYMBOL(dput); | 2361 | EXPORT_SYMBOL(dput); |
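d_add_ci() above is intended to be called from a case-insensitive filesystem's ->lookup() once the on-disk, case-exact spelling of the name is known, so that only that spelling ends up hashed in the dcache. A hedged sketch of such a caller, where myfs_find_ci() is a hypothetical helper that searches the directory case-insensitively, returns the inode (with a reference held, NULL if absent, ERR_PTR on error) and reports the exact name it matched:

#include <linux/fs.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/err.h>

static struct dentry *myfs_lookup(struct inode *dir, struct dentry *dentry,
				  struct nameidata *nd)
{
	struct inode *inode;
	struct qstr exact;

	inode = myfs_find_ci(dir, &dentry->d_name, &exact);
	if (IS_ERR(inode))
		return ERR_PTR(PTR_ERR(inode));
	if (!inode) {
		d_add(dentry, NULL);	/* cache the negative result */
		return NULL;
	}

	/*
	 * Hash only the case-exact spelling; d_add_ci() consumes the
	 * inode reference on both the success and the error paths.
	 */
	return d_add_ci(dentry, inode, &exact);
}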
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 285b64a8b06e..488eb424f662 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c | |||
| @@ -29,7 +29,7 @@ | |||
| 29 | #define DEVPTS_DEFAULT_MODE 0600 | 29 | #define DEVPTS_DEFAULT_MODE 0600 |
| 30 | 30 | ||
| 31 | extern int pty_limit; /* Config limit on Unix98 ptys */ | 31 | extern int pty_limit; /* Config limit on Unix98 ptys */ |
| 32 | static DEFINE_IDR(allocated_ptys); | 32 | static DEFINE_IDA(allocated_ptys); |
| 33 | static DEFINE_MUTEX(allocated_ptys_lock); | 33 | static DEFINE_MUTEX(allocated_ptys_lock); |
| 34 | 34 | ||
| 35 | static struct vfsmount *devpts_mnt; | 35 | static struct vfsmount *devpts_mnt; |
| @@ -180,24 +180,24 @@ static struct dentry *get_node(int num) | |||
| 180 | int devpts_new_index(void) | 180 | int devpts_new_index(void) |
| 181 | { | 181 | { |
| 182 | int index; | 182 | int index; |
| 183 | int idr_ret; | 183 | int ida_ret; |
| 184 | 184 | ||
| 185 | retry: | 185 | retry: |
| 186 | if (!idr_pre_get(&allocated_ptys, GFP_KERNEL)) { | 186 | if (!ida_pre_get(&allocated_ptys, GFP_KERNEL)) { |
| 187 | return -ENOMEM; | 187 | return -ENOMEM; |
| 188 | } | 188 | } |
| 189 | 189 | ||
| 190 | mutex_lock(&allocated_ptys_lock); | 190 | mutex_lock(&allocated_ptys_lock); |
| 191 | idr_ret = idr_get_new(&allocated_ptys, NULL, &index); | 191 | ida_ret = ida_get_new(&allocated_ptys, &index); |
| 192 | if (idr_ret < 0) { | 192 | if (ida_ret < 0) { |
| 193 | mutex_unlock(&allocated_ptys_lock); | 193 | mutex_unlock(&allocated_ptys_lock); |
| 194 | if (idr_ret == -EAGAIN) | 194 | if (ida_ret == -EAGAIN) |
| 195 | goto retry; | 195 | goto retry; |
| 196 | return -EIO; | 196 | return -EIO; |
| 197 | } | 197 | } |
| 198 | 198 | ||
| 199 | if (index >= pty_limit) { | 199 | if (index >= pty_limit) { |
| 200 | idr_remove(&allocated_ptys, index); | 200 | ida_remove(&allocated_ptys, index); |
| 201 | mutex_unlock(&allocated_ptys_lock); | 201 | mutex_unlock(&allocated_ptys_lock); |
| 202 | return -EIO; | 202 | return -EIO; |
| 203 | } | 203 | } |
| @@ -208,7 +208,7 @@ retry: | |||
| 208 | void devpts_kill_index(int idx) | 208 | void devpts_kill_index(int idx) |
| 209 | { | 209 | { |
| 210 | mutex_lock(&allocated_ptys_lock); | 210 | mutex_lock(&allocated_ptys_lock); |
| 211 | idr_remove(&allocated_ptys, idx); | 211 | ida_remove(&allocated_ptys, idx); |
| 212 | mutex_unlock(&allocated_ptys_lock); | 212 | mutex_unlock(&allocated_ptys_lock); |
| 213 | } | 213 | } |
| 214 | 214 | ||
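The devpts change swaps the IDR (an id-to-pointer map) for the lighter IDA, which only hands out integer ids; the retry loop is the standard shape for the ida_pre_get()/ida_get_new() API of this era, where -EAGAIN means the preallocated layer nodes ran out. That allocation/free pattern in isolation, with a hypothetical my_ida/my_mutex pair:

#include <linux/idr.h>
#include <linux/mutex.h>
#include <linux/errno.h>

static DEFINE_IDA(my_ida);
static DEFINE_MUTEX(my_mutex);

static int my_alloc_index(void)
{
	int index, ret;

retry:
	if (!ida_pre_get(&my_ida, GFP_KERNEL))
		return -ENOMEM;

	mutex_lock(&my_mutex);
	ret = ida_get_new(&my_ida, &index);
	mutex_unlock(&my_mutex);

	if (ret == -EAGAIN)	/* preallocation was consumed, try again */
		goto retry;
	if (ret < 0)
		return ret;

	return index;
}

static void my_free_index(int index)
{
	mutex_lock(&my_mutex);
	ida_remove(&my_ida, index);
	mutex_unlock(&my_mutex);
}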
diff --git a/fs/direct-io.c b/fs/direct-io.c index 9e81addbd6ea..9606ee848fd8 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c | |||
| @@ -150,17 +150,11 @@ static int dio_refill_pages(struct dio *dio) | |||
| 150 | int nr_pages; | 150 | int nr_pages; |
| 151 | 151 | ||
| 152 | nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES); | 152 | nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES); |
| 153 | down_read(¤t->mm->mmap_sem); | 153 | ret = get_user_pages_fast( |
| 154 | ret = get_user_pages( | ||
| 155 | current, /* Task for fault acounting */ | ||
| 156 | current->mm, /* whose pages? */ | ||
| 157 | dio->curr_user_address, /* Where from? */ | 154 | dio->curr_user_address, /* Where from? */ |
| 158 | nr_pages, /* How many pages? */ | 155 | nr_pages, /* How many pages? */ |
| 159 | dio->rw == READ, /* Write to memory? */ | 156 | dio->rw == READ, /* Write to memory? */ |
| 160 | 0, /* force (?) */ | 157 | &dio->pages[0]); /* Put results here */ |
| 161 | &dio->pages[0], | ||
| 162 | NULL); /* vmas */ | ||
| 163 | up_read(¤t->mm->mmap_sem); | ||
| 164 | 158 | ||
| 165 | if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) { | 159 | if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) { |
| 166 | struct page *page = ZERO_PAGE(0); | 160 | struct page *page = ZERO_PAGE(0); |
diff --git a/fs/dlm/config.c b/fs/dlm/config.c index c4e7d721bd8d..89d2fb7b991a 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c | |||
| @@ -2,7 +2,7 @@ | |||
| 2 | ******************************************************************************* | 2 | ******************************************************************************* |
| 3 | ** | 3 | ** |
| 4 | ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | 4 | ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. |
| 5 | ** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. | 5 | ** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. |
| 6 | ** | 6 | ** |
| 7 | ** This copyrighted material is made available to anyone wishing to use, | 7 | ** This copyrighted material is made available to anyone wishing to use, |
| 8 | ** modify, copy, or redistribute it subject to the terms and conditions | 8 | ** modify, copy, or redistribute it subject to the terms and conditions |
| @@ -30,16 +30,16 @@ | |||
| 30 | 30 | ||
| 31 | static struct config_group *space_list; | 31 | static struct config_group *space_list; |
| 32 | static struct config_group *comm_list; | 32 | static struct config_group *comm_list; |
| 33 | static struct comm *local_comm; | 33 | static struct dlm_comm *local_comm; |
| 34 | 34 | ||
| 35 | struct clusters; | 35 | struct dlm_clusters; |
| 36 | struct cluster; | 36 | struct dlm_cluster; |
| 37 | struct spaces; | 37 | struct dlm_spaces; |
| 38 | struct space; | 38 | struct dlm_space; |
| 39 | struct comms; | 39 | struct dlm_comms; |
| 40 | struct comm; | 40 | struct dlm_comm; |
| 41 | struct nodes; | 41 | struct dlm_nodes; |
| 42 | struct node; | 42 | struct dlm_node; |
| 43 | 43 | ||
| 44 | static struct config_group *make_cluster(struct config_group *, const char *); | 44 | static struct config_group *make_cluster(struct config_group *, const char *); |
| 45 | static void drop_cluster(struct config_group *, struct config_item *); | 45 | static void drop_cluster(struct config_group *, struct config_item *); |
| @@ -68,17 +68,22 @@ static ssize_t show_node(struct config_item *i, struct configfs_attribute *a, | |||
| 68 | static ssize_t store_node(struct config_item *i, struct configfs_attribute *a, | 68 | static ssize_t store_node(struct config_item *i, struct configfs_attribute *a, |
| 69 | const char *buf, size_t len); | 69 | const char *buf, size_t len); |
| 70 | 70 | ||
| 71 | static ssize_t comm_nodeid_read(struct comm *cm, char *buf); | 71 | static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf); |
| 72 | static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len); | 72 | static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf, |
| 73 | static ssize_t comm_local_read(struct comm *cm, char *buf); | 73 | size_t len); |
| 74 | static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len); | 74 | static ssize_t comm_local_read(struct dlm_comm *cm, char *buf); |
| 75 | static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len); | 75 | static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf, |
| 76 | static ssize_t node_nodeid_read(struct node *nd, char *buf); | 76 | size_t len); |
| 77 | static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len); | 77 | static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, |
| 78 | static ssize_t node_weight_read(struct node *nd, char *buf); | 78 | size_t len); |
| 79 | static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len); | 79 | static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf); |
| 80 | 80 | static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf, | |
| 81 | struct cluster { | 81 | size_t len); |
| 82 | static ssize_t node_weight_read(struct dlm_node *nd, char *buf); | ||
| 83 | static ssize_t node_weight_write(struct dlm_node *nd, const char *buf, | ||
| 84 | size_t len); | ||
| 85 | |||
| 86 | struct dlm_cluster { | ||
| 82 | struct config_group group; | 87 | struct config_group group; |
| 83 | unsigned int cl_tcp_port; | 88 | unsigned int cl_tcp_port; |
| 84 | unsigned int cl_buffer_size; | 89 | unsigned int cl_buffer_size; |
| @@ -109,11 +114,11 @@ enum { | |||
| 109 | 114 | ||
| 110 | struct cluster_attribute { | 115 | struct cluster_attribute { |
| 111 | struct configfs_attribute attr; | 116 | struct configfs_attribute attr; |
| 112 | ssize_t (*show)(struct cluster *, char *); | 117 | ssize_t (*show)(struct dlm_cluster *, char *); |
| 113 | ssize_t (*store)(struct cluster *, const char *, size_t); | 118 | ssize_t (*store)(struct dlm_cluster *, const char *, size_t); |
| 114 | }; | 119 | }; |
| 115 | 120 | ||
| 116 | static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field, | 121 | static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, |
| 117 | int *info_field, int check_zero, | 122 | int *info_field, int check_zero, |
| 118 | const char *buf, size_t len) | 123 | const char *buf, size_t len) |
| 119 | { | 124 | { |
| @@ -134,12 +139,12 @@ static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field, | |||
| 134 | } | 139 | } |
| 135 | 140 | ||
| 136 | #define CLUSTER_ATTR(name, check_zero) \ | 141 | #define CLUSTER_ATTR(name, check_zero) \ |
| 137 | static ssize_t name##_write(struct cluster *cl, const char *buf, size_t len) \ | 142 | static ssize_t name##_write(struct dlm_cluster *cl, const char *buf, size_t len) \ |
| 138 | { \ | 143 | { \ |
| 139 | return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name, \ | 144 | return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name, \ |
| 140 | check_zero, buf, len); \ | 145 | check_zero, buf, len); \ |
| 141 | } \ | 146 | } \ |
| 142 | static ssize_t name##_read(struct cluster *cl, char *buf) \ | 147 | static ssize_t name##_read(struct dlm_cluster *cl, char *buf) \ |
| 143 | { \ | 148 | { \ |
| 144 | return snprintf(buf, PAGE_SIZE, "%u\n", cl->cl_##name); \ | 149 | return snprintf(buf, PAGE_SIZE, "%u\n", cl->cl_##name); \ |
| 145 | } \ | 150 | } \ |
| @@ -181,8 +186,8 @@ enum { | |||
| 181 | 186 | ||
| 182 | struct comm_attribute { | 187 | struct comm_attribute { |
| 183 | struct configfs_attribute attr; | 188 | struct configfs_attribute attr; |
| 184 | ssize_t (*show)(struct comm *, char *); | 189 | ssize_t (*show)(struct dlm_comm *, char *); |
| 185 | ssize_t (*store)(struct comm *, const char *, size_t); | 190 | ssize_t (*store)(struct dlm_comm *, const char *, size_t); |
| 186 | }; | 191 | }; |
| 187 | 192 | ||
| 188 | static struct comm_attribute comm_attr_nodeid = { | 193 | static struct comm_attribute comm_attr_nodeid = { |
| @@ -222,8 +227,8 @@ enum { | |||
| 222 | 227 | ||
| 223 | struct node_attribute { | 228 | struct node_attribute { |
| 224 | struct configfs_attribute attr; | 229 | struct configfs_attribute attr; |
| 225 | ssize_t (*show)(struct node *, char *); | 230 | ssize_t (*show)(struct dlm_node *, char *); |
| 226 | ssize_t (*store)(struct node *, const char *, size_t); | 231 | ssize_t (*store)(struct dlm_node *, const char *, size_t); |
| 227 | }; | 232 | }; |
| 228 | 233 | ||
| 229 | static struct node_attribute node_attr_nodeid = { | 234 | static struct node_attribute node_attr_nodeid = { |
| @@ -248,26 +253,26 @@ static struct configfs_attribute *node_attrs[] = { | |||
| 248 | NULL, | 253 | NULL, |
| 249 | }; | 254 | }; |
| 250 | 255 | ||
| 251 | struct clusters { | 256 | struct dlm_clusters { |
| 252 | struct configfs_subsystem subsys; | 257 | struct configfs_subsystem subsys; |
| 253 | }; | 258 | }; |
| 254 | 259 | ||
| 255 | struct spaces { | 260 | struct dlm_spaces { |
| 256 | struct config_group ss_group; | 261 | struct config_group ss_group; |
| 257 | }; | 262 | }; |
| 258 | 263 | ||
| 259 | struct space { | 264 | struct dlm_space { |
| 260 | struct config_group group; | 265 | struct config_group group; |
| 261 | struct list_head members; | 266 | struct list_head members; |
| 262 | struct mutex members_lock; | 267 | struct mutex members_lock; |
| 263 | int members_count; | 268 | int members_count; |
| 264 | }; | 269 | }; |
| 265 | 270 | ||
| 266 | struct comms { | 271 | struct dlm_comms { |
| 267 | struct config_group cs_group; | 272 | struct config_group cs_group; |
| 268 | }; | 273 | }; |
| 269 | 274 | ||
| 270 | struct comm { | 275 | struct dlm_comm { |
| 271 | struct config_item item; | 276 | struct config_item item; |
| 272 | int nodeid; | 277 | int nodeid; |
| 273 | int local; | 278 | int local; |
| @@ -275,11 +280,11 @@ struct comm { | |||
| 275 | struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; | 280 | struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; |
| 276 | }; | 281 | }; |
| 277 | 282 | ||
| 278 | struct nodes { | 283 | struct dlm_nodes { |
| 279 | struct config_group ns_group; | 284 | struct config_group ns_group; |
| 280 | }; | 285 | }; |
| 281 | 286 | ||
| 282 | struct node { | 287 | struct dlm_node { |
| 283 | struct config_item item; | 288 | struct config_item item; |
| 284 | struct list_head list; /* space->members */ | 289 | struct list_head list; /* space->members */ |
| 285 | int nodeid; | 290 | int nodeid; |
| @@ -372,38 +377,40 @@ static struct config_item_type node_type = { | |||
| 372 | .ct_owner = THIS_MODULE, | 377 | .ct_owner = THIS_MODULE, |
| 373 | }; | 378 | }; |
| 374 | 379 | ||
| 375 | static struct cluster *to_cluster(struct config_item *i) | 380 | static struct dlm_cluster *to_cluster(struct config_item *i) |
| 376 | { | 381 | { |
| 377 | return i ? container_of(to_config_group(i), struct cluster, group):NULL; | 382 | return i ? container_of(to_config_group(i), struct dlm_cluster, group) : |
| 383 | NULL; | ||
| 378 | } | 384 | } |
| 379 | 385 | ||
| 380 | static struct space *to_space(struct config_item *i) | 386 | static struct dlm_space *to_space(struct config_item *i) |
| 381 | { | 387 | { |
| 382 | return i ? container_of(to_config_group(i), struct space, group) : NULL; | 388 | return i ? container_of(to_config_group(i), struct dlm_space, group) : |
| 389 | NULL; | ||
| 383 | } | 390 | } |
| 384 | 391 | ||
| 385 | static struct comm *to_comm(struct config_item *i) | 392 | static struct dlm_comm *to_comm(struct config_item *i) |
| 386 | { | 393 | { |
| 387 | return i ? container_of(i, struct comm, item) : NULL; | 394 | return i ? container_of(i, struct dlm_comm, item) : NULL; |
| 388 | } | 395 | } |
| 389 | 396 | ||
| 390 | static struct node *to_node(struct config_item *i) | 397 | static struct dlm_node *to_node(struct config_item *i) |
| 391 | { | 398 | { |
| 392 | return i ? container_of(i, struct node, item) : NULL; | 399 | return i ? container_of(i, struct dlm_node, item) : NULL; |
| 393 | } | 400 | } |
| 394 | 401 | ||
| 395 | static struct config_group *make_cluster(struct config_group *g, | 402 | static struct config_group *make_cluster(struct config_group *g, |
| 396 | const char *name) | 403 | const char *name) |
| 397 | { | 404 | { |
| 398 | struct cluster *cl = NULL; | 405 | struct dlm_cluster *cl = NULL; |
| 399 | struct spaces *sps = NULL; | 406 | struct dlm_spaces *sps = NULL; |
| 400 | struct comms *cms = NULL; | 407 | struct dlm_comms *cms = NULL; |
| 401 | void *gps = NULL; | 408 | void *gps = NULL; |
| 402 | 409 | ||
| 403 | cl = kzalloc(sizeof(struct cluster), GFP_KERNEL); | 410 | cl = kzalloc(sizeof(struct dlm_cluster), GFP_KERNEL); |
| 404 | gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL); | 411 | gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL); |
| 405 | sps = kzalloc(sizeof(struct spaces), GFP_KERNEL); | 412 | sps = kzalloc(sizeof(struct dlm_spaces), GFP_KERNEL); |
| 406 | cms = kzalloc(sizeof(struct comms), GFP_KERNEL); | 413 | cms = kzalloc(sizeof(struct dlm_comms), GFP_KERNEL); |
| 407 | 414 | ||
| 408 | if (!cl || !gps || !sps || !cms) | 415 | if (!cl || !gps || !sps || !cms) |
| 409 | goto fail; | 416 | goto fail; |
| @@ -443,7 +450,7 @@ static struct config_group *make_cluster(struct config_group *g, | |||
| 443 | 450 | ||
| 444 | static void drop_cluster(struct config_group *g, struct config_item *i) | 451 | static void drop_cluster(struct config_group *g, struct config_item *i) |
| 445 | { | 452 | { |
| 446 | struct cluster *cl = to_cluster(i); | 453 | struct dlm_cluster *cl = to_cluster(i); |
| 447 | struct config_item *tmp; | 454 | struct config_item *tmp; |
| 448 | int j; | 455 | int j; |
| 449 | 456 | ||
| @@ -461,20 +468,20 @@ static void drop_cluster(struct config_group *g, struct config_item *i) | |||
| 461 | 468 | ||
| 462 | static void release_cluster(struct config_item *i) | 469 | static void release_cluster(struct config_item *i) |
| 463 | { | 470 | { |
| 464 | struct cluster *cl = to_cluster(i); | 471 | struct dlm_cluster *cl = to_cluster(i); |
| 465 | kfree(cl->group.default_groups); | 472 | kfree(cl->group.default_groups); |
| 466 | kfree(cl); | 473 | kfree(cl); |
| 467 | } | 474 | } |
| 468 | 475 | ||
| 469 | static struct config_group *make_space(struct config_group *g, const char *name) | 476 | static struct config_group *make_space(struct config_group *g, const char *name) |
| 470 | { | 477 | { |
| 471 | struct space *sp = NULL; | 478 | struct dlm_space *sp = NULL; |
| 472 | struct nodes *nds = NULL; | 479 | struct dlm_nodes *nds = NULL; |
| 473 | void *gps = NULL; | 480 | void *gps = NULL; |
| 474 | 481 | ||
| 475 | sp = kzalloc(sizeof(struct space), GFP_KERNEL); | 482 | sp = kzalloc(sizeof(struct dlm_space), GFP_KERNEL); |
| 476 | gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL); | 483 | gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL); |
| 477 | nds = kzalloc(sizeof(struct nodes), GFP_KERNEL); | 484 | nds = kzalloc(sizeof(struct dlm_nodes), GFP_KERNEL); |
| 478 | 485 | ||
| 479 | if (!sp || !gps || !nds) | 486 | if (!sp || !gps || !nds) |
| 480 | goto fail; | 487 | goto fail; |
| @@ -500,7 +507,7 @@ static struct config_group *make_space(struct config_group *g, const char *name) | |||
| 500 | 507 | ||
| 501 | static void drop_space(struct config_group *g, struct config_item *i) | 508 | static void drop_space(struct config_group *g, struct config_item *i) |
| 502 | { | 509 | { |
| 503 | struct space *sp = to_space(i); | 510 | struct dlm_space *sp = to_space(i); |
| 504 | struct config_item *tmp; | 511 | struct config_item *tmp; |
| 505 | int j; | 512 | int j; |
| 506 | 513 | ||
| @@ -517,16 +524,16 @@ static void drop_space(struct config_group *g, struct config_item *i) | |||
| 517 | 524 | ||
| 518 | static void release_space(struct config_item *i) | 525 | static void release_space(struct config_item *i) |
| 519 | { | 526 | { |
| 520 | struct space *sp = to_space(i); | 527 | struct dlm_space *sp = to_space(i); |
| 521 | kfree(sp->group.default_groups); | 528 | kfree(sp->group.default_groups); |
| 522 | kfree(sp); | 529 | kfree(sp); |
| 523 | } | 530 | } |
| 524 | 531 | ||
| 525 | static struct config_item *make_comm(struct config_group *g, const char *name) | 532 | static struct config_item *make_comm(struct config_group *g, const char *name) |
| 526 | { | 533 | { |
| 527 | struct comm *cm; | 534 | struct dlm_comm *cm; |
| 528 | 535 | ||
| 529 | cm = kzalloc(sizeof(struct comm), GFP_KERNEL); | 536 | cm = kzalloc(sizeof(struct dlm_comm), GFP_KERNEL); |
| 530 | if (!cm) | 537 | if (!cm) |
| 531 | return ERR_PTR(-ENOMEM); | 538 | return ERR_PTR(-ENOMEM); |
| 532 | 539 | ||
| @@ -539,7 +546,7 @@ static struct config_item *make_comm(struct config_group *g, const char *name) | |||
| 539 | 546 | ||
| 540 | static void drop_comm(struct config_group *g, struct config_item *i) | 547 | static void drop_comm(struct config_group *g, struct config_item *i) |
| 541 | { | 548 | { |
| 542 | struct comm *cm = to_comm(i); | 549 | struct dlm_comm *cm = to_comm(i); |
| 543 | if (local_comm == cm) | 550 | if (local_comm == cm) |
| 544 | local_comm = NULL; | 551 | local_comm = NULL; |
| 545 | dlm_lowcomms_close(cm->nodeid); | 552 | dlm_lowcomms_close(cm->nodeid); |
| @@ -550,16 +557,16 @@ static void drop_comm(struct config_group *g, struct config_item *i) | |||
| 550 | 557 | ||
| 551 | static void release_comm(struct config_item *i) | 558 | static void release_comm(struct config_item *i) |
| 552 | { | 559 | { |
| 553 | struct comm *cm = to_comm(i); | 560 | struct dlm_comm *cm = to_comm(i); |
| 554 | kfree(cm); | 561 | kfree(cm); |
| 555 | } | 562 | } |
| 556 | 563 | ||
| 557 | static struct config_item *make_node(struct config_group *g, const char *name) | 564 | static struct config_item *make_node(struct config_group *g, const char *name) |
| 558 | { | 565 | { |
| 559 | struct space *sp = to_space(g->cg_item.ci_parent); | 566 | struct dlm_space *sp = to_space(g->cg_item.ci_parent); |
| 560 | struct node *nd; | 567 | struct dlm_node *nd; |
| 561 | 568 | ||
| 562 | nd = kzalloc(sizeof(struct node), GFP_KERNEL); | 569 | nd = kzalloc(sizeof(struct dlm_node), GFP_KERNEL); |
| 563 | if (!nd) | 570 | if (!nd) |
| 564 | return ERR_PTR(-ENOMEM); | 571 | return ERR_PTR(-ENOMEM); |
| 565 | 572 | ||
| @@ -578,8 +585,8 @@ static struct config_item *make_node(struct config_group *g, const char *name) | |||
| 578 | 585 | ||
| 579 | static void drop_node(struct config_group *g, struct config_item *i) | 586 | static void drop_node(struct config_group *g, struct config_item *i) |
| 580 | { | 587 | { |
| 581 | struct space *sp = to_space(g->cg_item.ci_parent); | 588 | struct dlm_space *sp = to_space(g->cg_item.ci_parent); |
| 582 | struct node *nd = to_node(i); | 589 | struct dlm_node *nd = to_node(i); |
| 583 | 590 | ||
| 584 | mutex_lock(&sp->members_lock); | 591 | mutex_lock(&sp->members_lock); |
| 585 | list_del(&nd->list); | 592 | list_del(&nd->list); |
| @@ -591,11 +598,11 @@ static void drop_node(struct config_group *g, struct config_item *i) | |||
| 591 | 598 | ||
| 592 | static void release_node(struct config_item *i) | 599 | static void release_node(struct config_item *i) |
| 593 | { | 600 | { |
| 594 | struct node *nd = to_node(i); | 601 | struct dlm_node *nd = to_node(i); |
| 595 | kfree(nd); | 602 | kfree(nd); |
| 596 | } | 603 | } |
| 597 | 604 | ||
| 598 | static struct clusters clusters_root = { | 605 | static struct dlm_clusters clusters_root = { |
| 599 | .subsys = { | 606 | .subsys = { |
| 600 | .su_group = { | 607 | .su_group = { |
| 601 | .cg_item = { | 608 | .cg_item = { |
| @@ -625,7 +632,7 @@ void dlm_config_exit(void) | |||
| 625 | static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a, | 632 | static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a, |
| 626 | char *buf) | 633 | char *buf) |
| 627 | { | 634 | { |
| 628 | struct cluster *cl = to_cluster(i); | 635 | struct dlm_cluster *cl = to_cluster(i); |
| 629 | struct cluster_attribute *cla = | 636 | struct cluster_attribute *cla = |
| 630 | container_of(a, struct cluster_attribute, attr); | 637 | container_of(a, struct cluster_attribute, attr); |
| 631 | return cla->show ? cla->show(cl, buf) : 0; | 638 | return cla->show ? cla->show(cl, buf) : 0; |
| @@ -635,7 +642,7 @@ static ssize_t store_cluster(struct config_item *i, | |||
| 635 | struct configfs_attribute *a, | 642 | struct configfs_attribute *a, |
| 636 | const char *buf, size_t len) | 643 | const char *buf, size_t len) |
| 637 | { | 644 | { |
| 638 | struct cluster *cl = to_cluster(i); | 645 | struct dlm_cluster *cl = to_cluster(i); |
| 639 | struct cluster_attribute *cla = | 646 | struct cluster_attribute *cla = |
| 640 | container_of(a, struct cluster_attribute, attr); | 647 | container_of(a, struct cluster_attribute, attr); |
| 641 | return cla->store ? cla->store(cl, buf, len) : -EINVAL; | 648 | return cla->store ? cla->store(cl, buf, len) : -EINVAL; |
| @@ -644,7 +651,7 @@ static ssize_t store_cluster(struct config_item *i, | |||
| 644 | static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a, | 651 | static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a, |
| 645 | char *buf) | 652 | char *buf) |
| 646 | { | 653 | { |
| 647 | struct comm *cm = to_comm(i); | 654 | struct dlm_comm *cm = to_comm(i); |
| 648 | struct comm_attribute *cma = | 655 | struct comm_attribute *cma = |
| 649 | container_of(a, struct comm_attribute, attr); | 656 | container_of(a, struct comm_attribute, attr); |
| 650 | return cma->show ? cma->show(cm, buf) : 0; | 657 | return cma->show ? cma->show(cm, buf) : 0; |
| @@ -653,29 +660,31 @@ static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a, | |||
| 653 | static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a, | 660 | static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a, |
| 654 | const char *buf, size_t len) | 661 | const char *buf, size_t len) |
| 655 | { | 662 | { |
| 656 | struct comm *cm = to_comm(i); | 663 | struct dlm_comm *cm = to_comm(i); |
| 657 | struct comm_attribute *cma = | 664 | struct comm_attribute *cma = |
| 658 | container_of(a, struct comm_attribute, attr); | 665 | container_of(a, struct comm_attribute, attr); |
| 659 | return cma->store ? cma->store(cm, buf, len) : -EINVAL; | 666 | return cma->store ? cma->store(cm, buf, len) : -EINVAL; |
| 660 | } | 667 | } |
| 661 | 668 | ||
| 662 | static ssize_t comm_nodeid_read(struct comm *cm, char *buf) | 669 | static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf) |
| 663 | { | 670 | { |
| 664 | return sprintf(buf, "%d\n", cm->nodeid); | 671 | return sprintf(buf, "%d\n", cm->nodeid); |
| 665 | } | 672 | } |
| 666 | 673 | ||
| 667 | static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len) | 674 | static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf, |
| 675 | size_t len) | ||
| 668 | { | 676 | { |
| 669 | cm->nodeid = simple_strtol(buf, NULL, 0); | 677 | cm->nodeid = simple_strtol(buf, NULL, 0); |
| 670 | return len; | 678 | return len; |
| 671 | } | 679 | } |
| 672 | 680 | ||
| 673 | static ssize_t comm_local_read(struct comm *cm, char *buf) | 681 | static ssize_t comm_local_read(struct dlm_comm *cm, char *buf) |
| 674 | { | 682 | { |
| 675 | return sprintf(buf, "%d\n", cm->local); | 683 | return sprintf(buf, "%d\n", cm->local); |
| 676 | } | 684 | } |
| 677 | 685 | ||
| 678 | static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len) | 686 | static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf, |
| 687 | size_t len) | ||
| 679 | { | 688 | { |
| 680 | cm->local= simple_strtol(buf, NULL, 0); | 689 | cm->local= simple_strtol(buf, NULL, 0); |
| 681 | if (cm->local && !local_comm) | 690 | if (cm->local && !local_comm) |
| @@ -683,7 +692,7 @@ static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len) | |||
| 683 | return len; | 692 | return len; |
| 684 | } | 693 | } |
| 685 | 694 | ||
| 686 | static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len) | 695 | static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len) |
| 687 | { | 696 | { |
| 688 | struct sockaddr_storage *addr; | 697 | struct sockaddr_storage *addr; |
| 689 | 698 | ||
| @@ -705,7 +714,7 @@ static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len) | |||
| 705 | static ssize_t show_node(struct config_item *i, struct configfs_attribute *a, | 714 | static ssize_t show_node(struct config_item *i, struct configfs_attribute *a, |
| 706 | char *buf) | 715 | char *buf) |
| 707 | { | 716 | { |
| 708 | struct node *nd = to_node(i); | 717 | struct dlm_node *nd = to_node(i); |
| 709 | struct node_attribute *nda = | 718 | struct node_attribute *nda = |
| 710 | container_of(a, struct node_attribute, attr); | 719 | container_of(a, struct node_attribute, attr); |
| 711 | return nda->show ? nda->show(nd, buf) : 0; | 720 | return nda->show ? nda->show(nd, buf) : 0; |
| @@ -714,29 +723,31 @@ static ssize_t show_node(struct config_item *i, struct configfs_attribute *a, | |||
| 714 | static ssize_t store_node(struct config_item *i, struct configfs_attribute *a, | 723 | static ssize_t store_node(struct config_item *i, struct configfs_attribute *a, |
| 715 | const char *buf, size_t len) | 724 | const char *buf, size_t len) |
| 716 | { | 725 | { |
| 717 | struct node *nd = to_node(i); | 726 | struct dlm_node *nd = to_node(i); |
| 718 | struct node_attribute *nda = | 727 | struct node_attribute *nda = |
| 719 | container_of(a, struct node_attribute, attr); | 728 | container_of(a, struct node_attribute, attr); |
| 720 | return nda->store ? nda->store(nd, buf, len) : -EINVAL; | 729 | return nda->store ? nda->store(nd, buf, len) : -EINVAL; |
| 721 | } | 730 | } |
| 722 | 731 | ||
| 723 | static ssize_t node_nodeid_read(struct node *nd, char *buf) | 732 | static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf) |
| 724 | { | 733 | { |
| 725 | return sprintf(buf, "%d\n", nd->nodeid); | 734 | return sprintf(buf, "%d\n", nd->nodeid); |
| 726 | } | 735 | } |
| 727 | 736 | ||
| 728 | static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len) | 737 | static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf, |
| 738 | size_t len) | ||
| 729 | { | 739 | { |
| 730 | nd->nodeid = simple_strtol(buf, NULL, 0); | 740 | nd->nodeid = simple_strtol(buf, NULL, 0); |
| 731 | return len; | 741 | return len; |
| 732 | } | 742 | } |
| 733 | 743 | ||
| 734 | static ssize_t node_weight_read(struct node *nd, char *buf) | 744 | static ssize_t node_weight_read(struct dlm_node *nd, char *buf) |
| 735 | { | 745 | { |
| 736 | return sprintf(buf, "%d\n", nd->weight); | 746 | return sprintf(buf, "%d\n", nd->weight); |
| 737 | } | 747 | } |
| 738 | 748 | ||
| 739 | static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len) | 749 | static ssize_t node_weight_write(struct dlm_node *nd, const char *buf, |
| 750 | size_t len) | ||
| 740 | { | 751 | { |
| 741 | nd->weight = simple_strtol(buf, NULL, 0); | 752 | nd->weight = simple_strtol(buf, NULL, 0); |
| 742 | return len; | 753 | return len; |
| @@ -746,7 +757,7 @@ static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len) | |||
| 746 | * Functions for the dlm to get the info that's been configured | 757 | * Functions for the dlm to get the info that's been configured |
| 747 | */ | 758 | */ |
| 748 | 759 | ||
| 749 | static struct space *get_space(char *name) | 760 | static struct dlm_space *get_space(char *name) |
| 750 | { | 761 | { |
| 751 | struct config_item *i; | 762 | struct config_item *i; |
| 752 | 763 | ||
| @@ -760,15 +771,15 @@ static struct space *get_space(char *name) | |||
| 760 | return to_space(i); | 771 | return to_space(i); |
| 761 | } | 772 | } |
| 762 | 773 | ||
| 763 | static void put_space(struct space *sp) | 774 | static void put_space(struct dlm_space *sp) |
| 764 | { | 775 | { |
| 765 | config_item_put(&sp->group.cg_item); | 776 | config_item_put(&sp->group.cg_item); |
| 766 | } | 777 | } |
| 767 | 778 | ||
| 768 | static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr) | 779 | static struct dlm_comm *get_comm(int nodeid, struct sockaddr_storage *addr) |
| 769 | { | 780 | { |
| 770 | struct config_item *i; | 781 | struct config_item *i; |
| 771 | struct comm *cm = NULL; | 782 | struct dlm_comm *cm = NULL; |
| 772 | int found = 0; | 783 | int found = 0; |
| 773 | 784 | ||
| 774 | if (!comm_list) | 785 | if (!comm_list) |
| @@ -801,7 +812,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr) | |||
| 801 | return cm; | 812 | return cm; |
| 802 | } | 813 | } |
| 803 | 814 | ||
| 804 | static void put_comm(struct comm *cm) | 815 | static void put_comm(struct dlm_comm *cm) |
| 805 | { | 816 | { |
| 806 | config_item_put(&cm->item); | 817 | config_item_put(&cm->item); |
| 807 | } | 818 | } |
| @@ -810,8 +821,8 @@ static void put_comm(struct comm *cm) | |||
| 810 | int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out, | 821 | int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out, |
| 811 | int **new_out, int *new_count_out) | 822 | int **new_out, int *new_count_out) |
| 812 | { | 823 | { |
| 813 | struct space *sp; | 824 | struct dlm_space *sp; |
| 814 | struct node *nd; | 825 | struct dlm_node *nd; |
| 815 | int i = 0, rv = 0, ids_count = 0, new_count = 0; | 826 | int i = 0, rv = 0, ids_count = 0, new_count = 0; |
| 816 | int *ids, *new; | 827 | int *ids, *new; |
| 817 | 828 | ||
| @@ -874,8 +885,8 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out, | |||
| 874 | 885 | ||
| 875 | int dlm_node_weight(char *lsname, int nodeid) | 886 | int dlm_node_weight(char *lsname, int nodeid) |
| 876 | { | 887 | { |
| 877 | struct space *sp; | 888 | struct dlm_space *sp; |
| 878 | struct node *nd; | 889 | struct dlm_node *nd; |
| 879 | int w = -EEXIST; | 890 | int w = -EEXIST; |
| 880 | 891 | ||
| 881 | sp = get_space(lsname); | 892 | sp = get_space(lsname); |
| @@ -897,7 +908,7 @@ int dlm_node_weight(char *lsname, int nodeid) | |||
| 897 | 908 | ||
| 898 | int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr) | 909 | int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr) |
| 899 | { | 910 | { |
| 900 | struct comm *cm = get_comm(nodeid, NULL); | 911 | struct dlm_comm *cm = get_comm(nodeid, NULL); |
| 901 | if (!cm) | 912 | if (!cm) |
| 902 | return -EEXIST; | 913 | return -EEXIST; |
| 903 | if (!cm->addr_count) | 914 | if (!cm->addr_count) |
| @@ -909,7 +920,7 @@ int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr) | |||
| 909 | 920 | ||
| 910 | int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid) | 921 | int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid) |
| 911 | { | 922 | { |
| 912 | struct comm *cm = get_comm(0, addr); | 923 | struct dlm_comm *cm = get_comm(0, addr); |
| 913 | if (!cm) | 924 | if (!cm) |
| 914 | return -EEXIST; | 925 | return -EEXIST; |
| 915 | *nodeid = cm->nodeid; | 926 | *nodeid = cm->nodeid; |
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 2d3d1027ce2b..724ddac91538 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c | |||
| @@ -363,6 +363,7 @@ static int search_rsb_list(struct list_head *head, char *name, int len, | |||
| 363 | if (len == r->res_length && !memcmp(name, r->res_name, len)) | 363 | if (len == r->res_length && !memcmp(name, r->res_name, len)) |
| 364 | goto found; | 364 | goto found; |
| 365 | } | 365 | } |
| 366 | *r_ret = NULL; | ||
| 366 | return -EBADR; | 367 | return -EBADR; |
| 367 | 368 | ||
| 368 | found: | 369 | found: |
| @@ -1782,7 +1783,8 @@ static void grant_pending_locks(struct dlm_rsb *r) | |||
| 1782 | 1783 | ||
| 1783 | list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) { | 1784 | list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) { |
| 1784 | if (lkb->lkb_bastfn && lock_requires_bast(lkb, high, cw)) { | 1785 | if (lkb->lkb_bastfn && lock_requires_bast(lkb, high, cw)) { |
| 1785 | if (cw && high == DLM_LOCK_PR) | 1786 | if (cw && high == DLM_LOCK_PR && |
| 1787 | lkb->lkb_grmode == DLM_LOCK_PR) | ||
| 1786 | queue_bast(r, lkb, DLM_LOCK_CW); | 1788 | queue_bast(r, lkb, DLM_LOCK_CW); |
| 1787 | else | 1789 | else |
| 1788 | queue_bast(r, lkb, high); | 1790 | queue_bast(r, lkb, high); |
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 637018c891ef..3962262f991a 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c | |||
| @@ -891,8 +891,10 @@ static void tcp_connect_to_sock(struct connection *con) | |||
| 891 | goto out_err; | 891 | goto out_err; |
| 892 | 892 | ||
| 893 | memset(&saddr, 0, sizeof(saddr)); | 893 | memset(&saddr, 0, sizeof(saddr)); |
| 894 | if (dlm_nodeid_to_addr(con->nodeid, &saddr)) | 894 | if (dlm_nodeid_to_addr(con->nodeid, &saddr)) { |
| 895 | sock_release(sock); | ||
| 895 | goto out_err; | 896 | goto out_err; |
| 897 | } | ||
| 896 | 898 | ||
| 897 | sock->sk->sk_user_data = con; | 899 | sock->sk->sk_user_data = con; |
| 898 | con->rx_action = receive_from_sock; | 900 | con->rx_action = receive_from_sock; |
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index 78878c5781ca..eba87ff3177b 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c | |||
| @@ -116,7 +116,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, | |||
| 116 | if (xop->callback == NULL) | 116 | if (xop->callback == NULL) |
| 117 | wait_event(recv_wq, (op->done != 0)); | 117 | wait_event(recv_wq, (op->done != 0)); |
| 118 | else { | 118 | else { |
| 119 | rv = -EINPROGRESS; | 119 | rv = FILE_LOCK_DEFERRED; |
| 120 | goto out; | 120 | goto out; |
| 121 | } | 121 | } |
| 122 | 122 | ||
diff --git a/fs/dlm/user.c b/fs/dlm/user.c index f976f303c196..34f14a14fb4e 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c | |||
| @@ -527,8 +527,10 @@ static ssize_t device_write(struct file *file, const char __user *buf, | |||
| 527 | k32buf = (struct dlm_write_request32 *)kbuf; | 527 | k32buf = (struct dlm_write_request32 *)kbuf; |
| 528 | kbuf = kmalloc(count + 1 + (sizeof(struct dlm_write_request) - | 528 | kbuf = kmalloc(count + 1 + (sizeof(struct dlm_write_request) - |
| 529 | sizeof(struct dlm_write_request32)), GFP_KERNEL); | 529 | sizeof(struct dlm_write_request32)), GFP_KERNEL); |
| 530 | if (!kbuf) | 530 | if (!kbuf) { |
| 531 | kfree(k32buf); | ||
| 531 | return -ENOMEM; | 532 | return -ENOMEM; |
| 533 | } | ||
| 532 | 534 | ||
| 533 | if (proc) | 535 | if (proc) |
| 534 | set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags); | 536 | set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags); |
| @@ -539,8 +541,10 @@ static ssize_t device_write(struct file *file, const char __user *buf, | |||
| 539 | 541 | ||
| 540 | /* do we really need this? can a write happen after a close? */ | 542 | /* do we really need this? can a write happen after a close? */ |
| 541 | if ((kbuf->cmd == DLM_USER_LOCK || kbuf->cmd == DLM_USER_UNLOCK) && | 543 | if ((kbuf->cmd == DLM_USER_LOCK || kbuf->cmd == DLM_USER_UNLOCK) && |
| 542 | test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags)) | 544 | (proc && test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))) { |
| 543 | return -EINVAL; | 545 | error = -EINVAL; |
| 546 | goto out_free; | ||
| 547 | } | ||
| 544 | 548 | ||
| 545 | sigfillset(&allsigs); | 549 | sigfillset(&allsigs); |
| 546 | sigprocmask(SIG_BLOCK, &allsigs, &tmpsig); | 550 | sigprocmask(SIG_BLOCK, &allsigs, &tmpsig); |
diff --git a/fs/dquot.c b/fs/dquot.c index 5ac77da19959..8ec4d6cc7633 100644 --- a/fs/dquot.c +++ b/fs/dquot.c | |||
| @@ -562,6 +562,8 @@ static struct shrinker dqcache_shrinker = { | |||
| 562 | */ | 562 | */ |
| 563 | static void dqput(struct dquot *dquot) | 563 | static void dqput(struct dquot *dquot) |
| 564 | { | 564 | { |
| 565 | int ret; | ||
| 566 | |||
| 565 | if (!dquot) | 567 | if (!dquot) |
| 566 | return; | 568 | return; |
| 567 | #ifdef __DQUOT_PARANOIA | 569 | #ifdef __DQUOT_PARANOIA |
| @@ -594,7 +596,19 @@ we_slept: | |||
| 594 | if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && dquot_dirty(dquot)) { | 596 | if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && dquot_dirty(dquot)) { |
| 595 | spin_unlock(&dq_list_lock); | 597 | spin_unlock(&dq_list_lock); |
| 596 | /* Commit dquot before releasing */ | 598 | /* Commit dquot before releasing */ |
| 597 | dquot->dq_sb->dq_op->write_dquot(dquot); | 599 | ret = dquot->dq_sb->dq_op->write_dquot(dquot); |
| 600 | if (ret < 0) { | ||
| 601 | printk(KERN_ERR "VFS: cannot write quota structure on " | ||
| 602 | "device %s (error %d). Quota may get out of " | ||
| 603 | "sync!\n", dquot->dq_sb->s_id, ret); | ||
| 604 | /* | ||
| 605 | * We clear the dirty bit anyway, so that we avoid | ||
| 606 | * an infinite loop here | ||
| 607 | */ | ||
| 608 | spin_lock(&dq_list_lock); | ||
| 609 | clear_dquot_dirty(dquot); | ||
| 610 | spin_unlock(&dq_list_lock); | ||
| 611 | } | ||
| 598 | goto we_slept; | 612 | goto we_slept; |
| 599 | } | 613 | } |
| 600 | /* Clear flag in case dquot was inactive (something bad happened) */ | 614 | /* Clear flag in case dquot was inactive (something bad happened) */ |
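dqput() now checks the ->write_dquot() return value and, when the commit fails, logs the error and clears the dirty bit before retrying, because the "commit before releasing" branch would otherwise be taken forever for a dquot that can never be written back. The shape of that fix, reduced to a hypothetical release loop (struct my_obj and all my_* helpers are assumptions; my_writeback() is taken to clear the dirty bit itself on success):

struct my_obj;

int my_test_dirty(struct my_obj *obj);
int my_writeback(struct my_obj *obj);	/* clears the dirty bit on success */
void my_clear_dirty(struct my_obj *obj);
void my_log_error(struct my_obj *obj);
void my_free(struct my_obj *obj);

/* hedged sketch: never loop forever on a persistently failing writeback */
static void my_release(struct my_obj *obj)
{
again:
	if (my_test_dirty(obj)) {
		if (my_writeback(obj) < 0) {
			my_log_error(obj);
			my_clear_dirty(obj);	/* guarantee forward progress */
		}
		goto again;
	}
	my_free(obj);
}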
| @@ -875,7 +889,10 @@ static void print_warning(struct dquot *dquot, const int warntype) | |||
| 875 | char *msg = NULL; | 889 | char *msg = NULL; |
| 876 | struct tty_struct *tty; | 890 | struct tty_struct *tty; |
| 877 | 891 | ||
| 878 | if (!need_print_warning(dquot)) | 892 | if (warntype == QUOTA_NL_IHARDBELOW || |
| 893 | warntype == QUOTA_NL_ISOFTBELOW || | ||
| 894 | warntype == QUOTA_NL_BHARDBELOW || | ||
| 895 | warntype == QUOTA_NL_BSOFTBELOW || !need_print_warning(dquot)) | ||
| 879 | return; | 896 | return; |
| 880 | 897 | ||
| 881 | mutex_lock(&tty_mutex); | 898 | mutex_lock(&tty_mutex); |
| @@ -1083,6 +1100,35 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war | |||
| 1083 | return QUOTA_OK; | 1100 | return QUOTA_OK; |
| 1084 | } | 1101 | } |
| 1085 | 1102 | ||
| 1103 | static int info_idq_free(struct dquot *dquot, ulong inodes) | ||
| 1104 | { | ||
| 1105 | if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || | ||
| 1106 | dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit) | ||
| 1107 | return QUOTA_NL_NOWARN; | ||
| 1108 | |||
| 1109 | if (dquot->dq_dqb.dqb_curinodes - inodes <= dquot->dq_dqb.dqb_isoftlimit) | ||
| 1110 | return QUOTA_NL_ISOFTBELOW; | ||
| 1111 | if (dquot->dq_dqb.dqb_curinodes >= dquot->dq_dqb.dqb_ihardlimit && | ||
| 1112 | dquot->dq_dqb.dqb_curinodes - inodes < dquot->dq_dqb.dqb_ihardlimit) | ||
| 1113 | return QUOTA_NL_IHARDBELOW; | ||
| 1114 | return QUOTA_NL_NOWARN; | ||
| 1115 | } | ||
| 1116 | |||
| 1117 | static int info_bdq_free(struct dquot *dquot, qsize_t space) | ||
| 1118 | { | ||
| 1119 | if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || | ||
| 1120 | toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit) | ||
| 1121 | return QUOTA_NL_NOWARN; | ||
| 1122 | |||
| 1123 | if (toqb(dquot->dq_dqb.dqb_curspace - space) <= | ||
| 1124 | dquot->dq_dqb.dqb_bsoftlimit) | ||
| 1125 | return QUOTA_NL_BSOFTBELOW; | ||
| 1126 | if (toqb(dquot->dq_dqb.dqb_curspace) >= dquot->dq_dqb.dqb_bhardlimit && | ||
| 1127 | toqb(dquot->dq_dqb.dqb_curspace - space) < | ||
| 1128 | dquot->dq_dqb.dqb_bhardlimit) | ||
| 1129 | return QUOTA_NL_BHARDBELOW; | ||
| 1130 | return QUOTA_NL_NOWARN; | ||
| 1131 | } | ||
| 1086 | /* | 1132 | /* |
| 1087 | * Initialize quota pointers in inode | 1133 | * Initialize quota pointers in inode |
| 1088 | * Transaction must be started at entry | 1134 | * Transaction must be started at entry |
| @@ -1139,6 +1185,28 @@ int dquot_drop(struct inode *inode) | |||
| 1139 | return 0; | 1185 | return 0; |
| 1140 | } | 1186 | } |
| 1141 | 1187 | ||
| 1188 | /* Wrapper to remove references to quota structures from inode */ | ||
| 1189 | void vfs_dq_drop(struct inode *inode) | ||
| 1190 | { | ||
| 1191 | /* Here we can get arbitrary inode from clear_inode() so we have | ||
| 1192 | * to be careful. OTOH we don't need locking as quota operations | ||
| 1193 | * are allowed to change only at mount time */ | ||
| 1194 | if (!IS_NOQUOTA(inode) && inode->i_sb && inode->i_sb->dq_op | ||
| 1195 | && inode->i_sb->dq_op->drop) { | ||
| 1196 | int cnt; | ||
| 1197 | /* Test before calling to rule out calls from proc and such | ||
| 1198 | * where we are not allowed to block. Note that this is | ||
| 1199 | * actually reliable test even without the lock - the caller | ||
| 1200 | * must assure that nobody can come after the DQUOT_DROP and | ||
| 1201 | * add quota pointers back anyway */ | ||
| 1202 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) | ||
| 1203 | if (inode->i_dquot[cnt] != NODQUOT) | ||
| 1204 | break; | ||
| 1205 | if (cnt < MAXQUOTAS) | ||
| 1206 | inode->i_sb->dq_op->drop(inode); | ||
| 1207 | } | ||
| 1208 | } | ||
| 1209 | |||
| 1142 | /* | 1210 | /* |
| 1143 | * Following four functions update i_blocks+i_bytes fields and | 1211 | * Following four functions update i_blocks+i_bytes fields and |
| 1144 | * quota information (together with appropriate checks) | 1212 | * quota information (together with appropriate checks) |
| @@ -1248,6 +1316,7 @@ warn_put_all: | |||
| 1248 | int dquot_free_space(struct inode *inode, qsize_t number) | 1316 | int dquot_free_space(struct inode *inode, qsize_t number) |
| 1249 | { | 1317 | { |
| 1250 | unsigned int cnt; | 1318 | unsigned int cnt; |
| 1319 | char warntype[MAXQUOTAS]; | ||
| 1251 | 1320 | ||
| 1252 | /* First test before acquiring mutex - solves deadlocks when we | 1321 | /* First test before acquiring mutex - solves deadlocks when we |
| 1253 | * re-enter the quota code and are already holding the mutex */ | 1322 | * re-enter the quota code and are already holding the mutex */ |
| @@ -1256,6 +1325,7 @@ out_sub: | |||
| 1256 | inode_sub_bytes(inode, number); | 1325 | inode_sub_bytes(inode, number); |
| 1257 | return QUOTA_OK; | 1326 | return QUOTA_OK; |
| 1258 | } | 1327 | } |
| 1328 | |||
| 1259 | down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); | 1329 | down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); |
| 1260 | /* Now recheck reliably when holding dqptr_sem */ | 1330 | /* Now recheck reliably when holding dqptr_sem */ |
| 1261 | if (IS_NOQUOTA(inode)) { | 1331 | if (IS_NOQUOTA(inode)) { |
| @@ -1266,6 +1336,7 @@ out_sub: | |||
| 1266 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) { | 1336 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) { |
| 1267 | if (inode->i_dquot[cnt] == NODQUOT) | 1337 | if (inode->i_dquot[cnt] == NODQUOT) |
| 1268 | continue; | 1338 | continue; |
| 1339 | warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number); | ||
| 1269 | dquot_decr_space(inode->i_dquot[cnt], number); | 1340 | dquot_decr_space(inode->i_dquot[cnt], number); |
| 1270 | } | 1341 | } |
| 1271 | inode_sub_bytes(inode, number); | 1342 | inode_sub_bytes(inode, number); |
| @@ -1274,6 +1345,7 @@ out_sub: | |||
| 1274 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) | 1345 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) |
| 1275 | if (inode->i_dquot[cnt]) | 1346 | if (inode->i_dquot[cnt]) |
| 1276 | mark_dquot_dirty(inode->i_dquot[cnt]); | 1347 | mark_dquot_dirty(inode->i_dquot[cnt]); |
| 1348 | flush_warnings(inode->i_dquot, warntype); | ||
| 1277 | up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); | 1349 | up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); |
| 1278 | return QUOTA_OK; | 1350 | return QUOTA_OK; |
| 1279 | } | 1351 | } |
| @@ -1284,11 +1356,13 @@ out_sub: | |||
| 1284 | int dquot_free_inode(const struct inode *inode, unsigned long number) | 1356 | int dquot_free_inode(const struct inode *inode, unsigned long number) |
| 1285 | { | 1357 | { |
| 1286 | unsigned int cnt; | 1358 | unsigned int cnt; |
| 1359 | char warntype[MAXQUOTAS]; | ||
| 1287 | 1360 | ||
| 1288 | /* First test before acquiring mutex - solves deadlocks when we | 1361 | /* First test before acquiring mutex - solves deadlocks when we |
| 1289 | * re-enter the quota code and are already holding the mutex */ | 1362 | * re-enter the quota code and are already holding the mutex */ |
| 1290 | if (IS_NOQUOTA(inode)) | 1363 | if (IS_NOQUOTA(inode)) |
| 1291 | return QUOTA_OK; | 1364 | return QUOTA_OK; |
| 1365 | |||
| 1292 | down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); | 1366 | down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); |
| 1293 | /* Now recheck reliably when holding dqptr_sem */ | 1367 | /* Now recheck reliably when holding dqptr_sem */ |
| 1294 | if (IS_NOQUOTA(inode)) { | 1368 | if (IS_NOQUOTA(inode)) { |
| @@ -1299,6 +1373,7 @@ int dquot_free_inode(const struct inode *inode, unsigned long number) | |||
| 1299 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) { | 1373 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) { |
| 1300 | if (inode->i_dquot[cnt] == NODQUOT) | 1374 | if (inode->i_dquot[cnt] == NODQUOT) |
| 1301 | continue; | 1375 | continue; |
| 1376 | warntype[cnt] = info_idq_free(inode->i_dquot[cnt], number); | ||
| 1302 | dquot_decr_inodes(inode->i_dquot[cnt], number); | 1377 | dquot_decr_inodes(inode->i_dquot[cnt], number); |
| 1303 | } | 1378 | } |
| 1304 | spin_unlock(&dq_data_lock); | 1379 | spin_unlock(&dq_data_lock); |
| @@ -1306,6 +1381,7 @@ int dquot_free_inode(const struct inode *inode, unsigned long number) | |||
| 1306 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) | 1381 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) |
| 1307 | if (inode->i_dquot[cnt]) | 1382 | if (inode->i_dquot[cnt]) |
| 1308 | mark_dquot_dirty(inode->i_dquot[cnt]); | 1383 | mark_dquot_dirty(inode->i_dquot[cnt]); |
| 1384 | flush_warnings(inode->i_dquot, warntype); | ||
| 1309 | up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); | 1385 | up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); |
| 1310 | return QUOTA_OK; | 1386 | return QUOTA_OK; |
| 1311 | } | 1387 | } |
| @@ -1323,7 +1399,8 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) | |||
| 1323 | struct dquot *transfer_to[MAXQUOTAS]; | 1399 | struct dquot *transfer_to[MAXQUOTAS]; |
| 1324 | int cnt, ret = NO_QUOTA, chuid = (iattr->ia_valid & ATTR_UID) && inode->i_uid != iattr->ia_uid, | 1400 | int cnt, ret = NO_QUOTA, chuid = (iattr->ia_valid & ATTR_UID) && inode->i_uid != iattr->ia_uid, |
| 1325 | chgid = (iattr->ia_valid & ATTR_GID) && inode->i_gid != iattr->ia_gid; | 1401 | chgid = (iattr->ia_valid & ATTR_GID) && inode->i_gid != iattr->ia_gid; |
| 1326 | char warntype[MAXQUOTAS]; | 1402 | char warntype_to[MAXQUOTAS]; |
| 1403 | char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS]; | ||
| 1327 | 1404 | ||
| 1328 | /* First test before acquiring mutex - solves deadlocks when we | 1405 | /* First test before acquiring mutex - solves deadlocks when we |
| 1329 | * re-enter the quota code and are already holding the mutex */ | 1406 | * re-enter the quota code and are already holding the mutex */ |
| @@ -1332,7 +1409,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) | |||
| 1332 | /* Clear the arrays */ | 1409 | /* Clear the arrays */ |
| 1333 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) { | 1410 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) { |
| 1334 | transfer_to[cnt] = transfer_from[cnt] = NODQUOT; | 1411 | transfer_to[cnt] = transfer_from[cnt] = NODQUOT; |
| 1335 | warntype[cnt] = QUOTA_NL_NOWARN; | 1412 | warntype_to[cnt] = QUOTA_NL_NOWARN; |
| 1336 | } | 1413 | } |
| 1337 | down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); | 1414 | down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); |
| 1338 | /* Now recheck reliably when holding dqptr_sem */ | 1415 | /* Now recheck reliably when holding dqptr_sem */ |
| @@ -1364,8 +1441,9 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) | |||
| 1364 | if (transfer_to[cnt] == NODQUOT) | 1441 | if (transfer_to[cnt] == NODQUOT) |
| 1365 | continue; | 1442 | continue; |
| 1366 | transfer_from[cnt] = inode->i_dquot[cnt]; | 1443 | transfer_from[cnt] = inode->i_dquot[cnt]; |
| 1367 | if (check_idq(transfer_to[cnt], 1, warntype+cnt) == NO_QUOTA || | 1444 | if (check_idq(transfer_to[cnt], 1, warntype_to + cnt) == |
| 1368 | check_bdq(transfer_to[cnt], space, 0, warntype+cnt) == NO_QUOTA) | 1445 | NO_QUOTA || check_bdq(transfer_to[cnt], space, 0, |
| 1446 | warntype_to + cnt) == NO_QUOTA) | ||
| 1369 | goto warn_put_all; | 1447 | goto warn_put_all; |
| 1370 | } | 1448 | } |
| 1371 | 1449 | ||
| @@ -1381,6 +1459,10 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) | |||
| 1381 | 1459 | ||
| 1382 | /* Due to IO error we might not have transfer_from[] structure */ | 1460 | /* Due to IO error we might not have transfer_from[] structure */ |
| 1383 | if (transfer_from[cnt]) { | 1461 | if (transfer_from[cnt]) { |
| 1462 | warntype_from_inodes[cnt] = | ||
| 1463 | info_idq_free(transfer_from[cnt], 1); | ||
| 1464 | warntype_from_space[cnt] = | ||
| 1465 | info_bdq_free(transfer_from[cnt], space); | ||
| 1384 | dquot_decr_inodes(transfer_from[cnt], 1); | 1466 | dquot_decr_inodes(transfer_from[cnt], 1); |
| 1385 | dquot_decr_space(transfer_from[cnt], space); | 1467 | dquot_decr_space(transfer_from[cnt], space); |
| 1386 | } | 1468 | } |
| @@ -1400,7 +1482,9 @@ warn_put_all: | |||
| 1400 | if (transfer_to[cnt]) | 1482 | if (transfer_to[cnt]) |
| 1401 | mark_dquot_dirty(transfer_to[cnt]); | 1483 | mark_dquot_dirty(transfer_to[cnt]); |
| 1402 | } | 1484 | } |
| 1403 | flush_warnings(transfer_to, warntype); | 1485 | flush_warnings(transfer_to, warntype_to); |
| 1486 | flush_warnings(transfer_from, warntype_from_inodes); | ||
| 1487 | flush_warnings(transfer_from, warntype_from_space); | ||
| 1404 | 1488 | ||
| 1405 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) { | 1489 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) { |
| 1406 | if (ret == QUOTA_OK && transfer_from[cnt] != NODQUOT) | 1490 | if (ret == QUOTA_OK && transfer_from[cnt] != NODQUOT) |
| @@ -1412,6 +1496,18 @@ warn_put_all: | |||
| 1412 | return ret; | 1496 | return ret; |
| 1413 | } | 1497 | } |
| 1414 | 1498 | ||
| 1499 | /* Wrapper for transferring ownership of an inode */ | ||
| 1500 | int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) | ||
| 1501 | { | ||
| 1502 | if (sb_any_quota_enabled(inode->i_sb) && !IS_NOQUOTA(inode)) { | ||
| 1503 | vfs_dq_init(inode); | ||
| 1504 | if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA) | ||
| 1505 | return 1; | ||
| 1506 | } | ||
| 1507 | return 0; | ||
| 1508 | } | ||
| 1509 | |||
| 1510 | |||
| 1415 | /* | 1511 | /* |
| 1416 | * Write info of quota file to disk | 1512 | * Write info of quota file to disk |
| 1417 | */ | 1513 | */ |
| @@ -1697,6 +1793,21 @@ static int vfs_quota_on_remount(struct super_block *sb, int type) | |||
| 1697 | return ret; | 1793 | return ret; |
| 1698 | } | 1794 | } |
| 1699 | 1795 | ||
| 1796 | int vfs_quota_on_path(struct super_block *sb, int type, int format_id, | ||
| 1797 | struct path *path) | ||
| 1798 | { | ||
| 1799 | int error = security_quota_on(path->dentry); | ||
| 1800 | if (error) | ||
| 1801 | return error; | ||
| 1802 | /* Quota file not on the same filesystem? */ | ||
| 1803 | if (path->mnt->mnt_sb != sb) | ||
| 1804 | error = -EXDEV; | ||
| 1805 | else | ||
| 1806 | error = vfs_quota_on_inode(path->dentry->d_inode, type, | ||
| 1807 | format_id); | ||
| 1808 | return error; | ||
| 1809 | } | ||
| 1810 | |||
| 1700 | /* Actual function called from quotactl() */ | 1811 | /* Actual function called from quotactl() */ |
| 1701 | int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path, | 1812 | int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path, |
| 1702 | int remount) | 1813 | int remount) |
| @@ -1708,19 +1819,10 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path, | |||
| 1708 | return vfs_quota_on_remount(sb, type); | 1819 | return vfs_quota_on_remount(sb, type); |
| 1709 | 1820 | ||
| 1710 | error = path_lookup(path, LOOKUP_FOLLOW, &nd); | 1821 | error = path_lookup(path, LOOKUP_FOLLOW, &nd); |
| 1711 | if (error < 0) | 1822 | if (!error) { |
| 1712 | return error; | 1823 | error = vfs_quota_on_path(sb, type, format_id, &nd.path); |
| 1713 | error = security_quota_on(nd.path.dentry); | 1824 | path_put(&nd.path); |
| 1714 | if (error) | 1825 | } |
| 1715 | goto out_path; | ||
| 1716 | /* Quota file not on the same filesystem? */ | ||
| 1717 | if (nd.path.mnt->mnt_sb != sb) | ||
| 1718 | error = -EXDEV; | ||
| 1719 | else | ||
| 1720 | error = vfs_quota_on_inode(nd.path.dentry->d_inode, type, | ||
| 1721 | format_id); | ||
| 1722 | out_path: | ||
| 1723 | path_put(&nd.path); | ||
| 1724 | return error; | 1826 | return error; |
| 1725 | } | 1827 | } |
| 1726 | 1828 | ||
| @@ -1752,6 +1854,22 @@ out: | |||
| 1752 | return error; | 1854 | return error; |
| 1753 | } | 1855 | } |
| 1754 | 1856 | ||
| 1857 | /* Wrapper to turn on quotas when remounting rw */ | ||
| 1858 | int vfs_dq_quota_on_remount(struct super_block *sb) | ||
| 1859 | { | ||
| 1860 | int cnt; | ||
| 1861 | int ret = 0, err; | ||
| 1862 | |||
| 1863 | if (!sb->s_qcop || !sb->s_qcop->quota_on) | ||
| 1864 | return -ENOSYS; | ||
| 1865 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) { | ||
| 1866 | err = sb->s_qcop->quota_on(sb, cnt, 0, NULL, 1); | ||
| 1867 | if (err < 0 && !ret) | ||
| 1868 | ret = err; | ||
| 1869 | } | ||
| 1870 | return ret; | ||
| 1871 | } | ||
| 1872 | |||
| 1755 | /* Generic routine for getting common part of quota structure */ | 1873 | /* Generic routine for getting common part of quota structure */ |
| 1756 | static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di) | 1874 | static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di) |
| 1757 | { | 1875 | { |
| @@ -2073,6 +2191,7 @@ EXPORT_SYMBOL(unregister_quota_format); | |||
| 2073 | EXPORT_SYMBOL(dqstats); | 2191 | EXPORT_SYMBOL(dqstats); |
| 2074 | EXPORT_SYMBOL(dq_data_lock); | 2192 | EXPORT_SYMBOL(dq_data_lock); |
| 2075 | EXPORT_SYMBOL(vfs_quota_on); | 2193 | EXPORT_SYMBOL(vfs_quota_on); |
| 2194 | EXPORT_SYMBOL(vfs_quota_on_path); | ||
| 2076 | EXPORT_SYMBOL(vfs_quota_on_mount); | 2195 | EXPORT_SYMBOL(vfs_quota_on_mount); |
| 2077 | EXPORT_SYMBOL(vfs_quota_off); | 2196 | EXPORT_SYMBOL(vfs_quota_off); |
| 2078 | EXPORT_SYMBOL(vfs_quota_sync); | 2197 | EXPORT_SYMBOL(vfs_quota_sync); |
| @@ -2087,8 +2206,11 @@ EXPORT_SYMBOL(dquot_release); | |||
| 2087 | EXPORT_SYMBOL(dquot_mark_dquot_dirty); | 2206 | EXPORT_SYMBOL(dquot_mark_dquot_dirty); |
| 2088 | EXPORT_SYMBOL(dquot_initialize); | 2207 | EXPORT_SYMBOL(dquot_initialize); |
| 2089 | EXPORT_SYMBOL(dquot_drop); | 2208 | EXPORT_SYMBOL(dquot_drop); |
| 2209 | EXPORT_SYMBOL(vfs_dq_drop); | ||
| 2090 | EXPORT_SYMBOL(dquot_alloc_space); | 2210 | EXPORT_SYMBOL(dquot_alloc_space); |
| 2091 | EXPORT_SYMBOL(dquot_alloc_inode); | 2211 | EXPORT_SYMBOL(dquot_alloc_inode); |
| 2092 | EXPORT_SYMBOL(dquot_free_space); | 2212 | EXPORT_SYMBOL(dquot_free_space); |
| 2093 | EXPORT_SYMBOL(dquot_free_inode); | 2213 | EXPORT_SYMBOL(dquot_free_inode); |
| 2094 | EXPORT_SYMBOL(dquot_transfer); | 2214 | EXPORT_SYMBOL(dquot_transfer); |
| 2215 | EXPORT_SYMBOL(vfs_dq_transfer); | ||
| 2216 | EXPORT_SYMBOL(vfs_dq_quota_on_remount); | ||
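For context, the new vfs_dq_transfer() wrapper is the out-of-line replacement for the old DQUOT_TRANSFER-style call that filesystems make from their setattr path when an inode changes owner or group. The sketch below shows where it typically sits; example_setattr() and the use of inode_change_ok()/inode_setattr() are illustrative assumptions, not code from this patch.

    /* Hypothetical setattr path showing where vfs_dq_transfer() fits. */
    static int example_setattr(struct dentry *dentry, struct iattr *attr)
    {
            struct inode *inode = dentry->d_inode;
            int error;

            error = inode_change_ok(inode, attr);
            if (error)
                    return error;

            /* Move the charged blocks and inodes to the new uid/gid. */
            if (((attr->ia_valid & ATTR_UID) && attr->ia_uid != inode->i_uid) ||
                ((attr->ia_valid & ATTR_GID) && attr->ia_gid != inode->i_gid)) {
                    if (vfs_dq_transfer(inode, attr))
                            return -EDQUOT;
            }
            return inode_setattr(inode, attr);
    }

vfs_dq_transfer() returns 1 when the transfer would exceed the target user's or group's limits, so the caller simply maps that to -EDQUOT.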
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 7b99917ffadc..06db79d05c12 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c | |||
| @@ -475,8 +475,8 @@ int ecryptfs_encrypt_page(struct page *page) | |||
| 475 | { | 475 | { |
| 476 | struct inode *ecryptfs_inode; | 476 | struct inode *ecryptfs_inode; |
| 477 | struct ecryptfs_crypt_stat *crypt_stat; | 477 | struct ecryptfs_crypt_stat *crypt_stat; |
| 478 | char *enc_extent_virt = NULL; | 478 | char *enc_extent_virt; |
| 479 | struct page *enc_extent_page; | 479 | struct page *enc_extent_page = NULL; |
| 480 | loff_t extent_offset; | 480 | loff_t extent_offset; |
| 481 | int rc = 0; | 481 | int rc = 0; |
| 482 | 482 | ||
| @@ -492,14 +492,14 @@ int ecryptfs_encrypt_page(struct page *page) | |||
| 492 | page->index); | 492 | page->index); |
| 493 | goto out; | 493 | goto out; |
| 494 | } | 494 | } |
| 495 | enc_extent_virt = kmalloc(PAGE_CACHE_SIZE, GFP_USER); | 495 | enc_extent_page = alloc_page(GFP_USER); |
| 496 | if (!enc_extent_virt) { | 496 | if (!enc_extent_page) { |
| 497 | rc = -ENOMEM; | 497 | rc = -ENOMEM; |
| 498 | ecryptfs_printk(KERN_ERR, "Error allocating memory for " | 498 | ecryptfs_printk(KERN_ERR, "Error allocating memory for " |
| 499 | "encrypted extent\n"); | 499 | "encrypted extent\n"); |
| 500 | goto out; | 500 | goto out; |
| 501 | } | 501 | } |
| 502 | enc_extent_page = virt_to_page(enc_extent_virt); | 502 | enc_extent_virt = kmap(enc_extent_page); |
| 503 | for (extent_offset = 0; | 503 | for (extent_offset = 0; |
| 504 | extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size); | 504 | extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size); |
| 505 | extent_offset++) { | 505 | extent_offset++) { |
| @@ -527,7 +527,10 @@ int ecryptfs_encrypt_page(struct page *page) | |||
| 527 | } | 527 | } |
| 528 | } | 528 | } |
| 529 | out: | 529 | out: |
| 530 | kfree(enc_extent_virt); | 530 | if (enc_extent_page) { |
| 531 | kunmap(enc_extent_page); | ||
| 532 | __free_page(enc_extent_page); | ||
| 533 | } | ||
| 531 | return rc; | 534 | return rc; |
| 532 | } | 535 | } |
| 533 | 536 | ||
| @@ -609,8 +612,8 @@ int ecryptfs_decrypt_page(struct page *page) | |||
| 609 | { | 612 | { |
| 610 | struct inode *ecryptfs_inode; | 613 | struct inode *ecryptfs_inode; |
| 611 | struct ecryptfs_crypt_stat *crypt_stat; | 614 | struct ecryptfs_crypt_stat *crypt_stat; |
| 612 | char *enc_extent_virt = NULL; | 615 | char *enc_extent_virt; |
| 613 | struct page *enc_extent_page; | 616 | struct page *enc_extent_page = NULL; |
| 614 | unsigned long extent_offset; | 617 | unsigned long extent_offset; |
| 615 | int rc = 0; | 618 | int rc = 0; |
| 616 | 619 | ||
| @@ -627,14 +630,14 @@ int ecryptfs_decrypt_page(struct page *page) | |||
| 627 | page->index); | 630 | page->index); |
| 628 | goto out; | 631 | goto out; |
| 629 | } | 632 | } |
| 630 | enc_extent_virt = kmalloc(PAGE_CACHE_SIZE, GFP_USER); | 633 | enc_extent_page = alloc_page(GFP_USER); |
| 631 | if (!enc_extent_virt) { | 634 | if (!enc_extent_page) { |
| 632 | rc = -ENOMEM; | 635 | rc = -ENOMEM; |
| 633 | ecryptfs_printk(KERN_ERR, "Error allocating memory for " | 636 | ecryptfs_printk(KERN_ERR, "Error allocating memory for " |
| 634 | "encrypted extent\n"); | 637 | "encrypted extent\n"); |
| 635 | goto out; | 638 | goto out; |
| 636 | } | 639 | } |
| 637 | enc_extent_page = virt_to_page(enc_extent_virt); | 640 | enc_extent_virt = kmap(enc_extent_page); |
| 638 | for (extent_offset = 0; | 641 | for (extent_offset = 0; |
| 639 | extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size); | 642 | extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size); |
| 640 | extent_offset++) { | 643 | extent_offset++) { |
| @@ -662,7 +665,10 @@ int ecryptfs_decrypt_page(struct page *page) | |||
| 662 | } | 665 | } |
| 663 | } | 666 | } |
| 664 | out: | 667 | out: |
| 665 | kfree(enc_extent_virt); | 668 | if (enc_extent_page) { |
| 669 | kunmap(enc_extent_page); | ||
| 670 | __free_page(enc_extent_page); | ||
| 671 | } | ||
| 666 | return rc; | 672 | return rc; |
| 667 | } | 673 | } |
| 668 | 674 | ||
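The kmalloc()/virt_to_page() pairing removed here implicitly assumed the buffer was a single, page-aligned page, which the slab allocator does not guarantee in general; allocating a struct page directly and kmap()ing it makes that assumption explicit and safe. A generic sketch of the pattern, not specific to eCryptfs:

    /* Allocate a temporary page, map it, use it, then release both. */
    static int with_temp_page(int (*use)(void *buf, size_t len))
    {
            struct page *page;
            void *virt;
            int rc;

            page = alloc_page(GFP_USER);
            if (!page)
                    return -ENOMEM;
            virt = kmap(page);              /* mapping valid until kunmap() */
            rc = use(virt, PAGE_SIZE);
            kunmap(page);
            __free_page(page);
            return rc;
    }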
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index d755455e3bff..89209f00f9c7 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c | |||
| @@ -465,7 +465,6 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, | |||
| 465 | int rc; | 465 | int rc; |
| 466 | struct dentry *lower_dentry; | 466 | struct dentry *lower_dentry; |
| 467 | struct dentry *lower_dir_dentry; | 467 | struct dentry *lower_dir_dentry; |
| 468 | umode_t mode; | ||
| 469 | char *encoded_symname; | 468 | char *encoded_symname; |
| 470 | int encoded_symlen; | 469 | int encoded_symlen; |
| 471 | struct ecryptfs_crypt_stat *crypt_stat = NULL; | 470 | struct ecryptfs_crypt_stat *crypt_stat = NULL; |
| @@ -473,7 +472,6 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, | |||
| 473 | lower_dentry = ecryptfs_dentry_to_lower(dentry); | 472 | lower_dentry = ecryptfs_dentry_to_lower(dentry); |
| 474 | dget(lower_dentry); | 473 | dget(lower_dentry); |
| 475 | lower_dir_dentry = lock_parent(lower_dentry); | 474 | lower_dir_dentry = lock_parent(lower_dentry); |
| 476 | mode = S_IALLUGO; | ||
| 477 | encoded_symlen = ecryptfs_encode_filename(crypt_stat, symname, | 475 | encoded_symlen = ecryptfs_encode_filename(crypt_stat, symname, |
| 478 | strlen(symname), | 476 | strlen(symname), |
| 479 | &encoded_symname); | 477 | &encoded_symname); |
| @@ -482,7 +480,7 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, | |||
| 482 | goto out_lock; | 480 | goto out_lock; |
| 483 | } | 481 | } |
| 484 | rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry, | 482 | rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry, |
| 485 | encoded_symname, mode); | 483 | encoded_symname); |
| 486 | kfree(encoded_symname); | 484 | kfree(encoded_symname); |
| 487 | if (rc || !lower_dentry->d_inode) | 485 | if (rc || !lower_dentry->d_inode) |
| 488 | goto out_lock; | 486 | goto out_lock; |
| @@ -830,22 +828,9 @@ out: | |||
| 830 | } | 828 | } |
| 831 | 829 | ||
| 832 | static int | 830 | static int |
| 833 | ecryptfs_permission(struct inode *inode, int mask, struct nameidata *nd) | 831 | ecryptfs_permission(struct inode *inode, int mask) |
| 834 | { | 832 | { |
| 835 | int rc; | 833 | return inode_permission(ecryptfs_inode_to_lower(inode), mask); |
| 836 | |||
| 837 | if (nd) { | ||
| 838 | struct vfsmount *vfsmnt_save = nd->path.mnt; | ||
| 839 | struct dentry *dentry_save = nd->path.dentry; | ||
| 840 | |||
| 841 | nd->path.mnt = ecryptfs_dentry_to_lower_mnt(nd->path.dentry); | ||
| 842 | nd->path.dentry = ecryptfs_dentry_to_lower(nd->path.dentry); | ||
| 843 | rc = permission(ecryptfs_inode_to_lower(inode), mask, nd); | ||
| 844 | nd->path.mnt = vfsmnt_save; | ||
| 845 | nd->path.dentry = dentry_save; | ||
| 846 | } else | ||
| 847 | rc = permission(ecryptfs_inode_to_lower(inode), mask, NULL); | ||
| 848 | return rc; | ||
| 849 | } | 834 | } |
| 850 | 835 | ||
| 851 | /** | 836 | /** |
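inode_permission() is the helper that replaces the old three-argument permission() for callers that hold an inode but no nameidata, which is why the stacked-filesystem case above collapses to a single delegation to the lower inode. As a generic illustration of the calling convention only (the surrounding names are assumed, not taken from this patch):

    /* Check write+exec access on a directory inode we already hold. */
    err = inode_permission(dir_inode, MAY_WRITE | MAY_EXEC);
    if (err)
            return err;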
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 6f403cfba14f..448dfd597b5f 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c | |||
| @@ -578,7 +578,7 @@ static struct file_system_type ecryptfs_fs_type = { | |||
| 578 | * Initializes the ecryptfs_inode_info_cache when it is created | 578 | * Initializes the ecryptfs_inode_info_cache when it is created |
| 579 | */ | 579 | */ |
| 580 | static void | 580 | static void |
| 581 | inode_info_init_once(struct kmem_cache *cachep, void *vptr) | 581 | inode_info_init_once(void *vptr) |
| 582 | { | 582 | { |
| 583 | struct ecryptfs_inode_info *ei = (struct ecryptfs_inode_info *)vptr; | 583 | struct ecryptfs_inode_info *ei = (struct ecryptfs_inode_info *)vptr; |
| 584 | 584 | ||
| @@ -589,7 +589,7 @@ static struct ecryptfs_cache_info { | |||
| 589 | struct kmem_cache **cache; | 589 | struct kmem_cache **cache; |
| 590 | const char *name; | 590 | const char *name; |
| 591 | size_t size; | 591 | size_t size; |
| 592 | void (*ctor)(struct kmem_cache *cache, void *obj); | 592 | void (*ctor)(void *obj); |
| 593 | } ecryptfs_cache_infos[] = { | 593 | } ecryptfs_cache_infos[] = { |
| 594 | { | 594 | { |
| 595 | .cache = &ecryptfs_auth_tok_list_item_cache, | 595 | .cache = &ecryptfs_auth_tok_list_item_cache, |
diff --git a/fs/efs/namei.c b/fs/efs/namei.c index 3a404e7fad53..291abb11e20e 100644 --- a/fs/efs/namei.c +++ b/fs/efs/namei.c | |||
| @@ -74,8 +74,7 @@ struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, struct namei | |||
| 74 | } | 74 | } |
| 75 | unlock_kernel(); | 75 | unlock_kernel(); |
| 76 | 76 | ||
| 77 | d_add(dentry, inode); | 77 | return d_splice_alias(inode, dentry); |
| 78 | return NULL; | ||
| 79 | } | 78 | } |
| 80 | 79 | ||
| 81 | static struct inode *efs_nfs_get_inode(struct super_block *sb, u64 ino, | 80 | static struct inode *efs_nfs_get_inode(struct super_block *sb, u64 ino, |
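d_splice_alias() attaches the inode (or an already existing alias of it, which matters for directories reachable via NFS export) and returns the dentry the VFS should use, so a lookup method no longer needs the d_add()-then-return-NULL pair. A minimal lookup skeleton using it; example_find_entry() and example_iget() are hypothetical helpers standing in for the filesystem-specific parts:

    static struct dentry *example_lookup(struct inode *dir,
                                         struct dentry *dentry,
                                         struct nameidata *nd)
    {
            struct inode *inode = NULL;
            ino_t ino;

            ino = example_find_entry(dir, dentry);          /* hypothetical */
            if (ino) {
                    inode = example_iget(dir->i_sb, ino);   /* hypothetical */
                    if (IS_ERR(inode))
                            return ERR_CAST(inode);
            }
            /* Handles negative lookups (inode == NULL) and aliases alike. */
            return d_splice_alias(inode, dentry);
    }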
diff --git a/fs/efs/super.c b/fs/efs/super.c index d733531b55e2..567b134fa1f1 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c | |||
| @@ -70,7 +70,7 @@ static void efs_destroy_inode(struct inode *inode) | |||
| 70 | kmem_cache_free(efs_inode_cachep, INODE_INFO(inode)); | 70 | kmem_cache_free(efs_inode_cachep, INODE_INFO(inode)); |
| 71 | } | 71 | } |
| 72 | 72 | ||
| 73 | static void init_once(struct kmem_cache *cachep, void *foo) | 73 | static void init_once(void *foo) |
| 74 | { | 74 | { |
| 75 | struct efs_inode_info *ei = (struct efs_inode_info *) foo; | 75 | struct efs_inode_info *ei = (struct efs_inode_info *) foo; |
| 76 | 76 | ||
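The one-argument init_once() reflects the tree-wide change in which kmem_cache constructors lost their struct kmem_cache * parameter; every cache touched in this series is converted the same way. A sketch of what a cache user now looks like; struct example_inode_info and the example_* names are assumptions for illustration only:

    static struct kmem_cache *example_inode_cachep;

    /* Runs once per object when a slab page is first populated. */
    static void example_init_once(void *obj)
    {
            struct example_inode_info *ei = obj;    /* hypothetical type */

            inode_init_once(&ei->vfs_inode);
    }

    static int __init example_init_inodecache(void)
    {
            example_inode_cachep = kmem_cache_create("example_inode_cache",
                                    sizeof(struct example_inode_info), 0,
                                    SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
                                    example_init_once);
            return example_inode_cachep ? 0 : -ENOMEM;
    }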
diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 0c87474f7917..7cc0eb756b55 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c | |||
| @@ -1041,10 +1041,7 @@ retry: | |||
| 1041 | } | 1041 | } |
| 1042 | 1042 | ||
| 1043 | /* | 1043 | /* |
| 1044 | * It opens an eventpoll file descriptor. The "size" parameter is there | 1044 | * Open an eventpoll file descriptor. |
| 1045 | * for historical reasons, when epoll was using an hash instead of an | ||
| 1046 | * RB tree. With the current implementation, the "size" parameter is ignored | ||
| 1047 | * (besides sanity checks). | ||
| 1048 | */ | 1045 | */ |
| 1049 | asmlinkage long sys_epoll_create1(int flags) | 1046 | asmlinkage long sys_epoll_create1(int flags) |
| 1050 | { | 1047 | { |
diff --git a/fs/exec.c b/fs/exec.c --- a/fs/exec.c +++ b/fs/exec.c | |||
| @@ -25,10 +25,11 @@ | |||
| 25 | #include <linux/slab.h> | 25 | #include <linux/slab.h> |
| 26 | #include <linux/file.h> | 26 | #include <linux/file.h> |
| 27 | #include <linux/fdtable.h> | 27 | #include <linux/fdtable.h> |
| 28 | #include <linux/mman.h> | 28 | #include <linux/mm.h> |
| 29 | #include <linux/stat.h> | 29 | #include <linux/stat.h> |
| 30 | #include <linux/fcntl.h> | 30 | #include <linux/fcntl.h> |
| 31 | #include <linux/smp_lock.h> | 31 | #include <linux/smp_lock.h> |
| 32 | #include <linux/swap.h> | ||
| 32 | #include <linux/string.h> | 33 | #include <linux/string.h> |
| 33 | #include <linux/init.h> | 34 | #include <linux/init.h> |
| 34 | #include <linux/pagemap.h> | 35 | #include <linux/pagemap.h> |
| @@ -37,20 +38,18 @@ | |||
| 37 | #include <linux/key.h> | 38 | #include <linux/key.h> |
| 38 | #include <linux/personality.h> | 39 | #include <linux/personality.h> |
| 39 | #include <linux/binfmts.h> | 40 | #include <linux/binfmts.h> |
| 40 | #include <linux/swap.h> | ||
| 41 | #include <linux/utsname.h> | 41 | #include <linux/utsname.h> |
| 42 | #include <linux/pid_namespace.h> | 42 | #include <linux/pid_namespace.h> |
| 43 | #include <linux/module.h> | 43 | #include <linux/module.h> |
| 44 | #include <linux/namei.h> | 44 | #include <linux/namei.h> |
| 45 | #include <linux/proc_fs.h> | 45 | #include <linux/proc_fs.h> |
| 46 | #include <linux/ptrace.h> | ||
| 47 | #include <linux/mount.h> | 46 | #include <linux/mount.h> |
| 48 | #include <linux/security.h> | 47 | #include <linux/security.h> |
| 49 | #include <linux/syscalls.h> | 48 | #include <linux/syscalls.h> |
| 50 | #include <linux/rmap.h> | ||
| 51 | #include <linux/tsacct_kern.h> | 49 | #include <linux/tsacct_kern.h> |
| 52 | #include <linux/cn_proc.h> | 50 | #include <linux/cn_proc.h> |
| 53 | #include <linux/audit.h> | 51 | #include <linux/audit.h> |
| 52 | #include <linux/tracehook.h> | ||
| 54 | 53 | ||
| 55 | #include <asm/uaccess.h> | 54 | #include <asm/uaccess.h> |
| 56 | #include <asm/mmu_context.h> | 55 | #include <asm/mmu_context.h> |
| @@ -108,11 +107,17 @@ static inline void put_binfmt(struct linux_binfmt * fmt) | |||
| 108 | */ | 107 | */ |
| 109 | asmlinkage long sys_uselib(const char __user * library) | 108 | asmlinkage long sys_uselib(const char __user * library) |
| 110 | { | 109 | { |
| 111 | struct file * file; | 110 | struct file *file; |
| 112 | struct nameidata nd; | 111 | struct nameidata nd; |
| 113 | int error; | 112 | char *tmp = getname(library); |
| 114 | 113 | int error = PTR_ERR(tmp); | |
| 115 | error = __user_path_lookup_open(library, LOOKUP_FOLLOW, &nd, FMODE_READ|FMODE_EXEC); | 114 | |
| 115 | if (!IS_ERR(tmp)) { | ||
| 116 | error = path_lookup_open(AT_FDCWD, tmp, | ||
| 117 | LOOKUP_FOLLOW, &nd, | ||
| 118 | FMODE_READ|FMODE_EXEC); | ||
| 119 | putname(tmp); | ||
| 120 | } | ||
| 116 | if (error) | 121 | if (error) |
| 117 | goto out; | 122 | goto out; |
| 118 | 123 | ||
| @@ -120,7 +125,11 @@ asmlinkage long sys_uselib(const char __user * library) | |||
| 120 | if (!S_ISREG(nd.path.dentry->d_inode->i_mode)) | 125 | if (!S_ISREG(nd.path.dentry->d_inode->i_mode)) |
| 121 | goto exit; | 126 | goto exit; |
| 122 | 127 | ||
| 123 | error = vfs_permission(&nd, MAY_READ | MAY_EXEC); | 128 | error = -EACCES; |
| 129 | if (nd.path.mnt->mnt_flags & MNT_NOEXEC) | ||
| 130 | goto exit; | ||
| 131 | |||
| 132 | error = vfs_permission(&nd, MAY_READ | MAY_EXEC | MAY_OPEN); | ||
| 124 | if (error) | 133 | if (error) |
| 125 | goto exit; | 134 | goto exit; |
| 126 | 135 | ||
| @@ -658,38 +667,43 @@ EXPORT_SYMBOL(setup_arg_pages); | |||
| 658 | struct file *open_exec(const char *name) | 667 | struct file *open_exec(const char *name) |
| 659 | { | 668 | { |
| 660 | struct nameidata nd; | 669 | struct nameidata nd; |
| 661 | int err; | ||
| 662 | struct file *file; | 670 | struct file *file; |
| 671 | int err; | ||
| 663 | 672 | ||
| 664 | err = path_lookup_open(AT_FDCWD, name, LOOKUP_FOLLOW, &nd, FMODE_READ|FMODE_EXEC); | 673 | err = path_lookup_open(AT_FDCWD, name, LOOKUP_FOLLOW, &nd, |
| 665 | file = ERR_PTR(err); | 674 | FMODE_READ|FMODE_EXEC); |
| 666 | 675 | if (err) | |
| 667 | if (!err) { | 676 | goto out; |
| 668 | struct inode *inode = nd.path.dentry->d_inode; | 677 | |
| 669 | file = ERR_PTR(-EACCES); | 678 | err = -EACCES; |
| 670 | if (S_ISREG(inode->i_mode)) { | 679 | if (!S_ISREG(nd.path.dentry->d_inode->i_mode)) |
| 671 | int err = vfs_permission(&nd, MAY_EXEC); | 680 | goto out_path_put; |
| 672 | file = ERR_PTR(err); | 681 | |
| 673 | if (!err) { | 682 | if (nd.path.mnt->mnt_flags & MNT_NOEXEC) |
| 674 | file = nameidata_to_filp(&nd, | 683 | goto out_path_put; |
| 675 | O_RDONLY|O_LARGEFILE); | 684 | |
| 676 | if (!IS_ERR(file)) { | 685 | err = vfs_permission(&nd, MAY_EXEC | MAY_OPEN); |
| 677 | err = deny_write_access(file); | 686 | if (err) |
| 678 | if (err) { | 687 | goto out_path_put; |
| 679 | fput(file); | 688 | |
| 680 | file = ERR_PTR(err); | 689 | file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE); |
| 681 | } | 690 | if (IS_ERR(file)) |
| 682 | } | 691 | return file; |
| 683 | out: | 692 | |
| 684 | return file; | 693 | err = deny_write_access(file); |
| 685 | } | 694 | if (err) { |
| 686 | } | 695 | fput(file); |
| 687 | release_open_intent(&nd); | 696 | goto out; |
| 688 | path_put(&nd.path); | ||
| 689 | } | 697 | } |
| 690 | goto out; | ||
| 691 | } | ||
| 692 | 698 | ||
| 699 | return file; | ||
| 700 | |||
| 701 | out_path_put: | ||
| 702 | release_open_intent(&nd); | ||
| 703 | path_put(&nd.path); | ||
| 704 | out: | ||
| 705 | return ERR_PTR(err); | ||
| 706 | } | ||
| 693 | EXPORT_SYMBOL(open_exec); | 707 | EXPORT_SYMBOL(open_exec); |
| 694 | 708 | ||
| 695 | int kernel_read(struct file *file, unsigned long offset, | 709 | int kernel_read(struct file *file, unsigned long offset, |
| @@ -724,12 +738,10 @@ static int exec_mmap(struct mm_struct *mm) | |||
| 724 | * Make sure that if there is a core dump in progress | 738 | * Make sure that if there is a core dump in progress |
| 725 | * for the old mm, we get out and die instead of going | 739 | * for the old mm, we get out and die instead of going |
| 726 | * through with the exec. We must hold mmap_sem around | 740 | * through with the exec. We must hold mmap_sem around |
| 727 | * checking core_waiters and changing tsk->mm. The | 741 | * checking core_state and changing tsk->mm. |
| 728 | * core-inducing thread will increment core_waiters for | ||
| 729 | * each thread whose ->mm == old_mm. | ||
| 730 | */ | 742 | */ |
| 731 | down_read(&old_mm->mmap_sem); | 743 | down_read(&old_mm->mmap_sem); |
| 732 | if (unlikely(old_mm->core_waiters)) { | 744 | if (unlikely(old_mm->core_state)) { |
| 733 | up_read(&old_mm->mmap_sem); | 745 | up_read(&old_mm->mmap_sem); |
| 734 | return -EINTR; | 746 | return -EINTR; |
| 735 | } | 747 | } |
| @@ -1075,13 +1087,8 @@ EXPORT_SYMBOL(prepare_binprm); | |||
| 1075 | 1087 | ||
| 1076 | static int unsafe_exec(struct task_struct *p) | 1088 | static int unsafe_exec(struct task_struct *p) |
| 1077 | { | 1089 | { |
| 1078 | int unsafe = 0; | 1090 | int unsafe = tracehook_unsafe_exec(p); |
| 1079 | if (p->ptrace & PT_PTRACED) { | 1091 | |
| 1080 | if (p->ptrace & PT_PTRACE_CAP) | ||
| 1081 | unsafe |= LSM_UNSAFE_PTRACE_CAP; | ||
| 1082 | else | ||
| 1083 | unsafe |= LSM_UNSAFE_PTRACE; | ||
| 1084 | } | ||
| 1085 | if (atomic_read(&p->fs->count) > 1 || | 1092 | if (atomic_read(&p->fs->count) > 1 || |
| 1086 | atomic_read(&p->files->count) > 1 || | 1093 | atomic_read(&p->files->count) > 1 || |
| 1087 | atomic_read(&p->sighand->count) > 1) | 1094 | atomic_read(&p->sighand->count) > 1) |
| @@ -1218,6 +1225,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) | |||
| 1218 | read_unlock(&binfmt_lock); | 1225 | read_unlock(&binfmt_lock); |
| 1219 | retval = fn(bprm, regs); | 1226 | retval = fn(bprm, regs); |
| 1220 | if (retval >= 0) { | 1227 | if (retval >= 0) { |
| 1228 | tracehook_report_exec(fmt, bprm, regs); | ||
| 1221 | put_binfmt(fmt); | 1229 | put_binfmt(fmt); |
| 1222 | allow_write_access(bprm->file); | 1230 | allow_write_access(bprm->file); |
| 1223 | if (bprm->file) | 1231 | if (bprm->file) |
| @@ -1328,6 +1336,7 @@ int do_execve(char * filename, | |||
| 1328 | if (retval < 0) | 1336 | if (retval < 0) |
| 1329 | goto out; | 1337 | goto out; |
| 1330 | 1338 | ||
| 1339 | current->flags &= ~PF_KTHREAD; | ||
| 1331 | retval = search_binary_handler(bprm,regs); | 1340 | retval = search_binary_handler(bprm,regs); |
| 1332 | if (retval >= 0) { | 1341 | if (retval >= 0) { |
| 1333 | /* execve success */ | 1342 | /* execve success */ |
| @@ -1382,17 +1391,14 @@ EXPORT_SYMBOL(set_binfmt); | |||
| 1382 | * name into corename, which must have space for at least | 1391 | * name into corename, which must have space for at least |
| 1383 | * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. | 1392 | * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. |
| 1384 | */ | 1393 | */ |
| 1385 | static int format_corename(char *corename, const char *pattern, long signr) | 1394 | static int format_corename(char *corename, int nr_threads, long signr) |
| 1386 | { | 1395 | { |
| 1387 | const char *pat_ptr = pattern; | 1396 | const char *pat_ptr = core_pattern; |
| 1397 | int ispipe = (*pat_ptr == '|'); | ||
| 1388 | char *out_ptr = corename; | 1398 | char *out_ptr = corename; |
| 1389 | char *const out_end = corename + CORENAME_MAX_SIZE; | 1399 | char *const out_end = corename + CORENAME_MAX_SIZE; |
| 1390 | int rc; | 1400 | int rc; |
| 1391 | int pid_in_pattern = 0; | 1401 | int pid_in_pattern = 0; |
| 1392 | int ispipe = 0; | ||
| 1393 | |||
| 1394 | if (*pattern == '|') | ||
| 1395 | ispipe = 1; | ||
| 1396 | 1402 | ||
| 1397 | /* Repeat as long as we have more pattern to process and more output | 1403 | /* Repeat as long as we have more pattern to process and more output |
| 1398 | space */ | 1404 | space */ |
| @@ -1493,7 +1499,7 @@ static int format_corename(char *corename, const char *pattern, long signr) | |||
| 1493 | * and core_uses_pid is set, then .%pid will be appended to | 1499 | * and core_uses_pid is set, then .%pid will be appended to |
| 1494 | * the filename. Do not do this for piped commands. */ | 1500 | * the filename. Do not do this for piped commands. */ |
| 1495 | if (!ispipe && !pid_in_pattern | 1501 | if (!ispipe && !pid_in_pattern |
| 1496 | && (core_uses_pid || atomic_read(¤t->mm->mm_users) != 1)) { | 1502 | && (core_uses_pid || nr_threads)) { |
| 1497 | rc = snprintf(out_ptr, out_end - out_ptr, | 1503 | rc = snprintf(out_ptr, out_end - out_ptr, |
| 1498 | ".%d", task_tgid_vnr(current)); | 1504 | ".%d", task_tgid_vnr(current)); |
| 1499 | if (rc > out_end - out_ptr) | 1505 | if (rc > out_end - out_ptr) |
| @@ -1505,9 +1511,10 @@ out: | |||
| 1505 | return ispipe; | 1511 | return ispipe; |
| 1506 | } | 1512 | } |
| 1507 | 1513 | ||
| 1508 | static void zap_process(struct task_struct *start) | 1514 | static int zap_process(struct task_struct *start) |
| 1509 | { | 1515 | { |
| 1510 | struct task_struct *t; | 1516 | struct task_struct *t; |
| 1517 | int nr = 0; | ||
| 1511 | 1518 | ||
| 1512 | start->signal->flags = SIGNAL_GROUP_EXIT; | 1519 | start->signal->flags = SIGNAL_GROUP_EXIT; |
| 1513 | start->signal->group_stop_count = 0; | 1520 | start->signal->group_stop_count = 0; |
| @@ -1515,72 +1522,99 @@ static void zap_process(struct task_struct *start) | |||
| 1515 | t = start; | 1522 | t = start; |
| 1516 | do { | 1523 | do { |
| 1517 | if (t != current && t->mm) { | 1524 | if (t != current && t->mm) { |
| 1518 | t->mm->core_waiters++; | ||
| 1519 | sigaddset(&t->pending.signal, SIGKILL); | 1525 | sigaddset(&t->pending.signal, SIGKILL); |
| 1520 | signal_wake_up(t, 1); | 1526 | signal_wake_up(t, 1); |
| 1527 | nr++; | ||
| 1521 | } | 1528 | } |
| 1522 | } while ((t = next_thread(t)) != start); | 1529 | } while_each_thread(start, t); |
| 1530 | |||
| 1531 | return nr; | ||
| 1523 | } | 1532 | } |
| 1524 | 1533 | ||
| 1525 | static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, | 1534 | static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, |
| 1526 | int exit_code) | 1535 | struct core_state *core_state, int exit_code) |
| 1527 | { | 1536 | { |
| 1528 | struct task_struct *g, *p; | 1537 | struct task_struct *g, *p; |
| 1529 | unsigned long flags; | 1538 | unsigned long flags; |
| 1530 | int err = -EAGAIN; | 1539 | int nr = -EAGAIN; |
| 1531 | 1540 | ||
| 1532 | spin_lock_irq(&tsk->sighand->siglock); | 1541 | spin_lock_irq(&tsk->sighand->siglock); |
| 1533 | if (!signal_group_exit(tsk->signal)) { | 1542 | if (!signal_group_exit(tsk->signal)) { |
| 1543 | mm->core_state = core_state; | ||
| 1534 | tsk->signal->group_exit_code = exit_code; | 1544 | tsk->signal->group_exit_code = exit_code; |
| 1535 | zap_process(tsk); | 1545 | nr = zap_process(tsk); |
| 1536 | err = 0; | ||
| 1537 | } | 1546 | } |
| 1538 | spin_unlock_irq(&tsk->sighand->siglock); | 1547 | spin_unlock_irq(&tsk->sighand->siglock); |
| 1539 | if (err) | 1548 | if (unlikely(nr < 0)) |
| 1540 | return err; | 1549 | return nr; |
| 1541 | 1550 | ||
| 1542 | if (atomic_read(&mm->mm_users) == mm->core_waiters + 1) | 1551 | if (atomic_read(&mm->mm_users) == nr + 1) |
| 1543 | goto done; | 1552 | goto done; |
| 1544 | 1553 | /* | |
| 1554 | * We should find and kill all tasks which use this mm, and we should | ||
| 1555 | * count them correctly into ->nr_threads. We don't take tasklist | ||
| 1556 | * lock, but this is safe wrt: | ||
| 1557 | * | ||
| 1558 | * fork: | ||
| 1559 | * None of sub-threads can fork after zap_process(leader). All | ||
| 1560 | * processes which were created before this point should be | ||
| 1561 | * visible to zap_threads() because copy_process() adds the new | ||
| 1562 | * process to the tail of init_task.tasks list, and lock/unlock | ||
| 1563 | * of ->siglock provides a memory barrier. | ||
| 1564 | * | ||
| 1565 | * do_exit: | ||
| 1566 | * The caller holds mm->mmap_sem. This means that the task which | ||
| 1567 | * uses this mm can't pass exit_mm(), so it can't exit or clear | ||
| 1568 | * its ->mm. | ||
| 1569 | * | ||
| 1570 | * de_thread: | ||
| 1571 | * It does list_replace_rcu(&leader->tasks, ¤t->tasks), | ||
| 1572 | * we must see either old or new leader, this does not matter. | ||
| 1573 | * However, it can change p->sighand, so lock_task_sighand(p) | ||
| 1574 | * must be used. Since p->mm != NULL and we hold ->mmap_sem | ||
| 1575 | * it can't fail. | ||
| 1576 | * | ||
| 1577 | * Note also that "g" can be the old leader with ->mm == NULL | ||
| 1578 | * and already unhashed and thus removed from ->thread_group. | ||
| 1579 | * This is OK, __unhash_process()->list_del_rcu() does not | ||
| 1580 | * clear the ->next pointer, we will find the new leader via | ||
| 1581 | * next_thread(). | ||
| 1582 | */ | ||
| 1545 | rcu_read_lock(); | 1583 | rcu_read_lock(); |
| 1546 | for_each_process(g) { | 1584 | for_each_process(g) { |
| 1547 | if (g == tsk->group_leader) | 1585 | if (g == tsk->group_leader) |
| 1548 | continue; | 1586 | continue; |
| 1549 | 1587 | if (g->flags & PF_KTHREAD) | |
| 1588 | continue; | ||
| 1550 | p = g; | 1589 | p = g; |
| 1551 | do { | 1590 | do { |
| 1552 | if (p->mm) { | 1591 | if (p->mm) { |
| 1553 | if (p->mm == mm) { | 1592 | if (unlikely(p->mm == mm)) { |
| 1554 | /* | ||
| 1555 | * p->sighand can't disappear, but | ||
| 1556 | * may be changed by de_thread() | ||
| 1557 | */ | ||
| 1558 | lock_task_sighand(p, &flags); | 1593 | lock_task_sighand(p, &flags); |
| 1559 | zap_process(p); | 1594 | nr += zap_process(p); |
| 1560 | unlock_task_sighand(p, &flags); | 1595 | unlock_task_sighand(p, &flags); |
| 1561 | } | 1596 | } |
| 1562 | break; | 1597 | break; |
| 1563 | } | 1598 | } |
| 1564 | } while ((p = next_thread(p)) != g); | 1599 | } while_each_thread(g, p); |
| 1565 | } | 1600 | } |
| 1566 | rcu_read_unlock(); | 1601 | rcu_read_unlock(); |
| 1567 | done: | 1602 | done: |
| 1568 | return mm->core_waiters; | 1603 | atomic_set(&core_state->nr_threads, nr); |
| 1604 | return nr; | ||
| 1569 | } | 1605 | } |
| 1570 | 1606 | ||
| 1571 | static int coredump_wait(int exit_code) | 1607 | static int coredump_wait(int exit_code, struct core_state *core_state) |
| 1572 | { | 1608 | { |
| 1573 | struct task_struct *tsk = current; | 1609 | struct task_struct *tsk = current; |
| 1574 | struct mm_struct *mm = tsk->mm; | 1610 | struct mm_struct *mm = tsk->mm; |
| 1575 | struct completion startup_done; | ||
| 1576 | struct completion *vfork_done; | 1611 | struct completion *vfork_done; |
| 1577 | int core_waiters; | 1612 | int core_waiters; |
| 1578 | 1613 | ||
| 1579 | init_completion(&mm->core_done); | 1614 | init_completion(&core_state->startup); |
| 1580 | init_completion(&startup_done); | 1615 | core_state->dumper.task = tsk; |
| 1581 | mm->core_startup_done = &startup_done; | 1616 | core_state->dumper.next = NULL; |
| 1582 | 1617 | core_waiters = zap_threads(tsk, mm, core_state, exit_code); | |
| 1583 | core_waiters = zap_threads(tsk, mm, exit_code); | ||
| 1584 | up_write(&mm->mmap_sem); | 1618 | up_write(&mm->mmap_sem); |
| 1585 | 1619 | ||
| 1586 | if (unlikely(core_waiters < 0)) | 1620 | if (unlikely(core_waiters < 0)) |
| @@ -1597,12 +1631,32 @@ static int coredump_wait(int exit_code) | |||
| 1597 | } | 1631 | } |
| 1598 | 1632 | ||
| 1599 | if (core_waiters) | 1633 | if (core_waiters) |
| 1600 | wait_for_completion(&startup_done); | 1634 | wait_for_completion(&core_state->startup); |
| 1601 | fail: | 1635 | fail: |
| 1602 | BUG_ON(mm->core_waiters); | ||
| 1603 | return core_waiters; | 1636 | return core_waiters; |
| 1604 | } | 1637 | } |
| 1605 | 1638 | ||
| 1639 | static void coredump_finish(struct mm_struct *mm) | ||
| 1640 | { | ||
| 1641 | struct core_thread *curr, *next; | ||
| 1642 | struct task_struct *task; | ||
| 1643 | |||
| 1644 | next = mm->core_state->dumper.next; | ||
| 1645 | while ((curr = next) != NULL) { | ||
| 1646 | next = curr->next; | ||
| 1647 | task = curr->task; | ||
| 1648 | /* | ||
| 1649 | * see exit_mm(), curr->task must not see | ||
| 1650 | * ->task == NULL before we read ->next. | ||
| 1651 | */ | ||
| 1652 | smp_mb(); | ||
| 1653 | curr->task = NULL; | ||
| 1654 | wake_up_process(task); | ||
| 1655 | } | ||
| 1656 | |||
| 1657 | mm->core_state = NULL; | ||
| 1658 | } | ||
| 1659 | |||
| 1606 | /* | 1660 | /* |
| 1607 | * set_dumpable converts traditional three-value dumpable to two flags and | 1661 | * set_dumpable converts traditional three-value dumpable to two flags and |
| 1608 | * stores them into mm->flags. It modifies lower two bits of mm->flags, but | 1662 | * stores them into mm->flags. It modifies lower two bits of mm->flags, but |
| @@ -1654,6 +1708,7 @@ int get_dumpable(struct mm_struct *mm) | |||
| 1654 | 1708 | ||
| 1655 | int do_coredump(long signr, int exit_code, struct pt_regs * regs) | 1709 | int do_coredump(long signr, int exit_code, struct pt_regs * regs) |
| 1656 | { | 1710 | { |
| 1711 | struct core_state core_state; | ||
| 1657 | char corename[CORENAME_MAX_SIZE + 1]; | 1712 | char corename[CORENAME_MAX_SIZE + 1]; |
| 1658 | struct mm_struct *mm = current->mm; | 1713 | struct mm_struct *mm = current->mm; |
| 1659 | struct linux_binfmt * binfmt; | 1714 | struct linux_binfmt * binfmt; |
| @@ -1677,7 +1732,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) | |||
| 1677 | /* | 1732 | /* |
| 1678 | * If another thread got here first, or we are not dumpable, bail out. | 1733 | * If another thread got here first, or we are not dumpable, bail out. |
| 1679 | */ | 1734 | */ |
| 1680 | if (mm->core_waiters || !get_dumpable(mm)) { | 1735 | if (mm->core_state || !get_dumpable(mm)) { |
| 1681 | up_write(&mm->mmap_sem); | 1736 | up_write(&mm->mmap_sem); |
| 1682 | goto fail; | 1737 | goto fail; |
| 1683 | } | 1738 | } |
| @@ -1692,7 +1747,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) | |||
| 1692 | current->fsuid = 0; /* Dump root private */ | 1747 | current->fsuid = 0; /* Dump root private */ |
| 1693 | } | 1748 | } |
| 1694 | 1749 | ||
| 1695 | retval = coredump_wait(exit_code); | 1750 | retval = coredump_wait(exit_code, &core_state); |
| 1696 | if (retval < 0) | 1751 | if (retval < 0) |
| 1697 | goto fail; | 1752 | goto fail; |
| 1698 | 1753 | ||
| @@ -1707,7 +1762,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) | |||
| 1707 | * uses lock_kernel() | 1762 | * uses lock_kernel() |
| 1708 | */ | 1763 | */ |
| 1709 | lock_kernel(); | 1764 | lock_kernel(); |
| 1710 | ispipe = format_corename(corename, core_pattern, signr); | 1765 | ispipe = format_corename(corename, retval, signr); |
| 1711 | unlock_kernel(); | 1766 | unlock_kernel(); |
| 1712 | /* | 1767 | /* |
| 1713 | * Don't bother to check the RLIMIT_CORE value if core_pattern points | 1768 | * Don't bother to check the RLIMIT_CORE value if core_pattern points |
| @@ -1786,7 +1841,7 @@ fail_unlock: | |||
| 1786 | argv_free(helper_argv); | 1841 | argv_free(helper_argv); |
| 1787 | 1842 | ||
| 1788 | current->fsuid = fsuid; | 1843 | current->fsuid = fsuid; |
| 1789 | complete_all(&mm->core_done); | 1844 | coredump_finish(mm); |
| 1790 | fail: | 1845 | fail: |
| 1791 | return retval; | 1846 | return retval; |
| 1792 | } | 1847 | } |
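The rewritten zap_threads() leans on the standard process and thread iteration helpers instead of the per-mm core_waiters counter it used to maintain. For reference, the general shape of that walk, independent of the core-dump bookkeeping, is:

    struct task_struct *g, *p;

    rcu_read_lock();
    for_each_process(g) {
            p = g;
            do {
                    /* inspect one thread p of thread group g */
            } while_each_thread(g, p);
    }
    rcu_read_unlock();

while_each_thread(g, p) expands to the next_thread() loop the old code wrote by hand, stopping once the walk returns to g.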
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index e58669e1b87c..ae8c4f850b27 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c | |||
| @@ -294,7 +294,7 @@ ext2_check_acl(struct inode *inode, int mask) | |||
| 294 | } | 294 | } |
| 295 | 295 | ||
| 296 | int | 296 | int |
| 297 | ext2_permission(struct inode *inode, int mask, struct nameidata *nd) | 297 | ext2_permission(struct inode *inode, int mask) |
| 298 | { | 298 | { |
| 299 | return generic_permission(inode, mask, ext2_check_acl); | 299 | return generic_permission(inode, mask, ext2_check_acl); |
| 300 | } | 300 | } |
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index 0bde85bafe38..b42cf578554b 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h | |||
| @@ -58,7 +58,7 @@ static inline int ext2_acl_count(size_t size) | |||
| 58 | #define EXT2_ACL_NOT_CACHED ((void *)-1) | 58 | #define EXT2_ACL_NOT_CACHED ((void *)-1) |
| 59 | 59 | ||
| 60 | /* acl.c */ | 60 | /* acl.c */ |
| 61 | extern int ext2_permission (struct inode *, int, struct nameidata *); | 61 | extern int ext2_permission (struct inode *, int); |
| 62 | extern int ext2_acl_chmod (struct inode *); | 62 | extern int ext2_acl_chmod (struct inode *); |
| 63 | extern int ext2_init_acl (struct inode *, struct inode *); | 63 | extern int ext2_init_acl (struct inode *, struct inode *); |
| 64 | 64 | ||
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 384fc0d1dd74..991d6dfeb51f 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c | |||
| @@ -791,6 +791,7 @@ const struct address_space_operations ext2_aops = { | |||
| 791 | .direct_IO = ext2_direct_IO, | 791 | .direct_IO = ext2_direct_IO, |
| 792 | .writepages = ext2_writepages, | 792 | .writepages = ext2_writepages, |
| 793 | .migratepage = buffer_migrate_page, | 793 | .migratepage = buffer_migrate_page, |
| 794 | .is_partially_uptodate = block_is_partially_uptodate, | ||
| 794 | }; | 795 | }; |
| 795 | 796 | ||
| 796 | const struct address_space_operations ext2_aops_xip = { | 797 | const struct address_space_operations ext2_aops_xip = { |
diff --git a/fs/ext2/super.c b/fs/ext2/super.c index ef50cbc792db..fd88c7b43e66 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c | |||
| @@ -31,6 +31,7 @@ | |||
| 31 | #include <linux/seq_file.h> | 31 | #include <linux/seq_file.h> |
| 32 | #include <linux/mount.h> | 32 | #include <linux/mount.h> |
| 33 | #include <linux/log2.h> | 33 | #include <linux/log2.h> |
| 34 | #include <linux/quotaops.h> | ||
| 34 | #include <asm/uaccess.h> | 35 | #include <asm/uaccess.h> |
| 35 | #include "ext2.h" | 36 | #include "ext2.h" |
| 36 | #include "xattr.h" | 37 | #include "xattr.h" |
| @@ -158,7 +159,7 @@ static void ext2_destroy_inode(struct inode *inode) | |||
| 158 | kmem_cache_free(ext2_inode_cachep, EXT2_I(inode)); | 159 | kmem_cache_free(ext2_inode_cachep, EXT2_I(inode)); |
| 159 | } | 160 | } |
| 160 | 161 | ||
| 161 | static void init_once(struct kmem_cache * cachep, void *foo) | 162 | static void init_once(void *foo) |
| 162 | { | 163 | { |
| 163 | struct ext2_inode_info *ei = (struct ext2_inode_info *) foo; | 164 | struct ext2_inode_info *ei = (struct ext2_inode_info *) foo; |
| 164 | 165 | ||
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index eaa23d2d5213..70c0dbdcdcb7 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c | |||
| @@ -14,7 +14,7 @@ static size_t | |||
| 14 | ext2_xattr_security_list(struct inode *inode, char *list, size_t list_size, | 14 | ext2_xattr_security_list(struct inode *inode, char *list, size_t list_size, |
| 15 | const char *name, size_t name_len) | 15 | const char *name, size_t name_len) |
| 16 | { | 16 | { |
| 17 | const int prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1; | 17 | const int prefix_len = XATTR_SECURITY_PREFIX_LEN; |
| 18 | const size_t total_len = prefix_len + name_len + 1; | 18 | const size_t total_len = prefix_len + name_len + 1; |
| 19 | 19 | ||
| 20 | if (list && total_len <= list_size) { | 20 | if (list && total_len <= list_size) { |
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 83ee149f353d..e8219f8eae9f 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c | |||
| @@ -12,13 +12,11 @@ | |||
| 12 | #include <linux/ext2_fs.h> | 12 | #include <linux/ext2_fs.h> |
| 13 | #include "xattr.h" | 13 | #include "xattr.h" |
| 14 | 14 | ||
| 15 | #define XATTR_TRUSTED_PREFIX "trusted." | ||
| 16 | |||
| 17 | static size_t | 15 | static size_t |
| 18 | ext2_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, | 16 | ext2_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, |
| 19 | const char *name, size_t name_len) | 17 | const char *name, size_t name_len) |
| 20 | { | 18 | { |
| 21 | const int prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1; | 19 | const int prefix_len = XATTR_TRUSTED_PREFIX_LEN; |
| 22 | const size_t total_len = prefix_len + name_len + 1; | 20 | const size_t total_len = prefix_len + name_len + 1; |
| 23 | 21 | ||
| 24 | if (!capable(CAP_SYS_ADMIN)) | 22 | if (!capable(CAP_SYS_ADMIN)) |
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index f383e7c3a7b5..92495d28c62f 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c | |||
| @@ -11,13 +11,11 @@ | |||
| 11 | #include "ext2.h" | 11 | #include "ext2.h" |
| 12 | #include "xattr.h" | 12 | #include "xattr.h" |
| 13 | 13 | ||
| 14 | #define XATTR_USER_PREFIX "user." | ||
| 15 | |||
| 16 | static size_t | 14 | static size_t |
| 17 | ext2_xattr_user_list(struct inode *inode, char *list, size_t list_size, | 15 | ext2_xattr_user_list(struct inode *inode, char *list, size_t list_size, |
| 18 | const char *name, size_t name_len) | 16 | const char *name, size_t name_len) |
| 19 | { | 17 | { |
| 20 | const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1; | 18 | const size_t prefix_len = XATTR_USER_PREFIX_LEN; |
| 21 | const size_t total_len = prefix_len + name_len + 1; | 19 | const size_t total_len = prefix_len + name_len + 1; |
| 22 | 20 | ||
| 23 | if (!test_opt(inode->i_sb, XATTR_USER)) | 21 | if (!test_opt(inode->i_sb, XATTR_USER)) |
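The *_PREFIX_LEN constants substituted above are expected to come from <linux/xattr.h>, which centralizes the prefix strings these files used to re-define locally. The assumed form of those definitions, shown for the three namespaces touched here:

    /* Assumed shared definitions in <linux/xattr.h>. */
    #define XATTR_SECURITY_PREFIX      "security."
    #define XATTR_SECURITY_PREFIX_LEN  (sizeof(XATTR_SECURITY_PREFIX) - 1)

    #define XATTR_TRUSTED_PREFIX       "trusted."
    #define XATTR_TRUSTED_PREFIX_LEN   (sizeof(XATTR_TRUSTED_PREFIX) - 1)

    #define XATTR_USER_PREFIX          "user."
    #define XATTR_USER_PREFIX_LEN      (sizeof(XATTR_USER_PREFIX) - 1)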
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index a754d1848173..b60bb241880c 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c | |||
| @@ -299,7 +299,7 @@ ext3_check_acl(struct inode *inode, int mask) | |||
| 299 | } | 299 | } |
| 300 | 300 | ||
| 301 | int | 301 | int |
| 302 | ext3_permission(struct inode *inode, int mask, struct nameidata *nd) | 302 | ext3_permission(struct inode *inode, int mask) |
| 303 | { | 303 | { |
| 304 | return generic_permission(inode, mask, ext3_check_acl); | 304 | return generic_permission(inode, mask, ext3_check_acl); |
| 305 | } | 305 | } |
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h index 0d1e6279cbfd..42da16b8cac0 100644 --- a/fs/ext3/acl.h +++ b/fs/ext3/acl.h | |||
| @@ -58,7 +58,7 @@ static inline int ext3_acl_count(size_t size) | |||
| 58 | #define EXT3_ACL_NOT_CACHED ((void *)-1) | 58 | #define EXT3_ACL_NOT_CACHED ((void *)-1) |
| 59 | 59 | ||
| 60 | /* acl.c */ | 60 | /* acl.c */ |
| 61 | extern int ext3_permission (struct inode *, int, struct nameidata *); | 61 | extern int ext3_permission (struct inode *, int); |
| 62 | extern int ext3_acl_chmod (struct inode *); | 62 | extern int ext3_acl_chmod (struct inode *); |
| 63 | extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); | 63 | extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); |
| 64 | 64 | ||
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index 8ca3bfd72427..2eea96ec78ed 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c | |||
| @@ -272,7 +272,7 @@ static void free_rb_tree_fname(struct rb_root *root) | |||
| 272 | 272 | ||
| 273 | while (n) { | 273 | while (n) { |
| 274 | /* Do the node's children first */ | 274 | /* Do the node's children first */ |
| 275 | if ((n)->rb_left) { | 275 | if (n->rb_left) { |
| 276 | n = n->rb_left; | 276 | n = n->rb_left; |
| 277 | continue; | 277 | continue; |
| 278 | } | 278 | } |
| @@ -301,24 +301,18 @@ static void free_rb_tree_fname(struct rb_root *root) | |||
| 301 | parent->rb_right = NULL; | 301 | parent->rb_right = NULL; |
| 302 | n = parent; | 302 | n = parent; |
| 303 | } | 303 | } |
| 304 | root->rb_node = NULL; | ||
| 305 | } | 304 | } |
| 306 | 305 | ||
| 307 | 306 | ||
| 308 | static struct dir_private_info *create_dir_info(loff_t pos) | 307 | static struct dir_private_info *ext3_htree_create_dir_info(loff_t pos) |
| 309 | { | 308 | { |
| 310 | struct dir_private_info *p; | 309 | struct dir_private_info *p; |
| 311 | 310 | ||
| 312 | p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL); | 311 | p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); |
| 313 | if (!p) | 312 | if (!p) |
| 314 | return NULL; | 313 | return NULL; |
| 315 | p->root.rb_node = NULL; | ||
| 316 | p->curr_node = NULL; | ||
| 317 | p->extra_fname = NULL; | ||
| 318 | p->last_pos = 0; | ||
| 319 | p->curr_hash = pos2maj_hash(pos); | 314 | p->curr_hash = pos2maj_hash(pos); |
| 320 | p->curr_minor_hash = pos2min_hash(pos); | 315 | p->curr_minor_hash = pos2min_hash(pos); |
| 321 | p->next_hash = 0; | ||
| 322 | return p; | 316 | return p; |
| 323 | } | 317 | } |
| 324 | 318 | ||
| @@ -433,7 +427,7 @@ static int ext3_dx_readdir(struct file * filp, | |||
| 433 | int ret; | 427 | int ret; |
| 434 | 428 | ||
| 435 | if (!info) { | 429 | if (!info) { |
| 436 | info = create_dir_info(filp->f_pos); | 430 | info = ext3_htree_create_dir_info(filp->f_pos); |
| 437 | if (!info) | 431 | if (!info) |
| 438 | return -ENOMEM; | 432 | return -ENOMEM; |
| 439 | filp->private_data = info; | 433 | filp->private_data = info; |
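Switching ext3_htree_create_dir_info() to kzalloc() is what allows the explicit zero/NULL initialisation of the remaining fields to be dropped: the allocation comes back already zeroed. In general form:

    /* Before: clear the structure by hand after kmalloc(). */
    p = kmalloc(sizeof(*p), GFP_KERNEL);
    if (!p)
            return NULL;
    memset(p, 0, sizeof(*p));       /* or field-by-field zeroing */

    /* After: kzalloc() returns zeroed memory, so only the non-zero
     * fields (here curr_hash and curr_minor_hash) need assignment. */
    p = kzalloc(sizeof(*p), GFP_KERNEL);
    if (!p)
            return NULL;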
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 77126821b2e9..47b678d73e7a 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c | |||
| @@ -669,6 +669,14 @@ struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino) | |||
| 669 | if (IS_ERR(inode)) | 669 | if (IS_ERR(inode)) |
| 670 | goto iget_failed; | 670 | goto iget_failed; |
| 671 | 671 | ||
| 672 | /* | ||
| 673 | * If the orphan has i_nlink > 0 then it should be possible to | ||
| 674 | * truncate it, otherwise it won't be removed from the orphan list | ||
| 675 | * during processing and an infinite loop will result. | ||
| 676 | */ | ||
| 677 | if (inode->i_nlink && !ext3_can_truncate(inode)) | ||
| 678 | goto bad_orphan; | ||
| 679 | |||
| 672 | if (NEXT_ORPHAN(inode) > max_ino) | 680 | if (NEXT_ORPHAN(inode) > max_ino) |
| 673 | goto bad_orphan; | 681 | goto bad_orphan; |
| 674 | brelse(bitmap_bh); | 682 | brelse(bitmap_bh); |
| @@ -690,6 +698,7 @@ bad_orphan: | |||
| 690 | printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", | 698 | printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", |
| 691 | NEXT_ORPHAN(inode)); | 699 | NEXT_ORPHAN(inode)); |
| 692 | printk(KERN_NOTICE "max_ino=%lu\n", max_ino); | 700 | printk(KERN_NOTICE "max_ino=%lu\n", max_ino); |
| 701 | printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink); | ||
| 693 | /* Avoid freeing blocks if we got a bad deleted inode */ | 702 | /* Avoid freeing blocks if we got a bad deleted inode */ |
| 694 | if (inode->i_nlink == 0) | 703 | if (inode->i_nlink == 0) |
| 695 | inode->i_blocks = 0; | 704 | inode->i_blocks = 0; |
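The orphan check added above relies on a simple truncatability rule, introduced as ext3_can_truncate() in the fs/ext3/inode.c hunk further down. A minimal userspace sketch of that rule follows; the struct and field names are purely illustrative stand-ins for the kernel's inode mode bits and IS_APPEND()/IS_IMMUTABLE() flags.

#include <stdbool.h>
#include <sys/types.h>
#include <sys/stat.h>	/* S_ISREG, S_ISDIR, S_ISLNK, S_IFREG */

struct sample_inode {
	mode_t mode;		/* file type bits */
	bool append_only;	/* stands in for IS_APPEND() */
	bool immutable;		/* stands in for IS_IMMUTABLE() */
	bool fast_symlink;	/* link target stored in the inode, no data blocks */
};

/* An orphan with i_nlink > 0 that cannot be truncated would never leave the
 * orphan list, so orphan processing would spin forever; hence the new check. */
static bool can_truncate(const struct sample_inode *inode)
{
	if (inode->append_only || inode->immutable)
		return false;
	if (S_ISREG(inode->mode) || S_ISDIR(inode->mode))
		return true;
	if (S_ISLNK(inode->mode))
		return !inode->fast_symlink;
	return false;		/* devices, FIFOs, sockets: nothing to truncate */
}

int main(void)
{
	struct sample_inode reg = { .mode = S_IFREG };
	return can_truncate(&reg) ? 0 : 1;
}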
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 6ae4ecf3ce40..507d8689b111 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c | |||
| @@ -1767,44 +1767,47 @@ static int ext3_journalled_set_page_dirty(struct page *page) | |||
| 1767 | } | 1767 | } |
| 1768 | 1768 | ||
| 1769 | static const struct address_space_operations ext3_ordered_aops = { | 1769 | static const struct address_space_operations ext3_ordered_aops = { |
| 1770 | .readpage = ext3_readpage, | 1770 | .readpage = ext3_readpage, |
| 1771 | .readpages = ext3_readpages, | 1771 | .readpages = ext3_readpages, |
| 1772 | .writepage = ext3_ordered_writepage, | 1772 | .writepage = ext3_ordered_writepage, |
| 1773 | .sync_page = block_sync_page, | 1773 | .sync_page = block_sync_page, |
| 1774 | .write_begin = ext3_write_begin, | 1774 | .write_begin = ext3_write_begin, |
| 1775 | .write_end = ext3_ordered_write_end, | 1775 | .write_end = ext3_ordered_write_end, |
| 1776 | .bmap = ext3_bmap, | 1776 | .bmap = ext3_bmap, |
| 1777 | .invalidatepage = ext3_invalidatepage, | 1777 | .invalidatepage = ext3_invalidatepage, |
| 1778 | .releasepage = ext3_releasepage, | 1778 | .releasepage = ext3_releasepage, |
| 1779 | .direct_IO = ext3_direct_IO, | 1779 | .direct_IO = ext3_direct_IO, |
| 1780 | .migratepage = buffer_migrate_page, | 1780 | .migratepage = buffer_migrate_page, |
| 1781 | .is_partially_uptodate = block_is_partially_uptodate, | ||
| 1781 | }; | 1782 | }; |
| 1782 | 1783 | ||
| 1783 | static const struct address_space_operations ext3_writeback_aops = { | 1784 | static const struct address_space_operations ext3_writeback_aops = { |
| 1784 | .readpage = ext3_readpage, | 1785 | .readpage = ext3_readpage, |
| 1785 | .readpages = ext3_readpages, | 1786 | .readpages = ext3_readpages, |
| 1786 | .writepage = ext3_writeback_writepage, | 1787 | .writepage = ext3_writeback_writepage, |
| 1787 | .sync_page = block_sync_page, | 1788 | .sync_page = block_sync_page, |
| 1788 | .write_begin = ext3_write_begin, | 1789 | .write_begin = ext3_write_begin, |
| 1789 | .write_end = ext3_writeback_write_end, | 1790 | .write_end = ext3_writeback_write_end, |
| 1790 | .bmap = ext3_bmap, | 1791 | .bmap = ext3_bmap, |
| 1791 | .invalidatepage = ext3_invalidatepage, | 1792 | .invalidatepage = ext3_invalidatepage, |
| 1792 | .releasepage = ext3_releasepage, | 1793 | .releasepage = ext3_releasepage, |
| 1793 | .direct_IO = ext3_direct_IO, | 1794 | .direct_IO = ext3_direct_IO, |
| 1794 | .migratepage = buffer_migrate_page, | 1795 | .migratepage = buffer_migrate_page, |
| 1796 | .is_partially_uptodate = block_is_partially_uptodate, | ||
| 1795 | }; | 1797 | }; |
| 1796 | 1798 | ||
| 1797 | static const struct address_space_operations ext3_journalled_aops = { | 1799 | static const struct address_space_operations ext3_journalled_aops = { |
| 1798 | .readpage = ext3_readpage, | 1800 | .readpage = ext3_readpage, |
| 1799 | .readpages = ext3_readpages, | 1801 | .readpages = ext3_readpages, |
| 1800 | .writepage = ext3_journalled_writepage, | 1802 | .writepage = ext3_journalled_writepage, |
| 1801 | .sync_page = block_sync_page, | 1803 | .sync_page = block_sync_page, |
| 1802 | .write_begin = ext3_write_begin, | 1804 | .write_begin = ext3_write_begin, |
| 1803 | .write_end = ext3_journalled_write_end, | 1805 | .write_end = ext3_journalled_write_end, |
| 1804 | .set_page_dirty = ext3_journalled_set_page_dirty, | 1806 | .set_page_dirty = ext3_journalled_set_page_dirty, |
| 1805 | .bmap = ext3_bmap, | 1807 | .bmap = ext3_bmap, |
| 1806 | .invalidatepage = ext3_invalidatepage, | 1808 | .invalidatepage = ext3_invalidatepage, |
| 1807 | .releasepage = ext3_releasepage, | 1809 | .releasepage = ext3_releasepage, |
| 1810 | .is_partially_uptodate = block_is_partially_uptodate, | ||
| 1808 | }; | 1811 | }; |
| 1809 | 1812 | ||
| 1810 | void ext3_set_aops(struct inode *inode) | 1813 | void ext3_set_aops(struct inode *inode) |
| @@ -2127,7 +2130,21 @@ static void ext3_free_data(handle_t *handle, struct inode *inode, | |||
| 2127 | 2130 | ||
| 2128 | if (this_bh) { | 2131 | if (this_bh) { |
| 2129 | BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata"); | 2132 | BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata"); |
| 2130 | ext3_journal_dirty_metadata(handle, this_bh); | 2133 | |
| 2134 | /* | ||
| 2135 | * The buffer head should have an attached journal head at this | ||
| 2136 | * point. However, if the data is corrupted and an indirect | ||
| 2137 | * block pointed to itself, it would have been detached when | ||
| 2138 | * the block was cleared. Check for this instead of OOPSing. | ||
| 2139 | */ | ||
| 2140 | if (bh2jh(this_bh)) | ||
| 2141 | ext3_journal_dirty_metadata(handle, this_bh); | ||
| 2142 | else | ||
| 2143 | ext3_error(inode->i_sb, "ext3_free_data", | ||
| 2144 | "circular indirect block detected, " | ||
| 2145 | "inode=%lu, block=%llu", | ||
| 2146 | inode->i_ino, | ||
| 2147 | (unsigned long long)this_bh->b_blocknr); | ||
| 2131 | } | 2148 | } |
| 2132 | } | 2149 | } |
| 2133 | 2150 | ||
| @@ -2253,6 +2270,19 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, | |||
| 2253 | } | 2270 | } |
| 2254 | } | 2271 | } |
| 2255 | 2272 | ||
| 2273 | int ext3_can_truncate(struct inode *inode) | ||
| 2274 | { | ||
| 2275 | if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) | ||
| 2276 | return 0; | ||
| 2277 | if (S_ISREG(inode->i_mode)) | ||
| 2278 | return 1; | ||
| 2279 | if (S_ISDIR(inode->i_mode)) | ||
| 2280 | return 1; | ||
| 2281 | if (S_ISLNK(inode->i_mode)) | ||
| 2282 | return !ext3_inode_is_fast_symlink(inode); | ||
| 2283 | return 0; | ||
| 2284 | } | ||
| 2285 | |||
| 2256 | /* | 2286 | /* |
| 2257 | * ext3_truncate() | 2287 | * ext3_truncate() |
| 2258 | * | 2288 | * |
| @@ -2297,12 +2327,7 @@ void ext3_truncate(struct inode *inode) | |||
| 2297 | unsigned blocksize = inode->i_sb->s_blocksize; | 2327 | unsigned blocksize = inode->i_sb->s_blocksize; |
| 2298 | struct page *page; | 2328 | struct page *page; |
| 2299 | 2329 | ||
| 2300 | if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || | 2330 | if (!ext3_can_truncate(inode)) |
| 2301 | S_ISLNK(inode->i_mode))) | ||
| 2302 | return; | ||
| 2303 | if (ext3_inode_is_fast_symlink(inode)) | ||
| 2304 | return; | ||
| 2305 | if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) | ||
| 2306 | return; | 2331 | return; |
| 2307 | 2332 | ||
| 2308 | /* | 2333 | /* |
| @@ -2513,6 +2538,16 @@ static int __ext3_get_inode_loc(struct inode *inode, | |||
| 2513 | } | 2538 | } |
| 2514 | if (!buffer_uptodate(bh)) { | 2539 | if (!buffer_uptodate(bh)) { |
| 2515 | lock_buffer(bh); | 2540 | lock_buffer(bh); |
| 2541 | |||
| 2542 | /* | ||
| 2543 | * If the buffer has the write error flag, we have failed | ||
| 2544 | * to write out another inode in the same block. In this | ||
| 2545 | * case, we don't have to read the block because we may | ||
| 2546 | * read the old inode data successfully. | ||
| 2547 | */ | ||
| 2548 | if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) | ||
| 2549 | set_buffer_uptodate(bh); | ||
| 2550 | |||
| 2516 | if (buffer_uptodate(bh)) { | 2551 | if (buffer_uptodate(bh)) { |
| 2517 | /* someone brought it uptodate while we waited */ | 2552 | /* someone brought it uptodate while we waited */ |
| 2518 | unlock_buffer(bh); | 2553 | unlock_buffer(bh); |
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 0b8cf80154f1..de13e919cd81 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c | |||
| @@ -240,13 +240,13 @@ static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) | |||
| 240 | { | 240 | { |
| 241 | unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) - | 241 | unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) - |
| 242 | EXT3_DIR_REC_LEN(2) - infosize; | 242 | EXT3_DIR_REC_LEN(2) - infosize; |
| 243 | return 0? 20: entry_space / sizeof(struct dx_entry); | 243 | return entry_space / sizeof(struct dx_entry); |
| 244 | } | 244 | } |
| 245 | 245 | ||
| 246 | static inline unsigned dx_node_limit (struct inode *dir) | 246 | static inline unsigned dx_node_limit (struct inode *dir) |
| 247 | { | 247 | { |
| 248 | unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0); | 248 | unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0); |
| 249 | return 0? 22: entry_space / sizeof(struct dx_entry); | 249 | return entry_space / sizeof(struct dx_entry); |
| 250 | } | 250 | } |
| 251 | 251 | ||
| 252 | /* | 252 | /* |
| @@ -991,19 +991,21 @@ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, | |||
| 991 | de = (struct ext3_dir_entry_2 *) bh->b_data; | 991 | de = (struct ext3_dir_entry_2 *) bh->b_data; |
| 992 | top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize - | 992 | top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize - |
| 993 | EXT3_DIR_REC_LEN(0)); | 993 | EXT3_DIR_REC_LEN(0)); |
| 994 | for (; de < top; de = ext3_next_entry(de)) | 994 | for (; de < top; de = ext3_next_entry(de)) { |
| 995 | if (ext3_match (namelen, name, de)) { | 995 | int off = (block << EXT3_BLOCK_SIZE_BITS(sb)) |
| 996 | if (!ext3_check_dir_entry("ext3_find_entry", | 996 | + ((char *) de - bh->b_data); |
| 997 | dir, de, bh, | 997 | |
| 998 | (block<<EXT3_BLOCK_SIZE_BITS(sb)) | 998 | if (!ext3_check_dir_entry(__func__, dir, de, bh, off)) { |
| 999 | +((char *)de - bh->b_data))) { | 999 | brelse(bh); |
| 1000 | brelse (bh); | ||
| 1001 | *err = ERR_BAD_DX_DIR; | 1000 | *err = ERR_BAD_DX_DIR; |
| 1002 | goto errout; | 1001 | goto errout; |
| 1003 | } | 1002 | } |
| 1004 | *res_dir = de; | 1003 | |
| 1005 | dx_release (frames); | 1004 | if (ext3_match(namelen, name, de)) { |
| 1006 | return bh; | 1005 | *res_dir = de; |
| 1006 | dx_release(frames); | ||
| 1007 | return bh; | ||
| 1008 | } | ||
| 1007 | } | 1009 | } |
| 1008 | brelse (bh); | 1010 | brelse (bh); |
| 1009 | /* Check to see if we should continue to search */ | 1011 | /* Check to see if we should continue to search */ |
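For reference, the dx_root_limit()/dx_node_limit() change above only removes a leftover debug expression; the computed limits are unchanged. Below is a standalone sketch of that arithmetic for a 4096-byte block, assuming an 8-byte struct dx_entry, an 8-byte dx_root_info, and the usual EXT3_DIR_REC_LEN() rounding of name_len + 8 up to a multiple of 4.

#include <stdio.h>

/* EXT3_DIR_REC_LEN(name_len): name_len + 8, rounded up to a multiple of 4. */
static unsigned int dir_rec_len(unsigned int name_len)
{
	return (name_len + 8 + 3) & ~3u;
}

int main(void)
{
	unsigned int blocksize = 4096;
	unsigned int dx_entry_size = 8;		/* two 32-bit fields: hash, block */
	unsigned int dx_root_info_size = 8;	/* assumed sizeof(struct dx_root_info) */

	unsigned int root_space = blocksize - dir_rec_len(1) - dir_rec_len(2)
				  - dx_root_info_size;
	unsigned int node_space = blocksize - dir_rec_len(0);

	printf("dx_root_limit = %u\n", root_space / dx_entry_size);	/* 508 */
	printf("dx_node_limit = %u\n", node_space / dx_entry_size);	/* 511 */
	return 0;
}

With those assumed sizes, a 4 KiB root block indexes 508 entries and an interior node 511, which are the limits commonly quoted for htree directories.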
diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 2845425077e8..f38a5afc39a1 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c | |||
| @@ -472,7 +472,7 @@ static void ext3_destroy_inode(struct inode *inode) | |||
| 472 | kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); | 472 | kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); |
| 473 | } | 473 | } |
| 474 | 474 | ||
| 475 | static void init_once(struct kmem_cache * cachep, void *foo) | 475 | static void init_once(void *foo) |
| 476 | { | 476 | { |
| 477 | struct ext3_inode_info *ei = (struct ext3_inode_info *) foo; | 477 | struct ext3_inode_info *ei = (struct ext3_inode_info *) foo; |
| 478 | 478 | ||
| @@ -842,7 +842,7 @@ static int parse_options (char *options, struct super_block *sb, | |||
| 842 | int data_opt = 0; | 842 | int data_opt = 0; |
| 843 | int option; | 843 | int option; |
| 844 | #ifdef CONFIG_QUOTA | 844 | #ifdef CONFIG_QUOTA |
| 845 | int qtype; | 845 | int qtype, qfmt; |
| 846 | char *qname; | 846 | char *qname; |
| 847 | #endif | 847 | #endif |
| 848 | 848 | ||
| @@ -1018,9 +1018,11 @@ static int parse_options (char *options, struct super_block *sb, | |||
| 1018 | case Opt_grpjquota: | 1018 | case Opt_grpjquota: |
| 1019 | qtype = GRPQUOTA; | 1019 | qtype = GRPQUOTA; |
| 1020 | set_qf_name: | 1020 | set_qf_name: |
| 1021 | if (sb_any_quota_enabled(sb)) { | 1021 | if ((sb_any_quota_enabled(sb) || |
| 1022 | sb_any_quota_suspended(sb)) && | ||
| 1023 | !sbi->s_qf_names[qtype]) { | ||
| 1022 | printk(KERN_ERR | 1024 | printk(KERN_ERR |
| 1023 | "EXT3-fs: Cannot change journalled " | 1025 | "EXT3-fs: Cannot change journaled " |
| 1024 | "quota options when quota turned on.\n"); | 1026 | "quota options when quota turned on.\n"); |
| 1025 | return 0; | 1027 | return 0; |
| 1026 | } | 1028 | } |
| @@ -1056,9 +1058,11 @@ set_qf_name: | |||
| 1056 | case Opt_offgrpjquota: | 1058 | case Opt_offgrpjquota: |
| 1057 | qtype = GRPQUOTA; | 1059 | qtype = GRPQUOTA; |
| 1058 | clear_qf_name: | 1060 | clear_qf_name: |
| 1059 | if (sb_any_quota_enabled(sb)) { | 1061 | if ((sb_any_quota_enabled(sb) || |
| 1062 | sb_any_quota_suspended(sb)) && | ||
| 1063 | sbi->s_qf_names[qtype]) { | ||
| 1060 | printk(KERN_ERR "EXT3-fs: Cannot change " | 1064 | printk(KERN_ERR "EXT3-fs: Cannot change " |
| 1061 | "journalled quota options when " | 1065 | "journaled quota options when " |
| 1062 | "quota turned on.\n"); | 1066 | "quota turned on.\n"); |
| 1063 | return 0; | 1067 | return 0; |
| 1064 | } | 1068 | } |
| @@ -1069,10 +1073,20 @@ clear_qf_name: | |||
| 1069 | sbi->s_qf_names[qtype] = NULL; | 1073 | sbi->s_qf_names[qtype] = NULL; |
| 1070 | break; | 1074 | break; |
| 1071 | case Opt_jqfmt_vfsold: | 1075 | case Opt_jqfmt_vfsold: |
| 1072 | sbi->s_jquota_fmt = QFMT_VFS_OLD; | 1076 | qfmt = QFMT_VFS_OLD; |
| 1073 | break; | 1077 | goto set_qf_format; |
| 1074 | case Opt_jqfmt_vfsv0: | 1078 | case Opt_jqfmt_vfsv0: |
| 1075 | sbi->s_jquota_fmt = QFMT_VFS_V0; | 1079 | qfmt = QFMT_VFS_V0; |
| 1080 | set_qf_format: | ||
| 1081 | if ((sb_any_quota_enabled(sb) || | ||
| 1082 | sb_any_quota_suspended(sb)) && | ||
| 1083 | sbi->s_jquota_fmt != qfmt) { | ||
| 1084 | printk(KERN_ERR "EXT3-fs: Cannot change " | ||
| 1085 | "journaled quota options when " | ||
| 1086 | "quota turned on.\n"); | ||
| 1087 | return 0; | ||
| 1088 | } | ||
| 1089 | sbi->s_jquota_fmt = qfmt; | ||
| 1076 | break; | 1090 | break; |
| 1077 | case Opt_quota: | 1091 | case Opt_quota: |
| 1078 | case Opt_usrquota: | 1092 | case Opt_usrquota: |
| @@ -1084,7 +1098,8 @@ clear_qf_name: | |||
| 1084 | set_opt(sbi->s_mount_opt, GRPQUOTA); | 1098 | set_opt(sbi->s_mount_opt, GRPQUOTA); |
| 1085 | break; | 1099 | break; |
| 1086 | case Opt_noquota: | 1100 | case Opt_noquota: |
| 1087 | if (sb_any_quota_enabled(sb)) { | 1101 | if (sb_any_quota_enabled(sb) || |
| 1102 | sb_any_quota_suspended(sb)) { | ||
| 1088 | printk(KERN_ERR "EXT3-fs: Cannot change quota " | 1103 | printk(KERN_ERR "EXT3-fs: Cannot change quota " |
| 1089 | "options when quota turned on.\n"); | 1104 | "options when quota turned on.\n"); |
| 1090 | return 0; | 1105 | return 0; |
| @@ -1169,14 +1184,14 @@ clear_qf_name: | |||
| 1169 | } | 1184 | } |
| 1170 | 1185 | ||
| 1171 | if (!sbi->s_jquota_fmt) { | 1186 | if (!sbi->s_jquota_fmt) { |
| 1172 | printk(KERN_ERR "EXT3-fs: journalled quota format " | 1187 | printk(KERN_ERR "EXT3-fs: journaled quota format " |
| 1173 | "not specified.\n"); | 1188 | "not specified.\n"); |
| 1174 | return 0; | 1189 | return 0; |
| 1175 | } | 1190 | } |
| 1176 | } else { | 1191 | } else { |
| 1177 | if (sbi->s_jquota_fmt) { | 1192 | if (sbi->s_jquota_fmt) { |
| 1178 | printk(KERN_ERR "EXT3-fs: journalled quota format " | 1193 | printk(KERN_ERR "EXT3-fs: journaled quota format " |
| 1179 | "specified with no journalling " | 1194 | "specified with no journaling " |
| 1180 | "enabled.\n"); | 1195 | "enabled.\n"); |
| 1181 | return 0; | 1196 | return 0; |
| 1182 | } | 1197 | } |
| @@ -1370,7 +1385,7 @@ static void ext3_orphan_cleanup (struct super_block * sb, | |||
| 1370 | int ret = ext3_quota_on_mount(sb, i); | 1385 | int ret = ext3_quota_on_mount(sb, i); |
| 1371 | if (ret < 0) | 1386 | if (ret < 0) |
| 1372 | printk(KERN_ERR | 1387 | printk(KERN_ERR |
| 1373 | "EXT3-fs: Cannot turn on journalled " | 1388 | "EXT3-fs: Cannot turn on journaled " |
| 1374 | "quota: error %d\n", ret); | 1389 | "quota: error %d\n", ret); |
| 1375 | } | 1390 | } |
| 1376 | } | 1391 | } |
| @@ -2712,7 +2727,7 @@ static int ext3_release_dquot(struct dquot *dquot) | |||
| 2712 | 2727 | ||
| 2713 | static int ext3_mark_dquot_dirty(struct dquot *dquot) | 2728 | static int ext3_mark_dquot_dirty(struct dquot *dquot) |
| 2714 | { | 2729 | { |
| 2715 | /* Are we journalling quotas? */ | 2730 | /* Are we journaling quotas? */ |
| 2716 | if (EXT3_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || | 2731 | if (EXT3_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || |
| 2717 | EXT3_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { | 2732 | EXT3_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { |
| 2718 | dquot_mark_dquot_dirty(dquot); | 2733 | dquot_mark_dquot_dirty(dquot); |
| @@ -2759,25 +2774,45 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id, | |||
| 2759 | 2774 | ||
| 2760 | if (!test_opt(sb, QUOTA)) | 2775 | if (!test_opt(sb, QUOTA)) |
| 2761 | return -EINVAL; | 2776 | return -EINVAL; |
| 2762 | /* Not journalling quota or remount? */ | 2777 | /* When remounting, no checks are needed and in fact, path is NULL */ |
| 2763 | if ((!EXT3_SB(sb)->s_qf_names[USRQUOTA] && | 2778 | if (remount) |
| 2764 | !EXT3_SB(sb)->s_qf_names[GRPQUOTA]) || remount) | ||
| 2765 | return vfs_quota_on(sb, type, format_id, path, remount); | 2779 | return vfs_quota_on(sb, type, format_id, path, remount); |
| 2780 | |||
| 2766 | err = path_lookup(path, LOOKUP_FOLLOW, &nd); | 2781 | err = path_lookup(path, LOOKUP_FOLLOW, &nd); |
| 2767 | if (err) | 2782 | if (err) |
| 2768 | return err; | 2783 | return err; |
| 2784 | |||
| 2769 | /* Quotafile not on the same filesystem? */ | 2785 | /* Quotafile not on the same filesystem? */ |
| 2770 | if (nd.path.mnt->mnt_sb != sb) { | 2786 | if (nd.path.mnt->mnt_sb != sb) { |
| 2771 | path_put(&nd.path); | 2787 | path_put(&nd.path); |
| 2772 | return -EXDEV; | 2788 | return -EXDEV; |
| 2773 | } | 2789 | } |
| 2774 | /* Quotafile not in fs root? */ | 2790 | /* Journaling quota? */ |
| 2775 | if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) | 2791 | if (EXT3_SB(sb)->s_qf_names[type]) { |
| 2776 | printk(KERN_WARNING | 2792 | /* Quotafile not in fs root? */ |
| 2777 | "EXT3-fs: Quota file not on filesystem root. " | 2793 | if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) |
| 2778 | "Journalled quota will not work.\n"); | 2794 | printk(KERN_WARNING |
| 2795 | "EXT3-fs: Quota file not on filesystem root. " | ||
| 2796 | "Journaled quota will not work.\n"); | ||
| 2797 | } | ||
| 2798 | |||
| 2799 | /* | ||
| 2800 | * When we journal data on quota file, we have to flush journal to see | ||
| 2801 | * all updates to the file when we bypass pagecache... | ||
| 2802 | */ | ||
| 2803 | if (ext3_should_journal_data(nd.path.dentry->d_inode)) { | ||
| 2804 | /* | ||
| 2805 | * We don't need to lock updates but journal_flush() could | ||
| 2806 | * otherwise be livelocked... | ||
| 2807 | */ | ||
| 2808 | journal_lock_updates(EXT3_SB(sb)->s_journal); | ||
| 2809 | journal_flush(EXT3_SB(sb)->s_journal); | ||
| 2810 | journal_unlock_updates(EXT3_SB(sb)->s_journal); | ||
| 2811 | } | ||
| 2812 | |||
| 2813 | err = vfs_quota_on_path(sb, type, format_id, &nd.path); | ||
| 2779 | path_put(&nd.path); | 2814 | path_put(&nd.path); |
| 2780 | return vfs_quota_on(sb, type, format_id, path, remount); | 2815 | return err; |
| 2781 | } | 2816 | } |
| 2782 | 2817 | ||
| 2783 | /* Read data from quotafile - avoid pagecache and such because we cannot afford | 2818 | /* Read data from quotafile - avoid pagecache and such because we cannot afford |
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c index 821efaf2b94e..37b81097bdf2 100644 --- a/fs/ext3/xattr_security.c +++ b/fs/ext3/xattr_security.c | |||
| @@ -15,7 +15,7 @@ static size_t | |||
| 15 | ext3_xattr_security_list(struct inode *inode, char *list, size_t list_size, | 15 | ext3_xattr_security_list(struct inode *inode, char *list, size_t list_size, |
| 16 | const char *name, size_t name_len) | 16 | const char *name, size_t name_len) |
| 17 | { | 17 | { |
| 18 | const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1; | 18 | const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; |
| 19 | const size_t total_len = prefix_len + name_len + 1; | 19 | const size_t total_len = prefix_len + name_len + 1; |
| 20 | 20 | ||
| 21 | 21 | ||
diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c index 0327497a55ce..c7c41a410c4b 100644 --- a/fs/ext3/xattr_trusted.c +++ b/fs/ext3/xattr_trusted.c | |||
| @@ -13,13 +13,11 @@ | |||
| 13 | #include <linux/ext3_fs.h> | 13 | #include <linux/ext3_fs.h> |
| 14 | #include "xattr.h" | 14 | #include "xattr.h" |
| 15 | 15 | ||
| 16 | #define XATTR_TRUSTED_PREFIX "trusted." | ||
| 17 | |||
| 18 | static size_t | 16 | static size_t |
| 19 | ext3_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, | 17 | ext3_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, |
| 20 | const char *name, size_t name_len) | 18 | const char *name, size_t name_len) |
| 21 | { | 19 | { |
| 22 | const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1; | 20 | const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; |
| 23 | const size_t total_len = prefix_len + name_len + 1; | 21 | const size_t total_len = prefix_len + name_len + 1; |
| 24 | 22 | ||
| 25 | if (!capable(CAP_SYS_ADMIN)) | 23 | if (!capable(CAP_SYS_ADMIN)) |
diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c index 1abd8f92c440..430fe63b31b3 100644 --- a/fs/ext3/xattr_user.c +++ b/fs/ext3/xattr_user.c | |||
| @@ -12,13 +12,11 @@ | |||
| 12 | #include <linux/ext3_fs.h> | 12 | #include <linux/ext3_fs.h> |
| 13 | #include "xattr.h" | 13 | #include "xattr.h" |
| 14 | 14 | ||
| 15 | #define XATTR_USER_PREFIX "user." | ||
| 16 | |||
| 17 | static size_t | 15 | static size_t |
| 18 | ext3_xattr_user_list(struct inode *inode, char *list, size_t list_size, | 16 | ext3_xattr_user_list(struct inode *inode, char *list, size_t list_size, |
| 19 | const char *name, size_t name_len) | 17 | const char *name, size_t name_len) |
| 20 | { | 18 | { |
| 21 | const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1; | 19 | const size_t prefix_len = XATTR_USER_PREFIX_LEN; |
| 22 | const size_t total_len = prefix_len + name_len + 1; | 20 | const size_t total_len = prefix_len + name_len + 1; |
| 23 | 21 | ||
| 24 | if (!test_opt(inode->i_sb, XATTR_USER)) | 22 | if (!test_opt(inode->i_sb, XATTR_USER)) |
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 3c8dab880d91..694ed6fadcc8 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c | |||
| @@ -40,34 +40,35 @@ ext4_acl_from_disk(const void *value, size_t size) | |||
| 40 | acl = posix_acl_alloc(count, GFP_NOFS); | 40 | acl = posix_acl_alloc(count, GFP_NOFS); |
| 41 | if (!acl) | 41 | if (!acl) |
| 42 | return ERR_PTR(-ENOMEM); | 42 | return ERR_PTR(-ENOMEM); |
| 43 | for (n=0; n < count; n++) { | 43 | for (n = 0; n < count; n++) { |
| 44 | ext4_acl_entry *entry = | 44 | ext4_acl_entry *entry = |
| 45 | (ext4_acl_entry *)value; | 45 | (ext4_acl_entry *)value; |
| 46 | if ((char *)value + sizeof(ext4_acl_entry_short) > end) | 46 | if ((char *)value + sizeof(ext4_acl_entry_short) > end) |
| 47 | goto fail; | 47 | goto fail; |
| 48 | acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); | 48 | acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); |
| 49 | acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); | 49 | acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); |
| 50 | switch(acl->a_entries[n].e_tag) { | 50 | |
| 51 | case ACL_USER_OBJ: | 51 | switch (acl->a_entries[n].e_tag) { |
| 52 | case ACL_GROUP_OBJ: | 52 | case ACL_USER_OBJ: |
| 53 | case ACL_MASK: | 53 | case ACL_GROUP_OBJ: |
| 54 | case ACL_OTHER: | 54 | case ACL_MASK: |
| 55 | value = (char *)value + | 55 | case ACL_OTHER: |
| 56 | sizeof(ext4_acl_entry_short); | 56 | value = (char *)value + |
| 57 | acl->a_entries[n].e_id = ACL_UNDEFINED_ID; | 57 | sizeof(ext4_acl_entry_short); |
| 58 | break; | 58 | acl->a_entries[n].e_id = ACL_UNDEFINED_ID; |
| 59 | 59 | break; | |
| 60 | case ACL_USER: | 60 | |
| 61 | case ACL_GROUP: | 61 | case ACL_USER: |
| 62 | value = (char *)value + sizeof(ext4_acl_entry); | 62 | case ACL_GROUP: |
| 63 | if ((char *)value > end) | 63 | value = (char *)value + sizeof(ext4_acl_entry); |
| 64 | goto fail; | 64 | if ((char *)value > end) |
| 65 | acl->a_entries[n].e_id = | ||
| 66 | le32_to_cpu(entry->e_id); | ||
| 67 | break; | ||
| 68 | |||
| 69 | default: | ||
| 70 | goto fail; | 65 | goto fail; |
| 66 | acl->a_entries[n].e_id = | ||
| 67 | le32_to_cpu(entry->e_id); | ||
| 68 | break; | ||
| 69 | |||
| 70 | default: | ||
| 71 | goto fail; | ||
| 71 | } | 72 | } |
| 72 | } | 73 | } |
| 73 | if (value != end) | 74 | if (value != end) |
| @@ -96,27 +97,26 @@ ext4_acl_to_disk(const struct posix_acl *acl, size_t *size) | |||
| 96 | return ERR_PTR(-ENOMEM); | 97 | return ERR_PTR(-ENOMEM); |
| 97 | ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION); | 98 | ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION); |
| 98 | e = (char *)ext_acl + sizeof(ext4_acl_header); | 99 | e = (char *)ext_acl + sizeof(ext4_acl_header); |
| 99 | for (n=0; n < acl->a_count; n++) { | 100 | for (n = 0; n < acl->a_count; n++) { |
| 100 | ext4_acl_entry *entry = (ext4_acl_entry *)e; | 101 | ext4_acl_entry *entry = (ext4_acl_entry *)e; |
| 101 | entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); | 102 | entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); |
| 102 | entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); | 103 | entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); |
| 103 | switch(acl->a_entries[n].e_tag) { | 104 | switch (acl->a_entries[n].e_tag) { |
| 104 | case ACL_USER: | 105 | case ACL_USER: |
| 105 | case ACL_GROUP: | 106 | case ACL_GROUP: |
| 106 | entry->e_id = | 107 | entry->e_id = cpu_to_le32(acl->a_entries[n].e_id); |
| 107 | cpu_to_le32(acl->a_entries[n].e_id); | 108 | e += sizeof(ext4_acl_entry); |
| 108 | e += sizeof(ext4_acl_entry); | 109 | break; |
| 109 | break; | 110 | |
| 110 | 111 | case ACL_USER_OBJ: | |
| 111 | case ACL_USER_OBJ: | 112 | case ACL_GROUP_OBJ: |
| 112 | case ACL_GROUP_OBJ: | 113 | case ACL_MASK: |
| 113 | case ACL_MASK: | 114 | case ACL_OTHER: |
| 114 | case ACL_OTHER: | 115 | e += sizeof(ext4_acl_entry_short); |
| 115 | e += sizeof(ext4_acl_entry_short); | 116 | break; |
| 116 | break; | 117 | |
| 117 | 118 | default: | |
| 118 | default: | 119 | goto fail; |
| 119 | goto fail; | ||
| 120 | } | 120 | } |
| 121 | } | 121 | } |
| 122 | return (char *)ext_acl; | 122 | return (char *)ext_acl; |
| @@ -167,23 +167,23 @@ ext4_get_acl(struct inode *inode, int type) | |||
| 167 | if (!test_opt(inode->i_sb, POSIX_ACL)) | 167 | if (!test_opt(inode->i_sb, POSIX_ACL)) |
| 168 | return NULL; | 168 | return NULL; |
| 169 | 169 | ||
| 170 | switch(type) { | 170 | switch (type) { |
| 171 | case ACL_TYPE_ACCESS: | 171 | case ACL_TYPE_ACCESS: |
| 172 | acl = ext4_iget_acl(inode, &ei->i_acl); | 172 | acl = ext4_iget_acl(inode, &ei->i_acl); |
| 173 | if (acl != EXT4_ACL_NOT_CACHED) | 173 | if (acl != EXT4_ACL_NOT_CACHED) |
| 174 | return acl; | 174 | return acl; |
| 175 | name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; | 175 | name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; |
| 176 | break; | 176 | break; |
| 177 | 177 | ||
| 178 | case ACL_TYPE_DEFAULT: | 178 | case ACL_TYPE_DEFAULT: |
| 179 | acl = ext4_iget_acl(inode, &ei->i_default_acl); | 179 | acl = ext4_iget_acl(inode, &ei->i_default_acl); |
| 180 | if (acl != EXT4_ACL_NOT_CACHED) | 180 | if (acl != EXT4_ACL_NOT_CACHED) |
| 181 | return acl; | 181 | return acl; |
| 182 | name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; | 182 | name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; |
| 183 | break; | 183 | break; |
| 184 | 184 | ||
| 185 | default: | 185 | default: |
| 186 | return ERR_PTR(-EINVAL); | 186 | return ERR_PTR(-EINVAL); |
| 187 | } | 187 | } |
| 188 | retval = ext4_xattr_get(inode, name_index, "", NULL, 0); | 188 | retval = ext4_xattr_get(inode, name_index, "", NULL, 0); |
| 189 | if (retval > 0) { | 189 | if (retval > 0) { |
| @@ -201,14 +201,14 @@ ext4_get_acl(struct inode *inode, int type) | |||
| 201 | kfree(value); | 201 | kfree(value); |
| 202 | 202 | ||
| 203 | if (!IS_ERR(acl)) { | 203 | if (!IS_ERR(acl)) { |
| 204 | switch(type) { | 204 | switch (type) { |
| 205 | case ACL_TYPE_ACCESS: | 205 | case ACL_TYPE_ACCESS: |
| 206 | ext4_iset_acl(inode, &ei->i_acl, acl); | 206 | ext4_iset_acl(inode, &ei->i_acl, acl); |
| 207 | break; | 207 | break; |
| 208 | 208 | ||
| 209 | case ACL_TYPE_DEFAULT: | 209 | case ACL_TYPE_DEFAULT: |
| 210 | ext4_iset_acl(inode, &ei->i_default_acl, acl); | 210 | ext4_iset_acl(inode, &ei->i_default_acl, acl); |
| 211 | break; | 211 | break; |
| 212 | } | 212 | } |
| 213 | } | 213 | } |
| 214 | return acl; | 214 | return acl; |
| @@ -232,31 +232,31 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, | |||
| 232 | if (S_ISLNK(inode->i_mode)) | 232 | if (S_ISLNK(inode->i_mode)) |
| 233 | return -EOPNOTSUPP; | 233 | return -EOPNOTSUPP; |
| 234 | 234 | ||
| 235 | switch(type) { | 235 | switch (type) { |
| 236 | case ACL_TYPE_ACCESS: | 236 | case ACL_TYPE_ACCESS: |
| 237 | name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; | 237 | name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; |
| 238 | if (acl) { | 238 | if (acl) { |
| 239 | mode_t mode = inode->i_mode; | 239 | mode_t mode = inode->i_mode; |
| 240 | error = posix_acl_equiv_mode(acl, &mode); | 240 | error = posix_acl_equiv_mode(acl, &mode); |
| 241 | if (error < 0) | 241 | if (error < 0) |
| 242 | return error; | 242 | return error; |
| 243 | else { | 243 | else { |
| 244 | inode->i_mode = mode; | 244 | inode->i_mode = mode; |
| 245 | ext4_mark_inode_dirty(handle, inode); | 245 | ext4_mark_inode_dirty(handle, inode); |
| 246 | if (error == 0) | 246 | if (error == 0) |
| 247 | acl = NULL; | 247 | acl = NULL; |
| 248 | } | ||
| 249 | } | 248 | } |
| 250 | break; | 249 | } |
| 250 | break; | ||
| 251 | 251 | ||
| 252 | case ACL_TYPE_DEFAULT: | 252 | case ACL_TYPE_DEFAULT: |
| 253 | name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; | 253 | name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; |
| 254 | if (!S_ISDIR(inode->i_mode)) | 254 | if (!S_ISDIR(inode->i_mode)) |
| 255 | return acl ? -EACCES : 0; | 255 | return acl ? -EACCES : 0; |
| 256 | break; | 256 | break; |
| 257 | 257 | ||
| 258 | default: | 258 | default: |
| 259 | return -EINVAL; | 259 | return -EINVAL; |
| 260 | } | 260 | } |
| 261 | if (acl) { | 261 | if (acl) { |
| 262 | value = ext4_acl_to_disk(acl, &size); | 262 | value = ext4_acl_to_disk(acl, &size); |
| @@ -269,14 +269,14 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, | |||
| 269 | 269 | ||
| 270 | kfree(value); | 270 | kfree(value); |
| 271 | if (!error) { | 271 | if (!error) { |
| 272 | switch(type) { | 272 | switch (type) { |
| 273 | case ACL_TYPE_ACCESS: | 273 | case ACL_TYPE_ACCESS: |
| 274 | ext4_iset_acl(inode, &ei->i_acl, acl); | 274 | ext4_iset_acl(inode, &ei->i_acl, acl); |
| 275 | break; | 275 | break; |
| 276 | 276 | ||
| 277 | case ACL_TYPE_DEFAULT: | 277 | case ACL_TYPE_DEFAULT: |
| 278 | ext4_iset_acl(inode, &ei->i_default_acl, acl); | 278 | ext4_iset_acl(inode, &ei->i_default_acl, acl); |
| 279 | break; | 279 | break; |
| 280 | } | 280 | } |
| 281 | } | 281 | } |
| 282 | return error; | 282 | return error; |
| @@ -299,7 +299,7 @@ ext4_check_acl(struct inode *inode, int mask) | |||
| 299 | } | 299 | } |
| 300 | 300 | ||
| 301 | int | 301 | int |
| 302 | ext4_permission(struct inode *inode, int mask, struct nameidata *nd) | 302 | ext4_permission(struct inode *inode, int mask) |
| 303 | { | 303 | { |
| 304 | return generic_permission(inode, mask, ext4_check_acl); | 304 | return generic_permission(inode, mask, ext4_check_acl); |
| 305 | } | 305 | } |
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 26a5c1abf147..cd2b855a07d6 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h | |||
| @@ -58,7 +58,7 @@ static inline int ext4_acl_count(size_t size) | |||
| 58 | #define EXT4_ACL_NOT_CACHED ((void *)-1) | 58 | #define EXT4_ACL_NOT_CACHED ((void *)-1) |
| 59 | 59 | ||
| 60 | /* acl.c */ | 60 | /* acl.c */ |
| 61 | extern int ext4_permission (struct inode *, int, struct nameidata *); | 61 | extern int ext4_permission (struct inode *, int); |
| 62 | extern int ext4_acl_chmod (struct inode *); | 62 | extern int ext4_acl_chmod (struct inode *); |
| 63 | extern int ext4_init_acl (handle_t *, struct inode *, struct inode *); | 63 | extern int ext4_init_acl (handle_t *, struct inode *, struct inode *); |
| 64 | 64 | ||
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 495ab21b9832..e9fa960ba6da 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c | |||
| @@ -314,25 +314,28 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) | |||
| 314 | if (unlikely(!bh)) { | 314 | if (unlikely(!bh)) { |
| 315 | ext4_error(sb, __func__, | 315 | ext4_error(sb, __func__, |
| 316 | "Cannot read block bitmap - " | 316 | "Cannot read block bitmap - " |
| 317 | "block_group = %d, block_bitmap = %llu", | 317 | "block_group = %lu, block_bitmap = %llu", |
| 318 | (int)block_group, (unsigned long long)bitmap_blk); | 318 | block_group, bitmap_blk); |
| 319 | return NULL; | 319 | return NULL; |
| 320 | } | 320 | } |
| 321 | if (bh_uptodate_or_lock(bh)) | 321 | if (bh_uptodate_or_lock(bh)) |
| 322 | return bh; | 322 | return bh; |
| 323 | 323 | ||
| 324 | spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); | ||
| 324 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | 325 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { |
| 325 | ext4_init_block_bitmap(sb, bh, block_group, desc); | 326 | ext4_init_block_bitmap(sb, bh, block_group, desc); |
| 326 | set_buffer_uptodate(bh); | 327 | set_buffer_uptodate(bh); |
| 327 | unlock_buffer(bh); | 328 | unlock_buffer(bh); |
| 329 | spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); | ||
| 328 | return bh; | 330 | return bh; |
| 329 | } | 331 | } |
| 332 | spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); | ||
| 330 | if (bh_submit_read(bh) < 0) { | 333 | if (bh_submit_read(bh) < 0) { |
| 331 | put_bh(bh); | 334 | put_bh(bh); |
| 332 | ext4_error(sb, __func__, | 335 | ext4_error(sb, __func__, |
| 333 | "Cannot read block bitmap - " | 336 | "Cannot read block bitmap - " |
| 334 | "block_group = %d, block_bitmap = %llu", | 337 | "block_group = %lu, block_bitmap = %llu", |
| 335 | (int)block_group, (unsigned long long)bitmap_blk); | 338 | block_group, bitmap_blk); |
| 336 | return NULL; | 339 | return NULL; |
| 337 | } | 340 | } |
| 338 | ext4_valid_block_bitmap(sb, desc, block_group, bh); | 341 | ext4_valid_block_bitmap(sb, desc, block_group, bh); |
| @@ -1623,6 +1626,9 @@ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, | |||
| 1623 | free_blocks = | 1626 | free_blocks = |
| 1624 | percpu_counter_sum_and_set(&sbi->s_freeblocks_counter); | 1627 | percpu_counter_sum_and_set(&sbi->s_freeblocks_counter); |
| 1625 | #endif | 1628 | #endif |
| 1629 | if (free_blocks <= root_blocks) | ||
| 1630 | /* we don't have free space */ | ||
| 1631 | return 0; | ||
| 1626 | if (free_blocks - root_blocks < nblocks) | 1632 | if (free_blocks - root_blocks < nblocks) |
| 1627 | return free_blocks - root_blocks; | 1633 | return free_blocks - root_blocks; |
| 1628 | return nblocks; | 1634 | return nblocks; |
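One way to read the new early return above: free_blocks and root_blocks are unsigned block counts (ext4_fsblk_t), so without the guard the subtraction free_blocks - root_blocks would wrap to a huge value whenever the root reservation exceeds the free count, and the function would report space that is not there. A small demonstration of that failure mode, in plain C rather than kernel code, with purely illustrative numbers:

#include <stdio.h>

typedef unsigned long long fsblk_t;	/* stands in for ext4_fsblk_t */

static fsblk_t has_free_blocks(fsblk_t free_blocks, fsblk_t root_blocks,
			       fsblk_t nblocks, int with_guard)
{
	if (with_guard && free_blocks <= root_blocks)
		return 0;			/* nothing left for ordinary users */
	if (free_blocks - root_blocks < nblocks)
		return free_blocks - root_blocks;
	return nblocks;
}

int main(void)
{
	/* Only 5 free blocks, 10 reserved for root: nothing is really available. */
	printf("without guard: %llu\n", has_free_blocks(5, 10, 100, 0));	/* 100, wrong */
	printf("with guard:    %llu\n", has_free_blocks(5, 10, 100, 1));	/* 0 */
	return 0;
}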
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index d3d23d73c08b..ec8e33b45219 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c | |||
| @@ -411,7 +411,7 @@ static int call_filldir(struct file * filp, void * dirent, | |||
| 411 | get_dtype(sb, fname->file_type)); | 411 | get_dtype(sb, fname->file_type)); |
| 412 | if (error) { | 412 | if (error) { |
| 413 | filp->f_pos = curr_pos; | 413 | filp->f_pos = curr_pos; |
| 414 | info->extra_fname = fname->next; | 414 | info->extra_fname = fname; |
| 415 | return error; | 415 | return error; |
| 416 | } | 416 | } |
| 417 | fname = fname->next; | 417 | fname = fname->next; |
| @@ -450,11 +450,21 @@ static int ext4_dx_readdir(struct file * filp, | |||
| 450 | * If there are any leftover names on the hash collision | 450 | * If there are any leftover names on the hash collision |
| 451 | * chain, return them first. | 451 | * chain, return them first. |
| 452 | */ | 452 | */ |
| 453 | if (info->extra_fname && | 453 | if (info->extra_fname) { |
| 454 | call_filldir(filp, dirent, filldir, info->extra_fname)) | 454 | if (call_filldir(filp, dirent, filldir, info->extra_fname)) |
| 455 | goto finished; | 455 | goto finished; |
| 456 | 456 | ||
| 457 | if (!info->curr_node) | 457 | info->extra_fname = NULL; |
| 458 | info->curr_node = rb_next(info->curr_node); | ||
| 459 | if (!info->curr_node) { | ||
| 460 | if (info->next_hash == ~0) { | ||
| 461 | filp->f_pos = EXT4_HTREE_EOF; | ||
| 462 | goto finished; | ||
| 463 | } | ||
| 464 | info->curr_hash = info->next_hash; | ||
| 465 | info->curr_minor_hash = 0; | ||
| 466 | } | ||
| 467 | } else if (!info->curr_node) | ||
| 458 | info->curr_node = rb_first(&info->root); | 468 | info->curr_node = rb_first(&info->root); |
| 459 | 469 | ||
| 460 | while (1) { | 470 | while (1) { |
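The call_filldir()/ext4_dx_readdir() changes above fix a resume bug: when filldir rejects an entry (typically because the user buffer is full), the rejected entry itself must be remembered and retried on the next call; storing fname->next instead silently dropped it. A generic sketch of the pattern, unrelated to the actual ext4 structures:

#include <stdbool.h>
#include <stdio.h>

struct name_entry {
	const char *name;
	struct name_entry *next;
};

/* A consumer that accepts at most 'capacity' entries per call, standing in
 * for a nearly-full readdir buffer. */
struct sink {
	int capacity;
};

static bool emit(struct sink *s, const char *name)
{
	if (s->capacity == 0)
		return false;		/* caller must retry this entry later */
	s->capacity--;
	printf("%s\n", name);
	return true;
}

/* Returns the first entry that could NOT be emitted (to be retried on the
 * next call), or NULL when the whole chain was consumed. */
static struct name_entry *drain(struct name_entry *e, struct sink *s)
{
	while (e) {
		if (!emit(s, e->name))
			return e;	/* remember e itself, not e->next */
		e = e->next;
	}
	return NULL;
}

int main(void)
{
	struct name_entry c = { "c", NULL };
	struct name_entry b = { "b", &c };
	struct name_entry a = { "a", &b };
	struct sink s = { 2 };

	struct name_entry *rest = drain(&a, &s);	/* prints a, b; keeps c */
	s.capacity = 2;
	drain(rest, &s);				/* prints c */
	return 0;
}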
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 303e41cf7b14..295003241d3d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h | |||
| @@ -1044,7 +1044,6 @@ extern void ext4_mb_update_group_info(struct ext4_group_info *grp, | |||
| 1044 | 1044 | ||
| 1045 | 1045 | ||
| 1046 | /* inode.c */ | 1046 | /* inode.c */ |
| 1047 | void ext4_da_release_space(struct inode *inode, int used, int to_free); | ||
| 1048 | int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, | 1047 | int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, |
| 1049 | struct buffer_head *bh, ext4_fsblk_t blocknr); | 1048 | struct buffer_head *bh, ext4_fsblk_t blocknr); |
| 1050 | struct buffer_head *ext4_getblk(handle_t *, struct inode *, | 1049 | struct buffer_head *ext4_getblk(handle_t *, struct inode *, |
| @@ -1073,6 +1072,8 @@ extern void ext4_set_inode_flags(struct inode *); | |||
| 1073 | extern void ext4_get_inode_flags(struct ext4_inode_info *); | 1072 | extern void ext4_get_inode_flags(struct ext4_inode_info *); |
| 1074 | extern void ext4_set_aops(struct inode *inode); | 1073 | extern void ext4_set_aops(struct inode *inode); |
| 1075 | extern int ext4_writepage_trans_blocks(struct inode *); | 1074 | extern int ext4_writepage_trans_blocks(struct inode *); |
| 1075 | extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks); | ||
| 1076 | extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); | ||
| 1076 | extern int ext4_block_truncate_page(handle_t *handle, | 1077 | extern int ext4_block_truncate_page(handle_t *handle, |
| 1077 | struct address_space *mapping, loff_t from); | 1078 | struct address_space *mapping, loff_t from); |
| 1078 | extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); | 1079 | extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); |
| @@ -1228,6 +1229,8 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations; | |||
| 1228 | /* extents.c */ | 1229 | /* extents.c */ |
| 1229 | extern int ext4_ext_tree_init(handle_t *handle, struct inode *); | 1230 | extern int ext4_ext_tree_init(handle_t *handle, struct inode *); |
| 1230 | extern int ext4_ext_writepage_trans_blocks(struct inode *, int); | 1231 | extern int ext4_ext_writepage_trans_blocks(struct inode *, int); |
| 1232 | extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, | ||
| 1233 | int chunk); | ||
| 1231 | extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | 1234 | extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, |
| 1232 | ext4_lblk_t iblock, | 1235 | ext4_lblk_t iblock, |
| 1233 | unsigned long max_blocks, struct buffer_head *bh_result, | 1236 | unsigned long max_blocks, struct buffer_head *bh_result, |
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 6c166c0a54b7..d33dc56d6986 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h | |||
| @@ -216,7 +216,9 @@ extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); | |||
| 216 | extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); | 216 | extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); |
| 217 | extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); | 217 | extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); |
| 218 | extern int ext4_extent_tree_init(handle_t *, struct inode *); | 218 | extern int ext4_extent_tree_init(handle_t *, struct inode *); |
| 219 | extern int ext4_ext_calc_credits_for_insert(struct inode *, struct ext4_ext_path *); | 219 | extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, |
| 220 | int num, | ||
| 221 | struct ext4_ext_path *path); | ||
| 220 | extern int ext4_ext_try_to_merge(struct inode *inode, | 222 | extern int ext4_ext_try_to_merge(struct inode *inode, |
| 221 | struct ext4_ext_path *path, | 223 | struct ext4_ext_path *path, |
| 222 | struct ext4_extent *); | 224 | struct ext4_extent *); |
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index eb8bc3afe6e9..b455c685a98b 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h | |||
| @@ -51,6 +51,14 @@ | |||
| 51 | EXT4_XATTR_TRANS_BLOCKS - 2 + \ | 51 | EXT4_XATTR_TRANS_BLOCKS - 2 + \ |
| 52 | 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) | 52 | 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) |
| 53 | 53 | ||
| 54 | /* | ||
| 55 | * Define the number of metadata blocks we need to account to modify data. | ||
| 56 | * | ||
| 57 | * This includes the super block, inode block, quota blocks and xattr blocks | ||
| 58 | */ | ||
| 59 | #define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ | ||
| 60 | 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) | ||
| 61 | |||
| 54 | /* Delete operations potentially hit one directory's namespace plus an | 62 | /* Delete operations potentially hit one directory's namespace plus an |
| 55 | * entire inode, plus arbitrary amounts of bitmap/indirection data. Be | 63 | * entire inode, plus arbitrary amounts of bitmap/indirection data. Be |
| 56 | * generous. We can grow the delete transaction later if necessary. */ | 64 | * generous. We can grow the delete transaction later if necessary. */ |
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 42c4c0c892ed..b24d3c53f20c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c | |||
| @@ -99,7 +99,7 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed) | |||
| 99 | if (handle->h_buffer_credits > needed) | 99 | if (handle->h_buffer_credits > needed) |
| 100 | return 0; | 100 | return 0; |
| 101 | err = ext4_journal_extend(handle, needed); | 101 | err = ext4_journal_extend(handle, needed); |
| 102 | if (err) | 102 | if (err <= 0) |
| 103 | return err; | 103 | return err; |
| 104 | return ext4_journal_restart(handle, needed); | 104 | return ext4_journal_restart(handle, needed); |
| 105 | } | 105 | } |
| @@ -1441,7 +1441,7 @@ unsigned int ext4_ext_check_overlap(struct inode *inode, | |||
| 1441 | 1441 | ||
| 1442 | /* | 1442 | /* |
| 1443 | * get the next allocated block if the extent in the path | 1443 | * get the next allocated block if the extent in the path |
| 1444 | * is before the requested block(s) | 1444 | * is before the requested block(s) |
| 1445 | */ | 1445 | */ |
| 1446 | if (b2 < b1) { | 1446 | if (b2 < b1) { |
| 1447 | b2 = ext4_ext_next_allocated_block(path); | 1447 | b2 = ext4_ext_next_allocated_block(path); |
| @@ -1747,54 +1747,61 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, | |||
| 1747 | } | 1747 | } |
| 1748 | 1748 | ||
| 1749 | /* | 1749 | /* |
| 1750 | * ext4_ext_calc_credits_for_insert: | 1750 | * ext4_ext_calc_credits_for_single_extent: |
| 1751 | * This routine returns max. credits that the extent tree can consume. | 1751 | * This routine returns max. credits that are needed to insert an extent |
| 1752 | * It should be OK for low-performance paths like ->writepage() | 1752 | * into the extent tree. |
| 1753 | * To allow many writing processes to fit into a single transaction, | 1753 | * When passing the actual path, the caller should calculate credits |
| 1754 | * the caller should calculate credits under i_data_sem and | 1754 | * under i_data_sem. |
| 1755 | * pass the actual path. | ||
| 1756 | */ | 1755 | */ |
| 1757 | int ext4_ext_calc_credits_for_insert(struct inode *inode, | 1756 | int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, |
| 1758 | struct ext4_ext_path *path) | 1757 | struct ext4_ext_path *path) |
| 1759 | { | 1758 | { |
| 1760 | int depth, needed; | ||
| 1761 | |||
| 1762 | if (path) { | 1759 | if (path) { |
| 1760 | int depth = ext_depth(inode); | ||
| 1761 | int ret = 0; | ||
| 1762 | |||
| 1763 | /* probably there is space in leaf? */ | 1763 | /* probably there is space in leaf? */ |
| 1764 | depth = ext_depth(inode); | ||
| 1765 | if (le16_to_cpu(path[depth].p_hdr->eh_entries) | 1764 | if (le16_to_cpu(path[depth].p_hdr->eh_entries) |
| 1766 | < le16_to_cpu(path[depth].p_hdr->eh_max)) | 1765 | < le16_to_cpu(path[depth].p_hdr->eh_max)) { |
| 1767 | return 1; | ||
| 1768 | } | ||
| 1769 | |||
| 1770 | /* | ||
| 1771 | * given 32-bit logical block (4294967296 blocks), max. tree | ||
| 1772 | * can be 4 levels in depth -- 4 * 340^4 == 53453440000. | ||
| 1773 | * Let's also add one more level for imbalance. | ||
| 1774 | */ | ||
| 1775 | depth = 5; | ||
| 1776 | 1766 | ||
| 1777 | /* allocation of new data block(s) */ | 1767 | /* |
| 1778 | needed = 2; | 1768 | * There is some space in the leaf tree, no |
| 1769 | * need to account for leaf block credit | ||
| 1770 | * | ||
| 1771 | * bitmaps and block group descriptor blocks | ||
| 1772 | * and other metadata blocks still need to be | ||
| 1773 | * accounted. | ||
| 1774 | */ | ||
| 1775 | /* 1 bitmap, 1 block group descriptor */ | ||
| 1776 | ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); | ||
| 1777 | } | ||
| 1778 | } | ||
| 1779 | 1779 | ||
| 1780 | /* | 1780 | return ext4_chunk_trans_blocks(inode, nrblocks); |
| 1781 | * tree can be full, so it would need to grow in depth: | 1781 | } |
| 1782 | * we need one credit to modify old root, credits for | ||
| 1783 | * new root will be added in split accounting | ||
| 1784 | */ | ||
| 1785 | needed += 1; | ||
| 1786 | 1782 | ||
| 1787 | /* | 1783 | /* |
| 1788 | * Index split can happen, we would need: | 1784 | * How many index/leaf blocks need to change/allocate to modify nrblocks? |
| 1789 | * allocate intermediate indexes (bitmap + group) | 1786 | * if nrblocks fit in a single extent (chunk flag is 1), then |
| 1790 | * + change two blocks at each level, but root (already included) | 1787 | * in the worst case, each tree level index/leaf needs to be changed; |
| 1791 | */ | 1788 | * if the tree splits due to inserting a new extent, then the old tree |
| 1792 | needed += (depth * 2) + (depth * 2); | 1789 | * index/leaf needs to be updated too |
| 1789 | * index/leaf need to be updated too | ||
| 1790 | * | ||
| 1791 | * If the nrblocks are discontiguous, they could cause | ||
| 1792 | * the whole tree split more than once, but this is really rare. | ||
| 1793 | */ | ||
| 1794 | int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) | ||
| 1795 | { | ||
| 1796 | int index; | ||
| 1797 | int depth = ext_depth(inode); | ||
| 1793 | 1798 | ||
| 1794 | /* any allocation modifies superblock */ | 1799 | if (chunk) |
| 1795 | needed += 1; | 1800 | index = depth * 2; |
| 1801 | else | ||
| 1802 | index = depth * 3; | ||
| 1796 | 1803 | ||
| 1797 | return needed; | 1804 | return index; |
| 1798 | } | 1805 | } |
| 1799 | 1806 | ||
| 1800 | static int ext4_remove_blocks(handle_t *handle, struct inode *inode, | 1807 | static int ext4_remove_blocks(handle_t *handle, struct inode *inode, |
| @@ -1910,16 +1917,18 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | |||
| 1910 | BUG_ON(b != ex_ee_block + ex_ee_len - 1); | 1917 | BUG_ON(b != ex_ee_block + ex_ee_len - 1); |
| 1911 | } | 1918 | } |
| 1912 | 1919 | ||
| 1913 | /* at present, extent can't cross block group: */ | 1920 | /* |
| 1914 | /* leaf + bitmap + group desc + sb + inode */ | 1921 | * 3 for leaf, sb, and inode plus 2 (bmap and group |
| 1915 | credits = 5; | 1922 | * descriptor) for each block group; assume two block |
| 1923 | * groups plus ex_ee_len/blocks_per_block_group for | ||
| 1924 | * the worst case | ||
| 1925 | */ | ||
| 1926 | credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb)); | ||
| 1916 | if (ex == EXT_FIRST_EXTENT(eh)) { | 1927 | if (ex == EXT_FIRST_EXTENT(eh)) { |
| 1917 | correct_index = 1; | 1928 | correct_index = 1; |
| 1918 | credits += (ext_depth(inode)) + 1; | 1929 | credits += (ext_depth(inode)) + 1; |
| 1919 | } | 1930 | } |
| 1920 | #ifdef CONFIG_QUOTA | ||
| 1921 | credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); | 1931 | credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); |
| 1922 | #endif | ||
| 1923 | 1932 | ||
| 1924 | err = ext4_ext_journal_restart(handle, credits); | 1933 | err = ext4_ext_journal_restart(handle, credits); |
| 1925 | if (err) | 1934 | if (err) |
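As a sanity check on the credit estimate introduced above, here is a worked example with illustrative values only (4 KiB blocks give 32768 blocks per group; the quota and index-depth additions from the surrounding code are left out):

#include <stdio.h>

int main(void)
{
	unsigned long blocks_per_group = 32768;	/* 8 * 4096 for 4 KiB blocks */
	unsigned long ex_ee_len = 100000;	/* hypothetical extent length in blocks */

	/* 3 (leaf + superblock + inode) plus 2 (bitmap + group descriptor) for
	 * each of the two block groups assumed as a base gives 7, then 2 more
	 * per additional group the extent may span. */
	unsigned long credits = 7 + 2 * (ex_ee_len / blocks_per_group);

	printf("credits = %lu\n", credits);	/* 7 + 2 * 3 = 13 */
	return 0;
}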
| @@ -2323,7 +2332,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, | |||
| 2323 | unsigned int newdepth; | 2332 | unsigned int newdepth; |
| 2324 | /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */ | 2333 | /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */ |
| 2325 | if (allocated <= EXT4_EXT_ZERO_LEN) { | 2334 | if (allocated <= EXT4_EXT_ZERO_LEN) { |
| 2326 | /* Mark first half uninitialized. | 2335 | /* |
| 2336 | * iblock == ee_block is handled by the zeroout | ||
| 2337 | * at the beginning. | ||
| 2338 | * Mark first half uninitialized. | ||
| 2327 | * Mark second half initialized and zero out the | 2339 | * Mark second half initialized and zero out the |
| 2328 | * initialized extent | 2340 | * initialized extent |
| 2329 | */ | 2341 | */ |
| @@ -2346,7 +2358,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, | |||
| 2346 | ex->ee_len = orig_ex.ee_len; | 2358 | ex->ee_len = orig_ex.ee_len; |
| 2347 | ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); | 2359 | ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); |
| 2348 | ext4_ext_dirty(handle, inode, path + depth); | 2360 | ext4_ext_dirty(handle, inode, path + depth); |
| 2349 | /* zeroed the full extent */ | 2361 | /* blocks available from iblock */ |
| 2350 | return allocated; | 2362 | return allocated; |
| 2351 | 2363 | ||
| 2352 | } else if (err) | 2364 | } else if (err) |
| @@ -2374,6 +2386,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, | |||
| 2374 | err = PTR_ERR(path); | 2386 | err = PTR_ERR(path); |
| 2375 | return err; | 2387 | return err; |
| 2376 | } | 2388 | } |
| 2389 | /* get the second half extent details */ | ||
| 2377 | ex = path[depth].p_ext; | 2390 | ex = path[depth].p_ext; |
| 2378 | err = ext4_ext_get_access(handle, inode, | 2391 | err = ext4_ext_get_access(handle, inode, |
| 2379 | path + depth); | 2392 | path + depth); |
| @@ -2403,6 +2416,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, | |||
| 2403 | ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); | 2416 | ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); |
| 2404 | ext4_ext_dirty(handle, inode, path + depth); | 2417 | ext4_ext_dirty(handle, inode, path + depth); |
| 2405 | /* zeroed the full extent */ | 2418 | /* zeroed the full extent */ |
| 2419 | /* blocks available from iblock */ | ||
| 2406 | return allocated; | 2420 | return allocated; |
| 2407 | 2421 | ||
| 2408 | } else if (err) | 2422 | } else if (err) |
| @@ -2418,23 +2432,22 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, | |||
| 2418 | */ | 2432 | */ |
| 2419 | orig_ex.ee_len = cpu_to_le16(ee_len - | 2433 | orig_ex.ee_len = cpu_to_le16(ee_len - |
| 2420 | ext4_ext_get_actual_len(ex3)); | 2434 | ext4_ext_get_actual_len(ex3)); |
| 2421 | if (newdepth != depth) { | 2435 | depth = newdepth; |
| 2422 | depth = newdepth; | 2436 | ext4_ext_drop_refs(path); |
| 2423 | ext4_ext_drop_refs(path); | 2437 | path = ext4_ext_find_extent(inode, iblock, path); |
| 2424 | path = ext4_ext_find_extent(inode, iblock, path); | 2438 | if (IS_ERR(path)) { |
| 2425 | if (IS_ERR(path)) { | 2439 | err = PTR_ERR(path); |
| 2426 | err = PTR_ERR(path); | 2440 | goto out; |
| 2427 | goto out; | ||
| 2428 | } | ||
| 2429 | eh = path[depth].p_hdr; | ||
| 2430 | ex = path[depth].p_ext; | ||
| 2431 | if (ex2 != &newex) | ||
| 2432 | ex2 = ex; | ||
| 2433 | |||
| 2434 | err = ext4_ext_get_access(handle, inode, path + depth); | ||
| 2435 | if (err) | ||
| 2436 | goto out; | ||
| 2437 | } | 2441 | } |
| 2442 | eh = path[depth].p_hdr; | ||
| 2443 | ex = path[depth].p_ext; | ||
| 2444 | if (ex2 != &newex) | ||
| 2445 | ex2 = ex; | ||
| 2446 | |||
| 2447 | err = ext4_ext_get_access(handle, inode, path + depth); | ||
| 2448 | if (err) | ||
| 2449 | goto out; | ||
| 2450 | |||
| 2438 | allocated = max_blocks; | 2451 | allocated = max_blocks; |
| 2439 | 2452 | ||
| 2440 | /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying | 2453 | /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying |
| @@ -2452,6 +2465,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, | |||
| 2452 | ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); | 2465 | ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); |
| 2453 | ext4_ext_dirty(handle, inode, path + depth); | 2466 | ext4_ext_dirty(handle, inode, path + depth); |
| 2454 | /* zero out the first half */ | 2467 | /* zero out the first half */ |
| 2468 | /* blocks available from iblock */ | ||
| 2455 | return allocated; | 2469 | return allocated; |
| 2456 | } | 2470 | } |
| 2457 | } | 2471 | } |
| @@ -2796,7 +2810,7 @@ void ext4_ext_truncate(struct inode *inode) | |||
| 2796 | /* | 2810 | /* |
| 2797 | * probably first extent we're gonna free will be last in block | 2811 | * probably first extent we're gonna free will be last in block |
| 2798 | */ | 2812 | */ |
| 2799 | err = ext4_writepage_trans_blocks(inode) + 3; | 2813 | err = ext4_writepage_trans_blocks(inode); |
| 2800 | handle = ext4_journal_start(inode, err); | 2814 | handle = ext4_journal_start(inode, err); |
| 2801 | if (IS_ERR(handle)) | 2815 | if (IS_ERR(handle)) |
| 2802 | return; | 2816 | return; |
| @@ -2810,7 +2824,7 @@ void ext4_ext_truncate(struct inode *inode) | |||
| 2810 | down_write(&EXT4_I(inode)->i_data_sem); | 2824 | down_write(&EXT4_I(inode)->i_data_sem); |
| 2811 | ext4_ext_invalidate_cache(inode); | 2825 | ext4_ext_invalidate_cache(inode); |
| 2812 | 2826 | ||
| 2813 | ext4_mb_discard_inode_preallocations(inode); | 2827 | ext4_discard_reservation(inode); |
| 2814 | 2828 | ||
| 2815 | /* | 2829 | /* |
| 2816 | * TODO: optimization is possible here. | 2830 | * TODO: optimization is possible here. |
| @@ -2849,27 +2863,6 @@ out_stop: | |||
| 2849 | ext4_journal_stop(handle); | 2863 | ext4_journal_stop(handle); |
| 2850 | } | 2864 | } |
| 2851 | 2865 | ||
| 2852 | /* | ||
| 2853 | * ext4_ext_writepage_trans_blocks: | ||
| 2854 | * calculate max number of blocks we could modify | ||
| 2855 | * in order to allocate new block for an inode | ||
| 2856 | */ | ||
| 2857 | int ext4_ext_writepage_trans_blocks(struct inode *inode, int num) | ||
| 2858 | { | ||
| 2859 | int needed; | ||
| 2860 | |||
| 2861 | needed = ext4_ext_calc_credits_for_insert(inode, NULL); | ||
| 2862 | |||
| 2863 | /* caller wants to allocate num blocks, but note it includes sb */ | ||
| 2864 | needed = needed * num - (num - 1); | ||
| 2865 | |||
| 2866 | #ifdef CONFIG_QUOTA | ||
| 2867 | needed += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); | ||
| 2868 | #endif | ||
| 2869 | |||
| 2870 | return needed; | ||
| 2871 | } | ||
| 2872 | |||
| 2873 | static void ext4_falloc_update_inode(struct inode *inode, | 2866 | static void ext4_falloc_update_inode(struct inode *inode, |
| 2874 | int mode, loff_t new_size, int update_ctime) | 2867 | int mode, loff_t new_size, int update_ctime) |
| 2875 | { | 2868 | { |
| @@ -2930,10 +2923,9 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) | |||
| 2930 | max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) | 2923 | max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) |
| 2931 | - block; | 2924 | - block; |
| 2932 | /* | 2925 | /* |
| 2933 | * credits to insert 1 extent into extent tree + buffers to be able to | 2926 | * credits to insert 1 extent into extent tree |
| 2934 | * modify 1 super block, 1 block bitmap and 1 group descriptor. | ||
| 2935 | */ | 2927 | */ |
| 2936 | credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3; | 2928 | credits = ext4_chunk_trans_blocks(inode, max_blocks); |
| 2937 | mutex_lock(&inode->i_mutex); | 2929 | mutex_lock(&inode->i_mutex); |
| 2938 | retry: | 2930 | retry: |
| 2939 | while (ret >= 0 && ret < max_blocks) { | 2931 | while (ret >= 0 && ret < max_blocks) { |
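One piece of arithmetic in the fallocate hunk above is easy to gloss over: the byte range is turned into a block range by rounding offset + len up to a block boundary and subtracting the starting block, and the per-chunk credit request is then sized from that count. A minimal userspace sketch of the conversion (the starting block is assumed to be offset >> blkbits, which is not visible in this hunk, and EXT4_BLOCK_ALIGN is modeled as a plain round-up):

#include <stdio.h>

/* Model of EXT4_BLOCK_ALIGN(bytes, blkbits) >> blkbits: round the byte
 * count up to the next block boundary and express it in blocks. */
static unsigned long long bytes_to_blocks_roundup(unsigned long long bytes,
                                                  unsigned blkbits)
{
        return (bytes + (1ULL << blkbits) - 1) >> blkbits;
}

int main(void)
{
        unsigned blkbits = 12;                        /* 4 KiB blocks */
        unsigned long long offset = 5000, len = 100000;
        unsigned long long block = offset >> blkbits; /* assumed start block */
        unsigned long long max_blocks =
                bytes_to_blocks_roundup(len + offset, blkbits) - block;

        printf("start block %llu, max_blocks %llu\n", block, max_blocks);
        return 0;
}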
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index a92eb305344f..f344834bbf58 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c | |||
| @@ -97,34 +97,44 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, | |||
| 97 | * Return buffer_head of bitmap on success or NULL. | 97 | * Return buffer_head of bitmap on success or NULL. |
| 98 | */ | 98 | */ |
| 99 | static struct buffer_head * | 99 | static struct buffer_head * |
| 100 | read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) | 100 | ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) |
| 101 | { | 101 | { |
| 102 | struct ext4_group_desc *desc; | 102 | struct ext4_group_desc *desc; |
| 103 | struct buffer_head *bh = NULL; | 103 | struct buffer_head *bh = NULL; |
| 104 | ext4_fsblk_t bitmap_blk; | ||
| 104 | 105 | ||
| 105 | desc = ext4_get_group_desc(sb, block_group, NULL); | 106 | desc = ext4_get_group_desc(sb, block_group, NULL); |
| 106 | if (!desc) | 107 | if (!desc) |
| 107 | goto error_out; | 108 | return NULL; |
| 108 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { | 109 | bitmap_blk = ext4_inode_bitmap(sb, desc); |
| 109 | bh = sb_getblk(sb, ext4_inode_bitmap(sb, desc)); | 110 | bh = sb_getblk(sb, bitmap_blk); |
| 110 | if (!buffer_uptodate(bh)) { | 111 | if (unlikely(!bh)) { |
| 111 | lock_buffer(bh); | 112 | ext4_error(sb, __func__, |
| 112 | if (!buffer_uptodate(bh)) { | 113 | "Cannot read inode bitmap - " |
| 113 | ext4_init_inode_bitmap(sb, bh, block_group, | 114 | "block_group = %lu, inode_bitmap = %llu", |
| 114 | desc); | 115 | block_group, bitmap_blk); |
| 115 | set_buffer_uptodate(bh); | 116 | return NULL; |
| 116 | } | ||
| 117 | unlock_buffer(bh); | ||
| 118 | } | ||
| 119 | } else { | ||
| 120 | bh = sb_bread(sb, ext4_inode_bitmap(sb, desc)); | ||
| 121 | } | 117 | } |
| 122 | if (!bh) | 118 | if (bh_uptodate_or_lock(bh)) |
| 123 | ext4_error(sb, "read_inode_bitmap", | 119 | return bh; |
| 120 | |||
| 121 | spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); | ||
| 122 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { | ||
| 123 | ext4_init_inode_bitmap(sb, bh, block_group, desc); | ||
| 124 | set_buffer_uptodate(bh); | ||
| 125 | unlock_buffer(bh); | ||
| 126 | spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); | ||
| 127 | return bh; | ||
| 128 | } | ||
| 129 | spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); | ||
| 130 | if (bh_submit_read(bh) < 0) { | ||
| 131 | put_bh(bh); | ||
| 132 | ext4_error(sb, __func__, | ||
| 124 | "Cannot read inode bitmap - " | 133 | "Cannot read inode bitmap - " |
| 125 | "block_group = %lu, inode_bitmap = %llu", | 134 | "block_group = %lu, inode_bitmap = %llu", |
| 126 | block_group, ext4_inode_bitmap(sb, desc)); | 135 | block_group, bitmap_blk); |
| 127 | error_out: | 136 | return NULL; |
| 137 | } | ||
| 128 | return bh; | 138 | return bh; |
| 129 | } | 139 | } |
| 130 | 140 | ||
| @@ -200,7 +210,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode) | |||
| 200 | } | 210 | } |
| 201 | block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); | 211 | block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); |
| 202 | bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); | 212 | bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); |
| 203 | bitmap_bh = read_inode_bitmap(sb, block_group); | 213 | bitmap_bh = ext4_read_inode_bitmap(sb, block_group); |
| 204 | if (!bitmap_bh) | 214 | if (!bitmap_bh) |
| 205 | goto error_return; | 215 | goto error_return; |
| 206 | 216 | ||
| @@ -341,7 +351,7 @@ find_close_to_parent: | |||
| 341 | goto found_flexbg; | 351 | goto found_flexbg; |
| 342 | } | 352 | } |
| 343 | 353 | ||
| 344 | if (best_flex < 0 || | 354 | if (flex_group[best_flex].free_inodes == 0 || |
| 345 | (flex_group[i].free_blocks > | 355 | (flex_group[i].free_blocks > |
| 346 | flex_group[best_flex].free_blocks && | 356 | flex_group[best_flex].free_blocks && |
| 347 | flex_group[i].free_inodes)) | 357 | flex_group[i].free_inodes)) |
| @@ -623,7 +633,7 @@ got_group: | |||
| 623 | goto fail; | 633 | goto fail; |
| 624 | 634 | ||
| 625 | brelse(bitmap_bh); | 635 | brelse(bitmap_bh); |
| 626 | bitmap_bh = read_inode_bitmap(sb, group); | 636 | bitmap_bh = ext4_read_inode_bitmap(sb, group); |
| 627 | if (!bitmap_bh) | 637 | if (!bitmap_bh) |
| 628 | goto fail; | 638 | goto fail; |
| 629 | 639 | ||
| @@ -728,7 +738,7 @@ got: | |||
| 728 | 738 | ||
| 729 | /* When marking the block group with | 739 | /* When marking the block group with |
| 730 | * ~EXT4_BG_INODE_UNINIT we don't want to depend | 740 | * ~EXT4_BG_INODE_UNINIT we don't want to depend |
| 731 | * on the value of bg_itable_unsed even though | 741 | * on the value of bg_itable_unused even though |
| 732 | * mke2fs could have initialized the same for us. | 742 | * mke2fs could have initialized the same for us. |
| 733 | * Instead we calculated the value below | 743 | * Instead we calculated the value below |
| 734 | */ | 744 | */ |
| @@ -891,7 +901,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) | |||
| 891 | 901 | ||
| 892 | block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); | 902 | block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); |
| 893 | bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); | 903 | bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); |
| 894 | bitmap_bh = read_inode_bitmap(sb, block_group); | 904 | bitmap_bh = ext4_read_inode_bitmap(sb, block_group); |
| 895 | if (!bitmap_bh) { | 905 | if (!bitmap_bh) { |
| 896 | ext4_warning(sb, __func__, | 906 | ext4_warning(sb, __func__, |
| 897 | "inode bitmap error for orphan %lu", ino); | 907 | "inode bitmap error for orphan %lu", ino); |
| @@ -969,7 +979,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb) | |||
| 969 | continue; | 979 | continue; |
| 970 | desc_count += le16_to_cpu(gdp->bg_free_inodes_count); | 980 | desc_count += le16_to_cpu(gdp->bg_free_inodes_count); |
| 971 | brelse(bitmap_bh); | 981 | brelse(bitmap_bh); |
| 972 | bitmap_bh = read_inode_bitmap(sb, i); | 982 | bitmap_bh = ext4_read_inode_bitmap(sb, i); |
| 973 | if (!bitmap_bh) | 983 | if (!bitmap_bh) |
| 974 | continue; | 984 | continue; |
| 975 | 985 | ||
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8ca2763df091..7e91913e325b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
| @@ -41,6 +41,8 @@ | |||
| 41 | #include "acl.h" | 41 | #include "acl.h" |
| 42 | #include "ext4_extents.h" | 42 | #include "ext4_extents.h" |
| 43 | 43 | ||
| 44 | #define MPAGE_DA_EXTENT_TAIL 0x01 | ||
| 45 | |||
| 44 | static inline int ext4_begin_ordered_truncate(struct inode *inode, | 46 | static inline int ext4_begin_ordered_truncate(struct inode *inode, |
| 45 | loff_t new_size) | 47 | loff_t new_size) |
| 46 | { | 48 | { |
| @@ -191,6 +193,7 @@ static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) | |||
| 191 | void ext4_delete_inode (struct inode * inode) | 193 | void ext4_delete_inode (struct inode * inode) |
| 192 | { | 194 | { |
| 193 | handle_t *handle; | 195 | handle_t *handle; |
| 196 | int err; | ||
| 194 | 197 | ||
| 195 | if (ext4_should_order_data(inode)) | 198 | if (ext4_should_order_data(inode)) |
| 196 | ext4_begin_ordered_truncate(inode, 0); | 199 | ext4_begin_ordered_truncate(inode, 0); |
| @@ -199,8 +202,9 @@ void ext4_delete_inode (struct inode * inode) | |||
| 199 | if (is_bad_inode(inode)) | 202 | if (is_bad_inode(inode)) |
| 200 | goto no_delete; | 203 | goto no_delete; |
| 201 | 204 | ||
| 202 | handle = start_transaction(inode); | 205 | handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3); |
| 203 | if (IS_ERR(handle)) { | 206 | if (IS_ERR(handle)) { |
| 207 | ext4_std_error(inode->i_sb, PTR_ERR(handle)); | ||
| 204 | /* | 208 | /* |
| 205 | * If we're going to skip the normal cleanup, we still need to | 209 | * If we're going to skip the normal cleanup, we still need to |
| 206 | * make sure that the in-core orphan linked list is properly | 210 | * make sure that the in-core orphan linked list is properly |
| @@ -213,8 +217,34 @@ void ext4_delete_inode (struct inode * inode) | |||
| 213 | if (IS_SYNC(inode)) | 217 | if (IS_SYNC(inode)) |
| 214 | handle->h_sync = 1; | 218 | handle->h_sync = 1; |
| 215 | inode->i_size = 0; | 219 | inode->i_size = 0; |
| 220 | err = ext4_mark_inode_dirty(handle, inode); | ||
| 221 | if (err) { | ||
| 222 | ext4_warning(inode->i_sb, __func__, | ||
| 223 | "couldn't mark inode dirty (err %d)", err); | ||
| 224 | goto stop_handle; | ||
| 225 | } | ||
| 216 | if (inode->i_blocks) | 226 | if (inode->i_blocks) |
| 217 | ext4_truncate(inode); | 227 | ext4_truncate(inode); |
| 228 | |||
| 229 | /* | ||
| 230 | * ext4_ext_truncate() doesn't reserve any slop when it | ||
| 231 | * restarts journal transactions; therefore there may not be | ||
| 232 | * enough credits left in the handle to remove the inode from | ||
| 233 | * the orphan list and set the dtime field. | ||
| 234 | */ | ||
| 235 | if (handle->h_buffer_credits < 3) { | ||
| 236 | err = ext4_journal_extend(handle, 3); | ||
| 237 | if (err > 0) | ||
| 238 | err = ext4_journal_restart(handle, 3); | ||
| 239 | if (err != 0) { | ||
| 240 | ext4_warning(inode->i_sb, __func__, | ||
| 241 | "couldn't extend journal (err %d)", err); | ||
| 242 | stop_handle: | ||
| 243 | ext4_journal_stop(handle); | ||
| 244 | goto no_delete; | ||
| 245 | } | ||
| 246 | } | ||
| 247 | |||
| 218 | /* | 248 | /* |
| 219 | * Kill off the orphan record which ext4_truncate created. | 249 | * Kill off the orphan record which ext4_truncate created. |
| 220 | * AKPM: I think this can be inside the above `if'. | 250 | * AKPM: I think this can be inside the above `if'. |
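The delete-inode hunk above relies on a small but important pattern: before removing the inode from the orphan list it checks that at least three journal credits remain, tries a cheap ext4_journal_extend() first, and falls back to ext4_journal_restart() only if the transaction cannot grow. A userspace model of that decision with the jbd2 calls stubbed out (the stubs always succeed; the real functions can fail):

#include <stdio.h>

/* Stub journal state: a handle with some buffer credits left. */
struct handle { int h_buffer_credits; };

/* Stand-ins for ext4_journal_extend()/ext4_journal_restart(): 0 on success;
 * extend returns >0 in the kernel when the transaction cannot grow further. */
static int journal_extend(struct handle *h, int nblocks) { h->h_buffer_credits += nblocks; return 0; }
static int journal_restart(struct handle *h, int nblocks) { h->h_buffer_credits = nblocks; return 0; }

/* Mirror of the logic in the hunk: make sure at least `needed` credits
 * remain, preferring an in-place extend over a full restart. */
static int ensure_credits(struct handle *h, int needed)
{
        int err;

        if (h->h_buffer_credits >= needed)
                return 0;
        err = journal_extend(h, needed);
        if (err > 0)                       /* could not extend in place */
                err = journal_restart(h, needed);
        return err;                        /* non-zero: stop the handle and bail out */
}

int main(void)
{
        struct handle h = { .h_buffer_credits = 1 };

        printf("ensure_credits -> %d, credits now %d\n",
               ensure_credits(&h, 3), h.h_buffer_credits);
        return 0;
}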
| @@ -952,23 +982,74 @@ out: | |||
| 952 | return err; | 982 | return err; |
| 953 | } | 983 | } |
| 954 | 984 | ||
| 955 | /* Maximum number of blocks we map for direct IO at once. */ | ||
| 956 | #define DIO_MAX_BLOCKS 4096 | ||
| 957 | /* | 985 | /* |
| 958 | * Number of credits we need for writing DIO_MAX_BLOCKS: | 986 | * Calculate the number of metadata blocks needed to reserve |
| 959 | * We need sb + group descriptor + bitmap + inode -> 4 | 987 | * to allocate @blocks for a non-extent-based file |
| 960 | * For B blocks with A block pointers per block we need: | ||
| 961 | * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect). | ||
| 962 | * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25. | ||
| 963 | */ | 988 | */ |
| 964 | #define DIO_CREDITS 25 | 989 | static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) |
| 990 | { | ||
| 991 | int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb); | ||
| 992 | int ind_blks, dind_blks, tind_blks; | ||
| 993 | |||
| 994 | /* number of new indirect blocks needed */ | ||
| 995 | ind_blks = (blocks + icap - 1) / icap; | ||
| 996 | |||
| 997 | dind_blks = (ind_blks + icap - 1) / icap; | ||
| 965 | 998 | ||
| 999 | tind_blks = 1; | ||
| 1000 | |||
| 1001 | return ind_blks + dind_blks + tind_blks; | ||
| 1002 | } | ||
| 966 | 1003 | ||
| 967 | /* | 1004 | /* |
| 1005 | * Calculate the number of metadata blocks needed to reserve ||
| 1006 | * to allocate a given number of blocks ||
| 1007 | */ | ||
| 1008 | static int ext4_calc_metadata_amount(struct inode *inode, int blocks) | ||
| 1009 | { | ||
| 1010 | if (!blocks) | ||
| 1011 | return 0; | ||
| 1012 | |||
| 1013 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) | ||
| 1014 | return ext4_ext_calc_metadata_amount(inode, blocks); | ||
| 1015 | |||
| 1016 | return ext4_indirect_calc_metadata_amount(inode, blocks); | ||
| 1017 | } | ||
| 1018 | |||
| 1019 | static void ext4_da_update_reserve_space(struct inode *inode, int used) | ||
| 1020 | { | ||
| 1021 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
| 1022 | int total, mdb, mdb_free; | ||
| 1023 | |||
| 1024 | spin_lock(&EXT4_I(inode)->i_block_reservation_lock); | ||
| 1025 | /* recalculate the number of metablocks still need to be reserved */ | ||
| 1026 | total = EXT4_I(inode)->i_reserved_data_blocks - used; | ||
| 1027 | mdb = ext4_calc_metadata_amount(inode, total); | ||
| 1028 | |||
| 1029 | /* figure out how many metablocks to release */ | ||
| 1030 | BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); | ||
| 1031 | mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; | ||
| 1032 | |||
| 1033 | /* Account for allocated meta_blocks */ | ||
| 1034 | mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; | ||
| 1035 | |||
| 1036 | /* update fs free blocks counter for truncate case */ | ||
| 1037 | percpu_counter_add(&sbi->s_freeblocks_counter, mdb_free); | ||
| 1038 | |||
| 1039 | /* update per-inode reservations */ | ||
| 1040 | BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks); | ||
| 1041 | EXT4_I(inode)->i_reserved_data_blocks -= used; | ||
| 1042 | |||
| 1043 | BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); | ||
| 1044 | EXT4_I(inode)->i_reserved_meta_blocks = mdb; | ||
| 1045 | EXT4_I(inode)->i_allocated_meta_blocks = 0; | ||
| 1046 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | ||
| 1047 | } | ||
| 1048 | |||
| 1049 | /* | ||
| 1050 | * The ext4_get_blocks_wrap() function try to look up the requested blocks, | ||
| 1051 | * and returns if the blocks are already mapped. | ||
| 968 | * | 1052 | * |
| 969 | * | ||
| 970 | * ext4_ext4 get_block() wrapper function | ||
| 971 | * It will do a look up first, and returns if the blocks already mapped. | ||
| 972 | * Otherwise it takes the write lock of the i_data_sem and allocate blocks | 1053 | * Otherwise it takes the write lock of the i_data_sem and allocate blocks |
| 973 | * and store the allocated blocks in the result buffer head and mark it | 1054 | * and store the allocated blocks in the result buffer head and mark it |
| 974 | * mapped. | 1055 | * mapped. |
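ext4_indirect_calc_metadata_amount() in the hunk above is plain ceiling arithmetic: one indirect block per EXT4_ADDR_PER_BLOCK data blocks, the same ratio again for double-indirect blocks, plus a single triple-indirect block. A runnable restatement with worked numbers (1024 block addresses per block corresponds to a 4 KiB block size with 32-bit block numbers in indirect blocks):

#include <stdio.h>

/* Same arithmetic as ext4_indirect_calc_metadata_amount() in the hunk,
 * with the per-block address count passed in explicitly. */
static int indirect_metadata_amount(int blocks, int icap)
{
        int ind_blks  = (blocks + icap - 1) / icap;   /* indirect blocks     */
        int dind_blks = (ind_blks + icap - 1) / icap; /* double indirect     */
        int tind_blks = 1;                            /* one triple indirect */

        return ind_blks + dind_blks + tind_blks;
}

int main(void)
{
        /* 100 data blocks on a 4 KiB-block fs: 1 + 1 + 1 = 3 metadata blocks */
        printf("%d\n", indirect_metadata_amount(100, 1024));
        /* 2000 data blocks: 2 + 1 + 1 = 4 */
        printf("%d\n", indirect_metadata_amount(2000, 1024));
        return 0;
}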
| @@ -1069,26 +1150,30 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, | |||
| 1069 | * which were deferred till now | 1150 | * which were deferred till now |
| 1070 | */ | 1151 | */ |
| 1071 | if ((retval > 0) && buffer_delay(bh)) | 1152 | if ((retval > 0) && buffer_delay(bh)) |
| 1072 | ext4_da_release_space(inode, retval, 0); | 1153 | ext4_da_update_reserve_space(inode, retval); |
| 1073 | } | 1154 | } |
| 1074 | 1155 | ||
| 1075 | up_write((&EXT4_I(inode)->i_data_sem)); | 1156 | up_write((&EXT4_I(inode)->i_data_sem)); |
| 1076 | return retval; | 1157 | return retval; |
| 1077 | } | 1158 | } |
| 1078 | 1159 | ||
| 1160 | /* Maximum number of blocks we map for direct IO at once. */ | ||
| 1161 | #define DIO_MAX_BLOCKS 4096 | ||
| 1162 | |||
| 1079 | static int ext4_get_block(struct inode *inode, sector_t iblock, | 1163 | static int ext4_get_block(struct inode *inode, sector_t iblock, |
| 1080 | struct buffer_head *bh_result, int create) | 1164 | struct buffer_head *bh_result, int create) |
| 1081 | { | 1165 | { |
| 1082 | handle_t *handle = ext4_journal_current_handle(); | 1166 | handle_t *handle = ext4_journal_current_handle(); |
| 1083 | int ret = 0, started = 0; | 1167 | int ret = 0, started = 0; |
| 1084 | unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; | 1168 | unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; |
| 1169 | int dio_credits; | ||
| 1085 | 1170 | ||
| 1086 | if (create && !handle) { | 1171 | if (create && !handle) { |
| 1087 | /* Direct IO write... */ | 1172 | /* Direct IO write... */ |
| 1088 | if (max_blocks > DIO_MAX_BLOCKS) | 1173 | if (max_blocks > DIO_MAX_BLOCKS) |
| 1089 | max_blocks = DIO_MAX_BLOCKS; | 1174 | max_blocks = DIO_MAX_BLOCKS; |
| 1090 | handle = ext4_journal_start(inode, DIO_CREDITS + | 1175 | dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); |
| 1091 | 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb)); | 1176 | handle = ext4_journal_start(inode, dio_credits); |
| 1092 | if (IS_ERR(handle)) { | 1177 | if (IS_ERR(handle)) { |
| 1093 | ret = PTR_ERR(handle); | 1178 | ret = PTR_ERR(handle); |
| 1094 | goto out; | 1179 | goto out; |
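In the direct-IO hunk above the fixed DIO_CREDITS constant disappears: a handle is started only for a direct-IO write that arrives without a running handle, and the number of blocks mapped per call is capped at DIO_MAX_BLOCKS before the credits are requested. A small sketch of just that decision (ext4_chunk_trans_blocks() itself is not shown in this hunk and is left out):

#include <stdbool.h>
#include <stdio.h>

#define DIO_MAX_BLOCKS 4096   /* same per-call mapping cap as in the hunk */

/* Model of the decision at the top of the get_block path: a transaction is
 * only started for a direct-IO write without a running handle, and the
 * request is capped so one handle never covers too many blocks. */
static unsigned dio_blocks_this_call(bool create, bool have_handle,
                                     unsigned max_blocks, bool *start_handle)
{
        *start_handle = create && !have_handle;
        if (*start_handle && max_blocks > DIO_MAX_BLOCKS)
                max_blocks = DIO_MAX_BLOCKS;
        return max_blocks;
}

int main(void)
{
        bool start;
        unsigned n = dio_blocks_this_call(true, false, 10000, &start);

        printf("start handle: %d, blocks mapped this call: %u\n", start, n);
        return 0;
}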
| @@ -1336,12 +1421,8 @@ static int ext4_ordered_write_end(struct file *file, | |||
| 1336 | { | 1421 | { |
| 1337 | handle_t *handle = ext4_journal_current_handle(); | 1422 | handle_t *handle = ext4_journal_current_handle(); |
| 1338 | struct inode *inode = mapping->host; | 1423 | struct inode *inode = mapping->host; |
| 1339 | unsigned from, to; | ||
| 1340 | int ret = 0, ret2; | 1424 | int ret = 0, ret2; |
| 1341 | 1425 | ||
| 1342 | from = pos & (PAGE_CACHE_SIZE - 1); | ||
| 1343 | to = from + len; | ||
| 1344 | |||
| 1345 | ret = ext4_jbd2_file_inode(handle, inode); | 1426 | ret = ext4_jbd2_file_inode(handle, inode); |
| 1346 | 1427 | ||
| 1347 | if (ret == 0) { | 1428 | if (ret == 0) { |
| @@ -1437,36 +1518,6 @@ static int ext4_journalled_write_end(struct file *file, | |||
| 1437 | 1518 | ||
| 1438 | return ret ? ret : copied; | 1519 | return ret ? ret : copied; |
| 1439 | } | 1520 | } |
| 1440 | /* | ||
| 1441 | * Calculate the number of metadata blocks need to reserve | ||
| 1442 | * to allocate @blocks for non extent file based file | ||
| 1443 | */ | ||
| 1444 | static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) | ||
| 1445 | { | ||
| 1446 | int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb); | ||
| 1447 | int ind_blks, dind_blks, tind_blks; | ||
| 1448 | |||
| 1449 | /* number of new indirect blocks needed */ | ||
| 1450 | ind_blks = (blocks + icap - 1) / icap; | ||
| 1451 | |||
| 1452 | dind_blks = (ind_blks + icap - 1) / icap; | ||
| 1453 | |||
| 1454 | tind_blks = 1; | ||
| 1455 | |||
| 1456 | return ind_blks + dind_blks + tind_blks; | ||
| 1457 | } | ||
| 1458 | |||
| 1459 | /* | ||
| 1460 | * Calculate the number of metadata blocks need to reserve | ||
| 1461 | * to allocate given number of blocks | ||
| 1462 | */ | ||
| 1463 | static int ext4_calc_metadata_amount(struct inode *inode, int blocks) | ||
| 1464 | { | ||
| 1465 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) | ||
| 1466 | return ext4_ext_calc_metadata_amount(inode, blocks); | ||
| 1467 | |||
| 1468 | return ext4_indirect_calc_metadata_amount(inode, blocks); | ||
| 1469 | } | ||
| 1470 | 1521 | ||
| 1471 | static int ext4_da_reserve_space(struct inode *inode, int nrblocks) | 1522 | static int ext4_da_reserve_space(struct inode *inode, int nrblocks) |
| 1472 | { | 1523 | { |
| @@ -1490,7 +1541,6 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks) | |||
| 1490 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | 1541 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); |
| 1491 | return -ENOSPC; | 1542 | return -ENOSPC; |
| 1492 | } | 1543 | } |
| 1493 | |||
| 1494 | /* reduce fs free blocks counter */ | 1544 | /* reduce fs free blocks counter */ |
| 1495 | percpu_counter_sub(&sbi->s_freeblocks_counter, total); | 1545 | percpu_counter_sub(&sbi->s_freeblocks_counter, total); |
| 1496 | 1546 | ||
| @@ -1501,35 +1551,49 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks) | |||
| 1501 | return 0; /* success */ | 1551 | return 0; /* success */ |
| 1502 | } | 1552 | } |
| 1503 | 1553 | ||
| 1504 | void ext4_da_release_space(struct inode *inode, int used, int to_free) | 1554 | static void ext4_da_release_space(struct inode *inode, int to_free) |
| 1505 | { | 1555 | { |
| 1506 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 1556 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
| 1507 | int total, mdb, mdb_free, release; | 1557 | int total, mdb, mdb_free, release; |
| 1508 | 1558 | ||
| 1559 | if (!to_free) | ||
| 1560 | return; /* Nothing to release, exit */ | ||
| 1561 | |||
| 1509 | spin_lock(&EXT4_I(inode)->i_block_reservation_lock); | 1562 | spin_lock(&EXT4_I(inode)->i_block_reservation_lock); |
| 1563 | |||
| 1564 | if (!EXT4_I(inode)->i_reserved_data_blocks) { | ||
| 1565 | /* | ||
| 1566 | * if there are no reserved blocks, but we try to free some ||
| 1567 | * then the counter is messed up somewhere. | ||
| 1568 | * but since this function is called from invalidate | ||
| 1569 | * page, it's harmless to return without any action | ||
| 1570 | */ | ||
| 1571 | printk(KERN_INFO "ext4 delalloc try to release %d reserved " | ||
| 1572 | "blocks for inode %lu, but there is no reserved " | ||
| 1573 | "data blocks\n", to_free, inode->i_ino); | ||
| 1574 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | ||
| 1575 | return; | ||
| 1576 | } | ||
| 1577 | |||
| 1510 | /* recalculate the number of metablocks still need to be reserved */ | 1578 | /* recalculate the number of metablocks still need to be reserved */ |
| 1511 | total = EXT4_I(inode)->i_reserved_data_blocks - used - to_free; | 1579 | total = EXT4_I(inode)->i_reserved_data_blocks - to_free; |
| 1512 | mdb = ext4_calc_metadata_amount(inode, total); | 1580 | mdb = ext4_calc_metadata_amount(inode, total); |
| 1513 | 1581 | ||
| 1514 | /* figure out how many metablocks to release */ | 1582 | /* figure out how many metablocks to release */ |
| 1515 | BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); | 1583 | BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); |
| 1516 | mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; | 1584 | mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; |
| 1517 | 1585 | ||
| 1518 | /* Account for allocated meta_blocks */ | ||
| 1519 | mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; | ||
| 1520 | |||
| 1521 | release = to_free + mdb_free; | 1586 | release = to_free + mdb_free; |
| 1522 | 1587 | ||
| 1523 | /* update fs free blocks counter for truncate case */ | 1588 | /* update fs free blocks counter for truncate case */ |
| 1524 | percpu_counter_add(&sbi->s_freeblocks_counter, release); | 1589 | percpu_counter_add(&sbi->s_freeblocks_counter, release); |
| 1525 | 1590 | ||
| 1526 | /* update per-inode reservations */ | 1591 | /* update per-inode reservations */ |
| 1527 | BUG_ON(used + to_free > EXT4_I(inode)->i_reserved_data_blocks); | 1592 | BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks); |
| 1528 | EXT4_I(inode)->i_reserved_data_blocks -= (used + to_free); | 1593 | EXT4_I(inode)->i_reserved_data_blocks -= to_free; |
| 1529 | 1594 | ||
| 1530 | BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); | 1595 | BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); |
| 1531 | EXT4_I(inode)->i_reserved_meta_blocks = mdb; | 1596 | EXT4_I(inode)->i_reserved_meta_blocks = mdb; |
| 1532 | EXT4_I(inode)->i_allocated_meta_blocks = 0; | ||
| 1533 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | 1597 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); |
| 1534 | } | 1598 | } |
| 1535 | 1599 | ||
| @@ -1551,7 +1615,7 @@ static void ext4_da_page_release_reservation(struct page *page, | |||
| 1551 | } | 1615 | } |
| 1552 | curr_off = next_off; | 1616 | curr_off = next_off; |
| 1553 | } while ((bh = bh->b_this_page) != head); | 1617 | } while ((bh = bh->b_this_page) != head); |
| 1554 | ext4_da_release_space(page->mapping->host, 0, to_release); | 1618 | ext4_da_release_space(page->mapping->host, to_release); |
| 1555 | } | 1619 | } |
| 1556 | 1620 | ||
| 1557 | /* | 1621 | /* |
| @@ -1564,11 +1628,13 @@ struct mpage_da_data { | |||
| 1564 | unsigned long first_page, next_page; /* extent of pages */ | 1628 | unsigned long first_page, next_page; /* extent of pages */ |
| 1565 | get_block_t *get_block; | 1629 | get_block_t *get_block; |
| 1566 | struct writeback_control *wbc; | 1630 | struct writeback_control *wbc; |
| 1631 | int io_done; | ||
| 1632 | long pages_written; | ||
| 1567 | }; | 1633 | }; |
| 1568 | 1634 | ||
| 1569 | /* | 1635 | /* |
| 1570 | * mpage_da_submit_io - walks through extent of pages and try to write | 1636 | * mpage_da_submit_io - walks through extent of pages and try to write |
| 1571 | * them with __mpage_writepage() | 1637 | * them with writepage() call back |
| 1572 | * | 1638 | * |
| 1573 | * @mpd->inode: inode | 1639 | * @mpd->inode: inode |
| 1574 | * @mpd->first_page: first page of the extent | 1640 | * @mpd->first_page: first page of the extent |
| @@ -1583,18 +1649,11 @@ struct mpage_da_data { | |||
| 1583 | static int mpage_da_submit_io(struct mpage_da_data *mpd) | 1649 | static int mpage_da_submit_io(struct mpage_da_data *mpd) |
| 1584 | { | 1650 | { |
| 1585 | struct address_space *mapping = mpd->inode->i_mapping; | 1651 | struct address_space *mapping = mpd->inode->i_mapping; |
| 1586 | struct mpage_data mpd_pp = { | ||
| 1587 | .bio = NULL, | ||
| 1588 | .last_block_in_bio = 0, | ||
| 1589 | .get_block = mpd->get_block, | ||
| 1590 | .use_writepage = 1, | ||
| 1591 | }; | ||
| 1592 | int ret = 0, err, nr_pages, i; | 1652 | int ret = 0, err, nr_pages, i; |
| 1593 | unsigned long index, end; | 1653 | unsigned long index, end; |
| 1594 | struct pagevec pvec; | 1654 | struct pagevec pvec; |
| 1595 | 1655 | ||
| 1596 | BUG_ON(mpd->next_page <= mpd->first_page); | 1656 | BUG_ON(mpd->next_page <= mpd->first_page); |
| 1597 | |||
| 1598 | pagevec_init(&pvec, 0); | 1657 | pagevec_init(&pvec, 0); |
| 1599 | index = mpd->first_page; | 1658 | index = mpd->first_page; |
| 1600 | end = mpd->next_page - 1; | 1659 | end = mpd->next_page - 1; |
| @@ -1612,8 +1671,9 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) | |||
| 1612 | break; | 1671 | break; |
| 1613 | index++; | 1672 | index++; |
| 1614 | 1673 | ||
| 1615 | err = __mpage_writepage(page, mpd->wbc, &mpd_pp); | 1674 | err = mapping->a_ops->writepage(page, mpd->wbc); |
| 1616 | 1675 | if (!err) | |
| 1676 | mpd->pages_written++; | ||
| 1617 | /* | 1677 | /* |
| 1618 | * In error case, we have to continue because | 1678 | * In error case, we have to continue because |
| 1619 | * remaining pages are still locked | 1679 | * remaining pages are still locked |
| @@ -1624,9 +1684,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) | |||
| 1624 | } | 1684 | } |
| 1625 | pagevec_release(&pvec); | 1685 | pagevec_release(&pvec); |
| 1626 | } | 1686 | } |
| 1627 | if (mpd_pp.bio) | ||
| 1628 | mpage_bio_submit(WRITE, mpd_pp.bio); | ||
| 1629 | |||
| 1630 | return ret; | 1687 | return ret; |
| 1631 | } | 1688 | } |
| 1632 | 1689 | ||
| @@ -1649,7 +1706,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, | |||
| 1649 | int blocks = exbh->b_size >> inode->i_blkbits; | 1706 | int blocks = exbh->b_size >> inode->i_blkbits; |
| 1650 | sector_t pblock = exbh->b_blocknr, cur_logical; | 1707 | sector_t pblock = exbh->b_blocknr, cur_logical; |
| 1651 | struct buffer_head *head, *bh; | 1708 | struct buffer_head *head, *bh; |
| 1652 | unsigned long index, end; | 1709 | pgoff_t index, end; |
| 1653 | struct pagevec pvec; | 1710 | struct pagevec pvec; |
| 1654 | int nr_pages, i; | 1711 | int nr_pages, i; |
| 1655 | 1712 | ||
| @@ -1692,6 +1749,13 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, | |||
| 1692 | if (buffer_delay(bh)) { | 1749 | if (buffer_delay(bh)) { |
| 1693 | bh->b_blocknr = pblock; | 1750 | bh->b_blocknr = pblock; |
| 1694 | clear_buffer_delay(bh); | 1751 | clear_buffer_delay(bh); |
| 1752 | bh->b_bdev = inode->i_sb->s_bdev; | ||
| 1753 | } else if (buffer_unwritten(bh)) { | ||
| 1754 | bh->b_blocknr = pblock; | ||
| 1755 | clear_buffer_unwritten(bh); | ||
| 1756 | set_buffer_mapped(bh); | ||
| 1757 | set_buffer_new(bh); | ||
| 1758 | bh->b_bdev = inode->i_sb->s_bdev; | ||
| 1695 | } else if (buffer_mapped(bh)) | 1759 | } else if (buffer_mapped(bh)) |
| 1696 | BUG_ON(bh->b_blocknr != pblock); | 1760 | BUG_ON(bh->b_blocknr != pblock); |
| 1697 | 1761 | ||
| @@ -1727,13 +1791,11 @@ static inline void __unmap_underlying_blocks(struct inode *inode, | |||
| 1727 | * | 1791 | * |
| 1728 | * The function skips space we know is already mapped to disk blocks. | 1792 | * The function skips space we know is already mapped to disk blocks. |
| 1729 | * | 1793 | * |
| 1730 | * The function ignores errors ->get_block() returns, thus real | ||
| 1731 | * error handling is postponed to __mpage_writepage() | ||
| 1732 | */ | 1794 | */ |
| 1733 | static void mpage_da_map_blocks(struct mpage_da_data *mpd) | 1795 | static void mpage_da_map_blocks(struct mpage_da_data *mpd) |
| 1734 | { | 1796 | { |
| 1797 | int err = 0; | ||
| 1735 | struct buffer_head *lbh = &mpd->lbh; | 1798 | struct buffer_head *lbh = &mpd->lbh; |
| 1736 | int err = 0, remain = lbh->b_size; | ||
| 1737 | sector_t next = lbh->b_blocknr; | 1799 | sector_t next = lbh->b_blocknr; |
| 1738 | struct buffer_head new; | 1800 | struct buffer_head new; |
| 1739 | 1801 | ||
| @@ -1743,38 +1805,36 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd) | |||
| 1743 | if (buffer_mapped(lbh) && !buffer_delay(lbh)) | 1805 | if (buffer_mapped(lbh) && !buffer_delay(lbh)) |
| 1744 | return; | 1806 | return; |
| 1745 | 1807 | ||
| 1746 | while (remain) { | 1808 | new.b_state = lbh->b_state; |
| 1747 | new.b_state = lbh->b_state; | 1809 | new.b_blocknr = 0; |
| 1748 | new.b_blocknr = 0; | 1810 | new.b_size = lbh->b_size; |
| 1749 | new.b_size = remain; | ||
| 1750 | err = mpd->get_block(mpd->inode, next, &new, 1); | ||
| 1751 | if (err) { | ||
| 1752 | /* | ||
| 1753 | * Rather than implement own error handling | ||
| 1754 | * here, we just leave remaining blocks | ||
| 1755 | * unallocated and try again with ->writepage() | ||
| 1756 | */ | ||
| 1757 | break; | ||
| 1758 | } | ||
| 1759 | BUG_ON(new.b_size == 0); | ||
| 1760 | 1811 | ||
| 1761 | if (buffer_new(&new)) | 1812 | /* |
| 1762 | __unmap_underlying_blocks(mpd->inode, &new); | 1813 | * If we didn't accumulate anything |
| 1814 | * to write simply return | ||
| 1815 | */ | ||
| 1816 | if (!new.b_size) | ||
| 1817 | return; | ||
| 1818 | err = mpd->get_block(mpd->inode, next, &new, 1); | ||
| 1819 | if (err) | ||
| 1820 | return; | ||
| 1821 | BUG_ON(new.b_size == 0); | ||
| 1763 | 1822 | ||
| 1764 | /* | 1823 | if (buffer_new(&new)) |
| 1765 | * If blocks are delayed marked, we need to | 1824 | __unmap_underlying_blocks(mpd->inode, &new); |
| 1766 | * put actual blocknr and drop delayed bit | ||
| 1767 | */ | ||
| 1768 | if (buffer_delay(lbh)) | ||
| 1769 | mpage_put_bnr_to_bhs(mpd, next, &new); | ||
| 1770 | 1825 | ||
| 1771 | /* go for the remaining blocks */ | 1826 | /* |
| 1772 | next += new.b_size >> mpd->inode->i_blkbits; | 1827 | * If blocks are delayed marked, we need to |
| 1773 | remain -= new.b_size; | 1828 | * put actual blocknr and drop delayed bit |
| 1774 | } | 1829 | */ |
| 1830 | if (buffer_delay(lbh) || buffer_unwritten(lbh)) | ||
| 1831 | mpage_put_bnr_to_bhs(mpd, next, &new); | ||
| 1832 | |||
| 1833 | return; | ||
| 1775 | } | 1834 | } |
| 1776 | 1835 | ||
| 1777 | #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay)) | 1836 | #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ |
| 1837 | (1 << BH_Delay) | (1 << BH_Unwritten)) | ||
| 1778 | 1838 | ||
| 1779 | /* | 1839 | /* |
| 1780 | * mpage_add_bh_to_extent - try to add one more block to extent of blocks | 1840 | * mpage_add_bh_to_extent - try to add one more block to extent of blocks |
| @@ -1788,41 +1848,61 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd) | |||
| 1788 | static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, | 1848 | static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, |
| 1789 | sector_t logical, struct buffer_head *bh) | 1849 | sector_t logical, struct buffer_head *bh) |
| 1790 | { | 1850 | { |
| 1791 | struct buffer_head *lbh = &mpd->lbh; | ||
| 1792 | sector_t next; | 1851 | sector_t next; |
| 1852 | size_t b_size = bh->b_size; | ||
| 1853 | struct buffer_head *lbh = &mpd->lbh; | ||
| 1854 | int nrblocks = lbh->b_size >> mpd->inode->i_blkbits; | ||
| 1793 | 1855 | ||
| 1794 | next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits); | 1856 | /* check if the reserved journal credits might overflow */ |
| 1795 | 1857 | if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { | |
| 1858 | if (nrblocks >= EXT4_MAX_TRANS_DATA) { | ||
| 1859 | /* | ||
| 1860 | * With non-extent format we are limited by the journal | ||
| 1861 | * credit available. Total credit needed to insert | ||
| 1862 | * nrblocks contiguous blocks is dependent on the | ||
| 1863 | * nrblocks. So limit nrblocks. | ||
| 1864 | */ | ||
| 1865 | goto flush_it; | ||
| 1866 | } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > | ||
| 1867 | EXT4_MAX_TRANS_DATA) { | ||
| 1868 | /* | ||
| 1869 | * Adding the new buffer_head would make it cross the | ||
| 1870 | * allowed limit for which we have journal credit | ||
| 1871 | * reserved. So limit the new bh->b_size | ||
| 1872 | */ | ||
| 1873 | b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << | ||
| 1874 | mpd->inode->i_blkbits; | ||
| 1875 | /* we will do mpage_da_submit_io in the next loop */ | ||
| 1876 | } | ||
| 1877 | } | ||
| 1796 | /* | 1878 | /* |
| 1797 | * First block in the extent | 1879 | * First block in the extent |
| 1798 | */ | 1880 | */ |
| 1799 | if (lbh->b_size == 0) { | 1881 | if (lbh->b_size == 0) { |
| 1800 | lbh->b_blocknr = logical; | 1882 | lbh->b_blocknr = logical; |
| 1801 | lbh->b_size = bh->b_size; | 1883 | lbh->b_size = b_size; |
| 1802 | lbh->b_state = bh->b_state & BH_FLAGS; | 1884 | lbh->b_state = bh->b_state & BH_FLAGS; |
| 1803 | return; | 1885 | return; |
| 1804 | } | 1886 | } |
| 1805 | 1887 | ||
| 1888 | next = lbh->b_blocknr + nrblocks; | ||
| 1806 | /* | 1889 | /* |
| 1807 | * Can we merge the block to our big extent? | 1890 | * Can we merge the block to our big extent? |
| 1808 | */ | 1891 | */ |
| 1809 | if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { | 1892 | if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { |
| 1810 | lbh->b_size += bh->b_size; | 1893 | lbh->b_size += b_size; |
| 1811 | return; | 1894 | return; |
| 1812 | } | 1895 | } |
| 1813 | 1896 | ||
| 1897 | flush_it: | ||
| 1814 | /* | 1898 | /* |
| 1815 | * We couldn't merge the block to our extent, so we | 1899 | * We couldn't merge the block to our extent, so we |
| 1816 | * need to flush current extent and start new one | 1900 | * need to flush current extent and start new one |
| 1817 | */ | 1901 | */ |
| 1818 | mpage_da_map_blocks(mpd); | 1902 | mpage_da_map_blocks(mpd); |
| 1819 | 1903 | mpage_da_submit_io(mpd); | |
| 1820 | /* | 1904 | mpd->io_done = 1; |
| 1821 | * Now start a new extent | 1905 | return; |
| 1822 | */ | ||
| 1823 | lbh->b_size = bh->b_size; | ||
| 1824 | lbh->b_state = bh->b_state & BH_FLAGS; | ||
| 1825 | lbh->b_blocknr = logical; | ||
| 1826 | } | 1906 | } |
| 1827 | 1907 | ||
| 1828 | /* | 1908 | /* |
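For non-extent files, mpage_add_bh_to_extent() in the hunk above caps how far the in-progress extent may grow so it never outruns the journal credits reserved for one transaction. A sketch of just that sizing decision; the hunk carries the size in bytes via b_size, while this model works directly in block counts, and EXT4_MAX_TRANS_DATA is replaced by an assumed constant since its value is defined elsewhere:

#include <stdio.h>

#define MAX_TRANS_DATA 64   /* assumed limit; stands in for EXT4_MAX_TRANS_DATA */

/* Decide how many of the new blocks may join the current extent.
 * Returns 0 when the extent is already full and must be flushed first,
 * mirroring the flush_it path in the hunk. */
static int blocks_to_add(int nrblocks, int new_blocks)
{
        if (nrblocks >= MAX_TRANS_DATA)
                return 0;                              /* flush, then retry  */
        if (nrblocks + new_blocks > MAX_TRANS_DATA)
                return MAX_TRANS_DATA - nrblocks;      /* clamp the addition */
        return new_blocks;                             /* fits as-is         */
}

int main(void)
{
        printf("%d %d %d\n",
               blocks_to_add(64, 8),   /* 0: full, flush first */
               blocks_to_add(60, 8),   /* 4: clamped           */
               blocks_to_add(10, 8));  /* 8: merged unchanged  */
        return 0;
}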
| @@ -1842,17 +1922,35 @@ static int __mpage_da_writepage(struct page *page, | |||
| 1842 | struct buffer_head *bh, *head, fake; | 1922 | struct buffer_head *bh, *head, fake; |
| 1843 | sector_t logical; | 1923 | sector_t logical; |
| 1844 | 1924 | ||
| 1925 | if (mpd->io_done) { | ||
| 1926 | /* | ||
| 1927 | * Rest of the pages in the page_vec: ||
| 1928 | * redirty them and skip them. We will ||
| 1929 | * try to write them again after ||
| 1930 | * starting a new transaction | ||
| 1931 | */ | ||
| 1932 | redirty_page_for_writepage(wbc, page); | ||
| 1933 | unlock_page(page); | ||
| 1934 | return MPAGE_DA_EXTENT_TAIL; | ||
| 1935 | } | ||
| 1845 | /* | 1936 | /* |
| 1846 | * Can we merge this page to current extent? | 1937 | * Can we merge this page to current extent? |
| 1847 | */ | 1938 | */ |
| 1848 | if (mpd->next_page != page->index) { | 1939 | if (mpd->next_page != page->index) { |
| 1849 | /* | 1940 | /* |
| 1850 | * Nope, we can't. So, we map non-allocated blocks | 1941 | * Nope, we can't. So, we map non-allocated blocks |
| 1851 | * and start IO on them using __mpage_writepage() | 1942 | * and start IO on them using writepage() |
| 1852 | */ | 1943 | */ |
| 1853 | if (mpd->next_page != mpd->first_page) { | 1944 | if (mpd->next_page != mpd->first_page) { |
| 1854 | mpage_da_map_blocks(mpd); | 1945 | mpage_da_map_blocks(mpd); |
| 1855 | mpage_da_submit_io(mpd); | 1946 | mpage_da_submit_io(mpd); |
| 1947 | /* | ||
| 1948 | * skip the rest of the pages in the page_vec ||
| 1949 | */ | ||
| 1950 | mpd->io_done = 1; | ||
| 1951 | redirty_page_for_writepage(wbc, page); | ||
| 1952 | unlock_page(page); | ||
| 1953 | return MPAGE_DA_EXTENT_TAIL; | ||
| 1856 | } | 1954 | } |
| 1857 | 1955 | ||
| 1858 | /* | 1956 | /* |
| @@ -1883,6 +1981,8 @@ static int __mpage_da_writepage(struct page *page, | |||
| 1883 | set_buffer_dirty(bh); | 1981 | set_buffer_dirty(bh); |
| 1884 | set_buffer_uptodate(bh); | 1982 | set_buffer_uptodate(bh); |
| 1885 | mpage_add_bh_to_extent(mpd, logical, bh); | 1983 | mpage_add_bh_to_extent(mpd, logical, bh); |
| 1984 | if (mpd->io_done) | ||
| 1985 | return MPAGE_DA_EXTENT_TAIL; | ||
| 1886 | } else { | 1986 | } else { |
| 1887 | /* | 1987 | /* |
| 1888 | * Page with regular buffer heads, just add all dirty ones | 1988 | * Page with regular buffer heads, just add all dirty ones |
| @@ -1891,8 +1991,12 @@ static int __mpage_da_writepage(struct page *page, | |||
| 1891 | bh = head; | 1991 | bh = head; |
| 1892 | do { | 1992 | do { |
| 1893 | BUG_ON(buffer_locked(bh)); | 1993 | BUG_ON(buffer_locked(bh)); |
| 1894 | if (buffer_dirty(bh)) | 1994 | if (buffer_dirty(bh) && |
| 1995 | (!buffer_mapped(bh) || buffer_delay(bh))) { | ||
| 1895 | mpage_add_bh_to_extent(mpd, logical, bh); | 1996 | mpage_add_bh_to_extent(mpd, logical, bh); |
| 1997 | if (mpd->io_done) | ||
| 1998 | return MPAGE_DA_EXTENT_TAIL; | ||
| 1999 | } | ||
| 1896 | logical++; | 2000 | logical++; |
| 1897 | } while ((bh = bh->b_this_page) != head); | 2001 | } while ((bh = bh->b_this_page) != head); |
| 1898 | } | 2002 | } |
| @@ -1911,22 +2015,13 @@ static int __mpage_da_writepage(struct page *page, | |||
| 1911 | * | 2015 | * |
| 1912 | * This is a library function, which implements the writepages() | 2016 | * This is a library function, which implements the writepages() |
| 1913 | * address_space_operation. | 2017 | * address_space_operation. |
| 1914 | * | ||
| 1915 | * In order to avoid duplication of logic that deals with partial pages, | ||
| 1916 | * multiple bio per page, etc, we find non-allocated blocks, allocate | ||
| 1917 | * them with minimal calls to ->get_block() and re-use __mpage_writepage() | ||
| 1918 | * | ||
| 1919 | * It's important that we call __mpage_writepage() only once for each | ||
| 1920 | * involved page, otherwise we'd have to implement more complicated logic | ||
| 1921 | * to deal with pages w/o PG_lock or w/ PG_writeback and so on. | ||
| 1922 | * | ||
| 1923 | * See comments to mpage_writepages() | ||
| 1924 | */ | 2018 | */ |
| 1925 | static int mpage_da_writepages(struct address_space *mapping, | 2019 | static int mpage_da_writepages(struct address_space *mapping, |
| 1926 | struct writeback_control *wbc, | 2020 | struct writeback_control *wbc, |
| 1927 | get_block_t get_block) | 2021 | get_block_t get_block) |
| 1928 | { | 2022 | { |
| 1929 | struct mpage_da_data mpd; | 2023 | struct mpage_da_data mpd; |
| 2024 | long to_write; | ||
| 1930 | int ret; | 2025 | int ret; |
| 1931 | 2026 | ||
| 1932 | if (!get_block) | 2027 | if (!get_block) |
| @@ -1940,17 +2035,22 @@ static int mpage_da_writepages(struct address_space *mapping, | |||
| 1940 | mpd.first_page = 0; | 2035 | mpd.first_page = 0; |
| 1941 | mpd.next_page = 0; | 2036 | mpd.next_page = 0; |
| 1942 | mpd.get_block = get_block; | 2037 | mpd.get_block = get_block; |
| 2038 | mpd.io_done = 0; | ||
| 2039 | mpd.pages_written = 0; | ||
| 2040 | |||
| 2041 | to_write = wbc->nr_to_write; | ||
| 1943 | 2042 | ||
| 1944 | ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd); | 2043 | ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd); |
| 1945 | 2044 | ||
| 1946 | /* | 2045 | /* |
| 1947 | * Handle last extent of pages | 2046 | * Handle last extent of pages |
| 1948 | */ | 2047 | */ |
| 1949 | if (mpd.next_page != mpd.first_page) { | 2048 | if (!mpd.io_done && mpd.next_page != mpd.first_page) { |
| 1950 | mpage_da_map_blocks(&mpd); | 2049 | mpage_da_map_blocks(&mpd); |
| 1951 | mpage_da_submit_io(&mpd); | 2050 | mpage_da_submit_io(&mpd); |
| 1952 | } | 2051 | } |
| 1953 | 2052 | ||
| 2053 | wbc->nr_to_write = to_write - mpd.pages_written; | ||
| 1954 | return ret; | 2054 | return ret; |
| 1955 | } | 2055 | } |
| 1956 | 2056 | ||
| @@ -2155,63 +2255,95 @@ static int ext4_da_writepage(struct page *page, | |||
| 2155 | } | 2255 | } |
| 2156 | 2256 | ||
| 2157 | /* | 2257 | /* |
| 2158 | * For now just follow the DIO way to estimate the max credits | 2258 | * This is called via ext4_da_writepages() to |
| 2159 | * needed to write out EXT4_MAX_WRITEBACK_PAGES. | 2259 | * calculate the total number of credits to reserve to fit |
| 2160 | * todo: need to calculate the max credits need for | 2260 | * a single extent allocation into a single transaction, |
| 2161 | * extent based files, currently the DIO credits is based on | 2261 | * ext4_da_writepages() will loop calling this before |
| 2162 | * indirect-blocks mapping way. | 2262 | * the block allocation. |
| 2163 | * | ||
| 2164 | * Probably should have a generic way to calculate credits | ||
| 2165 | * for DIO, writepages, and truncate | ||
| 2166 | */ | 2263 | */ |
| 2167 | #define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS | 2264 | |
| 2168 | #define EXT4_MAX_WRITEBACK_CREDITS DIO_CREDITS | 2265 | static int ext4_da_writepages_trans_blocks(struct inode *inode) |
| 2266 | { | ||
| 2267 | int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; | ||
| 2268 | |||
| 2269 | /* | ||
| 2270 | * With non-extent format the journal credit needed to | ||
| 2271 | * insert nrblocks contiguous blocks is dependent on the ||
| 2272 | * number of contiguous blocks. So we will limit the ||
| 2273 | * number of contiguous blocks to a sane value ||
| 2274 | */ | ||
| 2275 | if (!(inode->i_flags & EXT4_EXTENTS_FL) && | ||
| 2276 | (max_blocks > EXT4_MAX_TRANS_DATA)) | ||
| 2277 | max_blocks = EXT4_MAX_TRANS_DATA; | ||
| 2278 | |||
| 2279 | return ext4_chunk_trans_blocks(inode, max_blocks); | ||
| 2280 | } | ||
| 2169 | 2281 | ||
| 2170 | static int ext4_da_writepages(struct address_space *mapping, | 2282 | static int ext4_da_writepages(struct address_space *mapping, |
| 2171 | struct writeback_control *wbc) | 2283 | struct writeback_control *wbc) |
| 2172 | { | 2284 | { |
| 2173 | struct inode *inode = mapping->host; | ||
| 2174 | handle_t *handle = NULL; | 2285 | handle_t *handle = NULL; |
| 2175 | int needed_blocks; | ||
| 2176 | int ret = 0; | ||
| 2177 | long to_write; | ||
| 2178 | loff_t range_start = 0; | 2286 | loff_t range_start = 0; |
| 2287 | struct inode *inode = mapping->host; | ||
| 2288 | int needed_blocks, ret = 0, nr_to_writebump = 0; | ||
| 2289 | long to_write, pages_skipped = 0; | ||
| 2290 | struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); | ||
| 2179 | 2291 | ||
| 2180 | /* | 2292 | /* |
| 2181 | * No pages to write? This is mainly a kludge to avoid starting | 2293 | * No pages to write? This is mainly a kludge to avoid starting |
| 2182 | * a transaction for special inodes like journal inode on last iput() | 2294 | * a transaction for special inodes like journal inode on last iput() |
| 2183 | * because that could violate lock ordering on umount | 2295 | * because that could violate lock ordering on umount |
| 2184 | */ | 2296 | */ |
| 2185 | if (!mapping->nrpages) | 2297 | if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) |
| 2186 | return 0; | 2298 | return 0; |
| 2187 | |||
| 2188 | /* | 2299 | /* |
| 2189 | * Estimate the worse case needed credits to write out | 2300 | * Make sure nr_to_write is >= sbi->s_mb_stream_request |
| 2190 | * EXT4_MAX_BUF_BLOCKS pages | 2301 | * This makes sure small file blocks are allocated in a |
| 2302 | * single attempt. This ensures that small files ||
| 2303 | * get less fragmented. | ||
| 2191 | */ | 2304 | */ |
| 2192 | needed_blocks = EXT4_MAX_WRITEBACK_CREDITS; | 2305 | if (wbc->nr_to_write < sbi->s_mb_stream_request) { |
| 2306 | nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; | ||
| 2307 | wbc->nr_to_write = sbi->s_mb_stream_request; | ||
| 2308 | } | ||
| 2193 | 2309 | ||
| 2194 | to_write = wbc->nr_to_write; | 2310 | if (!wbc->range_cyclic) |
| 2195 | if (!wbc->range_cyclic) { | ||
| 2196 | /* | 2311 | /* |
| 2197 | * If range_cyclic is not set force range_cont | 2312 | * If range_cyclic is not set force range_cont |
| 2198 | * and save the old writeback_index | 2313 | * and save the old writeback_index |
| 2199 | */ | 2314 | */ |
| 2200 | wbc->range_cont = 1; | 2315 | wbc->range_cont = 1; |
| 2201 | range_start = wbc->range_start; | ||
| 2202 | } | ||
| 2203 | 2316 | ||
| 2204 | while (!ret && to_write) { | 2317 | range_start = wbc->range_start; |
| 2318 | pages_skipped = wbc->pages_skipped; | ||
| 2319 | |||
| 2320 | restart_loop: | ||
| 2321 | to_write = wbc->nr_to_write; | ||
| 2322 | while (!ret && to_write > 0) { | ||
| 2323 | |||
| 2324 | /* | ||
| 2325 | * we insert one extent at a time. So we need | ||
| 2326 | * credit needed for single extent allocation. | ||
| 2327 | * journalled mode is currently not supported | ||
| 2328 | * by delalloc | ||
| 2329 | */ | ||
| 2330 | BUG_ON(ext4_should_journal_data(inode)); | ||
| 2331 | needed_blocks = ext4_da_writepages_trans_blocks(inode); | ||
| 2332 | |||
| 2205 | /* start a new transaction*/ | 2333 | /* start a new transaction*/ |
| 2206 | handle = ext4_journal_start(inode, needed_blocks); | 2334 | handle = ext4_journal_start(inode, needed_blocks); |
| 2207 | if (IS_ERR(handle)) { | 2335 | if (IS_ERR(handle)) { |
| 2208 | ret = PTR_ERR(handle); | 2336 | ret = PTR_ERR(handle); |
| 2337 | printk(KERN_EMERG "%s: jbd2_start: " | ||
| 2338 | "%ld pages, ino %lu; err %d\n", __func__, | ||
| 2339 | wbc->nr_to_write, inode->i_ino, ret); | ||
| 2340 | dump_stack(); | ||
| 2209 | goto out_writepages; | 2341 | goto out_writepages; |
| 2210 | } | 2342 | } |
| 2211 | if (ext4_should_order_data(inode)) { | 2343 | if (ext4_should_order_data(inode)) { |
| 2212 | /* | 2344 | /* |
| 2213 | * With ordered mode we need to add | 2345 | * With ordered mode we need to add |
| 2214 | * the inode to the journal handle | 2346 | * the inode to the journal handle |
| 2215 | * when we do block allocation. | 2347 | * when we do block allocation. |
| 2216 | */ | 2348 | */ |
| 2217 | ret = ext4_jbd2_file_inode(handle, inode); | 2349 | ret = ext4_jbd2_file_inode(handle, inode); |
| @@ -2219,20 +2351,20 @@ static int ext4_da_writepages(struct address_space *mapping, | |||
| 2219 | ext4_journal_stop(handle); | 2351 | ext4_journal_stop(handle); |
| 2220 | goto out_writepages; | 2352 | goto out_writepages; |
| 2221 | } | 2353 | } |
| 2222 | |||
| 2223 | } | 2354 | } |
| 2224 | /* | ||
| 2225 | * set the max dirty pages could be write at a time | ||
| 2226 | * to fit into the reserved transaction credits | ||
| 2227 | */ | ||
| 2228 | if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES) | ||
| 2229 | wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES; | ||
| 2230 | 2355 | ||
| 2231 | to_write -= wbc->nr_to_write; | 2356 | to_write -= wbc->nr_to_write; |
| 2232 | ret = mpage_da_writepages(mapping, wbc, | 2357 | ret = mpage_da_writepages(mapping, wbc, |
| 2233 | ext4_da_get_block_write); | 2358 | ext4_da_get_block_write); |
| 2234 | ext4_journal_stop(handle); | 2359 | ext4_journal_stop(handle); |
| 2235 | if (wbc->nr_to_write) { | 2360 | if (ret == MPAGE_DA_EXTENT_TAIL) { |
| 2361 | /* | ||
| 2362 | * got one extent now try with | ||
| 2363 | * rest of the pages | ||
| 2364 | */ | ||
| 2365 | to_write += wbc->nr_to_write; | ||
| 2366 | ret = 0; | ||
| 2367 | } else if (wbc->nr_to_write) { | ||
| 2236 | /* | 2368 | /* |
| 2237 | * There is no more writeout needed | 2369 | * There is no more writeout needed |
| 2238 | * or we requested for a noblocking writeout | 2370 | * or we requested for a noblocking writeout |
| @@ -2244,10 +2376,18 @@ static int ext4_da_writepages(struct address_space *mapping, | |||
| 2244 | wbc->nr_to_write = to_write; | 2376 | wbc->nr_to_write = to_write; |
| 2245 | } | 2377 | } |
| 2246 | 2378 | ||
| 2247 | out_writepages: | 2379 | if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) { |
| 2248 | wbc->nr_to_write = to_write; | 2380 | /* We skipped pages in this loop */ |
| 2249 | if (range_start) | ||
| 2250 | wbc->range_start = range_start; | 2381 | wbc->range_start = range_start; |
| 2382 | wbc->nr_to_write = to_write + | ||
| 2383 | wbc->pages_skipped - pages_skipped; | ||
| 2384 | wbc->pages_skipped = pages_skipped; | ||
| 2385 | goto restart_loop; | ||
| 2386 | } | ||
| 2387 | |||
| 2388 | out_writepages: | ||
| 2389 | wbc->nr_to_write = to_write - nr_to_writebump; | ||
| 2390 | wbc->range_start = range_start; | ||
| 2251 | return ret; | 2391 | return ret; |
| 2252 | } | 2392 | } |
| 2253 | 2393 | ||
| @@ -2280,8 +2420,11 @@ retry: | |||
| 2280 | } | 2420 | } |
| 2281 | 2421 | ||
| 2282 | page = __grab_cache_page(mapping, index); | 2422 | page = __grab_cache_page(mapping, index); |
| 2283 | if (!page) | 2423 | if (!page) { |
| 2284 | return -ENOMEM; | 2424 | ext4_journal_stop(handle); |
| 2425 | ret = -ENOMEM; | ||
| 2426 | goto out; | ||
| 2427 | } | ||
| 2285 | *pagep = page; | 2428 | *pagep = page; |
| 2286 | 2429 | ||
| 2287 | ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, | 2430 | ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, |
| @@ -2806,59 +2949,63 @@ static int ext4_journalled_set_page_dirty(struct page *page) | |||
| 2806 | } | 2949 | } |
| 2807 | 2950 | ||
| 2808 | static const struct address_space_operations ext4_ordered_aops = { | 2951 | static const struct address_space_operations ext4_ordered_aops = { |
| 2809 | .readpage = ext4_readpage, | 2952 | .readpage = ext4_readpage, |
| 2810 | .readpages = ext4_readpages, | 2953 | .readpages = ext4_readpages, |
| 2811 | .writepage = ext4_normal_writepage, | 2954 | .writepage = ext4_normal_writepage, |
| 2812 | .sync_page = block_sync_page, | 2955 | .sync_page = block_sync_page, |
| 2813 | .write_begin = ext4_write_begin, | 2956 | .write_begin = ext4_write_begin, |
| 2814 | .write_end = ext4_ordered_write_end, | 2957 | .write_end = ext4_ordered_write_end, |
| 2815 | .bmap = ext4_bmap, | 2958 | .bmap = ext4_bmap, |
| 2816 | .invalidatepage = ext4_invalidatepage, | 2959 | .invalidatepage = ext4_invalidatepage, |
| 2817 | .releasepage = ext4_releasepage, | 2960 | .releasepage = ext4_releasepage, |
| 2818 | .direct_IO = ext4_direct_IO, | 2961 | .direct_IO = ext4_direct_IO, |
| 2819 | .migratepage = buffer_migrate_page, | 2962 | .migratepage = buffer_migrate_page, |
| 2963 | .is_partially_uptodate = block_is_partially_uptodate, | ||
| 2820 | }; | 2964 | }; |
| 2821 | 2965 | ||
| 2822 | static const struct address_space_operations ext4_writeback_aops = { | 2966 | static const struct address_space_operations ext4_writeback_aops = { |
| 2823 | .readpage = ext4_readpage, | 2967 | .readpage = ext4_readpage, |
| 2824 | .readpages = ext4_readpages, | 2968 | .readpages = ext4_readpages, |
| 2825 | .writepage = ext4_normal_writepage, | 2969 | .writepage = ext4_normal_writepage, |
| 2826 | .sync_page = block_sync_page, | 2970 | .sync_page = block_sync_page, |
| 2827 | .write_begin = ext4_write_begin, | 2971 | .write_begin = ext4_write_begin, |
| 2828 | .write_end = ext4_writeback_write_end, | 2972 | .write_end = ext4_writeback_write_end, |
| 2829 | .bmap = ext4_bmap, | 2973 | .bmap = ext4_bmap, |
| 2830 | .invalidatepage = ext4_invalidatepage, | 2974 | .invalidatepage = ext4_invalidatepage, |
| 2831 | .releasepage = ext4_releasepage, | 2975 | .releasepage = ext4_releasepage, |
| 2832 | .direct_IO = ext4_direct_IO, | 2976 | .direct_IO = ext4_direct_IO, |
| 2833 | .migratepage = buffer_migrate_page, | 2977 | .migratepage = buffer_migrate_page, |
| 2978 | .is_partially_uptodate = block_is_partially_uptodate, | ||
| 2834 | }; | 2979 | }; |
| 2835 | 2980 | ||
| 2836 | static const struct address_space_operations ext4_journalled_aops = { | 2981 | static const struct address_space_operations ext4_journalled_aops = { |
| 2837 | .readpage = ext4_readpage, | 2982 | .readpage = ext4_readpage, |
| 2838 | .readpages = ext4_readpages, | 2983 | .readpages = ext4_readpages, |
| 2839 | .writepage = ext4_journalled_writepage, | 2984 | .writepage = ext4_journalled_writepage, |
| 2840 | .sync_page = block_sync_page, | 2985 | .sync_page = block_sync_page, |
| 2841 | .write_begin = ext4_write_begin, | 2986 | .write_begin = ext4_write_begin, |
| 2842 | .write_end = ext4_journalled_write_end, | 2987 | .write_end = ext4_journalled_write_end, |
| 2843 | .set_page_dirty = ext4_journalled_set_page_dirty, | 2988 | .set_page_dirty = ext4_journalled_set_page_dirty, |
| 2844 | .bmap = ext4_bmap, | 2989 | .bmap = ext4_bmap, |
| 2845 | .invalidatepage = ext4_invalidatepage, | 2990 | .invalidatepage = ext4_invalidatepage, |
| 2846 | .releasepage = ext4_releasepage, | 2991 | .releasepage = ext4_releasepage, |
| 2992 | .is_partially_uptodate = block_is_partially_uptodate, | ||
| 2847 | }; | 2993 | }; |
| 2848 | 2994 | ||
| 2849 | static const struct address_space_operations ext4_da_aops = { | 2995 | static const struct address_space_operations ext4_da_aops = { |
| 2850 | .readpage = ext4_readpage, | 2996 | .readpage = ext4_readpage, |
| 2851 | .readpages = ext4_readpages, | 2997 | .readpages = ext4_readpages, |
| 2852 | .writepage = ext4_da_writepage, | 2998 | .writepage = ext4_da_writepage, |
| 2853 | .writepages = ext4_da_writepages, | 2999 | .writepages = ext4_da_writepages, |
| 2854 | .sync_page = block_sync_page, | 3000 | .sync_page = block_sync_page, |
| 2855 | .write_begin = ext4_da_write_begin, | 3001 | .write_begin = ext4_da_write_begin, |
| 2856 | .write_end = ext4_da_write_end, | 3002 | .write_end = ext4_da_write_end, |
| 2857 | .bmap = ext4_bmap, | 3003 | .bmap = ext4_bmap, |
| 2858 | .invalidatepage = ext4_da_invalidatepage, | 3004 | .invalidatepage = ext4_da_invalidatepage, |
| 2859 | .releasepage = ext4_releasepage, | 3005 | .releasepage = ext4_releasepage, |
| 2860 | .direct_IO = ext4_direct_IO, | 3006 | .direct_IO = ext4_direct_IO, |
| 2861 | .migratepage = buffer_migrate_page, | 3007 | .migratepage = buffer_migrate_page, |
| 3008 | .is_partially_uptodate = block_is_partially_uptodate, | ||
| 2862 | }; | 3009 | }; |
| 2863 | 3010 | ||
| 2864 | void ext4_set_aops(struct inode *inode) | 3011 | void ext4_set_aops(struct inode *inode) |
| @@ -3430,6 +3577,9 @@ void ext4_truncate(struct inode *inode) | |||
| 3430 | * modify the block allocation tree. | 3577 | * modify the block allocation tree. |
| 3431 | */ | 3578 | */ |
| 3432 | down_write(&ei->i_data_sem); | 3579 | down_write(&ei->i_data_sem); |
| 3580 | |||
| 3581 | ext4_discard_reservation(inode); | ||
| 3582 | |||
| 3433 | /* | 3583 | /* |
| 3434 | * The orphan list entry will now protect us from any crash which | 3584 | * The orphan list entry will now protect us from any crash which |
| 3435 | * occurs before the truncate completes, so it is now safe to propagate | 3585 | * occurs before the truncate completes, so it is now safe to propagate |
| @@ -3499,8 +3649,6 @@ do_indirects: | |||
| 3499 | ; | 3649 | ; |
| 3500 | } | 3650 | } |
| 3501 | 3651 | ||
| 3502 | ext4_discard_reservation(inode); | ||
| 3503 | |||
| 3504 | up_write(&ei->i_data_sem); | 3652 | up_write(&ei->i_data_sem); |
| 3505 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | 3653 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); |
| 3506 | ext4_mark_inode_dirty(handle, inode); | 3654 | ext4_mark_inode_dirty(handle, inode); |
| @@ -3586,6 +3734,16 @@ static int __ext4_get_inode_loc(struct inode *inode, | |||
| 3586 | } | 3734 | } |
| 3587 | if (!buffer_uptodate(bh)) { | 3735 | if (!buffer_uptodate(bh)) { |
| 3588 | lock_buffer(bh); | 3736 | lock_buffer(bh); |
| 3737 | |||
| 3738 | /* | ||
| 3739 | * If the buffer has the write error flag, we have failed | ||
| 3740 | * to write out another inode in the same block. In this | ||
| 3741 | * case, we don't have to read the block because we may | ||
| 3742 | * read the old inode data successfully. | ||
| 3743 | */ | ||
| 3744 | if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) | ||
| 3745 | set_buffer_uptodate(bh); | ||
| 3746 | |||
| 3589 | if (buffer_uptodate(bh)) { | 3747 | if (buffer_uptodate(bh)) { |
| 3590 | /* someone brought it uptodate while we waited */ | 3748 | /* someone brought it uptodate while we waited */ |
| 3591 | unlock_buffer(bh); | 3749 | unlock_buffer(bh); |
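The hunk above short-circuits the inode block read in __ext4_get_inode_loc() when the buffer carries a prior write error: the in-memory copy is assumed to be newer than what is on disk, so the buffer is marked uptodate instead of being re-read. A minimal userspace sketch of that decision, using stand-in flags rather than the kernel buffer_head API:

#include <stdio.h>
#include <stdbool.h>

/* toy stand-in for the two buffer_head bits the hunk above looks at */
struct buf {
	bool uptodate;
	bool write_io_error;
};

static bool need_read(struct buf *bh)
{
	/* a failed write means the in-memory copy is the freshest data */
	if (bh->write_io_error && !bh->uptodate)
		bh->uptodate = true;
	return !bh->uptodate;	/* only submit a read if still not uptodate */
}

int main(void)
{
	struct buf clean = { false, false };
	struct buf failed_write = { false, true };

	printf("clean buffer:       %s\n", need_read(&clean) ? "submit read" : "skip read");
	printf("after failed write: %s\n", need_read(&failed_write) ? "submit read" : "skip read");
	return 0;
}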
| @@ -4258,57 +4416,129 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, | |||
| 4258 | return 0; | 4416 | return 0; |
| 4259 | } | 4417 | } |
| 4260 | 4418 | ||
| 4419 | static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, | ||
| 4420 | int chunk) | ||
| 4421 | { | ||
| 4422 | int indirects; | ||
| 4423 | |||
| 4424 | /* if nrblocks are contiguous */ | ||
| 4425 | if (chunk) { | ||
| 4426 | /* | ||
| 4427 | * With N contiguous data blocks, we need at most | ||
| 4428 | * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks | ||
| 4429 | * 2 dindirect blocks | ||
| 4430 | * 1 tindirect block | ||
| 4431 | */ | ||
| 4432 | indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb); | ||
| 4433 | return indirects + 3; | ||
| 4434 | } | ||
| 4435 | /* | ||
| 4436 | * if nrblocks are not contiguous, worst case, each block touches | ||
| 4437 | * an indirect block, and each indirect block touches a double indirect | ||
| 4438 | * block, plus a triple indirect block | ||
| 4439 | */ | ||
| 4440 | indirects = nrblocks * 2 + 1; | ||
| 4441 | return indirects; | ||
| 4442 | } | ||
| 4443 | |||
| 4444 | static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) | ||
| 4445 | { | ||
| 4446 | if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) | ||
| 4447 | return ext4_indirect_trans_blocks(inode, nrblocks, 0); | ||
| 4448 | return ext4_ext_index_trans_blocks(inode, nrblocks, 0); | ||
| 4449 | } | ||
| 4261 | /* | 4450 | /* |
| 4262 | * How many blocks doth make a writepage()? | 4451 | * Account for index blocks, block group bitmaps and block group |
| 4452 | * descriptor blocks if we modify data blocks and index blocks; | ||
| 4453 | * worst case, the index blocks spread over different block groups | ||
| 4263 | * | 4454 | * |
| 4264 | * With N blocks per page, it may be: | 4455 | * If datablocks are discontiguous, they may spread over |
| 4265 | * N data blocks | 4456 | * different block groups too. If they are contiguous, with flexbg, |
| 4266 | * 2 indirect block | 4457 | * they could still cross a block group boundary. |
| 4267 | * 2 dindirect | ||
| 4268 | * 1 tindirect | ||
| 4269 | * N+5 bitmap blocks (from the above) | ||
| 4270 | * N+5 group descriptor summary blocks | ||
| 4271 | * 1 inode block | ||
| 4272 | * 1 superblock. | ||
| 4273 | * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quote files | ||
| 4274 | * | 4458 | * |
| 4275 | * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS | 4459 | * Also account for superblock, inode, quota and xattr blocks |
| 4276 | * | 4460 | */ |
| 4277 | * With ordered or writeback data it's the same, less the N data blocks. | 4461 | int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) |
| 4462 | { | ||
| 4463 | int groups, gdpblocks; | ||
| 4464 | int idxblocks; | ||
| 4465 | int ret = 0; | ||
| 4466 | |||
| 4467 | /* | ||
| 4468 | * How many index blocks do we need to touch to modify nrblocks? | ||
| 4469 | * The "Chunk" flag indicates whether the nrblocks are | ||
| 4470 | * physically contiguous on disk | ||
| 4471 | * | ||
| 4472 | * Direct IO and fallocate call get_block to allocate | ||
| 4473 | * one single extent at a time, so they can set the "Chunk" flag | ||
| 4474 | */ | ||
| 4475 | idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); | ||
| 4476 | |||
| 4477 | ret = idxblocks; | ||
| 4478 | |||
| 4479 | /* | ||
| 4480 | * Now let's see how many group bitmaps and group descriptors need | ||
| 4481 | * to be accounted for | ||
| 4482 | */ | ||
| 4483 | groups = idxblocks; | ||
| 4484 | if (chunk) | ||
| 4485 | groups += 1; | ||
| 4486 | else | ||
| 4487 | groups += nrblocks; | ||
| 4488 | |||
| 4489 | gdpblocks = groups; | ||
| 4490 | if (groups > EXT4_SB(inode->i_sb)->s_groups_count) | ||
| 4491 | groups = EXT4_SB(inode->i_sb)->s_groups_count; | ||
| 4492 | if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) | ||
| 4493 | gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; | ||
| 4494 | |||
| 4495 | /* bitmaps and block group descriptor blocks */ | ||
| 4496 | ret += groups + gdpblocks; | ||
| 4497 | |||
| 4498 | /* Blocks for super block, inode, quota and xattr blocks */ | ||
| 4499 | ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); | ||
| 4500 | |||
| 4501 | return ret; | ||
| 4502 | } | ||
| 4503 | |||
| 4504 | /* | ||
| 4505 | * Calculate the total number of credits to reserve to fit | ||
| 4506 | * the modification of a single page into a single transaction, | ||
| 4507 | * which may include multiple chunks of block allocations. | ||
| 4278 | * | 4508 | * |
| 4279 | * If the inode's direct blocks can hold an integral number of pages then a | 4509 | * This could be called via ext4_write_begin() |
| 4280 | * page cannot straddle two indirect blocks, and we can only touch one indirect | ||
| 4281 | * and dindirect block, and the "5" above becomes "3". | ||
| 4282 | * | 4510 | * |
| 4283 | * This still overestimates under most circumstances. If we were to pass the | 4511 | * We need to consider the worst case, when |
| 4284 | * start and end offsets in here as well we could do block_to_path() on each | 4512 | * each extent needs one new block. |
| 4285 | * block and work out the exact number of indirects which are touched. Pah. | ||
| 4286 | */ | 4513 | */ |
| 4287 | |||
| 4288 | int ext4_writepage_trans_blocks(struct inode *inode) | 4514 | int ext4_writepage_trans_blocks(struct inode *inode) |
| 4289 | { | 4515 | { |
| 4290 | int bpp = ext4_journal_blocks_per_page(inode); | 4516 | int bpp = ext4_journal_blocks_per_page(inode); |
| 4291 | int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3; | ||
| 4292 | int ret; | 4517 | int ret; |
| 4293 | 4518 | ||
| 4294 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) | 4519 | ret = ext4_meta_trans_blocks(inode, bpp, 0); |
| 4295 | return ext4_ext_writepage_trans_blocks(inode, bpp); | ||
| 4296 | 4520 | ||
| 4521 | /* Account for data blocks for journalled mode */ | ||
| 4297 | if (ext4_should_journal_data(inode)) | 4522 | if (ext4_should_journal_data(inode)) |
| 4298 | ret = 3 * (bpp + indirects) + 2; | 4523 | ret += bpp; |
| 4299 | else | ||
| 4300 | ret = 2 * (bpp + indirects) + 2; | ||
| 4301 | |||
| 4302 | #ifdef CONFIG_QUOTA | ||
| 4303 | /* We know that structure was already allocated during DQUOT_INIT so | ||
| 4304 | * we will be updating only the data blocks + inodes */ | ||
| 4305 | ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); | ||
| 4306 | #endif | ||
| 4307 | |||
| 4308 | return ret; | 4524 | return ret; |
| 4309 | } | 4525 | } |
| 4310 | 4526 | ||
| 4311 | /* | 4527 | /* |
| 4528 | * Calculate the journal credits for a chunk of data modification. | ||
| 4529 | * | ||
| 4530 | * This is called from DIO, fallocate or whoever calls | ||
| 4531 | * ext4_get_blocks_wrap() to map/allocate a chunk of contiguous disk blocks. | ||
| 4532 | * | ||
| 4533 | * Journal buffers for data blocks are not included here, as DIO | ||
| 4534 | * and fallocate do not need to journal data buffers. | ||
| 4535 | */ | ||
| 4536 | int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) | ||
| 4537 | { | ||
| 4538 | return ext4_meta_trans_blocks(inode, nrblocks, 1); | ||
| 4539 | } | ||
| 4540 | |||
| 4541 | /* | ||
| 4312 | * The caller must have previously called ext4_reserve_inode_write(). | 4542 | * The caller must have previously called ext4_reserve_inode_write(). |
| 4313 | * Give this, we know that the caller already has write access to iloc->bh. | 4543 | * Give this, we know that the caller already has write access to iloc->bh. |
| 4314 | */ | 4544 | */ |
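The rewritten credit logic above funnels through ext4_meta_trans_blocks(). Below is a stand-alone model of that worst-case arithmetic for the indirect-mapped case; the constants are illustrative stand-ins (a 4K block holding 1024 addresses, a made-up fixed overhead), not the kernel macros:

#include <stdio.h>

#define ADDR_PER_BLOCK	1024	/* addresses per indirect block, stand-in */
#define GROUPS_COUNT	128	/* s_groups_count stand-in */
#define GDB_COUNT	1	/* s_gdb_count stand-in */
#define META_OVERHEAD	10	/* sb + inode + quota + xattr stand-in */

/* index blocks touched by an indirect-mapped file, mirroring
 * ext4_indirect_trans_blocks() above */
static int indirect_trans_blocks(int nrblocks, int chunk)
{
	if (chunk)	/* contiguous: N/ADDR_PER_BLOCK indirects + 2 dindirect + 1 tindirect */
		return nrblocks / ADDR_PER_BLOCK + 3;
	/* discontiguous worst case: an indirect and a dindirect per block, plus a tindirect */
	return nrblocks * 2 + 1;
}

/* same shape as ext4_meta_trans_blocks() above */
static int meta_trans_blocks(int nrblocks, int chunk)
{
	int idxblocks = indirect_trans_blocks(nrblocks, chunk);
	int groups = idxblocks + (chunk ? 1 : nrblocks);
	int gdpblocks = groups;

	if (groups > GROUPS_COUNT)
		groups = GROUPS_COUNT;
	if (groups > GDB_COUNT)
		gdpblocks = GDB_COUNT;

	/* index blocks + bitmaps + group descriptor blocks + fixed overhead */
	return idxblocks + groups + gdpblocks + META_OVERHEAD;
}

int main(void)
{
	printf("one contiguous chunk of 16 blocks: %d credits\n", meta_trans_blocks(16, 1));
	printf("16 discontiguous blocks:           %d credits\n", meta_trans_blocks(16, 0));
	return 0;
}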
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8d141a25bbee..e0e3a5eb1ddb 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c | |||
| @@ -787,13 +787,16 @@ static int ext4_mb_init_cache(struct page *page, char *incore) | |||
| 787 | if (bh_uptodate_or_lock(bh[i])) | 787 | if (bh_uptodate_or_lock(bh[i])) |
| 788 | continue; | 788 | continue; |
| 789 | 789 | ||
| 790 | spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); | ||
| 790 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | 791 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { |
| 791 | ext4_init_block_bitmap(sb, bh[i], | 792 | ext4_init_block_bitmap(sb, bh[i], |
| 792 | first_group + i, desc); | 793 | first_group + i, desc); |
| 793 | set_buffer_uptodate(bh[i]); | 794 | set_buffer_uptodate(bh[i]); |
| 794 | unlock_buffer(bh[i]); | 795 | unlock_buffer(bh[i]); |
| 796 | spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); | ||
| 795 | continue; | 797 | continue; |
| 796 | } | 798 | } |
| 799 | spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); | ||
| 797 | get_bh(bh[i]); | 800 | get_bh(bh[i]); |
| 798 | bh[i]->b_end_io = end_buffer_read_sync; | 801 | bh[i]->b_end_io = end_buffer_read_sync; |
| 799 | submit_bh(READ, bh[i]); | 802 | submit_bh(READ, bh[i]); |
| @@ -2477,7 +2480,7 @@ err_freesgi: | |||
| 2477 | int ext4_mb_init(struct super_block *sb, int needs_recovery) | 2480 | int ext4_mb_init(struct super_block *sb, int needs_recovery) |
| 2478 | { | 2481 | { |
| 2479 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 2482 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
| 2480 | unsigned i; | 2483 | unsigned i, j; |
| 2481 | unsigned offset; | 2484 | unsigned offset; |
| 2482 | unsigned max; | 2485 | unsigned max; |
| 2483 | int ret; | 2486 | int ret; |
| @@ -2537,7 +2540,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) | |||
| 2537 | sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT; | 2540 | sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT; |
| 2538 | sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; | 2541 | sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; |
| 2539 | 2542 | ||
| 2540 | i = sizeof(struct ext4_locality_group) * NR_CPUS; | 2543 | i = sizeof(struct ext4_locality_group) * nr_cpu_ids; |
| 2541 | sbi->s_locality_groups = kmalloc(i, GFP_KERNEL); | 2544 | sbi->s_locality_groups = kmalloc(i, GFP_KERNEL); |
| 2542 | if (sbi->s_locality_groups == NULL) { | 2545 | if (sbi->s_locality_groups == NULL) { |
| 2543 | clear_opt(sbi->s_mount_opt, MBALLOC); | 2546 | clear_opt(sbi->s_mount_opt, MBALLOC); |
| @@ -2545,11 +2548,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) | |||
| 2545 | kfree(sbi->s_mb_maxs); | 2548 | kfree(sbi->s_mb_maxs); |
| 2546 | return -ENOMEM; | 2549 | return -ENOMEM; |
| 2547 | } | 2550 | } |
| 2548 | for (i = 0; i < NR_CPUS; i++) { | 2551 | for (i = 0; i < nr_cpu_ids; i++) { |
| 2549 | struct ext4_locality_group *lg; | 2552 | struct ext4_locality_group *lg; |
| 2550 | lg = &sbi->s_locality_groups[i]; | 2553 | lg = &sbi->s_locality_groups[i]; |
| 2551 | mutex_init(&lg->lg_mutex); | 2554 | mutex_init(&lg->lg_mutex); |
| 2552 | INIT_LIST_HEAD(&lg->lg_prealloc_list); | 2555 | for (j = 0; j < PREALLOC_TB_SIZE; j++) |
| 2556 | INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); | ||
| 2553 | spin_lock_init(&lg->lg_prealloc_lock); | 2557 | spin_lock_init(&lg->lg_prealloc_lock); |
| 2554 | } | 2558 | } |
| 2555 | 2559 | ||
| @@ -3260,6 +3264,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, | |||
| 3260 | struct ext4_prealloc_space *pa) | 3264 | struct ext4_prealloc_space *pa) |
| 3261 | { | 3265 | { |
| 3262 | unsigned int len = ac->ac_o_ex.fe_len; | 3266 | unsigned int len = ac->ac_o_ex.fe_len; |
| 3267 | |||
| 3263 | ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, | 3268 | ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, |
| 3264 | &ac->ac_b_ex.fe_group, | 3269 | &ac->ac_b_ex.fe_group, |
| 3265 | &ac->ac_b_ex.fe_start); | 3270 | &ac->ac_b_ex.fe_start); |
| @@ -3277,14 +3282,45 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, | |||
| 3277 | } | 3282 | } |
| 3278 | 3283 | ||
| 3279 | /* | 3284 | /* |
| 3285 | * Return the prealloc space that has the minimal distance | ||
| 3286 | * from the goal block. @cpa is the prealloc | ||
| 3287 | * space with the currently known minimal distance | ||
| 3288 | * from the goal block. | ||
| 3289 | */ | ||
| 3290 | static struct ext4_prealloc_space * | ||
| 3291 | ext4_mb_check_group_pa(ext4_fsblk_t goal_block, | ||
| 3292 | struct ext4_prealloc_space *pa, | ||
| 3293 | struct ext4_prealloc_space *cpa) | ||
| 3294 | { | ||
| 3295 | ext4_fsblk_t cur_distance, new_distance; | ||
| 3296 | |||
| 3297 | if (cpa == NULL) { | ||
| 3298 | atomic_inc(&pa->pa_count); | ||
| 3299 | return pa; | ||
| 3300 | } | ||
| 3301 | cur_distance = abs(goal_block - cpa->pa_pstart); | ||
| 3302 | new_distance = abs(goal_block - pa->pa_pstart); | ||
| 3303 | |||
| 3304 | if (cur_distance < new_distance) | ||
| 3305 | return cpa; | ||
| 3306 | |||
| 3307 | /* drop the previous reference */ | ||
| 3308 | atomic_dec(&cpa->pa_count); | ||
| 3309 | atomic_inc(&pa->pa_count); | ||
| 3310 | return pa; | ||
| 3311 | } | ||
| 3312 | |||
| 3313 | /* | ||
| 3280 | * search goal blocks in preallocated space | 3314 | * search goal blocks in preallocated space |
| 3281 | */ | 3315 | */ |
| 3282 | static noinline_for_stack int | 3316 | static noinline_for_stack int |
| 3283 | ext4_mb_use_preallocated(struct ext4_allocation_context *ac) | 3317 | ext4_mb_use_preallocated(struct ext4_allocation_context *ac) |
| 3284 | { | 3318 | { |
| 3319 | int order, i; | ||
| 3285 | struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); | 3320 | struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); |
| 3286 | struct ext4_locality_group *lg; | 3321 | struct ext4_locality_group *lg; |
| 3287 | struct ext4_prealloc_space *pa; | 3322 | struct ext4_prealloc_space *pa, *cpa = NULL; |
| 3323 | ext4_fsblk_t goal_block; | ||
| 3288 | 3324 | ||
| 3289 | /* only data can be preallocated */ | 3325 | /* only data can be preallocated */ |
| 3290 | if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) | 3326 | if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) |
| @@ -3322,22 +3358,38 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) | |||
| 3322 | lg = ac->ac_lg; | 3358 | lg = ac->ac_lg; |
| 3323 | if (lg == NULL) | 3359 | if (lg == NULL) |
| 3324 | return 0; | 3360 | return 0; |
| 3361 | order = fls(ac->ac_o_ex.fe_len) - 1; | ||
| 3362 | if (order > PREALLOC_TB_SIZE - 1) | ||
| 3363 | /* The max size of hash table is PREALLOC_TB_SIZE */ | ||
| 3364 | order = PREALLOC_TB_SIZE - 1; | ||
| 3365 | |||
| 3366 | goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) + | ||
| 3367 | ac->ac_g_ex.fe_start + | ||
| 3368 | le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block); | ||
| 3369 | /* | ||
| 3370 | * search for the prealloc space with the | ||
| 3371 | * minimal distance from the goal block. | ||
| 3372 | */ | ||
| 3373 | for (i = order; i < PREALLOC_TB_SIZE; i++) { | ||
| 3374 | rcu_read_lock(); | ||
| 3375 | list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i], | ||
| 3376 | pa_inode_list) { | ||
| 3377 | spin_lock(&pa->pa_lock); | ||
| 3378 | if (pa->pa_deleted == 0 && | ||
| 3379 | pa->pa_free >= ac->ac_o_ex.fe_len) { | ||
| 3325 | 3380 | ||
| 3326 | rcu_read_lock(); | 3381 | cpa = ext4_mb_check_group_pa(goal_block, |
| 3327 | list_for_each_entry_rcu(pa, &lg->lg_prealloc_list, pa_inode_list) { | 3382 | pa, cpa); |
| 3328 | spin_lock(&pa->pa_lock); | 3383 | } |
| 3329 | if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) { | ||
| 3330 | atomic_inc(&pa->pa_count); | ||
| 3331 | ext4_mb_use_group_pa(ac, pa); | ||
| 3332 | spin_unlock(&pa->pa_lock); | 3384 | spin_unlock(&pa->pa_lock); |
| 3333 | ac->ac_criteria = 20; | ||
| 3334 | rcu_read_unlock(); | ||
| 3335 | return 1; | ||
| 3336 | } | 3385 | } |
| 3337 | spin_unlock(&pa->pa_lock); | 3386 | rcu_read_unlock(); |
| 3387 | } | ||
| 3388 | if (cpa) { | ||
| 3389 | ext4_mb_use_group_pa(ac, cpa); | ||
| 3390 | ac->ac_criteria = 20; | ||
| 3391 | return 1; | ||
| 3338 | } | 3392 | } |
| 3339 | rcu_read_unlock(); | ||
| 3340 | |||
| 3341 | return 0; | 3393 | return 0; |
| 3342 | } | 3394 | } |
| 3343 | 3395 | ||
| @@ -3560,6 +3612,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) | |||
| 3560 | pa->pa_free = pa->pa_len; | 3612 | pa->pa_free = pa->pa_len; |
| 3561 | atomic_set(&pa->pa_count, 1); | 3613 | atomic_set(&pa->pa_count, 1); |
| 3562 | spin_lock_init(&pa->pa_lock); | 3614 | spin_lock_init(&pa->pa_lock); |
| 3615 | INIT_LIST_HEAD(&pa->pa_inode_list); | ||
| 3563 | pa->pa_deleted = 0; | 3616 | pa->pa_deleted = 0; |
| 3564 | pa->pa_linear = 1; | 3617 | pa->pa_linear = 1; |
| 3565 | 3618 | ||
| @@ -3580,10 +3633,10 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) | |||
| 3580 | list_add(&pa->pa_group_list, &grp->bb_prealloc_list); | 3633 | list_add(&pa->pa_group_list, &grp->bb_prealloc_list); |
| 3581 | ext4_unlock_group(sb, ac->ac_b_ex.fe_group); | 3634 | ext4_unlock_group(sb, ac->ac_b_ex.fe_group); |
| 3582 | 3635 | ||
| 3583 | spin_lock(pa->pa_obj_lock); | 3636 | /* |
| 3584 | list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list); | 3637 | * We will later add the new pa to the right bucket |
| 3585 | spin_unlock(pa->pa_obj_lock); | 3638 | * after updating the pa_free in ext4_mb_release_context |
| 3586 | 3639 | */ | |
| 3587 | return 0; | 3640 | return 0; |
| 3588 | } | 3641 | } |
| 3589 | 3642 | ||
| @@ -3733,20 +3786,23 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, | |||
| 3733 | 3786 | ||
| 3734 | bitmap_bh = ext4_read_block_bitmap(sb, group); | 3787 | bitmap_bh = ext4_read_block_bitmap(sb, group); |
| 3735 | if (bitmap_bh == NULL) { | 3788 | if (bitmap_bh == NULL) { |
| 3736 | /* error handling here */ | 3789 | ext4_error(sb, __func__, "Error in reading block " |
| 3737 | ext4_mb_release_desc(&e4b); | 3790 | "bitmap for %lu\n", group); |
| 3738 | BUG_ON(bitmap_bh == NULL); | 3791 | return 0; |
| 3739 | } | 3792 | } |
| 3740 | 3793 | ||
| 3741 | err = ext4_mb_load_buddy(sb, group, &e4b); | 3794 | err = ext4_mb_load_buddy(sb, group, &e4b); |
| 3742 | BUG_ON(err != 0); /* error handling here */ | 3795 | if (err) { |
| 3796 | ext4_error(sb, __func__, "Error in loading buddy " | ||
| 3797 | "information for %lu\n", group); | ||
| 3798 | put_bh(bitmap_bh); | ||
| 3799 | return 0; | ||
| 3800 | } | ||
| 3743 | 3801 | ||
| 3744 | if (needed == 0) | 3802 | if (needed == 0) |
| 3745 | needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; | 3803 | needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; |
| 3746 | 3804 | ||
| 3747 | grp = ext4_get_group_info(sb, group); | ||
| 3748 | INIT_LIST_HEAD(&list); | 3805 | INIT_LIST_HEAD(&list); |
| 3749 | |||
| 3750 | ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); | 3806 | ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); |
| 3751 | repeat: | 3807 | repeat: |
| 3752 | ext4_lock_group(sb, group); | 3808 | ext4_lock_group(sb, group); |
| @@ -3903,13 +3959,18 @@ repeat: | |||
| 3903 | ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); | 3959 | ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); |
| 3904 | 3960 | ||
| 3905 | err = ext4_mb_load_buddy(sb, group, &e4b); | 3961 | err = ext4_mb_load_buddy(sb, group, &e4b); |
| 3906 | BUG_ON(err != 0); /* error handling here */ | 3962 | if (err) { |
| 3963 | ext4_error(sb, __func__, "Error in loading buddy " | ||
| 3964 | "information for %lu\n", group); | ||
| 3965 | continue; | ||
| 3966 | } | ||
| 3907 | 3967 | ||
| 3908 | bitmap_bh = ext4_read_block_bitmap(sb, group); | 3968 | bitmap_bh = ext4_read_block_bitmap(sb, group); |
| 3909 | if (bitmap_bh == NULL) { | 3969 | if (bitmap_bh == NULL) { |
| 3910 | /* error handling here */ | 3970 | ext4_error(sb, __func__, "Error in reading block " |
| 3971 | "bitmap for %lu\n", group); | ||
| 3911 | ext4_mb_release_desc(&e4b); | 3972 | ext4_mb_release_desc(&e4b); |
| 3912 | BUG_ON(bitmap_bh == NULL); | 3973 | continue; |
| 3913 | } | 3974 | } |
| 3914 | 3975 | ||
| 3915 | ext4_lock_group(sb, group); | 3976 | ext4_lock_group(sb, group); |
| @@ -4112,22 +4173,168 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, | |||
| 4112 | 4173 | ||
| 4113 | } | 4174 | } |
| 4114 | 4175 | ||
| 4176 | static noinline_for_stack void | ||
| 4177 | ext4_mb_discard_lg_preallocations(struct super_block *sb, | ||
| 4178 | struct ext4_locality_group *lg, | ||
| 4179 | int order, int total_entries) | ||
| 4180 | { | ||
| 4181 | ext4_group_t group = 0; | ||
| 4182 | struct ext4_buddy e4b; | ||
| 4183 | struct list_head discard_list; | ||
| 4184 | struct ext4_prealloc_space *pa, *tmp; | ||
| 4185 | struct ext4_allocation_context *ac; | ||
| 4186 | |||
| 4187 | mb_debug("discard locality group preallocation\n"); | ||
| 4188 | |||
| 4189 | INIT_LIST_HEAD(&discard_list); | ||
| 4190 | ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); | ||
| 4191 | |||
| 4192 | spin_lock(&lg->lg_prealloc_lock); | ||
| 4193 | list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], | ||
| 4194 | pa_inode_list) { | ||
| 4195 | spin_lock(&pa->pa_lock); | ||
| 4196 | if (atomic_read(&pa->pa_count)) { | ||
| 4197 | /* | ||
| 4198 | * This is the pa that we just used | ||
| 4199 | * for block allocation. So don't | ||
| 4200 | * free that | ||
| 4201 | */ | ||
| 4202 | spin_unlock(&pa->pa_lock); | ||
| 4203 | continue; | ||
| 4204 | } | ||
| 4205 | if (pa->pa_deleted) { | ||
| 4206 | spin_unlock(&pa->pa_lock); | ||
| 4207 | continue; | ||
| 4208 | } | ||
| 4209 | /* only lg prealloc space */ | ||
| 4210 | BUG_ON(!pa->pa_linear); | ||
| 4211 | |||
| 4212 | /* seems this one can be freed ... */ | ||
| 4213 | pa->pa_deleted = 1; | ||
| 4214 | spin_unlock(&pa->pa_lock); | ||
| 4215 | |||
| 4216 | list_del_rcu(&pa->pa_inode_list); | ||
| 4217 | list_add(&pa->u.pa_tmp_list, &discard_list); | ||
| 4218 | |||
| 4219 | total_entries--; | ||
| 4220 | if (total_entries <= 5) { | ||
| 4221 | /* | ||
| 4222 | * we want to keep only 5 entries | ||
| 4223 | * allowing it to grow to 8. This | ||
| 4224 | * makes sure we don't call discard | ||
| 4225 | * too soon for this list. | ||
| 4226 | */ | ||
| 4227 | break; | ||
| 4228 | } | ||
| 4229 | } | ||
| 4230 | spin_unlock(&lg->lg_prealloc_lock); | ||
| 4231 | |||
| 4232 | list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { | ||
| 4233 | |||
| 4234 | ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); | ||
| 4235 | if (ext4_mb_load_buddy(sb, group, &e4b)) { | ||
| 4236 | ext4_error(sb, __func__, "Error in loading buddy " | ||
| 4237 | "information for %lu\n", group); | ||
| 4238 | continue; | ||
| 4239 | } | ||
| 4240 | ext4_lock_group(sb, group); | ||
| 4241 | list_del(&pa->pa_group_list); | ||
| 4242 | ext4_mb_release_group_pa(&e4b, pa, ac); | ||
| 4243 | ext4_unlock_group(sb, group); | ||
| 4244 | |||
| 4245 | ext4_mb_release_desc(&e4b); | ||
| 4246 | list_del(&pa->u.pa_tmp_list); | ||
| 4247 | call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); | ||
| 4248 | } | ||
| 4249 | if (ac) | ||
| 4250 | kmem_cache_free(ext4_ac_cachep, ac); | ||
| 4251 | } | ||
| 4252 | |||
| 4253 | /* | ||
| 4254 | * We have incremented pa_count. So it cannot be freed at this | ||
| 4255 | * point. Also we hold lg_mutex. So no parallel allocation is | ||
| 4256 | * possible from this lg. That means pa_free cannot be updated. | ||
| 4257 | * | ||
| 4258 | * A parallel ext4_mb_discard_group_preallocations is possible, | ||
| 4259 | * which can cause the lg_prealloc_list to be updated. | ||
| 4260 | */ | ||
| 4261 | |||
| 4262 | static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) | ||
| 4263 | { | ||
| 4264 | int order, added = 0, lg_prealloc_count = 1; | ||
| 4265 | struct super_block *sb = ac->ac_sb; | ||
| 4266 | struct ext4_locality_group *lg = ac->ac_lg; | ||
| 4267 | struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa; | ||
| 4268 | |||
| 4269 | order = fls(pa->pa_free) - 1; | ||
| 4270 | if (order > PREALLOC_TB_SIZE - 1) | ||
| 4271 | /* The max size of hash table is PREALLOC_TB_SIZE */ | ||
| 4272 | order = PREALLOC_TB_SIZE - 1; | ||
| 4273 | /* Add the prealloc space to lg */ | ||
| 4274 | rcu_read_lock(); | ||
| 4275 | list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], | ||
| 4276 | pa_inode_list) { | ||
| 4277 | spin_lock(&tmp_pa->pa_lock); | ||
| 4278 | if (tmp_pa->pa_deleted) { | ||
| 4279 | spin_unlock(&tmp_pa->pa_lock); | ||
| 4280 | continue; | ||
| 4281 | } | ||
| 4282 | if (!added && pa->pa_free < tmp_pa->pa_free) { | ||
| 4283 | /* Add to the tail of the previous entry */ | ||
| 4284 | list_add_tail_rcu(&pa->pa_inode_list, | ||
| 4285 | &tmp_pa->pa_inode_list); | ||
| 4286 | added = 1; | ||
| 4287 | /* | ||
| 4288 | * we want to count the total | ||
| 4289 | * number of entries in the list | ||
| 4290 | */ | ||
| 4291 | } | ||
| 4292 | spin_unlock(&tmp_pa->pa_lock); | ||
| 4293 | lg_prealloc_count++; | ||
| 4294 | } | ||
| 4295 | if (!added) | ||
| 4296 | list_add_tail_rcu(&pa->pa_inode_list, | ||
| 4297 | &lg->lg_prealloc_list[order]); | ||
| 4298 | rcu_read_unlock(); | ||
| 4299 | |||
| 4300 | /* Now trim the list to be not more than 8 elements */ | ||
| 4301 | if (lg_prealloc_count > 8) { | ||
| 4302 | ext4_mb_discard_lg_preallocations(sb, lg, | ||
| 4303 | order, lg_prealloc_count); | ||
| 4304 | return; | ||
| 4305 | } | ||
| 4306 | return ; | ||
| 4307 | } | ||
| 4308 | |||
| 4115 | /* | 4309 | /* |
| 4116 | * release all resource we used in allocation | 4310 | * release all resource we used in allocation |
| 4117 | */ | 4311 | */ |
| 4118 | static int ext4_mb_release_context(struct ext4_allocation_context *ac) | 4312 | static int ext4_mb_release_context(struct ext4_allocation_context *ac) |
| 4119 | { | 4313 | { |
| 4120 | if (ac->ac_pa) { | 4314 | struct ext4_prealloc_space *pa = ac->ac_pa; |
| 4121 | if (ac->ac_pa->pa_linear) { | 4315 | if (pa) { |
| 4316 | if (pa->pa_linear) { | ||
| 4122 | /* see comment in ext4_mb_use_group_pa() */ | 4317 | /* see comment in ext4_mb_use_group_pa() */ |
| 4123 | spin_lock(&ac->ac_pa->pa_lock); | 4318 | spin_lock(&pa->pa_lock); |
| 4124 | ac->ac_pa->pa_pstart += ac->ac_b_ex.fe_len; | 4319 | pa->pa_pstart += ac->ac_b_ex.fe_len; |
| 4125 | ac->ac_pa->pa_lstart += ac->ac_b_ex.fe_len; | 4320 | pa->pa_lstart += ac->ac_b_ex.fe_len; |
| 4126 | ac->ac_pa->pa_free -= ac->ac_b_ex.fe_len; | 4321 | pa->pa_free -= ac->ac_b_ex.fe_len; |
| 4127 | ac->ac_pa->pa_len -= ac->ac_b_ex.fe_len; | 4322 | pa->pa_len -= ac->ac_b_ex.fe_len; |
| 4128 | spin_unlock(&ac->ac_pa->pa_lock); | 4323 | spin_unlock(&pa->pa_lock); |
| 4324 | /* | ||
| 4325 | * We want to add the pa to the right bucket. | ||
| 4326 | * Remove it from the list and while adding | ||
| 4327 | * make sure the list to which we are adding | ||
| 4328 | * doesn't grow big. | ||
| 4329 | */ | ||
| 4330 | if (likely(pa->pa_free)) { | ||
| 4331 | spin_lock(pa->pa_obj_lock); | ||
| 4332 | list_del_rcu(&pa->pa_inode_list); | ||
| 4333 | spin_unlock(pa->pa_obj_lock); | ||
| 4334 | ext4_mb_add_n_trim(ac); | ||
| 4335 | } | ||
| 4129 | } | 4336 | } |
| 4130 | ext4_mb_put_pa(ac, ac->ac_sb, ac->ac_pa); | 4337 | ext4_mb_put_pa(ac, ac->ac_sb, pa); |
| 4131 | } | 4338 | } |
| 4132 | if (ac->ac_bitmap_page) | 4339 | if (ac->ac_bitmap_page) |
| 4133 | page_cache_release(ac->ac_bitmap_page); | 4340 | page_cache_release(ac->ac_bitmap_page); |
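After a group pa is used, ext4_mb_release_context() above re-inserts it into the bucket matching its new pa_free, and ext4_mb_add_n_trim() lets a bucket grow to 8 entries before a trim pass cuts it back toward 5. A toy model of that length policy; the 8 and 5 come from the code above, the rest is illustrative:

#include <stdio.h>

#define LG_MAX_BEFORE_TRIM	8	/* threshold checked in ext4_mb_add_n_trim() */
#define LG_KEEP_AFTER_TRIM	5	/* rough target left by the discard pass */

int main(void)
{
	int entries = 0;

	for (int added = 1; added <= 12; added++) {
		entries++;				/* one pa re-added after use */
		if (entries > LG_MAX_BEFORE_TRIM)
			entries = LG_KEEP_AFTER_TRIM;	/* discard pass trims the bucket */
		printf("after add %2d: %d entries in bucket\n", added, entries);
	}
	return 0;
}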
| @@ -4420,11 +4627,15 @@ do_more: | |||
| 4420 | count -= overflow; | 4627 | count -= overflow; |
| 4421 | } | 4628 | } |
| 4422 | bitmap_bh = ext4_read_block_bitmap(sb, block_group); | 4629 | bitmap_bh = ext4_read_block_bitmap(sb, block_group); |
| 4423 | if (!bitmap_bh) | 4630 | if (!bitmap_bh) { |
| 4631 | err = -EIO; | ||
| 4424 | goto error_return; | 4632 | goto error_return; |
| 4633 | } | ||
| 4425 | gdp = ext4_get_group_desc(sb, block_group, &gd_bh); | 4634 | gdp = ext4_get_group_desc(sb, block_group, &gd_bh); |
| 4426 | if (!gdp) | 4635 | if (!gdp) { |
| 4636 | err = -EIO; | ||
| 4427 | goto error_return; | 4637 | goto error_return; |
| 4638 | } | ||
| 4428 | 4639 | ||
| 4429 | if (in_range(ext4_block_bitmap(sb, gdp), block, count) || | 4640 | if (in_range(ext4_block_bitmap(sb, gdp), block, count) || |
| 4430 | in_range(ext4_inode_bitmap(sb, gdp), block, count) || | 4641 | in_range(ext4_inode_bitmap(sb, gdp), block, count) || |
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index bfe6add46bcf..c7c9906c2a75 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h | |||
| @@ -164,11 +164,17 @@ struct ext4_free_extent { | |||
| 164 | * Locality group: | 164 | * Locality group: |
| 165 | * we try to group all related changes together | 165 | * we try to group all related changes together |
| 166 | * so that writeback can flush/allocate them together as well | 166 | * so that writeback can flush/allocate them together as well |
| 167 | * Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC | ||
| 168 | * (512). We store prealloc space into the hash based on the pa_free blocks | ||
| 169 | * order value, i.e., fls(pa_free)-1. | ||
| 167 | */ | 170 | */ |
| 171 | #define PREALLOC_TB_SIZE 10 | ||
| 168 | struct ext4_locality_group { | 172 | struct ext4_locality_group { |
| 169 | /* for allocator */ | 173 | /* for allocator */ |
| 170 | struct mutex lg_mutex; /* to serialize allocates */ | 174 | /* to serialize allocates */ |
| 171 | struct list_head lg_prealloc_list;/* list of preallocations */ | 175 | struct mutex lg_mutex; |
| 176 | /* list of preallocations */ | ||
| 177 | struct list_head lg_prealloc_list[PREALLOC_TB_SIZE]; | ||
| 172 | spinlock_t lg_prealloc_lock; | 178 | spinlock_t lg_prealloc_lock; |
| 173 | }; | 179 | }; |
| 174 | 180 | ||
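Per the comment added above, a preallocation lands in bucket fls(pa_free) - 1, capped at PREALLOC_TB_SIZE - 1. A small sketch of the bucket index for a few sample sizes, using a plain C bit-scan instead of the kernel fls():

#include <stdio.h>

#define PREALLOC_TB_SIZE 10

static int lg_bucket(unsigned int pa_free)
{
	int order = -1;

	while (pa_free) {	/* highest set bit, i.e. fls(pa_free) - 1 */
		pa_free >>= 1;
		order++;
	}
	return order > PREALLOC_TB_SIZE - 1 ? PREALLOC_TB_SIZE - 1 : order;
}

int main(void)
{
	unsigned int sizes[] = { 1, 7, 8, 100, 512, 4096 };

	for (unsigned int i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("pa_free = %4u -> bucket %d\n", sizes[i], lg_bucket(sizes[i]));
	return 0;
}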
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index b9e077ba07e9..46fc0b5b12ba 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c | |||
| @@ -53,7 +53,8 @@ static int finish_range(handle_t *handle, struct inode *inode, | |||
| 53 | * credit. But below we try to not accumalate too much | 53 | * credit. But below we try to not accumalate too much |
| 54 | * of them by restarting the journal. | 54 | * of them by restarting the journal. |
| 55 | */ | 55 | */ |
| 56 | needed = ext4_ext_calc_credits_for_insert(inode, path); | 56 | needed = ext4_ext_calc_credits_for_single_extent(inode, |
| 57 | lb->last_block - lb->first_block + 1, path); | ||
| 57 | 58 | ||
| 58 | /* | 59 | /* |
| 59 | * Make sure the credit we accumalated is not really high | 60 | * Make sure the credit we accumalated is not really high |
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index f000fbe2cd93..b3d35604ea18 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c | |||
| @@ -73,7 +73,7 @@ static int verify_group_input(struct super_block *sb, | |||
| 73 | "Inode bitmap not in group (block %llu)", | 73 | "Inode bitmap not in group (block %llu)", |
| 74 | (unsigned long long)input->inode_bitmap); | 74 | (unsigned long long)input->inode_bitmap); |
| 75 | else if (outside(input->inode_table, start, end) || | 75 | else if (outside(input->inode_table, start, end) || |
| 76 | outside(itend - 1, start, end)) | 76 | outside(itend - 1, start, end)) |
| 77 | ext4_warning(sb, __func__, | 77 | ext4_warning(sb, __func__, |
| 78 | "Inode table not in group (blocks %llu-%llu)", | 78 | "Inode table not in group (blocks %llu-%llu)", |
| 79 | (unsigned long long)input->inode_table, itend - 1); | 79 | (unsigned long long)input->inode_table, itend - 1); |
| @@ -104,7 +104,7 @@ static int verify_group_input(struct super_block *sb, | |||
| 104 | (unsigned long long)input->inode_bitmap, | 104 | (unsigned long long)input->inode_bitmap, |
| 105 | start, metaend - 1); | 105 | start, metaend - 1); |
| 106 | else if (inside(input->inode_table, start, metaend) || | 106 | else if (inside(input->inode_table, start, metaend) || |
| 107 | inside(itend - 1, start, metaend)) | 107 | inside(itend - 1, start, metaend)) |
| 108 | ext4_warning(sb, __func__, | 108 | ext4_warning(sb, __func__, |
| 109 | "Inode table (%llu-%llu) overlaps" | 109 | "Inode table (%llu-%llu) overlaps" |
| 110 | "GDT table (%llu-%llu)", | 110 | "GDT table (%llu-%llu)", |
| @@ -158,9 +158,9 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh, | |||
| 158 | if (err) { | 158 | if (err) { |
| 159 | if ((err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) | 159 | if ((err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) |
| 160 | return err; | 160 | return err; |
| 161 | if ((err = ext4_journal_get_write_access(handle, bh))) | 161 | if ((err = ext4_journal_get_write_access(handle, bh))) |
| 162 | return err; | 162 | return err; |
| 163 | } | 163 | } |
| 164 | 164 | ||
| 165 | return 0; | 165 | return 0; |
| 166 | } | 166 | } |
| @@ -416,11 +416,11 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, | |||
| 416 | "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n", | 416 | "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n", |
| 417 | gdb_num); | 417 | gdb_num); |
| 418 | 418 | ||
| 419 | /* | 419 | /* |
| 420 | * If we are not using the primary superblock/GDT copy don't resize, | 420 | * If we are not using the primary superblock/GDT copy don't resize, |
| 421 | * because the user tools have no way of handling this. Probably a | 421 | * because the user tools have no way of handling this. Probably a |
| 422 | * bad time to do it anyways. | 422 | * bad time to do it anyways. |
| 423 | */ | 423 | */ |
| 424 | if (EXT4_SB(sb)->s_sbh->b_blocknr != | 424 | if (EXT4_SB(sb)->s_sbh->b_blocknr != |
| 425 | le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { | 425 | le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { |
| 426 | ext4_warning(sb, __func__, | 426 | ext4_warning(sb, __func__, |
| @@ -507,14 +507,14 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, | |||
| 507 | return 0; | 507 | return 0; |
| 508 | 508 | ||
| 509 | exit_inode: | 509 | exit_inode: |
| 510 | //ext4_journal_release_buffer(handle, iloc.bh); | 510 | /* ext4_journal_release_buffer(handle, iloc.bh); */ |
| 511 | brelse(iloc.bh); | 511 | brelse(iloc.bh); |
| 512 | exit_dindj: | 512 | exit_dindj: |
| 513 | //ext4_journal_release_buffer(handle, dind); | 513 | /* ext4_journal_release_buffer(handle, dind); */ |
| 514 | exit_primary: | 514 | exit_primary: |
| 515 | //ext4_journal_release_buffer(handle, *primary); | 515 | /* ext4_journal_release_buffer(handle, *primary); */ |
| 516 | exit_sbh: | 516 | exit_sbh: |
| 517 | //ext4_journal_release_buffer(handle, *primary); | 517 | /* ext4_journal_release_buffer(handle, *primary); */ |
| 518 | exit_dind: | 518 | exit_dind: |
| 519 | brelse(dind); | 519 | brelse(dind); |
| 520 | exit_bh: | 520 | exit_bh: |
| @@ -773,7 +773,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
| 773 | 773 | ||
| 774 | if (reserved_gdb || gdb_off == 0) { | 774 | if (reserved_gdb || gdb_off == 0) { |
| 775 | if (!EXT4_HAS_COMPAT_FEATURE(sb, | 775 | if (!EXT4_HAS_COMPAT_FEATURE(sb, |
| 776 | EXT4_FEATURE_COMPAT_RESIZE_INODE)){ | 776 | EXT4_FEATURE_COMPAT_RESIZE_INODE) |
| 777 | || !le16_to_cpu(es->s_reserved_gdt_blocks)) { | ||
| 777 | ext4_warning(sb, __func__, | 778 | ext4_warning(sb, __func__, |
| 778 | "No reserved GDT blocks, can't resize"); | 779 | "No reserved GDT blocks, can't resize"); |
| 779 | return -EPERM; | 780 | return -EPERM; |
| @@ -818,12 +819,12 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
| 818 | if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh))) | 819 | if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh))) |
| 819 | goto exit_journal; | 820 | goto exit_journal; |
| 820 | 821 | ||
| 821 | /* | 822 | /* |
| 822 | * We will only either add reserved group blocks to a backup group | 823 | * We will only either add reserved group blocks to a backup group |
| 823 | * or remove reserved blocks for the first group in a new group block. | 824 | * or remove reserved blocks for the first group in a new group block. |
| 824 | * Doing both would be mean more complex code, and sane people don't | 825 | * Doing both would be mean more complex code, and sane people don't |
| 825 | * use non-sparse filesystems anymore. This is already checked above. | 826 | * use non-sparse filesystems anymore. This is already checked above. |
| 826 | */ | 827 | */ |
| 827 | if (gdb_off) { | 828 | if (gdb_off) { |
| 828 | primary = sbi->s_group_desc[gdb_num]; | 829 | primary = sbi->s_group_desc[gdb_num]; |
| 829 | if ((err = ext4_journal_get_write_access(handle, primary))) | 830 | if ((err = ext4_journal_get_write_access(handle, primary))) |
| @@ -835,24 +836,24 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
| 835 | } else if ((err = add_new_gdb(handle, inode, input, &primary))) | 836 | } else if ((err = add_new_gdb(handle, inode, input, &primary))) |
| 836 | goto exit_journal; | 837 | goto exit_journal; |
| 837 | 838 | ||
| 838 | /* | 839 | /* |
| 839 | * OK, now we've set up the new group. Time to make it active. | 840 | * OK, now we've set up the new group. Time to make it active. |
| 840 | * | 841 | * |
| 841 | * Current kernels don't lock all allocations via lock_super(), | 842 | * Current kernels don't lock all allocations via lock_super(), |
| 842 | * so we have to be safe wrt. concurrent accesses the group | 843 | * so we have to be safe wrt. concurrent accesses the group |
| 843 | * data. So we need to be careful to set all of the relevant | 844 | * data. So we need to be careful to set all of the relevant |
| 844 | * group descriptor data etc. *before* we enable the group. | 845 | * group descriptor data etc. *before* we enable the group. |
| 845 | * | 846 | * |
| 846 | * The key field here is sbi->s_groups_count: as long as | 847 | * The key field here is sbi->s_groups_count: as long as |
| 847 | * that retains its old value, nobody is going to access the new | 848 | * that retains its old value, nobody is going to access the new |
| 848 | * group. | 849 | * group. |
| 849 | * | 850 | * |
| 850 | * So first we update all the descriptor metadata for the new | 851 | * So first we update all the descriptor metadata for the new |
| 851 | * group; then we update the total disk blocks count; then we | 852 | * group; then we update the total disk blocks count; then we |
| 852 | * update the groups count to enable the group; then finally we | 853 | * update the groups count to enable the group; then finally we |
| 853 | * update the free space counts so that the system can start | 854 | * update the free space counts so that the system can start |
| 854 | * using the new disk blocks. | 855 | * using the new disk blocks. |
| 855 | */ | 856 | */ |
| 856 | 857 | ||
| 857 | /* Update group descriptor block for new group */ | 858 | /* Update group descriptor block for new group */ |
| 858 | gdp = (struct ext4_group_desc *)((char *)primary->b_data + | 859 | gdp = (struct ext4_group_desc *)((char *)primary->b_data + |
| @@ -946,7 +947,8 @@ exit_put: | |||
| 946 | return err; | 947 | return err; |
| 947 | } /* ext4_group_add */ | 948 | } /* ext4_group_add */ |
| 948 | 949 | ||
| 949 | /* Extend the filesystem to the new number of blocks specified. This entry | 950 | /* |
| 951 | * Extend the filesystem to the new number of blocks specified. This entry | ||
| 950 | * point is only used to extend the current filesystem to the end of the last | 952 | * point is only used to extend the current filesystem to the end of the last |
| 951 | * existing group. It can be accessed via ioctl, or by "remount,resize=<size>" | 953 | * existing group. It can be accessed via ioctl, or by "remount,resize=<size>" |
| 952 | * for emergencies (because it has no dependencies on reserved blocks). | 954 | * for emergencies (because it has no dependencies on reserved blocks). |
| @@ -1024,7 +1026,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, | |||
| 1024 | o_blocks_count + add, add); | 1026 | o_blocks_count + add, add); |
| 1025 | 1027 | ||
| 1026 | /* See if the device is actually as big as what was requested */ | 1028 | /* See if the device is actually as big as what was requested */ |
| 1027 | bh = sb_bread(sb, o_blocks_count + add -1); | 1029 | bh = sb_bread(sb, o_blocks_count + add - 1); |
| 1028 | if (!bh) { | 1030 | if (!bh) { |
| 1029 | ext4_warning(sb, __func__, | 1031 | ext4_warning(sb, __func__, |
| 1030 | "can't read last block, resize aborted"); | 1032 | "can't read last block, resize aborted"); |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1cb371dcd609..566344b926b7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
| @@ -49,20 +49,19 @@ static int ext4_load_journal(struct super_block *, struct ext4_super_block *, | |||
| 49 | unsigned long journal_devnum); | 49 | unsigned long journal_devnum); |
| 50 | static int ext4_create_journal(struct super_block *, struct ext4_super_block *, | 50 | static int ext4_create_journal(struct super_block *, struct ext4_super_block *, |
| 51 | unsigned int); | 51 | unsigned int); |
| 52 | static void ext4_commit_super (struct super_block * sb, | 52 | static void ext4_commit_super(struct super_block *sb, |
| 53 | struct ext4_super_block * es, | 53 | struct ext4_super_block *es, int sync); |
| 54 | int sync); | 54 | static void ext4_mark_recovery_complete(struct super_block *sb, |
| 55 | static void ext4_mark_recovery_complete(struct super_block * sb, | 55 | struct ext4_super_block *es); |
| 56 | struct ext4_super_block * es); | 56 | static void ext4_clear_journal_err(struct super_block *sb, |
| 57 | static void ext4_clear_journal_err(struct super_block * sb, | 57 | struct ext4_super_block *es); |
| 58 | struct ext4_super_block * es); | ||
| 59 | static int ext4_sync_fs(struct super_block *sb, int wait); | 58 | static int ext4_sync_fs(struct super_block *sb, int wait); |
| 60 | static const char *ext4_decode_error(struct super_block * sb, int errno, | 59 | static const char *ext4_decode_error(struct super_block *sb, int errno, |
| 61 | char nbuf[16]); | 60 | char nbuf[16]); |
| 62 | static int ext4_remount (struct super_block * sb, int * flags, char * data); | 61 | static int ext4_remount(struct super_block *sb, int *flags, char *data); |
| 63 | static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf); | 62 | static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); |
| 64 | static void ext4_unlockfs(struct super_block *sb); | 63 | static void ext4_unlockfs(struct super_block *sb); |
| 65 | static void ext4_write_super (struct super_block * sb); | 64 | static void ext4_write_super(struct super_block *sb); |
| 66 | static void ext4_write_super_lockfs(struct super_block *sb); | 65 | static void ext4_write_super_lockfs(struct super_block *sb); |
| 67 | 66 | ||
| 68 | 67 | ||
| @@ -211,15 +210,15 @@ static void ext4_handle_error(struct super_block *sb) | |||
| 211 | if (sb->s_flags & MS_RDONLY) | 210 | if (sb->s_flags & MS_RDONLY) |
| 212 | return; | 211 | return; |
| 213 | 212 | ||
| 214 | if (!test_opt (sb, ERRORS_CONT)) { | 213 | if (!test_opt(sb, ERRORS_CONT)) { |
| 215 | journal_t *journal = EXT4_SB(sb)->s_journal; | 214 | journal_t *journal = EXT4_SB(sb)->s_journal; |
| 216 | 215 | ||
| 217 | EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT; | 216 | EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT; |
| 218 | if (journal) | 217 | if (journal) |
| 219 | jbd2_journal_abort(journal, -EIO); | 218 | jbd2_journal_abort(journal, -EIO); |
| 220 | } | 219 | } |
| 221 | if (test_opt (sb, ERRORS_RO)) { | 220 | if (test_opt(sb, ERRORS_RO)) { |
| 222 | printk (KERN_CRIT "Remounting filesystem read-only\n"); | 221 | printk(KERN_CRIT "Remounting filesystem read-only\n"); |
| 223 | sb->s_flags |= MS_RDONLY; | 222 | sb->s_flags |= MS_RDONLY; |
| 224 | } | 223 | } |
| 225 | ext4_commit_super(sb, es, 1); | 224 | ext4_commit_super(sb, es, 1); |
| @@ -228,13 +227,13 @@ static void ext4_handle_error(struct super_block *sb) | |||
| 228 | sb->s_id); | 227 | sb->s_id); |
| 229 | } | 228 | } |
| 230 | 229 | ||
| 231 | void ext4_error (struct super_block * sb, const char * function, | 230 | void ext4_error(struct super_block *sb, const char *function, |
| 232 | const char * fmt, ...) | 231 | const char *fmt, ...) |
| 233 | { | 232 | { |
| 234 | va_list args; | 233 | va_list args; |
| 235 | 234 | ||
| 236 | va_start(args, fmt); | 235 | va_start(args, fmt); |
| 237 | printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function); | 236 | printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); |
| 238 | vprintk(fmt, args); | 237 | vprintk(fmt, args); |
| 239 | printk("\n"); | 238 | printk("\n"); |
| 240 | va_end(args); | 239 | va_end(args); |
| @@ -242,7 +241,7 @@ void ext4_error (struct super_block * sb, const char * function, | |||
| 242 | ext4_handle_error(sb); | 241 | ext4_handle_error(sb); |
| 243 | } | 242 | } |
| 244 | 243 | ||
| 245 | static const char *ext4_decode_error(struct super_block * sb, int errno, | 244 | static const char *ext4_decode_error(struct super_block *sb, int errno, |
| 246 | char nbuf[16]) | 245 | char nbuf[16]) |
| 247 | { | 246 | { |
| 248 | char *errstr = NULL; | 247 | char *errstr = NULL; |
| @@ -278,8 +277,7 @@ static const char *ext4_decode_error(struct super_block * sb, int errno, | |||
| 278 | /* __ext4_std_error decodes expected errors from journaling functions | 277 | /* __ext4_std_error decodes expected errors from journaling functions |
| 279 | * automatically and invokes the appropriate error response. */ | 278 | * automatically and invokes the appropriate error response. */ |
| 280 | 279 | ||
| 281 | void __ext4_std_error (struct super_block * sb, const char * function, | 280 | void __ext4_std_error(struct super_block *sb, const char *function, int errno) |
| 282 | int errno) | ||
| 283 | { | 281 | { |
| 284 | char nbuf[16]; | 282 | char nbuf[16]; |
| 285 | const char *errstr; | 283 | const char *errstr; |
| @@ -292,8 +290,8 @@ void __ext4_std_error (struct super_block * sb, const char * function, | |||
| 292 | return; | 290 | return; |
| 293 | 291 | ||
| 294 | errstr = ext4_decode_error(sb, errno, nbuf); | 292 | errstr = ext4_decode_error(sb, errno, nbuf); |
| 295 | printk (KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n", | 293 | printk(KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n", |
| 296 | sb->s_id, function, errstr); | 294 | sb->s_id, function, errstr); |
| 297 | 295 | ||
| 298 | ext4_handle_error(sb); | 296 | ext4_handle_error(sb); |
| 299 | } | 297 | } |
| @@ -308,15 +306,15 @@ void __ext4_std_error (struct super_block * sb, const char * function, | |||
| 308 | * case we take the easy way out and panic immediately. | 306 | * case we take the easy way out and panic immediately. |
| 309 | */ | 307 | */ |
| 310 | 308 | ||
| 311 | void ext4_abort (struct super_block * sb, const char * function, | 309 | void ext4_abort(struct super_block *sb, const char *function, |
| 312 | const char * fmt, ...) | 310 | const char *fmt, ...) |
| 313 | { | 311 | { |
| 314 | va_list args; | 312 | va_list args; |
| 315 | 313 | ||
| 316 | printk (KERN_CRIT "ext4_abort called.\n"); | 314 | printk(KERN_CRIT "ext4_abort called.\n"); |
| 317 | 315 | ||
| 318 | va_start(args, fmt); | 316 | va_start(args, fmt); |
| 319 | printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function); | 317 | printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); |
| 320 | vprintk(fmt, args); | 318 | vprintk(fmt, args); |
| 321 | printk("\n"); | 319 | printk("\n"); |
| 322 | va_end(args); | 320 | va_end(args); |
| @@ -334,8 +332,8 @@ void ext4_abort (struct super_block * sb, const char * function, | |||
| 334 | jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); | 332 | jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); |
| 335 | } | 333 | } |
| 336 | 334 | ||
| 337 | void ext4_warning (struct super_block * sb, const char * function, | 335 | void ext4_warning(struct super_block *sb, const char *function, |
| 338 | const char * fmt, ...) | 336 | const char *fmt, ...) |
| 339 | { | 337 | { |
| 340 | va_list args; | 338 | va_list args; |
| 341 | 339 | ||
| @@ -496,7 +494,7 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) | |||
| 496 | } | 494 | } |
| 497 | } | 495 | } |
| 498 | 496 | ||
| 499 | static void ext4_put_super (struct super_block * sb) | 497 | static void ext4_put_super(struct super_block *sb) |
| 500 | { | 498 | { |
| 501 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 499 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
| 502 | struct ext4_super_block *es = sbi->s_es; | 500 | struct ext4_super_block *es = sbi->s_es; |
| @@ -570,6 +568,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) | |||
| 570 | #endif | 568 | #endif |
| 571 | ei->i_block_alloc_info = NULL; | 569 | ei->i_block_alloc_info = NULL; |
| 572 | ei->vfs_inode.i_version = 1; | 570 | ei->vfs_inode.i_version = 1; |
| 571 | ei->vfs_inode.i_data.writeback_index = 0; | ||
| 573 | memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); | 572 | memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); |
| 574 | INIT_LIST_HEAD(&ei->i_prealloc_list); | 573 | INIT_LIST_HEAD(&ei->i_prealloc_list); |
| 575 | spin_lock_init(&ei->i_prealloc_lock); | 574 | spin_lock_init(&ei->i_prealloc_lock); |
| @@ -595,7 +594,7 @@ static void ext4_destroy_inode(struct inode *inode) | |||
| 595 | kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); | 594 | kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); |
| 596 | } | 595 | } |
| 597 | 596 | ||
| 598 | static void init_once(struct kmem_cache *cachep, void *foo) | 597 | static void init_once(void *foo) |
| 599 | { | 598 | { |
| 600 | struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; | 599 | struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; |
| 601 | 600 | ||
| @@ -647,7 +646,8 @@ static void ext4_clear_inode(struct inode *inode) | |||
| 647 | &EXT4_I(inode)->jinode); | 646 | &EXT4_I(inode)->jinode); |
| 648 | } | 647 | } |
| 649 | 648 | ||
| 650 | static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb) | 649 | static inline void ext4_show_quota_options(struct seq_file *seq, |
| 650 | struct super_block *sb) | ||
| 651 | { | 651 | { |
| 652 | #if defined(CONFIG_QUOTA) | 652 | #if defined(CONFIG_QUOTA) |
| 653 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 653 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
| @@ -822,8 +822,8 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, | |||
| 822 | } | 822 | } |
| 823 | 823 | ||
| 824 | #ifdef CONFIG_QUOTA | 824 | #ifdef CONFIG_QUOTA |
| 825 | #define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") | 825 | #define QTYPE2NAME(t) ((t) == USRQUOTA?"user":"group") |
| 826 | #define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) | 826 | #define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) |
| 827 | 827 | ||
| 828 | static int ext4_dquot_initialize(struct inode *inode, int type); | 828 | static int ext4_dquot_initialize(struct inode *inode, int type); |
| 829 | static int ext4_dquot_drop(struct inode *inode); | 829 | static int ext4_dquot_drop(struct inode *inode); |
| @@ -991,12 +991,12 @@ static ext4_fsblk_t get_sb_block(void **data) | |||
| 991 | return sb_block; | 991 | return sb_block; |
| 992 | } | 992 | } |
| 993 | 993 | ||
| 994 | static int parse_options (char *options, struct super_block *sb, | 994 | static int parse_options(char *options, struct super_block *sb, |
| 995 | unsigned int *inum, unsigned long *journal_devnum, | 995 | unsigned int *inum, unsigned long *journal_devnum, |
| 996 | ext4_fsblk_t *n_blocks_count, int is_remount) | 996 | ext4_fsblk_t *n_blocks_count, int is_remount) |
| 997 | { | 997 | { |
| 998 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 998 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
| 999 | char * p; | 999 | char *p; |
| 1000 | substring_t args[MAX_OPT_ARGS]; | 1000 | substring_t args[MAX_OPT_ARGS]; |
| 1001 | int data_opt = 0; | 1001 | int data_opt = 0; |
| 1002 | int option; | 1002 | int option; |
| @@ -1009,7 +1009,7 @@ static int parse_options (char *options, struct super_block *sb, | |||
| 1009 | if (!options) | 1009 | if (!options) |
| 1010 | return 1; | 1010 | return 1; |
| 1011 | 1011 | ||
| 1012 | while ((p = strsep (&options, ",")) != NULL) { | 1012 | while ((p = strsep(&options, ",")) != NULL) { |
| 1013 | int token; | 1013 | int token; |
| 1014 | if (!*p) | 1014 | if (!*p) |
| 1015 | continue; | 1015 | continue; |
| @@ -1017,16 +1017,16 @@ static int parse_options (char *options, struct super_block *sb, | |||
| 1017 | token = match_token(p, tokens, args); | 1017 | token = match_token(p, tokens, args); |
| 1018 | switch (token) { | 1018 | switch (token) { |
| 1019 | case Opt_bsd_df: | 1019 | case Opt_bsd_df: |
| 1020 | clear_opt (sbi->s_mount_opt, MINIX_DF); | 1020 | clear_opt(sbi->s_mount_opt, MINIX_DF); |
| 1021 | break; | 1021 | break; |
| 1022 | case Opt_minix_df: | 1022 | case Opt_minix_df: |
| 1023 | set_opt (sbi->s_mount_opt, MINIX_DF); | 1023 | set_opt(sbi->s_mount_opt, MINIX_DF); |
| 1024 | break; | 1024 | break; |
| 1025 | case Opt_grpid: | 1025 | case Opt_grpid: |
| 1026 | set_opt (sbi->s_mount_opt, GRPID); | 1026 | set_opt(sbi->s_mount_opt, GRPID); |
| 1027 | break; | 1027 | break; |
| 1028 | case Opt_nogrpid: | 1028 | case Opt_nogrpid: |
| 1029 | clear_opt (sbi->s_mount_opt, GRPID); | 1029 | clear_opt(sbi->s_mount_opt, GRPID); |
| 1030 | break; | 1030 | break; |
| 1031 | case Opt_resuid: | 1031 | case Opt_resuid: |
| 1032 | if (match_int(&args[0], &option)) | 1032 | if (match_int(&args[0], &option)) |
| @@ -1043,41 +1043,41 @@ static int parse_options (char *options, struct super_block *sb, | |||
| 1043 | /* *sb_block = match_int(&args[0]); */ | 1043 | /* *sb_block = match_int(&args[0]); */ |
| 1044 | break; | 1044 | break; |
| 1045 | case Opt_err_panic: | 1045 | case Opt_err_panic: |
| 1046 | clear_opt (sbi->s_mount_opt, ERRORS_CONT); | 1046 | clear_opt(sbi->s_mount_opt, ERRORS_CONT); |
| 1047 | clear_opt (sbi->s_mount_opt, ERRORS_RO); | 1047 | clear_opt(sbi->s_mount_opt, ERRORS_RO); |
| 1048 | set_opt (sbi->s_mount_opt, ERRORS_PANIC); | 1048 | set_opt(sbi->s_mount_opt, ERRORS_PANIC); |
| 1049 | break; | 1049 | break; |
| 1050 | case Opt_err_ro: | 1050 | case Opt_err_ro: |
| 1051 | clear_opt (sbi->s_mount_opt, ERRORS_CONT); | 1051 | clear_opt(sbi->s_mount_opt, ERRORS_CONT); |
| 1052 | clear_opt (sbi->s_mount_opt, ERRORS_PANIC); | 1052 | clear_opt(sbi->s_mount_opt, ERRORS_PANIC); |
| 1053 | set_opt (sbi->s_mount_opt, ERRORS_RO); | 1053 | set_opt(sbi->s_mount_opt, ERRORS_RO); |
| 1054 | break; | 1054 | break; |
| 1055 | case Opt_err_cont: | 1055 | case Opt_err_cont: |
| 1056 | clear_opt (sbi->s_mount_opt, ERRORS_RO); | 1056 | clear_opt(sbi->s_mount_opt, ERRORS_RO); |
| 1057 | clear_opt (sbi->s_mount_opt, ERRORS_PANIC); | 1057 | clear_opt(sbi->s_mount_opt, ERRORS_PANIC); |
| 1058 | set_opt (sbi->s_mount_opt, ERRORS_CONT); | 1058 | set_opt(sbi->s_mount_opt, ERRORS_CONT); |
| 1059 | break; | 1059 | break; |
| 1060 | case Opt_nouid32: | 1060 | case Opt_nouid32: |
| 1061 | set_opt (sbi->s_mount_opt, NO_UID32); | 1061 | set_opt(sbi->s_mount_opt, NO_UID32); |
| 1062 | break; | 1062 | break; |
| 1063 | case Opt_nocheck: | 1063 | case Opt_nocheck: |
| 1064 | clear_opt (sbi->s_mount_opt, CHECK); | 1064 | clear_opt(sbi->s_mount_opt, CHECK); |
| 1065 | break; | 1065 | break; |
| 1066 | case Opt_debug: | 1066 | case Opt_debug: |
| 1067 | set_opt (sbi->s_mount_opt, DEBUG); | 1067 | set_opt(sbi->s_mount_opt, DEBUG); |
| 1068 | break; | 1068 | break; |
| 1069 | case Opt_oldalloc: | 1069 | case Opt_oldalloc: |
| 1070 | set_opt (sbi->s_mount_opt, OLDALLOC); | 1070 | set_opt(sbi->s_mount_opt, OLDALLOC); |
| 1071 | break; | 1071 | break; |
| 1072 | case Opt_orlov: | 1072 | case Opt_orlov: |
| 1073 | clear_opt (sbi->s_mount_opt, OLDALLOC); | 1073 | clear_opt(sbi->s_mount_opt, OLDALLOC); |
| 1074 | break; | 1074 | break; |
| 1075 | #ifdef CONFIG_EXT4DEV_FS_XATTR | 1075 | #ifdef CONFIG_EXT4DEV_FS_XATTR |
| 1076 | case Opt_user_xattr: | 1076 | case Opt_user_xattr: |
| 1077 | set_opt (sbi->s_mount_opt, XATTR_USER); | 1077 | set_opt(sbi->s_mount_opt, XATTR_USER); |
| 1078 | break; | 1078 | break; |
| 1079 | case Opt_nouser_xattr: | 1079 | case Opt_nouser_xattr: |
| 1080 | clear_opt (sbi->s_mount_opt, XATTR_USER); | 1080 | clear_opt(sbi->s_mount_opt, XATTR_USER); |
| 1081 | break; | 1081 | break; |
| 1082 | #else | 1082 | #else |
| 1083 | case Opt_user_xattr: | 1083 | case Opt_user_xattr: |
| @@ -1115,7 +1115,7 @@ static int parse_options (char *options, struct super_block *sb, | |||
| 1115 | "journal on remount\n"); | 1115 | "journal on remount\n"); |
| 1116 | return 0; | 1116 | return 0; |
| 1117 | } | 1117 | } |
| 1118 | set_opt (sbi->s_mount_opt, UPDATE_JOURNAL); | 1118 | set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); |
| 1119 | break; | 1119 | break; |
| 1120 | case Opt_journal_inum: | 1120 | case Opt_journal_inum: |
| 1121 | if (is_remount) { | 1121 | if (is_remount) { |
| @@ -1145,7 +1145,7 @@ static int parse_options (char *options, struct super_block *sb, | |||
| 1145 | set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); | 1145 | set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); |
| 1146 | break; | 1146 | break; |
| 1147 | case Opt_noload: | 1147 | case Opt_noload: |
| 1148 | set_opt (sbi->s_mount_opt, NOLOAD); | 1148 | set_opt(sbi->s_mount_opt, NOLOAD); |
| 1149 | break; | 1149 | break; |
| 1150 | case Opt_commit: | 1150 | case Opt_commit: |
| 1151 | if (match_int(&args[0], &option)) | 1151 | if (match_int(&args[0], &option)) |
| @@ -1331,7 +1331,7 @@ set_qf_format: | |||
| 1331 | "on this filesystem, use tune2fs\n"); | 1331 | "on this filesystem, use tune2fs\n"); |
| 1332 | return 0; | 1332 | return 0; |
| 1333 | } | 1333 | } |
| 1334 | set_opt (sbi->s_mount_opt, EXTENTS); | 1334 | set_opt(sbi->s_mount_opt, EXTENTS); |
| 1335 | break; | 1335 | break; |
| 1336 | case Opt_noextents: | 1336 | case Opt_noextents: |
| 1337 | /* | 1337 | /* |
| @@ -1348,7 +1348,7 @@ set_qf_format: | |||
| 1348 | "-o noextents options\n"); | 1348 | "-o noextents options\n"); |
| 1349 | return 0; | 1349 | return 0; |
| 1350 | } | 1350 | } |
| 1351 | clear_opt (sbi->s_mount_opt, EXTENTS); | 1351 | clear_opt(sbi->s_mount_opt, EXTENTS); |
| 1352 | break; | 1352 | break; |
| 1353 | case Opt_i_version: | 1353 | case Opt_i_version: |
| 1354 | set_opt(sbi->s_mount_opt, I_VERSION); | 1354 | set_opt(sbi->s_mount_opt, I_VERSION); |
| @@ -1374,9 +1374,9 @@ set_qf_format: | |||
| 1374 | set_opt(sbi->s_mount_opt, DELALLOC); | 1374 | set_opt(sbi->s_mount_opt, DELALLOC); |
| 1375 | break; | 1375 | break; |
| 1376 | default: | 1376 | default: |
| 1377 | printk (KERN_ERR | 1377 | printk(KERN_ERR |
| 1378 | "EXT4-fs: Unrecognized mount option \"%s\" " | 1378 | "EXT4-fs: Unrecognized mount option \"%s\" " |
| 1379 | "or missing value\n", p); | 1379 | "or missing value\n", p); |
| 1380 | return 0; | 1380 | return 0; |
| 1381 | } | 1381 | } |
| 1382 | } | 1382 | } |
| @@ -1423,31 +1423,31 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, | |||
| 1423 | int res = 0; | 1423 | int res = 0; |
| 1424 | 1424 | ||
| 1425 | if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { | 1425 | if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { |
| 1426 | printk (KERN_ERR "EXT4-fs warning: revision level too high, " | 1426 | printk(KERN_ERR "EXT4-fs warning: revision level too high, " |
| 1427 | "forcing read-only mode\n"); | 1427 | "forcing read-only mode\n"); |
| 1428 | res = MS_RDONLY; | 1428 | res = MS_RDONLY; |
| 1429 | } | 1429 | } |
| 1430 | if (read_only) | 1430 | if (read_only) |
| 1431 | return res; | 1431 | return res; |
| 1432 | if (!(sbi->s_mount_state & EXT4_VALID_FS)) | 1432 | if (!(sbi->s_mount_state & EXT4_VALID_FS)) |
| 1433 | printk (KERN_WARNING "EXT4-fs warning: mounting unchecked fs, " | 1433 | printk(KERN_WARNING "EXT4-fs warning: mounting unchecked fs, " |
| 1434 | "running e2fsck is recommended\n"); | 1434 | "running e2fsck is recommended\n"); |
| 1435 | else if ((sbi->s_mount_state & EXT4_ERROR_FS)) | 1435 | else if ((sbi->s_mount_state & EXT4_ERROR_FS)) |
| 1436 | printk (KERN_WARNING | 1436 | printk(KERN_WARNING |
| 1437 | "EXT4-fs warning: mounting fs with errors, " | 1437 | "EXT4-fs warning: mounting fs with errors, " |
| 1438 | "running e2fsck is recommended\n"); | 1438 | "running e2fsck is recommended\n"); |
| 1439 | else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && | 1439 | else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && |
| 1440 | le16_to_cpu(es->s_mnt_count) >= | 1440 | le16_to_cpu(es->s_mnt_count) >= |
| 1441 | (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) | 1441 | (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) |
| 1442 | printk (KERN_WARNING | 1442 | printk(KERN_WARNING |
| 1443 | "EXT4-fs warning: maximal mount count reached, " | 1443 | "EXT4-fs warning: maximal mount count reached, " |
| 1444 | "running e2fsck is recommended\n"); | 1444 | "running e2fsck is recommended\n"); |
| 1445 | else if (le32_to_cpu(es->s_checkinterval) && | 1445 | else if (le32_to_cpu(es->s_checkinterval) && |
| 1446 | (le32_to_cpu(es->s_lastcheck) + | 1446 | (le32_to_cpu(es->s_lastcheck) + |
| 1447 | le32_to_cpu(es->s_checkinterval) <= get_seconds())) | 1447 | le32_to_cpu(es->s_checkinterval) <= get_seconds())) |
| 1448 | printk (KERN_WARNING | 1448 | printk(KERN_WARNING |
| 1449 | "EXT4-fs warning: checktime reached, " | 1449 | "EXT4-fs warning: checktime reached, " |
| 1450 | "running e2fsck is recommended\n"); | 1450 | "running e2fsck is recommended\n"); |
| 1451 | #if 0 | 1451 | #if 0 |
| 1452 | /* @@@ We _will_ want to clear the valid bit if we find | 1452 | /* @@@ We _will_ want to clear the valid bit if we find |
| 1453 | * inconsistencies, to force a fsck at reboot. But for | 1453 | * inconsistencies, to force a fsck at reboot. But for |
| @@ -1506,14 +1506,13 @@ static int ext4_fill_flex_info(struct super_block *sb) | |||
| 1506 | 1506 | ||
| 1507 | flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) / | 1507 | flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) / |
| 1508 | groups_per_flex; | 1508 | groups_per_flex; |
| 1509 | sbi->s_flex_groups = kmalloc(flex_group_count * | 1509 | sbi->s_flex_groups = kzalloc(flex_group_count * |
| 1510 | sizeof(struct flex_groups), GFP_KERNEL); | 1510 | sizeof(struct flex_groups), GFP_KERNEL); |
| 1511 | if (sbi->s_flex_groups == NULL) { | 1511 | if (sbi->s_flex_groups == NULL) { |
| 1512 | printk(KERN_ERR "EXT4-fs: not enough memory\n"); | 1512 | printk(KERN_ERR "EXT4-fs: not enough memory for " |
| 1513 | "%lu flex groups\n", flex_group_count); | ||
| 1513 | goto failed; | 1514 | goto failed; |
| 1514 | } | 1515 | } |
| 1515 | memset(sbi->s_flex_groups, 0, flex_group_count * | ||
| 1516 | sizeof(struct flex_groups)); | ||
| 1517 | 1516 | ||
| 1518 | gdp = ext4_get_group_desc(sb, 1, &bh); | 1517 | gdp = ext4_get_group_desc(sb, 1, &bh); |
| 1519 | block_bitmap = ext4_block_bitmap(sb, gdp) - 1; | 1518 | block_bitmap = ext4_block_bitmap(sb, gdp) - 1; |
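The ext4_fill_flex_info() hunk above collapses a kmalloc() followed by an explicit memset() into a single kzalloc() call and has the failure path report how many flex groups could not be allocated. A minimal userspace sketch of the same allocate-and-zero idea, with calloc() standing in for kzalloc() and a made-up flex_groups layout rather than the kernel's:

#include <stdio.h>
#include <stdlib.h>

/* Illustrative stand-in only; the real struct flex_groups differs. */
struct flex_groups {
        unsigned long free_blocks;
        unsigned long free_inodes;
};

static struct flex_groups *alloc_flex_groups(unsigned long count)
{
        /* calloc() returns zeroed memory, like kzalloc(..., GFP_KERNEL) */
        struct flex_groups *fg = calloc(count, sizeof(*fg));

        if (!fg)
                fprintf(stderr, "not enough memory for %lu flex groups\n", count);
        return fg;
}

int main(void)
{
        struct flex_groups *fg = alloc_flex_groups(16);

        if (!fg)
                return 1;
        free(fg);
        return 0;
}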
| @@ -1597,16 +1596,14 @@ static int ext4_check_descriptors(struct super_block *sb) | |||
| 1597 | (EXT4_BLOCKS_PER_GROUP(sb) - 1); | 1596 | (EXT4_BLOCKS_PER_GROUP(sb) - 1); |
| 1598 | 1597 | ||
| 1599 | block_bitmap = ext4_block_bitmap(sb, gdp); | 1598 | block_bitmap = ext4_block_bitmap(sb, gdp); |
| 1600 | if (block_bitmap < first_block || block_bitmap > last_block) | 1599 | if (block_bitmap < first_block || block_bitmap > last_block) { |
| 1601 | { | ||
| 1602 | printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " | 1600 | printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " |
| 1603 | "Block bitmap for group %lu not in group " | 1601 | "Block bitmap for group %lu not in group " |
| 1604 | "(block %llu)!", i, block_bitmap); | 1602 | "(block %llu)!", i, block_bitmap); |
| 1605 | return 0; | 1603 | return 0; |
| 1606 | } | 1604 | } |
| 1607 | inode_bitmap = ext4_inode_bitmap(sb, gdp); | 1605 | inode_bitmap = ext4_inode_bitmap(sb, gdp); |
| 1608 | if (inode_bitmap < first_block || inode_bitmap > last_block) | 1606 | if (inode_bitmap < first_block || inode_bitmap > last_block) { |
| 1609 | { | ||
| 1610 | printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " | 1607 | printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " |
| 1611 | "Inode bitmap for group %lu not in group " | 1608 | "Inode bitmap for group %lu not in group " |
| 1612 | "(block %llu)!", i, inode_bitmap); | 1609 | "(block %llu)!", i, inode_bitmap); |
| @@ -1614,26 +1611,28 @@ static int ext4_check_descriptors(struct super_block *sb) | |||
| 1614 | } | 1611 | } |
| 1615 | inode_table = ext4_inode_table(sb, gdp); | 1612 | inode_table = ext4_inode_table(sb, gdp); |
| 1616 | if (inode_table < first_block || | 1613 | if (inode_table < first_block || |
| 1617 | inode_table + sbi->s_itb_per_group - 1 > last_block) | 1614 | inode_table + sbi->s_itb_per_group - 1 > last_block) { |
| 1618 | { | ||
| 1619 | printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " | 1615 | printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " |
| 1620 | "Inode table for group %lu not in group " | 1616 | "Inode table for group %lu not in group " |
| 1621 | "(block %llu)!", i, inode_table); | 1617 | "(block %llu)!", i, inode_table); |
| 1622 | return 0; | 1618 | return 0; |
| 1623 | } | 1619 | } |
| 1620 | spin_lock(sb_bgl_lock(sbi, i)); | ||
| 1624 | if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { | 1621 | if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { |
| 1625 | printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " | 1622 | printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " |
| 1626 | "Checksum for group %lu failed (%u!=%u)\n", | 1623 | "Checksum for group %lu failed (%u!=%u)\n", |
| 1627 | i, le16_to_cpu(ext4_group_desc_csum(sbi, i, | 1624 | i, le16_to_cpu(ext4_group_desc_csum(sbi, i, |
| 1628 | gdp)), le16_to_cpu(gdp->bg_checksum)); | 1625 | gdp)), le16_to_cpu(gdp->bg_checksum)); |
| 1629 | return 0; | 1626 | if (!(sb->s_flags & MS_RDONLY)) |
| 1627 | return 0; | ||
| 1630 | } | 1628 | } |
| 1629 | spin_unlock(sb_bgl_lock(sbi, i)); | ||
| 1631 | if (!flexbg_flag) | 1630 | if (!flexbg_flag) |
| 1632 | first_block += EXT4_BLOCKS_PER_GROUP(sb); | 1631 | first_block += EXT4_BLOCKS_PER_GROUP(sb); |
| 1633 | } | 1632 | } |
| 1634 | 1633 | ||
| 1635 | ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb)); | 1634 | ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb)); |
| 1636 | sbi->s_es->s_free_inodes_count=cpu_to_le32(ext4_count_free_inodes(sb)); | 1635 | sbi->s_es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb)); |
| 1637 | return 1; | 1636 | return 1; |
| 1638 | } | 1637 | } |
| 1639 | 1638 | ||
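The ext4_check_descriptors() hunk above takes sb_bgl_lock() around the group descriptor checksum verification and, on a mismatch, only refuses the mount when the filesystem is writable; a read-only mount now proceeds past the warning. A schematic of that policy, with a toy csum_ok() standing in for ext4_group_desc_csum_verify():

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-in: pretend group 3 has a bad descriptor checksum. */
static bool csum_ok(unsigned int group)
{
        return group != 3;
}

/* Return true if the mount may proceed. */
static bool check_group_descriptors(unsigned int ngroups, bool read_only)
{
        for (unsigned int g = 0; g < ngroups; g++) {
                if (!csum_ok(g)) {
                        fprintf(stderr, "checksum for group %u failed\n", g);
                        if (!read_only)
                                return false;   /* refuse a writable mount */
                        /* read-only: warn and keep going, as in the hunk above */
                }
        }
        return true;
}

int main(void)
{
        printf("rw mount ok: %d\n", check_group_descriptors(8, false));
        printf("ro mount ok: %d\n", check_group_descriptors(8, true));
        return 0;
}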
| @@ -1654,8 +1653,8 @@ static int ext4_check_descriptors(struct super_block *sb) | |||
| 1654 | * e2fsck was run on this filesystem, and it must have already done the orphan | 1653 | * e2fsck was run on this filesystem, and it must have already done the orphan |
| 1655 | * inode cleanup for us, so we can safely abort without any further action. | 1654 | * inode cleanup for us, so we can safely abort without any further action. |
| 1656 | */ | 1655 | */ |
| 1657 | static void ext4_orphan_cleanup (struct super_block * sb, | 1656 | static void ext4_orphan_cleanup(struct super_block *sb, |
| 1658 | struct ext4_super_block * es) | 1657 | struct ext4_super_block *es) |
| 1659 | { | 1658 | { |
| 1660 | unsigned int s_flags = sb->s_flags; | 1659 | unsigned int s_flags = sb->s_flags; |
| 1661 | int nr_orphans = 0, nr_truncates = 0; | 1660 | int nr_orphans = 0, nr_truncates = 0; |
| @@ -1732,7 +1731,7 @@ static void ext4_orphan_cleanup (struct super_block * sb, | |||
| 1732 | iput(inode); /* The delete magic happens here! */ | 1731 | iput(inode); /* The delete magic happens here! */ |
| 1733 | } | 1732 | } |
| 1734 | 1733 | ||
| 1735 | #define PLURAL(x) (x), ((x)==1) ? "" : "s" | 1734 | #define PLURAL(x) (x), ((x) == 1) ? "" : "s" |
| 1736 | 1735 | ||
| 1737 | if (nr_orphans) | 1736 | if (nr_orphans) |
| 1738 | printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n", | 1737 | printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n", |
| @@ -1899,12 +1898,12 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) | |||
| 1899 | return 0; | 1898 | return 0; |
| 1900 | } | 1899 | } |
| 1901 | 1900 | ||
| 1902 | static int ext4_fill_super (struct super_block *sb, void *data, int silent) | 1901 | static int ext4_fill_super(struct super_block *sb, void *data, int silent) |
| 1903 | __releases(kernel_lock) | 1902 | __releases(kernel_lock) |
| 1904 | __acquires(kernel_lock) | 1903 | __acquires(kernel_lock) |
| 1905 | 1904 | ||
| 1906 | { | 1905 | { |
| 1907 | struct buffer_head * bh; | 1906 | struct buffer_head *bh; |
| 1908 | struct ext4_super_block *es = NULL; | 1907 | struct ext4_super_block *es = NULL; |
| 1909 | struct ext4_sb_info *sbi; | 1908 | struct ext4_sb_info *sbi; |
| 1910 | ext4_fsblk_t block; | 1909 | ext4_fsblk_t block; |
| @@ -1953,7 +1952,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
| 1953 | } | 1952 | } |
| 1954 | 1953 | ||
| 1955 | if (!(bh = sb_bread(sb, logical_sb_block))) { | 1954 | if (!(bh = sb_bread(sb, logical_sb_block))) { |
| 1956 | printk (KERN_ERR "EXT4-fs: unable to read superblock\n"); | 1955 | printk(KERN_ERR "EXT4-fs: unable to read superblock\n"); |
| 1957 | goto out_fail; | 1956 | goto out_fail; |
| 1958 | } | 1957 | } |
| 1959 | /* | 1958 | /* |
| @@ -2026,8 +2025,8 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
| 2026 | set_opt(sbi->s_mount_opt, DELALLOC); | 2025 | set_opt(sbi->s_mount_opt, DELALLOC); |
| 2027 | 2026 | ||
| 2028 | 2027 | ||
| 2029 | if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, | 2028 | if (!parse_options((char *) data, sb, &journal_inum, &journal_devnum, |
| 2030 | NULL, 0)) | 2029 | NULL, 0)) |
| 2031 | goto failed_mount; | 2030 | goto failed_mount; |
| 2032 | 2031 | ||
| 2033 | sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | | 2032 | sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | |
| @@ -2102,7 +2101,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
| 2102 | goto failed_mount; | 2101 | goto failed_mount; |
| 2103 | } | 2102 | } |
| 2104 | 2103 | ||
| 2105 | brelse (bh); | 2104 | brelse(bh); |
| 2106 | logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; | 2105 | logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; |
| 2107 | offset = do_div(logical_sb_block, blocksize); | 2106 | offset = do_div(logical_sb_block, blocksize); |
| 2108 | bh = sb_bread(sb, logical_sb_block); | 2107 | bh = sb_bread(sb, logical_sb_block); |
| @@ -2114,8 +2113,8 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
| 2114 | es = (struct ext4_super_block *)(((char *)bh->b_data) + offset); | 2113 | es = (struct ext4_super_block *)(((char *)bh->b_data) + offset); |
| 2115 | sbi->s_es = es; | 2114 | sbi->s_es = es; |
| 2116 | if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { | 2115 | if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { |
| 2117 | printk (KERN_ERR | 2116 | printk(KERN_ERR |
| 2118 | "EXT4-fs: Magic mismatch, very weird !\n"); | 2117 | "EXT4-fs: Magic mismatch, very weird !\n"); |
| 2119 | goto failed_mount; | 2118 | goto failed_mount; |
| 2120 | } | 2119 | } |
| 2121 | } | 2120 | } |
| @@ -2132,9 +2131,9 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
| 2132 | if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || | 2131 | if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || |
| 2133 | (!is_power_of_2(sbi->s_inode_size)) || | 2132 | (!is_power_of_2(sbi->s_inode_size)) || |
| 2134 | (sbi->s_inode_size > blocksize)) { | 2133 | (sbi->s_inode_size > blocksize)) { |
| 2135 | printk (KERN_ERR | 2134 | printk(KERN_ERR |
| 2136 | "EXT4-fs: unsupported inode size: %d\n", | 2135 | "EXT4-fs: unsupported inode size: %d\n", |
| 2137 | sbi->s_inode_size); | 2136 | sbi->s_inode_size); |
| 2138 | goto failed_mount; | 2137 | goto failed_mount; |
| 2139 | } | 2138 | } |
| 2140 | if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) | 2139 | if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) |
| @@ -2166,20 +2165,20 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
| 2166 | sbi->s_mount_state = le16_to_cpu(es->s_state); | 2165 | sbi->s_mount_state = le16_to_cpu(es->s_state); |
| 2167 | sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); | 2166 | sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); |
| 2168 | sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); | 2167 | sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); |
| 2169 | for (i=0; i < 4; i++) | 2168 | for (i = 0; i < 4; i++) |
| 2170 | sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); | 2169 | sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); |
| 2171 | sbi->s_def_hash_version = es->s_def_hash_version; | 2170 | sbi->s_def_hash_version = es->s_def_hash_version; |
| 2172 | 2171 | ||
| 2173 | if (sbi->s_blocks_per_group > blocksize * 8) { | 2172 | if (sbi->s_blocks_per_group > blocksize * 8) { |
| 2174 | printk (KERN_ERR | 2173 | printk(KERN_ERR |
| 2175 | "EXT4-fs: #blocks per group too big: %lu\n", | 2174 | "EXT4-fs: #blocks per group too big: %lu\n", |
| 2176 | sbi->s_blocks_per_group); | 2175 | sbi->s_blocks_per_group); |
| 2177 | goto failed_mount; | 2176 | goto failed_mount; |
| 2178 | } | 2177 | } |
| 2179 | if (sbi->s_inodes_per_group > blocksize * 8) { | 2178 | if (sbi->s_inodes_per_group > blocksize * 8) { |
| 2180 | printk (KERN_ERR | 2179 | printk(KERN_ERR |
| 2181 | "EXT4-fs: #inodes per group too big: %lu\n", | 2180 | "EXT4-fs: #inodes per group too big: %lu\n", |
| 2182 | sbi->s_inodes_per_group); | 2181 | sbi->s_inodes_per_group); |
| 2183 | goto failed_mount; | 2182 | goto failed_mount; |
| 2184 | } | 2183 | } |
| 2185 | 2184 | ||
| @@ -2213,10 +2212,10 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
| 2213 | sbi->s_groups_count = blocks_count; | 2212 | sbi->s_groups_count = blocks_count; |
| 2214 | db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / | 2213 | db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / |
| 2215 | EXT4_DESC_PER_BLOCK(sb); | 2214 | EXT4_DESC_PER_BLOCK(sb); |
| 2216 | sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *), | 2215 | sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), |
| 2217 | GFP_KERNEL); | 2216 | GFP_KERNEL); |
| 2218 | if (sbi->s_group_desc == NULL) { | 2217 | if (sbi->s_group_desc == NULL) { |
| 2219 | printk (KERN_ERR "EXT4-fs: not enough memory\n"); | 2218 | printk(KERN_ERR "EXT4-fs: not enough memory\n"); |
| 2220 | goto failed_mount; | 2219 | goto failed_mount; |
| 2221 | } | 2220 | } |
| 2222 | 2221 | ||
| @@ -2226,13 +2225,13 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
| 2226 | block = descriptor_loc(sb, logical_sb_block, i); | 2225 | block = descriptor_loc(sb, logical_sb_block, i); |
| 2227 | sbi->s_group_desc[i] = sb_bread(sb, block); | 2226 | sbi->s_group_desc[i] = sb_bread(sb, block); |
| 2228 | if (!sbi->s_group_desc[i]) { | 2227 | if (!sbi->s_group_desc[i]) { |
| 2229 | printk (KERN_ERR "EXT4-fs: " | 2228 | printk(KERN_ERR "EXT4-fs: " |
| 2230 | "can't read group descriptor %d\n", i); | 2229 | "can't read group descriptor %d\n", i); |
| 2231 | db_count = i; | 2230 | db_count = i; |
| 2232 | goto failed_mount2; | 2231 | goto failed_mount2; |
| 2233 | } | 2232 | } |
| 2234 | } | 2233 | } |
| 2235 | if (!ext4_check_descriptors (sb)) { | 2234 | if (!ext4_check_descriptors(sb)) { |
| 2236 | printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n"); | 2235 | printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n"); |
| 2237 | goto failed_mount2; | 2236 | goto failed_mount2; |
| 2238 | } | 2237 | } |
| @@ -2308,11 +2307,11 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
| 2308 | EXT4_SB(sb)->s_journal->j_failed_commit) { | 2307 | EXT4_SB(sb)->s_journal->j_failed_commit) { |
| 2309 | printk(KERN_CRIT "EXT4-fs error (device %s): " | 2308 | printk(KERN_CRIT "EXT4-fs error (device %s): " |
| 2310 | "ext4_fill_super: Journal transaction " | 2309 | "ext4_fill_super: Journal transaction " |
| 2311 | "%u is corrupt\n", sb->s_id, | 2310 | "%u is corrupt\n", sb->s_id, |
| 2312 | EXT4_SB(sb)->s_journal->j_failed_commit); | 2311 | EXT4_SB(sb)->s_journal->j_failed_commit); |
| 2313 | if (test_opt (sb, ERRORS_RO)) { | 2312 | if (test_opt(sb, ERRORS_RO)) { |
| 2314 | printk (KERN_CRIT | 2313 | printk(KERN_CRIT |
| 2315 | "Mounting filesystem read-only\n"); | 2314 | "Mounting filesystem read-only\n"); |
| 2316 | sb->s_flags |= MS_RDONLY; | 2315 | sb->s_flags |= MS_RDONLY; |
| 2317 | EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; | 2316 | EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; |
| 2318 | es->s_state |= cpu_to_le16(EXT4_ERROR_FS); | 2317 | es->s_state |= cpu_to_le16(EXT4_ERROR_FS); |
| @@ -2332,9 +2331,9 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
| 2332 | goto failed_mount3; | 2331 | goto failed_mount3; |
| 2333 | } else { | 2332 | } else { |
| 2334 | if (!silent) | 2333 | if (!silent) |
| 2335 | printk (KERN_ERR | 2334 | printk(KERN_ERR |
| 2336 | "ext4: No journal on filesystem on %s\n", | 2335 | "ext4: No journal on filesystem on %s\n", |
| 2337 | sb->s_id); | 2336 | sb->s_id); |
| 2338 | goto failed_mount3; | 2337 | goto failed_mount3; |
| 2339 | } | 2338 | } |
| 2340 | 2339 | ||
| @@ -2418,7 +2417,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
| 2418 | goto failed_mount4; | 2417 | goto failed_mount4; |
| 2419 | } | 2418 | } |
| 2420 | 2419 | ||
| 2421 | ext4_setup_super (sb, es, sb->s_flags & MS_RDONLY); | 2420 | ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY); |
| 2422 | 2421 | ||
| 2423 | /* determine the minimum size of new large inodes, if present */ | 2422 | /* determine the minimum size of new large inodes, if present */ |
| 2424 | if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { | 2423 | if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { |
| @@ -2457,12 +2456,12 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
| 2457 | ext4_orphan_cleanup(sb, es); | 2456 | ext4_orphan_cleanup(sb, es); |
| 2458 | EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; | 2457 | EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; |
| 2459 | if (needs_recovery) | 2458 | if (needs_recovery) |
| 2460 | printk (KERN_INFO "EXT4-fs: recovery complete.\n"); | 2459 | printk(KERN_INFO "EXT4-fs: recovery complete.\n"); |
| 2461 | ext4_mark_recovery_complete(sb, es); | 2460 | ext4_mark_recovery_complete(sb, es); |
| 2462 | printk (KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n", | 2461 | printk(KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n", |
| 2463 | test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal": | 2462 | test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal": |
| 2464 | test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": | 2463 | test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": |
| 2465 | "writeback"); | 2464 | "writeback"); |
| 2466 | 2465 | ||
| 2467 | if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { | 2466 | if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { |
| 2468 | printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " | 2467 | printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " |
| @@ -2575,14 +2574,14 @@ static journal_t *ext4_get_journal(struct super_block *sb, | |||
| 2575 | static journal_t *ext4_get_dev_journal(struct super_block *sb, | 2574 | static journal_t *ext4_get_dev_journal(struct super_block *sb, |
| 2576 | dev_t j_dev) | 2575 | dev_t j_dev) |
| 2577 | { | 2576 | { |
| 2578 | struct buffer_head * bh; | 2577 | struct buffer_head *bh; |
| 2579 | journal_t *journal; | 2578 | journal_t *journal; |
| 2580 | ext4_fsblk_t start; | 2579 | ext4_fsblk_t start; |
| 2581 | ext4_fsblk_t len; | 2580 | ext4_fsblk_t len; |
| 2582 | int hblock, blocksize; | 2581 | int hblock, blocksize; |
| 2583 | ext4_fsblk_t sb_block; | 2582 | ext4_fsblk_t sb_block; |
| 2584 | unsigned long offset; | 2583 | unsigned long offset; |
| 2585 | struct ext4_super_block * es; | 2584 | struct ext4_super_block *es; |
| 2586 | struct block_device *bdev; | 2585 | struct block_device *bdev; |
| 2587 | 2586 | ||
| 2588 | bdev = ext4_blkdev_get(j_dev); | 2587 | bdev = ext4_blkdev_get(j_dev); |
| @@ -2697,8 +2696,8 @@ static int ext4_load_journal(struct super_block *sb, | |||
| 2697 | "unavailable, cannot proceed.\n"); | 2696 | "unavailable, cannot proceed.\n"); |
| 2698 | return -EROFS; | 2697 | return -EROFS; |
| 2699 | } | 2698 | } |
| 2700 | printk (KERN_INFO "EXT4-fs: write access will " | 2699 | printk(KERN_INFO "EXT4-fs: write access will " |
| 2701 | "be enabled during recovery.\n"); | 2700 | "be enabled during recovery.\n"); |
| 2702 | } | 2701 | } |
| 2703 | } | 2702 | } |
| 2704 | 2703 | ||
| @@ -2751,8 +2750,8 @@ static int ext4_load_journal(struct super_block *sb, | |||
| 2751 | return 0; | 2750 | return 0; |
| 2752 | } | 2751 | } |
| 2753 | 2752 | ||
| 2754 | static int ext4_create_journal(struct super_block * sb, | 2753 | static int ext4_create_journal(struct super_block *sb, |
| 2755 | struct ext4_super_block * es, | 2754 | struct ext4_super_block *es, |
| 2756 | unsigned int journal_inum) | 2755 | unsigned int journal_inum) |
| 2757 | { | 2756 | { |
| 2758 | journal_t *journal; | 2757 | journal_t *journal; |
| @@ -2793,9 +2792,8 @@ static int ext4_create_journal(struct super_block * sb, | |||
| 2793 | return 0; | 2792 | return 0; |
| 2794 | } | 2793 | } |
| 2795 | 2794 | ||
| 2796 | static void ext4_commit_super (struct super_block * sb, | 2795 | static void ext4_commit_super(struct super_block *sb, |
| 2797 | struct ext4_super_block * es, | 2796 | struct ext4_super_block *es, int sync) |
| 2798 | int sync) | ||
| 2799 | { | 2797 | { |
| 2800 | struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; | 2798 | struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; |
| 2801 | 2799 | ||
| @@ -2816,8 +2814,8 @@ static void ext4_commit_super (struct super_block * sb, | |||
| 2816 | * remounting) the filesystem readonly, then we will end up with a | 2814 | * remounting) the filesystem readonly, then we will end up with a |
| 2817 | * consistent fs on disk. Record that fact. | 2815 | * consistent fs on disk. Record that fact. |
| 2818 | */ | 2816 | */ |
| 2819 | static void ext4_mark_recovery_complete(struct super_block * sb, | 2817 | static void ext4_mark_recovery_complete(struct super_block *sb, |
| 2820 | struct ext4_super_block * es) | 2818 | struct ext4_super_block *es) |
| 2821 | { | 2819 | { |
| 2822 | journal_t *journal = EXT4_SB(sb)->s_journal; | 2820 | journal_t *journal = EXT4_SB(sb)->s_journal; |
| 2823 | 2821 | ||
| @@ -2839,8 +2837,8 @@ static void ext4_mark_recovery_complete(struct super_block * sb, | |||
| 2839 | * has recorded an error from a previous lifetime, move that error to the | 2837 | * has recorded an error from a previous lifetime, move that error to the |
| 2840 | * main filesystem now. | 2838 | * main filesystem now. |
| 2841 | */ | 2839 | */ |
| 2842 | static void ext4_clear_journal_err(struct super_block * sb, | 2840 | static void ext4_clear_journal_err(struct super_block *sb, |
| 2843 | struct ext4_super_block * es) | 2841 | struct ext4_super_block *es) |
| 2844 | { | 2842 | { |
| 2845 | journal_t *journal; | 2843 | journal_t *journal; |
| 2846 | int j_errno; | 2844 | int j_errno; |
| @@ -2865,7 +2863,7 @@ static void ext4_clear_journal_err(struct super_block * sb, | |||
| 2865 | 2863 | ||
| 2866 | EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; | 2864 | EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; |
| 2867 | es->s_state |= cpu_to_le16(EXT4_ERROR_FS); | 2865 | es->s_state |= cpu_to_le16(EXT4_ERROR_FS); |
| 2868 | ext4_commit_super (sb, es, 1); | 2866 | ext4_commit_super(sb, es, 1); |
| 2869 | 2867 | ||
| 2870 | jbd2_journal_clear_err(journal); | 2868 | jbd2_journal_clear_err(journal); |
| 2871 | } | 2869 | } |
| @@ -2898,7 +2896,7 @@ int ext4_force_commit(struct super_block *sb) | |||
| 2898 | * This implicitly triggers the writebehind on sync(). | 2896 | * This implicitly triggers the writebehind on sync(). |
| 2899 | */ | 2897 | */ |
| 2900 | 2898 | ||
| 2901 | static void ext4_write_super (struct super_block * sb) | 2899 | static void ext4_write_super(struct super_block *sb) |
| 2902 | { | 2900 | { |
| 2903 | if (mutex_trylock(&sb->s_lock) != 0) | 2901 | if (mutex_trylock(&sb->s_lock) != 0) |
| 2904 | BUG(); | 2902 | BUG(); |
| @@ -2954,13 +2952,14 @@ static void ext4_unlockfs(struct super_block *sb) | |||
| 2954 | } | 2952 | } |
| 2955 | } | 2953 | } |
| 2956 | 2954 | ||
| 2957 | static int ext4_remount (struct super_block * sb, int * flags, char * data) | 2955 | static int ext4_remount(struct super_block *sb, int *flags, char *data) |
| 2958 | { | 2956 | { |
| 2959 | struct ext4_super_block * es; | 2957 | struct ext4_super_block *es; |
| 2960 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 2958 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
| 2961 | ext4_fsblk_t n_blocks_count = 0; | 2959 | ext4_fsblk_t n_blocks_count = 0; |
| 2962 | unsigned long old_sb_flags; | 2960 | unsigned long old_sb_flags; |
| 2963 | struct ext4_mount_options old_opts; | 2961 | struct ext4_mount_options old_opts; |
| 2962 | ext4_group_t g; | ||
| 2964 | int err; | 2963 | int err; |
| 2965 | #ifdef CONFIG_QUOTA | 2964 | #ifdef CONFIG_QUOTA |
| 2966 | int i; | 2965 | int i; |
| @@ -3039,6 +3038,26 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data) | |||
| 3039 | } | 3038 | } |
| 3040 | 3039 | ||
| 3041 | /* | 3040 | /* |
| 3041 | * Make sure the group descriptor checksums | ||
| 3042 | * are sane. If they aren't, refuse to | ||
| 3043 | * remount r/w. | ||
| 3044 | */ | ||
| 3045 | for (g = 0; g < sbi->s_groups_count; g++) { | ||
| 3046 | struct ext4_group_desc *gdp = | ||
| 3047 | ext4_get_group_desc(sb, g, NULL); | ||
| 3048 | |||
| 3049 | if (!ext4_group_desc_csum_verify(sbi, g, gdp)) { | ||
| 3050 | printk(KERN_ERR | ||
| 3051 | "EXT4-fs: ext4_remount: " | ||
| 3052 | "Checksum for group %lu failed (%u!=%u)\n", | ||
| 3053 | g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), | ||
| 3054 | le16_to_cpu(gdp->bg_checksum)); | ||
| 3055 | err = -EINVAL; | ||
| 3056 | goto restore_opts; | ||
| 3057 | } | ||
| 3058 | } | ||
| 3059 | |||
| 3060 | /* | ||
| 3042 | * If we have an unprocessed orphan list hanging | 3061 | * If we have an unprocessed orphan list hanging |
| 3043 | * around from a previously readonly bdev mount, | 3062 | * around from a previously readonly bdev mount, |
| 3044 | * require a full umount/remount for now. | 3063 | * require a full umount/remount for now. |
| @@ -3063,7 +3082,7 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data) | |||
| 3063 | sbi->s_mount_state = le16_to_cpu(es->s_state); | 3082 | sbi->s_mount_state = le16_to_cpu(es->s_state); |
| 3064 | if ((err = ext4_group_extend(sb, es, n_blocks_count))) | 3083 | if ((err = ext4_group_extend(sb, es, n_blocks_count))) |
| 3065 | goto restore_opts; | 3084 | goto restore_opts; |
| 3066 | if (!ext4_setup_super (sb, es, 0)) | 3085 | if (!ext4_setup_super(sb, es, 0)) |
| 3067 | sb->s_flags &= ~MS_RDONLY; | 3086 | sb->s_flags &= ~MS_RDONLY; |
| 3068 | } | 3087 | } |
| 3069 | } | 3088 | } |
| @@ -3093,7 +3112,7 @@ restore_opts: | |||
| 3093 | return err; | 3112 | return err; |
| 3094 | } | 3113 | } |
| 3095 | 3114 | ||
| 3096 | static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf) | 3115 | static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) |
| 3097 | { | 3116 | { |
| 3098 | struct super_block *sb = dentry->d_sb; | 3117 | struct super_block *sb = dentry->d_sb; |
| 3099 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 3118 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
| @@ -3331,12 +3350,12 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, | |||
| 3331 | } | 3350 | } |
| 3332 | /* Journaling quota? */ | 3351 | /* Journaling quota? */ |
| 3333 | if (EXT4_SB(sb)->s_qf_names[type]) { | 3352 | if (EXT4_SB(sb)->s_qf_names[type]) { |
| 3334 | /* Quotafile not of fs root? */ | 3353 | /* Quotafile not in fs root? */ |
| 3335 | if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) | 3354 | if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) |
| 3336 | printk(KERN_WARNING | 3355 | printk(KERN_WARNING |
| 3337 | "EXT4-fs: Quota file not on filesystem root. " | 3356 | "EXT4-fs: Quota file not on filesystem root. " |
| 3338 | "Journaled quota will not work.\n"); | 3357 | "Journaled quota will not work.\n"); |
| 3339 | } | 3358 | } |
| 3340 | 3359 | ||
| 3341 | /* | 3360 | /* |
| 3342 | * When we journal data on quota file, we have to flush journal to see | 3361 | * When we journal data on quota file, we have to flush journal to see |
| @@ -3352,8 +3371,9 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, | |||
| 3352 | jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); | 3371 | jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); |
| 3353 | } | 3372 | } |
| 3354 | 3373 | ||
| 3374 | err = vfs_quota_on_path(sb, type, format_id, &nd.path); | ||
| 3355 | path_put(&nd.path); | 3375 | path_put(&nd.path); |
| 3356 | return vfs_quota_on(sb, type, format_id, path, remount); | 3376 | return err; |
| 3357 | } | 3377 | } |
| 3358 | 3378 | ||
| 3359 | /* Read data from quotafile - avoid pagecache and such because we cannot afford | 3379 | /* Read data from quotafile - avoid pagecache and such because we cannot afford |
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 93c5fdcdad2e..8954208b4893 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c | |||
| @@ -1512,7 +1512,7 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header, | |||
| 1512 | char *name = entry->e_name; | 1512 | char *name = entry->e_name; |
| 1513 | int n; | 1513 | int n; |
| 1514 | 1514 | ||
| 1515 | for (n=0; n < entry->e_name_len; n++) { | 1515 | for (n = 0; n < entry->e_name_len; n++) { |
| 1516 | hash = (hash << NAME_HASH_SHIFT) ^ | 1516 | hash = (hash << NAME_HASH_SHIFT) ^ |
| 1517 | (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ | 1517 | (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ |
| 1518 | *name++; | 1518 | *name++; |
diff --git a/fs/fat/cache.c b/fs/fat/cache.c index 3a9ecac8d61f..3222f51c41cf 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c | |||
| @@ -36,7 +36,7 @@ static inline int fat_max_cache(struct inode *inode) | |||
| 36 | 36 | ||
| 37 | static struct kmem_cache *fat_cache_cachep; | 37 | static struct kmem_cache *fat_cache_cachep; |
| 38 | 38 | ||
| 39 | static void init_once(struct kmem_cache *cachep, void *foo) | 39 | static void init_once(void *foo) |
| 40 | { | 40 | { |
| 41 | struct fat_cache *cache = (struct fat_cache *)foo; | 41 | struct fat_cache *cache = (struct fat_cache *)foo; |
| 42 | 42 | ||
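The fs/fat/cache.c hunk (and the matching one in fs/fat/inode.c further down) reflects a tree-wide change to the slab constructor callback: it now receives only the object pointer, so init_once() drops its struct kmem_cache argument. A toy object-cache sketch showing the new callback shape; the names here are invented for illustration, not kernel API:

#include <stdlib.h>

/* Constructor callback with the new shape: only the object is passed. */
typedef void (*ctor_fn)(void *obj);

struct toy_cache {
        size_t objsize;
        ctor_fn ctor;
};

static void *toy_cache_alloc(struct toy_cache *c)
{
        void *obj = malloc(c->objsize);

        if (obj && c->ctor)
                c->ctor(obj);           /* matches init_once(void *foo) above */
        return obj;
}

/* Stand-in object type, not the kernel's struct fat_cache. */
struct fat_cache_like {
        int nr_contig;
};

static void init_once(void *foo)
{
        struct fat_cache_like *cache = foo;

        cache->nr_contig = 0;
}

int main(void)
{
        struct toy_cache c = { sizeof(struct fat_cache_like), init_once };
        void *obj = toy_cache_alloc(&c);

        free(obj);
        return 0;
}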
diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 34541d06e626..cd4a0162e10d 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c | |||
| @@ -17,7 +17,6 @@ | |||
| 17 | #include <linux/slab.h> | 17 | #include <linux/slab.h> |
| 18 | #include <linux/time.h> | 18 | #include <linux/time.h> |
| 19 | #include <linux/msdos_fs.h> | 19 | #include <linux/msdos_fs.h> |
| 20 | #include <linux/dirent.h> | ||
| 21 | #include <linux/smp_lock.h> | 20 | #include <linux/smp_lock.h> |
| 22 | #include <linux/buffer_head.h> | 21 | #include <linux/buffer_head.h> |
| 23 | #include <linux/compat.h> | 22 | #include <linux/compat.h> |
| @@ -124,10 +123,11 @@ static inline int fat_get_entry(struct inode *dir, loff_t *pos, | |||
| 124 | * but ignore that right now. | 123 | * but ignore that right now. |
| 125 | * Ahem... Stack smashing in ring 0 isn't fun. Fixed. | 124 | * Ahem... Stack smashing in ring 0 isn't fun. Fixed. |
| 126 | */ | 125 | */ |
| 127 | static int uni16_to_x8(unsigned char *ascii, wchar_t *uni, int len, | 126 | static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len, |
| 128 | int uni_xlate, struct nls_table *nls) | 127 | int uni_xlate, struct nls_table *nls) |
| 129 | { | 128 | { |
| 130 | wchar_t *ip, ec; | 129 | const wchar_t *ip; |
| 130 | wchar_t ec; | ||
| 131 | unsigned char *op, nc; | 131 | unsigned char *op, nc; |
| 132 | int charlen; | 132 | int charlen; |
| 133 | int k; | 133 | int k; |
| @@ -167,6 +167,16 @@ static int uni16_to_x8(unsigned char *ascii, wchar_t *uni, int len, | |||
| 167 | return (op - ascii); | 167 | return (op - ascii); |
| 168 | } | 168 | } |
| 169 | 169 | ||
| 170 | static inline int fat_uni_to_x8(struct msdos_sb_info *sbi, const wchar_t *uni, | ||
| 171 | unsigned char *buf, int size) | ||
| 172 | { | ||
| 173 | if (sbi->options.utf8) | ||
| 174 | return utf8_wcstombs(buf, uni, size); | ||
| 175 | else | ||
| 176 | return uni16_to_x8(buf, uni, size, sbi->options.unicode_xlate, | ||
| 177 | sbi->nls_io); | ||
| 178 | } | ||
| 179 | |||
| 170 | static inline int | 180 | static inline int |
| 171 | fat_short2uni(struct nls_table *t, unsigned char *c, int clen, wchar_t *uni) | 181 | fat_short2uni(struct nls_table *t, unsigned char *c, int clen, wchar_t *uni) |
| 172 | { | 182 | { |
| @@ -227,6 +237,19 @@ fat_shortname2uni(struct nls_table *nls, unsigned char *buf, int buf_size, | |||
| 227 | return len; | 237 | return len; |
| 228 | } | 238 | } |
| 229 | 239 | ||
| 240 | static inline int fat_name_match(struct msdos_sb_info *sbi, | ||
| 241 | const unsigned char *a, int a_len, | ||
| 242 | const unsigned char *b, int b_len) | ||
| 243 | { | ||
| 244 | if (a_len != b_len) | ||
| 245 | return 0; | ||
| 246 | |||
| 247 | if (sbi->options.name_check != 's') | ||
| 248 | return !nls_strnicmp(sbi->nls_io, a, b, a_len); | ||
| 249 | else | ||
| 250 | return !memcmp(a, b, a_len); | ||
| 251 | } | ||
| 252 | |||
| 230 | enum { PARSE_INVALID = 1, PARSE_NOT_LONGNAME, PARSE_EOF, }; | 253 | enum { PARSE_INVALID = 1, PARSE_NOT_LONGNAME, PARSE_EOF, }; |
| 231 | 254 | ||
| 232 | /** | 255 | /** |
| @@ -302,6 +325,19 @@ parse_long: | |||
| 302 | } | 325 | } |
| 303 | 326 | ||
| 304 | /* | 327 | /* |
| 328 | * Maximum buffer size of short name. | ||
| 329 | * [(MSDOS_NAME + '.') * max one char + nul] | ||
| 330 | * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul] | ||
| 331 | */ | ||
| 332 | #define FAT_MAX_SHORT_SIZE ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1) | ||
| 333 | /* | ||
| 334 | * Maximum buffer size of unicode chars from slots. | ||
| 335 | * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)] | ||
| 336 | */ | ||
| 337 | #define FAT_MAX_UNI_CHARS ((MSDOS_SLOTS - 1) * 13 + 1) | ||
| 338 | #define FAT_MAX_UNI_SIZE (FAT_MAX_UNI_CHARS * sizeof(wchar_t)) | ||
| 339 | |||
| 340 | /* | ||
| 305 | * Return values: negative -> error, 0 -> not found, positive -> found, | 341 | * Return values: negative -> error, 0 -> not found, positive -> found, |
| 306 | * value is the total amount of slots, including the shortname entry. | 342 | * value is the total amount of slots, including the shortname entry. |
| 307 | */ | 343 | */ |
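The new FAT_MAX_SHORT_SIZE / FAT_MAX_UNI_CHARS / FAT_MAX_UNI_SIZE macros above replace the old on-stack 56-byte buffer and the __getname() allocation with explicitly sized buffers. Plugging in the header values assumed here (MSDOS_NAME = 11, NLS_MAX_CHARSET_SIZE = 6, MSDOS_SLOTS = 21, and a 16-bit kernel-internal wchar_t) gives 73, 261 and 522 bytes respectively, consistent with the removed "261 is maximum size of unicode buffer" comment further down:

#include <stdio.h>

/* Assumed header values; check the kernel tree before relying on them. */
#define MSDOS_NAME              11      /* 8.3 name without the dot */
#define NLS_MAX_CHARSET_SIZE    6       /* max bytes per converted char */
#define MSDOS_SLOTS             21      /* long name slots + short entry */
#define KERNEL_WCHAR_SIZE       2       /* kernel-internal wchar_t is 16-bit */

#define FAT_MAX_SHORT_SIZE      ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1)
#define FAT_MAX_UNI_CHARS       ((MSDOS_SLOTS - 1) * 13 + 1)
#define FAT_MAX_UNI_SIZE        (FAT_MAX_UNI_CHARS * KERNEL_WCHAR_SIZE)

int main(void)
{
        printf("short=%d uni_chars=%d uni_bytes=%d\n",
               FAT_MAX_SHORT_SIZE, FAT_MAX_UNI_CHARS, FAT_MAX_UNI_SIZE);
        return 0;
}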
| @@ -312,29 +348,20 @@ int fat_search_long(struct inode *inode, const unsigned char *name, | |||
| 312 | struct msdos_sb_info *sbi = MSDOS_SB(sb); | 348 | struct msdos_sb_info *sbi = MSDOS_SB(sb); |
| 313 | struct buffer_head *bh = NULL; | 349 | struct buffer_head *bh = NULL; |
| 314 | struct msdos_dir_entry *de; | 350 | struct msdos_dir_entry *de; |
| 315 | struct nls_table *nls_io = sbi->nls_io; | ||
| 316 | struct nls_table *nls_disk = sbi->nls_disk; | 351 | struct nls_table *nls_disk = sbi->nls_disk; |
| 317 | wchar_t bufuname[14]; | ||
| 318 | unsigned char nr_slots; | 352 | unsigned char nr_slots; |
| 319 | int xlate_len; | 353 | wchar_t bufuname[14]; |
| 320 | wchar_t *unicode = NULL; | 354 | wchar_t *unicode = NULL; |
| 321 | unsigned char work[MSDOS_NAME]; | 355 | unsigned char work[MSDOS_NAME]; |
| 322 | unsigned char *bufname = NULL; | 356 | unsigned char bufname[FAT_MAX_SHORT_SIZE]; |
| 323 | int uni_xlate = sbi->options.unicode_xlate; | ||
| 324 | int utf8 = sbi->options.utf8; | ||
| 325 | int anycase = (sbi->options.name_check != 's'); | ||
| 326 | unsigned short opt_shortname = sbi->options.shortname; | 357 | unsigned short opt_shortname = sbi->options.shortname; |
| 327 | loff_t cpos = 0; | 358 | loff_t cpos = 0; |
| 328 | int chl, i, j, last_u, err; | 359 | int chl, i, j, last_u, err, len; |
| 329 | |||
| 330 | bufname = __getname(); | ||
| 331 | if (!bufname) | ||
| 332 | return -ENOMEM; | ||
| 333 | 360 | ||
| 334 | err = -ENOENT; | 361 | err = -ENOENT; |
| 335 | while(1) { | 362 | while (1) { |
| 336 | if (fat_get_entry(inode, &cpos, &bh, &de) == -1) | 363 | if (fat_get_entry(inode, &cpos, &bh, &de) == -1) |
| 337 | goto EODir; | 364 | goto end_of_dir; |
| 338 | parse_record: | 365 | parse_record: |
| 339 | nr_slots = 0; | 366 | nr_slots = 0; |
| 340 | if (de->name[0] == DELETED_FLAG) | 367 | if (de->name[0] == DELETED_FLAG) |
| @@ -353,7 +380,7 @@ parse_record: | |||
| 353 | else if (status == PARSE_NOT_LONGNAME) | 380 | else if (status == PARSE_NOT_LONGNAME) |
| 354 | goto parse_record; | 381 | goto parse_record; |
| 355 | else if (status == PARSE_EOF) | 382 | else if (status == PARSE_EOF) |
| 356 | goto EODir; | 383 | goto end_of_dir; |
| 357 | } | 384 | } |
| 358 | 385 | ||
| 359 | memcpy(work, de->name, sizeof(de->name)); | 386 | memcpy(work, de->name, sizeof(de->name)); |
| @@ -394,30 +421,24 @@ parse_record: | |||
| 394 | if (!last_u) | 421 | if (!last_u) |
| 395 | continue; | 422 | continue; |
| 396 | 423 | ||
| 424 | /* Compare shortname */ | ||
| 397 | bufuname[last_u] = 0x0000; | 425 | bufuname[last_u] = 0x0000; |
| 398 | xlate_len = utf8 | 426 | len = fat_uni_to_x8(sbi, bufuname, bufname, sizeof(bufname)); |
| 399 | ?utf8_wcstombs(bufname, bufuname, PATH_MAX) | 427 | if (fat_name_match(sbi, name, name_len, bufname, len)) |
| 400 | :uni16_to_x8(bufname, bufuname, PATH_MAX, uni_xlate, nls_io); | 428 | goto found; |
| 401 | if (xlate_len == name_len) | ||
| 402 | if ((!anycase && !memcmp(name, bufname, xlate_len)) || | ||
| 403 | (anycase && !nls_strnicmp(nls_io, name, bufname, | ||
| 404 | xlate_len))) | ||
| 405 | goto Found; | ||
| 406 | 429 | ||
| 407 | if (nr_slots) { | 430 | if (nr_slots) { |
| 408 | xlate_len = utf8 | 431 | void *longname = unicode + FAT_MAX_UNI_CHARS; |
| 409 | ?utf8_wcstombs(bufname, unicode, PATH_MAX) | 432 | int size = PATH_MAX - FAT_MAX_UNI_SIZE; |
| 410 | :uni16_to_x8(bufname, unicode, PATH_MAX, uni_xlate, nls_io); | 433 | |
| 411 | if (xlate_len != name_len) | 434 | /* Compare longname */ |
| 412 | continue; | 435 | len = fat_uni_to_x8(sbi, unicode, longname, size); |
| 413 | if ((!anycase && !memcmp(name, bufname, xlate_len)) || | 436 | if (fat_name_match(sbi, name, name_len, longname, len)) |
| 414 | (anycase && !nls_strnicmp(nls_io, name, bufname, | 437 | goto found; |
| 415 | xlate_len))) | ||
| 416 | goto Found; | ||
| 417 | } | 438 | } |
| 418 | } | 439 | } |
| 419 | 440 | ||
| 420 | Found: | 441 | found: |
| 421 | nr_slots++; /* include the de */ | 442 | nr_slots++; /* include the de */ |
| 422 | sinfo->slot_off = cpos - nr_slots * sizeof(*de); | 443 | sinfo->slot_off = cpos - nr_slots * sizeof(*de); |
| 423 | sinfo->nr_slots = nr_slots; | 444 | sinfo->nr_slots = nr_slots; |
| @@ -425,9 +446,7 @@ Found: | |||
| 425 | sinfo->bh = bh; | 446 | sinfo->bh = bh; |
| 426 | sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de); | 447 | sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de); |
| 427 | err = 0; | 448 | err = 0; |
| 428 | EODir: | 449 | end_of_dir: |
| 429 | if (bufname) | ||
| 430 | __putname(bufname); | ||
| 431 | if (unicode) | 450 | if (unicode) |
| 432 | __putname(unicode); | 451 | __putname(unicode); |
| 433 | 452 | ||
| @@ -453,23 +472,20 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent, | |||
| 453 | struct msdos_sb_info *sbi = MSDOS_SB(sb); | 472 | struct msdos_sb_info *sbi = MSDOS_SB(sb); |
| 454 | struct buffer_head *bh; | 473 | struct buffer_head *bh; |
| 455 | struct msdos_dir_entry *de; | 474 | struct msdos_dir_entry *de; |
| 456 | struct nls_table *nls_io = sbi->nls_io; | ||
| 457 | struct nls_table *nls_disk = sbi->nls_disk; | 475 | struct nls_table *nls_disk = sbi->nls_disk; |
| 458 | unsigned char long_slots; | 476 | unsigned char nr_slots; |
| 459 | const char *fill_name; | ||
| 460 | int fill_len; | ||
| 461 | wchar_t bufuname[14]; | 477 | wchar_t bufuname[14]; |
| 462 | wchar_t *unicode = NULL; | 478 | wchar_t *unicode = NULL; |
| 463 | unsigned char c, work[MSDOS_NAME], bufname[56], *ptname = bufname; | 479 | unsigned char c, work[MSDOS_NAME]; |
| 464 | unsigned long lpos, dummy, *furrfu = &lpos; | 480 | unsigned char bufname[FAT_MAX_SHORT_SIZE], *ptname = bufname; |
| 465 | int uni_xlate = sbi->options.unicode_xlate; | 481 | unsigned short opt_shortname = sbi->options.shortname; |
| 466 | int isvfat = sbi->options.isvfat; | 482 | int isvfat = sbi->options.isvfat; |
| 467 | int utf8 = sbi->options.utf8; | ||
| 468 | int nocase = sbi->options.nocase; | 483 | int nocase = sbi->options.nocase; |
| 469 | unsigned short opt_shortname = sbi->options.shortname; | 484 | const char *fill_name = NULL; |
| 470 | unsigned long inum; | 485 | unsigned long inum; |
| 471 | int chi, chl, i, i2, j, last, last_u, dotoffset = 0; | 486 | unsigned long lpos, dummy, *furrfu = &lpos; |
| 472 | loff_t cpos; | 487 | loff_t cpos; |
| 488 | int chi, chl, i, i2, j, last, last_u, dotoffset = 0, fill_len = 0; | ||
| 473 | int ret = 0; | 489 | int ret = 0; |
| 474 | 490 | ||
| 475 | lock_super(sb); | 491 | lock_super(sb); |
| @@ -489,43 +505,58 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent, | |||
| 489 | cpos = 0; | 505 | cpos = 0; |
| 490 | } | 506 | } |
| 491 | } | 507 | } |
| 492 | if (cpos & (sizeof(struct msdos_dir_entry)-1)) { | 508 | if (cpos & (sizeof(struct msdos_dir_entry) - 1)) { |
| 493 | ret = -ENOENT; | 509 | ret = -ENOENT; |
| 494 | goto out; | 510 | goto out; |
| 495 | } | 511 | } |
| 496 | 512 | ||
| 497 | bh = NULL; | 513 | bh = NULL; |
| 498 | GetNew: | 514 | get_new: |
| 499 | if (fat_get_entry(inode, &cpos, &bh, &de) == -1) | 515 | if (fat_get_entry(inode, &cpos, &bh, &de) == -1) |
| 500 | goto EODir; | 516 | goto end_of_dir; |
| 501 | parse_record: | 517 | parse_record: |
| 502 | long_slots = 0; | 518 | nr_slots = 0; |
| 503 | /* Check for long filename entry */ | 519 | /* |
| 504 | if (isvfat) { | 520 | * Check for long filename entry, but if short_only, we don't |
| 521 | * need to parse long filename. | ||
| 522 | */ | ||
| 523 | if (isvfat && !short_only) { | ||
| 505 | if (de->name[0] == DELETED_FLAG) | 524 | if (de->name[0] == DELETED_FLAG) |
| 506 | goto RecEnd; | 525 | goto record_end; |
| 507 | if (de->attr != ATTR_EXT && (de->attr & ATTR_VOLUME)) | 526 | if (de->attr != ATTR_EXT && (de->attr & ATTR_VOLUME)) |
| 508 | goto RecEnd; | 527 | goto record_end; |
| 509 | if (de->attr != ATTR_EXT && IS_FREE(de->name)) | 528 | if (de->attr != ATTR_EXT && IS_FREE(de->name)) |
| 510 | goto RecEnd; | 529 | goto record_end; |
| 511 | } else { | 530 | } else { |
| 512 | if ((de->attr & ATTR_VOLUME) || IS_FREE(de->name)) | 531 | if ((de->attr & ATTR_VOLUME) || IS_FREE(de->name)) |
| 513 | goto RecEnd; | 532 | goto record_end; |
| 514 | } | 533 | } |
| 515 | 534 | ||
| 516 | if (isvfat && de->attr == ATTR_EXT) { | 535 | if (isvfat && de->attr == ATTR_EXT) { |
| 517 | int status = fat_parse_long(inode, &cpos, &bh, &de, | 536 | int status = fat_parse_long(inode, &cpos, &bh, &de, |
| 518 | &unicode, &long_slots); | 537 | &unicode, &nr_slots); |
| 519 | if (status < 0) { | 538 | if (status < 0) { |
| 520 | filp->f_pos = cpos; | 539 | filp->f_pos = cpos; |
| 521 | ret = status; | 540 | ret = status; |
| 522 | goto out; | 541 | goto out; |
| 523 | } else if (status == PARSE_INVALID) | 542 | } else if (status == PARSE_INVALID) |
| 524 | goto RecEnd; | 543 | goto record_end; |
| 525 | else if (status == PARSE_NOT_LONGNAME) | 544 | else if (status == PARSE_NOT_LONGNAME) |
| 526 | goto parse_record; | 545 | goto parse_record; |
| 527 | else if (status == PARSE_EOF) | 546 | else if (status == PARSE_EOF) |
| 528 | goto EODir; | 547 | goto end_of_dir; |
| 548 | |||
| 549 | if (nr_slots) { | ||
| 550 | void *longname = unicode + FAT_MAX_UNI_CHARS; | ||
| 551 | int size = PATH_MAX - FAT_MAX_UNI_SIZE; | ||
| 552 | int len = fat_uni_to_x8(sbi, unicode, longname, size); | ||
| 553 | |||
| 554 | fill_name = longname; | ||
| 555 | fill_len = len; | ||
| 556 | /* !both && !short_only, so we don't need shortname. */ | ||
| 557 | if (!both) | ||
| 558 | goto start_filldir; | ||
| 559 | } | ||
| 529 | } | 560 | } |
| 530 | 561 | ||
| 531 | if (sbi->options.dotsOK) { | 562 | if (sbi->options.dotsOK) { |
| @@ -587,12 +618,32 @@ parse_record: | |||
| 587 | } | 618 | } |
| 588 | } | 619 | } |
| 589 | if (!last) | 620 | if (!last) |
| 590 | goto RecEnd; | 621 | goto record_end; |
| 591 | 622 | ||
| 592 | i = last + dotoffset; | 623 | i = last + dotoffset; |
| 593 | j = last_u; | 624 | j = last_u; |
| 594 | 625 | ||
| 595 | lpos = cpos - (long_slots+1)*sizeof(struct msdos_dir_entry); | 626 | if (isvfat) { |
| 627 | bufuname[j] = 0x0000; | ||
| 628 | i = fat_uni_to_x8(sbi, bufuname, bufname, sizeof(bufname)); | ||
| 629 | } | ||
| 630 | if (nr_slots) { | ||
| 631 | /* hack for fat_ioctl_filldir() */ | ||
| 632 | struct fat_ioctl_filldir_callback *p = dirent; | ||
| 633 | |||
| 634 | p->longname = fill_name; | ||
| 635 | p->long_len = fill_len; | ||
| 636 | p->shortname = bufname; | ||
| 637 | p->short_len = i; | ||
| 638 | fill_name = NULL; | ||
| 639 | fill_len = 0; | ||
| 640 | } else { | ||
| 641 | fill_name = bufname; | ||
| 642 | fill_len = i; | ||
| 643 | } | ||
| 644 | |||
| 645 | start_filldir: | ||
| 646 | lpos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry); | ||
| 596 | if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) | 647 | if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) |
| 597 | inum = inode->i_ino; | 648 | inum = inode->i_ino; |
| 598 | else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) { | 649 | else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) { |
| @@ -607,49 +658,17 @@ parse_record: | |||
| 607 | inum = iunique(sb, MSDOS_ROOT_INO); | 658 | inum = iunique(sb, MSDOS_ROOT_INO); |
| 608 | } | 659 | } |
| 609 | 660 | ||
| 610 | if (isvfat) { | ||
| 611 | bufuname[j] = 0x0000; | ||
| 612 | i = utf8 ? utf8_wcstombs(bufname, bufuname, sizeof(bufname)) | ||
| 613 | : uni16_to_x8(bufname, bufuname, sizeof(bufname), uni_xlate, nls_io); | ||
| 614 | } | ||
| 615 | |||
| 616 | fill_name = bufname; | ||
| 617 | fill_len = i; | ||
| 618 | if (!short_only && long_slots) { | ||
| 619 | /* convert the unicode long name. 261 is maximum size | ||
| 620 | * of unicode buffer. (13 * slots + nul) */ | ||
| 621 | void *longname = unicode + 261; | ||
| 622 | int buf_size = PATH_MAX - (261 * sizeof(unicode[0])); | ||
| 623 | int long_len = utf8 | ||
| 624 | ? utf8_wcstombs(longname, unicode, buf_size) | ||
| 625 | : uni16_to_x8(longname, unicode, buf_size, uni_xlate, nls_io); | ||
| 626 | |||
| 627 | if (!both) { | ||
| 628 | fill_name = longname; | ||
| 629 | fill_len = long_len; | ||
| 630 | } else { | ||
| 631 | /* hack for fat_ioctl_filldir() */ | ||
| 632 | struct fat_ioctl_filldir_callback *p = dirent; | ||
| 633 | |||
| 634 | p->longname = longname; | ||
| 635 | p->long_len = long_len; | ||
| 636 | p->shortname = bufname; | ||
| 637 | p->short_len = i; | ||
| 638 | fill_name = NULL; | ||
| 639 | fill_len = 0; | ||
| 640 | } | ||
| 641 | } | ||
| 642 | if (filldir(dirent, fill_name, fill_len, *furrfu, inum, | 661 | if (filldir(dirent, fill_name, fill_len, *furrfu, inum, |
| 643 | (de->attr & ATTR_DIR) ? DT_DIR : DT_REG) < 0) | 662 | (de->attr & ATTR_DIR) ? DT_DIR : DT_REG) < 0) |
| 644 | goto FillFailed; | 663 | goto fill_failed; |
| 645 | 664 | ||
| 646 | RecEnd: | 665 | record_end: |
| 647 | furrfu = &lpos; | 666 | furrfu = &lpos; |
| 648 | filp->f_pos = cpos; | 667 | filp->f_pos = cpos; |
| 649 | goto GetNew; | 668 | goto get_new; |
| 650 | EODir: | 669 | end_of_dir: |
| 651 | filp->f_pos = cpos; | 670 | filp->f_pos = cpos; |
| 652 | FillFailed: | 671 | fill_failed: |
| 653 | brelse(bh); | 672 | brelse(bh); |
| 654 | if (unicode) | 673 | if (unicode) |
| 655 | __putname(unicode); | 674 | __putname(unicode); |
| @@ -715,7 +734,7 @@ efault: \ | |||
| 715 | return -EFAULT; \ | 734 | return -EFAULT; \ |
| 716 | } | 735 | } |
| 717 | 736 | ||
| 718 | FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, dirent) | 737 | FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent) |
| 719 | 738 | ||
| 720 | static int fat_ioctl_readdir(struct inode *inode, struct file *filp, | 739 | static int fat_ioctl_readdir(struct inode *inode, struct file *filp, |
| 721 | void __user *dirent, filldir_t filldir, | 740 | void __user *dirent, filldir_t filldir, |
| @@ -741,7 +760,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *filp, | |||
| 741 | static int fat_dir_ioctl(struct inode *inode, struct file *filp, | 760 | static int fat_dir_ioctl(struct inode *inode, struct file *filp, |
| 742 | unsigned int cmd, unsigned long arg) | 761 | unsigned int cmd, unsigned long arg) |
| 743 | { | 762 | { |
| 744 | struct dirent __user *d1 = (struct dirent __user *)arg; | 763 | struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg; |
| 745 | int short_only, both; | 764 | int short_only, both; |
| 746 | 765 | ||
| 747 | switch (cmd) { | 766 | switch (cmd) { |
| @@ -757,7 +776,7 @@ static int fat_dir_ioctl(struct inode *inode, struct file *filp, | |||
| 757 | return fat_generic_ioctl(inode, filp, cmd, arg); | 776 | return fat_generic_ioctl(inode, filp, cmd, arg); |
| 758 | } | 777 | } |
| 759 | 778 | ||
| 760 | if (!access_ok(VERIFY_WRITE, d1, sizeof(struct dirent[2]))) | 779 | if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2]))) |
| 761 | return -EFAULT; | 780 | return -EFAULT; |
| 762 | /* | 781 | /* |
| 763 | * Yes, we don't need this put_user() absolutely. However old | 782 | * Yes, we don't need this put_user() absolutely. However old |
| @@ -1082,7 +1101,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts) | |||
| 1082 | goto error_free; | 1101 | goto error_free; |
| 1083 | } | 1102 | } |
| 1084 | 1103 | ||
| 1085 | fat_date_unix2dos(ts->tv_sec, &time, &date); | 1104 | fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc); |
| 1086 | 1105 | ||
| 1087 | de = (struct msdos_dir_entry *)bhs[0]->b_data; | 1106 | de = (struct msdos_dir_entry *)bhs[0]->b_data; |
| 1088 | /* filling the new directory slots ("." and ".." entries) */ | 1107 | /* filling the new directory slots ("." and ".." entries) */ |
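Earlier hunks in this file switch fat_dir_ioctl() and fat_ioctl_filldir() from struct dirent to the dedicated struct __fat_dirent. A hedged userspace sketch of how that ioctl is driven, assuming <linux/msdos_fs.h> exports struct __fat_dirent and VFAT_IOCTL_READDIR_BOTH and that /mnt/vfat is a mounted FAT directory; as I read the interface, entry 0 carries the short name, entry 1 the long one (left empty when the file has no separate long name), and the ioctl keeps returning a positive value while entries remain:

#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/msdos_fs.h>

int main(void)
{
        struct __fat_dirent de[2];      /* [0] short name, [1] long name */
        int fd = open("/mnt/vfat", O_RDONLY | O_DIRECTORY);

        if (fd < 0)
                return 1;
        while (ioctl(fd, VFAT_IOCTL_READDIR_BOTH, de) > 0)
                printf("%-12s  %s\n", de[0].d_name,
                       de[1].d_name[0] ? de[1].d_name : de[0].d_name);
        close(fd);
        return 0;
}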
diff --git a/fs/fat/file.c b/fs/fat/file.c index c672df4036e9..ddde37025ca6 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c | |||
| @@ -15,6 +15,8 @@ | |||
| 15 | #include <linux/writeback.h> | 15 | #include <linux/writeback.h> |
| 16 | #include <linux/backing-dev.h> | 16 | #include <linux/backing-dev.h> |
| 17 | #include <linux/blkdev.h> | 17 | #include <linux/blkdev.h> |
| 18 | #include <linux/fsnotify.h> | ||
| 19 | #include <linux/security.h> | ||
| 18 | 20 | ||
| 19 | int fat_generic_ioctl(struct inode *inode, struct file *filp, | 21 | int fat_generic_ioctl(struct inode *inode, struct file *filp, |
| 20 | unsigned int cmd, unsigned long arg) | 22 | unsigned int cmd, unsigned long arg) |
| @@ -64,6 +66,7 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp, | |||
| 64 | 66 | ||
| 65 | /* Equivalent to a chmod() */ | 67 | /* Equivalent to a chmod() */ |
| 66 | ia.ia_valid = ATTR_MODE | ATTR_CTIME; | 68 | ia.ia_valid = ATTR_MODE | ATTR_CTIME; |
| 69 | ia.ia_ctime = current_fs_time(inode->i_sb); | ||
| 67 | if (is_dir) { | 70 | if (is_dir) { |
| 68 | ia.ia_mode = MSDOS_MKMODE(attr, | 71 | ia.ia_mode = MSDOS_MKMODE(attr, |
| 69 | S_IRWXUGO & ~sbi->options.fs_dmask) | 72 | S_IRWXUGO & ~sbi->options.fs_dmask) |
| @@ -90,11 +93,21 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp, | |||
| 90 | } | 93 | } |
| 91 | } | 94 | } |
| 92 | 95 | ||
| 96 | /* | ||
| 97 | * The security check is questionable... We single | ||
| 98 | * out the RO attribute for checking by the security | ||
| 99 | * module, just because it maps to a file mode. | ||
| 100 | */ | ||
| 101 | err = security_inode_setattr(filp->f_path.dentry, &ia); | ||
| 102 | if (err) | ||
| 103 | goto up; | ||
| 104 | |||
| 93 | /* This MUST be done before doing anything irreversible... */ | 105 | /* This MUST be done before doing anything irreversible... */ |
| 94 | err = notify_change(filp->f_path.dentry, &ia); | 106 | err = fat_setattr(filp->f_path.dentry, &ia); |
| 95 | if (err) | 107 | if (err) |
| 96 | goto up; | 108 | goto up; |
| 97 | 109 | ||
| 110 | fsnotify_change(filp->f_path.dentry, ia.ia_valid); | ||
| 98 | if (sbi->options.sys_immutable) { | 111 | if (sbi->options.sys_immutable) { |
| 99 | if (attr & ATTR_SYS) | 112 | if (attr & ATTR_SYS) |
| 100 | inode->i_flags |= S_IMMUTABLE; | 113 | inode->i_flags |= S_IMMUTABLE; |
| @@ -300,6 +313,8 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode) | |||
| 300 | return 0; | 313 | return 0; |
| 301 | } | 314 | } |
| 302 | 315 | ||
| 316 | #define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET) | ||
| 317 | |||
| 303 | int fat_setattr(struct dentry *dentry, struct iattr *attr) | 318 | int fat_setattr(struct dentry *dentry, struct iattr *attr) |
| 304 | { | 319 | { |
| 305 | struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); | 320 | struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); |
| @@ -323,9 +338,9 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) | |||
| 323 | 338 | ||
| 324 | /* Check for setting the inode time. */ | 339 | /* Check for setting the inode time. */ |
| 325 | ia_valid = attr->ia_valid; | 340 | ia_valid = attr->ia_valid; |
| 326 | if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET)) { | 341 | if (ia_valid & TIMES_SET_FLAGS) { |
| 327 | if (fat_allow_set_time(sbi, inode)) | 342 | if (fat_allow_set_time(sbi, inode)) |
| 328 | attr->ia_valid &= ~(ATTR_MTIME_SET | ATTR_ATIME_SET); | 343 | attr->ia_valid &= ~TIMES_SET_FLAGS; |
| 329 | } | 344 | } |
| 330 | 345 | ||
| 331 | error = inode_change_ok(inode, attr); | 346 | error = inode_change_ok(inode, attr); |
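TIMES_SET_FLAGS folds the new ATTR_TIMES_SET bit into the existing ATTR_MTIME_SET/ATTR_ATIME_SET check, so the allow_utime exception keeps covering the utimensat() path. A small user-space illustration of a call that should reach fat_setattr() with these bits set; the path is hypothetical:

/* Sketch: set explicit timestamps on a FAT file with utimensat(). */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <time.h>
#include <sys/stat.h>

int main(void)
{
	struct timespec times[2] = {
		{ .tv_sec = 1215000000 },	/* atime */
		{ .tv_sec = 1215000000 },	/* mtime */
	};

	if (utimensat(AT_FDCWD, "/mnt/usb/test.txt", times, 0) < 0)
		perror("utimensat");
	return 0;
}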
diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 46a4508ffd2e..80ff3381fa21 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c | |||
| @@ -382,17 +382,20 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) | |||
| 382 | inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1)) | 382 | inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1)) |
| 383 | & ~((loff_t)sbi->cluster_size - 1)) >> 9; | 383 | & ~((loff_t)sbi->cluster_size - 1)) >> 9; |
| 384 | inode->i_mtime.tv_sec = | 384 | inode->i_mtime.tv_sec = |
| 385 | date_dos2unix(le16_to_cpu(de->time), le16_to_cpu(de->date)); | 385 | date_dos2unix(le16_to_cpu(de->time), le16_to_cpu(de->date), |
| 386 | sbi->options.tz_utc); | ||
| 386 | inode->i_mtime.tv_nsec = 0; | 387 | inode->i_mtime.tv_nsec = 0; |
| 387 | if (sbi->options.isvfat) { | 388 | if (sbi->options.isvfat) { |
| 388 | int secs = de->ctime_cs / 100; | 389 | int secs = de->ctime_cs / 100; |
| 389 | int csecs = de->ctime_cs % 100; | 390 | int csecs = de->ctime_cs % 100; |
| 390 | inode->i_ctime.tv_sec = | 391 | inode->i_ctime.tv_sec = |
| 391 | date_dos2unix(le16_to_cpu(de->ctime), | 392 | date_dos2unix(le16_to_cpu(de->ctime), |
| 392 | le16_to_cpu(de->cdate)) + secs; | 393 | le16_to_cpu(de->cdate), |
| 394 | sbi->options.tz_utc) + secs; | ||
| 393 | inode->i_ctime.tv_nsec = csecs * 10000000; | 395 | inode->i_ctime.tv_nsec = csecs * 10000000; |
| 394 | inode->i_atime.tv_sec = | 396 | inode->i_atime.tv_sec = |
| 395 | date_dos2unix(0, le16_to_cpu(de->adate)); | 397 | date_dos2unix(0, le16_to_cpu(de->adate), |
| 398 | sbi->options.tz_utc); | ||
| 396 | inode->i_atime.tv_nsec = 0; | 399 | inode->i_atime.tv_nsec = 0; |
| 397 | } else | 400 | } else |
| 398 | inode->i_ctime = inode->i_atime = inode->i_mtime; | 401 | inode->i_ctime = inode->i_atime = inode->i_mtime; |
| @@ -495,7 +498,7 @@ static void fat_destroy_inode(struct inode *inode) | |||
| 495 | kmem_cache_free(fat_inode_cachep, MSDOS_I(inode)); | 498 | kmem_cache_free(fat_inode_cachep, MSDOS_I(inode)); |
| 496 | } | 499 | } |
| 497 | 500 | ||
| 498 | static void init_once(struct kmem_cache *cachep, void *foo) | 501 | static void init_once(void *foo) |
| 499 | { | 502 | { |
| 500 | struct msdos_inode_info *ei = (struct msdos_inode_info *)foo; | 503 | struct msdos_inode_info *ei = (struct msdos_inode_info *)foo; |
| 501 | 504 | ||
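The init_once() signature change here (and in the fuse and gfs2 hunks further down) follows the slab API dropping the struct kmem_cache argument from constructors. A minimal sketch of registering a cache against the one-argument constructor; all foo_* names are hypothetical:

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/slab.h>

struct foo_inode_info {			/* hypothetical per-inode structure */
	struct inode vfs_inode;
};

static struct kmem_cache *foo_cachep;

static void foo_init_once(void *ptr)	/* new one-argument ctor signature */
{
	struct foo_inode_info *fi = ptr;

	inode_init_once(&fi->vfs_inode);
}

static int __init foo_init_cache(void)
{
	foo_cachep = kmem_cache_create("foo_inode_cache",
				       sizeof(struct foo_inode_info), 0,
				       SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
				       foo_init_once);
	return foo_cachep ? 0 : -ENOMEM;
}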
| @@ -559,26 +562,23 @@ static int fat_write_inode(struct inode *inode, int wait) | |||
| 559 | struct buffer_head *bh; | 562 | struct buffer_head *bh; |
| 560 | struct msdos_dir_entry *raw_entry; | 563 | struct msdos_dir_entry *raw_entry; |
| 561 | loff_t i_pos; | 564 | loff_t i_pos; |
| 562 | int err = 0; | 565 | int err; |
| 563 | 566 | ||
| 564 | retry: | 567 | retry: |
| 565 | i_pos = MSDOS_I(inode)->i_pos; | 568 | i_pos = MSDOS_I(inode)->i_pos; |
| 566 | if (inode->i_ino == MSDOS_ROOT_INO || !i_pos) | 569 | if (inode->i_ino == MSDOS_ROOT_INO || !i_pos) |
| 567 | return 0; | 570 | return 0; |
| 568 | 571 | ||
| 569 | lock_super(sb); | ||
| 570 | bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits); | 572 | bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits); |
| 571 | if (!bh) { | 573 | if (!bh) { |
| 572 | printk(KERN_ERR "FAT: unable to read inode block " | 574 | printk(KERN_ERR "FAT: unable to read inode block " |
| 573 | "for updating (i_pos %lld)\n", i_pos); | 575 | "for updating (i_pos %lld)\n", i_pos); |
| 574 | err = -EIO; | 576 | return -EIO; |
| 575 | goto out; | ||
| 576 | } | 577 | } |
| 577 | spin_lock(&sbi->inode_hash_lock); | 578 | spin_lock(&sbi->inode_hash_lock); |
| 578 | if (i_pos != MSDOS_I(inode)->i_pos) { | 579 | if (i_pos != MSDOS_I(inode)->i_pos) { |
| 579 | spin_unlock(&sbi->inode_hash_lock); | 580 | spin_unlock(&sbi->inode_hash_lock); |
| 580 | brelse(bh); | 581 | brelse(bh); |
| 581 | unlock_super(sb); | ||
| 582 | goto retry; | 582 | goto retry; |
| 583 | } | 583 | } |
| 584 | 584 | ||
| @@ -591,21 +591,23 @@ retry: | |||
| 591 | raw_entry->attr = fat_attr(inode); | 591 | raw_entry->attr = fat_attr(inode); |
| 592 | raw_entry->start = cpu_to_le16(MSDOS_I(inode)->i_logstart); | 592 | raw_entry->start = cpu_to_le16(MSDOS_I(inode)->i_logstart); |
| 593 | raw_entry->starthi = cpu_to_le16(MSDOS_I(inode)->i_logstart >> 16); | 593 | raw_entry->starthi = cpu_to_le16(MSDOS_I(inode)->i_logstart >> 16); |
| 594 | fat_date_unix2dos(inode->i_mtime.tv_sec, &raw_entry->time, &raw_entry->date); | 594 | fat_date_unix2dos(inode->i_mtime.tv_sec, &raw_entry->time, |
| 595 | &raw_entry->date, sbi->options.tz_utc); | ||
| 595 | if (sbi->options.isvfat) { | 596 | if (sbi->options.isvfat) { |
| 596 | __le16 atime; | 597 | __le16 atime; |
| 597 | fat_date_unix2dos(inode->i_ctime.tv_sec,&raw_entry->ctime,&raw_entry->cdate); | 598 | fat_date_unix2dos(inode->i_ctime.tv_sec, &raw_entry->ctime, |
| 598 | fat_date_unix2dos(inode->i_atime.tv_sec,&atime,&raw_entry->adate); | 599 | &raw_entry->cdate, sbi->options.tz_utc); |
| 600 | fat_date_unix2dos(inode->i_atime.tv_sec, &atime, | ||
| 601 | &raw_entry->adate, sbi->options.tz_utc); | ||
| 599 | raw_entry->ctime_cs = (inode->i_ctime.tv_sec & 1) * 100 + | 602 | raw_entry->ctime_cs = (inode->i_ctime.tv_sec & 1) * 100 + |
| 600 | inode->i_ctime.tv_nsec / 10000000; | 603 | inode->i_ctime.tv_nsec / 10000000; |
| 601 | } | 604 | } |
| 602 | spin_unlock(&sbi->inode_hash_lock); | 605 | spin_unlock(&sbi->inode_hash_lock); |
| 603 | mark_buffer_dirty(bh); | 606 | mark_buffer_dirty(bh); |
| 607 | err = 0; | ||
| 604 | if (wait) | 608 | if (wait) |
| 605 | err = sync_dirty_buffer(bh); | 609 | err = sync_dirty_buffer(bh); |
| 606 | brelse(bh); | 610 | brelse(bh); |
| 607 | out: | ||
| 608 | unlock_super(sb); | ||
| 609 | return err; | 611 | return err; |
| 610 | } | 612 | } |
| 611 | 613 | ||
| @@ -836,6 +838,8 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt) | |||
| 836 | } | 838 | } |
| 837 | if (sbi->options.flush) | 839 | if (sbi->options.flush) |
| 838 | seq_puts(m, ",flush"); | 840 | seq_puts(m, ",flush"); |
| 841 | if (opts->tz_utc) | ||
| 842 | seq_puts(m, ",tz=UTC"); | ||
| 839 | 843 | ||
| 840 | return 0; | 844 | return 0; |
| 841 | } | 845 | } |
| @@ -848,7 +852,7 @@ enum { | |||
| 848 | Opt_charset, Opt_shortname_lower, Opt_shortname_win95, | 852 | Opt_charset, Opt_shortname_lower, Opt_shortname_win95, |
| 849 | Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes, | 853 | Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes, |
| 850 | Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, | 854 | Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, |
| 851 | Opt_obsolate, Opt_flush, Opt_err, | 855 | Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_err, |
| 852 | }; | 856 | }; |
| 853 | 857 | ||
| 854 | static match_table_t fat_tokens = { | 858 | static match_table_t fat_tokens = { |
| @@ -883,6 +887,7 @@ static match_table_t fat_tokens = { | |||
| 883 | {Opt_obsolate, "cvf_options=%100s"}, | 887 | {Opt_obsolate, "cvf_options=%100s"}, |
| 884 | {Opt_obsolate, "posix"}, | 888 | {Opt_obsolate, "posix"}, |
| 885 | {Opt_flush, "flush"}, | 889 | {Opt_flush, "flush"}, |
| 890 | {Opt_tz_utc, "tz=UTC"}, | ||
| 886 | {Opt_err, NULL}, | 891 | {Opt_err, NULL}, |
| 887 | }; | 892 | }; |
| 888 | static match_table_t msdos_tokens = { | 893 | static match_table_t msdos_tokens = { |
| @@ -947,10 +952,11 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, | |||
| 947 | opts->utf8 = opts->unicode_xlate = 0; | 952 | opts->utf8 = opts->unicode_xlate = 0; |
| 948 | opts->numtail = 1; | 953 | opts->numtail = 1; |
| 949 | opts->usefree = opts->nocase = 0; | 954 | opts->usefree = opts->nocase = 0; |
| 955 | opts->tz_utc = 0; | ||
| 950 | *debug = 0; | 956 | *debug = 0; |
| 951 | 957 | ||
| 952 | if (!options) | 958 | if (!options) |
| 953 | return 0; | 959 | goto out; |
| 954 | 960 | ||
| 955 | while ((p = strsep(&options, ",")) != NULL) { | 961 | while ((p = strsep(&options, ",")) != NULL) { |
| 956 | int token; | 962 | int token; |
| @@ -1036,6 +1042,9 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, | |||
| 1036 | case Opt_flush: | 1042 | case Opt_flush: |
| 1037 | opts->flush = 1; | 1043 | opts->flush = 1; |
| 1038 | break; | 1044 | break; |
| 1045 | case Opt_tz_utc: | ||
| 1046 | opts->tz_utc = 1; | ||
| 1047 | break; | ||
| 1039 | 1048 | ||
| 1040 | /* msdos specific */ | 1049 | /* msdos specific */ |
| 1041 | case Opt_dots: | 1050 | case Opt_dots: |
| @@ -1104,10 +1113,13 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, | |||
| 1104 | return -EINVAL; | 1113 | return -EINVAL; |
| 1105 | } | 1114 | } |
| 1106 | } | 1115 | } |
| 1116 | |||
| 1117 | out: | ||
| 1107 | /* UTF-8 doesn't provide FAT semantics */ | 1118 | /* UTF-8 doesn't provide FAT semantics */ |
| 1108 | if (!strcmp(opts->iocharset, "utf8")) { | 1119 | if (!strcmp(opts->iocharset, "utf8")) { |
| 1109 | printk(KERN_ERR "FAT: utf8 is not a recommended IO charset" | 1120 | printk(KERN_ERR "FAT: utf8 is not a recommended IO charset" |
| 1110 | " for FAT filesystems, filesystem will be case sensitive!\n"); | 1121 | " for FAT filesystems, filesystem will be " |
| 1122 | "case sensitive!\n"); | ||
| 1111 | } | 1123 | } |
| 1112 | 1124 | ||
| 1113 | /* If user doesn't specify allow_utime, it's initialized from dmask. */ | 1125 | /* If user doesn't specify allow_utime, it's initialized from dmask. */ |
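The new tz=UTC token is parsed with the usual match_table_t/match_token() machinery from <linux/parser.h>, just like the existing options. A condensed, self-contained sketch of that pattern; the foo_* names and the bar=%u option are hypothetical:

#include <linux/errno.h>
#include <linux/parser.h>
#include <linux/string.h>

enum { Opt_bar, Opt_err };

static match_table_t foo_tokens = {
	{Opt_bar, "bar=%u"},
	{Opt_err, NULL},
};

static int foo_parse(char *options, unsigned int *bar)
{
	substring_t args[MAX_OPT_ARGS];
	char *p;
	int option;

	while ((p = strsep(&options, ",")) != NULL) {
		if (!*p)
			continue;
		switch (match_token(p, foo_tokens, args)) {
		case Opt_bar:
			if (match_int(&args[0], &option))
				return -EINVAL;
			*bar = option;
			break;
		default:
			return -EINVAL;
		}
	}
	return 0;
}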
diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 61f23511eacf..79fb98ad36d4 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c | |||
| @@ -142,7 +142,7 @@ static int day_n[] = { | |||
| 142 | }; | 142 | }; |
| 143 | 143 | ||
| 144 | /* Convert a MS-DOS time/date pair to a UNIX date (seconds since 1 1 70). */ | 144 | /* Convert a MS-DOS time/date pair to a UNIX date (seconds since 1 1 70). */ |
| 145 | int date_dos2unix(unsigned short time, unsigned short date) | 145 | int date_dos2unix(unsigned short time, unsigned short date, int tz_utc) |
| 146 | { | 146 | { |
| 147 | int month, year, secs; | 147 | int month, year, secs; |
| 148 | 148 | ||
| @@ -156,16 +156,18 @@ int date_dos2unix(unsigned short time, unsigned short date) | |||
| 156 | ((date & 31)-1+day_n[month]+(year/4)+year*365-((year & 3) == 0 && | 156 | ((date & 31)-1+day_n[month]+(year/4)+year*365-((year & 3) == 0 && |
| 157 | month < 2 ? 1 : 0)+3653); | 157 | month < 2 ? 1 : 0)+3653); |
| 158 | /* days since 1.1.70 plus 80's leap day */ | 158 | /* days since 1.1.70 plus 80's leap day */ |
| 159 | secs += sys_tz.tz_minuteswest*60; | 159 | if (!tz_utc) |
| 160 | secs += sys_tz.tz_minuteswest*60; | ||
| 160 | return secs; | 161 | return secs; |
| 161 | } | 162 | } |
| 162 | 163 | ||
| 163 | /* Convert linear UNIX date to a MS-DOS time/date pair. */ | 164 | /* Convert linear UNIX date to a MS-DOS time/date pair. */ |
| 164 | void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date) | 165 | void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date, int tz_utc) |
| 165 | { | 166 | { |
| 166 | int day, year, nl_day, month; | 167 | int day, year, nl_day, month; |
| 167 | 168 | ||
| 168 | unix_date -= sys_tz.tz_minuteswest*60; | 169 | if (!tz_utc) |
| 170 | unix_date -= sys_tz.tz_minuteswest*60; | ||
| 169 | 171 | ||
| 170 | /* Jan 1 GMT 00:00:00 1980. But what about another time zone? */ | 172 | /* Jan 1 GMT 00:00:00 1980. But what about another time zone? */ |
| 171 | if (unix_date < 315532800) | 173 | if (unix_date < 315532800) |
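With tz=UTC the on-disk DOS timestamps are treated as UTC rather than shifted by sys_tz, so they stay stable across machines with different timezone settings; the default (local time) remains what Windows-formatted media usually expects. Usage is simply something like "mount -t vfat -o tz=UTC /dev/sdb1 /mnt". A user-space sketch of the same conversion rule; the bit layout is the standard FAT encoding, and timegm() is a glibc extension standing in for the kernel's calendar math:

/* Sketch: DOS time/date -> Unix time, mirroring the tz_utc logic above. */
#define _GNU_SOURCE			/* for timegm() */
#include <time.h>

time_t dos_to_unix(unsigned short dtime, unsigned short ddate,
		   int tz_utc, int tz_minuteswest)
{
	struct tm tm = {
		.tm_sec  = (dtime & 0x1f) * 2,
		.tm_min  = (dtime >> 5) & 0x3f,
		.tm_hour = dtime >> 11,
		.tm_mday = ddate & 0x1f,
		.tm_mon  = ((ddate >> 5) & 0xf) - 1,	/* 0-based month */
		.tm_year = (ddate >> 9) + 80,		/* DOS epoch is 1980 */
	};
	time_t secs = timegm(&tm);	/* interpret the fields as UTC */

	if (!tz_utc)			/* default: on-disk stamps are local time */
		secs += tz_minuteswest * 60;
	return secs;			/* tz=UTC: nothing more to do */
}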
diff --git a/fs/fcntl.c b/fs/fcntl.c index 9679fcbdeaa0..ac4f7db9f134 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c | |||
| @@ -49,82 +49,6 @@ static int get_close_on_exec(unsigned int fd) | |||
| 49 | return res; | 49 | return res; |
| 50 | } | 50 | } |
| 51 | 51 | ||
| 52 | /* | ||
| 53 | * locate_fd finds a free file descriptor in the open_fds fdset, | ||
| 54 | * expanding the fd arrays if necessary. Must be called with the | ||
| 55 | * file_lock held for write. | ||
| 56 | */ | ||
| 57 | |||
| 58 | static int locate_fd(unsigned int orig_start, int cloexec) | ||
| 59 | { | ||
| 60 | struct files_struct *files = current->files; | ||
| 61 | unsigned int newfd; | ||
| 62 | unsigned int start; | ||
| 63 | int error; | ||
| 64 | struct fdtable *fdt; | ||
| 65 | |||
| 66 | spin_lock(&files->file_lock); | ||
| 67 | |||
| 68 | error = -EINVAL; | ||
| 69 | if (orig_start >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) | ||
| 70 | goto out; | ||
| 71 | |||
| 72 | repeat: | ||
| 73 | fdt = files_fdtable(files); | ||
| 74 | /* | ||
| 75 | * Someone might have closed fd's in the range | ||
| 76 | * orig_start..fdt->next_fd | ||
| 77 | */ | ||
| 78 | start = orig_start; | ||
| 79 | if (start < files->next_fd) | ||
| 80 | start = files->next_fd; | ||
| 81 | |||
| 82 | newfd = start; | ||
| 83 | if (start < fdt->max_fds) | ||
| 84 | newfd = find_next_zero_bit(fdt->open_fds->fds_bits, | ||
| 85 | fdt->max_fds, start); | ||
| 86 | |||
| 87 | error = -EMFILE; | ||
| 88 | if (newfd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) | ||
| 89 | goto out; | ||
| 90 | |||
| 91 | error = expand_files(files, newfd); | ||
| 92 | if (error < 0) | ||
| 93 | goto out; | ||
| 94 | |||
| 95 | /* | ||
| 96 | * If we needed to expand the fs array we | ||
| 97 | * might have blocked - try again. | ||
| 98 | */ | ||
| 99 | if (error) | ||
| 100 | goto repeat; | ||
| 101 | |||
| 102 | if (start <= files->next_fd) | ||
| 103 | files->next_fd = newfd + 1; | ||
| 104 | |||
| 105 | FD_SET(newfd, fdt->open_fds); | ||
| 106 | if (cloexec) | ||
| 107 | FD_SET(newfd, fdt->close_on_exec); | ||
| 108 | else | ||
| 109 | FD_CLR(newfd, fdt->close_on_exec); | ||
| 110 | error = newfd; | ||
| 111 | |||
| 112 | out: | ||
| 113 | spin_unlock(&files->file_lock); | ||
| 114 | return error; | ||
| 115 | } | ||
| 116 | |||
| 117 | static int dupfd(struct file *file, unsigned int start, int cloexec) | ||
| 118 | { | ||
| 119 | int fd = locate_fd(start, cloexec); | ||
| 120 | if (fd >= 0) | ||
| 121 | fd_install(fd, file); | ||
| 122 | else | ||
| 123 | fput(file); | ||
| 124 | |||
| 125 | return fd; | ||
| 126 | } | ||
| 127 | |||
| 128 | asmlinkage long sys_dup3(unsigned int oldfd, unsigned int newfd, int flags) | 52 | asmlinkage long sys_dup3(unsigned int oldfd, unsigned int newfd, int flags) |
| 129 | { | 53 | { |
| 130 | int err = -EBADF; | 54 | int err = -EBADF; |
| @@ -135,35 +59,39 @@ asmlinkage long sys_dup3(unsigned int oldfd, unsigned int newfd, int flags) | |||
| 135 | if ((flags & ~O_CLOEXEC) != 0) | 59 | if ((flags & ~O_CLOEXEC) != 0) |
| 136 | return -EINVAL; | 60 | return -EINVAL; |
| 137 | 61 | ||
| 138 | spin_lock(&files->file_lock); | 62 | if (unlikely(oldfd == newfd)) |
| 139 | if (!(file = fcheck(oldfd))) | 63 | return -EINVAL; |
| 140 | goto out_unlock; | ||
| 141 | err = newfd; | ||
| 142 | if (newfd == oldfd) | ||
| 143 | goto out_unlock; | ||
| 144 | err = -EBADF; | ||
| 145 | if (newfd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) | ||
| 146 | goto out_unlock; | ||
| 147 | get_file(file); /* We are now finished with oldfd */ | ||
| 148 | 64 | ||
| 65 | spin_lock(&files->file_lock); | ||
| 149 | err = expand_files(files, newfd); | 66 | err = expand_files(files, newfd); |
| 150 | if (err < 0) | 67 | file = fcheck(oldfd); |
| 151 | goto out_fput; | 68 | if (unlikely(!file)) |
| 152 | 69 | goto Ebadf; | |
| 153 | /* To avoid races with open() and dup(), we will mark the fd as | 70 | if (unlikely(err < 0)) { |
| 154 | * in-use in the open-file bitmap throughout the entire dup2() | 71 | if (err == -EMFILE) |
| 155 | * process. This is quite safe: do_close() uses the fd array | 72 | goto Ebadf; |
| 156 | * entry, not the bitmap, to decide what work needs to be | 73 | goto out_unlock; |
| 157 | * done. --sct */ | 74 | } |
| 158 | /* Doesn't work. open() might be there first. --AV */ | 75 | /* |
| 159 | 76 | * We need to detect attempts to do dup2() over allocated but still | |
| 160 | /* Yes. It's a race. In user space. Nothing sane to do */ | 77 | * not finished descriptor. NB: OpenBSD avoids that at the price of |
| 78 | * extra work in their equivalent of fget() - they insert struct | ||
| 79 | * file immediately after grabbing descriptor, mark it larval if | ||
| 80 | * more work (e.g. actual opening) is needed and make sure that | ||
| 81 | * fget() treats larval files as absent. Potentially interesting, | ||
| 82 | * but while extra work in fget() is trivial, locking implications | ||
| 83 | * and amount of surgery on open()-related paths in VFS are not. | ||
| 84 | * FreeBSD fails with -EBADF in the same situation, NetBSD "solution" | ||
| 85 | * deadlocks in rather amusing ways, AFAICS. All of that is out of | ||
| 86 | * scope of POSIX or SUS, since neither considers shared descriptor | ||
| 87 | * tables and this condition does not arise without those. | ||
| 88 | */ | ||
| 161 | err = -EBUSY; | 89 | err = -EBUSY; |
| 162 | fdt = files_fdtable(files); | 90 | fdt = files_fdtable(files); |
| 163 | tofree = fdt->fd[newfd]; | 91 | tofree = fdt->fd[newfd]; |
| 164 | if (!tofree && FD_ISSET(newfd, fdt->open_fds)) | 92 | if (!tofree && FD_ISSET(newfd, fdt->open_fds)) |
| 165 | goto out_fput; | 93 | goto out_unlock; |
| 166 | 94 | get_file(file); | |
| 167 | rcu_assign_pointer(fdt->fd[newfd], file); | 95 | rcu_assign_pointer(fdt->fd[newfd], file); |
| 168 | FD_SET(newfd, fdt->open_fds); | 96 | FD_SET(newfd, fdt->open_fds); |
| 169 | if (flags & O_CLOEXEC) | 97 | if (flags & O_CLOEXEC) |
| @@ -174,31 +102,41 @@ asmlinkage long sys_dup3(unsigned int oldfd, unsigned int newfd, int flags) | |||
| 174 | 102 | ||
| 175 | if (tofree) | 103 | if (tofree) |
| 176 | filp_close(tofree, files); | 104 | filp_close(tofree, files); |
| 177 | err = newfd; | ||
| 178 | out: | ||
| 179 | return err; | ||
| 180 | out_unlock: | ||
| 181 | spin_unlock(&files->file_lock); | ||
| 182 | goto out; | ||
| 183 | 105 | ||
| 184 | out_fput: | 106 | return newfd; |
| 107 | |||
| 108 | Ebadf: | ||
| 109 | err = -EBADF; | ||
| 110 | out_unlock: | ||
| 185 | spin_unlock(&files->file_lock); | 111 | spin_unlock(&files->file_lock); |
| 186 | fput(file); | 112 | return err; |
| 187 | goto out; | ||
| 188 | } | 113 | } |
| 189 | 114 | ||
| 190 | asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd) | 115 | asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd) |
| 191 | { | 116 | { |
| 117 | if (unlikely(newfd == oldfd)) { /* corner case */ | ||
| 118 | struct files_struct *files = current->files; | ||
| 119 | rcu_read_lock(); | ||
| 120 | if (!fcheck_files(files, oldfd)) | ||
| 121 | oldfd = -EBADF; | ||
| 122 | rcu_read_unlock(); | ||
| 123 | return oldfd; | ||
| 124 | } | ||
| 192 | return sys_dup3(oldfd, newfd, 0); | 125 | return sys_dup3(oldfd, newfd, 0); |
| 193 | } | 126 | } |
| 194 | 127 | ||
| 195 | asmlinkage long sys_dup(unsigned int fildes) | 128 | asmlinkage long sys_dup(unsigned int fildes) |
| 196 | { | 129 | { |
| 197 | int ret = -EBADF; | 130 | int ret = -EBADF; |
| 198 | struct file * file = fget(fildes); | 131 | struct file *file = fget(fildes); |
| 199 | 132 | ||
| 200 | if (file) | 133 | if (file) { |
| 201 | ret = dupfd(file, 0, 0); | 134 | ret = get_unused_fd(); |
| 135 | if (ret >= 0) | ||
| 136 | fd_install(ret, file); | ||
| 137 | else | ||
| 138 | fput(file); | ||
| 139 | } | ||
| 202 | return ret; | 140 | return ret; |
| 203 | } | 141 | } |
| 204 | 142 | ||
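sys_dup2() now handles the oldfd == newfd corner case itself, returning the descriptor if it is valid and -EBADF otherwise, while sys_dup3() rejects equal descriptors with -EINVAL as the new dup3() semantics require. A user-space illustration of the difference; dup3() needs a glibc new enough to expose it:

#define _GNU_SOURCE			/* for dup3() */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/null", O_RDONLY);

	printf("dup2(fd, fd)    = %d\n", dup2(fd, fd));		/* returns fd */
	printf("dup3(fd, fd, 0) = %d (errno %d)\n",
	       dup3(fd, fd, 0), errno);				/* -1, EINVAL */
	printf("dup2(42, 42)    = %d (errno %d)\n",
	       dup2(42, 42), errno);				/* -1, EBADF if 42 is closed */
	return 0;
}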
| @@ -321,8 +259,13 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, | |||
| 321 | switch (cmd) { | 259 | switch (cmd) { |
| 322 | case F_DUPFD: | 260 | case F_DUPFD: |
| 323 | case F_DUPFD_CLOEXEC: | 261 | case F_DUPFD_CLOEXEC: |
| 324 | get_file(filp); | 262 | if (arg >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) |
| 325 | err = dupfd(filp, arg, cmd == F_DUPFD_CLOEXEC); | 263 | break; |
| 264 | err = alloc_fd(arg, cmd == F_DUPFD_CLOEXEC ? O_CLOEXEC : 0); | ||
| 265 | if (err >= 0) { | ||
| 266 | get_file(filp); | ||
| 267 | fd_install(err, filp); | ||
| 268 | } | ||
| 326 | break; | 269 | break; |
| 327 | case F_GETFD: | 270 | case F_GETFD: |
| 328 | err = get_close_on_exec(fd) ? FD_CLOEXEC : 0; | 271 | err = get_close_on_exec(fd) ? FD_CLOEXEC : 0; |
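F_DUPFD and F_DUPFD_CLOEXEC now check RLIMIT_NOFILE up front and allocate the descriptor with alloc_fd()/fd_install() instead of the removed dupfd() helper; the user-visible behaviour should be unchanged. For example:

#define _GNU_SOURCE			/* for F_DUPFD_CLOEXEC */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = fcntl(STDIN_FILENO, F_DUPFD_CLOEXEC, 10);

	if (fd < 0)
		perror("F_DUPFD_CLOEXEC");	/* EINVAL if 10 >= RLIMIT_NOFILE */
	else
		printf("got fd %d, FD_CLOEXEC=%d\n",
		       fd, fcntl(fd, F_GETFD) & FD_CLOEXEC);
	return 0;
}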
diff --git a/fs/fifo.c b/fs/fifo.c --- a/fs/fifo.c +++ b/fs/fifo.c | |||
| @@ -57,7 +57,7 @@ static int fifo_open(struct inode *inode, struct file *filp) | |||
| 57 | * POSIX.1 says that O_NONBLOCK means return with the FIFO | 57 | * POSIX.1 says that O_NONBLOCK means return with the FIFO |
| 58 | * opened, even when there is no process writing the FIFO. | 58 | * opened, even when there is no process writing the FIFO. |
| 59 | */ | 59 | */ |
| 60 | filp->f_op = &read_fifo_fops; | 60 | filp->f_op = &read_pipefifo_fops; |
| 61 | pipe->r_counter++; | 61 | pipe->r_counter++; |
| 62 | if (pipe->readers++ == 0) | 62 | if (pipe->readers++ == 0) |
| 63 | wake_up_partner(inode); | 63 | wake_up_partner(inode); |
| @@ -86,7 +86,7 @@ static int fifo_open(struct inode *inode, struct file *filp) | |||
| 86 | if ((filp->f_flags & O_NONBLOCK) && !pipe->readers) | 86 | if ((filp->f_flags & O_NONBLOCK) && !pipe->readers) |
| 87 | goto err; | 87 | goto err; |
| 88 | 88 | ||
| 89 | filp->f_op = &write_fifo_fops; | 89 | filp->f_op = &write_pipefifo_fops; |
| 90 | pipe->w_counter++; | 90 | pipe->w_counter++; |
| 91 | if (!pipe->writers++) | 91 | if (!pipe->writers++) |
| 92 | wake_up_partner(inode); | 92 | wake_up_partner(inode); |
| @@ -105,7 +105,7 @@ static int fifo_open(struct inode *inode, struct file *filp) | |||
| 105 | * This implementation will NEVER block on a O_RDWR open, since | 105 | * This implementation will NEVER block on a O_RDWR open, since |
| 106 | * the process can at least talk to itself. | 106 | * the process can at least talk to itself. |
| 107 | */ | 107 | */ |
| 108 | filp->f_op = &rdwr_fifo_fops; | 108 | filp->f_op = &rdwr_pipefifo_fops; |
| 109 | 109 | ||
| 110 | pipe->readers++; | 110 | pipe->readers++; |
| 111 | pipe->writers++; | 111 | pipe->writers++; |
| @@ -151,5 +151,5 @@ err_nocleanup: | |||
| 151 | * depending on the access mode of the file... | 151 | * depending on the access mode of the file... |
| 152 | */ | 152 | */ |
| 153 | const struct file_operations def_fifo_fops = { | 153 | const struct file_operations def_fifo_fops = { |
| 154 | .open = fifo_open, /* will set read or write pipe_fops */ | 154 | .open = fifo_open, /* will set read_ or write_pipefifo_fops */ |
| 155 | }; | 155 | }; |
diff --git a/fs/file.c b/fs/file.c --- a/fs/file.c +++ b/fs/file.c | |||
| @@ -6,6 +6,7 @@ | |||
| 6 | * Manage the dynamic fd arrays in the process files_struct. | 6 | * Manage the dynamic fd arrays in the process files_struct. |
| 7 | */ | 7 | */ |
| 8 | 8 | ||
| 9 | #include <linux/module.h> | ||
| 9 | #include <linux/fs.h> | 10 | #include <linux/fs.h> |
| 10 | #include <linux/mm.h> | 11 | #include <linux/mm.h> |
| 11 | #include <linux/time.h> | 12 | #include <linux/time.h> |
| @@ -250,9 +251,18 @@ int expand_files(struct files_struct *files, int nr) | |||
| 250 | struct fdtable *fdt; | 251 | struct fdtable *fdt; |
| 251 | 252 | ||
| 252 | fdt = files_fdtable(files); | 253 | fdt = files_fdtable(files); |
| 254 | |||
| 255 | /* | ||
| 256 | * N.B. For clone tasks sharing a files structure, this test | ||
| 257 | * will limit the total number of files that can be opened. | ||
| 258 | */ | ||
| 259 | if (nr >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) | ||
| 260 | return -EMFILE; | ||
| 261 | |||
| 253 | /* Do we need to expand? */ | 262 | /* Do we need to expand? */ |
| 254 | if (nr < fdt->max_fds) | 263 | if (nr < fdt->max_fds) |
| 255 | return 0; | 264 | return 0; |
| 265 | |||
| 256 | /* Can we expand? */ | 266 | /* Can we expand? */ |
| 257 | if (nr >= sysctl_nr_open) | 267 | if (nr >= sysctl_nr_open) |
| 258 | return -EMFILE; | 268 | return -EMFILE; |
| @@ -423,3 +433,63 @@ struct files_struct init_files = { | |||
| 423 | }, | 433 | }, |
| 424 | .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), | 434 | .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), |
| 425 | }; | 435 | }; |
| 436 | |||
| 437 | /* | ||
| 438 | * allocate a file descriptor, mark it busy. | ||
| 439 | */ | ||
| 440 | int alloc_fd(unsigned start, unsigned flags) | ||
| 441 | { | ||
| 442 | struct files_struct *files = current->files; | ||
| 443 | unsigned int fd; | ||
| 444 | int error; | ||
| 445 | struct fdtable *fdt; | ||
| 446 | |||
| 447 | spin_lock(&files->file_lock); | ||
| 448 | repeat: | ||
| 449 | fdt = files_fdtable(files); | ||
| 450 | fd = start; | ||
| 451 | if (fd < files->next_fd) | ||
| 452 | fd = files->next_fd; | ||
| 453 | |||
| 454 | if (fd < fdt->max_fds) | ||
| 455 | fd = find_next_zero_bit(fdt->open_fds->fds_bits, | ||
| 456 | fdt->max_fds, fd); | ||
| 457 | |||
| 458 | error = expand_files(files, fd); | ||
| 459 | if (error < 0) | ||
| 460 | goto out; | ||
| 461 | |||
| 462 | /* | ||
| 463 | * If we needed to expand the fs array we | ||
| 464 | * might have blocked - try again. | ||
| 465 | */ | ||
| 466 | if (error) | ||
| 467 | goto repeat; | ||
| 468 | |||
| 469 | if (start <= files->next_fd) | ||
| 470 | files->next_fd = fd + 1; | ||
| 471 | |||
| 472 | FD_SET(fd, fdt->open_fds); | ||
| 473 | if (flags & O_CLOEXEC) | ||
| 474 | FD_SET(fd, fdt->close_on_exec); | ||
| 475 | else | ||
| 476 | FD_CLR(fd, fdt->close_on_exec); | ||
| 477 | error = fd; | ||
| 478 | #if 1 | ||
| 479 | /* Sanity check */ | ||
| 480 | if (rcu_dereference(fdt->fd[fd]) != NULL) { | ||
| 481 | printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); | ||
| 482 | rcu_assign_pointer(fdt->fd[fd], NULL); | ||
| 483 | } | ||
| 484 | #endif | ||
| 485 | |||
| 486 | out: | ||
| 487 | spin_unlock(&files->file_lock); | ||
| 488 | return error; | ||
| 489 | } | ||
| 490 | |||
| 491 | int get_unused_fd(void) | ||
| 492 | { | ||
| 493 | return alloc_fd(0, 0); | ||
| 494 | } | ||
| 495 | EXPORT_SYMBOL(get_unused_fd); | ||
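alloc_fd() is essentially the old locate_fd() from fs/fcntl.c, moved here and given a flags argument, with get_unused_fd() reduced to a trivial wrapper that stays exported. In-kernel users pair it with fd_install(), as the reworked sys_dup() above now does. A sketch of that pairing; foo_install() is hypothetical:

#include <linux/file.h>
#include <linux/fs.h>

/* Hypothetical helper: hand an already-set-up struct file to user space. */
static int foo_install(struct file *filp)
{
	int fd = get_unused_fd();	/* reserves a slot in the fd table */

	if (fd < 0) {
		fput(filp);
		return fd;
	}
	fd_install(fd, filp);		/* publishes filp at that slot */
	return fd;
}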
diff --git a/fs/file_table.c b/fs/file_table.c index 83084225b4c3..f45a4493f9e7 100644 --- a/fs/file_table.c +++ b/fs/file_table.c | |||
| @@ -120,7 +120,7 @@ struct file *get_empty_filp(void) | |||
| 120 | 120 | ||
| 121 | tsk = current; | 121 | tsk = current; |
| 122 | INIT_LIST_HEAD(&f->f_u.fu_list); | 122 | INIT_LIST_HEAD(&f->f_u.fu_list); |
| 123 | atomic_set(&f->f_count, 1); | 123 | atomic_long_set(&f->f_count, 1); |
| 124 | rwlock_init(&f->f_owner.lock); | 124 | rwlock_init(&f->f_owner.lock); |
| 125 | f->f_uid = tsk->fsuid; | 125 | f->f_uid = tsk->fsuid; |
| 126 | f->f_gid = tsk->fsgid; | 126 | f->f_gid = tsk->fsgid; |
| @@ -219,7 +219,7 @@ EXPORT_SYMBOL(init_file); | |||
| 219 | 219 | ||
| 220 | void fput(struct file *file) | 220 | void fput(struct file *file) |
| 221 | { | 221 | { |
| 222 | if (atomic_dec_and_test(&file->f_count)) | 222 | if (atomic_long_dec_and_test(&file->f_count)) |
| 223 | __fput(file); | 223 | __fput(file); |
| 224 | } | 224 | } |
| 225 | 225 | ||
| @@ -294,7 +294,7 @@ struct file *fget(unsigned int fd) | |||
| 294 | rcu_read_lock(); | 294 | rcu_read_lock(); |
| 295 | file = fcheck_files(files, fd); | 295 | file = fcheck_files(files, fd); |
| 296 | if (file) { | 296 | if (file) { |
| 297 | if (!atomic_inc_not_zero(&file->f_count)) { | 297 | if (!atomic_long_inc_not_zero(&file->f_count)) { |
| 298 | /* File object ref couldn't be taken */ | 298 | /* File object ref couldn't be taken */ |
| 299 | rcu_read_unlock(); | 299 | rcu_read_unlock(); |
| 300 | return NULL; | 300 | return NULL; |
| @@ -326,7 +326,7 @@ struct file *fget_light(unsigned int fd, int *fput_needed) | |||
| 326 | rcu_read_lock(); | 326 | rcu_read_lock(); |
| 327 | file = fcheck_files(files, fd); | 327 | file = fcheck_files(files, fd); |
| 328 | if (file) { | 328 | if (file) { |
| 329 | if (atomic_inc_not_zero(&file->f_count)) | 329 | if (atomic_long_inc_not_zero(&file->f_count)) |
| 330 | *fput_needed = 1; | 330 | *fput_needed = 1; |
| 331 | else | 331 | else |
| 332 | /* Didn't get the reference, someone's freed */ | 332 | /* Didn't get the reference, someone's freed */ |
| @@ -341,7 +341,7 @@ struct file *fget_light(unsigned int fd, int *fput_needed) | |||
| 341 | 341 | ||
| 342 | void put_filp(struct file *file) | 342 | void put_filp(struct file *file) |
| 343 | { | 343 | { |
| 344 | if (atomic_dec_and_test(&file->f_count)) { | 344 | if (atomic_long_dec_and_test(&file->f_count)) { |
| 345 | security_file_free(file); | 345 | security_file_free(file); |
| 346 | file_kill(file); | 346 | file_kill(file); |
| 347 | file_free(file); | 347 | file_free(file); |
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 2060bf06b906..fd03330cadeb 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c | |||
| @@ -97,7 +97,7 @@ void fuse_invalidate_attr(struct inode *inode) | |||
| 97 | * timeout is unknown (unlink, rmdir, rename and in some cases | 97 | * timeout is unknown (unlink, rmdir, rename and in some cases |
| 98 | * lookup) | 98 | * lookup) |
| 99 | */ | 99 | */ |
| 100 | static void fuse_invalidate_entry_cache(struct dentry *entry) | 100 | void fuse_invalidate_entry_cache(struct dentry *entry) |
| 101 | { | 101 | { |
| 102 | fuse_dentry_settime(entry, 0); | 102 | fuse_dentry_settime(entry, 0); |
| 103 | } | 103 | } |
| @@ -112,18 +112,16 @@ static void fuse_invalidate_entry(struct dentry *entry) | |||
| 112 | fuse_invalidate_entry_cache(entry); | 112 | fuse_invalidate_entry_cache(entry); |
| 113 | } | 113 | } |
| 114 | 114 | ||
| 115 | static void fuse_lookup_init(struct fuse_req *req, struct inode *dir, | 115 | static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_req *req, |
| 116 | struct dentry *entry, | 116 | u64 nodeid, struct qstr *name, |
| 117 | struct fuse_entry_out *outarg) | 117 | struct fuse_entry_out *outarg) |
| 118 | { | 118 | { |
| 119 | struct fuse_conn *fc = get_fuse_conn(dir); | ||
| 120 | |||
| 121 | memset(outarg, 0, sizeof(struct fuse_entry_out)); | 119 | memset(outarg, 0, sizeof(struct fuse_entry_out)); |
| 122 | req->in.h.opcode = FUSE_LOOKUP; | 120 | req->in.h.opcode = FUSE_LOOKUP; |
| 123 | req->in.h.nodeid = get_node_id(dir); | 121 | req->in.h.nodeid = nodeid; |
| 124 | req->in.numargs = 1; | 122 | req->in.numargs = 1; |
| 125 | req->in.args[0].size = entry->d_name.len + 1; | 123 | req->in.args[0].size = name->len + 1; |
| 126 | req->in.args[0].value = entry->d_name.name; | 124 | req->in.args[0].value = name->name; |
| 127 | req->out.numargs = 1; | 125 | req->out.numargs = 1; |
| 128 | if (fc->minor < 9) | 126 | if (fc->minor < 9) |
| 129 | req->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; | 127 | req->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; |
| @@ -189,7 +187,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) | |||
| 189 | attr_version = fuse_get_attr_version(fc); | 187 | attr_version = fuse_get_attr_version(fc); |
| 190 | 188 | ||
| 191 | parent = dget_parent(entry); | 189 | parent = dget_parent(entry); |
| 192 | fuse_lookup_init(req, parent->d_inode, entry, &outarg); | 190 | fuse_lookup_init(fc, req, get_node_id(parent->d_inode), |
| 191 | &entry->d_name, &outarg); | ||
| 193 | request_send(fc, req); | 192 | request_send(fc, req); |
| 194 | dput(parent); | 193 | dput(parent); |
| 195 | err = req->out.h.error; | 194 | err = req->out.h.error; |
| @@ -225,7 +224,7 @@ static int invalid_nodeid(u64 nodeid) | |||
| 225 | return !nodeid || nodeid == FUSE_ROOT_ID; | 224 | return !nodeid || nodeid == FUSE_ROOT_ID; |
| 226 | } | 225 | } |
| 227 | 226 | ||
| 228 | static struct dentry_operations fuse_dentry_operations = { | 227 | struct dentry_operations fuse_dentry_operations = { |
| 229 | .d_revalidate = fuse_dentry_revalidate, | 228 | .d_revalidate = fuse_dentry_revalidate, |
| 230 | }; | 229 | }; |
| 231 | 230 | ||
| @@ -239,85 +238,127 @@ int fuse_valid_type(int m) | |||
| 239 | * Add a directory inode to a dentry, ensuring that no other dentry | 238 | * Add a directory inode to a dentry, ensuring that no other dentry |
| 240 | * refers to this inode. Called with fc->inst_mutex. | 239 | * refers to this inode. Called with fc->inst_mutex. |
| 241 | */ | 240 | */ |
| 242 | static int fuse_d_add_directory(struct dentry *entry, struct inode *inode) | 241 | static struct dentry *fuse_d_add_directory(struct dentry *entry, |
| 242 | struct inode *inode) | ||
| 243 | { | 243 | { |
| 244 | struct dentry *alias = d_find_alias(inode); | 244 | struct dentry *alias = d_find_alias(inode); |
| 245 | if (alias) { | 245 | if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) { |
| 246 | /* This tries to shrink the subtree below alias */ | 246 | /* This tries to shrink the subtree below alias */ |
| 247 | fuse_invalidate_entry(alias); | 247 | fuse_invalidate_entry(alias); |
| 248 | dput(alias); | 248 | dput(alias); |
| 249 | if (!list_empty(&inode->i_dentry)) | 249 | if (!list_empty(&inode->i_dentry)) |
| 250 | return -EBUSY; | 250 | return ERR_PTR(-EBUSY); |
| 251 | } else { | ||
| 252 | dput(alias); | ||
| 251 | } | 253 | } |
| 252 | d_add(entry, inode); | 254 | return d_splice_alias(inode, entry); |
| 253 | return 0; | ||
| 254 | } | 255 | } |
| 255 | 256 | ||
| 256 | static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, | 257 | int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, |
| 257 | struct nameidata *nd) | 258 | struct fuse_entry_out *outarg, struct inode **inode) |
| 258 | { | 259 | { |
| 259 | int err; | 260 | struct fuse_conn *fc = get_fuse_conn_super(sb); |
| 260 | struct fuse_entry_out outarg; | ||
| 261 | struct inode *inode = NULL; | ||
| 262 | struct fuse_conn *fc = get_fuse_conn(dir); | ||
| 263 | struct fuse_req *req; | 261 | struct fuse_req *req; |
| 264 | struct fuse_req *forget_req; | 262 | struct fuse_req *forget_req; |
| 265 | u64 attr_version; | 263 | u64 attr_version; |
| 264 | int err; | ||
| 266 | 265 | ||
| 267 | if (entry->d_name.len > FUSE_NAME_MAX) | 266 | *inode = NULL; |
| 268 | return ERR_PTR(-ENAMETOOLONG); | 267 | err = -ENAMETOOLONG; |
| 268 | if (name->len > FUSE_NAME_MAX) | ||
| 269 | goto out; | ||
| 269 | 270 | ||
| 270 | req = fuse_get_req(fc); | 271 | req = fuse_get_req(fc); |
| 272 | err = PTR_ERR(req); | ||
| 271 | if (IS_ERR(req)) | 273 | if (IS_ERR(req)) |
| 272 | return ERR_CAST(req); | 274 | goto out; |
| 273 | 275 | ||
| 274 | forget_req = fuse_get_req(fc); | 276 | forget_req = fuse_get_req(fc); |
| 277 | err = PTR_ERR(forget_req); | ||
| 275 | if (IS_ERR(forget_req)) { | 278 | if (IS_ERR(forget_req)) { |
| 276 | fuse_put_request(fc, req); | 279 | fuse_put_request(fc, req); |
| 277 | return ERR_CAST(forget_req); | 280 | goto out; |
| 278 | } | 281 | } |
| 279 | 282 | ||
| 280 | attr_version = fuse_get_attr_version(fc); | 283 | attr_version = fuse_get_attr_version(fc); |
| 281 | 284 | ||
| 282 | fuse_lookup_init(req, dir, entry, &outarg); | 285 | fuse_lookup_init(fc, req, nodeid, name, outarg); |
| 283 | request_send(fc, req); | 286 | request_send(fc, req); |
| 284 | err = req->out.h.error; | 287 | err = req->out.h.error; |
| 285 | fuse_put_request(fc, req); | 288 | fuse_put_request(fc, req); |
| 286 | /* Zero nodeid is same as -ENOENT, but with valid timeout */ | 289 | /* Zero nodeid is same as -ENOENT, but with valid timeout */ |
| 287 | if (!err && outarg.nodeid && | 290 | if (err || !outarg->nodeid) |
| 288 | (invalid_nodeid(outarg.nodeid) || | 291 | goto out_put_forget; |
| 289 | !fuse_valid_type(outarg.attr.mode))) | 292 | |
| 290 | err = -EIO; | 293 | err = -EIO; |
| 291 | if (!err && outarg.nodeid) { | 294 | if (!outarg->nodeid) |
| 292 | inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, | 295 | goto out_put_forget; |
| 293 | &outarg.attr, entry_attr_timeout(&outarg), | 296 | if (!fuse_valid_type(outarg->attr.mode)) |
| 294 | attr_version); | 297 | goto out_put_forget; |
| 295 | if (!inode) { | 298 | |
| 296 | fuse_send_forget(fc, forget_req, outarg.nodeid, 1); | 299 | *inode = fuse_iget(sb, outarg->nodeid, outarg->generation, |
| 297 | return ERR_PTR(-ENOMEM); | 300 | &outarg->attr, entry_attr_timeout(outarg), |
| 298 | } | 301 | attr_version); |
| 302 | err = -ENOMEM; | ||
| 303 | if (!*inode) { | ||
| 304 | fuse_send_forget(fc, forget_req, outarg->nodeid, 1); | ||
| 305 | goto out; | ||
| 299 | } | 306 | } |
| 307 | err = 0; | ||
| 308 | |||
| 309 | out_put_forget: | ||
| 300 | fuse_put_request(fc, forget_req); | 310 | fuse_put_request(fc, forget_req); |
| 301 | if (err && err != -ENOENT) | 311 | out: |
| 302 | return ERR_PTR(err); | 312 | return err; |
| 313 | } | ||
| 314 | |||
| 315 | static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, | ||
| 316 | struct nameidata *nd) | ||
| 317 | { | ||
| 318 | int err; | ||
| 319 | struct fuse_entry_out outarg; | ||
| 320 | struct inode *inode; | ||
| 321 | struct dentry *newent; | ||
| 322 | struct fuse_conn *fc = get_fuse_conn(dir); | ||
| 323 | bool outarg_valid = true; | ||
| 324 | |||
| 325 | err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name, | ||
| 326 | &outarg, &inode); | ||
| 327 | if (err == -ENOENT) { | ||
| 328 | outarg_valid = false; | ||
| 329 | err = 0; | ||
| 330 | } | ||
| 331 | if (err) | ||
| 332 | goto out_err; | ||
| 333 | |||
| 334 | err = -EIO; | ||
| 335 | if (inode && get_node_id(inode) == FUSE_ROOT_ID) | ||
| 336 | goto out_iput; | ||
| 303 | 337 | ||
| 304 | if (inode && S_ISDIR(inode->i_mode)) { | 338 | if (inode && S_ISDIR(inode->i_mode)) { |
| 305 | mutex_lock(&fc->inst_mutex); | 339 | mutex_lock(&fc->inst_mutex); |
| 306 | err = fuse_d_add_directory(entry, inode); | 340 | newent = fuse_d_add_directory(entry, inode); |
| 307 | mutex_unlock(&fc->inst_mutex); | 341 | mutex_unlock(&fc->inst_mutex); |
| 308 | if (err) { | 342 | err = PTR_ERR(newent); |
| 309 | iput(inode); | 343 | if (IS_ERR(newent)) |
| 310 | return ERR_PTR(err); | 344 | goto out_iput; |
| 311 | } | 345 | } else { |
| 312 | } else | 346 | newent = d_splice_alias(inode, entry); |
| 313 | d_add(entry, inode); | 347 | } |
| 314 | 348 | ||
| 349 | entry = newent ? newent : entry; | ||
| 315 | entry->d_op = &fuse_dentry_operations; | 350 | entry->d_op = &fuse_dentry_operations; |
| 316 | if (!err) | 351 | if (outarg_valid) |
| 317 | fuse_change_entry_timeout(entry, &outarg); | 352 | fuse_change_entry_timeout(entry, &outarg); |
| 318 | else | 353 | else |
| 319 | fuse_invalidate_entry_cache(entry); | 354 | fuse_invalidate_entry_cache(entry); |
| 320 | return NULL; | 355 | |
| 356 | return newent; | ||
| 357 | |||
| 358 | out_iput: | ||
| 359 | iput(inode); | ||
| 360 | out_err: | ||
| 361 | return ERR_PTR(err); | ||
| 321 | } | 362 | } |
| 322 | 363 | ||
| 323 | /* | 364 | /* |
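fuse_lookup() now returns the dentry produced by d_splice_alias() (directly, or via fuse_d_add_directory() for directories) instead of always d_add()ing and returning NULL; that is what lets disconnected dentries created by the export code added further down be spliced back into the tree on a normal path lookup. The generic shape of that ->lookup contract, with hypothetical foo_* names:

#include <linux/dcache.h>
#include <linux/err.h>
#include <linux/fs.h>
#include <linux/namei.h>

/* Hypothetical lookup-by-name helper; may return NULL or an ERR_PTR. */
struct inode *foo_iget(struct super_block *sb, const struct qstr *name);

static struct dentry *foo_lookup(struct inode *dir, struct dentry *dentry,
				 struct nameidata *nd)
{
	struct inode *inode = foo_iget(dir->i_sb, &dentry->d_name);

	if (IS_ERR(inode))
		return ERR_CAST(inode);
	/*
	 * A NULL inode makes a negative dentry; if an alias already exists
	 * (possibly DCACHE_DISCONNECTED from an NFS file handle), it is
	 * reused and returned instead of NULL.
	 */
	return d_splice_alias(inode, dentry);
}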
| @@ -857,7 +898,7 @@ static int fuse_access(struct inode *inode, int mask) | |||
| 857 | return PTR_ERR(req); | 898 | return PTR_ERR(req); |
| 858 | 899 | ||
| 859 | memset(&inarg, 0, sizeof(inarg)); | 900 | memset(&inarg, 0, sizeof(inarg)); |
| 860 | inarg.mask = mask; | 901 | inarg.mask = mask & (MAY_READ | MAY_WRITE | MAY_EXEC); |
| 861 | req->in.h.opcode = FUSE_ACCESS; | 902 | req->in.h.opcode = FUSE_ACCESS; |
| 862 | req->in.h.nodeid = get_node_id(inode); | 903 | req->in.h.nodeid = get_node_id(inode); |
| 863 | req->in.numargs = 1; | 904 | req->in.numargs = 1; |
| @@ -886,7 +927,7 @@ static int fuse_access(struct inode *inode, int mask) | |||
| 886 | * access request is sent. Execute permission is still checked | 927 | * access request is sent. Execute permission is still checked |
| 887 | * locally based on file mode. | 928 | * locally based on file mode. |
| 888 | */ | 929 | */ |
| 889 | static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd) | 930 | static int fuse_permission(struct inode *inode, int mask) |
| 890 | { | 931 | { |
| 891 | struct fuse_conn *fc = get_fuse_conn(inode); | 932 | struct fuse_conn *fc = get_fuse_conn(inode); |
| 892 | bool refreshed = false; | 933 | bool refreshed = false; |
| @@ -921,7 +962,7 @@ static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd) | |||
| 921 | exist. So if permissions are revoked this won't be | 962 | exist. So if permissions are revoked this won't be |
| 922 | noticed immediately, only after the attribute | 963 | noticed immediately, only after the attribute |
| 923 | timeout has expired */ | 964 | timeout has expired */ |
| 924 | } else if (nd && (nd->flags & (LOOKUP_ACCESS | LOOKUP_CHDIR))) { | 965 | } else if (mask & MAY_ACCESS) { |
| 925 | err = fuse_access(inode, mask); | 966 | err = fuse_access(inode, mask); |
| 926 | } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) { | 967 | } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) { |
| 927 | if (!(inode->i_mode & S_IXUGO)) { | 968 | if (!(inode->i_mode & S_IXUGO)) { |
diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 8092f0d9fd1f..2bada6bbc317 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c | |||
| @@ -893,7 +893,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 893 | if (count == 0) | 893 | if (count == 0) |
| 894 | goto out; | 894 | goto out; |
| 895 | 895 | ||
| 896 | err = remove_suid(file->f_path.dentry); | 896 | err = file_remove_suid(file); |
| 897 | if (err) | 897 | if (err) |
| 898 | goto out; | 898 | goto out; |
| 899 | 899 | ||
| @@ -1341,6 +1341,11 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) | |||
| 1341 | pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0; | 1341 | pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0; |
| 1342 | int err; | 1342 | int err; |
| 1343 | 1343 | ||
| 1344 | if (fl->fl_lmops && fl->fl_lmops->fl_grant) { | ||
| 1345 | /* NLM needs asynchronous locks, which we don't support yet */ | ||
| 1346 | return -ENOLCK; | ||
| 1347 | } | ||
| 1348 | |||
| 1344 | /* Unlock on close is handled by the flush method */ | 1349 | /* Unlock on close is handled by the flush method */ |
| 1345 | if (fl->fl_flags & FL_CLOSE) | 1350 | if (fl->fl_flags & FL_CLOSE) |
| 1346 | return 0; | 1351 | return 0; |
| @@ -1365,7 +1370,9 @@ static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl) | |||
| 1365 | struct fuse_conn *fc = get_fuse_conn(inode); | 1370 | struct fuse_conn *fc = get_fuse_conn(inode); |
| 1366 | int err; | 1371 | int err; |
| 1367 | 1372 | ||
| 1368 | if (cmd == F_GETLK) { | 1373 | if (cmd == F_CANCELLK) { |
| 1374 | err = 0; | ||
| 1375 | } else if (cmd == F_GETLK) { | ||
| 1369 | if (fc->no_lock) { | 1376 | if (fc->no_lock) { |
| 1370 | posix_test_lock(file, fl); | 1377 | posix_test_lock(file, fl); |
| 1371 | err = 0; | 1378 | err = 0; |
| @@ -1373,7 +1380,7 @@ static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl) | |||
| 1373 | err = fuse_getlk(file, fl); | 1380 | err = fuse_getlk(file, fl); |
| 1374 | } else { | 1381 | } else { |
| 1375 | if (fc->no_lock) | 1382 | if (fc->no_lock) |
| 1376 | err = posix_lock_file_wait(file, fl); | 1383 | err = posix_lock_file(file, fl, NULL); |
| 1377 | else | 1384 | else |
| 1378 | err = fuse_setlk(file, fl, 0); | 1385 | err = fuse_setlk(file, fl, 0); |
| 1379 | } | 1386 | } |
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index bae948657c4f..3a876076bdd1 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h | |||
| @@ -363,6 +363,9 @@ struct fuse_conn { | |||
| 363 | /** Do not send separate SETATTR request before open(O_TRUNC) */ | 363 | /** Do not send separate SETATTR request before open(O_TRUNC) */ |
| 364 | unsigned atomic_o_trunc : 1; | 364 | unsigned atomic_o_trunc : 1; |
| 365 | 365 | ||
| 366 | /** Filesystem supports NFS exporting. Only set in INIT */ | ||
| 367 | unsigned export_support : 1; | ||
| 368 | |||
| 366 | /* | 369 | /* |
| 367 | * The following bitfields are only for optimization purposes | 370 | * The following bitfields are only for optimization purposes |
| 368 | * and hence races in setting them will not cause malfunction | 371 | * and hence races in setting them will not cause malfunction |
| @@ -464,6 +467,8 @@ static inline u64 get_node_id(struct inode *inode) | |||
| 464 | /** Device operations */ | 467 | /** Device operations */ |
| 465 | extern const struct file_operations fuse_dev_operations; | 468 | extern const struct file_operations fuse_dev_operations; |
| 466 | 469 | ||
| 470 | extern struct dentry_operations fuse_dentry_operations; | ||
| 471 | |||
| 467 | /** | 472 | /** |
| 468 | * Get a filled in inode | 473 | * Get a filled in inode |
| 469 | */ | 474 | */ |
| @@ -471,6 +476,9 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, | |||
| 471 | int generation, struct fuse_attr *attr, | 476 | int generation, struct fuse_attr *attr, |
| 472 | u64 attr_valid, u64 attr_version); | 477 | u64 attr_valid, u64 attr_version); |
| 473 | 478 | ||
| 479 | int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, | ||
| 480 | struct fuse_entry_out *outarg, struct inode **inode); | ||
| 481 | |||
| 474 | /** | 482 | /** |
| 475 | * Send FORGET command | 483 | * Send FORGET command |
| 476 | */ | 484 | */ |
| @@ -604,6 +612,8 @@ void fuse_abort_conn(struct fuse_conn *fc); | |||
| 604 | */ | 612 | */ |
| 605 | void fuse_invalidate_attr(struct inode *inode); | 613 | void fuse_invalidate_attr(struct inode *inode); |
| 606 | 614 | ||
| 615 | void fuse_invalidate_entry_cache(struct dentry *entry); | ||
| 616 | |||
| 607 | /** | 617 | /** |
| 608 | * Acquire reference to fuse_conn | 618 | * Acquire reference to fuse_conn |
| 609 | */ | 619 | */ |
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 3141690558c8..d2249f174e20 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c | |||
| @@ -18,6 +18,7 @@ | |||
| 18 | #include <linux/statfs.h> | 18 | #include <linux/statfs.h> |
| 19 | #include <linux/random.h> | 19 | #include <linux/random.h> |
| 20 | #include <linux/sched.h> | 20 | #include <linux/sched.h> |
| 21 | #include <linux/exportfs.h> | ||
| 21 | 22 | ||
| 22 | MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); | 23 | MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); |
| 23 | MODULE_DESCRIPTION("Filesystem in Userspace"); | 24 | MODULE_DESCRIPTION("Filesystem in Userspace"); |
| @@ -552,6 +553,174 @@ static struct inode *get_root_inode(struct super_block *sb, unsigned mode) | |||
| 552 | return fuse_iget(sb, 1, 0, &attr, 0, 0); | 553 | return fuse_iget(sb, 1, 0, &attr, 0, 0); |
| 553 | } | 554 | } |
| 554 | 555 | ||
| 556 | struct fuse_inode_handle | ||
| 557 | { | ||
| 558 | u64 nodeid; | ||
| 559 | u32 generation; | ||
| 560 | }; | ||
| 561 | |||
| 562 | static struct dentry *fuse_get_dentry(struct super_block *sb, | ||
| 563 | struct fuse_inode_handle *handle) | ||
| 564 | { | ||
| 565 | struct fuse_conn *fc = get_fuse_conn_super(sb); | ||
| 566 | struct inode *inode; | ||
| 567 | struct dentry *entry; | ||
| 568 | int err = -ESTALE; | ||
| 569 | |||
| 570 | if (handle->nodeid == 0) | ||
| 571 | goto out_err; | ||
| 572 | |||
| 573 | inode = ilookup5(sb, handle->nodeid, fuse_inode_eq, &handle->nodeid); | ||
| 574 | if (!inode) { | ||
| 575 | struct fuse_entry_out outarg; | ||
| 576 | struct qstr name; | ||
| 577 | |||
| 578 | if (!fc->export_support) | ||
| 579 | goto out_err; | ||
| 580 | |||
| 581 | name.len = 1; | ||
| 582 | name.name = "."; | ||
| 583 | err = fuse_lookup_name(sb, handle->nodeid, &name, &outarg, | ||
| 584 | &inode); | ||
| 585 | if (err && err != -ENOENT) | ||
| 586 | goto out_err; | ||
| 587 | if (err || !inode) { | ||
| 588 | err = -ESTALE; | ||
| 589 | goto out_err; | ||
| 590 | } | ||
| 591 | err = -EIO; | ||
| 592 | if (get_node_id(inode) != handle->nodeid) | ||
| 593 | goto out_iput; | ||
| 594 | } | ||
| 595 | err = -ESTALE; | ||
| 596 | if (inode->i_generation != handle->generation) | ||
| 597 | goto out_iput; | ||
| 598 | |||
| 599 | entry = d_alloc_anon(inode); | ||
| 600 | err = -ENOMEM; | ||
| 601 | if (!entry) | ||
| 602 | goto out_iput; | ||
| 603 | |||
| 604 | if (get_node_id(inode) != FUSE_ROOT_ID) { | ||
| 605 | entry->d_op = &fuse_dentry_operations; | ||
| 606 | fuse_invalidate_entry_cache(entry); | ||
| 607 | } | ||
| 608 | |||
| 609 | return entry; | ||
| 610 | |||
| 611 | out_iput: | ||
| 612 | iput(inode); | ||
| 613 | out_err: | ||
| 614 | return ERR_PTR(err); | ||
| 615 | } | ||
| 616 | |||
| 617 | static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, | ||
| 618 | int connectable) | ||
| 619 | { | ||
| 620 | struct inode *inode = dentry->d_inode; | ||
| 621 | bool encode_parent = connectable && !S_ISDIR(inode->i_mode); | ||
| 622 | int len = encode_parent ? 6 : 3; | ||
| 623 | u64 nodeid; | ||
| 624 | u32 generation; | ||
| 625 | |||
| 626 | if (*max_len < len) | ||
| 627 | return 255; | ||
| 628 | |||
| 629 | nodeid = get_fuse_inode(inode)->nodeid; | ||
| 630 | generation = inode->i_generation; | ||
| 631 | |||
| 632 | fh[0] = (u32)(nodeid >> 32); | ||
| 633 | fh[1] = (u32)(nodeid & 0xffffffff); | ||
| 634 | fh[2] = generation; | ||
| 635 | |||
| 636 | if (encode_parent) { | ||
| 637 | struct inode *parent; | ||
| 638 | |||
| 639 | spin_lock(&dentry->d_lock); | ||
| 640 | parent = dentry->d_parent->d_inode; | ||
| 641 | nodeid = get_fuse_inode(parent)->nodeid; | ||
| 642 | generation = parent->i_generation; | ||
| 643 | spin_unlock(&dentry->d_lock); | ||
| 644 | |||
| 645 | fh[3] = (u32)(nodeid >> 32); | ||
| 646 | fh[4] = (u32)(nodeid & 0xffffffff); | ||
| 647 | fh[5] = generation; | ||
| 648 | } | ||
| 649 | |||
| 650 | *max_len = len; | ||
| 651 | return encode_parent ? 0x82 : 0x81; | ||
| 652 | } | ||
| 653 | |||
| 654 | static struct dentry *fuse_fh_to_dentry(struct super_block *sb, | ||
| 655 | struct fid *fid, int fh_len, int fh_type) | ||
| 656 | { | ||
| 657 | struct fuse_inode_handle handle; | ||
| 658 | |||
| 659 | if ((fh_type != 0x81 && fh_type != 0x82) || fh_len < 3) | ||
| 660 | return NULL; | ||
| 661 | |||
| 662 | handle.nodeid = (u64) fid->raw[0] << 32; | ||
| 663 | handle.nodeid |= (u64) fid->raw[1]; | ||
| 664 | handle.generation = fid->raw[2]; | ||
| 665 | return fuse_get_dentry(sb, &handle); | ||
| 666 | } | ||
| 667 | |||
| 668 | static struct dentry *fuse_fh_to_parent(struct super_block *sb, | ||
| 669 | struct fid *fid, int fh_len, int fh_type) | ||
| 670 | { | ||
| 671 | struct fuse_inode_handle parent; | ||
| 672 | |||
| 673 | if (fh_type != 0x82 || fh_len < 6) | ||
| 674 | return NULL; | ||
| 675 | |||
| 676 | parent.nodeid = (u64) fid->raw[3] << 32; | ||
| 677 | parent.nodeid |= (u64) fid->raw[4]; | ||
| 678 | parent.generation = fid->raw[5]; | ||
| 679 | return fuse_get_dentry(sb, &parent); | ||
| 680 | } | ||
| 681 | |||
| 682 | static struct dentry *fuse_get_parent(struct dentry *child) | ||
| 683 | { | ||
| 684 | struct inode *child_inode = child->d_inode; | ||
| 685 | struct fuse_conn *fc = get_fuse_conn(child_inode); | ||
| 686 | struct inode *inode; | ||
| 687 | struct dentry *parent; | ||
| 688 | struct fuse_entry_out outarg; | ||
| 689 | struct qstr name; | ||
| 690 | int err; | ||
| 691 | |||
| 692 | if (!fc->export_support) | ||
| 693 | return ERR_PTR(-ESTALE); | ||
| 694 | |||
| 695 | name.len = 2; | ||
| 696 | name.name = ".."; | ||
| 697 | err = fuse_lookup_name(child_inode->i_sb, get_node_id(child_inode), | ||
| 698 | &name, &outarg, &inode); | ||
| 699 | if (err && err != -ENOENT) | ||
| 700 | return ERR_PTR(err); | ||
| 701 | if (err || !inode) | ||
| 702 | return ERR_PTR(-ESTALE); | ||
| 703 | |||
| 704 | parent = d_alloc_anon(inode); | ||
| 705 | if (!parent) { | ||
| 706 | iput(inode); | ||
| 707 | return ERR_PTR(-ENOMEM); | ||
| 708 | } | ||
| 709 | if (get_node_id(inode) != FUSE_ROOT_ID) { | ||
| 710 | parent->d_op = &fuse_dentry_operations; | ||
| 711 | fuse_invalidate_entry_cache(parent); | ||
| 712 | } | ||
| 713 | |||
| 714 | return parent; | ||
| 715 | } | ||
| 716 | |||
| 717 | static const struct export_operations fuse_export_operations = { | ||
| 718 | .fh_to_dentry = fuse_fh_to_dentry, | ||
| 719 | .fh_to_parent = fuse_fh_to_parent, | ||
| 720 | .encode_fh = fuse_encode_fh, | ||
| 721 | .get_parent = fuse_get_parent, | ||
| 722 | }; | ||
| 723 | |||
| 555 | static const struct super_operations fuse_super_operations = { | 724 | static const struct super_operations fuse_super_operations = { |
| 556 | .alloc_inode = fuse_alloc_inode, | 725 | .alloc_inode = fuse_alloc_inode, |
| 557 | .destroy_inode = fuse_destroy_inode, | 726 | .destroy_inode = fuse_destroy_inode, |
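The export operations above pack the FUSE nodeid and generation into a three-word file handle (type 0x81), optionally followed by the parent's nodeid and generation (type 0x82), and fuse_get_dentry() falls back to a fuse_lookup_name() of "." when the inode is not cached. The handle layout, restated as a small decoder; fuse_handle_decode() itself is hypothetical, while the struct matches the one added in this patch:

#include <linux/errno.h>
#include <linux/exportfs.h>
#include <linux/types.h>

struct fuse_inode_handle {		/* same layout as the struct added above */
	u64 nodeid;
	u32 generation;
};

/* Hypothetical decoder; fuse_fh_to_dentry()/fuse_fh_to_parent() above do
 * exactly this before calling fuse_get_dentry(). */
static int fuse_handle_decode(struct fid *fid, int fh_len, int fh_type,
			      struct fuse_inode_handle *out)
{
	if ((fh_type != 0x81 && fh_type != 0x82) || fh_len < 3)
		return -EINVAL;

	out->nodeid = ((u64)fid->raw[0] << 32) | fid->raw[1];
	out->generation = fid->raw[2];
	return 0;
}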
| @@ -581,6 +750,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) | |||
| 581 | fc->no_lock = 1; | 750 | fc->no_lock = 1; |
| 582 | if (arg->flags & FUSE_ATOMIC_O_TRUNC) | 751 | if (arg->flags & FUSE_ATOMIC_O_TRUNC) |
| 583 | fc->atomic_o_trunc = 1; | 752 | fc->atomic_o_trunc = 1; |
| 753 | if (arg->minor >= 9) { | ||
| 754 | /* LOOKUP has dependency on proto version */ | ||
| 755 | if (arg->flags & FUSE_EXPORT_SUPPORT) | ||
| 756 | fc->export_support = 1; | ||
| 757 | } | ||
| 584 | if (arg->flags & FUSE_BIG_WRITES) | 758 | if (arg->flags & FUSE_BIG_WRITES) |
| 585 | fc->big_writes = 1; | 759 | fc->big_writes = 1; |
| 586 | } else { | 760 | } else { |
| @@ -607,7 +781,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) | |||
| 607 | arg->minor = FUSE_KERNEL_MINOR_VERSION; | 781 | arg->minor = FUSE_KERNEL_MINOR_VERSION; |
| 608 | arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; | 782 | arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; |
| 609 | arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | | 783 | arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | |
| 610 | FUSE_BIG_WRITES; | 784 | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES; |
| 611 | req->in.h.opcode = FUSE_INIT; | 785 | req->in.h.opcode = FUSE_INIT; |
| 612 | req->in.numargs = 1; | 786 | req->in.numargs = 1; |
| 613 | req->in.args[0].size = sizeof(*arg); | 787 | req->in.args[0].size = sizeof(*arg); |
| @@ -652,6 +826,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) | |||
| 652 | sb->s_magic = FUSE_SUPER_MAGIC; | 826 | sb->s_magic = FUSE_SUPER_MAGIC; |
| 653 | sb->s_op = &fuse_super_operations; | 827 | sb->s_op = &fuse_super_operations; |
| 654 | sb->s_maxbytes = MAX_LFS_FILESIZE; | 828 | sb->s_maxbytes = MAX_LFS_FILESIZE; |
| 829 | sb->s_export_op = &fuse_export_operations; | ||
| 655 | 830 | ||
| 656 | file = fget(d.fd); | 831 | file = fget(d.fd); |
| 657 | if (!file) | 832 | if (!file) |
| @@ -781,7 +956,7 @@ static inline void unregister_fuseblk(void) | |||
| 781 | } | 956 | } |
| 782 | #endif | 957 | #endif |
| 783 | 958 | ||
| 784 | static void fuse_inode_init_once(struct kmem_cache *cachep, void *foo) | 959 | static void fuse_inode_init_once(void *foo) |
| 785 | { | 960 | { |
| 786 | struct inode * inode = foo; | 961 | struct inode * inode = foo; |
| 787 | 962 | ||
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 6da0ab355b8a..8b0806a32948 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c | |||
| @@ -448,7 +448,7 @@ struct inode *gfs2_lookup_simple(struct inode *dip, const char *name) | |||
| 448 | struct qstr qstr; | 448 | struct qstr qstr; |
| 449 | struct inode *inode; | 449 | struct inode *inode; |
| 450 | gfs2_str2qstr(&qstr, name); | 450 | gfs2_str2qstr(&qstr, name); |
| 451 | inode = gfs2_lookupi(dip, &qstr, 1, NULL); | 451 | inode = gfs2_lookupi(dip, &qstr, 1); |
| 452 | /* gfs2_lookupi has inconsistent callers: vfs | 452 | /* gfs2_lookupi has inconsistent callers: vfs |
| 453 | * related routines expect NULL for no entry found, | 453 | * related routines expect NULL for no entry found, |
| 454 | * gfs2_lookup_simple callers expect ENOENT | 454 | * gfs2_lookup_simple callers expect ENOENT |
| @@ -477,7 +477,7 @@ struct inode *gfs2_lookup_simple(struct inode *dip, const char *name) | |||
| 477 | */ | 477 | */ |
| 478 | 478 | ||
| 479 | struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, | 479 | struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, |
| 480 | int is_root, struct nameidata *nd) | 480 | int is_root) |
| 481 | { | 481 | { |
| 482 | struct super_block *sb = dir->i_sb; | 482 | struct super_block *sb = dir->i_sb; |
| 483 | struct gfs2_inode *dip = GFS2_I(dir); | 483 | struct gfs2_inode *dip = GFS2_I(dir); |
| @@ -1173,7 +1173,7 @@ int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) | |||
| 1173 | break; | 1173 | break; |
| 1174 | } | 1174 | } |
| 1175 | 1175 | ||
| 1176 | tmp = gfs2_lookupi(dir, &dotdot, 1, NULL); | 1176 | tmp = gfs2_lookupi(dir, &dotdot, 1); |
| 1177 | if (IS_ERR(tmp)) { | 1177 | if (IS_ERR(tmp)) { |
| 1178 | error = PTR_ERR(tmp); | 1178 | error = PTR_ERR(tmp); |
| 1179 | break; | 1179 | break; |
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 6074c2506f75..58f9607d6a86 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h | |||
| @@ -83,7 +83,7 @@ int gfs2_inode_refresh(struct gfs2_inode *ip); | |||
| 83 | int gfs2_dinode_dealloc(struct gfs2_inode *inode); | 83 | int gfs2_dinode_dealloc(struct gfs2_inode *inode); |
| 84 | int gfs2_change_nlink(struct gfs2_inode *ip, int diff); | 84 | int gfs2_change_nlink(struct gfs2_inode *ip, int diff); |
| 85 | struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, | 85 | struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, |
| 86 | int is_root, struct nameidata *nd); | 86 | int is_root); |
| 87 | struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, | 87 | struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, |
| 88 | unsigned int mode, dev_t dev); | 88 | unsigned int mode, dev_t dev); |
| 89 | int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, | 89 | int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, |
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index bcc668d0fadd..bb2cc303ac29 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c | |||
| @@ -24,7 +24,7 @@ | |||
| 24 | #include "util.h" | 24 | #include "util.h" |
| 25 | #include "glock.h" | 25 | #include "glock.h" |
| 26 | 26 | ||
| 27 | static void gfs2_init_inode_once(struct kmem_cache *cachep, void *foo) | 27 | static void gfs2_init_inode_once(void *foo) |
| 28 | { | 28 | { |
| 29 | struct gfs2_inode *ip = foo; | 29 | struct gfs2_inode *ip = foo; |
| 30 | 30 | ||
| @@ -33,7 +33,7 @@ static void gfs2_init_inode_once(struct kmem_cache *cachep, void *foo) | |||
| 33 | ip->i_alloc = NULL; | 33 | ip->i_alloc = NULL; |
| 34 | } | 34 | } |
| 35 | 35 | ||
| 36 | static void gfs2_init_glock_once(struct kmem_cache *cachep, void *foo) | 36 | static void gfs2_init_glock_once(void *foo) |
| 37 | { | 37 | { |
| 38 | struct gfs2_glock *gl = foo; | 38 | struct gfs2_glock *gl = foo; |
| 39 | 39 | ||
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c index 990d9f4bc463..9cda8536530c 100644 --- a/fs/gfs2/ops_export.c +++ b/fs/gfs2/ops_export.c | |||
| @@ -134,7 +134,7 @@ static struct dentry *gfs2_get_parent(struct dentry *child) | |||
| 134 | struct dentry *dentry; | 134 | struct dentry *dentry; |
| 135 | 135 | ||
| 136 | gfs2_str2qstr(&dotdot, ".."); | 136 | gfs2_str2qstr(&dotdot, ".."); |
| 137 | inode = gfs2_lookupi(child->d_inode, &dotdot, 1, NULL); | 137 | inode = gfs2_lookupi(child->d_inode, &dotdot, 1); |
| 138 | 138 | ||
| 139 | if (!inode) | 139 | if (!inode) |
| 140 | return ERR_PTR(-ENOENT); | 140 | return ERR_PTR(-ENOENT); |
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 1e252dfc5294..e2c62f73a778 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c | |||
| @@ -74,7 +74,7 @@ static int gfs2_create(struct inode *dir, struct dentry *dentry, | |||
| 74 | return PTR_ERR(inode); | 74 | return PTR_ERR(inode); |
| 75 | } | 75 | } |
| 76 | 76 | ||
| 77 | inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd); | 77 | inode = gfs2_lookupi(dir, &dentry->d_name, 0); |
| 78 | if (inode) { | 78 | if (inode) { |
| 79 | if (!IS_ERR(inode)) { | 79 | if (!IS_ERR(inode)) { |
| 80 | gfs2_holder_uninit(ghs); | 80 | gfs2_holder_uninit(ghs); |
| @@ -109,7 +109,7 @@ static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry, | |||
| 109 | 109 | ||
| 110 | dentry->d_op = &gfs2_dops; | 110 | dentry->d_op = &gfs2_dops; |
| 111 | 111 | ||
| 112 | inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd); | 112 | inode = gfs2_lookupi(dir, &dentry->d_name, 0); |
| 113 | if (inode && IS_ERR(inode)) | 113 | if (inode && IS_ERR(inode)) |
| 114 | return ERR_CAST(inode); | 114 | return ERR_CAST(inode); |
| 115 | 115 | ||
| @@ -915,12 +915,6 @@ int gfs2_permission(struct inode *inode, int mask) | |||
| 915 | return error; | 915 | return error; |
| 916 | } | 916 | } |
| 917 | 917 | ||
| 918 | static int gfs2_iop_permission(struct inode *inode, int mask, | ||
| 919 | struct nameidata *nd) | ||
| 920 | { | ||
| 921 | return gfs2_permission(inode, mask); | ||
| 922 | } | ||
| 923 | |||
| 924 | static int setattr_size(struct inode *inode, struct iattr *attr) | 918 | static int setattr_size(struct inode *inode, struct iattr *attr) |
| 925 | { | 919 | { |
| 926 | struct gfs2_inode *ip = GFS2_I(inode); | 920 | struct gfs2_inode *ip = GFS2_I(inode); |
| @@ -1150,7 +1144,7 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name) | |||
| 1150 | } | 1144 | } |
| 1151 | 1145 | ||
| 1152 | const struct inode_operations gfs2_file_iops = { | 1146 | const struct inode_operations gfs2_file_iops = { |
| 1153 | .permission = gfs2_iop_permission, | 1147 | .permission = gfs2_permission, |
| 1154 | .setattr = gfs2_setattr, | 1148 | .setattr = gfs2_setattr, |
| 1155 | .getattr = gfs2_getattr, | 1149 | .getattr = gfs2_getattr, |
| 1156 | .setxattr = gfs2_setxattr, | 1150 | .setxattr = gfs2_setxattr, |
| @@ -1169,7 +1163,7 @@ const struct inode_operations gfs2_dir_iops = { | |||
| 1169 | .rmdir = gfs2_rmdir, | 1163 | .rmdir = gfs2_rmdir, |
| 1170 | .mknod = gfs2_mknod, | 1164 | .mknod = gfs2_mknod, |
| 1171 | .rename = gfs2_rename, | 1165 | .rename = gfs2_rename, |
| 1172 | .permission = gfs2_iop_permission, | 1166 | .permission = gfs2_permission, |
| 1173 | .setattr = gfs2_setattr, | 1167 | .setattr = gfs2_setattr, |
| 1174 | .getattr = gfs2_getattr, | 1168 | .getattr = gfs2_getattr, |
| 1175 | .setxattr = gfs2_setxattr, | 1169 | .setxattr = gfs2_setxattr, |
| @@ -1181,7 +1175,7 @@ const struct inode_operations gfs2_dir_iops = { | |||
| 1181 | const struct inode_operations gfs2_symlink_iops = { | 1175 | const struct inode_operations gfs2_symlink_iops = { |
| 1182 | .readlink = gfs2_readlink, | 1176 | .readlink = gfs2_readlink, |
| 1183 | .follow_link = gfs2_follow_link, | 1177 | .follow_link = gfs2_follow_link, |
| 1184 | .permission = gfs2_iop_permission, | 1178 | .permission = gfs2_permission, |
| 1185 | .setattr = gfs2_setattr, | 1179 | .setattr = gfs2_setattr, |
| 1186 | .getattr = gfs2_getattr, | 1180 | .getattr = gfs2_getattr, |
| 1187 | .setxattr = gfs2_setxattr, | 1181 | .setxattr = gfs2_setxattr, |
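With ->permission in struct inode_operations no longer taking a nameidata, the gfs2_iop_permission() wrapper becomes dead weight and gfs2_permission() is wired into the three operations tables directly. A sketch of the new callback shape, using a hypothetical foofs_ prefix:

    #include <linux/fs.h>

    /* Hypothetical example; filesystem-specific checks would precede the
     * generic fallback. generic_permission() still takes a check_acl hook. */
    static int foofs_permission(struct inode *inode, int mask)
    {
            return generic_permission(inode, mask, NULL);
    }

    static const struct inode_operations foofs_file_iops = {
            .permission     = foofs_permission,
    };

hostfs, hfs, hfsplus and hppfs receive the same two-argument conversion further down.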
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 63a8a902d9db..ca831991cbc2 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c | |||
| @@ -389,7 +389,7 @@ int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) | |||
| 389 | break; | 389 | break; |
| 390 | 390 | ||
| 391 | INIT_LIST_HEAD(&jd->extent_list); | 391 | INIT_LIST_HEAD(&jd->extent_list); |
| 392 | jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1, NULL); | 392 | jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); |
| 393 | if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { | 393 | if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { |
| 394 | if (!jd->jd_inode) | 394 | if (!jd->jd_inode) |
| 395 | error = -ENOENT; | 395 | error = -ENOENT; |
diff --git a/fs/hfs/bitmap.c b/fs/hfs/bitmap.c index 24e75798ddf0..c6e97366e8ac 100644 --- a/fs/hfs/bitmap.c +++ b/fs/hfs/bitmap.c | |||
| @@ -145,7 +145,7 @@ u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits) | |||
| 145 | if (!*num_bits) | 145 | if (!*num_bits) |
| 146 | return 0; | 146 | return 0; |
| 147 | 147 | ||
| 148 | down(&HFS_SB(sb)->bitmap_lock); | 148 | mutex_lock(&HFS_SB(sb)->bitmap_lock); |
| 149 | bitmap = HFS_SB(sb)->bitmap; | 149 | bitmap = HFS_SB(sb)->bitmap; |
| 150 | 150 | ||
| 151 | pos = hfs_find_set_zero_bits(bitmap, HFS_SB(sb)->fs_ablocks, goal, num_bits); | 151 | pos = hfs_find_set_zero_bits(bitmap, HFS_SB(sb)->fs_ablocks, goal, num_bits); |
| @@ -162,7 +162,7 @@ u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits) | |||
| 162 | HFS_SB(sb)->free_ablocks -= *num_bits; | 162 | HFS_SB(sb)->free_ablocks -= *num_bits; |
| 163 | hfs_bitmap_dirty(sb); | 163 | hfs_bitmap_dirty(sb); |
| 164 | out: | 164 | out: |
| 165 | up(&HFS_SB(sb)->bitmap_lock); | 165 | mutex_unlock(&HFS_SB(sb)->bitmap_lock); |
| 166 | return pos; | 166 | return pos; |
| 167 | } | 167 | } |
| 168 | 168 | ||
| @@ -205,7 +205,7 @@ int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count) | |||
| 205 | if ((start + count) > HFS_SB(sb)->fs_ablocks) | 205 | if ((start + count) > HFS_SB(sb)->fs_ablocks) |
| 206 | return -2; | 206 | return -2; |
| 207 | 207 | ||
| 208 | down(&HFS_SB(sb)->bitmap_lock); | 208 | mutex_lock(&HFS_SB(sb)->bitmap_lock); |
| 209 | /* bitmap is always on a 32-bit boundary */ | 209 | /* bitmap is always on a 32-bit boundary */ |
| 210 | curr = HFS_SB(sb)->bitmap + (start / 32); | 210 | curr = HFS_SB(sb)->bitmap + (start / 32); |
| 211 | len = count; | 211 | len = count; |
| @@ -236,7 +236,7 @@ int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count) | |||
| 236 | } | 236 | } |
| 237 | out: | 237 | out: |
| 238 | HFS_SB(sb)->free_ablocks += len; | 238 | HFS_SB(sb)->free_ablocks += len; |
| 239 | up(&HFS_SB(sb)->bitmap_lock); | 239 | mutex_unlock(&HFS_SB(sb)->bitmap_lock); |
| 240 | hfs_bitmap_dirty(sb); | 240 | hfs_bitmap_dirty(sb); |
| 241 | 241 | ||
| 242 | return 0; | 242 | return 0; |
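hfs (and hfsplus below) converts bitmap_lock and extents_lock from semaphores used as mutexes into real struct mutex objects, which buys lockdep coverage and a cheaper uncontended path. The conversion idiom, sketched with an illustrative example_lock:

    #include <linux/mutex.h>

    struct example_state {
            struct mutex example_lock;              /* was: struct semaphore */
    };

    static void example_init(struct example_state *s)
    {
            mutex_init(&s->example_lock);           /* was: init_MUTEX() */
    }

    static void example_locked_op(struct example_state *s)
    {
            mutex_lock(&s->example_lock);           /* was: down() */
            /* ... state serialized by the lock ... */
            mutex_unlock(&s->example_lock);         /* was: up() */
    }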
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index f6621a785202..9b9d6395bad3 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c | |||
| @@ -40,7 +40,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke | |||
| 40 | { | 40 | { |
| 41 | struct hfs_mdb *mdb = HFS_SB(sb)->mdb; | 41 | struct hfs_mdb *mdb = HFS_SB(sb)->mdb; |
| 42 | HFS_I(tree->inode)->flags = 0; | 42 | HFS_I(tree->inode)->flags = 0; |
| 43 | init_MUTEX(&HFS_I(tree->inode)->extents_lock); | 43 | mutex_init(&HFS_I(tree->inode)->extents_lock); |
| 44 | switch (id) { | 44 | switch (id) { |
| 45 | case HFS_EXT_CNID: | 45 | case HFS_EXT_CNID: |
| 46 | hfs_inode_read_fork(tree->inode, mdb->drXTExtRec, mdb->drXTFlSize, | 46 | hfs_inode_read_fork(tree->inode, mdb->drXTExtRec, mdb->drXTFlSize, |
diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c index c176f67ba0a5..2c16316d2917 100644 --- a/fs/hfs/extent.c +++ b/fs/hfs/extent.c | |||
| @@ -343,16 +343,16 @@ int hfs_get_block(struct inode *inode, sector_t block, | |||
| 343 | goto done; | 343 | goto done; |
| 344 | } | 344 | } |
| 345 | 345 | ||
| 346 | down(&HFS_I(inode)->extents_lock); | 346 | mutex_lock(&HFS_I(inode)->extents_lock); |
| 347 | res = hfs_ext_read_extent(inode, ablock); | 347 | res = hfs_ext_read_extent(inode, ablock); |
| 348 | if (!res) | 348 | if (!res) |
| 349 | dblock = hfs_ext_find_block(HFS_I(inode)->cached_extents, | 349 | dblock = hfs_ext_find_block(HFS_I(inode)->cached_extents, |
| 350 | ablock - HFS_I(inode)->cached_start); | 350 | ablock - HFS_I(inode)->cached_start); |
| 351 | else { | 351 | else { |
| 352 | up(&HFS_I(inode)->extents_lock); | 352 | mutex_unlock(&HFS_I(inode)->extents_lock); |
| 353 | return -EIO; | 353 | return -EIO; |
| 354 | } | 354 | } |
| 355 | up(&HFS_I(inode)->extents_lock); | 355 | mutex_unlock(&HFS_I(inode)->extents_lock); |
| 356 | 356 | ||
| 357 | done: | 357 | done: |
| 358 | map_bh(bh_result, sb, HFS_SB(sb)->fs_start + | 358 | map_bh(bh_result, sb, HFS_SB(sb)->fs_start + |
| @@ -375,7 +375,7 @@ int hfs_extend_file(struct inode *inode) | |||
| 375 | u32 start, len, goal; | 375 | u32 start, len, goal; |
| 376 | int res; | 376 | int res; |
| 377 | 377 | ||
| 378 | down(&HFS_I(inode)->extents_lock); | 378 | mutex_lock(&HFS_I(inode)->extents_lock); |
| 379 | if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks) | 379 | if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks) |
| 380 | goal = hfs_ext_lastblock(HFS_I(inode)->first_extents); | 380 | goal = hfs_ext_lastblock(HFS_I(inode)->first_extents); |
| 381 | else { | 381 | else { |
| @@ -425,7 +425,7 @@ int hfs_extend_file(struct inode *inode) | |||
| 425 | goto insert_extent; | 425 | goto insert_extent; |
| 426 | } | 426 | } |
| 427 | out: | 427 | out: |
| 428 | up(&HFS_I(inode)->extents_lock); | 428 | mutex_unlock(&HFS_I(inode)->extents_lock); |
| 429 | if (!res) { | 429 | if (!res) { |
| 430 | HFS_I(inode)->alloc_blocks += len; | 430 | HFS_I(inode)->alloc_blocks += len; |
| 431 | mark_inode_dirty(inode); | 431 | mark_inode_dirty(inode); |
| @@ -487,7 +487,7 @@ void hfs_file_truncate(struct inode *inode) | |||
| 487 | if (blk_cnt == alloc_cnt) | 487 | if (blk_cnt == alloc_cnt) |
| 488 | goto out; | 488 | goto out; |
| 489 | 489 | ||
| 490 | down(&HFS_I(inode)->extents_lock); | 490 | mutex_lock(&HFS_I(inode)->extents_lock); |
| 491 | hfs_find_init(HFS_SB(sb)->ext_tree, &fd); | 491 | hfs_find_init(HFS_SB(sb)->ext_tree, &fd); |
| 492 | while (1) { | 492 | while (1) { |
| 493 | if (alloc_cnt == HFS_I(inode)->first_blocks) { | 493 | if (alloc_cnt == HFS_I(inode)->first_blocks) { |
| @@ -514,7 +514,7 @@ void hfs_file_truncate(struct inode *inode) | |||
| 514 | hfs_brec_remove(&fd); | 514 | hfs_brec_remove(&fd); |
| 515 | } | 515 | } |
| 516 | hfs_find_exit(&fd); | 516 | hfs_find_exit(&fd); |
| 517 | up(&HFS_I(inode)->extents_lock); | 517 | mutex_unlock(&HFS_I(inode)->extents_lock); |
| 518 | 518 | ||
| 519 | HFS_I(inode)->alloc_blocks = blk_cnt; | 519 | HFS_I(inode)->alloc_blocks = blk_cnt; |
| 520 | out: | 520 | out: |
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 147374b6f675..9955232fdf8c 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h | |||
| @@ -11,6 +11,7 @@ | |||
| 11 | 11 | ||
| 12 | #include <linux/slab.h> | 12 | #include <linux/slab.h> |
| 13 | #include <linux/types.h> | 13 | #include <linux/types.h> |
| 14 | #include <linux/mutex.h> | ||
| 14 | #include <linux/buffer_head.h> | 15 | #include <linux/buffer_head.h> |
| 15 | #include <linux/fs.h> | 16 | #include <linux/fs.h> |
| 16 | 17 | ||
| @@ -53,7 +54,7 @@ struct hfs_inode_info { | |||
| 53 | struct list_head open_dir_list; | 54 | struct list_head open_dir_list; |
| 54 | struct inode *rsrc_inode; | 55 | struct inode *rsrc_inode; |
| 55 | 56 | ||
| 56 | struct semaphore extents_lock; | 57 | struct mutex extents_lock; |
| 57 | 58 | ||
| 58 | u16 alloc_blocks, clump_blocks; | 59 | u16 alloc_blocks, clump_blocks; |
| 59 | sector_t fs_blocks; | 60 | sector_t fs_blocks; |
| @@ -139,7 +140,7 @@ struct hfs_sb_info { | |||
| 139 | 140 | ||
| 140 | struct nls_table *nls_io, *nls_disk; | 141 | struct nls_table *nls_io, *nls_disk; |
| 141 | 142 | ||
| 142 | struct semaphore bitmap_lock; | 143 | struct mutex bitmap_lock; |
| 143 | 144 | ||
| 144 | unsigned long flags; | 145 | unsigned long flags; |
| 145 | 146 | ||
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 97f8446c4ff4..7e19835efa2e 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c | |||
| @@ -150,7 +150,7 @@ struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, int mode) | |||
| 150 | if (!inode) | 150 | if (!inode) |
| 151 | return NULL; | 151 | return NULL; |
| 152 | 152 | ||
| 153 | init_MUTEX(&HFS_I(inode)->extents_lock); | 153 | mutex_init(&HFS_I(inode)->extents_lock); |
| 154 | INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list); | 154 | INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list); |
| 155 | hfs_cat_build_key(sb, (btree_key *)&HFS_I(inode)->cat_key, dir->i_ino, name); | 155 | hfs_cat_build_key(sb, (btree_key *)&HFS_I(inode)->cat_key, dir->i_ino, name); |
| 156 | inode->i_ino = HFS_SB(sb)->next_id++; | 156 | inode->i_ino = HFS_SB(sb)->next_id++; |
| @@ -281,7 +281,7 @@ static int hfs_read_inode(struct inode *inode, void *data) | |||
| 281 | 281 | ||
| 282 | HFS_I(inode)->flags = 0; | 282 | HFS_I(inode)->flags = 0; |
| 283 | HFS_I(inode)->rsrc_inode = NULL; | 283 | HFS_I(inode)->rsrc_inode = NULL; |
| 284 | init_MUTEX(&HFS_I(inode)->extents_lock); | 284 | mutex_init(&HFS_I(inode)->extents_lock); |
| 285 | INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list); | 285 | INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list); |
| 286 | 286 | ||
| 287 | /* Initialize the inode */ | 287 | /* Initialize the inode */ |
| @@ -511,8 +511,7 @@ void hfs_clear_inode(struct inode *inode) | |||
| 511 | } | 511 | } |
| 512 | } | 512 | } |
| 513 | 513 | ||
| 514 | static int hfs_permission(struct inode *inode, int mask, | 514 | static int hfs_permission(struct inode *inode, int mask) |
| 515 | struct nameidata *nd) | ||
| 516 | { | 515 | { |
| 517 | if (S_ISREG(inode->i_mode) && mask & MAY_EXEC) | 516 | if (S_ISREG(inode->i_mode) && mask & MAY_EXEC) |
| 518 | return 0; | 517 | return 0; |
| @@ -523,8 +522,6 @@ static int hfs_file_open(struct inode *inode, struct file *file) | |||
| 523 | { | 522 | { |
| 524 | if (HFS_IS_RSRC(inode)) | 523 | if (HFS_IS_RSRC(inode)) |
| 525 | inode = HFS_I(inode)->rsrc_inode; | 524 | inode = HFS_I(inode)->rsrc_inode; |
| 526 | if (atomic_read(&file->f_count) != 1) | ||
| 527 | return 0; | ||
| 528 | atomic_inc(&HFS_I(inode)->opencnt); | 525 | atomic_inc(&HFS_I(inode)->opencnt); |
| 529 | return 0; | 526 | return 0; |
| 530 | } | 527 | } |
| @@ -535,8 +532,6 @@ static int hfs_file_release(struct inode *inode, struct file *file) | |||
| 535 | 532 | ||
| 536 | if (HFS_IS_RSRC(inode)) | 533 | if (HFS_IS_RSRC(inode)) |
| 537 | inode = HFS_I(inode)->rsrc_inode; | 534 | inode = HFS_I(inode)->rsrc_inode; |
| 538 | if (atomic_read(&file->f_count) != 0) | ||
| 539 | return 0; | ||
| 540 | if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) { | 535 | if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) { |
| 541 | mutex_lock(&inode->i_mutex); | 536 | mutex_lock(&inode->i_mutex); |
| 542 | hfs_file_truncate(inode); | 537 | hfs_file_truncate(inode); |
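The hfs_file_open()/hfs_file_release() hunks drop the f_count heuristics: the per-inode opencnt is now bumped on every open and dropped on every release, so the truncate-on-last-close logic depends only on that counter (hfsplus gets the identical change below). A minimal sketch of the pairing; the example_* names and the i_private-based container lookup are placeholders, not hfs code:

    #include <linux/fs.h>
    #include <linux/mutex.h>

    struct example_inode_info {
            atomic_t opencnt;
    };

    static struct example_inode_info *EXAMPLE_I(struct inode *inode)
    {
            return inode->i_private;        /* placeholder container lookup */
    }

    static int example_open(struct inode *inode, struct file *file)
    {
            atomic_inc(&EXAMPLE_I(inode)->opencnt);
            return 0;
    }

    static int example_release(struct inode *inode, struct file *file)
    {
            if (atomic_dec_and_test(&EXAMPLE_I(inode)->opencnt)) {
                    mutex_lock(&inode->i_mutex);
                    /* drop preallocation here, as hfs_file_truncate() does */
                    mutex_unlock(&inode->i_mutex);
            }
            return 0;
    }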
diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 8cf67974adf6..4abb1047c689 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c | |||
| @@ -372,7 +372,7 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) | |||
| 372 | 372 | ||
| 373 | sb->s_op = &hfs_super_operations; | 373 | sb->s_op = &hfs_super_operations; |
| 374 | sb->s_flags |= MS_NODIRATIME; | 374 | sb->s_flags |= MS_NODIRATIME; |
| 375 | init_MUTEX(&sbi->bitmap_lock); | 375 | mutex_init(&sbi->bitmap_lock); |
| 376 | 376 | ||
| 377 | res = hfs_mdb_get(sb); | 377 | res = hfs_mdb_get(sb); |
| 378 | if (res) { | 378 | if (res) { |
| @@ -432,7 +432,7 @@ static struct file_system_type hfs_fs_type = { | |||
| 432 | .fs_flags = FS_REQUIRES_DEV, | 432 | .fs_flags = FS_REQUIRES_DEV, |
| 433 | }; | 433 | }; |
| 434 | 434 | ||
| 435 | static void hfs_init_once(struct kmem_cache *cachep, void *p) | 435 | static void hfs_init_once(void *p) |
| 436 | { | 436 | { |
| 437 | struct hfs_inode_info *i = p; | 437 | struct hfs_inode_info *i = p; |
| 438 | 438 | ||
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index 12e899cd7886..fec8f61227ff 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c | |||
| @@ -199,16 +199,16 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, | |||
| 199 | goto done; | 199 | goto done; |
| 200 | } | 200 | } |
| 201 | 201 | ||
| 202 | down(&HFSPLUS_I(inode).extents_lock); | 202 | mutex_lock(&HFSPLUS_I(inode).extents_lock); |
| 203 | res = hfsplus_ext_read_extent(inode, ablock); | 203 | res = hfsplus_ext_read_extent(inode, ablock); |
| 204 | if (!res) { | 204 | if (!res) { |
| 205 | dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).cached_extents, ablock - | 205 | dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).cached_extents, ablock - |
| 206 | HFSPLUS_I(inode).cached_start); | 206 | HFSPLUS_I(inode).cached_start); |
| 207 | } else { | 207 | } else { |
| 208 | up(&HFSPLUS_I(inode).extents_lock); | 208 | mutex_unlock(&HFSPLUS_I(inode).extents_lock); |
| 209 | return -EIO; | 209 | return -EIO; |
| 210 | } | 210 | } |
| 211 | up(&HFSPLUS_I(inode).extents_lock); | 211 | mutex_unlock(&HFSPLUS_I(inode).extents_lock); |
| 212 | 212 | ||
| 213 | done: | 213 | done: |
| 214 | dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock); | 214 | dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock); |
| @@ -355,7 +355,7 @@ int hfsplus_file_extend(struct inode *inode) | |||
| 355 | return -ENOSPC; | 355 | return -ENOSPC; |
| 356 | } | 356 | } |
| 357 | 357 | ||
| 358 | down(&HFSPLUS_I(inode).extents_lock); | 358 | mutex_lock(&HFSPLUS_I(inode).extents_lock); |
| 359 | if (HFSPLUS_I(inode).alloc_blocks == HFSPLUS_I(inode).first_blocks) | 359 | if (HFSPLUS_I(inode).alloc_blocks == HFSPLUS_I(inode).first_blocks) |
| 360 | goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).first_extents); | 360 | goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).first_extents); |
| 361 | else { | 361 | else { |
| @@ -408,7 +408,7 @@ int hfsplus_file_extend(struct inode *inode) | |||
| 408 | goto insert_extent; | 408 | goto insert_extent; |
| 409 | } | 409 | } |
| 410 | out: | 410 | out: |
| 411 | up(&HFSPLUS_I(inode).extents_lock); | 411 | mutex_unlock(&HFSPLUS_I(inode).extents_lock); |
| 412 | if (!res) { | 412 | if (!res) { |
| 413 | HFSPLUS_I(inode).alloc_blocks += len; | 413 | HFSPLUS_I(inode).alloc_blocks += len; |
| 414 | mark_inode_dirty(inode); | 414 | mark_inode_dirty(inode); |
| @@ -465,7 +465,7 @@ void hfsplus_file_truncate(struct inode *inode) | |||
| 465 | if (blk_cnt == alloc_cnt) | 465 | if (blk_cnt == alloc_cnt) |
| 466 | goto out; | 466 | goto out; |
| 467 | 467 | ||
| 468 | down(&HFSPLUS_I(inode).extents_lock); | 468 | mutex_lock(&HFSPLUS_I(inode).extents_lock); |
| 469 | hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd); | 469 | hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd); |
| 470 | while (1) { | 470 | while (1) { |
| 471 | if (alloc_cnt == HFSPLUS_I(inode).first_blocks) { | 471 | if (alloc_cnt == HFSPLUS_I(inode).first_blocks) { |
| @@ -492,7 +492,7 @@ void hfsplus_file_truncate(struct inode *inode) | |||
| 492 | hfs_brec_remove(&fd); | 492 | hfs_brec_remove(&fd); |
| 493 | } | 493 | } |
| 494 | hfs_find_exit(&fd); | 494 | hfs_find_exit(&fd); |
| 495 | up(&HFSPLUS_I(inode).extents_lock); | 495 | mutex_unlock(&HFSPLUS_I(inode).extents_lock); |
| 496 | 496 | ||
| 497 | HFSPLUS_I(inode).alloc_blocks = blk_cnt; | 497 | HFSPLUS_I(inode).alloc_blocks = blk_cnt; |
| 498 | out: | 498 | out: |
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 9e59537b43d5..f027a905225f 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h | |||
| @@ -11,6 +11,7 @@ | |||
| 11 | #define _LINUX_HFSPLUS_FS_H | 11 | #define _LINUX_HFSPLUS_FS_H |
| 12 | 12 | ||
| 13 | #include <linux/fs.h> | 13 | #include <linux/fs.h> |
| 14 | #include <linux/mutex.h> | ||
| 14 | #include <linux/buffer_head.h> | 15 | #include <linux/buffer_head.h> |
| 15 | #include "hfsplus_raw.h" | 16 | #include "hfsplus_raw.h" |
| 16 | 17 | ||
| @@ -154,7 +155,7 @@ struct hfsplus_sb_info { | |||
| 154 | 155 | ||
| 155 | 156 | ||
| 156 | struct hfsplus_inode_info { | 157 | struct hfsplus_inode_info { |
| 157 | struct semaphore extents_lock; | 158 | struct mutex extents_lock; |
| 158 | u32 clump_blocks, alloc_blocks; | 159 | u32 clump_blocks, alloc_blocks; |
| 159 | sector_t fs_blocks; | 160 | sector_t fs_blocks; |
| 160 | /* Allocation extents from catalog record or volume header */ | 161 | /* Allocation extents from catalog record or volume header */ |
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 67e1c8b467c4..b085d64a2b67 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c | |||
| @@ -163,7 +163,7 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent | |||
| 163 | 163 | ||
| 164 | inode->i_ino = dir->i_ino; | 164 | inode->i_ino = dir->i_ino; |
| 165 | INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); | 165 | INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); |
| 166 | init_MUTEX(&HFSPLUS_I(inode).extents_lock); | 166 | mutex_init(&HFSPLUS_I(inode).extents_lock); |
| 167 | HFSPLUS_I(inode).flags = HFSPLUS_FLG_RSRC; | 167 | HFSPLUS_I(inode).flags = HFSPLUS_FLG_RSRC; |
| 168 | 168 | ||
| 169 | hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); | 169 | hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); |
| @@ -238,7 +238,7 @@ static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms) | |||
| 238 | perms->dev = cpu_to_be32(HFSPLUS_I(inode).dev); | 238 | perms->dev = cpu_to_be32(HFSPLUS_I(inode).dev); |
| 239 | } | 239 | } |
| 240 | 240 | ||
| 241 | static int hfsplus_permission(struct inode *inode, int mask, struct nameidata *nd) | 241 | static int hfsplus_permission(struct inode *inode, int mask) |
| 242 | { | 242 | { |
| 243 | /* MAY_EXEC is also used for lookup, if no x bit is set allow lookup, | 243 | /* MAY_EXEC is also used for lookup, if no x bit is set allow lookup, |
| 244 | * open_exec has the same test, so it's still not executable, if a x bit | 244 | * open_exec has the same test, so it's still not executable, if a x bit |
| @@ -254,8 +254,6 @@ static int hfsplus_file_open(struct inode *inode, struct file *file) | |||
| 254 | { | 254 | { |
| 255 | if (HFSPLUS_IS_RSRC(inode)) | 255 | if (HFSPLUS_IS_RSRC(inode)) |
| 256 | inode = HFSPLUS_I(inode).rsrc_inode; | 256 | inode = HFSPLUS_I(inode).rsrc_inode; |
| 257 | if (atomic_read(&file->f_count) != 1) | ||
| 258 | return 0; | ||
| 259 | atomic_inc(&HFSPLUS_I(inode).opencnt); | 257 | atomic_inc(&HFSPLUS_I(inode).opencnt); |
| 260 | return 0; | 258 | return 0; |
| 261 | } | 259 | } |
| @@ -266,8 +264,6 @@ static int hfsplus_file_release(struct inode *inode, struct file *file) | |||
| 266 | 264 | ||
| 267 | if (HFSPLUS_IS_RSRC(inode)) | 265 | if (HFSPLUS_IS_RSRC(inode)) |
| 268 | inode = HFSPLUS_I(inode).rsrc_inode; | 266 | inode = HFSPLUS_I(inode).rsrc_inode; |
| 269 | if (atomic_read(&file->f_count) != 0) | ||
| 270 | return 0; | ||
| 271 | if (atomic_dec_and_test(&HFSPLUS_I(inode).opencnt)) { | 267 | if (atomic_dec_and_test(&HFSPLUS_I(inode).opencnt)) { |
| 272 | mutex_lock(&inode->i_mutex); | 268 | mutex_lock(&inode->i_mutex); |
| 273 | hfsplus_file_truncate(inode); | 269 | hfsplus_file_truncate(inode); |
| @@ -316,7 +312,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, int mode) | |||
| 316 | inode->i_nlink = 1; | 312 | inode->i_nlink = 1; |
| 317 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; | 313 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; |
| 318 | INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); | 314 | INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); |
| 319 | init_MUTEX(&HFSPLUS_I(inode).extents_lock); | 315 | mutex_init(&HFSPLUS_I(inode).extents_lock); |
| 320 | atomic_set(&HFSPLUS_I(inode).opencnt, 0); | 316 | atomic_set(&HFSPLUS_I(inode).opencnt, 0); |
| 321 | HFSPLUS_I(inode).flags = 0; | 317 | HFSPLUS_I(inode).flags = 0; |
| 322 | memset(HFSPLUS_I(inode).first_extents, 0, sizeof(hfsplus_extent_rec)); | 318 | memset(HFSPLUS_I(inode).first_extents, 0, sizeof(hfsplus_extent_rec)); |
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index ce97a54518d8..e834e578c93f 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c | |||
| @@ -34,7 +34,7 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) | |||
| 34 | return inode; | 34 | return inode; |
| 35 | 35 | ||
| 36 | INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); | 36 | INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); |
| 37 | init_MUTEX(&HFSPLUS_I(inode).extents_lock); | 37 | mutex_init(&HFSPLUS_I(inode).extents_lock); |
| 38 | HFSPLUS_I(inode).flags = 0; | 38 | HFSPLUS_I(inode).flags = 0; |
| 39 | HFSPLUS_I(inode).rsrc_inode = NULL; | 39 | HFSPLUS_I(inode).rsrc_inode = NULL; |
| 40 | atomic_set(&HFSPLUS_I(inode).opencnt, 0); | 40 | atomic_set(&HFSPLUS_I(inode).opencnt, 0); |
| @@ -485,7 +485,7 @@ static struct file_system_type hfsplus_fs_type = { | |||
| 485 | .fs_flags = FS_REQUIRES_DEV, | 485 | .fs_flags = FS_REQUIRES_DEV, |
| 486 | }; | 486 | }; |
| 487 | 487 | ||
| 488 | static void hfsplus_init_once(struct kmem_cache *cachep, void *p) | 488 | static void hfsplus_init_once(void *p) |
| 489 | { | 489 | { |
| 490 | struct hfsplus_inode_info *i = p; | 490 | struct hfsplus_inode_info *i = p; |
| 491 | 491 | ||
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 5222345ddccf..d6ecabf4d231 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c | |||
| @@ -822,7 +822,7 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from, | |||
| 822 | return err; | 822 | return err; |
| 823 | } | 823 | } |
| 824 | 824 | ||
| 825 | int hostfs_permission(struct inode *ino, int desired, struct nameidata *nd) | 825 | int hostfs_permission(struct inode *ino, int desired) |
| 826 | { | 826 | { |
| 827 | char *name; | 827 | char *name; |
| 828 | int r = 0, w = 0, x = 0, err; | 828 | int r = 0, w = 0, x = 0, err; |
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index d256559b4104..d9c59a775449 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c | |||
| @@ -415,7 +415,7 @@ again: | |||
| 415 | d_drop(dentry); | 415 | d_drop(dentry); |
| 416 | spin_lock(&dentry->d_lock); | 416 | spin_lock(&dentry->d_lock); |
| 417 | if (atomic_read(&dentry->d_count) > 1 || | 417 | if (atomic_read(&dentry->d_count) > 1 || |
| 418 | permission(inode, MAY_WRITE, NULL) || | 418 | generic_permission(inode, MAY_WRITE, NULL) || |
| 419 | !S_ISREG(inode->i_mode) || | 419 | !S_ISREG(inode->i_mode) || |
| 420 | get_write_access(inode)) { | 420 | get_write_access(inode)) { |
| 421 | spin_unlock(&dentry->d_lock); | 421 | spin_unlock(&dentry->d_lock); |
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index f63a699ec659..b8ae9c90ada0 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c | |||
| @@ -173,7 +173,7 @@ static void hpfs_destroy_inode(struct inode *inode) | |||
| 173 | kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode)); | 173 | kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode)); |
| 174 | } | 174 | } |
| 175 | 175 | ||
| 176 | static void init_once(struct kmem_cache *cachep, void *foo) | 176 | static void init_once(void *foo) |
| 177 | { | 177 | { |
| 178 | struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo; | 178 | struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo; |
| 179 | 179 | ||
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 65077aa90f0a..2b3d1828db99 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c | |||
| @@ -655,20 +655,13 @@ static void *hppfs_follow_link(struct dentry *dentry, struct nameidata *nd) | |||
| 655 | return proc_dentry->d_inode->i_op->follow_link(proc_dentry, nd); | 655 | return proc_dentry->d_inode->i_op->follow_link(proc_dentry, nd); |
| 656 | } | 656 | } |
| 657 | 657 | ||
| 658 | int hppfs_permission(struct inode *inode, int mask, struct nameidata *nd) | ||
| 659 | { | ||
| 660 | return generic_permission(inode, mask, NULL); | ||
| 661 | } | ||
| 662 | |||
| 663 | static const struct inode_operations hppfs_dir_iops = { | 658 | static const struct inode_operations hppfs_dir_iops = { |
| 664 | .lookup = hppfs_lookup, | 659 | .lookup = hppfs_lookup, |
| 665 | .permission = hppfs_permission, | ||
| 666 | }; | 660 | }; |
| 667 | 661 | ||
| 668 | static const struct inode_operations hppfs_link_iops = { | 662 | static const struct inode_operations hppfs_link_iops = { |
| 669 | .readlink = hppfs_readlink, | 663 | .readlink = hppfs_readlink, |
| 670 | .follow_link = hppfs_follow_link, | 664 | .follow_link = hppfs_follow_link, |
| 671 | .permission = hppfs_permission, | ||
| 672 | }; | 665 | }; |
| 673 | 666 | ||
| 674 | static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) | 667 | static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) |
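hppfs can drop hppfs_permission() entirely because a NULL ->permission makes the VFS fall back to generic_permission() on its own; the wrapper added nothing. Roughly (a simplified approximation, not the literal fs/namei.c code):

    #include <linux/fs.h>

    /* Illustration of the dispatch that makes the wrapper redundant. */
    static int example_permission_dispatch(struct inode *inode, int mask)
    {
            if (inode->i_op && inode->i_op->permission)
                    return inode->i_op->permission(inode, mask);
            return generic_permission(inode, mask, NULL);
    }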
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index dbd01d262ca4..3f58923fb39b 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c | |||
| @@ -705,7 +705,7 @@ static const struct address_space_operations hugetlbfs_aops = { | |||
| 705 | }; | 705 | }; |
| 706 | 706 | ||
| 707 | 707 | ||
| 708 | static void init_once(struct kmem_cache *cachep, void *foo) | 708 | static void init_once(void *foo) |
| 709 | { | 709 | { |
| 710 | struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo; | 710 | struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo; |
| 711 | 711 | ||
diff --git a/fs/inode.c b/fs/inode.c index c36d9480335c..0487ddba1397 100644 --- a/fs/inode.c +++ b/fs/inode.c | |||
| @@ -166,6 +166,7 @@ static struct inode *alloc_inode(struct super_block *sb) | |||
| 166 | mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE); | 166 | mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE); |
| 167 | mapping->assoc_mapping = NULL; | 167 | mapping->assoc_mapping = NULL; |
| 168 | mapping->backing_dev_info = &default_backing_dev_info; | 168 | mapping->backing_dev_info = &default_backing_dev_info; |
| 169 | mapping->writeback_index = 0; | ||
| 169 | 170 | ||
| 170 | /* | 171 | /* |
| 171 | * If the block_device provides a backing_dev_info for client | 172 | * If the block_device provides a backing_dev_info for client |
| @@ -209,7 +210,7 @@ void inode_init_once(struct inode *inode) | |||
| 209 | INIT_LIST_HEAD(&inode->i_dentry); | 210 | INIT_LIST_HEAD(&inode->i_dentry); |
| 210 | INIT_LIST_HEAD(&inode->i_devices); | 211 | INIT_LIST_HEAD(&inode->i_devices); |
| 211 | INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); | 212 | INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); |
| 212 | rwlock_init(&inode->i_data.tree_lock); | 213 | spin_lock_init(&inode->i_data.tree_lock); |
| 213 | spin_lock_init(&inode->i_data.i_mmap_lock); | 214 | spin_lock_init(&inode->i_data.i_mmap_lock); |
| 214 | INIT_LIST_HEAD(&inode->i_data.private_list); | 215 | INIT_LIST_HEAD(&inode->i_data.private_list); |
| 215 | spin_lock_init(&inode->i_data.private_lock); | 216 | spin_lock_init(&inode->i_data.private_lock); |
| @@ -224,7 +225,7 @@ void inode_init_once(struct inode *inode) | |||
| 224 | 225 | ||
| 225 | EXPORT_SYMBOL(inode_init_once); | 226 | EXPORT_SYMBOL(inode_init_once); |
| 226 | 227 | ||
| 227 | static void init_once(struct kmem_cache * cachep, void *foo) | 228 | static void init_once(void *foo) |
| 228 | { | 229 | { |
| 229 | struct inode * inode = (struct inode *) foo; | 230 | struct inode * inode = (struct inode *) foo; |
| 230 | 231 | ||
diff --git a/fs/inotify_user.c b/fs/inotify_user.c index fe79c25d95dc..60249429a253 100644 --- a/fs/inotify_user.c +++ b/fs/inotify_user.c | |||
| @@ -354,20 +354,20 @@ static void inotify_dev_event_dequeue(struct inotify_device *dev) | |||
| 354 | } | 354 | } |
| 355 | 355 | ||
| 356 | /* | 356 | /* |
| 357 | * find_inode - resolve a user-given path to a specific inode and return a nd | 357 | * find_inode - resolve a user-given path to a specific inode |
| 358 | */ | 358 | */ |
| 359 | static int find_inode(const char __user *dirname, struct nameidata *nd, | 359 | static int find_inode(const char __user *dirname, struct path *path, |
| 360 | unsigned flags) | 360 | unsigned flags) |
| 361 | { | 361 | { |
| 362 | int error; | 362 | int error; |
| 363 | 363 | ||
| 364 | error = __user_walk(dirname, flags, nd); | 364 | error = user_path_at(AT_FDCWD, dirname, flags, path); |
| 365 | if (error) | 365 | if (error) |
| 366 | return error; | 366 | return error; |
| 367 | /* you can only watch an inode if you have read permissions on it */ | 367 | /* you can only watch an inode if you have read permissions on it */ |
| 368 | error = vfs_permission(nd, MAY_READ); | 368 | error = inode_permission(path->dentry->d_inode, MAY_READ); |
| 369 | if (error) | 369 | if (error) |
| 370 | path_put(&nd->path); | 370 | path_put(path); |
| 371 | return error; | 371 | return error; |
| 372 | } | 372 | } |
| 373 | 373 | ||
| @@ -650,11 +650,11 @@ asmlinkage long sys_inotify_init(void) | |||
| 650 | return sys_inotify_init1(0); | 650 | return sys_inotify_init1(0); |
| 651 | } | 651 | } |
| 652 | 652 | ||
| 653 | asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask) | 653 | asmlinkage long sys_inotify_add_watch(int fd, const char __user *pathname, u32 mask) |
| 654 | { | 654 | { |
| 655 | struct inode *inode; | 655 | struct inode *inode; |
| 656 | struct inotify_device *dev; | 656 | struct inotify_device *dev; |
| 657 | struct nameidata nd; | 657 | struct path path; |
| 658 | struct file *filp; | 658 | struct file *filp; |
| 659 | int ret, fput_needed; | 659 | int ret, fput_needed; |
| 660 | unsigned flags = 0; | 660 | unsigned flags = 0; |
| @@ -674,12 +674,12 @@ asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask) | |||
| 674 | if (mask & IN_ONLYDIR) | 674 | if (mask & IN_ONLYDIR) |
| 675 | flags |= LOOKUP_DIRECTORY; | 675 | flags |= LOOKUP_DIRECTORY; |
| 676 | 676 | ||
| 677 | ret = find_inode(path, &nd, flags); | 677 | ret = find_inode(pathname, &path, flags); |
| 678 | if (unlikely(ret)) | 678 | if (unlikely(ret)) |
| 679 | goto fput_and_out; | 679 | goto fput_and_out; |
| 680 | 680 | ||
| 681 | /* inode held in place by reference to nd; dev by fget on fd */ | 681 | /* inode held in place by reference to path; dev by fget on fd */ |
| 682 | inode = nd.path.dentry->d_inode; | 682 | inode = path.dentry->d_inode; |
| 683 | dev = filp->private_data; | 683 | dev = filp->private_data; |
| 684 | 684 | ||
| 685 | mutex_lock(&dev->up_mutex); | 685 | mutex_lock(&dev->up_mutex); |
| @@ -688,7 +688,7 @@ asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask) | |||
| 688 | ret = create_watch(dev, inode, mask); | 688 | ret = create_watch(dev, inode, mask); |
| 689 | mutex_unlock(&dev->up_mutex); | 689 | mutex_unlock(&dev->up_mutex); |
| 690 | 690 | ||
| 691 | path_put(&nd.path); | 691 | path_put(&path); |
| 692 | fput_and_out: | 692 | fput_and_out: |
| 693 | fput_light(filp, fput_needed); | 693 | fput_light(filp, fput_needed); |
| 694 | return ret; | 694 | return ret; |
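sys_inotify_add_watch() now resolves the user path into a struct path with user_path_at() and checks access with inode_permission(), instead of carrying a full nameidata and calling vfs_permission(); the reference is dropped with path_put() once the watch exists. A sketch of the pattern, with error handling trimmed and a hypothetical name:

    #include <linux/fs.h>
    #include <linux/namei.h>
    #include <linux/fcntl.h>

    static int example_resolve_and_check(const char __user *name, unsigned flags)
    {
            struct path path;
            int err;

            err = user_path_at(AT_FDCWD, name, flags, &path);
            if (err)
                    return err;

            err = inode_permission(path.dentry->d_inode, MAY_READ);
            if (!err) {
                    /* ... use path.dentry->d_inode while the ref is held ... */
            }
            path_put(&path);
            return err;
    }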
diff --git a/fs/ioprio.c b/fs/ioprio.c index c4a1c3c65aac..da3cc460d4df 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c | |||
| @@ -115,11 +115,11 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio) | |||
| 115 | pgrp = task_pgrp(current); | 115 | pgrp = task_pgrp(current); |
| 116 | else | 116 | else |
| 117 | pgrp = find_vpid(who); | 117 | pgrp = find_vpid(who); |
| 118 | do_each_pid_task(pgrp, PIDTYPE_PGID, p) { | 118 | do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { |
| 119 | ret = set_task_ioprio(p, ioprio); | 119 | ret = set_task_ioprio(p, ioprio); |
| 120 | if (ret) | 120 | if (ret) |
| 121 | break; | 121 | break; |
| 122 | } while_each_pid_task(pgrp, PIDTYPE_PGID, p); | 122 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); |
| 123 | break; | 123 | break; |
| 124 | case IOPRIO_WHO_USER: | 124 | case IOPRIO_WHO_USER: |
| 125 | if (!who) | 125 | if (!who) |
| @@ -204,7 +204,7 @@ asmlinkage long sys_ioprio_get(int which, int who) | |||
| 204 | pgrp = task_pgrp(current); | 204 | pgrp = task_pgrp(current); |
| 205 | else | 205 | else |
| 206 | pgrp = find_vpid(who); | 206 | pgrp = find_vpid(who); |
| 207 | do_each_pid_task(pgrp, PIDTYPE_PGID, p) { | 207 | do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { |
| 208 | tmpio = get_task_ioprio(p); | 208 | tmpio = get_task_ioprio(p); |
| 209 | if (tmpio < 0) | 209 | if (tmpio < 0) |
| 210 | continue; | 210 | continue; |
| @@ -212,7 +212,7 @@ asmlinkage long sys_ioprio_get(int which, int who) | |||
| 212 | ret = tmpio; | 212 | ret = tmpio; |
| 213 | else | 213 | else |
| 214 | ret = ioprio_best(ret, tmpio); | 214 | ret = ioprio_best(ret, tmpio); |
| 215 | } while_each_pid_task(pgrp, PIDTYPE_PGID, p); | 215 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); |
| 216 | break; | 216 | break; |
| 217 | case IOPRIO_WHO_USER: | 217 | case IOPRIO_WHO_USER: |
| 218 | if (!who) | 218 | if (!who) |
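ioprio_set/ioprio_get switch from do_each_pid_task() to do_each_pid_thread() so that IOPRIO_WHO_PGRP applies to every thread in the process group, not just each thread-group leader. A sketch of the iteration (callers hold tasklist_lock or the RCU read lock, as the syscalls above do; the walker itself is hypothetical):

    #include <linux/pid.h>
    #include <linux/sched.h>

    static void example_walk_pgrp(struct pid *pgrp)
    {
            struct task_struct *p;

            do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
                    /* per-thread action goes here */
            } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
    }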
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 044a254d526b..26948a6033b6 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c | |||
| @@ -73,7 +73,7 @@ static void isofs_destroy_inode(struct inode *inode) | |||
| 73 | kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode)); | 73 | kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode)); |
| 74 | } | 74 | } |
| 75 | 75 | ||
| 76 | static void init_once(struct kmem_cache *cachep, void *foo) | 76 | static void init_once(void *foo) |
| 77 | { | 77 | { |
| 78 | struct iso_inode_info *ei = foo; | 78 | struct iso_inode_info *ei = foo; |
| 79 | 79 | ||
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index 6bd48f0a7047..c2fb2dd0131f 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c | |||
| @@ -209,6 +209,11 @@ repeat: | |||
| 209 | 209 | ||
| 210 | while (rs.len > 2) { /* There may be one byte for padding somewhere */ | 210 | while (rs.len > 2) { /* There may be one byte for padding somewhere */ |
| 211 | rr = (struct rock_ridge *)rs.chr; | 211 | rr = (struct rock_ridge *)rs.chr; |
| 212 | /* | ||
| 213 | * Ignore rock ridge info if rr->len is out of range, but | ||
| 214 | * don't return -EIO because that would make the file | ||
| 215 | * invisible. | ||
| 216 | */ | ||
| 212 | if (rr->len < 3) | 217 | if (rr->len < 3) |
| 213 | goto out; /* Something got screwed up here */ | 218 | goto out; /* Something got screwed up here */ |
| 214 | sig = isonum_721(rs.chr); | 219 | sig = isonum_721(rs.chr); |
| @@ -216,8 +221,12 @@ repeat: | |||
| 216 | goto eio; | 221 | goto eio; |
| 217 | rs.chr += rr->len; | 222 | rs.chr += rr->len; |
| 218 | rs.len -= rr->len; | 223 | rs.len -= rr->len; |
| 224 | /* | ||
| 225 | * As above, just ignore the rock ridge info if rr->len | ||
| 226 | * is bogus. | ||
| 227 | */ | ||
| 219 | if (rs.len < 0) | 228 | if (rs.len < 0) |
| 220 | goto eio; /* corrupted isofs */ | 229 | goto out; /* Something got screwed up here */ |
| 221 | 230 | ||
| 222 | switch (sig) { | 231 | switch (sig) { |
| 223 | case SIG('R', 'R'): | 232 | case SIG('R', 'R'): |
| @@ -307,6 +316,11 @@ parse_rock_ridge_inode_internal(struct iso_directory_record *de, | |||
| 307 | repeat: | 316 | repeat: |
| 308 | while (rs.len > 2) { /* There may be one byte for padding somewhere */ | 317 | while (rs.len > 2) { /* There may be one byte for padding somewhere */ |
| 309 | rr = (struct rock_ridge *)rs.chr; | 318 | rr = (struct rock_ridge *)rs.chr; |
| 319 | /* | ||
| 320 | * Ignore rock ridge info if rr->len is out of range, but | ||
| 321 | * don't return -EIO because that would make the file | ||
| 322 | * invisible. | ||
| 323 | */ | ||
| 310 | if (rr->len < 3) | 324 | if (rr->len < 3) |
| 311 | goto out; /* Something got screwed up here */ | 325 | goto out; /* Something got screwed up here */ |
| 312 | sig = isonum_721(rs.chr); | 326 | sig = isonum_721(rs.chr); |
| @@ -314,8 +328,12 @@ repeat: | |||
| 314 | goto eio; | 328 | goto eio; |
| 315 | rs.chr += rr->len; | 329 | rs.chr += rr->len; |
| 316 | rs.len -= rr->len; | 330 | rs.len -= rr->len; |
| 331 | /* | ||
| 332 | * As above, just ignore the rock ridge info if rr->len | ||
| 333 | * is bogus. | ||
| 334 | */ | ||
| 317 | if (rs.len < 0) | 335 | if (rs.len < 0) |
| 318 | goto eio; /* corrupted isofs */ | 336 | goto out; /* Something got screwed up here */ |
| 319 | 337 | ||
| 320 | switch (sig) { | 338 | switch (sig) { |
| 321 | #ifndef CONFIG_ZISOFS /* No flag for SF or ZF */ | 339 | #ifndef CONFIG_ZISOFS /* No flag for SF or ZF */ |
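Both Rock Ridge parsing loops now treat a bogus record length as "stop parsing this entry" rather than returning -EIO; a hard error here used to make the whole file invisible even though the plain ISO directory record is fine. Restated as a stand-alone sketch (hypothetical helper, assuming the SUSP layout of a two-byte signature followed by a length byte):

    /* Illustrative walker over a system-use area; not the isofs code. */
    static int example_walk_susp(unsigned char *chr, int len)
    {
            while (len > 2) {
                    unsigned char *rec = chr;
                    int rlen = rec[2];      /* length byte after the signature */

                    if (rlen < 3)
                            return 0;       /* truncated record: stop, keep file */
                    chr += rlen;
                    len -= rlen;
                    if (len < 0)
                            return 0;       /* record overran the area: same policy */
                    /* ... dispatch on rec[0], rec[1] (the signature) ... */
            }
            return 0;
    }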
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 5a8ca61498ca..ae08c057e751 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c | |||
| @@ -36,7 +36,7 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) | |||
| 36 | 36 | ||
| 37 | /* | 37 | /* |
| 38 | * When an ext3-ordered file is truncated, it is possible that many pages are | 38 | * When an ext3-ordered file is truncated, it is possible that many pages are |
| 39 | * not sucessfully freed, because they are attached to a committing transaction. | 39 | * not successfully freed, because they are attached to a committing transaction. |
| 40 | * After the transaction commits, these pages are left on the LRU, with no | 40 | * After the transaction commits, these pages are left on the LRU, with no |
| 41 | * ->mapping, and with attached buffers. These pages are trivially reclaimable | 41 | * ->mapping, and with attached buffers. These pages are trivially reclaimable |
| 42 | * by the VM, but their apparent absence upsets the VM accounting, and it makes | 42 | * by the VM, but their apparent absence upsets the VM accounting, and it makes |
| @@ -45,8 +45,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) | |||
| 45 | * So here, we have a buffer which has just come off the forget list. Look to | 45 | * So here, we have a buffer which has just come off the forget list. Look to |
| 46 | * see if we can strip all buffers from the backing page. | 46 | * see if we can strip all buffers from the backing page. |
| 47 | * | 47 | * |
| 48 | * Called under lock_journal(), and possibly under journal_datalist_lock. The | 48 | * Called under journal->j_list_lock. The caller provided us with a ref |
| 49 | * caller provided us with a ref against the buffer, and we drop that here. | 49 | * against the buffer, and we drop that here. |
| 50 | */ | 50 | */ |
| 51 | static void release_buffer_page(struct buffer_head *bh) | 51 | static void release_buffer_page(struct buffer_head *bh) |
| 52 | { | 52 | { |
| @@ -63,7 +63,7 @@ static void release_buffer_page(struct buffer_head *bh) | |||
| 63 | goto nope; | 63 | goto nope; |
| 64 | 64 | ||
| 65 | /* OK, it's a truncated page */ | 65 | /* OK, it's a truncated page */ |
| 66 | if (TestSetPageLocked(page)) | 66 | if (!trylock_page(page)) |
| 67 | goto nope; | 67 | goto nope; |
| 68 | 68 | ||
| 69 | page_cache_get(page); | 69 | page_cache_get(page); |
| @@ -78,6 +78,19 @@ nope: | |||
| 78 | } | 78 | } |
| 79 | 79 | ||
| 80 | /* | 80 | /* |
| 81 | * Decrement reference counter for data buffer. If it has been marked | ||
| 82 | * 'BH_Freed', release it and the page to which it belongs if possible. | ||
| 83 | */ | ||
| 84 | static void release_data_buffer(struct buffer_head *bh) | ||
| 85 | { | ||
| 86 | if (buffer_freed(bh)) { | ||
| 87 | clear_buffer_freed(bh); | ||
| 88 | release_buffer_page(bh); | ||
| 89 | } else | ||
| 90 | put_bh(bh); | ||
| 91 | } | ||
| 92 | |||
| 93 | /* | ||
| 81 | * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is | 94 | * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is |
| 82 | * held. For ranking reasons we must trylock. If we lose, schedule away and | 95 | * held. For ranking reasons we must trylock. If we lose, schedule away and |
| 83 | * return 0. j_list_lock is dropped in this case. | 96 | * return 0. j_list_lock is dropped in this case. |
| @@ -172,7 +185,7 @@ static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) | |||
| 172 | /* | 185 | /* |
| 173 | * Submit all the data buffers to disk | 186 | * Submit all the data buffers to disk |
| 174 | */ | 187 | */ |
| 175 | static void journal_submit_data_buffers(journal_t *journal, | 188 | static int journal_submit_data_buffers(journal_t *journal, |
| 176 | transaction_t *commit_transaction) | 189 | transaction_t *commit_transaction) |
| 177 | { | 190 | { |
| 178 | struct journal_head *jh; | 191 | struct journal_head *jh; |
| @@ -180,6 +193,7 @@ static void journal_submit_data_buffers(journal_t *journal, | |||
| 180 | int locked; | 193 | int locked; |
| 181 | int bufs = 0; | 194 | int bufs = 0; |
| 182 | struct buffer_head **wbuf = journal->j_wbuf; | 195 | struct buffer_head **wbuf = journal->j_wbuf; |
| 196 | int err = 0; | ||
| 183 | 197 | ||
| 184 | /* | 198 | /* |
| 185 | * Whenever we unlock the journal and sleep, things can get added | 199 | * Whenever we unlock the journal and sleep, things can get added |
| @@ -207,7 +221,7 @@ write_out_data: | |||
| 207 | * blocking lock_buffer(). | 221 | * blocking lock_buffer(). |
| 208 | */ | 222 | */ |
| 209 | if (buffer_dirty(bh)) { | 223 | if (buffer_dirty(bh)) { |
| 210 | if (test_set_buffer_locked(bh)) { | 224 | if (!trylock_buffer(bh)) { |
| 211 | BUFFER_TRACE(bh, "needs blocking lock"); | 225 | BUFFER_TRACE(bh, "needs blocking lock"); |
| 212 | spin_unlock(&journal->j_list_lock); | 226 | spin_unlock(&journal->j_list_lock); |
| 213 | /* Write out all data to prevent deadlocks */ | 227 | /* Write out all data to prevent deadlocks */ |
| @@ -231,7 +245,7 @@ write_out_data: | |||
| 231 | if (locked) | 245 | if (locked) |
| 232 | unlock_buffer(bh); | 246 | unlock_buffer(bh); |
| 233 | BUFFER_TRACE(bh, "already cleaned up"); | 247 | BUFFER_TRACE(bh, "already cleaned up"); |
| 234 | put_bh(bh); | 248 | release_data_buffer(bh); |
| 235 | continue; | 249 | continue; |
| 236 | } | 250 | } |
| 237 | if (locked && test_clear_buffer_dirty(bh)) { | 251 | if (locked && test_clear_buffer_dirty(bh)) { |
| @@ -253,15 +267,17 @@ write_out_data: | |||
| 253 | put_bh(bh); | 267 | put_bh(bh); |
| 254 | } else { | 268 | } else { |
| 255 | BUFFER_TRACE(bh, "writeout complete: unfile"); | 269 | BUFFER_TRACE(bh, "writeout complete: unfile"); |
| 270 | if (unlikely(!buffer_uptodate(bh))) | ||
| 271 | err = -EIO; | ||
| 256 | __journal_unfile_buffer(jh); | 272 | __journal_unfile_buffer(jh); |
| 257 | jbd_unlock_bh_state(bh); | 273 | jbd_unlock_bh_state(bh); |
| 258 | if (locked) | 274 | if (locked) |
| 259 | unlock_buffer(bh); | 275 | unlock_buffer(bh); |
| 260 | journal_remove_journal_head(bh); | 276 | journal_remove_journal_head(bh); |
| 261 | /* Once for our safety reference, once for | 277 | /* One for our safety reference, other for |
| 262 | * journal_remove_journal_head() */ | 278 | * journal_remove_journal_head() */ |
| 263 | put_bh(bh); | 279 | put_bh(bh); |
| 264 | put_bh(bh); | 280 | release_data_buffer(bh); |
| 265 | } | 281 | } |
| 266 | 282 | ||
| 267 | if (need_resched() || spin_needbreak(&journal->j_list_lock)) { | 283 | if (need_resched() || spin_needbreak(&journal->j_list_lock)) { |
| @@ -271,6 +287,8 @@ write_out_data: | |||
| 271 | } | 287 | } |
| 272 | spin_unlock(&journal->j_list_lock); | 288 | spin_unlock(&journal->j_list_lock); |
| 273 | journal_do_submit_data(wbuf, bufs); | 289 | journal_do_submit_data(wbuf, bufs); |
| 290 | |||
| 291 | return err; | ||
| 274 | } | 292 | } |
| 275 | 293 | ||
| 276 | /* | 294 | /* |
| @@ -410,8 +428,7 @@ void journal_commit_transaction(journal_t *journal) | |||
| 410 | * Now start flushing things to disk, in the order they appear | 428 | * Now start flushing things to disk, in the order they appear |
| 411 | * on the transaction lists. Data blocks go first. | 429 | * on the transaction lists. Data blocks go first. |
| 412 | */ | 430 | */ |
| 413 | err = 0; | 431 | err = journal_submit_data_buffers(journal, commit_transaction); |
| 414 | journal_submit_data_buffers(journal, commit_transaction); | ||
| 415 | 432 | ||
| 416 | /* | 433 | /* |
| 417 | * Wait for all previously submitted IO to complete. | 434 | * Wait for all previously submitted IO to complete. |
| @@ -426,10 +443,21 @@ void journal_commit_transaction(journal_t *journal) | |||
| 426 | if (buffer_locked(bh)) { | 443 | if (buffer_locked(bh)) { |
| 427 | spin_unlock(&journal->j_list_lock); | 444 | spin_unlock(&journal->j_list_lock); |
| 428 | wait_on_buffer(bh); | 445 | wait_on_buffer(bh); |
| 429 | if (unlikely(!buffer_uptodate(bh))) | ||
| 430 | err = -EIO; | ||
| 431 | spin_lock(&journal->j_list_lock); | 446 | spin_lock(&journal->j_list_lock); |
| 432 | } | 447 | } |
| 448 | if (unlikely(!buffer_uptodate(bh))) { | ||
| 449 | if (!trylock_page(bh->b_page)) { | ||
| 450 | spin_unlock(&journal->j_list_lock); | ||
| 451 | lock_page(bh->b_page); | ||
| 452 | spin_lock(&journal->j_list_lock); | ||
| 453 | } | ||
| 454 | if (bh->b_page->mapping) | ||
| 455 | set_bit(AS_EIO, &bh->b_page->mapping->flags); | ||
| 456 | |||
| 457 | unlock_page(bh->b_page); | ||
| 458 | SetPageError(bh->b_page); | ||
| 459 | err = -EIO; | ||
| 460 | } | ||
| 433 | if (!inverted_lock(journal, bh)) { | 461 | if (!inverted_lock(journal, bh)) { |
| 434 | put_bh(bh); | 462 | put_bh(bh); |
| 435 | spin_lock(&journal->j_list_lock); | 463 | spin_lock(&journal->j_list_lock); |
| @@ -443,17 +471,21 @@ void journal_commit_transaction(journal_t *journal) | |||
| 443 | } else { | 471 | } else { |
| 444 | jbd_unlock_bh_state(bh); | 472 | jbd_unlock_bh_state(bh); |
| 445 | } | 473 | } |
| 446 | put_bh(bh); | 474 | release_data_buffer(bh); |
| 447 | cond_resched_lock(&journal->j_list_lock); | 475 | cond_resched_lock(&journal->j_list_lock); |
| 448 | } | 476 | } |
| 449 | spin_unlock(&journal->j_list_lock); | 477 | spin_unlock(&journal->j_list_lock); |
| 450 | 478 | ||
| 451 | if (err) | 479 | if (err) { |
| 452 | journal_abort(journal, err); | 480 | char b[BDEVNAME_SIZE]; |
| 453 | 481 | ||
| 454 | journal_write_revoke_records(journal, commit_transaction); | 482 | printk(KERN_WARNING |
| 483 | "JBD: Detected IO errors while flushing file data " | ||
| 484 | "on %s\n", bdevname(journal->j_fs_dev, b)); | ||
| 485 | err = 0; | ||
| 486 | } | ||
| 455 | 487 | ||
| 456 | jbd_debug(3, "JBD: commit phase 2\n"); | 488 | journal_write_revoke_records(journal, commit_transaction); |
| 457 | 489 | ||
| 458 | /* | 490 | /* |
| 459 | * If we found any dirty or locked buffers, then we should have | 491 | * If we found any dirty or locked buffers, then we should have |
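Several things change in the jbd commit path above: TestSetPageLocked()/test_set_buffer_locked() give way to trylock_page()/trylock_buffer() (success is now a non-zero return), write errors on ordered data buffers are latched into the mapping with AS_EIO and reported with a printk instead of aborting the journal, and release_data_buffer() centralizes dropping the data-buffer reference. A sketch of the inverted trylock sense:

    #include <linux/pagemap.h>
    #include <linux/errno.h>

    /* Illustrative only: trylock_page() returns non-zero when the lock was
     * taken, the opposite sense of the old TestSetPageLocked(). */
    static int example_try_page(struct page *page)
    {
            if (!trylock_page(page))        /* was: if (TestSetPageLocked(page)) */
                    return -EAGAIN;         /* somebody else holds the page lock */
            /* ... page is locked here ... */
            unlock_page(page);
            return 0;
    }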
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index b99c3b3654c4..aa7143a8349b 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c | |||
| @@ -68,7 +68,6 @@ EXPORT_SYMBOL(journal_set_features); | |||
| 68 | EXPORT_SYMBOL(journal_create); | 68 | EXPORT_SYMBOL(journal_create); |
| 69 | EXPORT_SYMBOL(journal_load); | 69 | EXPORT_SYMBOL(journal_load); |
| 70 | EXPORT_SYMBOL(journal_destroy); | 70 | EXPORT_SYMBOL(journal_destroy); |
| 71 | EXPORT_SYMBOL(journal_update_superblock); | ||
| 72 | EXPORT_SYMBOL(journal_abort); | 71 | EXPORT_SYMBOL(journal_abort); |
| 73 | EXPORT_SYMBOL(journal_errno); | 72 | EXPORT_SYMBOL(journal_errno); |
| 74 | EXPORT_SYMBOL(journal_ack_err); | 73 | EXPORT_SYMBOL(journal_ack_err); |
| @@ -1636,9 +1635,10 @@ static int journal_init_journal_head_cache(void) | |||
| 1636 | 1635 | ||
| 1637 | static void journal_destroy_journal_head_cache(void) | 1636 | static void journal_destroy_journal_head_cache(void) |
| 1638 | { | 1637 | { |
| 1639 | J_ASSERT(journal_head_cache != NULL); | 1638 | if (journal_head_cache) { |
| 1640 | kmem_cache_destroy(journal_head_cache); | 1639 | kmem_cache_destroy(journal_head_cache); |
| 1641 | journal_head_cache = NULL; | 1640 | journal_head_cache = NULL; |
| 1641 | } | ||
| 1642 | } | 1642 | } |
| 1643 | 1643 | ||
| 1644 | /* | 1644 | /* |
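journal_destroy_journal_head_cache() now tolerates a NULL cache rather than asserting, so the teardown helper can run on partially failed init paths; the revoke-cache rework below leans on the same property for its goto-based unwinding. The idiom, sketched with an illustrative cache:

    #include <linux/slab.h>

    static struct kmem_cache *example_cache;

    /* Safe to call whether or not the matching init step ever ran. */
    static void example_destroy_cache(void)
    {
            if (example_cache) {
                    kmem_cache_destroy(example_cache);
                    example_cache = NULL;
            }
    }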
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c index 1bb43e987f4b..c7bd649bbbdc 100644 --- a/fs/jbd/revoke.c +++ b/fs/jbd/revoke.c | |||
| @@ -166,138 +166,123 @@ static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal, | |||
| 166 | return NULL; | 166 | return NULL; |
| 167 | } | 167 | } |
| 168 | 168 | ||
| 169 | void journal_destroy_revoke_caches(void) | ||
| 170 | { | ||
| 171 | if (revoke_record_cache) { | ||
| 172 | kmem_cache_destroy(revoke_record_cache); | ||
| 173 | revoke_record_cache = NULL; | ||
| 174 | } | ||
| 175 | if (revoke_table_cache) { | ||
| 176 | kmem_cache_destroy(revoke_table_cache); | ||
| 177 | revoke_table_cache = NULL; | ||
| 178 | } | ||
| 179 | } | ||
| 180 | |||
| 169 | int __init journal_init_revoke_caches(void) | 181 | int __init journal_init_revoke_caches(void) |
| 170 | { | 182 | { |
| 183 | J_ASSERT(!revoke_record_cache); | ||
| 184 | J_ASSERT(!revoke_table_cache); | ||
| 185 | |||
| 171 | revoke_record_cache = kmem_cache_create("revoke_record", | 186 | revoke_record_cache = kmem_cache_create("revoke_record", |
| 172 | sizeof(struct jbd_revoke_record_s), | 187 | sizeof(struct jbd_revoke_record_s), |
| 173 | 0, | 188 | 0, |
| 174 | SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, | 189 | SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, |
| 175 | NULL); | 190 | NULL); |
| 176 | if (!revoke_record_cache) | 191 | if (!revoke_record_cache) |
| 177 | return -ENOMEM; | 192 | goto record_cache_failure; |
| 178 | 193 | ||
| 179 | revoke_table_cache = kmem_cache_create("revoke_table", | 194 | revoke_table_cache = kmem_cache_create("revoke_table", |
| 180 | sizeof(struct jbd_revoke_table_s), | 195 | sizeof(struct jbd_revoke_table_s), |
| 181 | 0, SLAB_TEMPORARY, NULL); | 196 | 0, SLAB_TEMPORARY, NULL); |
| 182 | if (!revoke_table_cache) { | 197 | if (!revoke_table_cache) |
| 183 | kmem_cache_destroy(revoke_record_cache); | 198 | goto table_cache_failure; |
| 184 | revoke_record_cache = NULL; | 199 | |
| 185 | return -ENOMEM; | ||
| 186 | } | ||
| 187 | return 0; | 200 | return 0; |
| 188 | } | ||
| 189 | 201 | ||
| 190 | void journal_destroy_revoke_caches(void) | 202 | table_cache_failure: |
| 191 | { | 203 | journal_destroy_revoke_caches(); |
| 192 | kmem_cache_destroy(revoke_record_cache); | 204 | record_cache_failure: |
| 193 | revoke_record_cache = NULL; | 205 | return -ENOMEM; |
| 194 | kmem_cache_destroy(revoke_table_cache); | ||
| 195 | revoke_table_cache = NULL; | ||
| 196 | } | 206 | } |
| 197 | 207 | ||
| 198 | /* Initialise the revoke table for a given journal to a given size. */ | 208 | static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size) |
| 199 | |||
| 200 | int journal_init_revoke(journal_t *journal, int hash_size) | ||
| 201 | { | 209 | { |
| 202 | int shift, tmp; | 210 | int shift = 0; |
| 211 | int tmp = hash_size; | ||
| 212 | struct jbd_revoke_table_s *table; | ||
| 203 | 213 | ||
| 204 | J_ASSERT (journal->j_revoke_table[0] == NULL); | 214 | table = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); |
| 215 | if (!table) | ||
| 216 | goto out; | ||
| 205 | 217 | ||
| 206 | shift = 0; | ||
| 207 | tmp = hash_size; | ||
| 208 | while((tmp >>= 1UL) != 0UL) | 218 | while((tmp >>= 1UL) != 0UL) |
| 209 | shift++; | 219 | shift++; |
| 210 | 220 | ||
| 211 | journal->j_revoke_table[0] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); | 221 | table->hash_size = hash_size; |
| 212 | if (!journal->j_revoke_table[0]) | 222 | table->hash_shift = shift; |
| 213 | return -ENOMEM; | 223 | table->hash_table = |
| 214 | journal->j_revoke = journal->j_revoke_table[0]; | ||
| 215 | |||
| 216 | /* Check that the hash_size is a power of two */ | ||
| 217 | J_ASSERT(is_power_of_2(hash_size)); | ||
| 218 | |||
| 219 | journal->j_revoke->hash_size = hash_size; | ||
| 220 | |||
| 221 | journal->j_revoke->hash_shift = shift; | ||
| 222 | |||
| 223 | journal->j_revoke->hash_table = | ||
| 224 | kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); | 224 | kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); |
| 225 | if (!journal->j_revoke->hash_table) { | 225 | if (!table->hash_table) { |
| 226 | kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]); | 226 | kmem_cache_free(revoke_table_cache, table); |
| 227 | journal->j_revoke = NULL; | 227 | table = NULL; |
| 228 | return -ENOMEM; | 228 | goto out; |
| 229 | } | 229 | } |
| 230 | 230 | ||
| 231 | for (tmp = 0; tmp < hash_size; tmp++) | 231 | for (tmp = 0; tmp < hash_size; tmp++) |
| 232 | INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]); | 232 | INIT_LIST_HEAD(&table->hash_table[tmp]); |
| 233 | 233 | ||
| 234 | journal->j_revoke_table[1] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); | 234 | out: |
| 235 | if (!journal->j_revoke_table[1]) { | 235 | return table; |
| 236 | kfree(journal->j_revoke_table[0]->hash_table); | 236 | } |
| 237 | kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]); | 237 | |
| 238 | return -ENOMEM; | 238 | static void journal_destroy_revoke_table(struct jbd_revoke_table_s *table) |
| 239 | { | ||
| 240 | int i; | ||
| 241 | struct list_head *hash_list; | ||
| 242 | |||
| 243 | for (i = 0; i < table->hash_size; i++) { | ||
| 244 | hash_list = &table->hash_table[i]; | ||
| 245 | J_ASSERT(list_empty(hash_list)); | ||
| 239 | } | 246 | } |
| 240 | 247 | ||
| 241 | journal->j_revoke = journal->j_revoke_table[1]; | 248 | kfree(table->hash_table); |
| 249 | kmem_cache_free(revoke_table_cache, table); | ||
| 250 | } | ||
| 242 | 251 | ||
| 243 | /* Check that the hash_size is a power of two */ | 252 | /* Initialise the revoke table for a given journal to a given size. */ |
| 253 | int journal_init_revoke(journal_t *journal, int hash_size) | ||
| 254 | { | ||
| 255 | J_ASSERT(journal->j_revoke_table[0] == NULL); | ||
| 244 | J_ASSERT(is_power_of_2(hash_size)); | 256 | J_ASSERT(is_power_of_2(hash_size)); |
| 245 | 257 | ||
| 246 | journal->j_revoke->hash_size = hash_size; | 258 | journal->j_revoke_table[0] = journal_init_revoke_table(hash_size); |
| 259 | if (!journal->j_revoke_table[0]) | ||
| 260 | goto fail0; | ||
| 247 | 261 | ||
| 248 | journal->j_revoke->hash_shift = shift; | 262 | journal->j_revoke_table[1] = journal_init_revoke_table(hash_size); |
| 263 | if (!journal->j_revoke_table[1]) | ||
| 264 | goto fail1; | ||
| 249 | 265 | ||
| 250 | journal->j_revoke->hash_table = | 266 | journal->j_revoke = journal->j_revoke_table[1]; |
| 251 | kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); | ||
| 252 | if (!journal->j_revoke->hash_table) { | ||
| 253 | kfree(journal->j_revoke_table[0]->hash_table); | ||
| 254 | kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]); | ||
| 255 | kmem_cache_free(revoke_table_cache, journal->j_revoke_table[1]); | ||
| 256 | journal->j_revoke = NULL; | ||
| 257 | return -ENOMEM; | ||
| 258 | } | ||
| 259 | |||
| 260 | for (tmp = 0; tmp < hash_size; tmp++) | ||
| 261 | INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]); | ||
| 262 | 267 | ||
| 263 | spin_lock_init(&journal->j_revoke_lock); | 268 | spin_lock_init(&journal->j_revoke_lock); |
| 264 | 269 | ||
| 265 | return 0; | 270 | return 0; |
| 266 | } | ||
| 267 | 271 | ||
| 268 | /* Destoy a journal's revoke table. The table must already be empty! */ | 272 | fail1: |
| 273 | journal_destroy_revoke_table(journal->j_revoke_table[0]); | ||
| 274 | fail0: | ||
| 275 | return -ENOMEM; | ||
| 276 | } | ||
| 269 | 277 | ||
| 278 | /* Destroy a journal's revoke table. The table must already be empty! */ | ||
| 270 | void journal_destroy_revoke(journal_t *journal) | 279 | void journal_destroy_revoke(journal_t *journal) |
| 271 | { | 280 | { |
| 272 | struct jbd_revoke_table_s *table; | ||
| 273 | struct list_head *hash_list; | ||
| 274 | int i; | ||
| 275 | |||
| 276 | table = journal->j_revoke_table[0]; | ||
| 277 | if (!table) | ||
| 278 | return; | ||
| 279 | |||
| 280 | for (i=0; i<table->hash_size; i++) { | ||
| 281 | hash_list = &table->hash_table[i]; | ||
| 282 | J_ASSERT (list_empty(hash_list)); | ||
| 283 | } | ||
| 284 | |||
| 285 | kfree(table->hash_table); | ||
| 286 | kmem_cache_free(revoke_table_cache, table); | ||
| 287 | journal->j_revoke = NULL; | ||
| 288 | |||
| 289 | table = journal->j_revoke_table[1]; | ||
| 290 | if (!table) | ||
| 291 | return; | ||
| 292 | |||
| 293 | for (i=0; i<table->hash_size; i++) { | ||
| 294 | hash_list = &table->hash_table[i]; | ||
| 295 | J_ASSERT (list_empty(hash_list)); | ||
| 296 | } | ||
| 297 | |||
| 298 | kfree(table->hash_table); | ||
| 299 | kmem_cache_free(revoke_table_cache, table); | ||
| 300 | journal->j_revoke = NULL; | 281 | journal->j_revoke = NULL; |
| 282 | if (journal->j_revoke_table[0]) | ||
| 283 | journal_destroy_revoke_table(journal->j_revoke_table[0]); | ||
| 284 | if (journal->j_revoke_table[1]) | ||
| 285 | journal_destroy_revoke_table(journal->j_revoke_table[1]); | ||
| 301 | } | 286 | } |
| 302 | 287 | ||
| 303 | 288 | ||
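
The revoke.c rework above replaces two copies of the table setup with a journal_init_revoke_table()/journal_destroy_revoke_table() pair and unwinds partial allocations with gotos. A simplified user-space sketch of the same allocate-two-and-unwind shape (the struct and helpers are illustrative, not the kernel structures):

#include <errno.h>
#include <stdlib.h>

/*
 * One helper builds a single table; the caller allocates two and
 * unwinds the first if the second fails.
 */
struct revoke_table {
        int hash_size;
        void **hash_table;              /* placeholder bucket array */
};

static struct revoke_table *init_revoke_table(int hash_size)
{
        struct revoke_table *t = malloc(sizeof(*t));

        if (!t)
                return NULL;
        t->hash_size = hash_size;
        t->hash_table = calloc(hash_size, sizeof(*t->hash_table));
        if (!t->hash_table) {
                free(t);
                return NULL;
        }
        return t;
}

static void destroy_revoke_table(struct revoke_table *t)
{
        free(t->hash_table);
        free(t);
}

int init_revoke(struct revoke_table *tables[2], int hash_size)
{
        tables[0] = init_revoke_table(hash_size);
        if (!tables[0])
                goto fail0;
        tables[1] = init_revoke_table(hash_size);
        if (!tables[1])
                goto fail1;
        return 0;
fail1:
        destroy_revoke_table(tables[0]);
        tables[0] = NULL;
fail0:
        return -ENOMEM;
}
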
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 67ff2024c23c..0540ca27a446 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c | |||
| @@ -291,7 +291,7 @@ handle_t *journal_start(journal_t *journal, int nblocks) | |||
| 291 | goto out; | 291 | goto out; |
| 292 | } | 292 | } |
| 293 | 293 | ||
| 294 | lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_); | 294 | lock_map_acquire(&handle->h_lockdep_map); |
| 295 | 295 | ||
| 296 | out: | 296 | out: |
| 297 | return handle; | 297 | return handle; |
| @@ -1448,7 +1448,7 @@ int journal_stop(handle_t *handle) | |||
| 1448 | spin_unlock(&journal->j_state_lock); | 1448 | spin_unlock(&journal->j_state_lock); |
| 1449 | } | 1449 | } |
| 1450 | 1450 | ||
| 1451 | lock_release(&handle->h_lockdep_map, 1, _THIS_IP_); | 1451 | lock_map_release(&handle->h_lockdep_map); |
| 1452 | 1452 | ||
| 1453 | jbd_free_handle(handle); | 1453 | jbd_free_handle(handle); |
| 1454 | return err; | 1454 | return err; |
| @@ -1648,12 +1648,42 @@ out: | |||
| 1648 | return; | 1648 | return; |
| 1649 | } | 1649 | } |
| 1650 | 1650 | ||
| 1651 | /* | ||
| 1652 | * journal_try_to_free_buffers() could race with journal_commit_transaction() | ||
| 1653 | * The latter might still hold a count on buffers when inspecting | ||
| 1654 | * them on t_syncdata_list or t_locked_list. | ||
| 1655 | * | ||
| 1656 | * journal_try_to_free_buffers() will call this function to | ||
| 1657 | * wait for the current transaction to finish syncing data buffers, before | ||
| 1658 | * trying to free that buffer. | ||
| 1659 | * | ||
| 1660 | * Called with journal->j_state_lock held. | ||
| 1661 | */ | ||
| 1662 | static void journal_wait_for_transaction_sync_data(journal_t *journal) | ||
| 1663 | { | ||
| 1664 | transaction_t *transaction = NULL; | ||
| 1665 | tid_t tid; | ||
| 1666 | |||
| 1667 | spin_lock(&journal->j_state_lock); | ||
| 1668 | transaction = journal->j_committing_transaction; | ||
| 1669 | |||
| 1670 | if (!transaction) { | ||
| 1671 | spin_unlock(&journal->j_state_lock); | ||
| 1672 | return; | ||
| 1673 | } | ||
| 1674 | |||
| 1675 | tid = transaction->t_tid; | ||
| 1676 | spin_unlock(&journal->j_state_lock); | ||
| 1677 | log_wait_commit(journal, tid); | ||
| 1678 | } | ||
| 1651 | 1679 | ||
| 1652 | /** | 1680 | /** |
| 1653 | * int journal_try_to_free_buffers() - try to free page buffers. | 1681 | * int journal_try_to_free_buffers() - try to free page buffers. |
| 1654 | * @journal: journal for operation | 1682 | * @journal: journal for operation |
| 1655 | * @page: to try and free | 1683 | * @page: to try and free |
| 1656 | * @unused_gfp_mask: unused | 1684 | * @gfp_mask: we use the mask to detect how hard we should try to release |
| 1685 | * buffers. If __GFP_WAIT and __GFP_FS are set, we wait for commit code to | ||
| 1686 | * release the buffers. | ||
| 1657 | * | 1687 | * |
| 1658 | * | 1688 | * |
| 1659 | * For all the buffers on this page, | 1689 | * For all the buffers on this page, |
| @@ -1682,9 +1712,11 @@ out: | |||
| 1682 | * journal_try_to_free_buffer() is changing its state. But that | 1712 | * journal_try_to_free_buffer() is changing its state. But that |
| 1683 | * cannot happen because we never reallocate freed data as metadata | 1713 | * cannot happen because we never reallocate freed data as metadata |
| 1684 | * while the data is part of a transaction. Yes? | 1714 | * while the data is part of a transaction. Yes? |
| 1715 | * | ||
| 1716 | * Return 0 on failure, 1 on success | ||
| 1685 | */ | 1717 | */ |
| 1686 | int journal_try_to_free_buffers(journal_t *journal, | 1718 | int journal_try_to_free_buffers(journal_t *journal, |
| 1687 | struct page *page, gfp_t unused_gfp_mask) | 1719 | struct page *page, gfp_t gfp_mask) |
| 1688 | { | 1720 | { |
| 1689 | struct buffer_head *head; | 1721 | struct buffer_head *head; |
| 1690 | struct buffer_head *bh; | 1722 | struct buffer_head *bh; |
| @@ -1713,7 +1745,28 @@ int journal_try_to_free_buffers(journal_t *journal, | |||
| 1713 | if (buffer_jbd(bh)) | 1745 | if (buffer_jbd(bh)) |
| 1714 | goto busy; | 1746 | goto busy; |
| 1715 | } while ((bh = bh->b_this_page) != head); | 1747 | } while ((bh = bh->b_this_page) != head); |
| 1748 | |||
| 1716 | ret = try_to_free_buffers(page); | 1749 | ret = try_to_free_buffers(page); |
| 1750 | |||
| 1751 | /* | ||
| 1752 | * There are a number of places where journal_try_to_free_buffers() | ||
| 1753 | * could race with journal_commit_transaction(), the latter still | ||
| 1754 | * holds a reference to the buffers being freed while processing them. | ||
| 1755 | * try_to_free_buffers() then fails to free those buffers. Some | ||
| 1756 | * callers of releasepage() expect page buffers to be dropped, and otherwise | ||
| 1757 | * treat the failure to free as an error (such as generic_file_direct_IO()) | ||
| 1758 | * | ||
| 1759 | * So, if the caller of try_to_release_page() wants the synchronous | ||
| 1760 | * behaviour (i.e. make sure buffers are dropped upon return), | ||
| 1761 | * let's wait for the current transaction to finish flushing the | ||
| 1762 | * dirty data buffers, then try to free those buffers again, | ||
| 1763 | * with the journal locked. | ||
| 1764 | */ | ||
| 1765 | if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) { | ||
| 1766 | journal_wait_for_transaction_sync_data(journal); | ||
| 1767 | ret = try_to_free_buffers(page); | ||
| 1768 | } | ||
| 1769 | |||
| 1717 | busy: | 1770 | busy: |
| 1718 | return ret; | 1771 | return ret; |
| 1719 | } | 1772 | } |
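
journal_try_to_free_buffers() now retries after waiting for the committing transaction, but only when the caller's gfp_mask says it may block (__GFP_WAIT) and may re-enter the filesystem (__GFP_FS). A compact sketch of that gate-then-retry policy; the flag values and helpers below are made-up stand-ins, not the kernel's:

#include <stdbool.h>

/*
 * Cheap first attempt, then an expensive wait-and-retry only when the
 * caller's allocation flags say it may sleep and may re-enter the fs.
 */
#define SKETCH_GFP_WAIT 0x1u
#define SKETCH_GFP_FS   0x2u

static bool try_to_free(void)            { return false; } /* first pass fails */
static void wait_for_commit(void)        { }               /* block until commit done */
static bool try_to_free_after_wait(void) { return true; }

bool try_to_free_buffers_sketch(unsigned int gfp_mask)
{
        bool ret = try_to_free();

        if (!ret && (gfp_mask & SKETCH_GFP_WAIT) && (gfp_mask & SKETCH_GFP_FS)) {
                /* Caller wants synchronous behaviour and may sleep:
                 * wait for the committing transaction to drop its
                 * references, then try once more. */
                wait_for_commit();
                ret = try_to_free_after_wait();
        }
        return ret;
}
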
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index f8b3be873226..f2ad061e95ec 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c | |||
| @@ -67,7 +67,7 @@ static void release_buffer_page(struct buffer_head *bh) | |||
| 67 | goto nope; | 67 | goto nope; |
| 68 | 68 | ||
| 69 | /* OK, it's a truncated page */ | 69 | /* OK, it's a truncated page */ |
| 70 | if (TestSetPageLocked(page)) | 70 | if (!trylock_page(page)) |
| 71 | goto nope; | 71 | goto nope; |
| 72 | 72 | ||
| 73 | page_cache_get(page); | 73 | page_cache_get(page); |
| @@ -262,8 +262,18 @@ static int journal_finish_inode_data_buffers(journal_t *journal, | |||
| 262 | jinode->i_flags |= JI_COMMIT_RUNNING; | 262 | jinode->i_flags |= JI_COMMIT_RUNNING; |
| 263 | spin_unlock(&journal->j_list_lock); | 263 | spin_unlock(&journal->j_list_lock); |
| 264 | err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); | 264 | err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); |
| 265 | if (!ret) | 265 | if (err) { |
| 266 | ret = err; | 266 | /* |
| 267 | * Because AS_EIO is cleared by | ||
| 268 | * wait_on_page_writeback_range(), set it again so | ||
| 269 | * that user process can get -EIO from fsync(). | ||
| 270 | */ | ||
| 271 | set_bit(AS_EIO, | ||
| 272 | &jinode->i_vfs_inode->i_mapping->flags); | ||
| 273 | |||
| 274 | if (!ret) | ||
| 275 | ret = err; | ||
| 276 | } | ||
| 267 | spin_lock(&journal->j_list_lock); | 277 | spin_lock(&journal->j_list_lock); |
| 268 | jinode->i_flags &= ~JI_COMMIT_RUNNING; | 278 | jinode->i_flags &= ~JI_COMMIT_RUNNING; |
| 269 | wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); | 279 | wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); |
| @@ -670,8 +680,14 @@ start_journal_io: | |||
| 670 | * commit block, which happens below in such setting. | 680 | * commit block, which happens below in such setting. |
| 671 | */ | 681 | */ |
| 672 | err = journal_finish_inode_data_buffers(journal, commit_transaction); | 682 | err = journal_finish_inode_data_buffers(journal, commit_transaction); |
| 673 | if (err) | 683 | if (err) { |
| 674 | jbd2_journal_abort(journal, err); | 684 | char b[BDEVNAME_SIZE]; |
| 685 | |||
| 686 | printk(KERN_WARNING | ||
| 687 | "JBD2: Detected IO errors while flushing file data " | ||
| 688 | "on %s\n", bdevname(journal->j_fs_dev, b)); | ||
| 689 | err = 0; | ||
| 690 | } | ||
| 675 | 691 | ||
| 676 | /* Lo and behold: we have just managed to send a transaction to | 692 | /* Lo and behold: we have just managed to send a transaction to |
| 677 | the log. Before we can commit it, wait for the IO so far to | 693 | the log. Before we can commit it, wait for the IO so far to |
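
Two policy changes sit in this commit.c hunk: a data writeback failure now only warns instead of aborting the journal, and AS_EIO is set again after filemap_fdatawait() so a later fsync() still reports -EIO. A toy user-space model of the second point, the "sticky error bit" that waiting would otherwise consume (every name here is illustrative):

#include <errno.h>
#include <stdio.h>

struct mapping { unsigned long flags; };
#define SKETCH_AS_EIO 0x1ul

static int wait_for_writeback(struct mapping *m)
{
        int err = (m->flags & SKETCH_AS_EIO) ? -EIO : 0;

        m->flags &= ~SKETCH_AS_EIO;        /* the wait clears the flag */
        return err;
}

static void commit_path(struct mapping *m)
{
        if (wait_for_writeback(m))
                m->flags |= SKETCH_AS_EIO; /* re-arm so fsync() sees -EIO */
}

int main(void)
{
        struct mapping m = { .flags = SKETCH_AS_EIO };

        commit_path(&m);
        printf("fsync would %s -EIO\n",
               (m.flags & SKETCH_AS_EIO) ? "return" : "miss");
        return 0;
}
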
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index b26c6d9fe6ae..8207a01c4edb 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c | |||
| @@ -68,7 +68,6 @@ EXPORT_SYMBOL(jbd2_journal_set_features); | |||
| 68 | EXPORT_SYMBOL(jbd2_journal_create); | 68 | EXPORT_SYMBOL(jbd2_journal_create); |
| 69 | EXPORT_SYMBOL(jbd2_journal_load); | 69 | EXPORT_SYMBOL(jbd2_journal_load); |
| 70 | EXPORT_SYMBOL(jbd2_journal_destroy); | 70 | EXPORT_SYMBOL(jbd2_journal_destroy); |
| 71 | EXPORT_SYMBOL(jbd2_journal_update_superblock); | ||
| 72 | EXPORT_SYMBOL(jbd2_journal_abort); | 71 | EXPORT_SYMBOL(jbd2_journal_abort); |
| 73 | EXPORT_SYMBOL(jbd2_journal_errno); | 72 | EXPORT_SYMBOL(jbd2_journal_errno); |
| 74 | EXPORT_SYMBOL(jbd2_journal_ack_err); | 73 | EXPORT_SYMBOL(jbd2_journal_ack_err); |
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 4f7cadbb19fa..e5d540588fa9 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c | |||
| @@ -301,7 +301,7 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) | |||
| 301 | goto out; | 301 | goto out; |
| 302 | } | 302 | } |
| 303 | 303 | ||
| 304 | lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_); | 304 | lock_map_acquire(&handle->h_lockdep_map); |
| 305 | out: | 305 | out: |
| 306 | return handle; | 306 | return handle; |
| 307 | } | 307 | } |
| @@ -1279,7 +1279,7 @@ int jbd2_journal_stop(handle_t *handle) | |||
| 1279 | spin_unlock(&journal->j_state_lock); | 1279 | spin_unlock(&journal->j_state_lock); |
| 1280 | } | 1280 | } |
| 1281 | 1281 | ||
| 1282 | lock_release(&handle->h_lockdep_map, 1, _THIS_IP_); | 1282 | lock_map_release(&handle->h_lockdep_map); |
| 1283 | 1283 | ||
| 1284 | jbd2_free_handle(handle); | 1284 | jbd2_free_handle(handle); |
| 1285 | return err; | 1285 | return err; |
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 4c80404a9aba..d98713777a1b 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c | |||
| @@ -314,7 +314,7 @@ static int jffs2_check_acl(struct inode *inode, int mask) | |||
| 314 | return -EAGAIN; | 314 | return -EAGAIN; |
| 315 | } | 315 | } |
| 316 | 316 | ||
| 317 | int jffs2_permission(struct inode *inode, int mask, struct nameidata *nd) | 317 | int jffs2_permission(struct inode *inode, int mask) |
| 318 | { | 318 | { |
| 319 | return generic_permission(inode, mask, jffs2_check_acl); | 319 | return generic_permission(inode, mask, jffs2_check_acl); |
| 320 | } | 320 | } |
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index 0bb7f003fd80..8ca058aed384 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h | |||
| @@ -28,7 +28,7 @@ struct jffs2_acl_header { | |||
| 28 | 28 | ||
| 29 | #define JFFS2_ACL_NOT_CACHED ((void *)-1) | 29 | #define JFFS2_ACL_NOT_CACHED ((void *)-1) |
| 30 | 30 | ||
| 31 | extern int jffs2_permission(struct inode *, int, struct nameidata *); | 31 | extern int jffs2_permission(struct inode *, int); |
| 32 | extern int jffs2_acl_chmod(struct inode *); | 32 | extern int jffs2_acl_chmod(struct inode *); |
| 33 | extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); | 33 | extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); |
| 34 | extern int jffs2_init_acl_post(struct inode *); | 34 | extern int jffs2_init_acl_post(struct inode *); |
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index c0c141f6fde1..cd219ef55254 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c | |||
| @@ -38,7 +38,7 @@ const struct file_operations jffs2_dir_operations = | |||
| 38 | { | 38 | { |
| 39 | .read = generic_read_dir, | 39 | .read = generic_read_dir, |
| 40 | .readdir = jffs2_readdir, | 40 | .readdir = jffs2_readdir, |
| 41 | .ioctl = jffs2_ioctl, | 41 | .unlocked_ioctl=jffs2_ioctl, |
| 42 | .fsync = jffs2_fsync | 42 | .fsync = jffs2_fsync |
| 43 | }; | 43 | }; |
| 44 | 44 | ||
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 5e920343b2c5..5a98aa87c853 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c | |||
| @@ -46,7 +46,7 @@ const struct file_operations jffs2_file_operations = | |||
| 46 | .aio_read = generic_file_aio_read, | 46 | .aio_read = generic_file_aio_read, |
| 47 | .write = do_sync_write, | 47 | .write = do_sync_write, |
| 48 | .aio_write = generic_file_aio_write, | 48 | .aio_write = generic_file_aio_write, |
| 49 | .ioctl = jffs2_ioctl, | 49 | .unlocked_ioctl=jffs2_ioctl, |
| 50 | .mmap = generic_file_readonly_mmap, | 50 | .mmap = generic_file_readonly_mmap, |
| 51 | .fsync = jffs2_fsync, | 51 | .fsync = jffs2_fsync, |
| 52 | .splice_read = generic_file_splice_read, | 52 | .splice_read = generic_file_splice_read, |
diff --git a/fs/jffs2/ioctl.c b/fs/jffs2/ioctl.c index e2177210f621..9d41f43e47bb 100644 --- a/fs/jffs2/ioctl.c +++ b/fs/jffs2/ioctl.c | |||
| @@ -12,8 +12,7 @@ | |||
| 12 | #include <linux/fs.h> | 12 | #include <linux/fs.h> |
| 13 | #include "nodelist.h" | 13 | #include "nodelist.h" |
| 14 | 14 | ||
| 15 | int jffs2_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, | 15 | long jffs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) |
| 16 | unsigned long arg) | ||
| 17 | { | 16 | { |
| 18 | /* Later, this will provide for lsattr.jffs2 and chattr.jffs2, which | 17 | /* Later, this will provide for lsattr.jffs2 and chattr.jffs2, which |
| 19 | will include compression support etc. */ | 18 | will include compression support etc. */ |
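
jffs2_ioctl() is switched from ->ioctl to ->unlocked_ioctl here and in the dir/file operations above: the handler loses the explicit inode argument and the BKL, and returns long. A kernel-style fragment showing what such a handler typically looks like after the conversion; example_unlocked_ioctl and its body are hypothetical, and the fragment only builds inside a kernel tree:

#include <linux/fs.h>

long example_unlocked_ioctl(struct file *filp, unsigned int cmd,
                            unsigned long arg)
{
        /* The inode, if needed, now comes from the file itself. */
        struct inode *inode = filp->f_path.dentry->d_inode;

        (void)inode;
        (void)cmd;
        (void)arg;
        return -ENOTTY;         /* no commands implemented */
}
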
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h index 31559f45fdde..4c41db91eaa4 100644 --- a/fs/jffs2/jffs2_fs_i.h +++ b/fs/jffs2/jffs2_fs_i.h | |||
| @@ -12,7 +12,6 @@ | |||
| 12 | #ifndef _JFFS2_FS_I | 12 | #ifndef _JFFS2_FS_I |
| 13 | #define _JFFS2_FS_I | 13 | #define _JFFS2_FS_I |
| 14 | 14 | ||
| 15 | #include <linux/version.h> | ||
| 16 | #include <linux/rbtree.h> | 15 | #include <linux/rbtree.h> |
| 17 | #include <linux/posix_acl.h> | 16 | #include <linux/posix_acl.h> |
| 18 | #include <linux/mutex.h> | 17 | #include <linux/mutex.h> |
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 2cc866cf134f..5e194a5c8e29 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h | |||
| @@ -167,7 +167,7 @@ int jffs2_fsync(struct file *, struct dentry *, int); | |||
| 167 | int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg); | 167 | int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg); |
| 168 | 168 | ||
| 169 | /* ioctl.c */ | 169 | /* ioctl.c */ |
| 170 | int jffs2_ioctl(struct inode *, struct file *, unsigned int, unsigned long); | 170 | long jffs2_ioctl(struct file *, unsigned int, unsigned long); |
| 171 | 171 | ||
| 172 | /* symlink.c */ | 172 | /* symlink.c */ |
| 173 | extern const struct inode_operations jffs2_symlink_inode_operations; | 173 | extern const struct inode_operations jffs2_symlink_inode_operations; |
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c index 629af01e5ade..6caf1e1ee26d 100644 --- a/fs/jffs2/summary.c +++ b/fs/jffs2/summary.c | |||
| @@ -23,6 +23,8 @@ | |||
| 23 | 23 | ||
| 24 | int jffs2_sum_init(struct jffs2_sb_info *c) | 24 | int jffs2_sum_init(struct jffs2_sb_info *c) |
| 25 | { | 25 | { |
| 26 | uint32_t sum_size = max_t(uint32_t, c->sector_size, MAX_SUMMARY_SIZE); | ||
| 27 | |||
| 26 | c->summary = kzalloc(sizeof(struct jffs2_summary), GFP_KERNEL); | 28 | c->summary = kzalloc(sizeof(struct jffs2_summary), GFP_KERNEL); |
| 27 | 29 | ||
| 28 | if (!c->summary) { | 30 | if (!c->summary) { |
| @@ -30,7 +32,7 @@ int jffs2_sum_init(struct jffs2_sb_info *c) | |||
| 30 | return -ENOMEM; | 32 | return -ENOMEM; |
| 31 | } | 33 | } |
| 32 | 34 | ||
| 33 | c->summary->sum_buf = vmalloc(c->sector_size); | 35 | c->summary->sum_buf = kmalloc(sum_size, GFP_KERNEL); |
| 34 | 36 | ||
| 35 | if (!c->summary->sum_buf) { | 37 | if (!c->summary->sum_buf) { |
| 36 | JFFS2_WARNING("Can't allocate buffer for writing out summary information!\n"); | 38 | JFFS2_WARNING("Can't allocate buffer for writing out summary information!\n"); |
| @@ -49,7 +51,7 @@ void jffs2_sum_exit(struct jffs2_sb_info *c) | |||
| 49 | 51 | ||
| 50 | jffs2_sum_disable_collecting(c->summary); | 52 | jffs2_sum_disable_collecting(c->summary); |
| 51 | 53 | ||
| 52 | vfree(c->summary->sum_buf); | 54 | kfree(c->summary->sum_buf); |
| 53 | c->summary->sum_buf = NULL; | 55 | c->summary->sum_buf = NULL; |
| 54 | 56 | ||
| 55 | kfree(c->summary); | 57 | kfree(c->summary); |
| @@ -665,7 +667,7 @@ crc_err: | |||
| 665 | /* Write summary data to flash - helper function for jffs2_sum_write_sumnode() */ | 667 | /* Write summary data to flash - helper function for jffs2_sum_write_sumnode() */ |
| 666 | 668 | ||
| 667 | static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, | 669 | static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, |
| 668 | uint32_t infosize, uint32_t datasize, int padsize) | 670 | uint32_t infosize, uint32_t datasize, int padsize) |
| 669 | { | 671 | { |
| 670 | struct jffs2_raw_summary isum; | 672 | struct jffs2_raw_summary isum; |
| 671 | union jffs2_sum_mem *temp; | 673 | union jffs2_sum_mem *temp; |
| @@ -676,6 +678,26 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock | |||
| 676 | int ret; | 678 | int ret; |
| 677 | size_t retlen; | 679 | size_t retlen; |
| 678 | 680 | ||
| 681 | if (padsize + datasize > MAX_SUMMARY_SIZE) { | ||
| 682 | /* It won't fit in the buffer. Abort summary for this jeb */ | ||
| 683 | jffs2_sum_disable_collecting(c->summary); | ||
| 684 | |||
| 685 | JFFS2_WARNING("Summary too big (%d data, %d pad) in eraseblock at %08x\n", | ||
| 686 | datasize, padsize, jeb->offset); | ||
| 687 | /* Non-fatal */ | ||
| 688 | return 0; | ||
| 689 | } | ||
| 690 | /* Is there enough space for summary? */ | ||
| 691 | if (padsize < 0) { | ||
| 692 | /* don't try to write out summary for this jeb */ | ||
| 693 | jffs2_sum_disable_collecting(c->summary); | ||
| 694 | |||
| 695 | JFFS2_WARNING("Not enough space for summary, padsize = %d\n", | ||
| 696 | padsize); | ||
| 697 | /* Non-fatal */ | ||
| 698 | return 0; | ||
| 699 | } | ||
| 700 | |||
| 679 | memset(c->summary->sum_buf, 0xff, datasize); | 701 | memset(c->summary->sum_buf, 0xff, datasize); |
| 680 | memset(&isum, 0, sizeof(isum)); | 702 | memset(&isum, 0, sizeof(isum)); |
| 681 | 703 | ||
| @@ -821,7 +843,7 @@ int jffs2_sum_write_sumnode(struct jffs2_sb_info *c) | |||
| 821 | { | 843 | { |
| 822 | int datasize, infosize, padsize; | 844 | int datasize, infosize, padsize; |
| 823 | struct jffs2_eraseblock *jeb; | 845 | struct jffs2_eraseblock *jeb; |
| 824 | int ret; | 846 | int ret = 0; |
| 825 | 847 | ||
| 826 | dbg_summary("called\n"); | 848 | dbg_summary("called\n"); |
| 827 | 849 | ||
| @@ -841,16 +863,6 @@ int jffs2_sum_write_sumnode(struct jffs2_sb_info *c) | |||
| 841 | infosize += padsize; | 863 | infosize += padsize; |
| 842 | datasize += padsize; | 864 | datasize += padsize; |
| 843 | 865 | ||
| 844 | /* Is there enough space for summary? */ | ||
| 845 | if (padsize < 0) { | ||
| 846 | /* don't try to write out summary for this jeb */ | ||
| 847 | jffs2_sum_disable_collecting(c->summary); | ||
| 848 | |||
| 849 | JFFS2_WARNING("Not enough space for summary, padsize = %d\n", padsize); | ||
| 850 | spin_lock(&c->erase_completion_lock); | ||
| 851 | return 0; | ||
| 852 | } | ||
| 853 | |||
| 854 | ret = jffs2_sum_write_data(c, jeb, infosize, datasize, padsize); | 866 | ret = jffs2_sum_write_data(c, jeb, infosize, datasize, padsize); |
| 855 | spin_lock(&c->erase_completion_lock); | 867 | spin_lock(&c->erase_completion_lock); |
| 856 | return ret; | 868 | return ret; |
diff --git a/fs/jffs2/summary.h b/fs/jffs2/summary.h index 8bf34f2fa5ce..60207a2ae952 100644 --- a/fs/jffs2/summary.h +++ b/fs/jffs2/summary.h | |||
| @@ -13,6 +13,12 @@ | |||
| 13 | #ifndef JFFS2_SUMMARY_H | 13 | #ifndef JFFS2_SUMMARY_H |
| 14 | #define JFFS2_SUMMARY_H | 14 | #define JFFS2_SUMMARY_H |
| 15 | 15 | ||
| 16 | /* Limit summary size to 64KiB so that we can kmalloc it. If the summary | ||
| 17 | is larger than that, we have to just ditch it and avoid using summary | ||
| 18 | for the eraseblock in question... and it probably doesn't hurt us much | ||
| 19 | anyway. */ | ||
| 20 | #define MAX_SUMMARY_SIZE 65536 | ||
| 21 | |||
| 16 | #include <linux/uio.h> | 22 | #include <linux/uio.h> |
| 17 | #include <linux/jffs2.h> | 23 | #include <linux/jffs2.h> |
| 18 | 24 | ||
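
With the summary buffer now kmalloc'd and capped at MAX_SUMMARY_SIZE, jffs2_sum_write_data() gains a guard that simply disables summary collection for an eraseblock whose summary would not fit, rather than failing hard. A user-space sketch of that guard (sizes and the disable hook are illustrative):

#include <stdio.h>

#define MAX_SUMMARY_SIZE 65536

static void disable_summary_collection(void)
{
        /* mark the summary as unusable for this eraseblock */
}

int write_summary(int datasize, int padsize)
{
        if (padsize < 0 || padsize + datasize > MAX_SUMMARY_SIZE) {
                disable_summary_collection();
                return 0;       /* non-fatal: just no summary this time */
        }
        printf("writing %d bytes of summary (+%d pad)\n", datasize, padsize);
        return 0;
}
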
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 7da69eae49e4..efd401257ed9 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c | |||
| @@ -44,7 +44,7 @@ static void jffs2_destroy_inode(struct inode *inode) | |||
| 44 | kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode)); | 44 | kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode)); |
| 45 | } | 45 | } |
| 46 | 46 | ||
| 47 | static void jffs2_i_init_once(struct kmem_cache *cachep, void *foo) | 47 | static void jffs2_i_init_once(void *foo) |
| 48 | { | 48 | { |
| 49 | struct jffs2_inode_info *f = foo; | 49 | struct jffs2_inode_info *f = foo; |
| 50 | 50 | ||
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 4d84bdc88299..d3e5c33665de 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c | |||
| @@ -140,7 +140,7 @@ static int jfs_check_acl(struct inode *inode, int mask) | |||
| 140 | return -EAGAIN; | 140 | return -EAGAIN; |
| 141 | } | 141 | } |
| 142 | 142 | ||
| 143 | int jfs_permission(struct inode *inode, int mask, struct nameidata *nd) | 143 | int jfs_permission(struct inode *inode, int mask) |
| 144 | { | 144 | { |
| 145 | return generic_permission(inode, mask, jfs_check_acl); | 145 | return generic_permission(inode, mask, jfs_check_acl); |
| 146 | } | 146 | } |
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index 455fa4292045..88475f10a389 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h | |||
| @@ -20,7 +20,7 @@ | |||
| 20 | 20 | ||
| 21 | #ifdef CONFIG_JFS_POSIX_ACL | 21 | #ifdef CONFIG_JFS_POSIX_ACL |
| 22 | 22 | ||
| 23 | int jfs_permission(struct inode *, int, struct nameidata *); | 23 | int jfs_permission(struct inode *, int); |
| 24 | int jfs_init_acl(tid_t, struct inode *, struct inode *); | 24 | int jfs_init_acl(tid_t, struct inode *, struct inode *); |
| 25 | int jfs_setattr(struct dentry *, struct iattr *); | 25 | int jfs_setattr(struct dentry *, struct iattr *); |
| 26 | 26 | ||
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 854ff0ec574f..c350057087dd 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c | |||
| @@ -182,7 +182,7 @@ static inline void remove_metapage(struct page *page, struct metapage *mp) | |||
| 182 | 182 | ||
| 183 | #endif | 183 | #endif |
| 184 | 184 | ||
| 185 | static void init_once(struct kmem_cache *cachep, void *foo) | 185 | static void init_once(void *foo) |
| 186 | { | 186 | { |
| 187 | struct metapage *mp = (struct metapage *)foo; | 187 | struct metapage *mp = (struct metapage *)foo; |
| 188 | 188 | ||
diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 0288e6d7936a..3630718be395 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c | |||
| @@ -22,6 +22,7 @@ | |||
| 22 | #include <linux/parser.h> | 22 | #include <linux/parser.h> |
| 23 | #include <linux/completion.h> | 23 | #include <linux/completion.h> |
| 24 | #include <linux/vfs.h> | 24 | #include <linux/vfs.h> |
| 25 | #include <linux/quotaops.h> | ||
| 25 | #include <linux/mount.h> | 26 | #include <linux/mount.h> |
| 26 | #include <linux/moduleparam.h> | 27 | #include <linux/moduleparam.h> |
| 27 | #include <linux/kthread.h> | 28 | #include <linux/kthread.h> |
| @@ -759,7 +760,7 @@ static struct file_system_type jfs_fs_type = { | |||
| 759 | .fs_flags = FS_REQUIRES_DEV, | 760 | .fs_flags = FS_REQUIRES_DEV, |
| 760 | }; | 761 | }; |
| 761 | 762 | ||
| 762 | static void init_once(struct kmem_cache *cachep, void *foo) | 763 | static void init_once(void *foo) |
| 763 | { | 764 | { |
| 764 | struct jfs_inode_info *jfs_ip = (struct jfs_inode_info *) foo; | 765 | struct jfs_inode_info *jfs_ip = (struct jfs_inode_info *) foo; |
| 765 | 766 | ||
diff --git a/fs/libfs.c b/fs/libfs.c index baeb71ee1cde..1add676a19df 100644 --- a/fs/libfs.c +++ b/fs/libfs.c | |||
| @@ -216,8 +216,8 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name, | |||
| 216 | 216 | ||
| 217 | s->s_flags = MS_NOUSER; | 217 | s->s_flags = MS_NOUSER; |
| 218 | s->s_maxbytes = ~0ULL; | 218 | s->s_maxbytes = ~0ULL; |
| 219 | s->s_blocksize = 1024; | 219 | s->s_blocksize = PAGE_SIZE; |
| 220 | s->s_blocksize_bits = 10; | 220 | s->s_blocksize_bits = PAGE_SHIFT; |
| 221 | s->s_magic = magic; | 221 | s->s_magic = magic; |
| 222 | s->s_op = ops ? ops : &simple_super_operations; | 222 | s->s_op = ops ? ops : &simple_super_operations; |
| 223 | s->s_time_gran = 1; | 223 | s->s_time_gran = 1; |
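
get_sb_pseudo() now advertises PAGE_SIZE blocks instead of a hard-coded 1 KiB. The two superblock fields have to stay consistent: s_blocksize_bits is log2(s_blocksize). A quick user-space check of that relationship, assuming a 4 KiB page:

#include <stdio.h>

int main(void)
{
        unsigned long blocksize = 4096;     /* PAGE_SIZE on most arches */
        unsigned int bits = 0;

        while ((1UL << bits) < blocksize)
                bits++;
        printf("s_blocksize=%lu s_blocksize_bits=%u\n", blocksize, bits);
        return 0;
}
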
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 1f6dc518505c..31668b690e03 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c | |||
| @@ -582,7 +582,15 @@ again: | |||
| 582 | } | 582 | } |
| 583 | if (status < 0) | 583 | if (status < 0) |
| 584 | goto out_unlock; | 584 | goto out_unlock; |
| 585 | status = nlm_stat_to_errno(resp->status); | 585 | /* |
| 586 | * EAGAIN doesn't make sense for sleeping locks, and in some | ||
| 587 | * cases NLM_LCK_DENIED is returned for a permanent error. So | ||
| 588 | * turn it into an ENOLCK. | ||
| 589 | */ | ||
| 590 | if (resp->status == nlm_lck_denied && (fl_flags & FL_SLEEP)) | ||
| 591 | status = -ENOLCK; | ||
| 592 | else | ||
| 593 | status = nlm_stat_to_errno(resp->status); | ||
| 586 | out_unblock: | 594 | out_unblock: |
| 587 | nlmclnt_finish_block(block); | 595 | nlmclnt_finish_block(block); |
| 588 | out: | 596 | out: |
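
The clntproc.c hunk stops mapping NLM_LCK_DENIED through the generic table when the request was a blocking one, since EAGAIN is meaningless for a sleeping lock; it becomes ENOLCK instead. A simplified sketch of that mapping, with illustrative constants and an illustrative fallback:

#include <errno.h>

enum nlm_stat_sketch { SKETCH_GRANTED = 0, SKETCH_LCK_DENIED = 1 };
#define SKETCH_FL_SLEEP 0x80u

static int stat_to_errno(enum nlm_stat_sketch s)
{
        return s == SKETCH_GRANTED ? 0 : -EAGAIN;
}

int map_lock_result(enum nlm_stat_sketch s, unsigned int fl_flags)
{
        if (s == SKETCH_LCK_DENIED && (fl_flags & SKETCH_FL_SLEEP))
                return -ENOLCK;         /* permanent failure for a sleeping lock */
        return stat_to_errno(s);
}
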
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 399444639337..4a714f64515b 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c | |||
| @@ -83,7 +83,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, | |||
| 83 | { | 83 | { |
| 84 | struct nlm_host *host; | 84 | struct nlm_host *host; |
| 85 | struct nlm_file *file; | 85 | struct nlm_file *file; |
| 86 | int rc = rpc_success; | 86 | __be32 rc = rpc_success; |
| 87 | 87 | ||
| 88 | dprintk("lockd: TEST4 called\n"); | 88 | dprintk("lockd: TEST4 called\n"); |
| 89 | resp->cookie = argp->cookie; | 89 | resp->cookie = argp->cookie; |
| @@ -116,7 +116,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, | |||
| 116 | { | 116 | { |
| 117 | struct nlm_host *host; | 117 | struct nlm_host *host; |
| 118 | struct nlm_file *file; | 118 | struct nlm_file *file; |
| 119 | int rc = rpc_success; | 119 | __be32 rc = rpc_success; |
| 120 | 120 | ||
| 121 | dprintk("lockd: LOCK called\n"); | 121 | dprintk("lockd: LOCK called\n"); |
| 122 | 122 | ||
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 821b9acdfb66..cf0d5c2c318d 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c | |||
| @@ -418,8 +418,8 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, | |||
| 418 | goto out; | 418 | goto out; |
| 419 | case -EAGAIN: | 419 | case -EAGAIN: |
| 420 | ret = nlm_lck_denied; | 420 | ret = nlm_lck_denied; |
| 421 | break; | 421 | goto out; |
| 422 | case -EINPROGRESS: | 422 | case FILE_LOCK_DEFERRED: |
| 423 | if (wait) | 423 | if (wait) |
| 424 | break; | 424 | break; |
| 425 | /* Filesystem lock operation is in progress | 425 | /* Filesystem lock operation is in progress |
| @@ -434,10 +434,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, | |||
| 434 | goto out; | 434 | goto out; |
| 435 | } | 435 | } |
| 436 | 436 | ||
| 437 | ret = nlm_lck_denied; | ||
| 438 | if (!wait) | ||
| 439 | goto out; | ||
| 440 | |||
| 441 | ret = nlm_lck_blocked; | 437 | ret = nlm_lck_blocked; |
| 442 | 438 | ||
| 443 | /* Append to list of blocked */ | 439 | /* Append to list of blocked */ |
| @@ -507,7 +503,7 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, | |||
| 507 | } | 503 | } |
| 508 | 504 | ||
| 509 | error = vfs_test_lock(file->f_file, &lock->fl); | 505 | error = vfs_test_lock(file->f_file, &lock->fl); |
| 510 | if (error == -EINPROGRESS) { | 506 | if (error == FILE_LOCK_DEFERRED) { |
| 511 | ret = nlmsvc_defer_lock_rqst(rqstp, block); | 507 | ret = nlmsvc_defer_lock_rqst(rqstp, block); |
| 512 | goto out; | 508 | goto out; |
| 513 | } | 509 | } |
| @@ -731,8 +727,7 @@ nlmsvc_grant_blocked(struct nlm_block *block) | |||
| 731 | switch (error) { | 727 | switch (error) { |
| 732 | case 0: | 728 | case 0: |
| 733 | break; | 729 | break; |
| 734 | case -EAGAIN: | 730 | case FILE_LOCK_DEFERRED: |
| 735 | case -EINPROGRESS: | ||
| 736 | dprintk("lockd: lock still blocked error %d\n", error); | 731 | dprintk("lockd: lock still blocked error %d\n", error); |
| 737 | nlmsvc_insert_block(block, NLM_NEVER); | 732 | nlmsvc_insert_block(block, NLM_NEVER); |
| 738 | nlmsvc_release_block(block); | 733 | nlmsvc_release_block(block); |
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 76019d2ff72d..76262c1986f2 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c | |||
| @@ -112,7 +112,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, | |||
| 112 | { | 112 | { |
| 113 | struct nlm_host *host; | 113 | struct nlm_host *host; |
| 114 | struct nlm_file *file; | 114 | struct nlm_file *file; |
| 115 | int rc = rpc_success; | 115 | __be32 rc = rpc_success; |
| 116 | 116 | ||
| 117 | dprintk("lockd: TEST called\n"); | 117 | dprintk("lockd: TEST called\n"); |
| 118 | resp->cookie = argp->cookie; | 118 | resp->cookie = argp->cookie; |
| @@ -146,7 +146,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, | |||
| 146 | { | 146 | { |
| 147 | struct nlm_host *host; | 147 | struct nlm_host *host; |
| 148 | struct nlm_file *file; | 148 | struct nlm_file *file; |
| 149 | int rc = rpc_success; | 149 | __be32 rc = rpc_success; |
| 150 | 150 | ||
| 151 | dprintk("lockd: LOCK called\n"); | 151 | dprintk("lockd: LOCK called\n"); |
| 152 | 152 | ||
diff --git a/fs/locks.c b/fs/locks.c index dce8c747371c..5eb259e3cd38 100644 --- a/fs/locks.c +++ b/fs/locks.c | |||
| @@ -201,7 +201,7 @@ EXPORT_SYMBOL(locks_init_lock); | |||
| 201 | * Initialises the fields of the file lock which are invariant for | 201 | * Initialises the fields of the file lock which are invariant for |
| 202 | * free file_locks. | 202 | * free file_locks. |
| 203 | */ | 203 | */ |
| 204 | static void init_once(struct kmem_cache *cache, void *foo) | 204 | static void init_once(void *foo) |
| 205 | { | 205 | { |
| 206 | struct file_lock *lock = (struct file_lock *) foo; | 206 | struct file_lock *lock = (struct file_lock *) foo; |
| 207 | 207 | ||
| @@ -779,8 +779,10 @@ find_conflict: | |||
| 779 | if (!flock_locks_conflict(request, fl)) | 779 | if (!flock_locks_conflict(request, fl)) |
| 780 | continue; | 780 | continue; |
| 781 | error = -EAGAIN; | 781 | error = -EAGAIN; |
| 782 | if (request->fl_flags & FL_SLEEP) | 782 | if (!(request->fl_flags & FL_SLEEP)) |
| 783 | locks_insert_block(fl, request); | 783 | goto out; |
| 784 | error = FILE_LOCK_DEFERRED; | ||
| 785 | locks_insert_block(fl, request); | ||
| 784 | goto out; | 786 | goto out; |
| 785 | } | 787 | } |
| 786 | if (request->fl_flags & FL_ACCESS) | 788 | if (request->fl_flags & FL_ACCESS) |
| @@ -836,7 +838,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str | |||
| 836 | error = -EDEADLK; | 838 | error = -EDEADLK; |
| 837 | if (posix_locks_deadlock(request, fl)) | 839 | if (posix_locks_deadlock(request, fl)) |
| 838 | goto out; | 840 | goto out; |
| 839 | error = -EAGAIN; | 841 | error = FILE_LOCK_DEFERRED; |
| 840 | locks_insert_block(fl, request); | 842 | locks_insert_block(fl, request); |
| 841 | goto out; | 843 | goto out; |
| 842 | } | 844 | } |
| @@ -1035,7 +1037,7 @@ int posix_lock_file_wait(struct file *filp, struct file_lock *fl) | |||
| 1035 | might_sleep (); | 1037 | might_sleep (); |
| 1036 | for (;;) { | 1038 | for (;;) { |
| 1037 | error = posix_lock_file(filp, fl, NULL); | 1039 | error = posix_lock_file(filp, fl, NULL); |
| 1038 | if ((error != -EAGAIN) || !(fl->fl_flags & FL_SLEEP)) | 1040 | if (error != FILE_LOCK_DEFERRED) |
| 1039 | break; | 1041 | break; |
| 1040 | error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); | 1042 | error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); |
| 1041 | if (!error) | 1043 | if (!error) |
| @@ -1107,9 +1109,7 @@ int locks_mandatory_area(int read_write, struct inode *inode, | |||
| 1107 | 1109 | ||
| 1108 | for (;;) { | 1110 | for (;;) { |
| 1109 | error = __posix_lock_file(inode, &fl, NULL); | 1111 | error = __posix_lock_file(inode, &fl, NULL); |
| 1110 | if (error != -EAGAIN) | 1112 | if (error != FILE_LOCK_DEFERRED) |
| 1111 | break; | ||
| 1112 | if (!(fl.fl_flags & FL_SLEEP)) | ||
| 1113 | break; | 1113 | break; |
| 1114 | error = wait_event_interruptible(fl.fl_wait, !fl.fl_next); | 1114 | error = wait_event_interruptible(fl.fl_wait, !fl.fl_next); |
| 1115 | if (!error) { | 1115 | if (!error) { |
| @@ -1531,7 +1531,7 @@ int flock_lock_file_wait(struct file *filp, struct file_lock *fl) | |||
| 1531 | might_sleep(); | 1531 | might_sleep(); |
| 1532 | for (;;) { | 1532 | for (;;) { |
| 1533 | error = flock_lock_file(filp, fl); | 1533 | error = flock_lock_file(filp, fl); |
| 1534 | if ((error != -EAGAIN) || !(fl->fl_flags & FL_SLEEP)) | 1534 | if (error != FILE_LOCK_DEFERRED) |
| 1535 | break; | 1535 | break; |
| 1536 | error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); | 1536 | error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); |
| 1537 | if (!error) | 1537 | if (!error) |
| @@ -1716,17 +1716,17 @@ out: | |||
| 1716 | * fl_grant is set. Callers expecting ->lock() to return asynchronously | 1716 | * fl_grant is set. Callers expecting ->lock() to return asynchronously |
| 1717 | * will only use F_SETLK, not F_SETLKW; they will set FL_SLEEP if (and only if) | 1717 | * will only use F_SETLK, not F_SETLKW; they will set FL_SLEEP if (and only if) |
| 1718 | * the request is for a blocking lock. When ->lock() does return asynchronously, | 1718 | * the request is for a blocking lock. When ->lock() does return asynchronously, |
| 1719 | * it must return -EINPROGRESS, and call ->fl_grant() when the lock | 1719 | * it must return FILE_LOCK_DEFERRED, and call ->fl_grant() when the lock |
| 1720 | * request completes. | 1720 | * request completes. |
| 1721 | * If the request is for non-blocking lock the file system should return | 1721 | * If the request is for non-blocking lock the file system should return |
| 1722 | * -EINPROGRESS then try to get the lock and call the callback routine with | 1722 | * FILE_LOCK_DEFERRED then try to get the lock and call the callback routine |
| 1723 | * the result. If the request timed out the callback routine will return a | 1723 | * with the result. If the request timed out the callback routine will return a |
| 1724 | * nonzero return code and the file system should release the lock. The file | 1724 | * nonzero return code and the file system should release the lock. The file |
| 1725 | * system is also responsible to keep a corresponding posix lock when it | 1725 | * system is also responsible to keep a corresponding posix lock when it |
| 1726 | * grants a lock so the VFS can find out which locks are locally held and do | 1726 | * grants a lock so the VFS can find out which locks are locally held and do |
| 1727 | * the correct lock cleanup when required. | 1727 | * the correct lock cleanup when required. |
| 1728 | * The underlying filesystem must not drop the kernel lock or call | 1728 | * The underlying filesystem must not drop the kernel lock or call |
| 1729 | * ->fl_grant() before returning to the caller with a -EINPROGRESS | 1729 | * ->fl_grant() before returning to the caller with a FILE_LOCK_DEFERRED |
| 1730 | * return code. | 1730 | * return code. |
| 1731 | */ | 1731 | */ |
| 1732 | int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) | 1732 | int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) |
| @@ -1738,6 +1738,30 @@ int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, str | |||
| 1738 | } | 1738 | } |
| 1739 | EXPORT_SYMBOL_GPL(vfs_lock_file); | 1739 | EXPORT_SYMBOL_GPL(vfs_lock_file); |
| 1740 | 1740 | ||
| 1741 | static int do_lock_file_wait(struct file *filp, unsigned int cmd, | ||
| 1742 | struct file_lock *fl) | ||
| 1743 | { | ||
| 1744 | int error; | ||
| 1745 | |||
| 1746 | error = security_file_lock(filp, fl->fl_type); | ||
| 1747 | if (error) | ||
| 1748 | return error; | ||
| 1749 | |||
| 1750 | for (;;) { | ||
| 1751 | error = vfs_lock_file(filp, cmd, fl, NULL); | ||
| 1752 | if (error != FILE_LOCK_DEFERRED) | ||
| 1753 | break; | ||
| 1754 | error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); | ||
| 1755 | if (!error) | ||
| 1756 | continue; | ||
| 1757 | |||
| 1758 | locks_delete_block(fl); | ||
| 1759 | break; | ||
| 1760 | } | ||
| 1761 | |||
| 1762 | return error; | ||
| 1763 | } | ||
| 1764 | |||
| 1741 | /* Apply the lock described by l to an open file descriptor. | 1765 | /* Apply the lock described by l to an open file descriptor. |
| 1742 | * This implements both the F_SETLK and F_SETLKW commands of fcntl(). | 1766 | * This implements both the F_SETLK and F_SETLKW commands of fcntl(). |
| 1743 | */ | 1767 | */ |
| @@ -1795,26 +1819,7 @@ again: | |||
| 1795 | goto out; | 1819 | goto out; |
| 1796 | } | 1820 | } |
| 1797 | 1821 | ||
| 1798 | error = security_file_lock(filp, file_lock->fl_type); | 1822 | error = do_lock_file_wait(filp, cmd, file_lock); |
| 1799 | if (error) | ||
| 1800 | goto out; | ||
| 1801 | |||
| 1802 | if (filp->f_op && filp->f_op->lock != NULL) | ||
| 1803 | error = filp->f_op->lock(filp, cmd, file_lock); | ||
| 1804 | else { | ||
| 1805 | for (;;) { | ||
| 1806 | error = posix_lock_file(filp, file_lock, NULL); | ||
| 1807 | if (error != -EAGAIN || cmd == F_SETLK) | ||
| 1808 | break; | ||
| 1809 | error = wait_event_interruptible(file_lock->fl_wait, | ||
| 1810 | !file_lock->fl_next); | ||
| 1811 | if (!error) | ||
| 1812 | continue; | ||
| 1813 | |||
| 1814 | locks_delete_block(file_lock); | ||
| 1815 | break; | ||
| 1816 | } | ||
| 1817 | } | ||
| 1818 | 1823 | ||
| 1819 | /* | 1824 | /* |
| 1820 | * Attempt to detect a close/fcntl race and recover by | 1825 | * Attempt to detect a close/fcntl race and recover by |
| @@ -1932,26 +1937,7 @@ again: | |||
| 1932 | goto out; | 1937 | goto out; |
| 1933 | } | 1938 | } |
| 1934 | 1939 | ||
| 1935 | error = security_file_lock(filp, file_lock->fl_type); | 1940 | error = do_lock_file_wait(filp, cmd, file_lock); |
| 1936 | if (error) | ||
| 1937 | goto out; | ||
| 1938 | |||
| 1939 | if (filp->f_op && filp->f_op->lock != NULL) | ||
| 1940 | error = filp->f_op->lock(filp, cmd, file_lock); | ||
| 1941 | else { | ||
| 1942 | for (;;) { | ||
| 1943 | error = posix_lock_file(filp, file_lock, NULL); | ||
| 1944 | if (error != -EAGAIN || cmd == F_SETLK64) | ||
| 1945 | break; | ||
| 1946 | error = wait_event_interruptible(file_lock->fl_wait, | ||
| 1947 | !file_lock->fl_next); | ||
| 1948 | if (!error) | ||
| 1949 | continue; | ||
| 1950 | |||
| 1951 | locks_delete_block(file_lock); | ||
| 1952 | break; | ||
| 1953 | } | ||
| 1954 | } | ||
| 1955 | 1941 | ||
| 1956 | /* | 1942 | /* |
| 1957 | * Attempt to detect a close/fcntl race and recover by | 1943 | * Attempt to detect a close/fcntl race and recover by |
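
The repeated fcntl wait loops above are folded into do_lock_file_wait(): call vfs_lock_file(), and as long as it returns FILE_LOCK_DEFERRED sleep interruptibly until the block is released, cleaning up on a signal. A user-space sketch of that loop shape, with stub helpers and an illustrative status value:

#define SKETCH_FILE_LOCK_DEFERRED 1

static int vfs_lock_once(void)        { return 0; }  /* 0, -errno, or DEFERRED */
static int wait_until_unblocked(void) { return 0; }  /* 0, or -EINTR on a signal */
static void delete_block(void)        { }

int lock_file_wait_sketch(void)
{
        int error;

        for (;;) {
                error = vfs_lock_once();
                if (error != SKETCH_FILE_LOCK_DEFERRED)
                        break;                  /* granted, or a real error */
                error = wait_until_unblocked();
                if (!error)
                        continue;               /* woken: retry the lock */
                delete_block();                 /* interrupted: clean up */
                break;
        }
        return error;
}
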
diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 84f6242ba6fc..d1d1eb84679d 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c | |||
| @@ -68,7 +68,7 @@ static void minix_destroy_inode(struct inode *inode) | |||
| 68 | kmem_cache_free(minix_inode_cachep, minix_i(inode)); | 68 | kmem_cache_free(minix_inode_cachep, minix_i(inode)); |
| 69 | } | 69 | } |
| 70 | 70 | ||
| 71 | static void init_once(struct kmem_cache * cachep, void *foo) | 71 | static void init_once(void *foo) |
| 72 | { | 72 | { |
| 73 | struct minix_inode_info *ei = (struct minix_inode_info *) foo; | 73 | struct minix_inode_info *ei = (struct minix_inode_info *) foo; |
| 74 | 74 | ||
| @@ -256,9 +256,6 @@ static int minix_fill_super(struct super_block *s, void *data, int silent) | |||
| 256 | if (!s->s_root) | 256 | if (!s->s_root) |
| 257 | goto out_iput; | 257 | goto out_iput; |
| 258 | 258 | ||
| 259 | if (!NO_TRUNCATE) | ||
| 260 | s->s_root->d_op = &minix_dentry_operations; | ||
| 261 | |||
| 262 | if (!(s->s_flags & MS_RDONLY)) { | 259 | if (!(s->s_flags & MS_RDONLY)) { |
| 263 | if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */ | 260 | if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */ |
| 264 | ms->s_state &= ~MINIX_VALID_FS; | 261 | ms->s_state &= ~MINIX_VALID_FS; |
diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 326edfe96108..e6a0b193bea4 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h | |||
| @@ -2,11 +2,6 @@ | |||
| 2 | #include <linux/pagemap.h> | 2 | #include <linux/pagemap.h> |
| 3 | #include <linux/minix_fs.h> | 3 | #include <linux/minix_fs.h> |
| 4 | 4 | ||
| 5 | /* | ||
| 6 | * change the define below to 0 if you want names > info->s_namelen chars to be | ||
| 7 | * truncated. Else they will be disallowed (ENAMETOOLONG). | ||
| 8 | */ | ||
| 9 | #define NO_TRUNCATE 1 | ||
| 10 | #define INODE_VERSION(inode) minix_sb(inode->i_sb)->s_version | 5 | #define INODE_VERSION(inode) minix_sb(inode->i_sb)->s_version |
| 11 | #define MINIX_V1 0x0001 /* original minix fs */ | 6 | #define MINIX_V1 0x0001 /* original minix fs */ |
| 12 | #define MINIX_V2 0x0002 /* minix V2 fs */ | 7 | #define MINIX_V2 0x0002 /* minix V2 fs */ |
| @@ -83,7 +78,6 @@ extern const struct inode_operations minix_file_inode_operations; | |||
| 83 | extern const struct inode_operations minix_dir_inode_operations; | 78 | extern const struct inode_operations minix_dir_inode_operations; |
| 84 | extern const struct file_operations minix_file_operations; | 79 | extern const struct file_operations minix_file_operations; |
| 85 | extern const struct file_operations minix_dir_operations; | 80 | extern const struct file_operations minix_dir_operations; |
| 86 | extern struct dentry_operations minix_dentry_operations; | ||
| 87 | 81 | ||
| 88 | static inline struct minix_sb_info *minix_sb(struct super_block *sb) | 82 | static inline struct minix_sb_info *minix_sb(struct super_block *sb) |
| 89 | { | 83 | { |
diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 102241bc9c79..32b131cd6121 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c | |||
| @@ -18,30 +18,6 @@ static int add_nondir(struct dentry *dentry, struct inode *inode) | |||
| 18 | return err; | 18 | return err; |
| 19 | } | 19 | } |
| 20 | 20 | ||
| 21 | static int minix_hash(struct dentry *dentry, struct qstr *qstr) | ||
| 22 | { | ||
| 23 | unsigned long hash; | ||
| 24 | int i; | ||
| 25 | const unsigned char *name; | ||
| 26 | |||
| 27 | i = minix_sb(dentry->d_inode->i_sb)->s_namelen; | ||
| 28 | if (i >= qstr->len) | ||
| 29 | return 0; | ||
| 30 | /* Truncate the name in place, avoids having to define a compare | ||
| 31 | function. */ | ||
| 32 | qstr->len = i; | ||
| 33 | name = qstr->name; | ||
| 34 | hash = init_name_hash(); | ||
| 35 | while (i--) | ||
| 36 | hash = partial_name_hash(*name++, hash); | ||
| 37 | qstr->hash = end_name_hash(hash); | ||
| 38 | return 0; | ||
| 39 | } | ||
| 40 | |||
| 41 | struct dentry_operations minix_dentry_operations = { | ||
| 42 | .d_hash = minix_hash, | ||
| 43 | }; | ||
| 44 | |||
| 45 | static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) | 21 | static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) |
| 46 | { | 22 | { |
| 47 | struct inode * inode = NULL; | 23 | struct inode * inode = NULL; |
diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c index 1f7f2956412a..e844b9809d27 100644 --- a/fs/msdos/namei.c +++ b/fs/msdos/namei.c | |||
| @@ -14,12 +14,7 @@ | |||
| 14 | 14 | ||
| 15 | /* Characters that are undesirable in an MS-DOS file name */ | 15 | /* Characters that are undesirable in an MS-DOS file name */ |
| 16 | static unsigned char bad_chars[] = "*?<>|\""; | 16 | static unsigned char bad_chars[] = "*?<>|\""; |
| 17 | static unsigned char bad_if_strict_pc[] = "+=,; "; | 17 | static unsigned char bad_if_strict[] = "+=,; "; |
| 18 | /* GEMDOS is less restrictive */ | ||
| 19 | static unsigned char bad_if_strict_atari[] = " "; | ||
| 20 | |||
| 21 | #define bad_if_strict(opts) \ | ||
| 22 | ((opts)->atari ? bad_if_strict_atari : bad_if_strict_pc) | ||
| 23 | 18 | ||
| 24 | /***** Formats an MS-DOS file name. Rejects invalid names. */ | 19 | /***** Formats an MS-DOS file name. Rejects invalid names. */ |
| 25 | static int msdos_format_name(const unsigned char *name, int len, | 20 | static int msdos_format_name(const unsigned char *name, int len, |
| @@ -40,21 +35,20 @@ static int msdos_format_name(const unsigned char *name, int len, | |||
| 40 | /* Get rid of dot - test for it elsewhere */ | 35 | /* Get rid of dot - test for it elsewhere */ |
| 41 | name++; | 36 | name++; |
| 42 | len--; | 37 | len--; |
| 43 | } else if (!opts->atari) | 38 | } else |
| 44 | return -EINVAL; | 39 | return -EINVAL; |
| 45 | } | 40 | } |
| 46 | /* | 41 | /* |
| 47 | * disallow names that _really_ start with a dot for MS-DOS, | 42 | * disallow names that _really_ start with a dot |
| 48 | * GEMDOS does not care | ||
| 49 | */ | 43 | */ |
| 50 | space = !opts->atari; | 44 | space = 1; |
| 51 | c = 0; | 45 | c = 0; |
| 52 | for (walk = res; len && walk - res < 8; walk++) { | 46 | for (walk = res; len && walk - res < 8; walk++) { |
| 53 | c = *name++; | 47 | c = *name++; |
| 54 | len--; | 48 | len--; |
| 55 | if (opts->name_check != 'r' && strchr(bad_chars, c)) | 49 | if (opts->name_check != 'r' && strchr(bad_chars, c)) |
| 56 | return -EINVAL; | 50 | return -EINVAL; |
| 57 | if (opts->name_check == 's' && strchr(bad_if_strict(opts), c)) | 51 | if (opts->name_check == 's' && strchr(bad_if_strict, c)) |
| 58 | return -EINVAL; | 52 | return -EINVAL; |
| 59 | if (c >= 'A' && c <= 'Z' && opts->name_check == 's') | 53 | if (c >= 'A' && c <= 'Z' && opts->name_check == 's') |
| 60 | return -EINVAL; | 54 | return -EINVAL; |
| @@ -94,7 +88,7 @@ static int msdos_format_name(const unsigned char *name, int len, | |||
| 94 | if (opts->name_check != 'r' && strchr(bad_chars, c)) | 88 | if (opts->name_check != 'r' && strchr(bad_chars, c)) |
| 95 | return -EINVAL; | 89 | return -EINVAL; |
| 96 | if (opts->name_check == 's' && | 90 | if (opts->name_check == 's' && |
| 97 | strchr(bad_if_strict(opts), c)) | 91 | strchr(bad_if_strict, c)) |
| 98 | return -EINVAL; | 92 | return -EINVAL; |
| 99 | if (c < ' ' || c == ':' || c == '\\') | 93 | if (c < ' ' || c == ':' || c == '\\') |
| 100 | return -EINVAL; | 94 | return -EINVAL; |
| @@ -243,6 +237,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name, | |||
| 243 | int is_dir, int is_hid, int cluster, | 237 | int is_dir, int is_hid, int cluster, |
| 244 | struct timespec *ts, struct fat_slot_info *sinfo) | 238 | struct timespec *ts, struct fat_slot_info *sinfo) |
| 245 | { | 239 | { |
| 240 | struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb); | ||
| 246 | struct msdos_dir_entry de; | 241 | struct msdos_dir_entry de; |
| 247 | __le16 time, date; | 242 | __le16 time, date; |
| 248 | int err; | 243 | int err; |
| @@ -252,7 +247,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name, | |||
| 252 | if (is_hid) | 247 | if (is_hid) |
| 253 | de.attr |= ATTR_HIDDEN; | 248 | de.attr |= ATTR_HIDDEN; |
| 254 | de.lcase = 0; | 249 | de.lcase = 0; |
| 255 | fat_date_unix2dos(ts->tv_sec, &time, &date); | 250 | fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc); |
| 256 | de.cdate = de.adate = 0; | 251 | de.cdate = de.adate = 0; |
| 257 | de.ctime = 0; | 252 | de.ctime = 0; |
| 258 | de.ctime_cs = 0; | 253 | de.ctime_cs = 0; |
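
msdos_add_entry() now passes sbi->options.tz_utc into fat_date_unix2dos(), so timestamps can be encoded from UTC instead of local wall time when the corresponding mount option is set. A user-space sketch of a DOS date/time encoder making the same choice (the option plumbing is illustrative; the bit layout is the standard FAT one):

#include <stdint.h>
#include <time.h>

void unix_to_dos(time_t t, int tz_utc,
                 uint16_t *dos_time, uint16_t *dos_date)
{
        struct tm tm;

        if (tz_utc)
                gmtime_r(&t, &tm);      /* store UTC verbatim */
        else
                localtime_r(&t, &tm);   /* classic behaviour: local wall time */

        /* date: year since 1980, month, day; time: hour, minute, 2-sec units */
        *dos_date = (uint16_t)(((tm.tm_year - 80) << 9) |
                               ((tm.tm_mon + 1) << 5) | tm.tm_mday);
        *dos_time = (uint16_t)((tm.tm_hour << 11) |
                               (tm.tm_min << 5) | (tm.tm_sec / 2));
}
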
diff --git a/fs/namei.c b/fs/namei.c index 01e67dddcc3d..4ea63ed5e791 100644 --- a/fs/namei.c +++ b/fs/namei.c | |||
| @@ -31,7 +31,6 @@ | |||
| 31 | #include <linux/file.h> | 31 | #include <linux/file.h> |
| 32 | #include <linux/fcntl.h> | 32 | #include <linux/fcntl.h> |
| 33 | #include <linux/device_cgroup.h> | 33 | #include <linux/device_cgroup.h> |
| 34 | #include <asm/namei.h> | ||
| 35 | #include <asm/uaccess.h> | 34 | #include <asm/uaccess.h> |
| 36 | 35 | ||
| 37 | #define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE]) | 36 | #define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE]) |
| @@ -185,6 +184,8 @@ int generic_permission(struct inode *inode, int mask, | |||
| 185 | { | 184 | { |
| 186 | umode_t mode = inode->i_mode; | 185 | umode_t mode = inode->i_mode; |
| 187 | 186 | ||
| 187 | mask &= MAY_READ | MAY_WRITE | MAY_EXEC; | ||
| 188 | |||
| 188 | if (current->fsuid == inode->i_uid) | 189 | if (current->fsuid == inode->i_uid) |
| 189 | mode >>= 6; | 190 | mode >>= 6; |
| 190 | else { | 191 | else { |
| @@ -203,7 +204,7 @@ int generic_permission(struct inode *inode, int mask, | |||
| 203 | /* | 204 | /* |
| 204 | * If the DACs are ok we don't need any capability check. | 205 | * If the DACs are ok we don't need any capability check. |
| 205 | */ | 206 | */ |
| 206 | if (((mode & mask & (MAY_READ|MAY_WRITE|MAY_EXEC)) == mask)) | 207 | if ((mask & ~mode) == 0) |
| 207 | return 0; | 208 | return 0; |
| 208 | 209 | ||
| 209 | check_capabilities: | 210 | check_capabilities: |
| @@ -226,13 +227,9 @@ int generic_permission(struct inode *inode, int mask, | |||
| 226 | return -EACCES; | 227 | return -EACCES; |
| 227 | } | 228 | } |
| 228 | 229 | ||
| 229 | int permission(struct inode *inode, int mask, struct nameidata *nd) | 230 | int inode_permission(struct inode *inode, int mask) |
| 230 | { | 231 | { |
| 231 | int retval, submask; | 232 | int retval; |
| 232 | struct vfsmount *mnt = NULL; | ||
| 233 | |||
| 234 | if (nd) | ||
| 235 | mnt = nd->path.mnt; | ||
| 236 | 233 | ||
| 237 | if (mask & MAY_WRITE) { | 234 | if (mask & MAY_WRITE) { |
| 238 | umode_t mode = inode->i_mode; | 235 | umode_t mode = inode->i_mode; |
| @@ -251,19 +248,9 @@ int permission(struct inode *inode, int mask, struct nameidata *nd) | |||
| 251 | return -EACCES; | 248 | return -EACCES; |
| 252 | } | 249 | } |
| 253 | 250 | ||
| 254 | if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) { | ||
| 255 | /* | ||
| 256 | * MAY_EXEC on regular files is denied if the fs is mounted | ||
| 257 | * with the "noexec" flag. | ||
| 258 | */ | ||
| 259 | if (mnt && (mnt->mnt_flags & MNT_NOEXEC)) | ||
| 260 | return -EACCES; | ||
| 261 | } | ||
| 262 | |||
| 263 | /* Ordinary permission routines do not understand MAY_APPEND. */ | 251 | /* Ordinary permission routines do not understand MAY_APPEND. */ |
| 264 | submask = mask & ~MAY_APPEND; | ||
| 265 | if (inode->i_op && inode->i_op->permission) { | 252 | if (inode->i_op && inode->i_op->permission) { |
| 266 | retval = inode->i_op->permission(inode, submask, nd); | 253 | retval = inode->i_op->permission(inode, mask); |
| 267 | if (!retval) { | 254 | if (!retval) { |
| 268 | /* | 255 | /* |
| 269 | * Exec permission on a regular file is denied if none | 256 | * Exec permission on a regular file is denied if none |
| @@ -277,7 +264,7 @@ int permission(struct inode *inode, int mask, struct nameidata *nd) | |||
| 277 | return -EACCES; | 264 | return -EACCES; |
| 278 | } | 265 | } |
| 279 | } else { | 266 | } else { |
| 280 | retval = generic_permission(inode, submask, NULL); | 267 | retval = generic_permission(inode, mask, NULL); |
| 281 | } | 268 | } |
| 282 | if (retval) | 269 | if (retval) |
| 283 | return retval; | 270 | return retval; |
| @@ -286,7 +273,8 @@ int permission(struct inode *inode, int mask, struct nameidata *nd) | |||
| 286 | if (retval) | 273 | if (retval) |
| 287 | return retval; | 274 | return retval; |
| 288 | 275 | ||
| 289 | return security_inode_permission(inode, mask, nd); | 276 | return security_inode_permission(inode, |
| 277 | mask & (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND)); | ||
| 290 | } | 278 | } |
| 291 | 279 | ||
| 292 | /** | 280 | /** |
| @@ -301,7 +289,7 @@ int permission(struct inode *inode, int mask, struct nameidata *nd) | |||
| 301 | */ | 289 | */ |
| 302 | int vfs_permission(struct nameidata *nd, int mask) | 290 | int vfs_permission(struct nameidata *nd, int mask) |
| 303 | { | 291 | { |
| 304 | return permission(nd->path.dentry->d_inode, mask, nd); | 292 | return inode_permission(nd->path.dentry->d_inode, mask); |
| 305 | } | 293 | } |
| 306 | 294 | ||
| 307 | /** | 295 | /** |
| @@ -318,7 +306,7 @@ int vfs_permission(struct nameidata *nd, int mask) | |||
| 318 | */ | 306 | */ |
| 319 | int file_permission(struct file *file, int mask) | 307 | int file_permission(struct file *file, int mask) |
| 320 | { | 308 | { |
| 321 | return permission(file->f_path.dentry->d_inode, mask, NULL); | 309 | return inode_permission(file->f_path.dentry->d_inode, mask); |
| 322 | } | 310 | } |
| 323 | 311 | ||
| 324 | /* | 312 | /* |
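permission(inode, mask, nd) becomes inode_permission(inode, mask), and vfs_permission()/file_permission() are reduced to trivial wrappers, as the hunks above show. A minimal sketch of an updated caller — example_check_write is hypothetical; the inode_permission() prototype is the one introduced above:

static int example_check_write(struct file *filp)
{
        struct inode *inode = filp->f_path.dentry->d_inode;

        /* before: permission(inode, MAY_WRITE, NULL) */
        return inode_permission(inode, MAY_WRITE);
}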
| @@ -459,8 +447,7 @@ static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, | |||
| 459 | * short-cut DAC fails, then call permission() to do more | 447 | * short-cut DAC fails, then call permission() to do more |
| 460 | * complete permission check. | 448 | * complete permission check. |
| 461 | */ | 449 | */ |
| 462 | static int exec_permission_lite(struct inode *inode, | 450 | static int exec_permission_lite(struct inode *inode) |
| 463 | struct nameidata *nd) | ||
| 464 | { | 451 | { |
| 465 | umode_t mode = inode->i_mode; | 452 | umode_t mode = inode->i_mode; |
| 466 | 453 | ||
| @@ -486,7 +473,7 @@ static int exec_permission_lite(struct inode *inode, | |||
| 486 | 473 | ||
| 487 | return -EACCES; | 474 | return -EACCES; |
| 488 | ok: | 475 | ok: |
| 489 | return security_inode_permission(inode, MAY_EXEC, nd); | 476 | return security_inode_permission(inode, MAY_EXEC); |
| 490 | } | 477 | } |
| 491 | 478 | ||
| 492 | /* | 479 | /* |
| @@ -519,7 +506,14 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s | |||
| 519 | */ | 506 | */ |
| 520 | result = d_lookup(parent, name); | 507 | result = d_lookup(parent, name); |
| 521 | if (!result) { | 508 | if (!result) { |
| 522 | struct dentry * dentry = d_alloc(parent, name); | 509 | struct dentry *dentry; |
| 510 | |||
| 511 | /* Don't create child dentry for a dead directory. */ | ||
| 512 | result = ERR_PTR(-ENOENT); | ||
| 513 | if (IS_DEADDIR(dir)) | ||
| 514 | goto out_unlock; | ||
| 515 | |||
| 516 | dentry = d_alloc(parent, name); | ||
| 523 | result = ERR_PTR(-ENOMEM); | 517 | result = ERR_PTR(-ENOMEM); |
| 524 | if (dentry) { | 518 | if (dentry) { |
| 525 | result = dir->i_op->lookup(dir, dentry, nd); | 519 | result = dir->i_op->lookup(dir, dentry, nd); |
| @@ -528,6 +522,7 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s | |||
| 528 | else | 522 | else |
| 529 | result = dentry; | 523 | result = dentry; |
| 530 | } | 524 | } |
| 525 | out_unlock: | ||
| 531 | mutex_unlock(&dir->i_mutex); | 526 | mutex_unlock(&dir->i_mutex); |
| 532 | return result; | 527 | return result; |
| 533 | } | 528 | } |
| @@ -545,27 +540,16 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s | |||
| 545 | return result; | 540 | return result; |
| 546 | } | 541 | } |
| 547 | 542 | ||
| 548 | static int __emul_lookup_dentry(const char *, struct nameidata *); | ||
| 549 | |||
| 550 | /* SMP-safe */ | 543 | /* SMP-safe */ |
| 551 | static __always_inline int | 544 | static __always_inline void |
| 552 | walk_init_root(const char *name, struct nameidata *nd) | 545 | walk_init_root(const char *name, struct nameidata *nd) |
| 553 | { | 546 | { |
| 554 | struct fs_struct *fs = current->fs; | 547 | struct fs_struct *fs = current->fs; |
| 555 | 548 | ||
| 556 | read_lock(&fs->lock); | 549 | read_lock(&fs->lock); |
| 557 | if (fs->altroot.dentry && !(nd->flags & LOOKUP_NOALT)) { | ||
| 558 | nd->path = fs->altroot; | ||
| 559 | path_get(&fs->altroot); | ||
| 560 | read_unlock(&fs->lock); | ||
| 561 | if (__emul_lookup_dentry(name,nd)) | ||
| 562 | return 0; | ||
| 563 | read_lock(&fs->lock); | ||
| 564 | } | ||
| 565 | nd->path = fs->root; | 550 | nd->path = fs->root; |
| 566 | path_get(&fs->root); | 551 | path_get(&fs->root); |
| 567 | read_unlock(&fs->lock); | 552 | read_unlock(&fs->lock); |
| 568 | return 1; | ||
| 569 | } | 553 | } |
| 570 | 554 | ||
| 571 | /* | 555 | /* |
| @@ -606,12 +590,9 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l | |||
| 606 | 590 | ||
| 607 | if (*link == '/') { | 591 | if (*link == '/') { |
| 608 | path_put(&nd->path); | 592 | path_put(&nd->path); |
| 609 | if (!walk_init_root(link, nd)) | 593 | walk_init_root(link, nd); |
| 610 | /* weird __emul_prefix() stuff did it */ | ||
| 611 | goto out; | ||
| 612 | } | 594 | } |
| 613 | res = link_path_walk(link, nd); | 595 | res = link_path_walk(link, nd); |
| 614 | out: | ||
| 615 | if (nd->depth || res || nd->last_type!=LAST_NORM) | 596 | if (nd->depth || res || nd->last_type!=LAST_NORM) |
| 616 | return res; | 597 | return res; |
| 617 | /* | 598 | /* |
| @@ -889,7 +870,7 @@ static int __link_path_walk(const char *name, struct nameidata *nd) | |||
| 889 | unsigned int c; | 870 | unsigned int c; |
| 890 | 871 | ||
| 891 | nd->flags |= LOOKUP_CONTINUE; | 872 | nd->flags |= LOOKUP_CONTINUE; |
| 892 | err = exec_permission_lite(inode, nd); | 873 | err = exec_permission_lite(inode); |
| 893 | if (err == -EAGAIN) | 874 | if (err == -EAGAIN) |
| 894 | err = vfs_permission(nd, MAY_EXEC); | 875 | err = vfs_permission(nd, MAY_EXEC); |
| 895 | if (err) | 876 | if (err) |
| @@ -1060,67 +1041,6 @@ static int path_walk(const char *name, struct nameidata *nd) | |||
| 1060 | return link_path_walk(name, nd); | 1041 | return link_path_walk(name, nd); |
| 1061 | } | 1042 | } |
| 1062 | 1043 | ||
| 1063 | /* | ||
| 1064 | * SMP-safe: Returns 1 and nd will have valid dentry and mnt, if | ||
| 1065 | * everything is done. Returns 0 and drops input nd, if lookup failed; | ||
| 1066 | */ | ||
| 1067 | static int __emul_lookup_dentry(const char *name, struct nameidata *nd) | ||
| 1068 | { | ||
| 1069 | if (path_walk(name, nd)) | ||
| 1070 | return 0; /* something went wrong... */ | ||
| 1071 | |||
| 1072 | if (!nd->path.dentry->d_inode || | ||
| 1073 | S_ISDIR(nd->path.dentry->d_inode->i_mode)) { | ||
| 1074 | struct path old_path = nd->path; | ||
| 1075 | struct qstr last = nd->last; | ||
| 1076 | int last_type = nd->last_type; | ||
| 1077 | struct fs_struct *fs = current->fs; | ||
| 1078 | |||
| 1079 | /* | ||
| 1080 | * NAME was not found in alternate root or it's a directory. | ||
| 1081 | * Try to find it in the normal root: | ||
| 1082 | */ | ||
| 1083 | nd->last_type = LAST_ROOT; | ||
| 1084 | read_lock(&fs->lock); | ||
| 1085 | nd->path = fs->root; | ||
| 1086 | path_get(&fs->root); | ||
| 1087 | read_unlock(&fs->lock); | ||
| 1088 | if (path_walk(name, nd) == 0) { | ||
| 1089 | if (nd->path.dentry->d_inode) { | ||
| 1090 | path_put(&old_path); | ||
| 1091 | return 1; | ||
| 1092 | } | ||
| 1093 | path_put(&nd->path); | ||
| 1094 | } | ||
| 1095 | nd->path = old_path; | ||
| 1096 | nd->last = last; | ||
| 1097 | nd->last_type = last_type; | ||
| 1098 | } | ||
| 1099 | return 1; | ||
| 1100 | } | ||
| 1101 | |||
| 1102 | void set_fs_altroot(void) | ||
| 1103 | { | ||
| 1104 | char *emul = __emul_prefix(); | ||
| 1105 | struct nameidata nd; | ||
| 1106 | struct path path = {}, old_path; | ||
| 1107 | int err; | ||
| 1108 | struct fs_struct *fs = current->fs; | ||
| 1109 | |||
| 1110 | if (!emul) | ||
| 1111 | goto set_it; | ||
| 1112 | err = path_lookup(emul, LOOKUP_FOLLOW|LOOKUP_DIRECTORY|LOOKUP_NOALT, &nd); | ||
| 1113 | if (!err) | ||
| 1114 | path = nd.path; | ||
| 1115 | set_it: | ||
| 1116 | write_lock(&fs->lock); | ||
| 1117 | old_path = fs->altroot; | ||
| 1118 | fs->altroot = path; | ||
| 1119 | write_unlock(&fs->lock); | ||
| 1120 | if (old_path.dentry) | ||
| 1121 | path_put(&old_path); | ||
| 1122 | } | ||
| 1123 | |||
| 1124 | /* Returns 0 and nd will be valid on success; Returns error, otherwise. */ | 1044 | /* Returns 0 and nd will be valid on success; Returns error, otherwise. */ |

| 1125 | static int do_path_lookup(int dfd, const char *name, | 1045 | static int do_path_lookup(int dfd, const char *name, |
| 1126 | unsigned int flags, struct nameidata *nd) | 1046 | unsigned int flags, struct nameidata *nd) |
| @@ -1136,14 +1056,6 @@ static int do_path_lookup(int dfd, const char *name, | |||
| 1136 | 1056 | ||
| 1137 | if (*name=='/') { | 1057 | if (*name=='/') { |
| 1138 | read_lock(&fs->lock); | 1058 | read_lock(&fs->lock); |
| 1139 | if (fs->altroot.dentry && !(nd->flags & LOOKUP_NOALT)) { | ||
| 1140 | nd->path = fs->altroot; | ||
| 1141 | path_get(&fs->altroot); | ||
| 1142 | read_unlock(&fs->lock); | ||
| 1143 | if (__emul_lookup_dentry(name,nd)) | ||
| 1144 | goto out; /* found in altroot */ | ||
| 1145 | read_lock(&fs->lock); | ||
| 1146 | } | ||
| 1147 | nd->path = fs->root; | 1059 | nd->path = fs->root; |
| 1148 | path_get(&fs->root); | 1060 | path_get(&fs->root); |
| 1149 | read_unlock(&fs->lock); | 1061 | read_unlock(&fs->lock); |
| @@ -1177,7 +1089,6 @@ static int do_path_lookup(int dfd, const char *name, | |||
| 1177 | } | 1089 | } |
| 1178 | 1090 | ||
| 1179 | retval = path_walk(name, nd); | 1091 | retval = path_walk(name, nd); |
| 1180 | out: | ||
| 1181 | if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && | 1092 | if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && |
| 1182 | nd->path.dentry->d_inode)) | 1093 | nd->path.dentry->d_inode)) |
| 1183 | audit_inode(name, nd->path.dentry); | 1094 | audit_inode(name, nd->path.dentry); |
| @@ -1282,19 +1193,6 @@ static int path_lookup_create(int dfd, const char *name, | |||
| 1282 | nd, open_flags, create_mode); | 1193 | nd, open_flags, create_mode); |
| 1283 | } | 1194 | } |
| 1284 | 1195 | ||
| 1285 | int __user_path_lookup_open(const char __user *name, unsigned int lookup_flags, | ||
| 1286 | struct nameidata *nd, int open_flags) | ||
| 1287 | { | ||
| 1288 | char *tmp = getname(name); | ||
| 1289 | int err = PTR_ERR(tmp); | ||
| 1290 | |||
| 1291 | if (!IS_ERR(tmp)) { | ||
| 1292 | err = __path_lookup_intent_open(AT_FDCWD, tmp, lookup_flags, nd, open_flags, 0); | ||
| 1293 | putname(tmp); | ||
| 1294 | } | ||
| 1295 | return err; | ||
| 1296 | } | ||
| 1297 | |||
| 1298 | static struct dentry *__lookup_hash(struct qstr *name, | 1196 | static struct dentry *__lookup_hash(struct qstr *name, |
| 1299 | struct dentry *base, struct nameidata *nd) | 1197 | struct dentry *base, struct nameidata *nd) |
| 1300 | { | 1198 | { |
| @@ -1317,7 +1215,14 @@ static struct dentry *__lookup_hash(struct qstr *name, | |||
| 1317 | 1215 | ||
| 1318 | dentry = cached_lookup(base, name, nd); | 1216 | dentry = cached_lookup(base, name, nd); |
| 1319 | if (!dentry) { | 1217 | if (!dentry) { |
| 1320 | struct dentry *new = d_alloc(base, name); | 1218 | struct dentry *new; |
| 1219 | |||
| 1220 | /* Don't create child dentry for a dead directory. */ | ||
| 1221 | dentry = ERR_PTR(-ENOENT); | ||
| 1222 | if (IS_DEADDIR(inode)) | ||
| 1223 | goto out; | ||
| 1224 | |||
| 1225 | new = d_alloc(base, name); | ||
| 1321 | dentry = ERR_PTR(-ENOMEM); | 1226 | dentry = ERR_PTR(-ENOMEM); |
| 1322 | if (!new) | 1227 | if (!new) |
| 1323 | goto out; | 1228 | goto out; |
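Both real_lookup() and __lookup_hash() now bail out with -ENOENT before allocating a child dentry when the parent directory has already been killed (IS_DEADDIR), rather than handing a fresh dentry to ->lookup() on a dead directory. The guard shown in isolation as a rough sketch — example_alloc_child is not a real helper, and locking is omitted:

static struct dentry *example_alloc_child(struct inode *dir,
                                          struct dentry *parent,
                                          struct qstr *name)
{
        struct dentry *dentry;

        if (IS_DEADDIR(dir))            /* parent was already rmdir'ed */
                return ERR_PTR(-ENOENT);

        dentry = d_alloc(parent, name);
        if (!dentry)
                return ERR_PTR(-ENOMEM);
        return dentry;
}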
| @@ -1340,7 +1245,7 @@ static struct dentry *lookup_hash(struct nameidata *nd) | |||
| 1340 | { | 1245 | { |
| 1341 | int err; | 1246 | int err; |
| 1342 | 1247 | ||
| 1343 | err = permission(nd->path.dentry->d_inode, MAY_EXEC, nd); | 1248 | err = inode_permission(nd->path.dentry->d_inode, MAY_EXEC); |
| 1344 | if (err) | 1249 | if (err) |
| 1345 | return ERR_PTR(err); | 1250 | return ERR_PTR(err); |
| 1346 | return __lookup_hash(&nd->last, nd->path.dentry, nd); | 1251 | return __lookup_hash(&nd->last, nd->path.dentry, nd); |
| @@ -1388,7 +1293,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) | |||
| 1388 | if (err) | 1293 | if (err) |
| 1389 | return ERR_PTR(err); | 1294 | return ERR_PTR(err); |
| 1390 | 1295 | ||
| 1391 | err = permission(base->d_inode, MAY_EXEC, NULL); | 1296 | err = inode_permission(base->d_inode, MAY_EXEC); |
| 1392 | if (err) | 1297 | if (err) |
| 1393 | return ERR_PTR(err); | 1298 | return ERR_PTR(err); |
| 1394 | return __lookup_hash(&this, base, NULL); | 1299 | return __lookup_hash(&this, base, NULL); |
| @@ -1416,22 +1321,40 @@ struct dentry *lookup_one_noperm(const char *name, struct dentry *base) | |||
| 1416 | return __lookup_hash(&this, base, NULL); | 1321 | return __lookup_hash(&this, base, NULL); |
| 1417 | } | 1322 | } |
| 1418 | 1323 | ||
| 1419 | int __user_walk_fd(int dfd, const char __user *name, unsigned flags, | 1324 | int user_path_at(int dfd, const char __user *name, unsigned flags, |
| 1420 | struct nameidata *nd) | 1325 | struct path *path) |
| 1421 | { | 1326 | { |
| 1327 | struct nameidata nd; | ||
| 1422 | char *tmp = getname(name); | 1328 | char *tmp = getname(name); |
| 1423 | int err = PTR_ERR(tmp); | 1329 | int err = PTR_ERR(tmp); |
| 1424 | |||
| 1425 | if (!IS_ERR(tmp)) { | 1330 | if (!IS_ERR(tmp)) { |
| 1426 | err = do_path_lookup(dfd, tmp, flags, nd); | 1331 | |
| 1332 | BUG_ON(flags & LOOKUP_PARENT); | ||
| 1333 | |||
| 1334 | err = do_path_lookup(dfd, tmp, flags, &nd); | ||
| 1427 | putname(tmp); | 1335 | putname(tmp); |
| 1336 | if (!err) | ||
| 1337 | *path = nd.path; | ||
| 1428 | } | 1338 | } |
| 1429 | return err; | 1339 | return err; |
| 1430 | } | 1340 | } |
| 1431 | 1341 | ||
| 1432 | int __user_walk(const char __user *name, unsigned flags, struct nameidata *nd) | 1342 | static int user_path_parent(int dfd, const char __user *path, |
| 1343 | struct nameidata *nd, char **name) | ||
| 1433 | { | 1344 | { |
| 1434 | return __user_walk_fd(AT_FDCWD, name, flags, nd); | 1345 | char *s = getname(path); |
| 1346 | int error; | ||
| 1347 | |||
| 1348 | if (IS_ERR(s)) | ||
| 1349 | return PTR_ERR(s); | ||
| 1350 | |||
| 1351 | error = do_path_lookup(dfd, s, LOOKUP_PARENT, nd); | ||
| 1352 | if (error) | ||
| 1353 | putname(s); | ||
| 1354 | else | ||
| 1355 | *name = s; | ||
| 1356 | |||
| 1357 | return error; | ||
| 1435 | } | 1358 | } |
| 1436 | 1359 | ||
| 1437 | /* | 1360 | /* |
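user_path_at() copies a user-space path, resolves it to a struct path and drops the kernel string again, while the new user_path_parent() (static to fs/namei.c) does a LOOKUP_PARENT walk and hands back both the nameidata and the kernel copy of the name so the caller can act on the final component. A hedged usage sketch built only from the prototypes above; both example_* functions are invented:

static long example_resolve(int dfd, const char __user *filename)
{
        struct path path;
        int error;

        error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path);
        if (error)
                return error;
        /* ... inspect path.dentry->d_inode ... */
        path_put(&path);
        return 0;
}

static long example_modify_parent(int dfd, const char __user *pathname)
{
        struct nameidata nd;
        char *name;
        int error;

        error = user_path_parent(dfd, pathname, &nd, &name);
        if (error)
                return error;
        /* ... look up nd.last under nd.path.dentry, do the operation ... */
        path_put(&nd.path);
        putname(name);
        return 0;
}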
| @@ -1478,7 +1401,7 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir) | |||
| 1478 | BUG_ON(victim->d_parent->d_inode != dir); | 1401 | BUG_ON(victim->d_parent->d_inode != dir); |
| 1479 | audit_inode_child(victim->d_name.name, victim, dir); | 1402 | audit_inode_child(victim->d_name.name, victim, dir); |
| 1480 | 1403 | ||
| 1481 | error = permission(dir,MAY_WRITE | MAY_EXEC, NULL); | 1404 | error = inode_permission(dir, MAY_WRITE | MAY_EXEC); |
| 1482 | if (error) | 1405 | if (error) |
| 1483 | return error; | 1406 | return error; |
| 1484 | if (IS_APPEND(dir)) | 1407 | if (IS_APPEND(dir)) |
| @@ -1508,14 +1431,13 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir) | |||
| 1508 | * 3. We should have write and exec permissions on dir | 1431 | * 3. We should have write and exec permissions on dir |
| 1509 | * 4. We can't do it if dir is immutable (done in permission()) | 1432 | * 4. We can't do it if dir is immutable (done in permission()) |
| 1510 | */ | 1433 | */ |
| 1511 | static inline int may_create(struct inode *dir, struct dentry *child, | 1434 | static inline int may_create(struct inode *dir, struct dentry *child) |
| 1512 | struct nameidata *nd) | ||
| 1513 | { | 1435 | { |
| 1514 | if (child->d_inode) | 1436 | if (child->d_inode) |
| 1515 | return -EEXIST; | 1437 | return -EEXIST; |
| 1516 | if (IS_DEADDIR(dir)) | 1438 | if (IS_DEADDIR(dir)) |
| 1517 | return -ENOENT; | 1439 | return -ENOENT; |
| 1518 | return permission(dir,MAY_WRITE | MAY_EXEC, nd); | 1440 | return inode_permission(dir, MAY_WRITE | MAY_EXEC); |
| 1519 | } | 1441 | } |
| 1520 | 1442 | ||
| 1521 | /* | 1443 | /* |
| @@ -1581,7 +1503,7 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) | |||
| 1581 | int vfs_create(struct inode *dir, struct dentry *dentry, int mode, | 1503 | int vfs_create(struct inode *dir, struct dentry *dentry, int mode, |
| 1582 | struct nameidata *nd) | 1504 | struct nameidata *nd) |
| 1583 | { | 1505 | { |
| 1584 | int error = may_create(dir, dentry, nd); | 1506 | int error = may_create(dir, dentry); |
| 1585 | 1507 | ||
| 1586 | if (error) | 1508 | if (error) |
| 1587 | return error; | 1509 | return error; |
| @@ -1755,7 +1677,7 @@ struct file *do_filp_open(int dfd, const char *pathname, | |||
| 1755 | int will_write; | 1677 | int will_write; |
| 1756 | int flag = open_to_namei_flags(open_flag); | 1678 | int flag = open_to_namei_flags(open_flag); |
| 1757 | 1679 | ||
| 1758 | acc_mode = ACC_MODE(flag); | 1680 | acc_mode = MAY_OPEN | ACC_MODE(flag); |
| 1759 | 1681 | ||
| 1760 | /* O_TRUNC implies we need access checks for write permissions */ | 1682 | /* O_TRUNC implies we need access checks for write permissions */ |
| 1761 | if (flag & O_TRUNC) | 1683 | if (flag & O_TRUNC) |
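do_filp_open() now ORs MAY_OPEN into acc_mode, so permission hooks can tell a real open(2) from other access checks without peeking at a nameidata (the NFS hunk later in this diff keys its atomic-open shortcut off MAY_OPEN for exactly that reason). Restated as a small sketch; example_acc_mode is illustrative only:

static int example_acc_mode(int open_flag)
{
        int flag = open_to_namei_flags(open_flag);
        int acc_mode = MAY_OPEN | ACC_MODE(flag);

        /* O_TRUNC still implies a write-permission check */
        if (flag & O_TRUNC)
                acc_mode |= MAY_WRITE;
        return acc_mode;
}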
| @@ -2025,7 +1947,7 @@ EXPORT_SYMBOL_GPL(lookup_create); | |||
| 2025 | 1947 | ||
| 2026 | int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) | 1948 | int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) |
| 2027 | { | 1949 | { |
| 2028 | int error = may_create(dir, dentry, NULL); | 1950 | int error = may_create(dir, dentry); |
| 2029 | 1951 | ||
| 2030 | if (error) | 1952 | if (error) |
| 2031 | return error; | 1953 | return error; |
| @@ -2071,20 +1993,18 @@ static int may_mknod(mode_t mode) | |||
| 2071 | asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode, | 1993 | asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode, |
| 2072 | unsigned dev) | 1994 | unsigned dev) |
| 2073 | { | 1995 | { |
| 2074 | int error = 0; | 1996 | int error; |
| 2075 | char * tmp; | 1997 | char *tmp; |
| 2076 | struct dentry * dentry; | 1998 | struct dentry *dentry; |
| 2077 | struct nameidata nd; | 1999 | struct nameidata nd; |
| 2078 | 2000 | ||
| 2079 | if (S_ISDIR(mode)) | 2001 | if (S_ISDIR(mode)) |
| 2080 | return -EPERM; | 2002 | return -EPERM; |
| 2081 | tmp = getname(filename); | ||
| 2082 | if (IS_ERR(tmp)) | ||
| 2083 | return PTR_ERR(tmp); | ||
| 2084 | 2003 | ||
| 2085 | error = do_path_lookup(dfd, tmp, LOOKUP_PARENT, &nd); | 2004 | error = user_path_parent(dfd, filename, &nd, &tmp); |
| 2086 | if (error) | 2005 | if (error) |
| 2087 | goto out; | 2006 | return error; |
| 2007 | |||
| 2088 | dentry = lookup_create(&nd, 0); | 2008 | dentry = lookup_create(&nd, 0); |
| 2089 | if (IS_ERR(dentry)) { | 2009 | if (IS_ERR(dentry)) { |
| 2090 | error = PTR_ERR(dentry); | 2010 | error = PTR_ERR(dentry); |
| @@ -2116,7 +2036,6 @@ out_dput: | |||
| 2116 | out_unlock: | 2036 | out_unlock: |
| 2117 | mutex_unlock(&nd.path.dentry->d_inode->i_mutex); | 2037 | mutex_unlock(&nd.path.dentry->d_inode->i_mutex); |
| 2118 | path_put(&nd.path); | 2038 | path_put(&nd.path); |
| 2119 | out: | ||
| 2120 | putname(tmp); | 2039 | putname(tmp); |
| 2121 | 2040 | ||
| 2122 | return error; | 2041 | return error; |
| @@ -2129,7 +2048,7 @@ asmlinkage long sys_mknod(const char __user *filename, int mode, unsigned dev) | |||
| 2129 | 2048 | ||
| 2130 | int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) | 2049 | int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) |
| 2131 | { | 2050 | { |
| 2132 | int error = may_create(dir, dentry, NULL); | 2051 | int error = may_create(dir, dentry); |
| 2133 | 2052 | ||
| 2134 | if (error) | 2053 | if (error) |
| 2135 | return error; | 2054 | return error; |
| @@ -2156,14 +2075,10 @@ asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode) | |||
| 2156 | struct dentry *dentry; | 2075 | struct dentry *dentry; |
| 2157 | struct nameidata nd; | 2076 | struct nameidata nd; |
| 2158 | 2077 | ||
| 2159 | tmp = getname(pathname); | 2078 | error = user_path_parent(dfd, pathname, &nd, &tmp); |
| 2160 | error = PTR_ERR(tmp); | 2079 | if (error) |
| 2161 | if (IS_ERR(tmp)) | ||
| 2162 | goto out_err; | 2080 | goto out_err; |
| 2163 | 2081 | ||
| 2164 | error = do_path_lookup(dfd, tmp, LOOKUP_PARENT, &nd); | ||
| 2165 | if (error) | ||
| 2166 | goto out; | ||
| 2167 | dentry = lookup_create(&nd, 1); | 2082 | dentry = lookup_create(&nd, 1); |
| 2168 | error = PTR_ERR(dentry); | 2083 | error = PTR_ERR(dentry); |
| 2169 | if (IS_ERR(dentry)) | 2084 | if (IS_ERR(dentry)) |
| @@ -2181,7 +2096,6 @@ out_dput: | |||
| 2181 | out_unlock: | 2096 | out_unlock: |
| 2182 | mutex_unlock(&nd.path.dentry->d_inode->i_mutex); | 2097 | mutex_unlock(&nd.path.dentry->d_inode->i_mutex); |
| 2183 | path_put(&nd.path); | 2098 | path_put(&nd.path); |
| 2184 | out: | ||
| 2185 | putname(tmp); | 2099 | putname(tmp); |
| 2186 | out_err: | 2100 | out_err: |
| 2187 | return error; | 2101 | return error; |
| @@ -2259,13 +2173,9 @@ static long do_rmdir(int dfd, const char __user *pathname) | |||
| 2259 | struct dentry *dentry; | 2173 | struct dentry *dentry; |
| 2260 | struct nameidata nd; | 2174 | struct nameidata nd; |
| 2261 | 2175 | ||
| 2262 | name = getname(pathname); | 2176 | error = user_path_parent(dfd, pathname, &nd, &name); |
| 2263 | if(IS_ERR(name)) | ||
| 2264 | return PTR_ERR(name); | ||
| 2265 | |||
| 2266 | error = do_path_lookup(dfd, name, LOOKUP_PARENT, &nd); | ||
| 2267 | if (error) | 2177 | if (error) |
| 2268 | goto exit; | 2178 | return error; |
| 2269 | 2179 | ||
| 2270 | switch(nd.last_type) { | 2180 | switch(nd.last_type) { |
| 2271 | case LAST_DOTDOT: | 2181 | case LAST_DOTDOT: |
| @@ -2294,7 +2204,6 @@ exit2: | |||
| 2294 | mutex_unlock(&nd.path.dentry->d_inode->i_mutex); | 2204 | mutex_unlock(&nd.path.dentry->d_inode->i_mutex); |
| 2295 | exit1: | 2205 | exit1: |
| 2296 | path_put(&nd.path); | 2206 | path_put(&nd.path); |
| 2297 | exit: | ||
| 2298 | putname(name); | 2207 | putname(name); |
| 2299 | return error; | 2208 | return error; |
| 2300 | } | 2209 | } |
| @@ -2343,19 +2252,16 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry) | |||
| 2343 | */ | 2252 | */ |
| 2344 | static long do_unlinkat(int dfd, const char __user *pathname) | 2253 | static long do_unlinkat(int dfd, const char __user *pathname) |
| 2345 | { | 2254 | { |
| 2346 | int error = 0; | 2255 | int error; |
| 2347 | char * name; | 2256 | char *name; |
| 2348 | struct dentry *dentry; | 2257 | struct dentry *dentry; |
| 2349 | struct nameidata nd; | 2258 | struct nameidata nd; |
| 2350 | struct inode *inode = NULL; | 2259 | struct inode *inode = NULL; |
| 2351 | 2260 | ||
| 2352 | name = getname(pathname); | 2261 | error = user_path_parent(dfd, pathname, &nd, &name); |
| 2353 | if(IS_ERR(name)) | ||
| 2354 | return PTR_ERR(name); | ||
| 2355 | |||
| 2356 | error = do_path_lookup(dfd, name, LOOKUP_PARENT, &nd); | ||
| 2357 | if (error) | 2262 | if (error) |
| 2358 | goto exit; | 2263 | return error; |
| 2264 | |||
| 2359 | error = -EISDIR; | 2265 | error = -EISDIR; |
| 2360 | if (nd.last_type != LAST_NORM) | 2266 | if (nd.last_type != LAST_NORM) |
| 2361 | goto exit1; | 2267 | goto exit1; |
| @@ -2382,7 +2288,6 @@ static long do_unlinkat(int dfd, const char __user *pathname) | |||
| 2382 | iput(inode); /* truncate the inode here */ | 2288 | iput(inode); /* truncate the inode here */ |
| 2383 | exit1: | 2289 | exit1: |
| 2384 | path_put(&nd.path); | 2290 | path_put(&nd.path); |
| 2385 | exit: | ||
| 2386 | putname(name); | 2291 | putname(name); |
| 2387 | return error; | 2292 | return error; |
| 2388 | 2293 | ||
| @@ -2408,9 +2313,9 @@ asmlinkage long sys_unlink(const char __user *pathname) | |||
| 2408 | return do_unlinkat(AT_FDCWD, pathname); | 2313 | return do_unlinkat(AT_FDCWD, pathname); |
| 2409 | } | 2314 | } |
| 2410 | 2315 | ||
| 2411 | int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, int mode) | 2316 | int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) |
| 2412 | { | 2317 | { |
| 2413 | int error = may_create(dir, dentry, NULL); | 2318 | int error = may_create(dir, dentry); |
| 2414 | 2319 | ||
| 2415 | if (error) | 2320 | if (error) |
| 2416 | return error; | 2321 | return error; |
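vfs_symlink() loses its mode argument; judging by the sys_symlinkat() hunk below, callers were passing S_IALLUGO anyway, so the parameter carried no information. An updated caller simply drops it — example_make_symlink is hypothetical:

static int example_make_symlink(struct inode *dir, struct dentry *dentry,
                                const char *target)
{
        /* before: vfs_symlink(dir, dentry, target, S_IALLUGO) */
        return vfs_symlink(dir, dentry, target);
}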
| @@ -2432,23 +2337,20 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, i | |||
| 2432 | asmlinkage long sys_symlinkat(const char __user *oldname, | 2337 | asmlinkage long sys_symlinkat(const char __user *oldname, |
| 2433 | int newdfd, const char __user *newname) | 2338 | int newdfd, const char __user *newname) |
| 2434 | { | 2339 | { |
| 2435 | int error = 0; | 2340 | int error; |
| 2436 | char * from; | 2341 | char *from; |
| 2437 | char * to; | 2342 | char *to; |
| 2438 | struct dentry *dentry; | 2343 | struct dentry *dentry; |
| 2439 | struct nameidata nd; | 2344 | struct nameidata nd; |
| 2440 | 2345 | ||
| 2441 | from = getname(oldname); | 2346 | from = getname(oldname); |
| 2442 | if(IS_ERR(from)) | 2347 | if (IS_ERR(from)) |
| 2443 | return PTR_ERR(from); | 2348 | return PTR_ERR(from); |
| 2444 | to = getname(newname); | ||
| 2445 | error = PTR_ERR(to); | ||
| 2446 | if (IS_ERR(to)) | ||
| 2447 | goto out_putname; | ||
| 2448 | 2349 | ||
| 2449 | error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd); | 2350 | error = user_path_parent(newdfd, newname, &nd, &to); |
| 2450 | if (error) | 2351 | if (error) |
| 2451 | goto out; | 2352 | goto out_putname; |
| 2353 | |||
| 2452 | dentry = lookup_create(&nd, 0); | 2354 | dentry = lookup_create(&nd, 0); |
| 2453 | error = PTR_ERR(dentry); | 2355 | error = PTR_ERR(dentry); |
| 2454 | if (IS_ERR(dentry)) | 2356 | if (IS_ERR(dentry)) |
| @@ -2457,14 +2359,13 @@ asmlinkage long sys_symlinkat(const char __user *oldname, | |||
| 2457 | error = mnt_want_write(nd.path.mnt); | 2359 | error = mnt_want_write(nd.path.mnt); |
| 2458 | if (error) | 2360 | if (error) |
| 2459 | goto out_dput; | 2361 | goto out_dput; |
| 2460 | error = vfs_symlink(nd.path.dentry->d_inode, dentry, from, S_IALLUGO); | 2362 | error = vfs_symlink(nd.path.dentry->d_inode, dentry, from); |
| 2461 | mnt_drop_write(nd.path.mnt); | 2363 | mnt_drop_write(nd.path.mnt); |
| 2462 | out_dput: | 2364 | out_dput: |
| 2463 | dput(dentry); | 2365 | dput(dentry); |
| 2464 | out_unlock: | 2366 | out_unlock: |
| 2465 | mutex_unlock(&nd.path.dentry->d_inode->i_mutex); | 2367 | mutex_unlock(&nd.path.dentry->d_inode->i_mutex); |
| 2466 | path_put(&nd.path); | 2368 | path_put(&nd.path); |
| 2467 | out: | ||
| 2468 | putname(to); | 2369 | putname(to); |
| 2469 | out_putname: | 2370 | out_putname: |
| 2470 | putname(from); | 2371 | putname(from); |
| @@ -2484,7 +2385,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de | |||
| 2484 | if (!inode) | 2385 | if (!inode) |
| 2485 | return -ENOENT; | 2386 | return -ENOENT; |
| 2486 | 2387 | ||
| 2487 | error = may_create(dir, new_dentry, NULL); | 2388 | error = may_create(dir, new_dentry); |
| 2488 | if (error) | 2389 | if (error) |
| 2489 | return error; | 2390 | return error; |
| 2490 | 2391 | ||
| @@ -2498,19 +2399,19 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de | |||
| 2498 | return -EPERM; | 2399 | return -EPERM; |
| 2499 | if (!dir->i_op || !dir->i_op->link) | 2400 | if (!dir->i_op || !dir->i_op->link) |
| 2500 | return -EPERM; | 2401 | return -EPERM; |
| 2501 | if (S_ISDIR(old_dentry->d_inode->i_mode)) | 2402 | if (S_ISDIR(inode->i_mode)) |
| 2502 | return -EPERM; | 2403 | return -EPERM; |
| 2503 | 2404 | ||
| 2504 | error = security_inode_link(old_dentry, dir, new_dentry); | 2405 | error = security_inode_link(old_dentry, dir, new_dentry); |
| 2505 | if (error) | 2406 | if (error) |
| 2506 | return error; | 2407 | return error; |
| 2507 | 2408 | ||
| 2508 | mutex_lock(&old_dentry->d_inode->i_mutex); | 2409 | mutex_lock(&inode->i_mutex); |
| 2509 | DQUOT_INIT(dir); | 2410 | DQUOT_INIT(dir); |
| 2510 | error = dir->i_op->link(old_dentry, dir, new_dentry); | 2411 | error = dir->i_op->link(old_dentry, dir, new_dentry); |
| 2511 | mutex_unlock(&old_dentry->d_inode->i_mutex); | 2412 | mutex_unlock(&inode->i_mutex); |
| 2512 | if (!error) | 2413 | if (!error) |
| 2513 | fsnotify_link(dir, old_dentry->d_inode, new_dentry); | 2414 | fsnotify_link(dir, inode, new_dentry); |
| 2514 | return error; | 2415 | return error; |
| 2515 | } | 2416 | } |
| 2516 | 2417 | ||
| @@ -2528,27 +2429,25 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname, | |||
| 2528 | int flags) | 2429 | int flags) |
| 2529 | { | 2430 | { |
| 2530 | struct dentry *new_dentry; | 2431 | struct dentry *new_dentry; |
| 2531 | struct nameidata nd, old_nd; | 2432 | struct nameidata nd; |
| 2433 | struct path old_path; | ||
| 2532 | int error; | 2434 | int error; |
| 2533 | char * to; | 2435 | char *to; |
| 2534 | 2436 | ||
| 2535 | if ((flags & ~AT_SYMLINK_FOLLOW) != 0) | 2437 | if ((flags & ~AT_SYMLINK_FOLLOW) != 0) |
| 2536 | return -EINVAL; | 2438 | return -EINVAL; |
| 2537 | 2439 | ||
| 2538 | to = getname(newname); | 2440 | error = user_path_at(olddfd, oldname, |
| 2539 | if (IS_ERR(to)) | 2441 | flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0, |
| 2540 | return PTR_ERR(to); | 2442 | &old_path); |
| 2541 | |||
| 2542 | error = __user_walk_fd(olddfd, oldname, | ||
| 2543 | flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0, | ||
| 2544 | &old_nd); | ||
| 2545 | if (error) | 2443 | if (error) |
| 2546 | goto exit; | 2444 | return error; |
| 2547 | error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd); | 2445 | |
| 2446 | error = user_path_parent(newdfd, newname, &nd, &to); | ||
| 2548 | if (error) | 2447 | if (error) |
| 2549 | goto out; | 2448 | goto out; |
| 2550 | error = -EXDEV; | 2449 | error = -EXDEV; |
| 2551 | if (old_nd.path.mnt != nd.path.mnt) | 2450 | if (old_path.mnt != nd.path.mnt) |
| 2552 | goto out_release; | 2451 | goto out_release; |
| 2553 | new_dentry = lookup_create(&nd, 0); | 2452 | new_dentry = lookup_create(&nd, 0); |
| 2554 | error = PTR_ERR(new_dentry); | 2453 | error = PTR_ERR(new_dentry); |
| @@ -2557,7 +2456,7 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname, | |||
| 2557 | error = mnt_want_write(nd.path.mnt); | 2456 | error = mnt_want_write(nd.path.mnt); |
| 2558 | if (error) | 2457 | if (error) |
| 2559 | goto out_dput; | 2458 | goto out_dput; |
| 2560 | error = vfs_link(old_nd.path.dentry, nd.path.dentry->d_inode, new_dentry); | 2459 | error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry); |
| 2561 | mnt_drop_write(nd.path.mnt); | 2460 | mnt_drop_write(nd.path.mnt); |
| 2562 | out_dput: | 2461 | out_dput: |
| 2563 | dput(new_dentry); | 2462 | dput(new_dentry); |
| @@ -2565,10 +2464,9 @@ out_unlock: | |||
| 2565 | mutex_unlock(&nd.path.dentry->d_inode->i_mutex); | 2464 | mutex_unlock(&nd.path.dentry->d_inode->i_mutex); |
| 2566 | out_release: | 2465 | out_release: |
| 2567 | path_put(&nd.path); | 2466 | path_put(&nd.path); |
| 2568 | out: | ||
| 2569 | path_put(&old_nd.path); | ||
| 2570 | exit: | ||
| 2571 | putname(to); | 2467 | putname(to); |
| 2468 | out: | ||
| 2469 | path_put(&old_path); | ||
| 2572 | 2470 | ||
| 2573 | return error; | 2471 | return error; |
| 2574 | } | 2472 | } |
| @@ -2621,7 +2519,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, | |||
| 2621 | * we'll need to flip '..'. | 2519 | * we'll need to flip '..'. |
| 2622 | */ | 2520 | */ |
| 2623 | if (new_dir != old_dir) { | 2521 | if (new_dir != old_dir) { |
| 2624 | error = permission(old_dentry->d_inode, MAY_WRITE, NULL); | 2522 | error = inode_permission(old_dentry->d_inode, MAY_WRITE); |
| 2625 | if (error) | 2523 | if (error) |
| 2626 | return error; | 2524 | return error; |
| 2627 | } | 2525 | } |
| @@ -2696,7 +2594,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
| 2696 | return error; | 2594 | return error; |
| 2697 | 2595 | ||
| 2698 | if (!new_dentry->d_inode) | 2596 | if (!new_dentry->d_inode) |
| 2699 | error = may_create(new_dir, new_dentry, NULL); | 2597 | error = may_create(new_dir, new_dentry); |
| 2700 | else | 2598 | else |
| 2701 | error = may_delete(new_dir, new_dentry, is_dir); | 2599 | error = may_delete(new_dir, new_dentry, is_dir); |
| 2702 | if (error) | 2600 | if (error) |
| @@ -2724,20 +2622,22 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
| 2724 | return error; | 2622 | return error; |
| 2725 | } | 2623 | } |
| 2726 | 2624 | ||
| 2727 | static int do_rename(int olddfd, const char *oldname, | 2625 | asmlinkage long sys_renameat(int olddfd, const char __user *oldname, |
| 2728 | int newdfd, const char *newname) | 2626 | int newdfd, const char __user *newname) |
| 2729 | { | 2627 | { |
| 2730 | int error = 0; | 2628 | struct dentry *old_dir, *new_dir; |
| 2731 | struct dentry * old_dir, * new_dir; | 2629 | struct dentry *old_dentry, *new_dentry; |
| 2732 | struct dentry * old_dentry, *new_dentry; | 2630 | struct dentry *trap; |
| 2733 | struct dentry * trap; | ||
| 2734 | struct nameidata oldnd, newnd; | 2631 | struct nameidata oldnd, newnd; |
| 2632 | char *from; | ||
| 2633 | char *to; | ||
| 2634 | int error; | ||
| 2735 | 2635 | ||
| 2736 | error = do_path_lookup(olddfd, oldname, LOOKUP_PARENT, &oldnd); | 2636 | error = user_path_parent(olddfd, oldname, &oldnd, &from); |
| 2737 | if (error) | 2637 | if (error) |
| 2738 | goto exit; | 2638 | goto exit; |
| 2739 | 2639 | ||
| 2740 | error = do_path_lookup(newdfd, newname, LOOKUP_PARENT, &newnd); | 2640 | error = user_path_parent(newdfd, newname, &newnd, &to); |
| 2741 | if (error) | 2641 | if (error) |
| 2742 | goto exit1; | 2642 | goto exit1; |
| 2743 | 2643 | ||
| @@ -2799,29 +2699,11 @@ exit3: | |||
| 2799 | unlock_rename(new_dir, old_dir); | 2699 | unlock_rename(new_dir, old_dir); |
| 2800 | exit2: | 2700 | exit2: |
| 2801 | path_put(&newnd.path); | 2701 | path_put(&newnd.path); |
| 2702 | putname(to); | ||
| 2802 | exit1: | 2703 | exit1: |
| 2803 | path_put(&oldnd.path); | 2704 | path_put(&oldnd.path); |
| 2804 | exit: | ||
| 2805 | return error; | ||
| 2806 | } | ||
| 2807 | |||
| 2808 | asmlinkage long sys_renameat(int olddfd, const char __user *oldname, | ||
| 2809 | int newdfd, const char __user *newname) | ||
| 2810 | { | ||
| 2811 | int error; | ||
| 2812 | char * from; | ||
| 2813 | char * to; | ||
| 2814 | |||
| 2815 | from = getname(oldname); | ||
| 2816 | if(IS_ERR(from)) | ||
| 2817 | return PTR_ERR(from); | ||
| 2818 | to = getname(newname); | ||
| 2819 | error = PTR_ERR(to); | ||
| 2820 | if (!IS_ERR(to)) { | ||
| 2821 | error = do_rename(olddfd, from, newdfd, to); | ||
| 2822 | putname(to); | ||
| 2823 | } | ||
| 2824 | putname(from); | 2705 | putname(from); |
| 2706 | exit: | ||
| 2825 | return error; | 2707 | return error; |
| 2826 | } | 2708 | } |
| 2827 | 2709 | ||
| @@ -2959,8 +2841,7 @@ const struct inode_operations page_symlink_inode_operations = { | |||
| 2959 | .put_link = page_put_link, | 2841 | .put_link = page_put_link, |
| 2960 | }; | 2842 | }; |
| 2961 | 2843 | ||
| 2962 | EXPORT_SYMBOL(__user_walk); | 2844 | EXPORT_SYMBOL(user_path_at); |
| 2963 | EXPORT_SYMBOL(__user_walk_fd); | ||
| 2964 | EXPORT_SYMBOL(follow_down); | 2845 | EXPORT_SYMBOL(follow_down); |
| 2965 | EXPORT_SYMBOL(follow_up); | 2846 | EXPORT_SYMBOL(follow_up); |
| 2966 | EXPORT_SYMBOL(get_write_access); /* binfmt_aout */ | 2847 | EXPORT_SYMBOL(get_write_access); /* binfmt_aout */ |
| @@ -2975,7 +2856,7 @@ EXPORT_SYMBOL(page_symlink); | |||
| 2975 | EXPORT_SYMBOL(page_symlink_inode_operations); | 2856 | EXPORT_SYMBOL(page_symlink_inode_operations); |
| 2976 | EXPORT_SYMBOL(path_lookup); | 2857 | EXPORT_SYMBOL(path_lookup); |
| 2977 | EXPORT_SYMBOL(vfs_path_lookup); | 2858 | EXPORT_SYMBOL(vfs_path_lookup); |
| 2978 | EXPORT_SYMBOL(permission); | 2859 | EXPORT_SYMBOL(inode_permission); |
| 2979 | EXPORT_SYMBOL(vfs_permission); | 2860 | EXPORT_SYMBOL(vfs_permission); |
| 2980 | EXPORT_SYMBOL(file_permission); | 2861 | EXPORT_SYMBOL(file_permission); |
| 2981 | EXPORT_SYMBOL(unlock_rename); | 2862 | EXPORT_SYMBOL(unlock_rename); |
diff --git a/fs/namespace.c b/fs/namespace.c index 4f6f7635b59c..6e283c93b50d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c | |||
| @@ -112,9 +112,13 @@ struct vfsmount *alloc_vfsmnt(const char *name) | |||
| 112 | int err; | 112 | int err; |
| 113 | 113 | ||
| 114 | err = mnt_alloc_id(mnt); | 114 | err = mnt_alloc_id(mnt); |
| 115 | if (err) { | 115 | if (err) |
| 116 | kmem_cache_free(mnt_cache, mnt); | 116 | goto out_free_cache; |
| 117 | return NULL; | 117 | |
| 118 | if (name) { | ||
| 119 | mnt->mnt_devname = kstrdup(name, GFP_KERNEL); | ||
| 120 | if (!mnt->mnt_devname) | ||
| 121 | goto out_free_id; | ||
| 118 | } | 122 | } |
| 119 | 123 | ||
| 120 | atomic_set(&mnt->mnt_count, 1); | 124 | atomic_set(&mnt->mnt_count, 1); |
| @@ -127,16 +131,14 @@ struct vfsmount *alloc_vfsmnt(const char *name) | |||
| 127 | INIT_LIST_HEAD(&mnt->mnt_slave_list); | 131 | INIT_LIST_HEAD(&mnt->mnt_slave_list); |
| 128 | INIT_LIST_HEAD(&mnt->mnt_slave); | 132 | INIT_LIST_HEAD(&mnt->mnt_slave); |
| 129 | atomic_set(&mnt->__mnt_writers, 0); | 133 | atomic_set(&mnt->__mnt_writers, 0); |
| 130 | if (name) { | ||
| 131 | int size = strlen(name) + 1; | ||
| 132 | char *newname = kmalloc(size, GFP_KERNEL); | ||
| 133 | if (newname) { | ||
| 134 | memcpy(newname, name, size); | ||
| 135 | mnt->mnt_devname = newname; | ||
| 136 | } | ||
| 137 | } | ||
| 138 | } | 134 | } |
| 139 | return mnt; | 135 | return mnt; |
| 136 | |||
| 137 | out_free_id: | ||
| 138 | mnt_free_id(mnt); | ||
| 139 | out_free_cache: | ||
| 140 | kmem_cache_free(mnt_cache, mnt); | ||
| 141 | return NULL; | ||
| 140 | } | 142 | } |
| 141 | 143 | ||
| 142 | /* | 144 | /* |
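alloc_vfsmnt() is reworked to use kstrdup() and a conventional goto unwind: a failed mnt_alloc_id() frees only the cache object, while a failed kstrdup() also releases the freshly allocated mount id. A stripped-down sketch of that error-unwinding shape; field and helper names follow the hunk above, the allocation call is a placeholder, and the function itself is illustrative:

struct vfsmount *example_alloc_vfsmnt(const char *name)
{
        struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);

        if (!mnt)
                return NULL;
        if (mnt_alloc_id(mnt))
                goto out_free_cache;
        if (name) {
                mnt->mnt_devname = kstrdup(name, GFP_KERNEL);
                if (!mnt->mnt_devname)
                        goto out_free_id;
        }
        /* ... remaining field initialisation ... */
        return mnt;

out_free_id:
        mnt_free_id(mnt);
out_free_cache:
        kmem_cache_free(mnt_cache, mnt);
        return NULL;
}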
| @@ -309,10 +311,9 @@ static void handle_write_count_underflow(struct vfsmount *mnt) | |||
| 309 | */ | 311 | */ |
| 310 | if ((atomic_read(&mnt->__mnt_writers) < 0) && | 312 | if ((atomic_read(&mnt->__mnt_writers) < 0) && |
| 311 | !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) { | 313 | !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) { |
| 312 | printk(KERN_DEBUG "leak detected on mount(%p) writers " | 314 | WARN(1, KERN_DEBUG "leak detected on mount(%p) writers " |
| 313 | "count: %d\n", | 315 | "count: %d\n", |
| 314 | mnt, atomic_read(&mnt->__mnt_writers)); | 316 | mnt, atomic_read(&mnt->__mnt_writers)); |
| 315 | WARN_ON(1); | ||
| 316 | /* use the flag to keep the dmesg spam down */ | 317 | /* use the flag to keep the dmesg spam down */ |
| 317 | mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT; | 318 | mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT; |
| 318 | } | 319 | } |
| @@ -1129,27 +1130,27 @@ static int do_umount(struct vfsmount *mnt, int flags) | |||
| 1129 | 1130 | ||
| 1130 | asmlinkage long sys_umount(char __user * name, int flags) | 1131 | asmlinkage long sys_umount(char __user * name, int flags) |
| 1131 | { | 1132 | { |
| 1132 | struct nameidata nd; | 1133 | struct path path; |
| 1133 | int retval; | 1134 | int retval; |
| 1134 | 1135 | ||
| 1135 | retval = __user_walk(name, LOOKUP_FOLLOW, &nd); | 1136 | retval = user_path(name, &path); |
| 1136 | if (retval) | 1137 | if (retval) |
| 1137 | goto out; | 1138 | goto out; |
| 1138 | retval = -EINVAL; | 1139 | retval = -EINVAL; |
| 1139 | if (nd.path.dentry != nd.path.mnt->mnt_root) | 1140 | if (path.dentry != path.mnt->mnt_root) |
| 1140 | goto dput_and_out; | 1141 | goto dput_and_out; |
| 1141 | if (!check_mnt(nd.path.mnt)) | 1142 | if (!check_mnt(path.mnt)) |
| 1142 | goto dput_and_out; | 1143 | goto dput_and_out; |
| 1143 | 1144 | ||
| 1144 | retval = -EPERM; | 1145 | retval = -EPERM; |
| 1145 | if (!capable(CAP_SYS_ADMIN)) | 1146 | if (!capable(CAP_SYS_ADMIN)) |
| 1146 | goto dput_and_out; | 1147 | goto dput_and_out; |
| 1147 | 1148 | ||
| 1148 | retval = do_umount(nd.path.mnt, flags); | 1149 | retval = do_umount(path.mnt, flags); |
| 1149 | dput_and_out: | 1150 | dput_and_out: |
| 1150 | /* we mustn't call path_put() as that would clear mnt_expiry_mark */ | 1151 | /* we mustn't call path_put() as that would clear mnt_expiry_mark */ |
| 1151 | dput(nd.path.dentry); | 1152 | dput(path.dentry); |
| 1152 | mntput_no_expire(nd.path.mnt); | 1153 | mntput_no_expire(path.mnt); |
| 1153 | out: | 1154 | out: |
| 1154 | return retval; | 1155 | return retval; |
| 1155 | } | 1156 | } |
| @@ -1666,31 +1667,31 @@ static noinline int do_new_mount(struct nameidata *nd, char *type, int flags, | |||
| 1666 | if (IS_ERR(mnt)) | 1667 | if (IS_ERR(mnt)) |
| 1667 | return PTR_ERR(mnt); | 1668 | return PTR_ERR(mnt); |
| 1668 | 1669 | ||
| 1669 | return do_add_mount(mnt, nd, mnt_flags, NULL); | 1670 | return do_add_mount(mnt, &nd->path, mnt_flags, NULL); |
| 1670 | } | 1671 | } |
| 1671 | 1672 | ||
| 1672 | /* | 1673 | /* |
| 1673 | * add a mount into a namespace's mount tree | 1674 | * add a mount into a namespace's mount tree |
| 1674 | * - provide the option of adding the new mount to an expiration list | 1675 | * - provide the option of adding the new mount to an expiration list |
| 1675 | */ | 1676 | */ |
| 1676 | int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd, | 1677 | int do_add_mount(struct vfsmount *newmnt, struct path *path, |
| 1677 | int mnt_flags, struct list_head *fslist) | 1678 | int mnt_flags, struct list_head *fslist) |
| 1678 | { | 1679 | { |
| 1679 | int err; | 1680 | int err; |
| 1680 | 1681 | ||
| 1681 | down_write(&namespace_sem); | 1682 | down_write(&namespace_sem); |
| 1682 | /* Something was mounted here while we slept */ | 1683 | /* Something was mounted here while we slept */ |
| 1683 | while (d_mountpoint(nd->path.dentry) && | 1684 | while (d_mountpoint(path->dentry) && |
| 1684 | follow_down(&nd->path.mnt, &nd->path.dentry)) | 1685 | follow_down(&path->mnt, &path->dentry)) |
| 1685 | ; | 1686 | ; |
| 1686 | err = -EINVAL; | 1687 | err = -EINVAL; |
| 1687 | if (!check_mnt(nd->path.mnt)) | 1688 | if (!check_mnt(path->mnt)) |
| 1688 | goto unlock; | 1689 | goto unlock; |
| 1689 | 1690 | ||
| 1690 | /* Refuse the same filesystem on the same mount point */ | 1691 | /* Refuse the same filesystem on the same mount point */ |
| 1691 | err = -EBUSY; | 1692 | err = -EBUSY; |
| 1692 | if (nd->path.mnt->mnt_sb == newmnt->mnt_sb && | 1693 | if (path->mnt->mnt_sb == newmnt->mnt_sb && |
| 1693 | nd->path.mnt->mnt_root == nd->path.dentry) | 1694 | path->mnt->mnt_root == path->dentry) |
| 1694 | goto unlock; | 1695 | goto unlock; |
| 1695 | 1696 | ||
| 1696 | err = -EINVAL; | 1697 | err = -EINVAL; |
| @@ -1698,7 +1699,7 @@ int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd, | |||
| 1698 | goto unlock; | 1699 | goto unlock; |
| 1699 | 1700 | ||
| 1700 | newmnt->mnt_flags = mnt_flags; | 1701 | newmnt->mnt_flags = mnt_flags; |
| 1701 | if ((err = graft_tree(newmnt, &nd->path))) | 1702 | if ((err = graft_tree(newmnt, path))) |
| 1702 | goto unlock; | 1703 | goto unlock; |
| 1703 | 1704 | ||
| 1704 | if (fslist) /* add to the specified expiration list */ | 1705 | if (fslist) /* add to the specified expiration list */ |
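do_add_mount() now takes a struct path * for the mountpoint instead of a full nameidata; do_new_mount() above and the NFS automount hunk further down simply pass &nd->path. Sketch of a caller after the change — the prototype is the one shown in this hunk, and example_automount is made up:

static int example_automount(struct vfsmount *newmnt, struct nameidata *nd,
                             struct list_head *expiry_list)
{
        /* before: do_add_mount(newmnt, nd, MNT_SHRINKABLE, expiry_list) */
        return do_add_mount(newmnt, &nd->path,
                            nd->path.mnt->mnt_flags | MNT_SHRINKABLE,
                            expiry_list);
}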
| @@ -1973,7 +1974,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, | |||
| 1973 | struct fs_struct *fs) | 1974 | struct fs_struct *fs) |
| 1974 | { | 1975 | { |
| 1975 | struct mnt_namespace *new_ns; | 1976 | struct mnt_namespace *new_ns; |
| 1976 | struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL; | 1977 | struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; |
| 1977 | struct vfsmount *p, *q; | 1978 | struct vfsmount *p, *q; |
| 1978 | 1979 | ||
| 1979 | new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); | 1980 | new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); |
| @@ -2016,10 +2017,6 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, | |||
| 2016 | pwdmnt = p; | 2017 | pwdmnt = p; |
| 2017 | fs->pwd.mnt = mntget(q); | 2018 | fs->pwd.mnt = mntget(q); |
| 2018 | } | 2019 | } |
| 2019 | if (p == fs->altroot.mnt) { | ||
| 2020 | altrootmnt = p; | ||
| 2021 | fs->altroot.mnt = mntget(q); | ||
| 2022 | } | ||
| 2023 | } | 2020 | } |
| 2024 | p = next_mnt(p, mnt_ns->root); | 2021 | p = next_mnt(p, mnt_ns->root); |
| 2025 | q = next_mnt(q, new_ns->root); | 2022 | q = next_mnt(q, new_ns->root); |
| @@ -2030,8 +2027,6 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, | |||
| 2030 | mntput(rootmnt); | 2027 | mntput(rootmnt); |
| 2031 | if (pwdmnt) | 2028 | if (pwdmnt) |
| 2032 | mntput(pwdmnt); | 2029 | mntput(pwdmnt); |
| 2033 | if (altrootmnt) | ||
| 2034 | mntput(altrootmnt); | ||
| 2035 | 2030 | ||
| 2036 | return new_ns; | 2031 | return new_ns; |
| 2037 | } | 2032 | } |
| @@ -2184,28 +2179,26 @@ asmlinkage long sys_pivot_root(const char __user * new_root, | |||
| 2184 | const char __user * put_old) | 2179 | const char __user * put_old) |
| 2185 | { | 2180 | { |
| 2186 | struct vfsmount *tmp; | 2181 | struct vfsmount *tmp; |
| 2187 | struct nameidata new_nd, old_nd; | 2182 | struct path new, old, parent_path, root_parent, root; |
| 2188 | struct path parent_path, root_parent, root; | ||
| 2189 | int error; | 2183 | int error; |
| 2190 | 2184 | ||
| 2191 | if (!capable(CAP_SYS_ADMIN)) | 2185 | if (!capable(CAP_SYS_ADMIN)) |
| 2192 | return -EPERM; | 2186 | return -EPERM; |
| 2193 | 2187 | ||
| 2194 | error = __user_walk(new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, | 2188 | error = user_path_dir(new_root, &new); |
| 2195 | &new_nd); | ||
| 2196 | if (error) | 2189 | if (error) |
| 2197 | goto out0; | 2190 | goto out0; |
| 2198 | error = -EINVAL; | 2191 | error = -EINVAL; |
| 2199 | if (!check_mnt(new_nd.path.mnt)) | 2192 | if (!check_mnt(new.mnt)) |
| 2200 | goto out1; | 2193 | goto out1; |
| 2201 | 2194 | ||
| 2202 | error = __user_walk(put_old, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old_nd); | 2195 | error = user_path_dir(put_old, &old); |
| 2203 | if (error) | 2196 | if (error) |
| 2204 | goto out1; | 2197 | goto out1; |
| 2205 | 2198 | ||
| 2206 | error = security_sb_pivotroot(&old_nd.path, &new_nd.path); | 2199 | error = security_sb_pivotroot(&old, &new); |
| 2207 | if (error) { | 2200 | if (error) { |
| 2208 | path_put(&old_nd.path); | 2201 | path_put(&old); |
| 2209 | goto out1; | 2202 | goto out1; |
| 2210 | } | 2203 | } |
| 2211 | 2204 | ||
| @@ -2214,69 +2207,69 @@ asmlinkage long sys_pivot_root(const char __user * new_root, | |||
| 2214 | path_get(¤t->fs->root); | 2207 | path_get(¤t->fs->root); |
| 2215 | read_unlock(¤t->fs->lock); | 2208 | read_unlock(¤t->fs->lock); |
| 2216 | down_write(&namespace_sem); | 2209 | down_write(&namespace_sem); |
| 2217 | mutex_lock(&old_nd.path.dentry->d_inode->i_mutex); | 2210 | mutex_lock(&old.dentry->d_inode->i_mutex); |
| 2218 | error = -EINVAL; | 2211 | error = -EINVAL; |
| 2219 | if (IS_MNT_SHARED(old_nd.path.mnt) || | 2212 | if (IS_MNT_SHARED(old.mnt) || |
| 2220 | IS_MNT_SHARED(new_nd.path.mnt->mnt_parent) || | 2213 | IS_MNT_SHARED(new.mnt->mnt_parent) || |
| 2221 | IS_MNT_SHARED(root.mnt->mnt_parent)) | 2214 | IS_MNT_SHARED(root.mnt->mnt_parent)) |
| 2222 | goto out2; | 2215 | goto out2; |
| 2223 | if (!check_mnt(root.mnt)) | 2216 | if (!check_mnt(root.mnt)) |
| 2224 | goto out2; | 2217 | goto out2; |
| 2225 | error = -ENOENT; | 2218 | error = -ENOENT; |
| 2226 | if (IS_DEADDIR(new_nd.path.dentry->d_inode)) | 2219 | if (IS_DEADDIR(new.dentry->d_inode)) |
| 2227 | goto out2; | 2220 | goto out2; |
| 2228 | if (d_unhashed(new_nd.path.dentry) && !IS_ROOT(new_nd.path.dentry)) | 2221 | if (d_unhashed(new.dentry) && !IS_ROOT(new.dentry)) |
| 2229 | goto out2; | 2222 | goto out2; |
| 2230 | if (d_unhashed(old_nd.path.dentry) && !IS_ROOT(old_nd.path.dentry)) | 2223 | if (d_unhashed(old.dentry) && !IS_ROOT(old.dentry)) |
| 2231 | goto out2; | 2224 | goto out2; |
| 2232 | error = -EBUSY; | 2225 | error = -EBUSY; |
| 2233 | if (new_nd.path.mnt == root.mnt || | 2226 | if (new.mnt == root.mnt || |
| 2234 | old_nd.path.mnt == root.mnt) | 2227 | old.mnt == root.mnt) |
| 2235 | goto out2; /* loop, on the same file system */ | 2228 | goto out2; /* loop, on the same file system */ |
| 2236 | error = -EINVAL; | 2229 | error = -EINVAL; |
| 2237 | if (root.mnt->mnt_root != root.dentry) | 2230 | if (root.mnt->mnt_root != root.dentry) |
| 2238 | goto out2; /* not a mountpoint */ | 2231 | goto out2; /* not a mountpoint */ |
| 2239 | if (root.mnt->mnt_parent == root.mnt) | 2232 | if (root.mnt->mnt_parent == root.mnt) |
| 2240 | goto out2; /* not attached */ | 2233 | goto out2; /* not attached */ |
| 2241 | if (new_nd.path.mnt->mnt_root != new_nd.path.dentry) | 2234 | if (new.mnt->mnt_root != new.dentry) |
| 2242 | goto out2; /* not a mountpoint */ | 2235 | goto out2; /* not a mountpoint */ |
| 2243 | if (new_nd.path.mnt->mnt_parent == new_nd.path.mnt) | 2236 | if (new.mnt->mnt_parent == new.mnt) |
| 2244 | goto out2; /* not attached */ | 2237 | goto out2; /* not attached */ |
| 2245 | /* make sure we can reach put_old from new_root */ | 2238 | /* make sure we can reach put_old from new_root */ |
| 2246 | tmp = old_nd.path.mnt; | 2239 | tmp = old.mnt; |
| 2247 | spin_lock(&vfsmount_lock); | 2240 | spin_lock(&vfsmount_lock); |
| 2248 | if (tmp != new_nd.path.mnt) { | 2241 | if (tmp != new.mnt) { |
| 2249 | for (;;) { | 2242 | for (;;) { |
| 2250 | if (tmp->mnt_parent == tmp) | 2243 | if (tmp->mnt_parent == tmp) |
| 2251 | goto out3; /* already mounted on put_old */ | 2244 | goto out3; /* already mounted on put_old */ |
| 2252 | if (tmp->mnt_parent == new_nd.path.mnt) | 2245 | if (tmp->mnt_parent == new.mnt) |
| 2253 | break; | 2246 | break; |
| 2254 | tmp = tmp->mnt_parent; | 2247 | tmp = tmp->mnt_parent; |
| 2255 | } | 2248 | } |
| 2256 | if (!is_subdir(tmp->mnt_mountpoint, new_nd.path.dentry)) | 2249 | if (!is_subdir(tmp->mnt_mountpoint, new.dentry)) |
| 2257 | goto out3; | 2250 | goto out3; |
| 2258 | } else if (!is_subdir(old_nd.path.dentry, new_nd.path.dentry)) | 2251 | } else if (!is_subdir(old.dentry, new.dentry)) |
| 2259 | goto out3; | 2252 | goto out3; |
| 2260 | detach_mnt(new_nd.path.mnt, &parent_path); | 2253 | detach_mnt(new.mnt, &parent_path); |
| 2261 | detach_mnt(root.mnt, &root_parent); | 2254 | detach_mnt(root.mnt, &root_parent); |
| 2262 | /* mount old root on put_old */ | 2255 | /* mount old root on put_old */ |
| 2263 | attach_mnt(root.mnt, &old_nd.path); | 2256 | attach_mnt(root.mnt, &old); |
| 2264 | /* mount new_root on / */ | 2257 | /* mount new_root on / */ |
| 2265 | attach_mnt(new_nd.path.mnt, &root_parent); | 2258 | attach_mnt(new.mnt, &root_parent); |
| 2266 | touch_mnt_namespace(current->nsproxy->mnt_ns); | 2259 | touch_mnt_namespace(current->nsproxy->mnt_ns); |
| 2267 | spin_unlock(&vfsmount_lock); | 2260 | spin_unlock(&vfsmount_lock); |
| 2268 | chroot_fs_refs(&root, &new_nd.path); | 2261 | chroot_fs_refs(&root, &new); |
| 2269 | security_sb_post_pivotroot(&root, &new_nd.path); | 2262 | security_sb_post_pivotroot(&root, &new); |
| 2270 | error = 0; | 2263 | error = 0; |
| 2271 | path_put(&root_parent); | 2264 | path_put(&root_parent); |
| 2272 | path_put(&parent_path); | 2265 | path_put(&parent_path); |
| 2273 | out2: | 2266 | out2: |
| 2274 | mutex_unlock(&old_nd.path.dentry->d_inode->i_mutex); | 2267 | mutex_unlock(&old.dentry->d_inode->i_mutex); |
| 2275 | up_write(&namespace_sem); | 2268 | up_write(&namespace_sem); |
| 2276 | path_put(&root); | 2269 | path_put(&root); |
| 2277 | path_put(&old_nd.path); | 2270 | path_put(&old); |
| 2278 | out1: | 2271 | out1: |
| 2279 | path_put(&new_nd.path); | 2272 | path_put(&new); |
| 2280 | out0: | 2273 | out0: |
| 2281 | return error; | 2274 | return error; |
| 2282 | out3: | 2275 | out3: |
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 011ef0b6d2d4..07e9715b8658 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c | |||
| @@ -266,7 +266,7 @@ leave_me:; | |||
| 266 | 266 | ||
| 267 | 267 | ||
| 268 | static int | 268 | static int |
| 269 | __ncp_lookup_validate(struct dentry * dentry, struct nameidata *nd) | 269 | __ncp_lookup_validate(struct dentry *dentry) |
| 270 | { | 270 | { |
| 271 | struct ncp_server *server; | 271 | struct ncp_server *server; |
| 272 | struct dentry *parent; | 272 | struct dentry *parent; |
| @@ -340,7 +340,7 @@ ncp_lookup_validate(struct dentry * dentry, struct nameidata *nd) | |||
| 340 | { | 340 | { |
| 341 | int res; | 341 | int res; |
| 342 | lock_kernel(); | 342 | lock_kernel(); |
| 343 | res = __ncp_lookup_validate(dentry, nd); | 343 | res = __ncp_lookup_validate(dentry); |
| 344 | unlock_kernel(); | 344 | unlock_kernel(); |
| 345 | return res; | 345 | return res; |
| 346 | } | 346 | } |
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 2e5ab1204dec..d642f0e5b365 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c | |||
| @@ -64,7 +64,7 @@ static void ncp_destroy_inode(struct inode *inode) | |||
| 64 | kmem_cache_free(ncp_inode_cachep, NCP_FINFO(inode)); | 64 | kmem_cache_free(ncp_inode_cachep, NCP_FINFO(inode)); |
| 65 | } | 65 | } |
| 66 | 66 | ||
| 67 | static void init_once(struct kmem_cache *cachep, void *foo) | 67 | static void init_once(void *foo) |
| 68 | { | 68 | { |
| 69 | struct ncp_inode_info *ei = (struct ncp_inode_info *) foo; | 69 | struct ncp_inode_info *ei = (struct ncp_inode_info *) foo; |
| 70 | 70 | ||
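The slab constructor callback loses its struct kmem_cache * argument, so init_once() shrinks to taking only the object pointer; the same mechanical change appears again in fs/nfs/inode.c below. A sketch of a filesystem registering such a constructor — the cache and struct names are invented, the usual fs/slab headers are assumed, and the kmem_cache_create() flags shown are typical rather than taken from this diff:

struct example_inode_info {
        struct inode vfs_inode;
        /* ... per-fs fields ... */
};

static struct kmem_cache *example_inode_cachep;

static void example_init_once(void *foo)        /* no kmem_cache * argument any more */
{
        struct example_inode_info *ei = foo;

        inode_init_once(&ei->vfs_inode);
}

static int example_init_inodecache(void)
{
        example_inode_cachep = kmem_cache_create("example_inode_cache",
                                                 sizeof(struct example_inode_info),
                                                 0, SLAB_RECLAIM_ACCOUNT,
                                                 example_init_once);
        return example_inode_cachep ? 0 : -ENOMEM;
}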
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 28a238dab23a..74f92b717f78 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c | |||
| @@ -1884,7 +1884,7 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) | |||
| 1884 | return status; | 1884 | return status; |
| 1885 | nfs_access_add_cache(inode, &cache); | 1885 | nfs_access_add_cache(inode, &cache); |
| 1886 | out: | 1886 | out: |
| 1887 | if ((cache.mask & mask) == mask) | 1887 | if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) |
| 1888 | return 0; | 1888 | return 0; |
| 1889 | return -EACCES; | 1889 | return -EACCES; |
| 1890 | } | 1890 | } |
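The NFS access-cache test changes from requiring the cached mask to contain the request exactly to ignoring any request bits outside MAY_READ|MAY_WRITE|MAY_EXEC, so extras such as MAY_OPEN no longer force a spurious -EACCES. A small standalone check of that behaviour; the bit values are stand-ins, not taken from the kernel headers:

#include <assert.h>

#define MAY_EXEC  0x01  /* stand-in values */
#define MAY_WRITE 0x02
#define MAY_READ  0x04
#define MAY_OPEN  0x20  /* assumed to live outside the rwx bits */

int main(void)
{
        unsigned cached = MAY_READ | MAY_WRITE; /* what the server granted */
        unsigned mask   = MAY_READ | MAY_OPEN;  /* what the VFS now asks for */

        /* old test: request must be a subset of the cached mask -> fails */
        assert((cached & mask) != mask);

        /* new test: only the rwx part of the request has to be covered -> passes */
        assert((mask & ~cached & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0);
        return 0;
}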
| @@ -1907,17 +1907,17 @@ int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags) | |||
| 1907 | return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags)); | 1907 | return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags)); |
| 1908 | } | 1908 | } |
| 1909 | 1909 | ||
| 1910 | int nfs_permission(struct inode *inode, int mask, struct nameidata *nd) | 1910 | int nfs_permission(struct inode *inode, int mask) |
| 1911 | { | 1911 | { |
| 1912 | struct rpc_cred *cred; | 1912 | struct rpc_cred *cred; |
| 1913 | int res = 0; | 1913 | int res = 0; |
| 1914 | 1914 | ||
| 1915 | nfs_inc_stats(inode, NFSIOS_VFSACCESS); | 1915 | nfs_inc_stats(inode, NFSIOS_VFSACCESS); |
| 1916 | 1916 | ||
| 1917 | if (mask == 0) | 1917 | if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) |
| 1918 | goto out; | 1918 | goto out; |
| 1919 | /* Is this sys_access() ? */ | 1919 | /* Is this sys_access() ? */ |
| 1920 | if (nd != NULL && (nd->flags & LOOKUP_ACCESS)) | 1920 | if (mask & MAY_ACCESS) |
| 1921 | goto force_lookup; | 1921 | goto force_lookup; |
| 1922 | 1922 | ||
| 1923 | switch (inode->i_mode & S_IFMT) { | 1923 | switch (inode->i_mode & S_IFMT) { |
| @@ -1926,8 +1926,7 @@ int nfs_permission(struct inode *inode, int mask, struct nameidata *nd) | |||
| 1926 | case S_IFREG: | 1926 | case S_IFREG: |
| 1927 | /* NFSv4 has atomic_open... */ | 1927 | /* NFSv4 has atomic_open... */ |
| 1928 | if (nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN) | 1928 | if (nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN) |
| 1929 | && nd != NULL | 1929 | && (mask & MAY_OPEN)) |
| 1930 | && (nd->flags & LOOKUP_OPEN)) | ||
| 1931 | goto out; | 1930 | goto out; |
| 1932 | break; | 1931 | break; |
| 1933 | case S_IFDIR: | 1932 | case S_IFDIR: |
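
With the nameidata argument gone from ->permission(), the caller's intent rides in the mask itself: MAY_ACCESS marks an access(2)-style check and MAY_OPEN marks a permission check made as part of an open. A hedged sketch of the resulting dispatch shape (flag values and return strings are purely illustrative, not the real function):

#include <stdio.h>

#define MAY_EXEC   0x01
#define MAY_WRITE  0x02
#define MAY_READ   0x04
#define MAY_ACCESS 0x10   /* access(2)-style check */
#define MAY_OPEN   0x20   /* check is part of an open */

/* Sketch of the new control flow: intent bits ride in the mask, no nameidata. */
static const char *sketch_permission(unsigned int mask, int have_atomic_open)
{
        if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
                return "nothing to check";
        if (mask & MAY_ACCESS)
                return "force a real ACCESS RPC";       /* sys_access() path */
        if (have_atomic_open && (mask & MAY_OPEN))
                return "defer to NFSv4 atomic open";
        return "consult the access cache";
}

int main(void)
{
        printf("%s\n", sketch_permission(MAY_READ | MAY_OPEN, 1));
        printf("%s\n", sketch_permission(MAY_READ | MAY_ACCESS, 1));
        return 0;
}
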
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index df23f987da6b..52daefa2f521 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c | |||
| @@ -1242,7 +1242,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi) | |||
| 1242 | #endif | 1242 | #endif |
| 1243 | } | 1243 | } |
| 1244 | 1244 | ||
| 1245 | static void init_once(struct kmem_cache * cachep, void *foo) | 1245 | static void init_once(void *foo) |
| 1246 | { | 1246 | { |
| 1247 | struct nfs_inode *nfsi = (struct nfs_inode *) foo; | 1247 | struct nfs_inode *nfsi = (struct nfs_inode *) foo; |
| 1248 | 1248 | ||
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 2f285ef76399..66df08dd1caf 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c | |||
| @@ -129,7 +129,7 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) | |||
| 129 | goto out_err; | 129 | goto out_err; |
| 130 | 130 | ||
| 131 | mntget(mnt); | 131 | mntget(mnt); |
| 132 | err = do_add_mount(mnt, nd, nd->path.mnt->mnt_flags|MNT_SHRINKABLE, | 132 | err = do_add_mount(mnt, &nd->path, nd->path.mnt->mnt_flags|MNT_SHRINKABLE, |
| 133 | &nfs_automount_list); | 133 | &nfs_automount_list); |
| 134 | if (err < 0) { | 134 | if (err < 0) { |
| 135 | mntput(mnt); | 135 | mntput(mnt); |
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 8478fc25daee..46763d1cd397 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c | |||
| @@ -127,7 +127,7 @@ enum { | |||
| 127 | Opt_err | 127 | Opt_err |
| 128 | }; | 128 | }; |
| 129 | 129 | ||
| 130 | static match_table_t __initconst tokens = { | 130 | static match_table_t __initdata tokens = { |
| 131 | {Opt_port, "port=%u"}, | 131 | {Opt_port, "port=%u"}, |
| 132 | {Opt_rsize, "rsize=%u"}, | 132 | {Opt_rsize, "rsize=%u"}, |
| 133 | {Opt_wsize, "wsize=%u"}, | 133 | {Opt_wsize, "wsize=%u"}, |
diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 1b94e3650f5c..e9b20173fef3 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c | |||
| @@ -1279,6 +1279,12 @@ static int nfs_parse_mount_options(char *raw, | |||
| 1279 | } | 1279 | } |
| 1280 | } | 1280 | } |
| 1281 | 1281 | ||
| 1282 | if (errors > 0) { | ||
| 1283 | dfprintk(MOUNT, "NFS: parsing encountered %d error%s\n", | ||
| 1284 | errors, (errors == 1 ? "" : "s")); | ||
| 1285 | if (!sloppy) | ||
| 1286 | return 0; | ||
| 1287 | } | ||
| 1282 | return 1; | 1288 | return 1; |
| 1283 | 1289 | ||
| 1284 | out_nomem: | 1290 | out_nomem: |
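
The parser now keeps going after a bad option and only rejects the mount at the end, unless "sloppy" was requested. A small standalone sketch of that count-then-decide pattern (option names and the 1 = success / 0 = failure convention mirror the hunk; the parsing itself is just an illustration):

#include <stdio.h>
#include <string.h>

static int parse_options(char *opts, int sloppy)
{
        int errors = 0;
        char *tok;

        for (tok = strtok(opts, ","); tok; tok = strtok(NULL, ",")) {
                if (strcmp(tok, "ro") && strcmp(tok, "rw")) {
                        fprintf(stderr, "unrecognized option '%s'\n", tok);
                        errors++;
                }
        }
        if (errors > 0) {
                fprintf(stderr, "parsing encountered %d error%s\n",
                        errors, errors == 1 ? "" : "s");
                if (!sloppy)
                        return 0;       /* reject the mount */
        }
        return 1;                       /* accept, possibly ignoring bad options */
}

int main(void)
{
        char a[] = "rw,bogus";
        char b[] = "rw,bogus";

        printf("strict: %d\n", parse_options(a, 0));
        printf("sloppy: %d\n", parse_options(b, 1));
        return 0;
}
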
| @@ -1718,9 +1724,9 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) | |||
| 1718 | * ones were explicitly specified. Fall back to legacy behavior and | 1724 | * ones were explicitly specified. Fall back to legacy behavior and |
| 1719 | * just return success. | 1725 | * just return success. |
| 1720 | */ | 1726 | */ |
| 1721 | if ((nfsvers == 4 && options4->version == 1) || | 1727 | if ((nfsvers == 4 && (!options4 || options4->version == 1)) || |
| 1722 | (nfsvers <= 3 && options->version >= 1 && | 1728 | (nfsvers <= 3 && (!options || (options->version >= 1 && |
| 1723 | options->version <= 6)) | 1729 | options->version <= 6)))) |
| 1724 | return 0; | 1730 | return 0; |
| 1725 | 1731 | ||
| 1726 | data = kzalloc(sizeof(*data), GFP_KERNEL); | 1732 | data = kzalloc(sizeof(*data), GFP_KERNEL); |
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 3adf8b266461..f089e5839d7d 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c | |||
| @@ -95,10 +95,11 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata) | |||
| 95 | static void nfs_async_unlink_release(void *calldata) | 95 | static void nfs_async_unlink_release(void *calldata) |
| 96 | { | 96 | { |
| 97 | struct nfs_unlinkdata *data = calldata; | 97 | struct nfs_unlinkdata *data = calldata; |
| 98 | struct super_block *sb = data->dir->i_sb; | ||
| 98 | 99 | ||
| 99 | nfs_dec_sillycount(data->dir); | 100 | nfs_dec_sillycount(data->dir); |
| 100 | nfs_sb_deactive(NFS_SERVER(data->dir)); | ||
| 101 | nfs_free_unlinkdata(data); | 101 | nfs_free_unlinkdata(data); |
| 102 | nfs_sb_deactive(NFS_SB(sb)); | ||
| 102 | } | 103 | } |
| 103 | 104 | ||
| 104 | static const struct rpc_call_ops nfs_unlink_ops = { | 105 | static const struct rpc_call_ops nfs_unlink_ops = { |
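
The ordering is the point of this hunk: freeing the unlink data drops the last reference on the directory inode, which has to happen while the superblock is still active, so the sb pointer is stashed first and nfs_sb_deactive() runs last. A toy sketch of the save-then-release-in-order pattern (all types here are stand-ins, not NFS structures):

#include <stdio.h>
#include <stdlib.h>

struct sb   { int active; };
struct ino  { struct sb *sb; };
struct work { struct ino *dir; };

static void sb_deactivate(struct sb *sb) { sb->active = 0; }

static void free_work(struct work *w)
{
        /* dropping the inode reference still reaches through w->dir->sb */
        printf("putting inode on %s sb\n", w->dir->sb->active ? "active" : "dead");
        free(w);
}

int main(void)
{
        struct sb  sb = { 1 };
        struct ino dir = { &sb };
        struct work *w = malloc(sizeof(*w));

        w->dir = &dir;

        /* stash the superblock first; deactivate only after the last use
         * through w->dir is gone */
        struct sb *saved = w->dir->sb;

        free_work(w);
        sb_deactivate(saved);
        return 0;
}
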
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 33bfcf09db46..9dc036f18356 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c | |||
| @@ -1023,7 +1023,7 @@ exp_export(struct nfsctl_export *nxp) | |||
| 1023 | /* Look up the dentry */ | 1023 | /* Look up the dentry */ |
| 1024 | err = path_lookup(nxp->ex_path, 0, &nd); | 1024 | err = path_lookup(nxp->ex_path, 0, &nd); |
| 1025 | if (err) | 1025 | if (err) |
| 1026 | goto out_unlock; | 1026 | goto out_put_clp; |
| 1027 | err = -EINVAL; | 1027 | err = -EINVAL; |
| 1028 | 1028 | ||
| 1029 | exp = exp_get_by_name(clp, nd.path.mnt, nd.path.dentry, NULL); | 1029 | exp = exp_get_by_name(clp, nd.path.mnt, nd.path.dentry, NULL); |
| @@ -1090,9 +1090,9 @@ finish: | |||
| 1090 | exp_put(exp); | 1090 | exp_put(exp); |
| 1091 | if (fsid_key && !IS_ERR(fsid_key)) | 1091 | if (fsid_key && !IS_ERR(fsid_key)) |
| 1092 | cache_put(&fsid_key->h, &svc_expkey_cache); | 1092 | cache_put(&fsid_key->h, &svc_expkey_cache); |
| 1093 | if (clp) | ||
| 1094 | auth_domain_put(clp); | ||
| 1095 | path_put(&nd.path); | 1093 | path_put(&nd.path); |
| 1094 | out_put_clp: | ||
| 1095 | auth_domain_put(clp); | ||
| 1096 | out_unlock: | 1096 | out_unlock: |
| 1097 | exp_writeunlock(); | 1097 | exp_writeunlock(); |
| 1098 | out: | 1098 | out: |
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index 6b6225ac4926..15c6faeec77c 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c | |||
| @@ -19,6 +19,13 @@ | |||
| 19 | 19 | ||
| 20 | #define NFSDDBG_FACILITY NFSDDBG_LOCKD | 20 | #define NFSDDBG_FACILITY NFSDDBG_LOCKD |
| 21 | 21 | ||
| 22 | #ifdef CONFIG_LOCKD_V4 | ||
| 23 | #define nlm_stale_fh nlm4_stale_fh | ||
| 24 | #define nlm_failed nlm4_failed | ||
| 25 | #else | ||
| 26 | #define nlm_stale_fh nlm_lck_denied_nolocks | ||
| 27 | #define nlm_failed nlm_lck_denied_nolocks | ||
| 28 | #endif | ||
| 22 | /* | 29 | /* |
| 23 | * Note: we hold the dentry use count while the file is open. | 30 | * Note: we hold the dentry use count while the file is open. |
| 24 | */ | 31 | */ |
| @@ -47,12 +54,10 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp) | |||
| 47 | return 0; | 54 | return 0; |
| 48 | case nfserr_dropit: | 55 | case nfserr_dropit: |
| 49 | return nlm_drop_reply; | 56 | return nlm_drop_reply; |
| 50 | #ifdef CONFIG_LOCKD_V4 | ||
| 51 | case nfserr_stale: | 57 | case nfserr_stale: |
| 52 | return nlm4_stale_fh; | 58 | return nlm_stale_fh; |
| 53 | #endif | ||
| 54 | default: | 59 | default: |
| 55 | return nlm_lck_denied; | 60 | return nlm_failed; |
| 56 | } | 61 | } |
| 57 | } | 62 | } |
| 58 | 63 | ||
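
Rather than #ifdef-ing individual switch cases, the v4-or-fallback decision is made once with a pair of macros at the top of the file. A compilable sketch of the same compile-time mapping (status values are made up; the real nlm*_ codes live in the lockd headers):

#include <stdio.h>

/* Illustrative status codes only. */
enum { LCK_GRANTED = 0, LCK_DENIED_NOLOCKS = 3, LCK4_STALE_FH = 6, LCK4_FAILED = 7 };

#ifdef CONFIG_LOCKD_V4
#define nlm_stale_fh LCK4_STALE_FH
#define nlm_failed   LCK4_FAILED
#else
#define nlm_stale_fh LCK_DENIED_NOLOCKS
#define nlm_failed   LCK_DENIED_NOLOCKS
#endif

static int map_error(int nfs_err)
{
        switch (nfs_err) {
        case 0:  return LCK_GRANTED;
        case 70: return nlm_stale_fh;   /* "stale file handle", value illustrative */
        default: return nlm_failed;
        }
}

int main(void)
{
        /* build with and without -DCONFIG_LOCKD_V4 to see both mappings */
        printf("stale -> %d, other -> %d\n", map_error(70), map_error(13));
        return 0;
}
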
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index b6ed38380ab8..54b8b4140c8f 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c | |||
| @@ -443,7 +443,7 @@ init_state(struct posix_acl_state *state, int cnt) | |||
| 443 | * enough space for either: | 443 | * enough space for either: |
| 444 | */ | 444 | */ |
| 445 | alloc = sizeof(struct posix_ace_state_array) | 445 | alloc = sizeof(struct posix_ace_state_array) |
| 446 | + cnt*sizeof(struct posix_ace_state); | 446 | + cnt*sizeof(struct posix_user_ace_state); |
| 447 | state->users = kzalloc(alloc, GFP_KERNEL); | 447 | state->users = kzalloc(alloc, GFP_KERNEL); |
| 448 | if (!state->users) | 448 | if (!state->users) |
| 449 | return -ENOMEM; | 449 | return -ENOMEM; |
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index eef1629806f5..e5b51ffafc6c 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c | |||
| @@ -851,7 +851,7 @@ struct nfsd4_operation { | |||
| 851 | 851 | ||
| 852 | static struct nfsd4_operation nfsd4_ops[]; | 852 | static struct nfsd4_operation nfsd4_ops[]; |
| 853 | 853 | ||
| 854 | static inline char *nfsd4_op_name(unsigned opnum); | 854 | static const char *nfsd4_op_name(unsigned opnum); |
| 855 | 855 | ||
| 856 | /* | 856 | /* |
| 857 | * COMPOUND call. | 857 | * COMPOUND call. |
| @@ -867,11 +867,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, | |||
| 867 | int slack_bytes; | 867 | int slack_bytes; |
| 868 | __be32 status; | 868 | __be32 status; |
| 869 | 869 | ||
| 870 | status = nfserr_resource; | ||
| 871 | cstate = cstate_alloc(); | ||
| 872 | if (cstate == NULL) | ||
| 873 | goto out; | ||
| 874 | |||
| 875 | resp->xbuf = &rqstp->rq_res; | 870 | resp->xbuf = &rqstp->rq_res; |
| 876 | resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len; | 871 | resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len; |
| 877 | resp->tagp = resp->p; | 872 | resp->tagp = resp->p; |
| @@ -890,6 +885,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, | |||
| 890 | if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION) | 885 | if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION) |
| 891 | goto out; | 886 | goto out; |
| 892 | 887 | ||
| 888 | status = nfserr_resource; | ||
| 889 | cstate = cstate_alloc(); | ||
| 890 | if (cstate == NULL) | ||
| 891 | goto out; | ||
| 892 | |||
| 893 | status = nfs_ok; | 893 | status = nfs_ok; |
| 894 | while (!status && resp->opcnt < args->opcnt) { | 894 | while (!status && resp->opcnt < args->opcnt) { |
| 895 | op = &args->ops[resp->opcnt++]; | 895 | op = &args->ops[resp->opcnt++]; |
| @@ -957,9 +957,9 @@ encode_op: | |||
| 957 | nfsd4_increment_op_stats(op->opnum); | 957 | nfsd4_increment_op_stats(op->opnum); |
| 958 | } | 958 | } |
| 959 | 959 | ||
| 960 | cstate_free(cstate); | ||
| 960 | out: | 961 | out: |
| 961 | nfsd4_release_compoundargs(args); | 962 | nfsd4_release_compoundargs(args); |
| 962 | cstate_free(cstate); | ||
| 963 | dprintk("nfsv4 compound returned %d\n", ntohl(status)); | 963 | dprintk("nfsv4 compound returned %d\n", ntohl(status)); |
| 964 | return status; | 964 | return status; |
| 965 | } | 965 | } |
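
Moving cstate_alloc() below the cheap minor-version check, and cstate_free() above the shared out: label, means an early goto out can never release state that was never allocated. A small sketch of that allocate-after-validation, free-before-common-exit shape (malloc/free stand in for cstate_alloc/cstate_free):

#include <stdio.h>
#include <stdlib.h>

static int process(int minorversion)
{
        int status = -1;
        char *cstate;

        if (minorversion > 0)           /* cheap validation first */
                goto out;               /* nothing allocated yet */

        cstate = malloc(64);
        if (!cstate) {
                status = -12;           /* -ENOMEM */
                goto out;
        }

        status = 0;                     /* ... run the compound ops ... */

        free(cstate);                   /* release before the shared exit path */
out:
        printf("status %d\n", status);
        return status;
}

int main(void)
{
        process(1);     /* early exit, no allocation, no free */
        process(0);     /* normal path, allocate then free */
        return 0;
}
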
| @@ -1116,8 +1116,7 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = { | |||
| 1116 | }, | 1116 | }, |
| 1117 | }; | 1117 | }; |
| 1118 | 1118 | ||
| 1119 | static inline char * | 1119 | static const char *nfsd4_op_name(unsigned opnum) |
| 1120 | nfsd4_op_name(unsigned opnum) | ||
| 1121 | { | 1120 | { |
| 1122 | if (opnum < ARRAY_SIZE(nfsd4_ops)) | 1121 | if (opnum < ARRAY_SIZE(nfsd4_ops)) |
| 1123 | return nfsd4_ops[opnum].op_name; | 1122 | return nfsd4_ops[opnum].op_name; |
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 1955a2702e60..c53e65f8f3a2 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c | |||
| @@ -12,6 +12,7 @@ | |||
| 12 | #include <linux/time.h> | 12 | #include <linux/time.h> |
| 13 | #include <linux/errno.h> | 13 | #include <linux/errno.h> |
| 14 | #include <linux/fs.h> | 14 | #include <linux/fs.h> |
| 15 | #include <linux/namei.h> | ||
| 15 | #include <linux/fcntl.h> | 16 | #include <linux/fcntl.h> |
| 16 | #include <linux/net.h> | 17 | #include <linux/net.h> |
| 17 | #include <linux/in.h> | 18 | #include <linux/in.h> |
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index f45451eb1e38..ea37c96f0445 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c | |||
| @@ -51,7 +51,7 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry) | |||
| 51 | /* make sure parents give x permission to user */ | 51 | /* make sure parents give x permission to user */ |
| 52 | int err; | 52 | int err; |
| 53 | parent = dget_parent(tdentry); | 53 | parent = dget_parent(tdentry); |
| 54 | err = permission(parent->d_inode, MAY_EXEC, NULL); | 54 | err = inode_permission(parent->d_inode, MAY_EXEC); |
| 55 | if (err < 0) { | 55 | if (err < 0) { |
| 56 | dput(parent); | 56 | dput(parent); |
| 57 | break; | 57 | break; |
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 0f4481e0502d..18060bed5267 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c | |||
| @@ -1516,7 +1516,6 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, | |||
| 1516 | struct dentry *dentry, *dnew; | 1516 | struct dentry *dentry, *dnew; |
| 1517 | __be32 err, cerr; | 1517 | __be32 err, cerr; |
| 1518 | int host_err; | 1518 | int host_err; |
| 1519 | umode_t mode; | ||
| 1520 | 1519 | ||
| 1521 | err = nfserr_noent; | 1520 | err = nfserr_noent; |
| 1522 | if (!flen || !plen) | 1521 | if (!flen || !plen) |
| @@ -1535,11 +1534,6 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, | |||
| 1535 | if (IS_ERR(dnew)) | 1534 | if (IS_ERR(dnew)) |
| 1536 | goto out_nfserr; | 1535 | goto out_nfserr; |
| 1537 | 1536 | ||
| 1538 | mode = S_IALLUGO; | ||
| 1539 | /* Only the MODE ATTRibute is even vaguely meaningful */ | ||
| 1540 | if (iap && (iap->ia_valid & ATTR_MODE)) | ||
| 1541 | mode = iap->ia_mode & S_IALLUGO; | ||
| 1542 | |||
| 1543 | host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); | 1537 | host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); |
| 1544 | if (host_err) | 1538 | if (host_err) |
| 1545 | goto out_nfserr; | 1539 | goto out_nfserr; |
| @@ -1551,11 +1545,11 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, | |||
| 1551 | else { | 1545 | else { |
| 1552 | strncpy(path_alloced, path, plen); | 1546 | strncpy(path_alloced, path, plen); |
| 1553 | path_alloced[plen] = 0; | 1547 | path_alloced[plen] = 0; |
| 1554 | host_err = vfs_symlink(dentry->d_inode, dnew, path_alloced, mode); | 1548 | host_err = vfs_symlink(dentry->d_inode, dnew, path_alloced); |
| 1555 | kfree(path_alloced); | 1549 | kfree(path_alloced); |
| 1556 | } | 1550 | } |
| 1557 | } else | 1551 | } else |
| 1558 | host_err = vfs_symlink(dentry->d_inode, dnew, path, mode); | 1552 | host_err = vfs_symlink(dentry->d_inode, dnew, path); |
| 1559 | 1553 | ||
| 1560 | if (!host_err) { | 1554 | if (!host_err) { |
| 1561 | if (EX_ISSYNC(fhp->fh_export)) | 1555 | if (EX_ISSYNC(fhp->fh_export)) |
| @@ -1959,12 +1953,12 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, | |||
| 1959 | return 0; | 1953 | return 0; |
| 1960 | 1954 | ||
| 1961 | /* This assumes NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */ | 1955 | /* This assumes NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */ |
| 1962 | err = permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC), NULL); | 1956 | err = inode_permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC)); |
| 1963 | 1957 | ||
| 1964 | /* Allow read access to binaries even when mode 111 */ | 1958 | /* Allow read access to binaries even when mode 111 */ |
| 1965 | if (err == -EACCES && S_ISREG(inode->i_mode) && | 1959 | if (err == -EACCES && S_ISREG(inode->i_mode) && |
| 1966 | acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE)) | 1960 | acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE)) |
| 1967 | err = permission(inode, MAY_EXEC, NULL); | 1961 | err = inode_permission(inode, MAY_EXEC); |
| 1968 | 1962 | ||
| 1969 | return err? nfserrno(err) : 0; | 1963 | return err? nfserrno(err) : 0; |
| 1970 | } | 1964 | } |
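
Beyond the rename to inode_permission(), the interesting logic is the retry: a read refused with -EACCES on a regular file is re-tried as MAY_EXEC so mode-0111 binaries stay readable to the server. A toy version of that fallback checking only the "other" mode bits (the real code also requires NFSD_MAY_OWNER_OVERRIDE, omitted here):

#include <stdio.h>

#define MAY_EXEC  0x01
#define MAY_WRITE 0x02
#define MAY_READ  0x04

/* Toy stand-in for inode_permission(); -13 plays the role of -EACCES. */
static int toy_permission(unsigned int mode, int mask)
{
        return ((mode & 7) & mask) == mask ? 0 : -13;
}

static int nfsd_like_check(unsigned int mode, int acc)
{
        int err = toy_permission(mode, acc & (MAY_READ | MAY_WRITE | MAY_EXEC));

        /* allow read access to binaries even when the mode is 0111 */
        if (err == -13 && acc == MAY_READ)
                err = toy_permission(mode, MAY_EXEC);
        return err;
}

int main(void)
{
        printf("mode 0111, READ -> %d\n", nfsd_like_check(0111, MAY_READ));
        printf("mode 0000, READ -> %d\n", nfsd_like_check(0000, MAY_READ));
        return 0;
}
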
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 00e9ccde8e42..b38f944f0667 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c | |||
| @@ -1194,7 +1194,7 @@ lock_retry_remap: | |||
| 1194 | tbh = bhs[i]; | 1194 | tbh = bhs[i]; |
| 1195 | if (!tbh) | 1195 | if (!tbh) |
| 1196 | continue; | 1196 | continue; |
| 1197 | if (unlikely(test_set_buffer_locked(tbh))) | 1197 | if (!trylock_buffer(tbh)) |
| 1198 | BUG(); | 1198 | BUG(); |
| 1199 | /* The buffer dirty state is now irrelevant, just clean it. */ | 1199 | /* The buffer dirty state is now irrelevant, just clean it. */ |
| 1200 | clear_buffer_dirty(tbh); | 1200 | clear_buffer_dirty(tbh); |
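
test_set_buffer_locked() is replaced by trylock_buffer(), which returns nonzero when the lock was taken, so "this must succeed" call sites become if (!trylock_buffer(tbh)) BUG();. The same try-lock shape, sketched in userspace with a C11 atomic flag:

#include <assert.h>
#include <stdatomic.h>

/* Try-lock: returns 1 if we took the lock, 0 if someone else holds it. */
static atomic_flag buf_lock = ATOMIC_FLAG_INIT;

static int try_lock_buf(void)
{
        return !atomic_flag_test_and_set(&buf_lock);
}

static void unlock_buf(void)
{
        atomic_flag_clear(&buf_lock);
}

int main(void)
{
        /* the caller owns the buffer exclusively, so failure would be a bug */
        assert(try_lock_buf());
        /* a second attempt while held correctly reports "already locked" */
        assert(!try_lock_buf());
        unlock_buf();
        return 0;
}
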
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c index 33ff314cc507..9669541d0119 100644 --- a/fs/ntfs/compress.c +++ b/fs/ntfs/compress.c | |||
| @@ -665,7 +665,7 @@ lock_retry_remap: | |||
| 665 | for (i = 0; i < nr_bhs; i++) { | 665 | for (i = 0; i < nr_bhs; i++) { |
| 666 | struct buffer_head *tbh = bhs[i]; | 666 | struct buffer_head *tbh = bhs[i]; |
| 667 | 667 | ||
| 668 | if (unlikely(test_set_buffer_locked(tbh))) | 668 | if (!trylock_buffer(tbh)) |
| 669 | continue; | 669 | continue; |
| 670 | if (unlikely(buffer_uptodate(tbh))) { | 670 | if (unlikely(buffer_uptodate(tbh))) { |
| 671 | unlock_buffer(tbh); | 671 | unlock_buffer(tbh); |
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 3c5550cd11d6..d020866d4232 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c | |||
| @@ -2118,7 +2118,7 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb, | |||
| 2118 | goto out; | 2118 | goto out; |
| 2119 | if (!count) | 2119 | if (!count) |
| 2120 | goto out; | 2120 | goto out; |
| 2121 | err = remove_suid(file->f_path.dentry); | 2121 | err = file_remove_suid(file); |
| 2122 | if (err) | 2122 | if (err) |
| 2123 | goto out; | 2123 | goto out; |
| 2124 | file_update_time(file); | 2124 | file_update_time(file); |
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 790defb847e7..17d32ca6bc35 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c | |||
| @@ -586,7 +586,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, | |||
| 586 | for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { | 586 | for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { |
| 587 | struct buffer_head *tbh = bhs[i_bhs]; | 587 | struct buffer_head *tbh = bhs[i_bhs]; |
| 588 | 588 | ||
| 589 | if (unlikely(test_set_buffer_locked(tbh))) | 589 | if (!trylock_buffer(tbh)) |
| 590 | BUG(); | 590 | BUG(); |
| 591 | BUG_ON(!buffer_uptodate(tbh)); | 591 | BUG_ON(!buffer_uptodate(tbh)); |
| 592 | clear_buffer_dirty(tbh); | 592 | clear_buffer_dirty(tbh); |
| @@ -779,7 +779,7 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync) | |||
| 779 | for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { | 779 | for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { |
| 780 | struct buffer_head *tbh = bhs[i_bhs]; | 780 | struct buffer_head *tbh = bhs[i_bhs]; |
| 781 | 781 | ||
| 782 | if (unlikely(test_set_buffer_locked(tbh))) | 782 | if (!trylock_buffer(tbh)) |
| 783 | BUG(); | 783 | BUG(); |
| 784 | BUG_ON(!buffer_uptodate(tbh)); | 784 | BUG_ON(!buffer_uptodate(tbh)); |
| 785 | clear_buffer_dirty(tbh); | 785 | clear_buffer_dirty(tbh); |
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index e1781c8b1650..9e8a95be7a1e 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c | |||
| @@ -174,7 +174,6 @@ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent, | |||
| 174 | // TODO: Consider moving this lot to a separate function! (AIA) | 174 | // TODO: Consider moving this lot to a separate function! (AIA) |
| 175 | handle_name: | 175 | handle_name: |
| 176 | { | 176 | { |
| 177 | struct dentry *real_dent, *new_dent; | ||
| 178 | MFT_RECORD *m; | 177 | MFT_RECORD *m; |
| 179 | ntfs_attr_search_ctx *ctx; | 178 | ntfs_attr_search_ctx *ctx; |
| 180 | ntfs_inode *ni = NTFS_I(dent_inode); | 179 | ntfs_inode *ni = NTFS_I(dent_inode); |
| @@ -255,93 +254,9 @@ handle_name: | |||
| 255 | } | 254 | } |
| 256 | nls_name.hash = full_name_hash(nls_name.name, nls_name.len); | 255 | nls_name.hash = full_name_hash(nls_name.name, nls_name.len); |
| 257 | 256 | ||
| 258 | /* | 257 | dent = d_add_ci(dent, dent_inode, &nls_name); |
| 259 | * Note: No need for dent->d_lock lock as i_mutex is held on the | ||
| 260 | * parent inode. | ||
| 261 | */ | ||
| 262 | |||
| 263 | /* Does a dentry matching the nls_name exist already? */ | ||
| 264 | real_dent = d_lookup(dent->d_parent, &nls_name); | ||
| 265 | /* If not, create it now. */ | ||
| 266 | if (!real_dent) { | ||
| 267 | real_dent = d_alloc(dent->d_parent, &nls_name); | ||
| 268 | kfree(nls_name.name); | ||
| 269 | if (!real_dent) { | ||
| 270 | err = -ENOMEM; | ||
| 271 | goto err_out; | ||
| 272 | } | ||
| 273 | new_dent = d_splice_alias(dent_inode, real_dent); | ||
| 274 | if (new_dent) | ||
| 275 | dput(real_dent); | ||
| 276 | else | ||
| 277 | new_dent = real_dent; | ||
| 278 | ntfs_debug("Done. (Created new dentry.)"); | ||
| 279 | return new_dent; | ||
| 280 | } | ||
| 281 | kfree(nls_name.name); | 258 | kfree(nls_name.name); |
| 282 | /* Matching dentry exists, check if it is negative. */ | 259 | return dent; |
| 283 | if (real_dent->d_inode) { | ||
| 284 | if (unlikely(real_dent->d_inode != dent_inode)) { | ||
| 285 | /* This can happen because bad inodes are unhashed. */ | ||
| 286 | BUG_ON(!is_bad_inode(dent_inode)); | ||
| 287 | BUG_ON(!is_bad_inode(real_dent->d_inode)); | ||
| 288 | } | ||
| 289 | /* | ||
| 290 | * Already have the inode and the dentry attached, decrement | ||
| 291 | * the reference count to balance the ntfs_iget() we did | ||
| 292 | * earlier on. We found the dentry using d_lookup() so it | ||
| 293 | * cannot be disconnected and thus we do not need to worry | ||
| 294 | * about any NFS/disconnectedness issues here. | ||
| 295 | */ | ||
| 296 | iput(dent_inode); | ||
| 297 | ntfs_debug("Done. (Already had inode and dentry.)"); | ||
| 298 | return real_dent; | ||
| 299 | } | ||
| 300 | /* | ||
| 301 | * Negative dentry: instantiate it unless the inode is a directory and | ||
| 302 | * has a 'disconnected' dentry (i.e. IS_ROOT and DCACHE_DISCONNECTED), | ||
| 303 | * in which case d_move() that in place of the found dentry. | ||
| 304 | */ | ||
| 305 | if (!S_ISDIR(dent_inode->i_mode)) { | ||
| 306 | /* Not a directory; everything is easy. */ | ||
| 307 | d_instantiate(real_dent, dent_inode); | ||
| 308 | ntfs_debug("Done. (Already had negative file dentry.)"); | ||
| 309 | return real_dent; | ||
| 310 | } | ||
| 311 | spin_lock(&dcache_lock); | ||
| 312 | if (list_empty(&dent_inode->i_dentry)) { | ||
| 313 | /* | ||
| 314 | * Directory without a 'disconnected' dentry; we need to do | ||
| 315 | * d_instantiate() by hand because it takes dcache_lock which | ||
| 316 | * we already hold. | ||
| 317 | */ | ||
| 318 | list_add(&real_dent->d_alias, &dent_inode->i_dentry); | ||
| 319 | real_dent->d_inode = dent_inode; | ||
| 320 | spin_unlock(&dcache_lock); | ||
| 321 | security_d_instantiate(real_dent, dent_inode); | ||
| 322 | ntfs_debug("Done. (Already had negative directory dentry.)"); | ||
| 323 | return real_dent; | ||
| 324 | } | ||
| 325 | /* | ||
| 326 | * Directory with a 'disconnected' dentry; get a reference to the | ||
| 327 | * 'disconnected' dentry. | ||
| 328 | */ | ||
| 329 | new_dent = list_entry(dent_inode->i_dentry.next, struct dentry, | ||
| 330 | d_alias); | ||
| 331 | dget_locked(new_dent); | ||
| 332 | spin_unlock(&dcache_lock); | ||
| 333 | /* Do security vodoo. */ | ||
| 334 | security_d_instantiate(real_dent, dent_inode); | ||
| 335 | /* Move new_dent in place of real_dent. */ | ||
| 336 | d_move(new_dent, real_dent); | ||
| 337 | /* Balance the ntfs_iget() we did above. */ | ||
| 338 | iput(dent_inode); | ||
| 339 | /* Throw away real_dent. */ | ||
| 340 | dput(real_dent); | ||
| 341 | /* Use new_dent as the actual dentry. */ | ||
| 342 | ntfs_debug("Done. (Already had negative, disconnected directory " | ||
| 343 | "dentry.)"); | ||
| 344 | return new_dent; | ||
| 345 | 260 | ||
| 346 | eio_err_out: | 261 | eio_err_out: |
| 347 | ntfs_error(vol->sb, "Illegal file name attribute. Run chkdsk."); | 262 | ntfs_error(vol->sb, "Illegal file name attribute. Run chkdsk."); |
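
The long hand-rolled aliasing block collapses into d_add_ci(): hash the perfect-case name and let the helper find an existing dentry for it or create one bound to the inode. A rough userspace analogue of that find-or-create-by-canonical-name idea (fixed-size table, purely illustrative):

#include <stdio.h>
#include <string.h>

struct entry { char name[32]; int inode; };

static struct entry table[8];
static int nentries;

/* Look an entry up under its canonical (perfect-case) name and create it
 * only if it is missing, so a case-insensitive hit never spawns a duplicate. */
static struct entry *add_ci(const char *canonical, int inode)
{
        int i;

        for (i = 0; i < nentries; i++)
                if (!strcmp(table[i].name, canonical))
                        return &table[i];       /* reuse the existing alias */

        if (nentries == 8)
                return NULL;
        snprintf(table[nentries].name, sizeof(table[nentries].name), "%s", canonical);
        table[nentries].inode = inode;
        return &table[nentries++];
}

int main(void)
{
        /* first lookup creates the entry under the canonical spelling */
        struct entry *a = add_ci("ReadMe.TXT", 11);
        /* a later "README.TXT" lookup resolves to the same canonical name
         * and therefore reuses the entry instead of creating another one */
        struct entry *b = add_ci("ReadMe.TXT", 11);

        printf("same entry: %s\n", a == b ? "yes" : "no");
        return 0;
}
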
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 3e76f3b216bc..4a46743b5077 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c | |||
| @@ -3080,7 +3080,7 @@ struct kmem_cache *ntfs_inode_cache; | |||
| 3080 | struct kmem_cache *ntfs_big_inode_cache; | 3080 | struct kmem_cache *ntfs_big_inode_cache; |
| 3081 | 3081 | ||
| 3082 | /* Init once constructor for the inode slab cache. */ | 3082 | /* Init once constructor for the inode slab cache. */ |
| 3083 | static void ntfs_big_inode_init_once(struct kmem_cache *cachep, void *foo) | 3083 | static void ntfs_big_inode_init_once(void *foo) |
| 3084 | { | 3084 | { |
| 3085 | ntfs_inode *ni = (ntfs_inode *)foo; | 3085 | ntfs_inode *ni = (ntfs_inode *)foo; |
| 3086 | 3086 | ||
diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h index 3a8af75351e8..4087fbdac327 100644 --- a/fs/ntfs/usnjrnl.h +++ b/fs/ntfs/usnjrnl.h | |||
| @@ -113,7 +113,7 @@ typedef struct { | |||
| 113 | * Reason flags (32-bit). Cumulative flags describing the change(s) to the | 113 | * Reason flags (32-bit). Cumulative flags describing the change(s) to the |
| 114 | * file since it was last opened. I think the names speak for themselves but | 114 | * file since it was last opened. I think the names speak for themselves but |
| 115 | * if you disagree check out the descriptions in the Linux NTFS project NTFS | 115 | * if you disagree check out the descriptions in the Linux NTFS project NTFS |
| 116 | * documentation: http://linux-ntfs.sourceforge.net/ntfs/files/usnjrnl.html | 116 | * documentation: http://www.linux-ntfs.org/ |
| 117 | */ | 117 | */ |
| 118 | enum { | 118 | enum { |
| 119 | USN_REASON_DATA_OVERWRITE = const_cpu_to_le32(0x00000001), | 119 | USN_REASON_DATA_OVERWRITE = const_cpu_to_le32(0x00000001), |
| @@ -145,7 +145,7 @@ typedef le32 USN_REASON_FLAGS; | |||
| 145 | * Source info flags (32-bit). Information about the source of the change(s) | 145 | * Source info flags (32-bit). Information about the source of the change(s) |
| 146 | * to the file. For detailed descriptions of what these mean, see the Linux | 146 | * to the file. For detailed descriptions of what these mean, see the Linux |
| 147 | * NTFS project NTFS documentation: | 147 | * NTFS project NTFS documentation: |
| 148 | * http://linux-ntfs.sourceforge.net/ntfs/files/usnjrnl.html | 148 | * http://www.linux-ntfs.org/ |
| 149 | */ | 149 | */ |
| 150 | enum { | 150 | enum { |
| 151 | USN_SOURCE_DATA_MANAGEMENT = const_cpu_to_le32(0x00000001), | 151 | USN_SOURCE_DATA_MANAGEMENT = const_cpu_to_le32(0x00000001), |
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 1db080135c6d..a53da1466277 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
| @@ -594,7 +594,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, | |||
| 594 | goto bail; | 594 | goto bail; |
| 595 | } | 595 | } |
| 596 | 596 | ||
| 597 | if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) { | 597 | if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) { |
| 598 | ocfs2_error(inode->i_sb, | 598 | ocfs2_error(inode->i_sb, |
| 599 | "Inode %llu has a hole at block %llu\n", | 599 | "Inode %llu has a hole at block %llu\n", |
| 600 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 600 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
| @@ -1073,12 +1073,15 @@ static void ocfs2_write_failure(struct inode *inode, | |||
| 1073 | for(i = 0; i < wc->w_num_pages; i++) { | 1073 | for(i = 0; i < wc->w_num_pages; i++) { |
| 1074 | tmppage = wc->w_pages[i]; | 1074 | tmppage = wc->w_pages[i]; |
| 1075 | 1075 | ||
| 1076 | if (ocfs2_should_order_data(inode)) | 1076 | if (page_has_buffers(tmppage)) { |
| 1077 | walk_page_buffers(wc->w_handle, page_buffers(tmppage), | 1077 | if (ocfs2_should_order_data(inode)) |
| 1078 | from, to, NULL, | 1078 | walk_page_buffers(wc->w_handle, |
| 1079 | ocfs2_journal_dirty_data); | 1079 | page_buffers(tmppage), |
| 1080 | from, to, NULL, | ||
| 1081 | ocfs2_journal_dirty_data); | ||
| 1080 | 1082 | ||
| 1081 | block_commit_write(tmppage, from, to); | 1083 | block_commit_write(tmppage, from, to); |
| 1084 | } | ||
| 1082 | } | 1085 | } |
| 1083 | } | 1086 | } |
| 1084 | 1087 | ||
| @@ -1901,12 +1904,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping, | |||
| 1901 | to = PAGE_CACHE_SIZE; | 1904 | to = PAGE_CACHE_SIZE; |
| 1902 | } | 1905 | } |
| 1903 | 1906 | ||
| 1904 | if (ocfs2_should_order_data(inode)) | 1907 | if (page_has_buffers(tmppage)) { |
| 1905 | walk_page_buffers(wc->w_handle, page_buffers(tmppage), | 1908 | if (ocfs2_should_order_data(inode)) |
| 1906 | from, to, NULL, | 1909 | walk_page_buffers(wc->w_handle, |
| 1907 | ocfs2_journal_dirty_data); | 1910 | page_buffers(tmppage), |
| 1908 | 1911 | from, to, NULL, | |
| 1909 | block_commit_write(tmppage, from, to); | 1912 | ocfs2_journal_dirty_data); |
| 1913 | block_commit_write(tmppage, from, to); | ||
| 1914 | } | ||
| 1910 | } | 1915 | } |
| 1911 | 1916 | ||
| 1912 | out_write_size: | 1917 | out_write_size: |
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index d8bfa0eb41b2..52276c02f710 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c | |||
| @@ -138,20 +138,20 @@ static int nst_seq_show(struct seq_file *seq, void *v) | |||
| 138 | " message id: %d\n" | 138 | " message id: %d\n" |
| 139 | " message type: %u\n" | 139 | " message type: %u\n" |
| 140 | " message key: 0x%08x\n" | 140 | " message key: 0x%08x\n" |
| 141 | " sock acquiry: %lu.%lu\n" | 141 | " sock acquiry: %lu.%ld\n" |
| 142 | " send start: %lu.%lu\n" | 142 | " send start: %lu.%ld\n" |
| 143 | " wait start: %lu.%lu\n", | 143 | " wait start: %lu.%ld\n", |
| 144 | nst, (unsigned long)nst->st_task->pid, | 144 | nst, (unsigned long)nst->st_task->pid, |
| 145 | (unsigned long)nst->st_task->tgid, | 145 | (unsigned long)nst->st_task->tgid, |
| 146 | nst->st_task->comm, nst->st_node, | 146 | nst->st_task->comm, nst->st_node, |
| 147 | nst->st_sc, nst->st_id, nst->st_msg_type, | 147 | nst->st_sc, nst->st_id, nst->st_msg_type, |
| 148 | nst->st_msg_key, | 148 | nst->st_msg_key, |
| 149 | nst->st_sock_time.tv_sec, | 149 | nst->st_sock_time.tv_sec, |
| 150 | (unsigned long)nst->st_sock_time.tv_usec, | 150 | (long)nst->st_sock_time.tv_usec, |
| 151 | nst->st_send_time.tv_sec, | 151 | nst->st_send_time.tv_sec, |
| 152 | (unsigned long)nst->st_send_time.tv_usec, | 152 | (long)nst->st_send_time.tv_usec, |
| 153 | nst->st_status_time.tv_sec, | 153 | nst->st_status_time.tv_sec, |
| 154 | nst->st_status_time.tv_usec); | 154 | (long)nst->st_status_time.tv_usec); |
| 155 | } | 155 | } |
| 156 | 156 | ||
| 157 | spin_unlock(&o2net_debug_lock); | 157 | spin_unlock(&o2net_debug_lock); |
| @@ -276,7 +276,7 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos) | |||
| 276 | return sc; /* unused, just needs to be null when done */ | 276 | return sc; /* unused, just needs to be null when done */ |
| 277 | } | 277 | } |
| 278 | 278 | ||
| 279 | #define TV_SEC_USEC(TV) TV.tv_sec, (unsigned long)TV.tv_usec | 279 | #define TV_SEC_USEC(TV) TV.tv_sec, (long)TV.tv_usec |
| 280 | 280 | ||
| 281 | static int sc_seq_show(struct seq_file *seq, void *v) | 281 | static int sc_seq_show(struct seq_file *seq, void *v) |
| 282 | { | 282 | { |
| @@ -309,12 +309,12 @@ static int sc_seq_show(struct seq_file *seq, void *v) | |||
| 309 | " remote node: %s\n" | 309 | " remote node: %s\n" |
| 310 | " page off: %zu\n" | 310 | " page off: %zu\n" |
| 311 | " handshake ok: %u\n" | 311 | " handshake ok: %u\n" |
| 312 | " timer: %lu.%lu\n" | 312 | " timer: %lu.%ld\n" |
| 313 | " data ready: %lu.%lu\n" | 313 | " data ready: %lu.%ld\n" |
| 314 | " advance start: %lu.%lu\n" | 314 | " advance start: %lu.%ld\n" |
| 315 | " advance stop: %lu.%lu\n" | 315 | " advance stop: %lu.%ld\n" |
| 316 | " func start: %lu.%lu\n" | 316 | " func start: %lu.%ld\n" |
| 317 | " func stop: %lu.%lu\n" | 317 | " func stop: %lu.%ld\n" |
| 318 | " func key: %u\n" | 318 | " func key: %u\n" |
| 319 | " func type: %u\n", | 319 | " func type: %u\n", |
| 320 | sc, | 320 | sc, |
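
tv_usec is a signed suseconds_t, so the format strings switch from %lu with an (unsigned long) cast to %ld with a (long) cast. A runnable check of the matching printf specifiers:

#include <stdio.h>
#include <sys/time.h>

int main(void)
{
        struct timeval tv;

        gettimeofday(&tv, NULL);
        /* tv_sec is printed unsigned only to mirror the kernel code above;
         * tv_usec is signed (suseconds_t), hence %ld with a (long) cast */
        printf("now: %lu.%06ld\n", (unsigned long)tv.tv_sec, (long)tv.tv_usec);
        return 0;
}
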
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index a27d61581bd6..2bcf706d9dd3 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c | |||
| @@ -143,8 +143,8 @@ static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); | |||
| 143 | static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); | 143 | static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); |
| 144 | 144 | ||
| 145 | #ifdef CONFIG_DEBUG_FS | 145 | #ifdef CONFIG_DEBUG_FS |
| 146 | void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, | 146 | static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, |
| 147 | u32 msgkey, struct task_struct *task, u8 node) | 147 | u32 msgkey, struct task_struct *task, u8 node) |
| 148 | { | 148 | { |
| 149 | INIT_LIST_HEAD(&nst->st_net_debug_item); | 149 | INIT_LIST_HEAD(&nst->st_net_debug_item); |
| 150 | nst->st_task = task; | 150 | nst->st_task = task; |
| @@ -153,31 +153,61 @@ void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, | |||
| 153 | nst->st_node = node; | 153 | nst->st_node = node; |
| 154 | } | 154 | } |
| 155 | 155 | ||
| 156 | void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) | 156 | static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) |
| 157 | { | 157 | { |
| 158 | do_gettimeofday(&nst->st_sock_time); | 158 | do_gettimeofday(&nst->st_sock_time); |
| 159 | } | 159 | } |
| 160 | 160 | ||
| 161 | void o2net_set_nst_send_time(struct o2net_send_tracking *nst) | 161 | static void o2net_set_nst_send_time(struct o2net_send_tracking *nst) |
| 162 | { | 162 | { |
| 163 | do_gettimeofday(&nst->st_send_time); | 163 | do_gettimeofday(&nst->st_send_time); |
| 164 | } | 164 | } |
| 165 | 165 | ||
| 166 | void o2net_set_nst_status_time(struct o2net_send_tracking *nst) | 166 | static void o2net_set_nst_status_time(struct o2net_send_tracking *nst) |
| 167 | { | 167 | { |
| 168 | do_gettimeofday(&nst->st_status_time); | 168 | do_gettimeofday(&nst->st_status_time); |
| 169 | } | 169 | } |
| 170 | 170 | ||
| 171 | void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, | 171 | static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, |
| 172 | struct o2net_sock_container *sc) | 172 | struct o2net_sock_container *sc) |
| 173 | { | 173 | { |
| 174 | nst->st_sc = sc; | 174 | nst->st_sc = sc; |
| 175 | } | 175 | } |
| 176 | 176 | ||
| 177 | void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id) | 177 | static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id) |
| 178 | { | 178 | { |
| 179 | nst->st_id = msg_id; | 179 | nst->st_id = msg_id; |
| 180 | } | 180 | } |
| 181 | |||
| 182 | #else /* CONFIG_DEBUG_FS */ | ||
| 183 | |||
| 184 | static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, | ||
| 185 | u32 msgkey, struct task_struct *task, u8 node) | ||
| 186 | { | ||
| 187 | } | ||
| 188 | |||
| 189 | static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) | ||
| 190 | { | ||
| 191 | } | ||
| 192 | |||
| 193 | static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst) | ||
| 194 | { | ||
| 195 | } | ||
| 196 | |||
| 197 | static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst) | ||
| 198 | { | ||
| 199 | } | ||
| 200 | |||
| 201 | static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, | ||
| 202 | struct o2net_sock_container *sc) | ||
| 203 | { | ||
| 204 | } | ||
| 205 | |||
| 206 | static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, | ||
| 207 | u32 msg_id) | ||
| 208 | { | ||
| 209 | } | ||
| 210 | |||
| 181 | #endif /* CONFIG_DEBUG_FS */ | 211 | #endif /* CONFIG_DEBUG_FS */ |
| 182 | 212 | ||
| 183 | static inline int o2net_reconnect_delay(void) | 213 | static inline int o2net_reconnect_delay(void) |
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index 18307ff81b77..8d58cfe410b1 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h | |||
| @@ -224,42 +224,10 @@ struct o2net_send_tracking { | |||
| 224 | struct timeval st_send_time; | 224 | struct timeval st_send_time; |
| 225 | struct timeval st_status_time; | 225 | struct timeval st_status_time; |
| 226 | }; | 226 | }; |
| 227 | |||
| 228 | void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, | ||
| 229 | u32 msgkey, struct task_struct *task, u8 node); | ||
| 230 | void o2net_set_nst_sock_time(struct o2net_send_tracking *nst); | ||
| 231 | void o2net_set_nst_send_time(struct o2net_send_tracking *nst); | ||
| 232 | void o2net_set_nst_status_time(struct o2net_send_tracking *nst); | ||
| 233 | void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, | ||
| 234 | struct o2net_sock_container *sc); | ||
| 235 | void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id); | ||
| 236 | |||
| 237 | #else | 227 | #else |
| 238 | struct o2net_send_tracking { | 228 | struct o2net_send_tracking { |
| 239 | u32 dummy; | 229 | u32 dummy; |
| 240 | }; | 230 | }; |
| 241 | |||
| 242 | static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, | ||
| 243 | u32 msgkey, struct task_struct *task, u8 node) | ||
| 244 | { | ||
| 245 | } | ||
| 246 | static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) | ||
| 247 | { | ||
| 248 | } | ||
| 249 | static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst) | ||
| 250 | { | ||
| 251 | } | ||
| 252 | static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst) | ||
| 253 | { | ||
| 254 | } | ||
| 255 | static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, | ||
| 256 | struct o2net_sock_container *sc) | ||
| 257 | { | ||
| 258 | } | ||
| 259 | static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, | ||
| 260 | u32 msg_id) | ||
| 261 | { | ||
| 262 | } | ||
| 263 | #endif /* CONFIG_DEBUG_FS */ | 231 | #endif /* CONFIG_DEBUG_FS */ |
| 264 | 232 | ||
| 265 | #endif /* O2CLUSTER_TCP_INTERNAL_H */ | 233 | #endif /* O2CLUSTER_TCP_INTERNAL_H */ |
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 8a1875848080..9cce563fd627 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c | |||
| @@ -1300,7 +1300,6 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, | |||
| 1300 | di->i_size = cpu_to_le64(sb->s_blocksize); | 1300 | di->i_size = cpu_to_le64(sb->s_blocksize); |
| 1301 | di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec); | 1301 | di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec); |
| 1302 | di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec); | 1302 | di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec); |
| 1303 | dir->i_blocks = ocfs2_inode_sector_count(dir); | ||
| 1304 | 1303 | ||
| 1305 | /* | 1304 | /* |
| 1306 | * This should never fail as our extent list is empty and all | 1305 | * This should never fail as our extent list is empty and all |
| @@ -1310,9 +1309,15 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, | |||
| 1310 | NULL); | 1309 | NULL); |
| 1311 | if (ret) { | 1310 | if (ret) { |
| 1312 | mlog_errno(ret); | 1311 | mlog_errno(ret); |
| 1313 | goto out; | 1312 | goto out_commit; |
| 1314 | } | 1313 | } |
| 1315 | 1314 | ||
| 1315 | /* | ||
| 1316 | * Set i_blocks after the extent insert for the most up to | ||
| 1317 | * date ip_clusters value. | ||
| 1318 | */ | ||
| 1319 | dir->i_blocks = ocfs2_inode_sector_count(dir); | ||
| 1320 | |||
| 1316 | ret = ocfs2_journal_dirty(handle, di_bh); | 1321 | ret = ocfs2_journal_dirty(handle, di_bh); |
| 1317 | if (ret) { | 1322 | if (ret) { |
| 1318 | mlog_errno(ret); | 1323 | mlog_errno(ret); |
| @@ -1336,7 +1341,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, | |||
| 1336 | len, 0, NULL); | 1341 | len, 0, NULL); |
| 1337 | if (ret) { | 1342 | if (ret) { |
| 1338 | mlog_errno(ret); | 1343 | mlog_errno(ret); |
| 1339 | goto out; | 1344 | goto out_commit; |
| 1340 | } | 1345 | } |
| 1341 | } | 1346 | } |
| 1342 | 1347 | ||
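
Two fixes share this file: i_blocks is derived only after the extent insert has updated the cluster count, and error paths taken once the transaction handle exists jump to out_commit instead of out, so the handle is always closed. A sketch of that layered-cleanup rule (malloc/free stand in for starting and stopping the handle):

#include <stdio.h>
#include <stdlib.h>

static int expand_thing(int fail_insert)
{
        int ret = 0;
        char *handle = malloc(16);      /* stands in for starting the transaction */

        if (!handle)
                return -12;

        if (fail_insert) {
                ret = -5;               /* say, -EIO from the extent insert */
                goto out_commit;        /* NOT a plain "goto out" past the cleanup */
        }

        /* derived size is computed only after the insert succeeded */
        printf("insert done, updating block count\n");

out_commit:
        free(handle);                   /* stands in for stopping the transaction */
        return ret;
}

int main(void)
{
        printf("%d %d\n", expand_thing(0), expand_thing(1));
        return 0;
}
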
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c index e48aba698b77..533a789c3ef8 100644 --- a/fs/ocfs2/dlm/dlmfs.c +++ b/fs/ocfs2/dlm/dlmfs.c | |||
| @@ -267,8 +267,7 @@ static ssize_t dlmfs_file_write(struct file *filp, | |||
| 267 | return writelen; | 267 | return writelen; |
| 268 | } | 268 | } |
| 269 | 269 | ||
| 270 | static void dlmfs_init_once(struct kmem_cache *cachep, | 270 | static void dlmfs_init_once(void *foo) |
| 271 | void *foo) | ||
| 272 | { | 271 | { |
| 273 | struct dlmfs_inode_private *ip = | 272 | struct dlmfs_inode_private *ip = |
| 274 | (struct dlmfs_inode_private *) foo; | 273 | (struct dlmfs_inode_private *) foo; |
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index e8514e8b6ce8..ec2ed15c3daa 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
| @@ -1176,7 +1176,7 @@ bail: | |||
| 1176 | return err; | 1176 | return err; |
| 1177 | } | 1177 | } |
| 1178 | 1178 | ||
| 1179 | int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd) | 1179 | int ocfs2_permission(struct inode *inode, int mask) |
| 1180 | { | 1180 | { |
| 1181 | int ret; | 1181 | int ret; |
| 1182 | 1182 | ||
| @@ -1766,8 +1766,8 @@ out_inode_unlock: | |||
| 1766 | out_rw_unlock: | 1766 | out_rw_unlock: |
| 1767 | ocfs2_rw_unlock(inode, 1); | 1767 | ocfs2_rw_unlock(inode, 1); |
| 1768 | 1768 | ||
| 1769 | mutex_unlock(&inode->i_mutex); | ||
| 1770 | out: | 1769 | out: |
| 1770 | mutex_unlock(&inode->i_mutex); | ||
| 1771 | return ret; | 1771 | return ret; |
| 1772 | } | 1772 | } |
| 1773 | 1773 | ||
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 048ddcaf5c80..1e27b4d017ea 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h | |||
| @@ -62,8 +62,7 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, | |||
| 62 | int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); | 62 | int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); |
| 63 | int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, | 63 | int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, |
| 64 | struct kstat *stat); | 64 | struct kstat *stat); |
| 65 | int ocfs2_permission(struct inode *inode, int mask, | 65 | int ocfs2_permission(struct inode *inode, int mask); |
| 66 | struct nameidata *nd); | ||
| 67 | 66 | ||
| 68 | int ocfs2_should_update_atime(struct inode *inode, | 67 | int ocfs2_should_update_atime(struct inode *inode, |
| 69 | struct vfsmount *vfsmnt); | 68 | struct vfsmount *vfsmnt); |
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index a8c19cb3cfdd..c47bc2a809c2 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c | |||
| @@ -57,7 +57,7 @@ static int __ocfs2_recovery_thread(void *arg); | |||
| 57 | static int ocfs2_commit_cache(struct ocfs2_super *osb); | 57 | static int ocfs2_commit_cache(struct ocfs2_super *osb); |
| 58 | static int ocfs2_wait_on_mount(struct ocfs2_super *osb); | 58 | static int ocfs2_wait_on_mount(struct ocfs2_super *osb); |
| 59 | static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, | 59 | static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, |
| 60 | int dirty); | 60 | int dirty, int replayed); |
| 61 | static int ocfs2_trylock_journal(struct ocfs2_super *osb, | 61 | static int ocfs2_trylock_journal(struct ocfs2_super *osb, |
| 62 | int slot_num); | 62 | int slot_num); |
| 63 | static int ocfs2_recover_orphans(struct ocfs2_super *osb, | 63 | static int ocfs2_recover_orphans(struct ocfs2_super *osb, |
| @@ -562,8 +562,18 @@ done: | |||
| 562 | return status; | 562 | return status; |
| 563 | } | 563 | } |
| 564 | 564 | ||
| 565 | static void ocfs2_bump_recovery_generation(struct ocfs2_dinode *di) | ||
| 566 | { | ||
| 567 | le32_add_cpu(&(di->id1.journal1.ij_recovery_generation), 1); | ||
| 568 | } | ||
| 569 | |||
| 570 | static u32 ocfs2_get_recovery_generation(struct ocfs2_dinode *di) | ||
| 571 | { | ||
| 572 | return le32_to_cpu(di->id1.journal1.ij_recovery_generation); | ||
| 573 | } | ||
| 574 | |||
| 565 | static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, | 575 | static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, |
| 566 | int dirty) | 576 | int dirty, int replayed) |
| 567 | { | 577 | { |
| 568 | int status; | 578 | int status; |
| 569 | unsigned int flags; | 579 | unsigned int flags; |
| @@ -593,6 +603,9 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, | |||
| 593 | flags &= ~OCFS2_JOURNAL_DIRTY_FL; | 603 | flags &= ~OCFS2_JOURNAL_DIRTY_FL; |
| 594 | fe->id1.journal1.ij_flags = cpu_to_le32(flags); | 604 | fe->id1.journal1.ij_flags = cpu_to_le32(flags); |
| 595 | 605 | ||
| 606 | if (replayed) | ||
| 607 | ocfs2_bump_recovery_generation(fe); | ||
| 608 | |||
| 596 | status = ocfs2_write_block(osb, bh, journal->j_inode); | 609 | status = ocfs2_write_block(osb, bh, journal->j_inode); |
| 597 | if (status < 0) | 610 | if (status < 0) |
| 598 | mlog_errno(status); | 611 | mlog_errno(status); |
| @@ -667,7 +680,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) | |||
| 667 | * Do not toggle if flush was unsuccessful otherwise | 680 | * Do not toggle if flush was unsuccessful otherwise |
| 668 | * will leave dirty metadata in a "clean" journal | 681 | * will leave dirty metadata in a "clean" journal |
| 669 | */ | 682 | */ |
| 670 | status = ocfs2_journal_toggle_dirty(osb, 0); | 683 | status = ocfs2_journal_toggle_dirty(osb, 0, 0); |
| 671 | if (status < 0) | 684 | if (status < 0) |
| 672 | mlog_errno(status); | 685 | mlog_errno(status); |
| 673 | } | 686 | } |
| @@ -710,7 +723,7 @@ static void ocfs2_clear_journal_error(struct super_block *sb, | |||
| 710 | } | 723 | } |
| 711 | } | 724 | } |
| 712 | 725 | ||
| 713 | int ocfs2_journal_load(struct ocfs2_journal *journal, int local) | 726 | int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed) |
| 714 | { | 727 | { |
| 715 | int status = 0; | 728 | int status = 0; |
| 716 | struct ocfs2_super *osb; | 729 | struct ocfs2_super *osb; |
| @@ -729,7 +742,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local) | |||
| 729 | 742 | ||
| 730 | ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num); | 743 | ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num); |
| 731 | 744 | ||
| 732 | status = ocfs2_journal_toggle_dirty(osb, 1); | 745 | status = ocfs2_journal_toggle_dirty(osb, 1, replayed); |
| 733 | if (status < 0) { | 746 | if (status < 0) { |
| 734 | mlog_errno(status); | 747 | mlog_errno(status); |
| 735 | goto done; | 748 | goto done; |
| @@ -771,7 +784,7 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full) | |||
| 771 | goto bail; | 784 | goto bail; |
| 772 | } | 785 | } |
| 773 | 786 | ||
| 774 | status = ocfs2_journal_toggle_dirty(journal->j_osb, 0); | 787 | status = ocfs2_journal_toggle_dirty(journal->j_osb, 0, 0); |
| 775 | if (status < 0) | 788 | if (status < 0) |
| 776 | mlog_errno(status); | 789 | mlog_errno(status); |
| 777 | 790 | ||
| @@ -1034,6 +1047,12 @@ restart: | |||
| 1034 | spin_unlock(&osb->osb_lock); | 1047 | spin_unlock(&osb->osb_lock); |
| 1035 | mlog(0, "All nodes recovered\n"); | 1048 | mlog(0, "All nodes recovered\n"); |
| 1036 | 1049 | ||
| 1050 | /* Refresh all journal recovery generations from disk */ | ||
| 1051 | status = ocfs2_check_journals_nolocks(osb); | ||
| 1052 | status = (status == -EROFS) ? 0 : status; | ||
| 1053 | if (status < 0) | ||
| 1054 | mlog_errno(status); | ||
| 1055 | |||
| 1037 | ocfs2_super_unlock(osb, 1); | 1056 | ocfs2_super_unlock(osb, 1); |
| 1038 | 1057 | ||
| 1039 | /* We always run recovery on our own orphan dir - the dead | 1058 | /* We always run recovery on our own orphan dir - the dead |
| @@ -1096,6 +1115,42 @@ out: | |||
| 1096 | mlog_exit_void(); | 1115 | mlog_exit_void(); |
| 1097 | } | 1116 | } |
| 1098 | 1117 | ||
| 1118 | static int ocfs2_read_journal_inode(struct ocfs2_super *osb, | ||
| 1119 | int slot_num, | ||
| 1120 | struct buffer_head **bh, | ||
| 1121 | struct inode **ret_inode) | ||
| 1122 | { | ||
| 1123 | int status = -EACCES; | ||
| 1124 | struct inode *inode = NULL; | ||
| 1125 | |||
| 1126 | BUG_ON(slot_num >= osb->max_slots); | ||
| 1127 | |||
| 1128 | inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, | ||
| 1129 | slot_num); | ||
| 1130 | if (!inode || is_bad_inode(inode)) { | ||
| 1131 | mlog_errno(status); | ||
| 1132 | goto bail; | ||
| 1133 | } | ||
| 1134 | SET_INODE_JOURNAL(inode); | ||
| 1135 | |||
| 1136 | status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, bh, 0, inode); | ||
| 1137 | if (status < 0) { | ||
| 1138 | mlog_errno(status); | ||
| 1139 | goto bail; | ||
| 1140 | } | ||
| 1141 | |||
| 1142 | status = 0; | ||
| 1143 | |||
| 1144 | bail: | ||
| 1145 | if (inode) { | ||
| 1146 | if (status || !ret_inode) | ||
| 1147 | iput(inode); | ||
| 1148 | else | ||
| 1149 | *ret_inode = inode; | ||
| 1150 | } | ||
| 1151 | return status; | ||
| 1152 | } | ||
| 1153 | |||
| 1099 | /* Does the actual journal replay and marks the journal inode as | 1154 | /* Does the actual journal replay and marks the journal inode as |
| 1100 | * clean. Will only replay if the journal inode is marked dirty. */ | 1155 | * clean. Will only replay if the journal inode is marked dirty. */ |
| 1101 | static int ocfs2_replay_journal(struct ocfs2_super *osb, | 1156 | static int ocfs2_replay_journal(struct ocfs2_super *osb, |
| @@ -1109,22 +1164,36 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, | |||
| 1109 | struct ocfs2_dinode *fe; | 1164 | struct ocfs2_dinode *fe; |
| 1110 | journal_t *journal = NULL; | 1165 | journal_t *journal = NULL; |
| 1111 | struct buffer_head *bh = NULL; | 1166 | struct buffer_head *bh = NULL; |
| 1167 | u32 slot_reco_gen; | ||
| 1112 | 1168 | ||
| 1113 | inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, | 1169 | status = ocfs2_read_journal_inode(osb, slot_num, &bh, &inode); |
| 1114 | slot_num); | 1170 | if (status) { |
| 1115 | if (inode == NULL) { | ||
| 1116 | status = -EACCES; | ||
| 1117 | mlog_errno(status); | 1171 | mlog_errno(status); |
| 1118 | goto done; | 1172 | goto done; |
| 1119 | } | 1173 | } |
| 1120 | if (is_bad_inode(inode)) { | 1174 | |
| 1121 | status = -EACCES; | 1175 | fe = (struct ocfs2_dinode *)bh->b_data; |
| 1122 | iput(inode); | 1176 | slot_reco_gen = ocfs2_get_recovery_generation(fe); |
| 1123 | inode = NULL; | 1177 | brelse(bh); |
| 1124 | mlog_errno(status); | 1178 | bh = NULL; |
| 1179 | |||
| 1180 | /* | ||
| 1181 | * As the fs recovery is asynchronous, there is a small chance that | ||
| 1182 | * another node mounted (and recovered) the slot before the recovery | ||
| 1183 | * thread could get the lock. To handle that, we dirty read the journal | ||
| 1184 | * inode for that slot to get the recovery generation. If it is | ||
| 1185 | * different than what we expected, the slot has been recovered. | ||
| 1186 | * If not, it needs recovery. | ||
| 1187 | */ | ||
| 1188 | if (osb->slot_recovery_generations[slot_num] != slot_reco_gen) { | ||
| 1189 | mlog(0, "Slot %u already recovered (old/new=%u/%u)\n", slot_num, | ||
| 1190 | osb->slot_recovery_generations[slot_num], slot_reco_gen); | ||
| 1191 | osb->slot_recovery_generations[slot_num] = slot_reco_gen; | ||
| 1192 | status = -EBUSY; | ||
| 1125 | goto done; | 1193 | goto done; |
| 1126 | } | 1194 | } |
| 1127 | SET_INODE_JOURNAL(inode); | 1195 | |
| 1196 | /* Continue with recovery as the journal has not yet been recovered */ | ||
| 1128 | 1197 | ||
| 1129 | status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY); | 1198 | status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY); |
| 1130 | if (status < 0) { | 1199 | if (status < 0) { |
| @@ -1138,9 +1207,12 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, | |||
| 1138 | fe = (struct ocfs2_dinode *) bh->b_data; | 1207 | fe = (struct ocfs2_dinode *) bh->b_data; |
| 1139 | 1208 | ||
| 1140 | flags = le32_to_cpu(fe->id1.journal1.ij_flags); | 1209 | flags = le32_to_cpu(fe->id1.journal1.ij_flags); |
| 1210 | slot_reco_gen = ocfs2_get_recovery_generation(fe); | ||
| 1141 | 1211 | ||
| 1142 | if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) { | 1212 | if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) { |
| 1143 | mlog(0, "No recovery required for node %d\n", node_num); | 1213 | mlog(0, "No recovery required for node %d\n", node_num); |
| 1214 | /* Refresh recovery generation for the slot */ | ||
| 1215 | osb->slot_recovery_generations[slot_num] = slot_reco_gen; | ||
| 1144 | goto done; | 1216 | goto done; |
| 1145 | } | 1217 | } |
| 1146 | 1218 | ||
| @@ -1188,6 +1260,11 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, | |||
| 1188 | flags &= ~OCFS2_JOURNAL_DIRTY_FL; | 1260 | flags &= ~OCFS2_JOURNAL_DIRTY_FL; |
| 1189 | fe->id1.journal1.ij_flags = cpu_to_le32(flags); | 1261 | fe->id1.journal1.ij_flags = cpu_to_le32(flags); |
| 1190 | 1262 | ||
| 1263 | /* Increment recovery generation to indicate successful recovery */ | ||
| 1264 | ocfs2_bump_recovery_generation(fe); | ||
| 1265 | osb->slot_recovery_generations[slot_num] = | ||
| 1266 | ocfs2_get_recovery_generation(fe); | ||
| 1267 | |||
| 1191 | status = ocfs2_write_block(osb, bh, inode); | 1268 | status = ocfs2_write_block(osb, bh, inode); |
| 1192 | if (status < 0) | 1269 | if (status < 0) |
| 1193 | mlog_errno(status); | 1270 | mlog_errno(status); |
| @@ -1252,6 +1329,13 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, | |||
| 1252 | 1329 | ||
| 1253 | status = ocfs2_replay_journal(osb, node_num, slot_num); | 1330 | status = ocfs2_replay_journal(osb, node_num, slot_num); |
| 1254 | if (status < 0) { | 1331 | if (status < 0) { |
| 1332 | if (status == -EBUSY) { | ||
| 1333 | mlog(0, "Skipping recovery for slot %u (node %u) " | ||
| 1334 | "as another node has recovered it\n", slot_num, | ||
| 1335 | node_num); | ||
| 1336 | status = 0; | ||
| 1337 | goto done; | ||
| 1338 | } | ||
| 1255 | mlog_errno(status); | 1339 | mlog_errno(status); |
| 1256 | goto done; | 1340 | goto done; |
| 1257 | } | 1341 | } |
| @@ -1334,21 +1418,46 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) | |||
| 1334 | { | 1418 | { |
| 1335 | unsigned int node_num; | 1419 | unsigned int node_num; |
| 1336 | int status, i; | 1420 | int status, i; |
| 1421 | u32 gen; | ||
| 1422 | struct buffer_head *bh = NULL; | ||
| 1423 | struct ocfs2_dinode *di; | ||
| 1337 | 1424 | ||
| 1338 | /* This is called with the super block cluster lock, so we | 1425 | /* This is called with the super block cluster lock, so we |
| 1339 | * know that the slot map can't change underneath us. */ | 1426 | * know that the slot map can't change underneath us. */ |
| 1340 | 1427 | ||
| 1341 | spin_lock(&osb->osb_lock); | ||
| 1342 | for (i = 0; i < osb->max_slots; i++) { | 1428 | for (i = 0; i < osb->max_slots; i++) { |
| 1343 | if (i == osb->slot_num) | 1429 | /* Read journal inode to get the recovery generation */ |
| 1430 | status = ocfs2_read_journal_inode(osb, i, &bh, NULL); | ||
| 1431 | if (status) { | ||
| 1432 | mlog_errno(status); | ||
| 1433 | goto bail; | ||
| 1434 | } | ||
| 1435 | di = (struct ocfs2_dinode *)bh->b_data; | ||
| 1436 | gen = ocfs2_get_recovery_generation(di); | ||
| 1437 | brelse(bh); | ||
| 1438 | bh = NULL; | ||
| 1439 | |||
| 1440 | spin_lock(&osb->osb_lock); | ||
| 1441 | osb->slot_recovery_generations[i] = gen; | ||
| 1442 | |||
| 1443 | mlog(0, "Slot %u recovery generation is %u\n", i, | ||
| 1444 | osb->slot_recovery_generations[i]); | ||
| 1445 | |||
| 1446 | if (i == osb->slot_num) { | ||
| 1447 | spin_unlock(&osb->osb_lock); | ||
| 1344 | continue; | 1448 | continue; |
| 1449 | } | ||
| 1345 | 1450 | ||
| 1346 | status = ocfs2_slot_to_node_num_locked(osb, i, &node_num); | 1451 | status = ocfs2_slot_to_node_num_locked(osb, i, &node_num); |
| 1347 | if (status == -ENOENT) | 1452 | if (status == -ENOENT) { |
| 1453 | spin_unlock(&osb->osb_lock); | ||
| 1348 | continue; | 1454 | continue; |
| 1455 | } | ||
| 1349 | 1456 | ||
| 1350 | if (__ocfs2_recovery_map_test(osb, node_num)) | 1457 | if (__ocfs2_recovery_map_test(osb, node_num)) { |
| 1458 | spin_unlock(&osb->osb_lock); | ||
| 1351 | continue; | 1459 | continue; |
| 1460 | } | ||
| 1352 | spin_unlock(&osb->osb_lock); | 1461 | spin_unlock(&osb->osb_lock); |
| 1353 | 1462 | ||
| 1354 | /* Ok, we have a slot occupied by another node which | 1463 | /* Ok, we have a slot occupied by another node which |
| @@ -1364,10 +1473,7 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) | |||
| 1364 | mlog_errno(status); | 1473 | mlog_errno(status); |
| 1365 | goto bail; | 1474 | goto bail; |
| 1366 | } | 1475 | } |
| 1367 | |||
| 1368 | spin_lock(&osb->osb_lock); | ||
| 1369 | } | 1476 | } |
| 1370 | spin_unlock(&osb->osb_lock); | ||
| 1371 | 1477 | ||
| 1372 | status = 0; | 1478 | status = 0; |
| 1373 | bail: | 1479 | bail: |
| @@ -1603,49 +1709,41 @@ static int ocfs2_commit_thread(void *arg) | |||
| 1603 | return 0; | 1709 | return 0; |
| 1604 | } | 1710 | } |
| 1605 | 1711 | ||
| 1606 | /* Look for a dirty journal without taking any cluster locks. Used for | 1712 | /* Reads all the journal inodes without taking any cluster locks. Used |
| 1607 | * hard readonly access to determine whether the file system journals | 1713 | * for hard readonly access to determine whether any journal requires |
| 1608 | * require recovery. */ | 1714 | * recovery. Also used to refresh the recovery generation numbers after |
| 1715 | * a journal has been recovered by another node. | ||
| 1716 | */ | ||
| 1609 | int ocfs2_check_journals_nolocks(struct ocfs2_super *osb) | 1717 | int ocfs2_check_journals_nolocks(struct ocfs2_super *osb) |
| 1610 | { | 1718 | { |
| 1611 | int ret = 0; | 1719 | int ret = 0; |
| 1612 | unsigned int slot; | 1720 | unsigned int slot; |
| 1613 | struct buffer_head *di_bh; | 1721 | struct buffer_head *di_bh = NULL; |
| 1614 | struct ocfs2_dinode *di; | 1722 | struct ocfs2_dinode *di; |
| 1615 | struct inode *journal = NULL; | 1723 | int journal_dirty = 0; |
| 1616 | 1724 | ||
| 1617 | for(slot = 0; slot < osb->max_slots; slot++) { | 1725 | for(slot = 0; slot < osb->max_slots; slot++) { |
| 1618 | journal = ocfs2_get_system_file_inode(osb, | 1726 | ret = ocfs2_read_journal_inode(osb, slot, &di_bh, NULL); |
| 1619 | JOURNAL_SYSTEM_INODE, | 1727 | if (ret) { |
| 1620 | slot); | ||
| 1621 | if (!journal || is_bad_inode(journal)) { | ||
| 1622 | ret = -EACCES; | ||
| 1623 | mlog_errno(ret); | ||
| 1624 | goto out; | ||
| 1625 | } | ||
| 1626 | |||
| 1627 | di_bh = NULL; | ||
| 1628 | ret = ocfs2_read_block(osb, OCFS2_I(journal)->ip_blkno, &di_bh, | ||
| 1629 | 0, journal); | ||
| 1630 | if (ret < 0) { | ||
| 1631 | mlog_errno(ret); | 1728 | mlog_errno(ret); |
| 1632 | goto out; | 1729 | goto out; |
| 1633 | } | 1730 | } |
| 1634 | 1731 | ||
| 1635 | di = (struct ocfs2_dinode *) di_bh->b_data; | 1732 | di = (struct ocfs2_dinode *) di_bh->b_data; |
| 1636 | 1733 | ||
| 1734 | osb->slot_recovery_generations[slot] = | ||
| 1735 | ocfs2_get_recovery_generation(di); | ||
| 1736 | |||
| 1637 | if (le32_to_cpu(di->id1.journal1.ij_flags) & | 1737 | if (le32_to_cpu(di->id1.journal1.ij_flags) & |
| 1638 | OCFS2_JOURNAL_DIRTY_FL) | 1738 | OCFS2_JOURNAL_DIRTY_FL) |
| 1639 | ret = -EROFS; | 1739 | journal_dirty = 1; |
| 1640 | 1740 | ||
| 1641 | brelse(di_bh); | 1741 | brelse(di_bh); |
| 1642 | if (ret) | 1742 | di_bh = NULL; |
| 1643 | break; | ||
| 1644 | } | 1743 | } |
| 1645 | 1744 | ||
| 1646 | out: | 1745 | out: |
| 1647 | if (journal) | 1746 | if (journal_dirty) |
| 1648 | iput(journal); | 1747 | ret = -EROFS; |
| 1649 | |||
| 1650 | return ret; | 1748 | return ret; |
| 1651 | } | 1749 | } |
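For reference, a minimal sketch of how the recovery generation introduced by this patch might be read and bumped. Only the on-disk field is taken from the patch; the helper names below are invented for illustration and the real ocfs2 accessors may differ.

	/* Sketch only: ij_recovery_generation is a __le32 on disk (see the
	 * ocfs2_fs.h hunk below), so the accessor is assumed to be a plain
	 * byte-order conversion.  Assumes the ocfs2 headers are included. */
	static inline u32 example_get_recovery_generation(struct ocfs2_dinode *di)
	{
		return le32_to_cpu(di->id1.journal1.ij_recovery_generation);
	}

	/* A node that replays another slot's journal would then be expected to
	 * bump the generation so the other nodes can notice the replay. */
	static inline void example_bump_recovery_generation(struct ocfs2_dinode *di)
	{
		le32_add_cpu(&di->id1.journal1.ij_recovery_generation, 1);
	}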
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index db82be2532ed..2178ebffa05f 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h | |||
| @@ -161,7 +161,8 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, | |||
| 161 | void ocfs2_journal_shutdown(struct ocfs2_super *osb); | 161 | void ocfs2_journal_shutdown(struct ocfs2_super *osb); |
| 162 | int ocfs2_journal_wipe(struct ocfs2_journal *journal, | 162 | int ocfs2_journal_wipe(struct ocfs2_journal *journal, |
| 163 | int full); | 163 | int full); |
| 164 | int ocfs2_journal_load(struct ocfs2_journal *journal, int local); | 164 | int ocfs2_journal_load(struct ocfs2_journal *journal, int local, |
| 165 | int replayed); | ||
| 165 | int ocfs2_check_journals_nolocks(struct ocfs2_super *osb); | 166 | int ocfs2_check_journals_nolocks(struct ocfs2_super *osb); |
| 166 | void ocfs2_recovery_thread(struct ocfs2_super *osb, | 167 | void ocfs2_recovery_thread(struct ocfs2_super *osb, |
| 167 | int node_num); | 168 | int node_num); |
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 1cb814be8ef1..7f625f2b1117 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h | |||
| @@ -204,6 +204,8 @@ struct ocfs2_super | |||
| 204 | 204 | ||
| 205 | struct ocfs2_slot_info *slot_info; | 205 | struct ocfs2_slot_info *slot_info; |
| 206 | 206 | ||
| 207 | u32 *slot_recovery_generations; | ||
| 208 | |||
| 207 | spinlock_t node_map_lock; | 209 | spinlock_t node_map_lock; |
| 208 | 210 | ||
| 209 | u64 root_blkno; | 211 | u64 root_blkno; |
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 3f1945177629..4f619850ccf7 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h | |||
| @@ -660,7 +660,10 @@ struct ocfs2_dinode { | |||
| 660 | struct { /* Info for journal system | 660 | struct { /* Info for journal system |
| 661 | inodes */ | 661 | inodes */ |
| 662 | __le32 ij_flags; /* Mounted, version, etc. */ | 662 | __le32 ij_flags; /* Mounted, version, etc. */ |
| 663 | __le32 ij_pad; | 663 | __le32 ij_recovery_generation; /* Incremented when the |
| 664 | journal is recovered | ||
| 665 | after an unclean | ||
| 666 | shutdown */ | ||
| 664 | } journal1; | 667 | } journal1; |
| 665 | } id1; /* Inode type dependant 1 */ | 668 | } id1; /* Inode type dependant 1 */ |
| 666 | /*C0*/ union { | 669 | /*C0*/ union { |
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 10e149ae5e3a..07f348b8d721 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c | |||
| @@ -97,13 +97,14 @@ static int ocfs2_stack_driver_request(const char *stack_name, | |||
| 97 | goto out; | 97 | goto out; |
| 98 | } | 98 | } |
| 99 | 99 | ||
| 100 | /* Ok, the stack is pinned */ | ||
| 101 | p->sp_count++; | ||
| 102 | active_stack = p; | 100 | active_stack = p; |
| 103 | |||
| 104 | rc = 0; | 101 | rc = 0; |
| 105 | 102 | ||
| 106 | out: | 103 | out: |
| 104 | /* If we found it, pin it */ | ||
| 105 | if (!rc) | ||
| 106 | active_stack->sp_count++; | ||
| 107 | |||
| 107 | spin_unlock(&ocfs2_stack_lock); | 108 | spin_unlock(&ocfs2_stack_lock); |
| 108 | return rc; | 109 | return rc; |
| 109 | } | 110 | } |
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index ccecfe5094fa..88255d3f52b4 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c | |||
| @@ -1118,7 +1118,7 @@ bail: | |||
| 1118 | return status; | 1118 | return status; |
| 1119 | } | 1119 | } |
| 1120 | 1120 | ||
| 1121 | static void ocfs2_inode_init_once(struct kmem_cache *cachep, void *data) | 1121 | static void ocfs2_inode_init_once(void *data) |
| 1122 | { | 1122 | { |
| 1123 | struct ocfs2_inode_info *oi = data; | 1123 | struct ocfs2_inode_info *oi = data; |
| 1124 | 1124 | ||
| @@ -1442,6 +1442,15 @@ static int ocfs2_initialize_super(struct super_block *sb, | |||
| 1442 | } | 1442 | } |
| 1443 | mlog(0, "max_slots for this device: %u\n", osb->max_slots); | 1443 | mlog(0, "max_slots for this device: %u\n", osb->max_slots); |
| 1444 | 1444 | ||
| 1445 | osb->slot_recovery_generations = | ||
| 1446 | kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations), | ||
| 1447 | GFP_KERNEL); | ||
| 1448 | if (!osb->slot_recovery_generations) { | ||
| 1449 | status = -ENOMEM; | ||
| 1450 | mlog_errno(status); | ||
| 1451 | goto bail; | ||
| 1452 | } | ||
| 1453 | |||
| 1445 | init_waitqueue_head(&osb->osb_wipe_event); | 1454 | init_waitqueue_head(&osb->osb_wipe_event); |
| 1446 | osb->osb_orphan_wipes = kcalloc(osb->max_slots, | 1455 | osb->osb_orphan_wipes = kcalloc(osb->max_slots, |
| 1447 | sizeof(*osb->osb_orphan_wipes), | 1456 | sizeof(*osb->osb_orphan_wipes), |
| @@ -1703,7 +1712,7 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) | |||
| 1703 | local = ocfs2_mount_local(osb); | 1712 | local = ocfs2_mount_local(osb); |
| 1704 | 1713 | ||
| 1705 | /* will play back anything left in the journal. */ | 1714 | /* will play back anything left in the journal. */ |
| 1706 | status = ocfs2_journal_load(osb->journal, local); | 1715 | status = ocfs2_journal_load(osb->journal, local, dirty); |
| 1707 | if (status < 0) { | 1716 | if (status < 0) { |
| 1708 | mlog(ML_ERROR, "ocfs2 journal load failed! %d\n", status); | 1717 | mlog(ML_ERROR, "ocfs2 journal load failed! %d\n", status); |
| 1709 | goto finally; | 1718 | goto finally; |
| @@ -1768,6 +1777,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) | |||
| 1768 | ocfs2_free_slot_info(osb); | 1777 | ocfs2_free_slot_info(osb); |
| 1769 | 1778 | ||
| 1770 | kfree(osb->osb_orphan_wipes); | 1779 | kfree(osb->osb_orphan_wipes); |
| 1780 | kfree(osb->slot_recovery_generations); | ||
| 1771 | /* FIXME | 1781 | /* FIXME |
| 1772 | * This belongs in journal shutdown, but because we have to | 1782 | * This belongs in journal shutdown, but because we have to |
| 1773 | * allocate osb->journal at the start of ocfs2_initalize_osb(), | 1783 | * allocate osb->journal at the start of ocfs2_initalize_osb(), |
diff --git a/fs/omfs/Makefile b/fs/omfs/Makefile new file mode 100644 index 000000000000..8b82b63f1129 --- /dev/null +++ b/fs/omfs/Makefile | |||
| @@ -0,0 +1,4 @@ | |||
| 1 | |||
| 2 | obj-$(CONFIG_OMFS_FS) += omfs.o | ||
| 3 | |||
| 4 | omfs-y := bitmap.o dir.o file.o inode.o | ||
diff --git a/fs/omfs/bitmap.c b/fs/omfs/bitmap.c new file mode 100644 index 000000000000..e1c0ec0ae989 --- /dev/null +++ b/fs/omfs/bitmap.c | |||
| @@ -0,0 +1,193 @@ | |||
| 1 | #include <linux/kernel.h> | ||
| 2 | #include <linux/fs.h> | ||
| 3 | #include <linux/buffer_head.h> | ||
| 4 | #include <asm/div64.h> | ||
| 5 | #include "omfs.h" | ||
| 6 | |||
| 7 | unsigned long omfs_count_free(struct super_block *sb) | ||
| 8 | { | ||
| 9 | unsigned int i; | ||
| 10 | unsigned long sum = 0; | ||
| 11 | struct omfs_sb_info *sbi = OMFS_SB(sb); | ||
| 12 | int nbits = sb->s_blocksize * 8; | ||
| 13 | |||
| 14 | for (i = 0; i < sbi->s_imap_size; i++) | ||
| 15 | sum += nbits - bitmap_weight(sbi->s_imap[i], nbits); | ||
| 16 | |||
| 17 | return sum; | ||
| 18 | } | ||
| 19 | |||
| 20 | /* | ||
| 21 | * Counts the run of zero bits starting at bit up to max. | ||
| 22 | * It handles the case where a run might spill over a buffer. | ||
| 23 | * Called with bitmap lock. | ||
| 24 | */ | ||
| 25 | static int count_run(unsigned long **addr, int nbits, | ||
| 26 | int addrlen, int bit, int max) | ||
| 27 | { | ||
| 28 | int count = 0; | ||
| 29 | int x; | ||
| 30 | |||
| 31 | for (; addrlen > 0; addrlen--, addr++) { | ||
| 32 | x = find_next_bit(*addr, nbits, bit); | ||
| 33 | count += x - bit; | ||
| 34 | |||
| 35 | if (x < nbits || count > max) | ||
| 36 | return min(count, max); | ||
| 37 | |||
| 38 | bit = 0; | ||
| 39 | } | ||
| 40 | return min(count, max); | ||
| 41 | } | ||
| 42 | |||
| 43 | /* | ||
| 44 | * Sets or clears the run of count bits starting with bit. | ||
| 45 | * Called with bitmap lock. | ||
| 46 | */ | ||
| 47 | static int set_run(struct super_block *sb, int map, | ||
| 48 | int nbits, int bit, int count, int set) | ||
| 49 | { | ||
| 50 | int i; | ||
| 51 | int err; | ||
| 52 | struct buffer_head *bh; | ||
| 53 | struct omfs_sb_info *sbi = OMFS_SB(sb); | ||
| 54 | |||
| 55 | err = -ENOMEM; | ||
| 56 | bh = sb_bread(sb, clus_to_blk(sbi, sbi->s_bitmap_ino) + map); | ||
| 57 | if (!bh) | ||
| 58 | goto out; | ||
| 59 | |||
| 60 | for (i = 0; i < count; i++, bit++) { | ||
| 61 | if (bit >= nbits) { | ||
| 62 | bit = 0; | ||
| 63 | map++; | ||
| 64 | |||
| 65 | mark_buffer_dirty(bh); | ||
| 66 | brelse(bh); | ||
| 67 | bh = sb_bread(sb, | ||
| 68 | clus_to_blk(sbi, sbi->s_bitmap_ino) + map); | ||
| 69 | if (!bh) | ||
| 70 | goto out; | ||
| 71 | } | ||
| 72 | if (set) { | ||
| 73 | set_bit(bit, sbi->s_imap[map]); | ||
| 74 | set_bit(bit, (unsigned long *)bh->b_data); | ||
| 75 | } else { | ||
| 76 | clear_bit(bit, sbi->s_imap[map]); | ||
| 77 | clear_bit(bit, (unsigned long *)bh->b_data); | ||
| 78 | } | ||
| 79 | } | ||
| 80 | mark_buffer_dirty(bh); | ||
| 81 | brelse(bh); | ||
| 82 | err = 0; | ||
| 83 | out: | ||
| 84 | return err; | ||
| 85 | } | ||
| 86 | |||
| 87 | /* | ||
| 88 | * Tries to allocate exactly one block. Returns true if successful. | ||
| 89 | */ | ||
| 90 | int omfs_allocate_block(struct super_block *sb, u64 block) | ||
| 91 | { | ||
| 92 | struct buffer_head *bh; | ||
| 93 | struct omfs_sb_info *sbi = OMFS_SB(sb); | ||
| 94 | int bits_per_entry = 8 * sb->s_blocksize; | ||
| 95 | unsigned int map, bit; | ||
| 96 | int ret = 0; | ||
| 97 | u64 tmp; | ||
| 98 | |||
| 99 | tmp = block; | ||
| 100 | bit = do_div(tmp, bits_per_entry); | ||
| 101 | map = tmp; | ||
| 102 | |||
| 103 | mutex_lock(&sbi->s_bitmap_lock); | ||
| 104 | if (map >= sbi->s_imap_size || test_and_set_bit(bit, sbi->s_imap[map])) | ||
| 105 | goto out; | ||
| 106 | |||
| 107 | if (sbi->s_bitmap_ino > 0) { | ||
| 108 | bh = sb_bread(sb, clus_to_blk(sbi, sbi->s_bitmap_ino) + map); | ||
| 109 | if (!bh) | ||
| 110 | goto out; | ||
| 111 | |||
| 112 | set_bit(bit, (unsigned long *)bh->b_data); | ||
| 113 | mark_buffer_dirty(bh); | ||
| 114 | brelse(bh); | ||
| 115 | } | ||
| 116 | ret = 1; | ||
| 117 | out: | ||
| 118 | mutex_unlock(&sbi->s_bitmap_lock); | ||
| 119 | return ret; | ||
| 120 | } | ||
| 121 | |||
| 122 | |||
| 123 | /* | ||
| 124 | * Tries to allocate a set of blocks. The request size depends on the | ||
| 125 | * type: for inodes, we must allocate sbi->s_mirrors blocks, and for file | ||
| 126 | * blocks, we try to allocate sbi->s_clustersize, but can always get away | ||
| 127 | * with just one block. | ||
| 128 | */ | ||
| 129 | int omfs_allocate_range(struct super_block *sb, | ||
| 130 | int min_request, | ||
| 131 | int max_request, | ||
| 132 | u64 *return_block, | ||
| 133 | int *return_size) | ||
| 134 | { | ||
| 135 | struct omfs_sb_info *sbi = OMFS_SB(sb); | ||
| 136 | int bits_per_entry = 8 * sb->s_blocksize; | ||
| 137 | int ret = 0; | ||
| 138 | int i, run, bit; | ||
| 139 | |||
| 140 | mutex_lock(&sbi->s_bitmap_lock); | ||
| 141 | for (i = 0; i < sbi->s_imap_size; i++) { | ||
| 142 | bit = 0; | ||
| 143 | while (bit < bits_per_entry) { | ||
| 144 | bit = find_next_zero_bit(sbi->s_imap[i], bits_per_entry, | ||
| 145 | bit); | ||
| 146 | |||
| 147 | if (bit == bits_per_entry) | ||
| 148 | break; | ||
| 149 | |||
| 150 | run = count_run(&sbi->s_imap[i], bits_per_entry, | ||
| 151 | sbi->s_imap_size-i, bit, max_request); | ||
| 152 | |||
| 153 | if (run >= min_request) | ||
| 154 | goto found; | ||
| 155 | bit += run; | ||
| 156 | } | ||
| 157 | } | ||
| 158 | ret = -ENOSPC; | ||
| 159 | goto out; | ||
| 160 | |||
| 161 | found: | ||
| 162 | *return_block = i * bits_per_entry + bit; | ||
| 163 | *return_size = run; | ||
| 164 | ret = set_run(sb, i, bits_per_entry, bit, run, 1); | ||
| 165 | |||
| 166 | out: | ||
| 167 | mutex_unlock(&sbi->s_bitmap_lock); | ||
| 168 | return ret; | ||
| 169 | } | ||
| 170 | |||
| 171 | /* | ||
| 172 | * Clears count bits starting at a given block. | ||
| 173 | */ | ||
| 174 | int omfs_clear_range(struct super_block *sb, u64 block, int count) | ||
| 175 | { | ||
| 176 | struct omfs_sb_info *sbi = OMFS_SB(sb); | ||
| 177 | int bits_per_entry = 8 * sb->s_blocksize; | ||
| 178 | u64 tmp; | ||
| 179 | unsigned int map, bit; | ||
| 180 | int ret; | ||
| 181 | |||
| 182 | tmp = block; | ||
| 183 | bit = do_div(tmp, bits_per_entry); | ||
| 184 | map = tmp; | ||
| 185 | |||
| 186 | if (map >= sbi->s_imap_size) | ||
| 187 | return 0; | ||
| 188 | |||
| 189 | mutex_lock(&sbi->s_bitmap_lock); | ||
| 190 | ret = set_run(sb, map, bits_per_entry, bit, count, 0); | ||
| 191 | mutex_unlock(&sbi->s_bitmap_lock); | ||
| 192 | return ret; | ||
| 193 | } | ||
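A worked sketch of the do_div() split used by the allocator above; the numbers are illustrative only and the helper name is invented.

	#include <linux/kernel.h>
	#include <asm/div64.h>

	/* Sketch: with 512-byte blocks there are 512 * 8 = 4096 bits per bitmap
	 * buffer, so block 5000 lands in buffer 1 (the quotient left in tmp) at
	 * bit 904 (the remainder returned by do_div). */
	static void example_split_block(u64 block, int blocksize)
	{
		int bits_per_entry = blocksize * 8;
		u64 tmp = block;
		unsigned int bit = do_div(tmp, bits_per_entry);
		unsigned int map = tmp;

		/* block = 5000, blocksize = 512  =>  map == 1, bit == 904 */
		(void)map;
		(void)bit;
	}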
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c new file mode 100644 index 000000000000..c0757e998876 --- /dev/null +++ b/fs/omfs/dir.c | |||
| @@ -0,0 +1,504 @@ | |||
| 1 | /* | ||
| 2 | * OMFS (as used by RIO Karma) directory operations. | ||
| 3 | * Copyright (C) 2005 Bob Copeland <me@bobcopeland.com> | ||
| 4 | * Released under GPL v2. | ||
| 5 | */ | ||
| 6 | |||
| 7 | #include <linux/fs.h> | ||
| 8 | #include <linux/ctype.h> | ||
| 9 | #include <linux/buffer_head.h> | ||
| 10 | #include "omfs.h" | ||
| 11 | |||
| 12 | static int omfs_hash(const char *name, int namelen, int mod) | ||
| 13 | { | ||
| 14 | int i, hash = 0; | ||
| 15 | for (i = 0; i < namelen; i++) | ||
| 16 | hash ^= tolower(name[i]) << (i % 24); | ||
| 17 | return hash % mod; | ||
| 18 | } | ||
| 19 | |||
| 20 | /* | ||
| 21 | * Finds the bucket for a given name and reads the containing block; | ||
| 22 | * *ofs is set to the offset of the first list entry. | ||
| 23 | */ | ||
| 24 | static struct buffer_head *omfs_get_bucket(struct inode *dir, | ||
| 25 | const char *name, int namelen, int *ofs) | ||
| 26 | { | ||
| 27 | int nbuckets = (dir->i_size - OMFS_DIR_START)/8; | ||
| 28 | int block = clus_to_blk(OMFS_SB(dir->i_sb), dir->i_ino); | ||
| 29 | int bucket = omfs_hash(name, namelen, nbuckets); | ||
| 30 | |||
| 31 | *ofs = OMFS_DIR_START + bucket * 8; | ||
| 32 | return sb_bread(dir->i_sb, block); | ||
| 33 | } | ||
| 34 | |||
| 35 | static struct buffer_head *omfs_scan_list(struct inode *dir, u64 block, | ||
| 36 | const char *name, int namelen, | ||
| 37 | u64 *prev_block) | ||
| 38 | { | ||
| 39 | struct buffer_head *bh; | ||
| 40 | struct omfs_inode *oi; | ||
| 41 | int err = -ENOENT; | ||
| 42 | *prev_block = ~0; | ||
| 43 | |||
| 44 | while (block != ~0) { | ||
| 45 | bh = sb_bread(dir->i_sb, | ||
| 46 | clus_to_blk(OMFS_SB(dir->i_sb), block)); | ||
| 47 | if (!bh) { | ||
| 48 | err = -EIO; | ||
| 49 | goto err; | ||
| 50 | } | ||
| 51 | |||
| 52 | oi = (struct omfs_inode *) bh->b_data; | ||
| 53 | if (omfs_is_bad(OMFS_SB(dir->i_sb), &oi->i_head, block)) { | ||
| 54 | brelse(bh); | ||
| 55 | goto err; | ||
| 56 | } | ||
| 57 | |||
| 58 | if (strncmp(oi->i_name, name, namelen) == 0) | ||
| 59 | return bh; | ||
| 60 | |||
| 61 | *prev_block = block; | ||
| 62 | block = be64_to_cpu(oi->i_sibling); | ||
| 63 | brelse(bh); | ||
| 64 | } | ||
| 65 | err: | ||
| 66 | return ERR_PTR(err); | ||
| 67 | } | ||
| 68 | |||
| 69 | static struct buffer_head *omfs_find_entry(struct inode *dir, | ||
| 70 | const char *name, int namelen) | ||
| 71 | { | ||
| 72 | struct buffer_head *bh; | ||
| 73 | int ofs; | ||
| 74 | u64 block, dummy; | ||
| 75 | |||
| 76 | bh = omfs_get_bucket(dir, name, namelen, &ofs); | ||
| 77 | if (!bh) | ||
| 78 | return ERR_PTR(-EIO); | ||
| 79 | |||
| 80 | block = be64_to_cpu(*((__be64 *) &bh->b_data[ofs])); | ||
| 81 | brelse(bh); | ||
| 82 | |||
| 83 | return omfs_scan_list(dir, block, name, namelen, &dummy); | ||
| 84 | } | ||
| 85 | |||
| 86 | int omfs_make_empty(struct inode *inode, struct super_block *sb) | ||
| 87 | { | ||
| 88 | struct omfs_sb_info *sbi = OMFS_SB(sb); | ||
| 89 | int block = clus_to_blk(sbi, inode->i_ino); | ||
| 90 | struct buffer_head *bh; | ||
| 91 | struct omfs_inode *oi; | ||
| 92 | |||
| 93 | bh = sb_bread(sb, block); | ||
| 94 | if (!bh) | ||
| 95 | return -ENOMEM; | ||
| 96 | |||
| 97 | memset(bh->b_data, 0, sizeof(struct omfs_inode)); | ||
| 98 | |||
| 99 | if (inode->i_mode & S_IFDIR) { | ||
| 100 | memset(&bh->b_data[OMFS_DIR_START], 0xff, | ||
| 101 | sbi->s_sys_blocksize - OMFS_DIR_START); | ||
| 102 | } else | ||
| 103 | omfs_make_empty_table(bh, OMFS_EXTENT_START); | ||
| 104 | |||
| 105 | oi = (struct omfs_inode *) bh->b_data; | ||
| 106 | oi->i_head.h_self = cpu_to_be64(inode->i_ino); | ||
| 107 | oi->i_sibling = ~cpu_to_be64(0ULL); | ||
| 108 | |||
| 109 | mark_buffer_dirty(bh); | ||
| 110 | brelse(bh); | ||
| 111 | return 0; | ||
| 112 | } | ||
| 113 | |||
| 114 | static int omfs_add_link(struct dentry *dentry, struct inode *inode) | ||
| 115 | { | ||
| 116 | struct inode *dir = dentry->d_parent->d_inode; | ||
| 117 | const char *name = dentry->d_name.name; | ||
| 118 | int namelen = dentry->d_name.len; | ||
| 119 | struct omfs_inode *oi; | ||
| 120 | struct buffer_head *bh; | ||
| 121 | u64 block; | ||
| 122 | __be64 *entry; | ||
| 123 | int ofs; | ||
| 124 | |||
| 125 | /* just prepend to head of queue in proper bucket */ | ||
| 126 | bh = omfs_get_bucket(dir, name, namelen, &ofs); | ||
| 127 | if (!bh) | ||
| 128 | goto out; | ||
| 129 | |||
| 130 | entry = (__be64 *) &bh->b_data[ofs]; | ||
| 131 | block = be64_to_cpu(*entry); | ||
| 132 | *entry = cpu_to_be64(inode->i_ino); | ||
| 133 | mark_buffer_dirty(bh); | ||
| 134 | brelse(bh); | ||
| 135 | |||
| 136 | /* now set the sibling and parent pointers on the new inode */ | ||
| 137 | bh = sb_bread(dir->i_sb, clus_to_blk(OMFS_SB(dir->i_sb), inode->i_ino)); | ||
| 138 | if (!bh) | ||
| 139 | goto out; | ||
| 140 | |||
| 141 | oi = (struct omfs_inode *) bh->b_data; | ||
| 142 | memcpy(oi->i_name, name, namelen); | ||
| 143 | memset(oi->i_name + namelen, 0, OMFS_NAMELEN - namelen); | ||
| 144 | oi->i_sibling = cpu_to_be64(block); | ||
| 145 | oi->i_parent = cpu_to_be64(dir->i_ino); | ||
| 146 | mark_buffer_dirty(bh); | ||
| 147 | brelse(bh); | ||
| 148 | |||
| 149 | dir->i_ctime = CURRENT_TIME_SEC; | ||
| 150 | |||
| 151 | /* mark affected inodes dirty to rebuild checksums */ | ||
| 152 | mark_inode_dirty(dir); | ||
| 153 | mark_inode_dirty(inode); | ||
| 154 | return 0; | ||
| 155 | out: | ||
| 156 | return -ENOMEM; | ||
| 157 | } | ||
| 158 | |||
| 159 | static int omfs_delete_entry(struct dentry *dentry) | ||
| 160 | { | ||
| 161 | struct inode *dir = dentry->d_parent->d_inode; | ||
| 162 | struct inode *dirty; | ||
| 163 | const char *name = dentry->d_name.name; | ||
| 164 | int namelen = dentry->d_name.len; | ||
| 165 | struct omfs_inode *oi; | ||
| 166 | struct buffer_head *bh, *bh2; | ||
| 167 | __be64 *entry, next; | ||
| 168 | u64 block, prev; | ||
| 169 | int ofs; | ||
| 170 | int err = -ENOMEM; | ||
| 171 | |||
| 172 | /* delete the proper node in the bucket's linked list */ | ||
| 173 | bh = omfs_get_bucket(dir, name, namelen, &ofs); | ||
| 174 | if (!bh) | ||
| 175 | goto out; | ||
| 176 | |||
| 177 | entry = (__be64 *) &bh->b_data[ofs]; | ||
| 178 | block = be64_to_cpu(*entry); | ||
| 179 | |||
| 180 | bh2 = omfs_scan_list(dir, block, name, namelen, &prev); | ||
| 181 | if (IS_ERR(bh2)) { | ||
| 182 | err = PTR_ERR(bh2); | ||
| 183 | goto out_free_bh; | ||
| 184 | } | ||
| 185 | |||
| 186 | oi = (struct omfs_inode *) bh2->b_data; | ||
| 187 | next = oi->i_sibling; | ||
| 188 | brelse(bh2); | ||
| 189 | |||
| 190 | if (prev != ~0) { | ||
| 191 | /* found in middle of list, get list ptr */ | ||
| 192 | brelse(bh); | ||
| 193 | bh = sb_bread(dir->i_sb, | ||
| 194 | clus_to_blk(OMFS_SB(dir->i_sb), prev)); | ||
| 195 | if (!bh) | ||
| 196 | goto out; | ||
| 197 | |||
| 198 | oi = (struct omfs_inode *) bh->b_data; | ||
| 199 | entry = &oi->i_sibling; | ||
| 200 | } | ||
| 201 | |||
| 202 | *entry = next; | ||
| 203 | mark_buffer_dirty(bh); | ||
| 204 | |||
| 205 | if (prev != ~0) { | ||
| 206 | dirty = omfs_iget(dir->i_sb, prev); | ||
| 207 | if (!IS_ERR(dirty)) { | ||
| 208 | mark_inode_dirty(dirty); | ||
| 209 | iput(dirty); | ||
| 210 | } | ||
| 211 | } | ||
| 212 | |||
| 213 | err = 0; | ||
| 214 | out_free_bh: | ||
| 215 | brelse(bh); | ||
| 216 | out: | ||
| 217 | return err; | ||
| 218 | } | ||
| 219 | |||
| 220 | static int omfs_dir_is_empty(struct inode *inode) | ||
| 221 | { | ||
| 222 | int nbuckets = (inode->i_size - OMFS_DIR_START) / 8; | ||
| 223 | struct buffer_head *bh; | ||
| 224 | u64 *ptr; | ||
| 225 | int i; | ||
| 226 | |||
| 227 | bh = sb_bread(inode->i_sb, clus_to_blk(OMFS_SB(inode->i_sb), | ||
| 228 | inode->i_ino)); | ||
| 229 | |||
| 230 | if (!bh) | ||
| 231 | return 0; | ||
| 232 | |||
| 233 | ptr = (u64 *) &bh->b_data[OMFS_DIR_START]; | ||
| 234 | |||
| 235 | for (i = 0; i < nbuckets; i++, ptr++) | ||
| 236 | if (*ptr != ~0) | ||
| 237 | break; | ||
| 238 | |||
| 239 | brelse(bh); | ||
| 240 | return *ptr != ~0; | ||
| 241 | } | ||
| 242 | |||
| 243 | static int omfs_unlink(struct inode *dir, struct dentry *dentry) | ||
| 244 | { | ||
| 245 | int ret; | ||
| 246 | struct inode *inode = dentry->d_inode; | ||
| 247 | |||
| 248 | ret = omfs_delete_entry(dentry); | ||
| 249 | if (ret) | ||
| 250 | goto end_unlink; | ||
| 251 | |||
| 252 | inode_dec_link_count(inode); | ||
| 253 | mark_inode_dirty(dir); | ||
| 254 | |||
| 255 | end_unlink: | ||
| 256 | return ret; | ||
| 257 | } | ||
| 258 | |||
| 259 | static int omfs_rmdir(struct inode *dir, struct dentry *dentry) | ||
| 260 | { | ||
| 261 | int err = -ENOTEMPTY; | ||
| 262 | struct inode *inode = dentry->d_inode; | ||
| 263 | |||
| 264 | if (omfs_dir_is_empty(inode)) { | ||
| 265 | err = omfs_unlink(dir, dentry); | ||
| 266 | if (!err) | ||
| 267 | inode_dec_link_count(inode); | ||
| 268 | } | ||
| 269 | return err; | ||
| 270 | } | ||
| 271 | |||
| 272 | static int omfs_add_node(struct inode *dir, struct dentry *dentry, int mode) | ||
| 273 | { | ||
| 274 | int err; | ||
| 275 | struct inode *inode = omfs_new_inode(dir, mode); | ||
| 276 | |||
| 277 | if (IS_ERR(inode)) | ||
| 278 | return PTR_ERR(inode); | ||
| 279 | |||
| 280 | err = omfs_make_empty(inode, dir->i_sb); | ||
| 281 | if (err) | ||
| 282 | goto out_free_inode; | ||
| 283 | |||
| 284 | err = omfs_add_link(dentry, inode); | ||
| 285 | if (err) | ||
| 286 | goto out_free_inode; | ||
| 287 | |||
| 288 | d_instantiate(dentry, inode); | ||
| 289 | return 0; | ||
| 290 | |||
| 291 | out_free_inode: | ||
| 292 | iput(inode); | ||
| 293 | return err; | ||
| 294 | } | ||
| 295 | |||
| 296 | static int omfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) | ||
| 297 | { | ||
| 298 | return omfs_add_node(dir, dentry, mode | S_IFDIR); | ||
| 299 | } | ||
| 300 | |||
| 301 | static int omfs_create(struct inode *dir, struct dentry *dentry, int mode, | ||
| 302 | struct nameidata *nd) | ||
| 303 | { | ||
| 304 | return omfs_add_node(dir, dentry, mode | S_IFREG); | ||
| 305 | } | ||
| 306 | |||
| 307 | static struct dentry *omfs_lookup(struct inode *dir, struct dentry *dentry, | ||
| 308 | struct nameidata *nd) | ||
| 309 | { | ||
| 310 | struct buffer_head *bh; | ||
| 311 | struct inode *inode = NULL; | ||
| 312 | |||
| 313 | if (dentry->d_name.len > OMFS_NAMELEN) | ||
| 314 | return ERR_PTR(-ENAMETOOLONG); | ||
| 315 | |||
| 316 | bh = omfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len); | ||
| 317 | if (!IS_ERR(bh)) { | ||
| 318 | struct omfs_inode *oi = (struct omfs_inode *)bh->b_data; | ||
| 319 | ino_t ino = be64_to_cpu(oi->i_head.h_self); | ||
| 320 | brelse(bh); | ||
| 321 | inode = omfs_iget(dir->i_sb, ino); | ||
| 322 | if (IS_ERR(inode)) | ||
| 323 | return ERR_CAST(inode); | ||
| 324 | } | ||
| 325 | d_add(dentry, inode); | ||
| 326 | return NULL; | ||
| 327 | } | ||
| 328 | |||
| 329 | /* sanity check block's self pointer */ | ||
| 330 | int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header, | ||
| 331 | u64 fsblock) | ||
| 332 | { | ||
| 333 | int is_bad; | ||
| 334 | u64 ino = be64_to_cpu(header->h_self); | ||
| 335 | is_bad = ((ino != fsblock) || (ino < sbi->s_root_ino) || | ||
| 336 | (ino > sbi->s_num_blocks)); | ||
| 337 | |||
| 338 | if (is_bad) | ||
| 339 | printk(KERN_WARNING "omfs: bad hash chain detected\n"); | ||
| 340 | |||
| 341 | return is_bad; | ||
| 342 | } | ||
| 343 | |||
| 344 | static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir, | ||
| 345 | u64 fsblock, int hindex) | ||
| 346 | { | ||
| 347 | struct inode *dir = filp->f_dentry->d_inode; | ||
| 348 | struct buffer_head *bh; | ||
| 349 | struct omfs_inode *oi; | ||
| 350 | u64 self; | ||
| 351 | int res = 0; | ||
| 352 | unsigned char d_type; | ||
| 353 | |||
| 354 | /* follow chain in this bucket */ | ||
| 355 | while (fsblock != ~0) { | ||
| 356 | bh = sb_bread(dir->i_sb, clus_to_blk(OMFS_SB(dir->i_sb), | ||
| 357 | fsblock)); | ||
| 358 | if (!bh) | ||
| 359 | goto out; | ||
| 360 | |||
| 361 | oi = (struct omfs_inode *) bh->b_data; | ||
| 362 | if (omfs_is_bad(OMFS_SB(dir->i_sb), &oi->i_head, fsblock)) { | ||
| 363 | brelse(bh); | ||
| 364 | goto out; | ||
| 365 | } | ||
| 366 | |||
| 367 | self = fsblock; | ||
| 368 | fsblock = be64_to_cpu(oi->i_sibling); | ||
| 369 | |||
| 370 | /* skip visited nodes */ | ||
| 371 | if (hindex) { | ||
| 372 | hindex--; | ||
| 373 | brelse(bh); | ||
| 374 | continue; | ||
| 375 | } | ||
| 376 | |||
| 377 | d_type = (oi->i_type == OMFS_DIR) ? DT_DIR : DT_REG; | ||
| 378 | |||
| 379 | res = filldir(dirent, oi->i_name, strnlen(oi->i_name, | ||
| 380 | OMFS_NAMELEN), filp->f_pos, self, d_type); | ||
| 381 | if (res == 0) | ||
| 382 | filp->f_pos++; | ||
| 383 | brelse(bh); | ||
| 384 | } | ||
| 385 | out: | ||
| 386 | return res; | ||
| 387 | } | ||
| 388 | |||
| 389 | static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry, | ||
| 390 | struct inode *new_dir, struct dentry *new_dentry) | ||
| 391 | { | ||
| 392 | struct inode *new_inode = new_dentry->d_inode; | ||
| 393 | struct inode *old_inode = old_dentry->d_inode; | ||
| 394 | struct buffer_head *bh; | ||
| 395 | int is_dir; | ||
| 396 | int err; | ||
| 397 | |||
| 398 | is_dir = S_ISDIR(old_inode->i_mode); | ||
| 399 | |||
| 400 | if (new_inode) { | ||
| 401 | /* overwriting existing file/dir */ | ||
| 402 | err = -ENOTEMPTY; | ||
| 403 | if (is_dir && !omfs_dir_is_empty(new_inode)) | ||
| 404 | goto out; | ||
| 405 | |||
| 406 | err = -ENOENT; | ||
| 407 | bh = omfs_find_entry(new_dir, new_dentry->d_name.name, | ||
| 408 | new_dentry->d_name.len); | ||
| 409 | if (IS_ERR(bh)) | ||
| 410 | goto out; | ||
| 411 | brelse(bh); | ||
| 412 | |||
| 413 | err = omfs_unlink(new_dir, new_dentry); | ||
| 414 | if (err) | ||
| 415 | goto out; | ||
| 416 | } | ||
| 417 | |||
| 418 | /* since omfs locates files by name, we need to unlink _before_ | ||
| 419 | * adding the new link or we won't find the old one */ | ||
| 420 | inode_inc_link_count(old_inode); | ||
| 421 | err = omfs_unlink(old_dir, old_dentry); | ||
| 422 | if (err) { | ||
| 423 | inode_dec_link_count(old_inode); | ||
| 424 | goto out; | ||
| 425 | } | ||
| 426 | |||
| 427 | err = omfs_add_link(new_dentry, old_inode); | ||
| 428 | if (err) | ||
| 429 | goto out; | ||
| 430 | |||
| 431 | old_inode->i_ctime = CURRENT_TIME_SEC; | ||
| 432 | out: | ||
| 433 | return err; | ||
| 434 | } | ||
| 435 | |||
| 436 | static int omfs_readdir(struct file *filp, void *dirent, filldir_t filldir) | ||
| 437 | { | ||
| 438 | struct inode *dir = filp->f_dentry->d_inode; | ||
| 439 | struct buffer_head *bh; | ||
| 440 | loff_t offset, res; | ||
| 441 | unsigned int hchain, hindex; | ||
| 442 | int nbuckets; | ||
| 443 | u64 fsblock; | ||
| 444 | int ret = -EINVAL; | ||
| 445 | |||
| 446 | if (filp->f_pos >> 32) | ||
| 447 | goto success; | ||
| 448 | |||
| 449 | switch ((unsigned long) filp->f_pos) { | ||
| 450 | case 0: | ||
| 451 | if (filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR) < 0) | ||
| 452 | goto success; | ||
| 453 | filp->f_pos++; | ||
| 454 | /* fall through */ | ||
| 455 | case 1: | ||
| 456 | if (filldir(dirent, "..", 2, 1, | ||
| 457 | parent_ino(filp->f_dentry), DT_DIR) < 0) | ||
| 458 | goto success; | ||
| 459 | filp->f_pos = 1 << 20; | ||
| 460 | /* fall through */ | ||
| 461 | } | ||
| 462 | |||
| 463 | nbuckets = (dir->i_size - OMFS_DIR_START) / 8; | ||
| 464 | |||
| 465 | /* high 12 bits store bucket + 1 and low 20 bits store hash index */ | ||
| 466 | hchain = (filp->f_pos >> 20) - 1; | ||
| 467 | hindex = filp->f_pos & 0xfffff; | ||
| 468 | |||
| 469 | bh = sb_bread(dir->i_sb, clus_to_blk(OMFS_SB(dir->i_sb), dir->i_ino)); | ||
| 470 | if (!bh) | ||
| 471 | goto out; | ||
| 472 | |||
| 473 | offset = OMFS_DIR_START + hchain * 8; | ||
| 474 | |||
| 475 | for (; hchain < nbuckets; hchain++, offset += 8) { | ||
| 476 | fsblock = be64_to_cpu(*((__be64 *) &bh->b_data[offset])); | ||
| 477 | |||
| 478 | res = omfs_fill_chain(filp, dirent, filldir, fsblock, hindex); | ||
| 479 | hindex = 0; | ||
| 480 | if (res < 0) | ||
| 481 | break; | ||
| 482 | |||
| 483 | filp->f_pos = (hchain+2) << 20; | ||
| 484 | } | ||
| 485 | brelse(bh); | ||
| 486 | success: | ||
| 487 | ret = 0; | ||
| 488 | out: | ||
| 489 | return ret; | ||
| 490 | } | ||
| 491 | |||
| 492 | struct inode_operations omfs_dir_inops = { | ||
| 493 | .lookup = omfs_lookup, | ||
| 494 | .mkdir = omfs_mkdir, | ||
| 495 | .rename = omfs_rename, | ||
| 496 | .create = omfs_create, | ||
| 497 | .unlink = omfs_unlink, | ||
| 498 | .rmdir = omfs_rmdir, | ||
| 499 | }; | ||
| 500 | |||
| 501 | struct file_operations omfs_dir_operations = { | ||
| 502 | .read = generic_read_dir, | ||
| 503 | .readdir = omfs_readdir, | ||
| 504 | }; | ||
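The readdir code above encodes its position as "bucket + 1" in the bits above 20 and the index within that bucket's hash chain in the low 20 bits. A small sketch of that packing follows; the helper names are invented for illustration.

	/* Sketch of the f_pos layout used by omfs_readdir: positions 0 and 1
	 * are reserved for "." and "..", and real entries start at bucket 0,
	 * which is stored as (0 + 1) << 20. */
	static inline loff_t example_pack_pos(unsigned int bucket, unsigned int index)
	{
		return ((loff_t)(bucket + 1) << 20) | (index & 0xfffff);
	}

	static inline void example_unpack_pos(loff_t pos, unsigned int *bucket,
					      unsigned int *index)
	{
		*bucket = (pos >> 20) - 1;
		*index = pos & 0xfffff;
	}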
diff --git a/fs/omfs/file.c b/fs/omfs/file.c new file mode 100644 index 000000000000..834b2331f6b3 --- /dev/null +++ b/fs/omfs/file.c | |||
| @@ -0,0 +1,365 @@ | |||
| 1 | /* | ||
| 2 | * OMFS (as used by RIO Karma) file operations. | ||
| 3 | * Copyright (C) 2005 Bob Copeland <me@bobcopeland.com> | ||
| 4 | * Released under GPL v2. | ||
| 5 | */ | ||
| 6 | |||
| 7 | #include <linux/version.h> | ||
| 8 | #include <linux/module.h> | ||
| 9 | #include <linux/fs.h> | ||
| 10 | #include <linux/buffer_head.h> | ||
| 11 | #include <linux/mpage.h> | ||
| 12 | #include "omfs.h" | ||
| 13 | |||
| 14 | static int omfs_sync_file(struct file *file, struct dentry *dentry, | ||
| 15 | int datasync) | ||
| 16 | { | ||
| 17 | struct inode *inode = dentry->d_inode; | ||
| 18 | int err; | ||
| 19 | |||
| 20 | err = sync_mapping_buffers(inode->i_mapping); | ||
| 21 | if (!(inode->i_state & I_DIRTY)) | ||
| 22 | return err; | ||
| 23 | if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) | ||
| 24 | return err; | ||
| 25 | err |= omfs_sync_inode(inode); | ||
| 26 | return err ? -EIO : 0; | ||
| 27 | } | ||
| 28 | |||
| 29 | static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset) | ||
| 30 | { | ||
| 31 | return (sbi->s_sys_blocksize - offset - | ||
| 32 | sizeof(struct omfs_extent)) / | ||
| 33 | sizeof(struct omfs_extent_entry) + 1; | ||
| 34 | } | ||
| 35 | |||
| 36 | void omfs_make_empty_table(struct buffer_head *bh, int offset) | ||
| 37 | { | ||
| 38 | struct omfs_extent *oe = (struct omfs_extent *) &bh->b_data[offset]; | ||
| 39 | |||
| 40 | oe->e_next = ~cpu_to_be64(0ULL); | ||
| 41 | oe->e_extent_count = cpu_to_be32(1), | ||
| 42 | oe->e_fill = cpu_to_be32(0x22), | ||
| 43 | oe->e_entry.e_cluster = ~cpu_to_be64(0ULL); | ||
| 44 | oe->e_entry.e_blocks = ~cpu_to_be64(0ULL); | ||
| 45 | } | ||
| 46 | |||
| 47 | int omfs_shrink_inode(struct inode *inode) | ||
| 48 | { | ||
| 49 | struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); | ||
| 50 | struct omfs_extent *oe; | ||
| 51 | struct omfs_extent_entry *entry; | ||
| 52 | struct buffer_head *bh; | ||
| 53 | u64 next, last; | ||
| 54 | u32 extent_count; | ||
| 55 | u32 max_extents; | ||
| 56 | int ret; | ||
| 57 | |||
| 58 | /* traverse extent table, freeing each entry that is greater | ||
| 59 | * than inode->i_size; | ||
| 60 | */ | ||
| 61 | next = inode->i_ino; | ||
| 62 | |||
| 63 | /* only support truncate -> 0 for now */ | ||
| 64 | ret = -EIO; | ||
| 65 | if (inode->i_size != 0) | ||
| 66 | goto out; | ||
| 67 | |||
| 68 | bh = sb_bread(inode->i_sb, clus_to_blk(sbi, next)); | ||
| 69 | if (!bh) | ||
| 70 | goto out; | ||
| 71 | |||
| 72 | oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]); | ||
| 73 | max_extents = omfs_max_extents(sbi, OMFS_EXTENT_START); | ||
| 74 | |||
| 75 | for (;;) { | ||
| 76 | |||
| 77 | if (omfs_is_bad(sbi, (struct omfs_header *) bh->b_data, next)) | ||
| 78 | goto out_brelse; | ||
| 79 | |||
| 80 | extent_count = be32_to_cpu(oe->e_extent_count); | ||
| 81 | |||
| 82 | if (extent_count > max_extents) | ||
| 83 | goto out_brelse; | ||
| 84 | |||
| 85 | last = next; | ||
| 86 | next = be64_to_cpu(oe->e_next); | ||
| 87 | entry = &oe->e_entry; | ||
| 88 | |||
| 89 | /* ignore last entry as it is the terminator */ | ||
| 90 | for (; extent_count > 1; extent_count--) { | ||
| 91 | u64 start, count; | ||
| 92 | start = be64_to_cpu(entry->e_cluster); | ||
| 93 | count = be64_to_cpu(entry->e_blocks); | ||
| 94 | |||
| 95 | omfs_clear_range(inode->i_sb, start, (int) count); | ||
| 96 | entry++; | ||
| 97 | } | ||
| 98 | omfs_make_empty_table(bh, (char *) oe - bh->b_data); | ||
| 99 | mark_buffer_dirty(bh); | ||
| 100 | brelse(bh); | ||
| 101 | |||
| 102 | if (last != inode->i_ino) | ||
| 103 | omfs_clear_range(inode->i_sb, last, sbi->s_mirrors); | ||
| 104 | |||
| 105 | if (next == ~0) | ||
| 106 | break; | ||
| 107 | |||
| 108 | bh = sb_bread(inode->i_sb, clus_to_blk(sbi, next)); | ||
| 109 | if (!bh) | ||
| 110 | goto out; | ||
| 111 | oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]); | ||
| 112 | max_extents = omfs_max_extents(sbi, OMFS_EXTENT_CONT); | ||
| 113 | } | ||
| 114 | ret = 0; | ||
| 115 | out: | ||
| 116 | return ret; | ||
| 117 | out_brelse: | ||
| 118 | brelse(bh); | ||
| 119 | return ret; | ||
| 120 | } | ||
| 121 | |||
| 122 | static void omfs_truncate(struct inode *inode) | ||
| 123 | { | ||
| 124 | omfs_shrink_inode(inode); | ||
| 125 | mark_inode_dirty(inode); | ||
| 126 | } | ||
| 127 | |||
| 128 | /* | ||
| 129 | * Add new blocks to the current extent, or create new entries/continuations | ||
| 130 | * as necessary. | ||
| 131 | */ | ||
| 132 | static int omfs_grow_extent(struct inode *inode, struct omfs_extent *oe, | ||
| 133 | u64 *ret_block) | ||
| 134 | { | ||
| 135 | struct omfs_extent_entry *terminator; | ||
| 136 | struct omfs_extent_entry *entry = &oe->e_entry; | ||
| 137 | struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); | ||
| 138 | u32 extent_count = be32_to_cpu(oe->e_extent_count); | ||
| 139 | u64 new_block = 0; | ||
| 140 | u32 max_count; | ||
| 141 | int new_count; | ||
| 142 | int ret = 0; | ||
| 143 | |||
| 144 | /* reached the end of the extent table with no blocks mapped. | ||
| 145 | * there are three possibilities for adding: grow last extent, | ||
| 146 | * add a new extent to the current extent table, and add a | ||
| 147 | * continuation inode. in the last two cases we need to allocate | ||
| 148 | * sbi->s_clustersize blocks | ||
| 149 | */ | ||
| 150 | |||
| 151 | /* TODO: handle holes */ | ||
| 152 | |||
| 153 | /* should always have a terminator */ | ||
| 154 | if (extent_count < 1) | ||
| 155 | return -EIO; | ||
| 156 | |||
| 157 | /* trivially grow current extent, if next block is not taken */ | ||
| 158 | terminator = entry + extent_count - 1; | ||
| 159 | if (extent_count > 1) { | ||
| 160 | entry = terminator-1; | ||
| 161 | new_block = be64_to_cpu(entry->e_cluster) + | ||
| 162 | be64_to_cpu(entry->e_blocks); | ||
| 163 | |||
| 164 | if (omfs_allocate_block(inode->i_sb, new_block)) { | ||
| 165 | entry->e_blocks = | ||
| 166 | cpu_to_be64(be64_to_cpu(entry->e_blocks) + 1); | ||
| 167 | terminator->e_blocks = ~(cpu_to_be64( | ||
| 168 | be64_to_cpu(~terminator->e_blocks) + 1)); | ||
| 169 | goto out; | ||
| 170 | } | ||
| 171 | } | ||
| 172 | max_count = omfs_max_extents(sbi, OMFS_EXTENT_START); | ||
| 173 | |||
| 174 | /* TODO: add a continuation block here */ | ||
| 175 | if (be32_to_cpu(oe->e_extent_count) > max_count-1) | ||
| 176 | return -EIO; | ||
| 177 | |||
| 178 | /* try to allocate a new cluster */ | ||
| 179 | ret = omfs_allocate_range(inode->i_sb, 1, sbi->s_clustersize, | ||
| 180 | &new_block, &new_count); | ||
| 181 | if (ret) | ||
| 182 | goto out_fail; | ||
| 183 | |||
| 184 | /* copy terminator down an entry */ | ||
| 185 | entry = terminator; | ||
| 186 | terminator++; | ||
| 187 | memcpy(terminator, entry, sizeof(struct omfs_extent_entry)); | ||
| 188 | |||
| 189 | entry->e_cluster = cpu_to_be64(new_block); | ||
| 190 | entry->e_blocks = cpu_to_be64((u64) new_count); | ||
| 191 | |||
| 192 | terminator->e_blocks = ~(cpu_to_be64( | ||
| 193 | be64_to_cpu(~terminator->e_blocks) + (u64) new_count)); | ||
| 194 | |||
| 195 | /* write in new entry */ | ||
| 196 | oe->e_extent_count = cpu_to_be32(1 + be32_to_cpu(oe->e_extent_count)); | ||
| 197 | |||
| 198 | out: | ||
| 199 | *ret_block = new_block; | ||
| 200 | out_fail: | ||
| 201 | return ret; | ||
| 202 | } | ||
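The terminator bookkeeping above is easy to misread: the terminator entry appears to keep its block count stored bitwise-inverted, so growing the file by N blocks means un-inverting, adding N, and re-inverting. A tiny sketch of that reading; this is an interpretation of the code, not a helper from this patch.

	/* Sketch: update a terminator whose e_blocks field is kept inverted. */
	static inline void example_grow_terminator(__be64 *term_blocks, u64 added)
	{
		*term_blocks = ~cpu_to_be64(be64_to_cpu(~*term_blocks) + added);
	}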
| 203 | |||
| 204 | /* | ||
| 205 | * Scans across the extent table for a given file block number. | ||
| 206 | * If block not found, return 0. | ||
| 207 | */ | ||
| 208 | static sector_t find_block(struct inode *inode, struct omfs_extent_entry *ent, | ||
| 209 | sector_t block, int count, int *left) | ||
| 210 | { | ||
| 211 | /* count > 1 because of terminator */ | ||
| 212 | sector_t searched = 0; | ||
| 213 | for (; count > 1; count--) { | ||
| 214 | int numblocks = clus_to_blk(OMFS_SB(inode->i_sb), | ||
| 215 | be64_to_cpu(ent->e_blocks)); | ||
| 216 | |||
| 217 | if (block >= searched && | ||
| 218 | block < searched + numblocks) { | ||
| 219 | /* | ||
| 220 | * found it at cluster + (block - searched) | ||
| 221 | * numblocks - (block - searched) is remainder | ||
| 222 | */ | ||
| 223 | *left = numblocks - (block - searched); | ||
| 224 | return clus_to_blk(OMFS_SB(inode->i_sb), | ||
| 225 | be64_to_cpu(ent->e_cluster)) + | ||
| 226 | block - searched; | ||
| 227 | } | ||
| 228 | searched += numblocks; | ||
| 229 | ent++; | ||
| 230 | } | ||
| 231 | return 0; | ||
| 232 | } | ||
| 233 | |||
| 234 | static int omfs_get_block(struct inode *inode, sector_t block, | ||
| 235 | struct buffer_head *bh_result, int create) | ||
| 236 | { | ||
| 237 | struct buffer_head *bh; | ||
| 238 | sector_t next, offset; | ||
| 239 | int ret; | ||
| 240 | u64 new_block; | ||
| 241 | u32 max_extents; | ||
| 242 | int extent_count; | ||
| 243 | struct omfs_extent *oe; | ||
| 244 | struct omfs_extent_entry *entry; | ||
| 245 | struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); | ||
| 246 | int max_blocks = bh_result->b_size >> inode->i_blkbits; | ||
| 247 | int remain; | ||
| 248 | |||
| 249 | ret = -EIO; | ||
| 250 | bh = sb_bread(inode->i_sb, clus_to_blk(sbi, inode->i_ino)); | ||
| 251 | if (!bh) | ||
| 252 | goto out; | ||
| 253 | |||
| 254 | oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]); | ||
| 255 | max_extents = omfs_max_extents(sbi, OMFS_EXTENT_START); | ||
| 256 | next = inode->i_ino; | ||
| 257 | |||
| 258 | for (;;) { | ||
| 259 | |||
| 260 | if (omfs_is_bad(sbi, (struct omfs_header *) bh->b_data, next)) | ||
| 261 | goto out_brelse; | ||
| 262 | |||
| 263 | extent_count = be32_to_cpu(oe->e_extent_count); | ||
| 264 | next = be64_to_cpu(oe->e_next); | ||
| 265 | entry = &oe->e_entry; | ||
| 266 | |||
| 267 | if (extent_count > max_extents) | ||
| 268 | goto out_brelse; | ||
| 269 | |||
| 270 | offset = find_block(inode, entry, block, extent_count, &remain); | ||
| 271 | if (offset > 0) { | ||
| 272 | ret = 0; | ||
| 273 | map_bh(bh_result, inode->i_sb, offset); | ||
| 274 | if (remain > max_blocks) | ||
| 275 | remain = max_blocks; | ||
| 276 | bh_result->b_size = (remain << inode->i_blkbits); | ||
| 277 | goto out_brelse; | ||
| 278 | } | ||
| 279 | if (next == ~0) | ||
| 280 | break; | ||
| 281 | |||
| 282 | brelse(bh); | ||
| 283 | bh = sb_bread(inode->i_sb, clus_to_blk(sbi, next)); | ||
| 284 | if (!bh) | ||
| 285 | goto out; | ||
| 286 | oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]); | ||
| 287 | max_extents = omfs_max_extents(sbi, OMFS_EXTENT_CONT); | ||
| 288 | } | ||
| 289 | if (create) { | ||
| 290 | ret = omfs_grow_extent(inode, oe, &new_block); | ||
| 291 | if (ret == 0) { | ||
| 292 | mark_buffer_dirty(bh); | ||
| 293 | mark_inode_dirty(inode); | ||
| 294 | map_bh(bh_result, inode->i_sb, | ||
| 295 | clus_to_blk(sbi, new_block)); | ||
| 296 | } | ||
| 297 | } | ||
| 298 | out_brelse: | ||
| 299 | brelse(bh); | ||
| 300 | out: | ||
| 301 | return ret; | ||
| 302 | } | ||
| 303 | |||
| 304 | static int omfs_readpage(struct file *file, struct page *page) | ||
| 305 | { | ||
| 306 | return block_read_full_page(page, omfs_get_block); | ||
| 307 | } | ||
| 308 | |||
| 309 | static int omfs_readpages(struct file *file, struct address_space *mapping, | ||
| 310 | struct list_head *pages, unsigned nr_pages) | ||
| 311 | { | ||
| 312 | return mpage_readpages(mapping, pages, nr_pages, omfs_get_block); | ||
| 313 | } | ||
| 314 | |||
| 315 | static int omfs_writepage(struct page *page, struct writeback_control *wbc) | ||
| 316 | { | ||
| 317 | return block_write_full_page(page, omfs_get_block, wbc); | ||
| 318 | } | ||
| 319 | |||
| 320 | static int | ||
| 321 | omfs_writepages(struct address_space *mapping, struct writeback_control *wbc) | ||
| 322 | { | ||
| 323 | return mpage_writepages(mapping, wbc, omfs_get_block); | ||
| 324 | } | ||
| 325 | |||
| 326 | static int omfs_write_begin(struct file *file, struct address_space *mapping, | ||
| 327 | loff_t pos, unsigned len, unsigned flags, | ||
| 328 | struct page **pagep, void **fsdata) | ||
| 329 | { | ||
| 330 | *pagep = NULL; | ||
| 331 | return block_write_begin(file, mapping, pos, len, flags, | ||
| 332 | pagep, fsdata, omfs_get_block); | ||
| 333 | } | ||
| 334 | |||
| 335 | static sector_t omfs_bmap(struct address_space *mapping, sector_t block) | ||
| 336 | { | ||
| 337 | return generic_block_bmap(mapping, block, omfs_get_block); | ||
| 338 | } | ||
| 339 | |||
| 340 | struct file_operations omfs_file_operations = { | ||
| 341 | .llseek = generic_file_llseek, | ||
| 342 | .read = do_sync_read, | ||
| 343 | .write = do_sync_write, | ||
| 344 | .aio_read = generic_file_aio_read, | ||
| 345 | .aio_write = generic_file_aio_write, | ||
| 346 | .mmap = generic_file_mmap, | ||
| 347 | .fsync = omfs_sync_file, | ||
| 348 | .splice_read = generic_file_splice_read, | ||
| 349 | }; | ||
| 350 | |||
| 351 | struct inode_operations omfs_file_inops = { | ||
| 352 | .truncate = omfs_truncate | ||
| 353 | }; | ||
| 354 | |||
| 355 | struct address_space_operations omfs_aops = { | ||
| 356 | .readpage = omfs_readpage, | ||
| 357 | .readpages = omfs_readpages, | ||
| 358 | .writepage = omfs_writepage, | ||
| 359 | .writepages = omfs_writepages, | ||
| 360 | .sync_page = block_sync_page, | ||
| 361 | .write_begin = omfs_write_begin, | ||
| 362 | .write_end = generic_write_end, | ||
| 363 | .bmap = omfs_bmap, | ||
| 364 | }; | ||
| 365 | |||
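A simplified sketch of the logical-to-physical lookup that find_block() above performs. The structure and fields are flattened to host-endian purely for readability, and the names are invented; the real code also converts clusters to disk blocks with clus_to_blk().

	struct example_extent {
		u64 cluster;	/* first physical cluster of the extent */
		u64 blocks;	/* number of blocks covered by the extent */
	};

	/* Walk the extent entries, accumulating how many file blocks have been
	 * covered so far, until the requested logical block falls inside one;
	 * return 0 when it is not mapped (the final terminator entry is skipped
	 * by stopping at count - 1). */
	static u64 example_map_block(const struct example_extent *ent, int count,
				     u64 logical)
	{
		u64 searched = 0;
		int i;

		for (i = 0; i < count - 1; i++, ent++) {
			if (logical >= searched && logical < searched + ent->blocks)
				return ent->cluster + (logical - searched);
			searched += ent->blocks;
		}
		return 0;
	}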
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c new file mode 100644 index 000000000000..d29047b1b9b0 --- /dev/null +++ b/fs/omfs/inode.c | |||
| @@ -0,0 +1,553 @@ | |||
| 1 | /* | ||
| 2 | * Optimized MPEG FS - inode and super operations. | ||
| 3 | * Copyright (C) 2006 Bob Copeland <me@bobcopeland.com> | ||
| 4 | * Released under GPL v2. | ||
| 5 | */ | ||
| 6 | #include <linux/version.h> | ||
| 7 | #include <linux/module.h> | ||
| 8 | #include <linux/sched.h> | ||
| 9 | #include <linux/fs.h> | ||
| 10 | #include <linux/vfs.h> | ||
| 11 | #include <linux/parser.h> | ||
| 12 | #include <linux/buffer_head.h> | ||
| 13 | #include <linux/vmalloc.h> | ||
| 14 | #include <linux/crc-itu-t.h> | ||
| 15 | #include "omfs.h" | ||
| 16 | |||
| 17 | MODULE_AUTHOR("Bob Copeland <me@bobcopeland.com>"); | ||
| 18 | MODULE_DESCRIPTION("OMFS (ReplayTV/Karma) Filesystem for Linux"); | ||
| 19 | MODULE_LICENSE("GPL"); | ||
| 20 | |||
| 21 | struct inode *omfs_new_inode(struct inode *dir, int mode) | ||
| 22 | { | ||
| 23 | struct inode *inode; | ||
| 24 | u64 new_block; | ||
| 25 | int err; | ||
| 26 | int len; | ||
| 27 | struct omfs_sb_info *sbi = OMFS_SB(dir->i_sb); | ||
| 28 | |||
| 29 | inode = new_inode(dir->i_sb); | ||
| 30 | if (!inode) | ||
| 31 | return ERR_PTR(-ENOMEM); | ||
| 32 | |||
| 33 | err = omfs_allocate_range(dir->i_sb, sbi->s_mirrors, sbi->s_mirrors, | ||
| 34 | &new_block, &len); | ||
| 35 | if (err) | ||
| 36 | goto fail; | ||
| 37 | |||
| 38 | inode->i_ino = new_block; | ||
| 39 | inode->i_mode = mode; | ||
| 40 | inode->i_uid = current->fsuid; | ||
| 41 | inode->i_gid = current->fsgid; | ||
| 42 | inode->i_blocks = 0; | ||
| 43 | inode->i_mapping->a_ops = &omfs_aops; | ||
| 44 | |||
| 45 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | ||
| 46 | switch (mode & S_IFMT) { | ||
| 47 | case S_IFDIR: | ||
| 48 | inode->i_op = &omfs_dir_inops; | ||
| 49 | inode->i_fop = &omfs_dir_operations; | ||
| 50 | inode->i_size = sbi->s_sys_blocksize; | ||
| 51 | inc_nlink(inode); | ||
| 52 | break; | ||
| 53 | case S_IFREG: | ||
| 54 | inode->i_op = &omfs_file_inops; | ||
| 55 | inode->i_fop = &omfs_file_operations; | ||
| 56 | inode->i_size = 0; | ||
| 57 | break; | ||
| 58 | } | ||
| 59 | |||
| 60 | insert_inode_hash(inode); | ||
| 61 | mark_inode_dirty(inode); | ||
| 62 | return inode; | ||
| 63 | fail: | ||
| 64 | make_bad_inode(inode); | ||
| 65 | iput(inode); | ||
| 66 | return ERR_PTR(err); | ||
| 67 | } | ||
| 68 | |||
| 69 | /* | ||
| 70 | * Update the header checksums for a dirty inode based on its contents. | ||
| 71 | * Caller is expected to hold the buffer head underlying oi and mark it | ||
| 72 | * dirty. | ||
| 73 | */ | ||
| 74 | static void omfs_update_checksums(struct omfs_inode *oi) | ||
| 75 | { | ||
| 76 | int xor, i, ofs = 0, count; | ||
| 77 | u16 crc = 0; | ||
| 78 | unsigned char *ptr = (unsigned char *) oi; | ||
| 79 | |||
| 80 | count = be32_to_cpu(oi->i_head.h_body_size); | ||
| 81 | ofs = sizeof(struct omfs_header); | ||
| 82 | |||
| 83 | crc = crc_itu_t(crc, ptr + ofs, count); | ||
| 84 | oi->i_head.h_crc = cpu_to_be16(crc); | ||
| 85 | |||
| 86 | xor = ptr[0]; | ||
| 87 | for (i = 1; i < OMFS_XOR_COUNT; i++) | ||
| 88 | xor ^= ptr[i]; | ||
| 89 | |||
| 90 | oi->i_head.h_check_xor = xor; | ||
| 91 | } | ||
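A self-contained sketch of the checksum scheme applied above: a CRC-ITU-T over the body that follows the header, plus a byte-wise XOR over the start of the header. The XOR count below is an assumption made for the example; the real constant is OMFS_XOR_COUNT from omfs.h.

	#include <linux/crc-itu-t.h>

	#define EXAMPLE_XOR_COUNT 16	/* assumed; see OMFS_XOR_COUNT in omfs.h */

	static void example_checksum(const u8 *buf, int header_size, int body_size,
				     u16 *crc_out, u8 *xor_out)
	{
		int i;
		u8 xor = buf[0];

		*crc_out = crc_itu_t(0, buf + header_size, body_size);
		for (i = 1; i < EXAMPLE_XOR_COUNT; i++)
			xor ^= buf[i];
		*xor_out = xor;
	}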
| 92 | |||
| 93 | static int omfs_write_inode(struct inode *inode, int wait) | ||
| 94 | { | ||
| 95 | struct omfs_inode *oi; | ||
| 96 | struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); | ||
| 97 | struct buffer_head *bh, *bh2; | ||
| 98 | unsigned int block; | ||
| 99 | u64 ctime; | ||
| 100 | int i; | ||
| 101 | int ret = -EIO; | ||
| 102 | int sync_failed = 0; | ||
| 103 | |||
| 104 | /* get current inode since we may have written sibling ptrs etc. */ | ||
| 105 | block = clus_to_blk(sbi, inode->i_ino); | ||
| 106 | bh = sb_bread(inode->i_sb, block); | ||
| 107 | if (!bh) | ||
| 108 | goto out; | ||
| 109 | |||
| 110 | oi = (struct omfs_inode *) bh->b_data; | ||
| 111 | |||
| 112 | oi->i_head.h_self = cpu_to_be64(inode->i_ino); | ||
| 113 | if (S_ISDIR(inode->i_mode)) | ||
| 114 | oi->i_type = OMFS_DIR; | ||
| 115 | else if (S_ISREG(inode->i_mode)) | ||
| 116 | oi->i_type = OMFS_FILE; | ||
| 117 | else { | ||
| 118 | printk(KERN_WARNING "omfs: unknown file type: %d\n", | ||
| 119 | inode->i_mode); | ||
| 120 | goto out_brelse; | ||
| 121 | } | ||
| 122 | |||
| 123 | oi->i_head.h_body_size = cpu_to_be32(sbi->s_sys_blocksize - | ||
| 124 | sizeof(struct omfs_header)); | ||
| 125 | oi->i_head.h_version = 1; | ||
| 126 | oi->i_head.h_type = OMFS_INODE_NORMAL; | ||
| 127 | oi->i_head.h_magic = OMFS_IMAGIC; | ||
| 128 | oi->i_size = cpu_to_be64(inode->i_size); | ||
| 129 | |||
| 130 | ctime = inode->i_ctime.tv_sec * 1000LL + | ||
| 131 | ((inode->i_ctime.tv_nsec + 999)/1000); | ||
| 132 | oi->i_ctime = cpu_to_be64(ctime); | ||
| 133 | |||
| 134 | omfs_update_checksums(oi); | ||
| 135 | |||
| 136 | mark_buffer_dirty(bh); | ||
| 137 | if (wait) { | ||
| 138 | sync_dirty_buffer(bh); | ||
| 139 | if (buffer_req(bh) && !buffer_uptodate(bh)) | ||
| 140 | sync_failed = 1; | ||
| 141 | } | ||
| 142 | |||
| 143 | /* if mirroring writes, copy to next fsblock */ | ||
| 144 | for (i = 1; i < sbi->s_mirrors; i++) { | ||
| 145 | bh2 = sb_bread(inode->i_sb, block + i * | ||
| 146 | (sbi->s_blocksize / sbi->s_sys_blocksize)); | ||
| 147 | if (!bh2) | ||
| 148 | goto out_brelse; | ||
| 149 | |||
| 150 | memcpy(bh2->b_data, bh->b_data, bh->b_size); | ||
| 151 | mark_buffer_dirty(bh2); | ||
| 152 | if (wait) { | ||
| 153 | sync_dirty_buffer(bh2); | ||
| 154 | if (buffer_req(bh2) && !buffer_uptodate(bh2)) | ||
| 155 | sync_failed = 1; | ||
| 156 | } | ||
| 157 | brelse(bh2); | ||
| 158 | } | ||
| 159 | ret = (sync_failed) ? -EIO : 0; | ||
| 160 | out_brelse: | ||
| 161 | brelse(bh); | ||
| 162 | out: | ||
| 163 | return ret; | ||
| 164 | } | ||
| 165 | |||
| 166 | int omfs_sync_inode(struct inode *inode) | ||
| 167 | { | ||
| 168 | return omfs_write_inode(inode, 1); | ||
| 169 | } | ||
| 170 | |||
| 171 | /* | ||
| 172 | * called when an entry is deleted, need to clear the bits in the | ||
| 173 | * bitmaps. | ||
| 174 | */ | ||
| 175 | static void omfs_delete_inode(struct inode *inode) | ||
| 176 | { | ||
| 177 | truncate_inode_pages(&inode->i_data, 0); | ||
| 178 | |||
| 179 | if (S_ISREG(inode->i_mode)) { | ||
| 180 | inode->i_size = 0; | ||
| 181 | omfs_shrink_inode(inode); | ||
| 182 | } | ||
| 183 | |||
| 184 | omfs_clear_range(inode->i_sb, inode->i_ino, 2); | ||
| 185 | clear_inode(inode); | ||
| 186 | } | ||
| 187 | |||
| 188 | struct inode *omfs_iget(struct super_block *sb, ino_t ino) | ||
| 189 | { | ||
| 190 | struct omfs_sb_info *sbi = OMFS_SB(sb); | ||
| 191 | struct omfs_inode *oi; | ||
| 192 | struct buffer_head *bh; | ||
| 193 | unsigned int block; | ||
| 194 | u64 ctime; | ||
| 195 | unsigned long nsecs; | ||
| 196 | struct inode *inode; | ||
| 197 | |||
| 198 | inode = iget_locked(sb, ino); | ||
| 199 | if (!inode) | ||
| 200 | return ERR_PTR(-ENOMEM); | ||
| 201 | if (!(inode->i_state & I_NEW)) | ||
| 202 | return inode; | ||
| 203 | |||
| 204 | block = clus_to_blk(sbi, ino); | ||
| 205 | bh = sb_bread(inode->i_sb, block); | ||
| 206 | if (!bh) | ||
| 207 | goto iget_failed; | ||
| 208 | |||
| 209 | oi = (struct omfs_inode *)bh->b_data; | ||
| 210 | |||
| 211 | /* check self */ | ||
| 212 | if (ino != be64_to_cpu(oi->i_head.h_self)) | ||
| 213 | goto fail_bh; | ||
| 214 | |||
| 215 | inode->i_uid = sbi->s_uid; | ||
| 216 | inode->i_gid = sbi->s_gid; | ||
| 217 | |||
| 218 | ctime = be64_to_cpu(oi->i_ctime); | ||
| 219 | nsecs = do_div(ctime, 1000) * 1000L; | ||
| 220 | |||
| 221 | inode->i_atime.tv_sec = ctime; | ||
| 222 | inode->i_mtime.tv_sec = ctime; | ||
| 223 | inode->i_ctime.tv_sec = ctime; | ||
| 224 | inode->i_atime.tv_nsec = nsecs; | ||
| 225 | inode->i_mtime.tv_nsec = nsecs; | ||
| 226 | inode->i_ctime.tv_nsec = nsecs; | ||
| 227 | |||
| 228 | inode->i_mapping->a_ops = &omfs_aops; | ||
| 229 | |||
| 230 | switch (oi->i_type) { | ||
| 231 | case OMFS_DIR: | ||
| 232 | inode->i_mode = S_IFDIR | (S_IRWXUGO & ~sbi->s_dmask); | ||
| 233 | inode->i_op = &omfs_dir_inops; | ||
| 234 | inode->i_fop = &omfs_dir_operations; | ||
| 235 | inode->i_size = sbi->s_sys_blocksize; | ||
| 236 | inc_nlink(inode); | ||
| 237 | break; | ||
| 238 | case OMFS_FILE: | ||
| 239 | inode->i_mode = S_IFREG | (S_IRWXUGO & ~sbi->s_fmask); | ||
| 240 | inode->i_fop = &omfs_file_operations; | ||
| 241 | inode->i_size = be64_to_cpu(oi->i_size); | ||
| 242 | break; | ||
| 243 | } | ||
| 244 | brelse(bh); | ||
| 245 | unlock_new_inode(inode); | ||
| 246 | return inode; | ||
| 247 | fail_bh: | ||
| 248 | brelse(bh); | ||
| 249 | iget_failed: | ||
| 250 | iget_failed(inode); | ||
| 251 | return ERR_PTR(-EIO); | ||
| 252 | } | ||
| 253 | |||
| 254 | static void omfs_put_super(struct super_block *sb) | ||
| 255 | { | ||
| 256 | struct omfs_sb_info *sbi = OMFS_SB(sb); | ||
| 257 | kfree(sbi->s_imap); | ||
| 258 | kfree(sbi); | ||
| 259 | sb->s_fs_info = NULL; | ||
| 260 | } | ||
| 261 | |||
| 262 | static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf) | ||
| 263 | { | ||
| 264 | struct super_block *s = dentry->d_sb; | ||
| 265 | struct omfs_sb_info *sbi = OMFS_SB(s); | ||
| 266 | buf->f_type = OMFS_MAGIC; | ||
| 267 | buf->f_bsize = sbi->s_blocksize; | ||
| 268 | buf->f_blocks = sbi->s_num_blocks; | ||
| 269 | buf->f_files = sbi->s_num_blocks; | ||
| 270 | buf->f_namelen = OMFS_NAMELEN; | ||
| 271 | |||
| 272 | buf->f_bfree = buf->f_bavail = buf->f_ffree = | ||
| 273 | omfs_count_free(s); | ||
| 274 | return 0; | ||
| 275 | } | ||
| 276 | |||
| 277 | static struct super_operations omfs_sops = { | ||
| 278 | .write_inode = omfs_write_inode, | ||
| 279 | .delete_inode = omfs_delete_inode, | ||
| 280 | .put_super = omfs_put_super, | ||
| 281 | .statfs = omfs_statfs, | ||
| 282 | .show_options = generic_show_options, | ||
| 283 | }; | ||
| 284 | |||
| 285 | /* | ||
| 286 | * For Rio Karma, there is an on-disk free bitmap whose location is | ||
| 287 | * stored in the root block. For ReplayTV, there is no such free bitmap | ||
| 288 | * so we have to walk the tree. Both inodes and file data are allocated | ||
| 289 | * from the same map. This array can be big (300k) so we allocate | ||
| 290 | * in units of the blocksize. | ||
| 291 | */ | ||
| 292 | static int omfs_get_imap(struct super_block *sb) | ||
| 293 | { | ||
| 294 | int bitmap_size; | ||
| 295 | int array_size; | ||
| 296 | int count; | ||
| 297 | struct omfs_sb_info *sbi = OMFS_SB(sb); | ||
| 298 | struct buffer_head *bh; | ||
| 299 | unsigned long **ptr; | ||
| 300 | sector_t block; | ||
| 301 | |||
| 302 | bitmap_size = DIV_ROUND_UP(sbi->s_num_blocks, 8); | ||
| 303 | array_size = DIV_ROUND_UP(bitmap_size, sb->s_blocksize); | ||
| 304 | |||
| 305 | if (sbi->s_bitmap_ino == ~0ULL) | ||
| 306 | goto out; | ||
| 307 | |||
| 308 | sbi->s_imap_size = array_size; | ||
| 309 | sbi->s_imap = kzalloc(array_size * sizeof(unsigned long *), GFP_KERNEL); | ||
| 310 | if (!sbi->s_imap) | ||
| 311 | goto nomem; | ||
| 312 | |||
| 313 | block = clus_to_blk(sbi, sbi->s_bitmap_ino); | ||
| 314 | ptr = sbi->s_imap; | ||
| 315 | for (count = bitmap_size; count > 0; count -= sb->s_blocksize) { | ||
| 316 | bh = sb_bread(sb, block++); | ||
| 317 | if (!bh) | ||
| 318 | goto nomem_free; | ||
| 319 | *ptr = kmalloc(sb->s_blocksize, GFP_KERNEL); | ||
| 320 | if (!*ptr) { | ||
| 321 | brelse(bh); | ||
| 322 | goto nomem_free; | ||
| 323 | } | ||
| 324 | memcpy(*ptr, bh->b_data, sb->s_blocksize); | ||
| 325 | if (count < sb->s_blocksize) | ||
| 326 | memset((void *)*ptr + count, 0xff, | ||
| 327 | sb->s_blocksize - count); | ||
| 328 | brelse(bh); | ||
| 329 | ptr++; | ||
| 330 | } | ||
| 331 | out: | ||
| 332 | return 0; | ||
| 333 | |||
| 334 | nomem_free: | ||
| 335 | for (count = 0; count < array_size; count++) | ||
| 336 | kfree(sbi->s_imap[count]); | ||
| 337 | |||
| 338 | kfree(sbi->s_imap); | ||
| 339 | nomem: | ||
| 340 | sbi->s_imap = NULL; | ||
| 341 | sbi->s_imap_size = 0; | ||
| 342 | return -ENOMEM; | ||
| 343 | } | ||
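As a side note on the comment above: s_imap ends up as an array of blocksize-sized chunks, one bit per FS block. A minimal sketch of a lookup into that two-level map, with an illustrative helper name (the real consumers live in fs/omfs/bitmap.c, and 64-bit-safe division is omitted for brevity):

/* Test whether FS block 'block' is marked allocated in the in-core map. */
static inline int omfs_imap_test(struct super_block *sb, unsigned long block)
{
	struct omfs_sb_info *sbi = OMFS_SB(sb);
	unsigned long bits_per_entry = sb->s_blocksize * 8;

	if (!sbi->s_imap)		/* ReplayTV: no bitmap on disk, */
		return 1;		/* caller walks the tree instead */
	return test_bit(block % bits_per_entry,
			sbi->s_imap[block / bits_per_entry]);
}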
| 344 | |||
| 345 | enum { | ||
| 346 | Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask | ||
| 347 | }; | ||
| 348 | |||
| 349 | static match_table_t tokens = { | ||
| 350 | {Opt_uid, "uid=%u"}, | ||
| 351 | {Opt_gid, "gid=%u"}, | ||
| 352 | {Opt_umask, "umask=%o"}, | ||
| 353 | {Opt_dmask, "dmask=%o"}, | ||
| 354 | {Opt_fmask, "fmask=%o"}, | ||
| 355 | }; | ||
| 356 | |||
| 357 | static int parse_options(char *options, struct omfs_sb_info *sbi) | ||
| 358 | { | ||
| 359 | char *p; | ||
| 360 | substring_t args[MAX_OPT_ARGS]; | ||
| 361 | int option; | ||
| 362 | |||
| 363 | if (!options) | ||
| 364 | return 1; | ||
| 365 | |||
| 366 | while ((p = strsep(&options, ",")) != NULL) { | ||
| 367 | int token; | ||
| 368 | if (!*p) | ||
| 369 | continue; | ||
| 370 | |||
| 371 | token = match_token(p, tokens, args); | ||
| 372 | switch (token) { | ||
| 373 | case Opt_uid: | ||
| 374 | if (match_int(&args[0], &option)) | ||
| 375 | return 0; | ||
| 376 | sbi->s_uid = option; | ||
| 377 | break; | ||
| 378 | case Opt_gid: | ||
| 379 | if (match_int(&args[0], &option)) | ||
| 380 | return 0; | ||
| 381 | sbi->s_gid = option; | ||
| 382 | break; | ||
| 383 | case Opt_umask: | ||
| 384 | if (match_octal(&args[0], &option)) | ||
| 385 | return 0; | ||
| 386 | sbi->s_fmask = sbi->s_dmask = option; | ||
| 387 | break; | ||
| 388 | case Opt_dmask: | ||
| 389 | if (match_octal(&args[0], &option)) | ||
| 390 | return 0; | ||
| 391 | sbi->s_dmask = option; | ||
| 392 | break; | ||
| 393 | case Opt_fmask: | ||
| 394 | if (match_octal(&args[0], &option)) | ||
| 395 | return 0; | ||
| 396 | sbi->s_fmask = option; | ||
| 397 | break; | ||
| 398 | default: | ||
| 399 | return 0; | ||
| 400 | } | ||
| 401 | } | ||
| 402 | return 1; | ||
| 403 | } | ||
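For reference, the option handling above is the standard <linux/parser.h> pattern. A self-contained sketch with made-up token names (note the NULL-pattern sentinel that match_token() expects):

enum { Opt_example_uid, Opt_example_err };

static match_table_t example_tokens = {
	{Opt_example_uid, "uid=%u"},
	{Opt_example_err, NULL}
};

static int example_parse(char *options, int *uid)
{
	substring_t args[MAX_OPT_ARGS];
	char *p;
	int option;

	while ((p = strsep(&options, ",")) != NULL) {
		if (!*p)
			continue;
		switch (match_token(p, example_tokens, args)) {
		case Opt_example_uid:
			if (match_int(&args[0], &option))
				return 0;	/* malformed value */
			*uid = option;
			break;
		default:
			return 0;		/* unknown option */
		}
	}
	return 1;				/* success, as above */
}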
| 404 | |||
| 405 | static int omfs_fill_super(struct super_block *sb, void *data, int silent) | ||
| 406 | { | ||
| 407 | struct buffer_head *bh, *bh2; | ||
| 408 | struct omfs_super_block *omfs_sb; | ||
| 409 | struct omfs_root_block *omfs_rb; | ||
| 410 | struct omfs_sb_info *sbi; | ||
| 411 | struct inode *root; | ||
| 412 | sector_t start; | ||
| 413 | int ret = -EINVAL; | ||
| 414 | |||
| 415 | save_mount_options(sb, (char *) data); | ||
| 416 | |||
| 417 | sbi = kzalloc(sizeof(struct omfs_sb_info), GFP_KERNEL); | ||
| 418 | if (!sbi) | ||
| 419 | return -ENOMEM; | ||
| 420 | |||
| 421 | sb->s_fs_info = sbi; | ||
| 422 | |||
| 423 | sbi->s_uid = current->uid; | ||
| 424 | sbi->s_gid = current->gid; | ||
| 425 | sbi->s_dmask = sbi->s_fmask = current->fs->umask; | ||
| 426 | |||
| 427 | if (!parse_options((char *) data, sbi)) | ||
| 428 | goto end; | ||
| 429 | |||
| 430 | sb->s_maxbytes = 0xffffffff; | ||
| 431 | |||
| 432 | sb_set_blocksize(sb, 0x200); | ||
| 433 | |||
| 434 | bh = sb_bread(sb, 0); | ||
| 435 | if (!bh) | ||
| 436 | goto end; | ||
| 437 | |||
| 438 | omfs_sb = (struct omfs_super_block *)bh->b_data; | ||
| 439 | |||
| 440 | if (omfs_sb->s_magic != cpu_to_be32(OMFS_MAGIC)) { | ||
| 441 | if (!silent) | ||
| 442 | printk(KERN_ERR "omfs: Invalid superblock (%x)\n", | ||
| 443 | omfs_sb->s_magic); | ||
| 444 | goto out_brelse_bh; | ||
| 445 | } | ||
| 446 | sb->s_magic = OMFS_MAGIC; | ||
| 447 | |||
| 448 | sbi->s_num_blocks = be64_to_cpu(omfs_sb->s_num_blocks); | ||
| 449 | sbi->s_blocksize = be32_to_cpu(omfs_sb->s_blocksize); | ||
| 450 | sbi->s_mirrors = be32_to_cpu(omfs_sb->s_mirrors); | ||
| 451 | sbi->s_root_ino = be64_to_cpu(omfs_sb->s_root_block); | ||
| 452 | sbi->s_sys_blocksize = be32_to_cpu(omfs_sb->s_sys_blocksize); | ||
| 453 | mutex_init(&sbi->s_bitmap_lock); | ||
| 454 | |||
| 455 | if (sbi->s_sys_blocksize > PAGE_SIZE) { | ||
| 456 | printk(KERN_ERR "omfs: sysblock size (%d) is out of range\n", | ||
| 457 | sbi->s_sys_blocksize); | ||
| 458 | goto out_brelse_bh; | ||
| 459 | } | ||
| 460 | |||
| 461 | if (sbi->s_blocksize < sbi->s_sys_blocksize || | ||
| 462 | sbi->s_blocksize > OMFS_MAX_BLOCK_SIZE) { | ||
| 463 | printk(KERN_ERR "omfs: block size (%d) is out of range\n", | ||
| 464 | sbi->s_blocksize); | ||
| 465 | goto out_brelse_bh; | ||
| 466 | } | ||
| 467 | |||
| 468 | /* | ||
| 469 | * Use sys_blocksize as the fs block since it is smaller than a | ||
| 470 | * page while the fs blocksize can be larger. | ||
| 471 | */ | ||
| 472 | sb_set_blocksize(sb, sbi->s_sys_blocksize); | ||
| 473 | |||
| 474 | /* | ||
| 475 | * ...and the difference goes into a shift. sys_blocksize is always | ||
| 476 | * a power-of-two factor of blocksize. | ||
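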
| 477 | */ | ||
| 478 | sbi->s_block_shift = get_bitmask_order(sbi->s_blocksize) - | ||
| 479 | get_bitmask_order(sbi->s_sys_blocksize); | ||
| 480 | |||
| 481 | start = clus_to_blk(sbi, be64_to_cpu(omfs_sb->s_root_block)); | ||
| 482 | bh2 = sb_bread(sb, start); | ||
| 483 | if (!bh2) | ||
| 484 | goto out_brelse_bh; | ||
| 485 | |||
| 486 | omfs_rb = (struct omfs_root_block *)bh2->b_data; | ||
| 487 | |||
| 488 | sbi->s_bitmap_ino = be64_to_cpu(omfs_rb->r_bitmap); | ||
| 489 | sbi->s_clustersize = be32_to_cpu(omfs_rb->r_clustersize); | ||
| 490 | |||
| 491 | if (sbi->s_num_blocks != be64_to_cpu(omfs_rb->r_num_blocks)) { | ||
| 492 | printk(KERN_ERR "omfs: block count discrepancy between " | ||
| 493 | "super and root blocks (%llx, %llx)\n", | ||
| 494 | (unsigned long long)sbi->s_num_blocks, | ||
| 495 | (unsigned long long)be64_to_cpu(omfs_rb->r_num_blocks)); | ||
| 496 | goto out_brelse_bh2; | ||
| 497 | } | ||
| 498 | |||
| 499 | ret = omfs_get_imap(sb); | ||
| 500 | if (ret) | ||
| 501 | goto out_brelse_bh2; | ||
| 502 | |||
| 503 | sb->s_op = &omfs_sops; | ||
| 504 | |||
| 505 | root = omfs_iget(sb, be64_to_cpu(omfs_rb->r_root_dir)); | ||
| 506 | if (IS_ERR(root)) { | ||
| 507 | ret = PTR_ERR(root); | ||
| 508 | goto out_brelse_bh2; | ||
| 509 | } | ||
| 510 | |||
| 511 | sb->s_root = d_alloc_root(root); | ||
| 512 | if (!sb->s_root) { | ||
| 513 | iput(root); | ||
| 514 | goto out_brelse_bh2; | ||
| 515 | } | ||
| 516 | printk(KERN_DEBUG "omfs: Mounted volume %s\n", omfs_rb->r_name); | ||
| 517 | |||
| 518 | ret = 0; | ||
| 519 | out_brelse_bh2: | ||
| 520 | brelse(bh2); | ||
| 521 | out_brelse_bh: | ||
| 522 | brelse(bh); | ||
| 523 | end: | ||
| 524 | return ret; | ||
| 525 | } | ||
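A worked example of the shift arithmetic above, with illustrative numbers:

/* e.g. s_blocksize = 2048 and s_sys_blocksize = 512 (example values) */
int shift = get_bitmask_order(2048) - get_bitmask_order(512);	/* 12 - 10 = 2 */
sector_t blk = (sector_t)10 << shift;	/* cluster 10 -> device block 40 */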
| 526 | |||
| 527 | static int omfs_get_sb(struct file_system_type *fs_type, | ||
| 528 | int flags, const char *dev_name, | ||
| 529 | void *data, struct vfsmount *m) | ||
| 530 | { | ||
| 531 | return get_sb_bdev(fs_type, flags, dev_name, data, omfs_fill_super, m); | ||
| 532 | } | ||
| 533 | |||
| 534 | static struct file_system_type omfs_fs_type = { | ||
| 535 | .owner = THIS_MODULE, | ||
| 536 | .name = "omfs", | ||
| 537 | .get_sb = omfs_get_sb, | ||
| 538 | .kill_sb = kill_block_super, | ||
| 539 | .fs_flags = FS_REQUIRES_DEV, | ||
| 540 | }; | ||
| 541 | |||
| 542 | static int __init init_omfs_fs(void) | ||
| 543 | { | ||
| 544 | return register_filesystem(&omfs_fs_type); | ||
| 545 | } | ||
| 546 | |||
| 547 | static void __exit exit_omfs_fs(void) | ||
| 548 | { | ||
| 549 | unregister_filesystem(&omfs_fs_type); | ||
| 550 | } | ||
| 551 | |||
| 552 | module_init(init_omfs_fs); | ||
| 553 | module_exit(exit_omfs_fs); | ||
diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h new file mode 100644 index 000000000000..2bc0f0670406 --- /dev/null +++ b/fs/omfs/omfs.h | |||
| @@ -0,0 +1,67 @@ | |||
| 1 | #ifndef _OMFS_H | ||
| 2 | #define _OMFS_H | ||
| 3 | |||
| 4 | #include <linux/module.h> | ||
| 5 | #include <linux/fs.h> | ||
| 6 | |||
| 7 | #include "omfs_fs.h" | ||
| 8 | |||
| 9 | /* In-memory structures */ | ||
| 10 | struct omfs_sb_info { | ||
| 11 | u64 s_num_blocks; | ||
| 12 | u64 s_bitmap_ino; | ||
| 13 | u64 s_root_ino; | ||
| 14 | u32 s_blocksize; | ||
| 15 | u32 s_mirrors; | ||
| 16 | u32 s_sys_blocksize; | ||
| 17 | u32 s_clustersize; | ||
| 18 | int s_block_shift; | ||
| 19 | unsigned long **s_imap; | ||
| 20 | int s_imap_size; | ||
| 21 | struct mutex s_bitmap_lock; | ||
| 22 | int s_uid; | ||
| 23 | int s_gid; | ||
| 24 | int s_dmask; | ||
| 25 | int s_fmask; | ||
| 26 | }; | ||
| 27 | |||
| 28 | /* convert a cluster number to a scaled block number */ | ||
| 29 | static inline sector_t clus_to_blk(struct omfs_sb_info *sbi, sector_t block) | ||
| 30 | { | ||
| 31 | return block << sbi->s_block_shift; | ||
| 32 | } | ||
| 33 | |||
| 34 | static inline struct omfs_sb_info *OMFS_SB(struct super_block *sb) | ||
| 35 | { | ||
| 36 | return sb->s_fs_info; | ||
| 37 | } | ||
| 38 | |||
| 39 | /* bitmap.c */ | ||
| 40 | extern unsigned long omfs_count_free(struct super_block *sb); | ||
| 41 | extern int omfs_allocate_block(struct super_block *sb, u64 block); | ||
| 42 | extern int omfs_allocate_range(struct super_block *sb, int min_request, | ||
| 43 | int max_request, u64 *return_block, int *return_size); | ||
| 44 | extern int omfs_clear_range(struct super_block *sb, u64 block, int count); | ||
| 45 | |||
| 46 | /* dir.c */ | ||
| 47 | extern struct file_operations omfs_dir_operations; | ||
| 48 | extern struct inode_operations omfs_dir_inops; | ||
| 49 | extern int omfs_make_empty(struct inode *inode, struct super_block *sb); | ||
| 50 | extern int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header, | ||
| 51 | u64 fsblock); | ||
| 52 | |||
| 53 | /* file.c */ | ||
| 54 | extern struct file_operations omfs_file_operations; | ||
| 55 | extern struct inode_operations omfs_file_inops; | ||
| 56 | extern struct address_space_operations omfs_aops; | ||
| 57 | extern void omfs_make_empty_table(struct buffer_head *bh, int offset); | ||
| 58 | extern int omfs_shrink_inode(struct inode *inode); | ||
| 59 | |||
| 60 | /* inode.c */ | ||
| 61 | extern struct inode *omfs_iget(struct super_block *sb, ino_t inode); | ||
| 62 | extern struct inode *omfs_new_inode(struct inode *dir, int mode); | ||
| 63 | extern int omfs_reserve_block(struct super_block *sb, sector_t block); | ||
| 64 | extern int omfs_find_empty_block(struct super_block *sb, int mode, ino_t *ino); | ||
| 65 | extern int omfs_sync_inode(struct inode *inode); | ||
| 66 | |||
| 67 | #endif | ||
diff --git a/fs/omfs/omfs_fs.h b/fs/omfs/omfs_fs.h new file mode 100644 index 000000000000..12cca245d6e8 --- /dev/null +++ b/fs/omfs/omfs_fs.h | |||
| @@ -0,0 +1,80 @@ | |||
| 1 | #ifndef _OMFS_FS_H | ||
| 2 | #define _OMFS_FS_H | ||
| 3 | |||
| 4 | /* OMFS On-disk structures */ | ||
| 5 | |||
| 6 | #define OMFS_MAGIC 0xC2993D87 | ||
| 7 | #define OMFS_IMAGIC 0xD2 | ||
| 8 | |||
| 9 | #define OMFS_DIR 'D' | ||
| 10 | #define OMFS_FILE 'F' | ||
| 11 | #define OMFS_INODE_NORMAL 'e' | ||
| 12 | #define OMFS_INODE_CONTINUATION 'c' | ||
| 13 | #define OMFS_INODE_SYSTEM 's' | ||
| 14 | #define OMFS_NAMELEN 256 | ||
| 15 | #define OMFS_DIR_START 0x1b8 | ||
| 16 | #define OMFS_EXTENT_START 0x1d0 | ||
| 17 | #define OMFS_EXTENT_CONT 0x40 | ||
| 18 | #define OMFS_XOR_COUNT 19 | ||
| 19 | #define OMFS_MAX_BLOCK_SIZE 8192 | ||
| 20 | |||
| 21 | struct omfs_super_block { | ||
| 22 | char s_fill1[256]; | ||
| 23 | __be64 s_root_block; /* block number of omfs_root_block */ | ||
| 24 | __be64 s_num_blocks; /* total number of FS blocks */ | ||
| 25 | __be32 s_magic; /* OMFS_MAGIC */ | ||
| 26 | __be32 s_blocksize; /* size of a block */ | ||
| 27 | __be32 s_mirrors; /* # of mirrors of system blocks */ | ||
| 28 | __be32 s_sys_blocksize; /* size of non-data blocks */ | ||
| 29 | }; | ||
| 30 | |||
| 31 | struct omfs_header { | ||
| 32 | __be64 h_self; /* FS block where this is located */ | ||
| 33 | __be32 h_body_size; /* size of useful data after header */ | ||
| 34 | __be16 h_crc; /* crc-ccitt of body_size bytes */ | ||
| 35 | char h_fill1[2]; | ||
| 36 | u8 h_version; /* version, always 1 */ | ||
| 37 | char h_type; /* OMFS_INODE_X */ | ||
| 38 | u8 h_magic; /* OMFS_IMAGIC */ | ||
| 39 | u8 h_check_xor; /* XOR of header bytes before this */ | ||
| 40 | __be32 h_fill2; | ||
| 41 | }; | ||
| 42 | |||
| 43 | struct omfs_root_block { | ||
| 44 | struct omfs_header r_head; /* header */ | ||
| 45 | __be64 r_fill1; | ||
| 46 | __be64 r_num_blocks; /* total number of FS blocks */ | ||
| 47 | __be64 r_root_dir; /* block # of root directory */ | ||
| 48 | __be64 r_bitmap; /* block # of free space bitmap */ | ||
| 49 | __be32 r_blocksize; /* size of a block */ | ||
| 50 | __be32 r_clustersize; /* size allocated for data blocks */ | ||
| 51 | __be64 r_mirrors; /* # of mirrors of system blocks */ | ||
| 52 | char r_name[OMFS_NAMELEN]; /* partition label */ | ||
| 53 | }; | ||
| 54 | |||
| 55 | struct omfs_inode { | ||
| 56 | struct omfs_header i_head; /* header */ | ||
| 57 | __be64 i_parent; /* parent containing this inode */ | ||
| 58 | __be64 i_sibling; /* next inode in hash bucket */ | ||
| 59 | __be64 i_ctime; /* ctime, in milliseconds */ | ||
| 60 | char i_fill1[35]; | ||
| 61 | char i_type; /* OMFS_[DIR,FILE] */ | ||
| 62 | __be32 i_fill2; | ||
| 63 | char i_fill3[64]; | ||
| 64 | char i_name[OMFS_NAMELEN]; /* filename */ | ||
| 65 | __be64 i_size; /* size of file, in bytes */ | ||
| 66 | }; | ||
| 67 | |||
| 68 | struct omfs_extent_entry { | ||
| 69 | __be64 e_cluster; /* start location of a set of blocks */ | ||
| 70 | __be64 e_blocks; /* number of blocks after e_cluster */ | ||
| 71 | }; | ||
| 72 | |||
| 73 | struct omfs_extent { | ||
| 74 | __be64 e_next; /* next extent table location */ | ||
| 75 | __be32 e_extent_count; /* total # extents in this table */ | ||
| 76 | __be32 e_fill; | ||
| 77 | struct omfs_extent_entry e_entry; /* start of extent entries */ | ||
| 78 | }; | ||
| 79 | |||
| 80 | #endif | ||
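One check these header fields imply, sketched here as an illustration rather than taken from the driver itself: h_check_xor should equal the XOR of the OMFS_XOR_COUNT (19) bytes that precede it in the header.

/* Sketch: verify the header's running XOR (assumed semantics of h_check_xor). */
static bool omfs_header_xor_ok(const struct omfs_header *h)
{
	const u8 *buf = (const u8 *)h;
	u8 xor = 0;
	int i;

	for (i = 0; i < OMFS_XOR_COUNT; i++)
		xor ^= buf[i];
	return xor == h->h_check_xor;
}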
diff --git a/fs/open.c b/fs/open.c --- a/fs/open.c +++ b/fs/open.c | |||
| @@ -122,37 +122,37 @@ static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf) | |||
| 122 | return 0; | 122 | return 0; |
| 123 | } | 123 | } |
| 124 | 124 | ||
| 125 | asmlinkage long sys_statfs(const char __user * path, struct statfs __user * buf) | 125 | asmlinkage long sys_statfs(const char __user *pathname, struct statfs __user * buf) |
| 126 | { | 126 | { |
| 127 | struct nameidata nd; | 127 | struct path path; |
| 128 | int error; | 128 | int error; |
| 129 | 129 | ||
| 130 | error = user_path_walk(path, &nd); | 130 | error = user_path(pathname, &path); |
| 131 | if (!error) { | 131 | if (!error) { |
| 132 | struct statfs tmp; | 132 | struct statfs tmp; |
| 133 | error = vfs_statfs_native(nd.path.dentry, &tmp); | 133 | error = vfs_statfs_native(path.dentry, &tmp); |
| 134 | if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) | 134 | if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) |
| 135 | error = -EFAULT; | 135 | error = -EFAULT; |
| 136 | path_put(&nd.path); | 136 | path_put(&path); |
| 137 | } | 137 | } |
| 138 | return error; | 138 | return error; |
| 139 | } | 139 | } |
| 140 | 140 | ||
| 141 | 141 | ||
| 142 | asmlinkage long sys_statfs64(const char __user *path, size_t sz, struct statfs64 __user *buf) | 142 | asmlinkage long sys_statfs64(const char __user *pathname, size_t sz, struct statfs64 __user *buf) |
| 143 | { | 143 | { |
| 144 | struct nameidata nd; | 144 | struct path path; |
| 145 | long error; | 145 | long error; |
| 146 | 146 | ||
| 147 | if (sz != sizeof(*buf)) | 147 | if (sz != sizeof(*buf)) |
| 148 | return -EINVAL; | 148 | return -EINVAL; |
| 149 | error = user_path_walk(path, &nd); | 149 | error = user_path(pathname, &path); |
| 150 | if (!error) { | 150 | if (!error) { |
| 151 | struct statfs64 tmp; | 151 | struct statfs64 tmp; |
| 152 | error = vfs_statfs64(nd.path.dentry, &tmp); | 152 | error = vfs_statfs64(path.dentry, &tmp); |
| 153 | if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) | 153 | if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) |
| 154 | error = -EFAULT; | 154 | error = -EFAULT; |
| 155 | path_put(&nd.path); | 155 | path_put(&path); |
| 156 | } | 156 | } |
| 157 | return error; | 157 | return error; |
| 158 | } | 158 | } |
| @@ -223,20 +223,20 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, | |||
| 223 | return err; | 223 | return err; |
| 224 | } | 224 | } |
| 225 | 225 | ||
| 226 | static long do_sys_truncate(const char __user * path, loff_t length) | 226 | static long do_sys_truncate(const char __user *pathname, loff_t length) |
| 227 | { | 227 | { |
| 228 | struct nameidata nd; | 228 | struct path path; |
| 229 | struct inode * inode; | 229 | struct inode *inode; |
| 230 | int error; | 230 | int error; |
| 231 | 231 | ||
| 232 | error = -EINVAL; | 232 | error = -EINVAL; |
| 233 | if (length < 0) /* sorry, but loff_t says... */ | 233 | if (length < 0) /* sorry, but loff_t says... */ |
| 234 | goto out; | 234 | goto out; |
| 235 | 235 | ||
| 236 | error = user_path_walk(path, &nd); | 236 | error = user_path(pathname, &path); |
| 237 | if (error) | 237 | if (error) |
| 238 | goto out; | 238 | goto out; |
| 239 | inode = nd.path.dentry->d_inode; | 239 | inode = path.dentry->d_inode; |
| 240 | 240 | ||
| 241 | /* For directories it's -EISDIR, for other non-regulars - -EINVAL */ | 241 | /* For directories it's -EISDIR, for other non-regulars - -EINVAL */ |
| 242 | error = -EISDIR; | 242 | error = -EISDIR; |
| @@ -247,16 +247,16 @@ static long do_sys_truncate(const char __user * path, loff_t length) | |||
| 247 | if (!S_ISREG(inode->i_mode)) | 247 | if (!S_ISREG(inode->i_mode)) |
| 248 | goto dput_and_out; | 248 | goto dput_and_out; |
| 249 | 249 | ||
| 250 | error = mnt_want_write(nd.path.mnt); | 250 | error = mnt_want_write(path.mnt); |
| 251 | if (error) | 251 | if (error) |
| 252 | goto dput_and_out; | 252 | goto dput_and_out; |
| 253 | 253 | ||
| 254 | error = vfs_permission(&nd, MAY_WRITE); | 254 | error = inode_permission(inode, MAY_WRITE); |
| 255 | if (error) | 255 | if (error) |
| 256 | goto mnt_drop_write_and_out; | 256 | goto mnt_drop_write_and_out; |
| 257 | 257 | ||
| 258 | error = -EPERM; | 258 | error = -EPERM; |
| 259 | if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) | 259 | if (IS_APPEND(inode)) |
| 260 | goto mnt_drop_write_and_out; | 260 | goto mnt_drop_write_and_out; |
| 261 | 261 | ||
| 262 | error = get_write_access(inode); | 262 | error = get_write_access(inode); |
| @@ -274,15 +274,15 @@ static long do_sys_truncate(const char __user * path, loff_t length) | |||
| 274 | error = locks_verify_truncate(inode, NULL, length); | 274 | error = locks_verify_truncate(inode, NULL, length); |
| 275 | if (!error) { | 275 | if (!error) { |
| 276 | DQUOT_INIT(inode); | 276 | DQUOT_INIT(inode); |
| 277 | error = do_truncate(nd.path.dentry, length, 0, NULL); | 277 | error = do_truncate(path.dentry, length, 0, NULL); |
| 278 | } | 278 | } |
| 279 | 279 | ||
| 280 | put_write_and_out: | 280 | put_write_and_out: |
| 281 | put_write_access(inode); | 281 | put_write_access(inode); |
| 282 | mnt_drop_write_and_out: | 282 | mnt_drop_write_and_out: |
| 283 | mnt_drop_write(nd.path.mnt); | 283 | mnt_drop_write(path.mnt); |
| 284 | dput_and_out: | 284 | dput_and_out: |
| 285 | path_put(&nd.path); | 285 | path_put(&path); |
| 286 | out: | 286 | out: |
| 287 | return error; | 287 | return error; |
| 288 | } | 288 | } |
| @@ -425,7 +425,8 @@ out: | |||
| 425 | */ | 425 | */ |
| 426 | asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) | 426 | asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) |
| 427 | { | 427 | { |
| 428 | struct nameidata nd; | 428 | struct path path; |
| 429 | struct inode *inode; | ||
| 429 | int old_fsuid, old_fsgid; | 430 | int old_fsuid, old_fsgid; |
| 430 | kernel_cap_t uninitialized_var(old_cap); /* !SECURE_NO_SETUID_FIXUP */ | 431 | kernel_cap_t uninitialized_var(old_cap); /* !SECURE_NO_SETUID_FIXUP */ |
| 431 | int res; | 432 | int res; |
| @@ -448,7 +449,7 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) | |||
| 448 | * FIXME: There is a race here against sys_capset. The | 449 | * FIXME: There is a race here against sys_capset. The |
| 449 | * capabilities can change yet we will restore the old | 450 | * capabilities can change yet we will restore the old |
| 450 | * value below. We should hold task_capabilities_lock, | 451 | * value below. We should hold task_capabilities_lock, |
| 451 | * but we cannot because user_path_walk can sleep. | 452 | * but we cannot because user_path_at can sleep. |
| 452 | */ | 453 | */ |
| 453 | #endif /* ndef CONFIG_SECURITY_FILE_CAPABILITIES */ | 454 | #endif /* ndef CONFIG_SECURITY_FILE_CAPABILITIES */ |
| 454 | if (current->uid) | 455 | if (current->uid) |
| @@ -457,14 +458,25 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) | |||
| 457 | old_cap = cap_set_effective(current->cap_permitted); | 458 | old_cap = cap_set_effective(current->cap_permitted); |
| 458 | } | 459 | } |
| 459 | 460 | ||
| 460 | res = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd); | 461 | res = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); |
| 461 | if (res) | 462 | if (res) |
| 462 | goto out; | 463 | goto out; |
| 463 | 464 | ||
| 464 | res = vfs_permission(&nd, mode); | 465 | inode = path.dentry->d_inode; |
| 466 | |||
| 467 | if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) { | ||
| 468 | /* | ||
| 469 | * MAY_EXEC on regular files is denied if the fs is mounted | ||
| 470 | * with the "noexec" flag. | ||
| 471 | */ | ||
| 472 | res = -EACCES; | ||
| 473 | if (path.mnt->mnt_flags & MNT_NOEXEC) | ||
| 474 | goto out_path_release; | ||
| 475 | } | ||
| 476 | |||
| 477 | res = inode_permission(inode, mode | MAY_ACCESS); | ||
| 465 | /* SuS v2 requires we report a read only fs too */ | 478 | /* SuS v2 requires we report a read only fs too */ |
| 466 | if(res || !(mode & S_IWOTH) || | 479 | if (res || !(mode & S_IWOTH) || special_file(inode->i_mode)) |
| 467 | special_file(nd.path.dentry->d_inode->i_mode)) | ||
| 468 | goto out_path_release; | 480 | goto out_path_release; |
| 469 | /* | 481 | /* |
| 470 | * This is a rare case where using __mnt_is_readonly() | 482 | * This is a rare case where using __mnt_is_readonly() |
| @@ -476,11 +488,11 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) | |||
| 476 | * inherently racy and know that the fs may change | 488 | * inherently racy and know that the fs may change |
| 477 | * state before we even see this result. | 489 | * state before we even see this result. |
| 478 | */ | 490 | */ |
| 479 | if (__mnt_is_readonly(nd.path.mnt)) | 491 | if (__mnt_is_readonly(path.mnt)) |
| 480 | res = -EROFS; | 492 | res = -EROFS; |
| 481 | 493 | ||
| 482 | out_path_release: | 494 | out_path_release: |
| 483 | path_put(&nd.path); | 495 | path_put(&path); |
| 484 | out: | 496 | out: |
| 485 | current->fsuid = old_fsuid; | 497 | current->fsuid = old_fsuid; |
| 486 | current->fsgid = old_fsgid; | 498 | current->fsgid = old_fsgid; |
| @@ -498,22 +510,21 @@ asmlinkage long sys_access(const char __user *filename, int mode) | |||
| 498 | 510 | ||
| 499 | asmlinkage long sys_chdir(const char __user * filename) | 511 | asmlinkage long sys_chdir(const char __user * filename) |
| 500 | { | 512 | { |
| 501 | struct nameidata nd; | 513 | struct path path; |
| 502 | int error; | 514 | int error; |
| 503 | 515 | ||
| 504 | error = __user_walk(filename, | 516 | error = user_path_dir(filename, &path); |
| 505 | LOOKUP_FOLLOW|LOOKUP_DIRECTORY|LOOKUP_CHDIR, &nd); | ||
| 506 | if (error) | 517 | if (error) |
| 507 | goto out; | 518 | goto out; |
| 508 | 519 | ||
| 509 | error = vfs_permission(&nd, MAY_EXEC); | 520 | error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS); |
| 510 | if (error) | 521 | if (error) |
| 511 | goto dput_and_out; | 522 | goto dput_and_out; |
| 512 | 523 | ||
| 513 | set_fs_pwd(current->fs, &nd.path); | 524 | set_fs_pwd(current->fs, &path); |
| 514 | 525 | ||
| 515 | dput_and_out: | 526 | dput_and_out: |
| 516 | path_put(&nd.path); | 527 | path_put(&path); |
| 517 | out: | 528 | out: |
| 518 | return error; | 529 | return error; |
| 519 | } | 530 | } |
| @@ -535,7 +546,7 @@ asmlinkage long sys_fchdir(unsigned int fd) | |||
| 535 | if (!S_ISDIR(inode->i_mode)) | 546 | if (!S_ISDIR(inode->i_mode)) |
| 536 | goto out_putf; | 547 | goto out_putf; |
| 537 | 548 | ||
| 538 | error = file_permission(file, MAY_EXEC); | 549 | error = inode_permission(inode, MAY_EXEC | MAY_ACCESS); |
| 539 | if (!error) | 550 | if (!error) |
| 540 | set_fs_pwd(current->fs, &file->f_path); | 551 | set_fs_pwd(current->fs, &file->f_path); |
| 541 | out_putf: | 552 | out_putf: |
| @@ -546,14 +557,14 @@ out: | |||
| 546 | 557 | ||
| 547 | asmlinkage long sys_chroot(const char __user * filename) | 558 | asmlinkage long sys_chroot(const char __user * filename) |
| 548 | { | 559 | { |
| 549 | struct nameidata nd; | 560 | struct path path; |
| 550 | int error; | 561 | int error; |
| 551 | 562 | ||
| 552 | error = __user_walk(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); | 563 | error = user_path_dir(filename, &path); |
| 553 | if (error) | 564 | if (error) |
| 554 | goto out; | 565 | goto out; |
| 555 | 566 | ||
| 556 | error = vfs_permission(&nd, MAY_EXEC); | 567 | error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS); |
| 557 | if (error) | 568 | if (error) |
| 558 | goto dput_and_out; | 569 | goto dput_and_out; |
| 559 | 570 | ||
| @@ -561,11 +572,10 @@ asmlinkage long sys_chroot(const char __user * filename) | |||
| 561 | if (!capable(CAP_SYS_CHROOT)) | 572 | if (!capable(CAP_SYS_CHROOT)) |
| 562 | goto dput_and_out; | 573 | goto dput_and_out; |
| 563 | 574 | ||
| 564 | set_fs_root(current->fs, &nd.path); | 575 | set_fs_root(current->fs, &path); |
| 565 | set_fs_altroot(); | ||
| 566 | error = 0; | 576 | error = 0; |
| 567 | dput_and_out: | 577 | dput_and_out: |
| 568 | path_put(&nd.path); | 578 | path_put(&path); |
| 569 | out: | 579 | out: |
| 570 | return error; | 580 | return error; |
| 571 | } | 581 | } |
| @@ -590,9 +600,6 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) | |||
| 590 | err = mnt_want_write(file->f_path.mnt); | 600 | err = mnt_want_write(file->f_path.mnt); |
| 591 | if (err) | 601 | if (err) |
| 592 | goto out_putf; | 602 | goto out_putf; |
| 593 | err = -EPERM; | ||
| 594 | if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) | ||
| 595 | goto out_drop_write; | ||
| 596 | mutex_lock(&inode->i_mutex); | 603 | mutex_lock(&inode->i_mutex); |
| 597 | if (mode == (mode_t) -1) | 604 | if (mode == (mode_t) -1) |
| 598 | mode = inode->i_mode; | 605 | mode = inode->i_mode; |
| @@ -600,8 +607,6 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) | |||
| 600 | newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; | 607 | newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; |
| 601 | err = notify_change(dentry, &newattrs); | 608 | err = notify_change(dentry, &newattrs); |
| 602 | mutex_unlock(&inode->i_mutex); | 609 | mutex_unlock(&inode->i_mutex); |
| 603 | |||
| 604 | out_drop_write: | ||
| 605 | mnt_drop_write(file->f_path.mnt); | 610 | mnt_drop_write(file->f_path.mnt); |
| 606 | out_putf: | 611 | out_putf: |
| 607 | fput(file); | 612 | fput(file); |
| @@ -612,36 +617,29 @@ out: | |||
| 612 | asmlinkage long sys_fchmodat(int dfd, const char __user *filename, | 617 | asmlinkage long sys_fchmodat(int dfd, const char __user *filename, |
| 613 | mode_t mode) | 618 | mode_t mode) |
| 614 | { | 619 | { |
| 615 | struct nameidata nd; | 620 | struct path path; |
| 616 | struct inode * inode; | 621 | struct inode *inode; |
| 617 | int error; | 622 | int error; |
| 618 | struct iattr newattrs; | 623 | struct iattr newattrs; |
| 619 | 624 | ||
| 620 | error = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW, &nd); | 625 | error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); |
| 621 | if (error) | 626 | if (error) |
| 622 | goto out; | 627 | goto out; |
| 623 | inode = nd.path.dentry->d_inode; | 628 | inode = path.dentry->d_inode; |
| 624 | 629 | ||
| 625 | error = mnt_want_write(nd.path.mnt); | 630 | error = mnt_want_write(path.mnt); |
| 626 | if (error) | 631 | if (error) |
| 627 | goto dput_and_out; | 632 | goto dput_and_out; |
| 628 | |||
| 629 | error = -EPERM; | ||
| 630 | if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) | ||
| 631 | goto out_drop_write; | ||
| 632 | |||
| 633 | mutex_lock(&inode->i_mutex); | 633 | mutex_lock(&inode->i_mutex); |
| 634 | if (mode == (mode_t) -1) | 634 | if (mode == (mode_t) -1) |
| 635 | mode = inode->i_mode; | 635 | mode = inode->i_mode; |
| 636 | newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); | 636 | newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); |
| 637 | newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; | 637 | newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; |
| 638 | error = notify_change(nd.path.dentry, &newattrs); | 638 | error = notify_change(path.dentry, &newattrs); |
| 639 | mutex_unlock(&inode->i_mutex); | 639 | mutex_unlock(&inode->i_mutex); |
| 640 | 640 | mnt_drop_write(path.mnt); | |
| 641 | out_drop_write: | ||
| 642 | mnt_drop_write(nd.path.mnt); | ||
| 643 | dput_and_out: | 641 | dput_and_out: |
| 644 | path_put(&nd.path); | 642 | path_put(&path); |
| 645 | out: | 643 | out: |
| 646 | return error; | 644 | return error; |
| 647 | } | 645 | } |
| @@ -653,18 +651,10 @@ asmlinkage long sys_chmod(const char __user *filename, mode_t mode) | |||
| 653 | 651 | ||
| 654 | static int chown_common(struct dentry * dentry, uid_t user, gid_t group) | 652 | static int chown_common(struct dentry * dentry, uid_t user, gid_t group) |
| 655 | { | 653 | { |
| 656 | struct inode * inode; | 654 | struct inode *inode = dentry->d_inode; |
| 657 | int error; | 655 | int error; |
| 658 | struct iattr newattrs; | 656 | struct iattr newattrs; |
| 659 | 657 | ||
| 660 | error = -ENOENT; | ||
| 661 | if (!(inode = dentry->d_inode)) { | ||
| 662 | printk(KERN_ERR "chown_common: NULL inode\n"); | ||
| 663 | goto out; | ||
| 664 | } | ||
| 665 | error = -EPERM; | ||
| 666 | if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) | ||
| 667 | goto out; | ||
| 668 | newattrs.ia_valid = ATTR_CTIME; | 658 | newattrs.ia_valid = ATTR_CTIME; |
| 669 | if (user != (uid_t) -1) { | 659 | if (user != (uid_t) -1) { |
| 670 | newattrs.ia_valid |= ATTR_UID; | 660 | newattrs.ia_valid |= ATTR_UID; |
| @@ -680,25 +670,25 @@ static int chown_common(struct dentry * dentry, uid_t user, gid_t group) | |||
| 680 | mutex_lock(&inode->i_mutex); | 670 | mutex_lock(&inode->i_mutex); |
| 681 | error = notify_change(dentry, &newattrs); | 671 | error = notify_change(dentry, &newattrs); |
| 682 | mutex_unlock(&inode->i_mutex); | 672 | mutex_unlock(&inode->i_mutex); |
| 683 | out: | 673 | |
| 684 | return error; | 674 | return error; |
| 685 | } | 675 | } |
| 686 | 676 | ||
| 687 | asmlinkage long sys_chown(const char __user * filename, uid_t user, gid_t group) | 677 | asmlinkage long sys_chown(const char __user * filename, uid_t user, gid_t group) |
| 688 | { | 678 | { |
| 689 | struct nameidata nd; | 679 | struct path path; |
| 690 | int error; | 680 | int error; |
| 691 | 681 | ||
| 692 | error = user_path_walk(filename, &nd); | 682 | error = user_path(filename, &path); |
| 693 | if (error) | 683 | if (error) |
| 694 | goto out; | 684 | goto out; |
| 695 | error = mnt_want_write(nd.path.mnt); | 685 | error = mnt_want_write(path.mnt); |
| 696 | if (error) | 686 | if (error) |
| 697 | goto out_release; | 687 | goto out_release; |
| 698 | error = chown_common(nd.path.dentry, user, group); | 688 | error = chown_common(path.dentry, user, group); |
| 699 | mnt_drop_write(nd.path.mnt); | 689 | mnt_drop_write(path.mnt); |
| 700 | out_release: | 690 | out_release: |
| 701 | path_put(&nd.path); | 691 | path_put(&path); |
| 702 | out: | 692 | out: |
| 703 | return error; | 693 | return error; |
| 704 | } | 694 | } |
| @@ -706,7 +696,7 @@ out: | |||
| 706 | asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user, | 696 | asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user, |
| 707 | gid_t group, int flag) | 697 | gid_t group, int flag) |
| 708 | { | 698 | { |
| 709 | struct nameidata nd; | 699 | struct path path; |
| 710 | int error = -EINVAL; | 700 | int error = -EINVAL; |
| 711 | int follow; | 701 | int follow; |
| 712 | 702 | ||
| @@ -714,35 +704,35 @@ asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user, | |||
| 714 | goto out; | 704 | goto out; |
| 715 | 705 | ||
| 716 | follow = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; | 706 | follow = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; |
| 717 | error = __user_walk_fd(dfd, filename, follow, &nd); | 707 | error = user_path_at(dfd, filename, follow, &path); |
| 718 | if (error) | 708 | if (error) |
| 719 | goto out; | 709 | goto out; |
| 720 | error = mnt_want_write(nd.path.mnt); | 710 | error = mnt_want_write(path.mnt); |
| 721 | if (error) | 711 | if (error) |
| 722 | goto out_release; | 712 | goto out_release; |
| 723 | error = chown_common(nd.path.dentry, user, group); | 713 | error = chown_common(path.dentry, user, group); |
| 724 | mnt_drop_write(nd.path.mnt); | 714 | mnt_drop_write(path.mnt); |
| 725 | out_release: | 715 | out_release: |
| 726 | path_put(&nd.path); | 716 | path_put(&path); |
| 727 | out: | 717 | out: |
| 728 | return error; | 718 | return error; |
| 729 | } | 719 | } |
| 730 | 720 | ||
| 731 | asmlinkage long sys_lchown(const char __user * filename, uid_t user, gid_t group) | 721 | asmlinkage long sys_lchown(const char __user * filename, uid_t user, gid_t group) |
| 732 | { | 722 | { |
| 733 | struct nameidata nd; | 723 | struct path path; |
| 734 | int error; | 724 | int error; |
| 735 | 725 | ||
| 736 | error = user_path_walk_link(filename, &nd); | 726 | error = user_lpath(filename, &path); |
| 737 | if (error) | 727 | if (error) |
| 738 | goto out; | 728 | goto out; |
| 739 | error = mnt_want_write(nd.path.mnt); | 729 | error = mnt_want_write(path.mnt); |
| 740 | if (error) | 730 | if (error) |
| 741 | goto out_release; | 731 | goto out_release; |
| 742 | error = chown_common(nd.path.dentry, user, group); | 732 | error = chown_common(path.dentry, user, group); |
| 743 | mnt_drop_write(nd.path.mnt); | 733 | mnt_drop_write(path.mnt); |
| 744 | out_release: | 734 | out_release: |
| 745 | path_put(&nd.path); | 735 | path_put(&path); |
| 746 | out: | 736 | out: |
| 747 | return error; | 737 | return error; |
| 748 | } | 738 | } |
| @@ -973,71 +963,6 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) | |||
| 973 | } | 963 | } |
| 974 | EXPORT_SYMBOL(dentry_open); | 964 | EXPORT_SYMBOL(dentry_open); |
| 975 | 965 | ||
| 976 | /* | ||
| 977 | * Find an empty file descriptor entry, and mark it busy. | ||
| 978 | */ | ||
| 979 | int get_unused_fd_flags(int flags) | ||
| 980 | { | ||
| 981 | struct files_struct * files = current->files; | ||
| 982 | int fd, error; | ||
| 983 | struct fdtable *fdt; | ||
| 984 | |||
| 985 | error = -EMFILE; | ||
| 986 | spin_lock(&files->file_lock); | ||
| 987 | |||
| 988 | repeat: | ||
| 989 | fdt = files_fdtable(files); | ||
| 990 | fd = find_next_zero_bit(fdt->open_fds->fds_bits, fdt->max_fds, | ||
| 991 | files->next_fd); | ||
| 992 | |||
| 993 | /* | ||
| 994 | * N.B. For clone tasks sharing a files structure, this test | ||
| 995 | * will limit the total number of files that can be opened. | ||
| 996 | */ | ||
| 997 | if (fd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) | ||
| 998 | goto out; | ||
| 999 | |||
| 1000 | /* Do we need to expand the fd array or fd set? */ | ||
| 1001 | error = expand_files(files, fd); | ||
| 1002 | if (error < 0) | ||
| 1003 | goto out; | ||
| 1004 | |||
| 1005 | if (error) { | ||
| 1006 | /* | ||
| 1007 | * If we needed to expand the fd array we | ||
| 1008 | * might have blocked - try again. | ||
| 1009 | */ | ||
| 1010 | error = -EMFILE; | ||
| 1011 | goto repeat; | ||
| 1012 | } | ||
| 1013 | |||
| 1014 | FD_SET(fd, fdt->open_fds); | ||
| 1015 | if (flags & O_CLOEXEC) | ||
| 1016 | FD_SET(fd, fdt->close_on_exec); | ||
| 1017 | else | ||
| 1018 | FD_CLR(fd, fdt->close_on_exec); | ||
| 1019 | files->next_fd = fd + 1; | ||
| 1020 | #if 1 | ||
| 1021 | /* Sanity check */ | ||
| 1022 | if (fdt->fd[fd] != NULL) { | ||
| 1023 | printk(KERN_WARNING "get_unused_fd: slot %d not NULL!\n", fd); | ||
| 1024 | fdt->fd[fd] = NULL; | ||
| 1025 | } | ||
| 1026 | #endif | ||
| 1027 | error = fd; | ||
| 1028 | |||
| 1029 | out: | ||
| 1030 | spin_unlock(&files->file_lock); | ||
| 1031 | return error; | ||
| 1032 | } | ||
| 1033 | |||
| 1034 | int get_unused_fd(void) | ||
| 1035 | { | ||
| 1036 | return get_unused_fd_flags(0); | ||
| 1037 | } | ||
| 1038 | |||
| 1039 | EXPORT_SYMBOL(get_unused_fd); | ||
| 1040 | |||
| 1041 | static void __put_unused_fd(struct files_struct *files, unsigned int fd) | 966 | static void __put_unused_fd(struct files_struct *files, unsigned int fd) |
| 1042 | { | 967 | { |
| 1043 | struct fdtable *fdt = files_fdtable(files); | 968 | struct fdtable *fdt = files_fdtable(files); |
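The conversions in this file all follow one pattern: user_path()/user_path_at() resolve the user-supplied name straight into a struct path, permission checks go through inode_permission() on path.dentry->d_inode, and path_put() drops the reference. A minimal sketch of the new idiom (the function itself is illustrative):

static long example_path_op(const char __user *name)
{
	struct path path;
	int error;

	error = user_path(name, &path);		/* LOOKUP_FOLLOW lookup */
	if (error)
		return error;

	error = inode_permission(path.dentry->d_inode, MAY_READ);

	path_put(&path);			/* drop dentry + vfsmount refs */
	return error;
}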
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index d17b4fd204e1..9f5b054f06b9 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c | |||
| @@ -430,7 +430,7 @@ static struct file_system_type openprom_fs_type = { | |||
| 430 | .kill_sb = kill_anon_super, | 430 | .kill_sb = kill_anon_super, |
| 431 | }; | 431 | }; |
| 432 | 432 | ||
| 433 | static void op_inode_init_once(struct kmem_cache * cachep, void *data) | 433 | static void op_inode_init_once(void *data) |
| 434 | { | 434 | { |
| 435 | struct op_inode_info *oi = (struct op_inode_info *) data; | 435 | struct op_inode_info *oi = (struct op_inode_info *) data; |
| 436 | 436 | ||
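The new ctor prototype reflects the slab API here: constructors get just the object pointer. A hedged sketch of a typical cache setup under that signature (the struct, cache name, and flags are illustrative, not necessarily openpromfs's own):

struct example_obj {
	struct inode vfs_inode;		/* usual embedded-inode layout */
};

static struct kmem_cache *example_cachep;

static void example_init_once(void *data)	/* note: single argument */
{
	struct example_obj *obj = data;

	inode_init_once(&obj->vfs_inode);
}

static int __init example_cache_init(void)
{
	example_cachep = kmem_cache_create("example_cache",
			sizeof(struct example_obj), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
			example_init_once);
	return example_cachep ? 0 : -ENOMEM;
}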
diff --git a/fs/partitions/check.c b/fs/partitions/check.c index efef715135d3..ecc3330972e5 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c | |||
| @@ -344,18 +344,18 @@ static ssize_t whole_disk_show(struct device *dev, | |||
| 344 | static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH, | 344 | static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH, |
| 345 | whole_disk_show, NULL); | 345 | whole_disk_show, NULL); |
| 346 | 346 | ||
| 347 | void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, int flags) | 347 | int add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, int flags) |
| 348 | { | 348 | { |
| 349 | struct hd_struct *p; | 349 | struct hd_struct *p; |
| 350 | int err; | 350 | int err; |
| 351 | 351 | ||
| 352 | p = kzalloc(sizeof(*p), GFP_KERNEL); | 352 | p = kzalloc(sizeof(*p), GFP_KERNEL); |
| 353 | if (!p) | 353 | if (!p) |
| 354 | return; | 354 | return -ENOMEM; |
| 355 | 355 | ||
| 356 | if (!init_part_stats(p)) { | 356 | if (!init_part_stats(p)) { |
| 357 | kfree(p); | 357 | err = -ENOMEM; |
| 358 | return; | 358 | goto out0; |
| 359 | } | 359 | } |
| 360 | p->start_sect = start; | 360 | p->start_sect = start; |
| 361 | p->nr_sects = len; | 361 | p->nr_sects = len; |
| @@ -378,15 +378,31 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, | |||
| 378 | 378 | ||
| 379 | /* delay uevent until 'holders' subdir is created */ | 379 | /* delay uevent until 'holders' subdir is created */ |
| 380 | p->dev.uevent_suppress = 1; | 380 | p->dev.uevent_suppress = 1; |
| 381 | device_add(&p->dev); | 381 | err = device_add(&p->dev); |
| 382 | if (err) | ||
| 383 | goto out1; | ||
| 382 | partition_sysfs_add_subdir(p); | 384 | partition_sysfs_add_subdir(p); |
| 383 | p->dev.uevent_suppress = 0; | 385 | p->dev.uevent_suppress = 0; |
| 384 | if (flags & ADDPART_FLAG_WHOLEDISK) | 386 | if (flags & ADDPART_FLAG_WHOLEDISK) { |
| 385 | err = device_create_file(&p->dev, &dev_attr_whole_disk); | 387 | err = device_create_file(&p->dev, &dev_attr_whole_disk); |
| 388 | if (err) | ||
| 389 | goto out2; | ||
| 390 | } | ||
| 386 | 391 | ||
| 387 | /* suppress uevent if the disk suppresses it */ | 392 | /* suppress uevent if the disk suppresses it */
| 388 | if (!disk->dev.uevent_suppress) | 393 | if (!disk->dev.uevent_suppress) |
| 389 | kobject_uevent(&p->dev.kobj, KOBJ_ADD); | 394 | kobject_uevent(&p->dev.kobj, KOBJ_ADD); |
| 395 | |||
| 396 | return 0; | ||
| 397 | |||
| 398 | out2: | ||
| 399 | device_del(&p->dev); | ||
| 400 | out1: | ||
| 401 | put_device(&p->dev); | ||
| 402 | free_part_stats(p); | ||
| 403 | out0: | ||
| 404 | kfree(p); | ||
| 405 | return err; | ||
| 390 | } | 406 | } |
| 391 | 407 | ||
| 392 | /* Not exported, helper to add_disk(). */ | 408 | /* Not exported, helper to add_disk(). */ |
| @@ -483,10 +499,16 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) | |||
| 483 | if (!size) | 499 | if (!size) |
| 484 | continue; | 500 | continue; |
| 485 | if (from + size > get_capacity(disk)) { | 501 | if (from + size > get_capacity(disk)) { |
| 486 | printk(" %s: p%d exceeds device capacity\n", | 502 | printk(KERN_WARNING |
| 503 | "%s: p%d exceeds device capacity\n", | ||
| 487 | disk->disk_name, p); | 504 | disk->disk_name, p); |
| 488 | } | 505 | } |
| 489 | add_partition(disk, p, from, size, state->parts[p].flags); | 506 | res = add_partition(disk, p, from, size, state->parts[p].flags); |
| 507 | if (res) { | ||
| 508 | printk(KERN_ERR " %s: p%d could not be added: %d\n", | ||
| 509 | disk->disk_name, p, -res); | ||
| 510 | continue; | ||
| 511 | } | ||
| 490 | #ifdef CONFIG_BLK_DEV_MD | 512 | #ifdef CONFIG_BLK_DEV_MD |
| 491 | if (state->parts[p].flags & ADDPART_FLAG_RAID) | 513 | if (state->parts[p].flags & ADDPART_FLAG_RAID) |
| 492 | md_autodetect_dev(bdev->bd_dev+p); | 514 | md_autodetect_dev(bdev->bd_dev+p); |
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c index e7b07006bc41..038a6022152f 100644 --- a/fs/partitions/efi.c +++ b/fs/partitions/efi.c | |||
| @@ -95,13 +95,6 @@ | |||
| 95 | #include "check.h" | 95 | #include "check.h" |
| 96 | #include "efi.h" | 96 | #include "efi.h" |
| 97 | 97 | ||
| 98 | #undef EFI_DEBUG | ||
| 99 | #ifdef EFI_DEBUG | ||
| 100 | #define Dprintk(x...) printk(KERN_DEBUG x) | ||
| 101 | #else | ||
| 102 | #define Dprintk(x...) | ||
| 103 | #endif | ||
| 104 | |||
| 105 | /* This allows a kernel command line option 'gpt' to override | 98 | /* This allows a kernel command line option 'gpt' to override |
| 106 | * the test for invalid PMBR. Not __initdata because reloading | 99 | * the test for invalid PMBR. Not __initdata because reloading |
| 107 | * the partition tables happens after init too. | 100 | * the partition tables happens after init too. |
| @@ -305,10 +298,10 @@ is_gpt_valid(struct block_device *bdev, u64 lba, | |||
| 305 | 298 | ||
| 306 | /* Check the GUID Partition Table signature */ | 299 | /* Check the GUID Partition Table signature */ |
| 307 | if (le64_to_cpu((*gpt)->signature) != GPT_HEADER_SIGNATURE) { | 300 | if (le64_to_cpu((*gpt)->signature) != GPT_HEADER_SIGNATURE) { |
| 308 | Dprintk("GUID Partition Table Header signature is wrong:" | 301 | pr_debug("GUID Partition Table Header signature is wrong:" |
| 309 | "%lld != %lld\n", | 302 | "%lld != %lld\n", |
| 310 | (unsigned long long)le64_to_cpu((*gpt)->signature), | 303 | (unsigned long long)le64_to_cpu((*gpt)->signature), |
| 311 | (unsigned long long)GPT_HEADER_SIGNATURE); | 304 | (unsigned long long)GPT_HEADER_SIGNATURE); |
| 312 | goto fail; | 305 | goto fail; |
| 313 | } | 306 | } |
| 314 | 307 | ||
| @@ -318,9 +311,8 @@ is_gpt_valid(struct block_device *bdev, u64 lba, | |||
| 318 | crc = efi_crc32((const unsigned char *) (*gpt), le32_to_cpu((*gpt)->header_size)); | 311 | crc = efi_crc32((const unsigned char *) (*gpt), le32_to_cpu((*gpt)->header_size)); |
| 319 | 312 | ||
| 320 | if (crc != origcrc) { | 313 | if (crc != origcrc) { |
| 321 | Dprintk | 314 | pr_debug("GUID Partition Table Header CRC is wrong: %x != %x\n", |
| 322 | ("GUID Partition Table Header CRC is wrong: %x != %x\n", | 315 | crc, origcrc); |
| 323 | crc, origcrc); | ||
| 324 | goto fail; | 316 | goto fail; |
| 325 | } | 317 | } |
| 326 | (*gpt)->header_crc32 = cpu_to_le32(origcrc); | 318 | (*gpt)->header_crc32 = cpu_to_le32(origcrc); |
| @@ -328,9 +320,9 @@ is_gpt_valid(struct block_device *bdev, u64 lba, | |||
| 328 | /* Check that the my_lba entry points to the LBA that contains | 320 | /* Check that the my_lba entry points to the LBA that contains |
| 329 | * the GUID Partition Table */ | 321 | * the GUID Partition Table */ |
| 330 | if (le64_to_cpu((*gpt)->my_lba) != lba) { | 322 | if (le64_to_cpu((*gpt)->my_lba) != lba) { |
| 331 | Dprintk("GPT my_lba incorrect: %lld != %lld\n", | 323 | pr_debug("GPT my_lba incorrect: %lld != %lld\n", |
| 332 | (unsigned long long)le64_to_cpu((*gpt)->my_lba), | 324 | (unsigned long long)le64_to_cpu((*gpt)->my_lba), |
| 333 | (unsigned long long)lba); | 325 | (unsigned long long)lba); |
| 334 | goto fail; | 326 | goto fail; |
| 335 | } | 327 | } |
| 336 | 328 | ||
| @@ -339,15 +331,15 @@ is_gpt_valid(struct block_device *bdev, u64 lba, | |||
| 339 | */ | 331 | */ |
| 340 | lastlba = last_lba(bdev); | 332 | lastlba = last_lba(bdev); |
| 341 | if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { | 333 | if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { |
| 342 | Dprintk("GPT: first_usable_lba incorrect: %lld > %lld\n", | 334 | pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n", |
| 343 | (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), | 335 | (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), |
| 344 | (unsigned long long)lastlba); | 336 | (unsigned long long)lastlba); |
| 345 | goto fail; | 337 | goto fail; |
| 346 | } | 338 | } |
| 347 | if (le64_to_cpu((*gpt)->last_usable_lba) > lastlba) { | 339 | if (le64_to_cpu((*gpt)->last_usable_lba) > lastlba) { |
| 348 | Dprintk("GPT: last_usable_lba incorrect: %lld > %lld\n", | 340 | pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n", |
| 349 | (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba), | 341 | (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba), |
| 350 | (unsigned long long)lastlba); | 342 | (unsigned long long)lastlba); |
| 351 | goto fail; | 343 | goto fail; |
| 352 | } | 344 | } |
| 353 | 345 | ||
| @@ -360,7 +352,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba, | |||
| 360 | le32_to_cpu((*gpt)->sizeof_partition_entry)); | 352 | le32_to_cpu((*gpt)->sizeof_partition_entry)); |
| 361 | 353 | ||
| 362 | if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) { | 354 | if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) { |
| 363 | Dprintk("GUID Partition Entry Array CRC check failed.\n"); | 355 | pr_debug("GUID Partition Entry Array CRC check failed.\n");
| 364 | goto fail_ptes; | 356 | goto fail_ptes; |
| 365 | } | 357 | } |
| 366 | 358 | ||
| @@ -616,7 +608,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev) | |||
| 616 | return 0; | 608 | return 0; |
| 617 | } | 609 | } |
| 618 | 610 | ||
| 619 | Dprintk("GUID Partition Table is valid! Yea!\n"); | 611 | pr_debug("GUID Partition Table is valid! Yea!\n"); |
| 620 | 612 | ||
| 621 | for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { | 613 | for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { |
| 622 | if (!is_pte_valid(&ptes[i], last_lba(bdev))) | 614 | if (!is_pte_valid(&ptes[i], last_lba(bdev))) |
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c index 0fdda2e8a4cc..8652fb99e962 100644 --- a/fs/partitions/ldm.c +++ b/fs/partitions/ldm.c | |||
| @@ -133,17 +133,17 @@ static bool ldm_parse_privhead(const u8 *data, struct privhead *ph) | |||
| 133 | bool is_vista = false; | 133 | bool is_vista = false; |
| 134 | 134 | ||
| 135 | BUG_ON(!data || !ph); | 135 | BUG_ON(!data || !ph); |
| 136 | if (MAGIC_PRIVHEAD != BE64(data)) { | 136 | if (MAGIC_PRIVHEAD != get_unaligned_be64(data)) { |
| 137 | ldm_error("Cannot find PRIVHEAD structure. LDM database is" | 137 | ldm_error("Cannot find PRIVHEAD structure. LDM database is" |
| 138 | " corrupt. Aborting."); | 138 | " corrupt. Aborting."); |
| 139 | return false; | 139 | return false; |
| 140 | } | 140 | } |
| 141 | ph->ver_major = BE16(data + 0x000C); | 141 | ph->ver_major = get_unaligned_be16(data + 0x000C); |
| 142 | ph->ver_minor = BE16(data + 0x000E); | 142 | ph->ver_minor = get_unaligned_be16(data + 0x000E); |
| 143 | ph->logical_disk_start = BE64(data + 0x011B); | 143 | ph->logical_disk_start = get_unaligned_be64(data + 0x011B); |
| 144 | ph->logical_disk_size = BE64(data + 0x0123); | 144 | ph->logical_disk_size = get_unaligned_be64(data + 0x0123); |
| 145 | ph->config_start = BE64(data + 0x012B); | 145 | ph->config_start = get_unaligned_be64(data + 0x012B); |
| 146 | ph->config_size = BE64(data + 0x0133); | 146 | ph->config_size = get_unaligned_be64(data + 0x0133); |
| 147 | /* Version 2.11 is Win2k/XP and version 2.12 is Vista. */ | 147 | /* Version 2.11 is Win2k/XP and version 2.12 is Vista. */ |
| 148 | if (ph->ver_major == 2 && ph->ver_minor == 12) | 148 | if (ph->ver_major == 2 && ph->ver_minor == 12) |
| 149 | is_vista = true; | 149 | is_vista = true; |
| @@ -191,14 +191,14 @@ static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc) | |||
| 191 | { | 191 | { |
| 192 | BUG_ON (!data || !toc); | 192 | BUG_ON (!data || !toc); |
| 193 | 193 | ||
| 194 | if (MAGIC_TOCBLOCK != BE64 (data)) { | 194 | if (MAGIC_TOCBLOCK != get_unaligned_be64(data)) { |
| 195 | ldm_crit ("Cannot find TOCBLOCK, database may be corrupt."); | 195 | ldm_crit ("Cannot find TOCBLOCK, database may be corrupt."); |
| 196 | return false; | 196 | return false; |
| 197 | } | 197 | } |
| 198 | strncpy (toc->bitmap1_name, data + 0x24, sizeof (toc->bitmap1_name)); | 198 | strncpy (toc->bitmap1_name, data + 0x24, sizeof (toc->bitmap1_name)); |
| 199 | toc->bitmap1_name[sizeof (toc->bitmap1_name) - 1] = 0; | 199 | toc->bitmap1_name[sizeof (toc->bitmap1_name) - 1] = 0; |
| 200 | toc->bitmap1_start = BE64 (data + 0x2E); | 200 | toc->bitmap1_start = get_unaligned_be64(data + 0x2E); |
| 201 | toc->bitmap1_size = BE64 (data + 0x36); | 201 | toc->bitmap1_size = get_unaligned_be64(data + 0x36); |
| 202 | 202 | ||
| 203 | if (strncmp (toc->bitmap1_name, TOC_BITMAP1, | 203 | if (strncmp (toc->bitmap1_name, TOC_BITMAP1, |
| 204 | sizeof (toc->bitmap1_name)) != 0) { | 204 | sizeof (toc->bitmap1_name)) != 0) { |
| @@ -208,8 +208,8 @@ static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc) | |||
| 208 | } | 208 | } |
| 209 | strncpy (toc->bitmap2_name, data + 0x46, sizeof (toc->bitmap2_name)); | 209 | strncpy (toc->bitmap2_name, data + 0x46, sizeof (toc->bitmap2_name)); |
| 210 | toc->bitmap2_name[sizeof (toc->bitmap2_name) - 1] = 0; | 210 | toc->bitmap2_name[sizeof (toc->bitmap2_name) - 1] = 0; |
| 211 | toc->bitmap2_start = BE64 (data + 0x50); | 211 | toc->bitmap2_start = get_unaligned_be64(data + 0x50); |
| 212 | toc->bitmap2_size = BE64 (data + 0x58); | 212 | toc->bitmap2_size = get_unaligned_be64(data + 0x58); |
| 213 | if (strncmp (toc->bitmap2_name, TOC_BITMAP2, | 213 | if (strncmp (toc->bitmap2_name, TOC_BITMAP2, |
| 214 | sizeof (toc->bitmap2_name)) != 0) { | 214 | sizeof (toc->bitmap2_name)) != 0) { |
| 215 | ldm_crit ("TOCBLOCK's second bitmap is '%s', should be '%s'.", | 215 | ldm_crit ("TOCBLOCK's second bitmap is '%s', should be '%s'.", |
| @@ -237,22 +237,22 @@ static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm) | |||
| 237 | { | 237 | { |
| 238 | BUG_ON (!data || !vm); | 238 | BUG_ON (!data || !vm); |
| 239 | 239 | ||
| 240 | if (MAGIC_VMDB != BE32 (data)) { | 240 | if (MAGIC_VMDB != get_unaligned_be32(data)) { |
| 241 | ldm_crit ("Cannot find the VMDB, database may be corrupt."); | 241 | ldm_crit ("Cannot find the VMDB, database may be corrupt."); |
| 242 | return false; | 242 | return false; |
| 243 | } | 243 | } |
| 244 | 244 | ||
| 245 | vm->ver_major = BE16 (data + 0x12); | 245 | vm->ver_major = get_unaligned_be16(data + 0x12); |
| 246 | vm->ver_minor = BE16 (data + 0x14); | 246 | vm->ver_minor = get_unaligned_be16(data + 0x14); |
| 247 | if ((vm->ver_major != 4) || (vm->ver_minor != 10)) { | 247 | if ((vm->ver_major != 4) || (vm->ver_minor != 10)) { |
| 248 | ldm_error ("Expected VMDB version %d.%d, got %d.%d. " | 248 | ldm_error ("Expected VMDB version %d.%d, got %d.%d. " |
| 249 | "Aborting.", 4, 10, vm->ver_major, vm->ver_minor); | 249 | "Aborting.", 4, 10, vm->ver_major, vm->ver_minor); |
| 250 | return false; | 250 | return false; |
| 251 | } | 251 | } |
| 252 | 252 | ||
| 253 | vm->vblk_size = BE32 (data + 0x08); | 253 | vm->vblk_size = get_unaligned_be32(data + 0x08); |
| 254 | vm->vblk_offset = BE32 (data + 0x0C); | 254 | vm->vblk_offset = get_unaligned_be32(data + 0x0C); |
| 255 | vm->last_vblk_seq = BE32 (data + 0x04); | 255 | vm->last_vblk_seq = get_unaligned_be32(data + 0x04); |
| 256 | 256 | ||
| 257 | ldm_debug ("Parsed VMDB successfully."); | 257 | ldm_debug ("Parsed VMDB successfully."); |
| 258 | return true; | 258 | return true; |
| @@ -507,7 +507,7 @@ static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base, | |||
| 507 | goto out; /* Already logged */ | 507 | goto out; /* Already logged */ |
| 508 | 508 | ||
| 509 | /* Are there uncommitted transactions? */ | 509 | /* Are there uncommitted transactions? */ |
| 510 | if (BE16(data + 0x10) != 0x01) { | 510 | if (get_unaligned_be16(data + 0x10) != 0x01) { |
| 511 | ldm_crit ("Database is not in a consistent state. Aborting."); | 511 | ldm_crit ("Database is not in a consistent state. Aborting."); |
| 512 | goto out; | 512 | goto out; |
| 513 | } | 513 | } |
| @@ -802,7 +802,7 @@ static bool ldm_parse_cmp3 (const u8 *buffer, int buflen, struct vblk *vb) | |||
| 802 | return false; | 802 | return false; |
| 803 | 803 | ||
| 804 | len += VBLK_SIZE_CMP3; | 804 | len += VBLK_SIZE_CMP3; |
| 805 | if (len != BE32 (buffer + 0x14)) | 805 | if (len != get_unaligned_be32(buffer + 0x14)) |
| 806 | return false; | 806 | return false; |
| 807 | 807 | ||
| 808 | comp = &vb->vblk.comp; | 808 | comp = &vb->vblk.comp; |
| @@ -851,7 +851,7 @@ static int ldm_parse_dgr3 (const u8 *buffer, int buflen, struct vblk *vb) | |||
| 851 | return false; | 851 | return false; |
| 852 | 852 | ||
| 853 | len += VBLK_SIZE_DGR3; | 853 | len += VBLK_SIZE_DGR3; |
| 854 | if (len != BE32 (buffer + 0x14)) | 854 | if (len != get_unaligned_be32(buffer + 0x14)) |
| 855 | return false; | 855 | return false; |
| 856 | 856 | ||
| 857 | dgrp = &vb->vblk.dgrp; | 857 | dgrp = &vb->vblk.dgrp; |
| @@ -895,7 +895,7 @@ static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb) | |||
| 895 | return false; | 895 | return false; |
| 896 | 896 | ||
| 897 | len += VBLK_SIZE_DGR4; | 897 | len += VBLK_SIZE_DGR4; |
| 898 | if (len != BE32 (buffer + 0x14)) | 898 | if (len != get_unaligned_be32(buffer + 0x14)) |
| 899 | return false; | 899 | return false; |
| 900 | 900 | ||
| 901 | dgrp = &vb->vblk.dgrp; | 901 | dgrp = &vb->vblk.dgrp; |
| @@ -931,7 +931,7 @@ static bool ldm_parse_dsk3 (const u8 *buffer, int buflen, struct vblk *vb) | |||
| 931 | return false; | 931 | return false; |
| 932 | 932 | ||
| 933 | len += VBLK_SIZE_DSK3; | 933 | len += VBLK_SIZE_DSK3; |
| 934 | if (len != BE32 (buffer + 0x14)) | 934 | if (len != get_unaligned_be32(buffer + 0x14)) |
| 935 | return false; | 935 | return false; |
| 936 | 936 | ||
| 937 | disk = &vb->vblk.disk; | 937 | disk = &vb->vblk.disk; |
| @@ -968,7 +968,7 @@ static bool ldm_parse_dsk4 (const u8 *buffer, int buflen, struct vblk *vb) | |||
| 968 | return false; | 968 | return false; |
| 969 | 969 | ||
| 970 | len += VBLK_SIZE_DSK4; | 970 | len += VBLK_SIZE_DSK4; |
| 971 | if (len != BE32 (buffer + 0x14)) | 971 | if (len != get_unaligned_be32(buffer + 0x14)) |
| 972 | return false; | 972 | return false; |
| 973 | 973 | ||
| 974 | disk = &vb->vblk.disk; | 974 | disk = &vb->vblk.disk; |
| @@ -1034,14 +1034,14 @@ static bool ldm_parse_prt3(const u8 *buffer, int buflen, struct vblk *vb) | |||
| 1034 | return false; | 1034 | return false; |
| 1035 | } | 1035 | } |
| 1036 | len += VBLK_SIZE_PRT3; | 1036 | len += VBLK_SIZE_PRT3; |
| 1037 | if (len > BE32(buffer + 0x14)) { | 1037 | if (len > get_unaligned_be32(buffer + 0x14)) { |
| 1038 | ldm_error("len %d > BE32(buffer + 0x14) %d", len, | 1038 | ldm_error("len %d > BE32(buffer + 0x14) %d", len, |
| 1039 | BE32(buffer + 0x14)); | 1039 | get_unaligned_be32(buffer + 0x14)); |
| 1040 | return false; | 1040 | return false; |
| 1041 | } | 1041 | } |
| 1042 | part = &vb->vblk.part; | 1042 | part = &vb->vblk.part; |
| 1043 | part->start = BE64(buffer + 0x24 + r_name); | 1043 | part->start = get_unaligned_be64(buffer + 0x24 + r_name); |
| 1044 | part->volume_offset = BE64(buffer + 0x2C + r_name); | 1044 | part->volume_offset = get_unaligned_be64(buffer + 0x2C + r_name); |
| 1045 | part->size = ldm_get_vnum(buffer + 0x34 + r_name); | 1045 | part->size = ldm_get_vnum(buffer + 0x34 + r_name); |
| 1046 | part->parent_id = ldm_get_vnum(buffer + 0x34 + r_size); | 1046 | part->parent_id = ldm_get_vnum(buffer + 0x34 + r_size); |
| 1047 | part->disk_id = ldm_get_vnum(buffer + 0x34 + r_parent); | 1047 | part->disk_id = ldm_get_vnum(buffer + 0x34 + r_parent); |
| @@ -1139,9 +1139,9 @@ static bool ldm_parse_vol5(const u8 *buffer, int buflen, struct vblk *vb) | |||
| 1139 | return false; | 1139 | return false; |
| 1140 | } | 1140 | } |
| 1141 | len += VBLK_SIZE_VOL5; | 1141 | len += VBLK_SIZE_VOL5; |
| 1142 | if (len > BE32(buffer + 0x14)) { | 1142 | if (len > get_unaligned_be32(buffer + 0x14)) { |
| 1143 | ldm_error("len %d > BE32(buffer + 0x14) %d", len, | 1143 | ldm_error("len %d > BE32(buffer + 0x14) %d", len, |
| 1144 | BE32(buffer + 0x14)); | 1144 | get_unaligned_be32(buffer + 0x14)); |
| 1145 | return false; | 1145 | return false; |
| 1146 | } | 1146 | } |
| 1147 | volu = &vb->vblk.volu; | 1147 | volu = &vb->vblk.volu; |
| @@ -1294,9 +1294,9 @@ static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags) | |||
| 1294 | 1294 | ||
| 1295 | BUG_ON (!data || !frags); | 1295 | BUG_ON (!data || !frags); |
| 1296 | 1296 | ||
| 1297 | group = BE32 (data + 0x08); | 1297 | group = get_unaligned_be32(data + 0x08); |
| 1298 | rec = BE16 (data + 0x0C); | 1298 | rec = get_unaligned_be16(data + 0x0C); |
| 1299 | num = BE16 (data + 0x0E); | 1299 | num = get_unaligned_be16(data + 0x0E); |
| 1300 | if ((num < 1) || (num > 4)) { | 1300 | if ((num < 1) || (num > 4)) { |
| 1301 | ldm_error ("A VBLK claims to have %d parts.", num); | 1301 | ldm_error ("A VBLK claims to have %d parts.", num); |
| 1302 | return false; | 1302 | return false; |
| @@ -1425,12 +1425,12 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base, | |||
| 1425 | } | 1425 | } |
| 1426 | 1426 | ||
| 1427 | for (v = 0; v < perbuf; v++, data+=size) { /* For each vblk */ | 1427 | for (v = 0; v < perbuf; v++, data+=size) { /* For each vblk */ |
| 1428 | if (MAGIC_VBLK != BE32 (data)) { | 1428 | if (MAGIC_VBLK != get_unaligned_be32(data)) { |
| 1429 | ldm_error ("Expected to find a VBLK."); | 1429 | ldm_error ("Expected to find a VBLK."); |
| 1430 | goto out; | 1430 | goto out; |
| 1431 | } | 1431 | } |
| 1432 | 1432 | ||
| 1433 | recs = BE16 (data + 0x0E); /* Number of records */ | 1433 | recs = get_unaligned_be16(data + 0x0E); /* Number of records */ |
| 1434 | if (recs == 1) { | 1434 | if (recs == 1) { |
| 1435 | if (!ldm_ldmdb_add (data, size, ldb)) | 1435 | if (!ldm_ldmdb_add (data, size, ldb)) |
| 1436 | goto out; /* Already logged */ | 1436 | goto out; /* Already logged */ |
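The ldm.c hunks above, together with the header change that follows, replace the partition parser's private BE16/BE32/BE64 macros with the generic get_unaligned_be{16,32,64}() helpers; the semantics are unchanged, an alignment-safe load followed by a big-endian-to-CPU conversion. A minimal sketch of the pattern, with field offsets borrowed from the VBLK parsing above but the function itself purely illustrative:

    #include <linux/types.h>
    #include <linux/kernel.h>
    #include <asm/unaligned.h>

    /* Sketch: pull big-endian, possibly unaligned fields out of a raw buffer. */
    static void vblk_dump_example(const u8 *buffer)
    {
            u32 vblk_len   = get_unaligned_be32(buffer + 0x14); /* old BE32(buffer + 0x14) */
            u16 rec_count  = get_unaligned_be16(buffer + 0x0E); /* old BE16(...) */
            u64 part_start = get_unaligned_be64(buffer + 0x24); /* old BE64(...) */

            pr_debug("len=%u recs=%u start=%llu\n",
                     vblk_len, rec_count, (unsigned long long)part_start);
    }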
diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h index 80f63b5fdd9f..30e08e809c1d 100644 --- a/fs/partitions/ldm.h +++ b/fs/partitions/ldm.h | |||
| @@ -98,11 +98,6 @@ struct parsed_partitions; | |||
| 98 | #define TOC_BITMAP1 "config" /* Names of the two defined */ | 98 | #define TOC_BITMAP1 "config" /* Names of the two defined */ |
| 99 | #define TOC_BITMAP2 "log" /* bitmaps in the TOCBLOCK. */ | 99 | #define TOC_BITMAP2 "log" /* bitmaps in the TOCBLOCK. */ |
| 100 | 100 | ||
| 101 | /* Most numbers we deal with are big-endian and won't be aligned. */ | ||
| 102 | #define BE16(x) ((u16)be16_to_cpu(get_unaligned((__be16*)(x)))) | ||
| 103 | #define BE32(x) ((u32)be32_to_cpu(get_unaligned((__be32*)(x)))) | ||
| 104 | #define BE64(x) ((u64)be64_to_cpu(get_unaligned((__be64*)(x)))) | ||
| 105 | |||
| 106 | /* Borrowed from msdos.c */ | 101 | /* Borrowed from msdos.c */ |
| 107 | #define SYS_IND(p) (get_unaligned(&(p)->sys_ind)) | 102 | #define SYS_IND(p) (get_unaligned(&(p)->sys_ind)) |
| 108 | 103 | ||
diff --git a/fs/pipe.c b/fs/pipe.c --- a/fs/pipe.c +++ b/fs/pipe.c | |||
| @@ -777,45 +777,10 @@ pipe_rdwr_open(struct inode *inode, struct file *filp) | |||
| 777 | /* | 777 | /* |
| 778 | * The file_operations structs are not static because they | 778 | * The file_operations structs are not static because they |
| 779 | * are also used in linux/fs/fifo.c to do operations on FIFOs. | 779 | * are also used in linux/fs/fifo.c to do operations on FIFOs. |
| 780 | * | ||
| 781 | * Pipes reuse fifos' file_operations structs. | ||
| 780 | */ | 782 | */ |
| 781 | const struct file_operations read_fifo_fops = { | 783 | const struct file_operations read_pipefifo_fops = { |
| 782 | .llseek = no_llseek, | ||
| 783 | .read = do_sync_read, | ||
| 784 | .aio_read = pipe_read, | ||
| 785 | .write = bad_pipe_w, | ||
| 786 | .poll = pipe_poll, | ||
| 787 | .unlocked_ioctl = pipe_ioctl, | ||
| 788 | .open = pipe_read_open, | ||
| 789 | .release = pipe_read_release, | ||
| 790 | .fasync = pipe_read_fasync, | ||
| 791 | }; | ||
| 792 | |||
| 793 | const struct file_operations write_fifo_fops = { | ||
| 794 | .llseek = no_llseek, | ||
| 795 | .read = bad_pipe_r, | ||
| 796 | .write = do_sync_write, | ||
| 797 | .aio_write = pipe_write, | ||
| 798 | .poll = pipe_poll, | ||
| 799 | .unlocked_ioctl = pipe_ioctl, | ||
| 800 | .open = pipe_write_open, | ||
| 801 | .release = pipe_write_release, | ||
| 802 | .fasync = pipe_write_fasync, | ||
| 803 | }; | ||
| 804 | |||
| 805 | const struct file_operations rdwr_fifo_fops = { | ||
| 806 | .llseek = no_llseek, | ||
| 807 | .read = do_sync_read, | ||
| 808 | .aio_read = pipe_read, | ||
| 809 | .write = do_sync_write, | ||
| 810 | .aio_write = pipe_write, | ||
| 811 | .poll = pipe_poll, | ||
| 812 | .unlocked_ioctl = pipe_ioctl, | ||
| 813 | .open = pipe_rdwr_open, | ||
| 814 | .release = pipe_rdwr_release, | ||
| 815 | .fasync = pipe_rdwr_fasync, | ||
| 816 | }; | ||
| 817 | |||
| 818 | static const struct file_operations read_pipe_fops = { | ||
| 819 | .llseek = no_llseek, | 784 | .llseek = no_llseek, |
| 820 | .read = do_sync_read, | 785 | .read = do_sync_read, |
| 821 | .aio_read = pipe_read, | 786 | .aio_read = pipe_read, |
| @@ -827,7 +792,7 @@ static const struct file_operations read_pipe_fops = { | |||
| 827 | .fasync = pipe_read_fasync, | 792 | .fasync = pipe_read_fasync, |
| 828 | }; | 793 | }; |
| 829 | 794 | ||
| 830 | static const struct file_operations write_pipe_fops = { | 795 | const struct file_operations write_pipefifo_fops = { |
| 831 | .llseek = no_llseek, | 796 | .llseek = no_llseek, |
| 832 | .read = bad_pipe_r, | 797 | .read = bad_pipe_r, |
| 833 | .write = do_sync_write, | 798 | .write = do_sync_write, |
| @@ -839,7 +804,7 @@ static const struct file_operations write_pipe_fops = { | |||
| 839 | .fasync = pipe_write_fasync, | 804 | .fasync = pipe_write_fasync, |
| 840 | }; | 805 | }; |
| 841 | 806 | ||
| 842 | static const struct file_operations rdwr_pipe_fops = { | 807 | const struct file_operations rdwr_pipefifo_fops = { |
| 843 | .llseek = no_llseek, | 808 | .llseek = no_llseek, |
| 844 | .read = do_sync_read, | 809 | .read = do_sync_read, |
| 845 | .aio_read = pipe_read, | 810 | .aio_read = pipe_read, |
| @@ -927,7 +892,7 @@ static struct inode * get_pipe_inode(void) | |||
| 927 | inode->i_pipe = pipe; | 892 | inode->i_pipe = pipe; |
| 928 | 893 | ||
| 929 | pipe->readers = pipe->writers = 1; | 894 | pipe->readers = pipe->writers = 1; |
| 930 | inode->i_fop = &rdwr_pipe_fops; | 895 | inode->i_fop = &rdwr_pipefifo_fops; |
| 931 | 896 | ||
| 932 | /* | 897 | /* |
| 933 | * Mark the inode dirty from the very beginning, | 898 | * Mark the inode dirty from the very beginning, |
| @@ -978,7 +943,7 @@ struct file *create_write_pipe(int flags) | |||
| 978 | d_instantiate(dentry, inode); | 943 | d_instantiate(dentry, inode); |
| 979 | 944 | ||
| 980 | err = -ENFILE; | 945 | err = -ENFILE; |
| 981 | f = alloc_file(pipe_mnt, dentry, FMODE_WRITE, &write_pipe_fops); | 946 | f = alloc_file(pipe_mnt, dentry, FMODE_WRITE, &write_pipefifo_fops); |
| 982 | if (!f) | 947 | if (!f) |
| 983 | goto err_dentry; | 948 | goto err_dentry; |
| 984 | f->f_mapping = inode->i_mapping; | 949 | f->f_mapping = inode->i_mapping; |
| @@ -1020,7 +985,7 @@ struct file *create_read_pipe(struct file *wrf, int flags) | |||
| 1020 | 985 | ||
| 1021 | f->f_pos = 0; | 986 | f->f_pos = 0; |
| 1022 | f->f_flags = O_RDONLY | (flags & O_NONBLOCK); | 987 | f->f_flags = O_RDONLY | (flags & O_NONBLOCK); |
| 1023 | f->f_op = &read_pipe_fops; | 988 | f->f_op = &read_pipefifo_fops; |
| 1024 | f->f_mode = FMODE_READ; | 989 | f->f_mode = FMODE_READ; |
| 1025 | f->f_version = 0; | 990 | f->f_version = 0; |
| 1026 | 991 | ||
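fs/pipe.c previously carried two parallel sets of file_operations, the exported *_fifo_fops used by fs/fifo.c and the static *_pipe_fops used for anonymous pipes, that differed only in name. The hunks above fold them into the shared read/write/rdwr_pipefifo_fops. A hedged sketch of how a FIFO open path can now select one of the shared structs; the real selection logic lives in fs/fifo.c and is not part of this section, so the function below is illustrative only:

    #include <linux/fs.h>

    /* Sketch only: choose among the now-shared pipe/FIFO file_operations
     * based on the open mode.  fs/fifo.c also handles reader/writer
     * counting and blocking, which is omitted here. */
    static int fifo_pick_fops(struct file *filp)
    {
            switch (filp->f_mode & (FMODE_READ | FMODE_WRITE)) {
            case FMODE_READ:
                    filp->f_op = &read_pipefifo_fops;
                    break;
            case FMODE_WRITE:
                    filp->f_op = &write_pipefifo_fops;
                    break;
            case FMODE_READ | FMODE_WRITE:
                    filp->f_op = &rdwr_pipefifo_fops;
                    break;
            default:
                    return -EINVAL;
            }
            return 0;
    }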
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig new file mode 100644 index 000000000000..73cd7a418f06 --- /dev/null +++ b/fs/proc/Kconfig | |||
| @@ -0,0 +1,59 @@ | |||
| 1 | config PROC_FS | ||
| 2 | bool "/proc file system support" if EMBEDDED | ||
| 3 | default y | ||
| 4 | help | ||
| 5 | This is a virtual file system providing information about the status | ||
| 6 | of the system. "Virtual" means that it doesn't take up any space on | ||
| 7 | your hard disk: the files are created on the fly by the kernel when | ||
| 8 | you try to access them. Also, you cannot read the files with older | ||
| 9 | version of the program less: you need to use more or cat. | ||
| 10 | |||
| 11 | It's totally cool; for example, "cat /proc/interrupts" gives | ||
| 12 | information about what the different IRQs are used for at the moment | ||
| 13 | (there is a small number of Interrupt ReQuest lines in your computer | ||
| 14 | that are used by the attached devices to gain the CPU's attention -- | ||
| 15 | often a source of trouble if two devices are mistakenly configured | ||
| 16 | to use the same IRQ). The program procinfo to display some | ||
| 17 | information about your system gathered from the /proc file system. | ||
| 18 | |||
| 19 | Before you can use the /proc file system, it has to be mounted, | ||
| 20 | meaning it has to be given a location in the directory hierarchy. | ||
| 21 | That location should be /proc. A command such as "mount -t proc proc | ||
| 22 | /proc" or the equivalent line in /etc/fstab does the job. | ||
| 23 | |||
| 24 | The /proc file system is explained in the file | ||
| 25 | <file:Documentation/filesystems/proc.txt> and on the proc(5) manpage | ||
| 26 | ("man 5 proc"). | ||
| 27 | |||
| 28 | This option will enlarge your kernel by about 67 KB. Several | ||
| 29 | programs depend on this, so everyone should say Y here. | ||
| 30 | |||
| 31 | config PROC_KCORE | ||
| 32 | bool "/proc/kcore support" if !ARM | ||
| 33 | depends on PROC_FS && MMU | ||
| 34 | |||
| 35 | config PROC_VMCORE | ||
| 36 | bool "/proc/vmcore support (EXPERIMENTAL)" | ||
| 37 | depends on PROC_FS && CRASH_DUMP | ||
| 38 | default y | ||
| 39 | help | ||
| 40 | Exports the dump image of crashed kernel in ELF format. | ||
| 41 | |||
| 42 | config PROC_SYSCTL | ||
| 43 | bool "Sysctl support (/proc/sys)" if EMBEDDED | ||
| 44 | depends on PROC_FS | ||
| 45 | select SYSCTL | ||
| 46 | default y | ||
| 47 | ---help--- | ||
| 48 | The sysctl interface provides a means of dynamically changing | ||
| 49 | certain kernel parameters and variables on the fly without requiring | ||
| 50 | a recompile of the kernel or reboot of the system. The primary | ||
| 51 | interface is through /proc/sys. If you say Y here a tree of | ||
| 52 | modifiable sysctl entries will be generated beneath the | ||
| 53 | /proc/sys directory. They are explained in the files | ||
| 54 | in <file:Documentation/sysctl/>. Note that enabling this | ||
| 55 | option will enlarge the kernel by at least 8 KB. | ||
| 56 | |||
| 57 | As it is generally a good thing, you should say Y here unless | ||
| 58 | building a kernel for install/rescue disks or your system is very | ||
| 59 | limited in memory. | ||
diff --git a/fs/proc/array.c b/fs/proc/array.c index 797d775e0354..71c9be59c9c2 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c | |||
| @@ -80,6 +80,7 @@ | |||
| 80 | #include <linux/delayacct.h> | 80 | #include <linux/delayacct.h> |
| 81 | #include <linux/seq_file.h> | 81 | #include <linux/seq_file.h> |
| 82 | #include <linux/pid_namespace.h> | 82 | #include <linux/pid_namespace.h> |
| 83 | #include <linux/tracehook.h> | ||
| 83 | 84 | ||
| 84 | #include <asm/pgtable.h> | 85 | #include <asm/pgtable.h> |
| 85 | #include <asm/processor.h> | 86 | #include <asm/processor.h> |
| @@ -168,8 +169,12 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, | |||
| 168 | rcu_read_lock(); | 169 | rcu_read_lock(); |
| 169 | ppid = pid_alive(p) ? | 170 | ppid = pid_alive(p) ? |
| 170 | task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0; | 171 | task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0; |
| 171 | tpid = pid_alive(p) && p->ptrace ? | 172 | tpid = 0; |
| 172 | task_pid_nr_ns(rcu_dereference(p->parent), ns) : 0; | 173 | if (pid_alive(p)) { |
| 174 | struct task_struct *tracer = tracehook_tracer_task(p); | ||
| 175 | if (tracer) | ||
| 176 | tpid = task_pid_nr_ns(tracer, ns); | ||
| 177 | } | ||
| 173 | seq_printf(m, | 178 | seq_printf(m, |
| 174 | "State:\t%s\n" | 179 | "State:\t%s\n" |
| 175 | "Tgid:\t%d\n" | 180 | "Tgid:\t%d\n" |
| @@ -332,65 +337,6 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, | |||
| 332 | return 0; | 337 | return 0; |
| 333 | } | 338 | } |
| 334 | 339 | ||
| 335 | /* | ||
| 336 | * Use precise platform statistics if available: | ||
| 337 | */ | ||
| 338 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | ||
| 339 | static cputime_t task_utime(struct task_struct *p) | ||
| 340 | { | ||
| 341 | return p->utime; | ||
| 342 | } | ||
| 343 | |||
| 344 | static cputime_t task_stime(struct task_struct *p) | ||
| 345 | { | ||
| 346 | return p->stime; | ||
| 347 | } | ||
| 348 | #else | ||
| 349 | static cputime_t task_utime(struct task_struct *p) | ||
| 350 | { | ||
| 351 | clock_t utime = cputime_to_clock_t(p->utime), | ||
| 352 | total = utime + cputime_to_clock_t(p->stime); | ||
| 353 | u64 temp; | ||
| 354 | |||
| 355 | /* | ||
| 356 | * Use CFS's precise accounting: | ||
| 357 | */ | ||
| 358 | temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); | ||
| 359 | |||
| 360 | if (total) { | ||
| 361 | temp *= utime; | ||
| 362 | do_div(temp, total); | ||
| 363 | } | ||
| 364 | utime = (clock_t)temp; | ||
| 365 | |||
| 366 | p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); | ||
| 367 | return p->prev_utime; | ||
| 368 | } | ||
| 369 | |||
| 370 | static cputime_t task_stime(struct task_struct *p) | ||
| 371 | { | ||
| 372 | clock_t stime; | ||
| 373 | |||
| 374 | /* | ||
| 375 | * Use CFS's precise accounting. (we subtract utime from | ||
| 376 | * the total, to make sure the total observed by userspace | ||
| 377 | * grows monotonically - apps rely on that): | ||
| 378 | */ | ||
| 379 | stime = nsec_to_clock_t(p->se.sum_exec_runtime) - | ||
| 380 | cputime_to_clock_t(task_utime(p)); | ||
| 381 | |||
| 382 | if (stime >= 0) | ||
| 383 | p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); | ||
| 384 | |||
| 385 | return p->prev_stime; | ||
| 386 | } | ||
| 387 | #endif | ||
| 388 | |||
| 389 | static cputime_t task_gtime(struct task_struct *p) | ||
| 390 | { | ||
| 391 | return p->gtime; | ||
| 392 | } | ||
| 393 | |||
| 394 | static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, | 340 | static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, |
| 395 | struct pid *pid, struct task_struct *task, int whole) | 341 | struct pid *pid, struct task_struct *task, int whole) |
| 396 | { | 342 | { |
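task_utime(), task_stime() and task_gtime() disappear from array.c here; the CFS-based scaling they performed is presumably provided by a shared helper elsewhere (that change is outside this section), so do_task_stat() no longer needs a private copy. The arithmetic the removed code implemented is worth keeping in mind: the tick-sampled utime/stime pair can drift from the scheduler's precise sum_exec_runtime, so the reported user time is the precise total prorated by the tick-based user share, and system time is the remainder, which keeps utime + stime monotonic for userspace. A sketch of that prorating, with inputs assumed already converted to clock ticks and the prev_utime/prev_stime clamping of the removed code elided:

    /* Illustrative only -- not the exact kernel code. */
    static unsigned long long scaled_utime(unsigned long long utime_ticks,
                                           unsigned long long stime_ticks,
                                           unsigned long long precise_ticks)
    {
            unsigned long long total = utime_ticks + stime_ticks;

            if (!total)
                    return precise_ticks;   /* nothing to prorate against */
            return precise_ticks * utime_ticks / total;
    }
    /* stime is then reported as precise_ticks - scaled_utime(...). */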
diff --git a/fs/proc/base.c b/fs/proc/base.c index 58c3e6a8e15e..a28840b11b89 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c | |||
| @@ -53,6 +53,7 @@ | |||
| 53 | #include <linux/time.h> | 53 | #include <linux/time.h> |
| 54 | #include <linux/proc_fs.h> | 54 | #include <linux/proc_fs.h> |
| 55 | #include <linux/stat.h> | 55 | #include <linux/stat.h> |
| 56 | #include <linux/task_io_accounting_ops.h> | ||
| 56 | #include <linux/init.h> | 57 | #include <linux/init.h> |
| 57 | #include <linux/capability.h> | 58 | #include <linux/capability.h> |
| 58 | #include <linux/file.h> | 59 | #include <linux/file.h> |
| @@ -69,6 +70,7 @@ | |||
| 69 | #include <linux/mount.h> | 70 | #include <linux/mount.h> |
| 70 | #include <linux/security.h> | 71 | #include <linux/security.h> |
| 71 | #include <linux/ptrace.h> | 72 | #include <linux/ptrace.h> |
| 73 | #include <linux/tracehook.h> | ||
| 72 | #include <linux/cgroup.h> | 74 | #include <linux/cgroup.h> |
| 73 | #include <linux/cpuset.h> | 75 | #include <linux/cpuset.h> |
| 74 | #include <linux/audit.h> | 76 | #include <linux/audit.h> |
| @@ -231,10 +233,14 @@ static int check_mem_permission(struct task_struct *task) | |||
| 231 | * If current is actively ptrace'ing, and would also be | 233 | * If current is actively ptrace'ing, and would also be |
| 232 | * permitted to freshly attach with ptrace now, permit it. | 234 | * permitted to freshly attach with ptrace now, permit it. |
| 233 | */ | 235 | */ |
| 234 | if (task->parent == current && (task->ptrace & PT_PTRACED) && | 236 | if (task_is_stopped_or_traced(task)) { |
| 235 | task_is_stopped_or_traced(task) && | 237 | int match; |
| 236 | ptrace_may_access(task, PTRACE_MODE_ATTACH)) | 238 | rcu_read_lock(); |
| 237 | return 0; | 239 | match = (tracehook_tracer_task(task) == current); |
| 240 | rcu_read_unlock(); | ||
| 241 | if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH)) | ||
| 242 | return 0; | ||
| 243 | } | ||
| 238 | 244 | ||
| 239 | /* | 245 | /* |
| 240 | * Noone else is allowed. | 246 | * Noone else is allowed. |
| @@ -504,6 +510,26 @@ static int proc_pid_limits(struct task_struct *task, char *buffer) | |||
| 504 | return count; | 510 | return count; |
| 505 | } | 511 | } |
| 506 | 512 | ||
| 513 | #ifdef CONFIG_HAVE_ARCH_TRACEHOOK | ||
| 514 | static int proc_pid_syscall(struct task_struct *task, char *buffer) | ||
| 515 | { | ||
| 516 | long nr; | ||
| 517 | unsigned long args[6], sp, pc; | ||
| 518 | |||
| 519 | if (task_current_syscall(task, &nr, args, 6, &sp, &pc)) | ||
| 520 | return sprintf(buffer, "running\n"); | ||
| 521 | |||
| 522 | if (nr < 0) | ||
| 523 | return sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc); | ||
| 524 | |||
| 525 | return sprintf(buffer, | ||
| 526 | "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n", | ||
| 527 | nr, | ||
| 528 | args[0], args[1], args[2], args[3], args[4], args[5], | ||
| 529 | sp, pc); | ||
| 530 | } | ||
| 531 | #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ | ||
| 532 | |||
| 507 | /************************************************************************/ | 533 | /************************************************************************/ |
| 508 | /* Here the fs part begins */ | 534 | /* Here the fs part begins */ |
| 509 | /************************************************************************/ | 535 | /************************************************************************/ |
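The new proc_pid_syscall() backs a /proc/PID/syscall file (hooked up to both the tgid and tid directories further down). It prints "running" when the target task is executing and its registers cannot be sampled, "nr sp pc" with a negative nr when the task is blocked outside a syscall, or the syscall number followed by the six argument registers, stack pointer and program counter. A small userspace sketch of reading it, illustrative only; it requires an architecture with CONFIG_HAVE_ARCH_TRACEHOOK and, given the S_IRUSR mode, permission on the target:

    /* Userspace sketch: dump this process's /proc/self/syscall line.
     * Reading your own entry usually just prints "running", since the
     * calling task is executing at sampling time; pointing the path at a
     * blocked process's PID shows the full syscall line instead. */
    #include <stdio.h>

    int main(void)
    {
            char line[256];
            FILE *f = fopen("/proc/self/syscall", "r");

            if (!f) {
                    perror("fopen");
                    return 1;
            }
            if (fgets(line, sizeof(line), f))
                    fputs(line, stdout);
            fclose(f);
            return 0;
    }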
| @@ -1834,8 +1860,7 @@ static const struct file_operations proc_fd_operations = { | |||
| 1834 | * /proc/pid/fd needs a special permission handler so that a process can still | 1860 | * /proc/pid/fd needs a special permission handler so that a process can still |
| 1835 | * access /proc/self/fd after it has executed a setuid(). | 1861 | * access /proc/self/fd after it has executed a setuid(). |
| 1836 | */ | 1862 | */ |
| 1837 | static int proc_fd_permission(struct inode *inode, int mask, | 1863 | static int proc_fd_permission(struct inode *inode, int mask) |
| 1838 | struct nameidata *nd) | ||
| 1839 | { | 1864 | { |
| 1840 | int rv; | 1865 | int rv; |
| 1841 | 1866 | ||
| @@ -2376,29 +2401,47 @@ static int proc_base_fill_cache(struct file *filp, void *dirent, | |||
| 2376 | } | 2401 | } |
| 2377 | 2402 | ||
| 2378 | #ifdef CONFIG_TASK_IO_ACCOUNTING | 2403 | #ifdef CONFIG_TASK_IO_ACCOUNTING |
| 2379 | static int proc_pid_io_accounting(struct task_struct *task, char *buffer) | 2404 | static int do_io_accounting(struct task_struct *task, char *buffer, int whole) |
| 2380 | { | 2405 | { |
| 2406 | struct task_io_accounting acct = task->ioac; | ||
| 2407 | unsigned long flags; | ||
| 2408 | |||
| 2409 | if (whole && lock_task_sighand(task, &flags)) { | ||
| 2410 | struct task_struct *t = task; | ||
| 2411 | |||
| 2412 | task_io_accounting_add(&acct, &task->signal->ioac); | ||
| 2413 | while_each_thread(task, t) | ||
| 2414 | task_io_accounting_add(&acct, &t->ioac); | ||
| 2415 | |||
| 2416 | unlock_task_sighand(task, &flags); | ||
| 2417 | } | ||
| 2381 | return sprintf(buffer, | 2418 | return sprintf(buffer, |
| 2382 | #ifdef CONFIG_TASK_XACCT | ||
| 2383 | "rchar: %llu\n" | 2419 | "rchar: %llu\n" |
| 2384 | "wchar: %llu\n" | 2420 | "wchar: %llu\n" |
| 2385 | "syscr: %llu\n" | 2421 | "syscr: %llu\n" |
| 2386 | "syscw: %llu\n" | 2422 | "syscw: %llu\n" |
| 2387 | #endif | ||
| 2388 | "read_bytes: %llu\n" | 2423 | "read_bytes: %llu\n" |
| 2389 | "write_bytes: %llu\n" | 2424 | "write_bytes: %llu\n" |
| 2390 | "cancelled_write_bytes: %llu\n", | 2425 | "cancelled_write_bytes: %llu\n", |
| 2391 | #ifdef CONFIG_TASK_XACCT | 2426 | (unsigned long long)acct.rchar, |
| 2392 | (unsigned long long)task->rchar, | 2427 | (unsigned long long)acct.wchar, |
| 2393 | (unsigned long long)task->wchar, | 2428 | (unsigned long long)acct.syscr, |
| 2394 | (unsigned long long)task->syscr, | 2429 | (unsigned long long)acct.syscw, |
| 2395 | (unsigned long long)task->syscw, | 2430 | (unsigned long long)acct.read_bytes, |
| 2396 | #endif | 2431 | (unsigned long long)acct.write_bytes, |
| 2397 | (unsigned long long)task->ioac.read_bytes, | 2432 | (unsigned long long)acct.cancelled_write_bytes); |
| 2398 | (unsigned long long)task->ioac.write_bytes, | 2433 | } |
| 2399 | (unsigned long long)task->ioac.cancelled_write_bytes); | 2434 | |
| 2435 | static int proc_tid_io_accounting(struct task_struct *task, char *buffer) | ||
| 2436 | { | ||
| 2437 | return do_io_accounting(task, buffer, 0); | ||
| 2400 | } | 2438 | } |
| 2401 | #endif | 2439 | |
| 2440 | static int proc_tgid_io_accounting(struct task_struct *task, char *buffer) | ||
| 2441 | { | ||
| 2442 | return do_io_accounting(task, buffer, 1); | ||
| 2443 | } | ||
| 2444 | #endif /* CONFIG_TASK_IO_ACCOUNTING */ | ||
| 2402 | 2445 | ||
| 2403 | /* | 2446 | /* |
| 2404 | * Thread groups | 2447 | * Thread groups |
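proc_pid_io_accounting() becomes do_io_accounting(), used both for the per-thread /proc/PID/task/TID/io file (whole=0, added to tid_base_stuff below) and for the whole-thread-group /proc/PID/io file (whole=1), which also folds in signal->ioac and every live thread's counters. The CONFIG_TASK_XACCT conditionals go away because the rchar/wchar/syscr/syscw counters now live in struct task_io_accounting alongside the block-I/O counters; how that struct is conditionally built is outside this section. task_io_accounting_add() is assumed here to be a plain field-by-field accumulation, roughly:

    /* Sketch of the accumulation assumed of task_io_accounting_add();
     * the struct and helper below are stand-ins, not the kernel's own. */
    struct io_counters {
            unsigned long long rchar, wchar, syscr, syscw;
            unsigned long long read_bytes, write_bytes, cancelled_write_bytes;
    };

    static void io_counters_add(struct io_counters *dst, const struct io_counters *src)
    {
            dst->rchar                 += src->rchar;
            dst->wchar                 += src->wchar;
            dst->syscr                 += src->syscr;
            dst->syscw                 += src->syscw;
            dst->read_bytes            += src->read_bytes;
            dst->write_bytes           += src->write_bytes;
            dst->cancelled_write_bytes += src->cancelled_write_bytes;
    }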
| @@ -2420,6 +2463,9 @@ static const struct pid_entry tgid_base_stuff[] = { | |||
| 2420 | #ifdef CONFIG_SCHED_DEBUG | 2463 | #ifdef CONFIG_SCHED_DEBUG |
| 2421 | REG("sched", S_IRUGO|S_IWUSR, pid_sched), | 2464 | REG("sched", S_IRUGO|S_IWUSR, pid_sched), |
| 2422 | #endif | 2465 | #endif |
| 2466 | #ifdef CONFIG_HAVE_ARCH_TRACEHOOK | ||
| 2467 | INF("syscall", S_IRUSR, pid_syscall), | ||
| 2468 | #endif | ||
| 2423 | INF("cmdline", S_IRUGO, pid_cmdline), | 2469 | INF("cmdline", S_IRUGO, pid_cmdline), |
| 2424 | ONE("stat", S_IRUGO, tgid_stat), | 2470 | ONE("stat", S_IRUGO, tgid_stat), |
| 2425 | ONE("statm", S_IRUGO, pid_statm), | 2471 | ONE("statm", S_IRUGO, pid_statm), |
| @@ -2470,7 +2516,7 @@ static const struct pid_entry tgid_base_stuff[] = { | |||
| 2470 | REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter), | 2516 | REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter), |
| 2471 | #endif | 2517 | #endif |
| 2472 | #ifdef CONFIG_TASK_IO_ACCOUNTING | 2518 | #ifdef CONFIG_TASK_IO_ACCOUNTING |
| 2473 | INF("io", S_IRUGO, pid_io_accounting), | 2519 | INF("io", S_IRUGO, tgid_io_accounting), |
| 2474 | #endif | 2520 | #endif |
| 2475 | }; | 2521 | }; |
| 2476 | 2522 | ||
| @@ -2752,6 +2798,9 @@ static const struct pid_entry tid_base_stuff[] = { | |||
| 2752 | #ifdef CONFIG_SCHED_DEBUG | 2798 | #ifdef CONFIG_SCHED_DEBUG |
| 2753 | REG("sched", S_IRUGO|S_IWUSR, pid_sched), | 2799 | REG("sched", S_IRUGO|S_IWUSR, pid_sched), |
| 2754 | #endif | 2800 | #endif |
| 2801 | #ifdef CONFIG_HAVE_ARCH_TRACEHOOK | ||
| 2802 | INF("syscall", S_IRUSR, pid_syscall), | ||
| 2803 | #endif | ||
| 2755 | INF("cmdline", S_IRUGO, pid_cmdline), | 2804 | INF("cmdline", S_IRUGO, pid_cmdline), |
| 2756 | ONE("stat", S_IRUGO, tid_stat), | 2805 | ONE("stat", S_IRUGO, tid_stat), |
| 2757 | ONE("statm", S_IRUGO, pid_statm), | 2806 | ONE("statm", S_IRUGO, pid_statm), |
| @@ -2797,6 +2846,9 @@ static const struct pid_entry tid_base_stuff[] = { | |||
| 2797 | #ifdef CONFIG_FAULT_INJECTION | 2846 | #ifdef CONFIG_FAULT_INJECTION |
| 2798 | REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject), | 2847 | REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject), |
| 2799 | #endif | 2848 | #endif |
| 2849 | #ifdef CONFIG_TASK_IO_ACCOUNTING | ||
| 2850 | INF("io", S_IRUGO, tid_io_accounting), | ||
| 2851 | #endif | ||
| 2800 | }; | 2852 | }; |
| 2801 | 2853 | ||
| 2802 | static int proc_tid_base_readdir(struct file * filp, | 2854 | static int proc_tid_base_readdir(struct file * filp, |
diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 43e54e86cefd..7821589a17d5 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c | |||
| @@ -300,10 +300,10 @@ out: | |||
| 300 | return rtn; | 300 | return rtn; |
| 301 | } | 301 | } |
| 302 | 302 | ||
| 303 | static DEFINE_IDR(proc_inum_idr); | 303 | static DEFINE_IDA(proc_inum_ida); |
| 304 | static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */ | 304 | static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */ |
| 305 | 305 | ||
| 306 | #define PROC_DYNAMIC_FIRST 0xF0000000UL | 306 | #define PROC_DYNAMIC_FIRST 0xF0000000U |
| 307 | 307 | ||
| 308 | /* | 308 | /* |
| 309 | * Return an inode number between PROC_DYNAMIC_FIRST and | 309 | * Return an inode number between PROC_DYNAMIC_FIRST and |
| @@ -311,36 +311,34 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */ | |||
| 311 | */ | 311 | */ |
| 312 | static unsigned int get_inode_number(void) | 312 | static unsigned int get_inode_number(void) |
| 313 | { | 313 | { |
| 314 | int i, inum = 0; | 314 | unsigned int i; |
| 315 | int error; | 315 | int error; |
| 316 | 316 | ||
| 317 | retry: | 317 | retry: |
| 318 | if (idr_pre_get(&proc_inum_idr, GFP_KERNEL) == 0) | 318 | if (ida_pre_get(&proc_inum_ida, GFP_KERNEL) == 0) |
| 319 | return 0; | 319 | return 0; |
| 320 | 320 | ||
| 321 | spin_lock(&proc_inum_lock); | 321 | spin_lock(&proc_inum_lock); |
| 322 | error = idr_get_new(&proc_inum_idr, NULL, &i); | 322 | error = ida_get_new(&proc_inum_ida, &i); |
| 323 | spin_unlock(&proc_inum_lock); | 323 | spin_unlock(&proc_inum_lock); |
| 324 | if (error == -EAGAIN) | 324 | if (error == -EAGAIN) |
| 325 | goto retry; | 325 | goto retry; |
| 326 | else if (error) | 326 | else if (error) |
| 327 | return 0; | 327 | return 0; |
| 328 | 328 | ||
| 329 | inum = (i & MAX_ID_MASK) + PROC_DYNAMIC_FIRST; | 329 | if (i > UINT_MAX - PROC_DYNAMIC_FIRST) { |
| 330 | 330 | spin_lock(&proc_inum_lock); | |
| 331 | /* inum will never be more than 0xf0ffffff, so no check | 331 | ida_remove(&proc_inum_ida, i); |
| 332 | * for overflow. | 332 | spin_unlock(&proc_inum_lock); |
| 333 | */ | 333 | return 0; |
| 334 | 334 | } | |
| 335 | return inum; | 335 | return PROC_DYNAMIC_FIRST + i; |
| 336 | } | 336 | } |
| 337 | 337 | ||
| 338 | static void release_inode_number(unsigned int inum) | 338 | static void release_inode_number(unsigned int inum) |
| 339 | { | 339 | { |
| 340 | int id = (inum - PROC_DYNAMIC_FIRST) | ~MAX_ID_MASK; | ||
| 341 | |||
| 342 | spin_lock(&proc_inum_lock); | 340 | spin_lock(&proc_inum_lock); |
| 343 | idr_remove(&proc_inum_idr, id); | 341 | ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); |
| 344 | spin_unlock(&proc_inum_lock); | 342 | spin_unlock(&proc_inum_lock); |
| 345 | } | 343 | } |
| 346 | 344 | ||
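The inode-number allocator switches from an IDR to an IDA: procfs only needs unique numbers, not an ID-to-pointer map, and the old MAX_ID_MASK arithmetic is replaced by an explicit overflow check against UINT_MAX before adding PROC_DYNAMIC_FIRST. The allocate/release pairing follows the usual pre-allocating IDA pattern; a condensed sketch without the spinlock that procfs needs for concurrent callers:

    #include <linux/idr.h>
    #include <linux/gfp.h>

    static DEFINE_IDA(example_ida);

    /* Sketch: reserve a small unique id, later give it back.  Mirrors the
     * get_inode_number()/release_inode_number() pairing above. */
    static int example_get_id(void)
    {
            int id, err;

            do {
                    if (!ida_pre_get(&example_ida, GFP_KERNEL))
                            return -ENOMEM;
                    err = ida_get_new(&example_ida, &id);
            } while (err == -EAGAIN);

            return err ? err : id;
    }

    static void example_put_id(int id)
    {
            ida_remove(&example_ida, id);
    }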
| @@ -549,8 +547,8 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp | |||
| 549 | 547 | ||
| 550 | for (tmp = dir->subdir; tmp; tmp = tmp->next) | 548 | for (tmp = dir->subdir; tmp; tmp = tmp->next) |
| 551 | if (strcmp(tmp->name, dp->name) == 0) { | 549 | if (strcmp(tmp->name, dp->name) == 0) { |
| 552 | printk(KERN_WARNING "proc_dir_entry '%s' already " | 550 | printk(KERN_WARNING "proc_dir_entry '%s/%s' already registered\n", |
| 553 | "registered\n", dp->name); | 551 | dir->name, dp->name); |
| 554 | dump_stack(); | 552 | dump_stack(); |
| 555 | break; | 553 | break; |
| 556 | } | 554 | } |
| @@ -597,6 +595,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, | |||
| 597 | ent->pde_users = 0; | 595 | ent->pde_users = 0; |
| 598 | spin_lock_init(&ent->pde_unload_lock); | 596 | spin_lock_init(&ent->pde_unload_lock); |
| 599 | ent->pde_unload_completion = NULL; | 597 | ent->pde_unload_completion = NULL; |
| 598 | INIT_LIST_HEAD(&ent->pde_openers); | ||
| 600 | out: | 599 | out: |
| 601 | return ent; | 600 | return ent; |
| 602 | } | 601 | } |
| @@ -789,15 +788,25 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) | |||
| 789 | spin_unlock(&de->pde_unload_lock); | 788 | spin_unlock(&de->pde_unload_lock); |
| 790 | 789 | ||
| 791 | continue_removing: | 790 | continue_removing: |
| 791 | spin_lock(&de->pde_unload_lock); | ||
| 792 | while (!list_empty(&de->pde_openers)) { | ||
| 793 | struct pde_opener *pdeo; | ||
| 794 | |||
| 795 | pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); | ||
| 796 | list_del(&pdeo->lh); | ||
| 797 | spin_unlock(&de->pde_unload_lock); | ||
| 798 | pdeo->release(pdeo->inode, pdeo->file); | ||
| 799 | kfree(pdeo); | ||
| 800 | spin_lock(&de->pde_unload_lock); | ||
| 801 | } | ||
| 802 | spin_unlock(&de->pde_unload_lock); | ||
| 803 | |||
| 792 | if (S_ISDIR(de->mode)) | 804 | if (S_ISDIR(de->mode)) |
| 793 | parent->nlink--; | 805 | parent->nlink--; |
| 794 | de->nlink = 0; | 806 | de->nlink = 0; |
| 795 | if (de->subdir) { | 807 | WARN(de->subdir, KERN_WARNING "%s: removing non-empty directory " |
| 796 | printk(KERN_WARNING "%s: removing non-empty directory " | ||
| 797 | "'%s/%s', leaking at least '%s'\n", __func__, | 808 | "'%s/%s', leaking at least '%s'\n", __func__, |
| 798 | de->parent->name, de->name, de->subdir->name); | 809 | de->parent->name, de->name, de->subdir->name); |
| 799 | WARN_ON(1); | ||
| 800 | } | ||
| 801 | if (atomic_dec_and_test(&de->count)) | 810 | if (atomic_dec_and_test(&de->count)) |
| 802 | free_proc_entry(de); | 811 | free_proc_entry(de); |
| 803 | } | 812 | } |
diff --git a/fs/proc/inode.c b/fs/proc/inode.c index b08d10017911..8bb03f056c28 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c | |||
| @@ -17,6 +17,7 @@ | |||
| 17 | #include <linux/init.h> | 17 | #include <linux/init.h> |
| 18 | #include <linux/module.h> | 18 | #include <linux/module.h> |
| 19 | #include <linux/smp_lock.h> | 19 | #include <linux/smp_lock.h> |
| 20 | #include <linux/sysctl.h> | ||
| 20 | 21 | ||
| 21 | #include <asm/system.h> | 22 | #include <asm/system.h> |
| 22 | #include <asm/uaccess.h> | 23 | #include <asm/uaccess.h> |
| @@ -65,6 +66,8 @@ static void proc_delete_inode(struct inode *inode) | |||
| 65 | module_put(de->owner); | 66 | module_put(de->owner); |
| 66 | de_put(de); | 67 | de_put(de); |
| 67 | } | 68 | } |
| 69 | if (PROC_I(inode)->sysctl) | ||
| 70 | sysctl_head_put(PROC_I(inode)->sysctl); | ||
| 68 | clear_inode(inode); | 71 | clear_inode(inode); |
| 69 | } | 72 | } |
| 70 | 73 | ||
| @@ -84,6 +87,8 @@ static struct inode *proc_alloc_inode(struct super_block *sb) | |||
| 84 | ei->fd = 0; | 87 | ei->fd = 0; |
| 85 | ei->op.proc_get_link = NULL; | 88 | ei->op.proc_get_link = NULL; |
| 86 | ei->pde = NULL; | 89 | ei->pde = NULL; |
| 90 | ei->sysctl = NULL; | ||
| 91 | ei->sysctl_entry = NULL; | ||
| 87 | inode = &ei->vfs_inode; | 92 | inode = &ei->vfs_inode; |
| 88 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; | 93 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; |
| 89 | return inode; | 94 | return inode; |
| @@ -94,7 +99,7 @@ static void proc_destroy_inode(struct inode *inode) | |||
| 94 | kmem_cache_free(proc_inode_cachep, PROC_I(inode)); | 99 | kmem_cache_free(proc_inode_cachep, PROC_I(inode)); |
| 95 | } | 100 | } |
| 96 | 101 | ||
| 97 | static void init_once(struct kmem_cache * cachep, void *foo) | 102 | static void init_once(void *foo) |
| 98 | { | 103 | { |
| 99 | struct proc_inode *ei = (struct proc_inode *) foo; | 104 | struct proc_inode *ei = (struct proc_inode *) foo; |
| 100 | 105 | ||
| @@ -111,27 +116,25 @@ int __init proc_init_inodecache(void) | |||
| 111 | return 0; | 116 | return 0; |
| 112 | } | 117 | } |
| 113 | 118 | ||
| 114 | static int proc_remount(struct super_block *sb, int *flags, char *data) | ||
| 115 | { | ||
| 116 | *flags |= MS_NODIRATIME; | ||
| 117 | return 0; | ||
| 118 | } | ||
| 119 | |||
| 120 | static const struct super_operations proc_sops = { | 119 | static const struct super_operations proc_sops = { |
| 121 | .alloc_inode = proc_alloc_inode, | 120 | .alloc_inode = proc_alloc_inode, |
| 122 | .destroy_inode = proc_destroy_inode, | 121 | .destroy_inode = proc_destroy_inode, |
| 123 | .drop_inode = generic_delete_inode, | 122 | .drop_inode = generic_delete_inode, |
| 124 | .delete_inode = proc_delete_inode, | 123 | .delete_inode = proc_delete_inode, |
| 125 | .statfs = simple_statfs, | 124 | .statfs = simple_statfs, |
| 126 | .remount_fs = proc_remount, | ||
| 127 | }; | 125 | }; |
| 128 | 126 | ||
| 129 | static void pde_users_dec(struct proc_dir_entry *pde) | 127 | static void __pde_users_dec(struct proc_dir_entry *pde) |
| 130 | { | 128 | { |
| 131 | spin_lock(&pde->pde_unload_lock); | ||
| 132 | pde->pde_users--; | 129 | pde->pde_users--; |
| 133 | if (pde->pde_unload_completion && pde->pde_users == 0) | 130 | if (pde->pde_unload_completion && pde->pde_users == 0) |
| 134 | complete(pde->pde_unload_completion); | 131 | complete(pde->pde_unload_completion); |
| 132 | } | ||
| 133 | |||
| 134 | static void pde_users_dec(struct proc_dir_entry *pde) | ||
| 135 | { | ||
| 136 | spin_lock(&pde->pde_unload_lock); | ||
| 137 | __pde_users_dec(pde); | ||
| 135 | spin_unlock(&pde->pde_unload_lock); | 138 | spin_unlock(&pde->pde_unload_lock); |
| 136 | } | 139 | } |
| 137 | 140 | ||
| @@ -318,36 +321,97 @@ static int proc_reg_open(struct inode *inode, struct file *file) | |||
| 318 | struct proc_dir_entry *pde = PDE(inode); | 321 | struct proc_dir_entry *pde = PDE(inode); |
| 319 | int rv = 0; | 322 | int rv = 0; |
| 320 | int (*open)(struct inode *, struct file *); | 323 | int (*open)(struct inode *, struct file *); |
| 324 | int (*release)(struct inode *, struct file *); | ||
| 325 | struct pde_opener *pdeo; | ||
| 326 | |||
| 327 | /* | ||
| 328 | * What for, you ask? Well, we can have open, rmmod, remove_proc_entry | ||
| 329 | * sequence. ->release won't be called because ->proc_fops will be | ||
| 330 | * cleared. Depending on complexity of ->release, consequences vary. | ||
| 331 | * | ||
| 332 | * We can't wait for mercy when close will be done for real, it's | ||
| 333 | * deadlockable: rmmod foo </proc/foo . So, we're going to do ->release | ||
| 334 | * by hand in remove_proc_entry(). For this, save opener's credentials | ||
| 335 | * for later. | ||
| 336 | */ | ||
| 337 | pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL); | ||
| 338 | if (!pdeo) | ||
| 339 | return -ENOMEM; | ||
| 321 | 340 | ||
| 322 | spin_lock(&pde->pde_unload_lock); | 341 | spin_lock(&pde->pde_unload_lock); |
| 323 | if (!pde->proc_fops) { | 342 | if (!pde->proc_fops) { |
| 324 | spin_unlock(&pde->pde_unload_lock); | 343 | spin_unlock(&pde->pde_unload_lock); |
| 344 | kfree(pdeo); | ||
| 325 | return rv; | 345 | return rv; |
| 326 | } | 346 | } |
| 327 | pde->pde_users++; | 347 | pde->pde_users++; |
| 328 | open = pde->proc_fops->open; | 348 | open = pde->proc_fops->open; |
| 349 | release = pde->proc_fops->release; | ||
| 329 | spin_unlock(&pde->pde_unload_lock); | 350 | spin_unlock(&pde->pde_unload_lock); |
| 330 | 351 | ||
| 331 | if (open) | 352 | if (open) |
| 332 | rv = open(inode, file); | 353 | rv = open(inode, file); |
| 333 | 354 | ||
| 334 | pde_users_dec(pde); | 355 | spin_lock(&pde->pde_unload_lock); |
| 356 | if (rv == 0 && release) { | ||
| 357 | /* To know what to release. */ | ||
| 358 | pdeo->inode = inode; | ||
| 359 | pdeo->file = file; | ||
| 360 | /* Strictly for "too late" ->release in proc_reg_release(). */ | ||
| 361 | pdeo->release = release; | ||
| 362 | list_add(&pdeo->lh, &pde->pde_openers); | ||
| 363 | } else | ||
| 364 | kfree(pdeo); | ||
| 365 | __pde_users_dec(pde); | ||
| 366 | spin_unlock(&pde->pde_unload_lock); | ||
| 335 | return rv; | 367 | return rv; |
| 336 | } | 368 | } |
| 337 | 369 | ||
| 370 | static struct pde_opener *find_pde_opener(struct proc_dir_entry *pde, | ||
| 371 | struct inode *inode, struct file *file) | ||
| 372 | { | ||
| 373 | struct pde_opener *pdeo; | ||
| 374 | |||
| 375 | list_for_each_entry(pdeo, &pde->pde_openers, lh) { | ||
| 376 | if (pdeo->inode == inode && pdeo->file == file) | ||
| 377 | return pdeo; | ||
| 378 | } | ||
| 379 | return NULL; | ||
| 380 | } | ||
| 381 | |||
| 338 | static int proc_reg_release(struct inode *inode, struct file *file) | 382 | static int proc_reg_release(struct inode *inode, struct file *file) |
| 339 | { | 383 | { |
| 340 | struct proc_dir_entry *pde = PDE(inode); | 384 | struct proc_dir_entry *pde = PDE(inode); |
| 341 | int rv = 0; | 385 | int rv = 0; |
| 342 | int (*release)(struct inode *, struct file *); | 386 | int (*release)(struct inode *, struct file *); |
| 387 | struct pde_opener *pdeo; | ||
| 343 | 388 | ||
| 344 | spin_lock(&pde->pde_unload_lock); | 389 | spin_lock(&pde->pde_unload_lock); |
| 390 | pdeo = find_pde_opener(pde, inode, file); | ||
| 345 | if (!pde->proc_fops) { | 391 | if (!pde->proc_fops) { |
| 346 | spin_unlock(&pde->pde_unload_lock); | 392 | /* |
| 393 | * Can't simply exit, __fput() will think that everything is OK, | ||
| 394 | * and move on to freeing struct file. remove_proc_entry() will | ||
| 395 | * find slacker in opener's list and will try to do non-trivial | ||
| 396 | * things with struct file. Therefore, remove opener from list. | ||
| 397 | * | ||
| 398 | * But if opener is removed from list, who will ->release it? | ||
| 399 | */ | ||
| 400 | if (pdeo) { | ||
| 401 | list_del(&pdeo->lh); | ||
| 402 | spin_unlock(&pde->pde_unload_lock); | ||
| 403 | rv = pdeo->release(inode, file); | ||
| 404 | kfree(pdeo); | ||
| 405 | } else | ||
| 406 | spin_unlock(&pde->pde_unload_lock); | ||
| 347 | return rv; | 407 | return rv; |
| 348 | } | 408 | } |
| 349 | pde->pde_users++; | 409 | pde->pde_users++; |
| 350 | release = pde->proc_fops->release; | 410 | release = pde->proc_fops->release; |
| 411 | if (pdeo) { | ||
| 412 | list_del(&pdeo->lh); | ||
| 413 | kfree(pdeo); | ||
| 414 | } | ||
| 351 | spin_unlock(&pde->pde_unload_lock); | 415 | spin_unlock(&pde->pde_unload_lock); |
| 352 | 416 | ||
| 353 | if (release) | 417 | if (release) |
diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 28cbca805905..442202314d53 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h | |||
| @@ -63,6 +63,7 @@ extern const struct file_operations proc_smaps_operations; | |||
| 63 | extern const struct file_operations proc_clear_refs_operations; | 63 | extern const struct file_operations proc_clear_refs_operations; |
| 64 | extern const struct file_operations proc_pagemap_operations; | 64 | extern const struct file_operations proc_pagemap_operations; |
| 65 | extern const struct file_operations proc_net_operations; | 65 | extern const struct file_operations proc_net_operations; |
| 66 | extern const struct file_operations proc_kmsg_operations; | ||
| 66 | extern const struct inode_operations proc_net_inode_operations; | 67 | extern const struct inode_operations proc_net_inode_operations; |
| 67 | 68 | ||
| 68 | void free_proc_entry(struct proc_dir_entry *de); | 69 | void free_proc_entry(struct proc_dir_entry *de); |
| @@ -88,3 +89,10 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino, | |||
| 88 | struct dentry *dentry); | 89 | struct dentry *dentry); |
| 89 | int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, | 90 | int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, |
| 90 | filldir_t filldir); | 91 | filldir_t filldir); |
| 92 | |||
| 93 | struct pde_opener { | ||
| 94 | struct inode *inode; | ||
| 95 | struct file *file; | ||
| 96 | int (*release)(struct inode *, struct file *); | ||
| 97 | struct list_head lh; | ||
| 98 | }; | ||
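struct pde_opener is the bookkeeping behind the new logic in proc_reg_open()/proc_reg_release() and remove_proc_entry() above: each successful open of an entry that has a ->release hook is recorded on pde->pde_openers, and whichever side runs last, the normal close path or removal of the entry while files are still open, is the one that calls ->release exactly once and frees the record. The two orderings, sketched as comments (illustrative, not kernel code):

    /*
     *  Normal lifetime:
     *    open()  -> proc_reg_open() links a pde_opener onto pde->pde_openers
     *    close() -> proc_reg_release() unlinks it and calls ->release()
     *
     *  Entry removed while still open (e.g. rmmod with the file held):
     *    open()              -> pde_opener linked as above
     *    remove_proc_entry() -> drains pde_openers, calling ->release() itself
     *    close()             -> proc_reg_release() sees proc_fops cleared and
     *                           no matching pde_opener, so ->release() is not
     *                           called a second time
     */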
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index e78c81fcf547..c2370c76fb71 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c | |||
| @@ -23,6 +23,10 @@ | |||
| 23 | 23 | ||
| 24 | #define CORE_STR "CORE" | 24 | #define CORE_STR "CORE" |
| 25 | 25 | ||
| 26 | #ifndef ELF_CORE_EFLAGS | ||
| 27 | #define ELF_CORE_EFLAGS 0 | ||
| 28 | #endif | ||
| 29 | |||
| 26 | static int open_kcore(struct inode * inode, struct file * filp) | 30 | static int open_kcore(struct inode * inode, struct file * filp) |
| 27 | { | 31 | { |
| 28 | return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; | 32 | return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; |
| @@ -164,11 +168,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff) | |||
| 164 | elf->e_entry = 0; | 168 | elf->e_entry = 0; |
| 165 | elf->e_phoff = sizeof(struct elfhdr); | 169 | elf->e_phoff = sizeof(struct elfhdr); |
| 166 | elf->e_shoff = 0; | 170 | elf->e_shoff = 0; |
| 167 | #if defined(CONFIG_H8300) | 171 | elf->e_flags = ELF_CORE_EFLAGS; |
| 168 | elf->e_flags = ELF_FLAGS; | ||
| 169 | #else | ||
| 170 | elf->e_flags = 0; | ||
| 171 | #endif | ||
| 172 | elf->e_ehsize = sizeof(struct elfhdr); | 172 | elf->e_ehsize = sizeof(struct elfhdr); |
| 173 | elf->e_phentsize= sizeof(struct elf_phdr); | 173 | elf->e_phentsize= sizeof(struct elf_phdr); |
| 174 | elf->e_phnum = nphdr; | 174 | elf->e_phnum = nphdr; |
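The H8/300-only #ifdef in the kcore ELF header setup is replaced by a generic ELF_CORE_EFLAGS that defaults to 0; an architecture that needs non-zero e_flags in the /proc/kcore ELF header can now define the macro in its own headers. Purely as an illustration of the shape of such an override, with a placeholder flag name rather than any real architecture's value:

    /* In an architecture's <asm/elf.h> -- placeholder flag name: */
    #define ELF_CORE_EFLAGS  EF_EXAMPLE_CPU_ABI_FLAGS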
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index ff3b90b56e9d..9fd5df3f40ce 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c | |||
| @@ -15,6 +15,8 @@ | |||
| 15 | #include <asm/uaccess.h> | 15 | #include <asm/uaccess.h> |
| 16 | #include <asm/io.h> | 16 | #include <asm/io.h> |
| 17 | 17 | ||
| 18 | #include "internal.h" | ||
| 19 | |||
| 18 | extern wait_queue_head_t log_wait; | 20 | extern wait_queue_head_t log_wait; |
| 19 | 21 | ||
| 20 | extern int do_syslog(int type, char __user *bug, int count); | 22 | extern int do_syslog(int type, char __user *bug, int count); |
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 79ecd281d2cb..3f87d2632947 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c | |||
| @@ -52,14 +52,14 @@ int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) | |||
| 52 | } | 52 | } |
| 53 | 53 | ||
| 54 | seq_printf(m, | 54 | seq_printf(m, |
| 55 | "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", | 55 | "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", |
| 56 | vma->vm_start, | 56 | vma->vm_start, |
| 57 | vma->vm_end, | 57 | vma->vm_end, |
| 58 | flags & VM_READ ? 'r' : '-', | 58 | flags & VM_READ ? 'r' : '-', |
| 59 | flags & VM_WRITE ? 'w' : '-', | 59 | flags & VM_WRITE ? 'w' : '-', |
| 60 | flags & VM_EXEC ? 'x' : '-', | 60 | flags & VM_EXEC ? 'x' : '-', |
| 61 | flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', | 61 | flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', |
| 62 | vma->vm_pgoff << PAGE_SHIFT, | 62 | ((loff_t)vma->vm_pgoff) << PAGE_SHIFT, |
| 63 | MAJOR(dev), MINOR(dev), ino, &len); | 63 | MAJOR(dev), MINOR(dev), ino, &len); |
| 64 | 64 | ||
| 65 | if (file) { | 65 | if (file) { |
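The nommu maps format change fixes a truncation: vm_pgoff is an unsigned long page index, so vm_pgoff << PAGE_SHIFT is computed in unsigned long and wraps on 32-bit when the mapped file offset exceeds 4 GiB; casting to loff_t before the shift (and printing with %08llx) preserves the full byte offset. A standalone illustration of the difference, with PAGE_SHIFT assumed to be 12:

    #include <stdio.h>

    int main(void)
    {
            /* Page index for a 4 GiB file offset with 4 KiB pages. */
            unsigned long pgoff = 0x100000UL;

            unsigned long bad  = pgoff << 12;             /* wraps to 0 where long is 32-bit */
            long long     good = (long long)pgoff << 12;  /* 0x100000000 */

            printf("truncated: %08lx  widened: %08llx\n", bad, good);
            return 0;
    }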
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index ded969862960..29e20c6b1f7f 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c | |||
| @@ -24,6 +24,7 @@ | |||
| 24 | #include <linux/tty.h> | 24 | #include <linux/tty.h> |
| 25 | #include <linux/string.h> | 25 | #include <linux/string.h> |
| 26 | #include <linux/mman.h> | 26 | #include <linux/mman.h> |
| 27 | #include <linux/quicklist.h> | ||
| 27 | #include <linux/proc_fs.h> | 28 | #include <linux/proc_fs.h> |
| 28 | #include <linux/ioport.h> | 29 | #include <linux/ioport.h> |
| 29 | #include <linux/mm.h> | 30 | #include <linux/mm.h> |
| @@ -182,6 +183,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off, | |||
| 182 | "SReclaimable: %8lu kB\n" | 183 | "SReclaimable: %8lu kB\n" |
| 183 | "SUnreclaim: %8lu kB\n" | 184 | "SUnreclaim: %8lu kB\n" |
| 184 | "PageTables: %8lu kB\n" | 185 | "PageTables: %8lu kB\n" |
| 186 | #ifdef CONFIG_QUICKLIST | ||
| 187 | "Quicklists: %8lu kB\n" | ||
| 188 | #endif | ||
| 185 | "NFS_Unstable: %8lu kB\n" | 189 | "NFS_Unstable: %8lu kB\n" |
| 186 | "Bounce: %8lu kB\n" | 190 | "Bounce: %8lu kB\n" |
| 187 | "WritebackTmp: %8lu kB\n" | 191 | "WritebackTmp: %8lu kB\n" |
| @@ -214,6 +218,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off, | |||
| 214 | K(global_page_state(NR_SLAB_RECLAIMABLE)), | 218 | K(global_page_state(NR_SLAB_RECLAIMABLE)), |
| 215 | K(global_page_state(NR_SLAB_UNRECLAIMABLE)), | 219 | K(global_page_state(NR_SLAB_UNRECLAIMABLE)), |
| 216 | K(global_page_state(NR_PAGETABLE)), | 220 | K(global_page_state(NR_PAGETABLE)), |
| 221 | #ifdef CONFIG_QUICKLIST | ||
| 222 | K(quicklist_total_size()), | ||
| 223 | #endif | ||
| 217 | K(global_page_state(NR_UNSTABLE_NFS)), | 224 | K(global_page_state(NR_UNSTABLE_NFS)), |
| 218 | K(global_page_state(NR_BOUNCE)), | 225 | K(global_page_state(NR_BOUNCE)), |
| 219 | K(global_page_state(NR_WRITEBACK_TEMP)), | 226 | K(global_page_state(NR_WRITEBACK_TEMP)), |
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 5acc001d49f6..f9a8b892718f 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c | |||
| @@ -10,149 +10,110 @@ | |||
| 10 | static struct dentry_operations proc_sys_dentry_operations; | 10 | static struct dentry_operations proc_sys_dentry_operations; |
| 11 | static const struct file_operations proc_sys_file_operations; | 11 | static const struct file_operations proc_sys_file_operations; |
| 12 | static const struct inode_operations proc_sys_inode_operations; | 12 | static const struct inode_operations proc_sys_inode_operations; |
| 13 | static const struct file_operations proc_sys_dir_file_operations; | ||
| 14 | static const struct inode_operations proc_sys_dir_operations; | ||
| 13 | 15 | ||
| 14 | static void proc_sys_refresh_inode(struct inode *inode, struct ctl_table *table) | 16 | static struct inode *proc_sys_make_inode(struct super_block *sb, |
| 15 | { | 17 | struct ctl_table_header *head, struct ctl_table *table) |
| 16 | /* Refresh the cached information bits in the inode */ | ||
| 17 | if (table) { | ||
| 18 | inode->i_uid = 0; | ||
| 19 | inode->i_gid = 0; | ||
| 20 | inode->i_mode = table->mode; | ||
| 21 | if (table->proc_handler) { | ||
| 22 | inode->i_mode |= S_IFREG; | ||
| 23 | inode->i_nlink = 1; | ||
| 24 | } else { | ||
| 25 | inode->i_mode |= S_IFDIR; | ||
| 26 | inode->i_nlink = 0; /* It is too hard to figure out */ | ||
| 27 | } | ||
| 28 | } | ||
| 29 | } | ||
| 30 | |||
| 31 | static struct inode *proc_sys_make_inode(struct inode *dir, struct ctl_table *table) | ||
| 32 | { | 18 | { |
| 33 | struct inode *inode; | 19 | struct inode *inode; |
| 34 | struct proc_inode *dir_ei, *ei; | 20 | struct proc_inode *ei; |
| 35 | int depth; | ||
| 36 | 21 | ||
| 37 | inode = new_inode(dir->i_sb); | 22 | inode = new_inode(sb); |
| 38 | if (!inode) | 23 | if (!inode) |
| 39 | goto out; | 24 | goto out; |
| 40 | 25 | ||
| 41 | /* A directory is always one deeper than it's parent */ | 26 | sysctl_head_get(head); |
| 42 | dir_ei = PROC_I(dir); | ||
| 43 | depth = dir_ei->fd + 1; | ||
| 44 | |||
| 45 | ei = PROC_I(inode); | 27 | ei = PROC_I(inode); |
| 46 | ei->fd = depth; | 28 | ei->sysctl = head; |
| 29 | ei->sysctl_entry = table; | ||
| 30 | |||
| 47 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; | 31 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; |
| 48 | inode->i_op = &proc_sys_inode_operations; | ||
| 49 | inode->i_fop = &proc_sys_file_operations; | ||
| 50 | inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */ | 32 | inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */ |
| 51 | proc_sys_refresh_inode(inode, table); | 33 | inode->i_mode = table->mode; |
| 34 | if (!table->child) { | ||
| 35 | inode->i_mode |= S_IFREG; | ||
| 36 | inode->i_op = &proc_sys_inode_operations; | ||
| 37 | inode->i_fop = &proc_sys_file_operations; | ||
| 38 | } else { | ||
| 39 | inode->i_mode |= S_IFDIR; | ||
| 40 | inode->i_nlink = 0; | ||
| 41 | inode->i_op = &proc_sys_dir_operations; | ||
| 42 | inode->i_fop = &proc_sys_dir_file_operations; | ||
| 43 | } | ||
| 52 | out: | 44 | out: |
| 53 | return inode; | 45 | return inode; |
| 54 | } | 46 | } |
| 55 | 47 | ||
| 56 | static struct dentry *proc_sys_ancestor(struct dentry *dentry, int depth) | 48 | static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name) |
| 57 | { | ||
| 58 | for (;;) { | ||
| 59 | struct proc_inode *ei; | ||
| 60 | |||
| 61 | ei = PROC_I(dentry->d_inode); | ||
| 62 | if (ei->fd == depth) | ||
| 63 | break; /* found */ | ||
| 64 | |||
| 65 | dentry = dentry->d_parent; | ||
| 66 | } | ||
| 67 | return dentry; | ||
| 68 | } | ||
| 69 | |||
| 70 | static struct ctl_table *proc_sys_lookup_table_one(struct ctl_table *table, | ||
| 71 | struct qstr *name) | ||
| 72 | { | 49 | { |
| 73 | int len; | 50 | int len; |
| 74 | for ( ; table->ctl_name || table->procname; table++) { | 51 | for ( ; p->ctl_name || p->procname; p++) { |
| 75 | 52 | ||
| 76 | if (!table->procname) | 53 | if (!p->procname) |
| 77 | continue; | 54 | continue; |
| 78 | 55 | ||
| 79 | len = strlen(table->procname); | 56 | len = strlen(p->procname); |
| 80 | if (len != name->len) | 57 | if (len != name->len) |
| 81 | continue; | 58 | continue; |
| 82 | 59 | ||
| 83 | if (memcmp(table->procname, name->name, len) != 0) | 60 | if (memcmp(p->procname, name->name, len) != 0) |
| 84 | continue; | 61 | continue; |
| 85 | 62 | ||
| 86 | /* I have a match */ | 63 | /* I have a match */ |
| 87 | return table; | 64 | return p; |
| 88 | } | 65 | } |
| 89 | return NULL; | 66 | return NULL; |
| 90 | } | 67 | } |
| 91 | 68 | ||
| 92 | static struct ctl_table *proc_sys_lookup_table(struct dentry *dentry, | 69 | struct ctl_table_header *grab_header(struct inode *inode) |
| 93 | struct ctl_table *table) | ||
| 94 | { | 70 | { |
| 95 | struct dentry *ancestor; | 71 | if (PROC_I(inode)->sysctl) |
| 96 | struct proc_inode *ei; | 72 | return sysctl_head_grab(PROC_I(inode)->sysctl); |
| 97 | int depth, i; | 73 | else |
| 74 | return sysctl_head_next(NULL); | ||
| 75 | } | ||
| 98 | 76 | ||
| 99 | ei = PROC_I(dentry->d_inode); | 77 | static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, |
| 100 | depth = ei->fd; | 78 | struct nameidata *nd) |
| 79 | { | ||
| 80 | struct ctl_table_header *head = grab_header(dir); | ||
| 81 | struct ctl_table *table = PROC_I(dir)->sysctl_entry; | ||
| 82 | struct ctl_table_header *h = NULL; | ||
| 83 | struct qstr *name = &dentry->d_name; | ||
| 84 | struct ctl_table *p; | ||
| 85 | struct inode *inode; | ||
| 86 | struct dentry *err = ERR_PTR(-ENOENT); | ||
| 101 | 87 | ||
| 102 | if (depth == 0) | 88 | if (IS_ERR(head)) |
| 103 | return table; | 89 | return ERR_CAST(head); |
| 104 | 90 | ||
| 105 | for (i = 1; table && (i <= depth); i++) { | 91 | if (table && !table->child) { |
| 106 | ancestor = proc_sys_ancestor(dentry, i); | 92 | WARN_ON(1); |
| 107 | table = proc_sys_lookup_table_one(table, &ancestor->d_name); | 93 | goto out; |
| 108 | if (table) | ||
| 109 | table = table->child; | ||
| 110 | } | 94 | } |
| 111 | return table; | ||
| 112 | |||
| 113 | } | ||
| 114 | static struct ctl_table *proc_sys_lookup_entry(struct dentry *dparent, | ||
| 115 | struct qstr *name, | ||
| 116 | struct ctl_table *table) | ||
| 117 | { | ||
| 118 | table = proc_sys_lookup_table(dparent, table); | ||
| 119 | if (table) | ||
| 120 | table = proc_sys_lookup_table_one(table, name); | ||
| 121 | return table; | ||
| 122 | } | ||
| 123 | 95 | ||
| 124 | static struct ctl_table *do_proc_sys_lookup(struct dentry *parent, | 96 | table = table ? table->child : head->ctl_table; |
| 125 | struct qstr *name, | ||
| 126 | struct ctl_table_header **ptr) | ||
| 127 | { | ||
| 128 | struct ctl_table_header *head; | ||
| 129 | struct ctl_table *table = NULL; | ||
| 130 | 97 | ||
| 131 | for (head = sysctl_head_next(NULL); head; | 98 | p = find_in_table(table, name); |
| 132 | head = sysctl_head_next(head)) { | 99 | if (!p) { |
| 133 | table = proc_sys_lookup_entry(parent, name, head->ctl_table); | 100 | for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) { |
| 134 | if (table) | 101 | if (h->attached_to != table) |
| 135 | break; | 102 | continue; |
| 103 | p = find_in_table(h->attached_by, name); | ||
| 104 | if (p) | ||
| 105 | break; | ||
| 106 | } | ||
| 136 | } | 107 | } |
| 137 | *ptr = head; | ||
| 138 | return table; | ||
| 139 | } | ||
| 140 | |||
| 141 | static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, | ||
| 142 | struct nameidata *nd) | ||
| 143 | { | ||
| 144 | struct ctl_table_header *head; | ||
| 145 | struct inode *inode; | ||
| 146 | struct dentry *err; | ||
| 147 | struct ctl_table *table; | ||
| 148 | 108 | ||
| 149 | err = ERR_PTR(-ENOENT); | 109 | if (!p) |
| 150 | table = do_proc_sys_lookup(dentry->d_parent, &dentry->d_name, &head); | ||
| 151 | if (!table) | ||
| 152 | goto out; | 110 | goto out; |
| 153 | 111 | ||
| 154 | err = ERR_PTR(-ENOMEM); | 112 | err = ERR_PTR(-ENOMEM); |
| 155 | inode = proc_sys_make_inode(dir, table); | 113 | inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); |
| 114 | if (h) | ||
| 115 | sysctl_head_finish(h); | ||
| 116 | |||
| 156 | if (!inode) | 117 | if (!inode) |
| 157 | goto out; | 118 | goto out; |
| 158 | 119 | ||
| @@ -168,22 +129,14 @@ out: | |||
| 168 | static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, | 129 | static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, |
| 169 | size_t count, loff_t *ppos, int write) | 130 | size_t count, loff_t *ppos, int write) |
| 170 | { | 131 | { |
| 171 | struct dentry *dentry = filp->f_dentry; | 132 | struct inode *inode = filp->f_path.dentry->d_inode; |
| 172 | struct ctl_table_header *head; | 133 | struct ctl_table_header *head = grab_header(inode); |
| 173 | struct ctl_table *table; | 134 | struct ctl_table *table = PROC_I(inode)->sysctl_entry; |
| 174 | ssize_t error; | 135 | ssize_t error; |
| 175 | size_t res; | 136 | size_t res; |
| 176 | 137 | ||
| 177 | table = do_proc_sys_lookup(dentry->d_parent, &dentry->d_name, &head); | 138 | if (IS_ERR(head)) |
| 178 | /* Has the sysctl entry disappeared on us? */ | 139 | return PTR_ERR(head); |
| 179 | error = -ENOENT; | ||
| 180 | if (!table) | ||
| 181 | goto out; | ||
| 182 | |||
| 183 | /* Has the sysctl entry been replaced by a directory? */ | ||
| 184 | error = -EISDIR; | ||
| 185 | if (!table->proc_handler) | ||
| 186 | goto out; | ||
| 187 | 140 | ||
| 188 | /* | 141 | /* |
| 189 | * At this point we know that the sysctl was not unregistered | 142 | * At this point we know that the sysctl was not unregistered |
| @@ -193,6 +146,11 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, | |||
| 193 | if (sysctl_perm(head->root, table, write ? MAY_WRITE : MAY_READ)) | 146 | if (sysctl_perm(head->root, table, write ? MAY_WRITE : MAY_READ)) |
| 194 | goto out; | 147 | goto out; |
| 195 | 148 | ||
| 149 | /* if that can happen at all, it should be -EINVAL, not -EISDIR */ | ||
| 150 | error = -EINVAL; | ||
| 151 | if (!table->proc_handler) | ||
| 152 | goto out; | ||
| 153 | |||
| 196 | /* careful: calling conventions are nasty here */ | 154 | /* careful: calling conventions are nasty here */ |
| 197 | res = count; | 155 | res = count; |
| 198 | error = table->proc_handler(table, write, filp, buf, &res, ppos); | 156 | error = table->proc_handler(table, write, filp, buf, &res, ppos); |
| @@ -218,82 +176,86 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf, | |||
| 218 | 176 | ||
| 219 | 177 | ||
| 220 | static int proc_sys_fill_cache(struct file *filp, void *dirent, | 178 | static int proc_sys_fill_cache(struct file *filp, void *dirent, |
| 221 | filldir_t filldir, struct ctl_table *table) | 179 | filldir_t filldir, |
| 180 | struct ctl_table_header *head, | ||
| 181 | struct ctl_table *table) | ||
| 222 | { | 182 | { |
| 223 | struct ctl_table_header *head; | ||
| 224 | struct ctl_table *child_table = NULL; | ||
| 225 | struct dentry *child, *dir = filp->f_path.dentry; | 183 | struct dentry *child, *dir = filp->f_path.dentry; |
| 226 | struct inode *inode; | 184 | struct inode *inode; |
| 227 | struct qstr qname; | 185 | struct qstr qname; |
| 228 | ino_t ino = 0; | 186 | ino_t ino = 0; |
| 229 | unsigned type = DT_UNKNOWN; | 187 | unsigned type = DT_UNKNOWN; |
| 230 | int ret; | ||
| 231 | 188 | ||
| 232 | qname.name = table->procname; | 189 | qname.name = table->procname; |
| 233 | qname.len = strlen(table->procname); | 190 | qname.len = strlen(table->procname); |
| 234 | qname.hash = full_name_hash(qname.name, qname.len); | 191 | qname.hash = full_name_hash(qname.name, qname.len); |
| 235 | 192 | ||
| 236 | /* Suppress duplicates. | ||
| 237 | * Only fill a directory entry if it is the value that | ||
| 238 | * an ordinary lookup of that name returns. Hide all | ||
| 239 | * others. | ||
| 240 | * | ||
| 241 | * If we ever cache this translation in the dcache | ||
| 242 | * I should do a dcache lookup first. But for now | ||
| 243 | * it is just simpler not to. | ||
| 244 | */ | ||
| 245 | ret = 0; | ||
| 246 | child_table = do_proc_sys_lookup(dir, &qname, &head); | ||
| 247 | sysctl_head_finish(head); | ||
| 248 | if (child_table != table) | ||
| 249 | return 0; | ||
| 250 | |||
| 251 | child = d_lookup(dir, &qname); | 193 | child = d_lookup(dir, &qname); |
| 252 | if (!child) { | 194 | if (!child) { |
| 253 | struct dentry *new; | 195 | child = d_alloc(dir, &qname); |
| 254 | new = d_alloc(dir, &qname); | 196 | if (child) { |
| 255 | if (new) { | 197 | inode = proc_sys_make_inode(dir->d_sb, head, table); |
| 256 | inode = proc_sys_make_inode(dir->d_inode, table); | 198 | if (!inode) { |
| 257 | if (!inode) | 199 | dput(child); |
| 258 | child = ERR_PTR(-ENOMEM); | 200 | return -ENOMEM; |
| 259 | else { | 201 | } else { |
| 260 | new->d_op = &proc_sys_dentry_operations; | 202 | child->d_op = &proc_sys_dentry_operations; |
| 261 | d_add(new, inode); | 203 | d_add(child, inode); |
| 262 | } | 204 | } |
| 263 | if (child) | 205 | } else { |
| 264 | dput(new); | 206 | return -ENOMEM; |
| 265 | else | ||
| 266 | child = new; | ||
| 267 | } | 207 | } |
| 268 | } | 208 | } |
| 269 | if (!child || IS_ERR(child) || !child->d_inode) | ||
| 270 | goto end_instantiate; | ||
| 271 | inode = child->d_inode; | 209 | inode = child->d_inode; |
| 272 | if (inode) { | 210 | ino = inode->i_ino; |
| 273 | ino = inode->i_ino; | 211 | type = inode->i_mode >> 12; |
| 274 | type = inode->i_mode >> 12; | ||
| 275 | } | ||
| 276 | dput(child); | 212 | dput(child); |
| 277 | end_instantiate: | 213 | return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type); |
| 278 | if (!ino) | 214 | } |
| 279 | ino= find_inode_number(dir, &qname); | 215 | |
| 280 | if (!ino) | 216 | static int scan(struct ctl_table_header *head, ctl_table *table, |
| 281 | ino = 1; | 217 | unsigned long *pos, struct file *file, |
| 282 | return filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type); | 218 | void *dirent, filldir_t filldir) |
| 219 | { | ||
| 220 | |||
| 221 | for (; table->ctl_name || table->procname; table++, (*pos)++) { | ||
| 222 | int res; | ||
| 223 | |||
| 224 | /* Can't do anything without a proc name */ | ||
| 225 | if (!table->procname) | ||
| 226 | continue; | ||
| 227 | |||
| 228 | if (*pos < file->f_pos) | ||
| 229 | continue; | ||
| 230 | |||
| 231 | res = proc_sys_fill_cache(file, dirent, filldir, head, table); | ||
| 232 | if (res) | ||
| 233 | return res; | ||
| 234 | |||
| 235 | file->f_pos = *pos + 1; | ||
| 236 | } | ||
| 237 | return 0; | ||
| 283 | } | 238 | } |
| 284 | 239 | ||
| 285 | static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) | 240 | static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) |
| 286 | { | 241 | { |
| 287 | struct dentry *dentry = filp->f_dentry; | 242 | struct dentry *dentry = filp->f_path.dentry; |
| 288 | struct inode *inode = dentry->d_inode; | 243 | struct inode *inode = dentry->d_inode; |
| 289 | struct ctl_table_header *head = NULL; | 244 | struct ctl_table_header *head = grab_header(inode); |
| 290 | struct ctl_table *table; | 245 | struct ctl_table *table = PROC_I(inode)->sysctl_entry; |
| 246 | struct ctl_table_header *h = NULL; | ||
| 291 | unsigned long pos; | 247 | unsigned long pos; |
| 292 | int ret; | 248 | int ret = -EINVAL; |
| 249 | |||
| 250 | if (IS_ERR(head)) | ||
| 251 | return PTR_ERR(head); | ||
| 293 | 252 | ||
| 294 | ret = -ENOTDIR; | 253 | if (table && !table->child) { |
| 295 | if (!S_ISDIR(inode->i_mode)) | 254 | WARN_ON(1); |
| 296 | goto out; | 255 | goto out; |
| 256 | } | ||
| 257 | |||
| 258 | table = table ? table->child : head->ctl_table; | ||
| 297 | 259 | ||
| 298 | ret = 0; | 260 | ret = 0; |
| 299 | /* Avoid a switch here: arm builds fail with missing __cmpdi2 */ | 261 | /* Avoid a switch here: arm builds fail with missing __cmpdi2 */ |
| @@ -311,30 +273,17 @@ static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
| 311 | } | 273 | } |
| 312 | pos = 2; | 274 | pos = 2; |
| 313 | 275 | ||
| 314 | /* - Find each instance of the directory | 276 | ret = scan(head, table, &pos, filp, dirent, filldir); |
| 315 | * - Read all entries in each instance | 277 | if (ret) |
| 316 | * - Before returning an entry to user space lookup the entry | 278 | goto out; |
| 317 | * by name and if I find a different entry don't return | ||
| 318 | * this one because it means it is a buried dup. | ||
| 319 | * For sysctl this should only happen for directory entries. | ||
| 320 | */ | ||
| 321 | for (head = sysctl_head_next(NULL); head; head = sysctl_head_next(head)) { | ||
| 322 | table = proc_sys_lookup_table(dentry, head->ctl_table); | ||
| 323 | 279 | ||
| 324 | if (!table) | 280 | for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) { |
| 281 | if (h->attached_to != table) | ||
| 325 | continue; | 282 | continue; |
| 326 | 283 | ret = scan(h, h->attached_by, &pos, filp, dirent, filldir); | |
| 327 | for (; table->ctl_name || table->procname; table++, pos++) { | 284 | if (ret) { |
| 328 | /* Can't do anything without a proc name */ | 285 | sysctl_head_finish(h); |
| 329 | if (!table->procname) | 286 | break; |
| 330 | continue; | ||
| 331 | |||
| 332 | if (pos < filp->f_pos) | ||
| 333 | continue; | ||
| 334 | |||
| 335 | if (proc_sys_fill_cache(filp, dirent, filldir, table) < 0) | ||
| 336 | goto out; | ||
| 337 | filp->f_pos = pos + 1; | ||
| 338 | } | 287 | } |
| 339 | } | 288 | } |
| 340 | ret = 1; | 289 | ret = 1; |
| @@ -343,53 +292,24 @@ out: | |||
| 343 | return ret; | 292 | return ret; |
| 344 | } | 293 | } |
| 345 | 294 | ||
| 346 | static int proc_sys_permission(struct inode *inode, int mask, struct nameidata *nd) | 295 | static int proc_sys_permission(struct inode *inode, int mask) |
| 347 | { | 296 | { |
| 348 | /* | 297 | /* |
| 349 | * sysctl entries that are not writeable, | 298 | * sysctl entries that are not writeable, |
| 350 | * are _NOT_ writeable, capabilities or not. | 299 | * are _NOT_ writeable, capabilities or not. |
| 351 | */ | 300 | */ |
| 352 | struct ctl_table_header *head; | 301 | struct ctl_table_header *head = grab_header(inode); |
| 353 | struct ctl_table *table; | 302 | struct ctl_table *table = PROC_I(inode)->sysctl_entry; |
| 354 | struct dentry *dentry; | ||
| 355 | int mode; | ||
| 356 | int depth; | ||
| 357 | int error; | 303 | int error; |
| 358 | 304 | ||
| 359 | head = NULL; | 305 | if (IS_ERR(head)) |
| 360 | depth = PROC_I(inode)->fd; | 306 | return PTR_ERR(head); |
| 361 | |||
| 362 | /* First check the cached permissions, in case we don't have | ||
| 363 | * enough information to lookup the sysctl table entry. | ||
| 364 | */ | ||
| 365 | error = -EACCES; | ||
| 366 | mode = inode->i_mode; | ||
| 367 | |||
| 368 | if (current->euid == 0) | ||
| 369 | mode >>= 6; | ||
| 370 | else if (in_group_p(0)) | ||
| 371 | mode >>= 3; | ||
| 372 | |||
| 373 | if ((mode & mask & (MAY_READ|MAY_WRITE|MAY_EXEC)) == mask) | ||
| 374 | error = 0; | ||
| 375 | |||
| 376 | /* If we can't get a sysctl table entry the permission | ||
| 377 | * checks on the cached mode will have to be enough. | ||
| 378 | */ | ||
| 379 | if (!nd || !depth) | ||
| 380 | goto out; | ||
| 381 | 307 | ||
| 382 | dentry = nd->path.dentry; | 308 | if (!table) /* global root - r-xr-xr-x */ |
| 383 | table = do_proc_sys_lookup(dentry->d_parent, &dentry->d_name, &head); | 309 | error = mask & MAY_WRITE ? -EACCES : 0; |
| 310 | else /* Use the permissions on the sysctl table entry */ | ||
| 311 | error = sysctl_perm(head->root, table, mask); | ||
| 384 | 312 | ||
| 385 | /* If the entry does not exist deny permission */ | ||
| 386 | error = -EACCES; | ||
| 387 | if (!table) | ||
| 388 | goto out; | ||
| 389 | |||
| 390 | /* Use the permissions on the sysctl table entry */ | ||
| 391 | error = sysctl_perm(head->root, table, mask); | ||
| 392 | out: | ||
| 393 | sysctl_head_finish(head); | 313 | sysctl_head_finish(head); |
| 394 | return error; | 314 | return error; |
| 395 | } | 315 | } |
| @@ -409,33 +329,70 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr) | |||
| 409 | return error; | 329 | return error; |
| 410 | } | 330 | } |
| 411 | 331 | ||
| 412 | /* I'm lazy and don't distinguish between files and directories, | 332 | static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) |
| 413 | * until access time. | 333 | { |
| 414 | */ | 334 | struct inode *inode = dentry->d_inode; |
| 335 | struct ctl_table_header *head = grab_header(inode); | ||
| 336 | struct ctl_table *table = PROC_I(inode)->sysctl_entry; | ||
| 337 | |||
| 338 | if (IS_ERR(head)) | ||
| 339 | return PTR_ERR(head); | ||
| 340 | |||
| 341 | generic_fillattr(inode, stat); | ||
| 342 | if (table) | ||
| 343 | stat->mode = (stat->mode & S_IFMT) | table->mode; | ||
| 344 | |||
| 345 | sysctl_head_finish(head); | ||
| 346 | return 0; | ||
| 347 | } | ||
| 348 | |||
| 415 | static const struct file_operations proc_sys_file_operations = { | 349 | static const struct file_operations proc_sys_file_operations = { |
| 416 | .read = proc_sys_read, | 350 | .read = proc_sys_read, |
| 417 | .write = proc_sys_write, | 351 | .write = proc_sys_write, |
| 352 | }; | ||
| 353 | |||
| 354 | static const struct file_operations proc_sys_dir_file_operations = { | ||
| 418 | .readdir = proc_sys_readdir, | 355 | .readdir = proc_sys_readdir, |
| 419 | }; | 356 | }; |
| 420 | 357 | ||
| 421 | static const struct inode_operations proc_sys_inode_operations = { | 358 | static const struct inode_operations proc_sys_inode_operations = { |
| 359 | .permission = proc_sys_permission, | ||
| 360 | .setattr = proc_sys_setattr, | ||
| 361 | .getattr = proc_sys_getattr, | ||
| 362 | }; | ||
| 363 | |||
| 364 | static const struct inode_operations proc_sys_dir_operations = { | ||
| 422 | .lookup = proc_sys_lookup, | 365 | .lookup = proc_sys_lookup, |
| 423 | .permission = proc_sys_permission, | 366 | .permission = proc_sys_permission, |
| 424 | .setattr = proc_sys_setattr, | 367 | .setattr = proc_sys_setattr, |
| 368 | .getattr = proc_sys_getattr, | ||
| 425 | }; | 369 | }; |
| 426 | 370 | ||
| 427 | static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd) | 371 | static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd) |
| 428 | { | 372 | { |
| 429 | struct ctl_table_header *head; | 373 | return !PROC_I(dentry->d_inode)->sysctl->unregistering; |
| 430 | struct ctl_table *table; | 374 | } |
| 431 | table = do_proc_sys_lookup(dentry->d_parent, &dentry->d_name, &head); | 375 | |
| 432 | proc_sys_refresh_inode(dentry->d_inode, table); | 376 | static int proc_sys_delete(struct dentry *dentry) |
| 433 | sysctl_head_finish(head); | 377 | { |
| 434 | return !!table; | 378 | return !!PROC_I(dentry->d_inode)->sysctl->unregistering; |
| 379 | } | ||
| 380 | |||
| 381 | static int proc_sys_compare(struct dentry *dir, struct qstr *qstr, | ||
| 382 | struct qstr *name) | ||
| 383 | { | ||
| 384 | struct dentry *dentry = container_of(qstr, struct dentry, d_name); | ||
| 385 | if (qstr->len != name->len) | ||
| 386 | return 1; | ||
| 387 | if (memcmp(qstr->name, name->name, name->len)) | ||
| 388 | return 1; | ||
| 389 | return !sysctl_is_seen(PROC_I(dentry->d_inode)->sysctl); | ||
| 435 | } | 390 | } |
| 436 | 391 | ||
| 437 | static struct dentry_operations proc_sys_dentry_operations = { | 392 | static struct dentry_operations proc_sys_dentry_operations = { |
| 438 | .d_revalidate = proc_sys_revalidate, | 393 | .d_revalidate = proc_sys_revalidate, |
| 394 | .d_delete = proc_sys_delete, | ||
| 395 | .d_compare = proc_sys_compare, | ||
| 439 | }; | 396 | }; |
| 440 | 397 | ||
| 441 | static struct proc_dir_entry *proc_sys_root; | 398 | static struct proc_dir_entry *proc_sys_root; |
| @@ -443,8 +400,8 @@ static struct proc_dir_entry *proc_sys_root; | |||
| 443 | int proc_sys_init(void) | 400 | int proc_sys_init(void) |
| 444 | { | 401 | { |
| 445 | proc_sys_root = proc_mkdir("sys", NULL); | 402 | proc_sys_root = proc_mkdir("sys", NULL); |
| 446 | proc_sys_root->proc_iops = &proc_sys_inode_operations; | 403 | proc_sys_root->proc_iops = &proc_sys_dir_operations; |
| 447 | proc_sys_root->proc_fops = &proc_sys_file_operations; | 404 | proc_sys_root->proc_fops = &proc_sys_dir_file_operations; |
| 448 | proc_sys_root->nlink = 0; | 405 | proc_sys_root->nlink = 0; |
| 449 | return 0; | 406 | return 0; |
| 450 | } | 407 | } |
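The proc_sysctl.c rework above drops the per-call do_proc_sys_lookup() and instead keeps the ctl_table pointer in the proc inode, so read, write, readdir, permission and getattr all follow the same shape: grab_header() pins the sysctl header, IS_ERR() propagates a vanished entry, and sysctl_head_finish() drops the reference. A hedged kernel-style sketch of that shape, using only helpers visible in the hunks above; proc_sys_do_something() and do_work_on() are invented names for illustration, and this is not compiled code:

    static int proc_sys_do_something(struct inode *inode)
    {
            struct ctl_table_header *head = grab_header(inode);
            struct ctl_table *table = PROC_I(inode)->sysctl_entry;
            int error = 0;

            if (IS_ERR(head))
                    return PTR_ERR(head);        /* entry is gone or unregistering */

            if (table)
                    error = do_work_on(table);   /* hypothetical helper */

            sysctl_head_finish(head);            /* always pairs with grab_header() */
            return error;
    }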
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7546a918f790..73d1891ee625 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c | |||
| @@ -219,14 +219,14 @@ static int show_map(struct seq_file *m, void *v) | |||
| 219 | ino = inode->i_ino; | 219 | ino = inode->i_ino; |
| 220 | } | 220 | } |
| 221 | 221 | ||
| 222 | seq_printf(m, "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", | 222 | seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", |
| 223 | vma->vm_start, | 223 | vma->vm_start, |
| 224 | vma->vm_end, | 224 | vma->vm_end, |
| 225 | flags & VM_READ ? 'r' : '-', | 225 | flags & VM_READ ? 'r' : '-', |
| 226 | flags & VM_WRITE ? 'w' : '-', | 226 | flags & VM_WRITE ? 'w' : '-', |
| 227 | flags & VM_EXEC ? 'x' : '-', | 227 | flags & VM_EXEC ? 'x' : '-', |
| 228 | flags & VM_MAYSHARE ? 's' : 'p', | 228 | flags & VM_MAYSHARE ? 's' : 'p', |
| 229 | vma->vm_pgoff << PAGE_SHIFT, | 229 | ((loff_t)vma->vm_pgoff) << PAGE_SHIFT, |
| 230 | MAJOR(dev), MINOR(dev), ino, &len); | 230 | MAJOR(dev), MINOR(dev), ino, &len); |
| 231 | 231 | ||
| 232 | /* | 232 | /* |
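The show_map() fix widens vm_pgoff to loff_t before shifting: on a 32-bit kernel an unsigned long shifted left by PAGE_SHIFT wraps for mappings whose file offset exceeds 4 GB, so the printed offset was wrong. A minimal stand-alone sketch of the same pitfall; PAGE_SHIFT is assumed to be 12 here, the kernel value is per-architecture:

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12   /* assumption for the example */

    int main(void)
    {
            uint32_t pgoff = 0x00200000;                    /* page offset of an 8 GB file position */
            uint64_t wrong = pgoff << PAGE_SHIFT;           /* shift done in 32 bits, then widened: wraps to 0 */
            uint64_t right = (uint64_t)pgoff << PAGE_SHIFT; /* widen first, as the patch does with loff_t */

            printf("wrong=%llu right=%llu\n",
                   (unsigned long long)wrong, (unsigned long long)right);
            return 0;
    }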
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index b31ab78052b3..2aad1044b84c 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c | |||
| @@ -553,7 +553,7 @@ static void qnx4_destroy_inode(struct inode *inode) | |||
| 553 | kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode)); | 553 | kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode)); |
| 554 | } | 554 | } |
| 555 | 555 | ||
| 556 | static void init_once(struct kmem_cache *cachep, void *foo) | 556 | static void init_once(void *foo) |
| 557 | { | 557 | { |
| 558 | struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo; | 558 | struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo; |
| 559 | 559 | ||
diff --git a/fs/quota.c b/fs/quota.c index db1cc9f3c7aa..7f4386ebc23a 100644 --- a/fs/quota.c +++ b/fs/quota.c | |||
| @@ -186,7 +186,7 @@ static void quota_sync_sb(struct super_block *sb, int type) | |||
| 186 | 186 | ||
| 187 | void sync_dquots(struct super_block *sb, int type) | 187 | void sync_dquots(struct super_block *sb, int type) |
| 188 | { | 188 | { |
| 189 | int cnt, dirty; | 189 | int cnt; |
| 190 | 190 | ||
| 191 | if (sb) { | 191 | if (sb) { |
| 192 | if (sb->s_qcop->quota_sync) | 192 | if (sb->s_qcop->quota_sync) |
| @@ -198,11 +198,17 @@ void sync_dquots(struct super_block *sb, int type) | |||
| 198 | restart: | 198 | restart: |
| 199 | list_for_each_entry(sb, &super_blocks, s_list) { | 199 | list_for_each_entry(sb, &super_blocks, s_list) { |
| 200 | /* This test just improves performance so it needn't be reliable... */ | 200 | /* This test just improves performance so it needn't be reliable... */ |
| 201 | for (cnt = 0, dirty = 0; cnt < MAXQUOTAS; cnt++) | 201 | for (cnt = 0; cnt < MAXQUOTAS; cnt++) { |
| 202 | if ((type == cnt || type == -1) && sb_has_quota_enabled(sb, cnt) | 202 | if (type != -1 && type != cnt) |
| 203 | && info_any_dirty(&sb_dqopt(sb)->info[cnt])) | 203 | continue; |
| 204 | dirty = 1; | 204 | if (!sb_has_quota_enabled(sb, cnt)) |
| 205 | if (!dirty) | 205 | continue; |
| 206 | if (!info_dirty(&sb_dqopt(sb)->info[cnt]) && | ||
| 207 | list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list)) | ||
| 208 | continue; | ||
| 209 | break; | ||
| 210 | } | ||
| 211 | if (cnt == MAXQUOTAS) | ||
| 206 | continue; | 212 | continue; |
| 207 | sb->s_count++; | 213 | sb->s_count++; |
| 208 | spin_unlock(&sb_lock); | 214 | spin_unlock(&sb_lock); |
diff --git a/fs/quota_v1.c b/fs/quota_v1.c index a6cf9269105c..5ae15b13eeb0 100644 --- a/fs/quota_v1.c +++ b/fs/quota_v1.c | |||
| @@ -1,6 +1,7 @@ | |||
| 1 | #include <linux/errno.h> | 1 | #include <linux/errno.h> |
| 2 | #include <linux/fs.h> | 2 | #include <linux/fs.h> |
| 3 | #include <linux/quota.h> | 3 | #include <linux/quota.h> |
| 4 | #include <linux/quotaops.h> | ||
| 4 | #include <linux/dqblk_v1.h> | 5 | #include <linux/dqblk_v1.h> |
| 5 | #include <linux/quotaio_v1.h> | 6 | #include <linux/quotaio_v1.h> |
| 6 | #include <linux/kernel.h> | 7 | #include <linux/kernel.h> |
diff --git a/fs/quota_v2.c b/fs/quota_v2.c index 234ada903633..b53827dc02d9 100644 --- a/fs/quota_v2.c +++ b/fs/quota_v2.c | |||
| @@ -11,6 +11,7 @@ | |||
| 11 | #include <linux/init.h> | 11 | #include <linux/init.h> |
| 12 | #include <linux/module.h> | 12 | #include <linux/module.h> |
| 13 | #include <linux/slab.h> | 13 | #include <linux/slab.h> |
| 14 | #include <linux/quotaops.h> | ||
| 14 | 15 | ||
| 15 | #include <asm/byteorder.h> | 16 | #include <asm/byteorder.h> |
| 16 | 17 | ||
diff --git a/fs/readdir.c b/fs/readdir.c index 4e026e5407fb..93a7559bbfd8 100644 --- a/fs/readdir.c +++ b/fs/readdir.c | |||
| @@ -80,8 +80,10 @@ static int fillonedir(void * __buf, const char * name, int namlen, loff_t offset | |||
| 80 | if (buf->result) | 80 | if (buf->result) |
| 81 | return -EINVAL; | 81 | return -EINVAL; |
| 82 | d_ino = ino; | 82 | d_ino = ino; |
| 83 | if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) | 83 | if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { |
| 84 | buf->result = -EOVERFLOW; | ||
| 84 | return -EOVERFLOW; | 85 | return -EOVERFLOW; |
| 86 | } | ||
| 85 | buf->result++; | 87 | buf->result++; |
| 86 | dirent = buf->dirent; | 88 | dirent = buf->dirent; |
| 87 | if (!access_ok(VERIFY_WRITE, dirent, | 89 | if (!access_ok(VERIFY_WRITE, dirent, |
| @@ -155,8 +157,10 @@ static int filldir(void * __buf, const char * name, int namlen, loff_t offset, | |||
| 155 | if (reclen > buf->count) | 157 | if (reclen > buf->count) |
| 156 | return -EINVAL; | 158 | return -EINVAL; |
| 157 | d_ino = ino; | 159 | d_ino = ino; |
| 158 | if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) | 160 | if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { |
| 161 | buf->error = -EOVERFLOW; | ||
| 159 | return -EOVERFLOW; | 162 | return -EOVERFLOW; |
| 163 | } | ||
| 160 | dirent = buf->previous; | 164 | dirent = buf->previous; |
| 161 | if (dirent) { | 165 | if (dirent) { |
| 162 | if (__put_user(offset, &dirent->d_off)) | 166 | if (__put_user(offset, &dirent->d_off)) |
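Both filldir callbacks above now record -EOVERFLOW in the callback's result/error field as well as returning it, so the caller of the readdir path actually sees the failure. The check itself guards against silently truncating a wide inode number into a narrower d_ino field. A compilable sketch of that truncation test, with fixed-width types standing in for ino_t and the old dirent field:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t ino = 0x100000001ULL;   /* inode number from the filesystem */
            uint32_t d_ino = ino;            /* narrower field in an old dirent layout */

            /* Same shape as the kernel test: only complain when narrowing lost bits. */
            if (sizeof(d_ino) < sizeof(ino) && d_ino != ino)
                    puts("-EOVERFLOW: inode number does not fit in this dirent format");
            else
                    puts("ok");
            return 0;
    }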
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 192269698a8a..5699171212ae 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c | |||
| @@ -2435,7 +2435,7 @@ static int reiserfs_write_full_page(struct page *page, | |||
| 2435 | if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { | 2435 | if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { |
| 2436 | lock_buffer(bh); | 2436 | lock_buffer(bh); |
| 2437 | } else { | 2437 | } else { |
| 2438 | if (test_set_buffer_locked(bh)) { | 2438 | if (!trylock_buffer(bh)) { |
| 2439 | redirty_page_for_writepage(wbc, page); | 2439 | redirty_page_for_writepage(wbc, page); |
| 2440 | continue; | 2440 | continue; |
| 2441 | } | 2441 | } |
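test_set_buffer_locked(bh) returned nonzero when the buffer was already locked, while its replacement trylock_buffer(bh) returns nonzero when the lock was just acquired, so every conversion in this diff flips the condition (the same applies to TestSetPageLocked() versus trylock_page() elsewhere in the series). A hedged kernel-style sketch of the two equivalent forms, not compiled code:

    #include <linux/buffer_head.h>

    static void example(struct buffer_head *bh)
    {
            /* old style: if (test_set_buffer_locked(bh)) meant "someone else holds it" */

            /* new style: nonzero return means "we now hold the lock" */
            if (!trylock_buffer(bh)) {
                    /* contended: redirty or retry, as the callers above do */
                    return;
            }
            /* ... work on the locked buffer ... */
            unlock_buffer(bh);       /* assumed counterpart; pairs with the trylock */
    }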
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index e396b2fa4743..c21df71943a6 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c | |||
| @@ -34,15 +34,10 @@ | |||
| 34 | ** from within kupdate, it will ignore the immediate flag | 34 | ** from within kupdate, it will ignore the immediate flag |
| 35 | */ | 35 | */ |
| 36 | 36 | ||
| 37 | #include <asm/uaccess.h> | ||
| 38 | #include <asm/system.h> | ||
| 39 | |||
| 40 | #include <linux/time.h> | 37 | #include <linux/time.h> |
| 41 | #include <linux/semaphore.h> | 38 | #include <linux/semaphore.h> |
| 42 | |||
| 43 | #include <linux/vmalloc.h> | 39 | #include <linux/vmalloc.h> |
| 44 | #include <linux/reiserfs_fs.h> | 40 | #include <linux/reiserfs_fs.h> |
| 45 | |||
| 46 | #include <linux/kernel.h> | 41 | #include <linux/kernel.h> |
| 47 | #include <linux/errno.h> | 42 | #include <linux/errno.h> |
| 48 | #include <linux/fcntl.h> | 43 | #include <linux/fcntl.h> |
| @@ -54,6 +49,9 @@ | |||
| 54 | #include <linux/writeback.h> | 49 | #include <linux/writeback.h> |
| 55 | #include <linux/blkdev.h> | 50 | #include <linux/blkdev.h> |
| 56 | #include <linux/backing-dev.h> | 51 | #include <linux/backing-dev.h> |
| 52 | #include <linux/uaccess.h> | ||
| 53 | |||
| 54 | #include <asm/system.h> | ||
| 57 | 55 | ||
| 58 | /* gets a struct reiserfs_journal_list * from a list head */ | 56 | /* gets a struct reiserfs_journal_list * from a list head */ |
| 59 | #define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ | 57 | #define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ |
| @@ -558,13 +556,13 @@ static inline void insert_journal_hash(struct reiserfs_journal_cnode **table, | |||
| 558 | static inline void lock_journal(struct super_block *p_s_sb) | 556 | static inline void lock_journal(struct super_block *p_s_sb) |
| 559 | { | 557 | { |
| 560 | PROC_INFO_INC(p_s_sb, journal.lock_journal); | 558 | PROC_INFO_INC(p_s_sb, journal.lock_journal); |
| 561 | down(&SB_JOURNAL(p_s_sb)->j_lock); | 559 | mutex_lock(&SB_JOURNAL(p_s_sb)->j_mutex); |
| 562 | } | 560 | } |
| 563 | 561 | ||
| 564 | /* unlock the current transaction */ | 562 | /* unlock the current transaction */ |
| 565 | static inline void unlock_journal(struct super_block *p_s_sb) | 563 | static inline void unlock_journal(struct super_block *p_s_sb) |
| 566 | { | 564 | { |
| 567 | up(&SB_JOURNAL(p_s_sb)->j_lock); | 565 | mutex_unlock(&SB_JOURNAL(p_s_sb)->j_mutex); |
| 568 | } | 566 | } |
| 569 | 567 | ||
| 570 | static inline void get_journal_list(struct reiserfs_journal_list *jl) | 568 | static inline void get_journal_list(struct reiserfs_journal_list *jl) |
| @@ -629,7 +627,7 @@ static int journal_list_still_alive(struct super_block *s, | |||
| 629 | static void release_buffer_page(struct buffer_head *bh) | 627 | static void release_buffer_page(struct buffer_head *bh) |
| 630 | { | 628 | { |
| 631 | struct page *page = bh->b_page; | 629 | struct page *page = bh->b_page; |
| 632 | if (!page->mapping && !TestSetPageLocked(page)) { | 630 | if (!page->mapping && trylock_page(page)) { |
| 633 | page_cache_get(page); | 631 | page_cache_get(page); |
| 634 | put_bh(bh); | 632 | put_bh(bh); |
| 635 | if (!page->mapping) | 633 | if (!page->mapping) |
| @@ -857,7 +855,7 @@ static int write_ordered_buffers(spinlock_t * lock, | |||
| 857 | jh = JH_ENTRY(list->next); | 855 | jh = JH_ENTRY(list->next); |
| 858 | bh = jh->bh; | 856 | bh = jh->bh; |
| 859 | get_bh(bh); | 857 | get_bh(bh); |
| 860 | if (test_set_buffer_locked(bh)) { | 858 | if (!trylock_buffer(bh)) { |
| 861 | if (!buffer_dirty(bh)) { | 859 | if (!buffer_dirty(bh)) { |
| 862 | list_move(&jh->list, &tmp); | 860 | list_move(&jh->list, &tmp); |
| 863 | goto loop_next; | 861 | goto loop_next; |
| @@ -1045,9 +1043,9 @@ static int flush_commit_list(struct super_block *s, | |||
| 1045 | } | 1043 | } |
| 1046 | 1044 | ||
| 1047 | /* make sure nobody is trying to flush this one at the same time */ | 1045 | /* make sure nobody is trying to flush this one at the same time */ |
| 1048 | down(&jl->j_commit_lock); | 1046 | mutex_lock(&jl->j_commit_mutex); |
| 1049 | if (!journal_list_still_alive(s, trans_id)) { | 1047 | if (!journal_list_still_alive(s, trans_id)) { |
| 1050 | up(&jl->j_commit_lock); | 1048 | mutex_unlock(&jl->j_commit_mutex); |
| 1051 | goto put_jl; | 1049 | goto put_jl; |
| 1052 | } | 1050 | } |
| 1053 | BUG_ON(jl->j_trans_id == 0); | 1051 | BUG_ON(jl->j_trans_id == 0); |
| @@ -1057,7 +1055,7 @@ static int flush_commit_list(struct super_block *s, | |||
| 1057 | if (flushall) { | 1055 | if (flushall) { |
| 1058 | atomic_set(&(jl->j_older_commits_done), 1); | 1056 | atomic_set(&(jl->j_older_commits_done), 1); |
| 1059 | } | 1057 | } |
| 1060 | up(&jl->j_commit_lock); | 1058 | mutex_unlock(&jl->j_commit_mutex); |
| 1061 | goto put_jl; | 1059 | goto put_jl; |
| 1062 | } | 1060 | } |
| 1063 | 1061 | ||
| @@ -1181,7 +1179,7 @@ static int flush_commit_list(struct super_block *s, | |||
| 1181 | if (flushall) { | 1179 | if (flushall) { |
| 1182 | atomic_set(&(jl->j_older_commits_done), 1); | 1180 | atomic_set(&(jl->j_older_commits_done), 1); |
| 1183 | } | 1181 | } |
| 1184 | up(&jl->j_commit_lock); | 1182 | mutex_unlock(&jl->j_commit_mutex); |
| 1185 | put_jl: | 1183 | put_jl: |
| 1186 | put_journal_list(s, jl); | 1184 | put_journal_list(s, jl); |
| 1187 | 1185 | ||
| @@ -1411,8 +1409,8 @@ static int flush_journal_list(struct super_block *s, | |||
| 1411 | 1409 | ||
| 1412 | /* if flushall == 0, the lock is already held */ | 1410 | /* if flushall == 0, the lock is already held */ |
| 1413 | if (flushall) { | 1411 | if (flushall) { |
| 1414 | down(&journal->j_flush_sem); | 1412 | mutex_lock(&journal->j_flush_mutex); |
| 1415 | } else if (!down_trylock(&journal->j_flush_sem)) { | 1413 | } else if (mutex_trylock(&journal->j_flush_mutex)) { |
| 1416 | BUG(); | 1414 | BUG(); |
| 1417 | } | 1415 | } |
| 1418 | 1416 | ||
| @@ -1642,7 +1640,7 @@ static int flush_journal_list(struct super_block *s, | |||
| 1642 | jl->j_state = 0; | 1640 | jl->j_state = 0; |
| 1643 | put_journal_list(s, jl); | 1641 | put_journal_list(s, jl); |
| 1644 | if (flushall) | 1642 | if (flushall) |
| 1645 | up(&journal->j_flush_sem); | 1643 | mutex_unlock(&journal->j_flush_mutex); |
| 1646 | put_fs_excl(); | 1644 | put_fs_excl(); |
| 1647 | return err; | 1645 | return err; |
| 1648 | } | 1646 | } |
| @@ -1772,12 +1770,12 @@ static int kupdate_transactions(struct super_block *s, | |||
| 1772 | struct reiserfs_journal *journal = SB_JOURNAL(s); | 1770 | struct reiserfs_journal *journal = SB_JOURNAL(s); |
| 1773 | chunk.nr = 0; | 1771 | chunk.nr = 0; |
| 1774 | 1772 | ||
| 1775 | down(&journal->j_flush_sem); | 1773 | mutex_lock(&journal->j_flush_mutex); |
| 1776 | if (!journal_list_still_alive(s, orig_trans_id)) { | 1774 | if (!journal_list_still_alive(s, orig_trans_id)) { |
| 1777 | goto done; | 1775 | goto done; |
| 1778 | } | 1776 | } |
| 1779 | 1777 | ||
| 1780 | /* we've got j_flush_sem held, nobody is going to delete any | 1778 | /* we've got j_flush_mutex held, nobody is going to delete any |
| 1781 | * of these lists out from underneath us | 1779 | * of these lists out from underneath us |
| 1782 | */ | 1780 | */ |
| 1783 | while ((num_trans && transactions_flushed < num_trans) || | 1781 | while ((num_trans && transactions_flushed < num_trans) || |
| @@ -1812,7 +1810,7 @@ static int kupdate_transactions(struct super_block *s, | |||
| 1812 | } | 1810 | } |
| 1813 | 1811 | ||
| 1814 | done: | 1812 | done: |
| 1815 | up(&journal->j_flush_sem); | 1813 | mutex_unlock(&journal->j_flush_mutex); |
| 1816 | return ret; | 1814 | return ret; |
| 1817 | } | 1815 | } |
| 1818 | 1816 | ||
| @@ -2556,7 +2554,7 @@ static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s) | |||
| 2556 | INIT_LIST_HEAD(&jl->j_working_list); | 2554 | INIT_LIST_HEAD(&jl->j_working_list); |
| 2557 | INIT_LIST_HEAD(&jl->j_tail_bh_list); | 2555 | INIT_LIST_HEAD(&jl->j_tail_bh_list); |
| 2558 | INIT_LIST_HEAD(&jl->j_bh_list); | 2556 | INIT_LIST_HEAD(&jl->j_bh_list); |
| 2559 | sema_init(&jl->j_commit_lock, 1); | 2557 | mutex_init(&jl->j_commit_mutex); |
| 2560 | SB_JOURNAL(s)->j_num_lists++; | 2558 | SB_JOURNAL(s)->j_num_lists++; |
| 2561 | get_journal_list(jl); | 2559 | get_journal_list(jl); |
| 2562 | return jl; | 2560 | return jl; |
| @@ -2837,8 +2835,8 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name, | |||
| 2837 | journal->j_last = NULL; | 2835 | journal->j_last = NULL; |
| 2838 | journal->j_first = NULL; | 2836 | journal->j_first = NULL; |
| 2839 | init_waitqueue_head(&(journal->j_join_wait)); | 2837 | init_waitqueue_head(&(journal->j_join_wait)); |
| 2840 | sema_init(&journal->j_lock, 1); | 2838 | mutex_init(&journal->j_mutex); |
| 2841 | sema_init(&journal->j_flush_sem, 1); | 2839 | mutex_init(&journal->j_flush_mutex); |
| 2842 | 2840 | ||
| 2843 | journal->j_trans_id = 10; | 2841 | journal->j_trans_id = 10; |
| 2844 | journal->j_mount_id = 10; | 2842 | journal->j_mount_id = 10; |
| @@ -3873,7 +3871,7 @@ int reiserfs_prepare_for_journal(struct super_block *p_s_sb, | |||
| 3873 | { | 3871 | { |
| 3874 | PROC_INFO_INC(p_s_sb, journal.prepare); | 3872 | PROC_INFO_INC(p_s_sb, journal.prepare); |
| 3875 | 3873 | ||
| 3876 | if (test_set_buffer_locked(bh)) { | 3874 | if (!trylock_buffer(bh)) { |
| 3877 | if (!wait) | 3875 | if (!wait) |
| 3878 | return 0; | 3876 | return 0; |
| 3879 | lock_buffer(bh); | 3877 | lock_buffer(bh); |
| @@ -4030,7 +4028,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, | |||
| 4030 | * the new transaction is fully setup, and we've already flushed the | 4028 | * the new transaction is fully setup, and we've already flushed the |
| 4031 | * ordered bh list | 4029 | * ordered bh list |
| 4032 | */ | 4030 | */ |
| 4033 | down(&jl->j_commit_lock); | 4031 | mutex_lock(&jl->j_commit_mutex); |
| 4034 | 4032 | ||
| 4035 | /* save the transaction id in case we need to commit it later */ | 4033 | /* save the transaction id in case we need to commit it later */ |
| 4036 | commit_trans_id = jl->j_trans_id; | 4034 | commit_trans_id = jl->j_trans_id; |
| @@ -4196,7 +4194,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, | |||
| 4196 | lock_kernel(); | 4194 | lock_kernel(); |
| 4197 | } | 4195 | } |
| 4198 | BUG_ON(!list_empty(&jl->j_tail_bh_list)); | 4196 | BUG_ON(!list_empty(&jl->j_tail_bh_list)); |
| 4199 | up(&jl->j_commit_lock); | 4197 | mutex_unlock(&jl->j_commit_mutex); |
| 4200 | 4198 | ||
| 4201 | /* honor the flush wishes from the caller, simple commits can | 4199 | /* honor the flush wishes from the caller, simple commits can |
| 4202 | ** be done outside the journal lock, they are done below | 4200 | ** be done outside the journal lock, they are done below |
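The journal locking above moves from counting semaphores used as binary locks (sema_init(..., 1) with down()/up()) to struct mutex, which gives clearer sleeping-lock semantics and lockdep coverage. Note that down_trylock() returns 0 on success while mutex_trylock() returns 1 on success, which is why the BUG() check in flush_journal_list() keeps its meaning with the condition rewritten. A kernel-style sketch of the conversion, not compiled code; j_demo_mutex is an invented field name:

    #include <linux/mutex.h>

    struct demo_journal {
            struct mutex j_demo_mutex;       /* was: struct semaphore used as a binary lock */
    };

    static void demo_init(struct demo_journal *j)
    {
            mutex_init(&j->j_demo_mutex);    /* replaces sema_init(&..., 1) */
    }

    static void demo_locked_work(struct demo_journal *j)
    {
            mutex_lock(&j->j_demo_mutex);    /* replaces down() */
            /* ... critical section ... */
            mutex_unlock(&j->j_demo_mutex);  /* replaces up() */
    }

    static int demo_try(struct demo_journal *j)
    {
            /* mutex_trylock() returns 1 if it took the lock, 0 if contended;
             * down_trylock() had the opposite convention (0 on success). */
            return mutex_trylock(&j->j_demo_mutex);
    }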
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 1d40f2bd1970..d318c7e663fa 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c | |||
| @@ -22,11 +22,11 @@ | |||
| 22 | #include <linux/blkdev.h> | 22 | #include <linux/blkdev.h> |
| 23 | #include <linux/buffer_head.h> | 23 | #include <linux/buffer_head.h> |
| 24 | #include <linux/exportfs.h> | 24 | #include <linux/exportfs.h> |
| 25 | #include <linux/quotaops.h> | ||
| 25 | #include <linux/vfs.h> | 26 | #include <linux/vfs.h> |
| 26 | #include <linux/mnt_namespace.h> | 27 | #include <linux/mnt_namespace.h> |
| 27 | #include <linux/mount.h> | 28 | #include <linux/mount.h> |
| 28 | #include <linux/namei.h> | 29 | #include <linux/namei.h> |
| 29 | #include <linux/quotaops.h> | ||
| 30 | 30 | ||
| 31 | struct file_system_type reiserfs_fs_type; | 31 | struct file_system_type reiserfs_fs_type; |
| 32 | 32 | ||
| @@ -182,7 +182,7 @@ static int finish_unfinished(struct super_block *s) | |||
| 182 | int ret = reiserfs_quota_on_mount(s, i); | 182 | int ret = reiserfs_quota_on_mount(s, i); |
| 183 | if (ret < 0) | 183 | if (ret < 0) |
| 184 | reiserfs_warning(s, | 184 | reiserfs_warning(s, |
| 185 | "reiserfs: cannot turn on journalled quota: error %d", | 185 | "reiserfs: cannot turn on journaled quota: error %d", |
| 186 | ret); | 186 | ret); |
| 187 | } | 187 | } |
| 188 | } | 188 | } |
| @@ -520,7 +520,7 @@ static void reiserfs_destroy_inode(struct inode *inode) | |||
| 520 | kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode)); | 520 | kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode)); |
| 521 | } | 521 | } |
| 522 | 522 | ||
| 523 | static void init_once(struct kmem_cache * cachep, void *foo) | 523 | static void init_once(void *foo) |
| 524 | { | 524 | { |
| 525 | struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo; | 525 | struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo; |
| 526 | 526 | ||
| @@ -876,7 +876,9 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin | |||
| 876 | mount options were selected. */ | 876 | mount options were selected. */ |
| 877 | unsigned long *blocks, /* strtol-ed from NNN of resize=NNN */ | 877 | unsigned long *blocks, /* strtol-ed from NNN of resize=NNN */ |
| 878 | char **jdev_name, | 878 | char **jdev_name, |
| 879 | unsigned int *commit_max_age) | 879 | unsigned int *commit_max_age, |
| 880 | char **qf_names, | ||
| 881 | unsigned int *qfmt) | ||
| 880 | { | 882 | { |
| 881 | int c; | 883 | int c; |
| 882 | char *arg = NULL; | 884 | char *arg = NULL; |
| @@ -992,9 +994,11 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin | |||
| 992 | if (c == 'u' || c == 'g') { | 994 | if (c == 'u' || c == 'g') { |
| 993 | int qtype = c == 'u' ? USRQUOTA : GRPQUOTA; | 995 | int qtype = c == 'u' ? USRQUOTA : GRPQUOTA; |
| 994 | 996 | ||
| 995 | if (sb_any_quota_enabled(s)) { | 997 | if ((sb_any_quota_enabled(s) || |
| 998 | sb_any_quota_suspended(s)) && | ||
| 999 | (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) { | ||
| 996 | reiserfs_warning(s, | 1000 | reiserfs_warning(s, |
| 997 | "reiserfs_parse_options: cannot change journalled quota options when quota turned on."); | 1001 | "reiserfs_parse_options: cannot change journaled quota options when quota turned on."); |
| 998 | return 0; | 1002 | return 0; |
| 999 | } | 1003 | } |
| 1000 | if (*arg) { /* Some filename specified? */ | 1004 | if (*arg) { /* Some filename specified? */ |
| @@ -1011,46 +1015,54 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin | |||
| 1011 | "reiserfs_parse_options: quotafile must be on filesystem root."); | 1015 | "reiserfs_parse_options: quotafile must be on filesystem root."); |
| 1012 | return 0; | 1016 | return 0; |
| 1013 | } | 1017 | } |
| 1014 | REISERFS_SB(s)->s_qf_names[qtype] = | 1018 | qf_names[qtype] = |
| 1015 | kmalloc(strlen(arg) + 1, GFP_KERNEL); | 1019 | kmalloc(strlen(arg) + 1, GFP_KERNEL); |
| 1016 | if (!REISERFS_SB(s)->s_qf_names[qtype]) { | 1020 | if (!qf_names[qtype]) { |
| 1017 | reiserfs_warning(s, | 1021 | reiserfs_warning(s, |
| 1018 | "reiserfs_parse_options: not enough memory for storing quotafile name."); | 1022 | "reiserfs_parse_options: not enough memory for storing quotafile name."); |
| 1019 | return 0; | 1023 | return 0; |
| 1020 | } | 1024 | } |
| 1021 | strcpy(REISERFS_SB(s)->s_qf_names[qtype], arg); | 1025 | strcpy(qf_names[qtype], arg); |
| 1022 | *mount_options |= 1 << REISERFS_QUOTA; | 1026 | *mount_options |= 1 << REISERFS_QUOTA; |
| 1023 | } else { | 1027 | } else { |
| 1024 | kfree(REISERFS_SB(s)->s_qf_names[qtype]); | 1028 | if (qf_names[qtype] != |
| 1025 | REISERFS_SB(s)->s_qf_names[qtype] = NULL; | 1029 | REISERFS_SB(s)->s_qf_names[qtype]) |
| 1030 | kfree(qf_names[qtype]); | ||
| 1031 | qf_names[qtype] = NULL; | ||
| 1026 | } | 1032 | } |
| 1027 | } | 1033 | } |
| 1028 | if (c == 'f') { | 1034 | if (c == 'f') { |
| 1029 | if (!strcmp(arg, "vfsold")) | 1035 | if (!strcmp(arg, "vfsold")) |
| 1030 | REISERFS_SB(s)->s_jquota_fmt = QFMT_VFS_OLD; | 1036 | *qfmt = QFMT_VFS_OLD; |
| 1031 | else if (!strcmp(arg, "vfsv0")) | 1037 | else if (!strcmp(arg, "vfsv0")) |
| 1032 | REISERFS_SB(s)->s_jquota_fmt = QFMT_VFS_V0; | 1038 | *qfmt = QFMT_VFS_V0; |
| 1033 | else { | 1039 | else { |
| 1034 | reiserfs_warning(s, | 1040 | reiserfs_warning(s, |
| 1035 | "reiserfs_parse_options: unknown quota format specified."); | 1041 | "reiserfs_parse_options: unknown quota format specified."); |
| 1036 | return 0; | 1042 | return 0; |
| 1037 | } | 1043 | } |
| 1044 | if ((sb_any_quota_enabled(s) || | ||
| 1045 | sb_any_quota_suspended(s)) && | ||
| 1046 | *qfmt != REISERFS_SB(s)->s_jquota_fmt) { | ||
| 1047 | reiserfs_warning(s, | ||
| 1048 | "reiserfs_parse_options: cannot change journaled quota options when quota turned on."); | ||
| 1049 | return 0; | ||
| 1050 | } | ||
| 1038 | } | 1051 | } |
| 1039 | #else | 1052 | #else |
| 1040 | if (c == 'u' || c == 'g' || c == 'f') { | 1053 | if (c == 'u' || c == 'g' || c == 'f') { |
| 1041 | reiserfs_warning(s, | 1054 | reiserfs_warning(s, |
| 1042 | "reiserfs_parse_options: journalled quota options not supported."); | 1055 | "reiserfs_parse_options: journaled quota options not supported."); |
| 1043 | return 0; | 1056 | return 0; |
| 1044 | } | 1057 | } |
| 1045 | #endif | 1058 | #endif |
| 1046 | } | 1059 | } |
| 1047 | 1060 | ||
| 1048 | #ifdef CONFIG_QUOTA | 1061 | #ifdef CONFIG_QUOTA |
| 1049 | if (!REISERFS_SB(s)->s_jquota_fmt | 1062 | if (!REISERFS_SB(s)->s_jquota_fmt && !*qfmt |
| 1050 | && (REISERFS_SB(s)->s_qf_names[USRQUOTA] | 1063 | && (qf_names[USRQUOTA] || qf_names[GRPQUOTA])) { |
| 1051 | || REISERFS_SB(s)->s_qf_names[GRPQUOTA])) { | ||
| 1052 | reiserfs_warning(s, | 1064 | reiserfs_warning(s, |
| 1053 | "reiserfs_parse_options: journalled quota format not specified."); | 1065 | "reiserfs_parse_options: journaled quota format not specified."); |
| 1054 | return 0; | 1066 | return 0; |
| 1055 | } | 1067 | } |
| 1056 | /* This checking is not precise wrt the quota type but for our purposes it is sufficient */ | 1068 | /* This checking is not precise wrt the quota type but for our purposes it is sufficient */ |
| @@ -1130,6 +1142,21 @@ static void handle_attrs(struct super_block *s) | |||
| 1130 | } | 1142 | } |
| 1131 | } | 1143 | } |
| 1132 | 1144 | ||
| 1145 | #ifdef CONFIG_QUOTA | ||
| 1146 | static void handle_quota_files(struct super_block *s, char **qf_names, | ||
| 1147 | unsigned int *qfmt) | ||
| 1148 | { | ||
| 1149 | int i; | ||
| 1150 | |||
| 1151 | for (i = 0; i < MAXQUOTAS; i++) { | ||
| 1152 | if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i]) | ||
| 1153 | kfree(REISERFS_SB(s)->s_qf_names[i]); | ||
| 1154 | REISERFS_SB(s)->s_qf_names[i] = qf_names[i]; | ||
| 1155 | } | ||
| 1156 | REISERFS_SB(s)->s_jquota_fmt = *qfmt; | ||
| 1157 | } | ||
| 1158 | #endif | ||
| 1159 | |||
| 1133 | static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) | 1160 | static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) |
| 1134 | { | 1161 | { |
| 1135 | struct reiserfs_super_block *rs; | 1162 | struct reiserfs_super_block *rs; |
| @@ -1141,23 +1168,30 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) | |||
| 1141 | struct reiserfs_journal *journal = SB_JOURNAL(s); | 1168 | struct reiserfs_journal *journal = SB_JOURNAL(s); |
| 1142 | char *new_opts = kstrdup(arg, GFP_KERNEL); | 1169 | char *new_opts = kstrdup(arg, GFP_KERNEL); |
| 1143 | int err; | 1170 | int err; |
| 1171 | char *qf_names[MAXQUOTAS]; | ||
| 1172 | unsigned int qfmt = 0; | ||
| 1144 | #ifdef CONFIG_QUOTA | 1173 | #ifdef CONFIG_QUOTA |
| 1145 | int i; | 1174 | int i; |
| 1175 | |||
| 1176 | memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names)); | ||
| 1146 | #endif | 1177 | #endif |
| 1147 | 1178 | ||
| 1148 | rs = SB_DISK_SUPER_BLOCK(s); | 1179 | rs = SB_DISK_SUPER_BLOCK(s); |
| 1149 | 1180 | ||
| 1150 | if (!reiserfs_parse_options | 1181 | if (!reiserfs_parse_options |
| 1151 | (s, arg, &mount_options, &blocks, NULL, &commit_max_age)) { | 1182 | (s, arg, &mount_options, &blocks, NULL, &commit_max_age, |
| 1183 | qf_names, &qfmt)) { | ||
| 1152 | #ifdef CONFIG_QUOTA | 1184 | #ifdef CONFIG_QUOTA |
| 1153 | for (i = 0; i < MAXQUOTAS; i++) { | 1185 | for (i = 0; i < MAXQUOTAS; i++) |
| 1154 | kfree(REISERFS_SB(s)->s_qf_names[i]); | 1186 | if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i]) |
| 1155 | REISERFS_SB(s)->s_qf_names[i] = NULL; | 1187 | kfree(qf_names[i]); |
| 1156 | } | ||
| 1157 | #endif | 1188 | #endif |
| 1158 | err = -EINVAL; | 1189 | err = -EINVAL; |
| 1159 | goto out_err; | 1190 | goto out_err; |
| 1160 | } | 1191 | } |
| 1192 | #ifdef CONFIG_QUOTA | ||
| 1193 | handle_quota_files(s, qf_names, &qfmt); | ||
| 1194 | #endif | ||
| 1161 | 1195 | ||
| 1162 | handle_attrs(s); | 1196 | handle_attrs(s); |
| 1163 | 1197 | ||
| @@ -1570,6 +1604,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) | |||
| 1570 | char *jdev_name; | 1604 | char *jdev_name; |
| 1571 | struct reiserfs_sb_info *sbi; | 1605 | struct reiserfs_sb_info *sbi; |
| 1572 | int errval = -EINVAL; | 1606 | int errval = -EINVAL; |
| 1607 | char *qf_names[MAXQUOTAS] = {}; | ||
| 1608 | unsigned int qfmt = 0; | ||
| 1573 | 1609 | ||
| 1574 | save_mount_options(s, data); | 1610 | save_mount_options(s, data); |
| 1575 | 1611 | ||
| @@ -1597,9 +1633,12 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) | |||
| 1597 | jdev_name = NULL; | 1633 | jdev_name = NULL; |
| 1598 | if (reiserfs_parse_options | 1634 | if (reiserfs_parse_options |
| 1599 | (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name, | 1635 | (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name, |
| 1600 | &commit_max_age) == 0) { | 1636 | &commit_max_age, qf_names, &qfmt) == 0) { |
| 1601 | goto error; | 1637 | goto error; |
| 1602 | } | 1638 | } |
| 1639 | #ifdef CONFIG_QUOTA | ||
| 1640 | handle_quota_files(s, qf_names, &qfmt); | ||
| 1641 | #endif | ||
| 1603 | 1642 | ||
| 1604 | if (blocks) { | 1643 | if (blocks) { |
| 1605 | SWARN(silent, s, "jmacd-7: reiserfs_fill_super: resize option " | 1644 | SWARN(silent, s, "jmacd-7: reiserfs_fill_super: resize option " |
| @@ -1819,7 +1858,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) | |||
| 1819 | 1858 | ||
| 1820 | return (0); | 1859 | return (0); |
| 1821 | 1860 | ||
| 1822 | error: | 1861 | error: |
| 1823 | if (jinit_done) { /* kill the commit thread, free journal ram */ | 1862 | if (jinit_done) { /* kill the commit thread, free journal ram */ |
| 1824 | journal_release_error(NULL, s); | 1863 | journal_release_error(NULL, s); |
| 1825 | } | 1864 | } |
| @@ -1830,10 +1869,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) | |||
| 1830 | #ifdef CONFIG_QUOTA | 1869 | #ifdef CONFIG_QUOTA |
| 1831 | { | 1870 | { |
| 1832 | int j; | 1871 | int j; |
| 1833 | for (j = 0; j < MAXQUOTAS; j++) { | 1872 | for (j = 0; j < MAXQUOTAS; j++) |
| 1834 | kfree(sbi->s_qf_names[j]); | 1873 | kfree(qf_names[j]); |
| 1835 | sbi->s_qf_names[j] = NULL; | ||
| 1836 | } | ||
| 1837 | } | 1874 | } |
| 1838 | #endif | 1875 | #endif |
| 1839 | kfree(sbi); | 1876 | kfree(sbi); |
| @@ -1980,7 +2017,7 @@ static int reiserfs_release_dquot(struct dquot *dquot) | |||
| 1980 | 2017 | ||
| 1981 | static int reiserfs_mark_dquot_dirty(struct dquot *dquot) | 2018 | static int reiserfs_mark_dquot_dirty(struct dquot *dquot) |
| 1982 | { | 2019 | { |
| 1983 | /* Are we journalling quotas? */ | 2020 | /* Are we journaling quotas? */ |
| 1984 | if (REISERFS_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || | 2021 | if (REISERFS_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || |
| 1985 | REISERFS_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { | 2022 | REISERFS_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { |
| 1986 | dquot_mark_dquot_dirty(dquot); | 2023 | dquot_mark_dquot_dirty(dquot); |
| @@ -2026,6 +2063,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, | |||
| 2026 | int err; | 2063 | int err; |
| 2027 | struct nameidata nd; | 2064 | struct nameidata nd; |
| 2028 | struct inode *inode; | 2065 | struct inode *inode; |
| 2066 | struct reiserfs_transaction_handle th; | ||
| 2029 | 2067 | ||
| 2030 | if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA))) | 2068 | if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA))) |
| 2031 | return -EINVAL; | 2069 | return -EINVAL; |
| @@ -2037,8 +2075,8 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, | |||
| 2037 | return err; | 2075 | return err; |
| 2038 | /* Quotafile not on the same filesystem? */ | 2076 | /* Quotafile not on the same filesystem? */ |
| 2039 | if (nd.path.mnt->mnt_sb != sb) { | 2077 | if (nd.path.mnt->mnt_sb != sb) { |
| 2040 | path_put(&nd.path); | 2078 | err = -EXDEV; |
| 2041 | return -EXDEV; | 2079 | goto out; |
| 2042 | } | 2080 | } |
| 2043 | inode = nd.path.dentry->d_inode; | 2081 | inode = nd.path.dentry->d_inode; |
| 2044 | /* We must not pack tails for quota files on reiserfs for quota IO to work */ | 2082 | /* We must not pack tails for quota files on reiserfs for quota IO to work */ |
| @@ -2048,24 +2086,37 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, | |||
| 2048 | reiserfs_warning(sb, | 2086 | reiserfs_warning(sb, |
| 2049 | "reiserfs: Unpacking tail of quota file failed" | 2087 | "reiserfs: Unpacking tail of quota file failed" |
| 2050 | " (%d). Cannot turn on quotas.", err); | 2088 | " (%d). Cannot turn on quotas.", err); |
| 2051 | path_put(&nd.path); | 2089 | err = -EINVAL; |
| 2052 | return -EINVAL; | 2090 | goto out; |
| 2053 | } | 2091 | } |
| 2054 | mark_inode_dirty(inode); | 2092 | mark_inode_dirty(inode); |
| 2055 | } | 2093 | } |
| 2056 | /* Not journalling quota? No more tests needed... */ | 2094 | /* Journaling quota? */ |
| 2057 | if (!REISERFS_SB(sb)->s_qf_names[USRQUOTA] && | 2095 | if (REISERFS_SB(sb)->s_qf_names[type]) { |
| 2058 | !REISERFS_SB(sb)->s_qf_names[GRPQUOTA]) { | 2096 | /* Quotafile not of fs root? */ |
| 2059 | path_put(&nd.path); | 2097 | if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) |
| 2060 | return vfs_quota_on(sb, type, format_id, path, 0); | 2098 | reiserfs_warning(sb, |
| 2061 | } | ||
| 2062 | /* Quotafile not of fs root? */ | ||
| 2063 | if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) | ||
| 2064 | reiserfs_warning(sb, | ||
| 2065 | "reiserfs: Quota file not on filesystem root. " | 2099 | "reiserfs: Quota file not on filesystem root. " |
| 2066 | "Journalled quota will not work."); | 2100 | "Journalled quota will not work."); |
| 2101 | } | ||
| 2102 | |||
| 2103 | /* | ||
| 2104 | * When we journal data on quota file, we have to flush journal to see | ||
| 2105 | * all updates to the file when we bypass pagecache... | ||
| 2106 | */ | ||
| 2107 | if (reiserfs_file_data_log(inode)) { | ||
| 2108 | /* Just start temporary transaction and finish it */ | ||
| 2109 | err = journal_begin(&th, sb, 1); | ||
| 2110 | if (err) | ||
| 2111 | goto out; | ||
| 2112 | err = journal_end_sync(&th, sb, 1); | ||
| 2113 | if (err) | ||
| 2114 | goto out; | ||
| 2115 | } | ||
| 2116 | err = vfs_quota_on_path(sb, type, format_id, &nd.path); | ||
| 2117 | out: | ||
| 2067 | path_put(&nd.path); | 2118 | path_put(&nd.path); |
| 2068 | return vfs_quota_on(sb, type, format_id, path, 0); | 2119 | return err; |
| 2069 | } | 2120 | } |
| 2070 | 2121 | ||
| 2071 | /* Read data from quotafile - avoid pagecache and such because we cannot afford | 2122 | /* Read data from quotafile - avoid pagecache and such because we cannot afford |
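reiserfs_parse_options() now writes the journaled-quota file names and format into caller-supplied temporaries, and handle_quota_files() installs them only after the whole option string has parsed cleanly, so a failed remount no longer frees or overwrites quota state that is still in use. A small stand-alone sketch of that parse-into-temporaries shape; the option handling here is invented and only mirrors the structure of the patch:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct settings { char *qf_name; int qfmt; };

    /* Parse into *tmp only; never touch the live settings on failure. */
    static int parse(const char *arg, struct settings *tmp)
    {
            if (strncmp(arg, "usrjquota=", 10) != 0)
                    return -1;                       /* unknown option: caller keeps old state */
            free(tmp->qf_name);
            tmp->qf_name = strdup(arg + 10);
            return tmp->qf_name ? 0 : -1;
    }

    static void commit(struct settings *live, struct settings *tmp)
    {
            if (tmp->qf_name != live->qf_name)       /* same check the patch does before kfree() */
                    free(live->qf_name);
            *live = *tmp;                            /* install only after full success */
    }

    int main(void)
    {
            struct settings live = { NULL, 0 };
            struct settings tmp = live;              /* start from the current state */

            if (parse("usrjquota=aquota.user", &tmp) == 0)
                    commit(&live, &tmp);
            printf("qf_name=%s\n", live.qf_name ? live.qf_name : "(none)");
            free(live.qf_name);
            return 0;
    }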
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index d7c4935c1034..bb3cb5b7cdb2 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c | |||
| @@ -1250,7 +1250,7 @@ static int reiserfs_check_acl(struct inode *inode, int mask) | |||
| 1250 | return error; | 1250 | return error; |
| 1251 | } | 1251 | } |
| 1252 | 1252 | ||
| 1253 | int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd) | 1253 | int reiserfs_permission(struct inode *inode, int mask) |
| 1254 | { | 1254 | { |
| 1255 | /* | 1255 | /* |
| 1256 | * We don't do permission checks on the internal objects. | 1256 | * We don't do permission checks on the internal objects. |
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 5e90a95ad60b..056008db1377 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c | |||
| @@ -6,8 +6,6 @@ | |||
| 6 | #include <linux/reiserfs_xattr.h> | 6 | #include <linux/reiserfs_xattr.h> |
| 7 | #include <asm/uaccess.h> | 7 | #include <asm/uaccess.h> |
| 8 | 8 | ||
| 9 | #define XATTR_SECURITY_PREFIX "security." | ||
| 10 | |||
| 11 | static int | 9 | static int |
| 12 | security_get(struct inode *inode, const char *name, void *buffer, size_t size) | 10 | security_get(struct inode *inode, const char *name, void *buffer, size_t size) |
| 13 | { | 11 | { |
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index 024a938ca60f..60abe2bb1f98 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c | |||
| @@ -7,8 +7,6 @@ | |||
| 7 | #include <linux/reiserfs_xattr.h> | 7 | #include <linux/reiserfs_xattr.h> |
| 8 | #include <asm/uaccess.h> | 8 | #include <asm/uaccess.h> |
| 9 | 9 | ||
| 10 | #define XATTR_TRUSTED_PREFIX "trusted." | ||
| 11 | |||
| 12 | static int | 10 | static int |
| 13 | trusted_get(struct inode *inode, const char *name, void *buffer, size_t size) | 11 | trusted_get(struct inode *inode, const char *name, void *buffer, size_t size) |
| 14 | { | 12 | { |
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index 073f39364b11..1384efcb938e 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c | |||
| @@ -10,8 +10,6 @@ | |||
| 10 | # include <linux/reiserfs_acl.h> | 10 | # include <linux/reiserfs_acl.h> |
| 11 | #endif | 11 | #endif |
| 12 | 12 | ||
| 13 | #define XATTR_USER_PREFIX "user." | ||
| 14 | |||
| 15 | static int | 13 | static int |
| 16 | user_get(struct inode *inode, const char *name, void *buffer, size_t size) | 14 | user_get(struct inode *inode, const char *name, void *buffer, size_t size) |
| 17 | { | 15 | { |
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c index 3f13d491c7c7..60d2f822e87b 100644 --- a/fs/romfs/inode.c +++ b/fs/romfs/inode.c | |||
| @@ -418,7 +418,8 @@ static int | |||
| 418 | romfs_readpage(struct file *file, struct page * page) | 418 | romfs_readpage(struct file *file, struct page * page) |
| 419 | { | 419 | { |
| 420 | struct inode *inode = page->mapping->host; | 420 | struct inode *inode = page->mapping->host; |
| 421 | loff_t offset, avail, readlen; | 421 | loff_t offset, size; |
| 422 | unsigned long filled; | ||
| 422 | void *buf; | 423 | void *buf; |
| 423 | int result = -EIO; | 424 | int result = -EIO; |
| 424 | 425 | ||
| @@ -430,21 +431,29 @@ romfs_readpage(struct file *file, struct page * page) | |||
| 430 | 431 | ||
| 431 | /* 32 bit warning -- but not for us :) */ | 432 | /* 32 bit warning -- but not for us :) */ |
| 432 | offset = page_offset(page); | 433 | offset = page_offset(page); |
| 433 | if (offset < i_size_read(inode)) { | 434 | size = i_size_read(inode); |
| 434 | avail = inode->i_size-offset; | 435 | filled = 0; |
| 435 | readlen = min_t(unsigned long, avail, PAGE_SIZE); | 436 | result = 0; |
| 436 | if (romfs_copyfrom(inode, buf, ROMFS_I(inode)->i_dataoffset+offset, readlen) == readlen) { | 437 | if (offset < size) { |
| 437 | if (readlen < PAGE_SIZE) { | 438 | unsigned long readlen; |
| 438 | memset(buf + readlen,0,PAGE_SIZE-readlen); | 439 | |
| 439 | } | 440 | size -= offset; |
| 440 | SetPageUptodate(page); | 441 | readlen = size > PAGE_SIZE ? PAGE_SIZE : size; |
| 441 | result = 0; | 442 | |
| 443 | filled = romfs_copyfrom(inode, buf, ROMFS_I(inode)->i_dataoffset+offset, readlen); | ||
| 444 | |||
| 445 | if (filled != readlen) { | ||
| 446 | SetPageError(page); | ||
| 447 | filled = 0; | ||
| 448 | result = -EIO; | ||
| 442 | } | 449 | } |
| 443 | } | 450 | } |
| 444 | if (result) { | 451 | |
| 445 | memset(buf, 0, PAGE_SIZE); | 452 | if (filled < PAGE_SIZE) |
| 446 | SetPageError(page); | 453 | memset(buf + filled, 0, PAGE_SIZE-filled); |
| 447 | } | 454 | |
| 455 | if (!result) | ||
| 456 | SetPageUptodate(page); | ||
| 448 | flush_dcache_page(page); | 457 | flush_dcache_page(page); |
| 449 | 458 | ||
| 450 | unlock_page(page); | 459 | unlock_page(page); |
| @@ -577,7 +586,7 @@ static void romfs_destroy_inode(struct inode *inode) | |||
| 577 | kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); | 586 | kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); |
| 578 | } | 587 | } |
| 579 | 588 | ||
| 580 | static void init_once(struct kmem_cache *cachep, void *foo) | 589 | static void init_once(void *foo) |
| 581 | { | 590 | { |
| 582 | struct romfs_inode_info *ei = foo; | 591 | struct romfs_inode_info *ei = foo; |
| 583 | 592 | ||
diff --git a/fs/seq_file.c b/fs/seq_file.c index 3f54dbd6c49b..bd20f7f5a933 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c | |||
| @@ -108,9 +108,9 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) | |||
| 108 | goto Done; | 108 | goto Done; |
| 109 | } | 109 | } |
| 110 | /* we need at least one record in buffer */ | 110 | /* we need at least one record in buffer */ |
| 111 | pos = m->index; | ||
| 112 | p = m->op->start(m, &pos); | ||
| 111 | while (1) { | 113 | while (1) { |
| 112 | pos = m->index; | ||
| 113 | p = m->op->start(m, &pos); | ||
| 114 | err = PTR_ERR(p); | 114 | err = PTR_ERR(p); |
| 115 | if (!p || IS_ERR(p)) | 115 | if (!p || IS_ERR(p)) |
| 116 | break; | 116 | break; |
| @@ -119,6 +119,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) | |||
| 119 | break; | 119 | break; |
| 120 | if (unlikely(err)) | 120 | if (unlikely(err)) |
| 121 | m->count = 0; | 121 | m->count = 0; |
| 122 | if (unlikely(!m->count)) { | ||
| 123 | p = m->op->next(m, p, &pos); | ||
| 124 | m->index = pos; | ||
| 125 | continue; | ||
| 126 | } | ||
| 122 | if (m->count < m->size) | 127 | if (m->count < m->size) |
| 123 | goto Fill; | 128 | goto Fill; |
| 124 | m->op->stop(m, p); | 129 | m->op->stop(m, p); |
| @@ -128,6 +133,8 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) | |||
| 128 | goto Enomem; | 133 | goto Enomem; |
| 129 | m->count = 0; | 134 | m->count = 0; |
| 130 | m->version = 0; | 135 | m->version = 0; |
| 136 | pos = m->index; | ||
| 137 | p = m->op->start(m, &pos); | ||
| 131 | } | 138 | } |
| 132 | m->op->stop(m, p); | 139 | m->op->stop(m, p); |
| 133 | m->count = 0; | 140 | m->count = 0; |
| @@ -443,6 +450,20 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc) | |||
| 443 | return -1; | 450 | return -1; |
| 444 | } | 451 | } |
| 445 | 452 | ||
| 453 | int seq_bitmap(struct seq_file *m, unsigned long *bits, unsigned int nr_bits) | ||
| 454 | { | ||
| 455 | size_t len = bitmap_scnprintf_len(nr_bits); | ||
| 456 | |||
| 457 | if (m->count + len < m->size) { | ||
| 458 | bitmap_scnprintf(m->buf + m->count, m->size - m->count, | ||
| 459 | bits, nr_bits); | ||
| 460 | m->count += len; | ||
| 461 | return 0; | ||
| 462 | } | ||
| 463 | m->count = m->size; | ||
| 464 | return -1; | ||
| 465 | } | ||
| 466 | |||
| 446 | static void *single_start(struct seq_file *p, loff_t *pos) | 467 | static void *single_start(struct seq_file *p, loff_t *pos) |
| 447 | { | 468 | { |
| 448 | return NULL + (*pos == 0); | 469 | return NULL + (*pos == 0); |
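The new seq_bitmap() helper formats a bitmap into the seq_file buffer with bitmap_scnprintf() and fails the usual seq_file way (count set to size, return -1) when the record does not fit, so the core retries with a larger buffer. A hedged usage sketch, not compiled code; demo_bits and demo_show() are invented, only the seq_bitmap() signature comes from the hunk above:

    #include <linux/seq_file.h>
    #include <linux/bitmap.h>

    #define DEMO_NR_BITS 64
    static DECLARE_BITMAP(demo_bits, DEMO_NR_BITS);    /* hypothetical bitmap to export */

    static int demo_show(struct seq_file *m, void *v)
    {
            if (seq_bitmap(m, demo_bits, DEMO_NR_BITS) < 0)
                    return -1;                          /* buffer too small; seq_read() retries */
            seq_putc(m, '\n');
            return 0;
    }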
diff --git a/fs/smbfs/cache.c b/fs/smbfs/cache.c index 8182f0542a21..8c177eb7e344 100644 --- a/fs/smbfs/cache.c +++ b/fs/smbfs/cache.c | |||
| @@ -13,7 +13,6 @@ | |||
| 13 | #include <linux/errno.h> | 13 | #include <linux/errno.h> |
| 14 | #include <linux/kernel.h> | 14 | #include <linux/kernel.h> |
| 15 | #include <linux/mm.h> | 15 | #include <linux/mm.h> |
| 16 | #include <linux/dirent.h> | ||
| 17 | #include <linux/smb_fs.h> | 16 | #include <linux/smb_fs.h> |
| 18 | #include <linux/pagemap.h> | 17 | #include <linux/pagemap.h> |
| 19 | #include <linux/net.h> | 18 | #include <linux/net.h> |
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c index 2294783320cb..e4f8d51a5553 100644 --- a/fs/smbfs/file.c +++ b/fs/smbfs/file.c | |||
| @@ -408,7 +408,7 @@ smb_file_release(struct inode *inode, struct file * file) | |||
| 408 | * privileges, so we need our own check for this. | 408 | * privileges, so we need our own check for this. |
| 409 | */ | 409 | */ |
| 410 | static int | 410 | static int |
| 411 | smb_file_permission(struct inode *inode, int mask, struct nameidata *nd) | 411 | smb_file_permission(struct inode *inode, int mask) |
| 412 | { | 412 | { |
| 413 | int mode = inode->i_mode; | 413 | int mode = inode->i_mode; |
| 414 | int error = 0; | 414 | int error = 0; |
| @@ -417,7 +417,7 @@ smb_file_permission(struct inode *inode, int mask, struct nameidata *nd) | |||
| 417 | 417 | ||
| 418 | /* Look at user permissions */ | 418 | /* Look at user permissions */ |
| 419 | mode >>= 6; | 419 | mode >>= 6; |
| 420 | if ((mode & 7 & mask) != mask) | 420 | if (mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) |
| 421 | error = -EACCES; | 421 | error = -EACCES; |
| 422 | return error; | 422 | return error; |
| 423 | } | 423 | } |
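The smb_file_permission() change stops requiring every bit of the mask to be present in the low rwx bits and instead asks a narrower question: is any of MAY_READ, MAY_WRITE or MAY_EXEC requested that the owner bits do not grant? Extra mask flags that are not permission bits no longer cause a spurious -EACCES. A compilable sketch of the two tests side by side; the MAY_* values are restated here as an assumption about the kernel's rwx bit layout, and MAY_EXTRA is a made-up stand-in for a non-rwx flag:

    #include <stdio.h>

    #define MAY_EXEC  0x1
    #define MAY_WRITE 0x2
    #define MAY_READ  0x4
    #define MAY_EXTRA 0x20  /* stand-in for a non-rwx flag in the mask */

    int main(void)
    {
            int mode = 05;                           /* r-x owner bits, already shifted down */
            int mask = MAY_READ | MAY_EXTRA;

            int old_denied = (mode & 7 & mask) != mask;                        /* trips on MAY_EXTRA */
            int new_denied = mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC); /* only rwx bits matter */

            printf("old=%s new=%s\n",
                   old_denied ? "EACCES" : "ok",
                   new_denied ? "EACCES" : "ok");
            return 0;
    }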
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index 376ef3ee6ed7..3528f40ffb0f 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c | |||
| @@ -67,7 +67,7 @@ static void smb_destroy_inode(struct inode *inode) | |||
| 67 | kmem_cache_free(smb_inode_cachep, SMB_I(inode)); | 67 | kmem_cache_free(smb_inode_cachep, SMB_I(inode)); |
| 68 | } | 68 | } |
| 69 | 69 | ||
| 70 | static void init_once(struct kmem_cache *cachep, void *foo) | 70 | static void init_once(void *foo) |
| 71 | { | 71 | { |
| 72 | struct smb_inode_info *ei = (struct smb_inode_info *) foo; | 72 | struct smb_inode_info *ei = (struct smb_inode_info *) foo; |
| 73 | 73 | ||
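Note on the init_once() prototype change: slab constructors in this kernel generation receive only the object pointer, so cache setup typically looks like the sketch below. This is a generic, hedged example with invented names, not code from the patch:

    #include <linux/slab.h>
    #include <linux/errno.h>

    struct example_obj {
            int state;
    };

    static struct kmem_cache *example_cachep;

    /* called once per new slab object, not on every allocation */
    static void example_init_once(void *foo)
    {
            struct example_obj *obj = foo;

            obj->state = 0;
    }

    static int example_cache_create(void)
    {
            example_cachep = kmem_cache_create("example_cache",
                                               sizeof(struct example_obj), 0,
                                               SLAB_HWCACHE_ALIGN,
                                               example_init_once);
            return example_cachep ? 0 : -ENOMEM;
    }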
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c index d517a27b7f4b..ee536e8a649a 100644 --- a/fs/smbfs/proc.c +++ b/fs/smbfs/proc.c | |||
| @@ -16,7 +16,6 @@ | |||
| 16 | #include <linux/stat.h> | 16 | #include <linux/stat.h> |
| 17 | #include <linux/fcntl.h> | 17 | #include <linux/fcntl.h> |
| 18 | #include <linux/dcache.h> | 18 | #include <linux/dcache.h> |
| 19 | #include <linux/dirent.h> | ||
| 20 | #include <linux/nls.h> | 19 | #include <linux/nls.h> |
| 21 | #include <linux/smp_lock.h> | 20 | #include <linux/smp_lock.h> |
| 22 | #include <linux/net.h> | 21 | #include <linux/net.h> |
diff --git a/fs/splice.c b/fs/splice.c index 399442179d89..1bbc6f4bb09c 100644 --- a/fs/splice.c +++ b/fs/splice.c | |||
| @@ -371,7 +371,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, | |||
| 371 | * for an in-flight io page | 371 | * for an in-flight io page |
| 372 | */ | 372 | */ |
| 373 | if (flags & SPLICE_F_NONBLOCK) { | 373 | if (flags & SPLICE_F_NONBLOCK) { |
| 374 | if (TestSetPageLocked(page)) { | 374 | if (!trylock_page(page)) { |
| 375 | error = -EAGAIN; | 375 | error = -EAGAIN; |
| 376 | break; | 376 | break; |
| 377 | } | 377 | } |
| @@ -772,7 +772,7 @@ generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out, | |||
| 772 | ssize_t ret; | 772 | ssize_t ret; |
| 773 | int err; | 773 | int err; |
| 774 | 774 | ||
| 775 | err = remove_suid(out->f_path.dentry); | 775 | err = file_remove_suid(out); |
| 776 | if (unlikely(err)) | 776 | if (unlikely(err)) |
| 777 | return err; | 777 | return err; |
| 778 | 778 | ||
| @@ -830,7 +830,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, | |||
| 830 | ssize_t ret; | 830 | ssize_t ret; |
| 831 | 831 | ||
| 832 | inode_double_lock(inode, pipe->inode); | 832 | inode_double_lock(inode, pipe->inode); |
| 833 | ret = remove_suid(out->f_path.dentry); | 833 | ret = file_remove_suid(out); |
| 834 | if (likely(!ret)) | 834 | if (likely(!ret)) |
| 835 | ret = __splice_from_pipe(pipe, &sd, pipe_to_file); | 835 | ret = __splice_from_pipe(pipe, &sd, pipe_to_file); |
| 836 | inode_double_unlock(inode, pipe->inode); | 836 | inode_double_unlock(inode, pipe->inode); |
| @@ -1161,36 +1161,6 @@ static long do_splice(struct file *in, loff_t __user *off_in, | |||
| 1161 | } | 1161 | } |
| 1162 | 1162 | ||
| 1163 | /* | 1163 | /* |
| 1164 | * Do a copy-from-user while holding the mmap_semaphore for reading, in a | ||
| 1165 | * manner safe from deadlocking with simultaneous mmap() (grabbing mmap_sem | ||
| 1166 | * for writing) and page faulting on the user memory pointed to by src. | ||
| 1167 | * This assumes that we will very rarely hit the partial != 0 path, or this | ||
| 1168 | * will not be a win. | ||
| 1169 | */ | ||
| 1170 | static int copy_from_user_mmap_sem(void *dst, const void __user *src, size_t n) | ||
| 1171 | { | ||
| 1172 | int partial; | ||
| 1173 | |||
| 1174 | if (!access_ok(VERIFY_READ, src, n)) | ||
| 1175 | return -EFAULT; | ||
| 1176 | |||
| 1177 | pagefault_disable(); | ||
| 1178 | partial = __copy_from_user_inatomic(dst, src, n); | ||
| 1179 | pagefault_enable(); | ||
| 1180 | |||
| 1181 | /* | ||
| 1182 | * Didn't copy everything, drop the mmap_sem and do a faulting copy | ||
| 1183 | */ | ||
| 1184 | if (unlikely(partial)) { | ||
| 1185 | up_read(¤t->mm->mmap_sem); | ||
| 1186 | partial = copy_from_user(dst, src, n); | ||
| 1187 | down_read(¤t->mm->mmap_sem); | ||
| 1188 | } | ||
| 1189 | |||
| 1190 | return partial; | ||
| 1191 | } | ||
| 1192 | |||
| 1193 | /* | ||
| 1194 | * Map an iov into an array of pages and offset/length tuples. With the | 1164 | * Map an iov into an array of pages and offset/length tuples. With the |
| 1195 | * partial_page structure, we can map several non-contiguous ranges into | 1165 | * partial_page structure, we can map several non-contiguous ranges into |
| 1196 | * our one pages[] map instead of splitting that operation into pieces. | 1166 | * our one pages[] map instead of splitting that operation into pieces. |
| @@ -1203,8 +1173,6 @@ static int get_iovec_page_array(const struct iovec __user *iov, | |||
| 1203 | { | 1173 | { |
| 1204 | int buffers = 0, error = 0; | 1174 | int buffers = 0, error = 0; |
| 1205 | 1175 | ||
| 1206 | down_read(¤t->mm->mmap_sem); | ||
| 1207 | |||
| 1208 | while (nr_vecs) { | 1176 | while (nr_vecs) { |
| 1209 | unsigned long off, npages; | 1177 | unsigned long off, npages; |
| 1210 | struct iovec entry; | 1178 | struct iovec entry; |
| @@ -1213,7 +1181,7 @@ static int get_iovec_page_array(const struct iovec __user *iov, | |||
| 1213 | int i; | 1181 | int i; |
| 1214 | 1182 | ||
| 1215 | error = -EFAULT; | 1183 | error = -EFAULT; |
| 1216 | if (copy_from_user_mmap_sem(&entry, iov, sizeof(entry))) | 1184 | if (copy_from_user(&entry, iov, sizeof(entry))) |
| 1217 | break; | 1185 | break; |
| 1218 | 1186 | ||
| 1219 | base = entry.iov_base; | 1187 | base = entry.iov_base; |
| @@ -1247,9 +1215,8 @@ static int get_iovec_page_array(const struct iovec __user *iov, | |||
| 1247 | if (npages > PIPE_BUFFERS - buffers) | 1215 | if (npages > PIPE_BUFFERS - buffers) |
| 1248 | npages = PIPE_BUFFERS - buffers; | 1216 | npages = PIPE_BUFFERS - buffers; |
| 1249 | 1217 | ||
| 1250 | error = get_user_pages(current, current->mm, | 1218 | error = get_user_pages_fast((unsigned long)base, npages, |
| 1251 | (unsigned long) base, npages, 0, 0, | 1219 | 0, &pages[buffers]); |
| 1252 | &pages[buffers], NULL); | ||
| 1253 | 1220 | ||
| 1254 | if (unlikely(error <= 0)) | 1221 | if (unlikely(error <= 0)) |
| 1255 | break; | 1222 | break; |
| @@ -1288,8 +1255,6 @@ static int get_iovec_page_array(const struct iovec __user *iov, | |||
| 1288 | iov++; | 1255 | iov++; |
| 1289 | } | 1256 | } |
| 1290 | 1257 | ||
| 1291 | up_read(¤t->mm->mmap_sem); | ||
| 1292 | |||
| 1293 | if (buffers) | 1258 | if (buffers) |
| 1294 | return buffers; | 1259 | return buffers; |
| 1295 | 1260 | ||
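Note: get_iovec_page_array() now relies on get_user_pages_fast(), which pins user pages without the caller holding mmap_sem, so the copy_from_user_mmap_sem() helper removed above is no longer needed. A minimal sketch of the pinning pattern, assuming the 2.6.27 prototype int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); the helper name is invented:

    #include <linux/mm.h>

    /*
     * Pin up to @npages read-only pages backing @uaddr.  Returns the number
     * of pages actually pinned (which may be fewer) or a negative error.
     */
    static int pin_user_buffer(unsigned long uaddr, int npages,
                               struct page **pages)
    {
            return get_user_pages_fast(uaddr, npages, 0, pages);
    }

Each pinned page must later be dropped with page_cache_release(), as the surrounding splice code already does.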
| @@ -57,13 +57,13 @@ EXPORT_SYMBOL(vfs_getattr); | |||
| 57 | 57 | ||
| 58 | int vfs_stat_fd(int dfd, char __user *name, struct kstat *stat) | 58 | int vfs_stat_fd(int dfd, char __user *name, struct kstat *stat) |
| 59 | { | 59 | { |
| 60 | struct nameidata nd; | 60 | struct path path; |
| 61 | int error; | 61 | int error; |
| 62 | 62 | ||
| 63 | error = __user_walk_fd(dfd, name, LOOKUP_FOLLOW, &nd); | 63 | error = user_path_at(dfd, name, LOOKUP_FOLLOW, &path); |
| 64 | if (!error) { | 64 | if (!error) { |
| 65 | error = vfs_getattr(nd.path.mnt, nd.path.dentry, stat); | 65 | error = vfs_getattr(path.mnt, path.dentry, stat); |
| 66 | path_put(&nd.path); | 66 | path_put(&path); |
| 67 | } | 67 | } |
| 68 | return error; | 68 | return error; |
| 69 | } | 69 | } |
| @@ -77,13 +77,13 @@ EXPORT_SYMBOL(vfs_stat); | |||
| 77 | 77 | ||
| 78 | int vfs_lstat_fd(int dfd, char __user *name, struct kstat *stat) | 78 | int vfs_lstat_fd(int dfd, char __user *name, struct kstat *stat) |
| 79 | { | 79 | { |
| 80 | struct nameidata nd; | 80 | struct path path; |
| 81 | int error; | 81 | int error; |
| 82 | 82 | ||
| 83 | error = __user_walk_fd(dfd, name, 0, &nd); | 83 | error = user_path_at(dfd, name, 0, &path); |
| 84 | if (!error) { | 84 | if (!error) { |
| 85 | error = vfs_getattr(nd.path.mnt, nd.path.dentry, stat); | 85 | error = vfs_getattr(path.mnt, path.dentry, stat); |
| 86 | path_put(&nd.path); | 86 | path_put(&path); |
| 87 | } | 87 | } |
| 88 | return error; | 88 | return error; |
| 89 | } | 89 | } |
| @@ -291,29 +291,29 @@ asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf) | |||
| 291 | return error; | 291 | return error; |
| 292 | } | 292 | } |
| 293 | 293 | ||
| 294 | asmlinkage long sys_readlinkat(int dfd, const char __user *path, | 294 | asmlinkage long sys_readlinkat(int dfd, const char __user *pathname, |
| 295 | char __user *buf, int bufsiz) | 295 | char __user *buf, int bufsiz) |
| 296 | { | 296 | { |
| 297 | struct nameidata nd; | 297 | struct path path; |
| 298 | int error; | 298 | int error; |
| 299 | 299 | ||
| 300 | if (bufsiz <= 0) | 300 | if (bufsiz <= 0) |
| 301 | return -EINVAL; | 301 | return -EINVAL; |
| 302 | 302 | ||
| 303 | error = __user_walk_fd(dfd, path, 0, &nd); | 303 | error = user_path_at(dfd, pathname, 0, &path); |
| 304 | if (!error) { | 304 | if (!error) { |
| 305 | struct inode *inode = nd.path.dentry->d_inode; | 305 | struct inode *inode = path.dentry->d_inode; |
| 306 | 306 | ||
| 307 | error = -EINVAL; | 307 | error = -EINVAL; |
| 308 | if (inode->i_op && inode->i_op->readlink) { | 308 | if (inode->i_op && inode->i_op->readlink) { |
| 309 | error = security_inode_readlink(nd.path.dentry); | 309 | error = security_inode_readlink(path.dentry); |
| 310 | if (!error) { | 310 | if (!error) { |
| 311 | touch_atime(nd.path.mnt, nd.path.dentry); | 311 | touch_atime(path.mnt, path.dentry); |
| 312 | error = inode->i_op->readlink(nd.path.dentry, | 312 | error = inode->i_op->readlink(path.dentry, |
| 313 | buf, bufsiz); | 313 | buf, bufsiz); |
| 314 | } | 314 | } |
| 315 | } | 315 | } |
| 316 | path_put(&nd.path); | 316 | path_put(&path); |
| 317 | } | 317 | } |
| 318 | return error; | 318 | return error; |
| 319 | } | 319 | } |
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index c1a7efb310bf..aedaeba82ae5 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c | |||
| @@ -459,11 +459,8 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) | |||
| 459 | int ret; | 459 | int ret; |
| 460 | 460 | ||
| 461 | ret = __sysfs_add_one(acxt, sd); | 461 | ret = __sysfs_add_one(acxt, sd); |
| 462 | if (ret == -EEXIST) { | 462 | WARN(ret == -EEXIST, KERN_WARNING "sysfs: duplicate filename '%s' " |
| 463 | printk(KERN_WARNING "sysfs: duplicate filename '%s' " | ||
| 464 | "can not be created\n", sd->s_name); | 463 | "can not be created\n", sd->s_name); |
| 465 | WARN_ON(1); | ||
| 466 | } | ||
| 467 | return ret; | 464 | return ret; |
| 468 | } | 465 | } |
| 469 | 466 | ||
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 3f07893ff896..c9e4e5091da1 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c | |||
| @@ -337,9 +337,8 @@ static int sysfs_open_file(struct inode *inode, struct file *file) | |||
| 337 | if (kobj->ktype && kobj->ktype->sysfs_ops) | 337 | if (kobj->ktype && kobj->ktype->sysfs_ops) |
| 338 | ops = kobj->ktype->sysfs_ops; | 338 | ops = kobj->ktype->sysfs_ops; |
| 339 | else { | 339 | else { |
| 340 | printk(KERN_ERR "missing sysfs attribute operations for " | 340 | WARN(1, KERN_ERR "missing sysfs attribute operations for " |
| 341 | "kobject: %s\n", kobject_name(kobj)); | 341 | "kobject: %s\n", kobject_name(kobj)); |
| 342 | WARN_ON(1); | ||
| 343 | goto err_out; | 342 | goto err_out; |
| 344 | } | 343 | } |
| 345 | 344 | ||
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index eeba38417b1d..fe611949a7f7 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c | |||
| @@ -134,9 +134,8 @@ void sysfs_remove_group(struct kobject * kobj, | |||
| 134 | if (grp->name) { | 134 | if (grp->name) { |
| 135 | sd = sysfs_get_dirent(dir_sd, grp->name); | 135 | sd = sysfs_get_dirent(dir_sd, grp->name); |
| 136 | if (!sd) { | 136 | if (!sd) { |
| 137 | printk(KERN_WARNING "sysfs group %p not found for " | 137 | WARN(!sd, KERN_WARNING "sysfs group %p not found for " |
| 138 | "kobject '%s'\n", grp, kobject_name(kobj)); | 138 | "kobject '%s'\n", grp, kobject_name(kobj)); |
| 139 | WARN_ON(!sd); | ||
| 140 | return; | 139 | return; |
| 141 | } | 140 | } |
| 142 | } else | 141 | } else |
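Note: the three sysfs hunks above replace an open-coded printk() plus WARN_ON(1) pair with WARN(), which prints the message and a backtrace when its condition is true and evaluates to that condition. A hedged illustration of the pattern with an invented caller:

    #include <linux/kernel.h>

    static int example_add_entry(int ret, const char *name)
    {
            /* warns (message plus stack trace) only when ret == -EEXIST */
            if (WARN(ret == -EEXIST,
                     KERN_WARNING "example: duplicate entry '%s'\n", name))
                    return ret;
            return 0;
    }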
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index c5d60de0658f..df0d435baa48 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c | |||
| @@ -326,7 +326,7 @@ static void sysv_destroy_inode(struct inode *inode) | |||
| 326 | kmem_cache_free(sysv_inode_cachep, SYSV_I(inode)); | 326 | kmem_cache_free(sysv_inode_cachep, SYSV_I(inode)); |
| 327 | } | 327 | } |
| 328 | 328 | ||
| 329 | static void init_once(struct kmem_cache *cachep, void *p) | 329 | static void init_once(void *p) |
| 330 | { | 330 | { |
| 331 | struct sysv_inode_info *si = (struct sysv_inode_info *)p; | 331 | struct sysv_inode_info *si = (struct sysv_inode_info *)p; |
| 332 | 332 | ||
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index d81fb9ed2b8e..73db464cd08b 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c | |||
| @@ -263,8 +263,8 @@ int ubifs_calc_min_idx_lebs(struct ubifs_info *c) | |||
| 263 | 263 | ||
| 264 | idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; | 264 | idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; |
| 265 | 265 | ||
| 266 | /* And make sure we have twice the index size of space reserved */ | 266 | /* And make sure we have thrice the index size of space reserved */ |
| 267 | idx_size <<= 1; | 267 | idx_size = idx_size + (idx_size << 1); |
| 268 | 268 | ||
| 269 | /* | 269 | /* |
| 270 | * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes' | 270 | * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes' |
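Note, a quick check of the new reservation arithmetic above: for any idx_size x, x + (x << 1) = x + 2x = 3x, so the reserved space grows from twice to three times the consolidated index size, matching the updated comment.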
| @@ -302,18 +302,6 @@ long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs) | |||
| 302 | int subtract_lebs; | 302 | int subtract_lebs; |
| 303 | long long available; | 303 | long long available; |
| 304 | 304 | ||
| 305 | /* | ||
| 306 | * Force the amount available to the total size reported if the used | ||
| 307 | * space is zero. | ||
| 308 | */ | ||
| 309 | if (c->lst.total_used <= UBIFS_INO_NODE_SZ && | ||
| 310 | c->budg_data_growth + c->budg_dd_growth == 0) { | ||
| 311 | /* Do the same calculation as for c->block_cnt */ | ||
| 312 | available = c->main_lebs - 2; | ||
| 313 | available *= c->leb_size - c->dark_wm; | ||
| 314 | return available; | ||
| 315 | } | ||
| 316 | |||
| 317 | available = c->main_bytes - c->lst.total_used; | 305 | available = c->main_bytes - c->lst.total_used; |
| 318 | 306 | ||
| 319 | /* | 307 | /* |
| @@ -388,11 +376,11 @@ static int can_use_rp(struct ubifs_info *c) | |||
| 388 | * This function makes sure UBIFS has enough free eraseblocks for index growth | 376 | * This function makes sure UBIFS has enough free eraseblocks for index growth |
| 389 | * and data. | 377 | * and data. |
| 390 | * | 378 | * |
| 391 | * When budgeting index space, UBIFS reserves twice as more LEBs as the index | 379 | * When budgeting index space, UBIFS reserves thrice as many LEBs as the index |
| 392 | * would take if it was consolidated and written to the flash. This guarantees | 380 | * would take if it was consolidated and written to the flash. This guarantees |
| 393 | * that the "in-the-gaps" commit method always succeeds and UBIFS will always | 381 | * that the "in-the-gaps" commit method always succeeds and UBIFS will always |
| 394 | * be able to commit dirty index. So this function basically adds amount of | 382 | * be able to commit dirty index. So this function basically adds amount of |
| 395 | * budgeted index space to the size of the current index, multiplies this by 2, | 383 | * budgeted index space to the size of the current index, multiplies this by 3, |
| 396 | * and makes sure this does not exceed the amount of free eraseblocks. | 384 | * and makes sure this does not exceed the amount of free eraseblocks. |
| 397 | * | 385 | * |
| 398 | * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables: | 386 | * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables: |
| @@ -543,8 +531,16 @@ int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req) | |||
| 543 | int err, idx_growth, data_growth, dd_growth; | 531 | int err, idx_growth, data_growth, dd_growth; |
| 544 | struct retries_info ri; | 532 | struct retries_info ri; |
| 545 | 533 | ||
| 534 | ubifs_assert(req->new_page <= 1); | ||
| 535 | ubifs_assert(req->dirtied_page <= 1); | ||
| 536 | ubifs_assert(req->new_dent <= 1); | ||
| 537 | ubifs_assert(req->mod_dent <= 1); | ||
| 538 | ubifs_assert(req->new_ino <= 1); | ||
| 539 | ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA); | ||
| 546 | ubifs_assert(req->dirtied_ino <= 4); | 540 | ubifs_assert(req->dirtied_ino <= 4); |
| 547 | ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4); | 541 | ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4); |
| 542 | ubifs_assert(!(req->new_ino_d & 7)); | ||
| 543 | ubifs_assert(!(req->dirtied_ino_d & 7)); | ||
| 548 | 544 | ||
| 549 | data_growth = calc_data_growth(c, req); | 545 | data_growth = calc_data_growth(c, req); |
| 550 | dd_growth = calc_dd_growth(c, req); | 546 | dd_growth = calc_dd_growth(c, req); |
| @@ -618,8 +614,16 @@ again: | |||
| 618 | */ | 614 | */ |
| 619 | void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) | 615 | void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) |
| 620 | { | 616 | { |
| 617 | ubifs_assert(req->new_page <= 1); | ||
| 618 | ubifs_assert(req->dirtied_page <= 1); | ||
| 619 | ubifs_assert(req->new_dent <= 1); | ||
| 620 | ubifs_assert(req->mod_dent <= 1); | ||
| 621 | ubifs_assert(req->new_ino <= 1); | ||
| 622 | ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA); | ||
| 621 | ubifs_assert(req->dirtied_ino <= 4); | 623 | ubifs_assert(req->dirtied_ino <= 4); |
| 622 | ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4); | 624 | ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4); |
| 625 | ubifs_assert(!(req->new_ino_d & 7)); | ||
| 626 | ubifs_assert(!(req->dirtied_ino_d & 7)); | ||
| 623 | if (!req->recalculate) { | 627 | if (!req->recalculate) { |
| 624 | ubifs_assert(req->idx_growth >= 0); | 628 | ubifs_assert(req->idx_growth >= 0); |
| 625 | ubifs_assert(req->data_growth >= 0); | 629 | ubifs_assert(req->data_growth >= 0); |
| @@ -647,7 +651,11 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) | |||
| 647 | 651 | ||
| 648 | ubifs_assert(c->budg_idx_growth >= 0); | 652 | ubifs_assert(c->budg_idx_growth >= 0); |
| 649 | ubifs_assert(c->budg_data_growth >= 0); | 653 | ubifs_assert(c->budg_data_growth >= 0); |
| 654 | ubifs_assert(c->budg_dd_growth >= 0); | ||
| 650 | ubifs_assert(c->min_idx_lebs < c->main_lebs); | 655 | ubifs_assert(c->min_idx_lebs < c->main_lebs); |
| 656 | ubifs_assert(!(c->budg_idx_growth & 7)); | ||
| 657 | ubifs_assert(!(c->budg_data_growth & 7)); | ||
| 658 | ubifs_assert(!(c->budg_dd_growth & 7)); | ||
| 651 | spin_unlock(&c->space_lock); | 659 | spin_unlock(&c->space_lock); |
| 652 | } | 660 | } |
| 653 | 661 | ||
| @@ -686,41 +694,114 @@ void ubifs_convert_page_budget(struct ubifs_info *c) | |||
| 686 | void ubifs_release_dirty_inode_budget(struct ubifs_info *c, | 694 | void ubifs_release_dirty_inode_budget(struct ubifs_info *c, |
| 687 | struct ubifs_inode *ui) | 695 | struct ubifs_inode *ui) |
| 688 | { | 696 | { |
| 689 | struct ubifs_budget_req req = {.dd_growth = c->inode_budget, | 697 | struct ubifs_budget_req req; |
| 690 | .dirtied_ino_d = ui->data_len}; | ||
| 691 | 698 | ||
| 699 | memset(&req, 0, sizeof(struct ubifs_budget_req)); | ||
| 700 | req.dd_growth = c->inode_budget + ALIGN(ui->data_len, 8); | ||
| 692 | ubifs_release_budget(c, &req); | 701 | ubifs_release_budget(c, &req); |
| 693 | } | 702 | } |
| 694 | 703 | ||
| 695 | /** | 704 | /** |
| 696 | * ubifs_budg_get_free_space - return amount of free space. | 705 | * ubifs_reported_space - calculate reported free space. |
| 706 | * @c: the UBIFS file-system description object | ||
| 707 | * @free: amount of free space | ||
| 708 | * | ||
| 709 | * This function calculates the amount of free space which will be reported to | ||
| 710 | * user-space. User-space applications tend to expect that if the file-system | ||
| 711 | * (e.g., via the 'statfs()' call) reports that it has N bytes available, they | ||
| 712 | * are able to write a file of size N. UBIFS attaches node headers to each data | ||
| 713 | * node and it has to write indexing nodes as well. This introduces additional | ||
| 714 | * overhead, and UBIFS has to report slightly less free space to meet the | ||
| 715 | * above expectation. | ||
| 716 | * | ||
| 717 | * This function assumes free space is made up of uncompressed data nodes and | ||
| 718 | * full index nodes (one per data node, tripled because we always allow enough | ||
| 719 | * space to write the index thrice). | ||
| 720 | * | ||
| 721 | * Note, the calculation is pessimistic, which means that most of the time | ||
| 722 | * UBIFS reports less space than it actually has. | ||
| 723 | */ | ||
| 724 | long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free) | ||
| 725 | { | ||
| 726 | int divisor, factor, f; | ||
| 727 | |||
| 728 | /* | ||
| 729 | * Reported space size is @free * X, where X is UBIFS block size | ||
| 730 | * divided by UBIFS block size + all overhead one data block | ||
| 731 | * introduces. The overhead is the node header + indexing overhead. | ||
| 732 | * | ||
| 733 | * Indexing overhead calculations are based on the following formula: | ||
| 734 | * I = N/(f - 1) + 1, where I - number of indexing nodes, N - number | ||
| 735 | * of data nodes, f - fanout. Because the effective UBIFS fanout is half | ||
| 736 | * of the maximum fanout, we assume that each data node | ||
| 737 | * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes. | ||
| 738 | * Note, the multiplier 3 is because UBIFS reserves three times as much space | ||
| 739 | * for the index. | ||
| 740 | */ | ||
| 741 | f = c->fanout > 3 ? c->fanout >> 1 : 2; | ||
| 742 | factor = UBIFS_BLOCK_SIZE; | ||
| 743 | divisor = UBIFS_MAX_DATA_NODE_SZ; | ||
| 744 | divisor += (c->max_idx_node_sz * 3) / (f - 1); | ||
| 745 | free *= factor; | ||
| 746 | do_div(free, divisor); | ||
| 747 | return free; | ||
| 748 | } | ||
| 749 | |||
| 750 | /** | ||
| 751 | * ubifs_get_free_space - return amount of free space. | ||
| 697 | * @c: UBIFS file-system description object | 752 | * @c: UBIFS file-system description object |
| 698 | * | 753 | * |
| 699 | * This function returns amount of free space on the file-system. | 754 | * This function calculates the amount of free space to report to user-space. |
| 755 | * | ||
| 756 | * Because UBIFS may introduce substantial overhead (the index, node headers, | ||
| 757 | * alignment, wastage at the end of eraseblocks, etc), it cannot report the real | ||
| 758 | * amount of free flash space it has (well, because not all dirty space is | ||
| 759 | * reclaimable, UBIFS does not actually know the real amount). If UBIFS did so, | ||
| 760 | * it would break user expectations about what free space is. Users seem to | ||
| 761 | * be accustomed to assuming that if the file-system reports N bytes of free space, | ||
| 762 | * they would be able to fit a file of N bytes on the FS. This almost works for | ||
| 763 | * traditional file-systems, because they have way less overhead than UBIFS. | ||
| 764 | * So, to keep users happy, UBIFS tries to take the overhead into account. | ||
| 700 | */ | 765 | */ |
| 701 | long long ubifs_budg_get_free_space(struct ubifs_info *c) | 766 | long long ubifs_get_free_space(struct ubifs_info *c) |
| 702 | { | 767 | { |
| 703 | int min_idx_lebs, rsvd_idx_lebs; | 768 | int min_idx_lebs, rsvd_idx_lebs, lebs; |
| 704 | long long available, outstanding, free; | 769 | long long available, outstanding, free; |
| 705 | 770 | ||
| 706 | /* Do exactly the same calculations as in 'do_budget_space()' */ | ||
| 707 | spin_lock(&c->space_lock); | 771 | spin_lock(&c->space_lock); |
| 708 | min_idx_lebs = ubifs_calc_min_idx_lebs(c); | 772 | min_idx_lebs = ubifs_calc_min_idx_lebs(c); |
| 773 | outstanding = c->budg_data_growth + c->budg_dd_growth; | ||
| 709 | 774 | ||
| 710 | if (min_idx_lebs > c->lst.idx_lebs) | 775 | /* |
| 711 | rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs; | 776 | * Force the amount available to the total size reported if the used |
| 712 | else | 777 | * space is zero. |
| 713 | rsvd_idx_lebs = 0; | 778 | */ |
| 714 | 779 | if (c->lst.total_used <= UBIFS_INO_NODE_SZ && !outstanding) { | |
| 715 | if (rsvd_idx_lebs > c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt | ||
| 716 | - c->lst.taken_empty_lebs) { | ||
| 717 | spin_unlock(&c->space_lock); | 780 | spin_unlock(&c->space_lock); |
| 718 | return 0; | 781 | return (long long)c->block_cnt << UBIFS_BLOCK_SHIFT; |
| 719 | } | 782 | } |
| 720 | 783 | ||
| 721 | available = ubifs_calc_available(c, min_idx_lebs); | 784 | available = ubifs_calc_available(c, min_idx_lebs); |
| 722 | outstanding = c->budg_data_growth + c->budg_dd_growth; | 785 | |
| 723 | c->min_idx_lebs = min_idx_lebs; | 786 | /* |
| 787 | * When reporting free space to user-space, UBIFS guarantees that it is | ||
| 788 | * possible to write a file of free space size. This means that for | ||
| 789 | * empty LEBs we may use more precise calculations than | ||
| 790 | * 'ubifs_calc_available()' is using. Namely, we know that in empty | ||
| 791 | * LEBs we would waste only @c->leb_overhead bytes, not @c->dark_wm. | ||
| 792 | * Thus, amend the available space. | ||
| 793 | * | ||
| 794 | * Note, the calculations below are similar to what we have in | ||
| 795 | * 'do_budget_space()', so refer there for comments. | ||
| 796 | */ | ||
| 797 | if (min_idx_lebs > c->lst.idx_lebs) | ||
| 798 | rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs; | ||
| 799 | else | ||
| 800 | rsvd_idx_lebs = 0; | ||
| 801 | lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - | ||
| 802 | c->lst.taken_empty_lebs; | ||
| 803 | lebs -= rsvd_idx_lebs; | ||
| 804 | available += lebs * (c->dark_wm - c->leb_overhead); | ||
| 724 | spin_unlock(&c->space_lock); | 805 | spin_unlock(&c->space_lock); |
| 725 | 806 | ||
| 726 | if (available > outstanding) | 807 | if (available > outstanding) |
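Note: to make the ubifs_reported_space() formula above concrete, here is a worked example with illustrative numbers (UBIFS_BLOCK_SIZE = 4096, UBIFS_MAX_DATA_NODE_SZ = 4160, c->max_idx_node_sz = 4096, fanout 8; these values are assumptions chosen for the arithmetic, not taken from the patch):

    f        = 8 / 2                        = 4
    factor   = 4096
    divisor  = 4160 + (4096 * 3) / (4 - 1)  = 4160 + 4096 = 8256
    reported = free * 4096 / 8256           ~ 0.50 * free

With these numbers just under half of the raw free space would be reported through statfs(), reflecting the pessimistic estimate of node-header and index overhead.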
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c index 3b516316c9b3..0a6aa2cc78f0 100644 --- a/fs/ubifs/commit.c +++ b/fs/ubifs/commit.c | |||
| @@ -74,6 +74,7 @@ static int do_commit(struct ubifs_info *c) | |||
| 74 | goto out_up; | 74 | goto out_up; |
| 75 | } | 75 | } |
| 76 | 76 | ||
| 77 | c->cmt_no += 1; | ||
| 77 | err = ubifs_gc_start_commit(c); | 78 | err = ubifs_gc_start_commit(c); |
| 78 | if (err) | 79 | if (err) |
| 79 | goto out_up; | 80 | goto out_up; |
| @@ -115,7 +116,7 @@ static int do_commit(struct ubifs_info *c) | |||
| 115 | goto out; | 116 | goto out; |
| 116 | 117 | ||
| 117 | mutex_lock(&c->mst_mutex); | 118 | mutex_lock(&c->mst_mutex); |
| 118 | c->mst_node->cmt_no = cpu_to_le64(++c->cmt_no); | 119 | c->mst_node->cmt_no = cpu_to_le64(c->cmt_no); |
| 119 | c->mst_node->log_lnum = cpu_to_le32(new_ltail_lnum); | 120 | c->mst_node->log_lnum = cpu_to_le32(new_ltail_lnum); |
| 120 | c->mst_node->root_lnum = cpu_to_le32(zroot.lnum); | 121 | c->mst_node->root_lnum = cpu_to_le32(zroot.lnum); |
| 121 | c->mst_node->root_offs = cpu_to_le32(zroot.offs); | 122 | c->mst_node->root_offs = cpu_to_le32(zroot.offs); |
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 4e3aaeba4eca..b9cb77473758 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c | |||
| @@ -568,8 +568,8 @@ void dbg_dump_budget_req(const struct ubifs_budget_req *req) | |||
| 568 | void dbg_dump_lstats(const struct ubifs_lp_stats *lst) | 568 | void dbg_dump_lstats(const struct ubifs_lp_stats *lst) |
| 569 | { | 569 | { |
| 570 | spin_lock(&dbg_lock); | 570 | spin_lock(&dbg_lock); |
| 571 | printk(KERN_DEBUG "Lprops statistics: empty_lebs %d, idx_lebs %d\n", | 571 | printk(KERN_DEBUG "(pid %d) Lprops statistics: empty_lebs %d, " |
| 572 | lst->empty_lebs, lst->idx_lebs); | 572 | "idx_lebs %d\n", current->pid, lst->empty_lebs, lst->idx_lebs); |
| 573 | printk(KERN_DEBUG "\ttaken_empty_lebs %d, total_free %lld, " | 573 | printk(KERN_DEBUG "\ttaken_empty_lebs %d, total_free %lld, " |
| 574 | "total_dirty %lld\n", lst->taken_empty_lebs, lst->total_free, | 574 | "total_dirty %lld\n", lst->taken_empty_lebs, lst->total_free, |
| 575 | lst->total_dirty); | 575 | lst->total_dirty); |
| @@ -587,8 +587,8 @@ void dbg_dump_budg(struct ubifs_info *c) | |||
| 587 | struct ubifs_gced_idx_leb *idx_gc; | 587 | struct ubifs_gced_idx_leb *idx_gc; |
| 588 | 588 | ||
| 589 | spin_lock(&dbg_lock); | 589 | spin_lock(&dbg_lock); |
| 590 | printk(KERN_DEBUG "Budgeting info: budg_data_growth %lld, " | 590 | printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, " |
| 591 | "budg_dd_growth %lld, budg_idx_growth %lld\n", | 591 | "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid, |
| 592 | c->budg_data_growth, c->budg_dd_growth, c->budg_idx_growth); | 592 | c->budg_data_growth, c->budg_dd_growth, c->budg_idx_growth); |
| 593 | printk(KERN_DEBUG "\tdata budget sum %lld, total budget sum %lld, " | 593 | printk(KERN_DEBUG "\tdata budget sum %lld, total budget sum %lld, " |
| 594 | "freeable_cnt %d\n", c->budg_data_growth + c->budg_dd_growth, | 594 | "freeable_cnt %d\n", c->budg_data_growth + c->budg_dd_growth, |
| @@ -634,7 +634,7 @@ void dbg_dump_lprops(struct ubifs_info *c) | |||
| 634 | struct ubifs_lprops lp; | 634 | struct ubifs_lprops lp; |
| 635 | struct ubifs_lp_stats lst; | 635 | struct ubifs_lp_stats lst; |
| 636 | 636 | ||
| 637 | printk(KERN_DEBUG "Dumping LEB properties\n"); | 637 | printk(KERN_DEBUG "(pid %d) Dumping LEB properties\n", current->pid); |
| 638 | ubifs_get_lp_stats(c, &lst); | 638 | ubifs_get_lp_stats(c, &lst); |
| 639 | dbg_dump_lstats(&lst); | 639 | dbg_dump_lstats(&lst); |
| 640 | 640 | ||
| @@ -655,7 +655,7 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum) | |||
| 655 | if (dbg_failure_mode) | 655 | if (dbg_failure_mode) |
| 656 | return; | 656 | return; |
| 657 | 657 | ||
| 658 | printk(KERN_DEBUG "Dumping LEB %d\n", lnum); | 658 | printk(KERN_DEBUG "(pid %d) Dumping LEB %d\n", current->pid, lnum); |
| 659 | 659 | ||
| 660 | sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); | 660 | sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); |
| 661 | if (IS_ERR(sleb)) { | 661 | if (IS_ERR(sleb)) { |
| @@ -720,8 +720,8 @@ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat) | |||
| 720 | { | 720 | { |
| 721 | int i; | 721 | int i; |
| 722 | 722 | ||
| 723 | printk(KERN_DEBUG "Dumping heap cat %d (%d elements)\n", | 723 | printk(KERN_DEBUG "(pid %d) Dumping heap cat %d (%d elements)\n", |
| 724 | cat, heap->cnt); | 724 | current->pid, cat, heap->cnt); |
| 725 | for (i = 0; i < heap->cnt; i++) { | 725 | for (i = 0; i < heap->cnt; i++) { |
| 726 | struct ubifs_lprops *lprops = heap->arr[i]; | 726 | struct ubifs_lprops *lprops = heap->arr[i]; |
| 727 | 727 | ||
| @@ -736,7 +736,7 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, | |||
| 736 | { | 736 | { |
| 737 | int i; | 737 | int i; |
| 738 | 738 | ||
| 739 | printk(KERN_DEBUG "Dumping pnode:\n"); | 739 | printk(KERN_DEBUG "(pid %d) Dumping pnode:\n", current->pid); |
| 740 | printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n", | 740 | printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n", |
| 741 | (size_t)pnode, (size_t)parent, (size_t)pnode->cnext); | 741 | (size_t)pnode, (size_t)parent, (size_t)pnode->cnext); |
| 742 | printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n", | 742 | printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n", |
| @@ -755,7 +755,7 @@ void dbg_dump_tnc(struct ubifs_info *c) | |||
| 755 | int level; | 755 | int level; |
| 756 | 756 | ||
| 757 | printk(KERN_DEBUG "\n"); | 757 | printk(KERN_DEBUG "\n"); |
| 758 | printk(KERN_DEBUG "Dumping the TNC tree\n"); | 758 | printk(KERN_DEBUG "(pid %d) Dumping the TNC tree\n", current->pid); |
| 759 | znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); | 759 | znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); |
| 760 | level = znode->level; | 760 | level = znode->level; |
| 761 | printk(KERN_DEBUG "== Level %d ==\n", level); | 761 | printk(KERN_DEBUG "== Level %d ==\n", level); |
| @@ -2208,16 +2208,17 @@ int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, | |||
| 2208 | int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, | 2208 | int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, |
| 2209 | int offset, int len, int dtype) | 2209 | int offset, int len, int dtype) |
| 2210 | { | 2210 | { |
| 2211 | int err; | 2211 | int err, failing; |
| 2212 | 2212 | ||
| 2213 | if (in_failure_mode(desc)) | 2213 | if (in_failure_mode(desc)) |
| 2214 | return -EIO; | 2214 | return -EIO; |
| 2215 | if (do_fail(desc, lnum, 1)) | 2215 | failing = do_fail(desc, lnum, 1); |
| 2216 | if (failing) | ||
| 2216 | cut_data(buf, len); | 2217 | cut_data(buf, len); |
| 2217 | err = ubi_leb_write(desc, lnum, buf, offset, len, dtype); | 2218 | err = ubi_leb_write(desc, lnum, buf, offset, len, dtype); |
| 2218 | if (err) | 2219 | if (err) |
| 2219 | return err; | 2220 | return err; |
| 2220 | if (in_failure_mode(desc)) | 2221 | if (failing) |
| 2221 | return -EIO; | 2222 | return -EIO; |
| 2222 | return 0; | 2223 | return 0; |
| 2223 | } | 2224 | } |
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 3c4f1e93c9e0..50315fc57185 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h | |||
| @@ -27,7 +27,7 @@ | |||
| 27 | 27 | ||
| 28 | #define UBIFS_DBG(op) op | 28 | #define UBIFS_DBG(op) op |
| 29 | 29 | ||
| 30 | #define ubifs_assert(expr) do { \ | 30 | #define ubifs_assert(expr) do { \ |
| 31 | if (unlikely(!(expr))) { \ | 31 | if (unlikely(!(expr))) { \ |
| 32 | printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \ | 32 | printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \ |
| 33 | __func__, __LINE__, current->pid); \ | 33 | __func__, __LINE__, current->pid); \ |
| @@ -73,50 +73,50 @@ const char *dbg_key_str1(const struct ubifs_info *c, | |||
| 73 | const union ubifs_key *key); | 73 | const union ubifs_key *key); |
| 74 | 74 | ||
| 75 | /* | 75 | /* |
| 76 | * DBGKEY macros require dbg_lock to be held, which it is in the dbg message | 76 | * DBGKEY macros require @dbg_lock to be held, which it is in the dbg message |
| 77 | * macros. | 77 | * macros. |
| 78 | */ | 78 | */ |
| 79 | #define DBGKEY(key) dbg_key_str0(c, (key)) | 79 | #define DBGKEY(key) dbg_key_str0(c, (key)) |
| 80 | #define DBGKEY1(key) dbg_key_str1(c, (key)) | 80 | #define DBGKEY1(key) dbg_key_str1(c, (key)) |
| 81 | 81 | ||
| 82 | /* General messages */ | 82 | /* General messages */ |
| 83 | #define dbg_gen(fmt, ...) dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__) | 83 | #define dbg_gen(fmt, ...) dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__) |
| 84 | 84 | ||
| 85 | /* Additional journal messages */ | 85 | /* Additional journal messages */ |
| 86 | #define dbg_jnl(fmt, ...) dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__) | 86 | #define dbg_jnl(fmt, ...) dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__) |
| 87 | 87 | ||
| 88 | /* Additional TNC messages */ | 88 | /* Additional TNC messages */ |
| 89 | #define dbg_tnc(fmt, ...) dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__) | 89 | #define dbg_tnc(fmt, ...) dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__) |
| 90 | 90 | ||
| 91 | /* Additional lprops messages */ | 91 | /* Additional lprops messages */ |
| 92 | #define dbg_lp(fmt, ...) dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__) | 92 | #define dbg_lp(fmt, ...) dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__) |
| 93 | 93 | ||
| 94 | /* Additional LEB find messages */ | 94 | /* Additional LEB find messages */ |
| 95 | #define dbg_find(fmt, ...) dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__) | 95 | #define dbg_find(fmt, ...) dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__) |
| 96 | 96 | ||
| 97 | /* Additional mount messages */ | 97 | /* Additional mount messages */ |
| 98 | #define dbg_mnt(fmt, ...) dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__) | 98 | #define dbg_mnt(fmt, ...) dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__) |
| 99 | 99 | ||
| 100 | /* Additional I/O messages */ | 100 | /* Additional I/O messages */ |
| 101 | #define dbg_io(fmt, ...) dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__) | 101 | #define dbg_io(fmt, ...) dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__) |
| 102 | 102 | ||
| 103 | /* Additional commit messages */ | 103 | /* Additional commit messages */ |
| 104 | #define dbg_cmt(fmt, ...) dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__) | 104 | #define dbg_cmt(fmt, ...) dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__) |
| 105 | 105 | ||
| 106 | /* Additional budgeting messages */ | 106 | /* Additional budgeting messages */ |
| 107 | #define dbg_budg(fmt, ...) dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__) | 107 | #define dbg_budg(fmt, ...) dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__) |
| 108 | 108 | ||
| 109 | /* Additional log messages */ | 109 | /* Additional log messages */ |
| 110 | #define dbg_log(fmt, ...) dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__) | 110 | #define dbg_log(fmt, ...) dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__) |
| 111 | 111 | ||
| 112 | /* Additional gc messages */ | 112 | /* Additional gc messages */ |
| 113 | #define dbg_gc(fmt, ...) dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__) | 113 | #define dbg_gc(fmt, ...) dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__) |
| 114 | 114 | ||
| 115 | /* Additional scan messages */ | 115 | /* Additional scan messages */ |
| 116 | #define dbg_scan(fmt, ...) dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__) | 116 | #define dbg_scan(fmt, ...) dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__) |
| 117 | 117 | ||
| 118 | /* Additional recovery messages */ | 118 | /* Additional recovery messages */ |
| 119 | #define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__) | 119 | #define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__) |
| 120 | 120 | ||
| 121 | /* | 121 | /* |
| 122 | * Debugging message type flags (must match msg_type_names in debug.c). | 122 | * Debugging message type flags (must match msg_type_names in debug.c). |
| @@ -239,34 +239,23 @@ typedef int (*dbg_leaf_callback)(struct ubifs_info *c, | |||
| 239 | struct ubifs_zbranch *zbr, void *priv); | 239 | struct ubifs_zbranch *zbr, void *priv); |
| 240 | typedef int (*dbg_znode_callback)(struct ubifs_info *c, | 240 | typedef int (*dbg_znode_callback)(struct ubifs_info *c, |
| 241 | struct ubifs_znode *znode, void *priv); | 241 | struct ubifs_znode *znode, void *priv); |
| 242 | |||
| 243 | int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, | 242 | int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, |
| 244 | dbg_znode_callback znode_cb, void *priv); | 243 | dbg_znode_callback znode_cb, void *priv); |
| 245 | 244 | ||
| 246 | /* Checking functions */ | 245 | /* Checking functions */ |
| 247 | 246 | ||
| 248 | int dbg_check_lprops(struct ubifs_info *c); | 247 | int dbg_check_lprops(struct ubifs_info *c); |
| 249 | |||
| 250 | int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot); | 248 | int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot); |
| 251 | int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot); | 249 | int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot); |
| 252 | |||
| 253 | int dbg_check_cats(struct ubifs_info *c); | 250 | int dbg_check_cats(struct ubifs_info *c); |
| 254 | |||
| 255 | int dbg_check_ltab(struct ubifs_info *c); | 251 | int dbg_check_ltab(struct ubifs_info *c); |
| 256 | |||
| 257 | int dbg_check_synced_i_size(struct inode *inode); | 252 | int dbg_check_synced_i_size(struct inode *inode); |
| 258 | |||
| 259 | int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir); | 253 | int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir); |
| 260 | |||
| 261 | int dbg_check_tnc(struct ubifs_info *c, int extra); | 254 | int dbg_check_tnc(struct ubifs_info *c, int extra); |
| 262 | |||
| 263 | int dbg_check_idx_size(struct ubifs_info *c, long long idx_size); | 255 | int dbg_check_idx_size(struct ubifs_info *c, long long idx_size); |
| 264 | |||
| 265 | int dbg_check_filesystem(struct ubifs_info *c); | 256 | int dbg_check_filesystem(struct ubifs_info *c); |
| 266 | |||
| 267 | void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, | 257 | void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, |
| 268 | int add_pos); | 258 | int add_pos); |
| 269 | |||
| 270 | int dbg_check_lprops(struct ubifs_info *c); | 259 | int dbg_check_lprops(struct ubifs_info *c); |
| 271 | int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, | 260 | int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, |
| 272 | int row, int col); | 261 | int row, int col); |
| @@ -329,71 +318,77 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum, | |||
| 329 | #else /* !CONFIG_UBIFS_FS_DEBUG */ | 318 | #else /* !CONFIG_UBIFS_FS_DEBUG */ |
| 330 | 319 | ||
| 331 | #define UBIFS_DBG(op) | 320 | #define UBIFS_DBG(op) |
| 332 | #define ubifs_assert(expr) ({}) | 321 | |
| 333 | #define ubifs_assert_cmt_locked(c) | 322 | /* Use "if (0)" to make compiler check arguments even if debugging is off */ |
| 323 | #define ubifs_assert(expr) do { \ | ||
| 324 | if (0 && (expr)) \ | ||
| 325 | printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \ | ||
| 326 | __func__, __LINE__, current->pid); \ | ||
| 327 | } while (0) | ||
| 328 | |||
| 329 | #define dbg_err(fmt, ...) do { \ | ||
| 330 | if (0) \ | ||
| 331 | ubifs_err(fmt, ##__VA_ARGS__); \ | ||
| 332 | } while (0) | ||
| 333 | |||
| 334 | #define dbg_msg(fmt, ...) do { \ | ||
| 335 | if (0) \ | ||
| 336 | printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", \ | ||
| 337 | current->pid, __func__, ##__VA_ARGS__); \ | ||
| 338 | } while (0) | ||
| 339 | |||
| 334 | #define dbg_dump_stack() | 340 | #define dbg_dump_stack() |
| 335 | #define dbg_err(fmt, ...) ({}) | 341 | #define ubifs_assert_cmt_locked(c) |
| 336 | #define dbg_msg(fmt, ...) ({}) | ||
| 337 | #define dbg_key(c, key, fmt, ...) ({}) | ||
| 338 | |||
| 339 | #define dbg_gen(fmt, ...) ({}) | ||
| 340 | #define dbg_jnl(fmt, ...) ({}) | ||
| 341 | #define dbg_tnc(fmt, ...) ({}) | ||
| 342 | #define dbg_lp(fmt, ...) ({}) | ||
| 343 | #define dbg_find(fmt, ...) ({}) | ||
| 344 | #define dbg_mnt(fmt, ...) ({}) | ||
| 345 | #define dbg_io(fmt, ...) ({}) | ||
| 346 | #define dbg_cmt(fmt, ...) ({}) | ||
| 347 | #define dbg_budg(fmt, ...) ({}) | ||
| 348 | #define dbg_log(fmt, ...) ({}) | ||
| 349 | #define dbg_gc(fmt, ...) ({}) | ||
| 350 | #define dbg_scan(fmt, ...) ({}) | ||
| 351 | #define dbg_rcvry(fmt, ...) ({}) | ||
| 352 | |||
| 353 | #define dbg_ntype(type) "" | ||
| 354 | #define dbg_cstate(cmt_state) "" | ||
| 355 | #define dbg_get_key_dump(c, key) ({}) | ||
| 356 | #define dbg_dump_inode(c, inode) ({}) | ||
| 357 | #define dbg_dump_node(c, node) ({}) | ||
| 358 | #define dbg_dump_budget_req(req) ({}) | ||
| 359 | #define dbg_dump_lstats(lst) ({}) | ||
| 360 | #define dbg_dump_budg(c) ({}) | ||
| 361 | #define dbg_dump_lprop(c, lp) ({}) | ||
| 362 | #define dbg_dump_lprops(c) ({}) | ||
| 363 | #define dbg_dump_leb(c, lnum) ({}) | ||
| 364 | #define dbg_dump_znode(c, znode) ({}) | ||
| 365 | #define dbg_dump_heap(c, heap, cat) ({}) | ||
| 366 | #define dbg_dump_pnode(c, pnode, parent, iip) ({}) | ||
| 367 | #define dbg_dump_tnc(c) ({}) | ||
| 368 | #define dbg_dump_index(c) ({}) | ||
| 369 | 342 | ||
| 370 | #define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0 | 343 | #define dbg_gen(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) |
| 344 | #define dbg_jnl(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) | ||
| 345 | #define dbg_tnc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) | ||
| 346 | #define dbg_lp(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) | ||
| 347 | #define dbg_find(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) | ||
| 348 | #define dbg_mnt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) | ||
| 349 | #define dbg_io(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) | ||
| 350 | #define dbg_cmt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) | ||
| 351 | #define dbg_budg(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) | ||
| 352 | #define dbg_log(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) | ||
| 353 | #define dbg_gc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) | ||
| 354 | #define dbg_scan(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) | ||
| 355 | #define dbg_rcvry(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) | ||
| 356 | |||
| 357 | #define DBGKEY(key) ((char *)(key)) | ||
| 358 | #define DBGKEY1(key) ((char *)(key)) | ||
| 359 | |||
| 360 | #define dbg_ntype(type) "" | ||
| 361 | #define dbg_cstate(cmt_state) "" | ||
| 362 | #define dbg_get_key_dump(c, key) ({}) | ||
| 363 | #define dbg_dump_inode(c, inode) ({}) | ||
| 364 | #define dbg_dump_node(c, node) ({}) | ||
| 365 | #define dbg_dump_budget_req(req) ({}) | ||
| 366 | #define dbg_dump_lstats(lst) ({}) | ||
| 367 | #define dbg_dump_budg(c) ({}) | ||
| 368 | #define dbg_dump_lprop(c, lp) ({}) | ||
| 369 | #define dbg_dump_lprops(c) ({}) | ||
| 370 | #define dbg_dump_leb(c, lnum) ({}) | ||
| 371 | #define dbg_dump_znode(c, znode) ({}) | ||
| 372 | #define dbg_dump_heap(c, heap, cat) ({}) | ||
| 373 | #define dbg_dump_pnode(c, pnode, parent, iip) ({}) | ||
| 374 | #define dbg_dump_tnc(c) ({}) | ||
| 375 | #define dbg_dump_index(c) ({}) | ||
| 371 | 376 | ||
| 377 | #define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0 | ||
| 372 | #define dbg_old_index_check_init(c, zroot) 0 | 378 | #define dbg_old_index_check_init(c, zroot) 0 |
| 373 | #define dbg_check_old_index(c, zroot) 0 | 379 | #define dbg_check_old_index(c, zroot) 0 |
| 374 | |||
| 375 | #define dbg_check_cats(c) 0 | 380 | #define dbg_check_cats(c) 0 |
| 376 | |||
| 377 | #define dbg_check_ltab(c) 0 | 381 | #define dbg_check_ltab(c) 0 |
| 378 | |||
| 379 | #define dbg_check_synced_i_size(inode) 0 | 382 | #define dbg_check_synced_i_size(inode) 0 |
| 380 | |||
| 381 | #define dbg_check_dir_size(c, dir) 0 | 383 | #define dbg_check_dir_size(c, dir) 0 |
| 382 | |||
| 383 | #define dbg_check_tnc(c, x) 0 | 384 | #define dbg_check_tnc(c, x) 0 |
| 384 | |||
| 385 | #define dbg_check_idx_size(c, idx_size) 0 | 385 | #define dbg_check_idx_size(c, idx_size) 0 |
| 386 | |||
| 387 | #define dbg_check_filesystem(c) 0 | 386 | #define dbg_check_filesystem(c) 0 |
| 388 | |||
| 389 | #define dbg_check_heap(c, heap, cat, add_pos) ({}) | 387 | #define dbg_check_heap(c, heap, cat, add_pos) ({}) |
| 390 | |||
| 391 | #define dbg_check_lprops(c) 0 | 388 | #define dbg_check_lprops(c) 0 |
| 392 | #define dbg_check_lpt_nodes(c, cnode, row, col) 0 | 389 | #define dbg_check_lpt_nodes(c, cnode, row, col) 0 |
| 393 | |||
| 394 | #define dbg_force_in_the_gaps_enabled 0 | 390 | #define dbg_force_in_the_gaps_enabled 0 |
| 395 | #define dbg_force_in_the_gaps() 0 | 391 | #define dbg_force_in_the_gaps() 0 |
| 396 | |||
| 397 | #define dbg_failure_mode 0 | 392 | #define dbg_failure_mode 0 |
| 398 | #define dbg_failure_mode_registration(c) ({}) | 393 | #define dbg_failure_mode_registration(c) ({}) |
| 399 | #define dbg_failure_mode_deregistration(c) ({}) | 394 | #define dbg_failure_mode_deregistration(c) ({}) |
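Note: the rewritten no-debug stubs use the "if (0)" idiom so the compiler still type-checks the format string and arguments (and keeps the referenced variables "used") while optimising the whole statement away. A small hedged illustration of the idiom in plain C, outside UBIFS:

    #include <stdio.h>

    /* The call is never executed, but a mismatched format, such as passing
     * an int where %s is expected, still produces a compiler warning. */
    #define dbg_print(fmt, ...) do {                        \
                    if (0)                                  \
                            printf(fmt, ##__VA_ARGS__);     \
            } while (0)

    int main(void)
    {
            dbg_print("value: %d\n", 42);   /* compiled out, still checked */
            return 0;
    }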
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index e90374be7d3b..2b267c9a1806 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c | |||
| @@ -165,7 +165,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, | |||
| 165 | } | 165 | } |
| 166 | 166 | ||
| 167 | inode->i_ino = ++c->highest_inum; | 167 | inode->i_ino = ++c->highest_inum; |
| 168 | inode->i_generation = ++c->vfs_gen; | ||
| 169 | /* | 168 | /* |
| 170 | * The creation sequence number remains with this inode for its | 169 | * The creation sequence number remains with this inode for its |
| 171 | * lifetime. All nodes for this inode have a greater sequence number, | 170 | * lifetime. All nodes for this inode have a greater sequence number, |
| @@ -220,15 +219,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, | |||
| 220 | 219 | ||
| 221 | err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name); | 220 | err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name); |
| 222 | if (err) { | 221 | if (err) { |
| 223 | /* | 222 | if (err == -ENOENT) { |
| 224 | * Do not hash the direntry if parent 'i_nlink' is zero, because | ||
| 225 | * this has side-effects - '->delete_inode()' call will not be | ||
| 226 | * called for the parent orphan inode, because 'd_count' of its | ||
| 227 | * direntry will stay 1 (it'll be negative direntry I guess) | ||
| 228 | * and prevent 'iput_final()' until the dentry is destroyed due | ||
| 229 | * to unmount or memory pressure. | ||
| 230 | */ | ||
| 231 | if (err == -ENOENT && dir->i_nlink != 0) { | ||
| 232 | dbg_gen("not found"); | 223 | dbg_gen("not found"); |
| 233 | goto done; | 224 | goto done; |
| 234 | } | 225 | } |
| @@ -525,7 +516,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, | |||
| 525 | struct ubifs_inode *dir_ui = ubifs_inode(dir); | 516 | struct ubifs_inode *dir_ui = ubifs_inode(dir); |
| 526 | int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); | 517 | int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); |
| 527 | struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2, | 518 | struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2, |
| 528 | .dirtied_ino_d = ui->data_len }; | 519 | .dirtied_ino_d = ALIGN(ui->data_len, 8) }; |
| 529 | 520 | ||
| 530 | /* | 521 | /* |
| 531 | * Budget request settings: new direntry, changing the target inode, | 522 | * Budget request settings: new direntry, changing the target inode, |
| @@ -596,7 +587,6 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) | |||
| 596 | if (err) { | 587 | if (err) { |
| 597 | if (err != -ENOSPC) | 588 | if (err != -ENOSPC) |
| 598 | return err; | 589 | return err; |
| 599 | err = 0; | ||
| 600 | budgeted = 0; | 590 | budgeted = 0; |
| 601 | } | 591 | } |
| 602 | 592 | ||
| @@ -727,8 +717,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, int mode) | |||
| 727 | struct ubifs_inode *dir_ui = ubifs_inode(dir); | 717 | struct ubifs_inode *dir_ui = ubifs_inode(dir); |
| 728 | struct ubifs_info *c = dir->i_sb->s_fs_info; | 718 | struct ubifs_info *c = dir->i_sb->s_fs_info; |
| 729 | int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); | 719 | int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); |
| 730 | struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, | 720 | struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 }; |
| 731 | .dirtied_ino_d = 1 }; | ||
| 732 | 721 | ||
| 733 | /* | 722 | /* |
| 734 | * Budget request settings: new inode, new direntry and changing parent | 723 | * Budget request settings: new inode, new direntry and changing parent |
| @@ -789,7 +778,8 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry, | |||
| 789 | int sz_change = CALC_DENT_SIZE(dentry->d_name.len); | 778 | int sz_change = CALC_DENT_SIZE(dentry->d_name.len); |
| 790 | int err, devlen = 0; | 779 | int err, devlen = 0; |
| 791 | struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, | 780 | struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, |
| 792 | .new_ino_d = devlen, .dirtied_ino = 1 }; | 781 | .new_ino_d = ALIGN(devlen, 8), |
| 782 | .dirtied_ino = 1 }; | ||
| 793 | 783 | ||
| 794 | /* | 784 | /* |
| 795 | * Budget request settings: new inode, new direntry and changing parent | 785 | * Budget request settings: new inode, new direntry and changing parent |
| @@ -863,7 +853,8 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry, | |||
| 863 | int err, len = strlen(symname); | 853 | int err, len = strlen(symname); |
| 864 | int sz_change = CALC_DENT_SIZE(dentry->d_name.len); | 854 | int sz_change = CALC_DENT_SIZE(dentry->d_name.len); |
| 865 | struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, | 855 | struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, |
| 866 | .new_ino_d = len, .dirtied_ino = 1 }; | 856 | .new_ino_d = ALIGN(len, 8), |
| 857 | .dirtied_ino = 1 }; | ||
| 867 | 858 | ||
| 868 | /* | 859 | /* |
| 869 | * Budget request settings: new inode, new direntry and changing parent | 860 | * Budget request settings: new inode, new direntry and changing parent |
| @@ -1012,7 +1003,7 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
| 1012 | struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1, | 1003 | struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1, |
| 1013 | .dirtied_ino = 3 }; | 1004 | .dirtied_ino = 3 }; |
| 1014 | struct ubifs_budget_req ino_req = { .dirtied_ino = 1, | 1005 | struct ubifs_budget_req ino_req = { .dirtied_ino = 1, |
| 1015 | .dirtied_ino_d = old_inode_ui->data_len }; | 1006 | .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; |
| 1016 | struct timespec time; | 1007 | struct timespec time; |
| 1017 | 1008 | ||
| 1018 | /* | 1009 | /* |
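Note: the budget requests above now pre-align the inode data lengths with ALIGN(x, 8), which rounds x up to the next multiple of 8 (for example ALIGN(0, 8) = 0, ALIGN(1, 8) = 8, ALIGN(13, 8) = 16). That keeps new_ino_d and dirtied_ino_d consistent with the '& 7' alignment asserts added to ubifs_budget_space() and ubifs_release_budget() earlier in this patch.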
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 005a3b854d96..3d698e2022b1 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c | |||
| @@ -53,6 +53,7 @@ | |||
| 53 | 53 | ||
| 54 | #include "ubifs.h" | 54 | #include "ubifs.h" |
| 55 | #include <linux/mount.h> | 55 | #include <linux/mount.h> |
| 56 | #include <linux/namei.h> | ||
| 56 | 57 | ||
| 57 | static int read_block(struct inode *inode, void *addr, unsigned int block, | 58 | static int read_block(struct inode *inode, void *addr, unsigned int block, |
| 58 | struct ubifs_data_node *dn) | 59 | struct ubifs_data_node *dn) |
| @@ -792,7 +793,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, | |||
| 792 | int err; | 793 | int err; |
| 793 | struct ubifs_budget_req req; | 794 | struct ubifs_budget_req req; |
| 794 | loff_t old_size = inode->i_size, new_size = attr->ia_size; | 795 | loff_t old_size = inode->i_size, new_size = attr->ia_size; |
| 795 | int offset = new_size & (UBIFS_BLOCK_SIZE - 1); | 796 | int offset = new_size & (UBIFS_BLOCK_SIZE - 1), budgeted = 1; |
| 796 | struct ubifs_inode *ui = ubifs_inode(inode); | 797 | struct ubifs_inode *ui = ubifs_inode(inode); |
| 797 | 798 | ||
| 798 | dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size); | 799 | dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size); |
| @@ -810,8 +811,15 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, | |||
| 810 | /* A funny way to budget for truncation node */ | 811 | /* A funny way to budget for truncation node */ |
| 811 | req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ; | 812 | req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ; |
| 812 | err = ubifs_budget_space(c, &req); | 813 | err = ubifs_budget_space(c, &req); |
| 813 | if (err) | 814 | if (err) { |
| 814 | return err; | 815 | /* |
| 816 | * Treat truncations to zero as deletion and always allow them, | ||
| 817 | * just like we do for '->unlink()'. | ||
| 818 | */ | ||
| 819 | if (new_size || err != -ENOSPC) | ||
| 820 | return err; | ||
| 821 | budgeted = 0; | ||
| 822 | } | ||
| 815 | 823 | ||
| 816 | err = vmtruncate(inode, new_size); | 824 | err = vmtruncate(inode, new_size); |
| 817 | if (err) | 825 | if (err) |
| @@ -868,7 +876,12 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, | |||
| 868 | err = ubifs_jnl_truncate(c, inode, old_size, new_size); | 876 | err = ubifs_jnl_truncate(c, inode, old_size, new_size); |
| 869 | mutex_unlock(&ui->ui_mutex); | 877 | mutex_unlock(&ui->ui_mutex); |
| 870 | out_budg: | 878 | out_budg: |
| 871 | ubifs_release_budget(c, &req); | 879 | if (budgeted) |
| 880 | ubifs_release_budget(c, &req); | ||
| 881 | else { | ||
| 882 | c->nospace = c->nospace_rp = 0; | ||
| 883 | smp_wmb(); | ||
| 884 | } | ||
| 872 | return err; | 885 | return err; |
| 873 | } | 886 | } |
| 874 | 887 | ||
| @@ -889,7 +902,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode, | |||
| 889 | loff_t new_size = attr->ia_size; | 902 | loff_t new_size = attr->ia_size; |
| 890 | struct ubifs_inode *ui = ubifs_inode(inode); | 903 | struct ubifs_inode *ui = ubifs_inode(inode); |
| 891 | struct ubifs_budget_req req = { .dirtied_ino = 1, | 904 | struct ubifs_budget_req req = { .dirtied_ino = 1, |
| 892 | .dirtied_ino_d = ui->data_len }; | 905 | .dirtied_ino_d = ALIGN(ui->data_len, 8) }; |
| 893 | 906 | ||
| 894 | err = ubifs_budget_space(c, &req); | 907 | err = ubifs_budget_space(c, &req); |
| 895 | if (err) | 908 | if (err) |
| @@ -940,7 +953,8 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr) | |||
| 940 | struct inode *inode = dentry->d_inode; | 953 | struct inode *inode = dentry->d_inode; |
| 941 | struct ubifs_info *c = inode->i_sb->s_fs_info; | 954 | struct ubifs_info *c = inode->i_sb->s_fs_info; |
| 942 | 955 | ||
| 943 | dbg_gen("ino %lu, ia_valid %#x", inode->i_ino, attr->ia_valid); | 956 | dbg_gen("ino %lu, mode %#x, ia_valid %#x", |
| 957 | inode->i_ino, inode->i_mode, attr->ia_valid); | ||
| 944 | err = inode_change_ok(inode, attr); | 958 | err = inode_change_ok(inode, attr); |
| 945 | if (err) | 959 | if (err) |
| 946 | return err; | 960 | return err; |
| @@ -1050,7 +1064,7 @@ static int update_mctime(struct ubifs_info *c, struct inode *inode) | |||
| 1050 | if (mctime_update_needed(inode, &now)) { | 1064 | if (mctime_update_needed(inode, &now)) { |
| 1051 | int err, release; | 1065 | int err, release; |
| 1052 | struct ubifs_budget_req req = { .dirtied_ino = 1, | 1066 | struct ubifs_budget_req req = { .dirtied_ino = 1, |
| 1053 | .dirtied_ino_d = ui->data_len }; | 1067 | .dirtied_ino_d = ALIGN(ui->data_len, 8) }; |
| 1054 | 1068 | ||
| 1055 | err = ubifs_budget_space(c, &req); | 1069 | err = ubifs_budget_space(c, &req); |
| 1056 | if (err) | 1070 | if (err) |
| @@ -1269,6 +1283,7 @@ struct file_operations ubifs_file_operations = { | |||
| 1269 | .fsync = ubifs_fsync, | 1283 | .fsync = ubifs_fsync, |
| 1270 | .unlocked_ioctl = ubifs_ioctl, | 1284 | .unlocked_ioctl = ubifs_ioctl, |
| 1271 | .splice_read = generic_file_splice_read, | 1285 | .splice_read = generic_file_splice_read, |
| 1286 | .splice_write = generic_file_splice_write, | ||
| 1272 | #ifdef CONFIG_COMPAT | 1287 | #ifdef CONFIG_COMPAT |
| 1273 | .compat_ioctl = ubifs_compat_ioctl, | 1288 | .compat_ioctl = ubifs_compat_ioctl, |
| 1274 | #endif | 1289 | #endif |
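Editor's note on the do_truncation() hunks above: a truncation to size zero is now allowed to proceed even when ubifs_budget_space() fails with -ENOSPC, because it frees space just like an unlink, and on that unbudgeted path the cached no-space flags are cleared instead of a budget being released (the smp_wmb() pairing is omitted below). A minimal user-space sketch of that control flow; the struct and helper names are invented stand-ins, not the UBIFS API:

#include <errno.h>
#include <stdio.h>

/* Hypothetical stand-ins for the UBIFS budgeting state. */
struct fs_state {
	int nospace;     /* cached "out of space" flag */
	int nospace_rp;  /* same, for the reserved pool */
};

static int budget_space(struct fs_state *c, long long new_size)
{
	(void)new_size;
	/* Pretend the budget request fails when the FS is full. */
	return c->nospace ? -ENOSPC : 0;
}

static void release_budget(struct fs_state *c) { (void)c; /* no-op here */ }

/* Mirrors the decision in do_truncation(): truncation to zero frees space,
 * so it is allowed to run unbudgeted when the only error is ENOSPC. */
static int do_truncation(struct fs_state *c, long long new_size)
{
	int budgeted = 1;
	int err = budget_space(c, new_size);

	if (err) {
		if (new_size || err != -ENOSPC)
			return err;        /* a real failure */
		budgeted = 0;              /* treat like deletion, proceed */
	}

	/* ... the actual truncation work would happen here ... */

	if (budgeted)
		release_budget(c);
	else
		/* Space was freed, so the cached flags are now stale. */
		c->nospace = c->nospace_rp = 0;
	return 0;
}

int main(void)
{
	struct fs_state c = { .nospace = 1, .nospace_rp = 1 };

	printf("truncate to 4096: %d\n", do_truncation(&c, 4096)); /* -ENOSPC */
	printf("truncate to 0: %d\n", do_truncation(&c, 0));       /* 0 */
	printf("nospace flag after: %d\n", c.nospace);             /* 0 */
	return 0;
}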
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 10394c548367..e045c8b55423 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c | |||
| @@ -211,14 +211,8 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c, | |||
| 211 | * dirty index heap, and it falls-back to LPT scanning if the heaps are empty | 211 | * dirty index heap, and it falls-back to LPT scanning if the heaps are empty |
| 212 | * or do not have an LEB which satisfies the @min_space criteria. | 212 | * or do not have an LEB which satisfies the @min_space criteria. |
| 213 | * | 213 | * |
| 214 | * Note: | 214 | * Note, LEBs which have less than dead watermark of free + dirty space are |
| 215 | * o LEBs which have less than dead watermark of dirty space are never picked | 215 | * never picked by this function. |
| 216 | * by this function; | ||
| 217 | * | ||
| 218 | * Returns zero and the LEB properties of | ||
| 219 | * found dirty LEB in case of success, %-ENOSPC if no dirty LEB was found and a | ||
| 220 | * negative error code in case of other failures. The returned LEB is marked as | ||
| 221 | * "taken". | ||
| 222 | * | 216 | * |
| 223 | * The additional @pick_free argument controls if this function has to return a | 217 | * The additional @pick_free argument controls if this function has to return a |
| 224 | * free or freeable LEB if one is present. For example, GC must set it to %1, | 218 | * free or freeable LEB if one is present. For example, GC must set it to %1, |
| @@ -231,6 +225,10 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c, | |||
| 231 | * | 225 | * |
| 232 | * In addition @pick_free is set to %2 by the recovery process in order to | 226 | * In addition @pick_free is set to %2 by the recovery process in order to |
| 233 | * recover gc_lnum in which case an index LEB must not be returned. | 227 | * recover gc_lnum in which case an index LEB must not be returned. |
| 228 | * | ||
| 229 | * This function returns zero and the LEB properties of found dirty LEB in case | ||
| 230 | * of success, %-ENOSPC if no dirty LEB was found and a negative error code in | ||
| 231 | * case of other failures. The returned LEB is marked as "taken". | ||
| 234 | */ | 232 | */ |
| 235 | int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, | 233 | int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, |
| 236 | int min_space, int pick_free) | 234 | int min_space, int pick_free) |
| @@ -245,7 +243,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, | |||
| 245 | int lebs, rsvd_idx_lebs = 0; | 243 | int lebs, rsvd_idx_lebs = 0; |
| 246 | 244 | ||
| 247 | spin_lock(&c->space_lock); | 245 | spin_lock(&c->space_lock); |
| 248 | lebs = c->lst.empty_lebs; | 246 | lebs = c->lst.empty_lebs + c->idx_gc_cnt; |
| 249 | lebs += c->freeable_cnt - c->lst.taken_empty_lebs; | 247 | lebs += c->freeable_cnt - c->lst.taken_empty_lebs; |
| 250 | 248 | ||
| 251 | /* | 249 | /* |
| @@ -290,9 +288,14 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, | |||
| 290 | idx_lp = idx_heap->arr[0]; | 288 | idx_lp = idx_heap->arr[0]; |
| 291 | sum = idx_lp->free + idx_lp->dirty; | 289 | sum = idx_lp->free + idx_lp->dirty; |
| 292 | /* | 290 | /* |
| 293 | * Since we reserve twice as more space for the index than it | 291 | * Since we reserve three times as much space for the index as it |
| 294 | * actually takes, it does not make sense to pick indexing LEBs | 292 | * actually takes, it does not make sense to pick indexing LEBs |
| 295 | * with less than half LEB of dirty space. | 293 | * with less than, say, half an LEB of dirty space. Maybe half is |
| 294 | * not the optimal boundary - this should be tested and | ||
| 295 | * checked. This boundary should determine how much we use | ||
| 296 | * in-the-gaps to consolidate the index compared to how much | ||
| 297 | * we use the garbage collector to consolidate it. The "half" | ||
| 298 | * criterion just feels fine. | ||
| 296 | */ | 299 | */ |
| 297 | if (sum < min_space || sum < c->half_leb_size) | 300 | if (sum < min_space || sum < c->half_leb_size) |
| 298 | idx_lp = NULL; | 301 | idx_lp = NULL; |
| @@ -312,7 +315,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, | |||
| 312 | lp = idx_lp; | 315 | lp = idx_lp; |
| 313 | 316 | ||
| 314 | if (lp) { | 317 | if (lp) { |
| 315 | ubifs_assert(lp->dirty >= c->dead_wm); | 318 | ubifs_assert(lp->free + lp->dirty >= c->dead_wm); |
| 316 | goto found; | 319 | goto found; |
| 317 | } | 320 | } |
| 318 | 321 | ||
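The ubifs_find_dirty_leb() hunks above now count GC'd index LEBs as available, skip index LEBs whose free plus dirty space is under half an LEB, and assert that the returned LEB offers at least the dead watermark of free plus dirty space. A small sketch of that filter under those assumptions; the field and function names are invented, not the real lprops interface:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical LEB properties: free and dirty bytes. */
struct leb_props {
	int free;
	int dirty;
};

/*
 * Decide whether an index LEB is worth garbage collecting: it must offer at
 * least @min_space of reclaimable space and, because the index is heavily
 * over-reserved, at least half an LEB of it (the "half" heuristic from the
 * comment in ubifs_find_dirty_leb()).
 */
static bool idx_leb_qualifies(const struct leb_props *lp, int min_space,
			      int half_leb_size, int dead_wm)
{
	int sum = lp->free + lp->dirty;

	if (sum < min_space || sum < half_leb_size)
		return false;
	/* Anything below the dead watermark is unreclaimable by definition. */
	return sum >= dead_wm;
}

int main(void)
{
	struct leb_props a = { .free = 1024, .dirty = 200 * 1024 };
	struct leb_props b = { .free = 0, .dirty = 4096 };
	const int half_leb = 64 * 1024, dead_wm = 2048;

	printf("%d\n", idx_leb_qualifies(&a, 8192, half_leb, dead_wm)); /* 1 */
	printf("%d\n", idx_leb_qualifies(&b, 8192, half_leb, dead_wm)); /* 0 */
	return 0;
}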
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index d0f3dac29081..13f1019c859f 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c | |||
| @@ -344,6 +344,12 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp) | |||
| 344 | if (err) | 344 | if (err) |
| 345 | goto out; | 345 | goto out; |
| 346 | 346 | ||
| 347 | /* Allow for races with TNC */ | ||
| 348 | c->gced_lnum = lnum; | ||
| 349 | smp_wmb(); | ||
| 350 | c->gc_seq += 1; | ||
| 351 | smp_wmb(); | ||
| 352 | |||
| 347 | if (c->gc_lnum == -1) { | 353 | if (c->gc_lnum == -1) { |
| 348 | c->gc_lnum = lnum; | 354 | c->gc_lnum = lnum; |
| 349 | err = LEB_RETAINED; | 355 | err = LEB_RETAINED; |
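The two smp_wmb() calls added above publish "which LEB was just garbage collected" to lockless readers in the TNC: the LEB number is stored first, then the sequence counter is bumped, so a reader that observes the new gc_seq is guaranteed to also observe the matching gced_lnum. A user-space sketch of the writer side, with C11 fences standing in for the kernel barriers and invented names (the matching reader side is sketched after the fs/ubifs/tnc.c hunks below):

#include <stdatomic.h>
#include <stdio.h>

/* Hypothetical shared state mirroring @c->gced_lnum and @c->gc_seq. */
static _Atomic int gced_lnum;
static _Atomic int gc_seq;

/*
 * Publish "LEB @lnum has just been garbage collected".  The LEB number is
 * stored before the sequence counter is bumped, so a lockless reader that
 * sees the new gc_seq also sees the LEB it refers to.  The release fences
 * play the role of the kernel's smp_wmb() calls.
 */
static void publish_gced_leb(int lnum)
{
	atomic_store_explicit(&gced_lnum, lnum, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);
	atomic_fetch_add_explicit(&gc_seq, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);
}

int main(void)
{
	publish_gced_leb(17);
	printf("gc_seq %d, gced_lnum %d\n",
	       atomic_load(&gc_seq), atomic_load(&gced_lnum));
	return 0;
}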
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 3374f91b6709..054363f2b207 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c | |||
| @@ -54,6 +54,20 @@ | |||
| 54 | #include "ubifs.h" | 54 | #include "ubifs.h" |
| 55 | 55 | ||
| 56 | /** | 56 | /** |
| 57 | * ubifs_ro_mode - switch UBIFS to read-only mode. | ||
| 58 | * @c: UBIFS file-system description object | ||
| 59 | * @err: error code which is the reason of switching to R/O mode | ||
| 60 | */ | ||
| 61 | void ubifs_ro_mode(struct ubifs_info *c, int err) | ||
| 62 | { | ||
| 63 | if (!c->ro_media) { | ||
| 64 | c->ro_media = 1; | ||
| 65 | ubifs_warn("switched to read-only mode, error %d", err); | ||
| 66 | dbg_dump_stack(); | ||
| 67 | } | ||
| 68 | } | ||
| 69 | |||
| 70 | /** | ||
| 57 | * ubifs_check_node - check node. | 71 | * ubifs_check_node - check node. |
| 58 | * @c: UBIFS file-system description object | 72 | * @c: UBIFS file-system description object |
| 59 | * @buf: node to check | 73 | * @buf: node to check |
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 283155abe5f5..22993f867d19 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c | |||
| @@ -447,13 +447,11 @@ static int get_dent_type(int mode) | |||
| 447 | * @ino: buffer in which to pack inode node | 447 | * @ino: buffer in which to pack inode node |
| 448 | * @inode: inode to pack | 448 | * @inode: inode to pack |
| 449 | * @last: indicates the last node of the group | 449 | * @last: indicates the last node of the group |
| 450 | * @last_reference: non-zero if this is a deletion inode | ||
| 451 | */ | 450 | */ |
| 452 | static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino, | 451 | static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino, |
| 453 | const struct inode *inode, int last, | 452 | const struct inode *inode, int last) |
| 454 | int last_reference) | ||
| 455 | { | 453 | { |
| 456 | int data_len = 0; | 454 | int data_len = 0, last_reference = !inode->i_nlink; |
| 457 | struct ubifs_inode *ui = ubifs_inode(inode); | 455 | struct ubifs_inode *ui = ubifs_inode(inode); |
| 458 | 456 | ||
| 459 | ino->ch.node_type = UBIFS_INO_NODE; | 457 | ino->ch.node_type = UBIFS_INO_NODE; |
| @@ -596,9 +594,9 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, | |||
| 596 | ubifs_prep_grp_node(c, dent, dlen, 0); | 594 | ubifs_prep_grp_node(c, dent, dlen, 0); |
| 597 | 595 | ||
| 598 | ino = (void *)dent + aligned_dlen; | 596 | ino = (void *)dent + aligned_dlen; |
| 599 | pack_inode(c, ino, inode, 0, last_reference); | 597 | pack_inode(c, ino, inode, 0); |
| 600 | ino = (void *)ino + aligned_ilen; | 598 | ino = (void *)ino + aligned_ilen; |
| 601 | pack_inode(c, ino, dir, 1, 0); | 599 | pack_inode(c, ino, dir, 1); |
| 602 | 600 | ||
| 603 | if (last_reference) { | 601 | if (last_reference) { |
| 604 | err = ubifs_add_orphan(c, inode->i_ino); | 602 | err = ubifs_add_orphan(c, inode->i_ino); |
| @@ -606,6 +604,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, | |||
| 606 | release_head(c, BASEHD); | 604 | release_head(c, BASEHD); |
| 607 | goto out_finish; | 605 | goto out_finish; |
| 608 | } | 606 | } |
| 607 | ui->del_cmtno = c->cmt_no; | ||
| 609 | } | 608 | } |
| 610 | 609 | ||
| 611 | err = write_head(c, BASEHD, dent, len, &lnum, &dent_offs, sync); | 610 | err = write_head(c, BASEHD, dent, len, &lnum, &dent_offs, sync); |
| @@ -750,30 +749,25 @@ out_free: | |||
| 750 | * ubifs_jnl_write_inode - flush inode to the journal. | 749 | * ubifs_jnl_write_inode - flush inode to the journal. |
| 751 | * @c: UBIFS file-system description object | 750 | * @c: UBIFS file-system description object |
| 752 | * @inode: inode to flush | 751 | * @inode: inode to flush |
| 753 | * @deletion: inode has been deleted | ||
| 754 | * | 752 | * |
| 755 | * This function writes inode @inode to the journal. If the inode is | 753 | * This function writes inode @inode to the journal. If the inode is |
| 756 | * synchronous, it also synchronizes the write-buffer. Returns zero in case of | 754 | * synchronous, it also synchronizes the write-buffer. Returns zero in case of |
| 757 | * success and a negative error code in case of failure. | 755 | * success and a negative error code in case of failure. |
| 758 | */ | 756 | */ |
| 759 | int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode, | 757 | int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) |
| 760 | int deletion) | ||
| 761 | { | 758 | { |
| 762 | int err, len, lnum, offs, sync = 0; | 759 | int err, lnum, offs; |
| 763 | struct ubifs_ino_node *ino; | 760 | struct ubifs_ino_node *ino; |
| 764 | struct ubifs_inode *ui = ubifs_inode(inode); | 761 | struct ubifs_inode *ui = ubifs_inode(inode); |
| 762 | int sync = 0, len = UBIFS_INO_NODE_SZ, last_reference = !inode->i_nlink; | ||
| 765 | 763 | ||
| 766 | dbg_jnl("ino %lu%s", inode->i_ino, | 764 | dbg_jnl("ino %lu, nlink %u", inode->i_ino, inode->i_nlink); |
| 767 | deletion ? " (last reference)" : ""); | ||
| 768 | if (deletion) | ||
| 769 | ubifs_assert(inode->i_nlink == 0); | ||
| 770 | 765 | ||
| 771 | len = UBIFS_INO_NODE_SZ; | ||
| 772 | /* | 766 | /* |
| 773 | * If the inode is being deleted, do not write the attached data. No | 767 | * If the inode is being deleted, do not write the attached data. No |
| 774 | * need to synchronize the write-buffer either. | 768 | * need to synchronize the write-buffer either. |
| 775 | */ | 769 | */ |
| 776 | if (!deletion) { | 770 | if (!last_reference) { |
| 777 | len += ui->data_len; | 771 | len += ui->data_len; |
| 778 | sync = IS_SYNC(inode); | 772 | sync = IS_SYNC(inode); |
| 779 | } | 773 | } |
| @@ -786,7 +780,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode, | |||
| 786 | if (err) | 780 | if (err) |
| 787 | goto out_free; | 781 | goto out_free; |
| 788 | 782 | ||
| 789 | pack_inode(c, ino, inode, 1, deletion); | 783 | pack_inode(c, ino, inode, 1); |
| 790 | err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync); | 784 | err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync); |
| 791 | if (err) | 785 | if (err) |
| 792 | goto out_release; | 786 | goto out_release; |
| @@ -795,7 +789,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode, | |||
| 795 | inode->i_ino); | 789 | inode->i_ino); |
| 796 | release_head(c, BASEHD); | 790 | release_head(c, BASEHD); |
| 797 | 791 | ||
| 798 | if (deletion) { | 792 | if (last_reference) { |
| 799 | err = ubifs_tnc_remove_ino(c, inode->i_ino); | 793 | err = ubifs_tnc_remove_ino(c, inode->i_ino); |
| 800 | if (err) | 794 | if (err) |
| 801 | goto out_ro; | 795 | goto out_ro; |
| @@ -828,6 +822,65 @@ out_free: | |||
| 828 | } | 822 | } |
| 829 | 823 | ||
| 830 | /** | 824 | /** |
| 825 | * ubifs_jnl_delete_inode - delete an inode. | ||
| 826 | * @c: UBIFS file-system description object | ||
| 827 | * @inode: inode to delete | ||
| 828 | * | ||
| 829 | * This function deletes inode @inode which includes removing it from orphans, | ||
| 830 | * deleting it from TNC and, in some cases, writing a deletion inode to the | ||
| 831 | * journal. | ||
| 832 | * | ||
| 833 | * When regular file inodes are unlinked or a directory inode is removed, the | ||
| 834 | * 'ubifs_jnl_update()' function writes a corresponding deletion inode and | ||
| 835 | * direntry to the media, and adds the inode to orphans. After this, when the | ||
| 836 | * last reference to this inode has been dropped, this function is called. In | ||
| 837 | * general, it has to write one more deletion inode to the media, because if | ||
| 838 | * a commit happened between 'ubifs_jnl_update()' and | ||
| 839 | * 'ubifs_jnl_delete_inode()', the deletion inode is not in the journal | ||
| 840 | * anymore, and in fact it might not be on the flash anymore, because it might | ||
| 841 | * have been garbage-collected already. And for optimization reasons UBIFS does | ||
| 842 | * not read the orphan area if it has been unmounted cleanly, so it would have | ||
| 843 | * no indication in the journal that there is a deleted inode which has to be | ||
| 844 | * removed from TNC. | ||
| 845 | * | ||
| 846 | * However, if there was no commit between 'ubifs_jnl_update()' and | ||
| 847 | * 'ubifs_jnl_delete_inode()', then there is no need to write the deletion | ||
| 848 | * inode to the media for the second time. And this is quite a typical case. | ||
| 849 | * | ||
| 850 | * This function returns zero in case of success and a negative error code in | ||
| 851 | * case of failure. | ||
| 852 | */ | ||
| 853 | int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode) | ||
| 854 | { | ||
| 855 | int err; | ||
| 856 | struct ubifs_inode *ui = ubifs_inode(inode); | ||
| 857 | |||
| 858 | ubifs_assert(inode->i_nlink == 0); | ||
| 859 | |||
| 860 | if (ui->del_cmtno != c->cmt_no) | ||
| 861 | /* A commit happened for sure */ | ||
| 862 | return ubifs_jnl_write_inode(c, inode); | ||
| 863 | |||
| 864 | down_read(&c->commit_sem); | ||
| 865 | /* | ||
| 866 | * Check commit number again, because the first test has been done | ||
| 867 | * without @c->commit_sem, so a commit might have happened. | ||
| 868 | */ | ||
| 869 | if (ui->del_cmtno != c->cmt_no) { | ||
| 870 | up_read(&c->commit_sem); | ||
| 871 | return ubifs_jnl_write_inode(c, inode); | ||
| 872 | } | ||
| 873 | |||
| 874 | err = ubifs_tnc_remove_ino(c, inode->i_ino); | ||
| 875 | if (err) | ||
| 876 | ubifs_ro_mode(c, err); | ||
| 877 | else | ||
| 878 | ubifs_delete_orphan(c, inode->i_ino); | ||
| 879 | up_read(&c->commit_sem); | ||
| 880 | return err; | ||
| 881 | } | ||
| 882 | |||
| 883 | /** | ||
| 831 | * ubifs_jnl_rename - rename a directory entry. | 884 | * ubifs_jnl_rename - rename a directory entry. |
| 832 | * @c: UBIFS file-system description object | 885 | * @c: UBIFS file-system description object |
| 833 | * @old_dir: parent inode of directory entry to rename | 886 | * @old_dir: parent inode of directory entry to rename |
| @@ -917,16 +970,16 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, | |||
| 917 | 970 | ||
| 918 | p = (void *)dent2 + aligned_dlen2; | 971 | p = (void *)dent2 + aligned_dlen2; |
| 919 | if (new_inode) { | 972 | if (new_inode) { |
| 920 | pack_inode(c, p, new_inode, 0, last_reference); | 973 | pack_inode(c, p, new_inode, 0); |
| 921 | p += ALIGN(ilen, 8); | 974 | p += ALIGN(ilen, 8); |
| 922 | } | 975 | } |
| 923 | 976 | ||
| 924 | if (!move) | 977 | if (!move) |
| 925 | pack_inode(c, p, old_dir, 1, 0); | 978 | pack_inode(c, p, old_dir, 1); |
| 926 | else { | 979 | else { |
| 927 | pack_inode(c, p, old_dir, 0, 0); | 980 | pack_inode(c, p, old_dir, 0); |
| 928 | p += ALIGN(plen, 8); | 981 | p += ALIGN(plen, 8); |
| 929 | pack_inode(c, p, new_dir, 1, 0); | 982 | pack_inode(c, p, new_dir, 1); |
| 930 | } | 983 | } |
| 931 | 984 | ||
| 932 | if (last_reference) { | 985 | if (last_reference) { |
| @@ -935,6 +988,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, | |||
| 935 | release_head(c, BASEHD); | 988 | release_head(c, BASEHD); |
| 936 | goto out_finish; | 989 | goto out_finish; |
| 937 | } | 990 | } |
| 991 | new_ui->del_cmtno = c->cmt_no; | ||
| 938 | } | 992 | } |
| 939 | 993 | ||
| 940 | err = write_head(c, BASEHD, dent, len, &lnum, &offs, sync); | 994 | err = write_head(c, BASEHD, dent, len, &lnum, &offs, sync); |
| @@ -1131,7 +1185,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, | |||
| 1131 | if (err) | 1185 | if (err) |
| 1132 | goto out_free; | 1186 | goto out_free; |
| 1133 | 1187 | ||
| 1134 | pack_inode(c, ino, inode, 0, 0); | 1188 | pack_inode(c, ino, inode, 0); |
| 1135 | ubifs_prep_grp_node(c, trun, UBIFS_TRUN_NODE_SZ, dlen ? 0 : 1); | 1189 | ubifs_prep_grp_node(c, trun, UBIFS_TRUN_NODE_SZ, dlen ? 0 : 1); |
| 1136 | if (dlen) | 1190 | if (dlen) |
| 1137 | ubifs_prep_grp_node(c, dn, dlen, 1); | 1191 | ubifs_prep_grp_node(c, dn, dlen, 1); |
| @@ -1251,9 +1305,9 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, | |||
| 1251 | ubifs_prep_grp_node(c, xent, xlen, 0); | 1305 | ubifs_prep_grp_node(c, xent, xlen, 0); |
| 1252 | 1306 | ||
| 1253 | ino = (void *)xent + aligned_xlen; | 1307 | ino = (void *)xent + aligned_xlen; |
| 1254 | pack_inode(c, ino, inode, 0, 1); | 1308 | pack_inode(c, ino, inode, 0); |
| 1255 | ino = (void *)ino + UBIFS_INO_NODE_SZ; | 1309 | ino = (void *)ino + UBIFS_INO_NODE_SZ; |
| 1256 | pack_inode(c, ino, host, 1, 0); | 1310 | pack_inode(c, ino, host, 1); |
| 1257 | 1311 | ||
| 1258 | err = write_head(c, BASEHD, xent, len, &lnum, &xent_offs, sync); | 1312 | err = write_head(c, BASEHD, xent, len, &lnum, &xent_offs, sync); |
| 1259 | if (!sync && !err) | 1313 | if (!sync && !err) |
| @@ -1320,7 +1374,7 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode, | |||
| 1320 | const struct inode *host) | 1374 | const struct inode *host) |
| 1321 | { | 1375 | { |
| 1322 | int err, len1, len2, aligned_len, aligned_len1, lnum, offs; | 1376 | int err, len1, len2, aligned_len, aligned_len1, lnum, offs; |
| 1323 | struct ubifs_inode *host_ui = ubifs_inode(inode); | 1377 | struct ubifs_inode *host_ui = ubifs_inode(host); |
| 1324 | struct ubifs_ino_node *ino; | 1378 | struct ubifs_ino_node *ino; |
| 1325 | union ubifs_key key; | 1379 | union ubifs_key key; |
| 1326 | int sync = IS_DIRSYNC(host); | 1380 | int sync = IS_DIRSYNC(host); |
| @@ -1344,8 +1398,8 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode, | |||
| 1344 | if (err) | 1398 | if (err) |
| 1345 | goto out_free; | 1399 | goto out_free; |
| 1346 | 1400 | ||
| 1347 | pack_inode(c, ino, host, 0, 0); | 1401 | pack_inode(c, ino, host, 0); |
| 1348 | pack_inode(c, (void *)ino + aligned_len1, inode, 1, 0); | 1402 | pack_inode(c, (void *)ino + aligned_len1, inode, 1); |
| 1349 | 1403 | ||
| 1350 | err = write_head(c, BASEHD, ino, aligned_len, &lnum, &offs, 0); | 1404 | err = write_head(c, BASEHD, ino, aligned_len, &lnum, &offs, 0); |
| 1351 | if (!sync && !err) { | 1405 | if (!sync && !err) { |
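The new ubifs_jnl_delete_inode() above avoids writing a second deletion inode when no commit has happened since the unlink, by comparing @del_cmtno with @c->cmt_no once without the lock and again under @c->commit_sem. A compact sketch of that double-checked pattern, with a pthread rwlock standing in for the kernel rwsem; the names are placeholders, not the UBIFS API:

#include <pthread.h>
#include <stdio.h>

/* Hypothetical state: the global commit number and a per-inode snapshot of
 * it taken when the deletion inode was first written (at unlink time). */
struct fs { unsigned long long cmt_no; pthread_rwlock_t commit_sem; };
struct ino { unsigned long long del_cmtno; unsigned long ino; };

static int write_deletion_inode(struct fs *c, struct ino *ui)
{
	(void)c;
	printf("ino %lu: commit happened, write deletion inode again\n", ui->ino);
	return 0;
}

static int remove_from_index(struct fs *c, struct ino *ui)
{
	(void)c;
	printf("ino %lu: drop from TNC and orphan list only\n", ui->ino);
	return 0;
}

/* Mirrors ubifs_jnl_delete_inode(): cheap unlocked check first, then the
 * authoritative check with the commit excluded by the read lock. */
static int delete_inode(struct fs *c, struct ino *ui)
{
	int err;

	if (ui->del_cmtno != c->cmt_no)
		return write_deletion_inode(c, ui);

	pthread_rwlock_rdlock(&c->commit_sem);
	if (ui->del_cmtno != c->cmt_no) {
		/* A commit sneaked in between the two checks. */
		pthread_rwlock_unlock(&c->commit_sem);
		return write_deletion_inode(c, ui);
	}
	err = remove_from_index(c, ui);
	pthread_rwlock_unlock(&c->commit_sem);
	return err;
}

int main(void)
{
	struct fs c = { .cmt_no = 7, .commit_sem = PTHREAD_RWLOCK_INITIALIZER };
	struct ino fresh = { .del_cmtno = 7, .ino = 100 };
	struct ino stale = { .del_cmtno = 5, .ino = 101 };

	delete_inode(&c, &fresh);  /* same commit: no extra deletion inode */
	delete_inode(&c, &stale);  /* older commit: must write it again */
	return 0;
}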
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index 36857b9ed59e..3e0aa7367556 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c | |||
| @@ -317,6 +317,8 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) | |||
| 317 | return 0; | 317 | return 0; |
| 318 | 318 | ||
| 319 | out_unlock: | 319 | out_unlock: |
| 320 | if (err != -EAGAIN) | ||
| 321 | ubifs_ro_mode(c, err); | ||
| 320 | mutex_unlock(&c->log_mutex); | 322 | mutex_unlock(&c->log_mutex); |
| 321 | kfree(ref); | 323 | kfree(ref); |
| 322 | kfree(bud); | 324 | kfree(bud); |
| @@ -410,7 +412,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum) | |||
| 410 | return -ENOMEM; | 412 | return -ENOMEM; |
| 411 | 413 | ||
| 412 | cs->ch.node_type = UBIFS_CS_NODE; | 414 | cs->ch.node_type = UBIFS_CS_NODE; |
| 413 | cs->cmt_no = cpu_to_le64(c->cmt_no + 1); | 415 | cs->cmt_no = cpu_to_le64(c->cmt_no); |
| 414 | ubifs_prepare_node(c, cs, UBIFS_CS_NODE_SZ, 0); | 416 | ubifs_prepare_node(c, cs, UBIFS_CS_NODE_SZ, 0); |
| 415 | 417 | ||
| 416 | /* | 418 | /* |
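The out_unlock change above switches the file system to read-only for any bud-allocation failure other than -EAGAIN, using the ubifs_ro_mode() helper this series moves from misc.h into io.c. A small stand-alone sketch of that "latch read-only once on a fatal error" pattern, with invented names:

#include <errno.h>
#include <stdio.h>

struct fs { int ro_media; };

/* Switch to read-only mode once; later callers just see the flag set. */
static void ro_mode(struct fs *c, int err)
{
	if (!c->ro_media) {
		c->ro_media = 1;
		fprintf(stderr, "switched to read-only mode, error %d\n", err);
	}
}

/* Error path of a log operation: -EAGAIN means "retry later" and is
 * harmless, anything else leaves the on-flash log in an unknown state, so
 * the FS is latched read-only before the error is propagated. */
static int handle_log_error(struct fs *c, int err)
{
	if (err != -EAGAIN)
		ro_mode(c, err);
	return err;
}

int main(void)
{
	struct fs c = { 0 };

	handle_log_error(&c, -EAGAIN);  /* still writable */
	printf("ro_media = %d\n", c.ro_media);
	handle_log_error(&c, -EIO);     /* latched read-only */
	printf("ro_media = %d\n", c.ro_media);
	return 0;
}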
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h index 4beccfc256d2..4c12a9215d7f 100644 --- a/fs/ubifs/misc.h +++ b/fs/ubifs/misc.h | |||
| @@ -80,20 +80,6 @@ static inline struct ubifs_inode *ubifs_inode(const struct inode *inode) | |||
| 80 | } | 80 | } |
| 81 | 81 | ||
| 82 | /** | 82 | /** |
| 83 | * ubifs_ro_mode - switch UBIFS to read read-only mode. | ||
| 84 | * @c: UBIFS file-system description object | ||
| 85 | * @err: error code which is the reason of switching to R/O mode | ||
| 86 | */ | ||
| 87 | static inline void ubifs_ro_mode(struct ubifs_info *c, int err) | ||
| 88 | { | ||
| 89 | if (!c->ro_media) { | ||
| 90 | c->ro_media = 1; | ||
| 91 | ubifs_warn("switched to read-only mode, error %d", err); | ||
| 92 | dbg_dump_stack(); | ||
| 93 | } | ||
| 94 | } | ||
| 95 | |||
| 96 | /** | ||
| 97 | * ubifs_compr_present - check if compressor was compiled in. | 83 | * ubifs_compr_present - check if compressor was compiled in. |
| 98 | * @compr_type: compressor type to check | 84 | * @compr_type: compressor type to check |
| 99 | * | 85 | * |
| @@ -298,38 +284,6 @@ static inline void *ubifs_idx_key(const struct ubifs_info *c, | |||
| 298 | } | 284 | } |
| 299 | 285 | ||
| 300 | /** | 286 | /** |
| 301 | * ubifs_reported_space - calculate reported free space. | ||
| 302 | * @c: the UBIFS file-system description object | ||
| 303 | * @free: amount of free space | ||
| 304 | * | ||
| 305 | * This function calculates amount of free space which will be reported to | ||
| 306 | * user-space. User-space application tend to expect that if the file-system | ||
| 307 | * (e.g., via the 'statfs()' call) reports that it has N bytes available, they | ||
| 308 | * are able to write a file of size N. UBIFS attaches node headers to each data | ||
| 309 | * node and it has to write indexind nodes as well. This introduces additional | ||
| 310 | * overhead, and UBIFS it has to report sligtly less free space to meet the | ||
| 311 | * above expectetion. | ||
| 312 | * | ||
| 313 | * This function assumes free space is made up of uncompressed data nodes and | ||
| 314 | * full index nodes (one per data node, doubled because we always allow enough | ||
| 315 | * space to write the index twice). | ||
| 316 | * | ||
| 317 | * Note, the calculation is pessimistic, which means that most of the time | ||
| 318 | * UBIFS reports less space than it actually has. | ||
| 319 | */ | ||
| 320 | static inline long long ubifs_reported_space(const struct ubifs_info *c, | ||
| 321 | uint64_t free) | ||
| 322 | { | ||
| 323 | int divisor, factor; | ||
| 324 | |||
| 325 | divisor = UBIFS_MAX_DATA_NODE_SZ + (c->max_idx_node_sz << 1); | ||
| 326 | factor = UBIFS_MAX_DATA_NODE_SZ - UBIFS_DATA_NODE_SZ; | ||
| 327 | do_div(free, divisor); | ||
| 328 | |||
| 329 | return free * factor; | ||
| 330 | } | ||
| 331 | |||
| 332 | /** | ||
| 333 | * ubifs_current_time - round current time to time granularity. | 287 | * ubifs_current_time - round current time to time granularity. |
| 334 | * @inode: inode | 288 | * @inode: inode |
| 335 | */ | 289 | */ |
| @@ -339,4 +293,21 @@ static inline struct timespec ubifs_current_time(struct inode *inode) | |||
| 339 | current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; | 293 | current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; |
| 340 | } | 294 | } |
| 341 | 295 | ||
| 296 | /** | ||
| 297 | * ubifs_tnc_lookup - look up a file-system node. | ||
| 298 | * @c: UBIFS file-system description object | ||
| 299 | * @key: node key to lookup | ||
| 300 | * @node: the node is returned here | ||
| 301 | * | ||
| 302 | * This function looks up and reads the node with key @key. The caller has to make | ||
| 303 | * sure the @node buffer is large enough to fit the node. Returns zero in case | ||
| 304 | * of success, %-ENOENT if the node was not found, and a negative error code in | ||
| 305 | * case of failure. | ||
| 306 | */ | ||
| 307 | static inline int ubifs_tnc_lookup(struct ubifs_info *c, | ||
| 308 | const union ubifs_key *key, void *node) | ||
| 309 | { | ||
| 310 | return ubifs_tnc_locate(c, key, node, NULL, NULL); | ||
| 311 | } | ||
| 312 | |||
| 342 | #endif /* __UBIFS_MISC_H__ */ | 313 | #endif /* __UBIFS_MISC_H__ */ |
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 3afeb9242c6a..02d3462f4d3e 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c | |||
| @@ -310,10 +310,10 @@ static int write_orph_node(struct ubifs_info *c, int atomic) | |||
| 310 | c->cmt_orphans -= cnt; | 310 | c->cmt_orphans -= cnt; |
| 311 | spin_unlock(&c->orphan_lock); | 311 | spin_unlock(&c->orphan_lock); |
| 312 | if (c->cmt_orphans) | 312 | if (c->cmt_orphans) |
| 313 | orph->cmt_no = cpu_to_le64(c->cmt_no + 1); | 313 | orph->cmt_no = cpu_to_le64(c->cmt_no); |
| 314 | else | 314 | else |
| 315 | /* Mark the last node of the commit */ | 315 | /* Mark the last node of the commit */ |
| 316 | orph->cmt_no = cpu_to_le64((c->cmt_no + 1) | (1ULL << 63)); | 316 | orph->cmt_no = cpu_to_le64((c->cmt_no) | (1ULL << 63)); |
| 317 | ubifs_assert(c->ohead_offs + len <= c->leb_size); | 317 | ubifs_assert(c->ohead_offs + len <= c->leb_size); |
| 318 | ubifs_assert(c->ohead_lnum >= c->orph_first); | 318 | ubifs_assert(c->ohead_lnum >= c->orph_first); |
| 319 | ubifs_assert(c->ohead_lnum <= c->orph_last); | 319 | ubifs_assert(c->ohead_lnum <= c->orph_last); |
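write_orph_node() above stamps each orphan node with the current commit number and marks the last node written for a commit by setting bit 63 of that field. A self-contained sketch of the encoding and the matching decode; the helper names are made up, and the little-endian conversion the kernel applies on top is left out:

#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define LAST_NODE_FLAG (1ULL << 63)

/* Pack a commit number, optionally tagging it as the last orphan node of
 * this commit (bit 63 is never needed by the counter itself). */
static uint64_t pack_cmt_no(uint64_t cmt_no, bool last)
{
	return last ? (cmt_no | LAST_NODE_FLAG) : cmt_no;
}

static bool is_last_node(uint64_t field)  { return field & LAST_NODE_FLAG; }
static uint64_t cmt_no_of(uint64_t field) { return field & ~LAST_NODE_FLAG; }

int main(void)
{
	uint64_t middle = pack_cmt_no(42, false);
	uint64_t last = pack_cmt_no(42, true);

	printf("middle: cmt_no %" PRIu64 ", last %d\n",
	       cmt_no_of(middle), is_last_node(middle));
	printf("last:   cmt_no %" PRIu64 ", last %d\n",
	       cmt_no_of(last), is_last_node(last));
	return 0;
}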
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 00eb9c68ad03..7562464ac83f 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c | |||
| @@ -30,7 +30,6 @@ | |||
| 30 | #include <linux/slab.h> | 30 | #include <linux/slab.h> |
| 31 | #include <linux/module.h> | 31 | #include <linux/module.h> |
| 32 | #include <linux/ctype.h> | 32 | #include <linux/ctype.h> |
| 33 | #include <linux/random.h> | ||
| 34 | #include <linux/kthread.h> | 33 | #include <linux/kthread.h> |
| 35 | #include <linux/parser.h> | 34 | #include <linux/parser.h> |
| 36 | #include <linux/seq_file.h> | 35 | #include <linux/seq_file.h> |
| @@ -149,7 +148,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) | |||
| 149 | if (err) | 148 | if (err) |
| 150 | goto out_invalid; | 149 | goto out_invalid; |
| 151 | 150 | ||
| 152 | /* Disable readahead */ | 151 | /* Disable read-ahead */ |
| 153 | inode->i_mapping->backing_dev_info = &c->bdi; | 152 | inode->i_mapping->backing_dev_info = &c->bdi; |
| 154 | 153 | ||
| 155 | switch (inode->i_mode & S_IFMT) { | 154 | switch (inode->i_mode & S_IFMT) { |
| @@ -278,7 +277,7 @@ static void ubifs_destroy_inode(struct inode *inode) | |||
| 278 | */ | 277 | */ |
| 279 | static int ubifs_write_inode(struct inode *inode, int wait) | 278 | static int ubifs_write_inode(struct inode *inode, int wait) |
| 280 | { | 279 | { |
| 281 | int err; | 280 | int err = 0; |
| 282 | struct ubifs_info *c = inode->i_sb->s_fs_info; | 281 | struct ubifs_info *c = inode->i_sb->s_fs_info; |
| 283 | struct ubifs_inode *ui = ubifs_inode(inode); | 282 | struct ubifs_inode *ui = ubifs_inode(inode); |
| 284 | 283 | ||
| @@ -299,10 +298,18 @@ static int ubifs_write_inode(struct inode *inode, int wait) | |||
| 299 | return 0; | 298 | return 0; |
| 300 | } | 299 | } |
| 301 | 300 | ||
| 302 | dbg_gen("inode %lu", inode->i_ino); | 301 | /* |
| 303 | err = ubifs_jnl_write_inode(c, inode, 0); | 302 | * As an optimization, do not write orphan inodes to the media just |
| 304 | if (err) | 303 | * because this is not needed. |
| 305 | ubifs_err("can't write inode %lu, error %d", inode->i_ino, err); | 304 | */ |
| 305 | dbg_gen("inode %lu, mode %#x, nlink %u", | ||
| 306 | inode->i_ino, (int)inode->i_mode, inode->i_nlink); | ||
| 307 | if (inode->i_nlink) { | ||
| 308 | err = ubifs_jnl_write_inode(c, inode); | ||
| 309 | if (err) | ||
| 310 | ubifs_err("can't write inode %lu, error %d", | ||
| 311 | inode->i_ino, err); | ||
| 312 | } | ||
| 306 | 313 | ||
| 307 | ui->dirty = 0; | 314 | ui->dirty = 0; |
| 308 | mutex_unlock(&ui->ui_mutex); | 315 | mutex_unlock(&ui->ui_mutex); |
| @@ -314,8 +321,9 @@ static void ubifs_delete_inode(struct inode *inode) | |||
| 314 | { | 321 | { |
| 315 | int err; | 322 | int err; |
| 316 | struct ubifs_info *c = inode->i_sb->s_fs_info; | 323 | struct ubifs_info *c = inode->i_sb->s_fs_info; |
| 324 | struct ubifs_inode *ui = ubifs_inode(inode); | ||
| 317 | 325 | ||
| 318 | if (ubifs_inode(inode)->xattr) | 326 | if (ui->xattr) |
| 319 | /* | 327 | /* |
| 320 | * Extended attribute inode deletions are fully handled in | 328 | * Extended attribute inode deletions are fully handled in |
| 321 | * 'ubifs_removexattr()'. These inodes are special and have | 329 | * 'ubifs_removexattr()'. These inodes are special and have |
| @@ -323,7 +331,7 @@ static void ubifs_delete_inode(struct inode *inode) | |||
| 323 | */ | 331 | */ |
| 324 | goto out; | 332 | goto out; |
| 325 | 333 | ||
| 326 | dbg_gen("inode %lu", inode->i_ino); | 334 | dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode); |
| 327 | ubifs_assert(!atomic_read(&inode->i_count)); | 335 | ubifs_assert(!atomic_read(&inode->i_count)); |
| 328 | ubifs_assert(inode->i_nlink == 0); | 336 | ubifs_assert(inode->i_nlink == 0); |
| 329 | 337 | ||
| @@ -331,15 +339,19 @@ static void ubifs_delete_inode(struct inode *inode) | |||
| 331 | if (is_bad_inode(inode)) | 339 | if (is_bad_inode(inode)) |
| 332 | goto out; | 340 | goto out; |
| 333 | 341 | ||
| 334 | ubifs_inode(inode)->ui_size = inode->i_size = 0; | 342 | ui->ui_size = inode->i_size = 0; |
| 335 | err = ubifs_jnl_write_inode(c, inode, 1); | 343 | err = ubifs_jnl_delete_inode(c, inode); |
| 336 | if (err) | 344 | if (err) |
| 337 | /* | 345 | /* |
| 338 | * Worst case we have a lost orphan inode wasting space, so a | 346 | * Worst case we have a lost orphan inode wasting space, so a |
| 339 | * simple error message is ok here. | 347 | * simple error message is OK here. |
| 340 | */ | 348 | */ |
| 341 | ubifs_err("can't write inode %lu, error %d", inode->i_ino, err); | 349 | ubifs_err("can't delete inode %lu, error %d", |
| 350 | inode->i_ino, err); | ||
| 351 | |||
| 342 | out: | 352 | out: |
| 353 | if (ui->dirty) | ||
| 354 | ubifs_release_dirty_inode_budget(c, ui); | ||
| 343 | clear_inode(inode); | 355 | clear_inode(inode); |
| 344 | } | 356 | } |
| 345 | 357 | ||
| @@ -358,8 +370,9 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
| 358 | { | 370 | { |
| 359 | struct ubifs_info *c = dentry->d_sb->s_fs_info; | 371 | struct ubifs_info *c = dentry->d_sb->s_fs_info; |
| 360 | unsigned long long free; | 372 | unsigned long long free; |
| 373 | __le32 *uuid = (__le32 *)c->uuid; | ||
| 361 | 374 | ||
| 362 | free = ubifs_budg_get_free_space(c); | 375 | free = ubifs_get_free_space(c); |
| 363 | dbg_gen("free space %lld bytes (%lld blocks)", | 376 | dbg_gen("free space %lld bytes (%lld blocks)", |
| 364 | free, free >> UBIFS_BLOCK_SHIFT); | 377 | free, free >> UBIFS_BLOCK_SHIFT); |
| 365 | 378 | ||
| @@ -374,7 +387,8 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
| 374 | buf->f_files = 0; | 387 | buf->f_files = 0; |
| 375 | buf->f_ffree = 0; | 388 | buf->f_ffree = 0; |
| 376 | buf->f_namelen = UBIFS_MAX_NLEN; | 389 | buf->f_namelen = UBIFS_MAX_NLEN; |
| 377 | 390 | buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]); | |
| 391 | buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]); | ||
| 378 | return 0; | 392 | return 0; |
| 379 | } | 393 | } |
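The statfs() hunk above derives f_fsid by folding the 16-byte volume UUID into two 32-bit words with XOR. A stand-alone sketch of the same folding; the struct is a placeholder rather than the kernel's kstatfs, and byte order is simplified to host order where the kernel converts each word from the little-endian on-media layout:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct fsid { uint32_t val[2]; };

/* Fold a 16-byte UUID into a 64-bit fsid: words 0^2 and 1^3. */
static struct fsid uuid_to_fsid(const unsigned char uuid[16])
{
	uint32_t w[4];
	struct fsid id;

	memcpy(w, uuid, sizeof(w));
	id.val[0] = w[0] ^ w[2];
	id.val[1] = w[1] ^ w[3];
	return id;
}

int main(void)
{
	/* Arbitrary example UUID bytes. */
	const unsigned char uuid[16] = {
		0x12, 0x34, 0x56, 0x78, 0x9a, 0xbc, 0xde, 0xf0,
		0x0f, 0xed, 0xcb, 0xa9, 0x87, 0x65, 0x43, 0x21,
	};
	struct fsid id = uuid_to_fsid(uuid);

	printf("f_fsid = %08" PRIx32 ":%08" PRIx32 "\n", id.val[0], id.val[1]);
	return 0;
}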
| 380 | 394 | ||
| @@ -518,6 +532,12 @@ static int init_constants_early(struct ubifs_info *c) | |||
| 518 | c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size); | 532 | c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size); |
| 519 | c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size); | 533 | c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size); |
| 520 | 534 | ||
| 535 | /* | ||
| 536 | * Calculate how many bytes would be wasted at the end of LEB if it was | ||
| 537 | * fully filled with data nodes of maximum size. This is used in | ||
| 538 | * calculations when reporting free space. | ||
| 539 | */ | ||
| 540 | c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ; | ||
| 521 | return 0; | 541 | return 0; |
| 522 | } | 542 | } |
| 523 | 543 | ||
| @@ -635,13 +655,11 @@ static int init_constants_late(struct ubifs_info *c) | |||
| 635 | * internally because it does not make much sense for UBIFS, but it is | 655 | * internally because it does not make much sense for UBIFS, but it is |
| 636 | * necessary to report something for the 'statfs()' call. | 656 | * necessary to report something for the 'statfs()' call. |
| 637 | * | 657 | * |
| 638 | * Subtract the LEB reserved for GC and the LEB which is reserved for | 658 | * Subtract the LEB reserved for GC, the LEB which is reserved for |
| 639 | * deletions. | 659 | * deletions, and assume only one journal head is available. |
| 640 | * | ||
| 641 | * Review 'ubifs_calc_available()' if changing this calculation. | ||
| 642 | */ | 660 | */ |
| 643 | tmp64 = c->main_lebs - 2; | 661 | tmp64 = c->main_lebs - 2 - c->jhead_cnt + 1; |
| 644 | tmp64 *= (uint64_t)c->leb_size - c->dark_wm; | 662 | tmp64 *= (uint64_t)c->leb_size - c->leb_overhead; |
| 645 | tmp64 = ubifs_reported_space(c, tmp64); | 663 | tmp64 = ubifs_reported_space(c, tmp64); |
| 646 | c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT; | 664 | c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT; |
| 647 | 665 | ||
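The two hunks above change the reported-capacity estimate: @leb_overhead is the tail of an LEB that cannot hold another maximum-size data node, and the usable LEB count now also excludes all but one journal head. A small sketch of the arithmetic with clearly made-up geometry; ubifs_reported_space() itself is not reproduced, so a simple pass-through stands in for it:

#include <stdio.h>

/* Hypothetical geometry; the real values come from the superblock. */
struct geom {
	long long leb_size;          /* bytes per logical eraseblock */
	long long max_data_node_sz;  /* largest on-flash data node */
	long long main_lebs;         /* LEBs in the main area */
	long long jhead_cnt;         /* journal heads */
};

/* Stand-in for ubifs_reported_space(); the real function also discounts
 * per-node headers and index overhead. */
static long long reported_space(long long bytes) { return bytes; }

static long long estimated_capacity(const struct geom *g)
{
	/* Bytes wasted at the end of an LEB full of max-size data nodes. */
	long long leb_overhead = g->leb_size % g->max_data_node_sz;
	/* Usable LEBs: minus GC LEB, deletion LEB and all but one jhead. */
	long long lebs = g->main_lebs - 2 - g->jhead_cnt + 1;

	return reported_space(lebs * (g->leb_size - leb_overhead));
}

int main(void)
{
	/* Illustrative numbers only, roughly a 128 KiB LEB layout. */
	struct geom g = {
		.leb_size = 128 * 1024,
		.max_data_node_sz = 4144,
		.main_lebs = 1000,
		.jhead_cnt = 3,
	};

	printf("estimated capacity: %lld bytes\n", estimated_capacity(&g));
	return 0;
}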
| @@ -1122,8 +1140,8 @@ static int mount_ubifs(struct ubifs_info *c) | |||
| 1122 | if (err) | 1140 | if (err) |
| 1123 | goto out_infos; | 1141 | goto out_infos; |
| 1124 | 1142 | ||
| 1125 | ubifs_msg("mounted UBI device %d, volume %d", c->vi.ubi_num, | 1143 | ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"", |
| 1126 | c->vi.vol_id); | 1144 | c->vi.ubi_num, c->vi.vol_id, c->vi.name); |
| 1127 | if (mounted_read_only) | 1145 | if (mounted_read_only) |
| 1128 | ubifs_msg("mounted read-only"); | 1146 | ubifs_msg("mounted read-only"); |
| 1129 | x = (long long)c->main_lebs * c->leb_size; | 1147 | x = (long long)c->main_lebs * c->leb_size; |
| @@ -1469,6 +1487,7 @@ static void ubifs_put_super(struct super_block *sb) | |||
| 1469 | */ | 1487 | */ |
| 1470 | ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0); | 1488 | ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0); |
| 1471 | ubifs_assert(c->budg_idx_growth == 0); | 1489 | ubifs_assert(c->budg_idx_growth == 0); |
| 1490 | ubifs_assert(c->budg_dd_growth == 0); | ||
| 1472 | ubifs_assert(c->budg_data_growth == 0); | 1491 | ubifs_assert(c->budg_data_growth == 0); |
| 1473 | 1492 | ||
| 1474 | /* | 1493 | /* |
| @@ -1657,7 +1676,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) | |||
| 1657 | INIT_LIST_HEAD(&c->orph_new); | 1676 | INIT_LIST_HEAD(&c->orph_new); |
| 1658 | 1677 | ||
| 1659 | c->highest_inum = UBIFS_FIRST_INO; | 1678 | c->highest_inum = UBIFS_FIRST_INO; |
| 1660 | get_random_bytes(&c->vfs_gen, sizeof(int)); | ||
| 1661 | c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM; | 1679 | c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM; |
| 1662 | 1680 | ||
| 1663 | ubi_get_volume_info(ubi, &c->vi); | 1681 | ubi_get_volume_info(ubi, &c->vi); |
| @@ -1671,10 +1689,10 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) | |||
| 1671 | } | 1689 | } |
| 1672 | 1690 | ||
| 1673 | /* | 1691 | /* |
| 1674 | * UBIFS provids 'backing_dev_info' in order to disable readahead. For | 1692 | * UBIFS provides 'backing_dev_info' in order to disable read-ahead. For |
| 1675 | * UBIFS, I/O is not deferred, it is done immediately in readpage, | 1693 | * UBIFS, I/O is not deferred, it is done immediately in readpage, |
| 1676 | * which means the user would have to wait not just for their own I/O | 1694 | * which means the user would have to wait not just for their own I/O |
| 1677 | * but the readahead I/O as well i.e. completely pointless. | 1695 | * but the read-ahead I/O as well i.e. completely pointless. |
| 1678 | * | 1696 | * |
| 1679 | * Read-ahead will be disabled because @c->bdi.ra_pages is 0. | 1697 | * Read-ahead will be disabled because @c->bdi.ra_pages is 0. |
| 1680 | */ | 1698 | */ |
| @@ -1841,7 +1859,7 @@ static struct file_system_type ubifs_fs_type = { | |||
| 1841 | /* | 1859 | /* |
| 1842 | * Inode slab cache constructor. | 1860 | * Inode slab cache constructor. |
| 1843 | */ | 1861 | */ |
| 1844 | static void inode_slab_ctor(struct kmem_cache *cachep, void *obj) | 1862 | static void inode_slab_ctor(void *obj) |
| 1845 | { | 1863 | { |
| 1846 | struct ubifs_inode *ui = obj; | 1864 | struct ubifs_inode *ui = obj; |
| 1847 | inode_init_once(&ui->vfs_inode); | 1865 | inode_init_once(&ui->vfs_inode); |
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index e909f4a96443..7da209ab9378 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c | |||
| @@ -506,7 +506,7 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key, | |||
| 506 | if (keys_cmp(c, key, &node_key) != 0) | 506 | if (keys_cmp(c, key, &node_key) != 0) |
| 507 | ret = 0; | 507 | ret = 0; |
| 508 | } | 508 | } |
| 509 | if (ret == 0) | 509 | if (ret == 0 && c->replaying) |
| 510 | dbg_mnt("dangling branch LEB %d:%d len %d, key %s", | 510 | dbg_mnt("dangling branch LEB %d:%d len %d, key %s", |
| 511 | zbr->lnum, zbr->offs, zbr->len, DBGKEY(key)); | 511 | zbr->lnum, zbr->offs, zbr->len, DBGKEY(key)); |
| 512 | return ret; | 512 | return ret; |
| @@ -1382,50 +1382,39 @@ static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key, | |||
| 1382 | } | 1382 | } |
| 1383 | 1383 | ||
| 1384 | /** | 1384 | /** |
| 1385 | * ubifs_tnc_lookup - look up a file-system node. | 1385 | * maybe_leb_gced - determine if a LEB may have been garbage collected. |
| 1386 | * @c: UBIFS file-system description object | 1386 | * @c: UBIFS file-system description object |
| 1387 | * @key: node key to lookup | 1387 | * @lnum: LEB number |
| 1388 | * @node: the node is returned here | 1388 | * @gc_seq1: garbage collection sequence number |
| 1389 | * | 1389 | * |
| 1390 | * This function look up and reads node with key @key. The caller has to make | 1390 | * This function determines if @lnum may have been garbage collected since |
| 1391 | * sure the @node buffer is large enough to fit the node. Returns zero in case | 1391 | * sequence number @gc_seq1. If it may have been then %1 is returned, otherwise |
| 1392 | * of success, %-ENOENT if the node was not found, and a negative error code in | 1392 | * %0 is returned. |
| 1393 | * case of failure. | ||
| 1394 | */ | 1393 | */ |
| 1395 | int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key, | 1394 | static int maybe_leb_gced(struct ubifs_info *c, int lnum, int gc_seq1) |
| 1396 | void *node) | ||
| 1397 | { | 1395 | { |
| 1398 | int found, n, err; | 1396 | int gc_seq2, gced_lnum; |
| 1399 | struct ubifs_znode *znode; | ||
| 1400 | struct ubifs_zbranch zbr, *zt; | ||
| 1401 | 1397 | ||
| 1402 | mutex_lock(&c->tnc_mutex); | 1398 | gced_lnum = c->gced_lnum; |
| 1403 | found = ubifs_lookup_level0(c, key, &znode, &n); | 1399 | smp_rmb(); |
| 1404 | if (!found) { | 1400 | gc_seq2 = c->gc_seq; |
| 1405 | err = -ENOENT; | 1401 | /* Same seq means no GC */ |
| 1406 | goto out; | 1402 | if (gc_seq1 == gc_seq2) |
| 1407 | } else if (found < 0) { | 1403 | return 0; |
| 1408 | err = found; | 1404 | /* Different by more than 1 means we don't know */ |
| 1409 | goto out; | 1405 | if (gc_seq1 + 1 != gc_seq2) |
| 1410 | } | 1406 | return 1; |
| 1411 | zt = &znode->zbranch[n]; | 1407 | /* |
| 1412 | if (is_hash_key(c, key)) { | 1408 | * We have seen the sequence number has increased by 1. Now we need to |
| 1413 | /* | 1409 | * be sure we read the right LEB number, so read it again. |
| 1414 | * In this case the leaf node cache gets used, so we pass the | 1410 | */ |
| 1415 | * address of the zbranch and keep the mutex locked | 1411 | smp_rmb(); |
| 1416 | */ | 1412 | if (gced_lnum != c->gced_lnum) |
| 1417 | err = tnc_read_node_nm(c, zt, node); | 1413 | return 1; |
| 1418 | goto out; | 1414 | /* Finally we can check lnum */ |
| 1419 | } | 1415 | if (gced_lnum == lnum) |
| 1420 | zbr = znode->zbranch[n]; | 1416 | return 1; |
| 1421 | mutex_unlock(&c->tnc_mutex); | 1417 | return 0; |
| 1422 | |||
| 1423 | err = ubifs_tnc_read_node(c, &zbr, node); | ||
| 1424 | return err; | ||
| 1425 | |||
| 1426 | out: | ||
| 1427 | mutex_unlock(&c->tnc_mutex); | ||
| 1428 | return err; | ||
| 1429 | } | 1418 | } |
| 1430 | 1419 | ||
| 1431 | /** | 1420 | /** |
| @@ -1436,16 +1425,19 @@ out: | |||
| 1436 | * @lnum: LEB number is returned here | 1425 | * @lnum: LEB number is returned here |
| 1437 | * @offs: offset is returned here | 1426 | * @offs: offset is returned here |
| 1438 | * | 1427 | * |
| 1439 | * This function is the same as 'ubifs_tnc_lookup()' but it returns the node | 1428 | * This function looks up and reads the node with key @key. The caller has to make |
| 1440 | * location also. See 'ubifs_tnc_lookup()'. | 1429 | * sure the @node buffer is large enough to fit the node. Returns zero in case |
| 1430 | * of success, %-ENOENT if the node was not found, and a negative error code in | ||
| 1431 | * case of failure. The node location can be returned in @lnum and @offs. | ||
| 1441 | */ | 1432 | */ |
| 1442 | int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, | 1433 | int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, |
| 1443 | void *node, int *lnum, int *offs) | 1434 | void *node, int *lnum, int *offs) |
| 1444 | { | 1435 | { |
| 1445 | int found, n, err; | 1436 | int found, n, err, safely = 0, gc_seq1; |
| 1446 | struct ubifs_znode *znode; | 1437 | struct ubifs_znode *znode; |
| 1447 | struct ubifs_zbranch zbr, *zt; | 1438 | struct ubifs_zbranch zbr, *zt; |
| 1448 | 1439 | ||
| 1440 | again: | ||
| 1449 | mutex_lock(&c->tnc_mutex); | 1441 | mutex_lock(&c->tnc_mutex); |
| 1450 | found = ubifs_lookup_level0(c, key, &znode, &n); | 1442 | found = ubifs_lookup_level0(c, key, &znode, &n); |
| 1451 | if (!found) { | 1443 | if (!found) { |
| @@ -1456,24 +1448,43 @@ int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, | |||
| 1456 | goto out; | 1448 | goto out; |
| 1457 | } | 1449 | } |
| 1458 | zt = &znode->zbranch[n]; | 1450 | zt = &znode->zbranch[n]; |
| 1451 | if (lnum) { | ||
| 1452 | *lnum = zt->lnum; | ||
| 1453 | *offs = zt->offs; | ||
| 1454 | } | ||
| 1459 | if (is_hash_key(c, key)) { | 1455 | if (is_hash_key(c, key)) { |
| 1460 | /* | 1456 | /* |
| 1461 | * In this case the leaf node cache gets used, so we pass the | 1457 | * In this case the leaf node cache gets used, so we pass the |
| 1462 | * address of the zbranch and keep the mutex locked | 1458 | * address of the zbranch and keep the mutex locked |
| 1463 | */ | 1459 | */ |
| 1464 | *lnum = zt->lnum; | ||
| 1465 | *offs = zt->offs; | ||
| 1466 | err = tnc_read_node_nm(c, zt, node); | 1460 | err = tnc_read_node_nm(c, zt, node); |
| 1467 | goto out; | 1461 | goto out; |
| 1468 | } | 1462 | } |
| 1463 | if (safely) { | ||
| 1464 | err = ubifs_tnc_read_node(c, zt, node); | ||
| 1465 | goto out; | ||
| 1466 | } | ||
| 1467 | /* Drop the TNC mutex prematurely and race with garbage collection */ | ||
| 1469 | zbr = znode->zbranch[n]; | 1468 | zbr = znode->zbranch[n]; |
| 1469 | gc_seq1 = c->gc_seq; | ||
| 1470 | mutex_unlock(&c->tnc_mutex); | 1470 | mutex_unlock(&c->tnc_mutex); |
| 1471 | 1471 | ||
| 1472 | *lnum = zbr.lnum; | 1472 | if (ubifs_get_wbuf(c, zbr.lnum)) { |
| 1473 | *offs = zbr.offs; | 1473 | /* We do not GC journal heads */ |
| 1474 | err = ubifs_tnc_read_node(c, &zbr, node); | ||
| 1475 | return err; | ||
| 1476 | } | ||
| 1474 | 1477 | ||
| 1475 | err = ubifs_tnc_read_node(c, &zbr, node); | 1478 | err = fallible_read_node(c, key, &zbr, node); |
| 1476 | return err; | 1479 | if (maybe_leb_gced(c, zbr.lnum, gc_seq1)) { |
| 1480 | /* | ||
| 1481 | * The node may have been GC'ed out from under us so try again | ||
| 1482 | * while keeping the TNC mutex locked. | ||
| 1483 | */ | ||
| 1484 | safely = 1; | ||
| 1485 | goto again; | ||
| 1486 | } | ||
| 1487 | return 0; | ||
| 1477 | 1488 | ||
| 1478 | out: | 1489 | out: |
| 1479 | mutex_unlock(&c->tnc_mutex); | 1490 | mutex_unlock(&c->tnc_mutex); |
| @@ -1498,7 +1509,6 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, | |||
| 1498 | { | 1509 | { |
| 1499 | int found, n, err; | 1510 | int found, n, err; |
| 1500 | struct ubifs_znode *znode; | 1511 | struct ubifs_znode *znode; |
| 1501 | struct ubifs_zbranch zbr; | ||
| 1502 | 1512 | ||
| 1503 | dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key)); | 1513 | dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key)); |
| 1504 | mutex_lock(&c->tnc_mutex); | 1514 | mutex_lock(&c->tnc_mutex); |
| @@ -1522,11 +1532,7 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, | |||
| 1522 | goto out_unlock; | 1532 | goto out_unlock; |
| 1523 | } | 1533 | } |
| 1524 | 1534 | ||
| 1525 | zbr = znode->zbranch[n]; | 1535 | err = tnc_read_node_nm(c, &znode->zbranch[n], node); |
| 1526 | mutex_unlock(&c->tnc_mutex); | ||
| 1527 | |||
| 1528 | err = tnc_read_node_nm(c, &zbr, node); | ||
| 1529 | return err; | ||
| 1530 | 1536 | ||
| 1531 | out_unlock: | 1537 | out_unlock: |
| 1532 | mutex_unlock(&c->tnc_mutex); | 1538 | mutex_unlock(&c->tnc_mutex); |
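The ubifs_tnc_locate() rework above drops the TNC mutex before reading the leaf node and then uses maybe_leb_gced() to detect whether GC could have moved the node in the meantime; if it could have, the lookup is retried with the mutex held. A user-space sketch of the reader side, matching the writer sketch given after the fs/ubifs/gc.c hunks; the names are invented and C11 acquire fences stand in for smp_rmb():

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Same shared state as in the GC writer sketch. */
static _Atomic int gced_lnum;
static _Atomic int gc_seq;

/*
 * Mirrors maybe_leb_gced(): @gc_seq1 was sampled while the TNC mutex was
 * still held.  Returns true if LEB @lnum may have been garbage collected
 * since then, i.e. the unlocked read of the node cannot be trusted.
 */
static bool maybe_leb_gced(int lnum, int gc_seq1)
{
	int gced = atomic_load_explicit(&gced_lnum, memory_order_relaxed);
	int gc_seq2;

	atomic_thread_fence(memory_order_acquire);
	gc_seq2 = atomic_load_explicit(&gc_seq, memory_order_relaxed);
	if (gc_seq1 == gc_seq2)
		return false;              /* no GC since the sample */
	if (gc_seq1 + 1 != gc_seq2)
		return true;               /* more than one run: give up */
	/* Exactly one GC run: re-read the LEB number it touched. */
	atomic_thread_fence(memory_order_acquire);
	if (gced != atomic_load_explicit(&gced_lnum, memory_order_relaxed))
		return true;
	return gced == lnum;
}

/*
 * Skeleton of the retry loop in ubifs_tnc_locate(): read the node without
 * the TNC mutex first, and only if GC may have moved it, retry "safely",
 * i.e. with the mutex held across the read.
 */
static int locate_node(int lnum, void *node)
{
	bool safely = false;
	int gc_seq1;

	(void)node;
again:
	/* ... lock the TNC and find the branch pointing at @lnum ... */
	if (safely)
		return 0;                  /* locked read: nothing can move */
	gc_seq1 = atomic_load_explicit(&gc_seq, memory_order_relaxed);
	/* ... unlock the TNC and read the node from @lnum speculatively ... */
	if (maybe_leb_gced(lnum, gc_seq1)) {
		safely = true;
		goto again;
	}
	return 0;
}

int main(void)
{
	char buf[64];
	int seq0 = atomic_load(&gc_seq);

	/* Pretend one GC run moved LEB 17 after the sequence was sampled. */
	atomic_store(&gced_lnum, 17);
	atomic_fetch_add(&gc_seq, 1);

	printf("LEB 17 suspect: %d\n", maybe_leb_gced(17, seq0));
	printf("LEB 20 suspect: %d\n", maybe_leb_gced(20, seq0));
	return locate_node(20, buf);
}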
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index 8117e65ba2e9..8ac76b1c2d55 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c | |||
| @@ -372,26 +372,25 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt) | |||
| 372 | written = layout_leb_in_gaps(c, p); | 372 | written = layout_leb_in_gaps(c, p); |
| 373 | if (written < 0) { | 373 | if (written < 0) { |
| 374 | err = written; | 374 | err = written; |
| 375 | if (err == -ENOSPC) { | 375 | if (err != -ENOSPC) { |
| 376 | if (!dbg_force_in_the_gaps_enabled) { | 376 | kfree(c->gap_lebs); |
| 377 | /* | 377 | c->gap_lebs = NULL; |
| 378 | * Do not print scary warnings if the | 378 | return err; |
| 379 | * debugging option which forces | ||
| 380 | * in-the-gaps is enabled. | ||
| 381 | */ | ||
| 382 | ubifs_err("out of space"); | ||
| 383 | spin_lock(&c->space_lock); | ||
| 384 | dbg_dump_budg(c); | ||
| 385 | spin_unlock(&c->space_lock); | ||
| 386 | dbg_dump_lprops(c); | ||
| 387 | } | ||
| 388 | /* Try to commit anyway */ | ||
| 389 | err = 0; | ||
| 390 | break; | ||
| 391 | } | 379 | } |
| 392 | kfree(c->gap_lebs); | 380 | if (!dbg_force_in_the_gaps_enabled) { |
| 393 | c->gap_lebs = NULL; | 381 | /* |
| 394 | return err; | 382 | * Do not print scary warnings if the debugging |
| 383 | * option which forces in-the-gaps is enabled. | ||
| 384 | */ | ||
| 385 | ubifs_err("out of space"); | ||
| 386 | spin_lock(&c->space_lock); | ||
| 387 | dbg_dump_budg(c); | ||
| 388 | spin_unlock(&c->space_lock); | ||
| 389 | dbg_dump_lprops(c); | ||
| 390 | } | ||
| 391 | /* Try to commit anyway */ | ||
| 392 | err = 0; | ||
| 393 | break; | ||
| 395 | } | 394 | } |
| 396 | p++; | 395 | p++; |
| 397 | cnt -= written; | 396 | cnt -= written; |
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h index 0cc7da9bed47..a9ecbd9af20d 100644 --- a/fs/ubifs/ubifs-media.h +++ b/fs/ubifs/ubifs-media.h | |||
| @@ -87,7 +87,7 @@ | |||
| 87 | #define UBIFS_SK_LEN 8 | 87 | #define UBIFS_SK_LEN 8 |
| 88 | 88 | ||
| 89 | /* Minimum index tree fanout */ | 89 | /* Minimum index tree fanout */ |
| 90 | #define UBIFS_MIN_FANOUT 2 | 90 | #define UBIFS_MIN_FANOUT 3 |
| 91 | 91 | ||
| 92 | /* Maximum number of levels in UBIFS indexing B-tree */ | 92 | /* Maximum number of levels in UBIFS indexing B-tree */ |
| 93 | #define UBIFS_MAX_LEVELS 512 | 93 | #define UBIFS_MAX_LEVELS 512 |
| @@ -228,10 +228,10 @@ enum { | |||
| 228 | /* Minimum number of orphan area logical eraseblocks */ | 228 | /* Minimum number of orphan area logical eraseblocks */ |
| 229 | #define UBIFS_MIN_ORPH_LEBS 1 | 229 | #define UBIFS_MIN_ORPH_LEBS 1 |
| 230 | /* | 230 | /* |
| 231 | * Minimum number of main area logical eraseblocks (buds, 2 for the index, 1 | 231 | * Minimum number of main area logical eraseblocks (buds, 3 for the index, 1 |
| 232 | * for GC, 1 for deletions, and at least 1 for committed data). | 232 | * for GC, 1 for deletions, and at least 1 for committed data). |
| 233 | */ | 233 | */ |
| 234 | #define UBIFS_MIN_MAIN_LEBS (UBIFS_MIN_BUD_LEBS + 5) | 234 | #define UBIFS_MIN_MAIN_LEBS (UBIFS_MIN_BUD_LEBS + 6) |
| 235 | 235 | ||
| 236 | /* Minimum number of logical eraseblocks */ | 236 | /* Minimum number of logical eraseblocks */ |
| 237 | #define UBIFS_MIN_LEB_CNT (UBIFS_SB_LEBS + UBIFS_MST_LEBS + \ | 237 | #define UBIFS_MIN_LEB_CNT (UBIFS_SB_LEBS + UBIFS_MST_LEBS + \ |
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index e4f89f271827..17c620b93eec 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h | |||
| @@ -20,8 +20,6 @@ | |||
| 20 | * Adrian Hunter | 20 | * Adrian Hunter |
| 21 | */ | 21 | */ |
| 22 | 22 | ||
| 23 | /* Implementation version 0.7 */ | ||
| 24 | |||
| 25 | #ifndef __UBIFS_H__ | 23 | #ifndef __UBIFS_H__ |
| 26 | #define __UBIFS_H__ | 24 | #define __UBIFS_H__ |
| 27 | 25 | ||
| @@ -322,6 +320,8 @@ struct ubifs_gced_idx_leb { | |||
| 322 | * struct ubifs_inode - UBIFS in-memory inode description. | 320 | * struct ubifs_inode - UBIFS in-memory inode description. |
| 323 | * @vfs_inode: VFS inode description object | 321 | * @vfs_inode: VFS inode description object |
| 324 | * @creat_sqnum: sequence number at time of creation | 322 | * @creat_sqnum: sequence number at time of creation |
| 323 | * @del_cmtno: commit number corresponding to the time the inode was deleted, | ||
| 324 | * protected by @c->commit_sem; | ||
| 325 | * @xattr_size: summarized size of all extended attributes in bytes | 325 | * @xattr_size: summarized size of all extended attributes in bytes |
| 326 | * @xattr_cnt: count of extended attributes this inode has | 326 | * @xattr_cnt: count of extended attributes this inode has |
| 327 | * @xattr_names: sum of lengths of all extended attribute names belonging to | 327 | * @xattr_names: sum of lengths of all extended attribute names belonging to |
| @@ -373,6 +373,7 @@ struct ubifs_gced_idx_leb { | |||
| 373 | struct ubifs_inode { | 373 | struct ubifs_inode { |
| 374 | struct inode vfs_inode; | 374 | struct inode vfs_inode; |
| 375 | unsigned long long creat_sqnum; | 375 | unsigned long long creat_sqnum; |
| 376 | unsigned long long del_cmtno; | ||
| 376 | unsigned int xattr_size; | 377 | unsigned int xattr_size; |
| 377 | unsigned int xattr_cnt; | 378 | unsigned int xattr_cnt; |
| 378 | unsigned int xattr_names; | 379 | unsigned int xattr_names; |
| @@ -779,7 +780,7 @@ struct ubifs_compressor { | |||
| 779 | /** | 780 | /** |
| 780 | * struct ubifs_budget_req - budget requirements of an operation. | 781 | * struct ubifs_budget_req - budget requirements of an operation. |
| 781 | * | 782 | * |
| 782 | * @fast: non-zero if the budgeting should try to aquire budget quickly and | 783 | * @fast: non-zero if the budgeting should try to acquire budget quickly and |
| 783 | * should not try to call write-back | 784 | * should not try to call write-back |
| 784 | * @recalculate: non-zero if @idx_growth, @data_growth, and @dd_growth fields | 785 | * @recalculate: non-zero if @idx_growth, @data_growth, and @dd_growth fields |
| 785 | * have to be re-calculated | 786 | * have to be re-calculated |
| @@ -805,21 +806,31 @@ struct ubifs_compressor { | |||
| 805 | * An inode may contain 4KiB of data at max., thus the widths of @new_ino_d | 806 | * An inode may contain 4KiB of data at max., thus the widths of @new_ino_d |
| 806 | * is 13 bits, and @dirtied_ino_d - 15, because up to 4 inodes may be made | 807 | * is 13 bits, and @dirtied_ino_d - 15, because up to 4 inodes may be made |
| 807 | * dirty by the re-name operation. | 808 | * dirty by the re-name operation. |
| 809 | * | ||
| 810 | * Note, UBIFS aligns node lengths to an 8-byte boundary, so the requester has to | ||
| 811 | * make sure the amount of inode data which contributes to the @new_ino_d and | ||
| 812 | * @dirtied_ino_d fields is 8-byte aligned. | ||
| 808 | */ | 813 | */ |
| 809 | struct ubifs_budget_req { | 814 | struct ubifs_budget_req { |
| 810 | unsigned int fast:1; | 815 | unsigned int fast:1; |
| 811 | unsigned int recalculate:1; | 816 | unsigned int recalculate:1; |
| 817 | #ifndef UBIFS_DEBUG | ||
| 812 | unsigned int new_page:1; | 818 | unsigned int new_page:1; |
| 813 | unsigned int dirtied_page:1; | 819 | unsigned int dirtied_page:1; |
| 814 | unsigned int new_dent:1; | 820 | unsigned int new_dent:1; |
| 815 | unsigned int mod_dent:1; | 821 | unsigned int mod_dent:1; |
| 816 | unsigned int new_ino:1; | 822 | unsigned int new_ino:1; |
| 817 | unsigned int new_ino_d:13; | 823 | unsigned int new_ino_d:13; |
| 818 | #ifndef UBIFS_DEBUG | ||
| 819 | unsigned int dirtied_ino:4; | 824 | unsigned int dirtied_ino:4; |
| 820 | unsigned int dirtied_ino_d:15; | 825 | unsigned int dirtied_ino_d:15; |
| 821 | #else | 826 | #else |
| 822 | /* Not bit-fields to check for overflows */ | 827 | /* Not bit-fields to check for overflows */ |
| 828 | unsigned int new_page; | ||
| 829 | unsigned int dirtied_page; | ||
| 830 | unsigned int new_dent; | ||
| 831 | unsigned int mod_dent; | ||
| 832 | unsigned int new_ino; | ||
| 833 | unsigned int new_ino_d; | ||
| 823 | unsigned int dirtied_ino; | 834 | unsigned int dirtied_ino; |
| 824 | unsigned int dirtied_ino_d; | 835 | unsigned int dirtied_ino_d; |
| 825 | #endif | 836 | #endif |
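The note added above says callers must pass 8-byte-aligned data lengths in @new_ino_d and @dirtied_ino_d, which is why the fs/ubifs/file.c hunks now wrap ui->data_len in ALIGN(..., 8). A tiny sketch of what that rounding does; the macro is redefined locally (power-of-two alignment only) so the example stands alone:

#include <stdio.h>

/* Same power-of-two round-up the kernel's ALIGN() macro performs. */
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned int lens[] = { 0, 1, 7, 8, 9, 160 };

	for (unsigned int i = 0; i < sizeof(lens) / sizeof(lens[0]); i++)
		printf("data_len %3u -> budgeted as %3u\n",
		       lens[i], ALIGN_UP(lens[i], 8));
	return 0;
}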
| @@ -860,13 +871,13 @@ struct ubifs_mount_opts { | |||
| 860 | * struct ubifs_info - UBIFS file-system description data structure | 871 | * struct ubifs_info - UBIFS file-system description data structure |
| 861 | * (per-superblock). | 872 | * (per-superblock). |
| 862 | * @vfs_sb: VFS @struct super_block object | 873 | * @vfs_sb: VFS @struct super_block object |
| 863 | * @bdi: backing device info object to make VFS happy and disable readahead | 874 | * @bdi: backing device info object to make VFS happy and disable read-ahead |
| 864 | * | 875 | * |
| 865 | * @highest_inum: highest used inode number | 876 | * @highest_inum: highest used inode number |
| 866 | * @vfs_gen: VFS inode generation counter | ||
| 867 | * @max_sqnum: current global sequence number | 877 | * @max_sqnum: current global sequence number |
| 868 | * @cmt_no: commit number (last successfully completed commit) | 878 | * @cmt_no: commit number of the last successfully completed commit, protected |
| 869 | * @cnt_lock: protects @highest_inum, @vfs_gen, and @max_sqnum counters | 879 | * by @commit_sem |
| 880 | * @cnt_lock: protects @highest_inum and @max_sqnum counters | ||
| 870 | * @fmt_version: UBIFS on-flash format version | 881 | * @fmt_version: UBIFS on-flash format version |
| 871 | * @uuid: UUID from super block | 882 | * @uuid: UUID from super block |
| 872 | * | 883 | * |
| @@ -984,6 +995,9 @@ struct ubifs_mount_opts { | |||
| 984 | * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary | 995 | * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary |
| 985 | * @max_inode_sz: maximum possible inode size in bytes | 996 | * @max_inode_sz: maximum possible inode size in bytes |
| 986 | * @max_znode_sz: size of znode in bytes | 997 | * @max_znode_sz: size of znode in bytes |
| 998 | * | ||
| 999 | * @leb_overhead: how many bytes are wasted in an LEB when it is filled with | ||
| 1000 | * data nodes of maximum size - used in free space reporting | ||
| 987 | * @dead_wm: LEB dead space watermark | 1001 | * @dead_wm: LEB dead space watermark |
| 988 | * @dark_wm: LEB dark space watermark | 1002 | * @dark_wm: LEB dark space watermark |
| 989 | * @block_cnt: count of 4KiB blocks on the FS | 1003 | * @block_cnt: count of 4KiB blocks on the FS |
| @@ -1017,6 +1031,8 @@ struct ubifs_mount_opts { | |||
| 1017 | * @sbuf: a buffer of LEB size used by GC and replay for scanning | 1031 | * @sbuf: a buffer of LEB size used by GC and replay for scanning |
| 1018 | * @idx_gc: list of index LEBs that have been garbage collected | 1032 | * @idx_gc: list of index LEBs that have been garbage collected |
| 1019 | * @idx_gc_cnt: number of elements on the idx_gc list | 1033 | * @idx_gc_cnt: number of elements on the idx_gc list |
| 1034 | * @gc_seq: incremented for every non-index LEB garbage collected | ||
| 1035 | * @gced_lnum: last non-index LEB that was garbage collected | ||
| 1020 | * | 1036 | * |
| 1021 | * @infos_list: links all 'ubifs_info' objects | 1037 | * @infos_list: links all 'ubifs_info' objects |
| 1022 | * @umount_mutex: serializes shrinker and un-mount | 1038 | * @umount_mutex: serializes shrinker and un-mount |
| @@ -1103,7 +1119,6 @@ struct ubifs_info { | |||
| 1103 | struct backing_dev_info bdi; | 1119 | struct backing_dev_info bdi; |
| 1104 | 1120 | ||
| 1105 | ino_t highest_inum; | 1121 | ino_t highest_inum; |
| 1106 | unsigned int vfs_gen; | ||
| 1107 | unsigned long long max_sqnum; | 1122 | unsigned long long max_sqnum; |
| 1108 | unsigned long long cmt_no; | 1123 | unsigned long long cmt_no; |
| 1109 | spinlock_t cnt_lock; | 1124 | spinlock_t cnt_lock; |
| @@ -1214,6 +1229,8 @@ struct ubifs_info { | |||
| 1214 | int max_idx_node_sz; | 1229 | int max_idx_node_sz; |
| 1215 | long long max_inode_sz; | 1230 | long long max_inode_sz; |
| 1216 | int max_znode_sz; | 1231 | int max_znode_sz; |
| 1232 | |||
| 1233 | int leb_overhead; | ||
| 1217 | int dead_wm; | 1234 | int dead_wm; |
| 1218 | int dark_wm; | 1235 | int dark_wm; |
| 1219 | int block_cnt; | 1236 | int block_cnt; |
| @@ -1247,6 +1264,8 @@ struct ubifs_info { | |||
| 1247 | void *sbuf; | 1264 | void *sbuf; |
| 1248 | struct list_head idx_gc; | 1265 | struct list_head idx_gc; |
| 1249 | int idx_gc_cnt; | 1266 | int idx_gc_cnt; |
| 1267 | volatile int gc_seq; | ||
| 1268 | volatile int gced_lnum; | ||
| 1250 | 1269 | ||
| 1251 | struct list_head infos_list; | 1270 | struct list_head infos_list; |
| 1252 | struct mutex umount_mutex; | 1271 | struct mutex umount_mutex; |
| @@ -1346,6 +1365,7 @@ extern struct backing_dev_info ubifs_backing_dev_info; | |||
| 1346 | extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; | 1365 | extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; |
| 1347 | 1366 | ||
| 1348 | /* io.c */ | 1367 | /* io.c */ |
| 1368 | void ubifs_ro_mode(struct ubifs_info *c, int err); | ||
| 1349 | int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len); | 1369 | int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len); |
| 1350 | int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, | 1370 | int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, |
| 1351 | int dtype); | 1371 | int dtype); |
| @@ -1399,8 +1419,8 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, | |||
| 1399 | int deletion, int xent); | 1419 | int deletion, int xent); |
| 1400 | int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, | 1420 | int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, |
| 1401 | const union ubifs_key *key, const void *buf, int len); | 1421 | const union ubifs_key *key, const void *buf, int len); |
| 1402 | int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode, | 1422 | int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode); |
| 1403 | int last_reference); | 1423 | int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode); |
| 1404 | int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, | 1424 | int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, |
| 1405 | const struct dentry *old_dentry, | 1425 | const struct dentry *old_dentry, |
| 1406 | const struct inode *new_dir, | 1426 | const struct inode *new_dir, |
| @@ -1423,9 +1443,10 @@ void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode, | |||
| 1423 | struct ubifs_budget_req *req); | 1443 | struct ubifs_budget_req *req); |
| 1424 | void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode, | 1444 | void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode, |
| 1425 | struct ubifs_budget_req *req); | 1445 | struct ubifs_budget_req *req); |
| 1426 | long long ubifs_budg_get_free_space(struct ubifs_info *c); | 1446 | long long ubifs_get_free_space(struct ubifs_info *c); |
| 1427 | int ubifs_calc_min_idx_lebs(struct ubifs_info *c); | 1447 | int ubifs_calc_min_idx_lebs(struct ubifs_info *c); |
| 1428 | void ubifs_convert_page_budget(struct ubifs_info *c); | 1448 | void ubifs_convert_page_budget(struct ubifs_info *c); |
| 1449 | long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free); | ||
| 1429 | long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); | 1450 | long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); |
| 1430 | 1451 | ||
| 1431 | /* find.c */ | 1452 | /* find.c */ |
| @@ -1440,8 +1461,6 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c); | |||
| 1440 | /* tnc.c */ | 1461 | /* tnc.c */ |
| 1441 | int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, | 1462 | int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, |
| 1442 | struct ubifs_znode **zn, int *n); | 1463 | struct ubifs_znode **zn, int *n); |
| 1443 | int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key, | ||
| 1444 | void *node); | ||
| 1445 | int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, | 1464 | int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, |
| 1446 | void *node, const struct qstr *nm); | 1465 | void *node, const struct qstr *nm); |
| 1447 | int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, | 1466 | int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, |
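For the new @leb_overhead field documented above ("how many bytes are wasted in an LEB when it is filled with data nodes of maximum size"), the wasted tail is simply the remainder left after packing whole maximum-size nodes into the LEB. The arithmetic sketch below uses made-up sizes (a 128 KiB LEB and a hypothetical 4144-byte maximum data node; the real UBIFS constants may differ) and is not the actual ubifs_reported_space() code:

#include <stdio.h>

int main(void)
{
        int leb_size = 128 * 1024;   /* illustrative LEB size */
        int max_data_node_sz = 4144; /* hypothetical maximum data node size */

        int whole_nodes  = leb_size / max_data_node_sz;
        int leb_overhead = leb_size % max_data_node_sz;

        printf("%d full data nodes fit; %d bytes per LEB are unusable overhead\n",
               whole_nodes, leb_overhead);
        return 0;
}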
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 1388a078e1a9..649bec78b645 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c | |||
| @@ -61,7 +61,7 @@ | |||
| 61 | 61 | ||
| 62 | /* | 62 | /* |
| 63 | * Limit the number of extended attributes per inode so that the total size | 63 | * Limit the number of extended attributes per inode so that the total size |
| 64 | * (xattr_size) is guaranteed to fit in an 'unsigned int'. | 64 | * (@xattr_size) is guaranteed to fit in an 'unsigned int'. |
| 65 | */ | 65 | */ |
| 66 | #define MAX_XATTRS_PER_INODE 65535 | 66 | #define MAX_XATTRS_PER_INODE 65535 |
| 67 | 67 | ||
| @@ -103,14 +103,14 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, | |||
| 103 | struct inode *inode; | 103 | struct inode *inode; |
| 104 | struct ubifs_inode *ui, *host_ui = ubifs_inode(host); | 104 | struct ubifs_inode *ui, *host_ui = ubifs_inode(host); |
| 105 | struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, | 105 | struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, |
| 106 | .new_ino_d = size, .dirtied_ino = 1, | 106 | .new_ino_d = ALIGN(size, 8), .dirtied_ino = 1, |
| 107 | .dirtied_ino_d = host_ui->data_len}; | 107 | .dirtied_ino_d = ALIGN(host_ui->data_len, 8) }; |
| 108 | 108 | ||
| 109 | if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE) | 109 | if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE) |
| 110 | return -ENOSPC; | 110 | return -ENOSPC; |
| 111 | /* | 111 | /* |
| 112 | * Linux limits the maximum size of the extended attribute names list | 112 | * Linux limits the maximum size of the extended attribute names list |
| 113 | * to %XATTR_LIST_MAX. This means we should not allow creating more* | 113 | * to %XATTR_LIST_MAX. This means we should not allow creating more |
| 114 | * extended attributes if the name list becomes larger. This limitation | 114 | * extended attributes if the name list becomes larger. This limitation |
| 115 | * is artificial for UBIFS, though. | 115 | * is artificial for UBIFS, though. |
| 116 | */ | 116 | */ |
| @@ -128,7 +128,6 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, | |||
| 128 | goto out_budg; | 128 | goto out_budg; |
| 129 | } | 129 | } |
| 130 | 130 | ||
| 131 | mutex_lock(&host_ui->ui_mutex); | ||
| 132 | /* Re-define all operations to be "nothing" */ | 131 | /* Re-define all operations to be "nothing" */ |
| 133 | inode->i_mapping->a_ops = &none_address_operations; | 132 | inode->i_mapping->a_ops = &none_address_operations; |
| 134 | inode->i_op = &none_inode_operations; | 133 | inode->i_op = &none_inode_operations; |
| @@ -141,23 +140,19 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, | |||
| 141 | ui->data = kmalloc(size, GFP_NOFS); | 140 | ui->data = kmalloc(size, GFP_NOFS); |
| 142 | if (!ui->data) { | 141 | if (!ui->data) { |
| 143 | err = -ENOMEM; | 142 | err = -ENOMEM; |
| 144 | goto out_unlock; | 143 | goto out_free; |
| 145 | } | 144 | } |
| 146 | |||
| 147 | memcpy(ui->data, value, size); | 145 | memcpy(ui->data, value, size); |
| 146 | inode->i_size = ui->ui_size = size; | ||
| 147 | ui->data_len = size; | ||
| 148 | |||
| 149 | mutex_lock(&host_ui->ui_mutex); | ||
| 148 | host->i_ctime = ubifs_current_time(host); | 150 | host->i_ctime = ubifs_current_time(host); |
| 149 | host_ui->xattr_cnt += 1; | 151 | host_ui->xattr_cnt += 1; |
| 150 | host_ui->xattr_size += CALC_DENT_SIZE(nm->len); | 152 | host_ui->xattr_size += CALC_DENT_SIZE(nm->len); |
| 151 | host_ui->xattr_size += CALC_XATTR_BYTES(size); | 153 | host_ui->xattr_size += CALC_XATTR_BYTES(size); |
| 152 | host_ui->xattr_names += nm->len; | 154 | host_ui->xattr_names += nm->len; |
| 153 | 155 | ||
| 154 | /* | ||
| 155 | * We do not use i_size_write() because nobody can race with us as we | ||
| 156 | * are holding host @host->i_mutex - every xattr operation for this | ||
| 157 | * inode is serialized by it. | ||
| 158 | */ | ||
| 159 | inode->i_size = ui->ui_size = size; | ||
| 160 | ui->data_len = size; | ||
| 161 | err = ubifs_jnl_update(c, host, nm, inode, 0, 1); | 156 | err = ubifs_jnl_update(c, host, nm, inode, 0, 1); |
| 162 | if (err) | 157 | if (err) |
| 163 | goto out_cancel; | 158 | goto out_cancel; |
| @@ -172,8 +167,8 @@ out_cancel: | |||
| 172 | host_ui->xattr_cnt -= 1; | 167 | host_ui->xattr_cnt -= 1; |
| 173 | host_ui->xattr_size -= CALC_DENT_SIZE(nm->len); | 168 | host_ui->xattr_size -= CALC_DENT_SIZE(nm->len); |
| 174 | host_ui->xattr_size -= CALC_XATTR_BYTES(size); | 169 | host_ui->xattr_size -= CALC_XATTR_BYTES(size); |
| 175 | out_unlock: | ||
| 176 | mutex_unlock(&host_ui->ui_mutex); | 170 | mutex_unlock(&host_ui->ui_mutex); |
| 171 | out_free: | ||
| 177 | make_bad_inode(inode); | 172 | make_bad_inode(inode); |
| 178 | iput(inode); | 173 | iput(inode); |
| 179 | out_budg: | 174 | out_budg: |
| @@ -200,29 +195,28 @@ static int change_xattr(struct ubifs_info *c, struct inode *host, | |||
| 200 | struct ubifs_inode *host_ui = ubifs_inode(host); | 195 | struct ubifs_inode *host_ui = ubifs_inode(host); |
| 201 | struct ubifs_inode *ui = ubifs_inode(inode); | 196 | struct ubifs_inode *ui = ubifs_inode(inode); |
| 202 | struct ubifs_budget_req req = { .dirtied_ino = 2, | 197 | struct ubifs_budget_req req = { .dirtied_ino = 2, |
| 203 | .dirtied_ino_d = size + host_ui->data_len }; | 198 | .dirtied_ino_d = ALIGN(size, 8) + ALIGN(host_ui->data_len, 8) }; |
| 204 | 199 | ||
| 205 | ubifs_assert(ui->data_len == inode->i_size); | 200 | ubifs_assert(ui->data_len == inode->i_size); |
| 206 | err = ubifs_budget_space(c, &req); | 201 | err = ubifs_budget_space(c, &req); |
| 207 | if (err) | 202 | if (err) |
| 208 | return err; | 203 | return err; |
| 209 | 204 | ||
| 210 | mutex_lock(&host_ui->ui_mutex); | ||
| 211 | host->i_ctime = ubifs_current_time(host); | ||
| 212 | host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len); | ||
| 213 | host_ui->xattr_size += CALC_XATTR_BYTES(size); | ||
| 214 | |||
| 215 | kfree(ui->data); | 205 | kfree(ui->data); |
| 216 | ui->data = kmalloc(size, GFP_NOFS); | 206 | ui->data = kmalloc(size, GFP_NOFS); |
| 217 | if (!ui->data) { | 207 | if (!ui->data) { |
| 218 | err = -ENOMEM; | 208 | err = -ENOMEM; |
| 219 | goto out_unlock; | 209 | goto out_free; |
| 220 | } | 210 | } |
| 221 | |||
| 222 | memcpy(ui->data, value, size); | 211 | memcpy(ui->data, value, size); |
| 223 | inode->i_size = ui->ui_size = size; | 212 | inode->i_size = ui->ui_size = size; |
| 224 | ui->data_len = size; | 213 | ui->data_len = size; |
| 225 | 214 | ||
| 215 | mutex_lock(&host_ui->ui_mutex); | ||
| 216 | host->i_ctime = ubifs_current_time(host); | ||
| 217 | host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len); | ||
| 218 | host_ui->xattr_size += CALC_XATTR_BYTES(size); | ||
| 219 | |||
| 226 | /* | 220 | /* |
| 227 | * It is important to write the host inode after the xattr inode | 221 | * It is important to write the host inode after the xattr inode |
| 228 | * because if the host inode gets synchronized (via 'fsync()'), then | 222 | * because if the host inode gets synchronized (via 'fsync()'), then |
| @@ -240,9 +234,9 @@ static int change_xattr(struct ubifs_info *c, struct inode *host, | |||
| 240 | out_cancel: | 234 | out_cancel: |
| 241 | host_ui->xattr_size -= CALC_XATTR_BYTES(size); | 235 | host_ui->xattr_size -= CALC_XATTR_BYTES(size); |
| 242 | host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len); | 236 | host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len); |
| 243 | make_bad_inode(inode); | ||
| 244 | out_unlock: | ||
| 245 | mutex_unlock(&host_ui->ui_mutex); | 237 | mutex_unlock(&host_ui->ui_mutex); |
| 238 | make_bad_inode(inode); | ||
| 239 | out_free: | ||
| 246 | ubifs_release_budget(c, &req); | 240 | ubifs_release_budget(c, &req); |
| 247 | return err; | 241 | return err; |
| 248 | } | 242 | } |
| @@ -312,6 +306,7 @@ int ubifs_setxattr(struct dentry *dentry, const char *name, | |||
| 312 | 306 | ||
| 313 | dbg_gen("xattr '%s', host ino %lu ('%.*s'), size %zd", name, | 307 | dbg_gen("xattr '%s', host ino %lu ('%.*s'), size %zd", name, |
| 314 | host->i_ino, dentry->d_name.len, dentry->d_name.name, size); | 308 | host->i_ino, dentry->d_name.len, dentry->d_name.name, size); |
| 309 | ubifs_assert(mutex_is_locked(&host->i_mutex)); | ||
| 315 | 310 | ||
| 316 | if (size > UBIFS_MAX_INO_DATA) | 311 | if (size > UBIFS_MAX_INO_DATA) |
| 317 | return -ERANGE; | 312 | return -ERANGE; |
| @@ -384,7 +379,6 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, | |||
| 384 | if (!xent) | 379 | if (!xent) |
| 385 | return -ENOMEM; | 380 | return -ENOMEM; |
| 386 | 381 | ||
| 387 | mutex_lock(&host->i_mutex); | ||
| 388 | xent_key_init(c, &key, host->i_ino, &nm); | 382 | xent_key_init(c, &key, host->i_ino, &nm); |
| 389 | err = ubifs_tnc_lookup_nm(c, &key, xent, &nm); | 383 | err = ubifs_tnc_lookup_nm(c, &key, xent, &nm); |
| 390 | if (err) { | 384 | if (err) { |
| @@ -419,7 +413,6 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, | |||
| 419 | out_iput: | 413 | out_iput: |
| 420 | iput(inode); | 414 | iput(inode); |
| 421 | out_unlock: | 415 | out_unlock: |
| 422 | mutex_unlock(&host->i_mutex); | ||
| 423 | kfree(xent); | 416 | kfree(xent); |
| 424 | return err; | 417 | return err; |
| 425 | } | 418 | } |
| @@ -449,8 +442,6 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) | |||
| 449 | return -ERANGE; | 442 | return -ERANGE; |
| 450 | 443 | ||
| 451 | lowest_xent_key(c, &key, host->i_ino); | 444 | lowest_xent_key(c, &key, host->i_ino); |
| 452 | |||
| 453 | mutex_lock(&host->i_mutex); | ||
| 454 | while (1) { | 445 | while (1) { |
| 455 | int type; | 446 | int type; |
| 456 | 447 | ||
| @@ -479,7 +470,6 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) | |||
| 479 | pxent = xent; | 470 | pxent = xent; |
| 480 | key_read(c, &xent->key, &key); | 471 | key_read(c, &xent->key, &key); |
| 481 | } | 472 | } |
| 482 | mutex_unlock(&host->i_mutex); | ||
| 483 | 473 | ||
| 484 | kfree(pxent); | 474 | kfree(pxent); |
| 485 | if (err != -ENOENT) { | 475 | if (err != -ENOENT) { |
| @@ -497,8 +487,8 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host, | |||
| 497 | int err; | 487 | int err; |
| 498 | struct ubifs_inode *host_ui = ubifs_inode(host); | 488 | struct ubifs_inode *host_ui = ubifs_inode(host); |
| 499 | struct ubifs_inode *ui = ubifs_inode(inode); | 489 | struct ubifs_inode *ui = ubifs_inode(inode); |
| 500 | struct ubifs_budget_req req = { .dirtied_ino = 1, .mod_dent = 1, | 490 | struct ubifs_budget_req req = { .dirtied_ino = 2, .mod_dent = 1, |
| 501 | .dirtied_ino_d = host_ui->data_len }; | 491 | .dirtied_ino_d = ALIGN(host_ui->data_len, 8) }; |
| 502 | 492 | ||
| 503 | ubifs_assert(ui->data_len == inode->i_size); | 493 | ubifs_assert(ui->data_len == inode->i_size); |
| 504 | 494 | ||
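The create_xattr()/change_xattr() hunks above move the kmalloc() and memcpy() of the attribute data in front of mutex_lock(&host_ui->ui_mutex), so the -ENOMEM path no longer needs an unlock and the lock only covers the host-inode accounting. A user-space pthread sketch of that ordering follows; it is illustrative only, and the mutex and counters are stand-ins for host_ui->ui_mutex and the xattr accounting fields (build with: cc demo.c -pthread):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static pthread_mutex_t host_lock = PTHREAD_MUTEX_INITIALIZER; /* stands in for host_ui->ui_mutex */
static unsigned int xattr_cnt;    /* stands in for host_ui->xattr_cnt  */
static unsigned int xattr_bytes;  /* stands in for host_ui->xattr_size */

static int add_attr(const void *value, size_t size)
{
        /* Allocate and copy the value before taking the lock, as the patch now does. */
        void *data = malloc(size);

        if (!data)
                return -1;      /* the out-of-memory path never touches the lock */
        memcpy(data, value, size);

        pthread_mutex_lock(&host_lock);
        xattr_cnt += 1;         /* only the shared accounting is done under the lock */
        xattr_bytes += size;
        pthread_mutex_unlock(&host_lock);

        free(data);             /* a real file system would keep the data, of course */
        return 0;
}

int main(void)
{
        add_attr("hello", 5);
        printf("attrs: %u, bytes: %u\n", xattr_cnt, xattr_bytes);
        return 0;
}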
diff --git a/fs/udf/file.c b/fs/udf/file.c index 0ed6e146a0d9..eb91f3b70320 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c | |||
| @@ -211,6 +211,7 @@ const struct file_operations udf_file_operations = { | |||
| 211 | .release = udf_release_file, | 211 | .release = udf_release_file, |
| 212 | .fsync = udf_fsync_file, | 212 | .fsync = udf_fsync_file, |
| 213 | .splice_read = generic_file_splice_read, | 213 | .splice_read = generic_file_splice_read, |
| 214 | .llseek = generic_file_llseek, | ||
| 214 | }; | 215 | }; |
| 215 | 216 | ||
| 216 | const struct inode_operations udf_file_inode_operations = { | 217 | const struct inode_operations udf_file_inode_operations = { |
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index eb9cfa23dc3d..a4f2b3ce45b0 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c | |||
| @@ -76,11 +76,24 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) | |||
| 76 | *err = -ENOSPC; | 76 | *err = -ENOSPC; |
| 77 | 77 | ||
| 78 | iinfo = UDF_I(inode); | 78 | iinfo = UDF_I(inode); |
| 79 | iinfo->i_unique = 0; | 79 | if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) { |
| 80 | iinfo->i_lenExtents = 0; | 80 | iinfo->i_efe = 1; |
| 81 | iinfo->i_next_alloc_block = 0; | 81 | if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev) |
| 82 | iinfo->i_next_alloc_goal = 0; | 82 | sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE; |
| 83 | iinfo->i_strat4096 = 0; | 83 | iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize - |
| 84 | sizeof(struct extendedFileEntry), | ||
| 85 | GFP_KERNEL); | ||
| 86 | } else { | ||
| 87 | iinfo->i_efe = 0; | ||
| 88 | iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize - | ||
| 89 | sizeof(struct fileEntry), | ||
| 90 | GFP_KERNEL); | ||
| 91 | } | ||
| 92 | if (!iinfo->i_ext.i_data) { | ||
| 93 | iput(inode); | ||
| 94 | *err = -ENOMEM; | ||
| 95 | return NULL; | ||
| 96 | } | ||
| 84 | 97 | ||
| 85 | block = udf_new_block(dir->i_sb, NULL, | 98 | block = udf_new_block(dir->i_sb, NULL, |
| 86 | dinfo->i_location.partitionReferenceNum, | 99 | dinfo->i_location.partitionReferenceNum, |
| @@ -111,6 +124,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) | |||
| 111 | lvhd->uniqueID = cpu_to_le64(uniqueID); | 124 | lvhd->uniqueID = cpu_to_le64(uniqueID); |
| 112 | mark_buffer_dirty(sbi->s_lvid_bh); | 125 | mark_buffer_dirty(sbi->s_lvid_bh); |
| 113 | } | 126 | } |
| 127 | mutex_unlock(&sbi->s_alloc_mutex); | ||
| 114 | inode->i_mode = mode; | 128 | inode->i_mode = mode; |
| 115 | inode->i_uid = current->fsuid; | 129 | inode->i_uid = current->fsuid; |
| 116 | if (dir->i_mode & S_ISGID) { | 130 | if (dir->i_mode & S_ISGID) { |
| @@ -129,25 +143,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) | |||
| 129 | iinfo->i_lenEAttr = 0; | 143 | iinfo->i_lenEAttr = 0; |
| 130 | iinfo->i_lenAlloc = 0; | 144 | iinfo->i_lenAlloc = 0; |
| 131 | iinfo->i_use = 0; | 145 | iinfo->i_use = 0; |
| 132 | if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) { | ||
| 133 | iinfo->i_efe = 1; | ||
| 134 | if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev) | ||
| 135 | sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE; | ||
| 136 | iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize - | ||
| 137 | sizeof(struct extendedFileEntry), | ||
| 138 | GFP_KERNEL); | ||
| 139 | } else { | ||
| 140 | iinfo->i_efe = 0; | ||
| 141 | iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize - | ||
| 142 | sizeof(struct fileEntry), | ||
| 143 | GFP_KERNEL); | ||
| 144 | } | ||
| 145 | if (!iinfo->i_ext.i_data) { | ||
| 146 | iput(inode); | ||
| 147 | *err = -ENOMEM; | ||
| 148 | mutex_unlock(&sbi->s_alloc_mutex); | ||
| 149 | return NULL; | ||
| 150 | } | ||
| 151 | if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB)) | 146 | if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB)) |
| 152 | iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; | 147 | iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; |
| 153 | else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) | 148 | else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) |
| @@ -158,7 +153,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) | |||
| 158 | iinfo->i_crtime = current_fs_time(inode->i_sb); | 153 | iinfo->i_crtime = current_fs_time(inode->i_sb); |
| 159 | insert_inode_hash(inode); | 154 | insert_inode_hash(inode); |
| 160 | mark_inode_dirty(inode); | 155 | mark_inode_dirty(inode); |
| 161 | mutex_unlock(&sbi->s_alloc_mutex); | ||
| 162 | 156 | ||
| 163 | if (DQUOT_ALLOC_INODE(inode)) { | 157 | if (DQUOT_ALLOC_INODE(inode)) { |
| 164 | DQUOT_DROP(inode); | 158 | DQUOT_DROP(inode); |
diff --git a/fs/udf/super.c b/fs/udf/super.c index 44cc702f96cc..5698bbf83bbf 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c | |||
| @@ -148,7 +148,7 @@ static void udf_destroy_inode(struct inode *inode) | |||
| 148 | kmem_cache_free(udf_inode_cachep, UDF_I(inode)); | 148 | kmem_cache_free(udf_inode_cachep, UDF_I(inode)); |
| 149 | } | 149 | } |
| 150 | 150 | ||
| 151 | static void init_once(struct kmem_cache *cachep, void *foo) | 151 | static void init_once(void *foo) |
| 152 | { | 152 | { |
| 153 | struct udf_inode_info *ei = (struct udf_inode_info *)foo; | 153 | struct udf_inode_info *ei = (struct udf_inode_info *)foo; |
| 154 | 154 | ||
diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 506f724055c2..3141969b456d 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c | |||
| @@ -76,6 +76,7 @@ | |||
| 76 | 76 | ||
| 77 | #include <linux/errno.h> | 77 | #include <linux/errno.h> |
| 78 | #include <linux/fs.h> | 78 | #include <linux/fs.h> |
| 79 | #include <linux/quotaops.h> | ||
| 79 | #include <linux/slab.h> | 80 | #include <linux/slab.h> |
| 80 | #include <linux/time.h> | 81 | #include <linux/time.h> |
| 81 | #include <linux/stat.h> | 82 | #include <linux/stat.h> |
| @@ -1232,7 +1233,7 @@ static int ufs_show_options(struct seq_file *seq, struct vfsmount *vfs) | |||
| 1232 | { | 1233 | { |
| 1233 | struct ufs_sb_info *sbi = UFS_SB(vfs->mnt_sb); | 1234 | struct ufs_sb_info *sbi = UFS_SB(vfs->mnt_sb); |
| 1234 | unsigned mval = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE; | 1235 | unsigned mval = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE; |
| 1235 | const struct match_token *tp = tokens; | 1236 | struct match_token *tp = tokens; |
| 1236 | 1237 | ||
| 1237 | while (tp->token != Opt_onerror_panic && tp->token != mval) | 1238 | while (tp->token != Opt_onerror_panic && tp->token != mval) |
| 1238 | ++tp; | 1239 | ++tp; |
| @@ -1301,7 +1302,7 @@ static void ufs_destroy_inode(struct inode *inode) | |||
| 1301 | kmem_cache_free(ufs_inode_cachep, UFS_I(inode)); | 1302 | kmem_cache_free(ufs_inode_cachep, UFS_I(inode)); |
| 1302 | } | 1303 | } |
| 1303 | 1304 | ||
| 1304 | static void init_once(struct kmem_cache * cachep, void *foo) | 1305 | static void init_once(void *foo) |
| 1305 | { | 1306 | { |
| 1306 | struct ufs_inode_info *ei = (struct ufs_inode_info *) foo; | 1307 | struct ufs_inode_info *ei = (struct ufs_inode_info *) foo; |
| 1307 | 1308 | ||
diff --git a/fs/utimes.c b/fs/utimes.c index b6b664e7145e..6929e3e91d05 100644 --- a/fs/utimes.c +++ b/fs/utimes.c | |||
| @@ -48,66 +48,22 @@ static bool nsec_valid(long nsec) | |||
| 48 | return nsec >= 0 && nsec <= 999999999; | 48 | return nsec >= 0 && nsec <= 999999999; |
| 49 | } | 49 | } |
| 50 | 50 | ||
| 51 | /* If times==NULL, set access and modification to current time, | 51 | static int utimes_common(struct path *path, struct timespec *times) |
| 52 | * must be owner or have write permission. | ||
| 53 | * Else, update from *times, must be owner or super user. | ||
| 54 | */ | ||
| 55 | long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags) | ||
| 56 | { | 52 | { |
| 57 | int error; | 53 | int error; |
| 58 | struct nameidata nd; | ||
| 59 | struct dentry *dentry; | ||
| 60 | struct inode *inode; | ||
| 61 | struct iattr newattrs; | 54 | struct iattr newattrs; |
| 62 | struct file *f = NULL; | 55 | struct inode *inode = path->dentry->d_inode; |
| 63 | struct vfsmount *mnt; | ||
| 64 | |||
| 65 | error = -EINVAL; | ||
| 66 | if (times && (!nsec_valid(times[0].tv_nsec) || | ||
| 67 | !nsec_valid(times[1].tv_nsec))) { | ||
| 68 | goto out; | ||
| 69 | } | ||
| 70 | |||
| 71 | if (flags & ~AT_SYMLINK_NOFOLLOW) | ||
| 72 | goto out; | ||
| 73 | |||
| 74 | if (filename == NULL && dfd != AT_FDCWD) { | ||
| 75 | error = -EINVAL; | ||
| 76 | if (flags & AT_SYMLINK_NOFOLLOW) | ||
| 77 | goto out; | ||
| 78 | 56 | ||
| 79 | error = -EBADF; | 57 | error = mnt_want_write(path->mnt); |
| 80 | f = fget(dfd); | ||
| 81 | if (!f) | ||
| 82 | goto out; | ||
| 83 | dentry = f->f_path.dentry; | ||
| 84 | mnt = f->f_path.mnt; | ||
| 85 | } else { | ||
| 86 | error = __user_walk_fd(dfd, filename, (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW, &nd); | ||
| 87 | if (error) | ||
| 88 | goto out; | ||
| 89 | |||
| 90 | dentry = nd.path.dentry; | ||
| 91 | mnt = nd.path.mnt; | ||
| 92 | } | ||
| 93 | |||
| 94 | inode = dentry->d_inode; | ||
| 95 | |||
| 96 | error = mnt_want_write(mnt); | ||
| 97 | if (error) | 58 | if (error) |
| 98 | goto dput_and_out; | 59 | goto out; |
| 99 | 60 | ||
| 100 | if (times && times[0].tv_nsec == UTIME_NOW && | 61 | if (times && times[0].tv_nsec == UTIME_NOW && |
| 101 | times[1].tv_nsec == UTIME_NOW) | 62 | times[1].tv_nsec == UTIME_NOW) |
| 102 | times = NULL; | 63 | times = NULL; |
| 103 | 64 | ||
| 104 | /* In most cases, the checks are done in inode_change_ok() */ | ||
| 105 | newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME; | 65 | newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME; |
| 106 | if (times) { | 66 | if (times) { |
| 107 | error = -EPERM; | ||
| 108 | if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) | ||
| 109 | goto mnt_drop_write_and_out; | ||
| 110 | |||
| 111 | if (times[0].tv_nsec == UTIME_OMIT) | 67 | if (times[0].tv_nsec == UTIME_OMIT) |
| 112 | newattrs.ia_valid &= ~ATTR_ATIME; | 68 | newattrs.ia_valid &= ~ATTR_ATIME; |
| 113 | else if (times[0].tv_nsec != UTIME_NOW) { | 69 | else if (times[0].tv_nsec != UTIME_NOW) { |
| @@ -123,21 +79,13 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags | |||
| 123 | newattrs.ia_mtime.tv_nsec = times[1].tv_nsec; | 79 | newattrs.ia_mtime.tv_nsec = times[1].tv_nsec; |
| 124 | newattrs.ia_valid |= ATTR_MTIME_SET; | 80 | newattrs.ia_valid |= ATTR_MTIME_SET; |
| 125 | } | 81 | } |
| 126 | |||
| 127 | /* | 82 | /* |
| 128 | * For the UTIME_OMIT/UTIME_NOW and UTIME_NOW/UTIME_OMIT | 83 | * Tell inode_change_ok(), that this is an explicit time |
| 129 | * cases, we need to make an extra check that is not done by | 84 | * update, even if neither ATTR_ATIME_SET nor ATTR_MTIME_SET |
| 130 | * inode_change_ok(). | 85 | * were used. |
| 131 | */ | 86 | */ |
| 132 | if (((times[0].tv_nsec == UTIME_NOW && | 87 | newattrs.ia_valid |= ATTR_TIMES_SET; |
| 133 | times[1].tv_nsec == UTIME_OMIT) | ||
| 134 | || | ||
| 135 | (times[0].tv_nsec == UTIME_OMIT && | ||
| 136 | times[1].tv_nsec == UTIME_NOW)) | ||
| 137 | && !is_owner_or_cap(inode)) | ||
| 138 | goto mnt_drop_write_and_out; | ||
| 139 | } else { | 88 | } else { |
| 140 | |||
| 141 | /* | 89 | /* |
| 142 | * If times is NULL (or both times are UTIME_NOW), | 90 | * If times is NULL (or both times are UTIME_NOW), |
| 143 | * then we need to check permissions, because | 91 | * then we need to check permissions, because |
| @@ -148,21 +96,76 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags | |||
| 148 | goto mnt_drop_write_and_out; | 96 | goto mnt_drop_write_and_out; |
| 149 | 97 | ||
| 150 | if (!is_owner_or_cap(inode)) { | 98 | if (!is_owner_or_cap(inode)) { |
| 151 | error = permission(inode, MAY_WRITE, NULL); | 99 | error = inode_permission(inode, MAY_WRITE); |
| 152 | if (error) | 100 | if (error) |
| 153 | goto mnt_drop_write_and_out; | 101 | goto mnt_drop_write_and_out; |
| 154 | } | 102 | } |
| 155 | } | 103 | } |
| 156 | mutex_lock(&inode->i_mutex); | 104 | mutex_lock(&inode->i_mutex); |
| 157 | error = notify_change(dentry, &newattrs); | 105 | error = notify_change(path->dentry, &newattrs); |
| 158 | mutex_unlock(&inode->i_mutex); | 106 | mutex_unlock(&inode->i_mutex); |
| 107 | |||
| 159 | mnt_drop_write_and_out: | 108 | mnt_drop_write_and_out: |
| 160 | mnt_drop_write(mnt); | 109 | mnt_drop_write(path->mnt); |
| 161 | dput_and_out: | 110 | out: |
| 162 | if (f) | 111 | return error; |
| 163 | fput(f); | 112 | } |
| 164 | else | 113 | |
| 165 | path_put(&nd.path); | 114 | /* |
| 115 | * do_utimes - change times on filename or file descriptor | ||
| 116 | * @dfd: open file descriptor, -1 or AT_FDCWD | ||
| 117 | * @filename: path name or NULL | ||
| 118 | * @times: new times or NULL | ||
| 119 | * @flags: zero or more flags (only AT_SYMLINK_NOFOLLOW for the moment) | ||
| 120 | * | ||
| 121 | * If filename is NULL and dfd refers to an open file, then operate on | ||
| 122 | * the file. Otherwise look up filename, possibly using dfd as a | ||
| 123 | * starting point. | ||
| 124 | * | ||
| 125 | * If times==NULL, set access and modification to current time, | ||
| 126 | * must be owner or have write permission. | ||
| 127 | * Else, update from *times, must be owner or super user. | ||
| 128 | */ | ||
| 129 | long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags) | ||
| 130 | { | ||
| 131 | int error = -EINVAL; | ||
| 132 | |||
| 133 | if (times && (!nsec_valid(times[0].tv_nsec) || | ||
| 134 | !nsec_valid(times[1].tv_nsec))) { | ||
| 135 | goto out; | ||
| 136 | } | ||
| 137 | |||
| 138 | if (flags & ~AT_SYMLINK_NOFOLLOW) | ||
| 139 | goto out; | ||
| 140 | |||
| 141 | if (filename == NULL && dfd != AT_FDCWD) { | ||
| 142 | struct file *file; | ||
| 143 | |||
| 144 | if (flags & AT_SYMLINK_NOFOLLOW) | ||
| 145 | goto out; | ||
| 146 | |||
| 147 | file = fget(dfd); | ||
| 148 | error = -EBADF; | ||
| 149 | if (!file) | ||
| 150 | goto out; | ||
| 151 | |||
| 152 | error = utimes_common(&file->f_path, times); | ||
| 153 | fput(file); | ||
| 154 | } else { | ||
| 155 | struct path path; | ||
| 156 | int lookup_flags = 0; | ||
| 157 | |||
| 158 | if (!(flags & AT_SYMLINK_NOFOLLOW)) | ||
| 159 | lookup_flags |= LOOKUP_FOLLOW; | ||
| 160 | |||
| 161 | error = user_path_at(dfd, filename, lookup_flags, &path); | ||
| 162 | if (error) | ||
| 163 | goto out; | ||
| 164 | |||
| 165 | error = utimes_common(&path, times); | ||
| 166 | path_put(&path); | ||
| 167 | } | ||
| 168 | |||
| 166 | out: | 169 | out: |
| 167 | return error; | 170 | return error; |
| 168 | } | 171 | } |
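The reworked do_utimes()/utimes_common() above is the kernel side of the utimensat() family of system calls; the UTIME_NOW/UTIME_OMIT branches correspond directly to the per-field handling a caller requests. Below is a minimal user-space illustration, where the file name "testfile" is a placeholder and the program assumes a glibc that exposes utimensat():

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <time.h>

int main(void)
{
        struct timespec ts[2];

        /* tv_sec is ignored for the special nsec values; zero it for tidiness. */
        ts[0].tv_sec = ts[1].tv_sec = 0;
        ts[0].tv_nsec = UTIME_OMIT;     /* atime: leave untouched */
        ts[1].tv_nsec = UTIME_NOW;      /* mtime: set to the current time */
        if (utimensat(AT_FDCWD, "testfile", ts, 0) != 0) {
                perror("utimensat (mtime only)");
                return 1;
        }

        /* times == NULL sets both timestamps to "now" (the times==NULL case above). */
        if (utimensat(AT_FDCWD, "testfile", NULL, 0) != 0) {
                perror("utimensat (both to now)");
                return 1;
        }
        return 0;
}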
diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c index b546ba69be82..155c10b4adbd 100644 --- a/fs/vfat/namei.c +++ b/fs/vfat/namei.c | |||
| @@ -621,7 +621,7 @@ shortname: | |||
| 621 | memcpy(de->name, msdos_name, MSDOS_NAME); | 621 | memcpy(de->name, msdos_name, MSDOS_NAME); |
| 622 | de->attr = is_dir ? ATTR_DIR : ATTR_ARCH; | 622 | de->attr = is_dir ? ATTR_DIR : ATTR_ARCH; |
| 623 | de->lcase = lcase; | 623 | de->lcase = lcase; |
| 624 | fat_date_unix2dos(ts->tv_sec, &time, &date); | 624 | fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc); |
| 625 | de->time = de->ctime = time; | 625 | de->time = de->ctime = time; |
| 626 | de->date = de->cdate = de->adate = date; | 626 | de->date = de->cdate = de->adate = date; |
| 627 | de->ctime_cs = 0; | 627 | de->ctime_cs = 0; |
diff --git a/fs/xattr.c b/fs/xattr.c index 4706a8b1f495..468377e66531 100644 --- a/fs/xattr.c +++ b/fs/xattr.c | |||
| @@ -63,7 +63,7 @@ xattr_permission(struct inode *inode, const char *name, int mask) | |||
| 63 | return -EPERM; | 63 | return -EPERM; |
| 64 | } | 64 | } |
| 65 | 65 | ||
| 66 | return permission(inode, mask, NULL); | 66 | return inode_permission(inode, mask); |
| 67 | } | 67 | } |
| 68 | 68 | ||
| 69 | int | 69 | int |
| @@ -252,40 +252,40 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value, | |||
| 252 | } | 252 | } |
| 253 | 253 | ||
| 254 | asmlinkage long | 254 | asmlinkage long |
| 255 | sys_setxattr(const char __user *path, const char __user *name, | 255 | sys_setxattr(const char __user *pathname, const char __user *name, |
| 256 | const void __user *value, size_t size, int flags) | 256 | const void __user *value, size_t size, int flags) |
| 257 | { | 257 | { |
| 258 | struct nameidata nd; | 258 | struct path path; |
| 259 | int error; | 259 | int error; |
| 260 | 260 | ||
| 261 | error = user_path_walk(path, &nd); | 261 | error = user_path(pathname, &path); |
| 262 | if (error) | 262 | if (error) |
| 263 | return error; | 263 | return error; |
| 264 | error = mnt_want_write(nd.path.mnt); | 264 | error = mnt_want_write(path.mnt); |
| 265 | if (!error) { | 265 | if (!error) { |
| 266 | error = setxattr(nd.path.dentry, name, value, size, flags); | 266 | error = setxattr(path.dentry, name, value, size, flags); |
| 267 | mnt_drop_write(nd.path.mnt); | 267 | mnt_drop_write(path.mnt); |
| 268 | } | 268 | } |
| 269 | path_put(&nd.path); | 269 | path_put(&path); |
| 270 | return error; | 270 | return error; |
| 271 | } | 271 | } |
| 272 | 272 | ||
| 273 | asmlinkage long | 273 | asmlinkage long |
| 274 | sys_lsetxattr(const char __user *path, const char __user *name, | 274 | sys_lsetxattr(const char __user *pathname, const char __user *name, |
| 275 | const void __user *value, size_t size, int flags) | 275 | const void __user *value, size_t size, int flags) |
| 276 | { | 276 | { |
| 277 | struct nameidata nd; | 277 | struct path path; |
| 278 | int error; | 278 | int error; |
| 279 | 279 | ||
| 280 | error = user_path_walk_link(path, &nd); | 280 | error = user_lpath(pathname, &path); |
| 281 | if (error) | 281 | if (error) |
| 282 | return error; | 282 | return error; |
| 283 | error = mnt_want_write(nd.path.mnt); | 283 | error = mnt_want_write(path.mnt); |
| 284 | if (!error) { | 284 | if (!error) { |
| 285 | error = setxattr(nd.path.dentry, name, value, size, flags); | 285 | error = setxattr(path.dentry, name, value, size, flags); |
| 286 | mnt_drop_write(nd.path.mnt); | 286 | mnt_drop_write(path.mnt); |
| 287 | } | 287 | } |
| 288 | path_put(&nd.path); | 288 | path_put(&path); |
| 289 | return error; | 289 | return error; |
| 290 | } | 290 | } |
| 291 | 291 | ||
| @@ -350,32 +350,32 @@ getxattr(struct dentry *d, const char __user *name, void __user *value, | |||
| 350 | } | 350 | } |
| 351 | 351 | ||
| 352 | asmlinkage ssize_t | 352 | asmlinkage ssize_t |
| 353 | sys_getxattr(const char __user *path, const char __user *name, | 353 | sys_getxattr(const char __user *pathname, const char __user *name, |
| 354 | void __user *value, size_t size) | 354 | void __user *value, size_t size) |
| 355 | { | 355 | { |
| 356 | struct nameidata nd; | 356 | struct path path; |
| 357 | ssize_t error; | 357 | ssize_t error; |
| 358 | 358 | ||
| 359 | error = user_path_walk(path, &nd); | 359 | error = user_path(pathname, &path); |
| 360 | if (error) | 360 | if (error) |
| 361 | return error; | 361 | return error; |
| 362 | error = getxattr(nd.path.dentry, name, value, size); | 362 | error = getxattr(path.dentry, name, value, size); |
| 363 | path_put(&nd.path); | 363 | path_put(&path); |
| 364 | return error; | 364 | return error; |
| 365 | } | 365 | } |
| 366 | 366 | ||
| 367 | asmlinkage ssize_t | 367 | asmlinkage ssize_t |
| 368 | sys_lgetxattr(const char __user *path, const char __user *name, void __user *value, | 368 | sys_lgetxattr(const char __user *pathname, const char __user *name, void __user *value, |
| 369 | size_t size) | 369 | size_t size) |
| 370 | { | 370 | { |
| 371 | struct nameidata nd; | 371 | struct path path; |
| 372 | ssize_t error; | 372 | ssize_t error; |
| 373 | 373 | ||
| 374 | error = user_path_walk_link(path, &nd); | 374 | error = user_lpath(pathname, &path); |
| 375 | if (error) | 375 | if (error) |
| 376 | return error; | 376 | return error; |
| 377 | error = getxattr(nd.path.dentry, name, value, size); | 377 | error = getxattr(path.dentry, name, value, size); |
| 378 | path_put(&nd.path); | 378 | path_put(&path); |
| 379 | return error; | 379 | return error; |
| 380 | } | 380 | } |
| 381 | 381 | ||
| @@ -425,30 +425,30 @@ listxattr(struct dentry *d, char __user *list, size_t size) | |||
| 425 | } | 425 | } |
| 426 | 426 | ||
| 427 | asmlinkage ssize_t | 427 | asmlinkage ssize_t |
| 428 | sys_listxattr(const char __user *path, char __user *list, size_t size) | 428 | sys_listxattr(const char __user *pathname, char __user *list, size_t size) |
| 429 | { | 429 | { |
| 430 | struct nameidata nd; | 430 | struct path path; |
| 431 | ssize_t error; | 431 | ssize_t error; |
| 432 | 432 | ||
| 433 | error = user_path_walk(path, &nd); | 433 | error = user_path(pathname, &path); |
| 434 | if (error) | 434 | if (error) |
| 435 | return error; | 435 | return error; |
| 436 | error = listxattr(nd.path.dentry, list, size); | 436 | error = listxattr(path.dentry, list, size); |
| 437 | path_put(&nd.path); | 437 | path_put(&path); |
| 438 | return error; | 438 | return error; |
| 439 | } | 439 | } |
| 440 | 440 | ||
| 441 | asmlinkage ssize_t | 441 | asmlinkage ssize_t |
| 442 | sys_llistxattr(const char __user *path, char __user *list, size_t size) | 442 | sys_llistxattr(const char __user *pathname, char __user *list, size_t size) |
| 443 | { | 443 | { |
| 444 | struct nameidata nd; | 444 | struct path path; |
| 445 | ssize_t error; | 445 | ssize_t error; |
| 446 | 446 | ||
| 447 | error = user_path_walk_link(path, &nd); | 447 | error = user_lpath(pathname, &path); |
| 448 | if (error) | 448 | if (error) |
| 449 | return error; | 449 | return error; |
| 450 | error = listxattr(nd.path.dentry, list, size); | 450 | error = listxattr(path.dentry, list, size); |
| 451 | path_put(&nd.path); | 451 | path_put(&path); |
| 452 | return error; | 452 | return error; |
| 453 | } | 453 | } |
| 454 | 454 | ||
| @@ -486,38 +486,38 @@ removexattr(struct dentry *d, const char __user *name) | |||
| 486 | } | 486 | } |
| 487 | 487 | ||
| 488 | asmlinkage long | 488 | asmlinkage long |
| 489 | sys_removexattr(const char __user *path, const char __user *name) | 489 | sys_removexattr(const char __user *pathname, const char __user *name) |
| 490 | { | 490 | { |
| 491 | struct nameidata nd; | 491 | struct path path; |
| 492 | int error; | 492 | int error; |
| 493 | 493 | ||
| 494 | error = user_path_walk(path, &nd); | 494 | error = user_path(pathname, &path); |
| 495 | if (error) | 495 | if (error) |
| 496 | return error; | 496 | return error; |
| 497 | error = mnt_want_write(nd.path.mnt); | 497 | error = mnt_want_write(path.mnt); |
| 498 | if (!error) { | 498 | if (!error) { |
| 499 | error = removexattr(nd.path.dentry, name); | 499 | error = removexattr(path.dentry, name); |
| 500 | mnt_drop_write(nd.path.mnt); | 500 | mnt_drop_write(path.mnt); |
| 501 | } | 501 | } |
| 502 | path_put(&nd.path); | 502 | path_put(&path); |
| 503 | return error; | 503 | return error; |
| 504 | } | 504 | } |
| 505 | 505 | ||
| 506 | asmlinkage long | 506 | asmlinkage long |
| 507 | sys_lremovexattr(const char __user *path, const char __user *name) | 507 | sys_lremovexattr(const char __user *pathname, const char __user *name) |
| 508 | { | 508 | { |
| 509 | struct nameidata nd; | 509 | struct path path; |
| 510 | int error; | 510 | int error; |
| 511 | 511 | ||
| 512 | error = user_path_walk_link(path, &nd); | 512 | error = user_lpath(pathname, &path); |
| 513 | if (error) | 513 | if (error) |
| 514 | return error; | 514 | return error; |
| 515 | error = mnt_want_write(nd.path.mnt); | 515 | error = mnt_want_write(path.mnt); |
| 516 | if (!error) { | 516 | if (!error) { |
| 517 | error = removexattr(nd.path.dentry, name); | 517 | error = removexattr(path.dentry, name); |
| 518 | mnt_drop_write(nd.path.mnt); | 518 | mnt_drop_write(path.mnt); |
| 519 | } | 519 | } |
| 520 | path_put(&nd.path); | 520 | path_put(&path); |
| 521 | return error; | 521 | return error; |
| 522 | } | 522 | } |
| 523 | 523 | ||
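The hunks above convert the setxattr/getxattr/listxattr/removexattr system calls from nameidata-based lookup to user_path()/user_lpath() plus struct path, but their user-visible behaviour is unchanged. For reference, a minimal user-space caller of this syscall family is sketched below; the path "testfile" is a placeholder and must live on a file system that supports user.* attributes:

#include <stdio.h>
#include <sys/xattr.h>

int main(void)
{
        const char *path = "testfile";  /* placeholder path */
        char value[64];
        char names[256];
        ssize_t len;

        /* Handled by sys_setxattr(): user_path() + mnt_want_write() in the kernel. */
        if (setxattr(path, "user.comment", "hello", 5, 0) != 0) {
                perror("setxattr");
                return 1;
        }

        /* Handled by sys_getxattr(): read-only, so no mnt_want_write() is taken. */
        len = getxattr(path, "user.comment", value, sizeof(value));
        if (len < 0) {
                perror("getxattr");
                return 1;
        }
        printf("user.comment = %.*s\n", (int)len, value);

        /* Handled by sys_listxattr(): returns a NUL-separated list of attribute names. */
        len = listxattr(path, names, sizeof(names));
        if (len > 0)
                printf("first attribute name: %s\n", names);
        return 0;
}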
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 36ec614e699a..737c9a425361 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile | |||
| @@ -106,7 +106,8 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \ | |||
| 106 | xfs_iops.o \ | 106 | xfs_iops.o \ |
| 107 | xfs_lrw.o \ | 107 | xfs_lrw.o \ |
| 108 | xfs_super.o \ | 108 | xfs_super.o \ |
| 109 | xfs_vnode.o) | 109 | xfs_vnode.o \ |
| 110 | xfs_xattr.o) | ||
| 110 | 111 | ||
| 111 | # Objects in support/ | 112 | # Objects in support/ |
| 112 | xfs-y += $(addprefix support/, \ | 113 | xfs-y += $(addprefix support/, \ |
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c index 9b1bb17a0501..1cd3b55ee3d2 100644 --- a/fs/xfs/linux-2.6/kmem.c +++ b/fs/xfs/linux-2.6/kmem.c | |||
| @@ -90,7 +90,7 @@ kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize, | |||
| 90 | } | 90 | } |
| 91 | 91 | ||
| 92 | void | 92 | void |
| 93 | kmem_free(void *ptr, size_t size) | 93 | kmem_free(const void *ptr) |
| 94 | { | 94 | { |
| 95 | if (!is_vmalloc_addr(ptr)) { | 95 | if (!is_vmalloc_addr(ptr)) { |
| 96 | kfree(ptr); | 96 | kfree(ptr); |
| @@ -100,7 +100,7 @@ kmem_free(void *ptr, size_t size) | |||
| 100 | } | 100 | } |
| 101 | 101 | ||
| 102 | void * | 102 | void * |
| 103 | kmem_realloc(void *ptr, size_t newsize, size_t oldsize, | 103 | kmem_realloc(const void *ptr, size_t newsize, size_t oldsize, |
| 104 | unsigned int __nocast flags) | 104 | unsigned int __nocast flags) |
| 105 | { | 105 | { |
| 106 | void *new; | 106 | void *new; |
| @@ -110,7 +110,7 @@ kmem_realloc(void *ptr, size_t newsize, size_t oldsize, | |||
| 110 | if (new) | 110 | if (new) |
| 111 | memcpy(new, ptr, | 111 | memcpy(new, ptr, |
| 112 | ((oldsize < newsize) ? oldsize : newsize)); | 112 | ((oldsize < newsize) ? oldsize : newsize)); |
| 113 | kmem_free(ptr, oldsize); | 113 | kmem_free(ptr); |
| 114 | } | 114 | } |
| 115 | return new; | 115 | return new; |
| 116 | } | 116 | } |
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h index 5e9564902976..af6843c7ee4b 100644 --- a/fs/xfs/linux-2.6/kmem.h +++ b/fs/xfs/linux-2.6/kmem.h | |||
| @@ -57,8 +57,8 @@ kmem_flags_convert(unsigned int __nocast flags) | |||
| 57 | extern void *kmem_alloc(size_t, unsigned int __nocast); | 57 | extern void *kmem_alloc(size_t, unsigned int __nocast); |
| 58 | extern void *kmem_zalloc(size_t, unsigned int __nocast); | 58 | extern void *kmem_zalloc(size_t, unsigned int __nocast); |
| 59 | extern void *kmem_zalloc_greedy(size_t *, size_t, size_t, unsigned int __nocast); | 59 | extern void *kmem_zalloc_greedy(size_t *, size_t, size_t, unsigned int __nocast); |
| 60 | extern void *kmem_realloc(void *, size_t, size_t, unsigned int __nocast); | 60 | extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast); |
| 61 | extern void kmem_free(void *, size_t); | 61 | extern void kmem_free(const void *); |
| 62 | 62 | ||
| 63 | /* | 63 | /* |
| 64 | * Zone interfaces | 64 | * Zone interfaces |
| @@ -79,7 +79,7 @@ kmem_zone_init(int size, char *zone_name) | |||
| 79 | 79 | ||
| 80 | static inline kmem_zone_t * | 80 | static inline kmem_zone_t * |
| 81 | kmem_zone_init_flags(int size, char *zone_name, unsigned long flags, | 81 | kmem_zone_init_flags(int size, char *zone_name, unsigned long flags, |
| 82 | void (*construct)(kmem_zone_t *, void *)) | 82 | void (*construct)(void *)) |
| 83 | { | 83 | { |
| 84 | return kmem_cache_create(zone_name, size, 0, flags, construct); | 84 | return kmem_cache_create(zone_name, size, 0, flags, construct); |
| 85 | } | 85 | } |
diff --git a/fs/xfs/linux-2.6/sema.h b/fs/xfs/linux-2.6/sema.h deleted file mode 100644 index 3abe7e9ceb33..000000000000 --- a/fs/xfs/linux-2.6/sema.h +++ /dev/null | |||
| @@ -1,52 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. | ||
| 3 | * All Rights Reserved. | ||
| 4 | * | ||
| 5 | * This program is free software; you can redistribute it and/or | ||
| 6 | * modify it under the terms of the GNU General Public License as | ||
| 7 | * published by the Free Software Foundation. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it would be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public License | ||
| 15 | * along with this program; if not, write the Free Software Foundation, | ||
| 16 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 17 | */ | ||
| 18 | #ifndef __XFS_SUPPORT_SEMA_H__ | ||
| 19 | #define __XFS_SUPPORT_SEMA_H__ | ||
| 20 | |||
| 21 | #include <linux/time.h> | ||
| 22 | #include <linux/wait.h> | ||
| 23 | #include <linux/semaphore.h> | ||
| 24 | #include <asm/atomic.h> | ||
| 25 | |||
| 26 | /* | ||
| 27 | * sema_t structure just maps to struct semaphore in Linux kernel. | ||
| 28 | */ | ||
| 29 | |||
| 30 | typedef struct semaphore sema_t; | ||
| 31 | |||
| 32 | #define initnsema(sp, val, name) sema_init(sp, val) | ||
| 33 | #define psema(sp, b) down(sp) | ||
| 34 | #define vsema(sp) up(sp) | ||
| 35 | #define freesema(sema) do { } while (0) | ||
| 36 | |||
| 37 | static inline int issemalocked(sema_t *sp) | ||
| 38 | { | ||
| 39 | return down_trylock(sp) || (up(sp), 0); | ||
| 40 | } | ||
| 41 | |||
| 42 | /* | ||
| 43 | * Map cpsema (try to get the sema) to down_trylock. We need to switch | ||
| 44 | * the return values since cpsema returns 1 (acquired) 0 (failed) and | ||
| 45 | * down_trylock returns the reverse 0 (acquired) 1 (failed). | ||
| 46 | */ | ||
| 47 | static inline int cpsema(sema_t *sp) | ||
| 48 | { | ||
| 49 | return down_trylock(sp) ? 0 : 1; | ||
| 50 | } | ||
| 51 | |||
| 52 | #endif /* __XFS_SUPPORT_SEMA_H__ */ | ||
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index a55c3b26d840..a44d68eb50b5 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c | |||
| @@ -73,7 +73,6 @@ xfs_page_trace( | |||
| 73 | unsigned long pgoff) | 73 | unsigned long pgoff) |
| 74 | { | 74 | { |
| 75 | xfs_inode_t *ip; | 75 | xfs_inode_t *ip; |
| 76 | bhv_vnode_t *vp = vn_from_inode(inode); | ||
| 77 | loff_t isize = i_size_read(inode); | 76 | loff_t isize = i_size_read(inode); |
| 78 | loff_t offset = page_offset(page); | 77 | loff_t offset = page_offset(page); |
| 79 | int delalloc = -1, unmapped = -1, unwritten = -1; | 78 | int delalloc = -1, unmapped = -1, unwritten = -1; |
| @@ -81,7 +80,7 @@ xfs_page_trace( | |||
| 81 | if (page_has_buffers(page)) | 80 | if (page_has_buffers(page)) |
| 82 | xfs_count_page_state(page, &delalloc, &unmapped, &unwritten); | 81 | xfs_count_page_state(page, &delalloc, &unmapped, &unwritten); |
| 83 | 82 | ||
| 84 | ip = xfs_vtoi(vp); | 83 | ip = XFS_I(inode); |
| 85 | if (!ip->i_rwtrace) | 84 | if (!ip->i_rwtrace) |
| 86 | return; | 85 | return; |
| 87 | 86 | ||
| @@ -409,7 +408,6 @@ xfs_start_buffer_writeback( | |||
| 409 | STATIC void | 408 | STATIC void |
| 410 | xfs_start_page_writeback( | 409 | xfs_start_page_writeback( |
| 411 | struct page *page, | 410 | struct page *page, |
| 412 | struct writeback_control *wbc, | ||
| 413 | int clear_dirty, | 411 | int clear_dirty, |
| 414 | int buffers) | 412 | int buffers) |
| 415 | { | 413 | { |
| @@ -676,7 +674,7 @@ xfs_probe_cluster( | |||
| 676 | } else | 674 | } else |
| 677 | pg_offset = PAGE_CACHE_SIZE; | 675 | pg_offset = PAGE_CACHE_SIZE; |
| 678 | 676 | ||
| 679 | if (page->index == tindex && !TestSetPageLocked(page)) { | 677 | if (page->index == tindex && trylock_page(page)) { |
| 680 | pg_len = xfs_probe_page(page, pg_offset, mapped); | 678 | pg_len = xfs_probe_page(page, pg_offset, mapped); |
| 681 | unlock_page(page); | 679 | unlock_page(page); |
| 682 | } | 680 | } |
| @@ -760,7 +758,7 @@ xfs_convert_page( | |||
| 760 | 758 | ||
| 761 | if (page->index != tindex) | 759 | if (page->index != tindex) |
| 762 | goto fail; | 760 | goto fail; |
| 763 | if (TestSetPageLocked(page)) | 761 | if (!trylock_page(page)) |
| 764 | goto fail; | 762 | goto fail; |
| 765 | if (PageWriteback(page)) | 763 | if (PageWriteback(page)) |
| 766 | goto fail_unlock_page; | 764 | goto fail_unlock_page; |
| @@ -858,7 +856,7 @@ xfs_convert_page( | |||
| 858 | done = 1; | 856 | done = 1; |
| 859 | } | 857 | } |
| 860 | } | 858 | } |
| 861 | xfs_start_page_writeback(page, wbc, !page_dirty, count); | 859 | xfs_start_page_writeback(page, !page_dirty, count); |
| 862 | } | 860 | } |
| 863 | 861 | ||
| 864 | return done; | 862 | return done; |
| @@ -1105,7 +1103,7 @@ xfs_page_state_convert( | |||
| 1105 | * that we are writing into for the first time. | 1103 | * that we are writing into for the first time. |
| 1106 | */ | 1104 | */ |
| 1107 | type = IOMAP_NEW; | 1105 | type = IOMAP_NEW; |
| 1108 | if (!test_and_set_bit(BH_Lock, &bh->b_state)) { | 1106 | if (trylock_buffer(bh)) { |
| 1109 | ASSERT(buffer_mapped(bh)); | 1107 | ASSERT(buffer_mapped(bh)); |
| 1110 | if (iomap_valid) | 1108 | if (iomap_valid) |
| 1111 | all_bh = 1; | 1109 | all_bh = 1; |
| @@ -1130,7 +1128,7 @@ xfs_page_state_convert( | |||
| 1130 | SetPageUptodate(page); | 1128 | SetPageUptodate(page); |
| 1131 | 1129 | ||
| 1132 | if (startio) | 1130 | if (startio) |
| 1133 | xfs_start_page_writeback(page, wbc, 1, count); | 1131 | xfs_start_page_writeback(page, 1, count); |
| 1134 | 1132 | ||
| 1135 | if (ioend && iomap_valid) { | 1133 | if (ioend && iomap_valid) { |
| 1136 | offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >> | 1134 | offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >> |
| @@ -1340,6 +1338,10 @@ __xfs_get_blocks( | |||
| 1340 | offset = (xfs_off_t)iblock << inode->i_blkbits; | 1338 | offset = (xfs_off_t)iblock << inode->i_blkbits; |
| 1341 | ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); | 1339 | ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); |
| 1342 | size = bh_result->b_size; | 1340 | size = bh_result->b_size; |
| 1341 | |||
| 1342 | if (!create && direct && offset >= i_size_read(inode)) | ||
| 1343 | return 0; | ||
| 1344 | |||
| 1343 | error = xfs_iomap(XFS_I(inode), offset, size, | 1345 | error = xfs_iomap(XFS_I(inode), offset, size, |
| 1344 | create ? flags : BMAPI_READ, &iomap, &niomap); | 1346 | create ? flags : BMAPI_READ, &iomap, &niomap); |
| 1345 | if (error) | 1347 | if (error) |
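Several hunks above replace the open-coded TestSetPageLocked()/test_and_set_bit(BH_Lock, ...) idiom with trylock_page() and trylock_buffer(): take the lock only if it is currently free, otherwise skip the page instead of sleeping. A user-space analogue of that opportunistic-locking pattern, using pthread_mutex_trylock(), is shown below; it is purely illustrative, not kernel code (build with: cc demo.c -pthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;

/* Probe the "page" only if its lock can be taken without blocking. */
static void *prober(void *unused)
{
        (void)unused;
        if (pthread_mutex_trylock(&page_lock) == 0) {
                printf("lock was free: probing the page\n");
                pthread_mutex_unlock(&page_lock);
        } else {
                printf("lock was busy: skipping the page\n");
        }
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_mutex_lock(&page_lock);         /* simulate another lock holder */
        pthread_create(&t, NULL, prober, NULL);
        pthread_join(t, NULL);                  /* prober skips: lock is busy   */
        pthread_mutex_unlock(&page_lock);

        pthread_create(&t, NULL, prober, NULL);
        pthread_join(t, NULL);                  /* prober succeeds this time    */
        return 0;
}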
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 98e0e86093b4..986061ae1b9b 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c | |||
| @@ -58,7 +58,7 @@ xfs_buf_trace( | |||
| 58 | bp, id, | 58 | bp, id, |
| 59 | (void *)(unsigned long)bp->b_flags, | 59 | (void *)(unsigned long)bp->b_flags, |
| 60 | (void *)(unsigned long)bp->b_hold.counter, | 60 | (void *)(unsigned long)bp->b_hold.counter, |
| 61 | (void *)(unsigned long)bp->b_sema.count.counter, | 61 | (void *)(unsigned long)bp->b_sema.count, |
| 62 | (void *)current, | 62 | (void *)current, |
| 63 | data, ra, | 63 | data, ra, |
| 64 | (void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff), | 64 | (void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff), |
| @@ -253,7 +253,7 @@ _xfs_buf_initialize( | |||
| 253 | 253 | ||
| 254 | memset(bp, 0, sizeof(xfs_buf_t)); | 254 | memset(bp, 0, sizeof(xfs_buf_t)); |
| 255 | atomic_set(&bp->b_hold, 1); | 255 | atomic_set(&bp->b_hold, 1); |
| 256 | init_MUTEX_LOCKED(&bp->b_iodonesema); | 256 | init_completion(&bp->b_iowait); |
| 257 | INIT_LIST_HEAD(&bp->b_list); | 257 | INIT_LIST_HEAD(&bp->b_list); |
| 258 | INIT_LIST_HEAD(&bp->b_hash_list); | 258 | INIT_LIST_HEAD(&bp->b_hash_list); |
| 259 | init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */ | 259 | init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */ |
| @@ -310,8 +310,7 @@ _xfs_buf_free_pages( | |||
| 310 | xfs_buf_t *bp) | 310 | xfs_buf_t *bp) |
| 311 | { | 311 | { |
| 312 | if (bp->b_pages != bp->b_page_array) { | 312 | if (bp->b_pages != bp->b_page_array) { |
| 313 | kmem_free(bp->b_pages, | 313 | kmem_free(bp->b_pages); |
| 314 | bp->b_page_count * sizeof(struct page *)); | ||
| 315 | } | 314 | } |
| 316 | } | 315 | } |
| 317 | 316 | ||
| @@ -839,6 +838,7 @@ xfs_buf_rele( | |||
| 839 | return; | 838 | return; |
| 840 | } | 839 | } |
| 841 | 840 | ||
| 841 | ASSERT(atomic_read(&bp->b_hold) > 0); | ||
| 842 | if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) { | 842 | if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) { |
| 843 | if (bp->b_relse) { | 843 | if (bp->b_relse) { |
| 844 | atomic_inc(&bp->b_hold); | 844 | atomic_inc(&bp->b_hold); |
| @@ -852,11 +852,6 @@ xfs_buf_rele( | |||
| 852 | spin_unlock(&hash->bh_lock); | 852 | spin_unlock(&hash->bh_lock); |
| 853 | xfs_buf_free(bp); | 853 | xfs_buf_free(bp); |
| 854 | } | 854 | } |
| 855 | } else { | ||
| 856 | /* | ||
| 857 | * Catch reference count leaks | ||
| 858 | */ | ||
| 859 | ASSERT(atomic_read(&bp->b_hold) >= 0); | ||
| 860 | } | 855 | } |
| 861 | } | 856 | } |
| 862 | 857 | ||
| @@ -1038,7 +1033,7 @@ xfs_buf_ioend( | |||
| 1038 | xfs_buf_iodone_work(&bp->b_iodone_work); | 1033 | xfs_buf_iodone_work(&bp->b_iodone_work); |
| 1039 | } | 1034 | } |
| 1040 | } else { | 1035 | } else { |
| 1041 | up(&bp->b_iodonesema); | 1036 | complete(&bp->b_iowait); |
| 1042 | } | 1037 | } |
| 1043 | } | 1038 | } |
| 1044 | 1039 | ||
| @@ -1276,7 +1271,7 @@ xfs_buf_iowait( | |||
| 1276 | XB_TRACE(bp, "iowait", 0); | 1271 | XB_TRACE(bp, "iowait", 0); |
| 1277 | if (atomic_read(&bp->b_io_remaining)) | 1272 | if (atomic_read(&bp->b_io_remaining)) |
| 1278 | blk_run_address_space(bp->b_target->bt_mapping); | 1273 | blk_run_address_space(bp->b_target->bt_mapping); |
| 1279 | down(&bp->b_iodonesema); | 1274 | wait_for_completion(&bp->b_iowait); |
| 1280 | XB_TRACE(bp, "iowaited", (long)bp->b_error); | 1275 | XB_TRACE(bp, "iowaited", (long)bp->b_error); |
| 1281 | return bp->b_error; | 1276 | return bp->b_error; |
| 1282 | } | 1277 | } |
| @@ -1398,7 +1393,7 @@ STATIC void | |||
| 1398 | xfs_free_bufhash( | 1393 | xfs_free_bufhash( |
| 1399 | xfs_buftarg_t *btp) | 1394 | xfs_buftarg_t *btp) |
| 1400 | { | 1395 | { |
| 1401 | kmem_free(btp->bt_hash, (1<<btp->bt_hashshift) * sizeof(xfs_bufhash_t)); | 1396 | kmem_free(btp->bt_hash); |
| 1402 | btp->bt_hash = NULL; | 1397 | btp->bt_hash = NULL; |
| 1403 | } | 1398 | } |
| 1404 | 1399 | ||
| @@ -1428,13 +1423,10 @@ xfs_unregister_buftarg( | |||
| 1428 | 1423 | ||
| 1429 | void | 1424 | void |
| 1430 | xfs_free_buftarg( | 1425 | xfs_free_buftarg( |
| 1431 | xfs_buftarg_t *btp, | 1426 | xfs_buftarg_t *btp) |
| 1432 | int external) | ||
| 1433 | { | 1427 | { |
| 1434 | xfs_flush_buftarg(btp, 1); | 1428 | xfs_flush_buftarg(btp, 1); |
| 1435 | xfs_blkdev_issue_flush(btp); | 1429 | xfs_blkdev_issue_flush(btp); |
| 1436 | if (external) | ||
| 1437 | xfs_blkdev_put(btp->bt_bdev); | ||
| 1438 | xfs_free_bufhash(btp); | 1430 | xfs_free_bufhash(btp); |
| 1439 | iput(btp->bt_mapping->host); | 1431 | iput(btp->bt_mapping->host); |
| 1440 | 1432 | ||
| @@ -1444,7 +1436,7 @@ xfs_free_buftarg( | |||
| 1444 | xfs_unregister_buftarg(btp); | 1436 | xfs_unregister_buftarg(btp); |
| 1445 | kthread_stop(btp->bt_task); | 1437 | kthread_stop(btp->bt_task); |
| 1446 | 1438 | ||
| 1447 | kmem_free(btp, sizeof(*btp)); | 1439 | kmem_free(btp); |
| 1448 | } | 1440 | } |
| 1449 | 1441 | ||
| 1450 | STATIC int | 1442 | STATIC int |
| @@ -1575,7 +1567,7 @@ xfs_alloc_buftarg( | |||
| 1575 | return btp; | 1567 | return btp; |
| 1576 | 1568 | ||
| 1577 | error: | 1569 | error: |
| 1578 | kmem_free(btp, sizeof(*btp)); | 1570 | kmem_free(btp); |
| 1579 | return NULL; | 1571 | return NULL; |
| 1580 | } | 1572 | } |
| 1581 | 1573 | ||
| @@ -1803,7 +1795,7 @@ int __init | |||
| 1803 | xfs_buf_init(void) | 1795 | xfs_buf_init(void) |
| 1804 | { | 1796 | { |
| 1805 | #ifdef XFS_BUF_TRACE | 1797 | #ifdef XFS_BUF_TRACE |
| 1806 | xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_SLEEP); | 1798 | xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_NOFS); |
| 1807 | #endif | 1799 | #endif |
| 1808 | 1800 | ||
| 1809 | xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf", | 1801 | xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf", |
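The xfs_buf.c hunks above all follow the same conversion: kmem_free() loses its size argument, because the underlying allocators can free by pointer alone. A minimal sketch of what such a single-argument wrapper can look like (illustrative only, with a hypothetical demo_ name; the real helper lives in fs/xfs/kmem.c and may differ in detail):

#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>

/*
 * Free memory that may have come from either kmalloc() or vmalloc().
 * Neither interface needs the allocation size, so the wrapper can take
 * only the pointer -- which is what lets the callers above drop their
 * second argument.
 */
static void demo_kmem_free(void *ptr)
{
	if (is_vmalloc_addr(ptr))
		vfree(ptr);
	else
		kfree(ptr);
}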
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index f948ec7ba9a4..fe0109956656 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h | |||
| @@ -157,7 +157,7 @@ typedef struct xfs_buf { | |||
| 157 | xfs_buf_iodone_t b_iodone; /* I/O completion function */ | 157 | xfs_buf_iodone_t b_iodone; /* I/O completion function */ |
| 158 | xfs_buf_relse_t b_relse; /* releasing function */ | 158 | xfs_buf_relse_t b_relse; /* releasing function */ |
| 159 | xfs_buf_bdstrat_t b_strat; /* pre-write function */ | 159 | xfs_buf_bdstrat_t b_strat; /* pre-write function */ |
| 160 | struct semaphore b_iodonesema; /* Semaphore for I/O waiters */ | 160 | struct completion b_iowait; /* queue for I/O waiters */ |
| 161 | void *b_fspriv; | 161 | void *b_fspriv; |
| 162 | void *b_fspriv2; | 162 | void *b_fspriv2; |
| 163 | void *b_fspriv3; | 163 | void *b_fspriv3; |
| @@ -352,7 +352,7 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *); | |||
| 352 | #define XFS_BUF_CPSEMA(bp) (xfs_buf_cond_lock(bp) == 0) | 352 | #define XFS_BUF_CPSEMA(bp) (xfs_buf_cond_lock(bp) == 0) |
| 353 | #define XFS_BUF_VSEMA(bp) xfs_buf_unlock(bp) | 353 | #define XFS_BUF_VSEMA(bp) xfs_buf_unlock(bp) |
| 354 | #define XFS_BUF_PSEMA(bp,x) xfs_buf_lock(bp) | 354 | #define XFS_BUF_PSEMA(bp,x) xfs_buf_lock(bp) |
| 355 | #define XFS_BUF_V_IODONESEMA(bp) up(&bp->b_iodonesema); | 355 | #define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait); |
| 356 | 356 | ||
| 357 | #define XFS_BUF_SET_TARGET(bp, target) ((bp)->b_target = (target)) | 357 | #define XFS_BUF_SET_TARGET(bp, target) ((bp)->b_target = (target)) |
| 358 | #define XFS_BUF_TARGET(bp) ((bp)->b_target) | 358 | #define XFS_BUF_TARGET(bp) ((bp)->b_target) |
| @@ -429,7 +429,7 @@ static inline void xfs_bdwrite(void *mp, xfs_buf_t *bp) | |||
| 429 | * Handling of buftargs. | 429 | * Handling of buftargs. |
| 430 | */ | 430 | */ |
| 431 | extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int); | 431 | extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int); |
| 432 | extern void xfs_free_buftarg(xfs_buftarg_t *, int); | 432 | extern void xfs_free_buftarg(xfs_buftarg_t *); |
| 433 | extern void xfs_wait_buftarg(xfs_buftarg_t *); | 433 | extern void xfs_wait_buftarg(xfs_buftarg_t *); |
| 434 | extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); | 434 | extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); |
| 435 | extern int xfs_flush_buftarg(xfs_buftarg_t *, int); | 435 | extern int xfs_flush_buftarg(xfs_buftarg_t *, int); |
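The b_iodonesema to b_iowait change above replaces a semaphore used as a wait-for-event primitive with a struct completion, the idiomatic way to express "one side signals done, the other sleeps until then". A self-contained sketch of that pattern, using hypothetical demo_ names rather than the real xfs_buf layout:

#include <linux/completion.h>

/* One structure per in-flight buffer; hypothetical demo_ names. */
struct demo_buf {
	struct completion iowait;	/* signalled when all I/O has finished */
};

static void demo_buf_init(struct demo_buf *bp)
{
	init_completion(&bp->iowait);
}

/* I/O completion side: wake whoever is sleeping in demo_buf_iowait(). */
static void demo_buf_ioend(struct demo_buf *bp)
{
	complete(&bp->iowait);
}

/* Submitter side: block until the completion handler has fired. */
static void demo_buf_iowait(struct demo_buf *bp)
{
	wait_for_completion(&bp->iowait);
}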
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c index c672b3238b14..24fd598af846 100644 --- a/fs/xfs/linux-2.6/xfs_export.c +++ b/fs/xfs/linux-2.6/xfs_export.c | |||
| @@ -139,7 +139,7 @@ xfs_nfs_get_inode( | |||
| 139 | } | 139 | } |
| 140 | 140 | ||
| 141 | xfs_iunlock(ip, XFS_ILOCK_SHARED); | 141 | xfs_iunlock(ip, XFS_ILOCK_SHARED); |
| 142 | return ip->i_vnode; | 142 | return VFS_I(ip); |
| 143 | } | 143 | } |
| 144 | 144 | ||
| 145 | STATIC struct dentry * | 145 | STATIC struct dentry * |
| @@ -167,7 +167,7 @@ xfs_fs_fh_to_dentry(struct super_block *sb, struct fid *fid, | |||
| 167 | if (!inode) | 167 | if (!inode) |
| 168 | return NULL; | 168 | return NULL; |
| 169 | if (IS_ERR(inode)) | 169 | if (IS_ERR(inode)) |
| 170 | return ERR_PTR(PTR_ERR(inode)); | 170 | return ERR_CAST(inode); |
| 171 | result = d_alloc_anon(inode); | 171 | result = d_alloc_anon(inode); |
| 172 | if (!result) { | 172 | if (!result) { |
| 173 | iput(inode); | 173 | iput(inode); |
| @@ -198,7 +198,7 @@ xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid, | |||
| 198 | if (!inode) | 198 | if (!inode) |
| 199 | return NULL; | 199 | return NULL; |
| 200 | if (IS_ERR(inode)) | 200 | if (IS_ERR(inode)) |
| 201 | return ERR_PTR(PTR_ERR(inode)); | 201 | return ERR_CAST(inode); |
| 202 | result = d_alloc_anon(inode); | 202 | result = d_alloc_anon(inode); |
| 203 | if (!result) { | 203 | if (!result) { |
| 204 | iput(inode); | 204 | iput(inode); |
| @@ -215,13 +215,13 @@ xfs_fs_get_parent( | |||
| 215 | struct xfs_inode *cip; | 215 | struct xfs_inode *cip; |
| 216 | struct dentry *parent; | 216 | struct dentry *parent; |
| 217 | 217 | ||
| 218 | error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip); | 218 | error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip, NULL); |
| 219 | if (unlikely(error)) | 219 | if (unlikely(error)) |
| 220 | return ERR_PTR(-error); | 220 | return ERR_PTR(-error); |
| 221 | 221 | ||
| 222 | parent = d_alloc_anon(cip->i_vnode); | 222 | parent = d_alloc_anon(VFS_I(cip)); |
| 223 | if (unlikely(!parent)) { | 223 | if (unlikely(!parent)) { |
| 224 | iput(cip->i_vnode); | 224 | iput(VFS_I(cip)); |
| 225 | return ERR_PTR(-ENOMEM); | 225 | return ERR_PTR(-ENOMEM); |
| 226 | } | 226 | } |
| 227 | return parent; | 227 | return parent; |
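ERR_CAST(), used above in place of ERR_PTR(PTR_ERR(...)), simply re-types an error-carrying pointer without losing the encoded errno. A small illustrative use, with a hypothetical function name:

#include <linux/err.h>
#include <linux/fs.h>

/* Hypothetical lookup helper: hand an error pointer through unchanged. */
static struct dentry *demo_fh_to_dentry(struct inode *inode)
{
	if (IS_ERR(inode))
		return ERR_CAST(inode);	/* keeps the errno, changes the type */
	/* ... normal handling would go here ... */
	return NULL;
}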
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 5f60363b9343..5311c1acdd40 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c | |||
| @@ -475,6 +475,7 @@ const struct file_operations xfs_invis_file_operations = { | |||
| 475 | const struct file_operations xfs_dir_file_operations = { | 475 | const struct file_operations xfs_dir_file_operations = { |
| 476 | .read = generic_read_dir, | 476 | .read = generic_read_dir, |
| 477 | .readdir = xfs_file_readdir, | 477 | .readdir = xfs_file_readdir, |
| 478 | .llseek = generic_file_llseek, | ||
| 478 | .unlocked_ioctl = xfs_file_ioctl, | 479 | .unlocked_ioctl = xfs_file_ioctl, |
| 479 | #ifdef CONFIG_COMPAT | 480 | #ifdef CONFIG_COMPAT |
| 480 | .compat_ioctl = xfs_file_compat_ioctl, | 481 | .compat_ioctl = xfs_file_compat_ioctl, |
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c index 1eefe61f0e10..36caa6d957df 100644 --- a/fs/xfs/linux-2.6/xfs_fs_subr.c +++ b/fs/xfs/linux-2.6/xfs_fs_subr.c | |||
| @@ -31,7 +31,7 @@ xfs_tosspages( | |||
| 31 | xfs_off_t last, | 31 | xfs_off_t last, |
| 32 | int fiopt) | 32 | int fiopt) |
| 33 | { | 33 | { |
| 34 | struct address_space *mapping = ip->i_vnode->i_mapping; | 34 | struct address_space *mapping = VFS_I(ip)->i_mapping; |
| 35 | 35 | ||
| 36 | if (mapping->nrpages) | 36 | if (mapping->nrpages) |
| 37 | truncate_inode_pages(mapping, first); | 37 | truncate_inode_pages(mapping, first); |
| @@ -44,7 +44,7 @@ xfs_flushinval_pages( | |||
| 44 | xfs_off_t last, | 44 | xfs_off_t last, |
| 45 | int fiopt) | 45 | int fiopt) |
| 46 | { | 46 | { |
| 47 | struct address_space *mapping = ip->i_vnode->i_mapping; | 47 | struct address_space *mapping = VFS_I(ip)->i_mapping; |
| 48 | int ret = 0; | 48 | int ret = 0; |
| 49 | 49 | ||
| 50 | if (mapping->nrpages) { | 50 | if (mapping->nrpages) { |
| @@ -64,7 +64,7 @@ xfs_flush_pages( | |||
| 64 | uint64_t flags, | 64 | uint64_t flags, |
| 65 | int fiopt) | 65 | int fiopt) |
| 66 | { | 66 | { |
| 67 | struct address_space *mapping = ip->i_vnode->i_mapping; | 67 | struct address_space *mapping = VFS_I(ip)->i_mapping; |
| 68 | int ret = 0; | 68 | int ret = 0; |
| 69 | int ret2; | 69 | int ret2; |
| 70 | 70 | ||
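The repeated ip->i_vnode to VFS_I(ip) substitution replaces direct field access with an accessor for the VFS inode associated with an XFS inode. A sketch of what such an accessor typically looks like, using made-up demo types rather than the real xfs_inode layout:

#include <linux/fs.h>

/*
 * Hypothetical demo types: the filesystem-private inode carries a pointer
 * back to the generic VFS inode, and everything that needs the page cache,
 * timestamps, etc. goes through a single accessor instead of touching the
 * field directly.
 */
struct demo_fs_inode {
	struct inode	*vnode;		/* associated VFS inode */
	/* ... filesystem-private fields ... */
};

static inline struct inode *DEMO_VFS_I(struct demo_fs_inode *ip)
{
	return ip->vnode;
}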
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index a42ba9d71156..48799ba7e3e6 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c | |||
| @@ -48,6 +48,8 @@ | |||
| 48 | #include "xfs_dfrag.h" | 48 | #include "xfs_dfrag.h" |
| 49 | #include "xfs_fsops.h" | 49 | #include "xfs_fsops.h" |
| 50 | #include "xfs_vnodeops.h" | 50 | #include "xfs_vnodeops.h" |
| 51 | #include "xfs_quota.h" | ||
| 52 | #include "xfs_inode_item.h" | ||
| 51 | 53 | ||
| 52 | #include <linux/capability.h> | 54 | #include <linux/capability.h> |
| 53 | #include <linux/dcache.h> | 55 | #include <linux/dcache.h> |
| @@ -84,17 +86,15 @@ xfs_find_handle( | |||
| 84 | switch (cmd) { | 86 | switch (cmd) { |
| 85 | case XFS_IOC_PATH_TO_FSHANDLE: | 87 | case XFS_IOC_PATH_TO_FSHANDLE: |
| 86 | case XFS_IOC_PATH_TO_HANDLE: { | 88 | case XFS_IOC_PATH_TO_HANDLE: { |
| 87 | struct nameidata nd; | 89 | struct path path; |
| 88 | int error; | 90 | int error = user_lpath((const char __user *)hreq.path, &path); |
| 89 | |||
| 90 | error = user_path_walk_link((const char __user *)hreq.path, &nd); | ||
| 91 | if (error) | 91 | if (error) |
| 92 | return error; | 92 | return error; |
| 93 | 93 | ||
| 94 | ASSERT(nd.path.dentry); | 94 | ASSERT(path.dentry); |
| 95 | ASSERT(nd.path.dentry->d_inode); | 95 | ASSERT(path.dentry->d_inode); |
| 96 | inode = igrab(nd.path.dentry->d_inode); | 96 | inode = igrab(path.dentry->d_inode); |
| 97 | path_put(&nd.path); | 97 | path_put(&path); |
| 98 | break; | 98 | break; |
| 99 | } | 99 | } |
| 100 | 100 | ||
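The hunk above swaps the nameidata-based user_path_walk_link() for user_lpath(), which resolves a user-supplied path straight into a struct path without following a trailing symlink. A hedged sketch of the resulting lookup/igrab/path_put sequence, with a hypothetical helper name (assuming the 2.6.2x-era user_lpath() interface):

#include <linux/namei.h>
#include <linux/path.h>
#include <linux/fs.h>
#include <linux/err.h>

/*
 * Hypothetical helper mirroring the sequence above: resolve a user path
 * without following a trailing symlink, pin the inode, then drop the
 * lookup reference again.
 */
static struct inode *demo_igrab_user_path(const char __user *name)
{
	struct path path;
	struct inode *inode;
	int error;

	error = user_lpath(name, &path);	/* fills path.mnt / path.dentry */
	if (error)
		return ERR_PTR(error);

	inode = igrab(path.dentry->d_inode);
	path_put(&path);			/* release the lookup reference */
	return inode ? inode : ERR_PTR(-ESTALE);
}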
| @@ -245,7 +245,7 @@ xfs_vget_fsop_handlereq( | |||
| 245 | 245 | ||
| 246 | xfs_iunlock(ip, XFS_ILOCK_SHARED); | 246 | xfs_iunlock(ip, XFS_ILOCK_SHARED); |
| 247 | 247 | ||
| 248 | *inode = XFS_ITOV(ip); | 248 | *inode = VFS_I(ip); |
| 249 | return 0; | 249 | return 0; |
| 250 | } | 250 | } |
| 251 | 251 | ||
| @@ -470,6 +470,12 @@ xfs_attrlist_by_handle( | |||
| 470 | if (al_hreq.buflen > XATTR_LIST_MAX) | 470 | if (al_hreq.buflen > XATTR_LIST_MAX) |
| 471 | return -XFS_ERROR(EINVAL); | 471 | return -XFS_ERROR(EINVAL); |
| 472 | 472 | ||
| 473 | /* | ||
| 474 | * Reject flags, only allow namespaces. | ||
| 475 | */ | ||
| 476 | if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) | ||
| 477 | return -XFS_ERROR(EINVAL); | ||
| 478 | |||
| 473 | error = xfs_vget_fsop_handlereq(mp, parinode, &al_hreq.hreq, &inode); | 479 | error = xfs_vget_fsop_handlereq(mp, parinode, &al_hreq.hreq, &inode); |
| 474 | if (error) | 480 | if (error) |
| 475 | goto out; | 481 | goto out; |
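The new check above is the usual "whitelist the flags that may come from user space" pattern: any bit outside the known namespace selectors is rejected before the handle is even resolved. In generic form, with hypothetical DEMO_ constants standing in for the real attr namespace bits:

#include <linux/errno.h>

/* Hypothetical DEMO_ constants standing in for the attr namespace bits. */
#define DEMO_ATTR_ROOT		0x0001
#define DEMO_ATTR_SECURE	0x0002

/* Reject anything that is not a known namespace selector. */
static int demo_check_attr_flags(unsigned int flags)
{
	if (flags & ~(DEMO_ATTR_ROOT | DEMO_ATTR_SECURE))
		return -EINVAL;
	return 0;
}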
| @@ -589,7 +595,7 @@ xfs_attrmulti_by_handle( | |||
| 589 | goto out; | 595 | goto out; |
| 590 | 596 | ||
| 591 | error = E2BIG; | 597 | error = E2BIG; |
| 592 | size = am_hreq.opcount * sizeof(attr_multiop_t); | 598 | size = am_hreq.opcount * sizeof(xfs_attr_multiop_t); |
| 593 | if (!size || size > 16 * PAGE_SIZE) | 599 | if (!size || size > 16 * PAGE_SIZE) |
| 594 | goto out_vn_rele; | 600 | goto out_vn_rele; |
| 595 | 601 | ||
| @@ -682,9 +688,9 @@ xfs_ioc_space( | |||
| 682 | return -XFS_ERROR(EFAULT); | 688 | return -XFS_ERROR(EFAULT); |
| 683 | 689 | ||
| 684 | if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) | 690 | if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) |
| 685 | attr_flags |= ATTR_NONBLOCK; | 691 | attr_flags |= XFS_ATTR_NONBLOCK; |
| 686 | if (ioflags & IO_INVIS) | 692 | if (ioflags & IO_INVIS) |
| 687 | attr_flags |= ATTR_DMI; | 693 | attr_flags |= XFS_ATTR_DMI; |
| 688 | 694 | ||
| 689 | error = xfs_change_file_space(ip, cmd, &bf, filp->f_pos, | 695 | error = xfs_change_file_space(ip, cmd, &bf, filp->f_pos, |
| 690 | NULL, attr_flags); | 696 | NULL, attr_flags); |
| @@ -875,6 +881,322 @@ xfs_ioc_fsgetxattr( | |||
| 875 | return 0; | 881 | return 0; |
| 876 | } | 882 | } |
| 877 | 883 | ||
| 884 | STATIC void | ||
| 885 | xfs_set_diflags( | ||
| 886 | struct xfs_inode *ip, | ||
| 887 | unsigned int xflags) | ||
| 888 | { | ||
| 889 | unsigned int di_flags; | ||
| 890 | |||
| 891 | /* can't set PREALLOC this way, just preserve it */ | ||
| 892 | di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); | ||
| 893 | if (xflags & XFS_XFLAG_IMMUTABLE) | ||
| 894 | di_flags |= XFS_DIFLAG_IMMUTABLE; | ||
| 895 | if (xflags & XFS_XFLAG_APPEND) | ||
| 896 | di_flags |= XFS_DIFLAG_APPEND; | ||
| 897 | if (xflags & XFS_XFLAG_SYNC) | ||
| 898 | di_flags |= XFS_DIFLAG_SYNC; | ||
| 899 | if (xflags & XFS_XFLAG_NOATIME) | ||
| 900 | di_flags |= XFS_DIFLAG_NOATIME; | ||
| 901 | if (xflags & XFS_XFLAG_NODUMP) | ||
| 902 | di_flags |= XFS_DIFLAG_NODUMP; | ||
| 903 | if (xflags & XFS_XFLAG_PROJINHERIT) | ||
| 904 | di_flags |= XFS_DIFLAG_PROJINHERIT; | ||
| 905 | if (xflags & XFS_XFLAG_NODEFRAG) | ||
| 906 | di_flags |= XFS_DIFLAG_NODEFRAG; | ||
| 907 | if (xflags & XFS_XFLAG_FILESTREAM) | ||
| 908 | di_flags |= XFS_DIFLAG_FILESTREAM; | ||
| 909 | if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { | ||
| 910 | if (xflags & XFS_XFLAG_RTINHERIT) | ||
| 911 | di_flags |= XFS_DIFLAG_RTINHERIT; | ||
| 912 | if (xflags & XFS_XFLAG_NOSYMLINKS) | ||
| 913 | di_flags |= XFS_DIFLAG_NOSYMLINKS; | ||
| 914 | if (xflags & XFS_XFLAG_EXTSZINHERIT) | ||
| 915 | di_flags |= XFS_DIFLAG_EXTSZINHERIT; | ||
| 916 | } else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { | ||
| 917 | if (xflags & XFS_XFLAG_REALTIME) | ||
| 918 | di_flags |= XFS_DIFLAG_REALTIME; | ||
| 919 | if (xflags & XFS_XFLAG_EXTSIZE) | ||
| 920 | di_flags |= XFS_DIFLAG_EXTSIZE; | ||
| 921 | } | ||
| 922 | |||
| 923 | ip->i_d.di_flags = di_flags; | ||
| 924 | } | ||
| 925 | |||
| 926 | STATIC void | ||
| 927 | xfs_diflags_to_linux( | ||
| 928 | struct xfs_inode *ip) | ||
| 929 | { | ||
| 930 | struct inode *inode = VFS_I(ip); | ||
| 931 | unsigned int xflags = xfs_ip2xflags(ip); | ||
| 932 | |||
| 933 | if (xflags & XFS_XFLAG_IMMUTABLE) | ||
| 934 | inode->i_flags |= S_IMMUTABLE; | ||
| 935 | else | ||
| 936 | inode->i_flags &= ~S_IMMUTABLE; | ||
| 937 | if (xflags & XFS_XFLAG_APPEND) | ||
| 938 | inode->i_flags |= S_APPEND; | ||
| 939 | else | ||
| 940 | inode->i_flags &= ~S_APPEND; | ||
| 941 | if (xflags & XFS_XFLAG_SYNC) | ||
| 942 | inode->i_flags |= S_SYNC; | ||
| 943 | else | ||
| 944 | inode->i_flags &= ~S_SYNC; | ||
| 945 | if (xflags & XFS_XFLAG_NOATIME) | ||
| 946 | inode->i_flags |= S_NOATIME; | ||
| 947 | else | ||
| 948 | inode->i_flags &= ~S_NOATIME; | ||
| 949 | } | ||
| 950 | |||
| 951 | #define FSX_PROJID 1 | ||
| 952 | #define FSX_EXTSIZE 2 | ||
| 953 | #define FSX_XFLAGS 4 | ||
| 954 | #define FSX_NONBLOCK 8 | ||
| 955 | |||
| 956 | STATIC int | ||
| 957 | xfs_ioctl_setattr( | ||
| 958 | xfs_inode_t *ip, | ||
| 959 | struct fsxattr *fa, | ||
| 960 | int mask) | ||
| 961 | { | ||
| 962 | struct xfs_mount *mp = ip->i_mount; | ||
| 963 | struct xfs_trans *tp; | ||
| 964 | unsigned int lock_flags = 0; | ||
| 965 | struct xfs_dquot *udqp = NULL, *gdqp = NULL; | ||
| 966 | struct xfs_dquot *olddquot = NULL; | ||
| 967 | int code; | ||
| 968 | |||
| 969 | xfs_itrace_entry(ip); | ||
| 970 | |||
| 971 | if (mp->m_flags & XFS_MOUNT_RDONLY) | ||
| 972 | return XFS_ERROR(EROFS); | ||
| 973 | if (XFS_FORCED_SHUTDOWN(mp)) | ||
| 974 | return XFS_ERROR(EIO); | ||
| 975 | |||
| 976 | /* | ||
| 977 | * If disk quotas is on, we make sure that the dquots do exist on disk, | ||
| 978 | * before we start any other transactions. Trying to do this later | ||
| 979 | * is messy. We don't care to take a readlock to look at the ids | ||
| 980 | * in inode here, because we can't hold it across the trans_reserve. | ||
| 981 | * If the IDs do change before we take the ilock, we're covered | ||
| 982 | * because the i_*dquot fields will get updated anyway. | ||
| 983 | */ | ||
| 984 | if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) { | ||
| 985 | code = XFS_QM_DQVOPALLOC(mp, ip, ip->i_d.di_uid, | ||
| 986 | ip->i_d.di_gid, fa->fsx_projid, | ||
| 987 | XFS_QMOPT_PQUOTA, &udqp, &gdqp); | ||
| 988 | if (code) | ||
| 989 | return code; | ||
| 990 | } | ||
| 991 | |||
| 992 | /* | ||
| 993 | * For the other attributes, we acquire the inode lock and | ||
| 994 | * first do an error checking pass. | ||
| 995 | */ | ||
| 996 | tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); | ||
| 997 | code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); | ||
| 998 | if (code) | ||
| 999 | goto error_return; | ||
| 1000 | |||
| 1001 | lock_flags = XFS_ILOCK_EXCL; | ||
| 1002 | xfs_ilock(ip, lock_flags); | ||
| 1003 | |||
| 1004 | /* | ||
| 1005 | * CAP_FOWNER overrides the following restrictions: | ||
| 1006 | * | ||
| 1007 | * The user ID of the calling process must be equal | ||
| 1008 | * to the file owner ID, except in cases where the | ||
| 1009 | * CAP_FSETID capability is applicable. | ||
| 1010 | */ | ||
| 1011 | if (current->fsuid != ip->i_d.di_uid && !capable(CAP_FOWNER)) { | ||
| 1012 | code = XFS_ERROR(EPERM); | ||
| 1013 | goto error_return; | ||
| 1014 | } | ||
| 1015 | |||
| 1016 | /* | ||
| 1017 | * Do a quota reservation only if projid is actually going to change. | ||
| 1018 | */ | ||
| 1019 | if (mask & FSX_PROJID) { | ||
| 1020 | if (XFS_IS_PQUOTA_ON(mp) && | ||
| 1021 | ip->i_d.di_projid != fa->fsx_projid) { | ||
| 1022 | ASSERT(tp); | ||
| 1023 | code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp, | ||
| 1024 | capable(CAP_FOWNER) ? | ||
| 1025 | XFS_QMOPT_FORCE_RES : 0); | ||
| 1026 | if (code) /* out of quota */ | ||
| 1027 | goto error_return; | ||
| 1028 | } | ||
| 1029 | } | ||
| 1030 | |||
| 1031 | if (mask & FSX_EXTSIZE) { | ||
| 1032 | /* | ||
| 1033 | * Can't change extent size if any extents are allocated. | ||
| 1034 | */ | ||
| 1035 | if (ip->i_d.di_nextents && | ||
| 1036 | ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != | ||
| 1037 | fa->fsx_extsize)) { | ||
| 1038 | code = XFS_ERROR(EINVAL); /* EFBIG? */ | ||
| 1039 | goto error_return; | ||
| 1040 | } | ||
| 1041 | |||
| 1042 | /* | ||
| 1043 | * Extent size must be a multiple of the appropriate block | ||
| 1044 | * size, if set at all. | ||
| 1045 | */ | ||
| 1046 | if (fa->fsx_extsize != 0) { | ||
| 1047 | xfs_extlen_t size; | ||
| 1048 | |||
| 1049 | if (XFS_IS_REALTIME_INODE(ip) || | ||
| 1050 | ((mask & FSX_XFLAGS) && | ||
| 1051 | (fa->fsx_xflags & XFS_XFLAG_REALTIME))) { | ||
| 1052 | size = mp->m_sb.sb_rextsize << | ||
| 1053 | mp->m_sb.sb_blocklog; | ||
| 1054 | } else { | ||
| 1055 | size = mp->m_sb.sb_blocksize; | ||
| 1056 | } | ||
| 1057 | |||
| 1058 | if (fa->fsx_extsize % size) { | ||
| 1059 | code = XFS_ERROR(EINVAL); | ||
| 1060 | goto error_return; | ||
| 1061 | } | ||
| 1062 | } | ||
| 1063 | } | ||
| 1064 | |||
| 1065 | |||
| 1066 | if (mask & FSX_XFLAGS) { | ||
| 1067 | /* | ||
| 1068 | * Can't change realtime flag if any extents are allocated. | ||
| 1069 | */ | ||
| 1070 | if ((ip->i_d.di_nextents || ip->i_delayed_blks) && | ||
| 1071 | (XFS_IS_REALTIME_INODE(ip)) != | ||
| 1072 | (fa->fsx_xflags & XFS_XFLAG_REALTIME)) { | ||
| 1073 | code = XFS_ERROR(EINVAL); /* EFBIG? */ | ||
| 1074 | goto error_return; | ||
| 1075 | } | ||
| 1076 | |||
| 1077 | /* | ||
| 1078 | * If realtime flag is set then must have realtime data. | ||
| 1079 | */ | ||
| 1080 | if ((fa->fsx_xflags & XFS_XFLAG_REALTIME)) { | ||
| 1081 | if ((mp->m_sb.sb_rblocks == 0) || | ||
| 1082 | (mp->m_sb.sb_rextsize == 0) || | ||
| 1083 | (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) { | ||
| 1084 | code = XFS_ERROR(EINVAL); | ||
| 1085 | goto error_return; | ||
| 1086 | } | ||
| 1087 | } | ||
| 1088 | |||
| 1089 | /* | ||
| 1090 | * Can't modify an immutable/append-only file unless | ||
| 1091 | * we have appropriate permission. | ||
| 1092 | */ | ||
| 1093 | if ((ip->i_d.di_flags & | ||
| 1094 | (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) || | ||
| 1095 | (fa->fsx_xflags & | ||
| 1096 | (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) && | ||
| 1097 | !capable(CAP_LINUX_IMMUTABLE)) { | ||
| 1098 | code = XFS_ERROR(EPERM); | ||
| 1099 | goto error_return; | ||
| 1100 | } | ||
| 1101 | } | ||
| 1102 | |||
| 1103 | xfs_trans_ijoin(tp, ip, lock_flags); | ||
| 1104 | xfs_trans_ihold(tp, ip); | ||
| 1105 | |||
| 1106 | /* | ||
| 1107 | * Change file ownership. Must be the owner or privileged. | ||
| 1108 | * If the system was configured with the "restricted_chown" | ||
| 1109 | * option, the owner is not permitted to give away the file, | ||
| 1110 | * and can change the group id only to a group of which he | ||
| 1111 | * or she is a member. | ||
| 1112 | */ | ||
| 1113 | if (mask & FSX_PROJID) { | ||
| 1114 | /* | ||
| 1115 | * CAP_FSETID overrides the following restrictions: | ||
| 1116 | * | ||
| 1117 | * The set-user-ID and set-group-ID bits of a file will be | ||
| 1118 | * cleared upon successful return from chown() | ||
| 1119 | */ | ||
| 1120 | if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && | ||
| 1121 | !capable(CAP_FSETID)) | ||
| 1122 | ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); | ||
| 1123 | |||
| 1124 | /* | ||
| 1125 | * Change the ownerships and register quota modifications | ||
| 1126 | * in the transaction. | ||
| 1127 | */ | ||
| 1128 | if (ip->i_d.di_projid != fa->fsx_projid) { | ||
| 1129 | if (XFS_IS_PQUOTA_ON(mp)) { | ||
| 1130 | olddquot = XFS_QM_DQVOPCHOWN(mp, tp, ip, | ||
| 1131 | &ip->i_gdquot, gdqp); | ||
| 1132 | } | ||
| 1133 | ip->i_d.di_projid = fa->fsx_projid; | ||
| 1134 | |||
| 1135 | /* | ||
| 1136 | * We may have to rev the inode as well as | ||
| 1137 | * the superblock version number since projids didn't | ||
| 1138 | * exist before DINODE_VERSION_2 and SB_VERSION_NLINK. | ||
| 1139 | */ | ||
| 1140 | if (ip->i_d.di_version == XFS_DINODE_VERSION_1) | ||
| 1141 | xfs_bump_ino_vers2(tp, ip); | ||
| 1142 | } | ||
| 1143 | |||
| 1144 | } | ||
| 1145 | |||
| 1146 | if (mask & FSX_EXTSIZE) | ||
| 1147 | ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog; | ||
| 1148 | if (mask & FSX_XFLAGS) { | ||
| 1149 | xfs_set_diflags(ip, fa->fsx_xflags); | ||
| 1150 | xfs_diflags_to_linux(ip); | ||
| 1151 | } | ||
| 1152 | |||
| 1153 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); | ||
| 1154 | xfs_ichgtime(ip, XFS_ICHGTIME_CHG); | ||
| 1155 | |||
| 1156 | XFS_STATS_INC(xs_ig_attrchg); | ||
| 1157 | |||
| 1158 | /* | ||
| 1159 | * If this is a synchronous mount, make sure that the | ||
| 1160 | * transaction goes to disk before returning to the user. | ||
| 1161 | * This is slightly sub-optimal in that truncates require | ||
| 1162 | * two sync transactions instead of one for wsync filesystems. | ||
| 1163 | * One for the truncate and one for the timestamps since we | ||
| 1164 | * don't want to change the timestamps unless we're sure the | ||
| 1165 | * truncate worked. Truncates are less than 1% of the laddis | ||
| 1166 | * mix so this probably isn't worth the trouble to optimize. | ||
| 1167 | */ | ||
| 1168 | if (mp->m_flags & XFS_MOUNT_WSYNC) | ||
| 1169 | xfs_trans_set_sync(tp); | ||
| 1170 | code = xfs_trans_commit(tp, 0); | ||
| 1171 | xfs_iunlock(ip, lock_flags); | ||
| 1172 | |||
| 1173 | /* | ||
| 1174 | * Release any dquot(s) the inode had kept before chown. | ||
| 1175 | */ | ||
| 1176 | XFS_QM_DQRELE(mp, olddquot); | ||
| 1177 | XFS_QM_DQRELE(mp, udqp); | ||
| 1178 | XFS_QM_DQRELE(mp, gdqp); | ||
| 1179 | |||
| 1180 | if (code) | ||
| 1181 | return code; | ||
| 1182 | |||
| 1183 | if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE)) { | ||
| 1184 | XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL, | ||
| 1185 | NULL, DM_RIGHT_NULL, NULL, NULL, 0, 0, | ||
| 1186 | (mask & FSX_NONBLOCK) ? DM_FLAGS_NDELAY : 0); | ||
| 1187 | } | ||
| 1188 | |||
| 1189 | return 0; | ||
| 1190 | |||
| 1191 | error_return: | ||
| 1192 | XFS_QM_DQRELE(mp, udqp); | ||
| 1193 | XFS_QM_DQRELE(mp, gdqp); | ||
| 1194 | xfs_trans_cancel(tp, 0); | ||
| 1195 | if (lock_flags) | ||
| 1196 | xfs_iunlock(ip, lock_flags); | ||
| 1197 | return code; | ||
| 1198 | } | ||
| 1199 | |||
| 878 | STATIC int | 1200 | STATIC int |
| 879 | xfs_ioc_fssetxattr( | 1201 | xfs_ioc_fssetxattr( |
| 880 | xfs_inode_t *ip, | 1202 | xfs_inode_t *ip, |
| @@ -882,31 +1204,16 @@ xfs_ioc_fssetxattr( | |||
| 882 | void __user *arg) | 1204 | void __user *arg) |
| 883 | { | 1205 | { |
| 884 | struct fsxattr fa; | 1206 | struct fsxattr fa; |
| 885 | struct bhv_vattr *vattr; | 1207 | unsigned int mask; |
| 886 | int error; | ||
| 887 | int attr_flags; | ||
| 888 | 1208 | ||
| 889 | if (copy_from_user(&fa, arg, sizeof(fa))) | 1209 | if (copy_from_user(&fa, arg, sizeof(fa))) |
| 890 | return -EFAULT; | 1210 | return -EFAULT; |
| 891 | 1211 | ||
| 892 | vattr = kmalloc(sizeof(*vattr), GFP_KERNEL); | 1212 | mask = FSX_XFLAGS | FSX_EXTSIZE | FSX_PROJID; |
| 893 | if (unlikely(!vattr)) | ||
| 894 | return -ENOMEM; | ||
| 895 | |||
| 896 | attr_flags = 0; | ||
| 897 | if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) | 1213 | if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) |
| 898 | attr_flags |= ATTR_NONBLOCK; | 1214 | mask |= FSX_NONBLOCK; |
| 899 | |||
| 900 | vattr->va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | XFS_AT_PROJID; | ||
| 901 | vattr->va_xflags = fa.fsx_xflags; | ||
| 902 | vattr->va_extsize = fa.fsx_extsize; | ||
| 903 | vattr->va_projid = fa.fsx_projid; | ||
| 904 | 1215 | ||
| 905 | error = -xfs_setattr(ip, vattr, attr_flags, NULL); | 1216 | return -xfs_ioctl_setattr(ip, &fa, mask); |
| 906 | if (!error) | ||
| 907 | vn_revalidate(XFS_ITOV(ip)); /* update flags */ | ||
| 908 | kfree(vattr); | ||
| 909 | return 0; | ||
| 910 | } | 1217 | } |
| 911 | 1218 | ||
| 912 | STATIC int | 1219 | STATIC int |
| @@ -928,10 +1235,9 @@ xfs_ioc_setxflags( | |||
| 928 | struct file *filp, | 1235 | struct file *filp, |
| 929 | void __user *arg) | 1236 | void __user *arg) |
| 930 | { | 1237 | { |
| 931 | struct bhv_vattr *vattr; | 1238 | struct fsxattr fa; |
| 932 | unsigned int flags; | 1239 | unsigned int flags; |
| 933 | int attr_flags; | 1240 | unsigned int mask; |
| 934 | int error; | ||
| 935 | 1241 | ||
| 936 | if (copy_from_user(&flags, arg, sizeof(flags))) | 1242 | if (copy_from_user(&flags, arg, sizeof(flags))) |
| 937 | return -EFAULT; | 1243 | return -EFAULT; |
| @@ -941,22 +1247,12 @@ xfs_ioc_setxflags( | |||
| 941 | FS_SYNC_FL)) | 1247 | FS_SYNC_FL)) |
| 942 | return -EOPNOTSUPP; | 1248 | return -EOPNOTSUPP; |
| 943 | 1249 | ||
| 944 | vattr = kmalloc(sizeof(*vattr), GFP_KERNEL); | 1250 | mask = FSX_XFLAGS; |
| 945 | if (unlikely(!vattr)) | ||
| 946 | return -ENOMEM; | ||
| 947 | |||
| 948 | attr_flags = 0; | ||
| 949 | if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) | 1251 | if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) |
| 950 | attr_flags |= ATTR_NONBLOCK; | 1252 | mask |= FSX_NONBLOCK; |
| 951 | 1253 | fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip)); | |
| 952 | vattr->va_mask = XFS_AT_XFLAGS; | ||
| 953 | vattr->va_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip)); | ||
| 954 | 1254 | ||
| 955 | error = -xfs_setattr(ip, vattr, attr_flags, NULL); | 1255 | return -xfs_ioctl_setattr(ip, &fa, mask); |
| 956 | if (likely(!error)) | ||
| 957 | vn_revalidate(XFS_ITOV(ip)); /* update flags */ | ||
| 958 | kfree(vattr); | ||
| 959 | return error; | ||
| 960 | } | 1256 | } |
| 961 | 1257 | ||
| 962 | STATIC int | 1258 | STATIC int |
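xfs_ioctl_setattr(), added above, folds the old vattr-based path into a single worker driven by an FSX_* bit mask, so that xfs_ioc_fssetxattr() and xfs_ioc_setxflags() only differ in which bits they set. The core of that mask-driven idea, reduced to a hypothetical standalone example with DEMO_ names:

#include <linux/types.h>

/* Hypothetical DEMO_FSX_ bits standing in for the FSX_ mask above. */
#define DEMO_FSX_PROJID		(1u << 0)
#define DEMO_FSX_EXTSIZE	(1u << 1)
#define DEMO_FSX_XFLAGS		(1u << 2)

struct demo_attr {
	u32	projid;
	u32	extsize;
	u32	xflags;
};

/* One worker applies only the fields the caller's mask selects. */
static void demo_apply_attr(struct demo_attr *dst, const struct demo_attr *src,
			    unsigned int mask)
{
	if (mask & DEMO_FSX_PROJID)
		dst->projid = src->projid;
	if (mask & DEMO_FSX_EXTSIZE)
		dst->extsize = src->extsize;
	if (mask & DEMO_FSX_XFLAGS)
		dst->xflags = src->xflags;
}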
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 2bf287ef5489..095d271f3434 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c | |||
| @@ -62,7 +62,7 @@ void | |||
| 62 | xfs_synchronize_atime( | 62 | xfs_synchronize_atime( |
| 63 | xfs_inode_t *ip) | 63 | xfs_inode_t *ip) |
| 64 | { | 64 | { |
| 65 | struct inode *inode = ip->i_vnode; | 65 | struct inode *inode = VFS_I(ip); |
| 66 | 66 | ||
| 67 | if (inode) { | 67 | if (inode) { |
| 68 | ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; | 68 | ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; |
| @@ -79,7 +79,7 @@ void | |||
| 79 | xfs_mark_inode_dirty_sync( | 79 | xfs_mark_inode_dirty_sync( |
| 80 | xfs_inode_t *ip) | 80 | xfs_inode_t *ip) |
| 81 | { | 81 | { |
| 82 | struct inode *inode = ip->i_vnode; | 82 | struct inode *inode = VFS_I(ip); |
| 83 | 83 | ||
| 84 | if (inode) | 84 | if (inode) |
| 85 | mark_inode_dirty_sync(inode); | 85 | mark_inode_dirty_sync(inode); |
| @@ -89,36 +89,31 @@ xfs_mark_inode_dirty_sync( | |||
| 89 | * Change the requested timestamp in the given inode. | 89 | * Change the requested timestamp in the given inode. |
| 90 | * We don't lock across timestamp updates, and we don't log them but | 90 | * We don't lock across timestamp updates, and we don't log them but |
| 91 | * we do record the fact that there is dirty information in core. | 91 | * we do record the fact that there is dirty information in core. |
| 92 | * | ||
| 93 | * NOTE -- callers MUST combine XFS_ICHGTIME_MOD or XFS_ICHGTIME_CHG | ||
| 94 | * with XFS_ICHGTIME_ACC to be sure that access time | ||
| 95 | * update will take. Calling first with XFS_ICHGTIME_ACC | ||
| 96 | * and then XFS_ICHGTIME_MOD may fail to modify the access | ||
| 97 | * timestamp if the filesystem is mounted noacctm. | ||
| 98 | */ | 92 | */ |
| 99 | void | 93 | void |
| 100 | xfs_ichgtime( | 94 | xfs_ichgtime( |
| 101 | xfs_inode_t *ip, | 95 | xfs_inode_t *ip, |
| 102 | int flags) | 96 | int flags) |
| 103 | { | 97 | { |
| 104 | struct inode *inode = vn_to_inode(XFS_ITOV(ip)); | 98 | struct inode *inode = VFS_I(ip); |
| 105 | timespec_t tv; | 99 | timespec_t tv; |
| 100 | int sync_it = 0; | ||
| 101 | |||
| 102 | tv = current_fs_time(inode->i_sb); | ||
| 106 | 103 | ||
| 107 | nanotime(&tv); | 104 | if ((flags & XFS_ICHGTIME_MOD) && |
| 108 | if (flags & XFS_ICHGTIME_MOD) { | 105 | !timespec_equal(&inode->i_mtime, &tv)) { |
| 109 | inode->i_mtime = tv; | 106 | inode->i_mtime = tv; |
| 110 | ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; | 107 | ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; |
| 111 | ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; | 108 | ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; |
| 109 | sync_it = 1; | ||
| 112 | } | 110 | } |
| 113 | if (flags & XFS_ICHGTIME_ACC) { | 111 | if ((flags & XFS_ICHGTIME_CHG) && |
| 114 | inode->i_atime = tv; | 112 | !timespec_equal(&inode->i_ctime, &tv)) { |
| 115 | ip->i_d.di_atime.t_sec = (__int32_t)tv.tv_sec; | ||
| 116 | ip->i_d.di_atime.t_nsec = (__int32_t)tv.tv_nsec; | ||
| 117 | } | ||
| 118 | if (flags & XFS_ICHGTIME_CHG) { | ||
| 119 | inode->i_ctime = tv; | 113 | inode->i_ctime = tv; |
| 120 | ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec; | 114 | ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec; |
| 121 | ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec; | 115 | ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec; |
| 116 | sync_it = 1; | ||
| 122 | } | 117 | } |
| 123 | 118 | ||
| 124 | /* | 119 | /* |
| @@ -130,72 +125,11 @@ xfs_ichgtime( | |||
| 130 | * ensure that the compiler does not reorder the update | 125 | * ensure that the compiler does not reorder the update |
| 131 | * of i_update_core above the timestamp updates above. | 126 | * of i_update_core above the timestamp updates above. |
| 132 | */ | 127 | */ |
| 133 | SYNCHRONIZE(); | 128 | if (sync_it) { |
| 134 | ip->i_update_core = 1; | 129 | SYNCHRONIZE(); |
| 135 | if (!(inode->i_state & I_NEW)) | 130 | ip->i_update_core = 1; |
| 136 | mark_inode_dirty_sync(inode); | 131 | mark_inode_dirty_sync(inode); |
| 137 | } | ||
| 138 | |||
| 139 | /* | ||
| 140 | * Variant on the above which avoids querying the system clock | ||
| 141 | * in situations where we know the Linux inode timestamps have | ||
| 142 | * just been updated (and so we can update our inode cheaply). | ||
| 143 | */ | ||
| 144 | void | ||
| 145 | xfs_ichgtime_fast( | ||
| 146 | xfs_inode_t *ip, | ||
| 147 | struct inode *inode, | ||
| 148 | int flags) | ||
| 149 | { | ||
| 150 | timespec_t *tvp; | ||
| 151 | |||
| 152 | /* | ||
| 153 | * Atime updates for read() & friends are handled lazily now, and | ||
| 154 | * explicit updates must go through xfs_ichgtime() | ||
| 155 | */ | ||
| 156 | ASSERT((flags & XFS_ICHGTIME_ACC) == 0); | ||
| 157 | |||
| 158 | if (flags & XFS_ICHGTIME_MOD) { | ||
| 159 | tvp = &inode->i_mtime; | ||
| 160 | ip->i_d.di_mtime.t_sec = (__int32_t)tvp->tv_sec; | ||
| 161 | ip->i_d.di_mtime.t_nsec = (__int32_t)tvp->tv_nsec; | ||
| 162 | } | ||
| 163 | if (flags & XFS_ICHGTIME_CHG) { | ||
| 164 | tvp = &inode->i_ctime; | ||
| 165 | ip->i_d.di_ctime.t_sec = (__int32_t)tvp->tv_sec; | ||
| 166 | ip->i_d.di_ctime.t_nsec = (__int32_t)tvp->tv_nsec; | ||
| 167 | } | 132 | } |
| 168 | |||
| 169 | /* | ||
| 170 | * We update the i_update_core field _after_ changing | ||
| 171 | * the timestamps in order to coordinate properly with | ||
| 172 | * xfs_iflush() so that we don't lose timestamp updates. | ||
| 173 | * This keeps us from having to hold the inode lock | ||
| 174 | * while doing this. We use the SYNCHRONIZE macro to | ||
| 175 | * ensure that the compiler does not reorder the update | ||
| 176 | * of i_update_core above the timestamp updates above. | ||
| 177 | */ | ||
| 178 | SYNCHRONIZE(); | ||
| 179 | ip->i_update_core = 1; | ||
| 180 | if (!(inode->i_state & I_NEW)) | ||
| 181 | mark_inode_dirty_sync(inode); | ||
| 182 | } | ||
| 183 | |||
| 184 | |||
| 185 | /* | ||
| 186 | * Pull the link count and size up from the xfs inode to the linux inode | ||
| 187 | */ | ||
| 188 | STATIC void | ||
| 189 | xfs_validate_fields( | ||
| 190 | struct inode *inode) | ||
| 191 | { | ||
| 192 | struct xfs_inode *ip = XFS_I(inode); | ||
| 193 | loff_t size; | ||
| 194 | |||
| 195 | /* we're under i_sem so i_size can't change under us */ | ||
| 196 | size = XFS_ISIZE(ip); | ||
| 197 | if (i_size_read(inode) != size) | ||
| 198 | i_size_write(inode, size); | ||
| 199 | } | 133 | } |
| 200 | 134 | ||
| 201 | /* | 135 | /* |
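The rewritten xfs_ichgtime() above samples the time once at filesystem granularity and only dirties the inode when a requested timestamp actually changes. The same pattern in isolation, as a hypothetical helper (assuming the 2.6.2x-era current_fs_time() and timespec_equal() interfaces):

#include <linux/fs.h>
#include <linux/time.h>

/*
 * Hypothetical helper showing the pattern above: take the time once at
 * filesystem granularity, update only timestamps that are both requested
 * and actually different, and dirty the inode only if something changed.
 */
static void demo_touch_times(struct inode *inode, bool touch_mtime)
{
	struct timespec now = current_fs_time(inode->i_sb);
	int dirty = 0;

	if (touch_mtime && !timespec_equal(&inode->i_mtime, &now)) {
		inode->i_mtime = now;
		dirty = 1;
	}
	if (!timespec_equal(&inode->i_ctime, &now)) {
		inode->i_ctime = now;
		dirty = 1;
	}
	if (dirty)
		mark_inode_dirty_sync(inode);
}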
| @@ -245,8 +179,7 @@ STATIC void | |||
| 245 | xfs_cleanup_inode( | 179 | xfs_cleanup_inode( |
| 246 | struct inode *dir, | 180 | struct inode *dir, |
| 247 | struct inode *inode, | 181 | struct inode *inode, |
| 248 | struct dentry *dentry, | 182 | struct dentry *dentry) |
| 249 | int mode) | ||
| 250 | { | 183 | { |
| 251 | struct xfs_name teardown; | 184 | struct xfs_name teardown; |
| 252 | 185 | ||
| @@ -257,10 +190,7 @@ xfs_cleanup_inode( | |||
| 257 | */ | 190 | */ |
| 258 | xfs_dentry_to_name(&teardown, dentry); | 191 | xfs_dentry_to_name(&teardown, dentry); |
| 259 | 192 | ||
| 260 | if (S_ISDIR(mode)) | 193 | xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); |
| 261 | xfs_rmdir(XFS_I(dir), &teardown, XFS_I(inode)); | ||
| 262 | else | ||
| 263 | xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); | ||
| 264 | iput(inode); | 194 | iput(inode); |
| 265 | } | 195 | } |
| 266 | 196 | ||
| @@ -275,7 +205,7 @@ xfs_vn_mknod( | |||
| 275 | struct xfs_inode *ip = NULL; | 205 | struct xfs_inode *ip = NULL; |
| 276 | xfs_acl_t *default_acl = NULL; | 206 | xfs_acl_t *default_acl = NULL; |
| 277 | struct xfs_name name; | 207 | struct xfs_name name; |
| 278 | attrexists_t test_default_acl = _ACL_DEFAULT_EXISTS; | 208 | int (*test_default_acl)(struct inode *) = _ACL_DEFAULT_EXISTS; |
| 279 | int error; | 209 | int error; |
| 280 | 210 | ||
| 281 | /* | 211 | /* |
| @@ -320,7 +250,7 @@ xfs_vn_mknod( | |||
| 320 | if (unlikely(error)) | 250 | if (unlikely(error)) |
| 321 | goto out_free_acl; | 251 | goto out_free_acl; |
| 322 | 252 | ||
| 323 | inode = ip->i_vnode; | 253 | inode = VFS_I(ip); |
| 324 | 254 | ||
| 325 | error = xfs_init_security(inode, dir); | 255 | error = xfs_init_security(inode, dir); |
| 326 | if (unlikely(error)) | 256 | if (unlikely(error)) |
| @@ -335,14 +265,11 @@ xfs_vn_mknod( | |||
| 335 | } | 265 | } |
| 336 | 266 | ||
| 337 | 267 | ||
| 338 | if (S_ISDIR(mode)) | ||
| 339 | xfs_validate_fields(inode); | ||
| 340 | d_instantiate(dentry, inode); | 268 | d_instantiate(dentry, inode); |
| 341 | xfs_validate_fields(dir); | ||
| 342 | return -error; | 269 | return -error; |
| 343 | 270 | ||
| 344 | out_cleanup_inode: | 271 | out_cleanup_inode: |
| 345 | xfs_cleanup_inode(dir, inode, dentry, mode); | 272 | xfs_cleanup_inode(dir, inode, dentry); |
| 346 | out_free_acl: | 273 | out_free_acl: |
| 347 | if (default_acl) | 274 | if (default_acl) |
| 348 | _ACL_FREE(default_acl); | 275 | _ACL_FREE(default_acl); |
| @@ -382,7 +309,7 @@ xfs_vn_lookup( | |||
| 382 | return ERR_PTR(-ENAMETOOLONG); | 309 | return ERR_PTR(-ENAMETOOLONG); |
| 383 | 310 | ||
| 384 | xfs_dentry_to_name(&name, dentry); | 311 | xfs_dentry_to_name(&name, dentry); |
| 385 | error = xfs_lookup(XFS_I(dir), &name, &cip); | 312 | error = xfs_lookup(XFS_I(dir), &name, &cip, NULL); |
| 386 | if (unlikely(error)) { | 313 | if (unlikely(error)) { |
| 387 | if (unlikely(error != ENOENT)) | 314 | if (unlikely(error != ENOENT)) |
| 388 | return ERR_PTR(-error); | 315 | return ERR_PTR(-error); |
| @@ -390,7 +317,47 @@ xfs_vn_lookup( | |||
| 390 | return NULL; | 317 | return NULL; |
| 391 | } | 318 | } |
| 392 | 319 | ||
| 393 | return d_splice_alias(cip->i_vnode, dentry); | 320 | return d_splice_alias(VFS_I(cip), dentry); |
| 321 | } | ||
| 322 | |||
| 323 | STATIC struct dentry * | ||
| 324 | xfs_vn_ci_lookup( | ||
| 325 | struct inode *dir, | ||
| 326 | struct dentry *dentry, | ||
| 327 | struct nameidata *nd) | ||
| 328 | { | ||
| 329 | struct xfs_inode *ip; | ||
| 330 | struct xfs_name xname; | ||
| 331 | struct xfs_name ci_name; | ||
| 332 | struct qstr dname; | ||
| 333 | int error; | ||
| 334 | |||
| 335 | if (dentry->d_name.len >= MAXNAMELEN) | ||
| 336 | return ERR_PTR(-ENAMETOOLONG); | ||
| 337 | |||
| 338 | xfs_dentry_to_name(&xname, dentry); | ||
| 339 | error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name); | ||
| 340 | if (unlikely(error)) { | ||
| 341 | if (unlikely(error != ENOENT)) | ||
| 342 | return ERR_PTR(-error); | ||
| 343 | /* | ||
| 344 | * call d_add(dentry, NULL) here when d_drop_negative_children | ||
| 345 | * is called in xfs_vn_mknod (ie. allow negative dentries | ||
| 346 | * with CI filesystems). | ||
| 347 | */ | ||
| 348 | return NULL; | ||
| 349 | } | ||
| 350 | |||
| 351 | /* if exact match, just splice and exit */ | ||
| 352 | if (!ci_name.name) | ||
| 353 | return d_splice_alias(VFS_I(ip), dentry); | ||
| 354 | |||
| 355 | /* else case-insensitive match... */ | ||
| 356 | dname.name = ci_name.name; | ||
| 357 | dname.len = ci_name.len; | ||
| 358 | dentry = d_add_ci(dentry, VFS_I(ip), &dname); | ||
| 359 | kmem_free(ci_name.name); | ||
| 360 | return dentry; | ||
| 394 | } | 361 | } |
| 395 | 362 | ||
| 396 | STATIC int | 363 | STATIC int |
| @@ -414,7 +381,6 @@ xfs_vn_link( | |||
| 414 | } | 381 | } |
| 415 | 382 | ||
| 416 | xfs_iflags_set(XFS_I(dir), XFS_IMODIFIED); | 383 | xfs_iflags_set(XFS_I(dir), XFS_IMODIFIED); |
| 417 | xfs_validate_fields(inode); | ||
| 418 | d_instantiate(dentry, inode); | 384 | d_instantiate(dentry, inode); |
| 419 | return 0; | 385 | return 0; |
| 420 | } | 386 | } |
| @@ -424,19 +390,23 @@ xfs_vn_unlink( | |||
| 424 | struct inode *dir, | 390 | struct inode *dir, |
| 425 | struct dentry *dentry) | 391 | struct dentry *dentry) |
| 426 | { | 392 | { |
| 427 | struct inode *inode; | ||
| 428 | struct xfs_name name; | 393 | struct xfs_name name; |
| 429 | int error; | 394 | int error; |
| 430 | 395 | ||
| 431 | inode = dentry->d_inode; | ||
| 432 | xfs_dentry_to_name(&name, dentry); | 396 | xfs_dentry_to_name(&name, dentry); |
| 433 | 397 | ||
| 434 | error = xfs_remove(XFS_I(dir), &name, XFS_I(inode)); | 398 | error = -xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode)); |
| 435 | if (likely(!error)) { | 399 | if (error) |
| 436 | xfs_validate_fields(dir); /* size needs update */ | 400 | return error; |
| 437 | xfs_validate_fields(inode); | 401 | |
| 438 | } | 402 | /* |
| 439 | return -error; | 403 | * With unlink, the VFS makes the dentry "negative": no inode, |
| 404 | * but still hashed. This is incompatible with case-insensitive | ||
| 405 | * mode, so invalidate (unhash) the dentry in CI-mode. | ||
| 406 | */ | ||
| 407 | if (xfs_sb_version_hasasciici(&XFS_M(dir->i_sb)->m_sb)) | ||
| 408 | d_invalidate(dentry); | ||
| 409 | return 0; | ||
| 440 | } | 410 | } |
| 441 | 411 | ||
| 442 | STATIC int | 412 | STATIC int |
| @@ -459,43 +429,22 @@ xfs_vn_symlink( | |||
| 459 | if (unlikely(error)) | 429 | if (unlikely(error)) |
| 460 | goto out; | 430 | goto out; |
| 461 | 431 | ||
| 462 | inode = cip->i_vnode; | 432 | inode = VFS_I(cip); |
| 463 | 433 | ||
| 464 | error = xfs_init_security(inode, dir); | 434 | error = xfs_init_security(inode, dir); |
| 465 | if (unlikely(error)) | 435 | if (unlikely(error)) |
| 466 | goto out_cleanup_inode; | 436 | goto out_cleanup_inode; |
| 467 | 437 | ||
| 468 | d_instantiate(dentry, inode); | 438 | d_instantiate(dentry, inode); |
| 469 | xfs_validate_fields(dir); | ||
| 470 | xfs_validate_fields(inode); | ||
| 471 | return 0; | 439 | return 0; |
| 472 | 440 | ||
| 473 | out_cleanup_inode: | 441 | out_cleanup_inode: |
| 474 | xfs_cleanup_inode(dir, inode, dentry, 0); | 442 | xfs_cleanup_inode(dir, inode, dentry); |
| 475 | out: | 443 | out: |
| 476 | return -error; | 444 | return -error; |
| 477 | } | 445 | } |
| 478 | 446 | ||
| 479 | STATIC int | 447 | STATIC int |
| 480 | xfs_vn_rmdir( | ||
| 481 | struct inode *dir, | ||
| 482 | struct dentry *dentry) | ||
| 483 | { | ||
| 484 | struct inode *inode = dentry->d_inode; | ||
| 485 | struct xfs_name name; | ||
| 486 | int error; | ||
| 487 | |||
| 488 | xfs_dentry_to_name(&name, dentry); | ||
| 489 | |||
| 490 | error = xfs_rmdir(XFS_I(dir), &name, XFS_I(inode)); | ||
| 491 | if (likely(!error)) { | ||
| 492 | xfs_validate_fields(inode); | ||
| 493 | xfs_validate_fields(dir); | ||
| 494 | } | ||
| 495 | return -error; | ||
| 496 | } | ||
| 497 | |||
| 498 | STATIC int | ||
| 499 | xfs_vn_rename( | 448 | xfs_vn_rename( |
| 500 | struct inode *odir, | 449 | struct inode *odir, |
| 501 | struct dentry *odentry, | 450 | struct dentry *odentry, |
| @@ -505,22 +454,13 @@ xfs_vn_rename( | |||
| 505 | struct inode *new_inode = ndentry->d_inode; | 454 | struct inode *new_inode = ndentry->d_inode; |
| 506 | struct xfs_name oname; | 455 | struct xfs_name oname; |
| 507 | struct xfs_name nname; | 456 | struct xfs_name nname; |
| 508 | int error; | ||
| 509 | 457 | ||
| 510 | xfs_dentry_to_name(&oname, odentry); | 458 | xfs_dentry_to_name(&oname, odentry); |
| 511 | xfs_dentry_to_name(&nname, ndentry); | 459 | xfs_dentry_to_name(&nname, ndentry); |
| 512 | 460 | ||
| 513 | error = xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode), | 461 | return -xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode), |
| 514 | XFS_I(ndir), &nname, new_inode ? | 462 | XFS_I(ndir), &nname, new_inode ? |
| 515 | XFS_I(new_inode) : NULL); | 463 | XFS_I(new_inode) : NULL); |
| 516 | if (likely(!error)) { | ||
| 517 | if (new_inode) | ||
| 518 | xfs_validate_fields(new_inode); | ||
| 519 | xfs_validate_fields(odir); | ||
| 520 | if (ndir != odir) | ||
| 521 | xfs_validate_fields(ndir); | ||
| 522 | } | ||
| 523 | return -error; | ||
| 524 | } | 464 | } |
| 525 | 465 | ||
| 526 | /* | 466 | /* |
| @@ -589,8 +529,7 @@ xfs_check_acl( | |||
| 589 | STATIC int | 529 | STATIC int |
| 590 | xfs_vn_permission( | 530 | xfs_vn_permission( |
| 591 | struct inode *inode, | 531 | struct inode *inode, |
| 592 | int mask, | 532 | int mask) |
| 593 | struct nameidata *nd) | ||
| 594 | { | 533 | { |
| 595 | return generic_permission(inode, mask, xfs_check_acl); | 534 | return generic_permission(inode, mask, xfs_check_acl); |
| 596 | } | 535 | } |
| @@ -660,57 +599,9 @@ xfs_vn_getattr( | |||
| 660 | STATIC int | 599 | STATIC int |
| 661 | xfs_vn_setattr( | 600 | xfs_vn_setattr( |
| 662 | struct dentry *dentry, | 601 | struct dentry *dentry, |
| 663 | struct iattr *attr) | 602 | struct iattr *iattr) |
| 664 | { | 603 | { |
| 665 | struct inode *inode = dentry->d_inode; | 604 | return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0, NULL); |
| 666 | unsigned int ia_valid = attr->ia_valid; | ||
| 667 | bhv_vattr_t vattr = { 0 }; | ||
| 668 | int flags = 0; | ||
| 669 | int error; | ||
| 670 | |||
| 671 | if (ia_valid & ATTR_UID) { | ||
| 672 | vattr.va_mask |= XFS_AT_UID; | ||
| 673 | vattr.va_uid = attr->ia_uid; | ||
| 674 | } | ||
| 675 | if (ia_valid & ATTR_GID) { | ||
| 676 | vattr.va_mask |= XFS_AT_GID; | ||
| 677 | vattr.va_gid = attr->ia_gid; | ||
| 678 | } | ||
| 679 | if (ia_valid & ATTR_SIZE) { | ||
| 680 | vattr.va_mask |= XFS_AT_SIZE; | ||
| 681 | vattr.va_size = attr->ia_size; | ||
| 682 | } | ||
| 683 | if (ia_valid & ATTR_ATIME) { | ||
| 684 | vattr.va_mask |= XFS_AT_ATIME; | ||
| 685 | vattr.va_atime = attr->ia_atime; | ||
| 686 | inode->i_atime = attr->ia_atime; | ||
| 687 | } | ||
| 688 | if (ia_valid & ATTR_MTIME) { | ||
| 689 | vattr.va_mask |= XFS_AT_MTIME; | ||
| 690 | vattr.va_mtime = attr->ia_mtime; | ||
| 691 | } | ||
| 692 | if (ia_valid & ATTR_CTIME) { | ||
| 693 | vattr.va_mask |= XFS_AT_CTIME; | ||
| 694 | vattr.va_ctime = attr->ia_ctime; | ||
| 695 | } | ||
| 696 | if (ia_valid & ATTR_MODE) { | ||
| 697 | vattr.va_mask |= XFS_AT_MODE; | ||
| 698 | vattr.va_mode = attr->ia_mode; | ||
| 699 | if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) | ||
| 700 | inode->i_mode &= ~S_ISGID; | ||
| 701 | } | ||
| 702 | |||
| 703 | if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET)) | ||
| 704 | flags |= ATTR_UTIME; | ||
| 705 | #ifdef ATTR_NO_BLOCK | ||
| 706 | if ((ia_valid & ATTR_NO_BLOCK)) | ||
| 707 | flags |= ATTR_NONBLOCK; | ||
| 708 | #endif | ||
| 709 | |||
| 710 | error = xfs_setattr(XFS_I(inode), &vattr, flags, NULL); | ||
| 711 | if (likely(!error)) | ||
| 712 | vn_revalidate(vn_from_inode(inode)); | ||
| 713 | return -error; | ||
| 714 | } | 605 | } |
| 715 | 606 | ||
| 716 | /* | 607 | /* |
| @@ -728,109 +619,6 @@ xfs_vn_truncate( | |||
| 728 | WARN_ON(error); | 619 | WARN_ON(error); |
| 729 | } | 620 | } |
| 730 | 621 | ||
| 731 | STATIC int | ||
| 732 | xfs_vn_setxattr( | ||
| 733 | struct dentry *dentry, | ||
| 734 | const char *name, | ||
| 735 | const void *data, | ||
| 736 | size_t size, | ||
| 737 | int flags) | ||
| 738 | { | ||
| 739 | bhv_vnode_t *vp = vn_from_inode(dentry->d_inode); | ||
| 740 | char *attr = (char *)name; | ||
| 741 | attrnames_t *namesp; | ||
| 742 | int xflags = 0; | ||
| 743 | int error; | ||
| 744 | |||
| 745 | namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT); | ||
| 746 | if (!namesp) | ||
| 747 | return -EOPNOTSUPP; | ||
| 748 | attr += namesp->attr_namelen; | ||
| 749 | error = namesp->attr_capable(vp, NULL); | ||
| 750 | if (error) | ||
| 751 | return error; | ||
| 752 | |||
| 753 | /* Convert Linux syscall to XFS internal ATTR flags */ | ||
| 754 | if (flags & XATTR_CREATE) | ||
| 755 | xflags |= ATTR_CREATE; | ||
| 756 | if (flags & XATTR_REPLACE) | ||
| 757 | xflags |= ATTR_REPLACE; | ||
| 758 | xflags |= namesp->attr_flag; | ||
| 759 | return namesp->attr_set(vp, attr, (void *)data, size, xflags); | ||
| 760 | } | ||
| 761 | |||
| 762 | STATIC ssize_t | ||
| 763 | xfs_vn_getxattr( | ||
| 764 | struct dentry *dentry, | ||
| 765 | const char *name, | ||
| 766 | void *data, | ||
| 767 | size_t size) | ||
| 768 | { | ||
| 769 | bhv_vnode_t *vp = vn_from_inode(dentry->d_inode); | ||
| 770 | char *attr = (char *)name; | ||
| 771 | attrnames_t *namesp; | ||
| 772 | int xflags = 0; | ||
| 773 | ssize_t error; | ||
| 774 | |||
| 775 | namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT); | ||
| 776 | if (!namesp) | ||
| 777 | return -EOPNOTSUPP; | ||
| 778 | attr += namesp->attr_namelen; | ||
| 779 | error = namesp->attr_capable(vp, NULL); | ||
| 780 | if (error) | ||
| 781 | return error; | ||
| 782 | |||
| 783 | /* Convert Linux syscall to XFS internal ATTR flags */ | ||
| 784 | if (!size) { | ||
| 785 | xflags |= ATTR_KERNOVAL; | ||
| 786 | data = NULL; | ||
| 787 | } | ||
| 788 | xflags |= namesp->attr_flag; | ||
| 789 | return namesp->attr_get(vp, attr, (void *)data, size, xflags); | ||
| 790 | } | ||
| 791 | |||
| 792 | STATIC ssize_t | ||
| 793 | xfs_vn_listxattr( | ||
| 794 | struct dentry *dentry, | ||
| 795 | char *data, | ||
| 796 | size_t size) | ||
| 797 | { | ||
| 798 | bhv_vnode_t *vp = vn_from_inode(dentry->d_inode); | ||
| 799 | int error, xflags = ATTR_KERNAMELS; | ||
| 800 | ssize_t result; | ||
| 801 | |||
| 802 | if (!size) | ||
| 803 | xflags |= ATTR_KERNOVAL; | ||
| 804 | xflags |= capable(CAP_SYS_ADMIN) ? ATTR_KERNFULLS : ATTR_KERNORMALS; | ||
| 805 | |||
| 806 | error = attr_generic_list(vp, data, size, xflags, &result); | ||
| 807 | if (error < 0) | ||
| 808 | return error; | ||
| 809 | return result; | ||
| 810 | } | ||
| 811 | |||
| 812 | STATIC int | ||
| 813 | xfs_vn_removexattr( | ||
| 814 | struct dentry *dentry, | ||
| 815 | const char *name) | ||
| 816 | { | ||
| 817 | bhv_vnode_t *vp = vn_from_inode(dentry->d_inode); | ||
| 818 | char *attr = (char *)name; | ||
| 819 | attrnames_t *namesp; | ||
| 820 | int xflags = 0; | ||
| 821 | int error; | ||
| 822 | |||
| 823 | namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT); | ||
| 824 | if (!namesp) | ||
| 825 | return -EOPNOTSUPP; | ||
| 826 | attr += namesp->attr_namelen; | ||
| 827 | error = namesp->attr_capable(vp, NULL); | ||
| 828 | if (error) | ||
| 829 | return error; | ||
| 830 | xflags |= namesp->attr_flag; | ||
| 831 | return namesp->attr_remove(vp, attr, xflags); | ||
| 832 | } | ||
| 833 | |||
| 834 | STATIC long | 622 | STATIC long |
| 835 | xfs_vn_fallocate( | 623 | xfs_vn_fallocate( |
| 836 | struct inode *inode, | 624 | struct inode *inode, |
| @@ -854,18 +642,18 @@ xfs_vn_fallocate( | |||
| 854 | 642 | ||
| 855 | xfs_ilock(ip, XFS_IOLOCK_EXCL); | 643 | xfs_ilock(ip, XFS_IOLOCK_EXCL); |
| 856 | error = xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf, | 644 | error = xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf, |
| 857 | 0, NULL, ATTR_NOLOCK); | 645 | 0, NULL, XFS_ATTR_NOLOCK); |
| 858 | if (!error && !(mode & FALLOC_FL_KEEP_SIZE) && | 646 | if (!error && !(mode & FALLOC_FL_KEEP_SIZE) && |
| 859 | offset + len > i_size_read(inode)) | 647 | offset + len > i_size_read(inode)) |
| 860 | new_size = offset + len; | 648 | new_size = offset + len; |
| 861 | 649 | ||
| 862 | /* Change file size if needed */ | 650 | /* Change file size if needed */ |
| 863 | if (new_size) { | 651 | if (new_size) { |
| 864 | bhv_vattr_t va; | 652 | struct iattr iattr; |
| 865 | 653 | ||
| 866 | va.va_mask = XFS_AT_SIZE; | 654 | iattr.ia_valid = ATTR_SIZE; |
| 867 | va.va_size = new_size; | 655 | iattr.ia_size = new_size; |
| 868 | error = xfs_setattr(ip, &va, ATTR_NOLOCK, NULL); | 656 | error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK, NULL); |
| 869 | } | 657 | } |
| 870 | 658 | ||
| 871 | xfs_iunlock(ip, XFS_IOLOCK_EXCL); | 659 | xfs_iunlock(ip, XFS_IOLOCK_EXCL); |
| @@ -873,46 +661,172 @@ out_error: | |||
| 873 | return error; | 661 | return error; |
| 874 | } | 662 | } |
| 875 | 663 | ||
| 876 | const struct inode_operations xfs_inode_operations = { | 664 | static const struct inode_operations xfs_inode_operations = { |
| 877 | .permission = xfs_vn_permission, | 665 | .permission = xfs_vn_permission, |
| 878 | .truncate = xfs_vn_truncate, | 666 | .truncate = xfs_vn_truncate, |
| 879 | .getattr = xfs_vn_getattr, | 667 | .getattr = xfs_vn_getattr, |
| 880 | .setattr = xfs_vn_setattr, | 668 | .setattr = xfs_vn_setattr, |
| 881 | .setxattr = xfs_vn_setxattr, | 669 | .setxattr = generic_setxattr, |
| 882 | .getxattr = xfs_vn_getxattr, | 670 | .getxattr = generic_getxattr, |
| 671 | .removexattr = generic_removexattr, | ||
| 883 | .listxattr = xfs_vn_listxattr, | 672 | .listxattr = xfs_vn_listxattr, |
| 884 | .removexattr = xfs_vn_removexattr, | ||
| 885 | .fallocate = xfs_vn_fallocate, | 673 | .fallocate = xfs_vn_fallocate, |
| 886 | }; | 674 | }; |
| 887 | 675 | ||
| 888 | const struct inode_operations xfs_dir_inode_operations = { | 676 | static const struct inode_operations xfs_dir_inode_operations = { |
| 889 | .create = xfs_vn_create, | 677 | .create = xfs_vn_create, |
| 890 | .lookup = xfs_vn_lookup, | 678 | .lookup = xfs_vn_lookup, |
| 891 | .link = xfs_vn_link, | 679 | .link = xfs_vn_link, |
| 892 | .unlink = xfs_vn_unlink, | 680 | .unlink = xfs_vn_unlink, |
| 893 | .symlink = xfs_vn_symlink, | 681 | .symlink = xfs_vn_symlink, |
| 894 | .mkdir = xfs_vn_mkdir, | 682 | .mkdir = xfs_vn_mkdir, |
| 895 | .rmdir = xfs_vn_rmdir, | 683 | /* |
| 684 | * Yes, XFS uses the same method for rmdir and unlink. | ||
| 685 | * | ||
| 686 | * There are some subtile differences deeper in the code, | ||
| 687 | * but we use S_ISDIR to check for those. | ||
| 688 | */ | ||
| 689 | .rmdir = xfs_vn_unlink, | ||
| 896 | .mknod = xfs_vn_mknod, | 690 | .mknod = xfs_vn_mknod, |
| 897 | .rename = xfs_vn_rename, | 691 | .rename = xfs_vn_rename, |
| 898 | .permission = xfs_vn_permission, | 692 | .permission = xfs_vn_permission, |
| 899 | .getattr = xfs_vn_getattr, | 693 | .getattr = xfs_vn_getattr, |
| 900 | .setattr = xfs_vn_setattr, | 694 | .setattr = xfs_vn_setattr, |
| 901 | .setxattr = xfs_vn_setxattr, | 695 | .setxattr = generic_setxattr, |
| 902 | .getxattr = xfs_vn_getxattr, | 696 | .getxattr = generic_getxattr, |
| 697 | .removexattr = generic_removexattr, | ||
| 903 | .listxattr = xfs_vn_listxattr, | 698 | .listxattr = xfs_vn_listxattr, |
| 904 | .removexattr = xfs_vn_removexattr, | ||
| 905 | }; | 699 | }; |
| 906 | 700 | ||
| 907 | const struct inode_operations xfs_symlink_inode_operations = { | 701 | static const struct inode_operations xfs_dir_ci_inode_operations = { |
| 702 | .create = xfs_vn_create, | ||
| 703 | .lookup = xfs_vn_ci_lookup, | ||
| 704 | .link = xfs_vn_link, | ||
| 705 | .unlink = xfs_vn_unlink, | ||
| 706 | .symlink = xfs_vn_symlink, | ||
| 707 | .mkdir = xfs_vn_mkdir, | ||
| 708 | /* | ||
| 709 | * Yes, XFS uses the same method for rmdir and unlink. | ||
| 710 | * | ||
| 711 | * There are some subtle differences deeper in the code, | ||
| 712 | * but we use S_ISDIR to check for those. | ||
| 713 | */ | ||
| 714 | .rmdir = xfs_vn_unlink, | ||
| 715 | .mknod = xfs_vn_mknod, | ||
| 716 | .rename = xfs_vn_rename, | ||
| 717 | .permission = xfs_vn_permission, | ||
| 718 | .getattr = xfs_vn_getattr, | ||
| 719 | .setattr = xfs_vn_setattr, | ||
| 720 | .setxattr = generic_setxattr, | ||
| 721 | .getxattr = generic_getxattr, | ||
| 722 | .removexattr = generic_removexattr, | ||
| 723 | .listxattr = xfs_vn_listxattr, | ||
| 724 | }; | ||
| 725 | |||
| 726 | static const struct inode_operations xfs_symlink_inode_operations = { | ||
| 908 | .readlink = generic_readlink, | 727 | .readlink = generic_readlink, |
| 909 | .follow_link = xfs_vn_follow_link, | 728 | .follow_link = xfs_vn_follow_link, |
| 910 | .put_link = xfs_vn_put_link, | 729 | .put_link = xfs_vn_put_link, |
| 911 | .permission = xfs_vn_permission, | 730 | .permission = xfs_vn_permission, |
| 912 | .getattr = xfs_vn_getattr, | 731 | .getattr = xfs_vn_getattr, |
| 913 | .setattr = xfs_vn_setattr, | 732 | .setattr = xfs_vn_setattr, |
| 914 | .setxattr = xfs_vn_setxattr, | 733 | .setxattr = generic_setxattr, |
| 915 | .getxattr = xfs_vn_getxattr, | 734 | .getxattr = generic_getxattr, |
| 735 | .removexattr = generic_removexattr, | ||
| 916 | .listxattr = xfs_vn_listxattr, | 736 | .listxattr = xfs_vn_listxattr, |
| 917 | .removexattr = xfs_vn_removexattr, | ||
| 918 | }; | 737 | }; |
| 738 | |||
| 739 | STATIC void | ||
| 740 | xfs_diflags_to_iflags( | ||
| 741 | struct inode *inode, | ||
| 742 | struct xfs_inode *ip) | ||
| 743 | { | ||
| 744 | if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) | ||
| 745 | inode->i_flags |= S_IMMUTABLE; | ||
| 746 | else | ||
| 747 | inode->i_flags &= ~S_IMMUTABLE; | ||
| 748 | if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) | ||
| 749 | inode->i_flags |= S_APPEND; | ||
| 750 | else | ||
| 751 | inode->i_flags &= ~S_APPEND; | ||
| 752 | if (ip->i_d.di_flags & XFS_DIFLAG_SYNC) | ||
| 753 | inode->i_flags |= S_SYNC; | ||
| 754 | else | ||
| 755 | inode->i_flags &= ~S_SYNC; | ||
| 756 | if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME) | ||
| 757 | inode->i_flags |= S_NOATIME; | ||
| 758 | else | ||
| 759 | inode->i_flags &= ~S_NOATIME; | ||
| 760 | } | ||
| 761 | |||
| 762 | /* | ||
| 763 | * Initialize the Linux inode, set up the operation vectors and | ||
| 764 | * unlock the inode. | ||
| 765 | * | ||
| 766 | * When reading existing inodes from disk this is called directly | ||
| 767 | * from xfs_iget, when creating a new inode it is called from | ||
| 768 | * xfs_ialloc after setting up the inode. | ||
| 769 | */ | ||
| 770 | void | ||
| 771 | xfs_setup_inode( | ||
| 772 | struct xfs_inode *ip) | ||
| 773 | { | ||
| 774 | struct inode *inode = ip->i_vnode; | ||
| 775 | |||
| 776 | inode->i_mode = ip->i_d.di_mode; | ||
| 777 | inode->i_nlink = ip->i_d.di_nlink; | ||
| 778 | inode->i_uid = ip->i_d.di_uid; | ||
| 779 | inode->i_gid = ip->i_d.di_gid; | ||
| 780 | |||
| 781 | switch (inode->i_mode & S_IFMT) { | ||
| 782 | case S_IFBLK: | ||
| 783 | case S_IFCHR: | ||
| 784 | inode->i_rdev = | ||
| 785 | MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff, | ||
| 786 | sysv_minor(ip->i_df.if_u2.if_rdev)); | ||
| 787 | break; | ||
| 788 | default: | ||
| 789 | inode->i_rdev = 0; | ||
| 790 | break; | ||
| 791 | } | ||
| 792 | |||
| 793 | inode->i_generation = ip->i_d.di_gen; | ||
| 794 | i_size_write(inode, ip->i_d.di_size); | ||
| 795 | inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec; | ||
| 796 | inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec; | ||
| 797 | inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec; | ||
| 798 | inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; | ||
| 799 | inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec; | ||
| 800 | inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; | ||
| 801 | xfs_diflags_to_iflags(inode, ip); | ||
| 802 | xfs_iflags_clear(ip, XFS_IMODIFIED); | ||
| 803 | |||
| 804 | switch (inode->i_mode & S_IFMT) { | ||
| 805 | case S_IFREG: | ||
| 806 | inode->i_op = &xfs_inode_operations; | ||
| 807 | inode->i_fop = &xfs_file_operations; | ||
| 808 | inode->i_mapping->a_ops = &xfs_address_space_operations; | ||
| 809 | break; | ||
| 810 | case S_IFDIR: | ||
| 811 | if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) | ||
| 812 | inode->i_op = &xfs_dir_ci_inode_operations; | ||
| 813 | else | ||
| 814 | inode->i_op = &xfs_dir_inode_operations; | ||
| 815 | inode->i_fop = &xfs_dir_file_operations; | ||
| 816 | break; | ||
| 817 | case S_IFLNK: | ||
| 818 | inode->i_op = &xfs_symlink_inode_operations; | ||
| 819 | if (!(ip->i_df.if_flags & XFS_IFINLINE)) | ||
| 820 | inode->i_mapping->a_ops = &xfs_address_space_operations; | ||
| 821 | break; | ||
| 822 | default: | ||
| 823 | inode->i_op = &xfs_inode_operations; | ||
| 824 | init_special_inode(inode, inode->i_mode, inode->i_rdev); | ||
| 825 | break; | ||
| 826 | } | ||
| 827 | |||
| 828 | xfs_iflags_clear(ip, XFS_INEW); | ||
| 829 | barrier(); | ||
| 830 | |||
| 831 | unlock_new_inode(inode); | ||
| 832 | } | ||
diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/linux-2.6/xfs_iops.h index 14d0deb7afff..8b1a1e31dc21 100644 --- a/fs/xfs/linux-2.6/xfs_iops.h +++ b/fs/xfs/linux-2.6/xfs_iops.h | |||
| @@ -18,23 +18,14 @@ | |||
| 18 | #ifndef __XFS_IOPS_H__ | 18 | #ifndef __XFS_IOPS_H__ |
| 19 | #define __XFS_IOPS_H__ | 19 | #define __XFS_IOPS_H__ |
| 20 | 20 | ||
| 21 | extern const struct inode_operations xfs_inode_operations; | 21 | struct xfs_inode; |
| 22 | extern const struct inode_operations xfs_dir_inode_operations; | ||
| 23 | extern const struct inode_operations xfs_symlink_inode_operations; | ||
| 24 | 22 | ||
| 25 | extern const struct file_operations xfs_file_operations; | 23 | extern const struct file_operations xfs_file_operations; |
| 26 | extern const struct file_operations xfs_dir_file_operations; | 24 | extern const struct file_operations xfs_dir_file_operations; |
| 27 | extern const struct file_operations xfs_invis_file_operations; | 25 | extern const struct file_operations xfs_invis_file_operations; |
| 28 | 26 | ||
| 27 | extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); | ||
| 29 | 28 | ||
| 30 | struct xfs_inode; | 29 | extern void xfs_setup_inode(struct xfs_inode *); |
| 31 | extern void xfs_ichgtime(struct xfs_inode *, int); | ||
| 32 | extern void xfs_ichgtime_fast(struct xfs_inode *, struct inode *, int); | ||
| 33 | |||
| 34 | #define xfs_vtoi(vp) \ | ||
| 35 | ((struct xfs_inode *)vn_to_inode(vp)->i_private) | ||
| 36 | |||
| 37 | #define XFS_I(inode) \ | ||
| 38 | ((struct xfs_inode *)(inode)->i_private) | ||
| 39 | 30 | ||
| 40 | #endif /* __XFS_IOPS_H__ */ | 31 | #endif /* __XFS_IOPS_H__ */ |
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 4edc46915b57..cc0f7b3a9795 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h | |||
| @@ -45,13 +45,13 @@ | |||
| 45 | #include <mrlock.h> | 45 | #include <mrlock.h> |
| 46 | #include <sv.h> | 46 | #include <sv.h> |
| 47 | #include <mutex.h> | 47 | #include <mutex.h> |
| 48 | #include <sema.h> | ||
| 49 | #include <time.h> | 48 | #include <time.h> |
| 50 | 49 | ||
| 51 | #include <support/ktrace.h> | 50 | #include <support/ktrace.h> |
| 52 | #include <support/debug.h> | 51 | #include <support/debug.h> |
| 53 | #include <support/uuid.h> | 52 | #include <support/uuid.h> |
| 54 | 53 | ||
| 54 | #include <linux/semaphore.h> | ||
| 55 | #include <linux/mm.h> | 55 | #include <linux/mm.h> |
| 56 | #include <linux/kernel.h> | 56 | #include <linux/kernel.h> |
| 57 | #include <linux/blkdev.h> | 57 | #include <linux/blkdev.h> |
| @@ -76,6 +76,7 @@ | |||
| 76 | #include <linux/log2.h> | 76 | #include <linux/log2.h> |
| 77 | #include <linux/spinlock.h> | 77 | #include <linux/spinlock.h> |
| 78 | #include <linux/random.h> | 78 | #include <linux/random.h> |
| 79 | #include <linux/ctype.h> | ||
| 79 | 80 | ||
| 80 | #include <asm/page.h> | 81 | #include <asm/page.h> |
| 81 | #include <asm/div64.h> | 82 | #include <asm/div64.h> |
| @@ -125,8 +126,6 @@ | |||
| 125 | 126 | ||
| 126 | #define current_cpu() (raw_smp_processor_id()) | 127 | #define current_cpu() (raw_smp_processor_id()) |
| 127 | #define current_pid() (current->pid) | 128 | #define current_pid() (current->pid) |
| 128 | #define current_fsuid(cred) (current->fsuid) | ||
| 129 | #define current_fsgid(cred) (current->fsgid) | ||
| 130 | #define current_test_flags(f) (current->flags & (f)) | 129 | #define current_test_flags(f) (current->flags & (f)) |
| 131 | #define current_set_flags_nested(sp, f) \ | 130 | #define current_set_flags_nested(sp, f) \ |
| 132 | (*(sp) = current->flags, current->flags |= (f)) | 131 | (*(sp) = current->flags, current->flags |= (f)) |
| @@ -179,7 +178,7 @@ | |||
| 179 | #define xfs_sort(a,n,s,fn) sort(a,n,s,fn,NULL) | 178 | #define xfs_sort(a,n,s,fn) sort(a,n,s,fn,NULL) |
| 180 | #define xfs_stack_trace() dump_stack() | 179 | #define xfs_stack_trace() dump_stack() |
| 181 | #define xfs_itruncate_data(ip, off) \ | 180 | #define xfs_itruncate_data(ip, off) \ |
| 182 | (-vmtruncate(vn_to_inode(XFS_ITOV(ip)), (off))) | 181 | (-vmtruncate(VFS_I(ip), (off))) |
| 183 | 182 | ||
| 184 | 183 | ||
| 185 | /* Move the kernel do_div definition off to one side */ | 184 | /* Move the kernel do_div definition off to one side */ |
| @@ -299,4 +298,11 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) | |||
| 299 | return x; | 298 | return x; |
| 300 | } | 299 | } |
| 301 | 300 | ||
| 301 | /* ARM old ABI has some weird alignment/padding */ | ||
| 302 | #if defined(__arm__) && !defined(__ARM_EABI__) | ||
| 303 | #define __arch_pack __attribute__((packed)) | ||
| 304 | #else | ||
| 305 | #define __arch_pack | ||
| 306 | #endif | ||
| 307 | |||
| 302 | #endif /* __XFS_LINUX__ */ | 308 | #endif /* __XFS_LINUX__ */ |
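The __arch_pack macro added above expands to __attribute__((packed)) only on old-ABI ARM, whose structure alignment and padding differ from other architectures. A sketch of how such a macro is typically applied to an on-disk layout; the structure and field names below are invented for illustration and are not taken from XFS:

	/* Illustrative only: make old-ABI ARM match the on-disk layout used elsewhere. */
	typedef struct example_disk_hdr {
		__uint32_t	magic;
		__uint16_t	version;
		__uint16_t	count;
		__uint64_t	blkno;
	} __arch_pack example_disk_hdr_t;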
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 5e3b57516ec7..1957e5357d04 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c | |||
| @@ -137,7 +137,7 @@ xfs_iozero( | |||
| 137 | struct address_space *mapping; | 137 | struct address_space *mapping; |
| 138 | int status; | 138 | int status; |
| 139 | 139 | ||
| 140 | mapping = ip->i_vnode->i_mapping; | 140 | mapping = VFS_I(ip)->i_mapping; |
| 141 | do { | 141 | do { |
| 142 | unsigned offset, bytes; | 142 | unsigned offset, bytes; |
| 143 | void *fsdata; | 143 | void *fsdata; |
| @@ -674,9 +674,7 @@ start: | |||
| 674 | */ | 674 | */ |
| 675 | if (likely(!(ioflags & IO_INVIS) && | 675 | if (likely(!(ioflags & IO_INVIS) && |
| 676 | !mnt_want_write(file->f_path.mnt))) { | 676 | !mnt_want_write(file->f_path.mnt))) { |
| 677 | file_update_time(file); | 677 | xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); |
| 678 | xfs_ichgtime_fast(xip, inode, | ||
| 679 | XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); | ||
| 680 | mnt_drop_write(file->f_path.mnt); | 678 | mnt_drop_write(file->f_path.mnt); |
| 681 | } | 679 | } |
| 682 | 680 | ||
| @@ -711,7 +709,7 @@ start: | |||
| 711 | !capable(CAP_FSETID)) { | 709 | !capable(CAP_FSETID)) { |
| 712 | error = xfs_write_clear_setuid(xip); | 710 | error = xfs_write_clear_setuid(xip); |
| 713 | if (likely(!error)) | 711 | if (likely(!error)) |
| 714 | error = -remove_suid(file->f_path.dentry); | 712 | error = -file_remove_suid(file); |
| 715 | if (unlikely(error)) { | 713 | if (unlikely(error)) { |
| 716 | goto out_unlock_internal; | 714 | goto out_unlock_internal; |
| 717 | } | 715 | } |
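Several hunks in this series replace open-coded ip->i_vnode and vn_to_inode(XFS_ITOV(ip)) with the VFS_I() helper. Its definition is not part of this section; assuming it follows the obvious pattern for an xfs_inode that carries a pointer to its Linux inode, it would look roughly like this (a sketch, not the actual definition):

	/* Sketch only: map an XFS incore inode to its Linux VFS inode. */
	static inline struct inode *VFS_I(struct xfs_inode *ip)
	{
		return ip->i_vnode;
	}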
diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c index e480b6102051..3d5b67c075c7 100644 --- a/fs/xfs/linux-2.6/xfs_stats.c +++ b/fs/xfs/linux-2.6/xfs_stats.c | |||
| @@ -98,12 +98,21 @@ xfs_read_xfsstats( | |||
| 98 | return len; | 98 | return len; |
| 99 | } | 99 | } |
| 100 | 100 | ||
| 101 | void | 101 | int |
| 102 | xfs_init_procfs(void) | 102 | xfs_init_procfs(void) |
| 103 | { | 103 | { |
| 104 | if (!proc_mkdir("fs/xfs", NULL)) | 104 | if (!proc_mkdir("fs/xfs", NULL)) |
| 105 | return; | 105 | goto out; |
| 106 | create_proc_read_entry("fs/xfs/stat", 0, NULL, xfs_read_xfsstats, NULL); | 106 | |
| 107 | if (!create_proc_read_entry("fs/xfs/stat", 0, NULL, | ||
| 108 | xfs_read_xfsstats, NULL)) | ||
| 109 | goto out_remove_entry; | ||
| 110 | return 0; | ||
| 111 | |||
| 112 | out_remove_entry: | ||
| 113 | remove_proc_entry("fs/xfs", NULL); | ||
| 114 | out: | ||
| 115 | return -ENOMEM; | ||
| 107 | } | 116 | } |
| 108 | 117 | ||
| 109 | void | 118 | void |
diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/linux-2.6/xfs_stats.h index afd0b0d5fdb2..e83820febc9f 100644 --- a/fs/xfs/linux-2.6/xfs_stats.h +++ b/fs/xfs/linux-2.6/xfs_stats.h | |||
| @@ -134,7 +134,7 @@ DECLARE_PER_CPU(struct xfsstats, xfsstats); | |||
| 134 | #define XFS_STATS_DEC(v) (per_cpu(xfsstats, current_cpu()).v--) | 134 | #define XFS_STATS_DEC(v) (per_cpu(xfsstats, current_cpu()).v--) |
| 135 | #define XFS_STATS_ADD(v, inc) (per_cpu(xfsstats, current_cpu()).v += (inc)) | 135 | #define XFS_STATS_ADD(v, inc) (per_cpu(xfsstats, current_cpu()).v += (inc)) |
| 136 | 136 | ||
| 137 | extern void xfs_init_procfs(void); | 137 | extern int xfs_init_procfs(void); |
| 138 | extern void xfs_cleanup_procfs(void); | 138 | extern void xfs_cleanup_procfs(void); |
| 139 | 139 | ||
| 140 | 140 | ||
| @@ -144,8 +144,14 @@ extern void xfs_cleanup_procfs(void); | |||
| 144 | # define XFS_STATS_DEC(count) | 144 | # define XFS_STATS_DEC(count) |
| 145 | # define XFS_STATS_ADD(count, inc) | 145 | # define XFS_STATS_ADD(count, inc) |
| 146 | 146 | ||
| 147 | static inline void xfs_init_procfs(void) { }; | 147 | static inline int xfs_init_procfs(void) |
| 148 | static inline void xfs_cleanup_procfs(void) { }; | 148 | { |
| 149 | return 0; | ||
| 150 | } | ||
| 151 | |||
| 152 | static inline void xfs_cleanup_procfs(void) | ||
| 153 | { | ||
| 154 | } | ||
| 149 | 155 | ||
| 150 | #endif /* !CONFIG_PROC_FS */ | 156 | #endif /* !CONFIG_PROC_FS */ |
| 151 | 157 | ||
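With xfs_init_procfs() now returning int (and the !CONFIG_PROC_FS stub returning 0), callers can unwind earlier setup when the proc entries cannot be created. The pattern, as it appears in the init path later in this diff, is simply:

	error = xfs_init_procfs();
	if (error)
		goto out_buf_terminate;	/* undo the buffer-cache init done just before */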
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 742b2c7852c1..18d3c8487835 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c | |||
| @@ -52,6 +52,12 @@ | |||
| 52 | #include "xfs_version.h" | 52 | #include "xfs_version.h" |
| 53 | #include "xfs_log_priv.h" | 53 | #include "xfs_log_priv.h" |
| 54 | #include "xfs_trans_priv.h" | 54 | #include "xfs_trans_priv.h" |
| 55 | #include "xfs_filestream.h" | ||
| 56 | #include "xfs_da_btree.h" | ||
| 57 | #include "xfs_dir2_trace.h" | ||
| 58 | #include "xfs_extfree_item.h" | ||
| 59 | #include "xfs_mru_cache.h" | ||
| 60 | #include "xfs_inode_item.h" | ||
| 55 | 61 | ||
| 56 | #include <linux/namei.h> | 62 | #include <linux/namei.h> |
| 57 | #include <linux/init.h> | 63 | #include <linux/init.h> |
| @@ -60,6 +66,7 @@ | |||
| 60 | #include <linux/writeback.h> | 66 | #include <linux/writeback.h> |
| 61 | #include <linux/kthread.h> | 67 | #include <linux/kthread.h> |
| 62 | #include <linux/freezer.h> | 68 | #include <linux/freezer.h> |
| 69 | #include <linux/parser.h> | ||
| 63 | 70 | ||
| 64 | static struct quotactl_ops xfs_quotactl_operations; | 71 | static struct quotactl_ops xfs_quotactl_operations; |
| 65 | static struct super_operations xfs_super_operations; | 72 | static struct super_operations xfs_super_operations; |
| @@ -74,7 +81,10 @@ xfs_args_allocate( | |||
| 74 | { | 81 | { |
| 75 | struct xfs_mount_args *args; | 82 | struct xfs_mount_args *args; |
| 76 | 83 | ||
| 77 | args = kmem_zalloc(sizeof(struct xfs_mount_args), KM_SLEEP); | 84 | args = kzalloc(sizeof(struct xfs_mount_args), GFP_KERNEL); |
| 85 | if (!args) | ||
| 86 | return NULL; | ||
| 87 | |||
| 78 | args->logbufs = args->logbufsize = -1; | 88 | args->logbufs = args->logbufsize = -1; |
| 79 | strncpy(args->fsname, sb->s_id, MAXNAMELEN); | 89 | strncpy(args->fsname, sb->s_id, MAXNAMELEN); |
| 80 | 90 | ||
| @@ -138,6 +148,23 @@ xfs_args_allocate( | |||
| 138 | #define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */ | 148 | #define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */ |
| 139 | #define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */ | 149 | #define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */ |
| 140 | 150 | ||
| 151 | /* | ||
| 152 | * Table driven mount option parser. | ||
| 153 | * | ||
| 154 | * Currently only used for remount, but it will be used for mount | ||
| 155 | * in the future, too. | ||
| 156 | */ | ||
| 157 | enum { | ||
| 158 | Opt_barrier, Opt_nobarrier, Opt_err | ||
| 159 | }; | ||
| 160 | |||
| 161 | static match_table_t tokens = { | ||
| 162 | {Opt_barrier, "barrier"}, | ||
| 163 | {Opt_nobarrier, "nobarrier"}, | ||
| 164 | {Opt_err, NULL} | ||
| 165 | }; | ||
| 166 | |||
| 167 | |||
| 141 | STATIC unsigned long | 168 | STATIC unsigned long |
| 142 | suffix_strtoul(char *s, char **endp, unsigned int base) | 169 | suffix_strtoul(char *s, char **endp, unsigned int base) |
| 143 | { | 170 | { |
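For reference, a table like the one above is consumed with strsep()/match_token() from <linux/parser.h>, which this patch series includes. The loop below is a condensed sketch of the consumer; the full version is the xfs_fs_remount() hunk later in this diff:

	substring_t args[MAX_OPT_ARGS];
	char *p;

	while ((p = strsep(&options, ",")) != NULL) {
		if (!*p)
			continue;
		switch (match_token(p, tokens, args)) {
		case Opt_barrier:
			mp->m_flags |= XFS_MOUNT_BARRIER;
			break;
		case Opt_nobarrier:
			mp->m_flags &= ~XFS_MOUNT_BARRIER;
			break;
		default:
			break;	/* unknown options are silently ignored on remount */
		}
	}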
| @@ -314,6 +341,7 @@ xfs_parseargs( | |||
| 314 | args->flags |= XFSMNT_ATTR2; | 341 | args->flags |= XFSMNT_ATTR2; |
| 315 | } else if (!strcmp(this_char, MNTOPT_NOATTR2)) { | 342 | } else if (!strcmp(this_char, MNTOPT_NOATTR2)) { |
| 316 | args->flags &= ~XFSMNT_ATTR2; | 343 | args->flags &= ~XFSMNT_ATTR2; |
| 344 | args->flags |= XFSMNT_NOATTR2; | ||
| 317 | } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) { | 345 | } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) { |
| 318 | args->flags2 |= XFSMNT2_FILESTREAMS; | 346 | args->flags2 |= XFSMNT2_FILESTREAMS; |
| 319 | } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) { | 347 | } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) { |
| @@ -553,115 +581,6 @@ xfs_max_file_offset( | |||
| 553 | return (((__uint64_t)pagefactor) << bitshift) - 1; | 581 | return (((__uint64_t)pagefactor) << bitshift) - 1; |
| 554 | } | 582 | } |
| 555 | 583 | ||
| 556 | STATIC_INLINE void | ||
| 557 | xfs_set_inodeops( | ||
| 558 | struct inode *inode) | ||
| 559 | { | ||
| 560 | switch (inode->i_mode & S_IFMT) { | ||
| 561 | case S_IFREG: | ||
| 562 | inode->i_op = &xfs_inode_operations; | ||
| 563 | inode->i_fop = &xfs_file_operations; | ||
| 564 | inode->i_mapping->a_ops = &xfs_address_space_operations; | ||
| 565 | break; | ||
| 566 | case S_IFDIR: | ||
| 567 | inode->i_op = &xfs_dir_inode_operations; | ||
| 568 | inode->i_fop = &xfs_dir_file_operations; | ||
| 569 | break; | ||
| 570 | case S_IFLNK: | ||
| 571 | inode->i_op = &xfs_symlink_inode_operations; | ||
| 572 | if (!(XFS_I(inode)->i_df.if_flags & XFS_IFINLINE)) | ||
| 573 | inode->i_mapping->a_ops = &xfs_address_space_operations; | ||
| 574 | break; | ||
| 575 | default: | ||
| 576 | inode->i_op = &xfs_inode_operations; | ||
| 577 | init_special_inode(inode, inode->i_mode, inode->i_rdev); | ||
| 578 | break; | ||
| 579 | } | ||
| 580 | } | ||
| 581 | |||
| 582 | STATIC_INLINE void | ||
| 583 | xfs_revalidate_inode( | ||
| 584 | xfs_mount_t *mp, | ||
| 585 | bhv_vnode_t *vp, | ||
| 586 | xfs_inode_t *ip) | ||
| 587 | { | ||
| 588 | struct inode *inode = vn_to_inode(vp); | ||
| 589 | |||
| 590 | inode->i_mode = ip->i_d.di_mode; | ||
| 591 | inode->i_nlink = ip->i_d.di_nlink; | ||
| 592 | inode->i_uid = ip->i_d.di_uid; | ||
| 593 | inode->i_gid = ip->i_d.di_gid; | ||
| 594 | |||
| 595 | switch (inode->i_mode & S_IFMT) { | ||
| 596 | case S_IFBLK: | ||
| 597 | case S_IFCHR: | ||
| 598 | inode->i_rdev = | ||
| 599 | MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff, | ||
| 600 | sysv_minor(ip->i_df.if_u2.if_rdev)); | ||
| 601 | break; | ||
| 602 | default: | ||
| 603 | inode->i_rdev = 0; | ||
| 604 | break; | ||
| 605 | } | ||
| 606 | |||
| 607 | inode->i_generation = ip->i_d.di_gen; | ||
| 608 | i_size_write(inode, ip->i_d.di_size); | ||
| 609 | inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec; | ||
| 610 | inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec; | ||
| 611 | inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec; | ||
| 612 | inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; | ||
| 613 | inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec; | ||
| 614 | inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; | ||
| 615 | if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) | ||
| 616 | inode->i_flags |= S_IMMUTABLE; | ||
| 617 | else | ||
| 618 | inode->i_flags &= ~S_IMMUTABLE; | ||
| 619 | if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) | ||
| 620 | inode->i_flags |= S_APPEND; | ||
| 621 | else | ||
| 622 | inode->i_flags &= ~S_APPEND; | ||
| 623 | if (ip->i_d.di_flags & XFS_DIFLAG_SYNC) | ||
| 624 | inode->i_flags |= S_SYNC; | ||
| 625 | else | ||
| 626 | inode->i_flags &= ~S_SYNC; | ||
| 627 | if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME) | ||
| 628 | inode->i_flags |= S_NOATIME; | ||
| 629 | else | ||
| 630 | inode->i_flags &= ~S_NOATIME; | ||
| 631 | xfs_iflags_clear(ip, XFS_IMODIFIED); | ||
| 632 | } | ||
| 633 | |||
| 634 | void | ||
| 635 | xfs_initialize_vnode( | ||
| 636 | struct xfs_mount *mp, | ||
| 637 | bhv_vnode_t *vp, | ||
| 638 | struct xfs_inode *ip) | ||
| 639 | { | ||
| 640 | struct inode *inode = vn_to_inode(vp); | ||
| 641 | |||
| 642 | if (!ip->i_vnode) { | ||
| 643 | ip->i_vnode = vp; | ||
| 644 | inode->i_private = ip; | ||
| 645 | } | ||
| 646 | |||
| 647 | /* | ||
| 648 | * We need to set the ops vectors, and unlock the inode, but if | ||
| 649 | * we have been called during the new inode create process, it is | ||
| 650 | * too early to fill in the Linux inode. We will get called a | ||
| 651 | * second time once the inode is properly set up, and then we can | ||
| 652 | * finish our work. | ||
| 653 | */ | ||
| 654 | if (ip->i_d.di_mode != 0 && (inode->i_state & I_NEW)) { | ||
| 655 | xfs_revalidate_inode(mp, vp, ip); | ||
| 656 | xfs_set_inodeops(inode); | ||
| 657 | |||
| 658 | xfs_iflags_clear(ip, XFS_INEW); | ||
| 659 | barrier(); | ||
| 660 | |||
| 661 | unlock_new_inode(inode); | ||
| 662 | } | ||
| 663 | } | ||
| 664 | |||
| 665 | int | 584 | int |
| 666 | xfs_blkdev_get( | 585 | xfs_blkdev_get( |
| 667 | xfs_mount_t *mp, | 586 | xfs_mount_t *mp, |
| @@ -733,14 +652,6 @@ xfs_mountfs_check_barriers(xfs_mount_t *mp) | |||
| 733 | return; | 652 | return; |
| 734 | } | 653 | } |
| 735 | 654 | ||
| 736 | if (mp->m_ddev_targp->bt_bdev->bd_disk->queue->ordered == | ||
| 737 | QUEUE_ORDERED_NONE) { | ||
| 738 | xfs_fs_cmn_err(CE_NOTE, mp, | ||
| 739 | "Disabling barriers, not supported by the underlying device"); | ||
| 740 | mp->m_flags &= ~XFS_MOUNT_BARRIER; | ||
| 741 | return; | ||
| 742 | } | ||
| 743 | |||
| 744 | if (xfs_readonly_buftarg(mp->m_ddev_targp)) { | 655 | if (xfs_readonly_buftarg(mp->m_ddev_targp)) { |
| 745 | xfs_fs_cmn_err(CE_NOTE, mp, | 656 | xfs_fs_cmn_err(CE_NOTE, mp, |
| 746 | "Disabling barriers, underlying device is readonly"); | 657 | "Disabling barriers, underlying device is readonly"); |
| @@ -764,6 +675,139 @@ xfs_blkdev_issue_flush( | |||
| 764 | blkdev_issue_flush(buftarg->bt_bdev, NULL); | 675 | blkdev_issue_flush(buftarg->bt_bdev, NULL); |
| 765 | } | 676 | } |
| 766 | 677 | ||
| 678 | STATIC void | ||
| 679 | xfs_close_devices( | ||
| 680 | struct xfs_mount *mp) | ||
| 681 | { | ||
| 682 | if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { | ||
| 683 | struct block_device *logdev = mp->m_logdev_targp->bt_bdev; | ||
| 684 | xfs_free_buftarg(mp->m_logdev_targp); | ||
| 685 | xfs_blkdev_put(logdev); | ||
| 686 | } | ||
| 687 | if (mp->m_rtdev_targp) { | ||
| 688 | struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; | ||
| 689 | xfs_free_buftarg(mp->m_rtdev_targp); | ||
| 690 | xfs_blkdev_put(rtdev); | ||
| 691 | } | ||
| 692 | xfs_free_buftarg(mp->m_ddev_targp); | ||
| 693 | } | ||
| 694 | |||
| 695 | /* | ||
| 696 | * The file system configurations are: | ||
| 697 | * (1) device (partition) with data and internal log | ||
| 698 | * (2) logical volume with data and log subvolumes. | ||
| 699 | * (3) logical volume with data, log, and realtime subvolumes. | ||
| 700 | * | ||
| 701 | * We only have to handle opening the log and realtime volumes here if | ||
| 702 | * they are present. The data subvolume has already been opened by | ||
| 703 | * get_sb_bdev() and is stored in sb->s_bdev. | ||
| 704 | */ | ||
| 705 | STATIC int | ||
| 706 | xfs_open_devices( | ||
| 707 | struct xfs_mount *mp, | ||
| 708 | struct xfs_mount_args *args) | ||
| 709 | { | ||
| 710 | struct block_device *ddev = mp->m_super->s_bdev; | ||
| 711 | struct block_device *logdev = NULL, *rtdev = NULL; | ||
| 712 | int error; | ||
| 713 | |||
| 714 | /* | ||
| 715 | * Open real time and log devices - order is important. | ||
| 716 | */ | ||
| 717 | if (args->logname[0]) { | ||
| 718 | error = xfs_blkdev_get(mp, args->logname, &logdev); | ||
| 719 | if (error) | ||
| 720 | goto out; | ||
| 721 | } | ||
| 722 | |||
| 723 | if (args->rtname[0]) { | ||
| 724 | error = xfs_blkdev_get(mp, args->rtname, &rtdev); | ||
| 725 | if (error) | ||
| 726 | goto out_close_logdev; | ||
| 727 | |||
| 728 | if (rtdev == ddev || rtdev == logdev) { | ||
| 729 | cmn_err(CE_WARN, | ||
| 730 | "XFS: Cannot mount filesystem with identical rtdev and ddev/logdev."); | ||
| 731 | error = EINVAL; | ||
| 732 | goto out_close_rtdev; | ||
| 733 | } | ||
| 734 | } | ||
| 735 | |||
| 736 | /* | ||
| 737 | * Setup xfs_mount buffer target pointers | ||
| 738 | */ | ||
| 739 | error = ENOMEM; | ||
| 740 | mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0); | ||
| 741 | if (!mp->m_ddev_targp) | ||
| 742 | goto out_close_rtdev; | ||
| 743 | |||
| 744 | if (rtdev) { | ||
| 745 | mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1); | ||
| 746 | if (!mp->m_rtdev_targp) | ||
| 747 | goto out_free_ddev_targ; | ||
| 748 | } | ||
| 749 | |||
| 750 | if (logdev && logdev != ddev) { | ||
| 751 | mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1); | ||
| 752 | if (!mp->m_logdev_targp) | ||
| 753 | goto out_free_rtdev_targ; | ||
| 754 | } else { | ||
| 755 | mp->m_logdev_targp = mp->m_ddev_targp; | ||
| 756 | } | ||
| 757 | |||
| 758 | return 0; | ||
| 759 | |||
| 760 | out_free_rtdev_targ: | ||
| 761 | if (mp->m_rtdev_targp) | ||
| 762 | xfs_free_buftarg(mp->m_rtdev_targp); | ||
| 763 | out_free_ddev_targ: | ||
| 764 | xfs_free_buftarg(mp->m_ddev_targp); | ||
| 765 | out_close_rtdev: | ||
| 766 | if (rtdev) | ||
| 767 | xfs_blkdev_put(rtdev); | ||
| 768 | out_close_logdev: | ||
| 769 | if (logdev && logdev != ddev) | ||
| 770 | xfs_blkdev_put(logdev); | ||
| 771 | out: | ||
| 772 | return error; | ||
| 773 | } | ||
| 774 | |||
| 775 | /* | ||
| 776 | * Setup xfs_mount buffer target pointers based on superblock | ||
| 777 | */ | ||
| 778 | STATIC int | ||
| 779 | xfs_setup_devices( | ||
| 780 | struct xfs_mount *mp) | ||
| 781 | { | ||
| 782 | int error; | ||
| 783 | |||
| 784 | error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_blocksize, | ||
| 785 | mp->m_sb.sb_sectsize); | ||
| 786 | if (error) | ||
| 787 | return error; | ||
| 788 | |||
| 789 | if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { | ||
| 790 | unsigned int log_sector_size = BBSIZE; | ||
| 791 | |||
| 792 | if (xfs_sb_version_hassector(&mp->m_sb)) | ||
| 793 | log_sector_size = mp->m_sb.sb_logsectsize; | ||
| 794 | error = xfs_setsize_buftarg(mp->m_logdev_targp, | ||
| 795 | mp->m_sb.sb_blocksize, | ||
| 796 | log_sector_size); | ||
| 797 | if (error) | ||
| 798 | return error; | ||
| 799 | } | ||
| 800 | if (mp->m_rtdev_targp) { | ||
| 801 | error = xfs_setsize_buftarg(mp->m_rtdev_targp, | ||
| 802 | mp->m_sb.sb_blocksize, | ||
| 803 | mp->m_sb.sb_sectsize); | ||
| 804 | if (error) | ||
| 805 | return error; | ||
| 806 | } | ||
| 807 | |||
| 808 | return 0; | ||
| 809 | } | ||
| 810 | |||
| 767 | /* | 811 | /* |
| 768 | * XFS AIL push thread support | 812 | * XFS AIL push thread support |
| 769 | */ | 813 | */ |
| @@ -826,63 +870,21 @@ STATIC struct inode * | |||
| 826 | xfs_fs_alloc_inode( | 870 | xfs_fs_alloc_inode( |
| 827 | struct super_block *sb) | 871 | struct super_block *sb) |
| 828 | { | 872 | { |
| 829 | bhv_vnode_t *vp; | 873 | return kmem_zone_alloc(xfs_vnode_zone, KM_SLEEP); |
| 830 | |||
| 831 | vp = kmem_zone_alloc(xfs_vnode_zone, KM_SLEEP); | ||
| 832 | if (unlikely(!vp)) | ||
| 833 | return NULL; | ||
| 834 | return vn_to_inode(vp); | ||
| 835 | } | 874 | } |
| 836 | 875 | ||
| 837 | STATIC void | 876 | STATIC void |
| 838 | xfs_fs_destroy_inode( | 877 | xfs_fs_destroy_inode( |
| 839 | struct inode *inode) | 878 | struct inode *inode) |
| 840 | { | 879 | { |
| 841 | kmem_zone_free(xfs_vnode_zone, vn_from_inode(inode)); | 880 | kmem_zone_free(xfs_vnode_zone, inode); |
| 842 | } | 881 | } |
| 843 | 882 | ||
| 844 | STATIC void | 883 | STATIC void |
| 845 | xfs_fs_inode_init_once( | 884 | xfs_fs_inode_init_once( |
| 846 | kmem_zone_t *zonep, | ||
| 847 | void *vnode) | 885 | void *vnode) |
| 848 | { | 886 | { |
| 849 | inode_init_once(vn_to_inode((bhv_vnode_t *)vnode)); | 887 | inode_init_once((struct inode *)vnode); |
| 850 | } | ||
| 851 | |||
| 852 | STATIC int __init | ||
| 853 | xfs_init_zones(void) | ||
| 854 | { | ||
| 855 | xfs_vnode_zone = kmem_zone_init_flags(sizeof(bhv_vnode_t), "xfs_vnode", | ||
| 856 | KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | | ||
| 857 | KM_ZONE_SPREAD, | ||
| 858 | xfs_fs_inode_init_once); | ||
| 859 | if (!xfs_vnode_zone) | ||
| 860 | goto out; | ||
| 861 | |||
| 862 | xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend"); | ||
| 863 | if (!xfs_ioend_zone) | ||
| 864 | goto out_destroy_vnode_zone; | ||
| 865 | |||
| 866 | xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE, | ||
| 867 | xfs_ioend_zone); | ||
| 868 | if (!xfs_ioend_pool) | ||
| 869 | goto out_free_ioend_zone; | ||
| 870 | return 0; | ||
| 871 | |||
| 872 | out_free_ioend_zone: | ||
| 873 | kmem_zone_destroy(xfs_ioend_zone); | ||
| 874 | out_destroy_vnode_zone: | ||
| 875 | kmem_zone_destroy(xfs_vnode_zone); | ||
| 876 | out: | ||
| 877 | return -ENOMEM; | ||
| 878 | } | ||
| 879 | |||
| 880 | STATIC void | ||
| 881 | xfs_destroy_zones(void) | ||
| 882 | { | ||
| 883 | mempool_destroy(xfs_ioend_pool); | ||
| 884 | kmem_zone_destroy(xfs_vnode_zone); | ||
| 885 | kmem_zone_destroy(xfs_ioend_zone); | ||
| 886 | } | 888 | } |
| 887 | 889 | ||
| 888 | /* | 890 | /* |
| @@ -987,7 +989,7 @@ void | |||
| 987 | xfs_flush_inode( | 989 | xfs_flush_inode( |
| 988 | xfs_inode_t *ip) | 990 | xfs_inode_t *ip) |
| 989 | { | 991 | { |
| 990 | struct inode *inode = ip->i_vnode; | 992 | struct inode *inode = VFS_I(ip); |
| 991 | 993 | ||
| 992 | igrab(inode); | 994 | igrab(inode); |
| 993 | xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work); | 995 | xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work); |
| @@ -1012,7 +1014,7 @@ void | |||
| 1012 | xfs_flush_device( | 1014 | xfs_flush_device( |
| 1013 | xfs_inode_t *ip) | 1015 | xfs_inode_t *ip) |
| 1014 | { | 1016 | { |
| 1015 | struct inode *inode = vn_to_inode(XFS_ITOV(ip)); | 1017 | struct inode *inode = VFS_I(ip); |
| 1016 | 1018 | ||
| 1017 | igrab(inode); | 1019 | igrab(inode); |
| 1018 | xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work); | 1020 | xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work); |
| @@ -1074,7 +1076,7 @@ xfssyncd( | |||
| 1074 | list_del(&work->w_list); | 1076 | list_del(&work->w_list); |
| 1075 | if (work == &mp->m_sync_work) | 1077 | if (work == &mp->m_sync_work) |
| 1076 | continue; | 1078 | continue; |
| 1077 | kmem_free(work, sizeof(struct bhv_vfs_sync_work)); | 1079 | kmem_free(work); |
| 1078 | } | 1080 | } |
| 1079 | } | 1081 | } |
| 1080 | 1082 | ||
| @@ -1082,18 +1084,76 @@ xfssyncd( | |||
| 1082 | } | 1084 | } |
| 1083 | 1085 | ||
| 1084 | STATIC void | 1086 | STATIC void |
| 1087 | xfs_free_fsname( | ||
| 1088 | struct xfs_mount *mp) | ||
| 1089 | { | ||
| 1090 | kfree(mp->m_fsname); | ||
| 1091 | kfree(mp->m_rtname); | ||
| 1092 | kfree(mp->m_logname); | ||
| 1093 | } | ||
| 1094 | |||
| 1095 | STATIC void | ||
| 1085 | xfs_fs_put_super( | 1096 | xfs_fs_put_super( |
| 1086 | struct super_block *sb) | 1097 | struct super_block *sb) |
| 1087 | { | 1098 | { |
| 1088 | struct xfs_mount *mp = XFS_M(sb); | 1099 | struct xfs_mount *mp = XFS_M(sb); |
| 1100 | struct xfs_inode *rip = mp->m_rootip; | ||
| 1101 | int unmount_event_flags = 0; | ||
| 1089 | int error; | 1102 | int error; |
| 1090 | 1103 | ||
| 1091 | kthread_stop(mp->m_sync_task); | 1104 | kthread_stop(mp->m_sync_task); |
| 1092 | 1105 | ||
| 1093 | xfs_sync(mp, SYNC_ATTR | SYNC_DELWRI); | 1106 | xfs_sync(mp, SYNC_ATTR | SYNC_DELWRI); |
| 1094 | error = xfs_unmount(mp, 0, NULL); | 1107 | |
| 1095 | if (error) | 1108 | #ifdef HAVE_DMAPI |
| 1096 | printk("XFS: unmount got error=%d\n", error); | 1109 | if (mp->m_flags & XFS_MOUNT_DMAPI) { |
| 1110 | unmount_event_flags = | ||
| 1111 | (mp->m_dmevmask & (1 << DM_EVENT_UNMOUNT)) ? | ||
| 1112 | 0 : DM_FLAGS_UNWANTED; | ||
| 1113 | /* | ||
| 1114 | * Ignore the error from dmapi here; first, unmount is not allowed | ||
| 1115 | * to fail anyway, and second, we wouldn't want to fail an | ||
| 1116 | * unmount because of dmapi. | ||
| 1117 | */ | ||
| 1118 | XFS_SEND_PREUNMOUNT(mp, rip, DM_RIGHT_NULL, rip, DM_RIGHT_NULL, | ||
| 1119 | NULL, NULL, 0, 0, unmount_event_flags); | ||
| 1120 | } | ||
| 1121 | #endif | ||
| 1122 | |||
| 1123 | /* | ||
| 1124 | * Blow away any referenced inode in the filestreams cache. | ||
| 1125 | * This can and will cause log traffic as inodes go inactive | ||
| 1126 | * here. | ||
| 1127 | */ | ||
| 1128 | xfs_filestream_unmount(mp); | ||
| 1129 | |||
| 1130 | XFS_bflush(mp->m_ddev_targp); | ||
| 1131 | error = xfs_unmount_flush(mp, 0); | ||
| 1132 | WARN_ON(error); | ||
| 1133 | |||
| 1134 | /* | ||
| 1135 | * If we're forcing a shutdown, typically because of a media error, | ||
| 1136 | * we want to make sure we invalidate dirty pages that belong to | ||
| 1137 | * referenced vnodes as well. | ||
| 1138 | */ | ||
| 1139 | if (XFS_FORCED_SHUTDOWN(mp)) { | ||
| 1140 | error = xfs_sync(mp, SYNC_WAIT | SYNC_CLOSE); | ||
| 1141 | ASSERT(error != EFSCORRUPTED); | ||
| 1142 | } | ||
| 1143 | |||
| 1144 | if (mp->m_flags & XFS_MOUNT_DMAPI) { | ||
| 1145 | XFS_SEND_UNMOUNT(mp, rip, DM_RIGHT_NULL, 0, 0, | ||
| 1146 | unmount_event_flags); | ||
| 1147 | } | ||
| 1148 | |||
| 1149 | xfs_unmountfs(mp); | ||
| 1150 | xfs_freesb(mp); | ||
| 1151 | xfs_icsb_destroy_counters(mp); | ||
| 1152 | xfs_close_devices(mp); | ||
| 1153 | xfs_qmops_put(mp); | ||
| 1154 | xfs_dmops_put(mp); | ||
| 1155 | xfs_free_fsname(mp); | ||
| 1156 | kfree(mp); | ||
| 1097 | } | 1157 | } |
| 1098 | 1158 | ||
| 1099 | STATIC void | 1159 | STATIC void |
| @@ -1216,14 +1276,74 @@ xfs_fs_remount( | |||
| 1216 | char *options) | 1276 | char *options) |
| 1217 | { | 1277 | { |
| 1218 | struct xfs_mount *mp = XFS_M(sb); | 1278 | struct xfs_mount *mp = XFS_M(sb); |
| 1219 | struct xfs_mount_args *args = xfs_args_allocate(sb, 0); | 1279 | substring_t args[MAX_OPT_ARGS]; |
| 1220 | int error; | 1280 | char *p; |
| 1221 | 1281 | ||
| 1222 | error = xfs_parseargs(mp, options, args, 1); | 1282 | while ((p = strsep(&options, ",")) != NULL) { |
| 1223 | if (!error) | 1283 | int token; |
| 1224 | error = xfs_mntupdate(mp, flags, args); | 1284 | |
| 1225 | kmem_free(args, sizeof(*args)); | 1285 | if (!*p) |
| 1226 | return -error; | 1286 | continue; |
| 1287 | |||
| 1288 | token = match_token(p, tokens, args); | ||
| 1289 | switch (token) { | ||
| 1290 | case Opt_barrier: | ||
| 1291 | mp->m_flags |= XFS_MOUNT_BARRIER; | ||
| 1292 | |||
| 1293 | /* | ||
| 1294 | * Test if barriers are actually working if we can, | ||
| 1295 | * else delay this check until the filesystem is | ||
| 1296 | * marked writeable. | ||
| 1297 | */ | ||
| 1298 | if (!(mp->m_flags & XFS_MOUNT_RDONLY)) | ||
| 1299 | xfs_mountfs_check_barriers(mp); | ||
| 1300 | break; | ||
| 1301 | case Opt_nobarrier: | ||
| 1302 | mp->m_flags &= ~XFS_MOUNT_BARRIER; | ||
| 1303 | break; | ||
| 1304 | default: | ||
| 1305 | /* | ||
| 1306 | * Logically we would return an error here to prevent | ||
| 1307 | * users from believing they might have changed | ||
| 1308 | * mount options using remount which can't be changed. | ||
| 1309 | * | ||
| 1310 | * But unfortunately mount(8) adds all options from | ||
| 1311 | * mtab and fstab to the mount arguments in some cases | ||
| 1312 | * so we can't blindly reject options, but have to | ||
| 1313 | * check for each specified option if it actually | ||
| 1314 | * differs from the currently set option and only | ||
| 1315 | * reject it if that's the case. | ||
| 1316 | * | ||
| 1317 | * Until that is implemented we return success for | ||
| 1318 | * every remount request, and silently ignore all | ||
| 1319 | * options that we can't actually change. | ||
| 1320 | */ | ||
| 1321 | #if 0 | ||
| 1322 | printk(KERN_INFO | ||
| 1323 | "XFS: mount option \"%s\" not supported for remount\n", p); | ||
| 1324 | return -EINVAL; | ||
| 1325 | #else | ||
| 1326 | return 0; | ||
| 1327 | #endif | ||
| 1328 | } | ||
| 1329 | } | ||
| 1330 | |||
| 1331 | /* rw/ro -> rw */ | ||
| 1332 | if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { | ||
| 1333 | mp->m_flags &= ~XFS_MOUNT_RDONLY; | ||
| 1334 | if (mp->m_flags & XFS_MOUNT_BARRIER) | ||
| 1335 | xfs_mountfs_check_barriers(mp); | ||
| 1336 | } | ||
| 1337 | |||
| 1338 | /* rw -> ro */ | ||
| 1339 | if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { | ||
| 1340 | xfs_filestream_flush(mp); | ||
| 1341 | xfs_sync(mp, SYNC_DATA_QUIESCE); | ||
| 1342 | xfs_attr_quiesce(mp); | ||
| 1343 | mp->m_flags |= XFS_MOUNT_RDONLY; | ||
| 1344 | } | ||
| 1345 | |||
| 1346 | return 0; | ||
| 1227 | } | 1347 | } |
| 1228 | 1348 | ||
| 1229 | /* | 1349 | /* |
| @@ -1300,6 +1420,245 @@ xfs_fs_setxquota( | |||
| 1300 | Q_XSETPQLIM), id, (caddr_t)fdq); | 1420 | Q_XSETPQLIM), id, (caddr_t)fdq); |
| 1301 | } | 1421 | } |
| 1302 | 1422 | ||
| 1423 | /* | ||
| 1424 | * This function fills in xfs_mount_t fields based on mount args. | ||
| 1425 | * Note: the superblock has _not_ yet been read in. | ||
| 1426 | */ | ||
| 1427 | STATIC int | ||
| 1428 | xfs_start_flags( | ||
| 1429 | struct xfs_mount_args *ap, | ||
| 1430 | struct xfs_mount *mp) | ||
| 1431 | { | ||
| 1432 | int error; | ||
| 1433 | |||
| 1434 | /* Values are in BBs */ | ||
| 1435 | if ((ap->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) { | ||
| 1436 | /* | ||
| 1437 | * At this point the superblock has not been read | ||
| 1438 | * in, therefore we do not know the block size. | ||
| 1439 | * Before the mount call ends we will convert | ||
| 1440 | * these to FSBs. | ||
| 1441 | */ | ||
| 1442 | mp->m_dalign = ap->sunit; | ||
| 1443 | mp->m_swidth = ap->swidth; | ||
| 1444 | } | ||
| 1445 | |||
| 1446 | if (ap->logbufs != -1 && | ||
| 1447 | ap->logbufs != 0 && | ||
| 1448 | (ap->logbufs < XLOG_MIN_ICLOGS || | ||
| 1449 | ap->logbufs > XLOG_MAX_ICLOGS)) { | ||
| 1450 | cmn_err(CE_WARN, | ||
| 1451 | "XFS: invalid logbufs value: %d [not %d-%d]", | ||
| 1452 | ap->logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); | ||
| 1453 | return XFS_ERROR(EINVAL); | ||
| 1454 | } | ||
| 1455 | mp->m_logbufs = ap->logbufs; | ||
| 1456 | if (ap->logbufsize != -1 && | ||
| 1457 | ap->logbufsize != 0 && | ||
| 1458 | (ap->logbufsize < XLOG_MIN_RECORD_BSIZE || | ||
| 1459 | ap->logbufsize > XLOG_MAX_RECORD_BSIZE || | ||
| 1460 | !is_power_of_2(ap->logbufsize))) { | ||
| 1461 | cmn_err(CE_WARN, | ||
| 1462 | "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", | ||
| 1463 | ap->logbufsize); | ||
| 1464 | return XFS_ERROR(EINVAL); | ||
| 1465 | } | ||
| 1466 | |||
| 1467 | error = ENOMEM; | ||
| 1468 | |||
| 1469 | mp->m_logbsize = ap->logbufsize; | ||
| 1470 | mp->m_fsname_len = strlen(ap->fsname) + 1; | ||
| 1471 | |||
| 1472 | mp->m_fsname = kstrdup(ap->fsname, GFP_KERNEL); | ||
| 1473 | if (!mp->m_fsname) | ||
| 1474 | goto out; | ||
| 1475 | |||
| 1476 | if (ap->rtname[0]) { | ||
| 1477 | mp->m_rtname = kstrdup(ap->rtname, GFP_KERNEL); | ||
| 1478 | if (!mp->m_rtname) | ||
| 1479 | goto out_free_fsname; | ||
| 1480 | |||
| 1481 | } | ||
| 1482 | |||
| 1483 | if (ap->logname[0]) { | ||
| 1484 | mp->m_logname = kstrdup(ap->logname, GFP_KERNEL); | ||
| 1485 | if (!mp->m_logname) | ||
| 1486 | goto out_free_rtname; | ||
| 1487 | } | ||
| 1488 | |||
| 1489 | if (ap->flags & XFSMNT_WSYNC) | ||
| 1490 | mp->m_flags |= XFS_MOUNT_WSYNC; | ||
| 1491 | #if XFS_BIG_INUMS | ||
| 1492 | if (ap->flags & XFSMNT_INO64) { | ||
| 1493 | mp->m_flags |= XFS_MOUNT_INO64; | ||
| 1494 | mp->m_inoadd = XFS_INO64_OFFSET; | ||
| 1495 | } | ||
| 1496 | #endif | ||
| 1497 | if (ap->flags & XFSMNT_RETERR) | ||
| 1498 | mp->m_flags |= XFS_MOUNT_RETERR; | ||
| 1499 | if (ap->flags & XFSMNT_NOALIGN) | ||
| 1500 | mp->m_flags |= XFS_MOUNT_NOALIGN; | ||
| 1501 | if (ap->flags & XFSMNT_SWALLOC) | ||
| 1502 | mp->m_flags |= XFS_MOUNT_SWALLOC; | ||
| 1503 | if (ap->flags & XFSMNT_OSYNCISOSYNC) | ||
| 1504 | mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC; | ||
| 1505 | if (ap->flags & XFSMNT_32BITINODES) | ||
| 1506 | mp->m_flags |= XFS_MOUNT_32BITINODES; | ||
| 1507 | |||
| 1508 | if (ap->flags & XFSMNT_IOSIZE) { | ||
| 1509 | if (ap->iosizelog > XFS_MAX_IO_LOG || | ||
| 1510 | ap->iosizelog < XFS_MIN_IO_LOG) { | ||
| 1511 | cmn_err(CE_WARN, | ||
| 1512 | "XFS: invalid log iosize: %d [not %d-%d]", | ||
| 1513 | ap->iosizelog, XFS_MIN_IO_LOG, | ||
| 1514 | XFS_MAX_IO_LOG); | ||
| 1515 | return XFS_ERROR(EINVAL); | ||
| 1516 | } | ||
| 1517 | |||
| 1518 | mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE; | ||
| 1519 | mp->m_readio_log = mp->m_writeio_log = ap->iosizelog; | ||
| 1520 | } | ||
| 1521 | |||
| 1522 | if (ap->flags & XFSMNT_IKEEP) | ||
| 1523 | mp->m_flags |= XFS_MOUNT_IKEEP; | ||
| 1524 | if (ap->flags & XFSMNT_DIRSYNC) | ||
| 1525 | mp->m_flags |= XFS_MOUNT_DIRSYNC; | ||
| 1526 | if (ap->flags & XFSMNT_ATTR2) | ||
| 1527 | mp->m_flags |= XFS_MOUNT_ATTR2; | ||
| 1528 | if (ap->flags & XFSMNT_NOATTR2) | ||
| 1529 | mp->m_flags |= XFS_MOUNT_NOATTR2; | ||
| 1530 | |||
| 1531 | if (ap->flags2 & XFSMNT2_COMPAT_IOSIZE) | ||
| 1532 | mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; | ||
| 1533 | |||
| 1534 | /* | ||
| 1535 | * no recovery flag requires a read-only mount | ||
| 1536 | */ | ||
| 1537 | if (ap->flags & XFSMNT_NORECOVERY) { | ||
| 1538 | if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { | ||
| 1539 | cmn_err(CE_WARN, | ||
| 1540 | "XFS: tried to mount a FS read-write without recovery!"); | ||
| 1541 | return XFS_ERROR(EINVAL); | ||
| 1542 | } | ||
| 1543 | mp->m_flags |= XFS_MOUNT_NORECOVERY; | ||
| 1544 | } | ||
| 1545 | |||
| 1546 | if (ap->flags & XFSMNT_NOUUID) | ||
| 1547 | mp->m_flags |= XFS_MOUNT_NOUUID; | ||
| 1548 | if (ap->flags & XFSMNT_BARRIER) | ||
| 1549 | mp->m_flags |= XFS_MOUNT_BARRIER; | ||
| 1550 | else | ||
| 1551 | mp->m_flags &= ~XFS_MOUNT_BARRIER; | ||
| 1552 | |||
| 1553 | if (ap->flags2 & XFSMNT2_FILESTREAMS) | ||
| 1554 | mp->m_flags |= XFS_MOUNT_FILESTREAMS; | ||
| 1555 | |||
| 1556 | if (ap->flags & XFSMNT_DMAPI) | ||
| 1557 | mp->m_flags |= XFS_MOUNT_DMAPI; | ||
| 1558 | return 0; | ||
| 1559 | |||
| 1560 | |||
| 1561 | out_free_rtname: | ||
| 1562 | kfree(mp->m_rtname); | ||
| 1563 | out_free_fsname: | ||
| 1564 | kfree(mp->m_fsname); | ||
| 1565 | out: | ||
| 1566 | return error; | ||
| 1567 | } | ||
| 1568 | |||
| 1569 | /* | ||
| 1570 | * This function fills in xfs_mount_t fields based on mount args. | ||
| 1571 | * Note: the superblock _has_ now been read in. | ||
| 1572 | */ | ||
| 1573 | STATIC int | ||
| 1574 | xfs_finish_flags( | ||
| 1575 | struct xfs_mount_args *ap, | ||
| 1576 | struct xfs_mount *mp) | ||
| 1577 | { | ||
| 1578 | int ronly = (mp->m_flags & XFS_MOUNT_RDONLY); | ||
| 1579 | |||
| 1580 | /* Fail a mount where the logbuf is smaller than the log stripe */ | ||
| 1581 | if (xfs_sb_version_haslogv2(&mp->m_sb)) { | ||
| 1582 | if ((ap->logbufsize <= 0) && | ||
| 1583 | (mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE)) { | ||
| 1584 | mp->m_logbsize = mp->m_sb.sb_logsunit; | ||
| 1585 | } else if (ap->logbufsize > 0 && | ||
| 1586 | ap->logbufsize < mp->m_sb.sb_logsunit) { | ||
| 1587 | cmn_err(CE_WARN, | ||
| 1588 | "XFS: logbuf size must be greater than or equal to log stripe size"); | ||
| 1589 | return XFS_ERROR(EINVAL); | ||
| 1590 | } | ||
| 1591 | } else { | ||
| 1592 | /* Fail a mount if the logbuf is larger than 32K */ | ||
| 1593 | if (ap->logbufsize > XLOG_BIG_RECORD_BSIZE) { | ||
| 1594 | cmn_err(CE_WARN, | ||
| 1595 | "XFS: logbuf size for version 1 logs must be 16K or 32K"); | ||
| 1596 | return XFS_ERROR(EINVAL); | ||
| 1597 | } | ||
| 1598 | } | ||
| 1599 | |||
| 1600 | /* | ||
| 1601 | * mkfs'ed attr2 will turn on attr2 mount unless explicitly | ||
| 1602 | * told by noattr2 to turn it off | ||
| 1603 | */ | ||
| 1604 | if (xfs_sb_version_hasattr2(&mp->m_sb) && | ||
| 1605 | !(ap->flags & XFSMNT_NOATTR2)) | ||
| 1606 | mp->m_flags |= XFS_MOUNT_ATTR2; | ||
| 1607 | |||
| 1608 | /* | ||
| 1609 | * prohibit r/w mounts of read-only filesystems | ||
| 1610 | */ | ||
| 1611 | if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) { | ||
| 1612 | cmn_err(CE_WARN, | ||
| 1613 | "XFS: cannot mount a read-only filesystem as read-write"); | ||
| 1614 | return XFS_ERROR(EROFS); | ||
| 1615 | } | ||
| 1616 | |||
| 1617 | /* | ||
| 1618 | * check for shared mount. | ||
| 1619 | */ | ||
| 1620 | if (ap->flags & XFSMNT_SHARED) { | ||
| 1621 | if (!xfs_sb_version_hasshared(&mp->m_sb)) | ||
| 1622 | return XFS_ERROR(EINVAL); | ||
| 1623 | |||
| 1624 | /* | ||
| 1625 | * For IRIX 6.5, shared mounts must have the shared | ||
| 1626 | * version bit set, have the persistent readonly | ||
| 1627 | * field set, must be version 0 and can only be mounted | ||
| 1628 | * read-only. | ||
| 1629 | */ | ||
| 1630 | if (!ronly || !(mp->m_sb.sb_flags & XFS_SBF_READONLY) || | ||
| 1631 | (mp->m_sb.sb_shared_vn != 0)) | ||
| 1632 | return XFS_ERROR(EINVAL); | ||
| 1633 | |||
| 1634 | mp->m_flags |= XFS_MOUNT_SHARED; | ||
| 1635 | |||
| 1636 | /* | ||
| 1637 | * Shared XFS V0 can't deal with DMI. Return EINVAL. | ||
| 1638 | */ | ||
| 1639 | if (mp->m_sb.sb_shared_vn == 0 && (ap->flags & XFSMNT_DMAPI)) | ||
| 1640 | return XFS_ERROR(EINVAL); | ||
| 1641 | } | ||
| 1642 | |||
| 1643 | if (ap->flags & XFSMNT_UQUOTA) { | ||
| 1644 | mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE); | ||
| 1645 | if (ap->flags & XFSMNT_UQUOTAENF) | ||
| 1646 | mp->m_qflags |= XFS_UQUOTA_ENFD; | ||
| 1647 | } | ||
| 1648 | |||
| 1649 | if (ap->flags & XFSMNT_GQUOTA) { | ||
| 1650 | mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); | ||
| 1651 | if (ap->flags & XFSMNT_GQUOTAENF) | ||
| 1652 | mp->m_qflags |= XFS_OQUOTA_ENFD; | ||
| 1653 | } else if (ap->flags & XFSMNT_PQUOTA) { | ||
| 1654 | mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); | ||
| 1655 | if (ap->flags & XFSMNT_PQUOTAENF) | ||
| 1656 | mp->m_qflags |= XFS_OQUOTA_ENFD; | ||
| 1657 | } | ||
| 1658 | |||
| 1659 | return 0; | ||
| 1660 | } | ||
| 1661 | |||
| 1303 | STATIC int | 1662 | STATIC int |
| 1304 | xfs_fs_fill_super( | 1663 | xfs_fs_fill_super( |
| 1305 | struct super_block *sb, | 1664 | struct super_block *sb, |
| @@ -1308,11 +1667,21 @@ xfs_fs_fill_super( | |||
| 1308 | { | 1667 | { |
| 1309 | struct inode *root; | 1668 | struct inode *root; |
| 1310 | struct xfs_mount *mp = NULL; | 1669 | struct xfs_mount *mp = NULL; |
| 1311 | struct xfs_mount_args *args = xfs_args_allocate(sb, silent); | 1670 | struct xfs_mount_args *args; |
| 1312 | int error; | 1671 | int flags = 0, error = ENOMEM; |
| 1313 | 1672 | ||
| 1314 | mp = xfs_mount_init(); | 1673 | args = xfs_args_allocate(sb, silent); |
| 1674 | if (!args) | ||
| 1675 | return -ENOMEM; | ||
| 1315 | 1676 | ||
| 1677 | mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL); | ||
| 1678 | if (!mp) | ||
| 1679 | goto out_free_args; | ||
| 1680 | |||
| 1681 | spin_lock_init(&mp->m_sb_lock); | ||
| 1682 | mutex_init(&mp->m_ilock); | ||
| 1683 | mutex_init(&mp->m_growlock); | ||
| 1684 | atomic_set(&mp->m_active_trans, 0); | ||
| 1316 | INIT_LIST_HEAD(&mp->m_sync_list); | 1685 | INIT_LIST_HEAD(&mp->m_sync_list); |
| 1317 | spin_lock_init(&mp->m_sync_lock); | 1686 | spin_lock_init(&mp->m_sync_lock); |
| 1318 | init_waitqueue_head(&mp->m_wait_single_sync_task); | 1687 | init_waitqueue_head(&mp->m_wait_single_sync_task); |
| @@ -1325,16 +1694,60 @@ xfs_fs_fill_super( | |||
| 1325 | 1694 | ||
| 1326 | error = xfs_parseargs(mp, (char *)data, args, 0); | 1695 | error = xfs_parseargs(mp, (char *)data, args, 0); |
| 1327 | if (error) | 1696 | if (error) |
| 1328 | goto fail_vfsop; | 1697 | goto out_free_mp; |
| 1329 | 1698 | ||
| 1330 | sb_min_blocksize(sb, BBSIZE); | 1699 | sb_min_blocksize(sb, BBSIZE); |
| 1700 | sb->s_xattr = xfs_xattr_handlers; | ||
| 1331 | sb->s_export_op = &xfs_export_operations; | 1701 | sb->s_export_op = &xfs_export_operations; |
| 1332 | sb->s_qcop = &xfs_quotactl_operations; | 1702 | sb->s_qcop = &xfs_quotactl_operations; |
| 1333 | sb->s_op = &xfs_super_operations; | 1703 | sb->s_op = &xfs_super_operations; |
| 1334 | 1704 | ||
| 1335 | error = xfs_mount(mp, args, NULL); | 1705 | error = xfs_dmops_get(mp, args); |
| 1706 | if (error) | ||
| 1707 | goto out_free_mp; | ||
| 1708 | error = xfs_qmops_get(mp, args); | ||
| 1709 | if (error) | ||
| 1710 | goto out_put_dmops; | ||
| 1711 | |||
| 1712 | if (args->flags & XFSMNT_QUIET) | ||
| 1713 | flags |= XFS_MFSI_QUIET; | ||
| 1714 | |||
| 1715 | error = xfs_open_devices(mp, args); | ||
| 1716 | if (error) | ||
| 1717 | goto out_put_qmops; | ||
| 1718 | |||
| 1719 | if (xfs_icsb_init_counters(mp)) | ||
| 1720 | mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB; | ||
| 1721 | |||
| 1722 | /* | ||
| 1723 | * Setup flags based on mount(2) options and then the superblock | ||
| 1724 | */ | ||
| 1725 | error = xfs_start_flags(args, mp); | ||
| 1726 | if (error) | ||
| 1727 | goto out_free_fsname; | ||
| 1728 | error = xfs_readsb(mp, flags); | ||
| 1729 | if (error) | ||
| 1730 | goto out_free_fsname; | ||
| 1731 | error = xfs_finish_flags(args, mp); | ||
| 1336 | if (error) | 1732 | if (error) |
| 1337 | goto fail_vfsop; | 1733 | goto out_free_sb; |
| 1734 | |||
| 1735 | error = xfs_setup_devices(mp); | ||
| 1736 | if (error) | ||
| 1737 | goto out_free_sb; | ||
| 1738 | |||
| 1739 | if (mp->m_flags & XFS_MOUNT_BARRIER) | ||
| 1740 | xfs_mountfs_check_barriers(mp); | ||
| 1741 | |||
| 1742 | error = xfs_filestream_mount(mp); | ||
| 1743 | if (error) | ||
| 1744 | goto out_free_sb; | ||
| 1745 | |||
| 1746 | error = xfs_mountfs(mp); | ||
| 1747 | if (error) | ||
| 1748 | goto out_filestream_unmount; | ||
| 1749 | |||
| 1750 | XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, args->mtpt, args->fsname); | ||
| 1338 | 1751 | ||
| 1339 | sb->s_dirt = 1; | 1752 | sb->s_dirt = 1; |
| 1340 | sb->s_magic = XFS_SB_MAGIC; | 1753 | sb->s_magic = XFS_SB_MAGIC; |
| @@ -1344,7 +1757,7 @@ xfs_fs_fill_super( | |||
| 1344 | sb->s_time_gran = 1; | 1757 | sb->s_time_gran = 1; |
| 1345 | set_posix_acl_flag(sb); | 1758 | set_posix_acl_flag(sb); |
| 1346 | 1759 | ||
| 1347 | root = igrab(mp->m_rootip->i_vnode); | 1760 | root = igrab(VFS_I(mp->m_rootip)); |
| 1348 | if (!root) { | 1761 | if (!root) { |
| 1349 | error = ENOENT; | 1762 | error = ENOENT; |
| 1350 | goto fail_unmount; | 1763 | goto fail_unmount; |
| @@ -1369,10 +1782,28 @@ xfs_fs_fill_super( | |||
| 1369 | 1782 | ||
| 1370 | xfs_itrace_exit(XFS_I(sb->s_root->d_inode)); | 1783 | xfs_itrace_exit(XFS_I(sb->s_root->d_inode)); |
| 1371 | 1784 | ||
| 1372 | kmem_free(args, sizeof(*args)); | 1785 | kfree(args); |
| 1373 | return 0; | 1786 | return 0; |
| 1374 | 1787 | ||
| 1375 | fail_vnrele: | 1788 | out_filestream_unmount: |
| 1789 | xfs_filestream_unmount(mp); | ||
| 1790 | out_free_sb: | ||
| 1791 | xfs_freesb(mp); | ||
| 1792 | out_free_fsname: | ||
| 1793 | xfs_free_fsname(mp); | ||
| 1794 | xfs_icsb_destroy_counters(mp); | ||
| 1795 | xfs_close_devices(mp); | ||
| 1796 | out_put_qmops: | ||
| 1797 | xfs_qmops_put(mp); | ||
| 1798 | out_put_dmops: | ||
| 1799 | xfs_dmops_put(mp); | ||
| 1800 | out_free_mp: | ||
| 1801 | kfree(mp); | ||
| 1802 | out_free_args: | ||
| 1803 | kfree(args); | ||
| 1804 | return -error; | ||
| 1805 | |||
| 1806 | fail_vnrele: | ||
| 1376 | if (sb->s_root) { | 1807 | if (sb->s_root) { |
| 1377 | dput(sb->s_root); | 1808 | dput(sb->s_root); |
| 1378 | sb->s_root = NULL; | 1809 | sb->s_root = NULL; |
| @@ -1380,12 +1811,20 @@ fail_vnrele: | |||
| 1380 | iput(root); | 1811 | iput(root); |
| 1381 | } | 1812 | } |
| 1382 | 1813 | ||
| 1383 | fail_unmount: | 1814 | fail_unmount: |
| 1384 | xfs_unmount(mp, 0, NULL); | 1815 | /* |
| 1816 | * Blow away any referenced inode in the filestreams cache. | ||
| 1817 | * This can and will cause log traffic as inodes go inactive | ||
| 1818 | * here. | ||
| 1819 | */ | ||
| 1820 | xfs_filestream_unmount(mp); | ||
| 1385 | 1821 | ||
| 1386 | fail_vfsop: | 1822 | XFS_bflush(mp->m_ddev_targp); |
| 1387 | kmem_free(args, sizeof(*args)); | 1823 | error = xfs_unmount_flush(mp, 0); |
| 1388 | return -error; | 1824 | WARN_ON(error); |
| 1825 | |||
| 1826 | xfs_unmountfs(mp); | ||
| 1827 | goto out_free_sb; | ||
| 1389 | } | 1828 | } |
| 1390 | 1829 | ||
| 1391 | STATIC int | 1830 | STATIC int |
| @@ -1430,9 +1869,235 @@ static struct file_system_type xfs_fs_type = { | |||
| 1430 | .fs_flags = FS_REQUIRES_DEV, | 1869 | .fs_flags = FS_REQUIRES_DEV, |
| 1431 | }; | 1870 | }; |
| 1432 | 1871 | ||
| 1872 | STATIC int __init | ||
| 1873 | xfs_alloc_trace_bufs(void) | ||
| 1874 | { | ||
| 1875 | #ifdef XFS_ALLOC_TRACE | ||
| 1876 | xfs_alloc_trace_buf = ktrace_alloc(XFS_ALLOC_TRACE_SIZE, KM_MAYFAIL); | ||
| 1877 | if (!xfs_alloc_trace_buf) | ||
| 1878 | goto out; | ||
| 1879 | #endif | ||
| 1880 | #ifdef XFS_BMAP_TRACE | ||
| 1881 | xfs_bmap_trace_buf = ktrace_alloc(XFS_BMAP_TRACE_SIZE, KM_MAYFAIL); | ||
| 1882 | if (!xfs_bmap_trace_buf) | ||
| 1883 | goto out_free_alloc_trace; | ||
| 1884 | #endif | ||
| 1885 | #ifdef XFS_BMBT_TRACE | ||
| 1886 | xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_MAYFAIL); | ||
| 1887 | if (!xfs_bmbt_trace_buf) | ||
| 1888 | goto out_free_bmap_trace; | ||
| 1889 | #endif | ||
| 1890 | #ifdef XFS_ATTR_TRACE | ||
| 1891 | xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_MAYFAIL); | ||
| 1892 | if (!xfs_attr_trace_buf) | ||
| 1893 | goto out_free_bmbt_trace; | ||
| 1894 | #endif | ||
| 1895 | #ifdef XFS_DIR2_TRACE | ||
| 1896 | xfs_dir2_trace_buf = ktrace_alloc(XFS_DIR2_GTRACE_SIZE, KM_MAYFAIL); | ||
| 1897 | if (!xfs_dir2_trace_buf) | ||
| 1898 | goto out_free_attr_trace; | ||
| 1899 | #endif | ||
| 1900 | |||
| 1901 | return 0; | ||
| 1902 | |||
| 1903 | #ifdef XFS_DIR2_TRACE | ||
| 1904 | out_free_attr_trace: | ||
| 1905 | #endif | ||
| 1906 | #ifdef XFS_ATTR_TRACE | ||
| 1907 | ktrace_free(xfs_attr_trace_buf); | ||
| 1908 | out_free_bmbt_trace: | ||
| 1909 | #endif | ||
| 1910 | #ifdef XFS_BMBT_TRACE | ||
| 1911 | ktrace_free(xfs_bmbt_trace_buf); | ||
| 1912 | out_free_bmap_trace: | ||
| 1913 | #endif | ||
| 1914 | #ifdef XFS_BMAP_TRACE | ||
| 1915 | ktrace_free(xfs_bmap_trace_buf); | ||
| 1916 | out_free_alloc_trace: | ||
| 1917 | #endif | ||
| 1918 | #ifdef XFS_ALLOC_TRACE | ||
| 1919 | ktrace_free(xfs_alloc_trace_buf); | ||
| 1920 | out: | ||
| 1921 | #endif | ||
| 1922 | return -ENOMEM; | ||
| 1923 | } | ||
| 1924 | |||
| 1925 | STATIC void | ||
| 1926 | xfs_free_trace_bufs(void) | ||
| 1927 | { | ||
| 1928 | #ifdef XFS_DIR2_TRACE | ||
| 1929 | ktrace_free(xfs_dir2_trace_buf); | ||
| 1930 | #endif | ||
| 1931 | #ifdef XFS_ATTR_TRACE | ||
| 1932 | ktrace_free(xfs_attr_trace_buf); | ||
| 1933 | #endif | ||
| 1934 | #ifdef XFS_BMBT_TRACE | ||
| 1935 | ktrace_free(xfs_bmbt_trace_buf); | ||
| 1936 | #endif | ||
| 1937 | #ifdef XFS_BMAP_TRACE | ||
| 1938 | ktrace_free(xfs_bmap_trace_buf); | ||
| 1939 | #endif | ||
| 1940 | #ifdef XFS_ALLOC_TRACE | ||
| 1941 | ktrace_free(xfs_alloc_trace_buf); | ||
| 1942 | #endif | ||
| 1943 | } | ||
| 1944 | |||
| 1945 | STATIC int __init | ||
| 1946 | xfs_init_zones(void) | ||
| 1947 | { | ||
| 1948 | xfs_vnode_zone = kmem_zone_init_flags(sizeof(struct inode), "xfs_vnode", | ||
| 1949 | KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | | ||
| 1950 | KM_ZONE_SPREAD, | ||
| 1951 | xfs_fs_inode_init_once); | ||
| 1952 | if (!xfs_vnode_zone) | ||
| 1953 | goto out; | ||
| 1954 | |||
| 1955 | xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend"); | ||
| 1956 | if (!xfs_ioend_zone) | ||
| 1957 | goto out_destroy_vnode_zone; | ||
| 1958 | |||
| 1959 | xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE, | ||
| 1960 | xfs_ioend_zone); | ||
| 1961 | if (!xfs_ioend_pool) | ||
| 1962 | goto out_destroy_ioend_zone; | ||
| 1963 | |||
| 1964 | xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t), | ||
| 1965 | "xfs_log_ticket"); | ||
| 1966 | if (!xfs_log_ticket_zone) | ||
| 1967 | goto out_destroy_ioend_pool; | ||
| 1968 | |||
| 1969 | xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t), | ||
| 1970 | "xfs_bmap_free_item"); | ||
| 1971 | if (!xfs_bmap_free_item_zone) | ||
| 1972 | goto out_destroy_log_ticket_zone; | ||
| 1973 | xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t), | ||
| 1974 | "xfs_btree_cur"); | ||
| 1975 | if (!xfs_btree_cur_zone) | ||
| 1976 | goto out_destroy_bmap_free_item_zone; | ||
| 1977 | |||
| 1978 | xfs_da_state_zone = kmem_zone_init(sizeof(xfs_da_state_t), | ||
| 1979 | "xfs_da_state"); | ||
| 1980 | if (!xfs_da_state_zone) | ||
| 1981 | goto out_destroy_btree_cur_zone; | ||
| 1982 | |||
| 1983 | xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf"); | ||
| 1984 | if (!xfs_dabuf_zone) | ||
| 1985 | goto out_destroy_da_state_zone; | ||
| 1986 | |||
| 1987 | xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork"); | ||
| 1988 | if (!xfs_ifork_zone) | ||
| 1989 | goto out_destroy_dabuf_zone; | ||
| 1990 | |||
| 1991 | xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans"); | ||
| 1992 | if (!xfs_trans_zone) | ||
| 1993 | goto out_destroy_ifork_zone; | ||
| 1994 | |||
| 1995 | /* | ||
| 1996 | * The size of the zone allocated buf log item is the maximum | ||
| 1997 | * size possible under XFS. This wastes a little bit of memory, | ||
| 1998 | * but it is much faster. | ||
| 1999 | */ | ||
| 2000 | xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) + | ||
| 2001 | (((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) / | ||
| 2002 | NBWORD) * sizeof(int))), "xfs_buf_item"); | ||
| 2003 | if (!xfs_buf_item_zone) | ||
| 2004 | goto out_destroy_trans_zone; | ||
| 2005 | |||
| 2006 | xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) + | ||
| 2007 | ((XFS_EFD_MAX_FAST_EXTENTS - 1) * | ||
| 2008 | sizeof(xfs_extent_t))), "xfs_efd_item"); | ||
| 2009 | if (!xfs_efd_zone) | ||
| 2010 | goto out_destroy_buf_item_zone; | ||
| 2011 | |||
| 2012 | xfs_efi_zone = kmem_zone_init((sizeof(xfs_efi_log_item_t) + | ||
| 2013 | ((XFS_EFI_MAX_FAST_EXTENTS - 1) * | ||
| 2014 | sizeof(xfs_extent_t))), "xfs_efi_item"); | ||
| 2015 | if (!xfs_efi_zone) | ||
| 2016 | goto out_destroy_efd_zone; | ||
| 2017 | |||
| 2018 | xfs_inode_zone = | ||
| 2019 | kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode", | ||
| 2020 | KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | | ||
| 2021 | KM_ZONE_SPREAD, NULL); | ||
| 2022 | if (!xfs_inode_zone) | ||
| 2023 | goto out_destroy_efi_zone; | ||
| 2024 | |||
| 2025 | xfs_ili_zone = | ||
| 2026 | kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili", | ||
| 2027 | KM_ZONE_SPREAD, NULL); | ||
| 2028 | if (!xfs_ili_zone) | ||
| 2029 | goto out_destroy_inode_zone; | ||
| 2030 | |||
| 2031 | #ifdef CONFIG_XFS_POSIX_ACL | ||
| 2032 | xfs_acl_zone = kmem_zone_init(sizeof(xfs_acl_t), "xfs_acl"); | ||
| 2033 | if (!xfs_acl_zone) | ||
| 2034 | goto out_destroy_ili_zone; | ||
| 2035 | #endif | ||
| 2036 | |||
| 2037 | return 0; | ||
| 2038 | |||
| 2039 | #ifdef CONFIG_XFS_POSIX_ACL | ||
| 2040 | out_destroy_ili_zone: | ||
| 2041 | #endif | ||
| 2042 | kmem_zone_destroy(xfs_ili_zone); | ||
| 2043 | out_destroy_inode_zone: | ||
| 2044 | kmem_zone_destroy(xfs_inode_zone); | ||
| 2045 | out_destroy_efi_zone: | ||
| 2046 | kmem_zone_destroy(xfs_efi_zone); | ||
| 2047 | out_destroy_efd_zone: | ||
| 2048 | kmem_zone_destroy(xfs_efd_zone); | ||
| 2049 | out_destroy_buf_item_zone: | ||
| 2050 | kmem_zone_destroy(xfs_buf_item_zone); | ||
| 2051 | out_destroy_trans_zone: | ||
| 2052 | kmem_zone_destroy(xfs_trans_zone); | ||
| 2053 | out_destroy_ifork_zone: | ||
| 2054 | kmem_zone_destroy(xfs_ifork_zone); | ||
| 2055 | out_destroy_dabuf_zone: | ||
| 2056 | kmem_zone_destroy(xfs_dabuf_zone); | ||
| 2057 | out_destroy_da_state_zone: | ||
| 2058 | kmem_zone_destroy(xfs_da_state_zone); | ||
| 2059 | out_destroy_btree_cur_zone: | ||
| 2060 | kmem_zone_destroy(xfs_btree_cur_zone); | ||
| 2061 | out_destroy_bmap_free_item_zone: | ||
| 2062 | kmem_zone_destroy(xfs_bmap_free_item_zone); | ||
| 2063 | out_destroy_log_ticket_zone: | ||
| 2064 | kmem_zone_destroy(xfs_log_ticket_zone); | ||
| 2065 | out_destroy_ioend_pool: | ||
| 2066 | mempool_destroy(xfs_ioend_pool); | ||
| 2067 | out_destroy_ioend_zone: | ||
| 2068 | kmem_zone_destroy(xfs_ioend_zone); | ||
| 2069 | out_destroy_vnode_zone: | ||
| 2070 | kmem_zone_destroy(xfs_vnode_zone); | ||
| 2071 | out: | ||
| 2072 | return -ENOMEM; | ||
| 2073 | } | ||
| 2074 | |||
| 2075 | STATIC void | ||
| 2076 | xfs_destroy_zones(void) | ||
| 2077 | { | ||
| 2078 | #ifdef CONFIG_XFS_POSIX_ACL | ||
| 2079 | kmem_zone_destroy(xfs_acl_zone); | ||
| 2080 | #endif | ||
| 2081 | kmem_zone_destroy(xfs_ili_zone); | ||
| 2082 | kmem_zone_destroy(xfs_inode_zone); | ||
| 2083 | kmem_zone_destroy(xfs_efi_zone); | ||
| 2084 | kmem_zone_destroy(xfs_efd_zone); | ||
| 2085 | kmem_zone_destroy(xfs_buf_item_zone); | ||
| 2086 | kmem_zone_destroy(xfs_trans_zone); | ||
| 2087 | kmem_zone_destroy(xfs_ifork_zone); | ||
| 2088 | kmem_zone_destroy(xfs_dabuf_zone); | ||
| 2089 | kmem_zone_destroy(xfs_da_state_zone); | ||
| 2090 | kmem_zone_destroy(xfs_btree_cur_zone); | ||
| 2091 | kmem_zone_destroy(xfs_bmap_free_item_zone); | ||
| 2092 | kmem_zone_destroy(xfs_log_ticket_zone); | ||
| 2093 | mempool_destroy(xfs_ioend_pool); | ||
| 2094 | kmem_zone_destroy(xfs_ioend_zone); | ||
| 2095 | kmem_zone_destroy(xfs_vnode_zone); | ||
| 2096 | |||
| 2097 | } | ||
| 1433 | 2098 | ||
| 1434 | STATIC int __init | 2099 | STATIC int __init |
| 1435 | init_xfs_fs( void ) | 2100 | init_xfs_fs(void) |
| 1436 | { | 2101 | { |
| 1437 | int error; | 2102 | int error; |
| 1438 | static char message[] __initdata = KERN_INFO \ | 2103 | static char message[] __initdata = KERN_INFO \ |
| @@ -1441,42 +2106,73 @@ init_xfs_fs( void ) | |||
| 1441 | printk(message); | 2106 | printk(message); |
| 1442 | 2107 | ||
| 1443 | ktrace_init(64); | 2108 | ktrace_init(64); |
| 2109 | vn_init(); | ||
| 2110 | xfs_dir_startup(); | ||
| 1444 | 2111 | ||
| 1445 | error = xfs_init_zones(); | 2112 | error = xfs_init_zones(); |
| 1446 | if (error < 0) | 2113 | if (error) |
| 1447 | goto undo_zones; | 2114 | goto out; |
| 2115 | |||
| 2116 | error = xfs_alloc_trace_bufs(); | ||
| 2117 | if (error) | ||
| 2118 | goto out_destroy_zones; | ||
| 2119 | |||
| 2120 | error = xfs_mru_cache_init(); | ||
| 2121 | if (error) | ||
| 2122 | goto out_free_trace_buffers; | ||
| 2123 | |||
| 2124 | error = xfs_filestream_init(); | ||
| 2125 | if (error) | ||
| 2126 | goto out_mru_cache_uninit; | ||
| 1448 | 2127 | ||
| 1449 | error = xfs_buf_init(); | 2128 | error = xfs_buf_init(); |
| 1450 | if (error < 0) | 2129 | if (error) |
| 1451 | goto undo_buffers; | 2130 | goto out_filestream_uninit; |
| 2131 | |||
| 2132 | error = xfs_init_procfs(); | ||
| 2133 | if (error) | ||
| 2134 | goto out_buf_terminate; | ||
| 2135 | |||
| 2136 | error = xfs_sysctl_register(); | ||
| 2137 | if (error) | ||
| 2138 | goto out_cleanup_procfs; | ||
| 1452 | 2139 | ||
| 1453 | vn_init(); | ||
| 1454 | xfs_init(); | ||
| 1455 | uuid_init(); | ||
| 1456 | vfs_initquota(); | 2140 | vfs_initquota(); |
| 1457 | 2141 | ||
| 1458 | error = register_filesystem(&xfs_fs_type); | 2142 | error = register_filesystem(&xfs_fs_type); |
| 1459 | if (error) | 2143 | if (error) |
| 1460 | goto undo_register; | 2144 | goto out_sysctl_unregister; |
| 1461 | return 0; | 2145 | return 0; |
| 1462 | 2146 | ||
| 1463 | undo_register: | 2147 | out_sysctl_unregister: |
| 2148 | xfs_sysctl_unregister(); | ||
| 2149 | out_cleanup_procfs: | ||
| 2150 | xfs_cleanup_procfs(); | ||
| 2151 | out_buf_terminate: | ||
| 1464 | xfs_buf_terminate(); | 2152 | xfs_buf_terminate(); |
| 1465 | 2153 | out_filestream_uninit: | |
| 1466 | undo_buffers: | 2154 | xfs_filestream_uninit(); |
| 2155 | out_mru_cache_uninit: | ||
| 2156 | xfs_mru_cache_uninit(); | ||
| 2157 | out_free_trace_buffers: | ||
| 2158 | xfs_free_trace_bufs(); | ||
| 2159 | out_destroy_zones: | ||
| 1467 | xfs_destroy_zones(); | 2160 | xfs_destroy_zones(); |
| 1468 | 2161 | out: | |
| 1469 | undo_zones: | ||
| 1470 | return error; | 2162 | return error; |
| 1471 | } | 2163 | } |
| 1472 | 2164 | ||
| 1473 | STATIC void __exit | 2165 | STATIC void __exit |
| 1474 | exit_xfs_fs( void ) | 2166 | exit_xfs_fs(void) |
| 1475 | { | 2167 | { |
| 1476 | vfs_exitquota(); | 2168 | vfs_exitquota(); |
| 1477 | unregister_filesystem(&xfs_fs_type); | 2169 | unregister_filesystem(&xfs_fs_type); |
| 1478 | xfs_cleanup(); | 2170 | xfs_sysctl_unregister(); |
| 2171 | xfs_cleanup_procfs(); | ||
| 1479 | xfs_buf_terminate(); | 2172 | xfs_buf_terminate(); |
| 2173 | xfs_filestream_uninit(); | ||
| 2174 | xfs_mru_cache_uninit(); | ||
| 2175 | xfs_free_trace_bufs(); | ||
| 1480 | xfs_destroy_zones(); | 2176 | xfs_destroy_zones(); |
| 1481 | ktrace_uninit(); | 2177 | ktrace_uninit(); |
| 1482 | } | 2178 | } |
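A note on the error-handling shape of the new xfs_init_zones()/init_xfs_fs(): both now follow the kernel's stacked-goto unwind idiom, where every setup step gets a matching out_* label and a failure jumps to the label that tears down only what has already been built, in reverse order. A minimal, self-contained sketch of the idiom, assuming hypothetical resources a, b and c (the names stand in for the zones, trace buffers, MRU cache and so on above):

	/* Hypothetical setup/teardown steps; only the unwind shape matters. */
	static int setup_a(void) { return 0; }
	static void teardown_a(void) { }
	static int setup_b(void) { return 0; }
	static void teardown_b(void) { }
	static int setup_c(void) { return 0; }

	static int example_init(void)
	{
		int error;

		error = setup_a();
		if (error)
			goto out;
		error = setup_b();
		if (error)
			goto out_teardown_a;
		error = setup_c();
		if (error)
			goto out_teardown_b;
		return 0;			/* everything initialized */

	 out_teardown_b:
		teardown_b();			/* undo in reverse order */
	 out_teardown_a:
		teardown_a();
	 out:
		return error;
	}

The same shape is why exit_xfs_fs() now reads as the mirror image of the tail of init_xfs_fs().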
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h index 3efb7c6d3303..fe2ef4e6a0f9 100644 --- a/fs/xfs/linux-2.6/xfs_super.h +++ b/fs/xfs/linux-2.6/xfs_super.h | |||
| @@ -101,18 +101,13 @@ struct block_device; | |||
| 101 | 101 | ||
| 102 | extern __uint64_t xfs_max_file_offset(unsigned int); | 102 | extern __uint64_t xfs_max_file_offset(unsigned int); |
| 103 | 103 | ||
| 104 | extern void xfs_initialize_vnode(struct xfs_mount *mp, bhv_vnode_t *vp, | ||
| 105 | struct xfs_inode *ip); | ||
| 106 | |||
| 107 | extern void xfs_flush_inode(struct xfs_inode *); | 104 | extern void xfs_flush_inode(struct xfs_inode *); |
| 108 | extern void xfs_flush_device(struct xfs_inode *); | 105 | extern void xfs_flush_device(struct xfs_inode *); |
| 109 | 106 | ||
| 110 | extern int xfs_blkdev_get(struct xfs_mount *, const char *, | ||
| 111 | struct block_device **); | ||
| 112 | extern void xfs_blkdev_put(struct block_device *); | ||
| 113 | extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); | 107 | extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); |
| 114 | 108 | ||
| 115 | extern const struct export_operations xfs_export_operations; | 109 | extern const struct export_operations xfs_export_operations; |
| 110 | extern struct xattr_handler *xfs_xattr_handlers[]; | ||
| 116 | 111 | ||
| 117 | #define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) | 112 | #define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) |
| 118 | 113 | ||
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c index bb997d75c05c..7dacb5bbde3f 100644 --- a/fs/xfs/linux-2.6/xfs_sysctl.c +++ b/fs/xfs/linux-2.6/xfs_sysctl.c | |||
| @@ -259,15 +259,17 @@ static ctl_table xfs_root_table[] = { | |||
| 259 | {} | 259 | {} |
| 260 | }; | 260 | }; |
| 261 | 261 | ||
| 262 | void | 262 | int |
| 263 | xfs_sysctl_register(void) | 263 | xfs_sysctl_register(void) |
| 264 | { | 264 | { |
| 265 | xfs_table_header = register_sysctl_table(xfs_root_table); | 265 | xfs_table_header = register_sysctl_table(xfs_root_table); |
| 266 | if (!xfs_table_header) | ||
| 267 | return -ENOMEM; | ||
| 268 | return 0; | ||
| 266 | } | 269 | } |
| 267 | 270 | ||
| 268 | void | 271 | void |
| 269 | xfs_sysctl_unregister(void) | 272 | xfs_sysctl_unregister(void) |
| 270 | { | 273 | { |
| 271 | if (xfs_table_header) | 274 | unregister_sysctl_table(xfs_table_header); |
| 272 | unregister_sysctl_table(xfs_table_header); | ||
| 273 | } | 275 | } |
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/linux-2.6/xfs_sysctl.h index 98b97e399d6f..4aadb8056c37 100644 --- a/fs/xfs/linux-2.6/xfs_sysctl.h +++ b/fs/xfs/linux-2.6/xfs_sysctl.h | |||
| @@ -93,10 +93,10 @@ enum { | |||
| 93 | extern xfs_param_t xfs_params; | 93 | extern xfs_param_t xfs_params; |
| 94 | 94 | ||
| 95 | #ifdef CONFIG_SYSCTL | 95 | #ifdef CONFIG_SYSCTL |
| 96 | extern void xfs_sysctl_register(void); | 96 | extern int xfs_sysctl_register(void); |
| 97 | extern void xfs_sysctl_unregister(void); | 97 | extern void xfs_sysctl_unregister(void); |
| 98 | #else | 98 | #else |
| 99 | # define xfs_sysctl_register() do { } while (0) | 99 | # define xfs_sysctl_register() (0) |
| 100 | # define xfs_sysctl_unregister() do { } while (0) | 100 | # define xfs_sysctl_unregister() do { } while (0) |
| 101 | #endif /* CONFIG_SYSCTL */ | 101 | #endif /* CONFIG_SYSCTL */ |
| 102 | 102 | ||
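xfs_sysctl_register() now reports registration failure, and the !CONFIG_SYSCTL stub changes from an empty statement to the constant (0), so a caller can test the return value unconditionally and the compiler simply folds the check away when sysctl support is compiled out. A hedged sketch of such a caller (example_caller is illustrative; the declarations mirror the hunk above):

	#ifdef CONFIG_SYSCTL
	extern int xfs_sysctl_register(void);
	#else
	# define xfs_sysctl_register()	(0)	/* always "succeeds" when sysctl is compiled out */
	#endif

	static int example_caller(void)
	{
		int error;

		error = xfs_sysctl_register();	/* a valid expression in either configuration */
		if (error)
			return error;
		return 0;
	}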
diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c index bc7afe007338..b52528bbbfff 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.c +++ b/fs/xfs/linux-2.6/xfs_vnode.c | |||
| @@ -33,7 +33,7 @@ | |||
| 33 | 33 | ||
| 34 | 34 | ||
| 35 | /* | 35 | /* |
| 36 | * Dedicated vnode inactive/reclaim sync semaphores. | 36 | * Dedicated vnode inactive/reclaim sync wait queues. |
| 37 | * Prime number of hash buckets since address is used as the key. | 37 | * Prime number of hash buckets since address is used as the key. |
| 38 | */ | 38 | */ |
| 39 | #define NVSYNC 37 | 39 | #define NVSYNC 37 |
| @@ -82,74 +82,6 @@ vn_ioerror( | |||
| 82 | xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ, f, l); | 82 | xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ, f, l); |
| 83 | } | 83 | } |
| 84 | 84 | ||
| 85 | /* | ||
| 86 | * Revalidate the Linux inode from the XFS inode. | ||
| 87 | * Note: i_size _not_ updated; we must hold the inode | ||
| 88 | * semaphore when doing that - callers responsibility. | ||
| 89 | */ | ||
| 90 | int | ||
| 91 | vn_revalidate( | ||
| 92 | bhv_vnode_t *vp) | ||
| 93 | { | ||
| 94 | struct inode *inode = vn_to_inode(vp); | ||
| 95 | struct xfs_inode *ip = XFS_I(inode); | ||
| 96 | struct xfs_mount *mp = ip->i_mount; | ||
| 97 | unsigned long xflags; | ||
| 98 | |||
| 99 | xfs_itrace_entry(ip); | ||
| 100 | |||
| 101 | if (XFS_FORCED_SHUTDOWN(mp)) | ||
| 102 | return -EIO; | ||
| 103 | |||
| 104 | xfs_ilock(ip, XFS_ILOCK_SHARED); | ||
| 105 | inode->i_mode = ip->i_d.di_mode; | ||
| 106 | inode->i_uid = ip->i_d.di_uid; | ||
| 107 | inode->i_gid = ip->i_d.di_gid; | ||
| 108 | inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec; | ||
| 109 | inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; | ||
| 110 | inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec; | ||
| 111 | inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; | ||
| 112 | |||
| 113 | xflags = xfs_ip2xflags(ip); | ||
| 114 | if (xflags & XFS_XFLAG_IMMUTABLE) | ||
| 115 | inode->i_flags |= S_IMMUTABLE; | ||
| 116 | else | ||
| 117 | inode->i_flags &= ~S_IMMUTABLE; | ||
| 118 | if (xflags & XFS_XFLAG_APPEND) | ||
| 119 | inode->i_flags |= S_APPEND; | ||
| 120 | else | ||
| 121 | inode->i_flags &= ~S_APPEND; | ||
| 122 | if (xflags & XFS_XFLAG_SYNC) | ||
| 123 | inode->i_flags |= S_SYNC; | ||
| 124 | else | ||
| 125 | inode->i_flags &= ~S_SYNC; | ||
| 126 | if (xflags & XFS_XFLAG_NOATIME) | ||
| 127 | inode->i_flags |= S_NOATIME; | ||
| 128 | else | ||
| 129 | inode->i_flags &= ~S_NOATIME; | ||
| 130 | xfs_iunlock(ip, XFS_ILOCK_SHARED); | ||
| 131 | |||
| 132 | xfs_iflags_clear(ip, XFS_IMODIFIED); | ||
| 133 | return 0; | ||
| 134 | } | ||
| 135 | |||
| 136 | /* | ||
| 137 | * Add a reference to a referenced vnode. | ||
| 138 | */ | ||
| 139 | bhv_vnode_t * | ||
| 140 | vn_hold( | ||
| 141 | bhv_vnode_t *vp) | ||
| 142 | { | ||
| 143 | struct inode *inode; | ||
| 144 | |||
| 145 | XFS_STATS_INC(vn_hold); | ||
| 146 | |||
| 147 | inode = igrab(vn_to_inode(vp)); | ||
| 148 | ASSERT(inode); | ||
| 149 | |||
| 150 | return vp; | ||
| 151 | } | ||
| 152 | |||
| 153 | #ifdef XFS_INODE_TRACE | 85 | #ifdef XFS_INODE_TRACE |
| 154 | 86 | ||
| 155 | /* | 87 | /* |
| @@ -158,7 +90,7 @@ vn_hold( | |||
| 158 | */ | 90 | */ |
| 159 | static inline int xfs_icount(struct xfs_inode *ip) | 91 | static inline int xfs_icount(struct xfs_inode *ip) |
| 160 | { | 92 | { |
| 161 | bhv_vnode_t *vp = XFS_ITOV_NULL(ip); | 93 | struct inode *vp = VFS_I(ip); |
| 162 | 94 | ||
| 163 | if (vp) | 95 | if (vp) |
| 164 | return vn_count(vp); | 96 | return vn_count(vp); |
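The retained comment above describes NVSYNC as a prime number of hash buckets for the inactive/reclaim wait queues, keyed by address. A sketch of what such address-keyed bucket selection typically looks like; the vptosync() helper here is an assumption for illustration, only NVSYNC itself appears in the hunk:

	#include <linux/wait.h>

	#define NVSYNC	37				/* prime bucket count */

	static wait_queue_head_t vsync[NVSYNC];

	/*
	 * Hash the inode address into a bucket.  A prime bucket count spreads
	 * kernel pointers, which tend to share low-order alignment bits, more
	 * evenly across the small table.
	 */
	static inline wait_queue_head_t *vptosync(void *v)
	{
		return &vsync[((unsigned long)v) % NVSYNC];
	}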
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index 25eb2a9e8d9b..683ce16210ff 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h | |||
| @@ -19,24 +19,9 @@ | |||
| 19 | #define __XFS_VNODE_H__ | 19 | #define __XFS_VNODE_H__ |
| 20 | 20 | ||
| 21 | struct file; | 21 | struct file; |
| 22 | struct bhv_vattr; | ||
| 23 | struct xfs_iomap; | 22 | struct xfs_iomap; |
| 24 | struct attrlist_cursor_kern; | 23 | struct attrlist_cursor_kern; |
| 25 | 24 | ||
| 26 | typedef struct inode bhv_vnode_t; | ||
| 27 | |||
| 28 | /* | ||
| 29 | * Vnode to Linux inode mapping. | ||
| 30 | */ | ||
| 31 | static inline bhv_vnode_t *vn_from_inode(struct inode *inode) | ||
| 32 | { | ||
| 33 | return inode; | ||
| 34 | } | ||
| 35 | static inline struct inode *vn_to_inode(bhv_vnode_t *vnode) | ||
| 36 | { | ||
| 37 | return vnode; | ||
| 38 | } | ||
| 39 | |||
| 40 | /* | 25 | /* |
| 41 | * Return values for xfs_inactive. A return value of | 26 | * Return values for xfs_inactive. A return value of |
| 42 | * VN_INACTIVE_NOCACHE implies that the file system behavior | 27 | * VN_INACTIVE_NOCACHE implies that the file system behavior |
| @@ -66,87 +51,8 @@ static inline struct inode *vn_to_inode(bhv_vnode_t *vnode) | |||
| 66 | Prevent VM access to the pages until | 51 | Prevent VM access to the pages until |
| 67 | the operation completes. */ | 52 | the operation completes. */ |
| 68 | 53 | ||
| 69 | /* | ||
| 70 | * Vnode attributes. va_mask indicates those attributes the caller | ||
| 71 | * wants to set or extract. | ||
| 72 | */ | ||
| 73 | typedef struct bhv_vattr { | ||
| 74 | int va_mask; /* bit-mask of attributes present */ | ||
| 75 | mode_t va_mode; /* file access mode and type */ | ||
| 76 | xfs_nlink_t va_nlink; /* number of references to file */ | ||
| 77 | uid_t va_uid; /* owner user id */ | ||
| 78 | gid_t va_gid; /* owner group id */ | ||
| 79 | xfs_ino_t va_nodeid; /* file id */ | ||
| 80 | xfs_off_t va_size; /* file size in bytes */ | ||
| 81 | u_long va_blocksize; /* blocksize preferred for i/o */ | ||
| 82 | struct timespec va_atime; /* time of last access */ | ||
| 83 | struct timespec va_mtime; /* time of last modification */ | ||
| 84 | struct timespec va_ctime; /* time file changed */ | ||
| 85 | u_int va_gen; /* generation number of file */ | ||
| 86 | xfs_dev_t va_rdev; /* device the special file represents */ | ||
| 87 | __int64_t va_nblocks; /* number of blocks allocated */ | ||
| 88 | u_long va_xflags; /* random extended file flags */ | ||
| 89 | u_long va_extsize; /* file extent size */ | ||
| 90 | u_long va_nextents; /* number of extents in file */ | ||
| 91 | u_long va_anextents; /* number of attr extents in file */ | ||
| 92 | prid_t va_projid; /* project id */ | ||
| 93 | } bhv_vattr_t; | ||
| 94 | |||
| 95 | /* | ||
| 96 | * setattr or getattr attributes | ||
| 97 | */ | ||
| 98 | #define XFS_AT_TYPE 0x00000001 | ||
| 99 | #define XFS_AT_MODE 0x00000002 | ||
| 100 | #define XFS_AT_UID 0x00000004 | ||
| 101 | #define XFS_AT_GID 0x00000008 | ||
| 102 | #define XFS_AT_FSID 0x00000010 | ||
| 103 | #define XFS_AT_NODEID 0x00000020 | ||
| 104 | #define XFS_AT_NLINK 0x00000040 | ||
| 105 | #define XFS_AT_SIZE 0x00000080 | ||
| 106 | #define XFS_AT_ATIME 0x00000100 | ||
| 107 | #define XFS_AT_MTIME 0x00000200 | ||
| 108 | #define XFS_AT_CTIME 0x00000400 | ||
| 109 | #define XFS_AT_RDEV 0x00000800 | ||
| 110 | #define XFS_AT_BLKSIZE 0x00001000 | ||
| 111 | #define XFS_AT_NBLOCKS 0x00002000 | ||
| 112 | #define XFS_AT_VCODE 0x00004000 | ||
| 113 | #define XFS_AT_MAC 0x00008000 | ||
| 114 | #define XFS_AT_UPDATIME 0x00010000 | ||
| 115 | #define XFS_AT_UPDMTIME 0x00020000 | ||
| 116 | #define XFS_AT_UPDCTIME 0x00040000 | ||
| 117 | #define XFS_AT_ACL 0x00080000 | ||
| 118 | #define XFS_AT_CAP 0x00100000 | ||
| 119 | #define XFS_AT_INF 0x00200000 | ||
| 120 | #define XFS_AT_XFLAGS 0x00400000 | ||
| 121 | #define XFS_AT_EXTSIZE 0x00800000 | ||
| 122 | #define XFS_AT_NEXTENTS 0x01000000 | ||
| 123 | #define XFS_AT_ANEXTENTS 0x02000000 | ||
| 124 | #define XFS_AT_PROJID 0x04000000 | ||
| 125 | #define XFS_AT_SIZE_NOPERM 0x08000000 | ||
| 126 | #define XFS_AT_GENCOUNT 0x10000000 | ||
| 127 | |||
| 128 | #define XFS_AT_ALL (XFS_AT_TYPE|XFS_AT_MODE|XFS_AT_UID|XFS_AT_GID|\ | ||
| 129 | XFS_AT_FSID|XFS_AT_NODEID|XFS_AT_NLINK|XFS_AT_SIZE|\ | ||
| 130 | XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME|XFS_AT_RDEV|\ | ||
| 131 | XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_VCODE|XFS_AT_MAC|\ | ||
| 132 | XFS_AT_ACL|XFS_AT_CAP|XFS_AT_INF|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|\ | ||
| 133 | XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|XFS_AT_PROJID|XFS_AT_GENCOUNT) | ||
| 134 | |||
| 135 | #define XFS_AT_STAT (XFS_AT_TYPE|XFS_AT_MODE|XFS_AT_UID|XFS_AT_GID|\ | ||
| 136 | XFS_AT_FSID|XFS_AT_NODEID|XFS_AT_NLINK|XFS_AT_SIZE|\ | ||
| 137 | XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME|XFS_AT_RDEV|\ | ||
| 138 | XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_PROJID) | ||
| 139 | |||
| 140 | #define XFS_AT_TIMES (XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME) | ||
| 141 | |||
| 142 | #define XFS_AT_UPDTIMES (XFS_AT_UPDATIME|XFS_AT_UPDMTIME|XFS_AT_UPDCTIME) | ||
| 143 | |||
| 144 | #define XFS_AT_NOSET (XFS_AT_NLINK|XFS_AT_RDEV|XFS_AT_FSID|XFS_AT_NODEID|\ | ||
| 145 | XFS_AT_TYPE|XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_VCODE|\ | ||
| 146 | XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|XFS_AT_GENCOUNT) | ||
| 147 | 54 | ||
| 148 | extern void vn_init(void); | 55 | extern void vn_init(void); |
| 149 | extern int vn_revalidate(bhv_vnode_t *); | ||
| 150 | 56 | ||
| 151 | /* | 57 | /* |
| 152 | * Yeah, these don't take vnode anymore at all, all this should be | 58 | * Yeah, these don't take vnode anymore at all, all this should be |
| @@ -156,57 +62,52 @@ extern void vn_iowait(struct xfs_inode *ip); | |||
| 156 | extern void vn_iowake(struct xfs_inode *ip); | 62 | extern void vn_iowake(struct xfs_inode *ip); |
| 157 | extern void vn_ioerror(struct xfs_inode *ip, int error, char *f, int l); | 63 | extern void vn_ioerror(struct xfs_inode *ip, int error, char *f, int l); |
| 158 | 64 | ||
| 159 | static inline int vn_count(bhv_vnode_t *vp) | 65 | static inline int vn_count(struct inode *vp) |
| 160 | { | 66 | { |
| 161 | return atomic_read(&vn_to_inode(vp)->i_count); | 67 | return atomic_read(&vp->i_count); |
| 162 | } | 68 | } |
| 163 | 69 | ||
| 164 | /* | 70 | #define IHOLD(ip) \ |
| 165 | * Vnode reference counting functions (and macros for compatibility). | 71 | do { \ |
| 166 | */ | 72 | ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ |
| 167 | extern bhv_vnode_t *vn_hold(bhv_vnode_t *); | 73 | atomic_inc(&(VFS_I(ip)->i_count)); \ |
| 74 | xfs_itrace_hold((ip), __FILE__, __LINE__, (inst_t *)__return_address); \ | ||
| 75 | } while (0) | ||
| 168 | 76 | ||
| 169 | #if defined(XFS_INODE_TRACE) | 77 | #define IRELE(ip) \ |
| 170 | #define VN_HOLD(vp) \ | 78 | do { \ |
| 171 | ((void)vn_hold(vp), \ | 79 | xfs_itrace_rele((ip), __FILE__, __LINE__, (inst_t *)__return_address); \ |
| 172 | xfs_itrace_hold(xfs_vtoi(vp), __FILE__, __LINE__, (inst_t *)__return_address)) | 80 | iput(VFS_I(ip)); \ |
| 173 | #define VN_RELE(vp) \ | 81 | } while (0) |
| 174 | (xfs_itrace_rele(xfs_vtoi(vp), __FILE__, __LINE__, (inst_t *)__return_address), \ | ||
| 175 | iput(vn_to_inode(vp))) | ||
| 176 | #else | ||
| 177 | #define VN_HOLD(vp) ((void)vn_hold(vp)) | ||
| 178 | #define VN_RELE(vp) (iput(vn_to_inode(vp))) | ||
| 179 | #endif | ||
| 180 | 82 | ||
| 181 | static inline bhv_vnode_t *vn_grab(bhv_vnode_t *vp) | 83 | static inline struct inode *vn_grab(struct inode *vp) |
| 182 | { | 84 | { |
| 183 | struct inode *inode = igrab(vn_to_inode(vp)); | 85 | return igrab(vp); |
| 184 | return inode ? vn_from_inode(inode) : NULL; | ||
| 185 | } | 86 | } |
| 186 | 87 | ||
| 187 | /* | 88 | /* |
| 188 | * Dealing with bad inodes | 89 | * Dealing with bad inodes |
| 189 | */ | 90 | */ |
| 190 | static inline int VN_BAD(bhv_vnode_t *vp) | 91 | static inline int VN_BAD(struct inode *vp) |
| 191 | { | 92 | { |
| 192 | return is_bad_inode(vn_to_inode(vp)); | 93 | return is_bad_inode(vp); |
| 193 | } | 94 | } |
| 194 | 95 | ||
| 195 | /* | 96 | /* |
| 196 | * Extracting atime values in various formats | 97 | * Extracting atime values in various formats |
| 197 | */ | 98 | */ |
| 198 | static inline void vn_atime_to_bstime(bhv_vnode_t *vp, xfs_bstime_t *bs_atime) | 99 | static inline void vn_atime_to_bstime(struct inode *vp, xfs_bstime_t *bs_atime) |
| 199 | { | 100 | { |
| 200 | bs_atime->tv_sec = vp->i_atime.tv_sec; | 101 | bs_atime->tv_sec = vp->i_atime.tv_sec; |
| 201 | bs_atime->tv_nsec = vp->i_atime.tv_nsec; | 102 | bs_atime->tv_nsec = vp->i_atime.tv_nsec; |
| 202 | } | 103 | } |
| 203 | 104 | ||
| 204 | static inline void vn_atime_to_timespec(bhv_vnode_t *vp, struct timespec *ts) | 105 | static inline void vn_atime_to_timespec(struct inode *vp, struct timespec *ts) |
| 205 | { | 106 | { |
| 206 | *ts = vp->i_atime; | 107 | *ts = vp->i_atime; |
| 207 | } | 108 | } |
| 208 | 109 | ||
| 209 | static inline void vn_atime_to_time_t(bhv_vnode_t *vp, time_t *tt) | 110 | static inline void vn_atime_to_time_t(struct inode *vp, time_t *tt) |
| 210 | { | 111 | { |
| 211 | *tt = vp->i_atime.tv_sec; | 112 | *tt = vp->i_atime.tv_sec; |
| 212 | } | 113 | } |
| @@ -214,20 +115,11 @@ static inline void vn_atime_to_time_t(bhv_vnode_t *vp, time_t *tt) | |||
| 214 | /* | 115 | /* |
| 215 | * Some useful predicates. | 116 | * Some useful predicates. |
| 216 | */ | 117 | */ |
| 217 | #define VN_MAPPED(vp) mapping_mapped(vn_to_inode(vp)->i_mapping) | 118 | #define VN_MAPPED(vp) mapping_mapped(vp->i_mapping) |
| 218 | #define VN_CACHED(vp) (vn_to_inode(vp)->i_mapping->nrpages) | 119 | #define VN_CACHED(vp) (vp->i_mapping->nrpages) |
| 219 | #define VN_DIRTY(vp) mapping_tagged(vn_to_inode(vp)->i_mapping, \ | 120 | #define VN_DIRTY(vp) mapping_tagged(vp->i_mapping, \ |
| 220 | PAGECACHE_TAG_DIRTY) | 121 | PAGECACHE_TAG_DIRTY) |
| 221 | 122 | ||
| 222 | /* | ||
| 223 | * Flags to vop_setattr/getattr. | ||
| 224 | */ | ||
| 225 | #define ATTR_UTIME 0x01 /* non-default utime(2) request */ | ||
| 226 | #define ATTR_DMI 0x08 /* invocation from a DMI function */ | ||
| 227 | #define ATTR_LAZY 0x80 /* set/get attributes lazily */ | ||
| 228 | #define ATTR_NONBLOCK 0x100 /* return EAGAIN if operation would block */ | ||
| 229 | #define ATTR_NOLOCK 0x200 /* Don't grab any conflicting locks */ | ||
| 230 | #define ATTR_NOSIZETOK 0x400 /* Don't get the SIZE token */ | ||
| 231 | 123 | ||
| 232 | /* | 124 | /* |
| 233 | * Tracking vnode activity. | 125 | * Tracking vnode activity. |
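The new IHOLD()/IRELE() macros replace the vnode-based VN_HOLD()/VN_RELE(): IHOLD() asserts that the embedded Linux inode is already referenced and bumps i_count directly (so it must never be used to resurrect an unreferenced inode), while IRELE() drops the reference with iput(); both also emit the inode trace points. A hedged usage sketch (example_pin_inode is illustrative, mirroring the quota code's IHOLD(quotip) later in this patch):

	static void example_pin_inode(struct xfs_inode *ip)
	{
		IHOLD(ip);	/* requires VFS_I(ip)->i_count > 0 already */

		/* ... window where the inode must not go away ... */

		IRELE(ip);	/* drop the extra reference via iput() */
	}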
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c new file mode 100644 index 000000000000..964621fde6ed --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_xattr.c | |||
| @@ -0,0 +1,330 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2008 Christoph Hellwig. | ||
| 3 | * Portions Copyright (C) 2000-2008 Silicon Graphics, Inc. | ||
| 4 | * | ||
| 5 | * This program is free software; you can redistribute it and/or | ||
| 6 | * modify it under the terms of the GNU General Public License as | ||
| 7 | * published by the Free Software Foundation. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it would be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public License | ||
| 15 | * along with this program; if not, write the Free Software Foundation, | ||
| 16 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 17 | */ | ||
| 18 | |||
| 19 | #include "xfs.h" | ||
| 20 | #include "xfs_da_btree.h" | ||
| 21 | #include "xfs_bmap_btree.h" | ||
| 22 | #include "xfs_inode.h" | ||
| 23 | #include "xfs_attr.h" | ||
| 24 | #include "xfs_attr_leaf.h" | ||
| 25 | #include "xfs_acl.h" | ||
| 26 | #include "xfs_vnodeops.h" | ||
| 27 | |||
| 28 | #include <linux/posix_acl_xattr.h> | ||
| 29 | #include <linux/xattr.h> | ||
| 30 | |||
| 31 | |||
| 32 | /* | ||
| 33 | * ACL handling. Should eventually be moved into xfs_acl.c | ||
| 34 | */ | ||
| 35 | |||
| 36 | static int | ||
| 37 | xfs_decode_acl(const char *name) | ||
| 38 | { | ||
| 39 | if (strcmp(name, "posix_acl_access") == 0) | ||
| 40 | return _ACL_TYPE_ACCESS; | ||
| 41 | else if (strcmp(name, "posix_acl_default") == 0) | ||
| 42 | return _ACL_TYPE_DEFAULT; | ||
| 43 | return -EINVAL; | ||
| 44 | } | ||
| 45 | |||
| 46 | /* | ||
| 47 | * Get system extended attributes which at the moment only | ||
| 48 | * includes Posix ACLs. | ||
| 49 | */ | ||
| 50 | static int | ||
| 51 | xfs_xattr_system_get(struct inode *inode, const char *name, | ||
| 52 | void *buffer, size_t size) | ||
| 53 | { | ||
| 54 | int acl; | ||
| 55 | |||
| 56 | acl = xfs_decode_acl(name); | ||
| 57 | if (acl < 0) | ||
| 58 | return acl; | ||
| 59 | |||
| 60 | return xfs_acl_vget(inode, buffer, size, acl); | ||
| 61 | } | ||
| 62 | |||
| 63 | static int | ||
| 64 | xfs_xattr_system_set(struct inode *inode, const char *name, | ||
| 65 | const void *value, size_t size, int flags) | ||
| 66 | { | ||
| 67 | int acl; | ||
| 68 | |||
| 69 | acl = xfs_decode_acl(name); | ||
| 70 | if (acl < 0) | ||
| 71 | return acl; | ||
| 72 | if (flags & XATTR_CREATE) | ||
| 73 | return -EINVAL; | ||
| 74 | |||
| 75 | if (!value) | ||
| 76 | return xfs_acl_vremove(inode, acl); | ||
| 77 | |||
| 78 | return xfs_acl_vset(inode, (void *)value, size, acl); | ||
| 79 | } | ||
| 80 | |||
| 81 | static struct xattr_handler xfs_xattr_system_handler = { | ||
| 82 | .prefix = XATTR_SYSTEM_PREFIX, | ||
| 83 | .get = xfs_xattr_system_get, | ||
| 84 | .set = xfs_xattr_system_set, | ||
| 85 | }; | ||
| 86 | |||
| 87 | |||
| 88 | /* | ||
| 89 | * Real xattr handling. The only difference between the namespaces is | ||
| 90 | * a flag passed to the low-level attr code. | ||
| 91 | */ | ||
| 92 | |||
| 93 | static int | ||
| 94 | __xfs_xattr_get(struct inode *inode, const char *name, | ||
| 95 | void *value, size_t size, int xflags) | ||
| 96 | { | ||
| 97 | struct xfs_inode *ip = XFS_I(inode); | ||
| 98 | int error, asize = size; | ||
| 99 | |||
| 100 | if (strcmp(name, "") == 0) | ||
| 101 | return -EINVAL; | ||
| 102 | |||
| 103 | /* Convert Linux syscall to XFS internal ATTR flags */ | ||
| 104 | if (!size) { | ||
| 105 | xflags |= ATTR_KERNOVAL; | ||
| 106 | value = NULL; | ||
| 107 | } | ||
| 108 | |||
| 109 | error = -xfs_attr_get(ip, name, value, &asize, xflags); | ||
| 110 | if (error) | ||
| 111 | return error; | ||
| 112 | return asize; | ||
| 113 | } | ||
| 114 | |||
| 115 | static int | ||
| 116 | __xfs_xattr_set(struct inode *inode, const char *name, const void *value, | ||
| 117 | size_t size, int flags, int xflags) | ||
| 118 | { | ||
| 119 | struct xfs_inode *ip = XFS_I(inode); | ||
| 120 | |||
| 121 | if (strcmp(name, "") == 0) | ||
| 122 | return -EINVAL; | ||
| 123 | |||
| 124 | /* Convert Linux syscall to XFS internal ATTR flags */ | ||
| 125 | if (flags & XATTR_CREATE) | ||
| 126 | xflags |= ATTR_CREATE; | ||
| 127 | if (flags & XATTR_REPLACE) | ||
| 128 | xflags |= ATTR_REPLACE; | ||
| 129 | |||
| 130 | if (!value) | ||
| 131 | return -xfs_attr_remove(ip, name, xflags); | ||
| 132 | return -xfs_attr_set(ip, name, (void *)value, size, xflags); | ||
| 133 | } | ||
| 134 | |||
| 135 | static int | ||
| 136 | xfs_xattr_user_get(struct inode *inode, const char *name, | ||
| 137 | void *value, size_t size) | ||
| 138 | { | ||
| 139 | return __xfs_xattr_get(inode, name, value, size, 0); | ||
| 140 | } | ||
| 141 | |||
| 142 | static int | ||
| 143 | xfs_xattr_user_set(struct inode *inode, const char *name, | ||
| 144 | const void *value, size_t size, int flags) | ||
| 145 | { | ||
| 146 | return __xfs_xattr_set(inode, name, value, size, flags, 0); | ||
| 147 | } | ||
| 148 | |||
| 149 | static struct xattr_handler xfs_xattr_user_handler = { | ||
| 150 | .prefix = XATTR_USER_PREFIX, | ||
| 151 | .get = xfs_xattr_user_get, | ||
| 152 | .set = xfs_xattr_user_set, | ||
| 153 | }; | ||
| 154 | |||
| 155 | |||
| 156 | static int | ||
| 157 | xfs_xattr_trusted_get(struct inode *inode, const char *name, | ||
| 158 | void *value, size_t size) | ||
| 159 | { | ||
| 160 | return __xfs_xattr_get(inode, name, value, size, ATTR_ROOT); | ||
| 161 | } | ||
| 162 | |||
| 163 | static int | ||
| 164 | xfs_xattr_trusted_set(struct inode *inode, const char *name, | ||
| 165 | const void *value, size_t size, int flags) | ||
| 166 | { | ||
| 167 | return __xfs_xattr_set(inode, name, value, size, flags, ATTR_ROOT); | ||
| 168 | } | ||
| 169 | |||
| 170 | static struct xattr_handler xfs_xattr_trusted_handler = { | ||
| 171 | .prefix = XATTR_TRUSTED_PREFIX, | ||
| 172 | .get = xfs_xattr_trusted_get, | ||
| 173 | .set = xfs_xattr_trusted_set, | ||
| 174 | }; | ||
| 175 | |||
| 176 | |||
| 177 | static int | ||
| 178 | xfs_xattr_secure_get(struct inode *inode, const char *name, | ||
| 179 | void *value, size_t size) | ||
| 180 | { | ||
| 181 | return __xfs_xattr_get(inode, name, value, size, ATTR_SECURE); | ||
| 182 | } | ||
| 183 | |||
| 184 | static int | ||
| 185 | xfs_xattr_secure_set(struct inode *inode, const char *name, | ||
| 186 | const void *value, size_t size, int flags) | ||
| 187 | { | ||
| 188 | return __xfs_xattr_set(inode, name, value, size, flags, ATTR_SECURE); | ||
| 189 | } | ||
| 190 | |||
| 191 | static struct xattr_handler xfs_xattr_security_handler = { | ||
| 192 | .prefix = XATTR_SECURITY_PREFIX, | ||
| 193 | .get = xfs_xattr_secure_get, | ||
| 194 | .set = xfs_xattr_secure_set, | ||
| 195 | }; | ||
| 196 | |||
| 197 | |||
| 198 | struct xattr_handler *xfs_xattr_handlers[] = { | ||
| 199 | &xfs_xattr_user_handler, | ||
| 200 | &xfs_xattr_trusted_handler, | ||
| 201 | &xfs_xattr_security_handler, | ||
| 202 | &xfs_xattr_system_handler, | ||
| 203 | NULL | ||
| 204 | }; | ||
| 205 | |||
| 206 | static unsigned int xfs_xattr_prefix_len(int flags) | ||
| 207 | { | ||
| 208 | if (flags & XFS_ATTR_SECURE) | ||
| 209 | return sizeof("security"); | ||
| 210 | else if (flags & XFS_ATTR_ROOT) | ||
| 211 | return sizeof("trusted"); | ||
| 212 | else | ||
| 213 | return sizeof("user"); | ||
| 214 | } | ||
| 215 | |||
| 216 | static const char *xfs_xattr_prefix(int flags) | ||
| 217 | { | ||
| 218 | if (flags & XFS_ATTR_SECURE) | ||
| 219 | return xfs_xattr_security_handler.prefix; | ||
| 220 | else if (flags & XFS_ATTR_ROOT) | ||
| 221 | return xfs_xattr_trusted_handler.prefix; | ||
| 222 | else | ||
| 223 | return xfs_xattr_user_handler.prefix; | ||
| 224 | } | ||
| 225 | |||
| 226 | static int | ||
| 227 | xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags, | ||
| 228 | char *name, int namelen, int valuelen, char *value) | ||
| 229 | { | ||
| 230 | unsigned int prefix_len = xfs_xattr_prefix_len(flags); | ||
| 231 | char *offset; | ||
| 232 | int arraytop; | ||
| 233 | |||
| 234 | ASSERT(context->count >= 0); | ||
| 235 | |||
| 236 | /* | ||
| 237 | * Only show root namespace entries if we are actually allowed to | ||
| 238 | * see them. | ||
| 239 | */ | ||
| 240 | if ((flags & XFS_ATTR_ROOT) && !capable(CAP_SYS_ADMIN)) | ||
| 241 | return 0; | ||
| 242 | |||
| 243 | arraytop = context->count + prefix_len + namelen + 1; | ||
| 244 | if (arraytop > context->firstu) { | ||
| 245 | context->count = -1; /* insufficient space */ | ||
| 246 | return 1; | ||
| 247 | } | ||
| 248 | offset = (char *)context->alist + context->count; | ||
| 249 | strncpy(offset, xfs_xattr_prefix(flags), prefix_len); | ||
| 250 | offset += prefix_len; | ||
| 251 | strncpy(offset, name, namelen); /* real name */ | ||
| 252 | offset += namelen; | ||
| 253 | *offset = '\0'; | ||
| 254 | context->count += prefix_len + namelen + 1; | ||
| 255 | return 0; | ||
| 256 | } | ||
| 257 | |||
| 258 | static int | ||
| 259 | xfs_xattr_put_listent_sizes(struct xfs_attr_list_context *context, int flags, | ||
| 260 | char *name, int namelen, int valuelen, char *value) | ||
| 261 | { | ||
| 262 | context->count += xfs_xattr_prefix_len(flags) + namelen + 1; | ||
| 263 | return 0; | ||
| 264 | } | ||
| 265 | |||
| 266 | static int | ||
| 267 | list_one_attr(const char *name, const size_t len, void *data, | ||
| 268 | size_t size, ssize_t *result) | ||
| 269 | { | ||
| 270 | char *p = data + *result; | ||
| 271 | |||
| 272 | *result += len; | ||
| 273 | if (!size) | ||
| 274 | return 0; | ||
| 275 | if (*result > size) | ||
| 276 | return -ERANGE; | ||
| 277 | |||
| 278 | strcpy(p, name); | ||
| 279 | return 0; | ||
| 280 | } | ||
| 281 | |||
| 282 | ssize_t | ||
| 283 | xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size) | ||
| 284 | { | ||
| 285 | struct xfs_attr_list_context context; | ||
| 286 | struct attrlist_cursor_kern cursor = { 0 }; | ||
| 287 | struct inode *inode = dentry->d_inode; | ||
| 288 | int error; | ||
| 289 | |||
| 290 | /* | ||
| 291 | * First read the regular on-disk attributes. | ||
| 292 | */ | ||
| 293 | memset(&context, 0, sizeof(context)); | ||
| 294 | context.dp = XFS_I(inode); | ||
| 295 | context.cursor = &cursor; | ||
| 296 | context.resynch = 1; | ||
| 297 | context.alist = data; | ||
| 298 | context.bufsize = size; | ||
| 299 | context.firstu = context.bufsize; | ||
| 300 | |||
| 301 | if (size) | ||
| 302 | context.put_listent = xfs_xattr_put_listent; | ||
| 303 | else | ||
| 304 | context.put_listent = xfs_xattr_put_listent_sizes; | ||
| 305 | |||
| 306 | xfs_attr_list_int(&context); | ||
| 307 | if (context.count < 0) | ||
| 308 | return -ERANGE; | ||
| 309 | |||
| 310 | /* | ||
| 311 | * Then add the two synthetic ACL attributes. | ||
| 312 | */ | ||
| 313 | if (xfs_acl_vhasacl_access(inode)) { | ||
| 314 | error = list_one_attr(POSIX_ACL_XATTR_ACCESS, | ||
| 315 | strlen(POSIX_ACL_XATTR_ACCESS) + 1, | ||
| 316 | data, size, &context.count); | ||
| 317 | if (error) | ||
| 318 | return error; | ||
| 319 | } | ||
| 320 | |||
| 321 | if (xfs_acl_vhasacl_default(inode)) { | ||
| 322 | error = list_one_attr(POSIX_ACL_XATTR_DEFAULT, | ||
| 323 | strlen(POSIX_ACL_XATTR_DEFAULT) + 1, | ||
| 324 | data, size, &context.count); | ||
| 325 | if (error) | ||
| 326 | return error; | ||
| 327 | } | ||
| 328 | |||
| 329 | return context.count; | ||
| 330 | } | ||
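xfs_vn_listxattr() follows the usual two-pass listxattr(2) contract: a zero-size call only accumulates name lengths (xfs_xattr_put_listent_sizes), while a sized call copies NUL-terminated "prefix.name" strings into the buffer and fails with -ERANGE if they do not fit. A userspace sketch of the calling convention this serves (illustrative code, not part of the patch):

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/xattr.h>

	static int list_xattrs(const char *path)
	{
		ssize_t len = listxattr(path, NULL, 0);	/* sizing pass */
		char *buf, *p;

		if (len <= 0)
			return (int)len;
		buf = malloc(len);
		if (!buf)
			return -1;
		len = listxattr(path, buf, len);	/* copy pass */
		if (len < 0) {
			free(buf);
			return -1;
		}
		/* buffer holds "user.foo\0trusted.bar\0security.baz\0..." */
		for (p = buf; p < buf + len; p += strlen(p) + 1)
			printf("%s\n", p);
		free(buf);
		return 0;
	}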
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index 85df3288efd5..f2705f2fd43c 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c | |||
| @@ -101,11 +101,18 @@ xfs_qm_dqinit( | |||
| 101 | if (brandnewdquot) { | 101 | if (brandnewdquot) { |
| 102 | dqp->dq_flnext = dqp->dq_flprev = dqp; | 102 | dqp->dq_flnext = dqp->dq_flprev = dqp; |
| 103 | mutex_init(&dqp->q_qlock); | 103 | mutex_init(&dqp->q_qlock); |
| 104 | initnsema(&dqp->q_flock, 1, "fdq"); | ||
| 105 | sv_init(&dqp->q_pinwait, SV_DEFAULT, "pdq"); | 104 | sv_init(&dqp->q_pinwait, SV_DEFAULT, "pdq"); |
| 106 | 105 | ||
| 106 | /* | ||
| 107 | * Because we want to use a counting completion, complete | ||
| 108 | * the flush completion once to allow a single access to | ||
| 109 | * the flush completion without blocking. | ||
| 110 | */ | ||
| 111 | init_completion(&dqp->q_flush); | ||
| 112 | complete(&dqp->q_flush); | ||
| 113 | |||
| 107 | #ifdef XFS_DQUOT_TRACE | 114 | #ifdef XFS_DQUOT_TRACE |
| 108 | dqp->q_trace = ktrace_alloc(DQUOT_TRACE_SIZE, KM_SLEEP); | 115 | dqp->q_trace = ktrace_alloc(DQUOT_TRACE_SIZE, KM_NOFS); |
| 109 | xfs_dqtrace_entry(dqp, "DQINIT"); | 116 | xfs_dqtrace_entry(dqp, "DQINIT"); |
| 110 | #endif | 117 | #endif |
| 111 | } else { | 118 | } else { |
| @@ -150,7 +157,6 @@ xfs_qm_dqdestroy( | |||
| 150 | ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp)); | 157 | ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp)); |
| 151 | 158 | ||
| 152 | mutex_destroy(&dqp->q_qlock); | 159 | mutex_destroy(&dqp->q_qlock); |
| 153 | freesema(&dqp->q_flock); | ||
| 154 | sv_destroy(&dqp->q_pinwait); | 160 | sv_destroy(&dqp->q_pinwait); |
| 155 | 161 | ||
| 156 | #ifdef XFS_DQUOT_TRACE | 162 | #ifdef XFS_DQUOT_TRACE |
| @@ -431,7 +437,7 @@ xfs_qm_dqalloc( | |||
| 431 | * when it unlocks the inode. Since we want to keep the quota | 437 | * when it unlocks the inode. Since we want to keep the quota |
| 432 | * inode around, we bump the vnode ref count now. | 438 | * inode around, we bump the vnode ref count now. |
| 433 | */ | 439 | */ |
| 434 | VN_HOLD(XFS_ITOV(quotip)); | 440 | IHOLD(quotip); |
| 435 | 441 | ||
| 436 | xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); | 442 | xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); |
| 437 | nmaps = 1; | 443 | nmaps = 1; |
| @@ -1211,7 +1217,7 @@ xfs_qm_dqflush( | |||
| 1211 | int error; | 1217 | int error; |
| 1212 | 1218 | ||
| 1213 | ASSERT(XFS_DQ_IS_LOCKED(dqp)); | 1219 | ASSERT(XFS_DQ_IS_LOCKED(dqp)); |
| 1214 | ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp)); | 1220 | ASSERT(!completion_done(&dqp->q_flush)); |
| 1215 | xfs_dqtrace_entry(dqp, "DQFLUSH"); | 1221 | xfs_dqtrace_entry(dqp, "DQFLUSH"); |
| 1216 | 1222 | ||
| 1217 | /* | 1223 | /* |
| @@ -1348,34 +1354,18 @@ xfs_qm_dqflush_done( | |||
| 1348 | xfs_dqfunlock(dqp); | 1354 | xfs_dqfunlock(dqp); |
| 1349 | } | 1355 | } |
| 1350 | 1356 | ||
| 1351 | |||
| 1352 | int | ||
| 1353 | xfs_qm_dqflock_nowait( | ||
| 1354 | xfs_dquot_t *dqp) | ||
| 1355 | { | ||
| 1356 | int locked; | ||
| 1357 | |||
| 1358 | locked = cpsema(&((dqp)->q_flock)); | ||
| 1359 | |||
| 1360 | /* XXX ifdef these out */ | ||
| 1361 | if (locked) | ||
| 1362 | (dqp)->dq_flags |= XFS_DQ_FLOCKED; | ||
| 1363 | return (locked); | ||
| 1364 | } | ||
| 1365 | |||
| 1366 | |||
| 1367 | int | 1357 | int |
| 1368 | xfs_qm_dqlock_nowait( | 1358 | xfs_qm_dqlock_nowait( |
| 1369 | xfs_dquot_t *dqp) | 1359 | xfs_dquot_t *dqp) |
| 1370 | { | 1360 | { |
| 1371 | return (mutex_trylock(&((dqp)->q_qlock))); | 1361 | return mutex_trylock(&dqp->q_qlock); |
| 1372 | } | 1362 | } |
| 1373 | 1363 | ||
| 1374 | void | 1364 | void |
| 1375 | xfs_dqlock( | 1365 | xfs_dqlock( |
| 1376 | xfs_dquot_t *dqp) | 1366 | xfs_dquot_t *dqp) |
| 1377 | { | 1367 | { |
| 1378 | mutex_lock(&(dqp->q_qlock)); | 1368 | mutex_lock(&dqp->q_qlock); |
| 1379 | } | 1369 | } |
| 1380 | 1370 | ||
| 1381 | void | 1371 | void |
| @@ -1435,8 +1425,7 @@ xfs_dqlock2( | |||
| 1435 | /* ARGSUSED */ | 1425 | /* ARGSUSED */ |
| 1436 | int | 1426 | int |
| 1437 | xfs_qm_dqpurge( | 1427 | xfs_qm_dqpurge( |
| 1438 | xfs_dquot_t *dqp, | 1428 | xfs_dquot_t *dqp) |
| 1439 | uint flags) | ||
| 1440 | { | 1429 | { |
| 1441 | xfs_dqhash_t *thishash; | 1430 | xfs_dqhash_t *thishash; |
| 1442 | xfs_mount_t *mp = dqp->q_mount; | 1431 | xfs_mount_t *mp = dqp->q_mount; |
| @@ -1469,7 +1458,7 @@ xfs_qm_dqpurge( | |||
| 1469 | * if we're turning off quotas. Basically, we need this flush | 1458 | * if we're turning off quotas. Basically, we need this flush |
| 1470 | * lock, and are willing to block on it. | 1459 | * lock, and are willing to block on it. |
| 1471 | */ | 1460 | */ |
| 1472 | if (! xfs_qm_dqflock_nowait(dqp)) { | 1461 | if (!xfs_dqflock_nowait(dqp)) { |
| 1473 | /* | 1462 | /* |
| 1474 | * Block on the flush lock after nudging dquot buffer, | 1463 | * Block on the flush lock after nudging dquot buffer, |
| 1475 | * if it is incore. | 1464 | * if it is incore. |
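The comment added to xfs_qm_dqinit() above explains the conversion of the per-dquot flush lock from a semaphore to a counting completion: init_completion() leaves the completion with a count of zero (effectively "locked"), so it is completed once up front to make it available, after which wait_for_completion() serves as the blocking lock, try_wait_for_completion() as the trylock and complete() as the unlock (exactly the xfs_dqflock()/xfs_dqflock_nowait()/xfs_dqfunlock() inlines in the xfs_dquot.h hunk that follows). A minimal sketch of that mapping, with illustrative names:

	#include <linux/completion.h>

	static struct completion flush;		/* stands in for dqp->q_flush */

	static void flush_lock_init(void)
	{
		init_completion(&flush);	/* count starts at 0: "locked" */
		complete(&flush);		/* bump to 1 so the first taker does not block */
	}

	static void flush_lock(void)
	{
		wait_for_completion(&flush);	/* consume the count: lock */
	}

	static int flush_trylock(void)
	{
		return try_wait_for_completion(&flush);	/* nonzero if it was available */
	}

	static void flush_unlock(void)
	{
		complete(&flush);		/* restore the count: unlock */
	}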
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h index 5c371a92e3e2..8958d0faf8d3 100644 --- a/fs/xfs/quota/xfs_dquot.h +++ b/fs/xfs/quota/xfs_dquot.h | |||
| @@ -82,7 +82,7 @@ typedef struct xfs_dquot { | |||
| 82 | xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */ | 82 | xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */ |
| 83 | xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */ | 83 | xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */ |
| 84 | mutex_t q_qlock; /* quota lock */ | 84 | mutex_t q_qlock; /* quota lock */ |
| 85 | sema_t q_flock; /* flush lock */ | 85 | struct completion q_flush; /* flush completion queue */ |
| 86 | uint q_pincount; /* pin count for this dquot */ | 86 | uint q_pincount; /* pin count for this dquot */ |
| 87 | sv_t q_pinwait; /* sync var for pinning */ | 87 | sv_t q_pinwait; /* sync var for pinning */ |
| 88 | #ifdef XFS_DQUOT_TRACE | 88 | #ifdef XFS_DQUOT_TRACE |
| @@ -113,17 +113,25 @@ XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp) | |||
| 113 | 113 | ||
| 114 | 114 | ||
| 115 | /* | 115 | /* |
| 116 | * The following three routines simply manage the q_flock | 116 | * Manage the q_flush completion queue embedded in the dquot. This completion |
| 117 | * semaphore embedded in the dquot. This semaphore synchronizes | 117 | * queue synchronizes processes attempting to flush the in-core dquot back to |
| 118 | * processes attempting to flush the in-core dquot back to disk. | 118 | * disk. |
| 119 | */ | 119 | */ |
| 120 | #define xfs_dqflock(dqp) { psema(&((dqp)->q_flock), PINOD | PRECALC);\ | 120 | static inline void xfs_dqflock(xfs_dquot_t *dqp) |
| 121 | (dqp)->dq_flags |= XFS_DQ_FLOCKED; } | 121 | { |
| 122 | #define xfs_dqfunlock(dqp) { ASSERT(issemalocked(&((dqp)->q_flock))); \ | 122 | wait_for_completion(&dqp->q_flush); |
| 123 | vsema(&((dqp)->q_flock)); \ | 123 | } |
| 124 | (dqp)->dq_flags &= ~(XFS_DQ_FLOCKED); } | 124 | |
| 125 | static inline int xfs_dqflock_nowait(xfs_dquot_t *dqp) | ||
| 126 | { | ||
| 127 | return try_wait_for_completion(&dqp->q_flush); | ||
| 128 | } | ||
| 129 | |||
| 130 | static inline void xfs_dqfunlock(xfs_dquot_t *dqp) | ||
| 131 | { | ||
| 132 | complete(&dqp->q_flush); | ||
| 133 | } | ||
| 125 | 134 | ||
| 126 | #define XFS_DQ_IS_FLUSH_LOCKED(dqp) (issemalocked(&((dqp)->q_flock))) | ||
| 127 | #define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp)) | 135 | #define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp)) |
| 128 | #define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) | 136 | #define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) |
| 129 | #define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) | 137 | #define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) |
| @@ -164,10 +172,9 @@ extern void xfs_qm_dqprint(xfs_dquot_t *); | |||
| 164 | 172 | ||
| 165 | extern void xfs_qm_dqdestroy(xfs_dquot_t *); | 173 | extern void xfs_qm_dqdestroy(xfs_dquot_t *); |
| 166 | extern int xfs_qm_dqflush(xfs_dquot_t *, uint); | 174 | extern int xfs_qm_dqflush(xfs_dquot_t *, uint); |
| 167 | extern int xfs_qm_dqpurge(xfs_dquot_t *, uint); | 175 | extern int xfs_qm_dqpurge(xfs_dquot_t *); |
| 168 | extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); | 176 | extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); |
| 169 | extern int xfs_qm_dqlock_nowait(xfs_dquot_t *); | 177 | extern int xfs_qm_dqlock_nowait(xfs_dquot_t *); |
| 170 | extern int xfs_qm_dqflock_nowait(xfs_dquot_t *); | ||
| 171 | extern void xfs_qm_dqflock_pushbuf_wait(xfs_dquot_t *dqp); | 178 | extern void xfs_qm_dqflock_pushbuf_wait(xfs_dquot_t *dqp); |
| 172 | extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, | 179 | extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, |
| 173 | xfs_disk_dquot_t *); | 180 | xfs_disk_dquot_t *); |
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c index 36e05ca78412..f028644caa5e 100644 --- a/fs/xfs/quota/xfs_dquot_item.c +++ b/fs/xfs/quota/xfs_dquot_item.c | |||
| @@ -151,7 +151,7 @@ xfs_qm_dquot_logitem_push( | |||
| 151 | dqp = logitem->qli_dquot; | 151 | dqp = logitem->qli_dquot; |
| 152 | 152 | ||
| 153 | ASSERT(XFS_DQ_IS_LOCKED(dqp)); | 153 | ASSERT(XFS_DQ_IS_LOCKED(dqp)); |
| 154 | ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp)); | 154 | ASSERT(!completion_done(&dqp->q_flush)); |
| 155 | 155 | ||
| 156 | /* | 156 | /* |
| 157 | * Since we were able to lock the dquot's flush lock and | 157 | * Since we were able to lock the dquot's flush lock and |
| @@ -245,7 +245,7 @@ xfs_qm_dquot_logitem_pushbuf( | |||
| 245 | * inode flush completed and the inode was taken off the AIL. | 245 | * inode flush completed and the inode was taken off the AIL. |
| 246 | * So, just get out. | 246 | * So, just get out. |
| 247 | */ | 247 | */ |
| 248 | if (!issemalocked(&(dqp->q_flock)) || | 248 | if (completion_done(&dqp->q_flush) || |
| 249 | ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) { | 249 | ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) { |
| 250 | qip->qli_pushbuf_flag = 0; | 250 | qip->qli_pushbuf_flag = 0; |
| 251 | xfs_dqunlock(dqp); | 251 | xfs_dqunlock(dqp); |
| @@ -258,7 +258,7 @@ xfs_qm_dquot_logitem_pushbuf( | |||
| 258 | if (bp != NULL) { | 258 | if (bp != NULL) { |
| 259 | if (XFS_BUF_ISDELAYWRITE(bp)) { | 259 | if (XFS_BUF_ISDELAYWRITE(bp)) { |
| 260 | dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) && | 260 | dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) && |
| 261 | issemalocked(&(dqp->q_flock))); | 261 | !completion_done(&dqp->q_flush)); |
| 262 | qip->qli_pushbuf_flag = 0; | 262 | qip->qli_pushbuf_flag = 0; |
| 263 | xfs_dqunlock(dqp); | 263 | xfs_dqunlock(dqp); |
| 264 | 264 | ||
| @@ -317,7 +317,7 @@ xfs_qm_dquot_logitem_trylock( | |||
| 317 | return (XFS_ITEM_LOCKED); | 317 | return (XFS_ITEM_LOCKED); |
| 318 | 318 | ||
| 319 | retval = XFS_ITEM_SUCCESS; | 319 | retval = XFS_ITEM_SUCCESS; |
| 320 | if (! xfs_qm_dqflock_nowait(dqp)) { | 320 | if (!xfs_dqflock_nowait(dqp)) { |
| 321 | /* | 321 | /* |
| 322 | * The dquot is already being flushed. It may have been | 322 | * The dquot is already being flushed. It may have been |
| 323 | * flushed delayed write, however, and we don't want to | 323 | * flushed delayed write, however, and we don't want to |
| @@ -576,8 +576,8 @@ xfs_qm_qoffend_logitem_committed( | |||
| 576 | * xfs_trans_delete_ail() drops the AIL lock. | 576 | * xfs_trans_delete_ail() drops the AIL lock. |
| 577 | */ | 577 | */ |
| 578 | xfs_trans_delete_ail(qfs->qql_item.li_mountp, (xfs_log_item_t *)qfs); | 578 | xfs_trans_delete_ail(qfs->qql_item.li_mountp, (xfs_log_item_t *)qfs); |
| 579 | kmem_free(qfs, sizeof(xfs_qoff_logitem_t)); | 579 | kmem_free(qfs); |
| 580 | kmem_free(qfe, sizeof(xfs_qoff_logitem_t)); | 580 | kmem_free(qfe); |
| 581 | return (xfs_lsn_t)-1; | 581 | return (xfs_lsn_t)-1; |
| 582 | } | 582 | } |
| 583 | 583 | ||
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index d31cce1165c5..df0ffef9775a 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c | |||
| @@ -192,8 +192,8 @@ xfs_qm_destroy( | |||
| 192 | xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i])); | 192 | xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i])); |
| 193 | xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i])); | 193 | xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i])); |
| 194 | } | 194 | } |
| 195 | kmem_free(xqm->qm_usr_dqhtable, hsize * sizeof(xfs_dqhash_t)); | 195 | kmem_free(xqm->qm_usr_dqhtable); |
| 196 | kmem_free(xqm->qm_grp_dqhtable, hsize * sizeof(xfs_dqhash_t)); | 196 | kmem_free(xqm->qm_grp_dqhtable); |
| 197 | xqm->qm_usr_dqhtable = NULL; | 197 | xqm->qm_usr_dqhtable = NULL; |
| 198 | xqm->qm_grp_dqhtable = NULL; | 198 | xqm->qm_grp_dqhtable = NULL; |
| 199 | xqm->qm_dqhashmask = 0; | 199 | xqm->qm_dqhashmask = 0; |
| @@ -201,7 +201,7 @@ xfs_qm_destroy( | |||
| 201 | #ifdef DEBUG | 201 | #ifdef DEBUG |
| 202 | mutex_destroy(&qcheck_lock); | 202 | mutex_destroy(&qcheck_lock); |
| 203 | #endif | 203 | #endif |
| 204 | kmem_free(xqm, sizeof(xfs_qm_t)); | 204 | kmem_free(xqm); |
| 205 | } | 205 | } |
| 206 | 206 | ||
| 207 | /* | 207 | /* |
| @@ -310,8 +310,7 @@ xfs_qm_unmount_quotadestroy( | |||
| 310 | */ | 310 | */ |
| 311 | void | 311 | void |
| 312 | xfs_qm_mount_quotas( | 312 | xfs_qm_mount_quotas( |
| 313 | xfs_mount_t *mp, | 313 | xfs_mount_t *mp) |
| 314 | int mfsi_flags) | ||
| 315 | { | 314 | { |
| 316 | int error = 0; | 315 | int error = 0; |
| 317 | uint sbf; | 316 | uint sbf; |
| @@ -346,8 +345,7 @@ xfs_qm_mount_quotas( | |||
| 346 | /* | 345 | /* |
| 347 | * If any of the quotas are not consistent, do a quotacheck. | 346 | * If any of the quotas are not consistent, do a quotacheck. |
| 348 | */ | 347 | */ |
| 349 | if (XFS_QM_NEED_QUOTACHECK(mp) && | 348 | if (XFS_QM_NEED_QUOTACHECK(mp)) { |
| 350 | !(mfsi_flags & XFS_MFSI_NO_QUOTACHECK)) { | ||
| 351 | error = xfs_qm_quotacheck(mp); | 349 | error = xfs_qm_quotacheck(mp); |
| 352 | if (error) { | 350 | if (error) { |
| 353 | /* Quotacheck failed and disabled quotas. */ | 351 | /* Quotacheck failed and disabled quotas. */ |
| @@ -445,11 +443,11 @@ xfs_qm_unmount_quotas( | |||
| 445 | } | 443 | } |
| 446 | } | 444 | } |
| 447 | if (uqp) { | 445 | if (uqp) { |
| 448 | XFS_PURGE_INODE(uqp); | 446 | IRELE(uqp); |
| 449 | mp->m_quotainfo->qi_uquotaip = NULL; | 447 | mp->m_quotainfo->qi_uquotaip = NULL; |
| 450 | } | 448 | } |
| 451 | if (gqp) { | 449 | if (gqp) { |
| 452 | XFS_PURGE_INODE(gqp); | 450 | IRELE(gqp); |
| 453 | mp->m_quotainfo->qi_gquotaip = NULL; | 451 | mp->m_quotainfo->qi_gquotaip = NULL; |
| 454 | } | 452 | } |
| 455 | out: | 453 | out: |
| @@ -484,7 +482,7 @@ again: | |||
| 484 | xfs_dqtrace_entry(dqp, "FLUSHALL: DQDIRTY"); | 482 | xfs_dqtrace_entry(dqp, "FLUSHALL: DQDIRTY"); |
| 485 | /* XXX a sentinel would be better */ | 483 | /* XXX a sentinel would be better */ |
| 486 | recl = XFS_QI_MPLRECLAIMS(mp); | 484 | recl = XFS_QI_MPLRECLAIMS(mp); |
| 487 | if (! xfs_qm_dqflock_nowait(dqp)) { | 485 | if (!xfs_dqflock_nowait(dqp)) { |
| 488 | /* | 486 | /* |
| 489 | * If we can't grab the flush lock then check | 487 | * If we can't grab the flush lock then check |
| 490 | * to see if the dquot has been flushed delayed | 488 | * to see if the dquot has been flushed delayed |
| @@ -631,7 +629,7 @@ xfs_qm_dqpurge_int( | |||
| 631 | * freelist in INACTIVE state. | 629 | * freelist in INACTIVE state. |
| 632 | */ | 630 | */ |
| 633 | nextdqp = dqp->MPL_NEXT; | 631 | nextdqp = dqp->MPL_NEXT; |
| 634 | nmisses += xfs_qm_dqpurge(dqp, flags); | 632 | nmisses += xfs_qm_dqpurge(dqp); |
| 635 | dqp = nextdqp; | 633 | dqp = nextdqp; |
| 636 | } | 634 | } |
| 637 | xfs_qm_mplist_unlock(mp); | 635 | xfs_qm_mplist_unlock(mp); |
| @@ -1062,7 +1060,7 @@ xfs_qm_sync( | |||
| 1062 | 1060 | ||
| 1063 | /* XXX a sentinel would be better */ | 1061 | /* XXX a sentinel would be better */ |
| 1064 | recl = XFS_QI_MPLRECLAIMS(mp); | 1062 | recl = XFS_QI_MPLRECLAIMS(mp); |
| 1065 | if (! xfs_qm_dqflock_nowait(dqp)) { | 1063 | if (!xfs_dqflock_nowait(dqp)) { |
| 1066 | if (nowait) { | 1064 | if (nowait) { |
| 1067 | xfs_dqunlock(dqp); | 1065 | xfs_dqunlock(dqp); |
| 1068 | continue; | 1066 | continue; |
| @@ -1134,7 +1132,7 @@ xfs_qm_init_quotainfo( | |||
| 1134 | * and change the superblock accordingly. | 1132 | * and change the superblock accordingly. |
| 1135 | */ | 1133 | */ |
| 1136 | if ((error = xfs_qm_init_quotainos(mp))) { | 1134 | if ((error = xfs_qm_init_quotainos(mp))) { |
| 1137 | kmem_free(qinf, sizeof(xfs_quotainfo_t)); | 1135 | kmem_free(qinf); |
| 1138 | mp->m_quotainfo = NULL; | 1136 | mp->m_quotainfo = NULL; |
| 1139 | return error; | 1137 | return error; |
| 1140 | } | 1138 | } |
| @@ -1240,15 +1238,15 @@ xfs_qm_destroy_quotainfo( | |||
| 1240 | xfs_qm_list_destroy(&qi->qi_dqlist); | 1238 | xfs_qm_list_destroy(&qi->qi_dqlist); |
| 1241 | 1239 | ||
| 1242 | if (qi->qi_uquotaip) { | 1240 | if (qi->qi_uquotaip) { |
| 1243 | XFS_PURGE_INODE(qi->qi_uquotaip); | 1241 | IRELE(qi->qi_uquotaip); |
| 1244 | qi->qi_uquotaip = NULL; /* paranoia */ | 1242 | qi->qi_uquotaip = NULL; /* paranoia */ |
| 1245 | } | 1243 | } |
| 1246 | if (qi->qi_gquotaip) { | 1244 | if (qi->qi_gquotaip) { |
| 1247 | XFS_PURGE_INODE(qi->qi_gquotaip); | 1245 | IRELE(qi->qi_gquotaip); |
| 1248 | qi->qi_gquotaip = NULL; | 1246 | qi->qi_gquotaip = NULL; |
| 1249 | } | 1247 | } |
| 1250 | mutex_destroy(&qi->qi_quotaofflock); | 1248 | mutex_destroy(&qi->qi_quotaofflock); |
| 1251 | kmem_free(qi, sizeof(xfs_quotainfo_t)); | 1249 | kmem_free(qi); |
| 1252 | mp->m_quotainfo = NULL; | 1250 | mp->m_quotainfo = NULL; |
| 1253 | } | 1251 | } |
| 1254 | 1252 | ||
| @@ -1394,7 +1392,7 @@ xfs_qm_qino_alloc( | |||
| 1394 | * locked exclusively and joined to the transaction already. | 1392 | * locked exclusively and joined to the transaction already. |
| 1395 | */ | 1393 | */ |
| 1396 | ASSERT(xfs_isilocked(*ip, XFS_ILOCK_EXCL)); | 1394 | ASSERT(xfs_isilocked(*ip, XFS_ILOCK_EXCL)); |
| 1397 | VN_HOLD(XFS_ITOV((*ip))); | 1395 | IHOLD(*ip); |
| 1398 | 1396 | ||
| 1399 | /* | 1397 | /* |
| 1400 | * Make the changes in the superblock, and log those too. | 1398 | * Make the changes in the superblock, and log those too. |
| @@ -1623,7 +1621,7 @@ xfs_qm_dqiterate( | |||
| 1623 | break; | 1621 | break; |
| 1624 | } while (nmaps > 0); | 1622 | } while (nmaps > 0); |
| 1625 | 1623 | ||
| 1626 | kmem_free(map, XFS_DQITER_MAP_SIZE * sizeof(*map)); | 1624 | kmem_free(map); |
| 1627 | 1625 | ||
| 1628 | return error; | 1626 | return error; |
| 1629 | } | 1627 | } |
| @@ -2079,7 +2077,7 @@ xfs_qm_shake_freelist( | |||
| 2079 | * Try to grab the flush lock. If this dquot is in the process of | 2077 | * Try to grab the flush lock. If this dquot is in the process of |
| 2080 | * getting flushed to disk, we don't want to reclaim it. | 2078 | * getting flushed to disk, we don't want to reclaim it. |
| 2081 | */ | 2079 | */ |
| 2082 | if (! xfs_qm_dqflock_nowait(dqp)) { | 2080 | if (!xfs_dqflock_nowait(dqp)) { |
| 2083 | xfs_dqunlock(dqp); | 2081 | xfs_dqunlock(dqp); |
| 2084 | dqp = dqp->dq_flnext; | 2082 | dqp = dqp->dq_flnext; |
| 2085 | continue; | 2083 | continue; |
| @@ -2257,7 +2255,7 @@ xfs_qm_dqreclaim_one(void) | |||
| 2257 | * Try to grab the flush lock. If this dquot is in the process of | 2255 | * Try to grab the flush lock. If this dquot is in the process of |
| 2258 | * getting flushed to disk, we don't want to reclaim it. | 2256 | * getting flushed to disk, we don't want to reclaim it. |
| 2259 | */ | 2257 | */ |
| 2260 | if (! xfs_qm_dqflock_nowait(dqp)) { | 2258 | if (!xfs_dqflock_nowait(dqp)) { |
| 2261 | xfs_dqunlock(dqp); | 2259 | xfs_dqunlock(dqp); |
| 2262 | continue; | 2260 | continue; |
| 2263 | } | 2261 | } |
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h index cd2300e374af..44f25349e478 100644 --- a/fs/xfs/quota/xfs_qm.h +++ b/fs/xfs/quota/xfs_qm.h | |||
| @@ -165,7 +165,7 @@ typedef struct xfs_dquot_acct { | |||
| 165 | #define XFS_QM_RELE(xqm) ((xqm)->qm_nrefs--) | 165 | #define XFS_QM_RELE(xqm) ((xqm)->qm_nrefs--) |
| 166 | 166 | ||
| 167 | extern void xfs_qm_destroy_quotainfo(xfs_mount_t *); | 167 | extern void xfs_qm_destroy_quotainfo(xfs_mount_t *); |
| 168 | extern void xfs_qm_mount_quotas(xfs_mount_t *, int); | 168 | extern void xfs_qm_mount_quotas(xfs_mount_t *); |
| 169 | extern int xfs_qm_quotacheck(xfs_mount_t *); | 169 | extern int xfs_qm_quotacheck(xfs_mount_t *); |
| 170 | extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *); | 170 | extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *); |
| 171 | extern int xfs_qm_unmount_quotas(xfs_mount_t *); | 171 | extern int xfs_qm_unmount_quotas(xfs_mount_t *); |
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c index f4f6c4c861d7..eea2e60b456b 100644 --- a/fs/xfs/quota/xfs_qm_bhv.c +++ b/fs/xfs/quota/xfs_qm_bhv.c | |||
| @@ -162,7 +162,7 @@ xfs_qm_newmount( | |||
| 162 | * mounting, and get on with the boring life | 162 | * mounting, and get on with the boring life |
| 163 | * without disk quotas. | 163 | * without disk quotas. |
| 164 | */ | 164 | */ |
| 165 | xfs_qm_mount_quotas(mp, 0); | 165 | xfs_qm_mount_quotas(mp); |
| 166 | } else { | 166 | } else { |
| 167 | /* | 167 | /* |
| 168 | * Clear the quota flags, but remember them. This | 168 | * Clear the quota flags, but remember them. This |
| @@ -184,13 +184,12 @@ STATIC int | |||
| 184 | xfs_qm_endmount( | 184 | xfs_qm_endmount( |
| 185 | xfs_mount_t *mp, | 185 | xfs_mount_t *mp, |
| 186 | uint needquotamount, | 186 | uint needquotamount, |
| 187 | uint quotaflags, | 187 | uint quotaflags) |
| 188 | int mfsi_flags) | ||
| 189 | { | 188 | { |
| 190 | if (needquotamount) { | 189 | if (needquotamount) { |
| 191 | ASSERT(mp->m_qflags == 0); | 190 | ASSERT(mp->m_qflags == 0); |
| 192 | mp->m_qflags = quotaflags; | 191 | mp->m_qflags = quotaflags; |
| 193 | xfs_qm_mount_quotas(mp, mfsi_flags); | 192 | xfs_qm_mount_quotas(mp); |
| 194 | } | 193 | } |
| 195 | 194 | ||
| 196 | #if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) | 195 | #if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) |
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 768a3b27d2b6..1a3b803dfa55 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c | |||
| @@ -362,11 +362,11 @@ xfs_qm_scall_quotaoff( | |||
| 362 | * if we don't need them anymore. | 362 | * if we don't need them anymore. |
| 363 | */ | 363 | */ |
| 364 | if ((dqtype & XFS_QMOPT_UQUOTA) && XFS_QI_UQIP(mp)) { | 364 | if ((dqtype & XFS_QMOPT_UQUOTA) && XFS_QI_UQIP(mp)) { |
| 365 | XFS_PURGE_INODE(XFS_QI_UQIP(mp)); | 365 | IRELE(XFS_QI_UQIP(mp)); |
| 366 | XFS_QI_UQIP(mp) = NULL; | 366 | XFS_QI_UQIP(mp) = NULL; |
| 367 | } | 367 | } |
| 368 | if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && XFS_QI_GQIP(mp)) { | 368 | if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && XFS_QI_GQIP(mp)) { |
| 369 | XFS_PURGE_INODE(XFS_QI_GQIP(mp)); | 369 | IRELE(XFS_QI_GQIP(mp)); |
| 370 | XFS_QI_GQIP(mp) = NULL; | 370 | XFS_QI_GQIP(mp) = NULL; |
| 371 | } | 371 | } |
| 372 | out_error: | 372 | out_error: |
| @@ -1034,7 +1034,7 @@ xfs_qm_dqrele_all_inodes( | |||
| 1034 | { | 1034 | { |
| 1035 | xfs_inode_t *ip, *topino; | 1035 | xfs_inode_t *ip, *topino; |
| 1036 | uint ireclaims; | 1036 | uint ireclaims; |
| 1037 | bhv_vnode_t *vp; | 1037 | struct inode *vp; |
| 1038 | boolean_t vnode_refd; | 1038 | boolean_t vnode_refd; |
| 1039 | 1039 | ||
| 1040 | ASSERT(mp->m_quotainfo); | 1040 | ASSERT(mp->m_quotainfo); |
| @@ -1059,7 +1059,7 @@ again: | |||
| 1059 | ip = ip->i_mnext; | 1059 | ip = ip->i_mnext; |
| 1060 | continue; | 1060 | continue; |
| 1061 | } | 1061 | } |
| 1062 | vp = XFS_ITOV_NULL(ip); | 1062 | vp = VFS_I(ip); |
| 1063 | if (!vp) { | 1063 | if (!vp) { |
| 1064 | ASSERT(ip->i_udquot == NULL); | 1064 | ASSERT(ip->i_udquot == NULL); |
| 1065 | ASSERT(ip->i_gdquot == NULL); | 1065 | ASSERT(ip->i_gdquot == NULL); |
| @@ -1449,14 +1449,14 @@ xfs_qm_internalqcheck( | |||
| 1449 | for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { | 1449 | for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { |
| 1450 | xfs_dqtest_cmp(d); | 1450 | xfs_dqtest_cmp(d); |
| 1451 | e = (xfs_dqtest_t *) d->HL_NEXT; | 1451 | e = (xfs_dqtest_t *) d->HL_NEXT; |
| 1452 | kmem_free(d, sizeof(xfs_dqtest_t)); | 1452 | kmem_free(d); |
| 1453 | d = e; | 1453 | d = e; |
| 1454 | } | 1454 | } |
| 1455 | h1 = &qmtest_gdqtab[i]; | 1455 | h1 = &qmtest_gdqtab[i]; |
| 1456 | for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { | 1456 | for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { |
| 1457 | xfs_dqtest_cmp(d); | 1457 | xfs_dqtest_cmp(d); |
| 1458 | e = (xfs_dqtest_t *) d->HL_NEXT; | 1458 | e = (xfs_dqtest_t *) d->HL_NEXT; |
| 1459 | kmem_free(d, sizeof(xfs_dqtest_t)); | 1459 | kmem_free(d); |
| 1460 | d = e; | 1460 | d = e; |
| 1461 | } | 1461 | } |
| 1462 | } | 1462 | } |
| @@ -1467,8 +1467,8 @@ xfs_qm_internalqcheck( | |||
| 1467 | } else { | 1467 | } else { |
| 1468 | cmn_err(CE_DEBUG, "******** quotacheck successful! ********"); | 1468 | cmn_err(CE_DEBUG, "******** quotacheck successful! ********"); |
| 1469 | } | 1469 | } |
| 1470 | kmem_free(qmtest_udqtab, qmtest_hashmask * sizeof(xfs_dqhash_t)); | 1470 | kmem_free(qmtest_udqtab); |
| 1471 | kmem_free(qmtest_gdqtab, qmtest_hashmask * sizeof(xfs_dqhash_t)); | 1471 | kmem_free(qmtest_gdqtab); |
| 1472 | mutex_unlock(&qcheck_lock); | 1472 | mutex_unlock(&qcheck_lock); |
| 1473 | return (qmtest_nfails); | 1473 | return (qmtest_nfails); |
| 1474 | } | 1474 | } |
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h index 5e4a40b1c565..c4fcea600bc2 100644 --- a/fs/xfs/quota/xfs_quota_priv.h +++ b/fs/xfs/quota/xfs_quota_priv.h | |||
| @@ -158,9 +158,6 @@ for ((dqp) = (qlist)->qh_next; (dqp) != (xfs_dquot_t *)(qlist); \ | |||
| 158 | #define XFS_IS_SUSER_DQUOT(dqp) \ | 158 | #define XFS_IS_SUSER_DQUOT(dqp) \ |
| 159 | (!((dqp)->q_core.d_id)) | 159 | (!((dqp)->q_core.d_id)) |
| 160 | 160 | ||
| 161 | #define XFS_PURGE_INODE(ip) \ | ||
| 162 | IRELE(ip); | ||
| 163 | |||
| 164 | #define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \ | 161 | #define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \ |
| 165 | (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \ | 162 | (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \ |
| 166 | (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???"))) | 163 | (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???"))) |
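Note on the quotaoff hunk above: the removed XFS_PURGE_INODE() wrapper expanded to nothing more than IRELE(ip), so open-coding IRELE() on the quota inodes is behaviourally identical and the macro can be dropped from xfs_quota_priv.h. A minimal sketch of the resulting teardown, using only names visible in the hunks above:

	/* Release the quota inodes directly; XFS_PURGE_INODE() was a bare IRELE(). */
	if ((dqtype & XFS_QMOPT_UQUOTA) && XFS_QI_UQIP(mp)) {
		IRELE(XFS_QI_UQIP(mp));		/* drop the user-quota inode reference */
		XFS_QI_UQIP(mp) = NULL;
	}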
diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c index 0b75d302508f..a34ef05489b1 100644 --- a/fs/xfs/support/ktrace.c +++ b/fs/xfs/support/ktrace.c | |||
| @@ -89,7 +89,7 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep) | |||
| 89 | if (sleep & KM_SLEEP) | 89 | if (sleep & KM_SLEEP) |
| 90 | panic("ktrace_alloc: NULL memory on KM_SLEEP request!"); | 90 | panic("ktrace_alloc: NULL memory on KM_SLEEP request!"); |
| 91 | 91 | ||
| 92 | kmem_free(ktp, sizeof(*ktp)); | 92 | kmem_free(ktp); |
| 93 | 93 | ||
| 94 | return NULL; | 94 | return NULL; |
| 95 | } | 95 | } |
| @@ -126,7 +126,7 @@ ktrace_free(ktrace_t *ktp) | |||
| 126 | } else { | 126 | } else { |
| 127 | entries_size = (int)(ktp->kt_nentries * sizeof(ktrace_entry_t)); | 127 | entries_size = (int)(ktp->kt_nentries * sizeof(ktrace_entry_t)); |
| 128 | 128 | ||
| 129 | kmem_free(ktp->kt_entries, entries_size); | 129 | kmem_free(ktp->kt_entries); |
| 130 | } | 130 | } |
| 131 | 131 | ||
| 132 | kmem_zone_free(ktrace_hdr_zone, ktp); | 132 | kmem_zone_free(ktrace_hdr_zone, ktp); |
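Both hunks above reflect kmem_free() losing its byte-count argument: callers now pass just the pointer, in the same way as kfree(). A hedged before/after sketch of the call-site change (this is the XFS-internal kmem_free() as it appears in these hunks, not a generic library call):

	/* Old convention: the allocation size had to be carried around
	 * and handed back at free time.
	 */
	kmem_free(ktp->kt_entries, entries_size);

	/* New convention: only the pointer is needed, so the size
	 * bookkeeping at the call sites can be dropped entirely.
	 */
	kmem_free(ktp->kt_entries);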
diff --git a/fs/xfs/support/uuid.c b/fs/xfs/support/uuid.c index 493a6ecf8590..5830c040ea7e 100644 --- a/fs/xfs/support/uuid.c +++ b/fs/xfs/support/uuid.c | |||
| @@ -17,7 +17,7 @@ | |||
| 17 | */ | 17 | */ |
| 18 | #include <xfs.h> | 18 | #include <xfs.h> |
| 19 | 19 | ||
| 20 | static mutex_t uuid_monitor; | 20 | static DEFINE_MUTEX(uuid_monitor); |
| 21 | static int uuid_table_size; | 21 | static int uuid_table_size; |
| 22 | static uuid_t *uuid_table; | 22 | static uuid_t *uuid_table; |
| 23 | 23 | ||
| @@ -132,9 +132,3 @@ uuid_table_remove(uuid_t *uuid) | |||
| 132 | ASSERT(i < uuid_table_size); | 132 | ASSERT(i < uuid_table_size); |
| 133 | mutex_unlock(&uuid_monitor); | 133 | mutex_unlock(&uuid_monitor); |
| 134 | } | 134 | } |
| 135 | |||
| 136 | void __init | ||
| 137 | uuid_init(void) | ||
| 138 | { | ||
| 139 | mutex_init(&uuid_monitor); | ||
| 140 | } | ||
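With the monitor lock declared via DEFINE_MUTEX(), it is initialised statically at compile time, so the uuid_init() function whose only job was mutex_init() is removed here (and its prototype goes away in the uuid.h hunk below). A minimal sketch of the pattern, assuming the standard <linux/mutex.h> API:

	#include <linux/mutex.h>

	static DEFINE_MUTEX(uuid_monitor);	/* statically initialised; no init call needed */

	static void uuid_table_example(void)
	{
		mutex_lock(&uuid_monitor);
		/* ... manipulate uuid_table / uuid_table_size under the lock ... */
		mutex_unlock(&uuid_monitor);
	}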
diff --git a/fs/xfs/support/uuid.h b/fs/xfs/support/uuid.h index b6f5922199ba..cff5b607d445 100644 --- a/fs/xfs/support/uuid.h +++ b/fs/xfs/support/uuid.h | |||
| @@ -22,7 +22,6 @@ typedef struct { | |||
| 22 | unsigned char __u_bits[16]; | 22 | unsigned char __u_bits[16]; |
| 23 | } uuid_t; | 23 | } uuid_t; |
| 24 | 24 | ||
| 25 | extern void uuid_init(void); | ||
| 26 | extern void uuid_create_nil(uuid_t *uuid); | 25 | extern void uuid_create_nil(uuid_t *uuid); |
| 27 | extern int uuid_is_nil(uuid_t *uuid); | 26 | extern int uuid_is_nil(uuid_t *uuid); |
| 28 | extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2); | 27 | extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2); |
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index ebee3a4f703a..b2f639a1416f 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c | |||
| @@ -37,15 +37,15 @@ | |||
| 37 | #include <linux/capability.h> | 37 | #include <linux/capability.h> |
| 38 | #include <linux/posix_acl_xattr.h> | 38 | #include <linux/posix_acl_xattr.h> |
| 39 | 39 | ||
| 40 | STATIC int xfs_acl_setmode(bhv_vnode_t *, xfs_acl_t *, int *); | 40 | STATIC int xfs_acl_setmode(struct inode *, xfs_acl_t *, int *); |
| 41 | STATIC void xfs_acl_filter_mode(mode_t, xfs_acl_t *); | 41 | STATIC void xfs_acl_filter_mode(mode_t, xfs_acl_t *); |
| 42 | STATIC void xfs_acl_get_endian(xfs_acl_t *); | 42 | STATIC void xfs_acl_get_endian(xfs_acl_t *); |
| 43 | STATIC int xfs_acl_access(uid_t, gid_t, xfs_acl_t *, mode_t, cred_t *); | 43 | STATIC int xfs_acl_access(uid_t, gid_t, xfs_acl_t *, mode_t, cred_t *); |
| 44 | STATIC int xfs_acl_invalid(xfs_acl_t *); | 44 | STATIC int xfs_acl_invalid(xfs_acl_t *); |
| 45 | STATIC void xfs_acl_sync_mode(mode_t, xfs_acl_t *); | 45 | STATIC void xfs_acl_sync_mode(mode_t, xfs_acl_t *); |
| 46 | STATIC void xfs_acl_get_attr(bhv_vnode_t *, xfs_acl_t *, int, int, int *); | 46 | STATIC void xfs_acl_get_attr(struct inode *, xfs_acl_t *, int, int, int *); |
| 47 | STATIC void xfs_acl_set_attr(bhv_vnode_t *, xfs_acl_t *, int, int *); | 47 | STATIC void xfs_acl_set_attr(struct inode *, xfs_acl_t *, int, int *); |
| 48 | STATIC int xfs_acl_allow_set(bhv_vnode_t *, int); | 48 | STATIC int xfs_acl_allow_set(struct inode *, int); |
| 49 | 49 | ||
| 50 | kmem_zone_t *xfs_acl_zone; | 50 | kmem_zone_t *xfs_acl_zone; |
| 51 | 51 | ||
| @@ -55,7 +55,7 @@ kmem_zone_t *xfs_acl_zone; | |||
| 55 | */ | 55 | */ |
| 56 | int | 56 | int |
| 57 | xfs_acl_vhasacl_access( | 57 | xfs_acl_vhasacl_access( |
| 58 | bhv_vnode_t *vp) | 58 | struct inode *vp) |
| 59 | { | 59 | { |
| 60 | int error; | 60 | int error; |
| 61 | 61 | ||
| @@ -68,7 +68,7 @@ xfs_acl_vhasacl_access( | |||
| 68 | */ | 68 | */ |
| 69 | int | 69 | int |
| 70 | xfs_acl_vhasacl_default( | 70 | xfs_acl_vhasacl_default( |
| 71 | bhv_vnode_t *vp) | 71 | struct inode *vp) |
| 72 | { | 72 | { |
| 73 | int error; | 73 | int error; |
| 74 | 74 | ||
| @@ -207,7 +207,7 @@ posix_acl_xfs_to_xattr( | |||
| 207 | 207 | ||
| 208 | int | 208 | int |
| 209 | xfs_acl_vget( | 209 | xfs_acl_vget( |
| 210 | bhv_vnode_t *vp, | 210 | struct inode *vp, |
| 211 | void *acl, | 211 | void *acl, |
| 212 | size_t size, | 212 | size_t size, |
| 213 | int kind) | 213 | int kind) |
| @@ -217,7 +217,6 @@ xfs_acl_vget( | |||
| 217 | posix_acl_xattr_header *ext_acl = acl; | 217 | posix_acl_xattr_header *ext_acl = acl; |
| 218 | int flags = 0; | 218 | int flags = 0; |
| 219 | 219 | ||
| 220 | VN_HOLD(vp); | ||
| 221 | if(size) { | 220 | if(size) { |
| 222 | if (!(_ACL_ALLOC(xfs_acl))) { | 221 | if (!(_ACL_ALLOC(xfs_acl))) { |
| 223 | error = ENOMEM; | 222 | error = ENOMEM; |
| @@ -239,11 +238,10 @@ xfs_acl_vget( | |||
| 239 | goto out; | 238 | goto out; |
| 240 | } | 239 | } |
| 241 | if (kind == _ACL_TYPE_ACCESS) | 240 | if (kind == _ACL_TYPE_ACCESS) |
| 242 | xfs_acl_sync_mode(xfs_vtoi(vp)->i_d.di_mode, xfs_acl); | 241 | xfs_acl_sync_mode(XFS_I(vp)->i_d.di_mode, xfs_acl); |
| 243 | error = -posix_acl_xfs_to_xattr(xfs_acl, ext_acl, size); | 242 | error = -posix_acl_xfs_to_xattr(xfs_acl, ext_acl, size); |
| 244 | } | 243 | } |
| 245 | out: | 244 | out: |
| 246 | VN_RELE(vp); | ||
| 247 | if(xfs_acl) | 245 | if(xfs_acl) |
| 248 | _ACL_FREE(xfs_acl); | 246 | _ACL_FREE(xfs_acl); |
| 249 | return -error; | 247 | return -error; |
| @@ -251,28 +249,26 @@ out: | |||
| 251 | 249 | ||
| 252 | int | 250 | int |
| 253 | xfs_acl_vremove( | 251 | xfs_acl_vremove( |
| 254 | bhv_vnode_t *vp, | 252 | struct inode *vp, |
| 255 | int kind) | 253 | int kind) |
| 256 | { | 254 | { |
| 257 | int error; | 255 | int error; |
| 258 | 256 | ||
| 259 | VN_HOLD(vp); | ||
| 260 | error = xfs_acl_allow_set(vp, kind); | 257 | error = xfs_acl_allow_set(vp, kind); |
| 261 | if (!error) { | 258 | if (!error) { |
| 262 | error = xfs_attr_remove(xfs_vtoi(vp), | 259 | error = xfs_attr_remove(XFS_I(vp), |
| 263 | kind == _ACL_TYPE_DEFAULT? | 260 | kind == _ACL_TYPE_DEFAULT? |
| 264 | SGI_ACL_DEFAULT: SGI_ACL_FILE, | 261 | SGI_ACL_DEFAULT: SGI_ACL_FILE, |
| 265 | ATTR_ROOT); | 262 | ATTR_ROOT); |
| 266 | if (error == ENOATTR) | 263 | if (error == ENOATTR) |
| 267 | error = 0; /* 'scool */ | 264 | error = 0; /* 'scool */ |
| 268 | } | 265 | } |
| 269 | VN_RELE(vp); | ||
| 270 | return -error; | 266 | return -error; |
| 271 | } | 267 | } |
| 272 | 268 | ||
| 273 | int | 269 | int |
| 274 | xfs_acl_vset( | 270 | xfs_acl_vset( |
| 275 | bhv_vnode_t *vp, | 271 | struct inode *vp, |
| 276 | void *acl, | 272 | void *acl, |
| 277 | size_t size, | 273 | size_t size, |
| 278 | int kind) | 274 | int kind) |
| @@ -298,7 +294,6 @@ xfs_acl_vset( | |||
| 298 | return 0; | 294 | return 0; |
| 299 | } | 295 | } |
| 300 | 296 | ||
| 301 | VN_HOLD(vp); | ||
| 302 | error = xfs_acl_allow_set(vp, kind); | 297 | error = xfs_acl_allow_set(vp, kind); |
| 303 | 298 | ||
| 304 | /* Incoming ACL exists, set file mode based on its value */ | 299 | /* Incoming ACL exists, set file mode based on its value */ |
| @@ -321,7 +316,6 @@ xfs_acl_vset( | |||
| 321 | } | 316 | } |
| 322 | 317 | ||
| 323 | out: | 318 | out: |
| 324 | VN_RELE(vp); | ||
| 325 | _ACL_FREE(xfs_acl); | 319 | _ACL_FREE(xfs_acl); |
| 326 | return -error; | 320 | return -error; |
| 327 | } | 321 | } |
| @@ -341,8 +335,7 @@ xfs_acl_iaccess( | |||
| 341 | 335 | ||
| 342 | /* If the file has no ACL return -1. */ | 336 | /* If the file has no ACL return -1. */ |
| 343 | rval = sizeof(xfs_acl_t); | 337 | rval = sizeof(xfs_acl_t); |
| 344 | if (xfs_attr_fetch(ip, &acl_name, (char *)acl, &rval, | 338 | if (xfs_attr_fetch(ip, &acl_name, (char *)acl, &rval, ATTR_ROOT)) { |
| 345 | ATTR_ROOT | ATTR_KERNACCESS)) { | ||
| 346 | _ACL_FREE(acl); | 339 | _ACL_FREE(acl); |
| 347 | return -1; | 340 | return -1; |
| 348 | } | 341 | } |
| @@ -364,7 +357,7 @@ xfs_acl_iaccess( | |||
| 364 | 357 | ||
| 365 | STATIC int | 358 | STATIC int |
| 366 | xfs_acl_allow_set( | 359 | xfs_acl_allow_set( |
| 367 | bhv_vnode_t *vp, | 360 | struct inode *vp, |
| 368 | int kind) | 361 | int kind) |
| 369 | { | 362 | { |
| 370 | if (vp->i_flags & (S_IMMUTABLE|S_APPEND)) | 363 | if (vp->i_flags & (S_IMMUTABLE|S_APPEND)) |
| @@ -373,7 +366,7 @@ xfs_acl_allow_set( | |||
| 373 | return ENOTDIR; | 366 | return ENOTDIR; |
| 374 | if (vp->i_sb->s_flags & MS_RDONLY) | 367 | if (vp->i_sb->s_flags & MS_RDONLY) |
| 375 | return EROFS; | 368 | return EROFS; |
| 376 | if (xfs_vtoi(vp)->i_d.di_uid != current->fsuid && !capable(CAP_FOWNER)) | 369 | if (XFS_I(vp)->i_d.di_uid != current->fsuid && !capable(CAP_FOWNER)) |
| 377 | return EPERM; | 370 | return EPERM; |
| 378 | return 0; | 371 | return 0; |
| 379 | } | 372 | } |
| @@ -567,7 +560,7 @@ xfs_acl_get_endian( | |||
| 567 | */ | 560 | */ |
| 568 | STATIC void | 561 | STATIC void |
| 569 | xfs_acl_get_attr( | 562 | xfs_acl_get_attr( |
| 570 | bhv_vnode_t *vp, | 563 | struct inode *vp, |
| 571 | xfs_acl_t *aclp, | 564 | xfs_acl_t *aclp, |
| 572 | int kind, | 565 | int kind, |
| 573 | int flags, | 566 | int flags, |
| @@ -577,7 +570,7 @@ xfs_acl_get_attr( | |||
| 577 | 570 | ||
| 578 | ASSERT((flags & ATTR_KERNOVAL) ? (aclp == NULL) : 1); | 571 | ASSERT((flags & ATTR_KERNOVAL) ? (aclp == NULL) : 1); |
| 579 | flags |= ATTR_ROOT; | 572 | flags |= ATTR_ROOT; |
| 580 | *error = xfs_attr_get(xfs_vtoi(vp), | 573 | *error = xfs_attr_get(XFS_I(vp), |
| 581 | kind == _ACL_TYPE_ACCESS ? | 574 | kind == _ACL_TYPE_ACCESS ? |
| 582 | SGI_ACL_FILE : SGI_ACL_DEFAULT, | 575 | SGI_ACL_FILE : SGI_ACL_DEFAULT, |
| 583 | (char *)aclp, &len, flags); | 576 | (char *)aclp, &len, flags); |
| @@ -591,7 +584,7 @@ xfs_acl_get_attr( | |||
| 591 | */ | 584 | */ |
| 592 | STATIC void | 585 | STATIC void |
| 593 | xfs_acl_set_attr( | 586 | xfs_acl_set_attr( |
| 594 | bhv_vnode_t *vp, | 587 | struct inode *vp, |
| 595 | xfs_acl_t *aclp, | 588 | xfs_acl_t *aclp, |
| 596 | int kind, | 589 | int kind, |
| 597 | int *error) | 590 | int *error) |
| @@ -616,7 +609,7 @@ xfs_acl_set_attr( | |||
| 616 | INT_SET(newace->ae_perm, ARCH_CONVERT, ace->ae_perm); | 609 | INT_SET(newace->ae_perm, ARCH_CONVERT, ace->ae_perm); |
| 617 | } | 610 | } |
| 618 | INT_SET(newacl->acl_cnt, ARCH_CONVERT, aclp->acl_cnt); | 611 | INT_SET(newacl->acl_cnt, ARCH_CONVERT, aclp->acl_cnt); |
| 619 | *error = xfs_attr_set(xfs_vtoi(vp), | 612 | *error = xfs_attr_set(XFS_I(vp), |
| 620 | kind == _ACL_TYPE_ACCESS ? | 613 | kind == _ACL_TYPE_ACCESS ? |
| 621 | SGI_ACL_FILE: SGI_ACL_DEFAULT, | 614 | SGI_ACL_FILE: SGI_ACL_DEFAULT, |
| 622 | (char *)newacl, len, ATTR_ROOT); | 615 | (char *)newacl, len, ATTR_ROOT); |
| @@ -625,7 +618,7 @@ xfs_acl_set_attr( | |||
| 625 | 618 | ||
| 626 | int | 619 | int |
| 627 | xfs_acl_vtoacl( | 620 | xfs_acl_vtoacl( |
| 628 | bhv_vnode_t *vp, | 621 | struct inode *vp, |
| 629 | xfs_acl_t *access_acl, | 622 | xfs_acl_t *access_acl, |
| 630 | xfs_acl_t *default_acl) | 623 | xfs_acl_t *default_acl) |
| 631 | { | 624 | { |
| @@ -640,7 +633,7 @@ xfs_acl_vtoacl( | |||
| 640 | if (error) | 633 | if (error) |
| 641 | access_acl->acl_cnt = XFS_ACL_NOT_PRESENT; | 634 | access_acl->acl_cnt = XFS_ACL_NOT_PRESENT; |
| 642 | else /* We have a good ACL and the file mode, synchronize. */ | 635 | else /* We have a good ACL and the file mode, synchronize. */ |
| 643 | xfs_acl_sync_mode(xfs_vtoi(vp)->i_d.di_mode, access_acl); | 636 | xfs_acl_sync_mode(XFS_I(vp)->i_d.di_mode, access_acl); |
| 644 | } | 637 | } |
| 645 | 638 | ||
| 646 | if (default_acl) { | 639 | if (default_acl) { |
| @@ -657,7 +650,7 @@ xfs_acl_vtoacl( | |||
| 657 | */ | 650 | */ |
| 658 | int | 651 | int |
| 659 | xfs_acl_inherit( | 652 | xfs_acl_inherit( |
| 660 | bhv_vnode_t *vp, | 653 | struct inode *vp, |
| 661 | mode_t mode, | 654 | mode_t mode, |
| 662 | xfs_acl_t *pdaclp) | 655 | xfs_acl_t *pdaclp) |
| 663 | { | 656 | { |
| @@ -716,11 +709,11 @@ out_error: | |||
| 716 | */ | 709 | */ |
| 717 | STATIC int | 710 | STATIC int |
| 718 | xfs_acl_setmode( | 711 | xfs_acl_setmode( |
| 719 | bhv_vnode_t *vp, | 712 | struct inode *vp, |
| 720 | xfs_acl_t *acl, | 713 | xfs_acl_t *acl, |
| 721 | int *basicperms) | 714 | int *basicperms) |
| 722 | { | 715 | { |
| 723 | bhv_vattr_t va; | 716 | struct iattr iattr; |
| 724 | xfs_acl_entry_t *ap; | 717 | xfs_acl_entry_t *ap; |
| 725 | xfs_acl_entry_t *gap = NULL; | 718 | xfs_acl_entry_t *gap = NULL; |
| 726 | int i, nomask = 1; | 719 | int i, nomask = 1; |
| @@ -734,25 +727,25 @@ xfs_acl_setmode( | |||
| 734 | * Copy the u::, g::, o::, and m:: bits from the ACL into the | 727 | * Copy the u::, g::, o::, and m:: bits from the ACL into the |
| 735 | * mode. The m:: bits take precedence over the g:: bits. | 728 | * mode. The m:: bits take precedence over the g:: bits. |
| 736 | */ | 729 | */ |
| 737 | va.va_mask = XFS_AT_MODE; | 730 | iattr.ia_valid = ATTR_MODE; |
| 738 | va.va_mode = xfs_vtoi(vp)->i_d.di_mode; | 731 | iattr.ia_mode = XFS_I(vp)->i_d.di_mode; |
| 739 | va.va_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO); | 732 | iattr.ia_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO); |
| 740 | ap = acl->acl_entry; | 733 | ap = acl->acl_entry; |
| 741 | for (i = 0; i < acl->acl_cnt; ++i) { | 734 | for (i = 0; i < acl->acl_cnt; ++i) { |
| 742 | switch (ap->ae_tag) { | 735 | switch (ap->ae_tag) { |
| 743 | case ACL_USER_OBJ: | 736 | case ACL_USER_OBJ: |
| 744 | va.va_mode |= ap->ae_perm << 6; | 737 | iattr.ia_mode |= ap->ae_perm << 6; |
| 745 | break; | 738 | break; |
| 746 | case ACL_GROUP_OBJ: | 739 | case ACL_GROUP_OBJ: |
| 747 | gap = ap; | 740 | gap = ap; |
| 748 | break; | 741 | break; |
| 749 | case ACL_MASK: /* more than just standard modes */ | 742 | case ACL_MASK: /* more than just standard modes */ |
| 750 | nomask = 0; | 743 | nomask = 0; |
| 751 | va.va_mode |= ap->ae_perm << 3; | 744 | iattr.ia_mode |= ap->ae_perm << 3; |
| 752 | *basicperms = 0; | 745 | *basicperms = 0; |
| 753 | break; | 746 | break; |
| 754 | case ACL_OTHER: | 747 | case ACL_OTHER: |
| 755 | va.va_mode |= ap->ae_perm; | 748 | iattr.ia_mode |= ap->ae_perm; |
| 756 | break; | 749 | break; |
| 757 | default: /* more than just standard modes */ | 750 | default: /* more than just standard modes */ |
| 758 | *basicperms = 0; | 751 | *basicperms = 0; |
| @@ -763,9 +756,9 @@ xfs_acl_setmode( | |||
| 763 | 756 | ||
| 764 | /* Set the group bits from ACL_GROUP_OBJ if there's no ACL_MASK */ | 757 | /* Set the group bits from ACL_GROUP_OBJ if there's no ACL_MASK */ |
| 765 | if (gap && nomask) | 758 | if (gap && nomask) |
| 766 | va.va_mode |= gap->ae_perm << 3; | 759 | iattr.ia_mode |= gap->ae_perm << 3; |
| 767 | 760 | ||
| 768 | return xfs_setattr(xfs_vtoi(vp), &va, 0, sys_cred); | 761 | return xfs_setattr(XFS_I(vp), &iattr, 0, sys_cred); |
| 769 | } | 762 | } |
| 770 | 763 | ||
| 771 | /* | 764 | /* |
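The ACL hunks above switch the helpers from the old bhv_vnode_t handle to a plain struct inode *, reach the XFS inode through XFS_I(), and build a struct iattr (ia_valid/ia_mode) instead of a bhv_vattr_t when pushing a mode change through xfs_setattr(). A condensed sketch of that pattern, using only calls that appear in the hunk; mode_bits_from_acl is a hypothetical placeholder for the u::/g::/o::/m:: bits accumulated in the loop:

	/* Sketch: derive the basic mode bits from an ACL and apply them. */
	struct iattr	iattr;

	iattr.ia_valid = ATTR_MODE;
	iattr.ia_mode  = XFS_I(inode)->i_d.di_mode & ~(S_IRWXU|S_IRWXG|S_IRWXO);
	iattr.ia_mode |= mode_bits_from_acl;	/* hypothetical: folded-in ACL permission bits */

	error = xfs_setattr(XFS_I(inode), &iattr, 0, sys_cred);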
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 332a772461c4..a4e293b93efa 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h | |||
| @@ -46,6 +46,8 @@ typedef struct xfs_acl { | |||
| 46 | #define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1) | 46 | #define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1) |
| 47 | #define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) | 47 | #define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) |
| 48 | 48 | ||
| 49 | #define _ACL_TYPE_ACCESS 1 | ||
| 50 | #define _ACL_TYPE_DEFAULT 2 | ||
| 49 | 51 | ||
| 50 | #ifdef CONFIG_XFS_POSIX_ACL | 52 | #ifdef CONFIG_XFS_POSIX_ACL |
| 51 | 53 | ||
| @@ -57,17 +59,15 @@ extern struct kmem_zone *xfs_acl_zone; | |||
| 57 | (zone) = kmem_zone_init(sizeof(xfs_acl_t), (name)) | 59 | (zone) = kmem_zone_init(sizeof(xfs_acl_t), (name)) |
| 58 | #define xfs_acl_zone_destroy(zone) kmem_zone_destroy(zone) | 60 | #define xfs_acl_zone_destroy(zone) kmem_zone_destroy(zone) |
| 59 | 61 | ||
| 60 | extern int xfs_acl_inherit(bhv_vnode_t *, mode_t mode, xfs_acl_t *); | 62 | extern int xfs_acl_inherit(struct inode *, mode_t mode, xfs_acl_t *); |
| 61 | extern int xfs_acl_iaccess(struct xfs_inode *, mode_t, cred_t *); | 63 | extern int xfs_acl_iaccess(struct xfs_inode *, mode_t, cred_t *); |
| 62 | extern int xfs_acl_vtoacl(bhv_vnode_t *, xfs_acl_t *, xfs_acl_t *); | 64 | extern int xfs_acl_vtoacl(struct inode *, xfs_acl_t *, xfs_acl_t *); |
| 63 | extern int xfs_acl_vhasacl_access(bhv_vnode_t *); | 65 | extern int xfs_acl_vhasacl_access(struct inode *); |
| 64 | extern int xfs_acl_vhasacl_default(bhv_vnode_t *); | 66 | extern int xfs_acl_vhasacl_default(struct inode *); |
| 65 | extern int xfs_acl_vset(bhv_vnode_t *, void *, size_t, int); | 67 | extern int xfs_acl_vset(struct inode *, void *, size_t, int); |
| 66 | extern int xfs_acl_vget(bhv_vnode_t *, void *, size_t, int); | 68 | extern int xfs_acl_vget(struct inode *, void *, size_t, int); |
| 67 | extern int xfs_acl_vremove(bhv_vnode_t *, int); | 69 | extern int xfs_acl_vremove(struct inode *, int); |
| 68 | 70 | ||
| 69 | #define _ACL_TYPE_ACCESS 1 | ||
| 70 | #define _ACL_TYPE_DEFAULT 2 | ||
| 71 | #define _ACL_PERM_INVALID(perm) ((perm) & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE)) | 71 | #define _ACL_PERM_INVALID(perm) ((perm) & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE)) |
| 72 | 72 | ||
| 73 | #define _ACL_INHERIT(c,m,d) (xfs_acl_inherit(c,m,d)) | 73 | #define _ACL_INHERIT(c,m,d) (xfs_acl_inherit(c,m,d)) |
diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h index f9472a2076d4..0b3b5efe848c 100644 --- a/fs/xfs/xfs_arch.h +++ b/fs/xfs/xfs_arch.h | |||
| @@ -92,16 +92,6 @@ | |||
| 92 | ((__u8*)(pointer))[1] = (((value) ) & 0xff); \ | 92 | ((__u8*)(pointer))[1] = (((value) ) & 0xff); \ |
| 93 | } | 93 | } |
| 94 | 94 | ||
| 95 | /* define generic INT_ macros */ | ||
| 96 | |||
| 97 | #define INT_GET(reference,arch) \ | ||
| 98 | (((arch) == ARCH_NOCONVERT) \ | ||
| 99 | ? \ | ||
| 100 | (reference) \ | ||
| 101 | : \ | ||
| 102 | INT_SWAP((reference),(reference)) \ | ||
| 103 | ) | ||
| 104 | |||
| 105 | /* does not return a value */ | 95 | /* does not return a value */ |
| 106 | #define INT_SET(reference,arch,valueref) \ | 96 | #define INT_SET(reference,arch,valueref) \ |
| 107 | (__builtin_constant_p(valueref) ? \ | 97 | (__builtin_constant_p(valueref) ? \ |
| @@ -112,64 +102,6 @@ | |||
| 112 | ) \ | 102 | ) \ |
| 113 | ) | 103 | ) |
| 114 | 104 | ||
| 115 | /* does not return a value */ | ||
| 116 | #define INT_MOD_EXPR(reference,arch,code) \ | ||
| 117 | (((arch) == ARCH_NOCONVERT) \ | ||
| 118 | ? \ | ||
| 119 | (void)((reference) code) \ | ||
| 120 | : \ | ||
| 121 | (void)( \ | ||
| 122 | (reference) = INT_GET((reference),arch) , \ | ||
| 123 | ((reference) code), \ | ||
| 124 | INT_SET(reference, arch, reference) \ | ||
| 125 | ) \ | ||
| 126 | ) | ||
| 127 | |||
| 128 | /* does not return a value */ | ||
| 129 | #define INT_MOD(reference,arch,delta) \ | ||
| 130 | (void)( \ | ||
| 131 | INT_MOD_EXPR(reference,arch,+=(delta)) \ | ||
| 132 | ) | ||
| 133 | |||
| 134 | /* | ||
| 135 | * INT_COPY - copy a value between two locations with the | ||
| 136 | * _same architecture_ but _potentially different sizes_ | ||
| 137 | * | ||
| 138 | * if the types of the two parameters are equal or they are | ||
| 139 | * in native architecture, a simple copy is done | ||
| 140 | * | ||
| 141 | * otherwise, architecture conversions are done | ||
| 142 | * | ||
| 143 | */ | ||
| 144 | |||
| 145 | /* does not return a value */ | ||
| 146 | #define INT_COPY(dst,src,arch) \ | ||
| 147 | ( \ | ||
| 148 | ((sizeof(dst) == sizeof(src)) || ((arch) == ARCH_NOCONVERT)) \ | ||
| 149 | ? \ | ||
| 150 | (void)((dst) = (src)) \ | ||
| 151 | : \ | ||
| 152 | INT_SET(dst, arch, INT_GET(src, arch)) \ | ||
| 153 | ) | ||
| 154 | |||
| 155 | /* | ||
| 156 | * INT_XLATE - copy a value in either direction between two locations | ||
| 157 | * with different architectures | ||
| 158 | * | ||
| 159 | * dir < 0 - copy from memory to buffer (native to arch) | ||
| 160 | * dir > 0 - copy from buffer to memory (arch to native) | ||
| 161 | */ | ||
| 162 | |||
| 163 | /* does not return a value */ | ||
| 164 | #define INT_XLATE(buf,mem,dir,arch) {\ | ||
| 165 | ASSERT(dir); \ | ||
| 166 | if (dir>0) { \ | ||
| 167 | (mem)=INT_GET(buf, arch); \ | ||
| 168 | } else { \ | ||
| 169 | INT_SET(buf, arch, mem); \ | ||
| 170 | } \ | ||
| 171 | } | ||
| 172 | |||
| 173 | /* | 105 | /* |
| 174 | * In directories inode numbers are stored as unaligned arrays of unsigned | 106 | * In directories inode numbers are stored as unaligned arrays of unsigned |
| 175 | * 8bit integers on disk. | 107 | * 8bit integers on disk. |
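The xfs_arch.h hunk drops the generic INT_GET/INT_MOD/INT_COPY/INT_XLATE wrappers; the code that remains in this series already reads fixed-endian on-disk fields through the ordinary be16_to_cpu()/be32_to_cpu() helpers (visible in the trace hunks further down). A hedged sketch of the replacement style, offered as illustration rather than as a claim about every former call site:

	/* Old style: runtime-dispatched conversion through INT_GET(). */
	count = INT_GET(node->hdr.count, ARCH_CONVERT);

	/* Preferred style: the on-disk field is a fixed-endian type
	 * (__be16 here), so the conversion is explicit and type-checked.
	 */
	count = be16_to_cpu(node->hdr.count);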
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index df151a859186..f7cdc28aff41 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c | |||
| @@ -16,8 +16,6 @@ | |||
| 16 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | 16 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA |
| 17 | */ | 17 | */ |
| 18 | 18 | ||
| 19 | #include <linux/capability.h> | ||
| 20 | |||
| 21 | #include "xfs.h" | 19 | #include "xfs.h" |
| 22 | #include "xfs_fs.h" | 20 | #include "xfs_fs.h" |
| 23 | #include "xfs_types.h" | 21 | #include "xfs_types.h" |
| @@ -57,11 +55,6 @@ | |||
| 57 | * Provide the external interfaces to manage attribute lists. | 55 | * Provide the external interfaces to manage attribute lists. |
| 58 | */ | 56 | */ |
| 59 | 57 | ||
| 60 | #define ATTR_SYSCOUNT 2 | ||
| 61 | static struct attrnames posix_acl_access; | ||
| 62 | static struct attrnames posix_acl_default; | ||
| 63 | static struct attrnames *attr_system_names[ATTR_SYSCOUNT]; | ||
| 64 | |||
| 65 | /*======================================================================== | 58 | /*======================================================================== |
| 66 | * Function prototypes for the kernel. | 59 | * Function prototypes for the kernel. |
| 67 | *========================================================================*/ | 60 | *========================================================================*/ |
| @@ -116,6 +109,17 @@ xfs_attr_name_to_xname( | |||
| 116 | return 0; | 109 | return 0; |
| 117 | } | 110 | } |
| 118 | 111 | ||
| 112 | STATIC int | ||
| 113 | xfs_inode_hasattr( | ||
| 114 | struct xfs_inode *ip) | ||
| 115 | { | ||
| 116 | if (!XFS_IFORK_Q(ip) || | ||
| 117 | (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && | ||
| 118 | ip->i_d.di_anextents == 0)) | ||
| 119 | return 0; | ||
| 120 | return 1; | ||
| 121 | } | ||
| 122 | |||
| 119 | /*======================================================================== | 123 | /*======================================================================== |
| 120 | * Overall external interface routines. | 124 | * Overall external interface routines. |
| 121 | *========================================================================*/ | 125 | *========================================================================*/ |
| @@ -127,10 +131,8 @@ xfs_attr_fetch(xfs_inode_t *ip, struct xfs_name *name, | |||
| 127 | xfs_da_args_t args; | 131 | xfs_da_args_t args; |
| 128 | int error; | 132 | int error; |
| 129 | 133 | ||
| 130 | if ((XFS_IFORK_Q(ip) == 0) || | 134 | if (!xfs_inode_hasattr(ip)) |
| 131 | (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && | 135 | return ENOATTR; |
| 132 | ip->i_d.di_anextents == 0)) | ||
| 133 | return(ENOATTR); | ||
| 134 | 136 | ||
| 135 | /* | 137 | /* |
| 136 | * Fill in the arg structure for this request. | 138 | * Fill in the arg structure for this request. |
| @@ -148,11 +150,7 @@ xfs_attr_fetch(xfs_inode_t *ip, struct xfs_name *name, | |||
| 148 | /* | 150 | /* |
| 149 | * Decide on what work routines to call based on the inode size. | 151 | * Decide on what work routines to call based on the inode size. |
| 150 | */ | 152 | */ |
| 151 | if (XFS_IFORK_Q(ip) == 0 || | 153 | if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { |
| 152 | (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && | ||
| 153 | ip->i_d.di_anextents == 0)) { | ||
| 154 | error = XFS_ERROR(ENOATTR); | ||
| 155 | } else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { | ||
| 156 | error = xfs_attr_shortform_getvalue(&args); | 154 | error = xfs_attr_shortform_getvalue(&args); |
| 157 | } else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) { | 155 | } else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) { |
| 158 | error = xfs_attr_leaf_get(&args); | 156 | error = xfs_attr_leaf_get(&args); |
| @@ -196,6 +194,46 @@ xfs_attr_get( | |||
| 196 | return(error); | 194 | return(error); |
| 197 | } | 195 | } |
| 198 | 196 | ||
| 197 | /* | ||
| 198 | * Calculate how many blocks we need for the new attribute, | ||
| 199 | */ | ||
| 200 | int | ||
| 201 | xfs_attr_calc_size( | ||
| 202 | struct xfs_inode *ip, | ||
| 203 | int namelen, | ||
| 204 | int valuelen, | ||
| 205 | int *local) | ||
| 206 | { | ||
| 207 | struct xfs_mount *mp = ip->i_mount; | ||
| 208 | int size; | ||
| 209 | int nblks; | ||
| 210 | |||
| 211 | /* | ||
| 212 | * Determine space new attribute will use, and if it would be | ||
| 213 | * "local" or "remote" (note: local != inline). | ||
| 214 | */ | ||
| 215 | size = xfs_attr_leaf_newentsize(namelen, valuelen, | ||
| 216 | mp->m_sb.sb_blocksize, local); | ||
| 217 | |||
| 218 | nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); | ||
| 219 | if (*local) { | ||
| 220 | if (size > (mp->m_sb.sb_blocksize >> 1)) { | ||
| 221 | /* Double split possible */ | ||
| 222 | nblks *= 2; | ||
| 223 | } | ||
| 224 | } else { | ||
| 225 | /* | ||
| 226 | * Out of line attribute, cannot double split, but | ||
| 227 | * make room for the attribute value itself. | ||
| 228 | */ | ||
| 229 | uint dblocks = XFS_B_TO_FSB(mp, valuelen); | ||
| 230 | nblks += dblocks; | ||
| 231 | nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK); | ||
| 232 | } | ||
| 233 | |||
| 234 | return nblks; | ||
| 235 | } | ||
| 236 | |||
| 199 | STATIC int | 237 | STATIC int |
| 200 | xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name, | 238 | xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name, |
| 201 | char *value, int valuelen, int flags) | 239 | char *value, int valuelen, int flags) |
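The new xfs_attr_calc_size() helper centralises the reservation arithmetic that used to be open-coded in xfs_attr_set_int(): a "local" attribute larger than half a filesystem block doubles the dir/attr-enter reservation to cover a possible double split, while a remote attribute adds the blocks for the value itself plus the extent-add reservation. A rough worked example mirroring the call sites in the next hunks, with illustrative numbers only (assuming a 4096-byte block size):

	/*
	 * Illustrative only: with sb_blocksize = 4096, an entry of, say,
	 * 2600 bytes is still "local" but exceeds blocksize/2, so the
	 * XFS_DAENTER_SPACE_RES() figure is doubled.  A 64 KiB value goes
	 * "remote": 16 data blocks are added, plus the
	 * XFS_NEXTENTADD_SPACE_RES() reservation for mapping them.
	 */
	int	local;

	args.total = xfs_attr_calc_size(dp, name->len, valuelen, &local);
	error = xfs_trans_reserve(args.trans, args.total,
				  XFS_ATTRSET_LOG_RES(mp, args.total), 0,
				  XFS_TRANS_PERM_LOG_RES, XFS_ATTRSET_LOG_COUNT);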
| @@ -204,10 +242,9 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name, | |||
| 204 | xfs_fsblock_t firstblock; | 242 | xfs_fsblock_t firstblock; |
| 205 | xfs_bmap_free_t flist; | 243 | xfs_bmap_free_t flist; |
| 206 | int error, err2, committed; | 244 | int error, err2, committed; |
| 207 | int local, size; | ||
| 208 | uint nblks; | ||
| 209 | xfs_mount_t *mp = dp->i_mount; | 245 | xfs_mount_t *mp = dp->i_mount; |
| 210 | int rsvd = (flags & ATTR_ROOT) != 0; | 246 | int rsvd = (flags & ATTR_ROOT) != 0; |
| 247 | int local; | ||
| 211 | 248 | ||
| 212 | /* | 249 | /* |
| 213 | * Attach the dquots to the inode. | 250 | * Attach the dquots to the inode. |
| @@ -241,33 +278,10 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name, | |||
| 241 | args.firstblock = &firstblock; | 278 | args.firstblock = &firstblock; |
| 242 | args.flist = &flist; | 279 | args.flist = &flist; |
| 243 | args.whichfork = XFS_ATTR_FORK; | 280 | args.whichfork = XFS_ATTR_FORK; |
| 244 | args.addname = 1; | 281 | args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; |
| 245 | args.oknoent = 1; | ||
| 246 | |||
| 247 | /* | ||
| 248 | * Determine space new attribute will use, and if it would be | ||
| 249 | * "local" or "remote" (note: local != inline). | ||
| 250 | */ | ||
| 251 | size = xfs_attr_leaf_newentsize(name->len, valuelen, | ||
| 252 | mp->m_sb.sb_blocksize, &local); | ||
| 253 | |||
| 254 | nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); | ||
| 255 | if (local) { | ||
| 256 | if (size > (mp->m_sb.sb_blocksize >> 1)) { | ||
| 257 | /* Double split possible */ | ||
| 258 | nblks <<= 1; | ||
| 259 | } | ||
| 260 | } else { | ||
| 261 | uint dblocks = XFS_B_TO_FSB(mp, valuelen); | ||
| 262 | /* Out of line attribute, cannot double split, but make | ||
| 263 | * room for the attribute value itself. | ||
| 264 | */ | ||
| 265 | nblks += dblocks; | ||
| 266 | nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK); | ||
| 267 | } | ||
| 268 | 282 | ||
| 269 | /* Size is now blocks for attribute data */ | 283 | /* Size is now blocks for attribute data */ |
| 270 | args.total = nblks; | 284 | args.total = xfs_attr_calc_size(dp, name->len, valuelen, &local); |
| 271 | 285 | ||
| 272 | /* | 286 | /* |
| 273 | * Start our first transaction of the day. | 287 | * Start our first transaction of the day. |
| @@ -289,18 +303,17 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name, | |||
| 289 | if (rsvd) | 303 | if (rsvd) |
| 290 | args.trans->t_flags |= XFS_TRANS_RESERVE; | 304 | args.trans->t_flags |= XFS_TRANS_RESERVE; |
| 291 | 305 | ||
| 292 | if ((error = xfs_trans_reserve(args.trans, (uint) nblks, | 306 | if ((error = xfs_trans_reserve(args.trans, args.total, |
| 293 | XFS_ATTRSET_LOG_RES(mp, nblks), | 307 | XFS_ATTRSET_LOG_RES(mp, args.total), 0, |
| 294 | 0, XFS_TRANS_PERM_LOG_RES, | 308 | XFS_TRANS_PERM_LOG_RES, XFS_ATTRSET_LOG_COUNT))) { |
| 295 | XFS_ATTRSET_LOG_COUNT))) { | ||
| 296 | xfs_trans_cancel(args.trans, 0); | 309 | xfs_trans_cancel(args.trans, 0); |
| 297 | return(error); | 310 | return(error); |
| 298 | } | 311 | } |
| 299 | xfs_ilock(dp, XFS_ILOCK_EXCL); | 312 | xfs_ilock(dp, XFS_ILOCK_EXCL); |
| 300 | 313 | ||
| 301 | error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, args.trans, dp, nblks, 0, | 314 | error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, args.trans, dp, args.total, 0, |
| 302 | rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : | 315 | rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : |
| 303 | XFS_QMOPT_RES_REGBLKS); | 316 | XFS_QMOPT_RES_REGBLKS); |
| 304 | if (error) { | 317 | if (error) { |
| 305 | xfs_iunlock(dp, XFS_ILOCK_EXCL); | 318 | xfs_iunlock(dp, XFS_ILOCK_EXCL); |
| 306 | xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES); | 319 | xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES); |
| @@ -387,7 +400,9 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name, | |||
| 387 | * Commit the leaf transformation. We'll need another (linked) | 400 | * Commit the leaf transformation. We'll need another (linked) |
| 388 | * transaction to add the new attribute to the leaf. | 401 | * transaction to add the new attribute to the leaf. |
| 389 | */ | 402 | */ |
| 390 | if ((error = xfs_attr_rolltrans(&args.trans, dp))) | 403 | |
| 404 | error = xfs_trans_roll(&args.trans, dp); | ||
| 405 | if (error) | ||
| 391 | goto out; | 406 | goto out; |
| 392 | 407 | ||
| 393 | } | 408 | } |
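Throughout this file the attr-specific xfs_attr_rolltrans() helper is replaced by the generic xfs_trans_roll(), which commits the current permanent-log-reservation transaction and continues in a linked one with the inode rejoined, so a multi-step attribute operation never has to hold one oversized transaction. The call pattern, as it appears in the hunks here:

	/*
	 * Commit what has been done so far and continue in a fresh,
	 * linked transaction; dp stays locked and joined across the roll.
	 */
	error = xfs_trans_roll(&args.trans, dp);
	if (error)
		goto out;	/* cancel/cleanup path as in the surrounding code */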
| @@ -529,9 +544,7 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) | |||
| 529 | /* | 544 | /* |
| 530 | * Decide on what work routines to call based on the inode size. | 545 | * Decide on what work routines to call based on the inode size. |
| 531 | */ | 546 | */ |
| 532 | if (XFS_IFORK_Q(dp) == 0 || | 547 | if (!xfs_inode_hasattr(dp)) { |
| 533 | (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && | ||
| 534 | dp->i_d.di_anextents == 0)) { | ||
| 535 | error = XFS_ERROR(ENOATTR); | 548 | error = XFS_ERROR(ENOATTR); |
| 536 | goto out; | 549 | goto out; |
| 537 | } | 550 | } |
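The repeated three-line test for "no attribute fork, or an empty extent-format fork" is factored into the new xfs_inode_hasattr() helper, and the get/remove/list/inactive paths above now simply call it. A sketch of a caller, mirroring the hunks:

	xfs_ilock(dp, XFS_ILOCK_SHARED);
	if (!xfs_inode_hasattr(dp)) {
		/* Nothing in the attribute fork: report ENOATTR without
		 * examining the on-disk format any further.
		 */
		xfs_iunlock(dp, XFS_ILOCK_SHARED);
		return XFS_ERROR(ENOATTR);
	}
	xfs_iunlock(dp, XFS_ILOCK_SHARED);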
| @@ -601,29 +614,33 @@ xfs_attr_remove( | |||
| 601 | return error; | 614 | return error; |
| 602 | 615 | ||
| 603 | xfs_ilock(dp, XFS_ILOCK_SHARED); | 616 | xfs_ilock(dp, XFS_ILOCK_SHARED); |
| 604 | if (XFS_IFORK_Q(dp) == 0 || | 617 | if (!xfs_inode_hasattr(dp)) { |
| 605 | (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && | ||
| 606 | dp->i_d.di_anextents == 0)) { | ||
| 607 | xfs_iunlock(dp, XFS_ILOCK_SHARED); | 618 | xfs_iunlock(dp, XFS_ILOCK_SHARED); |
| 608 | return(XFS_ERROR(ENOATTR)); | 619 | return XFS_ERROR(ENOATTR); |
| 609 | } | 620 | } |
| 610 | xfs_iunlock(dp, XFS_ILOCK_SHARED); | 621 | xfs_iunlock(dp, XFS_ILOCK_SHARED); |
| 611 | 622 | ||
| 612 | return xfs_attr_remove_int(dp, &xname, flags); | 623 | return xfs_attr_remove_int(dp, &xname, flags); |
| 613 | } | 624 | } |
| 614 | 625 | ||
| 615 | STATIC int | 626 | int |
| 616 | xfs_attr_list_int(xfs_attr_list_context_t *context) | 627 | xfs_attr_list_int(xfs_attr_list_context_t *context) |
| 617 | { | 628 | { |
| 618 | int error; | 629 | int error; |
| 619 | xfs_inode_t *dp = context->dp; | 630 | xfs_inode_t *dp = context->dp; |
| 620 | 631 | ||
| 632 | XFS_STATS_INC(xs_attr_list); | ||
| 633 | |||
| 634 | if (XFS_FORCED_SHUTDOWN(dp->i_mount)) | ||
| 635 | return EIO; | ||
| 636 | |||
| 637 | xfs_ilock(dp, XFS_ILOCK_SHARED); | ||
| 638 | xfs_attr_trace_l_c("syscall start", context); | ||
| 639 | |||
| 621 | /* | 640 | /* |
| 622 | * Decide on what work routines to call based on the inode size. | 641 | * Decide on what work routines to call based on the inode size. |
| 623 | */ | 642 | */ |
| 624 | if (XFS_IFORK_Q(dp) == 0 || | 643 | if (!xfs_inode_hasattr(dp)) { |
| 625 | (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && | ||
| 626 | dp->i_d.di_anextents == 0)) { | ||
| 627 | error = 0; | 644 | error = 0; |
| 628 | } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { | 645 | } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { |
| 629 | error = xfs_attr_shortform_list(context); | 646 | error = xfs_attr_shortform_list(context); |
| @@ -632,6 +649,10 @@ xfs_attr_list_int(xfs_attr_list_context_t *context) | |||
| 632 | } else { | 649 | } else { |
| 633 | error = xfs_attr_node_list(context); | 650 | error = xfs_attr_node_list(context); |
| 634 | } | 651 | } |
| 652 | |||
| 653 | xfs_iunlock(dp, XFS_ILOCK_SHARED); | ||
| 654 | xfs_attr_trace_l_c("syscall end", context); | ||
| 655 | |||
| 635 | return error; | 656 | return error; |
| 636 | } | 657 | } |
| 637 | 658 | ||
| @@ -648,74 +669,50 @@ xfs_attr_list_int(xfs_attr_list_context_t *context) | |||
| 648 | */ | 669 | */ |
| 649 | /*ARGSUSED*/ | 670 | /*ARGSUSED*/ |
| 650 | STATIC int | 671 | STATIC int |
| 651 | xfs_attr_put_listent(xfs_attr_list_context_t *context, attrnames_t *namesp, | 672 | xfs_attr_put_listent(xfs_attr_list_context_t *context, int flags, |
| 652 | char *name, int namelen, | 673 | char *name, int namelen, |
| 653 | int valuelen, char *value) | 674 | int valuelen, char *value) |
| 654 | { | 675 | { |
| 676 | struct attrlist *alist = (struct attrlist *)context->alist; | ||
| 655 | attrlist_ent_t *aep; | 677 | attrlist_ent_t *aep; |
| 656 | int arraytop; | 678 | int arraytop; |
| 657 | 679 | ||
| 658 | ASSERT(!(context->flags & ATTR_KERNOVAL)); | 680 | ASSERT(!(context->flags & ATTR_KERNOVAL)); |
| 659 | ASSERT(context->count >= 0); | 681 | ASSERT(context->count >= 0); |
| 660 | ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); | 682 | ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); |
| 661 | ASSERT(context->firstu >= sizeof(*context->alist)); | 683 | ASSERT(context->firstu >= sizeof(*alist)); |
| 662 | ASSERT(context->firstu <= context->bufsize); | 684 | ASSERT(context->firstu <= context->bufsize); |
| 663 | 685 | ||
| 664 | arraytop = sizeof(*context->alist) + | 686 | /* |
| 665 | context->count * sizeof(context->alist->al_offset[0]); | 687 | * Only list entries in the right namespace. |
| 688 | */ | ||
| 689 | if (((context->flags & ATTR_SECURE) == 0) != | ||
| 690 | ((flags & XFS_ATTR_SECURE) == 0)) | ||
| 691 | return 0; | ||
| 692 | if (((context->flags & ATTR_ROOT) == 0) != | ||
| 693 | ((flags & XFS_ATTR_ROOT) == 0)) | ||
| 694 | return 0; | ||
| 695 | |||
| 696 | arraytop = sizeof(*alist) + | ||
| 697 | context->count * sizeof(alist->al_offset[0]); | ||
| 666 | context->firstu -= ATTR_ENTSIZE(namelen); | 698 | context->firstu -= ATTR_ENTSIZE(namelen); |
| 667 | if (context->firstu < arraytop) { | 699 | if (context->firstu < arraytop) { |
| 668 | xfs_attr_trace_l_c("buffer full", context); | 700 | xfs_attr_trace_l_c("buffer full", context); |
| 669 | context->alist->al_more = 1; | 701 | alist->al_more = 1; |
| 670 | context->seen_enough = 1; | 702 | context->seen_enough = 1; |
| 671 | return 1; | 703 | return 1; |
| 672 | } | 704 | } |
| 673 | 705 | ||
| 674 | aep = (attrlist_ent_t *)&(((char *)context->alist)[ context->firstu ]); | 706 | aep = (attrlist_ent_t *)&context->alist[context->firstu]; |
| 675 | aep->a_valuelen = valuelen; | 707 | aep->a_valuelen = valuelen; |
| 676 | memcpy(aep->a_name, name, namelen); | 708 | memcpy(aep->a_name, name, namelen); |
| 677 | aep->a_name[ namelen ] = 0; | 709 | aep->a_name[namelen] = 0; |
| 678 | context->alist->al_offset[ context->count++ ] = context->firstu; | 710 | alist->al_offset[context->count++] = context->firstu; |
| 679 | context->alist->al_count = context->count; | 711 | alist->al_count = context->count; |
| 680 | xfs_attr_trace_l_c("add", context); | 712 | xfs_attr_trace_l_c("add", context); |
| 681 | return 0; | 713 | return 0; |
| 682 | } | 714 | } |
| 683 | 715 | ||
| 684 | STATIC int | ||
| 685 | xfs_attr_kern_list(xfs_attr_list_context_t *context, attrnames_t *namesp, | ||
| 686 | char *name, int namelen, | ||
| 687 | int valuelen, char *value) | ||
| 688 | { | ||
| 689 | char *offset; | ||
| 690 | int arraytop; | ||
| 691 | |||
| 692 | ASSERT(context->count >= 0); | ||
| 693 | |||
| 694 | arraytop = context->count + namesp->attr_namelen + namelen + 1; | ||
| 695 | if (arraytop > context->firstu) { | ||
| 696 | context->count = -1; /* insufficient space */ | ||
| 697 | return 1; | ||
| 698 | } | ||
| 699 | offset = (char *)context->alist + context->count; | ||
| 700 | strncpy(offset, namesp->attr_name, namesp->attr_namelen); | ||
| 701 | offset += namesp->attr_namelen; | ||
| 702 | strncpy(offset, name, namelen); /* real name */ | ||
| 703 | offset += namelen; | ||
| 704 | *offset = '\0'; | ||
| 705 | context->count += namesp->attr_namelen + namelen + 1; | ||
| 706 | return 0; | ||
| 707 | } | ||
| 708 | |||
| 709 | /*ARGSUSED*/ | ||
| 710 | STATIC int | ||
| 711 | xfs_attr_kern_list_sizes(xfs_attr_list_context_t *context, attrnames_t *namesp, | ||
| 712 | char *name, int namelen, | ||
| 713 | int valuelen, char *value) | ||
| 714 | { | ||
| 715 | context->count += namesp->attr_namelen + namelen + 1; | ||
| 716 | return 0; | ||
| 717 | } | ||
| 718 | |||
| 719 | /* | 716 | /* |
| 720 | * Generate a list of extended attribute names and optionally | 717 | * Generate a list of extended attribute names and optionally |
| 721 | * also value lengths. Positive return value follows the XFS | 718 | * also value lengths. Positive return value follows the XFS |
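xfs_attr_put_listent() now receives the raw on-disk XFS_ATTR_* flags instead of an attrnames_t pointer and filters by namespace itself: an entry is emitted only when its ROOT/SECURE flags agree with the ATTR_ROOT/ATTR_SECURE bits the caller requested, which lets the ATTR_KERNAMELS helpers above be deleted. The filter reduces to a pair of "both set or both clear" tests, sketched here with comments added:

	/* Skip entries whose namespace does not match the request;
	 * the "== 0" comparisons make each test both-set-or-both-clear.
	 */
	if (((context->flags & ATTR_SECURE) == 0) !=
	    ((flags & XFS_ATTR_SECURE) == 0))
		return 0;	/* not an error, just not listed */
	if (((context->flags & ATTR_ROOT) == 0) !=
	    ((flags & XFS_ATTR_ROOT) == 0))
		return 0;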
| @@ -732,10 +729,9 @@ xfs_attr_list( | |||
| 732 | attrlist_cursor_kern_t *cursor) | 729 | attrlist_cursor_kern_t *cursor) |
| 733 | { | 730 | { |
| 734 | xfs_attr_list_context_t context; | 731 | xfs_attr_list_context_t context; |
| 732 | struct attrlist *alist; | ||
| 735 | int error; | 733 | int error; |
| 736 | 734 | ||
| 737 | XFS_STATS_INC(xs_attr_list); | ||
| 738 | |||
| 739 | /* | 735 | /* |
| 740 | * Validate the cursor. | 736 | * Validate the cursor. |
| 741 | */ | 737 | */ |
| @@ -756,52 +752,23 @@ xfs_attr_list( | |||
| 756 | /* | 752 | /* |
| 757 | * Initialize the output buffer. | 753 | * Initialize the output buffer. |
| 758 | */ | 754 | */ |
| 755 | memset(&context, 0, sizeof(context)); | ||
| 759 | context.dp = dp; | 756 | context.dp = dp; |
| 760 | context.cursor = cursor; | 757 | context.cursor = cursor; |
| 761 | context.count = 0; | ||
| 762 | context.dupcnt = 0; | ||
| 763 | context.resynch = 1; | 758 | context.resynch = 1; |
| 764 | context.flags = flags; | 759 | context.flags = flags; |
| 765 | context.seen_enough = 0; | 760 | context.alist = buffer; |
| 766 | context.alist = (attrlist_t *)buffer; | 761 | context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */ |
| 767 | context.put_value = 0; | 762 | context.firstu = context.bufsize; |
| 768 | 763 | context.put_listent = xfs_attr_put_listent; | |
| 769 | if (flags & ATTR_KERNAMELS) { | ||
| 770 | context.bufsize = bufsize; | ||
| 771 | context.firstu = context.bufsize; | ||
| 772 | if (flags & ATTR_KERNOVAL) | ||
| 773 | context.put_listent = xfs_attr_kern_list_sizes; | ||
| 774 | else | ||
| 775 | context.put_listent = xfs_attr_kern_list; | ||
| 776 | } else { | ||
| 777 | context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */ | ||
| 778 | context.firstu = context.bufsize; | ||
| 779 | context.alist->al_count = 0; | ||
| 780 | context.alist->al_more = 0; | ||
| 781 | context.alist->al_offset[0] = context.bufsize; | ||
| 782 | context.put_listent = xfs_attr_put_listent; | ||
| 783 | } | ||
| 784 | |||
| 785 | if (XFS_FORCED_SHUTDOWN(dp->i_mount)) | ||
| 786 | return EIO; | ||
| 787 | 764 | ||
| 788 | xfs_ilock(dp, XFS_ILOCK_SHARED); | 765 | alist = (struct attrlist *)context.alist; |
| 789 | xfs_attr_trace_l_c("syscall start", &context); | 766 | alist->al_count = 0; |
| 767 | alist->al_more = 0; | ||
| 768 | alist->al_offset[0] = context.bufsize; | ||
| 790 | 769 | ||
| 791 | error = xfs_attr_list_int(&context); | 770 | error = xfs_attr_list_int(&context); |
| 792 | 771 | ASSERT(error >= 0); | |
| 793 | xfs_iunlock(dp, XFS_ILOCK_SHARED); | ||
| 794 | xfs_attr_trace_l_c("syscall end", &context); | ||
| 795 | |||
| 796 | if (context.flags & (ATTR_KERNOVAL|ATTR_KERNAMELS)) { | ||
| 797 | /* must return negated buffer size or the error */ | ||
| 798 | if (context.count < 0) | ||
| 799 | error = XFS_ERROR(ERANGE); | ||
| 800 | else | ||
| 801 | error = -context.count; | ||
| 802 | } else | ||
| 803 | ASSERT(error >= 0); | ||
| 804 | |||
| 805 | return error; | 772 | return error; |
| 806 | } | 773 | } |
| 807 | 774 | ||
| @@ -816,12 +783,10 @@ xfs_attr_inactive(xfs_inode_t *dp) | |||
| 816 | ASSERT(! XFS_NOT_DQATTACHED(mp, dp)); | 783 | ASSERT(! XFS_NOT_DQATTACHED(mp, dp)); |
| 817 | 784 | ||
| 818 | xfs_ilock(dp, XFS_ILOCK_SHARED); | 785 | xfs_ilock(dp, XFS_ILOCK_SHARED); |
| 819 | if ((XFS_IFORK_Q(dp) == 0) || | 786 | if (!xfs_inode_hasattr(dp) || |
| 820 | (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) || | 787 | dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { |
| 821 | (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && | ||
| 822 | dp->i_d.di_anextents == 0)) { | ||
| 823 | xfs_iunlock(dp, XFS_ILOCK_SHARED); | 788 | xfs_iunlock(dp, XFS_ILOCK_SHARED); |
| 824 | return(0); | 789 | return 0; |
| 825 | } | 790 | } |
| 826 | xfs_iunlock(dp, XFS_ILOCK_SHARED); | 791 | xfs_iunlock(dp, XFS_ILOCK_SHARED); |
| 827 | 792 | ||
| @@ -854,10 +819,8 @@ xfs_attr_inactive(xfs_inode_t *dp) | |||
| 854 | /* | 819 | /* |
| 855 | * Decide on what work routines to call based on the inode size. | 820 | * Decide on what work routines to call based on the inode size. |
| 856 | */ | 821 | */ |
| 857 | if ((XFS_IFORK_Q(dp) == 0) || | 822 | if (!xfs_inode_hasattr(dp) || |
| 858 | (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) || | 823 | dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { |
| 859 | (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && | ||
| 860 | dp->i_d.di_anextents == 0)) { | ||
| 861 | error = 0; | 824 | error = 0; |
| 862 | goto out; | 825 | goto out; |
| 863 | } | 826 | } |
| @@ -974,7 +937,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) | |||
| 974 | xfs_da_brelse(args->trans, bp); | 937 | xfs_da_brelse(args->trans, bp); |
| 975 | return(retval); | 938 | return(retval); |
| 976 | } | 939 | } |
| 977 | args->rename = 1; /* an atomic rename */ | 940 | args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */ |
| 978 | args->blkno2 = args->blkno; /* set 2nd entry info*/ | 941 | args->blkno2 = args->blkno; /* set 2nd entry info*/ |
| 979 | args->index2 = args->index; | 942 | args->index2 = args->index; |
| 980 | args->rmtblkno2 = args->rmtblkno; | 943 | args->rmtblkno2 = args->rmtblkno; |
| @@ -1019,7 +982,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) | |||
| 1019 | * Commit the current trans (including the inode) and start | 982 | * Commit the current trans (including the inode) and start |
| 1020 | * a new one. | 983 | * a new one. |
| 1021 | */ | 984 | */ |
| 1022 | if ((error = xfs_attr_rolltrans(&args->trans, dp))) | 985 | error = xfs_trans_roll(&args->trans, dp); |
| 986 | if (error) | ||
| 1023 | return (error); | 987 | return (error); |
| 1024 | 988 | ||
| 1025 | /* | 989 | /* |
| @@ -1033,7 +997,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) | |||
| 1033 | * Commit the transaction that added the attr name so that | 997 | * Commit the transaction that added the attr name so that |
| 1034 | * later routines can manage their own transactions. | 998 | * later routines can manage their own transactions. |
| 1035 | */ | 999 | */ |
| 1036 | if ((error = xfs_attr_rolltrans(&args->trans, dp))) | 1000 | error = xfs_trans_roll(&args->trans, dp); |
| 1001 | if (error) | ||
| 1037 | return (error); | 1002 | return (error); |
| 1038 | 1003 | ||
| 1039 | /* | 1004 | /* |
| @@ -1054,7 +1019,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) | |||
| 1054 | * so that one disappears and one appears atomically. Then we | 1019 | * so that one disappears and one appears atomically. Then we |
| 1055 | * must remove the "old" attribute/value pair. | 1020 | * must remove the "old" attribute/value pair. |
| 1056 | */ | 1021 | */ |
| 1057 | if (args->rename) { | 1022 | if (args->op_flags & XFS_DA_OP_RENAME) { |
| 1058 | /* | 1023 | /* |
| 1059 | * In a separate transaction, set the incomplete flag on the | 1024 | * In a separate transaction, set the incomplete flag on the |
| 1060 | * "old" attr and clear the incomplete flag on the "new" attr. | 1025 | * "old" attr and clear the incomplete flag on the "new" attr. |
| @@ -1122,7 +1087,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) | |||
| 1122 | /* | 1087 | /* |
| 1123 | * Commit the remove and start the next trans in series. | 1088 | * Commit the remove and start the next trans in series. |
| 1124 | */ | 1089 | */ |
| 1125 | error = xfs_attr_rolltrans(&args->trans, dp); | 1090 | error = xfs_trans_roll(&args->trans, dp); |
| 1126 | 1091 | ||
| 1127 | } else if (args->rmtblkno > 0) { | 1092 | } else if (args->rmtblkno > 0) { |
| 1128 | /* | 1093 | /* |
| @@ -1307,7 +1272,7 @@ restart: | |||
| 1307 | } else if (retval == EEXIST) { | 1272 | } else if (retval == EEXIST) { |
| 1308 | if (args->flags & ATTR_CREATE) | 1273 | if (args->flags & ATTR_CREATE) |
| 1309 | goto out; | 1274 | goto out; |
| 1310 | args->rename = 1; /* atomic rename op */ | 1275 | args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */ |
| 1311 | args->blkno2 = args->blkno; /* set 2nd entry info*/ | 1276 | args->blkno2 = args->blkno; /* set 2nd entry info*/ |
| 1312 | args->index2 = args->index; | 1277 | args->index2 = args->index; |
| 1313 | args->rmtblkno2 = args->rmtblkno; | 1278 | args->rmtblkno2 = args->rmtblkno; |
| @@ -1353,7 +1318,8 @@ restart: | |||
| 1353 | * Commit the node conversion and start the next | 1318 | * Commit the node conversion and start the next |
| 1354 | * trans in the chain. | 1319 | * trans in the chain. |
| 1355 | */ | 1320 | */ |
| 1356 | if ((error = xfs_attr_rolltrans(&args->trans, dp))) | 1321 | error = xfs_trans_roll(&args->trans, dp); |
| 1322 | if (error) | ||
| 1357 | goto out; | 1323 | goto out; |
| 1358 | 1324 | ||
| 1359 | goto restart; | 1325 | goto restart; |
| @@ -1404,7 +1370,8 @@ restart: | |||
| 1404 | * Commit the leaf addition or btree split and start the next | 1370 | * Commit the leaf addition or btree split and start the next |
| 1405 | * trans in the chain. | 1371 | * trans in the chain. |
| 1406 | */ | 1372 | */ |
| 1407 | if ((error = xfs_attr_rolltrans(&args->trans, dp))) | 1373 | error = xfs_trans_roll(&args->trans, dp); |
| 1374 | if (error) | ||
| 1408 | goto out; | 1375 | goto out; |
| 1409 | 1376 | ||
| 1410 | /* | 1377 | /* |
| @@ -1425,7 +1392,7 @@ restart: | |||
| 1425 | * so that one disappears and one appears atomically. Then we | 1392 | * so that one disappears and one appears atomically. Then we |
| 1426 | * must remove the "old" attribute/value pair. | 1393 | * must remove the "old" attribute/value pair. |
| 1427 | */ | 1394 | */ |
| 1428 | if (args->rename) { | 1395 | if (args->op_flags & XFS_DA_OP_RENAME) { |
| 1429 | /* | 1396 | /* |
| 1430 | * In a separate transaction, set the incomplete flag on the | 1397 | * In a separate transaction, set the incomplete flag on the |
| 1431 | * "old" attr and clear the incomplete flag on the "new" attr. | 1398 | * "old" attr and clear the incomplete flag on the "new" attr. |
| @@ -1504,7 +1471,8 @@ restart: | |||
| 1504 | /* | 1471 | /* |
| 1505 | * Commit and start the next trans in the chain. | 1472 | * Commit and start the next trans in the chain. |
| 1506 | */ | 1473 | */ |
| 1507 | if ((error = xfs_attr_rolltrans(&args->trans, dp))) | 1474 | error = xfs_trans_roll(&args->trans, dp); |
| 1475 | if (error) | ||
| 1508 | goto out; | 1476 | goto out; |
| 1509 | 1477 | ||
| 1510 | } else if (args->rmtblkno > 0) { | 1478 | } else if (args->rmtblkno > 0) { |
| @@ -1636,7 +1604,8 @@ xfs_attr_node_removename(xfs_da_args_t *args) | |||
| 1636 | /* | 1604 | /* |
| 1637 | * Commit the Btree join operation and start a new trans. | 1605 | * Commit the Btree join operation and start a new trans. |
| 1638 | */ | 1606 | */ |
| 1639 | if ((error = xfs_attr_rolltrans(&args->trans, dp))) | 1607 | error = xfs_trans_roll(&args->trans, dp); |
| 1608 | if (error) | ||
| 1640 | goto out; | 1609 | goto out; |
| 1641 | } | 1610 | } |
| 1642 | 1611 | ||
| @@ -2137,7 +2106,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) | |||
| 2137 | /* | 2106 | /* |
| 2138 | * Start the next trans in the chain. | 2107 | * Start the next trans in the chain. |
| 2139 | */ | 2108 | */ |
| 2140 | if ((error = xfs_attr_rolltrans(&args->trans, dp))) | 2109 | error = xfs_trans_roll(&args->trans, dp); |
| 2110 | if (error) | ||
| 2141 | return (error); | 2111 | return (error); |
| 2142 | } | 2112 | } |
| 2143 | 2113 | ||
| @@ -2287,7 +2257,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) | |||
| 2287 | /* | 2257 | /* |
| 2288 | * Close out trans and start the next one in the chain. | 2258 | * Close out trans and start the next one in the chain. |
| 2289 | */ | 2259 | */ |
| 2290 | if ((error = xfs_attr_rolltrans(&args->trans, args->dp))) | 2260 | error = xfs_trans_roll(&args->trans, args->dp); |
| 2261 | if (error) | ||
| 2291 | return (error); | 2262 | return (error); |
| 2292 | } | 2263 | } |
| 2293 | return(0); | 2264 | return(0); |
| @@ -2300,23 +2271,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) | |||
| 2300 | void | 2271 | void |
| 2301 | xfs_attr_trace_l_c(char *where, struct xfs_attr_list_context *context) | 2272 | xfs_attr_trace_l_c(char *where, struct xfs_attr_list_context *context) |
| 2302 | { | 2273 | { |
| 2303 | xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_C, where, | 2274 | xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_C, where, context, |
| 2304 | (__psunsigned_t)context->dp, | ||
| 2305 | (__psunsigned_t)context->cursor->hashval, | ||
| 2306 | (__psunsigned_t)context->cursor->blkno, | ||
| 2307 | (__psunsigned_t)context->cursor->offset, | ||
| 2308 | (__psunsigned_t)context->alist, | ||
| 2309 | (__psunsigned_t)context->bufsize, | ||
| 2310 | (__psunsigned_t)context->count, | ||
| 2311 | (__psunsigned_t)context->firstu, | ||
| 2312 | (__psunsigned_t) | ||
| 2313 | ((context->count > 0) && | ||
| 2314 | !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL))) | ||
| 2315 | ? (ATTR_ENTRY(context->alist, | ||
| 2316 | context->count-1)->a_valuelen) | ||
| 2317 | : 0, | ||
| 2318 | (__psunsigned_t)context->dupcnt, | ||
| 2319 | (__psunsigned_t)context->flags, | ||
| 2320 | (__psunsigned_t)NULL, | 2275 | (__psunsigned_t)NULL, |
| 2321 | (__psunsigned_t)NULL, | 2276 | (__psunsigned_t)NULL, |
| 2322 | (__psunsigned_t)NULL); | 2277 | (__psunsigned_t)NULL); |
| @@ -2329,23 +2284,7 @@ void | |||
| 2329 | xfs_attr_trace_l_cn(char *where, struct xfs_attr_list_context *context, | 2284 | xfs_attr_trace_l_cn(char *where, struct xfs_attr_list_context *context, |
| 2330 | struct xfs_da_intnode *node) | 2285 | struct xfs_da_intnode *node) |
| 2331 | { | 2286 | { |
| 2332 | xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CN, where, | 2287 | xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CN, where, context, |
| 2333 | (__psunsigned_t)context->dp, | ||
| 2334 | (__psunsigned_t)context->cursor->hashval, | ||
| 2335 | (__psunsigned_t)context->cursor->blkno, | ||
| 2336 | (__psunsigned_t)context->cursor->offset, | ||
| 2337 | (__psunsigned_t)context->alist, | ||
| 2338 | (__psunsigned_t)context->bufsize, | ||
| 2339 | (__psunsigned_t)context->count, | ||
| 2340 | (__psunsigned_t)context->firstu, | ||
| 2341 | (__psunsigned_t) | ||
| 2342 | ((context->count > 0) && | ||
| 2343 | !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL))) | ||
| 2344 | ? (ATTR_ENTRY(context->alist, | ||
| 2345 | context->count-1)->a_valuelen) | ||
| 2346 | : 0, | ||
| 2347 | (__psunsigned_t)context->dupcnt, | ||
| 2348 | (__psunsigned_t)context->flags, | ||
| 2349 | (__psunsigned_t)be16_to_cpu(node->hdr.count), | 2288 | (__psunsigned_t)be16_to_cpu(node->hdr.count), |
| 2350 | (__psunsigned_t)be32_to_cpu(node->btree[0].hashval), | 2289 | (__psunsigned_t)be32_to_cpu(node->btree[0].hashval), |
| 2351 | (__psunsigned_t)be32_to_cpu(node->btree[ | 2290 | (__psunsigned_t)be32_to_cpu(node->btree[ |
| @@ -2359,23 +2298,7 @@ void | |||
| 2359 | xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context, | 2298 | xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context, |
| 2360 | struct xfs_da_node_entry *btree) | 2299 | struct xfs_da_node_entry *btree) |
| 2361 | { | 2300 | { |
| 2362 | xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CB, where, | 2301 | xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CB, where, context, |
| 2363 | (__psunsigned_t)context->dp, | ||
| 2364 | (__psunsigned_t)context->cursor->hashval, | ||
| 2365 | (__psunsigned_t)context->cursor->blkno, | ||
| 2366 | (__psunsigned_t)context->cursor->offset, | ||
| 2367 | (__psunsigned_t)context->alist, | ||
| 2368 | (__psunsigned_t)context->bufsize, | ||
| 2369 | (__psunsigned_t)context->count, | ||
| 2370 | (__psunsigned_t)context->firstu, | ||
| 2371 | (__psunsigned_t) | ||
| 2372 | ((context->count > 0) && | ||
| 2373 | !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL))) | ||
| 2374 | ? (ATTR_ENTRY(context->alist, | ||
| 2375 | context->count-1)->a_valuelen) | ||
| 2376 | : 0, | ||
| 2377 | (__psunsigned_t)context->dupcnt, | ||
| 2378 | (__psunsigned_t)context->flags, | ||
| 2379 | (__psunsigned_t)be32_to_cpu(btree->hashval), | 2302 | (__psunsigned_t)be32_to_cpu(btree->hashval), |
| 2380 | (__psunsigned_t)be32_to_cpu(btree->before), | 2303 | (__psunsigned_t)be32_to_cpu(btree->before), |
| 2381 | (__psunsigned_t)NULL); | 2304 | (__psunsigned_t)NULL); |
| @@ -2388,23 +2311,7 @@ void | |||
| 2388 | xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context, | 2311 | xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context, |
| 2389 | struct xfs_attr_leafblock *leaf) | 2312 | struct xfs_attr_leafblock *leaf) |
| 2390 | { | 2313 | { |
| 2391 | xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CL, where, | 2314 | xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CL, where, context, |
| 2392 | (__psunsigned_t)context->dp, | ||
| 2393 | (__psunsigned_t)context->cursor->hashval, | ||
| 2394 | (__psunsigned_t)context->cursor->blkno, | ||
| 2395 | (__psunsigned_t)context->cursor->offset, | ||
| 2396 | (__psunsigned_t)context->alist, | ||
| 2397 | (__psunsigned_t)context->bufsize, | ||
| 2398 | (__psunsigned_t)context->count, | ||
| 2399 | (__psunsigned_t)context->firstu, | ||
| 2400 | (__psunsigned_t) | ||
| 2401 | ((context->count > 0) && | ||
| 2402 | !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL))) | ||
| 2403 | ? (ATTR_ENTRY(context->alist, | ||
| 2404 | context->count-1)->a_valuelen) | ||
| 2405 | : 0, | ||
| 2406 | (__psunsigned_t)context->dupcnt, | ||
| 2407 | (__psunsigned_t)context->flags, | ||
| 2408 | (__psunsigned_t)be16_to_cpu(leaf->hdr.count), | 2315 | (__psunsigned_t)be16_to_cpu(leaf->hdr.count), |
| 2409 | (__psunsigned_t)be32_to_cpu(leaf->entries[0].hashval), | 2316 | (__psunsigned_t)be32_to_cpu(leaf->entries[0].hashval), |
| 2410 | (__psunsigned_t)be32_to_cpu(leaf->entries[ | 2317 | (__psunsigned_t)be32_to_cpu(leaf->entries[ |
| @@ -2417,329 +2324,24 @@ xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context, | |||
| 2417 | */ | 2324 | */ |
| 2418 | void | 2325 | void |
| 2419 | xfs_attr_trace_enter(int type, char *where, | 2326 | xfs_attr_trace_enter(int type, char *where, |
| 2420 | __psunsigned_t a2, __psunsigned_t a3, | 2327 | struct xfs_attr_list_context *context, |
| 2421 | __psunsigned_t a4, __psunsigned_t a5, | 2328 | __psunsigned_t a13, __psunsigned_t a14, |
| 2422 | __psunsigned_t a6, __psunsigned_t a7, | 2329 | __psunsigned_t a15) |
| 2423 | __psunsigned_t a8, __psunsigned_t a9, | ||
| 2424 | __psunsigned_t a10, __psunsigned_t a11, | ||
| 2425 | __psunsigned_t a12, __psunsigned_t a13, | ||
| 2426 | __psunsigned_t a14, __psunsigned_t a15) | ||
| 2427 | { | 2330 | { |
| 2428 | ASSERT(xfs_attr_trace_buf); | 2331 | ASSERT(xfs_attr_trace_buf); |
| 2429 | ktrace_enter(xfs_attr_trace_buf, (void *)((__psunsigned_t)type), | 2332 | ktrace_enter(xfs_attr_trace_buf, (void *)((__psunsigned_t)type), |
| 2430 | (void *)where, | 2333 | (void *)((__psunsigned_t)where), |
| 2431 | (void *)a2, (void *)a3, (void *)a4, | 2334 | (void *)((__psunsigned_t)context->dp), |
| 2432 | (void *)a5, (void *)a6, (void *)a7, | 2335 | (void *)((__psunsigned_t)context->cursor->hashval), |
| 2433 | (void *)a8, (void *)a9, (void *)a10, | 2336 | (void *)((__psunsigned_t)context->cursor->blkno), |
| 2434 | (void *)a11, (void *)a12, (void *)a13, | 2337 | (void *)((__psunsigned_t)context->cursor->offset), |
| 2435 | (void *)a14, (void *)a15); | 2338 | (void *)((__psunsigned_t)context->alist), |
| 2339 | (void *)((__psunsigned_t)context->bufsize), | ||
| 2340 | (void *)((__psunsigned_t)context->count), | ||
| 2341 | (void *)((__psunsigned_t)context->firstu), | ||
| 2342 | NULL, | ||
| 2343 | (void *)((__psunsigned_t)context->dupcnt), | ||
| 2344 | (void *)((__psunsigned_t)context->flags), | ||
| 2345 | (void *)a13, (void *)a14, (void *)a15); | ||
| 2436 | } | 2346 | } |
| 2437 | #endif /* XFS_ATTR_TRACE */ | 2347 | #endif /* XFS_ATTR_TRACE */ |
| 2438 | |||
| 2439 | |||
| 2440 | /*======================================================================== | ||
| 2441 | * System (pseudo) namespace attribute interface routines. | ||
| 2442 | *========================================================================*/ | ||
| 2443 | |||
| 2444 | STATIC int | ||
| 2445 | posix_acl_access_set( | ||
| 2446 | bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) | ||
| 2447 | { | ||
| 2448 | return xfs_acl_vset(vp, data, size, _ACL_TYPE_ACCESS); | ||
| 2449 | } | ||
| 2450 | |||
| 2451 | STATIC int | ||
| 2452 | posix_acl_access_remove( | ||
| 2453 | bhv_vnode_t *vp, char *name, int xflags) | ||
| 2454 | { | ||
| 2455 | return xfs_acl_vremove(vp, _ACL_TYPE_ACCESS); | ||
| 2456 | } | ||
| 2457 | |||
| 2458 | STATIC int | ||
| 2459 | posix_acl_access_get( | ||
| 2460 | bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) | ||
| 2461 | { | ||
| 2462 | return xfs_acl_vget(vp, data, size, _ACL_TYPE_ACCESS); | ||
| 2463 | } | ||
| 2464 | |||
| 2465 | STATIC int | ||
| 2466 | posix_acl_access_exists( | ||
| 2467 | bhv_vnode_t *vp) | ||
| 2468 | { | ||
| 2469 | return xfs_acl_vhasacl_access(vp); | ||
| 2470 | } | ||
| 2471 | |||
| 2472 | STATIC int | ||
| 2473 | posix_acl_default_set( | ||
| 2474 | bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) | ||
| 2475 | { | ||
| 2476 | return xfs_acl_vset(vp, data, size, _ACL_TYPE_DEFAULT); | ||
| 2477 | } | ||
| 2478 | |||
| 2479 | STATIC int | ||
| 2480 | posix_acl_default_get( | ||
| 2481 | bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) | ||
| 2482 | { | ||
| 2483 | return xfs_acl_vget(vp, data, size, _ACL_TYPE_DEFAULT); | ||
| 2484 | } | ||
| 2485 | |||
| 2486 | STATIC int | ||
| 2487 | posix_acl_default_remove( | ||
| 2488 | bhv_vnode_t *vp, char *name, int xflags) | ||
| 2489 | { | ||
| 2490 | return xfs_acl_vremove(vp, _ACL_TYPE_DEFAULT); | ||
| 2491 | } | ||
| 2492 | |||
| 2493 | STATIC int | ||
| 2494 | posix_acl_default_exists( | ||
| 2495 | bhv_vnode_t *vp) | ||
| 2496 | { | ||
| 2497 | return xfs_acl_vhasacl_default(vp); | ||
| 2498 | } | ||
| 2499 | |||
| 2500 | static struct attrnames posix_acl_access = { | ||
| 2501 | .attr_name = "posix_acl_access", | ||
| 2502 | .attr_namelen = sizeof("posix_acl_access") - 1, | ||
| 2503 | .attr_get = posix_acl_access_get, | ||
| 2504 | .attr_set = posix_acl_access_set, | ||
| 2505 | .attr_remove = posix_acl_access_remove, | ||
| 2506 | .attr_exists = posix_acl_access_exists, | ||
| 2507 | }; | ||
| 2508 | |||
| 2509 | static struct attrnames posix_acl_default = { | ||
| 2510 | .attr_name = "posix_acl_default", | ||
| 2511 | .attr_namelen = sizeof("posix_acl_default") - 1, | ||
| 2512 | .attr_get = posix_acl_default_get, | ||
| 2513 | .attr_set = posix_acl_default_set, | ||
| 2514 | .attr_remove = posix_acl_default_remove, | ||
| 2515 | .attr_exists = posix_acl_default_exists, | ||
| 2516 | }; | ||
| 2517 | |||
| 2518 | static struct attrnames *attr_system_names[] = | ||
| 2519 | { &posix_acl_access, &posix_acl_default }; | ||
| 2520 | |||
| 2521 | |||
| 2522 | /*======================================================================== | ||
| 2523 | * Namespace-prefix-style attribute name interface routines. | ||
| 2524 | *========================================================================*/ | ||
| 2525 | |||
| 2526 | STATIC int | ||
| 2527 | attr_generic_set( | ||
| 2528 | bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) | ||
| 2529 | { | ||
| 2530 | return -xfs_attr_set(xfs_vtoi(vp), name, data, size, xflags); | ||
| 2531 | } | ||
| 2532 | |||
| 2533 | STATIC int | ||
| 2534 | attr_generic_get( | ||
| 2535 | bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) | ||
| 2536 | { | ||
| 2537 | int error, asize = size; | ||
| 2538 | |||
| 2539 | error = xfs_attr_get(xfs_vtoi(vp), name, data, &asize, xflags); | ||
| 2540 | if (!error) | ||
| 2541 | return asize; | ||
| 2542 | return -error; | ||
| 2543 | } | ||
| 2544 | |||
| 2545 | STATIC int | ||
| 2546 | attr_generic_remove( | ||
| 2547 | bhv_vnode_t *vp, char *name, int xflags) | ||
| 2548 | { | ||
| 2549 | return -xfs_attr_remove(xfs_vtoi(vp), name, xflags); | ||
| 2550 | } | ||
| 2551 | |||
| 2552 | STATIC int | ||
| 2553 | attr_generic_listadd( | ||
| 2554 | attrnames_t *prefix, | ||
| 2555 | attrnames_t *namesp, | ||
| 2556 | void *data, | ||
| 2557 | size_t size, | ||
| 2558 | ssize_t *result) | ||
| 2559 | { | ||
| 2560 | char *p = data + *result; | ||
| 2561 | |||
| 2562 | *result += prefix->attr_namelen; | ||
| 2563 | *result += namesp->attr_namelen + 1; | ||
| 2564 | if (!size) | ||
| 2565 | return 0; | ||
| 2566 | if (*result > size) | ||
| 2567 | return -ERANGE; | ||
| 2568 | strcpy(p, prefix->attr_name); | ||
| 2569 | p += prefix->attr_namelen; | ||
| 2570 | strcpy(p, namesp->attr_name); | ||
| 2571 | p += namesp->attr_namelen + 1; | ||
| 2572 | return 0; | ||
| 2573 | } | ||
| 2574 | |||
| 2575 | STATIC int | ||
| 2576 | attr_system_list( | ||
| 2577 | bhv_vnode_t *vp, | ||
| 2578 | void *data, | ||
| 2579 | size_t size, | ||
| 2580 | ssize_t *result) | ||
| 2581 | { | ||
| 2582 | attrnames_t *namesp; | ||
| 2583 | int i, error = 0; | ||
| 2584 | |||
| 2585 | for (i = 0; i < ATTR_SYSCOUNT; i++) { | ||
| 2586 | namesp = attr_system_names[i]; | ||
| 2587 | if (!namesp->attr_exists || !namesp->attr_exists(vp)) | ||
| 2588 | continue; | ||
| 2589 | error = attr_generic_listadd(&attr_system, namesp, | ||
| 2590 | data, size, result); | ||
| 2591 | if (error) | ||
| 2592 | break; | ||
| 2593 | } | ||
| 2594 | return error; | ||
| 2595 | } | ||
| 2596 | |||
| 2597 | int | ||
| 2598 | attr_generic_list( | ||
| 2599 | bhv_vnode_t *vp, void *data, size_t size, int xflags, ssize_t *result) | ||
| 2600 | { | ||
| 2601 | attrlist_cursor_kern_t cursor = { 0 }; | ||
| 2602 | int error; | ||
| 2603 | |||
| 2604 | error = xfs_attr_list(xfs_vtoi(vp), data, size, xflags, &cursor); | ||
| 2605 | if (error > 0) | ||
| 2606 | return -error; | ||
| 2607 | *result = -error; | ||
| 2608 | return attr_system_list(vp, data, size, result); | ||
| 2609 | } | ||
| 2610 | |||
| 2611 | attrnames_t * | ||
| 2612 | attr_lookup_namespace( | ||
| 2613 | char *name, | ||
| 2614 | struct attrnames **names, | ||
| 2615 | int nnames) | ||
| 2616 | { | ||
| 2617 | int i; | ||
| 2618 | |||
| 2619 | for (i = 0; i < nnames; i++) | ||
| 2620 | if (!strncmp(name, names[i]->attr_name, names[i]->attr_namelen)) | ||
| 2621 | return names[i]; | ||
| 2622 | return NULL; | ||
| 2623 | } | ||
| 2624 | |||
| 2625 | /* | ||
| 2626 | * Some checks to prevent people abusing EAs to get over quota: | ||
| 2627 | * - Don't allow modifying user EAs on devices/symlinks; | ||
| 2628 | * - Don't allow modifying user EAs if sticky bit set; | ||
| 2629 | */ | ||
| 2630 | STATIC int | ||
| 2631 | attr_user_capable( | ||
| 2632 | bhv_vnode_t *vp, | ||
| 2633 | cred_t *cred) | ||
| 2634 | { | ||
| 2635 | struct inode *inode = vn_to_inode(vp); | ||
| 2636 | |||
| 2637 | if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) | ||
| 2638 | return -EPERM; | ||
| 2639 | if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) && | ||
| 2640 | !capable(CAP_SYS_ADMIN)) | ||
| 2641 | return -EPERM; | ||
| 2642 | if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && | ||
| 2643 | (current_fsuid(cred) != inode->i_uid) && !capable(CAP_FOWNER)) | ||
| 2644 | return -EPERM; | ||
| 2645 | return 0; | ||
| 2646 | } | ||
| 2647 | |||
| 2648 | STATIC int | ||
| 2649 | attr_trusted_capable( | ||
| 2650 | bhv_vnode_t *vp, | ||
| 2651 | cred_t *cred) | ||
| 2652 | { | ||
| 2653 | struct inode *inode = vn_to_inode(vp); | ||
| 2654 | |||
| 2655 | if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) | ||
| 2656 | return -EPERM; | ||
| 2657 | if (!capable(CAP_SYS_ADMIN)) | ||
| 2658 | return -EPERM; | ||
| 2659 | return 0; | ||
| 2660 | } | ||
| 2661 | |||
| 2662 | STATIC int | ||
| 2663 | attr_system_set( | ||
| 2664 | bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) | ||
| 2665 | { | ||
| 2666 | attrnames_t *namesp; | ||
| 2667 | int error; | ||
| 2668 | |||
| 2669 | if (xflags & ATTR_CREATE) | ||
| 2670 | return -EINVAL; | ||
| 2671 | |||
| 2672 | namesp = attr_lookup_namespace(name, attr_system_names, ATTR_SYSCOUNT); | ||
| 2673 | if (!namesp) | ||
| 2674 | return -EOPNOTSUPP; | ||
| 2675 | error = namesp->attr_set(vp, name, data, size, xflags); | ||
| 2676 | if (!error) | ||
| 2677 | error = vn_revalidate(vp); | ||
| 2678 | return error; | ||
| 2679 | } | ||
| 2680 | |||
| 2681 | STATIC int | ||
| 2682 | attr_system_get( | ||
| 2683 | bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) | ||
| 2684 | { | ||
| 2685 | attrnames_t *namesp; | ||
| 2686 | |||
| 2687 | namesp = attr_lookup_namespace(name, attr_system_names, ATTR_SYSCOUNT); | ||
| 2688 | if (!namesp) | ||
| 2689 | return -EOPNOTSUPP; | ||
| 2690 | return namesp->attr_get(vp, name, data, size, xflags); | ||
| 2691 | } | ||
| 2692 | |||
| 2693 | STATIC int | ||
| 2694 | attr_system_remove( | ||
| 2695 | bhv_vnode_t *vp, char *name, int xflags) | ||
| 2696 | { | ||
| 2697 | attrnames_t *namesp; | ||
| 2698 | |||
| 2699 | namesp = attr_lookup_namespace(name, attr_system_names, ATTR_SYSCOUNT); | ||
| 2700 | if (!namesp) | ||
| 2701 | return -EOPNOTSUPP; | ||
| 2702 | return namesp->attr_remove(vp, name, xflags); | ||
| 2703 | } | ||
| 2704 | |||
| 2705 | struct attrnames attr_system = { | ||
| 2706 | .attr_name = "system.", | ||
| 2707 | .attr_namelen = sizeof("system.") - 1, | ||
| 2708 | .attr_flag = ATTR_SYSTEM, | ||
| 2709 | .attr_get = attr_system_get, | ||
| 2710 | .attr_set = attr_system_set, | ||
| 2711 | .attr_remove = attr_system_remove, | ||
| 2712 | .attr_capable = (attrcapable_t)fs_noerr, | ||
| 2713 | }; | ||
| 2714 | |||
| 2715 | struct attrnames attr_trusted = { | ||
| 2716 | .attr_name = "trusted.", | ||
| 2717 | .attr_namelen = sizeof("trusted.") - 1, | ||
| 2718 | .attr_flag = ATTR_ROOT, | ||
| 2719 | .attr_get = attr_generic_get, | ||
| 2720 | .attr_set = attr_generic_set, | ||
| 2721 | .attr_remove = attr_generic_remove, | ||
| 2722 | .attr_capable = attr_trusted_capable, | ||
| 2723 | }; | ||
| 2724 | |||
| 2725 | struct attrnames attr_secure = { | ||
| 2726 | .attr_name = "security.", | ||
| 2727 | .attr_namelen = sizeof("security.") - 1, | ||
| 2728 | .attr_flag = ATTR_SECURE, | ||
| 2729 | .attr_get = attr_generic_get, | ||
| 2730 | .attr_set = attr_generic_set, | ||
| 2731 | .attr_remove = attr_generic_remove, | ||
| 2732 | .attr_capable = (attrcapable_t)fs_noerr, | ||
| 2733 | }; | ||
| 2734 | |||
| 2735 | struct attrnames attr_user = { | ||
| 2736 | .attr_name = "user.", | ||
| 2737 | .attr_namelen = sizeof("user.") - 1, | ||
| 2738 | .attr_get = attr_generic_get, | ||
| 2739 | .attr_set = attr_generic_set, | ||
| 2740 | .attr_remove = attr_generic_remove, | ||
| 2741 | .attr_capable = attr_user_capable, | ||
| 2742 | }; | ||
| 2743 | |||
| 2744 | struct attrnames *attr_namespaces[] = | ||
| 2745 | { &attr_system, &attr_trusted, &attr_secure, &attr_user }; | ||
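The fs/xfs/xfs_attr.c hunks above make two mechanical changes: the open-coded "if ((error = xfs_attr_rolltrans(...)))" calls become xfs_trans_roll() with the error tested on its own line, and the four xfs_attr_trace_l_* wrappers stop flattening the list context into a dozen scalar arguments, handing the xfs_attr_list_context pointer to xfs_attr_trace_enter() instead. A minimal user-space sketch of that argument-consolidation pattern follows; the names attr_ctx and trace_enter are illustrative stand-ins, not the kernel symbols.

/*
 * Sketch of the consolidation applied to xfs_attr_trace_enter() above:
 * pass the context once and let the tracer read the fields it wants,
 * instead of having every caller unpack them.  Illustrative names only.
 */
#include <stdio.h>

struct attr_ctx {
	unsigned long	dp;		/* inode pointer value */
	unsigned long	hashval;	/* cursor hash */
	int		bufsize;	/* output buffer size */
	int		count;		/* entries used so far */
	int		flags;		/* caller flags */
};

/* consolidated tracer: one context pointer plus the few per-call extras */
static void trace_enter(int type, const char *where,
			const struct attr_ctx *ctx,
			unsigned long a13, unsigned long a14)
{
	printf("%d %s dp=%lx hash=%lx buf=%d cnt=%d fl=%d extra=%lx/%lx\n",
	       type, where, ctx->dp, ctx->hashval, ctx->bufsize,
	       ctx->count, ctx->flags, a13, a14);
}

int main(void)
{
	struct attr_ctx ctx = { 0x1000, 0xdeadbeef, 4096, 3, 0 };

	/* call sites shrink to the context plus their per-call values */
	trace_enter(1, "attr_list", &ctx, 42, 0);
	return 0;
}

The benefit matches the patch: call sites get shorter, and a new context field only needs to be picked up in one place inside the tracer.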
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index 6cfc9384fe35..fb3b2a68b9b9 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h | |||
| @@ -18,9 +18,11 @@ | |||
| 18 | #ifndef __XFS_ATTR_H__ | 18 | #ifndef __XFS_ATTR_H__ |
| 19 | #define __XFS_ATTR_H__ | 19 | #define __XFS_ATTR_H__ |
| 20 | 20 | ||
| 21 | struct xfs_inode; | ||
| 22 | struct xfs_da_args; | ||
| 23 | struct xfs_attr_list_context; | ||
| 24 | |||
| 21 | /* | 25 | /* |
| 22 | * xfs_attr.h | ||
| 23 | * | ||
| 24 | * Large attribute lists are structured around Btrees where all the data | 26 | * Large attribute lists are structured around Btrees where all the data |
| 25 | * elements are in the leaf nodes. Attribute names are hashed into an int, | 27 | * elements are in the leaf nodes. Attribute names are hashed into an int, |
| 26 | * then that int is used as the index into the Btree. Since the hashval | 28 | * then that int is used as the index into the Btree. Since the hashval |
| @@ -35,35 +37,6 @@ | |||
| 35 | * External interfaces | 37 | * External interfaces |
| 36 | *========================================================================*/ | 38 | *========================================================================*/ |
| 37 | 39 | ||
| 38 | struct cred; | ||
| 39 | struct xfs_attr_list_context; | ||
| 40 | |||
| 41 | typedef int (*attrset_t)(bhv_vnode_t *, char *, void *, size_t, int); | ||
| 42 | typedef int (*attrget_t)(bhv_vnode_t *, char *, void *, size_t, int); | ||
| 43 | typedef int (*attrremove_t)(bhv_vnode_t *, char *, int); | ||
| 44 | typedef int (*attrexists_t)(bhv_vnode_t *); | ||
| 45 | typedef int (*attrcapable_t)(bhv_vnode_t *, struct cred *); | ||
| 46 | |||
| 47 | typedef struct attrnames { | ||
| 48 | char * attr_name; | ||
| 49 | unsigned int attr_namelen; | ||
| 50 | unsigned int attr_flag; | ||
| 51 | attrget_t attr_get; | ||
| 52 | attrset_t attr_set; | ||
| 53 | attrremove_t attr_remove; | ||
| 54 | attrexists_t attr_exists; | ||
| 55 | attrcapable_t attr_capable; | ||
| 56 | } attrnames_t; | ||
| 57 | |||
| 58 | #define ATTR_NAMECOUNT 4 | ||
| 59 | extern struct attrnames attr_user; | ||
| 60 | extern struct attrnames attr_secure; | ||
| 61 | extern struct attrnames attr_system; | ||
| 62 | extern struct attrnames attr_trusted; | ||
| 63 | extern struct attrnames *attr_namespaces[ATTR_NAMECOUNT]; | ||
| 64 | |||
| 65 | extern attrnames_t *attr_lookup_namespace(char *, attrnames_t **, int); | ||
| 66 | extern int attr_generic_list(bhv_vnode_t *, void *, size_t, int, ssize_t *); | ||
| 67 | 40 | ||
| 68 | #define ATTR_DONTFOLLOW 0x0001 /* -- unused, from IRIX -- */ | 41 | #define ATTR_DONTFOLLOW 0x0001 /* -- unused, from IRIX -- */ |
| 69 | #define ATTR_ROOT 0x0002 /* use attrs in root (trusted) namespace */ | 42 | #define ATTR_ROOT 0x0002 /* use attrs in root (trusted) namespace */ |
| @@ -71,16 +44,9 @@ extern int attr_generic_list(bhv_vnode_t *, void *, size_t, int, ssize_t *); | |||
| 71 | #define ATTR_SECURE 0x0008 /* use attrs in security namespace */ | 44 | #define ATTR_SECURE 0x0008 /* use attrs in security namespace */ |
| 72 | #define ATTR_CREATE 0x0010 /* pure create: fail if attr already exists */ | 45 | #define ATTR_CREATE 0x0010 /* pure create: fail if attr already exists */ |
| 73 | #define ATTR_REPLACE 0x0020 /* pure set: fail if attr does not exist */ | 46 | #define ATTR_REPLACE 0x0020 /* pure set: fail if attr does not exist */ |
| 74 | #define ATTR_SYSTEM 0x0100 /* use attrs in system (pseudo) namespace */ | ||
| 75 | 47 | ||
| 76 | #define ATTR_KERNACCESS 0x0400 /* [kernel] iaccess, inode held io-locked */ | ||
| 77 | #define ATTR_KERNOTIME 0x1000 /* [kernel] don't update inode timestamps */ | 48 | #define ATTR_KERNOTIME 0x1000 /* [kernel] don't update inode timestamps */ |
| 78 | #define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */ | 49 | #define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */ |
| 79 | #define ATTR_KERNAMELS 0x4000 /* [kernel] list attr names (simple list) */ | ||
| 80 | |||
| 81 | #define ATTR_KERNORMALS 0x0800 /* [kernel] normal attr list: user+secure */ | ||
| 82 | #define ATTR_KERNROOTLS 0x8000 /* [kernel] include root in the attr list */ | ||
| 83 | #define ATTR_KERNFULLS (ATTR_KERNORMALS|ATTR_KERNROOTLS) | ||
| 84 | 50 | ||
| 85 | /* | 51 | /* |
| 86 | * The maximum size (into the kernel or returned from the kernel) of an | 52 | * The maximum size (into the kernel or returned from the kernel) of an |
| @@ -119,22 +85,6 @@ typedef struct attrlist_ent { /* data from attr_list() */ | |||
| 119 | &((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ]) | 85 | &((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ]) |
| 120 | 86 | ||
| 121 | /* | 87 | /* |
| 122 | * Multi-attribute operation vector. | ||
| 123 | */ | ||
| 124 | typedef struct attr_multiop { | ||
| 125 | int am_opcode; /* operation to perform (ATTR_OP_GET, etc.) */ | ||
| 126 | int am_error; /* [out arg] result of this sub-op (an errno) */ | ||
| 127 | char *am_attrname; /* attribute name to work with */ | ||
| 128 | char *am_attrvalue; /* [in/out arg] attribute value (raw bytes) */ | ||
| 129 | int am_length; /* [in/out arg] length of value */ | ||
| 130 | int am_flags; /* bitwise OR of attr API flags defined above */ | ||
| 131 | } attr_multiop_t; | ||
| 132 | |||
| 133 | #define ATTR_OP_GET 1 /* return the indicated attr's value */ | ||
| 134 | #define ATTR_OP_SET 2 /* set/create the indicated attr/value pair */ | ||
| 135 | #define ATTR_OP_REMOVE 3 /* remove the indicated attr */ | ||
| 136 | |||
| 137 | /* | ||
| 138 | * Kernel-internal version of the attrlist cursor. | 88 | * Kernel-internal version of the attrlist cursor. |
| 139 | */ | 89 | */ |
| 140 | typedef struct attrlist_cursor_kern { | 90 | typedef struct attrlist_cursor_kern { |
| @@ -148,20 +98,41 @@ typedef struct attrlist_cursor_kern { | |||
| 148 | 98 | ||
| 149 | 99 | ||
| 150 | /*======================================================================== | 100 | /*======================================================================== |
| 151 | * Function prototypes for the kernel. | 101 | * Structure used to pass context around among the routines. |
| 152 | *========================================================================*/ | 102 | *========================================================================*/ |
| 153 | 103 | ||
| 154 | struct xfs_inode; | 104 | |
| 155 | struct attrlist_cursor_kern; | 105 | typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int, |
| 156 | struct xfs_da_args; | 106 | char *, int, int, char *); |
| 107 | |||
| 108 | typedef struct xfs_attr_list_context { | ||
| 109 | struct xfs_inode *dp; /* inode */ | ||
| 110 | struct attrlist_cursor_kern *cursor; /* position in list */ | ||
| 111 | char *alist; /* output buffer */ | ||
| 112 | int seen_enough; /* T/F: seen enough of list? */ | ||
| 113 | ssize_t count; /* num used entries */ | ||
| 114 | int dupcnt; /* count dup hashvals seen */ | ||
| 115 | int bufsize; /* total buffer size */ | ||
| 116 | int firstu; /* first used byte in buffer */ | ||
| 117 | int flags; /* from VOP call */ | ||
| 118 | int resynch; /* T/F: resynch with cursor */ | ||
| 119 | int put_value; /* T/F: need value for listent */ | ||
| 120 | put_listent_func_t put_listent; /* list output fmt function */ | ||
| 121 | int index; /* index into output buffer */ | ||
| 122 | } xfs_attr_list_context_t; | ||
| 123 | |||
| 124 | |||
| 125 | /*======================================================================== | ||
| 126 | * Function prototypes for the kernel. | ||
| 127 | *========================================================================*/ | ||
| 157 | 128 | ||
| 158 | /* | 129 | /* |
| 159 | * Overall external interface routines. | 130 | * Overall external interface routines. |
| 160 | */ | 131 | */ |
| 132 | int xfs_attr_calc_size(struct xfs_inode *, int, int, int *); | ||
| 161 | int xfs_attr_inactive(struct xfs_inode *dp); | 133 | int xfs_attr_inactive(struct xfs_inode *dp); |
| 162 | |||
| 163 | int xfs_attr_shortform_getvalue(struct xfs_da_args *); | ||
| 164 | int xfs_attr_fetch(struct xfs_inode *, struct xfs_name *, char *, int *, int); | 134 | int xfs_attr_fetch(struct xfs_inode *, struct xfs_name *, char *, int *, int); |
| 165 | int xfs_attr_rmtval_get(struct xfs_da_args *args); | 135 | int xfs_attr_rmtval_get(struct xfs_da_args *args); |
| 136 | int xfs_attr_list_int(struct xfs_attr_list_context *); | ||
| 166 | 137 | ||
| 167 | #endif /* __XFS_ATTR_H__ */ | 138 | #endif /* __XFS_ATTR_H__ */ |
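fs/xfs/xfs_attr.h now owns xfs_attr_list_context_t and the put_listent_func_t hook that the attribute-list walker calls once per entry, passing (flags, name, namelen, valuelen, value) to be formatted into the caller's buffer. The sketch below shows the general shape of such a callback in user space; list_ctx and put_names_only are invented stand-ins, and the real kernel structure carries more bookkeeping than shown here.

/*
 * Minimal sketch of the put_listent-style callback declared above:
 * the walker hands each attribute to a formatting hook supplied by
 * the caller, which appends to a buffer and can ask the walk to stop.
 * Simplified stand-in types, not the kernel API.
 */
#include <stdio.h>
#include <string.h>

struct list_ctx;
typedef int (*put_listent_fn)(struct list_ctx *, int flags,
			      const char *name, int namelen,
			      int valuelen, const char *value);

struct list_ctx {
	char		*buf;		/* output buffer */
	int		bufsize;	/* total buffer size */
	int		firstu;		/* first unused byte */
	int		seen_enough;	/* T/F: stop the walk */
	put_listent_fn	put_listent;	/* list output fmt function */
};

/* one possible hook: append "name\n", stop when the buffer is full */
static int put_names_only(struct list_ctx *ctx, int flags,
			  const char *name, int namelen,
			  int valuelen, const char *value)
{
	if (ctx->firstu + namelen + 1 > ctx->bufsize) {
		ctx->seen_enough = 1;
		return 1;			/* tell the walker to stop */
	}
	memcpy(ctx->buf + ctx->firstu, name, namelen);
	ctx->buf[ctx->firstu + namelen] = '\n';
	ctx->firstu += namelen + 1;
	return 0;
}

int main(void)
{
	char out[64] = { 0 };
	struct list_ctx ctx = { out, sizeof(out), 0, 0, put_names_only };

	ctx.put_listent(&ctx, 0, "user.comment", 12, 5, "hello");
	ctx.put_listent(&ctx, 0, "user.mime", 9, 4, "text");
	fputs(out, stdout);
	return 0;
}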
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 303d41e4217b..79da6b2ea99e 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c | |||
| @@ -94,13 +94,6 @@ STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); | |||
| 94 | * Namespace helper routines | 94 | * Namespace helper routines |
| 95 | *========================================================================*/ | 95 | *========================================================================*/ |
| 96 | 96 | ||
| 97 | STATIC_INLINE attrnames_t * | ||
| 98 | xfs_attr_flags_namesp(int flags) | ||
| 99 | { | ||
| 100 | return ((flags & XFS_ATTR_SECURE) ? &attr_secure: | ||
| 101 | ((flags & XFS_ATTR_ROOT) ? &attr_trusted : &attr_user)); | ||
| 102 | } | ||
| 103 | |||
| 104 | /* | 97 | /* |
| 105 | * If namespace bits don't match return 0. | 98 | * If namespace bits don't match return 0. |
| 106 | * If all match then return 1. | 99 | * If all match then return 1. |
| @@ -111,25 +104,6 @@ xfs_attr_namesp_match(int arg_flags, int ondisk_flags) | |||
| 111 | return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags); | 104 | return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags); |
| 112 | } | 105 | } |
| 113 | 106 | ||
| 114 | /* | ||
| 115 | * If namespace bits don't match and we don't have an override for it | ||
| 116 | * then return 0. | ||
| 117 | * If all match or are overridable then return 1. | ||
| 118 | */ | ||
| 119 | STATIC_INLINE int | ||
| 120 | xfs_attr_namesp_match_overrides(int arg_flags, int ondisk_flags) | ||
| 121 | { | ||
| 122 | if (((arg_flags & ATTR_SECURE) == 0) != | ||
| 123 | ((ondisk_flags & XFS_ATTR_SECURE) == 0) && | ||
| 124 | !(arg_flags & ATTR_KERNORMALS)) | ||
| 125 | return 0; | ||
| 126 | if (((arg_flags & ATTR_ROOT) == 0) != | ||
| 127 | ((ondisk_flags & XFS_ATTR_ROOT) == 0) && | ||
| 128 | !(arg_flags & ATTR_KERNROOTLS)) | ||
| 129 | return 0; | ||
| 130 | return 1; | ||
| 131 | } | ||
| 132 | |||
| 133 | 107 | ||
| 134 | /*======================================================================== | 108 | /*======================================================================== |
| 135 | * External routines when attribute fork size < XFS_LITINO(mp). | 109 | * External routines when attribute fork size < XFS_LITINO(mp). |
| @@ -369,9 +343,10 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) | |||
| 369 | * Fix up the start offset of the attribute fork | 343 | * Fix up the start offset of the attribute fork |
| 370 | */ | 344 | */ |
| 371 | totsize -= size; | 345 | totsize -= size; |
| 372 | if (totsize == sizeof(xfs_attr_sf_hdr_t) && !args->addname && | 346 | if (totsize == sizeof(xfs_attr_sf_hdr_t) && |
| 373 | (mp->m_flags & XFS_MOUNT_ATTR2) && | 347 | !(args->op_flags & XFS_DA_OP_ADDNAME) && |
| 374 | (dp->i_d.di_format != XFS_DINODE_FMT_BTREE)) { | 348 | (mp->m_flags & XFS_MOUNT_ATTR2) && |
| 349 | (dp->i_d.di_format != XFS_DINODE_FMT_BTREE)) { | ||
| 375 | /* | 350 | /* |
| 376 | * Last attribute now removed, revert to original | 351 | * Last attribute now removed, revert to original |
| 377 | * inode format making all literal area available | 352 | * inode format making all literal area available |
| @@ -389,9 +364,10 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) | |||
| 389 | xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); | 364 | xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); |
| 390 | dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); | 365 | dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); |
| 391 | ASSERT(dp->i_d.di_forkoff); | 366 | ASSERT(dp->i_d.di_forkoff); |
| 392 | ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || args->addname || | 367 | ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || |
| 393 | !(mp->m_flags & XFS_MOUNT_ATTR2) || | 368 | (args->op_flags & XFS_DA_OP_ADDNAME) || |
| 394 | dp->i_d.di_format == XFS_DINODE_FMT_BTREE); | 369 | !(mp->m_flags & XFS_MOUNT_ATTR2) || |
| 370 | dp->i_d.di_format == XFS_DINODE_FMT_BTREE); | ||
| 395 | dp->i_afp->if_ext_max = | 371 | dp->i_afp->if_ext_max = |
| 396 | XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); | 372 | XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); |
| 397 | dp->i_df.if_ext_max = | 373 | dp->i_df.if_ext_max = |
| @@ -531,7 +507,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) | |||
| 531 | nargs.total = args->total; | 507 | nargs.total = args->total; |
| 532 | nargs.whichfork = XFS_ATTR_FORK; | 508 | nargs.whichfork = XFS_ATTR_FORK; |
| 533 | nargs.trans = args->trans; | 509 | nargs.trans = args->trans; |
| 534 | nargs.oknoent = 1; | 510 | nargs.op_flags = XFS_DA_OP_OKNOENT; |
| 535 | 511 | ||
| 536 | sfe = &sf->list[0]; | 512 | sfe = &sf->list[0]; |
| 537 | for (i = 0; i < sf->hdr.count; i++) { | 513 | for (i = 0; i < sf->hdr.count; i++) { |
| @@ -555,7 +531,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) | |||
| 555 | out: | 531 | out: |
| 556 | if(bp) | 532 | if(bp) |
| 557 | xfs_da_buf_done(bp); | 533 | xfs_da_buf_done(bp); |
| 558 | kmem_free(tmpbuffer, size); | 534 | kmem_free(tmpbuffer); |
| 559 | return(error); | 535 | return(error); |
| 560 | } | 536 | } |
| 561 | 537 | ||
| @@ -624,15 +600,8 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) | |||
| 624 | (XFS_ISRESET_CURSOR(cursor) && | 600 | (XFS_ISRESET_CURSOR(cursor) && |
| 625 | (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) { | 601 | (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) { |
| 626 | for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { | 602 | for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { |
| 627 | attrnames_t *namesp; | ||
| 628 | |||
| 629 | if (!xfs_attr_namesp_match_overrides(context->flags, sfe->flags)) { | ||
| 630 | sfe = XFS_ATTR_SF_NEXTENTRY(sfe); | ||
| 631 | continue; | ||
| 632 | } | ||
| 633 | namesp = xfs_attr_flags_namesp(sfe->flags); | ||
| 634 | error = context->put_listent(context, | 603 | error = context->put_listent(context, |
| 635 | namesp, | 604 | sfe->flags, |
| 636 | (char *)sfe->nameval, | 605 | (char *)sfe->nameval, |
| 637 | (int)sfe->namelen, | 606 | (int)sfe->namelen, |
| 638 | (int)sfe->valuelen, | 607 | (int)sfe->valuelen, |
| @@ -676,13 +645,10 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) | |||
| 676 | XFS_ERRLEVEL_LOW, | 645 | XFS_ERRLEVEL_LOW, |
| 677 | context->dp->i_mount, sfe); | 646 | context->dp->i_mount, sfe); |
| 678 | xfs_attr_trace_l_c("sf corrupted", context); | 647 | xfs_attr_trace_l_c("sf corrupted", context); |
| 679 | kmem_free(sbuf, sbsize); | 648 | kmem_free(sbuf); |
| 680 | return XFS_ERROR(EFSCORRUPTED); | 649 | return XFS_ERROR(EFSCORRUPTED); |
| 681 | } | 650 | } |
| 682 | if (!xfs_attr_namesp_match_overrides(context->flags, sfe->flags)) { | 651 | |
| 683 | sfe = XFS_ATTR_SF_NEXTENTRY(sfe); | ||
| 684 | continue; | ||
| 685 | } | ||
| 686 | sbp->entno = i; | 652 | sbp->entno = i; |
| 687 | sbp->hash = xfs_da_hashname((char *)sfe->nameval, sfe->namelen); | 653 | sbp->hash = xfs_da_hashname((char *)sfe->nameval, sfe->namelen); |
| 688 | sbp->name = (char *)sfe->nameval; | 654 | sbp->name = (char *)sfe->nameval; |
| @@ -717,7 +683,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) | |||
| 717 | } | 683 | } |
| 718 | } | 684 | } |
| 719 | if (i == nsbuf) { | 685 | if (i == nsbuf) { |
| 720 | kmem_free(sbuf, sbsize); | 686 | kmem_free(sbuf); |
| 721 | xfs_attr_trace_l_c("blk end", context); | 687 | xfs_attr_trace_l_c("blk end", context); |
| 722 | return(0); | 688 | return(0); |
| 723 | } | 689 | } |
| @@ -726,16 +692,12 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) | |||
| 726 | * Loop putting entries into the user buffer. | 692 | * Loop putting entries into the user buffer. |
| 727 | */ | 693 | */ |
| 728 | for ( ; i < nsbuf; i++, sbp++) { | 694 | for ( ; i < nsbuf; i++, sbp++) { |
| 729 | attrnames_t *namesp; | ||
| 730 | |||
| 731 | namesp = xfs_attr_flags_namesp(sbp->flags); | ||
| 732 | |||
| 733 | if (cursor->hashval != sbp->hash) { | 695 | if (cursor->hashval != sbp->hash) { |
| 734 | cursor->hashval = sbp->hash; | 696 | cursor->hashval = sbp->hash; |
| 735 | cursor->offset = 0; | 697 | cursor->offset = 0; |
| 736 | } | 698 | } |
| 737 | error = context->put_listent(context, | 699 | error = context->put_listent(context, |
| 738 | namesp, | 700 | sbp->flags, |
| 739 | sbp->name, | 701 | sbp->name, |
| 740 | sbp->namelen, | 702 | sbp->namelen, |
| 741 | sbp->valuelen, | 703 | sbp->valuelen, |
| @@ -747,7 +709,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) | |||
| 747 | cursor->offset++; | 709 | cursor->offset++; |
| 748 | } | 710 | } |
| 749 | 711 | ||
| 750 | kmem_free(sbuf, sbsize); | 712 | kmem_free(sbuf); |
| 751 | xfs_attr_trace_l_c("sf E-O-F", context); | 713 | xfs_attr_trace_l_c("sf E-O-F", context); |
| 752 | return(0); | 714 | return(0); |
| 753 | } | 715 | } |
| @@ -853,7 +815,7 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff) | |||
| 853 | nargs.total = args->total; | 815 | nargs.total = args->total; |
| 854 | nargs.whichfork = XFS_ATTR_FORK; | 816 | nargs.whichfork = XFS_ATTR_FORK; |
| 855 | nargs.trans = args->trans; | 817 | nargs.trans = args->trans; |
| 856 | nargs.oknoent = 1; | 818 | nargs.op_flags = XFS_DA_OP_OKNOENT; |
| 857 | entry = &leaf->entries[0]; | 819 | entry = &leaf->entries[0]; |
| 858 | for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { | 820 | for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { |
| 859 | if (entry->flags & XFS_ATTR_INCOMPLETE) | 821 | if (entry->flags & XFS_ATTR_INCOMPLETE) |
| @@ -873,7 +835,7 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff) | |||
| 873 | error = 0; | 835 | error = 0; |
| 874 | 836 | ||
| 875 | out: | 837 | out: |
| 876 | kmem_free(tmpbuffer, XFS_LBSIZE(dp->i_mount)); | 838 | kmem_free(tmpbuffer); |
| 877 | return(error); | 839 | return(error); |
| 878 | } | 840 | } |
| 879 | 841 | ||
| @@ -1155,7 +1117,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) | |||
| 1155 | entry->hashval = cpu_to_be32(args->hashval); | 1117 | entry->hashval = cpu_to_be32(args->hashval); |
| 1156 | entry->flags = tmp ? XFS_ATTR_LOCAL : 0; | 1118 | entry->flags = tmp ? XFS_ATTR_LOCAL : 0; |
| 1157 | entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags); | 1119 | entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags); |
| 1158 | if (args->rename) { | 1120 | if (args->op_flags & XFS_DA_OP_RENAME) { |
| 1159 | entry->flags |= XFS_ATTR_INCOMPLETE; | 1121 | entry->flags |= XFS_ATTR_INCOMPLETE; |
| 1160 | if ((args->blkno2 == args->blkno) && | 1122 | if ((args->blkno2 == args->blkno) && |
| 1161 | (args->index2 <= args->index)) { | 1123 | (args->index2 <= args->index)) { |
| @@ -1271,7 +1233,7 @@ xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp) | |||
| 1271 | be16_to_cpu(hdr_s->count), mp); | 1233 | be16_to_cpu(hdr_s->count), mp); |
| 1272 | xfs_da_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1); | 1234 | xfs_da_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1); |
| 1273 | 1235 | ||
| 1274 | kmem_free(tmpbuffer, XFS_LBSIZE(mp)); | 1236 | kmem_free(tmpbuffer); |
| 1275 | } | 1237 | } |
| 1276 | 1238 | ||
| 1277 | /* | 1239 | /* |
| @@ -1921,7 +1883,7 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, | |||
| 1921 | be16_to_cpu(drop_hdr->count), mp); | 1883 | be16_to_cpu(drop_hdr->count), mp); |
| 1922 | } | 1884 | } |
| 1923 | memcpy((char *)save_leaf, (char *)tmp_leaf, state->blocksize); | 1885 | memcpy((char *)save_leaf, (char *)tmp_leaf, state->blocksize); |
| 1924 | kmem_free(tmpbuffer, state->blocksize); | 1886 | kmem_free(tmpbuffer); |
| 1925 | } | 1887 | } |
| 1926 | 1888 | ||
| 1927 | xfs_da_log_buf(state->args->trans, save_blk->bp, 0, | 1889 | xfs_da_log_buf(state->args->trans, save_blk->bp, 0, |
| @@ -2400,8 +2362,6 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) | |||
| 2400 | */ | 2362 | */ |
| 2401 | retval = 0; | 2363 | retval = 0; |
| 2402 | for ( ; (i < be16_to_cpu(leaf->hdr.count)); entry++, i++) { | 2364 | for ( ; (i < be16_to_cpu(leaf->hdr.count)); entry++, i++) { |
| 2403 | attrnames_t *namesp; | ||
| 2404 | |||
| 2405 | if (be32_to_cpu(entry->hashval) != cursor->hashval) { | 2365 | if (be32_to_cpu(entry->hashval) != cursor->hashval) { |
| 2406 | cursor->hashval = be32_to_cpu(entry->hashval); | 2366 | cursor->hashval = be32_to_cpu(entry->hashval); |
| 2407 | cursor->offset = 0; | 2367 | cursor->offset = 0; |
| @@ -2409,17 +2369,13 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) | |||
| 2409 | 2369 | ||
| 2410 | if (entry->flags & XFS_ATTR_INCOMPLETE) | 2370 | if (entry->flags & XFS_ATTR_INCOMPLETE) |
| 2411 | continue; /* skip incomplete entries */ | 2371 | continue; /* skip incomplete entries */ |
| 2412 | if (!xfs_attr_namesp_match_overrides(context->flags, entry->flags)) | ||
| 2413 | continue; | ||
| 2414 | |||
| 2415 | namesp = xfs_attr_flags_namesp(entry->flags); | ||
| 2416 | 2372 | ||
| 2417 | if (entry->flags & XFS_ATTR_LOCAL) { | 2373 | if (entry->flags & XFS_ATTR_LOCAL) { |
| 2418 | xfs_attr_leaf_name_local_t *name_loc = | 2374 | xfs_attr_leaf_name_local_t *name_loc = |
| 2419 | XFS_ATTR_LEAF_NAME_LOCAL(leaf, i); | 2375 | XFS_ATTR_LEAF_NAME_LOCAL(leaf, i); |
| 2420 | 2376 | ||
| 2421 | retval = context->put_listent(context, | 2377 | retval = context->put_listent(context, |
| 2422 | namesp, | 2378 | entry->flags, |
| 2423 | (char *)name_loc->nameval, | 2379 | (char *)name_loc->nameval, |
| 2424 | (int)name_loc->namelen, | 2380 | (int)name_loc->namelen, |
| 2425 | be16_to_cpu(name_loc->valuelen), | 2381 | be16_to_cpu(name_loc->valuelen), |
| @@ -2446,16 +2402,15 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) | |||
| 2446 | if (retval) | 2402 | if (retval) |
| 2447 | return retval; | 2403 | return retval; |
| 2448 | retval = context->put_listent(context, | 2404 | retval = context->put_listent(context, |
| 2449 | namesp, | 2405 | entry->flags, |
| 2450 | (char *)name_rmt->name, | 2406 | (char *)name_rmt->name, |
| 2451 | (int)name_rmt->namelen, | 2407 | (int)name_rmt->namelen, |
| 2452 | valuelen, | 2408 | valuelen, |
| 2453 | (char*)args.value); | 2409 | (char*)args.value); |
| 2454 | kmem_free(args.value, valuelen); | 2410 | kmem_free(args.value); |
| 2455 | } | 2411 | } else { |
| 2456 | else { | ||
| 2457 | retval = context->put_listent(context, | 2412 | retval = context->put_listent(context, |
| 2458 | namesp, | 2413 | entry->flags, |
| 2459 | (char *)name_rmt->name, | 2414 | (char *)name_rmt->name, |
| 2460 | (int)name_rmt->namelen, | 2415 | (int)name_rmt->namelen, |
| 2461 | valuelen, | 2416 | valuelen, |
| @@ -2543,9 +2498,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) | |||
| 2543 | /* | 2498 | /* |
| 2544 | * Commit the flag value change and start the next trans in series. | 2499 | * Commit the flag value change and start the next trans in series. |
| 2545 | */ | 2500 | */ |
| 2546 | error = xfs_attr_rolltrans(&args->trans, args->dp); | 2501 | return xfs_trans_roll(&args->trans, args->dp); |
| 2547 | |||
| 2548 | return(error); | ||
| 2549 | } | 2502 | } |
| 2550 | 2503 | ||
| 2551 | /* | 2504 | /* |
| @@ -2592,9 +2545,7 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args) | |||
| 2592 | /* | 2545 | /* |
| 2593 | * Commit the flag value change and start the next trans in series. | 2546 | * Commit the flag value change and start the next trans in series. |
| 2594 | */ | 2547 | */ |
| 2595 | error = xfs_attr_rolltrans(&args->trans, args->dp); | 2548 | return xfs_trans_roll(&args->trans, args->dp); |
| 2596 | |||
| 2597 | return(error); | ||
| 2598 | } | 2549 | } |
| 2599 | 2550 | ||
| 2600 | /* | 2551 | /* |
| @@ -2710,7 +2661,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) | |||
| 2710 | /* | 2661 | /* |
| 2711 | * Commit the flag value change and start the next trans in series. | 2662 | * Commit the flag value change and start the next trans in series. |
| 2712 | */ | 2663 | */ |
| 2713 | error = xfs_attr_rolltrans(&args->trans, args->dp); | 2664 | error = xfs_trans_roll(&args->trans, args->dp); |
| 2714 | 2665 | ||
| 2715 | return(error); | 2666 | return(error); |
| 2716 | } | 2667 | } |
| @@ -2768,7 +2719,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) | |||
| 2768 | /* | 2719 | /* |
| 2769 | * Commit the invalidate and start the next transaction. | 2720 | * Commit the invalidate and start the next transaction. |
| 2770 | */ | 2721 | */ |
| 2771 | error = xfs_attr_rolltrans(trans, dp); | 2722 | error = xfs_trans_roll(trans, dp); |
| 2772 | 2723 | ||
| 2773 | return (error); | 2724 | return (error); |
| 2774 | } | 2725 | } |
| @@ -2870,7 +2821,8 @@ xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp, | |||
| 2870 | /* | 2821 | /* |
| 2871 | * Atomically commit the whole invalidate stuff. | 2822 | * Atomically commit the whole invalidate stuff. |
| 2872 | */ | 2823 | */ |
| 2873 | if ((error = xfs_attr_rolltrans(trans, dp))) | 2824 | error = xfs_trans_roll(trans, dp); |
| 2825 | if (error) | ||
| 2874 | return (error); | 2826 | return (error); |
| 2875 | } | 2827 | } |
| 2876 | 2828 | ||
| @@ -2954,7 +2906,7 @@ xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp) | |||
| 2954 | error = tmp; /* save only the 1st errno */ | 2906 | error = tmp; /* save only the 1st errno */ |
| 2955 | } | 2907 | } |
| 2956 | 2908 | ||
| 2957 | kmem_free((xfs_caddr_t)list, size); | 2909 | kmem_free((xfs_caddr_t)list); |
| 2958 | return(error); | 2910 | return(error); |
| 2959 | } | 2911 | } |
| 2960 | 2912 | ||
| @@ -3009,7 +2961,8 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, | |||
| 3009 | /* | 2961 | /* |
| 3010 | * Roll to next transaction. | 2962 | * Roll to next transaction. |
| 3011 | */ | 2963 | */ |
| 3012 | if ((error = xfs_attr_rolltrans(trans, dp))) | 2964 | error = xfs_trans_roll(trans, dp); |
| 2965 | if (error) | ||
| 3013 | return (error); | 2966 | return (error); |
| 3014 | } | 2967 | } |
| 3015 | 2968 | ||
| @@ -3019,60 +2972,3 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, | |||
| 3019 | 2972 | ||
| 3020 | return(0); | 2973 | return(0); |
| 3021 | } | 2974 | } |
| 3022 | |||
| 3023 | |||
| 3024 | /* | ||
| 3025 | * Roll from one trans in the sequence of PERMANENT transactions to the next. | ||
| 3026 | */ | ||
| 3027 | int | ||
| 3028 | xfs_attr_rolltrans(xfs_trans_t **transp, xfs_inode_t *dp) | ||
| 3029 | { | ||
| 3030 | xfs_trans_t *trans; | ||
| 3031 | unsigned int logres, count; | ||
| 3032 | int error; | ||
| 3033 | |||
| 3034 | /* | ||
| 3035 | * Ensure that the inode is always logged. | ||
| 3036 | */ | ||
| 3037 | trans = *transp; | ||
| 3038 | xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); | ||
| 3039 | |||
| 3040 | /* | ||
| 3041 | * Copy the critical parameters from one trans to the next. | ||
| 3042 | */ | ||
| 3043 | logres = trans->t_log_res; | ||
| 3044 | count = trans->t_log_count; | ||
| 3045 | *transp = xfs_trans_dup(trans); | ||
| 3046 | |||
| 3047 | /* | ||
| 3048 | * Commit the current transaction. | ||
| 3049 | * If this commit failed, then it'd just unlock those items that | ||
| 3050 | * are not marked ihold. That also means that a filesystem shutdown | ||
| 3051 | * is in progress. The caller takes the responsibility to cancel | ||
| 3052 | * the duplicate transaction that gets returned. | ||
| 3053 | */ | ||
| 3054 | if ((error = xfs_trans_commit(trans, 0))) | ||
| 3055 | return (error); | ||
| 3056 | |||
| 3057 | trans = *transp; | ||
| 3058 | |||
| 3059 | /* | ||
| 3060 | * Reserve space in the log for the next transaction. | ||

| 3061 | * This also pushes items in the "AIL", the list of logged items, | ||
| 3062 | * out to disk if they are taking up space at the tail of the log | ||
| 3063 | * that we want to use. This requires that either nothing be locked | ||
| 3064 | * across this call, or that anything that is locked be logged in | ||
| 3065 | * the prior and the next transactions. | ||
| 3066 | */ | ||
| 3067 | error = xfs_trans_reserve(trans, 0, logres, 0, | ||
| 3068 | XFS_TRANS_PERM_LOG_RES, count); | ||
| 3069 | /* | ||
| 3070 | * Ensure that the inode is in the new transaction and locked. | ||
| 3071 | */ | ||
| 3072 | if (!error) { | ||
| 3073 | xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL); | ||
| 3074 | xfs_trans_ihold(trans, dp); | ||
| 3075 | } | ||
| 3076 | return (error); | ||
| 3077 | |||
| 3078 | } | ||
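The long removal at the end of fs/xfs/xfs_attr_leaf.c drops the file-local xfs_attr_rolltrans() in favour of the shared xfs_trans_roll(). The sequence the removed body performed is worth keeping in mind when reading the converted call sites; the sketch below is a condensed reconstruction of that removed code and only presumably matches what the shared helper does, so treat it as illustration rather than the fs/xfs implementation.

/*
 * Condensed from the xfs_attr_rolltrans() body removed above: log the
 * inode, duplicate the transaction, commit the old one, re-reserve log
 * space, and rejoin the inode to the new transaction.  Kernel-style
 * sketch, not standalone code.
 */
int
attr_rolltrans_sketch(xfs_trans_t **transp, xfs_inode_t *dp)
{
	xfs_trans_t	*trans = *transp;
	unsigned int	logres, count;
	int		error;

	/* ensure the inode core is logged in the outgoing transaction */
	xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);

	/* duplicate first so the reservation parameters carry over */
	logres = trans->t_log_res;
	count = trans->t_log_count;
	*transp = xfs_trans_dup(trans);

	/* commit the old transaction; caller cancels the dup on failure */
	error = xfs_trans_commit(trans, 0);
	if (error)
		return error;

	/* reserve log space for the next transaction in the chain */
	trans = *transp;
	error = xfs_trans_reserve(trans, 0, logres, 0,
				  XFS_TRANS_PERM_LOG_RES, count);
	if (error)
		return error;

	/* rejoin and hold the inode so it stays locked across the roll */
	xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
	xfs_trans_ihold(trans, dp);
	return 0;
}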
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index 040f732ce1e2..83e9af417ca2 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h | |||
| @@ -30,7 +30,7 @@ | |||
| 30 | 30 | ||
| 31 | struct attrlist; | 31 | struct attrlist; |
| 32 | struct attrlist_cursor_kern; | 32 | struct attrlist_cursor_kern; |
| 33 | struct attrnames; | 33 | struct xfs_attr_list_context; |
| 34 | struct xfs_dabuf; | 34 | struct xfs_dabuf; |
| 35 | struct xfs_da_args; | 35 | struct xfs_da_args; |
| 36 | struct xfs_da_state; | 36 | struct xfs_da_state; |
| @@ -204,33 +204,6 @@ static inline int xfs_attr_leaf_entsize_local_max(int bsize) | |||
| 204 | return (((bsize) >> 1) + ((bsize) >> 2)); | 204 | return (((bsize) >> 1) + ((bsize) >> 2)); |
| 205 | } | 205 | } |
| 206 | 206 | ||
| 207 | |||
| 208 | /*======================================================================== | ||
| 209 | * Structure used to pass context around among the routines. | ||
| 210 | *========================================================================*/ | ||
| 211 | |||
| 212 | |||
| 213 | struct xfs_attr_list_context; | ||
| 214 | |||
| 215 | typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, struct attrnames *, | ||
| 216 | char *, int, int, char *); | ||
| 217 | |||
| 218 | typedef struct xfs_attr_list_context { | ||
| 219 | struct xfs_inode *dp; /* inode */ | ||
| 220 | struct attrlist_cursor_kern *cursor; /* position in list */ | ||
| 221 | struct attrlist *alist; /* output buffer */ | ||
| 222 | int seen_enough; /* T/F: seen enough of list? */ | ||
| 223 | int count; /* num used entries */ | ||
| 224 | int dupcnt; /* count dup hashvals seen */ | ||
| 225 | int bufsize; /* total buffer size */ | ||
| 226 | int firstu; /* first used byte in buffer */ | ||
| 227 | int flags; /* from VOP call */ | ||
| 228 | int resynch; /* T/F: resynch with cursor */ | ||
| 229 | int put_value; /* T/F: need value for listent */ | ||
| 230 | put_listent_func_t put_listent; /* list output fmt function */ | ||
| 231 | int index; /* index into output buffer */ | ||
| 232 | } xfs_attr_list_context_t; | ||
| 233 | |||
| 234 | /* | 207 | /* |
| 235 | * Used to keep a list of "remote value" extents when unlinking an inode. | 208 | * Used to keep a list of "remote value" extents when unlinking an inode. |
| 236 | */ | 209 | */ |
| @@ -301,6 +274,4 @@ int xfs_attr_leaf_order(struct xfs_dabuf *leaf1_bp, | |||
| 301 | struct xfs_dabuf *leaf2_bp); | 274 | struct xfs_dabuf *leaf2_bp); |
| 302 | int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, | 275 | int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, |
| 303 | int *local); | 276 | int *local); |
| 304 | int xfs_attr_rolltrans(struct xfs_trans **transp, struct xfs_inode *dp); | ||
| 305 | |||
| 306 | #endif /* __XFS_ATTR_LEAF_H__ */ | 277 | #endif /* __XFS_ATTR_LEAF_H__ */ |
diff --git a/fs/xfs/xfs_attr_sf.h b/fs/xfs/xfs_attr_sf.h index f67f917803b1..ea22839caed2 100644 --- a/fs/xfs/xfs_attr_sf.h +++ b/fs/xfs/xfs_attr_sf.h | |||
| @@ -97,13 +97,9 @@ void xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context, | |||
| 97 | void xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context, | 97 | void xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context, |
| 98 | struct xfs_attr_leafblock *leaf); | 98 | struct xfs_attr_leafblock *leaf); |
| 99 | void xfs_attr_trace_enter(int type, char *where, | 99 | void xfs_attr_trace_enter(int type, char *where, |
| 100 | __psunsigned_t a2, __psunsigned_t a3, | 100 | struct xfs_attr_list_context *context, |
| 101 | __psunsigned_t a4, __psunsigned_t a5, | 101 | __psunsigned_t a13, __psunsigned_t a14, |
| 102 | __psunsigned_t a6, __psunsigned_t a7, | 102 | __psunsigned_t a15); |
| 103 | __psunsigned_t a8, __psunsigned_t a9, | ||
| 104 | __psunsigned_t a10, __psunsigned_t a11, | ||
| 105 | __psunsigned_t a12, __psunsigned_t a13, | ||
| 106 | __psunsigned_t a14, __psunsigned_t a15); | ||
| 107 | #else | 103 | #else |
| 108 | #define xfs_attr_trace_l_c(w,c) | 104 | #define xfs_attr_trace_l_c(w,c) |
| 109 | #define xfs_attr_trace_l_cn(w,c,n) | 105 | #define xfs_attr_trace_l_cn(w,c,n) |
diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/xfs_bit.c index fab0b6d5a41b..48228848f5ae 100644 --- a/fs/xfs/xfs_bit.c +++ b/fs/xfs/xfs_bit.c | |||
| @@ -25,109 +25,6 @@ | |||
| 25 | * XFS bit manipulation routines, used in non-realtime code. | 25 | * XFS bit manipulation routines, used in non-realtime code. |
| 26 | */ | 26 | */ |
| 27 | 27 | ||
| 28 | #ifndef HAVE_ARCH_HIGHBIT | ||
| 29 | /* | ||
| 30 | * Index of high bit number in byte, -1 for none set, 0..7 otherwise. | ||
| 31 | */ | ||
| 32 | static const char xfs_highbit[256] = { | ||
| 33 | -1, 0, 1, 1, 2, 2, 2, 2, /* 00 .. 07 */ | ||
| 34 | 3, 3, 3, 3, 3, 3, 3, 3, /* 08 .. 0f */ | ||
| 35 | 4, 4, 4, 4, 4, 4, 4, 4, /* 10 .. 17 */ | ||
| 36 | 4, 4, 4, 4, 4, 4, 4, 4, /* 18 .. 1f */ | ||
| 37 | 5, 5, 5, 5, 5, 5, 5, 5, /* 20 .. 27 */ | ||
| 38 | 5, 5, 5, 5, 5, 5, 5, 5, /* 28 .. 2f */ | ||
| 39 | 5, 5, 5, 5, 5, 5, 5, 5, /* 30 .. 37 */ | ||
| 40 | 5, 5, 5, 5, 5, 5, 5, 5, /* 38 .. 3f */ | ||
| 41 | 6, 6, 6, 6, 6, 6, 6, 6, /* 40 .. 47 */ | ||
| 42 | 6, 6, 6, 6, 6, 6, 6, 6, /* 48 .. 4f */ | ||
| 43 | 6, 6, 6, 6, 6, 6, 6, 6, /* 50 .. 57 */ | ||
| 44 | 6, 6, 6, 6, 6, 6, 6, 6, /* 58 .. 5f */ | ||
| 45 | 6, 6, 6, 6, 6, 6, 6, 6, /* 60 .. 67 */ | ||
| 46 | 6, 6, 6, 6, 6, 6, 6, 6, /* 68 .. 6f */ | ||
| 47 | 6, 6, 6, 6, 6, 6, 6, 6, /* 70 .. 77 */ | ||
| 48 | 6, 6, 6, 6, 6, 6, 6, 6, /* 78 .. 7f */ | ||
| 49 | 7, 7, 7, 7, 7, 7, 7, 7, /* 80 .. 87 */ | ||
| 50 | 7, 7, 7, 7, 7, 7, 7, 7, /* 88 .. 8f */ | ||
| 51 | 7, 7, 7, 7, 7, 7, 7, 7, /* 90 .. 97 */ | ||
| 52 | 7, 7, 7, 7, 7, 7, 7, 7, /* 98 .. 9f */ | ||
| 53 | 7, 7, 7, 7, 7, 7, 7, 7, /* a0 .. a7 */ | ||
| 54 | 7, 7, 7, 7, 7, 7, 7, 7, /* a8 .. af */ | ||
| 55 | 7, 7, 7, 7, 7, 7, 7, 7, /* b0 .. b7 */ | ||
| 56 | 7, 7, 7, 7, 7, 7, 7, 7, /* b8 .. bf */ | ||
| 57 | 7, 7, 7, 7, 7, 7, 7, 7, /* c0 .. c7 */ | ||
| 58 | 7, 7, 7, 7, 7, 7, 7, 7, /* c8 .. cf */ | ||
| 59 | 7, 7, 7, 7, 7, 7, 7, 7, /* d0 .. d7 */ | ||
| 60 | 7, 7, 7, 7, 7, 7, 7, 7, /* d8 .. df */ | ||
| 61 | 7, 7, 7, 7, 7, 7, 7, 7, /* e0 .. e7 */ | ||
| 62 | 7, 7, 7, 7, 7, 7, 7, 7, /* e8 .. ef */ | ||
| 63 | 7, 7, 7, 7, 7, 7, 7, 7, /* f0 .. f7 */ | ||
| 64 | 7, 7, 7, 7, 7, 7, 7, 7, /* f8 .. ff */ | ||
| 65 | }; | ||
| 66 | #endif | ||
| 67 | |||
| 68 | /* | ||
| 69 | * xfs_highbit32: get high bit set out of 32-bit argument, -1 if none set. | ||
| 70 | */ | ||
| 71 | inline int | ||
| 72 | xfs_highbit32( | ||
| 73 | __uint32_t v) | ||
| 74 | { | ||
| 75 | #ifdef HAVE_ARCH_HIGHBIT | ||
| 76 | return highbit32(v); | ||
| 77 | #else | ||
| 78 | int i; | ||
| 79 | |||
| 80 | if (v & 0xffff0000) | ||
| 81 | if (v & 0xff000000) | ||
| 82 | i = 24; | ||
| 83 | else | ||
| 84 | i = 16; | ||
| 85 | else if (v & 0x0000ffff) | ||
| 86 | if (v & 0x0000ff00) | ||
| 87 | i = 8; | ||
| 88 | else | ||
| 89 | i = 0; | ||
| 90 | else | ||
| 91 | return -1; | ||
| 92 | return i + xfs_highbit[(v >> i) & 0xff]; | ||
| 93 | #endif | ||
| 94 | } | ||
| 95 | |||
| 96 | /* | ||
| 97 | * xfs_lowbit64: get low bit set out of 64-bit argument, -1 if none set. | ||
| 98 | */ | ||
| 99 | int | ||
| 100 | xfs_lowbit64( | ||
| 101 | __uint64_t v) | ||
| 102 | { | ||
| 103 | __uint32_t w = (__uint32_t)v; | ||
| 104 | int n = 0; | ||
| 105 | |||
| 106 | if (w) { /* lower bits */ | ||
| 107 | n = ffs(w); | ||
| 108 | } else { /* upper bits */ | ||
| 109 | w = (__uint32_t)(v >> 32); | ||
| 110 | if (w && (n = ffs(w))) | ||
| 111 | n += 32; | ||
| 112 | } | ||
| 113 | return n - 1; | ||
| 114 | } | ||
| 115 | |||
| 116 | /* | ||
| 117 | * xfs_highbit64: get high bit set out of 64-bit argument, -1 if none set. | ||
| 118 | */ | ||
| 119 | int | ||
| 120 | xfs_highbit64( | ||
| 121 | __uint64_t v) | ||
| 122 | { | ||
| 123 | __uint32_t h = (__uint32_t)(v >> 32); | ||
| 124 | |||
| 125 | if (h) | ||
| 126 | return xfs_highbit32(h) + 32; | ||
| 127 | return xfs_highbit32((__uint32_t)v); | ||
| 128 | } | ||
| 129 | |||
| 130 | |||
| 131 | /* | 28 | /* |
| 132 | * Return whether bitmap is empty. | 29 | * Return whether bitmap is empty. |
| 133 | * Size is number of words in the bitmap, which is padded to word boundary | 30 | * Size is number of words in the bitmap, which is padded to word boundary |
diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h index 082641a9782c..8e0e463dae2d 100644 --- a/fs/xfs/xfs_bit.h +++ b/fs/xfs/xfs_bit.h | |||
| @@ -47,13 +47,39 @@ static inline __uint64_t xfs_mask64lo(int n) | |||
| 47 | } | 47 | } |
| 48 | 48 | ||
| 49 | /* Get high bit set out of 32-bit argument, -1 if none set */ | 49 | /* Get high bit set out of 32-bit argument, -1 if none set */ |
| 50 | extern int xfs_highbit32(__uint32_t v); | 50 | static inline int xfs_highbit32(__uint32_t v) |
| 51 | { | ||
| 52 | return fls(v) - 1; | ||
| 53 | } | ||
| 54 | |||
| 55 | /* Get high bit set out of 64-bit argument, -1 if none set */ | ||
| 56 | static inline int xfs_highbit64(__uint64_t v) | ||
| 57 | { | ||
| 58 | return fls64(v) - 1; | ||
| 59 | } | ||
| 60 | |||
| 61 | /* Get low bit set out of 32-bit argument, -1 if none set */ | ||
| 62 | static inline int xfs_lowbit32(__uint32_t v) | ||
| 63 | { | ||
| 64 | unsigned long t = v; | ||
| 65 | return (v) ? find_first_bit(&t, 32) : -1; | ||
| 66 | } | ||
| 51 | 67 | ||
| 52 | /* Get low bit set out of 64-bit argument, -1 if none set */ | 68 | /* Get low bit set out of 64-bit argument, -1 if none set */ |
| 53 | extern int xfs_lowbit64(__uint64_t v); | 69 | static inline int xfs_lowbit64(__uint64_t v) |
| 70 | { | ||
| 71 | __uint32_t w = (__uint32_t)v; | ||
| 72 | int n = 0; | ||
| 54 | 73 | ||
| 55 | /* Get high bit set out of 64-bit argument, -1 if none set */ | 74 | if (w) { /* lower bits */ |
| 56 | extern int xfs_highbit64(__uint64_t); | 75 | n = ffs(w); |
| 76 | } else { /* upper bits */ | ||
| 77 | w = (__uint32_t)(v >> 32); | ||
| 78 | if (w && (n = ffs(w))) | ||
| 79 | n += 32; | ||
| 80 | } | ||
| 81 | return n - 1; | ||
| 82 | } | ||
| 57 | 83 | ||
| 58 | /* Return whether bitmap is empty (1 == empty) */ | 84 | /* Return whether bitmap is empty (1 == empty) */ |
| 59 | extern int xfs_bitmap_empty(uint *map, uint size); | 85 | extern int xfs_bitmap_empty(uint *map, uint size); |
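fs/xfs/xfs_bit.c loses its byte-indexed highbit lookup table and the out-of-line xfs_highbit32/xfs_highbit64/xfs_lowbit64, while fs/xfs/xfs_bit.h replaces them with inlines built on the generic fls()/fls64()/ffs() helpers and adds xfs_lowbit32(). The invariant being preserved is the -1 return when no bit is set. Below is a small user-space check of those semantics; __builtin_clz/__builtin_ctzll are compiler builtins standing in for the kernel helpers, so this is a behavioural sketch rather than the header code itself.

/*
 * User-space check of the fls()-style semantics used in xfs_bit.h above:
 * highbit(v) is the index of the most significant set bit, lowbit(v) the
 * least significant, and both return -1 for v == 0.
 */
#include <assert.h>
#include <stdint.h>

static int highbit32(uint32_t v)
{
	return v ? 31 - __builtin_clz(v) : -1;	/* == fls(v) - 1 */
}

static int lowbit64(uint64_t v)
{
	return v ? __builtin_ctzll(v) : -1;	/* == ffs-style result - 1 */
}

int main(void)
{
	assert(highbit32(0) == -1);
	assert(highbit32(1) == 0);
	assert(highbit32(0x80000000u) == 31);
	assert(lowbit64(0) == -1);
	assert(lowbit64(1ULL << 40) == 40);
	return 0;
}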
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 53c259f5a5af..a1aab9275d5a 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c | |||
| @@ -384,14 +384,14 @@ xfs_bmap_count_tree( | |||
| 384 | int levelin, | 384 | int levelin, |
| 385 | int *count); | 385 | int *count); |
| 386 | 386 | ||
| 387 | STATIC int | 387 | STATIC void |
| 388 | xfs_bmap_count_leaves( | 388 | xfs_bmap_count_leaves( |
| 389 | xfs_ifork_t *ifp, | 389 | xfs_ifork_t *ifp, |
| 390 | xfs_extnum_t idx, | 390 | xfs_extnum_t idx, |
| 391 | int numrecs, | 391 | int numrecs, |
| 392 | int *count); | 392 | int *count); |
| 393 | 393 | ||
| 394 | STATIC int | 394 | STATIC void |
| 395 | xfs_bmap_disk_count_leaves( | 395 | xfs_bmap_disk_count_leaves( |
| 396 | xfs_extnum_t idx, | 396 | xfs_extnum_t idx, |
| 397 | xfs_bmbt_block_t *block, | 397 | xfs_bmbt_block_t *block, |
| @@ -428,7 +428,8 @@ xfs_bmap_add_attrfork_btree( | |||
| 428 | cur->bc_private.b.firstblock = *firstblock; | 428 | cur->bc_private.b.firstblock = *firstblock; |
| 429 | if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat))) | 429 | if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat))) |
| 430 | goto error0; | 430 | goto error0; |
| 431 | ASSERT(stat == 1); /* must be at least one entry */ | 431 | /* must be at least one entry */ |
| 432 | XFS_WANT_CORRUPTED_GOTO(stat == 1, error0); | ||
| 432 | if ((error = xfs_bmbt_newroot(cur, flags, &stat))) | 433 | if ((error = xfs_bmbt_newroot(cur, flags, &stat))) |
| 433 | goto error0; | 434 | goto error0; |
| 434 | if (stat == 0) { | 435 | if (stat == 0) { |
| @@ -816,13 +817,13 @@ xfs_bmap_add_extent_delay_real( | |||
| 816 | RIGHT.br_startblock, | 817 | RIGHT.br_startblock, |
| 817 | RIGHT.br_blockcount, &i))) | 818 | RIGHT.br_blockcount, &i))) |
| 818 | goto done; | 819 | goto done; |
| 819 | ASSERT(i == 1); | 820 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 820 | if ((error = xfs_bmbt_delete(cur, &i))) | 821 | if ((error = xfs_bmbt_delete(cur, &i))) |
| 821 | goto done; | 822 | goto done; |
| 822 | ASSERT(i == 1); | 823 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 823 | if ((error = xfs_bmbt_decrement(cur, 0, &i))) | 824 | if ((error = xfs_bmbt_decrement(cur, 0, &i))) |
| 824 | goto done; | 825 | goto done; |
| 825 | ASSERT(i == 1); | 826 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 826 | if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, | 827 | if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, |
| 827 | LEFT.br_startblock, | 828 | LEFT.br_startblock, |
| 828 | LEFT.br_blockcount + | 829 | LEFT.br_blockcount + |
| @@ -860,7 +861,7 @@ xfs_bmap_add_extent_delay_real( | |||
| 860 | LEFT.br_startblock, LEFT.br_blockcount, | 861 | LEFT.br_startblock, LEFT.br_blockcount, |
| 861 | &i))) | 862 | &i))) |
| 862 | goto done; | 863 | goto done; |
| 863 | ASSERT(i == 1); | 864 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 864 | if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, | 865 | if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, |
| 865 | LEFT.br_startblock, | 866 | LEFT.br_startblock, |
| 866 | LEFT.br_blockcount + | 867 | LEFT.br_blockcount + |
| @@ -895,7 +896,7 @@ xfs_bmap_add_extent_delay_real( | |||
| 895 | RIGHT.br_startblock, | 896 | RIGHT.br_startblock, |
| 896 | RIGHT.br_blockcount, &i))) | 897 | RIGHT.br_blockcount, &i))) |
| 897 | goto done; | 898 | goto done; |
| 898 | ASSERT(i == 1); | 899 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 899 | if ((error = xfs_bmbt_update(cur, PREV.br_startoff, | 900 | if ((error = xfs_bmbt_update(cur, PREV.br_startoff, |
| 900 | new->br_startblock, | 901 | new->br_startblock, |
| 901 | PREV.br_blockcount + | 902 | PREV.br_blockcount + |
| @@ -928,11 +929,11 @@ xfs_bmap_add_extent_delay_real( | |||
| 928 | new->br_startblock, new->br_blockcount, | 929 | new->br_startblock, new->br_blockcount, |
| 929 | &i))) | 930 | &i))) |
| 930 | goto done; | 931 | goto done; |
| 931 | ASSERT(i == 0); | 932 | XFS_WANT_CORRUPTED_GOTO(i == 0, done); |
| 932 | cur->bc_rec.b.br_state = XFS_EXT_NORM; | 933 | cur->bc_rec.b.br_state = XFS_EXT_NORM; |
| 933 | if ((error = xfs_bmbt_insert(cur, &i))) | 934 | if ((error = xfs_bmbt_insert(cur, &i))) |
| 934 | goto done; | 935 | goto done; |
| 935 | ASSERT(i == 1); | 936 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 936 | } | 937 | } |
| 937 | *dnew = 0; | 938 | *dnew = 0; |
| 938 | /* DELTA: The in-core extent described by new changed type. */ | 939 | /* DELTA: The in-core extent described by new changed type. */ |
| @@ -963,7 +964,7 @@ xfs_bmap_add_extent_delay_real( | |||
| 963 | LEFT.br_startblock, LEFT.br_blockcount, | 964 | LEFT.br_startblock, LEFT.br_blockcount, |
| 964 | &i))) | 965 | &i))) |
| 965 | goto done; | 966 | goto done; |
| 966 | ASSERT(i == 1); | 967 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 967 | if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, | 968 | if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, |
| 968 | LEFT.br_startblock, | 969 | LEFT.br_startblock, |
| 969 | LEFT.br_blockcount + | 970 | LEFT.br_blockcount + |
| @@ -1004,11 +1005,11 @@ xfs_bmap_add_extent_delay_real( | |||
| 1004 | new->br_startblock, new->br_blockcount, | 1005 | new->br_startblock, new->br_blockcount, |
| 1005 | &i))) | 1006 | &i))) |
| 1006 | goto done; | 1007 | goto done; |
| 1007 | ASSERT(i == 0); | 1008 | XFS_WANT_CORRUPTED_GOTO(i == 0, done); |
| 1008 | cur->bc_rec.b.br_state = XFS_EXT_NORM; | 1009 | cur->bc_rec.b.br_state = XFS_EXT_NORM; |
| 1009 | if ((error = xfs_bmbt_insert(cur, &i))) | 1010 | if ((error = xfs_bmbt_insert(cur, &i))) |
| 1010 | goto done; | 1011 | goto done; |
| 1011 | ASSERT(i == 1); | 1012 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 1012 | } | 1013 | } |
| 1013 | if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && | 1014 | if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && |
| 1014 | ip->i_d.di_nextents > ip->i_df.if_ext_max) { | 1015 | ip->i_d.di_nextents > ip->i_df.if_ext_max) { |
| @@ -1054,7 +1055,7 @@ xfs_bmap_add_extent_delay_real( | |||
| 1054 | RIGHT.br_startblock, | 1055 | RIGHT.br_startblock, |
| 1055 | RIGHT.br_blockcount, &i))) | 1056 | RIGHT.br_blockcount, &i))) |
| 1056 | goto done; | 1057 | goto done; |
| 1057 | ASSERT(i == 1); | 1058 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 1058 | if ((error = xfs_bmbt_update(cur, new->br_startoff, | 1059 | if ((error = xfs_bmbt_update(cur, new->br_startoff, |
| 1059 | new->br_startblock, | 1060 | new->br_startblock, |
| 1060 | new->br_blockcount + | 1061 | new->br_blockcount + |
| @@ -1094,11 +1095,11 @@ xfs_bmap_add_extent_delay_real( | |||
| 1094 | new->br_startblock, new->br_blockcount, | 1095 | new->br_startblock, new->br_blockcount, |
| 1095 | &i))) | 1096 | &i))) |
| 1096 | goto done; | 1097 | goto done; |
| 1097 | ASSERT(i == 0); | 1098 | XFS_WANT_CORRUPTED_GOTO(i == 0, done); |
| 1098 | cur->bc_rec.b.br_state = XFS_EXT_NORM; | 1099 | cur->bc_rec.b.br_state = XFS_EXT_NORM; |
| 1099 | if ((error = xfs_bmbt_insert(cur, &i))) | 1100 | if ((error = xfs_bmbt_insert(cur, &i))) |
| 1100 | goto done; | 1101 | goto done; |
| 1101 | ASSERT(i == 1); | 1102 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 1102 | } | 1103 | } |
| 1103 | if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && | 1104 | if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && |
| 1104 | ip->i_d.di_nextents > ip->i_df.if_ext_max) { | 1105 | ip->i_d.di_nextents > ip->i_df.if_ext_max) { |
| @@ -1149,11 +1150,11 @@ xfs_bmap_add_extent_delay_real( | |||
| 1149 | new->br_startblock, new->br_blockcount, | 1150 | new->br_startblock, new->br_blockcount, |
| 1150 | &i))) | 1151 | &i))) |
| 1151 | goto done; | 1152 | goto done; |
| 1152 | ASSERT(i == 0); | 1153 | XFS_WANT_CORRUPTED_GOTO(i == 0, done); |
| 1153 | cur->bc_rec.b.br_state = XFS_EXT_NORM; | 1154 | cur->bc_rec.b.br_state = XFS_EXT_NORM; |
| 1154 | if ((error = xfs_bmbt_insert(cur, &i))) | 1155 | if ((error = xfs_bmbt_insert(cur, &i))) |
| 1155 | goto done; | 1156 | goto done; |
| 1156 | ASSERT(i == 1); | 1157 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 1157 | } | 1158 | } |
| 1158 | if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && | 1159 | if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && |
| 1159 | ip->i_d.di_nextents > ip->i_df.if_ext_max) { | 1160 | ip->i_d.di_nextents > ip->i_df.if_ext_max) { |
| @@ -1377,19 +1378,19 @@ xfs_bmap_add_extent_unwritten_real( | |||
| 1377 | RIGHT.br_startblock, | 1378 | RIGHT.br_startblock, |
| 1378 | RIGHT.br_blockcount, &i))) | 1379 | RIGHT.br_blockcount, &i))) |
| 1379 | goto done; | 1380 | goto done; |
| 1380 | ASSERT(i == 1); | 1381 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 1381 | if ((error = xfs_bmbt_delete(cur, &i))) | 1382 | if ((error = xfs_bmbt_delete(cur, &i))) |
| 1382 | goto done; | 1383 | goto done; |
| 1383 | ASSERT(i == 1); | 1384 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 1384 | if ((error = xfs_bmbt_decrement(cur, 0, &i))) | 1385 | if ((error = xfs_bmbt_decrement(cur, 0, &i))) |
| 1385 | goto done; | 1386 | goto done; |
| 1386 | ASSERT(i == 1); | 1387 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 1387 | if ((error = xfs_bmbt_delete(cur, &i))) | 1388 | if ((error = xfs_bmbt_delete(cur, &i))) |
| 1388 | goto done; | 1389 | goto done; |
| 1389 | ASSERT(i == 1); | 1390 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 1390 | if ((error = xfs_bmbt_decrement(cur, 0, &i))) | 1391 | if ((error = xfs_bmbt_decrement(cur, 0, &i))) |
| 1391 | goto done; | 1392 | goto done; |
| 1392 | ASSERT(i == 1); | 1393 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 1393 | if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, | 1394 | if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, |
| 1394 | LEFT.br_startblock, | 1395 | LEFT.br_startblock, |
| 1395 | LEFT.br_blockcount + PREV.br_blockcount + | 1396 | LEFT.br_blockcount + PREV.br_blockcount + |
| @@ -1426,13 +1427,13 @@ xfs_bmap_add_extent_unwritten_real( | |||
| 1426 | PREV.br_startblock, PREV.br_blockcount, | 1427 | PREV.br_startblock, PREV.br_blockcount, |
| 1427 | &i))) | 1428 | &i))) |
| 1428 | goto done; | 1429 | goto done; |
| 1429 | ASSERT(i == 1); | 1430 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 1430 | if ((error = xfs_bmbt_delete(cur, &i))) | 1431 | if ((error = xfs_bmbt_delete(cur, &i))) |
| 1431 | goto done; | 1432 | goto done; |
| 1432 | ASSERT(i == 1); | 1433 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 1433 | if ((error = xfs_bmbt_decrement(cur, 0, &i))) | 1434 | if ((error = xfs_bmbt_decrement(cur, 0, &i))) |
| 1434 | goto done; | 1435 | goto done; |
| 1435 | ASSERT(i == 1); | 1436 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 1436 | if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, | 1437 | if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, |
| 1437 | LEFT.br_startblock, | 1438 | LEFT.br_startblock, |
| 1438 | LEFT.br_blockcount + PREV.br_blockcount, | 1439 | LEFT.br_blockcount + PREV.br_blockcount, |
| @@ -1469,13 +1470,13 @@ xfs_bmap_add_extent_unwritten_real( | |||
| 1469 | RIGHT.br_startblock, | 1470 | RIGHT.br_startblock, |
| 1470 | RIGHT.br_blockcount, &i))) | 1471 | RIGHT.br_blockcount, &i))) |
| 1471 | goto done; | 1472 | goto done; |
| 1472 | ASSERT(i == 1); | 1473 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 1473 | if ((error = xfs_bmbt_delete(cur, &i))) | 1474 | if ((error = xfs_bmbt_delete(cur, &i))) |
| 1474 | goto done; | 1475 | goto done; |
| 1475 | ASSERT(i == 1); | 1476 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 1476 | if ((error = xfs_bmbt_decrement(cur, 0, &i))) | 1477 | if ((error = xfs_bmbt_decrement(cur, 0, &i))) |
| 1477 | goto done; | 1478 | goto done; |
| 1478 | ASSERT(i == 1); | 1479 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 1479 | if ((error = xfs_bmbt_update(cur, new->br_startoff, | 1480 | if ((error = xfs_bmbt_update(cur, new->br_startoff, |
| 1480 | new->br_startblock, | 1481 | new->br_startblock, |
| 1481 | new->br_blockcount + RIGHT.br_blockcount, | 1482 | new->br_blockcount + RIGHT.br_blockcount, |
| @@ -1508,7 +1509,7 @@ xfs_bmap_add_extent_unwritten_real( | |||
| 1508 | new->br_startblock, new->br_blockcount, | 1509 | new->br_startblock, new->br_blockcount, |
| 1509 | &i))) | 1510 | &i))) |
| 1510 | goto done; | 1511 | goto done; |
| 1511 | ASSERT(i == 1); | 1512 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 1512 | if ((error = xfs_bmbt_update(cur, new->br_startoff, | 1513 | if ((error = xfs_bmbt_update(cur, new->br_startoff, |
| 1513 | new->br_startblock, new->br_blockcount, | 1514 | new->br_startblock, new->br_blockcount, |
| 1514 | newext))) | 1515 | newext))) |
| @@ -1549,7 +1550,7 @@ xfs_bmap_add_extent_unwritten_real( | |||
| 1549 | PREV.br_startblock, PREV.br_blockcount, | 1550 | PREV.br_startblock, PREV.br_blockcount, |
| 1550 | &i))) | 1551 | &i))) |
| 1551 | goto done; | 1552 | goto done; |
| 1552 | ASSERT(i == 1); | 1553 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 1553 | if ((error = xfs_bmbt_update(cur, | 1554 | if ((error = xfs_bmbt_update(cur, |
| 1554 | PREV.br_startoff + new->br_blockcount, | 1555 | PREV.br_startoff + new->br_blockcount, |
| 1555 | PREV.br_startblock + new->br_blockcount, | 1556 | PREV.br_startblock + new->br_blockcount, |
| @@ -1596,7 +1597,7 @@ xfs_bmap_add_extent_unwritten_real( | |||
| 1596 | PREV.br_startblock, PREV.br_blockcount, | 1597 | PREV.br_startblock, PREV.br_blockcount, |
| 1597 | &i))) | 1598 | &i))) |
| 1598 | goto done; | 1599 | goto done; |
| 1599 | ASSERT(i == 1); | 1600 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 1600 | if ((error = xfs_bmbt_update(cur, | 1601 | if ((error = xfs_bmbt_update(cur, |
| 1601 | PREV.br_startoff + new->br_blockcount, | 1602 | PREV.br_startoff + new->br_blockcount, |
| 1602 | PREV.br_startblock + new->br_blockcount, | 1603 | PREV.br_startblock + new->br_blockcount, |
| @@ -1606,7 +1607,7 @@ xfs_bmap_add_extent_unwritten_real( | |||
| 1606 | cur->bc_rec.b = *new; | 1607 | cur->bc_rec.b = *new; |
| 1607 | if ((error = xfs_bmbt_insert(cur, &i))) | 1608 | if ((error = xfs_bmbt_insert(cur, &i))) |
| 1608 | goto done; | 1609 | goto done; |
| 1609 | ASSERT(i == 1); | 1610 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 1610 | } | 1611 | } |
| 1611 | /* DELTA: One in-core extent is split in two. */ | 1612 | /* DELTA: One in-core extent is split in two. */ |
| 1612 | temp = PREV.br_startoff; | 1613 | temp = PREV.br_startoff; |
| @@ -1640,7 +1641,7 @@ xfs_bmap_add_extent_unwritten_real( | |||
| 1640 | PREV.br_startblock, | 1641 | PREV.br_startblock, |
| 1641 | PREV.br_blockcount, &i))) | 1642 | PREV.br_blockcount, &i))) |
| 1642 | goto done; | 1643 | goto done; |
| 1643 | ASSERT(i == 1); | 1644 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 1644 | if ((error = xfs_bmbt_update(cur, PREV.br_startoff, | 1645 | if ((error = xfs_bmbt_update(cur, PREV.br_startoff, |
| 1645 | PREV.br_startblock, | 1646 | PREV.br_startblock, |
| 1646 | PREV.br_blockcount - new->br_blockcount, | 1647 | PREV.br_blockcount - new->br_blockcount, |
| @@ -1682,7 +1683,7 @@ xfs_bmap_add_extent_unwritten_real( | |||
| 1682 | PREV.br_startblock, PREV.br_blockcount, | 1683 | PREV.br_startblock, PREV.br_blockcount, |
| 1683 | &i))) | 1684 | &i))) |
| 1684 | goto done; | 1685 | goto done; |
| 1685 | ASSERT(i == 1); | 1686 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 1686 | if ((error = xfs_bmbt_update(cur, PREV.br_startoff, | 1687 | if ((error = xfs_bmbt_update(cur, PREV.br_startoff, |
| 1687 | PREV.br_startblock, | 1688 | PREV.br_startblock, |
| 1688 | PREV.br_blockcount - new->br_blockcount, | 1689 | PREV.br_blockcount - new->br_blockcount, |
| @@ -1692,11 +1693,11 @@ xfs_bmap_add_extent_unwritten_real( | |||
| 1692 | new->br_startblock, new->br_blockcount, | 1693 | new->br_startblock, new->br_blockcount, |
| 1693 | &i))) | 1694 | &i))) |
| 1694 | goto done; | 1695 | goto done; |
| 1695 | ASSERT(i == 0); | 1696 | XFS_WANT_CORRUPTED_GOTO(i == 0, done); |
| 1696 | cur->bc_rec.b.br_state = XFS_EXT_NORM; | 1697 | cur->bc_rec.b.br_state = XFS_EXT_NORM; |
| 1697 | if ((error = xfs_bmbt_insert(cur, &i))) | 1698 | if ((error = xfs_bmbt_insert(cur, &i))) |
| 1698 | goto done; | 1699 | goto done; |
| 1699 | ASSERT(i == 1); | 1700 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 1700 | } | 1701 | } |
| 1701 | /* DELTA: One in-core extent is split in two. */ | 1702 | /* DELTA: One in-core extent is split in two. */ |
| 1702 | temp = PREV.br_startoff; | 1703 | temp = PREV.br_startoff; |
| @@ -1732,27 +1733,34 @@ xfs_bmap_add_extent_unwritten_real( | |||
| 1732 | PREV.br_startblock, PREV.br_blockcount, | 1733 | PREV.br_startblock, PREV.br_blockcount, |
| 1733 | &i))) | 1734 | &i))) |
| 1734 | goto done; | 1735 | goto done; |
| 1735 | ASSERT(i == 1); | 1736 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 1736 | /* new right extent - oldext */ | 1737 | /* new right extent - oldext */ |
| 1737 | if ((error = xfs_bmbt_update(cur, r[1].br_startoff, | 1738 | if ((error = xfs_bmbt_update(cur, r[1].br_startoff, |
| 1738 | r[1].br_startblock, r[1].br_blockcount, | 1739 | r[1].br_startblock, r[1].br_blockcount, |
| 1739 | r[1].br_state))) | 1740 | r[1].br_state))) |
| 1740 | goto done; | 1741 | goto done; |
| 1741 | /* new left extent - oldext */ | 1742 | /* new left extent - oldext */ |
| 1742 | PREV.br_blockcount = | ||
| 1743 | new->br_startoff - PREV.br_startoff; | ||
| 1744 | cur->bc_rec.b = PREV; | 1743 | cur->bc_rec.b = PREV; |
| 1744 | cur->bc_rec.b.br_blockcount = | ||
| 1745 | new->br_startoff - PREV.br_startoff; | ||
| 1745 | if ((error = xfs_bmbt_insert(cur, &i))) | 1746 | if ((error = xfs_bmbt_insert(cur, &i))) |
| 1746 | goto done; | 1747 | goto done; |
| 1747 | ASSERT(i == 1); | 1748 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 1748 | if ((error = xfs_bmbt_increment(cur, 0, &i))) | 1749 | /* |
| 1750 | * Reset the cursor to the position of the new extent | ||
| 1751 | * we are about to insert as we can't trust it after | ||
| 1752 | * the previous insert. | ||
| 1753 | */ | ||
| 1754 | if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, | ||
| 1755 | new->br_startblock, new->br_blockcount, | ||
| 1756 | &i))) | ||
| 1749 | goto done; | 1757 | goto done; |
| 1750 | ASSERT(i == 1); | 1758 | XFS_WANT_CORRUPTED_GOTO(i == 0, done); |
| 1751 | /* new middle extent - newext */ | 1759 | /* new middle extent - newext */ |
| 1752 | cur->bc_rec.b = *new; | 1760 | cur->bc_rec.b.br_state = new->br_state; |
| 1753 | if ((error = xfs_bmbt_insert(cur, &i))) | 1761 | if ((error = xfs_bmbt_insert(cur, &i))) |
| 1754 | goto done; | 1762 | goto done; |
| 1755 | ASSERT(i == 1); | 1763 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 1756 | } | 1764 | } |
| 1757 | /* DELTA: One in-core extent is split in three. */ | 1765 | /* DELTA: One in-core extent is split in three. */ |
| 1758 | temp = PREV.br_startoff; | 1766 | temp = PREV.br_startoff; |
| @@ -2097,13 +2105,13 @@ xfs_bmap_add_extent_hole_real( | |||
| 2097 | right.br_startblock, | 2105 | right.br_startblock, |
| 2098 | right.br_blockcount, &i))) | 2106 | right.br_blockcount, &i))) |
| 2099 | goto done; | 2107 | goto done; |
| 2100 | ASSERT(i == 1); | 2108 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 2101 | if ((error = xfs_bmbt_delete(cur, &i))) | 2109 | if ((error = xfs_bmbt_delete(cur, &i))) |
| 2102 | goto done; | 2110 | goto done; |
| 2103 | ASSERT(i == 1); | 2111 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 2104 | if ((error = xfs_bmbt_decrement(cur, 0, &i))) | 2112 | if ((error = xfs_bmbt_decrement(cur, 0, &i))) |
| 2105 | goto done; | 2113 | goto done; |
| 2106 | ASSERT(i == 1); | 2114 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 2107 | if ((error = xfs_bmbt_update(cur, left.br_startoff, | 2115 | if ((error = xfs_bmbt_update(cur, left.br_startoff, |
| 2108 | left.br_startblock, | 2116 | left.br_startblock, |
| 2109 | left.br_blockcount + | 2117 | left.br_blockcount + |
| @@ -2139,7 +2147,7 @@ xfs_bmap_add_extent_hole_real( | |||
| 2139 | left.br_startblock, | 2147 | left.br_startblock, |
| 2140 | left.br_blockcount, &i))) | 2148 | left.br_blockcount, &i))) |
| 2141 | goto done; | 2149 | goto done; |
| 2142 | ASSERT(i == 1); | 2150 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 2143 | if ((error = xfs_bmbt_update(cur, left.br_startoff, | 2151 | if ((error = xfs_bmbt_update(cur, left.br_startoff, |
| 2144 | left.br_startblock, | 2152 | left.br_startblock, |
| 2145 | left.br_blockcount + | 2153 | left.br_blockcount + |
| @@ -2174,7 +2182,7 @@ xfs_bmap_add_extent_hole_real( | |||
| 2174 | right.br_startblock, | 2182 | right.br_startblock, |
| 2175 | right.br_blockcount, &i))) | 2183 | right.br_blockcount, &i))) |
| 2176 | goto done; | 2184 | goto done; |
| 2177 | ASSERT(i == 1); | 2185 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 2178 | if ((error = xfs_bmbt_update(cur, new->br_startoff, | 2186 | if ((error = xfs_bmbt_update(cur, new->br_startoff, |
| 2179 | new->br_startblock, | 2187 | new->br_startblock, |
| 2180 | new->br_blockcount + | 2188 | new->br_blockcount + |
| @@ -2208,11 +2216,11 @@ xfs_bmap_add_extent_hole_real( | |||
| 2208 | new->br_startblock, | 2216 | new->br_startblock, |
| 2209 | new->br_blockcount, &i))) | 2217 | new->br_blockcount, &i))) |
| 2210 | goto done; | 2218 | goto done; |
| 2211 | ASSERT(i == 0); | 2219 | XFS_WANT_CORRUPTED_GOTO(i == 0, done); |
| 2212 | cur->bc_rec.b.br_state = new->br_state; | 2220 | cur->bc_rec.b.br_state = new->br_state; |
| 2213 | if ((error = xfs_bmbt_insert(cur, &i))) | 2221 | if ((error = xfs_bmbt_insert(cur, &i))) |
| 2214 | goto done; | 2222 | goto done; |
| 2215 | ASSERT(i == 1); | 2223 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 2216 | } | 2224 | } |
| 2217 | /* DELTA: A new extent was added in a hole. */ | 2225 | /* DELTA: A new extent was added in a hole. */ |
| 2218 | temp = new->br_startoff; | 2226 | temp = new->br_startoff; |
| @@ -3131,7 +3139,7 @@ xfs_bmap_del_extent( | |||
| 3131 | got.br_startblock, got.br_blockcount, | 3139 | got.br_startblock, got.br_blockcount, |
| 3132 | &i))) | 3140 | &i))) |
| 3133 | goto done; | 3141 | goto done; |
| 3134 | ASSERT(i == 1); | 3142 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 3135 | } | 3143 | } |
| 3136 | da_old = da_new = 0; | 3144 | da_old = da_new = 0; |
| 3137 | } else { | 3145 | } else { |
| @@ -3164,7 +3172,7 @@ xfs_bmap_del_extent( | |||
| 3164 | } | 3172 | } |
| 3165 | if ((error = xfs_bmbt_delete(cur, &i))) | 3173 | if ((error = xfs_bmbt_delete(cur, &i))) |
| 3166 | goto done; | 3174 | goto done; |
| 3167 | ASSERT(i == 1); | 3175 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 3168 | break; | 3176 | break; |
| 3169 | 3177 | ||
| 3170 | case 2: | 3178 | case 2: |
| @@ -3268,7 +3276,7 @@ xfs_bmap_del_extent( | |||
| 3268 | got.br_startblock, | 3276 | got.br_startblock, |
| 3269 | temp, &i))) | 3277 | temp, &i))) |
| 3270 | goto done; | 3278 | goto done; |
| 3271 | ASSERT(i == 1); | 3279 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 3272 | /* | 3280 | /* |
| 3273 | * Update the btree record back | 3281 | * Update the btree record back |
| 3274 | * to the original value. | 3282 | * to the original value. |
| @@ -3289,7 +3297,7 @@ xfs_bmap_del_extent( | |||
| 3289 | error = XFS_ERROR(ENOSPC); | 3297 | error = XFS_ERROR(ENOSPC); |
| 3290 | goto done; | 3298 | goto done; |
| 3291 | } | 3299 | } |
| 3292 | ASSERT(i == 1); | 3300 | XFS_WANT_CORRUPTED_GOTO(i == 1, done); |
| 3293 | } else | 3301 | } else |
| 3294 | flags |= XFS_ILOG_FEXT(whichfork); | 3302 | flags |= XFS_ILOG_FEXT(whichfork); |
| 3295 | XFS_IFORK_NEXT_SET(ip, whichfork, | 3303 | XFS_IFORK_NEXT_SET(ip, whichfork, |
| @@ -3992,7 +4000,7 @@ xfs_bmap_add_attrfork( | |||
| 3992 | ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; | 4000 | ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; |
| 3993 | } | 4001 | } |
| 3994 | ASSERT(ip->i_d.di_anextents == 0); | 4002 | ASSERT(ip->i_d.di_anextents == 0); |
| 3995 | VN_HOLD(XFS_ITOV(ip)); | 4003 | IHOLD(ip); |
| 3996 | xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); | 4004 | xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); |
| 3997 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); | 4005 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); |
| 3998 | switch (ip->i_d.di_format) { | 4006 | switch (ip->i_d.di_format) { |
| @@ -5970,7 +5978,7 @@ unlock_and_return: | |||
| 5970 | xfs_iunlock_map_shared(ip, lock); | 5978 | xfs_iunlock_map_shared(ip, lock); |
| 5971 | xfs_iunlock(ip, XFS_IOLOCK_SHARED); | 5979 | xfs_iunlock(ip, XFS_IOLOCK_SHARED); |
| 5972 | 5980 | ||
| 5973 | kmem_free(map, subnex * sizeof(*map)); | 5981 | kmem_free(map); |
| 5974 | 5982 | ||
| 5975 | return error; | 5983 | return error; |
| 5976 | } | 5984 | } |
| @@ -6088,7 +6096,7 @@ xfs_bmap_get_bp( | |||
| 6088 | tp = cur->bc_tp; | 6096 | tp = cur->bc_tp; |
| 6089 | licp = &tp->t_items; | 6097 | licp = &tp->t_items; |
| 6090 | while (!bp && licp != NULL) { | 6098 | while (!bp && licp != NULL) { |
| 6091 | if (XFS_LIC_ARE_ALL_FREE(licp)) { | 6099 | if (xfs_lic_are_all_free(licp)) { |
| 6092 | licp = licp->lic_next; | 6100 | licp = licp->lic_next; |
| 6093 | continue; | 6101 | continue; |
| 6094 | } | 6102 | } |
| @@ -6098,11 +6106,11 @@ xfs_bmap_get_bp( | |||
| 6098 | xfs_buf_log_item_t *bip; | 6106 | xfs_buf_log_item_t *bip; |
| 6099 | xfs_buf_t *lbp; | 6107 | xfs_buf_t *lbp; |
| 6100 | 6108 | ||
| 6101 | if (XFS_LIC_ISFREE(licp, i)) { | 6109 | if (xfs_lic_isfree(licp, i)) { |
| 6102 | continue; | 6110 | continue; |
| 6103 | } | 6111 | } |
| 6104 | 6112 | ||
| 6105 | lidp = XFS_LIC_SLOT(licp, i); | 6113 | lidp = xfs_lic_slot(licp, i); |
| 6106 | lip = lidp->lid_item; | 6114 | lip = lidp->lid_item; |
| 6107 | if (lip->li_type != XFS_LI_BUF) | 6115 | if (lip->li_type != XFS_LI_BUF) |
| 6108 | continue; | 6116 | continue; |
| @@ -6359,13 +6367,9 @@ xfs_bmap_count_blocks( | |||
| 6359 | mp = ip->i_mount; | 6367 | mp = ip->i_mount; |
| 6360 | ifp = XFS_IFORK_PTR(ip, whichfork); | 6368 | ifp = XFS_IFORK_PTR(ip, whichfork); |
| 6361 | if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) { | 6369 | if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) { |
| 6362 | if (unlikely(xfs_bmap_count_leaves(ifp, 0, | 6370 | xfs_bmap_count_leaves(ifp, 0, |
| 6363 | ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t), | 6371 | ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t), |
| 6364 | count) < 0)) { | 6372 | count); |
| 6365 | XFS_ERROR_REPORT("xfs_bmap_count_blocks(1)", | ||
| 6366 | XFS_ERRLEVEL_LOW, mp); | ||
| 6367 | return XFS_ERROR(EFSCORRUPTED); | ||
| 6368 | } | ||
| 6369 | return 0; | 6373 | return 0; |
| 6370 | } | 6374 | } |
| 6371 | 6375 | ||
| @@ -6446,13 +6450,7 @@ xfs_bmap_count_tree( | |||
| 6446 | for (;;) { | 6450 | for (;;) { |
| 6447 | nextbno = be64_to_cpu(block->bb_rightsib); | 6451 | nextbno = be64_to_cpu(block->bb_rightsib); |
| 6448 | numrecs = be16_to_cpu(block->bb_numrecs); | 6452 | numrecs = be16_to_cpu(block->bb_numrecs); |
| 6449 | if (unlikely(xfs_bmap_disk_count_leaves(0, | 6453 | xfs_bmap_disk_count_leaves(0, block, numrecs, count); |
| 6450 | block, numrecs, count) < 0)) { | ||
| 6451 | xfs_trans_brelse(tp, bp); | ||
| 6452 | XFS_ERROR_REPORT("xfs_bmap_count_tree(2)", | ||
| 6453 | XFS_ERRLEVEL_LOW, mp); | ||
| 6454 | return XFS_ERROR(EFSCORRUPTED); | ||
| 6455 | } | ||
| 6456 | xfs_trans_brelse(tp, bp); | 6454 | xfs_trans_brelse(tp, bp); |
| 6457 | if (nextbno == NULLFSBLOCK) | 6455 | if (nextbno == NULLFSBLOCK) |
| 6458 | break; | 6456 | break; |
| @@ -6470,7 +6468,7 @@ xfs_bmap_count_tree( | |||
| 6470 | /* | 6468 | /* |
| 6471 | * Count leaf blocks given a range of extent records. | 6469 | * Count leaf blocks given a range of extent records. |
| 6472 | */ | 6470 | */ |
| 6473 | STATIC int | 6471 | STATIC void |
| 6474 | xfs_bmap_count_leaves( | 6472 | xfs_bmap_count_leaves( |
| 6475 | xfs_ifork_t *ifp, | 6473 | xfs_ifork_t *ifp, |
| 6476 | xfs_extnum_t idx, | 6474 | xfs_extnum_t idx, |
| @@ -6483,14 +6481,13 @@ xfs_bmap_count_leaves( | |||
| 6483 | xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b); | 6481 | xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b); |
| 6484 | *count += xfs_bmbt_get_blockcount(frp); | 6482 | *count += xfs_bmbt_get_blockcount(frp); |
| 6485 | } | 6483 | } |
| 6486 | return 0; | ||
| 6487 | } | 6484 | } |
| 6488 | 6485 | ||
| 6489 | /* | 6486 | /* |
| 6490 | * Count leaf blocks given a range of extent records originally | 6487 | * Count leaf blocks given a range of extent records originally |
| 6491 | * in btree format. | 6488 | * in btree format. |
| 6492 | */ | 6489 | */ |
| 6493 | STATIC int | 6490 | STATIC void |
| 6494 | xfs_bmap_disk_count_leaves( | 6491 | xfs_bmap_disk_count_leaves( |
| 6495 | xfs_extnum_t idx, | 6492 | xfs_extnum_t idx, |
| 6496 | xfs_bmbt_block_t *block, | 6493 | xfs_bmbt_block_t *block, |
| @@ -6504,5 +6501,4 @@ xfs_bmap_disk_count_leaves( | |||
| 6504 | frp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, idx + b); | 6501 | frp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, idx + b); |
| 6505 | *count += xfs_bmbt_disk_get_blockcount(frp); | 6502 | *count += xfs_bmbt_disk_get_blockcount(frp); |
| 6506 | } | 6503 | } |
| 6507 | return 0; | ||
| 6508 | } | 6504 | } |
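
Almost every xfs_bmap.c hunk above replaces a debug-only ASSERT(i == 1) with XFS_WANT_CORRUPTED_GOTO(i == 1, done), so an unexpected btree lookup, insert or delete result is now treated as on-disk corruption at run time: it is reported and the function unwinds with EFSCORRUPTED instead of the check disappearing on non-debug builds. The sketch below shows only the control-flow shape of that pattern; the macro name, message and errno value are stand-ins, not the kernel definitions.

/*
 * Control-flow sketch of the ASSERT -> "want corrupted goto" pattern.
 * Like the kernel macro it assumes a local "error" variable and a goto
 * label in the caller; the real macro also calls XFS_ERROR_REPORT()
 * and still asserts on debug kernels.
 */
#include <stdio.h>

#define EFSCORRUPTED_STANDIN	117	/* EUCLEAN on Linux; illustrative */

#define WANT_CORRUPTED_GOTO(cond, label)				\
	do {								\
		if (!(cond)) {						\
			fprintf(stderr, "corruption: %s\n", #cond);	\
			error = EFSCORRUPTED_STANDIN;			\
			goto label;					\
		}							\
	} while (0)

/* "found" mimics the stat value returned by the btree lookup calls. */
static int update_record(int found)
{
	int error = 0;

	WANT_CORRUPTED_GOTO(found == 1, done);	/* was ASSERT(i == 1) */
	/* ... modify the record here ... */
done:
	return error;
}

int main(void)
{
	/* prints "0 117": the bad lookup now fails with an error instead
	 * of being compiled out on non-debug builds */
	printf("%d %d\n", update_record(1), update_record(0));
	return 0;
}
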
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 6ff70cda451c..9f3e3a836d15 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h | |||
| @@ -54,12 +54,23 @@ typedef struct xfs_bmap_free_item | |||
| 54 | 54 | ||
| 55 | /* | 55 | /* |
| 56 | * Header for free extent list. | 56 | * Header for free extent list. |
| 57 | * | ||
| 58 | * xbf_low is used by the allocator to activate the lowspace algorithm - | ||
| 59 | * when free space is running low the extent allocator may choose to | ||
| 60 | * allocate an extent from an AG without leaving sufficient space for | ||
| 61 | * a btree split when inserting the new extent. In this case the allocator | ||
| 62 | * will enable the lowspace algorithm which is supposed to allow further | ||
| 63 | * allocations (such as btree splits and newroots) to allocate from | ||
| 64 | * sequential AGs. In order to avoid locking AGs out of order the lowspace | ||
| 65 | * algorithm will start searching for free space from AG 0. If the correct | ||
| 66 | * transaction reservations have been made then this algorithm will eventually | ||
| 67 | * find all the space it needs. | ||
| 57 | */ | 68 | */ |
| 58 | typedef struct xfs_bmap_free | 69 | typedef struct xfs_bmap_free |
| 59 | { | 70 | { |
| 60 | xfs_bmap_free_item_t *xbf_first; /* list of to-be-free extents */ | 71 | xfs_bmap_free_item_t *xbf_first; /* list of to-be-free extents */ |
| 61 | int xbf_count; /* count of items on list */ | 72 | int xbf_count; /* count of items on list */ |
| 62 | int xbf_low; /* kludge: alloc in low mode */ | 73 | int xbf_low; /* alloc in low mode */ |
| 63 | } xfs_bmap_free_t; | 74 | } xfs_bmap_free_t; |
| 64 | 75 | ||
| 65 | #define XFS_BMAP_MAX_NMAP 4 | 76 | #define XFS_BMAP_MAX_NMAP 4 |
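
The new xfs_bmap_free comment above documents xbf_low: once an AG has been chosen without leaving headroom for a btree split, later allocations must search AGs sequentially from AG 0 so that AG locks are only ever taken in ascending order. The fragment below is a loose, hypothetical sketch of that ordering rule only; the structure and helper are invented for illustration and are not the XFS allocator interfaces.

/*
 * Hypothetical sketch of the AG-ordering rule described in the comment:
 * in lowspace mode the search walks AGs upward, and a transaction never
 * locks an AG numbered lower than one it already holds.
 */
#include <stdbool.h>
#include <stdio.h>

struct ag_search {
	int	agcount;	/* AGs in the filesystem */
	bool	lowspace;	/* corresponds to xbf_low */
	int	last_agno;	/* highest AG locked so far, -1 if none */
};

/* Return the next AG to try, or -1 when the search is exhausted. */
static int next_candidate_ag(struct ag_search *s, int preferred)
{
	int agno;

	/* Sequential from AG 0 in lowspace mode; otherwise honour the
	 * locality hint but never go back past an AG already locked. */
	if (s->lowspace || preferred <= s->last_agno)
		agno = s->last_agno + 1;
	else
		agno = preferred;

	if (agno >= s->agcount)
		return -1;
	s->last_agno = agno;
	return agno;
}

int main(void)
{
	struct ag_search s = { .agcount = 4, .lowspace = true, .last_agno = -1 };
	int agno;

	while ((agno = next_candidate_ag(&s, 2)) >= 0)
		printf("try AG %d\n", agno);	/* 0, 1, 2, 3 in order */
	return 0;
}
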
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 4f0e849d973e..23efad29a5cd 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c | |||
| @@ -1493,12 +1493,27 @@ xfs_bmbt_split( | |||
| 1493 | left = XFS_BUF_TO_BMBT_BLOCK(lbp); | 1493 | left = XFS_BUF_TO_BMBT_BLOCK(lbp); |
| 1494 | args.fsbno = cur->bc_private.b.firstblock; | 1494 | args.fsbno = cur->bc_private.b.firstblock; |
| 1495 | args.firstblock = args.fsbno; | 1495 | args.firstblock = args.fsbno; |
| 1496 | args.minleft = 0; | ||
| 1496 | if (args.fsbno == NULLFSBLOCK) { | 1497 | if (args.fsbno == NULLFSBLOCK) { |
| 1497 | args.fsbno = lbno; | 1498 | args.fsbno = lbno; |
| 1498 | args.type = XFS_ALLOCTYPE_START_BNO; | 1499 | args.type = XFS_ALLOCTYPE_START_BNO; |
| 1499 | } else | 1500 | /* |
| 1501 | * Make sure there is sufficient room left in the AG to | ||
| 1502 | * complete a full tree split for an extent insert. If | ||
| 1503 | * we are converting the middle part of an extent then | ||
| 1504 | * we may need space for two tree splits. | ||
| 1505 | * | ||
| 1506 | * We are relying on the caller to make the correct block | ||
| 1507 | * reservation for this operation to succeed. If the | ||
| 1508 | * reservation amount is insufficient then we may fail a | ||
| 1509 | * block allocation here and corrupt the filesystem. | ||
| 1510 | */ | ||
| 1511 | args.minleft = xfs_trans_get_block_res(args.tp); | ||
| 1512 | } else if (cur->bc_private.b.flist->xbf_low) | ||
| 1513 | args.type = XFS_ALLOCTYPE_START_BNO; | ||
| 1514 | else | ||
| 1500 | args.type = XFS_ALLOCTYPE_NEAR_BNO; | 1515 | args.type = XFS_ALLOCTYPE_NEAR_BNO; |
| 1501 | args.mod = args.minleft = args.alignment = args.total = args.isfl = | 1516 | args.mod = args.alignment = args.total = args.isfl = |
| 1502 | args.userdata = args.minalignslop = 0; | 1517 | args.userdata = args.minalignslop = 0; |
| 1503 | args.minlen = args.maxlen = args.prod = 1; | 1518 | args.minlen = args.maxlen = args.prod = 1; |
| 1504 | args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL; | 1519 | args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL; |
| @@ -1510,6 +1525,21 @@ xfs_bmbt_split( | |||
| 1510 | XFS_BMBT_TRACE_CURSOR(cur, ERROR); | 1525 | XFS_BMBT_TRACE_CURSOR(cur, ERROR); |
| 1511 | return error; | 1526 | return error; |
| 1512 | } | 1527 | } |
| 1528 | if (args.fsbno == NULLFSBLOCK && args.minleft) { | ||
| 1529 | /* | ||
| 1530 | * Could not find an AG with enough free space to satisfy | ||
| 1531 | * a full btree split. Try again without minleft and if | ||
| 1532 | * successful activate the lowspace algorithm. | ||
| 1533 | */ | ||
| 1534 | args.fsbno = 0; | ||
| 1535 | args.type = XFS_ALLOCTYPE_FIRST_AG; | ||
| 1536 | args.minleft = 0; | ||
| 1537 | if ((error = xfs_alloc_vextent(&args))) { | ||
| 1538 | XFS_BMBT_TRACE_CURSOR(cur, ERROR); | ||
| 1539 | return error; | ||
| 1540 | } | ||
| 1541 | cur->bc_private.b.flist->xbf_low = 1; | ||
| 1542 | } | ||
| 1513 | if (args.fsbno == NULLFSBLOCK) { | 1543 | if (args.fsbno == NULLFSBLOCK) { |
| 1514 | XFS_BMBT_TRACE_CURSOR(cur, EXIT); | 1544 | XFS_BMBT_TRACE_CURSOR(cur, EXIT); |
| 1515 | *stat = 0; | 1545 | *stat = 0; |
| @@ -2029,22 +2059,8 @@ xfs_bmbt_increment( | |||
| 2029 | * Insert the current record at the point referenced by cur. | 2059 | * Insert the current record at the point referenced by cur. |
| 2030 | * | 2060 | * |
| 2031 | * A multi-level split of the tree on insert will invalidate the original | 2061 | * A multi-level split of the tree on insert will invalidate the original |
| 2032 | * cursor. It appears, however, that some callers assume that the cursor is | 2062 | * cursor. All callers of this function should assume that the cursor is |
| 2033 | * always valid. Hence if we do a multi-level split we need to revalidate the | 2063 | * no longer valid and revalidate it. |
| 2034 | * cursor. | ||
| 2035 | * | ||
| 2036 | * When a split occurs, we will see a new cursor returned. Use that as a | ||
| 2037 | * trigger to determine if we need to revalidate the original cursor. If we get | ||
| 2038 | * a split, then use the original irec to lookup up the path of the record we | ||
| 2039 | * just inserted. | ||
| 2040 | * | ||
| 2041 | * Note that the fact that the btree root is in the inode means that we can | ||
| 2042 | * have the level of the tree change without a "split" occurring at the root | ||
| 2043 | * level. What happens is that the root is migrated to an allocated block and | ||
| 2044 | * the inode root is pointed to it. This means a single split can change the | ||
| 2045 | * level of the tree (level 2 -> level 3) and invalidate the old cursor. Hence | ||
| 2046 | * the level change should be accounted as a split so as to correctly trigger a | ||
| 2047 | * revalidation of the old cursor. | ||
| 2048 | */ | 2064 | */ |
| 2049 | int /* error */ | 2065 | int /* error */ |
| 2050 | xfs_bmbt_insert( | 2066 | xfs_bmbt_insert( |
| @@ -2057,14 +2073,11 @@ xfs_bmbt_insert( | |||
| 2057 | xfs_fsblock_t nbno; | 2073 | xfs_fsblock_t nbno; |
| 2058 | xfs_btree_cur_t *ncur; | 2074 | xfs_btree_cur_t *ncur; |
| 2059 | xfs_bmbt_rec_t nrec; | 2075 | xfs_bmbt_rec_t nrec; |
| 2060 | xfs_bmbt_irec_t oirec; /* original irec */ | ||
| 2061 | xfs_btree_cur_t *pcur; | 2076 | xfs_btree_cur_t *pcur; |
| 2062 | int splits = 0; | ||
| 2063 | 2077 | ||
| 2064 | XFS_BMBT_TRACE_CURSOR(cur, ENTRY); | 2078 | XFS_BMBT_TRACE_CURSOR(cur, ENTRY); |
| 2065 | level = 0; | 2079 | level = 0; |
| 2066 | nbno = NULLFSBLOCK; | 2080 | nbno = NULLFSBLOCK; |
| 2067 | oirec = cur->bc_rec.b; | ||
| 2068 | xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b); | 2081 | xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b); |
| 2069 | ncur = NULL; | 2082 | ncur = NULL; |
| 2070 | pcur = cur; | 2083 | pcur = cur; |
| @@ -2073,13 +2086,11 @@ xfs_bmbt_insert( | |||
| 2073 | &i))) { | 2086 | &i))) { |
| 2074 | if (pcur != cur) | 2087 | if (pcur != cur) |
| 2075 | xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); | 2088 | xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); |
| 2076 | goto error0; | 2089 | XFS_BMBT_TRACE_CURSOR(cur, ERROR); |
| 2090 | return error; | ||
| 2077 | } | 2091 | } |
| 2078 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | 2092 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); |
| 2079 | if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) { | 2093 | if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) { |
| 2080 | /* allocating a new root is effectively a split */ | ||
| 2081 | if (cur->bc_nlevels != pcur->bc_nlevels) | ||
| 2082 | splits++; | ||
| 2083 | cur->bc_nlevels = pcur->bc_nlevels; | 2094 | cur->bc_nlevels = pcur->bc_nlevels; |
| 2084 | cur->bc_private.b.allocated += | 2095 | cur->bc_private.b.allocated += |
| 2085 | pcur->bc_private.b.allocated; | 2096 | pcur->bc_private.b.allocated; |
| @@ -2093,21 +2104,10 @@ xfs_bmbt_insert( | |||
| 2093 | xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); | 2104 | xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); |
| 2094 | } | 2105 | } |
| 2095 | if (ncur) { | 2106 | if (ncur) { |
| 2096 | splits++; | ||
| 2097 | pcur = ncur; | 2107 | pcur = ncur; |
| 2098 | ncur = NULL; | 2108 | ncur = NULL; |
| 2099 | } | 2109 | } |
| 2100 | } while (nbno != NULLFSBLOCK); | 2110 | } while (nbno != NULLFSBLOCK); |
| 2101 | |||
| 2102 | if (splits > 1) { | ||
| 2103 | /* revalidate the old cursor as we had a multi-level split */ | ||
| 2104 | error = xfs_bmbt_lookup_eq(cur, oirec.br_startoff, | ||
| 2105 | oirec.br_startblock, oirec.br_blockcount, &i); | ||
| 2106 | if (error) | ||
| 2107 | goto error0; | ||
| 2108 | ASSERT(i == 1); | ||
| 2109 | } | ||
| 2110 | |||
| 2111 | XFS_BMBT_TRACE_CURSOR(cur, EXIT); | 2111 | XFS_BMBT_TRACE_CURSOR(cur, EXIT); |
| 2112 | *stat = i; | 2112 | *stat = i; |
| 2113 | return 0; | 2113 | return 0; |
| @@ -2254,7 +2254,9 @@ xfs_bmbt_newroot( | |||
| 2254 | #endif | 2254 | #endif |
| 2255 | args.fsbno = be64_to_cpu(*pp); | 2255 | args.fsbno = be64_to_cpu(*pp); |
| 2256 | args.type = XFS_ALLOCTYPE_START_BNO; | 2256 | args.type = XFS_ALLOCTYPE_START_BNO; |
| 2257 | } else | 2257 | } else if (cur->bc_private.b.flist->xbf_low) |
| 2258 | args.type = XFS_ALLOCTYPE_START_BNO; | ||
| 2259 | else | ||
| 2258 | args.type = XFS_ALLOCTYPE_NEAR_BNO; | 2260 | args.type = XFS_ALLOCTYPE_NEAR_BNO; |
| 2259 | if ((error = xfs_alloc_vextent(&args))) { | 2261 | if ((error = xfs_alloc_vextent(&args))) { |
| 2260 | XFS_BMBT_TRACE_CURSOR(cur, ERROR); | 2262 | XFS_BMBT_TRACE_CURSOR(cur, ERROR); |
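
The xfs_bmbt_split() change above is a two-pass allocation: first ask for a block while requiring enough free space left in the AG (minleft, taken from the transaction's block reservation) to complete a full tree split, and if no AG can satisfy that, retry with no minleft using a first-AG search and mark the free list with xbf_low. A compact sketch of that retry shape follows; alloc_args and try_alloc are toy stand-ins for xfs_alloc_arg_t and xfs_alloc_vextent(), not the real interfaces.

/*
 * Two-pass "reserve headroom, then fall back to lowspace" allocation.
 * Everything here is a simplified stand-in for the XFS allocator.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NULLBLOCK	((uint64_t)-1)

struct alloc_args {
	uint64_t	fsbno;		/* result block, NULLBLOCK if none */
	unsigned int	minleft;	/* blocks to leave free in the AG */
	bool		first_ag;	/* sequential search from AG 0 */
};

/* Toy allocator: pretend no AG can keep more than 8 blocks spare. */
static int try_alloc(struct alloc_args *args)
{
	if (args->minleft > 8) {
		args->fsbno = NULLBLOCK;	/* headroom not available */
		return 0;
	}
	args->fsbno = 1234;			/* pretend block number */
	return 0;
}

static int split_alloc(struct alloc_args *args, unsigned int tres,
		       bool *lowspace)
{
	int error;

	args->minleft = tres;		/* leave room for a full split */
	if ((error = try_alloc(args)))
		return error;
	if (args->fsbno == NULLBLOCK && args->minleft) {
		/* No AG had both a free block and the requested
		 * headroom: retry without it and go lowspace. */
		args->minleft = 0;
		args->first_ag = true;
		if ((error = try_alloc(args)))
			return error;
		*lowspace = true;
	}
	return 0;
}

int main(void)
{
	struct alloc_args args = { .fsbno = NULLBLOCK };
	bool lowspace = false;

	split_alloc(&args, 16, &lowspace);
	/* prints "block 1234, lowspace 1" */
	printf("block %llu, lowspace %d\n",
	       (unsigned long long)args.fsbno, lowspace);
	return 0;
}
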
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index aeb87ca69fcc..cc593a84c345 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c | |||
| @@ -46,38 +46,11 @@ kmem_zone_t *xfs_btree_cur_zone; | |||
| 46 | /* | 46 | /* |
| 47 | * Btree magic numbers. | 47 | * Btree magic numbers. |
| 48 | */ | 48 | */ |
| 49 | const __uint32_t xfs_magics[XFS_BTNUM_MAX] = | 49 | const __uint32_t xfs_magics[XFS_BTNUM_MAX] = { |
| 50 | { | ||
| 51 | XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC | 50 | XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC |
| 52 | }; | 51 | }; |
| 53 | 52 | ||
| 54 | /* | 53 | /* |
| 55 | * Prototypes for internal routines. | ||
| 56 | */ | ||
| 57 | |||
| 58 | /* | ||
| 59 | * Checking routine: return maxrecs for the block. | ||
| 60 | */ | ||
| 61 | STATIC int /* number of records fitting in block */ | ||
| 62 | xfs_btree_maxrecs( | ||
| 63 | xfs_btree_cur_t *cur, /* btree cursor */ | ||
| 64 | xfs_btree_block_t *block);/* generic btree block pointer */ | ||
| 65 | |||
| 66 | /* | ||
| 67 | * Internal routines. | ||
| 68 | */ | ||
| 69 | |||
| 70 | /* | ||
| 71 | * Retrieve the block pointer from the cursor at the given level. | ||
| 72 | * This may be a bmap btree root or from a buffer. | ||
| 73 | */ | ||
| 74 | STATIC xfs_btree_block_t * /* generic btree block pointer */ | ||
| 75 | xfs_btree_get_block( | ||
| 76 | xfs_btree_cur_t *cur, /* btree cursor */ | ||
| 77 | int level, /* level in btree */ | ||
| 78 | struct xfs_buf **bpp); /* buffer containing the block */ | ||
| 79 | |||
| 80 | /* | ||
| 81 | * Checking routine: return maxrecs for the block. | 54 | * Checking routine: return maxrecs for the block. |
| 82 | */ | 55 | */ |
| 83 | STATIC int /* number of records fitting in block */ | 56 | STATIC int /* number of records fitting in block */ |
| @@ -457,35 +430,6 @@ xfs_btree_dup_cursor( | |||
| 457 | } | 430 | } |
| 458 | 431 | ||
| 459 | /* | 432 | /* |
| 460 | * Change the cursor to point to the first record at the given level. | ||
| 461 | * Other levels are unaffected. | ||
| 462 | */ | ||
| 463 | int /* success=1, failure=0 */ | ||
| 464 | xfs_btree_firstrec( | ||
| 465 | xfs_btree_cur_t *cur, /* btree cursor */ | ||
| 466 | int level) /* level to change */ | ||
| 467 | { | ||
| 468 | xfs_btree_block_t *block; /* generic btree block pointer */ | ||
| 469 | xfs_buf_t *bp; /* buffer containing block */ | ||
| 470 | |||
| 471 | /* | ||
| 472 | * Get the block pointer for this level. | ||
| 473 | */ | ||
| 474 | block = xfs_btree_get_block(cur, level, &bp); | ||
| 475 | xfs_btree_check_block(cur, block, level, bp); | ||
| 476 | /* | ||
| 477 | * It's empty, there is no such record. | ||
| 478 | */ | ||
| 479 | if (!block->bb_h.bb_numrecs) | ||
| 480 | return 0; | ||
| 481 | /* | ||
| 482 | * Set the ptr value to 1, that's the first record/key. | ||
| 483 | */ | ||
| 484 | cur->bc_ptrs[level] = 1; | ||
| 485 | return 1; | ||
| 486 | } | ||
| 487 | |||
| 488 | /* | ||
| 489 | * Retrieve the block pointer from the cursor at the given level. | 433 | * Retrieve the block pointer from the cursor at the given level. |
| 490 | * This may be a bmap btree root or from a buffer. | 434 | * This may be a bmap btree root or from a buffer. |
| 491 | */ | 435 | */ |
| @@ -626,6 +570,13 @@ xfs_btree_init_cursor( | |||
| 626 | cur->bc_private.a.agbp = agbp; | 570 | cur->bc_private.a.agbp = agbp; |
| 627 | cur->bc_private.a.agno = agno; | 571 | cur->bc_private.a.agno = agno; |
| 628 | break; | 572 | break; |
| 573 | case XFS_BTNUM_INO: | ||
| 574 | /* | ||
| 575 | * Inode allocation btree fields. | ||
| 576 | */ | ||
| 577 | cur->bc_private.a.agbp = agbp; | ||
| 578 | cur->bc_private.a.agno = agno; | ||
| 579 | break; | ||
| 629 | case XFS_BTNUM_BMAP: | 580 | case XFS_BTNUM_BMAP: |
| 630 | /* | 581 | /* |
| 631 | * Bmap btree fields. | 582 | * Bmap btree fields. |
| @@ -638,13 +589,6 @@ xfs_btree_init_cursor( | |||
| 638 | cur->bc_private.b.flags = 0; | 589 | cur->bc_private.b.flags = 0; |
| 639 | cur->bc_private.b.whichfork = whichfork; | 590 | cur->bc_private.b.whichfork = whichfork; |
| 640 | break; | 591 | break; |
| 641 | case XFS_BTNUM_INO: | ||
| 642 | /* | ||
| 643 | * Inode allocation btree fields. | ||
| 644 | */ | ||
| 645 | cur->bc_private.i.agbp = agbp; | ||
| 646 | cur->bc_private.i.agno = agno; | ||
| 647 | break; | ||
| 648 | default: | 592 | default: |
| 649 | ASSERT(0); | 593 | ASSERT(0); |
| 650 | } | 594 | } |
| @@ -671,6 +615,35 @@ xfs_btree_islastblock( | |||
| 671 | } | 615 | } |
| 672 | 616 | ||
| 673 | /* | 617 | /* |
| 618 | * Change the cursor to point to the first record at the given level. | ||
| 619 | * Other levels are unaffected. | ||
| 620 | */ | ||
| 621 | int /* success=1, failure=0 */ | ||
| 622 | xfs_btree_firstrec( | ||
| 623 | xfs_btree_cur_t *cur, /* btree cursor */ | ||
| 624 | int level) /* level to change */ | ||
| 625 | { | ||
| 626 | xfs_btree_block_t *block; /* generic btree block pointer */ | ||
| 627 | xfs_buf_t *bp; /* buffer containing block */ | ||
| 628 | |||
| 629 | /* | ||
| 630 | * Get the block pointer for this level. | ||
| 631 | */ | ||
| 632 | block = xfs_btree_get_block(cur, level, &bp); | ||
| 633 | xfs_btree_check_block(cur, block, level, bp); | ||
| 634 | /* | ||
| 635 | * It's empty, there is no such record. | ||
| 636 | */ | ||
| 637 | if (!block->bb_h.bb_numrecs) | ||
| 638 | return 0; | ||
| 639 | /* | ||
| 640 | * Set the ptr value to 1, that's the first record/key. | ||
| 641 | */ | ||
| 642 | cur->bc_ptrs[level] = 1; | ||
| 643 | return 1; | ||
| 644 | } | ||
| 645 | |||
| 646 | /* | ||
| 674 | * Change the cursor to point to the last record in the current block | 647 | * Change the cursor to point to the last record in the current block |
| 675 | * at the given level. Other levels are unaffected. | 648 | * at the given level. Other levels are unaffected. |
| 676 | */ | 649 | */ |
| @@ -890,12 +863,12 @@ xfs_btree_readahead_core( | |||
| 890 | case XFS_BTNUM_INO: | 863 | case XFS_BTNUM_INO: |
| 891 | i = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]); | 864 | i = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]); |
| 892 | if ((lr & XFS_BTCUR_LEFTRA) && be32_to_cpu(i->bb_leftsib) != NULLAGBLOCK) { | 865 | if ((lr & XFS_BTCUR_LEFTRA) && be32_to_cpu(i->bb_leftsib) != NULLAGBLOCK) { |
| 893 | xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno, | 866 | xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, |
| 894 | be32_to_cpu(i->bb_leftsib), 1); | 867 | be32_to_cpu(i->bb_leftsib), 1); |
| 895 | rval++; | 868 | rval++; |
| 896 | } | 869 | } |
| 897 | if ((lr & XFS_BTCUR_RIGHTRA) && be32_to_cpu(i->bb_rightsib) != NULLAGBLOCK) { | 870 | if ((lr & XFS_BTCUR_RIGHTRA) && be32_to_cpu(i->bb_rightsib) != NULLAGBLOCK) { |
| 898 | xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno, | 871 | xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, |
| 899 | be32_to_cpu(i->bb_rightsib), 1); | 872 | be32_to_cpu(i->bb_rightsib), 1); |
| 900 | rval++; | 873 | rval++; |
| 901 | } | 874 | } |
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 7440b78f9cec..1f528a2a3754 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h | |||
| @@ -158,8 +158,8 @@ typedef struct xfs_btree_cur | |||
| 158 | __uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */ | 158 | __uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */ |
| 159 | xfs_btnum_t bc_btnum; /* identifies which btree type */ | 159 | xfs_btnum_t bc_btnum; /* identifies which btree type */ |
| 160 | union { | 160 | union { |
| 161 | struct { /* needed for BNO, CNT */ | 161 | struct { /* needed for BNO, CNT, INO */ |
| 162 | struct xfs_buf *agbp; /* agf buffer pointer */ | 162 | struct xfs_buf *agbp; /* agf/agi buffer pointer */ |
| 163 | xfs_agnumber_t agno; /* ag number */ | 163 | xfs_agnumber_t agno; /* ag number */ |
| 164 | } a; | 164 | } a; |
| 165 | struct { /* needed for BMAP */ | 165 | struct { /* needed for BMAP */ |
| @@ -172,10 +172,6 @@ typedef struct xfs_btree_cur | |||
| 172 | char flags; /* flags */ | 172 | char flags; /* flags */ |
| 173 | #define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */ | 173 | #define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */ |
| 174 | } b; | 174 | } b; |
| 175 | struct { /* needed for INO */ | ||
| 176 | struct xfs_buf *agbp; /* agi buffer pointer */ | ||
| 177 | xfs_agnumber_t agno; /* ag number */ | ||
| 178 | } i; | ||
| 179 | } bc_private; /* per-btree type data */ | 175 | } bc_private; /* per-btree type data */ |
| 180 | } xfs_btree_cur_t; | 176 | } xfs_btree_cur_t; |
| 181 | 177 | ||
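
The xfs_btree changes above fold the inode btree's cursor private area into the same union arm as the by-block and by-size allocation btrees, since the fields (an AG buffer pointer and an AG number) were identical; only the bmap btree keeps its own arm. A toy version of that layout, with invented types, is shown below.

/*
 * Minimal illustration of the cursor change: the separate ".i" arm is
 * gone and all per-AG btrees share ".a".  Types here are toy stand-ins,
 * not the kernel structures.
 */
#include <stdio.h>

struct buf { int id; };

enum btnum { BTNUM_BNO, BTNUM_CNT, BTNUM_INO, BTNUM_BMAP };

struct btree_cur {
	enum btnum	btnum;
	union {
		struct {		/* BNO, CNT and now INO */
			struct buf	*agbp;	/* AGF or AGI buffer */
			unsigned int	agno;	/* AG number */
		} a;
		struct {		/* BMAP only */
			void		*ip;	/* inode owning the fork */
			int		whichfork;
		} b;
	} private;
};

int main(void)
{
	struct buf agi = { .id = 42 };
	struct btree_cur cur = { .btnum = BTNUM_INO };

	/* The inode btree initialises the same fields as BNO/CNT. */
	cur.private.a.agbp = &agi;
	cur.private.a.agno = 7;
	printf("ag %u, buffer %d\n", cur.private.a.agno, cur.private.a.agbp->id);
	return 0;
}
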
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 53a71c62025d..002fc2617c8e 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c | |||
| @@ -732,12 +732,13 @@ xfs_buf_item_init( | |||
| 732 | bip->bli_item.li_ops = &xfs_buf_item_ops; | 732 | bip->bli_item.li_ops = &xfs_buf_item_ops; |
| 733 | bip->bli_item.li_mountp = mp; | 733 | bip->bli_item.li_mountp = mp; |
| 734 | bip->bli_buf = bp; | 734 | bip->bli_buf = bp; |
| 735 | xfs_buf_hold(bp); | ||
| 735 | bip->bli_format.blf_type = XFS_LI_BUF; | 736 | bip->bli_format.blf_type = XFS_LI_BUF; |
| 736 | bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); | 737 | bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); |
| 737 | bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp)); | 738 | bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp)); |
| 738 | bip->bli_format.blf_map_size = map_size; | 739 | bip->bli_format.blf_map_size = map_size; |
| 739 | #ifdef XFS_BLI_TRACE | 740 | #ifdef XFS_BLI_TRACE |
| 740 | bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_SLEEP); | 741 | bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_NOFS); |
| 741 | #endif | 742 | #endif |
| 742 | 743 | ||
| 743 | #ifdef XFS_TRANS_DEBUG | 744 | #ifdef XFS_TRANS_DEBUG |
| @@ -867,6 +868,21 @@ xfs_buf_item_dirty( | |||
| 867 | return (bip->bli_flags & XFS_BLI_DIRTY); | 868 | return (bip->bli_flags & XFS_BLI_DIRTY); |
| 868 | } | 869 | } |
| 869 | 870 | ||
| 871 | STATIC void | ||
| 872 | xfs_buf_item_free( | ||
| 873 | xfs_buf_log_item_t *bip) | ||
| 874 | { | ||
| 875 | #ifdef XFS_TRANS_DEBUG | ||
| 876 | kmem_free(bip->bli_orig); | ||
| 877 | kmem_free(bip->bli_logged); | ||
| 878 | #endif /* XFS_TRANS_DEBUG */ | ||
| 879 | |||
| 880 | #ifdef XFS_BLI_TRACE | ||
| 881 | ktrace_free(bip->bli_trace); | ||
| 882 | #endif | ||
| 883 | kmem_zone_free(xfs_buf_item_zone, bip); | ||
| 884 | } | ||
| 885 | |||
| 870 | /* | 886 | /* |
| 871 | * This is called when the buf log item is no longer needed. It should | 887 | * This is called when the buf log item is no longer needed. It should |
| 872 | * free the buf log item associated with the given buffer and clear | 888 | * free the buf log item associated with the given buffer and clear |
| @@ -887,18 +903,8 @@ xfs_buf_item_relse( | |||
| 887 | (XFS_BUF_IODONE_FUNC(bp) != NULL)) { | 903 | (XFS_BUF_IODONE_FUNC(bp) != NULL)) { |
| 888 | XFS_BUF_CLR_IODONE_FUNC(bp); | 904 | XFS_BUF_CLR_IODONE_FUNC(bp); |
| 889 | } | 905 | } |
| 890 | 906 | xfs_buf_rele(bp); | |
| 891 | #ifdef XFS_TRANS_DEBUG | 907 | xfs_buf_item_free(bip); |
| 892 | kmem_free(bip->bli_orig, XFS_BUF_COUNT(bp)); | ||
| 893 | bip->bli_orig = NULL; | ||
| 894 | kmem_free(bip->bli_logged, XFS_BUF_COUNT(bp) / NBBY); | ||
| 895 | bip->bli_logged = NULL; | ||
| 896 | #endif /* XFS_TRANS_DEBUG */ | ||
| 897 | |||
| 898 | #ifdef XFS_BLI_TRACE | ||
| 899 | ktrace_free(bip->bli_trace); | ||
| 900 | #endif | ||
| 901 | kmem_zone_free(xfs_buf_item_zone, bip); | ||
| 902 | } | 908 | } |
| 903 | 909 | ||
| 904 | 910 | ||
| @@ -1056,7 +1062,7 @@ xfs_buf_iodone_callbacks( | |||
| 1056 | anyway. */ | 1062 | anyway. */ |
| 1057 | XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse); | 1063 | XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse); |
| 1058 | XFS_BUF_DONE(bp); | 1064 | XFS_BUF_DONE(bp); |
| 1059 | XFS_BUF_V_IODONESEMA(bp); | 1065 | XFS_BUF_FINISH_IOWAIT(bp); |
| 1060 | } | 1066 | } |
| 1061 | return; | 1067 | return; |
| 1062 | } | 1068 | } |
| @@ -1120,6 +1126,7 @@ xfs_buf_iodone( | |||
| 1120 | 1126 | ||
| 1121 | ASSERT(bip->bli_buf == bp); | 1127 | ASSERT(bip->bli_buf == bp); |
| 1122 | 1128 | ||
| 1129 | xfs_buf_rele(bp); | ||
| 1123 | mp = bip->bli_item.li_mountp; | 1130 | mp = bip->bli_item.li_mountp; |
| 1124 | 1131 | ||
| 1125 | /* | 1132 | /* |
| @@ -1136,18 +1143,7 @@ xfs_buf_iodone( | |||
| 1136 | * xfs_trans_delete_ail() drops the AIL lock. | 1143 | * xfs_trans_delete_ail() drops the AIL lock. |
| 1137 | */ | 1144 | */ |
| 1138 | xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip); | 1145 | xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip); |
| 1139 | 1146 | xfs_buf_item_free(bip); | |
| 1140 | #ifdef XFS_TRANS_DEBUG | ||
| 1141 | kmem_free(bip->bli_orig, XFS_BUF_COUNT(bp)); | ||
| 1142 | bip->bli_orig = NULL; | ||
| 1143 | kmem_free(bip->bli_logged, XFS_BUF_COUNT(bp) / NBBY); | ||
| 1144 | bip->bli_logged = NULL; | ||
| 1145 | #endif /* XFS_TRANS_DEBUG */ | ||
| 1146 | |||
| 1147 | #ifdef XFS_BLI_TRACE | ||
| 1148 | ktrace_free(bip->bli_trace); | ||
| 1149 | #endif | ||
| 1150 | kmem_zone_free(xfs_buf_item_zone, bip); | ||
| 1151 | } | 1147 | } |
| 1152 | 1148 | ||
| 1153 | #if defined(XFS_BLI_TRACE) | 1149 | #if defined(XFS_BLI_TRACE) |
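
Two things happen in the xfs_buf_item.c hunks above: the buf log item now takes its own reference on the buffer when it is created (xfs_buf_hold()) and drops it when the item goes away (xfs_buf_rele()), and the duplicated teardown code from the release and iodone paths is consolidated into xfs_buf_item_free(). The sketch below illustrates only that hold-at-create/release-at-free ownership rule with a plain integer refcount; it is not the kernel implementation, which uses atomic counts and splits the release between the two I/O completion paths and the free helper.

/*
 * Ownership sketch: the log item pins the buffer for its whole life,
 * and a single teardown helper drops that reference and frees the item.
 */
#include <stdio.h>
#include <stdlib.h>

struct buffer {
	int refcount;
};

struct buf_log_item {
	struct buffer *buf;
};

static void buf_hold(struct buffer *bp) { bp->refcount++; }

static void buf_rele(struct buffer *bp)
{
	if (--bp->refcount == 0)
		printf("buffer freed\n");
}

static struct buf_log_item *item_init(struct buffer *bp)
{
	struct buf_log_item *bip = malloc(sizeof(*bip));

	if (!bip)
		abort();	/* keep the sketch simple */
	bip->buf = bp;
	buf_hold(bp);		/* item owns a reference from birth */
	return bip;
}

/* One teardown path: drop the item's buffer reference, free the item. */
static void item_free(struct buf_log_item *bip)
{
	buf_rele(bip->buf);
	free(bip);
}

int main(void)
{
	struct buffer bp = { .refcount = 1 };	/* caller's reference */
	struct buf_log_item *bip = item_init(&bp);

	item_free(bip);		/* item's reference gone, buffer survives */
	printf("refcount now %d\n", bp.refcount);
	buf_rele(&bp);		/* caller's reference; buffer freed */
	return 0;
}
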
diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h index d5d1e60ee224..d2ce5dd70d87 100644 --- a/fs/xfs/xfs_clnt.h +++ b/fs/xfs/xfs_clnt.h | |||
| @@ -78,6 +78,7 @@ struct xfs_mount_args { | |||
| 78 | #define XFSMNT_IOSIZE 0x00002000 /* optimize for I/O size */ | 78 | #define XFSMNT_IOSIZE 0x00002000 /* optimize for I/O size */ |
| 79 | #define XFSMNT_OSYNCISOSYNC 0x00004000 /* o_sync is REALLY o_sync */ | 79 | #define XFSMNT_OSYNCISOSYNC 0x00004000 /* o_sync is REALLY o_sync */ |
| 80 | /* (osyncisdsync is default) */ | 80 | /* (osyncisdsync is default) */ |
| 81 | #define XFSMNT_NOATTR2 0x00008000 /* turn off ATTR2 EA format */ | ||
| 81 | #define XFSMNT_32BITINODES 0x00200000 /* restrict inodes to 32 | 82 | #define XFSMNT_32BITINODES 0x00200000 /* restrict inodes to 32 |
| 82 | * bits of address space */ | 83 | * bits of address space */ |
| 83 | #define XFSMNT_GQUOTA 0x00400000 /* group quota accounting */ | 84 | #define XFSMNT_GQUOTA 0x00400000 /* group quota accounting */ |
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 021a8f7e563f..9e561a9cefca 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c | |||
| @@ -1431,7 +1431,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, | |||
| 1431 | } | 1431 | } |
| 1432 | if (level < 0) { | 1432 | if (level < 0) { |
| 1433 | *result = XFS_ERROR(ENOENT); /* we're out of our tree */ | 1433 | *result = XFS_ERROR(ENOENT); /* we're out of our tree */ |
| 1434 | ASSERT(args->oknoent); | 1434 | ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); |
| 1435 | return(0); | 1435 | return(0); |
| 1436 | } | 1436 | } |
| 1437 | 1437 | ||
| @@ -1530,6 +1530,28 @@ xfs_da_hashname(const uchar_t *name, int namelen) | |||
| 1530 | } | 1530 | } |
| 1531 | } | 1531 | } |
| 1532 | 1532 | ||
| 1533 | enum xfs_dacmp | ||
| 1534 | xfs_da_compname( | ||
| 1535 | struct xfs_da_args *args, | ||
| 1536 | const char *name, | ||
| 1537 | int len) | ||
| 1538 | { | ||
| 1539 | return (args->namelen == len && memcmp(args->name, name, len) == 0) ? | ||
| 1540 | XFS_CMP_EXACT : XFS_CMP_DIFFERENT; | ||
| 1541 | } | ||
| 1542 | |||
| 1543 | static xfs_dahash_t | ||
| 1544 | xfs_default_hashname( | ||
| 1545 | struct xfs_name *name) | ||
| 1546 | { | ||
| 1547 | return xfs_da_hashname(name->name, name->len); | ||
| 1548 | } | ||
| 1549 | |||
| 1550 | const struct xfs_nameops xfs_default_nameops = { | ||
| 1551 | .hashname = xfs_default_hashname, | ||
| 1552 | .compname = xfs_da_compname | ||
| 1553 | }; | ||
| 1554 | |||
| 1533 | /* | 1555 | /* |
| 1534 | * Add a block to the btree ahead of the file. | 1556 | * Add a block to the btree ahead of the file. |
| 1535 | * Return the new block number to the caller. | 1557 | * Return the new block number to the caller. |
| @@ -1598,7 +1620,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno) | |||
| 1598 | args->firstblock, args->total, | 1620 | args->firstblock, args->total, |
| 1599 | &mapp[mapi], &nmap, args->flist, | 1621 | &mapp[mapi], &nmap, args->flist, |
| 1600 | NULL))) { | 1622 | NULL))) { |
| 1601 | kmem_free(mapp, sizeof(*mapp) * count); | 1623 | kmem_free(mapp); |
| 1602 | return error; | 1624 | return error; |
| 1603 | } | 1625 | } |
| 1604 | if (nmap < 1) | 1626 | if (nmap < 1) |
| @@ -1620,11 +1642,11 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno) | |||
| 1620 | mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount != | 1642 | mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount != |
| 1621 | bno + count) { | 1643 | bno + count) { |
| 1622 | if (mapp != &map) | 1644 | if (mapp != &map) |
| 1623 | kmem_free(mapp, sizeof(*mapp) * count); | 1645 | kmem_free(mapp); |
| 1624 | return XFS_ERROR(ENOSPC); | 1646 | return XFS_ERROR(ENOSPC); |
| 1625 | } | 1647 | } |
| 1626 | if (mapp != &map) | 1648 | if (mapp != &map) |
| 1627 | kmem_free(mapp, sizeof(*mapp) * count); | 1649 | kmem_free(mapp); |
| 1628 | *new_blkno = (xfs_dablk_t)bno; | 1650 | *new_blkno = (xfs_dablk_t)bno; |
| 1629 | return 0; | 1651 | return 0; |
| 1630 | } | 1652 | } |
| @@ -2090,10 +2112,10 @@ xfs_da_do_buf( | |||
| 2090 | } | 2112 | } |
| 2091 | } | 2113 | } |
| 2092 | if (bplist) { | 2114 | if (bplist) { |
| 2093 | kmem_free(bplist, sizeof(*bplist) * nmap); | 2115 | kmem_free(bplist); |
| 2094 | } | 2116 | } |
| 2095 | if (mapp != &map) { | 2117 | if (mapp != &map) { |
| 2096 | kmem_free(mapp, sizeof(*mapp) * nfsb); | 2118 | kmem_free(mapp); |
| 2097 | } | 2119 | } |
| 2098 | if (bpp) | 2120 | if (bpp) |
| 2099 | *bpp = rbp; | 2121 | *bpp = rbp; |
| @@ -2102,11 +2124,11 @@ exit1: | |||
| 2102 | if (bplist) { | 2124 | if (bplist) { |
| 2103 | for (i = 0; i < nbplist; i++) | 2125 | for (i = 0; i < nbplist; i++) |
| 2104 | xfs_trans_brelse(trans, bplist[i]); | 2126 | xfs_trans_brelse(trans, bplist[i]); |
| 2105 | kmem_free(bplist, sizeof(*bplist) * nmap); | 2127 | kmem_free(bplist); |
| 2106 | } | 2128 | } |
| 2107 | exit0: | 2129 | exit0: |
| 2108 | if (mapp != &map) | 2130 | if (mapp != &map) |
| 2109 | kmem_free(mapp, sizeof(*mapp) * nfsb); | 2131 | kmem_free(mapp); |
| 2110 | if (bpp) | 2132 | if (bpp) |
| 2111 | *bpp = NULL; | 2133 | *bpp = NULL; |
| 2112 | return error; | 2134 | return error; |
| @@ -2218,7 +2240,7 @@ xfs_da_state_free(xfs_da_state_t *state) | |||
| 2218 | 2240 | ||
| 2219 | #ifdef XFS_DABUF_DEBUG | 2241 | #ifdef XFS_DABUF_DEBUG |
| 2220 | xfs_dabuf_t *xfs_dabuf_global_list; | 2242 | xfs_dabuf_t *xfs_dabuf_global_list; |
| 2221 | spinlock_t xfs_dabuf_global_lock; | 2243 | static DEFINE_SPINLOCK(xfs_dabuf_global_lock); |
| 2222 | #endif | 2244 | #endif |
| 2223 | 2245 | ||
| 2224 | /* | 2246 | /* |
| @@ -2315,7 +2337,7 @@ xfs_da_buf_done(xfs_dabuf_t *dabuf) | |||
| 2315 | if (dabuf->dirty) | 2337 | if (dabuf->dirty) |
| 2316 | xfs_da_buf_clean(dabuf); | 2338 | xfs_da_buf_clean(dabuf); |
| 2317 | if (dabuf->nbuf > 1) | 2339 | if (dabuf->nbuf > 1) |
| 2318 | kmem_free(dabuf->data, BBTOB(dabuf->bbcount)); | 2340 | kmem_free(dabuf->data); |
| 2319 | #ifdef XFS_DABUF_DEBUG | 2341 | #ifdef XFS_DABUF_DEBUG |
| 2320 | { | 2342 | { |
| 2321 | spin_lock(&xfs_dabuf_global_lock); | 2343 | spin_lock(&xfs_dabuf_global_lock); |
| @@ -2332,7 +2354,7 @@ xfs_da_buf_done(xfs_dabuf_t *dabuf) | |||
| 2332 | if (dabuf->nbuf == 1) | 2354 | if (dabuf->nbuf == 1) |
| 2333 | kmem_zone_free(xfs_dabuf_zone, dabuf); | 2355 | kmem_zone_free(xfs_dabuf_zone, dabuf); |
| 2334 | else | 2356 | else |
| 2335 | kmem_free(dabuf, XFS_DA_BUF_SIZE(dabuf->nbuf)); | 2357 | kmem_free(dabuf); |
| 2336 | } | 2358 | } |
| 2337 | 2359 | ||
| 2338 | /* | 2360 | /* |
| @@ -2403,7 +2425,7 @@ xfs_da_brelse(xfs_trans_t *tp, xfs_dabuf_t *dabuf) | |||
| 2403 | for (i = 0; i < nbuf; i++) | 2425 | for (i = 0; i < nbuf; i++) |
| 2404 | xfs_trans_brelse(tp, bplist[i]); | 2426 | xfs_trans_brelse(tp, bplist[i]); |
| 2405 | if (bplist != &bp) | 2427 | if (bplist != &bp) |
| 2406 | kmem_free(bplist, nbuf * sizeof(*bplist)); | 2428 | kmem_free(bplist); |
| 2407 | } | 2429 | } |
| 2408 | 2430 | ||
| 2409 | /* | 2431 | /* |
| @@ -2429,7 +2451,7 @@ xfs_da_binval(xfs_trans_t *tp, xfs_dabuf_t *dabuf) | |||
| 2429 | for (i = 0; i < nbuf; i++) | 2451 | for (i = 0; i < nbuf; i++) |
| 2430 | xfs_trans_binval(tp, bplist[i]); | 2452 | xfs_trans_binval(tp, bplist[i]); |
| 2431 | if (bplist != &bp) | 2453 | if (bplist != &bp) |
| 2432 | kmem_free(bplist, nbuf * sizeof(*bplist)); | 2454 | kmem_free(bplist); |
| 2433 | } | 2455 | } |
| 2434 | 2456 | ||
| 2435 | /* | 2457 | /* |
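Every kmem_free() call in the hunks above loses its second (size) argument. The one-argument definition itself is not part of the hunks shown here; the following is only a sketch of the idea, assuming the generic is_vmalloc_addr() helper is what distinguishes slab memory from vmalloc memory, since both allocators already track their own sizes:

#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>

/* Sketch of a size-less kmem_free(); not the patch's actual definition. */
void
kmem_free(const void *ptr)
{
        if (!is_vmalloc_addr(ptr))
                kfree(ptr);     /* kmalloc/slab memory: pointer alone is enough */
        else
                vfree(ptr);     /* vmalloc memory: likewise needs no size */
}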
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 7facf86f74f9..8be0b00ede9a 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h | |||
| @@ -99,6 +99,15 @@ typedef struct xfs_da_node_entry xfs_da_node_entry_t; | |||
| 99 | *========================================================================*/ | 99 | *========================================================================*/ |
| 100 | 100 | ||
| 101 | /* | 101 | /* |
| 102 | * Search comparison results | ||
| 103 | */ | ||
| 104 | enum xfs_dacmp { | ||
| 105 | XFS_CMP_DIFFERENT, /* names are completely different */ | ||
| 106 | XFS_CMP_EXACT, /* names are exactly the same */ | ||
| 107 | XFS_CMP_CASE /* names are same but differ in case */ | ||
| 108 | }; | ||
| 109 | |||
| 110 | /* | ||
| 102 | * Structure to ease passing around component names. | 111 | * Structure to ease passing around component names. |
| 103 | */ | 112 | */ |
| 104 | typedef struct xfs_da_args { | 113 | typedef struct xfs_da_args { |
| @@ -123,13 +132,20 @@ typedef struct xfs_da_args { | |||
| 123 | int index2; /* index of 2nd attr in blk */ | 132 | int index2; /* index of 2nd attr in blk */ |
| 124 | xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */ | 133 | xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */ |
| 125 | int rmtblkcnt2; /* remote attr value block count */ | 134 | int rmtblkcnt2; /* remote attr value block count */ |
| 126 | unsigned char justcheck; /* T/F: check for ok with no space */ | 135 | int op_flags; /* operation flags */ |
| 127 | unsigned char rename; /* T/F: this is an atomic rename op */ | 136 | enum xfs_dacmp cmpresult; /* name compare result for lookups */ |
| 128 | unsigned char addname; /* T/F: this is an add operation */ | ||
| 129 | unsigned char oknoent; /* T/F: ok to return ENOENT, else die */ | ||
| 130 | } xfs_da_args_t; | 137 | } xfs_da_args_t; |
| 131 | 138 | ||
| 132 | /* | 139 | /* |
| 140 | * Operation flags: | ||
| 141 | */ | ||
| 142 | #define XFS_DA_OP_JUSTCHECK 0x0001 /* check for ok with no space */ | ||
| 143 | #define XFS_DA_OP_RENAME 0x0002 /* this is an atomic rename op */ | ||
| 144 | #define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */ | ||
| 145 | #define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ | ||
| 146 | #define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ | ||
| 147 | |||
| 148 | /* | ||
| 133 | * Structure to describe buffer(s) for a block. | 149 | * Structure to describe buffer(s) for a block. |
| 134 | * This is needed in the directory version 2 format case, when | 150 | * This is needed in the directory version 2 format case, when |
| 135 | * multiple non-contiguous fsblocks might be needed to cover one | 151 | * multiple non-contiguous fsblocks might be needed to cover one |
| @@ -201,6 +217,14 @@ typedef struct xfs_da_state { | |||
| 201 | (uint)(XFS_DA_LOGOFF(BASE, ADDR)), \ | 217 | (uint)(XFS_DA_LOGOFF(BASE, ADDR)), \ |
| 202 | (uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1) | 218 | (uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1) |
| 203 | 219 | ||
| 220 | /* | ||
| 221 | * Name ops for directory and/or attr name operations | ||
| 222 | */ | ||
| 223 | struct xfs_nameops { | ||
| 224 | xfs_dahash_t (*hashname)(struct xfs_name *); | ||
| 225 | enum xfs_dacmp (*compname)(struct xfs_da_args *, const char *, int); | ||
| 226 | }; | ||
| 227 | |||
| 204 | 228 | ||
| 205 | #ifdef __KERNEL__ | 229 | #ifdef __KERNEL__ |
| 206 | /*======================================================================== | 230 | /*======================================================================== |
| @@ -249,6 +273,10 @@ int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, | |||
| 249 | xfs_dabuf_t *dead_buf); | 273 | xfs_dabuf_t *dead_buf); |
| 250 | 274 | ||
| 251 | uint xfs_da_hashname(const uchar_t *name_string, int name_length); | 275 | uint xfs_da_hashname(const uchar_t *name_string, int name_length); |
| 276 | enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, | ||
| 277 | const char *name, int len); | ||
| 278 | |||
| 279 | |||
| 252 | xfs_da_state_t *xfs_da_state_alloc(void); | 280 | xfs_da_state_t *xfs_da_state_alloc(void); |
| 253 | void xfs_da_state_free(xfs_da_state_t *state); | 281 | void xfs_da_state_free(xfs_da_state_t *state); |
| 254 | 282 | ||
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 5f3647cb9885..75b0cd4da0ea 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c | |||
| @@ -116,7 +116,7 @@ xfs_swapext( | |||
| 116 | out_put_file: | 116 | out_put_file: |
| 117 | fput(file); | 117 | fput(file); |
| 118 | out_free_sxp: | 118 | out_free_sxp: |
| 119 | kmem_free(sxp, sizeof(xfs_swapext_t)); | 119 | kmem_free(sxp); |
| 120 | out: | 120 | out: |
| 121 | return error; | 121 | return error; |
| 122 | } | 122 | } |
| @@ -128,10 +128,8 @@ xfs_swap_extents( | |||
| 128 | xfs_swapext_t *sxp) | 128 | xfs_swapext_t *sxp) |
| 129 | { | 129 | { |
| 130 | xfs_mount_t *mp; | 130 | xfs_mount_t *mp; |
| 131 | xfs_inode_t *ips[2]; | ||
| 132 | xfs_trans_t *tp; | 131 | xfs_trans_t *tp; |
| 133 | xfs_bstat_t *sbp = &sxp->sx_stat; | 132 | xfs_bstat_t *sbp = &sxp->sx_stat; |
| 134 | bhv_vnode_t *vp, *tvp; | ||
| 135 | xfs_ifork_t *tempifp, *ifp, *tifp; | 133 | xfs_ifork_t *tempifp, *ifp, *tifp; |
| 136 | int ilf_fields, tilf_fields; | 134 | int ilf_fields, tilf_fields; |
| 137 | static uint lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL; | 135 | static uint lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL; |
| @@ -150,19 +148,15 @@ xfs_swap_extents( | |||
| 150 | } | 148 | } |
| 151 | 149 | ||
| 152 | sbp = &sxp->sx_stat; | 150 | sbp = &sxp->sx_stat; |
| 153 | vp = XFS_ITOV(ip); | ||
| 154 | tvp = XFS_ITOV(tip); | ||
| 155 | |||
| 156 | /* Lock in i_ino order */ | ||
| 157 | if (ip->i_ino < tip->i_ino) { | ||
| 158 | ips[0] = ip; | ||
| 159 | ips[1] = tip; | ||
| 160 | } else { | ||
| 161 | ips[0] = tip; | ||
| 162 | ips[1] = ip; | ||
| 163 | } | ||
| 164 | 151 | ||
| 165 | xfs_lock_inodes(ips, 2, lock_flags); | 152 | /* |
| 153 | * we have to do two separate lock calls here to keep lockdep | ||
| 154 | * happy. If we try to get all the locks in one call, lock will | ||
| 155 | * report false positives when we drop the ILOCK and regain them | ||
| 156 | * below. | ||
| 157 | */ | ||
| 158 | xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); | ||
| 159 | xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); | ||
| 166 | locked = 1; | 160 | locked = 1; |
| 167 | 161 | ||
| 168 | /* Verify that both files have the same format */ | 162 | /* Verify that both files have the same format */ |
| @@ -184,7 +178,7 @@ xfs_swap_extents( | |||
| 184 | goto error0; | 178 | goto error0; |
| 185 | } | 179 | } |
| 186 | 180 | ||
| 187 | if (VN_CACHED(tvp) != 0) { | 181 | if (VN_CACHED(VFS_I(tip)) != 0) { |
| 188 | xfs_inval_cached_trace(tip, 0, -1, 0, -1); | 182 | xfs_inval_cached_trace(tip, 0, -1, 0, -1); |
| 189 | error = xfs_flushinval_pages(tip, 0, -1, | 183 | error = xfs_flushinval_pages(tip, 0, -1, |
| 190 | FI_REMAPF_LOCKED); | 184 | FI_REMAPF_LOCKED); |
| @@ -193,7 +187,7 @@ xfs_swap_extents( | |||
| 193 | } | 187 | } |
| 194 | 188 | ||
| 195 | /* Verify O_DIRECT for ftmp */ | 189 | /* Verify O_DIRECT for ftmp */ |
| 196 | if (VN_CACHED(tvp) != 0) { | 190 | if (VN_CACHED(VFS_I(tip)) != 0) { |
| 197 | error = XFS_ERROR(EINVAL); | 191 | error = XFS_ERROR(EINVAL); |
| 198 | goto error0; | 192 | goto error0; |
| 199 | } | 193 | } |
| @@ -237,7 +231,7 @@ xfs_swap_extents( | |||
| 237 | * vop_read (or write in the case of autogrow) they block on the iolock | 231 | * vop_read (or write in the case of autogrow) they block on the iolock |
| 238 | * until we have switched the extents. | 232 | * until we have switched the extents. |
| 239 | */ | 233 | */ |
| 240 | if (VN_MAPPED(vp)) { | 234 | if (VN_MAPPED(VFS_I(ip))) { |
| 241 | error = XFS_ERROR(EBUSY); | 235 | error = XFS_ERROR(EBUSY); |
| 242 | goto error0; | 236 | goto error0; |
| 243 | } | 237 | } |
| @@ -265,7 +259,7 @@ xfs_swap_extents( | |||
| 265 | locked = 0; | 259 | locked = 0; |
| 266 | goto error0; | 260 | goto error0; |
| 267 | } | 261 | } |
| 268 | xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL); | 262 | xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); |
| 269 | 263 | ||
| 270 | /* | 264 | /* |
| 271 | * Count the number of extended attribute blocks | 265 | * Count the number of extended attribute blocks |
| @@ -350,15 +344,11 @@ xfs_swap_extents( | |||
| 350 | break; | 344 | break; |
| 351 | } | 345 | } |
| 352 | 346 | ||
| 353 | /* | ||
| 354 | * Increment vnode ref counts since xfs_trans_commit & | ||
| 355 | * xfs_trans_cancel will both unlock the inodes and | ||
| 356 | * decrement the associated ref counts. | ||
| 357 | */ | ||
| 358 | VN_HOLD(vp); | ||
| 359 | VN_HOLD(tvp); | ||
| 360 | 347 | ||
| 348 | IHOLD(ip); | ||
| 361 | xfs_trans_ijoin(tp, ip, lock_flags); | 349 | xfs_trans_ijoin(tp, ip, lock_flags); |
| 350 | |||
| 351 | IHOLD(tip); | ||
| 362 | xfs_trans_ijoin(tp, tip, lock_flags); | 352 | xfs_trans_ijoin(tp, tip, lock_flags); |
| 363 | 353 | ||
| 364 | xfs_trans_log_inode(tp, ip, ilf_fields); | 354 | xfs_trans_log_inode(tp, ip, ilf_fields); |
| @@ -381,6 +371,6 @@ xfs_swap_extents( | |||
| 381 | xfs_iunlock(tip, lock_flags); | 371 | xfs_iunlock(tip, lock_flags); |
| 382 | } | 372 | } |
| 383 | if (tempifp != NULL) | 373 | if (tempifp != NULL) |
| 384 | kmem_free(tempifp, sizeof(xfs_ifork_t)); | 374 | kmem_free(tempifp); |
| 385 | return error; | 375 | return error; |
| 386 | } | 376 | } |
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index 7cb26529766b..80e0dc51361c 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c | |||
| @@ -46,6 +46,54 @@ | |||
| 46 | 46 | ||
| 47 | struct xfs_name xfs_name_dotdot = {"..", 2}; | 47 | struct xfs_name xfs_name_dotdot = {"..", 2}; |
| 48 | 48 | ||
| 49 | extern const struct xfs_nameops xfs_default_nameops; | ||
| 50 | |||
| 51 | /* | ||
| 52 | * ASCII case-insensitive (i.e. A-Z) support for directories that was | ||
| 53 | * used in IRIX. | ||
| 54 | */ | ||
| 55 | STATIC xfs_dahash_t | ||
| 56 | xfs_ascii_ci_hashname( | ||
| 57 | struct xfs_name *name) | ||
| 58 | { | ||
| 59 | xfs_dahash_t hash; | ||
| 60 | int i; | ||
| 61 | |||
| 62 | for (i = 0, hash = 0; i < name->len; i++) | ||
| 63 | hash = tolower(name->name[i]) ^ rol32(hash, 7); | ||
| 64 | |||
| 65 | return hash; | ||
| 66 | } | ||
| 67 | |||
| 68 | STATIC enum xfs_dacmp | ||
| 69 | xfs_ascii_ci_compname( | ||
| 70 | struct xfs_da_args *args, | ||
| 71 | const char *name, | ||
| 72 | int len) | ||
| 73 | { | ||
| 74 | enum xfs_dacmp result; | ||
| 75 | int i; | ||
| 76 | |||
| 77 | if (args->namelen != len) | ||
| 78 | return XFS_CMP_DIFFERENT; | ||
| 79 | |||
| 80 | result = XFS_CMP_EXACT; | ||
| 81 | for (i = 0; i < len; i++) { | ||
| 82 | if (args->name[i] == name[i]) | ||
| 83 | continue; | ||
| 84 | if (tolower(args->name[i]) != tolower(name[i])) | ||
| 85 | return XFS_CMP_DIFFERENT; | ||
| 86 | result = XFS_CMP_CASE; | ||
| 87 | } | ||
| 88 | |||
| 89 | return result; | ||
| 90 | } | ||
| 91 | |||
| 92 | static struct xfs_nameops xfs_ascii_ci_nameops = { | ||
| 93 | .hashname = xfs_ascii_ci_hashname, | ||
| 94 | .compname = xfs_ascii_ci_compname, | ||
| 95 | }; | ||
| 96 | |||
| 49 | void | 97 | void |
| 50 | xfs_dir_mount( | 98 | xfs_dir_mount( |
| 51 | xfs_mount_t *mp) | 99 | xfs_mount_t *mp) |
| @@ -65,6 +113,10 @@ xfs_dir_mount( | |||
| 65 | (mp->m_dirblksize - (uint)sizeof(xfs_da_node_hdr_t)) / | 113 | (mp->m_dirblksize - (uint)sizeof(xfs_da_node_hdr_t)) / |
| 66 | (uint)sizeof(xfs_da_node_entry_t); | 114 | (uint)sizeof(xfs_da_node_entry_t); |
| 67 | mp->m_dir_magicpct = (mp->m_dirblksize * 37) / 100; | 115 | mp->m_dir_magicpct = (mp->m_dirblksize * 37) / 100; |
| 116 | if (xfs_sb_version_hasasciici(&mp->m_sb)) | ||
| 117 | mp->m_dirnameops = &xfs_ascii_ci_nameops; | ||
| 118 | else | ||
| 119 | mp->m_dirnameops = &xfs_default_nameops; | ||
| 68 | } | 120 | } |
| 69 | 121 | ||
| 70 | /* | 122 | /* |
| @@ -162,9 +214,10 @@ xfs_dir_createname( | |||
| 162 | return rval; | 214 | return rval; |
| 163 | XFS_STATS_INC(xs_dir_create); | 215 | XFS_STATS_INC(xs_dir_create); |
| 164 | 216 | ||
| 217 | memset(&args, 0, sizeof(xfs_da_args_t)); | ||
| 165 | args.name = name->name; | 218 | args.name = name->name; |
| 166 | args.namelen = name->len; | 219 | args.namelen = name->len; |
| 167 | args.hashval = xfs_da_hashname(name->name, name->len); | 220 | args.hashval = dp->i_mount->m_dirnameops->hashname(name); |
| 168 | args.inumber = inum; | 221 | args.inumber = inum; |
| 169 | args.dp = dp; | 222 | args.dp = dp; |
| 170 | args.firstblock = first; | 223 | args.firstblock = first; |
| @@ -172,8 +225,7 @@ xfs_dir_createname( | |||
| 172 | args.total = total; | 225 | args.total = total; |
| 173 | args.whichfork = XFS_DATA_FORK; | 226 | args.whichfork = XFS_DATA_FORK; |
| 174 | args.trans = tp; | 227 | args.trans = tp; |
| 175 | args.justcheck = 0; | 228 | args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; |
| 176 | args.addname = args.oknoent = 1; | ||
| 177 | 229 | ||
| 178 | if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) | 230 | if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) |
| 179 | rval = xfs_dir2_sf_addname(&args); | 231 | rval = xfs_dir2_sf_addname(&args); |
| @@ -191,14 +243,43 @@ xfs_dir_createname( | |||
| 191 | } | 243 | } |
| 192 | 244 | ||
| 193 | /* | 245 | /* |
| 246 | * If doing a CI lookup and case-insensitive match, dup actual name into | ||
| 247 | * args.value. Return EEXIST for success (i.e. name found) or an error. | ||
| 248 | */ | ||
| 249 | int | ||
| 250 | xfs_dir_cilookup_result( | ||
| 251 | struct xfs_da_args *args, | ||
| 252 | const char *name, | ||
| 253 | int len) | ||
| 254 | { | ||
| 255 | if (args->cmpresult == XFS_CMP_DIFFERENT) | ||
| 256 | return ENOENT; | ||
| 257 | if (args->cmpresult != XFS_CMP_CASE || | ||
| 258 | !(args->op_flags & XFS_DA_OP_CILOOKUP)) | ||
| 259 | return EEXIST; | ||
| 260 | |||
| 261 | args->value = kmem_alloc(len, KM_MAYFAIL); | ||
| 262 | if (!args->value) | ||
| 263 | return ENOMEM; | ||
| 264 | |||
| 265 | memcpy(args->value, name, len); | ||
| 266 | args->valuelen = len; | ||
| 267 | return EEXIST; | ||
| 268 | } | ||
| 269 | |||
| 270 | /* | ||
| 194 | * Lookup a name in a directory, give back the inode number. | 271 | * Lookup a name in a directory, give back the inode number. |
| 272 | * If ci_name is not NULL, returns the actual name in ci_name if it differs | ||
| 273 | * from name, or ci_name->name is set to NULL for an exact match. | ||
| 195 | */ | 274 | */ |
| 275 | |||
| 196 | int | 276 | int |
| 197 | xfs_dir_lookup( | 277 | xfs_dir_lookup( |
| 198 | xfs_trans_t *tp, | 278 | xfs_trans_t *tp, |
| 199 | xfs_inode_t *dp, | 279 | xfs_inode_t *dp, |
| 200 | struct xfs_name *name, | 280 | struct xfs_name *name, |
| 201 | xfs_ino_t *inum) /* out: inode number */ | 281 | xfs_ino_t *inum, /* out: inode number */ |
| 282 | struct xfs_name *ci_name) /* out: actual name if CI match */ | ||
| 202 | { | 283 | { |
| 203 | xfs_da_args_t args; | 284 | xfs_da_args_t args; |
| 204 | int rval; | 285 | int rval; |
| @@ -206,15 +287,17 @@ xfs_dir_lookup( | |||
| 206 | 287 | ||
| 207 | ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); | 288 | ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); |
| 208 | XFS_STATS_INC(xs_dir_lookup); | 289 | XFS_STATS_INC(xs_dir_lookup); |
| 209 | memset(&args, 0, sizeof(xfs_da_args_t)); | ||
| 210 | 290 | ||
| 291 | memset(&args, 0, sizeof(xfs_da_args_t)); | ||
| 211 | args.name = name->name; | 292 | args.name = name->name; |
| 212 | args.namelen = name->len; | 293 | args.namelen = name->len; |
| 213 | args.hashval = xfs_da_hashname(name->name, name->len); | 294 | args.hashval = dp->i_mount->m_dirnameops->hashname(name); |
| 214 | args.dp = dp; | 295 | args.dp = dp; |
| 215 | args.whichfork = XFS_DATA_FORK; | 296 | args.whichfork = XFS_DATA_FORK; |
| 216 | args.trans = tp; | 297 | args.trans = tp; |
| 217 | args.oknoent = 1; | 298 | args.op_flags = XFS_DA_OP_OKNOENT; |
| 299 | if (ci_name) | ||
| 300 | args.op_flags |= XFS_DA_OP_CILOOKUP; | ||
| 218 | 301 | ||
| 219 | if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) | 302 | if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) |
| 220 | rval = xfs_dir2_sf_lookup(&args); | 303 | rval = xfs_dir2_sf_lookup(&args); |
| @@ -230,8 +313,13 @@ xfs_dir_lookup( | |||
| 230 | rval = xfs_dir2_node_lookup(&args); | 313 | rval = xfs_dir2_node_lookup(&args); |
| 231 | if (rval == EEXIST) | 314 | if (rval == EEXIST) |
| 232 | rval = 0; | 315 | rval = 0; |
| 233 | if (rval == 0) | 316 | if (!rval) { |
| 234 | *inum = args.inumber; | 317 | *inum = args.inumber; |
| 318 | if (ci_name) { | ||
| 319 | ci_name->name = args.value; | ||
| 320 | ci_name->len = args.valuelen; | ||
| 321 | } | ||
| 322 | } | ||
| 235 | return rval; | 323 | return rval; |
| 236 | } | 324 | } |
| 237 | 325 | ||
| @@ -255,9 +343,10 @@ xfs_dir_removename( | |||
| 255 | ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); | 343 | ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); |
| 256 | XFS_STATS_INC(xs_dir_remove); | 344 | XFS_STATS_INC(xs_dir_remove); |
| 257 | 345 | ||
| 346 | memset(&args, 0, sizeof(xfs_da_args_t)); | ||
| 258 | args.name = name->name; | 347 | args.name = name->name; |
| 259 | args.namelen = name->len; | 348 | args.namelen = name->len; |
| 260 | args.hashval = xfs_da_hashname(name->name, name->len); | 349 | args.hashval = dp->i_mount->m_dirnameops->hashname(name); |
| 261 | args.inumber = ino; | 350 | args.inumber = ino; |
| 262 | args.dp = dp; | 351 | args.dp = dp; |
| 263 | args.firstblock = first; | 352 | args.firstblock = first; |
| @@ -265,7 +354,6 @@ xfs_dir_removename( | |||
| 265 | args.total = total; | 354 | args.total = total; |
| 266 | args.whichfork = XFS_DATA_FORK; | 355 | args.whichfork = XFS_DATA_FORK; |
| 267 | args.trans = tp; | 356 | args.trans = tp; |
| 268 | args.justcheck = args.addname = args.oknoent = 0; | ||
| 269 | 357 | ||
| 270 | if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) | 358 | if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) |
| 271 | rval = xfs_dir2_sf_removename(&args); | 359 | rval = xfs_dir2_sf_removename(&args); |
| @@ -338,9 +426,10 @@ xfs_dir_replace( | |||
| 338 | if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) | 426 | if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) |
| 339 | return rval; | 427 | return rval; |
| 340 | 428 | ||
| 429 | memset(&args, 0, sizeof(xfs_da_args_t)); | ||
| 341 | args.name = name->name; | 430 | args.name = name->name; |
| 342 | args.namelen = name->len; | 431 | args.namelen = name->len; |
| 343 | args.hashval = xfs_da_hashname(name->name, name->len); | 432 | args.hashval = dp->i_mount->m_dirnameops->hashname(name); |
| 344 | args.inumber = inum; | 433 | args.inumber = inum; |
| 345 | args.dp = dp; | 434 | args.dp = dp; |
| 346 | args.firstblock = first; | 435 | args.firstblock = first; |
| @@ -348,7 +437,6 @@ xfs_dir_replace( | |||
| 348 | args.total = total; | 437 | args.total = total; |
| 349 | args.whichfork = XFS_DATA_FORK; | 438 | args.whichfork = XFS_DATA_FORK; |
| 350 | args.trans = tp; | 439 | args.trans = tp; |
| 351 | args.justcheck = args.addname = args.oknoent = 0; | ||
| 352 | 440 | ||
| 353 | if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) | 441 | if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) |
| 354 | rval = xfs_dir2_sf_replace(&args); | 442 | rval = xfs_dir2_sf_replace(&args); |
| @@ -384,15 +472,16 @@ xfs_dir_canenter( | |||
| 384 | return 0; | 472 | return 0; |
| 385 | 473 | ||
| 386 | ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); | 474 | ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); |
| 387 | memset(&args, 0, sizeof(xfs_da_args_t)); | ||
| 388 | 475 | ||
| 476 | memset(&args, 0, sizeof(xfs_da_args_t)); | ||
| 389 | args.name = name->name; | 477 | args.name = name->name; |
| 390 | args.namelen = name->len; | 478 | args.namelen = name->len; |
| 391 | args.hashval = xfs_da_hashname(name->name, name->len); | 479 | args.hashval = dp->i_mount->m_dirnameops->hashname(name); |
| 392 | args.dp = dp; | 480 | args.dp = dp; |
| 393 | args.whichfork = XFS_DATA_FORK; | 481 | args.whichfork = XFS_DATA_FORK; |
| 394 | args.trans = tp; | 482 | args.trans = tp; |
| 395 | args.justcheck = args.addname = args.oknoent = 1; | 483 | args.op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME | |
| 484 | XFS_DA_OP_OKNOENT; | ||
| 396 | 485 | ||
| 397 | if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) | 486 | if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) |
| 398 | rval = xfs_dir2_sf_addname(&args); | 487 | rval = xfs_dir2_sf_addname(&args); |
| @@ -493,7 +582,7 @@ xfs_dir2_grow_inode( | |||
| 493 | args->firstblock, args->total, | 582 | args->firstblock, args->total, |
| 494 | &mapp[mapi], &nmap, args->flist, | 583 | &mapp[mapi], &nmap, args->flist, |
| 495 | NULL))) { | 584 | NULL))) { |
| 496 | kmem_free(mapp, sizeof(*mapp) * count); | 585 | kmem_free(mapp); |
| 497 | return error; | 586 | return error; |
| 498 | } | 587 | } |
| 499 | if (nmap < 1) | 588 | if (nmap < 1) |
| @@ -525,14 +614,14 @@ xfs_dir2_grow_inode( | |||
| 525 | mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount != | 614 | mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount != |
| 526 | bno + count) { | 615 | bno + count) { |
| 527 | if (mapp != &map) | 616 | if (mapp != &map) |
| 528 | kmem_free(mapp, sizeof(*mapp) * count); | 617 | kmem_free(mapp); |
| 529 | return XFS_ERROR(ENOSPC); | 618 | return XFS_ERROR(ENOSPC); |
| 530 | } | 619 | } |
| 531 | /* | 620 | /* |
| 532 | * Done with the temporary mapping table. | 621 | * Done with the temporary mapping table. |
| 533 | */ | 622 | */ |
| 534 | if (mapp != &map) | 623 | if (mapp != &map) |
| 535 | kmem_free(mapp, sizeof(*mapp) * count); | 624 | kmem_free(mapp); |
| 536 | *dbp = xfs_dir2_da_to_db(mp, (xfs_dablk_t)bno); | 625 | *dbp = xfs_dir2_da_to_db(mp, (xfs_dablk_t)bno); |
| 537 | /* | 626 | /* |
| 538 | * Update file's size if this is the data space and it grew. | 627 | * Update file's size if this is the data space and it grew. |
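The ASCII-CI hashname/compname pair added at the top of this file is what makes the case-insensitive lookups work: the hash folds A-Z to lower case, so every case variant of a name lands on the same hash chain, while the compare op tells exact hits apart from case-only hits. A small self-contained demonstration of the hashing property follows; rol32() is re-implemented locally so it builds outside the kernel, and the program is illustrative only, not part of the patch:

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <ctype.h>

static uint32_t rol32(uint32_t word, unsigned int shift)
{
        return (word << shift) | (word >> (32 - shift));
}

/* Same folding as xfs_ascii_ci_hashname() above, in userspace form. */
static uint32_t ascii_ci_hashname(const char *name, int len)
{
        uint32_t hash = 0;
        int i;

        for (i = 0; i < len; i++)
                hash = tolower((unsigned char)name[i]) ^ rol32(hash, 7);
        return hash;
}

int main(void)
{
        const char *a = "README", *b = "readme";

        /* both spellings hash to the same value under the CI hash */
        printf("%08x %08x\n",
               (unsigned)ascii_ci_hashname(a, strlen(a)),
               (unsigned)ascii_ci_hashname(b, strlen(b)));
        return 0;
}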
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h index 6392f939029f..1d9ef96f33aa 100644 --- a/fs/xfs/xfs_dir2.h +++ b/fs/xfs/xfs_dir2.h | |||
| @@ -74,7 +74,8 @@ extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp, | |||
| 74 | xfs_fsblock_t *first, | 74 | xfs_fsblock_t *first, |
| 75 | struct xfs_bmap_free *flist, xfs_extlen_t tot); | 75 | struct xfs_bmap_free *flist, xfs_extlen_t tot); |
| 76 | extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp, | 76 | extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp, |
| 77 | struct xfs_name *name, xfs_ino_t *inum); | 77 | struct xfs_name *name, xfs_ino_t *inum, |
| 78 | struct xfs_name *ci_name); | ||
| 78 | extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp, | 79 | extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp, |
| 79 | struct xfs_name *name, xfs_ino_t ino, | 80 | struct xfs_name *name, xfs_ino_t ino, |
| 80 | xfs_fsblock_t *first, | 81 | xfs_fsblock_t *first, |
| @@ -99,4 +100,7 @@ extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, | |||
| 99 | extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, | 100 | extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, |
| 100 | struct xfs_dabuf *bp); | 101 | struct xfs_dabuf *bp); |
| 101 | 102 | ||
| 103 | extern int xfs_dir_cilookup_result(struct xfs_da_args *args, const char *name, | ||
| 104 | int len); | ||
| 105 | |||
| 102 | #endif /* __XFS_DIR2_H__ */ | 106 | #endif /* __XFS_DIR2_H__ */ |
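With the extra ci_name argument declared above, a lookup caller that wants the on-disk spelling passes in a struct xfs_name and owns the buffer that comes back. The actual consumer of ci_name (the xfs_lookup()/d_add_ci() path in the Linux-specific code) is outside this section, so the helper below uses an invented name and only sketches the calling convention:

/* Sketch only: illustrates the new xfs_dir_lookup() contract. */
STATIC int
xfs_example_ci_lookup(
        struct xfs_inode        *dp,
        struct xfs_name         *name,
        xfs_ino_t               *inum)
{
        struct xfs_name         ci_name;
        int                     error;

        error = xfs_dir_lookup(NULL, dp, name, inum, &ci_name);
        if (error)
                return error;

        if (ci_name.name) {
                /*
                 * Case-insensitive hit: ci_name holds the on-disk name,
                 * allocated by xfs_dir_cilookup_result(); the caller is
                 * responsible for freeing it.
                 */
                kmem_free(ci_name.name);
        }
        /* Exact match: ci_name.name was set to NULL, nothing to free. */
        return 0;
}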
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index fb5a556725b3..e2fa0a1d8e96 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c | |||
| @@ -215,7 +215,7 @@ xfs_dir2_block_addname( | |||
| 215 | /* | 215 | /* |
| 216 | * If this isn't a real add, we're done with the buffer. | 216 | * If this isn't a real add, we're done with the buffer. |
| 217 | */ | 217 | */ |
| 218 | if (args->justcheck) | 218 | if (args->op_flags & XFS_DA_OP_JUSTCHECK) |
| 219 | xfs_da_brelse(tp, bp); | 219 | xfs_da_brelse(tp, bp); |
| 220 | /* | 220 | /* |
| 221 | * If we don't have space for the new entry & leaf ... | 221 | * If we don't have space for the new entry & leaf ... |
| @@ -225,7 +225,7 @@ xfs_dir2_block_addname( | |||
| 225 | * Not trying to actually do anything, or don't have | 225 | * Not trying to actually do anything, or don't have |
| 226 | * a space reservation: return no-space. | 226 | * a space reservation: return no-space. |
| 227 | */ | 227 | */ |
| 228 | if (args->justcheck || args->total == 0) | 228 | if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) |
| 229 | return XFS_ERROR(ENOSPC); | 229 | return XFS_ERROR(ENOSPC); |
| 230 | /* | 230 | /* |
| 231 | * Convert to the next larger format. | 231 | * Convert to the next larger format. |
| @@ -240,7 +240,7 @@ xfs_dir2_block_addname( | |||
| 240 | /* | 240 | /* |
| 241 | * Just checking, and it would work, so say so. | 241 | * Just checking, and it would work, so say so. |
| 242 | */ | 242 | */ |
| 243 | if (args->justcheck) | 243 | if (args->op_flags & XFS_DA_OP_JUSTCHECK) |
| 244 | return 0; | 244 | return 0; |
| 245 | needlog = needscan = 0; | 245 | needlog = needscan = 0; |
| 246 | /* | 246 | /* |
| @@ -610,14 +610,15 @@ xfs_dir2_block_lookup( | |||
| 610 | /* | 610 | /* |
| 611 | * Get the offset from the leaf entry, to point to the data. | 611 | * Get the offset from the leaf entry, to point to the data. |
| 612 | */ | 612 | */ |
| 613 | dep = (xfs_dir2_data_entry_t *) | 613 | dep = (xfs_dir2_data_entry_t *)((char *)block + |
| 614 | ((char *)block + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address))); | 614 | xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address))); |
| 615 | /* | 615 | /* |
| 616 | * Fill in inode number, release the block. | 616 | * Fill in inode number, CI name if appropriate, release the block. |
| 617 | */ | 617 | */ |
| 618 | args->inumber = be64_to_cpu(dep->inumber); | 618 | args->inumber = be64_to_cpu(dep->inumber); |
| 619 | error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); | ||
| 619 | xfs_da_brelse(args->trans, bp); | 620 | xfs_da_brelse(args->trans, bp); |
| 620 | return XFS_ERROR(EEXIST); | 621 | return XFS_ERROR(error); |
| 621 | } | 622 | } |
| 622 | 623 | ||
| 623 | /* | 624 | /* |
| @@ -643,6 +644,7 @@ xfs_dir2_block_lookup_int( | |||
| 643 | int mid; /* binary search current idx */ | 644 | int mid; /* binary search current idx */ |
| 644 | xfs_mount_t *mp; /* filesystem mount point */ | 645 | xfs_mount_t *mp; /* filesystem mount point */ |
| 645 | xfs_trans_t *tp; /* transaction pointer */ | 646 | xfs_trans_t *tp; /* transaction pointer */ |
| 647 | enum xfs_dacmp cmp; /* comparison result */ | ||
| 646 | 648 | ||
| 647 | dp = args->dp; | 649 | dp = args->dp; |
| 648 | tp = args->trans; | 650 | tp = args->trans; |
| @@ -673,7 +675,7 @@ xfs_dir2_block_lookup_int( | |||
| 673 | else | 675 | else |
| 674 | high = mid - 1; | 676 | high = mid - 1; |
| 675 | if (low > high) { | 677 | if (low > high) { |
| 676 | ASSERT(args->oknoent); | 678 | ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); |
| 677 | xfs_da_brelse(tp, bp); | 679 | xfs_da_brelse(tp, bp); |
| 678 | return XFS_ERROR(ENOENT); | 680 | return XFS_ERROR(ENOENT); |
| 679 | } | 681 | } |
| @@ -697,20 +699,31 @@ xfs_dir2_block_lookup_int( | |||
| 697 | dep = (xfs_dir2_data_entry_t *) | 699 | dep = (xfs_dir2_data_entry_t *) |
| 698 | ((char *)block + xfs_dir2_dataptr_to_off(mp, addr)); | 700 | ((char *)block + xfs_dir2_dataptr_to_off(mp, addr)); |
| 699 | /* | 701 | /* |
| 700 | * Compare, if it's right give back buffer & entry number. | 702 | * Compare name and if it's an exact match, return the index |
| 703 | * and buffer. If it's the first case-insensitive match, store | ||
| 704 | * the index and buffer and continue looking for an exact match. | ||
| 701 | */ | 705 | */ |
| 702 | if (dep->namelen == args->namelen && | 706 | cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen); |
| 703 | dep->name[0] == args->name[0] && | 707 | if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { |
| 704 | memcmp(dep->name, args->name, args->namelen) == 0) { | 708 | args->cmpresult = cmp; |
| 705 | *bpp = bp; | 709 | *bpp = bp; |
| 706 | *entno = mid; | 710 | *entno = mid; |
| 707 | return 0; | 711 | if (cmp == XFS_CMP_EXACT) |
| 712 | return 0; | ||
| 708 | } | 713 | } |
| 709 | } while (++mid < be32_to_cpu(btp->count) && be32_to_cpu(blp[mid].hashval) == hash); | 714 | } while (++mid < be32_to_cpu(btp->count) && |
| 715 | be32_to_cpu(blp[mid].hashval) == hash); | ||
| 716 | |||
| 717 | ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); | ||
| 718 | /* | ||
| 719 | * Here, we can only be doing a lookup (not a rename or replace). | ||
| 720 | * If a case-insensitive match was found earlier, return success. | ||
| 721 | */ | ||
| 722 | if (args->cmpresult == XFS_CMP_CASE) | ||
| 723 | return 0; | ||
| 710 | /* | 724 | /* |
| 711 | * No match, release the buffer and return ENOENT. | 725 | * No match, release the buffer and return ENOENT. |
| 712 | */ | 726 | */ |
| 713 | ASSERT(args->oknoent); | ||
| 714 | xfs_da_brelse(tp, bp); | 727 | xfs_da_brelse(tp, bp); |
| 715 | return XFS_ERROR(ENOENT); | 728 | return XFS_ERROR(ENOENT); |
| 716 | } | 729 | } |
| @@ -1033,6 +1046,7 @@ xfs_dir2_sf_to_block( | |||
| 1033 | xfs_dir2_sf_t *sfp; /* shortform structure */ | 1046 | xfs_dir2_sf_t *sfp; /* shortform structure */ |
| 1034 | __be16 *tagp; /* end of data entry */ | 1047 | __be16 *tagp; /* end of data entry */ |
| 1035 | xfs_trans_t *tp; /* transaction pointer */ | 1048 | xfs_trans_t *tp; /* transaction pointer */ |
| 1049 | struct xfs_name name; | ||
| 1036 | 1050 | ||
| 1037 | xfs_dir2_trace_args("sf_to_block", args); | 1051 | xfs_dir2_trace_args("sf_to_block", args); |
| 1038 | dp = args->dp; | 1052 | dp = args->dp; |
| @@ -1071,7 +1085,7 @@ xfs_dir2_sf_to_block( | |||
| 1071 | */ | 1085 | */ |
| 1072 | error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno); | 1086 | error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno); |
| 1073 | if (error) { | 1087 | if (error) { |
| 1074 | kmem_free(buf, buf_len); | 1088 | kmem_free(buf); |
| 1075 | return error; | 1089 | return error; |
| 1076 | } | 1090 | } |
| 1077 | /* | 1091 | /* |
| @@ -1079,7 +1093,7 @@ xfs_dir2_sf_to_block( | |||
| 1079 | */ | 1093 | */ |
| 1080 | error = xfs_dir2_data_init(args, blkno, &bp); | 1094 | error = xfs_dir2_data_init(args, blkno, &bp); |
| 1081 | if (error) { | 1095 | if (error) { |
| 1082 | kmem_free(buf, buf_len); | 1096 | kmem_free(buf); |
| 1083 | return error; | 1097 | return error; |
| 1084 | } | 1098 | } |
| 1085 | block = bp->data; | 1099 | block = bp->data; |
| @@ -1187,8 +1201,10 @@ xfs_dir2_sf_to_block( | |||
| 1187 | tagp = xfs_dir2_data_entry_tag_p(dep); | 1201 | tagp = xfs_dir2_data_entry_tag_p(dep); |
| 1188 | *tagp = cpu_to_be16((char *)dep - (char *)block); | 1202 | *tagp = cpu_to_be16((char *)dep - (char *)block); |
| 1189 | xfs_dir2_data_log_entry(tp, bp, dep); | 1203 | xfs_dir2_data_log_entry(tp, bp, dep); |
| 1190 | blp[2 + i].hashval = cpu_to_be32(xfs_da_hashname( | 1204 | name.name = sfep->name; |
| 1191 | (char *)sfep->name, sfep->namelen)); | 1205 | name.len = sfep->namelen; |
| 1206 | blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops-> | ||
| 1207 | hashname(&name)); | ||
| 1192 | blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, | 1208 | blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, |
| 1193 | (char *)dep - (char *)block)); | 1209 | (char *)dep - (char *)block)); |
| 1194 | offset = (int)((char *)(tagp + 1) - (char *)block); | 1210 | offset = (int)((char *)(tagp + 1) - (char *)block); |
| @@ -1198,7 +1214,7 @@ xfs_dir2_sf_to_block( | |||
| 1198 | sfep = xfs_dir2_sf_nextentry(sfp, sfep); | 1214 | sfep = xfs_dir2_sf_nextentry(sfp, sfep); |
| 1199 | } | 1215 | } |
| 1200 | /* Done with the temporary buffer */ | 1216 | /* Done with the temporary buffer */ |
| 1201 | kmem_free(buf, buf_len); | 1217 | kmem_free(buf); |
| 1202 | /* | 1218 | /* |
| 1203 | * Sort the leaf entries by hash value. | 1219 | * Sort the leaf entries by hash value. |
| 1204 | */ | 1220 | */ |
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index fb8c9e08b23d..498f8d694330 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c | |||
| @@ -65,6 +65,7 @@ xfs_dir2_data_check( | |||
| 65 | xfs_mount_t *mp; /* filesystem mount point */ | 65 | xfs_mount_t *mp; /* filesystem mount point */ |
| 66 | char *p; /* current data position */ | 66 | char *p; /* current data position */ |
| 67 | int stale; /* count of stale leaves */ | 67 | int stale; /* count of stale leaves */ |
| 68 | struct xfs_name name; | ||
| 68 | 69 | ||
| 69 | mp = dp->i_mount; | 70 | mp = dp->i_mount; |
| 70 | d = bp->data; | 71 | d = bp->data; |
| @@ -140,7 +141,9 @@ xfs_dir2_data_check( | |||
| 140 | addr = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, | 141 | addr = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, |
| 141 | (xfs_dir2_data_aoff_t) | 142 | (xfs_dir2_data_aoff_t) |
| 142 | ((char *)dep - (char *)d)); | 143 | ((char *)dep - (char *)d)); |
| 143 | hash = xfs_da_hashname((char *)dep->name, dep->namelen); | 144 | name.name = dep->name; |
| 145 | name.len = dep->namelen; | ||
| 146 | hash = mp->m_dirnameops->hashname(&name); | ||
| 144 | for (i = 0; i < be32_to_cpu(btp->count); i++) { | 147 | for (i = 0; i < be32_to_cpu(btp->count); i++) { |
| 145 | if (be32_to_cpu(lep[i].address) == addr && | 148 | if (be32_to_cpu(lep[i].address) == addr && |
| 146 | be32_to_cpu(lep[i].hashval) == hash) | 149 | be32_to_cpu(lep[i].hashval) == hash) |
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index bc52b803d79b..93535992cb60 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c | |||
| @@ -263,20 +263,21 @@ xfs_dir2_leaf_addname( | |||
| 263 | * If we don't have enough free bytes but we can make enough | 263 | * If we don't have enough free bytes but we can make enough |
| 264 | * by compacting out stale entries, we'll do that. | 264 | * by compacting out stale entries, we'll do that. |
| 265 | */ | 265 | */ |
| 266 | if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] < needbytes && | 266 | if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] < |
| 267 | be16_to_cpu(leaf->hdr.stale) > 1) { | 267 | needbytes && be16_to_cpu(leaf->hdr.stale) > 1) { |
| 268 | compact = 1; | 268 | compact = 1; |
| 269 | } | 269 | } |
| 270 | /* | 270 | /* |
| 271 | * Otherwise if we don't have enough free bytes we need to | 271 | * Otherwise if we don't have enough free bytes we need to |
| 272 | * convert to node form. | 272 | * convert to node form. |
| 273 | */ | 273 | */ |
| 274 | else if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] < | 274 | else if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu( |
| 275 | needbytes) { | 275 | leaf->hdr.count)] < needbytes) { |
| 276 | /* | 276 | /* |
| 277 | * Just checking or no space reservation, give up. | 277 | * Just checking or no space reservation, give up. |
| 278 | */ | 278 | */ |
| 279 | if (args->justcheck || args->total == 0) { | 279 | if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || |
| 280 | args->total == 0) { | ||
| 280 | xfs_da_brelse(tp, lbp); | 281 | xfs_da_brelse(tp, lbp); |
| 281 | return XFS_ERROR(ENOSPC); | 282 | return XFS_ERROR(ENOSPC); |
| 282 | } | 283 | } |
| @@ -301,7 +302,7 @@ xfs_dir2_leaf_addname( | |||
| 301 | * If just checking, then it will fit unless we needed to allocate | 302 | * If just checking, then it will fit unless we needed to allocate |
| 302 | * a new data block. | 303 | * a new data block. |
| 303 | */ | 304 | */ |
| 304 | if (args->justcheck) { | 305 | if (args->op_flags & XFS_DA_OP_JUSTCHECK) { |
| 305 | xfs_da_brelse(tp, lbp); | 306 | xfs_da_brelse(tp, lbp); |
| 306 | return use_block == -1 ? XFS_ERROR(ENOSPC) : 0; | 307 | return use_block == -1 ? XFS_ERROR(ENOSPC) : 0; |
| 307 | } | 308 | } |
| @@ -1110,7 +1111,7 @@ xfs_dir2_leaf_getdents( | |||
| 1110 | *offset = XFS_DIR2_MAX_DATAPTR; | 1111 | *offset = XFS_DIR2_MAX_DATAPTR; |
| 1111 | else | 1112 | else |
| 1112 | *offset = xfs_dir2_byte_to_dataptr(mp, curoff); | 1113 | *offset = xfs_dir2_byte_to_dataptr(mp, curoff); |
| 1113 | kmem_free(map, map_size * sizeof(*map)); | 1114 | kmem_free(map); |
| 1114 | if (bp) | 1115 | if (bp) |
| 1115 | xfs_da_brelse(NULL, bp); | 1116 | xfs_da_brelse(NULL, bp); |
| 1116 | return error; | 1117 | return error; |
| @@ -1298,12 +1299,13 @@ xfs_dir2_leaf_lookup( | |||
| 1298 | ((char *)dbp->data + | 1299 | ((char *)dbp->data + |
| 1299 | xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address))); | 1300 | xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address))); |
| 1300 | /* | 1301 | /* |
| 1301 | * Return the found inode number. | 1302 | * Return the found inode number & CI name if appropriate |
| 1302 | */ | 1303 | */ |
| 1303 | args->inumber = be64_to_cpu(dep->inumber); | 1304 | args->inumber = be64_to_cpu(dep->inumber); |
| 1305 | error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); | ||
| 1304 | xfs_da_brelse(tp, dbp); | 1306 | xfs_da_brelse(tp, dbp); |
| 1305 | xfs_da_brelse(tp, lbp); | 1307 | xfs_da_brelse(tp, lbp); |
| 1306 | return XFS_ERROR(EEXIST); | 1308 | return XFS_ERROR(error); |
| 1307 | } | 1309 | } |
| 1308 | 1310 | ||
| 1309 | /* | 1311 | /* |
| @@ -1319,8 +1321,8 @@ xfs_dir2_leaf_lookup_int( | |||
| 1319 | int *indexp, /* out: index in leaf block */ | 1321 | int *indexp, /* out: index in leaf block */ |
| 1320 | xfs_dabuf_t **dbpp) /* out: data buffer */ | 1322 | xfs_dabuf_t **dbpp) /* out: data buffer */ |
| 1321 | { | 1323 | { |
| 1322 | xfs_dir2_db_t curdb; /* current data block number */ | 1324 | xfs_dir2_db_t curdb = -1; /* current data block number */ |
| 1323 | xfs_dabuf_t *dbp; /* data buffer */ | 1325 | xfs_dabuf_t *dbp = NULL; /* data buffer */ |
| 1324 | xfs_dir2_data_entry_t *dep; /* data entry */ | 1326 | xfs_dir2_data_entry_t *dep; /* data entry */ |
| 1325 | xfs_inode_t *dp; /* incore directory inode */ | 1327 | xfs_inode_t *dp; /* incore directory inode */ |
| 1326 | int error; /* error return code */ | 1328 | int error; /* error return code */ |
| @@ -1331,6 +1333,8 @@ xfs_dir2_leaf_lookup_int( | |||
| 1331 | xfs_mount_t *mp; /* filesystem mount point */ | 1333 | xfs_mount_t *mp; /* filesystem mount point */ |
| 1332 | xfs_dir2_db_t newdb; /* new data block number */ | 1334 | xfs_dir2_db_t newdb; /* new data block number */ |
| 1333 | xfs_trans_t *tp; /* transaction pointer */ | 1335 | xfs_trans_t *tp; /* transaction pointer */ |
| 1336 | xfs_dir2_db_t cidb = -1; /* case match data block no. */ | ||
| 1337 | enum xfs_dacmp cmp; /* name compare result */ | ||
| 1334 | 1338 | ||
| 1335 | dp = args->dp; | 1339 | dp = args->dp; |
| 1336 | tp = args->trans; | 1340 | tp = args->trans; |
| @@ -1338,11 +1342,10 @@ xfs_dir2_leaf_lookup_int( | |||
| 1338 | /* | 1342 | /* |
| 1339 | * Read the leaf block into the buffer. | 1343 | * Read the leaf block into the buffer. |
| 1340 | */ | 1344 | */ |
| 1341 | if ((error = | 1345 | error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, |
| 1342 | xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, | 1346 | XFS_DATA_FORK); |
| 1343 | XFS_DATA_FORK))) { | 1347 | if (error) |
| 1344 | return error; | 1348 | return error; |
| 1345 | } | ||
| 1346 | *lbpp = lbp; | 1349 | *lbpp = lbp; |
| 1347 | leaf = lbp->data; | 1350 | leaf = lbp->data; |
| 1348 | xfs_dir2_leaf_check(dp, lbp); | 1351 | xfs_dir2_leaf_check(dp, lbp); |
| @@ -1354,9 +1357,9 @@ xfs_dir2_leaf_lookup_int( | |||
| 1354 | * Loop over all the entries with the right hash value | 1357 | * Loop over all the entries with the right hash value |
| 1355 | * looking to match the name. | 1358 | * looking to match the name. |
| 1356 | */ | 1359 | */ |
| 1357 | for (lep = &leaf->ents[index], dbp = NULL, curdb = -1; | 1360 | for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) && |
| 1358 | index < be16_to_cpu(leaf->hdr.count) && be32_to_cpu(lep->hashval) == args->hashval; | 1361 | be32_to_cpu(lep->hashval) == args->hashval; |
| 1359 | lep++, index++) { | 1362 | lep++, index++) { |
| 1360 | /* | 1363 | /* |
| 1361 | * Skip over stale leaf entries. | 1364 | * Skip over stale leaf entries. |
| 1362 | */ | 1365 | */ |
| @@ -1373,10 +1376,10 @@ xfs_dir2_leaf_lookup_int( | |||
| 1373 | if (newdb != curdb) { | 1376 | if (newdb != curdb) { |
| 1374 | if (dbp) | 1377 | if (dbp) |
| 1375 | xfs_da_brelse(tp, dbp); | 1378 | xfs_da_brelse(tp, dbp); |
| 1376 | if ((error = | 1379 | error = xfs_da_read_buf(tp, dp, |
| 1377 | xfs_da_read_buf(tp, dp, | 1380 | xfs_dir2_db_to_da(mp, newdb), |
| 1378 | xfs_dir2_db_to_da(mp, newdb), -1, &dbp, | 1381 | -1, &dbp, XFS_DATA_FORK); |
| 1379 | XFS_DATA_FORK))) { | 1382 | if (error) { |
| 1380 | xfs_da_brelse(tp, lbp); | 1383 | xfs_da_brelse(tp, lbp); |
| 1381 | return error; | 1384 | return error; |
| 1382 | } | 1385 | } |
| @@ -1386,24 +1389,50 @@ xfs_dir2_leaf_lookup_int( | |||
| 1386 | /* | 1389 | /* |
| 1387 | * Point to the data entry. | 1390 | * Point to the data entry. |
| 1388 | */ | 1391 | */ |
| 1389 | dep = (xfs_dir2_data_entry_t *) | 1392 | dep = (xfs_dir2_data_entry_t *)((char *)dbp->data + |
| 1390 | ((char *)dbp->data + | 1393 | xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); |
| 1391 | xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); | ||
| 1392 | /* | 1394 | /* |
| 1393 | * If it matches then return it. | 1395 | * Compare name and if it's an exact match, return the index |
| 1396 | * and buffer. If it's the first case-insensitive match, store | ||
| 1397 | * the index and buffer and continue looking for an exact match. | ||
| 1394 | */ | 1398 | */ |
| 1395 | if (dep->namelen == args->namelen && | 1399 | cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen); |
| 1396 | dep->name[0] == args->name[0] && | 1400 | if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { |
| 1397 | memcmp(dep->name, args->name, args->namelen) == 0) { | 1401 | args->cmpresult = cmp; |
| 1398 | *dbpp = dbp; | ||
| 1399 | *indexp = index; | 1402 | *indexp = index; |
| 1400 | return 0; | 1403 | /* case exact match: return the current buffer. */ |
| 1404 | if (cmp == XFS_CMP_EXACT) { | ||
| 1405 | *dbpp = dbp; | ||
| 1406 | return 0; | ||
| 1407 | } | ||
| 1408 | cidb = curdb; | ||
| 1401 | } | 1409 | } |
| 1402 | } | 1410 | } |
| 1411 | ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); | ||
| 1412 | /* | ||
| 1413 | * Here, we can only be doing a lookup (not a rename or remove). | ||
| 1414 | * If a case-insensitive match was found earlier, re-read the | ||
| 1415 | * appropriate data block if required and return it. | ||
| 1416 | */ | ||
| 1417 | if (args->cmpresult == XFS_CMP_CASE) { | ||
| 1418 | ASSERT(cidb != -1); | ||
| 1419 | if (cidb != curdb) { | ||
| 1420 | xfs_da_brelse(tp, dbp); | ||
| 1421 | error = xfs_da_read_buf(tp, dp, | ||
| 1422 | xfs_dir2_db_to_da(mp, cidb), | ||
| 1423 | -1, &dbp, XFS_DATA_FORK); | ||
| 1424 | if (error) { | ||
| 1425 | xfs_da_brelse(tp, lbp); | ||
| 1426 | return error; | ||
| 1427 | } | ||
| 1428 | } | ||
| 1429 | *dbpp = dbp; | ||
| 1430 | return 0; | ||
| 1431 | } | ||
| 1403 | /* | 1432 | /* |
| 1404 | * No match found, return ENOENT. | 1433 | * No match found, return ENOENT. |
| 1405 | */ | 1434 | */ |
| 1406 | ASSERT(args->oknoent); | 1435 | ASSERT(cidb == -1); |
| 1407 | if (dbp) | 1436 | if (dbp) |
| 1408 | xfs_da_brelse(tp, dbp); | 1437 | xfs_da_brelse(tp, dbp); |
| 1409 | xfs_da_brelse(tp, lbp); | 1438 | xfs_da_brelse(tp, lbp); |
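The block and leaf lookups above, and the node lookup that follows, all share one control-flow pattern: walk every entry on the matching hash chain, return as soon as an exact match is found, but remember the first case-only match so it can be handed back when no exact match exists. The function below is a self-contained sketch of just that pattern over a plain array; the types and helper are invented for illustration, only the enum mirrors the real code:

enum xfs_dacmp { XFS_CMP_DIFFERENT, XFS_CMP_EXACT, XFS_CMP_CASE };

struct entry {
        const char      *name;
        int             namelen;
};

/* Scan a hash chain: exact match wins, first case-only match is the fallback. */
static int
ci_scan(struct entry *ents, int count,
        enum xfs_dacmp (*compname)(const char *, int),
        int *found)
{
        enum xfs_dacmp  cmp;
        int             ci_index = -1;
        int             i;

        for (i = 0; i < count; i++) {
                cmp = compname(ents[i].name, ents[i].namelen);
                if (cmp == XFS_CMP_EXACT) {
                        *found = i;             /* exact match returns immediately */
                        return 0;
                }
                if (cmp == XFS_CMP_CASE && ci_index == -1)
                        ci_index = i;           /* remember the first case-only hit */
        }
        if (ci_index != -1) {
                *found = ci_index;              /* fall back to the CI match */
                return 0;
        }
        return -1;                              /* ENOENT in the real code */
}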
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 8dade711f099..fa6c3a5ddbc6 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c | |||
| @@ -226,7 +226,7 @@ xfs_dir2_leafn_add( | |||
| 226 | ASSERT(index == be16_to_cpu(leaf->hdr.count) || | 226 | ASSERT(index == be16_to_cpu(leaf->hdr.count) || |
| 227 | be32_to_cpu(leaf->ents[index].hashval) >= args->hashval); | 227 | be32_to_cpu(leaf->ents[index].hashval) >= args->hashval); |
| 228 | 228 | ||
| 229 | if (args->justcheck) | 229 | if (args->op_flags & XFS_DA_OP_JUSTCHECK) |
| 230 | return 0; | 230 | return 0; |
| 231 | 231 | ||
| 232 | /* | 232 | /* |
| @@ -387,28 +387,26 @@ xfs_dir2_leafn_lasthash( | |||
| 387 | } | 387 | } |
| 388 | 388 | ||
| 389 | /* | 389 | /* |
| 390 | * Look up a leaf entry in a node-format leaf block. | 390 | * Look up a leaf entry for space to add a name in a node-format leaf block. |
| 391 | * If this is an addname then the extrablk in state is a freespace block, | 391 | * The extrablk in state is a freespace block. |
| 392 | * otherwise it's a data block. | ||
| 393 | */ | 392 | */ |
| 394 | int | 393 | STATIC int |
| 395 | xfs_dir2_leafn_lookup_int( | 394 | xfs_dir2_leafn_lookup_for_addname( |
| 396 | xfs_dabuf_t *bp, /* leaf buffer */ | 395 | xfs_dabuf_t *bp, /* leaf buffer */ |
| 397 | xfs_da_args_t *args, /* operation arguments */ | 396 | xfs_da_args_t *args, /* operation arguments */ |
| 398 | int *indexp, /* out: leaf entry index */ | 397 | int *indexp, /* out: leaf entry index */ |
| 399 | xfs_da_state_t *state) /* state to fill in */ | 398 | xfs_da_state_t *state) /* state to fill in */ |
| 400 | { | 399 | { |
| 401 | xfs_dabuf_t *curbp; /* current data/free buffer */ | 400 | xfs_dabuf_t *curbp = NULL; /* current data/free buffer */ |
| 402 | xfs_dir2_db_t curdb; /* current data block number */ | 401 | xfs_dir2_db_t curdb = -1; /* current data block number */ |
| 403 | xfs_dir2_db_t curfdb; /* current free block number */ | 402 | xfs_dir2_db_t curfdb = -1; /* current free block number */ |
| 404 | xfs_dir2_data_entry_t *dep; /* data block entry */ | ||
| 405 | xfs_inode_t *dp; /* incore directory inode */ | 403 | xfs_inode_t *dp; /* incore directory inode */ |
| 406 | int error; /* error return value */ | 404 | int error; /* error return value */ |
| 407 | int fi; /* free entry index */ | 405 | int fi; /* free entry index */ |
| 408 | xfs_dir2_free_t *free=NULL; /* free block structure */ | 406 | xfs_dir2_free_t *free = NULL; /* free block structure */ |
| 409 | int index; /* leaf entry index */ | 407 | int index; /* leaf entry index */ |
| 410 | xfs_dir2_leaf_t *leaf; /* leaf structure */ | 408 | xfs_dir2_leaf_t *leaf; /* leaf structure */ |
| 411 | int length=0; /* length of new data entry */ | 409 | int length; /* length of new data entry */ |
| 412 | xfs_dir2_leaf_entry_t *lep; /* leaf entry */ | 410 | xfs_dir2_leaf_entry_t *lep; /* leaf entry */ |
| 413 | xfs_mount_t *mp; /* filesystem mount point */ | 411 | xfs_mount_t *mp; /* filesystem mount point */ |
| 414 | xfs_dir2_db_t newdb; /* new data block number */ | 412 | xfs_dir2_db_t newdb; /* new data block number */ |
| @@ -431,33 +429,20 @@ xfs_dir2_leafn_lookup_int( | |||
| 431 | /* | 429 | /* |
| 432 | * Do we have a buffer coming in? | 430 | * Do we have a buffer coming in? |
| 433 | */ | 431 | */ |
| 434 | if (state->extravalid) | 432 | if (state->extravalid) { |
| 433 | /* If so, it's a free block buffer, get the block number. */ | ||
| 435 | curbp = state->extrablk.bp; | 434 | curbp = state->extrablk.bp; |
| 436 | else | 435 | curfdb = state->extrablk.blkno; |
| 437 | curbp = NULL; | 436 | free = curbp->data; |
| 438 | /* | 437 | ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); |
| 439 | * For addname, it's a free block buffer, get the block number. | ||
| 440 | */ | ||
| 441 | if (args->addname) { | ||
| 442 | curfdb = curbp ? state->extrablk.blkno : -1; | ||
| 443 | curdb = -1; | ||
| 444 | length = xfs_dir2_data_entsize(args->namelen); | ||
| 445 | if ((free = (curbp ? curbp->data : NULL))) | ||
| 446 | ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); | ||
| 447 | } | ||
| 448 | /* | ||
| 449 | * For others, it's a data block buffer, get the block number. | ||
| 450 | */ | ||
| 451 | else { | ||
| 452 | curfdb = -1; | ||
| 453 | curdb = curbp ? state->extrablk.blkno : -1; | ||
| 454 | } | 438 | } |
| 439 | length = xfs_dir2_data_entsize(args->namelen); | ||
| 455 | /* | 440 | /* |
| 456 | * Loop over leaf entries with the right hash value. | 441 | * Loop over leaf entries with the right hash value. |
| 457 | */ | 442 | */ |
| 458 | for (lep = &leaf->ents[index]; | 443 | for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) && |
| 459 | index < be16_to_cpu(leaf->hdr.count) && be32_to_cpu(lep->hashval) == args->hashval; | 444 | be32_to_cpu(lep->hashval) == args->hashval; |
| 460 | lep++, index++) { | 445 | lep++, index++) { |
| 461 | /* | 446 | /* |
| 462 | * Skip stale leaf entries. | 447 | * Skip stale leaf entries. |
| 463 | */ | 448 | */ |
| @@ -471,161 +456,244 @@ xfs_dir2_leafn_lookup_int( | |||
| 471 | * For addname, we're looking for a place to put the new entry. | 456 | * For addname, we're looking for a place to put the new entry. |
| 472 | * We want to use a data block with an entry of equal | 457 | * We want to use a data block with an entry of equal |
| 473 | * hash value to ours if there is one with room. | 458 | * hash value to ours if there is one with room. |
| 459 | * | ||
| 460 | * If this block isn't the data block we already have | ||
| 461 | * in hand, take a look at it. | ||
| 474 | */ | 462 | */ |
| 475 | if (args->addname) { | 463 | if (newdb != curdb) { |
| 464 | curdb = newdb; | ||
| 476 | /* | 465 | /* |
| 477 | * If this block isn't the data block we already have | 466 | * Convert the data block to the free block |
| 478 | * in hand, take a look at it. | 467 | * holding its freespace information. |
| 479 | */ | 468 | */ |
| 480 | if (newdb != curdb) { | 469 | newfdb = xfs_dir2_db_to_fdb(mp, newdb); |
| 481 | curdb = newdb; | ||
| 482 | /* | ||
| 483 | * Convert the data block to the free block | ||
| 484 | * holding its freespace information. | ||
| 485 | */ | ||
| 486 | newfdb = xfs_dir2_db_to_fdb(mp, newdb); | ||
| 487 | /* | ||
| 488 | * If it's not the one we have in hand, | ||
| 489 | * read it in. | ||
| 490 | */ | ||
| 491 | if (newfdb != curfdb) { | ||
| 492 | /* | ||
| 493 | * If we had one before, drop it. | ||
| 494 | */ | ||
| 495 | if (curbp) | ||
| 496 | xfs_da_brelse(tp, curbp); | ||
| 497 | /* | ||
| 498 | * Read the free block. | ||
| 499 | */ | ||
| 500 | if ((error = xfs_da_read_buf(tp, dp, | ||
| 501 | xfs_dir2_db_to_da(mp, | ||
| 502 | newfdb), | ||
| 503 | -1, &curbp, | ||
| 504 | XFS_DATA_FORK))) { | ||
| 505 | return error; | ||
| 506 | } | ||
| 507 | free = curbp->data; | ||
| 508 | ASSERT(be32_to_cpu(free->hdr.magic) == | ||
| 509 | XFS_DIR2_FREE_MAGIC); | ||
| 510 | ASSERT((be32_to_cpu(free->hdr.firstdb) % | ||
| 511 | XFS_DIR2_MAX_FREE_BESTS(mp)) == | ||
| 512 | 0); | ||
| 513 | ASSERT(be32_to_cpu(free->hdr.firstdb) <= curdb); | ||
| 514 | ASSERT(curdb < | ||
| 515 | be32_to_cpu(free->hdr.firstdb) + | ||
| 516 | be32_to_cpu(free->hdr.nvalid)); | ||
| 517 | } | ||
| 518 | /* | ||
| 519 | * Get the index for our entry. | ||
| 520 | */ | ||
| 521 | fi = xfs_dir2_db_to_fdindex(mp, curdb); | ||
| 522 | /* | ||
| 523 | * If it has room, return it. | ||
| 524 | */ | ||
| 525 | if (unlikely(be16_to_cpu(free->bests[fi]) == NULLDATAOFF)) { | ||
| 526 | XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int", | ||
| 527 | XFS_ERRLEVEL_LOW, mp); | ||
| 528 | if (curfdb != newfdb) | ||
| 529 | xfs_da_brelse(tp, curbp); | ||
| 530 | return XFS_ERROR(EFSCORRUPTED); | ||
| 531 | } | ||
| 532 | curfdb = newfdb; | ||
| 533 | if (be16_to_cpu(free->bests[fi]) >= length) { | ||
| 534 | *indexp = index; | ||
| 535 | state->extravalid = 1; | ||
| 536 | state->extrablk.bp = curbp; | ||
| 537 | state->extrablk.blkno = curfdb; | ||
| 538 | state->extrablk.index = fi; | ||
| 539 | state->extrablk.magic = | ||
| 540 | XFS_DIR2_FREE_MAGIC; | ||
| 541 | ASSERT(args->oknoent); | ||
| 542 | return XFS_ERROR(ENOENT); | ||
| 543 | } | ||
| 544 | } | ||
| 545 | } | ||
| 546 | /* | ||
| 547 | * Not adding a new entry, so we really want to find | ||
| 548 | * the name given to us. | ||
| 549 | */ | ||
| 550 | else { | ||
| 551 | /* | 470 | /* |
| 552 | * If it's a different data block, go get it. | 471 | * If it's not the one we have in hand, read it in. |
| 553 | */ | 472 | */ |
| 554 | if (newdb != curdb) { | 473 | if (newfdb != curfdb) { |
| 555 | /* | 474 | /* |
| 556 | * If we had a block before, drop it. | 475 | * If we had one before, drop it. |
| 557 | */ | 476 | */ |
| 558 | if (curbp) | 477 | if (curbp) |
| 559 | xfs_da_brelse(tp, curbp); | 478 | xfs_da_brelse(tp, curbp); |
| 560 | /* | 479 | /* |
| 561 | * Read the data block. | 480 | * Read the free block. |
| 562 | */ | 481 | */ |
| 563 | if ((error = | 482 | error = xfs_da_read_buf(tp, dp, |
| 564 | xfs_da_read_buf(tp, dp, | 483 | xfs_dir2_db_to_da(mp, newfdb), |
| 565 | xfs_dir2_db_to_da(mp, newdb), -1, | 484 | -1, &curbp, XFS_DATA_FORK); |
| 566 | &curbp, XFS_DATA_FORK))) { | 485 | if (error) |
| 567 | return error; | 486 | return error; |
| 568 | } | 487 | free = curbp->data; |
| 569 | xfs_dir2_data_check(dp, curbp); | 488 | ASSERT(be32_to_cpu(free->hdr.magic) == |
| 570 | curdb = newdb; | 489 | XFS_DIR2_FREE_MAGIC); |
| 490 | ASSERT((be32_to_cpu(free->hdr.firstdb) % | ||
| 491 | XFS_DIR2_MAX_FREE_BESTS(mp)) == 0); | ||
| 492 | ASSERT(be32_to_cpu(free->hdr.firstdb) <= curdb); | ||
| 493 | ASSERT(curdb < be32_to_cpu(free->hdr.firstdb) + | ||
| 494 | be32_to_cpu(free->hdr.nvalid)); | ||
| 571 | } | 495 | } |
| 572 | /* | 496 | /* |
| 573 | * Point to the data entry. | 497 | * Get the index for our entry. |
| 574 | */ | 498 | */ |
| 575 | dep = (xfs_dir2_data_entry_t *) | 499 | fi = xfs_dir2_db_to_fdindex(mp, curdb); |
| 576 | ((char *)curbp->data + | ||
| 577 | xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); | ||
| 578 | /* | 500 | /* |
| 579 | * Compare the entry, return it if it matches. | 501 | * If it has room, return it. |
| 580 | */ | 502 | */ |
| 581 | if (dep->namelen == args->namelen && | 503 | if (unlikely(be16_to_cpu(free->bests[fi]) == NULLDATAOFF)) { |
| 582 | dep->name[0] == args->name[0] && | 504 | XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int", |
| 583 | memcmp(dep->name, args->name, args->namelen) == 0) { | 505 | XFS_ERRLEVEL_LOW, mp); |
| 584 | args->inumber = be64_to_cpu(dep->inumber); | 506 | if (curfdb != newfdb) |
| 585 | *indexp = index; | 507 | xfs_da_brelse(tp, curbp); |
| 586 | state->extravalid = 1; | 508 | return XFS_ERROR(EFSCORRUPTED); |
| 587 | state->extrablk.bp = curbp; | ||
| 588 | state->extrablk.blkno = curdb; | ||
| 589 | state->extrablk.index = | ||
| 590 | (int)((char *)dep - | ||
| 591 | (char *)curbp->data); | ||
| 592 | state->extrablk.magic = XFS_DIR2_DATA_MAGIC; | ||
| 593 | return XFS_ERROR(EEXIST); | ||
| 594 | } | 509 | } |
| 510 | curfdb = newfdb; | ||
| 511 | if (be16_to_cpu(free->bests[fi]) >= length) | ||
| 512 | goto out; | ||
| 595 | } | 513 | } |
| 596 | } | 514 | } |
| 515 | /* Didn't find any space */ | ||
| 516 | fi = -1; | ||
| 517 | out: | ||
| 518 | ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); | ||
| 519 | if (curbp) { | ||
| 520 | /* Giving back a free block. */ | ||
| 521 | state->extravalid = 1; | ||
| 522 | state->extrablk.bp = curbp; | ||
| 523 | state->extrablk.index = fi; | ||
| 524 | state->extrablk.blkno = curfdb; | ||
| 525 | state->extrablk.magic = XFS_DIR2_FREE_MAGIC; | ||
| 526 | } else { | ||
| 527 | state->extravalid = 0; | ||
| 528 | } | ||
| 597 | /* | 529 | /* |
| 598 | * Didn't find a match. | 530 | * Return the index, that will be the insertion point. |
| 599 | * If we are holding a buffer, give it back in case our caller | ||
| 600 | * finds it useful. | ||
| 601 | */ | 531 | */ |
| 602 | if ((state->extravalid = (curbp != NULL))) { | 532 | *indexp = index; |
| 603 | state->extrablk.bp = curbp; | 533 | return XFS_ERROR(ENOENT); |
| 604 | state->extrablk.index = -1; | 534 | } |
| 535 | |||
| 536 | /* | ||
| 537 | * Look up a leaf entry in a node-format leaf block. | ||
| 538 | * The extrablk in state is a data block. | ||
| 539 | */ | ||
| 540 | STATIC int | ||
| 541 | xfs_dir2_leafn_lookup_for_entry( | ||
| 542 | xfs_dabuf_t *bp, /* leaf buffer */ | ||
| 543 | xfs_da_args_t *args, /* operation arguments */ | ||
| 544 | int *indexp, /* out: leaf entry index */ | ||
| 545 | xfs_da_state_t *state) /* state to fill in */ | ||
| 546 | { | ||
| 547 | xfs_dabuf_t *curbp = NULL; /* current data/free buffer */ | ||
| 548 | xfs_dir2_db_t curdb = -1; /* current data block number */ | ||
| 549 | xfs_dir2_data_entry_t *dep; /* data block entry */ | ||
| 550 | xfs_inode_t *dp; /* incore directory inode */ | ||
| 551 | int error; /* error return value */ | ||
| 552 | int index; /* leaf entry index */ | ||
| 553 | xfs_dir2_leaf_t *leaf; /* leaf structure */ | ||
| 554 | xfs_dir2_leaf_entry_t *lep; /* leaf entry */ | ||
| 555 | xfs_mount_t *mp; /* filesystem mount point */ | ||
| 556 | xfs_dir2_db_t newdb; /* new data block number */ | ||
| 557 | xfs_trans_t *tp; /* transaction pointer */ | ||
| 558 | enum xfs_dacmp cmp; /* comparison result */ | ||
| 559 | |||
| 560 | dp = args->dp; | ||
| 561 | tp = args->trans; | ||
| 562 | mp = dp->i_mount; | ||
| 563 | leaf = bp->data; | ||
| 564 | ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); | ||
| 565 | #ifdef __KERNEL__ | ||
| 566 | ASSERT(be16_to_cpu(leaf->hdr.count) > 0); | ||
| 567 | #endif | ||
| 568 | xfs_dir2_leafn_check(dp, bp); | ||
| 569 | /* | ||
| 570 | * Look up the hash value in the leaf entries. | ||
| 571 | */ | ||
| 572 | index = xfs_dir2_leaf_search_hash(args, bp); | ||
| 573 | /* | ||
| 574 | * Do we have a buffer coming in? | ||
| 575 | */ | ||
| 576 | if (state->extravalid) { | ||
| 577 | curbp = state->extrablk.bp; | ||
| 578 | curdb = state->extrablk.blkno; | ||
| 579 | } | ||
| 580 | /* | ||
| 581 | * Loop over leaf entries with the right hash value. | ||
| 582 | */ | ||
| 583 | for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) && | ||
| 584 | be32_to_cpu(lep->hashval) == args->hashval; | ||
| 585 | lep++, index++) { | ||
| 605 | /* | 586 | /* |
| 606 | * For addname, giving back a free block. | 587 | * Skip stale leaf entries. |
| 607 | */ | 588 | */ |
| 608 | if (args->addname) { | 589 | if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR) |
| 609 | state->extrablk.blkno = curfdb; | 590 | continue; |
| 610 | state->extrablk.magic = XFS_DIR2_FREE_MAGIC; | 591 | /* |
| 592 | * Pull the data block number from the entry. | ||
| 593 | */ | ||
| 594 | newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address)); | ||
| 595 | /* | ||
| 596 | * Not adding a new entry, so we really want to find | ||
| 597 | * the name given to us. | ||
| 598 | * | ||
| 599 | * If it's a different data block, go get it. | ||
| 600 | */ | ||
| 601 | if (newdb != curdb) { | ||
| 602 | /* | ||
| 603 | * If we had a block before that we aren't saving | ||
| 604 | * for a CI name, drop it | ||
| 605 | */ | ||
| 606 | if (curbp && (args->cmpresult == XFS_CMP_DIFFERENT || | ||
| 607 | curdb != state->extrablk.blkno)) | ||
| 608 | xfs_da_brelse(tp, curbp); | ||
| 609 | /* | ||
| 610 | * If needing the block that is saved with a CI match, | ||
| 611 | * use it, otherwise read in the new data block. | ||
| 612 | */ | ||
| 613 | if (args->cmpresult != XFS_CMP_DIFFERENT && | ||
| 614 | newdb == state->extrablk.blkno) { | ||
| 615 | ASSERT(state->extravalid); | ||
| 616 | curbp = state->extrablk.bp; | ||
| 617 | } else { | ||
| 618 | error = xfs_da_read_buf(tp, dp, | ||
| 619 | xfs_dir2_db_to_da(mp, newdb), | ||
| 620 | -1, &curbp, XFS_DATA_FORK); | ||
| 621 | if (error) | ||
| 622 | return error; | ||
| 623 | } | ||
| 624 | xfs_dir2_data_check(dp, curbp); | ||
| 625 | curdb = newdb; | ||
| 611 | } | 626 | } |
| 612 | /* | 627 | /* |
| 613 | * For other callers, giving back a data block. | 628 | * Point to the data entry. |
| 614 | */ | 629 | */ |
| 615 | else { | 630 | dep = (xfs_dir2_data_entry_t *)((char *)curbp->data + |
| 631 | xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address))); | ||
| 632 | /* | ||
| 633 | * Compare the entry and if it's an exact match, return | ||
| 634 | * EEXIST immediately. If it's the first case-insensitive | ||
| 635 | * match, store the block & inode number and continue looking. | ||
| 636 | */ | ||
| 637 | cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen); | ||
| 638 | if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { | ||
| 639 | /* If there is a CI match block, drop it */ | ||
| 640 | if (args->cmpresult != XFS_CMP_DIFFERENT && | ||
| 641 | curdb != state->extrablk.blkno) | ||
| 642 | xfs_da_brelse(tp, state->extrablk.bp); | ||
| 643 | args->cmpresult = cmp; | ||
| 644 | args->inumber = be64_to_cpu(dep->inumber); | ||
| 645 | *indexp = index; | ||
| 646 | state->extravalid = 1; | ||
| 647 | state->extrablk.bp = curbp; | ||
| 616 | state->extrablk.blkno = curdb; | 648 | state->extrablk.blkno = curdb; |
| 649 | state->extrablk.index = (int)((char *)dep - | ||
| 650 | (char *)curbp->data); | ||
| 617 | state->extrablk.magic = XFS_DIR2_DATA_MAGIC; | 651 | state->extrablk.magic = XFS_DIR2_DATA_MAGIC; |
| 652 | if (cmp == XFS_CMP_EXACT) | ||
| 653 | return XFS_ERROR(EEXIST); | ||
| 618 | } | 654 | } |
| 619 | } | 655 | } |
| 620 | /* | 656 | ASSERT(index == be16_to_cpu(leaf->hdr.count) || |
| 621 | * Return the final index, that will be the insertion point. | 657 | (args->op_flags & XFS_DA_OP_OKNOENT)); |
| 622 | */ | 658 | if (curbp) { |
| 659 | if (args->cmpresult == XFS_CMP_DIFFERENT) { | ||
| 660 | /* Giving back last used data block. */ | ||
| 661 | state->extravalid = 1; | ||
| 662 | state->extrablk.bp = curbp; | ||
| 663 | state->extrablk.index = -1; | ||
| 664 | state->extrablk.blkno = curdb; | ||
| 665 | state->extrablk.magic = XFS_DIR2_DATA_MAGIC; | ||
| 666 | } else { | ||
| 667 | /* If the curbp is not the CI match block, drop it */ | ||
| 668 | if (state->extrablk.bp != curbp) | ||
| 669 | xfs_da_brelse(tp, curbp); | ||
| 670 | } | ||
| 671 | } else { | ||
| 672 | state->extravalid = 0; | ||
| 673 | } | ||
| 623 | *indexp = index; | 674 | *indexp = index; |
| 624 | ASSERT(index == be16_to_cpu(leaf->hdr.count) || args->oknoent); | ||
| 625 | return XFS_ERROR(ENOENT); | 675 | return XFS_ERROR(ENOENT); |
| 626 | } | 676 | } |
| 627 | 677 | ||
| 628 | /* | 678 | /* |
| 679 | * Look up a leaf entry in a node-format leaf block. | ||
| 680 | * If this is an addname then the extrablk in state is a freespace block, | ||
| 681 | * otherwise it's a data block. | ||
| 682 | */ | ||
| 683 | int | ||
| 684 | xfs_dir2_leafn_lookup_int( | ||
| 685 | xfs_dabuf_t *bp, /* leaf buffer */ | ||
| 686 | xfs_da_args_t *args, /* operation arguments */ | ||
| 687 | int *indexp, /* out: leaf entry index */ | ||
| 688 | xfs_da_state_t *state) /* state to fill in */ | ||
| 689 | { | ||
| 690 | if (args->op_flags & XFS_DA_OP_ADDNAME) | ||
| 691 | return xfs_dir2_leafn_lookup_for_addname(bp, args, indexp, | ||
| 692 | state); | ||
| 693 | return xfs_dir2_leafn_lookup_for_entry(bp, args, indexp, state); | ||
| 694 | } | ||
| 695 | |||
| 696 | /* | ||
| 629 | * Move count leaf entries from source to destination leaf. | 697 | * Move count leaf entries from source to destination leaf. |
| 630 | * Log entries and headers. Stale entries are preserved. | 698 | * Log entries and headers. Stale entries are preserved. |
| 631 | */ | 699 | */ |
| @@ -823,9 +891,10 @@ xfs_dir2_leafn_rebalance( | |||
| 823 | */ | 891 | */ |
| 824 | if (!state->inleaf) | 892 | if (!state->inleaf) |
| 825 | blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count); | 893 | blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count); |
| 826 | 894 | ||
| 827 | /* | 895 | /* |
| 828 | * Finally sanity check just to make sure we are not returning a negative index | 896 | * Finally sanity check just to make sure we are not returning a |
| 897 | * negative index | ||
| 829 | */ | 898 | */ |
| 830 | if(blk2->index < 0) { | 899 | if(blk2->index < 0) { |
| 831 | state->inleaf = 1; | 900 | state->inleaf = 1; |
| @@ -1332,7 +1401,7 @@ xfs_dir2_node_addname( | |||
| 1332 | /* | 1401 | /* |
| 1333 | * It worked, fix the hash values up the btree. | 1402 | * It worked, fix the hash values up the btree. |
| 1334 | */ | 1403 | */ |
| 1335 | if (!args->justcheck) | 1404 | if (!(args->op_flags & XFS_DA_OP_JUSTCHECK)) |
| 1336 | xfs_da_fixhashpath(state, &state->path); | 1405 | xfs_da_fixhashpath(state, &state->path); |
| 1337 | } else { | 1406 | } else { |
| 1338 | /* | 1407 | /* |
| @@ -1515,7 +1584,8 @@ xfs_dir2_node_addname_int( | |||
| 1515 | /* | 1584 | /* |
| 1516 | * Not allowed to allocate, return failure. | 1585 | * Not allowed to allocate, return failure. |
| 1517 | */ | 1586 | */ |
| 1518 | if (args->justcheck || args->total == 0) { | 1587 | if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || |
| 1588 | args->total == 0) { | ||
| 1519 | /* | 1589 | /* |
| 1520 | * Drop the freespace buffer unless it came from our | 1590 | * Drop the freespace buffer unless it came from our |
| 1521 | * caller. | 1591 | * caller. |
| @@ -1661,7 +1731,7 @@ xfs_dir2_node_addname_int( | |||
| 1661 | /* | 1731 | /* |
| 1662 | * If just checking, we succeeded. | 1732 | * If just checking, we succeeded. |
| 1663 | */ | 1733 | */ |
| 1664 | if (args->justcheck) { | 1734 | if (args->op_flags & XFS_DA_OP_JUSTCHECK) { |
| 1665 | if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) | 1735 | if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) |
| 1666 | xfs_da_buf_done(fbp); | 1736 | xfs_da_buf_done(fbp); |
| 1667 | return 0; | 1737 | return 0; |
| @@ -1767,6 +1837,14 @@ xfs_dir2_node_lookup( | |||
| 1767 | error = xfs_da_node_lookup_int(state, &rval); | 1837 | error = xfs_da_node_lookup_int(state, &rval); |
| 1768 | if (error) | 1838 | if (error) |
| 1769 | rval = error; | 1839 | rval = error; |
| 1840 | else if (rval == ENOENT && args->cmpresult == XFS_CMP_CASE) { | ||
| 1841 | /* If a CI match, dup the actual name and return EEXIST */ | ||
| 1842 | xfs_dir2_data_entry_t *dep; | ||
| 1843 | |||
| 1844 | dep = (xfs_dir2_data_entry_t *)((char *)state->extrablk.bp-> | ||
| 1845 | data + state->extrablk.index); | ||
| 1846 | rval = xfs_dir_cilookup_result(args, dep->name, dep->namelen); | ||
| 1847 | } | ||
| 1770 | /* | 1848 | /* |
| 1771 | * Release the btree blocks and leaf block. | 1849 | * Release the btree blocks and leaf block. |
| 1772 | */ | 1850 | */ |
| @@ -1810,9 +1888,8 @@ xfs_dir2_node_removename( | |||
| 1810 | * Look up the entry we're deleting, set up the cursor. | 1888 | * Look up the entry we're deleting, set up the cursor. |
| 1811 | */ | 1889 | */ |
| 1812 | error = xfs_da_node_lookup_int(state, &rval); | 1890 | error = xfs_da_node_lookup_int(state, &rval); |
| 1813 | if (error) { | 1891 | if (error) |
| 1814 | rval = error; | 1892 | rval = error; |
| 1815 | } | ||
| 1816 | /* | 1893 | /* |
| 1817 | * Didn't find it, upper layer screwed up. | 1894 | * Didn't find it, upper layer screwed up. |
| 1818 | */ | 1895 | */ |
| @@ -1829,9 +1906,8 @@ xfs_dir2_node_removename( | |||
| 1829 | */ | 1906 | */ |
| 1830 | error = xfs_dir2_leafn_remove(args, blk->bp, blk->index, | 1907 | error = xfs_dir2_leafn_remove(args, blk->bp, blk->index, |
| 1831 | &state->extrablk, &rval); | 1908 | &state->extrablk, &rval); |
| 1832 | if (error) { | 1909 | if (error) |
| 1833 | return error; | 1910 | return error; |
| 1834 | } | ||
| 1835 | /* | 1911 | /* |
| 1836 | * Fix the hash values up the btree. | 1912 | * Fix the hash values up the btree. |
| 1837 | */ | 1913 | */ |
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index 919d275a1cef..b46af0013ec9 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c | |||
| @@ -255,7 +255,7 @@ xfs_dir2_block_to_sf( | |||
| 255 | xfs_dir2_sf_check(args); | 255 | xfs_dir2_sf_check(args); |
| 256 | out: | 256 | out: |
| 257 | xfs_trans_log_inode(args->trans, dp, logflags); | 257 | xfs_trans_log_inode(args->trans, dp, logflags); |
| 258 | kmem_free(block, mp->m_dirblksize); | 258 | kmem_free(block); |
| 259 | return error; | 259 | return error; |
| 260 | } | 260 | } |
| 261 | 261 | ||
| @@ -332,7 +332,7 @@ xfs_dir2_sf_addname( | |||
| 332 | /* | 332 | /* |
| 333 | * Just checking or no space reservation, it doesn't fit. | 333 | * Just checking or no space reservation, it doesn't fit. |
| 334 | */ | 334 | */ |
| 335 | if (args->justcheck || args->total == 0) | 335 | if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) |
| 336 | return XFS_ERROR(ENOSPC); | 336 | return XFS_ERROR(ENOSPC); |
| 337 | /* | 337 | /* |
| 338 | * Convert to block form then add the name. | 338 | * Convert to block form then add the name. |
| @@ -345,7 +345,7 @@ xfs_dir2_sf_addname( | |||
| 345 | /* | 345 | /* |
| 346 | * Just checking, it fits. | 346 | * Just checking, it fits. |
| 347 | */ | 347 | */ |
| 348 | if (args->justcheck) | 348 | if (args->op_flags & XFS_DA_OP_JUSTCHECK) |
| 349 | return 0; | 349 | return 0; |
| 350 | /* | 350 | /* |
| 351 | * Do it the easy way - just add it at the end. | 351 | * Do it the easy way - just add it at the end. |
| @@ -512,7 +512,7 @@ xfs_dir2_sf_addname_hard( | |||
| 512 | sfep = xfs_dir2_sf_nextentry(sfp, sfep); | 512 | sfep = xfs_dir2_sf_nextentry(sfp, sfep); |
| 513 | memcpy(sfep, oldsfep, old_isize - nbytes); | 513 | memcpy(sfep, oldsfep, old_isize - nbytes); |
| 514 | } | 514 | } |
| 515 | kmem_free(buf, old_isize); | 515 | kmem_free(buf); |
| 516 | dp->i_d.di_size = new_isize; | 516 | dp->i_d.di_size = new_isize; |
| 517 | xfs_dir2_sf_check(args); | 517 | xfs_dir2_sf_check(args); |
| 518 | } | 518 | } |
| @@ -812,8 +812,11 @@ xfs_dir2_sf_lookup( | |||
| 812 | { | 812 | { |
| 813 | xfs_inode_t *dp; /* incore directory inode */ | 813 | xfs_inode_t *dp; /* incore directory inode */ |
| 814 | int i; /* entry index */ | 814 | int i; /* entry index */ |
| 815 | int error; | ||
| 815 | xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ | 816 | xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ |
| 816 | xfs_dir2_sf_t *sfp; /* shortform structure */ | 817 | xfs_dir2_sf_t *sfp; /* shortform structure */ |
| 818 | enum xfs_dacmp cmp; /* comparison result */ | ||
| 819 | xfs_dir2_sf_entry_t *ci_sfep; /* case-insens. entry */ | ||
| 817 | 820 | ||
| 818 | xfs_dir2_trace_args("sf_lookup", args); | 821 | xfs_dir2_trace_args("sf_lookup", args); |
| 819 | xfs_dir2_sf_check(args); | 822 | xfs_dir2_sf_check(args); |
| @@ -836,6 +839,7 @@ xfs_dir2_sf_lookup( | |||
| 836 | */ | 839 | */ |
| 837 | if (args->namelen == 1 && args->name[0] == '.') { | 840 | if (args->namelen == 1 && args->name[0] == '.') { |
| 838 | args->inumber = dp->i_ino; | 841 | args->inumber = dp->i_ino; |
| 842 | args->cmpresult = XFS_CMP_EXACT; | ||
| 839 | return XFS_ERROR(EEXIST); | 843 | return XFS_ERROR(EEXIST); |
| 840 | } | 844 | } |
| 841 | /* | 845 | /* |
| @@ -844,28 +848,41 @@ xfs_dir2_sf_lookup( | |||
| 844 | if (args->namelen == 2 && | 848 | if (args->namelen == 2 && |
| 845 | args->name[0] == '.' && args->name[1] == '.') { | 849 | args->name[0] == '.' && args->name[1] == '.') { |
| 846 | args->inumber = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent); | 850 | args->inumber = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent); |
| 851 | args->cmpresult = XFS_CMP_EXACT; | ||
| 847 | return XFS_ERROR(EEXIST); | 852 | return XFS_ERROR(EEXIST); |
| 848 | } | 853 | } |
| 849 | /* | 854 | /* |
| 850 | * Loop over all the entries trying to match ours. | 855 | * Loop over all the entries trying to match ours. |
| 851 | */ | 856 | */ |
| 852 | for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); | 857 | ci_sfep = NULL; |
| 853 | i < sfp->hdr.count; | 858 | for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->hdr.count; |
| 854 | i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { | 859 | i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { |
| 855 | if (sfep->namelen == args->namelen && | 860 | /* |
| 856 | sfep->name[0] == args->name[0] && | 861 | * Compare name and if it's an exact match, return the inode |
| 857 | memcmp(args->name, sfep->name, args->namelen) == 0) { | 862 | * number. If it's the first case-insensitive match, store the |
| 858 | args->inumber = | 863 | * inode number and continue looking for an exact match. |
| 859 | xfs_dir2_sf_get_inumber(sfp, | 864 | */ |
| 860 | xfs_dir2_sf_inumberp(sfep)); | 865 | cmp = dp->i_mount->m_dirnameops->compname(args, sfep->name, |
| 861 | return XFS_ERROR(EEXIST); | 866 | sfep->namelen); |
| 867 | if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { | ||
| 868 | args->cmpresult = cmp; | ||
| 869 | args->inumber = xfs_dir2_sf_get_inumber(sfp, | ||
| 870 | xfs_dir2_sf_inumberp(sfep)); | ||
| 871 | if (cmp == XFS_CMP_EXACT) | ||
| 872 | return XFS_ERROR(EEXIST); | ||
| 873 | ci_sfep = sfep; | ||
| 862 | } | 874 | } |
| 863 | } | 875 | } |
| 876 | ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); | ||
| 864 | /* | 877 | /* |
| 865 | * Didn't find it. | 878 | * Here, we can only be doing a lookup (not a rename or replace). |
| 879 | * If a case-insensitive match was not found, return ENOENT. | ||
| 866 | */ | 880 | */ |
| 867 | ASSERT(args->oknoent); | 881 | if (!ci_sfep) |
| 868 | return XFS_ERROR(ENOENT); | 882 | return XFS_ERROR(ENOENT); |
| 883 | /* otherwise process the CI match as required by the caller */ | ||
| 884 | error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen); | ||
| 885 | return XFS_ERROR(error); | ||
| 869 | } | 886 | } |
| 870 | 887 | ||
| 871 | /* | 888 | /* |
| @@ -904,24 +921,21 @@ xfs_dir2_sf_removename( | |||
| 904 | * Loop over the old directory entries. | 921 | * Loop over the old directory entries. |
| 905 | * Find the one we're deleting. | 922 | * Find the one we're deleting. |
| 906 | */ | 923 | */ |
| 907 | for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); | 924 | for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->hdr.count; |
| 908 | i < sfp->hdr.count; | 925 | i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { |
| 909 | i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { | 926 | if (xfs_da_compname(args, sfep->name, sfep->namelen) == |
| 910 | if (sfep->namelen == args->namelen && | 927 | XFS_CMP_EXACT) { |
| 911 | sfep->name[0] == args->name[0] && | ||
| 912 | memcmp(sfep->name, args->name, args->namelen) == 0) { | ||
| 913 | ASSERT(xfs_dir2_sf_get_inumber(sfp, | 928 | ASSERT(xfs_dir2_sf_get_inumber(sfp, |
| 914 | xfs_dir2_sf_inumberp(sfep)) == | 929 | xfs_dir2_sf_inumberp(sfep)) == |
| 915 | args->inumber); | 930 | args->inumber); |
| 916 | break; | 931 | break; |
| 917 | } | 932 | } |
| 918 | } | 933 | } |
| 919 | /* | 934 | /* |
| 920 | * Didn't find it. | 935 | * Didn't find it. |
| 921 | */ | 936 | */ |
| 922 | if (i == sfp->hdr.count) { | 937 | if (i == sfp->hdr.count) |
| 923 | return XFS_ERROR(ENOENT); | 938 | return XFS_ERROR(ENOENT); |
| 924 | } | ||
| 925 | /* | 939 | /* |
| 926 | * Calculate sizes. | 940 | * Calculate sizes. |
| 927 | */ | 941 | */ |
| @@ -1042,11 +1056,10 @@ xfs_dir2_sf_replace( | |||
| 1042 | */ | 1056 | */ |
| 1043 | else { | 1057 | else { |
| 1044 | for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); | 1058 | for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); |
| 1045 | i < sfp->hdr.count; | 1059 | i < sfp->hdr.count; |
| 1046 | i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { | 1060 | i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { |
| 1047 | if (sfep->namelen == args->namelen && | 1061 | if (xfs_da_compname(args, sfep->name, sfep->namelen) == |
| 1048 | sfep->name[0] == args->name[0] && | 1062 | XFS_CMP_EXACT) { |
| 1049 | memcmp(args->name, sfep->name, args->namelen) == 0) { | ||
| 1050 | #if XFS_BIG_INUMS || defined(DEBUG) | 1063 | #if XFS_BIG_INUMS || defined(DEBUG) |
| 1051 | ino = xfs_dir2_sf_get_inumber(sfp, | 1064 | ino = xfs_dir2_sf_get_inumber(sfp, |
| 1052 | xfs_dir2_sf_inumberp(sfep)); | 1065 | xfs_dir2_sf_inumberp(sfep)); |
| @@ -1061,7 +1074,7 @@ xfs_dir2_sf_replace( | |||
| 1061 | * Didn't find it. | 1074 | * Didn't find it. |
| 1062 | */ | 1075 | */ |
| 1063 | if (i == sfp->hdr.count) { | 1076 | if (i == sfp->hdr.count) { |
| 1064 | ASSERT(args->oknoent); | 1077 | ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); |
| 1065 | #if XFS_BIG_INUMS | 1078 | #if XFS_BIG_INUMS |
| 1066 | if (i8elevated) | 1079 | if (i8elevated) |
| 1067 | xfs_dir2_sf_toino4(args); | 1080 | xfs_dir2_sf_toino4(args); |
| @@ -1174,7 +1187,7 @@ xfs_dir2_sf_toino4( | |||
| 1174 | /* | 1187 | /* |
| 1175 | * Clean up the inode. | 1188 | * Clean up the inode. |
| 1176 | */ | 1189 | */ |
| 1177 | kmem_free(buf, oldsize); | 1190 | kmem_free(buf); |
| 1178 | dp->i_d.di_size = newsize; | 1191 | dp->i_d.di_size = newsize; |
| 1179 | xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); | 1192 | xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); |
| 1180 | } | 1193 | } |
| @@ -1251,7 +1264,7 @@ xfs_dir2_sf_toino8( | |||
| 1251 | /* | 1264 | /* |
| 1252 | * Clean up the inode. | 1265 | * Clean up the inode. |
| 1253 | */ | 1266 | */ |
| 1254 | kmem_free(buf, oldsize); | 1267 | kmem_free(buf); |
| 1255 | dp->i_d.di_size = newsize; | 1268 | dp->i_d.di_size = newsize; |
| 1256 | xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); | 1269 | xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); |
| 1257 | } | 1270 | } |
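The xfs_dir2_sf.c lookup above now asks the per-mount name operations (mp->m_dirnameops->compname) for a three-way answer instead of doing a raw memcmp: an exact match wins immediately, the first case-only match is remembered, and if the loop ends without an exact hit the remembered entry is handed to xfs_dir_cilookup_result() (which, per the node-format hunk earlier, duplicates the on-disk name for the caller). The user-space sketch below illustrates only that matching policy; the enum names merely mirror XFS_CMP_DIFFERENT/XFS_CMP_CASE/XFS_CMP_EXACT and none of it is the kernel code itself.

	#include <stddef.h>
	#include <string.h>
	#include <strings.h>

	enum cmp { CMP_DIFFERENT, CMP_CASE, CMP_EXACT };

	static enum cmp compname(const char *want, const char *have)
	{
		if (strcmp(want, have) == 0)
			return CMP_EXACT;
		if (strcasecmp(want, have) == 0)
			return CMP_CASE;
		return CMP_DIFFERENT;
	}

	/* Exact match returns immediately; otherwise the first case-insensitive
	 * match is used; otherwise -1 (the ENOENT case). */
	static int lookup(const char *want, const char *names[], int count)
	{
		int i, ci = -1;

		for (i = 0; i < count; i++) {
			switch (compname(want, names[i])) {
			case CMP_EXACT:
				return i;
			case CMP_CASE:
				if (ci < 0)
					ci = i;
				break;
			default:
				break;
			}
		}
		return ci;
	}

	/* e.g. lookup("Makefile", (const char *[]){"makefile", "Makefile"}, 2) == 1 */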
diff --git a/fs/xfs/xfs_dir2_sf.h b/fs/xfs/xfs_dir2_sf.h index 005629d702d2..deecc9d238f8 100644 --- a/fs/xfs/xfs_dir2_sf.h +++ b/fs/xfs/xfs_dir2_sf.h | |||
| @@ -62,7 +62,7 @@ typedef union { | |||
| 62 | * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t. | 62 | * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t. |
| 63 | * Only need 16 bits, this is the byte offset into the single block form. | 63 | * Only need 16 bits, this is the byte offset into the single block form. |
| 64 | */ | 64 | */ |
| 65 | typedef struct { __uint8_t i[2]; } xfs_dir2_sf_off_t; | 65 | typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t; |
| 66 | 66 | ||
| 67 | /* | 67 | /* |
| 68 | * The parent directory has a dedicated field, and the self-pointer must | 68 | * The parent directory has a dedicated field, and the self-pointer must |
| @@ -76,14 +76,14 @@ typedef struct xfs_dir2_sf_hdr { | |||
| 76 | __uint8_t count; /* count of entries */ | 76 | __uint8_t count; /* count of entries */ |
| 77 | __uint8_t i8count; /* count of 8-byte inode #s */ | 77 | __uint8_t i8count; /* count of 8-byte inode #s */ |
| 78 | xfs_dir2_inou_t parent; /* parent dir inode number */ | 78 | xfs_dir2_inou_t parent; /* parent dir inode number */ |
| 79 | } xfs_dir2_sf_hdr_t; | 79 | } __arch_pack xfs_dir2_sf_hdr_t; |
| 80 | 80 | ||
| 81 | typedef struct xfs_dir2_sf_entry { | 81 | typedef struct xfs_dir2_sf_entry { |
| 82 | __uint8_t namelen; /* actual name length */ | 82 | __uint8_t namelen; /* actual name length */ |
| 83 | xfs_dir2_sf_off_t offset; /* saved offset */ | 83 | xfs_dir2_sf_off_t offset; /* saved offset */ |
| 84 | __uint8_t name[1]; /* name, variable size */ | 84 | __uint8_t name[1]; /* name, variable size */ |
| 85 | xfs_dir2_inou_t inumber; /* inode number, var. offset */ | 85 | xfs_dir2_inou_t inumber; /* inode number, var. offset */ |
| 86 | } xfs_dir2_sf_entry_t; | 86 | } __arch_pack xfs_dir2_sf_entry_t; |
| 87 | 87 | ||
| 88 | typedef struct xfs_dir2_sf { | 88 | typedef struct xfs_dir2_sf { |
| 89 | xfs_dir2_sf_hdr_t hdr; /* shortform header */ | 89 | xfs_dir2_sf_hdr_t hdr; /* shortform header */ |
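The __arch_pack annotation added to xfs_dir2_sf_off_t, xfs_dir2_sf_hdr_t and xfs_dir2_sf_entry_t exists because these are on-disk layouts: most architectures already pack them tightly, but ports whose ABI would insert padding (historically ARM with the old OABI) need an explicit packed attribute to match. The macro itself is defined outside the hunks shown here; a plausible form, offered only as an assumption:

	/* assumed definition -- not part of the diff above */
	#if defined(CONFIG_ARM) && !defined(CONFIG_AEABI)
	#define __arch_pack __attribute__((packed))
	#else
	#define __arch_pack
	#endif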
diff --git a/fs/xfs/xfs_dir2_trace.c b/fs/xfs/xfs_dir2_trace.c index f3fb2ffd6f5c..6cc7c0c681ac 100644 --- a/fs/xfs/xfs_dir2_trace.c +++ b/fs/xfs/xfs_dir2_trace.c | |||
| @@ -85,7 +85,8 @@ xfs_dir2_trace_args( | |||
| 85 | (void *)((unsigned long)(args->inumber >> 32)), | 85 | (void *)((unsigned long)(args->inumber >> 32)), |
| 86 | (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), | 86 | (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), |
| 87 | (void *)args->dp, (void *)args->trans, | 87 | (void *)args->dp, (void *)args->trans, |
| 88 | (void *)(unsigned long)args->justcheck, NULL, NULL); | 88 | (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK), |
| 89 | NULL, NULL); | ||
| 89 | } | 90 | } |
| 90 | 91 | ||
| 91 | void | 92 | void |
| @@ -100,7 +101,7 @@ xfs_dir2_trace_args_b( | |||
| 100 | (void *)((unsigned long)(args->inumber >> 32)), | 101 | (void *)((unsigned long)(args->inumber >> 32)), |
| 101 | (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), | 102 | (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), |
| 102 | (void *)args->dp, (void *)args->trans, | 103 | (void *)args->dp, (void *)args->trans, |
| 103 | (void *)(unsigned long)args->justcheck, | 104 | (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK), |
| 104 | (void *)(bp ? bp->bps[0] : NULL), NULL); | 105 | (void *)(bp ? bp->bps[0] : NULL), NULL); |
| 105 | } | 106 | } |
| 106 | 107 | ||
| @@ -117,7 +118,7 @@ xfs_dir2_trace_args_bb( | |||
| 117 | (void *)((unsigned long)(args->inumber >> 32)), | 118 | (void *)((unsigned long)(args->inumber >> 32)), |
| 118 | (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), | 119 | (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), |
| 119 | (void *)args->dp, (void *)args->trans, | 120 | (void *)args->dp, (void *)args->trans, |
| 120 | (void *)(unsigned long)args->justcheck, | 121 | (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK), |
| 121 | (void *)(lbp ? lbp->bps[0] : NULL), | 122 | (void *)(lbp ? lbp->bps[0] : NULL), |
| 122 | (void *)(dbp ? dbp->bps[0] : NULL)); | 123 | (void *)(dbp ? dbp->bps[0] : NULL)); |
| 123 | } | 124 | } |
| @@ -157,8 +158,8 @@ xfs_dir2_trace_args_db( | |||
| 157 | (void *)((unsigned long)(args->inumber >> 32)), | 158 | (void *)((unsigned long)(args->inumber >> 32)), |
| 158 | (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), | 159 | (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), |
| 159 | (void *)args->dp, (void *)args->trans, | 160 | (void *)args->dp, (void *)args->trans, |
| 160 | (void *)(unsigned long)args->justcheck, (void *)(long)db, | 161 | (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK), |
| 161 | (void *)dbp); | 162 | (void *)(long)db, (void *)dbp); |
| 162 | } | 163 | } |
| 163 | 164 | ||
| 164 | void | 165 | void |
| @@ -173,7 +174,7 @@ xfs_dir2_trace_args_i( | |||
| 173 | (void *)((unsigned long)(args->inumber >> 32)), | 174 | (void *)((unsigned long)(args->inumber >> 32)), |
| 174 | (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), | 175 | (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), |
| 175 | (void *)args->dp, (void *)args->trans, | 176 | (void *)args->dp, (void *)args->trans, |
| 176 | (void *)(unsigned long)args->justcheck, | 177 | (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK), |
| 177 | (void *)((unsigned long)(i >> 32)), | 178 | (void *)((unsigned long)(i >> 32)), |
| 178 | (void *)((unsigned long)(i & 0xFFFFFFFF))); | 179 | (void *)((unsigned long)(i & 0xFFFFFFFF))); |
| 179 | } | 180 | } |
| @@ -190,7 +191,8 @@ xfs_dir2_trace_args_s( | |||
| 190 | (void *)((unsigned long)(args->inumber >> 32)), | 191 | (void *)((unsigned long)(args->inumber >> 32)), |
| 191 | (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), | 192 | (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), |
| 192 | (void *)args->dp, (void *)args->trans, | 193 | (void *)args->dp, (void *)args->trans, |
| 193 | (void *)(unsigned long)args->justcheck, (void *)(long)s, NULL); | 194 | (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK), |
| 195 | (void *)(long)s, NULL); | ||
| 194 | } | 196 | } |
| 195 | 197 | ||
| 196 | void | 198 | void |
| @@ -208,7 +210,7 @@ xfs_dir2_trace_args_sb( | |||
| 208 | (void *)((unsigned long)(args->inumber >> 32)), | 210 | (void *)((unsigned long)(args->inumber >> 32)), |
| 209 | (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), | 211 | (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), |
| 210 | (void *)args->dp, (void *)args->trans, | 212 | (void *)args->dp, (void *)args->trans, |
| 211 | (void *)(unsigned long)args->justcheck, (void *)(long)s, | 213 | (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK), |
| 212 | (void *)dbp); | 214 | (void *)(long)s, (void *)dbp); |
| 213 | } | 215 | } |
| 214 | #endif /* XFS_DIR2_TRACE */ | 216 | #endif /* XFS_DIR2_TRACE */ |
diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h index f71784ab6a60..2813cdd72375 100644 --- a/fs/xfs/xfs_dmapi.h +++ b/fs/xfs/xfs_dmapi.h | |||
| @@ -18,7 +18,6 @@ | |||
| 18 | #ifndef __XFS_DMAPI_H__ | 18 | #ifndef __XFS_DMAPI_H__ |
| 19 | #define __XFS_DMAPI_H__ | 19 | #define __XFS_DMAPI_H__ |
| 20 | 20 | ||
| 21 | #include <linux/version.h> | ||
| 22 | /* Values used to define the on-disk version of dm_attrname_t. All | 21 | /* Values used to define the on-disk version of dm_attrname_t. All |
| 23 | * on-disk attribute names start with the 8-byte string "SGI_DMI_". | 22 | * on-disk attribute names start with the 8-byte string "SGI_DMI_". |
| 24 | * | 23 | * |
| @@ -166,6 +165,6 @@ typedef enum { | |||
| 166 | 165 | ||
| 167 | #define FILP_DELAY_FLAG(filp) ((filp->f_flags&(O_NDELAY|O_NONBLOCK)) ? \ | 166 | #define FILP_DELAY_FLAG(filp) ((filp->f_flags&(O_NDELAY|O_NONBLOCK)) ? \ |
| 168 | DM_FLAGS_NDELAY : 0) | 167 | DM_FLAGS_NDELAY : 0) |
| 169 | #define AT_DELAY_FLAG(f) ((f&ATTR_NONBLOCK) ? DM_FLAGS_NDELAY : 0) | 168 | #define AT_DELAY_FLAG(f) ((f & XFS_ATTR_NONBLOCK) ? DM_FLAGS_NDELAY : 0) |
| 170 | 169 | ||
| 171 | #endif /* __XFS_DMAPI_H__ */ | 170 | #endif /* __XFS_DMAPI_H__ */ |
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 05e5365d3c31..f227ecd1a294 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c | |||
| @@ -58,22 +58,11 @@ xfs_error_trap(int e) | |||
| 58 | } | 58 | } |
| 59 | return e; | 59 | return e; |
| 60 | } | 60 | } |
| 61 | #endif | ||
| 62 | |||
| 63 | #if (defined(DEBUG) || defined(INDUCE_IO_ERROR)) | ||
| 64 | 61 | ||
| 65 | int xfs_etest[XFS_NUM_INJECT_ERROR]; | 62 | int xfs_etest[XFS_NUM_INJECT_ERROR]; |
| 66 | int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR]; | 63 | int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR]; |
| 67 | char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR]; | 64 | char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR]; |
| 68 | 65 | ||
| 69 | void | ||
| 70 | xfs_error_test_init(void) | ||
| 71 | { | ||
| 72 | memset(xfs_etest, 0, sizeof(xfs_etest)); | ||
| 73 | memset(xfs_etest_fsid, 0, sizeof(xfs_etest_fsid)); | ||
| 74 | memset(xfs_etest_fsname, 0, sizeof(xfs_etest_fsname)); | ||
| 75 | } | ||
| 76 | |||
| 77 | int | 66 | int |
| 78 | xfs_error_test(int error_tag, int *fsidp, char *expression, | 67 | xfs_error_test(int error_tag, int *fsidp, char *expression, |
| 79 | int line, char *file, unsigned long randfactor) | 68 | int line, char *file, unsigned long randfactor) |
| @@ -150,8 +139,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud) | |||
| 150 | xfs_etest[i]); | 139 | xfs_etest[i]); |
| 151 | xfs_etest[i] = 0; | 140 | xfs_etest[i] = 0; |
| 152 | xfs_etest_fsid[i] = 0LL; | 141 | xfs_etest_fsid[i] = 0LL; |
| 153 | kmem_free(xfs_etest_fsname[i], | 142 | kmem_free(xfs_etest_fsname[i]); |
| 154 | strlen(xfs_etest_fsname[i]) + 1); | ||
| 155 | xfs_etest_fsname[i] = NULL; | 143 | xfs_etest_fsname[i] = NULL; |
| 156 | } | 144 | } |
| 157 | } | 145 | } |
| @@ -163,7 +151,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud) | |||
| 163 | 151 | ||
| 164 | return 0; | 152 | return 0; |
| 165 | } | 153 | } |
| 166 | #endif /* DEBUG || INDUCE_IO_ERROR */ | 154 | #endif /* DEBUG */ |
| 167 | 155 | ||
| 168 | static void | 156 | static void |
| 169 | xfs_fs_vcmn_err(int level, xfs_mount_t *mp, char *fmt, va_list ap) | 157 | xfs_fs_vcmn_err(int level, xfs_mount_t *mp, char *fmt, va_list ap) |
| @@ -175,7 +163,7 @@ xfs_fs_vcmn_err(int level, xfs_mount_t *mp, char *fmt, va_list ap) | |||
| 175 | newfmt = kmem_alloc(len, KM_SLEEP); | 163 | newfmt = kmem_alloc(len, KM_SLEEP); |
| 176 | sprintf(newfmt, "Filesystem \"%s\": %s", mp->m_fsname, fmt); | 164 | sprintf(newfmt, "Filesystem \"%s\": %s", mp->m_fsname, fmt); |
| 177 | icmn_err(level, newfmt, ap); | 165 | icmn_err(level, newfmt, ap); |
| 178 | kmem_free(newfmt, len); | 166 | kmem_free(newfmt); |
| 179 | } else { | 167 | } else { |
| 180 | icmn_err(level, fmt, ap); | 168 | icmn_err(level, fmt, ap); |
| 181 | } | 169 | } |
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 6490d2a9f8e1..11543f10b0c6 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h | |||
| @@ -125,23 +125,14 @@ extern void xfs_corruption_error(char *tag, int level, struct xfs_mount *mp, | |||
| 125 | #define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10) | 125 | #define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10) |
| 126 | #define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT | 126 | #define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT |
| 127 | 127 | ||
| 128 | #if (defined(DEBUG) || defined(INDUCE_IO_ERROR)) | 128 | #ifdef DEBUG |
| 129 | extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); | 129 | extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); |
| 130 | extern void xfs_error_test_init(void); | ||
| 131 | 130 | ||
| 132 | #define XFS_NUM_INJECT_ERROR 10 | 131 | #define XFS_NUM_INJECT_ERROR 10 |
| 133 | |||
| 134 | #ifdef __ANSI_CPP__ | ||
| 135 | #define XFS_TEST_ERROR(expr, mp, tag, rf) \ | ||
| 136 | ((expr) || \ | ||
| 137 | xfs_error_test((tag), (mp)->m_fixedfsid, #expr, __LINE__, __FILE__, \ | ||
| 138 | (rf))) | ||
| 139 | #else | ||
| 140 | #define XFS_TEST_ERROR(expr, mp, tag, rf) \ | 132 | #define XFS_TEST_ERROR(expr, mp, tag, rf) \ |
| 141 | ((expr) || \ | 133 | ((expr) || \ |
| 142 | xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ | 134 | xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ |
| 143 | (rf))) | 135 | (rf))) |
| 144 | #endif /* __ANSI_CPP__ */ | ||
| 145 | 136 | ||
| 146 | extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); | 137 | extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); |
| 147 | extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); | 138 | extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); |
| @@ -149,7 +140,7 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); | |||
| 149 | #define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) | 140 | #define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) |
| 150 | #define xfs_errortag_add(tag, mp) (ENOSYS) | 141 | #define xfs_errortag_add(tag, mp) (ENOSYS) |
| 151 | #define xfs_errortag_clearall(mp, loud) (ENOSYS) | 142 | #define xfs_errortag_clearall(mp, loud) (ENOSYS) |
| 152 | #endif /* (DEBUG || INDUCE_IO_ERROR) */ | 143 | #endif /* DEBUG */ |
| 153 | 144 | ||
| 154 | /* | 145 | /* |
| 155 | * XFS panic tags -- allow a call to xfs_cmn_err() be turned into | 146 | * XFS panic tags -- allow a call to xfs_cmn_err() be turned into |
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 132bd07b9bb8..8aa28f751b2a 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c | |||
| @@ -41,8 +41,7 @@ xfs_efi_item_free(xfs_efi_log_item_t *efip) | |||
| 41 | int nexts = efip->efi_format.efi_nextents; | 41 | int nexts = efip->efi_format.efi_nextents; |
| 42 | 42 | ||
| 43 | if (nexts > XFS_EFI_MAX_FAST_EXTENTS) { | 43 | if (nexts > XFS_EFI_MAX_FAST_EXTENTS) { |
| 44 | kmem_free(efip, sizeof(xfs_efi_log_item_t) + | 44 | kmem_free(efip); |
| 45 | (nexts - 1) * sizeof(xfs_extent_t)); | ||
| 46 | } else { | 45 | } else { |
| 47 | kmem_zone_free(xfs_efi_zone, efip); | 46 | kmem_zone_free(xfs_efi_zone, efip); |
| 48 | } | 47 | } |
| @@ -374,8 +373,7 @@ xfs_efd_item_free(xfs_efd_log_item_t *efdp) | |||
| 374 | int nexts = efdp->efd_format.efd_nextents; | 373 | int nexts = efdp->efd_format.efd_nextents; |
| 375 | 374 | ||
| 376 | if (nexts > XFS_EFD_MAX_FAST_EXTENTS) { | 375 | if (nexts > XFS_EFD_MAX_FAST_EXTENTS) { |
| 377 | kmem_free(efdp, sizeof(xfs_efd_log_item_t) + | 376 | kmem_free(efdp); |
| 378 | (nexts - 1) * sizeof(xfs_extent_t)); | ||
| 379 | } else { | 377 | } else { |
| 380 | kmem_zone_free(xfs_efd_zone, efdp); | 378 | kmem_zone_free(xfs_efd_zone, efdp); |
| 381 | } | 379 | } |
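Several files in this section (xfs_dir2_sf.c, xfs_error.c, xfs_extfree_item.c, and more below) drop the size argument from kmem_free(). That is only possible if the helper can tell on its own how the memory was obtained; a minimal sketch of the single-argument form, assuming it keys off is_vmalloc_addr() the way the generic allocators allow:

	#include <linux/mm.h>		/* is_vmalloc_addr() */
	#include <linux/slab.h>		/* kfree() */
	#include <linux/vmalloc.h>	/* vfree() */

	/* sketch of the new helper; the real one lives in fs/xfs/linux-2.6/kmem.c */
	void
	kmem_free(const void *ptr)
	{
		if (!is_vmalloc_addr(ptr))
			kfree(ptr);
		else
			vfree(ptr);
	}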
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 3f3785b10804..f3bb75da384e 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c | |||
| @@ -397,10 +397,12 @@ int | |||
| 397 | xfs_filestream_init(void) | 397 | xfs_filestream_init(void) |
| 398 | { | 398 | { |
| 399 | item_zone = kmem_zone_init(sizeof(fstrm_item_t), "fstrm_item"); | 399 | item_zone = kmem_zone_init(sizeof(fstrm_item_t), "fstrm_item"); |
| 400 | if (!item_zone) | ||
| 401 | return -ENOMEM; | ||
| 400 | #ifdef XFS_FILESTREAMS_TRACE | 402 | #ifdef XFS_FILESTREAMS_TRACE |
| 401 | xfs_filestreams_trace_buf = ktrace_alloc(XFS_FSTRM_KTRACE_SIZE, KM_SLEEP); | 403 | xfs_filestreams_trace_buf = ktrace_alloc(XFS_FSTRM_KTRACE_SIZE, KM_NOFS); |
| 402 | #endif | 404 | #endif |
| 403 | return item_zone ? 0 : -ENOMEM; | 405 | return 0; |
| 404 | } | 406 | } |
| 405 | 407 | ||
| 406 | /* | 408 | /* |
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 3bed6433d050..01c0cc88d3f3 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h | |||
| @@ -239,6 +239,7 @@ typedef struct xfs_fsop_resblks { | |||
| 239 | #define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */ | 239 | #define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */ |
| 240 | #define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */ | 240 | #define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */ |
| 241 | #define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */ | 241 | #define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */ |
| 242 | #define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ | ||
| 242 | #define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ | 243 | #define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ |
| 243 | 244 | ||
| 244 | 245 | ||
| @@ -371,6 +372,9 @@ typedef struct xfs_fsop_attrlist_handlereq { | |||
| 371 | 372 | ||
| 372 | typedef struct xfs_attr_multiop { | 373 | typedef struct xfs_attr_multiop { |
| 373 | __u32 am_opcode; | 374 | __u32 am_opcode; |
| 375 | #define ATTR_OP_GET 1 /* return the indicated attr's value */ | ||
| 376 | #define ATTR_OP_SET 2 /* set/create the indicated attr/value pair */ | ||
| 377 | #define ATTR_OP_REMOVE 3 /* remove the indicated attr */ | ||
| 374 | __s32 am_error; | 378 | __s32 am_error; |
| 375 | void __user *am_attrname; | 379 | void __user *am_attrname; |
| 376 | void __user *am_attrvalue; | 380 | void __user *am_attrvalue; |
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 381ebda4f7bc..84583cf73db3 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c | |||
| @@ -95,6 +95,8 @@ xfs_fs_geometry( | |||
| 95 | XFS_FSOP_GEOM_FLAGS_DIRV2 : 0) | | 95 | XFS_FSOP_GEOM_FLAGS_DIRV2 : 0) | |
| 96 | (xfs_sb_version_hassector(&mp->m_sb) ? | 96 | (xfs_sb_version_hassector(&mp->m_sb) ? |
| 97 | XFS_FSOP_GEOM_FLAGS_SECTOR : 0) | | 97 | XFS_FSOP_GEOM_FLAGS_SECTOR : 0) | |
| 98 | (xfs_sb_version_hasasciici(&mp->m_sb) ? | ||
| 99 | XFS_FSOP_GEOM_FLAGS_DIRV2CI : 0) | | ||
| 98 | (xfs_sb_version_haslazysbcount(&mp->m_sb) ? | 100 | (xfs_sb_version_haslazysbcount(&mp->m_sb) ? |
| 99 | XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) | | 101 | XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) | |
| 100 | (xfs_sb_version_hasattr2(&mp->m_sb) ? | 102 | (xfs_sb_version_hasattr2(&mp->m_sb) ? |
| @@ -625,7 +627,7 @@ xfs_fs_goingdown( | |||
| 625 | xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); | 627 | xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); |
| 626 | thaw_bdev(sb->s_bdev, sb); | 628 | thaw_bdev(sb->s_bdev, sb); |
| 627 | } | 629 | } |
| 628 | 630 | ||
| 629 | break; | 631 | break; |
| 630 | } | 632 | } |
| 631 | case XFS_FSOP_GOING_FLAGS_LOGFLUSH: | 633 | case XFS_FSOP_GOING_FLAGS_LOGFLUSH: |
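With the new XFS_FSOP_GEOM_FLAGS_DIRV2CI bit, xfs_fs_geometry() lets userspace discover whether a filesystem was made with ASCII-only case-insensitive directories. A small hedged example of checking it through the standard geometry ioctl; the names come from the XFS userspace headers, so verify them against your xfsprogs version:

	#include <sys/ioctl.h>
	#include <xfs/xfs.h>	/* xfs_fsop_geom_t, XFS_IOC_FSGEOMETRY, flag bits */

	/* Returns 1 if directories are ASCII-CI, 0 if not, -1 on error. */
	static int xfs_dir_is_ci(int fd)
	{
		xfs_fsop_geom_t geo;

		if (ioctl(fd, XFS_IOC_FSGEOMETRY, &geo) < 0)
			return -1;
		return (geo.flags & XFS_FSOP_GEOM_FLAGS_DIRV2CI) != 0;
	}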
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index e5310c90e50f..83502f3edef0 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c | |||
| @@ -181,7 +181,7 @@ xfs_inobt_delrec( | |||
| 181 | * then we can get rid of this level. | 181 | * then we can get rid of this level. |
| 182 | */ | 182 | */ |
| 183 | if (numrecs == 1 && level > 0) { | 183 | if (numrecs == 1 && level > 0) { |
| 184 | agbp = cur->bc_private.i.agbp; | 184 | agbp = cur->bc_private.a.agbp; |
| 185 | agi = XFS_BUF_TO_AGI(agbp); | 185 | agi = XFS_BUF_TO_AGI(agbp); |
| 186 | /* | 186 | /* |
| 187 | * pp is still set to the first pointer in the block. | 187 | * pp is still set to the first pointer in the block. |
| @@ -194,7 +194,7 @@ xfs_inobt_delrec( | |||
| 194 | * Free the block. | 194 | * Free the block. |
| 195 | */ | 195 | */ |
| 196 | if ((error = xfs_free_extent(cur->bc_tp, | 196 | if ((error = xfs_free_extent(cur->bc_tp, |
| 197 | XFS_AGB_TO_FSB(mp, cur->bc_private.i.agno, bno), 1))) | 197 | XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno, bno), 1))) |
| 198 | return error; | 198 | return error; |
| 199 | xfs_trans_binval(cur->bc_tp, bp); | 199 | xfs_trans_binval(cur->bc_tp, bp); |
| 200 | xfs_ialloc_log_agi(cur->bc_tp, agbp, | 200 | xfs_ialloc_log_agi(cur->bc_tp, agbp, |
| @@ -379,7 +379,7 @@ xfs_inobt_delrec( | |||
| 379 | rrecs = be16_to_cpu(right->bb_numrecs); | 379 | rrecs = be16_to_cpu(right->bb_numrecs); |
| 380 | rbp = bp; | 380 | rbp = bp; |
| 381 | if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, | 381 | if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, |
| 382 | cur->bc_private.i.agno, lbno, 0, &lbp, | 382 | cur->bc_private.a.agno, lbno, 0, &lbp, |
| 383 | XFS_INO_BTREE_REF))) | 383 | XFS_INO_BTREE_REF))) |
| 384 | return error; | 384 | return error; |
| 385 | left = XFS_BUF_TO_INOBT_BLOCK(lbp); | 385 | left = XFS_BUF_TO_INOBT_BLOCK(lbp); |
| @@ -401,7 +401,7 @@ xfs_inobt_delrec( | |||
| 401 | lrecs = be16_to_cpu(left->bb_numrecs); | 401 | lrecs = be16_to_cpu(left->bb_numrecs); |
| 402 | lbp = bp; | 402 | lbp = bp; |
| 403 | if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, | 403 | if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, |
| 404 | cur->bc_private.i.agno, rbno, 0, &rbp, | 404 | cur->bc_private.a.agno, rbno, 0, &rbp, |
| 405 | XFS_INO_BTREE_REF))) | 405 | XFS_INO_BTREE_REF))) |
| 406 | return error; | 406 | return error; |
| 407 | right = XFS_BUF_TO_INOBT_BLOCK(rbp); | 407 | right = XFS_BUF_TO_INOBT_BLOCK(rbp); |
| @@ -484,7 +484,7 @@ xfs_inobt_delrec( | |||
| 484 | xfs_buf_t *rrbp; | 484 | xfs_buf_t *rrbp; |
| 485 | 485 | ||
| 486 | if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, | 486 | if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, |
| 487 | cur->bc_private.i.agno, be32_to_cpu(left->bb_rightsib), 0, | 487 | cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), 0, |
| 488 | &rrbp, XFS_INO_BTREE_REF))) | 488 | &rrbp, XFS_INO_BTREE_REF))) |
| 489 | return error; | 489 | return error; |
| 490 | rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp); | 490 | rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp); |
| @@ -497,7 +497,7 @@ xfs_inobt_delrec( | |||
| 497 | * Free the deleting block. | 497 | * Free the deleting block. |
| 498 | */ | 498 | */ |
| 499 | if ((error = xfs_free_extent(cur->bc_tp, XFS_AGB_TO_FSB(mp, | 499 | if ((error = xfs_free_extent(cur->bc_tp, XFS_AGB_TO_FSB(mp, |
| 500 | cur->bc_private.i.agno, rbno), 1))) | 500 | cur->bc_private.a.agno, rbno), 1))) |
| 501 | return error; | 501 | return error; |
| 502 | xfs_trans_binval(cur->bc_tp, rbp); | 502 | xfs_trans_binval(cur->bc_tp, rbp); |
| 503 | /* | 503 | /* |
| @@ -854,7 +854,7 @@ xfs_inobt_lookup( | |||
| 854 | { | 854 | { |
| 855 | xfs_agi_t *agi; /* a.g. inode header */ | 855 | xfs_agi_t *agi; /* a.g. inode header */ |
| 856 | 856 | ||
| 857 | agi = XFS_BUF_TO_AGI(cur->bc_private.i.agbp); | 857 | agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); |
| 858 | agno = be32_to_cpu(agi->agi_seqno); | 858 | agno = be32_to_cpu(agi->agi_seqno); |
| 859 | agbno = be32_to_cpu(agi->agi_root); | 859 | agbno = be32_to_cpu(agi->agi_root); |
| 860 | } | 860 | } |
| @@ -1089,7 +1089,7 @@ xfs_inobt_lshift( | |||
| 1089 | * Set up the left neighbor as "left". | 1089 | * Set up the left neighbor as "left". |
| 1090 | */ | 1090 | */ |
| 1091 | if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, | 1091 | if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, |
| 1092 | cur->bc_private.i.agno, be32_to_cpu(right->bb_leftsib), | 1092 | cur->bc_private.a.agno, be32_to_cpu(right->bb_leftsib), |
| 1093 | 0, &lbp, XFS_INO_BTREE_REF))) | 1093 | 0, &lbp, XFS_INO_BTREE_REF))) |
| 1094 | return error; | 1094 | return error; |
| 1095 | left = XFS_BUF_TO_INOBT_BLOCK(lbp); | 1095 | left = XFS_BUF_TO_INOBT_BLOCK(lbp); |
| @@ -1207,10 +1207,10 @@ xfs_inobt_newroot( | |||
| 1207 | /* | 1207 | /* |
| 1208 | * Get a block & a buffer. | 1208 | * Get a block & a buffer. |
| 1209 | */ | 1209 | */ |
| 1210 | agi = XFS_BUF_TO_AGI(cur->bc_private.i.agbp); | 1210 | agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); |
| 1211 | args.tp = cur->bc_tp; | 1211 | args.tp = cur->bc_tp; |
| 1212 | args.mp = cur->bc_mp; | 1212 | args.mp = cur->bc_mp; |
| 1213 | args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.i.agno, | 1213 | args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, |
| 1214 | be32_to_cpu(agi->agi_root)); | 1214 | be32_to_cpu(agi->agi_root)); |
| 1215 | args.mod = args.minleft = args.alignment = args.total = args.wasdel = | 1215 | args.mod = args.minleft = args.alignment = args.total = args.wasdel = |
| 1216 | args.isfl = args.userdata = args.minalignslop = 0; | 1216 | args.isfl = args.userdata = args.minalignslop = 0; |
| @@ -1233,7 +1233,7 @@ xfs_inobt_newroot( | |||
| 1233 | */ | 1233 | */ |
| 1234 | agi->agi_root = cpu_to_be32(args.agbno); | 1234 | agi->agi_root = cpu_to_be32(args.agbno); |
| 1235 | be32_add_cpu(&agi->agi_level, 1); | 1235 | be32_add_cpu(&agi->agi_level, 1); |
| 1236 | xfs_ialloc_log_agi(args.tp, cur->bc_private.i.agbp, | 1236 | xfs_ialloc_log_agi(args.tp, cur->bc_private.a.agbp, |
| 1237 | XFS_AGI_ROOT | XFS_AGI_LEVEL); | 1237 | XFS_AGI_ROOT | XFS_AGI_LEVEL); |
| 1238 | /* | 1238 | /* |
| 1239 | * At the previous root level there are now two blocks: the old | 1239 | * At the previous root level there are now two blocks: the old |
| @@ -1376,7 +1376,7 @@ xfs_inobt_rshift( | |||
| 1376 | * Set up the right neighbor as "right". | 1376 | * Set up the right neighbor as "right". |
| 1377 | */ | 1377 | */ |
| 1378 | if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, | 1378 | if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, |
| 1379 | cur->bc_private.i.agno, be32_to_cpu(left->bb_rightsib), | 1379 | cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), |
| 1380 | 0, &rbp, XFS_INO_BTREE_REF))) | 1380 | 0, &rbp, XFS_INO_BTREE_REF))) |
| 1381 | return error; | 1381 | return error; |
| 1382 | right = XFS_BUF_TO_INOBT_BLOCK(rbp); | 1382 | right = XFS_BUF_TO_INOBT_BLOCK(rbp); |
| @@ -1492,7 +1492,7 @@ xfs_inobt_split( | |||
| 1492 | * Allocate the new block. | 1492 | * Allocate the new block. |
| 1493 | * If we can't do it, we're toast. Give up. | 1493 | * If we can't do it, we're toast. Give up. |
| 1494 | */ | 1494 | */ |
| 1495 | args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.i.agno, lbno); | 1495 | args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, lbno); |
| 1496 | args.mod = args.minleft = args.alignment = args.total = args.wasdel = | 1496 | args.mod = args.minleft = args.alignment = args.total = args.wasdel = |
| 1497 | args.isfl = args.userdata = args.minalignslop = 0; | 1497 | args.isfl = args.userdata = args.minalignslop = 0; |
| 1498 | args.minlen = args.maxlen = args.prod = 1; | 1498 | args.minlen = args.maxlen = args.prod = 1; |
| @@ -1725,7 +1725,7 @@ xfs_inobt_decrement( | |||
| 1725 | 1725 | ||
| 1726 | agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur)); | 1726 | agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur)); |
| 1727 | if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, | 1727 | if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, |
| 1728 | cur->bc_private.i.agno, agbno, 0, &bp, | 1728 | cur->bc_private.a.agno, agbno, 0, &bp, |
| 1729 | XFS_INO_BTREE_REF))) | 1729 | XFS_INO_BTREE_REF))) |
| 1730 | return error; | 1730 | return error; |
| 1731 | lev--; | 1731 | lev--; |
| @@ -1897,7 +1897,7 @@ xfs_inobt_increment( | |||
| 1897 | 1897 | ||
| 1898 | agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur)); | 1898 | agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur)); |
| 1899 | if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, | 1899 | if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, |
| 1900 | cur->bc_private.i.agno, agbno, 0, &bp, | 1900 | cur->bc_private.a.agno, agbno, 0, &bp, |
| 1901 | XFS_INO_BTREE_REF))) | 1901 | XFS_INO_BTREE_REF))) |
| 1902 | return error; | 1902 | return error; |
| 1903 | lev--; | 1903 | lev--; |
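All of the xfs_ialloc_btree.c hunks above are one mechanical rename: the inode-btree cursor stops keeping its own bc_private.i member and reuses the same per-AG private area (bc_private.a) as the allocation btrees, since both only need the AG header buffer and the AG number. The union presumably ends up shaped roughly like this; the exact field list belongs to xfs_btree.h and is an assumption here:

	/* assumed shape of the cursor private data after the rename */
	union {
		struct {			/* per-AG btrees: alloc (AGF) and inobt (AGI) */
			struct xfs_buf	*agbp;	/* AG header buffer */
			xfs_agnumber_t	agno;	/* allocation group number */
		} a;
		struct {			/* inode-rooted (bmap) btree */
			struct xfs_inode *ip;	/* owning inode */
			/* ... bmap-only state elided ... */
		} b;
	} bc_private;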
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index b07604b94d9f..e229e9e001c2 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c | |||
| @@ -216,7 +216,14 @@ finish_inode: | |||
| 216 | mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); | 216 | mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); |
| 217 | init_waitqueue_head(&ip->i_ipin_wait); | 217 | init_waitqueue_head(&ip->i_ipin_wait); |
| 218 | atomic_set(&ip->i_pincount, 0); | 218 | atomic_set(&ip->i_pincount, 0); |
| 219 | initnsema(&ip->i_flock, 1, "xfsfino"); | 219 | |
| 220 | /* | ||
| 221 | * Because we want to use a counting completion, complete | ||
| 222 | * the flush completion once to allow a single access to | ||
| 223 | * the flush completion without blocking. | ||
| 224 | */ | ||
| 225 | init_completion(&ip->i_flush); | ||
| 226 | complete(&ip->i_flush); | ||
| 220 | 227 | ||
| 221 | if (lock_flags) | 228 | if (lock_flags) |
| 222 | xfs_ilock(ip, lock_flags); | 229 | xfs_ilock(ip, lock_flags); |
| @@ -288,10 +295,17 @@ finish_inode: | |||
| 288 | *ipp = ip; | 295 | *ipp = ip; |
| 289 | 296 | ||
| 290 | /* | 297 | /* |
| 298 | * Link the XFS inode with the Linux inode. | ||
| 299 | */ | ||
| 300 | ip->i_vnode = inode; | ||
| 301 | inode->i_private = ip; | ||
| 302 | |||
| 303 | /* | ||
| 291 | * If we have a real type for an on-disk inode, we can set ops(&unlock) | 304 | * If we have a real type for an on-disk inode, we can set ops(&unlock) |
| 292 | * now. If it's a new inode being created, xfs_ialloc will handle it. | 305 | * now. If it's a new inode being created, xfs_ialloc will handle it. |
| 293 | */ | 306 | */ |
| 294 | xfs_initialize_vnode(mp, inode, ip); | 307 | if (ip->i_d.di_mode != 0) |
| 308 | xfs_setup_inode(ip); | ||
| 295 | return 0; | 309 | return 0; |
| 296 | } | 310 | } |
| 297 | 311 | ||
| @@ -411,10 +425,11 @@ xfs_iput(xfs_inode_t *ip, | |||
| 411 | * Special iput for brand-new inodes that are still locked | 425 | * Special iput for brand-new inodes that are still locked |
| 412 | */ | 426 | */ |
| 413 | void | 427 | void |
| 414 | xfs_iput_new(xfs_inode_t *ip, | 428 | xfs_iput_new( |
| 415 | uint lock_flags) | 429 | xfs_inode_t *ip, |
| 430 | uint lock_flags) | ||
| 416 | { | 431 | { |
| 417 | struct inode *inode = ip->i_vnode; | 432 | struct inode *inode = VFS_I(ip); |
| 418 | 433 | ||
| 419 | xfs_itrace_entry(ip); | 434 | xfs_itrace_entry(ip); |
| 420 | 435 | ||
| @@ -775,26 +790,3 @@ xfs_isilocked( | |||
| 775 | } | 790 | } |
| 776 | #endif | 791 | #endif |
| 777 | 792 | ||
| 778 | /* | ||
| 779 | * The following three routines simply manage the i_flock | ||
| 780 | * semaphore embedded in the inode. This semaphore synchronizes | ||
| 781 | * processes attempting to flush the in-core inode back to disk. | ||
| 782 | */ | ||
| 783 | void | ||
| 784 | xfs_iflock(xfs_inode_t *ip) | ||
| 785 | { | ||
| 786 | psema(&(ip->i_flock), PINOD|PLTWAIT); | ||
| 787 | } | ||
| 788 | |||
| 789 | int | ||
| 790 | xfs_iflock_nowait(xfs_inode_t *ip) | ||
| 791 | { | ||
| 792 | return (cpsema(&(ip->i_flock))); | ||
| 793 | } | ||
| 794 | |||
| 795 | void | ||
| 796 | xfs_ifunlock(xfs_inode_t *ip) | ||
| 797 | { | ||
| 798 | ASSERT(issemalocked(&(ip->i_flock))); | ||
| 799 | vsema(&(ip->i_flock)); | ||
| 800 | } | ||
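The xfs_iget.c hunk above swaps the old i_flock semaphore for a counting completion (i_flush) that is completed once at initialisation, and deletes the psema/vsema based xfs_iflock helpers. Their replacements are not visible in this section; presumably they become thin inlines over the completion API, roughly like this sketch:

	#include <linux/completion.h>

	/* sketch only -- the real helpers most likely live in xfs_inode.h */
	static inline void xfs_iflock(xfs_inode_t *ip)
	{
		wait_for_completion(&ip->i_flush);	/* take the single token */
	}

	static inline int xfs_iflock_nowait(xfs_inode_t *ip)
	{
		return try_wait_for_completion(&ip->i_flush);
	}

	static inline void xfs_ifunlock(xfs_inode_t *ip)
	{
		complete(&ip->i_flush);			/* give the token back */
	}

Completing the completion once up front is what lets the first xfs_iflock() succeed without blocking, matching the semantics of the old semaphore initialised to 1.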
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index e569bf5d6cf0..00e80df9dd9d 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c | |||
| @@ -580,8 +580,8 @@ xfs_iformat_extents( | |||
| 580 | xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip)); | 580 | xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip)); |
| 581 | for (i = 0; i < nex; i++, dp++) { | 581 | for (i = 0; i < nex; i++, dp++) { |
| 582 | xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); | 582 | xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); |
| 583 | ep->l0 = be64_to_cpu(get_unaligned(&dp->l0)); | 583 | ep->l0 = get_unaligned_be64(&dp->l0); |
| 584 | ep->l1 = be64_to_cpu(get_unaligned(&dp->l1)); | 584 | ep->l1 = get_unaligned_be64(&dp->l1); |
| 585 | } | 585 | } |
| 586 | XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork); | 586 | XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork); |
| 587 | if (whichfork != XFS_DATA_FORK || | 587 | if (whichfork != XFS_DATA_FORK || |
| @@ -835,22 +835,22 @@ xfs_iread( | |||
| 835 | * Do this before xfs_iformat in case it adds entries. | 835 | * Do this before xfs_iformat in case it adds entries. |
| 836 | */ | 836 | */ |
| 837 | #ifdef XFS_INODE_TRACE | 837 | #ifdef XFS_INODE_TRACE |
| 838 | ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_SLEEP); | 838 | ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_NOFS); |
| 839 | #endif | 839 | #endif |
| 840 | #ifdef XFS_BMAP_TRACE | 840 | #ifdef XFS_BMAP_TRACE |
| 841 | ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_SLEEP); | 841 | ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_NOFS); |
| 842 | #endif | 842 | #endif |
| 843 | #ifdef XFS_BMBT_TRACE | 843 | #ifdef XFS_BMBT_TRACE |
| 844 | ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_SLEEP); | 844 | ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_NOFS); |
| 845 | #endif | 845 | #endif |
| 846 | #ifdef XFS_RW_TRACE | 846 | #ifdef XFS_RW_TRACE |
| 847 | ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_SLEEP); | 847 | ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_NOFS); |
| 848 | #endif | 848 | #endif |
| 849 | #ifdef XFS_ILOCK_TRACE | 849 | #ifdef XFS_ILOCK_TRACE |
| 850 | ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_SLEEP); | 850 | ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_NOFS); |
| 851 | #endif | 851 | #endif |
| 852 | #ifdef XFS_DIR2_TRACE | 852 | #ifdef XFS_DIR2_TRACE |
| 853 | ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_SLEEP); | 853 | ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS); |
| 854 | #endif | 854 | #endif |
| 855 | 855 | ||
| 856 | /* | 856 | /* |
| @@ -1046,9 +1046,9 @@ xfs_ialloc( | |||
| 1046 | { | 1046 | { |
| 1047 | xfs_ino_t ino; | 1047 | xfs_ino_t ino; |
| 1048 | xfs_inode_t *ip; | 1048 | xfs_inode_t *ip; |
| 1049 | bhv_vnode_t *vp; | ||
| 1050 | uint flags; | 1049 | uint flags; |
| 1051 | int error; | 1050 | int error; |
| 1051 | timespec_t tv; | ||
| 1052 | 1052 | ||
| 1053 | /* | 1053 | /* |
| 1054 | * Call the space management code to pick | 1054 | * Call the space management code to pick |
| @@ -1077,13 +1077,12 @@ xfs_ialloc( | |||
| 1077 | } | 1077 | } |
| 1078 | ASSERT(ip != NULL); | 1078 | ASSERT(ip != NULL); |
| 1079 | 1079 | ||
| 1080 | vp = XFS_ITOV(ip); | ||
| 1081 | ip->i_d.di_mode = (__uint16_t)mode; | 1080 | ip->i_d.di_mode = (__uint16_t)mode; |
| 1082 | ip->i_d.di_onlink = 0; | 1081 | ip->i_d.di_onlink = 0; |
| 1083 | ip->i_d.di_nlink = nlink; | 1082 | ip->i_d.di_nlink = nlink; |
| 1084 | ASSERT(ip->i_d.di_nlink == nlink); | 1083 | ASSERT(ip->i_d.di_nlink == nlink); |
| 1085 | ip->i_d.di_uid = current_fsuid(cr); | 1084 | ip->i_d.di_uid = current_fsuid(); |
| 1086 | ip->i_d.di_gid = current_fsgid(cr); | 1085 | ip->i_d.di_gid = current_fsgid(); |
| 1087 | ip->i_d.di_projid = prid; | 1086 | ip->i_d.di_projid = prid; |
| 1088 | memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); | 1087 | memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); |
| 1089 | 1088 | ||
| @@ -1130,7 +1129,13 @@ xfs_ialloc( | |||
| 1130 | ip->i_size = 0; | 1129 | ip->i_size = 0; |
| 1131 | ip->i_d.di_nextents = 0; | 1130 | ip->i_d.di_nextents = 0; |
| 1132 | ASSERT(ip->i_d.di_nblocks == 0); | 1131 | ASSERT(ip->i_d.di_nblocks == 0); |
| 1133 | xfs_ichgtime(ip, XFS_ICHGTIME_CHG|XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD); | 1132 | |
| 1133 | nanotime(&tv); | ||
| 1134 | ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; | ||
| 1135 | ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; | ||
| 1136 | ip->i_d.di_atime = ip->i_d.di_mtime; | ||
| 1137 | ip->i_d.di_ctime = ip->i_d.di_mtime; | ||
| 1138 | |||
| 1134 | /* | 1139 | /* |
| 1135 | * di_gen will have been taken care of in xfs_iread. | 1140 | * di_gen will have been taken care of in xfs_iread. |
| 1136 | */ | 1141 | */ |
| @@ -1220,7 +1225,7 @@ xfs_ialloc( | |||
| 1220 | xfs_trans_log_inode(tp, ip, flags); | 1225 | xfs_trans_log_inode(tp, ip, flags); |
| 1221 | 1226 | ||
| 1222 | /* now that we have an i_mode we can setup inode ops and unlock */ | 1227 | /* now that we have an i_mode we can setup inode ops and unlock */ |
| 1223 | xfs_initialize_vnode(tp->t_mountp, vp, ip); | 1228 | xfs_setup_inode(ip); |
| 1224 | 1229 | ||
| 1225 | *ipp = ip; | 1230 | *ipp = ip; |
| 1226 | return 0; | 1231 | return 0; |
| @@ -1399,7 +1404,6 @@ xfs_itruncate_start( | |||
| 1399 | xfs_fsize_t last_byte; | 1404 | xfs_fsize_t last_byte; |
| 1400 | xfs_off_t toss_start; | 1405 | xfs_off_t toss_start; |
| 1401 | xfs_mount_t *mp; | 1406 | xfs_mount_t *mp; |
| 1402 | bhv_vnode_t *vp; | ||
| 1403 | int error = 0; | 1407 | int error = 0; |
| 1404 | 1408 | ||
| 1405 | ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); | 1409 | ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); |
| @@ -1408,7 +1412,6 @@ xfs_itruncate_start( | |||
| 1408 | (flags == XFS_ITRUNC_MAYBE)); | 1412 | (flags == XFS_ITRUNC_MAYBE)); |
| 1409 | 1413 | ||
| 1410 | mp = ip->i_mount; | 1414 | mp = ip->i_mount; |
| 1411 | vp = XFS_ITOV(ip); | ||
| 1412 | 1415 | ||
| 1413 | /* wait for the completion of any pending DIOs */ | 1416 | /* wait for the completion of any pending DIOs */ |
| 1414 | if (new_size < ip->i_size) | 1417 | if (new_size < ip->i_size) |
| @@ -1457,7 +1460,7 @@ xfs_itruncate_start( | |||
| 1457 | 1460 | ||
| 1458 | #ifdef DEBUG | 1461 | #ifdef DEBUG |
| 1459 | if (new_size == 0) { | 1462 | if (new_size == 0) { |
| 1460 | ASSERT(VN_CACHED(vp) == 0); | 1463 | ASSERT(VN_CACHED(VFS_I(ip)) == 0); |
| 1461 | } | 1464 | } |
| 1462 | #endif | 1465 | #endif |
| 1463 | return error; | 1466 | return error; |
| @@ -1763,67 +1766,6 @@ xfs_itruncate_finish( | |||
| 1763 | return 0; | 1766 | return 0; |
| 1764 | } | 1767 | } |
| 1765 | 1768 | ||
| 1766 | |||
| 1767 | /* | ||
| 1768 | * xfs_igrow_start | ||
| 1769 | * | ||
| 1770 | * Do the first part of growing a file: zero any data in the last | ||
| 1771 | * block that is beyond the old EOF. We need to do this before | ||
| 1772 | * the inode is joined to the transaction to modify the i_size. | ||
| 1773 | * That way we can drop the inode lock and call into the buffer | ||
| 1774 | * cache to get the buffer mapping the EOF. | ||
| 1775 | */ | ||
| 1776 | int | ||
| 1777 | xfs_igrow_start( | ||
| 1778 | xfs_inode_t *ip, | ||
| 1779 | xfs_fsize_t new_size, | ||
| 1780 | cred_t *credp) | ||
| 1781 | { | ||
| 1782 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); | ||
| 1783 | ASSERT(new_size > ip->i_size); | ||
| 1784 | |||
| 1785 | /* | ||
| 1786 | * Zero any pages that may have been created by | ||
| 1787 | * xfs_write_file() beyond the end of the file | ||
| 1788 | * and any blocks between the old and new file sizes. | ||
| 1789 | */ | ||
| 1790 | return xfs_zero_eof(ip, new_size, ip->i_size); | ||
| 1791 | } | ||
| 1792 | |||
| 1793 | /* | ||
| 1794 | * xfs_igrow_finish | ||
| 1795 | * | ||
| 1796 | * This routine is called to extend the size of a file. | ||
| 1797 | * The inode must have both the iolock and the ilock locked | ||
| 1798 | * for update and it must be a part of the current transaction. | ||
| 1799 | * The xfs_igrow_start() function must have been called previously. | ||
| 1800 | * If the change_flag is not zero, the inode change timestamp will | ||
| 1801 | * be updated. | ||
| 1802 | */ | ||
| 1803 | void | ||
| 1804 | xfs_igrow_finish( | ||
| 1805 | xfs_trans_t *tp, | ||
| 1806 | xfs_inode_t *ip, | ||
| 1807 | xfs_fsize_t new_size, | ||
| 1808 | int change_flag) | ||
| 1809 | { | ||
| 1810 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); | ||
| 1811 | ASSERT(ip->i_transp == tp); | ||
| 1812 | ASSERT(new_size > ip->i_size); | ||
| 1813 | |||
| 1814 | /* | ||
| 1815 | * Update the file size. Update the inode change timestamp | ||
| 1816 | * if change_flag set. | ||
| 1817 | */ | ||
| 1818 | ip->i_d.di_size = new_size; | ||
| 1819 | ip->i_size = new_size; | ||
| 1820 | if (change_flag) | ||
| 1821 | xfs_ichgtime(ip, XFS_ICHGTIME_CHG); | ||
| 1822 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); | ||
| 1823 | |||
| 1824 | } | ||
| 1825 | |||
| 1826 | |||
| 1827 | /* | 1769 | /* |
| 1828 | * This is called when the inode's link count goes to 0. | 1770 | * This is called when the inode's link count goes to 0. |
| 1829 | * We place the on-disk inode on a list in the AGI. It | 1771 | * We place the on-disk inode on a list in the AGI. It |
| @@ -2258,7 +2200,7 @@ xfs_ifree_cluster( | |||
| 2258 | xfs_trans_binval(tp, bp); | 2200 | xfs_trans_binval(tp, bp); |
| 2259 | } | 2201 | } |
| 2260 | 2202 | ||
| 2261 | kmem_free(ip_found, ninodes * sizeof(xfs_inode_t *)); | 2203 | kmem_free(ip_found); |
| 2262 | xfs_put_perag(mp, pag); | 2204 | xfs_put_perag(mp, pag); |
| 2263 | } | 2205 | } |
| 2264 | 2206 | ||
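The kmem_free() change in this hunk repeats throughout the diff: the allocator no longer needs the caller to pass the allocation size. A minimal before/after sketch, reusing the ip_found buffer from the hunk above:

    /* old interface -- the caller had to remember the size:        */
    /* kmem_free(ip_found, ninodes * sizeof(xfs_inode_t *));        */

    /* new interface -- the pointer alone is enough: */
    kmem_free(ip_found);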
| @@ -2470,7 +2412,7 @@ xfs_iroot_realloc( | |||
| 2470 | (int)new_size); | 2412 | (int)new_size); |
| 2471 | memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t)); | 2413 | memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t)); |
| 2472 | } | 2414 | } |
| 2473 | kmem_free(ifp->if_broot, ifp->if_broot_bytes); | 2415 | kmem_free(ifp->if_broot); |
| 2474 | ifp->if_broot = new_broot; | 2416 | ifp->if_broot = new_broot; |
| 2475 | ifp->if_broot_bytes = (int)new_size; | 2417 | ifp->if_broot_bytes = (int)new_size; |
| 2476 | ASSERT(ifp->if_broot_bytes <= | 2418 | ASSERT(ifp->if_broot_bytes <= |
| @@ -2514,7 +2456,7 @@ xfs_idata_realloc( | |||
| 2514 | 2456 | ||
| 2515 | if (new_size == 0) { | 2457 | if (new_size == 0) { |
| 2516 | if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { | 2458 | if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { |
| 2517 | kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes); | 2459 | kmem_free(ifp->if_u1.if_data); |
| 2518 | } | 2460 | } |
| 2519 | ifp->if_u1.if_data = NULL; | 2461 | ifp->if_u1.if_data = NULL; |
| 2520 | real_size = 0; | 2462 | real_size = 0; |
| @@ -2529,7 +2471,7 @@ xfs_idata_realloc( | |||
| 2529 | ASSERT(ifp->if_real_bytes != 0); | 2471 | ASSERT(ifp->if_real_bytes != 0); |
| 2530 | memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data, | 2472 | memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data, |
| 2531 | new_size); | 2473 | new_size); |
| 2532 | kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes); | 2474 | kmem_free(ifp->if_u1.if_data); |
| 2533 | ifp->if_u1.if_data = ifp->if_u2.if_inline_data; | 2475 | ifp->if_u1.if_data = ifp->if_u2.if_inline_data; |
| 2534 | } | 2476 | } |
| 2535 | real_size = 0; | 2477 | real_size = 0; |
| @@ -2636,7 +2578,7 @@ xfs_idestroy_fork( | |||
| 2636 | 2578 | ||
| 2637 | ifp = XFS_IFORK_PTR(ip, whichfork); | 2579 | ifp = XFS_IFORK_PTR(ip, whichfork); |
| 2638 | if (ifp->if_broot != NULL) { | 2580 | if (ifp->if_broot != NULL) { |
| 2639 | kmem_free(ifp->if_broot, ifp->if_broot_bytes); | 2581 | kmem_free(ifp->if_broot); |
| 2640 | ifp->if_broot = NULL; | 2582 | ifp->if_broot = NULL; |
| 2641 | } | 2583 | } |
| 2642 | 2584 | ||
| @@ -2650,7 +2592,7 @@ xfs_idestroy_fork( | |||
| 2650 | if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) && | 2592 | if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) && |
| 2651 | (ifp->if_u1.if_data != NULL)) { | 2593 | (ifp->if_u1.if_data != NULL)) { |
| 2652 | ASSERT(ifp->if_real_bytes != 0); | 2594 | ASSERT(ifp->if_real_bytes != 0); |
| 2653 | kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes); | 2595 | kmem_free(ifp->if_u1.if_data); |
| 2654 | ifp->if_u1.if_data = NULL; | 2596 | ifp->if_u1.if_data = NULL; |
| 2655 | ifp->if_real_bytes = 0; | 2597 | ifp->if_real_bytes = 0; |
| 2656 | } | 2598 | } |
| @@ -2691,7 +2633,6 @@ xfs_idestroy( | |||
| 2691 | xfs_idestroy_fork(ip, XFS_ATTR_FORK); | 2633 | xfs_idestroy_fork(ip, XFS_ATTR_FORK); |
| 2692 | mrfree(&ip->i_lock); | 2634 | mrfree(&ip->i_lock); |
| 2693 | mrfree(&ip->i_iolock); | 2635 | mrfree(&ip->i_iolock); |
| 2694 | freesema(&ip->i_flock); | ||
| 2695 | 2636 | ||
| 2696 | #ifdef XFS_INODE_TRACE | 2637 | #ifdef XFS_INODE_TRACE |
| 2697 | ktrace_free(ip->i_trace); | 2638 | ktrace_free(ip->i_trace); |
| @@ -3058,7 +2999,7 @@ xfs_iflush_cluster( | |||
| 3058 | 2999 | ||
| 3059 | out_free: | 3000 | out_free: |
| 3060 | read_unlock(&pag->pag_ici_lock); | 3001 | read_unlock(&pag->pag_ici_lock); |
| 3061 | kmem_free(ilist, ilist_size); | 3002 | kmem_free(ilist); |
| 3062 | return 0; | 3003 | return 0; |
| 3063 | 3004 | ||
| 3064 | 3005 | ||
| @@ -3102,17 +3043,17 @@ cluster_corrupt_out: | |||
| 3102 | * Unlocks the flush lock | 3043 | * Unlocks the flush lock |
| 3103 | */ | 3044 | */ |
| 3104 | xfs_iflush_abort(iq); | 3045 | xfs_iflush_abort(iq); |
| 3105 | kmem_free(ilist, ilist_size); | 3046 | kmem_free(ilist); |
| 3106 | return XFS_ERROR(EFSCORRUPTED); | 3047 | return XFS_ERROR(EFSCORRUPTED); |
| 3107 | } | 3048 | } |
| 3108 | 3049 | ||
| 3109 | /* | 3050 | /* |
| 3110 | * xfs_iflush() will write a modified inode's changes out to the | 3051 | * xfs_iflush() will write a modified inode's changes out to the |
| 3111 | * inode's on disk home. The caller must have the inode lock held | 3052 | * inode's on disk home. The caller must have the inode lock held |
| 3112 | * in at least shared mode and the inode flush semaphore must be | 3053 | * in at least shared mode and the inode flush completion must be |
| 3113 | * held as well. The inode lock will still be held upon return from | 3054 | * active as well. The inode lock will still be held upon return from |
| 3114 | * the call and the caller is free to unlock it. | 3055 | * the call and the caller is free to unlock it. |
| 3115 | * The inode flush lock will be unlocked when the inode reaches the disk. | 3056 | * The inode flush will be completed when the inode reaches the disk. |
| 3116 | * The flags indicate how the inode's buffer should be written out. | 3057 | * The flags indicate how the inode's buffer should be written out. |
| 3117 | */ | 3058 | */ |
| 3118 | int | 3059 | int |
| @@ -3131,7 +3072,7 @@ xfs_iflush( | |||
| 3131 | XFS_STATS_INC(xs_iflush_count); | 3072 | XFS_STATS_INC(xs_iflush_count); |
| 3132 | 3073 | ||
| 3133 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); | 3074 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); |
| 3134 | ASSERT(issemalocked(&(ip->i_flock))); | 3075 | ASSERT(!completion_done(&ip->i_flush)); |
| 3135 | ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || | 3076 | ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || |
| 3136 | ip->i_d.di_nextents > ip->i_df.if_ext_max); | 3077 | ip->i_d.di_nextents > ip->i_df.if_ext_max); |
| 3137 | 3078 | ||
| @@ -3143,8 +3084,6 @@ xfs_iflush( | |||
| 3143 | * flush lock and do nothing. | 3084 | * flush lock and do nothing. |
| 3144 | */ | 3085 | */ |
| 3145 | if (xfs_inode_clean(ip)) { | 3086 | if (xfs_inode_clean(ip)) { |
| 3146 | ASSERT((iip != NULL) ? | ||
| 3147 | !(iip->ili_item.li_flags & XFS_LI_IN_AIL) : 1); | ||
| 3148 | xfs_ifunlock(ip); | 3087 | xfs_ifunlock(ip); |
| 3149 | return 0; | 3088 | return 0; |
| 3150 | } | 3089 | } |
| @@ -3296,7 +3235,7 @@ xfs_iflush_int( | |||
| 3296 | #endif | 3235 | #endif |
| 3297 | 3236 | ||
| 3298 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); | 3237 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); |
| 3299 | ASSERT(issemalocked(&(ip->i_flock))); | 3238 | ASSERT(!completion_done(&ip->i_flush)); |
| 3300 | ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || | 3239 | ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || |
| 3301 | ip->i_d.di_nextents > ip->i_df.if_ext_max); | 3240 | ip->i_d.di_nextents > ip->i_df.if_ext_max); |
| 3302 | 3241 | ||
| @@ -3528,7 +3467,6 @@ xfs_iflush_all( | |||
| 3528 | xfs_mount_t *mp) | 3467 | xfs_mount_t *mp) |
| 3529 | { | 3468 | { |
| 3530 | xfs_inode_t *ip; | 3469 | xfs_inode_t *ip; |
| 3531 | bhv_vnode_t *vp; | ||
| 3532 | 3470 | ||
| 3533 | again: | 3471 | again: |
| 3534 | XFS_MOUNT_ILOCK(mp); | 3472 | XFS_MOUNT_ILOCK(mp); |
| @@ -3543,14 +3481,13 @@ xfs_iflush_all( | |||
| 3543 | continue; | 3481 | continue; |
| 3544 | } | 3482 | } |
| 3545 | 3483 | ||
| 3546 | vp = XFS_ITOV_NULL(ip); | 3484 | if (!VFS_I(ip)) { |
| 3547 | if (!vp) { | ||
| 3548 | XFS_MOUNT_IUNLOCK(mp); | 3485 | XFS_MOUNT_IUNLOCK(mp); |
| 3549 | xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC); | 3486 | xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC); |
| 3550 | goto again; | 3487 | goto again; |
| 3551 | } | 3488 | } |
| 3552 | 3489 | ||
| 3553 | ASSERT(vn_count(vp) == 0); | 3490 | ASSERT(vn_count(VFS_I(ip)) == 0); |
| 3554 | 3491 | ||
| 3555 | ip = ip->i_mnext; | 3492 | ip = ip->i_mnext; |
| 3556 | } while (ip != mp->m_inodes); | 3493 | } while (ip != mp->m_inodes); |
| @@ -3770,7 +3707,7 @@ xfs_iext_add_indirect_multi( | |||
| 3770 | * (all extents past */ | 3707 | * (all extents past */ |
| 3771 | if (nex2) { | 3708 | if (nex2) { |
| 3772 | byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); | 3709 | byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); |
| 3773 | nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_SLEEP); | 3710 | nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS); |
| 3774 | memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff); | 3711 | memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff); |
| 3775 | erp->er_extcount -= nex2; | 3712 | erp->er_extcount -= nex2; |
| 3776 | xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2); | 3713 | xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2); |
| @@ -3836,7 +3773,7 @@ xfs_iext_add_indirect_multi( | |||
| 3836 | erp = xfs_iext_irec_new(ifp, erp_idx); | 3773 | erp = xfs_iext_irec_new(ifp, erp_idx); |
| 3837 | } | 3774 | } |
| 3838 | memmove(&erp->er_extbuf[i], nex2_ep, byte_diff); | 3775 | memmove(&erp->er_extbuf[i], nex2_ep, byte_diff); |
| 3839 | kmem_free(nex2_ep, byte_diff); | 3776 | kmem_free(nex2_ep); |
| 3840 | erp->er_extcount += nex2; | 3777 | erp->er_extcount += nex2; |
| 3841 | xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2); | 3778 | xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2); |
| 3842 | } | 3779 | } |
| @@ -4070,8 +4007,7 @@ xfs_iext_realloc_direct( | |||
| 4070 | ifp->if_u1.if_extents = | 4007 | ifp->if_u1.if_extents = |
| 4071 | kmem_realloc(ifp->if_u1.if_extents, | 4008 | kmem_realloc(ifp->if_u1.if_extents, |
| 4072 | rnew_size, | 4009 | rnew_size, |
| 4073 | ifp->if_real_bytes, | 4010 | ifp->if_real_bytes, KM_NOFS); |
| 4074 | KM_SLEEP); | ||
| 4075 | } | 4011 | } |
| 4076 | if (rnew_size > ifp->if_real_bytes) { | 4012 | if (rnew_size > ifp->if_real_bytes) { |
| 4077 | memset(&ifp->if_u1.if_extents[ifp->if_bytes / | 4013 | memset(&ifp->if_u1.if_extents[ifp->if_bytes / |
| @@ -4112,7 +4048,7 @@ xfs_iext_direct_to_inline( | |||
| 4112 | */ | 4048 | */ |
| 4113 | memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents, | 4049 | memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents, |
| 4114 | nextents * sizeof(xfs_bmbt_rec_t)); | 4050 | nextents * sizeof(xfs_bmbt_rec_t)); |
| 4115 | kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes); | 4051 | kmem_free(ifp->if_u1.if_extents); |
| 4116 | ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; | 4052 | ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; |
| 4117 | ifp->if_real_bytes = 0; | 4053 | ifp->if_real_bytes = 0; |
| 4118 | } | 4054 | } |
| @@ -4130,7 +4066,7 @@ xfs_iext_inline_to_direct( | |||
| 4130 | xfs_ifork_t *ifp, /* inode fork pointer */ | 4066 | xfs_ifork_t *ifp, /* inode fork pointer */ |
| 4131 | int new_size) /* number of extents in file */ | 4067 | int new_size) /* number of extents in file */ |
| 4132 | { | 4068 | { |
| 4133 | ifp->if_u1.if_extents = kmem_alloc(new_size, KM_SLEEP); | 4069 | ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS); |
| 4134 | memset(ifp->if_u1.if_extents, 0, new_size); | 4070 | memset(ifp->if_u1.if_extents, 0, new_size); |
| 4135 | if (ifp->if_bytes) { | 4071 | if (ifp->if_bytes) { |
| 4136 | memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext, | 4072 | memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext, |
| @@ -4162,7 +4098,7 @@ xfs_iext_realloc_indirect( | |||
| 4162 | } else { | 4098 | } else { |
| 4163 | ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *) | 4099 | ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *) |
| 4164 | kmem_realloc(ifp->if_u1.if_ext_irec, | 4100 | kmem_realloc(ifp->if_u1.if_ext_irec, |
| 4165 | new_size, size, KM_SLEEP); | 4101 | new_size, size, KM_NOFS); |
| 4166 | } | 4102 | } |
| 4167 | } | 4103 | } |
| 4168 | 4104 | ||
| @@ -4186,7 +4122,7 @@ xfs_iext_indirect_to_direct( | |||
| 4186 | ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ); | 4122 | ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ); |
| 4187 | 4123 | ||
| 4188 | ep = ifp->if_u1.if_ext_irec->er_extbuf; | 4124 | ep = ifp->if_u1.if_ext_irec->er_extbuf; |
| 4189 | kmem_free(ifp->if_u1.if_ext_irec, sizeof(xfs_ext_irec_t)); | 4125 | kmem_free(ifp->if_u1.if_ext_irec); |
| 4190 | ifp->if_flags &= ~XFS_IFEXTIREC; | 4126 | ifp->if_flags &= ~XFS_IFEXTIREC; |
| 4191 | ifp->if_u1.if_extents = ep; | 4127 | ifp->if_u1.if_extents = ep; |
| 4192 | ifp->if_bytes = size; | 4128 | ifp->if_bytes = size; |
| @@ -4212,7 +4148,7 @@ xfs_iext_destroy( | |||
| 4212 | } | 4148 | } |
| 4213 | ifp->if_flags &= ~XFS_IFEXTIREC; | 4149 | ifp->if_flags &= ~XFS_IFEXTIREC; |
| 4214 | } else if (ifp->if_real_bytes) { | 4150 | } else if (ifp->if_real_bytes) { |
| 4215 | kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes); | 4151 | kmem_free(ifp->if_u1.if_extents); |
| 4216 | } else if (ifp->if_bytes) { | 4152 | } else if (ifp->if_bytes) { |
| 4217 | memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * | 4153 | memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * |
| 4218 | sizeof(xfs_bmbt_rec_t)); | 4154 | sizeof(xfs_bmbt_rec_t)); |
| @@ -4404,11 +4340,10 @@ xfs_iext_irec_init( | |||
| 4404 | nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); | 4340 | nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); |
| 4405 | ASSERT(nextents <= XFS_LINEAR_EXTS); | 4341 | ASSERT(nextents <= XFS_LINEAR_EXTS); |
| 4406 | 4342 | ||
| 4407 | erp = (xfs_ext_irec_t *) | 4343 | erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS); |
| 4408 | kmem_alloc(sizeof(xfs_ext_irec_t), KM_SLEEP); | ||
| 4409 | 4344 | ||
| 4410 | if (nextents == 0) { | 4345 | if (nextents == 0) { |
| 4411 | ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_SLEEP); | 4346 | ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); |
| 4412 | } else if (!ifp->if_real_bytes) { | 4347 | } else if (!ifp->if_real_bytes) { |
| 4413 | xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ); | 4348 | xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ); |
| 4414 | } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) { | 4349 | } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) { |
| @@ -4456,7 +4391,7 @@ xfs_iext_irec_new( | |||
| 4456 | 4391 | ||
| 4457 | /* Initialize new extent record */ | 4392 | /* Initialize new extent record */ |
| 4458 | erp = ifp->if_u1.if_ext_irec; | 4393 | erp = ifp->if_u1.if_ext_irec; |
| 4459 | erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_SLEEP); | 4394 | erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); |
| 4460 | ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; | 4395 | ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; |
| 4461 | memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ); | 4396 | memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ); |
| 4462 | erp[erp_idx].er_extcount = 0; | 4397 | erp[erp_idx].er_extcount = 0; |
| @@ -4483,7 +4418,7 @@ xfs_iext_irec_remove( | |||
| 4483 | if (erp->er_extbuf) { | 4418 | if (erp->er_extbuf) { |
| 4484 | xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, | 4419 | xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, |
| 4485 | -erp->er_extcount); | 4420 | -erp->er_extcount); |
| 4486 | kmem_free(erp->er_extbuf, XFS_IEXT_BUFSZ); | 4421 | kmem_free(erp->er_extbuf); |
| 4487 | } | 4422 | } |
| 4488 | /* Compact extent records */ | 4423 | /* Compact extent records */ |
| 4489 | erp = ifp->if_u1.if_ext_irec; | 4424 | erp = ifp->if_u1.if_ext_irec; |
| @@ -4501,8 +4436,7 @@ xfs_iext_irec_remove( | |||
| 4501 | xfs_iext_realloc_indirect(ifp, | 4436 | xfs_iext_realloc_indirect(ifp, |
| 4502 | nlists * sizeof(xfs_ext_irec_t)); | 4437 | nlists * sizeof(xfs_ext_irec_t)); |
| 4503 | } else { | 4438 | } else { |
| 4504 | kmem_free(ifp->if_u1.if_ext_irec, | 4439 | kmem_free(ifp->if_u1.if_ext_irec); |
| 4505 | sizeof(xfs_ext_irec_t)); | ||
| 4506 | } | 4440 | } |
| 4507 | ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; | 4441 | ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; |
| 4508 | } | 4442 | } |
| @@ -4571,7 +4505,7 @@ xfs_iext_irec_compact_pages( | |||
| 4571 | * so er_extoffs don't get modified in | 4505 | * so er_extoffs don't get modified in |
| 4572 | * xfs_iext_irec_remove. | 4506 | * xfs_iext_irec_remove. |
| 4573 | */ | 4507 | */ |
| 4574 | kmem_free(erp_next->er_extbuf, XFS_IEXT_BUFSZ); | 4508 | kmem_free(erp_next->er_extbuf); |
| 4575 | erp_next->er_extbuf = NULL; | 4509 | erp_next->er_extbuf = NULL; |
| 4576 | xfs_iext_irec_remove(ifp, erp_idx + 1); | 4510 | xfs_iext_irec_remove(ifp, erp_idx + 1); |
| 4577 | nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; | 4511 | nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; |
| @@ -4596,40 +4530,63 @@ xfs_iext_irec_compact_full( | |||
| 4596 | int nlists; /* number of irec's (ex lists) */ | 4530 | int nlists; /* number of irec's (ex lists) */ |
| 4597 | 4531 | ||
| 4598 | ASSERT(ifp->if_flags & XFS_IFEXTIREC); | 4532 | ASSERT(ifp->if_flags & XFS_IFEXTIREC); |
| 4533 | |||
| 4599 | nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; | 4534 | nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; |
| 4600 | erp = ifp->if_u1.if_ext_irec; | 4535 | erp = ifp->if_u1.if_ext_irec; |
| 4601 | ep = &erp->er_extbuf[erp->er_extcount]; | 4536 | ep = &erp->er_extbuf[erp->er_extcount]; |
| 4602 | erp_next = erp + 1; | 4537 | erp_next = erp + 1; |
| 4603 | ep_next = erp_next->er_extbuf; | 4538 | ep_next = erp_next->er_extbuf; |
| 4539 | |||
| 4604 | while (erp_idx < nlists - 1) { | 4540 | while (erp_idx < nlists - 1) { |
| 4541 | /* | ||
| 4542 | * Check how many extent records are available in this irec. | ||
| 4543 | * If there are none, skip the whole exercise. | ||
| 4544 | */ | ||
| 4605 | ext_avail = XFS_LINEAR_EXTS - erp->er_extcount; | 4545 | ext_avail = XFS_LINEAR_EXTS - erp->er_extcount; |
| 4606 | ext_diff = MIN(ext_avail, erp_next->er_extcount); | 4546 | if (ext_avail) { |
| 4607 | memcpy(ep, ep_next, ext_diff * sizeof(xfs_bmbt_rec_t)); | 4547 | |
| 4608 | erp->er_extcount += ext_diff; | ||
| 4609 | erp_next->er_extcount -= ext_diff; | ||
| 4610 | /* Remove next page */ | ||
| 4611 | if (erp_next->er_extcount == 0) { | ||
| 4612 | /* | 4548 | /* |
| 4613 | * Free page before removing extent record | 4549 | * Copy over as many extent records as possible into |
| 4614 | * so er_extoffs don't get modified in | 4550 | * the previous page. |
| 4615 | * xfs_iext_irec_remove. | ||
| 4616 | */ | 4551 | */ |
| 4617 | kmem_free(erp_next->er_extbuf, | 4552 | ext_diff = MIN(ext_avail, erp_next->er_extcount); |
| 4618 | erp_next->er_extcount * sizeof(xfs_bmbt_rec_t)); | 4553 | memcpy(ep, ep_next, ext_diff * sizeof(xfs_bmbt_rec_t)); |
| 4619 | erp_next->er_extbuf = NULL; | 4554 | erp->er_extcount += ext_diff; |
| 4620 | xfs_iext_irec_remove(ifp, erp_idx + 1); | 4555 | erp_next->er_extcount -= ext_diff; |
| 4621 | erp = &ifp->if_u1.if_ext_irec[erp_idx]; | 4556 | |
| 4622 | nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; | 4557 | /* |
| 4623 | /* Update next page */ | 4558 | * If the next irec is empty now we can simply |
| 4624 | } else { | 4559 | * remove it. |
| 4625 | /* Move rest of page up to become next new page */ | 4560 | */ |
| 4626 | memmove(erp_next->er_extbuf, ep_next, | 4561 | if (erp_next->er_extcount == 0) { |
| 4627 | erp_next->er_extcount * sizeof(xfs_bmbt_rec_t)); | 4562 | /* |
| 4628 | ep_next = erp_next->er_extbuf; | 4563 | * Free page before removing extent record |
| 4629 | memset(&ep_next[erp_next->er_extcount], 0, | 4564 | * so er_extoffs don't get modified in |
| 4630 | (XFS_LINEAR_EXTS - erp_next->er_extcount) * | 4565 | * xfs_iext_irec_remove. |
| 4631 | sizeof(xfs_bmbt_rec_t)); | 4566 | */ |
| 4567 | kmem_free(erp_next->er_extbuf); | ||
| 4568 | erp_next->er_extbuf = NULL; | ||
| 4569 | xfs_iext_irec_remove(ifp, erp_idx + 1); | ||
| 4570 | erp = &ifp->if_u1.if_ext_irec[erp_idx]; | ||
| 4571 | nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; | ||
| 4572 | |||
| 4573 | /* | ||
| 4574 | * If the next irec is not empty, move up the content | ||
| 4575 | * that has not been copied to the previous page to | ||
| 4576 | * the beginning of this one. | ||
| 4577 | */ | ||
| 4578 | } else { | ||
| 4579 | memmove(erp_next->er_extbuf, &ep_next[ext_diff], | ||
| 4580 | erp_next->er_extcount * | ||
| 4581 | sizeof(xfs_bmbt_rec_t)); | ||
| 4582 | ep_next = erp_next->er_extbuf; | ||
| 4583 | memset(&ep_next[erp_next->er_extcount], 0, | ||
| 4584 | (XFS_LINEAR_EXTS - | ||
| 4585 | erp_next->er_extcount) * | ||
| 4586 | sizeof(xfs_bmbt_rec_t)); | ||
| 4587 | } | ||
| 4632 | } | 4588 | } |
| 4589 | |||
| 4633 | if (erp->er_extcount == XFS_LINEAR_EXTS) { | 4590 | if (erp->er_extcount == XFS_LINEAR_EXTS) { |
| 4634 | erp_idx++; | 4591 | erp_idx++; |
| 4635 | if (erp_idx < nlists) | 4592 | if (erp_idx < nlists) |
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 0a999fee4f03..1420c49674d7 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h | |||
| @@ -87,8 +87,7 @@ typedef struct xfs_ifork { | |||
| 87 | * Flags for xfs_ichgtime(). | 87 | * Flags for xfs_ichgtime(). |
| 88 | */ | 88 | */ |
| 89 | #define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ | 89 | #define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ |
| 90 | #define XFS_ICHGTIME_ACC 0x2 /* data fork access timestamp */ | 90 | #define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ |
| 91 | #define XFS_ICHGTIME_CHG 0x4 /* inode field change timestamp */ | ||
| 92 | 91 | ||
| 93 | /* | 92 | /* |
| 94 | * Per-fork incore inode flags. | 93 | * Per-fork incore inode flags. |
| @@ -204,7 +203,7 @@ typedef struct xfs_inode { | |||
| 204 | struct xfs_inode *i_mprev; /* ptr to prev inode */ | 203 | struct xfs_inode *i_mprev; /* ptr to prev inode */ |
| 205 | struct xfs_mount *i_mount; /* fs mount struct ptr */ | 204 | struct xfs_mount *i_mount; /* fs mount struct ptr */ |
| 206 | struct list_head i_reclaim; /* reclaim list */ | 205 | struct list_head i_reclaim; /* reclaim list */ |
| 207 | bhv_vnode_t *i_vnode; /* vnode backpointer */ | 206 | struct inode *i_vnode; /* vnode backpointer */ |
| 208 | struct xfs_dquot *i_udquot; /* user dquot */ | 207 | struct xfs_dquot *i_udquot; /* user dquot */ |
| 209 | struct xfs_dquot *i_gdquot; /* group dquot */ | 208 | struct xfs_dquot *i_gdquot; /* group dquot */ |
| 210 | 209 | ||
| @@ -223,7 +222,7 @@ typedef struct xfs_inode { | |||
| 223 | struct xfs_inode_log_item *i_itemp; /* logging information */ | 222 | struct xfs_inode_log_item *i_itemp; /* logging information */ |
| 224 | mrlock_t i_lock; /* inode lock */ | 223 | mrlock_t i_lock; /* inode lock */ |
| 225 | mrlock_t i_iolock; /* inode IO lock */ | 224 | mrlock_t i_iolock; /* inode IO lock */ |
| 226 | sema_t i_flock; /* inode flush lock */ | 225 | struct completion i_flush; /* inode flush completion q */ |
| 227 | atomic_t i_pincount; /* inode pin count */ | 226 | atomic_t i_pincount; /* inode pin count */ |
| 228 | wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */ | 227 | wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */ |
| 229 | spinlock_t i_flags_lock; /* inode i_flags lock */ | 228 | spinlock_t i_flags_lock; /* inode i_flags lock */ |
| @@ -263,6 +262,18 @@ typedef struct xfs_inode { | |||
| 263 | #define XFS_ISIZE(ip) (((ip)->i_d.di_mode & S_IFMT) == S_IFREG) ? \ | 262 | #define XFS_ISIZE(ip) (((ip)->i_d.di_mode & S_IFMT) == S_IFREG) ? \ |
| 264 | (ip)->i_size : (ip)->i_d.di_size; | 263 | (ip)->i_size : (ip)->i_d.di_size; |
| 265 | 264 | ||
| 265 | /* Convert from vfs inode to xfs inode */ | ||
| 266 | static inline struct xfs_inode *XFS_I(struct inode *inode) | ||
| 267 | { | ||
| 268 | return (struct xfs_inode *)inode->i_private; | ||
| 269 | } | ||
| 270 | |||
| 271 | /* convert from xfs inode to vfs inode */ | ||
| 272 | static inline struct inode *VFS_I(struct xfs_inode *ip) | ||
| 273 | { | ||
| 274 | return (struct inode *)ip->i_vnode; | ||
| 275 | } | ||
| 276 | |||
| 266 | /* | 277 | /* |
| 267 | * i_flags helper functions | 278 | * i_flags helper functions |
| 268 | */ | 279 | */ |
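The XFS_I()/VFS_I() helpers added above replace the XFS_ITOV()/XFS_ITOV_NULL() macros removed further down in this header. A minimal usage sketch follows; the function name is a placeholder, not from the patch.

    /* Illustrative only: round-tripping between the two views of one inode. */
    static void example_inode_views(struct inode *inode)
    {
    	struct xfs_inode	*ip = XFS_I(inode);	/* VFS inode -> XFS inode */

    	ASSERT(VFS_I(ip) == inode);			/* XFS inode -> VFS inode */
    }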
| @@ -439,9 +450,6 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags) | |||
| 439 | #define XFS_ITRUNC_DEFINITE 0x1 | 450 | #define XFS_ITRUNC_DEFINITE 0x1 |
| 440 | #define XFS_ITRUNC_MAYBE 0x2 | 451 | #define XFS_ITRUNC_MAYBE 0x2 |
| 441 | 452 | ||
| 442 | #define XFS_ITOV(ip) ((ip)->i_vnode) | ||
| 443 | #define XFS_ITOV_NULL(ip) ((ip)->i_vnode) | ||
| 444 | |||
| 445 | /* | 453 | /* |
| 446 | * For multiple groups support: if S_ISGID bit is set in the parent | 454 | * For multiple groups support: if S_ISGID bit is set in the parent |
| 447 | * directory, group of new file is set to that of the parent, and | 455 | * directory, group of new file is set to that of the parent, and |
| @@ -473,11 +481,8 @@ int xfs_ilock_nowait(xfs_inode_t *, uint); | |||
| 473 | void xfs_iunlock(xfs_inode_t *, uint); | 481 | void xfs_iunlock(xfs_inode_t *, uint); |
| 474 | void xfs_ilock_demote(xfs_inode_t *, uint); | 482 | void xfs_ilock_demote(xfs_inode_t *, uint); |
| 475 | int xfs_isilocked(xfs_inode_t *, uint); | 483 | int xfs_isilocked(xfs_inode_t *, uint); |
| 476 | void xfs_iflock(xfs_inode_t *); | ||
| 477 | int xfs_iflock_nowait(xfs_inode_t *); | ||
| 478 | uint xfs_ilock_map_shared(xfs_inode_t *); | 484 | uint xfs_ilock_map_shared(xfs_inode_t *); |
| 479 | void xfs_iunlock_map_shared(xfs_inode_t *, uint); | 485 | void xfs_iunlock_map_shared(xfs_inode_t *, uint); |
| 480 | void xfs_ifunlock(xfs_inode_t *); | ||
| 481 | void xfs_ireclaim(xfs_inode_t *); | 486 | void xfs_ireclaim(xfs_inode_t *); |
| 482 | int xfs_finish_reclaim(xfs_inode_t *, int, int); | 487 | int xfs_finish_reclaim(xfs_inode_t *, int, int); |
| 483 | int xfs_finish_reclaim_all(struct xfs_mount *, int); | 488 | int xfs_finish_reclaim_all(struct xfs_mount *, int); |
| @@ -507,9 +512,6 @@ int xfs_itruncate_start(xfs_inode_t *, uint, xfs_fsize_t); | |||
| 507 | int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *, | 512 | int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *, |
| 508 | xfs_fsize_t, int, int); | 513 | xfs_fsize_t, int, int); |
| 509 | int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); | 514 | int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); |
| 510 | int xfs_igrow_start(xfs_inode_t *, xfs_fsize_t, struct cred *); | ||
| 511 | void xfs_igrow_finish(struct xfs_trans *, xfs_inode_t *, | ||
| 512 | xfs_fsize_t, int); | ||
| 513 | 515 | ||
| 514 | void xfs_idestroy_fork(xfs_inode_t *, int); | 516 | void xfs_idestroy_fork(xfs_inode_t *, int); |
| 515 | void xfs_idestroy(xfs_inode_t *); | 517 | void xfs_idestroy(xfs_inode_t *); |
| @@ -525,6 +527,7 @@ void xfs_iflush_all(struct xfs_mount *); | |||
| 525 | void xfs_ichgtime(xfs_inode_t *, int); | 527 | void xfs_ichgtime(xfs_inode_t *, int); |
| 526 | xfs_fsize_t xfs_file_last_byte(xfs_inode_t *); | 528 | xfs_fsize_t xfs_file_last_byte(xfs_inode_t *); |
| 527 | void xfs_lock_inodes(xfs_inode_t **, int, uint); | 529 | void xfs_lock_inodes(xfs_inode_t **, int, uint); |
| 530 | void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); | ||
| 528 | 531 | ||
| 529 | void xfs_synchronize_atime(xfs_inode_t *); | 532 | void xfs_synchronize_atime(xfs_inode_t *); |
| 530 | void xfs_mark_inode_dirty_sync(xfs_inode_t *); | 533 | void xfs_mark_inode_dirty_sync(xfs_inode_t *); |
| @@ -573,6 +576,26 @@ extern struct kmem_zone *xfs_ifork_zone; | |||
| 573 | extern struct kmem_zone *xfs_inode_zone; | 576 | extern struct kmem_zone *xfs_inode_zone; |
| 574 | extern struct kmem_zone *xfs_ili_zone; | 577 | extern struct kmem_zone *xfs_ili_zone; |
| 575 | 578 | ||
| 579 | /* | ||
| 580 | * Manage the i_flush queue embedded in the inode. This completion | ||
| 581 | * queue synchronizes processes attempting to flush the in-core | ||
| 582 | * inode back to disk. | ||
| 583 | */ | ||
| 584 | static inline void xfs_iflock(xfs_inode_t *ip) | ||
| 585 | { | ||
| 586 | wait_for_completion(&ip->i_flush); | ||
| 587 | } | ||
| 588 | |||
| 589 | static inline int xfs_iflock_nowait(xfs_inode_t *ip) | ||
| 590 | { | ||
| 591 | return try_wait_for_completion(&ip->i_flush); | ||
| 592 | } | ||
| 593 | |||
| 594 | static inline void xfs_ifunlock(xfs_inode_t *ip) | ||
| 595 | { | ||
| 596 | complete(&ip->i_flush); | ||
| 597 | } | ||
| 598 | |||
| 576 | #endif /* __KERNEL__ */ | 599 | #endif /* __KERNEL__ */ |
| 577 | 600 | ||
| 578 | #endif /* __XFS_INODE_H__ */ | 601 | #endif /* __XFS_INODE_H__ */ |
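With i_flock replaced by the i_flush completion, the flush "lock" is taken by consuming the completion and released by completing it once the inode reaches the disk. A rough sketch of the resulting calling pattern, simplified with error handling omitted; the function name and flag choice are illustrative, not from the patch.

    static void example_flush_inode(xfs_inode_t *ip)
    {
    	xfs_ilock(ip, XFS_ILOCK_SHARED);
    	xfs_iflock(ip);		/* wait_for_completion(&ip->i_flush) */

    	/*
    	 * xfs_iflush() starts the write-out; the completion is signalled
    	 * through xfs_ifunlock() once the inode reaches the disk.
    	 */
    	xfs_iflush(ip, XFS_IFLUSH_ASYNC);

    	xfs_iunlock(ip, XFS_ILOCK_SHARED);
    }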
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 167b33f15772..97c7452e2620 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c | |||
| @@ -686,7 +686,7 @@ xfs_inode_item_unlock( | |||
| 686 | ASSERT(ip->i_d.di_nextents > 0); | 686 | ASSERT(ip->i_d.di_nextents > 0); |
| 687 | ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_DEXT); | 687 | ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_DEXT); |
| 688 | ASSERT(ip->i_df.if_bytes > 0); | 688 | ASSERT(ip->i_df.if_bytes > 0); |
| 689 | kmem_free(iip->ili_extents_buf, ip->i_df.if_bytes); | 689 | kmem_free(iip->ili_extents_buf); |
| 690 | iip->ili_extents_buf = NULL; | 690 | iip->ili_extents_buf = NULL; |
| 691 | } | 691 | } |
| 692 | if (iip->ili_aextents_buf != NULL) { | 692 | if (iip->ili_aextents_buf != NULL) { |
| @@ -694,7 +694,7 @@ xfs_inode_item_unlock( | |||
| 694 | ASSERT(ip->i_d.di_anextents > 0); | 694 | ASSERT(ip->i_d.di_anextents > 0); |
| 695 | ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_AEXT); | 695 | ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_AEXT); |
| 696 | ASSERT(ip->i_afp->if_bytes > 0); | 696 | ASSERT(ip->i_afp->if_bytes > 0); |
| 697 | kmem_free(iip->ili_aextents_buf, ip->i_afp->if_bytes); | 697 | kmem_free(iip->ili_aextents_buf); |
| 698 | iip->ili_aextents_buf = NULL; | 698 | iip->ili_aextents_buf = NULL; |
| 699 | } | 699 | } |
| 700 | 700 | ||
| @@ -779,11 +779,10 @@ xfs_inode_item_pushbuf( | |||
| 779 | ASSERT(iip->ili_push_owner == current_pid()); | 779 | ASSERT(iip->ili_push_owner == current_pid()); |
| 780 | 780 | ||
| 781 | /* | 781 | /* |
| 782 | * If flushlock isn't locked anymore, chances are that the | 782 | * If a flush is not in progress anymore, chances are that the |
| 783 | * inode flush completed and the inode was taken off the AIL. | 783 | * inode was taken off the AIL. So, just get out. |
| 784 | * So, just get out. | ||
| 785 | */ | 784 | */ |
| 786 | if (!issemalocked(&(ip->i_flock)) || | 785 | if (completion_done(&ip->i_flush) || |
| 787 | ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) { | 786 | ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) { |
| 788 | iip->ili_pushbuf_flag = 0; | 787 | iip->ili_pushbuf_flag = 0; |
| 789 | xfs_iunlock(ip, XFS_ILOCK_SHARED); | 788 | xfs_iunlock(ip, XFS_ILOCK_SHARED); |
| @@ -805,7 +804,7 @@ xfs_inode_item_pushbuf( | |||
| 805 | * If not, we can flush it async. | 804 | * If not, we can flush it async. |
| 806 | */ | 805 | */ |
| 807 | dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) && | 806 | dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) && |
| 808 | issemalocked(&(ip->i_flock))); | 807 | !completion_done(&ip->i_flush)); |
| 809 | iip->ili_pushbuf_flag = 0; | 808 | iip->ili_pushbuf_flag = 0; |
| 810 | xfs_iunlock(ip, XFS_ILOCK_SHARED); | 809 | xfs_iunlock(ip, XFS_ILOCK_SHARED); |
| 811 | xfs_buftrace("INODE ITEM PUSH", bp); | 810 | xfs_buftrace("INODE ITEM PUSH", bp); |
| @@ -858,7 +857,7 @@ xfs_inode_item_push( | |||
| 858 | ip = iip->ili_inode; | 857 | ip = iip->ili_inode; |
| 859 | 858 | ||
| 860 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); | 859 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); |
| 861 | ASSERT(issemalocked(&(ip->i_flock))); | 860 | ASSERT(!completion_done(&ip->i_flush)); |
| 862 | /* | 861 | /* |
| 863 | * Since we were able to lock the inode's flush lock and | 862 | * Since we were able to lock the inode's flush lock and |
| 864 | * we found it on the AIL, the inode must be dirty. This | 863 | * we found it on the AIL, the inode must be dirty. This |
| @@ -957,8 +956,7 @@ xfs_inode_item_destroy( | |||
| 957 | { | 956 | { |
| 958 | #ifdef XFS_TRANS_DEBUG | 957 | #ifdef XFS_TRANS_DEBUG |
| 959 | if (ip->i_itemp->ili_root_size != 0) { | 958 | if (ip->i_itemp->ili_root_size != 0) { |
| 960 | kmem_free(ip->i_itemp->ili_orig_root, | 959 | kmem_free(ip->i_itemp->ili_orig_root); |
| 961 | ip->i_itemp->ili_root_size); | ||
| 962 | } | 960 | } |
| 963 | #endif | 961 | #endif |
| 964 | kmem_zone_free(xfs_ili_zone, ip->i_itemp); | 962 | kmem_zone_free(xfs_ili_zone, ip->i_itemp); |
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 7edcde691d1a..67f22b2b44b3 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c | |||
| @@ -889,6 +889,16 @@ xfs_iomap_write_unwritten( | |||
| 889 | count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); | 889 | count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); |
| 890 | count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb); | 890 | count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb); |
| 891 | 891 | ||
| 892 | /* | ||
| 893 | * Reserve enough blocks in this transaction for two complete extent | ||
| 894 | * btree splits. We may be converting the middle part of an unwritten | ||
| 895 | * extent and in this case we will insert two new extents in the btree, | ||
| 896 | * each of which could cause a full split. | ||
| 897 | * | ||
| 898 | * This reservation amount will be used in the first call to | ||
| 899 | * xfs_bmbt_split() to select an AG with enough space to satisfy the | ||
| 900 | * rest of the operation. | ||
| 901 | */ | ||
| 892 | resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; | 902 | resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; |
| 893 | 903 | ||
| 894 | do { | 904 | do { |
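The new comment above explains the doubled reservation. A hedged illustration of why two splits are possible; the extent layout shown is assumed for illustration, not taken from the code.

    	/*
    	 * Converting the middle of an unwritten extent:
    	 *
    	 *   before:  [ unwritten ......................... ]
    	 *   after:   [ unwritten ][ written ][ unwritten ]
    	 *
    	 * Two extent records are inserted, and each insert may split the
    	 * extent btree all the way to the root, hence twice the single
    	 * worst-case reservation:
    	 */
    	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;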
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 419de15aeb43..cf6754a3c5b3 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c | |||
| @@ -59,7 +59,6 @@ xfs_bulkstat_one_iget( | |||
| 59 | { | 59 | { |
| 60 | xfs_icdinode_t *dic; /* dinode core info pointer */ | 60 | xfs_icdinode_t *dic; /* dinode core info pointer */ |
| 61 | xfs_inode_t *ip; /* incore inode pointer */ | 61 | xfs_inode_t *ip; /* incore inode pointer */ |
| 62 | bhv_vnode_t *vp; | ||
| 63 | int error; | 62 | int error; |
| 64 | 63 | ||
| 65 | error = xfs_iget(mp, NULL, ino, | 64 | error = xfs_iget(mp, NULL, ino, |
| @@ -72,7 +71,6 @@ xfs_bulkstat_one_iget( | |||
| 72 | ASSERT(ip != NULL); | 71 | ASSERT(ip != NULL); |
| 73 | ASSERT(ip->i_blkno != (xfs_daddr_t)0); | 72 | ASSERT(ip->i_blkno != (xfs_daddr_t)0); |
| 74 | 73 | ||
| 75 | vp = XFS_ITOV(ip); | ||
| 76 | dic = &ip->i_d; | 74 | dic = &ip->i_d; |
| 77 | 75 | ||
| 78 | /* xfs_iget returns the following without needing | 76 | /* xfs_iget returns the following without needing |
| @@ -85,7 +83,7 @@ xfs_bulkstat_one_iget( | |||
| 85 | buf->bs_uid = dic->di_uid; | 83 | buf->bs_uid = dic->di_uid; |
| 86 | buf->bs_gid = dic->di_gid; | 84 | buf->bs_gid = dic->di_gid; |
| 87 | buf->bs_size = dic->di_size; | 85 | buf->bs_size = dic->di_size; |
| 88 | vn_atime_to_bstime(vp, &buf->bs_atime); | 86 | vn_atime_to_bstime(VFS_I(ip), &buf->bs_atime); |
| 89 | buf->bs_mtime.tv_sec = dic->di_mtime.t_sec; | 87 | buf->bs_mtime.tv_sec = dic->di_mtime.t_sec; |
| 90 | buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec; | 88 | buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec; |
| 91 | buf->bs_ctime.tv_sec = dic->di_ctime.t_sec; | 89 | buf->bs_ctime.tv_sec = dic->di_ctime.t_sec; |
| @@ -257,7 +255,7 @@ xfs_bulkstat_one( | |||
| 257 | *ubused = error; | 255 | *ubused = error; |
| 258 | 256 | ||
| 259 | out_free: | 257 | out_free: |
| 260 | kmem_free(buf, sizeof(*buf)); | 258 | kmem_free(buf); |
| 261 | return error; | 259 | return error; |
| 262 | } | 260 | } |
| 263 | 261 | ||
| @@ -708,7 +706,7 @@ xfs_bulkstat( | |||
| 708 | /* | 706 | /* |
| 709 | * Done, we're either out of filesystem or space to put the data. | 707 | * Done, we're either out of filesystem or space to put the data. |
| 710 | */ | 708 | */ |
| 711 | kmem_free(irbuf, irbsize); | 709 | kmem_free(irbuf); |
| 712 | *ubcountp = ubelem; | 710 | *ubcountp = ubelem; |
| 713 | /* | 711 | /* |
| 714 | * Found some inodes, return them now and return the error next time. | 712 | * Found some inodes, return them now and return the error next time. |
| @@ -914,7 +912,7 @@ xfs_inumbers( | |||
| 914 | } | 912 | } |
| 915 | *lastino = XFS_AGINO_TO_INO(mp, agno, agino); | 913 | *lastino = XFS_AGINO_TO_INO(mp, agno, agino); |
| 916 | } | 914 | } |
| 917 | kmem_free(buffer, bcount * sizeof(*buffer)); | 915 | kmem_free(buffer); |
| 918 | if (cur) | 916 | if (cur) |
| 919 | xfs_btree_del_cursor(cur, (error ? XFS_BTREE_ERROR : | 917 | xfs_btree_del_cursor(cur, (error ? XFS_BTREE_ERROR : |
| 920 | XFS_BTREE_NOERROR)); | 918 | XFS_BTREE_NOERROR)); |
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index ad3d26ddfe31..503ea89e8b9a 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c | |||
| @@ -124,16 +124,27 @@ STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, | |||
| 124 | STATIC int xlog_iclogs_empty(xlog_t *log); | 124 | STATIC int xlog_iclogs_empty(xlog_t *log); |
| 125 | 125 | ||
| 126 | #if defined(XFS_LOG_TRACE) | 126 | #if defined(XFS_LOG_TRACE) |
| 127 | |||
| 128 | #define XLOG_TRACE_LOGGRANT_SIZE 2048 | ||
| 129 | #define XLOG_TRACE_ICLOG_SIZE 256 | ||
| 130 | |||
| 131 | void | ||
| 132 | xlog_trace_loggrant_alloc(xlog_t *log) | ||
| 133 | { | ||
| 134 | log->l_grant_trace = ktrace_alloc(XLOG_TRACE_LOGGRANT_SIZE, KM_NOFS); | ||
| 135 | } | ||
| 136 | |||
| 137 | void | ||
| 138 | xlog_trace_loggrant_dealloc(xlog_t *log) | ||
| 139 | { | ||
| 140 | ktrace_free(log->l_grant_trace); | ||
| 141 | } | ||
| 142 | |||
| 127 | void | 143 | void |
| 128 | xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string) | 144 | xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string) |
| 129 | { | 145 | { |
| 130 | unsigned long cnts; | 146 | unsigned long cnts; |
| 131 | 147 | ||
| 132 | if (!log->l_grant_trace) { | ||
| 133 | log->l_grant_trace = ktrace_alloc(2048, KM_NOSLEEP); | ||
| 134 | if (!log->l_grant_trace) | ||
| 135 | return; | ||
| 136 | } | ||
| 137 | /* ticket counts are 1 byte each */ | 148 | /* ticket counts are 1 byte each */ |
| 138 | cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8; | 149 | cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8; |
| 139 | 150 | ||
| @@ -157,10 +168,20 @@ xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string) | |||
| 157 | } | 168 | } |
| 158 | 169 | ||
| 159 | void | 170 | void |
| 171 | xlog_trace_iclog_alloc(xlog_in_core_t *iclog) | ||
| 172 | { | ||
| 173 | iclog->ic_trace = ktrace_alloc(XLOG_TRACE_ICLOG_SIZE, KM_NOFS); | ||
| 174 | } | ||
| 175 | |||
| 176 | void | ||
| 177 | xlog_trace_iclog_dealloc(xlog_in_core_t *iclog) | ||
| 178 | { | ||
| 179 | ktrace_free(iclog->ic_trace); | ||
| 180 | } | ||
| 181 | |||
| 182 | void | ||
| 160 | xlog_trace_iclog(xlog_in_core_t *iclog, uint state) | 183 | xlog_trace_iclog(xlog_in_core_t *iclog, uint state) |
| 161 | { | 184 | { |
| 162 | if (!iclog->ic_trace) | ||
| 163 | iclog->ic_trace = ktrace_alloc(256, KM_SLEEP); | ||
| 164 | ktrace_enter(iclog->ic_trace, | 185 | ktrace_enter(iclog->ic_trace, |
| 165 | (void *)((unsigned long)state), | 186 | (void *)((unsigned long)state), |
| 166 | (void *)((unsigned long)current_pid()), | 187 | (void *)((unsigned long)current_pid()), |
| @@ -170,8 +191,15 @@ xlog_trace_iclog(xlog_in_core_t *iclog, uint state) | |||
| 170 | (void *)NULL, (void *)NULL); | 191 | (void *)NULL, (void *)NULL); |
| 171 | } | 192 | } |
| 172 | #else | 193 | #else |
| 194 | |||
| 195 | #define xlog_trace_loggrant_alloc(log) | ||
| 196 | #define xlog_trace_loggrant_dealloc(log) | ||
| 173 | #define xlog_trace_loggrant(log,tic,string) | 197 | #define xlog_trace_loggrant(log,tic,string) |
| 198 | |||
| 199 | #define xlog_trace_iclog_alloc(iclog) | ||
| 200 | #define xlog_trace_iclog_dealloc(iclog) | ||
| 174 | #define xlog_trace_iclog(iclog,state) | 201 | #define xlog_trace_iclog(iclog,state) |
| 202 | |||
| 175 | #endif /* XFS_LOG_TRACE */ | 203 | #endif /* XFS_LOG_TRACE */ |
| 176 | 204 | ||
| 177 | 205 | ||
| @@ -226,20 +254,24 @@ xlog_grant_sub_space(struct log *log, int bytes) | |||
| 226 | static void | 254 | static void |
| 227 | xlog_grant_add_space_write(struct log *log, int bytes) | 255 | xlog_grant_add_space_write(struct log *log, int bytes) |
| 228 | { | 256 | { |
| 229 | log->l_grant_write_bytes += bytes; | 257 | int tmp = log->l_logsize - log->l_grant_write_bytes; |
| 230 | if (log->l_grant_write_bytes > log->l_logsize) { | 258 | if (tmp > bytes) |
| 231 | log->l_grant_write_bytes -= log->l_logsize; | 259 | log->l_grant_write_bytes += bytes; |
| 260 | else { | ||
| 232 | log->l_grant_write_cycle++; | 261 | log->l_grant_write_cycle++; |
| 262 | log->l_grant_write_bytes = bytes - tmp; | ||
| 233 | } | 263 | } |
| 234 | } | 264 | } |
| 235 | 265 | ||
| 236 | static void | 266 | static void |
| 237 | xlog_grant_add_space_reserve(struct log *log, int bytes) | 267 | xlog_grant_add_space_reserve(struct log *log, int bytes) |
| 238 | { | 268 | { |
| 239 | log->l_grant_reserve_bytes += bytes; | 269 | int tmp = log->l_logsize - log->l_grant_reserve_bytes; |
| 240 | if (log->l_grant_reserve_bytes > log->l_logsize) { | 270 | if (tmp > bytes) |
| 241 | log->l_grant_reserve_bytes -= log->l_logsize; | 271 | log->l_grant_reserve_bytes += bytes; |
| 272 | else { | ||
| 242 | log->l_grant_reserve_cycle++; | 273 | log->l_grant_reserve_cycle++; |
| 274 | log->l_grant_reserve_bytes = bytes - tmp; | ||
| 243 | } | 275 | } |
| 244 | } | 276 | } |
| 245 | 277 | ||
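The reworked grant-head helpers above compute the remaining space before adding, so the head wraps correctly even when it lands exactly on the end of the log and the byte counter is never pushed past l_logsize first. A standalone sketch of the same arithmetic with placeholder names, plus a worked example:

    static void example_grant_add(int logsize, int *cycle, int *bytes, int add)
    {
    	int	space_left = logsize - *bytes;

    	if (space_left > add) {
    		*bytes += add;		/* head stays within this cycle */
    	} else {
    		(*cycle)++;		/* head wrapped past the log end */
    		*bytes = add - space_left;
    	}
    	/*
    	 * Worked example with logsize = 1000:
    	 *   bytes = 900, add = 300  ->  cycle++, bytes = 200
    	 *   bytes = 900, add = 100  ->  cycle++, bytes = 0
    	 * The old add-then-compare code left bytes == logsize in the
    	 * second case without advancing the cycle.
    	 */
    }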
| @@ -332,15 +364,12 @@ xfs_log_done(xfs_mount_t *mp, | |||
| 332 | } else { | 364 | } else { |
| 333 | xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)"); | 365 | xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)"); |
| 334 | xlog_regrant_reserve_log_space(log, ticket); | 366 | xlog_regrant_reserve_log_space(log, ticket); |
| 335 | } | 367 | /* If this ticket was a permanent reservation and we aren't |
| 336 | 368 | * trying to release it, reset the inited flags; so next time | |
| 337 | /* If this ticket was a permanent reservation and we aren't | 369 | * we write, a start record will be written out. |
| 338 | * trying to release it, reset the inited flags; so next time | 370 | */ |
| 339 | * we write, a start record will be written out. | ||
| 340 | */ | ||
| 341 | if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) && | ||
| 342 | (flags & XFS_LOG_REL_PERM_RESERV) == 0) | ||
| 343 | ticket->t_flags |= XLOG_TIC_INITED; | 371 | ticket->t_flags |= XLOG_TIC_INITED; |
| 372 | } | ||
| 344 | 373 | ||
| 345 | return lsn; | 374 | return lsn; |
| 346 | } /* xfs_log_done */ | 375 | } /* xfs_log_done */ |
| @@ -353,11 +382,11 @@ xfs_log_done(xfs_mount_t *mp, | |||
| 353 | * Asynchronous forces are implemented by setting the WANT_SYNC | 382 | * Asynchronous forces are implemented by setting the WANT_SYNC |
| 354 | * bit in the appropriate in-core log and then returning. | 383 | * bit in the appropriate in-core log and then returning. |
| 355 | * | 384 | * |
| 356 | * Synchronous forces are implemented with a semaphore. All callers | 385 | * Synchronous forces are implemented with a signal variable. All callers |
| 357 | * to force a given lsn to disk will wait on a semaphore attached to the | 386 | * to force a given lsn to disk will wait on the sv attached to the |
| 358 | * specific in-core log. When the given in-core log finally completes its | 387 | * specific in-core log. When the given in-core log finally completes its |
| 359 | * write to disk, that thread will wake up all threads waiting on the | 388 | * write to disk, that thread will wake up all threads waiting on the |
| 360 | * semaphore. | 389 | * sv. |
| 361 | */ | 390 | */ |
| 362 | int | 391 | int |
| 363 | _xfs_log_force( | 392 | _xfs_log_force( |
| @@ -584,12 +613,12 @@ error: | |||
| 584 | * mp - ubiquitous xfs mount point structure | 613 | * mp - ubiquitous xfs mount point structure |
| 585 | */ | 614 | */ |
| 586 | int | 615 | int |
| 587 | xfs_log_mount_finish(xfs_mount_t *mp, int mfsi_flags) | 616 | xfs_log_mount_finish(xfs_mount_t *mp) |
| 588 | { | 617 | { |
| 589 | int error; | 618 | int error; |
| 590 | 619 | ||
| 591 | if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) | 620 | if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) |
| 592 | error = xlog_recover_finish(mp->m_log, mfsi_flags); | 621 | error = xlog_recover_finish(mp->m_log); |
| 593 | else { | 622 | else { |
| 594 | error = 0; | 623 | error = 0; |
| 595 | ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); | 624 | ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); |
| @@ -703,7 +732,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) | |||
| 703 | if (!(iclog->ic_state == XLOG_STATE_ACTIVE || | 732 | if (!(iclog->ic_state == XLOG_STATE_ACTIVE || |
| 704 | iclog->ic_state == XLOG_STATE_DIRTY)) { | 733 | iclog->ic_state == XLOG_STATE_DIRTY)) { |
| 705 | if (!XLOG_FORCED_SHUTDOWN(log)) { | 734 | if (!XLOG_FORCED_SHUTDOWN(log)) { |
| 706 | sv_wait(&iclog->ic_forcesema, PMEM, | 735 | sv_wait(&iclog->ic_force_wait, PMEM, |
| 707 | &log->l_icloglock, s); | 736 | &log->l_icloglock, s); |
| 708 | } else { | 737 | } else { |
| 709 | spin_unlock(&log->l_icloglock); | 738 | spin_unlock(&log->l_icloglock); |
| @@ -744,7 +773,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) | |||
| 744 | || iclog->ic_state == XLOG_STATE_DIRTY | 773 | || iclog->ic_state == XLOG_STATE_DIRTY |
| 745 | || iclog->ic_state == XLOG_STATE_IOERROR) ) { | 774 | || iclog->ic_state == XLOG_STATE_IOERROR) ) { |
| 746 | 775 | ||
| 747 | sv_wait(&iclog->ic_forcesema, PMEM, | 776 | sv_wait(&iclog->ic_force_wait, PMEM, |
| 748 | &log->l_icloglock, s); | 777 | &log->l_icloglock, s); |
| 749 | } else { | 778 | } else { |
| 750 | spin_unlock(&log->l_icloglock); | 779 | spin_unlock(&log->l_icloglock); |
| @@ -834,7 +863,7 @@ xfs_log_move_tail(xfs_mount_t *mp, | |||
| 834 | break; | 863 | break; |
| 835 | tail_lsn = 0; | 864 | tail_lsn = 0; |
| 836 | free_bytes -= tic->t_unit_res; | 865 | free_bytes -= tic->t_unit_res; |
| 837 | sv_signal(&tic->t_sema); | 866 | sv_signal(&tic->t_wait); |
| 838 | tic = tic->t_next; | 867 | tic = tic->t_next; |
| 839 | } while (tic != log->l_write_headq); | 868 | } while (tic != log->l_write_headq); |
| 840 | } | 869 | } |
| @@ -855,7 +884,7 @@ xfs_log_move_tail(xfs_mount_t *mp, | |||
| 855 | break; | 884 | break; |
| 856 | tail_lsn = 0; | 885 | tail_lsn = 0; |
| 857 | free_bytes -= need_bytes; | 886 | free_bytes -= need_bytes; |
| 858 | sv_signal(&tic->t_sema); | 887 | sv_signal(&tic->t_wait); |
| 859 | tic = tic->t_next; | 888 | tic = tic->t_next; |
| 860 | } while (tic != log->l_reserve_headq); | 889 | } while (tic != log->l_reserve_headq); |
| 861 | } | 890 | } |
| @@ -1008,7 +1037,7 @@ xlog_iodone(xfs_buf_t *bp) | |||
| 1008 | * layer, it means the underlying device no longer supports | 1037 | * layer, it means the underlying device no longer supports |
| 1009 | * barrier I/O. Warn loudly and turn off barriers. | 1038 | * barrier I/O. Warn loudly and turn off barriers. |
| 1010 | */ | 1039 | */ |
| 1011 | if ((l->l_mp->m_flags & XFS_MOUNT_BARRIER) && !XFS_BUF_ORDERED(bp)) { | 1040 | if ((l->l_mp->m_flags & XFS_MOUNT_BARRIER) && !XFS_BUF_ISORDERED(bp)) { |
| 1012 | l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER; | 1041 | l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER; |
| 1013 | xfs_fs_cmn_err(CE_WARN, l->l_mp, | 1042 | xfs_fs_cmn_err(CE_WARN, l->l_mp, |
| 1014 | "xlog_iodone: Barriers are no longer supported" | 1043 | "xlog_iodone: Barriers are no longer supported" |
| @@ -1228,8 +1257,9 @@ xlog_alloc_log(xfs_mount_t *mp, | |||
| 1228 | 1257 | ||
| 1229 | spin_lock_init(&log->l_icloglock); | 1258 | spin_lock_init(&log->l_icloglock); |
| 1230 | spin_lock_init(&log->l_grant_lock); | 1259 | spin_lock_init(&log->l_grant_lock); |
| 1231 | initnsema(&log->l_flushsema, 0, "ic-flush"); | 1260 | sv_init(&log->l_flush_wait, 0, "flush_wait"); |
| 1232 | 1261 | ||
| 1262 | xlog_trace_loggrant_alloc(log); | ||
| 1233 | /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ | 1263 | /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ |
| 1234 | ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); | 1264 | ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); |
| 1235 | 1265 | ||
| @@ -1281,8 +1311,10 @@ xlog_alloc_log(xfs_mount_t *mp, | |||
| 1281 | 1311 | ||
| 1282 | ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); | 1312 | ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); |
| 1283 | ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); | 1313 | ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); |
| 1284 | sv_init(&iclog->ic_forcesema, SV_DEFAULT, "iclog-force"); | 1314 | sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force"); |
| 1285 | sv_init(&iclog->ic_writesema, SV_DEFAULT, "iclog-write"); | 1315 | sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write"); |
| 1316 | |||
| 1317 | xlog_trace_iclog_alloc(iclog); | ||
| 1286 | 1318 | ||
| 1287 | iclogp = &iclog->ic_next; | 1319 | iclogp = &iclog->ic_next; |
| 1288 | } | 1320 | } |
| @@ -1561,33 +1593,21 @@ xlog_dealloc_log(xlog_t *log) | |||
| 1561 | 1593 | ||
| 1562 | iclog = log->l_iclog; | 1594 | iclog = log->l_iclog; |
| 1563 | for (i=0; i<log->l_iclog_bufs; i++) { | 1595 | for (i=0; i<log->l_iclog_bufs; i++) { |
| 1564 | sv_destroy(&iclog->ic_forcesema); | 1596 | sv_destroy(&iclog->ic_force_wait); |
| 1565 | sv_destroy(&iclog->ic_writesema); | 1597 | sv_destroy(&iclog->ic_write_wait); |
| 1566 | xfs_buf_free(iclog->ic_bp); | 1598 | xfs_buf_free(iclog->ic_bp); |
| 1567 | #ifdef XFS_LOG_TRACE | 1599 | xlog_trace_iclog_dealloc(iclog); |
| 1568 | if (iclog->ic_trace != NULL) { | ||
| 1569 | ktrace_free(iclog->ic_trace); | ||
| 1570 | } | ||
| 1571 | #endif | ||
| 1572 | next_iclog = iclog->ic_next; | 1600 | next_iclog = iclog->ic_next; |
| 1573 | kmem_free(iclog, sizeof(xlog_in_core_t)); | 1601 | kmem_free(iclog); |
| 1574 | iclog = next_iclog; | 1602 | iclog = next_iclog; |
| 1575 | } | 1603 | } |
| 1576 | freesema(&log->l_flushsema); | ||
| 1577 | spinlock_destroy(&log->l_icloglock); | 1604 | spinlock_destroy(&log->l_icloglock); |
| 1578 | spinlock_destroy(&log->l_grant_lock); | 1605 | spinlock_destroy(&log->l_grant_lock); |
| 1579 | 1606 | ||
| 1580 | xfs_buf_free(log->l_xbuf); | 1607 | xfs_buf_free(log->l_xbuf); |
| 1581 | #ifdef XFS_LOG_TRACE | 1608 | xlog_trace_loggrant_dealloc(log); |
| 1582 | if (log->l_trace != NULL) { | ||
| 1583 | ktrace_free(log->l_trace); | ||
| 1584 | } | ||
| 1585 | if (log->l_grant_trace != NULL) { | ||
| 1586 | ktrace_free(log->l_grant_trace); | ||
| 1587 | } | ||
| 1588 | #endif | ||
| 1589 | log->l_mp->m_log = NULL; | 1609 | log->l_mp->m_log = NULL; |
| 1590 | kmem_free(log, sizeof(xlog_t)); | 1610 | kmem_free(log); |
| 1591 | } /* xlog_dealloc_log */ | 1611 | } /* xlog_dealloc_log */ |
| 1592 | 1612 | ||
| 1593 | /* | 1613 | /* |
| @@ -1973,7 +1993,7 @@ xlog_write(xfs_mount_t * mp, | |||
| 1973 | /* Clean iclogs starting from the head. This ordering must be | 1993 | /* Clean iclogs starting from the head. This ordering must be |
| 1974 | * maintained, so an iclog doesn't become ACTIVE beyond one that | 1994 | * maintained, so an iclog doesn't become ACTIVE beyond one that |
| 1975 | * is SYNCING. This is also required to maintain the notion that we use | 1995 | * is SYNCING. This is also required to maintain the notion that we use |
| 1976 | * a counting semaphore to hold off would be writers to the log when every | 1996 | * an ordered wait queue to hold off would-be writers to the log when every |
| 1977 | * iclog is trying to sync to disk. | 1997 | * iclog is trying to sync to disk. |
| 1978 | * | 1998 | * |
| 1979 | * State Change: DIRTY -> ACTIVE | 1999 | * State Change: DIRTY -> ACTIVE |
| @@ -2097,6 +2117,7 @@ xlog_state_do_callback( | |||
| 2097 | int funcdidcallbacks; /* flag: function did callbacks */ | 2117 | int funcdidcallbacks; /* flag: function did callbacks */ |
| 2098 | int repeats; /* for issuing console warnings if | 2118 | int repeats; /* for issuing console warnings if |
| 2099 | * looping too many times */ | 2119 | * looping too many times */ |
| 2120 | int wake = 0; | ||
| 2100 | 2121 | ||
| 2101 | spin_lock(&log->l_icloglock); | 2122 | spin_lock(&log->l_icloglock); |
| 2102 | first_iclog = iclog = log->l_iclog; | 2123 | first_iclog = iclog = log->l_iclog; |
| @@ -2236,7 +2257,7 @@ xlog_state_do_callback( | |||
| 2236 | xlog_state_clean_log(log); | 2257 | xlog_state_clean_log(log); |
| 2237 | 2258 | ||
| 2238 | /* wake up threads waiting in xfs_log_force() */ | 2259 | /* wake up threads waiting in xfs_log_force() */ |
| 2239 | sv_broadcast(&iclog->ic_forcesema); | 2260 | sv_broadcast(&iclog->ic_force_wait); |
| 2240 | 2261 | ||
| 2241 | iclog = iclog->ic_next; | 2262 | iclog = iclog->ic_next; |
| 2242 | } while (first_iclog != iclog); | 2263 | } while (first_iclog != iclog); |
| @@ -2278,15 +2299,13 @@ xlog_state_do_callback( | |||
| 2278 | } | 2299 | } |
| 2279 | #endif | 2300 | #endif |
| 2280 | 2301 | ||
| 2281 | flushcnt = 0; | 2302 | if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) |
| 2282 | if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) { | 2303 | wake = 1; |
| 2283 | flushcnt = log->l_flushcnt; | ||
| 2284 | log->l_flushcnt = 0; | ||
| 2285 | } | ||
| 2286 | spin_unlock(&log->l_icloglock); | 2304 | spin_unlock(&log->l_icloglock); |
| 2287 | while (flushcnt--) | 2305 | |
| 2288 | vsema(&log->l_flushsema); | 2306 | if (wake) |
| 2289 | } /* xlog_state_do_callback */ | 2307 | sv_broadcast(&log->l_flush_wait); |
| 2308 | } | ||
| 2290 | 2309 | ||
| 2291 | 2310 | ||
| 2292 | /* | 2311 | /* |
| @@ -2300,8 +2319,7 @@ xlog_state_do_callback( | |||
| 2300 | * the second completion goes through. | 2319 | * the second completion goes through. |
| 2301 | * | 2320 | * |
| 2302 | * Callbacks could take time, so they are done outside the scope of the | 2321 | * Callbacks could take time, so they are done outside the scope of the |
| 2303 | * global state machine log lock. Assume that the calls to cvsema won't | 2322 | * global state machine log lock. |
| 2304 | * take a long time. At least we know it won't sleep. | ||
| 2305 | */ | 2323 | */ |
| 2306 | STATIC void | 2324 | STATIC void |
| 2307 | xlog_state_done_syncing( | 2325 | xlog_state_done_syncing( |
| @@ -2337,7 +2355,7 @@ xlog_state_done_syncing( | |||
| 2337 | * iclog buffer, we wake them all, one will get to do the | 2355 | * iclog buffer, we wake them all, one will get to do the |
| 2338 | * I/O, the others get to wait for the result. | 2356 | * I/O, the others get to wait for the result. |
| 2339 | */ | 2357 | */ |
| 2340 | sv_broadcast(&iclog->ic_writesema); | 2358 | sv_broadcast(&iclog->ic_write_wait); |
| 2341 | spin_unlock(&log->l_icloglock); | 2359 | spin_unlock(&log->l_icloglock); |
| 2342 | xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ | 2360 | xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ |
| 2343 | } /* xlog_state_done_syncing */ | 2361 | } /* xlog_state_done_syncing */ |
| @@ -2345,11 +2363,9 @@ xlog_state_done_syncing( | |||
| 2345 | 2363 | ||
| 2346 | /* | 2364 | /* |
| 2347 | * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must | 2365 | * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must |
| 2348 | * sleep. The flush semaphore is set to the number of in-core buffers and | 2366 | * sleep. We wait on the flush queue on the head iclog as that should be |
| 2349 | * decremented around disk syncing. Therefore, if all buffers are syncing, | 2367 | * the first iclog to complete flushing. Hence if all iclogs are syncing, |
| 2350 | * this semaphore will cause new writes to sleep until a sync completes. | 2368 | * we will wait here and all new writes will sleep until a sync completes. |
| 2351 | * Otherwise, this code just does p() followed by v(). This approximates | ||
| 2352 | * a sleep/wakeup except we can't race. | ||
| 2353 | * | 2369 | * |
| 2354 | * The in-core logs are used in a circular fashion. They are not used | 2370 | * The in-core logs are used in a circular fashion. They are not used |
| 2355 | * out-of-order even when an iclog past the head is free. | 2371 | * out-of-order even when an iclog past the head is free. |
| @@ -2384,16 +2400,15 @@ restart: | |||
| 2384 | } | 2400 | } |
| 2385 | 2401 | ||
| 2386 | iclog = log->l_iclog; | 2402 | iclog = log->l_iclog; |
| 2387 | if (! (iclog->ic_state == XLOG_STATE_ACTIVE)) { | 2403 | if (iclog->ic_state != XLOG_STATE_ACTIVE) { |
| 2388 | log->l_flushcnt++; | ||
| 2389 | spin_unlock(&log->l_icloglock); | ||
| 2390 | xlog_trace_iclog(iclog, XLOG_TRACE_SLEEP_FLUSH); | 2404 | xlog_trace_iclog(iclog, XLOG_TRACE_SLEEP_FLUSH); |
| 2391 | XFS_STATS_INC(xs_log_noiclogs); | 2405 | XFS_STATS_INC(xs_log_noiclogs); |
| 2392 | /* Ensure that log writes happen */ | 2406 | |
| 2393 | psema(&log->l_flushsema, PINOD); | 2407 | /* Wait for log writes to have flushed */ |
| 2408 | sv_wait(&log->l_flush_wait, 0, &log->l_icloglock, 0); | ||
| 2394 | goto restart; | 2409 | goto restart; |
| 2395 | } | 2410 | } |
| 2396 | ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); | 2411 | |
| 2397 | head = &iclog->ic_header; | 2412 | head = &iclog->ic_header; |
| 2398 | 2413 | ||
| 2399 | atomic_inc(&iclog->ic_refcnt); /* prevents sync */ | 2414 | atomic_inc(&iclog->ic_refcnt); /* prevents sync */ |
| @@ -2507,7 +2522,7 @@ xlog_grant_log_space(xlog_t *log, | |||
| 2507 | goto error_return; | 2522 | goto error_return; |
| 2508 | 2523 | ||
| 2509 | XFS_STATS_INC(xs_sleep_logspace); | 2524 | XFS_STATS_INC(xs_sleep_logspace); |
| 2510 | sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s); | 2525 | sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); |
| 2511 | /* | 2526 | /* |
| 2512 | * If we got an error, and the filesystem is shutting down, | 2527 | * If we got an error, and the filesystem is shutting down, |
| 2513 | * we'll catch it down below. So just continue... | 2528 | * we'll catch it down below. So just continue... |
| @@ -2533,7 +2548,7 @@ redo: | |||
| 2533 | xlog_trace_loggrant(log, tic, | 2548 | xlog_trace_loggrant(log, tic, |
| 2534 | "xlog_grant_log_space: sleep 2"); | 2549 | "xlog_grant_log_space: sleep 2"); |
| 2535 | XFS_STATS_INC(xs_sleep_logspace); | 2550 | XFS_STATS_INC(xs_sleep_logspace); |
| 2536 | sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s); | 2551 | sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); |
| 2537 | 2552 | ||
| 2538 | if (XLOG_FORCED_SHUTDOWN(log)) { | 2553 | if (XLOG_FORCED_SHUTDOWN(log)) { |
| 2539 | spin_lock(&log->l_grant_lock); | 2554 | spin_lock(&log->l_grant_lock); |
| @@ -2632,7 +2647,7 @@ xlog_regrant_write_log_space(xlog_t *log, | |||
| 2632 | if (free_bytes < ntic->t_unit_res) | 2647 | if (free_bytes < ntic->t_unit_res) |
| 2633 | break; | 2648 | break; |
| 2634 | free_bytes -= ntic->t_unit_res; | 2649 | free_bytes -= ntic->t_unit_res; |
| 2635 | sv_signal(&ntic->t_sema); | 2650 | sv_signal(&ntic->t_wait); |
| 2636 | ntic = ntic->t_next; | 2651 | ntic = ntic->t_next; |
| 2637 | } while (ntic != log->l_write_headq); | 2652 | } while (ntic != log->l_write_headq); |
| 2638 | 2653 | ||
| @@ -2643,7 +2658,7 @@ xlog_regrant_write_log_space(xlog_t *log, | |||
| 2643 | xlog_trace_loggrant(log, tic, | 2658 | xlog_trace_loggrant(log, tic, |
| 2644 | "xlog_regrant_write_log_space: sleep 1"); | 2659 | "xlog_regrant_write_log_space: sleep 1"); |
| 2645 | XFS_STATS_INC(xs_sleep_logspace); | 2660 | XFS_STATS_INC(xs_sleep_logspace); |
| 2646 | sv_wait(&tic->t_sema, PINOD|PLTWAIT, | 2661 | sv_wait(&tic->t_wait, PINOD|PLTWAIT, |
| 2647 | &log->l_grant_lock, s); | 2662 | &log->l_grant_lock, s); |
| 2648 | 2663 | ||
| 2649 | /* If we're shutting down, this tic is already | 2664 | /* If we're shutting down, this tic is already |
| @@ -2672,7 +2687,7 @@ redo: | |||
| 2672 | if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) | 2687 | if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) |
| 2673 | xlog_ins_ticketq(&log->l_write_headq, tic); | 2688 | xlog_ins_ticketq(&log->l_write_headq, tic); |
| 2674 | XFS_STATS_INC(xs_sleep_logspace); | 2689 | XFS_STATS_INC(xs_sleep_logspace); |
| 2675 | sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s); | 2690 | sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); |
| 2676 | 2691 | ||
| 2677 | /* If we're shutting down, this tic is already off the queue */ | 2692 | /* If we're shutting down, this tic is already off the queue */ |
| 2678 | if (XLOG_FORCED_SHUTDOWN(log)) { | 2693 | if (XLOG_FORCED_SHUTDOWN(log)) { |
| @@ -2915,7 +2930,7 @@ xlog_state_switch_iclogs(xlog_t *log, | |||
| 2915 | * 2. the current iclog is dirty, and the previous iclog is in the | 2930 | * 2. the current iclog is dirty, and the previous iclog is in the |
| 2916 | * active or dirty state. | 2931 | * active or dirty state. |
| 2917 | * | 2932 | * |
| 2918 | * We may sleep (call psema) if: | 2933 | * We may sleep if: |
| 2919 | * | 2934 | * |
| 2920 | * 1. the current iclog is not in the active nor dirty state. | 2935 | * 1. the current iclog is not in the active nor dirty state. |
| 2921 | * 2. the current iclog is dirty, and the previous iclog is not in the | 2936 | * 2. the current iclog is dirty, and the previous iclog is not in the |
| @@ -3012,7 +3027,7 @@ maybe_sleep: | |||
| 3012 | return XFS_ERROR(EIO); | 3027 | return XFS_ERROR(EIO); |
| 3013 | } | 3028 | } |
| 3014 | XFS_STATS_INC(xs_log_force_sleep); | 3029 | XFS_STATS_INC(xs_log_force_sleep); |
| 3015 | sv_wait(&iclog->ic_forcesema, PINOD, &log->l_icloglock, s); | 3030 | sv_wait(&iclog->ic_force_wait, PINOD, &log->l_icloglock, s); |
| 3016 | /* | 3031 | /* |
| 3017 | * No need to grab the log lock here since we're | 3032 | * No need to grab the log lock here since we're |
| 3018 | * only deciding whether or not to return EIO | 3033 | * only deciding whether or not to return EIO |
| @@ -3095,7 +3110,7 @@ try_again: | |||
| 3095 | XLOG_STATE_SYNCING))) { | 3110 | XLOG_STATE_SYNCING))) { |
| 3096 | ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); | 3111 | ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); |
| 3097 | XFS_STATS_INC(xs_log_force_sleep); | 3112 | XFS_STATS_INC(xs_log_force_sleep); |
| 3098 | sv_wait(&iclog->ic_prev->ic_writesema, PSWP, | 3113 | sv_wait(&iclog->ic_prev->ic_write_wait, PSWP, |
| 3099 | &log->l_icloglock, s); | 3114 | &log->l_icloglock, s); |
| 3100 | *log_flushed = 1; | 3115 | *log_flushed = 1; |
| 3101 | already_slept = 1; | 3116 | already_slept = 1; |
| @@ -3115,7 +3130,7 @@ try_again: | |||
| 3115 | !(iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) { | 3130 | !(iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) { |
| 3116 | 3131 | ||
| 3117 | /* | 3132 | /* |
| 3118 | * Don't wait on the forcesema if we know that we've | 3133 | * Don't wait on completion if we know that we've |
| 3119 | * gotten a log write error. | 3134 | * gotten a log write error. |
| 3120 | */ | 3135 | */ |
| 3121 | if (iclog->ic_state & XLOG_STATE_IOERROR) { | 3136 | if (iclog->ic_state & XLOG_STATE_IOERROR) { |
| @@ -3123,7 +3138,7 @@ try_again: | |||
| 3123 | return XFS_ERROR(EIO); | 3138 | return XFS_ERROR(EIO); |
| 3124 | } | 3139 | } |
| 3125 | XFS_STATS_INC(xs_log_force_sleep); | 3140 | XFS_STATS_INC(xs_log_force_sleep); |
| 3126 | sv_wait(&iclog->ic_forcesema, PSWP, &log->l_icloglock, s); | 3141 | sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s); |
| 3127 | /* | 3142 | /* |
| 3128 | * No need to grab the log lock here since we're | 3143 | * No need to grab the log lock here since we're |
| 3129 | * only deciding whether or not to return EIO | 3144 | * only deciding whether or not to return EIO |
| @@ -3179,7 +3194,7 @@ STATIC void | |||
| 3179 | xlog_ticket_put(xlog_t *log, | 3194 | xlog_ticket_put(xlog_t *log, |
| 3180 | xlog_ticket_t *ticket) | 3195 | xlog_ticket_t *ticket) |
| 3181 | { | 3196 | { |
| 3182 | sv_destroy(&ticket->t_sema); | 3197 | sv_destroy(&ticket->t_wait); |
| 3183 | kmem_zone_free(xfs_log_ticket_zone, ticket); | 3198 | kmem_zone_free(xfs_log_ticket_zone, ticket); |
| 3184 | } /* xlog_ticket_put */ | 3199 | } /* xlog_ticket_put */ |
| 3185 | 3200 | ||
| @@ -3269,7 +3284,7 @@ xlog_ticket_get(xlog_t *log, | |||
| 3269 | tic->t_trans_type = 0; | 3284 | tic->t_trans_type = 0; |
| 3270 | if (xflags & XFS_LOG_PERM_RESERV) | 3285 | if (xflags & XFS_LOG_PERM_RESERV) |
| 3271 | tic->t_flags |= XLOG_TIC_PERM_RESERV; | 3286 | tic->t_flags |= XLOG_TIC_PERM_RESERV; |
| 3272 | sv_init(&(tic->t_sema), SV_DEFAULT, "logtick"); | 3287 | sv_init(&(tic->t_wait), SV_DEFAULT, "logtick"); |
| 3273 | 3288 | ||
| 3274 | xlog_tic_reset_res(tic); | 3289 | xlog_tic_reset_res(tic); |
| 3275 | 3290 | ||
| @@ -3556,14 +3571,14 @@ xfs_log_force_umount( | |||
| 3556 | */ | 3571 | */ |
| 3557 | if ((tic = log->l_reserve_headq)) { | 3572 | if ((tic = log->l_reserve_headq)) { |
| 3558 | do { | 3573 | do { |
| 3559 | sv_signal(&tic->t_sema); | 3574 | sv_signal(&tic->t_wait); |
| 3560 | tic = tic->t_next; | 3575 | tic = tic->t_next; |
| 3561 | } while (tic != log->l_reserve_headq); | 3576 | } while (tic != log->l_reserve_headq); |
| 3562 | } | 3577 | } |
| 3563 | 3578 | ||
| 3564 | if ((tic = log->l_write_headq)) { | 3579 | if ((tic = log->l_write_headq)) { |
| 3565 | do { | 3580 | do { |
| 3566 | sv_signal(&tic->t_sema); | 3581 | sv_signal(&tic->t_wait); |
| 3567 | tic = tic->t_next; | 3582 | tic = tic->t_next; |
| 3568 | } while (tic != log->l_write_headq); | 3583 | } while (tic != log->l_write_headq); |
| 3569 | } | 3584 | } |
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index d1d678ecb63e..d47b91f10822 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h | |||
| @@ -149,7 +149,7 @@ int xfs_log_mount(struct xfs_mount *mp, | |||
| 149 | struct xfs_buftarg *log_target, | 149 | struct xfs_buftarg *log_target, |
| 150 | xfs_daddr_t start_block, | 150 | xfs_daddr_t start_block, |
| 151 | int num_bblocks); | 151 | int num_bblocks); |
| 152 | int xfs_log_mount_finish(struct xfs_mount *mp, int); | 152 | int xfs_log_mount_finish(struct xfs_mount *mp); |
| 153 | void xfs_log_move_tail(struct xfs_mount *mp, | 153 | void xfs_log_move_tail(struct xfs_mount *mp, |
| 154 | xfs_lsn_t tail_lsn); | 154 | xfs_lsn_t tail_lsn); |
| 155 | int xfs_log_notify(struct xfs_mount *mp, | 155 | int xfs_log_notify(struct xfs_mount *mp, |
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 8952a392b5f3..e7d8f84443fa 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h | |||
| @@ -241,7 +241,7 @@ typedef struct xlog_res { | |||
| 241 | } xlog_res_t; | 241 | } xlog_res_t; |
| 242 | 242 | ||
| 243 | typedef struct xlog_ticket { | 243 | typedef struct xlog_ticket { |
| 244 | sv_t t_sema; /* sleep on this semaphore : 20 */ | 244 | sv_t t_wait; /* ticket wait queue : 20 */ |
| 245 | struct xlog_ticket *t_next; /* :4|8 */ | 245 | struct xlog_ticket *t_next; /* :4|8 */ |
| 246 | struct xlog_ticket *t_prev; /* :4|8 */ | 246 | struct xlog_ticket *t_prev; /* :4|8 */ |
| 247 | xlog_tid_t t_tid; /* transaction identifier : 4 */ | 247 | xlog_tid_t t_tid; /* transaction identifier : 4 */ |
| @@ -314,7 +314,7 @@ typedef struct xlog_rec_ext_header { | |||
| 314 | * xlog_rec_header_t into the reserved space. | 314 | * xlog_rec_header_t into the reserved space. |
| 315 | * - ic_data follows, so a write to disk can start at the beginning of | 315 | * - ic_data follows, so a write to disk can start at the beginning of |
| 316 | * the iclog. | 316 | * the iclog. |
| 317 | * - ic_forcesema is used to implement synchronous forcing of the iclog to disk. | 317 | * - ic_force_wait is used to implement synchronous forcing of the iclog to disk. |
| 318 | * - ic_next is the pointer to the next iclog in the ring. | 318 | * - ic_next is the pointer to the next iclog in the ring. |
| 319 | * - ic_bp is a pointer to the buffer used to write this incore log to disk. | 319 | * - ic_bp is a pointer to the buffer used to write this incore log to disk. |
| 320 | * - ic_log is a pointer back to the global log structure. | 320 | * - ic_log is a pointer back to the global log structure. |
| @@ -339,8 +339,8 @@ typedef struct xlog_rec_ext_header { | |||
| 339 | * and move everything else out to subsequent cachelines. | 339 | * and move everything else out to subsequent cachelines. |
| 340 | */ | 340 | */ |
| 341 | typedef struct xlog_iclog_fields { | 341 | typedef struct xlog_iclog_fields { |
| 342 | sv_t ic_forcesema; | 342 | sv_t ic_force_wait; |
| 343 | sv_t ic_writesema; | 343 | sv_t ic_write_wait; |
| 344 | struct xlog_in_core *ic_next; | 344 | struct xlog_in_core *ic_next; |
| 345 | struct xlog_in_core *ic_prev; | 345 | struct xlog_in_core *ic_prev; |
| 346 | struct xfs_buf *ic_bp; | 346 | struct xfs_buf *ic_bp; |
| @@ -377,8 +377,8 @@ typedef struct xlog_in_core { | |||
| 377 | /* | 377 | /* |
| 378 | * Defines to save our code from this glop. | 378 | * Defines to save our code from this glop. |
| 379 | */ | 379 | */ |
| 380 | #define ic_forcesema hic_fields.ic_forcesema | 380 | #define ic_force_wait hic_fields.ic_force_wait |
| 381 | #define ic_writesema hic_fields.ic_writesema | 381 | #define ic_write_wait hic_fields.ic_write_wait |
| 382 | #define ic_next hic_fields.ic_next | 382 | #define ic_next hic_fields.ic_next |
| 383 | #define ic_prev hic_fields.ic_prev | 383 | #define ic_prev hic_fields.ic_prev |
| 384 | #define ic_bp hic_fields.ic_bp | 384 | #define ic_bp hic_fields.ic_bp |
| @@ -423,10 +423,8 @@ typedef struct log { | |||
| 423 | int l_logBBsize; /* size of log in BB chunks */ | 423 | int l_logBBsize; /* size of log in BB chunks */ |
| 424 | 424 | ||
| 425 | /* The following block of fields are changed while holding icloglock */ | 425 | /* The following block of fields are changed while holding icloglock */ |
| 426 | sema_t l_flushsema ____cacheline_aligned_in_smp; | 426 | sv_t l_flush_wait ____cacheline_aligned_in_smp; |
| 427 | /* iclog flushing semaphore */ | 427 | /* waiting for iclog flush */ |
| 428 | int l_flushcnt; /* # of procs waiting on this | ||
| 429 | * sema */ | ||
| 430 | int l_covered_state;/* state of "covering disk | 428 | int l_covered_state;/* state of "covering disk |
| 431 | * log entries" */ | 429 | * log entries" */ |
| 432 | xlog_in_core_t *l_iclog; /* head log queue */ | 430 | xlog_in_core_t *l_iclog; /* head log queue */ |
| @@ -450,7 +448,6 @@ typedef struct log { | |||
| 450 | int l_grant_write_bytes; | 448 | int l_grant_write_bytes; |
| 451 | 449 | ||
| 452 | #ifdef XFS_LOG_TRACE | 450 | #ifdef XFS_LOG_TRACE |
| 453 | struct ktrace *l_trace; | ||
| 454 | struct ktrace *l_grant_trace; | 451 | struct ktrace *l_grant_trace; |
| 455 | #endif | 452 | #endif |
| 456 | 453 | ||
| @@ -470,7 +467,7 @@ extern int xlog_find_tail(xlog_t *log, | |||
| 470 | xfs_daddr_t *head_blk, | 467 | xfs_daddr_t *head_blk, |
| 471 | xfs_daddr_t *tail_blk); | 468 | xfs_daddr_t *tail_blk); |
| 472 | extern int xlog_recover(xlog_t *log); | 469 | extern int xlog_recover(xlog_t *log); |
| 473 | extern int xlog_recover_finish(xlog_t *log, int mfsi_flags); | 470 | extern int xlog_recover_finish(xlog_t *log); |
| 474 | extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); | 471 | extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); |
| 475 | extern void xlog_recover_process_iunlinks(xlog_t *log); | 472 | extern void xlog_recover_process_iunlinks(xlog_t *log); |
| 476 | 473 | ||
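The header change above (l_flushsema and l_flushcnt replaced by a single l_flush_wait) works because a broadcast wake-up makes explicit sleeper counting unnecessary: every waiter re-checks the iclog state itself after waking. Condensed from the xfs_log.c hunks earlier, the two sides of the handshake look roughly like this:

/* Waiter side (xlog_state_get_iclog_space): sv_wait() drops l_icloglock
 * before sleeping, so retake it and re-check the ring after waking. */
	sv_wait(&log->l_flush_wait, 0, &log->l_icloglock, 0);
	goto restart;

/* Waker side (end of xlog_state_do_callback): one broadcast wakes every
 * sleeper once the head iclog is ACTIVE again (or the log has shut down). */
	if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_IOERROR))
		wake = 1;
	spin_unlock(&log->l_icloglock);
	if (wake)
		sv_broadcast(&log->l_flush_wait);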
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index e65ab4af0955..82d46ce69d5f 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c | |||
| @@ -1715,8 +1715,7 @@ xlog_check_buffer_cancelled( | |||
| 1715 | } else { | 1715 | } else { |
| 1716 | prevp->bc_next = bcp->bc_next; | 1716 | prevp->bc_next = bcp->bc_next; |
| 1717 | } | 1717 | } |
| 1718 | kmem_free(bcp, | 1718 | kmem_free(bcp); |
| 1719 | sizeof(xfs_buf_cancel_t)); | ||
| 1720 | } | 1719 | } |
| 1721 | } | 1720 | } |
| 1722 | return 1; | 1721 | return 1; |
| @@ -2519,7 +2518,7 @@ write_inode_buffer: | |||
| 2519 | 2518 | ||
| 2520 | error: | 2519 | error: |
| 2521 | if (need_free) | 2520 | if (need_free) |
| 2522 | kmem_free(in_f, sizeof(*in_f)); | 2521 | kmem_free(in_f); |
| 2523 | return XFS_ERROR(error); | 2522 | return XFS_ERROR(error); |
| 2524 | } | 2523 | } |
| 2525 | 2524 | ||
| @@ -2830,16 +2829,14 @@ xlog_recover_free_trans( | |||
| 2830 | item = item->ri_next; | 2829 | item = item->ri_next; |
| 2831 | /* Free the regions in the item. */ | 2830 | /* Free the regions in the item. */ |
| 2832 | for (i = 0; i < free_item->ri_cnt; i++) { | 2831 | for (i = 0; i < free_item->ri_cnt; i++) { |
| 2833 | kmem_free(free_item->ri_buf[i].i_addr, | 2832 | kmem_free(free_item->ri_buf[i].i_addr); |
| 2834 | free_item->ri_buf[i].i_len); | ||
| 2835 | } | 2833 | } |
| 2836 | /* Free the item itself */ | 2834 | /* Free the item itself */ |
| 2837 | kmem_free(free_item->ri_buf, | 2835 | kmem_free(free_item->ri_buf); |
| 2838 | (free_item->ri_total * sizeof(xfs_log_iovec_t))); | 2836 | kmem_free(free_item); |
| 2839 | kmem_free(free_item, sizeof(xlog_recover_item_t)); | ||
| 2840 | } while (first_item != item); | 2837 | } while (first_item != item); |
| 2841 | /* Free the transaction recover structure */ | 2838 | /* Free the transaction recover structure */ |
| 2842 | kmem_free(trans, sizeof(xlog_recover_t)); | 2839 | kmem_free(trans); |
| 2843 | } | 2840 | } |
| 2844 | 2841 | ||
| 2845 | STATIC int | 2842 | STATIC int |
| @@ -3786,8 +3783,7 @@ xlog_do_log_recovery( | |||
| 3786 | error = xlog_do_recovery_pass(log, head_blk, tail_blk, | 3783 | error = xlog_do_recovery_pass(log, head_blk, tail_blk, |
| 3787 | XLOG_RECOVER_PASS1); | 3784 | XLOG_RECOVER_PASS1); |
| 3788 | if (error != 0) { | 3785 | if (error != 0) { |
| 3789 | kmem_free(log->l_buf_cancel_table, | 3786 | kmem_free(log->l_buf_cancel_table); |
| 3790 | XLOG_BC_TABLE_SIZE * sizeof(xfs_buf_cancel_t*)); | ||
| 3791 | log->l_buf_cancel_table = NULL; | 3787 | log->l_buf_cancel_table = NULL; |
| 3792 | return error; | 3788 | return error; |
| 3793 | } | 3789 | } |
| @@ -3806,8 +3802,7 @@ xlog_do_log_recovery( | |||
| 3806 | } | 3802 | } |
| 3807 | #endif /* DEBUG */ | 3803 | #endif /* DEBUG */ |
| 3808 | 3804 | ||
| 3809 | kmem_free(log->l_buf_cancel_table, | 3805 | kmem_free(log->l_buf_cancel_table); |
| 3810 | XLOG_BC_TABLE_SIZE * sizeof(xfs_buf_cancel_t*)); | ||
| 3811 | log->l_buf_cancel_table = NULL; | 3806 | log->l_buf_cancel_table = NULL; |
| 3812 | 3807 | ||
| 3813 | return error; | 3808 | return error; |
| @@ -3945,8 +3940,7 @@ xlog_recover( | |||
| 3945 | */ | 3940 | */ |
| 3946 | int | 3941 | int |
| 3947 | xlog_recover_finish( | 3942 | xlog_recover_finish( |
| 3948 | xlog_t *log, | 3943 | xlog_t *log) |
| 3949 | int mfsi_flags) | ||
| 3950 | { | 3944 | { |
| 3951 | /* | 3945 | /* |
| 3952 | * Now we're ready to do the transactions needed for the | 3946 | * Now we're ready to do the transactions needed for the |
| @@ -3974,9 +3968,7 @@ xlog_recover_finish( | |||
| 3974 | xfs_log_force(log->l_mp, (xfs_lsn_t)0, | 3968 | xfs_log_force(log->l_mp, (xfs_lsn_t)0, |
| 3975 | (XFS_LOG_FORCE | XFS_LOG_SYNC)); | 3969 | (XFS_LOG_FORCE | XFS_LOG_SYNC)); |
| 3976 | 3970 | ||
| 3977 | if ( (mfsi_flags & XFS_MFSI_NOUNLINK) == 0 ) { | 3971 | xlog_recover_process_iunlinks(log); |
| 3978 | xlog_recover_process_iunlinks(log); | ||
| 3979 | } | ||
| 3980 | 3972 | ||
| 3981 | xlog_recover_check_summary(log); | 3973 | xlog_recover_check_summary(log); |
| 3982 | 3974 | ||
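A second theme running through these hunks is the simplified kmem_free() interface: the size argument is gone, since the underlying allocators (kfree/vfree) already track how large the allocation was. Before and after, using one of the calls above:

/* old interface: the caller had to supply the allocation size */
kmem_free(log->l_buf_cancel_table,
	  XLOG_BC_TABLE_SIZE * sizeof(xfs_buf_cancel_t *));

/* new interface: the pointer alone is enough */
kmem_free(log->l_buf_cancel_table);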
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index da3988453b71..a4503f5e9497 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c | |||
| @@ -47,12 +47,10 @@ | |||
| 47 | 47 | ||
| 48 | STATIC int xfs_mount_log_sb(xfs_mount_t *, __int64_t); | 48 | STATIC int xfs_mount_log_sb(xfs_mount_t *, __int64_t); |
| 49 | STATIC int xfs_uuid_mount(xfs_mount_t *); | 49 | STATIC int xfs_uuid_mount(xfs_mount_t *); |
| 50 | STATIC void xfs_uuid_unmount(xfs_mount_t *mp); | ||
| 51 | STATIC void xfs_unmountfs_wait(xfs_mount_t *); | 50 | STATIC void xfs_unmountfs_wait(xfs_mount_t *); |
| 52 | 51 | ||
| 53 | 52 | ||
| 54 | #ifdef HAVE_PERCPU_SB | 53 | #ifdef HAVE_PERCPU_SB |
| 55 | STATIC void xfs_icsb_destroy_counters(xfs_mount_t *); | ||
| 56 | STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, | 54 | STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, |
| 57 | int); | 55 | int); |
| 58 | STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t, | 56 | STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t, |
| @@ -63,7 +61,6 @@ STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); | |||
| 63 | 61 | ||
| 64 | #else | 62 | #else |
| 65 | 63 | ||
| 66 | #define xfs_icsb_destroy_counters(mp) do { } while (0) | ||
| 67 | #define xfs_icsb_balance_counter(mp, a, b) do { } while (0) | 64 | #define xfs_icsb_balance_counter(mp, a, b) do { } while (0) |
| 68 | #define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0) | 65 | #define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0) |
| 69 | #define xfs_icsb_modify_counters(mp, a, b, c) do { } while (0) | 66 | #define xfs_icsb_modify_counters(mp, a, b, c) do { } while (0) |
| @@ -126,34 +123,12 @@ static const struct { | |||
| 126 | }; | 123 | }; |
| 127 | 124 | ||
| 128 | /* | 125 | /* |
| 129 | * Return a pointer to an initialized xfs_mount structure. | ||
| 130 | */ | ||
| 131 | xfs_mount_t * | ||
| 132 | xfs_mount_init(void) | ||
| 133 | { | ||
| 134 | xfs_mount_t *mp; | ||
| 135 | |||
| 136 | mp = kmem_zalloc(sizeof(xfs_mount_t), KM_SLEEP); | ||
| 137 | |||
| 138 | if (xfs_icsb_init_counters(mp)) { | ||
| 139 | mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB; | ||
| 140 | } | ||
| 141 | |||
| 142 | spin_lock_init(&mp->m_sb_lock); | ||
| 143 | mutex_init(&mp->m_ilock); | ||
| 144 | mutex_init(&mp->m_growlock); | ||
| 145 | atomic_set(&mp->m_active_trans, 0); | ||
| 146 | |||
| 147 | return mp; | ||
| 148 | } | ||
| 149 | |||
| 150 | /* | ||
| 151 | * Free up the resources associated with a mount structure. Assume that | 126 | * Free up the resources associated with a mount structure. Assume that |
| 152 | * the structure was initially zeroed, so we can tell which fields got | 127 | * the structure was initially zeroed, so we can tell which fields got |
| 153 | * initialized. | 128 | * initialized. |
| 154 | */ | 129 | */ |
| 155 | void | 130 | STATIC void |
| 156 | xfs_mount_free( | 131 | xfs_free_perag( |
| 157 | xfs_mount_t *mp) | 132 | xfs_mount_t *mp) |
| 158 | { | 133 | { |
| 159 | if (mp->m_perag) { | 134 | if (mp->m_perag) { |
| @@ -161,28 +136,9 @@ xfs_mount_free( | |||
| 161 | 136 | ||
| 162 | for (agno = 0; agno < mp->m_maxagi; agno++) | 137 | for (agno = 0; agno < mp->m_maxagi; agno++) |
| 163 | if (mp->m_perag[agno].pagb_list) | 138 | if (mp->m_perag[agno].pagb_list) |
| 164 | kmem_free(mp->m_perag[agno].pagb_list, | 139 | kmem_free(mp->m_perag[agno].pagb_list); |
| 165 | sizeof(xfs_perag_busy_t) * | 140 | kmem_free(mp->m_perag); |
| 166 | XFS_PAGB_NUM_SLOTS); | ||
| 167 | kmem_free(mp->m_perag, | ||
| 168 | sizeof(xfs_perag_t) * mp->m_sb.sb_agcount); | ||
| 169 | } | 141 | } |
| 170 | |||
| 171 | spinlock_destroy(&mp->m_ail_lock); | ||
| 172 | spinlock_destroy(&mp->m_sb_lock); | ||
| 173 | mutex_destroy(&mp->m_ilock); | ||
| 174 | mutex_destroy(&mp->m_growlock); | ||
| 175 | if (mp->m_quotainfo) | ||
| 176 | XFS_QM_DONE(mp); | ||
| 177 | |||
| 178 | if (mp->m_fsname != NULL) | ||
| 179 | kmem_free(mp->m_fsname, mp->m_fsname_len); | ||
| 180 | if (mp->m_rtname != NULL) | ||
| 181 | kmem_free(mp->m_rtname, strlen(mp->m_rtname) + 1); | ||
| 182 | if (mp->m_logname != NULL) | ||
| 183 | kmem_free(mp->m_logname, strlen(mp->m_logname) + 1); | ||
| 184 | |||
| 185 | xfs_icsb_destroy_counters(mp); | ||
| 186 | } | 142 | } |
| 187 | 143 | ||
| 188 | /* | 144 | /* |
| @@ -288,6 +244,19 @@ xfs_mount_validate_sb( | |||
| 288 | return XFS_ERROR(EFSCORRUPTED); | 244 | return XFS_ERROR(EFSCORRUPTED); |
| 289 | } | 245 | } |
| 290 | 246 | ||
| 247 | /* | ||
| 248 | * Until this is fixed only page-sized or smaller data blocks work. | ||
| 249 | */ | ||
| 250 | if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { | ||
| 251 | xfs_fs_mount_cmn_err(flags, | ||
| 252 | "file system with blocksize %d bytes", | ||
| 253 | sbp->sb_blocksize); | ||
| 254 | xfs_fs_mount_cmn_err(flags, | ||
| 255 | "only pagesize (%ld) or less will currently work.", | ||
| 256 | PAGE_SIZE); | ||
| 257 | return XFS_ERROR(ENOSYS); | ||
| 258 | } | ||
| 259 | |||
| 291 | if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || | 260 | if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || |
| 292 | xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { | 261 | xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { |
| 293 | xfs_fs_mount_cmn_err(flags, | 262 | xfs_fs_mount_cmn_err(flags, |
| @@ -309,19 +278,6 @@ xfs_mount_validate_sb( | |||
| 309 | return XFS_ERROR(ENOSYS); | 278 | return XFS_ERROR(ENOSYS); |
| 310 | } | 279 | } |
| 311 | 280 | ||
| 312 | /* | ||
| 313 | * Until this is fixed only page-sized or smaller data blocks work. | ||
| 314 | */ | ||
| 315 | if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { | ||
| 316 | xfs_fs_mount_cmn_err(flags, | ||
| 317 | "file system with blocksize %d bytes", | ||
| 318 | sbp->sb_blocksize); | ||
| 319 | xfs_fs_mount_cmn_err(flags, | ||
| 320 | "only pagesize (%ld) or less will currently work.", | ||
| 321 | PAGE_SIZE); | ||
| 322 | return XFS_ERROR(ENOSYS); | ||
| 323 | } | ||
| 324 | |||
| 325 | return 0; | 281 | return 0; |
| 326 | } | 282 | } |
| 327 | 283 | ||
| @@ -734,11 +690,11 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount) | |||
| 734 | * Update alignment values based on mount options and sb values | 690 | * Update alignment values based on mount options and sb values |
| 735 | */ | 691 | */ |
| 736 | STATIC int | 692 | STATIC int |
| 737 | xfs_update_alignment(xfs_mount_t *mp, int mfsi_flags, __uint64_t *update_flags) | 693 | xfs_update_alignment(xfs_mount_t *mp, __uint64_t *update_flags) |
| 738 | { | 694 | { |
| 739 | xfs_sb_t *sbp = &(mp->m_sb); | 695 | xfs_sb_t *sbp = &(mp->m_sb); |
| 740 | 696 | ||
| 741 | if (mp->m_dalign && !(mfsi_flags & XFS_MFSI_SECOND)) { | 697 | if (mp->m_dalign) { |
| 742 | /* | 698 | /* |
| 743 | * If stripe unit and stripe width are not multiples | 699 | * If stripe unit and stripe width are not multiples |
| 744 | * of the fs blocksize turn off alignment. | 700 | * of the fs blocksize turn off alignment. |
| @@ -894,7 +850,7 @@ xfs_set_inoalignment(xfs_mount_t *mp) | |||
| 894 | * Check that the data (and log if separate) are an ok size. | 850 | * Check that the data (and log if separate) are an ok size. |
| 895 | */ | 851 | */ |
| 896 | STATIC int | 852 | STATIC int |
| 897 | xfs_check_sizes(xfs_mount_t *mp, int mfsi_flags) | 853 | xfs_check_sizes(xfs_mount_t *mp) |
| 898 | { | 854 | { |
| 899 | xfs_buf_t *bp; | 855 | xfs_buf_t *bp; |
| 900 | xfs_daddr_t d; | 856 | xfs_daddr_t d; |
| @@ -917,8 +873,7 @@ xfs_check_sizes(xfs_mount_t *mp, int mfsi_flags) | |||
| 917 | return error; | 873 | return error; |
| 918 | } | 874 | } |
| 919 | 875 | ||
| 920 | if (((mfsi_flags & XFS_MFSI_CLIENT) == 0) && | 876 | if (mp->m_logdev_targp != mp->m_ddev_targp) { |
| 921 | mp->m_logdev_targp != mp->m_ddev_targp) { | ||
| 922 | d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); | 877 | d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); |
| 923 | if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { | 878 | if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { |
| 924 | cmn_err(CE_WARN, "XFS: size check 3 failed"); | 879 | cmn_err(CE_WARN, "XFS: size check 3 failed"); |
| @@ -953,15 +908,13 @@ xfs_check_sizes(xfs_mount_t *mp, int mfsi_flags) | |||
| 953 | */ | 908 | */ |
| 954 | int | 909 | int |
| 955 | xfs_mountfs( | 910 | xfs_mountfs( |
| 956 | xfs_mount_t *mp, | 911 | xfs_mount_t *mp) |
| 957 | int mfsi_flags) | ||
| 958 | { | 912 | { |
| 959 | xfs_sb_t *sbp = &(mp->m_sb); | 913 | xfs_sb_t *sbp = &(mp->m_sb); |
| 960 | xfs_inode_t *rip; | 914 | xfs_inode_t *rip; |
| 961 | __uint64_t resblks; | 915 | __uint64_t resblks; |
| 962 | __int64_t update_flags = 0LL; | 916 | __int64_t update_flags = 0LL; |
| 963 | uint quotamount, quotaflags; | 917 | uint quotamount, quotaflags; |
| 964 | int agno; | ||
| 965 | int uuid_mounted = 0; | 918 | int uuid_mounted = 0; |
| 966 | int error = 0; | 919 | int error = 0; |
| 967 | 920 | ||
| @@ -994,9 +947,19 @@ xfs_mountfs( | |||
| 994 | * Re-check for ATTR2 in case it was found in bad_features2 | 947 | * Re-check for ATTR2 in case it was found in bad_features2 |
| 995 | * slot. | 948 | * slot. |
| 996 | */ | 949 | */ |
| 997 | if (xfs_sb_version_hasattr2(&mp->m_sb)) | 950 | if (xfs_sb_version_hasattr2(&mp->m_sb) && |
| 951 | !(mp->m_flags & XFS_MOUNT_NOATTR2)) | ||
| 998 | mp->m_flags |= XFS_MOUNT_ATTR2; | 952 | mp->m_flags |= XFS_MOUNT_ATTR2; |
| 953 | } | ||
| 954 | |||
| 955 | if (xfs_sb_version_hasattr2(&mp->m_sb) && | ||
| 956 | (mp->m_flags & XFS_MOUNT_NOATTR2)) { | ||
| 957 | xfs_sb_version_removeattr2(&mp->m_sb); | ||
| 958 | update_flags |= XFS_SB_FEATURES2; | ||
| 999 | 959 | ||
| 960 | /* update sb_versionnum for the clearing of the morebits */ | ||
| 961 | if (!sbp->sb_features2) | ||
| 962 | update_flags |= XFS_SB_VERSIONNUM; | ||
| 1000 | } | 963 | } |
| 1001 | 964 | ||
| 1002 | /* | 965 | /* |
| @@ -1005,7 +968,7 @@ xfs_mountfs( | |||
| 1005 | * allocator alignment is within an ag, therefore ag has | 968 | * allocator alignment is within an ag, therefore ag has |
| 1006 | * to be aligned at stripe boundary. | 969 | * to be aligned at stripe boundary. |
| 1007 | */ | 970 | */ |
| 1008 | error = xfs_update_alignment(mp, mfsi_flags, &update_flags); | 971 | error = xfs_update_alignment(mp, &update_flags); |
| 1009 | if (error) | 972 | if (error) |
| 1010 | goto error1; | 973 | goto error1; |
| 1011 | 974 | ||
| @@ -1024,8 +987,7 @@ xfs_mountfs( | |||
| 1024 | * since a single partition filesystem is identical to a single | 987 | * since a single partition filesystem is identical to a single |
| 1025 | * partition volume/filesystem. | 988 | * partition volume/filesystem. |
| 1026 | */ | 989 | */ |
| 1027 | if ((mfsi_flags & XFS_MFSI_SECOND) == 0 && | 990 | if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) { |
| 1028 | (mp->m_flags & XFS_MOUNT_NOUUID) == 0) { | ||
| 1029 | if (xfs_uuid_mount(mp)) { | 991 | if (xfs_uuid_mount(mp)) { |
| 1030 | error = XFS_ERROR(EINVAL); | 992 | error = XFS_ERROR(EINVAL); |
| 1031 | goto error1; | 993 | goto error1; |
| @@ -1053,7 +1015,7 @@ xfs_mountfs( | |||
| 1053 | /* | 1015 | /* |
| 1054 | * Check that the data (and log if separate) are an ok size. | 1016 | * Check that the data (and log if separate) are an ok size. |
| 1055 | */ | 1017 | */ |
| 1056 | error = xfs_check_sizes(mp, mfsi_flags); | 1018 | error = xfs_check_sizes(mp); |
| 1057 | if (error) | 1019 | if (error) |
| 1058 | goto error1; | 1020 | goto error1; |
| 1059 | 1021 | ||
| @@ -1067,13 +1029,6 @@ xfs_mountfs( | |||
| 1067 | } | 1029 | } |
| 1068 | 1030 | ||
| 1069 | /* | 1031 | /* |
| 1070 | * For client case we are done now | ||
| 1071 | */ | ||
| 1072 | if (mfsi_flags & XFS_MFSI_CLIENT) { | ||
| 1073 | return 0; | ||
| 1074 | } | ||
| 1075 | |||
| 1076 | /* | ||
| 1077 | * Copies the low order bits of the timestamp and the randomly | 1032 | * Copies the low order bits of the timestamp and the randomly |
| 1078 | * set "sequence" number out of a UUID. | 1033 | * set "sequence" number out of a UUID. |
| 1079 | */ | 1034 | */ |
| @@ -1097,8 +1052,10 @@ xfs_mountfs( | |||
| 1097 | * Allocate and initialize the per-ag data. | 1052 | * Allocate and initialize the per-ag data. |
| 1098 | */ | 1053 | */ |
| 1099 | init_rwsem(&mp->m_peraglock); | 1054 | init_rwsem(&mp->m_peraglock); |
| 1100 | mp->m_perag = | 1055 | mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), |
| 1101 | kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), KM_SLEEP); | 1056 | KM_MAYFAIL); |
| 1057 | if (!mp->m_perag) | ||
| 1058 | goto error1; | ||
| 1102 | 1059 | ||
| 1103 | mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount); | 1060 | mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount); |
| 1104 | 1061 | ||
| @@ -1210,7 +1167,7 @@ xfs_mountfs( | |||
| 1210 | * delayed until after the root and real-time bitmap inodes | 1167 | * delayed until after the root and real-time bitmap inodes |
| 1211 | * were consistently read in. | 1168 | * were consistently read in. |
| 1212 | */ | 1169 | */ |
| 1213 | error = xfs_log_mount_finish(mp, mfsi_flags); | 1170 | error = xfs_log_mount_finish(mp); |
| 1214 | if (error) { | 1171 | if (error) { |
| 1215 | cmn_err(CE_WARN, "XFS: log mount finish failed"); | 1172 | cmn_err(CE_WARN, "XFS: log mount finish failed"); |
| 1216 | goto error4; | 1173 | goto error4; |
| @@ -1219,7 +1176,7 @@ xfs_mountfs( | |||
| 1219 | /* | 1176 | /* |
| 1220 | * Complete the quota initialisation, post-log-replay component. | 1177 | * Complete the quota initialisation, post-log-replay component. |
| 1221 | */ | 1178 | */ |
| 1222 | error = XFS_QM_MOUNT(mp, quotamount, quotaflags, mfsi_flags); | 1179 | error = XFS_QM_MOUNT(mp, quotamount, quotaflags); |
| 1223 | if (error) | 1180 | if (error) |
| 1224 | goto error4; | 1181 | goto error4; |
| 1225 | 1182 | ||
| @@ -1253,31 +1210,25 @@ xfs_mountfs( | |||
| 1253 | error3: | 1210 | error3: |
| 1254 | xfs_log_unmount_dealloc(mp); | 1211 | xfs_log_unmount_dealloc(mp); |
| 1255 | error2: | 1212 | error2: |
| 1256 | for (agno = 0; agno < sbp->sb_agcount; agno++) | 1213 | xfs_free_perag(mp); |
| 1257 | if (mp->m_perag[agno].pagb_list) | ||
| 1258 | kmem_free(mp->m_perag[agno].pagb_list, | ||
| 1259 | sizeof(xfs_perag_busy_t) * XFS_PAGB_NUM_SLOTS); | ||
| 1260 | kmem_free(mp->m_perag, sbp->sb_agcount * sizeof(xfs_perag_t)); | ||
| 1261 | mp->m_perag = NULL; | ||
| 1262 | /* FALLTHROUGH */ | ||
| 1263 | error1: | 1214 | error1: |
| 1264 | if (uuid_mounted) | 1215 | if (uuid_mounted) |
| 1265 | xfs_uuid_unmount(mp); | 1216 | uuid_table_remove(&mp->m_sb.sb_uuid); |
| 1266 | xfs_freesb(mp); | ||
| 1267 | return error; | 1217 | return error; |
| 1268 | } | 1218 | } |
| 1269 | 1219 | ||
| 1270 | /* | 1220 | /* |
| 1271 | * xfs_unmountfs | ||
| 1272 | * | ||
| 1273 | * This flushes out the inodes, dquots and the superblock, unmounts the | 1221 | * This flushes out the inodes, dquots and the superblock, unmounts the |
| 1274 | * log and makes sure that incore structures are freed. | 1222 | * log and makes sure that incore structures are freed. |
| 1275 | */ | 1223 | */ |
| 1276 | int | 1224 | void |
| 1277 | xfs_unmountfs(xfs_mount_t *mp, struct cred *cr) | 1225 | xfs_unmountfs( |
| 1226 | struct xfs_mount *mp) | ||
| 1278 | { | 1227 | { |
| 1279 | __uint64_t resblks; | 1228 | __uint64_t resblks; |
| 1280 | int error = 0; | 1229 | int error; |
| 1230 | |||
| 1231 | IRELE(mp->m_rootip); | ||
| 1281 | 1232 | ||
| 1282 | /* | 1233 | /* |
| 1283 | * We can potentially deadlock here if we have an inode cluster | 1234 | * We can potentially deadlock here if we have an inode cluster |
| @@ -1334,32 +1285,20 @@ xfs_unmountfs(xfs_mount_t *mp, struct cred *cr) | |||
| 1334 | xfs_unmountfs_wait(mp); /* wait for async bufs */ | 1285 | xfs_unmountfs_wait(mp); /* wait for async bufs */ |
| 1335 | xfs_log_unmount(mp); /* Done! No more fs ops. */ | 1286 | xfs_log_unmount(mp); /* Done! No more fs ops. */ |
| 1336 | 1287 | ||
| 1337 | xfs_freesb(mp); | ||
| 1338 | |||
| 1339 | /* | 1288 | /* |
| 1340 | * All inodes from this mount point should be freed. | 1289 | * All inodes from this mount point should be freed. |
| 1341 | */ | 1290 | */ |
| 1342 | ASSERT(mp->m_inodes == NULL); | 1291 | ASSERT(mp->m_inodes == NULL); |
| 1343 | 1292 | ||
| 1344 | xfs_unmountfs_close(mp, cr); | ||
| 1345 | if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) | 1293 | if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) |
| 1346 | xfs_uuid_unmount(mp); | 1294 | uuid_table_remove(&mp->m_sb.sb_uuid); |
| 1347 | 1295 | ||
| 1348 | #if defined(DEBUG) || defined(INDUCE_IO_ERROR) | 1296 | #if defined(DEBUG) |
| 1349 | xfs_errortag_clearall(mp, 0); | 1297 | xfs_errortag_clearall(mp, 0); |
| 1350 | #endif | 1298 | #endif |
| 1351 | xfs_mount_free(mp); | 1299 | xfs_free_perag(mp); |
| 1352 | return 0; | 1300 | if (mp->m_quotainfo) |
| 1353 | } | 1301 | XFS_QM_DONE(mp); |
| 1354 | |||
| 1355 | void | ||
| 1356 | xfs_unmountfs_close(xfs_mount_t *mp, struct cred *cr) | ||
| 1357 | { | ||
| 1358 | if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) | ||
| 1359 | xfs_free_buftarg(mp->m_logdev_targp, 1); | ||
| 1360 | if (mp->m_rtdev_targp) | ||
| 1361 | xfs_free_buftarg(mp->m_rtdev_targp, 1); | ||
| 1362 | xfs_free_buftarg(mp->m_ddev_targp, 0); | ||
| 1363 | } | 1302 | } |
| 1364 | 1303 | ||
| 1365 | STATIC void | 1304 | STATIC void |
| @@ -1905,16 +1844,6 @@ xfs_uuid_mount( | |||
| 1905 | } | 1844 | } |
| 1906 | 1845 | ||
| 1907 | /* | 1846 | /* |
| 1908 | * Remove filesystem from the UUID table. | ||
| 1909 | */ | ||
| 1910 | STATIC void | ||
| 1911 | xfs_uuid_unmount( | ||
| 1912 | xfs_mount_t *mp) | ||
| 1913 | { | ||
| 1914 | uuid_table_remove(&mp->m_sb.sb_uuid); | ||
| 1915 | } | ||
| 1916 | |||
| 1917 | /* | ||
| 1918 | * Used to log changes to the superblock unit and width fields which could | 1847 | * Used to log changes to the superblock unit and width fields which could |
| 1919 | * be altered by the mount options, as well as any potential sb_features2 | 1848 | * be altered by the mount options, as well as any potential sb_features2 |
| 1920 | * fixup. Only the first superblock is updated. | 1849 | * fixup. Only the first superblock is updated. |
| @@ -1928,7 +1857,8 @@ xfs_mount_log_sb( | |||
| 1928 | int error; | 1857 | int error; |
| 1929 | 1858 | ||
| 1930 | ASSERT(fields & (XFS_SB_UNIT | XFS_SB_WIDTH | XFS_SB_UUID | | 1859 | ASSERT(fields & (XFS_SB_UNIT | XFS_SB_WIDTH | XFS_SB_UUID | |
| 1931 | XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2)); | 1860 | XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2 | |
| 1861 | XFS_SB_VERSIONNUM)); | ||
| 1932 | 1862 | ||
| 1933 | tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT); | 1863 | tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT); |
| 1934 | error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, | 1864 | error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, |
| @@ -2109,7 +2039,7 @@ xfs_icsb_reinit_counters( | |||
| 2109 | xfs_icsb_unlock(mp); | 2039 | xfs_icsb_unlock(mp); |
| 2110 | } | 2040 | } |
| 2111 | 2041 | ||
| 2112 | STATIC void | 2042 | void |
| 2113 | xfs_icsb_destroy_counters( | 2043 | xfs_icsb_destroy_counters( |
| 2114 | xfs_mount_t *mp) | 2044 | xfs_mount_t *mp) |
| 2115 | { | 2045 | { |
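With xfs_mount_init() and xfs_mount_free() removed, the structure setup and teardown they used to do is assumed to move out to the VFS-facing caller; that is also why xfs_icsb_destroy_counters() stops being STATIC in the last hunk. A hedged sketch of what a caller now looks like, mirroring the removed xfs_mount_init() (error handling abbreviated, exact placement of this code an assumption):

/* Hypothetical caller sketch -- not the actual Linux glue code. */
mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
if (xfs_icsb_init_counters(mp))
	mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB;
spin_lock_init(&mp->m_sb_lock);
mutex_init(&mp->m_ilock);
mutex_init(&mp->m_growlock);
atomic_set(&mp->m_active_trans, 0);

error = xfs_mountfs(mp);		/* no mfsi_flags argument any more */
if (error)
	goto out_destroy_counters;

/* ... normal operation ... */

xfs_unmountfs(mp);			/* now returns void, no cred argument */

out_destroy_counters:
	xfs_icsb_destroy_counters(mp);	/* made non-STATIC above for this */
	kmem_free(mp);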
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 63e0693a358a..f3c1024b1241 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h | |||
| @@ -61,6 +61,7 @@ struct xfs_bmap_free; | |||
| 61 | struct xfs_extdelta; | 61 | struct xfs_extdelta; |
| 62 | struct xfs_swapext; | 62 | struct xfs_swapext; |
| 63 | struct xfs_mru_cache; | 63 | struct xfs_mru_cache; |
| 64 | struct xfs_nameops; | ||
| 64 | 65 | ||
| 65 | /* | 66 | /* |
| 66 | * Prototypes and functions for the Data Migration subsystem. | 67 | * Prototypes and functions for the Data Migration subsystem. |
| @@ -113,7 +114,7 @@ struct xfs_dqtrxops; | |||
| 113 | struct xfs_quotainfo; | 114 | struct xfs_quotainfo; |
| 114 | 115 | ||
| 115 | typedef int (*xfs_qminit_t)(struct xfs_mount *, uint *, uint *); | 116 | typedef int (*xfs_qminit_t)(struct xfs_mount *, uint *, uint *); |
| 116 | typedef int (*xfs_qmmount_t)(struct xfs_mount *, uint, uint, int); | 117 | typedef int (*xfs_qmmount_t)(struct xfs_mount *, uint, uint); |
| 117 | typedef int (*xfs_qmunmount_t)(struct xfs_mount *); | 118 | typedef int (*xfs_qmunmount_t)(struct xfs_mount *); |
| 118 | typedef void (*xfs_qmdone_t)(struct xfs_mount *); | 119 | typedef void (*xfs_qmdone_t)(struct xfs_mount *); |
| 119 | typedef void (*xfs_dqrele_t)(struct xfs_dquot *); | 120 | typedef void (*xfs_dqrele_t)(struct xfs_dquot *); |
| @@ -157,8 +158,8 @@ typedef struct xfs_qmops { | |||
| 157 | 158 | ||
| 158 | #define XFS_QM_INIT(mp, mnt, fl) \ | 159 | #define XFS_QM_INIT(mp, mnt, fl) \ |
| 159 | (*(mp)->m_qm_ops->xfs_qminit)(mp, mnt, fl) | 160 | (*(mp)->m_qm_ops->xfs_qminit)(mp, mnt, fl) |
| 160 | #define XFS_QM_MOUNT(mp, mnt, fl, mfsi_flags) \ | 161 | #define XFS_QM_MOUNT(mp, mnt, fl) \ |
| 161 | (*(mp)->m_qm_ops->xfs_qmmount)(mp, mnt, fl, mfsi_flags) | 162 | (*(mp)->m_qm_ops->xfs_qmmount)(mp, mnt, fl) |
| 162 | #define XFS_QM_UNMOUNT(mp) \ | 163 | #define XFS_QM_UNMOUNT(mp) \ |
| 163 | (*(mp)->m_qm_ops->xfs_qmunmount)(mp) | 164 | (*(mp)->m_qm_ops->xfs_qmunmount)(mp) |
| 164 | #define XFS_QM_DONE(mp) \ | 165 | #define XFS_QM_DONE(mp) \ |
| @@ -210,12 +211,14 @@ typedef struct xfs_icsb_cnts { | |||
| 210 | 211 | ||
| 211 | extern int xfs_icsb_init_counters(struct xfs_mount *); | 212 | extern int xfs_icsb_init_counters(struct xfs_mount *); |
| 212 | extern void xfs_icsb_reinit_counters(struct xfs_mount *); | 213 | extern void xfs_icsb_reinit_counters(struct xfs_mount *); |
| 214 | extern void xfs_icsb_destroy_counters(struct xfs_mount *); | ||
| 213 | extern void xfs_icsb_sync_counters(struct xfs_mount *, int); | 215 | extern void xfs_icsb_sync_counters(struct xfs_mount *, int); |
| 214 | extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int); | 216 | extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int); |
| 215 | 217 | ||
| 216 | #else | 218 | #else |
| 217 | #define xfs_icsb_init_counters(mp) (0) | 219 | #define xfs_icsb_init_counters(mp) (0) |
| 218 | #define xfs_icsb_reinit_counters(mp) do { } while (0) | 220 | #define xfs_icsb_destroy_counters(mp) do { } while (0) |
| 221 | #define xfs_icsb_reinit_counters(mp) do { } while (0) | ||
| 219 | #define xfs_icsb_sync_counters(mp, flags) do { } while (0) | 222 | #define xfs_icsb_sync_counters(mp, flags) do { } while (0) |
| 220 | #define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0) | 223 | #define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0) |
| 221 | #endif | 224 | #endif |
| @@ -313,6 +316,7 @@ typedef struct xfs_mount { | |||
| 313 | __uint8_t m_inode_quiesce;/* call quiesce on new inodes. | 316 | __uint8_t m_inode_quiesce;/* call quiesce on new inodes. |
| 314 | field governed by m_ilock */ | 317 | field governed by m_ilock */ |
| 315 | __uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ | 318 | __uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ |
| 319 | const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */ | ||
| 316 | int m_dirblksize; /* directory block sz--bytes */ | 320 | int m_dirblksize; /* directory block sz--bytes */ |
| 317 | int m_dirblkfsbs; /* directory block sz--fsbs */ | 321 | int m_dirblkfsbs; /* directory block sz--fsbs */ |
| 318 | xfs_dablk_t m_dirdatablk; /* blockno of dir data v2 */ | 322 | xfs_dablk_t m_dirdatablk; /* blockno of dir data v2 */ |
| @@ -378,6 +382,7 @@ typedef struct xfs_mount { | |||
| 378 | counters */ | 382 | counters */ |
| 379 | #define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams | 383 | #define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams |
| 380 | allocator */ | 384 | allocator */ |
| 385 | #define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */ | ||
| 381 | 386 | ||
| 382 | 387 | ||
| 383 | /* | 388 | /* |
| @@ -437,13 +442,6 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, | |||
| 437 | /* | 442 | /* |
| 438 | * Flags for xfs_mountfs | 443 | * Flags for xfs_mountfs |
| 439 | */ | 444 | */ |
| 440 | #define XFS_MFSI_SECOND 0x01 /* Secondary mount -- skip stuff */ | ||
| 441 | #define XFS_MFSI_CLIENT 0x02 /* Is a client -- skip lots of stuff */ | ||
| 442 | /* XFS_MFSI_RRINODES */ | ||
| 443 | #define XFS_MFSI_NOUNLINK 0x08 /* Skip unlinked inode processing in */ | ||
| 444 | /* log recovery */ | ||
| 445 | #define XFS_MFSI_NO_QUOTACHECK 0x10 /* Skip quotacheck processing */ | ||
| 446 | /* XFS_MFSI_CONVERT_SUNIT */ | ||
| 447 | #define XFS_MFSI_QUIET 0x40 /* Be silent if mount errors found */ | 445 | #define XFS_MFSI_QUIET 0x40 /* Be silent if mount errors found */ |
| 448 | 446 | ||
| 449 | #define XFS_DADDR_TO_AGNO(mp,d) xfs_daddr_to_agno(mp,d) | 447 | #define XFS_DADDR_TO_AGNO(mp,d) xfs_daddr_to_agno(mp,d) |
| @@ -510,15 +508,12 @@ typedef struct xfs_mod_sb { | |||
| 510 | #define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock)) | 508 | #define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock)) |
| 511 | #define XFS_MOUNT_IUNLOCK(mp) mutex_unlock(&((mp)->m_ilock)) | 509 | #define XFS_MOUNT_IUNLOCK(mp) mutex_unlock(&((mp)->m_ilock)) |
| 512 | 510 | ||
| 513 | extern xfs_mount_t *xfs_mount_init(void); | ||
| 514 | extern void xfs_mod_sb(xfs_trans_t *, __int64_t); | 511 | extern void xfs_mod_sb(xfs_trans_t *, __int64_t); |
| 515 | extern int xfs_log_sbcount(xfs_mount_t *, uint); | 512 | extern int xfs_log_sbcount(xfs_mount_t *, uint); |
| 516 | extern void xfs_mount_free(xfs_mount_t *mp); | 513 | extern int xfs_mountfs(xfs_mount_t *mp); |
| 517 | extern int xfs_mountfs(xfs_mount_t *mp, int); | ||
| 518 | extern void xfs_mountfs_check_barriers(xfs_mount_t *mp); | 514 | extern void xfs_mountfs_check_barriers(xfs_mount_t *mp); |
| 519 | 515 | ||
| 520 | extern int xfs_unmountfs(xfs_mount_t *, struct cred *); | 516 | extern void xfs_unmountfs(xfs_mount_t *); |
| 521 | extern void xfs_unmountfs_close(xfs_mount_t *, struct cred *); | ||
| 522 | extern int xfs_unmountfs_writesb(xfs_mount_t *); | 517 | extern int xfs_unmountfs_writesb(xfs_mount_t *); |
| 523 | extern int xfs_unmount_flush(xfs_mount_t *, int); | 518 | extern int xfs_unmount_flush(xfs_mount_t *, int); |
| 524 | extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); | 519 | extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); |
| @@ -544,9 +539,6 @@ extern void xfs_qmops_put(struct xfs_mount *); | |||
| 544 | 539 | ||
| 545 | extern struct xfs_dmops xfs_dmcore_xfs; | 540 | extern struct xfs_dmops xfs_dmcore_xfs; |
| 546 | 541 | ||
| 547 | extern int xfs_init(void); | ||
| 548 | extern void xfs_cleanup(void); | ||
| 549 | |||
| 550 | #endif /* __KERNEL__ */ | 542 | #endif /* __KERNEL__ */ |
| 551 | 543 | ||
| 552 | #endif /* __XFS_MOUNT_H__ */ | 544 | #endif /* __XFS_MOUNT_H__ */ |
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index a0b2c0a2589a..afee7eb24323 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c | |||
| @@ -307,15 +307,18 @@ xfs_mru_cache_init(void) | |||
| 307 | xfs_mru_elem_zone = kmem_zone_init(sizeof(xfs_mru_cache_elem_t), | 307 | xfs_mru_elem_zone = kmem_zone_init(sizeof(xfs_mru_cache_elem_t), |
| 308 | "xfs_mru_cache_elem"); | 308 | "xfs_mru_cache_elem"); |
| 309 | if (!xfs_mru_elem_zone) | 309 | if (!xfs_mru_elem_zone) |
| 310 | return ENOMEM; | 310 | goto out; |
| 311 | 311 | ||
| 312 | xfs_mru_reap_wq = create_singlethread_workqueue("xfs_mru_cache"); | 312 | xfs_mru_reap_wq = create_singlethread_workqueue("xfs_mru_cache"); |
| 313 | if (!xfs_mru_reap_wq) { | 313 | if (!xfs_mru_reap_wq) |
| 314 | kmem_zone_destroy(xfs_mru_elem_zone); | 314 | goto out_destroy_mru_elem_zone; |
| 315 | return ENOMEM; | ||
| 316 | } | ||
| 317 | 315 | ||
| 318 | return 0; | 316 | return 0; |
| 317 | |||
| 318 | out_destroy_mru_elem_zone: | ||
| 319 | kmem_zone_destroy(xfs_mru_elem_zone); | ||
| 320 | out: | ||
| 321 | return -ENOMEM; | ||
| 319 | } | 322 | } |
| 320 | 323 | ||
| 321 | void | 324 | void |
| @@ -382,9 +385,9 @@ xfs_mru_cache_create( | |||
| 382 | 385 | ||
| 383 | exit: | 386 | exit: |
| 384 | if (err && mru && mru->lists) | 387 | if (err && mru && mru->lists) |
| 385 | kmem_free(mru->lists, mru->grp_count * sizeof(*mru->lists)); | 388 | kmem_free(mru->lists); |
| 386 | if (err && mru) | 389 | if (err && mru) |
| 387 | kmem_free(mru, sizeof(*mru)); | 390 | kmem_free(mru); |
| 388 | 391 | ||
| 389 | return err; | 392 | return err; |
| 390 | } | 393 | } |
| @@ -424,8 +427,8 @@ xfs_mru_cache_destroy( | |||
| 424 | 427 | ||
| 425 | xfs_mru_cache_flush(mru); | 428 | xfs_mru_cache_flush(mru); |
| 426 | 429 | ||
| 427 | kmem_free(mru->lists, mru->grp_count * sizeof(*mru->lists)); | 430 | kmem_free(mru->lists); |
| 428 | kmem_free(mru, sizeof(*mru)); | 431 | kmem_free(mru); |
| 429 | } | 432 | } |
| 430 | 433 | ||
| 431 | /* | 434 | /* |
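The xfs_mru_cache_init() change above is the usual kernel idiom of unwinding partial initialisation through goto labels rather than repeating cleanup at every failure point; note it also flips the failure return from a positive ENOMEM to the conventional negative -ENOMEM. The general shape, with purely illustrative names (not real XFS API):

#include <linux/slab.h>
#include <linux/workqueue.h>

static struct kmem_cache *example_zone;
static struct workqueue_struct *example_wq;

static int example_init(void)
{
	example_zone = kmem_cache_create("example_zone",
					 sizeof(unsigned long), 0, 0, NULL);
	if (!example_zone)
		goto out;

	example_wq = create_singlethread_workqueue("example");
	if (!example_wq)
		goto out_destroy_zone;

	return 0;

 out_destroy_zone:
	kmem_cache_destroy(example_zone);
 out:
	return -ENOMEM;
}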
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index d8063e1ad298..d700dacdb10e 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c | |||
| @@ -336,22 +336,18 @@ xfs_rename( | |||
| 336 | ASSERT(error != EEXIST); | 336 | ASSERT(error != EEXIST); |
| 337 | if (error) | 337 | if (error) |
| 338 | goto abort_return; | 338 | goto abort_return; |
| 339 | xfs_ichgtime(src_ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); | ||
| 340 | |||
| 341 | } else { | ||
| 342 | /* | ||
| 343 | * We always want to hit the ctime on the source inode. | ||
| 344 | * We do it in the if clause above for the 'new_parent && | ||
| 345 | * src_is_directory' case, and here we get all the other | ||
| 346 | * cases. This isn't strictly required by the standards | ||
| 347 | * since the source inode isn't really being changed, | ||
| 348 | * but old unix file systems did it and some incremental | ||
| 349 | * backup programs won't work without it. | ||
| 350 | */ | ||
| 351 | xfs_ichgtime(src_ip, XFS_ICHGTIME_CHG); | ||
| 352 | } | 339 | } |
| 353 | 340 | ||
| 354 | /* | 341 | /* |
| 342 | * We always want to hit the ctime on the source inode. | ||
| 343 | * | ||
| 344 | * This isn't strictly required by the standards since the source | ||
| 345 | * inode isn't really being changed, but old unix file systems did | ||
| 346 | * it and some incremental backup programs won't work without it. | ||
| 347 | */ | ||
| 348 | xfs_ichgtime(src_ip, XFS_ICHGTIME_CHG); | ||
| 349 | |||
| 350 | /* | ||
| 355 | * Adjust the link count on src_dp. This is necessary when | 351 | * Adjust the link count on src_dp. This is necessary when |
| 356 | * renaming a directory, either within one parent when | 352 | * renaming a directory, either within one parent when |
| 357 | * the target existed, or across two parent directories. | 353 | * the target existed, or across two parent directories. |
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index a0dc6e5bc5b9..e2f68de16159 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c | |||
| @@ -74,18 +74,6 @@ STATIC int xfs_rtmodify_summary(xfs_mount_t *, xfs_trans_t *, int, | |||
| 74 | */ | 74 | */ |
| 75 | 75 | ||
| 76 | /* | 76 | /* |
| 77 | * xfs_lowbit32: get low bit set out of 32-bit argument, -1 if none set. | ||
| 78 | */ | ||
| 79 | STATIC int | ||
| 80 | xfs_lowbit32( | ||
| 81 | __uint32_t v) | ||
| 82 | { | ||
| 83 | if (v) | ||
| 84 | return ffs(v) - 1; | ||
| 85 | return -1; | ||
| 86 | } | ||
| 87 | |||
| 88 | /* | ||
| 89 | * Allocate space to the bitmap or summary file, and zero it, for growfs. | 77 | * Allocate space to the bitmap or summary file, and zero it, for growfs. |
| 90 | */ | 78 | */ |
| 91 | STATIC int /* error */ | 79 | STATIC int /* error */ |
| @@ -450,6 +438,7 @@ xfs_rtallocate_extent_near( | |||
| 450 | } | 438 | } |
| 451 | bbno = XFS_BITTOBLOCK(mp, bno); | 439 | bbno = XFS_BITTOBLOCK(mp, bno); |
| 452 | i = 0; | 440 | i = 0; |
| 441 | ASSERT(minlen != 0); | ||
| 453 | log2len = xfs_highbit32(minlen); | 442 | log2len = xfs_highbit32(minlen); |
| 454 | /* | 443 | /* |
| 455 | * Loop over all bitmap blocks (bbno + i is current block). | 444 | * Loop over all bitmap blocks (bbno + i is current block). |
| @@ -618,6 +607,8 @@ xfs_rtallocate_extent_size( | |||
| 618 | xfs_suminfo_t sum; /* summary information for extents */ | 607 | xfs_suminfo_t sum; /* summary information for extents */ |
| 619 | 608 | ||
| 620 | ASSERT(minlen % prod == 0 && maxlen % prod == 0); | 609 | ASSERT(minlen % prod == 0 && maxlen % prod == 0); |
| 610 | ASSERT(maxlen != 0); | ||
| 611 | |||
| 621 | /* | 612 | /* |
| 622 | * Loop over all the levels starting with maxlen. | 613 | * Loop over all the levels starting with maxlen. |
| 623 | * At each level, look at all the bitmap blocks, to see if there | 614 | * At each level, look at all the bitmap blocks, to see if there |
| @@ -675,6 +666,9 @@ xfs_rtallocate_extent_size( | |||
| 675 | *rtblock = NULLRTBLOCK; | 666 | *rtblock = NULLRTBLOCK; |
| 676 | return 0; | 667 | return 0; |
| 677 | } | 668 | } |
| 669 | ASSERT(minlen != 0); | ||
| 670 | ASSERT(maxlen != 0); | ||
| 671 | |||
| 678 | /* | 672 | /* |
| 679 | * Loop over sizes, from maxlen down to minlen. | 673 | * Loop over sizes, from maxlen down to minlen. |
| 680 | * This time, when we do the allocations, allow smaller ones | 674 | * This time, when we do the allocations, allow smaller ones |
| @@ -1961,6 +1955,7 @@ xfs_growfs_rt( | |||
| 1961 | nsbp->sb_blocksize * nsbp->sb_rextsize); | 1955 | nsbp->sb_blocksize * nsbp->sb_rextsize); |
| 1962 | nsbp->sb_rextents = nsbp->sb_rblocks; | 1956 | nsbp->sb_rextents = nsbp->sb_rblocks; |
| 1963 | do_div(nsbp->sb_rextents, nsbp->sb_rextsize); | 1957 | do_div(nsbp->sb_rextents, nsbp->sb_rextsize); |
| 1958 | ASSERT(nsbp->sb_rextents != 0); | ||
| 1964 | nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents); | 1959 | nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents); |
| 1965 | nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1; | 1960 | nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1; |
| 1966 | nrsumsize = | 1961 | nrsumsize = |
| @@ -2062,7 +2057,7 @@ xfs_growfs_rt( | |||
| 2062 | /* | 2057 | /* |
| 2063 | * Free the fake mp structure. | 2058 | * Free the fake mp structure. |
| 2064 | */ | 2059 | */ |
| 2065 | kmem_free(nmp, sizeof(*nmp)); | 2060 | kmem_free(nmp); |
| 2066 | 2061 | ||
| 2067 | return error; | 2062 | return error; |
| 2068 | } | 2063 | } |
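The ASSERT()s added in this file all sit directly in front of xfs_highbit32() calls: as with the xfs_lowbit32() helper removed above, the bit-search helpers return -1 when no bit is set, so a zero minlen, maxlen or sb_rextents would turn into a bogus log2 value. A self-contained sketch of the two helpers' semantics -- the low-bit definition is the one removed above, the high-bit version is an assumed equivalent written out longhand:

    /* Illustrative sketch only; not part of the patch. */
    #include <stdio.h>
    #include <strings.h>            /* ffs() */

    /* Same definition as the xfs_lowbit32() removed above. */
    static int lowbit32(unsigned int v)
    {
        if (v)
            return ffs(v) - 1;
        return -1;
    }

    /* Assumed counterpart: index of the highest set bit, -1 if none. */
    static int highbit32(unsigned int v)
    {
        int bit = -1;

        while (v) {
            bit++;
            v >>= 1;
        }
        return bit;
    }

    int main(void)
    {
        printf("%d %d\n", lowbit32(0x18), highbit32(0x18));  /* 3 4 */
        printf("%d %d\n", lowbit32(0), highbit32(0));        /* -1 -1: hence the ASSERTs */
        return 0;
    }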
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index b0f31c09a76d..3a82576dde9a 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c | |||
| @@ -314,7 +314,7 @@ xfs_bioerror_relse( | |||
| 314 | * ASYNC buffers. | 314 | * ASYNC buffers. |
| 315 | */ | 315 | */ |
| 316 | XFS_BUF_ERROR(bp, EIO); | 316 | XFS_BUF_ERROR(bp, EIO); |
| 317 | XFS_BUF_V_IODONESEMA(bp); | 317 | XFS_BUF_FINISH_IOWAIT(bp); |
| 318 | } else { | 318 | } else { |
| 319 | xfs_buf_relse(bp); | 319 | xfs_buf_relse(bp); |
| 320 | } | 320 | } |
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index d904efe7f871..3f8cf1587f4c 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h | |||
| @@ -46,10 +46,12 @@ struct xfs_mount; | |||
| 46 | #define XFS_SB_VERSION_SECTORBIT 0x0800 | 46 | #define XFS_SB_VERSION_SECTORBIT 0x0800 |
| 47 | #define XFS_SB_VERSION_EXTFLGBIT 0x1000 | 47 | #define XFS_SB_VERSION_EXTFLGBIT 0x1000 |
| 48 | #define XFS_SB_VERSION_DIRV2BIT 0x2000 | 48 | #define XFS_SB_VERSION_DIRV2BIT 0x2000 |
| 49 | #define XFS_SB_VERSION_BORGBIT 0x4000 /* ASCII only case-insens. */ | ||
| 49 | #define XFS_SB_VERSION_MOREBITSBIT 0x8000 | 50 | #define XFS_SB_VERSION_MOREBITSBIT 0x8000 |
| 50 | #define XFS_SB_VERSION_OKSASHFBITS \ | 51 | #define XFS_SB_VERSION_OKSASHFBITS \ |
| 51 | (XFS_SB_VERSION_EXTFLGBIT | \ | 52 | (XFS_SB_VERSION_EXTFLGBIT | \ |
| 52 | XFS_SB_VERSION_DIRV2BIT) | 53 | XFS_SB_VERSION_DIRV2BIT | \ |
| 54 | XFS_SB_VERSION_BORGBIT) | ||
| 53 | #define XFS_SB_VERSION_OKREALFBITS \ | 55 | #define XFS_SB_VERSION_OKREALFBITS \ |
| 54 | (XFS_SB_VERSION_ATTRBIT | \ | 56 | (XFS_SB_VERSION_ATTRBIT | \ |
| 55 | XFS_SB_VERSION_NLINKBIT | \ | 57 | XFS_SB_VERSION_NLINKBIT | \ |
| @@ -437,6 +439,12 @@ static inline int xfs_sb_version_hassector(xfs_sb_t *sbp) | |||
| 437 | ((sbp)->sb_versionnum & XFS_SB_VERSION_SECTORBIT); | 439 | ((sbp)->sb_versionnum & XFS_SB_VERSION_SECTORBIT); |
| 438 | } | 440 | } |
| 439 | 441 | ||
| 442 | static inline int xfs_sb_version_hasasciici(xfs_sb_t *sbp) | ||
| 443 | { | ||
| 444 | return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ | ||
| 445 | (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT); | ||
| 446 | } | ||
| 447 | |||
| 440 | static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp) | 448 | static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp) |
| 441 | { | 449 | { |
| 442 | return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ | 450 | return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ |
| @@ -473,6 +481,13 @@ static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp) | |||
| 473 | ((sbp)->sb_features2 | XFS_SB_VERSION2_ATTR2BIT))); | 481 | ((sbp)->sb_features2 | XFS_SB_VERSION2_ATTR2BIT))); |
| 474 | } | 482 | } |
| 475 | 483 | ||
| 484 | static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp) | ||
| 485 | { | ||
| 486 | sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT; | ||
| 487 | if (!sbp->sb_features2) | ||
| 488 | sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT; | ||
| 489 | } | ||
| 490 | |||
| 476 | /* | 491 | /* |
| 477 | * end of superblock version macros | 492 | * end of superblock version macros |
| 478 | */ | 493 | */ |
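Two helpers are added to the superblock version API above: a read-only test for the new ASCII case-insensitive feature bit, and a routine that withdraws the attr2 feature and drops MOREBITSBIT once sb_features2 is empty. A hedged usage sketch (the call sites are not part of this hunk; mp and its flags are assumed from the surrounding XFS code):

    /* Illustrative only. */
    if (xfs_sb_version_hasasciici(&mp->m_sb)) {
        /* version-4 superblock with XFS_SB_VERSION_BORGBIT set:
         * filenames are treated as ASCII-only case-insensitive */
    }

    if (!(mp->m_flags & XFS_MOUNT_ATTR2)) {
        /* clearing attr2 also clears MOREBITSBIT when nothing
         * else remains in sb_features2, per the inline above */
        xfs_sb_version_removeattr2(&mp->m_sb);
    }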
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 140386434aa3..4e1c22a23be5 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c | |||
| @@ -43,6 +43,7 @@ | |||
| 43 | #include "xfs_quota.h" | 43 | #include "xfs_quota.h" |
| 44 | #include "xfs_trans_priv.h" | 44 | #include "xfs_trans_priv.h" |
| 45 | #include "xfs_trans_space.h" | 45 | #include "xfs_trans_space.h" |
| 46 | #include "xfs_inode_item.h" | ||
| 46 | 47 | ||
| 47 | 48 | ||
| 48 | STATIC void xfs_trans_apply_sb_deltas(xfs_trans_t *); | 49 | STATIC void xfs_trans_apply_sb_deltas(xfs_trans_t *); |
| @@ -253,7 +254,7 @@ _xfs_trans_alloc( | |||
| 253 | tp->t_mountp = mp; | 254 | tp->t_mountp = mp; |
| 254 | tp->t_items_free = XFS_LIC_NUM_SLOTS; | 255 | tp->t_items_free = XFS_LIC_NUM_SLOTS; |
| 255 | tp->t_busy_free = XFS_LBC_NUM_SLOTS; | 256 | tp->t_busy_free = XFS_LBC_NUM_SLOTS; |
| 256 | XFS_LIC_INIT(&(tp->t_items)); | 257 | xfs_lic_init(&(tp->t_items)); |
| 257 | XFS_LBC_INIT(&(tp->t_busy)); | 258 | XFS_LBC_INIT(&(tp->t_busy)); |
| 258 | return tp; | 259 | return tp; |
| 259 | } | 260 | } |
| @@ -282,7 +283,7 @@ xfs_trans_dup( | |||
| 282 | ntp->t_mountp = tp->t_mountp; | 283 | ntp->t_mountp = tp->t_mountp; |
| 283 | ntp->t_items_free = XFS_LIC_NUM_SLOTS; | 284 | ntp->t_items_free = XFS_LIC_NUM_SLOTS; |
| 284 | ntp->t_busy_free = XFS_LBC_NUM_SLOTS; | 285 | ntp->t_busy_free = XFS_LBC_NUM_SLOTS; |
| 285 | XFS_LIC_INIT(&(ntp->t_items)); | 286 | xfs_lic_init(&(ntp->t_items)); |
| 286 | XFS_LBC_INIT(&(ntp->t_busy)); | 287 | XFS_LBC_INIT(&(ntp->t_busy)); |
| 287 | 288 | ||
| 288 | ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); | 289 | ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); |
| @@ -889,7 +890,7 @@ shut_us_down: | |||
| 889 | 890 | ||
| 890 | tp->t_commit_lsn = commit_lsn; | 891 | tp->t_commit_lsn = commit_lsn; |
| 891 | if (nvec > XFS_TRANS_LOGVEC_COUNT) { | 892 | if (nvec > XFS_TRANS_LOGVEC_COUNT) { |
| 892 | kmem_free(log_vector, nvec * sizeof(xfs_log_iovec_t)); | 893 | kmem_free(log_vector); |
| 893 | } | 894 | } |
| 894 | 895 | ||
| 895 | /* | 896 | /* |
| @@ -1169,7 +1170,7 @@ xfs_trans_cancel( | |||
| 1169 | while (licp != NULL) { | 1170 | while (licp != NULL) { |
| 1170 | lidp = licp->lic_descs; | 1171 | lidp = licp->lic_descs; |
| 1171 | for (i = 0; i < licp->lic_unused; i++, lidp++) { | 1172 | for (i = 0; i < licp->lic_unused; i++, lidp++) { |
| 1172 | if (XFS_LIC_ISFREE(licp, i)) { | 1173 | if (xfs_lic_isfree(licp, i)) { |
| 1173 | continue; | 1174 | continue; |
| 1174 | } | 1175 | } |
| 1175 | 1176 | ||
| @@ -1216,6 +1217,68 @@ xfs_trans_free( | |||
| 1216 | kmem_zone_free(xfs_trans_zone, tp); | 1217 | kmem_zone_free(xfs_trans_zone, tp); |
| 1217 | } | 1218 | } |
| 1218 | 1219 | ||
| 1220 | /* | ||
| 1221 | * Roll from one trans in the sequence of PERMANENT transactions to | ||
| 1222 | * the next: permanent transactions are only flushed out when | ||
| 1223 | * committed with XFS_TRANS_RELEASE_LOG_RES, but we still want to let | ||
| 1224 | * chunks of it go to the log as soon as possible. So we commit the | ||
| 1225 | * chunk we've been working on and get a new transaction to continue. | ||
| 1226 | */ | ||
| 1227 | int | ||
| 1228 | xfs_trans_roll( | ||
| 1229 | struct xfs_trans **tpp, | ||
| 1230 | struct xfs_inode *dp) | ||
| 1231 | { | ||
| 1232 | struct xfs_trans *trans; | ||
| 1233 | unsigned int logres, count; | ||
| 1234 | int error; | ||
| 1235 | |||
| 1236 | /* | ||
| 1237 | * Ensure that the inode is always logged. | ||
| 1238 | */ | ||
| 1239 | trans = *tpp; | ||
| 1240 | xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); | ||
| 1241 | |||
| 1242 | /* | ||
| 1243 | * Copy the critical parameters from one trans to the next. | ||
| 1244 | */ | ||
| 1245 | logres = trans->t_log_res; | ||
| 1246 | count = trans->t_log_count; | ||
| 1247 | *tpp = xfs_trans_dup(trans); | ||
| 1248 | |||
| 1249 | /* | ||
| 1250 | * Commit the current transaction. | ||
| 1251 | * If this commit failed, then it'd just unlock those items that | ||
| 1252 | * are not marked ihold. That also means that a filesystem shutdown | ||
| 1253 | * is in progress. The caller takes the responsibility to cancel | ||
| 1254 | * the duplicate transaction that gets returned. | ||
| 1255 | */ | ||
| 1256 | error = xfs_trans_commit(trans, 0); | ||
| 1257 | if (error) | ||
| 1258 | return (error); | ||
| 1259 | |||
| 1260 | trans = *tpp; | ||
| 1261 | |||
| 1262 | /* | ||
| 1263 | * Reserve space in the log for the next transaction. | ||
| 1264 | * This also pushes items in the "AIL", the list of logged items, | ||
| 1265 | * out to disk if they are taking up space at the tail of the log | ||
| 1266 | * that we want to use. This requires that either nothing be locked | ||
| 1267 | * across this call, or that anything that is locked be logged in | ||
| 1268 | * the prior and the next transactions. | ||
| 1269 | */ | ||
| 1270 | error = xfs_trans_reserve(trans, 0, logres, 0, | ||
| 1271 | XFS_TRANS_PERM_LOG_RES, count); | ||
| 1272 | /* | ||
| 1273 | * Ensure that the inode is in the new transaction and locked. | ||
| 1274 | */ | ||
| 1275 | if (error) | ||
| 1276 | return error; | ||
| 1277 | |||
| 1278 | xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL); | ||
| 1279 | xfs_trans_ihold(trans, dp); | ||
| 1280 | return 0; | ||
| 1281 | } | ||
| 1219 | 1282 | ||
| 1220 | /* | 1283 | /* |
| 1221 | * THIS SHOULD BE REWRITTEN TO USE xfs_trans_next_item(). | 1284 | * THIS SHOULD BE REWRITTEN TO USE xfs_trans_next_item(). |
| @@ -1253,7 +1316,7 @@ xfs_trans_committed( | |||
| 1253 | * Special case the chunk embedded in the transaction. | 1316 | * Special case the chunk embedded in the transaction. |
| 1254 | */ | 1317 | */ |
| 1255 | licp = &(tp->t_items); | 1318 | licp = &(tp->t_items); |
| 1256 | if (!(XFS_LIC_ARE_ALL_FREE(licp))) { | 1319 | if (!(xfs_lic_are_all_free(licp))) { |
| 1257 | xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag); | 1320 | xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag); |
| 1258 | } | 1321 | } |
| 1259 | 1322 | ||
| @@ -1262,10 +1325,10 @@ xfs_trans_committed( | |||
| 1262 | */ | 1325 | */ |
| 1263 | licp = licp->lic_next; | 1326 | licp = licp->lic_next; |
| 1264 | while (licp != NULL) { | 1327 | while (licp != NULL) { |
| 1265 | ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); | 1328 | ASSERT(!xfs_lic_are_all_free(licp)); |
| 1266 | xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag); | 1329 | xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag); |
| 1267 | next_licp = licp->lic_next; | 1330 | next_licp = licp->lic_next; |
| 1268 | kmem_free(licp, sizeof(xfs_log_item_chunk_t)); | 1331 | kmem_free(licp); |
| 1269 | licp = next_licp; | 1332 | licp = next_licp; |
| 1270 | } | 1333 | } |
| 1271 | 1334 | ||
| @@ -1325,7 +1388,7 @@ xfs_trans_chunk_committed( | |||
| 1325 | 1388 | ||
| 1326 | lidp = licp->lic_descs; | 1389 | lidp = licp->lic_descs; |
| 1327 | for (i = 0; i < licp->lic_unused; i++, lidp++) { | 1390 | for (i = 0; i < licp->lic_unused; i++, lidp++) { |
| 1328 | if (XFS_LIC_ISFREE(licp, i)) { | 1391 | if (xfs_lic_isfree(licp, i)) { |
| 1329 | continue; | 1392 | continue; |
| 1330 | } | 1393 | } |
| 1331 | 1394 | ||
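xfs_trans_roll() packages the commit/dup/reserve sequence that callers of permanent transactions previously open-coded. A hedged sketch of the intended call pattern; per the comment above, the caller is responsible for cancelling the duplicated transaction on failure (names are illustrative):

    /* Mid-way through a permanent transaction: tp was reserved with
     * XFS_TRANS_PERM_LOG_RES and ip is joined to it and held. */
    error = xfs_trans_roll(&tp, ip);
    if (error) {
        /* tp now points at the duplicate transaction; cancel it */
        xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
        return error;
    }
    /* keep logging against the fresh transaction in tp */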
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 0804207c7391..74c80bd2b0ec 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h | |||
| @@ -210,62 +210,52 @@ typedef struct xfs_log_item_chunk { | |||
| 210 | * lic_unused to the right value (0 matches all free). The | 210 | * lic_unused to the right value (0 matches all free). The |
| 211 | * lic_descs.lid_index values are set up as each desc is allocated. | 211 | * lic_descs.lid_index values are set up as each desc is allocated. |
| 212 | */ | 212 | */ |
| 213 | #define XFS_LIC_INIT(cp) xfs_lic_init(cp) | ||
| 214 | static inline void xfs_lic_init(xfs_log_item_chunk_t *cp) | 213 | static inline void xfs_lic_init(xfs_log_item_chunk_t *cp) |
| 215 | { | 214 | { |
| 216 | cp->lic_free = XFS_LIC_FREEMASK; | 215 | cp->lic_free = XFS_LIC_FREEMASK; |
| 217 | } | 216 | } |
| 218 | 217 | ||
| 219 | #define XFS_LIC_INIT_SLOT(cp,slot) xfs_lic_init_slot(cp, slot) | ||
| 220 | static inline void xfs_lic_init_slot(xfs_log_item_chunk_t *cp, int slot) | 218 | static inline void xfs_lic_init_slot(xfs_log_item_chunk_t *cp, int slot) |
| 221 | { | 219 | { |
| 222 | cp->lic_descs[slot].lid_index = (unsigned char)(slot); | 220 | cp->lic_descs[slot].lid_index = (unsigned char)(slot); |
| 223 | } | 221 | } |
| 224 | 222 | ||
| 225 | #define XFS_LIC_VACANCY(cp) xfs_lic_vacancy(cp) | ||
| 226 | static inline int xfs_lic_vacancy(xfs_log_item_chunk_t *cp) | 223 | static inline int xfs_lic_vacancy(xfs_log_item_chunk_t *cp) |
| 227 | { | 224 | { |
| 228 | return cp->lic_free & XFS_LIC_FREEMASK; | 225 | return cp->lic_free & XFS_LIC_FREEMASK; |
| 229 | } | 226 | } |
| 230 | 227 | ||
| 231 | #define XFS_LIC_ALL_FREE(cp) xfs_lic_all_free(cp) | ||
| 232 | static inline void xfs_lic_all_free(xfs_log_item_chunk_t *cp) | 228 | static inline void xfs_lic_all_free(xfs_log_item_chunk_t *cp) |
| 233 | { | 229 | { |
| 234 | cp->lic_free = XFS_LIC_FREEMASK; | 230 | cp->lic_free = XFS_LIC_FREEMASK; |
| 235 | } | 231 | } |
| 236 | 232 | ||
| 237 | #define XFS_LIC_ARE_ALL_FREE(cp) xfs_lic_are_all_free(cp) | ||
| 238 | static inline int xfs_lic_are_all_free(xfs_log_item_chunk_t *cp) | 233 | static inline int xfs_lic_are_all_free(xfs_log_item_chunk_t *cp) |
| 239 | { | 234 | { |
| 240 | return ((cp->lic_free & XFS_LIC_FREEMASK) == XFS_LIC_FREEMASK); | 235 | return ((cp->lic_free & XFS_LIC_FREEMASK) == XFS_LIC_FREEMASK); |
| 241 | } | 236 | } |
| 242 | 237 | ||
| 243 | #define XFS_LIC_ISFREE(cp,slot) xfs_lic_isfree(cp,slot) | ||
| 244 | static inline int xfs_lic_isfree(xfs_log_item_chunk_t *cp, int slot) | 238 | static inline int xfs_lic_isfree(xfs_log_item_chunk_t *cp, int slot) |
| 245 | { | 239 | { |
| 246 | return (cp->lic_free & (1 << slot)); | 240 | return (cp->lic_free & (1 << slot)); |
| 247 | } | 241 | } |
| 248 | 242 | ||
| 249 | #define XFS_LIC_CLAIM(cp,slot) xfs_lic_claim(cp,slot) | ||
| 250 | static inline void xfs_lic_claim(xfs_log_item_chunk_t *cp, int slot) | 243 | static inline void xfs_lic_claim(xfs_log_item_chunk_t *cp, int slot) |
| 251 | { | 244 | { |
| 252 | cp->lic_free &= ~(1 << slot); | 245 | cp->lic_free &= ~(1 << slot); |
| 253 | } | 246 | } |
| 254 | 247 | ||
| 255 | #define XFS_LIC_RELSE(cp,slot) xfs_lic_relse(cp,slot) | ||
| 256 | static inline void xfs_lic_relse(xfs_log_item_chunk_t *cp, int slot) | 248 | static inline void xfs_lic_relse(xfs_log_item_chunk_t *cp, int slot) |
| 257 | { | 249 | { |
| 258 | cp->lic_free |= 1 << slot; | 250 | cp->lic_free |= 1 << slot; |
| 259 | } | 251 | } |
| 260 | 252 | ||
| 261 | #define XFS_LIC_SLOT(cp,slot) xfs_lic_slot(cp,slot) | ||
| 262 | static inline xfs_log_item_desc_t * | 253 | static inline xfs_log_item_desc_t * |
| 263 | xfs_lic_slot(xfs_log_item_chunk_t *cp, int slot) | 254 | xfs_lic_slot(xfs_log_item_chunk_t *cp, int slot) |
| 264 | { | 255 | { |
| 265 | return &(cp->lic_descs[slot]); | 256 | return &(cp->lic_descs[slot]); |
| 266 | } | 257 | } |
| 267 | 258 | ||
| 268 | #define XFS_LIC_DESC_TO_SLOT(dp) xfs_lic_desc_to_slot(dp) | ||
| 269 | static inline int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp) | 259 | static inline int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp) |
| 270 | { | 260 | { |
| 271 | return (uint)dp->lid_index; | 261 | return (uint)dp->lid_index; |
| @@ -278,7 +268,6 @@ static inline int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp) | |||
| 278 | * All of this yields the address of the chunk, which is | 268 | * All of this yields the address of the chunk, which is |
| 279 | * cast to a chunk pointer. | 269 | * cast to a chunk pointer. |
| 280 | */ | 270 | */ |
| 281 | #define XFS_LIC_DESC_TO_CHUNK(dp) xfs_lic_desc_to_chunk(dp) | ||
| 282 | static inline xfs_log_item_chunk_t * | 271 | static inline xfs_log_item_chunk_t * |
| 283 | xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp) | 272 | xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp) |
| 284 | { | 273 | { |
| @@ -986,6 +975,7 @@ int _xfs_trans_commit(xfs_trans_t *, | |||
| 986 | int *); | 975 | int *); |
| 987 | #define xfs_trans_commit(tp, flags) _xfs_trans_commit(tp, flags, NULL) | 976 | #define xfs_trans_commit(tp, flags) _xfs_trans_commit(tp, flags, NULL) |
| 988 | void xfs_trans_cancel(xfs_trans_t *, int); | 977 | void xfs_trans_cancel(xfs_trans_t *, int); |
| 978 | int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); | ||
| 989 | int xfs_trans_ail_init(struct xfs_mount *); | 979 | int xfs_trans_ail_init(struct xfs_mount *); |
| 990 | void xfs_trans_ail_destroy(struct xfs_mount *); | 980 | void xfs_trans_ail_destroy(struct xfs_mount *); |
| 991 | void xfs_trans_push_ail(struct xfs_mount *, xfs_lsn_t); | 981 | void xfs_trans_push_ail(struct xfs_mount *, xfs_lsn_t); |
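With the XFS_LIC_* wrappers gone, callers use the lowercase inline functions directly; the xfs_trans_buf.c and xfs_trans_item.c hunks below show the conversion. A minimal sketch of the slot bookkeeping the helpers implement, mirroring those loops (variable names illustrative):

    /* Walk one log item chunk, skipping unclaimed slots. */
    xfs_log_item_chunk_t    *licp = &tp->t_items;
    xfs_log_item_desc_t     *lidp;
    int                     i;

    for (i = 0; i < licp->lic_unused; i++) {
        if (xfs_lic_isfree(licp, i))
            continue;                       /* slot not claimed */
        lidp = xfs_lic_slot(licp, i);       /* descriptor in this slot */
        /* a descriptor maps back to its chunk and slot */
        ASSERT(xfs_lic_desc_to_chunk(lidp) == licp);
        ASSERT(xfs_lic_desc_to_slot(lidp) == (uint)i);
    }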
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index cb0c5839154b..4e855b5ced66 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c | |||
| @@ -1021,16 +1021,16 @@ xfs_trans_buf_item_match( | |||
| 1021 | bp = NULL; | 1021 | bp = NULL; |
| 1022 | len = BBTOB(len); | 1022 | len = BBTOB(len); |
| 1023 | licp = &tp->t_items; | 1023 | licp = &tp->t_items; |
| 1024 | if (!XFS_LIC_ARE_ALL_FREE(licp)) { | 1024 | if (!xfs_lic_are_all_free(licp)) { |
| 1025 | for (i = 0; i < licp->lic_unused; i++) { | 1025 | for (i = 0; i < licp->lic_unused; i++) { |
| 1026 | /* | 1026 | /* |
| 1027 | * Skip unoccupied slots. | 1027 | * Skip unoccupied slots. |
| 1028 | */ | 1028 | */ |
| 1029 | if (XFS_LIC_ISFREE(licp, i)) { | 1029 | if (xfs_lic_isfree(licp, i)) { |
| 1030 | continue; | 1030 | continue; |
| 1031 | } | 1031 | } |
| 1032 | 1032 | ||
| 1033 | lidp = XFS_LIC_SLOT(licp, i); | 1033 | lidp = xfs_lic_slot(licp, i); |
| 1034 | blip = (xfs_buf_log_item_t *)lidp->lid_item; | 1034 | blip = (xfs_buf_log_item_t *)lidp->lid_item; |
| 1035 | if (blip->bli_item.li_type != XFS_LI_BUF) { | 1035 | if (blip->bli_item.li_type != XFS_LI_BUF) { |
| 1036 | continue; | 1036 | continue; |
| @@ -1074,7 +1074,7 @@ xfs_trans_buf_item_match_all( | |||
| 1074 | bp = NULL; | 1074 | bp = NULL; |
| 1075 | len = BBTOB(len); | 1075 | len = BBTOB(len); |
| 1076 | for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) { | 1076 | for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) { |
| 1077 | if (XFS_LIC_ARE_ALL_FREE(licp)) { | 1077 | if (xfs_lic_are_all_free(licp)) { |
| 1078 | ASSERT(licp == &tp->t_items); | 1078 | ASSERT(licp == &tp->t_items); |
| 1079 | ASSERT(licp->lic_next == NULL); | 1079 | ASSERT(licp->lic_next == NULL); |
| 1080 | return NULL; | 1080 | return NULL; |
| @@ -1083,11 +1083,11 @@ xfs_trans_buf_item_match_all( | |||
| 1083 | /* | 1083 | /* |
| 1084 | * Skip unoccupied slots. | 1084 | * Skip unoccupied slots. |
| 1085 | */ | 1085 | */ |
| 1086 | if (XFS_LIC_ISFREE(licp, i)) { | 1086 | if (xfs_lic_isfree(licp, i)) { |
| 1087 | continue; | 1087 | continue; |
| 1088 | } | 1088 | } |
| 1089 | 1089 | ||
| 1090 | lidp = XFS_LIC_SLOT(licp, i); | 1090 | lidp = xfs_lic_slot(licp, i); |
| 1091 | blip = (xfs_buf_log_item_t *)lidp->lid_item; | 1091 | blip = (xfs_buf_log_item_t *)lidp->lid_item; |
| 1092 | if (blip->bli_item.li_type != XFS_LI_BUF) { | 1092 | if (blip->bli_item.li_type != XFS_LI_BUF) { |
| 1093 | continue; | 1093 | continue; |
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index 4c70bf5e9985..2a1c0f071f91 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c | |||
| @@ -291,7 +291,7 @@ xfs_trans_inode_broot_debug( | |||
| 291 | iip = ip->i_itemp; | 291 | iip = ip->i_itemp; |
| 292 | if (iip->ili_root_size != 0) { | 292 | if (iip->ili_root_size != 0) { |
| 293 | ASSERT(iip->ili_orig_root != NULL); | 293 | ASSERT(iip->ili_orig_root != NULL); |
| 294 | kmem_free(iip->ili_orig_root, iip->ili_root_size); | 294 | kmem_free(iip->ili_orig_root); |
| 295 | iip->ili_root_size = 0; | 295 | iip->ili_root_size = 0; |
| 296 | iip->ili_orig_root = NULL; | 296 | iip->ili_orig_root = NULL; |
| 297 | } | 297 | } |
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c index 66a09f0d894b..3c666e8317f8 100644 --- a/fs/xfs/xfs_trans_item.c +++ b/fs/xfs/xfs_trans_item.c | |||
| @@ -53,11 +53,11 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip) | |||
| 53 | * Initialize the chunk, and then | 53 | * Initialize the chunk, and then |
| 54 | * claim the first slot in the newly allocated chunk. | 54 | * claim the first slot in the newly allocated chunk. |
| 55 | */ | 55 | */ |
| 56 | XFS_LIC_INIT(licp); | 56 | xfs_lic_init(licp); |
| 57 | XFS_LIC_CLAIM(licp, 0); | 57 | xfs_lic_claim(licp, 0); |
| 58 | licp->lic_unused = 1; | 58 | licp->lic_unused = 1; |
| 59 | XFS_LIC_INIT_SLOT(licp, 0); | 59 | xfs_lic_init_slot(licp, 0); |
| 60 | lidp = XFS_LIC_SLOT(licp, 0); | 60 | lidp = xfs_lic_slot(licp, 0); |
| 61 | 61 | ||
| 62 | /* | 62 | /* |
| 63 | * Link in the new chunk and update the free count. | 63 | * Link in the new chunk and update the free count. |
| @@ -88,14 +88,14 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip) | |||
| 88 | */ | 88 | */ |
| 89 | licp = &tp->t_items; | 89 | licp = &tp->t_items; |
| 90 | while (licp != NULL) { | 90 | while (licp != NULL) { |
| 91 | if (XFS_LIC_VACANCY(licp)) { | 91 | if (xfs_lic_vacancy(licp)) { |
| 92 | if (licp->lic_unused <= XFS_LIC_MAX_SLOT) { | 92 | if (licp->lic_unused <= XFS_LIC_MAX_SLOT) { |
| 93 | i = licp->lic_unused; | 93 | i = licp->lic_unused; |
| 94 | ASSERT(XFS_LIC_ISFREE(licp, i)); | 94 | ASSERT(xfs_lic_isfree(licp, i)); |
| 95 | break; | 95 | break; |
| 96 | } | 96 | } |
| 97 | for (i = 0; i <= XFS_LIC_MAX_SLOT; i++) { | 97 | for (i = 0; i <= XFS_LIC_MAX_SLOT; i++) { |
| 98 | if (XFS_LIC_ISFREE(licp, i)) | 98 | if (xfs_lic_isfree(licp, i)) |
| 99 | break; | 99 | break; |
| 100 | } | 100 | } |
| 101 | ASSERT(i <= XFS_LIC_MAX_SLOT); | 101 | ASSERT(i <= XFS_LIC_MAX_SLOT); |
| @@ -108,12 +108,12 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip) | |||
| 108 | * If we find a free descriptor, claim it, | 108 | * If we find a free descriptor, claim it, |
| 109 | * initialize it, and return it. | 109 | * initialize it, and return it. |
| 110 | */ | 110 | */ |
| 111 | XFS_LIC_CLAIM(licp, i); | 111 | xfs_lic_claim(licp, i); |
| 112 | if (licp->lic_unused <= i) { | 112 | if (licp->lic_unused <= i) { |
| 113 | licp->lic_unused = i + 1; | 113 | licp->lic_unused = i + 1; |
| 114 | XFS_LIC_INIT_SLOT(licp, i); | 114 | xfs_lic_init_slot(licp, i); |
| 115 | } | 115 | } |
| 116 | lidp = XFS_LIC_SLOT(licp, i); | 116 | lidp = xfs_lic_slot(licp, i); |
| 117 | tp->t_items_free--; | 117 | tp->t_items_free--; |
| 118 | lidp->lid_item = lip; | 118 | lidp->lid_item = lip; |
| 119 | lidp->lid_flags = 0; | 119 | lidp->lid_flags = 0; |
| @@ -136,9 +136,9 @@ xfs_trans_free_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp) | |||
| 136 | xfs_log_item_chunk_t *licp; | 136 | xfs_log_item_chunk_t *licp; |
| 137 | xfs_log_item_chunk_t **licpp; | 137 | xfs_log_item_chunk_t **licpp; |
| 138 | 138 | ||
| 139 | slot = XFS_LIC_DESC_TO_SLOT(lidp); | 139 | slot = xfs_lic_desc_to_slot(lidp); |
| 140 | licp = XFS_LIC_DESC_TO_CHUNK(lidp); | 140 | licp = xfs_lic_desc_to_chunk(lidp); |
| 141 | XFS_LIC_RELSE(licp, slot); | 141 | xfs_lic_relse(licp, slot); |
| 142 | lidp->lid_item->li_desc = NULL; | 142 | lidp->lid_item->li_desc = NULL; |
| 143 | tp->t_items_free++; | 143 | tp->t_items_free++; |
| 144 | 144 | ||
| @@ -154,14 +154,14 @@ xfs_trans_free_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp) | |||
| 154 | * Also decrement the transaction structure's count of free items | 154 | * Also decrement the transaction structure's count of free items |
| 155 | * by the number in a chunk since we are freeing an empty chunk. | 155 | * by the number in a chunk since we are freeing an empty chunk. |
| 156 | */ | 156 | */ |
| 157 | if (XFS_LIC_ARE_ALL_FREE(licp) && (licp != &(tp->t_items))) { | 157 | if (xfs_lic_are_all_free(licp) && (licp != &(tp->t_items))) { |
| 158 | licpp = &(tp->t_items.lic_next); | 158 | licpp = &(tp->t_items.lic_next); |
| 159 | while (*licpp != licp) { | 159 | while (*licpp != licp) { |
| 160 | ASSERT(*licpp != NULL); | 160 | ASSERT(*licpp != NULL); |
| 161 | licpp = &((*licpp)->lic_next); | 161 | licpp = &((*licpp)->lic_next); |
| 162 | } | 162 | } |
| 163 | *licpp = licp->lic_next; | 163 | *licpp = licp->lic_next; |
| 164 | kmem_free(licp, sizeof(xfs_log_item_chunk_t)); | 164 | kmem_free(licp); |
| 165 | tp->t_items_free -= XFS_LIC_NUM_SLOTS; | 165 | tp->t_items_free -= XFS_LIC_NUM_SLOTS; |
| 166 | } | 166 | } |
| 167 | } | 167 | } |
| @@ -207,20 +207,20 @@ xfs_trans_first_item(xfs_trans_t *tp) | |||
| 207 | /* | 207 | /* |
| 208 | * If it's not in the first chunk, skip to the second. | 208 | * If it's not in the first chunk, skip to the second. |
| 209 | */ | 209 | */ |
| 210 | if (XFS_LIC_ARE_ALL_FREE(licp)) { | 210 | if (xfs_lic_are_all_free(licp)) { |
| 211 | licp = licp->lic_next; | 211 | licp = licp->lic_next; |
| 212 | } | 212 | } |
| 213 | 213 | ||
| 214 | /* | 214 | /* |
| 215 | * Return the first non-free descriptor in the chunk. | 215 | * Return the first non-free descriptor in the chunk. |
| 216 | */ | 216 | */ |
| 217 | ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); | 217 | ASSERT(!xfs_lic_are_all_free(licp)); |
| 218 | for (i = 0; i < licp->lic_unused; i++) { | 218 | for (i = 0; i < licp->lic_unused; i++) { |
| 219 | if (XFS_LIC_ISFREE(licp, i)) { | 219 | if (xfs_lic_isfree(licp, i)) { |
| 220 | continue; | 220 | continue; |
| 221 | } | 221 | } |
| 222 | 222 | ||
| 223 | return XFS_LIC_SLOT(licp, i); | 223 | return xfs_lic_slot(licp, i); |
| 224 | } | 224 | } |
| 225 | cmn_err(CE_WARN, "xfs_trans_first_item() -- no first item"); | 225 | cmn_err(CE_WARN, "xfs_trans_first_item() -- no first item"); |
| 226 | return NULL; | 226 | return NULL; |
| @@ -242,18 +242,18 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp) | |||
| 242 | xfs_log_item_chunk_t *licp; | 242 | xfs_log_item_chunk_t *licp; |
| 243 | int i; | 243 | int i; |
| 244 | 244 | ||
| 245 | licp = XFS_LIC_DESC_TO_CHUNK(lidp); | 245 | licp = xfs_lic_desc_to_chunk(lidp); |
| 246 | 246 | ||
| 247 | /* | 247 | /* |
| 248 | * First search the rest of the chunk. The for loop keeps us | 248 | * First search the rest of the chunk. The for loop keeps us |
| 249 | * from referencing things beyond the end of the chunk. | 249 | * from referencing things beyond the end of the chunk. |
| 250 | */ | 250 | */ |
| 251 | for (i = (int)XFS_LIC_DESC_TO_SLOT(lidp) + 1; i < licp->lic_unused; i++) { | 251 | for (i = (int)xfs_lic_desc_to_slot(lidp) + 1; i < licp->lic_unused; i++) { |
| 252 | if (XFS_LIC_ISFREE(licp, i)) { | 252 | if (xfs_lic_isfree(licp, i)) { |
| 253 | continue; | 253 | continue; |
| 254 | } | 254 | } |
| 255 | 255 | ||
| 256 | return XFS_LIC_SLOT(licp, i); | 256 | return xfs_lic_slot(licp, i); |
| 257 | } | 257 | } |
| 258 | 258 | ||
| 259 | /* | 259 | /* |
| @@ -266,13 +266,13 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp) | |||
| 266 | } | 266 | } |
| 267 | 267 | ||
| 268 | licp = licp->lic_next; | 268 | licp = licp->lic_next; |
| 269 | ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); | 269 | ASSERT(!xfs_lic_are_all_free(licp)); |
| 270 | for (i = 0; i < licp->lic_unused; i++) { | 270 | for (i = 0; i < licp->lic_unused; i++) { |
| 271 | if (XFS_LIC_ISFREE(licp, i)) { | 271 | if (xfs_lic_isfree(licp, i)) { |
| 272 | continue; | 272 | continue; |
| 273 | } | 273 | } |
| 274 | 274 | ||
| 275 | return XFS_LIC_SLOT(licp, i); | 275 | return xfs_lic_slot(licp, i); |
| 276 | } | 276 | } |
| 277 | ASSERT(0); | 277 | ASSERT(0); |
| 278 | /* NOTREACHED */ | 278 | /* NOTREACHED */ |
| @@ -300,9 +300,9 @@ xfs_trans_free_items( | |||
| 300 | /* | 300 | /* |
| 301 | * Special case the embedded chunk so we don't free it below. | 301 | * Special case the embedded chunk so we don't free it below. |
| 302 | */ | 302 | */ |
| 303 | if (!XFS_LIC_ARE_ALL_FREE(licp)) { | 303 | if (!xfs_lic_are_all_free(licp)) { |
| 304 | (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); | 304 | (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); |
| 305 | XFS_LIC_ALL_FREE(licp); | 305 | xfs_lic_all_free(licp); |
| 306 | licp->lic_unused = 0; | 306 | licp->lic_unused = 0; |
| 307 | } | 307 | } |
| 308 | licp = licp->lic_next; | 308 | licp = licp->lic_next; |
| @@ -311,10 +311,10 @@ xfs_trans_free_items( | |||
| 311 | * Unlock each item in each chunk and free the chunks. | 311 | * Unlock each item in each chunk and free the chunks. |
| 312 | */ | 312 | */ |
| 313 | while (licp != NULL) { | 313 | while (licp != NULL) { |
| 314 | ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); | 314 | ASSERT(!xfs_lic_are_all_free(licp)); |
| 315 | (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); | 315 | (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); |
| 316 | next_licp = licp->lic_next; | 316 | next_licp = licp->lic_next; |
| 317 | kmem_free(licp, sizeof(xfs_log_item_chunk_t)); | 317 | kmem_free(licp); |
| 318 | licp = next_licp; | 318 | licp = next_licp; |
| 319 | } | 319 | } |
| 320 | 320 | ||
| @@ -347,7 +347,7 @@ xfs_trans_unlock_items(xfs_trans_t *tp, xfs_lsn_t commit_lsn) | |||
| 347 | /* | 347 | /* |
| 348 | * Special case the embedded chunk so we don't free. | 348 | * Special case the embedded chunk so we don't free. |
| 349 | */ | 349 | */ |
| 350 | if (!XFS_LIC_ARE_ALL_FREE(licp)) { | 350 | if (!xfs_lic_are_all_free(licp)) { |
| 351 | freed = xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn); | 351 | freed = xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn); |
| 352 | } | 352 | } |
| 353 | licpp = &(tp->t_items.lic_next); | 353 | licpp = &(tp->t_items.lic_next); |
| @@ -358,12 +358,12 @@ xfs_trans_unlock_items(xfs_trans_t *tp, xfs_lsn_t commit_lsn) | |||
| 358 | * and free empty chunks. | 358 | * and free empty chunks. |
| 359 | */ | 359 | */ |
| 360 | while (licp != NULL) { | 360 | while (licp != NULL) { |
| 361 | ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); | 361 | ASSERT(!xfs_lic_are_all_free(licp)); |
| 362 | freed += xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn); | 362 | freed += xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn); |
| 363 | next_licp = licp->lic_next; | 363 | next_licp = licp->lic_next; |
| 364 | if (XFS_LIC_ARE_ALL_FREE(licp)) { | 364 | if (xfs_lic_are_all_free(licp)) { |
| 365 | *licpp = next_licp; | 365 | *licpp = next_licp; |
| 366 | kmem_free(licp, sizeof(xfs_log_item_chunk_t)); | 366 | kmem_free(licp); |
| 367 | freed -= XFS_LIC_NUM_SLOTS; | 367 | freed -= XFS_LIC_NUM_SLOTS; |
| 368 | } else { | 368 | } else { |
| 369 | licpp = &(licp->lic_next); | 369 | licpp = &(licp->lic_next); |
| @@ -402,7 +402,7 @@ xfs_trans_unlock_chunk( | |||
| 402 | freed = 0; | 402 | freed = 0; |
| 403 | lidp = licp->lic_descs; | 403 | lidp = licp->lic_descs; |
| 404 | for (i = 0; i < licp->lic_unused; i++, lidp++) { | 404 | for (i = 0; i < licp->lic_unused; i++, lidp++) { |
| 405 | if (XFS_LIC_ISFREE(licp, i)) { | 405 | if (xfs_lic_isfree(licp, i)) { |
| 406 | continue; | 406 | continue; |
| 407 | } | 407 | } |
| 408 | lip = lidp->lid_item; | 408 | lip = lidp->lid_item; |
| @@ -421,7 +421,7 @@ xfs_trans_unlock_chunk( | |||
| 421 | */ | 421 | */ |
| 422 | if (!(freeing_chunk) && | 422 | if (!(freeing_chunk) && |
| 423 | (!(lidp->lid_flags & XFS_LID_DIRTY) || abort)) { | 423 | (!(lidp->lid_flags & XFS_LID_DIRTY) || abort)) { |
| 424 | XFS_LIC_RELSE(licp, i); | 424 | xfs_lic_relse(licp, i); |
| 425 | freed++; | 425 | freed++; |
| 426 | } | 426 | } |
| 427 | } | 427 | } |
| @@ -530,7 +530,7 @@ xfs_trans_free_busy(xfs_trans_t *tp) | |||
| 530 | lbcp = tp->t_busy.lbc_next; | 530 | lbcp = tp->t_busy.lbc_next; |
| 531 | while (lbcp != NULL) { | 531 | while (lbcp != NULL) { |
| 532 | lbcq = lbcp->lbc_next; | 532 | lbcq = lbcp->lbc_next; |
| 533 | kmem_free(lbcp, sizeof(xfs_log_busy_chunk_t)); | 533 | kmem_free(lbcp); |
| 534 | lbcp = lbcq; | 534 | lbcp = lbcq; |
| 535 | } | 535 | } |
| 536 | 536 | ||
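The kmem_free() calls touched in this file, and in xfs_rtalloc.c, xfs_trans.c and xfs_trans_inode.c above, all make the same mechanical change: the interface no longer takes the allocation size. A before/after sketch (the prototype itself lives in the XFS Linux support code and is not shown in this series):

    /* old: the caller had to carry the allocation size around */
    kmem_free(licp, sizeof(xfs_log_item_chunk_t));

    /* new: the allocator recovers the size itself */
    kmem_free(licp);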
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c index 98e5f110ba5f..35d4d414bcc2 100644 --- a/fs/xfs/xfs_utils.c +++ b/fs/xfs/xfs_utils.c | |||
| @@ -237,7 +237,7 @@ xfs_droplink( | |||
| 237 | 237 | ||
| 238 | ASSERT (ip->i_d.di_nlink > 0); | 238 | ASSERT (ip->i_d.di_nlink > 0); |
| 239 | ip->i_d.di_nlink--; | 239 | ip->i_d.di_nlink--; |
| 240 | drop_nlink(ip->i_vnode); | 240 | drop_nlink(VFS_I(ip)); |
| 241 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); | 241 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); |
| 242 | 242 | ||
| 243 | error = 0; | 243 | error = 0; |
| @@ -301,7 +301,7 @@ xfs_bumplink( | |||
| 301 | 301 | ||
| 302 | ASSERT(ip->i_d.di_nlink > 0); | 302 | ASSERT(ip->i_d.di_nlink > 0); |
| 303 | ip->i_d.di_nlink++; | 303 | ip->i_d.di_nlink++; |
| 304 | inc_nlink(ip->i_vnode); | 304 | inc_nlink(VFS_I(ip)); |
| 305 | if ((ip->i_d.di_version == XFS_DINODE_VERSION_1) && | 305 | if ((ip->i_d.di_version == XFS_DINODE_VERSION_1) && |
| 306 | (ip->i_d.di_nlink > XFS_MAXLINK_1)) { | 306 | (ip->i_d.di_nlink > XFS_MAXLINK_1)) { |
| 307 | /* | 307 | /* |
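The link-count updates above switch from dereferencing ip->i_vnode to VFS_I(), the accessor that maps an xfs_inode to its Linux struct inode; the vfsops and sync hunks below follow the same conversion. A before/after sketch taken from the pattern shown here:

    /* old: reach into the xfs_inode for the VFS inode pointer */
    drop_nlink(ip->i_vnode);
    inc_nlink(ip->i_vnode);

    /* new: go through the VFS_I() accessor */
    drop_nlink(VFS_I(ip));
    inc_nlink(VFS_I(ip));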
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h index f316cb85d8e2..ef321225d269 100644 --- a/fs/xfs/xfs_utils.h +++ b/fs/xfs/xfs_utils.h | |||
| @@ -18,9 +18,6 @@ | |||
| 18 | #ifndef __XFS_UTILS_H__ | 18 | #ifndef __XFS_UTILS_H__ |
| 19 | #define __XFS_UTILS_H__ | 19 | #define __XFS_UTILS_H__ |
| 20 | 20 | ||
| 21 | #define IRELE(ip) VN_RELE(XFS_ITOV(ip)) | ||
| 22 | #define IHOLD(ip) VN_HOLD(XFS_ITOV(ip)) | ||
| 23 | |||
| 24 | extern int xfs_truncate_file(xfs_mount_t *, xfs_inode_t *); | 21 | extern int xfs_truncate_file(xfs_mount_t *, xfs_inode_t *); |
| 25 | extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t, | 22 | extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t, |
| 26 | xfs_dev_t, cred_t *, prid_t, int, | 23 | xfs_dev_t, cred_t *, prid_t, int, |
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index 30bacd8bb0e5..439dd3939dda 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c | |||
| @@ -58,586 +58,6 @@ | |||
| 58 | #include "xfs_utils.h" | 58 | #include "xfs_utils.h" |
| 59 | 59 | ||
| 60 | 60 | ||
| 61 | int __init | ||
| 62 | xfs_init(void) | ||
| 63 | { | ||
| 64 | #ifdef XFS_DABUF_DEBUG | ||
| 65 | extern spinlock_t xfs_dabuf_global_lock; | ||
| 66 | spin_lock_init(&xfs_dabuf_global_lock); | ||
| 67 | #endif | ||
| 68 | |||
| 69 | /* | ||
| 70 | * Initialize all of the zone allocators we use. | ||
| 71 | */ | ||
| 72 | xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t), | ||
| 73 | "xfs_log_ticket"); | ||
| 74 | xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t), | ||
| 75 | "xfs_bmap_free_item"); | ||
| 76 | xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t), | ||
| 77 | "xfs_btree_cur"); | ||
| 78 | xfs_da_state_zone = kmem_zone_init(sizeof(xfs_da_state_t), | ||
| 79 | "xfs_da_state"); | ||
| 80 | xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf"); | ||
| 81 | xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork"); | ||
| 82 | xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans"); | ||
| 83 | xfs_acl_zone_init(xfs_acl_zone, "xfs_acl"); | ||
| 84 | xfs_mru_cache_init(); | ||
| 85 | xfs_filestream_init(); | ||
| 86 | |||
| 87 | /* | ||
| 88 | * The size of the zone allocated buf log item is the maximum | ||
| 89 | * size possible under XFS. This wastes a little bit of memory, | ||
| 90 | * but it is much faster. | ||
| 91 | */ | ||
| 92 | xfs_buf_item_zone = | ||
| 93 | kmem_zone_init((sizeof(xfs_buf_log_item_t) + | ||
| 94 | (((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) / | ||
| 95 | NBWORD) * sizeof(int))), | ||
| 96 | "xfs_buf_item"); | ||
| 97 | xfs_efd_zone = | ||
| 98 | kmem_zone_init((sizeof(xfs_efd_log_item_t) + | ||
| 99 | ((XFS_EFD_MAX_FAST_EXTENTS - 1) * | ||
| 100 | sizeof(xfs_extent_t))), | ||
| 101 | "xfs_efd_item"); | ||
| 102 | xfs_efi_zone = | ||
| 103 | kmem_zone_init((sizeof(xfs_efi_log_item_t) + | ||
| 104 | ((XFS_EFI_MAX_FAST_EXTENTS - 1) * | ||
| 105 | sizeof(xfs_extent_t))), | ||
| 106 | "xfs_efi_item"); | ||
| 107 | |||
| 108 | /* | ||
| 109 | * These zones warrant special memory allocator hints | ||
| 110 | */ | ||
| 111 | xfs_inode_zone = | ||
| 112 | kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode", | ||
| 113 | KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | | ||
| 114 | KM_ZONE_SPREAD, NULL); | ||
| 115 | xfs_ili_zone = | ||
| 116 | kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili", | ||
| 117 | KM_ZONE_SPREAD, NULL); | ||
| 118 | |||
| 119 | /* | ||
| 120 | * Allocate global trace buffers. | ||
| 121 | */ | ||
| 122 | #ifdef XFS_ALLOC_TRACE | ||
| 123 | xfs_alloc_trace_buf = ktrace_alloc(XFS_ALLOC_TRACE_SIZE, KM_SLEEP); | ||
| 124 | #endif | ||
| 125 | #ifdef XFS_BMAP_TRACE | ||
| 126 | xfs_bmap_trace_buf = ktrace_alloc(XFS_BMAP_TRACE_SIZE, KM_SLEEP); | ||
| 127 | #endif | ||
| 128 | #ifdef XFS_BMBT_TRACE | ||
| 129 | xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_SLEEP); | ||
| 130 | #endif | ||
| 131 | #ifdef XFS_ATTR_TRACE | ||
| 132 | xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_SLEEP); | ||
| 133 | #endif | ||
| 134 | #ifdef XFS_DIR2_TRACE | ||
| 135 | xfs_dir2_trace_buf = ktrace_alloc(XFS_DIR2_GTRACE_SIZE, KM_SLEEP); | ||
| 136 | #endif | ||
| 137 | |||
| 138 | xfs_dir_startup(); | ||
| 139 | |||
| 140 | #if (defined(DEBUG) || defined(INDUCE_IO_ERROR)) | ||
| 141 | xfs_error_test_init(); | ||
| 142 | #endif /* DEBUG || INDUCE_IO_ERROR */ | ||
| 143 | |||
| 144 | xfs_init_procfs(); | ||
| 145 | xfs_sysctl_register(); | ||
| 146 | return 0; | ||
| 147 | } | ||
| 148 | |||
| 149 | void __exit | ||
| 150 | xfs_cleanup(void) | ||
| 151 | { | ||
| 152 | extern kmem_zone_t *xfs_inode_zone; | ||
| 153 | extern kmem_zone_t *xfs_efd_zone; | ||
| 154 | extern kmem_zone_t *xfs_efi_zone; | ||
| 155 | |||
| 156 | xfs_cleanup_procfs(); | ||
| 157 | xfs_sysctl_unregister(); | ||
| 158 | xfs_filestream_uninit(); | ||
| 159 | xfs_mru_cache_uninit(); | ||
| 160 | xfs_acl_zone_destroy(xfs_acl_zone); | ||
| 161 | |||
| 162 | #ifdef XFS_DIR2_TRACE | ||
| 163 | ktrace_free(xfs_dir2_trace_buf); | ||
| 164 | #endif | ||
| 165 | #ifdef XFS_ATTR_TRACE | ||
| 166 | ktrace_free(xfs_attr_trace_buf); | ||
| 167 | #endif | ||
| 168 | #ifdef XFS_BMBT_TRACE | ||
| 169 | ktrace_free(xfs_bmbt_trace_buf); | ||
| 170 | #endif | ||
| 171 | #ifdef XFS_BMAP_TRACE | ||
| 172 | ktrace_free(xfs_bmap_trace_buf); | ||
| 173 | #endif | ||
| 174 | #ifdef XFS_ALLOC_TRACE | ||
| 175 | ktrace_free(xfs_alloc_trace_buf); | ||
| 176 | #endif | ||
| 177 | |||
| 178 | kmem_zone_destroy(xfs_bmap_free_item_zone); | ||
| 179 | kmem_zone_destroy(xfs_btree_cur_zone); | ||
| 180 | kmem_zone_destroy(xfs_inode_zone); | ||
| 181 | kmem_zone_destroy(xfs_trans_zone); | ||
| 182 | kmem_zone_destroy(xfs_da_state_zone); | ||
| 183 | kmem_zone_destroy(xfs_dabuf_zone); | ||
| 184 | kmem_zone_destroy(xfs_buf_item_zone); | ||
| 185 | kmem_zone_destroy(xfs_efd_zone); | ||
| 186 | kmem_zone_destroy(xfs_efi_zone); | ||
| 187 | kmem_zone_destroy(xfs_ifork_zone); | ||
| 188 | kmem_zone_destroy(xfs_ili_zone); | ||
| 189 | kmem_zone_destroy(xfs_log_ticket_zone); | ||
| 190 | } | ||
| 191 | |||
| 192 | /* | ||
| 193 | * xfs_start_flags | ||
| 194 | * | ||
| 195 | * This function fills in xfs_mount_t fields based on mount args. | ||
| 196 | * Note: the superblock has _not_ yet been read in. | ||
| 197 | */ | ||
| 198 | STATIC int | ||
| 199 | xfs_start_flags( | ||
| 200 | struct xfs_mount_args *ap, | ||
| 201 | struct xfs_mount *mp) | ||
| 202 | { | ||
| 203 | /* Values are in BBs */ | ||
| 204 | if ((ap->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) { | ||
| 205 | /* | ||
| 206 | * At this point the superblock has not been read | ||
| 207 | * in, therefore we do not know the block size. | ||
| 208 | * Before the mount call ends we will convert | ||
| 209 | * these to FSBs. | ||
| 210 | */ | ||
| 211 | mp->m_dalign = ap->sunit; | ||
| 212 | mp->m_swidth = ap->swidth; | ||
| 213 | } | ||
| 214 | |||
| 215 | if (ap->logbufs != -1 && | ||
| 216 | ap->logbufs != 0 && | ||
| 217 | (ap->logbufs < XLOG_MIN_ICLOGS || | ||
| 218 | ap->logbufs > XLOG_MAX_ICLOGS)) { | ||
| 219 | cmn_err(CE_WARN, | ||
| 220 | "XFS: invalid logbufs value: %d [not %d-%d]", | ||
| 221 | ap->logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); | ||
| 222 | return XFS_ERROR(EINVAL); | ||
| 223 | } | ||
| 224 | mp->m_logbufs = ap->logbufs; | ||
| 225 | if (ap->logbufsize != -1 && | ||
| 226 | ap->logbufsize != 0 && | ||
| 227 | (ap->logbufsize < XLOG_MIN_RECORD_BSIZE || | ||
| 228 | ap->logbufsize > XLOG_MAX_RECORD_BSIZE || | ||
| 229 | !is_power_of_2(ap->logbufsize))) { | ||
| 230 | cmn_err(CE_WARN, | ||
| 231 | "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", | ||
| 232 | ap->logbufsize); | ||
| 233 | return XFS_ERROR(EINVAL); | ||
| 234 | } | ||
| 235 | mp->m_logbsize = ap->logbufsize; | ||
| 236 | mp->m_fsname_len = strlen(ap->fsname) + 1; | ||
| 237 | mp->m_fsname = kmem_alloc(mp->m_fsname_len, KM_SLEEP); | ||
| 238 | strcpy(mp->m_fsname, ap->fsname); | ||
| 239 | if (ap->rtname[0]) { | ||
| 240 | mp->m_rtname = kmem_alloc(strlen(ap->rtname) + 1, KM_SLEEP); | ||
| 241 | strcpy(mp->m_rtname, ap->rtname); | ||
| 242 | } | ||
| 243 | if (ap->logname[0]) { | ||
| 244 | mp->m_logname = kmem_alloc(strlen(ap->logname) + 1, KM_SLEEP); | ||
| 245 | strcpy(mp->m_logname, ap->logname); | ||
| 246 | } | ||
| 247 | |||
| 248 | if (ap->flags & XFSMNT_WSYNC) | ||
| 249 | mp->m_flags |= XFS_MOUNT_WSYNC; | ||
| 250 | #if XFS_BIG_INUMS | ||
| 251 | if (ap->flags & XFSMNT_INO64) { | ||
| 252 | mp->m_flags |= XFS_MOUNT_INO64; | ||
| 253 | mp->m_inoadd = XFS_INO64_OFFSET; | ||
| 254 | } | ||
| 255 | #endif | ||
| 256 | if (ap->flags & XFSMNT_RETERR) | ||
| 257 | mp->m_flags |= XFS_MOUNT_RETERR; | ||
| 258 | if (ap->flags & XFSMNT_NOALIGN) | ||
| 259 | mp->m_flags |= XFS_MOUNT_NOALIGN; | ||
| 260 | if (ap->flags & XFSMNT_SWALLOC) | ||
| 261 | mp->m_flags |= XFS_MOUNT_SWALLOC; | ||
| 262 | if (ap->flags & XFSMNT_OSYNCISOSYNC) | ||
| 263 | mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC; | ||
| 264 | if (ap->flags & XFSMNT_32BITINODES) | ||
| 265 | mp->m_flags |= XFS_MOUNT_32BITINODES; | ||
| 266 | |||
| 267 | if (ap->flags & XFSMNT_IOSIZE) { | ||
| 268 | if (ap->iosizelog > XFS_MAX_IO_LOG || | ||
| 269 | ap->iosizelog < XFS_MIN_IO_LOG) { | ||
| 270 | cmn_err(CE_WARN, | ||
| 271 | "XFS: invalid log iosize: %d [not %d-%d]", | ||
| 272 | ap->iosizelog, XFS_MIN_IO_LOG, | ||
| 273 | XFS_MAX_IO_LOG); | ||
| 274 | return XFS_ERROR(EINVAL); | ||
| 275 | } | ||
| 276 | |||
| 277 | mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE; | ||
| 278 | mp->m_readio_log = mp->m_writeio_log = ap->iosizelog; | ||
| 279 | } | ||
| 280 | |||
| 281 | if (ap->flags & XFSMNT_IKEEP) | ||
| 282 | mp->m_flags |= XFS_MOUNT_IKEEP; | ||
| 283 | if (ap->flags & XFSMNT_DIRSYNC) | ||
| 284 | mp->m_flags |= XFS_MOUNT_DIRSYNC; | ||
| 285 | if (ap->flags & XFSMNT_ATTR2) | ||
| 286 | mp->m_flags |= XFS_MOUNT_ATTR2; | ||
| 287 | |||
| 288 | if (ap->flags2 & XFSMNT2_COMPAT_IOSIZE) | ||
| 289 | mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; | ||
| 290 | |||
| 291 | /* | ||
| 292 | * no recovery flag requires a read-only mount | ||
| 293 | */ | ||
| 294 | if (ap->flags & XFSMNT_NORECOVERY) { | ||
| 295 | if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { | ||
| 296 | cmn_err(CE_WARN, | ||
| 297 | "XFS: tried to mount a FS read-write without recovery!"); | ||
| 298 | return XFS_ERROR(EINVAL); | ||
| 299 | } | ||
| 300 | mp->m_flags |= XFS_MOUNT_NORECOVERY; | ||
| 301 | } | ||
| 302 | |||
| 303 | if (ap->flags & XFSMNT_NOUUID) | ||
| 304 | mp->m_flags |= XFS_MOUNT_NOUUID; | ||
| 305 | if (ap->flags & XFSMNT_BARRIER) | ||
| 306 | mp->m_flags |= XFS_MOUNT_BARRIER; | ||
| 307 | else | ||
| 308 | mp->m_flags &= ~XFS_MOUNT_BARRIER; | ||
| 309 | |||
| 310 | if (ap->flags2 & XFSMNT2_FILESTREAMS) | ||
| 311 | mp->m_flags |= XFS_MOUNT_FILESTREAMS; | ||
| 312 | |||
| 313 | if (ap->flags & XFSMNT_DMAPI) | ||
| 314 | mp->m_flags |= XFS_MOUNT_DMAPI; | ||
| 315 | return 0; | ||
| 316 | } | ||
| 317 | |||
| 318 | /* | ||
| 319 | * This function fills in xfs_mount_t fields based on mount args. | ||
| 320 | * Note: the superblock _has_ now been read in. | ||
| 321 | */ | ||
| 322 | STATIC int | ||
| 323 | xfs_finish_flags( | ||
| 324 | struct xfs_mount_args *ap, | ||
| 325 | struct xfs_mount *mp) | ||
| 326 | { | ||
| 327 | int ronly = (mp->m_flags & XFS_MOUNT_RDONLY); | ||
| 328 | |||
| 329 | /* Fail a mount where the logbuf is smaller than the log stripe */ | ||
| 330 | if (xfs_sb_version_haslogv2(&mp->m_sb)) { | ||
| 331 | if ((ap->logbufsize <= 0) && | ||
| 332 | (mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE)) { | ||
| 333 | mp->m_logbsize = mp->m_sb.sb_logsunit; | ||
| 334 | } else if (ap->logbufsize > 0 && | ||
| 335 | ap->logbufsize < mp->m_sb.sb_logsunit) { | ||
| 336 | cmn_err(CE_WARN, | ||
| 337 | "XFS: logbuf size must be greater than or equal to log stripe size"); | ||
| 338 | return XFS_ERROR(EINVAL); | ||
| 339 | } | ||
| 340 | } else { | ||
| 341 | /* Fail a mount if the logbuf is larger than 32K */ | ||
| 342 | if (ap->logbufsize > XLOG_BIG_RECORD_BSIZE) { | ||
| 343 | cmn_err(CE_WARN, | ||
| 344 | "XFS: logbuf size for version 1 logs must be 16K or 32K"); | ||
| 345 | return XFS_ERROR(EINVAL); | ||
| 346 | } | ||
| 347 | } | ||
| 348 | |||
| 349 | if (xfs_sb_version_hasattr2(&mp->m_sb)) | ||
| 350 | mp->m_flags |= XFS_MOUNT_ATTR2; | ||
| 351 | |||
| 352 | /* | ||
| 353 | * prohibit r/w mounts of read-only filesystems | ||
| 354 | */ | ||
| 355 | if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) { | ||
| 356 | cmn_err(CE_WARN, | ||
| 357 | "XFS: cannot mount a read-only filesystem as read-write"); | ||
| 358 | return XFS_ERROR(EROFS); | ||
| 359 | } | ||
| 360 | |||
| 361 | /* | ||
| 362 | * check for shared mount. | ||
| 363 | */ | ||
| 364 | if (ap->flags & XFSMNT_SHARED) { | ||
| 365 | if (!xfs_sb_version_hasshared(&mp->m_sb)) | ||
| 366 | return XFS_ERROR(EINVAL); | ||
| 367 | |||
| 368 | /* | ||
| 369 | * For IRIX 6.5, shared mounts must have the shared | ||
| 370 | * version bit set, have the persistent readonly | ||
| 371 | * field set, must be version 0 and can only be mounted | ||
| 372 | * read-only. | ||
| 373 | */ | ||
| 374 | if (!ronly || !(mp->m_sb.sb_flags & XFS_SBF_READONLY) || | ||
| 375 | (mp->m_sb.sb_shared_vn != 0)) | ||
| 376 | return XFS_ERROR(EINVAL); | ||
| 377 | |||
| 378 | mp->m_flags |= XFS_MOUNT_SHARED; | ||
| 379 | |||
| 380 | /* | ||
| 381 | * Shared XFS V0 can't deal with DMI. Return EINVAL. | ||
| 382 | */ | ||
| 383 | if (mp->m_sb.sb_shared_vn == 0 && (ap->flags & XFSMNT_DMAPI)) | ||
| 384 | return XFS_ERROR(EINVAL); | ||
| 385 | } | ||
| 386 | |||
| 387 | if (ap->flags & XFSMNT_UQUOTA) { | ||
| 388 | mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE); | ||
| 389 | if (ap->flags & XFSMNT_UQUOTAENF) | ||
| 390 | mp->m_qflags |= XFS_UQUOTA_ENFD; | ||
| 391 | } | ||
| 392 | |||
| 393 | if (ap->flags & XFSMNT_GQUOTA) { | ||
| 394 | mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); | ||
| 395 | if (ap->flags & XFSMNT_GQUOTAENF) | ||
| 396 | mp->m_qflags |= XFS_OQUOTA_ENFD; | ||
| 397 | } else if (ap->flags & XFSMNT_PQUOTA) { | ||
| 398 | mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); | ||
| 399 | if (ap->flags & XFSMNT_PQUOTAENF) | ||
| 400 | mp->m_qflags |= XFS_OQUOTA_ENFD; | ||
| 401 | } | ||
| 402 | |||
| 403 | return 0; | ||
| 404 | } | ||
| 405 | |||
| 406 | /* | ||
| 407 | * xfs_mount | ||
| 408 | * | ||
| 409 | * The file system configurations are: | ||
| 410 | * (1) device (partition) with data and internal log | ||
| 411 | * (2) logical volume with data and log subvolumes. | ||
| 412 | * (3) logical volume with data, log, and realtime subvolumes. | ||
| 413 | * | ||
| 414 | * We only have to handle opening the log and realtime volumes here if | ||
| 415 | * they are present. The data subvolume has already been opened by | ||
| 416 | * get_sb_bdev() and is stored in vfsp->vfs_super->s_bdev. | ||
| 417 | */ | ||
| 418 | int | ||
| 419 | xfs_mount( | ||
| 420 | struct xfs_mount *mp, | ||
| 421 | struct xfs_mount_args *args, | ||
| 422 | cred_t *credp) | ||
| 423 | { | ||
| 424 | struct block_device *ddev, *logdev, *rtdev; | ||
| 425 | int flags = 0, error; | ||
| 426 | |||
| 427 | ddev = mp->m_super->s_bdev; | ||
| 428 | logdev = rtdev = NULL; | ||
| 429 | |||
| 430 | error = xfs_dmops_get(mp, args); | ||
| 431 | if (error) | ||
| 432 | return error; | ||
| 433 | error = xfs_qmops_get(mp, args); | ||
| 434 | if (error) | ||
| 435 | return error; | ||
| 436 | |||
| 437 | if (args->flags & XFSMNT_QUIET) | ||
| 438 | flags |= XFS_MFSI_QUIET; | ||
| 439 | |||
| 440 | /* | ||
| 441 | * Open real time and log devices - order is important. | ||
| 442 | */ | ||
| 443 | if (args->logname[0]) { | ||
| 444 | error = xfs_blkdev_get(mp, args->logname, &logdev); | ||
| 445 | if (error) | ||
| 446 | return error; | ||
| 447 | } | ||
| 448 | if (args->rtname[0]) { | ||
| 449 | error = xfs_blkdev_get(mp, args->rtname, &rtdev); | ||
| 450 | if (error) { | ||
| 451 | xfs_blkdev_put(logdev); | ||
| 452 | return error; | ||
| 453 | } | ||
| 454 | |||
| 455 | if (rtdev == ddev || rtdev == logdev) { | ||
| 456 | cmn_err(CE_WARN, | ||
| 457 | "XFS: Cannot mount filesystem with identical rtdev and ddev/logdev."); | ||
| 458 | xfs_blkdev_put(logdev); | ||
| 459 | xfs_blkdev_put(rtdev); | ||
| 460 | return EINVAL; | ||
| 461 | } | ||
| 462 | } | ||
| 463 | |||
| 464 | /* | ||
| 465 | * Setup xfs_mount buffer target pointers | ||
| 466 | */ | ||
| 467 | error = ENOMEM; | ||
| 468 | mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0); | ||
| 469 | if (!mp->m_ddev_targp) { | ||
| 470 | xfs_blkdev_put(logdev); | ||
| 471 | xfs_blkdev_put(rtdev); | ||
| 472 | return error; | ||
| 473 | } | ||
| 474 | if (rtdev) { | ||
| 475 | mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1); | ||
| 476 | if (!mp->m_rtdev_targp) { | ||
| 477 | xfs_blkdev_put(logdev); | ||
| 478 | xfs_blkdev_put(rtdev); | ||
| 479 | goto error0; | ||
| 480 | } | ||
| 481 | } | ||
| 482 | mp->m_logdev_targp = (logdev && logdev != ddev) ? | ||
| 483 | xfs_alloc_buftarg(logdev, 1) : mp->m_ddev_targp; | ||
| 484 | if (!mp->m_logdev_targp) { | ||
| 485 | xfs_blkdev_put(logdev); | ||
| 486 | xfs_blkdev_put(rtdev); | ||
| 487 | goto error0; | ||
| 488 | } | ||
| 489 | |||
| 490 | /* | ||
| 491 | * Setup flags based on mount(2) options and then the superblock | ||
| 492 | */ | ||
| 493 | error = xfs_start_flags(args, mp); | ||
| 494 | if (error) | ||
| 495 | goto error1; | ||
| 496 | error = xfs_readsb(mp, flags); | ||
| 497 | if (error) | ||
| 498 | goto error1; | ||
| 499 | error = xfs_finish_flags(args, mp); | ||
| 500 | if (error) | ||
| 501 | goto error2; | ||
| 502 | |||
| 503 | /* | ||
| 504 | * Setup xfs_mount buffer target pointers based on superblock | ||
| 505 | */ | ||
| 506 | error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_blocksize, | ||
| 507 | mp->m_sb.sb_sectsize); | ||
| 508 | if (!error && logdev && logdev != ddev) { | ||
| 509 | unsigned int log_sector_size = BBSIZE; | ||
| 510 | |||
| 511 | if (xfs_sb_version_hassector(&mp->m_sb)) | ||
| 512 | log_sector_size = mp->m_sb.sb_logsectsize; | ||
| 513 | error = xfs_setsize_buftarg(mp->m_logdev_targp, | ||
| 514 | mp->m_sb.sb_blocksize, | ||
| 515 | log_sector_size); | ||
| 516 | } | ||
| 517 | if (!error && rtdev) | ||
| 518 | error = xfs_setsize_buftarg(mp->m_rtdev_targp, | ||
| 519 | mp->m_sb.sb_blocksize, | ||
| 520 | mp->m_sb.sb_sectsize); | ||
| 521 | if (error) | ||
| 522 | goto error2; | ||
| 523 | |||
| 524 | if (mp->m_flags & XFS_MOUNT_BARRIER) | ||
| 525 | xfs_mountfs_check_barriers(mp); | ||
| 526 | |||
| 527 | if ((error = xfs_filestream_mount(mp))) | ||
| 528 | goto error2; | ||
| 529 | |||
| 530 | error = xfs_mountfs(mp, flags); | ||
| 531 | if (error) | ||
| 532 | goto error2; | ||
| 533 | |||
| 534 | XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, args->mtpt, args->fsname); | ||
| 535 | |||
| 536 | return 0; | ||
| 537 | |||
| 538 | error2: | ||
| 539 | if (mp->m_sb_bp) | ||
| 540 | xfs_freesb(mp); | ||
| 541 | error1: | ||
| 542 | xfs_binval(mp->m_ddev_targp); | ||
| 543 | if (logdev && logdev != ddev) | ||
| 544 | xfs_binval(mp->m_logdev_targp); | ||
| 545 | if (rtdev) | ||
| 546 | xfs_binval(mp->m_rtdev_targp); | ||
| 547 | error0: | ||
| 548 | xfs_unmountfs_close(mp, credp); | ||
| 549 | xfs_qmops_put(mp); | ||
| 550 | xfs_dmops_put(mp); | ||
| 551 | return error; | ||
| 552 | } | ||
| 553 | |||
| 554 | int | ||
| 555 | xfs_unmount( | ||
| 556 | xfs_mount_t *mp, | ||
| 557 | int flags, | ||
| 558 | cred_t *credp) | ||
| 559 | { | ||
| 560 | xfs_inode_t *rip; | ||
| 561 | bhv_vnode_t *rvp; | ||
| 562 | int unmount_event_wanted = 0; | ||
| 563 | int unmount_event_flags = 0; | ||
| 564 | int xfs_unmountfs_needed = 0; | ||
| 565 | int error; | ||
| 566 | |||
| 567 | rip = mp->m_rootip; | ||
| 568 | rvp = XFS_ITOV(rip); | ||
| 569 | |||
| 570 | #ifdef HAVE_DMAPI | ||
| 571 | if (mp->m_flags & XFS_MOUNT_DMAPI) { | ||
| 572 | error = XFS_SEND_PREUNMOUNT(mp, | ||
| 573 | rip, DM_RIGHT_NULL, rip, DM_RIGHT_NULL, | ||
| 574 | NULL, NULL, 0, 0, | ||
| 575 | (mp->m_dmevmask & (1<<DM_EVENT_PREUNMOUNT))? | ||
| 576 | 0:DM_FLAGS_UNWANTED); | ||
| 577 | if (error) | ||
| 578 | return XFS_ERROR(error); | ||
| 579 | unmount_event_wanted = 1; | ||
| 580 | unmount_event_flags = (mp->m_dmevmask & (1<<DM_EVENT_UNMOUNT))? | ||
| 581 | 0 : DM_FLAGS_UNWANTED; | ||
| 582 | } | ||
| 583 | #endif | ||
| 584 | |||
| 585 | /* | ||
| 586 | * Blow away any referenced inode in the filestreams cache. | ||
| 587 | * This can and will cause log traffic as inodes go inactive | ||
| 588 | * here. | ||
| 589 | */ | ||
| 590 | xfs_filestream_unmount(mp); | ||
| 591 | |||
| 592 | XFS_bflush(mp->m_ddev_targp); | ||
| 593 | error = xfs_unmount_flush(mp, 0); | ||
| 594 | if (error) | ||
| 595 | goto out; | ||
| 596 | |||
| 597 | ASSERT(vn_count(rvp) == 1); | ||
| 598 | |||
| 599 | /* | ||
| 600 | * Drop the reference count | ||
| 601 | */ | ||
| 602 | IRELE(rip); | ||
| 603 | |||
| 604 | /* | ||
| 605 | * If we're forcing a shutdown, typically because of a media error, | ||
| 606 | * we want to make sure we invalidate dirty pages that belong to | ||
| 607 | * referenced vnodes as well. | ||
| 608 | */ | ||
| 609 | if (XFS_FORCED_SHUTDOWN(mp)) { | ||
| 610 | error = xfs_sync(mp, SYNC_WAIT | SYNC_CLOSE); | ||
| 611 | ASSERT(error != EFSCORRUPTED); | ||
| 612 | } | ||
| 613 | xfs_unmountfs_needed = 1; | ||
| 614 | |||
| 615 | out: | ||
| 616 | /* Send DMAPI event, if required. | ||
| 617 | * Then do xfs_unmountfs() if needed. | ||
| 618 | * Then return error (or zero). | ||
| 619 | */ | ||
| 620 | if (unmount_event_wanted) { | ||
| 621 | /* Note: mp structure must still exist for | ||
| 622 | * XFS_SEND_UNMOUNT() call. | ||
| 623 | */ | ||
| 624 | XFS_SEND_UNMOUNT(mp, error == 0 ? rip : NULL, | ||
| 625 | DM_RIGHT_NULL, 0, error, unmount_event_flags); | ||
| 626 | } | ||
| 627 | if (xfs_unmountfs_needed) { | ||
| 628 | /* | ||
| 629 | * Call common unmount function to flush to disk | ||
| 630 | * and free the super block buffer & mount structures. | ||
| 631 | */ | ||
| 632 | xfs_unmountfs(mp, credp); | ||
| 633 | xfs_qmops_put(mp); | ||
| 634 | xfs_dmops_put(mp); | ||
| 635 | kmem_free(mp, sizeof(xfs_mount_t)); | ||
| 636 | } | ||
| 637 | |||
| 638 | return XFS_ERROR(error); | ||
| 639 | } | ||
| 640 | |||
| 641 | STATIC void | 61 | STATIC void |
| 642 | xfs_quiesce_fs( | 62 | xfs_quiesce_fs( |
| 643 | xfs_mount_t *mp) | 63 | xfs_mount_t *mp) |
| @@ -694,30 +114,6 @@ xfs_attr_quiesce( | |||
| 694 | xfs_unmountfs_writesb(mp); | 114 | xfs_unmountfs_writesb(mp); |
| 695 | } | 115 | } |
| 696 | 116 | ||
| 697 | int | ||
| 698 | xfs_mntupdate( | ||
| 699 | struct xfs_mount *mp, | ||
| 700 | int *flags, | ||
| 701 | struct xfs_mount_args *args) | ||
| 702 | { | ||
| 703 | if (!(*flags & MS_RDONLY)) { /* rw/ro -> rw */ | ||
| 704 | if (mp->m_flags & XFS_MOUNT_RDONLY) | ||
| 705 | mp->m_flags &= ~XFS_MOUNT_RDONLY; | ||
| 706 | if (args->flags & XFSMNT_BARRIER) { | ||
| 707 | mp->m_flags |= XFS_MOUNT_BARRIER; | ||
| 708 | xfs_mountfs_check_barriers(mp); | ||
| 709 | } else { | ||
| 710 | mp->m_flags &= ~XFS_MOUNT_BARRIER; | ||
| 711 | } | ||
| 712 | } else if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { /* rw -> ro */ | ||
| 713 | xfs_filestream_flush(mp); | ||
| 714 | xfs_sync(mp, SYNC_DATA_QUIESCE); | ||
| 715 | xfs_attr_quiesce(mp); | ||
| 716 | mp->m_flags |= XFS_MOUNT_RDONLY; | ||
| 717 | } | ||
| 718 | return 0; | ||
| 719 | } | ||
| 720 | |||
| 721 | /* | 117 | /* |
| 722 | * xfs_unmount_flush implements a set of flush operations on special | 118 | * xfs_unmount_flush implements a set of flush operations on special |
| 723 | * inodes, which are needed as a separate set of operations so that | 119 | * inodes, which are needed as a separate set of operations so that |
| @@ -732,7 +128,6 @@ xfs_unmount_flush( | |||
| 732 | xfs_inode_t *rip = mp->m_rootip; | 128 | xfs_inode_t *rip = mp->m_rootip; |
| 733 | xfs_inode_t *rbmip; | 129 | xfs_inode_t *rbmip; |
| 734 | xfs_inode_t *rsumip = NULL; | 130 | xfs_inode_t *rsumip = NULL; |
| 735 | bhv_vnode_t *rvp = XFS_ITOV(rip); | ||
| 736 | int error; | 131 | int error; |
| 737 | 132 | ||
| 738 | xfs_ilock(rip, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); | 133 | xfs_ilock(rip, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); |
| @@ -750,7 +145,7 @@ xfs_unmount_flush( | |||
| 750 | if (error == EFSCORRUPTED) | 145 | if (error == EFSCORRUPTED) |
| 751 | goto fscorrupt_out; | 146 | goto fscorrupt_out; |
| 752 | 147 | ||
| 753 | ASSERT(vn_count(XFS_ITOV(rbmip)) == 1); | 148 | ASSERT(vn_count(VFS_I(rbmip)) == 1); |
| 754 | 149 | ||
| 755 | rsumip = mp->m_rsumip; | 150 | rsumip = mp->m_rsumip; |
| 756 | xfs_ilock(rsumip, XFS_ILOCK_EXCL); | 151 | xfs_ilock(rsumip, XFS_ILOCK_EXCL); |
| @@ -761,7 +156,7 @@ xfs_unmount_flush( | |||
| 761 | if (error == EFSCORRUPTED) | 156 | if (error == EFSCORRUPTED) |
| 762 | goto fscorrupt_out; | 157 | goto fscorrupt_out; |
| 763 | 158 | ||
| 764 | ASSERT(vn_count(XFS_ITOV(rsumip)) == 1); | 159 | ASSERT(vn_count(VFS_I(rsumip)) == 1); |
| 765 | } | 160 | } |
| 766 | 161 | ||
| 767 | /* | 162 | /* |
| @@ -771,7 +166,7 @@ xfs_unmount_flush( | |||
| 771 | if (error == EFSCORRUPTED) | 166 | if (error == EFSCORRUPTED) |
| 772 | goto fscorrupt_out2; | 167 | goto fscorrupt_out2; |
| 773 | 168 | ||
| 774 | if (vn_count(rvp) != 1 && !relocation) { | 169 | if (vn_count(VFS_I(rip)) != 1 && !relocation) { |
| 775 | xfs_iunlock(rip, XFS_ILOCK_EXCL); | 170 | xfs_iunlock(rip, XFS_ILOCK_EXCL); |
| 776 | return XFS_ERROR(EBUSY); | 171 | return XFS_ERROR(EBUSY); |
| 777 | } | 172 | } |
| @@ -888,7 +283,7 @@ xfs_sync_inodes( | |||
| 888 | int *bypassed) | 283 | int *bypassed) |
| 889 | { | 284 | { |
| 890 | xfs_inode_t *ip = NULL; | 285 | xfs_inode_t *ip = NULL; |
| 891 | bhv_vnode_t *vp = NULL; | 286 | struct inode *vp = NULL; |
| 892 | int error; | 287 | int error; |
| 893 | int last_error; | 288 | int last_error; |
| 894 | uint64_t fflag; | 289 | uint64_t fflag; |
| @@ -1008,7 +403,7 @@ xfs_sync_inodes( | |||
| 1008 | continue; | 403 | continue; |
| 1009 | } | 404 | } |
| 1010 | 405 | ||
| 1011 | vp = XFS_ITOV_NULL(ip); | 406 | vp = VFS_I(ip); |
| 1012 | 407 | ||
| 1013 | /* | 408 | /* |
| 1014 | * If the vnode is gone then this is being torn down, | 409 | * If the vnode is gone then this is being torn down, |
| @@ -1048,7 +443,7 @@ xfs_sync_inodes( | |||
| 1048 | 443 | ||
| 1049 | if (XFS_FORCED_SHUTDOWN(mp) && !(flags & SYNC_CLOSE)) { | 444 | if (XFS_FORCED_SHUTDOWN(mp) && !(flags & SYNC_CLOSE)) { |
| 1050 | XFS_MOUNT_IUNLOCK(mp); | 445 | XFS_MOUNT_IUNLOCK(mp); |
| 1051 | kmem_free(ipointer, sizeof(xfs_iptr_t)); | 446 | kmem_free(ipointer); |
| 1052 | return 0; | 447 | return 0; |
| 1053 | } | 448 | } |
| 1054 | 449 | ||
| @@ -1083,7 +478,7 @@ xfs_sync_inodes( | |||
| 1083 | IPOINTER_INSERT(ip, mp); | 478 | IPOINTER_INSERT(ip, mp); |
| 1084 | xfs_ilock(ip, lock_flags); | 479 | xfs_ilock(ip, lock_flags); |
| 1085 | 480 | ||
| 1086 | ASSERT(vp == XFS_ITOV(ip)); | 481 | ASSERT(vp == VFS_I(ip)); |
| 1087 | ASSERT(ip->i_mount == mp); | 482 | ASSERT(ip->i_mount == mp); |
| 1088 | 483 | ||
| 1089 | vnode_refed = B_TRUE; | 484 | vnode_refed = B_TRUE; |
| @@ -1194,7 +589,7 @@ xfs_sync_inodes( | |||
| 1194 | } | 589 | } |
| 1195 | XFS_MOUNT_IUNLOCK(mp); | 590 | XFS_MOUNT_IUNLOCK(mp); |
| 1196 | ASSERT(ipointer_in == B_FALSE); | 591 | ASSERT(ipointer_in == B_FALSE); |
| 1197 | kmem_free(ipointer, sizeof(xfs_iptr_t)); | 592 | kmem_free(ipointer); |
| 1198 | return XFS_ERROR(error); | 593 | return XFS_ERROR(error); |
| 1199 | } | 594 | } |
| 1200 | 595 | ||
| @@ -1224,7 +619,7 @@ xfs_sync_inodes( | |||
| 1224 | 619 | ||
| 1225 | ASSERT(ipointer_in == B_FALSE); | 620 | ASSERT(ipointer_in == B_FALSE); |
| 1226 | 621 | ||
| 1227 | kmem_free(ipointer, sizeof(xfs_iptr_t)); | 622 | kmem_free(ipointer); |
| 1228 | return XFS_ERROR(last_error); | 623 | return XFS_ERROR(last_error); |
| 1229 | } | 624 | } |
| 1230 | 625 | ||
diff --git a/fs/xfs/xfs_vfsops.h b/fs/xfs/xfs_vfsops.h index 1688817c55ed..a74b05087da4 100644 --- a/fs/xfs/xfs_vfsops.h +++ b/fs/xfs/xfs_vfsops.h | |||
| @@ -8,11 +8,6 @@ struct kstatfs; | |||
| 8 | struct xfs_mount; | 8 | struct xfs_mount; |
| 9 | struct xfs_mount_args; | 9 | struct xfs_mount_args; |
| 10 | 10 | ||
| 11 | int xfs_mount(struct xfs_mount *mp, struct xfs_mount_args *args, | ||
| 12 | struct cred *credp); | ||
| 13 | int xfs_unmount(struct xfs_mount *mp, int flags, struct cred *credp); | ||
| 14 | int xfs_mntupdate(struct xfs_mount *mp, int *flags, | ||
| 15 | struct xfs_mount_args *args); | ||
| 16 | int xfs_sync(struct xfs_mount *mp, int flags); | 11 | int xfs_sync(struct xfs_mount *mp, int flags); |
| 17 | void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, | 12 | void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, |
| 18 | int lnnum); | 13 | int lnnum); |
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index e475e3717eb3..8b6812f66a15 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c | |||
| @@ -75,26 +75,23 @@ xfs_open( | |||
| 75 | return 0; | 75 | return 0; |
| 76 | } | 76 | } |
| 77 | 77 | ||
| 78 | /* | ||
| 79 | * xfs_setattr | ||
| 80 | */ | ||
| 81 | int | 78 | int |
| 82 | xfs_setattr( | 79 | xfs_setattr( |
| 83 | xfs_inode_t *ip, | 80 | struct xfs_inode *ip, |
| 84 | bhv_vattr_t *vap, | 81 | struct iattr *iattr, |
| 85 | int flags, | 82 | int flags, |
| 86 | cred_t *credp) | 83 | cred_t *credp) |
| 87 | { | 84 | { |
| 88 | xfs_mount_t *mp = ip->i_mount; | 85 | xfs_mount_t *mp = ip->i_mount; |
| 86 | struct inode *inode = VFS_I(ip); | ||
| 87 | int mask = iattr->ia_valid; | ||
| 89 | xfs_trans_t *tp; | 88 | xfs_trans_t *tp; |
| 90 | int mask; | ||
| 91 | int code; | 89 | int code; |
| 92 | uint lock_flags; | 90 | uint lock_flags; |
| 93 | uint commit_flags=0; | 91 | uint commit_flags=0; |
| 94 | uid_t uid=0, iuid=0; | 92 | uid_t uid=0, iuid=0; |
| 95 | gid_t gid=0, igid=0; | 93 | gid_t gid=0, igid=0; |
| 96 | int timeflags = 0; | 94 | int timeflags = 0; |
| 97 | xfs_prid_t projid=0, iprojid=0; | ||
| 98 | struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2; | 95 | struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2; |
| 99 | int file_owner; | 96 | int file_owner; |
| 100 | int need_iolock = 1; | 97 | int need_iolock = 1; |
| @@ -104,30 +101,9 @@ xfs_setattr( | |||
| 104 | if (mp->m_flags & XFS_MOUNT_RDONLY) | 101 | if (mp->m_flags & XFS_MOUNT_RDONLY) |
| 105 | return XFS_ERROR(EROFS); | 102 | return XFS_ERROR(EROFS); |
| 106 | 103 | ||
| 107 | /* | ||
| 108 | * Cannot set certain attributes. | ||
| 109 | */ | ||
| 110 | mask = vap->va_mask; | ||
| 111 | if (mask & XFS_AT_NOSET) { | ||
| 112 | return XFS_ERROR(EINVAL); | ||
| 113 | } | ||
| 114 | |||
| 115 | if (XFS_FORCED_SHUTDOWN(mp)) | 104 | if (XFS_FORCED_SHUTDOWN(mp)) |
| 116 | return XFS_ERROR(EIO); | 105 | return XFS_ERROR(EIO); |
| 117 | 106 | ||
| 118 | /* | ||
| 119 | * Timestamps do not need to be logged and hence do not | ||
| 120 | * need to be done within a transaction. | ||
| 121 | */ | ||
| 122 | if (mask & XFS_AT_UPDTIMES) { | ||
| 123 | ASSERT((mask & ~XFS_AT_UPDTIMES) == 0); | ||
| 124 | timeflags = ((mask & XFS_AT_UPDATIME) ? XFS_ICHGTIME_ACC : 0) | | ||
| 125 | ((mask & XFS_AT_UPDCTIME) ? XFS_ICHGTIME_CHG : 0) | | ||
| 126 | ((mask & XFS_AT_UPDMTIME) ? XFS_ICHGTIME_MOD : 0); | ||
| 127 | xfs_ichgtime(ip, timeflags); | ||
| 128 | return 0; | ||
| 129 | } | ||
| 130 | |||
| 131 | olddquot1 = olddquot2 = NULL; | 107 | olddquot1 = olddquot2 = NULL; |
| 132 | udqp = gdqp = NULL; | 108 | udqp = gdqp = NULL; |
| 133 | 109 | ||
| @@ -139,28 +115,22 @@ xfs_setattr( | |||
| 139 | * If the IDs do change before we take the ilock, we're covered | 115 | * If the IDs do change before we take the ilock, we're covered |
| 140 | * because the i_*dquot fields will get updated anyway. | 116 | * because the i_*dquot fields will get updated anyway. |
| 141 | */ | 117 | */ |
| 142 | if (XFS_IS_QUOTA_ON(mp) && | 118 | if (XFS_IS_QUOTA_ON(mp) && (mask & (ATTR_UID|ATTR_GID))) { |
| 143 | (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID))) { | ||
| 144 | uint qflags = 0; | 119 | uint qflags = 0; |
| 145 | 120 | ||
| 146 | if ((mask & XFS_AT_UID) && XFS_IS_UQUOTA_ON(mp)) { | 121 | if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) { |
| 147 | uid = vap->va_uid; | 122 | uid = iattr->ia_uid; |
| 148 | qflags |= XFS_QMOPT_UQUOTA; | 123 | qflags |= XFS_QMOPT_UQUOTA; |
| 149 | } else { | 124 | } else { |
| 150 | uid = ip->i_d.di_uid; | 125 | uid = ip->i_d.di_uid; |
| 151 | } | 126 | } |
| 152 | if ((mask & XFS_AT_GID) && XFS_IS_GQUOTA_ON(mp)) { | 127 | if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) { |
| 153 | gid = vap->va_gid; | 128 | gid = iattr->ia_gid; |
| 154 | qflags |= XFS_QMOPT_GQUOTA; | 129 | qflags |= XFS_QMOPT_GQUOTA; |
| 155 | } else { | 130 | } else { |
| 156 | gid = ip->i_d.di_gid; | 131 | gid = ip->i_d.di_gid; |
| 157 | } | 132 | } |
| 158 | if ((mask & XFS_AT_PROJID) && XFS_IS_PQUOTA_ON(mp)) { | 133 | |
| 159 | projid = vap->va_projid; | ||
| 160 | qflags |= XFS_QMOPT_PQUOTA; | ||
| 161 | } else { | ||
| 162 | projid = ip->i_d.di_projid; | ||
| 163 | } | ||
| 164 | /* | 134 | /* |
| 165 | * We take a reference when we initialize udqp and gdqp, | 135 | * We take a reference when we initialize udqp and gdqp, |
| 166 | * so it is important that we never blindly double trip on | 136 | * so it is important that we never blindly double trip on |
| @@ -168,8 +138,8 @@ xfs_setattr( | |||
| 168 | */ | 138 | */ |
| 169 | ASSERT(udqp == NULL); | 139 | ASSERT(udqp == NULL); |
| 170 | ASSERT(gdqp == NULL); | 140 | ASSERT(gdqp == NULL); |
| 171 | code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, projid, qflags, | 141 | code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, ip->i_d.di_projid, |
| 172 | &udqp, &gdqp); | 142 | qflags, &udqp, &gdqp); |
| 173 | if (code) | 143 | if (code) |
| 174 | return code; | 144 | return code; |
| 175 | } | 145 | } |
| @@ -180,10 +150,10 @@ xfs_setattr( | |||
| 180 | */ | 150 | */ |
| 181 | tp = NULL; | 151 | tp = NULL; |
| 182 | lock_flags = XFS_ILOCK_EXCL; | 152 | lock_flags = XFS_ILOCK_EXCL; |
| 183 | if (flags & ATTR_NOLOCK) | 153 | if (flags & XFS_ATTR_NOLOCK) |
| 184 | need_iolock = 0; | 154 | need_iolock = 0; |
| 185 | if (!(mask & XFS_AT_SIZE)) { | 155 | if (!(mask & ATTR_SIZE)) { |
| 186 | if ((mask != (XFS_AT_CTIME|XFS_AT_ATIME|XFS_AT_MTIME)) || | 156 | if ((mask != (ATTR_CTIME|ATTR_ATIME|ATTR_MTIME)) || |
| 187 | (mp->m_flags & XFS_MOUNT_WSYNC)) { | 157 | (mp->m_flags & XFS_MOUNT_WSYNC)) { |
| 188 | tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); | 158 | tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); |
| 189 | commit_flags = 0; | 159 | commit_flags = 0; |
| @@ -196,10 +166,10 @@ xfs_setattr( | |||
| 196 | } | 166 | } |
| 197 | } else { | 167 | } else { |
| 198 | if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) && | 168 | if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) && |
| 199 | !(flags & ATTR_DMI)) { | 169 | !(flags & XFS_ATTR_DMI)) { |
| 200 | int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR; | 170 | int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR; |
| 201 | code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, ip, | 171 | code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, ip, |
| 202 | vap->va_size, 0, dmflags, NULL); | 172 | iattr->ia_size, 0, dmflags, NULL); |
| 203 | if (code) { | 173 | if (code) { |
| 204 | lock_flags = 0; | 174 | lock_flags = 0; |
| 205 | goto error_return; | 175 | goto error_return; |
| @@ -212,16 +182,14 @@ xfs_setattr( | |||
| 212 | xfs_ilock(ip, lock_flags); | 182 | xfs_ilock(ip, lock_flags); |
| 213 | 183 | ||
| 214 | /* boolean: are we the file owner? */ | 184 | /* boolean: are we the file owner? */ |
| 215 | file_owner = (current_fsuid(credp) == ip->i_d.di_uid); | 185 | file_owner = (current_fsuid() == ip->i_d.di_uid); |
| 216 | 186 | ||
| 217 | /* | 187 | /* |
| 218 | * Change various properties of a file. | 188 | * Change various properties of a file. |
| 219 | * Only the owner or users with CAP_FOWNER | 189 | * Only the owner or users with CAP_FOWNER |
| 220 | * capability may do these things. | 190 | * capability may do these things. |
| 221 | */ | 191 | */ |
| 222 | if (mask & | 192 | if (mask & (ATTR_MODE|ATTR_UID|ATTR_GID)) { |
| 223 | (XFS_AT_MODE|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_UID| | ||
| 224 | XFS_AT_GID|XFS_AT_PROJID)) { | ||
| 225 | /* | 193 | /* |
| 226 | * CAP_FOWNER overrides the following restrictions: | 194 | * CAP_FOWNER overrides the following restrictions: |
| 227 | * | 195 | * |
| @@ -245,21 +213,21 @@ xfs_setattr( | |||
| 245 | * IDs of the calling process shall match the group owner of | 213 | * IDs of the calling process shall match the group owner of |
| 246 | * the file when setting the set-group-ID bit on that file | 214 | * the file when setting the set-group-ID bit on that file |
| 247 | */ | 215 | */ |
| 248 | if (mask & XFS_AT_MODE) { | 216 | if (mask & ATTR_MODE) { |
| 249 | mode_t m = 0; | 217 | mode_t m = 0; |
| 250 | 218 | ||
| 251 | if ((vap->va_mode & S_ISUID) && !file_owner) | 219 | if ((iattr->ia_mode & S_ISUID) && !file_owner) |
| 252 | m |= S_ISUID; | 220 | m |= S_ISUID; |
| 253 | if ((vap->va_mode & S_ISGID) && | 221 | if ((iattr->ia_mode & S_ISGID) && |
| 254 | !in_group_p((gid_t)ip->i_d.di_gid)) | 222 | !in_group_p((gid_t)ip->i_d.di_gid)) |
| 255 | m |= S_ISGID; | 223 | m |= S_ISGID; |
| 256 | #if 0 | 224 | #if 0 |
| 257 | /* Linux allows this, Irix doesn't. */ | 225 | /* Linux allows this, Irix doesn't. */ |
| 258 | if ((vap->va_mode & S_ISVTX) && !S_ISDIR(ip->i_d.di_mode)) | 226 | if ((iattr->ia_mode & S_ISVTX) && !S_ISDIR(ip->i_d.di_mode)) |
| 259 | m |= S_ISVTX; | 227 | m |= S_ISVTX; |
| 260 | #endif | 228 | #endif |
| 261 | if (m && !capable(CAP_FSETID)) | 229 | if (m && !capable(CAP_FSETID)) |
| 262 | vap->va_mode &= ~m; | 230 | iattr->ia_mode &= ~m; |
| 263 | } | 231 | } |
| 264 | } | 232 | } |
| 265 | 233 | ||
| @@ -270,7 +238,7 @@ xfs_setattr( | |||
| 270 | * and can change the group id only to a group of which he | 238 | * and can change the group id only to a group of which he |
| 271 | * or she is a member. | 239 | * or she is a member. |
| 272 | */ | 240 | */ |
| 273 | if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) { | 241 | if (mask & (ATTR_UID|ATTR_GID)) { |
| 274 | /* | 242 | /* |
| 275 | * These IDs could have changed since we last looked at them. | 243 | * These IDs could have changed since we last looked at them. |
| 276 | * But, we're assured that if the ownership did change | 244 | * But, we're assured that if the ownership did change |
| @@ -278,12 +246,9 @@ xfs_setattr( | |||
| 278 | * would have changed also. | 246 | * would have changed also. |
| 279 | */ | 247 | */ |
| 280 | iuid = ip->i_d.di_uid; | 248 | iuid = ip->i_d.di_uid; |
| 281 | iprojid = ip->i_d.di_projid; | ||
| 282 | igid = ip->i_d.di_gid; | 249 | igid = ip->i_d.di_gid; |
| 283 | gid = (mask & XFS_AT_GID) ? vap->va_gid : igid; | 250 | gid = (mask & ATTR_GID) ? iattr->ia_gid : igid; |
| 284 | uid = (mask & XFS_AT_UID) ? vap->va_uid : iuid; | 251 | uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid; |
| 285 | projid = (mask & XFS_AT_PROJID) ? (xfs_prid_t)vap->va_projid : | ||
| 286 | iprojid; | ||
| 287 | 252 | ||
| 288 | /* | 253 | /* |
| 289 | * CAP_CHOWN overrides the following restrictions: | 254 | * CAP_CHOWN overrides the following restrictions: |
| @@ -303,11 +268,10 @@ xfs_setattr( | |||
| 303 | goto error_return; | 268 | goto error_return; |
| 304 | } | 269 | } |
| 305 | /* | 270 | /* |
| 306 | * Do a quota reservation only if uid/projid/gid is actually | 271 | * Do a quota reservation only if uid/gid is actually |
| 307 | * going to change. | 272 | * going to change. |
| 308 | */ | 273 | */ |
| 309 | if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) || | 274 | if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) || |
| 310 | (XFS_IS_PQUOTA_ON(mp) && iprojid != projid) || | ||
| 311 | (XFS_IS_GQUOTA_ON(mp) && igid != gid)) { | 275 | (XFS_IS_GQUOTA_ON(mp) && igid != gid)) { |
| 312 | ASSERT(tp); | 276 | ASSERT(tp); |
| 313 | code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp, | 277 | code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp, |
| @@ -321,13 +285,13 @@ xfs_setattr( | |||
| 321 | /* | 285 | /* |
| 322 | * Truncate file. Must have write permission and not be a directory. | 286 | * Truncate file. Must have write permission and not be a directory. |
| 323 | */ | 287 | */ |
| 324 | if (mask & XFS_AT_SIZE) { | 288 | if (mask & ATTR_SIZE) { |
| 325 | /* Short circuit the truncate case for zero length files */ | 289 | /* Short circuit the truncate case for zero length files */ |
| 326 | if ((vap->va_size == 0) && | 290 | if (iattr->ia_size == 0 && |
| 327 | (ip->i_size == 0) && (ip->i_d.di_nextents == 0)) { | 291 | ip->i_size == 0 && ip->i_d.di_nextents == 0) { |
| 328 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | 292 | xfs_iunlock(ip, XFS_ILOCK_EXCL); |
| 329 | lock_flags &= ~XFS_ILOCK_EXCL; | 293 | lock_flags &= ~XFS_ILOCK_EXCL; |
| 330 | if (mask & XFS_AT_CTIME) | 294 | if (mask & ATTR_CTIME) |
| 331 | xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); | 295 | xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); |
| 332 | code = 0; | 296 | code = 0; |
| 333 | goto error_return; | 297 | goto error_return; |
| @@ -350,9 +314,9 @@ xfs_setattr( | |||
| 350 | /* | 314 | /* |
| 351 | * Change file access or modified times. | 315 | * Change file access or modified times. |
| 352 | */ | 316 | */ |
| 353 | if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) { | 317 | if (mask & (ATTR_ATIME|ATTR_MTIME)) { |
| 354 | if (!file_owner) { | 318 | if (!file_owner) { |
| 355 | if ((flags & ATTR_UTIME) && | 319 | if ((mask & (ATTR_MTIME_SET|ATTR_ATIME_SET)) && |
| 356 | !capable(CAP_FOWNER)) { | 320 | !capable(CAP_FOWNER)) { |
| 357 | code = XFS_ERROR(EPERM); | 321 | code = XFS_ERROR(EPERM); |
| 358 | goto error_return; | 322 | goto error_return; |
| @@ -361,90 +325,23 @@ xfs_setattr( | |||
| 361 | } | 325 | } |
| 362 | 326 | ||
| 363 | /* | 327 | /* |
| 364 | * Change extent size or realtime flag. | ||
| 365 | */ | ||
| 366 | if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) { | ||
| 367 | /* | ||
| 368 | * Can't change extent size if any extents are allocated. | ||
| 369 | */ | ||
| 370 | if (ip->i_d.di_nextents && (mask & XFS_AT_EXTSIZE) && | ||
| 371 | ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != | ||
| 372 | vap->va_extsize) ) { | ||
| 373 | code = XFS_ERROR(EINVAL); /* EFBIG? */ | ||
| 374 | goto error_return; | ||
| 375 | } | ||
| 376 | |||
| 377 | /* | ||
| 378 | * Can't change realtime flag if any extents are allocated. | ||
| 379 | */ | ||
| 380 | if ((ip->i_d.di_nextents || ip->i_delayed_blks) && | ||
| 381 | (mask & XFS_AT_XFLAGS) && | ||
| 382 | (XFS_IS_REALTIME_INODE(ip)) != | ||
| 383 | (vap->va_xflags & XFS_XFLAG_REALTIME)) { | ||
| 384 | code = XFS_ERROR(EINVAL); /* EFBIG? */ | ||
| 385 | goto error_return; | ||
| 386 | } | ||
| 387 | /* | ||
| 388 | * Extent size must be a multiple of the appropriate block | ||
| 389 | * size, if set at all. | ||
| 390 | */ | ||
| 391 | if ((mask & XFS_AT_EXTSIZE) && vap->va_extsize != 0) { | ||
| 392 | xfs_extlen_t size; | ||
| 393 | |||
| 394 | if (XFS_IS_REALTIME_INODE(ip) || | ||
| 395 | ((mask & XFS_AT_XFLAGS) && | ||
| 396 | (vap->va_xflags & XFS_XFLAG_REALTIME))) { | ||
| 397 | size = mp->m_sb.sb_rextsize << | ||
| 398 | mp->m_sb.sb_blocklog; | ||
| 399 | } else { | ||
| 400 | size = mp->m_sb.sb_blocksize; | ||
| 401 | } | ||
| 402 | if (vap->va_extsize % size) { | ||
| 403 | code = XFS_ERROR(EINVAL); | ||
| 404 | goto error_return; | ||
| 405 | } | ||
| 406 | } | ||
| 407 | /* | ||
| 408 | * If realtime flag is set then must have realtime data. | ||
| 409 | */ | ||
| 410 | if ((mask & XFS_AT_XFLAGS) && | ||
| 411 | (vap->va_xflags & XFS_XFLAG_REALTIME)) { | ||
| 412 | if ((mp->m_sb.sb_rblocks == 0) || | ||
| 413 | (mp->m_sb.sb_rextsize == 0) || | ||
| 414 | (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) { | ||
| 415 | code = XFS_ERROR(EINVAL); | ||
| 416 | goto error_return; | ||
| 417 | } | ||
| 418 | } | ||
| 419 | |||
| 420 | /* | ||
| 421 | * Can't modify an immutable/append-only file unless | ||
| 422 | * we have appropriate permission. | ||
| 423 | */ | ||
| 424 | if ((mask & XFS_AT_XFLAGS) && | ||
| 425 | (ip->i_d.di_flags & | ||
| 426 | (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) || | ||
| 427 | (vap->va_xflags & | ||
| 428 | (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) && | ||
| 429 | !capable(CAP_LINUX_IMMUTABLE)) { | ||
| 430 | code = XFS_ERROR(EPERM); | ||
| 431 | goto error_return; | ||
| 432 | } | ||
| 433 | } | ||
| 434 | |||
| 435 | /* | ||
| 436 | * Now we can make the changes. Before we join the inode | 328 | * Now we can make the changes. Before we join the inode |
| 437 | * to the transaction, if XFS_AT_SIZE is set then take care of | 329 | * to the transaction, if ATTR_SIZE is set then take care of |
| 438 | * the part of the truncation that must be done without the | 330 | * the part of the truncation that must be done without the |
| 439 | * inode lock. This needs to be done before joining the inode | 331 | * inode lock. This needs to be done before joining the inode |
| 440 | * to the transaction, because the inode cannot be unlocked | 332 | * to the transaction, because the inode cannot be unlocked |
| 441 | * once it is a part of the transaction. | 333 | * once it is a part of the transaction. |
| 442 | */ | 334 | */ |
| 443 | if (mask & XFS_AT_SIZE) { | 335 | if (mask & ATTR_SIZE) { |
| 444 | code = 0; | 336 | code = 0; |
| 445 | if ((vap->va_size > ip->i_size) && | 337 | if (iattr->ia_size > ip->i_size) { |
| 446 | (flags & ATTR_NOSIZETOK) == 0) { | 338 | /* |
| 447 | code = xfs_igrow_start(ip, vap->va_size, credp); | 339 | * Do the first part of growing a file: zero any data |
| 340 | * in the last block that is beyond the old EOF. We | ||
| 341 | * need to do this before the inode is joined to the | ||
| 342 | * transaction to modify the i_size. | ||
| 343 | */ | ||
| 344 | code = xfs_zero_eof(ip, iattr->ia_size, ip->i_size); | ||
| 448 | } | 345 | } |
| 449 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | 346 | xfs_iunlock(ip, XFS_ILOCK_EXCL); |
| 450 | 347 | ||
| @@ -461,10 +358,10 @@ xfs_setattr( | |||
| 461 | * not within the range we care about here. | 358 | * not within the range we care about here. |
| 462 | */ | 359 | */ |
| 463 | if (!code && | 360 | if (!code && |
| 464 | (ip->i_size != ip->i_d.di_size) && | 361 | ip->i_size != ip->i_d.di_size && |
| 465 | (vap->va_size > ip->i_d.di_size)) { | 362 | iattr->ia_size > ip->i_d.di_size) { |
| 466 | code = xfs_flush_pages(ip, | 363 | code = xfs_flush_pages(ip, |
| 467 | ip->i_d.di_size, vap->va_size, | 364 | ip->i_d.di_size, iattr->ia_size, |
| 468 | XFS_B_ASYNC, FI_NONE); | 365 | XFS_B_ASYNC, FI_NONE); |
| 469 | } | 366 | } |
| 470 | 367 | ||
| @@ -472,7 +369,7 @@ xfs_setattr( | |||
| 472 | vn_iowait(ip); | 369 | vn_iowait(ip); |
| 473 | 370 | ||
| 474 | if (!code) | 371 | if (!code) |
| 475 | code = xfs_itruncate_data(ip, vap->va_size); | 372 | code = xfs_itruncate_data(ip, iattr->ia_size); |
| 476 | if (code) { | 373 | if (code) { |
| 477 | ASSERT(tp == NULL); | 374 | ASSERT(tp == NULL); |
| 478 | lock_flags &= ~XFS_ILOCK_EXCL; | 375 | lock_flags &= ~XFS_ILOCK_EXCL; |
| @@ -501,28 +398,30 @@ xfs_setattr( | |||
| 501 | /* | 398 | /* |
| 502 | * Truncate file. Must have write permission and not be a directory. | 399 | * Truncate file. Must have write permission and not be a directory. |
| 503 | */ | 400 | */ |
| 504 | if (mask & XFS_AT_SIZE) { | 401 | if (mask & ATTR_SIZE) { |
| 505 | /* | 402 | /* |
| 506 | * Only change the c/mtime if we are changing the size | 403 | * Only change the c/mtime if we are changing the size |
| 507 | * or we are explicitly asked to change it. This handles | 404 | * or we are explicitly asked to change it. This handles |
| 508 | * the semantic difference between truncate() and ftruncate() | 405 | * the semantic difference between truncate() and ftruncate() |
| 509 | * as implemented in the VFS. | 406 | * as implemented in the VFS. |
| 510 | */ | 407 | */ |
| 511 | if (vap->va_size != ip->i_size || (mask & XFS_AT_CTIME)) | 408 | if (iattr->ia_size != ip->i_size || (mask & ATTR_CTIME)) |
| 512 | timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; | 409 | timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; |
| 513 | 410 | ||
| 514 | if (vap->va_size > ip->i_size) { | 411 | if (iattr->ia_size > ip->i_size) { |
| 515 | xfs_igrow_finish(tp, ip, vap->va_size, | 412 | ip->i_d.di_size = iattr->ia_size; |
| 516 | !(flags & ATTR_DMI)); | 413 | ip->i_size = iattr->ia_size; |
| 517 | } else if ((vap->va_size <= ip->i_size) || | 414 | if (!(flags & XFS_ATTR_DMI)) |
| 518 | ((vap->va_size == 0) && ip->i_d.di_nextents)) { | 415 | xfs_ichgtime(ip, XFS_ICHGTIME_CHG); |
| 416 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); | ||
| 417 | } else if (iattr->ia_size <= ip->i_size || | ||
| 418 | (iattr->ia_size == 0 && ip->i_d.di_nextents)) { | ||
| 519 | /* | 419 | /* |
| 520 | * signal a sync transaction unless | 420 | * signal a sync transaction unless |
| 521 | * we're truncating an already unlinked | 421 | * we're truncating an already unlinked |
| 522 | * file on a wsync filesystem | 422 | * file on a wsync filesystem |
| 523 | */ | 423 | */ |
| 524 | code = xfs_itruncate_finish(&tp, ip, | 424 | code = xfs_itruncate_finish(&tp, ip, iattr->ia_size, |
| 525 | (xfs_fsize_t)vap->va_size, | ||
| 526 | XFS_DATA_FORK, | 425 | XFS_DATA_FORK, |
| 527 | ((ip->i_d.di_nlink != 0 || | 426 | ((ip->i_d.di_nlink != 0 || |
| 528 | !(mp->m_flags & XFS_MOUNT_WSYNC)) | 427 | !(mp->m_flags & XFS_MOUNT_WSYNC)) |
| @@ -544,9 +443,12 @@ xfs_setattr( | |||
| 544 | /* | 443 | /* |
| 545 | * Change file access modes. | 444 | * Change file access modes. |
| 546 | */ | 445 | */ |
| 547 | if (mask & XFS_AT_MODE) { | 446 | if (mask & ATTR_MODE) { |
| 548 | ip->i_d.di_mode &= S_IFMT; | 447 | ip->i_d.di_mode &= S_IFMT; |
| 549 | ip->i_d.di_mode |= vap->va_mode & ~S_IFMT; | 448 | ip->i_d.di_mode |= iattr->ia_mode & ~S_IFMT; |
| 449 | |||
| 450 | inode->i_mode &= S_IFMT; | ||
| 451 | inode->i_mode |= iattr->ia_mode & ~S_IFMT; | ||
| 550 | 452 | ||
| 551 | xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE); | 453 | xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE); |
| 552 | timeflags |= XFS_ICHGTIME_CHG; | 454 | timeflags |= XFS_ICHGTIME_CHG; |
| @@ -559,7 +461,7 @@ xfs_setattr( | |||
| 559 | * and can change the group id only to a group of which he | 461 | * and can change the group id only to a group of which he |
| 560 | * or she is a member. | 462 | * or she is a member. |
| 561 | */ | 463 | */ |
| 562 | if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) { | 464 | if (mask & (ATTR_UID|ATTR_GID)) { |
| 563 | /* | 465 | /* |
| 564 | * CAP_FSETID overrides the following restrictions: | 466 | * CAP_FSETID overrides the following restrictions: |
| 565 | * | 467 | * |
| @@ -577,39 +479,24 @@ xfs_setattr( | |||
| 577 | */ | 479 | */ |
| 578 | if (iuid != uid) { | 480 | if (iuid != uid) { |
| 579 | if (XFS_IS_UQUOTA_ON(mp)) { | 481 | if (XFS_IS_UQUOTA_ON(mp)) { |
| 580 | ASSERT(mask & XFS_AT_UID); | 482 | ASSERT(mask & ATTR_UID); |
| 581 | ASSERT(udqp); | 483 | ASSERT(udqp); |
| 582 | olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip, | 484 | olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip, |
| 583 | &ip->i_udquot, udqp); | 485 | &ip->i_udquot, udqp); |
| 584 | } | 486 | } |
| 585 | ip->i_d.di_uid = uid; | 487 | ip->i_d.di_uid = uid; |
| 488 | inode->i_uid = uid; | ||
| 586 | } | 489 | } |
| 587 | if (igid != gid) { | 490 | if (igid != gid) { |
| 588 | if (XFS_IS_GQUOTA_ON(mp)) { | 491 | if (XFS_IS_GQUOTA_ON(mp)) { |
| 589 | ASSERT(!XFS_IS_PQUOTA_ON(mp)); | 492 | ASSERT(!XFS_IS_PQUOTA_ON(mp)); |
| 590 | ASSERT(mask & XFS_AT_GID); | 493 | ASSERT(mask & ATTR_GID); |
| 591 | ASSERT(gdqp); | 494 | ASSERT(gdqp); |
| 592 | olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip, | 495 | olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip, |
| 593 | &ip->i_gdquot, gdqp); | 496 | &ip->i_gdquot, gdqp); |
| 594 | } | 497 | } |
| 595 | ip->i_d.di_gid = gid; | 498 | ip->i_d.di_gid = gid; |
| 596 | } | 499 | inode->i_gid = gid; |
| 597 | if (iprojid != projid) { | ||
| 598 | if (XFS_IS_PQUOTA_ON(mp)) { | ||
| 599 | ASSERT(!XFS_IS_GQUOTA_ON(mp)); | ||
| 600 | ASSERT(mask & XFS_AT_PROJID); | ||
| 601 | ASSERT(gdqp); | ||
| 602 | olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip, | ||
| 603 | &ip->i_gdquot, gdqp); | ||
| 604 | } | ||
| 605 | ip->i_d.di_projid = projid; | ||
| 606 | /* | ||
| 607 | * We may have to rev the inode as well as | ||
| 608 | * the superblock version number since projids didn't | ||
| 609 | * exist before DINODE_VERSION_2 and SB_VERSION_NLINK. | ||
| 610 | */ | ||
| 611 | if (ip->i_d.di_version == XFS_DINODE_VERSION_1) | ||
| 612 | xfs_bump_ino_vers2(tp, ip); | ||
| 613 | } | 500 | } |
| 614 | 501 | ||
| 615 | xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE); | 502 | xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE); |
| @@ -620,82 +507,33 @@ xfs_setattr( | |||
| 620 | /* | 507 | /* |
| 621 | * Change file access or modified times. | 508 | * Change file access or modified times. |
| 622 | */ | 509 | */ |
| 623 | if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) { | 510 | if (mask & (ATTR_ATIME|ATTR_MTIME)) { |
| 624 | if (mask & XFS_AT_ATIME) { | 511 | if (mask & ATTR_ATIME) { |
| 625 | ip->i_d.di_atime.t_sec = vap->va_atime.tv_sec; | 512 | inode->i_atime = iattr->ia_atime; |
| 626 | ip->i_d.di_atime.t_nsec = vap->va_atime.tv_nsec; | 513 | ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; |
| 514 | ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; | ||
| 627 | ip->i_update_core = 1; | 515 | ip->i_update_core = 1; |
| 628 | timeflags &= ~XFS_ICHGTIME_ACC; | ||
| 629 | } | 516 | } |
| 630 | if (mask & XFS_AT_MTIME) { | 517 | if (mask & ATTR_MTIME) { |
| 631 | ip->i_d.di_mtime.t_sec = vap->va_mtime.tv_sec; | 518 | inode->i_mtime = iattr->ia_mtime; |
| 632 | ip->i_d.di_mtime.t_nsec = vap->va_mtime.tv_nsec; | 519 | ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; |
| 520 | ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; | ||
| 633 | timeflags &= ~XFS_ICHGTIME_MOD; | 521 | timeflags &= ~XFS_ICHGTIME_MOD; |
| 634 | timeflags |= XFS_ICHGTIME_CHG; | 522 | timeflags |= XFS_ICHGTIME_CHG; |
| 635 | } | 523 | } |
| 636 | if (tp && (flags & ATTR_UTIME)) | 524 | if (tp && (mask & (ATTR_MTIME_SET|ATTR_ATIME_SET))) |
| 637 | xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE); | 525 | xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE); |
| 638 | } | 526 | } |
| 639 | 527 | ||
| 640 | /* | 528 | /* |
| 641 | * Change XFS-added attributes. | 529 | * Change file inode change time only if ATTR_CTIME set |
| 642 | */ | ||
| 643 | if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) { | ||
| 644 | if (mask & XFS_AT_EXTSIZE) { | ||
| 645 | /* | ||
| 646 | * Converting bytes to fs blocks. | ||
| 647 | */ | ||
| 648 | ip->i_d.di_extsize = vap->va_extsize >> | ||
| 649 | mp->m_sb.sb_blocklog; | ||
| 650 | } | ||
| 651 | if (mask & XFS_AT_XFLAGS) { | ||
| 652 | uint di_flags; | ||
| 653 | |||
| 654 | /* can't set PREALLOC this way, just preserve it */ | ||
| 655 | di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); | ||
| 656 | if (vap->va_xflags & XFS_XFLAG_IMMUTABLE) | ||
| 657 | di_flags |= XFS_DIFLAG_IMMUTABLE; | ||
| 658 | if (vap->va_xflags & XFS_XFLAG_APPEND) | ||
| 659 | di_flags |= XFS_DIFLAG_APPEND; | ||
| 660 | if (vap->va_xflags & XFS_XFLAG_SYNC) | ||
| 661 | di_flags |= XFS_DIFLAG_SYNC; | ||
| 662 | if (vap->va_xflags & XFS_XFLAG_NOATIME) | ||
| 663 | di_flags |= XFS_DIFLAG_NOATIME; | ||
| 664 | if (vap->va_xflags & XFS_XFLAG_NODUMP) | ||
| 665 | di_flags |= XFS_DIFLAG_NODUMP; | ||
| 666 | if (vap->va_xflags & XFS_XFLAG_PROJINHERIT) | ||
| 667 | di_flags |= XFS_DIFLAG_PROJINHERIT; | ||
| 668 | if (vap->va_xflags & XFS_XFLAG_NODEFRAG) | ||
| 669 | di_flags |= XFS_DIFLAG_NODEFRAG; | ||
| 670 | if (vap->va_xflags & XFS_XFLAG_FILESTREAM) | ||
| 671 | di_flags |= XFS_DIFLAG_FILESTREAM; | ||
| 672 | if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { | ||
| 673 | if (vap->va_xflags & XFS_XFLAG_RTINHERIT) | ||
| 674 | di_flags |= XFS_DIFLAG_RTINHERIT; | ||
| 675 | if (vap->va_xflags & XFS_XFLAG_NOSYMLINKS) | ||
| 676 | di_flags |= XFS_DIFLAG_NOSYMLINKS; | ||
| 677 | if (vap->va_xflags & XFS_XFLAG_EXTSZINHERIT) | ||
| 678 | di_flags |= XFS_DIFLAG_EXTSZINHERIT; | ||
| 679 | } else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { | ||
| 680 | if (vap->va_xflags & XFS_XFLAG_REALTIME) | ||
| 681 | di_flags |= XFS_DIFLAG_REALTIME; | ||
| 682 | if (vap->va_xflags & XFS_XFLAG_EXTSIZE) | ||
| 683 | di_flags |= XFS_DIFLAG_EXTSIZE; | ||
| 684 | } | ||
| 685 | ip->i_d.di_flags = di_flags; | ||
| 686 | } | ||
| 687 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); | ||
| 688 | timeflags |= XFS_ICHGTIME_CHG; | ||
| 689 | } | ||
| 690 | |||
| 691 | /* | ||
| 692 | * Change file inode change time only if XFS_AT_CTIME set | ||
| 693 | * AND we have been called by a DMI function. | 530 | * AND we have been called by a DMI function. |
| 694 | */ | 531 | */ |
| 695 | 532 | ||
| 696 | if ( (flags & ATTR_DMI) && (mask & XFS_AT_CTIME) ) { | 533 | if ((flags & XFS_ATTR_DMI) && (mask & ATTR_CTIME)) { |
| 697 | ip->i_d.di_ctime.t_sec = vap->va_ctime.tv_sec; | 534 | inode->i_ctime = iattr->ia_ctime; |
| 698 | ip->i_d.di_ctime.t_nsec = vap->va_ctime.tv_nsec; | 535 | ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; |
| 536 | ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; | ||
| 699 | ip->i_update_core = 1; | 537 | ip->i_update_core = 1; |
| 700 | timeflags &= ~XFS_ICHGTIME_CHG; | 538 | timeflags &= ~XFS_ICHGTIME_CHG; |
| 701 | } | 539 | } |
| @@ -704,7 +542,7 @@ xfs_setattr( | |||
| 704 | * Send out timestamp changes that need to be set to the | 542 | * Send out timestamp changes that need to be set to the |
| 705 | * current time. Not done when called by a DMI function. | 543 | * current time. Not done when called by a DMI function. |
| 706 | */ | 544 | */ |
| 707 | if (timeflags && !(flags & ATTR_DMI)) | 545 | if (timeflags && !(flags & XFS_ATTR_DMI)) |
| 708 | xfs_ichgtime(ip, timeflags); | 546 | xfs_ichgtime(ip, timeflags); |
| 709 | 547 | ||
| 710 | XFS_STATS_INC(xs_ig_attrchg); | 548 | XFS_STATS_INC(xs_ig_attrchg); |
| @@ -742,7 +580,7 @@ xfs_setattr( | |||
| 742 | } | 580 | } |
| 743 | 581 | ||
| 744 | if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) && | 582 | if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) && |
| 745 | !(flags & ATTR_DMI)) { | 583 | !(flags & XFS_ATTR_DMI)) { |
| 746 | (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL, | 584 | (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL, |
| 747 | NULL, DM_RIGHT_NULL, NULL, NULL, | 585 | NULL, DM_RIGHT_NULL, NULL, NULL, |
| 748 | 0, 0, AT_DELAY_FLAG(flags)); | 586 | 0, 0, AT_DELAY_FLAG(flags)); |
| @@ -875,7 +713,7 @@ xfs_fsync( | |||
| 875 | return XFS_ERROR(EIO); | 713 | return XFS_ERROR(EIO); |
| 876 | 714 | ||
| 877 | /* capture size updates in I/O completion before writing the inode. */ | 715 | /* capture size updates in I/O completion before writing the inode. */ |
| 878 | error = filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping); | 716 | error = filemap_fdatawait(VFS_I(ip)->i_mapping); |
| 879 | if (error) | 717 | if (error) |
| 880 | return XFS_ERROR(error); | 718 | return XFS_ERROR(error); |
| 881 | 719 | ||
| @@ -1321,7 +1159,6 @@ int | |||
| 1321 | xfs_release( | 1159 | xfs_release( |
| 1322 | xfs_inode_t *ip) | 1160 | xfs_inode_t *ip) |
| 1323 | { | 1161 | { |
| 1324 | bhv_vnode_t *vp = XFS_ITOV(ip); | ||
| 1325 | xfs_mount_t *mp = ip->i_mount; | 1162 | xfs_mount_t *mp = ip->i_mount; |
| 1326 | int error; | 1163 | int error; |
| 1327 | 1164 | ||
| @@ -1356,13 +1193,13 @@ xfs_release( | |||
| 1356 | * be exposed to that problem. | 1193 | * be exposed to that problem. |
| 1357 | */ | 1194 | */ |
| 1358 | truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); | 1195 | truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); |
| 1359 | if (truncated && VN_DIRTY(vp) && ip->i_delayed_blks > 0) | 1196 | if (truncated && VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) |
| 1360 | xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE); | 1197 | xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE); |
| 1361 | } | 1198 | } |
| 1362 | 1199 | ||
| 1363 | if (ip->i_d.di_nlink != 0) { | 1200 | if (ip->i_d.di_nlink != 0) { |
| 1364 | if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && | 1201 | if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && |
| 1365 | ((ip->i_size > 0) || (VN_CACHED(vp) > 0 || | 1202 | ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || |
| 1366 | ip->i_delayed_blks > 0)) && | 1203 | ip->i_delayed_blks > 0)) && |
| 1367 | (ip->i_df.if_flags & XFS_IFEXTENTS)) && | 1204 | (ip->i_df.if_flags & XFS_IFEXTENTS)) && |
| 1368 | (!(ip->i_d.di_flags & | 1205 | (!(ip->i_d.di_flags & |
| @@ -1388,7 +1225,6 @@ int | |||
| 1388 | xfs_inactive( | 1225 | xfs_inactive( |
| 1389 | xfs_inode_t *ip) | 1226 | xfs_inode_t *ip) |
| 1390 | { | 1227 | { |
| 1391 | bhv_vnode_t *vp = XFS_ITOV(ip); | ||
| 1392 | xfs_bmap_free_t free_list; | 1228 | xfs_bmap_free_t free_list; |
| 1393 | xfs_fsblock_t first_block; | 1229 | xfs_fsblock_t first_block; |
| 1394 | int committed; | 1230 | int committed; |
| @@ -1403,7 +1239,7 @@ xfs_inactive( | |||
| 1403 | * If the inode is already free, then there can be nothing | 1239 | * If the inode is already free, then there can be nothing |
| 1404 | * to clean up here. | 1240 | * to clean up here. |
| 1405 | */ | 1241 | */ |
| 1406 | if (ip->i_d.di_mode == 0 || VN_BAD(vp)) { | 1242 | if (ip->i_d.di_mode == 0 || VN_BAD(VFS_I(ip))) { |
| 1407 | ASSERT(ip->i_df.if_real_bytes == 0); | 1243 | ASSERT(ip->i_df.if_real_bytes == 0); |
| 1408 | ASSERT(ip->i_df.if_broot_bytes == 0); | 1244 | ASSERT(ip->i_df.if_broot_bytes == 0); |
| 1409 | return VN_INACTIVE_CACHE; | 1245 | return VN_INACTIVE_CACHE; |
| @@ -1433,7 +1269,7 @@ xfs_inactive( | |||
| 1433 | 1269 | ||
| 1434 | if (ip->i_d.di_nlink != 0) { | 1270 | if (ip->i_d.di_nlink != 0) { |
| 1435 | if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && | 1271 | if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && |
| 1436 | ((ip->i_size > 0) || (VN_CACHED(vp) > 0 || | 1272 | ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 || |
| 1437 | ip->i_delayed_blks > 0)) && | 1273 | ip->i_delayed_blks > 0)) && |
| 1438 | (ip->i_df.if_flags & XFS_IFEXTENTS) && | 1274 | (ip->i_df.if_flags & XFS_IFEXTENTS) && |
| 1439 | (!(ip->i_d.di_flags & | 1275 | (!(ip->i_d.di_flags & |
| @@ -1601,12 +1437,18 @@ xfs_inactive( | |||
| 1601 | return VN_INACTIVE_CACHE; | 1437 | return VN_INACTIVE_CACHE; |
| 1602 | } | 1438 | } |
| 1603 | 1439 | ||
| 1604 | 1440 | /* | |
| 1441 | * Looks up an inode from "name". If ci_name is not NULL, then a CI match | ||
| 1442 | * is allowed, otherwise it has to be an exact match. If a CI match is found, | ||
| 1443 | * ci_name->name will point to the actual name (caller must free) or | ||
| 1444 | * will be set to NULL if an exact match is found. | ||
| 1445 | */ | ||
| 1605 | int | 1446 | int |
| 1606 | xfs_lookup( | 1447 | xfs_lookup( |
| 1607 | xfs_inode_t *dp, | 1448 | xfs_inode_t *dp, |
| 1608 | struct xfs_name *name, | 1449 | struct xfs_name *name, |
| 1609 | xfs_inode_t **ipp) | 1450 | xfs_inode_t **ipp, |
| 1451 | struct xfs_name *ci_name) | ||
| 1610 | { | 1452 | { |
| 1611 | xfs_ino_t inum; | 1453 | xfs_ino_t inum; |
| 1612 | int error; | 1454 | int error; |
| @@ -1618,7 +1460,7 @@ xfs_lookup( | |||
| 1618 | return XFS_ERROR(EIO); | 1460 | return XFS_ERROR(EIO); |
| 1619 | 1461 | ||
| 1620 | lock_mode = xfs_ilock_map_shared(dp); | 1462 | lock_mode = xfs_ilock_map_shared(dp); |
| 1621 | error = xfs_dir_lookup(NULL, dp, name, &inum); | 1463 | error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); |
| 1622 | xfs_iunlock_map_shared(dp, lock_mode); | 1464 | xfs_iunlock_map_shared(dp, lock_mode); |
| 1623 | 1465 | ||
| 1624 | if (error) | 1466 | if (error) |
| @@ -1626,12 +1468,15 @@ xfs_lookup( | |||
| 1626 | 1468 | ||
| 1627 | error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp, 0); | 1469 | error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp, 0); |
| 1628 | if (error) | 1470 | if (error) |
| 1629 | goto out; | 1471 | goto out_free_name; |
| 1630 | 1472 | ||
| 1631 | xfs_itrace_ref(*ipp); | 1473 | xfs_itrace_ref(*ipp); |
| 1632 | return 0; | 1474 | return 0; |
| 1633 | 1475 | ||
| 1634 | out: | 1476 | out_free_name: |
| 1477 | if (ci_name) | ||
| 1478 | kmem_free(ci_name->name); | ||
| 1479 | out: | ||
| 1635 | *ipp = NULL; | 1480 | *ipp = NULL; |
| 1636 | return error; | 1481 | return error; |
| 1637 | } | 1482 | } |
| @@ -1688,7 +1533,7 @@ xfs_create( | |||
| 1688 | * Make sure that we have allocated dquot(s) on disk. | 1533 | * Make sure that we have allocated dquot(s) on disk. |
| 1689 | */ | 1534 | */ |
| 1690 | error = XFS_QM_DQVOPALLOC(mp, dp, | 1535 | error = XFS_QM_DQVOPALLOC(mp, dp, |
| 1691 | current_fsuid(credp), current_fsgid(credp), prid, | 1536 | current_fsuid(), current_fsgid(), prid, |
| 1692 | XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp); | 1537 | XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp); |
| 1693 | if (error) | 1538 | if (error) |
| 1694 | goto std_return; | 1539 | goto std_return; |
| @@ -1860,111 +1705,6 @@ std_return: | |||
| 1860 | } | 1705 | } |
| 1861 | 1706 | ||
| 1862 | #ifdef DEBUG | 1707 | #ifdef DEBUG |
| 1863 | /* | ||
| 1864 | * Some counters to see if (and how often) we are hitting some deadlock | ||
| 1865 | * prevention code paths. | ||
| 1866 | */ | ||
| 1867 | |||
| 1868 | int xfs_rm_locks; | ||
| 1869 | int xfs_rm_lock_delays; | ||
| 1870 | int xfs_rm_attempts; | ||
| 1871 | #endif | ||
| 1872 | |||
| 1873 | /* | ||
| 1874 | * The following routine will lock the inodes associated with the | ||
| 1875 | * directory and the named entry in the directory. The locks are | ||
| 1876 | * acquired in increasing inode number. | ||
| 1877 | * | ||
| 1878 | * If the entry is "..", then only the directory is locked. The | ||
| 1879 | * vnode ref count will still include that from the .. entry in | ||
| 1880 | * this case. | ||
| 1881 | * | ||
| 1882 | * There is a deadlock we need to worry about. If the locked directory is | ||
| 1883 | * in the AIL, it might be blocking up the log. The next inode we lock | ||
| 1884 | * could be already locked by another thread waiting for log space (e.g | ||
| 1885 | * a permanent log reservation with a long running transaction (see | ||
| 1886 | * xfs_itruncate_finish)). To solve this, we must check if the directory | ||
| 1887 | * is in the ail and use lock_nowait. If we can't lock, we need to | ||
| 1888 | * drop the inode lock on the directory and try again. xfs_iunlock will | ||
| 1889 | * potentially push the tail if we were holding up the log. | ||
| 1890 | */ | ||
| 1891 | STATIC int | ||
| 1892 | xfs_lock_dir_and_entry( | ||
| 1893 | xfs_inode_t *dp, | ||
| 1894 | xfs_inode_t *ip) /* inode of entry 'name' */ | ||
| 1895 | { | ||
| 1896 | int attempts; | ||
| 1897 | xfs_ino_t e_inum; | ||
| 1898 | xfs_inode_t *ips[2]; | ||
| 1899 | xfs_log_item_t *lp; | ||
| 1900 | |||
| 1901 | #ifdef DEBUG | ||
| 1902 | xfs_rm_locks++; | ||
| 1903 | #endif | ||
| 1904 | attempts = 0; | ||
| 1905 | |||
| 1906 | again: | ||
| 1907 | xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); | ||
| 1908 | |||
| 1909 | e_inum = ip->i_ino; | ||
| 1910 | |||
| 1911 | xfs_itrace_ref(ip); | ||
| 1912 | |||
| 1913 | /* | ||
| 1914 | * We want to lock in increasing inum. Since we've already | ||
| 1915 | * acquired the lock on the directory, we may need to release | ||
| 1916 | * it if the inum of the entry turns out to be less. | ||
| 1917 | */ | ||
| 1918 | if (e_inum > dp->i_ino) { | ||
| 1919 | /* | ||
| 1920 | * We are already in the right order, so just | ||
| 1921 | * lock on the inode of the entry. | ||
| 1922 | * We need to use nowait if dp is in the AIL. | ||
| 1923 | */ | ||
| 1924 | |||
| 1925 | lp = (xfs_log_item_t *)dp->i_itemp; | ||
| 1926 | if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { | ||
| 1927 | if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { | ||
| 1928 | attempts++; | ||
| 1929 | #ifdef DEBUG | ||
| 1930 | xfs_rm_attempts++; | ||
| 1931 | #endif | ||
| 1932 | |||
| 1933 | /* | ||
| 1934 | * Unlock dp and try again. | ||
| 1935 | * xfs_iunlock will try to push the tail | ||
| 1936 | * if the inode is in the AIL. | ||
| 1937 | */ | ||
| 1938 | |||
| 1939 | xfs_iunlock(dp, XFS_ILOCK_EXCL); | ||
| 1940 | |||
| 1941 | if ((attempts % 5) == 0) { | ||
| 1942 | delay(1); /* Don't just spin the CPU */ | ||
| 1943 | #ifdef DEBUG | ||
| 1944 | xfs_rm_lock_delays++; | ||
| 1945 | #endif | ||
| 1946 | } | ||
| 1947 | goto again; | ||
| 1948 | } | ||
| 1949 | } else { | ||
| 1950 | xfs_ilock(ip, XFS_ILOCK_EXCL); | ||
| 1951 | } | ||
| 1952 | } else if (e_inum < dp->i_ino) { | ||
| 1953 | xfs_iunlock(dp, XFS_ILOCK_EXCL); | ||
| 1954 | |||
| 1955 | ips[0] = ip; | ||
| 1956 | ips[1] = dp; | ||
| 1957 | xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL); | ||
| 1958 | } | ||
| 1959 | /* else e_inum == dp->i_ino */ | ||
| 1960 | /* This can happen if we're asked to lock /x/.. | ||
| 1961 | * the entry is "..", which is also the parent directory. | ||
| 1962 | */ | ||
| 1963 | |||
| 1964 | return 0; | ||
| 1965 | } | ||
| 1966 | |||
| 1967 | #ifdef DEBUG | ||
| 1968 | int xfs_locked_n; | 1708 | int xfs_locked_n; |
| 1969 | int xfs_small_retries; | 1709 | int xfs_small_retries; |
| 1970 | int xfs_middle_retries; | 1710 | int xfs_middle_retries; |
| @@ -2098,12 +1838,52 @@ again: | |||
| 2098 | #endif | 1838 | #endif |
| 2099 | } | 1839 | } |
| 2100 | 1840 | ||
| 2101 | #ifdef DEBUG | 1841 | /* |
| 2102 | #define REMOVE_DEBUG_TRACE(x) {remove_which_error_return = (x);} | 1842 | * xfs_lock_two_inodes() can only be used to lock one type of lock |
| 2103 | int remove_which_error_return = 0; | 1843 | * at a time - the iolock or the ilock, but not both at once. If |
| 2104 | #else /* ! DEBUG */ | 1844 | * we lock both at once, lockdep will report false positives saying |
| 2105 | #define REMOVE_DEBUG_TRACE(x) | 1845 | * we have violated locking orders. |
| 2106 | #endif /* ! DEBUG */ | 1846 | */ |
| 1847 | void | ||
| 1848 | xfs_lock_two_inodes( | ||
| 1849 | xfs_inode_t *ip0, | ||
| 1850 | xfs_inode_t *ip1, | ||
| 1851 | uint lock_mode) | ||
| 1852 | { | ||
| 1853 | xfs_inode_t *temp; | ||
| 1854 | int attempts = 0; | ||
| 1855 | xfs_log_item_t *lp; | ||
| 1856 | |||
| 1857 | if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) | ||
| 1858 | ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0); | ||
| 1859 | ASSERT(ip0->i_ino != ip1->i_ino); | ||
| 1860 | |||
| 1861 | if (ip0->i_ino > ip1->i_ino) { | ||
| 1862 | temp = ip0; | ||
| 1863 | ip0 = ip1; | ||
| 1864 | ip1 = temp; | ||
| 1865 | } | ||
| 1866 | |||
| 1867 | again: | ||
| 1868 | xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0)); | ||
| 1869 | |||
| 1870 | /* | ||
| 1871 | * If the first lock we have locked is in the AIL, we must TRY to get | ||
| 1872 | * the second lock. If we can't get it, we must release the first one | ||
| 1873 | * and try again. | ||
| 1874 | */ | ||
| 1875 | lp = (xfs_log_item_t *)ip0->i_itemp; | ||
| 1876 | if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { | ||
| 1877 | if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) { | ||
| 1878 | xfs_iunlock(ip0, lock_mode); | ||
| 1879 | if ((++attempts % 5) == 0) | ||
| 1880 | delay(1); /* Don't just spin the CPU */ | ||
| 1881 | goto again; | ||
| 1882 | } | ||
| 1883 | } else { | ||
| 1884 | xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1)); | ||
| 1885 | } | ||
| 1886 | } | ||
| 2107 | 1887 | ||
| 2108 | int | 1888 | int |
| 2109 | xfs_remove( | 1889 | xfs_remove( |
| @@ -2113,6 +1893,7 @@ xfs_remove( | |||
| 2113 | { | 1893 | { |
| 2114 | xfs_mount_t *mp = dp->i_mount; | 1894 | xfs_mount_t *mp = dp->i_mount; |
| 2115 | xfs_trans_t *tp = NULL; | 1895 | xfs_trans_t *tp = NULL; |
| 1896 | int is_dir = S_ISDIR(ip->i_d.di_mode); | ||
| 2116 | int error = 0; | 1897 | int error = 0; |
| 2117 | xfs_bmap_free_t free_list; | 1898 | xfs_bmap_free_t free_list; |
| 2118 | xfs_fsblock_t first_block; | 1899 | xfs_fsblock_t first_block; |
| @@ -2120,8 +1901,10 @@ xfs_remove( | |||
| 2120 | int committed; | 1901 | int committed; |
| 2121 | int link_zero; | 1902 | int link_zero; |
| 2122 | uint resblks; | 1903 | uint resblks; |
| 1904 | uint log_count; | ||
| 2123 | 1905 | ||
| 2124 | xfs_itrace_entry(dp); | 1906 | xfs_itrace_entry(dp); |
| 1907 | xfs_itrace_entry(ip); | ||
| 2125 | 1908 | ||
| 2126 | if (XFS_FORCED_SHUTDOWN(mp)) | 1909 | if (XFS_FORCED_SHUTDOWN(mp)) |
| 2127 | return XFS_ERROR(EIO); | 1910 | return XFS_ERROR(EIO); |
| @@ -2134,19 +1917,23 @@ xfs_remove( | |||
| 2134 | return error; | 1917 | return error; |
| 2135 | } | 1918 | } |
| 2136 | 1919 | ||
| 2137 | xfs_itrace_entry(ip); | ||
| 2138 | xfs_itrace_ref(ip); | ||
| 2139 | |||
| 2140 | error = XFS_QM_DQATTACH(mp, dp, 0); | 1920 | error = XFS_QM_DQATTACH(mp, dp, 0); |
| 2141 | if (!error) | 1921 | if (error) |
| 2142 | error = XFS_QM_DQATTACH(mp, ip, 0); | ||
| 2143 | if (error) { | ||
| 2144 | REMOVE_DEBUG_TRACE(__LINE__); | ||
| 2145 | goto std_return; | 1922 | goto std_return; |
| 2146 | } | ||
| 2147 | 1923 | ||
| 2148 | tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); | 1924 | error = XFS_QM_DQATTACH(mp, ip, 0); |
| 1925 | if (error) | ||
| 1926 | goto std_return; | ||
| 1927 | |||
| 1928 | if (is_dir) { | ||
| 1929 | tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); | ||
| 1930 | log_count = XFS_DEFAULT_LOG_COUNT; | ||
| 1931 | } else { | ||
| 1932 | tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); | ||
| 1933 | log_count = XFS_REMOVE_LOG_COUNT; | ||
| 1934 | } | ||
| 2149 | cancel_flags = XFS_TRANS_RELEASE_LOG_RES; | 1935 | cancel_flags = XFS_TRANS_RELEASE_LOG_RES; |
| 1936 | |||
| 2150 | /* | 1937 | /* |
| 2151 | * We try to get the real space reservation first, | 1938 | * We try to get the real space reservation first, |
| 2152 | * allowing for directory btree deletion(s) implying | 1939 | * allowing for directory btree deletion(s) implying |
| @@ -2158,25 +1945,19 @@ xfs_remove( | |||
| 2158 | */ | 1945 | */ |
| 2159 | resblks = XFS_REMOVE_SPACE_RES(mp); | 1946 | resblks = XFS_REMOVE_SPACE_RES(mp); |
| 2160 | error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0, | 1947 | error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0, |
| 2161 | XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT); | 1948 | XFS_TRANS_PERM_LOG_RES, log_count); |
| 2162 | if (error == ENOSPC) { | 1949 | if (error == ENOSPC) { |
| 2163 | resblks = 0; | 1950 | resblks = 0; |
| 2164 | error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0, | 1951 | error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0, |
| 2165 | XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT); | 1952 | XFS_TRANS_PERM_LOG_RES, log_count); |
| 2166 | } | 1953 | } |
| 2167 | if (error) { | 1954 | if (error) { |
| 2168 | ASSERT(error != ENOSPC); | 1955 | ASSERT(error != ENOSPC); |
| 2169 | REMOVE_DEBUG_TRACE(__LINE__); | 1956 | cancel_flags = 0; |
| 2170 | xfs_trans_cancel(tp, 0); | 1957 | goto out_trans_cancel; |
| 2171 | return error; | ||
| 2172 | } | 1958 | } |
| 2173 | 1959 | ||
| 2174 | error = xfs_lock_dir_and_entry(dp, ip); | 1960 | xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); |
| 2175 | if (error) { | ||
| 2176 | REMOVE_DEBUG_TRACE(__LINE__); | ||
| 2177 | xfs_trans_cancel(tp, cancel_flags); | ||
| 2178 | goto std_return; | ||
| 2179 | } | ||
| 2180 | 1961 | ||
| 2181 | /* | 1962 | /* |
| 2182 | * At this point, we've gotten both the directory and the entry | 1963 | * At this point, we've gotten both the directory and the entry |
| @@ -2189,46 +1970,83 @@ xfs_remove( | |||
| 2189 | xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); | 1970 | xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); |
| 2190 | 1971 | ||
| 2191 | /* | 1972 | /* |
| 2192 | * Entry must exist since we did a lookup in xfs_lock_dir_and_entry. | 1973 | * If we're removing a directory, perform some additional validation. |
| 2193 | */ | 1974 | */ |
| 1975 | if (is_dir) { | ||
| 1976 | ASSERT(ip->i_d.di_nlink >= 2); | ||
| 1977 | if (ip->i_d.di_nlink != 2) { | ||
| 1978 | error = XFS_ERROR(ENOTEMPTY); | ||
| 1979 | goto out_trans_cancel; | ||
| 1980 | } | ||
| 1981 | if (!xfs_dir_isempty(ip)) { | ||
| 1982 | error = XFS_ERROR(ENOTEMPTY); | ||
| 1983 | goto out_trans_cancel; | ||
| 1984 | } | ||
| 1985 | } | ||
| 1986 | |||
| 2194 | XFS_BMAP_INIT(&free_list, &first_block); | 1987 | XFS_BMAP_INIT(&free_list, &first_block); |
| 2195 | error = xfs_dir_removename(tp, dp, name, ip->i_ino, | 1988 | error = xfs_dir_removename(tp, dp, name, ip->i_ino, |
| 2196 | &first_block, &free_list, resblks); | 1989 | &first_block, &free_list, resblks); |
| 2197 | if (error) { | 1990 | if (error) { |
| 2198 | ASSERT(error != ENOENT); | 1991 | ASSERT(error != ENOENT); |
| 2199 | REMOVE_DEBUG_TRACE(__LINE__); | 1992 | goto out_bmap_cancel; |
| 2200 | goto error1; | ||
| 2201 | } | 1993 | } |
| 2202 | xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); | 1994 | xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); |
| 2203 | 1995 | ||
| 1996 | /* | ||
| 1997 | * Bump the in-memory generation count on the parent | ||
| 1998 | * directory so that others can know that it has changed. | ||
| 1999 | */ | ||
| 2204 | dp->i_gen++; | 2000 | dp->i_gen++; |
| 2205 | xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); | 2001 | xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); |
| 2206 | 2002 | ||
| 2207 | error = xfs_droplink(tp, ip); | 2003 | if (is_dir) { |
| 2208 | if (error) { | 2004 | /* |
| 2209 | REMOVE_DEBUG_TRACE(__LINE__); | 2005 | * Drop the link from ip's "..". |
| 2210 | goto error1; | 2006 | */ |
| 2007 | error = xfs_droplink(tp, dp); | ||
| 2008 | if (error) | ||
| 2009 | goto out_bmap_cancel; | ||
| 2010 | |||
| 2011 | /* | ||
| 2012 | * Drop the link from dp to ip. | ||
| 2013 | */ | ||
| 2014 | error = xfs_droplink(tp, ip); | ||
| 2015 | if (error) | ||
| 2016 | goto out_bmap_cancel; | ||
| 2017 | } else { | ||
| 2018 | /* | ||
| 2019 | * When removing a non-directory, we need to log the parent | ||
| 2020 | * inode here for the i_gen update. For a directory this is | ||
| 2021 | * done implicitly by the xfs_droplink call for the ".." entry. | ||
| 2022 | */ | ||
| 2023 | xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); | ||
| 2211 | } | 2024 | } |
| 2212 | 2025 | ||
| 2213 | /* Determine if this is the last link while | 2026 | /* |
| 2027 | * Drop the "." link from ip to self. | ||
| 2028 | */ | ||
| 2029 | error = xfs_droplink(tp, ip); | ||
| 2030 | if (error) | ||
| 2031 | goto out_bmap_cancel; | ||
| 2032 | |||
| 2033 | /* | ||
| 2034 | * Determine if this is the last link while | ||
| 2214 | * we are in the transaction. | 2035 | * we are in the transaction. |
| 2215 | */ | 2036 | */ |
| 2216 | link_zero = (ip)->i_d.di_nlink==0; | 2037 | link_zero = (ip->i_d.di_nlink == 0); |
| 2217 | 2038 | ||
| 2218 | /* | 2039 | /* |
| 2219 | * If this is a synchronous mount, make sure that the | 2040 | * If this is a synchronous mount, make sure that the |
| 2220 | * remove transaction goes to disk before returning to | 2041 | * remove transaction goes to disk before returning to |
| 2221 | * the user. | 2042 | * the user. |
| 2222 | */ | 2043 | */ |
| 2223 | if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { | 2044 | if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) |
| 2224 | xfs_trans_set_sync(tp); | 2045 | xfs_trans_set_sync(tp); |
| 2225 | } | ||
| 2226 | 2046 | ||
| 2227 | error = xfs_bmap_finish(&tp, &free_list, &committed); | 2047 | error = xfs_bmap_finish(&tp, &free_list, &committed); |
| 2228 | if (error) { | 2048 | if (error) |
| 2229 | REMOVE_DEBUG_TRACE(__LINE__); | 2049 | goto out_bmap_cancel; |
| 2230 | goto error_rele; | ||
| 2231 | } | ||
| 2232 | 2050 | ||
| 2233 | error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); | 2051 | error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); |
| 2234 | if (error) | 2052 | if (error) |
| @@ -2240,38 +2058,26 @@ xfs_remove( | |||
| 2240 | * will get killed on last close in xfs_close() so we don't | 2058 | * will get killed on last close in xfs_close() so we don't |
| 2241 | * have to worry about that. | 2059 | * have to worry about that. |
| 2242 | */ | 2060 | */ |
| 2243 | if (link_zero && xfs_inode_is_filestream(ip)) | 2061 | if (!is_dir && link_zero && xfs_inode_is_filestream(ip)) |
| 2244 | xfs_filestream_deassociate(ip); | 2062 | xfs_filestream_deassociate(ip); |
| 2245 | 2063 | ||
| 2246 | xfs_itrace_exit(ip); | 2064 | xfs_itrace_exit(ip); |
| 2065 | xfs_itrace_exit(dp); | ||
| 2247 | 2066 | ||
| 2248 | /* Fall through to std_return with error = 0 */ | ||
| 2249 | std_return: | 2067 | std_return: |
| 2250 | if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) { | 2068 | if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) { |
| 2251 | (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, | 2069 | XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, dp, DM_RIGHT_NULL, |
| 2252 | dp, DM_RIGHT_NULL, | 2070 | NULL, DM_RIGHT_NULL, name->name, NULL, |
| 2253 | NULL, DM_RIGHT_NULL, | 2071 | ip->i_d.di_mode, error, 0); |
| 2254 | name->name, NULL, ip->i_d.di_mode, error, 0); | ||
| 2255 | } | 2072 | } |
| 2256 | return error; | ||
| 2257 | 2073 | ||
| 2258 | error1: | 2074 | return error; |
| 2259 | xfs_bmap_cancel(&free_list); | ||
| 2260 | cancel_flags |= XFS_TRANS_ABORT; | ||
| 2261 | xfs_trans_cancel(tp, cancel_flags); | ||
| 2262 | goto std_return; | ||
| 2263 | 2075 | ||
| 2264 | error_rele: | 2076 | out_bmap_cancel: |
| 2265 | /* | ||
| 2266 | * In this case make sure to not release the inode until after | ||
| 2267 | * the current transaction is aborted. Releasing it beforehand | ||
| 2268 | * can cause us to go to xfs_inactive and start a recursive | ||
| 2269 | * transaction which can easily deadlock with the current one. | ||
| 2270 | */ | ||
| 2271 | xfs_bmap_cancel(&free_list); | 2077 | xfs_bmap_cancel(&free_list); |
| 2272 | cancel_flags |= XFS_TRANS_ABORT; | 2078 | cancel_flags |= XFS_TRANS_ABORT; |
| 2079 | out_trans_cancel: | ||
| 2273 | xfs_trans_cancel(tp, cancel_flags); | 2080 | xfs_trans_cancel(tp, cancel_flags); |
| 2274 | |||
| 2275 | goto std_return; | 2081 | goto std_return; |
| 2276 | } | 2082 | } |
| 2277 | 2083 | ||
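The error paths are consolidated the same way: out_bmap_cancel undoes the pending block-mapping work and falls through to out_trans_cancel, so each failure site jumps to the earliest cleanup it still owes. A generic sketch of that staged goto unwind, with made-up stand-ins for the XFS cleanup calls:

#include <stdio.h>

/* Stand-ins for resource setup/teardown; the names are illustrative only. */
static int  grab_reservation(void)   { return 0; }
static int  do_work(void)            { return -1; /* pretend this step fails */ }
static void cancel_mapping(void)     { puts("cancel pending block mapping"); }
static void cancel_transaction(void) { puts("cancel transaction"); }

static int remove_like(void)
{
        int error;

        error = grab_reservation();
        if (error)
                goto out_trans_cancel;  /* nothing mapped yet */

        error = do_work();
        if (error)
                goto out_bmap_cancel;   /* owe both cleanups  */

        return 0;

out_bmap_cancel:
        cancel_mapping();
out_trans_cancel:
        cancel_transaction();
        return error;
}

int main(void) { return remove_like() ? 1 : 0; }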
| @@ -2283,7 +2089,6 @@ xfs_link( | |||
| 2283 | { | 2089 | { |
| 2284 | xfs_mount_t *mp = tdp->i_mount; | 2090 | xfs_mount_t *mp = tdp->i_mount; |
| 2285 | xfs_trans_t *tp; | 2091 | xfs_trans_t *tp; |
| 2286 | xfs_inode_t *ips[2]; | ||
| 2287 | int error; | 2092 | int error; |
| 2288 | xfs_bmap_free_t free_list; | 2093 | xfs_bmap_free_t free_list; |
| 2289 | xfs_fsblock_t first_block; | 2094 | xfs_fsblock_t first_block; |
| @@ -2331,15 +2136,7 @@ xfs_link( | |||
| 2331 | goto error_return; | 2136 | goto error_return; |
| 2332 | } | 2137 | } |
| 2333 | 2138 | ||
| 2334 | if (sip->i_ino < tdp->i_ino) { | 2139 | xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); |
| 2335 | ips[0] = sip; | ||
| 2336 | ips[1] = tdp; | ||
| 2337 | } else { | ||
| 2338 | ips[0] = tdp; | ||
| 2339 | ips[1] = sip; | ||
| 2340 | } | ||
| 2341 | |||
| 2342 | xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL); | ||
| 2343 | 2140 | ||
| 2344 | /* | 2141 | /* |
| 2345 | * Increment vnode ref counts since xfs_trans_commit & | 2142 | * Increment vnode ref counts since xfs_trans_commit & |
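xfs_link now calls xfs_lock_two_inodes() instead of sorting an ips[] array by hand, but the rule behind both versions is the same: when two inodes must be locked together, take them in a fixed order (ascending inode number here) so concurrent callers cannot deadlock against each other. A userspace sketch of that ordering rule, with pthread mutexes standing in for the inode locks:

#include <pthread.h>
#include <stdint.h>

struct toy_inode {
        uint64_t        ino;
        pthread_mutex_t lock;
};

/* Always lock the lower-numbered inode first; assumes a and b differ. */
static void lock_two(struct toy_inode *a, struct toy_inode *b)
{
        if (a->ino > b->ino) {
                struct toy_inode *tmp = a; a = b; b = tmp;
        }
        pthread_mutex_lock(&a->lock);
        pthread_mutex_lock(&b->lock);
}

static void unlock_two(struct toy_inode *a, struct toy_inode *b)
{
        pthread_mutex_unlock(&a->lock);
        pthread_mutex_unlock(&b->lock);
}

int main(void)
{
        struct toy_inode x = { 42, PTHREAD_MUTEX_INITIALIZER };
        struct toy_inode y = {  7, PTHREAD_MUTEX_INITIALIZER };

        lock_two(&x, &y);       /* takes y (ino 7) first, then x (ino 42) */
        unlock_two(&x, &y);
        return 0;
}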
| @@ -2480,7 +2277,7 @@ xfs_mkdir( | |||
| 2480 | * Make sure that we have allocated dquot(s) on disk. | 2277 | * Make sure that we have allocated dquot(s) on disk. |
| 2481 | */ | 2278 | */ |
| 2482 | error = XFS_QM_DQVOPALLOC(mp, dp, | 2279 | error = XFS_QM_DQVOPALLOC(mp, dp, |
| 2483 | current_fsuid(credp), current_fsgid(credp), prid, | 2280 | current_fsuid(), current_fsgid(), prid, |
| 2484 | XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); | 2281 | XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); |
| 2485 | if (error) | 2282 | if (error) |
| 2486 | goto std_return; | 2283 | goto std_return; |
| @@ -2638,186 +2435,6 @@ std_return: | |||
| 2638 | } | 2435 | } |
| 2639 | 2436 | ||
| 2640 | int | 2437 | int |
| 2641 | xfs_rmdir( | ||
| 2642 | xfs_inode_t *dp, | ||
| 2643 | struct xfs_name *name, | ||
| 2644 | xfs_inode_t *cdp) | ||
| 2645 | { | ||
| 2646 | xfs_mount_t *mp = dp->i_mount; | ||
| 2647 | xfs_trans_t *tp; | ||
| 2648 | int error; | ||
| 2649 | xfs_bmap_free_t free_list; | ||
| 2650 | xfs_fsblock_t first_block; | ||
| 2651 | int cancel_flags; | ||
| 2652 | int committed; | ||
| 2653 | int last_cdp_link; | ||
| 2654 | uint resblks; | ||
| 2655 | |||
| 2656 | xfs_itrace_entry(dp); | ||
| 2657 | |||
| 2658 | if (XFS_FORCED_SHUTDOWN(mp)) | ||
| 2659 | return XFS_ERROR(EIO); | ||
| 2660 | |||
| 2661 | if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) { | ||
| 2662 | error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, | ||
| 2663 | dp, DM_RIGHT_NULL, | ||
| 2664 | NULL, DM_RIGHT_NULL, name->name, | ||
| 2665 | NULL, cdp->i_d.di_mode, 0, 0); | ||
| 2666 | if (error) | ||
| 2667 | return XFS_ERROR(error); | ||
| 2668 | } | ||
| 2669 | |||
| 2670 | /* | ||
| 2671 | * Get the dquots for the inodes. | ||
| 2672 | */ | ||
| 2673 | error = XFS_QM_DQATTACH(mp, dp, 0); | ||
| 2674 | if (!error) | ||
| 2675 | error = XFS_QM_DQATTACH(mp, cdp, 0); | ||
| 2676 | if (error) { | ||
| 2677 | REMOVE_DEBUG_TRACE(__LINE__); | ||
| 2678 | goto std_return; | ||
| 2679 | } | ||
| 2680 | |||
| 2681 | tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); | ||
| 2682 | cancel_flags = XFS_TRANS_RELEASE_LOG_RES; | ||
| 2683 | /* | ||
| 2684 | * We try to get the real space reservation first, | ||
| 2685 | * allowing for directory btree deletion(s) implying | ||
| 2686 | * possible bmap insert(s). If we can't get the space | ||
| 2687 | * reservation then we use 0 instead, and avoid the bmap | ||
| 2688 | * btree insert(s) in the directory code by, if the bmap | ||
| 2689 | * insert tries to happen, instead trimming the LAST | ||
| 2690 | * block from the directory. | ||
| 2691 | */ | ||
| 2692 | resblks = XFS_REMOVE_SPACE_RES(mp); | ||
| 2693 | error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0, | ||
| 2694 | XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT); | ||
| 2695 | if (error == ENOSPC) { | ||
| 2696 | resblks = 0; | ||
| 2697 | error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0, | ||
| 2698 | XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT); | ||
| 2699 | } | ||
| 2700 | if (error) { | ||
| 2701 | ASSERT(error != ENOSPC); | ||
| 2702 | cancel_flags = 0; | ||
| 2703 | goto error_return; | ||
| 2704 | } | ||
| 2705 | XFS_BMAP_INIT(&free_list, &first_block); | ||
| 2706 | |||
| 2707 | /* | ||
| 2708 | * Now lock the child directory inode and the parent directory | ||
| 2709 | * inode in the proper order. This will take care of validating | ||
| 2710 | * that the directory entry for the child directory inode has | ||
| 2711 | * not changed while we were obtaining a log reservation. | ||
| 2712 | */ | ||
| 2713 | error = xfs_lock_dir_and_entry(dp, cdp); | ||
| 2714 | if (error) { | ||
| 2715 | xfs_trans_cancel(tp, cancel_flags); | ||
| 2716 | goto std_return; | ||
| 2717 | } | ||
| 2718 | |||
| 2719 | IHOLD(dp); | ||
| 2720 | xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); | ||
| 2721 | |||
| 2722 | IHOLD(cdp); | ||
| 2723 | xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL); | ||
| 2724 | |||
| 2725 | ASSERT(cdp->i_d.di_nlink >= 2); | ||
| 2726 | if (cdp->i_d.di_nlink != 2) { | ||
| 2727 | error = XFS_ERROR(ENOTEMPTY); | ||
| 2728 | goto error_return; | ||
| 2729 | } | ||
| 2730 | if (!xfs_dir_isempty(cdp)) { | ||
| 2731 | error = XFS_ERROR(ENOTEMPTY); | ||
| 2732 | goto error_return; | ||
| 2733 | } | ||
| 2734 | |||
| 2735 | error = xfs_dir_removename(tp, dp, name, cdp->i_ino, | ||
| 2736 | &first_block, &free_list, resblks); | ||
| 2737 | if (error) | ||
| 2738 | goto error1; | ||
| 2739 | |||
| 2740 | xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); | ||
| 2741 | |||
| 2742 | /* | ||
| 2743 | * Bump the in memory generation count on the parent | ||
| 2744 | * directory so that others can know that it has changed. | ||
| 2745 | */ | ||
| 2746 | dp->i_gen++; | ||
| 2747 | |||
| 2748 | /* | ||
| 2749 | * Drop the link from cdp's "..". | ||
| 2750 | */ | ||
| 2751 | error = xfs_droplink(tp, dp); | ||
| 2752 | if (error) { | ||
| 2753 | goto error1; | ||
| 2754 | } | ||
| 2755 | |||
| 2756 | /* | ||
| 2757 | * Drop the link from dp to cdp. | ||
| 2758 | */ | ||
| 2759 | error = xfs_droplink(tp, cdp); | ||
| 2760 | if (error) { | ||
| 2761 | goto error1; | ||
| 2762 | } | ||
| 2763 | |||
| 2764 | /* | ||
| 2765 | * Drop the "." link from cdp to self. | ||
| 2766 | */ | ||
| 2767 | error = xfs_droplink(tp, cdp); | ||
| 2768 | if (error) { | ||
| 2769 | goto error1; | ||
| 2770 | } | ||
| 2771 | |||
| 2772 | /* Determine these before committing transaction */ | ||
| 2773 | last_cdp_link = (cdp)->i_d.di_nlink==0; | ||
| 2774 | |||
| 2775 | /* | ||
| 2776 | * If this is a synchronous mount, make sure that the | ||
| 2777 | * rmdir transaction goes to disk before returning to | ||
| 2778 | * the user. | ||
| 2779 | */ | ||
| 2780 | if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { | ||
| 2781 | xfs_trans_set_sync(tp); | ||
| 2782 | } | ||
| 2783 | |||
| 2784 | error = xfs_bmap_finish (&tp, &free_list, &committed); | ||
| 2785 | if (error) { | ||
| 2786 | xfs_bmap_cancel(&free_list); | ||
| 2787 | xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | | ||
| 2788 | XFS_TRANS_ABORT)); | ||
| 2789 | goto std_return; | ||
| 2790 | } | ||
| 2791 | |||
| 2792 | error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); | ||
| 2793 | if (error) { | ||
| 2794 | goto std_return; | ||
| 2795 | } | ||
| 2796 | |||
| 2797 | |||
| 2798 | /* Fall through to std_return with error = 0 or the errno | ||
| 2799 | * from xfs_trans_commit. */ | ||
| 2800 | std_return: | ||
| 2801 | if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) { | ||
| 2802 | (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, | ||
| 2803 | dp, DM_RIGHT_NULL, | ||
| 2804 | NULL, DM_RIGHT_NULL, | ||
| 2805 | name->name, NULL, cdp->i_d.di_mode, | ||
| 2806 | error, 0); | ||
| 2807 | } | ||
| 2808 | return error; | ||
| 2809 | |||
| 2810 | error1: | ||
| 2811 | xfs_bmap_cancel(&free_list); | ||
| 2812 | cancel_flags |= XFS_TRANS_ABORT; | ||
| 2813 | /* FALLTHROUGH */ | ||
| 2814 | |||
| 2815 | error_return: | ||
| 2816 | xfs_trans_cancel(tp, cancel_flags); | ||
| 2817 | goto std_return; | ||
| 2818 | } | ||
| 2819 | |||
| 2820 | int | ||
| 2821 | xfs_symlink( | 2438 | xfs_symlink( |
| 2822 | xfs_inode_t *dp, | 2439 | xfs_inode_t *dp, |
| 2823 | struct xfs_name *link_name, | 2440 | struct xfs_name *link_name, |
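The whole of xfs_rmdir is deleted above; its directory-specific checks live on in the unified remove path. The invariant it enforced is worth spelling out: a directory is removable only when its link count is exactly 2 (its own "." plus the parent's entry) and it holds no other entries, since plain files do not bump the directory's link count. A toy version of that emptiness test, with invented struct and field names:

#include <errno.h>
#include <stdio.h>

struct toy_dir {
        int     nlink;          /* 2 == only "." and the parent entry */
        int     nentries;       /* entries other than "." and ".."    */
};

/* Mirrors the ENOTEMPTY checks the old xfs_rmdir performed. */
static int can_rmdir(const struct toy_dir *d)
{
        if (d->nlink != 2)
                return -ENOTEMPTY;      /* a subdirectory still links back via ".." */
        if (d->nentries != 0)
                return -ENOTEMPTY;      /* regular files don't raise nlink */
        return 0;
}

int main(void)
{
        struct toy_dir empty = { .nlink = 2, .nentries = 0 };
        struct toy_dir full  = { .nlink = 2, .nentries = 3 };

        printf("%d %d\n", can_rmdir(&empty), can_rmdir(&full));
        return 0;
}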
| @@ -2886,7 +2503,7 @@ xfs_symlink( | |||
| 2886 | * Make sure that we have allocated dquot(s) on disk. | 2503 | * Make sure that we have allocated dquot(s) on disk. |
| 2887 | */ | 2504 | */ |
| 2888 | error = XFS_QM_DQVOPALLOC(mp, dp, | 2505 | error = XFS_QM_DQVOPALLOC(mp, dp, |
| 2889 | current_fsuid(credp), current_fsgid(credp), prid, | 2506 | current_fsuid(), current_fsgid(), prid, |
| 2890 | XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); | 2507 | XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); |
| 2891 | if (error) | 2508 | if (error) |
| 2892 | goto std_return; | 2509 | goto std_return; |
| @@ -3181,14 +2798,13 @@ int | |||
| 3181 | xfs_reclaim( | 2798 | xfs_reclaim( |
| 3182 | xfs_inode_t *ip) | 2799 | xfs_inode_t *ip) |
| 3183 | { | 2800 | { |
| 3184 | bhv_vnode_t *vp = XFS_ITOV(ip); | ||
| 3185 | 2801 | ||
| 3186 | xfs_itrace_entry(ip); | 2802 | xfs_itrace_entry(ip); |
| 3187 | 2803 | ||
| 3188 | ASSERT(!VN_MAPPED(vp)); | 2804 | ASSERT(!VN_MAPPED(VFS_I(ip))); |
| 3189 | 2805 | ||
| 3190 | /* bad inode, get out here ASAP */ | 2806 | /* bad inode, get out here ASAP */ |
| 3191 | if (VN_BAD(vp)) { | 2807 | if (VN_BAD(VFS_I(ip))) { |
| 3192 | xfs_ireclaim(ip); | 2808 | xfs_ireclaim(ip); |
| 3193 | return 0; | 2809 | return 0; |
| 3194 | } | 2810 | } |
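xfs_reclaim drops the bhv_vnode_t plumbing in favour of VFS_I(ip), which yields the Linux struct inode tied to the XFS inode. A minimal sketch of that accessor pattern, assuming (as the later ip->i_vnode = NULL line suggests) that the filesystem inode simply keeps a back-pointer; all names below are invented stand-ins:

#include <stdio.h>

/* Toy stand-ins for struct inode and xfs_inode. */
struct vfs_inode { unsigned long i_ino; };

struct toy_xfs_inode {
        unsigned long           i_ino;
        struct vfs_inode        *i_vnode;       /* back-pointer set at setup time */
};

/* Analogous in spirit to VFS_I(): map the fs-private inode to the VFS one. */
static inline struct vfs_inode *toy_VFS_I(struct toy_xfs_inode *ip)
{
        return ip->i_vnode;
}

int main(void)
{
        struct vfs_inode vi = { .i_ino = 128 };
        struct toy_xfs_inode xi = { .i_ino = 128, .i_vnode = &vi };

        printf("%lu\n", toy_VFS_I(&xi)->i_ino);
        return 0;
}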
| @@ -3225,7 +2841,7 @@ xfs_reclaim( | |||
| 3225 | XFS_MOUNT_ILOCK(mp); | 2841 | XFS_MOUNT_ILOCK(mp); |
| 3226 | spin_lock(&ip->i_flags_lock); | 2842 | spin_lock(&ip->i_flags_lock); |
| 3227 | __xfs_iflags_set(ip, XFS_IRECLAIMABLE); | 2843 | __xfs_iflags_set(ip, XFS_IRECLAIMABLE); |
| 3228 | vn_to_inode(vp)->i_private = NULL; | 2844 | VFS_I(ip)->i_private = NULL; |
| 3229 | ip->i_vnode = NULL; | 2845 | ip->i_vnode = NULL; |
| 3230 | spin_unlock(&ip->i_flags_lock); | 2846 | spin_unlock(&ip->i_flags_lock); |
| 3231 | list_add_tail(&ip->i_reclaim, &mp->m_del_inodes); | 2847 | list_add_tail(&ip->i_reclaim, &mp->m_del_inodes); |
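This hunk also shows the shape of the reclaim handoff: under the mount and flags locks the inode is flagged XFS_IRECLAIMABLE, its VFS association is severed, and it is queued on the per-mount delete list. A simplified userspace sketch of that mark-detach-enqueue step, with a mutex and a hand-rolled list standing in for the kernel locking and list primitives:

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct toy_inode {
        bool                    reclaimable;
        void                    *vfs_backptr;   /* stands in for ip->i_vnode    */
        struct toy_inode        *next;          /* stands in for the list links */
};

struct toy_mount {
        pthread_mutex_t         ilock;          /* stands in for XFS_MOUNT_ILOCK */
        struct toy_inode        *del_inodes;
};

static void queue_for_reclaim(struct toy_mount *mp, struct toy_inode *ip)
{
        pthread_mutex_lock(&mp->ilock);
        ip->reclaimable = true;         /* XFS_IRECLAIMABLE in the real code */
        ip->vfs_backptr = NULL;         /* sever the VFS <-> XFS association */
        ip->next = mp->del_inodes;      /* push onto the mount's delete list */
        mp->del_inodes = ip;
        pthread_mutex_unlock(&mp->ilock);
}

int main(void)
{
        struct toy_mount mp = { PTHREAD_MUTEX_INITIALIZER, NULL };
        struct toy_inode ip = { 0 };

        queue_for_reclaim(&mp, &ip);
        return mp.del_inodes == &ip ? 0 : 1;
}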
| @@ -3241,8 +2857,7 @@ xfs_finish_reclaim( | |||
| 3241 | int sync_mode) | 2857 | int sync_mode) |
| 3242 | { | 2858 | { |
| 3243 | xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino); | 2859 | xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino); |
| 3244 | bhv_vnode_t *vp = XFS_ITOV_NULL(ip); | 2860 | struct inode *vp = VFS_I(ip); |
| 3245 | int error; | ||
| 3246 | 2861 | ||
| 3247 | if (vp && VN_BAD(vp)) | 2862 | if (vp && VN_BAD(vp)) |
| 3248 | goto reclaim; | 2863 | goto reclaim; |
| @@ -3285,29 +2900,16 @@ xfs_finish_reclaim( | |||
| 3285 | xfs_iflock(ip); | 2900 | xfs_iflock(ip); |
| 3286 | } | 2901 | } |
| 3287 | 2902 | ||
| 3288 | if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { | 2903 | /* |
| 3289 | if (ip->i_update_core || | 2904 | * In the case of a forced shutdown we rely on xfs_iflush() to |
| 3290 | ((ip->i_itemp != NULL) && | 2905 | * wait for the inode to be unpinned before returning an error. |
| 3291 | (ip->i_itemp->ili_format.ilf_fields != 0))) { | 2906 | */ |
| 3292 | error = xfs_iflush(ip, sync_mode); | 2907 | if (xfs_iflush(ip, sync_mode) == 0) { |
| 3293 | /* | 2908 | /* synchronize with xfs_iflush_done */ |
| 3294 | * If we hit an error, typically because of filesystem | 2909 | xfs_iflock(ip); |
| 3295 | * shutdown, we don't need to let vn_reclaim know | ||
| 3296 | * because we're gonna reclaim the inode anyway. | ||
| 3297 | */ | ||
| 3298 | if (error) { | ||
| 3299 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | ||
| 3300 | goto reclaim; | ||
| 3301 | } | ||
| 3302 | xfs_iflock(ip); /* synchronize with xfs_iflush_done */ | ||
| 3303 | } | ||
| 3304 | |||
| 3305 | ASSERT(ip->i_update_core == 0); | ||
| 3306 | ASSERT(ip->i_itemp == NULL || | ||
| 3307 | ip->i_itemp->ili_format.ilf_fields == 0); | ||
| 3308 | } | 2911 | } |
| 3309 | 2912 | ||
| 3310 | xfs_ifunlock(ip); | ||
| 3311 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | 2913 | xfs_iunlock(ip, XFS_ILOCK_EXCL); |
| 3312 | 2914 | ||
| 3313 | reclaim: | 2915 | reclaim: |
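The rewritten xfs_finish_reclaim always calls xfs_iflush() and, when the flush is queued successfully, cycles the flush lock (xfs_iflock followed by xfs_ifunlock) so it only proceeds once xfs_iflush_done has released it. A userspace sketch of that lock-cycling wait, using a POSIX semaphore as a stand-in for the flush lock:

#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>
#include <unistd.h>

/* The semaphore stands in for the inode flush lock: "held" (0) while the write
 * is in flight, posted by the completion side (xfs_iflush_done in the real code). */
static sem_t flush_lock;

static void *flush_worker(void *arg)
{
        (void)arg;
        usleep(10000);                  /* simulate the write being in flight */
        puts("flush done");
        sem_post(&flush_lock);          /* completion releases the flush lock */
        return NULL;
}

int main(void)
{
        pthread_t t;

        sem_init(&flush_lock, 0, 0);    /* 0 == lock held, as after a flush */
        pthread_create(&t, NULL, flush_worker, NULL);

        /* Cycle the lock to wait for completion, like xfs_iflock/xfs_ifunlock. */
        sem_wait(&flush_lock);
        sem_post(&flush_lock);

        pthread_join(t, NULL);
        return 0;
}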
| @@ -3418,7 +3020,7 @@ xfs_alloc_file_space( | |||
| 3418 | 3020 | ||
| 3419 | /* Generate a DMAPI event if needed. */ | 3021 | /* Generate a DMAPI event if needed. */ |
| 3420 | if (alloc_type != 0 && offset < ip->i_size && | 3022 | if (alloc_type != 0 && offset < ip->i_size && |
| 3421 | (attr_flags&ATTR_DMI) == 0 && | 3023 | (attr_flags & XFS_ATTR_DMI) == 0 && |
| 3422 | DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) { | 3024 | DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) { |
| 3423 | xfs_off_t end_dmi_offset; | 3025 | xfs_off_t end_dmi_offset; |
| 3424 | 3026 | ||
| @@ -3532,7 +3134,7 @@ retry: | |||
| 3532 | allocatesize_fsb -= allocated_fsb; | 3134 | allocatesize_fsb -= allocated_fsb; |
| 3533 | } | 3135 | } |
| 3534 | dmapi_enospc_check: | 3136 | dmapi_enospc_check: |
| 3535 | if (error == ENOSPC && (attr_flags & ATTR_DMI) == 0 && | 3137 | if (error == ENOSPC && (attr_flags & XFS_ATTR_DMI) == 0 && |
| 3536 | DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) { | 3138 | DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) { |
| 3537 | error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE, | 3139 | error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE, |
| 3538 | ip, DM_RIGHT_NULL, | 3140 | ip, DM_RIGHT_NULL, |
| @@ -3558,6 +3160,13 @@ error1: /* Just cancel transaction */ | |||
| 3558 | /* | 3160 | /* |
| 3559 | * Zero file bytes between startoff and endoff inclusive. | 3161 | * Zero file bytes between startoff and endoff inclusive. |
| 3560 | * The iolock is held exclusive and no blocks are buffered. | 3162 | * The iolock is held exclusive and no blocks are buffered. |
| 3163 | * | ||
| 3164 | * This function is used by xfs_free_file_space() to zero | ||
| 3165 | * partial blocks when the range to free is not block aligned. | ||
| 3166 | * When unreserving space with boundaries that are not block | ||
| 3167 | * aligned we round up the start and round down the end | ||
| 3168 | * boundaries and then use this function to zero the parts of | ||
| 3169 | * the blocks that got dropped during the rounding. | ||
| 3561 | */ | 3170 | */ |
| 3562 | STATIC int | 3171 | STATIC int |
| 3563 | xfs_zero_remaining_bytes( | 3172 | xfs_zero_remaining_bytes( |
| @@ -3574,6 +3183,17 @@ xfs_zero_remaining_bytes( | |||
| 3574 | int nimap; | 3183 | int nimap; |
| 3575 | int error = 0; | 3184 | int error = 0; |
| 3576 | 3185 | ||
| 3186 | /* | ||
| 3187 | * Avoid doing I/O beyond eof - it's not necessary | ||
| 3188 | * since nothing can read beyond eof. The space will | ||
| 3189 | * be zeroed when the file is extended anyway. | ||
| 3190 | */ | ||
| 3191 | if (startoff >= ip->i_size) | ||
| 3192 | return 0; | ||
| 3193 | |||
| 3194 | if (endoff > ip->i_size) | ||
| 3195 | endoff = ip->i_size; | ||
| 3196 | |||
| 3577 | bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize, | 3197 | bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize, |
| 3578 | XFS_IS_REALTIME_INODE(ip) ? | 3198 | XFS_IS_REALTIME_INODE(ip) ? |
| 3579 | mp->m_rtdev_targp : mp->m_ddev_targp); | 3199 | mp->m_rtdev_targp : mp->m_ddev_targp); |
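Two refinements land in xfs_zero_remaining_bytes: the new comment explains that it zeroes the partial blocks left over when a freed range is rounded to block boundaries, and the added check clamps the work at EOF because nothing can read past it. A small sketch of that range arithmetic for the leading partial block, assuming a power-of-two block size (the helper name is invented):

#include <stdint.h>
#include <stdio.h>

struct zero_range { int64_t start, end; };      /* inclusive bytes, -1 == none */

/*
 * Given the start of a range to free, a block size, and the file size, compute
 * the leading partial-block piece that must be zeroed and clamp it at EOF.
 * Purely illustrative arithmetic, not XFS code.
 */
static struct zero_range leading_partial(int64_t offset, uint32_t bsize,
                                         int64_t isize)
{
        struct zero_range r = { -1, -1 };
        int64_t aligned = (offset + bsize - 1) & ~(int64_t)(bsize - 1);

        if (offset >= isize || aligned == offset)
                return r;               /* past EOF or already block aligned */
        r.start = offset;
        r.end = aligned - 1;            /* rest of the block offset sits in  */
        if (r.end > isize - 1)
                r.end = isize - 1;      /* don't zero beyond EOF             */
        return r;
}

int main(void)
{
        struct zero_range r = leading_partial(4096 + 100, 4096, 1 << 20);

        printf("zero [%lld, %lld]\n", (long long)r.start, (long long)r.end);
        return 0;
}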
| @@ -3643,7 +3263,6 @@ xfs_free_file_space( | |||
| 3643 | xfs_off_t len, | 3263 | xfs_off_t len, |
| 3644 | int attr_flags) | 3264 | int attr_flags) |
| 3645 | { | 3265 | { |
| 3646 | bhv_vnode_t *vp; | ||
| 3647 | int committed; | 3266 | int committed; |
| 3648 | int done; | 3267 | int done; |
| 3649 | xfs_off_t end_dmi_offset; | 3268 | xfs_off_t end_dmi_offset; |
| @@ -3663,7 +3282,6 @@ xfs_free_file_space( | |||
| 3663 | xfs_trans_t *tp; | 3282 | xfs_trans_t *tp; |
| 3664 | int need_iolock = 1; | 3283 | int need_iolock = 1; |
| 3665 | 3284 | ||
| 3666 | vp = XFS_ITOV(ip); | ||
| 3667 | mp = ip->i_mount; | 3285 | mp = ip->i_mount; |
| 3668 | 3286 | ||
| 3669 | xfs_itrace_entry(ip); | 3287 | xfs_itrace_entry(ip); |
| @@ -3679,7 +3297,7 @@ xfs_free_file_space( | |||
| 3679 | end_dmi_offset = offset + len; | 3297 | end_dmi_offset = offset + len; |
| 3680 | endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset); | 3298 | endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset); |
| 3681 | 3299 | ||
| 3682 | if (offset < ip->i_size && (attr_flags & ATTR_DMI) == 0 && | 3300 | if (offset < ip->i_size && (attr_flags & XFS_ATTR_DMI) == 0 && |
| 3683 | DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) { | 3301 | DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) { |
| 3684 | if (end_dmi_offset > ip->i_size) | 3302 | if (end_dmi_offset > ip->i_size) |
| 3685 | end_dmi_offset = ip->i_size; | 3303 | end_dmi_offset = ip->i_size; |
| @@ -3690,7 +3308,7 @@ xfs_free_file_space( | |||
| 3690 | return error; | 3308 | return error; |
| 3691 | } | 3309 | } |
| 3692 | 3310 | ||
| 3693 | if (attr_flags & ATTR_NOLOCK) | 3311 | if (attr_flags & XFS_ATTR_NOLOCK) |
| 3694 | need_iolock = 0; | 3312 | need_iolock = 0; |
| 3695 | if (need_iolock) { | 3313 | if (need_iolock) { |
| 3696 | xfs_ilock(ip, XFS_IOLOCK_EXCL); | 3314 | xfs_ilock(ip, XFS_IOLOCK_EXCL); |
| @@ -3700,7 +3318,7 @@ xfs_free_file_space( | |||
| 3700 | rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); | 3318 | rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); |
| 3701 | ioffset = offset & ~(rounding - 1); | 3319 | ioffset = offset & ~(rounding - 1); |
| 3702 | 3320 | ||
| 3703 | if (VN_CACHED(vp) != 0) { | 3321 | if (VN_CACHED(VFS_I(ip)) != 0) { |
| 3704 | xfs_inval_cached_trace(ip, ioffset, -1, ioffset, -1); | 3322 | xfs_inval_cached_trace(ip, ioffset, -1, ioffset, -1); |
| 3705 | error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED); | 3323 | error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED); |
| 3706 | if (error) | 3324 | if (error) |
| @@ -3867,7 +3485,7 @@ xfs_change_file_space( | |||
| 3867 | xfs_off_t startoffset; | 3485 | xfs_off_t startoffset; |
| 3868 | xfs_off_t llen; | 3486 | xfs_off_t llen; |
| 3869 | xfs_trans_t *tp; | 3487 | xfs_trans_t *tp; |
| 3870 | bhv_vattr_t va; | 3488 | struct iattr iattr; |
| 3871 | 3489 | ||
| 3872 | xfs_itrace_entry(ip); | 3490 | xfs_itrace_entry(ip); |
| 3873 | 3491 | ||
| @@ -3941,10 +3559,10 @@ xfs_change_file_space( | |||
| 3941 | break; | 3559 | break; |
| 3942 | } | 3560 | } |
| 3943 | 3561 | ||
| 3944 | va.va_mask = XFS_AT_SIZE; | 3562 | iattr.ia_valid = ATTR_SIZE; |
| 3945 | va.va_size = startoffset; | 3563 | iattr.ia_size = startoffset; |
| 3946 | 3564 | ||
| 3947 | error = xfs_setattr(ip, &va, attr_flags, credp); | 3565 | error = xfs_setattr(ip, &iattr, attr_flags, credp); |
| 3948 | 3566 | ||
| 3949 | if (error) | 3567 | if (error) |
| 3950 | return error; | 3568 | return error; |
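xfs_change_file_space now builds a regular struct iattr (ia_valid = ATTR_SIZE, ia_size = the new size) for xfs_setattr instead of the old bhv_vattr_t. The sketch below mocks up just those two members rather than pulling in the real <linux/fs.h> definition, so the toy_* names are placeholders:

#include <stdint.h>
#include <stdio.h>

#define TOY_ATTR_SIZE   (1 << 3)        /* stands in for ATTR_SIZE */

/* Cut-down mock of struct iattr: only the members this hunk touches. */
struct toy_iattr {
        unsigned int    ia_valid;       /* bitmask saying which fields are set */
        int64_t         ia_size;        /* requested new file size             */
};

static int toy_setattr(const struct toy_iattr *ia)
{
        if (ia->ia_valid & TOY_ATTR_SIZE)
                printf("truncate to %lld bytes\n", (long long)ia->ia_size);
        return 0;
}

int main(void)
{
        struct toy_iattr ia = { .ia_valid = TOY_ATTR_SIZE, .ia_size = 65536 };

        return toy_setattr(&ia);
}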
| @@ -3974,7 +3592,7 @@ xfs_change_file_space( | |||
| 3974 | xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); | 3592 | xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); |
| 3975 | xfs_trans_ihold(tp, ip); | 3593 | xfs_trans_ihold(tp, ip); |
| 3976 | 3594 | ||
| 3977 | if ((attr_flags & ATTR_DMI) == 0) { | 3595 | if ((attr_flags & XFS_ATTR_DMI) == 0) { |
| 3978 | ip->i_d.di_mode &= ~S_ISUID; | 3596 | ip->i_d.di_mode &= ~S_ISUID; |
| 3979 | 3597 | ||
| 3980 | /* | 3598 | /* |
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 57335ba4ce53..e932a96bec54 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h | |||
| @@ -2,9 +2,9 @@ | |||
| 2 | #define _XFS_VNODEOPS_H 1 | 2 | #define _XFS_VNODEOPS_H 1 |
| 3 | 3 | ||
| 4 | struct attrlist_cursor_kern; | 4 | struct attrlist_cursor_kern; |
| 5 | struct bhv_vattr; | ||
| 6 | struct cred; | 5 | struct cred; |
| 7 | struct file; | 6 | struct file; |
| 7 | struct iattr; | ||
| 8 | struct inode; | 8 | struct inode; |
| 9 | struct iovec; | 9 | struct iovec; |
| 10 | struct kiocb; | 10 | struct kiocb; |
| @@ -15,14 +15,18 @@ struct xfs_iomap; | |||
| 15 | 15 | ||
| 16 | 16 | ||
| 17 | int xfs_open(struct xfs_inode *ip); | 17 | int xfs_open(struct xfs_inode *ip); |
| 18 | int xfs_setattr(struct xfs_inode *ip, struct bhv_vattr *vap, int flags, | 18 | int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags, |
| 19 | struct cred *credp); | 19 | struct cred *credp); |
| 20 | #define XFS_ATTR_DMI 0x01 /* invocation from a DMI function */ | ||
| 21 | #define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */ | ||
| 22 | #define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */ | ||
| 23 | |||
| 20 | int xfs_readlink(struct xfs_inode *ip, char *link); | 24 | int xfs_readlink(struct xfs_inode *ip, char *link); |
| 21 | int xfs_fsync(struct xfs_inode *ip); | 25 | int xfs_fsync(struct xfs_inode *ip); |
| 22 | int xfs_release(struct xfs_inode *ip); | 26 | int xfs_release(struct xfs_inode *ip); |
| 23 | int xfs_inactive(struct xfs_inode *ip); | 27 | int xfs_inactive(struct xfs_inode *ip); |
| 24 | int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, | 28 | int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, |
| 25 | struct xfs_inode **ipp); | 29 | struct xfs_inode **ipp, struct xfs_name *ci_name); |
| 26 | int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode, | 30 | int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode, |
| 27 | xfs_dev_t rdev, struct xfs_inode **ipp, struct cred *credp); | 31 | xfs_dev_t rdev, struct xfs_inode **ipp, struct cred *credp); |
| 28 | int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, | 32 | int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, |
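The header gains the XFS_ATTR_DMI/NONBLOCK/NOLOCK bits next to xfs_setattr()'s prototype, matching the (attr_flags & XFS_ATTR_*) tests in the .c hunks above. They are ordinary independent flag bits; a trivial illustration of defining and testing such a mask, with renamed TOY_* constants so it compiles standalone:

#include <stdio.h>

#define TOY_ATTR_DMI            0x01    /* invocation from a DMI function     */
#define TOY_ATTR_NONBLOCK       0x02    /* fail with EAGAIN rather than block */
#define TOY_ATTR_NOLOCK         0x04    /* caller already holds the locks     */

static void describe(int attr_flags)
{
        if (attr_flags & TOY_ATTR_DMI)
                puts("skip DMAPI event generation");
        if ((attr_flags & TOY_ATTR_NOLOCK) == 0)
                puts("take the iolock before proceeding");
}

int main(void)
{
        describe(TOY_ATTR_DMI | TOY_ATTR_NOLOCK);
        describe(0);
        return 0;
}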
| @@ -31,8 +35,6 @@ int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, | |||
| 31 | struct xfs_name *target_name); | 35 | struct xfs_name *target_name); |
| 32 | int xfs_mkdir(struct xfs_inode *dp, struct xfs_name *dir_name, | 36 | int xfs_mkdir(struct xfs_inode *dp, struct xfs_name *dir_name, |
| 33 | mode_t mode, struct xfs_inode **ipp, struct cred *credp); | 37 | mode_t mode, struct xfs_inode **ipp, struct cred *credp); |
| 34 | int xfs_rmdir(struct xfs_inode *dp, struct xfs_name *name, | ||
| 35 | struct xfs_inode *cdp); | ||
| 36 | int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize, | 38 | int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize, |
| 37 | xfs_off_t *offset, filldir_t filldir); | 39 | xfs_off_t *offset, filldir_t filldir); |
| 38 | int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, | 40 | int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, |
