aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs.c6
-rw-r--r--fs/9p/vfs_inode.c4
-rw-r--r--fs/Kconfig40
-rw-r--r--fs/Makefile1
-rw-r--r--fs/adfs/inode.c1
-rw-r--r--fs/adfs/super.c6
-rw-r--r--fs/affs/affs.h1
-rw-r--r--fs/affs/super.c7
-rw-r--r--fs/afs/inode.c1
-rw-r--r--fs/afs/proc.c2
-rw-r--r--fs/afs/vlocation.c3
-rw-r--r--fs/afs/volume.c3
-rw-r--r--fs/autofs/autofs_i.h2
-rw-r--r--fs/autofs/inode.c7
-rw-r--r--fs/autofs/symlink.c2
-rw-r--r--fs/autofs4/autofs_i.h2
-rw-r--r--fs/autofs4/expire.c6
-rw-r--r--fs/autofs4/inode.c2
-rw-r--r--fs/autofs4/root.c46
-rw-r--r--fs/befs/linuxvfs.c5
-rw-r--r--fs/bfs/dir.c2
-rw-r--r--fs/bfs/inode.c10
-rw-r--r--fs/binfmt_aout.c14
-rw-r--r--fs/binfmt_elf.c26
-rw-r--r--fs/binfmt_elf_fdpic.c7
-rw-r--r--fs/binfmt_misc.c15
-rw-r--r--fs/block_dev.c36
-rw-r--r--fs/buffer.c2
-rw-r--r--fs/char_dev.c107
-rw-r--r--fs/cifs/cifsfs.c21
-rw-r--r--fs/cifs/readdir.c5
-rw-r--r--fs/coda/coda_linux.c2
-rw-r--r--fs/coda/dir.c2
-rw-r--r--fs/coda/inode.c3
-rw-r--r--fs/compat.c5
-rw-r--r--fs/configfs/file.c3
-rw-r--r--fs/configfs/inode.c4
-rw-r--r--fs/cramfs/inode.c15
-rw-r--r--fs/cramfs/uncompress.c3
-rw-r--r--fs/dcache.c164
-rw-r--r--fs/debugfs/file.c60
-rw-r--r--fs/debugfs/inode.c20
-rw-r--r--fs/devpts/inode.c6
-rw-r--r--fs/dquot.c5
-rw-r--r--fs/efs/super.c6
-rw-r--r--fs/eventpoll.c1
-rw-r--r--fs/exec.c56
-rw-r--r--fs/ext2/acl.c4
-rw-r--r--fs/ext2/ialloc.c1
-rw-r--r--fs/ext2/inode.c1
-rw-r--r--fs/ext2/super.c38
-rw-r--r--fs/ext2/xattr.c3
-rw-r--r--fs/ext3/acl.c6
-rw-r--r--fs/ext3/balloc.c350
-rw-r--r--fs/ext3/bitmap.c2
-rw-r--r--fs/ext3/dir.c19
-rw-r--r--fs/ext3/file.c2
-rw-r--r--fs/ext3/fsync.c6
-rw-r--r--fs/ext3/hash.c8
-rw-r--r--fs/ext3/ialloc.c55
-rw-r--r--fs/ext3/inode.c77
-rw-r--r--fs/ext3/namei.c50
-rw-r--r--fs/ext3/resize.c42
-rw-r--r--fs/ext3/super.c110
-rw-r--r--fs/ext3/xattr.c16
-rw-r--r--fs/fat/cache.c3
-rw-r--r--fs/fat/file.c13
-rw-r--r--fs/fat/inode.c97
-rw-r--r--fs/file.c84
-rw-r--r--fs/file_table.c2
-rw-r--r--fs/filesystems.c2
-rw-r--r--fs/freevxfs/vxfs.h2
-rw-r--r--fs/freevxfs/vxfs_inode.c5
-rw-r--r--fs/freevxfs/vxfs_super.c11
-rw-r--r--fs/fuse/control.c6
-rw-r--r--fs/fuse/dev.c2
-rw-r--r--fs/fuse/dir.c2
-rw-r--r--fs/fuse/inode.c2
-rw-r--r--fs/generic_acl.c197
-rw-r--r--fs/hfs/bnode.c3
-rw-r--r--fs/hfs/btree.c3
-rw-r--r--fs/hfs/inode.c2
-rw-r--r--fs/hfs/super.c6
-rw-r--r--fs/hfsplus/bnode.c3
-rw-r--r--fs/hfsplus/btree.c3
-rw-r--r--fs/hfsplus/inode.c2
-rw-r--r--fs/hfsplus/super.c3
-rw-r--r--fs/hostfs/hostfs_kern.c1
-rw-r--r--fs/hpfs/buffer.c2
-rw-r--r--fs/hpfs/hpfs_fn.h1
-rw-r--r--fs/hpfs/inode.c1
-rw-r--r--fs/hpfs/super.c7
-rw-r--r--fs/hppfs/hppfs_kern.c1
-rw-r--r--fs/hugetlbfs/inode.c3
-rw-r--r--fs/inode.c7
-rw-r--r--fs/isofs/inode.c58
-rw-r--r--fs/jbd/checkpoint.c33
-rw-r--r--fs/jbd/commit.c182
-rw-r--r--fs/jbd/journal.c95
-rw-r--r--fs/jbd/recovery.c58
-rw-r--r--fs/jbd/revoke.c70
-rw-r--r--fs/jbd/transaction.c134
-rw-r--r--fs/jffs/inode-v23.c44
-rw-r--r--fs/jffs/intrep.c11
-rw-r--r--fs/jffs/jffs_fm.c6
-rw-r--r--fs/jffs2/fs.c2
-rw-r--r--fs/jffs2/super.c3
-rw-r--r--fs/jfs/jfs_extent.c2
-rw-r--r--fs/jfs/jfs_imap.c1
-rw-r--r--fs/jfs/jfs_inode.c1
-rw-r--r--fs/jfs/jfs_metapage.c2
-rw-r--r--fs/jfs/jfs_txnmgr.c4
-rw-r--r--fs/libfs.c14
-rw-r--r--fs/lockd/clntlock.c2
-rw-r--r--fs/lockd/clntproc.c12
-rw-r--r--fs/lockd/host.c55
-rw-r--r--fs/lockd/mon.c41
-rw-r--r--fs/lockd/svcsubs.c3
-rw-r--r--fs/mbcache.c1
-rw-r--r--fs/minix/bitmap.c2
-rw-r--r--fs/minix/inode.c13
-rw-r--r--fs/msdos/namei.c11
-rw-r--r--fs/namei.c142
-rw-r--r--fs/namespace.c22
-rw-r--r--fs/ncpfs/inode.c7
-rw-r--r--fs/ncpfs/symlink.c4
-rw-r--r--fs/nfs/Makefile6
-rw-r--r--fs/nfs/callback.c31
-rw-r--r--fs/nfs/callback.h7
-rw-r--r--fs/nfs/callback_proc.c13
-rw-r--r--fs/nfs/client.c1448
-rw-r--r--fs/nfs/delegation.c42
-rw-r--r--fs/nfs/delegation.h10
-rw-r--r--fs/nfs/dir.c341
-rw-r--r--fs/nfs/direct.c3
-rw-r--r--fs/nfs/file.c4
-rw-r--r--fs/nfs/getroot.c311
-rw-r--r--fs/nfs/idmap.c45
-rw-r--r--fs/nfs/inode.c55
-rw-r--r--fs/nfs/internal.h105
-rw-r--r--fs/nfs/mount_clnt.c30
-rw-r--r--fs/nfs/namespace.c46
-rw-r--r--fs/nfs/nfs2xdr.c21
-rw-r--r--fs/nfs/nfs3proc.c44
-rw-r--r--fs/nfs/nfs3xdr.c7
-rw-r--r--fs/nfs/nfs4_fs.h78
-rw-r--r--fs/nfs/nfs4namespace.c118
-rw-r--r--fs/nfs/nfs4proc.c218
-rw-r--r--fs/nfs/nfs4renewd.c20
-rw-r--r--fs/nfs/nfs4state.c174
-rw-r--r--fs/nfs/nfs4xdr.c50
-rw-r--r--fs/nfs/pagelist.c3
-rw-r--r--fs/nfs/proc.c43
-rw-r--r--fs/nfs/read.c24
-rw-r--r--fs/nfs/super.c1421
-rw-r--r--fs/nfs/write.c14
-rw-r--r--fs/nfsd/nfs4callback.c66
-rw-r--r--fs/nfsd/nfs4idmap.c3
-rw-r--r--fs/nfsd/nfs4state.c8
-rw-r--r--fs/ntfs/dir.c5
-rw-r--r--fs/ntfs/inode.c6
-rw-r--r--fs/ntfs/mft.c9
-rw-r--r--fs/ntfs/super.c28
-rw-r--r--fs/ntfs/unistr.c4
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h8
-rw-r--r--fs/ocfs2/dcache.c359
-rw-r--r--fs/ocfs2/dcache.h27
-rw-r--r--fs/ocfs2/dlm/dlmapi.h1
-rw-r--r--fs/ocfs2/dlm/dlmast.c6
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h1
-rw-r--r--fs/ocfs2/dlm/dlmfs.c6
-rw-r--r--fs/ocfs2/dlm/dlmlock.c10
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c4
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c3
-rw-r--r--fs/ocfs2/dlm/userdlm.c81
-rw-r--r--fs/ocfs2/dlm/userdlm.h1
-rw-r--r--fs/ocfs2/dlmglue.c1096
-rw-r--r--fs/ocfs2/dlmglue.h21
-rw-r--r--fs/ocfs2/export.c8
-rw-r--r--fs/ocfs2/inode.c160
-rw-r--r--fs/ocfs2/inode.h8
-rw-r--r--fs/ocfs2/journal.c3
-rw-r--r--fs/ocfs2/namei.c116
-rw-r--r--fs/ocfs2/ocfs2_lockid.h25
-rw-r--r--fs/ocfs2/super.c6
-rw-r--r--fs/ocfs2/sysfile.c6
-rw-r--r--fs/ocfs2/vote.c180
-rw-r--r--fs/ocfs2/vote.h5
-rw-r--r--fs/open.c15
-rw-r--r--fs/openpromfs/inode.c2
-rw-r--r--fs/partitions/efi.c9
-rw-r--r--fs/partitions/msdos.c31
-rw-r--r--fs/pipe.c1
-rw-r--r--fs/proc/array.c3
-rw-r--r--fs/proc/base.c3
-rw-r--r--fs/proc/internal.h1
-rw-r--r--fs/proc/kcore.c10
-rw-r--r--fs/proc/nommu.c20
-rw-r--r--fs/proc/proc_misc.c11
-rw-r--r--fs/proc/task_mmu.c5
-rw-r--r--fs/proc/task_nommu.c74
-rw-r--r--fs/qnx4/inode.c8
-rw-r--r--fs/ramfs/inode.c1
-rw-r--r--fs/reiserfs/Makefile2
-rw-r--r--fs/reiserfs/file.c2
-rw-r--r--fs/reiserfs/inode.c18
-rw-r--r--fs/reiserfs/journal.c54
-rw-r--r--fs/reiserfs/super.c31
-rw-r--r--fs/romfs/inode.c3
-rw-r--r--fs/select.c8
-rw-r--r--fs/smbfs/inode.c5
-rw-r--r--fs/smbfs/proc.c1
-rw-r--r--fs/smbfs/request.c3
-rw-r--r--fs/stat.c3
-rw-r--r--fs/super.c2
-rw-r--r--fs/sysfs/bin.c13
-rw-r--r--fs/sysfs/dir.c2
-rw-r--r--fs/sysfs/inode.c12
-rw-r--r--fs/sysfs/symlink.c14
-rw-r--r--fs/sysfs/sysfs.h2
-rw-r--r--fs/sysv/ialloc.c2
-rw-r--r--fs/sysv/inode.c2
-rw-r--r--fs/sysv/super.c6
-rw-r--r--fs/udf/ialloc.c7
-rw-r--r--fs/udf/inode.c2
-rw-r--r--fs/udf/super.c7
-rw-r--r--fs/ufs/ialloc.c1
-rw-r--r--fs/ufs/inode.c1
-rw-r--r--fs/ufs/super.c6
-rw-r--r--fs/xfs/Makefile-linux-2.61
-rw-r--r--fs/xfs/linux-2.6/kmem.c29
-rw-r--r--fs/xfs/linux-2.6/kmem.h10
-rw-r--r--fs/xfs/linux-2.6/sema.h2
-rw-r--r--fs/xfs/linux-2.6/sv.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c9
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c51
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h7
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_globals.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c19
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c29
-rw-r--r--fs/xfs/linux-2.6/xfs_linux.h14
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.c10
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c3
-rw-r--r--fs/xfs/linux-2.6/xfs_vfs.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_vnode.c1
-rw-r--r--fs/xfs/linux-2.6/xfs_vnode.h2
-rw-r--r--fs/xfs/quota/xfs_dquot_item.c26
-rw-r--r--fs/xfs/quota/xfs_qm.c14
-rw-r--r--fs/xfs/quota/xfs_qm.h6
-rw-r--r--fs/xfs/quota/xfs_quota_priv.h2
-rw-r--r--fs/xfs/support/ktrace.c2
-rw-r--r--fs/xfs/xfs_ag.h2
-rw-r--r--fs/xfs/xfs_alloc.c10
-rw-r--r--fs/xfs/xfs_alloc_btree.c132
-rw-r--r--fs/xfs/xfs_attr.c181
-rw-r--r--fs/xfs/xfs_attr.h8
-rw-r--r--fs/xfs/xfs_attr_leaf.c351
-rw-r--r--fs/xfs/xfs_attr_leaf.h41
-rw-r--r--fs/xfs/xfs_behavior.c20
-rw-r--r--fs/xfs/xfs_behavior.h2
-rw-r--r--fs/xfs/xfs_bmap.c90
-rw-r--r--fs/xfs/xfs_bmap_btree.c113
-rw-r--r--fs/xfs/xfs_bmap_btree.h11
-rw-r--r--fs/xfs/xfs_btree.c8
-rw-r--r--fs/xfs/xfs_btree.h5
-rw-r--r--fs/xfs/xfs_buf_item.c22
-rw-r--r--fs/xfs/xfs_da_btree.c33
-rw-r--r--fs/xfs/xfs_error.h9
-rw-r--r--fs/xfs/xfs_extfree_item.c69
-rw-r--r--fs/xfs/xfs_extfree_item.h50
-rw-r--r--fs/xfs/xfs_fs.h8
-rw-r--r--fs/xfs/xfs_ialloc.c11
-rw-r--r--fs/xfs/xfs_ialloc_btree.c62
-rw-r--r--fs/xfs/xfs_ialloc_btree.h19
-rw-r--r--fs/xfs/xfs_iget.c44
-rw-r--r--fs/xfs/xfs_inode.c30
-rw-r--r--fs/xfs/xfs_inode.h12
-rw-r--r--fs/xfs/xfs_inode_item.c16
-rw-r--r--fs/xfs/xfs_inode_item.h66
-rw-r--r--fs/xfs/xfs_iomap.c89
-rw-r--r--fs/xfs/xfs_itable.c184
-rw-r--r--fs/xfs/xfs_itable.h16
-rw-r--r--fs/xfs/xfs_log.c19
-rw-r--r--fs/xfs/xfs_log.h8
-rw-r--r--fs/xfs/xfs_log_priv.h10
-rw-r--r--fs/xfs/xfs_mount.h5
-rw-r--r--fs/xfs/xfs_quota.h2
-rw-r--r--fs/xfs/xfs_rtalloc.c38
-rw-r--r--fs/xfs/xfs_sb.h22
-rw-r--r--fs/xfs/xfs_trans.h2
-rw-r--r--fs/xfs/xfs_trans_ail.c4
-rw-r--r--fs/xfs/xfs_trans_priv.h12
-rw-r--r--fs/xfs/xfs_vfsops.c2
-rw-r--r--fs/xfs/xfs_vnodeops.c26
295 files changed, 7621 insertions, 4941 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 22f7ccd58d38..0f628041e3f7 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -460,8 +460,10 @@ static int __init init_v9fs(void)
460 460
461 ret = v9fs_mux_global_init(); 461 ret = v9fs_mux_global_init();
462 if (!ret) 462 if (!ret)
463 ret = register_filesystem(&v9fs_fs_type); 463 return ret;
464 464 ret = register_filesystem(&v9fs_fs_type);
465 if (!ret)
466 v9fs_mux_global_exit();
465 return ret; 467 return ret;
466} 468}
467 469
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index eae50c9d6dc4..7a7ec2d1d2f4 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -204,7 +204,6 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
204 inode->i_mode = mode; 204 inode->i_mode = mode;
205 inode->i_uid = current->fsuid; 205 inode->i_uid = current->fsuid;
206 inode->i_gid = current->fsgid; 206 inode->i_gid = current->fsgid;
207 inode->i_blksize = sb->s_blocksize;
208 inode->i_blocks = 0; 207 inode->i_blocks = 0;
209 inode->i_rdev = 0; 208 inode->i_rdev = 0;
210 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 209 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -950,9 +949,8 @@ v9fs_stat2inode(struct v9fs_stat *stat, struct inode *inode,
950 949
951 inode->i_size = stat->length; 950 inode->i_size = stat->length;
952 951
953 inode->i_blksize = sb->s_blocksize;
954 inode->i_blocks = 952 inode->i_blocks =
955 (inode->i_size + inode->i_blksize - 1) >> sb->s_blocksize_bits; 953 (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
956} 954}
957 955
958/** 956/**
diff --git a/fs/Kconfig b/fs/Kconfig
index 530581628311..4fd9efac29ab 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -826,6 +826,25 @@ config PROC_VMCORE
826 help 826 help
827 Exports the dump image of crashed kernel in ELF format. 827 Exports the dump image of crashed kernel in ELF format.
828 828
829config PROC_SYSCTL
830 bool "Sysctl support (/proc/sys)" if EMBEDDED
831 depends on PROC_FS
832 select SYSCTL
833 default y
834 ---help---
835 The sysctl interface provides a means of dynamically changing
836 certain kernel parameters and variables on the fly without requiring
837 a recompile of the kernel or reboot of the system. The primary
838 interface is through /proc/sys. If you say Y here a tree of
839 modifiable sysctl entries will be generated beneath the
840 /proc/sys directory. They are explained in the files
841 in <file:Documentation/sysctl/>. Note that enabling this
842 option will enlarge the kernel by at least 8 KB.
843
844 As it is generally a good thing, you should say Y here unless
845 building a kernel for install/rescue disks or your system is very
846 limited in memory.
847
829config SYSFS 848config SYSFS
830 bool "sysfs file system support" if EMBEDDED 849 bool "sysfs file system support" if EMBEDDED
831 default y 850 default y
@@ -862,6 +881,19 @@ config TMPFS
862 881
863 See <file:Documentation/filesystems/tmpfs.txt> for details. 882 See <file:Documentation/filesystems/tmpfs.txt> for details.
864 883
884config TMPFS_POSIX_ACL
885 bool "Tmpfs POSIX Access Control Lists"
886 depends on TMPFS
887 select GENERIC_ACL
888 help
889 POSIX Access Control Lists (ACLs) support permissions for users and
890 groups beyond the owner/group/world scheme.
891
892 To learn more about Access Control Lists, visit the POSIX ACLs for
893 Linux website <http://acl.bestbits.at/>.
894
895 If you don't know what Access Control Lists are, say N.
896
865config HUGETLBFS 897config HUGETLBFS
866 bool "HugeTLB file system support" 898 bool "HugeTLB file system support"
867 depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || BROKEN 899 depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || BROKEN
@@ -1471,8 +1503,8 @@ config NFS_V4
1471 If unsure, say N. 1503 If unsure, say N.
1472 1504
1473config NFS_DIRECTIO 1505config NFS_DIRECTIO
1474 bool "Allow direct I/O on NFS files (EXPERIMENTAL)" 1506 bool "Allow direct I/O on NFS files"
1475 depends on NFS_FS && EXPERIMENTAL 1507 depends on NFS_FS
1476 help 1508 help
1477 This option enables applications to perform uncached I/O on files 1509 This option enables applications to perform uncached I/O on files
1478 in NFS file systems using the O_DIRECT open() flag. When O_DIRECT 1510 in NFS file systems using the O_DIRECT open() flag. When O_DIRECT
@@ -1921,6 +1953,10 @@ config 9P_FS
1921 1953
1922 If unsure, say N. 1954 If unsure, say N.
1923 1955
1956config GENERIC_ACL
1957 bool
1958 select FS_POSIX_ACL
1959
1924endmenu 1960endmenu
1925 1961
1926menu "Partition Types" 1962menu "Partition Types"
diff --git a/fs/Makefile b/fs/Makefile
index 89135428a539..46b8cfe497b2 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -35,6 +35,7 @@ obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o
35obj-$(CONFIG_FS_MBCACHE) += mbcache.o 35obj-$(CONFIG_FS_MBCACHE) += mbcache.o
36obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o 36obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o
37obj-$(CONFIG_NFS_COMMON) += nfs_common/ 37obj-$(CONFIG_NFS_COMMON) += nfs_common/
38obj-$(CONFIG_GENERIC_ACL) += generic_acl.o
38 39
39obj-$(CONFIG_QUOTA) += dquot.o 40obj-$(CONFIG_QUOTA) += dquot.o
40obj-$(CONFIG_QFMT_V1) += quota_v1.o 41obj-$(CONFIG_QFMT_V1) += quota_v1.o
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 534f3eecc985..7e7a04be1278 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -269,7 +269,6 @@ adfs_iget(struct super_block *sb, struct object_info *obj)
269 inode->i_ino = obj->file_id; 269 inode->i_ino = obj->file_id;
270 inode->i_size = obj->size; 270 inode->i_size = obj->size;
271 inode->i_nlink = 2; 271 inode->i_nlink = 2;
272 inode->i_blksize = PAGE_SIZE;
273 inode->i_blocks = (inode->i_size + sb->s_blocksize - 1) >> 272 inode->i_blocks = (inode->i_size + sb->s_blocksize - 1) >>
274 sb->s_blocksize_bits; 273 sb->s_blocksize_bits;
275 274
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 82011019494c..9ade139086fc 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -251,8 +251,7 @@ static int init_inodecache(void)
251 251
252static void destroy_inodecache(void) 252static void destroy_inodecache(void)
253{ 253{
254 if (kmem_cache_destroy(adfs_inode_cachep)) 254 kmem_cache_destroy(adfs_inode_cachep);
255 printk(KERN_INFO "adfs_inode_cache: not all structures were freed\n");
256} 255}
257 256
258static struct super_operations adfs_sops = { 257static struct super_operations adfs_sops = {
@@ -339,11 +338,10 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
339 338
340 sb->s_flags |= MS_NODIRATIME; 339 sb->s_flags |= MS_NODIRATIME;
341 340
342 asb = kmalloc(sizeof(*asb), GFP_KERNEL); 341 asb = kzalloc(sizeof(*asb), GFP_KERNEL);
343 if (!asb) 342 if (!asb)
344 return -ENOMEM; 343 return -ENOMEM;
345 sb->s_fs_info = asb; 344 sb->s_fs_info = asb;
346 memset(asb, 0, sizeof(*asb));
347 345
348 /* set default options */ 346 /* set default options */
349 asb->s_uid = 0; 347 asb->s_uid = 0;
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 0ddd4cc0d1a0..1dc8438ef389 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -1,7 +1,6 @@
1#include <linux/types.h> 1#include <linux/types.h>
2#include <linux/fs.h> 2#include <linux/fs.h>
3#include <linux/buffer_head.h> 3#include <linux/buffer_head.h>
4#include <linux/affs_fs.h>
5#include <linux/amigaffs.h> 4#include <linux/amigaffs.h>
6 5
7/* AmigaOS allows file names with up to 30 characters length. 6/* AmigaOS allows file names with up to 30 characters length.
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 5200f4938df0..5ea72c3a16c3 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -14,6 +14,7 @@
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/statfs.h> 15#include <linux/statfs.h>
16#include <linux/parser.h> 16#include <linux/parser.h>
17#include <linux/magic.h>
17#include "affs.h" 18#include "affs.h"
18 19
19extern struct timezone sys_tz; 20extern struct timezone sys_tz;
@@ -108,8 +109,7 @@ static int init_inodecache(void)
108 109
109static void destroy_inodecache(void) 110static void destroy_inodecache(void)
110{ 111{
111 if (kmem_cache_destroy(affs_inode_cachep)) 112 kmem_cache_destroy(affs_inode_cachep);
112 printk(KERN_INFO "affs_inode_cache: not all structures were freed\n");
113} 113}
114 114
115static struct super_operations affs_sops = { 115static struct super_operations affs_sops = {
@@ -279,11 +279,10 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
279 sb->s_op = &affs_sops; 279 sb->s_op = &affs_sops;
280 sb->s_flags |= MS_NODIRATIME; 280 sb->s_flags |= MS_NODIRATIME;
281 281
282 sbi = kmalloc(sizeof(struct affs_sb_info), GFP_KERNEL); 282 sbi = kzalloc(sizeof(struct affs_sb_info), GFP_KERNEL);
283 if (!sbi) 283 if (!sbi)
284 return -ENOMEM; 284 return -ENOMEM;
285 sb->s_fs_info = sbi; 285 sb->s_fs_info = sbi;
286 memset(sbi, 0, sizeof(*sbi));
287 init_MUTEX(&sbi->s_bmlock); 286 init_MUTEX(&sbi->s_bmlock);
288 287
289 if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block, 288 if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block,
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 4ebb30a50ed5..6f37754906c2 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -72,7 +72,6 @@ static int afs_inode_map_status(struct afs_vnode *vnode)
72 inode->i_ctime.tv_sec = vnode->status.mtime_server; 72 inode->i_ctime.tv_sec = vnode->status.mtime_server;
73 inode->i_ctime.tv_nsec = 0; 73 inode->i_ctime.tv_nsec = 0;
74 inode->i_atime = inode->i_mtime = inode->i_ctime; 74 inode->i_atime = inode->i_mtime = inode->i_ctime;
75 inode->i_blksize = PAGE_CACHE_SIZE;
76 inode->i_blocks = 0; 75 inode->i_blocks = 0;
77 inode->i_version = vnode->fid.unique; 76 inode->i_version = vnode->fid.unique;
78 inode->i_mapping->a_ops = &afs_fs_aops; 77 inode->i_mapping->a_ops = &afs_fs_aops;
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 101d21b6c037..86463ec9ccb4 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -775,6 +775,7 @@ static int afs_proc_cell_servers_release(struct inode *inode,
775 * first item 775 * first item
776 */ 776 */
777static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos) 777static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos)
778 __acquires(m->private->sv_lock)
778{ 779{
779 struct list_head *_p; 780 struct list_head *_p;
780 struct afs_cell *cell = m->private; 781 struct afs_cell *cell = m->private;
@@ -823,6 +824,7 @@ static void *afs_proc_cell_servers_next(struct seq_file *p, void *v,
823 * clean up after reading from the cells list 824 * clean up after reading from the cells list
824 */ 825 */
825static void afs_proc_cell_servers_stop(struct seq_file *p, void *v) 826static void afs_proc_cell_servers_stop(struct seq_file *p, void *v)
827 __releases(p->private->sv_lock)
826{ 828{
827 struct afs_cell *cell = p->private; 829 struct afs_cell *cell = p->private;
828 830
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 331f730a1fb3..782ee7c600ca 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -281,11 +281,10 @@ int afs_vlocation_lookup(struct afs_cell *cell,
281 spin_unlock(&cell->vl_gylock); 281 spin_unlock(&cell->vl_gylock);
282 282
283 /* not in the cell's in-memory lists - create a new record */ 283 /* not in the cell's in-memory lists - create a new record */
284 vlocation = kmalloc(sizeof(struct afs_vlocation), GFP_KERNEL); 284 vlocation = kzalloc(sizeof(struct afs_vlocation), GFP_KERNEL);
285 if (!vlocation) 285 if (!vlocation)
286 return -ENOMEM; 286 return -ENOMEM;
287 287
288 memset(vlocation, 0, sizeof(struct afs_vlocation));
289 atomic_set(&vlocation->usage, 1); 288 atomic_set(&vlocation->usage, 1);
290 INIT_LIST_HEAD(&vlocation->link); 289 INIT_LIST_HEAD(&vlocation->link);
291 rwlock_init(&vlocation->lock); 290 rwlock_init(&vlocation->lock);
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 0ff4b86476e3..768c6dbd323a 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -186,11 +186,10 @@ int afs_volume_lookup(const char *name, struct afs_cell *cell, int rwpath,
186 _debug("creating new volume record"); 186 _debug("creating new volume record");
187 187
188 ret = -ENOMEM; 188 ret = -ENOMEM;
189 volume = kmalloc(sizeof(struct afs_volume), GFP_KERNEL); 189 volume = kzalloc(sizeof(struct afs_volume), GFP_KERNEL);
190 if (!volume) 190 if (!volume)
191 goto error_up; 191 goto error_up;
192 192
193 memset(volume, 0, sizeof(struct afs_volume));
194 atomic_set(&volume->usage, 1); 193 atomic_set(&volume->usage, 1);
195 volume->type = type; 194 volume->type = type;
196 volume->type_force = force; 195 volume->type_force = force;
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index a62327f1bdff..c7700d9b3f96 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -37,8 +37,6 @@
37#define DPRINTK(D) ((void)0) 37#define DPRINTK(D) ((void)0)
38#endif 38#endif
39 39
40#define AUTOFS_SUPER_MAGIC 0x0187
41
42/* 40/*
43 * If the daemon returns a negative response (AUTOFS_IOC_FAIL) then the 41 * If the daemon returns a negative response (AUTOFS_IOC_FAIL) then the
44 * kernel will keep the negative response cached for up to the time given 42 * kernel will keep the negative response cached for up to the time given
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 65e5ed42190e..2c9759baad61 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -16,6 +16,7 @@
16#include <linux/file.h> 16#include <linux/file.h>
17#include <linux/parser.h> 17#include <linux/parser.h>
18#include <linux/bitops.h> 18#include <linux/bitops.h>
19#include <linux/magic.h>
19#include "autofs_i.h" 20#include "autofs_i.h"
20#include <linux/module.h> 21#include <linux/module.h>
21 22
@@ -128,10 +129,9 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
128 struct autofs_sb_info *sbi; 129 struct autofs_sb_info *sbi;
129 int minproto, maxproto; 130 int minproto, maxproto;
130 131
131 sbi = kmalloc(sizeof(*sbi), GFP_KERNEL); 132 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
132 if ( !sbi ) 133 if ( !sbi )
133 goto fail_unlock; 134 goto fail_unlock;
134 memset(sbi, 0, sizeof(*sbi));
135 DPRINTK(("autofs: starting up, sbi = %p\n",sbi)); 135 DPRINTK(("autofs: starting up, sbi = %p\n",sbi));
136 136
137 s->s_fs_info = sbi; 137 s->s_fs_info = sbi;
@@ -216,7 +216,6 @@ static void autofs_read_inode(struct inode *inode)
216 inode->i_nlink = 2; 216 inode->i_nlink = 2;
217 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 217 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
218 inode->i_blocks = 0; 218 inode->i_blocks = 0;
219 inode->i_blksize = 1024;
220 219
221 if ( ino == AUTOFS_ROOT_INO ) { 220 if ( ino == AUTOFS_ROOT_INO ) {
222 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; 221 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
@@ -241,7 +240,7 @@ static void autofs_read_inode(struct inode *inode)
241 240
242 inode->i_op = &autofs_symlink_inode_operations; 241 inode->i_op = &autofs_symlink_inode_operations;
243 sl = &sbi->symlink[n]; 242 sl = &sbi->symlink[n];
244 inode->u.generic_ip = sl; 243 inode->i_private = sl;
245 inode->i_mode = S_IFLNK | S_IRWXUGO; 244 inode->i_mode = S_IFLNK | S_IRWXUGO;
246 inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = sl->mtime; 245 inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = sl->mtime;
247 inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0; 246 inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0;
diff --git a/fs/autofs/symlink.c b/fs/autofs/symlink.c
index 52e8772b066e..c74f2eb65775 100644
--- a/fs/autofs/symlink.c
+++ b/fs/autofs/symlink.c
@@ -15,7 +15,7 @@
15/* Nothing to release.. */ 15/* Nothing to release.. */
16static void *autofs_follow_link(struct dentry *dentry, struct nameidata *nd) 16static void *autofs_follow_link(struct dentry *dentry, struct nameidata *nd)
17{ 17{
18 char *s=((struct autofs_symlink *)dentry->d_inode->u.generic_ip)->data; 18 char *s=((struct autofs_symlink *)dentry->d_inode->i_private)->data;
19 nd_set_link(nd, s); 19 nd_set_link(nd, s);
20 return NULL; 20 return NULL;
21} 21}
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index d6603d02304c..480ab178cba5 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -40,8 +40,6 @@
40#define DPRINTK(fmt,args...) do {} while(0) 40#define DPRINTK(fmt,args...) do {} while(0)
41#endif 41#endif
42 42
43#define AUTOFS_SUPER_MAGIC 0x0187
44
45/* Unified info structure. This is pointed to by both the dentry and 43/* Unified info structure. This is pointed to by both the dentry and
46 inode structures. Each file in the filesystem has an instance of this 44 inode structures. Each file in the filesystem has an instance of this
47 structure. It holds a reference to the dentry, so dentries are never 45 structure. It holds a reference to the dentry, so dentries are never
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 8dbd44f10e9d..d96e5c14a9ca 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -32,7 +32,7 @@ static inline int autofs4_can_expire(struct dentry *dentry,
32 32
33 if (!do_now) { 33 if (!do_now) {
34 /* Too young to die */ 34 /* Too young to die */
35 if (time_after(ino->last_used + timeout, now)) 35 if (!timeout || time_after(ino->last_used + timeout, now))
36 return 0; 36 return 0;
37 37
38 /* update last_used here :- 38 /* update last_used here :-
@@ -253,7 +253,7 @@ static struct dentry *autofs4_expire_direct(struct super_block *sb,
253 struct dentry *root = dget(sb->s_root); 253 struct dentry *root = dget(sb->s_root);
254 int do_now = how & AUTOFS_EXP_IMMEDIATE; 254 int do_now = how & AUTOFS_EXP_IMMEDIATE;
255 255
256 if (!sbi->exp_timeout || !root) 256 if (!root)
257 return NULL; 257 return NULL;
258 258
259 now = jiffies; 259 now = jiffies;
@@ -293,7 +293,7 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb,
293 int do_now = how & AUTOFS_EXP_IMMEDIATE; 293 int do_now = how & AUTOFS_EXP_IMMEDIATE;
294 int exp_leaves = how & AUTOFS_EXP_LEAVES; 294 int exp_leaves = how & AUTOFS_EXP_LEAVES;
295 295
296 if ( !sbi->exp_timeout || !root ) 296 if (!root)
297 return NULL; 297 return NULL;
298 298
299 now = jiffies; 299 now = jiffies;
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index fde78b110ddd..800ce876caec 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -19,6 +19,7 @@
19#include <linux/parser.h> 19#include <linux/parser.h>
20#include <linux/bitops.h> 20#include <linux/bitops.h>
21#include <linux/smp_lock.h> 21#include <linux/smp_lock.h>
22#include <linux/magic.h>
22#include "autofs_i.h" 23#include "autofs_i.h"
23#include <linux/module.h> 24#include <linux/module.h>
24 25
@@ -446,7 +447,6 @@ struct inode *autofs4_get_inode(struct super_block *sb,
446 inode->i_uid = 0; 447 inode->i_uid = 0;
447 inode->i_gid = 0; 448 inode->i_gid = 0;
448 } 449 }
449 inode->i_blksize = PAGE_CACHE_SIZE;
450 inode->i_blocks = 0; 450 inode->i_blocks = 0;
451 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 451 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
452 452
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 5100f984783f..563ef9d7da9f 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -137,7 +137,9 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
137 nd.flags = LOOKUP_DIRECTORY; 137 nd.flags = LOOKUP_DIRECTORY;
138 ret = (dentry->d_op->d_revalidate)(dentry, &nd); 138 ret = (dentry->d_op->d_revalidate)(dentry, &nd);
139 139
140 if (!ret) { 140 if (ret <= 0) {
141 if (ret < 0)
142 status = ret;
141 dcache_dir_close(inode, file); 143 dcache_dir_close(inode, file);
142 goto out; 144 goto out;
143 } 145 }
@@ -279,9 +281,6 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags)
279 281
280 DPRINTK("mount done status=%d", status); 282 DPRINTK("mount done status=%d", status);
281 283
282 if (status && dentry->d_inode)
283 return status; /* Try to get the kernel to invalidate this dentry */
284
285 /* Turn this into a real negative dentry? */ 284 /* Turn this into a real negative dentry? */
286 if (status == -ENOENT) { 285 if (status == -ENOENT) {
287 spin_lock(&dentry->d_lock); 286 spin_lock(&dentry->d_lock);
@@ -357,7 +356,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
357 * don't try to mount it again. 356 * don't try to mount it again.
358 */ 357 */
359 spin_lock(&dcache_lock); 358 spin_lock(&dcache_lock);
360 if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { 359 if (!d_mountpoint(dentry) && __simple_empty(dentry)) {
361 spin_unlock(&dcache_lock); 360 spin_unlock(&dcache_lock);
362 361
363 status = try_to_fill_dentry(dentry, 0); 362 status = try_to_fill_dentry(dentry, 0);
@@ -400,13 +399,23 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
400 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); 399 struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
401 int oz_mode = autofs4_oz_mode(sbi); 400 int oz_mode = autofs4_oz_mode(sbi);
402 int flags = nd ? nd->flags : 0; 401 int flags = nd ? nd->flags : 0;
403 int status = 0; 402 int status = 1;
404 403
405 /* Pending dentry */ 404 /* Pending dentry */
406 if (autofs4_ispending(dentry)) { 405 if (autofs4_ispending(dentry)) {
407 if (!oz_mode) 406 /* The daemon never causes a mount to trigger */
408 status = try_to_fill_dentry(dentry, flags); 407 if (oz_mode)
409 return !status; 408 return 1;
409
410 /*
411 * A zero status is success otherwise we have a
412 * negative error code.
413 */
414 status = try_to_fill_dentry(dentry, flags);
415 if (status == 0)
416 return 1;
417
418 return status;
410 } 419 }
411 420
412 /* Negative dentry.. invalidate if "old" */ 421 /* Negative dentry.. invalidate if "old" */
@@ -421,9 +430,19 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
421 DPRINTK("dentry=%p %.*s, emptydir", 430 DPRINTK("dentry=%p %.*s, emptydir",
422 dentry, dentry->d_name.len, dentry->d_name.name); 431 dentry, dentry->d_name.len, dentry->d_name.name);
423 spin_unlock(&dcache_lock); 432 spin_unlock(&dcache_lock);
424 if (!oz_mode) 433 /* The daemon never causes a mount to trigger */
425 status = try_to_fill_dentry(dentry, flags); 434 if (oz_mode)
426 return !status; 435 return 1;
436
437 /*
438 * A zero status is success otherwise we have a
439 * negative error code.
440 */
441 status = try_to_fill_dentry(dentry, flags);
442 if (status == 0)
443 return 1;
444
445 return status;
427 } 446 }
428 spin_unlock(&dcache_lock); 447 spin_unlock(&dcache_lock);
429 448
@@ -518,6 +537,9 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
518 return ERR_PTR(-ERESTARTNOINTR); 537 return ERR_PTR(-ERESTARTNOINTR);
519 } 538 }
520 } 539 }
540 spin_lock(&dentry->d_lock);
541 dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
542 spin_unlock(&dentry->d_lock);
521 } 543 }
522 544
523 /* 545 /*
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 50cfca5c7efd..57020c7a7e65 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -365,7 +365,6 @@ befs_read_inode(struct inode *inode)
365 inode->i_mtime.tv_nsec = 0; /* lower 16 bits are not a time */ 365 inode->i_mtime.tv_nsec = 0; /* lower 16 bits are not a time */
366 inode->i_ctime = inode->i_mtime; 366 inode->i_ctime = inode->i_mtime;
367 inode->i_atime = inode->i_mtime; 367 inode->i_atime = inode->i_mtime;
368 inode->i_blksize = befs_sb->block_size;
369 368
370 befs_ino->i_inode_num = fsrun_to_cpu(sb, raw_inode->inode_num); 369 befs_ino->i_inode_num = fsrun_to_cpu(sb, raw_inode->inode_num);
371 befs_ino->i_parent = fsrun_to_cpu(sb, raw_inode->parent); 370 befs_ino->i_parent = fsrun_to_cpu(sb, raw_inode->parent);
@@ -446,9 +445,7 @@ befs_init_inodecache(void)
446static void 445static void
447befs_destroy_inodecache(void) 446befs_destroy_inodecache(void)
448{ 447{
449 if (kmem_cache_destroy(befs_inode_cachep)) 448 kmem_cache_destroy(befs_inode_cachep);
450 printk(KERN_ERR "befs_destroy_inodecache: "
451 "not all structures were freed\n");
452} 449}
453 450
454/* 451/*
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 26fad9621738..dcf04cb13283 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -102,7 +102,7 @@ static int bfs_create(struct inode * dir, struct dentry * dentry, int mode,
102 inode->i_uid = current->fsuid; 102 inode->i_uid = current->fsuid;
103 inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current->fsgid; 103 inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current->fsgid;
104 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 104 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
105 inode->i_blocks = inode->i_blksize = 0; 105 inode->i_blocks = 0;
106 inode->i_op = &bfs_file_inops; 106 inode->i_op = &bfs_file_inops;
107 inode->i_fop = &bfs_file_operations; 107 inode->i_fop = &bfs_file_operations;
108 inode->i_mapping->a_ops = &bfs_aops; 108 inode->i_mapping->a_ops = &bfs_aops;
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index cf74f3d4d966..ed27ffb3459e 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -76,7 +76,6 @@ static void bfs_read_inode(struct inode * inode)
76 inode->i_size = BFS_FILESIZE(di); 76 inode->i_size = BFS_FILESIZE(di);
77 inode->i_blocks = BFS_FILEBLOCKS(di); 77 inode->i_blocks = BFS_FILEBLOCKS(di);
78 if (inode->i_size || inode->i_blocks) dprintf("Registered inode with %lld size, %ld blocks\n", inode->i_size, inode->i_blocks); 78 if (inode->i_size || inode->i_blocks) dprintf("Registered inode with %lld size, %ld blocks\n", inode->i_size, inode->i_blocks);
79 inode->i_blksize = PAGE_SIZE;
80 inode->i_atime.tv_sec = le32_to_cpu(di->i_atime); 79 inode->i_atime.tv_sec = le32_to_cpu(di->i_atime);
81 inode->i_mtime.tv_sec = le32_to_cpu(di->i_mtime); 80 inode->i_mtime.tv_sec = le32_to_cpu(di->i_mtime);
82 inode->i_ctime.tv_sec = le32_to_cpu(di->i_ctime); 81 inode->i_ctime.tv_sec = le32_to_cpu(di->i_ctime);
@@ -268,8 +267,7 @@ static int init_inodecache(void)
268 267
269static void destroy_inodecache(void) 268static void destroy_inodecache(void)
270{ 269{
271 if (kmem_cache_destroy(bfs_inode_cachep)) 270 kmem_cache_destroy(bfs_inode_cachep);
272 printk(KERN_INFO "bfs_inode_cache: not all structures were freed\n");
273} 271}
274 272
275static struct super_operations bfs_sops = { 273static struct super_operations bfs_sops = {
@@ -311,11 +309,10 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
311 unsigned i, imap_len; 309 unsigned i, imap_len;
312 struct bfs_sb_info * info; 310 struct bfs_sb_info * info;
313 311
314 info = kmalloc(sizeof(*info), GFP_KERNEL); 312 info = kzalloc(sizeof(*info), GFP_KERNEL);
315 if (!info) 313 if (!info)
316 return -ENOMEM; 314 return -ENOMEM;
317 s->s_fs_info = info; 315 s->s_fs_info = info;
318 memset(info, 0, sizeof(*info));
319 316
320 sb_set_blocksize(s, BFS_BSIZE); 317 sb_set_blocksize(s, BFS_BSIZE);
321 318
@@ -338,10 +335,9 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
338 + BFS_ROOT_INO - 1; 335 + BFS_ROOT_INO - 1;
339 336
340 imap_len = info->si_lasti/8 + 1; 337 imap_len = info->si_lasti/8 + 1;
341 info->si_imap = kmalloc(imap_len, GFP_KERNEL); 338 info->si_imap = kzalloc(imap_len, GFP_KERNEL);
342 if (!info->si_imap) 339 if (!info->si_imap)
343 goto out; 340 goto out;
344 memset(info->si_imap, 0, imap_len);
345 for (i=0; i<BFS_ROOT_INO; i++) 341 for (i=0; i<BFS_ROOT_INO; i++)
346 set_bit(i, info->si_imap); 342 set_bit(i, info->si_imap);
347 343
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index f312103434d4..517e111bb7ef 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -278,6 +278,13 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
278 return -ENOEXEC; 278 return -ENOEXEC;
279 } 279 }
280 280
281 /*
282 * Requires a mmap handler. This prevents people from using a.out
283 * as part of an exploit attack against /proc-related vulnerabilities.
284 */
285 if (!bprm->file->f_op || !bprm->file->f_op->mmap)
286 return -ENOEXEC;
287
281 fd_offset = N_TXTOFF(ex); 288 fd_offset = N_TXTOFF(ex);
282 289
283 /* Check initial limits. This avoids letting people circumvent 290 /* Check initial limits. This avoids letting people circumvent
@@ -476,6 +483,13 @@ static int load_aout_library(struct file *file)
476 goto out; 483 goto out;
477 } 484 }
478 485
486 /*
487 * Requires a mmap handler. This prevents people from using a.out
488 * as part of an exploit attack against /proc-related vulnerabilities.
489 */
490 if (!file->f_op || !file->f_op->mmap)
491 goto out;
492
479 if (N_FLAGS(ex)) 493 if (N_FLAGS(ex))
480 goto out; 494 goto out;
481 495
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 672a3b90bc55..6eb48e1446ec 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -515,7 +515,8 @@ static unsigned long randomize_stack_top(unsigned long stack_top)
515{ 515{
516 unsigned int random_variable = 0; 516 unsigned int random_variable = 0;
517 517
518 if (current->flags & PF_RANDOMIZE) { 518 if ((current->flags & PF_RANDOMIZE) &&
519 !(current->personality & ADDR_NO_RANDOMIZE)) {
519 random_variable = get_random_int() & STACK_RND_MASK; 520 random_variable = get_random_int() & STACK_RND_MASK;
520 random_variable <<= PAGE_SHIFT; 521 random_variable <<= PAGE_SHIFT;
521 } 522 }
@@ -1037,10 +1038,8 @@ out_free_interp:
1037out_free_file: 1038out_free_file:
1038 sys_close(elf_exec_fileno); 1039 sys_close(elf_exec_fileno);
1039out_free_fh: 1040out_free_fh:
1040 if (files) { 1041 if (files)
1041 put_files_struct(current->files); 1042 reset_files_struct(current, files);
1042 current->files = files;
1043 }
1044out_free_ph: 1043out_free_ph:
1045 kfree(elf_phdata); 1044 kfree(elf_phdata);
1046 goto out; 1045 goto out;
@@ -1262,7 +1261,7 @@ static void fill_elf_header(struct elfhdr *elf, int segs)
1262 return; 1261 return;
1263} 1262}
1264 1263
1265static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, off_t offset) 1264static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset)
1266{ 1265{
1267 phdr->p_type = PT_NOTE; 1266 phdr->p_type = PT_NOTE;
1268 phdr->p_offset = offset; 1267 phdr->p_offset = offset;
@@ -1428,7 +1427,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
1428 int i; 1427 int i;
1429 struct vm_area_struct *vma; 1428 struct vm_area_struct *vma;
1430 struct elfhdr *elf = NULL; 1429 struct elfhdr *elf = NULL;
1431 off_t offset = 0, dataoff; 1430 loff_t offset = 0, dataoff;
1432 unsigned long limit = current->signal->rlim[RLIMIT_CORE].rlim_cur; 1431 unsigned long limit = current->signal->rlim[RLIMIT_CORE].rlim_cur;
1433 int numnote; 1432 int numnote;
1434 struct memelfnote *notes = NULL; 1433 struct memelfnote *notes = NULL;
@@ -1480,20 +1479,19 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
1480 1479
1481 if (signr) { 1480 if (signr) {
1482 struct elf_thread_status *tmp; 1481 struct elf_thread_status *tmp;
1483 read_lock(&tasklist_lock); 1482 rcu_read_lock();
1484 do_each_thread(g,p) 1483 do_each_thread(g,p)
1485 if (current->mm == p->mm && current != p) { 1484 if (current->mm == p->mm && current != p) {
1486 tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC); 1485 tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC);
1487 if (!tmp) { 1486 if (!tmp) {
1488 read_unlock(&tasklist_lock); 1487 rcu_read_unlock();
1489 goto cleanup; 1488 goto cleanup;
1490 } 1489 }
1491 INIT_LIST_HEAD(&tmp->list);
1492 tmp->thread = p; 1490 tmp->thread = p;
1493 list_add(&tmp->list, &thread_list); 1491 list_add(&tmp->list, &thread_list);
1494 } 1492 }
1495 while_each_thread(g,p); 1493 while_each_thread(g,p);
1496 read_unlock(&tasklist_lock); 1494 rcu_read_unlock();
1497 list_for_each(t, &thread_list) { 1495 list_for_each(t, &thread_list) {
1498 struct elf_thread_status *tmp; 1496 struct elf_thread_status *tmp;
1499 int sz; 1497 int sz;
@@ -1661,11 +1659,11 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
1661 ELF_CORE_WRITE_EXTRA_DATA; 1659 ELF_CORE_WRITE_EXTRA_DATA;
1662#endif 1660#endif
1663 1661
1664 if ((off_t)file->f_pos != offset) { 1662 if (file->f_pos != offset) {
1665 /* Sanity check */ 1663 /* Sanity check */
1666 printk(KERN_WARNING 1664 printk(KERN_WARNING
1667 "elf_core_dump: file->f_pos (%ld) != offset (%ld)\n", 1665 "elf_core_dump: file->f_pos (%Ld) != offset (%Ld)\n",
1668 (off_t)file->f_pos, offset); 1666 file->f_pos, offset);
1669 } 1667 }
1670 1668
1671end_coredump: 1669end_coredump:
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 2f3365829229..f86d5c9ce5eb 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1597,20 +1597,19 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1597 1597
1598 if (signr) { 1598 if (signr) {
1599 struct elf_thread_status *tmp; 1599 struct elf_thread_status *tmp;
1600 read_lock(&tasklist_lock); 1600 rcu_read_lock();
1601 do_each_thread(g,p) 1601 do_each_thread(g,p)
1602 if (current->mm == p->mm && current != p) { 1602 if (current->mm == p->mm && current != p) {
1603 tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC); 1603 tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC);
1604 if (!tmp) { 1604 if (!tmp) {
1605 read_unlock(&tasklist_lock); 1605 rcu_read_unlock();
1606 goto cleanup; 1606 goto cleanup;
1607 } 1607 }
1608 INIT_LIST_HEAD(&tmp->list);
1609 tmp->thread = p; 1608 tmp->thread = p;
1610 list_add(&tmp->list, &thread_list); 1609 list_add(&tmp->list, &thread_list);
1611 } 1610 }
1612 while_each_thread(g,p); 1611 while_each_thread(g,p);
1613 read_unlock(&tasklist_lock); 1612 rcu_read_unlock();
1614 list_for_each(t, &thread_list) { 1613 list_for_each(t, &thread_list) {
1615 struct elf_thread_status *tmp; 1614 struct elf_thread_status *tmp;
1616 int sz; 1615 int sz;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 34ebbc191e46..1713c48fef54 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -215,10 +215,8 @@ _error:
215 bprm->interp_flags = 0; 215 bprm->interp_flags = 0;
216 bprm->interp_data = 0; 216 bprm->interp_data = 0;
217_unshare: 217_unshare:
218 if (files) { 218 if (files)
219 put_files_struct(current->files); 219 reset_files_struct(current, files);
220 current->files = files;
221 }
222 goto _ret; 220 goto _ret;
223} 221}
224 222
@@ -507,7 +505,6 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
507 inode->i_mode = mode; 505 inode->i_mode = mode;
508 inode->i_uid = 0; 506 inode->i_uid = 0;
509 inode->i_gid = 0; 507 inode->i_gid = 0;
510 inode->i_blksize = PAGE_CACHE_SIZE;
511 inode->i_blocks = 0; 508 inode->i_blocks = 0;
512 inode->i_atime = inode->i_mtime = inode->i_ctime = 509 inode->i_atime = inode->i_mtime = inode->i_ctime =
513 current_fs_time(inode->i_sb); 510 current_fs_time(inode->i_sb);
@@ -517,7 +514,7 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
517 514
518static void bm_clear_inode(struct inode *inode) 515static void bm_clear_inode(struct inode *inode)
519{ 516{
520 kfree(inode->u.generic_ip); 517 kfree(inode->i_private);
521} 518}
522 519
523static void kill_node(Node *e) 520static void kill_node(Node *e)
@@ -545,7 +542,7 @@ static void kill_node(Node *e)
545static ssize_t 542static ssize_t
546bm_entry_read(struct file * file, char __user * buf, size_t nbytes, loff_t *ppos) 543bm_entry_read(struct file * file, char __user * buf, size_t nbytes, loff_t *ppos)
547{ 544{
548 Node *e = file->f_dentry->d_inode->u.generic_ip; 545 Node *e = file->f_dentry->d_inode->i_private;
549 loff_t pos = *ppos; 546 loff_t pos = *ppos;
550 ssize_t res; 547 ssize_t res;
551 char *page; 548 char *page;
@@ -579,7 +576,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
579 size_t count, loff_t *ppos) 576 size_t count, loff_t *ppos)
580{ 577{
581 struct dentry *root; 578 struct dentry *root;
582 Node *e = file->f_dentry->d_inode->u.generic_ip; 579 Node *e = file->f_dentry->d_inode->i_private;
583 int res = parse_command(buffer, count); 580 int res = parse_command(buffer, count);
584 581
585 switch (res) { 582 switch (res) {
@@ -646,7 +643,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
646 } 643 }
647 644
648 e->dentry = dget(dentry); 645 e->dentry = dget(dentry);
649 inode->u.generic_ip = e; 646 inode->i_private = e;
650 inode->i_fop = &bm_entry_operations; 647 inode->i_fop = &bm_entry_operations;
651 648
652 d_instantiate(dentry, inode); 649 d_instantiate(dentry, inode);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 045f98854f14..4346468139e8 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -543,11 +543,11 @@ static struct kobject *bdev_get_holder(struct block_device *bdev)
543 return kobject_get(bdev->bd_disk->holder_dir); 543 return kobject_get(bdev->bd_disk->holder_dir);
544} 544}
545 545
546static void add_symlink(struct kobject *from, struct kobject *to) 546static int add_symlink(struct kobject *from, struct kobject *to)
547{ 547{
548 if (!from || !to) 548 if (!from || !to)
549 return; 549 return 0;
550 sysfs_create_link(from, to, kobject_name(to)); 550 return sysfs_create_link(from, to, kobject_name(to));
551} 551}
552 552
553static void del_symlink(struct kobject *from, struct kobject *to) 553static void del_symlink(struct kobject *from, struct kobject *to)
@@ -648,30 +648,38 @@ static void free_bd_holder(struct bd_holder *bo)
648 * If there is no matching entry with @bo in @bdev->bd_holder_list, 648 * If there is no matching entry with @bo in @bdev->bd_holder_list,
649 * add @bo to the list, create symlinks. 649 * add @bo to the list, create symlinks.
650 * 650 *
651 * Returns 1 if @bo was added to the list. 651 * Returns 0 if symlinks are created or already there.
652 * Returns 0 if @bo wasn't used by any reason and should be freed. 652 * Returns -ve if something fails and @bo can be freed.
653 */ 653 */
654static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo) 654static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo)
655{ 655{
656 struct bd_holder *tmp; 656 struct bd_holder *tmp;
657 int ret;
657 658
658 if (!bo) 659 if (!bo)
659 return 0; 660 return -EINVAL;
660 661
661 list_for_each_entry(tmp, &bdev->bd_holder_list, list) { 662 list_for_each_entry(tmp, &bdev->bd_holder_list, list) {
662 if (tmp->sdir == bo->sdir) { 663 if (tmp->sdir == bo->sdir) {
663 tmp->count++; 664 tmp->count++;
665 /* We've already done what we need to do here. */
666 free_bd_holder(bo);
664 return 0; 667 return 0;
665 } 668 }
666 } 669 }
667 670
668 if (!bd_holder_grab_dirs(bdev, bo)) 671 if (!bd_holder_grab_dirs(bdev, bo))
669 return 0; 672 return -EBUSY;
670 673
671 add_symlink(bo->sdir, bo->sdev); 674 ret = add_symlink(bo->sdir, bo->sdev);
672 add_symlink(bo->hdir, bo->hdev); 675 if (ret == 0) {
673 list_add_tail(&bo->list, &bdev->bd_holder_list); 676 ret = add_symlink(bo->hdir, bo->hdev);
674 return 1; 677 if (ret)
678 del_symlink(bo->sdir, bo->sdev);
679 }
680 if (ret == 0)
681 list_add_tail(&bo->list, &bdev->bd_holder_list);
682 return ret;
675} 683}
676 684
677/** 685/**
@@ -741,7 +749,9 @@ static int bd_claim_by_kobject(struct block_device *bdev, void *holder,
741 749
742 mutex_lock_nested(&bdev->bd_mutex, BD_MUTEX_PARTITION); 750 mutex_lock_nested(&bdev->bd_mutex, BD_MUTEX_PARTITION);
743 res = bd_claim(bdev, holder); 751 res = bd_claim(bdev, holder);
744 if (res || !add_bd_holder(bdev, bo)) 752 if (res == 0)
753 res = add_bd_holder(bdev, bo);
754 if (res)
745 free_bd_holder(bo); 755 free_bd_holder(bo);
746 mutex_unlock(&bdev->bd_mutex); 756 mutex_unlock(&bdev->bd_mutex);
747 757
@@ -1021,7 +1031,7 @@ do_open(struct block_device *bdev, struct file *file, unsigned int subclass)
1021 rescan_partitions(bdev->bd_disk, bdev); 1031 rescan_partitions(bdev->bd_disk, bdev);
1022 } else { 1032 } else {
1023 mutex_lock_nested(&bdev->bd_contains->bd_mutex, 1033 mutex_lock_nested(&bdev->bd_contains->bd_mutex,
1024 BD_MUTEX_PARTITION); 1034 BD_MUTEX_WHOLE);
1025 bdev->bd_contains->bd_part_count++; 1035 bdev->bd_contains->bd_part_count++;
1026 mutex_unlock(&bdev->bd_contains->bd_mutex); 1036 mutex_unlock(&bdev->bd_contains->bd_mutex);
1027 } 1037 }
diff --git a/fs/buffer.c b/fs/buffer.c
index 71649ef9b658..3b6d701073e7 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2987,6 +2987,7 @@ int try_to_free_buffers(struct page *page)
2987 2987
2988 spin_lock(&mapping->private_lock); 2988 spin_lock(&mapping->private_lock);
2989 ret = drop_buffers(page, &buffers_to_free); 2989 ret = drop_buffers(page, &buffers_to_free);
2990 spin_unlock(&mapping->private_lock);
2990 if (ret) { 2991 if (ret) {
2991 /* 2992 /*
2992 * If the filesystem writes its buffers by hand (eg ext3) 2993 * If the filesystem writes its buffers by hand (eg ext3)
@@ -2998,7 +2999,6 @@ int try_to_free_buffers(struct page *page)
2998 */ 2999 */
2999 clear_page_dirty(page); 3000 clear_page_dirty(page);
3000 } 3001 }
3001 spin_unlock(&mapping->private_lock);
3002out: 3002out:
3003 if (buffers_to_free) { 3003 if (buffers_to_free) {
3004 struct buffer_head *bh = buffers_to_free; 3004 struct buffer_head *bh = buffers_to_free;
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 3483d3cf8087..1f3285affa39 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -19,11 +19,30 @@
19#include <linux/kobj_map.h> 19#include <linux/kobj_map.h>
20#include <linux/cdev.h> 20#include <linux/cdev.h>
21#include <linux/mutex.h> 21#include <linux/mutex.h>
22#include <linux/backing-dev.h>
22 23
23#ifdef CONFIG_KMOD 24#ifdef CONFIG_KMOD
24#include <linux/kmod.h> 25#include <linux/kmod.h>
25#endif 26#endif
26 27
28/*
29 * capabilities for /dev/mem, /dev/kmem and similar directly mappable character
30 * devices
31 * - permits shared-mmap for read, write and/or exec
32 * - does not permit private mmap in NOMMU mode (can't do COW)
33 * - no readahead or I/O queue unplugging required
34 */
35struct backing_dev_info directly_mappable_cdev_bdi = {
36 .capabilities = (
37#ifdef CONFIG_MMU
38 /* permit private copies of the data to be taken */
39 BDI_CAP_MAP_COPY |
40#endif
41 /* permit direct mmap, for read, write or exec */
42 BDI_CAP_MAP_DIRECT |
43 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP),
44};
45
27static struct kobj_map *cdev_map; 46static struct kobj_map *cdev_map;
28 47
29static DEFINE_MUTEX(chrdevs_lock); 48static DEFINE_MUTEX(chrdevs_lock);
@@ -109,13 +128,31 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor,
109 128
110 for (cp = &chrdevs[i]; *cp; cp = &(*cp)->next) 129 for (cp = &chrdevs[i]; *cp; cp = &(*cp)->next)
111 if ((*cp)->major > major || 130 if ((*cp)->major > major ||
112 ((*cp)->major == major && (*cp)->baseminor >= baseminor)) 131 ((*cp)->major == major &&
132 (((*cp)->baseminor >= baseminor) ||
133 ((*cp)->baseminor + (*cp)->minorct > baseminor))))
113 break; 134 break;
114 if (*cp && (*cp)->major == major && 135
115 (*cp)->baseminor < baseminor + minorct) { 136 /* Check for overlapping minor ranges. */
116 ret = -EBUSY; 137 if (*cp && (*cp)->major == major) {
117 goto out; 138 int old_min = (*cp)->baseminor;
139 int old_max = (*cp)->baseminor + (*cp)->minorct - 1;
140 int new_min = baseminor;
141 int new_max = baseminor + minorct - 1;
142
143 /* New driver overlaps from the left. */
144 if (new_max >= old_min && new_max <= old_max) {
145 ret = -EBUSY;
146 goto out;
147 }
148
149 /* New driver overlaps from the right. */
150 if (new_min <= old_max && new_min >= old_min) {
151 ret = -EBUSY;
152 goto out;
153 }
118 } 154 }
155
119 cd->next = *cp; 156 cd->next = *cp;
120 *cp = cd; 157 *cp = cd;
121 mutex_unlock(&chrdevs_lock); 158 mutex_unlock(&chrdevs_lock);
@@ -146,6 +183,15 @@ __unregister_chrdev_region(unsigned major, unsigned baseminor, int minorct)
146 return cd; 183 return cd;
147} 184}
148 185
186/**
187 * register_chrdev_region() - register a range of device numbers
188 * @from: the first in the desired range of device numbers; must include
189 * the major number.
190 * @count: the number of consecutive device numbers required
191 * @name: the name of the device or driver.
192 *
193 * Return value is zero on success, a negative error code on failure.
194 */
149int register_chrdev_region(dev_t from, unsigned count, const char *name) 195int register_chrdev_region(dev_t from, unsigned count, const char *name)
150{ 196{
151 struct char_device_struct *cd; 197 struct char_device_struct *cd;
@@ -171,6 +217,17 @@ fail:
171 return PTR_ERR(cd); 217 return PTR_ERR(cd);
172} 218}
173 219
220/**
221 * alloc_chrdev_region() - register a range of char device numbers
222 * @dev: output parameter for first assigned number
223 * @baseminor: first of the requested range of minor numbers
224 * @count: the number of minor numbers required
225 * @name: the name of the associated device or driver
226 *
227 * Allocates a range of char device numbers. The major number will be
228 * chosen dynamically, and returned (along with the first minor number)
229 * in @dev. Returns zero or a negative error code.
230 */
174int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count, 231int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count,
175 const char *name) 232 const char *name)
176{ 233{
@@ -240,6 +297,15 @@ out2:
240 return err; 297 return err;
241} 298}
242 299
300/**
301 * unregister_chrdev_region() - return a range of device numbers
302 * @from: the first in the range of numbers to unregister
303 * @count: the number of device numbers to unregister
304 *
305 * This function will unregister a range of @count device numbers,
306 * starting with @from. The caller should normally be the one who
307 * allocated those numbers in the first place...
308 */
243void unregister_chrdev_region(dev_t from, unsigned count) 309void unregister_chrdev_region(dev_t from, unsigned count)
244{ 310{
245 dev_t to = from + count; 311 dev_t to = from + count;
@@ -377,6 +443,16 @@ static int exact_lock(dev_t dev, void *data)
377 return cdev_get(p) ? 0 : -1; 443 return cdev_get(p) ? 0 : -1;
378} 444}
379 445
446/**
447 * cdev_add() - add a char device to the system
448 * @p: the cdev structure for the device
449 * @dev: the first device number for which this device is responsible
450 * @count: the number of consecutive minor numbers corresponding to this
451 * device
452 *
453 * cdev_add() adds the device represented by @p to the system, making it
454 * live immediately. A negative error code is returned on failure.
455 */
380int cdev_add(struct cdev *p, dev_t dev, unsigned count) 456int cdev_add(struct cdev *p, dev_t dev, unsigned count)
381{ 457{
382 p->dev = dev; 458 p->dev = dev;
@@ -389,6 +465,13 @@ static void cdev_unmap(dev_t dev, unsigned count)
389 kobj_unmap(cdev_map, dev, count); 465 kobj_unmap(cdev_map, dev, count);
390} 466}
391 467
468/**
469 * cdev_del() - remove a cdev from the system
470 * @p: the cdev structure to be removed
471 *
472 * cdev_del() removes @p from the system, possibly freeing the structure
473 * itself.
474 */
392void cdev_del(struct cdev *p) 475void cdev_del(struct cdev *p)
393{ 476{
394 cdev_unmap(p->dev, p->count); 477 cdev_unmap(p->dev, p->count);
@@ -417,6 +500,11 @@ static struct kobj_type ktype_cdev_dynamic = {
417 .release = cdev_dynamic_release, 500 .release = cdev_dynamic_release,
418}; 501};
419 502
503/**
504 * cdev_alloc() - allocate a cdev structure
505 *
506 * Allocates and returns a cdev structure, or NULL on failure.
507 */
420struct cdev *cdev_alloc(void) 508struct cdev *cdev_alloc(void)
421{ 509{
422 struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL); 510 struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL);
@@ -428,6 +516,14 @@ struct cdev *cdev_alloc(void)
428 return p; 516 return p;
429} 517}
430 518
519/**
520 * cdev_init() - initialize a cdev structure
521 * @cdev: the structure to initialize
522 * @fops: the file_operations for this device
523 *
524 * Initializes @cdev, remembering @fops, making it ready to add to the
525 * system with cdev_add().
526 */
431void cdev_init(struct cdev *cdev, const struct file_operations *fops) 527void cdev_init(struct cdev *cdev, const struct file_operations *fops)
432{ 528{
433 memset(cdev, 0, sizeof *cdev); 529 memset(cdev, 0, sizeof *cdev);
@@ -461,3 +557,4 @@ EXPORT_SYMBOL(cdev_del);
461EXPORT_SYMBOL(cdev_add); 557EXPORT_SYMBOL(cdev_add);
462EXPORT_SYMBOL(register_chrdev); 558EXPORT_SYMBOL(register_chrdev);
463EXPORT_SYMBOL(unregister_chrdev); 559EXPORT_SYMBOL(unregister_chrdev);
560EXPORT_SYMBOL(directly_mappable_cdev_bdi);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index c3ef1c0d0e68..22bcf4d7e7ae 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -253,7 +253,6 @@ cifs_alloc_inode(struct super_block *sb)
253 file data or metadata */ 253 file data or metadata */
254 cifs_inode->clientCanCacheRead = FALSE; 254 cifs_inode->clientCanCacheRead = FALSE;
255 cifs_inode->clientCanCacheAll = FALSE; 255 cifs_inode->clientCanCacheAll = FALSE;
256 cifs_inode->vfs_inode.i_blksize = CIFS_MAX_MSGSIZE;
257 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ 256 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
258 cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME; 257 cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME;
259 INIT_LIST_HEAD(&cifs_inode->openFileList); 258 INIT_LIST_HEAD(&cifs_inode->openFileList);
@@ -699,8 +698,7 @@ cifs_init_inodecache(void)
699static void 698static void
700cifs_destroy_inodecache(void) 699cifs_destroy_inodecache(void)
701{ 700{
702 if (kmem_cache_destroy(cifs_inode_cachep)) 701 kmem_cache_destroy(cifs_inode_cachep);
703 printk(KERN_WARNING "cifs_inode_cache: error freeing\n");
704} 702}
705 703
706static int 704static int
@@ -778,13 +776,9 @@ static void
778cifs_destroy_request_bufs(void) 776cifs_destroy_request_bufs(void)
779{ 777{
780 mempool_destroy(cifs_req_poolp); 778 mempool_destroy(cifs_req_poolp);
781 if (kmem_cache_destroy(cifs_req_cachep)) 779 kmem_cache_destroy(cifs_req_cachep);
782 printk(KERN_WARNING
783 "cifs_destroy_request_cache: error not all structures were freed\n");
784 mempool_destroy(cifs_sm_req_poolp); 780 mempool_destroy(cifs_sm_req_poolp);
785 if (kmem_cache_destroy(cifs_sm_req_cachep)) 781 kmem_cache_destroy(cifs_sm_req_cachep);
786 printk(KERN_WARNING
787 "cifs_destroy_request_cache: cifs_small_rq free error\n");
788} 782}
789 783
790static int 784static int
@@ -819,13 +813,8 @@ static void
819cifs_destroy_mids(void) 813cifs_destroy_mids(void)
820{ 814{
821 mempool_destroy(cifs_mid_poolp); 815 mempool_destroy(cifs_mid_poolp);
822 if (kmem_cache_destroy(cifs_mid_cachep)) 816 kmem_cache_destroy(cifs_mid_cachep);
823 printk(KERN_WARNING 817 kmem_cache_destroy(cifs_oplock_cachep);
824 "cifs_destroy_mids: error not all structures were freed\n");
825
826 if (kmem_cache_destroy(cifs_oplock_cachep))
827 printk(KERN_WARNING
828 "error not all oplock structures were freed\n");
829} 818}
830 819
831static int cifs_oplock_thread(void * dummyarg) 820static int cifs_oplock_thread(void * dummyarg)
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 9aeb58a7d369..b27b34537bf2 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -216,10 +216,9 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
216 216
217 if (allocation_size < end_of_file) 217 if (allocation_size < end_of_file)
218 cFYI(1, ("May be sparse file, allocation less than file size")); 218 cFYI(1, ("May be sparse file, allocation less than file size"));
219 cFYI(1, ("File Size %ld and blocks %llu and blocksize %ld", 219 cFYI(1, ("File Size %ld and blocks %llu",
220 (unsigned long)tmp_inode->i_size, 220 (unsigned long)tmp_inode->i_size,
221 (unsigned long long)tmp_inode->i_blocks, 221 (unsigned long long)tmp_inode->i_blocks));
222 tmp_inode->i_blksize));
223 if (S_ISREG(tmp_inode->i_mode)) { 222 if (S_ISREG(tmp_inode->i_mode)) {
224 cFYI(1, ("File inode")); 223 cFYI(1, ("File inode"));
225 tmp_inode->i_op = &cifs_file_inode_ops; 224 tmp_inode->i_op = &cifs_file_inode_ops;
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index 5597080cb811..95a54253c047 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -110,8 +110,6 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr)
110 inode->i_nlink = attr->va_nlink; 110 inode->i_nlink = attr->va_nlink;
111 if (attr->va_size != -1) 111 if (attr->va_size != -1)
112 inode->i_size = attr->va_size; 112 inode->i_size = attr->va_size;
113 if (attr->va_blocksize != -1)
114 inode->i_blksize = attr->va_blocksize;
115 if (attr->va_size != -1) 113 if (attr->va_size != -1)
116 inode->i_blocks = (attr->va_size + 511) >> 9; 114 inode->i_blocks = (attr->va_size + 511) >> 9;
117 if (attr->va_atime.tv_sec != -1) 115 if (attr->va_atime.tv_sec != -1)
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 71f2ea632e53..8651ea6a23b7 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -513,7 +513,7 @@ static int coda_venus_readdir(struct file *filp, filldir_t filldir,
513 ino_t ino; 513 ino_t ino;
514 int ret, i; 514 int ret, i;
515 515
516 vdir = (struct venus_dirent *)kmalloc(sizeof(*vdir), GFP_KERNEL); 516 vdir = kmalloc(sizeof(*vdir), GFP_KERNEL);
517 if (!vdir) return -ENOMEM; 517 if (!vdir) return -ENOMEM;
518 518
519 i = filp->f_pos; 519 i = filp->f_pos;
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 87f1dc8aa24b..88d123321164 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -80,8 +80,7 @@ int coda_init_inodecache(void)
80 80
81void coda_destroy_inodecache(void) 81void coda_destroy_inodecache(void)
82{ 82{
83 if (kmem_cache_destroy(coda_inode_cachep)) 83 kmem_cache_destroy(coda_inode_cachep);
84 printk(KERN_INFO "coda_inode_cache: not all structures were freed\n");
85} 84}
86 85
87static int coda_remount(struct super_block *sb, int *flags, char *data) 86static int coda_remount(struct super_block *sb, int *flags, char *data)
diff --git a/fs/compat.c b/fs/compat.c
index e31e9cf96647..ce982f6e8c80 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1855,7 +1855,7 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp,
1855 1855
1856 } while (!ret && !timeout && tsp && (ts.tv_sec || ts.tv_nsec)); 1856 } while (!ret && !timeout && tsp && (ts.tv_sec || ts.tv_nsec));
1857 1857
1858 if (tsp && !(current->personality & STICKY_TIMEOUTS)) { 1858 if (ret == 0 && tsp && !(current->personality & STICKY_TIMEOUTS)) {
1859 struct compat_timespec rts; 1859 struct compat_timespec rts;
1860 1860
1861 rts.tv_sec = timeout / HZ; 1861 rts.tv_sec = timeout / HZ;
@@ -1866,7 +1866,8 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp,
1866 } 1866 }
1867 if (compat_timespec_compare(&rts, &ts) >= 0) 1867 if (compat_timespec_compare(&rts, &ts) >= 0)
1868 rts = ts; 1868 rts = ts;
1869 copy_to_user(tsp, &rts, sizeof(rts)); 1869 if (copy_to_user(tsp, &rts, sizeof(rts)))
1870 ret = -EFAULT;
1870 } 1871 }
1871 1872
1872 if (ret == -ERESTARTNOHAND) { 1873 if (ret == -ERESTARTNOHAND) {
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index f499803743e0..85105e50f7db 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -274,9 +274,8 @@ static int check_perm(struct inode * inode, struct file * file)
274 /* No error? Great, allocate a buffer for the file, and store it 274 /* No error? Great, allocate a buffer for the file, and store it
275 * it in file->private_data for easy access. 275 * it in file->private_data for easy access.
276 */ 276 */
277 buffer = kmalloc(sizeof(struct configfs_buffer),GFP_KERNEL); 277 buffer = kzalloc(sizeof(struct configfs_buffer),GFP_KERNEL);
278 if (buffer) { 278 if (buffer) {
279 memset(buffer,0,sizeof(struct configfs_buffer));
280 init_MUTEX(&buffer->sem); 279 init_MUTEX(&buffer->sem);
281 buffer->needs_read_fill = 1; 280 buffer->needs_read_fill = 1;
282 buffer->ops = ops; 281 buffer->ops = ops;
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index e14488ca6411..fb18917954a9 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -76,11 +76,10 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
76 76
77 if (!sd_iattr) { 77 if (!sd_iattr) {
78 /* setting attributes for the first time, allocate now */ 78 /* setting attributes for the first time, allocate now */
79 sd_iattr = kmalloc(sizeof(struct iattr), GFP_KERNEL); 79 sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL);
80 if (!sd_iattr) 80 if (!sd_iattr)
81 return -ENOMEM; 81 return -ENOMEM;
82 /* assign default attributes */ 82 /* assign default attributes */
83 memset(sd_iattr, 0, sizeof(struct iattr));
84 sd_iattr->ia_mode = sd->s_mode; 83 sd_iattr->ia_mode = sd->s_mode;
85 sd_iattr->ia_uid = 0; 84 sd_iattr->ia_uid = 0;
86 sd_iattr->ia_gid = 0; 85 sd_iattr->ia_gid = 0;
@@ -136,7 +135,6 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
136{ 135{
137 struct inode * inode = new_inode(configfs_sb); 136 struct inode * inode = new_inode(configfs_sb);
138 if (inode) { 137 if (inode) {
139 inode->i_blksize = PAGE_CACHE_SIZE;
140 inode->i_blocks = 0; 138 inode->i_blocks = 0;
141 inode->i_mapping->a_ops = &configfs_aops; 139 inode->i_mapping->a_ops = &configfs_aops;
142 inode->i_mapping->backing_dev_info = &configfs_backing_dev_info; 140 inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 223c0431042d..a624c3ec8189 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -73,7 +73,6 @@ static int cramfs_iget5_set(struct inode *inode, void *opaque)
73 inode->i_uid = cramfs_inode->uid; 73 inode->i_uid = cramfs_inode->uid;
74 inode->i_size = cramfs_inode->size; 74 inode->i_size = cramfs_inode->size;
75 inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1; 75 inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
76 inode->i_blksize = PAGE_CACHE_SIZE;
77 inode->i_gid = cramfs_inode->gid; 76 inode->i_gid = cramfs_inode->gid;
78 /* Struct copy intentional */ 77 /* Struct copy intentional */
79 inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime; 78 inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
@@ -242,11 +241,10 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
242 241
243 sb->s_flags |= MS_RDONLY; 242 sb->s_flags |= MS_RDONLY;
244 243
245 sbi = kmalloc(sizeof(struct cramfs_sb_info), GFP_KERNEL); 244 sbi = kzalloc(sizeof(struct cramfs_sb_info), GFP_KERNEL);
246 if (!sbi) 245 if (!sbi)
247 return -ENOMEM; 246 return -ENOMEM;
248 sb->s_fs_info = sbi; 247 sb->s_fs_info = sbi;
249 memset(sbi, 0, sizeof(struct cramfs_sb_info));
250 248
251 /* Invalidate the read buffers on mount: think disk change.. */ 249 /* Invalidate the read buffers on mount: think disk change.. */
252 mutex_lock(&read_mutex); 250 mutex_lock(&read_mutex);
@@ -545,8 +543,15 @@ static struct file_system_type cramfs_fs_type = {
545 543
546static int __init init_cramfs_fs(void) 544static int __init init_cramfs_fs(void)
547{ 545{
548 cramfs_uncompress_init(); 546 int rv;
549 return register_filesystem(&cramfs_fs_type); 547
548 rv = cramfs_uncompress_init();
549 if (rv < 0)
550 return rv;
551 rv = register_filesystem(&cramfs_fs_type);
552 if (rv < 0)
553 cramfs_uncompress_exit();
554 return rv;
550} 555}
551 556
552static void __exit exit_cramfs_fs(void) 557static void __exit exit_cramfs_fs(void)
diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c
index 8def89f2c438..fc3ccb74626f 100644
--- a/fs/cramfs/uncompress.c
+++ b/fs/cramfs/uncompress.c
@@ -68,11 +68,10 @@ int cramfs_uncompress_init(void)
68 return 0; 68 return 0;
69} 69}
70 70
71int cramfs_uncompress_exit(void) 71void cramfs_uncompress_exit(void)
72{ 72{
73 if (!--initialized) { 73 if (!--initialized) {
74 zlib_inflateEnd(&stream); 74 zlib_inflateEnd(&stream);
75 vfree(stream.workspace); 75 vfree(stream.workspace);
76 } 76 }
77 return 0;
78} 77}
diff --git a/fs/dcache.c b/fs/dcache.c
index 1b4a3a34ec57..17b392a2049e 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -828,17 +828,19 @@ void d_instantiate(struct dentry *entry, struct inode * inode)
828 * (or otherwise set) by the caller to indicate that it is now 828 * (or otherwise set) by the caller to indicate that it is now
829 * in use by the dcache. 829 * in use by the dcache.
830 */ 830 */
831struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode) 831static struct dentry *__d_instantiate_unique(struct dentry *entry,
832 struct inode *inode)
832{ 833{
833 struct dentry *alias; 834 struct dentry *alias;
834 int len = entry->d_name.len; 835 int len = entry->d_name.len;
835 const char *name = entry->d_name.name; 836 const char *name = entry->d_name.name;
836 unsigned int hash = entry->d_name.hash; 837 unsigned int hash = entry->d_name.hash;
837 838
838 BUG_ON(!list_empty(&entry->d_alias)); 839 if (!inode) {
839 spin_lock(&dcache_lock); 840 entry->d_inode = NULL;
840 if (!inode) 841 return NULL;
841 goto do_negative; 842 }
843
842 list_for_each_entry(alias, &inode->i_dentry, d_alias) { 844 list_for_each_entry(alias, &inode->i_dentry, d_alias) {
843 struct qstr *qstr = &alias->d_name; 845 struct qstr *qstr = &alias->d_name;
844 846
@@ -851,19 +853,35 @@ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
851 if (memcmp(qstr->name, name, len)) 853 if (memcmp(qstr->name, name, len))
852 continue; 854 continue;
853 dget_locked(alias); 855 dget_locked(alias);
854 spin_unlock(&dcache_lock);
855 BUG_ON(!d_unhashed(alias));
856 iput(inode);
857 return alias; 856 return alias;
858 } 857 }
858
859 list_add(&entry->d_alias, &inode->i_dentry); 859 list_add(&entry->d_alias, &inode->i_dentry);
860do_negative:
861 entry->d_inode = inode; 860 entry->d_inode = inode;
862 fsnotify_d_instantiate(entry, inode); 861 fsnotify_d_instantiate(entry, inode);
863 spin_unlock(&dcache_lock);
864 security_d_instantiate(entry, inode);
865 return NULL; 862 return NULL;
866} 863}
864
865struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
866{
867 struct dentry *result;
868
869 BUG_ON(!list_empty(&entry->d_alias));
870
871 spin_lock(&dcache_lock);
872 result = __d_instantiate_unique(entry, inode);
873 spin_unlock(&dcache_lock);
874
875 if (!result) {
876 security_d_instantiate(entry, inode);
877 return NULL;
878 }
879
880 BUG_ON(!d_unhashed(result));
881 iput(inode);
882 return result;
883}
884
867EXPORT_SYMBOL(d_instantiate_unique); 885EXPORT_SYMBOL(d_instantiate_unique);
868 886
869/** 887/**
@@ -1235,6 +1253,11 @@ static void __d_rehash(struct dentry * entry, struct hlist_head *list)
1235 hlist_add_head_rcu(&entry->d_hash, list); 1253 hlist_add_head_rcu(&entry->d_hash, list);
1236} 1254}
1237 1255
1256static void _d_rehash(struct dentry * entry)
1257{
1258 __d_rehash(entry, d_hash(entry->d_parent, entry->d_name.hash));
1259}
1260
1238/** 1261/**
1239 * d_rehash - add an entry back to the hash 1262 * d_rehash - add an entry back to the hash
1240 * @entry: dentry to add to the hash 1263 * @entry: dentry to add to the hash
@@ -1244,11 +1267,9 @@ static void __d_rehash(struct dentry * entry, struct hlist_head *list)
1244 1267
1245void d_rehash(struct dentry * entry) 1268void d_rehash(struct dentry * entry)
1246{ 1269{
1247 struct hlist_head *list = d_hash(entry->d_parent, entry->d_name.hash);
1248
1249 spin_lock(&dcache_lock); 1270 spin_lock(&dcache_lock);
1250 spin_lock(&entry->d_lock); 1271 spin_lock(&entry->d_lock);
1251 __d_rehash(entry, list); 1272 _d_rehash(entry);
1252 spin_unlock(&entry->d_lock); 1273 spin_unlock(&entry->d_lock);
1253 spin_unlock(&dcache_lock); 1274 spin_unlock(&dcache_lock);
1254} 1275}
@@ -1386,6 +1407,120 @@ already_unhashed:
1386 spin_unlock(&dcache_lock); 1407 spin_unlock(&dcache_lock);
1387} 1408}
1388 1409
1410/*
1411 * Prepare an anonymous dentry for life in the superblock's dentry tree as a
1412 * named dentry in place of the dentry to be replaced.
1413 */
1414static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
1415{
1416 struct dentry *dparent, *aparent;
1417
1418 switch_names(dentry, anon);
1419 do_switch(dentry->d_name.len, anon->d_name.len);
1420 do_switch(dentry->d_name.hash, anon->d_name.hash);
1421
1422 dparent = dentry->d_parent;
1423 aparent = anon->d_parent;
1424
1425 dentry->d_parent = (aparent == anon) ? dentry : aparent;
1426 list_del(&dentry->d_u.d_child);
1427 if (!IS_ROOT(dentry))
1428 list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
1429 else
1430 INIT_LIST_HEAD(&dentry->d_u.d_child);
1431
1432 anon->d_parent = (dparent == dentry) ? anon : dparent;
1433 list_del(&anon->d_u.d_child);
1434 if (!IS_ROOT(anon))
1435 list_add(&anon->d_u.d_child, &anon->d_parent->d_subdirs);
1436 else
1437 INIT_LIST_HEAD(&anon->d_u.d_child);
1438
1439 anon->d_flags &= ~DCACHE_DISCONNECTED;
1440}
1441
1442/**
1443 * d_materialise_unique - introduce an inode into the tree
1444 * @dentry: candidate dentry
1445 * @inode: inode to bind to the dentry, to which aliases may be attached
1446 *
1447 * Introduces an dentry into the tree, substituting an extant disconnected
1448 * root directory alias in its place if there is one
1449 */
1450struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
1451{
1452 struct dentry *alias, *actual;
1453
1454 BUG_ON(!d_unhashed(dentry));
1455
1456 spin_lock(&dcache_lock);
1457
1458 if (!inode) {
1459 actual = dentry;
1460 dentry->d_inode = NULL;
1461 goto found_lock;
1462 }
1463
1464 /* See if a disconnected directory already exists as an anonymous root
1465 * that we should splice into the tree instead */
1466 if (S_ISDIR(inode->i_mode) && (alias = __d_find_alias(inode, 1))) {
1467 spin_lock(&alias->d_lock);
1468
1469 /* Is this a mountpoint that we could splice into our tree? */
1470 if (IS_ROOT(alias))
1471 goto connect_mountpoint;
1472
1473 if (alias->d_name.len == dentry->d_name.len &&
1474 alias->d_parent == dentry->d_parent &&
1475 memcmp(alias->d_name.name,
1476 dentry->d_name.name,
1477 dentry->d_name.len) == 0)
1478 goto replace_with_alias;
1479
1480 spin_unlock(&alias->d_lock);
1481
1482 /* Doh! Seem to be aliasing directories for some reason... */
1483 dput(alias);
1484 }
1485
1486 /* Add a unique reference */
1487 actual = __d_instantiate_unique(dentry, inode);
1488 if (!actual)
1489 actual = dentry;
1490 else if (unlikely(!d_unhashed(actual)))
1491 goto shouldnt_be_hashed;
1492
1493found_lock:
1494 spin_lock(&actual->d_lock);
1495found:
1496 _d_rehash(actual);
1497 spin_unlock(&actual->d_lock);
1498 spin_unlock(&dcache_lock);
1499
1500 if (actual == dentry) {
1501 security_d_instantiate(dentry, inode);
1502 return NULL;
1503 }
1504
1505 iput(inode);
1506 return actual;
1507
1508 /* Convert the anonymous/root alias into an ordinary dentry */
1509connect_mountpoint:
1510 __d_materialise_dentry(dentry, alias);
1511
1512 /* Replace the candidate dentry with the alias in the tree */
1513replace_with_alias:
1514 __d_drop(alias);
1515 actual = alias;
1516 goto found;
1517
1518shouldnt_be_hashed:
1519 spin_unlock(&dcache_lock);
1520 BUG();
1521 goto shouldnt_be_hashed;
1522}
1523
1389/** 1524/**
1390 * d_path - return the path of a dentry 1525 * d_path - return the path of a dentry
1391 * @dentry: dentry to report 1526 * @dentry: dentry to report
@@ -1784,6 +1919,7 @@ EXPORT_SYMBOL(d_instantiate);
1784EXPORT_SYMBOL(d_invalidate); 1919EXPORT_SYMBOL(d_invalidate);
1785EXPORT_SYMBOL(d_lookup); 1920EXPORT_SYMBOL(d_lookup);
1786EXPORT_SYMBOL(d_move); 1921EXPORT_SYMBOL(d_move);
1922EXPORT_SYMBOL_GPL(d_materialise_unique);
1787EXPORT_SYMBOL(d_path); 1923EXPORT_SYMBOL(d_path);
1788EXPORT_SYMBOL(d_prune_aliases); 1924EXPORT_SYMBOL(d_prune_aliases);
1789EXPORT_SYMBOL(d_rehash); 1925EXPORT_SYMBOL(d_rehash);
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 39640fd03458..bf3901ab1744 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -32,8 +32,8 @@ static ssize_t default_write_file(struct file *file, const char __user *buf,
32 32
33static int default_open(struct inode *inode, struct file *file) 33static int default_open(struct inode *inode, struct file *file)
34{ 34{
35 if (inode->u.generic_ip) 35 if (inode->i_private)
36 file->private_data = inode->u.generic_ip; 36 file->private_data = inode->i_private;
37 37
38 return 0; 38 return 0;
39} 39}
@@ -55,12 +55,11 @@ static u64 debugfs_u8_get(void *data)
55DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n"); 55DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n");
56 56
57/** 57/**
58 * debugfs_create_u8 - create a file in the debugfs filesystem that is used to read and write an unsigned 8 bit value. 58 * debugfs_create_u8 - create a debugfs file that is used to read and write an unsigned 8-bit value
59 *
60 * @name: a pointer to a string containing the name of the file to create. 59 * @name: a pointer to a string containing the name of the file to create.
61 * @mode: the permission that the file should have 60 * @mode: the permission that the file should have
62 * @parent: a pointer to the parent dentry for this file. This should be a 61 * @parent: a pointer to the parent dentry for this file. This should be a
63 * directory dentry if set. If this paramater is NULL, then the 62 * directory dentry if set. If this parameter is %NULL, then the
64 * file will be created in the root of the debugfs filesystem. 63 * file will be created in the root of the debugfs filesystem.
65 * @value: a pointer to the variable that the file should read to and write 64 * @value: a pointer to the variable that the file should read to and write
66 * from. 65 * from.
@@ -72,11 +71,11 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n");
72 * This function will return a pointer to a dentry if it succeeds. This 71 * This function will return a pointer to a dentry if it succeeds. This
73 * pointer must be passed to the debugfs_remove() function when the file is 72 * pointer must be passed to the debugfs_remove() function when the file is
74 * to be removed (no automatic cleanup happens if your module is unloaded, 73 * to be removed (no automatic cleanup happens if your module is unloaded,
75 * you are responsible here.) If an error occurs, NULL will be returned. 74 * you are responsible here.) If an error occurs, %NULL will be returned.
76 * 75 *
77 * If debugfs is not enabled in the kernel, the value -ENODEV will be 76 * If debugfs is not enabled in the kernel, the value -%ENODEV will be
78 * returned. It is not wise to check for this value, but rather, check for 77 * returned. It is not wise to check for this value, but rather, check for
79 * NULL or !NULL instead as to eliminate the need for #ifdef in the calling 78 * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
80 * code. 79 * code.
81 */ 80 */
82struct dentry *debugfs_create_u8(const char *name, mode_t mode, 81struct dentry *debugfs_create_u8(const char *name, mode_t mode,
@@ -97,12 +96,11 @@ static u64 debugfs_u16_get(void *data)
97DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n"); 96DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n");
98 97
99/** 98/**
100 * debugfs_create_u16 - create a file in the debugfs filesystem that is used to read and write an unsigned 16 bit value. 99 * debugfs_create_u16 - create a debugfs file that is used to read and write an unsigned 16-bit value
101 *
102 * @name: a pointer to a string containing the name of the file to create. 100 * @name: a pointer to a string containing the name of the file to create.
103 * @mode: the permission that the file should have 101 * @mode: the permission that the file should have
104 * @parent: a pointer to the parent dentry for this file. This should be a 102 * @parent: a pointer to the parent dentry for this file. This should be a
105 * directory dentry if set. If this paramater is NULL, then the 103 * directory dentry if set. If this parameter is %NULL, then the
106 * file will be created in the root of the debugfs filesystem. 104 * file will be created in the root of the debugfs filesystem.
107 * @value: a pointer to the variable that the file should read to and write 105 * @value: a pointer to the variable that the file should read to and write
108 * from. 106 * from.
@@ -114,11 +112,11 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n");
114 * This function will return a pointer to a dentry if it succeeds. This 112 * This function will return a pointer to a dentry if it succeeds. This
115 * pointer must be passed to the debugfs_remove() function when the file is 113 * pointer must be passed to the debugfs_remove() function when the file is
116 * to be removed (no automatic cleanup happens if your module is unloaded, 114 * to be removed (no automatic cleanup happens if your module is unloaded,
117 * you are responsible here.) If an error occurs, NULL will be returned. 115 * you are responsible here.) If an error occurs, %NULL will be returned.
118 * 116 *
119 * If debugfs is not enabled in the kernel, the value -ENODEV will be 117 * If debugfs is not enabled in the kernel, the value -%ENODEV will be
120 * returned. It is not wise to check for this value, but rather, check for 118 * returned. It is not wise to check for this value, but rather, check for
121 * NULL or !NULL instead as to eliminate the need for #ifdef in the calling 119 * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
122 * code. 120 * code.
123 */ 121 */
124struct dentry *debugfs_create_u16(const char *name, mode_t mode, 122struct dentry *debugfs_create_u16(const char *name, mode_t mode,
@@ -139,12 +137,11 @@ static u64 debugfs_u32_get(void *data)
139DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n"); 137DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n");
140 138
141/** 139/**
142 * debugfs_create_u32 - create a file in the debugfs filesystem that is used to read and write an unsigned 32 bit value. 140 * debugfs_create_u32 - create a debugfs file that is used to read and write an unsigned 32-bit value
143 *
144 * @name: a pointer to a string containing the name of the file to create. 141 * @name: a pointer to a string containing the name of the file to create.
145 * @mode: the permission that the file should have 142 * @mode: the permission that the file should have
146 * @parent: a pointer to the parent dentry for this file. This should be a 143 * @parent: a pointer to the parent dentry for this file. This should be a
147 * directory dentry if set. If this paramater is NULL, then the 144 * directory dentry if set. If this parameter is %NULL, then the
148 * file will be created in the root of the debugfs filesystem. 145 * file will be created in the root of the debugfs filesystem.
149 * @value: a pointer to the variable that the file should read to and write 146 * @value: a pointer to the variable that the file should read to and write
150 * from. 147 * from.
@@ -156,11 +153,11 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n");
156 * This function will return a pointer to a dentry if it succeeds. This 153 * This function will return a pointer to a dentry if it succeeds. This
157 * pointer must be passed to the debugfs_remove() function when the file is 154 * pointer must be passed to the debugfs_remove() function when the file is
158 * to be removed (no automatic cleanup happens if your module is unloaded, 155 * to be removed (no automatic cleanup happens if your module is unloaded,
159 * you are responsible here.) If an error occurs, NULL will be returned. 156 * you are responsible here.) If an error occurs, %NULL will be returned.
160 * 157 *
161 * If debugfs is not enabled in the kernel, the value -ENODEV will be 158 * If debugfs is not enabled in the kernel, the value -%ENODEV will be
162 * returned. It is not wise to check for this value, but rather, check for 159 * returned. It is not wise to check for this value, but rather, check for
163 * NULL or !NULL instead as to eliminate the need for #ifdef in the calling 160 * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
164 * code. 161 * code.
165 */ 162 */
166struct dentry *debugfs_create_u32(const char *name, mode_t mode, 163struct dentry *debugfs_create_u32(const char *name, mode_t mode,
@@ -219,12 +216,11 @@ static const struct file_operations fops_bool = {
219}; 216};
220 217
221/** 218/**
222 * debugfs_create_bool - create a file in the debugfs filesystem that is used to read and write a boolean value. 219 * debugfs_create_bool - create a debugfs file that is used to read and write a boolean value
223 *
224 * @name: a pointer to a string containing the name of the file to create. 220 * @name: a pointer to a string containing the name of the file to create.
225 * @mode: the permission that the file should have 221 * @mode: the permission that the file should have
226 * @parent: a pointer to the parent dentry for this file. This should be a 222 * @parent: a pointer to the parent dentry for this file. This should be a
227 * directory dentry if set. If this paramater is NULL, then the 223 * directory dentry if set. If this parameter is %NULL, then the
228 * file will be created in the root of the debugfs filesystem. 224 * file will be created in the root of the debugfs filesystem.
229 * @value: a pointer to the variable that the file should read to and write 225 * @value: a pointer to the variable that the file should read to and write
230 * from. 226 * from.
@@ -236,11 +232,11 @@ static const struct file_operations fops_bool = {
236 * This function will return a pointer to a dentry if it succeeds. This 232 * This function will return a pointer to a dentry if it succeeds. This
237 * pointer must be passed to the debugfs_remove() function when the file is 233 * pointer must be passed to the debugfs_remove() function when the file is
238 * to be removed (no automatic cleanup happens if your module is unloaded, 234 * to be removed (no automatic cleanup happens if your module is unloaded,
239 * you are responsible here.) If an error occurs, NULL will be returned. 235 * you are responsible here.) If an error occurs, %NULL will be returned.
240 * 236 *
241 * If debugfs is not enabled in the kernel, the value -ENODEV will be 237 * If debugfs is not enabled in the kernel, the value -%ENODEV will be
242 * returned. It is not wise to check for this value, but rather, check for 238 * returned. It is not wise to check for this value, but rather, check for
243 * NULL or !NULL instead as to eliminate the need for #ifdef in the calling 239 * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
244 * code. 240 * code.
245 */ 241 */
246struct dentry *debugfs_create_bool(const char *name, mode_t mode, 242struct dentry *debugfs_create_bool(const char *name, mode_t mode,
@@ -264,13 +260,11 @@ static struct file_operations fops_blob = {
264}; 260};
265 261
266/** 262/**
267 * debugfs_create_blob - create a file in the debugfs filesystem that is 263 * debugfs_create_blob - create a debugfs file that is used to read and write a binary blob
268 * used to read and write a binary blob.
269 *
270 * @name: a pointer to a string containing the name of the file to create. 264 * @name: a pointer to a string containing the name of the file to create.
271 * @mode: the permission that the file should have 265 * @mode: the permission that the file should have
272 * @parent: a pointer to the parent dentry for this file. This should be a 266 * @parent: a pointer to the parent dentry for this file. This should be a
273 * directory dentry if set. If this paramater is NULL, then the 267 * directory dentry if set. If this parameter is %NULL, then the
274 * file will be created in the root of the debugfs filesystem. 268 * file will be created in the root of the debugfs filesystem.
275 * @blob: a pointer to a struct debugfs_blob_wrapper which contains a pointer 269 * @blob: a pointer to a struct debugfs_blob_wrapper which contains a pointer
276 * to the blob data and the size of the data. 270 * to the blob data and the size of the data.
@@ -282,11 +276,11 @@ static struct file_operations fops_blob = {
282 * This function will return a pointer to a dentry if it succeeds. This 276 * This function will return a pointer to a dentry if it succeeds. This
283 * pointer must be passed to the debugfs_remove() function when the file is 277 * pointer must be passed to the debugfs_remove() function when the file is
284 * to be removed (no automatic cleanup happens if your module is unloaded, 278 * to be removed (no automatic cleanup happens if your module is unloaded,
285 * you are responsible here.) If an error occurs, NULL will be returned. 279 * you are responsible here.) If an error occurs, %NULL will be returned.
286 * 280 *
287 * If debugfs is not enabled in the kernel, the value -ENODEV will be 281 * If debugfs is not enabled in the kernel, the value -%ENODEV will be
288 * returned. It is not wise to check for this value, but rather, check for 282 * returned. It is not wise to check for this value, but rather, check for
289 * NULL or !NULL instead as to eliminate the need for #ifdef in the calling 283 * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
290 * code. 284 * code.
291 */ 285 */
292struct dentry *debugfs_create_blob(const char *name, mode_t mode, 286struct dentry *debugfs_create_blob(const char *name, mode_t mode,
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index e8ae3042b806..269e649e6dc6 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -40,7 +40,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
40 inode->i_mode = mode; 40 inode->i_mode = mode;
41 inode->i_uid = 0; 41 inode->i_uid = 0;
42 inode->i_gid = 0; 42 inode->i_gid = 0;
43 inode->i_blksize = PAGE_CACHE_SIZE;
44 inode->i_blocks = 0; 43 inode->i_blocks = 0;
45 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 44 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
46 switch (mode & S_IFMT) { 45 switch (mode & S_IFMT) {
@@ -162,14 +161,13 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
162 161
163/** 162/**
164 * debugfs_create_file - create a file in the debugfs filesystem 163 * debugfs_create_file - create a file in the debugfs filesystem
165 *
166 * @name: a pointer to a string containing the name of the file to create. 164 * @name: a pointer to a string containing the name of the file to create.
167 * @mode: the permission that the file should have 165 * @mode: the permission that the file should have
168 * @parent: a pointer to the parent dentry for this file. This should be a 166 * @parent: a pointer to the parent dentry for this file. This should be a
169 * directory dentry if set. If this paramater is NULL, then the 167 * directory dentry if set. If this paramater is NULL, then the
170 * file will be created in the root of the debugfs filesystem. 168 * file will be created in the root of the debugfs filesystem.
171 * @data: a pointer to something that the caller will want to get to later 169 * @data: a pointer to something that the caller will want to get to later
172 * on. The inode.u.generic_ip pointer will point to this value on 170 * on. The inode.i_private pointer will point to this value on
173 * the open() call. 171 * the open() call.
174 * @fops: a pointer to a struct file_operations that should be used for 172 * @fops: a pointer to a struct file_operations that should be used for
175 * this file. 173 * this file.
@@ -182,11 +180,11 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
182 * This function will return a pointer to a dentry if it succeeds. This 180 * This function will return a pointer to a dentry if it succeeds. This
183 * pointer must be passed to the debugfs_remove() function when the file is 181 * pointer must be passed to the debugfs_remove() function when the file is
184 * to be removed (no automatic cleanup happens if your module is unloaded, 182 * to be removed (no automatic cleanup happens if your module is unloaded,
185 * you are responsible here.) If an error occurs, NULL will be returned. 183 * you are responsible here.) If an error occurs, %NULL will be returned.
186 * 184 *
187 * If debugfs is not enabled in the kernel, the value -ENODEV will be 185 * If debugfs is not enabled in the kernel, the value -%ENODEV will be
188 * returned. It is not wise to check for this value, but rather, check for 186 * returned. It is not wise to check for this value, but rather, check for
189 * NULL or !NULL instead as to eliminate the need for #ifdef in the calling 187 * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
190 * code. 188 * code.
191 */ 189 */
192struct dentry *debugfs_create_file(const char *name, mode_t mode, 190struct dentry *debugfs_create_file(const char *name, mode_t mode,
@@ -210,7 +208,7 @@ struct dentry *debugfs_create_file(const char *name, mode_t mode,
210 208
211 if (dentry->d_inode) { 209 if (dentry->d_inode) {
212 if (data) 210 if (data)
213 dentry->d_inode->u.generic_ip = data; 211 dentry->d_inode->i_private = data;
214 if (fops) 212 if (fops)
215 dentry->d_inode->i_fop = fops; 213 dentry->d_inode->i_fop = fops;
216 } 214 }
@@ -221,7 +219,6 @@ EXPORT_SYMBOL_GPL(debugfs_create_file);
221 219
222/** 220/**
223 * debugfs_create_dir - create a directory in the debugfs filesystem 221 * debugfs_create_dir - create a directory in the debugfs filesystem
224 *
225 * @name: a pointer to a string containing the name of the directory to 222 * @name: a pointer to a string containing the name of the directory to
226 * create. 223 * create.
227 * @parent: a pointer to the parent dentry for this file. This should be a 224 * @parent: a pointer to the parent dentry for this file. This should be a
@@ -233,11 +230,11 @@ EXPORT_SYMBOL_GPL(debugfs_create_file);
233 * This function will return a pointer to a dentry if it succeeds. This 230 * This function will return a pointer to a dentry if it succeeds. This
234 * pointer must be passed to the debugfs_remove() function when the file is 231 * pointer must be passed to the debugfs_remove() function when the file is
235 * to be removed (no automatic cleanup happens if your module is unloaded, 232 * to be removed (no automatic cleanup happens if your module is unloaded,
236 * you are responsible here.) If an error occurs, NULL will be returned. 233 * you are responsible here.) If an error occurs, %NULL will be returned.
237 * 234 *
238 * If debugfs is not enabled in the kernel, the value -ENODEV will be 235 * If debugfs is not enabled in the kernel, the value -%ENODEV will be
239 * returned. It is not wise to check for this value, but rather, check for 236 * returned. It is not wise to check for this value, but rather, check for
240 * NULL or !NULL instead as to eliminate the need for #ifdef in the calling 237 * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
241 * code. 238 * code.
242 */ 239 */
243struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) 240struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
@@ -250,7 +247,6 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir);
250 247
251/** 248/**
252 * debugfs_remove - removes a file or directory from the debugfs filesystem 249 * debugfs_remove - removes a file or directory from the debugfs filesystem
253 *
254 * @dentry: a pointer to a the dentry of the file or directory to be 250 * @dentry: a pointer to a the dentry of the file or directory to be
255 * removed. 251 * removed.
256 * 252 *
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index f7aef5bb584a..5f7b5a6025bf 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -113,7 +113,6 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
113 inode->i_ino = 1; 113 inode->i_ino = 1;
114 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 114 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
115 inode->i_blocks = 0; 115 inode->i_blocks = 0;
116 inode->i_blksize = 1024;
117 inode->i_uid = inode->i_gid = 0; 116 inode->i_uid = inode->i_gid = 0;
118 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; 117 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
119 inode->i_op = &simple_dir_inode_operations; 118 inode->i_op = &simple_dir_inode_operations;
@@ -172,12 +171,11 @@ int devpts_pty_new(struct tty_struct *tty)
172 return -ENOMEM; 171 return -ENOMEM;
173 172
174 inode->i_ino = number+2; 173 inode->i_ino = number+2;
175 inode->i_blksize = 1024;
176 inode->i_uid = config.setuid ? config.uid : current->fsuid; 174 inode->i_uid = config.setuid ? config.uid : current->fsuid;
177 inode->i_gid = config.setgid ? config.gid : current->fsgid; 175 inode->i_gid = config.setgid ? config.gid : current->fsgid;
178 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 176 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
179 init_special_inode(inode, S_IFCHR|config.mode, device); 177 init_special_inode(inode, S_IFCHR|config.mode, device);
180 inode->u.generic_ip = tty; 178 inode->i_private = tty;
181 179
182 dentry = get_node(number); 180 dentry = get_node(number);
183 if (!IS_ERR(dentry) && !dentry->d_inode) 181 if (!IS_ERR(dentry) && !dentry->d_inode)
@@ -196,7 +194,7 @@ struct tty_struct *devpts_get_tty(int number)
196 tty = NULL; 194 tty = NULL;
197 if (!IS_ERR(dentry)) { 195 if (!IS_ERR(dentry)) {
198 if (dentry->d_inode) 196 if (dentry->d_inode)
199 tty = dentry->d_inode->u.generic_ip; 197 tty = dentry->d_inode->i_private;
200 dput(dentry); 198 dput(dentry);
201 } 199 }
202 200
diff --git a/fs/dquot.c b/fs/dquot.c
index 0122a279106a..9af789567e51 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -834,6 +834,9 @@ static void print_warning(struct dquot *dquot, const char warntype)
834 if (!need_print_warning(dquot) || (flag && test_and_set_bit(flag, &dquot->dq_flags))) 834 if (!need_print_warning(dquot) || (flag && test_and_set_bit(flag, &dquot->dq_flags)))
835 return; 835 return;
836 836
837 mutex_lock(&tty_mutex);
838 if (!current->signal->tty)
839 goto out_lock;
837 tty_write_message(current->signal->tty, dquot->dq_sb->s_id); 840 tty_write_message(current->signal->tty, dquot->dq_sb->s_id);
838 if (warntype == ISOFTWARN || warntype == BSOFTWARN) 841 if (warntype == ISOFTWARN || warntype == BSOFTWARN)
839 tty_write_message(current->signal->tty, ": warning, "); 842 tty_write_message(current->signal->tty, ": warning, ");
@@ -861,6 +864,8 @@ static void print_warning(struct dquot *dquot, const char warntype)
861 break; 864 break;
862 } 865 }
863 tty_write_message(current->signal->tty, msg); 866 tty_write_message(current->signal->tty, msg);
867out_lock:
868 mutex_unlock(&tty_mutex);
864} 869}
865 870
866static inline void flush_warnings(struct dquot **dquots, char *warntype) 871static inline void flush_warnings(struct dquot **dquots, char *warntype)
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 8ac2462ae5dd..b3f50651eb6b 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -90,8 +90,7 @@ static int init_inodecache(void)
90 90
91static void destroy_inodecache(void) 91static void destroy_inodecache(void)
92{ 92{
93 if (kmem_cache_destroy(efs_inode_cachep)) 93 kmem_cache_destroy(efs_inode_cachep);
94 printk(KERN_INFO "efs_inode_cache: not all structures were freed\n");
95} 94}
96 95
97static void efs_put_super(struct super_block *s) 96static void efs_put_super(struct super_block *s)
@@ -248,11 +247,10 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
248 struct buffer_head *bh; 247 struct buffer_head *bh;
249 struct inode *root; 248 struct inode *root;
250 249
251 sb = kmalloc(sizeof(struct efs_sb_info), GFP_KERNEL); 250 sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL);
252 if (!sb) 251 if (!sb)
253 return -ENOMEM; 252 return -ENOMEM;
254 s->s_fs_info = sb; 253 s->s_fs_info = sb;
255 memset(sb, 0, sizeof(struct efs_sb_info));
256 254
257 s->s_magic = EFS_SUPER_MAGIC; 255 s->s_magic = EFS_SUPER_MAGIC;
258 if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) { 256 if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) {
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 3a3567433b92..8d544334bcd2 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1590,7 +1590,6 @@ static struct inode *ep_eventpoll_inode(void)
1590 inode->i_uid = current->fsuid; 1590 inode->i_uid = current->fsuid;
1591 inode->i_gid = current->fsgid; 1591 inode->i_gid = current->fsgid;
1592 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1592 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1593 inode->i_blksize = PAGE_SIZE;
1594 return inode; 1593 return inode;
1595 1594
1596eexit_1: 1595eexit_1:
diff --git a/fs/exec.c b/fs/exec.c
index 54135df2a966..a8efe35176b0 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -595,7 +595,7 @@ static int de_thread(struct task_struct *tsk)
595 if (!newsighand) 595 if (!newsighand)
596 return -ENOMEM; 596 return -ENOMEM;
597 597
598 if (thread_group_empty(current)) 598 if (thread_group_empty(tsk))
599 goto no_thread_group; 599 goto no_thread_group;
600 600
601 /* 601 /*
@@ -620,17 +620,17 @@ static int de_thread(struct task_struct *tsk)
620 * Reparenting needs write_lock on tasklist_lock, 620 * Reparenting needs write_lock on tasklist_lock,
621 * so it is safe to do it under read_lock. 621 * so it is safe to do it under read_lock.
622 */ 622 */
623 if (unlikely(current->group_leader == child_reaper)) 623 if (unlikely(tsk->group_leader == child_reaper))
624 child_reaper = current; 624 child_reaper = tsk;
625 625
626 zap_other_threads(current); 626 zap_other_threads(tsk);
627 read_unlock(&tasklist_lock); 627 read_unlock(&tasklist_lock);
628 628
629 /* 629 /*
630 * Account for the thread group leader hanging around: 630 * Account for the thread group leader hanging around:
631 */ 631 */
632 count = 1; 632 count = 1;
633 if (!thread_group_leader(current)) { 633 if (!thread_group_leader(tsk)) {
634 count = 2; 634 count = 2;
635 /* 635 /*
636 * The SIGALRM timer survives the exec, but needs to point 636 * The SIGALRM timer survives the exec, but needs to point
@@ -639,14 +639,14 @@ static int de_thread(struct task_struct *tsk)
639 * synchronize with any firing (by calling del_timer_sync) 639 * synchronize with any firing (by calling del_timer_sync)
640 * before we can safely let the old group leader die. 640 * before we can safely let the old group leader die.
641 */ 641 */
642 sig->tsk = current; 642 sig->tsk = tsk;
643 spin_unlock_irq(lock); 643 spin_unlock_irq(lock);
644 if (hrtimer_cancel(&sig->real_timer)) 644 if (hrtimer_cancel(&sig->real_timer))
645 hrtimer_restart(&sig->real_timer); 645 hrtimer_restart(&sig->real_timer);
646 spin_lock_irq(lock); 646 spin_lock_irq(lock);
647 } 647 }
648 while (atomic_read(&sig->count) > count) { 648 while (atomic_read(&sig->count) > count) {
649 sig->group_exit_task = current; 649 sig->group_exit_task = tsk;
650 sig->notify_count = count; 650 sig->notify_count = count;
651 __set_current_state(TASK_UNINTERRUPTIBLE); 651 __set_current_state(TASK_UNINTERRUPTIBLE);
652 spin_unlock_irq(lock); 652 spin_unlock_irq(lock);
@@ -662,13 +662,13 @@ static int de_thread(struct task_struct *tsk)
662 * do is to wait for the thread group leader to become inactive, 662 * do is to wait for the thread group leader to become inactive,
663 * and to assume its PID: 663 * and to assume its PID:
664 */ 664 */
665 if (!thread_group_leader(current)) { 665 if (!thread_group_leader(tsk)) {
666 /* 666 /*
667 * Wait for the thread group leader to be a zombie. 667 * Wait for the thread group leader to be a zombie.
668 * It should already be zombie at this point, most 668 * It should already be zombie at this point, most
669 * of the time. 669 * of the time.
670 */ 670 */
671 leader = current->group_leader; 671 leader = tsk->group_leader;
672 while (leader->exit_state != EXIT_ZOMBIE) 672 while (leader->exit_state != EXIT_ZOMBIE)
673 yield(); 673 yield();
674 674
@@ -682,12 +682,12 @@ static int de_thread(struct task_struct *tsk)
682 * When we take on its identity by switching to its PID, we 682 * When we take on its identity by switching to its PID, we
683 * also take its birthdate (always earlier than our own). 683 * also take its birthdate (always earlier than our own).
684 */ 684 */
685 current->start_time = leader->start_time; 685 tsk->start_time = leader->start_time;
686 686
687 write_lock_irq(&tasklist_lock); 687 write_lock_irq(&tasklist_lock);
688 688
689 BUG_ON(leader->tgid != current->tgid); 689 BUG_ON(leader->tgid != tsk->tgid);
690 BUG_ON(current->pid == current->tgid); 690 BUG_ON(tsk->pid == tsk->tgid);
691 /* 691 /*
692 * An exec() starts a new thread group with the 692 * An exec() starts a new thread group with the
693 * TGID of the previous thread group. Rehash the 693 * TGID of the previous thread group. Rehash the
@@ -696,24 +696,21 @@ static int de_thread(struct task_struct *tsk)
696 */ 696 */
697 697
698 /* Become a process group leader with the old leader's pid. 698 /* Become a process group leader with the old leader's pid.
699 * Note: The old leader also uses thispid until release_task 699 * The old leader becomes a thread of the this thread group.
700 * Note: The old leader also uses this pid until release_task
700 * is called. Odd but simple and correct. 701 * is called. Odd but simple and correct.
701 */ 702 */
702 detach_pid(current, PIDTYPE_PID); 703 detach_pid(tsk, PIDTYPE_PID);
703 current->pid = leader->pid; 704 tsk->pid = leader->pid;
704 attach_pid(current, PIDTYPE_PID, current->pid); 705 attach_pid(tsk, PIDTYPE_PID, tsk->pid);
705 attach_pid(current, PIDTYPE_PGID, current->signal->pgrp); 706 transfer_pid(leader, tsk, PIDTYPE_PGID);
706 attach_pid(current, PIDTYPE_SID, current->signal->session); 707 transfer_pid(leader, tsk, PIDTYPE_SID);
707 list_replace_rcu(&leader->tasks, &current->tasks); 708 list_replace_rcu(&leader->tasks, &tsk->tasks);
708 709
709 current->group_leader = current; 710 tsk->group_leader = tsk;
710 leader->group_leader = current; 711 leader->group_leader = tsk;
711 712
712 /* Reduce leader to a thread */ 713 tsk->exit_signal = SIGCHLD;
713 detach_pid(leader, PIDTYPE_PGID);
714 detach_pid(leader, PIDTYPE_SID);
715
716 current->exit_signal = SIGCHLD;
717 714
718 BUG_ON(leader->exit_state != EXIT_ZOMBIE); 715 BUG_ON(leader->exit_state != EXIT_ZOMBIE);
719 leader->exit_state = EXIT_DEAD; 716 leader->exit_state = EXIT_DEAD;
@@ -753,7 +750,7 @@ no_thread_group:
753 spin_lock(&oldsighand->siglock); 750 spin_lock(&oldsighand->siglock);
754 spin_lock_nested(&newsighand->siglock, SINGLE_DEPTH_NESTING); 751 spin_lock_nested(&newsighand->siglock, SINGLE_DEPTH_NESTING);
755 752
756 rcu_assign_pointer(current->sighand, newsighand); 753 rcu_assign_pointer(tsk->sighand, newsighand);
757 recalc_sigpending(); 754 recalc_sigpending();
758 755
759 spin_unlock(&newsighand->siglock); 756 spin_unlock(&newsighand->siglock);
@@ -764,7 +761,7 @@ no_thread_group:
764 kmem_cache_free(sighand_cachep, oldsighand); 761 kmem_cache_free(sighand_cachep, oldsighand);
765 } 762 }
766 763
767 BUG_ON(!thread_group_leader(current)); 764 BUG_ON(!thread_group_leader(tsk));
768 return 0; 765 return 0;
769} 766}
770 767
@@ -901,8 +898,7 @@ int flush_old_exec(struct linux_binprm * bprm)
901 return 0; 898 return 0;
902 899
903mmap_failed: 900mmap_failed:
904 put_files_struct(current->files); 901 reset_files_struct(current, files);
905 current->files = files;
906out: 902out:
907 return retval; 903 return retval;
908} 904}
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index da52b4a5db64..7c420b800c34 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -89,8 +89,8 @@ ext2_acl_to_disk(const struct posix_acl *acl, size_t *size)
89 size_t n; 89 size_t n;
90 90
91 *size = ext2_acl_size(acl->a_count); 91 *size = ext2_acl_size(acl->a_count);
92 ext_acl = (ext2_acl_header *)kmalloc(sizeof(ext2_acl_header) + 92 ext_acl = kmalloc(sizeof(ext2_acl_header) + acl->a_count *
93 acl->a_count * sizeof(ext2_acl_entry), GFP_KERNEL); 93 sizeof(ext2_acl_entry), GFP_KERNEL);
94 if (!ext_acl) 94 if (!ext_acl)
95 return ERR_PTR(-ENOMEM); 95 return ERR_PTR(-ENOMEM);
96 ext_acl->a_version = cpu_to_le32(EXT2_ACL_VERSION); 96 ext_acl->a_version = cpu_to_le32(EXT2_ACL_VERSION);
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 695f69ccf908..2cb545bf0f3c 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -574,7 +574,6 @@ got:
574 inode->i_mode = mode; 574 inode->i_mode = mode;
575 575
576 inode->i_ino = ino; 576 inode->i_ino = ino;
577 inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size (for stat), not the fs block size */
578 inode->i_blocks = 0; 577 inode->i_blocks = 0;
579 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 578 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
580 memset(ei->i_data, 0, sizeof(ei->i_data)); 579 memset(ei->i_data, 0, sizeof(ei->i_data));
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index fb4d3220eb8d..dd4e14c221e0 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1094,7 +1094,6 @@ void ext2_read_inode (struct inode * inode)
1094 brelse (bh); 1094 brelse (bh);
1095 goto bad_inode; 1095 goto bad_inode;
1096 } 1096 }
1097 inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size (for stat), not the fs block size */
1098 inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); 1097 inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
1099 ei->i_flags = le32_to_cpu(raw_inode->i_flags); 1098 ei->i_flags = le32_to_cpu(raw_inode->i_flags);
1100 ei->i_faddr = le32_to_cpu(raw_inode->i_faddr); 1099 ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 4286ff6330b6..513cd421ac0b 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -184,8 +184,7 @@ static int init_inodecache(void)
184 184
185static void destroy_inodecache(void) 185static void destroy_inodecache(void)
186{ 186{
187 if (kmem_cache_destroy(ext2_inode_cachep)) 187 kmem_cache_destroy(ext2_inode_cachep);
188 printk(KERN_INFO "ext2_inode_cache: not all structures were freed\n");
189} 188}
190 189
191static void ext2_clear_inode(struct inode *inode) 190static void ext2_clear_inode(struct inode *inode)
@@ -544,17 +543,24 @@ static int ext2_check_descriptors (struct super_block * sb)
544 int i; 543 int i;
545 int desc_block = 0; 544 int desc_block = 0;
546 struct ext2_sb_info *sbi = EXT2_SB(sb); 545 struct ext2_sb_info *sbi = EXT2_SB(sb);
547 unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block); 546 unsigned long first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
547 unsigned long last_block;
548 struct ext2_group_desc * gdp = NULL; 548 struct ext2_group_desc * gdp = NULL;
549 549
550 ext2_debug ("Checking group descriptors"); 550 ext2_debug ("Checking group descriptors");
551 551
552 for (i = 0; i < sbi->s_groups_count; i++) 552 for (i = 0; i < sbi->s_groups_count; i++)
553 { 553 {
554 if (i == sbi->s_groups_count - 1)
555 last_block = le32_to_cpu(sbi->s_es->s_blocks_count) - 1;
556 else
557 last_block = first_block +
558 (EXT2_BLOCKS_PER_GROUP(sb) - 1);
559
554 if ((i % EXT2_DESC_PER_BLOCK(sb)) == 0) 560 if ((i % EXT2_DESC_PER_BLOCK(sb)) == 0)
555 gdp = (struct ext2_group_desc *) sbi->s_group_desc[desc_block++]->b_data; 561 gdp = (struct ext2_group_desc *) sbi->s_group_desc[desc_block++]->b_data;
556 if (le32_to_cpu(gdp->bg_block_bitmap) < block || 562 if (le32_to_cpu(gdp->bg_block_bitmap) < first_block ||
557 le32_to_cpu(gdp->bg_block_bitmap) >= block + EXT2_BLOCKS_PER_GROUP(sb)) 563 le32_to_cpu(gdp->bg_block_bitmap) > last_block)
558 { 564 {
559 ext2_error (sb, "ext2_check_descriptors", 565 ext2_error (sb, "ext2_check_descriptors",
560 "Block bitmap for group %d" 566 "Block bitmap for group %d"
@@ -562,8 +568,8 @@ static int ext2_check_descriptors (struct super_block * sb)
562 i, (unsigned long) le32_to_cpu(gdp->bg_block_bitmap)); 568 i, (unsigned long) le32_to_cpu(gdp->bg_block_bitmap));
563 return 0; 569 return 0;
564 } 570 }
565 if (le32_to_cpu(gdp->bg_inode_bitmap) < block || 571 if (le32_to_cpu(gdp->bg_inode_bitmap) < first_block ||
566 le32_to_cpu(gdp->bg_inode_bitmap) >= block + EXT2_BLOCKS_PER_GROUP(sb)) 572 le32_to_cpu(gdp->bg_inode_bitmap) > last_block)
567 { 573 {
568 ext2_error (sb, "ext2_check_descriptors", 574 ext2_error (sb, "ext2_check_descriptors",
569 "Inode bitmap for group %d" 575 "Inode bitmap for group %d"
@@ -571,9 +577,9 @@ static int ext2_check_descriptors (struct super_block * sb)
571 i, (unsigned long) le32_to_cpu(gdp->bg_inode_bitmap)); 577 i, (unsigned long) le32_to_cpu(gdp->bg_inode_bitmap));
572 return 0; 578 return 0;
573 } 579 }
574 if (le32_to_cpu(gdp->bg_inode_table) < block || 580 if (le32_to_cpu(gdp->bg_inode_table) < first_block ||
575 le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group >= 581 le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group >
576 block + EXT2_BLOCKS_PER_GROUP(sb)) 582 last_block)
577 { 583 {
578 ext2_error (sb, "ext2_check_descriptors", 584 ext2_error (sb, "ext2_check_descriptors",
579 "Inode table for group %d" 585 "Inode table for group %d"
@@ -581,7 +587,7 @@ static int ext2_check_descriptors (struct super_block * sb)
581 i, (unsigned long) le32_to_cpu(gdp->bg_inode_table)); 587 i, (unsigned long) le32_to_cpu(gdp->bg_inode_table));
582 return 0; 588 return 0;
583 } 589 }
584 block += EXT2_BLOCKS_PER_GROUP(sb); 590 first_block += EXT2_BLOCKS_PER_GROUP(sb);
585 gdp++; 591 gdp++;
586 } 592 }
587 return 1; 593 return 1;
@@ -648,11 +654,10 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
648 int i, j; 654 int i, j;
649 __le32 features; 655 __le32 features;
650 656
651 sbi = kmalloc(sizeof(*sbi), GFP_KERNEL); 657 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
652 if (!sbi) 658 if (!sbi)
653 return -ENOMEM; 659 return -ENOMEM;
654 sb->s_fs_info = sbi; 660 sb->s_fs_info = sbi;
655 memset(sbi, 0, sizeof(*sbi));
656 661
657 /* 662 /*
658 * See what the current blocksize for the device is, and 663 * See what the current blocksize for the device is, and
@@ -861,10 +866,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
861 866
862 if (EXT2_BLOCKS_PER_GROUP(sb) == 0) 867 if (EXT2_BLOCKS_PER_GROUP(sb) == 0)
863 goto cantfind_ext2; 868 goto cantfind_ext2;
864 sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) - 869 sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
865 le32_to_cpu(es->s_first_data_block) + 870 le32_to_cpu(es->s_first_data_block) - 1)
866 EXT2_BLOCKS_PER_GROUP(sb) - 1) / 871 / EXT2_BLOCKS_PER_GROUP(sb)) + 1;
867 EXT2_BLOCKS_PER_GROUP(sb);
868 db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) / 872 db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) /
869 EXT2_DESC_PER_BLOCK(sb); 873 EXT2_DESC_PER_BLOCK(sb);
870 sbi->s_group_desc = kmalloc (db_count * sizeof (struct buffer_head *), GFP_KERNEL); 874 sbi->s_group_desc = kmalloc (db_count * sizeof (struct buffer_head *), GFP_KERNEL);
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 86ae8e93adb9..af52a7f8b291 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -521,11 +521,10 @@ bad_block: ext2_error(sb, "ext2_xattr_set",
521 } 521 }
522 } else { 522 } else {
523 /* Allocate a buffer where we construct the new block. */ 523 /* Allocate a buffer where we construct the new block. */
524 header = kmalloc(sb->s_blocksize, GFP_KERNEL); 524 header = kzalloc(sb->s_blocksize, GFP_KERNEL);
525 error = -ENOMEM; 525 error = -ENOMEM;
526 if (header == NULL) 526 if (header == NULL)
527 goto cleanup; 527 goto cleanup;
528 memset(header, 0, sb->s_blocksize);
529 end = (char *)header + sb->s_blocksize; 528 end = (char *)header + sb->s_blocksize;
530 header->h_magic = cpu_to_le32(EXT2_XATTR_MAGIC); 529 header->h_magic = cpu_to_le32(EXT2_XATTR_MAGIC);
531 header->h_blocks = header->h_refcount = cpu_to_le32(1); 530 header->h_blocks = header->h_refcount = cpu_to_le32(1);
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 0d21d558b87a..1e5038d9a01b 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -90,8 +90,8 @@ ext3_acl_to_disk(const struct posix_acl *acl, size_t *size)
90 size_t n; 90 size_t n;
91 91
92 *size = ext3_acl_size(acl->a_count); 92 *size = ext3_acl_size(acl->a_count);
93 ext_acl = (ext3_acl_header *)kmalloc(sizeof(ext3_acl_header) + 93 ext_acl = kmalloc(sizeof(ext3_acl_header) + acl->a_count *
94 acl->a_count * sizeof(ext3_acl_entry), GFP_KERNEL); 94 sizeof(ext3_acl_entry), GFP_KERNEL);
95 if (!ext_acl) 95 if (!ext_acl)
96 return ERR_PTR(-ENOMEM); 96 return ERR_PTR(-ENOMEM);
97 ext_acl->a_version = cpu_to_le32(EXT3_ACL_VERSION); 97 ext_acl->a_version = cpu_to_le32(EXT3_ACL_VERSION);
@@ -258,7 +258,7 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
258 default: 258 default:
259 return -EINVAL; 259 return -EINVAL;
260 } 260 }
261 if (acl) { 261 if (acl) {
262 value = ext3_acl_to_disk(acl, &size); 262 value = ext3_acl_to_disk(acl, &size);
263 if (IS_ERR(value)) 263 if (IS_ERR(value))
264 return (int)PTR_ERR(value); 264 return (int)PTR_ERR(value);
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 063d994bda0b..b41a7d7e20f0 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -38,6 +38,13 @@
38 38
39#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 39#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
40 40
41/**
42 * ext3_get_group_desc() -- load group descriptor from disk
43 * @sb: super block
44 * @block_group: given block group
45 * @bh: pointer to the buffer head to store the block
46 * group descriptor
47 */
41struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, 48struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
42 unsigned int block_group, 49 unsigned int block_group,
43 struct buffer_head ** bh) 50 struct buffer_head ** bh)
@@ -73,8 +80,12 @@ struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
73 return desc + offset; 80 return desc + offset;
74} 81}
75 82
76/* 83/**
77 * Read the bitmap for a given block_group, reading into the specified 84 * read_block_bitmap()
85 * @sb: super block
86 * @block_group: given block group
87 *
88 * Read the bitmap for a given block_group, reading into the specified
78 * slot in the superblock's bitmap cache. 89 * slot in the superblock's bitmap cache.
79 * 90 *
80 * Return buffer_head on success or NULL in case of failure. 91 * Return buffer_head on success or NULL in case of failure.
@@ -103,15 +114,22 @@ error_out:
103 * Operations include: 114 * Operations include:
104 * dump, find, add, remove, is_empty, find_next_reservable_window, etc. 115 * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
105 * 116 *
106 * We use sorted double linked list for the per-filesystem reservation 117 * We use a red-black tree to represent per-filesystem reservation
107 * window list. (like in vm_region). 118 * windows.
119 *
120 */
121
122/**
123 * __rsv_window_dump() -- Dump the filesystem block allocation reservation map
124 * @rb_root: root of per-filesystem reservation rb tree
125 * @verbose: verbose mode
126 * @fn: function which wishes to dump the reservation map
108 * 127 *
109 * Initially, we keep those small operations in the abstract functions, 128 * If verbose is turned on, it will print the whole block reservation
110 * so later if we need a better searching tree than double linked-list, 129 * windows(start, end). Otherwise, it will only print out the "bad" windows,
111 * we could easily switch to that without changing too much 130 * those windows that overlap with their immediate neighbors.
112 * code.
113 */ 131 */
114#if 0 132#if 1
115static void __rsv_window_dump(struct rb_root *root, int verbose, 133static void __rsv_window_dump(struct rb_root *root, int verbose,
116 const char *fn) 134 const char *fn)
117{ 135{
@@ -129,7 +147,7 @@ restart:
129 rsv = list_entry(n, struct ext3_reserve_window_node, rsv_node); 147 rsv = list_entry(n, struct ext3_reserve_window_node, rsv_node);
130 if (verbose) 148 if (verbose)
131 printk("reservation window 0x%p " 149 printk("reservation window 0x%p "
132 "start: %d, end: %d\n", 150 "start: %lu, end: %lu\n",
133 rsv, rsv->rsv_start, rsv->rsv_end); 151 rsv, rsv->rsv_start, rsv->rsv_end);
134 if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) { 152 if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
135 printk("Bad reservation %p (start >= end)\n", 153 printk("Bad reservation %p (start >= end)\n",
@@ -161,6 +179,22 @@ restart:
161#define rsv_window_dump(root, verbose) do {} while (0) 179#define rsv_window_dump(root, verbose) do {} while (0)
162#endif 180#endif
163 181
182/**
183 * goal_in_my_reservation()
184 * @rsv: inode's reservation window
185 * @grp_goal: given goal block relative to the allocation block group
186 * @group: the current allocation block group
187 * @sb: filesystem super block
188 *
189 * Test if the given goal block (group relative) is within the file's
190 * own block reservation window range.
191 *
192 * If the reservation window is outside the goal allocation group, return 0;
193 * grp_goal (given goal block) could be -1, which means no specific
194 * goal block. In this case, always return 1.
195 * If the goal block is within the reservation window, return 1;
196 * otherwise, return 0;
197 */
164static int 198static int
165goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal, 199goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal,
166 unsigned int group, struct super_block * sb) 200 unsigned int group, struct super_block * sb)
@@ -168,7 +202,7 @@ goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal,
168 ext3_fsblk_t group_first_block, group_last_block; 202 ext3_fsblk_t group_first_block, group_last_block;
169 203
170 group_first_block = ext3_group_first_block_no(sb, group); 204 group_first_block = ext3_group_first_block_no(sb, group);
171 group_last_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1; 205 group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
172 206
173 if ((rsv->_rsv_start > group_last_block) || 207 if ((rsv->_rsv_start > group_last_block) ||
174 (rsv->_rsv_end < group_first_block)) 208 (rsv->_rsv_end < group_first_block))
@@ -179,7 +213,11 @@ goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal,
179 return 1; 213 return 1;
180} 214}
181 215
182/* 216/**
217 * search_reserve_window()
218 * @rb_root: root of reservation tree
219 * @goal: target allocation block
220 *
183 * Find the reserved window which includes the goal, or the previous one 221 * Find the reserved window which includes the goal, or the previous one
184 * if the goal is not in any window. 222 * if the goal is not in any window.
185 * Returns NULL if there are no windows or if all windows start after the goal. 223 * Returns NULL if there are no windows or if all windows start after the goal.
@@ -216,6 +254,13 @@ search_reserve_window(struct rb_root *root, ext3_fsblk_t goal)
216 return rsv; 254 return rsv;
217} 255}
218 256
257/**
258 * ext3_rsv_window_add() -- Insert a window to the block reservation rb tree.
259 * @sb: super block
260 * @rsv: reservation window to add
261 *
262 * Must be called with rsv_lock hold.
263 */
219void ext3_rsv_window_add(struct super_block *sb, 264void ext3_rsv_window_add(struct super_block *sb,
220 struct ext3_reserve_window_node *rsv) 265 struct ext3_reserve_window_node *rsv)
221{ 266{
@@ -236,14 +281,25 @@ void ext3_rsv_window_add(struct super_block *sb,
236 p = &(*p)->rb_left; 281 p = &(*p)->rb_left;
237 else if (start > this->rsv_end) 282 else if (start > this->rsv_end)
238 p = &(*p)->rb_right; 283 p = &(*p)->rb_right;
239 else 284 else {
285 rsv_window_dump(root, 1);
240 BUG(); 286 BUG();
287 }
241 } 288 }
242 289
243 rb_link_node(node, parent, p); 290 rb_link_node(node, parent, p);
244 rb_insert_color(node, root); 291 rb_insert_color(node, root);
245} 292}
246 293
294/**
295 * ext3_rsv_window_remove() -- unlink a window from the reservation rb tree
296 * @sb: super block
297 * @rsv: reservation window to remove
298 *
299 * Mark the block reservation window as not allocated, and unlink it
300 * from the filesystem reservation window rb tree. Must be called with
301 * rsv_lock hold.
302 */
247static void rsv_window_remove(struct super_block *sb, 303static void rsv_window_remove(struct super_block *sb,
248 struct ext3_reserve_window_node *rsv) 304 struct ext3_reserve_window_node *rsv)
249{ 305{
@@ -253,11 +309,39 @@ static void rsv_window_remove(struct super_block *sb,
253 rb_erase(&rsv->rsv_node, &EXT3_SB(sb)->s_rsv_window_root); 309 rb_erase(&rsv->rsv_node, &EXT3_SB(sb)->s_rsv_window_root);
254} 310}
255 311
312/*
313 * rsv_is_empty() -- Check if the reservation window is allocated.
314 * @rsv: given reservation window to check
315 *
316 * returns 1 if the end block is EXT3_RESERVE_WINDOW_NOT_ALLOCATED.
317 */
256static inline int rsv_is_empty(struct ext3_reserve_window *rsv) 318static inline int rsv_is_empty(struct ext3_reserve_window *rsv)
257{ 319{
258 /* a valid reservation end block could not be 0 */ 320 /* a valid reservation end block could not be 0 */
259 return (rsv->_rsv_end == EXT3_RESERVE_WINDOW_NOT_ALLOCATED); 321 return rsv->_rsv_end == EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
260} 322}
323
324/**
325 * ext3_init_block_alloc_info()
326 * @inode: file inode structure
327 *
328 * Allocate and initialize the reservation window structure, and
329 * link the window to the ext3 inode structure at last
330 *
331 * The reservation window structure is only dynamically allocated
332 * and linked to ext3 inode the first time the open file
333 * needs a new block. So, before every ext3_new_block(s) call, for
334 * regular files, we should check whether the reservation window
335 * structure exists or not. In the latter case, this function is called.
336 * Fail to do so will result in block reservation being turned off for that
337 * open file.
338 *
339 * This function is called from ext3_get_blocks_handle(), also called
340 * when setting the reservation window size through ioctl before the file
341 * is open for write (needs block allocation).
342 *
343 * Needs truncate_mutex protection prior to call this function.
344 */
261void ext3_init_block_alloc_info(struct inode *inode) 345void ext3_init_block_alloc_info(struct inode *inode)
262{ 346{
263 struct ext3_inode_info *ei = EXT3_I(inode); 347 struct ext3_inode_info *ei = EXT3_I(inode);
@@ -271,7 +355,7 @@ void ext3_init_block_alloc_info(struct inode *inode)
271 rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; 355 rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
272 rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; 356 rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
273 357
274 /* 358 /*
275 * if filesystem is mounted with NORESERVATION, the goal 359 * if filesystem is mounted with NORESERVATION, the goal
276 * reservation window size is set to zero to indicate 360 * reservation window size is set to zero to indicate
277 * block reservation is off 361 * block reservation is off
@@ -287,6 +371,19 @@ void ext3_init_block_alloc_info(struct inode *inode)
287 ei->i_block_alloc_info = block_i; 371 ei->i_block_alloc_info = block_i;
288} 372}
289 373
374/**
375 * ext3_discard_reservation()
376 * @inode: inode
377 *
378 * Discard(free) block reservation window on last file close, or truncate
379 * or at last iput().
380 *
381 * It is being called in three cases:
382 * ext3_release_file(): last writer close the file
383 * ext3_clear_inode(): last iput(), when nobody link to this file.
384 * ext3_truncate(): when the block indirect map is about to change.
385 *
386 */
290void ext3_discard_reservation(struct inode *inode) 387void ext3_discard_reservation(struct inode *inode)
291{ 388{
292 struct ext3_inode_info *ei = EXT3_I(inode); 389 struct ext3_inode_info *ei = EXT3_I(inode);
@@ -306,7 +403,14 @@ void ext3_discard_reservation(struct inode *inode)
306 } 403 }
307} 404}
308 405
309/* Free given blocks, update quota and i_blocks field */ 406/**
407 * ext3_free_blocks_sb() -- Free given blocks and update quota
408 * @handle: handle to this transaction
409 * @sb: super block
410 * @block: start physcial block to free
411 * @count: number of blocks to free
412 * @pdquot_freed_blocks: pointer to quota
413 */
310void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb, 414void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb,
311 ext3_fsblk_t block, unsigned long count, 415 ext3_fsblk_t block, unsigned long count,
312 unsigned long *pdquot_freed_blocks) 416 unsigned long *pdquot_freed_blocks)
@@ -419,8 +523,8 @@ do_more:
419 } 523 }
420 /* @@@ This prevents newly-allocated data from being 524 /* @@@ This prevents newly-allocated data from being
421 * freed and then reallocated within the same 525 * freed and then reallocated within the same
422 * transaction. 526 * transaction.
423 * 527 *
424 * Ideally we would want to allow that to happen, but to 528 * Ideally we would want to allow that to happen, but to
425 * do so requires making journal_forget() capable of 529 * do so requires making journal_forget() capable of
426 * revoking the queued write of a data block, which 530 * revoking the queued write of a data block, which
@@ -433,7 +537,7 @@ do_more:
433 * safe not to set the allocation bit in the committed 537 * safe not to set the allocation bit in the committed
434 * bitmap, because we know that there is no outstanding 538 * bitmap, because we know that there is no outstanding
435 * activity on the buffer any more and so it is safe to 539 * activity on the buffer any more and so it is safe to
436 * reallocate it. 540 * reallocate it.
437 */ 541 */
438 BUFFER_TRACE(bitmap_bh, "set in b_committed_data"); 542 BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
439 J_ASSERT_BH(bitmap_bh, 543 J_ASSERT_BH(bitmap_bh,
@@ -490,7 +594,13 @@ error_return:
490 return; 594 return;
491} 595}
492 596
493/* Free given blocks, update quota and i_blocks field */ 597/**
598 * ext3_free_blocks() -- Free given blocks and update quota
599 * @handle: handle for this transaction
600 * @inode: inode
601 * @block: start physical block to free
602 * @count: number of blocks to count
603 */
494void ext3_free_blocks(handle_t *handle, struct inode *inode, 604void ext3_free_blocks(handle_t *handle, struct inode *inode,
495 ext3_fsblk_t block, unsigned long count) 605 ext3_fsblk_t block, unsigned long count)
496{ 606{
@@ -508,7 +618,11 @@ void ext3_free_blocks(handle_t *handle, struct inode *inode,
508 return; 618 return;
509} 619}
510 620
511/* 621/**
622 * ext3_test_allocatable()
623 * @nr: given allocation block group
624 * @bh: bufferhead contains the bitmap of the given block group
625 *
512 * For ext3 allocations, we must not reuse any blocks which are 626 * For ext3 allocations, we must not reuse any blocks which are
513 * allocated in the bitmap buffer's "last committed data" copy. This 627 * allocated in the bitmap buffer's "last committed data" copy. This
514 * prevents deletes from freeing up the page for reuse until we have 628 * prevents deletes from freeing up the page for reuse until we have
@@ -518,7 +632,7 @@ void ext3_free_blocks(handle_t *handle, struct inode *inode,
518 * data would allow the old block to be overwritten before the 632 * data would allow the old block to be overwritten before the
519 * transaction committed (because we force data to disk before commit). 633 * transaction committed (because we force data to disk before commit).
520 * This would lead to corruption if we crashed between overwriting the 634 * This would lead to corruption if we crashed between overwriting the
521 * data and committing the delete. 635 * data and committing the delete.
522 * 636 *
523 * @@@ We may want to make this allocation behaviour conditional on 637 * @@@ We may want to make this allocation behaviour conditional on
524 * data-writes at some point, and disable it for metadata allocations or 638 * data-writes at some point, and disable it for metadata allocations or
@@ -541,6 +655,16 @@ static int ext3_test_allocatable(ext3_grpblk_t nr, struct buffer_head *bh)
541 return ret; 655 return ret;
542} 656}
543 657
658/**
659 * bitmap_search_next_usable_block()
660 * @start: the starting block (group relative) of the search
661 * @bh: bufferhead contains the block group bitmap
662 * @maxblocks: the ending block (group relative) of the reservation
663 *
664 * The bitmap search --- search forward alternately through the actual
665 * bitmap on disk and the last-committed copy in journal, until we find a
666 * bit free in both bitmaps.
667 */
544static ext3_grpblk_t 668static ext3_grpblk_t
545bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh, 669bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
546 ext3_grpblk_t maxblocks) 670 ext3_grpblk_t maxblocks)
@@ -548,11 +672,6 @@ bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
548 ext3_grpblk_t next; 672 ext3_grpblk_t next;
549 struct journal_head *jh = bh2jh(bh); 673 struct journal_head *jh = bh2jh(bh);
550 674
551 /*
552 * The bitmap search --- search forward alternately through the actual
553 * bitmap and the last-committed copy until we find a bit free in
554 * both
555 */
556 while (start < maxblocks) { 675 while (start < maxblocks) {
557 next = ext3_find_next_zero_bit(bh->b_data, maxblocks, start); 676 next = ext3_find_next_zero_bit(bh->b_data, maxblocks, start);
558 if (next >= maxblocks) 677 if (next >= maxblocks)
@@ -562,14 +681,20 @@ bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
562 jbd_lock_bh_state(bh); 681 jbd_lock_bh_state(bh);
563 if (jh->b_committed_data) 682 if (jh->b_committed_data)
564 start = ext3_find_next_zero_bit(jh->b_committed_data, 683 start = ext3_find_next_zero_bit(jh->b_committed_data,
565 maxblocks, next); 684 maxblocks, next);
566 jbd_unlock_bh_state(bh); 685 jbd_unlock_bh_state(bh);
567 } 686 }
568 return -1; 687 return -1;
569} 688}
570 689
571/* 690/**
572 * Find an allocatable block in a bitmap. We honour both the bitmap and 691 * find_next_usable_block()
692 * @start: the starting block (group relative) to find next
693 * allocatable block in bitmap.
694 * @bh: bufferhead contains the block group bitmap
695 * @maxblocks: the ending block (group relative) for the search
696 *
697 * Find an allocatable block in a bitmap. We honor both the bitmap and
573 * its last-committed copy (if that exists), and perform the "most 698 * its last-committed copy (if that exists), and perform the "most
574 * appropriate allocation" algorithm of looking for a free block near 699 * appropriate allocation" algorithm of looking for a free block near
575 * the initial goal; then for a free byte somewhere in the bitmap; then 700 * the initial goal; then for a free byte somewhere in the bitmap; then
@@ -584,7 +709,7 @@ find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
584 709
585 if (start > 0) { 710 if (start > 0) {
586 /* 711 /*
587 * The goal was occupied; search forward for a free 712 * The goal was occupied; search forward for a free
588 * block within the next XX blocks. 713 * block within the next XX blocks.
589 * 714 *
590 * end_goal is more or less random, but it has to be 715 * end_goal is more or less random, but it has to be
@@ -620,7 +745,11 @@ find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
620 return here; 745 return here;
621} 746}
622 747
623/* 748/**
749 * claim_block()
750 * @block: the free block (group relative) to allocate
751 * @bh: the bufferhead containts the block group bitmap
752 *
624 * We think we can allocate this block in this bitmap. Try to set the bit. 753 * We think we can allocate this block in this bitmap. Try to set the bit.
625 * If that succeeds then check that nobody has allocated and then freed the 754 * If that succeeds then check that nobody has allocated and then freed the
626 * block since we saw that is was not marked in b_committed_data. If it _was_ 755 * block since we saw that is was not marked in b_committed_data. If it _was_
@@ -646,7 +775,26 @@ claim_block(spinlock_t *lock, ext3_grpblk_t block, struct buffer_head *bh)
646 return ret; 775 return ret;
647} 776}
648 777
649/* 778/**
779 * ext3_try_to_allocate()
780 * @sb: superblock
781 * @handle: handle to this transaction
782 * @group: given allocation block group
783 * @bitmap_bh: bufferhead holds the block bitmap
784 * @grp_goal: given target block within the group
785 * @count: target number of blocks to allocate
786 * @my_rsv: reservation window
787 *
788 * Attempt to allocate blocks within a give range. Set the range of allocation
789 * first, then find the first free bit(s) from the bitmap (within the range),
790 * and at last, allocate the blocks by claiming the found free bit as allocated.
791 *
792 * To set the range of this allocation:
793 * if there is a reservation window, only try to allocate block(s) from the
794 * file's own reservation window;
795 * Otherwise, the allocation range starts from the give goal block, ends at
796 * the block group's last block.
797 *
650 * If we failed to allocate the desired block then we may end up crossing to a 798 * If we failed to allocate the desired block then we may end up crossing to a
651 * new bitmap. In that case we must release write access to the old one via 799 * new bitmap. In that case we must release write access to the old one via
652 * ext3_journal_release_buffer(), else we'll run out of credits. 800 * ext3_journal_release_buffer(), else we'll run out of credits.
@@ -703,7 +851,8 @@ repeat:
703 } 851 }
704 start = grp_goal; 852 start = grp_goal;
705 853
706 if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), grp_goal, bitmap_bh)) { 854 if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group),
855 grp_goal, bitmap_bh)) {
707 /* 856 /*
708 * The block was allocated by another thread, or it was 857 * The block was allocated by another thread, or it was
709 * allocated and then freed by another thread 858 * allocated and then freed by another thread
@@ -718,7 +867,8 @@ repeat:
718 grp_goal++; 867 grp_goal++;
719 while (num < *count && grp_goal < end 868 while (num < *count && grp_goal < end
720 && ext3_test_allocatable(grp_goal, bitmap_bh) 869 && ext3_test_allocatable(grp_goal, bitmap_bh)
721 && claim_block(sb_bgl_lock(EXT3_SB(sb), group), grp_goal, bitmap_bh)) { 870 && claim_block(sb_bgl_lock(EXT3_SB(sb), group),
871 grp_goal, bitmap_bh)) {
722 num++; 872 num++;
723 grp_goal++; 873 grp_goal++;
724 } 874 }
@@ -730,12 +880,12 @@ fail_access:
730} 880}
731 881
732/** 882/**
733 * find_next_reservable_window(): 883 * find_next_reservable_window():
734 * find a reservable space within the given range. 884 * find a reservable space within the given range.
735 * It does not allocate the reservation window for now: 885 * It does not allocate the reservation window for now:
736 * alloc_new_reservation() will do the work later. 886 * alloc_new_reservation() will do the work later.
737 * 887 *
738 * @search_head: the head of the searching list; 888 * @search_head: the head of the searching list;
739 * This is not necessarily the list head of the whole filesystem 889 * This is not necessarily the list head of the whole filesystem
740 * 890 *
741 * We have both head and start_block to assist the search 891 * We have both head and start_block to assist the search
@@ -743,12 +893,12 @@ fail_access:
743 * but we will shift to the place where start_block is, 893 * but we will shift to the place where start_block is,
744 * then start from there, when looking for a reservable space. 894 * then start from there, when looking for a reservable space.
745 * 895 *
746 * @size: the target new reservation window size 896 * @size: the target new reservation window size
747 * 897 *
748 * @group_first_block: the first block we consider to start 898 * @group_first_block: the first block we consider to start
749 * the real search from 899 * the real search from
750 * 900 *
751 * @last_block: 901 * @last_block:
752 * the maximum block number that our goal reservable space 902 * the maximum block number that our goal reservable space
753 * could start from. This is normally the last block in this 903 * could start from. This is normally the last block in this
754 * group. The search will end when we found the start of next 904 * group. The search will end when we found the start of next
@@ -756,10 +906,10 @@ fail_access:
756 * This could handle the cross boundary reservation window 906 * This could handle the cross boundary reservation window
757 * request. 907 * request.
758 * 908 *
759 * basically we search from the given range, rather than the whole 909 * basically we search from the given range, rather than the whole
760 * reservation double linked list, (start_block, last_block) 910 * reservation double linked list, (start_block, last_block)
761 * to find a free region that is of my size and has not 911 * to find a free region that is of my size and has not
762 * been reserved. 912 * been reserved.
763 * 913 *
764 */ 914 */
765static int find_next_reservable_window( 915static int find_next_reservable_window(
@@ -812,7 +962,7 @@ static int find_next_reservable_window(
812 /* 962 /*
813 * Found a reserveable space big enough. We could 963 * Found a reserveable space big enough. We could
814 * have a reservation across the group boundary here 964 * have a reservation across the group boundary here
815 */ 965 */
816 break; 966 break;
817 } 967 }
818 } 968 }
@@ -848,7 +998,7 @@ static int find_next_reservable_window(
848} 998}
849 999
850/** 1000/**
851 * alloc_new_reservation()--allocate a new reservation window 1001 * alloc_new_reservation()--allocate a new reservation window
852 * 1002 *
853 * To make a new reservation, we search part of the filesystem 1003 * To make a new reservation, we search part of the filesystem
854 * reservation list (the list that inside the group). We try to 1004 * reservation list (the list that inside the group). We try to
@@ -897,7 +1047,7 @@ static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv,
897 spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock; 1047 spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;
898 1048
899 group_first_block = ext3_group_first_block_no(sb, group); 1049 group_first_block = ext3_group_first_block_no(sb, group);
900 group_end_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1; 1050 group_end_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
901 1051
902 if (grp_goal < 0) 1052 if (grp_goal < 0)
903 start_block = group_first_block; 1053 start_block = group_first_block;
@@ -929,9 +1079,10 @@ static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv,
929 if ((my_rsv->rsv_alloc_hit > 1079 if ((my_rsv->rsv_alloc_hit >
930 (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) { 1080 (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
931 /* 1081 /*
932 * if we previously allocation hit ration is greater than half 1082 * if the previously allocation hit ratio is
933 * we double the size of reservation window next time 1083 * greater than 1/2, then we double the size of
934 * otherwise keep the same 1084 * the reservation window the next time,
1085 * otherwise we keep the same size window
935 */ 1086 */
936 size = size * 2; 1087 size = size * 2;
937 if (size > EXT3_MAX_RESERVE_BLOCKS) 1088 if (size > EXT3_MAX_RESERVE_BLOCKS)
@@ -1010,6 +1161,23 @@ retry:
1010 goto retry; 1161 goto retry;
1011} 1162}
1012 1163
1164/**
1165 * try_to_extend_reservation()
1166 * @my_rsv: given reservation window
1167 * @sb: super block
1168 * @size: the delta to extend
1169 *
1170 * Attempt to expand the reservation window large enough to have
1171 * required number of free blocks
1172 *
1173 * Since ext3_try_to_allocate() will always allocate blocks within
1174 * the reservation window range, if the window size is too small,
1175 * multiple blocks allocation has to stop at the end of the reservation
1176 * window. To make this more efficient, given the total number of
1177 * blocks needed and the current size of the window, we try to
1178 * expand the reservation window size if necessary on a best-effort
1179 * basis before ext3_new_blocks() tries to allocate blocks,
1180 */
1013static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv, 1181static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv,
1014 struct super_block *sb, int size) 1182 struct super_block *sb, int size)
1015{ 1183{
@@ -1035,7 +1203,17 @@ static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv,
1035 spin_unlock(rsv_lock); 1203 spin_unlock(rsv_lock);
1036} 1204}
1037 1205
1038/* 1206/**
1207 * ext3_try_to_allocate_with_rsv()
1208 * @sb: superblock
1209 * @handle: handle to this transaction
1210 * @group: given allocation block group
1211 * @bitmap_bh: bufferhead holds the block bitmap
1212 * @grp_goal: given target block within the group
1213 * @count: target number of blocks to allocate
1214 * @my_rsv: reservation window
1215 * @errp: pointer to store the error code
1216 *
1039 * This is the main function used to allocate a new block and its reservation 1217 * This is the main function used to allocate a new block and its reservation
1040 * window. 1218 * window.
1041 * 1219 *
@@ -1051,9 +1229,7 @@ static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv,
1051 * reservation), and there are lots of free blocks, but they are all 1229 * reservation), and there are lots of free blocks, but they are all
1052 * being reserved. 1230 * being reserved.
1053 * 1231 *
1054 * We use a sorted double linked list for the per-filesystem reservation list. 1232 * We use a red-black tree for the per-filesystem reservation list.
1055 * The insert, remove and find a free space(non-reserved) operations for the
1056 * sorted double linked list should be fast.
1057 * 1233 *
1058 */ 1234 */
1059static ext3_grpblk_t 1235static ext3_grpblk_t
@@ -1063,7 +1239,7 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
1063 struct ext3_reserve_window_node * my_rsv, 1239 struct ext3_reserve_window_node * my_rsv,
1064 unsigned long *count, int *errp) 1240 unsigned long *count, int *errp)
1065{ 1241{
1066 ext3_fsblk_t group_first_block; 1242 ext3_fsblk_t group_first_block, group_last_block;
1067 ext3_grpblk_t ret = 0; 1243 ext3_grpblk_t ret = 0;
1068 int fatal; 1244 int fatal;
1069 unsigned long num = *count; 1245 unsigned long num = *count;
@@ -1100,6 +1276,7 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
1100 * first block is the block number of the first block in this group 1276 * first block is the block number of the first block in this group
1101 */ 1277 */
1102 group_first_block = ext3_group_first_block_no(sb, group); 1278 group_first_block = ext3_group_first_block_no(sb, group);
1279 group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
1103 1280
1104 /* 1281 /*
1105 * Basically we will allocate a new block from inode's reservation 1282 * Basically we will allocate a new block from inode's reservation
@@ -1118,7 +1295,8 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
1118 */ 1295 */
1119 while (1) { 1296 while (1) {
1120 if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) || 1297 if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
1121 !goal_in_my_reservation(&my_rsv->rsv_window, grp_goal, group, sb)) { 1298 !goal_in_my_reservation(&my_rsv->rsv_window,
1299 grp_goal, group, sb)) {
1122 if (my_rsv->rsv_goal_size < *count) 1300 if (my_rsv->rsv_goal_size < *count)
1123 my_rsv->rsv_goal_size = *count; 1301 my_rsv->rsv_goal_size = *count;
1124 ret = alloc_new_reservation(my_rsv, grp_goal, sb, 1302 ret = alloc_new_reservation(my_rsv, grp_goal, sb,
@@ -1126,17 +1304,21 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
1126 if (ret < 0) 1304 if (ret < 0)
1127 break; /* failed */ 1305 break; /* failed */
1128 1306
1129 if (!goal_in_my_reservation(&my_rsv->rsv_window, grp_goal, group, sb)) 1307 if (!goal_in_my_reservation(&my_rsv->rsv_window,
1308 grp_goal, group, sb))
1130 grp_goal = -1; 1309 grp_goal = -1;
1131 } else if (grp_goal > 0 && (my_rsv->rsv_end-grp_goal+1) < *count) 1310 } else if (grp_goal > 0 &&
1311 (my_rsv->rsv_end-grp_goal+1) < *count)
1132 try_to_extend_reservation(my_rsv, sb, 1312 try_to_extend_reservation(my_rsv, sb,
1133 *count-my_rsv->rsv_end + grp_goal - 1); 1313 *count-my_rsv->rsv_end + grp_goal - 1);
1134 1314
1135 if ((my_rsv->rsv_start >= group_first_block + EXT3_BLOCKS_PER_GROUP(sb)) 1315 if ((my_rsv->rsv_start > group_last_block) ||
1136 || (my_rsv->rsv_end < group_first_block)) 1316 (my_rsv->rsv_end < group_first_block)) {
1317 rsv_window_dump(&EXT3_SB(sb)->s_rsv_window_root, 1);
1137 BUG(); 1318 BUG();
1138 ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, grp_goal, 1319 }
1139 &num, &my_rsv->rsv_window); 1320 ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh,
1321 grp_goal, &num, &my_rsv->rsv_window);
1140 if (ret >= 0) { 1322 if (ret >= 0) {
1141 my_rsv->rsv_alloc_hit += num; 1323 my_rsv->rsv_alloc_hit += num;
1142 *count = num; 1324 *count = num;
@@ -1161,6 +1343,12 @@ out:
1161 return ret; 1343 return ret;
1162} 1344}
1163 1345
1346/**
1347 * ext3_has_free_blocks()
1348 * @sbi: in-core super block structure.
1349 *
1350 * Check if filesystem has at least 1 free block available for allocation.
1351 */
1164static int ext3_has_free_blocks(struct ext3_sb_info *sbi) 1352static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
1165{ 1353{
1166 ext3_fsblk_t free_blocks, root_blocks; 1354 ext3_fsblk_t free_blocks, root_blocks;
@@ -1175,11 +1363,17 @@ static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
1175 return 1; 1363 return 1;
1176} 1364}
1177 1365
1178/* 1366/**
1367 * ext3_should_retry_alloc()
1368 * @sb: super block
1369 * @retries number of attemps has been made
1370 *
1179 * ext3_should_retry_alloc() is called when ENOSPC is returned, and if 1371 * ext3_should_retry_alloc() is called when ENOSPC is returned, and if
1180 * it is profitable to retry the operation, this function will wait 1372 * it is profitable to retry the operation, this function will wait
1181 * for the current or commiting transaction to complete, and then 1373 * for the current or commiting transaction to complete, and then
1182 * return TRUE. 1374 * return TRUE.
1375 *
1376 * if the total number of retries exceed three times, return FALSE.
1183 */ 1377 */
1184int ext3_should_retry_alloc(struct super_block *sb, int *retries) 1378int ext3_should_retry_alloc(struct super_block *sb, int *retries)
1185{ 1379{
@@ -1191,13 +1385,19 @@ int ext3_should_retry_alloc(struct super_block *sb, int *retries)
1191 return journal_force_commit_nested(EXT3_SB(sb)->s_journal); 1385 return journal_force_commit_nested(EXT3_SB(sb)->s_journal);
1192} 1386}
1193 1387
1194/* 1388/**
1195 * ext3_new_block uses a goal block to assist allocation. If the goal is 1389 * ext3_new_blocks() -- core block(s) allocation function
1196 * free, or there is a free block within 32 blocks of the goal, that block 1390 * @handle: handle to this transaction
1197 * is allocated. Otherwise a forward search is made for a free block; within 1391 * @inode: file inode
1198 * each block group the search first looks for an entire free byte in the block 1392 * @goal: given target block(filesystem wide)
1199 * bitmap, and then for any free bit if that fails. 1393 * @count: target number of blocks to allocate
1200 * This function also updates quota and i_blocks field. 1394 * @errp: error code
1395 *
1396 * ext3_new_blocks uses a goal block to assist allocation. It tries to
1397 * allocate block(s) from the block group contains the goal block first. If that
1398 * fails, it will try to allocate block(s) from other block groups without
1399 * any specific goal block.
1400 *
1201 */ 1401 */
1202ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, 1402ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
1203 ext3_fsblk_t goal, unsigned long *count, int *errp) 1403 ext3_fsblk_t goal, unsigned long *count, int *errp)
@@ -1303,7 +1503,7 @@ retry_alloc:
1303 smp_rmb(); 1503 smp_rmb();
1304 1504
1305 /* 1505 /*
1306 * Now search the rest of the groups. We assume that 1506 * Now search the rest of the groups. We assume that
1307 * i and gdp correctly point to the last group visited. 1507 * i and gdp correctly point to the last group visited.
1308 */ 1508 */
1309 for (bgi = 0; bgi < ngroups; bgi++) { 1509 for (bgi = 0; bgi < ngroups; bgi++) {
@@ -1428,7 +1628,7 @@ allocated:
1428 1628
1429 spin_lock(sb_bgl_lock(sbi, group_no)); 1629 spin_lock(sb_bgl_lock(sbi, group_no));
1430 gdp->bg_free_blocks_count = 1630 gdp->bg_free_blocks_count =
1431 cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - num); 1631 cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)-num);
1432 spin_unlock(sb_bgl_lock(sbi, group_no)); 1632 spin_unlock(sb_bgl_lock(sbi, group_no));
1433 percpu_counter_mod(&sbi->s_freeblocks_counter, -num); 1633 percpu_counter_mod(&sbi->s_freeblocks_counter, -num);
1434 1634
@@ -1471,6 +1671,12 @@ ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode,
1471 return ext3_new_blocks(handle, inode, goal, &count, errp); 1671 return ext3_new_blocks(handle, inode, goal, &count, errp);
1472} 1672}
1473 1673
1674/**
1675 * ext3_count_free_blocks() -- count filesystem free blocks
1676 * @sb: superblock
1677 *
1678 * Adds up the number of free blocks from each block group.
1679 */
1474ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb) 1680ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb)
1475{ 1681{
1476 ext3_fsblk_t desc_count; 1682 ext3_fsblk_t desc_count;
diff --git a/fs/ext3/bitmap.c b/fs/ext3/bitmap.c
index ce4f82b9e528..b9176eed98d1 100644
--- a/fs/ext3/bitmap.c
+++ b/fs/ext3/bitmap.c
@@ -20,7 +20,7 @@ unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars)
20 unsigned int i; 20 unsigned int i;
21 unsigned long sum = 0; 21 unsigned long sum = 0;
22 22
23 if (!map) 23 if (!map)
24 return (0); 24 return (0);
25 for (i = 0; i < numchars; i++) 25 for (i = 0; i < numchars; i++)
26 sum += nibblemap[map->b_data[i] & 0xf] + 26 sum += nibblemap[map->b_data[i] & 0xf] +
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index fbb0d4ed07d4..429acbb4e064 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -59,7 +59,7 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
59 59
60 return (ext3_filetype_table[filetype]); 60 return (ext3_filetype_table[filetype]);
61} 61}
62 62
63 63
64int ext3_check_dir_entry (const char * function, struct inode * dir, 64int ext3_check_dir_entry (const char * function, struct inode * dir,
65 struct ext3_dir_entry_2 * de, 65 struct ext3_dir_entry_2 * de,
@@ -67,7 +67,7 @@ int ext3_check_dir_entry (const char * function, struct inode * dir,
67 unsigned long offset) 67 unsigned long offset)
68{ 68{
69 const char * error_msg = NULL; 69 const char * error_msg = NULL;
70 const int rlen = le16_to_cpu(de->rec_len); 70 const int rlen = le16_to_cpu(de->rec_len);
71 71
72 if (rlen < EXT3_DIR_REC_LEN(1)) 72 if (rlen < EXT3_DIR_REC_LEN(1))
73 error_msg = "rec_len is smaller than minimal"; 73 error_msg = "rec_len is smaller than minimal";
@@ -162,7 +162,7 @@ revalidate:
162 * to make sure. */ 162 * to make sure. */
163 if (filp->f_version != inode->i_version) { 163 if (filp->f_version != inode->i_version) {
164 for (i = 0; i < sb->s_blocksize && i < offset; ) { 164 for (i = 0; i < sb->s_blocksize && i < offset; ) {
165 de = (struct ext3_dir_entry_2 *) 165 de = (struct ext3_dir_entry_2 *)
166 (bh->b_data + i); 166 (bh->b_data + i);
167 /* It's too expensive to do a full 167 /* It's too expensive to do a full
168 * dirent test each time round this 168 * dirent test each time round this
@@ -181,7 +181,7 @@ revalidate:
181 filp->f_version = inode->i_version; 181 filp->f_version = inode->i_version;
182 } 182 }
183 183
184 while (!error && filp->f_pos < inode->i_size 184 while (!error && filp->f_pos < inode->i_size
185 && offset < sb->s_blocksize) { 185 && offset < sb->s_blocksize) {
186 de = (struct ext3_dir_entry_2 *) (bh->b_data + offset); 186 de = (struct ext3_dir_entry_2 *) (bh->b_data + offset);
187 if (!ext3_check_dir_entry ("ext3_readdir", inode, de, 187 if (!ext3_check_dir_entry ("ext3_readdir", inode, de,
@@ -229,7 +229,7 @@ out:
229/* 229/*
230 * These functions convert from the major/minor hash to an f_pos 230 * These functions convert from the major/minor hash to an f_pos
231 * value. 231 * value.
232 * 232 *
233 * Currently we only use major hash numer. This is unfortunate, but 233 * Currently we only use major hash numer. This is unfortunate, but
234 * on 32-bit machines, the same VFS interface is used for lseek and 234 * on 32-bit machines, the same VFS interface is used for lseek and
235 * llseek, so if we use the 64 bit offset, then the 32-bit versions of 235 * llseek, so if we use the 64 bit offset, then the 32-bit versions of
@@ -250,7 +250,7 @@ out:
250struct fname { 250struct fname {
251 __u32 hash; 251 __u32 hash;
252 __u32 minor_hash; 252 __u32 minor_hash;
253 struct rb_node rb_hash; 253 struct rb_node rb_hash;
254 struct fname *next; 254 struct fname *next;
255 __u32 inode; 255 __u32 inode;
256 __u8 name_len; 256 __u8 name_len;
@@ -343,10 +343,9 @@ int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
343 343
344 /* Create and allocate the fname structure */ 344 /* Create and allocate the fname structure */
345 len = sizeof(struct fname) + dirent->name_len + 1; 345 len = sizeof(struct fname) + dirent->name_len + 1;
346 new_fn = kmalloc(len, GFP_KERNEL); 346 new_fn = kzalloc(len, GFP_KERNEL);
347 if (!new_fn) 347 if (!new_fn)
348 return -ENOMEM; 348 return -ENOMEM;
349 memset(new_fn, 0, len);
350 new_fn->hash = hash; 349 new_fn->hash = hash;
351 new_fn->minor_hash = minor_hash; 350 new_fn->minor_hash = minor_hash;
352 new_fn->inode = le32_to_cpu(dirent->inode); 351 new_fn->inode = le32_to_cpu(dirent->inode);
@@ -410,7 +409,7 @@ static int call_filldir(struct file * filp, void * dirent,
410 curr_pos = hash2pos(fname->hash, fname->minor_hash); 409 curr_pos = hash2pos(fname->hash, fname->minor_hash);
411 while (fname) { 410 while (fname) {
412 error = filldir(dirent, fname->name, 411 error = filldir(dirent, fname->name,
413 fname->name_len, curr_pos, 412 fname->name_len, curr_pos,
414 fname->inode, 413 fname->inode,
415 get_dtype(sb, fname->file_type)); 414 get_dtype(sb, fname->file_type));
416 if (error) { 415 if (error) {
@@ -465,7 +464,7 @@ static int ext3_dx_readdir(struct file * filp,
465 /* 464 /*
466 * Fill the rbtree if we have no more entries, 465 * Fill the rbtree if we have no more entries,
467 * or the inode has changed since we last read in the 466 * or the inode has changed since we last read in the
468 * cached entries. 467 * cached entries.
469 */ 468 */
470 if ((!info->curr_node) || 469 if ((!info->curr_node) ||
471 (filp->f_version != inode->i_version)) { 470 (filp->f_version != inode->i_version)) {
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 1efefb630ea9..994efd189f4e 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -100,7 +100,7 @@ ext3_file_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t
100 100
101force_commit: 101force_commit:
102 err = ext3_force_commit(inode->i_sb); 102 err = ext3_force_commit(inode->i_sb);
103 if (err) 103 if (err)
104 return err; 104 return err;
105 return ret; 105 return ret;
106} 106}
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index 49382a208e05..dd1fd3c0fc05 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -8,14 +8,14 @@
8 * Universite Pierre et Marie Curie (Paris VI) 8 * Universite Pierre et Marie Curie (Paris VI)
9 * from 9 * from
10 * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds 10 * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds
11 * 11 *
12 * ext3fs fsync primitive 12 * ext3fs fsync primitive
13 * 13 *
14 * Big-endian to little-endian byte-swapping/bitmaps by 14 * Big-endian to little-endian byte-swapping/bitmaps by
15 * David S. Miller (davem@caip.rutgers.edu), 1995 15 * David S. Miller (davem@caip.rutgers.edu), 1995
16 * 16 *
17 * Removed unnecessary code duplication for little endian machines 17 * Removed unnecessary code duplication for little endian machines
18 * and excessive __inline__s. 18 * and excessive __inline__s.
19 * Andi Kleen, 1997 19 * Andi Kleen, 1997
20 * 20 *
21 * Major simplications and cleanup - we only need to do the metadata, because 21 * Major simplications and cleanup - we only need to do the metadata, because
diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c
index 5a2d1235ead0..deeb27b5ba83 100644
--- a/fs/ext3/hash.c
+++ b/fs/ext3/hash.c
@@ -4,7 +4,7 @@
4 * Copyright (C) 2002 by Theodore Ts'o 4 * Copyright (C) 2002 by Theodore Ts'o
5 * 5 *
6 * This file is released under the GPL v2. 6 * This file is released under the GPL v2.
7 * 7 *
8 * This file may be redistributed under the terms of the GNU Public 8 * This file may be redistributed under the terms of the GNU Public
9 * License. 9 * License.
10 */ 10 */
@@ -80,11 +80,11 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
80 * Returns the hash of a filename. If len is 0 and name is NULL, then 80 * Returns the hash of a filename. If len is 0 and name is NULL, then
81 * this function can be used to test whether or not a hash version is 81 * this function can be used to test whether or not a hash version is
82 * supported. 82 * supported.
83 * 83 *
84 * The seed is an 4 longword (32 bits) "secret" which can be used to 84 * The seed is an 4 longword (32 bits) "secret" which can be used to
85 * uniquify a hash. If the seed is all zero's, then some default seed 85 * uniquify a hash. If the seed is all zero's, then some default seed
86 * may be used. 86 * may be used.
87 * 87 *
88 * A particular hash version specifies whether or not the seed is 88 * A particular hash version specifies whether or not the seed is
89 * represented, and whether or not the returned hash is 32 bits or 64 89 * represented, and whether or not the returned hash is 32 bits or 64
90 * bits. 32 bit hashes will return 0 for the minor hash. 90 * bits. 32 bit hashes will return 0 for the minor hash.
@@ -95,7 +95,7 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
95 __u32 minor_hash = 0; 95 __u32 minor_hash = 0;
96 const char *p; 96 const char *p;
97 int i; 97 int i;
98 __u32 in[8], buf[4]; 98 __u32 in[8], buf[4];
99 99
100 /* Initialize the default seed for the hash checksum functions */ 100 /* Initialize the default seed for the hash checksum functions */
101 buf[0] = 0x67452301; 101 buf[0] = 0x67452301;
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 36546ed36a14..e45dbd651736 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -202,7 +202,7 @@ error_return:
202static int find_group_dir(struct super_block *sb, struct inode *parent) 202static int find_group_dir(struct super_block *sb, struct inode *parent)
203{ 203{
204 int ngroups = EXT3_SB(sb)->s_groups_count; 204 int ngroups = EXT3_SB(sb)->s_groups_count;
205 int freei, avefreei; 205 unsigned int freei, avefreei;
206 struct ext3_group_desc *desc, *best_desc = NULL; 206 struct ext3_group_desc *desc, *best_desc = NULL;
207 struct buffer_head *bh; 207 struct buffer_head *bh;
208 int group, best_group = -1; 208 int group, best_group = -1;
@@ -216,7 +216,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent)
216 continue; 216 continue;
217 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) 217 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
218 continue; 218 continue;
219 if (!best_desc || 219 if (!best_desc ||
220 (le16_to_cpu(desc->bg_free_blocks_count) > 220 (le16_to_cpu(desc->bg_free_blocks_count) >
221 le16_to_cpu(best_desc->bg_free_blocks_count))) { 221 le16_to_cpu(best_desc->bg_free_blocks_count))) {
222 best_group = group; 222 best_group = group;
@@ -226,30 +226,30 @@ static int find_group_dir(struct super_block *sb, struct inode *parent)
226 return best_group; 226 return best_group;
227} 227}
228 228
229/* 229/*
230 * Orlov's allocator for directories. 230 * Orlov's allocator for directories.
231 * 231 *
232 * We always try to spread first-level directories. 232 * We always try to spread first-level directories.
233 * 233 *
234 * If there are blockgroups with both free inodes and free blocks counts 234 * If there are blockgroups with both free inodes and free blocks counts
235 * not worse than average we return one with smallest directory count. 235 * not worse than average we return one with smallest directory count.
236 * Otherwise we simply return a random group. 236 * Otherwise we simply return a random group.
237 * 237 *
238 * For the rest rules look so: 238 * For the rest rules look so:
239 * 239 *
240 * It's OK to put directory into a group unless 240 * It's OK to put directory into a group unless
241 * it has too many directories already (max_dirs) or 241 * it has too many directories already (max_dirs) or
242 * it has too few free inodes left (min_inodes) or 242 * it has too few free inodes left (min_inodes) or
243 * it has too few free blocks left (min_blocks) or 243 * it has too few free blocks left (min_blocks) or
244 * it's already running too large debt (max_debt). 244 * it's already running too large debt (max_debt).
245 * Parent's group is prefered, if it doesn't satisfy these 245 * Parent's group is prefered, if it doesn't satisfy these
246 * conditions we search cyclically through the rest. If none 246 * conditions we search cyclically through the rest. If none
247 * of the groups look good we just look for a group with more 247 * of the groups look good we just look for a group with more
248 * free inodes than average (starting at parent's group). 248 * free inodes than average (starting at parent's group).
249 * 249 *
250 * Debt is incremented each time we allocate a directory and decremented 250 * Debt is incremented each time we allocate a directory and decremented
251 * when we allocate an inode, within 0--255. 251 * when we allocate an inode, within 0--255.
252 */ 252 */
253 253
254#define INODE_COST 64 254#define INODE_COST 64
255#define BLOCK_COST 256 255#define BLOCK_COST 256
@@ -261,10 +261,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
261 struct ext3_super_block *es = sbi->s_es; 261 struct ext3_super_block *es = sbi->s_es;
262 int ngroups = sbi->s_groups_count; 262 int ngroups = sbi->s_groups_count;
263 int inodes_per_group = EXT3_INODES_PER_GROUP(sb); 263 int inodes_per_group = EXT3_INODES_PER_GROUP(sb);
264 int freei, avefreei; 264 unsigned int freei, avefreei;
265 ext3_fsblk_t freeb, avefreeb; 265 ext3_fsblk_t freeb, avefreeb;
266 ext3_fsblk_t blocks_per_dir; 266 ext3_fsblk_t blocks_per_dir;
267 int ndirs; 267 unsigned int ndirs;
268 int max_debt, max_dirs, min_inodes; 268 int max_debt, max_dirs, min_inodes;
269 ext3_grpblk_t min_blocks; 269 ext3_grpblk_t min_blocks;
270 int group = -1, i; 270 int group = -1, i;
@@ -454,7 +454,7 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode)
454 group = find_group_dir(sb, dir); 454 group = find_group_dir(sb, dir);
455 else 455 else
456 group = find_group_orlov(sb, dir); 456 group = find_group_orlov(sb, dir);
457 } else 457 } else
458 group = find_group_other(sb, dir); 458 group = find_group_other(sb, dir);
459 459
460 err = -ENOSPC; 460 err = -ENOSPC;
@@ -559,7 +559,6 @@ got:
559 559
560 inode->i_ino = ino; 560 inode->i_ino = ino;
561 /* This is the optimal IO size (for stat), not the fs block size */ 561 /* This is the optimal IO size (for stat), not the fs block size */
562 inode->i_blksize = PAGE_SIZE;
563 inode->i_blocks = 0; 562 inode->i_blocks = 0;
564 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 563 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
565 564
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 84be02e93652..dcf4f1dd108b 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -13,11 +13,11 @@
13 * Copyright (C) 1991, 1992 Linus Torvalds 13 * Copyright (C) 1991, 1992 Linus Torvalds
14 * 14 *
15 * Goal-directed block allocation by Stephen Tweedie 15 * Goal-directed block allocation by Stephen Tweedie
16 * (sct@redhat.com), 1993, 1998 16 * (sct@redhat.com), 1993, 1998
17 * Big-endian to little-endian byte-swapping/bitmaps by 17 * Big-endian to little-endian byte-swapping/bitmaps by
18 * David S. Miller (davem@caip.rutgers.edu), 1995 18 * David S. Miller (davem@caip.rutgers.edu), 1995
19 * 64-bit file support on 64-bit platforms by Jakub Jelinek 19 * 64-bit file support on 64-bit platforms by Jakub Jelinek
20 * (jj@sunsite.ms.mff.cuni.cz) 20 * (jj@sunsite.ms.mff.cuni.cz)
21 * 21 *
22 * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000 22 * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
23 */ 23 */
@@ -55,7 +55,7 @@ static int ext3_inode_is_fast_symlink(struct inode *inode)
55/* 55/*
56 * The ext3 forget function must perform a revoke if we are freeing data 56 * The ext3 forget function must perform a revoke if we are freeing data
57 * which has been journaled. Metadata (eg. indirect blocks) must be 57 * which has been journaled. Metadata (eg. indirect blocks) must be
58 * revoked in all cases. 58 * revoked in all cases.
59 * 59 *
60 * "bh" may be NULL: a metadata block may have been freed from memory 60 * "bh" may be NULL: a metadata block may have been freed from memory
61 * but there may still be a record of it in the journal, and that record 61 * but there may still be a record of it in the journal, and that record
@@ -105,7 +105,7 @@ int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
105 * Work out how many blocks we need to proceed with the next chunk of a 105 * Work out how many blocks we need to proceed with the next chunk of a
106 * truncate transaction. 106 * truncate transaction.
107 */ 107 */
108static unsigned long blocks_for_truncate(struct inode *inode) 108static unsigned long blocks_for_truncate(struct inode *inode)
109{ 109{
110 unsigned long needed; 110 unsigned long needed;
111 111
@@ -122,13 +122,13 @@ static unsigned long blocks_for_truncate(struct inode *inode)
122 122
123 /* But we need to bound the transaction so we don't overflow the 123 /* But we need to bound the transaction so we don't overflow the
124 * journal. */ 124 * journal. */
125 if (needed > EXT3_MAX_TRANS_DATA) 125 if (needed > EXT3_MAX_TRANS_DATA)
126 needed = EXT3_MAX_TRANS_DATA; 126 needed = EXT3_MAX_TRANS_DATA;
127 127
128 return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed; 128 return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
129} 129}
130 130
131/* 131/*
132 * Truncate transactions can be complex and absolutely huge. So we need to 132 * Truncate transactions can be complex and absolutely huge. So we need to
133 * be able to restart the transaction at a conventient checkpoint to make 133 * be able to restart the transaction at a conventient checkpoint to make
134 * sure we don't overflow the journal. 134 * sure we don't overflow the journal.
@@ -136,9 +136,9 @@ static unsigned long blocks_for_truncate(struct inode *inode)
136 * start_transaction gets us a new handle for a truncate transaction, 136 * start_transaction gets us a new handle for a truncate transaction,
137 * and extend_transaction tries to extend the existing one a bit. If 137 * and extend_transaction tries to extend the existing one a bit. If
138 * extend fails, we need to propagate the failure up and restart the 138 * extend fails, we need to propagate the failure up and restart the
139 * transaction in the top-level truncate loop. --sct 139 * transaction in the top-level truncate loop. --sct
140 */ 140 */
141static handle_t *start_transaction(struct inode *inode) 141static handle_t *start_transaction(struct inode *inode)
142{ 142{
143 handle_t *result; 143 handle_t *result;
144 144
@@ -215,12 +215,12 @@ void ext3_delete_inode (struct inode * inode)
215 ext3_orphan_del(handle, inode); 215 ext3_orphan_del(handle, inode);
216 EXT3_I(inode)->i_dtime = get_seconds(); 216 EXT3_I(inode)->i_dtime = get_seconds();
217 217
218 /* 218 /*
219 * One subtle ordering requirement: if anything has gone wrong 219 * One subtle ordering requirement: if anything has gone wrong
220 * (transaction abort, IO errors, whatever), then we can still 220 * (transaction abort, IO errors, whatever), then we can still
221 * do these next steps (the fs will already have been marked as 221 * do these next steps (the fs will already have been marked as
222 * having errors), but we can't free the inode if the mark_dirty 222 * having errors), but we can't free the inode if the mark_dirty
223 * fails. 223 * fails.
224 */ 224 */
225 if (ext3_mark_inode_dirty(handle, inode)) 225 if (ext3_mark_inode_dirty(handle, inode))
226 /* If that failed, just do the required in-core inode clear. */ 226 /* If that failed, just do the required in-core inode clear. */
@@ -398,7 +398,7 @@ no_block:
398 * + if there is a block to the left of our position - allocate near it. 398 * + if there is a block to the left of our position - allocate near it.
399 * + if pointer will live in indirect block - allocate near that block. 399 * + if pointer will live in indirect block - allocate near that block.
400 * + if pointer will live in inode - allocate in the same 400 * + if pointer will live in inode - allocate in the same
401 * cylinder group. 401 * cylinder group.
402 * 402 *
403 * In the latter case we colour the starting block by the callers PID to 403 * In the latter case we colour the starting block by the callers PID to
404 * prevent it from clashing with concurrent allocations for a different inode 404 * prevent it from clashing with concurrent allocations for a different inode
@@ -470,7 +470,7 @@ static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block,
470 * ext3_blks_to_allocate: Look up the block map and count the number 470 * ext3_blks_to_allocate: Look up the block map and count the number
471 * of direct blocks need to be allocated for the given branch. 471 * of direct blocks need to be allocated for the given branch.
472 * 472 *
473 * @branch: chain of indirect blocks 473 * @branch: chain of indirect blocks
474 * @k: number of blocks need for indirect blocks 474 * @k: number of blocks need for indirect blocks
475 * @blks: number of data blocks to be mapped. 475 * @blks: number of data blocks to be mapped.
476 * @blocks_to_boundary: the offset in the indirect block 476 * @blocks_to_boundary: the offset in the indirect block
@@ -744,7 +744,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
744 jbd_debug(5, "splicing indirect only\n"); 744 jbd_debug(5, "splicing indirect only\n");
745 BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata"); 745 BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
746 err = ext3_journal_dirty_metadata(handle, where->bh); 746 err = ext3_journal_dirty_metadata(handle, where->bh);
747 if (err) 747 if (err)
748 goto err_out; 748 goto err_out;
749 } else { 749 } else {
750 /* 750 /*
@@ -1098,7 +1098,7 @@ static int walk_page_buffers( handle_t *handle,
1098 1098
1099 for ( bh = head, block_start = 0; 1099 for ( bh = head, block_start = 0;
1100 ret == 0 && (bh != head || !block_start); 1100 ret == 0 && (bh != head || !block_start);
1101 block_start = block_end, bh = next) 1101 block_start = block_end, bh = next)
1102 { 1102 {
1103 next = bh->b_this_page; 1103 next = bh->b_this_page;
1104 block_end = block_start + blocksize; 1104 block_end = block_start + blocksize;
@@ -1137,7 +1137,7 @@ static int walk_page_buffers( handle_t *handle,
1137 * So what we do is to rely on the fact that journal_stop/journal_start 1137 * So what we do is to rely on the fact that journal_stop/journal_start
1138 * will _not_ run commit under these circumstances because handle->h_ref 1138 * will _not_ run commit under these circumstances because handle->h_ref
1139 * is elevated. We'll still have enough credits for the tiny quotafile 1139 * is elevated. We'll still have enough credits for the tiny quotafile
1140 * write. 1140 * write.
1141 */ 1141 */
1142static int do_journal_get_write_access(handle_t *handle, 1142static int do_journal_get_write_access(handle_t *handle,
1143 struct buffer_head *bh) 1143 struct buffer_head *bh)
@@ -1282,7 +1282,7 @@ static int ext3_journalled_commit_write(struct file *file,
1282 if (inode->i_size > EXT3_I(inode)->i_disksize) { 1282 if (inode->i_size > EXT3_I(inode)->i_disksize) {
1283 EXT3_I(inode)->i_disksize = inode->i_size; 1283 EXT3_I(inode)->i_disksize = inode->i_size;
1284 ret2 = ext3_mark_inode_dirty(handle, inode); 1284 ret2 = ext3_mark_inode_dirty(handle, inode);
1285 if (!ret) 1285 if (!ret)
1286 ret = ret2; 1286 ret = ret2;
1287 } 1287 }
1288 ret2 = ext3_journal_stop(handle); 1288 ret2 = ext3_journal_stop(handle);
@@ -1291,7 +1291,7 @@ static int ext3_journalled_commit_write(struct file *file,
1291 return ret; 1291 return ret;
1292} 1292}
1293 1293
1294/* 1294/*
1295 * bmap() is special. It gets used by applications such as lilo and by 1295 * bmap() is special. It gets used by applications such as lilo and by
1296 * the swapper to find the on-disk block of a specific piece of data. 1296 * the swapper to find the on-disk block of a specific piece of data.
1297 * 1297 *
@@ -1300,10 +1300,10 @@ static int ext3_journalled_commit_write(struct file *file,
1300 * filesystem and enables swap, then they may get a nasty shock when the 1300 * filesystem and enables swap, then they may get a nasty shock when the
1301 * data getting swapped to that swapfile suddenly gets overwritten by 1301 * data getting swapped to that swapfile suddenly gets overwritten by
1302 * the original zero's written out previously to the journal and 1302 * the original zero's written out previously to the journal and
1303 * awaiting writeback in the kernel's buffer cache. 1303 * awaiting writeback in the kernel's buffer cache.
1304 * 1304 *
1305 * So, if we see any bmap calls here on a modified, data-journaled file, 1305 * So, if we see any bmap calls here on a modified, data-journaled file,
1306 * take extra steps to flush any blocks which might be in the cache. 1306 * take extra steps to flush any blocks which might be in the cache.
1307 */ 1307 */
1308static sector_t ext3_bmap(struct address_space *mapping, sector_t block) 1308static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
1309{ 1309{
@@ -1312,16 +1312,16 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
1312 int err; 1312 int err;
1313 1313
1314 if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) { 1314 if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
1315 /* 1315 /*
1316 * This is a REALLY heavyweight approach, but the use of 1316 * This is a REALLY heavyweight approach, but the use of
1317 * bmap on dirty files is expected to be extremely rare: 1317 * bmap on dirty files is expected to be extremely rare:
1318 * only if we run lilo or swapon on a freshly made file 1318 * only if we run lilo or swapon on a freshly made file
1319 * do we expect this to happen. 1319 * do we expect this to happen.
1320 * 1320 *
1321 * (bmap requires CAP_SYS_RAWIO so this does not 1321 * (bmap requires CAP_SYS_RAWIO so this does not
1322 * represent an unprivileged user DOS attack --- we'd be 1322 * represent an unprivileged user DOS attack --- we'd be
1323 * in trouble if mortal users could trigger this path at 1323 * in trouble if mortal users could trigger this path at
1324 * will.) 1324 * will.)
1325 * 1325 *
1326 * NB. EXT3_STATE_JDATA is not set on files other than 1326 * NB. EXT3_STATE_JDATA is not set on files other than
1327 * regular files. If somebody wants to bmap a directory 1327 * regular files. If somebody wants to bmap a directory
@@ -1457,7 +1457,7 @@ static int ext3_ordered_writepage(struct page *page,
1457 */ 1457 */
1458 1458
1459 /* 1459 /*
1460 * And attach them to the current transaction. But only if 1460 * And attach them to the current transaction. But only if
1461 * block_write_full_page() succeeded. Otherwise they are unmapped, 1461 * block_write_full_page() succeeded. Otherwise they are unmapped,
1462 * and generally junk. 1462 * and generally junk.
1463 */ 1463 */
@@ -1644,7 +1644,7 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
1644 } 1644 }
1645 } 1645 }
1646 1646
1647 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 1647 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
1648 offset, nr_segs, 1648 offset, nr_segs,
1649 ext3_get_block, NULL); 1649 ext3_get_block, NULL);
1650 1650
@@ -2025,7 +2025,7 @@ static void ext3_free_data(handle_t *handle, struct inode *inode,
2025 __le32 *first, __le32 *last) 2025 __le32 *first, __le32 *last)
2026{ 2026{
2027 ext3_fsblk_t block_to_free = 0; /* Starting block # of a run */ 2027 ext3_fsblk_t block_to_free = 0; /* Starting block # of a run */
2028 unsigned long count = 0; /* Number of blocks in the run */ 2028 unsigned long count = 0; /* Number of blocks in the run */
2029 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind 2029 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
2030 corresponding to 2030 corresponding to
2031 block_to_free */ 2031 block_to_free */
@@ -2054,7 +2054,7 @@ static void ext3_free_data(handle_t *handle, struct inode *inode,
2054 } else if (nr == block_to_free + count) { 2054 } else if (nr == block_to_free + count) {
2055 count++; 2055 count++;
2056 } else { 2056 } else {
2057 ext3_clear_blocks(handle, inode, this_bh, 2057 ext3_clear_blocks(handle, inode, this_bh,
2058 block_to_free, 2058 block_to_free,
2059 count, block_to_free_p, p); 2059 count, block_to_free_p, p);
2060 block_to_free = nr; 2060 block_to_free = nr;
@@ -2115,7 +2115,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
2115 */ 2115 */
2116 if (!bh) { 2116 if (!bh) {
2117 ext3_error(inode->i_sb, "ext3_free_branches", 2117 ext3_error(inode->i_sb, "ext3_free_branches",
2118 "Read failure, inode=%ld, block="E3FSBLK, 2118 "Read failure, inode=%lu, block="E3FSBLK,
2119 inode->i_ino, nr); 2119 inode->i_ino, nr);
2120 continue; 2120 continue;
2121 } 2121 }
@@ -2184,7 +2184,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
2184 *p = 0; 2184 *p = 0;
2185 BUFFER_TRACE(parent_bh, 2185 BUFFER_TRACE(parent_bh,
2186 "call ext3_journal_dirty_metadata"); 2186 "call ext3_journal_dirty_metadata");
2187 ext3_journal_dirty_metadata(handle, 2187 ext3_journal_dirty_metadata(handle,
2188 parent_bh); 2188 parent_bh);
2189 } 2189 }
2190 } 2190 }
@@ -2632,9 +2632,6 @@ void ext3_read_inode(struct inode * inode)
2632 * recovery code: that's fine, we're about to complete 2632 * recovery code: that's fine, we're about to complete
2633 * the process of deleting those. */ 2633 * the process of deleting those. */
2634 } 2634 }
2635 inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size
2636 * (for stat), not the fs block
2637 * size */
2638 inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); 2635 inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
2639 ei->i_flags = le32_to_cpu(raw_inode->i_flags); 2636 ei->i_flags = le32_to_cpu(raw_inode->i_flags);
2640#ifdef EXT3_FRAGMENTS 2637#ifdef EXT3_FRAGMENTS
@@ -2704,7 +2701,7 @@ void ext3_read_inode(struct inode * inode)
2704 if (raw_inode->i_block[0]) 2701 if (raw_inode->i_block[0])
2705 init_special_inode(inode, inode->i_mode, 2702 init_special_inode(inode, inode->i_mode,
2706 old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); 2703 old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
2707 else 2704 else
2708 init_special_inode(inode, inode->i_mode, 2705 init_special_inode(inode, inode->i_mode,
2709 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 2706 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
2710 } 2707 }
@@ -2724,8 +2721,8 @@ bad_inode:
2724 * 2721 *
2725 * The caller must have write access to iloc->bh. 2722 * The caller must have write access to iloc->bh.
2726 */ 2723 */
2727static int ext3_do_update_inode(handle_t *handle, 2724static int ext3_do_update_inode(handle_t *handle,
2728 struct inode *inode, 2725 struct inode *inode,
2729 struct ext3_iloc *iloc) 2726 struct ext3_iloc *iloc)
2730{ 2727{
2731 struct ext3_inode *raw_inode = ext3_raw_inode(iloc); 2728 struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
@@ -2900,7 +2897,7 @@ int ext3_write_inode(struct inode *inode, int wait)
2900 * commit will leave the blocks being flushed in an unused state on 2897 * commit will leave the blocks being flushed in an unused state on
2901 * disk. (On recovery, the inode will get truncated and the blocks will 2898 * disk. (On recovery, the inode will get truncated and the blocks will
2902 * be freed, so we have a strong guarantee that no future commit will 2899 * be freed, so we have a strong guarantee that no future commit will
2903 * leave these blocks visible to the user.) 2900 * leave these blocks visible to the user.)
2904 * 2901 *
2905 * Called with inode->sem down. 2902 * Called with inode->sem down.
2906 */ 2903 */
@@ -3043,13 +3040,13 @@ int ext3_mark_iloc_dirty(handle_t *handle,
3043 return err; 3040 return err;
3044} 3041}
3045 3042
3046/* 3043/*
3047 * On success, We end up with an outstanding reference count against 3044 * On success, We end up with an outstanding reference count against
3048 * iloc->bh. This _must_ be cleaned up later. 3045 * iloc->bh. This _must_ be cleaned up later.
3049 */ 3046 */
3050 3047
3051int 3048int
3052ext3_reserve_inode_write(handle_t *handle, struct inode *inode, 3049ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
3053 struct ext3_iloc *iloc) 3050 struct ext3_iloc *iloc)
3054{ 3051{
3055 int err = 0; 3052 int err = 0;
@@ -3139,7 +3136,7 @@ out:
3139} 3136}
3140 3137
3141#if 0 3138#if 0
3142/* 3139/*
3143 * Bind an inode's backing buffer_head into this transaction, to prevent 3140 * Bind an inode's backing buffer_head into this transaction, to prevent
3144 * it from being flushed to disk early. Unlike 3141 * it from being flushed to disk early. Unlike
3145 * ext3_reserve_inode_write, this leaves behind no bh reference and 3142 * ext3_reserve_inode_write, this leaves behind no bh reference and
@@ -3157,7 +3154,7 @@ static int ext3_pin_inode(handle_t *handle, struct inode *inode)
3157 BUFFER_TRACE(iloc.bh, "get_write_access"); 3154 BUFFER_TRACE(iloc.bh, "get_write_access");
3158 err = journal_get_write_access(handle, iloc.bh); 3155 err = journal_get_write_access(handle, iloc.bh);
3159 if (!err) 3156 if (!err)
3160 err = ext3_journal_dirty_metadata(handle, 3157 err = ext3_journal_dirty_metadata(handle,
3161 iloc.bh); 3158 iloc.bh);
3162 brelse(iloc.bh); 3159 brelse(iloc.bh);
3163 } 3160 }
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 2aa7101b27cd..85d132c37ee0 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -15,13 +15,13 @@
15 * Big-endian to little-endian byte-swapping/bitmaps by 15 * Big-endian to little-endian byte-swapping/bitmaps by
16 * David S. Miller (davem@caip.rutgers.edu), 1995 16 * David S. Miller (davem@caip.rutgers.edu), 1995
17 * Directory entry file type support and forward compatibility hooks 17 * Directory entry file type support and forward compatibility hooks
18 * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 18 * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
19 * Hash Tree Directory indexing (c) 19 * Hash Tree Directory indexing (c)
20 * Daniel Phillips, 2001 20 * Daniel Phillips, 2001
21 * Hash Tree Directory indexing porting 21 * Hash Tree Directory indexing porting
22 * Christopher Li, 2002 22 * Christopher Li, 2002
23 * Hash Tree Directory indexing cleanup 23 * Hash Tree Directory indexing cleanup
24 * Theodore Ts'o, 2002 24 * Theodore Ts'o, 2002
25 */ 25 */
26 26
27#include <linux/fs.h> 27#include <linux/fs.h>
@@ -76,7 +76,7 @@ static struct buffer_head *ext3_append(handle_t *handle,
76#ifdef DX_DEBUG 76#ifdef DX_DEBUG
77#define dxtrace(command) command 77#define dxtrace(command) command
78#else 78#else
79#define dxtrace(command) 79#define dxtrace(command)
80#endif 80#endif
81 81
82struct fake_dirent 82struct fake_dirent
@@ -169,7 +169,7 @@ static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
169static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); 169static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
170static int ext3_htree_next_block(struct inode *dir, __u32 hash, 170static int ext3_htree_next_block(struct inode *dir, __u32 hash,
171 struct dx_frame *frame, 171 struct dx_frame *frame,
172 struct dx_frame *frames, 172 struct dx_frame *frames,
173 __u32 *start_hash); 173 __u32 *start_hash);
174static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, 174static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
175 struct ext3_dir_entry_2 **res_dir, int *err); 175 struct ext3_dir_entry_2 **res_dir, int *err);
@@ -250,7 +250,7 @@ static void dx_show_index (char * label, struct dx_entry *entries)
250} 250}
251 251
252struct stats 252struct stats
253{ 253{
254 unsigned names; 254 unsigned names;
255 unsigned space; 255 unsigned space;
256 unsigned bcount; 256 unsigned bcount;
@@ -278,7 +278,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_ent
278 ((char *) de - base)); 278 ((char *) de - base));
279 } 279 }
280 space += EXT3_DIR_REC_LEN(de->name_len); 280 space += EXT3_DIR_REC_LEN(de->name_len);
281 names++; 281 names++;
282 } 282 }
283 de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); 283 de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
284 } 284 }
@@ -464,7 +464,7 @@ static void dx_release (struct dx_frame *frames)
464 */ 464 */
465static int ext3_htree_next_block(struct inode *dir, __u32 hash, 465static int ext3_htree_next_block(struct inode *dir, __u32 hash,
466 struct dx_frame *frame, 466 struct dx_frame *frame,
467 struct dx_frame *frames, 467 struct dx_frame *frames,
468 __u32 *start_hash) 468 __u32 *start_hash)
469{ 469{
470 struct dx_frame *p; 470 struct dx_frame *p;
@@ -632,7 +632,7 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
632 } 632 }
633 count += ret; 633 count += ret;
634 hashval = ~0; 634 hashval = ~0;
635 ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, 635 ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS,
636 frame, frames, &hashval); 636 frame, frames, &hashval);
637 *next_hash = hashval; 637 *next_hash = hashval;
638 if (ret < 0) { 638 if (ret < 0) {
@@ -649,7 +649,7 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
649 break; 649 break;
650 } 650 }
651 dx_release(frames); 651 dx_release(frames);
652 dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", 652 dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
653 count, *next_hash)); 653 count, *next_hash));
654 return count; 654 return count;
655errout: 655errout:
@@ -1050,7 +1050,7 @@ struct dentry *ext3_get_parent(struct dentry *child)
1050 parent = ERR_PTR(-ENOMEM); 1050 parent = ERR_PTR(-ENOMEM);
1051 } 1051 }
1052 return parent; 1052 return parent;
1053} 1053}
1054 1054
1055#define S_SHIFT 12 1055#define S_SHIFT 12
1056static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = { 1056static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = {
@@ -1198,7 +1198,7 @@ errout:
1198 * add_dirent_to_buf will attempt search the directory block for 1198 * add_dirent_to_buf will attempt search the directory block for
1199 * space. It will return -ENOSPC if no space is available, and -EIO 1199 * space. It will return -ENOSPC if no space is available, and -EIO
1200 * and -EEXIST if directory entry already exists. 1200 * and -EEXIST if directory entry already exists.
1201 * 1201 *
1202 * NOTE! bh is NOT released in the case where ENOSPC is returned. In 1202 * NOTE! bh is NOT released in the case where ENOSPC is returned. In
1203 * all other cases bh is released. 1203 * all other cases bh is released.
1204 */ 1204 */
@@ -1572,7 +1572,7 @@ cleanup:
1572 * ext3_delete_entry deletes a directory entry by merging it with the 1572 * ext3_delete_entry deletes a directory entry by merging it with the
1573 * previous entry 1573 * previous entry
1574 */ 1574 */
1575static int ext3_delete_entry (handle_t *handle, 1575static int ext3_delete_entry (handle_t *handle,
1576 struct inode * dir, 1576 struct inode * dir,
1577 struct ext3_dir_entry_2 * de_del, 1577 struct ext3_dir_entry_2 * de_del,
1578 struct buffer_head * bh) 1578 struct buffer_head * bh)
@@ -1643,12 +1643,12 @@ static int ext3_add_nondir(handle_t *handle,
1643 * is so far negative - it has no inode. 1643 * is so far negative - it has no inode.
1644 * 1644 *
1645 * If the create succeeds, we fill in the inode information 1645 * If the create succeeds, we fill in the inode information
1646 * with d_instantiate(). 1646 * with d_instantiate().
1647 */ 1647 */
1648static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, 1648static int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
1649 struct nameidata *nd) 1649 struct nameidata *nd)
1650{ 1650{
1651 handle_t *handle; 1651 handle_t *handle;
1652 struct inode * inode; 1652 struct inode * inode;
1653 int err, retries = 0; 1653 int err, retries = 0;
1654 1654
@@ -1688,7 +1688,7 @@ static int ext3_mknod (struct inode * dir, struct dentry *dentry,
1688 1688
1689retry: 1689retry:
1690 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + 1690 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
1691 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1691 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1692 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); 1692 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb));
1693 if (IS_ERR(handle)) 1693 if (IS_ERR(handle))
1694 return PTR_ERR(handle); 1694 return PTR_ERR(handle);
@@ -1813,10 +1813,10 @@ static int empty_dir (struct inode * inode)
1813 de1 = (struct ext3_dir_entry_2 *) 1813 de1 = (struct ext3_dir_entry_2 *)
1814 ((char *) de + le16_to_cpu(de->rec_len)); 1814 ((char *) de + le16_to_cpu(de->rec_len));
1815 if (le32_to_cpu(de->inode) != inode->i_ino || 1815 if (le32_to_cpu(de->inode) != inode->i_ino ||
1816 !le32_to_cpu(de1->inode) || 1816 !le32_to_cpu(de1->inode) ||
1817 strcmp (".", de->name) || 1817 strcmp (".", de->name) ||
1818 strcmp ("..", de1->name)) { 1818 strcmp ("..", de1->name)) {
1819 ext3_warning (inode->i_sb, "empty_dir", 1819 ext3_warning (inode->i_sb, "empty_dir",
1820 "bad directory (dir #%lu) - no `.' or `..'", 1820 "bad directory (dir #%lu) - no `.' or `..'",
1821 inode->i_ino); 1821 inode->i_ino);
1822 brelse (bh); 1822 brelse (bh);
@@ -1883,7 +1883,7 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode)
1883 * being truncated, or files being unlinked. */ 1883 * being truncated, or files being unlinked. */
1884 1884
1885 /* @@@ FIXME: Observation from aviro: 1885 /* @@@ FIXME: Observation from aviro:
1886 * I think I can trigger J_ASSERT in ext3_orphan_add(). We block 1886 * I think I can trigger J_ASSERT in ext3_orphan_add(). We block
1887 * here (on lock_super()), so race with ext3_link() which might bump 1887 * here (on lock_super()), so race with ext3_link() which might bump
1888 * ->i_nlink. For, say it, character device. Not a regular file, 1888 * ->i_nlink. For, say it, character device. Not a regular file,
1889 * not a directory, not a symlink and ->i_nlink > 0. 1889 * not a directory, not a symlink and ->i_nlink > 0.
@@ -1919,8 +1919,8 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode)
1919 if (!err) 1919 if (!err)
1920 list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); 1920 list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
1921 1921
1922 jbd_debug(4, "superblock will point to %ld\n", inode->i_ino); 1922 jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
1923 jbd_debug(4, "orphan inode %ld will point to %d\n", 1923 jbd_debug(4, "orphan inode %lu will point to %d\n",
1924 inode->i_ino, NEXT_ORPHAN(inode)); 1924 inode->i_ino, NEXT_ORPHAN(inode));
1925out_unlock: 1925out_unlock:
1926 unlock_super(sb); 1926 unlock_super(sb);
@@ -2129,7 +2129,7 @@ static int ext3_symlink (struct inode * dir,
2129 2129
2130retry: 2130retry:
2131 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + 2131 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
2132 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 + 2132 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 +
2133 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); 2133 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb));
2134 if (IS_ERR(handle)) 2134 if (IS_ERR(handle))
2135 return PTR_ERR(handle); 2135 return PTR_ERR(handle);
@@ -2227,7 +2227,7 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2227 DQUOT_INIT(new_dentry->d_inode); 2227 DQUOT_INIT(new_dentry->d_inode);
2228 handle = ext3_journal_start(old_dir, 2 * 2228 handle = ext3_journal_start(old_dir, 2 *
2229 EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) + 2229 EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) +
2230 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); 2230 EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
2231 if (IS_ERR(handle)) 2231 if (IS_ERR(handle))
2232 return PTR_ERR(handle); 2232 return PTR_ERR(handle);
2233 2233
@@ -2393,4 +2393,4 @@ struct inode_operations ext3_special_inode_operations = {
2393 .removexattr = generic_removexattr, 2393 .removexattr = generic_removexattr,
2394#endif 2394#endif
2395 .permission = ext3_permission, 2395 .permission = ext3_permission,
2396}; 2396};
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 5e1337fd878a..b73cba12f79c 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -336,7 +336,7 @@ static int verify_reserved_gdb(struct super_block *sb,
336 unsigned five = 5; 336 unsigned five = 5;
337 unsigned seven = 7; 337 unsigned seven = 7;
338 unsigned grp; 338 unsigned grp;
339 __u32 *p = (__u32 *)primary->b_data; 339 __le32 *p = (__le32 *)primary->b_data;
340 int gdbackups = 0; 340 int gdbackups = 0;
341 341
342 while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) { 342 while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) {
@@ -380,7 +380,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
380 struct buffer_head *dind; 380 struct buffer_head *dind;
381 int gdbackups; 381 int gdbackups;
382 struct ext3_iloc iloc; 382 struct ext3_iloc iloc;
383 __u32 *data; 383 __le32 *data;
384 int err; 384 int err;
385 385
386 if (test_opt(sb, DEBUG)) 386 if (test_opt(sb, DEBUG))
@@ -417,7 +417,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
417 goto exit_bh; 417 goto exit_bh;
418 } 418 }
419 419
420 data = (__u32 *)dind->b_data; 420 data = (__le32 *)dind->b_data;
421 if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) { 421 if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) {
422 ext3_warning(sb, __FUNCTION__, 422 ext3_warning(sb, __FUNCTION__,
423 "new group %u GDT block "E3FSBLK" not reserved", 423 "new group %u GDT block "E3FSBLK" not reserved",
@@ -439,8 +439,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
439 if ((err = ext3_reserve_inode_write(handle, inode, &iloc))) 439 if ((err = ext3_reserve_inode_write(handle, inode, &iloc)))
440 goto exit_dindj; 440 goto exit_dindj;
441 441
442 n_group_desc = (struct buffer_head **)kmalloc((gdb_num + 1) * 442 n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
443 sizeof(struct buffer_head *), GFP_KERNEL); 443 GFP_KERNEL);
444 if (!n_group_desc) { 444 if (!n_group_desc) {
445 err = -ENOMEM; 445 err = -ENOMEM;
446 ext3_warning (sb, __FUNCTION__, 446 ext3_warning (sb, __FUNCTION__,
@@ -519,7 +519,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
519 struct buffer_head *dind; 519 struct buffer_head *dind;
520 struct ext3_iloc iloc; 520 struct ext3_iloc iloc;
521 ext3_fsblk_t blk; 521 ext3_fsblk_t blk;
522 __u32 *data, *end; 522 __le32 *data, *end;
523 int gdbackups = 0; 523 int gdbackups = 0;
524 int res, i; 524 int res, i;
525 int err; 525 int err;
@@ -536,8 +536,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
536 } 536 }
537 537
538 blk = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + EXT3_SB(sb)->s_gdb_count; 538 blk = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + EXT3_SB(sb)->s_gdb_count;
539 data = (__u32 *)dind->b_data + EXT3_SB(sb)->s_gdb_count; 539 data = (__le32 *)dind->b_data + EXT3_SB(sb)->s_gdb_count;
540 end = (__u32 *)dind->b_data + EXT3_ADDR_PER_BLOCK(sb); 540 end = (__le32 *)dind->b_data + EXT3_ADDR_PER_BLOCK(sb);
541 541
542 /* Get each reserved primary GDT block and verify it holds backups */ 542 /* Get each reserved primary GDT block and verify it holds backups */
543 for (res = 0; res < reserved_gdb; res++, blk++) { 543 for (res = 0; res < reserved_gdb; res++, blk++) {
@@ -545,7 +545,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
545 ext3_warning(sb, __FUNCTION__, 545 ext3_warning(sb, __FUNCTION__,
546 "reserved block "E3FSBLK 546 "reserved block "E3FSBLK
547 " not at offset %ld", 547 " not at offset %ld",
548 blk, (long)(data - (__u32 *)dind->b_data)); 548 blk,
549 (long)(data - (__le32 *)dind->b_data));
549 err = -EINVAL; 550 err = -EINVAL;
550 goto exit_bh; 551 goto exit_bh;
551 } 552 }
@@ -560,7 +561,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
560 goto exit_bh; 561 goto exit_bh;
561 } 562 }
562 if (++data >= end) 563 if (++data >= end)
563 data = (__u32 *)dind->b_data; 564 data = (__le32 *)dind->b_data;
564 } 565 }
565 566
566 for (i = 0; i < reserved_gdb; i++) { 567 for (i = 0; i < reserved_gdb; i++) {
@@ -584,7 +585,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
584 blk = input->group * EXT3_BLOCKS_PER_GROUP(sb); 585 blk = input->group * EXT3_BLOCKS_PER_GROUP(sb);
585 for (i = 0; i < reserved_gdb; i++) { 586 for (i = 0; i < reserved_gdb; i++) {
586 int err2; 587 int err2;
587 data = (__u32 *)primary[i]->b_data; 588 data = (__le32 *)primary[i]->b_data;
588 /* printk("reserving backup %lu[%u] = %lu\n", 589 /* printk("reserving backup %lu[%u] = %lu\n",
589 primary[i]->b_blocknr, gdbackups, 590 primary[i]->b_blocknr, gdbackups,
590 blk + primary[i]->b_blocknr); */ 591 blk + primary[i]->b_blocknr); */
@@ -689,7 +690,7 @@ exit_err:
689 "can't update backup for group %d (err %d), " 690 "can't update backup for group %d (err %d), "
690 "forcing fsck on next reboot", group, err); 691 "forcing fsck on next reboot", group, err);
691 sbi->s_mount_state &= ~EXT3_VALID_FS; 692 sbi->s_mount_state &= ~EXT3_VALID_FS;
692 sbi->s_es->s_state &= ~cpu_to_le16(EXT3_VALID_FS); 693 sbi->s_es->s_state &= cpu_to_le16(~EXT3_VALID_FS);
693 mark_buffer_dirty(sbi->s_sbh); 694 mark_buffer_dirty(sbi->s_sbh);
694 } 695 }
695} 696}
@@ -730,6 +731,18 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
730 return -EPERM; 731 return -EPERM;
731 } 732 }
732 733
734 if (le32_to_cpu(es->s_blocks_count) + input->blocks_count <
735 le32_to_cpu(es->s_blocks_count)) {
736 ext3_warning(sb, __FUNCTION__, "blocks_count overflow\n");
737 return -EINVAL;
738 }
739
740 if (le32_to_cpu(es->s_inodes_count) + EXT3_INODES_PER_GROUP(sb) <
741 le32_to_cpu(es->s_inodes_count)) {
742 ext3_warning(sb, __FUNCTION__, "inodes_count overflow\n");
743 return -EINVAL;
744 }
745
733 if (reserved_gdb || gdb_off == 0) { 746 if (reserved_gdb || gdb_off == 0) {
734 if (!EXT3_HAS_COMPAT_FEATURE(sb, 747 if (!EXT3_HAS_COMPAT_FEATURE(sb,
735 EXT3_FEATURE_COMPAT_RESIZE_INODE)){ 748 EXT3_FEATURE_COMPAT_RESIZE_INODE)){
@@ -958,6 +971,11 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
958 971
959 add = EXT3_BLOCKS_PER_GROUP(sb) - last; 972 add = EXT3_BLOCKS_PER_GROUP(sb) - last;
960 973
974 if (o_blocks_count + add < o_blocks_count) {
975 ext3_warning(sb, __FUNCTION__, "blocks_count overflow");
976 return -EINVAL;
977 }
978
961 if (o_blocks_count + add > n_blocks_count) 979 if (o_blocks_count + add > n_blocks_count)
962 add = n_blocks_count - o_blocks_count; 980 add = n_blocks_count - o_blocks_count;
963 981
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 3559086eee5f..8bfd56ef18ca 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -45,7 +45,7 @@
45static int ext3_load_journal(struct super_block *, struct ext3_super_block *, 45static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
46 unsigned long journal_devnum); 46 unsigned long journal_devnum);
47static int ext3_create_journal(struct super_block *, struct ext3_super_block *, 47static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
48 int); 48 unsigned int);
49static void ext3_commit_super (struct super_block * sb, 49static void ext3_commit_super (struct super_block * sb,
50 struct ext3_super_block * es, 50 struct ext3_super_block * es,
51 int sync); 51 int sync);
@@ -62,13 +62,13 @@ static void ext3_unlockfs(struct super_block *sb);
62static void ext3_write_super (struct super_block * sb); 62static void ext3_write_super (struct super_block * sb);
63static void ext3_write_super_lockfs(struct super_block *sb); 63static void ext3_write_super_lockfs(struct super_block *sb);
64 64
65/* 65/*
66 * Wrappers for journal_start/end. 66 * Wrappers for journal_start/end.
67 * 67 *
68 * The only special thing we need to do here is to make sure that all 68 * The only special thing we need to do here is to make sure that all
69 * journal_end calls result in the superblock being marked dirty, so 69 * journal_end calls result in the superblock being marked dirty, so
70 * that sync() will call the filesystem's write_super callback if 70 * that sync() will call the filesystem's write_super callback if
71 * appropriate. 71 * appropriate.
72 */ 72 */
73handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks) 73handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks)
74{ 74{
@@ -90,11 +90,11 @@ handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks)
90 return journal_start(journal, nblocks); 90 return journal_start(journal, nblocks);
91} 91}
92 92
93/* 93/*
94 * The only special thing we need to do here is to make sure that all 94 * The only special thing we need to do here is to make sure that all
95 * journal_stop calls result in the superblock being marked dirty, so 95 * journal_stop calls result in the superblock being marked dirty, so
96 * that sync() will call the filesystem's write_super callback if 96 * that sync() will call the filesystem's write_super callback if
97 * appropriate. 97 * appropriate.
98 */ 98 */
99int __ext3_journal_stop(const char *where, handle_t *handle) 99int __ext3_journal_stop(const char *where, handle_t *handle)
100{ 100{
@@ -159,20 +159,21 @@ static void ext3_handle_error(struct super_block *sb)
159 if (sb->s_flags & MS_RDONLY) 159 if (sb->s_flags & MS_RDONLY)
160 return; 160 return;
161 161
162 if (test_opt (sb, ERRORS_RO)) { 162 if (!test_opt (sb, ERRORS_CONT)) {
163 printk (KERN_CRIT "Remounting filesystem read-only\n");
164 sb->s_flags |= MS_RDONLY;
165 } else {
166 journal_t *journal = EXT3_SB(sb)->s_journal; 163 journal_t *journal = EXT3_SB(sb)->s_journal;
167 164
168 EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT; 165 EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT;
169 if (journal) 166 if (journal)
170 journal_abort(journal, -EIO); 167 journal_abort(journal, -EIO);
171 } 168 }
169 if (test_opt (sb, ERRORS_RO)) {
170 printk (KERN_CRIT "Remounting filesystem read-only\n");
171 sb->s_flags |= MS_RDONLY;
172 }
173 ext3_commit_super(sb, es, 1);
172 if (test_opt(sb, ERRORS_PANIC)) 174 if (test_opt(sb, ERRORS_PANIC))
173 panic("EXT3-fs (device %s): panic forced after error\n", 175 panic("EXT3-fs (device %s): panic forced after error\n",
174 sb->s_id); 176 sb->s_id);
175 ext3_commit_super(sb, es, 1);
176} 177}
177 178
178void ext3_error (struct super_block * sb, const char * function, 179void ext3_error (struct super_block * sb, const char * function,
@@ -369,16 +370,16 @@ static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi)
369{ 370{
370 struct list_head *l; 371 struct list_head *l;
371 372
372 printk(KERN_ERR "sb orphan head is %d\n", 373 printk(KERN_ERR "sb orphan head is %d\n",
373 le32_to_cpu(sbi->s_es->s_last_orphan)); 374 le32_to_cpu(sbi->s_es->s_last_orphan));
374 375
375 printk(KERN_ERR "sb_info orphan list:\n"); 376 printk(KERN_ERR "sb_info orphan list:\n");
376 list_for_each(l, &sbi->s_orphan) { 377 list_for_each(l, &sbi->s_orphan) {
377 struct inode *inode = orphan_list_entry(l); 378 struct inode *inode = orphan_list_entry(l);
378 printk(KERN_ERR " " 379 printk(KERN_ERR " "
379 "inode %s:%ld at %p: mode %o, nlink %d, next %d\n", 380 "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
380 inode->i_sb->s_id, inode->i_ino, inode, 381 inode->i_sb->s_id, inode->i_ino, inode,
381 inode->i_mode, inode->i_nlink, 382 inode->i_mode, inode->i_nlink,
382 NEXT_ORPHAN(inode)); 383 NEXT_ORPHAN(inode));
383 } 384 }
384} 385}
@@ -475,7 +476,7 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
475 inode_init_once(&ei->vfs_inode); 476 inode_init_once(&ei->vfs_inode);
476 } 477 }
477} 478}
478 479
479static int init_inodecache(void) 480static int init_inodecache(void)
480{ 481{
481 ext3_inode_cachep = kmem_cache_create("ext3_inode_cache", 482 ext3_inode_cachep = kmem_cache_create("ext3_inode_cache",
@@ -490,8 +491,7 @@ static int init_inodecache(void)
490 491
491static void destroy_inodecache(void) 492static void destroy_inodecache(void)
492{ 493{
493 if (kmem_cache_destroy(ext3_inode_cachep)) 494 kmem_cache_destroy(ext3_inode_cachep);
494 printk(KERN_INFO "ext3_inode_cache: not all structures were freed\n");
495} 495}
496 496
497static void ext3_clear_inode(struct inode *inode) 497static void ext3_clear_inode(struct inode *inode)
@@ -733,8 +733,8 @@ static match_table_t tokens = {
733 733
734static ext3_fsblk_t get_sb_block(void **data) 734static ext3_fsblk_t get_sb_block(void **data)
735{ 735{
736 ext3_fsblk_t sb_block; 736 ext3_fsblk_t sb_block;
737 char *options = (char *) *data; 737 char *options = (char *) *data;
738 738
739 if (!options || strncmp(options, "sb=", 3) != 0) 739 if (!options || strncmp(options, "sb=", 3) != 0)
740 return 1; /* Default location */ 740 return 1; /* Default location */
@@ -753,7 +753,7 @@ static ext3_fsblk_t get_sb_block(void **data)
753} 753}
754 754
755static int parse_options (char *options, struct super_block *sb, 755static int parse_options (char *options, struct super_block *sb,
756 unsigned long *inum, unsigned long *journal_devnum, 756 unsigned int *inum, unsigned long *journal_devnum,
757 ext3_fsblk_t *n_blocks_count, int is_remount) 757 ext3_fsblk_t *n_blocks_count, int is_remount)
758{ 758{
759 struct ext3_sb_info *sbi = EXT3_SB(sb); 759 struct ext3_sb_info *sbi = EXT3_SB(sb);
@@ -1174,7 +1174,8 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
1174static int ext3_check_descriptors (struct super_block * sb) 1174static int ext3_check_descriptors (struct super_block * sb)
1175{ 1175{
1176 struct ext3_sb_info *sbi = EXT3_SB(sb); 1176 struct ext3_sb_info *sbi = EXT3_SB(sb);
1177 ext3_fsblk_t block = le32_to_cpu(sbi->s_es->s_first_data_block); 1177 ext3_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
1178 ext3_fsblk_t last_block;
1178 struct ext3_group_desc * gdp = NULL; 1179 struct ext3_group_desc * gdp = NULL;
1179 int desc_block = 0; 1180 int desc_block = 0;
1180 int i; 1181 int i;
@@ -1183,12 +1184,17 @@ static int ext3_check_descriptors (struct super_block * sb)
1183 1184
1184 for (i = 0; i < sbi->s_groups_count; i++) 1185 for (i = 0; i < sbi->s_groups_count; i++)
1185 { 1186 {
1187 if (i == sbi->s_groups_count - 1)
1188 last_block = le32_to_cpu(sbi->s_es->s_blocks_count) - 1;
1189 else
1190 last_block = first_block +
1191 (EXT3_BLOCKS_PER_GROUP(sb) - 1);
1192
1186 if ((i % EXT3_DESC_PER_BLOCK(sb)) == 0) 1193 if ((i % EXT3_DESC_PER_BLOCK(sb)) == 0)
1187 gdp = (struct ext3_group_desc *) 1194 gdp = (struct ext3_group_desc *)
1188 sbi->s_group_desc[desc_block++]->b_data; 1195 sbi->s_group_desc[desc_block++]->b_data;
1189 if (le32_to_cpu(gdp->bg_block_bitmap) < block || 1196 if (le32_to_cpu(gdp->bg_block_bitmap) < first_block ||
1190 le32_to_cpu(gdp->bg_block_bitmap) >= 1197 le32_to_cpu(gdp->bg_block_bitmap) > last_block)
1191 block + EXT3_BLOCKS_PER_GROUP(sb))
1192 { 1198 {
1193 ext3_error (sb, "ext3_check_descriptors", 1199 ext3_error (sb, "ext3_check_descriptors",
1194 "Block bitmap for group %d" 1200 "Block bitmap for group %d"
@@ -1197,9 +1203,8 @@ static int ext3_check_descriptors (struct super_block * sb)
1197 le32_to_cpu(gdp->bg_block_bitmap)); 1203 le32_to_cpu(gdp->bg_block_bitmap));
1198 return 0; 1204 return 0;
1199 } 1205 }
1200 if (le32_to_cpu(gdp->bg_inode_bitmap) < block || 1206 if (le32_to_cpu(gdp->bg_inode_bitmap) < first_block ||
1201 le32_to_cpu(gdp->bg_inode_bitmap) >= 1207 le32_to_cpu(gdp->bg_inode_bitmap) > last_block)
1202 block + EXT3_BLOCKS_PER_GROUP(sb))
1203 { 1208 {
1204 ext3_error (sb, "ext3_check_descriptors", 1209 ext3_error (sb, "ext3_check_descriptors",
1205 "Inode bitmap for group %d" 1210 "Inode bitmap for group %d"
@@ -1208,9 +1213,9 @@ static int ext3_check_descriptors (struct super_block * sb)
1208 le32_to_cpu(gdp->bg_inode_bitmap)); 1213 le32_to_cpu(gdp->bg_inode_bitmap));
1209 return 0; 1214 return 0;
1210 } 1215 }
1211 if (le32_to_cpu(gdp->bg_inode_table) < block || 1216 if (le32_to_cpu(gdp->bg_inode_table) < first_block ||
1212 le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group >= 1217 le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group >
1213 block + EXT3_BLOCKS_PER_GROUP(sb)) 1218 last_block)
1214 { 1219 {
1215 ext3_error (sb, "ext3_check_descriptors", 1220 ext3_error (sb, "ext3_check_descriptors",
1216 "Inode table for group %d" 1221 "Inode table for group %d"
@@ -1219,7 +1224,7 @@ static int ext3_check_descriptors (struct super_block * sb)
1219 le32_to_cpu(gdp->bg_inode_table)); 1224 le32_to_cpu(gdp->bg_inode_table));
1220 return 0; 1225 return 0;
1221 } 1226 }
1222 block += EXT3_BLOCKS_PER_GROUP(sb); 1227 first_block += EXT3_BLOCKS_PER_GROUP(sb);
1223 gdp++; 1228 gdp++;
1224 } 1229 }
1225 1230
@@ -1301,17 +1306,17 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1301 DQUOT_INIT(inode); 1306 DQUOT_INIT(inode);
1302 if (inode->i_nlink) { 1307 if (inode->i_nlink) {
1303 printk(KERN_DEBUG 1308 printk(KERN_DEBUG
1304 "%s: truncating inode %ld to %Ld bytes\n", 1309 "%s: truncating inode %lu to %Ld bytes\n",
1305 __FUNCTION__, inode->i_ino, inode->i_size); 1310 __FUNCTION__, inode->i_ino, inode->i_size);
1306 jbd_debug(2, "truncating inode %ld to %Ld bytes\n", 1311 jbd_debug(2, "truncating inode %lu to %Ld bytes\n",
1307 inode->i_ino, inode->i_size); 1312 inode->i_ino, inode->i_size);
1308 ext3_truncate(inode); 1313 ext3_truncate(inode);
1309 nr_truncates++; 1314 nr_truncates++;
1310 } else { 1315 } else {
1311 printk(KERN_DEBUG 1316 printk(KERN_DEBUG
1312 "%s: deleting unreferenced inode %ld\n", 1317 "%s: deleting unreferenced inode %lu\n",
1313 __FUNCTION__, inode->i_ino); 1318 __FUNCTION__, inode->i_ino);
1314 jbd_debug(2, "deleting unreferenced inode %ld\n", 1319 jbd_debug(2, "deleting unreferenced inode %lu\n",
1315 inode->i_ino); 1320 inode->i_ino);
1316 nr_orphans++; 1321 nr_orphans++;
1317 } 1322 }
@@ -1390,7 +1395,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1390 ext3_fsblk_t sb_block = get_sb_block(&data); 1395 ext3_fsblk_t sb_block = get_sb_block(&data);
1391 ext3_fsblk_t logic_sb_block; 1396 ext3_fsblk_t logic_sb_block;
1392 unsigned long offset = 0; 1397 unsigned long offset = 0;
1393 unsigned long journal_inum = 0; 1398 unsigned int journal_inum = 0;
1394 unsigned long journal_devnum = 0; 1399 unsigned long journal_devnum = 0;
1395 unsigned long def_mount_opts; 1400 unsigned long def_mount_opts;
1396 struct inode *root; 1401 struct inode *root;
@@ -1401,11 +1406,10 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1401 int needs_recovery; 1406 int needs_recovery;
1402 __le32 features; 1407 __le32 features;
1403 1408
1404 sbi = kmalloc(sizeof(*sbi), GFP_KERNEL); 1409 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
1405 if (!sbi) 1410 if (!sbi)
1406 return -ENOMEM; 1411 return -ENOMEM;
1407 sb->s_fs_info = sbi; 1412 sb->s_fs_info = sbi;
1408 memset(sbi, 0, sizeof(*sbi));
1409 sbi->s_mount_opt = 0; 1413 sbi->s_mount_opt = 0;
1410 sbi->s_resuid = EXT3_DEF_RESUID; 1414 sbi->s_resuid = EXT3_DEF_RESUID;
1411 sbi->s_resgid = EXT3_DEF_RESGID; 1415 sbi->s_resgid = EXT3_DEF_RESGID;
@@ -1483,7 +1487,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1483 (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) || 1487 (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) ||
1484 EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) || 1488 EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
1485 EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U))) 1489 EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U)))
1486 printk(KERN_WARNING 1490 printk(KERN_WARNING
1487 "EXT3-fs warning: feature flags set on rev 0 fs, " 1491 "EXT3-fs warning: feature flags set on rev 0 fs, "
1488 "running e2fsck is recommended\n"); 1492 "running e2fsck is recommended\n");
1489 /* 1493 /*
@@ -1509,7 +1513,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1509 1513
1510 if (blocksize < EXT3_MIN_BLOCK_SIZE || 1514 if (blocksize < EXT3_MIN_BLOCK_SIZE ||
1511 blocksize > EXT3_MAX_BLOCK_SIZE) { 1515 blocksize > EXT3_MAX_BLOCK_SIZE) {
1512 printk(KERN_ERR 1516 printk(KERN_ERR
1513 "EXT3-fs: Unsupported filesystem blocksize %d on %s.\n", 1517 "EXT3-fs: Unsupported filesystem blocksize %d on %s.\n",
1514 blocksize, sb->s_id); 1518 blocksize, sb->s_id);
1515 goto failed_mount; 1519 goto failed_mount;
@@ -1533,14 +1537,14 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1533 offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize; 1537 offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
1534 bh = sb_bread(sb, logic_sb_block); 1538 bh = sb_bread(sb, logic_sb_block);
1535 if (!bh) { 1539 if (!bh) {
1536 printk(KERN_ERR 1540 printk(KERN_ERR
1537 "EXT3-fs: Can't read superblock on 2nd try.\n"); 1541 "EXT3-fs: Can't read superblock on 2nd try.\n");
1538 goto failed_mount; 1542 goto failed_mount;
1539 } 1543 }
1540 es = (struct ext3_super_block *)(((char *)bh->b_data) + offset); 1544 es = (struct ext3_super_block *)(((char *)bh->b_data) + offset);
1541 sbi->s_es = es; 1545 sbi->s_es = es;
1542 if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) { 1546 if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) {
1543 printk (KERN_ERR 1547 printk (KERN_ERR
1544 "EXT3-fs: Magic mismatch, very weird !\n"); 1548 "EXT3-fs: Magic mismatch, very weird !\n");
1545 goto failed_mount; 1549 goto failed_mount;
1546 } 1550 }
@@ -1622,10 +1626,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1622 1626
1623 if (EXT3_BLOCKS_PER_GROUP(sb) == 0) 1627 if (EXT3_BLOCKS_PER_GROUP(sb) == 0)
1624 goto cantfind_ext3; 1628 goto cantfind_ext3;
1625 sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) - 1629 sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
1626 le32_to_cpu(es->s_first_data_block) + 1630 le32_to_cpu(es->s_first_data_block) - 1)
1627 EXT3_BLOCKS_PER_GROUP(sb) - 1) / 1631 / EXT3_BLOCKS_PER_GROUP(sb)) + 1;
1628 EXT3_BLOCKS_PER_GROUP(sb);
1629 db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) / 1632 db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) /
1630 EXT3_DESC_PER_BLOCK(sb); 1633 EXT3_DESC_PER_BLOCK(sb);
1631 sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *), 1634 sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
@@ -1820,7 +1823,7 @@ out_fail:
1820/* 1823/*
1821 * Setup any per-fs journal parameters now. We'll do this both on 1824 * Setup any per-fs journal parameters now. We'll do this both on
1822 * initial mount, once the journal has been initialised but before we've 1825 * initial mount, once the journal has been initialised but before we've
1823 * done any recovery; and again on any subsequent remount. 1826 * done any recovery; and again on any subsequent remount.
1824 */ 1827 */
1825static void ext3_init_journal_params(struct super_block *sb, journal_t *journal) 1828static void ext3_init_journal_params(struct super_block *sb, journal_t *journal)
1826{ 1829{
@@ -1840,7 +1843,8 @@ static void ext3_init_journal_params(struct super_block *sb, journal_t *journal)
1840 spin_unlock(&journal->j_state_lock); 1843 spin_unlock(&journal->j_state_lock);
1841} 1844}
1842 1845
1843static journal_t *ext3_get_journal(struct super_block *sb, int journal_inum) 1846static journal_t *ext3_get_journal(struct super_block *sb,
1847 unsigned int journal_inum)
1844{ 1848{
1845 struct inode *journal_inode; 1849 struct inode *journal_inode;
1846 journal_t *journal; 1850 journal_t *journal;
@@ -1975,7 +1979,7 @@ static int ext3_load_journal(struct super_block *sb,
1975 unsigned long journal_devnum) 1979 unsigned long journal_devnum)
1976{ 1980{
1977 journal_t *journal; 1981 journal_t *journal;
1978 int journal_inum = le32_to_cpu(es->s_journal_inum); 1982 unsigned int journal_inum = le32_to_cpu(es->s_journal_inum);
1979 dev_t journal_dev; 1983 dev_t journal_dev;
1980 int err = 0; 1984 int err = 0;
1981 int really_read_only; 1985 int really_read_only;
@@ -2061,7 +2065,7 @@ static int ext3_load_journal(struct super_block *sb,
2061 2065
2062static int ext3_create_journal(struct super_block * sb, 2066static int ext3_create_journal(struct super_block * sb,
2063 struct ext3_super_block * es, 2067 struct ext3_super_block * es,
2064 int journal_inum) 2068 unsigned int journal_inum)
2065{ 2069{
2066 journal_t *journal; 2070 journal_t *journal;
2067 2071
@@ -2074,7 +2078,7 @@ static int ext3_create_journal(struct super_block * sb,
2074 if (!(journal = ext3_get_journal(sb, journal_inum))) 2078 if (!(journal = ext3_get_journal(sb, journal_inum)))
2075 return -EINVAL; 2079 return -EINVAL;
2076 2080
2077 printk(KERN_INFO "EXT3-fs: creating new journal on inode %d\n", 2081 printk(KERN_INFO "EXT3-fs: creating new journal on inode %u\n",
2078 journal_inum); 2082 journal_inum);
2079 2083
2080 if (journal_create(journal)) { 2084 if (journal_create(journal)) {
@@ -2342,10 +2346,8 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2342 */ 2346 */
2343 ext3_clear_journal_err(sb, es); 2347 ext3_clear_journal_err(sb, es);
2344 sbi->s_mount_state = le16_to_cpu(es->s_state); 2348 sbi->s_mount_state = le16_to_cpu(es->s_state);
2345 if ((ret = ext3_group_extend(sb, es, n_blocks_count))) { 2349 if ((err = ext3_group_extend(sb, es, n_blocks_count)))
2346 err = ret;
2347 goto restore_opts; 2350 goto restore_opts;
2348 }
2349 if (!ext3_setup_super (sb, es, 0)) 2351 if (!ext3_setup_super (sb, es, 0))
2350 sb->s_flags &= ~MS_RDONLY; 2352 sb->s_flags &= ~MS_RDONLY;
2351 } 2353 }
@@ -2734,7 +2736,7 @@ static int __init init_ext3_fs(void)
2734out: 2736out:
2735 destroy_inodecache(); 2737 destroy_inodecache();
2736out1: 2738out1:
2737 exit_ext3_xattr(); 2739 exit_ext3_xattr();
2738 return err; 2740 return err;
2739} 2741}
2740 2742
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index a44a0562203a..f86f2482f01d 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -75,7 +75,7 @@
75 75
76#ifdef EXT3_XATTR_DEBUG 76#ifdef EXT3_XATTR_DEBUG
77# define ea_idebug(inode, f...) do { \ 77# define ea_idebug(inode, f...) do { \
78 printk(KERN_DEBUG "inode %s:%ld: ", \ 78 printk(KERN_DEBUG "inode %s:%lu: ", \
79 inode->i_sb->s_id, inode->i_ino); \ 79 inode->i_sb->s_id, inode->i_ino); \
80 printk(f); \ 80 printk(f); \
81 printk("\n"); \ 81 printk("\n"); \
@@ -233,7 +233,7 @@ ext3_xattr_block_get(struct inode *inode, int name_index, const char *name,
233 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 233 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
234 if (ext3_xattr_check_block(bh)) { 234 if (ext3_xattr_check_block(bh)) {
235bad_block: ext3_error(inode->i_sb, __FUNCTION__, 235bad_block: ext3_error(inode->i_sb, __FUNCTION__,
236 "inode %ld: bad block "E3FSBLK, inode->i_ino, 236 "inode %lu: bad block "E3FSBLK, inode->i_ino,
237 EXT3_I(inode)->i_file_acl); 237 EXT3_I(inode)->i_file_acl);
238 error = -EIO; 238 error = -EIO;
239 goto cleanup; 239 goto cleanup;
@@ -375,7 +375,7 @@ ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
375 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 375 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
376 if (ext3_xattr_check_block(bh)) { 376 if (ext3_xattr_check_block(bh)) {
377 ext3_error(inode->i_sb, __FUNCTION__, 377 ext3_error(inode->i_sb, __FUNCTION__,
378 "inode %ld: bad block "E3FSBLK, inode->i_ino, 378 "inode %lu: bad block "E3FSBLK, inode->i_ino,
379 EXT3_I(inode)->i_file_acl); 379 EXT3_I(inode)->i_file_acl);
380 error = -EIO; 380 error = -EIO;
381 goto cleanup; 381 goto cleanup;
@@ -647,7 +647,7 @@ ext3_xattr_block_find(struct inode *inode, struct ext3_xattr_info *i,
647 le32_to_cpu(BHDR(bs->bh)->h_refcount)); 647 le32_to_cpu(BHDR(bs->bh)->h_refcount));
648 if (ext3_xattr_check_block(bs->bh)) { 648 if (ext3_xattr_check_block(bs->bh)) {
649 ext3_error(sb, __FUNCTION__, 649 ext3_error(sb, __FUNCTION__,
650 "inode %ld: bad block "E3FSBLK, inode->i_ino, 650 "inode %lu: bad block "E3FSBLK, inode->i_ino,
651 EXT3_I(inode)->i_file_acl); 651 EXT3_I(inode)->i_file_acl);
652 error = -EIO; 652 error = -EIO;
653 goto cleanup; 653 goto cleanup;
@@ -848,7 +848,7 @@ cleanup_dquot:
848 848
849bad_block: 849bad_block:
850 ext3_error(inode->i_sb, __FUNCTION__, 850 ext3_error(inode->i_sb, __FUNCTION__,
851 "inode %ld: bad block "E3FSBLK, inode->i_ino, 851 "inode %lu: bad block "E3FSBLK, inode->i_ino,
852 EXT3_I(inode)->i_file_acl); 852 EXT3_I(inode)->i_file_acl);
853 goto cleanup; 853 goto cleanup;
854 854
@@ -1077,14 +1077,14 @@ ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
1077 bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); 1077 bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
1078 if (!bh) { 1078 if (!bh) {
1079 ext3_error(inode->i_sb, __FUNCTION__, 1079 ext3_error(inode->i_sb, __FUNCTION__,
1080 "inode %ld: block "E3FSBLK" read error", inode->i_ino, 1080 "inode %lu: block "E3FSBLK" read error", inode->i_ino,
1081 EXT3_I(inode)->i_file_acl); 1081 EXT3_I(inode)->i_file_acl);
1082 goto cleanup; 1082 goto cleanup;
1083 } 1083 }
1084 if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || 1084 if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
1085 BHDR(bh)->h_blocks != cpu_to_le32(1)) { 1085 BHDR(bh)->h_blocks != cpu_to_le32(1)) {
1086 ext3_error(inode->i_sb, __FUNCTION__, 1086 ext3_error(inode->i_sb, __FUNCTION__,
1087 "inode %ld: bad block "E3FSBLK, inode->i_ino, 1087 "inode %lu: bad block "E3FSBLK, inode->i_ino,
1088 EXT3_I(inode)->i_file_acl); 1088 EXT3_I(inode)->i_file_acl);
1089 goto cleanup; 1089 goto cleanup;
1090 } 1090 }
@@ -1211,7 +1211,7 @@ again:
1211 bh = sb_bread(inode->i_sb, ce->e_block); 1211 bh = sb_bread(inode->i_sb, ce->e_block);
1212 if (!bh) { 1212 if (!bh) {
1213 ext3_error(inode->i_sb, __FUNCTION__, 1213 ext3_error(inode->i_sb, __FUNCTION__,
1214 "inode %ld: block %lu read error", 1214 "inode %lu: block %lu read error",
1215 inode->i_ino, (unsigned long) ce->e_block); 1215 inode->i_ino, (unsigned long) ce->e_block);
1216 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= 1216 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
1217 EXT3_XATTR_REFCOUNT_MAX) { 1217 EXT3_XATTR_REFCOUNT_MAX) {
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 97b967b84fc6..82cc4f59e3ba 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -58,8 +58,7 @@ int __init fat_cache_init(void)
58 58
59void fat_cache_destroy(void) 59void fat_cache_destroy(void)
60{ 60{
61 if (kmem_cache_destroy(fat_cache_cachep)) 61 kmem_cache_destroy(fat_cache_cachep);
62 printk(KERN_INFO "fat_cache: not all structures were freed\n");
63} 62}
64 63
65static inline struct fat_cache *fat_cache_alloc(struct inode *inode) 64static inline struct fat_cache *fat_cache_alloc(struct inode *inode)
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 1ee25232e6af..d50fc47169c1 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -13,6 +13,7 @@
13#include <linux/smp_lock.h> 13#include <linux/smp_lock.h>
14#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
15#include <linux/writeback.h> 15#include <linux/writeback.h>
16#include <linux/blkdev.h>
16 17
17int fat_generic_ioctl(struct inode *inode, struct file *filp, 18int fat_generic_ioctl(struct inode *inode, struct file *filp,
18 unsigned int cmd, unsigned long arg) 19 unsigned int cmd, unsigned long arg)
@@ -112,6 +113,16 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
112 } 113 }
113} 114}
114 115
116static int fat_file_release(struct inode *inode, struct file *filp)
117{
118 if ((filp->f_mode & FMODE_WRITE) &&
119 MSDOS_SB(inode->i_sb)->options.flush) {
120 fat_flush_inodes(inode->i_sb, inode, NULL);
121 blk_congestion_wait(WRITE, HZ/10);
122 }
123 return 0;
124}
125
115const struct file_operations fat_file_operations = { 126const struct file_operations fat_file_operations = {
116 .llseek = generic_file_llseek, 127 .llseek = generic_file_llseek,
117 .read = do_sync_read, 128 .read = do_sync_read,
@@ -121,6 +132,7 @@ const struct file_operations fat_file_operations = {
121 .aio_read = generic_file_aio_read, 132 .aio_read = generic_file_aio_read,
122 .aio_write = generic_file_aio_write, 133 .aio_write = generic_file_aio_write,
123 .mmap = generic_file_mmap, 134 .mmap = generic_file_mmap,
135 .release = fat_file_release,
124 .ioctl = fat_generic_ioctl, 136 .ioctl = fat_generic_ioctl,
125 .fsync = file_fsync, 137 .fsync = file_fsync,
126 .sendfile = generic_file_sendfile, 138 .sendfile = generic_file_sendfile,
@@ -289,6 +301,7 @@ void fat_truncate(struct inode *inode)
289 lock_kernel(); 301 lock_kernel();
290 fat_free(inode, nr_clusters); 302 fat_free(inode, nr_clusters);
291 unlock_kernel(); 303 unlock_kernel();
304 fat_flush_inodes(inode->i_sb, inode, NULL);
292} 305}
293 306
294struct inode_operations fat_file_inode_operations = { 307struct inode_operations fat_file_inode_operations = {
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 31b7174176ba..045738032a83 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -24,6 +24,7 @@
24#include <linux/vfs.h> 24#include <linux/vfs.h>
25#include <linux/parser.h> 25#include <linux/parser.h>
26#include <linux/uio.h> 26#include <linux/uio.h>
27#include <linux/writeback.h>
27#include <asm/unaligned.h> 28#include <asm/unaligned.h>
28 29
29#ifndef CONFIG_FAT_DEFAULT_IOCHARSET 30#ifndef CONFIG_FAT_DEFAULT_IOCHARSET
@@ -50,14 +51,14 @@ static int fat_add_cluster(struct inode *inode)
50 return err; 51 return err;
51} 52}
52 53
53static int __fat_get_blocks(struct inode *inode, sector_t iblock, 54static inline int __fat_get_block(struct inode *inode, sector_t iblock,
54 unsigned long *max_blocks, 55 unsigned long *max_blocks,
55 struct buffer_head *bh_result, int create) 56 struct buffer_head *bh_result, int create)
56{ 57{
57 struct super_block *sb = inode->i_sb; 58 struct super_block *sb = inode->i_sb;
58 struct msdos_sb_info *sbi = MSDOS_SB(sb); 59 struct msdos_sb_info *sbi = MSDOS_SB(sb);
59 sector_t phys;
60 unsigned long mapped_blocks; 60 unsigned long mapped_blocks;
61 sector_t phys;
61 int err, offset; 62 int err, offset;
62 63
63 err = fat_bmap(inode, iblock, &phys, &mapped_blocks); 64 err = fat_bmap(inode, iblock, &phys, &mapped_blocks);
@@ -73,7 +74,7 @@ static int __fat_get_blocks(struct inode *inode, sector_t iblock,
73 74
74 if (iblock != MSDOS_I(inode)->mmu_private >> sb->s_blocksize_bits) { 75 if (iblock != MSDOS_I(inode)->mmu_private >> sb->s_blocksize_bits) {
75 fat_fs_panic(sb, "corrupted file size (i_pos %lld, %lld)", 76 fat_fs_panic(sb, "corrupted file size (i_pos %lld, %lld)",
76 MSDOS_I(inode)->i_pos, MSDOS_I(inode)->mmu_private); 77 MSDOS_I(inode)->i_pos, MSDOS_I(inode)->mmu_private);
77 return -EIO; 78 return -EIO;
78 } 79 }
79 80
@@ -93,34 +94,29 @@ static int __fat_get_blocks(struct inode *inode, sector_t iblock,
93 err = fat_bmap(inode, iblock, &phys, &mapped_blocks); 94 err = fat_bmap(inode, iblock, &phys, &mapped_blocks);
94 if (err) 95 if (err)
95 return err; 96 return err;
97
96 BUG_ON(!phys); 98 BUG_ON(!phys);
97 BUG_ON(*max_blocks != mapped_blocks); 99 BUG_ON(*max_blocks != mapped_blocks);
98 set_buffer_new(bh_result); 100 set_buffer_new(bh_result);
99 map_bh(bh_result, sb, phys); 101 map_bh(bh_result, sb, phys);
102
100 return 0; 103 return 0;
101} 104}
102 105
103static int fat_get_blocks(struct inode *inode, sector_t iblock, 106static int fat_get_block(struct inode *inode, sector_t iblock,
104 struct buffer_head *bh_result, int create) 107 struct buffer_head *bh_result, int create)
105{ 108{
106 struct super_block *sb = inode->i_sb; 109 struct super_block *sb = inode->i_sb;
107 int err;
108 unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; 110 unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
111 int err;
109 112
110 err = __fat_get_blocks(inode, iblock, &max_blocks, bh_result, create); 113 err = __fat_get_block(inode, iblock, &max_blocks, bh_result, create);
111 if (err) 114 if (err)
112 return err; 115 return err;
113 bh_result->b_size = max_blocks << sb->s_blocksize_bits; 116 bh_result->b_size = max_blocks << sb->s_blocksize_bits;
114 return 0; 117 return 0;
115} 118}
116 119
117static int fat_get_block(struct inode *inode, sector_t iblock,
118 struct buffer_head *bh_result, int create)
119{
120 unsigned long max_blocks = 1;
121 return __fat_get_blocks(inode, iblock, &max_blocks, bh_result, create);
122}
123
124static int fat_writepage(struct page *page, struct writeback_control *wbc) 120static int fat_writepage(struct page *page, struct writeback_control *wbc)
125{ 121{
126 return block_write_full_page(page, fat_get_block, wbc); 122 return block_write_full_page(page, fat_get_block, wbc);
@@ -188,7 +184,7 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
188 * condition of fat_get_block() and ->truncate(). 184 * condition of fat_get_block() and ->truncate().
189 */ 185 */
190 return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 186 return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
191 offset, nr_segs, fat_get_blocks, NULL); 187 offset, nr_segs, fat_get_block, NULL);
192} 188}
193 189
194static sector_t _fat_bmap(struct address_space *mapping, sector_t block) 190static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
@@ -375,8 +371,6 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
375 inode->i_flags |= S_IMMUTABLE; 371 inode->i_flags |= S_IMMUTABLE;
376 } 372 }
377 MSDOS_I(inode)->i_attrs = de->attr & ATTR_UNUSED; 373 MSDOS_I(inode)->i_attrs = de->attr & ATTR_UNUSED;
378 /* this is as close to the truth as we can get ... */
379 inode->i_blksize = sbi->cluster_size;
380 inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1)) 374 inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1))
381 & ~((loff_t)sbi->cluster_size - 1)) >> 9; 375 & ~((loff_t)sbi->cluster_size - 1)) >> 9;
382 inode->i_mtime.tv_sec = 376 inode->i_mtime.tv_sec =
@@ -528,8 +522,7 @@ static int __init fat_init_inodecache(void)
528 522
529static void __exit fat_destroy_inodecache(void) 523static void __exit fat_destroy_inodecache(void)
530{ 524{
531 if (kmem_cache_destroy(fat_inode_cachep)) 525 kmem_cache_destroy(fat_inode_cachep);
532 printk(KERN_INFO "fat_inode_cache: not all structures were freed\n");
533} 526}
534 527
535static int fat_remount(struct super_block *sb, int *flags, char *data) 528static int fat_remount(struct super_block *sb, int *flags, char *data)
@@ -861,7 +854,7 @@ enum {
861 Opt_charset, Opt_shortname_lower, Opt_shortname_win95, 854 Opt_charset, Opt_shortname_lower, Opt_shortname_win95,
862 Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes, 855 Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
863 Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, 856 Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
864 Opt_obsolate, Opt_err, 857 Opt_obsolate, Opt_flush, Opt_err,
865}; 858};
866 859
867static match_table_t fat_tokens = { 860static match_table_t fat_tokens = {
@@ -893,7 +886,8 @@ static match_table_t fat_tokens = {
893 {Opt_obsolate, "cvf_format=%20s"}, 886 {Opt_obsolate, "cvf_format=%20s"},
894 {Opt_obsolate, "cvf_options=%100s"}, 887 {Opt_obsolate, "cvf_options=%100s"},
895 {Opt_obsolate, "posix"}, 888 {Opt_obsolate, "posix"},
896 {Opt_err, NULL} 889 {Opt_flush, "flush"},
890 {Opt_err, NULL},
897}; 891};
898static match_table_t msdos_tokens = { 892static match_table_t msdos_tokens = {
899 {Opt_nodots, "nodots"}, 893 {Opt_nodots, "nodots"},
@@ -1034,6 +1028,9 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
1034 return 0; 1028 return 0;
1035 opts->codepage = option; 1029 opts->codepage = option;
1036 break; 1030 break;
1031 case Opt_flush:
1032 opts->flush = 1;
1033 break;
1037 1034
1038 /* msdos specific */ 1035 /* msdos specific */
1039 case Opt_dots: 1036 case Opt_dots:
@@ -1137,7 +1134,6 @@ static int fat_read_root(struct inode *inode)
1137 MSDOS_I(inode)->i_start = 0; 1134 MSDOS_I(inode)->i_start = 0;
1138 inode->i_size = sbi->dir_entries * sizeof(struct msdos_dir_entry); 1135 inode->i_size = sbi->dir_entries * sizeof(struct msdos_dir_entry);
1139 } 1136 }
1140 inode->i_blksize = sbi->cluster_size;
1141 inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1)) 1137 inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1))
1142 & ~((loff_t)sbi->cluster_size - 1)) >> 9; 1138 & ~((loff_t)sbi->cluster_size - 1)) >> 9;
1143 MSDOS_I(inode)->i_logstart = 0; 1139 MSDOS_I(inode)->i_logstart = 0;
@@ -1168,11 +1164,10 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1168 long error; 1164 long error;
1169 char buf[50]; 1165 char buf[50];
1170 1166
1171 sbi = kmalloc(sizeof(struct msdos_sb_info), GFP_KERNEL); 1167 sbi = kzalloc(sizeof(struct msdos_sb_info), GFP_KERNEL);
1172 if (!sbi) 1168 if (!sbi)
1173 return -ENOMEM; 1169 return -ENOMEM;
1174 sb->s_fs_info = sbi; 1170 sb->s_fs_info = sbi;
1175 memset(sbi, 0, sizeof(struct msdos_sb_info));
1176 1171
1177 sb->s_flags |= MS_NODIRATIME; 1172 sb->s_flags |= MS_NODIRATIME;
1178 sb->s_magic = MSDOS_SUPER_MAGIC; 1173 sb->s_magic = MSDOS_SUPER_MAGIC;
@@ -1435,6 +1430,56 @@ out_fail:
1435 1430
1436EXPORT_SYMBOL_GPL(fat_fill_super); 1431EXPORT_SYMBOL_GPL(fat_fill_super);
1437 1432
1433/*
1434 * helper function for fat_flush_inodes. This writes both the inode
1435 * and the file data blocks, waiting for in flight data blocks before
1436 * the start of the call. It does not wait for any io started
1437 * during the call
1438 */
1439static int writeback_inode(struct inode *inode)
1440{
1441
1442 int ret;
1443 struct address_space *mapping = inode->i_mapping;
1444 struct writeback_control wbc = {
1445 .sync_mode = WB_SYNC_NONE,
1446 .nr_to_write = 0,
1447 };
1448 /* if we used WB_SYNC_ALL, sync_inode waits for the io for the
1449 * inode to finish. So WB_SYNC_NONE is sent down to sync_inode
1450 * and filemap_fdatawrite is used for the data blocks
1451 */
1452 ret = sync_inode(inode, &wbc);
1453 if (!ret)
1454 ret = filemap_fdatawrite(mapping);
1455 return ret;
1456}
1457
1458/*
1459 * write data and metadata corresponding to i1 and i2. The io is
1460 * started but we do not wait for any of it to finish.
1461 *
1462 * filemap_flush is used for the block device, so if there is a dirty
1463 * page for a block already in flight, we will not wait and start the
1464 * io over again
1465 */
1466int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2)
1467{
1468 int ret = 0;
1469 if (!MSDOS_SB(sb)->options.flush)
1470 return 0;
1471 if (i1)
1472 ret = writeback_inode(i1);
1473 if (!ret && i2)
1474 ret = writeback_inode(i2);
1475 if (!ret && sb) {
1476 struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
1477 ret = filemap_flush(mapping);
1478 }
1479 return ret;
1480}
1481EXPORT_SYMBOL_GPL(fat_flush_inodes);
1482
1438static int __init init_fat_fs(void) 1483static int __init init_fat_fs(void)
1439{ 1484{
1440 int err; 1485 int err;
diff --git a/fs/file.c b/fs/file.c
index b3c6b82e6a9d..8e81775c5dc8 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -281,80 +281,70 @@ static struct fdtable *alloc_fdtable(int nr)
281out2: 281out2:
282 nfds = fdt->max_fdset; 282 nfds = fdt->max_fdset;
283out: 283out:
284 if (new_openset) 284 free_fdset(new_openset, nfds);
285 free_fdset(new_openset, nfds); 285 free_fdset(new_execset, nfds);
286 if (new_execset)
287 free_fdset(new_execset, nfds);
288 kfree(fdt); 286 kfree(fdt);
289 return NULL; 287 return NULL;
290} 288}
291 289
292/* 290/*
293 * Expands the file descriptor table - it will allocate a new fdtable and 291 * Expand the file descriptor table.
294 * both fd array and fdset. It is expected to be called with the 292 * This function will allocate a new fdtable and both fd array and fdset, of
295 * files_lock held. 293 * the given size.
294 * Return <0 error code on error; 1 on successful completion.
295 * The files->file_lock should be held on entry, and will be held on exit.
296 */ 296 */
297static int expand_fdtable(struct files_struct *files, int nr) 297static int expand_fdtable(struct files_struct *files, int nr)
298 __releases(files->file_lock) 298 __releases(files->file_lock)
299 __acquires(files->file_lock) 299 __acquires(files->file_lock)
300{ 300{
301 int error = 0; 301 struct fdtable *new_fdt, *cur_fdt;
302 struct fdtable *fdt;
303 struct fdtable *nfdt = NULL;
304 302
305 spin_unlock(&files->file_lock); 303 spin_unlock(&files->file_lock);
306 nfdt = alloc_fdtable(nr); 304 new_fdt = alloc_fdtable(nr);
307 if (!nfdt) {
308 error = -ENOMEM;
309 spin_lock(&files->file_lock);
310 goto out;
311 }
312
313 spin_lock(&files->file_lock); 305 spin_lock(&files->file_lock);
314 fdt = files_fdtable(files); 306 if (!new_fdt)
307 return -ENOMEM;
315 /* 308 /*
316 * Check again since another task may have expanded the 309 * Check again since another task may have expanded the fd table while
317 * fd table while we dropped the lock 310 * we dropped the lock
318 */ 311 */
319 if (nr >= fdt->max_fds || nr >= fdt->max_fdset) { 312 cur_fdt = files_fdtable(files);
320 copy_fdtable(nfdt, fdt); 313 if (nr >= cur_fdt->max_fds || nr >= cur_fdt->max_fdset) {
314 /* Continue as planned */
315 copy_fdtable(new_fdt, cur_fdt);
316 rcu_assign_pointer(files->fdt, new_fdt);
317 free_fdtable(cur_fdt);
321 } else { 318 } else {
322 /* Somebody expanded while we dropped file_lock */ 319 /* Somebody else expanded, so undo our attempt */
323 spin_unlock(&files->file_lock); 320 __free_fdtable(new_fdt);
324 __free_fdtable(nfdt);
325 spin_lock(&files->file_lock);
326 goto out;
327 } 321 }
328 rcu_assign_pointer(files->fdt, nfdt); 322 return 1;
329 free_fdtable(fdt);
330out:
331 return error;
332} 323}
333 324
334/* 325/*
335 * Expand files. 326 * Expand files.
336 * Return <0 on error; 0 nothing done; 1 files expanded, we may have blocked. 327 * This function will expand the file structures, if the requested size exceeds
337 * Should be called with the files->file_lock spinlock held for write. 328 * the current capacity and there is room for expansion.
329 * Return <0 error code on error; 0 when nothing done; 1 when files were
330 * expanded and execution may have blocked.
331 * The files->file_lock should be held on entry, and will be held on exit.
338 */ 332 */
339int expand_files(struct files_struct *files, int nr) 333int expand_files(struct files_struct *files, int nr)
340{ 334{
341 int err, expand = 0;
342 struct fdtable *fdt; 335 struct fdtable *fdt;
343 336
344 fdt = files_fdtable(files); 337 fdt = files_fdtable(files);
345 if (nr >= fdt->max_fdset || nr >= fdt->max_fds) { 338 /* Do we need to expand? */
346 if (fdt->max_fdset >= NR_OPEN || 339 if (nr < fdt->max_fdset && nr < fdt->max_fds)
347 fdt->max_fds >= NR_OPEN || nr >= NR_OPEN) { 340 return 0;
348 err = -EMFILE; 341 /* Can we expand? */
349 goto out; 342 if (fdt->max_fdset >= NR_OPEN || fdt->max_fds >= NR_OPEN ||
350 } 343 nr >= NR_OPEN)
351 expand = 1; 344 return -EMFILE;
352 if ((err = expand_fdtable(files, nr))) 345
353 goto out; 346 /* All good, so we try */
354 } 347 return expand_fdtable(files, nr);
355 err = expand;
356out:
357 return err;
358} 348}
359 349
360static void __devinit fdtable_defer_list_init(int cpu) 350static void __devinit fdtable_defer_list_init(int cpu)
diff --git a/fs/file_table.c b/fs/file_table.c
index 0131ba06e1ee..bc35a40417d7 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -169,7 +169,7 @@ void fastcall __fput(struct file *file)
169 if (file->f_op && file->f_op->release) 169 if (file->f_op && file->f_op->release)
170 file->f_op->release(inode, file); 170 file->f_op->release(inode, file);
171 security_file_free(file); 171 security_file_free(file);
172 if (unlikely(inode->i_cdev != NULL)) 172 if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL))
173 cdev_put(inode->i_cdev); 173 cdev_put(inode->i_cdev);
174 fops_put(file->f_op); 174 fops_put(file->f_op);
175 if (file->f_mode & FMODE_WRITE) 175 if (file->f_mode & FMODE_WRITE)
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 9f1072836c8e..e3fa77c6ed56 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -69,8 +69,6 @@ int register_filesystem(struct file_system_type * fs)
69 int res = 0; 69 int res = 0;
70 struct file_system_type ** p; 70 struct file_system_type ** p;
71 71
72 if (!fs)
73 return -EINVAL;
74 if (fs->next) 72 if (fs->next)
75 return -EBUSY; 73 return -EBUSY;
76 INIT_LIST_HEAD(&fs->fs_supers); 74 INIT_LIST_HEAD(&fs->fs_supers);
diff --git a/fs/freevxfs/vxfs.h b/fs/freevxfs/vxfs.h
index d35979a58743..c8a92652612a 100644
--- a/fs/freevxfs/vxfs.h
+++ b/fs/freevxfs/vxfs.h
@@ -252,7 +252,7 @@ enum {
252 * Get filesystem private data from VFS inode. 252 * Get filesystem private data from VFS inode.
253 */ 253 */
254#define VXFS_INO(ip) \ 254#define VXFS_INO(ip) \
255 ((struct vxfs_inode_info *)(ip)->u.generic_ip) 255 ((struct vxfs_inode_info *)(ip)->i_private)
256 256
257/* 257/*
258 * Get filesystem private data from VFS superblock. 258 * Get filesystem private data from VFS superblock.
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index ca6a39714771..4786d51ad3bd 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -239,11 +239,10 @@ vxfs_iinit(struct inode *ip, struct vxfs_inode_info *vip)
239 ip->i_ctime.tv_nsec = 0; 239 ip->i_ctime.tv_nsec = 0;
240 ip->i_mtime.tv_nsec = 0; 240 ip->i_mtime.tv_nsec = 0;
241 241
242 ip->i_blksize = PAGE_SIZE;
243 ip->i_blocks = vip->vii_blocks; 242 ip->i_blocks = vip->vii_blocks;
244 ip->i_generation = vip->vii_gen; 243 ip->i_generation = vip->vii_gen;
245 244
246 ip->u.generic_ip = (void *)vip; 245 ip->i_private = vip;
247 246
248} 247}
249 248
@@ -338,5 +337,5 @@ vxfs_read_inode(struct inode *ip)
338void 337void
339vxfs_clear_inode(struct inode *ip) 338vxfs_clear_inode(struct inode *ip)
340{ 339{
341 kmem_cache_free(vxfs_inode_cachep, ip->u.generic_ip); 340 kmem_cache_free(vxfs_inode_cachep, ip->i_private);
342} 341}
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index b74b791fc23b..ac28b0835ffc 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -260,12 +260,17 @@ static struct file_system_type vxfs_fs_type = {
260static int __init 260static int __init
261vxfs_init(void) 261vxfs_init(void)
262{ 262{
263 int rv;
264
263 vxfs_inode_cachep = kmem_cache_create("vxfs_inode", 265 vxfs_inode_cachep = kmem_cache_create("vxfs_inode",
264 sizeof(struct vxfs_inode_info), 0, 266 sizeof(struct vxfs_inode_info), 0,
265 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL, NULL); 267 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL, NULL);
266 if (vxfs_inode_cachep) 268 if (!vxfs_inode_cachep)
267 return register_filesystem(&vxfs_fs_type); 269 return -ENOMEM;
268 return -ENOMEM; 270 rv = register_filesystem(&vxfs_fs_type);
271 if (rv < 0)
272 kmem_cache_destroy(vxfs_inode_cachep);
273 return rv;
269} 274}
270 275
271static void __exit 276static void __exit
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 46fe60b2da23..79ec1f23d4d2 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -23,7 +23,7 @@ static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file)
23{ 23{
24 struct fuse_conn *fc; 24 struct fuse_conn *fc;
25 mutex_lock(&fuse_mutex); 25 mutex_lock(&fuse_mutex);
26 fc = file->f_dentry->d_inode->u.generic_ip; 26 fc = file->f_dentry->d_inode->i_private;
27 if (fc) 27 if (fc)
28 fc = fuse_conn_get(fc); 28 fc = fuse_conn_get(fc);
29 mutex_unlock(&fuse_mutex); 29 mutex_unlock(&fuse_mutex);
@@ -98,7 +98,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
98 inode->i_op = iop; 98 inode->i_op = iop;
99 inode->i_fop = fop; 99 inode->i_fop = fop;
100 inode->i_nlink = nlink; 100 inode->i_nlink = nlink;
101 inode->u.generic_ip = fc; 101 inode->i_private = fc;
102 d_add(dentry, inode); 102 d_add(dentry, inode);
103 return dentry; 103 return dentry;
104} 104}
@@ -150,7 +150,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc)
150 150
151 for (i = fc->ctl_ndents - 1; i >= 0; i--) { 151 for (i = fc->ctl_ndents - 1; i >= 0; i--) {
152 struct dentry *dentry = fc->ctl_dentry[i]; 152 struct dentry *dentry = fc->ctl_dentry[i];
153 dentry->d_inode->u.generic_ip = NULL; 153 dentry->d_inode->i_private = NULL;
154 d_drop(dentry); 154 d_drop(dentry);
155 dput(dentry); 155 dput(dentry);
156 } 156 }
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 1e2006caf158..4fc557c40cc0 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -212,6 +212,7 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
212 * Called with fc->lock, unlocks it 212 * Called with fc->lock, unlocks it
213 */ 213 */
214static void request_end(struct fuse_conn *fc, struct fuse_req *req) 214static void request_end(struct fuse_conn *fc, struct fuse_req *req)
215 __releases(fc->lock)
215{ 216{
216 void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; 217 void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
217 req->end = NULL; 218 req->end = NULL;
@@ -640,6 +641,7 @@ static void request_wait(struct fuse_conn *fc)
640 */ 641 */
641static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req, 642static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req,
642 const struct iovec *iov, unsigned long nr_segs) 643 const struct iovec *iov, unsigned long nr_segs)
644 __releases(fc->lock)
643{ 645{
644 struct fuse_copy_state cs; 646 struct fuse_copy_state cs;
645 struct fuse_in_header ih; 647 struct fuse_in_header ih;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 409ce6a7cca4..f85b2a282f13 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -776,7 +776,7 @@ static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd)
776 if ((mask & MAY_EXEC) && !S_ISDIR(mode) && !(mode & S_IXUGO)) 776 if ((mask & MAY_EXEC) && !S_ISDIR(mode) && !(mode & S_IXUGO))
777 return -EACCES; 777 return -EACCES;
778 778
779 if (nd && (nd->flags & LOOKUP_ACCESS)) 779 if (nd && (nd->flags & (LOOKUP_ACCESS | LOOKUP_CHDIR)))
780 return fuse_access(inode, mask); 780 return fuse_access(inode, mask);
781 return 0; 781 return 0;
782 } 782 }
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 7d25092262ae..7d0a9aee01f2 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -118,7 +118,6 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr)
118 inode->i_uid = attr->uid; 118 inode->i_uid = attr->uid;
119 inode->i_gid = attr->gid; 119 inode->i_gid = attr->gid;
120 i_size_write(inode, attr->size); 120 i_size_write(inode, attr->size);
121 inode->i_blksize = PAGE_CACHE_SIZE;
122 inode->i_blocks = attr->blocks; 121 inode->i_blocks = attr->blocks;
123 inode->i_atime.tv_sec = attr->atime; 122 inode->i_atime.tv_sec = attr->atime;
124 inode->i_atime.tv_nsec = attr->atimensec; 123 inode->i_atime.tv_nsec = attr->atimensec;
@@ -252,6 +251,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
252 memset(&outarg, 0, sizeof(outarg)); 251 memset(&outarg, 0, sizeof(outarg));
253 req->in.numargs = 0; 252 req->in.numargs = 0;
254 req->in.h.opcode = FUSE_STATFS; 253 req->in.h.opcode = FUSE_STATFS;
254 req->in.h.nodeid = get_node_id(dentry->d_inode);
255 req->out.numargs = 1; 255 req->out.numargs = 1;
256 req->out.args[0].size = 256 req->out.args[0].size =
257 fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg); 257 fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg);
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
new file mode 100644
index 000000000000..9ccb78947171
--- /dev/null
+++ b/fs/generic_acl.c
@@ -0,0 +1,197 @@
1/*
2 * fs/generic_acl.c
3 *
4 * (C) 2005 Andreas Gruenbacher <agruen@suse.de>
5 *
6 * This file is released under the GPL.
7 */
8
9#include <linux/sched.h>
10#include <linux/fs.h>
11#include <linux/generic_acl.h>
12
13/**
14 * generic_acl_list - Generic xattr_handler->list() operation
15 * @ops: Filesystem specific getacl and setacl callbacks
16 */
17size_t
18generic_acl_list(struct inode *inode, struct generic_acl_operations *ops,
19 int type, char *list, size_t list_size)
20{
21 struct posix_acl *acl;
22 const char *name;
23 size_t size;
24
25 acl = ops->getacl(inode, type);
26 if (!acl)
27 return 0;
28 posix_acl_release(acl);
29
30 switch(type) {
31 case ACL_TYPE_ACCESS:
32 name = POSIX_ACL_XATTR_ACCESS;
33 break;
34
35 case ACL_TYPE_DEFAULT:
36 name = POSIX_ACL_XATTR_DEFAULT;
37 break;
38
39 default:
40 return 0;
41 }
42 size = strlen(name) + 1;
43 if (list && size <= list_size)
44 memcpy(list, name, size);
45 return size;
46}
47
48/**
49 * generic_acl_get - Generic xattr_handler->get() operation
50 * @ops: Filesystem specific getacl and setacl callbacks
51 */
52int
53generic_acl_get(struct inode *inode, struct generic_acl_operations *ops,
54 int type, void *buffer, size_t size)
55{
56 struct posix_acl *acl;
57 int error;
58
59 acl = ops->getacl(inode, type);
60 if (!acl)
61 return -ENODATA;
62 error = posix_acl_to_xattr(acl, buffer, size);
63 posix_acl_release(acl);
64
65 return error;
66}
67
68/**
69 * generic_acl_set - Generic xattr_handler->set() operation
70 * @ops: Filesystem specific getacl and setacl callbacks
71 */
72int
73generic_acl_set(struct inode *inode, struct generic_acl_operations *ops,
74 int type, const void *value, size_t size)
75{
76 struct posix_acl *acl = NULL;
77 int error;
78
79 if (S_ISLNK(inode->i_mode))
80 return -EOPNOTSUPP;
81 if (current->fsuid != inode->i_uid && !capable(CAP_FOWNER))
82 return -EPERM;
83 if (value) {
84 acl = posix_acl_from_xattr(value, size);
85 if (IS_ERR(acl))
86 return PTR_ERR(acl);
87 }
88 if (acl) {
89 mode_t mode;
90
91 error = posix_acl_valid(acl);
92 if (error)
93 goto failed;
94 switch(type) {
95 case ACL_TYPE_ACCESS:
96 mode = inode->i_mode;
97 error = posix_acl_equiv_mode(acl, &mode);
98 if (error < 0)
99 goto failed;
100 inode->i_mode = mode;
101 if (error == 0) {
102 posix_acl_release(acl);
103 acl = NULL;
104 }
105 break;
106
107 case ACL_TYPE_DEFAULT:
108 if (!S_ISDIR(inode->i_mode)) {
109 error = -EINVAL;
110 goto failed;
111 }
112 break;
113 }
114 }
115 ops->setacl(inode, type, acl);
116 error = 0;
117failed:
118 posix_acl_release(acl);
119 return error;
120}
121
122/**
123 * generic_acl_init - Take care of acl inheritance at @inode create time
124 * @ops: Filesystem specific getacl and setacl callbacks
125 *
126 * Files created inside a directory with a default ACL inherit the
127 * directory's default ACL.
128 */
129int
130generic_acl_init(struct inode *inode, struct inode *dir,
131 struct generic_acl_operations *ops)
132{
133 struct posix_acl *acl = NULL;
134 mode_t mode = inode->i_mode;
135 int error;
136
137 inode->i_mode = mode & ~current->fs->umask;
138 if (!S_ISLNK(inode->i_mode))
139 acl = ops->getacl(dir, ACL_TYPE_DEFAULT);
140 if (acl) {
141 struct posix_acl *clone;
142
143 if (S_ISDIR(inode->i_mode)) {
144 clone = posix_acl_clone(acl, GFP_KERNEL);
145 error = -ENOMEM;
146 if (!clone)
147 goto cleanup;
148 ops->setacl(inode, ACL_TYPE_DEFAULT, clone);
149 posix_acl_release(clone);
150 }
151 clone = posix_acl_clone(acl, GFP_KERNEL);
152 error = -ENOMEM;
153 if (!clone)
154 goto cleanup;
155 error = posix_acl_create_masq(clone, &mode);
156 if (error >= 0) {
157 inode->i_mode = mode;
158 if (error > 0)
159 ops->setacl(inode, ACL_TYPE_ACCESS, clone);
160 }
161 posix_acl_release(clone);
162 }
163 error = 0;
164
165cleanup:
166 posix_acl_release(acl);
167 return error;
168}
169
170/**
171 * generic_acl_chmod - change the access acl of @inode upon chmod()
 172 * @ops: Filesystem specific getacl and setacl callbacks
173 *
174 * A chmod also changes the permissions of the owner, group/mask, and
175 * other ACL entries.
176 */
177int
178generic_acl_chmod(struct inode *inode, struct generic_acl_operations *ops)
179{
180 struct posix_acl *acl, *clone;
181 int error = 0;
182
183 if (S_ISLNK(inode->i_mode))
184 return -EOPNOTSUPP;
185 acl = ops->getacl(inode, ACL_TYPE_ACCESS);
186 if (acl) {
187 clone = posix_acl_clone(acl, GFP_KERNEL);
188 posix_acl_release(acl);
189 if (!clone)
190 return -ENOMEM;
191 error = posix_acl_chmod_masq(clone, inode->i_mode);
192 if (!error)
193 ops->setacl(inode, ACL_TYPE_ACCESS, clone);
194 posix_acl_release(clone);
195 }
196 return error;
197}
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index 13231dd5ce66..0d200068d0af 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -249,10 +249,9 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
249 sb = tree->inode->i_sb; 249 sb = tree->inode->i_sb;
250 size = sizeof(struct hfs_bnode) + tree->pages_per_bnode * 250 size = sizeof(struct hfs_bnode) + tree->pages_per_bnode *
251 sizeof(struct page *); 251 sizeof(struct page *);
252 node = kmalloc(size, GFP_KERNEL); 252 node = kzalloc(size, GFP_KERNEL);
253 if (!node) 253 if (!node)
254 return NULL; 254 return NULL;
255 memset(node, 0, size);
256 node->tree = tree; 255 node->tree = tree;
257 node->this = cnid; 256 node->this = cnid;
258 set_bit(HFS_BNODE_NEW, &node->flags); 257 set_bit(HFS_BNODE_NEW, &node->flags);
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 400357994319..5fd0ed71f923 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -21,10 +21,9 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
21 struct page *page; 21 struct page *page;
22 unsigned int size; 22 unsigned int size;
23 23
24 tree = kmalloc(sizeof(*tree), GFP_KERNEL); 24 tree = kzalloc(sizeof(*tree), GFP_KERNEL);
25 if (!tree) 25 if (!tree)
26 return NULL; 26 return NULL;
27 memset(tree, 0, sizeof(*tree));
28 27
29 init_MUTEX(&tree->tree_lock); 28 init_MUTEX(&tree->tree_lock);
30 spin_lock_init(&tree->hash_lock); 29 spin_lock_init(&tree->hash_lock);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 315cf44a90b2..d05641c35fc9 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -154,7 +154,6 @@ struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, int mode)
154 inode->i_gid = current->fsgid; 154 inode->i_gid = current->fsgid;
155 inode->i_nlink = 1; 155 inode->i_nlink = 1;
156 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 156 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
157 inode->i_blksize = HFS_SB(sb)->alloc_blksz;
158 HFS_I(inode)->flags = 0; 157 HFS_I(inode)->flags = 0;
159 HFS_I(inode)->rsrc_inode = NULL; 158 HFS_I(inode)->rsrc_inode = NULL;
160 HFS_I(inode)->fs_blocks = 0; 159 HFS_I(inode)->fs_blocks = 0;
@@ -284,7 +283,6 @@ static int hfs_read_inode(struct inode *inode, void *data)
284 inode->i_uid = hsb->s_uid; 283 inode->i_uid = hsb->s_uid;
285 inode->i_gid = hsb->s_gid; 284 inode->i_gid = hsb->s_gid;
286 inode->i_nlink = 1; 285 inode->i_nlink = 1;
287 inode->i_blksize = HFS_SB(inode->i_sb)->alloc_blksz;
288 286
289 if (idata->key) 287 if (idata->key)
290 HFS_I(inode)->cat_key = *idata->key; 288 HFS_I(inode)->cat_key = *idata->key;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 34937ee83ab1..d43b4fcc8ad3 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -356,11 +356,10 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
356 struct inode *root_inode; 356 struct inode *root_inode;
357 int res; 357 int res;
358 358
359 sbi = kmalloc(sizeof(struct hfs_sb_info), GFP_KERNEL); 359 sbi = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL);
360 if (!sbi) 360 if (!sbi)
361 return -ENOMEM; 361 return -ENOMEM;
362 sb->s_fs_info = sbi; 362 sb->s_fs_info = sbi;
363 memset(sbi, 0, sizeof(struct hfs_sb_info));
364 INIT_HLIST_HEAD(&sbi->rsrc_inodes); 363 INIT_HLIST_HEAD(&sbi->rsrc_inodes);
365 364
366 res = -EINVAL; 365 res = -EINVAL;
@@ -455,8 +454,7 @@ static int __init init_hfs_fs(void)
455static void __exit exit_hfs_fs(void) 454static void __exit exit_hfs_fs(void)
456{ 455{
457 unregister_filesystem(&hfs_fs_type); 456 unregister_filesystem(&hfs_fs_type);
458 if (kmem_cache_destroy(hfs_inode_cachep)) 457 kmem_cache_destroy(hfs_inode_cachep);
459 printk(KERN_ERR "hfs_inode_cache: not all structures were freed\n");
460} 458}
461 459
462module_init(init_hfs_fs) 460module_init(init_hfs_fs)
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 77bf434da679..29da6574ba77 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -409,10 +409,9 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
409 sb = tree->inode->i_sb; 409 sb = tree->inode->i_sb;
410 size = sizeof(struct hfs_bnode) + tree->pages_per_bnode * 410 size = sizeof(struct hfs_bnode) + tree->pages_per_bnode *
411 sizeof(struct page *); 411 sizeof(struct page *);
412 node = kmalloc(size, GFP_KERNEL); 412 node = kzalloc(size, GFP_KERNEL);
413 if (!node) 413 if (!node)
414 return NULL; 414 return NULL;
415 memset(node, 0, size);
416 node->tree = tree; 415 node->tree = tree;
417 node->this = cnid; 416 node->this = cnid;
418 set_bit(HFS_BNODE_NEW, &node->flags); 417 set_bit(HFS_BNODE_NEW, &node->flags);
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index cfc852fdd1b5..a9b9e872e29a 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -24,10 +24,9 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
24 struct page *page; 24 struct page *page;
25 unsigned int size; 25 unsigned int size;
26 26
27 tree = kmalloc(sizeof(*tree), GFP_KERNEL); 27 tree = kzalloc(sizeof(*tree), GFP_KERNEL);
28 if (!tree) 28 if (!tree)
29 return NULL; 29 return NULL;
30 memset(tree, 0, sizeof(*tree));
31 30
32 init_MUTEX(&tree->tree_lock); 31 init_MUTEX(&tree->tree_lock);
33 spin_lock_init(&tree->hash_lock); 32 spin_lock_init(&tree->hash_lock);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 924ecdef8091..0eb1a6092668 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -304,7 +304,6 @@ struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
304 inode->i_gid = current->fsgid; 304 inode->i_gid = current->fsgid;
305 inode->i_nlink = 1; 305 inode->i_nlink = 1;
306 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 306 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
307 inode->i_blksize = HFSPLUS_SB(sb).alloc_blksz;
308 INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); 307 INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list);
309 init_MUTEX(&HFSPLUS_I(inode).extents_lock); 308 init_MUTEX(&HFSPLUS_I(inode).extents_lock);
310 atomic_set(&HFSPLUS_I(inode).opencnt, 0); 309 atomic_set(&HFSPLUS_I(inode).opencnt, 0);
@@ -407,7 +406,6 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
407 type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset); 406 type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset);
408 407
409 HFSPLUS_I(inode).dev = 0; 408 HFSPLUS_I(inode).dev = 0;
410 inode->i_blksize = HFSPLUS_SB(inode->i_sb).alloc_blksz;
411 if (type == HFSPLUS_FOLDER) { 409 if (type == HFSPLUS_FOLDER) {
412 struct hfsplus_cat_folder *folder = &entry.folder; 410 struct hfsplus_cat_folder *folder = &entry.folder;
413 411
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index d279d5924f28..194eede52fa4 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -493,8 +493,7 @@ static int __init init_hfsplus_fs(void)
493static void __exit exit_hfsplus_fs(void) 493static void __exit exit_hfsplus_fs(void)
494{ 494{
495 unregister_filesystem(&hfsplus_fs_type); 495 unregister_filesystem(&hfsplus_fs_type);
496 if (kmem_cache_destroy(hfsplus_inode_cachep)) 496 kmem_cache_destroy(hfsplus_inode_cachep);
497 printk(KERN_ERR "hfsplus_inode_cache: not all structures were freed\n");
498} 497}
499 498
500module_init(init_hfsplus_fs) 499module_init(init_hfsplus_fs)
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index b82e3d9c8790..322e876c35ed 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -156,7 +156,6 @@ static int read_name(struct inode *ino, char *name)
156 ino->i_mode = i_mode; 156 ino->i_mode = i_mode;
157 ino->i_nlink = i_nlink; 157 ino->i_nlink = i_nlink;
158 ino->i_size = i_size; 158 ino->i_size = i_size;
159 ino->i_blksize = i_blksize;
160 ino->i_blocks = i_blocks; 159 ino->i_blocks = i_blocks;
161 return(0); 160 return(0);
162} 161}
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index 2807aa833e62..b52b7381d10f 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -76,7 +76,7 @@ void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffe
76 return NULL; 76 return NULL;
77 } 77 }
78 78
79 qbh->data = data = (char *)kmalloc(2048, GFP_NOFS); 79 qbh->data = data = kmalloc(2048, GFP_NOFS);
80 if (!data) { 80 if (!data) {
81 printk("HPFS: hpfs_map_4sectors: out of memory\n"); 81 printk("HPFS: hpfs_map_4sectors: out of memory\n");
82 goto bail; 82 goto bail;
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index f687d54ed442..32ab51e42b96 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -12,7 +12,6 @@
12#include <linux/mutex.h> 12#include <linux/mutex.h>
13#include <linux/pagemap.h> 13#include <linux/pagemap.h>
14#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
15#include <linux/hpfs_fs.h>
16#include <linux/slab.h> 15#include <linux/slab.h>
17#include <linux/smp_lock.h> 16#include <linux/smp_lock.h>
18 17
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 56f2c338c4d9..bcf6ee36e065 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -17,7 +17,6 @@ void hpfs_init_inode(struct inode *i)
17 i->i_gid = hpfs_sb(sb)->sb_gid; 17 i->i_gid = hpfs_sb(sb)->sb_gid;
18 i->i_mode = hpfs_sb(sb)->sb_mode; 18 i->i_mode = hpfs_sb(sb)->sb_mode;
19 hpfs_inode->i_conv = hpfs_sb(sb)->sb_conv; 19 hpfs_inode->i_conv = hpfs_sb(sb)->sb_conv;
20 i->i_blksize = 512;
21 i->i_size = -1; 20 i->i_size = -1;
22 i->i_blocks = -1; 21 i->i_blocks = -1;
23 22
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index f798480a363f..450b5e0b4785 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -11,6 +11,7 @@
11#include <linux/parser.h> 11#include <linux/parser.h>
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/statfs.h> 13#include <linux/statfs.h>
14#include <linux/magic.h>
14 15
15/* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */ 16/* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */
16 17
@@ -202,8 +203,7 @@ static int init_inodecache(void)
202 203
203static void destroy_inodecache(void) 204static void destroy_inodecache(void)
204{ 205{
205 if (kmem_cache_destroy(hpfs_inode_cachep)) 206 kmem_cache_destroy(hpfs_inode_cachep);
206 printk(KERN_INFO "hpfs_inode_cache: not all structures were freed\n");
207} 207}
208 208
209/* 209/*
@@ -461,11 +461,10 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
461 461
462 int o; 462 int o;
463 463
464 sbi = kmalloc(sizeof(*sbi), GFP_KERNEL); 464 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
465 if (!sbi) 465 if (!sbi)
466 return -ENOMEM; 466 return -ENOMEM;
467 s->s_fs_info = sbi; 467 s->s_fs_info = sbi;
468 memset(sbi, 0, sizeof(*sbi));
469 468
470 sbi->sb_bmp_dir = NULL; 469 sbi->sb_bmp_dir = NULL;
471 sbi->sb_cp_table = NULL; 470 sbi->sb_cp_table = NULL;
diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c
index 3a9bdf58166f..dcb6d2e988b8 100644
--- a/fs/hppfs/hppfs_kern.c
+++ b/fs/hppfs/hppfs_kern.c
@@ -152,7 +152,6 @@ static void hppfs_read_inode(struct inode *ino)
152 ino->i_mode = proc_ino->i_mode; 152 ino->i_mode = proc_ino->i_mode;
153 ino->i_nlink = proc_ino->i_nlink; 153 ino->i_nlink = proc_ino->i_nlink;
154 ino->i_size = proc_ino->i_size; 154 ino->i_size = proc_ino->i_size;
155 ino->i_blksize = proc_ino->i_blksize;
156 ino->i_blocks = proc_ino->i_blocks; 155 ino->i_blocks = proc_ino->i_blocks;
157} 156}
158 157
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index c3920c96dadf..f5b8f329aca6 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -229,7 +229,7 @@ static void hugetlbfs_delete_inode(struct inode *inode)
229 clear_inode(inode); 229 clear_inode(inode);
230} 230}
231 231
232static void hugetlbfs_forget_inode(struct inode *inode) 232static void hugetlbfs_forget_inode(struct inode *inode) __releases(inode_lock)
233{ 233{
234 struct super_block *sb = inode->i_sb; 234 struct super_block *sb = inode->i_sb;
235 235
@@ -357,7 +357,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
357 inode->i_mode = mode; 357 inode->i_mode = mode;
358 inode->i_uid = uid; 358 inode->i_uid = uid;
359 inode->i_gid = gid; 359 inode->i_gid = gid;
360 inode->i_blksize = HPAGE_SIZE;
361 inode->i_blocks = 0; 360 inode->i_blocks = 0;
362 inode->i_mapping->a_ops = &hugetlbfs_aops; 361 inode->i_mapping->a_ops = &hugetlbfs_aops;
363 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; 362 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
diff --git a/fs/inode.c b/fs/inode.c
index 0bf9f0444a96..abf77471e6c4 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -133,7 +133,6 @@ static struct inode *alloc_inode(struct super_block *sb)
133 inode->i_bdev = NULL; 133 inode->i_bdev = NULL;
134 inode->i_cdev = NULL; 134 inode->i_cdev = NULL;
135 inode->i_rdev = 0; 135 inode->i_rdev = 0;
136 inode->i_security = NULL;
137 inode->dirtied_when = 0; 136 inode->dirtied_when = 0;
138 if (security_inode_alloc(inode)) { 137 if (security_inode_alloc(inode)) {
139 if (inode->i_sb->s_op->destroy_inode) 138 if (inode->i_sb->s_op->destroy_inode)
@@ -163,7 +162,7 @@ static struct inode *alloc_inode(struct super_block *sb)
163 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; 162 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
164 mapping->backing_dev_info = bdi; 163 mapping->backing_dev_info = bdi;
165 } 164 }
166 memset(&inode->u, 0, sizeof(inode->u)); 165 inode->i_private = 0;
167 inode->i_mapping = mapping; 166 inode->i_mapping = mapping;
168 } 167 }
169 return inode; 168 return inode;
@@ -254,9 +253,9 @@ void clear_inode(struct inode *inode)
254 DQUOT_DROP(inode); 253 DQUOT_DROP(inode);
255 if (inode->i_sb && inode->i_sb->s_op->clear_inode) 254 if (inode->i_sb && inode->i_sb->s_op->clear_inode)
256 inode->i_sb->s_op->clear_inode(inode); 255 inode->i_sb->s_op->clear_inode(inode);
257 if (inode->i_bdev) 256 if (S_ISBLK(inode->i_mode) && inode->i_bdev)
258 bd_forget(inode); 257 bd_forget(inode);
259 if (inode->i_cdev) 258 if (S_ISCHR(inode->i_mode) && inode->i_cdev)
260 cd_forget(inode); 259 cd_forget(inode);
261 inode->i_state = I_CLEAR; 260 inode->i_state = I_CLEAR;
262} 261}
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 14391361c886..c34b862cdbf2 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -96,9 +96,7 @@ static int init_inodecache(void)
96 96
97static void destroy_inodecache(void) 97static void destroy_inodecache(void)
98{ 98{
99 if (kmem_cache_destroy(isofs_inode_cachep)) 99 kmem_cache_destroy(isofs_inode_cachep);
100 printk(KERN_INFO "iso_inode_cache: not all structures were "
101 "freed\n");
102} 100}
103 101
104static int isofs_remount(struct super_block *sb, int *flags, char *data) 102static int isofs_remount(struct super_block *sb, int *flags, char *data)
@@ -557,11 +555,10 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
557 struct iso9660_options opt; 555 struct iso9660_options opt;
558 struct isofs_sb_info * sbi; 556 struct isofs_sb_info * sbi;
559 557
560 sbi = kmalloc(sizeof(*sbi), GFP_KERNEL); 558 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
561 if (!sbi) 559 if (!sbi)
562 return -ENOMEM; 560 return -ENOMEM;
563 s->s_fs_info = sbi; 561 s->s_fs_info = sbi;
564 memset(sbi, 0, sizeof(*sbi));
565 562
566 if (!parse_options((char *)data, &opt)) 563 if (!parse_options((char *)data, &opt))
567 goto out_freesbi; 564 goto out_freesbi;
@@ -963,30 +960,30 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
963 goto abort; 960 goto abort;
964 } 961 }
965 962
966 if (nextblk) { 963 /* On the last section, nextblk == 0, section size is likely to
967 while (b_off >= (offset + sect_size)) { 964 * exceed sect_size by a partial block, and access beyond the
968 struct inode *ninode; 965 * end of the file will reach beyond the section size, too.
969 966 */
970 offset += sect_size; 967 while (nextblk && (b_off >= (offset + sect_size))) {
971 if (nextblk == 0) 968 struct inode *ninode;
972 goto abort; 969
973 ninode = isofs_iget(inode->i_sb, nextblk, nextoff); 970 offset += sect_size;
974 if (!ninode) 971 ninode = isofs_iget(inode->i_sb, nextblk, nextoff);
975 goto abort; 972 if (!ninode)
976 firstext = ISOFS_I(ninode)->i_first_extent; 973 goto abort;
977 sect_size = ISOFS_I(ninode)->i_section_size >> ISOFS_BUFFER_BITS(ninode); 974 firstext = ISOFS_I(ninode)->i_first_extent;
978 nextblk = ISOFS_I(ninode)->i_next_section_block; 975 sect_size = ISOFS_I(ninode)->i_section_size >> ISOFS_BUFFER_BITS(ninode);
979 nextoff = ISOFS_I(ninode)->i_next_section_offset; 976 nextblk = ISOFS_I(ninode)->i_next_section_block;
980 iput(ninode); 977 nextoff = ISOFS_I(ninode)->i_next_section_offset;
981 978 iput(ninode);
982 if (++section > 100) { 979
983 printk("isofs_get_blocks: More than 100 file sections ?!?, aborting...\n"); 980 if (++section > 100) {
984 printk("isofs_get_blocks: block=%ld firstext=%u sect_size=%u " 981 printk("isofs_get_blocks: More than 100 file sections ?!?, aborting...\n");
985 "nextblk=%lu nextoff=%lu\n", 982 printk("isofs_get_blocks: block=%ld firstext=%u sect_size=%u "
986 iblock, firstext, (unsigned) sect_size, 983 "nextblk=%lu nextoff=%lu\n",
987 nextblk, nextoff); 984 iblock, firstext, (unsigned) sect_size,
988 goto abort; 985 nextblk, nextoff);
989 } 986 goto abort;
990 } 987 }
991 } 988 }
992 989
@@ -1238,7 +1235,7 @@ static void isofs_read_inode(struct inode *inode)
1238 } 1235 }
1239 inode->i_uid = sbi->s_uid; 1236 inode->i_uid = sbi->s_uid;
1240 inode->i_gid = sbi->s_gid; 1237 inode->i_gid = sbi->s_gid;
1241 inode->i_blocks = inode->i_blksize = 0; 1238 inode->i_blocks = 0;
1242 1239
1243 ei->i_format_parm[0] = 0; 1240 ei->i_format_parm[0] = 0;
1244 ei->i_format_parm[1] = 0; 1241 ei->i_format_parm[1] = 0;
@@ -1294,7 +1291,6 @@ static void isofs_read_inode(struct inode *inode)
1294 isonum_711 (de->ext_attr_length)); 1291 isonum_711 (de->ext_attr_length));
1295 1292
1296 /* Set the number of blocks for stat() - should be done before RR */ 1293 /* Set the number of blocks for stat() - should be done before RR */
1297 inode->i_blksize = PAGE_CACHE_SIZE; /* For stat() only */
1298 inode->i_blocks = (inode->i_size + 511) >> 9; 1294 inode->i_blocks = (inode->i_size + 511) >> 9;
1299 1295
1300 /* 1296 /*
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 47678a26c13b..0208cc7ac5d0 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * linux/fs/checkpoint.c 2 * linux/fs/checkpoint.c
3 * 3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
5 * 5 *
6 * Copyright 1999 Red Hat Software --- All Rights Reserved 6 * Copyright 1999 Red Hat Software --- All Rights Reserved
@@ -9,8 +9,8 @@
9 * the terms of the GNU General Public License, version 2, or at your 9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference. 10 * option, any later version, incorporated herein by reference.
11 * 11 *
12 * Checkpoint routines for the generic filesystem journaling code. 12 * Checkpoint routines for the generic filesystem journaling code.
13 * Part of the ext2fs journaling system. 13 * Part of the ext2fs journaling system.
14 * 14 *
15 * Checkpointing is the process of ensuring that a section of the log is 15 * Checkpointing is the process of ensuring that a section of the log is
16 * committed fully to disk, so that that portion of the log can be 16 * committed fully to disk, so that that portion of the log can be
@@ -145,6 +145,7 @@ void __log_wait_for_space(journal_t *journal)
145 * jbd_unlock_bh_state(). 145 * jbd_unlock_bh_state().
146 */ 146 */
147static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) 147static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
148 __releases(journal->j_list_lock)
148{ 149{
149 get_bh(bh); 150 get_bh(bh);
150 spin_unlock(&journal->j_list_lock); 151 spin_unlock(&journal->j_list_lock);
@@ -225,7 +226,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
225 * Try to flush one buffer from the checkpoint list to disk. 226 * Try to flush one buffer from the checkpoint list to disk.
226 * 227 *
227 * Return 1 if something happened which requires us to abort the current 228 * Return 1 if something happened which requires us to abort the current
228 * scan of the checkpoint list. 229 * scan of the checkpoint list.
229 * 230 *
230 * Called with j_list_lock held and drops it if 1 is returned 231 * Called with j_list_lock held and drops it if 1 is returned
231 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 232 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
@@ -269,7 +270,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
269 * possibly block, while still holding the journal lock. 270 * possibly block, while still holding the journal lock.
270 * We cannot afford to let the transaction logic start 271 * We cannot afford to let the transaction logic start
271 * messing around with this buffer before we write it to 272 * messing around with this buffer before we write it to
272 * disk, as that would break recoverability. 273 * disk, as that would break recoverability.
273 */ 274 */
274 BUFFER_TRACE(bh, "queue"); 275 BUFFER_TRACE(bh, "queue");
275 get_bh(bh); 276 get_bh(bh);
@@ -292,7 +293,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
292 * Perform an actual checkpoint. We take the first transaction on the 293 * Perform an actual checkpoint. We take the first transaction on the
293 * list of transactions to be checkpointed and send all its buffers 294 * list of transactions to be checkpointed and send all its buffers
294 * to disk. We submit larger chunks of data at once. 295 * to disk. We submit larger chunks of data at once.
295 * 296 *
296 * The journal should be locked before calling this function. 297 * The journal should be locked before calling this function.
297 */ 298 */
298int log_do_checkpoint(journal_t *journal) 299int log_do_checkpoint(journal_t *journal)
@@ -303,10 +304,10 @@ int log_do_checkpoint(journal_t *journal)
303 304
304 jbd_debug(1, "Start checkpoint\n"); 305 jbd_debug(1, "Start checkpoint\n");
305 306
306 /* 307 /*
307 * First thing: if there are any transactions in the log which 308 * First thing: if there are any transactions in the log which
308 * don't need checkpointing, just eliminate them from the 309 * don't need checkpointing, just eliminate them from the
309 * journal straight away. 310 * journal straight away.
310 */ 311 */
311 result = cleanup_journal_tail(journal); 312 result = cleanup_journal_tail(journal);
312 jbd_debug(1, "cleanup_journal_tail returned %d\n", result); 313 jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
@@ -384,9 +385,9 @@ out:
384 * we have already got rid of any since the last update of the log tail 385 * we have already got rid of any since the last update of the log tail
385 * in the journal superblock. If so, we can instantly roll the 386 * in the journal superblock. If so, we can instantly roll the
386 * superblock forward to remove those transactions from the log. 387 * superblock forward to remove those transactions from the log.
387 * 388 *
388 * Return <0 on error, 0 on success, 1 if there was nothing to clean up. 389 * Return <0 on error, 0 on success, 1 if there was nothing to clean up.
389 * 390 *
390 * Called with the journal lock held. 391 * Called with the journal lock held.
391 * 392 *
392 * This is the only part of the journaling code which really needs to be 393 * This is the only part of the journaling code which really needs to be
@@ -403,8 +404,8 @@ int cleanup_journal_tail(journal_t *journal)
403 unsigned long blocknr, freed; 404 unsigned long blocknr, freed;
404 405
405 /* OK, work out the oldest transaction remaining in the log, and 406 /* OK, work out the oldest transaction remaining in the log, and
406 * the log block it starts at. 407 * the log block it starts at.
407 * 408 *
408 * If the log is now empty, we need to work out which is the 409 * If the log is now empty, we need to work out which is the
409 * next transaction ID we will write, and where it will 410 * next transaction ID we will write, and where it will
410 * start. */ 411 * start. */
@@ -479,7 +480,7 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
479 if (!jh) 480 if (!jh)
480 return 0; 481 return 0;
481 482
482 last_jh = jh->b_cpprev; 483 last_jh = jh->b_cpprev;
483 do { 484 do {
484 jh = next_jh; 485 jh = next_jh;
485 next_jh = jh->b_cpnext; 486 next_jh = jh->b_cpnext;
@@ -557,7 +558,7 @@ out:
557 return ret; 558 return ret;
558} 559}
559 560
560/* 561/*
561 * journal_remove_checkpoint: called after a buffer has been committed 562 * journal_remove_checkpoint: called after a buffer has been committed
562 * to disk (either by being write-back flushed to disk, or being 563 * to disk (either by being write-back flushed to disk, or being
563 * committed to the log). 564 * committed to the log).
@@ -635,7 +636,7 @@ out:
635 * Called with the journal locked. 636 * Called with the journal locked.
636 * Called with j_list_lock held. 637 * Called with j_list_lock held.
637 */ 638 */
638void __journal_insert_checkpoint(struct journal_head *jh, 639void __journal_insert_checkpoint(struct journal_head *jh,
639 transaction_t *transaction) 640 transaction_t *transaction)
640{ 641{
641 JBUFFER_TRACE(jh, "entry"); 642 JBUFFER_TRACE(jh, "entry");
@@ -657,7 +658,7 @@ void __journal_insert_checkpoint(struct journal_head *jh,
657 658
658/* 659/*
659 * We've finished with this transaction structure: adios... 660 * We've finished with this transaction structure: adios...
660 * 661 *
661 * The transaction must have no links except for the checkpoint by this 662 * The transaction must have no links except for the checkpoint by this
662 * point. 663 * point.
663 * 664 *
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 42da60784311..32a8caf0c41e 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -160,6 +160,117 @@ static int journal_write_commit_record(journal_t *journal,
160 return (ret == -EIO); 160 return (ret == -EIO);
161} 161}
162 162
163static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
164{
165 int i;
166
167 for (i = 0; i < bufs; i++) {
168 wbuf[i]->b_end_io = end_buffer_write_sync;
169 /* We use-up our safety reference in submit_bh() */
170 submit_bh(WRITE, wbuf[i]);
171 }
172}
173
174/*
175 * Submit all the data buffers to disk
176 */
177static void journal_submit_data_buffers(journal_t *journal,
178 transaction_t *commit_transaction)
179{
180 struct journal_head *jh;
181 struct buffer_head *bh;
182 int locked;
183 int bufs = 0;
184 struct buffer_head **wbuf = journal->j_wbuf;
185
186 /*
187 * Whenever we unlock the journal and sleep, things can get added
188 * onto ->t_sync_datalist, so we have to keep looping back to
189 * write_out_data until we *know* that the list is empty.
190 *
191 * Cleanup any flushed data buffers from the data list. Even in
192 * abort mode, we want to flush this out as soon as possible.
193 */
194write_out_data:
195 cond_resched();
196 spin_lock(&journal->j_list_lock);
197
198 while (commit_transaction->t_sync_datalist) {
199 jh = commit_transaction->t_sync_datalist;
200 bh = jh2bh(jh);
201 locked = 0;
202
203 /* Get reference just to make sure buffer does not disappear
204 * when we are forced to drop various locks */
205 get_bh(bh);
206 /* If the buffer is dirty, we need to submit IO and hence
207 * we need the buffer lock. We try to lock the buffer without
208 * blocking. If we fail, we need to drop j_list_lock and do
209 * blocking lock_buffer().
210 */
211 if (buffer_dirty(bh)) {
212 if (test_set_buffer_locked(bh)) {
213 BUFFER_TRACE(bh, "needs blocking lock");
214 spin_unlock(&journal->j_list_lock);
215 /* Write out all data to prevent deadlocks */
216 journal_do_submit_data(wbuf, bufs);
217 bufs = 0;
218 lock_buffer(bh);
219 spin_lock(&journal->j_list_lock);
220 }
221 locked = 1;
222 }
223 /* We have to get bh_state lock. Again out of order, sigh. */
224 if (!inverted_lock(journal, bh)) {
225 jbd_lock_bh_state(bh);
226 spin_lock(&journal->j_list_lock);
227 }
228 /* Someone already cleaned up the buffer? */
229 if (!buffer_jbd(bh)
230 || jh->b_transaction != commit_transaction
231 || jh->b_jlist != BJ_SyncData) {
232 jbd_unlock_bh_state(bh);
233 if (locked)
234 unlock_buffer(bh);
235 BUFFER_TRACE(bh, "already cleaned up");
236 put_bh(bh);
237 continue;
238 }
239 if (locked && test_clear_buffer_dirty(bh)) {
240 BUFFER_TRACE(bh, "needs writeout, adding to array");
241 wbuf[bufs++] = bh;
242 __journal_file_buffer(jh, commit_transaction,
243 BJ_Locked);
244 jbd_unlock_bh_state(bh);
245 if (bufs == journal->j_wbufsize) {
246 spin_unlock(&journal->j_list_lock);
247 journal_do_submit_data(wbuf, bufs);
248 bufs = 0;
249 goto write_out_data;
250 }
251 }
252 else {
253 BUFFER_TRACE(bh, "writeout complete: unfile");
254 __journal_unfile_buffer(jh);
255 jbd_unlock_bh_state(bh);
256 if (locked)
257 unlock_buffer(bh);
258 journal_remove_journal_head(bh);
259 /* Once for our safety reference, once for
260 * journal_remove_journal_head() */
261 put_bh(bh);
262 put_bh(bh);
263 }
264
265 if (lock_need_resched(&journal->j_list_lock)) {
266 spin_unlock(&journal->j_list_lock);
267 goto write_out_data;
268 }
269 }
270 spin_unlock(&journal->j_list_lock);
271 journal_do_submit_data(wbuf, bufs);
272}
273
163/* 274/*
164 * journal_commit_transaction 275 * journal_commit_transaction
165 * 276 *
@@ -313,80 +424,13 @@ void journal_commit_transaction(journal_t *journal)
313 * Now start flushing things to disk, in the order they appear 424 * Now start flushing things to disk, in the order they appear
314 * on the transaction lists. Data blocks go first. 425 * on the transaction lists. Data blocks go first.
315 */ 426 */
316
317 err = 0; 427 err = 0;
318 /* 428 journal_submit_data_buffers(journal, commit_transaction);
319 * Whenever we unlock the journal and sleep, things can get added
320 * onto ->t_sync_datalist, so we have to keep looping back to
321 * write_out_data until we *know* that the list is empty.
322 */
323 bufs = 0;
324 /*
325 * Cleanup any flushed data buffers from the data list. Even in
326 * abort mode, we want to flush this out as soon as possible.
327 */
328write_out_data:
329 cond_resched();
330 spin_lock(&journal->j_list_lock);
331
332 while (commit_transaction->t_sync_datalist) {
333 struct buffer_head *bh;
334
335 jh = commit_transaction->t_sync_datalist;
336 commit_transaction->t_sync_datalist = jh->b_tnext;
337 bh = jh2bh(jh);
338 if (buffer_locked(bh)) {
339 BUFFER_TRACE(bh, "locked");
340 if (!inverted_lock(journal, bh))
341 goto write_out_data;
342 __journal_temp_unlink_buffer(jh);
343 __journal_file_buffer(jh, commit_transaction,
344 BJ_Locked);
345 jbd_unlock_bh_state(bh);
346 if (lock_need_resched(&journal->j_list_lock)) {
347 spin_unlock(&journal->j_list_lock);
348 goto write_out_data;
349 }
350 } else {
351 if (buffer_dirty(bh)) {
352 BUFFER_TRACE(bh, "start journal writeout");
353 get_bh(bh);
354 wbuf[bufs++] = bh;
355 if (bufs == journal->j_wbufsize) {
356 jbd_debug(2, "submit %d writes\n",
357 bufs);
358 spin_unlock(&journal->j_list_lock);
359 ll_rw_block(SWRITE, bufs, wbuf);
360 journal_brelse_array(wbuf, bufs);
361 bufs = 0;
362 goto write_out_data;
363 }
364 } else {
365 BUFFER_TRACE(bh, "writeout complete: unfile");
366 if (!inverted_lock(journal, bh))
367 goto write_out_data;
368 __journal_unfile_buffer(jh);
369 jbd_unlock_bh_state(bh);
370 journal_remove_journal_head(bh);
371 put_bh(bh);
372 if (lock_need_resched(&journal->j_list_lock)) {
373 spin_unlock(&journal->j_list_lock);
374 goto write_out_data;
375 }
376 }
377 }
378 }
379
380 if (bufs) {
381 spin_unlock(&journal->j_list_lock);
382 ll_rw_block(SWRITE, bufs, wbuf);
383 journal_brelse_array(wbuf, bufs);
384 spin_lock(&journal->j_list_lock);
385 }
386 429
387 /* 430 /*
388 * Wait for all previously submitted IO to complete. 431 * Wait for all previously submitted IO to complete.
389 */ 432 */
433 spin_lock(&journal->j_list_lock);
390 while (commit_transaction->t_locked_list) { 434 while (commit_transaction->t_locked_list) {
391 struct buffer_head *bh; 435 struct buffer_head *bh;
392 436
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index f66724ce443a..7af6099c911c 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -181,7 +181,7 @@ loop:
181 transaction->t_expires)) 181 transaction->t_expires))
182 should_sleep = 0; 182 should_sleep = 0;
183 if (journal->j_flags & JFS_UNMOUNT) 183 if (journal->j_flags & JFS_UNMOUNT)
184 should_sleep = 0; 184 should_sleep = 0;
185 if (should_sleep) { 185 if (should_sleep) {
186 spin_unlock(&journal->j_state_lock); 186 spin_unlock(&journal->j_state_lock);
187 schedule(); 187 schedule();
@@ -271,7 +271,7 @@ static void journal_kill_thread(journal_t *journal)
271int journal_write_metadata_buffer(transaction_t *transaction, 271int journal_write_metadata_buffer(transaction_t *transaction,
272 struct journal_head *jh_in, 272 struct journal_head *jh_in,
273 struct journal_head **jh_out, 273 struct journal_head **jh_out,
274 int blocknr) 274 unsigned long blocknr)
275{ 275{
276 int need_copy_out = 0; 276 int need_copy_out = 0;
277 int done_copy_out = 0; 277 int done_copy_out = 0;
@@ -578,7 +578,7 @@ int journal_next_log_block(journal_t *journal, unsigned long *retp)
578 * this is a no-op. If needed, we can use j_blk_offset - everything is 578 * this is a no-op. If needed, we can use j_blk_offset - everything is
579 * ready. 579 * ready.
580 */ 580 */
581int journal_bmap(journal_t *journal, unsigned long blocknr, 581int journal_bmap(journal_t *journal, unsigned long blocknr,
582 unsigned long *retp) 582 unsigned long *retp)
583{ 583{
584 int err = 0; 584 int err = 0;
@@ -696,13 +696,13 @@ fail:
696 * @bdev: Block device on which to create the journal 696 * @bdev: Block device on which to create the journal
697 * @fs_dev: Device which hold journalled filesystem for this journal. 697 * @fs_dev: Device which hold journalled filesystem for this journal.
698 * @start: Block nr Start of journal. 698 * @start: Block nr Start of journal.
699 * @len: Lenght of the journal in blocks. 699 * @len: Length of the journal in blocks.
700 * @blocksize: blocksize of journalling device 700 * @blocksize: blocksize of journalling device
701 * @returns: a newly created journal_t * 701 * @returns: a newly created journal_t *
702 * 702 *
703 * journal_init_dev creates a journal which maps a fixed contiguous 703 * journal_init_dev creates a journal which maps a fixed contiguous
704 * range of blocks on an arbitrary block device. 704 * range of blocks on an arbitrary block device.
705 * 705 *
706 */ 706 */
707journal_t * journal_init_dev(struct block_device *bdev, 707journal_t * journal_init_dev(struct block_device *bdev,
708 struct block_device *fs_dev, 708 struct block_device *fs_dev,
@@ -715,18 +715,8 @@ journal_t * journal_init_dev(struct block_device *bdev,
715 if (!journal) 715 if (!journal)
716 return NULL; 716 return NULL;
717 717
718 journal->j_dev = bdev;
719 journal->j_fs_dev = fs_dev;
720 journal->j_blk_offset = start;
721 journal->j_maxlen = len;
722 journal->j_blocksize = blocksize;
723
724 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
725 J_ASSERT(bh != NULL);
726 journal->j_sb_buffer = bh;
727 journal->j_superblock = (journal_superblock_t *)bh->b_data;
728
729 /* journal descriptor can store up to n blocks -bzzz */ 718 /* journal descriptor can store up to n blocks -bzzz */
719 journal->j_blocksize = blocksize;
730 n = journal->j_blocksize / sizeof(journal_block_tag_t); 720 n = journal->j_blocksize / sizeof(journal_block_tag_t);
731 journal->j_wbufsize = n; 721 journal->j_wbufsize = n;
732 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); 722 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
@@ -736,14 +726,23 @@ journal_t * journal_init_dev(struct block_device *bdev,
736 kfree(journal); 726 kfree(journal);
737 journal = NULL; 727 journal = NULL;
738 } 728 }
729 journal->j_dev = bdev;
730 journal->j_fs_dev = fs_dev;
731 journal->j_blk_offset = start;
732 journal->j_maxlen = len;
733
734 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
735 J_ASSERT(bh != NULL);
736 journal->j_sb_buffer = bh;
737 journal->j_superblock = (journal_superblock_t *)bh->b_data;
739 738
740 return journal; 739 return journal;
741} 740}
742 741
743/** 742/**
744 * journal_t * journal_init_inode () - creates a journal which maps to a inode. 743 * journal_t * journal_init_inode () - creates a journal which maps to a inode.
745 * @inode: An inode to create the journal in 744 * @inode: An inode to create the journal in
746 * 745 *
747 * journal_init_inode creates a journal which maps an on-disk inode as 746 * journal_init_inode creates a journal which maps an on-disk inode as
748 * the journal. The inode must exist already, must support bmap() and 747 * the journal. The inode must exist already, must support bmap() and
749 * must have all data blocks preallocated. 748 * must have all data blocks preallocated.
@@ -763,7 +762,7 @@ journal_t * journal_init_inode (struct inode *inode)
763 journal->j_inode = inode; 762 journal->j_inode = inode;
764 jbd_debug(1, 763 jbd_debug(1,
765 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", 764 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
766 journal, inode->i_sb->s_id, inode->i_ino, 765 journal, inode->i_sb->s_id, inode->i_ino,
767 (long long) inode->i_size, 766 (long long) inode->i_size,
768 inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); 767 inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
769 768
@@ -798,10 +797,10 @@ journal_t * journal_init_inode (struct inode *inode)
798 return journal; 797 return journal;
799} 798}
800 799
801/* 800/*
802 * If the journal init or create aborts, we need to mark the journal 801 * If the journal init or create aborts, we need to mark the journal
803 * superblock as being NULL to prevent the journal destroy from writing 802 * superblock as being NULL to prevent the journal destroy from writing
804 * back a bogus superblock. 803 * back a bogus superblock.
805 */ 804 */
806static void journal_fail_superblock (journal_t *journal) 805static void journal_fail_superblock (journal_t *journal)
807{ 806{
@@ -820,7 +819,7 @@ static void journal_fail_superblock (journal_t *journal)
820static int journal_reset(journal_t *journal) 819static int journal_reset(journal_t *journal)
821{ 820{
822 journal_superblock_t *sb = journal->j_superblock; 821 journal_superblock_t *sb = journal->j_superblock;
823 unsigned int first, last; 822 unsigned long first, last;
824 823
825 first = be32_to_cpu(sb->s_first); 824 first = be32_to_cpu(sb->s_first);
826 last = be32_to_cpu(sb->s_maxlen); 825 last = be32_to_cpu(sb->s_maxlen);
@@ -844,13 +843,13 @@ static int journal_reset(journal_t *journal)
844 return 0; 843 return 0;
845} 844}
846 845
847/** 846/**
848 * int journal_create() - Initialise the new journal file 847 * int journal_create() - Initialise the new journal file
849 * @journal: Journal to create. This structure must have been initialised 848 * @journal: Journal to create. This structure must have been initialised
850 * 849 *
851 * Given a journal_t structure which tells us which disk blocks we can 850 * Given a journal_t structure which tells us which disk blocks we can
852 * use, create a new journal superblock and initialise all of the 851 * use, create a new journal superblock and initialise all of the
853 * journal fields from scratch. 852 * journal fields from scratch.
854 **/ 853 **/
855int journal_create(journal_t *journal) 854int journal_create(journal_t *journal)
856{ 855{
@@ -915,7 +914,7 @@ int journal_create(journal_t *journal)
915 return journal_reset(journal); 914 return journal_reset(journal);
916} 915}
917 916
918/** 917/**
919 * void journal_update_superblock() - Update journal sb on disk. 918 * void journal_update_superblock() - Update journal sb on disk.
920 * @journal: The journal to update. 919 * @journal: The journal to update.
921 * @wait: Set to '0' if you don't want to wait for IO completion. 920 * @wait: Set to '0' if you don't want to wait for IO completion.
@@ -939,7 +938,7 @@ void journal_update_superblock(journal_t *journal, int wait)
939 journal->j_transaction_sequence) { 938 journal->j_transaction_sequence) {
940 jbd_debug(1,"JBD: Skipping superblock update on recovered sb " 939 jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
941 "(start %ld, seq %d, errno %d)\n", 940 "(start %ld, seq %d, errno %d)\n",
942 journal->j_tail, journal->j_tail_sequence, 941 journal->j_tail, journal->j_tail_sequence,
943 journal->j_errno); 942 journal->j_errno);
944 goto out; 943 goto out;
945 } 944 }
@@ -1062,7 +1061,7 @@ static int load_superblock(journal_t *journal)
1062/** 1061/**
1063 * int journal_load() - Read journal from disk. 1062 * int journal_load() - Read journal from disk.
1064 * @journal: Journal to act on. 1063 * @journal: Journal to act on.
1065 * 1064 *
1066 * Given a journal_t structure which tells us which disk blocks contain 1065 * Given a journal_t structure which tells us which disk blocks contain
1067 * a journal, read the journal from disk to initialise the in-memory 1066 * a journal, read the journal from disk to initialise the in-memory
1068 * structures. 1067 * structures.
@@ -1094,7 +1093,7 @@ int journal_load(journal_t *journal)
1094 /* 1093 /*
1095 * Create a slab for this blocksize 1094 * Create a slab for this blocksize
1096 */ 1095 */
1097 err = journal_create_jbd_slab(cpu_to_be32(sb->s_blocksize)); 1096 err = journal_create_jbd_slab(be32_to_cpu(sb->s_blocksize));
1098 if (err) 1097 if (err)
1099 return err; 1098 return err;
1100 1099
@@ -1172,9 +1171,9 @@ void journal_destroy(journal_t *journal)
1172 * @compat: bitmask of compatible features 1171 * @compat: bitmask of compatible features
1173 * @ro: bitmask of features that force read-only mount 1172 * @ro: bitmask of features that force read-only mount
1174 * @incompat: bitmask of incompatible features 1173 * @incompat: bitmask of incompatible features
1175 * 1174 *
1176 * Check whether the journal uses all of a given set of 1175 * Check whether the journal uses all of a given set of
1177 * features. Return true (non-zero) if it does. 1176 * features. Return true (non-zero) if it does.
1178 **/ 1177 **/
1179 1178
1180int journal_check_used_features (journal_t *journal, unsigned long compat, 1179int journal_check_used_features (journal_t *journal, unsigned long compat,
@@ -1203,7 +1202,7 @@ int journal_check_used_features (journal_t *journal, unsigned long compat,
1203 * @compat: bitmask of compatible features 1202 * @compat: bitmask of compatible features
1204 * @ro: bitmask of features that force read-only mount 1203 * @ro: bitmask of features that force read-only mount
1205 * @incompat: bitmask of incompatible features 1204 * @incompat: bitmask of incompatible features
1206 * 1205 *
1207 * Check whether the journaling code supports the use of 1206 * Check whether the journaling code supports the use of
1208 * all of a given set of features on this journal. Return true 1207 * all of a given set of features on this journal. Return true
1209 * (non-zero) if it can. */ 1208 * (non-zero) if it can. */
@@ -1241,7 +1240,7 @@ int journal_check_available_features (journal_t *journal, unsigned long compat,
1241 * @incompat: bitmask of incompatible features 1240 * @incompat: bitmask of incompatible features
1242 * 1241 *
1243 * Mark a given journal feature as present on the 1242 * Mark a given journal feature as present on the
1244 * superblock. Returns true if the requested features could be set. 1243 * superblock. Returns true if the requested features could be set.
1245 * 1244 *
1246 */ 1245 */
1247 1246
@@ -1327,7 +1326,7 @@ static int journal_convert_superblock_v1(journal_t *journal,
1327/** 1326/**
1328 * int journal_flush () - Flush journal 1327 * int journal_flush () - Flush journal
1329 * @journal: Journal to act on. 1328 * @journal: Journal to act on.
1330 * 1329 *
1331 * Flush all data for a given journal to disk and empty the journal. 1330 * Flush all data for a given journal to disk and empty the journal.
1332 * Filesystems can use this when remounting readonly to ensure that 1331 * Filesystems can use this when remounting readonly to ensure that
1333 * recovery does not need to happen on remount. 1332 * recovery does not need to happen on remount.
@@ -1394,7 +1393,7 @@ int journal_flush(journal_t *journal)
1394 * int journal_wipe() - Wipe journal contents 1393 * int journal_wipe() - Wipe journal contents
1395 * @journal: Journal to act on. 1394 * @journal: Journal to act on.
1396 * @write: flag (see below) 1395 * @write: flag (see below)
1397 * 1396 *
1398 * Wipe out all of the contents of a journal, safely. This will produce 1397 * Wipe out all of the contents of a journal, safely. This will produce
1399 * a warning if the journal contains any valid recovery information. 1398 * a warning if the journal contains any valid recovery information.
1400 * Must be called between journal_init_*() and journal_load(). 1399 * Must be called between journal_init_*() and journal_load().
@@ -1449,7 +1448,7 @@ static const char *journal_dev_name(journal_t *journal, char *buffer)
1449 1448
1450/* 1449/*
1451 * Journal abort has very specific semantics, which we describe 1450 * Journal abort has very specific semantics, which we describe
1452 * for journal abort. 1451 * for journal abort.
1453 * 1452 *
1454 * Two internal function, which provide abort to te jbd layer 1453 * Two internal function, which provide abort to te jbd layer
1455 * itself are here. 1454 * itself are here.
@@ -1504,7 +1503,7 @@ static void __journal_abort_soft (journal_t *journal, int errno)
1504 * Perform a complete, immediate shutdown of the ENTIRE 1503 * Perform a complete, immediate shutdown of the ENTIRE
1505 * journal (not of a single transaction). This operation cannot be 1504 * journal (not of a single transaction). This operation cannot be
1506 * undone without closing and reopening the journal. 1505 * undone without closing and reopening the journal.
1507 * 1506 *
1508 * The journal_abort function is intended to support higher level error 1507 * The journal_abort function is intended to support higher level error
1509 * recovery mechanisms such as the ext2/ext3 remount-readonly error 1508 * recovery mechanisms such as the ext2/ext3 remount-readonly error
1510 * mode. 1509 * mode.
@@ -1538,7 +1537,7 @@ static void __journal_abort_soft (journal_t *journal, int errno)
1538 * supply an errno; a null errno implies that absolutely no further 1537 * supply an errno; a null errno implies that absolutely no further
1539 * writes are done to the journal (unless there are any already in 1538 * writes are done to the journal (unless there are any already in
1540 * progress). 1539 * progress).
1541 * 1540 *
1542 */ 1541 */
1543 1542
1544void journal_abort(journal_t *journal, int errno) 1543void journal_abort(journal_t *journal, int errno)
@@ -1546,7 +1545,7 @@ void journal_abort(journal_t *journal, int errno)
1546 __journal_abort_soft(journal, errno); 1545 __journal_abort_soft(journal, errno);
1547} 1546}
1548 1547
1549/** 1548/**
1550 * int journal_errno () - returns the journal's error state. 1549 * int journal_errno () - returns the journal's error state.
1551 * @journal: journal to examine. 1550 * @journal: journal to examine.
1552 * 1551 *
@@ -1570,7 +1569,7 @@ int journal_errno(journal_t *journal)
1570 return err; 1569 return err;
1571} 1570}
1572 1571
1573/** 1572/**
1574 * int journal_clear_err () - clears the journal's error state 1573 * int journal_clear_err () - clears the journal's error state
1575 * @journal: journal to act on. 1574 * @journal: journal to act on.
1576 * 1575 *
@@ -1590,7 +1589,7 @@ int journal_clear_err(journal_t *journal)
1590 return err; 1589 return err;
1591} 1590}
1592 1591
1593/** 1592/**
1594 * void journal_ack_err() - Ack journal err. 1593 * void journal_ack_err() - Ack journal err.
1595 * @journal: journal to act on. 1594 * @journal: journal to act on.
1596 * 1595 *
@@ -1612,7 +1611,7 @@ int journal_blocks_per_page(struct inode *inode)
1612 1611
1613/* 1612/*
1614 * Simple support for retrying memory allocations. Introduced to help to 1613 * Simple support for retrying memory allocations. Introduced to help to
1615 * debug different VM deadlock avoidance strategies. 1614 * debug different VM deadlock avoidance strategies.
1616 */ 1615 */
1617void * __jbd_kmalloc (const char *where, size_t size, gfp_t flags, int retry) 1616void * __jbd_kmalloc (const char *where, size_t size, gfp_t flags, int retry)
1618{ 1617{
@@ -2047,13 +2046,7 @@ static int __init journal_init(void)
2047{ 2046{
2048 int ret; 2047 int ret;
2049 2048
2050/* Static check for data structure consistency. There's no code 2049 BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024);
2051 * invoked --- we'll just get a linker failure if things aren't right.
2052 */
2053 extern void journal_bad_superblock_size(void);
2054 if (sizeof(struct journal_superblock_s) != 1024)
2055 journal_bad_superblock_size();
2056
2057 2050
2058 ret = journal_init_caches(); 2051 ret = journal_init_caches();
2059 if (ret != 0) 2052 if (ret != 0)
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index de5bafb4e853..11563fe2a52b 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * linux/fs/recovery.c 2 * linux/fs/recovery.c
3 * 3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
5 * 5 *
6 * Copyright 1999-2000 Red Hat Software --- All Rights Reserved 6 * Copyright 1999-2000 Red Hat Software --- All Rights Reserved
@@ -10,7 +10,7 @@
10 * option, any later version, incorporated herein by reference. 10 * option, any later version, incorporated herein by reference.
11 * 11 *
12 * Journal recovery routines for the generic filesystem journaling code; 12 * Journal recovery routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system. 13 * part of the ext2fs journaling system.
14 */ 14 */
15 15
16#ifndef __KERNEL__ 16#ifndef __KERNEL__
@@ -25,9 +25,9 @@
25 25
26/* 26/*
27 * Maintain information about the progress of the recovery job, so that 27 * Maintain information about the progress of the recovery job, so that
28 * the different passes can carry information between them. 28 * the different passes can carry information between them.
29 */ 29 */
30struct recovery_info 30struct recovery_info
31{ 31{
32 tid_t start_transaction; 32 tid_t start_transaction;
33 tid_t end_transaction; 33 tid_t end_transaction;
@@ -46,7 +46,7 @@ static int scan_revoke_records(journal_t *, struct buffer_head *,
46#ifdef __KERNEL__ 46#ifdef __KERNEL__
47 47
48/* Release readahead buffers after use */ 48/* Release readahead buffers after use */
49void journal_brelse_array(struct buffer_head *b[], int n) 49static void journal_brelse_array(struct buffer_head *b[], int n)
50{ 50{
51 while (--n >= 0) 51 while (--n >= 0)
52 brelse (b[n]); 52 brelse (b[n]);
@@ -116,7 +116,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
116 err = 0; 116 err = 0;
117 117
118failed: 118failed:
119 if (nbufs) 119 if (nbufs)
120 journal_brelse_array(bufs, nbufs); 120 journal_brelse_array(bufs, nbufs);
121 return err; 121 return err;
122} 122}
@@ -128,7 +128,7 @@ failed:
128 * Read a block from the journal 128 * Read a block from the journal
129 */ 129 */
130 130
131static int jread(struct buffer_head **bhp, journal_t *journal, 131static int jread(struct buffer_head **bhp, journal_t *journal,
132 unsigned int offset) 132 unsigned int offset)
133{ 133{
134 int err; 134 int err;
@@ -212,14 +212,14 @@ do { \
212/** 212/**
213 * journal_recover - recovers a on-disk journal 213 * journal_recover - recovers a on-disk journal
214 * @journal: the journal to recover 214 * @journal: the journal to recover
215 * 215 *
216 * The primary function for recovering the log contents when mounting a 216 * The primary function for recovering the log contents when mounting a
217 * journaled device. 217 * journaled device.
218 * 218 *
219 * Recovery is done in three passes. In the first pass, we look for the 219 * Recovery is done in three passes. In the first pass, we look for the
220 * end of the log. In the second, we assemble the list of revoke 220 * end of the log. In the second, we assemble the list of revoke
221 * blocks. In the third and final pass, we replay any un-revoked blocks 221 * blocks. In the third and final pass, we replay any un-revoked blocks
222 * in the log. 222 * in the log.
223 */ 223 */
224int journal_recover(journal_t *journal) 224int journal_recover(journal_t *journal)
225{ 225{
@@ -231,10 +231,10 @@ int journal_recover(journal_t *journal)
231 memset(&info, 0, sizeof(info)); 231 memset(&info, 0, sizeof(info));
232 sb = journal->j_superblock; 232 sb = journal->j_superblock;
233 233
234 /* 234 /*
235 * The journal superblock's s_start field (the current log head) 235 * The journal superblock's s_start field (the current log head)
236 * is always zero if, and only if, the journal was cleanly 236 * is always zero if, and only if, the journal was cleanly
237 * unmounted. 237 * unmounted.
238 */ 238 */
239 239
240 if (!sb->s_start) { 240 if (!sb->s_start) {
@@ -253,7 +253,7 @@ int journal_recover(journal_t *journal)
253 jbd_debug(0, "JBD: recovery, exit status %d, " 253 jbd_debug(0, "JBD: recovery, exit status %d, "
254 "recovered transactions %u to %u\n", 254 "recovered transactions %u to %u\n",
255 err, info.start_transaction, info.end_transaction); 255 err, info.start_transaction, info.end_transaction);
256 jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n", 256 jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n",
257 info.nr_replays, info.nr_revoke_hits, info.nr_revokes); 257 info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
258 258
259 /* Restart the log at the next transaction ID, thus invalidating 259 /* Restart the log at the next transaction ID, thus invalidating
@@ -268,15 +268,15 @@ int journal_recover(journal_t *journal)
268/** 268/**
269 * journal_skip_recovery - Start journal and wipe exiting records 269 * journal_skip_recovery - Start journal and wipe exiting records
270 * @journal: journal to startup 270 * @journal: journal to startup
271 * 271 *
272 * Locate any valid recovery information from the journal and set up the 272 * Locate any valid recovery information from the journal and set up the
273 * journal structures in memory to ignore it (presumably because the 273 * journal structures in memory to ignore it (presumably because the
274 * caller has evidence that it is out of date). 274 * caller has evidence that it is out of date).
275 * This function does'nt appear to be exorted.. 275 * This function does'nt appear to be exorted..
276 * 276 *
277 * We perform one pass over the journal to allow us to tell the user how 277 * We perform one pass over the journal to allow us to tell the user how
278 * much recovery information is being erased, and to let us initialise 278 * much recovery information is being erased, and to let us initialise
279 * the journal transaction sequence numbers to the next unused ID. 279 * the journal transaction sequence numbers to the next unused ID.
280 */ 280 */
281int journal_skip_recovery(journal_t *journal) 281int journal_skip_recovery(journal_t *journal)
282{ 282{
@@ -297,7 +297,7 @@ int journal_skip_recovery(journal_t *journal)
297#ifdef CONFIG_JBD_DEBUG 297#ifdef CONFIG_JBD_DEBUG
298 int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence); 298 int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence);
299#endif 299#endif
300 jbd_debug(0, 300 jbd_debug(0,
301 "JBD: ignoring %d transaction%s from the journal.\n", 301 "JBD: ignoring %d transaction%s from the journal.\n",
302 dropped, (dropped == 1) ? "" : "s"); 302 dropped, (dropped == 1) ? "" : "s");
303 journal->j_transaction_sequence = ++info.end_transaction; 303 journal->j_transaction_sequence = ++info.end_transaction;
@@ -314,7 +314,7 @@ static int do_one_pass(journal_t *journal,
314 unsigned long next_log_block; 314 unsigned long next_log_block;
315 int err, success = 0; 315 int err, success = 0;
316 journal_superblock_t * sb; 316 journal_superblock_t * sb;
317 journal_header_t * tmp; 317 journal_header_t * tmp;
318 struct buffer_head * bh; 318 struct buffer_head * bh;
319 unsigned int sequence; 319 unsigned int sequence;
320 int blocktype; 320 int blocktype;
@@ -324,10 +324,10 @@ static int do_one_pass(journal_t *journal,
324 MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t)) 324 MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
325 / sizeof(journal_block_tag_t)); 325 / sizeof(journal_block_tag_t));
326 326
327 /* 327 /*
328 * First thing is to establish what we expect to find in the log 328 * First thing is to establish what we expect to find in the log
329 * (in terms of transaction IDs), and where (in terms of log 329 * (in terms of transaction IDs), and where (in terms of log
330 * block offsets): query the superblock. 330 * block offsets): query the superblock.
331 */ 331 */
332 332
333 sb = journal->j_superblock; 333 sb = journal->j_superblock;
@@ -344,7 +344,7 @@ static int do_one_pass(journal_t *journal,
344 * Now we walk through the log, transaction by transaction, 344 * Now we walk through the log, transaction by transaction,
345 * making sure that each transaction has a commit block in the 345 * making sure that each transaction has a commit block in the
346 * expected place. Each complete transaction gets replayed back 346 * expected place. Each complete transaction gets replayed back
347 * into the main filesystem. 347 * into the main filesystem.
348 */ 348 */
349 349
350 while (1) { 350 while (1) {
@@ -379,8 +379,8 @@ static int do_one_pass(journal_t *journal,
379 next_log_block++; 379 next_log_block++;
380 wrap(journal, next_log_block); 380 wrap(journal, next_log_block);
381 381
382 /* What kind of buffer is it? 382 /* What kind of buffer is it?
383 * 383 *
384 * If it is a descriptor block, check that it has the 384 * If it is a descriptor block, check that it has the
385 * expected sequence number. Otherwise, we're all done 385 * expected sequence number. Otherwise, we're all done
386 * here. */ 386 * here. */
@@ -394,7 +394,7 @@ static int do_one_pass(journal_t *journal,
394 394
395 blocktype = be32_to_cpu(tmp->h_blocktype); 395 blocktype = be32_to_cpu(tmp->h_blocktype);
396 sequence = be32_to_cpu(tmp->h_sequence); 396 sequence = be32_to_cpu(tmp->h_sequence);
397 jbd_debug(3, "Found magic %d, sequence %d\n", 397 jbd_debug(3, "Found magic %d, sequence %d\n",
398 blocktype, sequence); 398 blocktype, sequence);
399 399
400 if (sequence != next_commit_ID) { 400 if (sequence != next_commit_ID) {
@@ -438,7 +438,7 @@ static int do_one_pass(journal_t *journal,
438 /* Recover what we can, but 438 /* Recover what we can, but
439 * report failure at the end. */ 439 * report failure at the end. */
440 success = err; 440 success = err;
441 printk (KERN_ERR 441 printk (KERN_ERR
442 "JBD: IO error %d recovering " 442 "JBD: IO error %d recovering "
443 "block %ld in log\n", 443 "block %ld in log\n",
444 err, io_block); 444 err, io_block);
@@ -452,7 +452,7 @@ static int do_one_pass(journal_t *journal,
452 * revoked, then we're all done 452 * revoked, then we're all done
453 * here. */ 453 * here. */
454 if (journal_test_revoke 454 if (journal_test_revoke
455 (journal, blocknr, 455 (journal, blocknr,
456 next_commit_ID)) { 456 next_commit_ID)) {
457 brelse(obh); 457 brelse(obh);
458 ++info->nr_revoke_hits; 458 ++info->nr_revoke_hits;
@@ -465,7 +465,7 @@ static int do_one_pass(journal_t *journal,
465 blocknr, 465 blocknr,
466 journal->j_blocksize); 466 journal->j_blocksize);
467 if (nbh == NULL) { 467 if (nbh == NULL) {
468 printk(KERN_ERR 468 printk(KERN_ERR
469 "JBD: Out of memory " 469 "JBD: Out of memory "
470 "during recovery.\n"); 470 "during recovery.\n");
471 err = -ENOMEM; 471 err = -ENOMEM;
@@ -537,7 +537,7 @@ static int do_one_pass(journal_t *journal,
537 } 537 }
538 538
539 done: 539 done:
540 /* 540 /*
541 * We broke out of the log scan loop: either we came to the 541 * We broke out of the log scan loop: either we came to the
542 * known end of the log or we found an unexpected block in the 542 * known end of the log or we found an unexpected block in the
543 * log. If the latter happened, then we know that the "current" 543 * log. If the latter happened, then we know that the "current"
@@ -567,7 +567,7 @@ static int do_one_pass(journal_t *journal,
567 567
568/* Scan a revoke record, marking all blocks mentioned as revoked. */ 568/* Scan a revoke record, marking all blocks mentioned as revoked. */
569 569
570static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, 570static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
571 tid_t sequence, struct recovery_info *info) 571 tid_t sequence, struct recovery_info *info)
572{ 572{
573 journal_revoke_header_t *header; 573 journal_revoke_header_t *header;
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index a56144183462..c532429d8d9b 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * linux/fs/revoke.c 2 * linux/fs/revoke.c
3 * 3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 2000 4 * Written by Stephen C. Tweedie <sct@redhat.com>, 2000
5 * 5 *
6 * Copyright 2000 Red Hat corp --- All Rights Reserved 6 * Copyright 2000 Red Hat corp --- All Rights Reserved
@@ -15,10 +15,10 @@
15 * Revoke is the mechanism used to prevent old log records for deleted 15 * Revoke is the mechanism used to prevent old log records for deleted
16 * metadata from being replayed on top of newer data using the same 16 * metadata from being replayed on top of newer data using the same
17 * blocks. The revoke mechanism is used in two separate places: 17 * blocks. The revoke mechanism is used in two separate places:
18 * 18 *
19 * + Commit: during commit we write the entire list of the current 19 * + Commit: during commit we write the entire list of the current
20 * transaction's revoked blocks to the journal 20 * transaction's revoked blocks to the journal
21 * 21 *
22 * + Recovery: during recovery we record the transaction ID of all 22 * + Recovery: during recovery we record the transaction ID of all
23 * revoked blocks. If there are multiple revoke records in the log 23 * revoked blocks. If there are multiple revoke records in the log
24 * for a single block, only the last one counts, and if there is a log 24 * for a single block, only the last one counts, and if there is a log
@@ -29,7 +29,7 @@
29 * single transaction: 29 * single transaction:
30 * 30 *
31 * Block is revoked and then journaled: 31 * Block is revoked and then journaled:
32 * The desired end result is the journaling of the new block, so we 32 * The desired end result is the journaling of the new block, so we
33 * cancel the revoke before the transaction commits. 33 * cancel the revoke before the transaction commits.
34 * 34 *
35 * Block is journaled and then revoked: 35 * Block is journaled and then revoked:
@@ -41,7 +41,7 @@
41 * transaction must have happened after the block was journaled and so 41 * transaction must have happened after the block was journaled and so
42 * the revoke must take precedence. 42 * the revoke must take precedence.
43 * 43 *
44 * Block is revoked and then written as data: 44 * Block is revoked and then written as data:
45 * The data write is allowed to succeed, but the revoke is _not_ 45 * The data write is allowed to succeed, but the revoke is _not_
46 * cancelled. We still need to prevent old log records from 46 * cancelled. We still need to prevent old log records from
47 * overwriting the new data. We don't even need to clear the revoke 47 * overwriting the new data. We don't even need to clear the revoke
@@ -54,7 +54,7 @@
54 * buffer has not been revoked, and cancel_revoke 54 * buffer has not been revoked, and cancel_revoke
55 * need do nothing. 55 * need do nothing.
56 * RevokeValid set, Revoked set: 56 * RevokeValid set, Revoked set:
57 * buffer has been revoked. 57 * buffer has been revoked.
58 */ 58 */
59 59
60#ifndef __KERNEL__ 60#ifndef __KERNEL__
@@ -77,7 +77,7 @@ static kmem_cache_t *revoke_table_cache;
77 journal replay, this involves recording the transaction ID of the 77 journal replay, this involves recording the transaction ID of the
78 last transaction to revoke this block. */ 78 last transaction to revoke this block. */
79 79
80struct jbd_revoke_record_s 80struct jbd_revoke_record_s
81{ 81{
82 struct list_head hash; 82 struct list_head hash;
83 tid_t sequence; /* Used for recovery only */ 83 tid_t sequence; /* Used for recovery only */
@@ -90,8 +90,8 @@ struct jbd_revoke_table_s
90{ 90{
91 /* It is conceivable that we might want a larger hash table 91 /* It is conceivable that we might want a larger hash table
92 * for recovery. Must be a power of two. */ 92 * for recovery. Must be a power of two. */
93 int hash_size; 93 int hash_size;
94 int hash_shift; 94 int hash_shift;
95 struct list_head *hash_table; 95 struct list_head *hash_table;
96}; 96};
97 97
@@ -301,22 +301,22 @@ void journal_destroy_revoke(journal_t *journal)
301 301
302#ifdef __KERNEL__ 302#ifdef __KERNEL__
303 303
304/* 304/*
305 * journal_revoke: revoke a given buffer_head from the journal. This 305 * journal_revoke: revoke a given buffer_head from the journal. This
306 * prevents the block from being replayed during recovery if we take a 306 * prevents the block from being replayed during recovery if we take a
307 * crash after this current transaction commits. Any subsequent 307 * crash after this current transaction commits. Any subsequent
308 * metadata writes of the buffer in this transaction cancel the 308 * metadata writes of the buffer in this transaction cancel the
309 * revoke. 309 * revoke.
310 * 310 *
311 * Note that this call may block --- it is up to the caller to make 311 * Note that this call may block --- it is up to the caller to make
312 * sure that there are no further calls to journal_write_metadata 312 * sure that there are no further calls to journal_write_metadata
313 * before the revoke is complete. In ext3, this implies calling the 313 * before the revoke is complete. In ext3, this implies calling the
314 * revoke before clearing the block bitmap when we are deleting 314 * revoke before clearing the block bitmap when we are deleting
315 * metadata. 315 * metadata.
316 * 316 *
317 * Revoke performs a journal_forget on any buffer_head passed in as a 317 * Revoke performs a journal_forget on any buffer_head passed in as a
318 * parameter, but does _not_ forget the buffer_head if the bh was only 318 * parameter, but does _not_ forget the buffer_head if the bh was only
319 * found implicitly. 319 * found implicitly.
320 * 320 *
321 * bh_in may not be a journalled buffer - it may have come off 321 * bh_in may not be a journalled buffer - it may have come off
322 * the hash tables without an attached journal_head. 322 * the hash tables without an attached journal_head.
@@ -325,7 +325,7 @@ void journal_destroy_revoke(journal_t *journal)
325 * by one. 325 * by one.
326 */ 326 */
327 327
328int journal_revoke(handle_t *handle, unsigned long blocknr, 328int journal_revoke(handle_t *handle, unsigned long blocknr,
329 struct buffer_head *bh_in) 329 struct buffer_head *bh_in)
330{ 330{
331 struct buffer_head *bh = NULL; 331 struct buffer_head *bh = NULL;
@@ -487,7 +487,7 @@ void journal_switch_revoke_table(journal_t *journal)
487 else 487 else
488 journal->j_revoke = journal->j_revoke_table[0]; 488 journal->j_revoke = journal->j_revoke_table[0];
489 489
490 for (i = 0; i < journal->j_revoke->hash_size; i++) 490 for (i = 0; i < journal->j_revoke->hash_size; i++)
491 INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]); 491 INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]);
492} 492}
493 493
@@ -498,7 +498,7 @@ void journal_switch_revoke_table(journal_t *journal)
498 * Called with the journal lock held. 498 * Called with the journal lock held.
499 */ 499 */
500 500
501void journal_write_revoke_records(journal_t *journal, 501void journal_write_revoke_records(journal_t *journal,
502 transaction_t *transaction) 502 transaction_t *transaction)
503{ 503{
504 struct journal_head *descriptor; 504 struct journal_head *descriptor;
@@ -507,7 +507,7 @@ void journal_write_revoke_records(journal_t *journal,
507 struct list_head *hash_list; 507 struct list_head *hash_list;
508 int i, offset, count; 508 int i, offset, count;
509 509
510 descriptor = NULL; 510 descriptor = NULL;
511 offset = 0; 511 offset = 0;
512 count = 0; 512 count = 0;
513 513
@@ -519,10 +519,10 @@ void journal_write_revoke_records(journal_t *journal,
519 hash_list = &revoke->hash_table[i]; 519 hash_list = &revoke->hash_table[i];
520 520
521 while (!list_empty(hash_list)) { 521 while (!list_empty(hash_list)) {
522 record = (struct jbd_revoke_record_s *) 522 record = (struct jbd_revoke_record_s *)
523 hash_list->next; 523 hash_list->next;
524 write_one_revoke_record(journal, transaction, 524 write_one_revoke_record(journal, transaction,
525 &descriptor, &offset, 525 &descriptor, &offset,
526 record); 526 record);
527 count++; 527 count++;
528 list_del(&record->hash); 528 list_del(&record->hash);
@@ -534,14 +534,14 @@ void journal_write_revoke_records(journal_t *journal,
534 jbd_debug(1, "Wrote %d revoke records\n", count); 534 jbd_debug(1, "Wrote %d revoke records\n", count);
535} 535}
536 536
537/* 537/*
538 * Write out one revoke record. We need to create a new descriptor 538 * Write out one revoke record. We need to create a new descriptor
539 * block if the old one is full or if we have not already created one. 539 * block if the old one is full or if we have not already created one.
540 */ 540 */
541 541
542static void write_one_revoke_record(journal_t *journal, 542static void write_one_revoke_record(journal_t *journal,
543 transaction_t *transaction, 543 transaction_t *transaction,
544 struct journal_head **descriptorp, 544 struct journal_head **descriptorp,
545 int *offsetp, 545 int *offsetp,
546 struct jbd_revoke_record_s *record) 546 struct jbd_revoke_record_s *record)
547{ 547{
@@ -584,21 +584,21 @@ static void write_one_revoke_record(journal_t *journal,
584 *descriptorp = descriptor; 584 *descriptorp = descriptor;
585 } 585 }
586 586
587 * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) = 587 * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) =
588 cpu_to_be32(record->blocknr); 588 cpu_to_be32(record->blocknr);
589 offset += 4; 589 offset += 4;
590 *offsetp = offset; 590 *offsetp = offset;
591} 591}
592 592
593/* 593/*
594 * Flush a revoke descriptor out to the journal. If we are aborting, 594 * Flush a revoke descriptor out to the journal. If we are aborting,
595 * this is a noop; otherwise we are generating a buffer which needs to 595 * this is a noop; otherwise we are generating a buffer which needs to
596 * be waited for during commit, so it has to go onto the appropriate 596 * be waited for during commit, so it has to go onto the appropriate
597 * journal buffer list. 597 * journal buffer list.
598 */ 598 */
599 599
600static void flush_descriptor(journal_t *journal, 600static void flush_descriptor(journal_t *journal,
601 struct journal_head *descriptor, 601 struct journal_head *descriptor,
602 int offset) 602 int offset)
603{ 603{
604 journal_revoke_header_t *header; 604 journal_revoke_header_t *header;
@@ -618,7 +618,7 @@ static void flush_descriptor(journal_t *journal,
618} 618}
619#endif 619#endif
620 620
621/* 621/*
622 * Revoke support for recovery. 622 * Revoke support for recovery.
623 * 623 *
624 * Recovery needs to be able to: 624 * Recovery needs to be able to:
@@ -629,7 +629,7 @@ static void flush_descriptor(journal_t *journal,
629 * check whether a given block in a given transaction should be replayed 629 * check whether a given block in a given transaction should be replayed
630 * (ie. has not been revoked by a revoke record in that or a subsequent 630 * (ie. has not been revoked by a revoke record in that or a subsequent
631 * transaction) 631 * transaction)
632 * 632 *
633 * empty the revoke table after recovery. 633 * empty the revoke table after recovery.
634 */ 634 */
635 635
@@ -637,11 +637,11 @@ static void flush_descriptor(journal_t *journal,
637 * First, setting revoke records. We create a new revoke record for 637 * First, setting revoke records. We create a new revoke record for
638 * every block ever revoked in the log as we scan it for recovery, and 638 * every block ever revoked in the log as we scan it for recovery, and
639 * we update the existing records if we find multiple revokes for a 639 * we update the existing records if we find multiple revokes for a
640 * single block. 640 * single block.
641 */ 641 */
642 642
643int journal_set_revoke(journal_t *journal, 643int journal_set_revoke(journal_t *journal,
644 unsigned long blocknr, 644 unsigned long blocknr,
645 tid_t sequence) 645 tid_t sequence)
646{ 646{
647 struct jbd_revoke_record_s *record; 647 struct jbd_revoke_record_s *record;
@@ -653,18 +653,18 @@ int journal_set_revoke(journal_t *journal,
653 if (tid_gt(sequence, record->sequence)) 653 if (tid_gt(sequence, record->sequence))
654 record->sequence = sequence; 654 record->sequence = sequence;
655 return 0; 655 return 0;
656 } 656 }
657 return insert_revoke_hash(journal, blocknr, sequence); 657 return insert_revoke_hash(journal, blocknr, sequence);
658} 658}
659 659
660/* 660/*
661 * Test revoke records. For a given block referenced in the log, has 661 * Test revoke records. For a given block referenced in the log, has
662 * that block been revoked? A revoke record with a given transaction 662 * that block been revoked? A revoke record with a given transaction
663 * sequence number revokes all blocks in that transaction and earlier 663 * sequence number revokes all blocks in that transaction and earlier
664 * ones, but later transactions still need replayed. 664 * ones, but later transactions still need replayed.
665 */ 665 */
666 666
667int journal_test_revoke(journal_t *journal, 667int journal_test_revoke(journal_t *journal,
668 unsigned long blocknr, 668 unsigned long blocknr,
669 tid_t sequence) 669 tid_t sequence)
670{ 670{
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index f5169a96260e..e1b3c8af4d17 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * linux/fs/transaction.c 2 * linux/fs/transaction.c
3 * 3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 * 5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved 6 * Copyright 1998 Red Hat corp --- All Rights Reserved
@@ -10,7 +10,7 @@
10 * option, any later version, incorporated herein by reference. 10 * option, any later version, incorporated herein by reference.
11 * 11 *
12 * Generic filesystem transaction handling code; part of the ext2fs 12 * Generic filesystem transaction handling code; part of the ext2fs
13 * journaling system. 13 * journaling system.
14 * 14 *
15 * This file manages transactions (compound commits managed by the 15 * This file manages transactions (compound commits managed by the
16 * journaling code) and handles (individual atomic operations by the 16 * journaling code) and handles (individual atomic operations by the
@@ -74,7 +74,7 @@ get_transaction(journal_t *journal, transaction_t *transaction)
74 * start_this_handle: Given a handle, deal with any locking or stalling 74 * start_this_handle: Given a handle, deal with any locking or stalling
75 * needed to make sure that there is enough journal space for the handle 75 * needed to make sure that there is enough journal space for the handle
76 * to begin. Attach the handle to a transaction and set up the 76 * to begin. Attach the handle to a transaction and set up the
77 * transaction's buffer credits. 77 * transaction's buffer credits.
78 */ 78 */
79 79
80static int start_this_handle(journal_t *journal, handle_t *handle) 80static int start_this_handle(journal_t *journal, handle_t *handle)
@@ -117,7 +117,7 @@ repeat_locked:
117 if (is_journal_aborted(journal) || 117 if (is_journal_aborted(journal) ||
118 (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) { 118 (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) {
119 spin_unlock(&journal->j_state_lock); 119 spin_unlock(&journal->j_state_lock);
120 ret = -EROFS; 120 ret = -EROFS;
121 goto out; 121 goto out;
122 } 122 }
123 123
@@ -182,7 +182,7 @@ repeat_locked:
182 goto repeat; 182 goto repeat;
183 } 183 }
184 184
185 /* 185 /*
186 * The commit code assumes that it can get enough log space 186 * The commit code assumes that it can get enough log space
187 * without forcing a checkpoint. This is *critical* for 187 * without forcing a checkpoint. This is *critical* for
188 * correctness: a checkpoint of a buffer which is also 188 * correctness: a checkpoint of a buffer which is also
@@ -191,7 +191,7 @@ repeat_locked:
191 * 191 *
192 * We must therefore ensure the necessary space in the journal 192 * We must therefore ensure the necessary space in the journal
193 * *before* starting to dirty potentially checkpointed buffers 193 * *before* starting to dirty potentially checkpointed buffers
194 * in the new transaction. 194 * in the new transaction.
195 * 195 *
196 * The worst part is, any transaction currently committing can 196 * The worst part is, any transaction currently committing can
197 * reduce the free space arbitrarily. Be careful to account for 197 * reduce the free space arbitrarily. Be careful to account for
@@ -246,13 +246,13 @@ static handle_t *new_handle(int nblocks)
246} 246}
247 247
248/** 248/**
249 * handle_t *journal_start() - Obtain a new handle. 249 * handle_t *journal_start() - Obtain a new handle.
250 * @journal: Journal to start transaction on. 250 * @journal: Journal to start transaction on.
251 * @nblocks: number of block buffer we might modify 251 * @nblocks: number of block buffer we might modify
252 * 252 *
253 * We make sure that the transaction can guarantee at least nblocks of 253 * We make sure that the transaction can guarantee at least nblocks of
254 * modified buffers in the log. We block until the log can guarantee 254 * modified buffers in the log. We block until the log can guarantee
255 * that much space. 255 * that much space.
256 * 256 *
257 * This function is visible to journal users (like ext3fs), so is not 257 * This function is visible to journal users (like ext3fs), so is not
258 * called with the journal already locked. 258 * called with the journal already locked.
@@ -292,11 +292,11 @@ handle_t *journal_start(journal_t *journal, int nblocks)
292 * int journal_extend() - extend buffer credits. 292 * int journal_extend() - extend buffer credits.
293 * @handle: handle to 'extend' 293 * @handle: handle to 'extend'
294 * @nblocks: nr blocks to try to extend by. 294 * @nblocks: nr blocks to try to extend by.
295 * 295 *
296 * Some transactions, such as large extends and truncates, can be done 296 * Some transactions, such as large extends and truncates, can be done
297 * atomically all at once or in several stages. The operation requests 297 * atomically all at once or in several stages. The operation requests
298 * a credit for a number of buffer modications in advance, but can 298 * a credit for a number of buffer modications in advance, but can
299 * extend its credit if it needs more. 299 * extend its credit if it needs more.
300 * 300 *
301 * journal_extend tries to give the running handle more buffer credits. 301 * journal_extend tries to give the running handle more buffer credits.
302 * It does not guarantee that allocation - this is a best-effort only. 302 * It does not guarantee that allocation - this is a best-effort only.
@@ -363,7 +363,7 @@ out:
363 * int journal_restart() - restart a handle . 363 * int journal_restart() - restart a handle .
364 * @handle: handle to restart 364 * @handle: handle to restart
365 * @nblocks: nr credits requested 365 * @nblocks: nr credits requested
366 * 366 *
367 * Restart a handle for a multi-transaction filesystem 367 * Restart a handle for a multi-transaction filesystem
368 * operation. 368 * operation.
369 * 369 *
@@ -462,7 +462,7 @@ void journal_lock_updates(journal_t *journal)
462/** 462/**
463 * void journal_unlock_updates (journal_t* journal) - release barrier 463 * void journal_unlock_updates (journal_t* journal) - release barrier
464 * @journal: Journal to release the barrier on. 464 * @journal: Journal to release the barrier on.
465 * 465 *
466 * Release a transaction barrier obtained with journal_lock_updates(). 466 * Release a transaction barrier obtained with journal_lock_updates().
467 * 467 *
468 * Should be called without the journal lock held. 468 * Should be called without the journal lock held.
@@ -547,8 +547,8 @@ repeat:
547 jbd_lock_bh_state(bh); 547 jbd_lock_bh_state(bh);
548 548
549 /* We now hold the buffer lock so it is safe to query the buffer 549 /* We now hold the buffer lock so it is safe to query the buffer
550 * state. Is the buffer dirty? 550 * state. Is the buffer dirty?
551 * 551 *
552 * If so, there are two possibilities. The buffer may be 552 * If so, there are two possibilities. The buffer may be
553 * non-journaled, and undergoing a quite legitimate writeback. 553 * non-journaled, and undergoing a quite legitimate writeback.
554 * Otherwise, it is journaled, and we don't expect dirty buffers 554 * Otherwise, it is journaled, and we don't expect dirty buffers
@@ -566,7 +566,7 @@ repeat:
566 */ 566 */
567 if (jh->b_transaction) { 567 if (jh->b_transaction) {
568 J_ASSERT_JH(jh, 568 J_ASSERT_JH(jh,
569 jh->b_transaction == transaction || 569 jh->b_transaction == transaction ||
570 jh->b_transaction == 570 jh->b_transaction ==
571 journal->j_committing_transaction); 571 journal->j_committing_transaction);
572 if (jh->b_next_transaction) 572 if (jh->b_next_transaction)
@@ -580,7 +580,7 @@ repeat:
580 */ 580 */
581 JBUFFER_TRACE(jh, "Unexpected dirty buffer"); 581 JBUFFER_TRACE(jh, "Unexpected dirty buffer");
582 jbd_unexpected_dirty_buffer(jh); 582 jbd_unexpected_dirty_buffer(jh);
583 } 583 }
584 584
585 unlock_buffer(bh); 585 unlock_buffer(bh);
586 586
@@ -653,7 +653,7 @@ repeat:
653 * buffer had better remain locked during the kmalloc, 653 * buffer had better remain locked during the kmalloc,
654 * but that should be true --- we hold the journal lock 654 * but that should be true --- we hold the journal lock
655 * still and the buffer is already on the BUF_JOURNAL 655 * still and the buffer is already on the BUF_JOURNAL
656 * list so won't be flushed. 656 * list so won't be flushed.
657 * 657 *
658 * Subtle point, though: if this is a get_undo_access, 658 * Subtle point, though: if this is a get_undo_access,
659 * then we will be relying on the frozen_data to contain 659 * then we will be relying on the frozen_data to contain
@@ -765,8 +765,8 @@ int journal_get_write_access(handle_t *handle, struct buffer_head *bh)
765 * manually rather than reading off disk), then we need to keep the 765 * manually rather than reading off disk), then we need to keep the
766 * buffer_head locked until it has been completely filled with new 766 * buffer_head locked until it has been completely filled with new
767 * data. In this case, we should be able to make the assertion that 767 * data. In this case, we should be able to make the assertion that
768 * the bh is not already part of an existing transaction. 768 * the bh is not already part of an existing transaction.
769 * 769 *
770 * The buffer should already be locked by the caller by this point. 770 * The buffer should already be locked by the caller by this point.
771 * There is no lock ranking violation: it was a newly created, 771 * There is no lock ranking violation: it was a newly created,
772 * unlocked buffer beforehand. */ 772 * unlocked buffer beforehand. */
@@ -778,7 +778,7 @@ int journal_get_write_access(handle_t *handle, struct buffer_head *bh)
778 * 778 *
779 * Call this if you create a new bh. 779 * Call this if you create a new bh.
780 */ 780 */
781int journal_get_create_access(handle_t *handle, struct buffer_head *bh) 781int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
782{ 782{
783 transaction_t *transaction = handle->h_transaction; 783 transaction_t *transaction = handle->h_transaction;
784 journal_t *journal = transaction->t_journal; 784 journal_t *journal = transaction->t_journal;
@@ -847,13 +847,13 @@ out:
847 * do not reuse freed space until the deallocation has been committed, 847 * do not reuse freed space until the deallocation has been committed,
848 * since if we overwrote that space we would make the delete 848 * since if we overwrote that space we would make the delete
849 * un-rewindable in case of a crash. 849 * un-rewindable in case of a crash.
850 * 850 *
851 * To deal with that, journal_get_undo_access requests write access to a 851 * To deal with that, journal_get_undo_access requests write access to a
852 * buffer for parts of non-rewindable operations such as delete 852 * buffer for parts of non-rewindable operations such as delete
853 * operations on the bitmaps. The journaling code must keep a copy of 853 * operations on the bitmaps. The journaling code must keep a copy of
854 * the buffer's contents prior to the undo_access call until such time 854 * the buffer's contents prior to the undo_access call until such time
855 * as we know that the buffer has definitely been committed to disk. 855 * as we know that the buffer has definitely been committed to disk.
856 * 856 *
857 * We never need to know which transaction the committed data is part 857 * We never need to know which transaction the committed data is part
858 * of, buffers touched here are guaranteed to be dirtied later and so 858 * of, buffers touched here are guaranteed to be dirtied later and so
859 * will be committed to a new transaction in due course, at which point 859 * will be committed to a new transaction in due course, at which point
@@ -911,13 +911,13 @@ out:
911 return err; 911 return err;
912} 912}
913 913
914/** 914/**
915 * int journal_dirty_data() - mark a buffer as containing dirty data which 915 * int journal_dirty_data() - mark a buffer as containing dirty data which
916 * needs to be flushed before we can commit the 916 * needs to be flushed before we can commit the
917 * current transaction. 917 * current transaction.
918 * @handle: transaction 918 * @handle: transaction
919 * @bh: bufferhead to mark 919 * @bh: bufferhead to mark
920 * 920 *
921 * The buffer is placed on the transaction's data list and is marked as 921 * The buffer is placed on the transaction's data list and is marked as
922 * belonging to the transaction. 922 * belonging to the transaction.
923 * 923 *
@@ -946,15 +946,15 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
946 946
947 /* 947 /*
948 * What if the buffer is already part of a running transaction? 948 * What if the buffer is already part of a running transaction?
949 * 949 *
950 * There are two cases: 950 * There are two cases:
951 * 1) It is part of the current running transaction. Refile it, 951 * 1) It is part of the current running transaction. Refile it,
952 * just in case we have allocated it as metadata, deallocated 952 * just in case we have allocated it as metadata, deallocated
953 * it, then reallocated it as data. 953 * it, then reallocated it as data.
954 * 2) It is part of the previous, still-committing transaction. 954 * 2) It is part of the previous, still-committing transaction.
955 * If all we want to do is to guarantee that the buffer will be 955 * If all we want to do is to guarantee that the buffer will be
956 * written to disk before this new transaction commits, then 956 * written to disk before this new transaction commits, then
957 * being sure that the *previous* transaction has this same 957 * being sure that the *previous* transaction has this same
958 * property is sufficient for us! Just leave it on its old 958 * property is sufficient for us! Just leave it on its old
959 * transaction. 959 * transaction.
960 * 960 *
@@ -1076,18 +1076,18 @@ no_journal:
1076 return 0; 1076 return 0;
1077} 1077}
1078 1078
1079/** 1079/**
1080 * int journal_dirty_metadata() - mark a buffer as containing dirty metadata 1080 * int journal_dirty_metadata() - mark a buffer as containing dirty metadata
1081 * @handle: transaction to add buffer to. 1081 * @handle: transaction to add buffer to.
1082 * @bh: buffer to mark 1082 * @bh: buffer to mark
1083 * 1083 *
1084 * mark dirty metadata which needs to be journaled as part of the current 1084 * mark dirty metadata which needs to be journaled as part of the current
1085 * transaction. 1085 * transaction.
1086 * 1086 *
1087 * The buffer is placed on the transaction's metadata list and is marked 1087 * The buffer is placed on the transaction's metadata list and is marked
1088 * as belonging to the transaction. 1088 * as belonging to the transaction.
1089 * 1089 *
1090 * Returns error number or 0 on success. 1090 * Returns error number or 0 on success.
1091 * 1091 *
1092 * Special care needs to be taken if the buffer already belongs to the 1092 * Special care needs to be taken if the buffer already belongs to the
1093 * current committing transaction (in which case we should have frozen 1093 * current committing transaction (in which case we should have frozen
@@ -1135,11 +1135,11 @@ int journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1135 1135
1136 set_buffer_jbddirty(bh); 1136 set_buffer_jbddirty(bh);
1137 1137
1138 /* 1138 /*
1139 * Metadata already on the current transaction list doesn't 1139 * Metadata already on the current transaction list doesn't
1140 * need to be filed. Metadata on another transaction's list must 1140 * need to be filed. Metadata on another transaction's list must
1141 * be committing, and will be refiled once the commit completes: 1141 * be committing, and will be refiled once the commit completes:
1142 * leave it alone for now. 1142 * leave it alone for now.
1143 */ 1143 */
1144 if (jh->b_transaction != transaction) { 1144 if (jh->b_transaction != transaction) {
1145 JBUFFER_TRACE(jh, "already on other transaction"); 1145 JBUFFER_TRACE(jh, "already on other transaction");
@@ -1165,7 +1165,7 @@ out:
1165 return 0; 1165 return 0;
1166} 1166}
1167 1167
1168/* 1168/*
1169 * journal_release_buffer: undo a get_write_access without any buffer 1169 * journal_release_buffer: undo a get_write_access without any buffer
1170 * updates, if the update decided in the end that it didn't need access. 1170 * updates, if the update decided in the end that it didn't need access.
1171 * 1171 *
@@ -1176,20 +1176,20 @@ journal_release_buffer(handle_t *handle, struct buffer_head *bh)
1176 BUFFER_TRACE(bh, "entry"); 1176 BUFFER_TRACE(bh, "entry");
1177} 1177}
1178 1178
1179/** 1179/**
1180 * void journal_forget() - bforget() for potentially-journaled buffers. 1180 * void journal_forget() - bforget() for potentially-journaled buffers.
1181 * @handle: transaction handle 1181 * @handle: transaction handle
1182 * @bh: bh to 'forget' 1182 * @bh: bh to 'forget'
1183 * 1183 *
1184 * We can only do the bforget if there are no commits pending against the 1184 * We can only do the bforget if there are no commits pending against the
1185 * buffer. If the buffer is dirty in the current running transaction we 1185 * buffer. If the buffer is dirty in the current running transaction we
1186 * can safely unlink it. 1186 * can safely unlink it.
1187 * 1187 *
1188 * bh may not be a journalled buffer at all - it may be a non-JBD 1188 * bh may not be a journalled buffer at all - it may be a non-JBD
1189 * buffer which came off the hashtable. Check for this. 1189 * buffer which came off the hashtable. Check for this.
1190 * 1190 *
1191 * Decrements bh->b_count by one. 1191 * Decrements bh->b_count by one.
1192 * 1192 *
1193 * Allow this call even if the handle has aborted --- it may be part of 1193 * Allow this call even if the handle has aborted --- it may be part of
1194 * the caller's cleanup after an abort. 1194 * the caller's cleanup after an abort.
1195 */ 1195 */
@@ -1237,7 +1237,7 @@ int journal_forget (handle_t *handle, struct buffer_head *bh)
1237 1237
1238 drop_reserve = 1; 1238 drop_reserve = 1;
1239 1239
1240 /* 1240 /*
1241 * We are no longer going to journal this buffer. 1241 * We are no longer going to journal this buffer.
1242 * However, the commit of this transaction is still 1242 * However, the commit of this transaction is still
1243 * important to the buffer: the delete that we are now 1243 * important to the buffer: the delete that we are now
@@ -1246,7 +1246,7 @@ int journal_forget (handle_t *handle, struct buffer_head *bh)
1246 * 1246 *
1247 * So, if we have a checkpoint on the buffer, we should 1247 * So, if we have a checkpoint on the buffer, we should
1248 * now refile the buffer on our BJ_Forget list so that 1248 * now refile the buffer on our BJ_Forget list so that
1249 * we know to remove the checkpoint after we commit. 1249 * we know to remove the checkpoint after we commit.
1250 */ 1250 */
1251 1251
1252 if (jh->b_cp_transaction) { 1252 if (jh->b_cp_transaction) {
@@ -1264,7 +1264,7 @@ int journal_forget (handle_t *handle, struct buffer_head *bh)
1264 } 1264 }
1265 } 1265 }
1266 } else if (jh->b_transaction) { 1266 } else if (jh->b_transaction) {
1267 J_ASSERT_JH(jh, (jh->b_transaction == 1267 J_ASSERT_JH(jh, (jh->b_transaction ==
1268 journal->j_committing_transaction)); 1268 journal->j_committing_transaction));
1269 /* However, if the buffer is still owned by a prior 1269 /* However, if the buffer is still owned by a prior
1270 * (committing) transaction, we can't drop it yet... */ 1270 * (committing) transaction, we can't drop it yet... */
@@ -1294,7 +1294,7 @@ drop:
1294/** 1294/**
1295 * int journal_stop() - complete a transaction 1295 * int journal_stop() - complete a transaction
1296 * @handle: tranaction to complete. 1296 * @handle: tranaction to complete.
1297 * 1297 *
1298 * All done for a particular handle. 1298 * All done for a particular handle.
1299 * 1299 *
1300 * There is not much action needed here. We just return any remaining 1300 * There is not much action needed here. We just return any remaining
@@ -1303,7 +1303,7 @@ drop:
1303 * filesystem is marked for synchronous update. 1303 * filesystem is marked for synchronous update.
1304 * 1304 *
1305 * journal_stop itself will not usually return an error, but it may 1305 * journal_stop itself will not usually return an error, but it may
1306 * do so in unusual circumstances. In particular, expect it to 1306 * do so in unusual circumstances. In particular, expect it to
1307 * return -EIO if a journal_abort has been executed since the 1307 * return -EIO if a journal_abort has been executed since the
1308 * transaction began. 1308 * transaction began.
1309 */ 1309 */
@@ -1373,7 +1373,7 @@ int journal_stop(handle_t *handle)
1373 if (handle->h_sync || 1373 if (handle->h_sync ||
1374 transaction->t_outstanding_credits > 1374 transaction->t_outstanding_credits >
1375 journal->j_max_transaction_buffers || 1375 journal->j_max_transaction_buffers ||
1376 time_after_eq(jiffies, transaction->t_expires)) { 1376 time_after_eq(jiffies, transaction->t_expires)) {
1377 /* Do this even for aborted journals: an abort still 1377 /* Do this even for aborted journals: an abort still
1378 * completes the commit thread, it just doesn't write 1378 * completes the commit thread, it just doesn't write
1379 * anything to disk. */ 1379 * anything to disk. */
@@ -1388,7 +1388,7 @@ int journal_stop(handle_t *handle)
1388 1388
1389 /* 1389 /*
1390 * Special case: JFS_SYNC synchronous updates require us 1390 * Special case: JFS_SYNC synchronous updates require us
1391 * to wait for the commit to complete. 1391 * to wait for the commit to complete.
1392 */ 1392 */
1393 if (handle->h_sync && !(current->flags & PF_MEMALLOC)) 1393 if (handle->h_sync && !(current->flags & PF_MEMALLOC))
1394 err = log_wait_commit(journal, tid); 1394 err = log_wait_commit(journal, tid);
@@ -1439,7 +1439,7 @@ int journal_force_commit(journal_t *journal)
1439 * jbd_lock_bh_state(jh2bh(jh)) is held. 1439 * jbd_lock_bh_state(jh2bh(jh)) is held.
1440 */ 1440 */
1441 1441
1442static inline void 1442static inline void
1443__blist_add_buffer(struct journal_head **list, struct journal_head *jh) 1443__blist_add_buffer(struct journal_head **list, struct journal_head *jh)
1444{ 1444{
1445 if (!*list) { 1445 if (!*list) {
@@ -1454,7 +1454,7 @@ __blist_add_buffer(struct journal_head **list, struct journal_head *jh)
1454 } 1454 }
1455} 1455}
1456 1456
1457/* 1457/*
1458 * Remove a buffer from a transaction list, given the transaction's list 1458 * Remove a buffer from a transaction list, given the transaction's list
1459 * head pointer. 1459 * head pointer.
1460 * 1460 *
@@ -1475,7 +1475,7 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
1475 jh->b_tnext->b_tprev = jh->b_tprev; 1475 jh->b_tnext->b_tprev = jh->b_tprev;
1476} 1476}
1477 1477
1478/* 1478/*
1479 * Remove a buffer from the appropriate transaction list. 1479 * Remove a buffer from the appropriate transaction list.
1480 * 1480 *
1481 * Note that this function can *change* the value of 1481 * Note that this function can *change* the value of
@@ -1595,17 +1595,17 @@ out:
1595} 1595}
1596 1596
1597 1597
1598/** 1598/**
1599 * int journal_try_to_free_buffers() - try to free page buffers. 1599 * int journal_try_to_free_buffers() - try to free page buffers.
1600 * @journal: journal for operation 1600 * @journal: journal for operation
1601 * @page: to try and free 1601 * @page: to try and free
1602 * @unused_gfp_mask: unused 1602 * @unused_gfp_mask: unused
1603 * 1603 *
1604 * 1604 *
1605 * For all the buffers on this page, 1605 * For all the buffers on this page,
1606 * if they are fully written out ordered data, move them onto BUF_CLEAN 1606 * if they are fully written out ordered data, move them onto BUF_CLEAN
1607 * so try_to_free_buffers() can reap them. 1607 * so try_to_free_buffers() can reap them.
1608 * 1608 *
1609 * This function returns non-zero if we wish try_to_free_buffers() 1609 * This function returns non-zero if we wish try_to_free_buffers()
1610 * to be called. We do this if the page is releasable by try_to_free_buffers(). 1610 * to be called. We do this if the page is releasable by try_to_free_buffers().
1611 * We also do it if the page has locked or dirty buffers and the caller wants 1611 * We also do it if the page has locked or dirty buffers and the caller wants
@@ -1629,7 +1629,7 @@ out:
1629 * cannot happen because we never reallocate freed data as metadata 1629 * cannot happen because we never reallocate freed data as metadata
1630 * while the data is part of a transaction. Yes? 1630 * while the data is part of a transaction. Yes?
1631 */ 1631 */
1632int journal_try_to_free_buffers(journal_t *journal, 1632int journal_try_to_free_buffers(journal_t *journal,
1633 struct page *page, gfp_t unused_gfp_mask) 1633 struct page *page, gfp_t unused_gfp_mask)
1634{ 1634{
1635 struct buffer_head *head; 1635 struct buffer_head *head;
@@ -1697,7 +1697,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
1697} 1697}
1698 1698
1699/* 1699/*
1700 * journal_invalidatepage 1700 * journal_invalidatepage
1701 * 1701 *
1702 * This code is tricky. It has a number of cases to deal with. 1702 * This code is tricky. It has a number of cases to deal with.
1703 * 1703 *
@@ -1705,15 +1705,15 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
1705 * 1705 *
1706 * i_size must be updated on disk before we start calling invalidatepage on the 1706 * i_size must be updated on disk before we start calling invalidatepage on the
1707 * data. 1707 * data.
1708 * 1708 *
1709 * This is done in ext3 by defining an ext3_setattr method which 1709 * This is done in ext3 by defining an ext3_setattr method which
1710 * updates i_size before truncate gets going. By maintaining this 1710 * updates i_size before truncate gets going. By maintaining this
1711 * invariant, we can be sure that it is safe to throw away any buffers 1711 * invariant, we can be sure that it is safe to throw away any buffers
1712 * attached to the current transaction: once the transaction commits, 1712 * attached to the current transaction: once the transaction commits,
1713 * we know that the data will not be needed. 1713 * we know that the data will not be needed.
1714 * 1714 *
1715 * Note however that we can *not* throw away data belonging to the 1715 * Note however that we can *not* throw away data belonging to the
1716 * previous, committing transaction! 1716 * previous, committing transaction!
1717 * 1717 *
1718 * Any disk blocks which *are* part of the previous, committing 1718 * Any disk blocks which *are* part of the previous, committing
1719 * transaction (and which therefore cannot be discarded immediately) are 1719 * transaction (and which therefore cannot be discarded immediately) are
@@ -1732,7 +1732,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
1732 * don't make guarantees about the order in which data hits disk --- in 1732 * don't make guarantees about the order in which data hits disk --- in
1733 * particular we don't guarantee that new dirty data is flushed before 1733 * particular we don't guarantee that new dirty data is flushed before
1734 * transaction commit --- so it is always safe just to discard data 1734 * transaction commit --- so it is always safe just to discard data
1735 * immediately in that mode. --sct 1735 * immediately in that mode. --sct
1736 */ 1736 */
1737 1737
1738/* 1738/*
@@ -1876,9 +1876,9 @@ zap_buffer_unlocked:
1876 return may_free; 1876 return may_free;
1877} 1877}
1878 1878
1879/** 1879/**
1880 * void journal_invalidatepage() 1880 * void journal_invalidatepage()
1881 * @journal: journal to use for flush... 1881 * @journal: journal to use for flush...
1882 * @page: page to flush 1882 * @page: page to flush
1883 * @offset: length of page to invalidate. 1883 * @offset: length of page to invalidate.
1884 * 1884 *
@@ -1886,7 +1886,7 @@ zap_buffer_unlocked:
1886 * 1886 *
1887 */ 1887 */
1888void journal_invalidatepage(journal_t *journal, 1888void journal_invalidatepage(journal_t *journal,
1889 struct page *page, 1889 struct page *page,
1890 unsigned long offset) 1890 unsigned long offset)
1891{ 1891{
1892 struct buffer_head *head, *bh, *next; 1892 struct buffer_head *head, *bh, *next;
@@ -1908,7 +1908,7 @@ void journal_invalidatepage(journal_t *journal,
1908 next = bh->b_this_page; 1908 next = bh->b_this_page;
1909 1909
1910 if (offset <= curr_off) { 1910 if (offset <= curr_off) {
1911 /* This block is wholly outside the truncation point */ 1911 /* This block is wholly outside the truncation point */
1912 lock_buffer(bh); 1912 lock_buffer(bh);
1913 may_free &= journal_unmap_buffer(journal, bh); 1913 may_free &= journal_unmap_buffer(journal, bh);
1914 unlock_buffer(bh); 1914 unlock_buffer(bh);
@@ -1924,8 +1924,8 @@ void journal_invalidatepage(journal_t *journal,
1924 } 1924 }
1925} 1925}
1926 1926
1927/* 1927/*
1928 * File a buffer on the given transaction list. 1928 * File a buffer on the given transaction list.
1929 */ 1929 */
1930void __journal_file_buffer(struct journal_head *jh, 1930void __journal_file_buffer(struct journal_head *jh,
1931 transaction_t *transaction, int jlist) 1931 transaction_t *transaction, int jlist)
@@ -1948,7 +1948,7 @@ void __journal_file_buffer(struct journal_head *jh,
1948 * with __jbd_unexpected_dirty_buffer()'s handling of dirty 1948 * with __jbd_unexpected_dirty_buffer()'s handling of dirty
1949 * state. */ 1949 * state. */
1950 1950
1951 if (jlist == BJ_Metadata || jlist == BJ_Reserved || 1951 if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
1952 jlist == BJ_Shadow || jlist == BJ_Forget) { 1952 jlist == BJ_Shadow || jlist == BJ_Forget) {
1953 if (test_clear_buffer_dirty(bh) || 1953 if (test_clear_buffer_dirty(bh) ||
1954 test_clear_buffer_jbddirty(bh)) 1954 test_clear_buffer_jbddirty(bh))
@@ -2008,7 +2008,7 @@ void journal_file_buffer(struct journal_head *jh,
2008 jbd_unlock_bh_state(jh2bh(jh)); 2008 jbd_unlock_bh_state(jh2bh(jh));
2009} 2009}
2010 2010
2011/* 2011/*
2012 * Remove a buffer from its current buffer list in preparation for 2012 * Remove a buffer from its current buffer list in preparation for
2013 * dropping it from its current transaction entirely. If the buffer has 2013 * dropping it from its current transaction entirely. If the buffer has
2014 * already started to be used by a subsequent transaction, refile the 2014 * already started to be used by a subsequent transaction, refile the
@@ -2060,7 +2060,7 @@ void __journal_refile_buffer(struct journal_head *jh)
2060 * to the caller to remove the journal_head if necessary. For the 2060 * to the caller to remove the journal_head if necessary. For the
2061 * unlocked journal_refile_buffer call, the caller isn't going to be 2061 * unlocked journal_refile_buffer call, the caller isn't going to be
2062 * doing anything else to the buffer so we need to do the cleanup 2062 * doing anything else to the buffer so we need to do the cleanup
2063 * ourselves to avoid a jh leak. 2063 * ourselves to avoid a jh leak.
2064 * 2064 *
2065 * *** The journal_head may be freed by this call! *** 2065 * *** The journal_head may be freed by this call! ***
2066 */ 2066 */
diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c
index 93068697a9bf..f5cf9c93e243 100644
--- a/fs/jffs/inode-v23.c
+++ b/fs/jffs/inode-v23.c
@@ -364,12 +364,11 @@ jffs_new_inode(const struct inode * dir, struct jffs_raw_inode *raw_inode,
364 inode->i_ctime.tv_nsec = 0; 364 inode->i_ctime.tv_nsec = 0;
365 inode->i_mtime.tv_nsec = 0; 365 inode->i_mtime.tv_nsec = 0;
366 inode->i_atime.tv_nsec = 0; 366 inode->i_atime.tv_nsec = 0;
367 inode->i_blksize = PAGE_SIZE;
368 inode->i_blocks = (inode->i_size + 511) >> 9; 367 inode->i_blocks = (inode->i_size + 511) >> 9;
369 368
370 f = jffs_find_file(c, raw_inode->ino); 369 f = jffs_find_file(c, raw_inode->ino);
371 370
372 inode->u.generic_ip = (void *)f; 371 inode->i_private = (void *)f;
373 insert_inode_hash(inode); 372 insert_inode_hash(inode);
374 373
375 return inode; 374 return inode;
@@ -442,7 +441,7 @@ jffs_rename(struct inode *old_dir, struct dentry *old_dentry,
442 }); 441 });
443 442
444 result = -ENOTDIR; 443 result = -ENOTDIR;
445 if (!(old_dir_f = (struct jffs_file *)old_dir->u.generic_ip)) { 444 if (!(old_dir_f = old_dir->i_private)) {
446 D(printk("jffs_rename(): Old dir invalid.\n")); 445 D(printk("jffs_rename(): Old dir invalid.\n"));
447 goto jffs_rename_end; 446 goto jffs_rename_end;
448 } 447 }
@@ -456,7 +455,7 @@ jffs_rename(struct inode *old_dir, struct dentry *old_dentry,
456 455
457 /* Find the new directory. */ 456 /* Find the new directory. */
458 result = -ENOTDIR; 457 result = -ENOTDIR;
459 if (!(new_dir_f = (struct jffs_file *)new_dir->u.generic_ip)) { 458 if (!(new_dir_f = new_dir->i_private)) {
460 D(printk("jffs_rename(): New dir invalid.\n")); 459 D(printk("jffs_rename(): New dir invalid.\n"));
461 goto jffs_rename_end; 460 goto jffs_rename_end;
462 } 461 }
@@ -593,7 +592,7 @@ jffs_readdir(struct file *filp, void *dirent, filldir_t filldir)
593 } 592 }
594 else { 593 else {
595 ddino = ((struct jffs_file *) 594 ddino = ((struct jffs_file *)
596 inode->u.generic_ip)->pino; 595 inode->i_private)->pino;
597 } 596 }
598 D3(printk("jffs_readdir(): \"..\" %u\n", ddino)); 597 D3(printk("jffs_readdir(): \"..\" %u\n", ddino));
599 if (filldir(dirent, "..", 2, filp->f_pos, ddino, DT_DIR) < 0) { 598 if (filldir(dirent, "..", 2, filp->f_pos, ddino, DT_DIR) < 0) {
@@ -604,7 +603,7 @@ jffs_readdir(struct file *filp, void *dirent, filldir_t filldir)
604 } 603 }
605 filp->f_pos++; 604 filp->f_pos++;
606 } 605 }
607 f = ((struct jffs_file *)inode->u.generic_ip)->children; 606 f = ((struct jffs_file *)inode->i_private)->children;
608 607
609 j = 2; 608 j = 2;
610 while(f && (f->deleted || j++ < filp->f_pos )) { 609 while(f && (f->deleted || j++ < filp->f_pos )) {
@@ -652,7 +651,7 @@ jffs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
652 lock_kernel(); 651 lock_kernel();
653 652
654 D3({ 653 D3({
655 char *s = (char *)kmalloc(len + 1, GFP_KERNEL); 654 char *s = kmalloc(len + 1, GFP_KERNEL);
656 memcpy(s, name, len); 655 memcpy(s, name, len);
657 s[len] = '\0'; 656 s[len] = '\0';
658 printk("jffs_lookup(): dir: 0x%p, name: \"%s\"\n", dir, s); 657 printk("jffs_lookup(): dir: 0x%p, name: \"%s\"\n", dir, s);
@@ -668,7 +667,7 @@ jffs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
668 } 667 }
669 668
670 r = -EACCES; 669 r = -EACCES;
671 if (!(d = (struct jffs_file *)dir->u.generic_ip)) { 670 if (!(d = (struct jffs_file *)dir->i_private)) {
672 D(printk("jffs_lookup(): No such inode! (%lu)\n", 671 D(printk("jffs_lookup(): No such inode! (%lu)\n",
673 dir->i_ino)); 672 dir->i_ino));
674 goto jffs_lookup_end; 673 goto jffs_lookup_end;
@@ -739,7 +738,7 @@ jffs_do_readpage_nolock(struct file *file, struct page *page)
739 unsigned long read_len; 738 unsigned long read_len;
740 int result; 739 int result;
741 struct inode *inode = (struct inode*)page->mapping->host; 740 struct inode *inode = (struct inode*)page->mapping->host;
742 struct jffs_file *f = (struct jffs_file *)inode->u.generic_ip; 741 struct jffs_file *f = (struct jffs_file *)inode->i_private;
743 struct jffs_control *c = (struct jffs_control *)inode->i_sb->s_fs_info; 742 struct jffs_control *c = (struct jffs_control *)inode->i_sb->s_fs_info;
744 int r; 743 int r;
745 loff_t offset; 744 loff_t offset;
@@ -828,7 +827,7 @@ jffs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
828 }); 827 });
829 828
830 lock_kernel(); 829 lock_kernel();
831 dir_f = (struct jffs_file *)dir->u.generic_ip; 830 dir_f = dir->i_private;
832 831
833 ASSERT(if (!dir_f) { 832 ASSERT(if (!dir_f) {
834 printk(KERN_ERR "jffs_mkdir(): No reference to a " 833 printk(KERN_ERR "jffs_mkdir(): No reference to a "
@@ -972,7 +971,7 @@ jffs_remove(struct inode *dir, struct dentry *dentry, int type)
972 kfree(_name); 971 kfree(_name);
973 }); 972 });
974 973
975 dir_f = (struct jffs_file *) dir->u.generic_ip; 974 dir_f = dir->i_private;
976 c = dir_f->c; 975 c = dir_f->c;
977 976
978 result = -ENOENT; 977 result = -ENOENT;
@@ -1082,7 +1081,7 @@ jffs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
1082 if (!old_valid_dev(rdev)) 1081 if (!old_valid_dev(rdev))
1083 return -EINVAL; 1082 return -EINVAL;
1084 lock_kernel(); 1083 lock_kernel();
1085 dir_f = (struct jffs_file *)dir->u.generic_ip; 1084 dir_f = dir->i_private;
1086 c = dir_f->c; 1085 c = dir_f->c;
1087 1086
1088 D3(printk (KERN_NOTICE "mknod(): down biglock\n")); 1087 D3(printk (KERN_NOTICE "mknod(): down biglock\n"));
@@ -1173,8 +1172,8 @@ jffs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
1173 lock_kernel(); 1172 lock_kernel();
1174 D1({ 1173 D1({
1175 int len = dentry->d_name.len; 1174 int len = dentry->d_name.len;
1176 char *_name = (char *)kmalloc(len + 1, GFP_KERNEL); 1175 char *_name = kmalloc(len + 1, GFP_KERNEL);
1177 char *_symname = (char *)kmalloc(symname_len + 1, GFP_KERNEL); 1176 char *_symname = kmalloc(symname_len + 1, GFP_KERNEL);
1178 memcpy(_name, dentry->d_name.name, len); 1177 memcpy(_name, dentry->d_name.name, len);
1179 _name[len] = '\0'; 1178 _name[len] = '\0';
1180 memcpy(_symname, symname, symname_len); 1179 memcpy(_symname, symname, symname_len);
@@ -1186,7 +1185,7 @@ jffs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
1186 kfree(_symname); 1185 kfree(_symname);
1187 }); 1186 });
1188 1187
1189 dir_f = (struct jffs_file *)dir->u.generic_ip; 1188 dir_f = dir->i_private;
1190 ASSERT(if (!dir_f) { 1189 ASSERT(if (!dir_f) {
1191 printk(KERN_ERR "jffs_symlink(): No reference to a " 1190 printk(KERN_ERR "jffs_symlink(): No reference to a "
1192 "jffs_file struct in inode.\n"); 1191 "jffs_file struct in inode.\n");
@@ -1282,14 +1281,14 @@ jffs_create(struct inode *dir, struct dentry *dentry, int mode,
1282 lock_kernel(); 1281 lock_kernel();
1283 D1({ 1282 D1({
1284 int len = dentry->d_name.len; 1283 int len = dentry->d_name.len;
1285 char *s = (char *)kmalloc(len + 1, GFP_KERNEL); 1284 char *s = kmalloc(len + 1, GFP_KERNEL);
1286 memcpy(s, dentry->d_name.name, len); 1285 memcpy(s, dentry->d_name.name, len);
1287 s[len] = '\0'; 1286 s[len] = '\0';
1288 printk("jffs_create(): dir: 0x%p, name: \"%s\"\n", dir, s); 1287 printk("jffs_create(): dir: 0x%p, name: \"%s\"\n", dir, s);
1289 kfree(s); 1288 kfree(s);
1290 }); 1289 });
1291 1290
1292 dir_f = (struct jffs_file *)dir->u.generic_ip; 1291 dir_f = dir->i_private;
1293 ASSERT(if (!dir_f) { 1292 ASSERT(if (!dir_f) {
1294 printk(KERN_ERR "jffs_create(): No reference to a " 1293 printk(KERN_ERR "jffs_create(): No reference to a "
1295 "jffs_file struct in inode.\n"); 1294 "jffs_file struct in inode.\n");
@@ -1403,9 +1402,9 @@ jffs_file_write(struct file *filp, const char *buf, size_t count,
1403 goto out_isem; 1402 goto out_isem;
1404 } 1403 }
1405 1404
1406 if (!(f = (struct jffs_file *)inode->u.generic_ip)) { 1405 if (!(f = inode->i_private)) {
1407 D(printk("jffs_file_write(): inode->u.generic_ip = 0x%p\n", 1406 D(printk("jffs_file_write(): inode->i_private = 0x%p\n",
1408 inode->u.generic_ip)); 1407 inode->i_private));
1409 goto out_isem; 1408 goto out_isem;
1410 } 1409 }
1411 1410
@@ -1693,7 +1692,7 @@ jffs_read_inode(struct inode *inode)
1693 mutex_unlock(&c->fmc->biglock); 1692 mutex_unlock(&c->fmc->biglock);
1694 return; 1693 return;
1695 } 1694 }
1696 inode->u.generic_ip = (void *)f; 1695 inode->i_private = f;
1697 inode->i_mode = f->mode; 1696 inode->i_mode = f->mode;
1698 inode->i_nlink = f->nlink; 1697 inode->i_nlink = f->nlink;
1699 inode->i_uid = f->uid; 1698 inode->i_uid = f->uid;
@@ -1706,7 +1705,6 @@ jffs_read_inode(struct inode *inode)
1706 inode->i_mtime.tv_nsec = 1705 inode->i_mtime.tv_nsec =
1707 inode->i_ctime.tv_nsec = 0; 1706 inode->i_ctime.tv_nsec = 0;
1708 1707
1709 inode->i_blksize = PAGE_SIZE;
1710 inode->i_blocks = (inode->i_size + 511) >> 9; 1708 inode->i_blocks = (inode->i_size + 511) >> 9;
1711 if (S_ISREG(inode->i_mode)) { 1709 if (S_ISREG(inode->i_mode)) {
1712 inode->i_op = &jffs_file_inode_operations; 1710 inode->i_op = &jffs_file_inode_operations;
@@ -1748,7 +1746,7 @@ jffs_delete_inode(struct inode *inode)
1748 lock_kernel(); 1746 lock_kernel();
1749 inode->i_size = 0; 1747 inode->i_size = 0;
1750 inode->i_blocks = 0; 1748 inode->i_blocks = 0;
1751 inode->u.generic_ip = NULL; 1749 inode->i_private = NULL;
1752 clear_inode(inode); 1750 clear_inode(inode);
1753 if (inode->i_nlink == 0) { 1751 if (inode->i_nlink == 0) {
1754 c = (struct jffs_control *) inode->i_sb->s_fs_info; 1752 c = (struct jffs_control *) inode->i_sb->s_fs_info;
diff --git a/fs/jffs/intrep.c b/fs/jffs/intrep.c
index 9000f1effedf..4a543e114970 100644
--- a/fs/jffs/intrep.c
+++ b/fs/jffs/intrep.c
@@ -488,13 +488,11 @@ jffs_create_file(struct jffs_control *c,
488{ 488{
489 struct jffs_file *f; 489 struct jffs_file *f;
490 490
491 if (!(f = (struct jffs_file *)kmalloc(sizeof(struct jffs_file), 491 if (!(f = kzalloc(sizeof(*f), GFP_KERNEL))) {
492 GFP_KERNEL))) {
493 D(printk("jffs_create_file(): Failed!\n")); 492 D(printk("jffs_create_file(): Failed!\n"));
494 return NULL; 493 return NULL;
495 } 494 }
496 no_jffs_file++; 495 no_jffs_file++;
497 memset(f, 0, sizeof(struct jffs_file));
498 f->ino = raw_inode->ino; 496 f->ino = raw_inode->ino;
499 f->pino = raw_inode->pino; 497 f->pino = raw_inode->pino;
500 f->nlink = raw_inode->nlink; 498 f->nlink = raw_inode->nlink;
@@ -516,7 +514,7 @@ jffs_create_control(struct super_block *sb)
516 514
517 D2(printk("jffs_create_control()\n")); 515 D2(printk("jffs_create_control()\n"));
518 516
519 if (!(c = (struct jffs_control *)kmalloc(s, GFP_KERNEL))) { 517 if (!(c = kmalloc(s, GFP_KERNEL))) {
520 goto fail_control; 518 goto fail_control;
521 } 519 }
522 DJM(no_jffs_control++); 520 DJM(no_jffs_control++);
@@ -524,7 +522,7 @@ jffs_create_control(struct super_block *sb)
524 c->gc_task = NULL; 522 c->gc_task = NULL;
525 c->hash_len = JFFS_HASH_SIZE; 523 c->hash_len = JFFS_HASH_SIZE;
526 s = sizeof(struct list_head) * c->hash_len; 524 s = sizeof(struct list_head) * c->hash_len;
527 if (!(c->hash = (struct list_head *)kmalloc(s, GFP_KERNEL))) { 525 if (!(c->hash = kmalloc(s, GFP_KERNEL))) {
528 goto fail_hash; 526 goto fail_hash;
529 } 527 }
530 DJM(no_hash++); 528 DJM(no_hash++);
@@ -593,8 +591,7 @@ jffs_add_virtual_root(struct jffs_control *c)
593 D2(printk("jffs_add_virtual_root(): " 591 D2(printk("jffs_add_virtual_root(): "
594 "Creating a virtual root directory.\n")); 592 "Creating a virtual root directory.\n"));
595 593
596 if (!(root = (struct jffs_file *)kmalloc(sizeof(struct jffs_file), 594 if (!(root = kmalloc(sizeof(struct jffs_file), GFP_KERNEL))) {
597 GFP_KERNEL))) {
598 return -ENOMEM; 595 return -ENOMEM;
599 } 596 }
600 no_jffs_file++; 597 no_jffs_file++;
diff --git a/fs/jffs/jffs_fm.c b/fs/jffs/jffs_fm.c
index 7d8ca1aeace2..29b68d939bd9 100644
--- a/fs/jffs/jffs_fm.c
+++ b/fs/jffs/jffs_fm.c
@@ -94,8 +94,7 @@ jffs_build_begin(struct jffs_control *c, int unit)
94 struct mtd_info *mtd; 94 struct mtd_info *mtd;
95 95
96 D3(printk("jffs_build_begin()\n")); 96 D3(printk("jffs_build_begin()\n"));
97 fmc = (struct jffs_fmcontrol *)kmalloc(sizeof(struct jffs_fmcontrol), 97 fmc = kmalloc(sizeof(*fmc), GFP_KERNEL);
98 GFP_KERNEL);
99 if (!fmc) { 98 if (!fmc) {
100 D(printk("jffs_build_begin(): Allocation of " 99 D(printk("jffs_build_begin(): Allocation of "
101 "struct jffs_fmcontrol failed!\n")); 100 "struct jffs_fmcontrol failed!\n"));
@@ -486,8 +485,7 @@ jffs_add_node(struct jffs_node *node)
486 485
487 D3(printk("jffs_add_node(): ino = %u\n", node->ino)); 486 D3(printk("jffs_add_node(): ino = %u\n", node->ino));
488 487
489 ref = (struct jffs_node_ref *)kmalloc(sizeof(struct jffs_node_ref), 488 ref = kmalloc(sizeof(*ref), GFP_KERNEL);
490 GFP_KERNEL);
491 if (!ref) 489 if (!ref)
492 return -ENOMEM; 490 return -ENOMEM;
493 491
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 4780f82825d6..72d9909d95ff 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -263,7 +263,6 @@ void jffs2_read_inode (struct inode *inode)
263 263
264 inode->i_nlink = f->inocache->nlink; 264 inode->i_nlink = f->inocache->nlink;
265 265
266 inode->i_blksize = PAGE_SIZE;
267 inode->i_blocks = (inode->i_size + 511) >> 9; 266 inode->i_blocks = (inode->i_size + 511) >> 9;
268 267
269 switch (inode->i_mode & S_IFMT) { 268 switch (inode->i_mode & S_IFMT) {
@@ -449,7 +448,6 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i
449 inode->i_atime = inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; 448 inode->i_atime = inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
450 ri->atime = ri->mtime = ri->ctime = cpu_to_je32(I_SEC(inode->i_mtime)); 449 ri->atime = ri->mtime = ri->ctime = cpu_to_je32(I_SEC(inode->i_mtime));
451 450
452 inode->i_blksize = PAGE_SIZE;
453 inode->i_blocks = 0; 451 inode->i_blocks = 0;
454 inode->i_size = 0; 452 inode->i_size = 0;
455 453
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 68e3953419b4..6de374513c01 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -119,10 +119,9 @@ static int jffs2_get_sb_mtd(struct file_system_type *fs_type,
119 struct jffs2_sb_info *c; 119 struct jffs2_sb_info *c;
120 int ret; 120 int ret;
121 121
122 c = kmalloc(sizeof(*c), GFP_KERNEL); 122 c = kzalloc(sizeof(*c), GFP_KERNEL);
123 if (!c) 123 if (!c)
124 return -ENOMEM; 124 return -ENOMEM;
125 memset(c, 0, sizeof(*c));
126 c->mtd = mtd; 125 c->mtd = mtd;
127 126
128 sb = sget(fs_type, jffs2_sb_compare, jffs2_sb_set, c); 127 sb = sget(fs_type, jffs2_sb_compare, jffs2_sb_set, c);
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index 4d52593a5fc6..4c74f0944f7e 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -468,7 +468,7 @@ int extRecord(struct inode *ip, xad_t * xp)
468int extFill(struct inode *ip, xad_t * xp) 468int extFill(struct inode *ip, xad_t * xp)
469{ 469{
470 int rc, nbperpage = JFS_SBI(ip->i_sb)->nbperpage; 470 int rc, nbperpage = JFS_SBI(ip->i_sb)->nbperpage;
471 s64 blkno = offsetXAD(xp) >> ip->i_blksize; 471 s64 blkno = offsetXAD(xp) >> ip->i_blkbits;
472 472
473// assert(ISSPARSE(ip)); 473// assert(ISSPARSE(ip));
474 474
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index ccbe60aff83d..369d7f39c040 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -3115,7 +3115,6 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip)
3115 ip->i_mtime.tv_nsec = le32_to_cpu(dip->di_mtime.tv_nsec); 3115 ip->i_mtime.tv_nsec = le32_to_cpu(dip->di_mtime.tv_nsec);
3116 ip->i_ctime.tv_sec = le32_to_cpu(dip->di_ctime.tv_sec); 3116 ip->i_ctime.tv_sec = le32_to_cpu(dip->di_ctime.tv_sec);
3117 ip->i_ctime.tv_nsec = le32_to_cpu(dip->di_ctime.tv_nsec); 3117 ip->i_ctime.tv_nsec = le32_to_cpu(dip->di_ctime.tv_nsec);
3118 ip->i_blksize = ip->i_sb->s_blocksize;
3119 ip->i_blocks = LBLK2PBLK(ip->i_sb, le64_to_cpu(dip->di_nblocks)); 3118 ip->i_blocks = LBLK2PBLK(ip->i_sb, le64_to_cpu(dip->di_nblocks));
3120 ip->i_generation = le32_to_cpu(dip->di_gen); 3119 ip->i_generation = le32_to_cpu(dip->di_gen);
3121 3120
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 495df402916d..bffaca9ae3a2 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -115,7 +115,6 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
115 } 115 }
116 jfs_inode->mode2 |= mode; 116 jfs_inode->mode2 |= mode;
117 117
118 inode->i_blksize = sb->s_blocksize;
119 inode->i_blocks = 0; 118 inode->i_blocks = 0;
120 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 119 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
121 jfs_inode->otime = inode->i_ctime.tv_sec; 120 jfs_inode->otime = inode->i_ctime.tv_sec;
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index e1e0a6e6ebdf..f5afc129d6b1 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -257,7 +257,7 @@ static sector_t metapage_get_blocks(struct inode *inode, sector_t lblock,
257 int rc = 0; 257 int rc = 0;
258 int xflag; 258 int xflag;
259 s64 xaddr; 259 s64 xaddr;
260 sector_t file_blocks = (inode->i_size + inode->i_blksize - 1) >> 260 sector_t file_blocks = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
261 inode->i_blkbits; 261 inode->i_blkbits;
262 262
263 if (lblock >= file_blocks) 263 if (lblock >= file_blocks)
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index efbb586bed4b..3856efc399c1 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -282,7 +282,7 @@ int txInit(void)
282 TxLockVHWM = (nTxLock * 8) / 10; 282 TxLockVHWM = (nTxLock * 8) / 10;
283 283
284 size = sizeof(struct tblock) * nTxBlock; 284 size = sizeof(struct tblock) * nTxBlock;
285 TxBlock = (struct tblock *) vmalloc(size); 285 TxBlock = vmalloc(size);
286 if (TxBlock == NULL) 286 if (TxBlock == NULL)
287 return -ENOMEM; 287 return -ENOMEM;
288 288
@@ -307,7 +307,7 @@ int txInit(void)
307 * tlock id = 0 is reserved. 307 * tlock id = 0 is reserved.
308 */ 308 */
309 size = sizeof(struct tlock) * nTxLock; 309 size = sizeof(struct tlock) * nTxLock;
310 TxLock = (struct tlock *) vmalloc(size); 310 TxLock = vmalloc(size);
311 if (TxLock == NULL) { 311 if (TxLock == NULL) {
312 vfree(TxBlock); 312 vfree(TxBlock);
313 return -ENOMEM; 313 return -ENOMEM;
diff --git a/fs/libfs.c b/fs/libfs.c
index ac02ea602c3d..3793aaa14577 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -317,17 +317,9 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
317 317
318int simple_readpage(struct file *file, struct page *page) 318int simple_readpage(struct file *file, struct page *page)
319{ 319{
320 void *kaddr; 320 clear_highpage(page);
321
322 if (PageUptodate(page))
323 goto out;
324
325 kaddr = kmap_atomic(page, KM_USER0);
326 memset(kaddr, 0, PAGE_CACHE_SIZE);
327 kunmap_atomic(kaddr, KM_USER0);
328 flush_dcache_page(page); 321 flush_dcache_page(page);
329 SetPageUptodate(page); 322 SetPageUptodate(page);
330out:
331 unlock_page(page); 323 unlock_page(page);
332 return 0; 324 return 0;
333} 325}
@@ -383,7 +375,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files
383 return -ENOMEM; 375 return -ENOMEM;
384 inode->i_mode = S_IFDIR | 0755; 376 inode->i_mode = S_IFDIR | 0755;
385 inode->i_uid = inode->i_gid = 0; 377 inode->i_uid = inode->i_gid = 0;
386 inode->i_blksize = PAGE_CACHE_SIZE;
387 inode->i_blocks = 0; 378 inode->i_blocks = 0;
388 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 379 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
389 inode->i_op = &simple_dir_inode_operations; 380 inode->i_op = &simple_dir_inode_operations;
@@ -405,7 +396,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files
405 goto out; 396 goto out;
406 inode->i_mode = S_IFREG | files->mode; 397 inode->i_mode = S_IFREG | files->mode;
407 inode->i_uid = inode->i_gid = 0; 398 inode->i_uid = inode->i_gid = 0;
408 inode->i_blksize = PAGE_CACHE_SIZE;
409 inode->i_blocks = 0; 399 inode->i_blocks = 0;
410 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 400 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
411 inode->i_fop = files->ops; 401 inode->i_fop = files->ops;
@@ -547,7 +537,7 @@ int simple_attr_open(struct inode *inode, struct file *file,
547 537
548 attr->get = get; 538 attr->get = get;
549 attr->set = set; 539 attr->set = set;
550 attr->data = inode->u.generic_ip; 540 attr->data = inode->i_private;
551 attr->fmt = fmt; 541 attr->fmt = fmt;
552 mutex_init(&attr->mutex); 542 mutex_init(&attr->mutex);
553 543
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 52774feab93f..f95cc3f3c42d 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -160,7 +160,7 @@ static void nlmclnt_prepare_reclaim(struct nlm_host *host)
160 */ 160 */
161 list_splice_init(&host->h_granted, &host->h_reclaim); 161 list_splice_init(&host->h_granted, &host->h_reclaim);
162 162
163 dprintk("NLM: reclaiming locks for host %s", host->h_name); 163 dprintk("NLM: reclaiming locks for host %s\n", host->h_name);
164} 164}
165 165
166static void nlmclnt_finish_reclaim(struct nlm_host *host) 166static void nlmclnt_finish_reclaim(struct nlm_host *host)
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 89ba0df14c22..271e2165fff6 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -100,7 +100,7 @@ static struct nlm_lockowner *nlm_find_lockowner(struct nlm_host *host, fl_owner_
100 res = __nlm_find_lockowner(host, owner); 100 res = __nlm_find_lockowner(host, owner);
101 if (res == NULL) { 101 if (res == NULL) {
102 spin_unlock(&host->h_lock); 102 spin_unlock(&host->h_lock);
103 new = (struct nlm_lockowner *)kmalloc(sizeof(*new), GFP_KERNEL); 103 new = kmalloc(sizeof(*new), GFP_KERNEL);
104 spin_lock(&host->h_lock); 104 spin_lock(&host->h_lock);
105 res = __nlm_find_lockowner(host, owner); 105 res = __nlm_find_lockowner(host, owner);
106 if (res == NULL && new != NULL) { 106 if (res == NULL && new != NULL) {
@@ -151,11 +151,13 @@ static void nlmclnt_release_lockargs(struct nlm_rqst *req)
151int 151int
152nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl) 152nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl)
153{ 153{
154 struct rpc_clnt *client = NFS_CLIENT(inode);
155 struct sockaddr_in addr;
154 struct nlm_host *host; 156 struct nlm_host *host;
155 struct nlm_rqst *call; 157 struct nlm_rqst *call;
156 sigset_t oldset; 158 sigset_t oldset;
157 unsigned long flags; 159 unsigned long flags;
158 int status, proto, vers; 160 int status, vers;
159 161
160 vers = (NFS_PROTO(inode)->version == 3) ? 4 : 1; 162 vers = (NFS_PROTO(inode)->version == 3) ? 4 : 1;
161 if (NFS_PROTO(inode)->version > 3) { 163 if (NFS_PROTO(inode)->version > 3) {
@@ -163,10 +165,8 @@ nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl)
163 return -ENOLCK; 165 return -ENOLCK;
164 } 166 }
165 167
166 /* Retrieve transport protocol from NFS client */ 168 rpc_peeraddr(client, (struct sockaddr *) &addr, sizeof(addr));
167 proto = NFS_CLIENT(inode)->cl_xprt->prot; 169 host = nlmclnt_lookup_host(&addr, client->cl_xprt->prot, vers);
168
169 host = nlmclnt_lookup_host(NFS_ADDR(inode), proto, vers);
170 if (host == NULL) 170 if (host == NULL)
171 return -ENOLCK; 171 return -ENOLCK;
172 172
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 38b0e8a1aec0..a0d0b58ce7a4 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -26,7 +26,6 @@
26#define NLM_HOST_REBIND (60 * HZ) 26#define NLM_HOST_REBIND (60 * HZ)
27#define NLM_HOST_EXPIRE ((nrhosts > NLM_HOST_MAX)? 300 * HZ : 120 * HZ) 27#define NLM_HOST_EXPIRE ((nrhosts > NLM_HOST_MAX)? 300 * HZ : 120 * HZ)
28#define NLM_HOST_COLLECT ((nrhosts > NLM_HOST_MAX)? 120 * HZ : 60 * HZ) 28#define NLM_HOST_COLLECT ((nrhosts > NLM_HOST_MAX)? 120 * HZ : 60 * HZ)
29#define NLM_HOST_ADDR(sv) (&(sv)->s_nlmclnt->cl_xprt->addr)
30 29
31static struct nlm_host * nlm_hosts[NLM_HOST_NRHASH]; 30static struct nlm_host * nlm_hosts[NLM_HOST_NRHASH];
32static unsigned long next_gc; 31static unsigned long next_gc;
@@ -100,9 +99,9 @@ nlm_lookup_host(int server, struct sockaddr_in *sin,
100 /* Ooops, no host found, create it */ 99 /* Ooops, no host found, create it */
101 dprintk("lockd: creating host entry\n"); 100 dprintk("lockd: creating host entry\n");
102 101
103 if (!(host = (struct nlm_host *) kmalloc(sizeof(*host), GFP_KERNEL))) 102 host = kzalloc(sizeof(*host), GFP_KERNEL);
103 if (!host)
104 goto nohost; 104 goto nohost;
105 memset(host, 0, sizeof(*host));
106 105
107 addr = sin->sin_addr.s_addr; 106 addr = sin->sin_addr.s_addr;
108 sprintf(host->h_name, "%u.%u.%u.%u", NIPQUAD(addr)); 107 sprintf(host->h_name, "%u.%u.%u.%u", NIPQUAD(addr));
@@ -167,7 +166,6 @@ struct rpc_clnt *
167nlm_bind_host(struct nlm_host *host) 166nlm_bind_host(struct nlm_host *host)
168{ 167{
169 struct rpc_clnt *clnt; 168 struct rpc_clnt *clnt;
170 struct rpc_xprt *xprt;
171 169
172 dprintk("lockd: nlm_bind_host(%08x)\n", 170 dprintk("lockd: nlm_bind_host(%08x)\n",
173 (unsigned)ntohl(host->h_addr.sin_addr.s_addr)); 171 (unsigned)ntohl(host->h_addr.sin_addr.s_addr));
@@ -179,7 +177,6 @@ nlm_bind_host(struct nlm_host *host)
179 * RPC rebind is required 177 * RPC rebind is required
180 */ 178 */
181 if ((clnt = host->h_rpcclnt) != NULL) { 179 if ((clnt = host->h_rpcclnt) != NULL) {
182 xprt = clnt->cl_xprt;
183 if (time_after_eq(jiffies, host->h_nextrebind)) { 180 if (time_after_eq(jiffies, host->h_nextrebind)) {
184 rpc_force_rebind(clnt); 181 rpc_force_rebind(clnt);
185 host->h_nextrebind = jiffies + NLM_HOST_REBIND; 182 host->h_nextrebind = jiffies + NLM_HOST_REBIND;
@@ -187,31 +184,37 @@ nlm_bind_host(struct nlm_host *host)
187 host->h_nextrebind - jiffies); 184 host->h_nextrebind - jiffies);
188 } 185 }
189 } else { 186 } else {
190 xprt = xprt_create_proto(host->h_proto, &host->h_addr, NULL); 187 unsigned long increment = nlmsvc_timeout * HZ;
191 if (IS_ERR(xprt)) 188 struct rpc_timeout timeparms = {
192 goto forgetit; 189 .to_initval = increment,
193 190 .to_increment = increment,
194 xprt_set_timeout(&xprt->timeout, 5, nlmsvc_timeout); 191 .to_maxval = increment * 6UL,
195 xprt->resvport = 1; /* NLM requires a reserved port */ 192 .to_retries = 5U,
196 193 };
197 /* Existing NLM servers accept AUTH_UNIX only */ 194 struct rpc_create_args args = {
198 clnt = rpc_new_client(xprt, host->h_name, &nlm_program, 195 .protocol = host->h_proto,
199 host->h_version, RPC_AUTH_UNIX); 196 .address = (struct sockaddr *)&host->h_addr,
200 if (IS_ERR(clnt)) 197 .addrsize = sizeof(host->h_addr),
201 goto forgetit; 198 .timeout = &timeparms,
202 clnt->cl_autobind = 1; /* turn on pmap queries */ 199 .servername = host->h_name,
203 clnt->cl_softrtry = 1; /* All queries are soft */ 200 .program = &nlm_program,
204 201 .version = host->h_version,
205 host->h_rpcclnt = clnt; 202 .authflavor = RPC_AUTH_UNIX,
203 .flags = (RPC_CLNT_CREATE_HARDRTRY |
204 RPC_CLNT_CREATE_AUTOBIND),
205 };
206
207 clnt = rpc_create(&args);
208 if (!IS_ERR(clnt))
209 host->h_rpcclnt = clnt;
210 else {
211 printk("lockd: couldn't create RPC handle for %s\n", host->h_name);
212 clnt = NULL;
213 }
206 } 214 }
207 215
208 mutex_unlock(&host->h_mutex); 216 mutex_unlock(&host->h_mutex);
209 return clnt; 217 return clnt;
210
211forgetit:
212 printk("lockd: couldn't create RPC handle for %s\n", host->h_name);
213 mutex_unlock(&host->h_mutex);
214 return NULL;
215} 218}
216 219
217/* 220/*
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 3fc683f46b3e..5954dcb497e4 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -109,30 +109,23 @@ nsm_unmonitor(struct nlm_host *host)
109static struct rpc_clnt * 109static struct rpc_clnt *
110nsm_create(void) 110nsm_create(void)
111{ 111{
112 struct rpc_xprt *xprt; 112 struct sockaddr_in sin = {
113 struct rpc_clnt *clnt; 113 .sin_family = AF_INET,
114 struct sockaddr_in sin; 114 .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
115 115 .sin_port = 0,
116 sin.sin_family = AF_INET; 116 };
117 sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK); 117 struct rpc_create_args args = {
118 sin.sin_port = 0; 118 .protocol = IPPROTO_UDP,
119 119 .address = (struct sockaddr *)&sin,
120 xprt = xprt_create_proto(IPPROTO_UDP, &sin, NULL); 120 .addrsize = sizeof(sin),
121 if (IS_ERR(xprt)) 121 .servername = "localhost",
122 return (struct rpc_clnt *)xprt; 122 .program = &nsm_program,
123 xprt->resvport = 1; /* NSM requires a reserved port */ 123 .version = SM_VERSION,
124 124 .authflavor = RPC_AUTH_NULL,
125 clnt = rpc_create_client(xprt, "localhost", 125 .flags = (RPC_CLNT_CREATE_ONESHOT),
126 &nsm_program, SM_VERSION, 126 };
127 RPC_AUTH_NULL); 127
128 if (IS_ERR(clnt)) 128 return rpc_create(&args);
129 goto out_err;
130 clnt->cl_softrtry = 1;
131 clnt->cl_oneshot = 1;
132 return clnt;
133
134out_err:
135 return clnt;
136} 129}
137 130
138/* 131/*
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 01b4db9e5466..a92dd98f8401 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -100,11 +100,10 @@ nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result,
100 nlm_debug_print_fh("creating file for", f); 100 nlm_debug_print_fh("creating file for", f);
101 101
102 nfserr = nlm_lck_denied_nolocks; 102 nfserr = nlm_lck_denied_nolocks;
103 file = (struct nlm_file *) kmalloc(sizeof(*file), GFP_KERNEL); 103 file = kzalloc(sizeof(*file), GFP_KERNEL);
104 if (!file) 104 if (!file)
105 goto out_unlock; 105 goto out_unlock;
106 106
107 memset(file, 0, sizeof(*file));
108 memcpy(&file->f_handle, f, sizeof(struct nfs_fh)); 107 memcpy(&file->f_handle, f, sizeof(struct nfs_fh));
109 file->f_hash = hash; 108 file->f_hash = hash;
110 init_MUTEX(&file->f_sema); 109 init_MUTEX(&file->f_sema);
diff --git a/fs/mbcache.c b/fs/mbcache.c
index e4fde1ab22cd..0ff71256e65b 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -160,6 +160,7 @@ __mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
160 160
161static void 161static void
162__mb_cache_entry_release_unlock(struct mb_cache_entry *ce) 162__mb_cache_entry_release_unlock(struct mb_cache_entry *ce)
163 __releases(mb_cache_spinlock)
163{ 164{
164 /* Wake up all processes queuing for this cache entry. */ 165 /* Wake up all processes queuing for this cache entry. */
165 if (ce->e_queued) 166 if (ce->e_queued)
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 4a6abc49418e..df6b1075b549 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -254,7 +254,7 @@ struct inode * minix_new_inode(const struct inode * dir, int * error)
254 inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current->fsgid; 254 inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current->fsgid;
255 inode->i_ino = j; 255 inode->i_ino = j;
256 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 256 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
257 inode->i_blocks = inode->i_blksize = 0; 257 inode->i_blocks = 0;
258 memset(&minix_i(inode)->u, 0, sizeof(minix_i(inode)->u)); 258 memset(&minix_i(inode)->u, 0, sizeof(minix_i(inode)->u));
259 insert_inode_hash(inode); 259 insert_inode_hash(inode);
260 mark_inode_dirty(inode); 260 mark_inode_dirty(inode);
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 330ff9fc7cf0..c11a4b9fb863 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -90,8 +90,7 @@ static int init_inodecache(void)
90 90
91static void destroy_inodecache(void) 91static void destroy_inodecache(void)
92{ 92{
93 if (kmem_cache_destroy(minix_inode_cachep)) 93 kmem_cache_destroy(minix_inode_cachep);
94 printk(KERN_INFO "minix_inode_cache: not all structures were freed\n");
95} 94}
96 95
97static struct super_operations minix_sops = { 96static struct super_operations minix_sops = {
@@ -145,11 +144,10 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
145 struct inode *root_inode; 144 struct inode *root_inode;
146 struct minix_sb_info *sbi; 145 struct minix_sb_info *sbi;
147 146
148 sbi = kmalloc(sizeof(struct minix_sb_info), GFP_KERNEL); 147 sbi = kzalloc(sizeof(struct minix_sb_info), GFP_KERNEL);
149 if (!sbi) 148 if (!sbi)
150 return -ENOMEM; 149 return -ENOMEM;
151 s->s_fs_info = sbi; 150 s->s_fs_info = sbi;
152 memset(sbi, 0, sizeof(struct minix_sb_info));
153 151
154 /* N.B. These should be compile-time tests. 152 /* N.B. These should be compile-time tests.
155 Unfortunately that is impossible. */ 153 Unfortunately that is impossible. */
@@ -207,10 +205,9 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
207 if (sbi->s_imap_blocks == 0 || sbi->s_zmap_blocks == 0) 205 if (sbi->s_imap_blocks == 0 || sbi->s_zmap_blocks == 0)
208 goto out_illegal_sb; 206 goto out_illegal_sb;
209 i = (sbi->s_imap_blocks + sbi->s_zmap_blocks) * sizeof(bh); 207 i = (sbi->s_imap_blocks + sbi->s_zmap_blocks) * sizeof(bh);
210 map = kmalloc(i, GFP_KERNEL); 208 map = kzalloc(i, GFP_KERNEL);
211 if (!map) 209 if (!map)
212 goto out_no_map; 210 goto out_no_map;
213 memset(map, 0, i);
214 sbi->s_imap = &map[0]; 211 sbi->s_imap = &map[0];
215 sbi->s_zmap = &map[sbi->s_imap_blocks]; 212 sbi->s_zmap = &map[sbi->s_imap_blocks];
216 213
@@ -399,7 +396,7 @@ static void V1_minix_read_inode(struct inode * inode)
399 inode->i_mtime.tv_nsec = 0; 396 inode->i_mtime.tv_nsec = 0;
400 inode->i_atime.tv_nsec = 0; 397 inode->i_atime.tv_nsec = 0;
401 inode->i_ctime.tv_nsec = 0; 398 inode->i_ctime.tv_nsec = 0;
402 inode->i_blocks = inode->i_blksize = 0; 399 inode->i_blocks = 0;
403 for (i = 0; i < 9; i++) 400 for (i = 0; i < 9; i++)
404 minix_inode->u.i1_data[i] = raw_inode->i_zone[i]; 401 minix_inode->u.i1_data[i] = raw_inode->i_zone[i];
405 minix_set_inode(inode, old_decode_dev(raw_inode->i_zone[0])); 402 minix_set_inode(inode, old_decode_dev(raw_inode->i_zone[0]));
@@ -432,7 +429,7 @@ static void V2_minix_read_inode(struct inode * inode)
432 inode->i_mtime.tv_nsec = 0; 429 inode->i_mtime.tv_nsec = 0;
433 inode->i_atime.tv_nsec = 0; 430 inode->i_atime.tv_nsec = 0;
434 inode->i_ctime.tv_nsec = 0; 431 inode->i_ctime.tv_nsec = 0;
435 inode->i_blocks = inode->i_blksize = 0; 432 inode->i_blocks = 0;
436 for (i = 0; i < 10; i++) 433 for (i = 0; i < 10; i++)
437 minix_inode->u.i2_data[i] = raw_inode->i_zone[i]; 434 minix_inode->u.i2_data[i] = raw_inode->i_zone[i];
438 minix_set_inode(inode, old_decode_dev(raw_inode->i_zone[0])); 435 minix_set_inode(inode, old_decode_dev(raw_inode->i_zone[0]));
diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c
index 9e44158a7540..d220165d4918 100644
--- a/fs/msdos/namei.c
+++ b/fs/msdos/namei.c
@@ -280,7 +280,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode,
280 struct nameidata *nd) 280 struct nameidata *nd)
281{ 281{
282 struct super_block *sb = dir->i_sb; 282 struct super_block *sb = dir->i_sb;
283 struct inode *inode; 283 struct inode *inode = NULL;
284 struct fat_slot_info sinfo; 284 struct fat_slot_info sinfo;
285 struct timespec ts; 285 struct timespec ts;
286 unsigned char msdos_name[MSDOS_NAME]; 286 unsigned char msdos_name[MSDOS_NAME];
@@ -316,6 +316,8 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode,
316 d_instantiate(dentry, inode); 316 d_instantiate(dentry, inode);
317out: 317out:
318 unlock_kernel(); 318 unlock_kernel();
319 if (!err)
320 err = fat_flush_inodes(sb, dir, inode);
319 return err; 321 return err;
320} 322}
321 323
@@ -348,6 +350,8 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
348 fat_detach(inode); 350 fat_detach(inode);
349out: 351out:
350 unlock_kernel(); 352 unlock_kernel();
353 if (!err)
354 err = fat_flush_inodes(inode->i_sb, dir, inode);
351 355
352 return err; 356 return err;
353} 357}
@@ -401,6 +405,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
401 d_instantiate(dentry, inode); 405 d_instantiate(dentry, inode);
402 406
403 unlock_kernel(); 407 unlock_kernel();
408 fat_flush_inodes(sb, dir, inode);
404 return 0; 409 return 0;
405 410
406out_free: 411out_free:
@@ -430,6 +435,8 @@ static int msdos_unlink(struct inode *dir, struct dentry *dentry)
430 fat_detach(inode); 435 fat_detach(inode);
431out: 436out:
432 unlock_kernel(); 437 unlock_kernel();
438 if (!err)
439 err = fat_flush_inodes(inode->i_sb, dir, inode);
433 440
434 return err; 441 return err;
435} 442}
@@ -635,6 +642,8 @@ static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
635 new_dir, new_msdos_name, new_dentry, is_hid); 642 new_dir, new_msdos_name, new_dentry, is_hid);
636out: 643out:
637 unlock_kernel(); 644 unlock_kernel();
645 if (!err)
646 err = fat_flush_inodes(old_dir->i_sb, old_dir, new_dir);
638 return err; 647 return err;
639} 648}
640 649
diff --git a/fs/namei.c b/fs/namei.c
index 432d6bc6fab0..2892e68d3a86 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -372,6 +372,30 @@ void release_open_intent(struct nameidata *nd)
372 fput(nd->intent.open.file); 372 fput(nd->intent.open.file);
373} 373}
374 374
375static inline struct dentry *
376do_revalidate(struct dentry *dentry, struct nameidata *nd)
377{
378 int status = dentry->d_op->d_revalidate(dentry, nd);
379 if (unlikely(status <= 0)) {
380 /*
381 * The dentry failed validation.
382 * If d_revalidate returned 0 attempt to invalidate
383 * the dentry otherwise d_revalidate is asking us
384 * to return a fail status.
385 */
386 if (!status) {
387 if (!d_invalidate(dentry)) {
388 dput(dentry);
389 dentry = NULL;
390 }
391 } else {
392 dput(dentry);
393 dentry = ERR_PTR(status);
394 }
395 }
396 return dentry;
397}
398
375/* 399/*
376 * Internal lookup() using the new generic dcache. 400 * Internal lookup() using the new generic dcache.
377 * SMP-safe 401 * SMP-safe
@@ -386,12 +410,9 @@ static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name,
386 if (!dentry) 410 if (!dentry)
387 dentry = d_lookup(parent, name); 411 dentry = d_lookup(parent, name);
388 412
389 if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { 413 if (dentry && dentry->d_op && dentry->d_op->d_revalidate)
390 if (!dentry->d_op->d_revalidate(dentry, nd) && !d_invalidate(dentry)) { 414 dentry = do_revalidate(dentry, nd);
391 dput(dentry); 415
392 dentry = NULL;
393 }
394 }
395 return dentry; 416 return dentry;
396} 417}
397 418
@@ -484,10 +505,9 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s
484 */ 505 */
485 mutex_unlock(&dir->i_mutex); 506 mutex_unlock(&dir->i_mutex);
486 if (result->d_op && result->d_op->d_revalidate) { 507 if (result->d_op && result->d_op->d_revalidate) {
487 if (!result->d_op->d_revalidate(result, nd) && !d_invalidate(result)) { 508 result = do_revalidate(result, nd);
488 dput(result); 509 if (!result)
489 result = ERR_PTR(-ENOENT); 510 result = ERR_PTR(-ENOENT);
490 }
491 } 511 }
492 return result; 512 return result;
493} 513}
@@ -498,18 +518,20 @@ static int __emul_lookup_dentry(const char *, struct nameidata *);
498static __always_inline int 518static __always_inline int
499walk_init_root(const char *name, struct nameidata *nd) 519walk_init_root(const char *name, struct nameidata *nd)
500{ 520{
501 read_lock(&current->fs->lock); 521 struct fs_struct *fs = current->fs;
502 if (current->fs->altroot && !(nd->flags & LOOKUP_NOALT)) { 522
503 nd->mnt = mntget(current->fs->altrootmnt); 523 read_lock(&fs->lock);
504 nd->dentry = dget(current->fs->altroot); 524 if (fs->altroot && !(nd->flags & LOOKUP_NOALT)) {
505 read_unlock(&current->fs->lock); 525 nd->mnt = mntget(fs->altrootmnt);
526 nd->dentry = dget(fs->altroot);
527 read_unlock(&fs->lock);
506 if (__emul_lookup_dentry(name,nd)) 528 if (__emul_lookup_dentry(name,nd))
507 return 0; 529 return 0;
508 read_lock(&current->fs->lock); 530 read_lock(&fs->lock);
509 } 531 }
510 nd->mnt = mntget(current->fs->rootmnt); 532 nd->mnt = mntget(fs->rootmnt);
511 nd->dentry = dget(current->fs->root); 533 nd->dentry = dget(fs->root);
512 read_unlock(&current->fs->lock); 534 read_unlock(&fs->lock);
513 return 1; 535 return 1;
514} 536}
515 537
@@ -704,17 +726,19 @@ int follow_down(struct vfsmount **mnt, struct dentry **dentry)
704 726
705static __always_inline void follow_dotdot(struct nameidata *nd) 727static __always_inline void follow_dotdot(struct nameidata *nd)
706{ 728{
729 struct fs_struct *fs = current->fs;
730
707 while(1) { 731 while(1) {
708 struct vfsmount *parent; 732 struct vfsmount *parent;
709 struct dentry *old = nd->dentry; 733 struct dentry *old = nd->dentry;
710 734
711 read_lock(&current->fs->lock); 735 read_lock(&fs->lock);
712 if (nd->dentry == current->fs->root && 736 if (nd->dentry == fs->root &&
713 nd->mnt == current->fs->rootmnt) { 737 nd->mnt == fs->rootmnt) {
714 read_unlock(&current->fs->lock); 738 read_unlock(&fs->lock);
715 break; 739 break;
716 } 740 }
717 read_unlock(&current->fs->lock); 741 read_unlock(&fs->lock);
718 spin_lock(&dcache_lock); 742 spin_lock(&dcache_lock);
719 if (nd->dentry != nd->mnt->mnt_root) { 743 if (nd->dentry != nd->mnt->mnt_root) {
720 nd->dentry = dget(nd->dentry->d_parent); 744 nd->dentry = dget(nd->dentry->d_parent);
@@ -767,12 +791,12 @@ need_lookup:
767 goto done; 791 goto done;
768 792
769need_revalidate: 793need_revalidate:
770 if (dentry->d_op->d_revalidate(dentry, nd)) 794 dentry = do_revalidate(dentry, nd);
771 goto done; 795 if (!dentry)
772 if (d_invalidate(dentry)) 796 goto need_lookup;
773 goto done; 797 if (IS_ERR(dentry))
774 dput(dentry); 798 goto fail;
775 goto need_lookup; 799 goto done;
776 800
777fail: 801fail:
778 return PTR_ERR(dentry); 802 return PTR_ERR(dentry);
@@ -1022,15 +1046,17 @@ static int __emul_lookup_dentry(const char *name, struct nameidata *nd)
1022 struct vfsmount *old_mnt = nd->mnt; 1046 struct vfsmount *old_mnt = nd->mnt;
1023 struct qstr last = nd->last; 1047 struct qstr last = nd->last;
1024 int last_type = nd->last_type; 1048 int last_type = nd->last_type;
1049 struct fs_struct *fs = current->fs;
1050
1025 /* 1051 /*
1026 * NAME was not found in alternate root or it's a directory. Try to find 1052 * NAME was not found in alternate root or it's a directory.
1027 * it in the normal root: 1053 * Try to find it in the normal root:
1028 */ 1054 */
1029 nd->last_type = LAST_ROOT; 1055 nd->last_type = LAST_ROOT;
1030 read_lock(&current->fs->lock); 1056 read_lock(&fs->lock);
1031 nd->mnt = mntget(current->fs->rootmnt); 1057 nd->mnt = mntget(fs->rootmnt);
1032 nd->dentry = dget(current->fs->root); 1058 nd->dentry = dget(fs->root);
1033 read_unlock(&current->fs->lock); 1059 read_unlock(&fs->lock);
1034 if (path_walk(name, nd) == 0) { 1060 if (path_walk(name, nd) == 0) {
1035 if (nd->dentry->d_inode) { 1061 if (nd->dentry->d_inode) {
1036 dput(old_dentry); 1062 dput(old_dentry);
@@ -1054,6 +1080,7 @@ void set_fs_altroot(void)
1054 struct vfsmount *mnt = NULL, *oldmnt; 1080 struct vfsmount *mnt = NULL, *oldmnt;
1055 struct dentry *dentry = NULL, *olddentry; 1081 struct dentry *dentry = NULL, *olddentry;
1056 int err; 1082 int err;
1083 struct fs_struct *fs = current->fs;
1057 1084
1058 if (!emul) 1085 if (!emul)
1059 goto set_it; 1086 goto set_it;
@@ -1063,12 +1090,12 @@ void set_fs_altroot(void)
1063 dentry = nd.dentry; 1090 dentry = nd.dentry;
1064 } 1091 }
1065set_it: 1092set_it:
1066 write_lock(&current->fs->lock); 1093 write_lock(&fs->lock);
1067 oldmnt = current->fs->altrootmnt; 1094 oldmnt = fs->altrootmnt;
1068 olddentry = current->fs->altroot; 1095 olddentry = fs->altroot;
1069 current->fs->altrootmnt = mnt; 1096 fs->altrootmnt = mnt;
1070 current->fs->altroot = dentry; 1097 fs->altroot = dentry;
1071 write_unlock(&current->fs->lock); 1098 write_unlock(&fs->lock);
1072 if (olddentry) { 1099 if (olddentry) {
1073 dput(olddentry); 1100 dput(olddentry);
1074 mntput(oldmnt); 1101 mntput(oldmnt);
@@ -1082,29 +1109,30 @@ static int fastcall do_path_lookup(int dfd, const char *name,
1082 int retval = 0; 1109 int retval = 0;
1083 int fput_needed; 1110 int fput_needed;
1084 struct file *file; 1111 struct file *file;
1112 struct fs_struct *fs = current->fs;
1085 1113
1086 nd->last_type = LAST_ROOT; /* if there are only slashes... */ 1114 nd->last_type = LAST_ROOT; /* if there are only slashes... */
1087 nd->flags = flags; 1115 nd->flags = flags;
1088 nd->depth = 0; 1116 nd->depth = 0;
1089 1117
1090 if (*name=='/') { 1118 if (*name=='/') {
1091 read_lock(&current->fs->lock); 1119 read_lock(&fs->lock);
1092 if (current->fs->altroot && !(nd->flags & LOOKUP_NOALT)) { 1120 if (fs->altroot && !(nd->flags & LOOKUP_NOALT)) {
1093 nd->mnt = mntget(current->fs->altrootmnt); 1121 nd->mnt = mntget(fs->altrootmnt);
1094 nd->dentry = dget(current->fs->altroot); 1122 nd->dentry = dget(fs->altroot);
1095 read_unlock(&current->fs->lock); 1123 read_unlock(&fs->lock);
1096 if (__emul_lookup_dentry(name,nd)) 1124 if (__emul_lookup_dentry(name,nd))
1097 goto out; /* found in altroot */ 1125 goto out; /* found in altroot */
1098 read_lock(&current->fs->lock); 1126 read_lock(&fs->lock);
1099 } 1127 }
1100 nd->mnt = mntget(current->fs->rootmnt); 1128 nd->mnt = mntget(fs->rootmnt);
1101 nd->dentry = dget(current->fs->root); 1129 nd->dentry = dget(fs->root);
1102 read_unlock(&current->fs->lock); 1130 read_unlock(&fs->lock);
1103 } else if (dfd == AT_FDCWD) { 1131 } else if (dfd == AT_FDCWD) {
1104 read_lock(&current->fs->lock); 1132 read_lock(&fs->lock);
1105 nd->mnt = mntget(current->fs->pwdmnt); 1133 nd->mnt = mntget(fs->pwdmnt);
1106 nd->dentry = dget(current->fs->pwd); 1134 nd->dentry = dget(fs->pwd);
1107 read_unlock(&current->fs->lock); 1135 read_unlock(&fs->lock);
1108 } else { 1136 } else {
1109 struct dentry *dentry; 1137 struct dentry *dentry;
1110 1138
@@ -2370,7 +2398,8 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
2370 dput(new_dentry); 2398 dput(new_dentry);
2371 } 2399 }
2372 if (!error) 2400 if (!error)
2373 d_move(old_dentry,new_dentry); 2401 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
2402 d_move(old_dentry,new_dentry);
2374 return error; 2403 return error;
2375} 2404}
2376 2405
@@ -2393,8 +2422,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
2393 else 2422 else
2394 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); 2423 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
2395 if (!error) { 2424 if (!error) {
2396 /* The following d_move() should become unconditional */ 2425 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
2397 if (!(old_dir->i_sb->s_type->fs_flags & FS_ODD_RENAME))
2398 d_move(old_dentry, new_dentry); 2426 d_move(old_dentry, new_dentry);
2399 } 2427 }
2400 if (target) 2428 if (target)
diff --git a/fs/namespace.c b/fs/namespace.c
index fa7ed6a9fc2d..6ede3a539ed8 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -13,10 +13,12 @@
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <linux/smp_lock.h> 14#include <linux/smp_lock.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/kernel.h>
16#include <linux/quotaops.h> 17#include <linux/quotaops.h>
17#include <linux/acct.h> 18#include <linux/acct.h>
18#include <linux/capability.h> 19#include <linux/capability.h>
19#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/sysfs.h>
20#include <linux/seq_file.h> 22#include <linux/seq_file.h>
21#include <linux/namespace.h> 23#include <linux/namespace.h>
22#include <linux/namei.h> 24#include <linux/namei.h>
@@ -28,15 +30,6 @@
28 30
29extern int __init init_rootfs(void); 31extern int __init init_rootfs(void);
30 32
31#ifdef CONFIG_SYSFS
32extern int __init sysfs_init(void);
33#else
34static inline int sysfs_init(void)
35{
36 return 0;
37}
38#endif
39
40/* spinlock for vfsmount related operations, inplace of dcache_lock */ 33/* spinlock for vfsmount related operations, inplace of dcache_lock */
41__cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock); 34__cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
42 35
@@ -1821,6 +1814,7 @@ void __init mnt_init(unsigned long mempages)
1821 struct list_head *d; 1814 struct list_head *d;
1822 unsigned int nr_hash; 1815 unsigned int nr_hash;
1823 int i; 1816 int i;
1817 int err;
1824 1818
1825 init_rwsem(&namespace_sem); 1819 init_rwsem(&namespace_sem);
1826 1820
@@ -1861,8 +1855,14 @@ void __init mnt_init(unsigned long mempages)
1861 d++; 1855 d++;
1862 i--; 1856 i--;
1863 } while (i); 1857 } while (i);
1864 sysfs_init(); 1858 err = sysfs_init();
1865 subsystem_register(&fs_subsys); 1859 if (err)
1860 printk(KERN_WARNING "%s: sysfs_init error: %d\n",
1861 __FUNCTION__, err);
1862 err = subsystem_register(&fs_subsys);
1863 if (err)
1864 printk(KERN_WARNING "%s: subsystem_register error: %d\n",
1865 __FUNCTION__, err);
1866 init_rootfs(); 1866 init_rootfs();
1867 init_mount_tree(); 1867 init_mount_tree();
1868} 1868}
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 1ddf77b0b825..42e3bef270c9 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -81,8 +81,7 @@ static int init_inodecache(void)
81 81
82static void destroy_inodecache(void) 82static void destroy_inodecache(void)
83{ 83{
84 if (kmem_cache_destroy(ncp_inode_cachep)) 84 kmem_cache_destroy(ncp_inode_cachep);
85 printk(KERN_INFO "ncp_inode_cache: not all structures were freed\n");
86} 85}
87 86
88static int ncp_remount(struct super_block *sb, int *flags, char* data) 87static int ncp_remount(struct super_block *sb, int *flags, char* data)
@@ -224,7 +223,6 @@ static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo)
224 inode->i_nlink = 1; 223 inode->i_nlink = 1;
225 inode->i_uid = server->m.uid; 224 inode->i_uid = server->m.uid;
226 inode->i_gid = server->m.gid; 225 inode->i_gid = server->m.gid;
227 inode->i_blksize = NCP_BLOCK_SIZE;
228 226
229 ncp_update_dates(inode, &nwinfo->i); 227 ncp_update_dates(inode, &nwinfo->i);
230 ncp_update_inode(inode, nwinfo); 228 ncp_update_inode(inode, nwinfo);
@@ -411,11 +409,10 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
411#endif 409#endif
412 struct ncp_entry_info finfo; 410 struct ncp_entry_info finfo;
413 411
414 server = kmalloc(sizeof(struct ncp_server), GFP_KERNEL); 412 server = kzalloc(sizeof(struct ncp_server), GFP_KERNEL);
415 if (!server) 413 if (!server)
416 return -ENOMEM; 414 return -ENOMEM;
417 sb->s_fs_info = server; 415 sb->s_fs_info = server;
418 memset(server, 0, sizeof(struct ncp_server));
419 416
420 error = -EFAULT; 417 error = -EFAULT;
421 if (raw_data == NULL) 418 if (raw_data == NULL)
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index ca92c2406635..e3d26c1bd105 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -48,7 +48,7 @@ static int ncp_symlink_readpage(struct file *file, struct page *page)
48 char *buf = kmap(page); 48 char *buf = kmap(page);
49 49
50 error = -ENOMEM; 50 error = -ENOMEM;
51 rawlink=(char *)kmalloc(NCP_MAX_SYMLINK_SIZE, GFP_KERNEL); 51 rawlink = kmalloc(NCP_MAX_SYMLINK_SIZE, GFP_KERNEL);
52 if (!rawlink) 52 if (!rawlink)
53 goto fail; 53 goto fail;
54 54
@@ -126,7 +126,7 @@ int ncp_symlink(struct inode *dir, struct dentry *dentry, const char *symname) {
126 /* EPERM is returned by VFS if symlink procedure does not exist */ 126 /* EPERM is returned by VFS if symlink procedure does not exist */
127 return -EPERM; 127 return -EPERM;
128 128
129 rawlink=(char *)kmalloc(NCP_MAX_SYMLINK_SIZE, GFP_KERNEL); 129 rawlink = kmalloc(NCP_MAX_SYMLINK_SIZE, GFP_KERNEL);
130 if (!rawlink) 130 if (!rawlink)
131 return -ENOMEM; 131 return -ENOMEM;
132 132
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 0b572a0c1967..f4580b44eef4 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -4,9 +4,9 @@
4 4
5obj-$(CONFIG_NFS_FS) += nfs.o 5obj-$(CONFIG_NFS_FS) += nfs.o
6 6
7nfs-y := dir.o file.o inode.o super.o nfs2xdr.o pagelist.o \ 7nfs-y := client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \
8 proc.o read.o symlink.o unlink.o write.o \ 8 pagelist.o proc.o read.o symlink.o unlink.o \
9 namespace.o 9 write.o namespace.o
10nfs-$(CONFIG_ROOT_NFS) += nfsroot.o mount_clnt.o 10nfs-$(CONFIG_ROOT_NFS) += nfsroot.o mount_clnt.o
11nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o 11nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o
12nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o 12nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index fe0a6b8ac149..a3ee11364db0 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -19,6 +19,7 @@
19 19
20#include "nfs4_fs.h" 20#include "nfs4_fs.h"
21#include "callback.h" 21#include "callback.h"
22#include "internal.h"
22 23
23#define NFSDBG_FACILITY NFSDBG_CALLBACK 24#define NFSDBG_FACILITY NFSDBG_CALLBACK
24 25
@@ -36,6 +37,21 @@ static struct svc_program nfs4_callback_program;
36 37
37unsigned int nfs_callback_set_tcpport; 38unsigned int nfs_callback_set_tcpport;
38unsigned short nfs_callback_tcpport; 39unsigned short nfs_callback_tcpport;
40static const int nfs_set_port_min = 0;
41static const int nfs_set_port_max = 65535;
42
43static int param_set_port(const char *val, struct kernel_param *kp)
44{
45 char *endp;
46 int num = simple_strtol(val, &endp, 0);
47 if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max)
48 return -EINVAL;
49 *((int *)kp->arg) = num;
50 return 0;
51}
52
53module_param_call(callback_tcpport, param_set_port, param_get_int,
54 &nfs_callback_set_tcpport, 0644);
39 55
40/* 56/*
41 * This is the callback kernel thread. 57 * This is the callback kernel thread.
@@ -134,10 +150,8 @@ out_err:
134/* 150/*
135 * Kill the server process if it is not already up. 151 * Kill the server process if it is not already up.
136 */ 152 */
137int nfs_callback_down(void) 153void nfs_callback_down(void)
138{ 154{
139 int ret = 0;
140
141 lock_kernel(); 155 lock_kernel();
142 mutex_lock(&nfs_callback_mutex); 156 mutex_lock(&nfs_callback_mutex);
143 nfs_callback_info.users--; 157 nfs_callback_info.users--;
@@ -149,20 +163,19 @@ int nfs_callback_down(void)
149 } while (wait_for_completion_timeout(&nfs_callback_info.stopped, 5*HZ) == 0); 163 } while (wait_for_completion_timeout(&nfs_callback_info.stopped, 5*HZ) == 0);
150 mutex_unlock(&nfs_callback_mutex); 164 mutex_unlock(&nfs_callback_mutex);
151 unlock_kernel(); 165 unlock_kernel();
152 return ret;
153} 166}
154 167
155static int nfs_callback_authenticate(struct svc_rqst *rqstp) 168static int nfs_callback_authenticate(struct svc_rqst *rqstp)
156{ 169{
157 struct in_addr *addr = &rqstp->rq_addr.sin_addr; 170 struct sockaddr_in *addr = &rqstp->rq_addr;
158 struct nfs4_client *clp; 171 struct nfs_client *clp;
159 172
160 /* Don't talk to strangers */ 173 /* Don't talk to strangers */
161 clp = nfs4_find_client(addr); 174 clp = nfs_find_client(addr, 4);
162 if (clp == NULL) 175 if (clp == NULL)
163 return SVC_DROP; 176 return SVC_DROP;
164 dprintk("%s: %u.%u.%u.%u NFSv4 callback!\n", __FUNCTION__, NIPQUAD(addr)); 177 dprintk("%s: %u.%u.%u.%u NFSv4 callback!\n", __FUNCTION__, NIPQUAD(addr->sin_addr));
165 nfs4_put_client(clp); 178 nfs_put_client(clp);
166 switch (rqstp->rq_authop->flavour) { 179 switch (rqstp->rq_authop->flavour) {
167 case RPC_AUTH_NULL: 180 case RPC_AUTH_NULL:
168 if (rqstp->rq_proc != CB_NULL) 181 if (rqstp->rq_proc != CB_NULL)
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index b252e7fe53a5..5676163d26e8 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -62,8 +62,13 @@ struct cb_recallargs {
62extern unsigned nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); 62extern unsigned nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res);
63extern unsigned nfs4_callback_recall(struct cb_recallargs *args, void *dummy); 63extern unsigned nfs4_callback_recall(struct cb_recallargs *args, void *dummy);
64 64
65#ifdef CONFIG_NFS_V4
65extern int nfs_callback_up(void); 66extern int nfs_callback_up(void);
66extern int nfs_callback_down(void); 67extern void nfs_callback_down(void);
68#else
69#define nfs_callback_up() (0)
70#define nfs_callback_down() do {} while(0)
71#endif
67 72
68extern unsigned int nfs_callback_set_tcpport; 73extern unsigned int nfs_callback_set_tcpport;
69extern unsigned short nfs_callback_tcpport; 74extern unsigned short nfs_callback_tcpport;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 7719483ecdfc..97cf8f71451f 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -10,19 +10,20 @@
10#include "nfs4_fs.h" 10#include "nfs4_fs.h"
11#include "callback.h" 11#include "callback.h"
12#include "delegation.h" 12#include "delegation.h"
13#include "internal.h"
13 14
14#define NFSDBG_FACILITY NFSDBG_CALLBACK 15#define NFSDBG_FACILITY NFSDBG_CALLBACK
15 16
16unsigned nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res) 17unsigned nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res)
17{ 18{
18 struct nfs4_client *clp; 19 struct nfs_client *clp;
19 struct nfs_delegation *delegation; 20 struct nfs_delegation *delegation;
20 struct nfs_inode *nfsi; 21 struct nfs_inode *nfsi;
21 struct inode *inode; 22 struct inode *inode;
22 23
23 res->bitmap[0] = res->bitmap[1] = 0; 24 res->bitmap[0] = res->bitmap[1] = 0;
24 res->status = htonl(NFS4ERR_BADHANDLE); 25 res->status = htonl(NFS4ERR_BADHANDLE);
25 clp = nfs4_find_client(&args->addr->sin_addr); 26 clp = nfs_find_client(args->addr, 4);
26 if (clp == NULL) 27 if (clp == NULL)
27 goto out; 28 goto out;
28 inode = nfs_delegation_find_inode(clp, &args->fh); 29 inode = nfs_delegation_find_inode(clp, &args->fh);
@@ -48,7 +49,7 @@ out_iput:
48 up_read(&nfsi->rwsem); 49 up_read(&nfsi->rwsem);
49 iput(inode); 50 iput(inode);
50out_putclient: 51out_putclient:
51 nfs4_put_client(clp); 52 nfs_put_client(clp);
52out: 53out:
53 dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(res->status)); 54 dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(res->status));
54 return res->status; 55 return res->status;
@@ -56,12 +57,12 @@ out:
56 57
57unsigned nfs4_callback_recall(struct cb_recallargs *args, void *dummy) 58unsigned nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
58{ 59{
59 struct nfs4_client *clp; 60 struct nfs_client *clp;
60 struct inode *inode; 61 struct inode *inode;
61 unsigned res; 62 unsigned res;
62 63
63 res = htonl(NFS4ERR_BADHANDLE); 64 res = htonl(NFS4ERR_BADHANDLE);
64 clp = nfs4_find_client(&args->addr->sin_addr); 65 clp = nfs_find_client(args->addr, 4);
65 if (clp == NULL) 66 if (clp == NULL)
66 goto out; 67 goto out;
67 inode = nfs_delegation_find_inode(clp, &args->fh); 68 inode = nfs_delegation_find_inode(clp, &args->fh);
@@ -80,7 +81,7 @@ unsigned nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
80 } 81 }
81 iput(inode); 82 iput(inode);
82out_putclient: 83out_putclient:
83 nfs4_put_client(clp); 84 nfs_put_client(clp);
84out: 85out:
85 dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(res)); 86 dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(res));
86 return res; 87 return res;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
new file mode 100644
index 000000000000..ec1938d4b814
--- /dev/null
+++ b/fs/nfs/client.c
@@ -0,0 +1,1448 @@
1/* client.c: NFS client sharing and management code
2 *
3 * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12
13#include <linux/config.h>
14#include <linux/module.h>
15#include <linux/init.h>
16
17#include <linux/time.h>
18#include <linux/kernel.h>
19#include <linux/mm.h>
20#include <linux/string.h>
21#include <linux/stat.h>
22#include <linux/errno.h>
23#include <linux/unistd.h>
24#include <linux/sunrpc/clnt.h>
25#include <linux/sunrpc/stats.h>
26#include <linux/sunrpc/metrics.h>
27#include <linux/nfs_fs.h>
28#include <linux/nfs_mount.h>
29#include <linux/nfs4_mount.h>
30#include <linux/lockd/bind.h>
31#include <linux/smp_lock.h>
32#include <linux/seq_file.h>
33#include <linux/mount.h>
34#include <linux/nfs_idmap.h>
35#include <linux/vfs.h>
36#include <linux/inet.h>
37#include <linux/nfs_xdr.h>
38
39#include <asm/system.h>
40
41#include "nfs4_fs.h"
42#include "callback.h"
43#include "delegation.h"
44#include "iostat.h"
45#include "internal.h"
46
47#define NFSDBG_FACILITY NFSDBG_CLIENT
48
49static DEFINE_SPINLOCK(nfs_client_lock);
50static LIST_HEAD(nfs_client_list);
51static LIST_HEAD(nfs_volume_list);
52static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq);
53
54/*
55 * RPC cruft for NFS
56 */
57static struct rpc_version *nfs_version[5] = {
58 [2] = &nfs_version2,
59#ifdef CONFIG_NFS_V3
60 [3] = &nfs_version3,
61#endif
62#ifdef CONFIG_NFS_V4
63 [4] = &nfs_version4,
64#endif
65};
66
67struct rpc_program nfs_program = {
68 .name = "nfs",
69 .number = NFS_PROGRAM,
70 .nrvers = ARRAY_SIZE(nfs_version),
71 .version = nfs_version,
72 .stats = &nfs_rpcstat,
73 .pipe_dir_name = "/nfs",
74};
75
76struct rpc_stat nfs_rpcstat = {
77 .program = &nfs_program
78};
79
80
81#ifdef CONFIG_NFS_V3_ACL
82static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program };
83static struct rpc_version * nfsacl_version[] = {
84 [3] = &nfsacl_version3,
85};
86
87struct rpc_program nfsacl_program = {
88 .name = "nfsacl",
89 .number = NFS_ACL_PROGRAM,
90 .nrvers = ARRAY_SIZE(nfsacl_version),
91 .version = nfsacl_version,
92 .stats = &nfsacl_rpcstat,
93};
94#endif /* CONFIG_NFS_V3_ACL */
95
96/*
97 * Allocate a shared client record
98 *
99 * Since these are allocated/deallocated very rarely, we don't
100 * bother putting them in a slab cache...
101 */
102static struct nfs_client *nfs_alloc_client(const char *hostname,
103 const struct sockaddr_in *addr,
104 int nfsversion)
105{
106 struct nfs_client *clp;
107 int error;
108
109 if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL)
110 goto error_0;
111
112 error = rpciod_up();
113 if (error < 0) {
114 dprintk("%s: couldn't start rpciod! Error = %d\n",
115 __FUNCTION__, error);
116 goto error_1;
117 }
118 __set_bit(NFS_CS_RPCIOD, &clp->cl_res_state);
119
120 if (nfsversion == 4) {
121 if (nfs_callback_up() < 0)
122 goto error_2;
123 __set_bit(NFS_CS_CALLBACK, &clp->cl_res_state);
124 }
125
126 atomic_set(&clp->cl_count, 1);
127 clp->cl_cons_state = NFS_CS_INITING;
128
129 clp->cl_nfsversion = nfsversion;
130 memcpy(&clp->cl_addr, addr, sizeof(clp->cl_addr));
131
132 if (hostname) {
133 clp->cl_hostname = kstrdup(hostname, GFP_KERNEL);
134 if (!clp->cl_hostname)
135 goto error_3;
136 }
137
138 INIT_LIST_HEAD(&clp->cl_superblocks);
139 clp->cl_rpcclient = ERR_PTR(-EINVAL);
140
141#ifdef CONFIG_NFS_V4
142 init_rwsem(&clp->cl_sem);
143 INIT_LIST_HEAD(&clp->cl_delegations);
144 INIT_LIST_HEAD(&clp->cl_state_owners);
145 INIT_LIST_HEAD(&clp->cl_unused);
146 spin_lock_init(&clp->cl_lock);
147 INIT_WORK(&clp->cl_renewd, nfs4_renew_state, clp);
148 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
149 clp->cl_boot_time = CURRENT_TIME;
150 clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
151#endif
152
153 return clp;
154
155error_3:
156 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
157 nfs_callback_down();
158error_2:
159 rpciod_down();
160 __clear_bit(NFS_CS_RPCIOD, &clp->cl_res_state);
161error_1:
162 kfree(clp);
163error_0:
164 return NULL;
165}
166
/*
 * Tear down the NFSv4-specific parts of a client record: stop lease
 * renewal, free the cache of unused state owners and drop the idmapper.
 * Compiles to a no-op on !CONFIG_NFS_V4 builds.
 */
static void nfs4_shutdown_client(struct nfs_client *clp)
{
#ifdef CONFIG_NFS_V4
	struct nfs4_state_owner *owner;

	if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
		nfs4_kill_renewd(clp);

	/* drain the cache of unused state owners */
	while (!list_empty(&clp->cl_unused)) {
		owner = list_entry(clp->cl_unused.next,
				   struct nfs4_state_owner, so_list);
		list_del(&owner->so_list);
		kfree(owner);
	}

	/* any state owner still in use here is a refcounting bug */
	BUG_ON(!list_empty(&clp->cl_state_owners));

	if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
		nfs_idmap_delete(clp);
#endif
}
186
187/*
188 * Destroy a shared client record
189 */
190static void nfs_free_client(struct nfs_client *clp)
191{
192 dprintk("--> nfs_free_client(%d)\n", clp->cl_nfsversion);
193
194 nfs4_shutdown_client(clp);
195
196 /* -EIO all pending I/O */
197 if (!IS_ERR(clp->cl_rpcclient))
198 rpc_shutdown_client(clp->cl_rpcclient);
199
200 if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
201 nfs_callback_down();
202
203 if (__test_and_clear_bit(NFS_CS_RPCIOD, &clp->cl_res_state))
204 rpciod_down();
205
206 kfree(clp->cl_hostname);
207 kfree(clp);
208
209 dprintk("<-- nfs_free_client()\n");
210}
211
212/*
213 * Release a reference to a shared client record
214 */
215void nfs_put_client(struct nfs_client *clp)
216{
217 if (!clp)
218 return;
219
220 dprintk("--> nfs_put_client({%d})\n", atomic_read(&clp->cl_count));
221
222 if (atomic_dec_and_lock(&clp->cl_count, &nfs_client_lock)) {
223 list_del(&clp->cl_share_link);
224 spin_unlock(&nfs_client_lock);
225
226 BUG_ON(!list_empty(&clp->cl_superblocks));
227
228 nfs_free_client(clp);
229 }
230}
231
232/*
233 * Find a client by address
234 * - caller must hold nfs_client_lock
235 */
236static struct nfs_client *__nfs_find_client(const struct sockaddr_in *addr, int nfsversion)
237{
238 struct nfs_client *clp;
239
240 list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
241 /* Different NFS versions cannot share the same nfs_client */
242 if (clp->cl_nfsversion != nfsversion)
243 continue;
244
245 if (memcmp(&clp->cl_addr.sin_addr, &addr->sin_addr,
246 sizeof(clp->cl_addr.sin_addr)) != 0)
247 continue;
248
249 if (clp->cl_addr.sin_port == addr->sin_port)
250 goto found;
251 }
252
253 return NULL;
254
255found:
256 atomic_inc(&clp->cl_count);
257 return clp;
258}
259
260/*
261 * Find a client by IP address and protocol version
262 * - returns NULL if no such client
263 */
264struct nfs_client *nfs_find_client(const struct sockaddr_in *addr, int nfsversion)
265{
266 struct nfs_client *clp;
267
268 spin_lock(&nfs_client_lock);
269 clp = __nfs_find_client(addr, nfsversion);
270 spin_unlock(&nfs_client_lock);
271
272 BUG_ON(clp && clp->cl_cons_state == 0);
273
274 return clp;
275}
276
277/*
278 * Look up a client by IP address and protocol version
279 * - creates a new record if one doesn't yet exist
280 */
281static struct nfs_client *nfs_get_client(const char *hostname,
282 const struct sockaddr_in *addr,
283 int nfsversion)
284{
285 struct nfs_client *clp, *new = NULL;
286 int error;
287
288 dprintk("--> nfs_get_client(%s,"NIPQUAD_FMT":%d,%d)\n",
289 hostname ?: "", NIPQUAD(addr->sin_addr),
290 addr->sin_port, nfsversion);
291
292 /* see if the client already exists */
293 do {
294 spin_lock(&nfs_client_lock);
295
296 clp = __nfs_find_client(addr, nfsversion);
297 if (clp)
298 goto found_client;
299 if (new)
300 goto install_client;
301
302 spin_unlock(&nfs_client_lock);
303
304 new = nfs_alloc_client(hostname, addr, nfsversion);
305 } while (new);
306
307 return ERR_PTR(-ENOMEM);
308
309 /* install a new client and return with it unready */
310install_client:
311 clp = new;
312 list_add(&clp->cl_share_link, &nfs_client_list);
313 spin_unlock(&nfs_client_lock);
314 dprintk("--> nfs_get_client() = %p [new]\n", clp);
315 return clp;
316
317 /* found an existing client
318 * - make sure it's ready before returning
319 */
320found_client:
321 spin_unlock(&nfs_client_lock);
322
323 if (new)
324 nfs_free_client(new);
325
326 if (clp->cl_cons_state == NFS_CS_INITING) {
327 DECLARE_WAITQUEUE(myself, current);
328
329 add_wait_queue(&nfs_client_active_wq, &myself);
330
331 for (;;) {
332 set_current_state(TASK_INTERRUPTIBLE);
333 if (signal_pending(current) ||
334 clp->cl_cons_state > NFS_CS_READY)
335 break;
336 schedule();
337 }
338
339 remove_wait_queue(&nfs_client_active_wq, &myself);
340
341 if (signal_pending(current)) {
342 nfs_put_client(clp);
343 return ERR_PTR(-ERESTARTSYS);
344 }
345 }
346
347 if (clp->cl_cons_state < NFS_CS_READY) {
348 error = clp->cl_cons_state;
349 nfs_put_client(clp);
350 return ERR_PTR(error);
351 }
352
353 BUG_ON(clp->cl_cons_state != NFS_CS_READY);
354
355 dprintk("--> nfs_get_client() = %p [share]\n", clp);
356 return clp;
357}
358
359/*
360 * Mark a server as ready or failed
361 */
362static void nfs_mark_client_ready(struct nfs_client *clp, int state)
363{
364 clp->cl_cons_state = state;
365 wake_up_all(&nfs_client_active_wq);
366}
367
368/*
369 * Initialise the timeout values for a connection
370 */
371static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
372 unsigned int timeo, unsigned int retrans)
373{
374 to->to_initval = timeo * HZ / 10;
375 to->to_retries = retrans;
376 if (!to->to_retries)
377 to->to_retries = 2;
378
379 switch (proto) {
380 case IPPROTO_TCP:
381 if (!to->to_initval)
382 to->to_initval = 60 * HZ;
383 if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
384 to->to_initval = NFS_MAX_TCP_TIMEOUT;
385 to->to_increment = to->to_initval;
386 to->to_maxval = to->to_initval + (to->to_increment * to->to_retries);
387 to->to_exponential = 0;
388 break;
389 case IPPROTO_UDP:
390 default:
391 if (!to->to_initval)
392 to->to_initval = 11 * HZ / 10;
393 if (to->to_initval > NFS_MAX_UDP_TIMEOUT)
394 to->to_initval = NFS_MAX_UDP_TIMEOUT;
395 to->to_maxval = NFS_MAX_UDP_TIMEOUT;
396 to->to_exponential = 1;
397 break;
398 }
399}
400
401/*
402 * Create an RPC client handle
403 */
404static int nfs_create_rpc_client(struct nfs_client *clp, int proto,
405 unsigned int timeo,
406 unsigned int retrans,
407 rpc_authflavor_t flavor)
408{
409 struct rpc_timeout timeparms;
410 struct rpc_clnt *clnt = NULL;
411 struct rpc_create_args args = {
412 .protocol = proto,
413 .address = (struct sockaddr *)&clp->cl_addr,
414 .addrsize = sizeof(clp->cl_addr),
415 .timeout = &timeparms,
416 .servername = clp->cl_hostname,
417 .program = &nfs_program,
418 .version = clp->rpc_ops->version,
419 .authflavor = flavor,
420 };
421
422 if (!IS_ERR(clp->cl_rpcclient))
423 return 0;
424
425 nfs_init_timeout_values(&timeparms, proto, timeo, retrans);
426 clp->retrans_timeo = timeparms.to_initval;
427 clp->retrans_count = timeparms.to_retries;
428
429 clnt = rpc_create(&args);
430 if (IS_ERR(clnt)) {
431 dprintk("%s: cannot create RPC client. Error = %ld\n",
432 __FUNCTION__, PTR_ERR(clnt));
433 return PTR_ERR(clnt);
434 }
435
436 clp->cl_rpcclient = clnt;
437 return 0;
438}
439
440/*
441 * Version 2 or 3 client destruction
442 */
443static void nfs_destroy_server(struct nfs_server *server)
444{
445 if (!IS_ERR(server->client_acl))
446 rpc_shutdown_client(server->client_acl);
447
448 if (!(server->flags & NFS_MOUNT_NONLM))
449 lockd_down(); /* release rpc.lockd */
450}
451
452/*
453 * Version 2 or 3 lockd setup
454 */
455static int nfs_start_lockd(struct nfs_server *server)
456{
457 int error = 0;
458
459 if (server->nfs_client->cl_nfsversion > 3)
460 goto out;
461 if (server->flags & NFS_MOUNT_NONLM)
462 goto out;
463 error = lockd_up();
464 if (error < 0)
465 server->flags |= NFS_MOUNT_NONLM;
466 else
467 server->destroy = nfs_destroy_server;
468out:
469 return error;
470}
471
472/*
473 * Initialise an NFSv3 ACL client connection
474 */
475#ifdef CONFIG_NFS_V3_ACL
476static void nfs_init_server_aclclient(struct nfs_server *server)
477{
478 if (server->nfs_client->cl_nfsversion != 3)
479 goto out_noacl;
480 if (server->flags & NFS_MOUNT_NOACL)
481 goto out_noacl;
482
483 server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3);
484 if (IS_ERR(server->client_acl))
485 goto out_noacl;
486
487 /* No errors! Assume that Sun nfsacls are supported */
488 server->caps |= NFS_CAP_ACLS;
489 return;
490
491out_noacl:
492 server->caps &= ~NFS_CAP_ACLS;
493}
494#else
495static inline void nfs_init_server_aclclient(struct nfs_server *server)
496{
497 server->flags &= ~NFS_MOUNT_NOACL;
498 server->caps &= ~NFS_CAP_ACLS;
499}
500#endif
501
502/*
503 * Create a general RPC client
504 */
505static int nfs_init_server_rpcclient(struct nfs_server *server, rpc_authflavor_t pseudoflavour)
506{
507 struct nfs_client *clp = server->nfs_client;
508
509 server->client = rpc_clone_client(clp->cl_rpcclient);
510 if (IS_ERR(server->client)) {
511 dprintk("%s: couldn't create rpc_client!\n", __FUNCTION__);
512 return PTR_ERR(server->client);
513 }
514
515 if (pseudoflavour != clp->cl_rpcclient->cl_auth->au_flavor) {
516 struct rpc_auth *auth;
517
518 auth = rpcauth_create(pseudoflavour, server->client);
519 if (IS_ERR(auth)) {
520 dprintk("%s: couldn't create credcache!\n", __FUNCTION__);
521 return PTR_ERR(auth);
522 }
523 }
524 server->client->cl_softrtry = 0;
525 if (server->flags & NFS_MOUNT_SOFT)
526 server->client->cl_softrtry = 1;
527
528 server->client->cl_intr = 0;
529 if (server->flags & NFS4_MOUNT_INTR)
530 server->client->cl_intr = 1;
531
532 return 0;
533}
534
535/*
536 * Initialise an NFS2 or NFS3 client
537 */
538static int nfs_init_client(struct nfs_client *clp, const struct nfs_mount_data *data)
539{
540 int proto = (data->flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
541 int error;
542
543 if (clp->cl_cons_state == NFS_CS_READY) {
544 /* the client is already initialised */
545 dprintk("<-- nfs_init_client() = 0 [already %p]\n", clp);
546 return 0;
547 }
548
549 /* Check NFS protocol revision and initialize RPC op vector */
550 clp->rpc_ops = &nfs_v2_clientops;
551#ifdef CONFIG_NFS_V3
552 if (clp->cl_nfsversion == 3)
553 clp->rpc_ops = &nfs_v3_clientops;
554#endif
555 /*
556 * Create a client RPC handle for doing FSSTAT with UNIX auth only
557 * - RFC 2623, sec 2.3.2
558 */
559 error = nfs_create_rpc_client(clp, proto, data->timeo, data->retrans,
560 RPC_AUTH_UNIX);
561 if (error < 0)
562 goto error;
563 nfs_mark_client_ready(clp, NFS_CS_READY);
564 return 0;
565
566error:
567 nfs_mark_client_ready(clp, error);
568 dprintk("<-- nfs_init_client() = xerror %d\n", error);
569 return error;
570}
571
572/*
573 * Create a version 2 or 3 client
574 */
575static int nfs_init_server(struct nfs_server *server, const struct nfs_mount_data *data)
576{
577 struct nfs_client *clp;
578 int error, nfsvers = 2;
579
580 dprintk("--> nfs_init_server()\n");
581
582#ifdef CONFIG_NFS_V3
583 if (data->flags & NFS_MOUNT_VER3)
584 nfsvers = 3;
585#endif
586
587 /* Allocate or find a client reference we can use */
588 clp = nfs_get_client(data->hostname, &data->addr, nfsvers);
589 if (IS_ERR(clp)) {
590 dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
591 return PTR_ERR(clp);
592 }
593
594 error = nfs_init_client(clp, data);
595 if (error < 0)
596 goto error;
597
598 server->nfs_client = clp;
599
600 /* Initialise the client representation from the mount data */
601 server->flags = data->flags & NFS_MOUNT_FLAGMASK;
602
603 if (data->rsize)
604 server->rsize = nfs_block_size(data->rsize, NULL);
605 if (data->wsize)
606 server->wsize = nfs_block_size(data->wsize, NULL);
607
608 server->acregmin = data->acregmin * HZ;
609 server->acregmax = data->acregmax * HZ;
610 server->acdirmin = data->acdirmin * HZ;
611 server->acdirmax = data->acdirmax * HZ;
612
613 /* Start lockd here, before we might error out */
614 error = nfs_start_lockd(server);
615 if (error < 0)
616 goto error;
617
618 error = nfs_init_server_rpcclient(server, data->pseudoflavor);
619 if (error < 0)
620 goto error;
621
622 server->namelen = data->namlen;
623 /* Create a client RPC handle for the NFSv3 ACL management interface */
624 nfs_init_server_aclclient(server);
625 if (clp->cl_nfsversion == 3) {
626 if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN)
627 server->namelen = NFS3_MAXNAMLEN;
628 server->caps |= NFS_CAP_READDIRPLUS;
629 } else {
630 if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN)
631 server->namelen = NFS2_MAXNAMLEN;
632 }
633
634 dprintk("<-- nfs_init_server() = 0 [new %p]\n", clp);
635 return 0;
636
637error:
638 server->nfs_client = NULL;
639 nfs_put_client(clp);
640 dprintk("<-- nfs_init_server() = xerror %d\n", error);
641 return error;
642}
643
644/*
645 * Load up the server record from information gained in an fsinfo record
646 */
647static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo)
648{
649 unsigned long max_rpc_payload;
650
651 /* Work out a lot of parameters */
652 if (server->rsize == 0)
653 server->rsize = nfs_block_size(fsinfo->rtpref, NULL);
654 if (server->wsize == 0)
655 server->wsize = nfs_block_size(fsinfo->wtpref, NULL);
656
657 if (fsinfo->rtmax >= 512 && server->rsize > fsinfo->rtmax)
658 server->rsize = nfs_block_size(fsinfo->rtmax, NULL);
659 if (fsinfo->wtmax >= 512 && server->wsize > fsinfo->wtmax)
660 server->wsize = nfs_block_size(fsinfo->wtmax, NULL);
661
662 max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL);
663 if (server->rsize > max_rpc_payload)
664 server->rsize = max_rpc_payload;
665 if (server->rsize > NFS_MAX_FILE_IO_SIZE)
666 server->rsize = NFS_MAX_FILE_IO_SIZE;
667 server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
668 server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
669
670 if (server->wsize > max_rpc_payload)
671 server->wsize = max_rpc_payload;
672 if (server->wsize > NFS_MAX_FILE_IO_SIZE)
673 server->wsize = NFS_MAX_FILE_IO_SIZE;
674 server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
675 server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
676
677 server->dtsize = nfs_block_size(fsinfo->dtpref, NULL);
678 if (server->dtsize > PAGE_CACHE_SIZE)
679 server->dtsize = PAGE_CACHE_SIZE;
680 if (server->dtsize > server->rsize)
681 server->dtsize = server->rsize;
682
683 if (server->flags & NFS_MOUNT_NOAC) {
684 server->acregmin = server->acregmax = 0;
685 server->acdirmin = server->acdirmax = 0;
686 }
687
688 server->maxfilesize = fsinfo->maxfilesize;
689
690 /* We're airborne Set socket buffersize */
691 rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
692}
693
694/*
695 * Probe filesystem information, including the FSID on v2/v3
696 */
697static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fattr *fattr)
698{
699 struct nfs_fsinfo fsinfo;
700 struct nfs_client *clp = server->nfs_client;
701 int error;
702
703 dprintk("--> nfs_probe_fsinfo()\n");
704
705 if (clp->rpc_ops->set_capabilities != NULL) {
706 error = clp->rpc_ops->set_capabilities(server, mntfh);
707 if (error < 0)
708 goto out_error;
709 }
710
711 fsinfo.fattr = fattr;
712 nfs_fattr_init(fattr);
713 error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
714 if (error < 0)
715 goto out_error;
716
717 nfs_server_set_fsinfo(server, &fsinfo);
718
719 /* Get some general file system info */
720 if (server->namelen == 0) {
721 struct nfs_pathconf pathinfo;
722
723 pathinfo.fattr = fattr;
724 nfs_fattr_init(fattr);
725
726 if (clp->rpc_ops->pathconf(server, mntfh, &pathinfo) >= 0)
727 server->namelen = pathinfo.max_namelen;
728 }
729
730 dprintk("<-- nfs_probe_fsinfo() = 0\n");
731 return 0;
732
733out_error:
734 dprintk("nfs_probe_fsinfo: error = %d\n", -error);
735 return error;
736}
737
738/*
739 * Copy useful information when duplicating a server record
740 */
741static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source)
742{
743 target->flags = source->flags;
744 target->acregmin = source->acregmin;
745 target->acregmax = source->acregmax;
746 target->acdirmin = source->acdirmin;
747 target->acdirmax = source->acdirmax;
748 target->caps = source->caps;
749}
750
751/*
752 * Allocate and initialise a server record
753 */
754static struct nfs_server *nfs_alloc_server(void)
755{
756 struct nfs_server *server;
757
758 server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
759 if (!server)
760 return NULL;
761
762 server->client = server->client_acl = ERR_PTR(-EINVAL);
763
764 /* Zero out the NFS state stuff */
765 INIT_LIST_HEAD(&server->client_link);
766 INIT_LIST_HEAD(&server->master_link);
767
768 server->io_stats = nfs_alloc_iostats();
769 if (!server->io_stats) {
770 kfree(server);
771 return NULL;
772 }
773
774 return server;
775}
776
/*
 * Free up a server record
 *
 * Unlinks the record from the global bookkeeping lists, runs the
 * version-specific destructor (if any), shuts down the RPC transport,
 * drops the reference on the shared nfs_client and frees the memory.
 */
void nfs_free_server(struct nfs_server *server)
{
	dprintk("--> nfs_free_server()\n");

	/* make the record invisible before tearing it down */
	spin_lock(&nfs_client_lock);
	list_del(&server->client_link);
	list_del(&server->master_link);
	spin_unlock(&nfs_client_lock);

	if (server->destroy != NULL)
		server->destroy(server);
	/* ->client is ERR_PTR(-EINVAL) until the RPC client is created,
	 * so a partially initialised record is handled safely here */
	if (!IS_ERR(server->client))
		rpc_shutdown_client(server->client);

	nfs_put_client(server->nfs_client);

	nfs_free_iostats(server->io_stats);
	kfree(server);
	/* one fewer server in existence; let the automount timer expire */
	nfs_release_automount_timer();
	dprintk("<-- nfs_free_server()\n");
}
801
/*
 * Create a version 2 or 3 volume record
 * - keyed on server and FSID
 *
 * Allocates a server record, binds it to an nfs_client from the mount
 * data, probes the root filehandle for its FSID and publishes the record
 * on the global per-client and per-volume lists.
 * Returns the record or an ERR_PTR on failure.
 */
struct nfs_server *nfs_create_server(const struct nfs_mount_data *data,
				     struct nfs_fh *mntfh)
{
	struct nfs_server *server;
	struct nfs_fattr fattr;
	int error;

	server = nfs_alloc_server();
	if (!server)
		return ERR_PTR(-ENOMEM);

	/* Get a client representation */
	error = nfs_init_server(server, data);
	if (error < 0)
		goto error;

	BUG_ON(!server->nfs_client);
	BUG_ON(!server->nfs_client->rpc_ops);
	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);

	/* Probe the root fh to retrieve its FSID */
	error = nfs_probe_fsinfo(server, mntfh, &fattr);
	if (error < 0)
		goto error;
	/* if fsinfo did not fill in the attributes, fall back to an
	 * explicit getattr of the mount point */
	if (!(fattr.valid & NFS_ATTR_FATTR)) {
		error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr);
		if (error < 0) {
			dprintk("nfs_create_server: getattr error = %d\n", -error);
			goto error;
		}
	}
	memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid));

	dprintk("Server FSID: %llx:%llx\n",
		(unsigned long long) server->fsid.major,
		(unsigned long long) server->fsid.minor);

	BUG_ON(!server->nfs_client);
	BUG_ON(!server->nfs_client->rpc_ops);
	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);

	/* publish the record on the global bookkeeping lists */
	spin_lock(&nfs_client_lock);
	list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
	list_add_tail(&server->master_link, &nfs_volume_list);
	spin_unlock(&nfs_client_lock);

	server->mount_time = jiffies;
	return server;

error:
	/* nfs_free_server() copes with a partially initialised record */
	nfs_free_server(server);
	return ERR_PTR(error);
}
859
860#ifdef CONFIG_NFS_V4
/*
 * Initialise an NFS4 client record
 *
 * Idempotent on shared clients: if another mount already brought this
 * nfs_client to the READY state, return immediately.  Otherwise install
 * the v4 RPC op vector, create the RPC transport and the idmapper, then
 * mark the client ready — or mark it with the error so that any waiters
 * see the failure too.
 */
static int nfs4_init_client(struct nfs_client *clp,
		int proto, int timeo, int retrans,
		rpc_authflavor_t authflavour)
{
	int error;

	if (clp->cl_cons_state == NFS_CS_READY) {
		/* the client is initialised already */
		dprintk("<-- nfs4_init_client() = 0 [already %p]\n", clp);
		return 0;
	}

	/* Check NFS protocol revision and initialize RPC op vector */
	clp->rpc_ops = &nfs_v4_clientops;

	error = nfs_create_rpc_client(clp, proto, timeo, retrans, authflavour);
	if (error < 0)
		goto error;

	error = nfs_idmap_new(clp);
	if (error < 0) {
		dprintk("%s: failed to create idmapper. Error = %d\n",
			__FUNCTION__, error);
		goto error;
	}
	/* note that the idmapper must be torn down on release */
	__set_bit(NFS_CS_IDMAP, &clp->cl_res_state);

	nfs_mark_client_ready(clp, NFS_CS_READY);
	return 0;

error:
	/* propagate the error to anybody waiting on this client */
	nfs_mark_client_ready(clp, error);
	dprintk("<-- nfs4_init_client() = xerror %d\n", error);
	return error;
}
899
/*
 * Set up an NFS4 client
 *
 * Finds or allocates a shared nfs_client record for the given
 * hostname/address, ensures it is initialised for v4, and attaches it to
 * the server record.  The client reference is dropped again on failure.
 */
static int nfs4_set_client(struct nfs_server *server,
		const char *hostname, const struct sockaddr_in *addr,
		rpc_authflavor_t authflavour,
		int proto, int timeo, int retrans)
{
	struct nfs_client *clp;
	int error;

	dprintk("--> nfs4_set_client()\n");

	/* Allocate or find a client reference we can use */
	clp = nfs_get_client(hostname, addr, 4);
	if (IS_ERR(clp)) {
		error = PTR_ERR(clp);
		goto error;
	}
	error = nfs4_init_client(clp, proto, timeo, retrans, authflavour);
	if (error < 0)
		goto error_put;

	/* the server now owns the reference obtained above */
	server->nfs_client = clp;
	dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp);
	return 0;

error_put:
	nfs_put_client(clp);
error:
	dprintk("<-- nfs4_set_client() = xerror %d\n", error);
	return error;
}
933
/*
 * Initialise a version 4 server record from the mount data
 * (fills in the cache tunables and creates the general RPC client)
 */
static int nfs4_init_server(struct nfs_server *server,
		const struct nfs4_mount_data *data, rpc_authflavor_t authflavour)
{
	int error;

	dprintk("--> nfs4_init_server()\n");

	/* Initialise the client representation from the mount data */
	server->flags = data->flags & NFS_MOUNT_FLAGMASK;
	server->caps |= NFS_CAP_ATOMIC_OPEN;

	/* zero rsize/wsize means "use the defaults" */
	if (data->rsize)
		server->rsize = nfs_block_size(data->rsize, NULL);
	if (data->wsize)
		server->wsize = nfs_block_size(data->wsize, NULL);

	/* attribute cache timeouts arrive in seconds; store them in jiffies */
	server->acregmin = data->acregmin * HZ;
	server->acregmax = data->acregmax * HZ;
	server->acdirmin = data->acdirmin * HZ;
	server->acdirmax = data->acdirmax * HZ;

	error = nfs_init_server_rpcclient(server, authflavour);

	/* Done */
	dprintk("<-- nfs4_init_server() = %d\n", error);
	return error;
}
964
/*
 * Create a version 4 volume record
 * - keyed on server and FSID
 *
 * Allocates a server record, attaches a shared v4 nfs_client, walks the
 * mount path to locate the root filehandle (which also yields the FSID),
 * probes fsinfo, and publishes the record on the global lists.
 * Returns the record or an ERR_PTR on failure.
 */
struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *data,
				      const char *hostname,
				      const struct sockaddr_in *addr,
				      const char *mntpath,
				      const char *ip_addr,
				      rpc_authflavor_t authflavour,
				      struct nfs_fh *mntfh)
{
	struct nfs_fattr fattr;
	struct nfs_server *server;
	int error;

	dprintk("--> nfs4_create_server()\n");

	server = nfs_alloc_server();
	if (!server)
		return ERR_PTR(-ENOMEM);

	/* Get a client record */
	error = nfs4_set_client(server, hostname, addr, authflavour,
				data->proto, data->timeo, data->retrans);
	if (error < 0)
		goto error;

	/* set up the general RPC client */
	error = nfs4_init_server(server, data, authflavour);
	if (error < 0)
		goto error;

	BUG_ON(!server->nfs_client);
	BUG_ON(!server->nfs_client->rpc_ops);
	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);

	/* Probe the root fh to retrieve its FSID */
	error = nfs4_path_walk(server, mntfh, mntpath);
	if (error < 0)
		goto error;

	dprintk("Server FSID: %llx:%llx\n",
		(unsigned long long) server->fsid.major,
		(unsigned long long) server->fsid.minor);
	dprintk("Mount FH: %d\n", mntfh->size);

	error = nfs_probe_fsinfo(server, mntfh, &fattr);
	if (error < 0)
		goto error;

	BUG_ON(!server->nfs_client);
	BUG_ON(!server->nfs_client->rpc_ops);
	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);

	/* publish the record on the global bookkeeping lists */
	spin_lock(&nfs_client_lock);
	list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
	list_add_tail(&server->master_link, &nfs_volume_list);
	spin_unlock(&nfs_client_lock);

	server->mount_time = jiffies;
	dprintk("<-- nfs4_create_server() = %p\n", server);
	return server;

error:
	/* nfs_free_server() copes with a partially initialised record */
	nfs_free_server(server);
	dprintk("<-- nfs4_create_server() = error %d\n", error);
	return ERR_PTR(error);
}
1034
1035/*
1036 * Create an NFS4 referral server record
1037 */
1038struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1039 struct nfs_fh *fh)
1040{
1041 struct nfs_client *parent_client;
1042 struct nfs_server *server, *parent_server;
1043 struct nfs_fattr fattr;
1044 int error;
1045
1046 dprintk("--> nfs4_create_referral_server()\n");
1047
1048 server = nfs_alloc_server();
1049 if (!server)
1050 return ERR_PTR(-ENOMEM);
1051
1052 parent_server = NFS_SB(data->sb);
1053 parent_client = parent_server->nfs_client;
1054
1055 /* Get a client representation.
1056 * Note: NFSv4 always uses TCP, */
1057 error = nfs4_set_client(server, data->hostname, data->addr,
1058 data->authflavor,
1059 parent_server->client->cl_xprt->prot,
1060 parent_client->retrans_timeo,
1061 parent_client->retrans_count);
1062 if (error < 0)
1063 goto error;
1064
1065 /* Initialise the client representation from the parent server */
1066 nfs_server_copy_userdata(server, parent_server);
1067 server->caps |= NFS_CAP_ATOMIC_OPEN;
1068
1069 error = nfs_init_server_rpcclient(server, data->authflavor);
1070 if (error < 0)
1071 goto error;
1072
1073 BUG_ON(!server->nfs_client);
1074 BUG_ON(!server->nfs_client->rpc_ops);
1075 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1076
1077 /* probe the filesystem info for this server filesystem */
1078 error = nfs_probe_fsinfo(server, fh, &fattr);
1079 if (error < 0)
1080 goto error;
1081
1082 dprintk("Referral FSID: %llx:%llx\n",
1083 (unsigned long long) server->fsid.major,
1084 (unsigned long long) server->fsid.minor);
1085
1086 spin_lock(&nfs_client_lock);
1087 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
1088 list_add_tail(&server->master_link, &nfs_volume_list);
1089 spin_unlock(&nfs_client_lock);
1090
1091 server->mount_time = jiffies;
1092
1093 dprintk("<-- nfs_create_referral_server() = %p\n", server);
1094 return server;
1095
1096error:
1097 nfs_free_server(server);
1098 dprintk("<-- nfs4_create_referral_server() = error %d\n", error);
1099 return ERR_PTR(error);
1100}
1101
1102#endif /* CONFIG_NFS_V4 */
1103
/*
 * Clone an NFS2, NFS3 or NFS4 server record
 *
 * Used for submounts that share the source's nfs_client: copies the
 * user-visible tunables, takes an extra reference on the shared client,
 * creates a fresh RPC client (and ACL client if the source had one),
 * probes fsinfo for the new fh and starts lockd.
 * Returns the new record or an ERR_PTR on failure.
 */
struct nfs_server *nfs_clone_server(struct nfs_server *source,
				    struct nfs_fh *fh,
				    struct nfs_fattr *fattr)
{
	struct nfs_server *server;
	struct nfs_fattr fattr_fsinfo;
	int error;

	dprintk("--> nfs_clone_server(,%llx:%llx,)\n",
		(unsigned long long) fattr->fsid.major,
		(unsigned long long) fattr->fsid.minor);

	server = nfs_alloc_server();
	if (!server)
		return ERR_PTR(-ENOMEM);

	/* Copy data from the source */
	server->nfs_client = source->nfs_client;
	/* the clone holds its own reference on the shared client */
	atomic_inc(&server->nfs_client->cl_count);
	nfs_server_copy_userdata(server, source);

	server->fsid = fattr->fsid;

	error = nfs_init_server_rpcclient(server, source->client->cl_auth->au_flavor);
	if (error < 0)
		goto out_free_server;
	/* only set up an ACL client if the source had one */
	if (!IS_ERR(source->client_acl))
		nfs_init_server_aclclient(server);

	/* probe the filesystem info for this server filesystem */
	error = nfs_probe_fsinfo(server, fh, &fattr_fsinfo);
	if (error < 0)
		goto out_free_server;

	dprintk("Cloned FSID: %llx:%llx\n",
		(unsigned long long) server->fsid.major,
		(unsigned long long) server->fsid.minor);

	error = nfs_start_lockd(server);
	if (error < 0)
		goto out_free_server;

	/* publish the record on the global bookkeeping lists */
	spin_lock(&nfs_client_lock);
	list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
	list_add_tail(&server->master_link, &nfs_volume_list);
	spin_unlock(&nfs_client_lock);

	server->mount_time = jiffies;

	dprintk("<-- nfs_clone_server() = %p\n", server);
	return server;

out_free_server:
	/* nfs_free_server() drops the client reference taken above */
	nfs_free_server(server);
	dprintk("<-- nfs_clone_server() = error %d\n", error);
	return ERR_PTR(error);
}
1164
1165#ifdef CONFIG_PROC_FS
1166static struct proc_dir_entry *proc_fs_nfs;
1167
/* seq_file callbacks for /proc/fs/nfsfs/servers */
static int nfs_server_list_open(struct inode *inode, struct file *file);
static void *nfs_server_list_start(struct seq_file *p, loff_t *pos);
static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos);
static void nfs_server_list_stop(struct seq_file *p, void *v);
static int nfs_server_list_show(struct seq_file *m, void *v);

/* iterator callbacks used by seq_read() on the servers file */
static struct seq_operations nfs_server_list_ops = {
	.start	= nfs_server_list_start,
	.next	= nfs_server_list_next,
	.stop	= nfs_server_list_stop,
	.show	= nfs_server_list_show,
};

/* file operations wired to the servers proc entry */
static struct file_operations nfs_server_list_fops = {
	.open		= nfs_server_list_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
1187
/* seq_file callbacks for /proc/fs/nfsfs/volumes */
static int nfs_volume_list_open(struct inode *inode, struct file *file);
static void *nfs_volume_list_start(struct seq_file *p, loff_t *pos);
static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos);
static void nfs_volume_list_stop(struct seq_file *p, void *v);
static int nfs_volume_list_show(struct seq_file *m, void *v);

/* iterator callbacks used by seq_read() on the volumes file */
static struct seq_operations nfs_volume_list_ops = {
	.start	= nfs_volume_list_start,
	.next	= nfs_volume_list_next,
	.stop	= nfs_volume_list_stop,
	.show	= nfs_volume_list_show,
};

/* file operations wired to the volumes proc entry */
static struct file_operations nfs_volume_list_fops = {
	.open		= nfs_volume_list_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
1207
1208/*
1209 * open "/proc/fs/nfsfs/servers" which provides a summary of servers with which
1210 * we're dealing
1211 */
1212static int nfs_server_list_open(struct inode *inode, struct file *file)
1213{
1214 struct seq_file *m;
1215 int ret;
1216
1217 ret = seq_open(file, &nfs_server_list_ops);
1218 if (ret < 0)
1219 return ret;
1220
1221 m = file->private_data;
1222 m->private = PDE(inode)->data;
1223
1224 return 0;
1225}
1226
/*
 * set up the iterator to start reading from the server list and return the first item
 *
 * Takes nfs_client_lock; the lock is held across the whole iteration and
 * is released by nfs_server_list_stop().
 */
static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
{
	struct list_head *_p;
	loff_t pos = *_pos;

	/* lock the list against modification */
	spin_lock(&nfs_client_lock);

	/* allow for the header line */
	if (!pos)
		return SEQ_START_TOKEN;
	pos--;

	/* find the n'th element in the list */
	list_for_each(_p, &nfs_client_list)
		if (!pos--)
			break;

	/* NULL (end of sequence) if pos ran past the last element */
	return _p != &nfs_client_list ? _p : NULL;
}
1250
1251/*
1252 * move to next server
1253 */
1254static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)
1255{
1256 struct list_head *_p;
1257
1258 (*pos)++;
1259
1260 _p = v;
1261 _p = (v == SEQ_START_TOKEN) ? nfs_client_list.next : _p->next;
1262
1263 return _p != &nfs_client_list ? _p : NULL;
1264}
1265
/*
 * clean up after reading from the transports list
 *
 * Releases the lock taken by nfs_server_list_start().
 */
static void nfs_server_list_stop(struct seq_file *p, void *v)
{
	spin_unlock(&nfs_client_lock);
}
1273
/*
 * display a header line followed by a load of call lines
 *
 * One line per nfs_client: NFS version, IP address, port, reference
 * count and hostname.
 */
static int nfs_server_list_show(struct seq_file *m, void *v)
{
	struct nfs_client *clp;

	/* display header on line 1 */
	if (v == SEQ_START_TOKEN) {
		seq_puts(m, "NV SERVER PORT USE HOSTNAME\n");
		return 0;
	}

	/* display one transport per line on subsequent lines */
	clp = list_entry(v, struct nfs_client, cl_share_link);

	seq_printf(m, "v%d %02x%02x%02x%02x %4hx %3d %s\n",
		   clp->cl_nfsversion,
		   NIPQUAD(clp->cl_addr.sin_addr),
		   ntohs(clp->cl_addr.sin_port),
		   atomic_read(&clp->cl_count),
		   clp->cl_hostname);

	return 0;
}
1299
1300/*
1301 * open "/proc/fs/nfsfs/volumes" which provides a summary of extant volumes
1302 */
1303static int nfs_volume_list_open(struct inode *inode, struct file *file)
1304{
1305 struct seq_file *m;
1306 int ret;
1307
1308 ret = seq_open(file, &nfs_volume_list_ops);
1309 if (ret < 0)
1310 return ret;
1311
1312 m = file->private_data;
1313 m->private = PDE(inode)->data;
1314
1315 return 0;
1316}
1317
/*
 * set up the iterator to start reading from the volume list and return the first item
 *
 * Takes nfs_client_lock; the lock is held across the whole iteration and
 * is released by nfs_volume_list_stop().
 */
static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
{
	struct list_head *_p;
	loff_t pos = *_pos;

	/* lock the list against modification */
	spin_lock(&nfs_client_lock);

	/* allow for the header line */
	if (!pos)
		return SEQ_START_TOKEN;
	pos--;

	/* find the n'th element in the list */
	list_for_each(_p, &nfs_volume_list)
		if (!pos--)
			break;

	/* NULL (end of sequence) if pos ran past the last element */
	return _p != &nfs_volume_list ? _p : NULL;
}
1341
1342/*
1343 * move to next volume
1344 */
1345static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)
1346{
1347 struct list_head *_p;
1348
1349 (*pos)++;
1350
1351 _p = v;
1352 _p = (v == SEQ_START_TOKEN) ? nfs_volume_list.next : _p->next;
1353
1354 return _p != &nfs_volume_list ? _p : NULL;
1355}
1356
/*
 * clean up after reading from the transports list
 *
 * Releases the lock taken by nfs_volume_list_start().
 */
static void nfs_volume_list_stop(struct seq_file *p, void *v)
{
	spin_unlock(&nfs_client_lock);
}
1364
1365/*
1366 * display a header line followed by a load of call lines
1367 */
1368static int nfs_volume_list_show(struct seq_file *m, void *v)
1369{
1370 struct nfs_server *server;
1371 struct nfs_client *clp;
1372 char dev[8], fsid[17];
1373
1374 /* display header on line 1 */
1375 if (v == SEQ_START_TOKEN) {
1376 seq_puts(m, "NV SERVER PORT DEV FSID\n");
1377 return 0;
1378 }
1379 /* display one transport per line on subsequent lines */
1380 server = list_entry(v, struct nfs_server, master_link);
1381 clp = server->nfs_client;
1382
1383 snprintf(dev, 8, "%u:%u",
1384 MAJOR(server->s_dev), MINOR(server->s_dev));
1385
1386 snprintf(fsid, 17, "%llx:%llx",
1387 (unsigned long long) server->fsid.major,
1388 (unsigned long long) server->fsid.minor);
1389
1390 seq_printf(m, "v%d %02x%02x%02x%02x %4hx %-7s %-17s\n",
1391 clp->cl_nfsversion,
1392 NIPQUAD(clp->cl_addr.sin_addr),
1393 ntohs(clp->cl_addr.sin_port),
1394 dev,
1395 fsid);
1396
1397 return 0;
1398}
1399
/*
 * initialise the /proc/fs/nfsfs/ directory
 *
 * Creates the "servers" and "volumes" entries.  On any failure the
 * entries created so far are unwound again and -ENOMEM is returned.
 */
int __init nfs_fs_proc_init(void)
{
	struct proc_dir_entry *p;

	proc_fs_nfs = proc_mkdir("nfsfs", proc_root_fs);
	if (!proc_fs_nfs)
		goto error_0;

	proc_fs_nfs->owner = THIS_MODULE;

	/* a file of servers with which we're dealing */
	p = create_proc_entry("servers", S_IFREG|S_IRUGO, proc_fs_nfs);
	if (!p)
		goto error_1;

	p->proc_fops = &nfs_server_list_fops;
	p->owner = THIS_MODULE;

	/* a file of volumes that we have mounted */
	p = create_proc_entry("volumes", S_IFREG|S_IRUGO, proc_fs_nfs);
	if (!p)
		goto error_2;

	p->proc_fops = &nfs_volume_list_fops;
	p->owner = THIS_MODULE;
	return 0;

	/* unwind in reverse order of creation */
error_2:
	remove_proc_entry("servers", proc_fs_nfs);
error_1:
	remove_proc_entry("nfsfs", proc_root_fs);
error_0:
	return -ENOMEM;
}
1437
/*
 * clean up the /proc/fs/nfsfs/ directory
 *
 * Entries are removed in reverse order of creation; the directory itself
 * goes last.
 */
void nfs_fs_proc_exit(void)
{
	remove_proc_entry("volumes", proc_fs_nfs);
	remove_proc_entry("servers", proc_fs_nfs);
	remove_proc_entry("nfsfs", proc_root_fs);
}
1447
1448#endif /* CONFIG_PROC_FS */
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 9540a316c05e..841c99a9b11c 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -18,11 +18,7 @@
18 18
19#include "nfs4_fs.h" 19#include "nfs4_fs.h"
20#include "delegation.h" 20#include "delegation.h"
21 21#include "internal.h"
22static struct nfs_delegation *nfs_alloc_delegation(void)
23{
24 return (struct nfs_delegation *)kmalloc(sizeof(struct nfs_delegation), GFP_KERNEL);
25}
26 22
27static void nfs_free_delegation(struct nfs_delegation *delegation) 23static void nfs_free_delegation(struct nfs_delegation *delegation)
28{ 24{
@@ -52,7 +48,7 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
52 case -NFS4ERR_EXPIRED: 48 case -NFS4ERR_EXPIRED:
53 /* kill_proc(fl->fl_pid, SIGLOST, 1); */ 49 /* kill_proc(fl->fl_pid, SIGLOST, 1); */
54 case -NFS4ERR_STALE_CLIENTID: 50 case -NFS4ERR_STALE_CLIENTID:
55 nfs4_schedule_state_recovery(NFS_SERVER(inode)->nfs4_state); 51 nfs4_schedule_state_recovery(NFS_SERVER(inode)->nfs_client);
56 goto out_err; 52 goto out_err;
57 } 53 }
58 } 54 }
@@ -114,7 +110,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, st
114 */ 110 */
115int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) 111int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res)
116{ 112{
117 struct nfs4_client *clp = NFS_SERVER(inode)->nfs4_state; 113 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
118 struct nfs_inode *nfsi = NFS_I(inode); 114 struct nfs_inode *nfsi = NFS_I(inode);
119 struct nfs_delegation *delegation; 115 struct nfs_delegation *delegation;
120 int status = 0; 116 int status = 0;
@@ -123,7 +119,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
123 if ((nfsi->cache_validity & (NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_ATTR))) 119 if ((nfsi->cache_validity & (NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_ATTR)))
124 __nfs_revalidate_inode(NFS_SERVER(inode), inode); 120 __nfs_revalidate_inode(NFS_SERVER(inode), inode);
125 121
126 delegation = nfs_alloc_delegation(); 122 delegation = kmalloc(sizeof(*delegation), GFP_KERNEL);
127 if (delegation == NULL) 123 if (delegation == NULL)
128 return -ENOMEM; 124 return -ENOMEM;
129 memcpy(delegation->stateid.data, res->delegation.data, 125 memcpy(delegation->stateid.data, res->delegation.data,
@@ -145,7 +141,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
145 sizeof(delegation->stateid)) != 0 || 141 sizeof(delegation->stateid)) != 0 ||
146 delegation->type != nfsi->delegation->type) { 142 delegation->type != nfsi->delegation->type) {
147 printk("%s: server %u.%u.%u.%u, handed out a duplicate delegation!\n", 143 printk("%s: server %u.%u.%u.%u, handed out a duplicate delegation!\n",
148 __FUNCTION__, NIPQUAD(clp->cl_addr)); 144 __FUNCTION__, NIPQUAD(clp->cl_addr.sin_addr));
149 status = -EIO; 145 status = -EIO;
150 } 146 }
151 } 147 }
@@ -176,7 +172,7 @@ static void nfs_msync_inode(struct inode *inode)
176 */ 172 */
177int __nfs_inode_return_delegation(struct inode *inode) 173int __nfs_inode_return_delegation(struct inode *inode)
178{ 174{
179 struct nfs4_client *clp = NFS_SERVER(inode)->nfs4_state; 175 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
180 struct nfs_inode *nfsi = NFS_I(inode); 176 struct nfs_inode *nfsi = NFS_I(inode);
181 struct nfs_delegation *delegation; 177 struct nfs_delegation *delegation;
182 int res = 0; 178 int res = 0;
@@ -208,7 +204,7 @@ int __nfs_inode_return_delegation(struct inode *inode)
208 */ 204 */
209void nfs_return_all_delegations(struct super_block *sb) 205void nfs_return_all_delegations(struct super_block *sb)
210{ 206{
211 struct nfs4_client *clp = NFS_SB(sb)->nfs4_state; 207 struct nfs_client *clp = NFS_SB(sb)->nfs_client;
212 struct nfs_delegation *delegation; 208 struct nfs_delegation *delegation;
213 struct inode *inode; 209 struct inode *inode;
214 210
@@ -232,7 +228,7 @@ restart:
232 228
233int nfs_do_expire_all_delegations(void *ptr) 229int nfs_do_expire_all_delegations(void *ptr)
234{ 230{
235 struct nfs4_client *clp = ptr; 231 struct nfs_client *clp = ptr;
236 struct nfs_delegation *delegation; 232 struct nfs_delegation *delegation;
237 struct inode *inode; 233 struct inode *inode;
238 234
@@ -254,11 +250,11 @@ restart:
254 } 250 }
255out: 251out:
256 spin_unlock(&clp->cl_lock); 252 spin_unlock(&clp->cl_lock);
257 nfs4_put_client(clp); 253 nfs_put_client(clp);
258 module_put_and_exit(0); 254 module_put_and_exit(0);
259} 255}
260 256
261void nfs_expire_all_delegations(struct nfs4_client *clp) 257void nfs_expire_all_delegations(struct nfs_client *clp)
262{ 258{
263 struct task_struct *task; 259 struct task_struct *task;
264 260
@@ -266,17 +262,17 @@ void nfs_expire_all_delegations(struct nfs4_client *clp)
266 atomic_inc(&clp->cl_count); 262 atomic_inc(&clp->cl_count);
267 task = kthread_run(nfs_do_expire_all_delegations, clp, 263 task = kthread_run(nfs_do_expire_all_delegations, clp,
268 "%u.%u.%u.%u-delegreturn", 264 "%u.%u.%u.%u-delegreturn",
269 NIPQUAD(clp->cl_addr)); 265 NIPQUAD(clp->cl_addr.sin_addr));
270 if (!IS_ERR(task)) 266 if (!IS_ERR(task))
271 return; 267 return;
272 nfs4_put_client(clp); 268 nfs_put_client(clp);
273 module_put(THIS_MODULE); 269 module_put(THIS_MODULE);
274} 270}
275 271
276/* 272/*
277 * Return all delegations following an NFS4ERR_CB_PATH_DOWN error. 273 * Return all delegations following an NFS4ERR_CB_PATH_DOWN error.
278 */ 274 */
279void nfs_handle_cb_pathdown(struct nfs4_client *clp) 275void nfs_handle_cb_pathdown(struct nfs_client *clp)
280{ 276{
281 struct nfs_delegation *delegation; 277 struct nfs_delegation *delegation;
282 struct inode *inode; 278 struct inode *inode;
@@ -299,7 +295,7 @@ restart:
299 295
300struct recall_threadargs { 296struct recall_threadargs {
301 struct inode *inode; 297 struct inode *inode;
302 struct nfs4_client *clp; 298 struct nfs_client *clp;
303 const nfs4_stateid *stateid; 299 const nfs4_stateid *stateid;
304 300
305 struct completion started; 301 struct completion started;
@@ -310,7 +306,7 @@ static int recall_thread(void *data)
310{ 306{
311 struct recall_threadargs *args = (struct recall_threadargs *)data; 307 struct recall_threadargs *args = (struct recall_threadargs *)data;
312 struct inode *inode = igrab(args->inode); 308 struct inode *inode = igrab(args->inode);
313 struct nfs4_client *clp = NFS_SERVER(inode)->nfs4_state; 309 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
314 struct nfs_inode *nfsi = NFS_I(inode); 310 struct nfs_inode *nfsi = NFS_I(inode);
315 struct nfs_delegation *delegation; 311 struct nfs_delegation *delegation;
316 312
@@ -371,7 +367,7 @@ out_module_put:
371/* 367/*
372 * Retrieve the inode associated with a delegation 368 * Retrieve the inode associated with a delegation
373 */ 369 */
374struct inode *nfs_delegation_find_inode(struct nfs4_client *clp, const struct nfs_fh *fhandle) 370struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle)
375{ 371{
376 struct nfs_delegation *delegation; 372 struct nfs_delegation *delegation;
377 struct inode *res = NULL; 373 struct inode *res = NULL;
@@ -389,7 +385,7 @@ struct inode *nfs_delegation_find_inode(struct nfs4_client *clp, const struct nf
389/* 385/*
390 * Mark all delegations as needing to be reclaimed 386 * Mark all delegations as needing to be reclaimed
391 */ 387 */
392void nfs_delegation_mark_reclaim(struct nfs4_client *clp) 388void nfs_delegation_mark_reclaim(struct nfs_client *clp)
393{ 389{
394 struct nfs_delegation *delegation; 390 struct nfs_delegation *delegation;
395 spin_lock(&clp->cl_lock); 391 spin_lock(&clp->cl_lock);
@@ -401,7 +397,7 @@ void nfs_delegation_mark_reclaim(struct nfs4_client *clp)
401/* 397/*
402 * Reap all unclaimed delegations after reboot recovery is done 398 * Reap all unclaimed delegations after reboot recovery is done
403 */ 399 */
404void nfs_delegation_reap_unclaimed(struct nfs4_client *clp) 400void nfs_delegation_reap_unclaimed(struct nfs_client *clp)
405{ 401{
406 struct nfs_delegation *delegation, *n; 402 struct nfs_delegation *delegation, *n;
407 LIST_HEAD(head); 403 LIST_HEAD(head);
@@ -423,7 +419,7 @@ void nfs_delegation_reap_unclaimed(struct nfs4_client *clp)
423 419
424int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode) 420int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode)
425{ 421{
426 struct nfs4_client *clp = NFS_SERVER(inode)->nfs4_state; 422 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
427 struct nfs_inode *nfsi = NFS_I(inode); 423 struct nfs_inode *nfsi = NFS_I(inode);
428 struct nfs_delegation *delegation; 424 struct nfs_delegation *delegation;
429 int res = 0; 425 int res = 0;
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 3858694652fa..2cfd4b24c7fe 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -29,13 +29,13 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, st
29int __nfs_inode_return_delegation(struct inode *inode); 29int __nfs_inode_return_delegation(struct inode *inode);
30int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); 30int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid);
31 31
32struct inode *nfs_delegation_find_inode(struct nfs4_client *clp, const struct nfs_fh *fhandle); 32struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle);
33void nfs_return_all_delegations(struct super_block *sb); 33void nfs_return_all_delegations(struct super_block *sb);
34void nfs_expire_all_delegations(struct nfs4_client *clp); 34void nfs_expire_all_delegations(struct nfs_client *clp);
35void nfs_handle_cb_pathdown(struct nfs4_client *clp); 35void nfs_handle_cb_pathdown(struct nfs_client *clp);
36 36
37void nfs_delegation_mark_reclaim(struct nfs4_client *clp); 37void nfs_delegation_mark_reclaim(struct nfs_client *clp);
38void nfs_delegation_reap_unclaimed(struct nfs4_client *clp); 38void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
39 39
40/* NFSv4 delegation-related procedures */ 40/* NFSv4 delegation-related procedures */
41int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid); 41int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e7ffb4deb3e5..7432f1a43f3d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -30,7 +30,9 @@
30#include <linux/nfs_mount.h> 30#include <linux/nfs_mount.h>
31#include <linux/pagemap.h> 31#include <linux/pagemap.h>
32#include <linux/smp_lock.h> 32#include <linux/smp_lock.h>
33#include <linux/pagevec.h>
33#include <linux/namei.h> 34#include <linux/namei.h>
35#include <linux/mount.h>
34 36
35#include "nfs4_fs.h" 37#include "nfs4_fs.h"
36#include "delegation.h" 38#include "delegation.h"
@@ -870,14 +872,14 @@ int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd)
870 return (nd->intent.open.flags & O_EXCL) != 0; 872 return (nd->intent.open.flags & O_EXCL) != 0;
871} 873}
872 874
873static inline int nfs_reval_fsid(struct inode *dir, 875static inline int nfs_reval_fsid(struct vfsmount *mnt, struct inode *dir,
874 struct nfs_fh *fh, struct nfs_fattr *fattr) 876 struct nfs_fh *fh, struct nfs_fattr *fattr)
875{ 877{
876 struct nfs_server *server = NFS_SERVER(dir); 878 struct nfs_server *server = NFS_SERVER(dir);
877 879
878 if (!nfs_fsid_equal(&server->fsid, &fattr->fsid)) 880 if (!nfs_fsid_equal(&server->fsid, &fattr->fsid))
879 /* Revalidate fsid on root dir */ 881 /* Revalidate fsid on root dir */
880 return __nfs_revalidate_inode(server, dir->i_sb->s_root->d_inode); 882 return __nfs_revalidate_inode(server, mnt->mnt_root->d_inode);
881 return 0; 883 return 0;
882} 884}
883 885
@@ -902,9 +904,15 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
902 904
903 lock_kernel(); 905 lock_kernel();
904 906
905 /* If we're doing an exclusive create, optimize away the lookup */ 907 /*
906 if (nfs_is_exclusive_create(dir, nd)) 908 * If we're doing an exclusive create, optimize away the lookup
907 goto no_entry; 909 * but don't hash the dentry.
910 */
911 if (nfs_is_exclusive_create(dir, nd)) {
912 d_instantiate(dentry, NULL);
913 res = NULL;
914 goto out_unlock;
915 }
908 916
909 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); 917 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr);
910 if (error == -ENOENT) 918 if (error == -ENOENT)
@@ -913,7 +921,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
913 res = ERR_PTR(error); 921 res = ERR_PTR(error);
914 goto out_unlock; 922 goto out_unlock;
915 } 923 }
916 error = nfs_reval_fsid(dir, &fhandle, &fattr); 924 error = nfs_reval_fsid(nd->mnt, dir, &fhandle, &fattr);
917 if (error < 0) { 925 if (error < 0) {
918 res = ERR_PTR(error); 926 res = ERR_PTR(error);
919 goto out_unlock; 927 goto out_unlock;
@@ -922,8 +930,9 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
922 res = (struct dentry *)inode; 930 res = (struct dentry *)inode;
923 if (IS_ERR(res)) 931 if (IS_ERR(res))
924 goto out_unlock; 932 goto out_unlock;
933
925no_entry: 934no_entry:
926 res = d_add_unique(dentry, inode); 935 res = d_materialise_unique(dentry, inode);
927 if (res != NULL) 936 if (res != NULL)
928 dentry = res; 937 dentry = res;
929 nfs_renew_times(dentry); 938 nfs_renew_times(dentry);
@@ -1117,11 +1126,13 @@ static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc)
1117 dput(dentry); 1126 dput(dentry);
1118 return NULL; 1127 return NULL;
1119 } 1128 }
1120 alias = d_add_unique(dentry, inode); 1129
1130 alias = d_materialise_unique(dentry, inode);
1121 if (alias != NULL) { 1131 if (alias != NULL) {
1122 dput(dentry); 1132 dput(dentry);
1123 dentry = alias; 1133 dentry = alias;
1124 } 1134 }
1135
1125 nfs_renew_times(dentry); 1136 nfs_renew_times(dentry);
1126 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1137 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1127 return dentry; 1138 return dentry;
@@ -1143,23 +1154,22 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
1143 struct inode *dir = dentry->d_parent->d_inode; 1154 struct inode *dir = dentry->d_parent->d_inode;
1144 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); 1155 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
1145 if (error) 1156 if (error)
1146 goto out_err; 1157 return error;
1147 } 1158 }
1148 if (!(fattr->valid & NFS_ATTR_FATTR)) { 1159 if (!(fattr->valid & NFS_ATTR_FATTR)) {
1149 struct nfs_server *server = NFS_SB(dentry->d_sb); 1160 struct nfs_server *server = NFS_SB(dentry->d_sb);
1150 error = server->rpc_ops->getattr(server, fhandle, fattr); 1161 error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr);
1151 if (error < 0) 1162 if (error < 0)
1152 goto out_err; 1163 return error;
1153 } 1164 }
1154 inode = nfs_fhget(dentry->d_sb, fhandle, fattr); 1165 inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
1155 error = PTR_ERR(inode); 1166 error = PTR_ERR(inode);
1156 if (IS_ERR(inode)) 1167 if (IS_ERR(inode))
1157 goto out_err; 1168 return error;
1158 d_instantiate(dentry, inode); 1169 d_instantiate(dentry, inode);
1170 if (d_unhashed(dentry))
1171 d_rehash(dentry);
1159 return 0; 1172 return 0;
1160out_err:
1161 d_drop(dentry);
1162 return error;
1163} 1173}
1164 1174
1165/* 1175/*
@@ -1440,48 +1450,82 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
1440 return error; 1450 return error;
1441} 1451}
1442 1452
1443static int 1453/*
1444nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) 1454 * To create a symbolic link, most file systems instantiate a new inode,
1455 * add a page to it containing the path, then write it out to the disk
1456 * using prepare_write/commit_write.
1457 *
1458 * Unfortunately the NFS client can't create the in-core inode first
1459 * because it needs a file handle to create an in-core inode (see
1460 * fs/nfs/inode.c:nfs_fhget). We only have a file handle *after* the
1461 * symlink request has completed on the server.
1462 *
1463 * So instead we allocate a raw page, copy the symname into it, then do
1464 * the SYMLINK request with the page as the buffer. If it succeeds, we
1465 * now have a new file handle and can instantiate an in-core NFS inode
1466 * and move the raw page into its mapping.
1467 */
1468static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
1445{ 1469{
1470 struct pagevec lru_pvec;
1471 struct page *page;
1472 char *kaddr;
1446 struct iattr attr; 1473 struct iattr attr;
1447 struct nfs_fattr sym_attr; 1474 unsigned int pathlen = strlen(symname);
1448 struct nfs_fh sym_fh;
1449 struct qstr qsymname;
1450 int error; 1475 int error;
1451 1476
1452 dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s)\n", dir->i_sb->s_id, 1477 dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s)\n", dir->i_sb->s_id,
1453 dir->i_ino, dentry->d_name.name, symname); 1478 dir->i_ino, dentry->d_name.name, symname);
1454 1479
1455#ifdef NFS_PARANOIA 1480 if (pathlen > PAGE_SIZE)
1456if (dentry->d_inode) 1481 return -ENAMETOOLONG;
1457printk("nfs_proc_symlink: %s/%s not negative!\n",
1458dentry->d_parent->d_name.name, dentry->d_name.name);
1459#endif
1460 /*
1461 * Fill in the sattr for the call.
1462 * Note: SunOS 4.1.2 crashes if the mode isn't initialized!
1463 */
1464 attr.ia_valid = ATTR_MODE;
1465 attr.ia_mode = S_IFLNK | S_IRWXUGO;
1466 1482
1467 qsymname.name = symname; 1483 attr.ia_mode = S_IFLNK | S_IRWXUGO;
1468 qsymname.len = strlen(symname); 1484 attr.ia_valid = ATTR_MODE;
1469 1485
1470 lock_kernel(); 1486 lock_kernel();
1487
1488 page = alloc_page(GFP_KERNEL);
1489 if (!page) {
1490 unlock_kernel();
1491 return -ENOMEM;
1492 }
1493
1494 kaddr = kmap_atomic(page, KM_USER0);
1495 memcpy(kaddr, symname, pathlen);
1496 if (pathlen < PAGE_SIZE)
1497 memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen);
1498 kunmap_atomic(kaddr, KM_USER0);
1499
1471 nfs_begin_data_update(dir); 1500 nfs_begin_data_update(dir);
1472 error = NFS_PROTO(dir)->symlink(dir, &dentry->d_name, &qsymname, 1501 error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr);
1473 &attr, &sym_fh, &sym_attr);
1474 nfs_end_data_update(dir); 1502 nfs_end_data_update(dir);
1475 if (!error) { 1503 if (error != 0) {
1476 error = nfs_instantiate(dentry, &sym_fh, &sym_attr); 1504 dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s) error %d\n",
1477 } else { 1505 dir->i_sb->s_id, dir->i_ino,
1478 if (error == -EEXIST) 1506 dentry->d_name.name, symname, error);
1479 printk("nfs_proc_symlink: %s/%s already exists??\n",
1480 dentry->d_parent->d_name.name, dentry->d_name.name);
1481 d_drop(dentry); 1507 d_drop(dentry);
1508 __free_page(page);
1509 unlock_kernel();
1510 return error;
1482 } 1511 }
1512
1513 /*
1514 * No big deal if we can't add this page to the page cache here.
1515 * READLINK will get the missing page from the server if needed.
1516 */
1517 pagevec_init(&lru_pvec, 0);
1518 if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0,
1519 GFP_KERNEL)) {
1520 if (!pagevec_add(&lru_pvec, page))
1521 __pagevec_lru_add(&lru_pvec);
1522 SetPageUptodate(page);
1523 unlock_page(page);
1524 } else
1525 __free_page(page);
1526
1483 unlock_kernel(); 1527 unlock_kernel();
1484 return error; 1528 return 0;
1485} 1529}
1486 1530
1487static int 1531static int
@@ -1625,8 +1669,7 @@ out:
1625 if (rehash) 1669 if (rehash)
1626 d_rehash(rehash); 1670 d_rehash(rehash);
1627 if (!error) { 1671 if (!error) {
1628 if (!S_ISDIR(old_inode->i_mode)) 1672 d_move(old_dentry, new_dentry);
1629 d_move(old_dentry, new_dentry);
1630 nfs_renew_times(new_dentry); 1673 nfs_renew_times(new_dentry);
1631 nfs_set_verifier(new_dentry, nfs_save_change_attribute(new_dir)); 1674 nfs_set_verifier(new_dentry, nfs_save_change_attribute(new_dir));
1632 } 1675 }
@@ -1638,35 +1681,211 @@ out:
1638 return error; 1681 return error;
1639} 1682}
1640 1683
1684static DEFINE_SPINLOCK(nfs_access_lru_lock);
1685static LIST_HEAD(nfs_access_lru_list);
1686static atomic_long_t nfs_access_nr_entries;
1687
1688static void nfs_access_free_entry(struct nfs_access_entry *entry)
1689{
1690 put_rpccred(entry->cred);
1691 kfree(entry);
1692 smp_mb__before_atomic_dec();
1693 atomic_long_dec(&nfs_access_nr_entries);
1694 smp_mb__after_atomic_dec();
1695}
1696
1697int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask)
1698{
1699 LIST_HEAD(head);
1700 struct nfs_inode *nfsi;
1701 struct nfs_access_entry *cache;
1702
1703 spin_lock(&nfs_access_lru_lock);
1704restart:
1705 list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) {
1706 struct inode *inode;
1707
1708 if (nr_to_scan-- == 0)
1709 break;
1710 inode = igrab(&nfsi->vfs_inode);
1711 if (inode == NULL)
1712 continue;
1713 spin_lock(&inode->i_lock);
1714 if (list_empty(&nfsi->access_cache_entry_lru))
1715 goto remove_lru_entry;
1716 cache = list_entry(nfsi->access_cache_entry_lru.next,
1717 struct nfs_access_entry, lru);
1718 list_move(&cache->lru, &head);
1719 rb_erase(&cache->rb_node, &nfsi->access_cache);
1720 if (!list_empty(&nfsi->access_cache_entry_lru))
1721 list_move_tail(&nfsi->access_cache_inode_lru,
1722 &nfs_access_lru_list);
1723 else {
1724remove_lru_entry:
1725 list_del_init(&nfsi->access_cache_inode_lru);
1726 clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags);
1727 }
1728 spin_unlock(&inode->i_lock);
1729 iput(inode);
1730 goto restart;
1731 }
1732 spin_unlock(&nfs_access_lru_lock);
1733 while (!list_empty(&head)) {
1734 cache = list_entry(head.next, struct nfs_access_entry, lru);
1735 list_del(&cache->lru);
1736 nfs_access_free_entry(cache);
1737 }
1738 return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure;
1739}
1740
1741static void __nfs_access_zap_cache(struct inode *inode)
1742{
1743 struct nfs_inode *nfsi = NFS_I(inode);
1744 struct rb_root *root_node = &nfsi->access_cache;
1745 struct rb_node *n, *dispose = NULL;
1746 struct nfs_access_entry *entry;
1747
1748 /* Unhook entries from the cache */
1749 while ((n = rb_first(root_node)) != NULL) {
1750 entry = rb_entry(n, struct nfs_access_entry, rb_node);
1751 rb_erase(n, root_node);
1752 list_del(&entry->lru);
1753 n->rb_left = dispose;
1754 dispose = n;
1755 }
1756 nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS;
1757 spin_unlock(&inode->i_lock);
1758
1759 /* Now kill them all! */
1760 while (dispose != NULL) {
1761 n = dispose;
1762 dispose = n->rb_left;
1763 nfs_access_free_entry(rb_entry(n, struct nfs_access_entry, rb_node));
1764 }
1765}
1766
1767void nfs_access_zap_cache(struct inode *inode)
1768{
1769 /* Remove from global LRU init */
1770 if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_FLAGS(inode))) {
1771 spin_lock(&nfs_access_lru_lock);
1772 list_del_init(&NFS_I(inode)->access_cache_inode_lru);
1773 spin_unlock(&nfs_access_lru_lock);
1774 }
1775
1776 spin_lock(&inode->i_lock);
1777 /* This will release the spinlock */
1778 __nfs_access_zap_cache(inode);
1779}
1780
1781static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred)
1782{
1783 struct rb_node *n = NFS_I(inode)->access_cache.rb_node;
1784 struct nfs_access_entry *entry;
1785
1786 while (n != NULL) {
1787 entry = rb_entry(n, struct nfs_access_entry, rb_node);
1788
1789 if (cred < entry->cred)
1790 n = n->rb_left;
1791 else if (cred > entry->cred)
1792 n = n->rb_right;
1793 else
1794 return entry;
1795 }
1796 return NULL;
1797}
1798
1641int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res) 1799int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
1642{ 1800{
1643 struct nfs_inode *nfsi = NFS_I(inode); 1801 struct nfs_inode *nfsi = NFS_I(inode);
1644 struct nfs_access_entry *cache = &nfsi->cache_access; 1802 struct nfs_access_entry *cache;
1803 int err = -ENOENT;
1645 1804
1646 if (cache->cred != cred 1805 spin_lock(&inode->i_lock);
1647 || time_after(jiffies, cache->jiffies + NFS_ATTRTIMEO(inode)) 1806 if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
1648 || (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)) 1807 goto out_zap;
1649 return -ENOENT; 1808 cache = nfs_access_search_rbtree(inode, cred);
1650 memcpy(res, cache, sizeof(*res)); 1809 if (cache == NULL)
1651 return 0; 1810 goto out;
1811 if (time_after(jiffies, cache->jiffies + NFS_ATTRTIMEO(inode)))
1812 goto out_stale;
1813 res->jiffies = cache->jiffies;
1814 res->cred = cache->cred;
1815 res->mask = cache->mask;
1816 list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru);
1817 err = 0;
1818out:
1819 spin_unlock(&inode->i_lock);
1820 return err;
1821out_stale:
1822 rb_erase(&cache->rb_node, &nfsi->access_cache);
1823 list_del(&cache->lru);
1824 spin_unlock(&inode->i_lock);
1825 nfs_access_free_entry(cache);
1826 return -ENOENT;
1827out_zap:
1828 /* This will release the spinlock */
1829 __nfs_access_zap_cache(inode);
1830 return -ENOENT;
1652} 1831}
1653 1832
1654void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) 1833static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set)
1655{ 1834{
1656 struct nfs_inode *nfsi = NFS_I(inode); 1835 struct nfs_inode *nfsi = NFS_I(inode);
1657 struct nfs_access_entry *cache = &nfsi->cache_access; 1836 struct rb_root *root_node = &nfsi->access_cache;
1837 struct rb_node **p = &root_node->rb_node;
1838 struct rb_node *parent = NULL;
1839 struct nfs_access_entry *entry;
1658 1840
1659 if (cache->cred != set->cred) {
1660 if (cache->cred)
1661 put_rpccred(cache->cred);
1662 cache->cred = get_rpccred(set->cred);
1663 }
1664 /* FIXME: replace current access_cache BKL reliance with inode->i_lock */
1665 spin_lock(&inode->i_lock); 1841 spin_lock(&inode->i_lock);
1666 nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS; 1842 while (*p != NULL) {
1843 parent = *p;
1844 entry = rb_entry(parent, struct nfs_access_entry, rb_node);
1845
1846 if (set->cred < entry->cred)
1847 p = &parent->rb_left;
1848 else if (set->cred > entry->cred)
1849 p = &parent->rb_right;
1850 else
1851 goto found;
1852 }
1853 rb_link_node(&set->rb_node, parent, p);
1854 rb_insert_color(&set->rb_node, root_node);
1855 list_add_tail(&set->lru, &nfsi->access_cache_entry_lru);
1667 spin_unlock(&inode->i_lock); 1856 spin_unlock(&inode->i_lock);
1857 return;
1858found:
1859 rb_replace_node(parent, &set->rb_node, root_node);
1860 list_add_tail(&set->lru, &nfsi->access_cache_entry_lru);
1861 list_del(&entry->lru);
1862 spin_unlock(&inode->i_lock);
1863 nfs_access_free_entry(entry);
1864}
1865
1866void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
1867{
1868 struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL);
1869 if (cache == NULL)
1870 return;
1871 RB_CLEAR_NODE(&cache->rb_node);
1668 cache->jiffies = set->jiffies; 1872 cache->jiffies = set->jiffies;
1873 cache->cred = get_rpccred(set->cred);
1669 cache->mask = set->mask; 1874 cache->mask = set->mask;
1875
1876 nfs_access_add_rbtree(inode, cache);
1877
1878 /* Update accounting */
1879 smp_mb__before_atomic_inc();
1880 atomic_long_inc(&nfs_access_nr_entries);
1881 smp_mb__after_atomic_inc();
1882
1883 /* Add inode to global LRU list */
1884 if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_FLAGS(inode))) {
1885 spin_lock(&nfs_access_lru_lock);
1886 list_add_tail(&NFS_I(inode)->access_cache_inode_lru, &nfs_access_lru_list);
1887 spin_unlock(&nfs_access_lru_lock);
1888 }
1670} 1889}
1671 1890
1672static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) 1891static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 76ca1cbc38f9..377839bed172 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -855,6 +855,5 @@ int __init nfs_init_directcache(void)
855 */ 855 */
856void nfs_destroy_directcache(void) 856void nfs_destroy_directcache(void)
857{ 857{
858 if (kmem_cache_destroy(nfs_direct_cachep)) 858 kmem_cache_destroy(nfs_direct_cachep);
859 printk(KERN_INFO "nfs_direct_cache: not all structures were freed\n");
860} 859}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 48e892880d5b..be997d649127 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -111,7 +111,7 @@ nfs_file_open(struct inode *inode, struct file *filp)
111 111
112 nfs_inc_stats(inode, NFSIOS_VFSOPEN); 112 nfs_inc_stats(inode, NFSIOS_VFSOPEN);
113 lock_kernel(); 113 lock_kernel();
114 res = NFS_SERVER(inode)->rpc_ops->file_open(inode, filp); 114 res = NFS_PROTO(inode)->file_open(inode, filp);
115 unlock_kernel(); 115 unlock_kernel();
116 return res; 116 return res;
117} 117}
@@ -157,7 +157,7 @@ force_reval:
157static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) 157static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
158{ 158{
159 /* origin == SEEK_END => we must revalidate the cached file length */ 159 /* origin == SEEK_END => we must revalidate the cached file length */
160 if (origin == 2) { 160 if (origin == SEEK_END) {
161 struct inode *inode = filp->f_mapping->host; 161 struct inode *inode = filp->f_mapping->host;
162 int retval = nfs_revalidate_file_size(inode, filp); 162 int retval = nfs_revalidate_file_size(inode, filp);
163 if (retval < 0) 163 if (retval < 0)
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
new file mode 100644
index 000000000000..76b08ae9ed82
--- /dev/null
+++ b/fs/nfs/getroot.c
@@ -0,0 +1,311 @@
1/* getroot.c: get the root dentry for an NFS mount
2 *
3 * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/config.h>
13#include <linux/module.h>
14#include <linux/init.h>
15
16#include <linux/time.h>
17#include <linux/kernel.h>
18#include <linux/mm.h>
19#include <linux/string.h>
20#include <linux/stat.h>
21#include <linux/errno.h>
22#include <linux/unistd.h>
23#include <linux/sunrpc/clnt.h>
24#include <linux/sunrpc/stats.h>
25#include <linux/nfs_fs.h>
26#include <linux/nfs_mount.h>
27#include <linux/nfs4_mount.h>
28#include <linux/lockd/bind.h>
29#include <linux/smp_lock.h>
30#include <linux/seq_file.h>
31#include <linux/mount.h>
32#include <linux/nfs_idmap.h>
33#include <linux/vfs.h>
34#include <linux/namei.h>
35#include <linux/namespace.h>
36#include <linux/security.h>
37
38#include <asm/system.h>
39#include <asm/uaccess.h>
40
41#include "nfs4_fs.h"
42#include "delegation.h"
43#include "internal.h"
44
45#define NFSDBG_FACILITY NFSDBG_CLIENT
46#define NFS_PARANOIA 1
47
48/*
49 * get an NFS2/NFS3 root dentry from the root filehandle
50 */
51struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
52{
53 struct nfs_server *server = NFS_SB(sb);
54 struct nfs_fsinfo fsinfo;
55 struct nfs_fattr fattr;
56 struct dentry *mntroot;
57 struct inode *inode;
58 int error;
59
60 /* create a dummy root dentry with dummy inode for this superblock */
61 if (!sb->s_root) {
62 struct nfs_fh dummyfh;
63 struct dentry *root;
64 struct inode *iroot;
65
66 memset(&dummyfh, 0, sizeof(dummyfh));
67 memset(&fattr, 0, sizeof(fattr));
68 nfs_fattr_init(&fattr);
69 fattr.valid = NFS_ATTR_FATTR;
70 fattr.type = NFDIR;
71 fattr.mode = S_IFDIR | S_IRUSR | S_IWUSR;
72 fattr.nlink = 2;
73
74 iroot = nfs_fhget(sb, &dummyfh, &fattr);
75 if (IS_ERR(iroot))
76 return ERR_PTR(PTR_ERR(iroot));
77
78 root = d_alloc_root(iroot);
79 if (!root) {
80 iput(iroot);
81 return ERR_PTR(-ENOMEM);
82 }
83
84 sb->s_root = root;
85 }
86
87 /* get the actual root for this mount */
88 fsinfo.fattr = &fattr;
89
90 error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);
91 if (error < 0) {
92 dprintk("nfs_get_root: getattr error = %d\n", -error);
93 return ERR_PTR(error);
94 }
95
96 inode = nfs_fhget(sb, mntfh, fsinfo.fattr);
97 if (IS_ERR(inode)) {
98 dprintk("nfs_get_root: get root inode failed\n");
99 return ERR_PTR(PTR_ERR(inode));
100 }
101
102 /* root dentries normally start off anonymous and get spliced in later
103 * if the dentry tree reaches them; however if the dentry already
104 * exists, we'll pick it up at this point and use it as the root
105 */
106 mntroot = d_alloc_anon(inode);
107 if (!mntroot) {
108 iput(inode);
109 dprintk("nfs_get_root: get root dentry failed\n");
110 return ERR_PTR(-ENOMEM);
111 }
112
113 security_d_instantiate(mntroot, inode);
114
115 if (!mntroot->d_op)
116 mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops;
117
118 return mntroot;
119}
120
121#ifdef CONFIG_NFS_V4
122
123/*
124 * Do a simple pathwalk from the root FH of the server to the nominated target
125 * of the mountpoint
126 * - give error on symlinks
127 * - give error on ".." occurring in the path
128 * - follow traversals
129 */
130int nfs4_path_walk(struct nfs_server *server,
131 struct nfs_fh *mntfh,
132 const char *path)
133{
134 struct nfs_fsinfo fsinfo;
135 struct nfs_fattr fattr;
136 struct nfs_fh lastfh;
137 struct qstr name;
138 int ret;
139 //int referral_count = 0;
140
141 dprintk("--> nfs4_path_walk(,,%s)\n", path);
142
143 fsinfo.fattr = &fattr;
144 nfs_fattr_init(&fattr);
145
146 if (*path++ != '/') {
147 dprintk("nfs4_get_root: Path does not begin with a slash\n");
148 return -EINVAL;
149 }
150
151 /* Start by getting the root filehandle from the server */
152 ret = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);
153 if (ret < 0) {
154 dprintk("nfs4_get_root: getroot error = %d\n", -ret);
155 return ret;
156 }
157
158 if (fattr.type != NFDIR) {
159 printk(KERN_ERR "nfs4_get_root:"
160 " getroot encountered non-directory\n");
161 return -ENOTDIR;
162 }
163
164 if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) {
165 printk(KERN_ERR "nfs4_get_root:"
166 " getroot obtained referral\n");
167 return -EREMOTE;
168 }
169
170next_component:
171 dprintk("Next: %s\n", path);
172
173 /* extract the next bit of the path */
174 if (!*path)
175 goto path_walk_complete;
176
177 name.name = path;
178 while (*path && *path != '/')
179 path++;
180 name.len = path - (const char *) name.name;
181
182eat_dot_dir:
183 while (*path == '/')
184 path++;
185
186 if (path[0] == '.' && (path[1] == '/' || !path[1])) {
187 path += 2;
188 goto eat_dot_dir;
189 }
190
191 if (path[0] == '.' && path[1] == '.' && (path[2] == '/' || !path[2])
192 ) {
193 printk(KERN_ERR "nfs4_get_root:"
194 " Mount path contains reference to \"..\"\n");
195 return -EINVAL;
196 }
197
198 /* lookup the next FH in the sequence */
199 memcpy(&lastfh, mntfh, sizeof(lastfh));
200
201 dprintk("LookupFH: %*.*s [%s]\n", name.len, name.len, name.name, path);
202
203 ret = server->nfs_client->rpc_ops->lookupfh(server, &lastfh, &name,
204 mntfh, &fattr);
205 if (ret < 0) {
206 dprintk("nfs4_get_root: getroot error = %d\n", -ret);
207 return ret;
208 }
209
210 if (fattr.type != NFDIR) {
211 printk(KERN_ERR "nfs4_get_root:"
212 " lookupfh encountered non-directory\n");
213 return -ENOTDIR;
214 }
215
216 if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) {
217 printk(KERN_ERR "nfs4_get_root:"
218 " lookupfh obtained referral\n");
219 return -EREMOTE;
220 }
221
222 goto next_component;
223
224path_walk_complete:
225 memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid));
226 dprintk("<-- nfs4_path_walk() = 0\n");
227 return 0;
228}
229
230/*
231 * get an NFS4 root dentry from the root filehandle
232 */
233struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
234{
235 struct nfs_server *server = NFS_SB(sb);
236 struct nfs_fattr fattr;
237 struct dentry *mntroot;
238 struct inode *inode;
239 int error;
240
241 dprintk("--> nfs4_get_root()\n");
242
243 /* create a dummy root dentry with dummy inode for this superblock */
244 if (!sb->s_root) {
245 struct nfs_fh dummyfh;
246 struct dentry *root;
247 struct inode *iroot;
248
249 memset(&dummyfh, 0, sizeof(dummyfh));
250 memset(&fattr, 0, sizeof(fattr));
251 nfs_fattr_init(&fattr);
252 fattr.valid = NFS_ATTR_FATTR;
253 fattr.type = NFDIR;
254 fattr.mode = S_IFDIR | S_IRUSR | S_IWUSR;
255 fattr.nlink = 2;
256
257 iroot = nfs_fhget(sb, &dummyfh, &fattr);
258 if (IS_ERR(iroot))
259 return ERR_PTR(PTR_ERR(iroot));
260
261 root = d_alloc_root(iroot);
262 if (!root) {
263 iput(iroot);
264 return ERR_PTR(-ENOMEM);
265 }
266
267 sb->s_root = root;
268 }
269
270 /* get the info about the server and filesystem */
271 error = nfs4_server_capabilities(server, mntfh);
272 if (error < 0) {
273 dprintk("nfs_get_root: getcaps error = %d\n",
274 -error);
275 return ERR_PTR(error);
276 }
277
278 /* get the actual root for this mount */
279 error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr);
280 if (error < 0) {
281 dprintk("nfs_get_root: getattr error = %d\n", -error);
282 return ERR_PTR(error);
283 }
284
285 inode = nfs_fhget(sb, mntfh, &fattr);
286 if (IS_ERR(inode)) {
287 dprintk("nfs_get_root: get root inode failed\n");
288 return ERR_PTR(PTR_ERR(inode));
289 }
290
291 /* root dentries normally start off anonymous and get spliced in later
292 * if the dentry tree reaches them; however if the dentry already
293 * exists, we'll pick it up at this point and use it as the root
294 */
295 mntroot = d_alloc_anon(inode);
296 if (!mntroot) {
297 iput(inode);
298 dprintk("nfs_get_root: get root dentry failed\n");
299 return ERR_PTR(-ENOMEM);
300 }
301
302 security_d_instantiate(mntroot, inode);
303
304 if (!mntroot->d_op)
305 mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops;
306
307 dprintk("<-- nfs4_get_root()\n");
308 return mntroot;
309}
310
311#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 07a5dd57646e..82ad7110a1c0 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -57,6 +57,20 @@
57/* Default cache timeout is 10 minutes */ 57/* Default cache timeout is 10 minutes */
58unsigned int nfs_idmap_cache_timeout = 600 * HZ; 58unsigned int nfs_idmap_cache_timeout = 600 * HZ;
59 59
60static int param_set_idmap_timeout(const char *val, struct kernel_param *kp)
61{
62 char *endp;
63 int num = simple_strtol(val, &endp, 0);
64 int jif = num * HZ;
65 if (endp == val || *endp || num < 0 || jif < num)
66 return -EINVAL;
67 *((int *)kp->arg) = jif;
68 return 0;
69}
70
71module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int,
72 &nfs_idmap_cache_timeout, 0644);
73
60struct idmap_hashent { 74struct idmap_hashent {
61 unsigned long ih_expires; 75 unsigned long ih_expires;
62 __u32 ih_id; 76 __u32 ih_id;
@@ -70,7 +84,6 @@ struct idmap_hashtable {
70}; 84};
71 85
72struct idmap { 86struct idmap {
73 char idmap_path[48];
74 struct dentry *idmap_dentry; 87 struct dentry *idmap_dentry;
75 wait_queue_head_t idmap_wq; 88 wait_queue_head_t idmap_wq;
76 struct idmap_msg idmap_im; 89 struct idmap_msg idmap_im;
@@ -94,24 +107,23 @@ static struct rpc_pipe_ops idmap_upcall_ops = {
94 .destroy_msg = idmap_pipe_destroy_msg, 107 .destroy_msg = idmap_pipe_destroy_msg,
95}; 108};
96 109
97void 110int
98nfs_idmap_new(struct nfs4_client *clp) 111nfs_idmap_new(struct nfs_client *clp)
99{ 112{
100 struct idmap *idmap; 113 struct idmap *idmap;
114 int error;
101 115
102 if (clp->cl_idmap != NULL) 116 BUG_ON(clp->cl_idmap != NULL);
103 return;
104 if ((idmap = kzalloc(sizeof(*idmap), GFP_KERNEL)) == NULL)
105 return;
106 117
107 snprintf(idmap->idmap_path, sizeof(idmap->idmap_path), 118 if ((idmap = kzalloc(sizeof(*idmap), GFP_KERNEL)) == NULL)
108 "%s/idmap", clp->cl_rpcclient->cl_pathname); 119 return -ENOMEM;
109 120
110 idmap->idmap_dentry = rpc_mkpipe(idmap->idmap_path, 121 idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_dentry, "idmap",
111 idmap, &idmap_upcall_ops, 0); 122 idmap, &idmap_upcall_ops, 0);
112 if (IS_ERR(idmap->idmap_dentry)) { 123 if (IS_ERR(idmap->idmap_dentry)) {
124 error = PTR_ERR(idmap->idmap_dentry);
113 kfree(idmap); 125 kfree(idmap);
114 return; 126 return error;
115 } 127 }
116 128
117 mutex_init(&idmap->idmap_lock); 129 mutex_init(&idmap->idmap_lock);
@@ -121,10 +133,11 @@ nfs_idmap_new(struct nfs4_client *clp)
121 idmap->idmap_group_hash.h_type = IDMAP_TYPE_GROUP; 133 idmap->idmap_group_hash.h_type = IDMAP_TYPE_GROUP;
122 134
123 clp->cl_idmap = idmap; 135 clp->cl_idmap = idmap;
136 return 0;
124} 137}
125 138
126void 139void
127nfs_idmap_delete(struct nfs4_client *clp) 140nfs_idmap_delete(struct nfs_client *clp)
128{ 141{
129 struct idmap *idmap = clp->cl_idmap; 142 struct idmap *idmap = clp->cl_idmap;
130 143
@@ -477,27 +490,27 @@ static unsigned int fnvhash32(const void *buf, size_t buflen)
477 return (hash); 490 return (hash);
478} 491}
479 492
480int nfs_map_name_to_uid(struct nfs4_client *clp, const char *name, size_t namelen, __u32 *uid) 493int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid)
481{ 494{
482 struct idmap *idmap = clp->cl_idmap; 495 struct idmap *idmap = clp->cl_idmap;
483 496
484 return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid); 497 return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid);
485} 498}
486 499
487int nfs_map_group_to_gid(struct nfs4_client *clp, const char *name, size_t namelen, __u32 *uid) 500int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid)
488{ 501{
489 struct idmap *idmap = clp->cl_idmap; 502 struct idmap *idmap = clp->cl_idmap;
490 503
491 return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid); 504 return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid);
492} 505}
493 506
494int nfs_map_uid_to_name(struct nfs4_client *clp, __u32 uid, char *buf) 507int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf)
495{ 508{
496 struct idmap *idmap = clp->cl_idmap; 509 struct idmap *idmap = clp->cl_idmap;
497 510
498 return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); 511 return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf);
499} 512}
500int nfs_map_gid_to_group(struct nfs4_client *clp, __u32 uid, char *buf) 513int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf)
501{ 514{
502 struct idmap *idmap = clp->cl_idmap; 515 struct idmap *idmap = clp->cl_idmap;
503 516
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index d349fb2245da..bc9376ca86cd 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -76,19 +76,14 @@ int nfs_write_inode(struct inode *inode, int sync)
76 76
77void nfs_clear_inode(struct inode *inode) 77void nfs_clear_inode(struct inode *inode)
78{ 78{
79 struct nfs_inode *nfsi = NFS_I(inode);
80 struct rpc_cred *cred;
81
82 /* 79 /*
83 * The following should never happen... 80 * The following should never happen...
84 */ 81 */
85 BUG_ON(nfs_have_writebacks(inode)); 82 BUG_ON(nfs_have_writebacks(inode));
86 BUG_ON (!list_empty(&nfsi->open_files)); 83 BUG_ON(!list_empty(&NFS_I(inode)->open_files));
84 BUG_ON(atomic_read(&NFS_I(inode)->data_updates) != 0);
87 nfs_zap_acl_cache(inode); 85 nfs_zap_acl_cache(inode);
88 cred = nfsi->cache_access.cred; 86 nfs_access_zap_cache(inode);
89 if (cred)
90 put_rpccred(cred);
91 BUG_ON(atomic_read(&nfsi->data_updates) != 0);
92} 87}
93 88
94/** 89/**
@@ -242,13 +237,13 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
242 /* Why so? Because we want revalidate for devices/FIFOs, and 237 /* Why so? Because we want revalidate for devices/FIFOs, and
243 * that's precisely what we have in nfs_file_inode_operations. 238 * that's precisely what we have in nfs_file_inode_operations.
244 */ 239 */
245 inode->i_op = NFS_SB(sb)->rpc_ops->file_inode_ops; 240 inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops;
246 if (S_ISREG(inode->i_mode)) { 241 if (S_ISREG(inode->i_mode)) {
247 inode->i_fop = &nfs_file_operations; 242 inode->i_fop = &nfs_file_operations;
248 inode->i_data.a_ops = &nfs_file_aops; 243 inode->i_data.a_ops = &nfs_file_aops;
249 inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info; 244 inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info;
250 } else if (S_ISDIR(inode->i_mode)) { 245 } else if (S_ISDIR(inode->i_mode)) {
251 inode->i_op = NFS_SB(sb)->rpc_ops->dir_inode_ops; 246 inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
252 inode->i_fop = &nfs_dir_operations; 247 inode->i_fop = &nfs_dir_operations;
253 if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS) 248 if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)
254 && fattr->size <= NFS_LIMIT_READDIRPLUS) 249 && fattr->size <= NFS_LIMIT_READDIRPLUS)
@@ -282,15 +277,13 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
282 * report the blocks in 512byte units 277 * report the blocks in 512byte units
283 */ 278 */
284 inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); 279 inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
285 inode->i_blksize = inode->i_sb->s_blocksize;
286 } else { 280 } else {
287 inode->i_blocks = fattr->du.nfs2.blocks; 281 inode->i_blocks = fattr->du.nfs2.blocks;
288 inode->i_blksize = fattr->du.nfs2.blocksize;
289 } 282 }
290 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); 283 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
291 nfsi->attrtimeo_timestamp = jiffies; 284 nfsi->attrtimeo_timestamp = jiffies;
292 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); 285 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
293 nfsi->cache_access.cred = NULL; 286 nfsi->access_cache = RB_ROOT;
294 287
295 unlock_new_inode(inode); 288 unlock_new_inode(inode);
296 } else 289 } else
@@ -448,7 +441,7 @@ static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, str
448{ 441{
449 struct nfs_open_context *ctx; 442 struct nfs_open_context *ctx;
450 443
451 ctx = (struct nfs_open_context *)kmalloc(sizeof(*ctx), GFP_KERNEL); 444 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
452 if (ctx != NULL) { 445 if (ctx != NULL) {
453 atomic_set(&ctx->count, 1); 446 atomic_set(&ctx->count, 1);
454 ctx->dentry = dget(dentry); 447 ctx->dentry = dget(dentry);
@@ -722,13 +715,11 @@ void nfs_end_data_update(struct inode *inode)
722{ 715{
723 struct nfs_inode *nfsi = NFS_I(inode); 716 struct nfs_inode *nfsi = NFS_I(inode);
724 717
725 if (!nfs_have_delegation(inode, FMODE_READ)) { 718 /* Directories: invalidate page cache */
726 /* Directories and symlinks: invalidate page cache */ 719 if (S_ISDIR(inode->i_mode)) {
727 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { 720 spin_lock(&inode->i_lock);
728 spin_lock(&inode->i_lock); 721 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
729 nfsi->cache_validity |= NFS_INO_INVALID_DATA; 722 spin_unlock(&inode->i_lock);
730 spin_unlock(&inode->i_lock);
731 }
732 } 723 }
733 nfsi->cache_change_attribute = jiffies; 724 nfsi->cache_change_attribute = jiffies;
734 atomic_dec(&nfsi->data_updates); 725 atomic_dec(&nfsi->data_updates);
@@ -847,6 +838,12 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
847 * 838 *
848 * After an operation that has changed the inode metadata, mark the 839 * After an operation that has changed the inode metadata, mark the
849 * attribute cache as being invalid, then try to update it. 840 * attribute cache as being invalid, then try to update it.
841 *
842 * NB: if the server didn't return any post op attributes, this
843 * function will force the retrieval of attributes before the next
844 * NFS request. Thus it should be used only for operations that
845 * are expected to change one or more attributes, to avoid
846 * unnecessary NFS requests and trips through nfs_update_inode().
850 */ 847 */
851int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) 848int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
852{ 849{
@@ -970,10 +967,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
970 * report the blocks in 512byte units 967 * report the blocks in 512byte units
971 */ 968 */
972 inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); 969 inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
973 inode->i_blksize = inode->i_sb->s_blocksize;
974 } else { 970 } else {
975 inode->i_blocks = fattr->du.nfs2.blocks; 971 inode->i_blocks = fattr->du.nfs2.blocks;
976 inode->i_blksize = fattr->du.nfs2.blocksize;
977 } 972 }
978 973
979 if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && 974 if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
@@ -1025,7 +1020,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1025 out_fileid: 1020 out_fileid:
1026 printk(KERN_ERR "NFS: server %s error: fileid changed\n" 1021 printk(KERN_ERR "NFS: server %s error: fileid changed\n"
1027 "fsid %s: expected fileid 0x%Lx, got 0x%Lx\n", 1022 "fsid %s: expected fileid 0x%Lx, got 0x%Lx\n",
1028 NFS_SERVER(inode)->hostname, inode->i_sb->s_id, 1023 NFS_SERVER(inode)->nfs_client->cl_hostname, inode->i_sb->s_id,
1029 (long long)nfsi->fileid, (long long)fattr->fileid); 1024 (long long)nfsi->fileid, (long long)fattr->fileid);
1030 goto out_err; 1025 goto out_err;
1031} 1026}
@@ -1109,6 +1104,8 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
1109 INIT_LIST_HEAD(&nfsi->dirty); 1104 INIT_LIST_HEAD(&nfsi->dirty);
1110 INIT_LIST_HEAD(&nfsi->commit); 1105 INIT_LIST_HEAD(&nfsi->commit);
1111 INIT_LIST_HEAD(&nfsi->open_files); 1106 INIT_LIST_HEAD(&nfsi->open_files);
1107 INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
1108 INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
1112 INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); 1109 INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
1113 atomic_set(&nfsi->data_updates, 0); 1110 atomic_set(&nfsi->data_updates, 0);
1114 nfsi->ndirty = 0; 1111 nfsi->ndirty = 0;
@@ -1133,8 +1130,7 @@ static int __init nfs_init_inodecache(void)
1133 1130
1134static void nfs_destroy_inodecache(void) 1131static void nfs_destroy_inodecache(void)
1135{ 1132{
1136 if (kmem_cache_destroy(nfs_inode_cachep)) 1133 kmem_cache_destroy(nfs_inode_cachep);
1137 printk(KERN_INFO "nfs_inode_cache: not all structures were freed\n");
1138} 1134}
1139 1135
1140/* 1136/*
@@ -1144,6 +1140,10 @@ static int __init init_nfs_fs(void)
1144{ 1140{
1145 int err; 1141 int err;
1146 1142
1143 err = nfs_fs_proc_init();
1144 if (err)
1145 goto out5;
1146
1147 err = nfs_init_nfspagecache(); 1147 err = nfs_init_nfspagecache();
1148 if (err) 1148 if (err)
1149 goto out4; 1149 goto out4;
@@ -1184,6 +1184,8 @@ out2:
1184out3: 1184out3:
1185 nfs_destroy_nfspagecache(); 1185 nfs_destroy_nfspagecache();
1186out4: 1186out4:
1187 nfs_fs_proc_exit();
1188out5:
1187 return err; 1189 return err;
1188} 1190}
1189 1191
@@ -1198,6 +1200,7 @@ static void __exit exit_nfs_fs(void)
1198 rpc_proc_unregister("nfs"); 1200 rpc_proc_unregister("nfs");
1199#endif 1201#endif
1200 unregister_nfs_fs(); 1202 unregister_nfs_fs();
1203 nfs_fs_proc_exit();
1201} 1204}
1202 1205
1203/* Not quite true; I just maintain it */ 1206/* Not quite true; I just maintain it */
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index e4f4e5def0fc..bea0b016bd70 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -4,6 +4,18 @@
4 4
5#include <linux/mount.h> 5#include <linux/mount.h>
6 6
7struct nfs_string;
8struct nfs_mount_data;
9struct nfs4_mount_data;
10
11/* Maximum number of readahead requests
12 * FIXME: this should really be a sysctl so that users may tune it to suit
13 * their needs. People that do NFS over a slow network, might for
14 * instance want to reduce it to something closer to 1 for improved
15 * interactive response.
16 */
17#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1)
18
7struct nfs_clone_mount { 19struct nfs_clone_mount {
8 const struct super_block *sb; 20 const struct super_block *sb;
9 const struct dentry *dentry; 21 const struct dentry *dentry;
@@ -15,7 +27,40 @@ struct nfs_clone_mount {
15 rpc_authflavor_t authflavor; 27 rpc_authflavor_t authflavor;
16}; 28};
17 29
18/* namespace-nfs4.c */ 30/* client.c */
31extern struct rpc_program nfs_program;
32
33extern void nfs_put_client(struct nfs_client *);
34extern struct nfs_client *nfs_find_client(const struct sockaddr_in *, int);
35extern struct nfs_server *nfs_create_server(const struct nfs_mount_data *,
36 struct nfs_fh *);
37extern struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *,
38 const char *,
39 const struct sockaddr_in *,
40 const char *,
41 const char *,
42 rpc_authflavor_t,
43 struct nfs_fh *);
44extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *,
45 struct nfs_fh *);
46extern void nfs_free_server(struct nfs_server *server);
47extern struct nfs_server *nfs_clone_server(struct nfs_server *,
48 struct nfs_fh *,
49 struct nfs_fattr *);
50#ifdef CONFIG_PROC_FS
51extern int __init nfs_fs_proc_init(void);
52extern void nfs_fs_proc_exit(void);
53#else
54static inline int nfs_fs_proc_init(void)
55{
56 return 0;
57}
58static inline void nfs_fs_proc_exit(void)
59{
60}
61#endif
62
63/* nfs4namespace.c */
19#ifdef CONFIG_NFS_V4 64#ifdef CONFIG_NFS_V4
20extern struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry); 65extern struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry);
21#else 66#else
@@ -46,6 +91,7 @@ extern void nfs_destroy_directcache(void);
46#endif 91#endif
47 92
48/* nfs2xdr.c */ 93/* nfs2xdr.c */
94extern int nfs_stat_to_errno(int);
49extern struct rpc_procinfo nfs_procedures[]; 95extern struct rpc_procinfo nfs_procedures[];
50extern u32 * nfs_decode_dirent(u32 *, struct nfs_entry *, int); 96extern u32 * nfs_decode_dirent(u32 *, struct nfs_entry *, int);
51 97
@@ -54,8 +100,9 @@ extern struct rpc_procinfo nfs3_procedures[];
54extern u32 *nfs3_decode_dirent(u32 *, struct nfs_entry *, int); 100extern u32 *nfs3_decode_dirent(u32 *, struct nfs_entry *, int);
55 101
56/* nfs4xdr.c */ 102/* nfs4xdr.c */
57extern int nfs_stat_to_errno(int); 103#ifdef CONFIG_NFS_V4
58extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus); 104extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus);
105#endif
59 106
60/* nfs4proc.c */ 107/* nfs4proc.c */
61#ifdef CONFIG_NFS_V4 108#ifdef CONFIG_NFS_V4
@@ -66,6 +113,9 @@ extern int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
66 struct page *page); 113 struct page *page);
67#endif 114#endif
68 115
116/* dir.c */
117extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
118
69/* inode.c */ 119/* inode.c */
70extern struct inode *nfs_alloc_inode(struct super_block *sb); 120extern struct inode *nfs_alloc_inode(struct super_block *sb);
71extern void nfs_destroy_inode(struct inode *); 121extern void nfs_destroy_inode(struct inode *);
@@ -76,10 +126,10 @@ extern void nfs4_clear_inode(struct inode *);
76#endif 126#endif
77 127
78/* super.c */ 128/* super.c */
79extern struct file_system_type nfs_referral_nfs4_fs_type; 129extern struct file_system_type nfs_xdev_fs_type;
80extern struct file_system_type clone_nfs_fs_type;
81#ifdef CONFIG_NFS_V4 130#ifdef CONFIG_NFS_V4
82extern struct file_system_type clone_nfs4_fs_type; 131extern struct file_system_type nfs4_xdev_fs_type;
132extern struct file_system_type nfs4_referral_fs_type;
83#endif 133#endif
84 134
85extern struct rpc_stat nfs_rpcstat; 135extern struct rpc_stat nfs_rpcstat;
@@ -88,30 +138,30 @@ extern int __init register_nfs_fs(void);
88extern void __exit unregister_nfs_fs(void); 138extern void __exit unregister_nfs_fs(void);
89 139
90/* namespace.c */ 140/* namespace.c */
91extern char *nfs_path(const char *base, const struct dentry *dentry, 141extern char *nfs_path(const char *base,
142 const struct dentry *droot,
143 const struct dentry *dentry,
92 char *buffer, ssize_t buflen); 144 char *buffer, ssize_t buflen);
93 145
94/* 146/* getroot.c */
95 * Determine the mount path as a string 147extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *);
96 */
97static inline char *
98nfs4_path(const struct dentry *dentry, char *buffer, ssize_t buflen)
99{
100#ifdef CONFIG_NFS_V4 148#ifdef CONFIG_NFS_V4
101 return nfs_path(NFS_SB(dentry->d_sb)->mnt_path, dentry, buffer, buflen); 149extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *);
102#else 150
103 return NULL; 151extern int nfs4_path_walk(struct nfs_server *server,
152 struct nfs_fh *mntfh,
153 const char *path);
104#endif 154#endif
105}
106 155
107/* 156/*
108 * Determine the device name as a string 157 * Determine the device name as a string
109 */ 158 */
110static inline char *nfs_devname(const struct vfsmount *mnt_parent, 159static inline char *nfs_devname(const struct vfsmount *mnt_parent,
111 const struct dentry *dentry, 160 const struct dentry *dentry,
112 char *buffer, ssize_t buflen) 161 char *buffer, ssize_t buflen)
113{ 162{
114 return nfs_path(mnt_parent->mnt_devname, dentry, buffer, buflen); 163 return nfs_path(mnt_parent->mnt_devname, mnt_parent->mnt_root,
164 dentry, buffer, buflen);
115} 165}
116 166
117/* 167/*
@@ -167,20 +217,3 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
167 if (sb->s_maxbytes > MAX_LFS_FILESIZE || sb->s_maxbytes <= 0) 217 if (sb->s_maxbytes > MAX_LFS_FILESIZE || sb->s_maxbytes <= 0)
168 sb->s_maxbytes = MAX_LFS_FILESIZE; 218 sb->s_maxbytes = MAX_LFS_FILESIZE;
169} 219}
170
171/*
172 * Check if the string represents a "valid" IPv4 address
173 */
174static inline int valid_ipaddr4(const char *buf)
175{
176 int rc, count, in[4];
177
178 rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]);
179 if (rc != 4)
180 return -EINVAL;
181 for (count = 0; count < 4; count++) {
182 if (in[count] > 255)
183 return -EINVAL;
184 }
185 return 0;
186}
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 445abb4d4214..d507b021207f 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -14,7 +14,6 @@
14#include <linux/net.h> 14#include <linux/net.h>
15#include <linux/in.h> 15#include <linux/in.h>
16#include <linux/sunrpc/clnt.h> 16#include <linux/sunrpc/clnt.h>
17#include <linux/sunrpc/xprt.h>
18#include <linux/sunrpc/sched.h> 17#include <linux/sunrpc/sched.h>
19#include <linux/nfs_fs.h> 18#include <linux/nfs_fs.h>
20 19
@@ -77,22 +76,19 @@ static struct rpc_clnt *
77mnt_create(char *hostname, struct sockaddr_in *srvaddr, int version, 76mnt_create(char *hostname, struct sockaddr_in *srvaddr, int version,
78 int protocol) 77 int protocol)
79{ 78{
80 struct rpc_xprt *xprt; 79 struct rpc_create_args args = {
81 struct rpc_clnt *clnt; 80 .protocol = protocol,
82 81 .address = (struct sockaddr *)srvaddr,
83 xprt = xprt_create_proto(protocol, srvaddr, NULL); 82 .addrsize = sizeof(*srvaddr),
84 if (IS_ERR(xprt)) 83 .servername = hostname,
85 return (struct rpc_clnt *)xprt; 84 .program = &mnt_program,
86 85 .version = version,
87 clnt = rpc_create_client(xprt, hostname, 86 .authflavor = RPC_AUTH_UNIX,
88 &mnt_program, version, 87 .flags = (RPC_CLNT_CREATE_ONESHOT |
89 RPC_AUTH_UNIX); 88 RPC_CLNT_CREATE_INTR),
90 if (!IS_ERR(clnt)) { 89 };
91 clnt->cl_softrtry = 1; 90
92 clnt->cl_oneshot = 1; 91 return rpc_create(&args);
93 clnt->cl_intr = 1;
94 }
95 return clnt;
96} 92}
97 93
98/* 94/*
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 86b3169c8cac..60408646176b 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -2,6 +2,7 @@
2 * linux/fs/nfs/namespace.c 2 * linux/fs/nfs/namespace.c
3 * 3 *
4 * Copyright (C) 2005 Trond Myklebust <Trond.Myklebust@netapp.com> 4 * Copyright (C) 2005 Trond Myklebust <Trond.Myklebust@netapp.com>
5 * - Modified by David Howells <dhowells@redhat.com>
5 * 6 *
6 * NFS namespace 7 * NFS namespace
7 */ 8 */
@@ -25,9 +26,15 @@ LIST_HEAD(nfs_automount_list);
25static DECLARE_WORK(nfs_automount_task, nfs_expire_automounts, &nfs_automount_list); 26static DECLARE_WORK(nfs_automount_task, nfs_expire_automounts, &nfs_automount_list);
26int nfs_mountpoint_expiry_timeout = 500 * HZ; 27int nfs_mountpoint_expiry_timeout = 500 * HZ;
27 28
29static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
30 const struct dentry *dentry,
31 struct nfs_fh *fh,
32 struct nfs_fattr *fattr);
33
28/* 34/*
29 * nfs_path - reconstruct the path given an arbitrary dentry 35 * nfs_path - reconstruct the path given an arbitrary dentry
30 * @base - arbitrary string to prepend to the path 36 * @base - arbitrary string to prepend to the path
37 * @droot - pointer to root dentry for mountpoint
31 * @dentry - pointer to dentry 38 * @dentry - pointer to dentry
32 * @buffer - result buffer 39 * @buffer - result buffer
33 * @buflen - length of buffer 40 * @buflen - length of buffer
@@ -38,7 +45,9 @@ int nfs_mountpoint_expiry_timeout = 500 * HZ;
38 * This is mainly for use in figuring out the path on the 45 * This is mainly for use in figuring out the path on the
39 * server side when automounting on top of an existing partition. 46 * server side when automounting on top of an existing partition.
40 */ 47 */
41char *nfs_path(const char *base, const struct dentry *dentry, 48char *nfs_path(const char *base,
49 const struct dentry *droot,
50 const struct dentry *dentry,
42 char *buffer, ssize_t buflen) 51 char *buffer, ssize_t buflen)
43{ 52{
44 char *end = buffer+buflen; 53 char *end = buffer+buflen;
@@ -47,7 +56,7 @@ char *nfs_path(const char *base, const struct dentry *dentry,
47 *--end = '\0'; 56 *--end = '\0';
48 buflen--; 57 buflen--;
49 spin_lock(&dcache_lock); 58 spin_lock(&dcache_lock);
50 while (!IS_ROOT(dentry)) { 59 while (!IS_ROOT(dentry) && dentry != droot) {
51 namelen = dentry->d_name.len; 60 namelen = dentry->d_name.len;
52 buflen -= namelen + 1; 61 buflen -= namelen + 1;
53 if (buflen < 0) 62 if (buflen < 0)
@@ -96,15 +105,18 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
96 struct nfs_fattr fattr; 105 struct nfs_fattr fattr;
97 int err; 106 int err;
98 107
108 dprintk("--> nfs_follow_mountpoint()\n");
109
99 BUG_ON(IS_ROOT(dentry)); 110 BUG_ON(IS_ROOT(dentry));
100 dprintk("%s: enter\n", __FUNCTION__); 111 dprintk("%s: enter\n", __FUNCTION__);
101 dput(nd->dentry); 112 dput(nd->dentry);
102 nd->dentry = dget(dentry); 113 nd->dentry = dget(dentry);
103 if (d_mountpoint(nd->dentry)) 114
104 goto out_follow;
105 /* Look it up again */ 115 /* Look it up again */
106 parent = dget_parent(nd->dentry); 116 parent = dget_parent(nd->dentry);
107 err = server->rpc_ops->lookup(parent->d_inode, &nd->dentry->d_name, &fh, &fattr); 117 err = server->nfs_client->rpc_ops->lookup(parent->d_inode,
118 &nd->dentry->d_name,
119 &fh, &fattr);
108 dput(parent); 120 dput(parent);
109 if (err != 0) 121 if (err != 0)
110 goto out_err; 122 goto out_err;
@@ -132,6 +144,8 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
132 schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); 144 schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
133out: 145out:
134 dprintk("%s: done, returned %d\n", __FUNCTION__, err); 146 dprintk("%s: done, returned %d\n", __FUNCTION__, err);
147
148 dprintk("<-- nfs_follow_mountpoint() = %d\n", err);
135 return ERR_PTR(err); 149 return ERR_PTR(err);
136out_err: 150out_err:
137 path_release(nd); 151 path_release(nd);
@@ -172,22 +186,23 @@ void nfs_release_automount_timer(void)
172/* 186/*
173 * Clone a mountpoint of the appropriate type 187 * Clone a mountpoint of the appropriate type
174 */ 188 */
175static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, char *devname, 189static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
190 const char *devname,
176 struct nfs_clone_mount *mountdata) 191 struct nfs_clone_mount *mountdata)
177{ 192{
178#ifdef CONFIG_NFS_V4 193#ifdef CONFIG_NFS_V4
179 struct vfsmount *mnt = NULL; 194 struct vfsmount *mnt = NULL;
180 switch (server->rpc_ops->version) { 195 switch (server->nfs_client->cl_nfsversion) {
181 case 2: 196 case 2:
182 case 3: 197 case 3:
183 mnt = vfs_kern_mount(&clone_nfs_fs_type, 0, devname, mountdata); 198 mnt = vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata);
184 break; 199 break;
185 case 4: 200 case 4:
186 mnt = vfs_kern_mount(&clone_nfs4_fs_type, 0, devname, mountdata); 201 mnt = vfs_kern_mount(&nfs4_xdev_fs_type, 0, devname, mountdata);
187 } 202 }
188 return mnt; 203 return mnt;
189#else 204#else
190 return vfs_kern_mount(&clone_nfs_fs_type, 0, devname, mountdata); 205 return vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata);
191#endif 206#endif
192} 207}
193 208
@@ -199,9 +214,10 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, char *devn
199 * @fattr - attributes for new root inode 214 * @fattr - attributes for new root inode
200 * 215 *
201 */ 216 */
202struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent, 217static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
203 const struct dentry *dentry, struct nfs_fh *fh, 218 const struct dentry *dentry,
204 struct nfs_fattr *fattr) 219 struct nfs_fh *fh,
220 struct nfs_fattr *fattr)
205{ 221{
206 struct nfs_clone_mount mountdata = { 222 struct nfs_clone_mount mountdata = {
207 .sb = mnt_parent->mnt_sb, 223 .sb = mnt_parent->mnt_sb,
@@ -213,6 +229,8 @@ struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
213 char *page = (char *) __get_free_page(GFP_USER); 229 char *page = (char *) __get_free_page(GFP_USER);
214 char *devname; 230 char *devname;
215 231
232 dprintk("--> nfs_do_submount()\n");
233
216 dprintk("%s: submounting on %s/%s\n", __FUNCTION__, 234 dprintk("%s: submounting on %s/%s\n", __FUNCTION__,
217 dentry->d_parent->d_name.name, 235 dentry->d_parent->d_name.name,
218 dentry->d_name.name); 236 dentry->d_name.name);
@@ -227,5 +245,7 @@ free_page:
227 free_page((unsigned long)page); 245 free_page((unsigned long)page);
228out: 246out:
229 dprintk("%s: done\n", __FUNCTION__); 247 dprintk("%s: done\n", __FUNCTION__);
248
249 dprintk("<-- nfs_do_submount() = %p\n", mnt);
230 return mnt; 250 return mnt;
231} 251}
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 67391eef6b93..b49501fc0a79 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -51,7 +51,7 @@
51#define NFS_createargs_sz (NFS_diropargs_sz+NFS_sattr_sz) 51#define NFS_createargs_sz (NFS_diropargs_sz+NFS_sattr_sz)
52#define NFS_renameargs_sz (NFS_diropargs_sz+NFS_diropargs_sz) 52#define NFS_renameargs_sz (NFS_diropargs_sz+NFS_diropargs_sz)
53#define NFS_linkargs_sz (NFS_fhandle_sz+NFS_diropargs_sz) 53#define NFS_linkargs_sz (NFS_fhandle_sz+NFS_diropargs_sz)
54#define NFS_symlinkargs_sz (NFS_diropargs_sz+NFS_path_sz+NFS_sattr_sz) 54#define NFS_symlinkargs_sz (NFS_diropargs_sz+1+NFS_sattr_sz)
55#define NFS_readdirargs_sz (NFS_fhandle_sz+2) 55#define NFS_readdirargs_sz (NFS_fhandle_sz+2)
56 56
57#define NFS_attrstat_sz (1+NFS_fattr_sz) 57#define NFS_attrstat_sz (1+NFS_fattr_sz)
@@ -351,11 +351,26 @@ nfs_xdr_linkargs(struct rpc_rqst *req, u32 *p, struct nfs_linkargs *args)
351static int 351static int
352nfs_xdr_symlinkargs(struct rpc_rqst *req, u32 *p, struct nfs_symlinkargs *args) 352nfs_xdr_symlinkargs(struct rpc_rqst *req, u32 *p, struct nfs_symlinkargs *args)
353{ 353{
354 struct xdr_buf *sndbuf = &req->rq_snd_buf;
355 size_t pad;
356
354 p = xdr_encode_fhandle(p, args->fromfh); 357 p = xdr_encode_fhandle(p, args->fromfh);
355 p = xdr_encode_array(p, args->fromname, args->fromlen); 358 p = xdr_encode_array(p, args->fromname, args->fromlen);
356 p = xdr_encode_array(p, args->topath, args->tolen); 359 *p++ = htonl(args->pathlen);
360 sndbuf->len = xdr_adjust_iovec(sndbuf->head, p);
361
362 xdr_encode_pages(sndbuf, args->pages, 0, args->pathlen);
363
364 /*
365 * xdr_encode_pages may have added a few bytes to ensure the
366 * pathname ends on a 4-byte boundary. Start encoding the
367 * attributes after the pad bytes.
368 */
369 pad = sndbuf->tail->iov_len;
370 if (pad > 0)
371 p++;
357 p = xdr_encode_sattr(p, args->sattr); 372 p = xdr_encode_sattr(p, args->sattr);
358 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 373 sndbuf->len += xdr_adjust_iovec(sndbuf->tail, p) - pad;
359 return 0; 374 return 0;
360} 375}
361 376
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 7143b1f82cea..3b234d4601e7 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -81,7 +81,7 @@ do_proc_get_root(struct rpc_clnt *client, struct nfs_fh *fhandle,
81} 81}
82 82
83/* 83/*
84 * Bare-bones access to getattr: this is for nfs_read_super. 84 * Bare-bones access to getattr: this is for nfs_get_root/nfs_get_sb
85 */ 85 */
86static int 86static int
87nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, 87nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
@@ -90,8 +90,8 @@ nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
90 int status; 90 int status;
91 91
92 status = do_proc_get_root(server->client, fhandle, info); 92 status = do_proc_get_root(server->client, fhandle, info);
93 if (status && server->client_sys != server->client) 93 if (status && server->nfs_client->cl_rpcclient != server->client)
94 status = do_proc_get_root(server->client_sys, fhandle, info); 94 status = do_proc_get_root(server->nfs_client->cl_rpcclient, fhandle, info);
95 return status; 95 return status;
96} 96}
97 97
@@ -449,7 +449,7 @@ nfs3_proc_unlink_setup(struct rpc_message *msg, struct dentry *dir, struct qstr
449 struct nfs_fattr res; 449 struct nfs_fattr res;
450 } *ptr; 450 } *ptr;
451 451
452 ptr = (struct unlinkxdr *)kmalloc(sizeof(*ptr), GFP_KERNEL); 452 ptr = kmalloc(sizeof(*ptr), GFP_KERNEL);
453 if (!ptr) 453 if (!ptr)
454 return -ENOMEM; 454 return -ENOMEM;
455 ptr->arg.fh = NFS_FH(dir->d_inode); 455 ptr->arg.fh = NFS_FH(dir->d_inode);
@@ -544,23 +544,23 @@ nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
544} 544}
545 545
546static int 546static int
547nfs3_proc_symlink(struct inode *dir, struct qstr *name, struct qstr *path, 547nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
548 struct iattr *sattr, struct nfs_fh *fhandle, 548 unsigned int len, struct iattr *sattr)
549 struct nfs_fattr *fattr)
550{ 549{
551 struct nfs_fattr dir_attr; 550 struct nfs_fh fhandle;
551 struct nfs_fattr fattr, dir_attr;
552 struct nfs3_symlinkargs arg = { 552 struct nfs3_symlinkargs arg = {
553 .fromfh = NFS_FH(dir), 553 .fromfh = NFS_FH(dir),
554 .fromname = name->name, 554 .fromname = dentry->d_name.name,
555 .fromlen = name->len, 555 .fromlen = dentry->d_name.len,
556 .topath = path->name, 556 .pages = &page,
557 .tolen = path->len, 557 .pathlen = len,
558 .sattr = sattr 558 .sattr = sattr
559 }; 559 };
560 struct nfs3_diropres res = { 560 struct nfs3_diropres res = {
561 .dir_attr = &dir_attr, 561 .dir_attr = &dir_attr,
562 .fh = fhandle, 562 .fh = &fhandle,
563 .fattr = fattr 563 .fattr = &fattr
564 }; 564 };
565 struct rpc_message msg = { 565 struct rpc_message msg = {
566 .rpc_proc = &nfs3_procedures[NFS3PROC_SYMLINK], 566 .rpc_proc = &nfs3_procedures[NFS3PROC_SYMLINK],
@@ -569,13 +569,19 @@ nfs3_proc_symlink(struct inode *dir, struct qstr *name, struct qstr *path,
569 }; 569 };
570 int status; 570 int status;
571 571
572 if (path->len > NFS3_MAXPATHLEN) 572 if (len > NFS3_MAXPATHLEN)
573 return -ENAMETOOLONG; 573 return -ENAMETOOLONG;
574 dprintk("NFS call symlink %s -> %s\n", name->name, path->name); 574
575 dprintk("NFS call symlink %s\n", dentry->d_name.name);
576
575 nfs_fattr_init(&dir_attr); 577 nfs_fattr_init(&dir_attr);
576 nfs_fattr_init(fattr); 578 nfs_fattr_init(&fattr);
577 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 579 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
578 nfs_post_op_update_inode(dir, &dir_attr); 580 nfs_post_op_update_inode(dir, &dir_attr);
581 if (status != 0)
582 goto out;
583 status = nfs_instantiate(dentry, &fhandle, &fattr);
584out:
579 dprintk("NFS reply symlink: %d\n", status); 585 dprintk("NFS reply symlink: %d\n", status);
580 return status; 586 return status;
581} 587}
@@ -785,7 +791,7 @@ nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
785 791
786 dprintk("NFS call fsinfo\n"); 792 dprintk("NFS call fsinfo\n");
787 nfs_fattr_init(info->fattr); 793 nfs_fattr_init(info->fattr);
788 status = rpc_call_sync(server->client_sys, &msg, 0); 794 status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
789 dprintk("NFS reply fsinfo: %d\n", status); 795 dprintk("NFS reply fsinfo: %d\n", status);
790 return status; 796 return status;
791} 797}
@@ -886,7 +892,7 @@ nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
886 return nlmclnt_proc(filp->f_dentry->d_inode, cmd, fl); 892 return nlmclnt_proc(filp->f_dentry->d_inode, cmd, fl);
887} 893}
888 894
889struct nfs_rpc_ops nfs_v3_clientops = { 895const struct nfs_rpc_ops nfs_v3_clientops = {
890 .version = 3, /* protocol version */ 896 .version = 3, /* protocol version */
891 .dentry_ops = &nfs_dentry_operations, 897 .dentry_ops = &nfs_dentry_operations,
892 .dir_inode_ops = &nfs3_dir_inode_operations, 898 .dir_inode_ops = &nfs3_dir_inode_operations,
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 0250269e9753..16556fa4effb 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -56,7 +56,7 @@
56#define NFS3_writeargs_sz (NFS3_fh_sz+5) 56#define NFS3_writeargs_sz (NFS3_fh_sz+5)
57#define NFS3_createargs_sz (NFS3_diropargs_sz+NFS3_sattr_sz) 57#define NFS3_createargs_sz (NFS3_diropargs_sz+NFS3_sattr_sz)
58#define NFS3_mkdirargs_sz (NFS3_diropargs_sz+NFS3_sattr_sz) 58#define NFS3_mkdirargs_sz (NFS3_diropargs_sz+NFS3_sattr_sz)
59#define NFS3_symlinkargs_sz (NFS3_diropargs_sz+NFS3_path_sz+NFS3_sattr_sz) 59#define NFS3_symlinkargs_sz (NFS3_diropargs_sz+1+NFS3_sattr_sz)
60#define NFS3_mknodargs_sz (NFS3_diropargs_sz+2+NFS3_sattr_sz) 60#define NFS3_mknodargs_sz (NFS3_diropargs_sz+2+NFS3_sattr_sz)
61#define NFS3_renameargs_sz (NFS3_diropargs_sz+NFS3_diropargs_sz) 61#define NFS3_renameargs_sz (NFS3_diropargs_sz+NFS3_diropargs_sz)
62#define NFS3_linkargs_sz (NFS3_fh_sz+NFS3_diropargs_sz) 62#define NFS3_linkargs_sz (NFS3_fh_sz+NFS3_diropargs_sz)
@@ -398,8 +398,11 @@ nfs3_xdr_symlinkargs(struct rpc_rqst *req, u32 *p, struct nfs3_symlinkargs *args
398 p = xdr_encode_fhandle(p, args->fromfh); 398 p = xdr_encode_fhandle(p, args->fromfh);
399 p = xdr_encode_array(p, args->fromname, args->fromlen); 399 p = xdr_encode_array(p, args->fromname, args->fromlen);
400 p = xdr_encode_sattr(p, args->sattr); 400 p = xdr_encode_sattr(p, args->sattr);
401 p = xdr_encode_array(p, args->topath, args->tolen); 401 *p++ = htonl(args->pathlen);
402 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); 402 req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
403
404 /* Copy the page */
405 xdr_encode_pages(&req->rq_snd_buf, args->pages, 0, args->pathlen);
403 return 0; 406 return 0;
404} 407}
405 408
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 9a102860df37..61095fe4b5ca 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -43,55 +43,6 @@ enum nfs4_client_state {
43}; 43};
44 44
45/* 45/*
46 * The nfs4_client identifies our client state to the server.
47 */
48struct nfs4_client {
49 struct list_head cl_servers; /* Global list of servers */
50 struct in_addr cl_addr; /* Server identifier */
51 u64 cl_clientid; /* constant */
52 nfs4_verifier cl_confirm;
53 unsigned long cl_state;
54
55 u32 cl_lockowner_id;
56
57 /*
58 * The following rwsem ensures exclusive access to the server
59 * while we recover the state following a lease expiration.
60 */
61 struct rw_semaphore cl_sem;
62
63 struct list_head cl_delegations;
64 struct list_head cl_state_owners;
65 struct list_head cl_unused;
66 int cl_nunused;
67 spinlock_t cl_lock;
68 atomic_t cl_count;
69
70 struct rpc_clnt * cl_rpcclient;
71
72 struct list_head cl_superblocks; /* List of nfs_server structs */
73
74 unsigned long cl_lease_time;
75 unsigned long cl_last_renewal;
76 struct work_struct cl_renewd;
77 struct work_struct cl_recoverd;
78
79 struct rpc_wait_queue cl_rpcwaitq;
80
81 /* used for the setclientid verifier */
82 struct timespec cl_boot_time;
83
84 /* idmapper */
85 struct idmap * cl_idmap;
86
87 /* Our own IP address, as a null-terminated string.
88 * This is used to generate the clientid, and the callback address.
89 */
90 char cl_ipaddr[16];
91 unsigned char cl_id_uniquifier;
92};
93
94/*
95 * struct rpc_sequence ensures that RPC calls are sent in the exact 46 * struct rpc_sequence ensures that RPC calls are sent in the exact
96 * order that they appear on the list. 47 * order that they appear on the list.
97 */ 48 */
@@ -127,7 +78,7 @@ static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status
127struct nfs4_state_owner { 78struct nfs4_state_owner {
128 spinlock_t so_lock; 79 spinlock_t so_lock;
129 struct list_head so_list; /* per-clientid list of state_owners */ 80 struct list_head so_list; /* per-clientid list of state_owners */
130 struct nfs4_client *so_client; 81 struct nfs_client *so_client;
131 u32 so_id; /* 32-bit identifier, unique */ 82 u32 so_id; /* 32-bit identifier, unique */
132 atomic_t so_count; 83 atomic_t so_count;
133 84
@@ -210,10 +161,10 @@ extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t);
210 161
211/* nfs4proc.c */ 162/* nfs4proc.c */
212extern int nfs4_map_errors(int err); 163extern int nfs4_map_errors(int err);
213extern int nfs4_proc_setclientid(struct nfs4_client *, u32, unsigned short, struct rpc_cred *); 164extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *);
214extern int nfs4_proc_setclientid_confirm(struct nfs4_client *, struct rpc_cred *); 165extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *);
215extern int nfs4_proc_async_renew(struct nfs4_client *, struct rpc_cred *); 166extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
216extern int nfs4_proc_renew(struct nfs4_client *, struct rpc_cred *); 167extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
217extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state); 168extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state);
218extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); 169extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
219extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); 170extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
@@ -231,19 +182,14 @@ extern const u32 nfs4_fsinfo_bitmap[2];
231extern const u32 nfs4_fs_locations_bitmap[2]; 182extern const u32 nfs4_fs_locations_bitmap[2];
232 183
233/* nfs4renewd.c */ 184/* nfs4renewd.c */
234extern void nfs4_schedule_state_renewal(struct nfs4_client *); 185extern void nfs4_schedule_state_renewal(struct nfs_client *);
235extern void nfs4_renewd_prepare_shutdown(struct nfs_server *); 186extern void nfs4_renewd_prepare_shutdown(struct nfs_server *);
236extern void nfs4_kill_renewd(struct nfs4_client *); 187extern void nfs4_kill_renewd(struct nfs_client *);
237extern void nfs4_renew_state(void *); 188extern void nfs4_renew_state(void *);
238 189
239/* nfs4state.c */ 190/* nfs4state.c */
240extern void init_nfsv4_state(struct nfs_server *); 191struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp);
241extern void destroy_nfsv4_state(struct nfs_server *); 192extern u32 nfs4_alloc_lockowner_id(struct nfs_client *);
242extern struct nfs4_client *nfs4_get_client(struct in_addr *);
243extern void nfs4_put_client(struct nfs4_client *clp);
244extern struct nfs4_client *nfs4_find_client(struct in_addr *);
245struct rpc_cred *nfs4_get_renew_cred(struct nfs4_client *clp);
246extern u32 nfs4_alloc_lockowner_id(struct nfs4_client *);
247 193
248extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); 194extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
249extern void nfs4_put_state_owner(struct nfs4_state_owner *); 195extern void nfs4_put_state_owner(struct nfs4_state_owner *);
@@ -252,7 +198,7 @@ extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state
252extern void nfs4_put_open_state(struct nfs4_state *); 198extern void nfs4_put_open_state(struct nfs4_state *);
253extern void nfs4_close_state(struct nfs4_state *, mode_t); 199extern void nfs4_close_state(struct nfs4_state *, mode_t);
254extern void nfs4_state_set_mode_locked(struct nfs4_state *, mode_t); 200extern void nfs4_state_set_mode_locked(struct nfs4_state *, mode_t);
255extern void nfs4_schedule_state_recovery(struct nfs4_client *); 201extern void nfs4_schedule_state_recovery(struct nfs_client *);
256extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); 202extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
257extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); 203extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
258extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); 204extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t);
@@ -276,10 +222,6 @@ extern struct svc_version nfs4_callback_version1;
276 222
277#else 223#else
278 224
279#define init_nfsv4_state(server) do { } while (0)
280#define destroy_nfsv4_state(server) do { } while (0)
281#define nfs4_put_state_owner(inode, owner) do { } while (0)
282#define nfs4_put_open_state(state) do { } while (0)
283#define nfs4_close_state(a, b) do { } while (0) 225#define nfs4_close_state(a, b) do { } while (0)
284 226
285#endif /* CONFIG_NFS_V4 */ 227#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index ea38d27b74e6..24e47f3bbd17 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -2,6 +2,7 @@
2 * linux/fs/nfs/nfs4namespace.c 2 * linux/fs/nfs/nfs4namespace.c
3 * 3 *
4 * Copyright (C) 2005 Trond Myklebust <Trond.Myklebust@netapp.com> 4 * Copyright (C) 2005 Trond Myklebust <Trond.Myklebust@netapp.com>
5 * - Modified by David Howells <dhowells@redhat.com>
5 * 6 *
6 * NFSv4 namespace 7 * NFSv4 namespace
7 */ 8 */
@@ -23,7 +24,7 @@
23/* 24/*
24 * Check if fs_root is valid 25 * Check if fs_root is valid
25 */ 26 */
26static inline char *nfs4_pathname_string(struct nfs4_pathname *pathname, 27static inline char *nfs4_pathname_string(const struct nfs4_pathname *pathname,
27 char *buffer, ssize_t buflen) 28 char *buffer, ssize_t buflen)
28{ 29{
29 char *end = buffer + buflen; 30 char *end = buffer + buflen;
@@ -34,7 +35,7 @@ static inline char *nfs4_pathname_string(struct nfs4_pathname *pathname,
34 35
35 n = pathname->ncomponents; 36 n = pathname->ncomponents;
36 while (--n >= 0) { 37 while (--n >= 0) {
37 struct nfs4_string *component = &pathname->components[n]; 38 const struct nfs4_string *component = &pathname->components[n];
38 buflen -= component->len + 1; 39 buflen -= component->len + 1;
39 if (buflen < 0) 40 if (buflen < 0)
40 goto Elong; 41 goto Elong;
@@ -47,6 +48,68 @@ Elong:
47 return ERR_PTR(-ENAMETOOLONG); 48 return ERR_PTR(-ENAMETOOLONG);
48} 49}
49 50
51/*
52 * Determine the mount path as a string
53 */
54static char *nfs4_path(const struct vfsmount *mnt_parent,
55 const struct dentry *dentry,
56 char *buffer, ssize_t buflen)
57{
58 const char *srvpath;
59
60 srvpath = strchr(mnt_parent->mnt_devname, ':');
61 if (srvpath)
62 srvpath++;
63 else
64 srvpath = mnt_parent->mnt_devname;
65
66 return nfs_path(srvpath, mnt_parent->mnt_root, dentry, buffer, buflen);
67}
68
69/*
70 * Check that fs_locations::fs_root [RFC3530 6.3] is a prefix for what we
71 * believe to be the server path to this dentry
72 */
73static int nfs4_validate_fspath(const struct vfsmount *mnt_parent,
74 const struct dentry *dentry,
75 const struct nfs4_fs_locations *locations,
76 char *page, char *page2)
77{
78 const char *path, *fs_path;
79
80 path = nfs4_path(mnt_parent, dentry, page, PAGE_SIZE);
81 if (IS_ERR(path))
82 return PTR_ERR(path);
83
84 fs_path = nfs4_pathname_string(&locations->fs_path, page2, PAGE_SIZE);
85 if (IS_ERR(fs_path))
86 return PTR_ERR(fs_path);
87
88 if (strncmp(path, fs_path, strlen(fs_path)) != 0) {
89 dprintk("%s: path %s does not begin with fsroot %s\n",
90 __FUNCTION__, path, fs_path);
91 return -ENOENT;
92 }
93
94 return 0;
95}
96
97/*
98 * Check if the string represents a "valid" IPv4 address
99 */
100static inline int valid_ipaddr4(const char *buf)
101{
102 int rc, count, in[4];
103
104 rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]);
105 if (rc != 4)
106 return -EINVAL;
107 for (count = 0; count < 4; count++) {
108 if (in[count] > 255)
109 return -EINVAL;
110 }
111 return 0;
112}
50 113
51/** 114/**
52 * nfs_follow_referral - set up mountpoint when hitting a referral on moved error 115 * nfs_follow_referral - set up mountpoint when hitting a referral on moved error
@@ -60,7 +123,7 @@ Elong:
60 */ 123 */
61static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, 124static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
62 const struct dentry *dentry, 125 const struct dentry *dentry,
63 struct nfs4_fs_locations *locations) 126 const struct nfs4_fs_locations *locations)
64{ 127{
65 struct vfsmount *mnt = ERR_PTR(-ENOENT); 128 struct vfsmount *mnt = ERR_PTR(-ENOENT);
66 struct nfs_clone_mount mountdata = { 129 struct nfs_clone_mount mountdata = {
@@ -68,10 +131,9 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
68 .dentry = dentry, 131 .dentry = dentry,
69 .authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor, 132 .authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor,
70 }; 133 };
71 char *page, *page2; 134 char *page = NULL, *page2 = NULL;
72 char *path, *fs_path;
73 char *devname; 135 char *devname;
74 int loc, s; 136 int loc, s, error;
75 137
76 if (locations == NULL || locations->nlocations <= 0) 138 if (locations == NULL || locations->nlocations <= 0)
77 goto out; 139 goto out;
@@ -79,36 +141,30 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
79 dprintk("%s: referral at %s/%s\n", __FUNCTION__, 141 dprintk("%s: referral at %s/%s\n", __FUNCTION__,
80 dentry->d_parent->d_name.name, dentry->d_name.name); 142 dentry->d_parent->d_name.name, dentry->d_name.name);
81 143
82 /* Ensure fs path is a prefix of current dentry path */
83 page = (char *) __get_free_page(GFP_USER); 144 page = (char *) __get_free_page(GFP_USER);
84 if (page == NULL) 145 if (!page)
85 goto out; 146 goto out;
147
86 page2 = (char *) __get_free_page(GFP_USER); 148 page2 = (char *) __get_free_page(GFP_USER);
87 if (page2 == NULL) 149 if (!page2)
88 goto out; 150 goto out;
89 151
90 path = nfs4_path(dentry, page, PAGE_SIZE); 152 /* Ensure fs path is a prefix of current dentry path */
91 if (IS_ERR(path)) 153 error = nfs4_validate_fspath(mnt_parent, dentry, locations, page, page2);
92 goto out_free; 154 if (error < 0) {
93 155 mnt = ERR_PTR(error);
94 fs_path = nfs4_pathname_string(&locations->fs_path, page2, PAGE_SIZE); 156 goto out;
95 if (IS_ERR(fs_path))
96 goto out_free;
97
98 if (strncmp(path, fs_path, strlen(fs_path)) != 0) {
99 dprintk("%s: path %s does not begin with fsroot %s\n", __FUNCTION__, path, fs_path);
100 goto out_free;
101 } 157 }
102 158
103 devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE); 159 devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE);
104 if (IS_ERR(devname)) { 160 if (IS_ERR(devname)) {
105 mnt = (struct vfsmount *)devname; 161 mnt = (struct vfsmount *)devname;
106 goto out_free; 162 goto out;
107 } 163 }
108 164
109 loc = 0; 165 loc = 0;
110 while (loc < locations->nlocations && IS_ERR(mnt)) { 166 while (loc < locations->nlocations && IS_ERR(mnt)) {
111 struct nfs4_fs_location *location = &locations->locations[loc]; 167 const struct nfs4_fs_location *location = &locations->locations[loc];
112 char *mnt_path; 168 char *mnt_path;
113 169
114 if (location == NULL || location->nservers <= 0 || 170 if (location == NULL || location->nservers <= 0 ||
@@ -140,7 +196,7 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
140 addr.sin_port = htons(NFS_PORT); 196 addr.sin_port = htons(NFS_PORT);
141 mountdata.addr = &addr; 197 mountdata.addr = &addr;
142 198
143 mnt = vfs_kern_mount(&nfs_referral_nfs4_fs_type, 0, devname, &mountdata); 199 mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, devname, &mountdata);
144 if (!IS_ERR(mnt)) { 200 if (!IS_ERR(mnt)) {
145 break; 201 break;
146 } 202 }
@@ -149,10 +205,9 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
149 loc++; 205 loc++;
150 } 206 }
151 207
152out_free:
153 free_page((unsigned long)page);
154 free_page((unsigned long)page2);
155out: 208out:
209 free_page((unsigned long) page);
210 free_page((unsigned long) page2);
156 dprintk("%s: done\n", __FUNCTION__); 211 dprintk("%s: done\n", __FUNCTION__);
157 return mnt; 212 return mnt;
158} 213}
@@ -165,7 +220,7 @@ out:
165 */ 220 */
166struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry) 221struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry)
167{ 222{
168 struct vfsmount *mnt = ERR_PTR(-ENOENT); 223 struct vfsmount *mnt = ERR_PTR(-ENOMEM);
169 struct dentry *parent; 224 struct dentry *parent;
170 struct nfs4_fs_locations *fs_locations = NULL; 225 struct nfs4_fs_locations *fs_locations = NULL;
171 struct page *page; 226 struct page *page;
@@ -183,11 +238,16 @@ struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentr
183 goto out_free; 238 goto out_free;
184 239
185 /* Get locations */ 240 /* Get locations */
241 mnt = ERR_PTR(-ENOENT);
242
186 parent = dget_parent(dentry); 243 parent = dget_parent(dentry);
187 dprintk("%s: getting locations for %s/%s\n", __FUNCTION__, parent->d_name.name, dentry->d_name.name); 244 dprintk("%s: getting locations for %s/%s\n",
245 __FUNCTION__, parent->d_name.name, dentry->d_name.name);
246
188 err = nfs4_proc_fs_locations(parent->d_inode, dentry, fs_locations, page); 247 err = nfs4_proc_fs_locations(parent->d_inode, dentry, fs_locations, page);
189 dput(parent); 248 dput(parent);
190 if (err != 0 || fs_locations->nlocations <= 0 || 249 if (err != 0 ||
250 fs_locations->nlocations <= 0 ||
191 fs_locations->fs_path.ncomponents <= 0) 251 fs_locations->fs_path.ncomponents <= 0)
192 goto out_free; 252 goto out_free;
193 253
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index b14145b7b87f..47c7e6e3910d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -55,7 +55,7 @@
55 55
56#define NFSDBG_FACILITY NFSDBG_PROC 56#define NFSDBG_FACILITY NFSDBG_PROC
57 57
58#define NFS4_POLL_RETRY_MIN (1*HZ) 58#define NFS4_POLL_RETRY_MIN (HZ/10)
59#define NFS4_POLL_RETRY_MAX (15*HZ) 59#define NFS4_POLL_RETRY_MAX (15*HZ)
60 60
61struct nfs4_opendata; 61struct nfs4_opendata;
@@ -64,7 +64,7 @@ static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinf
64static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *); 64static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *);
65static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry); 65static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry);
66static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception); 66static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception);
67static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs4_client *clp); 67static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp);
68 68
69/* Prevent leaks of NFSv4 errors into userland */ 69/* Prevent leaks of NFSv4 errors into userland */
70int nfs4_map_errors(int err) 70int nfs4_map_errors(int err)
@@ -195,7 +195,7 @@ static void nfs4_setup_readdir(u64 cookie, u32 *verifier, struct dentry *dentry,
195 195
196static void renew_lease(const struct nfs_server *server, unsigned long timestamp) 196static void renew_lease(const struct nfs_server *server, unsigned long timestamp)
197{ 197{
198 struct nfs4_client *clp = server->nfs4_state; 198 struct nfs_client *clp = server->nfs_client;
199 spin_lock(&clp->cl_lock); 199 spin_lock(&clp->cl_lock);
200 if (time_before(clp->cl_last_renewal,timestamp)) 200 if (time_before(clp->cl_last_renewal,timestamp))
201 clp->cl_last_renewal = timestamp; 201 clp->cl_last_renewal = timestamp;
@@ -252,7 +252,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
252 atomic_inc(&sp->so_count); 252 atomic_inc(&sp->so_count);
253 p->o_arg.fh = NFS_FH(dir); 253 p->o_arg.fh = NFS_FH(dir);
254 p->o_arg.open_flags = flags, 254 p->o_arg.open_flags = flags,
255 p->o_arg.clientid = server->nfs4_state->cl_clientid; 255 p->o_arg.clientid = server->nfs_client->cl_clientid;
256 p->o_arg.id = sp->so_id; 256 p->o_arg.id = sp->so_id;
257 p->o_arg.name = &dentry->d_name; 257 p->o_arg.name = &dentry->d_name;
258 p->o_arg.server = server; 258 p->o_arg.server = server;
@@ -550,7 +550,7 @@ int nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state)
550 case -NFS4ERR_STALE_STATEID: 550 case -NFS4ERR_STALE_STATEID:
551 case -NFS4ERR_EXPIRED: 551 case -NFS4ERR_EXPIRED:
552 /* Don't recall a delegation if it was lost */ 552 /* Don't recall a delegation if it was lost */
553 nfs4_schedule_state_recovery(server->nfs4_state); 553 nfs4_schedule_state_recovery(server->nfs_client);
554 return err; 554 return err;
555 } 555 }
556 err = nfs4_handle_exception(server, err, &exception); 556 err = nfs4_handle_exception(server, err, &exception);
@@ -758,7 +758,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
758 } 758 }
759 nfs_confirm_seqid(&data->owner->so_seqid, 0); 759 nfs_confirm_seqid(&data->owner->so_seqid, 0);
760 if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) 760 if (!(o_res->f_attr->valid & NFS_ATTR_FATTR))
761 return server->rpc_ops->getattr(server, &o_res->fh, o_res->f_attr); 761 return server->nfs_client->rpc_ops->getattr(server, &o_res->fh, o_res->f_attr);
762 return 0; 762 return 0;
763} 763}
764 764
@@ -792,11 +792,18 @@ out:
792 792
793int nfs4_recover_expired_lease(struct nfs_server *server) 793int nfs4_recover_expired_lease(struct nfs_server *server)
794{ 794{
795 struct nfs4_client *clp = server->nfs4_state; 795 struct nfs_client *clp = server->nfs_client;
796 int ret;
796 797
797 if (test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) 798 for (;;) {
799 ret = nfs4_wait_clnt_recover(server->client, clp);
800 if (ret != 0)
801 return ret;
802 if (!test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
803 break;
798 nfs4_schedule_state_recovery(clp); 804 nfs4_schedule_state_recovery(clp);
799 return nfs4_wait_clnt_recover(server->client, clp); 805 }
806 return 0;
800} 807}
801 808
802/* 809/*
@@ -867,7 +874,7 @@ static int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred
867{ 874{
868 struct nfs_delegation *delegation; 875 struct nfs_delegation *delegation;
869 struct nfs_server *server = NFS_SERVER(inode); 876 struct nfs_server *server = NFS_SERVER(inode);
870 struct nfs4_client *clp = server->nfs4_state; 877 struct nfs_client *clp = server->nfs_client;
871 struct nfs_inode *nfsi = NFS_I(inode); 878 struct nfs_inode *nfsi = NFS_I(inode);
872 struct nfs4_state_owner *sp = NULL; 879 struct nfs4_state_owner *sp = NULL;
873 struct nfs4_state *state = NULL; 880 struct nfs4_state *state = NULL;
@@ -953,7 +960,7 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st
953 struct nfs4_state_owner *sp; 960 struct nfs4_state_owner *sp;
954 struct nfs4_state *state = NULL; 961 struct nfs4_state *state = NULL;
955 struct nfs_server *server = NFS_SERVER(dir); 962 struct nfs_server *server = NFS_SERVER(dir);
956 struct nfs4_client *clp = server->nfs4_state; 963 struct nfs_client *clp = server->nfs_client;
957 struct nfs4_opendata *opendata; 964 struct nfs4_opendata *opendata;
958 int status; 965 int status;
959 966
@@ -1133,7 +1140,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
1133 break; 1140 break;
1134 case -NFS4ERR_STALE_STATEID: 1141 case -NFS4ERR_STALE_STATEID:
1135 case -NFS4ERR_EXPIRED: 1142 case -NFS4ERR_EXPIRED:
1136 nfs4_schedule_state_recovery(server->nfs4_state); 1143 nfs4_schedule_state_recovery(server->nfs_client);
1137 break; 1144 break;
1138 default: 1145 default:
1139 if (nfs4_async_handle_error(task, server) == -EAGAIN) { 1146 if (nfs4_async_handle_error(task, server) == -EAGAIN) {
@@ -1268,7 +1275,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
1268 BUG_ON(nd->intent.open.flags & O_CREAT); 1275 BUG_ON(nd->intent.open.flags & O_CREAT);
1269 } 1276 }
1270 1277
1271 cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0); 1278 cred = rpcauth_lookupcred(NFS_CLIENT(dir)->cl_auth, 0);
1272 if (IS_ERR(cred)) 1279 if (IS_ERR(cred))
1273 return (struct dentry *)cred; 1280 return (struct dentry *)cred;
1274 state = nfs4_do_open(dir, dentry, nd->intent.open.flags, &attr, cred); 1281 state = nfs4_do_open(dir, dentry, nd->intent.open.flags, &attr, cred);
@@ -1291,7 +1298,7 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
1291 struct rpc_cred *cred; 1298 struct rpc_cred *cred;
1292 struct nfs4_state *state; 1299 struct nfs4_state *state;
1293 1300
1294 cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0); 1301 cred = rpcauth_lookupcred(NFS_CLIENT(dir)->cl_auth, 0);
1295 if (IS_ERR(cred)) 1302 if (IS_ERR(cred))
1296 return PTR_ERR(cred); 1303 return PTR_ERR(cred);
1297 state = nfs4_open_delegated(dentry->d_inode, openflags, cred); 1304 state = nfs4_open_delegated(dentry->d_inode, openflags, cred);
@@ -1393,70 +1400,19 @@ static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
1393 return err; 1400 return err;
1394} 1401}
1395 1402
1403/*
1404 * get the file handle for the "/" directory on the server
1405 */
1396static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, 1406static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
1397 struct nfs_fsinfo *info) 1407 struct nfs_fsinfo *info)
1398{ 1408{
1399 struct nfs_fattr * fattr = info->fattr;
1400 unsigned char * p;
1401 struct qstr q;
1402 struct nfs4_lookup_arg args = {
1403 .dir_fh = fhandle,
1404 .name = &q,
1405 .bitmask = nfs4_fattr_bitmap,
1406 };
1407 struct nfs4_lookup_res res = {
1408 .server = server,
1409 .fattr = fattr,
1410 .fh = fhandle,
1411 };
1412 struct rpc_message msg = {
1413 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOOKUP],
1414 .rpc_argp = &args,
1415 .rpc_resp = &res,
1416 };
1417 int status; 1409 int status;
1418 1410
1419 /*
1420 * Now we do a separate LOOKUP for each component of the mount path.
1421 * The LOOKUPs are done separately so that we can conveniently
1422 * catch an ERR_WRONGSEC if it occurs along the way...
1423 */
1424 status = nfs4_lookup_root(server, fhandle, info); 1411 status = nfs4_lookup_root(server, fhandle, info);
1425 if (status)
1426 goto out;
1427
1428 p = server->mnt_path;
1429 for (;;) {
1430 struct nfs4_exception exception = { };
1431
1432 while (*p == '/')
1433 p++;
1434 if (!*p)
1435 break;
1436 q.name = p;
1437 while (*p && (*p != '/'))
1438 p++;
1439 q.len = p - q.name;
1440
1441 do {
1442 nfs_fattr_init(fattr);
1443 status = nfs4_handle_exception(server,
1444 rpc_call_sync(server->client, &msg, 0),
1445 &exception);
1446 } while (exception.retry);
1447 if (status == 0)
1448 continue;
1449 if (status == -ENOENT) {
1450 printk(KERN_NOTICE "NFS: mount path %s does not exist!\n", server->mnt_path);
1451 printk(KERN_NOTICE "NFS: suggestion: try mounting '/' instead.\n");
1452 }
1453 break;
1454 }
1455 if (status == 0) 1412 if (status == 0)
1456 status = nfs4_server_capabilities(server, fhandle); 1413 status = nfs4_server_capabilities(server, fhandle);
1457 if (status == 0) 1414 if (status == 0)
1458 status = nfs4_do_fsinfo(server, fhandle, info); 1415 status = nfs4_do_fsinfo(server, fhandle, info);
1459out:
1460 return nfs4_map_errors(status); 1416 return nfs4_map_errors(status);
1461} 1417}
1462 1418
@@ -1565,7 +1521,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
1565 1521
1566 nfs_fattr_init(fattr); 1522 nfs_fattr_init(fattr);
1567 1523
1568 cred = rpcauth_lookupcred(NFS_SERVER(inode)->client->cl_auth, 0); 1524 cred = rpcauth_lookupcred(NFS_CLIENT(inode)->cl_auth, 0);
1569 if (IS_ERR(cred)) 1525 if (IS_ERR(cred))
1570 return PTR_ERR(cred); 1526 return PTR_ERR(cred);
1571 1527
@@ -1583,6 +1539,52 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
1583 return status; 1539 return status;
1584} 1540}
1585 1541
1542static int _nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh,
1543 struct qstr *name, struct nfs_fh *fhandle,
1544 struct nfs_fattr *fattr)
1545{
1546 int status;
1547 struct nfs4_lookup_arg args = {
1548 .bitmask = server->attr_bitmask,
1549 .dir_fh = dirfh,
1550 .name = name,
1551 };
1552 struct nfs4_lookup_res res = {
1553 .server = server,
1554 .fattr = fattr,
1555 .fh = fhandle,
1556 };
1557 struct rpc_message msg = {
1558 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOOKUP],
1559 .rpc_argp = &args,
1560 .rpc_resp = &res,
1561 };
1562
1563 nfs_fattr_init(fattr);
1564
1565 dprintk("NFS call lookupfh %s\n", name->name);
1566 status = rpc_call_sync(server->client, &msg, 0);
1567 dprintk("NFS reply lookupfh: %d\n", status);
1568 if (status == -NFS4ERR_MOVED)
1569 status = -EREMOTE;
1570 return status;
1571}
1572
1573static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh,
1574 struct qstr *name, struct nfs_fh *fhandle,
1575 struct nfs_fattr *fattr)
1576{
1577 struct nfs4_exception exception = { };
1578 int err;
1579 do {
1580 err = nfs4_handle_exception(server,
1581 _nfs4_proc_lookupfh(server, dirfh, name,
1582 fhandle, fattr),
1583 &exception);
1584 } while (exception.retry);
1585 return err;
1586}
1587
1586static int _nfs4_proc_lookup(struct inode *dir, struct qstr *name, 1588static int _nfs4_proc_lookup(struct inode *dir, struct qstr *name,
1587 struct nfs_fh *fhandle, struct nfs_fattr *fattr) 1589 struct nfs_fh *fhandle, struct nfs_fattr *fattr)
1588{ 1590{
@@ -1881,7 +1883,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
1881 struct rpc_cred *cred; 1883 struct rpc_cred *cred;
1882 int status = 0; 1884 int status = 0;
1883 1885
1884 cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0); 1886 cred = rpcauth_lookupcred(NFS_CLIENT(dir)->cl_auth, 0);
1885 if (IS_ERR(cred)) { 1887 if (IS_ERR(cred)) {
1886 status = PTR_ERR(cred); 1888 status = PTR_ERR(cred);
1887 goto out; 1889 goto out;
@@ -2089,24 +2091,24 @@ static int nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *n
2089 return err; 2091 return err;
2090} 2092}
2091 2093
2092static int _nfs4_proc_symlink(struct inode *dir, struct qstr *name, 2094static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
2093 struct qstr *path, struct iattr *sattr, struct nfs_fh *fhandle, 2095 struct page *page, unsigned int len, struct iattr *sattr)
2094 struct nfs_fattr *fattr)
2095{ 2096{
2096 struct nfs_server *server = NFS_SERVER(dir); 2097 struct nfs_server *server = NFS_SERVER(dir);
2097 struct nfs_fattr dir_fattr; 2098 struct nfs_fh fhandle;
2099 struct nfs_fattr fattr, dir_fattr;
2098 struct nfs4_create_arg arg = { 2100 struct nfs4_create_arg arg = {
2099 .dir_fh = NFS_FH(dir), 2101 .dir_fh = NFS_FH(dir),
2100 .server = server, 2102 .server = server,
2101 .name = name, 2103 .name = &dentry->d_name,
2102 .attrs = sattr, 2104 .attrs = sattr,
2103 .ftype = NF4LNK, 2105 .ftype = NF4LNK,
2104 .bitmask = server->attr_bitmask, 2106 .bitmask = server->attr_bitmask,
2105 }; 2107 };
2106 struct nfs4_create_res res = { 2108 struct nfs4_create_res res = {
2107 .server = server, 2109 .server = server,
2108 .fh = fhandle, 2110 .fh = &fhandle,
2109 .fattr = fattr, 2111 .fattr = &fattr,
2110 .dir_fattr = &dir_fattr, 2112 .dir_fattr = &dir_fattr,
2111 }; 2113 };
2112 struct rpc_message msg = { 2114 struct rpc_message msg = {
@@ -2116,29 +2118,32 @@ static int _nfs4_proc_symlink(struct inode *dir, struct qstr *name,
2116 }; 2118 };
2117 int status; 2119 int status;
2118 2120
2119 if (path->len > NFS4_MAXPATHLEN) 2121 if (len > NFS4_MAXPATHLEN)
2120 return -ENAMETOOLONG; 2122 return -ENAMETOOLONG;
2121 arg.u.symlink = path; 2123
2122 nfs_fattr_init(fattr); 2124 arg.u.symlink.pages = &page;
2125 arg.u.symlink.len = len;
2126 nfs_fattr_init(&fattr);
2123 nfs_fattr_init(&dir_fattr); 2127 nfs_fattr_init(&dir_fattr);
2124 2128
2125 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 2129 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
2126 if (!status) 2130 if (!status) {
2127 update_changeattr(dir, &res.dir_cinfo); 2131 update_changeattr(dir, &res.dir_cinfo);
2128 nfs_post_op_update_inode(dir, res.dir_fattr); 2132 nfs_post_op_update_inode(dir, res.dir_fattr);
2133 status = nfs_instantiate(dentry, &fhandle, &fattr);
2134 }
2129 return status; 2135 return status;
2130} 2136}
2131 2137
2132static int nfs4_proc_symlink(struct inode *dir, struct qstr *name, 2138static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
2133 struct qstr *path, struct iattr *sattr, struct nfs_fh *fhandle, 2139 struct page *page, unsigned int len, struct iattr *sattr)
2134 struct nfs_fattr *fattr)
2135{ 2140{
2136 struct nfs4_exception exception = { }; 2141 struct nfs4_exception exception = { };
2137 int err; 2142 int err;
2138 do { 2143 do {
2139 err = nfs4_handle_exception(NFS_SERVER(dir), 2144 err = nfs4_handle_exception(NFS_SERVER(dir),
2140 _nfs4_proc_symlink(dir, name, path, sattr, 2145 _nfs4_proc_symlink(dir, dentry, page,
2141 fhandle, fattr), 2146 len, sattr),
2142 &exception); 2147 &exception);
2143 } while (exception.retry); 2148 } while (exception.retry);
2144 return err; 2149 return err;
@@ -2521,7 +2526,7 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, int how)
2521 */ 2526 */
2522static void nfs4_renew_done(struct rpc_task *task, void *data) 2527static void nfs4_renew_done(struct rpc_task *task, void *data)
2523{ 2528{
2524 struct nfs4_client *clp = (struct nfs4_client *)task->tk_msg.rpc_argp; 2529 struct nfs_client *clp = (struct nfs_client *)task->tk_msg.rpc_argp;
2525 unsigned long timestamp = (unsigned long)data; 2530 unsigned long timestamp = (unsigned long)data;
2526 2531
2527 if (task->tk_status < 0) { 2532 if (task->tk_status < 0) {
@@ -2543,7 +2548,7 @@ static const struct rpc_call_ops nfs4_renew_ops = {
2543 .rpc_call_done = nfs4_renew_done, 2548 .rpc_call_done = nfs4_renew_done,
2544}; 2549};
2545 2550
2546int nfs4_proc_async_renew(struct nfs4_client *clp, struct rpc_cred *cred) 2551int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
2547{ 2552{
2548 struct rpc_message msg = { 2553 struct rpc_message msg = {
2549 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], 2554 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW],
@@ -2555,7 +2560,7 @@ int nfs4_proc_async_renew(struct nfs4_client *clp, struct rpc_cred *cred)
2555 &nfs4_renew_ops, (void *)jiffies); 2560 &nfs4_renew_ops, (void *)jiffies);
2556} 2561}
2557 2562
2558int nfs4_proc_renew(struct nfs4_client *clp, struct rpc_cred *cred) 2563int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
2559{ 2564{
2560 struct rpc_message msg = { 2565 struct rpc_message msg = {
2561 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], 2566 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW],
@@ -2770,7 +2775,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
2770 return -EOPNOTSUPP; 2775 return -EOPNOTSUPP;
2771 nfs_inode_return_delegation(inode); 2776 nfs_inode_return_delegation(inode);
2772 buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); 2777 buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
2773 ret = rpc_call_sync(NFS_SERVER(inode)->client, &msg, 0); 2778 ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
2774 if (ret == 0) 2779 if (ret == 0)
2775 nfs4_write_cached_acl(inode, buf, buflen); 2780 nfs4_write_cached_acl(inode, buf, buflen);
2776 return ret; 2781 return ret;
@@ -2791,7 +2796,7 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
2791static int 2796static int
2792nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server) 2797nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server)
2793{ 2798{
2794 struct nfs4_client *clp = server->nfs4_state; 2799 struct nfs_client *clp = server->nfs_client;
2795 2800
2796 if (!clp || task->tk_status >= 0) 2801 if (!clp || task->tk_status >= 0)
2797 return 0; 2802 return 0;
@@ -2828,7 +2833,7 @@ static int nfs4_wait_bit_interruptible(void *word)
2828 return 0; 2833 return 0;
2829} 2834}
2830 2835
2831static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs4_client *clp) 2836static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp)
2832{ 2837{
2833 sigset_t oldset; 2838 sigset_t oldset;
2834 int res; 2839 int res;
@@ -2871,7 +2876,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
2871 */ 2876 */
2872int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception) 2877int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
2873{ 2878{
2874 struct nfs4_client *clp = server->nfs4_state; 2879 struct nfs_client *clp = server->nfs_client;
2875 int ret = errorcode; 2880 int ret = errorcode;
2876 2881
2877 exception->retry = 0; 2882 exception->retry = 0;
@@ -2886,6 +2891,7 @@ int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct
2886 if (ret == 0) 2891 if (ret == 0)
2887 exception->retry = 1; 2892 exception->retry = 1;
2888 break; 2893 break;
2894 case -NFS4ERR_FILE_OPEN:
2889 case -NFS4ERR_GRACE: 2895 case -NFS4ERR_GRACE:
2890 case -NFS4ERR_DELAY: 2896 case -NFS4ERR_DELAY:
2891 ret = nfs4_delay(server->client, &exception->timeout); 2897 ret = nfs4_delay(server->client, &exception->timeout);
@@ -2898,7 +2904,7 @@ int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct
2898 return nfs4_map_errors(ret); 2904 return nfs4_map_errors(ret);
2899} 2905}
2900 2906
2901int nfs4_proc_setclientid(struct nfs4_client *clp, u32 program, unsigned short port, struct rpc_cred *cred) 2907int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred)
2902{ 2908{
2903 nfs4_verifier sc_verifier; 2909 nfs4_verifier sc_verifier;
2904 struct nfs4_setclientid setclientid = { 2910 struct nfs4_setclientid setclientid = {
@@ -2922,7 +2928,7 @@ int nfs4_proc_setclientid(struct nfs4_client *clp, u32 program, unsigned short p
2922 for(;;) { 2928 for(;;) {
2923 setclientid.sc_name_len = scnprintf(setclientid.sc_name, 2929 setclientid.sc_name_len = scnprintf(setclientid.sc_name,
2924 sizeof(setclientid.sc_name), "%s/%u.%u.%u.%u %s %u", 2930 sizeof(setclientid.sc_name), "%s/%u.%u.%u.%u %s %u",
2925 clp->cl_ipaddr, NIPQUAD(clp->cl_addr.s_addr), 2931 clp->cl_ipaddr, NIPQUAD(clp->cl_addr.sin_addr),
2926 cred->cr_ops->cr_name, 2932 cred->cr_ops->cr_name,
2927 clp->cl_id_uniquifier); 2933 clp->cl_id_uniquifier);
2928 setclientid.sc_netid_len = scnprintf(setclientid.sc_netid, 2934 setclientid.sc_netid_len = scnprintf(setclientid.sc_netid,
@@ -2945,7 +2951,7 @@ int nfs4_proc_setclientid(struct nfs4_client *clp, u32 program, unsigned short p
2945 return status; 2951 return status;
2946} 2952}
2947 2953
2948static int _nfs4_proc_setclientid_confirm(struct nfs4_client *clp, struct rpc_cred *cred) 2954static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred)
2949{ 2955{
2950 struct nfs_fsinfo fsinfo; 2956 struct nfs_fsinfo fsinfo;
2951 struct rpc_message msg = { 2957 struct rpc_message msg = {
@@ -2969,7 +2975,7 @@ static int _nfs4_proc_setclientid_confirm(struct nfs4_client *clp, struct rpc_cr
2969 return status; 2975 return status;
2970} 2976}
2971 2977
2972int nfs4_proc_setclientid_confirm(struct nfs4_client *clp, struct rpc_cred *cred) 2978int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred)
2973{ 2979{
2974 long timeout; 2980 long timeout;
2975 int err; 2981 int err;
@@ -3077,7 +3083,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4
3077 switch (err) { 3083 switch (err) {
3078 case -NFS4ERR_STALE_STATEID: 3084 case -NFS4ERR_STALE_STATEID:
3079 case -NFS4ERR_EXPIRED: 3085 case -NFS4ERR_EXPIRED:
3080 nfs4_schedule_state_recovery(server->nfs4_state); 3086 nfs4_schedule_state_recovery(server->nfs_client);
3081 case 0: 3087 case 0:
3082 return 0; 3088 return 0;
3083 } 3089 }
@@ -3106,7 +3112,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
3106{ 3112{
3107 struct inode *inode = state->inode; 3113 struct inode *inode = state->inode;
3108 struct nfs_server *server = NFS_SERVER(inode); 3114 struct nfs_server *server = NFS_SERVER(inode);
3109 struct nfs4_client *clp = server->nfs4_state; 3115 struct nfs_client *clp = server->nfs_client;
3110 struct nfs_lockt_args arg = { 3116 struct nfs_lockt_args arg = {
3111 .fh = NFS_FH(inode), 3117 .fh = NFS_FH(inode),
3112 .fl = request, 3118 .fl = request,
@@ -3231,7 +3237,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
3231 break; 3237 break;
3232 case -NFS4ERR_STALE_STATEID: 3238 case -NFS4ERR_STALE_STATEID:
3233 case -NFS4ERR_EXPIRED: 3239 case -NFS4ERR_EXPIRED:
3234 nfs4_schedule_state_recovery(calldata->server->nfs4_state); 3240 nfs4_schedule_state_recovery(calldata->server->nfs_client);
3235 break; 3241 break;
3236 default: 3242 default:
3237 if (nfs4_async_handle_error(task, calldata->server) == -EAGAIN) { 3243 if (nfs4_async_handle_error(task, calldata->server) == -EAGAIN) {
@@ -3343,7 +3349,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
3343 if (p->arg.lock_seqid == NULL) 3349 if (p->arg.lock_seqid == NULL)
3344 goto out_free; 3350 goto out_free;
3345 p->arg.lock_stateid = &lsp->ls_stateid; 3351 p->arg.lock_stateid = &lsp->ls_stateid;
3346 p->arg.lock_owner.clientid = server->nfs4_state->cl_clientid; 3352 p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
3347 p->arg.lock_owner.id = lsp->ls_id; 3353 p->arg.lock_owner.id = lsp->ls_id;
3348 p->lsp = lsp; 3354 p->lsp = lsp;
3349 atomic_inc(&lsp->ls_count); 3355 atomic_inc(&lsp->ls_count);
@@ -3513,7 +3519,7 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
3513 3519
3514static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) 3520static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
3515{ 3521{
3516 struct nfs4_client *clp = state->owner->so_client; 3522 struct nfs_client *clp = state->owner->so_client;
3517 unsigned char fl_flags = request->fl_flags; 3523 unsigned char fl_flags = request->fl_flags;
3518 int status; 3524 int status;
3519 3525
@@ -3715,7 +3721,7 @@ static struct inode_operations nfs4_file_inode_operations = {
3715 .listxattr = nfs4_listxattr, 3721 .listxattr = nfs4_listxattr,
3716}; 3722};
3717 3723
3718struct nfs_rpc_ops nfs_v4_clientops = { 3724const struct nfs_rpc_ops nfs_v4_clientops = {
3719 .version = 4, /* protocol version */ 3725 .version = 4, /* protocol version */
3720 .dentry_ops = &nfs4_dentry_operations, 3726 .dentry_ops = &nfs4_dentry_operations,
3721 .dir_inode_ops = &nfs4_dir_inode_operations, 3727 .dir_inode_ops = &nfs4_dir_inode_operations,
@@ -3723,6 +3729,7 @@ struct nfs_rpc_ops nfs_v4_clientops = {
3723 .getroot = nfs4_proc_get_root, 3729 .getroot = nfs4_proc_get_root,
3724 .getattr = nfs4_proc_getattr, 3730 .getattr = nfs4_proc_getattr,
3725 .setattr = nfs4_proc_setattr, 3731 .setattr = nfs4_proc_setattr,
3732 .lookupfh = nfs4_proc_lookupfh,
3726 .lookup = nfs4_proc_lookup, 3733 .lookup = nfs4_proc_lookup,
3727 .access = nfs4_proc_access, 3734 .access = nfs4_proc_access,
3728 .readlink = nfs4_proc_readlink, 3735 .readlink = nfs4_proc_readlink,
@@ -3743,6 +3750,7 @@ struct nfs_rpc_ops nfs_v4_clientops = {
3743 .statfs = nfs4_proc_statfs, 3750 .statfs = nfs4_proc_statfs,
3744 .fsinfo = nfs4_proc_fsinfo, 3751 .fsinfo = nfs4_proc_fsinfo,
3745 .pathconf = nfs4_proc_pathconf, 3752 .pathconf = nfs4_proc_pathconf,
3753 .set_capabilities = nfs4_server_capabilities,
3746 .decode_dirent = nfs4_decode_dirent, 3754 .decode_dirent = nfs4_decode_dirent,
3747 .read_setup = nfs4_proc_read_setup, 3755 .read_setup = nfs4_proc_read_setup,
3748 .read_done = nfs4_read_done, 3756 .read_done = nfs4_read_done,
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 5d764d8e6d8a..7b6df1852e75 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -61,7 +61,7 @@
61void 61void
62nfs4_renew_state(void *data) 62nfs4_renew_state(void *data)
63{ 63{
64 struct nfs4_client *clp = (struct nfs4_client *)data; 64 struct nfs_client *clp = (struct nfs_client *)data;
65 struct rpc_cred *cred; 65 struct rpc_cred *cred;
66 long lease, timeout; 66 long lease, timeout;
67 unsigned long last, now; 67 unsigned long last, now;
@@ -108,7 +108,7 @@ out:
108 108
109/* Must be called with clp->cl_sem locked for writes */ 109/* Must be called with clp->cl_sem locked for writes */
110void 110void
111nfs4_schedule_state_renewal(struct nfs4_client *clp) 111nfs4_schedule_state_renewal(struct nfs_client *clp)
112{ 112{
113 long timeout; 113 long timeout;
114 114
@@ -121,32 +121,20 @@ nfs4_schedule_state_renewal(struct nfs4_client *clp)
121 __FUNCTION__, (timeout + HZ - 1) / HZ); 121 __FUNCTION__, (timeout + HZ - 1) / HZ);
122 cancel_delayed_work(&clp->cl_renewd); 122 cancel_delayed_work(&clp->cl_renewd);
123 schedule_delayed_work(&clp->cl_renewd, timeout); 123 schedule_delayed_work(&clp->cl_renewd, timeout);
124 set_bit(NFS_CS_RENEWD, &clp->cl_res_state);
124 spin_unlock(&clp->cl_lock); 125 spin_unlock(&clp->cl_lock);
125} 126}
126 127
127void 128void
128nfs4_renewd_prepare_shutdown(struct nfs_server *server) 129nfs4_renewd_prepare_shutdown(struct nfs_server *server)
129{ 130{
130 struct nfs4_client *clp = server->nfs4_state;
131
132 if (!clp)
133 return;
134 flush_scheduled_work(); 131 flush_scheduled_work();
135 down_write(&clp->cl_sem);
136 if (!list_empty(&server->nfs4_siblings))
137 list_del_init(&server->nfs4_siblings);
138 up_write(&clp->cl_sem);
139} 132}
140 133
141/* Must be called with clp->cl_sem locked for writes */
142void 134void
143nfs4_kill_renewd(struct nfs4_client *clp) 135nfs4_kill_renewd(struct nfs_client *clp)
144{ 136{
145 down_read(&clp->cl_sem); 137 down_read(&clp->cl_sem);
146 if (!list_empty(&clp->cl_superblocks)) {
147 up_read(&clp->cl_sem);
148 return;
149 }
150 cancel_delayed_work(&clp->cl_renewd); 138 cancel_delayed_work(&clp->cl_renewd);
151 up_read(&clp->cl_sem); 139 up_read(&clp->cl_sem);
152 flush_scheduled_work(); 140 flush_scheduled_work();
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 090a36b07a22..5fffbdfa971f 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -50,149 +50,15 @@
50#include "nfs4_fs.h" 50#include "nfs4_fs.h"
51#include "callback.h" 51#include "callback.h"
52#include "delegation.h" 52#include "delegation.h"
53#include "internal.h"
53 54
54#define OPENOWNER_POOL_SIZE 8 55#define OPENOWNER_POOL_SIZE 8
55 56
56const nfs4_stateid zero_stateid; 57const nfs4_stateid zero_stateid;
57 58
58static DEFINE_SPINLOCK(state_spinlock);
59static LIST_HEAD(nfs4_clientid_list); 59static LIST_HEAD(nfs4_clientid_list);
60 60
61void 61static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred)
62init_nfsv4_state(struct nfs_server *server)
63{
64 server->nfs4_state = NULL;
65 INIT_LIST_HEAD(&server->nfs4_siblings);
66}
67
68void
69destroy_nfsv4_state(struct nfs_server *server)
70{
71 kfree(server->mnt_path);
72 server->mnt_path = NULL;
73 if (server->nfs4_state) {
74 nfs4_put_client(server->nfs4_state);
75 server->nfs4_state = NULL;
76 }
77}
78
79/*
80 * nfs4_get_client(): returns an empty client structure
81 * nfs4_put_client(): drops reference to client structure
82 *
83 * Since these are allocated/deallocated very rarely, we don't
84 * bother putting them in a slab cache...
85 */
86static struct nfs4_client *
87nfs4_alloc_client(struct in_addr *addr)
88{
89 struct nfs4_client *clp;
90
91 if (nfs_callback_up() < 0)
92 return NULL;
93 if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL) {
94 nfs_callback_down();
95 return NULL;
96 }
97 memcpy(&clp->cl_addr, addr, sizeof(clp->cl_addr));
98 init_rwsem(&clp->cl_sem);
99 INIT_LIST_HEAD(&clp->cl_delegations);
100 INIT_LIST_HEAD(&clp->cl_state_owners);
101 INIT_LIST_HEAD(&clp->cl_unused);
102 spin_lock_init(&clp->cl_lock);
103 atomic_set(&clp->cl_count, 1);
104 INIT_WORK(&clp->cl_renewd, nfs4_renew_state, clp);
105 INIT_LIST_HEAD(&clp->cl_superblocks);
106 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS4 client");
107 clp->cl_rpcclient = ERR_PTR(-EINVAL);
108 clp->cl_boot_time = CURRENT_TIME;
109 clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
110 return clp;
111}
112
113static void
114nfs4_free_client(struct nfs4_client *clp)
115{
116 struct nfs4_state_owner *sp;
117
118 while (!list_empty(&clp->cl_unused)) {
119 sp = list_entry(clp->cl_unused.next,
120 struct nfs4_state_owner,
121 so_list);
122 list_del(&sp->so_list);
123 kfree(sp);
124 }
125 BUG_ON(!list_empty(&clp->cl_state_owners));
126 nfs_idmap_delete(clp);
127 if (!IS_ERR(clp->cl_rpcclient))
128 rpc_shutdown_client(clp->cl_rpcclient);
129 kfree(clp);
130 nfs_callback_down();
131}
132
133static struct nfs4_client *__nfs4_find_client(struct in_addr *addr)
134{
135 struct nfs4_client *clp;
136 list_for_each_entry(clp, &nfs4_clientid_list, cl_servers) {
137 if (memcmp(&clp->cl_addr, addr, sizeof(clp->cl_addr)) == 0) {
138 atomic_inc(&clp->cl_count);
139 return clp;
140 }
141 }
142 return NULL;
143}
144
145struct nfs4_client *nfs4_find_client(struct in_addr *addr)
146{
147 struct nfs4_client *clp;
148 spin_lock(&state_spinlock);
149 clp = __nfs4_find_client(addr);
150 spin_unlock(&state_spinlock);
151 return clp;
152}
153
154struct nfs4_client *
155nfs4_get_client(struct in_addr *addr)
156{
157 struct nfs4_client *clp, *new = NULL;
158
159 spin_lock(&state_spinlock);
160 for (;;) {
161 clp = __nfs4_find_client(addr);
162 if (clp != NULL)
163 break;
164 clp = new;
165 if (clp != NULL) {
166 list_add(&clp->cl_servers, &nfs4_clientid_list);
167 new = NULL;
168 break;
169 }
170 spin_unlock(&state_spinlock);
171 new = nfs4_alloc_client(addr);
172 spin_lock(&state_spinlock);
173 if (new == NULL)
174 break;
175 }
176 spin_unlock(&state_spinlock);
177 if (new)
178 nfs4_free_client(new);
179 return clp;
180}
181
182void
183nfs4_put_client(struct nfs4_client *clp)
184{
185 if (!atomic_dec_and_lock(&clp->cl_count, &state_spinlock))
186 return;
187 list_del(&clp->cl_servers);
188 spin_unlock(&state_spinlock);
189 BUG_ON(!list_empty(&clp->cl_superblocks));
190 rpc_wake_up(&clp->cl_rpcwaitq);
191 nfs4_kill_renewd(clp);
192 nfs4_free_client(clp);
193}
194
195static int nfs4_init_client(struct nfs4_client *clp, struct rpc_cred *cred)
196{ 62{
197 int status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, 63 int status = nfs4_proc_setclientid(clp, NFS4_CALLBACK,
198 nfs_callback_tcpport, cred); 64 nfs_callback_tcpport, cred);
@@ -204,13 +70,13 @@ static int nfs4_init_client(struct nfs4_client *clp, struct rpc_cred *cred)
204} 70}
205 71
206u32 72u32
207nfs4_alloc_lockowner_id(struct nfs4_client *clp) 73nfs4_alloc_lockowner_id(struct nfs_client *clp)
208{ 74{
209 return clp->cl_lockowner_id ++; 75 return clp->cl_lockowner_id ++;
210} 76}
211 77
212static struct nfs4_state_owner * 78static struct nfs4_state_owner *
213nfs4_client_grab_unused(struct nfs4_client *clp, struct rpc_cred *cred) 79nfs4_client_grab_unused(struct nfs_client *clp, struct rpc_cred *cred)
214{ 80{
215 struct nfs4_state_owner *sp = NULL; 81 struct nfs4_state_owner *sp = NULL;
216 82
@@ -224,7 +90,7 @@ nfs4_client_grab_unused(struct nfs4_client *clp, struct rpc_cred *cred)
224 return sp; 90 return sp;
225} 91}
226 92
227struct rpc_cred *nfs4_get_renew_cred(struct nfs4_client *clp) 93struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp)
228{ 94{
229 struct nfs4_state_owner *sp; 95 struct nfs4_state_owner *sp;
230 struct rpc_cred *cred = NULL; 96 struct rpc_cred *cred = NULL;
@@ -238,7 +104,7 @@ struct rpc_cred *nfs4_get_renew_cred(struct nfs4_client *clp)
238 return cred; 104 return cred;
239} 105}
240 106
241struct rpc_cred *nfs4_get_setclientid_cred(struct nfs4_client *clp) 107struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
242{ 108{
243 struct nfs4_state_owner *sp; 109 struct nfs4_state_owner *sp;
244 110
@@ -251,7 +117,7 @@ struct rpc_cred *nfs4_get_setclientid_cred(struct nfs4_client *clp)
251} 117}
252 118
253static struct nfs4_state_owner * 119static struct nfs4_state_owner *
254nfs4_find_state_owner(struct nfs4_client *clp, struct rpc_cred *cred) 120nfs4_find_state_owner(struct nfs_client *clp, struct rpc_cred *cred)
255{ 121{
256 struct nfs4_state_owner *sp, *res = NULL; 122 struct nfs4_state_owner *sp, *res = NULL;
257 123
@@ -294,7 +160,7 @@ nfs4_alloc_state_owner(void)
294void 160void
295nfs4_drop_state_owner(struct nfs4_state_owner *sp) 161nfs4_drop_state_owner(struct nfs4_state_owner *sp)
296{ 162{
297 struct nfs4_client *clp = sp->so_client; 163 struct nfs_client *clp = sp->so_client;
298 spin_lock(&clp->cl_lock); 164 spin_lock(&clp->cl_lock);
299 list_del_init(&sp->so_list); 165 list_del_init(&sp->so_list);
300 spin_unlock(&clp->cl_lock); 166 spin_unlock(&clp->cl_lock);
@@ -306,7 +172,7 @@ nfs4_drop_state_owner(struct nfs4_state_owner *sp)
306 */ 172 */
307struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred) 173struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred)
308{ 174{
309 struct nfs4_client *clp = server->nfs4_state; 175 struct nfs_client *clp = server->nfs_client;
310 struct nfs4_state_owner *sp, *new; 176 struct nfs4_state_owner *sp, *new;
311 177
312 get_rpccred(cred); 178 get_rpccred(cred);
@@ -337,7 +203,7 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
337 */ 203 */
338void nfs4_put_state_owner(struct nfs4_state_owner *sp) 204void nfs4_put_state_owner(struct nfs4_state_owner *sp)
339{ 205{
340 struct nfs4_client *clp = sp->so_client; 206 struct nfs_client *clp = sp->so_client;
341 struct rpc_cred *cred = sp->so_cred; 207 struct rpc_cred *cred = sp->so_cred;
342 208
343 if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) 209 if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock))
@@ -540,7 +406,7 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
540static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) 406static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
541{ 407{
542 struct nfs4_lock_state *lsp; 408 struct nfs4_lock_state *lsp;
543 struct nfs4_client *clp = state->owner->so_client; 409 struct nfs_client *clp = state->owner->so_client;
544 410
545 lsp = kzalloc(sizeof(*lsp), GFP_KERNEL); 411 lsp = kzalloc(sizeof(*lsp), GFP_KERNEL);
546 if (lsp == NULL) 412 if (lsp == NULL)
@@ -752,7 +618,7 @@ out:
752 618
753static int reclaimer(void *); 619static int reclaimer(void *);
754 620
755static inline void nfs4_clear_recover_bit(struct nfs4_client *clp) 621static inline void nfs4_clear_recover_bit(struct nfs_client *clp)
756{ 622{
757 smp_mb__before_clear_bit(); 623 smp_mb__before_clear_bit();
758 clear_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state); 624 clear_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state);
@@ -764,25 +630,25 @@ static inline void nfs4_clear_recover_bit(struct nfs4_client *clp)
764/* 630/*
765 * State recovery routine 631 * State recovery routine
766 */ 632 */
767static void nfs4_recover_state(struct nfs4_client *clp) 633static void nfs4_recover_state(struct nfs_client *clp)
768{ 634{
769 struct task_struct *task; 635 struct task_struct *task;
770 636
771 __module_get(THIS_MODULE); 637 __module_get(THIS_MODULE);
772 atomic_inc(&clp->cl_count); 638 atomic_inc(&clp->cl_count);
773 task = kthread_run(reclaimer, clp, "%u.%u.%u.%u-reclaim", 639 task = kthread_run(reclaimer, clp, "%u.%u.%u.%u-reclaim",
774 NIPQUAD(clp->cl_addr)); 640 NIPQUAD(clp->cl_addr.sin_addr));
775 if (!IS_ERR(task)) 641 if (!IS_ERR(task))
776 return; 642 return;
777 nfs4_clear_recover_bit(clp); 643 nfs4_clear_recover_bit(clp);
778 nfs4_put_client(clp); 644 nfs_put_client(clp);
779 module_put(THIS_MODULE); 645 module_put(THIS_MODULE);
780} 646}
781 647
782/* 648/*
783 * Schedule a state recovery attempt 649 * Schedule a state recovery attempt
784 */ 650 */
785void nfs4_schedule_state_recovery(struct nfs4_client *clp) 651void nfs4_schedule_state_recovery(struct nfs_client *clp)
786{ 652{
787 if (!clp) 653 if (!clp)
788 return; 654 return;
@@ -879,7 +745,7 @@ out_err:
879 return status; 745 return status;
880} 746}
881 747
882static void nfs4_state_mark_reclaim(struct nfs4_client *clp) 748static void nfs4_state_mark_reclaim(struct nfs_client *clp)
883{ 749{
884 struct nfs4_state_owner *sp; 750 struct nfs4_state_owner *sp;
885 struct nfs4_state *state; 751 struct nfs4_state *state;
@@ -903,7 +769,7 @@ static void nfs4_state_mark_reclaim(struct nfs4_client *clp)
903 769
904static int reclaimer(void *ptr) 770static int reclaimer(void *ptr)
905{ 771{
906 struct nfs4_client *clp = ptr; 772 struct nfs_client *clp = ptr;
907 struct nfs4_state_owner *sp; 773 struct nfs4_state_owner *sp;
908 struct nfs4_state_recovery_ops *ops; 774 struct nfs4_state_recovery_ops *ops;
909 struct rpc_cred *cred; 775 struct rpc_cred *cred;
@@ -970,12 +836,12 @@ out:
970 if (status == -NFS4ERR_CB_PATH_DOWN) 836 if (status == -NFS4ERR_CB_PATH_DOWN)
971 nfs_handle_cb_pathdown(clp); 837 nfs_handle_cb_pathdown(clp);
972 nfs4_clear_recover_bit(clp); 838 nfs4_clear_recover_bit(clp);
973 nfs4_put_client(clp); 839 nfs_put_client(clp);
974 module_put_and_exit(0); 840 module_put_and_exit(0);
975 return 0; 841 return 0;
976out_error: 842out_error:
977 printk(KERN_WARNING "Error: state recovery failed on NFSv4 server %u.%u.%u.%u with error %d\n", 843 printk(KERN_WARNING "Error: state recovery failed on NFSv4 server %u.%u.%u.%u with error %d\n",
978 NIPQUAD(clp->cl_addr.s_addr), -status); 844 NIPQUAD(clp->cl_addr.sin_addr), -status);
979 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); 845 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
980 goto out; 846 goto out;
981} 847}
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 730ec8fb31c6..3dd413f52da1 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -58,7 +58,7 @@
58/* Mapping from NFS error code to "errno" error code. */ 58/* Mapping from NFS error code to "errno" error code. */
59#define errno_NFSERR_IO EIO 59#define errno_NFSERR_IO EIO
60 60
61static int nfs_stat_to_errno(int); 61static int nfs4_stat_to_errno(int);
62 62
63/* NFSv4 COMPOUND tags are only wanted for debugging purposes */ 63/* NFSv4 COMPOUND tags are only wanted for debugging purposes */
64#ifdef DEBUG 64#ifdef DEBUG
@@ -128,7 +128,7 @@ static int nfs_stat_to_errno(int);
128#define decode_link_maxsz (op_decode_hdr_maxsz + 5) 128#define decode_link_maxsz (op_decode_hdr_maxsz + 5)
129#define encode_symlink_maxsz (op_encode_hdr_maxsz + \ 129#define encode_symlink_maxsz (op_encode_hdr_maxsz + \
130 1 + nfs4_name_maxsz + \ 130 1 + nfs4_name_maxsz + \
131 nfs4_path_maxsz + \ 131 1 + \
132 nfs4_fattr_maxsz) 132 nfs4_fattr_maxsz)
133#define decode_symlink_maxsz (op_decode_hdr_maxsz + 8) 133#define decode_symlink_maxsz (op_decode_hdr_maxsz + 8)
134#define encode_create_maxsz (op_encode_hdr_maxsz + \ 134#define encode_create_maxsz (op_encode_hdr_maxsz + \
@@ -529,7 +529,7 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
529 if (iap->ia_valid & ATTR_MODE) 529 if (iap->ia_valid & ATTR_MODE)
530 len += 4; 530 len += 4;
531 if (iap->ia_valid & ATTR_UID) { 531 if (iap->ia_valid & ATTR_UID) {
532 owner_namelen = nfs_map_uid_to_name(server->nfs4_state, iap->ia_uid, owner_name); 532 owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name);
533 if (owner_namelen < 0) { 533 if (owner_namelen < 0) {
534 printk(KERN_WARNING "nfs: couldn't resolve uid %d to string\n", 534 printk(KERN_WARNING "nfs: couldn't resolve uid %d to string\n",
535 iap->ia_uid); 535 iap->ia_uid);
@@ -541,7 +541,7 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
541 len += 4 + (XDR_QUADLEN(owner_namelen) << 2); 541 len += 4 + (XDR_QUADLEN(owner_namelen) << 2);
542 } 542 }
543 if (iap->ia_valid & ATTR_GID) { 543 if (iap->ia_valid & ATTR_GID) {
544 owner_grouplen = nfs_map_gid_to_group(server->nfs4_state, iap->ia_gid, owner_group); 544 owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group);
545 if (owner_grouplen < 0) { 545 if (owner_grouplen < 0) {
546 printk(KERN_WARNING "nfs4: couldn't resolve gid %d to string\n", 546 printk(KERN_WARNING "nfs4: couldn't resolve gid %d to string\n",
547 iap->ia_gid); 547 iap->ia_gid);
@@ -673,9 +673,9 @@ static int encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *c
673 673
674 switch (create->ftype) { 674 switch (create->ftype) {
675 case NF4LNK: 675 case NF4LNK:
676 RESERVE_SPACE(4 + create->u.symlink->len); 676 RESERVE_SPACE(4);
677 WRITE32(create->u.symlink->len); 677 WRITE32(create->u.symlink.len);
678 WRITEMEM(create->u.symlink->name, create->u.symlink->len); 678 xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len);
679 break; 679 break;
680 680
681 case NF4BLK: case NF4CHR: 681 case NF4BLK: case NF4CHR:
@@ -1160,7 +1160,7 @@ static int encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, con
1160 return 0; 1160 return 0;
1161} 1161}
1162 1162
1163static int encode_renew(struct xdr_stream *xdr, const struct nfs4_client *client_stateid) 1163static int encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid)
1164{ 1164{
1165 uint32_t *p; 1165 uint32_t *p;
1166 1166
@@ -1246,7 +1246,7 @@ static int encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclien
1246 return 0; 1246 return 0;
1247} 1247}
1248 1248
1249static int encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_client *client_state) 1249static int encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state)
1250{ 1250{
1251 uint32_t *p; 1251 uint32_t *p;
1252 1252
@@ -1945,7 +1945,7 @@ static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, uint32_t *p, const str
1945/* 1945/*
1946 * a RENEW request 1946 * a RENEW request
1947 */ 1947 */
1948static int nfs4_xdr_enc_renew(struct rpc_rqst *req, uint32_t *p, struct nfs4_client *clp) 1948static int nfs4_xdr_enc_renew(struct rpc_rqst *req, uint32_t *p, struct nfs_client *clp)
1949{ 1949{
1950 struct xdr_stream xdr; 1950 struct xdr_stream xdr;
1951 struct compound_hdr hdr = { 1951 struct compound_hdr hdr = {
@@ -1975,7 +1975,7 @@ static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, uint32_t *p, struct nf
1975/* 1975/*
1976 * a SETCLIENTID_CONFIRM request 1976 * a SETCLIENTID_CONFIRM request
1977 */ 1977 */
1978static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, uint32_t *p, struct nfs4_client *clp) 1978static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, uint32_t *p, struct nfs_client *clp)
1979{ 1979{
1980 struct xdr_stream xdr; 1980 struct xdr_stream xdr;
1981 struct compound_hdr hdr = { 1981 struct compound_hdr hdr = {
@@ -2127,12 +2127,12 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
2127 } 2127 }
2128 READ32(nfserr); 2128 READ32(nfserr);
2129 if (nfserr != NFS_OK) 2129 if (nfserr != NFS_OK)
2130 return -nfs_stat_to_errno(nfserr); 2130 return -nfs4_stat_to_errno(nfserr);
2131 return 0; 2131 return 0;
2132} 2132}
2133 2133
2134/* Dummy routine */ 2134/* Dummy routine */
2135static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs4_client *clp) 2135static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp)
2136{ 2136{
2137 uint32_t *p; 2137 uint32_t *p;
2138 unsigned int strlen; 2138 unsigned int strlen;
@@ -2636,7 +2636,7 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
2636 return 0; 2636 return 0;
2637} 2637}
2638 2638
2639static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_client *clp, int32_t *uid) 2639static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, int32_t *uid)
2640{ 2640{
2641 uint32_t len, *p; 2641 uint32_t len, *p;
2642 2642
@@ -2660,7 +2660,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
2660 return 0; 2660 return 0;
2661} 2661}
2662 2662
2663static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_client *clp, int32_t *gid) 2663static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, int32_t *gid)
2664{ 2664{
2665 uint32_t len, *p; 2665 uint32_t len, *p;
2666 2666
@@ -3051,9 +3051,9 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
3051 fattr->mode |= fmode; 3051 fattr->mode |= fmode;
3052 if ((status = decode_attr_nlink(xdr, bitmap, &fattr->nlink)) != 0) 3052 if ((status = decode_attr_nlink(xdr, bitmap, &fattr->nlink)) != 0)
3053 goto xdr_error; 3053 goto xdr_error;
3054 if ((status = decode_attr_owner(xdr, bitmap, server->nfs4_state, &fattr->uid)) != 0) 3054 if ((status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid)) != 0)
3055 goto xdr_error; 3055 goto xdr_error;
3056 if ((status = decode_attr_group(xdr, bitmap, server->nfs4_state, &fattr->gid)) != 0) 3056 if ((status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid)) != 0)
3057 goto xdr_error; 3057 goto xdr_error;
3058 if ((status = decode_attr_rdev(xdr, bitmap, &fattr->rdev)) != 0) 3058 if ((status = decode_attr_rdev(xdr, bitmap, &fattr->rdev)) != 0)
3059 goto xdr_error; 3059 goto xdr_error;
@@ -3254,7 +3254,7 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
3254 if (decode_space_limit(xdr, &res->maxsize) < 0) 3254 if (decode_space_limit(xdr, &res->maxsize) < 0)
3255 return -EIO; 3255 return -EIO;
3256 } 3256 }
3257 return decode_ace(xdr, NULL, res->server->nfs4_state); 3257 return decode_ace(xdr, NULL, res->server->nfs_client);
3258} 3258}
3259 3259
3260static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) 3260static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
@@ -3565,7 +3565,7 @@ static int decode_setattr(struct xdr_stream *xdr, struct nfs_setattrres *res)
3565 return 0; 3565 return 0;
3566} 3566}
3567 3567
3568static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_client *clp) 3568static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
3569{ 3569{
3570 uint32_t *p; 3570 uint32_t *p;
3571 uint32_t opnum; 3571 uint32_t opnum;
@@ -3598,7 +3598,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_client *clp)
3598 READ_BUF(len); 3598 READ_BUF(len);
3599 return -NFSERR_CLID_INUSE; 3599 return -NFSERR_CLID_INUSE;
3600 } else 3600 } else
3601 return -nfs_stat_to_errno(nfserr); 3601 return -nfs4_stat_to_errno(nfserr);
3602 3602
3603 return 0; 3603 return 0;
3604} 3604}
@@ -4256,7 +4256,7 @@ static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, uint32_t *p, struct nfs_fsi
4256 if (!status) 4256 if (!status)
4257 status = decode_fsinfo(&xdr, fsinfo); 4257 status = decode_fsinfo(&xdr, fsinfo);
4258 if (!status) 4258 if (!status)
4259 status = -nfs_stat_to_errno(hdr.status); 4259 status = -nfs4_stat_to_errno(hdr.status);
4260 return status; 4260 return status;
4261} 4261}
4262 4262
@@ -4335,7 +4335,7 @@ static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, uint32_t *p, void *dummy)
4335 * a SETCLIENTID request 4335 * a SETCLIENTID request
4336 */ 4336 */
4337static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, uint32_t *p, 4337static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, uint32_t *p,
4338 struct nfs4_client *clp) 4338 struct nfs_client *clp)
4339{ 4339{
4340 struct xdr_stream xdr; 4340 struct xdr_stream xdr;
4341 struct compound_hdr hdr; 4341 struct compound_hdr hdr;
@@ -4346,7 +4346,7 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, uint32_t *p,
4346 if (!status) 4346 if (!status)
4347 status = decode_setclientid(&xdr, clp); 4347 status = decode_setclientid(&xdr, clp);
4348 if (!status) 4348 if (!status)
4349 status = -nfs_stat_to_errno(hdr.status); 4349 status = -nfs4_stat_to_errno(hdr.status);
4350 return status; 4350 return status;
4351} 4351}
4352 4352
@@ -4368,7 +4368,7 @@ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, uint32_t *p, s
4368 if (!status) 4368 if (!status)
4369 status = decode_fsinfo(&xdr, fsinfo); 4369 status = decode_fsinfo(&xdr, fsinfo);
4370 if (!status) 4370 if (!status)
4371 status = -nfs_stat_to_errno(hdr.status); 4371 status = -nfs4_stat_to_errno(hdr.status);
4372 return status; 4372 return status;
4373} 4373}
4374 4374
@@ -4521,7 +4521,7 @@ static struct {
4521 * This one is used jointly by NFSv2 and NFSv3. 4521 * This one is used jointly by NFSv2 and NFSv3.
4522 */ 4522 */
4523static int 4523static int
4524nfs_stat_to_errno(int stat) 4524nfs4_stat_to_errno(int stat)
4525{ 4525{
4526 int i; 4526 int i;
4527 for (i = 0; nfs_errtbl[i].stat != -1; i++) { 4527 for (i = 0; nfs_errtbl[i].stat != -1; i++) {
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 36e902a88ca1..829af323f288 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -392,7 +392,6 @@ int __init nfs_init_nfspagecache(void)
392 392
393void nfs_destroy_nfspagecache(void) 393void nfs_destroy_nfspagecache(void)
394{ 394{
395 if (kmem_cache_destroy(nfs_page_cachep)) 395 kmem_cache_destroy(nfs_page_cachep);
396 printk(KERN_INFO "nfs_page: not all structures were freed\n");
397} 396}
398 397
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index b3899ea3229e..4529cc4f3f8f 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -66,14 +66,14 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
66 66
67 dprintk("%s: call getattr\n", __FUNCTION__); 67 dprintk("%s: call getattr\n", __FUNCTION__);
68 nfs_fattr_init(fattr); 68 nfs_fattr_init(fattr);
69 status = rpc_call_sync(server->client_sys, &msg, 0); 69 status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
70 dprintk("%s: reply getattr: %d\n", __FUNCTION__, status); 70 dprintk("%s: reply getattr: %d\n", __FUNCTION__, status);
71 if (status) 71 if (status)
72 return status; 72 return status;
73 dprintk("%s: call statfs\n", __FUNCTION__); 73 dprintk("%s: call statfs\n", __FUNCTION__);
74 msg.rpc_proc = &nfs_procedures[NFSPROC_STATFS]; 74 msg.rpc_proc = &nfs_procedures[NFSPROC_STATFS];
75 msg.rpc_resp = &fsinfo; 75 msg.rpc_resp = &fsinfo;
76 status = rpc_call_sync(server->client_sys, &msg, 0); 76 status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
77 dprintk("%s: reply statfs: %d\n", __FUNCTION__, status); 77 dprintk("%s: reply statfs: %d\n", __FUNCTION__, status);
78 if (status) 78 if (status)
79 return status; 79 return status;
@@ -352,7 +352,7 @@ nfs_proc_unlink_setup(struct rpc_message *msg, struct dentry *dir, struct qstr *
352{ 352{
353 struct nfs_diropargs *arg; 353 struct nfs_diropargs *arg;
354 354
355 arg = (struct nfs_diropargs *)kmalloc(sizeof(*arg), GFP_KERNEL); 355 arg = kmalloc(sizeof(*arg), GFP_KERNEL);
356 if (!arg) 356 if (!arg)
357 return -ENOMEM; 357 return -ENOMEM;
358 arg->fh = NFS_FH(dir->d_inode); 358 arg->fh = NFS_FH(dir->d_inode);
@@ -425,16 +425,17 @@ nfs_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
425} 425}
426 426
427static int 427static int
428nfs_proc_symlink(struct inode *dir, struct qstr *name, struct qstr *path, 428nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
429 struct iattr *sattr, struct nfs_fh *fhandle, 429 unsigned int len, struct iattr *sattr)
430 struct nfs_fattr *fattr)
431{ 430{
431 struct nfs_fh fhandle;
432 struct nfs_fattr fattr;
432 struct nfs_symlinkargs arg = { 433 struct nfs_symlinkargs arg = {
433 .fromfh = NFS_FH(dir), 434 .fromfh = NFS_FH(dir),
434 .fromname = name->name, 435 .fromname = dentry->d_name.name,
435 .fromlen = name->len, 436 .fromlen = dentry->d_name.len,
436 .topath = path->name, 437 .pages = &page,
437 .tolen = path->len, 438 .pathlen = len,
438 .sattr = sattr 439 .sattr = sattr
439 }; 440 };
440 struct rpc_message msg = { 441 struct rpc_message msg = {
@@ -443,13 +444,25 @@ nfs_proc_symlink(struct inode *dir, struct qstr *name, struct qstr *path,
443 }; 444 };
444 int status; 445 int status;
445 446
446 if (path->len > NFS2_MAXPATHLEN) 447 if (len > NFS2_MAXPATHLEN)
447 return -ENAMETOOLONG; 448 return -ENAMETOOLONG;
448 dprintk("NFS call symlink %s -> %s\n", name->name, path->name); 449
449 nfs_fattr_init(fattr); 450 dprintk("NFS call symlink %s\n", dentry->d_name.name);
450 fhandle->size = 0; 451
451 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 452 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
452 nfs_mark_for_revalidate(dir); 453 nfs_mark_for_revalidate(dir);
454
455 /*
456 * V2 SYMLINK requests don't return any attributes. Setting the
457 * filehandle size to zero indicates to nfs_instantiate that it
458 * should fill in the data with a LOOKUP call on the wire.
459 */
460 if (status == 0) {
461 nfs_fattr_init(&fattr);
462 fhandle.size = 0;
463 status = nfs_instantiate(dentry, &fhandle, &fattr);
464 }
465
453 dprintk("NFS reply symlink: %d\n", status); 466 dprintk("NFS reply symlink: %d\n", status);
454 return status; 467 return status;
455} 468}
@@ -671,7 +684,7 @@ nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
671} 684}
672 685
673 686
674struct nfs_rpc_ops nfs_v2_clientops = { 687const struct nfs_rpc_ops nfs_v2_clientops = {
675 .version = 2, /* protocol version */ 688 .version = 2, /* protocol version */
676 .dentry_ops = &nfs_dentry_operations, 689 .dentry_ops = &nfs_dentry_operations,
677 .dir_inode_ops = &nfs_dir_inode_operations, 690 .dir_inode_ops = &nfs_dir_inode_operations,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index f0aff824a291..c2e49c397a27 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -171,7 +171,7 @@ static int nfs_readpage_sync(struct nfs_open_context *ctx, struct inode *inode,
171 rdata->args.offset = page_offset(page) + rdata->args.pgbase; 171 rdata->args.offset = page_offset(page) + rdata->args.pgbase;
172 172
173 dprintk("NFS: nfs_proc_read(%s, (%s/%Ld), %Lu, %u)\n", 173 dprintk("NFS: nfs_proc_read(%s, (%s/%Ld), %Lu, %u)\n",
174 NFS_SERVER(inode)->hostname, 174 NFS_SERVER(inode)->nfs_client->cl_hostname,
175 inode->i_sb->s_id, 175 inode->i_sb->s_id,
176 (long long)NFS_FILEID(inode), 176 (long long)NFS_FILEID(inode),
177 (unsigned long long)rdata->args.pgbase, 177 (unsigned long long)rdata->args.pgbase,
@@ -568,8 +568,13 @@ int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data)
568 568
569 nfs_add_stats(data->inode, NFSIOS_SERVERREADBYTES, resp->count); 569 nfs_add_stats(data->inode, NFSIOS_SERVERREADBYTES, resp->count);
570 570
571 /* Is this a short read? */ 571 if (task->tk_status < 0) {
572 if (task->tk_status >= 0 && resp->count < argp->count && !resp->eof) { 572 if (task->tk_status == -ESTALE) {
573 set_bit(NFS_INO_STALE, &NFS_FLAGS(data->inode));
574 nfs_mark_for_revalidate(data->inode);
575 }
576 } else if (resp->count < argp->count && !resp->eof) {
577 /* This is a short read! */
573 nfs_inc_stats(data->inode, NFSIOS_SHORTREAD); 578 nfs_inc_stats(data->inode, NFSIOS_SHORTREAD);
574 /* Has the server at least made some progress? */ 579 /* Has the server at least made some progress? */
575 if (resp->count != 0) { 580 if (resp->count != 0) {
@@ -616,6 +621,10 @@ int nfs_readpage(struct file *file, struct page *page)
616 if (error) 621 if (error)
617 goto out_error; 622 goto out_error;
618 623
624 error = -ESTALE;
625 if (NFS_STALE(inode))
626 goto out_error;
627
619 if (file == NULL) { 628 if (file == NULL) {
620 ctx = nfs_find_open_context(inode, NULL, FMODE_READ); 629 ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
621 if (ctx == NULL) 630 if (ctx == NULL)
@@ -678,7 +687,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
678 }; 687 };
679 struct inode *inode = mapping->host; 688 struct inode *inode = mapping->host;
680 struct nfs_server *server = NFS_SERVER(inode); 689 struct nfs_server *server = NFS_SERVER(inode);
681 int ret; 690 int ret = -ESTALE;
682 691
683 dprintk("NFS: nfs_readpages (%s/%Ld %d)\n", 692 dprintk("NFS: nfs_readpages (%s/%Ld %d)\n",
684 inode->i_sb->s_id, 693 inode->i_sb->s_id,
@@ -686,6 +695,9 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
686 nr_pages); 695 nr_pages);
687 nfs_inc_stats(inode, NFSIOS_VFSREADPAGES); 696 nfs_inc_stats(inode, NFSIOS_VFSREADPAGES);
688 697
698 if (NFS_STALE(inode))
699 goto out;
700
689 if (filp == NULL) { 701 if (filp == NULL) {
690 desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ); 702 desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
691 if (desc.ctx == NULL) 703 if (desc.ctx == NULL)
@@ -701,6 +713,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
701 ret = err; 713 ret = err;
702 } 714 }
703 put_nfs_open_context(desc.ctx); 715 put_nfs_open_context(desc.ctx);
716out:
704 return ret; 717 return ret;
705} 718}
706 719
@@ -724,6 +737,5 @@ int __init nfs_init_readpagecache(void)
724void nfs_destroy_readpagecache(void) 737void nfs_destroy_readpagecache(void)
725{ 738{
726 mempool_destroy(nfs_rdata_mempool); 739 mempool_destroy(nfs_rdata_mempool);
727 if (kmem_cache_destroy(nfs_rdata_cachep)) 740 kmem_cache_destroy(nfs_rdata_cachep);
728 printk(KERN_INFO "nfs_read_data: not all structures were freed\n");
729} 741}
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e8a9bee74d9d..e8d40030cab4 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -13,6 +13,11 @@
13 * 13 *
14 * Split from inode.c by David Howells <dhowells@redhat.com> 14 * Split from inode.c by David Howells <dhowells@redhat.com>
15 * 15 *
16 * - superblocks are indexed on server only - all inodes, dentries, etc. associated with a
17 * particular server are held in the same superblock
18 * - NFS superblocks can have several effective roots to the dentry tree
19 * - directory type roots are spliced into the tree when a path from one root reaches the root
20 * of another (see nfs_lookup())
16 */ 21 */
17 22
18#include <linux/config.h> 23#include <linux/config.h>
@@ -52,66 +57,12 @@
52 57
53#define NFSDBG_FACILITY NFSDBG_VFS 58#define NFSDBG_FACILITY NFSDBG_VFS
54 59
55/* Maximum number of readahead requests
56 * FIXME: this should really be a sysctl so that users may tune it to suit
57 * their needs. People that do NFS over a slow network, might for
58 * instance want to reduce it to something closer to 1 for improved
59 * interactive response.
60 */
61#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1)
62
63/*
64 * RPC cruft for NFS
65 */
66static struct rpc_version * nfs_version[] = {
67 NULL,
68 NULL,
69 &nfs_version2,
70#if defined(CONFIG_NFS_V3)
71 &nfs_version3,
72#elif defined(CONFIG_NFS_V4)
73 NULL,
74#endif
75#if defined(CONFIG_NFS_V4)
76 &nfs_version4,
77#endif
78};
79
80static struct rpc_program nfs_program = {
81 .name = "nfs",
82 .number = NFS_PROGRAM,
83 .nrvers = ARRAY_SIZE(nfs_version),
84 .version = nfs_version,
85 .stats = &nfs_rpcstat,
86 .pipe_dir_name = "/nfs",
87};
88
89struct rpc_stat nfs_rpcstat = {
90 .program = &nfs_program
91};
92
93
94#ifdef CONFIG_NFS_V3_ACL
95static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program };
96static struct rpc_version * nfsacl_version[] = {
97 [3] = &nfsacl_version3,
98};
99
100struct rpc_program nfsacl_program = {
101 .name = "nfsacl",
102 .number = NFS_ACL_PROGRAM,
103 .nrvers = ARRAY_SIZE(nfsacl_version),
104 .version = nfsacl_version,
105 .stats = &nfsacl_rpcstat,
106};
107#endif /* CONFIG_NFS_V3_ACL */
108
109static void nfs_umount_begin(struct vfsmount *, int); 60static void nfs_umount_begin(struct vfsmount *, int);
110static int nfs_statfs(struct dentry *, struct kstatfs *); 61static int nfs_statfs(struct dentry *, struct kstatfs *);
111static int nfs_show_options(struct seq_file *, struct vfsmount *); 62static int nfs_show_options(struct seq_file *, struct vfsmount *);
112static int nfs_show_stats(struct seq_file *, struct vfsmount *); 63static int nfs_show_stats(struct seq_file *, struct vfsmount *);
113static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *); 64static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *);
114static int nfs_clone_nfs_sb(struct file_system_type *fs_type, 65static int nfs_xdev_get_sb(struct file_system_type *fs_type,
115 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); 66 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
116static void nfs_kill_super(struct super_block *); 67static void nfs_kill_super(struct super_block *);
117 68
@@ -120,15 +71,15 @@ static struct file_system_type nfs_fs_type = {
120 .name = "nfs", 71 .name = "nfs",
121 .get_sb = nfs_get_sb, 72 .get_sb = nfs_get_sb,
122 .kill_sb = nfs_kill_super, 73 .kill_sb = nfs_kill_super,
123 .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, 74 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
124}; 75};
125 76
126struct file_system_type clone_nfs_fs_type = { 77struct file_system_type nfs_xdev_fs_type = {
127 .owner = THIS_MODULE, 78 .owner = THIS_MODULE,
128 .name = "nfs", 79 .name = "nfs",
129 .get_sb = nfs_clone_nfs_sb, 80 .get_sb = nfs_xdev_get_sb,
130 .kill_sb = nfs_kill_super, 81 .kill_sb = nfs_kill_super,
131 .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, 82 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
132}; 83};
133 84
134static struct super_operations nfs_sops = { 85static struct super_operations nfs_sops = {
@@ -145,10 +96,10 @@ static struct super_operations nfs_sops = {
145#ifdef CONFIG_NFS_V4 96#ifdef CONFIG_NFS_V4
146static int nfs4_get_sb(struct file_system_type *fs_type, 97static int nfs4_get_sb(struct file_system_type *fs_type,
147 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); 98 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
148static int nfs_clone_nfs4_sb(struct file_system_type *fs_type, 99static int nfs4_xdev_get_sb(struct file_system_type *fs_type,
149 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); 100 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
150static int nfs_referral_nfs4_sb(struct file_system_type *fs_type, 101static int nfs4_referral_get_sb(struct file_system_type *fs_type,
151 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); 102 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
152static void nfs4_kill_super(struct super_block *sb); 103static void nfs4_kill_super(struct super_block *sb);
153 104
154static struct file_system_type nfs4_fs_type = { 105static struct file_system_type nfs4_fs_type = {
@@ -156,23 +107,23 @@ static struct file_system_type nfs4_fs_type = {
156 .name = "nfs4", 107 .name = "nfs4",
157 .get_sb = nfs4_get_sb, 108 .get_sb = nfs4_get_sb,
158 .kill_sb = nfs4_kill_super, 109 .kill_sb = nfs4_kill_super,
159 .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, 110 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
160}; 111};
161 112
162struct file_system_type clone_nfs4_fs_type = { 113struct file_system_type nfs4_xdev_fs_type = {
163 .owner = THIS_MODULE, 114 .owner = THIS_MODULE,
164 .name = "nfs4", 115 .name = "nfs4",
165 .get_sb = nfs_clone_nfs4_sb, 116 .get_sb = nfs4_xdev_get_sb,
166 .kill_sb = nfs4_kill_super, 117 .kill_sb = nfs4_kill_super,
167 .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, 118 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
168}; 119};
169 120
170struct file_system_type nfs_referral_nfs4_fs_type = { 121struct file_system_type nfs4_referral_fs_type = {
171 .owner = THIS_MODULE, 122 .owner = THIS_MODULE,
172 .name = "nfs4", 123 .name = "nfs4",
173 .get_sb = nfs_referral_nfs4_sb, 124 .get_sb = nfs4_referral_get_sb,
174 .kill_sb = nfs4_kill_super, 125 .kill_sb = nfs4_kill_super,
175 .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, 126 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
176}; 127};
177 128
178static struct super_operations nfs4_sops = { 129static struct super_operations nfs4_sops = {
@@ -187,39 +138,7 @@ static struct super_operations nfs4_sops = {
187}; 138};
188#endif 139#endif
189 140
190#ifdef CONFIG_NFS_V4 141static struct shrinker *acl_shrinker;
191static const int nfs_set_port_min = 0;
192static const int nfs_set_port_max = 65535;
193
194static int param_set_port(const char *val, struct kernel_param *kp)
195{
196 char *endp;
197 int num = simple_strtol(val, &endp, 0);
198 if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max)
199 return -EINVAL;
200 *((int *)kp->arg) = num;
201 return 0;
202}
203
204module_param_call(callback_tcpport, param_set_port, param_get_int,
205 &nfs_callback_set_tcpport, 0644);
206#endif
207
208#ifdef CONFIG_NFS_V4
209static int param_set_idmap_timeout(const char *val, struct kernel_param *kp)
210{
211 char *endp;
212 int num = simple_strtol(val, &endp, 0);
213 int jif = num * HZ;
214 if (endp == val || *endp || num < 0 || jif < num)
215 return -EINVAL;
216 *((int *)kp->arg) = jif;
217 return 0;
218}
219
220module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int,
221 &nfs_idmap_cache_timeout, 0644);
222#endif
223 142
224/* 143/*
225 * Register the NFS filesystems 144 * Register the NFS filesystems
@@ -240,6 +159,7 @@ int __init register_nfs_fs(void)
240 if (ret < 0) 159 if (ret < 0)
241 goto error_2; 160 goto error_2;
242#endif 161#endif
162 acl_shrinker = set_shrinker(DEFAULT_SEEKS, nfs_access_cache_shrinker);
243 return 0; 163 return 0;
244 164
245#ifdef CONFIG_NFS_V4 165#ifdef CONFIG_NFS_V4
@@ -257,6 +177,8 @@ error_0:
257 */ 177 */
258void __exit unregister_nfs_fs(void) 178void __exit unregister_nfs_fs(void)
259{ 179{
180 if (acl_shrinker != NULL)
181 remove_shrinker(acl_shrinker);
260#ifdef CONFIG_NFS_V4 182#ifdef CONFIG_NFS_V4
261 unregister_filesystem(&nfs4_fs_type); 183 unregister_filesystem(&nfs4_fs_type);
262 nfs_unregister_sysctl(); 184 nfs_unregister_sysctl();
@@ -269,11 +191,10 @@ void __exit unregister_nfs_fs(void)
269 */ 191 */
270static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) 192static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
271{ 193{
272 struct super_block *sb = dentry->d_sb; 194 struct nfs_server *server = NFS_SB(dentry->d_sb);
273 struct nfs_server *server = NFS_SB(sb);
274 unsigned char blockbits; 195 unsigned char blockbits;
275 unsigned long blockres; 196 unsigned long blockres;
276 struct nfs_fh *rootfh = NFS_FH(sb->s_root->d_inode); 197 struct nfs_fh *fh = NFS_FH(dentry->d_inode);
277 struct nfs_fattr fattr; 198 struct nfs_fattr fattr;
278 struct nfs_fsstat res = { 199 struct nfs_fsstat res = {
279 .fattr = &fattr, 200 .fattr = &fattr,
@@ -282,7 +203,7 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
282 203
283 lock_kernel(); 204 lock_kernel();
284 205
285 error = server->rpc_ops->statfs(server, rootfh, &res); 206 error = server->nfs_client->rpc_ops->statfs(server, fh, &res);
286 buf->f_type = NFS_SUPER_MAGIC; 207 buf->f_type = NFS_SUPER_MAGIC;
287 if (error < 0) 208 if (error < 0)
288 goto out_err; 209 goto out_err;
@@ -292,7 +213,7 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
292 * case where f_frsize != f_bsize. Eventually we want to 213 * case where f_frsize != f_bsize. Eventually we want to
293 * report the value of wtmult in this field. 214 * report the value of wtmult in this field.
294 */ 215 */
295 buf->f_frsize = sb->s_blocksize; 216 buf->f_frsize = dentry->d_sb->s_blocksize;
296 217
297 /* 218 /*
298 * On most *nix systems, f_blocks, f_bfree, and f_bavail 219 * On most *nix systems, f_blocks, f_bfree, and f_bavail
@@ -301,8 +222,8 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
301 * thus historically Linux's sys_statfs reports these 222 * thus historically Linux's sys_statfs reports these
302 * fields in units of f_bsize. 223 * fields in units of f_bsize.
303 */ 224 */
304 buf->f_bsize = sb->s_blocksize; 225 buf->f_bsize = dentry->d_sb->s_blocksize;
305 blockbits = sb->s_blocksize_bits; 226 blockbits = dentry->d_sb->s_blocksize_bits;
306 blockres = (1 << blockbits) - 1; 227 blockres = (1 << blockbits) - 1;
307 buf->f_blocks = (res.tbytes + blockres) >> blockbits; 228 buf->f_blocks = (res.tbytes + blockres) >> blockbits;
308 buf->f_bfree = (res.fbytes + blockres) >> blockbits; 229 buf->f_bfree = (res.fbytes + blockres) >> blockbits;
@@ -323,9 +244,12 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
323 244
324} 245}
325 246
247/*
248 * Map the security flavour number to a name
249 */
326static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour) 250static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour)
327{ 251{
328 static struct { 252 static const struct {
329 rpc_authflavor_t flavour; 253 rpc_authflavor_t flavour;
330 const char *str; 254 const char *str;
331 } sec_flavours[] = { 255 } sec_flavours[] = {
@@ -356,10 +280,10 @@ static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour)
356 */ 280 */
357static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, int showdefaults) 281static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, int showdefaults)
358{ 282{
359 static struct proc_nfs_info { 283 static const struct proc_nfs_info {
360 int flag; 284 int flag;
361 char *str; 285 const char *str;
362 char *nostr; 286 const char *nostr;
363 } nfs_info[] = { 287 } nfs_info[] = {
364 { NFS_MOUNT_SOFT, ",soft", ",hard" }, 288 { NFS_MOUNT_SOFT, ",soft", ",hard" },
365 { NFS_MOUNT_INTR, ",intr", "" }, 289 { NFS_MOUNT_INTR, ",intr", "" },
@@ -369,11 +293,12 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
369 { NFS_MOUNT_NOACL, ",noacl", "" }, 293 { NFS_MOUNT_NOACL, ",noacl", "" },
370 { 0, NULL, NULL } 294 { 0, NULL, NULL }
371 }; 295 };
372 struct proc_nfs_info *nfs_infop; 296 const struct proc_nfs_info *nfs_infop;
297 struct nfs_client *clp = nfss->nfs_client;
373 char buf[12]; 298 char buf[12];
374 char *proto; 299 const char *proto;
375 300
376 seq_printf(m, ",vers=%d", nfss->rpc_ops->version); 301 seq_printf(m, ",vers=%d", clp->rpc_ops->version);
377 seq_printf(m, ",rsize=%d", nfss->rsize); 302 seq_printf(m, ",rsize=%d", nfss->rsize);
378 seq_printf(m, ",wsize=%d", nfss->wsize); 303 seq_printf(m, ",wsize=%d", nfss->wsize);
379 if (nfss->acregmin != 3*HZ || showdefaults) 304 if (nfss->acregmin != 3*HZ || showdefaults)
@@ -402,8 +327,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
402 proto = buf; 327 proto = buf;
403 } 328 }
404 seq_printf(m, ",proto=%s", proto); 329 seq_printf(m, ",proto=%s", proto);
405 seq_printf(m, ",timeo=%lu", 10U * nfss->retrans_timeo / HZ); 330 seq_printf(m, ",timeo=%lu", 10U * clp->retrans_timeo / HZ);
406 seq_printf(m, ",retrans=%u", nfss->retrans_count); 331 seq_printf(m, ",retrans=%u", clp->retrans_count);
407 seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor)); 332 seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor));
408} 333}
409 334
@@ -417,7 +342,7 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
417 nfs_show_mount_options(m, nfss, 0); 342 nfs_show_mount_options(m, nfss, 0);
418 343
419 seq_puts(m, ",addr="); 344 seq_puts(m, ",addr=");
420 seq_escape(m, nfss->hostname, " \t\n\\"); 345 seq_escape(m, nfss->nfs_client->cl_hostname, " \t\n\\");
421 346
422 return 0; 347 return 0;
423} 348}
@@ -454,7 +379,7 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
454 seq_printf(m, ",namelen=%d", nfss->namelen); 379 seq_printf(m, ",namelen=%d", nfss->namelen);
455 380
456#ifdef CONFIG_NFS_V4 381#ifdef CONFIG_NFS_V4
457 if (nfss->rpc_ops->version == 4) { 382 if (nfss->nfs_client->cl_nfsversion == 4) {
458 seq_printf(m, "\n\tnfsv4:\t"); 383 seq_printf(m, "\n\tnfsv4:\t");
459 seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); 384 seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
460 seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); 385 seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
@@ -501,782 +426,353 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
501 426
502/* 427/*
503 * Begin unmount by attempting to remove all automounted mountpoints we added 428 * Begin unmount by attempting to remove all automounted mountpoints we added
504 * in response to traversals 429 * in response to xdev traversals and referrals
505 */ 430 */
506static void nfs_umount_begin(struct vfsmount *vfsmnt, int flags) 431static void nfs_umount_begin(struct vfsmount *vfsmnt, int flags)
507{ 432{
508 struct nfs_server *server;
509 struct rpc_clnt *rpc;
510
511 shrink_submounts(vfsmnt, &nfs_automount_list); 433 shrink_submounts(vfsmnt, &nfs_automount_list);
512 if (!(flags & MNT_FORCE))
513 return;
514 /* -EIO all pending I/O */
515 server = NFS_SB(vfsmnt->mnt_sb);
516 rpc = server->client;
517 if (!IS_ERR(rpc))
518 rpc_killall_tasks(rpc);
519 rpc = server->client_acl;
520 if (!IS_ERR(rpc))
521 rpc_killall_tasks(rpc);
522} 434}
523 435
524/* 436/*
525 * Obtain the root inode of the file system. 437 * Validate the NFS2/NFS3 mount data
438 * - fills in the mount root filehandle
526 */ 439 */
527static struct inode * 440static int nfs_validate_mount_data(struct nfs_mount_data *data,
528nfs_get_root(struct super_block *sb, struct nfs_fh *rootfh, struct nfs_fsinfo *fsinfo) 441 struct nfs_fh *mntfh)
529{ 442{
530 struct nfs_server *server = NFS_SB(sb); 443 if (data == NULL) {
531 int error; 444 dprintk("%s: missing data argument\n", __FUNCTION__);
532 445 return -EINVAL;
533 error = server->rpc_ops->getroot(server, rootfh, fsinfo);
534 if (error < 0) {
535 dprintk("nfs_get_root: getattr error = %d\n", -error);
536 return ERR_PTR(error);
537 } 446 }
538 447
539 server->fsid = fsinfo->fattr->fsid; 448 if (data->version <= 0 || data->version > NFS_MOUNT_VERSION) {
540 return nfs_fhget(sb, rootfh, fsinfo->fattr); 449 dprintk("%s: bad mount version\n", __FUNCTION__);
541} 450 return -EINVAL;
542 451 }
543/*
544 * Do NFS version-independent mount processing, and sanity checking
545 */
546static int
547nfs_sb_init(struct super_block *sb, rpc_authflavor_t authflavor)
548{
549 struct nfs_server *server;
550 struct inode *root_inode;
551 struct nfs_fattr fattr;
552 struct nfs_fsinfo fsinfo = {
553 .fattr = &fattr,
554 };
555 struct nfs_pathconf pathinfo = {
556 .fattr = &fattr,
557 };
558 int no_root_error = 0;
559 unsigned long max_rpc_payload;
560
561 /* We probably want something more informative here */
562 snprintf(sb->s_id, sizeof(sb->s_id), "%x:%x", MAJOR(sb->s_dev), MINOR(sb->s_dev));
563
564 server = NFS_SB(sb);
565 452
566 sb->s_magic = NFS_SUPER_MAGIC; 453 switch (data->version) {
454 case 1:
455 data->namlen = 0;
456 case 2:
457 data->bsize = 0;
458 case 3:
459 if (data->flags & NFS_MOUNT_VER3) {
460 dprintk("%s: mount structure version %d does not support NFSv3\n",
461 __FUNCTION__,
462 data->version);
463 return -EINVAL;
464 }
465 data->root.size = NFS2_FHSIZE;
466 memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE);
467 case 4:
468 if (data->flags & NFS_MOUNT_SECFLAVOUR) {
469 dprintk("%s: mount structure version %d does not support strong security\n",
470 __FUNCTION__,
471 data->version);
472 return -EINVAL;
473 }
474 case 5:
475 memset(data->context, 0, sizeof(data->context));
476 }
567 477
568 server->io_stats = nfs_alloc_iostats(); 478 /* Set the pseudoflavor */
569 if (server->io_stats == NULL) 479 if (!(data->flags & NFS_MOUNT_SECFLAVOUR))
570 return -ENOMEM; 480 data->pseudoflavor = RPC_AUTH_UNIX;
571 481
572 root_inode = nfs_get_root(sb, &server->fh, &fsinfo); 482#ifndef CONFIG_NFS_V3
573 /* Did getting the root inode fail? */ 483 /* If NFSv3 is not compiled in, return -EPROTONOSUPPORT */
574 if (IS_ERR(root_inode)) { 484 if (data->flags & NFS_MOUNT_VER3) {
575 no_root_error = PTR_ERR(root_inode); 485 dprintk("%s: NFSv3 not compiled into kernel\n", __FUNCTION__);
576 goto out_no_root; 486 return -EPROTONOSUPPORT;
577 }
578 sb->s_root = d_alloc_root(root_inode);
579 if (!sb->s_root) {
580 no_root_error = -ENOMEM;
581 goto out_no_root;
582 } 487 }
583 sb->s_root->d_op = server->rpc_ops->dentry_ops; 488#endif /* CONFIG_NFS_V3 */
584
585 /* mount time stamp, in seconds */
586 server->mount_time = jiffies;
587
588 /* Get some general file system info */
589 if (server->namelen == 0 &&
590 server->rpc_ops->pathconf(server, &server->fh, &pathinfo) >= 0)
591 server->namelen = pathinfo.max_namelen;
592 /* Work out a lot of parameters */
593 if (server->rsize == 0)
594 server->rsize = nfs_block_size(fsinfo.rtpref, NULL);
595 if (server->wsize == 0)
596 server->wsize = nfs_block_size(fsinfo.wtpref, NULL);
597
598 if (fsinfo.rtmax >= 512 && server->rsize > fsinfo.rtmax)
599 server->rsize = nfs_block_size(fsinfo.rtmax, NULL);
600 if (fsinfo.wtmax >= 512 && server->wsize > fsinfo.wtmax)
601 server->wsize = nfs_block_size(fsinfo.wtmax, NULL);
602
603 max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL);
604 if (server->rsize > max_rpc_payload)
605 server->rsize = max_rpc_payload;
606 if (server->rsize > NFS_MAX_FILE_IO_SIZE)
607 server->rsize = NFS_MAX_FILE_IO_SIZE;
608 server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
609
610 if (server->wsize > max_rpc_payload)
611 server->wsize = max_rpc_payload;
612 if (server->wsize > NFS_MAX_FILE_IO_SIZE)
613 server->wsize = NFS_MAX_FILE_IO_SIZE;
614 server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
615 489
616 if (sb->s_blocksize == 0) 490 /* We now require that the mount process passes the remote address */
617 sb->s_blocksize = nfs_block_bits(server->wsize, 491 if (data->addr.sin_addr.s_addr == INADDR_ANY) {
618 &sb->s_blocksize_bits); 492 dprintk("%s: mount program didn't pass remote address!\n",
619 server->wtmult = nfs_block_bits(fsinfo.wtmult, NULL); 493 __FUNCTION__);
620 494 return -EINVAL;
621 server->dtsize = nfs_block_size(fsinfo.dtpref, NULL);
622 if (server->dtsize > PAGE_CACHE_SIZE)
623 server->dtsize = PAGE_CACHE_SIZE;
624 if (server->dtsize > server->rsize)
625 server->dtsize = server->rsize;
626
627 if (server->flags & NFS_MOUNT_NOAC) {
628 server->acregmin = server->acregmax = 0;
629 server->acdirmin = server->acdirmax = 0;
630 sb->s_flags |= MS_SYNCHRONOUS;
631 } 495 }
632 server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
633 496
634 nfs_super_set_maxbytes(sb, fsinfo.maxfilesize); 497 /* Prepare the root filehandle */
498 if (data->flags & NFS_MOUNT_VER3)
499 mntfh->size = data->root.size;
500 else
501 mntfh->size = NFS2_FHSIZE;
502
503 if (mntfh->size > sizeof(mntfh->data)) {
504 dprintk("%s: invalid root filehandle\n", __FUNCTION__);
505 return -EINVAL;
506 }
635 507
636 server->client->cl_intr = (server->flags & NFS_MOUNT_INTR) ? 1 : 0; 508 memcpy(mntfh->data, data->root.data, mntfh->size);
637 server->client->cl_softrtry = (server->flags & NFS_MOUNT_SOFT) ? 1 : 0; 509 if (mntfh->size < sizeof(mntfh->data))
510 memset(mntfh->data + mntfh->size, 0,
511 sizeof(mntfh->data) - mntfh->size);
638 512
639 /* We're airborne Set socket buffersize */
640 rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
641 return 0; 513 return 0;
642 /* Yargs. It didn't work out. */
643out_no_root:
644 dprintk("nfs_sb_init: get root inode failed: errno %d\n", -no_root_error);
645 if (!IS_ERR(root_inode))
646 iput(root_inode);
647 return no_root_error;
648} 514}
649 515
650/* 516/*
651 * Initialise the timeout values for a connection 517 * Initialise the common bits of the superblock
652 */ 518 */
653static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, unsigned int timeo, unsigned int retrans) 519static inline void nfs_initialise_sb(struct super_block *sb)
654{ 520{
655 to->to_initval = timeo * HZ / 10; 521 struct nfs_server *server = NFS_SB(sb);
656 to->to_retries = retrans;
657 if (!to->to_retries)
658 to->to_retries = 2;
659
660 switch (proto) {
661 case IPPROTO_TCP:
662 if (!to->to_initval)
663 to->to_initval = 60 * HZ;
664 if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
665 to->to_initval = NFS_MAX_TCP_TIMEOUT;
666 to->to_increment = to->to_initval;
667 to->to_maxval = to->to_initval + (to->to_increment * to->to_retries);
668 to->to_exponential = 0;
669 break;
670 case IPPROTO_UDP:
671 default:
672 if (!to->to_initval)
673 to->to_initval = 11 * HZ / 10;
674 if (to->to_initval > NFS_MAX_UDP_TIMEOUT)
675 to->to_initval = NFS_MAX_UDP_TIMEOUT;
676 to->to_maxval = NFS_MAX_UDP_TIMEOUT;
677 to->to_exponential = 1;
678 break;
679 }
680}
681 522
682/* 523 sb->s_magic = NFS_SUPER_MAGIC;
683 * Create an RPC client handle.
684 */
685static struct rpc_clnt *
686nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data)
687{
688 struct rpc_timeout timeparms;
689 struct rpc_xprt *xprt = NULL;
690 struct rpc_clnt *clnt = NULL;
691 int proto = (data->flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
692
693 nfs_init_timeout_values(&timeparms, proto, data->timeo, data->retrans);
694
695 server->retrans_timeo = timeparms.to_initval;
696 server->retrans_count = timeparms.to_retries;
697
698 /* create transport and client */
699 xprt = xprt_create_proto(proto, &server->addr, &timeparms);
700 if (IS_ERR(xprt)) {
701 dprintk("%s: cannot create RPC transport. Error = %ld\n",
702 __FUNCTION__, PTR_ERR(xprt));
703 return (struct rpc_clnt *)xprt;
704 }
705 clnt = rpc_create_client(xprt, server->hostname, &nfs_program,
706 server->rpc_ops->version, data->pseudoflavor);
707 if (IS_ERR(clnt)) {
708 dprintk("%s: cannot create RPC client. Error = %ld\n",
709 __FUNCTION__, PTR_ERR(xprt));
710 goto out_fail;
711 }
712 524
713 clnt->cl_intr = 1; 525 /* We probably want something more informative here */
714 clnt->cl_softrtry = 1; 526 snprintf(sb->s_id, sizeof(sb->s_id),
527 "%x:%x", MAJOR(sb->s_dev), MINOR(sb->s_dev));
528
529 if (sb->s_blocksize == 0)
530 sb->s_blocksize = nfs_block_bits(server->wsize,
531 &sb->s_blocksize_bits);
715 532
716 return clnt; 533 if (server->flags & NFS_MOUNT_NOAC)
534 sb->s_flags |= MS_SYNCHRONOUS;
717 535
718out_fail: 536 nfs_super_set_maxbytes(sb, server->maxfilesize);
719 return clnt;
720} 537}
721 538
722/* 539/*
723 * Clone a server record 540 * Finish setting up an NFS2/3 superblock
724 */ 541 */
725static struct nfs_server *nfs_clone_server(struct super_block *sb, struct nfs_clone_mount *data) 542static void nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data)
726{ 543{
727 struct nfs_server *server = NFS_SB(sb); 544 struct nfs_server *server = NFS_SB(sb);
728 struct nfs_server *parent = NFS_SB(data->sb);
729 struct inode *root_inode;
730 struct nfs_fsinfo fsinfo;
731 void *err = ERR_PTR(-ENOMEM);
732
733 sb->s_op = data->sb->s_op;
734 sb->s_blocksize = data->sb->s_blocksize;
735 sb->s_blocksize_bits = data->sb->s_blocksize_bits;
736 sb->s_maxbytes = data->sb->s_maxbytes;
737
738 server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
739 server->io_stats = nfs_alloc_iostats();
740 if (server->io_stats == NULL)
741 goto out;
742
743 server->client = rpc_clone_client(parent->client);
744 if (IS_ERR((err = server->client)))
745 goto out;
746
747 if (!IS_ERR(parent->client_sys)) {
748 server->client_sys = rpc_clone_client(parent->client_sys);
749 if (IS_ERR((err = server->client_sys)))
750 goto out;
751 }
752 if (!IS_ERR(parent->client_acl)) {
753 server->client_acl = rpc_clone_client(parent->client_acl);
754 if (IS_ERR((err = server->client_acl)))
755 goto out;
756 }
757 root_inode = nfs_fhget(sb, data->fh, data->fattr);
758 if (!root_inode)
759 goto out;
760 sb->s_root = d_alloc_root(root_inode);
761 if (!sb->s_root)
762 goto out_put_root;
763 fsinfo.fattr = data->fattr;
764 if (NFS_PROTO(root_inode)->fsinfo(server, data->fh, &fsinfo) == 0)
765 nfs_super_set_maxbytes(sb, fsinfo.maxfilesize);
766 sb->s_root->d_op = server->rpc_ops->dentry_ops;
767 sb->s_flags |= MS_ACTIVE;
768 return server;
769out_put_root:
770 iput(root_inode);
771out:
772 return err;
773}
774 545
775/* 546 sb->s_blocksize_bits = 0;
776 * Copy an existing superblock and attach revised data 547 sb->s_blocksize = 0;
777 */ 548 if (data->bsize)
778static int nfs_clone_generic_sb(struct nfs_clone_mount *data, 549 sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits);
779 struct super_block *(*fill_sb)(struct nfs_server *, struct nfs_clone_mount *),
780 struct nfs_server *(*fill_server)(struct super_block *, struct nfs_clone_mount *),
781 struct vfsmount *mnt)
782{
783 struct nfs_server *server;
784 struct nfs_server *parent = NFS_SB(data->sb);
785 struct super_block *sb = ERR_PTR(-EINVAL);
786 char *hostname;
787 int error = -ENOMEM;
788 int len;
789
790 server = kmalloc(sizeof(struct nfs_server), GFP_KERNEL);
791 if (server == NULL)
792 goto out_err;
793 memcpy(server, parent, sizeof(*server));
794 hostname = (data->hostname != NULL) ? data->hostname : parent->hostname;
795 len = strlen(hostname) + 1;
796 server->hostname = kmalloc(len, GFP_KERNEL);
797 if (server->hostname == NULL)
798 goto free_server;
799 memcpy(server->hostname, hostname, len);
800 error = rpciod_up();
801 if (error != 0)
802 goto free_hostname;
803
804 sb = fill_sb(server, data);
805 if (IS_ERR(sb)) {
806 error = PTR_ERR(sb);
807 goto kill_rpciod;
808 }
809
810 if (sb->s_root)
811 goto out_rpciod_down;
812 550
813 server = fill_server(sb, data); 551 if (server->flags & NFS_MOUNT_VER3) {
814 if (IS_ERR(server)) { 552 /* The VFS shouldn't apply the umask to mode bits. We will do
815 error = PTR_ERR(server); 553 * so ourselves when necessary.
816 goto out_deactivate; 554 */
555 sb->s_flags |= MS_POSIXACL;
556 sb->s_time_gran = 1;
817 } 557 }
818 return simple_set_mnt(mnt, sb); 558
819out_deactivate: 559 sb->s_op = &nfs_sops;
820 up_write(&sb->s_umount); 560 nfs_initialise_sb(sb);
821 deactivate_super(sb);
822 return error;
823out_rpciod_down:
824 rpciod_down();
825 kfree(server->hostname);
826 kfree(server);
827 return simple_set_mnt(mnt, sb);
828kill_rpciod:
829 rpciod_down();
830free_hostname:
831 kfree(server->hostname);
832free_server:
833 kfree(server);
834out_err:
835 return error;
836} 561}
837 562
838/* 563/*
839 * Set up an NFS2/3 superblock 564 * Finish setting up a cloned NFS2/3 superblock
840 *
841 * The way this works is that the mount process passes a structure
842 * in the data argument which contains the server's IP address
843 * and the root file handle obtained from the server's mount
844 * daemon. We stash these away in the private superblock fields.
845 */ 565 */
846static int 566static void nfs_clone_super(struct super_block *sb,
847nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data, int silent) 567 const struct super_block *old_sb)
848{ 568{
849 struct nfs_server *server; 569 struct nfs_server *server = NFS_SB(sb);
850 rpc_authflavor_t authflavor;
851 570
852 server = NFS_SB(sb); 571 sb->s_blocksize_bits = old_sb->s_blocksize_bits;
853 sb->s_blocksize_bits = 0; 572 sb->s_blocksize = old_sb->s_blocksize;
854 sb->s_blocksize = 0; 573 sb->s_maxbytes = old_sb->s_maxbytes;
855 if (data->bsize)
856 sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits);
857 if (data->rsize)
858 server->rsize = nfs_block_size(data->rsize, NULL);
859 if (data->wsize)
860 server->wsize = nfs_block_size(data->wsize, NULL);
861 server->flags = data->flags & NFS_MOUNT_FLAGMASK;
862
863 server->acregmin = data->acregmin*HZ;
864 server->acregmax = data->acregmax*HZ;
865 server->acdirmin = data->acdirmin*HZ;
866 server->acdirmax = data->acdirmax*HZ;
867
868 /* Start lockd here, before we might error out */
869 if (!(server->flags & NFS_MOUNT_NONLM))
870 lockd_up();
871
872 server->namelen = data->namlen;
873 server->hostname = kmalloc(strlen(data->hostname) + 1, GFP_KERNEL);
874 if (!server->hostname)
875 return -ENOMEM;
876 strcpy(server->hostname, data->hostname);
877
878 /* Check NFS protocol revision and initialize RPC op vector
879 * and file handle pool. */
880#ifdef CONFIG_NFS_V3
881 if (server->flags & NFS_MOUNT_VER3) {
882 server->rpc_ops = &nfs_v3_clientops;
883 server->caps |= NFS_CAP_READDIRPLUS;
884 } else {
885 server->rpc_ops = &nfs_v2_clientops;
886 }
887#else
888 server->rpc_ops = &nfs_v2_clientops;
889#endif
890 574
891 /* Fill in pseudoflavor for mount version < 5 */
892 if (!(data->flags & NFS_MOUNT_SECFLAVOUR))
893 data->pseudoflavor = RPC_AUTH_UNIX;
894 authflavor = data->pseudoflavor; /* save for sb_init() */
895 /* XXX maybe we want to add a server->pseudoflavor field */
896
897 /* Create RPC client handles */
898 server->client = nfs_create_client(server, data);
899 if (IS_ERR(server->client))
900 return PTR_ERR(server->client);
901 /* RFC 2623, sec 2.3.2 */
902 if (authflavor != RPC_AUTH_UNIX) {
903 struct rpc_auth *auth;
904
905 server->client_sys = rpc_clone_client(server->client);
906 if (IS_ERR(server->client_sys))
907 return PTR_ERR(server->client_sys);
908 auth = rpcauth_create(RPC_AUTH_UNIX, server->client_sys);
909 if (IS_ERR(auth))
910 return PTR_ERR(auth);
911 } else {
912 atomic_inc(&server->client->cl_count);
913 server->client_sys = server->client;
914 }
915 if (server->flags & NFS_MOUNT_VER3) { 575 if (server->flags & NFS_MOUNT_VER3) {
916#ifdef CONFIG_NFS_V3_ACL 576 /* The VFS shouldn't apply the umask to mode bits. We will do
917 if (!(server->flags & NFS_MOUNT_NOACL)) { 577 * so ourselves when necessary.
918 server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3);
919 /* No errors! Assume that Sun nfsacls are supported */
920 if (!IS_ERR(server->client_acl))
921 server->caps |= NFS_CAP_ACLS;
922 }
923#else
924 server->flags &= ~NFS_MOUNT_NOACL;
925#endif /* CONFIG_NFS_V3_ACL */
926 /*
927 * The VFS shouldn't apply the umask to mode bits. We will
928 * do so ourselves when necessary.
929 */ 578 */
930 sb->s_flags |= MS_POSIXACL; 579 sb->s_flags |= MS_POSIXACL;
931 if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN)
932 server->namelen = NFS3_MAXNAMLEN;
933 sb->s_time_gran = 1; 580 sb->s_time_gran = 1;
934 } else {
935 if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN)
936 server->namelen = NFS2_MAXNAMLEN;
937 } 581 }
938 582
939 sb->s_op = &nfs_sops; 583 sb->s_op = old_sb->s_op;
940 return nfs_sb_init(sb, authflavor); 584 nfs_initialise_sb(sb);
941} 585}
942 586
943static int nfs_set_super(struct super_block *s, void *data) 587static int nfs_set_super(struct super_block *s, void *_server)
944{ 588{
945 s->s_fs_info = data; 589 struct nfs_server *server = _server;
946 return set_anon_super(s, data); 590 int ret;
591
592 s->s_fs_info = server;
593 ret = set_anon_super(s, server);
594 if (ret == 0)
595 server->s_dev = s->s_dev;
596 return ret;
947} 597}
948 598
949static int nfs_compare_super(struct super_block *sb, void *data) 599static int nfs_compare_super(struct super_block *sb, void *data)
950{ 600{
951 struct nfs_server *server = data; 601 struct nfs_server *server = data, *old = NFS_SB(sb);
952 struct nfs_server *old = NFS_SB(sb);
953 602
954 if (old->addr.sin_addr.s_addr != server->addr.sin_addr.s_addr) 603 if (old->nfs_client != server->nfs_client)
955 return 0; 604 return 0;
956 if (old->addr.sin_port != server->addr.sin_port) 605 if (memcmp(&old->fsid, &server->fsid, sizeof(old->fsid)) != 0)
957 return 0; 606 return 0;
958 return !nfs_compare_fh(&old->fh, &server->fh); 607 return 1;
959} 608}
960 609
961static int nfs_get_sb(struct file_system_type *fs_type, 610static int nfs_get_sb(struct file_system_type *fs_type,
962 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) 611 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
963{ 612{
964 int error;
965 struct nfs_server *server = NULL; 613 struct nfs_server *server = NULL;
966 struct super_block *s; 614 struct super_block *s;
967 struct nfs_fh *root; 615 struct nfs_fh mntfh;
968 struct nfs_mount_data *data = raw_data; 616 struct nfs_mount_data *data = raw_data;
617 struct dentry *mntroot;
618 int error;
969 619
970 error = -EINVAL; 620 /* Validate the mount data */
971 if (data == NULL) { 621 error = nfs_validate_mount_data(data, &mntfh);
972 dprintk("%s: missing data argument\n", __FUNCTION__); 622 if (error < 0)
973 goto out_err_noserver; 623 return error;
974 }
975 if (data->version <= 0 || data->version > NFS_MOUNT_VERSION) {
976 dprintk("%s: bad mount version\n", __FUNCTION__);
977 goto out_err_noserver;
978 }
979 switch (data->version) {
980 case 1:
981 data->namlen = 0;
982 case 2:
983 data->bsize = 0;
984 case 3:
985 if (data->flags & NFS_MOUNT_VER3) {
986 dprintk("%s: mount structure version %d does not support NFSv3\n",
987 __FUNCTION__,
988 data->version);
989 goto out_err_noserver;
990 }
991 data->root.size = NFS2_FHSIZE;
992 memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE);
993 case 4:
994 if (data->flags & NFS_MOUNT_SECFLAVOUR) {
995 dprintk("%s: mount structure version %d does not support strong security\n",
996 __FUNCTION__,
997 data->version);
998 goto out_err_noserver;
999 }
1000 case 5:
1001 memset(data->context, 0, sizeof(data->context));
1002 }
1003#ifndef CONFIG_NFS_V3
1004 /* If NFSv3 is not compiled in, return -EPROTONOSUPPORT */
1005 error = -EPROTONOSUPPORT;
1006 if (data->flags & NFS_MOUNT_VER3) {
1007 dprintk("%s: NFSv3 not compiled into kernel\n", __FUNCTION__);
1008 goto out_err_noserver;
1009 }
1010#endif /* CONFIG_NFS_V3 */
1011 624
1012 error = -ENOMEM; 625 /* Get a volume representation */
1013 server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); 626 server = nfs_create_server(data, &mntfh);
1014 if (!server) 627 if (IS_ERR(server)) {
628 error = PTR_ERR(server);
1015 goto out_err_noserver; 629 goto out_err_noserver;
1016 /* Zero out the NFS state stuff */
1017 init_nfsv4_state(server);
1018 server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
1019
1020 root = &server->fh;
1021 if (data->flags & NFS_MOUNT_VER3)
1022 root->size = data->root.size;
1023 else
1024 root->size = NFS2_FHSIZE;
1025 error = -EINVAL;
1026 if (root->size > sizeof(root->data)) {
1027 dprintk("%s: invalid root filehandle\n", __FUNCTION__);
1028 goto out_err;
1029 }
1030 memcpy(root->data, data->root.data, root->size);
1031
1032 /* We now require that the mount process passes the remote address */
1033 memcpy(&server->addr, &data->addr, sizeof(server->addr));
1034 if (server->addr.sin_addr.s_addr == INADDR_ANY) {
1035 dprintk("%s: mount program didn't pass remote address!\n",
1036 __FUNCTION__);
1037 goto out_err;
1038 }
1039
1040 /* Fire up rpciod if not yet running */
1041 error = rpciod_up();
1042 if (error < 0) {
1043 dprintk("%s: couldn't start rpciod! Error = %d\n",
1044 __FUNCTION__, error);
1045 goto out_err;
1046 } 630 }
1047 631
632 /* Get a superblock - note that we may end up sharing one that already exists */
1048 s = sget(fs_type, nfs_compare_super, nfs_set_super, server); 633 s = sget(fs_type, nfs_compare_super, nfs_set_super, server);
1049 if (IS_ERR(s)) { 634 if (IS_ERR(s)) {
1050 error = PTR_ERR(s); 635 error = PTR_ERR(s);
1051 goto out_err_rpciod; 636 goto out_err_nosb;
1052 } 637 }
1053 638
1054 if (s->s_root) 639 if (s->s_fs_info != server) {
1055 goto out_rpciod_down; 640 nfs_free_server(server);
641 server = NULL;
642 }
1056 643
1057 s->s_flags = flags; 644 if (!s->s_root) {
645 /* initial superblock/root creation */
646 s->s_flags = flags;
647 nfs_fill_super(s, data);
648 }
1058 649
1059 error = nfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0); 650 mntroot = nfs_get_root(s, &mntfh);
1060 if (error) { 651 if (IS_ERR(mntroot)) {
1061 up_write(&s->s_umount); 652 error = PTR_ERR(mntroot);
1062 deactivate_super(s); 653 goto error_splat_super;
1063 return error;
1064 } 654 }
1065 s->s_flags |= MS_ACTIVE;
1066 return simple_set_mnt(mnt, s);
1067 655
1068out_rpciod_down: 656 s->s_flags |= MS_ACTIVE;
1069 rpciod_down(); 657 mnt->mnt_sb = s;
1070 kfree(server); 658 mnt->mnt_root = mntroot;
1071 return simple_set_mnt(mnt, s); 659 return 0;
1072 660
1073out_err_rpciod: 661out_err_nosb:
1074 rpciod_down(); 662 nfs_free_server(server);
1075out_err:
1076 kfree(server);
1077out_err_noserver: 663out_err_noserver:
1078 return error; 664 return error;
665
666error_splat_super:
667 up_write(&s->s_umount);
668 deactivate_super(s);
669 return error;
1079} 670}
1080 671
672/*
673 * Destroy an NFS2/3 superblock
674 */
1081static void nfs_kill_super(struct super_block *s) 675static void nfs_kill_super(struct super_block *s)
1082{ 676{
1083 struct nfs_server *server = NFS_SB(s); 677 struct nfs_server *server = NFS_SB(s);
1084 678
1085 kill_anon_super(s); 679 kill_anon_super(s);
1086 680 nfs_free_server(server);
1087 if (!IS_ERR(server->client))
1088 rpc_shutdown_client(server->client);
1089 if (!IS_ERR(server->client_sys))
1090 rpc_shutdown_client(server->client_sys);
1091 if (!IS_ERR(server->client_acl))
1092 rpc_shutdown_client(server->client_acl);
1093
1094 if (!(server->flags & NFS_MOUNT_NONLM))
1095 lockd_down(); /* release rpc.lockd */
1096
1097 rpciod_down(); /* release rpciod */
1098
1099 nfs_free_iostats(server->io_stats);
1100 kfree(server->hostname);
1101 kfree(server);
1102 nfs_release_automount_timer();
1103}
1104
1105static struct super_block *nfs_clone_sb(struct nfs_server *server, struct nfs_clone_mount *data)
1106{
1107 struct super_block *sb;
1108
1109 server->fsid = data->fattr->fsid;
1110 nfs_copy_fh(&server->fh, data->fh);
1111 sb = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server);
1112 if (!IS_ERR(sb) && sb->s_root == NULL && !(server->flags & NFS_MOUNT_NONLM))
1113 lockd_up();
1114 return sb;
1115} 681}
1116 682
1117static int nfs_clone_nfs_sb(struct file_system_type *fs_type, 683/*
1118 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) 684 * Clone an NFS2/3 server record on xdev traversal (FSID-change)
685 */
686static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
687 const char *dev_name, void *raw_data,
688 struct vfsmount *mnt)
1119{ 689{
1120 struct nfs_clone_mount *data = raw_data; 690 struct nfs_clone_mount *data = raw_data;
1121 return nfs_clone_generic_sb(data, nfs_clone_sb, nfs_clone_server, mnt); 691 struct super_block *s;
1122} 692 struct nfs_server *server;
693 struct dentry *mntroot;
694 int error;
1123 695
1124#ifdef CONFIG_NFS_V4 696 dprintk("--> nfs_xdev_get_sb()\n");
1125static struct rpc_clnt *nfs4_create_client(struct nfs_server *server,
1126 struct rpc_timeout *timeparms, int proto, rpc_authflavor_t flavor)
1127{
1128 struct nfs4_client *clp;
1129 struct rpc_xprt *xprt = NULL;
1130 struct rpc_clnt *clnt = NULL;
1131 int err = -EIO;
1132
1133 clp = nfs4_get_client(&server->addr.sin_addr);
1134 if (!clp) {
1135 dprintk("%s: failed to create NFS4 client.\n", __FUNCTION__);
1136 return ERR_PTR(err);
1137 }
1138 697
1139 /* Now create transport and client */ 698 /* create a new volume representation */
1140 down_write(&clp->cl_sem); 699 server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr);
1141 if (IS_ERR(clp->cl_rpcclient)) { 700 if (IS_ERR(server)) {
1142 xprt = xprt_create_proto(proto, &server->addr, timeparms); 701 error = PTR_ERR(server);
1143 if (IS_ERR(xprt)) { 702 goto out_err_noserver;
1144 up_write(&clp->cl_sem);
1145 err = PTR_ERR(xprt);
1146 dprintk("%s: cannot create RPC transport. Error = %d\n",
1147 __FUNCTION__, err);
1148 goto out_fail;
1149 }
1150 /* Bind to a reserved port! */
1151 xprt->resvport = 1;
1152 clnt = rpc_create_client(xprt, server->hostname, &nfs_program,
1153 server->rpc_ops->version, flavor);
1154 if (IS_ERR(clnt)) {
1155 up_write(&clp->cl_sem);
1156 err = PTR_ERR(clnt);
1157 dprintk("%s: cannot create RPC client. Error = %d\n",
1158 __FUNCTION__, err);
1159 goto out_fail;
1160 }
1161 clnt->cl_intr = 1;
1162 clnt->cl_softrtry = 1;
1163 clp->cl_rpcclient = clnt;
1164 memcpy(clp->cl_ipaddr, server->ip_addr, sizeof(clp->cl_ipaddr));
1165 nfs_idmap_new(clp);
1166 }
1167 list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
1168 clnt = rpc_clone_client(clp->cl_rpcclient);
1169 if (!IS_ERR(clnt))
1170 server->nfs4_state = clp;
1171 up_write(&clp->cl_sem);
1172 clp = NULL;
1173
1174 if (IS_ERR(clnt)) {
1175 dprintk("%s: cannot create RPC client. Error = %d\n",
1176 __FUNCTION__, err);
1177 return clnt;
1178 } 703 }
1179 704
1180 if (server->nfs4_state->cl_idmap == NULL) { 705 /* Get a superblock - note that we may end up sharing one that already exists */
1181 dprintk("%s: failed to create idmapper.\n", __FUNCTION__); 706 s = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server);
1182 return ERR_PTR(-ENOMEM); 707 if (IS_ERR(s)) {
708 error = PTR_ERR(s);
709 goto out_err_nosb;
1183 } 710 }
1184 711
1185 if (clnt->cl_auth->au_flavor != flavor) { 712 if (s->s_fs_info != server) {
1186 struct rpc_auth *auth; 713 nfs_free_server(server);
1187 714 server = NULL;
1188 auth = rpcauth_create(flavor, clnt);
1189 if (IS_ERR(auth)) {
1190 dprintk("%s: couldn't create credcache!\n", __FUNCTION__);
1191 return (struct rpc_clnt *)auth;
1192 }
1193 } 715 }
1194 return clnt;
1195
1196 out_fail:
1197 if (clp)
1198 nfs4_put_client(clp);
1199 return ERR_PTR(err);
1200}
1201
1202/*
1203 * Set up an NFS4 superblock
1204 */
1205static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data, int silent)
1206{
1207 struct nfs_server *server;
1208 struct rpc_timeout timeparms;
1209 rpc_authflavor_t authflavour;
1210 int err = -EIO;
1211 716
1212 sb->s_blocksize_bits = 0; 717 if (!s->s_root) {
1213 sb->s_blocksize = 0; 718 /* initial superblock/root creation */
1214 server = NFS_SB(sb); 719 s->s_flags = flags;
1215 if (data->rsize != 0) 720 nfs_clone_super(s, data->sb);
1216 server->rsize = nfs_block_size(data->rsize, NULL); 721 }
1217 if (data->wsize != 0)
1218 server->wsize = nfs_block_size(data->wsize, NULL);
1219 server->flags = data->flags & NFS_MOUNT_FLAGMASK;
1220 server->caps = NFS_CAP_ATOMIC_OPEN;
1221 722
1222 server->acregmin = data->acregmin*HZ; 723 mntroot = nfs_get_root(s, data->fh);
1223 server->acregmax = data->acregmax*HZ; 724 if (IS_ERR(mntroot)) {
1224 server->acdirmin = data->acdirmin*HZ; 725 error = PTR_ERR(mntroot);
1225 server->acdirmax = data->acdirmax*HZ; 726 goto error_splat_super;
727 }
1226 728
1227 server->rpc_ops = &nfs_v4_clientops; 729 s->s_flags |= MS_ACTIVE;
730 mnt->mnt_sb = s;
731 mnt->mnt_root = mntroot;
1228 732
1229 nfs_init_timeout_values(&timeparms, data->proto, data->timeo, data->retrans); 733 dprintk("<-- nfs_xdev_get_sb() = 0\n");
734 return 0;
1230 735
1231 server->retrans_timeo = timeparms.to_initval; 736out_err_nosb:
1232 server->retrans_count = timeparms.to_retries; 737 nfs_free_server(server);
738out_err_noserver:
739 dprintk("<-- nfs_xdev_get_sb() = %d [error]\n", error);
740 return error;
1233 741
1234 /* Now create transport and client */ 742error_splat_super:
1235 authflavour = RPC_AUTH_UNIX; 743 up_write(&s->s_umount);
1236 if (data->auth_flavourlen != 0) { 744 deactivate_super(s);
1237 if (data->auth_flavourlen != 1) { 745 dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error);
1238 dprintk("%s: Invalid number of RPC auth flavours %d.\n", 746 return error;
1239 __FUNCTION__, data->auth_flavourlen); 747}
1240 err = -EINVAL;
1241 goto out_fail;
1242 }
1243 if (copy_from_user(&authflavour, data->auth_flavours, sizeof(authflavour))) {
1244 err = -EFAULT;
1245 goto out_fail;
1246 }
1247 }
1248 748
1249 server->client = nfs4_create_client(server, &timeparms, data->proto, authflavour); 749#ifdef CONFIG_NFS_V4
1250 if (IS_ERR(server->client)) {
1251 err = PTR_ERR(server->client);
1252 dprintk("%s: cannot create RPC client. Error = %d\n",
1253 __FUNCTION__, err);
1254 goto out_fail;
1255 }
1256 750
751/*
752 * Finish setting up a cloned NFS4 superblock
753 */
754static void nfs4_clone_super(struct super_block *sb,
755 const struct super_block *old_sb)
756{
757 sb->s_blocksize_bits = old_sb->s_blocksize_bits;
758 sb->s_blocksize = old_sb->s_blocksize;
759 sb->s_maxbytes = old_sb->s_maxbytes;
1257 sb->s_time_gran = 1; 760 sb->s_time_gran = 1;
1258 761 sb->s_op = old_sb->s_op;
1259 sb->s_op = &nfs4_sops; 762 nfs_initialise_sb(sb);
1260 err = nfs_sb_init(sb, authflavour);
1261
1262 out_fail:
1263 return err;
1264} 763}
1265 764
1266static int nfs4_compare_super(struct super_block *sb, void *data) 765/*
766 * Set up an NFS4 superblock
767 */
768static void nfs4_fill_super(struct super_block *sb)
1267{ 769{
1268 struct nfs_server *server = data; 770 sb->s_time_gran = 1;
1269 struct nfs_server *old = NFS_SB(sb); 771 sb->s_op = &nfs4_sops;
1270 772 nfs_initialise_sb(sb);
1271 if (strcmp(server->hostname, old->hostname) != 0)
1272 return 0;
1273 if (strcmp(server->mnt_path, old->mnt_path) != 0)
1274 return 0;
1275 return 1;
1276} 773}
1277 774
1278static void * 775static void *nfs_copy_user_string(char *dst, struct nfs_string *src, int maxlen)
1279nfs_copy_user_string(char *dst, struct nfs_string *src, int maxlen)
1280{ 776{
1281 void *p = NULL; 777 void *p = NULL;
1282 778
@@ -1297,14 +793,22 @@ nfs_copy_user_string(char *dst, struct nfs_string *src, int maxlen)
1297 return dst; 793 return dst;
1298} 794}
1299 795
796/*
797 * Get the superblock for an NFS4 mountpoint
798 */
1300static int nfs4_get_sb(struct file_system_type *fs_type, 799static int nfs4_get_sb(struct file_system_type *fs_type,
1301 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) 800 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
1302{ 801{
1303 int error;
1304 struct nfs_server *server;
1305 struct super_block *s;
1306 struct nfs4_mount_data *data = raw_data; 802 struct nfs4_mount_data *data = raw_data;
803 struct super_block *s;
804 struct nfs_server *server;
805 struct sockaddr_in addr;
806 rpc_authflavor_t authflavour;
807 struct nfs_fh mntfh;
808 struct dentry *mntroot;
809 char *mntpath = NULL, *hostname = NULL, ip_addr[16];
1307 void *p; 810 void *p;
811 int error;
1308 812
1309 if (data == NULL) { 813 if (data == NULL) {
1310 dprintk("%s: missing data argument\n", __FUNCTION__); 814 dprintk("%s: missing data argument\n", __FUNCTION__);
@@ -1315,84 +819,112 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
1315 return -EINVAL; 819 return -EINVAL;
1316 } 820 }
1317 821
1318 server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); 822 /* We now require that the mount process passes the remote address */
1319 if (!server) 823 if (data->host_addrlen != sizeof(addr))
1320 return -ENOMEM; 824 return -EINVAL;
1321 /* Zero out the NFS state stuff */ 825
1322 init_nfsv4_state(server); 826 if (copy_from_user(&addr, data->host_addr, sizeof(addr)))
1323 server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL); 827 return -EFAULT;
828
829 if (addr.sin_family != AF_INET ||
830 addr.sin_addr.s_addr == INADDR_ANY
831 ) {
832 dprintk("%s: mount program didn't pass remote IP address!\n",
833 __FUNCTION__);
834 return -EINVAL;
835 }
836 /* RFC3530: The default port for NFS is 2049 */
837 if (addr.sin_port == 0)
838 addr.sin_port = NFS_PORT;
839
840 /* Grab the authentication type */
841 authflavour = RPC_AUTH_UNIX;
842 if (data->auth_flavourlen != 0) {
843 if (data->auth_flavourlen != 1) {
844 dprintk("%s: Invalid number of RPC auth flavours %d.\n",
845 __FUNCTION__, data->auth_flavourlen);
846 error = -EINVAL;
847 goto out_err_noserver;
848 }
849
850 if (copy_from_user(&authflavour, data->auth_flavours,
851 sizeof(authflavour))) {
852 error = -EFAULT;
853 goto out_err_noserver;
854 }
855 }
1324 856
1325 p = nfs_copy_user_string(NULL, &data->hostname, 256); 857 p = nfs_copy_user_string(NULL, &data->hostname, 256);
1326 if (IS_ERR(p)) 858 if (IS_ERR(p))
1327 goto out_err; 859 goto out_err;
1328 server->hostname = p; 860 hostname = p;
1329 861
1330 p = nfs_copy_user_string(NULL, &data->mnt_path, 1024); 862 p = nfs_copy_user_string(NULL, &data->mnt_path, 1024);
1331 if (IS_ERR(p)) 863 if (IS_ERR(p))
1332 goto out_err; 864 goto out_err;
1333 server->mnt_path = p; 865 mntpath = p;
866
867 dprintk("MNTPATH: %s\n", mntpath);
1334 868
1335 p = nfs_copy_user_string(server->ip_addr, &data->client_addr, 869 p = nfs_copy_user_string(ip_addr, &data->client_addr,
1336 sizeof(server->ip_addr) - 1); 870 sizeof(ip_addr) - 1);
1337 if (IS_ERR(p)) 871 if (IS_ERR(p))
1338 goto out_err; 872 goto out_err;
1339 873
1340 /* We now require that the mount process passes the remote address */ 874 /* Get a volume representation */
1341 if (data->host_addrlen != sizeof(server->addr)) { 875 server = nfs4_create_server(data, hostname, &addr, mntpath, ip_addr,
1342 error = -EINVAL; 876 authflavour, &mntfh);
1343 goto out_free; 877 if (IS_ERR(server)) {
1344 } 878 error = PTR_ERR(server);
1345 if (copy_from_user(&server->addr, data->host_addr, sizeof(server->addr))) { 879 goto out_err_noserver;
1346 error = -EFAULT;
1347 goto out_free;
1348 }
1349 if (server->addr.sin_family != AF_INET ||
1350 server->addr.sin_addr.s_addr == INADDR_ANY) {
1351 dprintk("%s: mount program didn't pass remote IP address!\n",
1352 __FUNCTION__);
1353 error = -EINVAL;
1354 goto out_free;
1355 }
1356
1357 /* Fire up rpciod if not yet running */
1358 error = rpciod_up();
1359 if (error < 0) {
1360 dprintk("%s: couldn't start rpciod! Error = %d\n",
1361 __FUNCTION__, error);
1362 goto out_free;
1363 } 880 }
1364 881
1365 s = sget(fs_type, nfs4_compare_super, nfs_set_super, server); 882 /* Get a superblock - note that we may end up sharing one that already exists */
1366 883 s = sget(fs_type, nfs_compare_super, nfs_set_super, server);
1367 if (IS_ERR(s)) { 884 if (IS_ERR(s)) {
1368 error = PTR_ERR(s); 885 error = PTR_ERR(s);
1369 goto out_free; 886 goto out_free;
1370 } 887 }
1371 888
1372 if (s->s_root) { 889 if (s->s_fs_info != server) {
1373 kfree(server->mnt_path); 890 nfs_free_server(server);
1374 kfree(server->hostname); 891 server = NULL;
1375 kfree(server);
1376 return simple_set_mnt(mnt, s);
1377 } 892 }
1378 893
1379 s->s_flags = flags; 894 if (!s->s_root) {
895 /* initial superblock/root creation */
896 s->s_flags = flags;
897 nfs4_fill_super(s);
898 }
1380 899
1381 error = nfs4_fill_super(s, data, flags & MS_SILENT ? 1 : 0); 900 mntroot = nfs4_get_root(s, &mntfh);
1382 if (error) { 901 if (IS_ERR(mntroot)) {
1383 up_write(&s->s_umount); 902 error = PTR_ERR(mntroot);
1384 deactivate_super(s); 903 goto error_splat_super;
1385 return error;
1386 } 904 }
905
1387 s->s_flags |= MS_ACTIVE; 906 s->s_flags |= MS_ACTIVE;
1388 return simple_set_mnt(mnt, s); 907 mnt->mnt_sb = s;
908 mnt->mnt_root = mntroot;
909 kfree(mntpath);
910 kfree(hostname);
911 return 0;
912
1389out_err: 913out_err:
1390 error = PTR_ERR(p); 914 error = PTR_ERR(p);
915 goto out_err_noserver;
916
1391out_free: 917out_free:
1392 kfree(server->mnt_path); 918 nfs_free_server(server);
1393 kfree(server->hostname); 919out_err_noserver:
1394 kfree(server); 920 kfree(mntpath);
921 kfree(hostname);
1395 return error; 922 return error;
923
924error_splat_super:
925 up_write(&s->s_umount);
926 deactivate_super(s);
927 goto out_err_noserver;
1396} 928}
1397 929
1398static void nfs4_kill_super(struct super_block *sb) 930static void nfs4_kill_super(struct super_block *sb)
@@ -1403,135 +935,140 @@ static void nfs4_kill_super(struct super_block *sb)
1403 kill_anon_super(sb); 935 kill_anon_super(sb);
1404 936
1405 nfs4_renewd_prepare_shutdown(server); 937 nfs4_renewd_prepare_shutdown(server);
938 nfs_free_server(server);
939}
940
941/*
942 * Clone an NFS4 server record on xdev traversal (FSID-change)
943 */
944static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
945 const char *dev_name, void *raw_data,
946 struct vfsmount *mnt)
947{
948 struct nfs_clone_mount *data = raw_data;
949 struct super_block *s;
950 struct nfs_server *server;
951 struct dentry *mntroot;
952 int error;
953
954 dprintk("--> nfs4_xdev_get_sb()\n");
955
956 /* create a new volume representation */
957 server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr);
958 if (IS_ERR(server)) {
959 error = PTR_ERR(server);
960 goto out_err_noserver;
961 }
962
963 /* Get a superblock - note that we may end up sharing one that already exists */
964 s = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server);
965 if (IS_ERR(s)) {
966 error = PTR_ERR(s);
967 goto out_err_nosb;
968 }
1406 969
1407 if (server->client != NULL && !IS_ERR(server->client)) 970 if (s->s_fs_info != server) {
1408 rpc_shutdown_client(server->client); 971 nfs_free_server(server);
972 server = NULL;
973 }
1409 974
1410 destroy_nfsv4_state(server); 975 if (!s->s_root) {
976 /* initial superblock/root creation */
977 s->s_flags = flags;
978 nfs4_clone_super(s, data->sb);
979 }
980
981 mntroot = nfs4_get_root(s, data->fh);
982 if (IS_ERR(mntroot)) {
983 error = PTR_ERR(mntroot);
984 goto error_splat_super;
985 }
1411 986
1412 rpciod_down(); 987 s->s_flags |= MS_ACTIVE;
988 mnt->mnt_sb = s;
989 mnt->mnt_root = mntroot;
990
991 dprintk("<-- nfs4_xdev_get_sb() = 0\n");
992 return 0;
993
994out_err_nosb:
995 nfs_free_server(server);
996out_err_noserver:
997 dprintk("<-- nfs4_xdev_get_sb() = %d [error]\n", error);
998 return error;
1413 999
1414 nfs_free_iostats(server->io_stats); 1000error_splat_super:
1415 kfree(server->hostname); 1001 up_write(&s->s_umount);
1416 kfree(server); 1002 deactivate_super(s);
1417 nfs_release_automount_timer(); 1003 dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error);
1004 return error;
1418} 1005}
1419 1006
1420/* 1007/*
1421 * Constructs the SERVER-side path 1008 * Create an NFS4 server record on referral traversal
1422 */ 1009 */
1423static inline char *nfs4_dup_path(const struct dentry *dentry) 1010static int nfs4_referral_get_sb(struct file_system_type *fs_type, int flags,
1011 const char *dev_name, void *raw_data,
1012 struct vfsmount *mnt)
1424{ 1013{
1425 char *page = (char *) __get_free_page(GFP_USER); 1014 struct nfs_clone_mount *data = raw_data;
1426 char *path; 1015 struct super_block *s;
1016 struct nfs_server *server;
1017 struct dentry *mntroot;
1018 struct nfs_fh mntfh;
1019 int error;
1427 1020
1428 path = nfs4_path(dentry, page, PAGE_SIZE); 1021 dprintk("--> nfs4_referral_get_sb()\n");
1429 if (!IS_ERR(path)) {
1430 int len = PAGE_SIZE + page - path;
1431 char *tmp = path;
1432 1022
1433 path = kmalloc(len, GFP_KERNEL); 1023 /* create a new volume representation */
1434 if (path) 1024 server = nfs4_create_referral_server(data, &mntfh);
1435 memcpy(path, tmp, len); 1025 if (IS_ERR(server)) {
1436 else 1026 error = PTR_ERR(server);
1437 path = ERR_PTR(-ENOMEM); 1027 goto out_err_noserver;
1438 } 1028 }
1439 free_page((unsigned long)page);
1440 return path;
1441}
1442 1029
1443static struct super_block *nfs4_clone_sb(struct nfs_server *server, struct nfs_clone_mount *data) 1030 /* Get a superblock - note that we may end up sharing one that already exists */
1444{ 1031 s = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server);
1445 const struct dentry *dentry = data->dentry; 1032 if (IS_ERR(s)) {
1446 struct nfs4_client *clp = server->nfs4_state; 1033 error = PTR_ERR(s);
1447 struct super_block *sb; 1034 goto out_err_nosb;
1448
1449 server->fsid = data->fattr->fsid;
1450 nfs_copy_fh(&server->fh, data->fh);
1451 server->mnt_path = nfs4_dup_path(dentry);
1452 if (IS_ERR(server->mnt_path)) {
1453 sb = (struct super_block *)server->mnt_path;
1454 goto err;
1455 } 1035 }
1456 sb = sget(&nfs4_fs_type, nfs4_compare_super, nfs_set_super, server);
1457 if (IS_ERR(sb) || sb->s_root)
1458 goto free_path;
1459 nfs4_server_capabilities(server, &server->fh);
1460
1461 down_write(&clp->cl_sem);
1462 atomic_inc(&clp->cl_count);
1463 list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
1464 up_write(&clp->cl_sem);
1465 return sb;
1466free_path:
1467 kfree(server->mnt_path);
1468err:
1469 server->mnt_path = NULL;
1470 return sb;
1471}
1472 1036
1473static int nfs_clone_nfs4_sb(struct file_system_type *fs_type, 1037 if (s->s_fs_info != server) {
1474 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) 1038 nfs_free_server(server);
1475{ 1039 server = NULL;
1476 struct nfs_clone_mount *data = raw_data; 1040 }
1477 return nfs_clone_generic_sb(data, nfs4_clone_sb, nfs_clone_server, mnt);
1478}
1479 1041
1480static struct super_block *nfs4_referral_sb(struct nfs_server *server, struct nfs_clone_mount *data) 1042 if (!s->s_root) {
1481{ 1043 /* initial superblock/root creation */
1482 struct super_block *sb = ERR_PTR(-ENOMEM); 1044 s->s_flags = flags;
1483 int len; 1045 nfs4_fill_super(s);
1484 1046 }
1485 len = strlen(data->mnt_path) + 1;
1486 server->mnt_path = kmalloc(len, GFP_KERNEL);
1487 if (server->mnt_path == NULL)
1488 goto err;
1489 memcpy(server->mnt_path, data->mnt_path, len);
1490 memcpy(&server->addr, data->addr, sizeof(struct sockaddr_in));
1491
1492 sb = sget(&nfs4_fs_type, nfs4_compare_super, nfs_set_super, server);
1493 if (IS_ERR(sb) || sb->s_root)
1494 goto free_path;
1495 return sb;
1496free_path:
1497 kfree(server->mnt_path);
1498err:
1499 server->mnt_path = NULL;
1500 return sb;
1501}
1502 1047
1503static struct nfs_server *nfs4_referral_server(struct super_block *sb, struct nfs_clone_mount *data) 1048 mntroot = nfs4_get_root(s, data->fh);
1504{ 1049 if (IS_ERR(mntroot)) {
1505 struct nfs_server *server = NFS_SB(sb); 1050 error = PTR_ERR(mntroot);
1506 struct rpc_timeout timeparms; 1051 goto error_splat_super;
1507 int proto, timeo, retrans; 1052 }
1508 void *err;
1509
1510 proto = IPPROTO_TCP;
1511 /* Since we are following a referral and there may be alternatives,
1512 set the timeouts and retries to low values */
1513 timeo = 2;
1514 retrans = 1;
1515 nfs_init_timeout_values(&timeparms, proto, timeo, retrans);
1516
1517 server->client = nfs4_create_client(server, &timeparms, proto, data->authflavor);
1518 if (IS_ERR((err = server->client)))
1519 goto out_err;
1520 1053
1521 sb->s_time_gran = 1; 1054 s->s_flags |= MS_ACTIVE;
1522 sb->s_op = &nfs4_sops; 1055 mnt->mnt_sb = s;
1523 err = ERR_PTR(nfs_sb_init(sb, data->authflavor)); 1056 mnt->mnt_root = mntroot;
1524 if (!IS_ERR(err))
1525 return server;
1526out_err:
1527 return (struct nfs_server *)err;
1528}
1529 1057
1530static int nfs_referral_nfs4_sb(struct file_system_type *fs_type, 1058 dprintk("<-- nfs4_referral_get_sb() = 0\n");
1531 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) 1059 return 0;
1532{ 1060
1533 struct nfs_clone_mount *data = raw_data; 1061out_err_nosb:
1534 return nfs_clone_generic_sb(data, nfs4_referral_sb, nfs4_referral_server, mnt); 1062 nfs_free_server(server);
1063out_err_noserver:
1064 dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error);
1065 return error;
1066
1067error_splat_super:
1068 up_write(&s->s_umount);
1069 deactivate_super(s);
1070 dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error);
1071 return error;
1535} 1072}
1536 1073
1537#endif 1074#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 7084ac9a6455..b674462793d3 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -396,6 +396,7 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
396out: 396out:
397 clear_bit(BDI_write_congested, &bdi->state); 397 clear_bit(BDI_write_congested, &bdi->state);
398 wake_up_all(&nfs_write_congestion); 398 wake_up_all(&nfs_write_congestion);
399 writeback_congestion_end();
399 return err; 400 return err;
400} 401}
401 402
@@ -1252,7 +1253,13 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1252 dprintk("NFS: %4d nfs_writeback_done (status %d)\n", 1253 dprintk("NFS: %4d nfs_writeback_done (status %d)\n",
1253 task->tk_pid, task->tk_status); 1254 task->tk_pid, task->tk_status);
1254 1255
1255 /* Call the NFS version-specific code */ 1256 /*
1257 * ->write_done will attempt to use post-op attributes to detect
1258 * conflicting writes by other clients. A strict interpretation
1259 * of close-to-open would allow us to continue caching even if
1260 * another writer had changed the file, but some applications
1261 * depend on tighter cache coherency when writing.
1262 */
1256 status = NFS_PROTO(data->inode)->write_done(task, data); 1263 status = NFS_PROTO(data->inode)->write_done(task, data);
1257 if (status != 0) 1264 if (status != 0)
1258 return status; 1265 return status;
@@ -1273,7 +1280,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1273 if (time_before(complain, jiffies)) { 1280 if (time_before(complain, jiffies)) {
1274 dprintk("NFS: faulty NFS server %s:" 1281 dprintk("NFS: faulty NFS server %s:"
1275 " (committed = %d) != (stable = %d)\n", 1282 " (committed = %d) != (stable = %d)\n",
1276 NFS_SERVER(data->inode)->hostname, 1283 NFS_SERVER(data->inode)->nfs_client->cl_hostname,
1277 resp->verf->committed, argp->stable); 1284 resp->verf->committed, argp->stable);
1278 complain = jiffies + 300 * HZ; 1285 complain = jiffies + 300 * HZ;
1279 } 1286 }
@@ -1558,7 +1565,6 @@ void nfs_destroy_writepagecache(void)
1558{ 1565{
1559 mempool_destroy(nfs_commit_mempool); 1566 mempool_destroy(nfs_commit_mempool);
1560 mempool_destroy(nfs_wdata_mempool); 1567 mempool_destroy(nfs_wdata_mempool);
1561 if (kmem_cache_destroy(nfs_wdata_cachep)) 1568 kmem_cache_destroy(nfs_wdata_cachep);
1562 printk(KERN_INFO "nfs_write_data: not all structures were freed\n");
1563} 1569}
1564 1570
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 54b37b1d2e3a..8583d99ee740 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -375,16 +375,28 @@ nfsd4_probe_callback(struct nfs4_client *clp)
375{ 375{
376 struct sockaddr_in addr; 376 struct sockaddr_in addr;
377 struct nfs4_callback *cb = &clp->cl_callback; 377 struct nfs4_callback *cb = &clp->cl_callback;
378 struct rpc_timeout timeparms; 378 struct rpc_timeout timeparms = {
379 struct rpc_xprt * xprt; 379 .to_initval = (NFSD_LEASE_TIME/4) * HZ,
380 .to_retries = 5,
381 .to_maxval = (NFSD_LEASE_TIME/2) * HZ,
382 .to_exponential = 1,
383 };
380 struct rpc_program * program = &cb->cb_program; 384 struct rpc_program * program = &cb->cb_program;
381 struct rpc_stat * stat = &cb->cb_stat; 385 struct rpc_create_args args = {
382 struct rpc_clnt * clnt; 386 .protocol = IPPROTO_TCP,
387 .address = (struct sockaddr *)&addr,
388 .addrsize = sizeof(addr),
389 .timeout = &timeparms,
390 .servername = clp->cl_name.data,
391 .program = program,
392 .version = nfs_cb_version[1]->number,
393 .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */
394 .flags = (RPC_CLNT_CREATE_NOPING),
395 };
383 struct rpc_message msg = { 396 struct rpc_message msg = {
384 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], 397 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
385 .rpc_argp = clp, 398 .rpc_argp = clp,
386 }; 399 };
387 char hostname[32];
388 int status; 400 int status;
389 401
390 if (atomic_read(&cb->cb_set)) 402 if (atomic_read(&cb->cb_set))
@@ -396,51 +408,27 @@ nfsd4_probe_callback(struct nfs4_client *clp)
396 addr.sin_port = htons(cb->cb_port); 408 addr.sin_port = htons(cb->cb_port);
397 addr.sin_addr.s_addr = htonl(cb->cb_addr); 409 addr.sin_addr.s_addr = htonl(cb->cb_addr);
398 410
399 /* Initialize timeout */
400 timeparms.to_initval = (NFSD_LEASE_TIME/4) * HZ;
401 timeparms.to_retries = 0;
402 timeparms.to_maxval = (NFSD_LEASE_TIME/2) * HZ;
403 timeparms.to_exponential = 1;
404
405 /* Create RPC transport */
406 xprt = xprt_create_proto(IPPROTO_TCP, &addr, &timeparms);
407 if (IS_ERR(xprt)) {
408 dprintk("NFSD: couldn't create callback transport!\n");
409 goto out_err;
410 }
411
412 /* Initialize rpc_program */ 411 /* Initialize rpc_program */
413 program->name = "nfs4_cb"; 412 program->name = "nfs4_cb";
414 program->number = cb->cb_prog; 413 program->number = cb->cb_prog;
415 program->nrvers = ARRAY_SIZE(nfs_cb_version); 414 program->nrvers = ARRAY_SIZE(nfs_cb_version);
416 program->version = nfs_cb_version; 415 program->version = nfs_cb_version;
417 program->stats = stat; 416 program->stats = &cb->cb_stat;
418 417
419 /* Initialize rpc_stat */ 418 /* Initialize rpc_stat */
420 memset(stat, 0, sizeof(struct rpc_stat)); 419 memset(program->stats, 0, sizeof(cb->cb_stat));
421 stat->program = program; 420 program->stats->program = program;
422 421
423 /* Create RPC client 422 /* Create RPC client */
424 * 423 cb->cb_client = rpc_create(&args);
425 * XXX AUTH_UNIX only - need AUTH_GSS.... 424 if (!cb->cb_client) {
426 */
427 sprintf(hostname, "%u.%u.%u.%u", NIPQUAD(addr.sin_addr.s_addr));
428 clnt = rpc_new_client(xprt, hostname, program, 1, RPC_AUTH_UNIX);
429 if (IS_ERR(clnt)) {
430 dprintk("NFSD: couldn't create callback client\n"); 425 dprintk("NFSD: couldn't create callback client\n");
431 goto out_err; 426 goto out_err;
432 } 427 }
433 clnt->cl_intr = 0;
434 clnt->cl_softrtry = 1;
435 428
436 /* Kick rpciod, put the call on the wire. */ 429 /* Kick rpciod, put the call on the wire. */
437 430 if (rpciod_up() != 0)
438 if (rpciod_up() != 0) {
439 dprintk("nfsd: couldn't start rpciod for callbacks!\n");
440 goto out_clnt; 431 goto out_clnt;
441 }
442
443 cb->cb_client = clnt;
444 432
445 /* the task holds a reference to the nfs4_client struct */ 433 /* the task holds a reference to the nfs4_client struct */
446 atomic_inc(&clp->cl_count); 434 atomic_inc(&clp->cl_count);
@@ -448,7 +436,7 @@ nfsd4_probe_callback(struct nfs4_client *clp)
448 msg.rpc_cred = nfsd4_lookupcred(clp,0); 436 msg.rpc_cred = nfsd4_lookupcred(clp,0);
449 if (IS_ERR(msg.rpc_cred)) 437 if (IS_ERR(msg.rpc_cred))
450 goto out_rpciod; 438 goto out_rpciod;
451 status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, &nfs4_cb_null_ops, NULL); 439 status = rpc_call_async(cb->cb_client, &msg, RPC_TASK_ASYNC, &nfs4_cb_null_ops, NULL);
452 put_rpccred(msg.rpc_cred); 440 put_rpccred(msg.rpc_cred);
453 441
454 if (status != 0) { 442 if (status != 0) {
@@ -462,7 +450,7 @@ out_rpciod:
462 rpciod_down(); 450 rpciod_down();
463 cb->cb_client = NULL; 451 cb->cb_client = NULL;
464out_clnt: 452out_clnt:
465 rpc_shutdown_client(clnt); 453 rpc_shutdown_client(cb->cb_client);
466out_err: 454out_err:
467 dprintk("NFSD: warning: no callback path to client %.*s\n", 455 dprintk("NFSD: warning: no callback path to client %.*s\n",
468 (int)clp->cl_name.len, clp->cl_name.data); 456 (int)clp->cl_name.len, clp->cl_name.data);
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index bea6b9478114..b1902ebaab41 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -573,10 +573,9 @@ idmap_lookup(struct svc_rqst *rqstp,
573 struct idmap_defer_req *mdr; 573 struct idmap_defer_req *mdr;
574 int ret; 574 int ret;
575 575
576 mdr = kmalloc(sizeof(*mdr), GFP_KERNEL); 576 mdr = kzalloc(sizeof(*mdr), GFP_KERNEL);
577 if (!mdr) 577 if (!mdr)
578 return -ENOMEM; 578 return -ENOMEM;
579 memset(mdr, 0, sizeof(*mdr));
580 atomic_set(&mdr->count, 1); 579 atomic_set(&mdr->count, 1);
581 init_waitqueue_head(&mdr->waitq); 580 init_waitqueue_head(&mdr->waitq);
582 mdr->req.defer = idmap_defer; 581 mdr->req.defer = idmap_defer;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 9daa0b9feb8d..ebcf226a9e4a 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -339,8 +339,7 @@ alloc_client(struct xdr_netobj name)
339{ 339{
340 struct nfs4_client *clp; 340 struct nfs4_client *clp;
341 341
342 if ((clp = kmalloc(sizeof(struct nfs4_client), GFP_KERNEL))!= NULL) { 342 if ((clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL))!= NULL) {
343 memset(clp, 0, sizeof(*clp));
344 if ((clp->cl_name.data = kmalloc(name.len, GFP_KERNEL)) != NULL) { 343 if ((clp->cl_name.data = kmalloc(name.len, GFP_KERNEL)) != NULL) {
345 memcpy(clp->cl_name.data, name.data, name.len); 344 memcpy(clp->cl_name.data, name.data, name.len);
346 clp->cl_name.len = name.len; 345 clp->cl_name.len = name.len;
@@ -1006,13 +1005,10 @@ alloc_init_file(struct inode *ino)
1006static void 1005static void
1007nfsd4_free_slab(kmem_cache_t **slab) 1006nfsd4_free_slab(kmem_cache_t **slab)
1008{ 1007{
1009 int status;
1010
1011 if (*slab == NULL) 1008 if (*slab == NULL)
1012 return; 1009 return;
1013 status = kmem_cache_destroy(*slab); 1010 kmem_cache_destroy(*slab);
1014 *slab = NULL; 1011 *slab = NULL;
1015 WARN_ON(status);
1016} 1012}
1017 1013
1018static void 1014static void
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index d1e2c6f9f05e..85c36b8ca452 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1149,8 +1149,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
1149 * Allocate a buffer to store the current name being processed 1149 * Allocate a buffer to store the current name being processed
1150 * converted to format determined by current NLS. 1150 * converted to format determined by current NLS.
1151 */ 1151 */
1152 name = (u8*)kmalloc(NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1, 1152 name = kmalloc(NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1, GFP_NOFS);
1153 GFP_NOFS);
1154 if (unlikely(!name)) { 1153 if (unlikely(!name)) {
1155 err = -ENOMEM; 1154 err = -ENOMEM;
1156 goto err_out; 1155 goto err_out;
@@ -1191,7 +1190,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
1191 * map the mft record without deadlocking. 1190 * map the mft record without deadlocking.
1192 */ 1191 */
1193 rc = le32_to_cpu(ctx->attr->data.resident.value_length); 1192 rc = le32_to_cpu(ctx->attr->data.resident.value_length);
1194 ir = (INDEX_ROOT*)kmalloc(rc, GFP_NOFS); 1193 ir = kmalloc(rc, GFP_NOFS);
1195 if (unlikely(!ir)) { 1194 if (unlikely(!ir)) {
1196 err = -ENOMEM; 1195 err = -ENOMEM;
1197 goto err_out; 1196 goto err_out;
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index d313f356e66a..933dbd89c2a4 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -137,7 +137,7 @@ static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na)
137 137
138 BUG_ON(!na->name); 138 BUG_ON(!na->name);
139 i = na->name_len * sizeof(ntfschar); 139 i = na->name_len * sizeof(ntfschar);
140 ni->name = (ntfschar*)kmalloc(i + sizeof(ntfschar), GFP_ATOMIC); 140 ni->name = kmalloc(i + sizeof(ntfschar), GFP_ATOMIC);
141 if (!ni->name) 141 if (!ni->name)
142 return -ENOMEM; 142 return -ENOMEM;
143 memcpy(ni->name, na->name, i); 143 memcpy(ni->name, na->name, i);
@@ -556,8 +556,6 @@ static int ntfs_read_locked_inode(struct inode *vi)
556 556
557 /* Setup the generic vfs inode parts now. */ 557 /* Setup the generic vfs inode parts now. */
558 558
559 /* This is the optimal IO size (for stat), not the fs block size. */
560 vi->i_blksize = PAGE_CACHE_SIZE;
561 /* 559 /*
562 * This is for checking whether an inode has changed w.r.t. a file so 560 * This is for checking whether an inode has changed w.r.t. a file so
563 * that the file can be updated if necessary (compare with f_version). 561 * that the file can be updated if necessary (compare with f_version).
@@ -1234,7 +1232,6 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
1234 base_ni = NTFS_I(base_vi); 1232 base_ni = NTFS_I(base_vi);
1235 1233
1236 /* Just mirror the values from the base inode. */ 1234 /* Just mirror the values from the base inode. */
1237 vi->i_blksize = base_vi->i_blksize;
1238 vi->i_version = base_vi->i_version; 1235 vi->i_version = base_vi->i_version;
1239 vi->i_uid = base_vi->i_uid; 1236 vi->i_uid = base_vi->i_uid;
1240 vi->i_gid = base_vi->i_gid; 1237 vi->i_gid = base_vi->i_gid;
@@ -1504,7 +1501,6 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
1504 ni = NTFS_I(vi); 1501 ni = NTFS_I(vi);
1505 base_ni = NTFS_I(base_vi); 1502 base_ni = NTFS_I(base_vi);
1506 /* Just mirror the values from the base inode. */ 1503 /* Just mirror the values from the base inode. */
1507 vi->i_blksize = base_vi->i_blksize;
1508 vi->i_version = base_vi->i_version; 1504 vi->i_version = base_vi->i_version;
1509 vi->i_uid = base_vi->i_uid; 1505 vi->i_uid = base_vi->i_uid;
1510 vi->i_gid = base_vi->i_gid; 1506 vi->i_gid = base_vi->i_gid;
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 2438c00ec0ce..584260fd6848 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -331,7 +331,7 @@ map_err_out:
331 ntfs_inode **tmp; 331 ntfs_inode **tmp;
332 int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode *); 332 int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode *);
333 333
334 tmp = (ntfs_inode **)kmalloc(new_size, GFP_NOFS); 334 tmp = kmalloc(new_size, GFP_NOFS);
335 if (unlikely(!tmp)) { 335 if (unlikely(!tmp)) {
336 ntfs_error(base_ni->vol->sb, "Failed to allocate " 336 ntfs_error(base_ni->vol->sb, "Failed to allocate "
337 "internal buffer."); 337 "internal buffer.");
@@ -2638,11 +2638,6 @@ mft_rec_already_initialized:
2638 } 2638 }
2639 vi->i_ino = bit; 2639 vi->i_ino = bit;
2640 /* 2640 /*
2641 * This is the optimal IO size (for stat), not the fs block
2642 * size.
2643 */
2644 vi->i_blksize = PAGE_CACHE_SIZE;
2645 /*
2646 * This is for checking whether an inode has changed w.r.t. a 2641 * This is for checking whether an inode has changed w.r.t. a
2647 * file so that the file can be updated if necessary (compare 2642 * file so that the file can be updated if necessary (compare
2648 * with f_version). 2643 * with f_version).
@@ -2893,7 +2888,7 @@ rollback:
2893 if (!(base_ni->nr_extents & 3)) { 2888 if (!(base_ni->nr_extents & 3)) {
2894 int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode*); 2889 int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode*);
2895 2890
2896 extent_nis = (ntfs_inode**)kmalloc(new_size, GFP_NOFS); 2891 extent_nis = kmalloc(new_size, GFP_NOFS);
2897 if (unlikely(!extent_nis)) { 2892 if (unlikely(!extent_nis)) {
2898 ntfs_error(vol->sb, "Failed to allocate internal " 2893 ntfs_error(vol->sb, "Failed to allocate internal "
2899 "buffer during rollback.%s", es); 2894 "buffer during rollback.%s", es);
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 74e0ee8fce72..6b2712f10dd2 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -3248,32 +3248,14 @@ ictx_err_out:
3248 3248
3249static void __exit exit_ntfs_fs(void) 3249static void __exit exit_ntfs_fs(void)
3250{ 3250{
3251 int err = 0;
3252
3253 ntfs_debug("Unregistering NTFS driver."); 3251 ntfs_debug("Unregistering NTFS driver.");
3254 3252
3255 unregister_filesystem(&ntfs_fs_type); 3253 unregister_filesystem(&ntfs_fs_type);
3256 3254 kmem_cache_destroy(ntfs_big_inode_cache);
3257 if (kmem_cache_destroy(ntfs_big_inode_cache) && (err = 1)) 3255 kmem_cache_destroy(ntfs_inode_cache);
3258 printk(KERN_CRIT "NTFS: Failed to destory %s.\n", 3256 kmem_cache_destroy(ntfs_name_cache);
3259 ntfs_big_inode_cache_name); 3257 kmem_cache_destroy(ntfs_attr_ctx_cache);
3260 if (kmem_cache_destroy(ntfs_inode_cache) && (err = 1)) 3258 kmem_cache_destroy(ntfs_index_ctx_cache);
3261 printk(KERN_CRIT "NTFS: Failed to destory %s.\n",
3262 ntfs_inode_cache_name);
3263 if (kmem_cache_destroy(ntfs_name_cache) && (err = 1))
3264 printk(KERN_CRIT "NTFS: Failed to destory %s.\n",
3265 ntfs_name_cache_name);
3266 if (kmem_cache_destroy(ntfs_attr_ctx_cache) && (err = 1))
3267 printk(KERN_CRIT "NTFS: Failed to destory %s.\n",
3268 ntfs_attr_ctx_cache_name);
3269 if (kmem_cache_destroy(ntfs_index_ctx_cache) && (err = 1))
3270 printk(KERN_CRIT "NTFS: Failed to destory %s.\n",
3271 ntfs_index_ctx_cache_name);
3272 if (err)
3273 printk(KERN_CRIT "NTFS: This causes memory to leak! There is "
3274 "probably a BUG in the driver! Please report "
3275 "you saw this message to "
3276 "linux-ntfs-dev@lists.sourceforge.net\n");
3277 /* Unregister the ntfs sysctls. */ 3259 /* Unregister the ntfs sysctls. */
3278 ntfs_sysctl(0); 3260 ntfs_sysctl(0);
3279} 3261}
diff --git a/fs/ntfs/unistr.c b/fs/ntfs/unistr.c
index b123c0fa6bf6..a1b572196fe4 100644
--- a/fs/ntfs/unistr.c
+++ b/fs/ntfs/unistr.c
@@ -350,7 +350,7 @@ int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins,
350 } 350 }
351 if (!ns) { 351 if (!ns) {
352 ns_len = ins_len * NLS_MAX_CHARSET_SIZE; 352 ns_len = ins_len * NLS_MAX_CHARSET_SIZE;
353 ns = (unsigned char*)kmalloc(ns_len + 1, GFP_NOFS); 353 ns = kmalloc(ns_len + 1, GFP_NOFS);
354 if (!ns) 354 if (!ns)
355 goto mem_err_out; 355 goto mem_err_out;
356 } 356 }
@@ -365,7 +365,7 @@ retry: wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o,
365 else if (wc == -ENAMETOOLONG && ns != *outs) { 365 else if (wc == -ENAMETOOLONG && ns != *outs) {
366 unsigned char *tc; 366 unsigned char *tc;
367 /* Grow in multiples of 64 bytes. */ 367 /* Grow in multiples of 64 bytes. */
368 tc = (unsigned char*)kmalloc((ns_len + 64) & 368 tc = kmalloc((ns_len + 64) &
369 ~63, GFP_NOFS); 369 ~63, GFP_NOFS);
370 if (tc) { 370 if (tc) {
371 memcpy(tc, ns, ns_len); 371 memcpy(tc, ns, ns_len);
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index ff9e2e2104c2..4b46aac7d243 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -44,11 +44,17 @@
44 * locking semantics of the file system using the protocol. It should 44 * locking semantics of the file system using the protocol. It should
45 * be somewhere else, I'm sure, but right now it isn't. 45 * be somewhere else, I'm sure, but right now it isn't.
46 * 46 *
47 * New in version 4:
48 * - Remove i_generation from lock names for better stat performance.
49 *
50 * New in version 3:
51 * - Replace dentry votes with a cluster lock
52 *
47 * New in version 2: 53 * New in version 2:
48 * - full 64 bit i_size in the metadata lock lvbs 54 * - full 64 bit i_size in the metadata lock lvbs
49 * - introduction of "rw" lock and pushing meta/data locking down 55 * - introduction of "rw" lock and pushing meta/data locking down
50 */ 56 */
51#define O2NET_PROTOCOL_VERSION 2ULL 57#define O2NET_PROTOCOL_VERSION 4ULL
52struct o2net_handshake { 58struct o2net_handshake {
53 __be64 protocol_version; 59 __be64 protocol_version;
54 __be64 connector_id; 60 __be64 connector_id;
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 1a01380e3878..014e73978dac 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -35,15 +35,17 @@
35 35
36#include "alloc.h" 36#include "alloc.h"
37#include "dcache.h" 37#include "dcache.h"
38#include "dlmglue.h"
38#include "file.h" 39#include "file.h"
39#include "inode.h" 40#include "inode.h"
40 41
42
41static int ocfs2_dentry_revalidate(struct dentry *dentry, 43static int ocfs2_dentry_revalidate(struct dentry *dentry,
42 struct nameidata *nd) 44 struct nameidata *nd)
43{ 45{
44 struct inode *inode = dentry->d_inode; 46 struct inode *inode = dentry->d_inode;
45 int ret = 0; /* if all else fails, just return false */ 47 int ret = 0; /* if all else fails, just return false */
46 struct ocfs2_super *osb; 48 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
47 49
48 mlog_entry("(0x%p, '%.*s')\n", dentry, 50 mlog_entry("(0x%p, '%.*s')\n", dentry,
49 dentry->d_name.len, dentry->d_name.name); 51 dentry->d_name.len, dentry->d_name.name);
@@ -55,28 +57,31 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
55 goto bail; 57 goto bail;
56 } 58 }
57 59
58 osb = OCFS2_SB(inode->i_sb);
59
60 BUG_ON(!osb); 60 BUG_ON(!osb);
61 61
62 if (inode != osb->root_inode) { 62 if (inode == osb->root_inode || is_bad_inode(inode))
63 spin_lock(&OCFS2_I(inode)->ip_lock); 63 goto bail;
64 /* did we or someone else delete this inode? */ 64
65 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { 65 spin_lock(&OCFS2_I(inode)->ip_lock);
66 spin_unlock(&OCFS2_I(inode)->ip_lock); 66 /* did we or someone else delete this inode? */
67 mlog(0, "inode (%llu) deleted, returning false\n", 67 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
68 (unsigned long long)OCFS2_I(inode)->ip_blkno);
69 goto bail;
70 }
71 spin_unlock(&OCFS2_I(inode)->ip_lock); 68 spin_unlock(&OCFS2_I(inode)->ip_lock);
69 mlog(0, "inode (%llu) deleted, returning false\n",
70 (unsigned long long)OCFS2_I(inode)->ip_blkno);
71 goto bail;
72 }
73 spin_unlock(&OCFS2_I(inode)->ip_lock);
72 74
73 if (!inode->i_nlink) { 75 /*
74 mlog(0, "Inode %llu orphaned, returning false " 76 * We don't need a cluster lock to test this because once an
75 "dir = %d\n", 77 * inode nlink hits zero, it never goes back.
76 (unsigned long long)OCFS2_I(inode)->ip_blkno, 78 */
77 S_ISDIR(inode->i_mode)); 79 if (inode->i_nlink == 0) {
78 goto bail; 80 mlog(0, "Inode %llu orphaned, returning false "
79 } 81 "dir = %d\n",
82 (unsigned long long)OCFS2_I(inode)->ip_blkno,
83 S_ISDIR(inode->i_mode));
84 goto bail;
80 } 85 }
81 86
82 ret = 1; 87 ret = 1;
@@ -87,6 +92,322 @@ bail:
87 return ret; 92 return ret;
88} 93}
89 94
95static int ocfs2_match_dentry(struct dentry *dentry,
96 u64 parent_blkno,
97 int skip_unhashed)
98{
99 struct inode *parent;
100
101 /*
102 * ocfs2_lookup() does a d_splice_alias() _before_ attaching
103 * to the lock data, so we skip those here, otherwise
104 * ocfs2_dentry_attach_lock() will get its original dentry
105 * back.
106 */
107 if (!dentry->d_fsdata)
108 return 0;
109
110 if (!dentry->d_parent)
111 return 0;
112
113 if (skip_unhashed && d_unhashed(dentry))
114 return 0;
115
116 parent = dentry->d_parent->d_inode;
117 /* Negative parent dentry? */
118 if (!parent)
119 return 0;
120
121 /* Name is in a different directory. */
122 if (OCFS2_I(parent)->ip_blkno != parent_blkno)
123 return 0;
124
125 return 1;
126}
127
128/*
129 * Walk the inode alias list, and find a dentry which has a given
130 * parent. ocfs2_dentry_attach_lock() wants to find _any_ alias as it
131 * is looking for a dentry_lock reference. The vote thread is looking
132 * to unhash aliases, so we allow it to skip any that already have
133 * that property.
134 */
135struct dentry *ocfs2_find_local_alias(struct inode *inode,
136 u64 parent_blkno,
137 int skip_unhashed)
138{
139 struct list_head *p;
140 struct dentry *dentry = NULL;
141
142 spin_lock(&dcache_lock);
143
144 list_for_each(p, &inode->i_dentry) {
145 dentry = list_entry(p, struct dentry, d_alias);
146
147 if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) {
148 mlog(0, "dentry found: %.*s\n",
149 dentry->d_name.len, dentry->d_name.name);
150
151 dget_locked(dentry);
152 break;
153 }
154
155 dentry = NULL;
156 }
157
158 spin_unlock(&dcache_lock);
159
160 return dentry;
161}
162
163DEFINE_SPINLOCK(dentry_attach_lock);
164
165/*
166 * Attach this dentry to a cluster lock.
167 *
168 * Dentry locks cover all links in a given directory to a particular
169 * inode. We do this so that ocfs2 can build a lock name which all
170 * nodes in the cluster can agree on at all times. Shoving full names
171 * in the cluster lock won't work due to size restrictions. Covering
172 * links inside of a directory is a good compromise because it still
173 * allows us to use the parent directory lock to synchronize
174 * operations.
175 *
176 * Call this function with the parent dir semaphore and the parent dir
177 * cluster lock held.
178 *
179 * The dir semaphore will protect us from having to worry about
180 * concurrent processes on our node trying to attach a lock at the
181 * same time.
182 *
183 * The dir cluster lock (held at either PR or EX mode) protects us
184 * from unlink and rename on other nodes.
185 *
186 * A dput() can happen asynchronously due to pruning, so we cover
187 * attaching and detaching the dentry lock with a
188 * dentry_attach_lock.
189 *
190 * A node which has done lookup on a name retains a protected read
191 * lock until final dput. If the user requests and unlink or rename,
192 * the protected read is upgraded to an exclusive lock. Other nodes
193 * who have seen the dentry will then be informed that they need to
194 * downgrade their lock, which will involve d_delete on the
195 * dentry. This happens in ocfs2_dentry_convert_worker().
196 */
197int ocfs2_dentry_attach_lock(struct dentry *dentry,
198 struct inode *inode,
199 u64 parent_blkno)
200{
201 int ret;
202 struct dentry *alias;
203 struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
204
205 mlog(0, "Attach \"%.*s\", parent %llu, fsdata: %p\n",
206 dentry->d_name.len, dentry->d_name.name,
207 (unsigned long long)parent_blkno, dl);
208
209 /*
210 * Negative dentry. We ignore these for now.
211 *
212 * XXX: Could we can improve ocfs2_dentry_revalidate() by
213 * tracking these?
214 */
215 if (!inode)
216 return 0;
217
218 if (dl) {
219 mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno,
220 " \"%.*s\": old parent: %llu, new: %llu\n",
221 dentry->d_name.len, dentry->d_name.name,
222 (unsigned long long)parent_blkno,
223 (unsigned long long)dl->dl_parent_blkno);
224 return 0;
225 }
226
227 alias = ocfs2_find_local_alias(inode, parent_blkno, 0);
228 if (alias) {
229 /*
230 * Great, an alias exists, which means we must have a
231 * dentry lock already. We can just grab the lock off
232 * the alias and add it to the list.
233 *
234 * We're depending here on the fact that this dentry
235 * was found and exists in the dcache and so must have
236 * a reference to the dentry_lock because we can't
237 * race creates. Final dput() cannot happen on it
238 * since we have it pinned, so our reference is safe.
239 */
240 dl = alias->d_fsdata;
241 mlog_bug_on_msg(!dl, "parent %llu, ino %llu\n",
242 (unsigned long long)parent_blkno,
243 (unsigned long long)OCFS2_I(inode)->ip_blkno);
244
245 mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno,
246 " \"%.*s\": old parent: %llu, new: %llu\n",
247 dentry->d_name.len, dentry->d_name.name,
248 (unsigned long long)parent_blkno,
249 (unsigned long long)dl->dl_parent_blkno);
250
251 mlog(0, "Found: %s\n", dl->dl_lockres.l_name);
252
253 goto out_attach;
254 }
255
256 /*
257 * There are no other aliases
258 */
259 dl = kmalloc(sizeof(*dl), GFP_NOFS);
260 if (!dl) {
261 ret = -ENOMEM;
262 mlog_errno(ret);
263 return ret;
264 }
265
266 dl->dl_count = 0;
267 /*
268 * Does this have to happen below, for all attaches, in case
269 * the struct inode gets blown away by votes?
270 */
271 dl->dl_inode = igrab(inode);
272 dl->dl_parent_blkno = parent_blkno;
273 ocfs2_dentry_lock_res_init(dl, parent_blkno, inode);
274
275out_attach:
276 spin_lock(&dentry_attach_lock);
277 dentry->d_fsdata = dl;
278 dl->dl_count++;
279 spin_unlock(&dentry_attach_lock);
280
281 /*
282 * This actually gets us our PRMODE level lock. From now on,
283 * we'll have a notification if one of these names is
284 * destroyed on another node.
285 */
286 ret = ocfs2_dentry_lock(dentry, 0);
287 if (!ret)
288 ocfs2_dentry_unlock(dentry, 0);
289 else
290 mlog_errno(ret);
291
292 dput(alias);
293
294 return ret;
295}
296
297/*
298 * ocfs2_dentry_iput() and friends.
299 *
300 * At this point, our particular dentry is detached from the inodes
301 * alias list, so there's no way that the locking code can find it.
302 *
303 * The interesting stuff happens when we determine that our lock needs
304 * to go away because this is the last subdir alias in the
305 * system. This function needs to handle a couple things:
306 *
307 * 1) Synchronizing lock shutdown with the downconvert threads. This
308 * is already handled for us via the lockres release drop function
309 * called in ocfs2_release_dentry_lock()
310 *
311 * 2) A race may occur when we're doing our lock shutdown and
312 * another process wants to create a new dentry lock. Right now we
313 * let them race, which means that for a very short while, this
314 * node might have two locks on a lock resource. This should be a
315 * problem though because one of them is in the process of being
316 * thrown out.
317 */
318static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
319 struct ocfs2_dentry_lock *dl)
320{
321 ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
322 ocfs2_lock_res_free(&dl->dl_lockres);
323 iput(dl->dl_inode);
324 kfree(dl);
325}
326
327void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
328 struct ocfs2_dentry_lock *dl)
329{
330 int unlock = 0;
331
332 BUG_ON(dl->dl_count == 0);
333
334 spin_lock(&dentry_attach_lock);
335 dl->dl_count--;
336 unlock = !dl->dl_count;
337 spin_unlock(&dentry_attach_lock);
338
339 if (unlock)
340 ocfs2_drop_dentry_lock(osb, dl);
341}
342
343static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode)
344{
345 struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
346
347 mlog_bug_on_msg(!dl && !(dentry->d_flags & DCACHE_DISCONNECTED),
348 "dentry: %.*s\n", dentry->d_name.len,
349 dentry->d_name.name);
350
351 if (!dl)
352 goto out;
353
354 mlog_bug_on_msg(dl->dl_count == 0, "dentry: %.*s, count: %u\n",
355 dentry->d_name.len, dentry->d_name.name,
356 dl->dl_count);
357
358 ocfs2_dentry_lock_put(OCFS2_SB(dentry->d_sb), dl);
359
360out:
361 iput(inode);
362}
363
364/*
365 * d_move(), but keep the locks in sync.
366 *
367 * When we are done, "dentry" will have the parent dir and name of
368 * "target", which will be thrown away.
369 *
370 * We manually update the lock of "dentry" if need be.
371 *
372 * "target" doesn't have it's dentry lock touched - we allow the later
373 * dput() to handle this for us.
374 *
375 * This is called during ocfs2_rename(), while holding parent
376 * directory locks. The dentries have already been deleted on other
377 * nodes via ocfs2_remote_dentry_delete().
378 *
379 * Normally, the VFS handles the d_move() for the file sytem, after
380 * the ->rename() callback. OCFS2 wants to handle this internally, so
381 * the new lock can be created atomically with respect to the cluster.
382 */
383void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target,
384 struct inode *old_dir, struct inode *new_dir)
385{
386 int ret;
387 struct ocfs2_super *osb = OCFS2_SB(old_dir->i_sb);
388 struct inode *inode = dentry->d_inode;
389
390 /*
391 * Move within the same directory, so the actual lock info won't
392 * change.
393 *
394 * XXX: Is there any advantage to dropping the lock here?
395 */
396 if (old_dir == new_dir)
397 goto out_move;
398
399 ocfs2_dentry_lock_put(osb, dentry->d_fsdata);
400
401 dentry->d_fsdata = NULL;
402 ret = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(new_dir)->ip_blkno);
403 if (ret)
404 mlog_errno(ret);
405
406out_move:
407 d_move(dentry, target);
408}
409
90struct dentry_operations ocfs2_dentry_ops = { 410struct dentry_operations ocfs2_dentry_ops = {
91 .d_revalidate = ocfs2_dentry_revalidate, 411 .d_revalidate = ocfs2_dentry_revalidate,
412 .d_iput = ocfs2_dentry_iput,
92}; 413};
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index 90072771114b..c091c34d9883 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -28,4 +28,31 @@
28 28
29extern struct dentry_operations ocfs2_dentry_ops; 29extern struct dentry_operations ocfs2_dentry_ops;
30 30
31struct ocfs2_dentry_lock {
32 unsigned int dl_count;
33 u64 dl_parent_blkno;
34
35 /*
36 * The ocfs2_dentry_lock keeps an inode reference until
37 * dl_lockres has been destroyed. This is usually done in
38 * ->d_iput() anyway, so there should be minimal impact.
39 */
40 struct inode *dl_inode;
41 struct ocfs2_lock_res dl_lockres;
42};
43
44int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode,
45 u64 parent_blkno);
46
47void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
48 struct ocfs2_dentry_lock *dl);
49
50struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno,
51 int skip_unhashed);
52
53void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target,
54 struct inode *old_dir, struct inode *new_dir);
55
56extern spinlock_t dentry_attach_lock;
57
31#endif /* OCFS2_DCACHE_H */ 58#endif /* OCFS2_DCACHE_H */
diff --git a/fs/ocfs2/dlm/dlmapi.h b/fs/ocfs2/dlm/dlmapi.h
index 53652f51c0e1..cfd5cb65cab0 100644
--- a/fs/ocfs2/dlm/dlmapi.h
+++ b/fs/ocfs2/dlm/dlmapi.h
@@ -182,6 +182,7 @@ enum dlm_status dlmlock(struct dlm_ctxt *dlm,
182 struct dlm_lockstatus *lksb, 182 struct dlm_lockstatus *lksb,
183 int flags, 183 int flags,
184 const char *name, 184 const char *name,
185 int namelen,
185 dlm_astlockfunc_t *ast, 186 dlm_astlockfunc_t *ast,
186 void *data, 187 void *data,
187 dlm_bastlockfunc_t *bast); 188 dlm_bastlockfunc_t *bast);
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index f13a4bac41f0..681046d51393 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -320,8 +320,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data)
320 320
321 res = dlm_lookup_lockres(dlm, name, locklen); 321 res = dlm_lookup_lockres(dlm, name, locklen);
322 if (!res) { 322 if (!res) {
323 mlog(ML_ERROR, "got %sast for unknown lockres! " 323 mlog(0, "got %sast for unknown lockres! "
324 "cookie=%u:%llu, name=%.*s, namelen=%u\n", 324 "cookie=%u:%llu, name=%.*s, namelen=%u\n",
325 past->type == DLM_AST ? "" : "b", 325 past->type == DLM_AST ? "" : "b",
326 dlm_get_lock_cookie_node(cookie), 326 dlm_get_lock_cookie_node(cookie),
327 dlm_get_lock_cookie_seq(cookie), 327 dlm_get_lock_cookie_seq(cookie),
@@ -462,7 +462,7 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
462 mlog(ML_ERROR, "sent AST to node %u, it returned " 462 mlog(ML_ERROR, "sent AST to node %u, it returned "
463 "DLM_MIGRATING!\n", lock->ml.node); 463 "DLM_MIGRATING!\n", lock->ml.node);
464 BUG(); 464 BUG();
465 } else if (status != DLM_NORMAL) { 465 } else if (status != DLM_NORMAL && status != DLM_IVLOCKID) {
466 mlog(ML_ERROR, "AST to node %u returned %d!\n", 466 mlog(ML_ERROR, "AST to node %u returned %d!\n",
467 lock->ml.node, status); 467 lock->ml.node, status);
468 /* ignore it */ 468 /* ignore it */
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 14530ee7e11d..fa968180b072 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -747,6 +747,7 @@ void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
747 u8 owner); 747 u8 owner);
748struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, 748struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
749 const char *lockid, 749 const char *lockid,
750 int namelen,
750 int flags); 751 int flags);
751struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, 752struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
752 const char *name, 753 const char *name,
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 033ad1701232..0368c6402182 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -335,7 +335,6 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
335 inode->i_mode = mode; 335 inode->i_mode = mode;
336 inode->i_uid = current->fsuid; 336 inode->i_uid = current->fsuid;
337 inode->i_gid = current->fsgid; 337 inode->i_gid = current->fsgid;
338 inode->i_blksize = PAGE_CACHE_SIZE;
339 inode->i_blocks = 0; 338 inode->i_blocks = 0;
340 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; 339 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
341 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 340 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -362,7 +361,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
362 inode->i_mode = mode; 361 inode->i_mode = mode;
363 inode->i_uid = current->fsuid; 362 inode->i_uid = current->fsuid;
364 inode->i_gid = current->fsgid; 363 inode->i_gid = current->fsgid;
365 inode->i_blksize = PAGE_CACHE_SIZE;
366 inode->i_blocks = 0; 364 inode->i_blocks = 0;
367 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; 365 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
368 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 366 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -629,9 +627,7 @@ static void __exit exit_dlmfs_fs(void)
629 flush_workqueue(user_dlm_worker); 627 flush_workqueue(user_dlm_worker);
630 destroy_workqueue(user_dlm_worker); 628 destroy_workqueue(user_dlm_worker);
631 629
632 if (kmem_cache_destroy(dlmfs_inode_cache)) 630 kmem_cache_destroy(dlmfs_inode_cache);
633 printk(KERN_INFO "dlmfs_inode_cache: not all structures "
634 "were freed\n");
635} 631}
636 632
637MODULE_AUTHOR("Oracle"); 633MODULE_AUTHOR("Oracle");
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 5ca57ec650c7..42a1b91979b5 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -540,8 +540,8 @@ static inline void dlm_get_next_cookie(u8 node_num, u64 *cookie)
540 540
541enum dlm_status dlmlock(struct dlm_ctxt *dlm, int mode, 541enum dlm_status dlmlock(struct dlm_ctxt *dlm, int mode,
542 struct dlm_lockstatus *lksb, int flags, 542 struct dlm_lockstatus *lksb, int flags,
543 const char *name, dlm_astlockfunc_t *ast, void *data, 543 const char *name, int namelen, dlm_astlockfunc_t *ast,
544 dlm_bastlockfunc_t *bast) 544 void *data, dlm_bastlockfunc_t *bast)
545{ 545{
546 enum dlm_status status; 546 enum dlm_status status;
547 struct dlm_lock_resource *res = NULL; 547 struct dlm_lock_resource *res = NULL;
@@ -571,7 +571,7 @@ enum dlm_status dlmlock(struct dlm_ctxt *dlm, int mode,
571 recovery = (flags & LKM_RECOVERY); 571 recovery = (flags & LKM_RECOVERY);
572 572
573 if (recovery && 573 if (recovery &&
574 (!dlm_is_recovery_lock(name, strlen(name)) || convert) ) { 574 (!dlm_is_recovery_lock(name, namelen) || convert) ) {
575 dlm_error(status); 575 dlm_error(status);
576 goto error; 576 goto error;
577 } 577 }
@@ -643,7 +643,7 @@ retry_convert:
643 } 643 }
644 644
645 status = DLM_IVBUFLEN; 645 status = DLM_IVBUFLEN;
646 if (strlen(name) > DLM_LOCKID_NAME_MAX || strlen(name) < 1) { 646 if (namelen > DLM_LOCKID_NAME_MAX || namelen < 1) {
647 dlm_error(status); 647 dlm_error(status);
648 goto error; 648 goto error;
649 } 649 }
@@ -659,7 +659,7 @@ retry_convert:
659 dlm_wait_for_recovery(dlm); 659 dlm_wait_for_recovery(dlm);
660 660
661 /* find or create the lock resource */ 661 /* find or create the lock resource */
662 res = dlm_get_lock_resource(dlm, name, flags); 662 res = dlm_get_lock_resource(dlm, name, namelen, flags);
663 if (!res) { 663 if (!res) {
664 status = DLM_IVLOCKID; 664 status = DLM_IVLOCKID;
665 dlm_error(status); 665 dlm_error(status);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 9503240ef0e5..f784177b6241 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -740,6 +740,7 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
740 */ 740 */
741struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, 741struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
742 const char *lockid, 742 const char *lockid,
743 int namelen,
743 int flags) 744 int flags)
744{ 745{
745 struct dlm_lock_resource *tmpres=NULL, *res=NULL; 746 struct dlm_lock_resource *tmpres=NULL, *res=NULL;
@@ -748,13 +749,12 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
748 int blocked = 0; 749 int blocked = 0;
749 int ret, nodenum; 750 int ret, nodenum;
750 struct dlm_node_iter iter; 751 struct dlm_node_iter iter;
751 unsigned int namelen, hash; 752 unsigned int hash;
752 int tries = 0; 753 int tries = 0;
753 int bit, wait_on_recovery = 0; 754 int bit, wait_on_recovery = 0;
754 755
755 BUG_ON(!lockid); 756 BUG_ON(!lockid);
756 757
757 namelen = strlen(lockid);
758 hash = dlm_lockid_hash(lockid, namelen); 758 hash = dlm_lockid_hash(lockid, namelen);
759 759
760 mlog(0, "get lockres %s (len %d)\n", lockid, namelen); 760 mlog(0, "get lockres %s (len %d)\n", lockid, namelen);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 594745fab0b5..9d950d7cea38 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2285,7 +2285,8 @@ again:
2285 memset(&lksb, 0, sizeof(lksb)); 2285 memset(&lksb, 0, sizeof(lksb));
2286 2286
2287 ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY, 2287 ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY,
2288 DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast); 2288 DLM_RECOVERY_LOCK_NAME, DLM_RECOVERY_LOCK_NAME_LEN,
2289 dlm_reco_ast, dlm, dlm_reco_bast);
2289 2290
2290 mlog(0, "%s: dlmlock($RECOVERY) returned %d, lksb=%d\n", 2291 mlog(0, "%s: dlmlock($RECOVERY) returned %d, lksb=%d\n",
2291 dlm->name, ret, lksb.status); 2292 dlm->name, ret, lksb.status);
diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c
index e641b084b343..eead48bbfac6 100644
--- a/fs/ocfs2/dlm/userdlm.c
+++ b/fs/ocfs2/dlm/userdlm.c
@@ -102,10 +102,10 @@ static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
102 spin_unlock(&lockres->l_lock); 102 spin_unlock(&lockres->l_lock);
103} 103}
104 104
105#define user_log_dlm_error(_func, _stat, _lockres) do { \ 105#define user_log_dlm_error(_func, _stat, _lockres) do { \
106 mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \ 106 mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \
107 "resource %s: %s\n", dlm_errname(_stat), _func, \ 107 "resource %.*s: %s\n", dlm_errname(_stat), _func, \
108 _lockres->l_name, dlm_errmsg(_stat)); \ 108 _lockres->l_namelen, _lockres->l_name, dlm_errmsg(_stat)); \
109} while (0) 109} while (0)
110 110
111/* WARNING: This function lives in a world where the only three lock 111/* WARNING: This function lives in a world where the only three lock
@@ -127,21 +127,22 @@ static void user_ast(void *opaque)
127 struct user_lock_res *lockres = opaque; 127 struct user_lock_res *lockres = opaque;
128 struct dlm_lockstatus *lksb; 128 struct dlm_lockstatus *lksb;
129 129
130 mlog(0, "AST fired for lockres %s\n", lockres->l_name); 130 mlog(0, "AST fired for lockres %.*s\n", lockres->l_namelen,
131 lockres->l_name);
131 132
132 spin_lock(&lockres->l_lock); 133 spin_lock(&lockres->l_lock);
133 134
134 lksb = &(lockres->l_lksb); 135 lksb = &(lockres->l_lksb);
135 if (lksb->status != DLM_NORMAL) { 136 if (lksb->status != DLM_NORMAL) {
136 mlog(ML_ERROR, "lksb status value of %u on lockres %s\n", 137 mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n",
137 lksb->status, lockres->l_name); 138 lksb->status, lockres->l_namelen, lockres->l_name);
138 spin_unlock(&lockres->l_lock); 139 spin_unlock(&lockres->l_lock);
139 return; 140 return;
140 } 141 }
141 142
142 mlog_bug_on_msg(lockres->l_requested == LKM_IVMODE, 143 mlog_bug_on_msg(lockres->l_requested == LKM_IVMODE,
143 "Lockres %s, requested ivmode. flags 0x%x\n", 144 "Lockres %.*s, requested ivmode. flags 0x%x\n",
144 lockres->l_name, lockres->l_flags); 145 lockres->l_namelen, lockres->l_name, lockres->l_flags);
145 146
146 /* we're downconverting. */ 147 /* we're downconverting. */
147 if (lockres->l_requested < lockres->l_level) { 148 if (lockres->l_requested < lockres->l_level) {
@@ -213,8 +214,8 @@ static void user_bast(void *opaque, int level)
213{ 214{
214 struct user_lock_res *lockres = opaque; 215 struct user_lock_res *lockres = opaque;
215 216
216 mlog(0, "Blocking AST fired for lockres %s. Blocking level %d\n", 217 mlog(0, "Blocking AST fired for lockres %.*s. Blocking level %d\n",
217 lockres->l_name, level); 218 lockres->l_namelen, lockres->l_name, level);
218 219
219 spin_lock(&lockres->l_lock); 220 spin_lock(&lockres->l_lock);
220 lockres->l_flags |= USER_LOCK_BLOCKED; 221 lockres->l_flags |= USER_LOCK_BLOCKED;
@@ -231,7 +232,8 @@ static void user_unlock_ast(void *opaque, enum dlm_status status)
231{ 232{
232 struct user_lock_res *lockres = opaque; 233 struct user_lock_res *lockres = opaque;
233 234
234 mlog(0, "UNLOCK AST called on lock %s\n", lockres->l_name); 235 mlog(0, "UNLOCK AST called on lock %.*s\n", lockres->l_namelen,
236 lockres->l_name);
235 237
236 if (status != DLM_NORMAL && status != DLM_CANCELGRANT) 238 if (status != DLM_NORMAL && status != DLM_CANCELGRANT)
237 mlog(ML_ERROR, "Dlm returns status %d\n", status); 239 mlog(ML_ERROR, "Dlm returns status %d\n", status);
@@ -244,8 +246,6 @@ static void user_unlock_ast(void *opaque, enum dlm_status status)
244 && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) { 246 && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) {
245 lockres->l_level = LKM_IVMODE; 247 lockres->l_level = LKM_IVMODE;
246 } else if (status == DLM_CANCELGRANT) { 248 } else if (status == DLM_CANCELGRANT) {
247 mlog(0, "Lock %s, cancel fails, flags 0x%x\n",
248 lockres->l_name, lockres->l_flags);
249 /* We tried to cancel a convert request, but it was 249 /* We tried to cancel a convert request, but it was
250 * already granted. Don't clear the busy flag - the 250 * already granted. Don't clear the busy flag - the
251 * ast should've done this already. */ 251 * ast should've done this already. */
@@ -255,8 +255,6 @@ static void user_unlock_ast(void *opaque, enum dlm_status status)
255 } else { 255 } else {
256 BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); 256 BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL));
257 /* Cancel succeeded, we want to re-queue */ 257 /* Cancel succeeded, we want to re-queue */
258 mlog(0, "Lock %s, cancel succeeds, flags 0x%x\n",
259 lockres->l_name, lockres->l_flags);
260 lockres->l_requested = LKM_IVMODE; /* cancel an 258 lockres->l_requested = LKM_IVMODE; /* cancel an
261 * upconvert 259 * upconvert
262 * request. */ 260 * request. */
@@ -287,13 +285,14 @@ static void user_dlm_unblock_lock(void *opaque)
287 struct user_lock_res *lockres = (struct user_lock_res *) opaque; 285 struct user_lock_res *lockres = (struct user_lock_res *) opaque;
288 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); 286 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
289 287
290 mlog(0, "processing lockres %s\n", lockres->l_name); 288 mlog(0, "processing lockres %.*s\n", lockres->l_namelen,
289 lockres->l_name);
291 290
292 spin_lock(&lockres->l_lock); 291 spin_lock(&lockres->l_lock);
293 292
294 mlog_bug_on_msg(!(lockres->l_flags & USER_LOCK_QUEUED), 293 mlog_bug_on_msg(!(lockres->l_flags & USER_LOCK_QUEUED),
295 "Lockres %s, flags 0x%x\n", 294 "Lockres %.*s, flags 0x%x\n",
296 lockres->l_name, lockres->l_flags); 295 lockres->l_namelen, lockres->l_name, lockres->l_flags);
297 296
298 /* notice that we don't clear USER_LOCK_BLOCKED here. If it's 297 /* notice that we don't clear USER_LOCK_BLOCKED here. If it's
299 * set, we want user_ast clear it. */ 298 * set, we want user_ast clear it. */
@@ -305,22 +304,16 @@ static void user_dlm_unblock_lock(void *opaque)
305 * flag, and finally we might get another bast which re-queues 304 * flag, and finally we might get another bast which re-queues
306 * us before our ast for the downconvert is called. */ 305 * us before our ast for the downconvert is called. */
307 if (!(lockres->l_flags & USER_LOCK_BLOCKED)) { 306 if (!(lockres->l_flags & USER_LOCK_BLOCKED)) {
308 mlog(0, "Lockres %s, flags 0x%x: queued but not blocking\n",
309 lockres->l_name, lockres->l_flags);
310 spin_unlock(&lockres->l_lock); 307 spin_unlock(&lockres->l_lock);
311 goto drop_ref; 308 goto drop_ref;
312 } 309 }
313 310
314 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { 311 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
315 mlog(0, "lock is in teardown so we do nothing\n");
316 spin_unlock(&lockres->l_lock); 312 spin_unlock(&lockres->l_lock);
317 goto drop_ref; 313 goto drop_ref;
318 } 314 }
319 315
320 if (lockres->l_flags & USER_LOCK_BUSY) { 316 if (lockres->l_flags & USER_LOCK_BUSY) {
321 mlog(0, "Cancel lock %s, flags 0x%x\n",
322 lockres->l_name, lockres->l_flags);
323
324 if (lockres->l_flags & USER_LOCK_IN_CANCEL) { 317 if (lockres->l_flags & USER_LOCK_IN_CANCEL) {
325 spin_unlock(&lockres->l_lock); 318 spin_unlock(&lockres->l_lock);
326 goto drop_ref; 319 goto drop_ref;
@@ -372,6 +365,7 @@ static void user_dlm_unblock_lock(void *opaque)
372 &lockres->l_lksb, 365 &lockres->l_lksb,
373 LKM_CONVERT|LKM_VALBLK, 366 LKM_CONVERT|LKM_VALBLK,
374 lockres->l_name, 367 lockres->l_name,
368 lockres->l_namelen,
375 user_ast, 369 user_ast,
376 lockres, 370 lockres,
377 user_bast); 371 user_bast);
@@ -420,16 +414,16 @@ int user_dlm_cluster_lock(struct user_lock_res *lockres,
420 414
421 if (level != LKM_EXMODE && 415 if (level != LKM_EXMODE &&
422 level != LKM_PRMODE) { 416 level != LKM_PRMODE) {
423 mlog(ML_ERROR, "lockres %s: invalid request!\n", 417 mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
424 lockres->l_name); 418 lockres->l_namelen, lockres->l_name);
425 status = -EINVAL; 419 status = -EINVAL;
426 goto bail; 420 goto bail;
427 } 421 }
428 422
429 mlog(0, "lockres %s: asking for %s lock, passed flags = 0x%x\n", 423 mlog(0, "lockres %.*s: asking for %s lock, passed flags = 0x%x\n",
430 lockres->l_name, 424 lockres->l_namelen, lockres->l_name,
431 (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE", 425 (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE",
432 lkm_flags); 426 lkm_flags);
433 427
434again: 428again:
435 if (signal_pending(current)) { 429 if (signal_pending(current)) {
@@ -474,15 +468,13 @@ again:
474 BUG_ON(level == LKM_IVMODE); 468 BUG_ON(level == LKM_IVMODE);
475 BUG_ON(level == LKM_NLMODE); 469 BUG_ON(level == LKM_NLMODE);
476 470
477 mlog(0, "lock %s, get lock from %d to level = %d\n",
478 lockres->l_name, lockres->l_level, level);
479
480 /* call dlm_lock to upgrade lock now */ 471 /* call dlm_lock to upgrade lock now */
481 status = dlmlock(dlm, 472 status = dlmlock(dlm,
482 level, 473 level,
483 &lockres->l_lksb, 474 &lockres->l_lksb,
484 local_flags, 475 local_flags,
485 lockres->l_name, 476 lockres->l_name,
477 lockres->l_namelen,
486 user_ast, 478 user_ast,
487 lockres, 479 lockres,
488 user_bast); 480 user_bast);
@@ -498,9 +490,6 @@ again:
498 goto bail; 490 goto bail;
499 } 491 }
500 492
501 mlog(0, "lock %s, successfull return from dlmlock\n",
502 lockres->l_name);
503
504 user_wait_on_busy_lock(lockres); 493 user_wait_on_busy_lock(lockres);
505 goto again; 494 goto again;
506 } 495 }
@@ -508,9 +497,6 @@ again:
508 user_dlm_inc_holders(lockres, level); 497 user_dlm_inc_holders(lockres, level);
509 spin_unlock(&lockres->l_lock); 498 spin_unlock(&lockres->l_lock);
510 499
511 mlog(0, "lockres %s: Got %s lock!\n", lockres->l_name,
512 (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE");
513
514 status = 0; 500 status = 0;
515bail: 501bail:
516 return status; 502 return status;
@@ -538,13 +524,11 @@ void user_dlm_cluster_unlock(struct user_lock_res *lockres,
538{ 524{
539 if (level != LKM_EXMODE && 525 if (level != LKM_EXMODE &&
540 level != LKM_PRMODE) { 526 level != LKM_PRMODE) {
541 mlog(ML_ERROR, "lockres %s: invalid request!\n", lockres->l_name); 527 mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
528 lockres->l_namelen, lockres->l_name);
542 return; 529 return;
543 } 530 }
544 531
545 mlog(0, "lockres %s: dropping %s lock\n", lockres->l_name,
546 (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE");
547
548 spin_lock(&lockres->l_lock); 532 spin_lock(&lockres->l_lock);
549 user_dlm_dec_holders(lockres, level); 533 user_dlm_dec_holders(lockres, level);
550 __user_dlm_cond_queue_lockres(lockres); 534 __user_dlm_cond_queue_lockres(lockres);
@@ -602,6 +586,7 @@ void user_dlm_lock_res_init(struct user_lock_res *lockres,
602 memcpy(lockres->l_name, 586 memcpy(lockres->l_name,
603 dentry->d_name.name, 587 dentry->d_name.name,
604 dentry->d_name.len); 588 dentry->d_name.len);
589 lockres->l_namelen = dentry->d_name.len;
605} 590}
606 591
607int user_dlm_destroy_lock(struct user_lock_res *lockres) 592int user_dlm_destroy_lock(struct user_lock_res *lockres)
@@ -609,11 +594,10 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres)
609 int status = -EBUSY; 594 int status = -EBUSY;
610 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); 595 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
611 596
612 mlog(0, "asked to destroy %s\n", lockres->l_name); 597 mlog(0, "asked to destroy %.*s\n", lockres->l_namelen, lockres->l_name);
613 598
614 spin_lock(&lockres->l_lock); 599 spin_lock(&lockres->l_lock);
615 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { 600 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
616 mlog(0, "Lock is already torn down\n");
617 spin_unlock(&lockres->l_lock); 601 spin_unlock(&lockres->l_lock);
618 return 0; 602 return 0;
619 } 603 }
@@ -623,8 +607,6 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres)
623 while (lockres->l_flags & USER_LOCK_BUSY) { 607 while (lockres->l_flags & USER_LOCK_BUSY) {
624 spin_unlock(&lockres->l_lock); 608 spin_unlock(&lockres->l_lock);
625 609
626 mlog(0, "lock %s is busy\n", lockres->l_name);
627
628 user_wait_on_busy_lock(lockres); 610 user_wait_on_busy_lock(lockres);
629 611
630 spin_lock(&lockres->l_lock); 612 spin_lock(&lockres->l_lock);
@@ -632,14 +614,12 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres)
632 614
633 if (lockres->l_ro_holders || lockres->l_ex_holders) { 615 if (lockres->l_ro_holders || lockres->l_ex_holders) {
634 spin_unlock(&lockres->l_lock); 616 spin_unlock(&lockres->l_lock);
635 mlog(0, "lock %s has holders\n", lockres->l_name);
636 goto bail; 617 goto bail;
637 } 618 }
638 619
639 status = 0; 620 status = 0;
640 if (!(lockres->l_flags & USER_LOCK_ATTACHED)) { 621 if (!(lockres->l_flags & USER_LOCK_ATTACHED)) {
641 spin_unlock(&lockres->l_lock); 622 spin_unlock(&lockres->l_lock);
642 mlog(0, "lock %s is not attached\n", lockres->l_name);
643 goto bail; 623 goto bail;
644 } 624 }
645 625
@@ -647,7 +627,6 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres)
647 lockres->l_flags |= USER_LOCK_BUSY; 627 lockres->l_flags |= USER_LOCK_BUSY;
648 spin_unlock(&lockres->l_lock); 628 spin_unlock(&lockres->l_lock);
649 629
650 mlog(0, "unlocking lockres %s\n", lockres->l_name);
651 status = dlmunlock(dlm, 630 status = dlmunlock(dlm,
652 &lockres->l_lksb, 631 &lockres->l_lksb,
653 LKM_VALBLK, 632 LKM_VALBLK,
diff --git a/fs/ocfs2/dlm/userdlm.h b/fs/ocfs2/dlm/userdlm.h
index 04178bc40b76..c400e93bbf79 100644
--- a/fs/ocfs2/dlm/userdlm.h
+++ b/fs/ocfs2/dlm/userdlm.h
@@ -53,6 +53,7 @@ struct user_lock_res {
53 53
54#define USER_DLM_LOCK_ID_MAX_LEN 32 54#define USER_DLM_LOCK_ID_MAX_LEN 32
55 char l_name[USER_DLM_LOCK_ID_MAX_LEN]; 55 char l_name[USER_DLM_LOCK_ID_MAX_LEN];
56 int l_namelen;
56 int l_level; 57 int l_level;
57 unsigned int l_ro_holders; 58 unsigned int l_ro_holders;
58 unsigned int l_ex_holders; 59 unsigned int l_ex_holders;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 151b41781eab..8801e41afe80 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -46,6 +46,7 @@
46#include "ocfs2.h" 46#include "ocfs2.h"
47 47
48#include "alloc.h" 48#include "alloc.h"
49#include "dcache.h"
49#include "dlmglue.h" 50#include "dlmglue.h"
50#include "extent_map.h" 51#include "extent_map.h"
51#include "heartbeat.h" 52#include "heartbeat.h"
@@ -66,78 +67,161 @@ struct ocfs2_mask_waiter {
66 unsigned long mw_goal; 67 unsigned long mw_goal;
67}; 68};
68 69
69static void ocfs2_inode_ast_func(void *opaque); 70static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
70static void ocfs2_inode_bast_func(void *opaque, 71static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres);
71 int level);
72static void ocfs2_super_ast_func(void *opaque);
73static void ocfs2_super_bast_func(void *opaque,
74 int level);
75static void ocfs2_rename_ast_func(void *opaque);
76static void ocfs2_rename_bast_func(void *opaque,
77 int level);
78
79/* so far, all locks have gotten along with the same unlock ast */
80static void ocfs2_unlock_ast_func(void *opaque,
81 enum dlm_status status);
82static int ocfs2_do_unblock_meta(struct inode *inode,
83 int *requeue);
84static int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres,
85 int *requeue);
86static int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
87 int *requeue);
88static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
89 int *requeue);
90static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
91 int *requeue);
92typedef void (ocfs2_convert_worker_t)(struct ocfs2_lock_res *, int);
93static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
94 struct ocfs2_lock_res *lockres,
95 int *requeue,
96 ocfs2_convert_worker_t *worker);
97 72
73/*
74 * Return value from ->downconvert_worker functions.
75 *
76 * These control the precise actions of ocfs2_unblock_lock()
77 * and ocfs2_process_blocked_lock()
78 *
79 */
80enum ocfs2_unblock_action {
81 UNBLOCK_CONTINUE = 0, /* Continue downconvert */
82 UNBLOCK_CONTINUE_POST = 1, /* Continue downconvert, fire
83 * ->post_unlock callback */
84 UNBLOCK_STOP_POST = 2, /* Do not downconvert, fire
85 * ->post_unlock() callback. */
86};
87
88struct ocfs2_unblock_ctl {
89 int requeue;
90 enum ocfs2_unblock_action unblock_action;
91};
92
93static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
94 int new_level);
95static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres);
96
97static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
98 int blocking);
99
100static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
101 int blocking);
102
103static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
104 struct ocfs2_lock_res *lockres);
105
106/*
107 * OCFS2 Lock Resource Operations
108 *
109 * These fine tune the behavior of the generic dlmglue locking infrastructure.
110 *
111 * The most basic of lock types can point ->l_priv to their respective
112 * struct ocfs2_super and allow the default actions to manage things.
113 *
114 * Right now, each lock type also needs to implement an init function,
115 * and trivial lock/unlock wrappers. ocfs2_simple_drop_lockres()
116 * should be called when the lock is no longer needed (i.e., object
117 * destruction time).
118 */
98struct ocfs2_lock_res_ops { 119struct ocfs2_lock_res_ops {
99 void (*ast)(void *); 120 /*
100 void (*bast)(void *, int); 121 * Translate an ocfs2_lock_res * into an ocfs2_super *. Define
101 void (*unlock_ast)(void *, enum dlm_status); 122 * this callback if ->l_priv is not an ocfs2_super pointer
102 int (*unblock)(struct ocfs2_lock_res *, int *); 123 */
124 struct ocfs2_super * (*get_osb)(struct ocfs2_lock_res *);
125
126 /*
127 * Optionally called in the downconvert (or "vote") thread
128 * after a successful downconvert. The lockres will not be
129 * referenced after this callback is called, so it is safe to
130 * free memory, etc.
131 *
132 * The exact semantics of when this is called are controlled
133 * by ->downconvert_worker()
134 */
135 void (*post_unlock)(struct ocfs2_super *, struct ocfs2_lock_res *);
136
137 /*
138 * Allow a lock type to add checks to determine whether it is
139 * safe to downconvert a lock. Return 0 to re-queue the
140 * downconvert at a later time, nonzero to continue.
141 *
142 * For most locks, the default checks that there are no
143 * incompatible holders are sufficient.
144 *
145 * Called with the lockres spinlock held.
146 */
147 int (*check_downconvert)(struct ocfs2_lock_res *, int);
148
149 /*
150 * Allows a lock type to populate the lock value block. This
151 * is called on downconvert, and when we drop a lock.
152 *
153 * Locks that want to use this should set LOCK_TYPE_USES_LVB
154 * in the flags field.
155 *
156 * Called with the lockres spinlock held.
157 */
158 void (*set_lvb)(struct ocfs2_lock_res *);
159
160 /*
161 * Called from the downconvert thread when it is determined
162 * that a lock will be downconverted. This is called without
163 * any locks held so the function can do work that might
164 * schedule (syncing out data, etc).
165 *
166 * This should return any one of the ocfs2_unblock_action
167 * values, depending on what it wants the thread to do.
168 */
169 int (*downconvert_worker)(struct ocfs2_lock_res *, int);
170
171 /*
172 * LOCK_TYPE_* flags which describe the specific requirements
173 * of a lock type. Descriptions of each individual flag follow.
174 */
175 int flags;
103}; 176};
104 177
178/*
179 * Some locks want to "refresh" potentially stale data when a
180 * meaningful (PRMODE or EXMODE) lock level is first obtained. If this
181 * flag is set, the OCFS2_LOCK_NEEDS_REFRESH flag will be set on the
182 * individual lockres l_flags member from the ast function. It is
183 * expected that the locking wrapper will clear the
184 * OCFS2_LOCK_NEEDS_REFRESH flag when done.
185 */
186#define LOCK_TYPE_REQUIRES_REFRESH 0x1
187
188/*
189 * Indicate that a lock type makes use of the lock value block. The
190 * ->set_lvb lock type callback must be defined.
191 */
192#define LOCK_TYPE_USES_LVB 0x2
193
105static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = { 194static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
106 .ast = ocfs2_inode_ast_func, 195 .get_osb = ocfs2_get_inode_osb,
107 .bast = ocfs2_inode_bast_func, 196 .flags = 0,
108 .unlock_ast = ocfs2_unlock_ast_func,
109 .unblock = ocfs2_unblock_inode_lock,
110}; 197};
111 198
112static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = { 199static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = {
113 .ast = ocfs2_inode_ast_func, 200 .get_osb = ocfs2_get_inode_osb,
114 .bast = ocfs2_inode_bast_func, 201 .check_downconvert = ocfs2_check_meta_downconvert,
115 .unlock_ast = ocfs2_unlock_ast_func, 202 .set_lvb = ocfs2_set_meta_lvb,
116 .unblock = ocfs2_unblock_meta, 203 .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
117}; 204};
118 205
119static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
120 int blocking);
121
122static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = { 206static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = {
123 .ast = ocfs2_inode_ast_func, 207 .get_osb = ocfs2_get_inode_osb,
124 .bast = ocfs2_inode_bast_func, 208 .downconvert_worker = ocfs2_data_convert_worker,
125 .unlock_ast = ocfs2_unlock_ast_func, 209 .flags = 0,
126 .unblock = ocfs2_unblock_data,
127}; 210};
128 211
129static struct ocfs2_lock_res_ops ocfs2_super_lops = { 212static struct ocfs2_lock_res_ops ocfs2_super_lops = {
130 .ast = ocfs2_super_ast_func, 213 .flags = LOCK_TYPE_REQUIRES_REFRESH,
131 .bast = ocfs2_super_bast_func,
132 .unlock_ast = ocfs2_unlock_ast_func,
133 .unblock = ocfs2_unblock_osb_lock,
134}; 214};
135 215
136static struct ocfs2_lock_res_ops ocfs2_rename_lops = { 216static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
137 .ast = ocfs2_rename_ast_func, 217 .flags = 0,
138 .bast = ocfs2_rename_bast_func, 218};
139 .unlock_ast = ocfs2_unlock_ast_func, 219
140 .unblock = ocfs2_unblock_osb_lock, 220static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
221 .get_osb = ocfs2_get_dentry_osb,
222 .post_unlock = ocfs2_dentry_post_unlock,
223 .downconvert_worker = ocfs2_dentry_convert_worker,
224 .flags = 0,
141}; 225};
142 226
143static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) 227static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
@@ -147,29 +231,26 @@ static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
147 lockres->l_type == OCFS2_LOCK_TYPE_RW; 231 lockres->l_type == OCFS2_LOCK_TYPE_RW;
148} 232}
149 233
150static inline int ocfs2_is_super_lock(struct ocfs2_lock_res *lockres) 234static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
151{ 235{
152 return lockres->l_type == OCFS2_LOCK_TYPE_SUPER; 236 BUG_ON(!ocfs2_is_inode_lock(lockres));
153}
154 237
155static inline int ocfs2_is_rename_lock(struct ocfs2_lock_res *lockres) 238 return (struct inode *) lockres->l_priv;
156{
157 return lockres->l_type == OCFS2_LOCK_TYPE_RENAME;
158} 239}
159 240
160static inline struct ocfs2_super *ocfs2_lock_res_super(struct ocfs2_lock_res *lockres) 241static inline struct ocfs2_dentry_lock *ocfs2_lock_res_dl(struct ocfs2_lock_res *lockres)
161{ 242{
162 BUG_ON(!ocfs2_is_super_lock(lockres) 243 BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_DENTRY);
163 && !ocfs2_is_rename_lock(lockres));
164 244
165 return (struct ocfs2_super *) lockres->l_priv; 245 return (struct ocfs2_dentry_lock *)lockres->l_priv;
166} 246}
167 247
168static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) 248static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres)
169{ 249{
170 BUG_ON(!ocfs2_is_inode_lock(lockres)); 250 if (lockres->l_ops->get_osb)
251 return lockres->l_ops->get_osb(lockres);
171 252
172 return (struct inode *) lockres->l_priv; 253 return (struct ocfs2_super *)lockres->l_priv;
173} 254}
174 255
175static int ocfs2_lock_create(struct ocfs2_super *osb, 256static int ocfs2_lock_create(struct ocfs2_super *osb,
@@ -200,25 +281,6 @@ static int ocfs2_meta_lock_update(struct inode *inode,
200 struct buffer_head **bh); 281 struct buffer_head **bh);
201static void ocfs2_drop_osb_locks(struct ocfs2_super *osb); 282static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
202static inline int ocfs2_highest_compat_lock_level(int level); 283static inline int ocfs2_highest_compat_lock_level(int level);
203static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
204 struct ocfs2_lock_res *lockres,
205 int new_level);
206
207static char *ocfs2_lock_type_strings[] = {
208 [OCFS2_LOCK_TYPE_META] = "Meta",
209 [OCFS2_LOCK_TYPE_DATA] = "Data",
210 [OCFS2_LOCK_TYPE_SUPER] = "Super",
211 [OCFS2_LOCK_TYPE_RENAME] = "Rename",
212 /* Need to differntiate from [R]ename.. serializing writes is the
213 * important job it does, anyway. */
214 [OCFS2_LOCK_TYPE_RW] = "Write/Read",
215};
216
217static char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
218{
219 mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type);
220 return ocfs2_lock_type_strings[type];
221}
222 284
223static void ocfs2_build_lock_name(enum ocfs2_lock_type type, 285static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
224 u64 blkno, 286 u64 blkno,
@@ -265,13 +327,9 @@ static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res)
265static void ocfs2_lock_res_init_common(struct ocfs2_super *osb, 327static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
266 struct ocfs2_lock_res *res, 328 struct ocfs2_lock_res *res,
267 enum ocfs2_lock_type type, 329 enum ocfs2_lock_type type,
268 u64 blkno,
269 u32 generation,
270 struct ocfs2_lock_res_ops *ops, 330 struct ocfs2_lock_res_ops *ops,
271 void *priv) 331 void *priv)
272{ 332{
273 ocfs2_build_lock_name(type, blkno, generation, res->l_name);
274
275 res->l_type = type; 333 res->l_type = type;
276 res->l_ops = ops; 334 res->l_ops = ops;
277 res->l_priv = priv; 335 res->l_priv = priv;
@@ -299,6 +357,7 @@ void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
299 357
300void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, 358void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
301 enum ocfs2_lock_type type, 359 enum ocfs2_lock_type type,
360 unsigned int generation,
302 struct inode *inode) 361 struct inode *inode)
303{ 362{
304 struct ocfs2_lock_res_ops *ops; 363 struct ocfs2_lock_res_ops *ops;
@@ -319,9 +378,73 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
319 break; 378 break;
320 }; 379 };
321 380
322 ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type, 381 ocfs2_build_lock_name(type, OCFS2_I(inode)->ip_blkno,
323 OCFS2_I(inode)->ip_blkno, 382 generation, res->l_name);
324 inode->i_generation, ops, inode); 383 ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type, ops, inode);
384}
385
386static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres)
387{
388 struct inode *inode = ocfs2_lock_res_inode(lockres);
389
390 return OCFS2_SB(inode->i_sb);
391}
392
393static __u64 ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres)
394{
395 __be64 inode_blkno_be;
396
397 memcpy(&inode_blkno_be, &lockres->l_name[OCFS2_DENTRY_LOCK_INO_START],
398 sizeof(__be64));
399
400 return be64_to_cpu(inode_blkno_be);
401}
402
403static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres)
404{
405 struct ocfs2_dentry_lock *dl = lockres->l_priv;
406
407 return OCFS2_SB(dl->dl_inode->i_sb);
408}
409
410void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
411 u64 parent, struct inode *inode)
412{
413 int len;
414 u64 inode_blkno = OCFS2_I(inode)->ip_blkno;
415 __be64 inode_blkno_be = cpu_to_be64(inode_blkno);
416 struct ocfs2_lock_res *lockres = &dl->dl_lockres;
417
418 ocfs2_lock_res_init_once(lockres);
419
420 /*
421 * Unfortunately, the standard lock naming scheme won't work
422 * here because we have two 16 byte values to use. Instead,
423 * we'll stuff the inode number as a binary value. We still
424 * want error prints to show something without garbling the
425 * display, so drop a null byte in there before the inode
426 * number. A future version of OCFS2 will likely use all
427 * binary lock names. The stringified names have been a
428 * tremendous aid in debugging, but now that the debugfs
429 * interface exists, we can mangle things there if need be.
430 *
431 * NOTE: We also drop the standard "pad" value (the total lock
432 * name size stays the same though - the last part is all
433 * zeros due to the memset in ocfs2_lock_res_init_once()
434 */
435 len = snprintf(lockres->l_name, OCFS2_DENTRY_LOCK_INO_START,
436 "%c%016llx",
437 ocfs2_lock_type_char(OCFS2_LOCK_TYPE_DENTRY),
438 (long long)parent);
439
440 BUG_ON(len != (OCFS2_DENTRY_LOCK_INO_START - 1));
441
442 memcpy(&lockres->l_name[OCFS2_DENTRY_LOCK_INO_START], &inode_blkno_be,
443 sizeof(__be64));
444
445 ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres,
446 OCFS2_LOCK_TYPE_DENTRY, &ocfs2_dentry_lops,
447 dl);
325} 448}
326 449
327static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res, 450static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res,
@@ -330,8 +453,9 @@ static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res,
330 /* Superblock lockres doesn't come from a slab so we call init 453 /* Superblock lockres doesn't come from a slab so we call init
331 * once on it manually. */ 454 * once on it manually. */
332 ocfs2_lock_res_init_once(res); 455 ocfs2_lock_res_init_once(res);
456 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_SUPER, OCFS2_SUPER_BLOCK_BLKNO,
457 0, res->l_name);
333 ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_SUPER, 458 ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_SUPER,
334 OCFS2_SUPER_BLOCK_BLKNO, 0,
335 &ocfs2_super_lops, osb); 459 &ocfs2_super_lops, osb);
336} 460}
337 461
@@ -341,7 +465,8 @@ static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
341 /* Rename lockres doesn't come from a slab so we call init 465 /* Rename lockres doesn't come from a slab so we call init
342 * once on it manually. */ 466 * once on it manually. */
343 ocfs2_lock_res_init_once(res); 467 ocfs2_lock_res_init_once(res);
344 ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME, 0, 0, 468 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_RENAME, 0, 0, res->l_name);
469 ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME,
345 &ocfs2_rename_lops, osb); 470 &ocfs2_rename_lops, osb);
346} 471}
347 472
@@ -495,7 +620,8 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo
495 * information is already up to data. Convert from NL to 620 * information is already up to data. Convert from NL to
496 * *anything* however should mark ourselves as needing an 621 * *anything* however should mark ourselves as needing an
497 * update */ 622 * update */
498 if (lockres->l_level == LKM_NLMODE) 623 if (lockres->l_level == LKM_NLMODE &&
624 lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
499 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); 625 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
500 626
501 lockres->l_level = lockres->l_requested; 627 lockres->l_level = lockres->l_requested;
@@ -512,7 +638,8 @@ static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *loc
512 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); 638 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
513 639
514 if (lockres->l_requested > LKM_NLMODE && 640 if (lockres->l_requested > LKM_NLMODE &&
515 !(lockres->l_flags & OCFS2_LOCK_LOCAL)) 641 !(lockres->l_flags & OCFS2_LOCK_LOCAL) &&
642 lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
516 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); 643 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
517 644
518 lockres->l_level = lockres->l_requested; 645 lockres->l_level = lockres->l_requested;
@@ -522,68 +649,6 @@ static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *loc
522 mlog_exit_void(); 649 mlog_exit_void();
523} 650}
524 651
525static void ocfs2_inode_ast_func(void *opaque)
526{
527 struct ocfs2_lock_res *lockres = opaque;
528 struct inode *inode;
529 struct dlm_lockstatus *lksb;
530 unsigned long flags;
531
532 mlog_entry_void();
533
534 inode = ocfs2_lock_res_inode(lockres);
535
536 mlog(0, "AST fired for inode %llu, l_action = %u, type = %s\n",
537 (unsigned long long)OCFS2_I(inode)->ip_blkno, lockres->l_action,
538 ocfs2_lock_type_string(lockres->l_type));
539
540 BUG_ON(!ocfs2_is_inode_lock(lockres));
541
542 spin_lock_irqsave(&lockres->l_lock, flags);
543
544 lksb = &(lockres->l_lksb);
545 if (lksb->status != DLM_NORMAL) {
546 mlog(ML_ERROR, "ocfs2_inode_ast_func: lksb status value of %u "
547 "on inode %llu\n", lksb->status,
548 (unsigned long long)OCFS2_I(inode)->ip_blkno);
549 spin_unlock_irqrestore(&lockres->l_lock, flags);
550 mlog_exit_void();
551 return;
552 }
553
554 switch(lockres->l_action) {
555 case OCFS2_AST_ATTACH:
556 ocfs2_generic_handle_attach_action(lockres);
557 lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL);
558 break;
559 case OCFS2_AST_CONVERT:
560 ocfs2_generic_handle_convert_action(lockres);
561 break;
562 case OCFS2_AST_DOWNCONVERT:
563 ocfs2_generic_handle_downconvert_action(lockres);
564 break;
565 default:
566 mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u "
567 "lockres flags = 0x%lx, unlock action: %u\n",
568 lockres->l_name, lockres->l_action, lockres->l_flags,
569 lockres->l_unlock_action);
570
571 BUG();
572 }
573
574 /* data and rw locking ignores refresh flag for now. */
575 if (lockres->l_type != OCFS2_LOCK_TYPE_META)
576 lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
577
578 /* set it to something invalid so if we get called again we
579 * can catch it. */
580 lockres->l_action = OCFS2_AST_INVALID;
581 spin_unlock_irqrestore(&lockres->l_lock, flags);
582 wake_up(&lockres->l_event);
583
584 mlog_exit_void();
585}
586
587static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, 652static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
588 int level) 653 int level)
589{ 654{
@@ -610,54 +675,33 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
610 return needs_downconvert; 675 return needs_downconvert;
611} 676}
612 677
613static void ocfs2_generic_bast_func(struct ocfs2_super *osb, 678static void ocfs2_blocking_ast(void *opaque, int level)
614 struct ocfs2_lock_res *lockres,
615 int level)
616{ 679{
680 struct ocfs2_lock_res *lockres = opaque;
681 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
617 int needs_downconvert; 682 int needs_downconvert;
618 unsigned long flags; 683 unsigned long flags;
619 684
620 mlog_entry_void();
621
622 BUG_ON(level <= LKM_NLMODE); 685 BUG_ON(level <= LKM_NLMODE);
623 686
687 mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n",
688 lockres->l_name, level, lockres->l_level,
689 ocfs2_lock_type_string(lockres->l_type));
690
624 spin_lock_irqsave(&lockres->l_lock, flags); 691 spin_lock_irqsave(&lockres->l_lock, flags);
625 needs_downconvert = ocfs2_generic_handle_bast(lockres, level); 692 needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
626 if (needs_downconvert) 693 if (needs_downconvert)
627 ocfs2_schedule_blocked_lock(osb, lockres); 694 ocfs2_schedule_blocked_lock(osb, lockres);
628 spin_unlock_irqrestore(&lockres->l_lock, flags); 695 spin_unlock_irqrestore(&lockres->l_lock, flags);
629 696
630 ocfs2_kick_vote_thread(osb);
631
632 wake_up(&lockres->l_event); 697 wake_up(&lockres->l_event);
633 mlog_exit_void();
634}
635
636static void ocfs2_inode_bast_func(void *opaque, int level)
637{
638 struct ocfs2_lock_res *lockres = opaque;
639 struct inode *inode;
640 struct ocfs2_super *osb;
641 698
642 mlog_entry_void(); 699 ocfs2_kick_vote_thread(osb);
643
644 BUG_ON(!ocfs2_is_inode_lock(lockres));
645
646 inode = ocfs2_lock_res_inode(lockres);
647 osb = OCFS2_SB(inode->i_sb);
648
649 mlog(0, "BAST fired for inode %llu, blocking %d, level %d type %s\n",
650 (unsigned long long)OCFS2_I(inode)->ip_blkno, level,
651 lockres->l_level, ocfs2_lock_type_string(lockres->l_type));
652
653 ocfs2_generic_bast_func(osb, lockres, level);
654
655 mlog_exit_void();
656} 700}
657 701
658static void ocfs2_generic_ast_func(struct ocfs2_lock_res *lockres, 702static void ocfs2_locking_ast(void *opaque)
659 int ignore_refresh)
660{ 703{
704 struct ocfs2_lock_res *lockres = opaque;
661 struct dlm_lockstatus *lksb = &lockres->l_lksb; 705 struct dlm_lockstatus *lksb = &lockres->l_lksb;
662 unsigned long flags; 706 unsigned long flags;
663 707
@@ -673,6 +717,7 @@ static void ocfs2_generic_ast_func(struct ocfs2_lock_res *lockres,
673 switch(lockres->l_action) { 717 switch(lockres->l_action) {
674 case OCFS2_AST_ATTACH: 718 case OCFS2_AST_ATTACH:
675 ocfs2_generic_handle_attach_action(lockres); 719 ocfs2_generic_handle_attach_action(lockres);
720 lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL);
676 break; 721 break;
677 case OCFS2_AST_CONVERT: 722 case OCFS2_AST_CONVERT:
678 ocfs2_generic_handle_convert_action(lockres); 723 ocfs2_generic_handle_convert_action(lockres);
@@ -681,80 +726,19 @@ static void ocfs2_generic_ast_func(struct ocfs2_lock_res *lockres,
681 ocfs2_generic_handle_downconvert_action(lockres); 726 ocfs2_generic_handle_downconvert_action(lockres);
682 break; 727 break;
683 default: 728 default:
729 mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u "
730 "lockres flags = 0x%lx, unlock action: %u\n",
731 lockres->l_name, lockres->l_action, lockres->l_flags,
732 lockres->l_unlock_action);
684 BUG(); 733 BUG();
685 } 734 }
686 735
687 if (ignore_refresh)
688 lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
689
690 /* set it to something invalid so if we get called again we 736 /* set it to something invalid so if we get called again we
691 * can catch it. */ 737 * can catch it. */
692 lockres->l_action = OCFS2_AST_INVALID; 738 lockres->l_action = OCFS2_AST_INVALID;
693 spin_unlock_irqrestore(&lockres->l_lock, flags);
694 739
695 wake_up(&lockres->l_event); 740 wake_up(&lockres->l_event);
696} 741 spin_unlock_irqrestore(&lockres->l_lock, flags);
697
698static void ocfs2_super_ast_func(void *opaque)
699{
700 struct ocfs2_lock_res *lockres = opaque;
701
702 mlog_entry_void();
703 mlog(0, "Superblock AST fired\n");
704
705 BUG_ON(!ocfs2_is_super_lock(lockres));
706 ocfs2_generic_ast_func(lockres, 0);
707
708 mlog_exit_void();
709}
710
711static void ocfs2_super_bast_func(void *opaque,
712 int level)
713{
714 struct ocfs2_lock_res *lockres = opaque;
715 struct ocfs2_super *osb;
716
717 mlog_entry_void();
718 mlog(0, "Superblock BAST fired\n");
719
720 BUG_ON(!ocfs2_is_super_lock(lockres));
721 osb = ocfs2_lock_res_super(lockres);
722 ocfs2_generic_bast_func(osb, lockres, level);
723
724 mlog_exit_void();
725}
726
727static void ocfs2_rename_ast_func(void *opaque)
728{
729 struct ocfs2_lock_res *lockres = opaque;
730
731 mlog_entry_void();
732
733 mlog(0, "Rename AST fired\n");
734
735 BUG_ON(!ocfs2_is_rename_lock(lockres));
736
737 ocfs2_generic_ast_func(lockres, 1);
738
739 mlog_exit_void();
740}
741
742static void ocfs2_rename_bast_func(void *opaque,
743 int level)
744{
745 struct ocfs2_lock_res *lockres = opaque;
746 struct ocfs2_super *osb;
747
748 mlog_entry_void();
749
750 mlog(0, "Rename BAST fired\n");
751
752 BUG_ON(!ocfs2_is_rename_lock(lockres));
753
754 osb = ocfs2_lock_res_super(lockres);
755 ocfs2_generic_bast_func(osb, lockres, level);
756
757 mlog_exit_void();
758} 742}
759 743
760static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, 744static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
@@ -810,9 +794,10 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
810 &lockres->l_lksb, 794 &lockres->l_lksb,
811 dlm_flags, 795 dlm_flags,
812 lockres->l_name, 796 lockres->l_name,
813 lockres->l_ops->ast, 797 OCFS2_LOCK_ID_MAX_LEN - 1,
798 ocfs2_locking_ast,
814 lockres, 799 lockres,
815 lockres->l_ops->bast); 800 ocfs2_blocking_ast);
816 if (status != DLM_NORMAL) { 801 if (status != DLM_NORMAL) {
817 ocfs2_log_dlm_error("dlmlock", status, lockres); 802 ocfs2_log_dlm_error("dlmlock", status, lockres);
818 ret = -EINVAL; 803 ret = -EINVAL;
@@ -930,6 +915,9 @@ static int ocfs2_cluster_lock(struct ocfs2_super *osb,
930 915
931 ocfs2_init_mask_waiter(&mw); 916 ocfs2_init_mask_waiter(&mw);
932 917
918 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
919 lkm_flags |= LKM_VALBLK;
920
933again: 921again:
934 wait = 0; 922 wait = 0;
935 923
@@ -997,11 +985,12 @@ again:
997 status = dlmlock(osb->dlm, 985 status = dlmlock(osb->dlm,
998 level, 986 level,
999 &lockres->l_lksb, 987 &lockres->l_lksb,
1000 lkm_flags|LKM_CONVERT|LKM_VALBLK, 988 lkm_flags|LKM_CONVERT,
1001 lockres->l_name, 989 lockres->l_name,
1002 lockres->l_ops->ast, 990 OCFS2_LOCK_ID_MAX_LEN - 1,
991 ocfs2_locking_ast,
1003 lockres, 992 lockres,
1004 lockres->l_ops->bast); 993 ocfs2_blocking_ast);
1005 if (status != DLM_NORMAL) { 994 if (status != DLM_NORMAL) {
1006 if ((lkm_flags & LKM_NOQUEUE) && 995 if ((lkm_flags & LKM_NOQUEUE) &&
1007 (status == DLM_NOTQUEUED)) 996 (status == DLM_NOTQUEUED))
@@ -1074,18 +1063,21 @@ static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
1074 mlog_exit_void(); 1063 mlog_exit_void();
1075} 1064}
1076 1065
1077static int ocfs2_create_new_inode_lock(struct inode *inode, 1066int ocfs2_create_new_lock(struct ocfs2_super *osb,
1078 struct ocfs2_lock_res *lockres) 1067 struct ocfs2_lock_res *lockres,
1068 int ex,
1069 int local)
1079{ 1070{
1080 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1071 int level = ex ? LKM_EXMODE : LKM_PRMODE;
1081 unsigned long flags; 1072 unsigned long flags;
1073 int lkm_flags = local ? LKM_LOCAL : 0;
1082 1074
1083 spin_lock_irqsave(&lockres->l_lock, flags); 1075 spin_lock_irqsave(&lockres->l_lock, flags);
1084 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); 1076 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
1085 lockres_or_flags(lockres, OCFS2_LOCK_LOCAL); 1077 lockres_or_flags(lockres, OCFS2_LOCK_LOCAL);
1086 spin_unlock_irqrestore(&lockres->l_lock, flags); 1078 spin_unlock_irqrestore(&lockres->l_lock, flags);
1087 1079
1088 return ocfs2_lock_create(osb, lockres, LKM_EXMODE, LKM_LOCAL); 1080 return ocfs2_lock_create(osb, lockres, level, lkm_flags);
1089} 1081}
1090 1082
1091/* Grants us an EX lock on the data and metadata resources, skipping 1083/* Grants us an EX lock on the data and metadata resources, skipping
@@ -1097,6 +1089,7 @@ static int ocfs2_create_new_inode_lock(struct inode *inode,
1097int ocfs2_create_new_inode_locks(struct inode *inode) 1089int ocfs2_create_new_inode_locks(struct inode *inode)
1098{ 1090{
1099 int ret; 1091 int ret;
1092 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1100 1093
1101 BUG_ON(!inode); 1094 BUG_ON(!inode);
1102 BUG_ON(!ocfs2_inode_is_new(inode)); 1095 BUG_ON(!ocfs2_inode_is_new(inode));
@@ -1113,22 +1106,23 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
1113 * on a resource which has an invalid one -- we'll set it 1106 * on a resource which has an invalid one -- we'll set it
1114 * valid when we release the EX. */ 1107 * valid when we release the EX. */
1115 1108
1116 ret = ocfs2_create_new_inode_lock(inode, 1109 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_rw_lockres, 1, 1);
1117 &OCFS2_I(inode)->ip_rw_lockres);
1118 if (ret) { 1110 if (ret) {
1119 mlog_errno(ret); 1111 mlog_errno(ret);
1120 goto bail; 1112 goto bail;
1121 } 1113 }
1122 1114
1123 ret = ocfs2_create_new_inode_lock(inode, 1115 /*
1124 &OCFS2_I(inode)->ip_meta_lockres); 1116 * We don't want to use LKM_LOCAL on a meta data lock as they
1117 * don't use a generation in their lock names.
1118 */
1119 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_meta_lockres, 1, 0);
1125 if (ret) { 1120 if (ret) {
1126 mlog_errno(ret); 1121 mlog_errno(ret);
1127 goto bail; 1122 goto bail;
1128 } 1123 }
1129 1124
1130 ret = ocfs2_create_new_inode_lock(inode, 1125 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_data_lockres, 1, 1);
1131 &OCFS2_I(inode)->ip_data_lockres);
1132 if (ret) { 1126 if (ret) {
1133 mlog_errno(ret); 1127 mlog_errno(ret);
1134 goto bail; 1128 goto bail;
@@ -1317,7 +1311,17 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
1317 1311
1318 lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; 1312 lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
1319 1313
1320 lvb->lvb_version = cpu_to_be32(OCFS2_LVB_VERSION); 1314 /*
1315 * Invalidate the LVB of a deleted inode - this way other
1316 * nodes are forced to go to disk and discover the new inode
1317 * status.
1318 */
1319 if (oi->ip_flags & OCFS2_INODE_DELETED) {
1320 lvb->lvb_version = 0;
1321 goto out;
1322 }
1323
1324 lvb->lvb_version = OCFS2_LVB_VERSION;
1321 lvb->lvb_isize = cpu_to_be64(i_size_read(inode)); 1325 lvb->lvb_isize = cpu_to_be64(i_size_read(inode));
1322 lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters); 1326 lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters);
1323 lvb->lvb_iuid = cpu_to_be32(inode->i_uid); 1327 lvb->lvb_iuid = cpu_to_be32(inode->i_uid);
@@ -1331,7 +1335,9 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
1331 lvb->lvb_imtime_packed = 1335 lvb->lvb_imtime_packed =
1332 cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime)); 1336 cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
1333 lvb->lvb_iattr = cpu_to_be32(oi->ip_attr); 1337 lvb->lvb_iattr = cpu_to_be32(oi->ip_attr);
1338 lvb->lvb_igeneration = cpu_to_be32(inode->i_generation);
1334 1339
1340out:
1335 mlog_meta_lvb(0, lockres); 1341 mlog_meta_lvb(0, lockres);
1336 1342
1337 mlog_exit_void(); 1343 mlog_exit_void();
@@ -1386,11 +1392,13 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1386 mlog_exit_void(); 1392 mlog_exit_void();
1387} 1393}
1388 1394
1389static inline int ocfs2_meta_lvb_is_trustable(struct ocfs2_lock_res *lockres) 1395static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
1396 struct ocfs2_lock_res *lockres)
1390{ 1397{
1391 struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; 1398 struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
1392 1399
1393 if (be32_to_cpu(lvb->lvb_version) == OCFS2_LVB_VERSION) 1400 if (lvb->lvb_version == OCFS2_LVB_VERSION
1401 && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
1394 return 1; 1402 return 1;
1395 return 0; 1403 return 0;
1396} 1404}
@@ -1487,7 +1495,7 @@ static int ocfs2_meta_lock_update(struct inode *inode,
1487 * map (directories, bitmap files, etc) */ 1495 * map (directories, bitmap files, etc) */
1488 ocfs2_extent_map_trunc(inode, 0); 1496 ocfs2_extent_map_trunc(inode, 0);
1489 1497
1490 if (ocfs2_meta_lvb_is_trustable(lockres)) { 1498 if (ocfs2_meta_lvb_is_trustable(inode, lockres)) {
1491 mlog(0, "Trusting LVB on inode %llu\n", 1499 mlog(0, "Trusting LVB on inode %llu\n",
1492 (unsigned long long)oi->ip_blkno); 1500 (unsigned long long)oi->ip_blkno);
1493 ocfs2_refresh_inode_from_lvb(inode); 1501 ocfs2_refresh_inode_from_lvb(inode);
@@ -1628,6 +1636,18 @@ int ocfs2_meta_lock_full(struct inode *inode,
1628 wait_event(osb->recovery_event, 1636 wait_event(osb->recovery_event,
1629 ocfs2_node_map_is_empty(osb, &osb->recovery_map)); 1637 ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1630 1638
1639 /*
1640 * We only see this flag if we're being called from
1641 * ocfs2_read_locked_inode(). It means we're locking an inode
1642 * which hasn't been populated yet, so clear the refresh flag
1643 * and let the caller handle it.
1644 */
1645 if (inode->i_state & I_NEW) {
1646 status = 0;
1647 ocfs2_complete_lock_res_refresh(lockres, 0);
1648 goto bail;
1649 }
1650
1631 /* This is fun. The caller may want a bh back, or it may 1651 /* This is fun. The caller may want a bh back, or it may
1632 * not. ocfs2_meta_lock_update definitely wants one in, but 1652 * not. ocfs2_meta_lock_update definitely wants one in, but
1633 * may or may not read one, depending on what's in the 1653 * may or may not read one, depending on what's in the
@@ -1807,6 +1827,34 @@ void ocfs2_rename_unlock(struct ocfs2_super *osb)
1807 ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE); 1827 ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE);
1808} 1828}
1809 1829
1830int ocfs2_dentry_lock(struct dentry *dentry, int ex)
1831{
1832 int ret;
1833 int level = ex ? LKM_EXMODE : LKM_PRMODE;
1834 struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
1835 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
1836
1837 BUG_ON(!dl);
1838
1839 if (ocfs2_is_hard_readonly(osb))
1840 return -EROFS;
1841
1842 ret = ocfs2_cluster_lock(osb, &dl->dl_lockres, level, 0, 0);
1843 if (ret < 0)
1844 mlog_errno(ret);
1845
1846 return ret;
1847}
1848
1849void ocfs2_dentry_unlock(struct dentry *dentry, int ex)
1850{
1851 int level = ex ? LKM_EXMODE : LKM_PRMODE;
1852 struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
1853 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
1854
1855 ocfs2_cluster_unlock(osb, &dl->dl_lockres, level);
1856}
1857
1810/* Reference counting of the dlm debug structure. We want this because 1858/* Reference counting of the dlm debug structure. We want this because
1811 * open references on the debug inodes can live on after a mount, so 1859 * open references on the debug inodes can live on after a mount, so
1812 * we can't rely on the ocfs2_super to always exist. */ 1860 * we can't rely on the ocfs2_super to always exist. */
@@ -1937,9 +1985,16 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
1937 if (!lockres) 1985 if (!lockres)
1938 return -EINVAL; 1986 return -EINVAL;
1939 1987
1940 seq_printf(m, "0x%x\t" 1988 seq_printf(m, "0x%x\t", OCFS2_DLM_DEBUG_STR_VERSION);
1941 "%.*s\t" 1989
1942 "%d\t" 1990 if (lockres->l_type == OCFS2_LOCK_TYPE_DENTRY)
1991 seq_printf(m, "%.*s%08x\t", OCFS2_DENTRY_LOCK_INO_START - 1,
1992 lockres->l_name,
1993 (unsigned int)ocfs2_get_dentry_lock_ino(lockres));
1994 else
1995 seq_printf(m, "%.*s\t", OCFS2_LOCK_ID_MAX_LEN, lockres->l_name);
1996
1997 seq_printf(m, "%d\t"
1943 "0x%lx\t" 1998 "0x%lx\t"
1944 "0x%x\t" 1999 "0x%x\t"
1945 "0x%x\t" 2000 "0x%x\t"
@@ -1947,8 +2002,6 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
1947 "%u\t" 2002 "%u\t"
1948 "%d\t" 2003 "%d\t"
1949 "%d\t", 2004 "%d\t",
1950 OCFS2_DLM_DEBUG_STR_VERSION,
1951 OCFS2_LOCK_ID_MAX_LEN, lockres->l_name,
1952 lockres->l_level, 2005 lockres->l_level,
1953 lockres->l_flags, 2006 lockres->l_flags,
1954 lockres->l_action, 2007 lockres->l_action,
@@ -1999,7 +2052,7 @@ static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
1999 mlog_errno(ret); 2052 mlog_errno(ret);
2000 goto out; 2053 goto out;
2001 } 2054 }
2002 osb = (struct ocfs2_super *) inode->u.generic_ip; 2055 osb = inode->i_private;
2003 ocfs2_get_dlm_debug(osb->osb_dlm_debug); 2056 ocfs2_get_dlm_debug(osb->osb_dlm_debug);
2004 priv->p_dlm_debug = osb->osb_dlm_debug; 2057 priv->p_dlm_debug = osb->osb_dlm_debug;
2005 INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list); 2058 INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list);
@@ -2138,7 +2191,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
2138 mlog_exit_void(); 2191 mlog_exit_void();
2139} 2192}
2140 2193
2141static void ocfs2_unlock_ast_func(void *opaque, enum dlm_status status) 2194static void ocfs2_unlock_ast(void *opaque, enum dlm_status status)
2142{ 2195{
2143 struct ocfs2_lock_res *lockres = opaque; 2196 struct ocfs2_lock_res *lockres = opaque;
2144 unsigned long flags; 2197 unsigned long flags;
@@ -2194,24 +2247,20 @@ complete_unlock:
2194 mlog_exit_void(); 2247 mlog_exit_void();
2195} 2248}
2196 2249
2197typedef void (ocfs2_pre_drop_cb_t)(struct ocfs2_lock_res *, void *);
2198
2199struct drop_lock_cb {
2200 ocfs2_pre_drop_cb_t *drop_func;
2201 void *drop_data;
2202};
2203
2204static int ocfs2_drop_lock(struct ocfs2_super *osb, 2250static int ocfs2_drop_lock(struct ocfs2_super *osb,
2205 struct ocfs2_lock_res *lockres, 2251 struct ocfs2_lock_res *lockres)
2206 struct drop_lock_cb *dcb)
2207{ 2252{
2208 enum dlm_status status; 2253 enum dlm_status status;
2209 unsigned long flags; 2254 unsigned long flags;
2255 int lkm_flags = 0;
2210 2256
2211 /* We didn't get anywhere near actually using this lockres. */ 2257 /* We didn't get anywhere near actually using this lockres. */
2212 if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) 2258 if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
2213 goto out; 2259 goto out;
2214 2260
2261 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
2262 lkm_flags |= LKM_VALBLK;
2263
2215 spin_lock_irqsave(&lockres->l_lock, flags); 2264 spin_lock_irqsave(&lockres->l_lock, flags);
2216 2265
2217 mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING), 2266 mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING),
@@ -2234,8 +2283,12 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
2234 spin_lock_irqsave(&lockres->l_lock, flags); 2283 spin_lock_irqsave(&lockres->l_lock, flags);
2235 } 2284 }
2236 2285
2237 if (dcb) 2286 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) {
2238 dcb->drop_func(lockres, dcb->drop_data); 2287 if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
2288 lockres->l_level == LKM_EXMODE &&
2289 !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
2290 lockres->l_ops->set_lvb(lockres);
2291 }
2239 2292
2240 if (lockres->l_flags & OCFS2_LOCK_BUSY) 2293 if (lockres->l_flags & OCFS2_LOCK_BUSY)
2241 mlog(ML_ERROR, "destroying busy lock: \"%s\"\n", 2294 mlog(ML_ERROR, "destroying busy lock: \"%s\"\n",
@@ -2261,8 +2314,8 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
2261 2314
2262 mlog(0, "lock %s\n", lockres->l_name); 2315 mlog(0, "lock %s\n", lockres->l_name);
2263 2316
2264 status = dlmunlock(osb->dlm, &lockres->l_lksb, LKM_VALBLK, 2317 status = dlmunlock(osb->dlm, &lockres->l_lksb, lkm_flags,
2265 lockres->l_ops->unlock_ast, lockres); 2318 ocfs2_unlock_ast, lockres);
2266 if (status != DLM_NORMAL) { 2319 if (status != DLM_NORMAL) {
2267 ocfs2_log_dlm_error("dlmunlock", status, lockres); 2320 ocfs2_log_dlm_error("dlmunlock", status, lockres);
2268 mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags); 2321 mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
@@ -2309,43 +2362,26 @@ void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
2309 spin_unlock_irqrestore(&lockres->l_lock, flags); 2362 spin_unlock_irqrestore(&lockres->l_lock, flags);
2310} 2363}
2311 2364
2312static void ocfs2_drop_osb_locks(struct ocfs2_super *osb) 2365void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
2366 struct ocfs2_lock_res *lockres)
2313{ 2367{
2314 int status; 2368 int ret;
2315
2316 mlog_entry_void();
2317
2318 ocfs2_mark_lockres_freeing(&osb->osb_super_lockres);
2319
2320 status = ocfs2_drop_lock(osb, &osb->osb_super_lockres, NULL);
2321 if (status < 0)
2322 mlog_errno(status);
2323
2324 ocfs2_mark_lockres_freeing(&osb->osb_rename_lockres);
2325
2326 status = ocfs2_drop_lock(osb, &osb->osb_rename_lockres, NULL);
2327 if (status < 0)
2328 mlog_errno(status);
2329 2369
2330 mlog_exit(status); 2370 ocfs2_mark_lockres_freeing(lockres);
2371 ret = ocfs2_drop_lock(osb, lockres);
2372 if (ret)
2373 mlog_errno(ret);
2331} 2374}
2332 2375
2333static void ocfs2_meta_pre_drop(struct ocfs2_lock_res *lockres, void *data) 2376static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
2334{ 2377{
2335 struct inode *inode = data; 2378 ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
2336 2379 ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
2337 /* the metadata lock requires a bit more work as we have an
2338 * LVB to worry about. */
2339 if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
2340 lockres->l_level == LKM_EXMODE &&
2341 !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
2342 __ocfs2_stuff_meta_lvb(inode);
2343} 2380}
2344 2381
2345int ocfs2_drop_inode_locks(struct inode *inode) 2382int ocfs2_drop_inode_locks(struct inode *inode)
2346{ 2383{
2347 int status, err; 2384 int status, err;
2348 struct drop_lock_cb meta_dcb = { ocfs2_meta_pre_drop, inode, };
2349 2385
2350 mlog_entry_void(); 2386 mlog_entry_void();
2351 2387
@@ -2353,24 +2389,21 @@ int ocfs2_drop_inode_locks(struct inode *inode)
2353 * ocfs2_clear_inode has done it for us. */ 2389 * ocfs2_clear_inode has done it for us. */
2354 2390
2355 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), 2391 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2356 &OCFS2_I(inode)->ip_data_lockres, 2392 &OCFS2_I(inode)->ip_data_lockres);
2357 NULL);
2358 if (err < 0) 2393 if (err < 0)
2359 mlog_errno(err); 2394 mlog_errno(err);
2360 2395
2361 status = err; 2396 status = err;
2362 2397
2363 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), 2398 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2364 &OCFS2_I(inode)->ip_meta_lockres, 2399 &OCFS2_I(inode)->ip_meta_lockres);
2365 &meta_dcb);
2366 if (err < 0) 2400 if (err < 0)
2367 mlog_errno(err); 2401 mlog_errno(err);
2368 if (err < 0 && !status) 2402 if (err < 0 && !status)
2369 status = err; 2403 status = err;
2370 2404
2371 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), 2405 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2372 &OCFS2_I(inode)->ip_rw_lockres, 2406 &OCFS2_I(inode)->ip_rw_lockres);
2373 NULL);
2374 if (err < 0) 2407 if (err < 0)
2375 mlog_errno(err); 2408 mlog_errno(err);
2376 if (err < 0 && !status) 2409 if (err < 0 && !status)
@@ -2419,9 +2452,10 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
2419 &lockres->l_lksb, 2452 &lockres->l_lksb,
2420 dlm_flags, 2453 dlm_flags,
2421 lockres->l_name, 2454 lockres->l_name,
2422 lockres->l_ops->ast, 2455 OCFS2_LOCK_ID_MAX_LEN - 1,
2456 ocfs2_locking_ast,
2423 lockres, 2457 lockres,
2424 lockres->l_ops->bast); 2458 ocfs2_blocking_ast);
2425 if (status != DLM_NORMAL) { 2459 if (status != DLM_NORMAL) {
2426 ocfs2_log_dlm_error("dlmlock", status, lockres); 2460 ocfs2_log_dlm_error("dlmlock", status, lockres);
2427 ret = -EINVAL; 2461 ret = -EINVAL;
@@ -2480,7 +2514,7 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
2480 status = dlmunlock(osb->dlm, 2514 status = dlmunlock(osb->dlm,
2481 &lockres->l_lksb, 2515 &lockres->l_lksb,
2482 LKM_CANCEL, 2516 LKM_CANCEL,
2483 lockres->l_ops->unlock_ast, 2517 ocfs2_unlock_ast,
2484 lockres); 2518 lockres);
2485 if (status != DLM_NORMAL) { 2519 if (status != DLM_NORMAL) {
2486 ocfs2_log_dlm_error("dlmunlock", status, lockres); 2520 ocfs2_log_dlm_error("dlmunlock", status, lockres);
@@ -2494,115 +2528,15 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
2494 return ret; 2528 return ret;
2495} 2529}
2496 2530
2497static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode, 2531static int ocfs2_unblock_lock(struct ocfs2_super *osb,
2498 struct ocfs2_lock_res *lockres, 2532 struct ocfs2_lock_res *lockres,
2499 int new_level) 2533 struct ocfs2_unblock_ctl *ctl)
2500{
2501 int ret;
2502
2503 mlog_entry_void();
2504
2505 BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE);
2506
2507 if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
2508 ret = 0;
2509 mlog(0, "lockres %s currently being refreshed -- backing "
2510 "off!\n", lockres->l_name);
2511 } else if (new_level == LKM_PRMODE)
2512 ret = !lockres->l_ex_holders &&
2513 ocfs2_inode_fully_checkpointed(inode);
2514 else /* Must be NLMODE we're converting to. */
2515 ret = !lockres->l_ro_holders && !lockres->l_ex_holders &&
2516 ocfs2_inode_fully_checkpointed(inode);
2517
2518 mlog_exit(ret);
2519 return ret;
2520}
2521
2522static int ocfs2_do_unblock_meta(struct inode *inode,
2523 int *requeue)
2524{
2525 int new_level;
2526 int set_lvb = 0;
2527 int ret = 0;
2528 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
2529 unsigned long flags;
2530
2531 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2532
2533 mlog_entry_void();
2534
2535 spin_lock_irqsave(&lockres->l_lock, flags);
2536
2537 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
2538
2539 mlog(0, "l_level=%d, l_blocking=%d\n", lockres->l_level,
2540 lockres->l_blocking);
2541
2542 BUG_ON(lockres->l_level != LKM_EXMODE &&
2543 lockres->l_level != LKM_PRMODE);
2544
2545 if (lockres->l_flags & OCFS2_LOCK_BUSY) {
2546 *requeue = 1;
2547 ret = ocfs2_prepare_cancel_convert(osb, lockres);
2548 spin_unlock_irqrestore(&lockres->l_lock, flags);
2549 if (ret) {
2550 ret = ocfs2_cancel_convert(osb, lockres);
2551 if (ret < 0)
2552 mlog_errno(ret);
2553 }
2554 goto leave;
2555 }
2556
2557 new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
2558
2559 mlog(0, "l_level=%d, l_blocking=%d, new_level=%d\n",
2560 lockres->l_level, lockres->l_blocking, new_level);
2561
2562 if (ocfs2_can_downconvert_meta_lock(inode, lockres, new_level)) {
2563 if (lockres->l_level == LKM_EXMODE)
2564 set_lvb = 1;
2565
2566 /* If the lock hasn't been refreshed yet (rare), then
2567 * our memory inode values are old and we skip
2568 * stuffing the lvb. There's no need to actually clear
2569 * out the lvb here as it's value is still valid. */
2570 if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
2571 if (set_lvb)
2572 __ocfs2_stuff_meta_lvb(inode);
2573 } else
2574 mlog(0, "lockres %s: downconverting stale lock!\n",
2575 lockres->l_name);
2576
2577 mlog(0, "calling ocfs2_downconvert_lock with l_level=%d, "
2578 "l_blocking=%d, new_level=%d\n",
2579 lockres->l_level, lockres->l_blocking, new_level);
2580
2581 ocfs2_prepare_downconvert(lockres, new_level);
2582 spin_unlock_irqrestore(&lockres->l_lock, flags);
2583 ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb);
2584 goto leave;
2585 }
2586 if (!ocfs2_inode_fully_checkpointed(inode))
2587 ocfs2_start_checkpoint(osb);
2588
2589 *requeue = 1;
2590 spin_unlock_irqrestore(&lockres->l_lock, flags);
2591 ret = 0;
2592leave:
2593 mlog_exit(ret);
2594 return ret;
2595}
2596
2597static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
2598 struct ocfs2_lock_res *lockres,
2599 int *requeue,
2600 ocfs2_convert_worker_t *worker)
2601{ 2534{
2602 unsigned long flags; 2535 unsigned long flags;
2603 int blocking; 2536 int blocking;
2604 int new_level; 2537 int new_level;
2605 int ret = 0; 2538 int ret = 0;
2539 int set_lvb = 0;
2606 2540
2607 mlog_entry_void(); 2541 mlog_entry_void();
2608 2542
@@ -2612,7 +2546,7 @@ static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
2612 2546
2613recheck: 2547recheck:
2614 if (lockres->l_flags & OCFS2_LOCK_BUSY) { 2548 if (lockres->l_flags & OCFS2_LOCK_BUSY) {
2615 *requeue = 1; 2549 ctl->requeue = 1;
2616 ret = ocfs2_prepare_cancel_convert(osb, lockres); 2550 ret = ocfs2_prepare_cancel_convert(osb, lockres);
2617 spin_unlock_irqrestore(&lockres->l_lock, flags); 2551 spin_unlock_irqrestore(&lockres->l_lock, flags);
2618 if (ret) { 2552 if (ret) {
@@ -2626,27 +2560,33 @@ recheck:
2626 /* if we're blocking an exclusive and we have *any* holders, 2560 /* if we're blocking an exclusive and we have *any* holders,
2627 * then requeue. */ 2561 * then requeue. */
2628 if ((lockres->l_blocking == LKM_EXMODE) 2562 if ((lockres->l_blocking == LKM_EXMODE)
2629 && (lockres->l_ex_holders || lockres->l_ro_holders)) { 2563 && (lockres->l_ex_holders || lockres->l_ro_holders))
2630 spin_unlock_irqrestore(&lockres->l_lock, flags); 2564 goto leave_requeue;
2631 *requeue = 1;
2632 ret = 0;
2633 goto leave;
2634 }
2635 2565
2636 /* If it's a PR we're blocking, then only 2566 /* If it's a PR we're blocking, then only
2637 * requeue if we've got any EX holders */ 2567 * requeue if we've got any EX holders */
2638 if (lockres->l_blocking == LKM_PRMODE && 2568 if (lockres->l_blocking == LKM_PRMODE &&
2639 lockres->l_ex_holders) { 2569 lockres->l_ex_holders)
2640 spin_unlock_irqrestore(&lockres->l_lock, flags); 2570 goto leave_requeue;
2641 *requeue = 1; 2571
2642 ret = 0; 2572 /*
2643 goto leave; 2573 * Can we get a lock in this state if the holder counts are
2644 } 2574 * zero? The meta data unblock code used to check this.
2575 */
2576 if ((lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
2577 && (lockres->l_flags & OCFS2_LOCK_REFRESHING))
2578 goto leave_requeue;
2579
2580 new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
2581
2582 if (lockres->l_ops->check_downconvert
2583 && !lockres->l_ops->check_downconvert(lockres, new_level))
2584 goto leave_requeue;
2645 2585
2646 /* If we get here, then we know that there are no more 2586 /* If we get here, then we know that there are no more
2647 * incompatible holders (and anyone asking for an incompatible 2587 * incompatible holders (and anyone asking for an incompatible
2648 * lock is blocked). We can now downconvert the lock */ 2588 * lock is blocked). We can now downconvert the lock */
2649 if (!worker) 2589 if (!lockres->l_ops->downconvert_worker)
2650 goto downconvert; 2590 goto downconvert;
2651 2591
2652 /* Some lockres types want to do a bit of work before 2592 /* Some lockres types want to do a bit of work before
@@ -2656,7 +2596,10 @@ recheck:
2656 blocking = lockres->l_blocking; 2596 blocking = lockres->l_blocking;
2657 spin_unlock_irqrestore(&lockres->l_lock, flags); 2597 spin_unlock_irqrestore(&lockres->l_lock, flags);
2658 2598
2659 worker(lockres, blocking); 2599 ctl->unblock_action = lockres->l_ops->downconvert_worker(lockres, blocking);
2600
2601 if (ctl->unblock_action == UNBLOCK_STOP_POST)
2602 goto leave;
2660 2603
2661 spin_lock_irqsave(&lockres->l_lock, flags); 2604 spin_lock_irqsave(&lockres->l_lock, flags);
2662 if (blocking != lockres->l_blocking) { 2605 if (blocking != lockres->l_blocking) {
@@ -2666,25 +2609,43 @@ recheck:
2666 } 2609 }
2667 2610
2668downconvert: 2611downconvert:
2669 *requeue = 0; 2612 ctl->requeue = 0;
2670 new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking); 2613
2614 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) {
2615 if (lockres->l_level == LKM_EXMODE)
2616 set_lvb = 1;
2617
2618 /*
2619 * We only set the lvb if the lock has been fully
2620 * refreshed - otherwise we risk setting stale
2621 * data. Otherwise, there's no need to actually clear
2622 * out the lvb here as it's value is still valid.
2623 */
2624 if (set_lvb && !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
2625 lockres->l_ops->set_lvb(lockres);
2626 }
2671 2627
2672 ocfs2_prepare_downconvert(lockres, new_level); 2628 ocfs2_prepare_downconvert(lockres, new_level);
2673 spin_unlock_irqrestore(&lockres->l_lock, flags); 2629 spin_unlock_irqrestore(&lockres->l_lock, flags);
2674 ret = ocfs2_downconvert_lock(osb, lockres, new_level, 0); 2630 ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb);
2675leave: 2631leave:
2676 mlog_exit(ret); 2632 mlog_exit(ret);
2677 return ret; 2633 return ret;
2634
2635leave_requeue:
2636 spin_unlock_irqrestore(&lockres->l_lock, flags);
2637 ctl->requeue = 1;
2638
2639 mlog_exit(0);
2640 return 0;
2678} 2641}
2679 2642
2680static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, 2643static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
2681 int blocking) 2644 int blocking)
2682{ 2645{
2683 struct inode *inode; 2646 struct inode *inode;
2684 struct address_space *mapping; 2647 struct address_space *mapping;
2685 2648
2686 mlog_entry_void();
2687
2688 inode = ocfs2_lock_res_inode(lockres); 2649 inode = ocfs2_lock_res_inode(lockres);
2689 mapping = inode->i_mapping; 2650 mapping = inode->i_mapping;
2690 2651
@@ -2705,116 +2666,159 @@ static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
2705 filemap_fdatawait(mapping); 2666 filemap_fdatawait(mapping);
2706 } 2667 }
2707 2668
2708 mlog_exit_void(); 2669 return UNBLOCK_CONTINUE;
2709} 2670}
2710 2671
2711int ocfs2_unblock_data(struct ocfs2_lock_res *lockres, 2672static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
2712 int *requeue) 2673 int new_level)
2713{ 2674{
2714 int status; 2675 struct inode *inode = ocfs2_lock_res_inode(lockres);
2715 struct inode *inode; 2676 int checkpointed = ocfs2_inode_fully_checkpointed(inode);
2716 struct ocfs2_super *osb;
2717
2718 mlog_entry_void();
2719
2720 inode = ocfs2_lock_res_inode(lockres);
2721 osb = OCFS2_SB(inode->i_sb);
2722
2723 mlog(0, "unblock inode %llu\n",
2724 (unsigned long long)OCFS2_I(inode)->ip_blkno);
2725 2677
2726 status = ocfs2_generic_unblock_lock(osb, 2678 BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE);
2727 lockres, 2679 BUG_ON(lockres->l_level != LKM_EXMODE && !checkpointed);
2728 requeue,
2729 ocfs2_data_convert_worker);
2730 if (status < 0)
2731 mlog_errno(status);
2732 2680
2733 mlog(0, "inode %llu, requeue = %d\n", 2681 if (checkpointed)
2734 (unsigned long long)OCFS2_I(inode)->ip_blkno, *requeue); 2682 return 1;
2735 2683
2736 mlog_exit(status); 2684 ocfs2_start_checkpoint(OCFS2_SB(inode->i_sb));
2737 return status; 2685 return 0;
2738} 2686}
2739 2687
2740static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres, 2688static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres)
2741 int *requeue)
2742{ 2689{
2743 int status; 2690 struct inode *inode = ocfs2_lock_res_inode(lockres);
2744 struct inode *inode;
2745
2746 mlog_entry_void();
2747
2748 mlog(0, "Unblock lockres %s\n", lockres->l_name);
2749
2750 inode = ocfs2_lock_res_inode(lockres);
2751 2691
2752 status = ocfs2_generic_unblock_lock(OCFS2_SB(inode->i_sb), 2692 __ocfs2_stuff_meta_lvb(inode);
2753 lockres,
2754 requeue,
2755 NULL);
2756 if (status < 0)
2757 mlog_errno(status);
2758
2759 mlog_exit(status);
2760 return status;
2761} 2693}
2762 2694
2763 2695/*
2764int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres, 2696 * Does the final reference drop on our dentry lock. Right now this
2765 int *requeue) 2697 * happens in the vote thread, but we could choose to simplify the
2698 * dlmglue API and push these off to the ocfs2_wq in the future.
2699 */
2700static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
2701 struct ocfs2_lock_res *lockres)
2766{ 2702{
2767 int status; 2703 struct ocfs2_dentry_lock *dl = ocfs2_lock_res_dl(lockres);
2768 struct inode *inode; 2704 ocfs2_dentry_lock_put(osb, dl);
2769 2705}
2770 mlog_entry_void();
2771 2706
2772 inode = ocfs2_lock_res_inode(lockres); 2707/*
2708 * d_delete() matching dentries before the lock downconvert.
2709 *
2710 * At this point, any process waiting to destroy the
2711 * dentry_lock due to last ref count is stopped by the
2712 * OCFS2_LOCK_QUEUED flag.
2713 *
2714 * We have two potential problems
2715 *
2716 * 1) If we do the last reference drop on our dentry_lock (via dput)
2717 * we'll wind up in ocfs2_release_dentry_lock(), waiting on
2718 * the downconvert to finish. Instead we take an elevated
2719 * reference and push the drop until after we've completed our
2720 * unblock processing.
2721 *
2722 * 2) There might be another process with a final reference,
2723 * waiting on us to finish processing. If this is the case, we
2724 * detect it and exit out - there's no more dentries anyway.
2725 */
2726static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
2727 int blocking)
2728{
2729 struct ocfs2_dentry_lock *dl = ocfs2_lock_res_dl(lockres);
2730 struct ocfs2_inode_info *oi = OCFS2_I(dl->dl_inode);
2731 struct dentry *dentry;
2732 unsigned long flags;
2733 int extra_ref = 0;
2773 2734
2774 mlog(0, "unblock inode %llu\n", 2735 /*
2775 (unsigned long long)OCFS2_I(inode)->ip_blkno); 2736 * This node is blocking another node from getting a read
2737 * lock. This happens when we've renamed within a
2738 * directory. We've forced the other nodes to d_delete(), but
2739 * we never actually dropped our lock because it's still
2740 * valid. The downconvert code will retain a PR for this node,
2741 * so there's no further work to do.
2742 */
2743 if (blocking == LKM_PRMODE)
2744 return UNBLOCK_CONTINUE;
2776 2745
2777 status = ocfs2_do_unblock_meta(inode, requeue); 2746 /*
2778 if (status < 0) 2747 * Mark this inode as potentially orphaned. The code in
2779 mlog_errno(status); 2748 * ocfs2_delete_inode() will figure out whether it actually
2749 * needs to be freed or not.
2750 */
2751 spin_lock(&oi->ip_lock);
2752 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
2753 spin_unlock(&oi->ip_lock);
2780 2754
2781 mlog(0, "inode %llu, requeue = %d\n", 2755 /*
2782 (unsigned long long)OCFS2_I(inode)->ip_blkno, *requeue); 2756 * Yuck. We need to make sure however that the check of
2757 * OCFS2_LOCK_FREEING and the extra reference are atomic with
2758 * respect to a reference decrement or the setting of that
2759 * flag.
2760 */
2761 spin_lock_irqsave(&lockres->l_lock, flags);
2762 spin_lock(&dentry_attach_lock);
2763 if (!(lockres->l_flags & OCFS2_LOCK_FREEING)
2764 && dl->dl_count) {
2765 dl->dl_count++;
2766 extra_ref = 1;
2767 }
2768 spin_unlock(&dentry_attach_lock);
2769 spin_unlock_irqrestore(&lockres->l_lock, flags);
2783 2770
2784 mlog_exit(status); 2771 mlog(0, "extra_ref = %d\n", extra_ref);
2785 return status;
2786}
2787 2772
2788/* Generic unblock function for any lockres whose private data is an 2773 /*
2789 * ocfs2_super pointer. */ 2774 * We have a process waiting on us in ocfs2_dentry_iput(),
2790static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres, 2775 * which means we can't have any more outstanding
2791 int *requeue) 2776 * aliases. There's no need to do any more work.
2792{ 2777 */
2793 int status; 2778 if (!extra_ref)
2794 struct ocfs2_super *osb; 2779 return UNBLOCK_CONTINUE;
2780
2781 spin_lock(&dentry_attach_lock);
2782 while (1) {
2783 dentry = ocfs2_find_local_alias(dl->dl_inode,
2784 dl->dl_parent_blkno, 1);
2785 if (!dentry)
2786 break;
2787 spin_unlock(&dentry_attach_lock);
2795 2788
2796 mlog_entry_void(); 2789 mlog(0, "d_delete(%.*s);\n", dentry->d_name.len,
2790 dentry->d_name.name);
2797 2791
2798 mlog(0, "Unblock lockres %s\n", lockres->l_name); 2792 /*
2793 * The following dcache calls may do an
2794 * iput(). Normally we don't want that from the
2795 * downconverting thread, but in this case it's ok
2796 * because the requesting node already has an
2797 * exclusive lock on the inode, so it can't be queued
2798 * for a downconvert.
2799 */
2800 d_delete(dentry);
2801 dput(dentry);
2799 2802
2800 osb = ocfs2_lock_res_super(lockres); 2803 spin_lock(&dentry_attach_lock);
2804 }
2805 spin_unlock(&dentry_attach_lock);
2801 2806
2802 status = ocfs2_generic_unblock_lock(osb, 2807 /*
2803 lockres, 2808 * If we are the last holder of this dentry lock, there is no
2804 requeue, 2809 * reason to downconvert so skip straight to the unlock.
2805 NULL); 2810 */
2806 if (status < 0) 2811 if (dl->dl_count == 1)
2807 mlog_errno(status); 2812 return UNBLOCK_STOP_POST;
2808 2813
2809 mlog_exit(status); 2814 return UNBLOCK_CONTINUE_POST;
2810 return status;
2811} 2815}
2812 2816
2813void ocfs2_process_blocked_lock(struct ocfs2_super *osb, 2817void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
2814 struct ocfs2_lock_res *lockres) 2818 struct ocfs2_lock_res *lockres)
2815{ 2819{
2816 int status; 2820 int status;
2817 int requeue = 0; 2821 struct ocfs2_unblock_ctl ctl = {0, 0,};
2818 unsigned long flags; 2822 unsigned long flags;
2819 2823
2820 /* Our reference to the lockres in this function can be 2824 /* Our reference to the lockres in this function can be
@@ -2825,7 +2829,6 @@ void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
2825 2829
2826 BUG_ON(!lockres); 2830 BUG_ON(!lockres);
2827 BUG_ON(!lockres->l_ops); 2831 BUG_ON(!lockres->l_ops);
2828 BUG_ON(!lockres->l_ops->unblock);
2829 2832
2830 mlog(0, "lockres %s blocked.\n", lockres->l_name); 2833 mlog(0, "lockres %s blocked.\n", lockres->l_name);
2831 2834
@@ -2839,21 +2842,25 @@ void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
2839 goto unqueue; 2842 goto unqueue;
2840 spin_unlock_irqrestore(&lockres->l_lock, flags); 2843 spin_unlock_irqrestore(&lockres->l_lock, flags);
2841 2844
2842 status = lockres->l_ops->unblock(lockres, &requeue); 2845 status = ocfs2_unblock_lock(osb, lockres, &ctl);
2843 if (status < 0) 2846 if (status < 0)
2844 mlog_errno(status); 2847 mlog_errno(status);
2845 2848
2846 spin_lock_irqsave(&lockres->l_lock, flags); 2849 spin_lock_irqsave(&lockres->l_lock, flags);
2847unqueue: 2850unqueue:
2848 if (lockres->l_flags & OCFS2_LOCK_FREEING || !requeue) { 2851 if (lockres->l_flags & OCFS2_LOCK_FREEING || !ctl.requeue) {
2849 lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED); 2852 lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED);
2850 } else 2853 } else
2851 ocfs2_schedule_blocked_lock(osb, lockres); 2854 ocfs2_schedule_blocked_lock(osb, lockres);
2852 2855
2853 mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name, 2856 mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name,
2854 requeue ? "yes" : "no"); 2857 ctl.requeue ? "yes" : "no");
2855 spin_unlock_irqrestore(&lockres->l_lock, flags); 2858 spin_unlock_irqrestore(&lockres->l_lock, flags);
2856 2859
2860 if (ctl.unblock_action != UNBLOCK_CONTINUE
2861 && lockres->l_ops->post_unlock)
2862 lockres->l_ops->post_unlock(osb, lockres);
2863
2857 mlog_exit_void(); 2864 mlog_exit_void();
2858} 2865}
2859 2866
@@ -2896,8 +2903,9 @@ void ocfs2_dump_meta_lvb_info(u64 level,
2896 2903
2897 mlog(level, "LVB information for %s (called from %s:%u):\n", 2904 mlog(level, "LVB information for %s (called from %s:%u):\n",
2898 lockres->l_name, function, line); 2905 lockres->l_name, function, line);
2899 mlog(level, "version: %u, clusters: %u\n", 2906 mlog(level, "version: %u, clusters: %u, generation: 0x%x\n",
2900 be32_to_cpu(lvb->lvb_version), be32_to_cpu(lvb->lvb_iclusters)); 2907 lvb->lvb_version, be32_to_cpu(lvb->lvb_iclusters),
2908 be32_to_cpu(lvb->lvb_igeneration));
2901 mlog(level, "size: %llu, uid %u, gid %u, mode 0x%x\n", 2909 mlog(level, "size: %llu, uid %u, gid %u, mode 0x%x\n",
2902 (unsigned long long)be64_to_cpu(lvb->lvb_isize), 2910 (unsigned long long)be64_to_cpu(lvb->lvb_isize),
2903 be32_to_cpu(lvb->lvb_iuid), be32_to_cpu(lvb->lvb_igid), 2911 be32_to_cpu(lvb->lvb_iuid), be32_to_cpu(lvb->lvb_igid),
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 243ae862ece5..4a2769387229 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -27,10 +27,14 @@
27#ifndef DLMGLUE_H 27#ifndef DLMGLUE_H
28#define DLMGLUE_H 28#define DLMGLUE_H
29 29
30#define OCFS2_LVB_VERSION 3 30#include "dcache.h"
31
32#define OCFS2_LVB_VERSION 4
31 33
32struct ocfs2_meta_lvb { 34struct ocfs2_meta_lvb {
33 __be32 lvb_version; 35 __u8 lvb_version;
36 __u8 lvb_reserved0;
37 __be16 lvb_reserved1;
34 __be32 lvb_iclusters; 38 __be32 lvb_iclusters;
35 __be32 lvb_iuid; 39 __be32 lvb_iuid;
36 __be32 lvb_igid; 40 __be32 lvb_igid;
@@ -41,7 +45,8 @@ struct ocfs2_meta_lvb {
41 __be16 lvb_imode; 45 __be16 lvb_imode;
42 __be16 lvb_inlink; 46 __be16 lvb_inlink;
43 __be32 lvb_iattr; 47 __be32 lvb_iattr;
44 __be32 lvb_reserved[2]; 48 __be32 lvb_igeneration;
49 __be32 lvb_reserved2;
45}; 50};
46 51
47/* ocfs2_meta_lock_full() and ocfs2_data_lock_full() 'arg_flags' flags */ 52/* ocfs2_meta_lock_full() and ocfs2_data_lock_full() 'arg_flags' flags */
@@ -57,9 +62,14 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb);
57void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res); 62void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res);
58void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, 63void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
59 enum ocfs2_lock_type type, 64 enum ocfs2_lock_type type,
65 unsigned int generation,
60 struct inode *inode); 66 struct inode *inode);
67void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
68 u64 parent, struct inode *inode);
61void ocfs2_lock_res_free(struct ocfs2_lock_res *res); 69void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
62int ocfs2_create_new_inode_locks(struct inode *inode); 70int ocfs2_create_new_inode_locks(struct inode *inode);
71int ocfs2_create_new_lock(struct ocfs2_super *osb,
72 struct ocfs2_lock_res *lockres, int ex, int local);
63int ocfs2_drop_inode_locks(struct inode *inode); 73int ocfs2_drop_inode_locks(struct inode *inode);
64int ocfs2_data_lock_full(struct inode *inode, 74int ocfs2_data_lock_full(struct inode *inode,
65 int write, 75 int write,
@@ -93,7 +103,12 @@ void ocfs2_super_unlock(struct ocfs2_super *osb,
93 int ex); 103 int ex);
94int ocfs2_rename_lock(struct ocfs2_super *osb); 104int ocfs2_rename_lock(struct ocfs2_super *osb);
95void ocfs2_rename_unlock(struct ocfs2_super *osb); 105void ocfs2_rename_unlock(struct ocfs2_super *osb);
106int ocfs2_dentry_lock(struct dentry *dentry, int ex);
107void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
108
96void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); 109void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
110void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
111 struct ocfs2_lock_res *lockres);
97 112
98/* for the vote thread */ 113/* for the vote thread */
99void ocfs2_process_blocked_lock(struct ocfs2_super *osb, 114void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index ec55ab3c1214..fb91089a60a7 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -33,6 +33,7 @@
33 33
34#include "dir.h" 34#include "dir.h"
35#include "dlmglue.h" 35#include "dlmglue.h"
36#include "dcache.h"
36#include "export.h" 37#include "export.h"
37#include "inode.h" 38#include "inode.h"
38 39
@@ -57,7 +58,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp)
57 return ERR_PTR(-ESTALE); 58 return ERR_PTR(-ESTALE);
58 } 59 }
59 60
60 inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno); 61 inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0);
61 62
62 if (IS_ERR(inode)) { 63 if (IS_ERR(inode)) {
63 mlog_errno(PTR_ERR(inode)); 64 mlog_errno(PTR_ERR(inode));
@@ -77,6 +78,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp)
77 mlog_errno(-ENOMEM); 78 mlog_errno(-ENOMEM);
78 return ERR_PTR(-ENOMEM); 79 return ERR_PTR(-ENOMEM);
79 } 80 }
81 result->d_op = &ocfs2_dentry_ops;
80 82
81 mlog_exit_ptr(result); 83 mlog_exit_ptr(result);
82 return result; 84 return result;
@@ -113,7 +115,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
113 goto bail_unlock; 115 goto bail_unlock;
114 } 116 }
115 117
116 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno); 118 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0);
117 if (IS_ERR(inode)) { 119 if (IS_ERR(inode)) {
118 mlog(ML_ERROR, "Unable to create inode %llu\n", 120 mlog(ML_ERROR, "Unable to create inode %llu\n",
119 (unsigned long long)blkno); 121 (unsigned long long)blkno);
@@ -127,6 +129,8 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
127 parent = ERR_PTR(-ENOMEM); 129 parent = ERR_PTR(-ENOMEM);
128 } 130 }
129 131
132 parent->d_op = &ocfs2_dentry_ops;
133
130bail_unlock: 134bail_unlock:
131 ocfs2_meta_unlock(dir, 0); 135 ocfs2_meta_unlock(dir, 0);
132 136
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 7bcf69154592..16e8e74dc966 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -54,8 +54,6 @@
54 54
55#include "buffer_head_io.h" 55#include "buffer_head_io.h"
56 56
57#define OCFS2_FI_FLAG_NOWAIT 0x1
58#define OCFS2_FI_FLAG_DELETE 0x2
59struct ocfs2_find_inode_args 57struct ocfs2_find_inode_args
60{ 58{
61 u64 fi_blkno; 59 u64 fi_blkno;
@@ -109,7 +107,7 @@ struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
109 return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args); 107 return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args);
110} 108}
111 109
112struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno) 110struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
113{ 111{
114 struct inode *inode = NULL; 112 struct inode *inode = NULL;
115 struct super_block *sb = osb->sb; 113 struct super_block *sb = osb->sb;
@@ -127,7 +125,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno)
127 } 125 }
128 126
129 args.fi_blkno = blkno; 127 args.fi_blkno = blkno;
130 args.fi_flags = 0; 128 args.fi_flags = flags;
131 args.fi_ino = ino_from_blkno(sb, blkno); 129 args.fi_ino = ino_from_blkno(sb, blkno);
132 130
133 inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor, 131 inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor,
@@ -271,7 +269,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
271 inode->i_mode = le16_to_cpu(fe->i_mode); 269 inode->i_mode = le16_to_cpu(fe->i_mode);
272 inode->i_uid = le32_to_cpu(fe->i_uid); 270 inode->i_uid = le32_to_cpu(fe->i_uid);
273 inode->i_gid = le32_to_cpu(fe->i_gid); 271 inode->i_gid = le32_to_cpu(fe->i_gid);
274 inode->i_blksize = (u32)osb->s_clustersize;
275 272
276 /* Fast symlinks will have i_size but no allocated clusters. */ 273 /* Fast symlinks will have i_size but no allocated clusters. */
277 if (S_ISLNK(inode->i_mode) && !fe->i_clusters) 274 if (S_ISLNK(inode->i_mode) && !fe->i_clusters)
@@ -297,15 +294,11 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
297 OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT; 294 OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT;
298 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); 295 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
299 296
300 if (create_ino)
301 inode->i_ino = ino_from_blkno(inode->i_sb,
302 le64_to_cpu(fe->i_blkno));
303
304 mlog(0, "blkno = %llu, ino = %lu, create_ino = %s\n",
305 (unsigned long long)fe->i_blkno, inode->i_ino, create_ino ? "true" : "false");
306
307 inode->i_nlink = le16_to_cpu(fe->i_links_count); 297 inode->i_nlink = le16_to_cpu(fe->i_links_count);
308 298
299 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL))
300 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
301
309 if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) { 302 if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) {
310 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; 303 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
311 mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino); 304 mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino);
@@ -343,12 +336,28 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
343 break; 336 break;
344 } 337 }
345 338
339 if (create_ino) {
340 inode->i_ino = ino_from_blkno(inode->i_sb,
341 le64_to_cpu(fe->i_blkno));
342
343 /*
344 * If we ever want to create system files from kernel,
345 * the generation argument to
346 * ocfs2_inode_lock_res_init() will have to change.
347 */
348 BUG_ON(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL));
349
350 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
351 OCFS2_LOCK_TYPE_META, 0, inode);
352 }
353
346 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, 354 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres,
347 OCFS2_LOCK_TYPE_RW, inode); 355 OCFS2_LOCK_TYPE_RW, inode->i_generation,
348 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, 356 inode);
349 OCFS2_LOCK_TYPE_META, inode); 357
350 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres, 358 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres,
351 OCFS2_LOCK_TYPE_DATA, inode); 359 OCFS2_LOCK_TYPE_DATA, inode->i_generation,
360 inode);
352 361
353 ocfs2_set_inode_flags(inode); 362 ocfs2_set_inode_flags(inode);
354 inode->i_flags |= S_NOATIME; 363 inode->i_flags |= S_NOATIME;
@@ -366,15 +375,15 @@ static int ocfs2_read_locked_inode(struct inode *inode,
366 struct ocfs2_super *osb; 375 struct ocfs2_super *osb;
367 struct ocfs2_dinode *fe; 376 struct ocfs2_dinode *fe;
368 struct buffer_head *bh = NULL; 377 struct buffer_head *bh = NULL;
369 int status; 378 int status, can_lock;
370 int sysfile = 0; 379 u32 generation = 0;
371 380
372 mlog_entry("(0x%p, 0x%p)\n", inode, args); 381 mlog_entry("(0x%p, 0x%p)\n", inode, args);
373 382
374 status = -EINVAL; 383 status = -EINVAL;
375 if (inode == NULL || inode->i_sb == NULL) { 384 if (inode == NULL || inode->i_sb == NULL) {
376 mlog(ML_ERROR, "bad inode\n"); 385 mlog(ML_ERROR, "bad inode\n");
377 goto bail; 386 return status;
378 } 387 }
379 sb = inode->i_sb; 388 sb = inode->i_sb;
380 osb = OCFS2_SB(sb); 389 osb = OCFS2_SB(sb);
@@ -382,50 +391,110 @@ static int ocfs2_read_locked_inode(struct inode *inode,
382 if (!args) { 391 if (!args) {
383 mlog(ML_ERROR, "bad inode args\n"); 392 mlog(ML_ERROR, "bad inode args\n");
384 make_bad_inode(inode); 393 make_bad_inode(inode);
385 goto bail; 394 return status;
395 }
396
397 /*
398 * To improve performance of cold-cache inode stats, we take
399 * the cluster lock here if possible.
400 *
401 * Generally, OCFS2 never trusts the contents of an inode
402 * unless it's holding a cluster lock, so taking it here isn't
403 * a correctness issue as much as it is a performance
404 * improvement.
405 *
406 * There are three times when taking the lock is not a good idea:
407 *
408 * 1) During startup, before we have initialized the DLM.
409 *
410 * 2) If we are reading certain system files which never get
411 * cluster locks (local alloc, truncate log).
412 *
413 * 3) If the process doing the iget() is responsible for
414 * orphan dir recovery. We're holding the orphan dir lock and
415 * can get into a deadlock with another process on another
416 * node in ->delete_inode().
417 *
418 * #1 and #2 can be simply solved by never taking the lock
419 * here for system files (which are the only type we read
420 * during mount). It's a heavier approach, but our main
421 * concern is user-accesible files anyway.
422 *
423 * #3 works itself out because we'll eventually take the
424 * cluster lock before trusting anything anyway.
425 */
426 can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
427 && !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK);
428
429 /*
430 * To maintain backwards compatibility with older versions of
431 * ocfs2-tools, we still store the generation value for system
432 * files. The only ones that actually matter to userspace are
433 * the journals, but it's easier and inexpensive to just flag
434 * all system files similarly.
435 */
436 if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
437 generation = osb->fs_generation;
438
439 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
440 OCFS2_LOCK_TYPE_META,
441 generation, inode);
442
443 if (can_lock) {
444 status = ocfs2_meta_lock(inode, NULL, NULL, 0);
445 if (status) {
446 make_bad_inode(inode);
447 mlog_errno(status);
448 return status;
449 }
386 } 450 }
387 451
388 /* Read the FE off disk. This is safe because the kernel only 452 status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0,
389 * does one read_inode2 for a new inode, and if it doesn't 453 can_lock ? inode : NULL);
390 * exist yet then nobody can be working on it! */
391 status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, NULL);
392 if (status < 0) { 454 if (status < 0) {
393 mlog_errno(status); 455 mlog_errno(status);
394 make_bad_inode(inode);
395 goto bail; 456 goto bail;
396 } 457 }
397 458
459 status = -EINVAL;
398 fe = (struct ocfs2_dinode *) bh->b_data; 460 fe = (struct ocfs2_dinode *) bh->b_data;
399 if (!OCFS2_IS_VALID_DINODE(fe)) { 461 if (!OCFS2_IS_VALID_DINODE(fe)) {
400 mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n", 462 mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
401 (unsigned long long)fe->i_blkno, 7, fe->i_signature); 463 (unsigned long long)fe->i_blkno, 7, fe->i_signature);
402 make_bad_inode(inode);
403 goto bail; 464 goto bail;
404 } 465 }
405 466
406 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) 467 /*
407 sysfile = 1; 468 * This is a code bug. Right now the caller needs to
469 * understand whether it is asking for a system file inode or
470 * not so the proper lock names can be built.
471 */
472 mlog_bug_on_msg(!!(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) !=
473 !!(args->fi_flags & OCFS2_FI_FLAG_SYSFILE),
474 "Inode %llu: system file state is ambigous\n",
475 (unsigned long long)args->fi_blkno);
408 476
409 if (S_ISCHR(le16_to_cpu(fe->i_mode)) || 477 if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
410 S_ISBLK(le16_to_cpu(fe->i_mode))) 478 S_ISBLK(le16_to_cpu(fe->i_mode)))
411 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); 479 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
412 480
413 status = -EINVAL;
414 if (ocfs2_populate_inode(inode, fe, 0) < 0) { 481 if (ocfs2_populate_inode(inode, fe, 0) < 0) {
415 mlog(ML_ERROR, "populate failed! i_blkno=%llu, i_ino=%lu\n", 482 mlog(ML_ERROR, "populate failed! i_blkno=%llu, i_ino=%lu\n",
416 (unsigned long long)fe->i_blkno, inode->i_ino); 483 (unsigned long long)fe->i_blkno, inode->i_ino);
417 make_bad_inode(inode);
418 goto bail; 484 goto bail;
419 } 485 }
420 486
421 BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); 487 BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
422 488
423 if (sysfile)
424 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
425
426 status = 0; 489 status = 0;
427 490
428bail: 491bail:
492 if (can_lock)
493 ocfs2_meta_unlock(inode, 0);
494
495 if (status < 0)
496 make_bad_inode(inode);
497
429 if (args && bh) 498 if (args && bh)
430 brelse(bh); 499 brelse(bh);
431 500
@@ -898,9 +967,15 @@ void ocfs2_delete_inode(struct inode *inode)
898 goto bail_unlock_inode; 967 goto bail_unlock_inode;
899 } 968 }
900 969
901 /* Mark the inode as successfully deleted. This is important 970 /*
902 * for ocfs2_clear_inode as it will check this flag and skip 971 * Mark the inode as successfully deleted.
903 * any checkpointing work */ 972 *
973 * This is important for ocfs2_clear_inode() as it will check
974 * this flag and skip any checkpointing work
975 *
976 * ocfs2_stuff_meta_lvb() also uses this flag to invalidate
977 * the LVB for other nodes.
978 */
904 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED; 979 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED;
905 980
906bail_unlock_inode: 981bail_unlock_inode:
@@ -1025,12 +1100,10 @@ void ocfs2_drop_inode(struct inode *inode)
1025 /* Testing ip_orphaned_slot here wouldn't work because we may 1100 /* Testing ip_orphaned_slot here wouldn't work because we may
1026 * not have gotten a delete_inode vote from any other nodes 1101 * not have gotten a delete_inode vote from any other nodes
1027 * yet. */ 1102 * yet. */
1028 if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) { 1103 if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
1029 mlog(0, "Inode was orphaned on another node, clearing nlink.\n"); 1104 generic_delete_inode(inode);
1030 inode->i_nlink = 0; 1105 else
1031 } 1106 generic_drop_inode(inode);
1032
1033 generic_drop_inode(inode);
1034 1107
1035 mlog_exit_void(); 1108 mlog_exit_void();
1036} 1109}
@@ -1184,8 +1257,6 @@ leave:
1184void ocfs2_refresh_inode(struct inode *inode, 1257void ocfs2_refresh_inode(struct inode *inode,
1185 struct ocfs2_dinode *fe) 1258 struct ocfs2_dinode *fe)
1186{ 1259{
1187 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1188
1189 spin_lock(&OCFS2_I(inode)->ip_lock); 1260 spin_lock(&OCFS2_I(inode)->ip_lock);
1190 1261
1191 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 1262 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
@@ -1196,7 +1267,6 @@ void ocfs2_refresh_inode(struct inode *inode,
1196 inode->i_uid = le32_to_cpu(fe->i_uid); 1267 inode->i_uid = le32_to_cpu(fe->i_uid);
1197 inode->i_gid = le32_to_cpu(fe->i_gid); 1268 inode->i_gid = le32_to_cpu(fe->i_gid);
1198 inode->i_mode = le16_to_cpu(fe->i_mode); 1269 inode->i_mode = le16_to_cpu(fe->i_mode);
1199 inode->i_blksize = (u32) osb->s_clustersize;
1200 if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0) 1270 if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0)
1201 inode->i_blocks = 0; 1271 inode->i_blocks = 0;
1202 else 1272 else
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 4d1e53992566..9957810fdf85 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -122,7 +122,13 @@ struct buffer_head *ocfs2_bread(struct inode *inode, int block,
122void ocfs2_clear_inode(struct inode *inode); 122void ocfs2_clear_inode(struct inode *inode);
123void ocfs2_delete_inode(struct inode *inode); 123void ocfs2_delete_inode(struct inode *inode);
124void ocfs2_drop_inode(struct inode *inode); 124void ocfs2_drop_inode(struct inode *inode);
125struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff); 125
126/* Flags for ocfs2_iget() */
127#define OCFS2_FI_FLAG_NOWAIT 0x1
128#define OCFS2_FI_FLAG_DELETE 0x2
129#define OCFS2_FI_FLAG_SYSFILE 0x4
130#define OCFS2_FI_FLAG_NOLOCK 0x8
131struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags);
126struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb, 132struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
127 u64 blkno, 133 u64 blkno,
128 int delete_vote); 134 int delete_vote);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index f92bf1dd379a..fd9734def551 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1493,7 +1493,8 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
1493 if (de->name_len == 2 && !strncmp("..", de->name, 2)) 1493 if (de->name_len == 2 && !strncmp("..", de->name, 2))
1494 continue; 1494 continue;
1495 1495
1496 iter = ocfs2_iget(osb, le64_to_cpu(de->inode)); 1496 iter = ocfs2_iget(osb, le64_to_cpu(de->inode),
1497 OCFS2_FI_FLAG_NOLOCK);
1497 if (IS_ERR(iter)) 1498 if (IS_ERR(iter))
1498 continue; 1499 continue;
1499 1500
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 0d3e939b1f56..849c3b4bb94a 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -179,7 +179,7 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
179 if (status < 0) 179 if (status < 0)
180 goto bail_add; 180 goto bail_add;
181 181
182 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno); 182 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0);
183 if (IS_ERR(inode)) { 183 if (IS_ERR(inode)) {
184 mlog(ML_ERROR, "Unable to create inode %llu\n", 184 mlog(ML_ERROR, "Unable to create inode %llu\n",
185 (unsigned long long)blkno); 185 (unsigned long long)blkno);
@@ -199,10 +199,32 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
199 spin_unlock(&oi->ip_lock); 199 spin_unlock(&oi->ip_lock);
200 200
201bail_add: 201bail_add:
202
203 dentry->d_op = &ocfs2_dentry_ops; 202 dentry->d_op = &ocfs2_dentry_ops;
204 ret = d_splice_alias(inode, dentry); 203 ret = d_splice_alias(inode, dentry);
205 204
205 if (inode) {
206 /*
207 * If d_splice_alias() finds a DCACHE_DISCONNECTED
208 * dentry, it will d_move() it on top of ourse. The
209 * return value will indicate this however, so in
210 * those cases, we switch them around for the locking
211 * code.
212 *
213 * NOTE: This dentry already has ->d_op set from
214 * ocfs2_get_parent() and ocfs2_get_dentry()
215 */
216 if (ret)
217 dentry = ret;
218
219 status = ocfs2_dentry_attach_lock(dentry, inode,
220 OCFS2_I(dir)->ip_blkno);
221 if (status) {
222 mlog_errno(status);
223 ret = ERR_PTR(status);
224 goto bail_unlock;
225 }
226 }
227
206bail_unlock: 228bail_unlock:
207 /* Don't drop the cluster lock until *after* the d_add -- 229 /* Don't drop the cluster lock until *after* the d_add --
208 * unlink on another node will message us to remove that 230 * unlink on another node will message us to remove that
@@ -418,6 +440,13 @@ static int ocfs2_mknod(struct inode *dir,
418 goto leave; 440 goto leave;
419 } 441 }
420 442
443 status = ocfs2_dentry_attach_lock(dentry, inode,
444 OCFS2_I(dir)->ip_blkno);
445 if (status) {
446 mlog_errno(status);
447 goto leave;
448 }
449
421 insert_inode_hash(inode); 450 insert_inode_hash(inode);
422 dentry->d_op = &ocfs2_dentry_ops; 451 dentry->d_op = &ocfs2_dentry_ops;
423 d_instantiate(dentry, inode); 452 d_instantiate(dentry, inode);
@@ -725,6 +754,12 @@ static int ocfs2_link(struct dentry *old_dentry,
725 goto bail; 754 goto bail;
726 } 755 }
727 756
757 err = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno);
758 if (err) {
759 mlog_errno(err);
760 goto bail;
761 }
762
728 atomic_inc(&inode->i_count); 763 atomic_inc(&inode->i_count);
729 dentry->d_op = &ocfs2_dentry_ops; 764 dentry->d_op = &ocfs2_dentry_ops;
730 d_instantiate(dentry, inode); 765 d_instantiate(dentry, inode);
@@ -743,6 +778,23 @@ bail:
743 return err; 778 return err;
744} 779}
745 780
781/*
782 * Takes and drops an exclusive lock on the given dentry. This will
783 * force other nodes to drop it.
784 */
785static int ocfs2_remote_dentry_delete(struct dentry *dentry)
786{
787 int ret;
788
789 ret = ocfs2_dentry_lock(dentry, 1);
790 if (ret)
791 mlog_errno(ret);
792 else
793 ocfs2_dentry_unlock(dentry, 1);
794
795 return ret;
796}
797
746static int ocfs2_unlink(struct inode *dir, 798static int ocfs2_unlink(struct inode *dir,
747 struct dentry *dentry) 799 struct dentry *dentry)
748{ 800{
@@ -832,8 +884,7 @@ static int ocfs2_unlink(struct inode *dir,
832 else 884 else
833 inode->i_nlink--; 885 inode->i_nlink--;
834 886
835 status = ocfs2_request_unlink_vote(inode, dentry, 887 status = ocfs2_remote_dentry_delete(dentry);
836 (unsigned int) inode->i_nlink);
837 if (status < 0) { 888 if (status < 0) {
838 /* This vote should succeed under all normal 889 /* This vote should succeed under all normal
839 * circumstances. */ 890 * circumstances. */
@@ -1019,7 +1070,6 @@ static int ocfs2_rename(struct inode *old_dir,
1019 struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir, 1070 struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir,
1020 // this is the 1st dirent bh 1071 // this is the 1st dirent bh
1021 nlink_t old_dir_nlink = old_dir->i_nlink, new_dir_nlink = new_dir->i_nlink; 1072 nlink_t old_dir_nlink = old_dir->i_nlink, new_dir_nlink = new_dir->i_nlink;
1022 unsigned int links_count;
1023 1073
1024 /* At some point it might be nice to break this function up a 1074 /* At some point it might be nice to break this function up a
1025 * bit. */ 1075 * bit. */
@@ -1093,23 +1143,26 @@ static int ocfs2_rename(struct inode *old_dir,
1093 } 1143 }
1094 } 1144 }
1095 1145
1096 if (S_ISDIR(old_inode->i_mode)) { 1146 /*
1097 /* Directories actually require metadata updates to 1147 * Though we don't require an inode meta data update if
1098 * the directory info so we can't get away with not 1148 * old_inode is not a directory, we lock anyway here to ensure
1099 * doing node locking on it. */ 1149 * the vote thread on other nodes won't have to concurrently
1100 status = ocfs2_meta_lock(old_inode, handle, NULL, 1); 1150 * downconvert the inode and the dentry locks.
1101 if (status < 0) { 1151 */
1102 if (status != -ENOENT) 1152 status = ocfs2_meta_lock(old_inode, handle, NULL, 1);
1103 mlog_errno(status); 1153 if (status < 0) {
1104 goto bail; 1154 if (status != -ENOENT)
1105 }
1106
1107 status = ocfs2_request_rename_vote(old_inode, old_dentry);
1108 if (status < 0) {
1109 mlog_errno(status); 1155 mlog_errno(status);
1110 goto bail; 1156 goto bail;
1111 } 1157 }
1158
1159 status = ocfs2_remote_dentry_delete(old_dentry);
1160 if (status < 0) {
1161 mlog_errno(status);
1162 goto bail;
1163 }
1112 1164
1165 if (S_ISDIR(old_inode->i_mode)) {
1113 status = -EIO; 1166 status = -EIO;
1114 old_inode_de_bh = ocfs2_bread(old_inode, 0, &status, 0); 1167 old_inode_de_bh = ocfs2_bread(old_inode, 0, &status, 0);
1115 if (!old_inode_de_bh) 1168 if (!old_inode_de_bh)
@@ -1123,14 +1176,6 @@ static int ocfs2_rename(struct inode *old_dir,
1123 if (!new_inode && new_dir!=old_dir && 1176 if (!new_inode && new_dir!=old_dir &&
1124 new_dir->i_nlink >= OCFS2_LINK_MAX) 1177 new_dir->i_nlink >= OCFS2_LINK_MAX)
1125 goto bail; 1178 goto bail;
1126 } else {
1127 /* Ah, the simple case - we're a file so just send a
1128 * message. */
1129 status = ocfs2_request_rename_vote(old_inode, old_dentry);
1130 if (status < 0) {
1131 mlog_errno(status);
1132 goto bail;
1133 }
1134 } 1179 }
1135 1180
1136 status = -ENOENT; 1181 status = -ENOENT;
@@ -1202,13 +1247,7 @@ static int ocfs2_rename(struct inode *old_dir,
1202 goto bail; 1247 goto bail;
1203 } 1248 }
1204 1249
1205 if (S_ISDIR(new_inode->i_mode)) 1250 status = ocfs2_remote_dentry_delete(new_dentry);
1206 links_count = 0;
1207 else
1208 links_count = (unsigned int) (new_inode->i_nlink - 1);
1209
1210 status = ocfs2_request_unlink_vote(new_inode, new_dentry,
1211 links_count);
1212 if (status < 0) { 1251 if (status < 0) {
1213 mlog_errno(status); 1252 mlog_errno(status);
1214 goto bail; 1253 goto bail;
@@ -1387,6 +1426,7 @@ static int ocfs2_rename(struct inode *old_dir,
1387 } 1426 }
1388 } 1427 }
1389 1428
1429 ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
1390 status = 0; 1430 status = 0;
1391bail: 1431bail:
1392 if (rename_lock) 1432 if (rename_lock)
@@ -1675,6 +1715,12 @@ static int ocfs2_symlink(struct inode *dir,
1675 goto bail; 1715 goto bail;
1676 } 1716 }
1677 1717
1718 status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno);
1719 if (status) {
1720 mlog_errno(status);
1721 goto bail;
1722 }
1723
1678 insert_inode_hash(inode); 1724 insert_inode_hash(inode);
1679 dentry->d_op = &ocfs2_dentry_ops; 1725 dentry->d_op = &ocfs2_dentry_ops;
1680 d_instantiate(dentry, inode); 1726 d_instantiate(dentry, inode);
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 7dd9e1e705b0..4d5d5655c185 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -35,12 +35,15 @@
35#define OCFS2_LOCK_ID_MAX_LEN 32 35#define OCFS2_LOCK_ID_MAX_LEN 32
36#define OCFS2_LOCK_ID_PAD "000000" 36#define OCFS2_LOCK_ID_PAD "000000"
37 37
38#define OCFS2_DENTRY_LOCK_INO_START 18
39
38enum ocfs2_lock_type { 40enum ocfs2_lock_type {
39 OCFS2_LOCK_TYPE_META = 0, 41 OCFS2_LOCK_TYPE_META = 0,
40 OCFS2_LOCK_TYPE_DATA, 42 OCFS2_LOCK_TYPE_DATA,
41 OCFS2_LOCK_TYPE_SUPER, 43 OCFS2_LOCK_TYPE_SUPER,
42 OCFS2_LOCK_TYPE_RENAME, 44 OCFS2_LOCK_TYPE_RENAME,
43 OCFS2_LOCK_TYPE_RW, 45 OCFS2_LOCK_TYPE_RW,
46 OCFS2_LOCK_TYPE_DENTRY,
44 OCFS2_NUM_LOCK_TYPES 47 OCFS2_NUM_LOCK_TYPES
45}; 48};
46 49
@@ -63,6 +66,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
63 case OCFS2_LOCK_TYPE_RW: 66 case OCFS2_LOCK_TYPE_RW:
64 c = 'W'; 67 c = 'W';
65 break; 68 break;
69 case OCFS2_LOCK_TYPE_DENTRY:
70 c = 'N';
71 break;
66 default: 72 default:
67 c = '\0'; 73 c = '\0';
68 } 74 }
@@ -70,4 +76,23 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
70 return c; 76 return c;
71} 77}
72 78
79static char *ocfs2_lock_type_strings[] = {
80 [OCFS2_LOCK_TYPE_META] = "Meta",
81 [OCFS2_LOCK_TYPE_DATA] = "Data",
82 [OCFS2_LOCK_TYPE_SUPER] = "Super",
83 [OCFS2_LOCK_TYPE_RENAME] = "Rename",
84 /* Need to differntiate from [R]ename.. serializing writes is the
85 * important job it does, anyway. */
86 [OCFS2_LOCK_TYPE_RW] = "Write/Read",
87 [OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
88};
89
90static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
91{
92#ifdef __KERNEL__
93 mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type);
94#endif
95 return ocfs2_lock_type_strings[type];
96}
97
73#endif /* OCFS2_LOCKID_H */ 98#endif /* OCFS2_LOCKID_H */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index d17e33e66a1e..4c29cd7cc8e6 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -202,7 +202,7 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
202 202
203 mlog_entry_void(); 203 mlog_entry_void();
204 204
205 new = ocfs2_iget(osb, osb->root_blkno); 205 new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE);
206 if (IS_ERR(new)) { 206 if (IS_ERR(new)) {
207 status = PTR_ERR(new); 207 status = PTR_ERR(new);
208 mlog_errno(status); 208 mlog_errno(status);
@@ -210,7 +210,7 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
210 } 210 }
211 osb->root_inode = new; 211 osb->root_inode = new;
212 212
213 new = ocfs2_iget(osb, osb->system_dir_blkno); 213 new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE);
214 if (IS_ERR(new)) { 214 if (IS_ERR(new)) {
215 status = PTR_ERR(new); 215 status = PTR_ERR(new);
216 mlog_errno(status); 216 mlog_errno(status);
@@ -682,7 +682,7 @@ static struct file_system_type ocfs2_fs_type = {
682 .kill_sb = kill_block_super, /* set to the generic one 682 .kill_sb = kill_block_super, /* set to the generic one
683 * right now, but do we 683 * right now, but do we
684 * need to change that? */ 684 * need to change that? */
685 .fs_flags = FS_REQUIRES_DEV, 685 .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
686 .next = NULL 686 .next = NULL
687}; 687};
688 688
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index fc29cb7a437d..5df6e35d09b1 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -28,11 +28,11 @@
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30 30
31#include "ocfs2.h"
32
33#define MLOG_MASK_PREFIX ML_INODE 31#define MLOG_MASK_PREFIX ML_INODE
34#include <cluster/masklog.h> 32#include <cluster/masklog.h>
35 33
34#include "ocfs2.h"
35
36#include "alloc.h" 36#include "alloc.h"
37#include "dir.h" 37#include "dir.h"
38#include "inode.h" 38#include "inode.h"
@@ -115,7 +115,7 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
115 goto bail; 115 goto bail;
116 } 116 }
117 117
118 inode = ocfs2_iget(osb, blkno); 118 inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE);
119 if (IS_ERR(inode)) { 119 if (IS_ERR(inode)) {
120 mlog_errno(PTR_ERR(inode)); 120 mlog_errno(PTR_ERR(inode));
121 inode = NULL; 121 inode = NULL;
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
index cf70fe2075b8..5b4dca79990b 100644
--- a/fs/ocfs2/vote.c
+++ b/fs/ocfs2/vote.c
@@ -74,9 +74,6 @@ struct ocfs2_vote_msg
74 __be32 v_orphaned_slot; /* Used during delete votes */ 74 __be32 v_orphaned_slot; /* Used during delete votes */
75 __be32 v_nlink; /* Used during unlink votes */ 75 __be32 v_nlink; /* Used during unlink votes */
76 } md1; /* Message type dependant 1 */ 76 } md1; /* Message type dependant 1 */
77 __be32 v_unlink_namelen;
78 __be64 v_unlink_parent;
79 u8 v_unlink_dirent[OCFS2_VOTE_FILENAME_LEN];
80}; 77};
81 78
82/* Responses are given these values to maintain backwards 79/* Responses are given these values to maintain backwards
@@ -100,8 +97,6 @@ struct ocfs2_vote_work {
100enum ocfs2_vote_request { 97enum ocfs2_vote_request {
101 OCFS2_VOTE_REQ_INVALID = 0, 98 OCFS2_VOTE_REQ_INVALID = 0,
102 OCFS2_VOTE_REQ_DELETE, 99 OCFS2_VOTE_REQ_DELETE,
103 OCFS2_VOTE_REQ_UNLINK,
104 OCFS2_VOTE_REQ_RENAME,
105 OCFS2_VOTE_REQ_MOUNT, 100 OCFS2_VOTE_REQ_MOUNT,
106 OCFS2_VOTE_REQ_UMOUNT, 101 OCFS2_VOTE_REQ_UMOUNT,
107 OCFS2_VOTE_REQ_LAST 102 OCFS2_VOTE_REQ_LAST
@@ -261,103 +256,13 @@ done:
261 return response; 256 return response;
262} 257}
263 258
264static int ocfs2_match_dentry(struct dentry *dentry,
265 u64 parent_blkno,
266 unsigned int namelen,
267 const char *name)
268{
269 struct inode *parent;
270
271 if (!dentry->d_parent) {
272 mlog(0, "Detached from parent.\n");
273 return 0;
274 }
275
276 parent = dentry->d_parent->d_inode;
277 /* Negative parent dentry? */
278 if (!parent)
279 return 0;
280
281 /* Name is in a different directory. */
282 if (OCFS2_I(parent)->ip_blkno != parent_blkno)
283 return 0;
284
285 if (dentry->d_name.len != namelen)
286 return 0;
287
288 /* comparison above guarantees this is safe. */
289 if (memcmp(dentry->d_name.name, name, namelen))
290 return 0;
291
292 return 1;
293}
294
295static void ocfs2_process_dentry_request(struct inode *inode,
296 int rename,
297 unsigned int new_nlink,
298 u64 parent_blkno,
299 unsigned int namelen,
300 const char *name)
301{
302 struct dentry *dentry = NULL;
303 struct list_head *p;
304 struct ocfs2_inode_info *oi = OCFS2_I(inode);
305
306 mlog(0, "parent %llu, namelen = %u, name = %.*s\n",
307 (unsigned long long)parent_blkno, namelen, namelen, name);
308
309 spin_lock(&dcache_lock);
310
311 /* Another node is removing this name from the system. It is
312 * up to us to find the corresponding dentry and if it exists,
313 * unhash it from the dcache. */
314 list_for_each(p, &inode->i_dentry) {
315 dentry = list_entry(p, struct dentry, d_alias);
316
317 if (ocfs2_match_dentry(dentry, parent_blkno, namelen, name)) {
318 mlog(0, "dentry found: %.*s\n",
319 dentry->d_name.len, dentry->d_name.name);
320
321 dget_locked(dentry);
322 break;
323 }
324
325 dentry = NULL;
326 }
327
328 spin_unlock(&dcache_lock);
329
330 if (dentry) {
331 d_delete(dentry);
332 dput(dentry);
333 }
334
335 /* rename votes don't send link counts */
336 if (!rename) {
337 mlog(0, "new_nlink = %u\n", new_nlink);
338
339 /* We don't have the proper locks here to directly
340 * change i_nlink and besides, the vote is sent
341 * *before* the operation so it may have failed on the
342 * other node. This passes a hint to ocfs2_drop_inode
343 * to force ocfs2_delete_inode, who will take the
344 * proper cluster locks to sort things out. */
345 if (new_nlink == 0) {
346 spin_lock(&oi->ip_lock);
347 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
348 spin_unlock(&OCFS2_I(inode)->ip_lock);
349 }
350 }
351}
352
353static void ocfs2_process_vote(struct ocfs2_super *osb, 259static void ocfs2_process_vote(struct ocfs2_super *osb,
354 struct ocfs2_vote_msg *msg) 260 struct ocfs2_vote_msg *msg)
355{ 261{
356 int net_status, vote_response; 262 int net_status, vote_response;
357 int orphaned_slot = 0; 263 int orphaned_slot = 0;
358 int rename = 0; 264 unsigned int node_num, generation;
359 unsigned int node_num, generation, new_nlink, namelen; 265 u64 blkno;
360 u64 blkno, parent_blkno;
361 enum ocfs2_vote_request request; 266 enum ocfs2_vote_request request;
362 struct inode *inode = NULL; 267 struct inode *inode = NULL;
363 struct ocfs2_msg_hdr *hdr = &msg->v_hdr; 268 struct ocfs2_msg_hdr *hdr = &msg->v_hdr;
@@ -437,18 +342,6 @@ static void ocfs2_process_vote(struct ocfs2_super *osb,
437 vote_response = ocfs2_process_delete_request(inode, 342 vote_response = ocfs2_process_delete_request(inode,
438 &orphaned_slot); 343 &orphaned_slot);
439 break; 344 break;
440 case OCFS2_VOTE_REQ_RENAME:
441 rename = 1;
442 /* fall through */
443 case OCFS2_VOTE_REQ_UNLINK:
444 parent_blkno = be64_to_cpu(msg->v_unlink_parent);
445 namelen = be32_to_cpu(msg->v_unlink_namelen);
446 /* new_nlink will be ignored in case of a rename vote */
447 new_nlink = be32_to_cpu(msg->md1.v_nlink);
448 ocfs2_process_dentry_request(inode, rename, new_nlink,
449 parent_blkno, namelen,
450 msg->v_unlink_dirent);
451 break;
452 default: 345 default:
453 mlog(ML_ERROR, "node %u, invalid request: %u\n", 346 mlog(ML_ERROR, "node %u, invalid request: %u\n",
454 node_num, request); 347 node_num, request);
@@ -889,75 +782,6 @@ int ocfs2_request_delete_vote(struct inode *inode)
889 return status; 782 return status;
890} 783}
891 784
892static void ocfs2_setup_unlink_vote(struct ocfs2_vote_msg *request,
893 struct dentry *dentry)
894{
895 struct inode *parent = dentry->d_parent->d_inode;
896
897 /* We need some values which will uniquely identify a dentry
898 * on the other nodes so that they can find it and run
899 * d_delete against it. Parent directory block and full name
900 * should suffice. */
901
902 mlog(0, "unlink/rename request: parent: %llu name: %.*s\n",
903 (unsigned long long)OCFS2_I(parent)->ip_blkno, dentry->d_name.len,
904 dentry->d_name.name);
905
906 request->v_unlink_parent = cpu_to_be64(OCFS2_I(parent)->ip_blkno);
907 request->v_unlink_namelen = cpu_to_be32(dentry->d_name.len);
908 memcpy(request->v_unlink_dirent, dentry->d_name.name,
909 dentry->d_name.len);
910}
911
912int ocfs2_request_unlink_vote(struct inode *inode,
913 struct dentry *dentry,
914 unsigned int nlink)
915{
916 int status;
917 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
918 struct ocfs2_vote_msg *request;
919
920 if (dentry->d_name.len > OCFS2_VOTE_FILENAME_LEN)
921 return -ENAMETOOLONG;
922
923 status = -ENOMEM;
924 request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
925 inode->i_generation,
926 OCFS2_VOTE_REQ_UNLINK, nlink);
927 if (request) {
928 ocfs2_setup_unlink_vote(request, dentry);
929
930 status = ocfs2_request_vote(inode, request, NULL);
931
932 kfree(request);
933 }
934 return status;
935}
936
937int ocfs2_request_rename_vote(struct inode *inode,
938 struct dentry *dentry)
939{
940 int status;
941 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
942 struct ocfs2_vote_msg *request;
943
944 if (dentry->d_name.len > OCFS2_VOTE_FILENAME_LEN)
945 return -ENAMETOOLONG;
946
947 status = -ENOMEM;
948 request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
949 inode->i_generation,
950 OCFS2_VOTE_REQ_RENAME, 0);
951 if (request) {
952 ocfs2_setup_unlink_vote(request, dentry);
953
954 status = ocfs2_request_vote(inode, request, NULL);
955
956 kfree(request);
957 }
958 return status;
959}
960
961int ocfs2_request_mount_vote(struct ocfs2_super *osb) 785int ocfs2_request_mount_vote(struct ocfs2_super *osb)
962{ 786{
963 int status; 787 int status;
diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/vote.h
index 9cce60703466..53ebc1c69e56 100644
--- a/fs/ocfs2/vote.h
+++ b/fs/ocfs2/vote.h
@@ -39,11 +39,6 @@ static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb)
39} 39}
40 40
41int ocfs2_request_delete_vote(struct inode *inode); 41int ocfs2_request_delete_vote(struct inode *inode);
42int ocfs2_request_unlink_vote(struct inode *inode,
43 struct dentry *dentry,
44 unsigned int nlink);
45int ocfs2_request_rename_vote(struct inode *inode,
46 struct dentry *dentry);
47int ocfs2_request_mount_vote(struct ocfs2_super *osb); 42int ocfs2_request_mount_vote(struct ocfs2_super *osb);
48int ocfs2_request_umount_vote(struct ocfs2_super *osb); 43int ocfs2_request_umount_vote(struct ocfs2_super *osb);
49int ocfs2_register_net_handlers(struct ocfs2_super *osb); 44int ocfs2_register_net_handlers(struct ocfs2_super *osb);
diff --git a/fs/open.c b/fs/open.c
index 303f06d2a7b9..304c1c7814cb 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -546,7 +546,8 @@ asmlinkage long sys_chdir(const char __user * filename)
546 struct nameidata nd; 546 struct nameidata nd;
547 int error; 547 int error;
548 548
549 error = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd); 549 error = __user_walk(filename,
550 LOOKUP_FOLLOW|LOOKUP_DIRECTORY|LOOKUP_CHDIR, &nd);
550 if (error) 551 if (error)
551 goto out; 552 goto out;
552 553
@@ -1172,6 +1173,7 @@ asmlinkage long sys_close(unsigned int fd)
1172 struct file * filp; 1173 struct file * filp;
1173 struct files_struct *files = current->files; 1174 struct files_struct *files = current->files;
1174 struct fdtable *fdt; 1175 struct fdtable *fdt;
1176 int retval;
1175 1177
1176 spin_lock(&files->file_lock); 1178 spin_lock(&files->file_lock);
1177 fdt = files_fdtable(files); 1179 fdt = files_fdtable(files);
@@ -1184,7 +1186,16 @@ asmlinkage long sys_close(unsigned int fd)
1184 FD_CLR(fd, fdt->close_on_exec); 1186 FD_CLR(fd, fdt->close_on_exec);
1185 __put_unused_fd(files, fd); 1187 __put_unused_fd(files, fd);
1186 spin_unlock(&files->file_lock); 1188 spin_unlock(&files->file_lock);
1187 return filp_close(filp, files); 1189 retval = filp_close(filp, files);
1190
1191 /* can't restart close syscall because file table entry was cleared */
1192 if (unlikely(retval == -ERESTARTSYS ||
1193 retval == -ERESTARTNOINTR ||
1194 retval == -ERESTARTNOHAND ||
1195 retval == -ERESTART_RESTARTBLOCK))
1196 retval = -EINTR;
1197
1198 return retval;
1188 1199
1189out_unlock: 1200out_unlock:
1190 spin_unlock(&files->file_lock); 1201 spin_unlock(&files->file_lock);
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 93a56bd4a2b7..592a6402e851 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -8,10 +8,10 @@
8#include <linux/types.h> 8#include <linux/types.h>
9#include <linux/string.h> 9#include <linux/string.h>
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/openprom_fs.h>
12#include <linux/init.h> 11#include <linux/init.h>
13#include <linux/slab.h> 12#include <linux/slab.h>
14#include <linux/seq_file.h> 13#include <linux/seq_file.h>
14#include <linux/magic.h>
15 15
16#include <asm/openprom.h> 16#include <asm/openprom.h>
17#include <asm/oplib.h> 17#include <asm/oplib.h>
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 63730282ad81..1bea610078b3 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -238,10 +238,9 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
238 le32_to_cpu(gpt->sizeof_partition_entry); 238 le32_to_cpu(gpt->sizeof_partition_entry);
239 if (!count) 239 if (!count)
240 return NULL; 240 return NULL;
241 pte = kmalloc(count, GFP_KERNEL); 241 pte = kzalloc(count, GFP_KERNEL);
242 if (!pte) 242 if (!pte)
243 return NULL; 243 return NULL;
244 memset(pte, 0, count);
245 244
246 if (read_lba(bdev, le64_to_cpu(gpt->partition_entry_lba), 245 if (read_lba(bdev, le64_to_cpu(gpt->partition_entry_lba),
247 (u8 *) pte, 246 (u8 *) pte,
@@ -269,10 +268,9 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba)
269 if (!bdev) 268 if (!bdev)
270 return NULL; 269 return NULL;
271 270
272 gpt = kmalloc(sizeof (gpt_header), GFP_KERNEL); 271 gpt = kzalloc(sizeof (gpt_header), GFP_KERNEL);
273 if (!gpt) 272 if (!gpt)
274 return NULL; 273 return NULL;
275 memset(gpt, 0, sizeof (gpt_header));
276 274
277 if (read_lba(bdev, lba, (u8 *) gpt, 275 if (read_lba(bdev, lba, (u8 *) gpt,
278 sizeof (gpt_header)) < sizeof (gpt_header)) { 276 sizeof (gpt_header)) < sizeof (gpt_header)) {
@@ -526,9 +524,8 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
526 lastlba = last_lba(bdev); 524 lastlba = last_lba(bdev);
527 if (!force_gpt) { 525 if (!force_gpt) {
528 /* This will be added to the EFI Spec. per Intel after v1.02. */ 526 /* This will be added to the EFI Spec. per Intel after v1.02. */
529 legacymbr = kmalloc(sizeof (*legacymbr), GFP_KERNEL); 527 legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL);
530 if (legacymbr) { 528 if (legacymbr) {
531 memset(legacymbr, 0, sizeof (*legacymbr));
532 read_lba(bdev, 0, (u8 *) legacymbr, 529 read_lba(bdev, 0, (u8 *) legacymbr,
533 sizeof (*legacymbr)); 530 sizeof (*legacymbr));
534 good_pmbr = is_pmbr_valid(legacymbr, lastlba); 531 good_pmbr = is_pmbr_valid(legacymbr, lastlba);
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 8f12587c3129..4f8df71e49d3 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -58,6 +58,31 @@ msdos_magic_present(unsigned char *p)
58 return (p[0] == MSDOS_LABEL_MAGIC1 && p[1] == MSDOS_LABEL_MAGIC2); 58 return (p[0] == MSDOS_LABEL_MAGIC1 && p[1] == MSDOS_LABEL_MAGIC2);
59} 59}
60 60
61/* Value is EBCDIC 'IBMA' */
62#define AIX_LABEL_MAGIC1 0xC9
63#define AIX_LABEL_MAGIC2 0xC2
64#define AIX_LABEL_MAGIC3 0xD4
65#define AIX_LABEL_MAGIC4 0xC1
66static int aix_magic_present(unsigned char *p, struct block_device *bdev)
67{
68 Sector sect;
69 unsigned char *d;
70 int ret = 0;
71
72 if (p[0] != AIX_LABEL_MAGIC1 &&
73 p[1] != AIX_LABEL_MAGIC2 &&
74 p[2] != AIX_LABEL_MAGIC3 &&
75 p[3] != AIX_LABEL_MAGIC4)
76 return 0;
77 d = read_dev_sector(bdev, 7, &sect);
78 if (d) {
79 if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M')
80 ret = 1;
81 put_dev_sector(sect);
82 };
83 return ret;
84}
85
61/* 86/*
62 * Create devices for each logical partition in an extended partition. 87 * Create devices for each logical partition in an extended partition.
63 * The logical partitions form a linked list, with each entry being 88 * The logical partitions form a linked list, with each entry being
@@ -393,6 +418,12 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
393 return 0; 418 return 0;
394 } 419 }
395 420
421 if (aix_magic_present(data, bdev)) {
422 put_dev_sector(sect);
423 printk( " [AIX]");
424 return 0;
425 }
426
396 /* 427 /*
397 * Now that the 55aa signature is present, this is probably 428 * Now that the 55aa signature is present, this is probably
398 * either the boot sector of a FAT filesystem or a DOS-type 429 * either the boot sector of a FAT filesystem or a DOS-type
diff --git a/fs/pipe.c b/fs/pipe.c
index 20352573e025..f3b6f71e9d0b 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -879,7 +879,6 @@ static struct inode * get_pipe_inode(void)
879 inode->i_uid = current->fsuid; 879 inode->i_uid = current->fsuid;
880 inode->i_gid = current->fsgid; 880 inode->i_gid = current->fsgid;
881 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 881 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
882 inode->i_blksize = PAGE_SIZE;
883 882
884 return inode; 883 return inode;
885 884
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 0b615d62a159..c0e554971df0 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -347,6 +347,8 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
347 sigemptyset(&sigign); 347 sigemptyset(&sigign);
348 sigemptyset(&sigcatch); 348 sigemptyset(&sigcatch);
349 cutime = cstime = utime = stime = cputime_zero; 349 cutime = cstime = utime = stime = cputime_zero;
350
351 mutex_lock(&tty_mutex);
350 read_lock(&tasklist_lock); 352 read_lock(&tasklist_lock);
351 if (task->sighand) { 353 if (task->sighand) {
352 spin_lock_irq(&task->sighand->siglock); 354 spin_lock_irq(&task->sighand->siglock);
@@ -388,6 +390,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
388 } 390 }
389 ppid = pid_alive(task) ? task->group_leader->real_parent->tgid : 0; 391 ppid = pid_alive(task) ? task->group_leader->real_parent->tgid : 0;
390 read_unlock(&tasklist_lock); 392 read_unlock(&tasklist_lock);
393 mutex_unlock(&tty_mutex);
391 394
392 if (!whole || num_threads<2) 395 if (!whole || num_threads<2)
393 wchan = get_wchan(task); 396 wchan = get_wchan(task);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index fe8d55fb17cc..89c20d9d50bf 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -797,7 +797,7 @@ out_no_task:
797static ssize_t mem_write(struct file * file, const char * buf, 797static ssize_t mem_write(struct file * file, const char * buf,
798 size_t count, loff_t *ppos) 798 size_t count, loff_t *ppos)
799{ 799{
800 int copied = 0; 800 int copied;
801 char *page; 801 char *page;
802 struct task_struct *task = get_proc_task(file->f_dentry->d_inode); 802 struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
803 unsigned long dst = *ppos; 803 unsigned long dst = *ppos;
@@ -814,6 +814,7 @@ static ssize_t mem_write(struct file * file, const char * buf,
814 if (!page) 814 if (!page)
815 goto out; 815 goto out;
816 816
817 copied = 0;
817 while (count > 0) { 818 while (count > 0) {
818 int this_len, retval; 819 int this_len, retval;
819 820
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 146a434ba944..987c773dbb20 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -28,6 +28,7 @@ do { \
28 (vmi)->largest_chunk = 0; \ 28 (vmi)->largest_chunk = 0; \
29} while(0) 29} while(0)
30 30
31extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *);
31#endif 32#endif
32 33
33extern void create_seq_entry(char *name, mode_t mode, const struct file_operations *f); 34extern void create_seq_entry(char *name, mode_t mode, const struct file_operations *f);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 6a984f64edd7..1294eda4acae 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -100,7 +100,7 @@ static int notesize(struct memelfnote *en)
100 int sz; 100 int sz;
101 101
102 sz = sizeof(struct elf_note); 102 sz = sizeof(struct elf_note);
103 sz += roundup(strlen(en->name), 4); 103 sz += roundup((strlen(en->name) + 1), 4);
104 sz += roundup(en->datasz, 4); 104 sz += roundup(en->datasz, 4);
105 105
106 return sz; 106 return sz;
@@ -116,7 +116,7 @@ static char *storenote(struct memelfnote *men, char *bufp)
116 116
117#define DUMP_WRITE(addr,nr) do { memcpy(bufp,addr,nr); bufp += nr; } while(0) 117#define DUMP_WRITE(addr,nr) do { memcpy(bufp,addr,nr); bufp += nr; } while(0)
118 118
119 en.n_namesz = strlen(men->name); 119 en.n_namesz = strlen(men->name) + 1;
120 en.n_descsz = men->datasz; 120 en.n_descsz = men->datasz;
121 en.n_type = men->type; 121 en.n_type = men->type;
122 122
@@ -279,12 +279,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
279 tsz = elf_buflen - *fpos; 279 tsz = elf_buflen - *fpos;
280 if (buflen < tsz) 280 if (buflen < tsz)
281 tsz = buflen; 281 tsz = buflen;
282 elf_buf = kmalloc(elf_buflen, GFP_ATOMIC); 282 elf_buf = kzalloc(elf_buflen, GFP_ATOMIC);
283 if (!elf_buf) { 283 if (!elf_buf) {
284 read_unlock(&kclist_lock); 284 read_unlock(&kclist_lock);
285 return -ENOMEM; 285 return -ENOMEM;
286 } 286 }
287 memset(elf_buf, 0, elf_buflen);
288 elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen); 287 elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen);
289 read_unlock(&kclist_lock); 288 read_unlock(&kclist_lock);
290 if (copy_to_user(buffer, elf_buf + *fpos, tsz)) { 289 if (copy_to_user(buffer, elf_buf + *fpos, tsz)) {
@@ -330,10 +329,9 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
330 unsigned long curstart = start; 329 unsigned long curstart = start;
331 unsigned long cursize = tsz; 330 unsigned long cursize = tsz;
332 331
333 elf_buf = kmalloc(tsz, GFP_KERNEL); 332 elf_buf = kzalloc(tsz, GFP_KERNEL);
334 if (!elf_buf) 333 if (!elf_buf)
335 return -ENOMEM; 334 return -ENOMEM;
336 memset(elf_buf, 0, tsz);
337 335
338 read_lock(&vmlist_lock); 336 read_lock(&vmlist_lock);
339 for (m=vmlist; m && cursize; m=m->next) { 337 for (m=vmlist; m && cursize; m=m->next) {
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index cff10ab1af63..d7dbdf9e0f49 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -33,19 +33,15 @@
33#include "internal.h" 33#include "internal.h"
34 34
35/* 35/*
36 * display a list of all the VMAs the kernel knows about 36 * display a single VMA to a sequenced file
37 * - nommu kernals have a single flat list
38 */ 37 */
39static int nommu_vma_list_show(struct seq_file *m, void *v) 38int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
40{ 39{
41 struct vm_area_struct *vma;
42 unsigned long ino = 0; 40 unsigned long ino = 0;
43 struct file *file; 41 struct file *file;
44 dev_t dev = 0; 42 dev_t dev = 0;
45 int flags, len; 43 int flags, len;
46 44
47 vma = rb_entry((struct rb_node *) v, struct vm_area_struct, vm_rb);
48
49 flags = vma->vm_flags; 45 flags = vma->vm_flags;
50 file = vma->vm_file; 46 file = vma->vm_file;
51 47
@@ -78,6 +74,18 @@ static int nommu_vma_list_show(struct seq_file *m, void *v)
78 return 0; 74 return 0;
79} 75}
80 76
77/*
78 * display a list of all the VMAs the kernel knows about
79 * - nommu kernals have a single flat list
80 */
81static int nommu_vma_list_show(struct seq_file *m, void *v)
82{
83 struct vm_area_struct *vma;
84
85 vma = rb_entry((struct rb_node *) v, struct vm_area_struct, vm_rb);
86 return nommu_vma_show(m, vma);
87}
88
81static void *nommu_vma_list_start(struct seq_file *m, loff_t *_pos) 89static void *nommu_vma_list_start(struct seq_file *m, loff_t *_pos)
82{ 90{
83 struct rb_node *_rb; 91 struct rb_node *_rb;
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 942156225447..5bbd60896050 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -157,10 +157,12 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
157 "SwapCached: %8lu kB\n" 157 "SwapCached: %8lu kB\n"
158 "Active: %8lu kB\n" 158 "Active: %8lu kB\n"
159 "Inactive: %8lu kB\n" 159 "Inactive: %8lu kB\n"
160#ifdef CONFIG_HIGHMEM
160 "HighTotal: %8lu kB\n" 161 "HighTotal: %8lu kB\n"
161 "HighFree: %8lu kB\n" 162 "HighFree: %8lu kB\n"
162 "LowTotal: %8lu kB\n" 163 "LowTotal: %8lu kB\n"
163 "LowFree: %8lu kB\n" 164 "LowFree: %8lu kB\n"
165#endif
164 "SwapTotal: %8lu kB\n" 166 "SwapTotal: %8lu kB\n"
165 "SwapFree: %8lu kB\n" 167 "SwapFree: %8lu kB\n"
166 "Dirty: %8lu kB\n" 168 "Dirty: %8lu kB\n"
@@ -168,6 +170,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
168 "AnonPages: %8lu kB\n" 170 "AnonPages: %8lu kB\n"
169 "Mapped: %8lu kB\n" 171 "Mapped: %8lu kB\n"
170 "Slab: %8lu kB\n" 172 "Slab: %8lu kB\n"
173 "SReclaimable: %8lu kB\n"
174 "SUnreclaim: %8lu kB\n"
171 "PageTables: %8lu kB\n" 175 "PageTables: %8lu kB\n"
172 "NFS_Unstable: %8lu kB\n" 176 "NFS_Unstable: %8lu kB\n"
173 "Bounce: %8lu kB\n" 177 "Bounce: %8lu kB\n"
@@ -183,17 +187,22 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
183 K(total_swapcache_pages), 187 K(total_swapcache_pages),
184 K(active), 188 K(active),
185 K(inactive), 189 K(inactive),
190#ifdef CONFIG_HIGHMEM
186 K(i.totalhigh), 191 K(i.totalhigh),
187 K(i.freehigh), 192 K(i.freehigh),
188 K(i.totalram-i.totalhigh), 193 K(i.totalram-i.totalhigh),
189 K(i.freeram-i.freehigh), 194 K(i.freeram-i.freehigh),
195#endif
190 K(i.totalswap), 196 K(i.totalswap),
191 K(i.freeswap), 197 K(i.freeswap),
192 K(global_page_state(NR_FILE_DIRTY)), 198 K(global_page_state(NR_FILE_DIRTY)),
193 K(global_page_state(NR_WRITEBACK)), 199 K(global_page_state(NR_WRITEBACK)),
194 K(global_page_state(NR_ANON_PAGES)), 200 K(global_page_state(NR_ANON_PAGES)),
195 K(global_page_state(NR_FILE_MAPPED)), 201 K(global_page_state(NR_FILE_MAPPED)),
196 K(global_page_state(NR_SLAB)), 202 K(global_page_state(NR_SLAB_RECLAIMABLE) +
203 global_page_state(NR_SLAB_UNRECLAIMABLE)),
204 K(global_page_state(NR_SLAB_RECLAIMABLE)),
205 K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
197 K(global_page_state(NR_PAGETABLE)), 206 K(global_page_state(NR_PAGETABLE)),
198 K(global_page_state(NR_UNSTABLE_NFS)), 207 K(global_page_state(NR_UNSTABLE_NFS)),
199 K(global_page_state(NR_BOUNCE)), 208 K(global_page_state(NR_BOUNCE)),
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 0a163a4f7764..6b769afac55a 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -122,11 +122,6 @@ struct mem_size_stats
122 unsigned long private_dirty; 122 unsigned long private_dirty;
123}; 123};
124 124
125__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
126{
127 return NULL;
128}
129
130static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss) 125static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss)
131{ 126{
132 struct proc_maps_private *priv = m->private; 127 struct proc_maps_private *priv = m->private;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 4616ed50ffcd..091aa8e48e02 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -138,25 +138,63 @@ out:
138} 138}
139 139
140/* 140/*
141 * Albert D. Cahalan suggested to fake entries for the traditional 141 * display mapping lines for a particular process's /proc/pid/maps
142 * sections here. This might be worth investigating.
143 */ 142 */
144static int show_map(struct seq_file *m, void *v) 143static int show_map(struct seq_file *m, void *_vml)
145{ 144{
146 return 0; 145 struct vm_list_struct *vml = _vml;
146 return nommu_vma_show(m, vml->vma);
147} 147}
148
148static void *m_start(struct seq_file *m, loff_t *pos) 149static void *m_start(struct seq_file *m, loff_t *pos)
149{ 150{
151 struct proc_maps_private *priv = m->private;
152 struct vm_list_struct *vml;
153 struct mm_struct *mm;
154 loff_t n = *pos;
155
156 /* pin the task and mm whilst we play with them */
157 priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
158 if (!priv->task)
159 return NULL;
160
161 mm = get_task_mm(priv->task);
162 if (!mm) {
163 put_task_struct(priv->task);
164 priv->task = NULL;
165 return NULL;
166 }
167
168 down_read(&mm->mmap_sem);
169
170 /* start from the Nth VMA */
171 for (vml = mm->context.vmlist; vml; vml = vml->next)
172 if (n-- == 0)
173 return vml;
150 return NULL; 174 return NULL;
151} 175}
152static void m_stop(struct seq_file *m, void *v) 176
177static void m_stop(struct seq_file *m, void *_vml)
153{ 178{
179 struct proc_maps_private *priv = m->private;
180
181 if (priv->task) {
182 struct mm_struct *mm = priv->task->mm;
183 up_read(&mm->mmap_sem);
184 mmput(mm);
185 put_task_struct(priv->task);
186 }
154} 187}
155static void *m_next(struct seq_file *m, void *v, loff_t *pos) 188
189static void *m_next(struct seq_file *m, void *_vml, loff_t *pos)
156{ 190{
157 return NULL; 191 struct vm_list_struct *vml = _vml;
192
193 (*pos)++;
194 return vml ? vml->next : NULL;
158} 195}
159static struct seq_operations proc_pid_maps_op = { 196
197static struct seq_operations proc_pid_maps_ops = {
160 .start = m_start, 198 .start = m_start,
161 .next = m_next, 199 .next = m_next,
162 .stop = m_stop, 200 .stop = m_stop,
@@ -165,11 +203,19 @@ static struct seq_operations proc_pid_maps_op = {
165 203
166static int maps_open(struct inode *inode, struct file *file) 204static int maps_open(struct inode *inode, struct file *file)
167{ 205{
168 int ret; 206 struct proc_maps_private *priv;
169 ret = seq_open(file, &proc_pid_maps_op); 207 int ret = -ENOMEM;
170 if (!ret) { 208
171 struct seq_file *m = file->private_data; 209 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
172 m->private = NULL; 210 if (priv) {
211 priv->pid = proc_pid(inode);
212 ret = seq_open(file, &proc_pid_maps_ops);
213 if (!ret) {
214 struct seq_file *m = file->private_data;
215 m->private = priv;
216 } else {
217 kfree(priv);
218 }
173 } 219 }
174 return ret; 220 return ret;
175} 221}
@@ -178,6 +224,6 @@ struct file_operations proc_maps_operations = {
178 .open = maps_open, 224 .open = maps_open,
179 .read = seq_read, 225 .read = seq_read,
180 .llseek = seq_lseek, 226 .llseek = seq_lseek,
181 .release = seq_release, 227 .release = seq_release_private,
182}; 228};
183 229
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 5a903491e697..5a41db2a218d 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -358,11 +358,10 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
358 const char *errmsg; 358 const char *errmsg;
359 struct qnx4_sb_info *qs; 359 struct qnx4_sb_info *qs;
360 360
361 qs = kmalloc(sizeof(struct qnx4_sb_info), GFP_KERNEL); 361 qs = kzalloc(sizeof(struct qnx4_sb_info), GFP_KERNEL);
362 if (!qs) 362 if (!qs)
363 return -ENOMEM; 363 return -ENOMEM;
364 s->s_fs_info = qs; 364 s->s_fs_info = qs;
365 memset(qs, 0, sizeof(struct qnx4_sb_info));
366 365
367 sb_set_blocksize(s, QNX4_BLOCK_SIZE); 366 sb_set_blocksize(s, QNX4_BLOCK_SIZE);
368 367
@@ -497,7 +496,6 @@ static void qnx4_read_inode(struct inode *inode)
497 inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->di_ctime); 496 inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->di_ctime);
498 inode->i_ctime.tv_nsec = 0; 497 inode->i_ctime.tv_nsec = 0;
499 inode->i_blocks = le32_to_cpu(raw_inode->di_first_xtnt.xtnt_size); 498 inode->i_blocks = le32_to_cpu(raw_inode->di_first_xtnt.xtnt_size);
500 inode->i_blksize = QNX4_DIR_ENTRY_SIZE;
501 499
502 memcpy(qnx4_inode, raw_inode, QNX4_DIR_ENTRY_SIZE); 500 memcpy(qnx4_inode, raw_inode, QNX4_DIR_ENTRY_SIZE);
503 if (S_ISREG(inode->i_mode)) { 501 if (S_ISREG(inode->i_mode)) {
@@ -557,9 +555,7 @@ static int init_inodecache(void)
557 555
558static void destroy_inodecache(void) 556static void destroy_inodecache(void)
559{ 557{
560 if (kmem_cache_destroy(qnx4_inode_cachep)) 558 kmem_cache_destroy(qnx4_inode_cachep);
561 printk(KERN_INFO
562 "qnx4_inode_cache: not all structures were freed\n");
563} 559}
564 560
565static int qnx4_get_sb(struct file_system_type *fs_type, 561static int qnx4_get_sb(struct file_system_type *fs_type,
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index b9677335cc8d..bc0e51662424 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -58,7 +58,6 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
58 inode->i_mode = mode; 58 inode->i_mode = mode;
59 inode->i_uid = current->fsuid; 59 inode->i_uid = current->fsuid;
60 inode->i_gid = current->fsgid; 60 inode->i_gid = current->fsgid;
61 inode->i_blksize = PAGE_CACHE_SIZE;
62 inode->i_blocks = 0; 61 inode->i_blocks = 0;
63 inode->i_mapping->a_ops = &ramfs_aops; 62 inode->i_mapping->a_ops = &ramfs_aops;
64 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; 63 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile
index 3a59309f3ca9..0eb7ac080484 100644
--- a/fs/reiserfs/Makefile
+++ b/fs/reiserfs/Makefile
@@ -28,7 +28,7 @@ endif
28# will work around it. If any other architecture displays this behavior, 28# will work around it. If any other architecture displays this behavior,
29# add it here. 29# add it here.
30ifeq ($(CONFIG_PPC32),y) 30ifeq ($(CONFIG_PPC32),y)
31EXTRA_CFLAGS := -O1 31EXTRA_CFLAGS := $(call cc-ifversion, -lt, 0400, -O1)
32endif 32endif
33 33
34TAGS: 34TAGS:
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 1627edd50810..1cfbe857ba27 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -130,7 +130,7 @@ static int reiserfs_sync_file(struct file *p_s_filp,
130 reiserfs_write_lock(p_s_inode->i_sb); 130 reiserfs_write_lock(p_s_inode->i_sb);
131 barrier_done = reiserfs_commit_for_inode(p_s_inode); 131 barrier_done = reiserfs_commit_for_inode(p_s_inode);
132 reiserfs_write_unlock(p_s_inode->i_sb); 132 reiserfs_write_unlock(p_s_inode->i_sb);
133 if (barrier_done != 1) 133 if (barrier_done != 1 && reiserfs_barrier_flush(p_s_inode->i_sb))
134 blkdev_issue_flush(p_s_inode->i_sb->s_bdev, NULL); 134 blkdev_issue_flush(p_s_inode->i_sb->s_bdev, NULL);
135 if (barrier_done < 0) 135 if (barrier_done < 0)
136 return barrier_done; 136 return barrier_done;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 52f1e2136546..7e5a2f5ebeb0 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -17,8 +17,6 @@
17#include <linux/writeback.h> 17#include <linux/writeback.h>
18#include <linux/quotaops.h> 18#include <linux/quotaops.h>
19 19
20extern int reiserfs_default_io_size; /* default io size devuned in super.c */
21
22static int reiserfs_commit_write(struct file *f, struct page *page, 20static int reiserfs_commit_write(struct file *f, struct page *page,
23 unsigned from, unsigned to); 21 unsigned from, unsigned to);
24static int reiserfs_prepare_write(struct file *f, struct page *page, 22static int reiserfs_prepare_write(struct file *f, struct page *page,
@@ -1122,7 +1120,6 @@ static void init_inode(struct inode *inode, struct path *path)
1122 ih = PATH_PITEM_HEAD(path); 1120 ih = PATH_PITEM_HEAD(path);
1123 1121
1124 copy_key(INODE_PKEY(inode), &(ih->ih_key)); 1122 copy_key(INODE_PKEY(inode), &(ih->ih_key));
1125 inode->i_blksize = reiserfs_default_io_size;
1126 1123
1127 INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list)); 1124 INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list));
1128 REISERFS_I(inode)->i_flags = 0; 1125 REISERFS_I(inode)->i_flags = 0;
@@ -1130,9 +1127,9 @@ static void init_inode(struct inode *inode, struct path *path)
1130 REISERFS_I(inode)->i_prealloc_count = 0; 1127 REISERFS_I(inode)->i_prealloc_count = 0;
1131 REISERFS_I(inode)->i_trans_id = 0; 1128 REISERFS_I(inode)->i_trans_id = 0;
1132 REISERFS_I(inode)->i_jl = NULL; 1129 REISERFS_I(inode)->i_jl = NULL;
1133 REISERFS_I(inode)->i_acl_access = NULL; 1130 reiserfs_init_acl_access(inode);
1134 REISERFS_I(inode)->i_acl_default = NULL; 1131 reiserfs_init_acl_default(inode);
1135 init_rwsem(&REISERFS_I(inode)->xattr_sem); 1132 reiserfs_init_xattr_rwsem(inode);
1136 1133
1137 if (stat_data_v1(ih)) { 1134 if (stat_data_v1(ih)) {
1138 struct stat_data_v1 *sd = 1135 struct stat_data_v1 *sd =
@@ -1837,9 +1834,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1837 REISERFS_I(inode)->i_attrs = 1834 REISERFS_I(inode)->i_attrs =
1838 REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK; 1835 REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
1839 sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode); 1836 sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode);
1840 REISERFS_I(inode)->i_acl_access = NULL; 1837 reiserfs_init_acl_access(inode);
1841 REISERFS_I(inode)->i_acl_default = NULL; 1838 reiserfs_init_acl_default(inode);
1842 init_rwsem(&REISERFS_I(inode)->xattr_sem); 1839 reiserfs_init_xattr_rwsem(inode);
1843 1840
1844 if (old_format_only(sb)) 1841 if (old_format_only(sb))
1845 make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET, 1842 make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
@@ -1877,7 +1874,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1877 } 1874 }
1878 // these do not go to on-disk stat data 1875 // these do not go to on-disk stat data
1879 inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid); 1876 inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
1880 inode->i_blksize = reiserfs_default_io_size;
1881 1877
1882 // store in in-core inode the key of stat data and version all 1878 // store in in-core inode the key of stat data and version all
1883 // object items will have (directory items will have old offset 1879 // object items will have (directory items will have old offset
@@ -1978,11 +1974,13 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1978 * iput doesn't deadlock in reiserfs_delete_xattrs. The locking 1974 * iput doesn't deadlock in reiserfs_delete_xattrs. The locking
1979 * code really needs to be reworked, but this will take care of it 1975 * code really needs to be reworked, but this will take care of it
1980 * for now. -jeffm */ 1976 * for now. -jeffm */
1977#ifdef CONFIG_REISERFS_FS_POSIX_ACL
1981 if (REISERFS_I(dir)->i_acl_default && !IS_ERR(REISERFS_I(dir)->i_acl_default)) { 1978 if (REISERFS_I(dir)->i_acl_default && !IS_ERR(REISERFS_I(dir)->i_acl_default)) {
1982 reiserfs_write_unlock_xattrs(dir->i_sb); 1979 reiserfs_write_unlock_xattrs(dir->i_sb);
1983 iput(inode); 1980 iput(inode);
1984 reiserfs_write_lock_xattrs(dir->i_sb); 1981 reiserfs_write_lock_xattrs(dir->i_sb);
1985 } else 1982 } else
1983#endif
1986 iput(inode); 1984 iput(inode);
1987 return err; 1985 return err;
1988} 1986}
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 9b3672d69367..e6b5ccf23f15 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -1186,6 +1186,21 @@ static struct reiserfs_journal_list *find_newer_jl_for_cn(struct
1186 return NULL; 1186 return NULL;
1187} 1187}
1188 1188
1189static int newer_jl_done(struct reiserfs_journal_cnode *cn)
1190{
1191 struct super_block *sb = cn->sb;
1192 b_blocknr_t blocknr = cn->blocknr;
1193
1194 cn = cn->hprev;
1195 while (cn) {
1196 if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist &&
1197 atomic_read(&cn->jlist->j_commit_left) != 0)
1198 return 0;
1199 cn = cn->hprev;
1200 }
1201 return 1;
1202}
1203
1189static void remove_journal_hash(struct super_block *, 1204static void remove_journal_hash(struct super_block *,
1190 struct reiserfs_journal_cnode **, 1205 struct reiserfs_journal_cnode **,
1191 struct reiserfs_journal_list *, unsigned long, 1206 struct reiserfs_journal_list *, unsigned long,
@@ -1604,6 +1619,31 @@ static int flush_journal_list(struct super_block *s,
1604 return err; 1619 return err;
1605} 1620}
1606 1621
1622static int test_transaction(struct super_block *s,
1623 struct reiserfs_journal_list *jl)
1624{
1625 struct reiserfs_journal_cnode *cn;
1626
1627 if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0)
1628 return 1;
1629
1630 cn = jl->j_realblock;
1631 while (cn) {
1632 /* if the blocknr == 0, this has been cleared from the hash,
1633 ** skip it
1634 */
1635 if (cn->blocknr == 0) {
1636 goto next;
1637 }
1638 if (cn->bh && !newer_jl_done(cn))
1639 return 0;
1640 next:
1641 cn = cn->next;
1642 cond_resched();
1643 }
1644 return 0;
1645}
1646
1607static int write_one_transaction(struct super_block *s, 1647static int write_one_transaction(struct super_block *s,
1608 struct reiserfs_journal_list *jl, 1648 struct reiserfs_journal_list *jl,
1609 struct buffer_chunk *chunk) 1649 struct buffer_chunk *chunk)
@@ -3433,16 +3473,6 @@ static void flush_async_commits(void *p)
3433 flush_commit_list(p_s_sb, jl, 1); 3473 flush_commit_list(p_s_sb, jl, 1);
3434 } 3474 }
3435 unlock_kernel(); 3475 unlock_kernel();
3436 /*
3437 * this is a little racey, but there's no harm in missing
3438 * the filemap_fdata_write
3439 */
3440 if (!atomic_read(&journal->j_async_throttle)
3441 && !reiserfs_is_journal_aborted(journal)) {
3442 atomic_inc(&journal->j_async_throttle);
3443 filemap_fdatawrite(p_s_sb->s_bdev->bd_inode->i_mapping);
3444 atomic_dec(&journal->j_async_throttle);
3445 }
3446} 3476}
3447 3477
3448/* 3478/*
@@ -3844,7 +3874,9 @@ static void flush_old_journal_lists(struct super_block *s)
3844 entry = journal->j_journal_list.next; 3874 entry = journal->j_journal_list.next;
3845 jl = JOURNAL_LIST_ENTRY(entry); 3875 jl = JOURNAL_LIST_ENTRY(entry);
3846 /* this check should always be run, to send old lists to disk */ 3876 /* this check should always be run, to send old lists to disk */
3847 if (jl->j_timestamp < (now - (JOURNAL_MAX_TRANS_AGE * 4))) { 3877 if (jl->j_timestamp < (now - (JOURNAL_MAX_TRANS_AGE * 4)) &&
3878 atomic_read(&jl->j_commit_left) == 0 &&
3879 test_transaction(s, jl)) {
3848 flush_used_journal_lists(s, jl); 3880 flush_used_journal_lists(s, jl);
3849 } else { 3881 } else {
3850 break; 3882 break;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 5567328f1041..80fc3b32802f 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -510,8 +510,10 @@ static void init_once(void *foo, kmem_cache_t * cachep, unsigned long flags)
510 SLAB_CTOR_CONSTRUCTOR) { 510 SLAB_CTOR_CONSTRUCTOR) {
511 INIT_LIST_HEAD(&ei->i_prealloc_list); 511 INIT_LIST_HEAD(&ei->i_prealloc_list);
512 inode_init_once(&ei->vfs_inode); 512 inode_init_once(&ei->vfs_inode);
513#ifdef CONFIG_REISERFS_FS_POSIX_ACL
513 ei->i_acl_access = NULL; 514 ei->i_acl_access = NULL;
514 ei->i_acl_default = NULL; 515 ei->i_acl_default = NULL;
516#endif
515 } 517 }
516} 518}
517 519
@@ -530,9 +532,7 @@ static int init_inodecache(void)
530 532
531static void destroy_inodecache(void) 533static void destroy_inodecache(void)
532{ 534{
533 if (kmem_cache_destroy(reiserfs_inode_cachep)) 535 kmem_cache_destroy(reiserfs_inode_cachep);
534 reiserfs_warning(NULL,
535 "reiserfs_inode_cache: not all structures were freed");
536} 536}
537 537
538/* we don't mark inodes dirty, we just log them */ 538/* we don't mark inodes dirty, we just log them */
@@ -562,6 +562,7 @@ static void reiserfs_dirty_inode(struct inode *inode)
562 reiserfs_write_unlock(inode->i_sb); 562 reiserfs_write_unlock(inode->i_sb);
563} 563}
564 564
565#ifdef CONFIG_REISERFS_FS_POSIX_ACL
565static void reiserfs_clear_inode(struct inode *inode) 566static void reiserfs_clear_inode(struct inode *inode)
566{ 567{
567 struct posix_acl *acl; 568 struct posix_acl *acl;
@@ -576,6 +577,9 @@ static void reiserfs_clear_inode(struct inode *inode)
576 posix_acl_release(acl); 577 posix_acl_release(acl);
577 REISERFS_I(inode)->i_acl_default = NULL; 578 REISERFS_I(inode)->i_acl_default = NULL;
578} 579}
580#else
581#define reiserfs_clear_inode NULL
582#endif
579 583
580#ifdef CONFIG_QUOTA 584#ifdef CONFIG_QUOTA
581static ssize_t reiserfs_quota_write(struct super_block *, int, const char *, 585static ssize_t reiserfs_quota_write(struct super_block *, int, const char *,
@@ -725,12 +729,6 @@ static const arg_desc_t error_actions[] = {
725 {NULL, 0, 0}, 729 {NULL, 0, 0},
726}; 730};
727 731
728int reiserfs_default_io_size = 128 * 1024; /* Default recommended I/O size is 128k.
729 There might be broken applications that are
730 confused by this. Use nolargeio mount option
731 to get usual i/o size = PAGE_SIZE.
732 */
733
734/* proceed only one option from a list *cur - string containing of mount options 732/* proceed only one option from a list *cur - string containing of mount options
735 opts - array of options which are accepted 733 opts - array of options which are accepted
736 opt_arg - if option is found and requires an argument and if it is specifed 734 opt_arg - if option is found and requires an argument and if it is specifed
@@ -959,19 +957,8 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
959 } 957 }
960 958
961 if (c == 'w') { 959 if (c == 'w') {
962 char *p = NULL; 960 reiserfs_warning(s, "reiserfs: nolargeio option is no longer supported");
963 int val = simple_strtoul(arg, &p, 0); 961 return 0;
964
965 if (*p != '\0') {
966 reiserfs_warning(s,
967 "reiserfs_parse_options: non-numeric value %s for nolargeio option",
968 arg);
969 return 0;
970 }
971 if (val)
972 reiserfs_default_io_size = PAGE_SIZE;
973 else
974 reiserfs_default_io_size = 128 * 1024;
975 } 962 }
976 963
977 if (c == 'j') { 964 if (c == 'j') {
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index 22eed61ebf69..ddcd9e1ef282 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -589,8 +589,7 @@ static int init_inodecache(void)
589 589
590static void destroy_inodecache(void) 590static void destroy_inodecache(void)
591{ 591{
592 if (kmem_cache_destroy(romfs_inode_cachep)) 592 kmem_cache_destroy(romfs_inode_cachep);
593 printk(KERN_INFO "romfs_inode_cache: not all structures were freed\n");
594} 593}
595 594
596static int romfs_remount(struct super_block *sb, int *flags, char *data) 595static int romfs_remount(struct super_block *sb, int *flags, char *data)
diff --git a/fs/select.c b/fs/select.c
index 33b72ba0f86f..dcbc1112b7ec 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -658,8 +658,6 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout)
658 unsigned int i; 658 unsigned int i;
659 struct poll_list *head; 659 struct poll_list *head;
660 struct poll_list *walk; 660 struct poll_list *walk;
661 struct fdtable *fdt;
662 int max_fdset;
663 /* Allocate small arguments on the stack to save memory and be 661 /* Allocate small arguments on the stack to save memory and be
664 faster - use long to make sure the buffer is aligned properly 662 faster - use long to make sure the buffer is aligned properly
665 on 64 bit archs to avoid unaligned access */ 663 on 64 bit archs to avoid unaligned access */
@@ -667,11 +665,7 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout)
667 struct poll_list *stack_pp = NULL; 665 struct poll_list *stack_pp = NULL;
668 666
669 /* Do a sanity check on nfds ... */ 667 /* Do a sanity check on nfds ... */
670 rcu_read_lock(); 668 if (nfds > current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
671 fdt = files_fdtable(current->files);
672 max_fdset = fdt->max_fdset;
673 rcu_read_unlock();
674 if (nfds > max_fdset && nfds > OPEN_MAX)
675 return -EINVAL; 669 return -EINVAL;
676 670
677 poll_initwait(&table); 671 poll_initwait(&table);
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index a1ed657c3c84..2c122ee83adb 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -89,8 +89,7 @@ static int init_inodecache(void)
89 89
90static void destroy_inodecache(void) 90static void destroy_inodecache(void)
91{ 91{
92 if (kmem_cache_destroy(smb_inode_cachep)) 92 kmem_cache_destroy(smb_inode_cachep);
93 printk(KERN_INFO "smb_inode_cache: not all structures were freed\n");
94} 93}
95 94
96static int smb_remount(struct super_block *sb, int *flags, char *data) 95static int smb_remount(struct super_block *sb, int *flags, char *data)
@@ -167,7 +166,6 @@ smb_get_inode_attr(struct inode *inode, struct smb_fattr *fattr)
167 fattr->f_mtime = inode->i_mtime; 166 fattr->f_mtime = inode->i_mtime;
168 fattr->f_ctime = inode->i_ctime; 167 fattr->f_ctime = inode->i_ctime;
169 fattr->f_atime = inode->i_atime; 168 fattr->f_atime = inode->i_atime;
170 fattr->f_blksize= inode->i_blksize;
171 fattr->f_blocks = inode->i_blocks; 169 fattr->f_blocks = inode->i_blocks;
172 170
173 fattr->attr = SMB_I(inode)->attr; 171 fattr->attr = SMB_I(inode)->attr;
@@ -201,7 +199,6 @@ smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr)
201 inode->i_uid = fattr->f_uid; 199 inode->i_uid = fattr->f_uid;
202 inode->i_gid = fattr->f_gid; 200 inode->i_gid = fattr->f_gid;
203 inode->i_ctime = fattr->f_ctime; 201 inode->i_ctime = fattr->f_ctime;
204 inode->i_blksize= fattr->f_blksize;
205 inode->i_blocks = fattr->f_blocks; 202 inode->i_blocks = fattr->f_blocks;
206 inode->i_size = fattr->f_size; 203 inode->i_size = fattr->f_size;
207 inode->i_mtime = fattr->f_mtime; 204 inode->i_mtime = fattr->f_mtime;
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
index c3495059889d..40e174db9872 100644
--- a/fs/smbfs/proc.c
+++ b/fs/smbfs/proc.c
@@ -1826,7 +1826,6 @@ smb_init_dirent(struct smb_sb_info *server, struct smb_fattr *fattr)
1826 fattr->f_nlink = 1; 1826 fattr->f_nlink = 1;
1827 fattr->f_uid = server->mnt->uid; 1827 fattr->f_uid = server->mnt->uid;
1828 fattr->f_gid = server->mnt->gid; 1828 fattr->f_gid = server->mnt->gid;
1829 fattr->f_blksize = SMB_ST_BLKSIZE;
1830 fattr->f_unix = 0; 1829 fattr->f_unix = 0;
1831} 1830}
1832 1831
diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c
index c8e96195b96e..0fb74697abc4 100644
--- a/fs/smbfs/request.c
+++ b/fs/smbfs/request.c
@@ -49,8 +49,7 @@ int smb_init_request_cache(void)
49 49
50void smb_destroy_request_cache(void) 50void smb_destroy_request_cache(void)
51{ 51{
52 if (kmem_cache_destroy(req_cachep)) 52 kmem_cache_destroy(req_cachep);
53 printk(KERN_INFO "smb_destroy_request_cache: not all structures were freed\n");
54} 53}
55 54
56/* 55/*
diff --git a/fs/stat.c b/fs/stat.c
index 3a44dcf97da2..60a31d5e5966 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -14,6 +14,7 @@
14#include <linux/namei.h> 14#include <linux/namei.h>
15#include <linux/security.h> 15#include <linux/security.h>
16#include <linux/syscalls.h> 16#include <linux/syscalls.h>
17#include <linux/pagemap.h>
17 18
18#include <asm/uaccess.h> 19#include <asm/uaccess.h>
19#include <asm/unistd.h> 20#include <asm/unistd.h>
@@ -32,7 +33,7 @@ void generic_fillattr(struct inode *inode, struct kstat *stat)
32 stat->ctime = inode->i_ctime; 33 stat->ctime = inode->i_ctime;
33 stat->size = i_size_read(inode); 34 stat->size = i_size_read(inode);
34 stat->blocks = inode->i_blocks; 35 stat->blocks = inode->i_blocks;
35 stat->blksize = inode->i_blksize; 36 stat->blksize = (1 << inode->i_blkbits);
36} 37}
37 38
38EXPORT_SYMBOL(generic_fillattr); 39EXPORT_SYMBOL(generic_fillattr);
diff --git a/fs/super.c b/fs/super.c
index 5c4c94d5495e..6987824d0dce 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -199,7 +199,7 @@ EXPORT_SYMBOL(deactivate_super);
199 * success, 0 if we had failed (superblock contents was already dead or 199 * success, 0 if we had failed (superblock contents was already dead or
200 * dying when grab_super() had been called). 200 * dying when grab_super() had been called).
201 */ 201 */
202static int grab_super(struct super_block *s) 202static int grab_super(struct super_block *s) __releases(sb_lock)
203{ 203{
204 s->s_count++; 204 s->s_count++;
205 spin_unlock(&sb_lock); 205 spin_unlock(&sb_lock);
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index c16a93c353c0..98022e41cda1 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -10,6 +10,7 @@
10 10
11#include <linux/errno.h> 11#include <linux/errno.h>
12#include <linux/fs.h> 12#include <linux/fs.h>
13#include <linux/kernel.h>
13#include <linux/kobject.h> 14#include <linux/kobject.h>
14#include <linux/module.h> 15#include <linux/module.h>
15#include <linux/slab.h> 16#include <linux/slab.h>
@@ -176,7 +177,6 @@ const struct file_operations bin_fops = {
176 * sysfs_create_bin_file - create binary file for object. 177 * sysfs_create_bin_file - create binary file for object.
177 * @kobj: object. 178 * @kobj: object.
178 * @attr: attribute descriptor. 179 * @attr: attribute descriptor.
179 *
180 */ 180 */
181 181
182int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr) 182int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr)
@@ -191,13 +191,16 @@ int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr)
191 * sysfs_remove_bin_file - remove binary file for object. 191 * sysfs_remove_bin_file - remove binary file for object.
192 * @kobj: object. 192 * @kobj: object.
193 * @attr: attribute descriptor. 193 * @attr: attribute descriptor.
194 *
195 */ 194 */
196 195
197int sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr) 196void sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr)
198{ 197{
199 sysfs_hash_and_remove(kobj->dentry,attr->attr.name); 198 if (sysfs_hash_and_remove(kobj->dentry, attr->attr.name) < 0) {
200 return 0; 199 printk(KERN_ERR "%s: "
200 "bad dentry or inode or no such file: \"%s\"\n",
201 __FUNCTION__, attr->attr.name);
202 dump_stack();
203 }
201} 204}
202 205
203EXPORT_SYMBOL_GPL(sysfs_create_bin_file); 206EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 61c42430cba3..5f3d725d1125 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -43,7 +43,7 @@ static struct sysfs_dirent * sysfs_new_dirent(struct sysfs_dirent * parent_sd,
43 43
44 memset(sd, 0, sizeof(*sd)); 44 memset(sd, 0, sizeof(*sd));
45 atomic_set(&sd->s_count, 1); 45 atomic_set(&sd->s_count, 1);
46 atomic_set(&sd->s_event, 0); 46 atomic_set(&sd->s_event, 1);
47 INIT_LIST_HEAD(&sd->s_children); 47 INIT_LIST_HEAD(&sd->s_children);
48 list_add(&sd->s_sibling, &parent_sd->s_children); 48 list_add(&sd->s_sibling, &parent_sd->s_children);
49 sd->s_element = element; 49 sd->s_element = element;
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 9889e54e1f13..e79e38d52c00 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -12,6 +12,7 @@
12#include <linux/namei.h> 12#include <linux/namei.h>
13#include <linux/backing-dev.h> 13#include <linux/backing-dev.h>
14#include <linux/capability.h> 14#include <linux/capability.h>
15#include <linux/errno.h>
15#include "sysfs.h" 16#include "sysfs.h"
16 17
17extern struct super_block * sysfs_sb; 18extern struct super_block * sysfs_sb;
@@ -124,7 +125,6 @@ struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent * sd)
124{ 125{
125 struct inode * inode = new_inode(sysfs_sb); 126 struct inode * inode = new_inode(sysfs_sb);
126 if (inode) { 127 if (inode) {
127 inode->i_blksize = PAGE_CACHE_SIZE;
128 inode->i_blocks = 0; 128 inode->i_blocks = 0;
129 inode->i_mapping->a_ops = &sysfs_aops; 129 inode->i_mapping->a_ops = &sysfs_aops;
130 inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; 130 inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info;
@@ -234,17 +234,18 @@ void sysfs_drop_dentry(struct sysfs_dirent * sd, struct dentry * parent)
234 } 234 }
235} 235}
236 236
237void sysfs_hash_and_remove(struct dentry * dir, const char * name) 237int sysfs_hash_and_remove(struct dentry * dir, const char * name)
238{ 238{
239 struct sysfs_dirent * sd; 239 struct sysfs_dirent * sd;
240 struct sysfs_dirent * parent_sd; 240 struct sysfs_dirent * parent_sd;
241 int found = 0;
241 242
242 if (!dir) 243 if (!dir)
243 return; 244 return -ENOENT;
244 245
245 if (dir->d_inode == NULL) 246 if (dir->d_inode == NULL)
246 /* no inode means this hasn't been made visible yet */ 247 /* no inode means this hasn't been made visible yet */
247 return; 248 return -ENOENT;
248 249
249 parent_sd = dir->d_fsdata; 250 parent_sd = dir->d_fsdata;
250 mutex_lock(&dir->d_inode->i_mutex); 251 mutex_lock(&dir->d_inode->i_mutex);
@@ -255,8 +256,11 @@ void sysfs_hash_and_remove(struct dentry * dir, const char * name)
255 list_del_init(&sd->s_sibling); 256 list_del_init(&sd->s_sibling);
256 sysfs_drop_dentry(sd, dir); 257 sysfs_drop_dentry(sd, dir);
257 sysfs_put(sd); 258 sysfs_put(sd);
259 found = 1;
258 break; 260 break;
259 } 261 }
260 } 262 }
261 mutex_unlock(&dir->d_inode->i_mutex); 263 mutex_unlock(&dir->d_inode->i_mutex);
264
265 return found ? 0 : -ENOENT;
262} 266}
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index d2eac3ceed5f..f50e3cc2ded8 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -3,6 +3,7 @@
3 */ 3 */
4 4
5#include <linux/fs.h> 5#include <linux/fs.h>
6#include <linux/mount.h>
6#include <linux/module.h> 7#include <linux/module.h>
7#include <linux/kobject.h> 8#include <linux/kobject.h>
8#include <linux/namei.h> 9#include <linux/namei.h>
@@ -82,10 +83,19 @@ exit1:
82 */ 83 */
83int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char * name) 84int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char * name)
84{ 85{
85 struct dentry * dentry = kobj->dentry; 86 struct dentry *dentry = NULL;
86 int error = -EEXIST; 87 int error = -EEXIST;
87 88
88 BUG_ON(!kobj || !kobj->dentry || !name); 89 BUG_ON(!name);
90
91 if (!kobj) {
92 if (sysfs_mount && sysfs_mount->mnt_sb)
93 dentry = sysfs_mount->mnt_sb->s_root;
94 } else
95 dentry = kobj->dentry;
96
97 if (!dentry)
98 return -EFAULT;
89 99
90 mutex_lock(&dentry->d_inode->i_mutex); 100 mutex_lock(&dentry->d_inode->i_mutex);
91 if (!sysfs_dirent_exist(dentry->d_fsdata, name)) 101 if (!sysfs_dirent_exist(dentry->d_fsdata, name))
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 3651ffb5ec09..6f3d6bd52887 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -10,7 +10,7 @@ extern int sysfs_make_dirent(struct sysfs_dirent *, struct dentry *, void *,
10 umode_t, int); 10 umode_t, int);
11 11
12extern int sysfs_add_file(struct dentry *, const struct attribute *, int); 12extern int sysfs_add_file(struct dentry *, const struct attribute *, int);
13extern void sysfs_hash_and_remove(struct dentry * dir, const char * name); 13extern int sysfs_hash_and_remove(struct dentry * dir, const char * name);
14extern struct sysfs_dirent *sysfs_find(struct sysfs_dirent *dir, const char * name); 14extern struct sysfs_dirent *sysfs_find(struct sysfs_dirent *dir, const char * name);
15 15
16extern int sysfs_create_subdir(struct kobject *, const char *, struct dentry **); 16extern int sysfs_create_subdir(struct kobject *, const char *, struct dentry **);
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index 9b585d1081c0..115ab0d6f4bc 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -170,7 +170,7 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode)
170 inode->i_uid = current->fsuid; 170 inode->i_uid = current->fsuid;
171 inode->i_ino = fs16_to_cpu(sbi, ino); 171 inode->i_ino = fs16_to_cpu(sbi, ino);
172 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 172 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
173 inode->i_blocks = inode->i_blksize = 0; 173 inode->i_blocks = 0;
174 memset(SYSV_I(inode)->i_data, 0, sizeof(SYSV_I(inode)->i_data)); 174 memset(SYSV_I(inode)->i_data, 0, sizeof(SYSV_I(inode)->i_data));
175 SYSV_I(inode)->i_dir_start_lookup = 0; 175 SYSV_I(inode)->i_dir_start_lookup = 0;
176 insert_inode_hash(inode); 176 insert_inode_hash(inode);
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 58b2d22142ba..d63c5e48b050 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -201,7 +201,7 @@ static void sysv_read_inode(struct inode *inode)
201 inode->i_ctime.tv_nsec = 0; 201 inode->i_ctime.tv_nsec = 0;
202 inode->i_atime.tv_nsec = 0; 202 inode->i_atime.tv_nsec = 0;
203 inode->i_mtime.tv_nsec = 0; 203 inode->i_mtime.tv_nsec = 0;
204 inode->i_blocks = inode->i_blksize = 0; 204 inode->i_blocks = 0;
205 205
206 si = SYSV_I(inode); 206 si = SYSV_I(inode);
207 for (block = 0; block < 10+1+1+1; block++) 207 for (block = 0; block < 10+1+1+1; block++)
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index 876639b93321..350cba5d6803 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -369,10 +369,9 @@ static int sysv_fill_super(struct super_block *sb, void *data, int silent)
369 if (64 != sizeof (struct sysv_inode)) 369 if (64 != sizeof (struct sysv_inode))
370 panic("sysv fs: bad inode size"); 370 panic("sysv fs: bad inode size");
371 371
372 sbi = kmalloc(sizeof(struct sysv_sb_info), GFP_KERNEL); 372 sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL);
373 if (!sbi) 373 if (!sbi)
374 return -ENOMEM; 374 return -ENOMEM;
375 memset(sbi, 0, sizeof(struct sysv_sb_info));
376 375
377 sbi->s_sb = sb; 376 sbi->s_sb = sb;
378 sbi->s_block_base = 0; 377 sbi->s_block_base = 0;
@@ -453,10 +452,9 @@ static int v7_fill_super(struct super_block *sb, void *data, int silent)
453 if (64 != sizeof (struct sysv_inode)) 452 if (64 != sizeof (struct sysv_inode))
454 panic("sysv fs: bad i-node size"); 453 panic("sysv fs: bad i-node size");
455 454
456 sbi = kmalloc(sizeof(struct sysv_sb_info), GFP_KERNEL); 455 sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL);
457 if (!sbi) 456 if (!sbi)
458 return -ENOMEM; 457 return -ENOMEM;
459 memset(sbi, 0, sizeof(struct sysv_sb_info));
460 458
461 sbi->s_sb = sb; 459 sbi->s_sb = sb;
462 sbi->s_block_base = 0; 460 sbi->s_block_base = 0;
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 33323473e3c4..8206983f2ebf 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -121,7 +121,6 @@ struct inode * udf_new_inode (struct inode *dir, int mode, int * err)
121 UDF_I_LOCATION(inode).logicalBlockNum = block; 121 UDF_I_LOCATION(inode).logicalBlockNum = block;
122 UDF_I_LOCATION(inode).partitionReferenceNum = UDF_I_LOCATION(dir).partitionReferenceNum; 122 UDF_I_LOCATION(inode).partitionReferenceNum = UDF_I_LOCATION(dir).partitionReferenceNum;
123 inode->i_ino = udf_get_lb_pblock(sb, UDF_I_LOCATION(inode), 0); 123 inode->i_ino = udf_get_lb_pblock(sb, UDF_I_LOCATION(inode), 0);
124 inode->i_blksize = PAGE_SIZE;
125 inode->i_blocks = 0; 124 inode->i_blocks = 0;
126 UDF_I_LENEATTR(inode) = 0; 125 UDF_I_LENEATTR(inode) = 0;
127 UDF_I_LENALLOC(inode) = 0; 126 UDF_I_LENALLOC(inode) = 0;
@@ -130,14 +129,12 @@ struct inode * udf_new_inode (struct inode *dir, int mode, int * err)
130 { 129 {
131 UDF_I_EFE(inode) = 1; 130 UDF_I_EFE(inode) = 1;
132 UDF_UPDATE_UDFREV(inode->i_sb, UDF_VERS_USE_EXTENDED_FE); 131 UDF_UPDATE_UDFREV(inode->i_sb, UDF_VERS_USE_EXTENDED_FE);
133 UDF_I_DATA(inode) = kmalloc(inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry), GFP_KERNEL); 132 UDF_I_DATA(inode) = kzalloc(inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry), GFP_KERNEL);
134 memset(UDF_I_DATA(inode), 0x00, inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry));
135 } 133 }
136 else 134 else
137 { 135 {
138 UDF_I_EFE(inode) = 0; 136 UDF_I_EFE(inode) = 0;
139 UDF_I_DATA(inode) = kmalloc(inode->i_sb->s_blocksize - sizeof(struct fileEntry), GFP_KERNEL); 137 UDF_I_DATA(inode) = kzalloc(inode->i_sb->s_blocksize - sizeof(struct fileEntry), GFP_KERNEL);
140 memset(UDF_I_DATA(inode), 0x00, inode->i_sb->s_blocksize - sizeof(struct fileEntry));
141 } 138 }
142 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB)) 139 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB))
143 UDF_I_ALLOCTYPE(inode) = ICBTAG_FLAG_AD_IN_ICB; 140 UDF_I_ALLOCTYPE(inode) = ICBTAG_FLAG_AD_IN_ICB;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 605f5111b6d8..b223b32db991 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -916,8 +916,6 @@ __udf_read_inode(struct inode *inode)
916 * i_nlink = 1 916 * i_nlink = 1
917 * i_op = NULL; 917 * i_op = NULL;
918 */ 918 */
919 inode->i_blksize = PAGE_SIZE;
920
921 bh = udf_read_ptagged(inode->i_sb, UDF_I_LOCATION(inode), 0, &ident); 919 bh = udf_read_ptagged(inode->i_sb, UDF_I_LOCATION(inode), 0, &ident);
922 920
923 if (!bh) 921 if (!bh)
diff --git a/fs/udf/super.c b/fs/udf/super.c
index fcce1a21a51b..1d3b5d2070e5 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -156,8 +156,7 @@ static int init_inodecache(void)
156 156
157static void destroy_inodecache(void) 157static void destroy_inodecache(void)
158{ 158{
159 if (kmem_cache_destroy(udf_inode_cachep)) 159 kmem_cache_destroy(udf_inode_cachep);
160 printk(KERN_INFO "udf_inode_cache: not all structures were freed\n");
161} 160}
162 161
163/* Superblock operations */ 162/* Superblock operations */
@@ -1622,6 +1621,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1622 goto error_out; 1621 goto error_out;
1623 } 1622 }
1624 1623
1624 if (UDF_SB_PARTFLAGS(sb, UDF_SB_PARTITION(sb)) & UDF_PART_FLAG_READ_ONLY)
1625 printk("UDF-fs: Partition marked readonly; forcing readonly mount\n");
1626 sb->s_flags |= MS_RDONLY;
1627
1625 if ( udf_find_fileset(sb, &fileset, &rootdir) ) 1628 if ( udf_find_fileset(sb, &fileset, &rootdir) )
1626 { 1629 {
1627 printk("UDF-fs: No fileset found\n"); 1630 printk("UDF-fs: No fileset found\n");
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 9501dcd3b213..2ad1259c6eca 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -255,7 +255,6 @@ cg_found:
255 inode->i_gid = current->fsgid; 255 inode->i_gid = current->fsgid;
256 256
257 inode->i_ino = cg * uspi->s_ipg + bit; 257 inode->i_ino = cg * uspi->s_ipg + bit;
258 inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size (for stat), not the fs block size */
259 inode->i_blocks = 0; 258 inode->i_blocks = 0;
260 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 259 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
261 ufsi->i_flags = UFS_I(dir)->i_flags; 260 ufsi->i_flags = UFS_I(dir)->i_flags;
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 30c6e8a9446c..ee1eaa6f4ec2 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -741,7 +741,6 @@ void ufs_read_inode(struct inode * inode)
741 ufs1_read_inode(inode, ufs_inode + ufs_inotofsbo(inode->i_ino)); 741 ufs1_read_inode(inode, ufs_inode + ufs_inotofsbo(inode->i_ino));
742 } 742 }
743 743
744 inode->i_blksize = PAGE_SIZE;/*This is the optimal IO size (for stat)*/
745 inode->i_version++; 744 inode->i_version++;
746 ufsi->i_lastfrag = 745 ufsi->i_lastfrag =
747 (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift; 746 (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 992ee0b87cc3..ec79e3091d1b 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -611,11 +611,10 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
611 611
612 UFSD("ENTER\n"); 612 UFSD("ENTER\n");
613 613
614 sbi = kmalloc(sizeof(struct ufs_sb_info), GFP_KERNEL); 614 sbi = kzalloc(sizeof(struct ufs_sb_info), GFP_KERNEL);
615 if (!sbi) 615 if (!sbi)
616 goto failed_nomem; 616 goto failed_nomem;
617 sb->s_fs_info = sbi; 617 sb->s_fs_info = sbi;
618 memset(sbi, 0, sizeof(struct ufs_sb_info));
619 618
620 UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY)); 619 UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY));
621 620
@@ -1245,8 +1244,7 @@ static int init_inodecache(void)
1245 1244
1246static void destroy_inodecache(void) 1245static void destroy_inodecache(void)
1247{ 1246{
1248 if (kmem_cache_destroy(ufs_inode_cachep)) 1247 kmem_cache_destroy(ufs_inode_cachep);
1249 printk(KERN_INFO "ufs_inode_cache: not all structures were freed\n");
1250} 1248}
1251 1249
1252#ifdef CONFIG_QUOTA 1250#ifdef CONFIG_QUOTA
diff --git a/fs/xfs/Makefile-linux-2.6 b/fs/xfs/Makefile-linux-2.6
index 9e7f85986d0d..291948d5085a 100644
--- a/fs/xfs/Makefile-linux-2.6
+++ b/fs/xfs/Makefile-linux-2.6
@@ -30,7 +30,6 @@ ifeq ($(CONFIG_XFS_TRACE),y)
30 EXTRA_CFLAGS += -DXFS_BLI_TRACE 30 EXTRA_CFLAGS += -DXFS_BLI_TRACE
31 EXTRA_CFLAGS += -DXFS_BMAP_TRACE 31 EXTRA_CFLAGS += -DXFS_BMAP_TRACE
32 EXTRA_CFLAGS += -DXFS_BMBT_TRACE 32 EXTRA_CFLAGS += -DXFS_BMBT_TRACE
33 EXTRA_CFLAGS += -DXFS_DIR_TRACE
34 EXTRA_CFLAGS += -DXFS_DIR2_TRACE 33 EXTRA_CFLAGS += -DXFS_DIR2_TRACE
35 EXTRA_CFLAGS += -DXFS_DQUOT_TRACE 34 EXTRA_CFLAGS += -DXFS_DQUOT_TRACE
36 EXTRA_CFLAGS += -DXFS_ILOCK_TRACE 35 EXTRA_CFLAGS += -DXFS_ILOCK_TRACE
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index aba7fcf881a2..d59737589815 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -34,6 +34,14 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
34 gfp_t lflags = kmem_flags_convert(flags); 34 gfp_t lflags = kmem_flags_convert(flags);
35 void *ptr; 35 void *ptr;
36 36
37#ifdef DEBUG
38 if (unlikely(!(flags & KM_LARGE) && (size > PAGE_SIZE))) {
39 printk(KERN_WARNING "Large %s attempt, size=%ld\n",
40 __FUNCTION__, (long)size);
41 dump_stack();
42 }
43#endif
44
37 do { 45 do {
38 if (size < MAX_SLAB_SIZE || retries > MAX_VMALLOCS) 46 if (size < MAX_SLAB_SIZE || retries > MAX_VMALLOCS)
39 ptr = kmalloc(size, lflags); 47 ptr = kmalloc(size, lflags);
@@ -60,6 +68,27 @@ kmem_zalloc(size_t size, unsigned int __nocast flags)
60 return ptr; 68 return ptr;
61} 69}
62 70
71void *
72kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize,
73 unsigned int __nocast flags)
74{
75 void *ptr;
76 size_t kmsize = maxsize;
77 unsigned int kmflags = (flags & ~KM_SLEEP) | KM_NOSLEEP;
78
79 while (!(ptr = kmem_zalloc(kmsize, kmflags))) {
80 if ((kmsize <= minsize) && (flags & KM_NOSLEEP))
81 break;
82 if ((kmsize >>= 1) <= minsize) {
83 kmsize = minsize;
84 kmflags = flags;
85 }
86 }
87 if (ptr)
88 *size = kmsize;
89 return ptr;
90}
91
63void 92void
64kmem_free(void *ptr, size_t size) 93kmem_free(void *ptr, size_t size)
65{ 94{
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index 939bd84bc7ee..9ebabdf7829c 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -30,6 +30,7 @@
30#define KM_NOSLEEP 0x0002u 30#define KM_NOSLEEP 0x0002u
31#define KM_NOFS 0x0004u 31#define KM_NOFS 0x0004u
32#define KM_MAYFAIL 0x0008u 32#define KM_MAYFAIL 0x0008u
33#define KM_LARGE 0x0010u
33 34
34/* 35/*
35 * We use a special process flag to avoid recursive callbacks into 36 * We use a special process flag to avoid recursive callbacks into
@@ -41,7 +42,7 @@ kmem_flags_convert(unsigned int __nocast flags)
41{ 42{
42 gfp_t lflags; 43 gfp_t lflags;
43 44
44 BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL)); 45 BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_LARGE));
45 46
46 if (flags & KM_NOSLEEP) { 47 if (flags & KM_NOSLEEP) {
47 lflags = GFP_ATOMIC | __GFP_NOWARN; 48 lflags = GFP_ATOMIC | __GFP_NOWARN;
@@ -54,8 +55,9 @@ kmem_flags_convert(unsigned int __nocast flags)
54} 55}
55 56
56extern void *kmem_alloc(size_t, unsigned int __nocast); 57extern void *kmem_alloc(size_t, unsigned int __nocast);
57extern void *kmem_realloc(void *, size_t, size_t, unsigned int __nocast);
58extern void *kmem_zalloc(size_t, unsigned int __nocast); 58extern void *kmem_zalloc(size_t, unsigned int __nocast);
59extern void *kmem_zalloc_greedy(size_t *, size_t, size_t, unsigned int __nocast);
60extern void *kmem_realloc(void *, size_t, size_t, unsigned int __nocast);
59extern void kmem_free(void *, size_t); 61extern void kmem_free(void *, size_t);
60 62
61/* 63/*
@@ -91,8 +93,8 @@ kmem_zone_free(kmem_zone_t *zone, void *ptr)
91static inline void 93static inline void
92kmem_zone_destroy(kmem_zone_t *zone) 94kmem_zone_destroy(kmem_zone_t *zone)
93{ 95{
94 if (zone && kmem_cache_destroy(zone)) 96 if (zone)
95 BUG(); 97 kmem_cache_destroy(zone);
96} 98}
97 99
98extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast); 100extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast);
diff --git a/fs/xfs/linux-2.6/sema.h b/fs/xfs/linux-2.6/sema.h
index b25090094cca..2009e6d922ce 100644
--- a/fs/xfs/linux-2.6/sema.h
+++ b/fs/xfs/linux-2.6/sema.h
@@ -29,8 +29,6 @@
29 29
30typedef struct semaphore sema_t; 30typedef struct semaphore sema_t;
31 31
32#define init_sema(sp, val, c, d) sema_init(sp, val)
33#define initsema(sp, val) sema_init(sp, val)
34#define initnsema(sp, val, name) sema_init(sp, val) 32#define initnsema(sp, val, name) sema_init(sp, val)
35#define psema(sp, b) down(sp) 33#define psema(sp, b) down(sp)
36#define vsema(sp) up(sp) 34#define vsema(sp) up(sp)
diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h
index 9a8ad481b008..351a8f454bd1 100644
--- a/fs/xfs/linux-2.6/sv.h
+++ b/fs/xfs/linux-2.6/sv.h
@@ -53,8 +53,6 @@ static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state,
53 remove_wait_queue(&sv->waiters, &wait); 53 remove_wait_queue(&sv->waiters, &wait);
54} 54}
55 55
56#define init_sv(sv,type,name,flag) \
57 init_waitqueue_head(&(sv)->waiters)
58#define sv_init(sv,flag,name) \ 56#define sv_init(sv,flag,name) \
59 init_waitqueue_head(&(sv)->waiters) 57 init_waitqueue_head(&(sv)->waiters)
60#define sv_destroy(sv) \ 58#define sv_destroy(sv) \
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 34dcb43a7837..09360cf1e1f2 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -71,7 +71,7 @@ xfs_page_trace(
71 int tag, 71 int tag,
72 struct inode *inode, 72 struct inode *inode,
73 struct page *page, 73 struct page *page,
74 int mask) 74 unsigned long pgoff)
75{ 75{
76 xfs_inode_t *ip; 76 xfs_inode_t *ip;
77 bhv_vnode_t *vp = vn_from_inode(inode); 77 bhv_vnode_t *vp = vn_from_inode(inode);
@@ -91,7 +91,7 @@ xfs_page_trace(
91 (void *)ip, 91 (void *)ip,
92 (void *)inode, 92 (void *)inode,
93 (void *)page, 93 (void *)page,
94 (void *)((unsigned long)mask), 94 (void *)pgoff,
95 (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)), 95 (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
96 (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)), 96 (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
97 (void *)((unsigned long)((isize >> 32) & 0xffffffff)), 97 (void *)((unsigned long)((isize >> 32) & 0xffffffff)),
@@ -105,7 +105,7 @@ xfs_page_trace(
105 (void *)NULL); 105 (void *)NULL);
106} 106}
107#else 107#else
108#define xfs_page_trace(tag, inode, page, mask) 108#define xfs_page_trace(tag, inode, page, pgoff)
109#endif 109#endif
110 110
111/* 111/*
@@ -1197,7 +1197,7 @@ xfs_vm_releasepage(
1197 .nr_to_write = 1, 1197 .nr_to_write = 1,
1198 }; 1198 };
1199 1199
1200 xfs_page_trace(XFS_RELEASEPAGE_ENTER, inode, page, gfp_mask); 1200 xfs_page_trace(XFS_RELEASEPAGE_ENTER, inode, page, 0);
1201 1201
1202 if (!page_has_buffers(page)) 1202 if (!page_has_buffers(page))
1203 return 0; 1203 return 0;
@@ -1356,7 +1356,6 @@ xfs_end_io_direct(
1356 ioend->io_size = size; 1356 ioend->io_size = size;
1357 xfs_finish_ioend(ioend); 1357 xfs_finish_ioend(ioend);
1358 } else { 1358 } else {
1359 ASSERT(size >= 0);
1360 xfs_destroy_ioend(ioend); 1359 xfs_destroy_ioend(ioend);
1361 } 1360 }
1362 1361
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 2af528dcfb04..9bbadafdcb00 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved. 3 * All Rights Reserved.
4 * 4 *
5 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
@@ -318,8 +318,12 @@ xfs_buf_free(
318 if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) 318 if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
319 free_address(bp->b_addr - bp->b_offset); 319 free_address(bp->b_addr - bp->b_offset);
320 320
321 for (i = 0; i < bp->b_page_count; i++) 321 for (i = 0; i < bp->b_page_count; i++) {
322 page_cache_release(bp->b_pages[i]); 322 struct page *page = bp->b_pages[i];
323
324 ASSERT(!PagePrivate(page));
325 page_cache_release(page);
326 }
323 _xfs_buf_free_pages(bp); 327 _xfs_buf_free_pages(bp);
324 } else if (bp->b_flags & _XBF_KMEM_ALLOC) { 328 } else if (bp->b_flags & _XBF_KMEM_ALLOC) {
325 /* 329 /*
@@ -400,6 +404,7 @@ _xfs_buf_lookup_pages(
400 nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset); 404 nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset);
401 size -= nbytes; 405 size -= nbytes;
402 406
407 ASSERT(!PagePrivate(page));
403 if (!PageUptodate(page)) { 408 if (!PageUptodate(page)) {
404 page_count--; 409 page_count--;
405 if (blocksize >= PAGE_CACHE_SIZE) { 410 if (blocksize >= PAGE_CACHE_SIZE) {
@@ -768,7 +773,7 @@ xfs_buf_get_noaddr(
768 _xfs_buf_initialize(bp, target, 0, len, 0); 773 _xfs_buf_initialize(bp, target, 0, len, 0);
769 774
770 try_again: 775 try_again:
771 data = kmem_alloc(malloc_len, KM_SLEEP | KM_MAYFAIL); 776 data = kmem_alloc(malloc_len, KM_SLEEP | KM_MAYFAIL | KM_LARGE);
772 if (unlikely(data == NULL)) 777 if (unlikely(data == NULL))
773 goto fail_free_buf; 778 goto fail_free_buf;
774 779
@@ -1117,10 +1122,10 @@ xfs_buf_bio_end_io(
1117 do { 1122 do {
1118 struct page *page = bvec->bv_page; 1123 struct page *page = bvec->bv_page;
1119 1124
1125 ASSERT(!PagePrivate(page));
1120 if (unlikely(bp->b_error)) { 1126 if (unlikely(bp->b_error)) {
1121 if (bp->b_flags & XBF_READ) 1127 if (bp->b_flags & XBF_READ)
1122 ClearPageUptodate(page); 1128 ClearPageUptodate(page);
1123 SetPageError(page);
1124 } else if (blocksize >= PAGE_CACHE_SIZE) { 1129 } else if (blocksize >= PAGE_CACHE_SIZE) {
1125 SetPageUptodate(page); 1130 SetPageUptodate(page);
1126 } else if (!PagePrivate(page) && 1131 } else if (!PagePrivate(page) &&
@@ -1156,16 +1161,16 @@ _xfs_buf_ioapply(
1156 total_nr_pages = bp->b_page_count; 1161 total_nr_pages = bp->b_page_count;
1157 map_i = 0; 1162 map_i = 0;
1158 1163
1159 if (bp->b_flags & _XBF_RUN_QUEUES) {
1160 bp->b_flags &= ~_XBF_RUN_QUEUES;
1161 rw = (bp->b_flags & XBF_READ) ? READ_SYNC : WRITE_SYNC;
1162 } else {
1163 rw = (bp->b_flags & XBF_READ) ? READ : WRITE;
1164 }
1165
1166 if (bp->b_flags & XBF_ORDERED) { 1164 if (bp->b_flags & XBF_ORDERED) {
1167 ASSERT(!(bp->b_flags & XBF_READ)); 1165 ASSERT(!(bp->b_flags & XBF_READ));
1168 rw = WRITE_BARRIER; 1166 rw = WRITE_BARRIER;
1167 } else if (bp->b_flags & _XBF_RUN_QUEUES) {
1168 ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
1169 bp->b_flags &= ~_XBF_RUN_QUEUES;
1170 rw = (bp->b_flags & XBF_WRITE) ? WRITE_SYNC : READ_SYNC;
1171 } else {
1172 rw = (bp->b_flags & XBF_WRITE) ? WRITE :
1173 (bp->b_flags & XBF_READ_AHEAD) ? READA : READ;
1169 } 1174 }
1170 1175
1171 /* Special code path for reading a sub page size buffer in -- 1176 /* Special code path for reading a sub page size buffer in --
@@ -1681,6 +1686,7 @@ xfsbufd(
1681 xfs_buf_t *bp, *n; 1686 xfs_buf_t *bp, *n;
1682 struct list_head *dwq = &target->bt_delwrite_queue; 1687 struct list_head *dwq = &target->bt_delwrite_queue;
1683 spinlock_t *dwlk = &target->bt_delwrite_lock; 1688 spinlock_t *dwlk = &target->bt_delwrite_lock;
1689 int count;
1684 1690
1685 current->flags |= PF_MEMALLOC; 1691 current->flags |= PF_MEMALLOC;
1686 1692
@@ -1696,6 +1702,7 @@ xfsbufd(
1696 schedule_timeout_interruptible( 1702 schedule_timeout_interruptible(
1697 xfs_buf_timer_centisecs * msecs_to_jiffies(10)); 1703 xfs_buf_timer_centisecs * msecs_to_jiffies(10));
1698 1704
1705 count = 0;
1699 age = xfs_buf_age_centisecs * msecs_to_jiffies(10); 1706 age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
1700 spin_lock(dwlk); 1707 spin_lock(dwlk);
1701 list_for_each_entry_safe(bp, n, dwq, b_list) { 1708 list_for_each_entry_safe(bp, n, dwq, b_list) {
@@ -1711,9 +1718,11 @@ xfsbufd(
1711 break; 1718 break;
1712 } 1719 }
1713 1720
1714 bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); 1721 bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q|
1722 _XBF_RUN_QUEUES);
1715 bp->b_flags |= XBF_WRITE; 1723 bp->b_flags |= XBF_WRITE;
1716 list_move(&bp->b_list, &tmp); 1724 list_move_tail(&bp->b_list, &tmp);
1725 count++;
1717 } 1726 }
1718 } 1727 }
1719 spin_unlock(dwlk); 1728 spin_unlock(dwlk);
@@ -1724,12 +1733,12 @@ xfsbufd(
1724 1733
1725 list_del_init(&bp->b_list); 1734 list_del_init(&bp->b_list);
1726 xfs_buf_iostrategy(bp); 1735 xfs_buf_iostrategy(bp);
1727
1728 blk_run_address_space(target->bt_mapping);
1729 } 1736 }
1730 1737
1731 if (as_list_len > 0) 1738 if (as_list_len > 0)
1732 purge_addresses(); 1739 purge_addresses();
1740 if (count)
1741 blk_run_address_space(target->bt_mapping);
1733 1742
1734 clear_bit(XBT_FORCE_FLUSH, &target->bt_flags); 1743 clear_bit(XBT_FORCE_FLUSH, &target->bt_flags);
1735 } while (!kthread_should_stop()); 1744 } while (!kthread_should_stop());
@@ -1767,7 +1776,7 @@ xfs_flush_buftarg(
1767 continue; 1776 continue;
1768 } 1777 }
1769 1778
1770 list_move(&bp->b_list, &tmp); 1779 list_move_tail(&bp->b_list, &tmp);
1771 } 1780 }
1772 spin_unlock(dwlk); 1781 spin_unlock(dwlk);
1773 1782
@@ -1776,7 +1785,7 @@ xfs_flush_buftarg(
1776 */ 1785 */
1777 list_for_each_entry_safe(bp, n, &tmp, b_list) { 1786 list_for_each_entry_safe(bp, n, &tmp, b_list) {
1778 xfs_buf_lock(bp); 1787 xfs_buf_lock(bp);
1779 bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); 1788 bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q|_XBF_RUN_QUEUES);
1780 bp->b_flags |= XBF_WRITE; 1789 bp->b_flags |= XBF_WRITE;
1781 if (wait) 1790 if (wait)
1782 bp->b_flags &= ~XBF_ASYNC; 1791 bp->b_flags &= ~XBF_ASYNC;
@@ -1786,6 +1795,9 @@ xfs_flush_buftarg(
1786 xfs_buf_iostrategy(bp); 1795 xfs_buf_iostrategy(bp);
1787 } 1796 }
1788 1797
1798 if (wait)
1799 blk_run_address_space(target->bt_mapping);
1800
1789 /* 1801 /*
1790 * Remaining list items must be flushed before returning 1802 * Remaining list items must be flushed before returning
1791 */ 1803 */
@@ -1797,9 +1809,6 @@ xfs_flush_buftarg(
1797 xfs_buf_relse(bp); 1809 xfs_buf_relse(bp);
1798 } 1810 }
1799 1811
1800 if (wait)
1801 blk_run_address_space(target->bt_mapping);
1802
1803 return pincount; 1812 return pincount;
1804} 1813}
1805 1814
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 7858703ed84c..9dd235cb0107 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -298,11 +298,6 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
298#define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE) 298#define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE)
299#define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE) 299#define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE)
300 300
301#define XFS_BUF_ISUNINITIAL(bp) (0)
302#define XFS_BUF_UNUNINITIAL(bp) (0)
303
304#define XFS_BUF_BP_ISMAPPED(bp) (1)
305
306#define XFS_BUF_IODONE_FUNC(bp) ((bp)->b_iodone) 301#define XFS_BUF_IODONE_FUNC(bp) ((bp)->b_iodone)
307#define XFS_BUF_SET_IODONE_FUNC(bp, func) ((bp)->b_iodone = (func)) 302#define XFS_BUF_SET_IODONE_FUNC(bp, func) ((bp)->b_iodone = (func))
308#define XFS_BUF_CLR_IODONE_FUNC(bp) ((bp)->b_iodone = NULL) 303#define XFS_BUF_CLR_IODONE_FUNC(bp) ((bp)->b_iodone = NULL)
@@ -393,8 +388,6 @@ static inline int XFS_bwrite(xfs_buf_t *bp)
393 return error; 388 return error;
394} 389}
395 390
396#define XFS_bdwrite(bp) xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC)
397
398static inline int xfs_bdwrite(void *mp, xfs_buf_t *bp) 391static inline int xfs_bdwrite(void *mp, xfs_buf_t *bp)
399{ 392{
400 bp->b_strat = xfs_bdstrat_cb; 393 bp->b_strat = xfs_bdstrat_cb;
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 3d4f6dff2113..41cfcba7ce49 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -370,7 +370,7 @@ xfs_file_readdir(
370 370
371 /* Try fairly hard to get memory */ 371 /* Try fairly hard to get memory */
372 do { 372 do {
373 if ((read_buf = (caddr_t)kmalloc(rlen, GFP_KERNEL))) 373 if ((read_buf = kmalloc(rlen, GFP_KERNEL)))
374 break; 374 break;
375 rlen >>= 1; 375 rlen >>= 1;
376 } while (rlen >= 1024); 376 } while (rlen >= 1024);
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c
index 6c162c3dde7e..ed3a5e1b4b67 100644
--- a/fs/xfs/linux-2.6/xfs_globals.c
+++ b/fs/xfs/linux-2.6/xfs_globals.c
@@ -34,7 +34,7 @@ xfs_param_t xfs_params = {
34 .restrict_chown = { 0, 1, 1 }, 34 .restrict_chown = { 0, 1, 1 },
35 .sgid_inherit = { 0, 0, 1 }, 35 .sgid_inherit = { 0, 0, 1 },
36 .symlink_mode = { 0, 0, 1 }, 36 .symlink_mode = { 0, 0, 1 },
37 .panic_mask = { 0, 0, 127 }, 37 .panic_mask = { 0, 0, 255 },
38 .error_level = { 0, 3, 11 }, 38 .error_level = { 0, 3, 11 },
39 .syncd_timer = { 1*100, 30*100, 7200*100}, 39 .syncd_timer = { 1*100, 30*100, 7200*100},
40 .stats_clear = { 0, 0, 1 }, 40 .stats_clear = { 0, 0, 1 },
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 6e52a5dd38d8..a74f854d91e6 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -653,7 +653,7 @@ xfs_attrmulti_by_handle(
653STATIC int 653STATIC int
654xfs_ioc_space( 654xfs_ioc_space(
655 bhv_desc_t *bdp, 655 bhv_desc_t *bdp,
656 bhv_vnode_t *vp, 656 struct inode *inode,
657 struct file *filp, 657 struct file *filp,
658 int flags, 658 int flags,
659 unsigned int cmd, 659 unsigned int cmd,
@@ -735,7 +735,7 @@ xfs_ioctl(
735 !capable(CAP_SYS_ADMIN)) 735 !capable(CAP_SYS_ADMIN))
736 return -EPERM; 736 return -EPERM;
737 737
738 return xfs_ioc_space(bdp, vp, filp, ioflags, cmd, arg); 738 return xfs_ioc_space(bdp, inode, filp, ioflags, cmd, arg);
739 739
740 case XFS_IOC_DIOINFO: { 740 case XFS_IOC_DIOINFO: {
741 struct dioattr da; 741 struct dioattr da;
@@ -763,6 +763,8 @@ xfs_ioctl(
763 return xfs_ioc_fsgeometry(mp, arg); 763 return xfs_ioc_fsgeometry(mp, arg);
764 764
765 case XFS_IOC_GETVERSION: 765 case XFS_IOC_GETVERSION:
766 return put_user(inode->i_generation, (int __user *)arg);
767
766 case XFS_IOC_GETXFLAGS: 768 case XFS_IOC_GETXFLAGS:
767 case XFS_IOC_SETXFLAGS: 769 case XFS_IOC_SETXFLAGS:
768 case XFS_IOC_FSGETXATTR: 770 case XFS_IOC_FSGETXATTR:
@@ -957,7 +959,7 @@ xfs_ioctl(
957STATIC int 959STATIC int
958xfs_ioc_space( 960xfs_ioc_space(
959 bhv_desc_t *bdp, 961 bhv_desc_t *bdp,
960 bhv_vnode_t *vp, 962 struct inode *inode,
961 struct file *filp, 963 struct file *filp,
962 int ioflags, 964 int ioflags,
963 unsigned int cmd, 965 unsigned int cmd,
@@ -967,13 +969,13 @@ xfs_ioc_space(
967 int attr_flags = 0; 969 int attr_flags = 0;
968 int error; 970 int error;
969 971
970 if (vp->v_inode.i_flags & (S_IMMUTABLE|S_APPEND)) 972 if (inode->i_flags & (S_IMMUTABLE|S_APPEND))
971 return -XFS_ERROR(EPERM); 973 return -XFS_ERROR(EPERM);
972 974
973 if (!(filp->f_mode & FMODE_WRITE)) 975 if (!(filp->f_mode & FMODE_WRITE))
974 return -XFS_ERROR(EBADF); 976 return -XFS_ERROR(EBADF);
975 977
976 if (!VN_ISREG(vp)) 978 if (!S_ISREG(inode->i_mode))
977 return -XFS_ERROR(EINVAL); 979 return -XFS_ERROR(EINVAL);
978 980
979 if (copy_from_user(&bf, arg, sizeof(bf))) 981 if (copy_from_user(&bf, arg, sizeof(bf)))
@@ -1264,13 +1266,6 @@ xfs_ioc_xattr(
1264 break; 1266 break;
1265 } 1267 }
1266 1268
1267 case XFS_IOC_GETVERSION: {
1268 flags = vn_to_inode(vp)->i_generation;
1269 if (copy_to_user(arg, &flags, sizeof(flags)))
1270 error = -EFAULT;
1271 break;
1272 }
1273
1274 default: 1269 default:
1275 error = -ENOTTY; 1270 error = -ENOTTY;
1276 break; 1271 break;
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index d9180020de63..3ba814ae3bba 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -553,13 +553,13 @@ xfs_vn_follow_link(
553 ASSERT(dentry); 553 ASSERT(dentry);
554 ASSERT(nd); 554 ASSERT(nd);
555 555
556 link = (char *)kmalloc(MAXPATHLEN+1, GFP_KERNEL); 556 link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
557 if (!link) { 557 if (!link) {
558 nd_set_link(nd, ERR_PTR(-ENOMEM)); 558 nd_set_link(nd, ERR_PTR(-ENOMEM));
559 return NULL; 559 return NULL;
560 } 560 }
561 561
562 uio = (uio_t *)kmalloc(sizeof(uio_t), GFP_KERNEL); 562 uio = kmalloc(sizeof(uio_t), GFP_KERNEL);
563 if (!uio) { 563 if (!uio) {
564 kfree(link); 564 kfree(link);
565 nd_set_link(nd, ERR_PTR(-ENOMEM)); 565 nd_set_link(nd, ERR_PTR(-ENOMEM));
@@ -623,12 +623,27 @@ xfs_vn_getattr(
623{ 623{
624 struct inode *inode = dentry->d_inode; 624 struct inode *inode = dentry->d_inode;
625 bhv_vnode_t *vp = vn_from_inode(inode); 625 bhv_vnode_t *vp = vn_from_inode(inode);
626 int error = 0; 626 bhv_vattr_t vattr = { .va_mask = XFS_AT_STAT };
627 int error;
627 628
628 if (unlikely(vp->v_flag & VMODIFIED)) 629 error = bhv_vop_getattr(vp, &vattr, ATTR_LAZY, NULL);
629 error = vn_revalidate(vp); 630 if (likely(!error)) {
630 if (!error) 631 stat->size = i_size_read(inode);
631 generic_fillattr(inode, stat); 632 stat->dev = inode->i_sb->s_dev;
633 stat->rdev = (vattr.va_rdev == 0) ? 0 :
634 MKDEV(sysv_major(vattr.va_rdev) & 0x1ff,
635 sysv_minor(vattr.va_rdev));
636 stat->mode = vattr.va_mode;
637 stat->nlink = vattr.va_nlink;
638 stat->uid = vattr.va_uid;
639 stat->gid = vattr.va_gid;
640 stat->ino = vattr.va_nodeid;
641 stat->atime = vattr.va_atime;
642 stat->mtime = vattr.va_mtime;
643 stat->ctime = vattr.va_ctime;
644 stat->blocks = vattr.va_nblocks;
645 stat->blksize = vattr.va_blocksize;
646 }
632 return -error; 647 return -error;
633} 648}
634 649
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index a13f75c1a936..2b0e0018738a 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -148,11 +148,7 @@ BUFFER_FNS(PrivateStart, unwritten);
148 (current->flags = ((current->flags & ~(f)) | (*(sp) & (f)))) 148 (current->flags = ((current->flags & ~(f)) | (*(sp) & (f))))
149 149
150#define NBPP PAGE_SIZE 150#define NBPP PAGE_SIZE
151#define DPPSHFT (PAGE_SHIFT - 9)
152#define NDPP (1 << (PAGE_SHIFT - 9)) 151#define NDPP (1 << (PAGE_SHIFT - 9))
153#define dtop(DD) (((DD) + NDPP - 1) >> DPPSHFT)
154#define dtopt(DD) ((DD) >> DPPSHFT)
155#define dpoff(DD) ((DD) & (NDPP-1))
156 152
157#define NBBY 8 /* number of bits per byte */ 153#define NBBY 8 /* number of bits per byte */
158#define NBPC PAGE_SIZE /* Number of bytes per click */ 154#define NBPC PAGE_SIZE /* Number of bytes per click */
@@ -172,8 +168,6 @@ BUFFER_FNS(PrivateStart, unwritten);
172#define btoct(x) ((__psunsigned_t)(x)>>BPCSHIFT) 168#define btoct(x) ((__psunsigned_t)(x)>>BPCSHIFT)
173#define btoc64(x) (((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT) 169#define btoc64(x) (((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT)
174#define btoct64(x) ((__uint64_t)(x)>>BPCSHIFT) 170#define btoct64(x) ((__uint64_t)(x)>>BPCSHIFT)
175#define io_btoc(x) (((__psunsigned_t)(x)+(IO_NBPC-1))>>IO_BPCSHIFT)
176#define io_btoct(x) ((__psunsigned_t)(x)>>IO_BPCSHIFT)
177 171
178/* off_t bytes to clicks */ 172/* off_t bytes to clicks */
179#define offtoc(x) (((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT) 173#define offtoc(x) (((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT)
@@ -186,7 +180,6 @@ BUFFER_FNS(PrivateStart, unwritten);
186#define ctob(x) ((__psunsigned_t)(x)<<BPCSHIFT) 180#define ctob(x) ((__psunsigned_t)(x)<<BPCSHIFT)
187#define btoct(x) ((__psunsigned_t)(x)>>BPCSHIFT) 181#define btoct(x) ((__psunsigned_t)(x)>>BPCSHIFT)
188#define ctob64(x) ((__uint64_t)(x)<<BPCSHIFT) 182#define ctob64(x) ((__uint64_t)(x)<<BPCSHIFT)
189#define io_ctob(x) ((__psunsigned_t)(x)<<IO_BPCSHIFT)
190 183
191/* bytes to clicks */ 184/* bytes to clicks */
192#define btoc(x) (((__psunsigned_t)(x)+(NBPC-1))>>BPCSHIFT) 185#define btoc(x) (((__psunsigned_t)(x)+(NBPC-1))>>BPCSHIFT)
@@ -339,4 +332,11 @@ static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y)
339 return(x * y); 332 return(x * y);
340} 333}
341 334
335static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
336{
337 x += y - 1;
338 do_div(x, y);
339 return x;
340}
341
342#endif /* __XFS_LINUX__ */ 342#endif /* __XFS_LINUX__ */
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index ee788b1cb364..55992b40353c 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -270,12 +270,12 @@ xfs_read(
270 } 270 }
271 } 271 }
272 272
273 if (unlikely((ioflags & IO_ISDIRECT) && VN_CACHED(vp))) 273 if (unlikely(ioflags & IO_ISDIRECT)) {
274 bhv_vop_flushinval_pages(vp, ctooff(offtoct(*offset)), 274 if (VN_CACHED(vp))
275 -1, FI_REMAPF_LOCKED); 275 bhv_vop_flushinval_pages(vp, ctooff(offtoct(*offset)),
276 276 -1, FI_REMAPF_LOCKED);
277 if (unlikely(ioflags & IO_ISDIRECT))
278 mutex_unlock(&inode->i_mutex); 277 mutex_unlock(&inode->i_mutex);
278 }
279 279
280 xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore, 280 xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore,
281 (void *)iovp, segs, *offset, ioflags); 281 (void *)iovp, segs, *offset, ioflags);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 4754f342a5d3..38c4d128a8c0 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -171,7 +171,6 @@ xfs_revalidate_inode(
171 break; 171 break;
172 } 172 }
173 173
174 inode->i_blksize = xfs_preferred_iosize(mp);
175 inode->i_generation = ip->i_d.di_gen; 174 inode->i_generation = ip->i_d.di_gen;
176 i_size_write(inode, ip->i_d.di_size); 175 i_size_write(inode, ip->i_d.di_size);
177 inode->i_blocks = 176 inode->i_blocks =
@@ -228,7 +227,9 @@ xfs_initialize_vnode(
228 xfs_revalidate_inode(XFS_BHVTOM(bdp), vp, ip); 227 xfs_revalidate_inode(XFS_BHVTOM(bdp), vp, ip);
229 xfs_set_inodeops(inode); 228 xfs_set_inodeops(inode);
230 229
230 spin_lock(&ip->i_flags_lock);
231 ip->i_flags &= ~XFS_INEW; 231 ip->i_flags &= ~XFS_INEW;
232 spin_unlock(&ip->i_flags_lock);
232 barrier(); 233 barrier();
233 234
234 unlock_new_inode(inode); 235 unlock_new_inode(inode);
diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h
index 91fc2c4b3353..da255bdf5260 100644
--- a/fs/xfs/linux-2.6/xfs_vfs.h
+++ b/fs/xfs/linux-2.6/xfs_vfs.h
@@ -79,7 +79,7 @@ typedef enum {
79#define VFS_RDONLY 0x0001 /* read-only vfs */ 79#define VFS_RDONLY 0x0001 /* read-only vfs */
80#define VFS_GRPID 0x0002 /* group-ID assigned from directory */ 80#define VFS_GRPID 0x0002 /* group-ID assigned from directory */
81#define VFS_DMI 0x0004 /* filesystem has the DMI enabled */ 81#define VFS_DMI 0x0004 /* filesystem has the DMI enabled */
82#define VFS_UMOUNT 0x0008 /* unmount in progress */ 82/* ---- VFS_UMOUNT ---- 0x0008 -- unneeded, fixed via kthread APIs */
83#define VFS_32BITINODES 0x0010 /* do not use inums above 32 bits */ 83#define VFS_32BITINODES 0x0010 /* do not use inums above 32 bits */
84#define VFS_END 0x0010 /* max flag */ 84#define VFS_END 0x0010 /* max flag */
85 85
diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c
index 6628d96b6fd6..553fa731ade5 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.c
+++ b/fs/xfs/linux-2.6/xfs_vnode.c
@@ -122,7 +122,6 @@ vn_revalidate_core(
122 inode->i_blocks = vap->va_nblocks; 122 inode->i_blocks = vap->va_nblocks;
123 inode->i_mtime = vap->va_mtime; 123 inode->i_mtime = vap->va_mtime;
124 inode->i_ctime = vap->va_ctime; 124 inode->i_ctime = vap->va_ctime;
125 inode->i_blksize = vap->va_blocksize;
126 if (vap->va_xflags & XFS_XFLAG_IMMUTABLE) 125 if (vap->va_xflags & XFS_XFLAG_IMMUTABLE)
127 inode->i_flags |= S_IMMUTABLE; 126 inode->i_flags |= S_IMMUTABLE;
128 else 127 else
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index c42b3221b20c..515f5fdea57a 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -85,8 +85,6 @@ typedef enum {
85#define VN_BHV_HEAD(vp) ((bhv_head_t *)(&((vp)->v_bh))) 85#define VN_BHV_HEAD(vp) ((bhv_head_t *)(&((vp)->v_bh)))
86#define vn_bhv_head_init(bhp,name) bhv_head_init(bhp,name) 86#define vn_bhv_head_init(bhp,name) bhv_head_init(bhp,name)
87#define vn_bhv_remove(bhp,bdp) bhv_remove(bhp,bdp) 87#define vn_bhv_remove(bhp,bdp) bhv_remove(bhp,bdp)
88#define vn_bhv_lookup(bhp,ops) bhv_lookup(bhp,ops)
89#define vn_bhv_lookup_unlocked(bhp,ops) bhv_lookup_unlocked(bhp,ops)
90 88
91/* 89/*
92 * Vnode to Linux inode mapping. 90 * Vnode to Linux inode mapping.
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 5b2dcc58b244..33ad5af386e0 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -382,18 +382,6 @@ xfs_qm_dquot_logitem_unlock(
382 382
383 383
384/* 384/*
385 * The transaction with the dquot locked has aborted. The dquot
386 * must not be dirty within the transaction. We simply unlock just
387 * as if the transaction had been cancelled.
388 */
389STATIC void
390xfs_qm_dquot_logitem_abort(
391 xfs_dq_logitem_t *ql)
392{
393 xfs_qm_dquot_logitem_unlock(ql);
394}
395
396/*
397 * this needs to stamp an lsn into the dquot, I think. 385 * this needs to stamp an lsn into the dquot, I think.
398 * rpc's that look at user dquot's would then have to 386 * rpc's that look at user dquot's would then have to
399 * push on the dependency recorded in the dquot 387 * push on the dependency recorded in the dquot
@@ -426,7 +414,6 @@ STATIC struct xfs_item_ops xfs_dquot_item_ops = {
426 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) 414 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
427 xfs_qm_dquot_logitem_committed, 415 xfs_qm_dquot_logitem_committed,
428 .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_push, 416 .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_push,
429 .iop_abort = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_abort,
430 .iop_pushbuf = (void(*)(xfs_log_item_t*)) 417 .iop_pushbuf = (void(*)(xfs_log_item_t*))
431 xfs_qm_dquot_logitem_pushbuf, 418 xfs_qm_dquot_logitem_pushbuf,
432 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) 419 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
@@ -559,17 +546,6 @@ xfs_qm_qoff_logitem_committed(xfs_qoff_logitem_t *qf, xfs_lsn_t lsn)
559} 546}
560 547
561/* 548/*
562 * The transaction of which this QUOTAOFF is a part has been aborted.
563 * Just clean up after ourselves.
564 * Shouldn't this never happen in the case of qoffend logitems? XXX
565 */
566STATIC void
567xfs_qm_qoff_logitem_abort(xfs_qoff_logitem_t *qf)
568{
569 kmem_free(qf, sizeof(xfs_qoff_logitem_t));
570}
571
572/*
573 * There isn't much you can do to push on an quotaoff item. It is simply 549 * There isn't much you can do to push on an quotaoff item. It is simply
574 * stuck waiting for the log to be flushed to disk. 550 * stuck waiting for the log to be flushed to disk.
575 */ 551 */
@@ -644,7 +620,6 @@ STATIC struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
644 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) 620 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
645 xfs_qm_qoffend_logitem_committed, 621 xfs_qm_qoffend_logitem_committed,
646 .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push, 622 .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push,
647 .iop_abort = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_abort,
648 .iop_pushbuf = NULL, 623 .iop_pushbuf = NULL,
649 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) 624 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
650 xfs_qm_qoffend_logitem_committing 625 xfs_qm_qoffend_logitem_committing
@@ -667,7 +642,6 @@ STATIC struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
667 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) 642 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
668 xfs_qm_qoff_logitem_committed, 643 xfs_qm_qoff_logitem_committed,
669 .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push, 644 .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push,
670 .iop_abort = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_abort,
671 .iop_pushbuf = NULL, 645 .iop_pushbuf = NULL,
672 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) 646 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
673 xfs_qm_qoff_logitem_committing 647 xfs_qm_qoff_logitem_committing
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index e23e45535c48..7c6a3a50379e 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -112,17 +112,17 @@ xfs_Gqm_init(void)
112{ 112{
113 xfs_dqhash_t *udqhash, *gdqhash; 113 xfs_dqhash_t *udqhash, *gdqhash;
114 xfs_qm_t *xqm; 114 xfs_qm_t *xqm;
115 uint i, hsize, flags = KM_SLEEP | KM_MAYFAIL; 115 size_t hsize;
116 uint i;
116 117
117 /* 118 /*
118 * Initialize the dquot hash tables. 119 * Initialize the dquot hash tables.
119 */ 120 */
120 hsize = XFS_QM_HASHSIZE_HIGH; 121 udqhash = kmem_zalloc_greedy(&hsize,
121 while (!(udqhash = kmem_zalloc(hsize * sizeof(xfs_dqhash_t), flags))) { 122 XFS_QM_HASHSIZE_LOW, XFS_QM_HASHSIZE_HIGH,
122 if ((hsize >>= 1) <= XFS_QM_HASHSIZE_LOW) 123 KM_SLEEP | KM_MAYFAIL | KM_LARGE);
123 flags = KM_SLEEP; 124 gdqhash = kmem_zalloc(hsize, KM_SLEEP | KM_LARGE);
124 } 125 hsize /= sizeof(xfs_dqhash_t);
125 gdqhash = kmem_zalloc(hsize * sizeof(xfs_dqhash_t), KM_SLEEP);
126 ndquot = hsize << 8; 126 ndquot = hsize << 8;
127 127
128 xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP); 128 xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP);
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index 4568deb6da86..689407de0a20 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -56,12 +56,6 @@ extern kmem_zone_t *qm_dqtrxzone;
56#define XFS_QM_HASHSIZE_HIGH ((NBPP * 4) / sizeof(xfs_dqhash_t)) 56#define XFS_QM_HASHSIZE_HIGH ((NBPP * 4) / sizeof(xfs_dqhash_t))
57 57
58/* 58/*
59 * We output a cmn_err when quotachecking a quota file with more than
60 * this many fsbs.
61 */
62#define XFS_QM_BIG_QCHECK_NBLKS 500
63
64/*
65 * This defines the unit of allocation of dquots. 59 * This defines the unit of allocation of dquots.
66 * Currently, it is just one file system block, and a 4K blk contains 30 60 * Currently, it is just one file system block, and a 4K blk contains 30
67 * (136 * 30 = 4080) dquots. It's probably not worth trying to make 61 * (136 * 30 = 4080) dquots. It's probably not worth trying to make
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h
index b7ddd04aae32..a8b85e2be9d5 100644
--- a/fs/xfs/quota/xfs_quota_priv.h
+++ b/fs/xfs/quota/xfs_quota_priv.h
@@ -75,7 +75,6 @@ static inline int XQMISLCKD(struct xfs_dqhash *h)
75 75
76#define xfs_qm_freelist_lock(qm) XQMLCK(&((qm)->qm_dqfreelist)) 76#define xfs_qm_freelist_lock(qm) XQMLCK(&((qm)->qm_dqfreelist))
77#define xfs_qm_freelist_unlock(qm) XQMUNLCK(&((qm)->qm_dqfreelist)) 77#define xfs_qm_freelist_unlock(qm) XQMUNLCK(&((qm)->qm_dqfreelist))
78#define XFS_QM_IS_FREELIST_LOCKED(qm) XQMISLCKD(&((qm)->qm_dqfreelist))
79 78
80/* 79/*
81 * Hash into a bucket in the dquot hash table, based on <mp, id>. 80 * Hash into a bucket in the dquot hash table, based on <mp, id>.
@@ -170,6 +169,5 @@ for ((dqp) = (qlist)->qh_next; (dqp) != (xfs_dquot_t *)(qlist); \
170#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \ 169#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \
171 (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \ 170 (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \
172 (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???"))) 171 (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???")))
173#define DQFLAGTO_DIRTYSTR(d) (XFS_DQ_IS_DIRTY(d) ? "DIRTY" : "NOTDIRTY")
174 172
175#endif /* __XFS_QUOTA_PRIV_H__ */ 173#endif /* __XFS_QUOTA_PRIV_H__ */
diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c
index addf5a7ea06c..5cf2e86caa71 100644
--- a/fs/xfs/support/ktrace.c
+++ b/fs/xfs/support/ktrace.c
@@ -75,7 +75,7 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep)
75 sleep); 75 sleep);
76 } else { 76 } else {
77 ktep = (ktrace_entry_t*)kmem_zalloc((nentries * sizeof(*ktep)), 77 ktep = (ktrace_entry_t*)kmem_zalloc((nentries * sizeof(*ktep)),
78 sleep); 78 sleep | KM_LARGE);
79 } 79 }
80 80
81 if (ktep == NULL) { 81 if (ktep == NULL) {
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index dc2361dd740a..9ece7f87ec5b 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -150,7 +150,7 @@ typedef struct xfs_agi {
150#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)XFS_BUF_PTR(bp)) 150#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)XFS_BUF_PTR(bp))
151 151
152typedef struct xfs_agfl { 152typedef struct xfs_agfl {
153 xfs_agblock_t agfl_bno[1]; /* actually XFS_AGFL_SIZE(mp) */ 153 __be32 agfl_bno[1]; /* actually XFS_AGFL_SIZE(mp) */
154} xfs_agfl_t; 154} xfs_agfl_t;
155 155
156/* 156/*
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index d2bbcd882a69..e80dda3437d1 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -1477,8 +1477,10 @@ xfs_alloc_ag_vextent_small(
1477 /* 1477 /*
1478 * Can't allocate from the freelist for some reason. 1478 * Can't allocate from the freelist for some reason.
1479 */ 1479 */
1480 else 1480 else {
1481 fbno = NULLAGBLOCK;
1481 flen = 0; 1482 flen = 0;
1483 }
1482 /* 1484 /*
1483 * Can't do the allocation, give up. 1485 * Can't do the allocation, give up.
1484 */ 1486 */
@@ -2021,7 +2023,7 @@ xfs_alloc_get_freelist(
2021 /* 2023 /*
2022 * Get the block number and update the data structures. 2024 * Get the block number and update the data structures.
2023 */ 2025 */
2024 bno = INT_GET(agfl->agfl_bno[be32_to_cpu(agf->agf_flfirst)], ARCH_CONVERT); 2026 bno = be32_to_cpu(agfl->agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
2025 be32_add(&agf->agf_flfirst, 1); 2027 be32_add(&agf->agf_flfirst, 1);
2026 xfs_trans_brelse(tp, agflbp); 2028 xfs_trans_brelse(tp, agflbp);
2027 if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp)) 2029 if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp))
@@ -2108,7 +2110,7 @@ xfs_alloc_put_freelist(
2108{ 2110{
2109 xfs_agf_t *agf; /* a.g. freespace structure */ 2111 xfs_agf_t *agf; /* a.g. freespace structure */
2110 xfs_agfl_t *agfl; /* a.g. free block array */ 2112 xfs_agfl_t *agfl; /* a.g. free block array */
2111 xfs_agblock_t *blockp;/* pointer to array entry */ 2113 __be32 *blockp;/* pointer to array entry */
2112 int error; 2114 int error;
2113#ifdef XFS_ALLOC_TRACE 2115#ifdef XFS_ALLOC_TRACE
2114 static char fname[] = "xfs_alloc_put_freelist"; 2116 static char fname[] = "xfs_alloc_put_freelist";
@@ -2132,7 +2134,7 @@ xfs_alloc_put_freelist(
2132 pag->pagf_flcount++; 2134 pag->pagf_flcount++;
2133 ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)); 2135 ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp));
2134 blockp = &agfl->agfl_bno[be32_to_cpu(agf->agf_fllast)]; 2136 blockp = &agfl->agfl_bno[be32_to_cpu(agf->agf_fllast)];
2135 INT_SET(*blockp, ARCH_CONVERT, bno); 2137 *blockp = cpu_to_be32(bno);
2136 TRACE_MODAGF(NULL, agf, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT); 2138 TRACE_MODAGF(NULL, agf, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT);
2137 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT); 2139 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT);
2138 xfs_trans_log_buf(tp, agflbp, 2140 xfs_trans_log_buf(tp, agflbp,
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 7446556e8021..74cadf95d4e8 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -92,6 +92,7 @@ xfs_alloc_delrec(
92 xfs_alloc_key_t *rkp; /* right block key pointer */ 92 xfs_alloc_key_t *rkp; /* right block key pointer */
93 xfs_alloc_ptr_t *rpp; /* right block address pointer */ 93 xfs_alloc_ptr_t *rpp; /* right block address pointer */
94 int rrecs=0; /* number of records in right block */ 94 int rrecs=0; /* number of records in right block */
95 int numrecs;
95 xfs_alloc_rec_t *rrp; /* right block record pointer */ 96 xfs_alloc_rec_t *rrp; /* right block record pointer */
96 xfs_btree_cur_t *tcur; /* temporary btree cursor */ 97 xfs_btree_cur_t *tcur; /* temporary btree cursor */
97 98
@@ -115,7 +116,8 @@ xfs_alloc_delrec(
115 /* 116 /*
116 * Fail if we're off the end of the block. 117 * Fail if we're off the end of the block.
117 */ 118 */
118 if (ptr > be16_to_cpu(block->bb_numrecs)) { 119 numrecs = be16_to_cpu(block->bb_numrecs);
120 if (ptr > numrecs) {
119 *stat = 0; 121 *stat = 0;
120 return 0; 122 return 0;
121 } 123 }
@@ -129,18 +131,18 @@ xfs_alloc_delrec(
129 lkp = XFS_ALLOC_KEY_ADDR(block, 1, cur); 131 lkp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
130 lpp = XFS_ALLOC_PTR_ADDR(block, 1, cur); 132 lpp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
131#ifdef DEBUG 133#ifdef DEBUG
132 for (i = ptr; i < be16_to_cpu(block->bb_numrecs); i++) { 134 for (i = ptr; i < numrecs; i++) {
133 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level))) 135 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level)))
134 return error; 136 return error;
135 } 137 }
136#endif 138#endif
137 if (ptr < be16_to_cpu(block->bb_numrecs)) { 139 if (ptr < numrecs) {
138 memmove(&lkp[ptr - 1], &lkp[ptr], 140 memmove(&lkp[ptr - 1], &lkp[ptr],
139 (be16_to_cpu(block->bb_numrecs) - ptr) * sizeof(*lkp)); 141 (numrecs - ptr) * sizeof(*lkp));
140 memmove(&lpp[ptr - 1], &lpp[ptr], 142 memmove(&lpp[ptr - 1], &lpp[ptr],
141 (be16_to_cpu(block->bb_numrecs) - ptr) * sizeof(*lpp)); 143 (numrecs - ptr) * sizeof(*lpp));
142 xfs_alloc_log_ptrs(cur, bp, ptr, be16_to_cpu(block->bb_numrecs) - 1); 144 xfs_alloc_log_ptrs(cur, bp, ptr, numrecs - 1);
143 xfs_alloc_log_keys(cur, bp, ptr, be16_to_cpu(block->bb_numrecs) - 1); 145 xfs_alloc_log_keys(cur, bp, ptr, numrecs - 1);
144 } 146 }
145 } 147 }
146 /* 148 /*
@@ -149,10 +151,10 @@ xfs_alloc_delrec(
149 */ 151 */
150 else { 152 else {
151 lrp = XFS_ALLOC_REC_ADDR(block, 1, cur); 153 lrp = XFS_ALLOC_REC_ADDR(block, 1, cur);
152 if (ptr < be16_to_cpu(block->bb_numrecs)) { 154 if (ptr < numrecs) {
153 memmove(&lrp[ptr - 1], &lrp[ptr], 155 memmove(&lrp[ptr - 1], &lrp[ptr],
154 (be16_to_cpu(block->bb_numrecs) - ptr) * sizeof(*lrp)); 156 (numrecs - ptr) * sizeof(*lrp));
155 xfs_alloc_log_recs(cur, bp, ptr, be16_to_cpu(block->bb_numrecs) - 1); 157 xfs_alloc_log_recs(cur, bp, ptr, numrecs - 1);
156 } 158 }
157 /* 159 /*
158 * If it's the first record in the block, we'll need a key 160 * If it's the first record in the block, we'll need a key
@@ -167,7 +169,8 @@ xfs_alloc_delrec(
167 /* 169 /*
168 * Decrement and log the number of entries in the block. 170 * Decrement and log the number of entries in the block.
169 */ 171 */
170 be16_add(&block->bb_numrecs, -1); 172 numrecs--;
173 block->bb_numrecs = cpu_to_be16(numrecs);
171 xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS); 174 xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
172 /* 175 /*
173 * See if the longest free extent in the allocation group was 176 * See if the longest free extent in the allocation group was
@@ -181,14 +184,14 @@ xfs_alloc_delrec(
181 if (level == 0 && 184 if (level == 0 &&
182 cur->bc_btnum == XFS_BTNUM_CNT && 185 cur->bc_btnum == XFS_BTNUM_CNT &&
183 be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK && 186 be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK &&
184 ptr > be16_to_cpu(block->bb_numrecs)) { 187 ptr > numrecs) {
185 ASSERT(ptr == be16_to_cpu(block->bb_numrecs) + 1); 188 ASSERT(ptr == numrecs + 1);
186 /* 189 /*
187 * There are still records in the block. Grab the size 190 * There are still records in the block. Grab the size
188 * from the last one. 191 * from the last one.
189 */ 192 */
190 if (be16_to_cpu(block->bb_numrecs)) { 193 if (numrecs) {
191 rrp = XFS_ALLOC_REC_ADDR(block, be16_to_cpu(block->bb_numrecs), cur); 194 rrp = XFS_ALLOC_REC_ADDR(block, numrecs, cur);
192 agf->agf_longest = rrp->ar_blockcount; 195 agf->agf_longest = rrp->ar_blockcount;
193 } 196 }
194 /* 197 /*
@@ -211,7 +214,7 @@ xfs_alloc_delrec(
211 * and it's NOT the leaf level, 214 * and it's NOT the leaf level,
212 * then we can get rid of this level. 215 * then we can get rid of this level.
213 */ 216 */
214 if (be16_to_cpu(block->bb_numrecs) == 1 && level > 0) { 217 if (numrecs == 1 && level > 0) {
215 /* 218 /*
216 * lpp is still set to the first pointer in the block. 219 * lpp is still set to the first pointer in the block.
217 * Make it the new root of the btree. 220 * Make it the new root of the btree.
@@ -267,7 +270,7 @@ xfs_alloc_delrec(
267 * If the number of records remaining in the block is at least 270 * If the number of records remaining in the block is at least
268 * the minimum, we're done. 271 * the minimum, we're done.
269 */ 272 */
270 if (be16_to_cpu(block->bb_numrecs) >= XFS_ALLOC_BLOCK_MINRECS(level, cur)) { 273 if (numrecs >= XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
271 if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i))) 274 if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i)))
272 return error; 275 return error;
273 *stat = 1; 276 *stat = 1;
@@ -419,19 +422,21 @@ xfs_alloc_delrec(
419 * See if we can join with the left neighbor block. 422 * See if we can join with the left neighbor block.
420 */ 423 */
421 if (lbno != NULLAGBLOCK && 424 if (lbno != NULLAGBLOCK &&
422 lrecs + be16_to_cpu(block->bb_numrecs) <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { 425 lrecs + numrecs <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
423 /* 426 /*
424 * Set "right" to be the starting block, 427 * Set "right" to be the starting block,
425 * "left" to be the left neighbor. 428 * "left" to be the left neighbor.
426 */ 429 */
427 rbno = bno; 430 rbno = bno;
428 right = block; 431 right = block;
432 rrecs = be16_to_cpu(right->bb_numrecs);
429 rbp = bp; 433 rbp = bp;
430 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, 434 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
431 cur->bc_private.a.agno, lbno, 0, &lbp, 435 cur->bc_private.a.agno, lbno, 0, &lbp,
432 XFS_ALLOC_BTREE_REF))) 436 XFS_ALLOC_BTREE_REF)))
433 return error; 437 return error;
434 left = XFS_BUF_TO_ALLOC_BLOCK(lbp); 438 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
439 lrecs = be16_to_cpu(left->bb_numrecs);
435 if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) 440 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
436 return error; 441 return error;
437 } 442 }
@@ -439,20 +444,21 @@ xfs_alloc_delrec(
439 * If that won't work, see if we can join with the right neighbor block. 444 * If that won't work, see if we can join with the right neighbor block.
440 */ 445 */
441 else if (rbno != NULLAGBLOCK && 446 else if (rbno != NULLAGBLOCK &&
442 rrecs + be16_to_cpu(block->bb_numrecs) <= 447 rrecs + numrecs <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
443 XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
444 /* 448 /*
445 * Set "left" to be the starting block, 449 * Set "left" to be the starting block,
446 * "right" to be the right neighbor. 450 * "right" to be the right neighbor.
447 */ 451 */
448 lbno = bno; 452 lbno = bno;
449 left = block; 453 left = block;
454 lrecs = be16_to_cpu(left->bb_numrecs);
450 lbp = bp; 455 lbp = bp;
451 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, 456 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
452 cur->bc_private.a.agno, rbno, 0, &rbp, 457 cur->bc_private.a.agno, rbno, 0, &rbp,
453 XFS_ALLOC_BTREE_REF))) 458 XFS_ALLOC_BTREE_REF)))
454 return error; 459 return error;
455 right = XFS_BUF_TO_ALLOC_BLOCK(rbp); 460 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
461 rrecs = be16_to_cpu(right->bb_numrecs);
456 if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) 462 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
457 return error; 463 return error;
458 } 464 }
@@ -474,34 +480,28 @@ xfs_alloc_delrec(
474 /* 480 /*
475 * It's a non-leaf. Move keys and pointers. 481 * It's a non-leaf. Move keys and pointers.
476 */ 482 */
477 lkp = XFS_ALLOC_KEY_ADDR(left, be16_to_cpu(left->bb_numrecs) + 1, cur); 483 lkp = XFS_ALLOC_KEY_ADDR(left, lrecs + 1, cur);
478 lpp = XFS_ALLOC_PTR_ADDR(left, be16_to_cpu(left->bb_numrecs) + 1, cur); 484 lpp = XFS_ALLOC_PTR_ADDR(left, lrecs + 1, cur);
479 rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur); 485 rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
480 rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur); 486 rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
481#ifdef DEBUG 487#ifdef DEBUG
482 for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) { 488 for (i = 0; i < rrecs; i++) {
483 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level))) 489 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
484 return error; 490 return error;
485 } 491 }
486#endif 492#endif
487 memcpy(lkp, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*lkp)); 493 memcpy(lkp, rkp, rrecs * sizeof(*lkp));
488 memcpy(lpp, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*lpp)); 494 memcpy(lpp, rpp, rrecs * sizeof(*lpp));
489 xfs_alloc_log_keys(cur, lbp, be16_to_cpu(left->bb_numrecs) + 1, 495 xfs_alloc_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
490 be16_to_cpu(left->bb_numrecs) + 496 xfs_alloc_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
491 be16_to_cpu(right->bb_numrecs));
492 xfs_alloc_log_ptrs(cur, lbp, be16_to_cpu(left->bb_numrecs) + 1,
493 be16_to_cpu(left->bb_numrecs) +
494 be16_to_cpu(right->bb_numrecs));
495 } else { 497 } else {
496 /* 498 /*
497 * It's a leaf. Move records. 499 * It's a leaf. Move records.
498 */ 500 */
499 lrp = XFS_ALLOC_REC_ADDR(left, be16_to_cpu(left->bb_numrecs) + 1, cur); 501 lrp = XFS_ALLOC_REC_ADDR(left, lrecs + 1, cur);
500 rrp = XFS_ALLOC_REC_ADDR(right, 1, cur); 502 rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
501 memcpy(lrp, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*lrp)); 503 memcpy(lrp, rrp, rrecs * sizeof(*lrp));
502 xfs_alloc_log_recs(cur, lbp, be16_to_cpu(left->bb_numrecs) + 1, 504 xfs_alloc_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
503 be16_to_cpu(left->bb_numrecs) +
504 be16_to_cpu(right->bb_numrecs));
505 } 505 }
506 /* 506 /*
507 * If we joined with the left neighbor, set the buffer in the 507 * If we joined with the left neighbor, set the buffer in the
@@ -509,7 +509,7 @@ xfs_alloc_delrec(
509 */ 509 */
510 if (bp != lbp) { 510 if (bp != lbp) {
511 xfs_btree_setbuf(cur, level, lbp); 511 xfs_btree_setbuf(cur, level, lbp);
512 cur->bc_ptrs[level] += be16_to_cpu(left->bb_numrecs); 512 cur->bc_ptrs[level] += lrecs;
513 } 513 }
514 /* 514 /*
515 * If we joined with the right neighbor and there's a level above 515 * If we joined with the right neighbor and there's a level above
@@ -521,7 +521,8 @@ xfs_alloc_delrec(
521 /* 521 /*
522 * Fix up the number of records in the surviving block. 522 * Fix up the number of records in the surviving block.
523 */ 523 */
524 be16_add(&left->bb_numrecs, be16_to_cpu(right->bb_numrecs)); 524 lrecs += rrecs;
525 left->bb_numrecs = cpu_to_be16(lrecs);
525 /* 526 /*
526 * Fix up the right block pointer in the surviving block, and log it. 527 * Fix up the right block pointer in the surviving block, and log it.
527 */ 528 */
@@ -608,6 +609,7 @@ xfs_alloc_insrec(
608 xfs_btree_cur_t *ncur; /* new cursor to be used at next lvl */ 609 xfs_btree_cur_t *ncur; /* new cursor to be used at next lvl */
609 xfs_alloc_key_t nkey; /* new key value, from split */ 610 xfs_alloc_key_t nkey; /* new key value, from split */
610 xfs_alloc_rec_t nrec; /* new record value, for caller */ 611 xfs_alloc_rec_t nrec; /* new record value, for caller */
612 int numrecs;
611 int optr; /* old ptr value */ 613 int optr; /* old ptr value */
612 xfs_alloc_ptr_t *pp; /* pointer to btree addresses */ 614 xfs_alloc_ptr_t *pp; /* pointer to btree addresses */
613 int ptr; /* index in btree block for this rec */ 615 int ptr; /* index in btree block for this rec */
@@ -653,13 +655,14 @@ xfs_alloc_insrec(
653 */ 655 */
654 bp = cur->bc_bufs[level]; 656 bp = cur->bc_bufs[level];
655 block = XFS_BUF_TO_ALLOC_BLOCK(bp); 657 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
658 numrecs = be16_to_cpu(block->bb_numrecs);
656#ifdef DEBUG 659#ifdef DEBUG
657 if ((error = xfs_btree_check_sblock(cur, block, level, bp))) 660 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
658 return error; 661 return error;
659 /* 662 /*
660 * Check that the new entry is being inserted in the right place. 663 * Check that the new entry is being inserted in the right place.
661 */ 664 */
662 if (ptr <= be16_to_cpu(block->bb_numrecs)) { 665 if (ptr <= numrecs) {
663 if (level == 0) { 666 if (level == 0) {
664 rp = XFS_ALLOC_REC_ADDR(block, ptr, cur); 667 rp = XFS_ALLOC_REC_ADDR(block, ptr, cur);
665 xfs_btree_check_rec(cur->bc_btnum, recp, rp); 668 xfs_btree_check_rec(cur->bc_btnum, recp, rp);
@@ -670,12 +673,12 @@ xfs_alloc_insrec(
670 } 673 }
671#endif 674#endif
672 nbno = NULLAGBLOCK; 675 nbno = NULLAGBLOCK;
673 ncur = (xfs_btree_cur_t *)0; 676 ncur = NULL;
674 /* 677 /*
675 * If the block is full, we can't insert the new entry until we 678 * If the block is full, we can't insert the new entry until we
676 * make the block un-full. 679 * make the block un-full.
677 */ 680 */
678 if (be16_to_cpu(block->bb_numrecs) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { 681 if (numrecs == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
679 /* 682 /*
680 * First, try shifting an entry to the right neighbor. 683 * First, try shifting an entry to the right neighbor.
681 */ 684 */
@@ -729,6 +732,7 @@ xfs_alloc_insrec(
729 * At this point we know there's room for our new entry in the block 732 * At this point we know there's room for our new entry in the block
730 * we're pointing at. 733 * we're pointing at.
731 */ 734 */
735 numrecs = be16_to_cpu(block->bb_numrecs);
732 if (level > 0) { 736 if (level > 0) {
733 /* 737 /*
734 * It's a non-leaf entry. Make a hole for the new data 738 * It's a non-leaf entry. Make a hole for the new data
@@ -737,15 +741,15 @@ xfs_alloc_insrec(
737 kp = XFS_ALLOC_KEY_ADDR(block, 1, cur); 741 kp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
738 pp = XFS_ALLOC_PTR_ADDR(block, 1, cur); 742 pp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
739#ifdef DEBUG 743#ifdef DEBUG
740 for (i = be16_to_cpu(block->bb_numrecs); i >= ptr; i--) { 744 for (i = numrecs; i >= ptr; i--) {
741 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i - 1]), level))) 745 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i - 1]), level)))
742 return error; 746 return error;
743 } 747 }
744#endif 748#endif
745 memmove(&kp[ptr], &kp[ptr - 1], 749 memmove(&kp[ptr], &kp[ptr - 1],
746 (be16_to_cpu(block->bb_numrecs) - ptr + 1) * sizeof(*kp)); 750 (numrecs - ptr + 1) * sizeof(*kp));
747 memmove(&pp[ptr], &pp[ptr - 1], 751 memmove(&pp[ptr], &pp[ptr - 1],
748 (be16_to_cpu(block->bb_numrecs) - ptr + 1) * sizeof(*pp)); 752 (numrecs - ptr + 1) * sizeof(*pp));
749#ifdef DEBUG 753#ifdef DEBUG
750 if ((error = xfs_btree_check_sptr(cur, *bnop, level))) 754 if ((error = xfs_btree_check_sptr(cur, *bnop, level)))
751 return error; 755 return error;
@@ -755,11 +759,12 @@ xfs_alloc_insrec(
755 */ 759 */
756 kp[ptr - 1] = key; 760 kp[ptr - 1] = key;
757 pp[ptr - 1] = cpu_to_be32(*bnop); 761 pp[ptr - 1] = cpu_to_be32(*bnop);
758 be16_add(&block->bb_numrecs, 1); 762 numrecs++;
759 xfs_alloc_log_keys(cur, bp, ptr, be16_to_cpu(block->bb_numrecs)); 763 block->bb_numrecs = cpu_to_be16(numrecs);
760 xfs_alloc_log_ptrs(cur, bp, ptr, be16_to_cpu(block->bb_numrecs)); 764 xfs_alloc_log_keys(cur, bp, ptr, numrecs);
765 xfs_alloc_log_ptrs(cur, bp, ptr, numrecs);
761#ifdef DEBUG 766#ifdef DEBUG
762 if (ptr < be16_to_cpu(block->bb_numrecs)) 767 if (ptr < numrecs)
763 xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1, 768 xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1,
764 kp + ptr); 769 kp + ptr);
765#endif 770#endif
@@ -769,16 +774,17 @@ xfs_alloc_insrec(
769 */ 774 */
770 rp = XFS_ALLOC_REC_ADDR(block, 1, cur); 775 rp = XFS_ALLOC_REC_ADDR(block, 1, cur);
771 memmove(&rp[ptr], &rp[ptr - 1], 776 memmove(&rp[ptr], &rp[ptr - 1],
772 (be16_to_cpu(block->bb_numrecs) - ptr + 1) * sizeof(*rp)); 777 (numrecs - ptr + 1) * sizeof(*rp));
773 /* 778 /*
774 * Now stuff the new record in, bump numrecs 779 * Now stuff the new record in, bump numrecs
775 * and log the new data. 780 * and log the new data.
776 */ 781 */
777 rp[ptr - 1] = *recp; /* INT_: struct copy */ 782 rp[ptr - 1] = *recp;
778 be16_add(&block->bb_numrecs, 1); 783 numrecs++;
779 xfs_alloc_log_recs(cur, bp, ptr, be16_to_cpu(block->bb_numrecs)); 784 block->bb_numrecs = cpu_to_be16(numrecs);
785 xfs_alloc_log_recs(cur, bp, ptr, numrecs);
780#ifdef DEBUG 786#ifdef DEBUG
781 if (ptr < be16_to_cpu(block->bb_numrecs)) 787 if (ptr < numrecs)
782 xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1, 788 xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1,
783 rp + ptr); 789 rp + ptr);
784#endif 790#endif
@@ -819,8 +825,8 @@ xfs_alloc_insrec(
819 */ 825 */
820 *bnop = nbno; 826 *bnop = nbno;
821 if (nbno != NULLAGBLOCK) { 827 if (nbno != NULLAGBLOCK) {
822 *recp = nrec; /* INT_: struct copy */ 828 *recp = nrec;
823 *curp = ncur; /* INT_: struct copy */ 829 *curp = ncur;
824 } 830 }
825 *stat = 1; 831 *stat = 1;
826 return 0; 832 return 0;
@@ -981,7 +987,7 @@ xfs_alloc_lookup(
981 */ 987 */
982 bp = cur->bc_bufs[level]; 988 bp = cur->bc_bufs[level];
983 if (bp && XFS_BUF_ADDR(bp) != d) 989 if (bp && XFS_BUF_ADDR(bp) != d)
984 bp = (xfs_buf_t *)0; 990 bp = NULL;
985 if (!bp) { 991 if (!bp) {
986 /* 992 /*
987 * Need to get a new buffer. Read it, then 993 * Need to get a new buffer. Read it, then
@@ -1229,7 +1235,7 @@ xfs_alloc_lshift(
1229 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*rpp), level))) 1235 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*rpp), level)))
1230 return error; 1236 return error;
1231#endif 1237#endif
1232 *lpp = *rpp; /* INT_: copy */ 1238 *lpp = *rpp;
1233 xfs_alloc_log_ptrs(cur, lbp, nrec, nrec); 1239 xfs_alloc_log_ptrs(cur, lbp, nrec, nrec);
1234 xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp); 1240 xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp);
1235 } 1241 }
@@ -1406,8 +1412,8 @@ xfs_alloc_newroot(
1406 1412
1407 kp = XFS_ALLOC_KEY_ADDR(new, 1, cur); 1413 kp = XFS_ALLOC_KEY_ADDR(new, 1, cur);
1408 if (be16_to_cpu(left->bb_level) > 0) { 1414 if (be16_to_cpu(left->bb_level) > 0) {
1409 kp[0] = *XFS_ALLOC_KEY_ADDR(left, 1, cur); /* INT_: structure copy */ 1415 kp[0] = *XFS_ALLOC_KEY_ADDR(left, 1, cur);
1410 kp[1] = *XFS_ALLOC_KEY_ADDR(right, 1, cur);/* INT_: structure copy */ 1416 kp[1] = *XFS_ALLOC_KEY_ADDR(right, 1, cur);
1411 } else { 1417 } else {
1412 xfs_alloc_rec_t *rp; /* btree record pointer */ 1418 xfs_alloc_rec_t *rp; /* btree record pointer */
1413 1419
@@ -1527,8 +1533,8 @@ xfs_alloc_rshift(
1527 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*lpp), level))) 1533 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*lpp), level)))
1528 return error; 1534 return error;
1529#endif 1535#endif
1530 *rkp = *lkp; /* INT_: copy */ 1536 *rkp = *lkp;
1531 *rpp = *lpp; /* INT_: copy */ 1537 *rpp = *lpp;
1532 xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); 1538 xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1533 xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); 1539 xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1534 xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1); 1540 xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1);
@@ -2044,7 +2050,7 @@ xfs_alloc_insert(
2044 nbno = NULLAGBLOCK; 2050 nbno = NULLAGBLOCK;
2045 nrec.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock); 2051 nrec.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock);
2046 nrec.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount); 2052 nrec.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount);
2047 ncur = (xfs_btree_cur_t *)0; 2053 ncur = NULL;
2048 pcur = cur; 2054 pcur = cur;
2049 /* 2055 /*
2050 * Loop going up the tree, starting at the leaf level. 2056 * Loop going up the tree, starting at the leaf level.
@@ -2076,7 +2082,7 @@ xfs_alloc_insert(
2076 */ 2082 */
2077 if (ncur) { 2083 if (ncur) {
2078 pcur = ncur; 2084 pcur = ncur;
2079 ncur = (xfs_btree_cur_t *)0; 2085 ncur = NULL;
2080 } 2086 }
2081 } while (nbno != NULLAGBLOCK); 2087 } while (nbno != NULLAGBLOCK);
2082 *stat = i; 2088 *stat = i;
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 1a2101043275..9ada7bdbae52 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -91,7 +91,6 @@ STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
91/* 91/*
92 * Routines to manipulate out-of-line attribute values. 92 * Routines to manipulate out-of-line attribute values.
93 */ 93 */
94STATIC int xfs_attr_rmtval_get(xfs_da_args_t *args);
95STATIC int xfs_attr_rmtval_set(xfs_da_args_t *args); 94STATIC int xfs_attr_rmtval_set(xfs_da_args_t *args);
96STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args); 95STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args);
97 96
@@ -180,7 +179,7 @@ xfs_attr_get(bhv_desc_t *bdp, const char *name, char *value, int *valuelenp,
180 return(error); 179 return(error);
181} 180}
182 181
183STATIC int 182int
184xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen, 183xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen,
185 char *value, int valuelen, int flags) 184 char *value, int valuelen, int flags)
186{ 185{
@@ -440,7 +439,7 @@ xfs_attr_set(bhv_desc_t *bdp, const char *name, char *value, int valuelen, int f
440 * Generic handler routine to remove a name from an attribute list. 439 * Generic handler routine to remove a name from an attribute list.
441 * Transitions attribute list from Btree to shortform as necessary. 440 * Transitions attribute list from Btree to shortform as necessary.
442 */ 441 */
443STATIC int 442int
444xfs_attr_remove_int(xfs_inode_t *dp, const char *name, int namelen, int flags) 443xfs_attr_remove_int(xfs_inode_t *dp, const char *name, int namelen, int flags)
445{ 444{
446 xfs_da_args_t args; 445 xfs_da_args_t args;
@@ -591,6 +590,110 @@ xfs_attr_remove(bhv_desc_t *bdp, const char *name, int flags, struct cred *cred)
591 return xfs_attr_remove_int(dp, name, namelen, flags); 590 return xfs_attr_remove_int(dp, name, namelen, flags);
592} 591}
593 592
593int /* error */
594xfs_attr_list_int(xfs_attr_list_context_t *context)
595{
596 int error;
597 xfs_inode_t *dp = context->dp;
598
599 /*
600 * Decide on what work routines to call based on the inode size.
601 */
602 if (XFS_IFORK_Q(dp) == 0 ||
603 (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
604 dp->i_d.di_anextents == 0)) {
605 error = 0;
606 } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
607 error = xfs_attr_shortform_list(context);
608 } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
609 error = xfs_attr_leaf_list(context);
610 } else {
611 error = xfs_attr_node_list(context);
612 }
613 return error;
614}
615
616#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \
617 (((struct attrlist_ent *) 0)->a_name - (char *) 0)
618#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \
619 ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \
620 & ~(sizeof(u_int32_t)-1))
621
622/*
623 * Format an attribute and copy it out to the user's buffer.
624 * Take care to check values and protect against them changing later,
625 * we may be reading them directly out of a user buffer.
626 */
627/*ARGSUSED*/
628STATIC int
629xfs_attr_put_listent(xfs_attr_list_context_t *context, attrnames_t *namesp,
630 char *name, int namelen,
631 int valuelen, char *value)
632{
633 attrlist_ent_t *aep;
634 int arraytop;
635
636 ASSERT(!(context->flags & ATTR_KERNOVAL));
637 ASSERT(context->count >= 0);
638 ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
639 ASSERT(context->firstu >= sizeof(*context->alist));
640 ASSERT(context->firstu <= context->bufsize);
641
642 arraytop = sizeof(*context->alist) +
643 context->count * sizeof(context->alist->al_offset[0]);
644 context->firstu -= ATTR_ENTSIZE(namelen);
645 if (context->firstu < arraytop) {
646 xfs_attr_trace_l_c("buffer full", context);
647 context->alist->al_more = 1;
648 context->seen_enough = 1;
649 return 1;
650 }
651
652 aep = (attrlist_ent_t *)&(((char *)context->alist)[ context->firstu ]);
653 aep->a_valuelen = valuelen;
654 memcpy(aep->a_name, name, namelen);
655 aep->a_name[ namelen ] = 0;
656 context->alist->al_offset[ context->count++ ] = context->firstu;
657 context->alist->al_count = context->count;
658 xfs_attr_trace_l_c("add", context);
659 return 0;
660}
661
662STATIC int
663xfs_attr_kern_list(xfs_attr_list_context_t *context, attrnames_t *namesp,
664 char *name, int namelen,
665 int valuelen, char *value)
666{
667 char *offset;
668 int arraytop;
669
670 ASSERT(context->count >= 0);
671
672 arraytop = context->count + namesp->attr_namelen + namelen + 1;
673 if (arraytop > context->firstu) {
674 context->count = -1; /* insufficient space */
675 return 1;
676 }
677 offset = (char *)context->alist + context->count;
678 strncpy(offset, namesp->attr_name, namesp->attr_namelen);
679 offset += namesp->attr_namelen;
680 strncpy(offset, name, namelen); /* real name */
681 offset += namelen;
682 *offset = '\0';
683 context->count += namesp->attr_namelen + namelen + 1;
684 return 0;
685}
686
687/*ARGSUSED*/
688STATIC int
689xfs_attr_kern_list_sizes(xfs_attr_list_context_t *context, attrnames_t *namesp,
690 char *name, int namelen,
691 int valuelen, char *value)
692{
693 context->count += namesp->attr_namelen + namelen + 1;
694 return 0;
695}
696
594/* 697/*
595 * Generate a list of extended attribute names and optionally 698 * Generate a list of extended attribute names and optionally
596 * also value lengths. Positive return value follows the XFS 699 * also value lengths. Positive return value follows the XFS
@@ -615,13 +718,13 @@ xfs_attr_list(bhv_desc_t *bdp, char *buffer, int bufsize, int flags,
615 return(XFS_ERROR(EINVAL)); 718 return(XFS_ERROR(EINVAL));
616 if ((cursor->initted == 0) && 719 if ((cursor->initted == 0) &&
617 (cursor->hashval || cursor->blkno || cursor->offset)) 720 (cursor->hashval || cursor->blkno || cursor->offset))
618 return(XFS_ERROR(EINVAL)); 721 return XFS_ERROR(EINVAL);
619 722
620 /* 723 /*
621 * Check for a properly aligned buffer. 724 * Check for a properly aligned buffer.
622 */ 725 */
623 if (((long)buffer) & (sizeof(int)-1)) 726 if (((long)buffer) & (sizeof(int)-1))
624 return(XFS_ERROR(EFAULT)); 727 return XFS_ERROR(EFAULT);
625 if (flags & ATTR_KERNOVAL) 728 if (flags & ATTR_KERNOVAL)
626 bufsize = 0; 729 bufsize = 0;
627 730
@@ -634,53 +737,47 @@ xfs_attr_list(bhv_desc_t *bdp, char *buffer, int bufsize, int flags,
634 context.dupcnt = 0; 737 context.dupcnt = 0;
635 context.resynch = 1; 738 context.resynch = 1;
636 context.flags = flags; 739 context.flags = flags;
637 if (!(flags & ATTR_KERNAMELS)) { 740 context.seen_enough = 0;
741 context.alist = (attrlist_t *)buffer;
742 context.put_value = 0;
743
744 if (flags & ATTR_KERNAMELS) {
745 context.bufsize = bufsize;
746 context.firstu = context.bufsize;
747 if (flags & ATTR_KERNOVAL)
748 context.put_listent = xfs_attr_kern_list_sizes;
749 else
750 context.put_listent = xfs_attr_kern_list;
751 } else {
638 context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */ 752 context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */
639 context.firstu = context.bufsize; 753 context.firstu = context.bufsize;
640 context.alist = (attrlist_t *)buffer;
641 context.alist->al_count = 0; 754 context.alist->al_count = 0;
642 context.alist->al_more = 0; 755 context.alist->al_more = 0;
643 context.alist->al_offset[0] = context.bufsize; 756 context.alist->al_offset[0] = context.bufsize;
644 } 757 context.put_listent = xfs_attr_put_listent;
645 else {
646 context.bufsize = bufsize;
647 context.firstu = context.bufsize;
648 context.alist = (attrlist_t *)buffer;
649 } 758 }
650 759
651 if (XFS_FORCED_SHUTDOWN(dp->i_mount)) 760 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
652 return (EIO); 761 return EIO;
653 762
654 xfs_ilock(dp, XFS_ILOCK_SHARED); 763 xfs_ilock(dp, XFS_ILOCK_SHARED);
655 /*
656 * Decide on what work routines to call based on the inode size.
657 */
658 xfs_attr_trace_l_c("syscall start", &context); 764 xfs_attr_trace_l_c("syscall start", &context);
659 if (XFS_IFORK_Q(dp) == 0 || 765
660 (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && 766 error = xfs_attr_list_int(&context);
661 dp->i_d.di_anextents == 0)) { 767
662 error = 0;
663 } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
664 error = xfs_attr_shortform_list(&context);
665 } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
666 error = xfs_attr_leaf_list(&context);
667 } else {
668 error = xfs_attr_node_list(&context);
669 }
670 xfs_iunlock(dp, XFS_ILOCK_SHARED); 768 xfs_iunlock(dp, XFS_ILOCK_SHARED);
671 xfs_attr_trace_l_c("syscall end", &context); 769 xfs_attr_trace_l_c("syscall end", &context);
672 770
673 if (!(context.flags & (ATTR_KERNOVAL|ATTR_KERNAMELS))) { 771 if (context.flags & (ATTR_KERNOVAL|ATTR_KERNAMELS)) {
674 ASSERT(error >= 0); 772 /* must return negated buffer size or the error */
675 }
676 else { /* must return negated buffer size or the error */
677 if (context.count < 0) 773 if (context.count < 0)
678 error = XFS_ERROR(ERANGE); 774 error = XFS_ERROR(ERANGE);
679 else 775 else
680 error = -context.count; 776 error = -context.count;
681 } 777 } else
778 ASSERT(error >= 0);
682 779
683 return(error); 780 return error;
684} 781}
685 782
686int /* error */ 783int /* error */
@@ -1122,19 +1219,19 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context)
1122 context->cursor->blkno = 0; 1219 context->cursor->blkno = 0;
1123 error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK); 1220 error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK);
1124 if (error) 1221 if (error)
1125 return(error); 1222 return XFS_ERROR(error);
1126 ASSERT(bp != NULL); 1223 ASSERT(bp != NULL);
1127 leaf = bp->data; 1224 leaf = bp->data;
1128 if (unlikely(be16_to_cpu(leaf->hdr.info.magic) != XFS_ATTR_LEAF_MAGIC)) { 1225 if (unlikely(be16_to_cpu(leaf->hdr.info.magic) != XFS_ATTR_LEAF_MAGIC)) {
1129 XFS_CORRUPTION_ERROR("xfs_attr_leaf_list", XFS_ERRLEVEL_LOW, 1226 XFS_CORRUPTION_ERROR("xfs_attr_leaf_list", XFS_ERRLEVEL_LOW,
1130 context->dp->i_mount, leaf); 1227 context->dp->i_mount, leaf);
1131 xfs_da_brelse(NULL, bp); 1228 xfs_da_brelse(NULL, bp);
1132 return(XFS_ERROR(EFSCORRUPTED)); 1229 return XFS_ERROR(EFSCORRUPTED);
1133 } 1230 }
1134 1231
1135 (void)xfs_attr_leaf_list_int(bp, context); 1232 error = xfs_attr_leaf_list_int(bp, context);
1136 xfs_da_brelse(NULL, bp); 1233 xfs_da_brelse(NULL, bp);
1137 return(0); 1234 return XFS_ERROR(error);
1138} 1235}
1139 1236
1140 1237
@@ -1858,8 +1955,12 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
1858 return(XFS_ERROR(EFSCORRUPTED)); 1955 return(XFS_ERROR(EFSCORRUPTED));
1859 } 1956 }
1860 error = xfs_attr_leaf_list_int(bp, context); 1957 error = xfs_attr_leaf_list_int(bp, context);
1861 if (error || !leaf->hdr.info.forw) 1958 if (error) {
1862 break; /* not really an error, buffer full or EOF */ 1959 xfs_da_brelse(NULL, bp);
1960 return error;
1961 }
1962 if (context->seen_enough || leaf->hdr.info.forw == 0)
1963 break;
1863 cursor->blkno = be32_to_cpu(leaf->hdr.info.forw); 1964 cursor->blkno = be32_to_cpu(leaf->hdr.info.forw);
1864 xfs_da_brelse(NULL, bp); 1965 xfs_da_brelse(NULL, bp);
1865 error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, 1966 error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1,
@@ -1886,7 +1987,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
1886 * Read the value associated with an attribute from the out-of-line buffer 1987 * Read the value associated with an attribute from the out-of-line buffer
1887 * that we stored it in. 1988 * that we stored it in.
1888 */ 1989 */
1889STATIC int 1990int
1890xfs_attr_rmtval_get(xfs_da_args_t *args) 1991xfs_attr_rmtval_get(xfs_da_args_t *args)
1891{ 1992{
1892 xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE]; 1993 xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE];
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index 981633f6c077..783977d3ea71 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -37,6 +37,7 @@
37 37
38struct cred; 38struct cred;
39struct bhv_vnode; 39struct bhv_vnode;
40struct xfs_attr_list_context;
40 41
41typedef int (*attrset_t)(struct bhv_vnode *, char *, void *, size_t, int); 42typedef int (*attrset_t)(struct bhv_vnode *, char *, void *, size_t, int);
42typedef int (*attrget_t)(struct bhv_vnode *, char *, void *, size_t, int); 43typedef int (*attrget_t)(struct bhv_vnode *, char *, void *, size_t, int);
@@ -160,13 +161,16 @@ struct xfs_da_args;
160 */ 161 */
161int xfs_attr_get(bhv_desc_t *, const char *, char *, int *, int, struct cred *); 162int xfs_attr_get(bhv_desc_t *, const char *, char *, int *, int, struct cred *);
162int xfs_attr_set(bhv_desc_t *, const char *, char *, int, int, struct cred *); 163int xfs_attr_set(bhv_desc_t *, const char *, char *, int, int, struct cred *);
164int xfs_attr_set_int(struct xfs_inode *, const char *, int, char *, int, int);
163int xfs_attr_remove(bhv_desc_t *, const char *, int, struct cred *); 165int xfs_attr_remove(bhv_desc_t *, const char *, int, struct cred *);
164int xfs_attr_list(bhv_desc_t *, char *, int, int, 166int xfs_attr_remove_int(struct xfs_inode *, const char *, int, int);
165 struct attrlist_cursor_kern *, struct cred *); 167int xfs_attr_list(bhv_desc_t *, char *, int, int, struct attrlist_cursor_kern *, struct cred *);
168int xfs_attr_list_int(struct xfs_attr_list_context *);
166int xfs_attr_inactive(struct xfs_inode *dp); 169int xfs_attr_inactive(struct xfs_inode *dp);
167 170
168int xfs_attr_shortform_getvalue(struct xfs_da_args *); 171int xfs_attr_shortform_getvalue(struct xfs_da_args *);
169int xfs_attr_fetch(struct xfs_inode *, const char *, int, 172int xfs_attr_fetch(struct xfs_inode *, const char *, int,
170 char *, int *, int, struct cred *); 173 char *, int *, int, struct cred *);
174int xfs_attr_rmtval_get(struct xfs_da_args *args);
171 175
172#endif /* __XFS_ATTR_H__ */ 176#endif /* __XFS_ATTR_H__ */
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 9455051f0120..9719bbef122c 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -89,9 +89,46 @@ STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf,
89 int dst_start, int move_count, 89 int dst_start, int move_count,
90 xfs_mount_t *mp); 90 xfs_mount_t *mp);
91STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); 91STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
92STATIC int xfs_attr_put_listent(xfs_attr_list_context_t *context, 92
93 attrnames_t *, char *name, int namelen, 93/*========================================================================
94 int valuelen); 94 * Namespace helper routines
95 *========================================================================*/
96
97STATIC inline attrnames_t *
98xfs_attr_flags_namesp(int flags)
99{
100 return ((flags & XFS_ATTR_SECURE) ? &attr_secure:
101 ((flags & XFS_ATTR_ROOT) ? &attr_trusted : &attr_user));
102}
103
104/*
105 * If namespace bits don't match return 0.
106 * If all match then return 1.
107 */
108STATIC inline int
109xfs_attr_namesp_match(int arg_flags, int ondisk_flags)
110{
111 return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags);
112}
113
114/*
115 * If namespace bits don't match and we don't have an override for it
116 * then return 0.
117 * If all match or are overridable then return 1.
118 */
119STATIC inline int
120xfs_attr_namesp_match_overrides(int arg_flags, int ondisk_flags)
121{
122 if (((arg_flags & ATTR_SECURE) == 0) !=
123 ((ondisk_flags & XFS_ATTR_SECURE) == 0) &&
124 !(arg_flags & ATTR_KERNORMALS))
125 return 0;
126 if (((arg_flags & ATTR_ROOT) == 0) !=
127 ((ondisk_flags & XFS_ATTR_ROOT) == 0) &&
128 !(arg_flags & ATTR_KERNROOTLS))
129 return 0;
130 return 1;
131}
95 132
96 133
97/*======================================================================== 134/*========================================================================
@@ -228,11 +265,7 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
228 continue; 265 continue;
229 if (memcmp(args->name, sfe->nameval, args->namelen) != 0) 266 if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
230 continue; 267 continue;
231 if (((args->flags & ATTR_SECURE) != 0) != 268 if (!xfs_attr_namesp_match(args->flags, sfe->flags))
232 ((sfe->flags & XFS_ATTR_SECURE) != 0))
233 continue;
234 if (((args->flags & ATTR_ROOT) != 0) !=
235 ((sfe->flags & XFS_ATTR_ROOT) != 0))
236 continue; 269 continue;
237 ASSERT(0); 270 ASSERT(0);
238#endif 271#endif
@@ -246,8 +279,7 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
246 279
247 sfe->namelen = args->namelen; 280 sfe->namelen = args->namelen;
248 sfe->valuelen = args->valuelen; 281 sfe->valuelen = args->valuelen;
249 sfe->flags = (args->flags & ATTR_SECURE) ? XFS_ATTR_SECURE : 282 sfe->flags = XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
250 ((args->flags & ATTR_ROOT) ? XFS_ATTR_ROOT : 0);
251 memcpy(sfe->nameval, args->name, args->namelen); 283 memcpy(sfe->nameval, args->name, args->namelen);
252 memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen); 284 memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen);
253 sf->hdr.count++; 285 sf->hdr.count++;
@@ -282,11 +314,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
282 continue; 314 continue;
283 if (memcmp(sfe->nameval, args->name, args->namelen) != 0) 315 if (memcmp(sfe->nameval, args->name, args->namelen) != 0)
284 continue; 316 continue;
285 if (((args->flags & ATTR_SECURE) != 0) != 317 if (!xfs_attr_namesp_match(args->flags, sfe->flags))
286 ((sfe->flags & XFS_ATTR_SECURE) != 0))
287 continue;
288 if (((args->flags & ATTR_ROOT) != 0) !=
289 ((sfe->flags & XFS_ATTR_ROOT) != 0))
290 continue; 318 continue;
291 break; 319 break;
292 } 320 }
@@ -363,11 +391,7 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args)
363 continue; 391 continue;
364 if (memcmp(args->name, sfe->nameval, args->namelen) != 0) 392 if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
365 continue; 393 continue;
366 if (((args->flags & ATTR_SECURE) != 0) != 394 if (!xfs_attr_namesp_match(args->flags, sfe->flags))
367 ((sfe->flags & XFS_ATTR_SECURE) != 0))
368 continue;
369 if (((args->flags & ATTR_ROOT) != 0) !=
370 ((sfe->flags & XFS_ATTR_ROOT) != 0))
371 continue; 395 continue;
372 return(XFS_ERROR(EEXIST)); 396 return(XFS_ERROR(EEXIST));
373 } 397 }
@@ -394,11 +418,7 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args)
394 continue; 418 continue;
395 if (memcmp(args->name, sfe->nameval, args->namelen) != 0) 419 if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
396 continue; 420 continue;
397 if (((args->flags & ATTR_SECURE) != 0) != 421 if (!xfs_attr_namesp_match(args->flags, sfe->flags))
398 ((sfe->flags & XFS_ATTR_SECURE) != 0))
399 continue;
400 if (((args->flags & ATTR_ROOT) != 0) !=
401 ((sfe->flags & XFS_ATTR_ROOT) != 0))
402 continue; 422 continue;
403 if (args->flags & ATTR_KERNOVAL) { 423 if (args->flags & ATTR_KERNOVAL) {
404 args->valuelen = sfe->valuelen; 424 args->valuelen = sfe->valuelen;
@@ -485,8 +505,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
485 nargs.valuelen = sfe->valuelen; 505 nargs.valuelen = sfe->valuelen;
486 nargs.hashval = xfs_da_hashname((char *)sfe->nameval, 506 nargs.hashval = xfs_da_hashname((char *)sfe->nameval,
487 sfe->namelen); 507 sfe->namelen);
488 nargs.flags = (sfe->flags & XFS_ATTR_SECURE) ? ATTR_SECURE : 508 nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags);
489 ((sfe->flags & XFS_ATTR_ROOT) ? ATTR_ROOT : 0);
490 error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */ 509 error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */
491 ASSERT(error == ENOATTR); 510 ASSERT(error == ENOATTR);
492 error = xfs_attr_leaf_add(bp, &nargs); 511 error = xfs_attr_leaf_add(bp, &nargs);
@@ -520,6 +539,10 @@ xfs_attr_shortform_compare(const void *a, const void *b)
520 } 539 }
521} 540}
522 541
542
543#define XFS_ISRESET_CURSOR(cursor) \
544 (!((cursor)->initted) && !((cursor)->hashval) && \
545 !((cursor)->blkno) && !((cursor)->offset))
523/* 546/*
524 * Copy out entries of shortform attribute lists for attr_list(). 547 * Copy out entries of shortform attribute lists for attr_list().
525 * Shortform attribute lists are not stored in hashval sorted order. 548 * Shortform attribute lists are not stored in hashval sorted order.
@@ -537,6 +560,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
537 xfs_attr_sf_entry_t *sfe; 560 xfs_attr_sf_entry_t *sfe;
538 xfs_inode_t *dp; 561 xfs_inode_t *dp;
539 int sbsize, nsbuf, count, i; 562 int sbsize, nsbuf, count, i;
563 int error;
540 564
541 ASSERT(context != NULL); 565 ASSERT(context != NULL);
542 dp = context->dp; 566 dp = context->dp;
@@ -552,46 +576,51 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
552 xfs_attr_trace_l_c("sf start", context); 576 xfs_attr_trace_l_c("sf start", context);
553 577
554 /* 578 /*
555 * If the buffer is large enough, do not bother with sorting. 579 * If the buffer is large enough and the cursor is at the start,
580 * do not bother with sorting since we will return everything in
581 * one buffer and another call using the cursor won't need to be
582 * made.
556 * Note the generous fudge factor of 16 overhead bytes per entry. 583 * Note the generous fudge factor of 16 overhead bytes per entry.
584 * If bufsize is zero then put_listent must be a search function
585 * and can just scan through what we have.
557 */ 586 */
558 if ((dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize) { 587 if (context->bufsize == 0 ||
588 (XFS_ISRESET_CURSOR(cursor) &&
589 (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) {
559 for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { 590 for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
560 attrnames_t *namesp; 591 attrnames_t *namesp;
561 592
562 if (((context->flags & ATTR_SECURE) != 0) != 593 if (!xfs_attr_namesp_match_overrides(context->flags, sfe->flags)) {
563 ((sfe->flags & XFS_ATTR_SECURE) != 0) &&
564 !(context->flags & ATTR_KERNORMALS)) {
565 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
566 continue;
567 }
568 if (((context->flags & ATTR_ROOT) != 0) !=
569 ((sfe->flags & XFS_ATTR_ROOT) != 0) &&
570 !(context->flags & ATTR_KERNROOTLS)) {
571 sfe = XFS_ATTR_SF_NEXTENTRY(sfe); 594 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
572 continue; 595 continue;
573 } 596 }
574 namesp = (sfe->flags & XFS_ATTR_SECURE) ? &attr_secure: 597 namesp = xfs_attr_flags_namesp(sfe->flags);
575 ((sfe->flags & XFS_ATTR_ROOT) ? &attr_trusted : 598 error = context->put_listent(context,
576 &attr_user); 599 namesp,
577 if (context->flags & ATTR_KERNOVAL) { 600 (char *)sfe->nameval,
578 ASSERT(context->flags & ATTR_KERNAMELS); 601 (int)sfe->namelen,
579 context->count += namesp->attr_namelen + 602 (int)sfe->valuelen,
580 sfe->namelen + 1; 603 (char*)&sfe->nameval[sfe->namelen]);
581 } 604
582 else { 605 /*
583 if (xfs_attr_put_listent(context, namesp, 606 * Either search callback finished early or
584 (char *)sfe->nameval, 607 * didn't fit it all in the buffer after all.
585 (int)sfe->namelen, 608 */
586 (int)sfe->valuelen)) 609 if (context->seen_enough)
587 break; 610 break;
588 } 611
612 if (error)
613 return error;
589 sfe = XFS_ATTR_SF_NEXTENTRY(sfe); 614 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
590 } 615 }
591 xfs_attr_trace_l_c("sf big-gulp", context); 616 xfs_attr_trace_l_c("sf big-gulp", context);
592 return(0); 617 return(0);
593 } 618 }
594 619
620 /* do no more for a search callback */
621 if (context->bufsize == 0)
622 return 0;
623
595 /* 624 /*
596 * It didn't all fit, so we have to sort everything on hashval. 625 * It didn't all fit, so we have to sort everything on hashval.
597 */ 626 */
@@ -614,15 +643,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
614 kmem_free(sbuf, sbsize); 643 kmem_free(sbuf, sbsize);
615 return XFS_ERROR(EFSCORRUPTED); 644 return XFS_ERROR(EFSCORRUPTED);
616 } 645 }
617 if (((context->flags & ATTR_SECURE) != 0) != 646 if (!xfs_attr_namesp_match_overrides(context->flags, sfe->flags)) {
618 ((sfe->flags & XFS_ATTR_SECURE) != 0) &&
619 !(context->flags & ATTR_KERNORMALS)) {
620 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
621 continue;
622 }
623 if (((context->flags & ATTR_ROOT) != 0) !=
624 ((sfe->flags & XFS_ATTR_ROOT) != 0) &&
625 !(context->flags & ATTR_KERNROOTLS)) {
626 sfe = XFS_ATTR_SF_NEXTENTRY(sfe); 647 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
627 continue; 648 continue;
628 } 649 }
@@ -671,24 +692,22 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
671 for ( ; i < nsbuf; i++, sbp++) { 692 for ( ; i < nsbuf; i++, sbp++) {
672 attrnames_t *namesp; 693 attrnames_t *namesp;
673 694
674 namesp = (sbp->flags & XFS_ATTR_SECURE) ? &attr_secure : 695 namesp = xfs_attr_flags_namesp(sbp->flags);
675 ((sbp->flags & XFS_ATTR_ROOT) ? &attr_trusted :
676 &attr_user);
677 696
678 if (cursor->hashval != sbp->hash) { 697 if (cursor->hashval != sbp->hash) {
679 cursor->hashval = sbp->hash; 698 cursor->hashval = sbp->hash;
680 cursor->offset = 0; 699 cursor->offset = 0;
681 } 700 }
682 if (context->flags & ATTR_KERNOVAL) { 701 error = context->put_listent(context,
683 ASSERT(context->flags & ATTR_KERNAMELS); 702 namesp,
684 context->count += namesp->attr_namelen + 703 sbp->name,
685 sbp->namelen + 1; 704 sbp->namelen,
686 } else { 705 sbp->valuelen,
687 if (xfs_attr_put_listent(context, namesp, 706 &sbp->name[sbp->namelen]);
688 sbp->name, sbp->namelen, 707 if (error)
689 sbp->valuelen)) 708 return error;
690 break; 709 if (context->seen_enough)
691 } 710 break;
692 cursor->offset++; 711 cursor->offset++;
693 } 712 }
694 713
@@ -810,8 +829,7 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
810 nargs.value = (char *)&name_loc->nameval[nargs.namelen]; 829 nargs.value = (char *)&name_loc->nameval[nargs.namelen];
811 nargs.valuelen = be16_to_cpu(name_loc->valuelen); 830 nargs.valuelen = be16_to_cpu(name_loc->valuelen);
812 nargs.hashval = be32_to_cpu(entry->hashval); 831 nargs.hashval = be32_to_cpu(entry->hashval);
813 nargs.flags = (entry->flags & XFS_ATTR_SECURE) ? ATTR_SECURE : 832 nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags);
814 ((entry->flags & XFS_ATTR_ROOT) ? ATTR_ROOT : 0);
815 xfs_attr_shortform_add(&nargs, forkoff); 833 xfs_attr_shortform_add(&nargs, forkoff);
816 } 834 }
817 error = 0; 835 error = 0;
@@ -1098,8 +1116,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
1098 be16_to_cpu(map->size)); 1116 be16_to_cpu(map->size));
1099 entry->hashval = cpu_to_be32(args->hashval); 1117 entry->hashval = cpu_to_be32(args->hashval);
1100 entry->flags = tmp ? XFS_ATTR_LOCAL : 0; 1118 entry->flags = tmp ? XFS_ATTR_LOCAL : 0;
1101 entry->flags |= (args->flags & ATTR_SECURE) ? XFS_ATTR_SECURE : 1119 entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
1102 ((args->flags & ATTR_ROOT) ? XFS_ATTR_ROOT : 0);
1103 if (args->rename) { 1120 if (args->rename) {
1104 entry->flags |= XFS_ATTR_INCOMPLETE; 1121 entry->flags |= XFS_ATTR_INCOMPLETE;
1105 if ((args->blkno2 == args->blkno) && 1122 if ((args->blkno2 == args->blkno) &&
@@ -1926,7 +1943,7 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args)
1926 else 1943 else
1927 break; 1944 break;
1928 } 1945 }
1929 ASSERT((probe >= 0) && 1946 ASSERT((probe >= 0) &&
1930 (!leaf->hdr.count 1947 (!leaf->hdr.count
1931 || (probe < be16_to_cpu(leaf->hdr.count)))); 1948 || (probe < be16_to_cpu(leaf->hdr.count))));
1932 ASSERT((span <= 4) || (be32_to_cpu(entry->hashval) == hashval)); 1949 ASSERT((span <= 4) || (be32_to_cpu(entry->hashval) == hashval));
@@ -1971,14 +1988,9 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args)
1971 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, probe); 1988 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, probe);
1972 if (name_loc->namelen != args->namelen) 1989 if (name_loc->namelen != args->namelen)
1973 continue; 1990 continue;
1974 if (memcmp(args->name, (char *)name_loc->nameval, 1991 if (memcmp(args->name, (char *)name_loc->nameval, args->namelen) != 0)
1975 args->namelen) != 0)
1976 continue; 1992 continue;
1977 if (((args->flags & ATTR_SECURE) != 0) != 1993 if (!xfs_attr_namesp_match(args->flags, entry->flags))
1978 ((entry->flags & XFS_ATTR_SECURE) != 0))
1979 continue;
1980 if (((args->flags & ATTR_ROOT) != 0) !=
1981 ((entry->flags & XFS_ATTR_ROOT) != 0))
1982 continue; 1994 continue;
1983 args->index = probe; 1995 args->index = probe;
1984 return(XFS_ERROR(EEXIST)); 1996 return(XFS_ERROR(EEXIST));
@@ -1989,11 +2001,7 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args)
1989 if (memcmp(args->name, (char *)name_rmt->name, 2001 if (memcmp(args->name, (char *)name_rmt->name,
1990 args->namelen) != 0) 2002 args->namelen) != 0)
1991 continue; 2003 continue;
1992 if (((args->flags & ATTR_SECURE) != 0) != 2004 if (!xfs_attr_namesp_match(args->flags, entry->flags))
1993 ((entry->flags & XFS_ATTR_SECURE) != 0))
1994 continue;
1995 if (((args->flags & ATTR_ROOT) != 0) !=
1996 ((entry->flags & XFS_ATTR_ROOT) != 0))
1997 continue; 2005 continue;
1998 args->index = probe; 2006 args->index = probe;
1999 args->rmtblkno = be32_to_cpu(name_rmt->valueblk); 2007 args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
@@ -2312,8 +2320,6 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
2312 attrlist_cursor_kern_t *cursor; 2320 attrlist_cursor_kern_t *cursor;
2313 xfs_attr_leafblock_t *leaf; 2321 xfs_attr_leafblock_t *leaf;
2314 xfs_attr_leaf_entry_t *entry; 2322 xfs_attr_leaf_entry_t *entry;
2315 xfs_attr_leaf_name_local_t *name_loc;
2316 xfs_attr_leaf_name_remote_t *name_rmt;
2317 int retval, i; 2323 int retval, i;
2318 2324
2319 ASSERT(bp != NULL); 2325 ASSERT(bp != NULL);
@@ -2355,9 +2361,8 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
2355 * We have found our place, start copying out the new attributes. 2361 * We have found our place, start copying out the new attributes.
2356 */ 2362 */
2357 retval = 0; 2363 retval = 0;
2358 for ( ; (i < be16_to_cpu(leaf->hdr.count)) 2364 for ( ; (i < be16_to_cpu(leaf->hdr.count)); entry++, i++) {
2359 && (retval == 0); entry++, i++) { 2365 attrnames_t *namesp;
2360 attrnames_t *namesp;
2361 2366
2362 if (be32_to_cpu(entry->hashval) != cursor->hashval) { 2367 if (be32_to_cpu(entry->hashval) != cursor->hashval) {
2363 cursor->hashval = be32_to_cpu(entry->hashval); 2368 cursor->hashval = be32_to_cpu(entry->hashval);
@@ -2366,115 +2371,69 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
2366 2371
2367 if (entry->flags & XFS_ATTR_INCOMPLETE) 2372 if (entry->flags & XFS_ATTR_INCOMPLETE)
2368 continue; /* skip incomplete entries */ 2373 continue; /* skip incomplete entries */
2369 if (((context->flags & ATTR_SECURE) != 0) != 2374 if (!xfs_attr_namesp_match_overrides(context->flags, entry->flags))
2370 ((entry->flags & XFS_ATTR_SECURE) != 0) && 2375 continue;
2371 !(context->flags & ATTR_KERNORMALS)) 2376
2372 continue; /* skip non-matching entries */ 2377 namesp = xfs_attr_flags_namesp(entry->flags);
2373 if (((context->flags & ATTR_ROOT) != 0) !=
2374 ((entry->flags & XFS_ATTR_ROOT) != 0) &&
2375 !(context->flags & ATTR_KERNROOTLS))
2376 continue; /* skip non-matching entries */
2377
2378 namesp = (entry->flags & XFS_ATTR_SECURE) ? &attr_secure :
2379 ((entry->flags & XFS_ATTR_ROOT) ? &attr_trusted :
2380 &attr_user);
2381 2378
2382 if (entry->flags & XFS_ATTR_LOCAL) { 2379 if (entry->flags & XFS_ATTR_LOCAL) {
2383 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i); 2380 xfs_attr_leaf_name_local_t *name_loc =
2384 if (context->flags & ATTR_KERNOVAL) { 2381 XFS_ATTR_LEAF_NAME_LOCAL(leaf, i);
2385 ASSERT(context->flags & ATTR_KERNAMELS); 2382
2386 context->count += namesp->attr_namelen + 2383 retval = context->put_listent(context,
2387 (int)name_loc->namelen + 1; 2384 namesp,
2388 } else { 2385 (char *)name_loc->nameval,
2389 retval = xfs_attr_put_listent(context, namesp, 2386 (int)name_loc->namelen,
2390 (char *)name_loc->nameval, 2387 be16_to_cpu(name_loc->valuelen),
2391 (int)name_loc->namelen, 2388 (char *)&name_loc->nameval[name_loc->namelen]);
2392 be16_to_cpu(name_loc->valuelen)); 2389 if (retval)
2393 } 2390 return retval;
2394 } else { 2391 } else {
2395 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i); 2392 xfs_attr_leaf_name_remote_t *name_rmt =
2396 if (context->flags & ATTR_KERNOVAL) { 2393 XFS_ATTR_LEAF_NAME_REMOTE(leaf, i);
2397 ASSERT(context->flags & ATTR_KERNAMELS); 2394
2398 context->count += namesp->attr_namelen + 2395 int valuelen = be32_to_cpu(name_rmt->valuelen);
2399 (int)name_rmt->namelen + 1; 2396
2400 } else { 2397 if (context->put_value) {
2401 retval = xfs_attr_put_listent(context, namesp, 2398 xfs_da_args_t args;
2402 (char *)name_rmt->name, 2399
2403 (int)name_rmt->namelen, 2400 memset((char *)&args, 0, sizeof(args));
2404 be32_to_cpu(name_rmt->valuelen)); 2401 args.dp = context->dp;
2402 args.whichfork = XFS_ATTR_FORK;
2403 args.valuelen = valuelen;
2404 args.value = kmem_alloc(valuelen, KM_SLEEP);
2405 args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
2406 args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen);
2407 retval = xfs_attr_rmtval_get(&args);
2408 if (retval)
2409 return retval;
2410 retval = context->put_listent(context,
2411 namesp,
2412 (char *)name_rmt->name,
2413 (int)name_rmt->namelen,
2414 valuelen,
2415 (char*)args.value);
2416 kmem_free(args.value, valuelen);
2405 } 2417 }
2418 else {
2419 retval = context->put_listent(context,
2420 namesp,
2421 (char *)name_rmt->name,
2422 (int)name_rmt->namelen,
2423 valuelen,
2424 NULL);
2425 }
2426 if (retval)
2427 return retval;
2406 } 2428 }
2407 if (retval == 0) { 2429 if (context->seen_enough)
2408 cursor->offset++; 2430 break;
2409 } 2431 cursor->offset++;
2410 } 2432 }
2411 xfs_attr_trace_l_cl("blk end", context, leaf); 2433 xfs_attr_trace_l_cl("blk end", context, leaf);
2412 return(retval); 2434 return(retval);
2413} 2435}
2414 2436
2415#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \
2416 (((struct attrlist_ent *) 0)->a_name - (char *) 0)
2417#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \
2418 ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \
2419 & ~(sizeof(u_int32_t)-1))
2420
2421/*
2422 * Format an attribute and copy it out to the user's buffer.
2423 * Take care to check values and protect against them changing later,
2424 * we may be reading them directly out of a user buffer.
2425 */
2426/*ARGSUSED*/
2427STATIC int
2428xfs_attr_put_listent(xfs_attr_list_context_t *context,
2429 attrnames_t *namesp, char *name, int namelen, int valuelen)
2430{
2431 attrlist_ent_t *aep;
2432 int arraytop;
2433
2434 ASSERT(!(context->flags & ATTR_KERNOVAL));
2435 if (context->flags & ATTR_KERNAMELS) {
2436 char *offset;
2437
2438 ASSERT(context->count >= 0);
2439
2440 arraytop = context->count + namesp->attr_namelen + namelen + 1;
2441 if (arraytop > context->firstu) {
2442 context->count = -1; /* insufficient space */
2443 return(1);
2444 }
2445 offset = (char *)context->alist + context->count;
2446 strncpy(offset, namesp->attr_name, namesp->attr_namelen);
2447 offset += namesp->attr_namelen;
2448 strncpy(offset, name, namelen); /* real name */
2449 offset += namelen;
2450 *offset = '\0';
2451 context->count += namesp->attr_namelen + namelen + 1;
2452 return(0);
2453 }
2454
2455 ASSERT(context->count >= 0);
2456 ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
2457 ASSERT(context->firstu >= sizeof(*context->alist));
2458 ASSERT(context->firstu <= context->bufsize);
2459
2460 arraytop = sizeof(*context->alist) +
2461 context->count * sizeof(context->alist->al_offset[0]);
2462 context->firstu -= ATTR_ENTSIZE(namelen);
2463 if (context->firstu < arraytop) {
2464 xfs_attr_trace_l_c("buffer full", context);
2465 context->alist->al_more = 1;
2466 return(1);
2467 }
2468
2469 aep = (attrlist_ent_t *)&(((char *)context->alist)[ context->firstu ]);
2470 aep->a_valuelen = valuelen;
2471 memcpy(aep->a_name, name, namelen);
2472 aep->a_name[ namelen ] = 0;
2473 context->alist->al_offset[ context->count++ ] = context->firstu;
2474 context->alist->al_count = context->count;
2475 xfs_attr_trace_l_c("add", context);
2476 return(0);
2477}
2478 2437
2479/*======================================================================== 2438/*========================================================================
2480 * Manage the INCOMPLETE flag in a leaf entry 2439 * Manage the INCOMPLETE flag in a leaf entry
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index 51c3ee156b2f..040f732ce1e2 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -130,6 +130,19 @@ typedef struct xfs_attr_leafblock {
130#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT) 130#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT)
131 131
132/* 132/*
133 * Conversion macros for converting namespace bits from argument flags
134 * to ondisk flags.
135 */
136#define XFS_ATTR_NSP_ARGS_MASK (ATTR_ROOT | ATTR_SECURE)
137#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE)
138#define XFS_ATTR_NSP_ONDISK(flags) ((flags) & XFS_ATTR_NSP_ONDISK_MASK)
139#define XFS_ATTR_NSP_ARGS(flags) ((flags) & XFS_ATTR_NSP_ARGS_MASK)
140#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x) (((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\
141 ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0))
142#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x) (((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\
143 ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0))
144
145/*
133 * Alignment for namelist and valuelist entries (since they are mixed 146 * Alignment for namelist and valuelist entries (since they are mixed
134 * there can be only one alignment value) 147 * there can be only one alignment value)
135 */ 148 */
@@ -196,16 +209,26 @@ static inline int xfs_attr_leaf_entsize_local_max(int bsize)
196 * Structure used to pass context around among the routines. 209 * Structure used to pass context around among the routines.
197 *========================================================================*/ 210 *========================================================================*/
198 211
212
213struct xfs_attr_list_context;
214
215typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, struct attrnames *,
216 char *, int, int, char *);
217
199typedef struct xfs_attr_list_context { 218typedef struct xfs_attr_list_context {
200 struct xfs_inode *dp; /* inode */ 219 struct xfs_inode *dp; /* inode */
201 struct attrlist_cursor_kern *cursor;/* position in list */ 220 struct attrlist_cursor_kern *cursor; /* position in list */
202 struct attrlist *alist; /* output buffer */ 221 struct attrlist *alist; /* output buffer */
203 int count; /* num used entries */ 222 int seen_enough; /* T/F: seen enough of list? */
204 int dupcnt; /* count dup hashvals seen */ 223 int count; /* num used entries */
205 int bufsize;/* total buffer size */ 224 int dupcnt; /* count dup hashvals seen */
206 int firstu; /* first used byte in buffer */ 225 int bufsize; /* total buffer size */
207 int flags; /* from VOP call */ 226 int firstu; /* first used byte in buffer */
208 int resynch;/* T/F: resynch with cursor */ 227 int flags; /* from VOP call */
228 int resynch; /* T/F: resynch with cursor */
229 int put_value; /* T/F: need value for listent */
230 put_listent_func_t put_listent; /* list output fmt function */
231 int index; /* index into output buffer */
209} xfs_attr_list_context_t; 232} xfs_attr_list_context_t;
210 233
211/* 234/*
diff --git a/fs/xfs/xfs_behavior.c b/fs/xfs/xfs_behavior.c
index f4fe3715a803..0dc17219d412 100644
--- a/fs/xfs/xfs_behavior.c
+++ b/fs/xfs/xfs_behavior.c
@@ -110,26 +110,6 @@ bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp)
110} 110}
111 111
112/* 112/*
113 * Look for a specific ops vector on the specified behavior chain.
114 * Return the associated behavior descriptor. Or NULL, if not found.
115 */
116bhv_desc_t *
117bhv_lookup(bhv_head_t *bhp, void *ops)
118{
119 bhv_desc_t *curdesc;
120
121 for (curdesc = bhp->bh_first;
122 curdesc != NULL;
123 curdesc = curdesc->bd_next) {
124
125 if (curdesc->bd_ops == ops)
126 return curdesc;
127 }
128
129 return NULL;
130}
131
132/*
133 * Looks for the first behavior within a specified range of positions. 113 * Looks for the first behavior within a specified range of positions.
134 * Return the associated behavior descriptor. Or NULL, if none found. 114 * Return the associated behavior descriptor. Or NULL, if none found.
135 */ 115 */
diff --git a/fs/xfs/xfs_behavior.h b/fs/xfs/xfs_behavior.h
index 6e6e56fb352d..e7ca1fed955a 100644
--- a/fs/xfs/xfs_behavior.h
+++ b/fs/xfs/xfs_behavior.h
@@ -176,12 +176,10 @@ extern void bhv_insert_initial(bhv_head_t *, bhv_desc_t *);
176 * Behavior module prototypes. 176 * Behavior module prototypes.
177 */ 177 */
178extern void bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp); 178extern void bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp);
179extern bhv_desc_t * bhv_lookup(bhv_head_t *bhp, void *ops);
180extern bhv_desc_t * bhv_lookup_range(bhv_head_t *bhp, int low, int high); 179extern bhv_desc_t * bhv_lookup_range(bhv_head_t *bhp, int low, int high);
181extern bhv_desc_t * bhv_base(bhv_head_t *bhp); 180extern bhv_desc_t * bhv_base(bhv_head_t *bhp);
182 181
183/* No bhv locking on Linux */ 182/* No bhv locking on Linux */
184#define bhv_lookup_unlocked bhv_lookup
185#define bhv_base_unlocked bhv_base 183#define bhv_base_unlocked bhv_base
186 184
187#endif /* __XFS_BEHAVIOR_H__ */ 185#endif /* __XFS_BEHAVIOR_H__ */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index bf46fae303af..5b050c06795f 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2999,7 +2999,7 @@ xfs_bmap_btree_to_extents(
2999 int error; /* error return value */ 2999 int error; /* error return value */
3000 xfs_ifork_t *ifp; /* inode fork data */ 3000 xfs_ifork_t *ifp; /* inode fork data */
3001 xfs_mount_t *mp; /* mount point structure */ 3001 xfs_mount_t *mp; /* mount point structure */
3002 xfs_bmbt_ptr_t *pp; /* ptr to block address */ 3002 __be64 *pp; /* ptr to block address */
3003 xfs_bmbt_block_t *rblock;/* root btree block */ 3003 xfs_bmbt_block_t *rblock;/* root btree block */
3004 3004
3005 ifp = XFS_IFORK_PTR(ip, whichfork); 3005 ifp = XFS_IFORK_PTR(ip, whichfork);
@@ -3011,12 +3011,12 @@ xfs_bmap_btree_to_extents(
3011 ASSERT(XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes) == 1); 3011 ASSERT(XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes) == 1);
3012 mp = ip->i_mount; 3012 mp = ip->i_mount;
3013 pp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, ifp->if_broot_bytes); 3013 pp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, ifp->if_broot_bytes);
3014 cbno = be64_to_cpu(*pp);
3014 *logflagsp = 0; 3015 *logflagsp = 0;
3015#ifdef DEBUG 3016#ifdef DEBUG
3016 if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), 1))) 3017 if ((error = xfs_btree_check_lptr(cur, cbno, 1)))
3017 return error; 3018 return error;
3018#endif 3019#endif
3019 cbno = INT_GET(*pp, ARCH_CONVERT);
3020 if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, 3020 if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp,
3021 XFS_BMAP_BTREE_REF))) 3021 XFS_BMAP_BTREE_REF)))
3022 return error; 3022 return error;
@@ -3512,9 +3512,9 @@ xfs_bmap_extents_to_btree(
3512 */ 3512 */
3513 kp = XFS_BMAP_KEY_IADDR(block, 1, cur); 3513 kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
3514 arp = XFS_BMAP_REC_IADDR(ablock, 1, cur); 3514 arp = XFS_BMAP_REC_IADDR(ablock, 1, cur);
3515 INT_SET(kp->br_startoff, ARCH_CONVERT, xfs_bmbt_disk_get_startoff(arp)); 3515 kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
3516 pp = XFS_BMAP_PTR_IADDR(block, 1, cur); 3516 pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
3517 INT_SET(*pp, ARCH_CONVERT, args.fsbno); 3517 *pp = cpu_to_be64(args.fsbno);
3518 /* 3518 /*
3519 * Do all this logging at the end so that 3519 * Do all this logging at the end so that
3520 * the root is at the right level. 3520 * the root is at the right level.
@@ -3705,7 +3705,7 @@ STATIC xfs_bmbt_rec_t * /* pointer to found extent entry */
3705xfs_bmap_search_extents( 3705xfs_bmap_search_extents(
3706 xfs_inode_t *ip, /* incore inode pointer */ 3706 xfs_inode_t *ip, /* incore inode pointer */
3707 xfs_fileoff_t bno, /* block number searched for */ 3707 xfs_fileoff_t bno, /* block number searched for */
3708 int whichfork, /* data or attr fork */ 3708 int fork, /* data or attr fork */
3709 int *eofp, /* out: end of file found */ 3709 int *eofp, /* out: end of file found */
3710 xfs_extnum_t *lastxp, /* out: last extent index */ 3710 xfs_extnum_t *lastxp, /* out: last extent index */
3711 xfs_bmbt_irec_t *gotp, /* out: extent entry found */ 3711 xfs_bmbt_irec_t *gotp, /* out: extent entry found */
@@ -3713,25 +3713,28 @@ xfs_bmap_search_extents(
3713{ 3713{
3714 xfs_ifork_t *ifp; /* inode fork pointer */ 3714 xfs_ifork_t *ifp; /* inode fork pointer */
3715 xfs_bmbt_rec_t *ep; /* extent record pointer */ 3715 xfs_bmbt_rec_t *ep; /* extent record pointer */
3716 int rt; /* realtime flag */
3717 3716
3718 XFS_STATS_INC(xs_look_exlist); 3717 XFS_STATS_INC(xs_look_exlist);
3719 ifp = XFS_IFORK_PTR(ip, whichfork); 3718 ifp = XFS_IFORK_PTR(ip, fork);
3720 3719
3721 ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp); 3720 ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp);
3722 3721
3723 rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); 3722 if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) &&
3724 if (unlikely(!rt && !gotp->br_startblock && (*lastxp != NULLEXTNUM))) { 3723 !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) {
3725 cmn_err(CE_PANIC,"Access to block zero: fs: <%s> inode: %lld " 3724 xfs_cmn_err(XFS_PTAG_FSBLOCK_ZERO, CE_ALERT, ip->i_mount,
3726 "start_block : %llx start_off : %llx blkcnt : %llx " 3725 "Access to block zero in inode %llu "
3727 "extent-state : %x \n", 3726 "start_block: %llx start_off: %llx "
3728 (ip->i_mount)->m_fsname, (long long)ip->i_ino, 3727 "blkcnt: %llx extent-state: %x lastx: %x\n",
3728 (unsigned long long)ip->i_ino,
3729 (unsigned long long)gotp->br_startblock, 3729 (unsigned long long)gotp->br_startblock,
3730 (unsigned long long)gotp->br_startoff, 3730 (unsigned long long)gotp->br_startoff,
3731 (unsigned long long)gotp->br_blockcount, 3731 (unsigned long long)gotp->br_blockcount,
3732 gotp->br_state); 3732 gotp->br_state, *lastxp);
3733 } 3733 *lastxp = NULLEXTNUM;
3734 return ep; 3734 *eofp = 1;
3735 return NULL;
3736 }
3737 return ep;
3735} 3738}
3736 3739
3737 3740
@@ -4494,7 +4497,7 @@ xfs_bmap_read_extents(
4494 xfs_ifork_t *ifp; /* fork structure */ 4497 xfs_ifork_t *ifp; /* fork structure */
4495 int level; /* btree level, for checking */ 4498 int level; /* btree level, for checking */
4496 xfs_mount_t *mp; /* file system mount structure */ 4499 xfs_mount_t *mp; /* file system mount structure */
4497 xfs_bmbt_ptr_t *pp; /* pointer to block address */ 4500 __be64 *pp; /* pointer to block address */
4498 /* REFERENCED */ 4501 /* REFERENCED */
4499 xfs_extnum_t room; /* number of entries there's room for */ 4502 xfs_extnum_t room; /* number of entries there's room for */
4500 4503
@@ -4510,10 +4513,10 @@ xfs_bmap_read_extents(
4510 level = be16_to_cpu(block->bb_level); 4513 level = be16_to_cpu(block->bb_level);
4511 ASSERT(level > 0); 4514 ASSERT(level > 0);
4512 pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes); 4515 pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
4513 ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO); 4516 bno = be64_to_cpu(*pp);
4514 ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount); 4517 ASSERT(bno != NULLDFSBNO);
4515 ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks); 4518 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
4516 bno = INT_GET(*pp, ARCH_CONVERT); 4519 ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
4517 /* 4520 /*
4518 * Go down the tree until leaf level is reached, following the first 4521 * Go down the tree until leaf level is reached, following the first
4519 * pointer (leftmost) at each level. 4522 * pointer (leftmost) at each level.
@@ -4530,10 +4533,8 @@ xfs_bmap_read_extents(
4530 break; 4533 break;
4531 pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block, 4534 pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block,
4532 1, mp->m_bmap_dmxr[1]); 4535 1, mp->m_bmap_dmxr[1]);
4533 XFS_WANT_CORRUPTED_GOTO( 4536 bno = be64_to_cpu(*pp);
4534 XFS_FSB_SANITY_CHECK(mp, INT_GET(*pp, ARCH_CONVERT)), 4537 XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
4535 error0);
4536 bno = INT_GET(*pp, ARCH_CONVERT);
4537 xfs_trans_brelse(tp, bp); 4538 xfs_trans_brelse(tp, bp);
4538 } 4539 }
4539 /* 4540 /*
@@ -6141,7 +6142,7 @@ xfs_check_block(
6141 short sz) 6142 short sz)
6142{ 6143{
6143 int i, j, dmxr; 6144 int i, j, dmxr;
6144 xfs_bmbt_ptr_t *pp, *thispa; /* pointer to block address */ 6145 __be64 *pp, *thispa; /* pointer to block address */
6145 xfs_bmbt_key_t *prevp, *keyp; 6146 xfs_bmbt_key_t *prevp, *keyp;
6146 6147
6147 ASSERT(be16_to_cpu(block->bb_level) > 0); 6148 ASSERT(be16_to_cpu(block->bb_level) > 0);
@@ -6179,11 +6180,10 @@ xfs_check_block(
6179 thispa = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, 6180 thispa = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize,
6180 xfs_bmbt, block, j, dmxr); 6181 xfs_bmbt, block, j, dmxr);
6181 } 6182 }
6182 if (INT_GET(*thispa, ARCH_CONVERT) == 6183 if (*thispa == *pp) {
6183 INT_GET(*pp, ARCH_CONVERT)) {
6184 cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld", 6184 cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld",
6185 __FUNCTION__, j, i, 6185 __FUNCTION__, j, i,
6186 INT_GET(*thispa, ARCH_CONVERT)); 6186 (unsigned long long)be64_to_cpu(*thispa));
6187 panic("%s: ptrs are equal in node\n", 6187 panic("%s: ptrs are equal in node\n",
6188 __FUNCTION__); 6188 __FUNCTION__);
6189 } 6189 }
@@ -6210,7 +6210,7 @@ xfs_bmap_check_leaf_extents(
6210 xfs_ifork_t *ifp; /* fork structure */ 6210 xfs_ifork_t *ifp; /* fork structure */
6211 int level; /* btree level, for checking */ 6211 int level; /* btree level, for checking */
6212 xfs_mount_t *mp; /* file system mount structure */ 6212 xfs_mount_t *mp; /* file system mount structure */
6213 xfs_bmbt_ptr_t *pp; /* pointer to block address */ 6213 __be64 *pp; /* pointer to block address */
6214 xfs_bmbt_rec_t *ep; /* pointer to current extent */ 6214 xfs_bmbt_rec_t *ep; /* pointer to current extent */
6215 xfs_bmbt_rec_t *lastp; /* pointer to previous extent */ 6215 xfs_bmbt_rec_t *lastp; /* pointer to previous extent */
6216 xfs_bmbt_rec_t *nextp; /* pointer to next extent */ 6216 xfs_bmbt_rec_t *nextp; /* pointer to next extent */
@@ -6231,10 +6231,12 @@ xfs_bmap_check_leaf_extents(
6231 ASSERT(level > 0); 6231 ASSERT(level > 0);
6232 xfs_check_block(block, mp, 1, ifp->if_broot_bytes); 6232 xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
6233 pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes); 6233 pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
6234 ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO); 6234 bno = be64_to_cpu(*pp);
6235 ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount); 6235
6236 ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks); 6236 ASSERT(bno != NULLDFSBNO);
6237 bno = INT_GET(*pp, ARCH_CONVERT); 6237 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
6238 ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
6239
6238 /* 6240 /*
6239 * Go down the tree until leaf level is reached, following the first 6241 * Go down the tree until leaf level is reached, following the first
6240 * pointer (leftmost) at each level. 6242 * pointer (leftmost) at each level.
@@ -6265,8 +6267,8 @@ xfs_bmap_check_leaf_extents(
6265 xfs_check_block(block, mp, 0, 0); 6267 xfs_check_block(block, mp, 0, 0);
6266 pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block, 6268 pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block,
6267 1, mp->m_bmap_dmxr[1]); 6269 1, mp->m_bmap_dmxr[1]);
6268 XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, INT_GET(*pp, ARCH_CONVERT)), error0); 6270 bno = be64_to_cpu(*pp);
6269 bno = INT_GET(*pp, ARCH_CONVERT); 6271 XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
6270 if (bp_release) { 6272 if (bp_release) {
6271 bp_release = 0; 6273 bp_release = 0;
6272 xfs_trans_brelse(NULL, bp); 6274 xfs_trans_brelse(NULL, bp);
@@ -6372,7 +6374,7 @@ xfs_bmap_count_blocks(
6372 xfs_ifork_t *ifp; /* fork structure */ 6374 xfs_ifork_t *ifp; /* fork structure */
6373 int level; /* btree level, for checking */ 6375 int level; /* btree level, for checking */
6374 xfs_mount_t *mp; /* file system mount structure */ 6376 xfs_mount_t *mp; /* file system mount structure */
6375 xfs_bmbt_ptr_t *pp; /* pointer to block address */ 6377 __be64 *pp; /* pointer to block address */
6376 6378
6377 bno = NULLFSBLOCK; 6379 bno = NULLFSBLOCK;
6378 mp = ip->i_mount; 6380 mp = ip->i_mount;
@@ -6395,10 +6397,10 @@ xfs_bmap_count_blocks(
6395 level = be16_to_cpu(block->bb_level); 6397 level = be16_to_cpu(block->bb_level);
6396 ASSERT(level > 0); 6398 ASSERT(level > 0);
6397 pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes); 6399 pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
6398 ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO); 6400 bno = be64_to_cpu(*pp);
6399 ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount); 6401 ASSERT(bno != NULLDFSBNO);
6400 ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks); 6402 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
6401 bno = INT_GET(*pp, ARCH_CONVERT); 6403 ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
6402 6404
6403 if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) { 6405 if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) {
6404 XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW, 6406 XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
@@ -6425,7 +6427,7 @@ xfs_bmap_count_tree(
6425 int error; 6427 int error;
6426 xfs_buf_t *bp, *nbp; 6428 xfs_buf_t *bp, *nbp;
6427 int level = levelin; 6429 int level = levelin;
6428 xfs_bmbt_ptr_t *pp; 6430 __be64 *pp;
6429 xfs_fsblock_t bno = blockno; 6431 xfs_fsblock_t bno = blockno;
6430 xfs_fsblock_t nextbno; 6432 xfs_fsblock_t nextbno;
6431 xfs_bmbt_block_t *block, *nextblock; 6433 xfs_bmbt_block_t *block, *nextblock;
@@ -6452,7 +6454,7 @@ xfs_bmap_count_tree(
6452 /* Dive to the next level */ 6454 /* Dive to the next level */
6453 pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, 6455 pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize,
6454 xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]); 6456 xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]);
6455 bno = INT_GET(*pp, ARCH_CONVERT); 6457 bno = be64_to_cpu(*pp);
6456 if (unlikely((error = 6458 if (unlikely((error =
6457 xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) { 6459 xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) {
6458 xfs_trans_brelse(tp, bp); 6460 xfs_trans_brelse(tp, bp);
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 18fb7385d719..a7b835bf870a 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -58,7 +58,7 @@ STATIC void xfs_bmbt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
58STATIC int xfs_bmbt_lshift(xfs_btree_cur_t *, int, int *); 58STATIC int xfs_bmbt_lshift(xfs_btree_cur_t *, int, int *);
59STATIC int xfs_bmbt_rshift(xfs_btree_cur_t *, int, int *); 59STATIC int xfs_bmbt_rshift(xfs_btree_cur_t *, int, int *);
60STATIC int xfs_bmbt_split(xfs_btree_cur_t *, int, xfs_fsblock_t *, 60STATIC int xfs_bmbt_split(xfs_btree_cur_t *, int, xfs_fsblock_t *,
61 xfs_bmbt_key_t *, xfs_btree_cur_t **, int *); 61 __uint64_t *, xfs_btree_cur_t **, int *);
62STATIC int xfs_bmbt_updkey(xfs_btree_cur_t *, xfs_bmbt_key_t *, int); 62STATIC int xfs_bmbt_updkey(xfs_btree_cur_t *, xfs_bmbt_key_t *, int);
63 63
64 64
@@ -192,16 +192,11 @@ xfs_bmbt_trace_argifk(
192 xfs_btree_cur_t *cur, 192 xfs_btree_cur_t *cur,
193 int i, 193 int i,
194 xfs_fsblock_t f, 194 xfs_fsblock_t f,
195 xfs_bmbt_key_t *k, 195 xfs_dfiloff_t o,
196 int line) 196 int line)
197{ 197{
198 xfs_dfsbno_t d;
199 xfs_dfiloff_t o;
200
201 d = (xfs_dfsbno_t)f;
202 o = INT_GET(k->br_startoff, ARCH_CONVERT);
203 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line, 198 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line,
204 i, d >> 32, (int)d, o >> 32, 199 i, (xfs_dfsbno_t)f >> 32, (int)f, o >> 32,
205 (int)o, 0, 0, 0, 200 (int)o, 0, 0, 0,
206 0, 0, 0); 201 0, 0, 0);
207} 202}
@@ -248,7 +243,7 @@ xfs_bmbt_trace_argik(
248{ 243{
249 xfs_dfiloff_t o; 244 xfs_dfiloff_t o;
250 245
251 o = INT_GET(k->br_startoff, ARCH_CONVERT); 246 o = be64_to_cpu(k->br_startoff);
252 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line, 247 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line,
253 i, o >> 32, (int)o, 0, 248 i, o >> 32, (int)o, 0,
254 0, 0, 0, 0, 249 0, 0, 0, 0,
@@ -286,8 +281,8 @@ xfs_bmbt_trace_cursor(
286 xfs_bmbt_trace_argfffi(fname, c, o, b, i, j, __LINE__) 281 xfs_bmbt_trace_argfffi(fname, c, o, b, i, j, __LINE__)
287#define XFS_BMBT_TRACE_ARGI(c,i) \ 282#define XFS_BMBT_TRACE_ARGI(c,i) \
288 xfs_bmbt_trace_argi(fname, c, i, __LINE__) 283 xfs_bmbt_trace_argi(fname, c, i, __LINE__)
289#define XFS_BMBT_TRACE_ARGIFK(c,i,f,k) \ 284#define XFS_BMBT_TRACE_ARGIFK(c,i,f,s) \
290 xfs_bmbt_trace_argifk(fname, c, i, f, k, __LINE__) 285 xfs_bmbt_trace_argifk(fname, c, i, f, s, __LINE__)
291#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r) \ 286#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r) \
292 xfs_bmbt_trace_argifr(fname, c, i, f, r, __LINE__) 287 xfs_bmbt_trace_argifr(fname, c, i, f, r, __LINE__)
293#define XFS_BMBT_TRACE_ARGIK(c,i,k) \ 288#define XFS_BMBT_TRACE_ARGIK(c,i,k) \
@@ -299,7 +294,7 @@ xfs_bmbt_trace_cursor(
299#define XFS_BMBT_TRACE_ARGBII(c,b,i,j) 294#define XFS_BMBT_TRACE_ARGBII(c,b,i,j)
300#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j) 295#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j)
301#define XFS_BMBT_TRACE_ARGI(c,i) 296#define XFS_BMBT_TRACE_ARGI(c,i)
302#define XFS_BMBT_TRACE_ARGIFK(c,i,f,k) 297#define XFS_BMBT_TRACE_ARGIFK(c,i,f,s)
303#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r) 298#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r)
304#define XFS_BMBT_TRACE_ARGIK(c,i,k) 299#define XFS_BMBT_TRACE_ARGIK(c,i,k)
305#define XFS_BMBT_TRACE_CURSOR(c,s) 300#define XFS_BMBT_TRACE_CURSOR(c,s)
@@ -357,7 +352,7 @@ xfs_bmbt_delrec(
357 XFS_BMBT_TRACE_CURSOR(cur, ENTRY); 352 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
358 XFS_BMBT_TRACE_ARGI(cur, level); 353 XFS_BMBT_TRACE_ARGI(cur, level);
359 ptr = cur->bc_ptrs[level]; 354 ptr = cur->bc_ptrs[level];
360 tcur = (xfs_btree_cur_t *)0; 355 tcur = NULL;
361 if (ptr == 0) { 356 if (ptr == 0) {
362 XFS_BMBT_TRACE_CURSOR(cur, EXIT); 357 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
363 *stat = 0; 358 *stat = 0;
@@ -382,7 +377,7 @@ xfs_bmbt_delrec(
382 pp = XFS_BMAP_PTR_IADDR(block, 1, cur); 377 pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
383#ifdef DEBUG 378#ifdef DEBUG
384 for (i = ptr; i < numrecs; i++) { 379 for (i = ptr; i < numrecs; i++) {
385 if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i], ARCH_CONVERT), level))) { 380 if ((error = xfs_btree_check_lptr_disk(cur, pp[i], level))) {
386 XFS_BMBT_TRACE_CURSOR(cur, ERROR); 381 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
387 goto error0; 382 goto error0;
388 } 383 }
@@ -404,7 +399,8 @@ xfs_bmbt_delrec(
404 xfs_bmbt_log_recs(cur, bp, ptr, numrecs - 1); 399 xfs_bmbt_log_recs(cur, bp, ptr, numrecs - 1);
405 } 400 }
406 if (ptr == 1) { 401 if (ptr == 1) {
407 INT_SET(key.br_startoff, ARCH_CONVERT, xfs_bmbt_disk_get_startoff(rp)); 402 key.br_startoff =
403 cpu_to_be64(xfs_bmbt_disk_get_startoff(rp));
408 kp = &key; 404 kp = &key;
409 } 405 }
410 } 406 }
@@ -621,7 +617,7 @@ xfs_bmbt_delrec(
621 rpp = XFS_BMAP_PTR_IADDR(right, 1, cur); 617 rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
622#ifdef DEBUG 618#ifdef DEBUG
623 for (i = 0; i < numrrecs; i++) { 619 for (i = 0; i < numrrecs; i++) {
624 if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))) { 620 if ((error = xfs_btree_check_lptr_disk(cur, rpp[i], level))) {
625 XFS_BMBT_TRACE_CURSOR(cur, ERROR); 621 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
626 goto error0; 622 goto error0;
627 } 623 }
@@ -748,7 +744,7 @@ xfs_bmbt_insrec(
748 int logflags; /* inode logging flags */ 744 int logflags; /* inode logging flags */
749 xfs_fsblock_t nbno; /* new block number */ 745 xfs_fsblock_t nbno; /* new block number */
750 struct xfs_btree_cur *ncur; /* new btree cursor */ 746 struct xfs_btree_cur *ncur; /* new btree cursor */
751 xfs_bmbt_key_t nkey; /* new btree key value */ 747 __uint64_t startoff; /* new btree key value */
752 xfs_bmbt_rec_t nrec; /* new record count */ 748 xfs_bmbt_rec_t nrec; /* new record count */
753 int optr; /* old key/record index */ 749 int optr; /* old key/record index */
754 xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */ 750 xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */
@@ -759,9 +755,8 @@ xfs_bmbt_insrec(
759 ASSERT(level < cur->bc_nlevels); 755 ASSERT(level < cur->bc_nlevels);
760 XFS_BMBT_TRACE_CURSOR(cur, ENTRY); 756 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
761 XFS_BMBT_TRACE_ARGIFR(cur, level, *bnop, recp); 757 XFS_BMBT_TRACE_ARGIFR(cur, level, *bnop, recp);
762 ncur = (xfs_btree_cur_t *)0; 758 ncur = NULL;
763 INT_SET(key.br_startoff, ARCH_CONVERT, 759 key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(recp));
764 xfs_bmbt_disk_get_startoff(recp));
765 optr = ptr = cur->bc_ptrs[level]; 760 optr = ptr = cur->bc_ptrs[level];
766 if (ptr == 0) { 761 if (ptr == 0) {
767 XFS_BMBT_TRACE_CURSOR(cur, EXIT); 762 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
@@ -820,7 +815,7 @@ xfs_bmbt_insrec(
820 optr = ptr = cur->bc_ptrs[level]; 815 optr = ptr = cur->bc_ptrs[level];
821 } else { 816 } else {
822 if ((error = xfs_bmbt_split(cur, level, 817 if ((error = xfs_bmbt_split(cur, level,
823 &nbno, &nkey, &ncur, 818 &nbno, &startoff, &ncur,
824 &i))) { 819 &i))) {
825 XFS_BMBT_TRACE_CURSOR(cur, 820 XFS_BMBT_TRACE_CURSOR(cur,
826 ERROR); 821 ERROR);
@@ -840,7 +835,7 @@ xfs_bmbt_insrec(
840#endif 835#endif
841 ptr = cur->bc_ptrs[level]; 836 ptr = cur->bc_ptrs[level];
842 xfs_bmbt_disk_set_allf(&nrec, 837 xfs_bmbt_disk_set_allf(&nrec,
843 nkey.br_startoff, 0, 0, 838 startoff, 0, 0,
844 XFS_EXT_NORM); 839 XFS_EXT_NORM);
845 } else { 840 } else {
846 XFS_BMBT_TRACE_CURSOR(cur, 841 XFS_BMBT_TRACE_CURSOR(cur,
@@ -858,7 +853,7 @@ xfs_bmbt_insrec(
858 pp = XFS_BMAP_PTR_IADDR(block, 1, cur); 853 pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
859#ifdef DEBUG 854#ifdef DEBUG
860 for (i = numrecs; i >= ptr; i--) { 855 for (i = numrecs; i >= ptr; i--) {
861 if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i - 1], ARCH_CONVERT), 856 if ((error = xfs_btree_check_lptr_disk(cur, pp[i - 1],
862 level))) { 857 level))) {
863 XFS_BMBT_TRACE_CURSOR(cur, ERROR); 858 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
864 return error; 859 return error;
@@ -870,14 +865,13 @@ xfs_bmbt_insrec(
870 memmove(&pp[ptr], &pp[ptr - 1], /* INT_: direct copy */ 865 memmove(&pp[ptr], &pp[ptr - 1], /* INT_: direct copy */
871 (numrecs - ptr + 1) * sizeof(*pp)); 866 (numrecs - ptr + 1) * sizeof(*pp));
872#ifdef DEBUG 867#ifdef DEBUG
873 if ((error = xfs_btree_check_lptr(cur, (xfs_bmbt_ptr_t)*bnop, 868 if ((error = xfs_btree_check_lptr(cur, *bnop, level))) {
874 level))) {
875 XFS_BMBT_TRACE_CURSOR(cur, ERROR); 869 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
876 return error; 870 return error;
877 } 871 }
878#endif 872#endif
879 kp[ptr - 1] = key; 873 kp[ptr - 1] = key;
880 INT_SET(pp[ptr - 1], ARCH_CONVERT, *bnop); 874 pp[ptr - 1] = cpu_to_be64(*bnop);
881 numrecs++; 875 numrecs++;
882 block->bb_numrecs = cpu_to_be16(numrecs); 876 block->bb_numrecs = cpu_to_be16(numrecs);
883 xfs_bmbt_log_keys(cur, bp, ptr, numrecs); 877 xfs_bmbt_log_keys(cur, bp, ptr, numrecs);
@@ -988,7 +982,7 @@ xfs_bmbt_killroot(
988 cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur); 982 cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur);
989#ifdef DEBUG 983#ifdef DEBUG
990 for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) { 984 for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
991 if ((error = xfs_btree_check_lptr(cur, INT_GET(cpp[i], ARCH_CONVERT), level - 1))) { 985 if ((error = xfs_btree_check_lptr_disk(cur, cpp[i], level - 1))) {
992 XFS_BMBT_TRACE_CURSOR(cur, ERROR); 986 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
993 return error; 987 return error;
994 } 988 }
@@ -1132,7 +1126,7 @@ xfs_bmbt_lookup(
1132 d = XFS_FSB_TO_DADDR(mp, fsbno); 1126 d = XFS_FSB_TO_DADDR(mp, fsbno);
1133 bp = cur->bc_bufs[level]; 1127 bp = cur->bc_bufs[level];
1134 if (bp && XFS_BUF_ADDR(bp) != d) 1128 if (bp && XFS_BUF_ADDR(bp) != d)
1135 bp = (xfs_buf_t *)0; 1129 bp = NULL;
1136 if (!bp) { 1130 if (!bp) {
1137 if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 1131 if ((error = xfs_btree_read_bufl(mp, tp, fsbno,
1138 0, &bp, XFS_BMAP_BTREE_REF))) { 1132 0, &bp, XFS_BMAP_BTREE_REF))) {
@@ -1170,7 +1164,7 @@ xfs_bmbt_lookup(
1170 keyno = (low + high) >> 1; 1164 keyno = (low + high) >> 1;
1171 if (level > 0) { 1165 if (level > 0) {
1172 kkp = kkbase + keyno - 1; 1166 kkp = kkbase + keyno - 1;
1173 startoff = INT_GET(kkp->br_startoff, ARCH_CONVERT); 1167 startoff = be64_to_cpu(kkp->br_startoff);
1174 } else { 1168 } else {
1175 krp = krbase + keyno - 1; 1169 krp = krbase + keyno - 1;
1176 startoff = xfs_bmbt_disk_get_startoff(krp); 1170 startoff = xfs_bmbt_disk_get_startoff(krp);
@@ -1189,13 +1183,13 @@ xfs_bmbt_lookup(
1189 if (diff > 0 && --keyno < 1) 1183 if (diff > 0 && --keyno < 1)
1190 keyno = 1; 1184 keyno = 1;
1191 pp = XFS_BMAP_PTR_IADDR(block, keyno, cur); 1185 pp = XFS_BMAP_PTR_IADDR(block, keyno, cur);
1186 fsbno = be64_to_cpu(*pp);
1192#ifdef DEBUG 1187#ifdef DEBUG
1193 if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), level))) { 1188 if ((error = xfs_btree_check_lptr(cur, fsbno, level))) {
1194 XFS_BMBT_TRACE_CURSOR(cur, ERROR); 1189 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1195 return error; 1190 return error;
1196 } 1191 }
1197#endif 1192#endif
1198 fsbno = INT_GET(*pp, ARCH_CONVERT);
1199 cur->bc_ptrs[level] = keyno; 1193 cur->bc_ptrs[level] = keyno;
1200 } 1194 }
1201 } 1195 }
@@ -1313,7 +1307,7 @@ xfs_bmbt_lshift(
1313 lpp = XFS_BMAP_PTR_IADDR(left, lrecs, cur); 1307 lpp = XFS_BMAP_PTR_IADDR(left, lrecs, cur);
1314 rpp = XFS_BMAP_PTR_IADDR(right, 1, cur); 1308 rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
1315#ifdef DEBUG 1309#ifdef DEBUG
1316 if ((error = xfs_btree_check_lptr(cur, INT_GET(*rpp, ARCH_CONVERT), level))) { 1310 if ((error = xfs_btree_check_lptr_disk(cur, *rpp, level))) {
1317 XFS_BMBT_TRACE_CURSOR(cur, ERROR); 1311 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1318 return error; 1312 return error;
1319 } 1313 }
@@ -1340,7 +1334,7 @@ xfs_bmbt_lshift(
1340 if (level > 0) { 1334 if (level > 0) {
1341#ifdef DEBUG 1335#ifdef DEBUG
1342 for (i = 0; i < rrecs; i++) { 1336 for (i = 0; i < rrecs; i++) {
1343 if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i + 1], ARCH_CONVERT), 1337 if ((error = xfs_btree_check_lptr_disk(cur, rpp[i + 1],
1344 level))) { 1338 level))) {
1345 XFS_BMBT_TRACE_CURSOR(cur, ERROR); 1339 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1346 return error; 1340 return error;
@@ -1354,8 +1348,7 @@ xfs_bmbt_lshift(
1354 } else { 1348 } else {
1355 memmove(rrp, rrp + 1, rrecs * sizeof(*rrp)); 1349 memmove(rrp, rrp + 1, rrecs * sizeof(*rrp));
1356 xfs_bmbt_log_recs(cur, rbp, 1, rrecs); 1350 xfs_bmbt_log_recs(cur, rbp, 1, rrecs);
1357 INT_SET(key.br_startoff, ARCH_CONVERT, 1351 key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(rrp));
1358 xfs_bmbt_disk_get_startoff(rrp));
1359 rkp = &key; 1352 rkp = &key;
1360 } 1353 }
1361 if ((error = xfs_bmbt_updkey(cur, rkp, level + 1))) { 1354 if ((error = xfs_bmbt_updkey(cur, rkp, level + 1))) {
@@ -1445,7 +1438,7 @@ xfs_bmbt_rshift(
1445 rpp = XFS_BMAP_PTR_IADDR(right, 1, cur); 1438 rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
1446#ifdef DEBUG 1439#ifdef DEBUG
1447 for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) { 1440 for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) {
1448 if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))) { 1441 if ((error = xfs_btree_check_lptr_disk(cur, rpp[i], level))) {
1449 XFS_BMBT_TRACE_CURSOR(cur, ERROR); 1442 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1450 return error; 1443 return error;
1451 } 1444 }
@@ -1454,7 +1447,7 @@ xfs_bmbt_rshift(
1454 memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp)); 1447 memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
1455 memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp)); 1448 memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
1456#ifdef DEBUG 1449#ifdef DEBUG
1457 if ((error = xfs_btree_check_lptr(cur, INT_GET(*lpp, ARCH_CONVERT), level))) { 1450 if ((error = xfs_btree_check_lptr_disk(cur, *lpp, level))) {
1458 XFS_BMBT_TRACE_CURSOR(cur, ERROR); 1451 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1459 return error; 1452 return error;
1460 } 1453 }
@@ -1469,8 +1462,7 @@ xfs_bmbt_rshift(
1469 memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp)); 1462 memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1470 *rrp = *lrp; 1463 *rrp = *lrp;
1471 xfs_bmbt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); 1464 xfs_bmbt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1472 INT_SET(key.br_startoff, ARCH_CONVERT, 1465 key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(rrp));
1473 xfs_bmbt_disk_get_startoff(rrp));
1474 rkp = &key; 1466 rkp = &key;
1475 } 1467 }
1476 be16_add(&left->bb_numrecs, -1); 1468 be16_add(&left->bb_numrecs, -1);
@@ -1535,7 +1527,7 @@ xfs_bmbt_split(
1535 xfs_btree_cur_t *cur, 1527 xfs_btree_cur_t *cur,
1536 int level, 1528 int level,
1537 xfs_fsblock_t *bnop, 1529 xfs_fsblock_t *bnop,
1538 xfs_bmbt_key_t *keyp, 1530 __uint64_t *startoff,
1539 xfs_btree_cur_t **curp, 1531 xfs_btree_cur_t **curp,
1540 int *stat) /* success/failure */ 1532 int *stat) /* success/failure */
1541{ 1533{
@@ -1560,7 +1552,7 @@ xfs_bmbt_split(
1560 xfs_bmbt_rec_t *rrp; /* right record pointer */ 1552 xfs_bmbt_rec_t *rrp; /* right record pointer */
1561 1553
1562 XFS_BMBT_TRACE_CURSOR(cur, ENTRY); 1554 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1563 XFS_BMBT_TRACE_ARGIFK(cur, level, *bnop, keyp); 1555 XFS_BMBT_TRACE_ARGIFK(cur, level, *bnop, *startoff);
1564 args.tp = cur->bc_tp; 1556 args.tp = cur->bc_tp;
1565 args.mp = cur->bc_mp; 1557 args.mp = cur->bc_mp;
1566 lbp = cur->bc_bufs[level]; 1558 lbp = cur->bc_bufs[level];
@@ -1619,7 +1611,7 @@ xfs_bmbt_split(
1619 rpp = XFS_BMAP_PTR_IADDR(right, 1, cur); 1611 rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
1620#ifdef DEBUG 1612#ifdef DEBUG
1621 for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) { 1613 for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
1622 if ((error = xfs_btree_check_lptr(cur, INT_GET(lpp[i], ARCH_CONVERT), level))) { 1614 if ((error = xfs_btree_check_lptr_disk(cur, lpp[i], level))) {
1623 XFS_BMBT_TRACE_CURSOR(cur, ERROR); 1615 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1624 return error; 1616 return error;
1625 } 1617 }
@@ -1629,13 +1621,13 @@ xfs_bmbt_split(
1629 memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp)); 1621 memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
1630 xfs_bmbt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); 1622 xfs_bmbt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1631 xfs_bmbt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); 1623 xfs_bmbt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1632 keyp->br_startoff = INT_GET(rkp->br_startoff, ARCH_CONVERT); 1624 *startoff = be64_to_cpu(rkp->br_startoff);
1633 } else { 1625 } else {
1634 lrp = XFS_BMAP_REC_IADDR(left, i, cur); 1626 lrp = XFS_BMAP_REC_IADDR(left, i, cur);
1635 rrp = XFS_BMAP_REC_IADDR(right, 1, cur); 1627 rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
1636 memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp)); 1628 memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1637 xfs_bmbt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); 1629 xfs_bmbt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1638 keyp->br_startoff = xfs_bmbt_disk_get_startoff(rrp); 1630 *startoff = xfs_bmbt_disk_get_startoff(rrp);
1639 } 1631 }
1640 be16_add(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs))); 1632 be16_add(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
1641 right->bb_rightsib = left->bb_rightsib; 1633 right->bb_rightsib = left->bb_rightsib;
@@ -1728,9 +1720,9 @@ xfs_bmdr_to_bmbt(
1728{ 1720{
1729 int dmxr; 1721 int dmxr;
1730 xfs_bmbt_key_t *fkp; 1722 xfs_bmbt_key_t *fkp;
1731 xfs_bmbt_ptr_t *fpp; 1723 __be64 *fpp;
1732 xfs_bmbt_key_t *tkp; 1724 xfs_bmbt_key_t *tkp;
1733 xfs_bmbt_ptr_t *tpp; 1725 __be64 *tpp;
1734 1726
1735 rblock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); 1727 rblock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
1736 rblock->bb_level = dblock->bb_level; 1728 rblock->bb_level = dblock->bb_level;
@@ -1745,7 +1737,7 @@ xfs_bmdr_to_bmbt(
1745 tpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen); 1737 tpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen);
1746 dmxr = be16_to_cpu(dblock->bb_numrecs); 1738 dmxr = be16_to_cpu(dblock->bb_numrecs);
1747 memcpy(tkp, fkp, sizeof(*fkp) * dmxr); 1739 memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
1748 memcpy(tpp, fpp, sizeof(*fpp) * dmxr); /* INT_: direct copy */ 1740 memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
1749} 1741}
1750 1742
1751/* 1743/*
@@ -1805,7 +1797,7 @@ xfs_bmbt_decrement(
1805 tp = cur->bc_tp; 1797 tp = cur->bc_tp;
1806 mp = cur->bc_mp; 1798 mp = cur->bc_mp;
1807 for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) { 1799 for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) {
1808 fsbno = INT_GET(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT); 1800 fsbno = be64_to_cpu(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur));
1809 if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp, 1801 if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp,
1810 XFS_BMAP_BTREE_REF))) { 1802 XFS_BMAP_BTREE_REF))) {
1811 XFS_BMBT_TRACE_CURSOR(cur, ERROR); 1803 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
@@ -2135,7 +2127,7 @@ xfs_bmbt_increment(
2135 tp = cur->bc_tp; 2127 tp = cur->bc_tp;
2136 mp = cur->bc_mp; 2128 mp = cur->bc_mp;
2137 for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) { 2129 for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) {
2138 fsbno = INT_GET(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT); 2130 fsbno = be64_to_cpu(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur));
2139 if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp, 2131 if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp,
2140 XFS_BMAP_BTREE_REF))) { 2132 XFS_BMAP_BTREE_REF))) {
2141 XFS_BMBT_TRACE_CURSOR(cur, ERROR); 2133 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
@@ -2178,7 +2170,7 @@ xfs_bmbt_insert(
2178 level = 0; 2170 level = 0;
2179 nbno = NULLFSBLOCK; 2171 nbno = NULLFSBLOCK;
2180 xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b); 2172 xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b);
2181 ncur = (xfs_btree_cur_t *)0; 2173 ncur = NULL;
2182 pcur = cur; 2174 pcur = cur;
2183 do { 2175 do {
2184 if ((error = xfs_bmbt_insrec(pcur, level++, &nbno, &nrec, &ncur, 2176 if ((error = xfs_bmbt_insrec(pcur, level++, &nbno, &nrec, &ncur,
@@ -2205,7 +2197,7 @@ xfs_bmbt_insert(
2205 } 2197 }
2206 if (ncur) { 2198 if (ncur) {
2207 pcur = ncur; 2199 pcur = ncur;
2208 ncur = (xfs_btree_cur_t *)0; 2200 ncur = NULL;
2209 } 2201 }
2210 } while (nbno != NULLFSBLOCK); 2202 } while (nbno != NULLFSBLOCK);
2211 XFS_BMBT_TRACE_CURSOR(cur, EXIT); 2203 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
@@ -2356,12 +2348,12 @@ xfs_bmbt_newroot(
2356 args.firstblock = args.fsbno; 2348 args.firstblock = args.fsbno;
2357 if (args.fsbno == NULLFSBLOCK) { 2349 if (args.fsbno == NULLFSBLOCK) {
2358#ifdef DEBUG 2350#ifdef DEBUG
2359 if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), level))) { 2351 if ((error = xfs_btree_check_lptr_disk(cur, *pp, level))) {
2360 XFS_BMBT_TRACE_CURSOR(cur, ERROR); 2352 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2361 return error; 2353 return error;
2362 } 2354 }
2363#endif 2355#endif
2364 args.fsbno = INT_GET(*pp, ARCH_CONVERT); 2356 args.fsbno = be64_to_cpu(*pp);
2365 args.type = XFS_ALLOCTYPE_START_BNO; 2357 args.type = XFS_ALLOCTYPE_START_BNO;
2366 } else 2358 } else
2367 args.type = XFS_ALLOCTYPE_NEAR_BNO; 2359 args.type = XFS_ALLOCTYPE_NEAR_BNO;
@@ -2393,7 +2385,7 @@ xfs_bmbt_newroot(
2393 cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur); 2385 cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur);
2394#ifdef DEBUG 2386#ifdef DEBUG
2395 for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) { 2387 for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
2396 if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i], ARCH_CONVERT), level))) { 2388 if ((error = xfs_btree_check_lptr_disk(cur, pp[i], level))) {
2397 XFS_BMBT_TRACE_CURSOR(cur, ERROR); 2389 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2398 return error; 2390 return error;
2399 } 2391 }
@@ -2401,13 +2393,12 @@ xfs_bmbt_newroot(
2401#endif 2393#endif
2402 memcpy(cpp, pp, be16_to_cpu(cblock->bb_numrecs) * sizeof(*pp)); 2394 memcpy(cpp, pp, be16_to_cpu(cblock->bb_numrecs) * sizeof(*pp));
2403#ifdef DEBUG 2395#ifdef DEBUG
2404 if ((error = xfs_btree_check_lptr(cur, (xfs_bmbt_ptr_t)args.fsbno, 2396 if ((error = xfs_btree_check_lptr(cur, args.fsbno, level))) {
2405 level))) {
2406 XFS_BMBT_TRACE_CURSOR(cur, ERROR); 2397 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2407 return error; 2398 return error;
2408 } 2399 }
2409#endif 2400#endif
2410 INT_SET(*pp, ARCH_CONVERT, args.fsbno); 2401 *pp = cpu_to_be64(args.fsbno);
2411 xfs_iroot_realloc(cur->bc_private.b.ip, 1 - be16_to_cpu(cblock->bb_numrecs), 2402 xfs_iroot_realloc(cur->bc_private.b.ip, 1 - be16_to_cpu(cblock->bb_numrecs),
2412 cur->bc_private.b.whichfork); 2403 cur->bc_private.b.whichfork);
2413 xfs_btree_setbuf(cur, level, bp); 2404 xfs_btree_setbuf(cur, level, bp);
@@ -2681,9 +2672,9 @@ xfs_bmbt_to_bmdr(
2681{ 2672{
2682 int dmxr; 2673 int dmxr;
2683 xfs_bmbt_key_t *fkp; 2674 xfs_bmbt_key_t *fkp;
2684 xfs_bmbt_ptr_t *fpp; 2675 __be64 *fpp;
2685 xfs_bmbt_key_t *tkp; 2676 xfs_bmbt_key_t *tkp;
2686 xfs_bmbt_ptr_t *tpp; 2677 __be64 *tpp;
2687 2678
2688 ASSERT(be32_to_cpu(rblock->bb_magic) == XFS_BMAP_MAGIC); 2679 ASSERT(be32_to_cpu(rblock->bb_magic) == XFS_BMAP_MAGIC);
2689 ASSERT(be64_to_cpu(rblock->bb_leftsib) == NULLDFSBNO); 2680 ASSERT(be64_to_cpu(rblock->bb_leftsib) == NULLDFSBNO);
@@ -2698,7 +2689,7 @@ xfs_bmbt_to_bmdr(
2698 tpp = XFS_BTREE_PTR_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr); 2689 tpp = XFS_BTREE_PTR_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr);
2699 dmxr = be16_to_cpu(dblock->bb_numrecs); 2690 dmxr = be16_to_cpu(dblock->bb_numrecs);
2700 memcpy(tkp, fkp, sizeof(*fkp) * dmxr); 2691 memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
2701 memcpy(tpp, fpp, sizeof(*fpp) * dmxr); /* INT_: direct copy */ 2692 memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
2702} 2693}
2703 2694
2704/* 2695/*
@@ -2740,7 +2731,7 @@ xfs_bmbt_update(
2740 XFS_BMBT_TRACE_CURSOR(cur, EXIT); 2731 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2741 return 0; 2732 return 0;
2742 } 2733 }
2743 INT_SET(key.br_startoff, ARCH_CONVERT, off); 2734 key.br_startoff = cpu_to_be64(off);
2744 if ((error = xfs_bmbt_updkey(cur, &key, 1))) { 2735 if ((error = xfs_bmbt_updkey(cur, &key, 1))) {
2745 XFS_BMBT_TRACE_CURSOR(cur, ERROR); 2736 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2746 return error; 2737 return error;
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 6478cfa0e539..49539de9525b 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -163,13 +163,14 @@ typedef struct xfs_bmbt_irec
163/* 163/*
164 * Key structure for non-leaf levels of the tree. 164 * Key structure for non-leaf levels of the tree.
165 */ 165 */
166typedef struct xfs_bmbt_key 166typedef struct xfs_bmbt_key {
167{ 167 __be64 br_startoff; /* starting file offset */
168 xfs_dfiloff_t br_startoff; /* starting file offset */
169} xfs_bmbt_key_t, xfs_bmdr_key_t; 168} xfs_bmbt_key_t, xfs_bmdr_key_t;
170 169
171typedef xfs_dfsbno_t xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; /* btree pointer type */ 170/* btree pointer type */
172 /* btree block header type */ 171typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
172
173/* btree block header type */
173typedef struct xfs_btree_lblock xfs_bmbt_block_t; 174typedef struct xfs_btree_lblock xfs_bmbt_block_t;
174 175
175#define XFS_BUF_TO_BMBT_BLOCK(bp) ((xfs_bmbt_block_t *)XFS_BUF_PTR(bp)) 176#define XFS_BUF_TO_BMBT_BLOCK(bp) ((xfs_bmbt_block_t *)XFS_BUF_PTR(bp))
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index ee2255bd6562..aeb87ca69fcc 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -161,7 +161,7 @@ xfs_btree_check_key(
161 161
162 k1 = ak1; 162 k1 = ak1;
163 k2 = ak2; 163 k2 = ak2;
164 ASSERT(INT_GET(k1->br_startoff, ARCH_CONVERT) < INT_GET(k2->br_startoff, ARCH_CONVERT)); 164 ASSERT(be64_to_cpu(k1->br_startoff) < be64_to_cpu(k2->br_startoff));
165 break; 165 break;
166 } 166 }
167 case XFS_BTNUM_INO: { 167 case XFS_BTNUM_INO: {
@@ -170,7 +170,7 @@ xfs_btree_check_key(
170 170
171 k1 = ak1; 171 k1 = ak1;
172 k2 = ak2; 172 k2 = ak2;
173 ASSERT(INT_GET(k1->ir_startino, ARCH_CONVERT) < INT_GET(k2->ir_startino, ARCH_CONVERT)); 173 ASSERT(be32_to_cpu(k1->ir_startino) < be32_to_cpu(k2->ir_startino));
174 break; 174 break;
175 } 175 }
176 default: 176 default:
@@ -285,8 +285,8 @@ xfs_btree_check_rec(
285 285
286 r1 = ar1; 286 r1 = ar1;
287 r2 = ar2; 287 r2 = ar2;
288 ASSERT(INT_GET(r1->ir_startino, ARCH_CONVERT) + XFS_INODES_PER_CHUNK <= 288 ASSERT(be32_to_cpu(r1->ir_startino) + XFS_INODES_PER_CHUNK <=
289 INT_GET(r2->ir_startino, ARCH_CONVERT)); 289 be32_to_cpu(r2->ir_startino));
290 break; 290 break;
291 } 291 }
292 default: 292 default:
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 44f1bd98064a..892b06c54263 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -145,7 +145,7 @@ typedef struct xfs_btree_cur
145 union { 145 union {
146 xfs_alloc_rec_incore_t a; 146 xfs_alloc_rec_incore_t a;
147 xfs_bmbt_irec_t b; 147 xfs_bmbt_irec_t b;
148 xfs_inobt_rec_t i; 148 xfs_inobt_rec_incore_t i;
149 } bc_rec; /* current insert/search record value */ 149 } bc_rec; /* current insert/search record value */
150 struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level */ 150 struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level */
151 int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # */ 151 int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # */
@@ -243,6 +243,9 @@ xfs_btree_check_lptr(
243 xfs_dfsbno_t ptr, /* btree block disk address */ 243 xfs_dfsbno_t ptr, /* btree block disk address */
244 int level); /* btree block level */ 244 int level); /* btree block level */
245 245
246#define xfs_btree_check_lptr_disk(cur, ptr, level) \
247 xfs_btree_check_lptr(cur, be64_to_cpu(ptr), level)
248
246/* 249/*
247 * Checking routine: check that short form block header is ok. 250 * Checking routine: check that short form block header is ok.
248 */ 251 */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index a4aa53974f76..7a55c248ea70 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -234,7 +234,6 @@ xfs_buf_item_format(
234 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || 234 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
235 (bip->bli_flags & XFS_BLI_STALE)); 235 (bip->bli_flags & XFS_BLI_STALE));
236 bp = bip->bli_buf; 236 bp = bip->bli_buf;
237 ASSERT(XFS_BUF_BP_ISMAPPED(bp));
238 vecp = log_vector; 237 vecp = log_vector;
239 238
240 /* 239 /*
@@ -628,25 +627,6 @@ xfs_buf_item_committed(
628} 627}
629 628
630/* 629/*
631 * This is called when the transaction holding the buffer is aborted.
632 * Just behave as if the transaction had been cancelled. If we're shutting down
633 * and have aborted this transaction, we'll trap this buffer when it tries to
634 * get written out.
635 */
636STATIC void
637xfs_buf_item_abort(
638 xfs_buf_log_item_t *bip)
639{
640 xfs_buf_t *bp;
641
642 bp = bip->bli_buf;
643 xfs_buftrace("XFS_ABORT", bp);
644 XFS_BUF_SUPER_STALE(bp);
645 xfs_buf_item_unlock(bip);
646 return;
647}
648
649/*
650 * This is called to asynchronously write the buffer associated with this 630 * This is called to asynchronously write the buffer associated with this
651 * buf log item out to disk. The buffer will already have been locked by 631 * buf log item out to disk. The buffer will already have been locked by
652 * a successful call to xfs_buf_item_trylock(). If the buffer still has 632 * a successful call to xfs_buf_item_trylock(). If the buffer still has
@@ -693,7 +673,6 @@ STATIC struct xfs_item_ops xfs_buf_item_ops = {
693 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) 673 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
694 xfs_buf_item_committed, 674 xfs_buf_item_committed,
695 .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push, 675 .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push,
696 .iop_abort = (void(*)(xfs_log_item_t*))xfs_buf_item_abort,
697 .iop_pushbuf = NULL, 676 .iop_pushbuf = NULL,
698 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) 677 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
699 xfs_buf_item_committing 678 xfs_buf_item_committing
@@ -901,7 +880,6 @@ xfs_buf_item_relse(
901 XFS_BUF_SET_FSPRIVATE(bp, bip->bli_item.li_bio_list); 880 XFS_BUF_SET_FSPRIVATE(bp, bip->bli_item.li_bio_list);
902 if ((XFS_BUF_FSPRIVATE(bp, void *) == NULL) && 881 if ((XFS_BUF_FSPRIVATE(bp, void *) == NULL) &&
903 (XFS_BUF_IODONE_FUNC(bp) != NULL)) { 882 (XFS_BUF_IODONE_FUNC(bp) != NULL)) {
904 ASSERT((XFS_BUF_ISUNINITIAL(bp)) == 0);
905 XFS_BUF_CLR_IODONE_FUNC(bp); 883 XFS_BUF_CLR_IODONE_FUNC(bp);
906 } 884 }
907 885
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 32ab61d17ace..a68bc1f1a313 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1054,7 +1054,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
1054 xfs_da_node_entry_t *btree; 1054 xfs_da_node_entry_t *btree;
1055 xfs_dablk_t blkno; 1055 xfs_dablk_t blkno;
1056 int probe, span, max, error, retval; 1056 int probe, span, max, error, retval;
1057 xfs_dahash_t hashval; 1057 xfs_dahash_t hashval, btreehashval;
1058 xfs_da_args_t *args; 1058 xfs_da_args_t *args;
1059 1059
1060 args = state->args; 1060 args = state->args;
@@ -1079,30 +1079,32 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
1079 return(error); 1079 return(error);
1080 } 1080 }
1081 curr = blk->bp->data; 1081 curr = blk->bp->data;
1082 ASSERT(be16_to_cpu(curr->magic) == XFS_DA_NODE_MAGIC || 1082 blk->magic = be16_to_cpu(curr->magic);
1083 be16_to_cpu(curr->magic) == XFS_DIR2_LEAFN_MAGIC || 1083 ASSERT(blk->magic == XFS_DA_NODE_MAGIC ||
1084 be16_to_cpu(curr->magic) == XFS_ATTR_LEAF_MAGIC); 1084 blk->magic == XFS_DIR2_LEAFN_MAGIC ||
1085 blk->magic == XFS_ATTR_LEAF_MAGIC);
1085 1086
1086 /* 1087 /*
1087 * Search an intermediate node for a match. 1088 * Search an intermediate node for a match.
1088 */ 1089 */
1089 blk->magic = be16_to_cpu(curr->magic);
1090 if (blk->magic == XFS_DA_NODE_MAGIC) { 1090 if (blk->magic == XFS_DA_NODE_MAGIC) {
1091 node = blk->bp->data; 1091 node = blk->bp->data;
1092 blk->hashval = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval); 1092 max = be16_to_cpu(node->hdr.count);
1093 btreehashval = node->btree[max-1].hashval;
1094 blk->hashval = be32_to_cpu(btreehashval);
1093 1095
1094 /* 1096 /*
1095 * Binary search. (note: small blocks will skip loop) 1097 * Binary search. (note: small blocks will skip loop)
1096 */ 1098 */
1097 max = be16_to_cpu(node->hdr.count);
1098 probe = span = max / 2; 1099 probe = span = max / 2;
1099 hashval = args->hashval; 1100 hashval = args->hashval;
1100 for (btree = &node->btree[probe]; span > 4; 1101 for (btree = &node->btree[probe]; span > 4;
1101 btree = &node->btree[probe]) { 1102 btree = &node->btree[probe]) {
1102 span /= 2; 1103 span /= 2;
1103 if (be32_to_cpu(btree->hashval) < hashval) 1104 btreehashval = be32_to_cpu(btree->hashval);
1105 if (btreehashval < hashval)
1104 probe += span; 1106 probe += span;
1105 else if (be32_to_cpu(btree->hashval) > hashval) 1107 else if (btreehashval > hashval)
1106 probe -= span; 1108 probe -= span;
1107 else 1109 else
1108 break; 1110 break;
@@ -1133,10 +1135,10 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
1133 blk->index = probe; 1135 blk->index = probe;
1134 blkno = be32_to_cpu(btree->before); 1136 blkno = be32_to_cpu(btree->before);
1135 } 1137 }
1136 } else if (be16_to_cpu(curr->magic) == XFS_ATTR_LEAF_MAGIC) { 1138 } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
1137 blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL); 1139 blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
1138 break; 1140 break;
1139 } else if (be16_to_cpu(curr->magic) == XFS_DIR2_LEAFN_MAGIC) { 1141 } else if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
1140 blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL); 1142 blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL);
1141 break; 1143 break;
1142 } 1144 }
@@ -1152,11 +1154,13 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
1152 if (blk->magic == XFS_DIR2_LEAFN_MAGIC) { 1154 if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
1153 retval = xfs_dir2_leafn_lookup_int(blk->bp, args, 1155 retval = xfs_dir2_leafn_lookup_int(blk->bp, args,
1154 &blk->index, state); 1156 &blk->index, state);
1155 } 1157 } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
1156 else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
1157 retval = xfs_attr_leaf_lookup_int(blk->bp, args); 1158 retval = xfs_attr_leaf_lookup_int(blk->bp, args);
1158 blk->index = args->index; 1159 blk->index = args->index;
1159 args->blkno = blk->blkno; 1160 args->blkno = blk->blkno;
1161 } else {
1162 ASSERT(0);
1163 return XFS_ERROR(EFSCORRUPTED);
1160 } 1164 }
1161 if (((retval == ENOENT) || (retval == ENOATTR)) && 1165 if (((retval == ENOENT) || (retval == ENOATTR)) &&
1162 (blk->hashval == args->hashval)) { 1166 (blk->hashval == args->hashval)) {
@@ -1166,8 +1170,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
1166 return(error); 1170 return(error);
1167 if (retval == 0) { 1171 if (retval == 0) {
1168 continue; 1172 continue;
1169 } 1173 } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
1170 else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
1171 /* path_shift() gives ENOENT */ 1174 /* path_shift() gives ENOENT */
1172 retval = XFS_ERROR(ENOATTR); 1175 retval = XFS_ERROR(ENOATTR);
1173 } 1176 }
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index bc43163456ef..0893e16b7d83 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -18,14 +18,6 @@
18#ifndef __XFS_ERROR_H__ 18#ifndef __XFS_ERROR_H__
19#define __XFS_ERROR_H__ 19#define __XFS_ERROR_H__
20 20
21#define XFS_ERECOVER 1 /* Failure to recover log */
22#define XFS_ELOGSTAT 2 /* Failure to stat log in user space */
23#define XFS_ENOLOGSPACE 3 /* Reservation too large */
24#define XFS_ENOTSUP 4 /* Operation not supported */
25#define XFS_ENOLSN 5 /* Can't find the lsn you asked for */
26#define XFS_ENOTFOUND 6
27#define XFS_ENOTXFS 7 /* Not XFS filesystem */
28
29#ifdef DEBUG 21#ifdef DEBUG
30#define XFS_ERROR_NTRAP 10 22#define XFS_ERROR_NTRAP 10
31extern int xfs_etrap[XFS_ERROR_NTRAP]; 23extern int xfs_etrap[XFS_ERROR_NTRAP];
@@ -175,6 +167,7 @@ extern int xfs_errortag_clearall_umount(int64_t fsid, char *fsname, int loud);
175#define XFS_PTAG_SHUTDOWN_CORRUPT 0x00000010 167#define XFS_PTAG_SHUTDOWN_CORRUPT 0x00000010
176#define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020 168#define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020
177#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 169#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040
170#define XFS_PTAG_FSBLOCK_ZERO 0x00000080
178 171
179struct xfs_mount; 172struct xfs_mount;
180/* PRINTFLIKE4 */ 173/* PRINTFLIKE4 */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 6cf6d8769b97..6dba78199faf 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -33,9 +33,6 @@ kmem_zone_t *xfs_efi_zone;
33kmem_zone_t *xfs_efd_zone; 33kmem_zone_t *xfs_efd_zone;
34 34
35STATIC void xfs_efi_item_unlock(xfs_efi_log_item_t *); 35STATIC void xfs_efi_item_unlock(xfs_efi_log_item_t *);
36STATIC void xfs_efi_item_abort(xfs_efi_log_item_t *);
37STATIC void xfs_efd_item_abort(xfs_efd_log_item_t *);
38
39 36
40void 37void
41xfs_efi_item_free(xfs_efi_log_item_t *efip) 38xfs_efi_item_free(xfs_efi_log_item_t *efip)
@@ -184,7 +181,7 @@ STATIC void
184xfs_efi_item_unlock(xfs_efi_log_item_t *efip) 181xfs_efi_item_unlock(xfs_efi_log_item_t *efip)
185{ 182{
186 if (efip->efi_item.li_flags & XFS_LI_ABORTED) 183 if (efip->efi_item.li_flags & XFS_LI_ABORTED)
187 xfs_efi_item_abort(efip); 184 xfs_efi_item_free(efip);
188 return; 185 return;
189} 186}
190 187
@@ -202,18 +199,6 @@ xfs_efi_item_committed(xfs_efi_log_item_t *efip, xfs_lsn_t lsn)
202} 199}
203 200
204/* 201/*
205 * This is called when the transaction logging the EFI is aborted.
206 * Free up the EFI and return. No need to clean up the slot for
207 * the item in the transaction. That was done by the unpin code
208 * which is called prior to this routine in the abort/fs-shutdown path.
209 */
210STATIC void
211xfs_efi_item_abort(xfs_efi_log_item_t *efip)
212{
213 xfs_efi_item_free(efip);
214}
215
216/*
217 * There isn't much you can do to push on an efi item. It is simply 202 * There isn't much you can do to push on an efi item. It is simply
218 * stuck waiting for all of its corresponding efd items to be 203 * stuck waiting for all of its corresponding efd items to be
219 * committed to disk. 204 * committed to disk.
@@ -255,7 +240,6 @@ STATIC struct xfs_item_ops xfs_efi_item_ops = {
255 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) 240 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
256 xfs_efi_item_committed, 241 xfs_efi_item_committed,
257 .iop_push = (void(*)(xfs_log_item_t*))xfs_efi_item_push, 242 .iop_push = (void(*)(xfs_log_item_t*))xfs_efi_item_push,
258 .iop_abort = (void(*)(xfs_log_item_t*))xfs_efi_item_abort,
259 .iop_pushbuf = NULL, 243 .iop_pushbuf = NULL,
260 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) 244 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
261 xfs_efi_item_committing 245 xfs_efi_item_committing
@@ -386,33 +370,6 @@ xfs_efi_release(xfs_efi_log_item_t *efip,
386 } 370 }
387} 371}
388 372
389/*
390 * This is called when the transaction that should be committing the
391 * EFD corresponding to the given EFI is aborted. The committed and
392 * canceled flags are used to coordinate the freeing of the EFI and
393 * the references by the transaction that committed it.
394 */
395STATIC void
396xfs_efi_cancel(
397 xfs_efi_log_item_t *efip)
398{
399 xfs_mount_t *mp;
400 SPLDECL(s);
401
402 mp = efip->efi_item.li_mountp;
403 AIL_LOCK(mp, s);
404 if (efip->efi_flags & XFS_EFI_COMMITTED) {
405 /*
406 * xfs_trans_delete_ail() drops the AIL lock.
407 */
408 xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s);
409 xfs_efi_item_free(efip);
410 } else {
411 efip->efi_flags |= XFS_EFI_CANCELED;
412 AIL_UNLOCK(mp, s);
413 }
414}
415
416STATIC void 373STATIC void
417xfs_efd_item_free(xfs_efd_log_item_t *efdp) 374xfs_efd_item_free(xfs_efd_log_item_t *efdp)
418{ 375{
@@ -514,7 +471,7 @@ STATIC void
514xfs_efd_item_unlock(xfs_efd_log_item_t *efdp) 471xfs_efd_item_unlock(xfs_efd_log_item_t *efdp)
515{ 472{
516 if (efdp->efd_item.li_flags & XFS_LI_ABORTED) 473 if (efdp->efd_item.li_flags & XFS_LI_ABORTED)
517 xfs_efd_item_abort(efdp); 474 xfs_efd_item_free(efdp);
518 return; 475 return;
519} 476}
520 477
@@ -541,27 +498,6 @@ xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn)
541} 498}
542 499
543/* 500/*
544 * The transaction of which this EFD is a part has been aborted.
545 * Inform its companion EFI of this fact and then clean up after
546 * ourselves. No need to clean up the slot for the item in the
547 * transaction. That was done by the unpin code which is called
548 * prior to this routine in the abort/fs-shutdown path.
549 */
550STATIC void
551xfs_efd_item_abort(xfs_efd_log_item_t *efdp)
552{
553 /*
554 * If we got a log I/O error, it's always the case that the LR with the
555 * EFI got unpinned and freed before the EFD got aborted. So don't
556 * reference the EFI at all in that case.
557 */
558 if ((efdp->efd_item.li_flags & XFS_LI_ABORTED) == 0)
559 xfs_efi_cancel(efdp->efd_efip);
560
561 xfs_efd_item_free(efdp);
562}
563
564/*
565 * There isn't much you can do to push on an efd item. It is simply 501 * There isn't much you can do to push on an efd item. It is simply
566 * stuck waiting for the log to be flushed to disk. 502 * stuck waiting for the log to be flushed to disk.
567 */ 503 */
@@ -602,7 +538,6 @@ STATIC struct xfs_item_ops xfs_efd_item_ops = {
602 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) 538 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
603 xfs_efd_item_committed, 539 xfs_efd_item_committed,
604 .iop_push = (void(*)(xfs_log_item_t*))xfs_efd_item_push, 540 .iop_push = (void(*)(xfs_log_item_t*))xfs_efd_item_push,
605 .iop_abort = (void(*)(xfs_log_item_t*))xfs_efd_item_abort,
606 .iop_pushbuf = NULL, 541 .iop_pushbuf = NULL,
607 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) 542 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
608 xfs_efd_item_committing 543 xfs_efd_item_committing
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 0ea45edaab03..2f049f63e85f 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -33,14 +33,16 @@ typedef struct xfs_extent {
33 * conversion routine. 33 * conversion routine.
34 */ 34 */
35 35
36#ifndef HAVE_FORMAT32
36typedef struct xfs_extent_32 { 37typedef struct xfs_extent_32 {
37 xfs_dfsbno_t ext_start; 38 __uint64_t ext_start;
38 xfs_extlen_t ext_len; 39 __uint32_t ext_len;
39} __attribute__((packed)) xfs_extent_32_t; 40} __attribute__((packed)) xfs_extent_32_t;
41#endif
40 42
41typedef struct xfs_extent_64 { 43typedef struct xfs_extent_64 {
42 xfs_dfsbno_t ext_start; 44 __uint64_t ext_start;
43 xfs_extlen_t ext_len; 45 __uint32_t ext_len;
44 __uint32_t ext_pad; 46 __uint32_t ext_pad;
45} xfs_extent_64_t; 47} xfs_extent_64_t;
46 48
@@ -50,25 +52,27 @@ typedef struct xfs_extent_64 {
50 * size is given by efi_nextents. 52 * size is given by efi_nextents.
51 */ 53 */
52typedef struct xfs_efi_log_format { 54typedef struct xfs_efi_log_format {
53 unsigned short efi_type; /* efi log item type */ 55 __uint16_t efi_type; /* efi log item type */
54 unsigned short efi_size; /* size of this item */ 56 __uint16_t efi_size; /* size of this item */
55 uint efi_nextents; /* # extents to free */ 57 __uint32_t efi_nextents; /* # extents to free */
56 __uint64_t efi_id; /* efi identifier */ 58 __uint64_t efi_id; /* efi identifier */
57 xfs_extent_t efi_extents[1]; /* array of extents to free */ 59 xfs_extent_t efi_extents[1]; /* array of extents to free */
58} xfs_efi_log_format_t; 60} xfs_efi_log_format_t;
59 61
62#ifndef HAVE_FORMAT32
60typedef struct xfs_efi_log_format_32 { 63typedef struct xfs_efi_log_format_32 {
61 unsigned short efi_type; /* efi log item type */ 64 __uint16_t efi_type; /* efi log item type */
62 unsigned short efi_size; /* size of this item */ 65 __uint16_t efi_size; /* size of this item */
63 uint efi_nextents; /* # extents to free */ 66 __uint32_t efi_nextents; /* # extents to free */
64 __uint64_t efi_id; /* efi identifier */ 67 __uint64_t efi_id; /* efi identifier */
65 xfs_extent_32_t efi_extents[1]; /* array of extents to free */ 68 xfs_extent_32_t efi_extents[1]; /* array of extents to free */
66} __attribute__((packed)) xfs_efi_log_format_32_t; 69} __attribute__((packed)) xfs_efi_log_format_32_t;
70#endif
67 71
68typedef struct xfs_efi_log_format_64 { 72typedef struct xfs_efi_log_format_64 {
69 unsigned short efi_type; /* efi log item type */ 73 __uint16_t efi_type; /* efi log item type */
70 unsigned short efi_size; /* size of this item */ 74 __uint16_t efi_size; /* size of this item */
71 uint efi_nextents; /* # extents to free */ 75 __uint32_t efi_nextents; /* # extents to free */
72 __uint64_t efi_id; /* efi identifier */ 76 __uint64_t efi_id; /* efi identifier */
73 xfs_extent_64_t efi_extents[1]; /* array of extents to free */ 77 xfs_extent_64_t efi_extents[1]; /* array of extents to free */
74} xfs_efi_log_format_64_t; 78} xfs_efi_log_format_64_t;
@@ -79,25 +83,27 @@ typedef struct xfs_efi_log_format_64 {
79 * size is given by efd_nextents; 83 * size is given by efd_nextents;
80 */ 84 */
81typedef struct xfs_efd_log_format { 85typedef struct xfs_efd_log_format {
82 unsigned short efd_type; /* efd log item type */ 86 __uint16_t efd_type; /* efd log item type */
83 unsigned short efd_size; /* size of this item */ 87 __uint16_t efd_size; /* size of this item */
84 uint efd_nextents; /* # of extents freed */ 88 __uint32_t efd_nextents; /* # of extents freed */
85 __uint64_t efd_efi_id; /* id of corresponding efi */ 89 __uint64_t efd_efi_id; /* id of corresponding efi */
86 xfs_extent_t efd_extents[1]; /* array of extents freed */ 90 xfs_extent_t efd_extents[1]; /* array of extents freed */
87} xfs_efd_log_format_t; 91} xfs_efd_log_format_t;
88 92
93#ifndef HAVE_FORMAT32
89typedef struct xfs_efd_log_format_32 { 94typedef struct xfs_efd_log_format_32 {
90 unsigned short efd_type; /* efd log item type */ 95 __uint16_t efd_type; /* efd log item type */
91 unsigned short efd_size; /* size of this item */ 96 __uint16_t efd_size; /* size of this item */
92 uint efd_nextents; /* # of extents freed */ 97 __uint32_t efd_nextents; /* # of extents freed */
93 __uint64_t efd_efi_id; /* id of corresponding efi */ 98 __uint64_t efd_efi_id; /* id of corresponding efi */
94 xfs_extent_32_t efd_extents[1]; /* array of extents freed */ 99 xfs_extent_32_t efd_extents[1]; /* array of extents freed */
95} __attribute__((packed)) xfs_efd_log_format_32_t; 100} __attribute__((packed)) xfs_efd_log_format_32_t;
101#endif
96 102
97typedef struct xfs_efd_log_format_64 { 103typedef struct xfs_efd_log_format_64 {
98 unsigned short efd_type; /* efd log item type */ 104 __uint16_t efd_type; /* efd log item type */
99 unsigned short efd_size; /* size of this item */ 105 __uint16_t efd_size; /* size of this item */
100 uint efd_nextents; /* # of extents freed */ 106 __uint32_t efd_nextents; /* # of extents freed */
101 __uint64_t efd_efi_id; /* id of corresponding efi */ 107 __uint64_t efd_efi_id; /* id of corresponding efi */
102 xfs_extent_64_t efd_extents[1]; /* array of extents freed */ 108 xfs_extent_64_t efd_extents[1]; /* array of extents freed */
103} xfs_efd_log_format_64_t; 109} xfs_efd_log_format_64_t;
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 0f0ad1535951..1335449841cd 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -22,8 +22,6 @@
22 * SGI's XFS filesystem's major stuff (constants, structures) 22 * SGI's XFS filesystem's major stuff (constants, structures)
23 */ 23 */
24 24
25#define XFS_NAME "xfs"
26
27/* 25/*
28 * Direct I/O attribute record used with XFS_IOC_DIOINFO 26 * Direct I/O attribute record used with XFS_IOC_DIOINFO
29 * d_miniosz is the min xfer size, xfer size multiple and file seek offset 27 * d_miniosz is the min xfer size, xfer size multiple and file seek offset
@@ -426,11 +424,7 @@ typedef struct xfs_handle {
426 - (char *) &(handle)) \ 424 - (char *) &(handle)) \
427 + (handle).ha_fid.xfs_fid_len) 425 + (handle).ha_fid.xfs_fid_len)
428 426
429#define XFS_HANDLE_CMP(h1, h2) memcmp(h1, h2, sizeof(xfs_handle_t)) 427/*
430
431#define FSHSIZE sizeof(fsid_t)
432
433/*
434 * Flags for going down operation 428 * Flags for going down operation
435 */ 429 */
436#define XFS_FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */ 430#define XFS_FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 33164a85aa9d..a446e5a115c6 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -458,7 +458,7 @@ nextag:
458 */ 458 */
459 if (XFS_FORCED_SHUTDOWN(mp)) { 459 if (XFS_FORCED_SHUTDOWN(mp)) {
460 up_read(&mp->m_peraglock); 460 up_read(&mp->m_peraglock);
461 return (xfs_buf_t *)0; 461 return NULL;
462 } 462 }
463 agno++; 463 agno++;
464 if (agno >= agcount) 464 if (agno >= agcount)
@@ -466,7 +466,7 @@ nextag:
466 if (agno == pagno) { 466 if (agno == pagno) {
467 if (flags == 0) { 467 if (flags == 0) {
468 up_read(&mp->m_peraglock); 468 up_read(&mp->m_peraglock);
469 return (xfs_buf_t *)0; 469 return NULL;
470 } 470 }
471 flags = 0; 471 flags = 0;
472 } 472 }
@@ -529,10 +529,10 @@ xfs_dialloc(
529 int offset; /* index of inode in chunk */ 529 int offset; /* index of inode in chunk */
530 xfs_agino_t pagino; /* parent's a.g. relative inode # */ 530 xfs_agino_t pagino; /* parent's a.g. relative inode # */
531 xfs_agnumber_t pagno; /* parent's allocation group number */ 531 xfs_agnumber_t pagno; /* parent's allocation group number */
532 xfs_inobt_rec_t rec; /* inode allocation record */ 532 xfs_inobt_rec_incore_t rec; /* inode allocation record */
533 xfs_agnumber_t tagno; /* testing allocation group number */ 533 xfs_agnumber_t tagno; /* testing allocation group number */
534 xfs_btree_cur_t *tcur; /* temp cursor */ 534 xfs_btree_cur_t *tcur; /* temp cursor */
535 xfs_inobt_rec_t trec; /* temp inode allocation record */ 535 xfs_inobt_rec_incore_t trec; /* temp inode allocation record */
536 536
537 537
538 if (*IO_agbp == NULL) { 538 if (*IO_agbp == NULL) {
@@ -945,7 +945,7 @@ xfs_difree(
945 int ilen; /* inodes in an inode cluster */ 945 int ilen; /* inodes in an inode cluster */
946 xfs_mount_t *mp; /* mount structure for filesystem */ 946 xfs_mount_t *mp; /* mount structure for filesystem */
947 int off; /* offset of inode in inode chunk */ 947 int off; /* offset of inode in inode chunk */
948 xfs_inobt_rec_t rec; /* btree record */ 948 xfs_inobt_rec_incore_t rec; /* btree record */
949 949
950 mp = tp->t_mountp; 950 mp = tp->t_mountp;
951 951
@@ -1195,6 +1195,7 @@ xfs_dilocate(
1195 "(0x%llx)", 1195 "(0x%llx)",
1196 ino, XFS_AGINO_TO_INO(mp, agno, agino)); 1196 ino, XFS_AGINO_TO_INO(mp, agno, agino));
1197 } 1197 }
1198 xfs_stack_trace();
1198#endif /* DEBUG */ 1199#endif /* DEBUG */
1199 return XFS_ERROR(EINVAL); 1200 return XFS_ERROR(EINVAL);
1200 } 1201 }
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 616eeeb6953e..8cdeeaf8632b 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -568,7 +568,7 @@ xfs_inobt_insrec(
568 /* 568 /*
569 * Make a key out of the record data to be inserted, and save it. 569 * Make a key out of the record data to be inserted, and save it.
570 */ 570 */
571 key.ir_startino = recp->ir_startino; /* INT_: direct copy */ 571 key.ir_startino = recp->ir_startino;
572 optr = ptr = cur->bc_ptrs[level]; 572 optr = ptr = cur->bc_ptrs[level];
573 /* 573 /*
574 * If we're off the left edge, return failure. 574 * If we're off the left edge, return failure.
@@ -600,7 +600,7 @@ xfs_inobt_insrec(
600 } 600 }
601#endif 601#endif
602 nbno = NULLAGBLOCK; 602 nbno = NULLAGBLOCK;
603 ncur = (xfs_btree_cur_t *)0; 603 ncur = NULL;
604 /* 604 /*
605 * If the block is full, we can't insert the new entry until we 605 * If the block is full, we can't insert the new entry until we
606 * make the block un-full. 606 * make the block un-full.
@@ -641,7 +641,7 @@ xfs_inobt_insrec(
641 return error; 641 return error;
642#endif 642#endif
643 ptr = cur->bc_ptrs[level]; 643 ptr = cur->bc_ptrs[level];
644 nrec.ir_startino = nkey.ir_startino; /* INT_: direct copy */ 644 nrec.ir_startino = nkey.ir_startino;
645 } else { 645 } else {
646 /* 646 /*
647 * Otherwise the insert fails. 647 * Otherwise the insert fails.
@@ -681,7 +681,7 @@ xfs_inobt_insrec(
681 if ((error = xfs_btree_check_sptr(cur, *bnop, level))) 681 if ((error = xfs_btree_check_sptr(cur, *bnop, level)))
682 return error; 682 return error;
683#endif 683#endif
684 kp[ptr - 1] = key; /* INT_: struct copy */ 684 kp[ptr - 1] = key;
685 pp[ptr - 1] = cpu_to_be32(*bnop); 685 pp[ptr - 1] = cpu_to_be32(*bnop);
686 numrecs++; 686 numrecs++;
687 block->bb_numrecs = cpu_to_be16(numrecs); 687 block->bb_numrecs = cpu_to_be16(numrecs);
@@ -698,7 +698,7 @@ xfs_inobt_insrec(
698 * Now stuff the new record in, bump numrecs 698 * Now stuff the new record in, bump numrecs
699 * and log the new data. 699 * and log the new data.
700 */ 700 */
701 rp[ptr - 1] = *recp; /* INT_: struct copy */ 701 rp[ptr - 1] = *recp;
702 numrecs++; 702 numrecs++;
703 block->bb_numrecs = cpu_to_be16(numrecs); 703 block->bb_numrecs = cpu_to_be16(numrecs);
704 xfs_inobt_log_recs(cur, bp, ptr, numrecs); 704 xfs_inobt_log_recs(cur, bp, ptr, numrecs);
@@ -731,7 +731,7 @@ xfs_inobt_insrec(
731 */ 731 */
732 *bnop = nbno; 732 *bnop = nbno;
733 if (nbno != NULLAGBLOCK) { 733 if (nbno != NULLAGBLOCK) {
734 *recp = nrec; /* INT_: struct copy */ 734 *recp = nrec;
735 *curp = ncur; 735 *curp = ncur;
736 } 736 }
737 *stat = 1; 737 *stat = 1;
@@ -878,7 +878,7 @@ xfs_inobt_lookup(
878 */ 878 */
879 bp = cur->bc_bufs[level]; 879 bp = cur->bc_bufs[level];
880 if (bp && XFS_BUF_ADDR(bp) != d) 880 if (bp && XFS_BUF_ADDR(bp) != d)
881 bp = (xfs_buf_t *)0; 881 bp = NULL;
882 if (!bp) { 882 if (!bp) {
883 /* 883 /*
884 * Need to get a new buffer. Read it, then 884 * Need to get a new buffer. Read it, then
@@ -950,12 +950,12 @@ xfs_inobt_lookup(
950 xfs_inobt_key_t *kkp; 950 xfs_inobt_key_t *kkp;
951 951
952 kkp = kkbase + keyno - 1; 952 kkp = kkbase + keyno - 1;
953 startino = INT_GET(kkp->ir_startino, ARCH_CONVERT); 953 startino = be32_to_cpu(kkp->ir_startino);
954 } else { 954 } else {
955 xfs_inobt_rec_t *krp; 955 xfs_inobt_rec_t *krp;
956 956
957 krp = krbase + keyno - 1; 957 krp = krbase + keyno - 1;
958 startino = INT_GET(krp->ir_startino, ARCH_CONVERT); 958 startino = be32_to_cpu(krp->ir_startino);
959 } 959 }
960 /* 960 /*
961 * Compute difference to get next direction. 961 * Compute difference to get next direction.
@@ -1117,7 +1117,7 @@ xfs_inobt_lshift(
1117 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*rpp), level))) 1117 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*rpp), level)))
1118 return error; 1118 return error;
1119#endif 1119#endif
1120 *lpp = *rpp; /* INT_: no-change copy */ 1120 *lpp = *rpp;
1121 xfs_inobt_log_ptrs(cur, lbp, nrec, nrec); 1121 xfs_inobt_log_ptrs(cur, lbp, nrec, nrec);
1122 } 1122 }
1123 /* 1123 /*
@@ -1160,7 +1160,7 @@ xfs_inobt_lshift(
1160 } else { 1160 } else {
1161 memmove(rrp, rrp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp)); 1161 memmove(rrp, rrp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1162 xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); 1162 xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1163 key.ir_startino = rrp->ir_startino; /* INT_: direct copy */ 1163 key.ir_startino = rrp->ir_startino;
1164 rkp = &key; 1164 rkp = &key;
1165 } 1165 }
1166 /* 1166 /*
@@ -1297,13 +1297,13 @@ xfs_inobt_newroot(
1297 */ 1297 */
1298 kp = XFS_INOBT_KEY_ADDR(new, 1, cur); 1298 kp = XFS_INOBT_KEY_ADDR(new, 1, cur);
1299 if (be16_to_cpu(left->bb_level) > 0) { 1299 if (be16_to_cpu(left->bb_level) > 0) {
1300 kp[0] = *XFS_INOBT_KEY_ADDR(left, 1, cur); /* INT_: struct copy */ 1300 kp[0] = *XFS_INOBT_KEY_ADDR(left, 1, cur);
1301 kp[1] = *XFS_INOBT_KEY_ADDR(right, 1, cur); /* INT_: struct copy */ 1301 kp[1] = *XFS_INOBT_KEY_ADDR(right, 1, cur);
1302 } else { 1302 } else {
1303 rp = XFS_INOBT_REC_ADDR(left, 1, cur); 1303 rp = XFS_INOBT_REC_ADDR(left, 1, cur);
1304 INT_COPY(kp[0].ir_startino, rp->ir_startino, ARCH_CONVERT); 1304 kp[0].ir_startino = rp->ir_startino;
1305 rp = XFS_INOBT_REC_ADDR(right, 1, cur); 1305 rp = XFS_INOBT_REC_ADDR(right, 1, cur);
1306 INT_COPY(kp[1].ir_startino, rp->ir_startino, ARCH_CONVERT); 1306 kp[1].ir_startino = rp->ir_startino;
1307 } 1307 }
1308 xfs_inobt_log_keys(cur, nbp, 1, 2); 1308 xfs_inobt_log_keys(cur, nbp, 1, 2);
1309 /* 1309 /*
@@ -1410,8 +1410,8 @@ xfs_inobt_rshift(
1410 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*lpp), level))) 1410 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*lpp), level)))
1411 return error; 1411 return error;
1412#endif 1412#endif
1413 *rkp = *lkp; /* INT_: no change copy */ 1413 *rkp = *lkp;
1414 *rpp = *lpp; /* INT_: no change copy */ 1414 *rpp = *lpp;
1415 xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); 1415 xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1416 xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); 1416 xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1417 } else { 1417 } else {
@@ -1420,7 +1420,7 @@ xfs_inobt_rshift(
1420 memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp)); 1420 memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1421 *rrp = *lrp; 1421 *rrp = *lrp;
1422 xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); 1422 xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1423 key.ir_startino = rrp->ir_startino; /* INT_: direct copy */ 1423 key.ir_startino = rrp->ir_startino;
1424 rkp = &key; 1424 rkp = &key;
1425 } 1425 }
1426 /* 1426 /*
@@ -1559,7 +1559,7 @@ xfs_inobt_split(
1559 rrp = XFS_INOBT_REC_ADDR(right, 1, cur); 1559 rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
1560 memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp)); 1560 memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1561 xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); 1561 xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1562 keyp->ir_startino = rrp->ir_startino; /* INT_: direct copy */ 1562 keyp->ir_startino = rrp->ir_startino;
1563 } 1563 }
1564 /* 1564 /*
1565 * Find the left block number by looking in the buffer. 1565 * Find the left block number by looking in the buffer.
@@ -1813,9 +1813,9 @@ xfs_inobt_get_rec(
1813 * Point to the record and extract its data. 1813 * Point to the record and extract its data.
1814 */ 1814 */
1815 rec = XFS_INOBT_REC_ADDR(block, ptr, cur); 1815 rec = XFS_INOBT_REC_ADDR(block, ptr, cur);
1816 *ino = INT_GET(rec->ir_startino, ARCH_CONVERT); 1816 *ino = be32_to_cpu(rec->ir_startino);
1817 *fcnt = INT_GET(rec->ir_freecount, ARCH_CONVERT); 1817 *fcnt = be32_to_cpu(rec->ir_freecount);
1818 *free = INT_GET(rec->ir_free, ARCH_CONVERT); 1818 *free = be64_to_cpu(rec->ir_free);
1819 *stat = 1; 1819 *stat = 1;
1820 return 0; 1820 return 0;
1821} 1821}
@@ -1930,10 +1930,10 @@ xfs_inobt_insert(
1930 1930
1931 level = 0; 1931 level = 0;
1932 nbno = NULLAGBLOCK; 1932 nbno = NULLAGBLOCK;
1933 INT_SET(nrec.ir_startino, ARCH_CONVERT, cur->bc_rec.i.ir_startino); 1933 nrec.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
1934 INT_SET(nrec.ir_freecount, ARCH_CONVERT, cur->bc_rec.i.ir_freecount); 1934 nrec.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
1935 INT_SET(nrec.ir_free, ARCH_CONVERT, cur->bc_rec.i.ir_free); 1935 nrec.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
1936 ncur = (xfs_btree_cur_t *)0; 1936 ncur = NULL;
1937 pcur = cur; 1937 pcur = cur;
1938 /* 1938 /*
1939 * Loop going up the tree, starting at the leaf level. 1939 * Loop going up the tree, starting at the leaf level.
@@ -1965,7 +1965,7 @@ xfs_inobt_insert(
1965 */ 1965 */
1966 if (ncur) { 1966 if (ncur) {
1967 pcur = ncur; 1967 pcur = ncur;
1968 ncur = (xfs_btree_cur_t *)0; 1968 ncur = NULL;
1969 } 1969 }
1970 } while (nbno != NULLAGBLOCK); 1970 } while (nbno != NULLAGBLOCK);
1971 *stat = i; 1971 *stat = i;
@@ -2060,9 +2060,9 @@ xfs_inobt_update(
2060 /* 2060 /*
2061 * Fill in the new contents and log them. 2061 * Fill in the new contents and log them.
2062 */ 2062 */
2063 INT_SET(rp->ir_startino, ARCH_CONVERT, ino); 2063 rp->ir_startino = cpu_to_be32(ino);
2064 INT_SET(rp->ir_freecount, ARCH_CONVERT, fcnt); 2064 rp->ir_freecount = cpu_to_be32(fcnt);
2065 INT_SET(rp->ir_free, ARCH_CONVERT, free); 2065 rp->ir_free = cpu_to_be64(free);
2066 xfs_inobt_log_recs(cur, bp, ptr, ptr); 2066 xfs_inobt_log_recs(cur, bp, ptr, ptr);
2067 /* 2067 /*
2068 * Updating first record in leaf. Pass new key value up to our parent. 2068 * Updating first record in leaf. Pass new key value up to our parent.
@@ -2070,7 +2070,7 @@ xfs_inobt_update(
2070 if (ptr == 1) { 2070 if (ptr == 1) {
2071 xfs_inobt_key_t key; /* key containing [ino] */ 2071 xfs_inobt_key_t key; /* key containing [ino] */
2072 2072
2073 INT_SET(key.ir_startino, ARCH_CONVERT, ino); 2073 key.ir_startino = cpu_to_be32(ino);
2074 if ((error = xfs_inobt_updkey(cur, &key, 1))) 2074 if ((error = xfs_inobt_updkey(cur, &key, 1)))
2075 return error; 2075 return error;
2076 } 2076 }
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index ae3904cb1ee8..2c0e49893ff7 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -47,19 +47,24 @@ static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
47/* 47/*
48 * Data record structure 48 * Data record structure
49 */ 49 */
50typedef struct xfs_inobt_rec 50typedef struct xfs_inobt_rec {
51{ 51 __be32 ir_startino; /* starting inode number */
52 __be32 ir_freecount; /* count of free inodes (set bits) */
53 __be64 ir_free; /* free inode mask */
54} xfs_inobt_rec_t;
55
56typedef struct xfs_inobt_rec_incore {
52 xfs_agino_t ir_startino; /* starting inode number */ 57 xfs_agino_t ir_startino; /* starting inode number */
53 __int32_t ir_freecount; /* count of free inodes (set bits) */ 58 __int32_t ir_freecount; /* count of free inodes (set bits) */
54 xfs_inofree_t ir_free; /* free inode mask */ 59 xfs_inofree_t ir_free; /* free inode mask */
55} xfs_inobt_rec_t; 60} xfs_inobt_rec_incore_t;
61
56 62
57/* 63/*
58 * Key structure 64 * Key structure
59 */ 65 */
60typedef struct xfs_inobt_key 66typedef struct xfs_inobt_key {
61{ 67 __be32 ir_startino; /* starting inode number */
62 xfs_agino_t ir_startino; /* starting inode number */
63} xfs_inobt_key_t; 68} xfs_inobt_key_t;
64 69
65/* btree pointer type */ 70/* btree pointer type */
@@ -77,7 +82,7 @@ typedef struct xfs_btree_sblock xfs_inobt_block_t;
77#define XFS_INOBT_IS_FREE(rp,i) \ 82#define XFS_INOBT_IS_FREE(rp,i) \
78 (((rp)->ir_free & XFS_INOBT_MASK(i)) != 0) 83 (((rp)->ir_free & XFS_INOBT_MASK(i)) != 0)
79#define XFS_INOBT_IS_FREE_DISK(rp,i) \ 84#define XFS_INOBT_IS_FREE_DISK(rp,i) \
80 ((INT_GET((rp)->ir_free,ARCH_CONVERT) & XFS_INOBT_MASK(i)) != 0) 85 ((be64_to_cpu((rp)->ir_free) & XFS_INOBT_MASK(i)) != 0)
81#define XFS_INOBT_SET_FREE(rp,i) ((rp)->ir_free |= XFS_INOBT_MASK(i)) 86#define XFS_INOBT_SET_FREE(rp,i) ((rp)->ir_free |= XFS_INOBT_MASK(i))
82#define XFS_INOBT_CLR_FREE(rp,i) ((rp)->ir_free &= ~XFS_INOBT_MASK(i)) 87#define XFS_INOBT_CLR_FREE(rp,i) ((rp)->ir_free &= ~XFS_INOBT_MASK(i))
83 88
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 0724df7fabb7..b73d216ecaf9 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -50,7 +50,7 @@ void
50xfs_ihash_init(xfs_mount_t *mp) 50xfs_ihash_init(xfs_mount_t *mp)
51{ 51{
52 __uint64_t icount; 52 __uint64_t icount;
53 uint i, flags = KM_SLEEP | KM_MAYFAIL; 53 uint i;
54 54
55 if (!mp->m_ihsize) { 55 if (!mp->m_ihsize) {
56 icount = mp->m_maxicount ? mp->m_maxicount : 56 icount = mp->m_maxicount ? mp->m_maxicount :
@@ -61,14 +61,13 @@ xfs_ihash_init(xfs_mount_t *mp)
61 (64 * NBPP) / sizeof(xfs_ihash_t)); 61 (64 * NBPP) / sizeof(xfs_ihash_t));
62 } 62 }
63 63
64 while (!(mp->m_ihash = (xfs_ihash_t *)kmem_zalloc(mp->m_ihsize * 64 mp->m_ihash = kmem_zalloc_greedy(&mp->m_ihsize,
65 sizeof(xfs_ihash_t), flags))) { 65 NBPC * sizeof(xfs_ihash_t),
66 if ((mp->m_ihsize >>= 1) <= NBPP) 66 mp->m_ihsize * sizeof(xfs_ihash_t),
67 flags = KM_SLEEP; 67 KM_SLEEP | KM_MAYFAIL | KM_LARGE);
68 } 68 mp->m_ihsize /= sizeof(xfs_ihash_t);
69 for (i = 0; i < mp->m_ihsize; i++) { 69 for (i = 0; i < mp->m_ihsize; i++)
70 rwlock_init(&(mp->m_ihash[i].ih_lock)); 70 rwlock_init(&(mp->m_ihash[i].ih_lock));
71 }
72} 71}
73 72
74/* 73/*
@@ -77,7 +76,7 @@ xfs_ihash_init(xfs_mount_t *mp)
77void 76void
78xfs_ihash_free(xfs_mount_t *mp) 77xfs_ihash_free(xfs_mount_t *mp)
79{ 78{
80 kmem_free(mp->m_ihash, mp->m_ihsize*sizeof(xfs_ihash_t)); 79 kmem_free(mp->m_ihash, mp->m_ihsize * sizeof(xfs_ihash_t));
81 mp->m_ihash = NULL; 80 mp->m_ihash = NULL;
82} 81}
83 82
@@ -95,7 +94,7 @@ xfs_chash_init(xfs_mount_t *mp)
95 mp->m_chsize = min_t(uint, mp->m_chsize, mp->m_ihsize); 94 mp->m_chsize = min_t(uint, mp->m_chsize, mp->m_ihsize);
96 mp->m_chash = (xfs_chash_t *)kmem_zalloc(mp->m_chsize 95 mp->m_chash = (xfs_chash_t *)kmem_zalloc(mp->m_chsize
97 * sizeof(xfs_chash_t), 96 * sizeof(xfs_chash_t),
98 KM_SLEEP); 97 KM_SLEEP | KM_LARGE);
99 for (i = 0; i < mp->m_chsize; i++) { 98 for (i = 0; i < mp->m_chsize; i++) {
100 spinlock_init(&mp->m_chash[i].ch_lock,"xfshash"); 99 spinlock_init(&mp->m_chash[i].ch_lock,"xfshash");
101 } 100 }
@@ -244,7 +243,9 @@ again:
244 243
245 XFS_STATS_INC(xs_ig_found); 244 XFS_STATS_INC(xs_ig_found);
246 245
246 spin_lock(&ip->i_flags_lock);
247 ip->i_flags &= ~XFS_IRECLAIMABLE; 247 ip->i_flags &= ~XFS_IRECLAIMABLE;
248 spin_unlock(&ip->i_flags_lock);
248 version = ih->ih_version; 249 version = ih->ih_version;
249 read_unlock(&ih->ih_lock); 250 read_unlock(&ih->ih_lock);
250 xfs_ihash_promote(ih, ip, version); 251 xfs_ihash_promote(ih, ip, version);
@@ -290,15 +291,17 @@ again:
290 291
291finish_inode: 292finish_inode:
292 if (ip->i_d.di_mode == 0) { 293 if (ip->i_d.di_mode == 0) {
293 if (!(flags & IGET_CREATE)) 294 if (!(flags & XFS_IGET_CREATE))
294 return ENOENT; 295 return ENOENT;
295 xfs_iocore_inode_reinit(ip); 296 xfs_iocore_inode_reinit(ip);
296 } 297 }
297 298
298 if (lock_flags != 0) 299 if (lock_flags != 0)
299 xfs_ilock(ip, lock_flags); 300 xfs_ilock(ip, lock_flags);
300 301
302 spin_lock(&ip->i_flags_lock);
301 ip->i_flags &= ~XFS_ISTALE; 303 ip->i_flags &= ~XFS_ISTALE;
304 spin_unlock(&ip->i_flags_lock);
302 305
303 vn_trace_exit(vp, "xfs_iget.found", 306 vn_trace_exit(vp, "xfs_iget.found",
304 (inst_t *)__return_address); 307 (inst_t *)__return_address);
@@ -320,21 +323,20 @@ finish_inode:
320 * Read the disk inode attributes into a new inode structure and get 323 * Read the disk inode attributes into a new inode structure and get
321 * a new vnode for it. This should also initialize i_ino and i_mount. 324 * a new vnode for it. This should also initialize i_ino and i_mount.
322 */ 325 */
323 error = xfs_iread(mp, tp, ino, &ip, bno); 326 error = xfs_iread(mp, tp, ino, &ip, bno,
324 if (error) { 327 (flags & XFS_IGET_BULKSTAT) ? XFS_IMAP_BULKSTAT : 0);
328 if (error)
325 return error; 329 return error;
326 }
327 330
328 vn_trace_exit(vp, "xfs_iget.alloc", (inst_t *)__return_address); 331 vn_trace_exit(vp, "xfs_iget.alloc", (inst_t *)__return_address);
329 332
330 xfs_inode_lock_init(ip, vp); 333 xfs_inode_lock_init(ip, vp);
331 xfs_iocore_inode_init(ip); 334 xfs_iocore_inode_init(ip);
332 335
333 if (lock_flags != 0) { 336 if (lock_flags)
334 xfs_ilock(ip, lock_flags); 337 xfs_ilock(ip, lock_flags);
335 } 338
336 339 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
337 if ((ip->i_d.di_mode == 0) && !(flags & IGET_CREATE)) {
338 xfs_idestroy(ip); 340 xfs_idestroy(ip);
339 return ENOENT; 341 return ENOENT;
340 } 342 }
@@ -369,7 +371,9 @@ finish_inode:
369 ih->ih_next = ip; 371 ih->ih_next = ip;
370 ip->i_udquot = ip->i_gdquot = NULL; 372 ip->i_udquot = ip->i_gdquot = NULL;
371 ih->ih_version++; 373 ih->ih_version++;
374 spin_lock(&ip->i_flags_lock);
372 ip->i_flags |= XFS_INEW; 375 ip->i_flags |= XFS_INEW;
376 spin_unlock(&ip->i_flags_lock);
373 377
374 write_unlock(&ih->ih_lock); 378 write_unlock(&ih->ih_lock);
375 379
@@ -548,7 +552,7 @@ xfs_inode_lock_init(
548 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", vp->v_number); 552 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", vp->v_number);
549 init_waitqueue_head(&ip->i_ipin_wait); 553 init_waitqueue_head(&ip->i_ipin_wait);
550 atomic_set(&ip->i_pincount, 0); 554 atomic_set(&ip->i_pincount, 0);
551 init_sema(&ip->i_flock, 1, "xfsfino", vp->v_number); 555 initnsema(&ip->i_flock, 1, "xfsfino");
552} 556}
553 557
554/* 558/*
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 1f8ecff8553a..c27d7d495aa0 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -854,7 +854,8 @@ xfs_iread(
854 xfs_trans_t *tp, 854 xfs_trans_t *tp,
855 xfs_ino_t ino, 855 xfs_ino_t ino,
856 xfs_inode_t **ipp, 856 xfs_inode_t **ipp,
857 xfs_daddr_t bno) 857 xfs_daddr_t bno,
858 uint imap_flags)
858{ 859{
859 xfs_buf_t *bp; 860 xfs_buf_t *bp;
860 xfs_dinode_t *dip; 861 xfs_dinode_t *dip;
@@ -866,6 +867,7 @@ xfs_iread(
866 ip = kmem_zone_zalloc(xfs_inode_zone, KM_SLEEP); 867 ip = kmem_zone_zalloc(xfs_inode_zone, KM_SLEEP);
867 ip->i_ino = ino; 868 ip->i_ino = ino;
868 ip->i_mount = mp; 869 ip->i_mount = mp;
870 spin_lock_init(&ip->i_flags_lock);
869 871
870 /* 872 /*
871 * Get pointer's to the on-disk inode and the buffer containing it. 873 * Get pointer's to the on-disk inode and the buffer containing it.
@@ -874,7 +876,7 @@ xfs_iread(
874 * return NULL as well. Set i_blkno to 0 so that xfs_itobp() will 876 * return NULL as well. Set i_blkno to 0 so that xfs_itobp() will
875 * know that this is a new incore inode. 877 * know that this is a new incore inode.
876 */ 878 */
877 error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, 0); 879 error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags);
878 if (error) { 880 if (error) {
879 kmem_zone_free(xfs_inode_zone, ip); 881 kmem_zone_free(xfs_inode_zone, ip);
880 return error; 882 return error;
@@ -1113,7 +1115,7 @@ xfs_ialloc(
1113 * to prevent others from looking at until we're done. 1115 * to prevent others from looking at until we're done.
1114 */ 1116 */
1115 error = xfs_trans_iget(tp->t_mountp, tp, ino, 1117 error = xfs_trans_iget(tp->t_mountp, tp, ino,
1116 IGET_CREATE, XFS_ILOCK_EXCL, &ip); 1118 XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip);
1117 if (error != 0) { 1119 if (error != 0) {
1118 return error; 1120 return error;
1119 } 1121 }
@@ -2213,7 +2215,9 @@ xfs_ifree_cluster(
2213 2215
2214 if (ip == free_ip) { 2216 if (ip == free_ip) {
2215 if (xfs_iflock_nowait(ip)) { 2217 if (xfs_iflock_nowait(ip)) {
2218 spin_lock(&ip->i_flags_lock);
2216 ip->i_flags |= XFS_ISTALE; 2219 ip->i_flags |= XFS_ISTALE;
2220 spin_unlock(&ip->i_flags_lock);
2217 2221
2218 if (xfs_inode_clean(ip)) { 2222 if (xfs_inode_clean(ip)) {
2219 xfs_ifunlock(ip); 2223 xfs_ifunlock(ip);
@@ -2227,7 +2231,9 @@ xfs_ifree_cluster(
2227 2231
2228 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2232 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2229 if (xfs_iflock_nowait(ip)) { 2233 if (xfs_iflock_nowait(ip)) {
2234 spin_lock(&ip->i_flags_lock);
2230 ip->i_flags |= XFS_ISTALE; 2235 ip->i_flags |= XFS_ISTALE;
2236 spin_unlock(&ip->i_flags_lock);
2231 2237
2232 if (xfs_inode_clean(ip)) { 2238 if (xfs_inode_clean(ip)) {
2233 xfs_ifunlock(ip); 2239 xfs_ifunlock(ip);
@@ -2257,7 +2263,9 @@ xfs_ifree_cluster(
2257 AIL_LOCK(mp,s); 2263 AIL_LOCK(mp,s);
2258 iip->ili_flush_lsn = iip->ili_item.li_lsn; 2264 iip->ili_flush_lsn = iip->ili_item.li_lsn;
2259 AIL_UNLOCK(mp, s); 2265 AIL_UNLOCK(mp, s);
2266 spin_lock(&iip->ili_inode->i_flags_lock);
2260 iip->ili_inode->i_flags |= XFS_ISTALE; 2267 iip->ili_inode->i_flags |= XFS_ISTALE;
2268 spin_unlock(&iip->ili_inode->i_flags_lock);
2261 pre_flushed++; 2269 pre_flushed++;
2262 } 2270 }
2263 lip = lip->li_bio_list; 2271 lip = lip->li_bio_list;
@@ -2753,19 +2761,29 @@ xfs_iunpin(
2753 * call as the inode reclaim may be blocked waiting for 2761 * call as the inode reclaim may be blocked waiting for
2754 * the inode to become unpinned. 2762 * the inode to become unpinned.
2755 */ 2763 */
2764 struct inode *inode = NULL;
2765
2766 spin_lock(&ip->i_flags_lock);
2756 if (!(ip->i_flags & (XFS_IRECLAIM|XFS_IRECLAIMABLE))) { 2767 if (!(ip->i_flags & (XFS_IRECLAIM|XFS_IRECLAIMABLE))) {
2757 bhv_vnode_t *vp = XFS_ITOV_NULL(ip); 2768 bhv_vnode_t *vp = XFS_ITOV_NULL(ip);
2758 2769
2759 /* make sync come back and flush this inode */ 2770 /* make sync come back and flush this inode */
2760 if (vp) { 2771 if (vp) {
2761 struct inode *inode = vn_to_inode(vp); 2772 inode = vn_to_inode(vp);
2762 2773
2763 if (!(inode->i_state & 2774 if (!(inode->i_state &
2764 (I_NEW|I_FREEING|I_CLEAR))) 2775 (I_NEW|I_FREEING|I_CLEAR))) {
2765 mark_inode_dirty_sync(inode); 2776 inode = igrab(inode);
2777 if (inode)
2778 mark_inode_dirty_sync(inode);
2779 } else
2780 inode = NULL;
2766 } 2781 }
2767 } 2782 }
2783 spin_unlock(&ip->i_flags_lock);
2768 wake_up(&ip->i_ipin_wait); 2784 wake_up(&ip->i_ipin_wait);
2785 if (inode)
2786 iput(inode);
2769 } 2787 }
2770} 2788}
2771 2789
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index d10b76ed1e5b..e96eb0835fe6 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -267,6 +267,7 @@ typedef struct xfs_inode {
267 sema_t i_flock; /* inode flush lock */ 267 sema_t i_flock; /* inode flush lock */
268 atomic_t i_pincount; /* inode pin count */ 268 atomic_t i_pincount; /* inode pin count */
269 wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */ 269 wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */
270 spinlock_t i_flags_lock; /* inode i_flags lock */
270#ifdef HAVE_REFCACHE 271#ifdef HAVE_REFCACHE
271 struct xfs_inode **i_refcache; /* ptr to entry in ref cache */ 272 struct xfs_inode **i_refcache; /* ptr to entry in ref cache */
272 struct xfs_inode *i_release; /* inode to unref */ 273 struct xfs_inode *i_release; /* inode to unref */
@@ -389,11 +390,14 @@ typedef struct xfs_inode {
389 (((vfsp)->vfs_flag & VFS_GRPID) || ((pip)->i_d.di_mode & S_ISGID)) 390 (((vfsp)->vfs_flag & VFS_GRPID) || ((pip)->i_d.di_mode & S_ISGID))
390 391
391/* 392/*
392 * xfs_iget.c prototypes. 393 * Flags for xfs_iget()
393 */ 394 */
395#define XFS_IGET_CREATE 0x1
396#define XFS_IGET_BULKSTAT 0x2
394 397
395#define IGET_CREATE 1 398/*
396 399 * xfs_iget.c prototypes.
400 */
397void xfs_ihash_init(struct xfs_mount *); 401void xfs_ihash_init(struct xfs_mount *);
398void xfs_ihash_free(struct xfs_mount *); 402void xfs_ihash_free(struct xfs_mount *);
399void xfs_chash_init(struct xfs_mount *); 403void xfs_chash_init(struct xfs_mount *);
@@ -425,7 +429,7 @@ int xfs_itobp(struct xfs_mount *, struct xfs_trans *,
425 xfs_inode_t *, xfs_dinode_t **, struct xfs_buf **, 429 xfs_inode_t *, xfs_dinode_t **, struct xfs_buf **,
426 xfs_daddr_t, uint); 430 xfs_daddr_t, uint);
427int xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, 431int xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
428 xfs_inode_t **, xfs_daddr_t); 432 xfs_inode_t **, xfs_daddr_t, uint);
429int xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int); 433int xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int);
430int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, 434int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t,
431 xfs_nlink_t, xfs_dev_t, struct cred *, xfs_prid_t, 435 xfs_nlink_t, xfs_dev_t, struct cred *, xfs_prid_t,
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index f8e80d8e7237..a7a92251eb56 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -743,21 +743,6 @@ xfs_inode_item_committed(
743} 743}
744 744
745/* 745/*
746 * The transaction with the inode locked has aborted. The inode
747 * must not be dirty within the transaction (unless we're forcibly
748 * shutting down). We simply unlock just as if the transaction
749 * had been cancelled.
750 */
751STATIC void
752xfs_inode_item_abort(
753 xfs_inode_log_item_t *iip)
754{
755 xfs_inode_item_unlock(iip);
756 return;
757}
758
759
760/*
761 * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK 746 * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK
762 * failed to get the inode flush lock but did get the inode locked SHARED. 747 * failed to get the inode flush lock but did get the inode locked SHARED.
763 * Here we're trying to see if the inode buffer is incore, and if so whether it's 748 * Here we're trying to see if the inode buffer is incore, and if so whether it's
@@ -915,7 +900,6 @@ STATIC struct xfs_item_ops xfs_inode_item_ops = {
915 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) 900 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
916 xfs_inode_item_committed, 901 xfs_inode_item_committed,
917 .iop_push = (void(*)(xfs_log_item_t*))xfs_inode_item_push, 902 .iop_push = (void(*)(xfs_log_item_t*))xfs_inode_item_push,
918 .iop_abort = (void(*)(xfs_log_item_t*))xfs_inode_item_abort,
919 .iop_pushbuf = (void(*)(xfs_log_item_t*))xfs_inode_item_pushbuf, 903 .iop_pushbuf = (void(*)(xfs_log_item_t*))xfs_inode_item_pushbuf,
920 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) 904 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
921 xfs_inode_item_committing 905 xfs_inode_item_committing
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 5db6cd1b4cf3..bfe92ea17952 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -25,52 +25,54 @@
25 * must be added on to the end. 25 * must be added on to the end.
26 */ 26 */
27typedef struct xfs_inode_log_format { 27typedef struct xfs_inode_log_format {
28 unsigned short ilf_type; /* inode log item type */ 28 __uint16_t ilf_type; /* inode log item type */
29 unsigned short ilf_size; /* size of this item */ 29 __uint16_t ilf_size; /* size of this item */
30 uint ilf_fields; /* flags for fields logged */ 30 __uint32_t ilf_fields; /* flags for fields logged */
31 ushort ilf_asize; /* size of attr d/ext/root */ 31 __uint16_t ilf_asize; /* size of attr d/ext/root */
32 ushort ilf_dsize; /* size of data/ext/root */ 32 __uint16_t ilf_dsize; /* size of data/ext/root */
33 xfs_ino_t ilf_ino; /* inode number */ 33 __uint64_t ilf_ino; /* inode number */
34 union { 34 union {
35 xfs_dev_t ilfu_rdev; /* rdev value for dev inode*/ 35 __uint32_t ilfu_rdev; /* rdev value for dev inode*/
36 uuid_t ilfu_uuid; /* mount point value */ 36 uuid_t ilfu_uuid; /* mount point value */
37 } ilf_u; 37 } ilf_u;
38 __int64_t ilf_blkno; /* blkno of inode buffer */ 38 __int64_t ilf_blkno; /* blkno of inode buffer */
39 int ilf_len; /* len of inode buffer */ 39 __int32_t ilf_len; /* len of inode buffer */
40 int ilf_boffset; /* off of inode in buffer */ 40 __int32_t ilf_boffset; /* off of inode in buffer */
41} xfs_inode_log_format_t; 41} xfs_inode_log_format_t;
42 42
43#ifndef HAVE_FORMAT32
43typedef struct xfs_inode_log_format_32 { 44typedef struct xfs_inode_log_format_32 {
44 unsigned short ilf_type; /* 16: inode log item type */ 45 __uint16_t ilf_type; /* inode log item type */
45 unsigned short ilf_size; /* 16: size of this item */ 46 __uint16_t ilf_size; /* size of this item */
46 uint ilf_fields; /* 32: flags for fields logged */ 47 __uint32_t ilf_fields; /* flags for fields logged */
47 ushort ilf_asize; /* 32: size of attr d/ext/root */ 48 __uint16_t ilf_asize; /* size of attr d/ext/root */
48 ushort ilf_dsize; /* 32: size of data/ext/root */ 49 __uint16_t ilf_dsize; /* size of data/ext/root */
49 xfs_ino_t ilf_ino; /* 64: inode number */ 50 __uint64_t ilf_ino; /* inode number */
50 union { 51 union {
51 xfs_dev_t ilfu_rdev; /* 32: rdev value for dev inode*/ 52 __uint32_t ilfu_rdev; /* rdev value for dev inode*/
52 uuid_t ilfu_uuid; /* 128: mount point value */ 53 uuid_t ilfu_uuid; /* mount point value */
53 } ilf_u; 54 } ilf_u;
54 __int64_t ilf_blkno; /* 64: blkno of inode buffer */ 55 __int64_t ilf_blkno; /* blkno of inode buffer */
55 int ilf_len; /* 32: len of inode buffer */ 56 __int32_t ilf_len; /* len of inode buffer */
56 int ilf_boffset; /* 32: off of inode in buffer */ 57 __int32_t ilf_boffset; /* off of inode in buffer */
57} __attribute__((packed)) xfs_inode_log_format_32_t; 58} __attribute__((packed)) xfs_inode_log_format_32_t;
59#endif
58 60
59typedef struct xfs_inode_log_format_64 { 61typedef struct xfs_inode_log_format_64 {
60 unsigned short ilf_type; /* 16: inode log item type */ 62 __uint16_t ilf_type; /* inode log item type */
61 unsigned short ilf_size; /* 16: size of this item */ 63 __uint16_t ilf_size; /* size of this item */
62 uint ilf_fields; /* 32: flags for fields logged */ 64 __uint32_t ilf_fields; /* flags for fields logged */
63 ushort ilf_asize; /* 32: size of attr d/ext/root */ 65 __uint16_t ilf_asize; /* size of attr d/ext/root */
64 ushort ilf_dsize; /* 32: size of data/ext/root */ 66 __uint16_t ilf_dsize; /* size of data/ext/root */
65 __uint32_t ilf_pad; /* 32: pad for 64 bit boundary */ 67 __uint32_t ilf_pad; /* pad for 64 bit boundary */
66 xfs_ino_t ilf_ino; /* 64: inode number */ 68 __uint64_t ilf_ino; /* inode number */
67 union { 69 union {
68 xfs_dev_t ilfu_rdev; /* 32: rdev value for dev inode*/ 70 __uint32_t ilfu_rdev; /* rdev value for dev inode*/
69 uuid_t ilfu_uuid; /* 128: mount point value */ 71 uuid_t ilfu_uuid; /* mount point value */
70 } ilf_u; 72 } ilf_u;
71 __int64_t ilf_blkno; /* 64: blkno of inode buffer */ 73 __int64_t ilf_blkno; /* blkno of inode buffer */
72 int ilf_len; /* 32: len of inode buffer */ 74 __int32_t ilf_len; /* len of inode buffer */
73 int ilf_boffset; /* 32: off of inode in buffer */ 75 __int32_t ilf_boffset; /* off of inode in buffer */
74} xfs_inode_log_format_64_t; 76} xfs_inode_log_format_64_t;
75 77
76/* 78/*
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index f1949c16df15..19655124da78 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -398,6 +398,23 @@ xfs_flush_space(
398 return 1; 398 return 1;
399} 399}
400 400
401STATIC int
402xfs_cmn_err_fsblock_zero(
403 xfs_inode_t *ip,
404 xfs_bmbt_irec_t *imap)
405{
406 xfs_cmn_err(XFS_PTAG_FSBLOCK_ZERO, CE_ALERT, ip->i_mount,
407 "Access to block zero in inode %llu "
408 "start_block: %llx start_off: %llx "
409 "blkcnt: %llx extent-state: %x\n",
410 (unsigned long long)ip->i_ino,
411 (unsigned long long)imap->br_startblock,
412 (unsigned long long)imap->br_startoff,
413 (unsigned long long)imap->br_blockcount,
414 imap->br_state);
415 return EFSCORRUPTED;
416}
417
401int 418int
402xfs_iomap_write_direct( 419xfs_iomap_write_direct(
403 xfs_inode_t *ip, 420 xfs_inode_t *ip,
@@ -536,23 +553,17 @@ xfs_iomap_write_direct(
536 * Copy any maps to caller's array and return any error. 553 * Copy any maps to caller's array and return any error.
537 */ 554 */
538 if (nimaps == 0) { 555 if (nimaps == 0) {
539 error = (ENOSPC); 556 error = ENOSPC;
557 goto error_out;
558 }
559
560 if (unlikely(!imap.br_startblock && !(io->io_flags & XFS_IOCORE_RT))) {
561 error = xfs_cmn_err_fsblock_zero(ip, &imap);
540 goto error_out; 562 goto error_out;
541 } 563 }
542 564
543 *ret_imap = imap; 565 *ret_imap = imap;
544 *nmaps = 1; 566 *nmaps = 1;
545 if ( !(io->io_flags & XFS_IOCORE_RT) && !ret_imap->br_startblock) {
546 cmn_err(CE_PANIC,"Access to block zero: fs <%s> inode: %lld "
547 "start_block : %llx start_off : %llx blkcnt : %llx "
548 "extent-state : %x \n",
549 (ip->i_mount)->m_fsname,
550 (long long)ip->i_ino,
551 (unsigned long long)ret_imap->br_startblock,
552 (unsigned long long)ret_imap->br_startoff,
553 (unsigned long long)ret_imap->br_blockcount,
554 ret_imap->br_state);
555 }
556 return 0; 567 return 0;
557 568
558error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ 569error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
@@ -715,17 +726,8 @@ retry:
715 goto retry; 726 goto retry;
716 } 727 }
717 728
718 if (!(io->io_flags & XFS_IOCORE_RT) && !ret_imap->br_startblock) { 729 if (unlikely(!imap[0].br_startblock && !(io->io_flags & XFS_IOCORE_RT)))
719 cmn_err(CE_PANIC,"Access to block zero: fs <%s> inode: %lld " 730 return xfs_cmn_err_fsblock_zero(ip, &imap[0]);
720 "start_block : %llx start_off : %llx blkcnt : %llx "
721 "extent-state : %x \n",
722 (ip->i_mount)->m_fsname,
723 (long long)ip->i_ino,
724 (unsigned long long)ret_imap->br_startblock,
725 (unsigned long long)ret_imap->br_startoff,
726 (unsigned long long)ret_imap->br_blockcount,
727 ret_imap->br_state);
728 }
729 731
730 *ret_imap = imap[0]; 732 *ret_imap = imap[0];
731 *nmaps = 1; 733 *nmaps = 1;
@@ -853,24 +855,10 @@ xfs_iomap_write_allocate(
853 * See if we were able to allocate an extent that 855 * See if we were able to allocate an extent that
854 * covers at least part of the callers request 856 * covers at least part of the callers request
855 */ 857 */
856
857 for (i = 0; i < nimaps; i++) { 858 for (i = 0; i < nimaps; i++) {
858 if (!(io->io_flags & XFS_IOCORE_RT) && 859 if (unlikely(!imap[i].br_startblock &&
859 !imap[i].br_startblock) { 860 !(io->io_flags & XFS_IOCORE_RT)))
860 cmn_err(CE_PANIC,"Access to block zero: " 861 return xfs_cmn_err_fsblock_zero(ip, &imap[i]);
861 "fs <%s> inode: %lld "
862 "start_block : %llx start_off : %llx "
863 "blkcnt : %llx extent-state : %x \n",
864 (ip->i_mount)->m_fsname,
865 (long long)ip->i_ino,
866 (unsigned long long)
867 imap[i].br_startblock,
868 (unsigned long long)
869 imap[i].br_startoff,
870 (unsigned long long)
871 imap[i].br_blockcount,
872 imap[i].br_state);
873 }
874 if ((offset_fsb >= imap[i].br_startoff) && 862 if ((offset_fsb >= imap[i].br_startoff) &&
875 (offset_fsb < (imap[i].br_startoff + 863 (offset_fsb < (imap[i].br_startoff +
876 imap[i].br_blockcount))) { 864 imap[i].br_blockcount))) {
@@ -941,7 +929,7 @@ xfs_iomap_write_unwritten(
941 XFS_WRITE_LOG_COUNT); 929 XFS_WRITE_LOG_COUNT);
942 if (error) { 930 if (error) {
943 xfs_trans_cancel(tp, 0); 931 xfs_trans_cancel(tp, 0);
944 goto error0; 932 return XFS_ERROR(error);
945 } 933 }
946 934
947 xfs_ilock(ip, XFS_ILOCK_EXCL); 935 xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -967,19 +955,11 @@ xfs_iomap_write_unwritten(
967 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); 955 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
968 xfs_iunlock(ip, XFS_ILOCK_EXCL); 956 xfs_iunlock(ip, XFS_ILOCK_EXCL);
969 if (error) 957 if (error)
970 goto error0; 958 return XFS_ERROR(error);
971 959
972 if ( !(io->io_flags & XFS_IOCORE_RT) && !imap.br_startblock) { 960 if (unlikely(!imap.br_startblock &&
973 cmn_err(CE_PANIC,"Access to block zero: fs <%s> " 961 !(io->io_flags & XFS_IOCORE_RT)))
974 "inode: %lld start_block : %llx start_off : " 962 return xfs_cmn_err_fsblock_zero(ip, &imap);
975 "%llx blkcnt : %llx extent-state : %x \n",
976 (ip->i_mount)->m_fsname,
977 (long long)ip->i_ino,
978 (unsigned long long)imap.br_startblock,
979 (unsigned long long)imap.br_startoff,
980 (unsigned long long)imap.br_blockcount,
981 imap.br_state);
982 }
983 963
984 if ((numblks_fsb = imap.br_blockcount) == 0) { 964 if ((numblks_fsb = imap.br_blockcount) == 0) {
985 /* 965 /*
@@ -999,6 +979,5 @@ error_on_bmapi_transaction:
999 xfs_bmap_cancel(&free_list); 979 xfs_bmap_cancel(&free_list);
1000 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT)); 980 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT));
1001 xfs_iunlock(ip, XFS_ILOCK_EXCL); 981 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1002error0:
1003 return XFS_ERROR(error); 982 return XFS_ERROR(error);
1004} 983}
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 46249e4d1fea..7775ddc0b3c6 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -39,6 +39,16 @@
39#include "xfs_error.h" 39#include "xfs_error.h"
40#include "xfs_btree.h" 40#include "xfs_btree.h"
41 41
42int
43xfs_internal_inum(
44 xfs_mount_t *mp,
45 xfs_ino_t ino)
46{
47 return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino ||
48 (XFS_SB_VERSION_HASQUOTA(&mp->m_sb) &&
49 (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino)));
50}
51
42STATIC int 52STATIC int
43xfs_bulkstat_one_iget( 53xfs_bulkstat_one_iget(
44 xfs_mount_t *mp, /* mount point for filesystem */ 54 xfs_mount_t *mp, /* mount point for filesystem */
@@ -52,7 +62,8 @@ xfs_bulkstat_one_iget(
52 bhv_vnode_t *vp; 62 bhv_vnode_t *vp;
53 int error; 63 int error;
54 64
55 error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, bno); 65 error = xfs_iget(mp, NULL, ino,
66 XFS_IGET_BULKSTAT, XFS_ILOCK_SHARED, &ip, bno);
56 if (error) { 67 if (error) {
57 *stat = BULKSTAT_RV_NOTHING; 68 *stat = BULKSTAT_RV_NOTHING;
58 return error; 69 return error;
@@ -212,17 +223,12 @@ xfs_bulkstat_one(
212 xfs_dinode_t *dip; /* dinode inode pointer */ 223 xfs_dinode_t *dip; /* dinode inode pointer */
213 224
214 dip = (xfs_dinode_t *)dibuff; 225 dip = (xfs_dinode_t *)dibuff;
226 *stat = BULKSTAT_RV_NOTHING;
215 227
216 if (!buffer || ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino || 228 if (!buffer || xfs_internal_inum(mp, ino))
217 (XFS_SB_VERSION_HASQUOTA(&mp->m_sb) &&
218 (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino))) {
219 *stat = BULKSTAT_RV_NOTHING;
220 return XFS_ERROR(EINVAL); 229 return XFS_ERROR(EINVAL);
221 } 230 if (ubsize < sizeof(*buf))
222 if (ubsize < sizeof(*buf)) {
223 *stat = BULKSTAT_RV_NOTHING;
224 return XFS_ERROR(ENOMEM); 231 return XFS_ERROR(ENOMEM);
225 }
226 232
227 buf = kmem_alloc(sizeof(*buf), KM_SLEEP); 233 buf = kmem_alloc(sizeof(*buf), KM_SLEEP);
228 234
@@ -238,8 +244,7 @@ xfs_bulkstat_one(
238 } 244 }
239 245
240 if (copy_to_user(buffer, buf, sizeof(*buf))) { 246 if (copy_to_user(buffer, buf, sizeof(*buf))) {
241 *stat = BULKSTAT_RV_NOTHING; 247 error = EFAULT;
242 error = EFAULT;
243 goto out_free; 248 goto out_free;
244 } 249 }
245 250
@@ -253,6 +258,46 @@ xfs_bulkstat_one(
253} 258}
254 259
255/* 260/*
261 * Test to see whether we can use the ondisk inode directly, based
262 * on the given bulkstat flags, filling in dipp accordingly.
263 * Returns zero if the inode is dodgey.
264 */
265STATIC int
266xfs_bulkstat_use_dinode(
267 xfs_mount_t *mp,
268 int flags,
269 xfs_buf_t *bp,
270 int clustidx,
271 xfs_dinode_t **dipp)
272{
273 xfs_dinode_t *dip;
274 unsigned int aformat;
275
276 *dipp = NULL;
277 if (!bp || (flags & BULKSTAT_FG_IGET))
278 return 1;
279 dip = (xfs_dinode_t *)
280 xfs_buf_offset(bp, clustidx << mp->m_sb.sb_inodelog);
281 if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT) != XFS_DINODE_MAGIC ||
282 !XFS_DINODE_GOOD_VERSION(
283 INT_GET(dip->di_core.di_version, ARCH_CONVERT)))
284 return 0;
285 if (flags & BULKSTAT_FG_QUICK) {
286 *dipp = dip;
287 return 1;
288 }
289 /* BULKSTAT_FG_INLINE: if attr fork is local, or not there, use it */
290 aformat = INT_GET(dip->di_core.di_aformat, ARCH_CONVERT);
291 if ((XFS_CFORK_Q(&dip->di_core) == 0) ||
292 (aformat == XFS_DINODE_FMT_LOCAL) ||
293 (aformat == XFS_DINODE_FMT_EXTENTS && !dip->di_core.di_anextents)) {
294 *dipp = dip;
295 return 1;
296 }
297 return 1;
298}
299
300/*
256 * Return stat information in bulk (by-inode) for the filesystem. 301 * Return stat information in bulk (by-inode) for the filesystem.
257 */ 302 */
258int /* error status */ 303int /* error status */
@@ -284,10 +329,11 @@ xfs_bulkstat(
284 xfs_agino_t gino; /* current btree rec's start inode */ 329 xfs_agino_t gino; /* current btree rec's start inode */
285 int i; /* loop index */ 330 int i; /* loop index */
286 int icount; /* count of inodes good in irbuf */ 331 int icount; /* count of inodes good in irbuf */
332 size_t irbsize; /* size of irec buffer in bytes */
287 xfs_ino_t ino; /* inode number (filesystem) */ 333 xfs_ino_t ino; /* inode number (filesystem) */
288 xfs_inobt_rec_t *irbp; /* current irec buffer pointer */ 334 xfs_inobt_rec_incore_t *irbp; /* current irec buffer pointer */
289 xfs_inobt_rec_t *irbuf; /* start of irec buffer */ 335 xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */
290 xfs_inobt_rec_t *irbufend; /* end of good irec buffer entries */ 336 xfs_inobt_rec_incore_t *irbufend; /* end of good irec buffer entries */
291 xfs_ino_t lastino=0; /* last inode number returned */ 337 xfs_ino_t lastino=0; /* last inode number returned */
292 int nbcluster; /* # of blocks in a cluster */ 338 int nbcluster; /* # of blocks in a cluster */
293 int nicluster; /* # of inodes in a cluster */ 339 int nicluster; /* # of inodes in a cluster */
@@ -328,13 +374,10 @@ xfs_bulkstat(
328 (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog); 374 (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog);
329 nimask = ~(nicluster - 1); 375 nimask = ~(nicluster - 1);
330 nbcluster = nicluster >> mp->m_sb.sb_inopblog; 376 nbcluster = nicluster >> mp->m_sb.sb_inopblog;
331 /* 377 irbuf = kmem_zalloc_greedy(&irbsize, NBPC, NBPC * 4,
332 * Allocate a page-sized buffer for inode btree records. 378 KM_SLEEP | KM_MAYFAIL | KM_LARGE);
333 * We could try allocating something smaller, but for normal 379 nirbuf = irbsize / sizeof(*irbuf);
334 * calls we'll always (potentially) need the whole page. 380
335 */
336 irbuf = kmem_alloc(NBPC, KM_SLEEP);
337 nirbuf = NBPC / sizeof(*irbuf);
338 /* 381 /*
339 * Loop over the allocation groups, starting from the last 382 * Loop over the allocation groups, starting from the last
340 * inode returned; 0 means start of the allocation group. 383 * inode returned; 0 means start of the allocation group.
@@ -358,7 +401,7 @@ xfs_bulkstat(
358 * Allocate and initialize a btree cursor for ialloc btree. 401 * Allocate and initialize a btree cursor for ialloc btree.
359 */ 402 */
360 cur = xfs_btree_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_INO, 403 cur = xfs_btree_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_INO,
361 (xfs_inode_t *)0, 0); 404 (xfs_inode_t *)0, 0);
362 irbp = irbuf; 405 irbp = irbuf;
363 irbufend = irbuf + nirbuf; 406 irbufend = irbuf + nirbuf;
364 end_of_ag = 0; 407 end_of_ag = 0;
@@ -395,9 +438,9 @@ xfs_bulkstat(
395 gcnt++; 438 gcnt++;
396 } 439 }
397 gfree |= XFS_INOBT_MASKN(0, chunkidx); 440 gfree |= XFS_INOBT_MASKN(0, chunkidx);
398 INT_SET(irbp->ir_startino, ARCH_CONVERT, gino); 441 irbp->ir_startino = gino;
399 INT_SET(irbp->ir_freecount, ARCH_CONVERT, gcnt); 442 irbp->ir_freecount = gcnt;
400 INT_SET(irbp->ir_free, ARCH_CONVERT, gfree); 443 irbp->ir_free = gfree;
401 irbp++; 444 irbp++;
402 agino = gino + XFS_INODES_PER_CHUNK; 445 agino = gino + XFS_INODES_PER_CHUNK;
403 icount = XFS_INODES_PER_CHUNK - gcnt; 446 icount = XFS_INODES_PER_CHUNK - gcnt;
@@ -451,11 +494,27 @@ xfs_bulkstat(
451 } 494 }
452 /* 495 /*
453 * If this chunk has any allocated inodes, save it. 496 * If this chunk has any allocated inodes, save it.
497 * Also start read-ahead now for this chunk.
454 */ 498 */
455 if (gcnt < XFS_INODES_PER_CHUNK) { 499 if (gcnt < XFS_INODES_PER_CHUNK) {
456 INT_SET(irbp->ir_startino, ARCH_CONVERT, gino); 500 /*
457 INT_SET(irbp->ir_freecount, ARCH_CONVERT, gcnt); 501 * Loop over all clusters in the next chunk.
458 INT_SET(irbp->ir_free, ARCH_CONVERT, gfree); 502 * Do a readahead if there are any allocated
503 * inodes in that cluster.
504 */
505 for (agbno = XFS_AGINO_TO_AGBNO(mp, gino),
506 chunkidx = 0;
507 chunkidx < XFS_INODES_PER_CHUNK;
508 chunkidx += nicluster,
509 agbno += nbcluster) {
510 if (XFS_INOBT_MASKN(chunkidx,
511 nicluster) & ~gfree)
512 xfs_btree_reada_bufs(mp, agno,
513 agbno, nbcluster);
514 }
515 irbp->ir_startino = gino;
516 irbp->ir_freecount = gcnt;
517 irbp->ir_free = gfree;
459 irbp++; 518 irbp++;
460 icount += XFS_INODES_PER_CHUNK - gcnt; 519 icount += XFS_INODES_PER_CHUNK - gcnt;
461 } 520 }
@@ -479,33 +538,11 @@ xfs_bulkstat(
479 for (irbp = irbuf; 538 for (irbp = irbuf;
480 irbp < irbufend && ubleft >= statstruct_size; irbp++) { 539 irbp < irbufend && ubleft >= statstruct_size; irbp++) {
481 /* 540 /*
482 * Read-ahead the next chunk's worth of inodes.
483 */
484 if (&irbp[1] < irbufend) {
485 /*
486 * Loop over all clusters in the next chunk.
487 * Do a readahead if there are any allocated
488 * inodes in that cluster.
489 */
490 for (agbno = XFS_AGINO_TO_AGBNO(mp,
491 INT_GET(irbp[1].ir_startino, ARCH_CONVERT)),
492 chunkidx = 0;
493 chunkidx < XFS_INODES_PER_CHUNK;
494 chunkidx += nicluster,
495 agbno += nbcluster) {
496 if (XFS_INOBT_MASKN(chunkidx,
497 nicluster) &
498 ~(INT_GET(irbp[1].ir_free, ARCH_CONVERT)))
499 xfs_btree_reada_bufs(mp, agno,
500 agbno, nbcluster);
501 }
502 }
503 /*
504 * Now process this chunk of inodes. 541 * Now process this chunk of inodes.
505 */ 542 */
506 for (agino = INT_GET(irbp->ir_startino, ARCH_CONVERT), chunkidx = 0, clustidx = 0; 543 for (agino = irbp->ir_startino, chunkidx = clustidx = 0;
507 ubleft > 0 && 544 ubleft > 0 &&
508 INT_GET(irbp->ir_freecount, ARCH_CONVERT) < XFS_INODES_PER_CHUNK; 545 irbp->ir_freecount < XFS_INODES_PER_CHUNK;
509 chunkidx++, clustidx++, agino++) { 546 chunkidx++, clustidx++, agino++) {
510 ASSERT(chunkidx < XFS_INODES_PER_CHUNK); 547 ASSERT(chunkidx < XFS_INODES_PER_CHUNK);
511 /* 548 /*
@@ -525,11 +562,12 @@ xfs_bulkstat(
525 */ 562 */
526 if ((chunkidx & (nicluster - 1)) == 0) { 563 if ((chunkidx & (nicluster - 1)) == 0) {
527 agbno = XFS_AGINO_TO_AGBNO(mp, 564 agbno = XFS_AGINO_TO_AGBNO(mp,
528 INT_GET(irbp->ir_startino, ARCH_CONVERT)) + 565 irbp->ir_startino) +
529 ((chunkidx & nimask) >> 566 ((chunkidx & nimask) >>
530 mp->m_sb.sb_inopblog); 567 mp->m_sb.sb_inopblog);
531 568
532 if (flags & BULKSTAT_FG_QUICK) { 569 if (flags & (BULKSTAT_FG_QUICK |
570 BULKSTAT_FG_INLINE)) {
533 ino = XFS_AGINO_TO_INO(mp, agno, 571 ino = XFS_AGINO_TO_INO(mp, agno,
534 agino); 572 agino);
535 bno = XFS_AGB_TO_DADDR(mp, agno, 573 bno = XFS_AGB_TO_DADDR(mp, agno,
@@ -543,6 +581,7 @@ xfs_bulkstat(
543 KM_SLEEP); 581 KM_SLEEP);
544 ip->i_ino = ino; 582 ip->i_ino = ino;
545 ip->i_mount = mp; 583 ip->i_mount = mp;
584 spin_lock_init(&ip->i_flags_lock);
546 if (bp) 585 if (bp)
547 xfs_buf_relse(bp); 586 xfs_buf_relse(bp);
548 error = xfs_itobp(mp, NULL, ip, 587 error = xfs_itobp(mp, NULL, ip,
@@ -564,30 +603,34 @@ xfs_bulkstat(
564 /* 603 /*
565 * Skip if this inode is free. 604 * Skip if this inode is free.
566 */ 605 */
567 if (XFS_INOBT_MASK(chunkidx) & INT_GET(irbp->ir_free, ARCH_CONVERT)) 606 if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free)
568 continue; 607 continue;
569 /* 608 /*
570 * Count used inodes as free so we can tell 609 * Count used inodes as free so we can tell
571 * when the chunk is used up. 610 * when the chunk is used up.
572 */ 611 */
573 INT_MOD(irbp->ir_freecount, ARCH_CONVERT, +1); 612 irbp->ir_freecount++;
574 ino = XFS_AGINO_TO_INO(mp, agno, agino); 613 ino = XFS_AGINO_TO_INO(mp, agno, agino);
575 bno = XFS_AGB_TO_DADDR(mp, agno, agbno); 614 bno = XFS_AGB_TO_DADDR(mp, agno, agbno);
576 if (flags & BULKSTAT_FG_QUICK) { 615 if (!xfs_bulkstat_use_dinode(mp, flags, bp,
577 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 616 clustidx, &dip))
578 (clustidx << mp->m_sb.sb_inodelog)); 617 continue;
579 618 /*
580 if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT) 619 * If we need to do an iget, cannot hold bp.
581 != XFS_DINODE_MAGIC 620 * Drop it, until starting the next cluster.
582 || !XFS_DINODE_GOOD_VERSION( 621 */
583 INT_GET(dip->di_core.di_version, ARCH_CONVERT))) 622 if ((flags & BULKSTAT_FG_INLINE) && !dip) {
584 continue; 623 if (bp)
624 xfs_buf_relse(bp);
625 bp = NULL;
585 } 626 }
586 627
587 /* 628 /*
588 * Get the inode and fill in a single buffer. 629 * Get the inode and fill in a single buffer.
589 * BULKSTAT_FG_QUICK uses dip to fill it in. 630 * BULKSTAT_FG_QUICK uses dip to fill it in.
590 * BULKSTAT_FG_IGET uses igets. 631 * BULKSTAT_FG_IGET uses igets.
632 * BULKSTAT_FG_INLINE uses dip if we have an
633 * inline attr fork, else igets.
591 * See: xfs_bulkstat_one & xfs_dm_bulkstat_one. 634 * See: xfs_bulkstat_one & xfs_dm_bulkstat_one.
592 * This is also used to count inodes/blks, etc 635 * This is also used to count inodes/blks, etc
593 * in xfs_qm_quotacheck. 636 * in xfs_qm_quotacheck.
@@ -597,8 +640,15 @@ xfs_bulkstat(
597 ubleft, private_data, 640 ubleft, private_data,
598 bno, &ubused, dip, &fmterror); 641 bno, &ubused, dip, &fmterror);
599 if (fmterror == BULKSTAT_RV_NOTHING) { 642 if (fmterror == BULKSTAT_RV_NOTHING) {
600 if (error == ENOMEM) 643 if (error == EFAULT) {
644 ubleft = 0;
645 rval = error;
646 break;
647 }
648 else if (error == ENOMEM)
601 ubleft = 0; 649 ubleft = 0;
650 else
651 lastino = ino;
602 continue; 652 continue;
603 } 653 }
604 if (fmterror == BULKSTAT_RV_GIVEUP) { 654 if (fmterror == BULKSTAT_RV_GIVEUP) {
@@ -633,7 +683,7 @@ xfs_bulkstat(
633 /* 683 /*
634 * Done, we're either out of filesystem or space to put the data. 684 * Done, we're either out of filesystem or space to put the data.
635 */ 685 */
636 kmem_free(irbuf, NBPC); 686 kmem_free(irbuf, irbsize);
637 *ubcountp = ubelem; 687 *ubcountp = ubelem;
638 if (agno >= mp->m_sb.sb_agcount) { 688 if (agno >= mp->m_sb.sb_agcount) {
639 /* 689 /*
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index be5f12e07d22..f25a28862a17 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -36,15 +36,16 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount *mp,
36/* 36/*
37 * Values for stat return value. 37 * Values for stat return value.
38 */ 38 */
39#define BULKSTAT_RV_NOTHING 0 39#define BULKSTAT_RV_NOTHING 0
40#define BULKSTAT_RV_DIDONE 1 40#define BULKSTAT_RV_DIDONE 1
41#define BULKSTAT_RV_GIVEUP 2 41#define BULKSTAT_RV_GIVEUP 2
42 42
43/* 43/*
44 * Values for bulkstat flag argument. 44 * Values for bulkstat flag argument.
45 */ 45 */
46#define BULKSTAT_FG_IGET 0x1 /* Go through the buffer cache */ 46#define BULKSTAT_FG_IGET 0x1 /* Go through the buffer cache */
47#define BULKSTAT_FG_QUICK 0x2 /* No iget, walk the dinode cluster */ 47#define BULKSTAT_FG_QUICK 0x2 /* No iget, walk the dinode cluster */
48#define BULKSTAT_FG_INLINE 0x4 /* No iget if inline attrs */
48 49
49/* 50/*
50 * Return stat information in bulk (by-inode) for the filesystem. 51 * Return stat information in bulk (by-inode) for the filesystem.
@@ -80,6 +81,11 @@ xfs_bulkstat_one(
80 void *dibuff, 81 void *dibuff,
81 int *stat); 82 int *stat);
82 83
84int
85xfs_internal_inum(
86 xfs_mount_t *mp,
87 xfs_ino_t ino);
88
83int /* error status */ 89int /* error status */
84xfs_inumbers( 90xfs_inumbers(
85 xfs_mount_t *mp, /* mount point for filesystem */ 91 xfs_mount_t *mp, /* mount point for filesystem */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 21ac1a67e3e0..c48bf61f17bd 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -617,7 +617,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
617 reg[0].i_len = sizeof(magic); 617 reg[0].i_len = sizeof(magic);
618 XLOG_VEC_SET_TYPE(&reg[0], XLOG_REG_TYPE_UNMOUNT); 618 XLOG_VEC_SET_TYPE(&reg[0], XLOG_REG_TYPE_UNMOUNT);
619 619
620 error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0, 0); 620 error = xfs_log_reserve(mp, 600, 1, &tic,
621 XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE);
621 if (!error) { 622 if (!error) {
622 /* remove inited flag */ 623 /* remove inited flag */
623 ((xlog_ticket_t *)tic)->t_flags = 0; 624 ((xlog_ticket_t *)tic)->t_flags = 0;
@@ -655,8 +656,11 @@ xfs_log_unmount_write(xfs_mount_t *mp)
655 } else { 656 } else {
656 LOG_UNLOCK(log, s); 657 LOG_UNLOCK(log, s);
657 } 658 }
658 if (tic) 659 if (tic) {
660 xlog_trace_loggrant(log, tic, "unmount rec");
661 xlog_ungrant_log_space(log, tic);
659 xlog_state_put_ticket(log, tic); 662 xlog_state_put_ticket(log, tic);
663 }
660 } else { 664 } else {
661 /* 665 /*
662 * We're already in forced_shutdown mode, couldn't 666 * We're already in forced_shutdown mode, couldn't
@@ -1196,7 +1200,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1196 kmem_zalloc(sizeof(xlog_in_core_t), KM_SLEEP); 1200 kmem_zalloc(sizeof(xlog_in_core_t), KM_SLEEP);
1197 iclog = *iclogp; 1201 iclog = *iclogp;
1198 iclog->hic_data = (xlog_in_core_2_t *) 1202 iclog->hic_data = (xlog_in_core_2_t *)
1199 kmem_zalloc(iclogsize, KM_SLEEP); 1203 kmem_zalloc(iclogsize, KM_SLEEP | KM_LARGE);
1200 1204
1201 iclog->ic_prev = prev_iclog; 1205 iclog->ic_prev = prev_iclog;
1202 prev_iclog = iclog; 1206 prev_iclog = iclog;
@@ -2212,9 +2216,13 @@ xlog_state_do_callback(
2212 2216
2213 iclog = iclog->ic_next; 2217 iclog = iclog->ic_next;
2214 } while (first_iclog != iclog); 2218 } while (first_iclog != iclog);
2215 if (repeats && (repeats % 10) == 0) { 2219
2220 if (repeats > 5000) {
2221 flushcnt += repeats;
2222 repeats = 0;
2216 xfs_fs_cmn_err(CE_WARN, log->l_mp, 2223 xfs_fs_cmn_err(CE_WARN, log->l_mp,
2217 "xlog_state_do_callback: looping %d", repeats); 2224 "%s: possible infinite loop (%d iterations)",
2225 __FUNCTION__, flushcnt);
2218 } 2226 }
2219 } while (!ioerrors && loopdidcallbacks); 2227 } while (!ioerrors && loopdidcallbacks);
2220 2228
@@ -2246,6 +2254,7 @@ xlog_state_do_callback(
2246 } 2254 }
2247#endif 2255#endif
2248 2256
2257 flushcnt = 0;
2249 if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) { 2258 if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) {
2250 flushcnt = log->l_flushcnt; 2259 flushcnt = log->l_flushcnt;
2251 log->l_flushcnt = 0; 2260 log->l_flushcnt = 0;
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index eacb3d4987f2..ebbe93f4f97b 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -48,16 +48,10 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
48 */ 48 */
49 49
50/* 50/*
51 * Flags to xfs_log_mount
52 */
53#define XFS_LOG_RECOVER 0x1
54
55/*
56 * Flags to xfs_log_done() 51 * Flags to xfs_log_done()
57 */ 52 */
58#define XFS_LOG_REL_PERM_RESERV 0x1 53#define XFS_LOG_REL_PERM_RESERV 0x1
59 54
60
61/* 55/*
62 * Flags to xfs_log_reserve() 56 * Flags to xfs_log_reserve()
63 * 57 *
@@ -70,8 +64,6 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
70#define XFS_LOG_SLEEP 0x0 64#define XFS_LOG_SLEEP 0x0
71#define XFS_LOG_NOSLEEP 0x1 65#define XFS_LOG_NOSLEEP 0x1
72#define XFS_LOG_PERM_RESERV 0x2 66#define XFS_LOG_PERM_RESERV 0x2
73#define XFS_LOG_RESV_ALL (XFS_LOG_NOSLEEP|XFS_LOG_PERM_RESERV)
74
75 67
76/* 68/*
77 * Flags to xfs_log_force() 69 * Flags to xfs_log_force()
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 34bcbf50789c..9bd3cdf11a87 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -32,7 +32,6 @@ struct xfs_mount;
32#define XLOG_MIN_ICLOGS 2 32#define XLOG_MIN_ICLOGS 2
33#define XLOG_MED_ICLOGS 4 33#define XLOG_MED_ICLOGS 4
34#define XLOG_MAX_ICLOGS 8 34#define XLOG_MAX_ICLOGS 8
35#define XLOG_CALLBACK_SIZE 10
36#define XLOG_HEADER_MAGIC_NUM 0xFEEDbabe /* Invalid cycle number */ 35#define XLOG_HEADER_MAGIC_NUM 0xFEEDbabe /* Invalid cycle number */
37#define XLOG_VERSION_1 1 36#define XLOG_VERSION_1 1
38#define XLOG_VERSION_2 2 /* Large IClogs, Log sunit */ 37#define XLOG_VERSION_2 2 /* Large IClogs, Log sunit */
@@ -149,9 +148,6 @@ struct xfs_mount;
149#define XLOG_WAS_CONT_TRANS 0x08 /* Cont this trans into new region */ 148#define XLOG_WAS_CONT_TRANS 0x08 /* Cont this trans into new region */
150#define XLOG_END_TRANS 0x10 /* End a continued transaction */ 149#define XLOG_END_TRANS 0x10 /* End a continued transaction */
151#define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */ 150#define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */
152#define XLOG_SKIP_TRANS (XLOG_COMMIT_TRANS | XLOG_CONTINUE_TRANS | \
153 XLOG_WAS_CONT_TRANS | XLOG_END_TRANS | \
154 XLOG_UNMOUNT_TRANS)
155 151
156#ifdef __KERNEL__ 152#ifdef __KERNEL__
157/* 153/*
@@ -506,6 +502,12 @@ extern int xlog_bread(xlog_t *, xfs_daddr_t, int, struct xfs_buf *);
506#define XLOG_TRACE_SLEEP_FLUSH 3 502#define XLOG_TRACE_SLEEP_FLUSH 3
507#define XLOG_TRACE_WAKE_FLUSH 4 503#define XLOG_TRACE_WAKE_FLUSH 4
508 504
505/*
506 * Unmount record type is used as a pseudo transaction type for the ticket.
507 * It's value must be outside the range of XFS_TRANS_* values.
508 */
509#define XLOG_UNMOUNT_REC_TYPE (-1U)
510
509#endif /* __KERNEL__ */ 511#endif /* __KERNEL__ */
510 512
511#endif /* __XFS_LOG_PRIV_H__ */ 513#endif /* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index b2bd4be4200a..e5f396ff9a3d 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -331,7 +331,7 @@ typedef struct xfs_mount {
331 xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */ 331 xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */
332 lock_t m_agirotor_lock;/* .. and lock protecting it */ 332 lock_t m_agirotor_lock;/* .. and lock protecting it */
333 xfs_agnumber_t m_maxagi; /* highest inode alloc group */ 333 xfs_agnumber_t m_maxagi; /* highest inode alloc group */
334 uint m_ihsize; /* size of next field */ 334 size_t m_ihsize; /* size of next field */
335 struct xfs_ihash *m_ihash; /* fs private inode hash table*/ 335 struct xfs_ihash *m_ihash; /* fs private inode hash table*/
336 struct xfs_inode *m_inodes; /* active inode list */ 336 struct xfs_inode *m_inodes; /* active inode list */
337 struct list_head m_del_inodes; /* inodes to reclaim */ 337 struct list_head m_del_inodes; /* inodes to reclaim */
@@ -541,7 +541,8 @@ static inline xfs_mount_t *xfs_bhvtom(bhv_desc_t *bdp)
541#define XFS_VFSTOM(vfs) xfs_vfstom(vfs) 541#define XFS_VFSTOM(vfs) xfs_vfstom(vfs)
542static inline xfs_mount_t *xfs_vfstom(bhv_vfs_t *vfs) 542static inline xfs_mount_t *xfs_vfstom(bhv_vfs_t *vfs)
543{ 543{
544 return XFS_BHVTOM(bhv_lookup(VFS_BHVHEAD(vfs), &xfs_vfsops)); 544 return XFS_BHVTOM(bhv_lookup_range(VFS_BHVHEAD(vfs),
545 VFS_POSITION_XFS, VFS_POSITION_XFS));
545} 546}
546 547
547#define XFS_DADDR_TO_AGNO(mp,d) xfs_daddr_to_agno(mp,d) 548#define XFS_DADDR_TO_AGNO(mp,d) xfs_daddr_to_agno(mp,d)
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index acb853b33ebb..9dcb32aa4e2e 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -281,8 +281,6 @@ typedef struct xfs_qoff_logformat {
281 XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\ 281 XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\
282 XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD|\ 282 XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD|\
283 XFS_GQUOTA_ACCT) 283 XFS_GQUOTA_ACCT)
284#define XFS_MOUNT_QUOTA_MASK (XFS_MOUNT_QUOTA_ALL | XFS_UQUOTA_ACTIVE | \
285 XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE)
286 284
287 285
288/* 286/*
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 5a0b678956e0..880c73271c05 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1948,7 +1948,7 @@ xfs_growfs_rt(
1948 */ 1948 */
1949 nrextents = nrblocks; 1949 nrextents = nrblocks;
1950 do_div(nrextents, in->extsize); 1950 do_div(nrextents, in->extsize);
1951 nrbmblocks = roundup_64(nrextents, NBBY * sbp->sb_blocksize); 1951 nrbmblocks = howmany_64(nrextents, NBBY * sbp->sb_blocksize);
1952 nrextslog = xfs_highbit32(nrextents); 1952 nrextslog = xfs_highbit32(nrextents);
1953 nrsumlevels = nrextslog + 1; 1953 nrsumlevels = nrextslog + 1;
1954 nrsumsize = (uint)sizeof(xfs_suminfo_t) * nrsumlevels * nrbmblocks; 1954 nrsumsize = (uint)sizeof(xfs_suminfo_t) * nrsumlevels * nrbmblocks;
@@ -1976,7 +1976,10 @@ xfs_growfs_rt(
1976 if ((error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, 1976 if ((error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks,
1977 mp->m_sb.sb_rsumino))) 1977 mp->m_sb.sb_rsumino)))
1978 return error; 1978 return error;
1979 nmp = NULL; 1979 /*
1980 * Allocate a new (fake) mount/sb.
1981 */
1982 nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP);
1980 /* 1983 /*
1981 * Loop over the bitmap blocks. 1984 * Loop over the bitmap blocks.
1982 * We will do everything one bitmap block at a time. 1985 * We will do everything one bitmap block at a time.
@@ -1987,10 +1990,6 @@ xfs_growfs_rt(
1987 ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0); 1990 ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0);
1988 bmbno < nrbmblocks; 1991 bmbno < nrbmblocks;
1989 bmbno++) { 1992 bmbno++) {
1990 /*
1991 * Allocate a new (fake) mount/sb.
1992 */
1993 nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP);
1994 *nmp = *mp; 1993 *nmp = *mp;
1995 nsbp = &nmp->m_sb; 1994 nsbp = &nmp->m_sb;
1996 /* 1995 /*
@@ -2018,13 +2017,13 @@ xfs_growfs_rt(
2018 cancelflags = 0; 2017 cancelflags = 0;
2019 if ((error = xfs_trans_reserve(tp, 0, 2018 if ((error = xfs_trans_reserve(tp, 0,
2020 XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0))) 2019 XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0)))
2021 goto error_exit; 2020 break;
2022 /* 2021 /*
2023 * Lock out other callers by grabbing the bitmap inode lock. 2022 * Lock out other callers by grabbing the bitmap inode lock.
2024 */ 2023 */
2025 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, 2024 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
2026 XFS_ILOCK_EXCL, &ip))) 2025 XFS_ILOCK_EXCL, &ip)))
2027 goto error_exit; 2026 break;
2028 ASSERT(ip == mp->m_rbmip); 2027 ASSERT(ip == mp->m_rbmip);
2029 /* 2028 /*
2030 * Update the bitmap inode's size. 2029 * Update the bitmap inode's size.
@@ -2038,7 +2037,7 @@ xfs_growfs_rt(
2038 */ 2037 */
2039 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0, 2038 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0,
2040 XFS_ILOCK_EXCL, &ip))) 2039 XFS_ILOCK_EXCL, &ip)))
2041 goto error_exit; 2040 break;
2042 ASSERT(ip == mp->m_rsumip); 2041 ASSERT(ip == mp->m_rsumip);
2043 /* 2042 /*
2044 * Update the summary inode's size. 2043 * Update the summary inode's size.
@@ -2053,7 +2052,7 @@ xfs_growfs_rt(
2053 mp->m_rsumlevels != nmp->m_rsumlevels) { 2052 mp->m_rsumlevels != nmp->m_rsumlevels) {
2054 error = xfs_rtcopy_summary(mp, nmp, tp); 2053 error = xfs_rtcopy_summary(mp, nmp, tp);
2055 if (error) 2054 if (error)
2056 goto error_exit; 2055 break;
2057 } 2056 }
2058 /* 2057 /*
2059 * Update superblock fields. 2058 * Update superblock fields.
@@ -2080,18 +2079,13 @@ xfs_growfs_rt(
2080 error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents, 2079 error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents,
2081 nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno); 2080 nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno);
2082 if (error) 2081 if (error)
2083 goto error_exit; 2082 break;
2084 /* 2083 /*
2085 * Mark more blocks free in the superblock. 2084 * Mark more blocks free in the superblock.
2086 */ 2085 */
2087 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, 2086 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS,
2088 nsbp->sb_rextents - sbp->sb_rextents); 2087 nsbp->sb_rextents - sbp->sb_rextents);
2089 /* 2088 /*
2090 * Free the fake mp structure.
2091 */
2092 kmem_free(nmp, sizeof(*nmp));
2093 nmp = NULL;
2094 /*
2095 * Update mp values into the real mp structure. 2089 * Update mp values into the real mp structure.
2096 */ 2090 */
2097 mp->m_rsumlevels = nrsumlevels; 2091 mp->m_rsumlevels = nrsumlevels;
@@ -2101,15 +2095,15 @@ xfs_growfs_rt(
2101 */ 2095 */
2102 xfs_trans_commit(tp, 0, NULL); 2096 xfs_trans_commit(tp, 0, NULL);
2103 } 2097 }
2104 return 0; 2098
2099 if (error)
2100 xfs_trans_cancel(tp, cancelflags);
2105 2101
2106 /* 2102 /*
2107 * Error paths come here. 2103 * Free the fake mp structure.
2108 */ 2104 */
2109error_exit: 2105 kmem_free(nmp, sizeof(*nmp));
2110 if (nmp) 2106
2111 kmem_free(nmp, sizeof(*nmp));
2112 xfs_trans_cancel(tp, cancelflags);
2113 return error; 2107 return error;
2114} 2108}
2115 2109
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index bf168a91ddb8..467854b45c8f 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -60,10 +60,6 @@ struct xfs_mount;
60 XFS_SB_VERSION_LOGV2BIT | \ 60 XFS_SB_VERSION_LOGV2BIT | \
61 XFS_SB_VERSION_SECTORBIT | \ 61 XFS_SB_VERSION_SECTORBIT | \
62 XFS_SB_VERSION_MOREBITSBIT) 62 XFS_SB_VERSION_MOREBITSBIT)
63#define XFS_SB_VERSION_OKSASHBITS \
64 (XFS_SB_VERSION_NUMBITS | \
65 XFS_SB_VERSION_REALFBITS | \
66 XFS_SB_VERSION_OKSASHFBITS)
67#define XFS_SB_VERSION_OKREALBITS \ 63#define XFS_SB_VERSION_OKREALBITS \
68 (XFS_SB_VERSION_NUMBITS | \ 64 (XFS_SB_VERSION_NUMBITS | \
69 XFS_SB_VERSION_OKREALFBITS | \ 65 XFS_SB_VERSION_OKREALFBITS | \
@@ -81,9 +77,6 @@ struct xfs_mount;
81#define XFS_SB_VERSION2_RESERVED2BIT 0x00000002 77#define XFS_SB_VERSION2_RESERVED2BIT 0x00000002
82#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 78#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004
83#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ 79#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */
84#define XFS_SB_VERSION2_SASHFBITS 0xff000000 /* Mask: features that
85 require changing
86 PROM and SASH */
87 80
88#define XFS_SB_VERSION2_OKREALFBITS \ 81#define XFS_SB_VERSION2_OKREALFBITS \
89 (XFS_SB_VERSION2_ATTR2BIT) 82 (XFS_SB_VERSION2_ATTR2BIT)
@@ -238,12 +231,6 @@ static inline int xfs_sb_good_version(xfs_sb_t *sbp)
238} 231}
239#endif /* __KERNEL__ */ 232#endif /* __KERNEL__ */
240 233
241#define XFS_SB_GOOD_SASH_VERSION(sbp) \
242 ((((sbp)->sb_versionnum >= XFS_SB_VERSION_1) && \
243 ((sbp)->sb_versionnum <= XFS_SB_VERSION_3)) || \
244 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
245 !((sbp)->sb_versionnum & ~XFS_SB_VERSION_OKSASHBITS)))
246
247#define XFS_SB_VERSION_TONEW(v) xfs_sb_version_tonew(v) 234#define XFS_SB_VERSION_TONEW(v) xfs_sb_version_tonew(v)
248static inline unsigned xfs_sb_version_tonew(unsigned v) 235static inline unsigned xfs_sb_version_tonew(unsigned v)
249{ 236{
@@ -461,15 +448,6 @@ static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp)
461 * File system sector to basic block conversions. 448 * File system sector to basic block conversions.
462 */ 449 */
463#define XFS_FSS_TO_BB(mp,sec) ((sec) << (mp)->m_sectbb_log) 450#define XFS_FSS_TO_BB(mp,sec) ((sec) << (mp)->m_sectbb_log)
464#define XFS_BB_TO_FSS(mp,bb) \
465 (((bb) + (XFS_FSS_TO_BB(mp,1) - 1)) >> (mp)->m_sectbb_log)
466#define XFS_BB_TO_FSST(mp,bb) ((bb) >> (mp)->m_sectbb_log)
467
468/*
469 * File system sector to byte conversions.
470 */
471#define XFS_FSS_TO_B(mp,sectno) ((xfs_fsize_t)(sectno) << (mp)->m_sb.sb_sectlog)
472#define XFS_B_TO_FSST(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_sectlog)
473 451
474/* 452/*
475 * File system block to basic block conversions. 453 * File system block to basic block conversions.
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 9dc88b380608..c68e00105d23 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -149,7 +149,6 @@ typedef struct xfs_item_ops {
149 void (*iop_unlock)(xfs_log_item_t *); 149 void (*iop_unlock)(xfs_log_item_t *);
150 xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); 150 xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
151 void (*iop_push)(xfs_log_item_t *); 151 void (*iop_push)(xfs_log_item_t *);
152 void (*iop_abort)(xfs_log_item_t *);
153 void (*iop_pushbuf)(xfs_log_item_t *); 152 void (*iop_pushbuf)(xfs_log_item_t *);
154 void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); 153 void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
155} xfs_item_ops_t; 154} xfs_item_ops_t;
@@ -163,7 +162,6 @@ typedef struct xfs_item_ops {
163#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip) 162#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip)
164#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn) 163#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn)
165#define IOP_PUSH(ip) (*(ip)->li_ops->iop_push)(ip) 164#define IOP_PUSH(ip) (*(ip)->li_ops->iop_push)(ip)
166#define IOP_ABORT(ip) (*(ip)->li_ops->iop_abort)(ip)
167#define IOP_PUSHBUF(ip) (*(ip)->li_ops->iop_pushbuf)(ip) 165#define IOP_PUSHBUF(ip) (*(ip)->li_ops->iop_pushbuf)(ip)
168#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn) 166#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn)
169 167
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 558c87ff0c41..fc39b166d403 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -276,7 +276,7 @@ xfs_trans_update_ail(
276 xfs_mount_t *mp, 276 xfs_mount_t *mp,
277 xfs_log_item_t *lip, 277 xfs_log_item_t *lip,
278 xfs_lsn_t lsn, 278 xfs_lsn_t lsn,
279 unsigned long s) 279 unsigned long s) __releases(mp->m_ail_lock)
280{ 280{
281 xfs_ail_entry_t *ailp; 281 xfs_ail_entry_t *ailp;
282 xfs_log_item_t *dlip=NULL; 282 xfs_log_item_t *dlip=NULL;
@@ -328,7 +328,7 @@ void
328xfs_trans_delete_ail( 328xfs_trans_delete_ail(
329 xfs_mount_t *mp, 329 xfs_mount_t *mp,
330 xfs_log_item_t *lip, 330 xfs_log_item_t *lip,
331 unsigned long s) 331 unsigned long s) __releases(mp->m_ail_lock)
332{ 332{
333 xfs_ail_entry_t *ailp; 333 xfs_ail_entry_t *ailp;
334 xfs_log_item_t *dlip; 334 xfs_log_item_t *dlip;
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 13edab8a9e94..447ac4308c91 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -46,11 +46,13 @@ xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
46/* 46/*
47 * From xfs_trans_ail.c 47 * From xfs_trans_ail.c
48 */ 48 */
49void xfs_trans_update_ail(struct xfs_mount *, 49void xfs_trans_update_ail(struct xfs_mount *mp,
50 struct xfs_log_item *, xfs_lsn_t, 50 struct xfs_log_item *lip, xfs_lsn_t lsn,
51 unsigned long); 51 unsigned long s)
52void xfs_trans_delete_ail(struct xfs_mount *, 52 __releases(mp->m_ail_lock);
53 struct xfs_log_item *, unsigned long); 53void xfs_trans_delete_ail(struct xfs_mount *mp,
54 struct xfs_log_item *lip, unsigned long s)
55 __releases(mp->m_ail_lock);
54struct xfs_log_item *xfs_trans_first_ail(struct xfs_mount *, int *); 56struct xfs_log_item *xfs_trans_first_ail(struct xfs_mount *, int *);
55struct xfs_log_item *xfs_trans_next_ail(struct xfs_mount *, 57struct xfs_log_item *xfs_trans_next_ail(struct xfs_mount *,
56 struct xfs_log_item *, int *, int *); 58 struct xfs_log_item *, int *, int *);
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index a34796e57afb..62336a4cc5a4 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -1922,7 +1922,7 @@ xfs_showargs(
1922 } 1922 }
1923 1923
1924 if (mp->m_flags & XFS_MOUNT_IHASHSIZE) 1924 if (mp->m_flags & XFS_MOUNT_IHASHSIZE)
1925 seq_printf(m, "," MNTOPT_IHASHSIZE "=%d", mp->m_ihsize); 1925 seq_printf(m, "," MNTOPT_IHASHSIZE "=%d", (int)mp->m_ihsize);
1926 1926
1927 if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) 1927 if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
1928 seq_printf(m, "," MNTOPT_ALLOCSIZE "=%dk", 1928 seq_printf(m, "," MNTOPT_ALLOCSIZE "=%dk",
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 23cfa5837728..061e2ffdd1de 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -2366,10 +2366,15 @@ xfs_remove(
2366 2366
2367 namelen = VNAMELEN(dentry); 2367 namelen = VNAMELEN(dentry);
2368 2368
2369 if (!xfs_get_dir_entry(dentry, &ip)) {
2370 dm_di_mode = ip->i_d.di_mode;
2371 IRELE(ip);
2372 }
2373
2369 if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) { 2374 if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
2370 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp, 2375 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp,
2371 DM_RIGHT_NULL, NULL, DM_RIGHT_NULL, 2376 DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
2372 name, NULL, 0, 0, 0); 2377 name, NULL, dm_di_mode, 0, 0);
2373 if (error) 2378 if (error)
2374 return error; 2379 return error;
2375 } 2380 }
@@ -2995,7 +3000,7 @@ xfs_rmdir(
2995 int cancel_flags; 3000 int cancel_flags;
2996 int committed; 3001 int committed;
2997 bhv_vnode_t *dir_vp; 3002 bhv_vnode_t *dir_vp;
2998 int dm_di_mode = 0; 3003 int dm_di_mode = S_IFDIR;
2999 int last_cdp_link; 3004 int last_cdp_link;
3000 int namelen; 3005 int namelen;
3001 uint resblks; 3006 uint resblks;
@@ -3010,11 +3015,16 @@ xfs_rmdir(
3010 return XFS_ERROR(EIO); 3015 return XFS_ERROR(EIO);
3011 namelen = VNAMELEN(dentry); 3016 namelen = VNAMELEN(dentry);
3012 3017
3018 if (!xfs_get_dir_entry(dentry, &cdp)) {
3019 dm_di_mode = cdp->i_d.di_mode;
3020 IRELE(cdp);
3021 }
3022
3013 if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) { 3023 if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
3014 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, 3024 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
3015 dir_vp, DM_RIGHT_NULL, 3025 dir_vp, DM_RIGHT_NULL,
3016 NULL, DM_RIGHT_NULL, 3026 NULL, DM_RIGHT_NULL,
3017 name, NULL, 0, 0, 0); 3027 name, NULL, dm_di_mode, 0, 0);
3018 if (error) 3028 if (error)
3019 return XFS_ERROR(error); 3029 return XFS_ERROR(error);
3020 } 3030 }
@@ -3834,7 +3844,9 @@ xfs_reclaim(
3834 XFS_MOUNT_ILOCK(mp); 3844 XFS_MOUNT_ILOCK(mp);
3835 vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip)); 3845 vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip));
3836 list_add_tail(&ip->i_reclaim, &mp->m_del_inodes); 3846 list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
3847 spin_lock(&ip->i_flags_lock);
3837 ip->i_flags |= XFS_IRECLAIMABLE; 3848 ip->i_flags |= XFS_IRECLAIMABLE;
3849 spin_unlock(&ip->i_flags_lock);
3838 XFS_MOUNT_IUNLOCK(mp); 3850 XFS_MOUNT_IUNLOCK(mp);
3839 } 3851 }
3840 return 0; 3852 return 0;
@@ -3859,8 +3871,10 @@ xfs_finish_reclaim(
3859 * us. 3871 * us.
3860 */ 3872 */
3861 write_lock(&ih->ih_lock); 3873 write_lock(&ih->ih_lock);
3874 spin_lock(&ip->i_flags_lock);
3862 if ((ip->i_flags & XFS_IRECLAIM) || 3875 if ((ip->i_flags & XFS_IRECLAIM) ||
3863 (!(ip->i_flags & XFS_IRECLAIMABLE) && vp == NULL)) { 3876 (!(ip->i_flags & XFS_IRECLAIMABLE) && vp == NULL)) {
3877 spin_unlock(&ip->i_flags_lock);
3864 write_unlock(&ih->ih_lock); 3878 write_unlock(&ih->ih_lock);
3865 if (locked) { 3879 if (locked) {
3866 xfs_ifunlock(ip); 3880 xfs_ifunlock(ip);
@@ -3869,6 +3883,7 @@ xfs_finish_reclaim(
3869 return 1; 3883 return 1;
3870 } 3884 }
3871 ip->i_flags |= XFS_IRECLAIM; 3885 ip->i_flags |= XFS_IRECLAIM;
3886 spin_unlock(&ip->i_flags_lock);
3872 write_unlock(&ih->ih_lock); 3887 write_unlock(&ih->ih_lock);
3873 3888
3874 /* 3889 /*
@@ -4272,7 +4287,7 @@ xfs_free_file_space(
4272 xfs_mount_t *mp; 4287 xfs_mount_t *mp;
4273 int nimap; 4288 int nimap;
4274 uint resblks; 4289 uint resblks;
4275 int rounding; 4290 uint rounding;
4276 int rt; 4291 int rt;
4277 xfs_fileoff_t startoffset_fsb; 4292 xfs_fileoff_t startoffset_fsb;
4278 xfs_trans_t *tp; 4293 xfs_trans_t *tp;
@@ -4313,8 +4328,7 @@ xfs_free_file_space(
4313 vn_iowait(vp); /* wait for the completion of any pending DIOs */ 4328 vn_iowait(vp); /* wait for the completion of any pending DIOs */
4314 } 4329 }
4315 4330
4316 rounding = MAX((__uint8_t)(1 << mp->m_sb.sb_blocklog), 4331 rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, NBPP);
4317 (__uint8_t)NBPP);
4318 ilen = len + (offset & (rounding - 1)); 4332 ilen = len + (offset & (rounding - 1));
4319 ioffset = offset & ~(rounding - 1); 4333 ioffset = offset & ~(rounding - 1);
4320 if (ilen & (rounding - 1)) 4334 if (ilen & (rounding - 1))